From f9e96793ccd0fabd33fc1cda4805f3a388ba27c1 Mon Sep 17 00:00:00 2001 From: Vladislav Mikhalin Date: Tue, 16 Jul 2024 15:03:07 +0300 Subject: [PATCH] Implemented load_buffer_format_* conversions (#295) * Implemented load_buffer_format_* conversions * clang-format insists on ugly things --- .../spirv/emit_spirv_context_get_set.cpp | 241 +++++++++++++++--- .../backend/spirv/emit_spirv_instructions.h | 4 + .../backend/spirv/spirv_emit_context.cpp | 5 +- .../backend/spirv/spirv_emit_context.h | 1 + .../frontend/translate/translate.cpp | 33 ++- .../frontend/translate/translate.h | 2 +- .../frontend/translate/vector_memory.cpp | 6 +- src/shader_recompiler/ir/ir_emitter.cpp | 16 ++ src/shader_recompiler/ir/ir_emitter.h | 2 + src/shader_recompiler/ir/opcodes.inc | 4 + .../ir/passes/resource_tracking_pass.cpp | 133 ++++++++-- src/shader_recompiler/runtime_info.h | 3 +- src/video_core/amdgpu/pixel_format.cpp | 106 ++++++++ src/video_core/amdgpu/pixel_format.h | 2 + src/video_core/amdgpu/resource.h | 8 - 15 files changed, 475 insertions(+), 91 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 75ee3ae9..c88a1cbb 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -4,6 +4,8 @@ #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" +#include + namespace Shader::Backend::SPIRV { namespace { @@ -209,57 +211,216 @@ void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 elemen ctx.OpStore(pointer, value); } -Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - const auto info = inst->Flags(); - const auto& buffer = ctx.buffers[handle]; - if (info.index_enable && info.offset_enable) { - UNREACHABLE(); - } else if (info.index_enable) { - 
const Id ptr{ - ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, address)}; - return ctx.OpLoad(buffer.data_types->Get(1), ptr); - } - UNREACHABLE(); -} - Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { return EmitLoadBufferF32(ctx, inst, handle, address); } -Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - const auto info = inst->Flags(); +template <u32 N> +static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) { const auto& buffer = ctx.buffers[handle]; - boost::container::static_vector ids; - for (u32 i = 0; i < 2; i++) { - const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))}; - const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; - ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr)); + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); // address / 4 + if constexpr (N == 1) { + const Id ptr{ + ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; + return ctx.OpLoad(buffer.data_types->Get(1), ptr); + } else { + boost::container::static_vector<Id, N> ids; + for (u32 i = 0; i < N; i++) { + const Id index_i = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); // i-th dword + const Id ptr{ + ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index_i)}; + ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr)); + } + return ctx.OpCompositeConstruct(buffer.data_types->Get(N), ids); } - return ctx.OpCompositeConstruct(buffer.data_types->Get(2), ids); } -Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - const auto info = inst->Flags(); - const auto& buffer = ctx.buffers[handle]; - boost::container::static_vector ids; - for (u32 i = 0; i < 3; i++) { - const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))}; - const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; - ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), 
ptr)); - } - return ctx.OpCompositeConstruct(buffer.data_types->Get(3), ids); +Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { + return EmitLoadBufferF32xN<1>(ctx, handle, address); } -Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - const auto info = inst->Flags(); - const auto& buffer = ctx.buffers[handle]; - boost::container::static_vector ids; - for (u32 i = 0; i < 4; i++) { - const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))}; - const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; - ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr)); +Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { + return EmitLoadBufferF32xN<2>(ctx, handle, address); +} + +Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { + return EmitLoadBufferF32xN<3>(ctx, handle, address); +} + +Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { + return EmitLoadBufferF32xN<4>(ctx, handle, address); +} + +static bool IsSignedInteger(AmdGpu::NumberFormat format) { + switch (format) { + case AmdGpu::NumberFormat::Unorm: + case AmdGpu::NumberFormat::Uscaled: + case AmdGpu::NumberFormat::Uint: + return false; + case AmdGpu::NumberFormat::Snorm: + case AmdGpu::NumberFormat::Sscaled: + case AmdGpu::NumberFormat::Sint: + case AmdGpu::NumberFormat::SnormNz: + return true; + case AmdGpu::NumberFormat::Float: + default: + UNREACHABLE(); } - return ctx.OpCompositeConstruct(buffer.data_types->Get(4), ids); +} + +static u32 UXBitsMax(u32 bit_width) { + return bit_width >= 32u ? ~0u : (1u << bit_width) - 1u; // full-width shift is UB +} + +static u32 SXBitsMax(u32 bit_width) { + return (1u << (bit_width - 1u)) - 1u; +} + +static Id ConvertValue(EmitContext& ctx, Id value, AmdGpu::NumberFormat format, u32 bit_width) { + switch (format) { + case AmdGpu::NumberFormat::Unorm: + return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(UXBitsMax(bit_width)))); + case 
AmdGpu::NumberFormat::Snorm: + return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width)))); + case AmdGpu::NumberFormat::SnormNz: + // (x * 2 + 1) / (Format::SMAX * 2) + value = ctx.OpFMul(ctx.F32[1], value, ctx.ConstF32(2.f)); + value = ctx.OpFAdd(ctx.F32[1], value, ctx.ConstF32(1.f)); + return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width) * 2))); + case AmdGpu::NumberFormat::Uscaled: + case AmdGpu::NumberFormat::Sscaled: + case AmdGpu::NumberFormat::Uint: + case AmdGpu::NumberFormat::Sint: + case AmdGpu::NumberFormat::Float: + return value; + default: + UNREACHABLE_MSG("Unsupported number format for conversion: {}", + magic_enum::enum_name(format)); + } +} + +static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offset) { + Id comp_offset = ctx.ConstU32(bit_offset); + if (stride < 4) { + // comp_offset += (address % 4) * 8; + const Id byte_offset = ctx.OpUMod(ctx.U32[1], address, ctx.ConstU32(4u)); + const Id bit_offset = ctx.OpShiftLeftLogical(ctx.U32[1], byte_offset, ctx.ConstU32(3u)); + comp_offset = ctx.OpIAdd(ctx.U32[1], comp_offset, bit_offset); + } + return comp_offset; +} + +static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) { + const auto& buffer = ctx.buffers[handle]; + const auto format = buffer.buffer.GetDataFmt(); + switch (format) { + case AmdGpu::DataFormat::FormatInvalid: + return ctx.f32_zero_value; + case AmdGpu::DataFormat::Format8: + case AmdGpu::DataFormat::Format16: + case AmdGpu::DataFormat::Format32: + case AmdGpu::DataFormat::Format8_8: + case AmdGpu::DataFormat::Format16_16: + case AmdGpu::DataFormat::Format10_11_11: + case AmdGpu::DataFormat::Format11_11_10: + case AmdGpu::DataFormat::Format10_10_10_2: + case AmdGpu::DataFormat::Format2_10_10_10: + case AmdGpu::DataFormat::Format8_8_8_8: + case AmdGpu::DataFormat::Format32_32: + case AmdGpu::DataFormat::Format16_16_16_16: + case AmdGpu::DataFormat::Format32_32_32: + case 
AmdGpu::DataFormat::Format32_32_32_32: { + const u32 num_components = AmdGpu::NumComponents(format); + if (comp >= num_components) { + return ctx.f32_zero_value; + } + + // uint index = address / 4; + Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); + const u32 stride = buffer.buffer.GetStride(); + if (stride > 4) { + const u32 index_offset = u32(AmdGpu::ComponentOffset(format, comp) / 32); + if (index_offset > 0) { + // index += index_offset; + index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(index_offset)); + } + } + const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index); + + const u32 bit_offset = AmdGpu::ComponentOffset(format, comp) % 32; + const u32 bit_width = AmdGpu::ComponentBits(format, comp); + const auto num_format = buffer.buffer.GetNumberFmt(); + if (num_format == AmdGpu::NumberFormat::Float) { + if (bit_width == 32) { + return ctx.OpLoad(ctx.F32[1], ptr); + } else if (bit_width == 16) { + const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset); + Id value = ctx.OpLoad(ctx.U32[1], ptr); + value = + ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset, ctx.ConstU32(bit_width)); + value = ctx.OpSConvert(ctx.U16, value); + value = ctx.OpBitcast(ctx.F16[1], value); + return ctx.OpFConvert(ctx.F32[1], value); + } else { + UNREACHABLE_MSG("Invalid float bit width {}", bit_width); + } + } else { + Id value = ctx.OpLoad(ctx.U32[1], ptr); + const bool is_signed = IsSignedInteger(num_format); + if (bit_width < 32) { + const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset); + if (is_signed) { + value = ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset, + ctx.ConstU32(bit_width)); + value = ctx.OpConvertSToF(ctx.F32[1], value); + } else { + value = ctx.OpBitFieldUExtract(ctx.U32[1], value, comp_offset, + ctx.ConstU32(bit_width)); + value = ctx.OpConvertUToF(ctx.F32[1], value); + } + } else { + if (is_signed) { + value = ctx.OpConvertSToF(ctx.F32[1], value); + 
} else { + value = ctx.OpConvertUToF(ctx.F32[1], value); + } + } + return ConvertValue(ctx, value, num_format, bit_width); + } + break; + } + default: + UNREACHABLE_MSG("Invalid format for conversion: {}", magic_enum::enum_name(format)); + } +} + +template +static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + if constexpr (N == 1) { + return GetBufferFormatValue(ctx, handle, address, 0); + } else { + boost::container::static_vector ids; + for (u32 i = 0; i < N; i++) { + ids.push_back(GetBufferFormatValue(ctx, handle, address, i)); + } + return ctx.OpCompositeConstruct(ctx.F32[N], ids); + } +} + +Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return EmitLoadBufferFormatF32xN<1>(ctx, inst, handle, address); +} + +Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return EmitLoadBufferFormatF32xN<2>(ctx, inst, handle, address); +} + +Id EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return EmitLoadBufferFormatF32xN<3>(ctx, inst, handle, address); +} + +Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return EmitLoadBufferFormatF32xN<4>(ctx, inst, handle, address); } void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index e0b19f4f..f43ea3b3 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -66,6 +66,10 @@ Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x4(EmitContext& ctx, 
IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 98f9d1c7..3ea01a1d 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -301,9 +301,7 @@ void EmitContext::DefineBuffers(const Info& info) { for (u32 i = 0; const auto& buffer : info.buffers) { const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32; const Id data_type = (*data_types)[1]; - const u32 stride = buffer.stride == 0 ? 
1 : buffer.stride; - const u32 num_elements = stride * buffer.num_records; - const Id record_array_type{TypeArray(data_type, ConstU32(num_elements))}; + const Id record_array_type{TypeArray(data_type, ConstU32(buffer.length))}; const Id struct_type{TypeStruct(record_array_type)}; if (std::ranges::find(type_ids, record_array_type.value, &Id::value) == type_ids.end()) { Decorate(record_array_type, spv::Decoration::ArrayStride, 4); @@ -333,6 +331,7 @@ void EmitContext::DefineBuffers(const Info& info) { .id = id, .data_types = data_types, .pointer_type = pointer_type, + .buffer = buffer.GetVsharp(info), }); interfaces.push_back(id); i++; diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index b51edd63..0f8081fd 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -201,6 +201,7 @@ public: Id id; const VectorIds* data_types; Id pointer_type; + AmdGpu::Buffer buffer; }; u32& binding; diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 96f08519..bc2e0bf2 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -254,8 +254,7 @@ void Translator::EmitFetch(const GcnInst& inst) { info.buffers.push_back({ .sgpr_base = attrib.sgpr_base, .dword_offset = attrib.dword_offset, - .stride = buffer.GetStride(), - .num_records = buffer.num_records, + .length = buffer.num_records, .used_types = IR::Type::F32, .is_storage = true, // we may not fit into UBO with large meshes .is_instance_data = true, @@ -571,28 +570,40 @@ void Translate(IR::Block* block, u32 block_base, std::span inst_l translator.V_CNDMASK_B32(inst); break; case Opcode::TBUFFER_LOAD_FORMAT_X: - translator.BUFFER_LOAD_FORMAT(1, true, inst); + translator.BUFFER_LOAD_FORMAT(1, true, true, inst); break; case 
Opcode::TBUFFER_LOAD_FORMAT_XY: - translator.BUFFER_LOAD_FORMAT(2, true, inst); + translator.BUFFER_LOAD_FORMAT(2, true, true, inst); break; case Opcode::TBUFFER_LOAD_FORMAT_XYZ: - translator.BUFFER_LOAD_FORMAT(3, true, inst); + translator.BUFFER_LOAD_FORMAT(3, true, true, inst); break; case Opcode::TBUFFER_LOAD_FORMAT_XYZW: - translator.BUFFER_LOAD_FORMAT(4, true, inst); + translator.BUFFER_LOAD_FORMAT(4, true, true, inst); break; case Opcode::BUFFER_LOAD_FORMAT_X: - case Opcode::BUFFER_LOAD_DWORD: - translator.BUFFER_LOAD_FORMAT(1, false, inst); + translator.BUFFER_LOAD_FORMAT(1, false, true, inst); + break; + case Opcode::BUFFER_LOAD_FORMAT_XY: + translator.BUFFER_LOAD_FORMAT(2, false, true, inst); break; case Opcode::BUFFER_LOAD_FORMAT_XYZ: - case Opcode::BUFFER_LOAD_DWORDX3: - translator.BUFFER_LOAD_FORMAT(3, false, inst); + translator.BUFFER_LOAD_FORMAT(3, false, true, inst); break; case Opcode::BUFFER_LOAD_FORMAT_XYZW: + translator.BUFFER_LOAD_FORMAT(4, false, true, inst); + break; + case Opcode::BUFFER_LOAD_DWORD: + translator.BUFFER_LOAD_FORMAT(1, false, false, inst); + break; + case Opcode::BUFFER_LOAD_DWORDX2: + translator.BUFFER_LOAD_FORMAT(2, false, false, inst); + break; + case Opcode::BUFFER_LOAD_DWORDX3: + translator.BUFFER_LOAD_FORMAT(3, false, false, inst); + break; case Opcode::BUFFER_LOAD_DWORDX4: - translator.BUFFER_LOAD_FORMAT(4, false, inst); + translator.BUFFER_LOAD_FORMAT(4, false, false, inst); break; case Opcode::BUFFER_STORE_FORMAT_X: case Opcode::BUFFER_STORE_DWORD: diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 6dd0a481..2aa6f712 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -160,7 +160,7 @@ public: void V_CMP_CLASS_F32(const GcnInst& inst); // Vector Memory - void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); + void BUFFER_LOAD_FORMAT(u32 num_dwords, 
bool is_typed, bool is_format, const GcnInst& inst); void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); // Vector interpolation diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index f0ef85b3..1ddee523 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -225,7 +225,8 @@ void Translator::IMAGE_STORE(const GcnInst& inst) { ir.ImageWrite(handle, body, value, {}); } -void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst) { +void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, + const GcnInst& inst) { const auto& mtbuf = inst.control.mtbuf; const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; @@ -254,7 +255,8 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); - const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info); + const IR::Value value = is_format ? 
ir.LoadBufferFormat(num_dwords, handle, address, info) + : ir.LoadBuffer(num_dwords, handle, address, info); const IR::VectorReg dst_reg{inst.src[1].code}; if (num_dwords == 1) { ir.SetVectorReg(dst_reg, IR::F32{value}); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 5dabbb4c..cd4fdaa2 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -327,6 +327,22 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad } } +Value IREmitter::LoadBufferFormat(int num_dwords, const Value& handle, const Value& address, + BufferInstInfo info) { + switch (num_dwords) { + case 1: + return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address); + case 2: + return Inst(Opcode::LoadBufferFormatF32x2, Flags{info}, handle, address); + case 3: + return Inst(Opcode::LoadBufferFormatF32x3, Flags{info}, handle, address); + case 4: + return Inst(Opcode::LoadBufferFormatF32x4, Flags{info}, handle, address); + default: + UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords); + } +} + void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info) { switch (num_dwords) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 5d6fd714..e7512430 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -89,6 +89,8 @@ public: [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] Value LoadBufferFormat(int num_dwords, const Value& handle, const Value& address, + BufferInstInfo info); void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 94ef1784..9aefc8b3 100644 --- 
a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -79,6 +79,10 @@ OPCODE(LoadBufferF32, F32, Opaq OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, ) OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, ) OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, ) +OPCODE(LoadBufferFormatF32, F32, Opaque, Opaque, ) +OPCODE(LoadBufferFormatF32x2, F32x2, Opaque, Opaque, ) +OPCODE(LoadBufferFormatF32x3, F32x3, Opaque, Opaque, ) +OPCODE(LoadBufferFormatF32x4, F32x4, Opaque, Opaque, ) OPCODE(LoadBufferU32, U32, Opaque, Opaque, ) OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, ) OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index b7d6a722..f58b4d96 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -27,6 +27,10 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: case IR::Opcode::LoadBufferF32x4: + case IR::Opcode::LoadBufferFormatF32: + case IR::Opcode::LoadBufferFormatF32x2: + case IR::Opcode::LoadBufferFormatF32x3: + case IR::Opcode::LoadBufferFormatF32x4: case IR::Opcode::LoadBufferU32: case IR::Opcode::ReadConstBuffer: case IR::Opcode::ReadConstBufferU32: @@ -41,8 +45,49 @@ bool IsBufferInstruction(const IR::Inst& inst) { } } -IR::Type BufferDataType(const IR::Inst& inst) { +static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) { + switch (num_format) { + case AmdGpu::NumberFormat::Float: + switch (data_format) { + case AmdGpu::DataFormat::Format16: + case AmdGpu::DataFormat::Format16_16: + case AmdGpu::DataFormat::Format16_16_16_16: + return true; + default: + return false; + } + case AmdGpu::NumberFormat::Unorm: + case AmdGpu::NumberFormat::Snorm: + case AmdGpu::NumberFormat::Uscaled: + case AmdGpu::NumberFormat::Sscaled: + 
case AmdGpu::NumberFormat::Uint: + case AmdGpu::NumberFormat::Sint: + case AmdGpu::NumberFormat::SnormNz: + default: + return false; + } +} + +IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { switch (inst.GetOpcode()) { + case IR::Opcode::LoadBufferFormatF32: + case IR::Opcode::LoadBufferFormatF32x2: + case IR::Opcode::LoadBufferFormatF32x3: + case IR::Opcode::LoadBufferFormatF32x4: + switch (num_format) { + case AmdGpu::NumberFormat::Unorm: + case AmdGpu::NumberFormat::Snorm: + case AmdGpu::NumberFormat::Uscaled: + case AmdGpu::NumberFormat::Sscaled: + case AmdGpu::NumberFormat::Uint: + case AmdGpu::NumberFormat::Sint: + case AmdGpu::NumberFormat::SnormNz: + return IR::Type::U32; + case AmdGpu::NumberFormat::Float: + return IR::Type::F32; + default: + UNREACHABLE(); + } case IR::Opcode::LoadBufferF32: case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: @@ -141,7 +186,7 @@ public: desc.inline_cbuf == existing.inline_cbuf; })}; auto& buffer = buffer_resources[index]; - ASSERT(buffer.stride == desc.stride && buffer.num_records == desc.num_records); + ASSERT(buffer.length == desc.length); buffer.is_storage |= desc.is_storage; buffer.used_types |= desc.used_types; return index; @@ -263,6 +308,41 @@ SharpLocation TrackSharp(const IR::Inst* inst) { static constexpr size_t MaxUboSize = 65536; +static bool IsLoadBufferFormat(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::LoadBufferFormatF32: + case IR::Opcode::LoadBufferFormatF32x2: + case IR::Opcode::LoadBufferFormatF32x3: + case IR::Opcode::LoadBufferFormatF32x4: + return true; + default: + return false; + } +} + +static bool IsReadConstBuffer(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::ReadConstBuffer: + case IR::Opcode::ReadConstBufferU32: + return true; + default: + return false; + } +} + +static u32 BufferLength(const AmdGpu::Buffer& buffer) { + const auto stride = buffer.GetStride(); + if (stride < sizeof(f32)) { + 
ASSERT(sizeof(f32) % stride == 0); + return (((buffer.num_records - 1) / sizeof(f32)) + 1) * stride; + } else if (stride == sizeof(f32)) { + return buffer.num_records; + } else { + ASSERT(stride % sizeof(f32) == 0); + return buffer.num_records * (stride / sizeof(f32)); + } +} + s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, AmdGpu::Buffer& cbuf) { @@ -298,9 +378,8 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, return descriptors.Add(BufferResource{ .sgpr_base = std::numeric_limits::max(), .dword_offset = 0, - .stride = cbuf.GetStride(), - .num_records = u32(cbuf.num_records), - .used_types = BufferDataType(inst), + .length = BufferLength(cbuf), + .used_types = BufferDataType(inst, cbuf.GetNumberFmt()), .inline_cbuf = cbuf, .is_storage = IsBufferStore(inst) || cbuf.GetSize() > MaxUboSize, }); @@ -318,9 +397,8 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, binding = descriptors.Add(BufferResource{ .sgpr_base = sharp.sgpr_base, .dword_offset = sharp.dword_offset, - .stride = buffer.GetStride(), - .num_records = u32(buffer.num_records), - .used_types = BufferDataType(inst), + .length = BufferLength(buffer), + .used_types = BufferDataType(inst, buffer.GetNumberFmt()), .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize, }); } @@ -337,24 +415,31 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, inst_info.dmft == AmdGpu::DataFormat::Format32_32 || inst_info.dmft == AmdGpu::DataFormat::Format32)); } - if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer || - inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) { + + if (IsReadConstBuffer(inst)) { return; } - // Calculate buffer address. 
- const u32 dword_stride = buffer.GetStrideElements(sizeof(u32)); - const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32); - IR::U32 address = ir.Imm32(dword_offset); - if (inst_info.index_enable && inst_info.offset_enable) { - const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 1)}; - const IR::U32 index{ir.CompositeExtract(inst.Arg(1), 0)}; - address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address); - address = ir.IAdd(address, ir.ShiftRightLogical(offset, ir.Imm32(2))); - } else if (inst_info.index_enable) { - const IR::U32 index{inst.Arg(1)}; - address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address); - } else if (inst_info.offset_enable) { - const IR::U32 offset{inst.Arg(1)}; + + if (IsLoadBufferFormat(inst)) { + if (UseFP16(buffer.GetDataFmt(), buffer.GetNumberFmt())) { + info.uses_fp16 = true; + } + } else { + const u32 stride = buffer.GetStride(); + ASSERT_MSG(stride >= 4, "non-formatting load_buffer_* is not implemented for stride {}", + stride); + } + + IR::U32 address = ir.Imm32(inst_info.inst_offset.Value()); + if (inst_info.index_enable) { + const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)} + : IR::U32{inst.Arg(1)}; + address = ir.IAdd(address, ir.IMul(index, ir.Imm32(buffer.GetStride()))); + } + if (inst_info.offset_enable) { + const IR::U32 offset = inst_info.index_enable ? 
IR::U32{ir.CompositeExtract(inst.Arg(1), 1)} + : IR::U32{inst.Arg(1)}; + address = ir.IAdd(address, offset); } inst.SetArg(1, address); } diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 2b2103ca..8824e344 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -74,8 +74,7 @@ struct Info; struct BufferResource { u32 sgpr_base; u32 dword_offset; - u32 stride; - u32 num_records; + u32 length; IR::Type used_types; AmdGpu::Buffer inline_cbuf; bool is_storage{false}; diff --git a/src/video_core/amdgpu/pixel_format.cpp b/src/video_core/amdgpu/pixel_format.cpp index 5f6eb903..6618e72a 100644 --- a/src/video_core/amdgpu/pixel_format.cpp +++ b/src/video_core/amdgpu/pixel_format.cpp @@ -66,4 +66,110 @@ int NumBits(DataFormat format) { return num_bits_per_element[index]; } +static constexpr std::array component_bits = { + std::array{0, 0, 0, 0}, // 0 FormatInvalid + std::array{8, 0, 0, 0}, // 1 Format8 + std::array{16, 0, 0, 0}, // 2 Format16 + std::array{8, 8, 0, 0}, // 3 Format8_8 + std::array{32, 0, 0, 0}, // 4 Format32 + std::array{16, 16, 0, 0}, // 5 Format16_16 + std::array{10, 11, 11, 0}, // 6 Format10_11_11 + std::array{11, 11, 10, 0}, // 7 Format11_11_10 + std::array{10, 10, 10, 2}, // 8 Format10_10_10_2 + std::array{2, 10, 10, 10}, // 9 Format2_10_10_10 + std::array{8, 8, 8, 8}, // 10 Format8_8_8_8 + std::array{32, 32, 0, 0}, // 11 Format32_32 + std::array{16, 16, 16, 16}, // 12 Format16_16_16_16 + std::array{32, 32, 32, 0}, // 13 Format32_32_32 + std::array{32, 32, 32, 32}, // 14 Format32_32_32_32 + std::array{0, 0, 0, 0}, // 15 + std::array{5, 6, 5, 0}, // 16 Format5_6_5 + std::array{1, 5, 5, 5}, // 17 Format1_5_5_5 + std::array{5, 5, 5, 1}, // 18 Format5_5_5_1 + std::array{4, 4, 4, 4}, // 19 Format4_4_4_4 + std::array{8, 24, 0, 0}, // 20 Format8_24 + std::array{24, 8, 0, 0}, // 21 Format24_8 + std::array{24, 8, 0, 0}, // 22 FormatX24_8_32 + std::array{0, 0, 0, 0}, // 23 + 
std::array{0, 0, 0, 0}, // 24 + std::array{0, 0, 0, 0}, // 25 + std::array{0, 0, 0, 0}, // 26 + std::array{0, 0, 0, 0}, // 27 + std::array{0, 0, 0, 0}, // 28 + std::array{0, 0, 0, 0}, // 29 + std::array{0, 0, 0, 0}, // 30 + std::array{0, 0, 0, 0}, // 31 + std::array{0, 0, 0, 0}, // 32 FormatGB_GR + std::array{0, 0, 0, 0}, // 33 FormatBG_RG + std::array{0, 0, 0, 0}, // 34 Format5_9_9_9 + std::array{0, 0, 0, 0}, // 35 FormatBc1 + std::array{0, 0, 0, 0}, // 36 FormatBc2 + std::array{0, 0, 0, 0}, // 37 FormatBc3 + std::array{0, 0, 0, 0}, // 38 FormatBc4 + std::array{0, 0, 0, 0}, // 39 FormatBc5 + std::array{0, 0, 0, 0}, // 40 FormatBc6 + std::array{0, 0, 0, 0}, // 41 FormatBc7 +}; + +u32 ComponentBits(DataFormat format, u32 comp) { + const u32 index = static_cast(format); + if (index >= component_bits.size() || comp >= 4) { + return 0; + } + return component_bits[index][comp]; +} + +static constexpr std::array component_offset = { + std::array{-1, -1, -1, -1}, // 0 FormatInvalid + std::array{0, -1, -1, -1}, // 1 Format8 + std::array{0, -1, -1, -1}, // 2 Format16 + std::array{0, 8, -1, -1}, // 3 Format8_8 + std::array{0, -1, -1, -1}, // 4 Format32 + std::array{0, 16, -1, -1}, // 5 Format16_16 + std::array{0, 10, 21, -1}, // 6 Format10_11_11 + std::array{0, 11, 22, -1}, // 7 Format11_11_10 + std::array{0, 10, 20, 30}, // 8 Format10_10_10_2 + std::array{0, 2, 12, 22}, // 9 Format2_10_10_10 + std::array{0, 8, 16, 24}, // 10 Format8_8_8_8 + std::array{0, 32, -1, -1}, // 11 Format32_32 + std::array{0, 16, 32, 48}, // 12 Format16_16_16_16 + std::array{0, 32, 64, -1}, // 13 Format32_32_32 + std::array{0, 32, 64, 96}, // 14 Format32_32_32_32 + std::array{-1, -1, -1, -1}, // 15 + std::array{0, 5, 11, -1}, // 16 Format5_6_5 + std::array{0, 1, 6, 11}, // 17 Format1_5_5_5 + std::array{0, 5, 10, 15}, // 18 Format5_5_5_1 + std::array{0, 4, 8, 12}, // 19 Format4_4_4_4 + std::array{0, 8, -1, -1}, // 20 Format8_24 + std::array{0, 24, -1, -1}, // 21 Format24_8 + std::array{0, 24, -1, 
-1}, // 22 FormatX24_8_32 + std::array{-1, -1, -1, -1}, // 23 + std::array{-1, -1, -1, -1}, // 24 + std::array{-1, -1, -1, -1}, // 25 + std::array{-1, -1, -1, -1}, // 26 + std::array{-1, -1, -1, -1}, // 27 + std::array{-1, -1, -1, -1}, // 28 + std::array{-1, -1, -1, -1}, // 29 + std::array{-1, -1, -1, -1}, // 30 + std::array{-1, -1, -1, -1}, // 31 + std::array{-1, -1, -1, -1}, // 32 FormatGB_GR + std::array{-1, -1, -1, -1}, // 33 FormatBG_RG + std::array{-1, -1, -1, -1}, // 34 Format5_9_9_9 + std::array{-1, -1, -1, -1}, // 35 FormatBc1 + std::array{-1, -1, -1, -1}, // 36 FormatBc2 + std::array{-1, -1, -1, -1}, // 37 FormatBc3 + std::array{-1, -1, -1, -1}, // 38 FormatBc4 + std::array{-1, -1, -1, -1}, // 39 FormatBc5 + std::array{-1, -1, -1, -1}, // 40 FormatBc6 + std::array{-1, -1, -1, -1}, // 41 FormatBc7 +}; + +s32 ComponentOffset(DataFormat format, u32 comp) { + const u32 index = static_cast(format); + if (index >= component_offset.size() || comp >= 4) { + return -1; + } + return component_offset[index][comp]; +} + } // namespace AmdGpu diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index 22d102af..2a38c5a0 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -65,6 +65,8 @@ enum class NumberFormat : u32 { int NumComponents(DataFormat format); int NumBits(DataFormat format); +u32 ComponentBits(DataFormat format, u32 comp); +s32 ComponentOffset(DataFormat format, u32 comp); } // namespace AmdGpu diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index ba3de154..1247c025 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -62,14 +62,6 @@ struct Buffer { return stride == 0 ? 
1U : stride; } - u32 GetStrideElements(u32 element_size) const noexcept { - if (stride == 0) { - return 1U; - } - ASSERT(stride % element_size == 0); - return stride / element_size; - } - u32 GetSize() const noexcept { return GetStride() * num_records; }