From 9adc6382201abf609bebb69e841a3de80b15554e Mon Sep 17 00:00:00 2001 From: psucien <168137814+psucien@users.noreply.github.com> Date: Thu, 15 Aug 2024 00:15:07 +0200 Subject: [PATCH] shader_recompiler: basic implementation of `BUFFER_STORE_FORMAT_` (#431) * shader_recompiler: basic implementation of buffer store w\ fmt conversion * added `Format16` dfmt --- .../spirv/emit_spirv_context_get_set.cpp | 92 +++++++++++++++++++ .../backend/spirv/emit_spirv_instructions.h | 4 + .../frontend/translate/translate.h | 2 +- .../frontend/translate/vector_memory.cpp | 35 +++++-- src/shader_recompiler/ir/ir_emitter.cpp | 20 ++++ src/shader_recompiler/ir/ir_emitter.h | 2 + src/shader_recompiler/ir/microinstruction.cpp | 4 + src/shader_recompiler/ir/opcodes.inc | 4 + .../ir/passes/resource_tracking_pass.cpp | 12 +++ 9 files changed, 165 insertions(+), 10 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 02600b94..bbf259fe 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -467,4 +467,96 @@ void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address EmitStoreBufferF32xN<1>(ctx, handle, address, value); } +static Id ConvertF32ToFormat(EmitContext& ctx, Id value, AmdGpu::NumberFormat format, + u32 bit_width) { + switch (format) { + case AmdGpu::NumberFormat::Unorm: + return ctx.OpConvertFToU( + ctx.U32[1], ctx.OpFMul(ctx.F32[1], value, ctx.ConstF32(float(UXBitsMax(bit_width))))); + case AmdGpu::NumberFormat::Uint: + return ctx.OpBitcast(ctx.U32[1], value); + case AmdGpu::NumberFormat::Float: + return value; + default: + UNREACHABLE_MSG("Unsupported number fromat for conversion: {}", + magic_enum::enum_name(format)); + } +} + +template +static void EmitStoreBufferFormatF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { + auto& buffer = ctx.buffers[handle]; + const auto format = buffer.buffer.GetDataFmt(); + const auto num_format = buffer.buffer.GetNumberFmt(); + + switch (format) { + case AmdGpu::DataFormat::FormatInvalid: + return; + case AmdGpu::DataFormat::Format8_8_8_8: + case AmdGpu::DataFormat::Format16: + case AmdGpu::DataFormat::Format32: + case AmdGpu::DataFormat::Format32_32_32_32: { + ASSERT(N == AmdGpu::NumComponents(format)); + + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); + const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index); + + Id packed_value{}; + for (u32 i = 0; i < N; i++) { + const u32 bit_width = AmdGpu::ComponentBits(format, i); + const u32 bit_offset = AmdGpu::ComponentOffset(format, i) % 32; + + const Id comp{ConvertF32ToFormat( + ctx, N == 1 ? value : ctx.OpCompositeExtract(ctx.F32[1], value, i), num_format, + bit_width)}; + + if (bit_width == 32) { + if constexpr (N == 1) { + ctx.OpStore(ptr, comp); + } else { + const Id index_i = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); + const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, + ctx.u32_zero_value, index_i); + ctx.OpStore(ptr, comp); + } + } else { + if (i == 0) { + packed_value = comp; + } else { + packed_value = + ctx.OpBitFieldInsert(ctx.U32[1], packed_value, comp, + ctx.ConstU32(bit_offset), ctx.ConstU32(bit_width)); + } + + if (i == N - 1) { + ctx.OpStore(ptr, packed_value); + } + } + } + } break; + default: + UNREACHABLE_MSG("Invalid format for conversion: {}", magic_enum::enum_name(format)); + } +} + +void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + EmitStoreBufferFormatF32xN<1>(ctx, handle, address, value); +} + +void EmitStoreBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, + Id value) { + EmitStoreBufferFormatF32xN<2>(ctx, handle, address, value); +} + +void EmitStoreBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, + Id value) { + EmitStoreBufferFormatF32xN<3>(ctx, handle, address, value); +} + +void EmitStoreBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, + Id value) { + EmitStoreBufferFormatF32xN<4>(ctx, handle, address, value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index f868527f..8a0fcd4b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -76,6 +76,10 @@ void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp); Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp); diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 2e12209d..9ebcb116 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -186,7 +186,7 @@ public: // Vector Memory void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); - void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); + void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); // Vector interpolation void V_INTERP_P2_F32(const GcnInst& inst); diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index f708b9fb..63f6c3b4 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -53,6 +53,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { case Opcode::IMAGE_GET_RESINFO: return IMAGE_GET_RESINFO(inst); + // Buffer load operations case Opcode::TBUFFER_LOAD_FORMAT_X: return BUFFER_LOAD_FORMAT(1, true, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XY: @@ -61,6 +62,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { return BUFFER_LOAD_FORMAT(3, true, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XYZW: return BUFFER_LOAD_FORMAT(4, true, true, inst); + case Opcode::BUFFER_LOAD_FORMAT_X: return BUFFER_LOAD_FORMAT(1, false, true, inst); case Opcode::BUFFER_LOAD_FORMAT_XY: @@ -69,6 +71,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { return BUFFER_LOAD_FORMAT(3, false, true, inst); case Opcode::BUFFER_LOAD_FORMAT_XYZW: return BUFFER_LOAD_FORMAT(4, false, true, inst); + case Opcode::BUFFER_LOAD_DWORD: return BUFFER_LOAD_FORMAT(1, false, false, inst); case Opcode::BUFFER_LOAD_DWORDX2: @@ -77,16 +80,25 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { return BUFFER_LOAD_FORMAT(3, false, false, inst); case Opcode::BUFFER_LOAD_DWORDX4: return BUFFER_LOAD_FORMAT(4, false, false, inst); + + // Buffer store operations case Opcode::BUFFER_STORE_FORMAT_X: - case Opcode::BUFFER_STORE_DWORD: - return BUFFER_STORE_FORMAT(1, false, inst); - case Opcode::BUFFER_STORE_DWORDX2: - return BUFFER_STORE_FORMAT(2, false, inst); - case Opcode::BUFFER_STORE_DWORDX3: - return BUFFER_STORE_FORMAT(3, false, inst); + return BUFFER_STORE_FORMAT(1, false, true, inst); + case Opcode::BUFFER_STORE_FORMAT_XY: + return BUFFER_STORE_FORMAT(2, false, true, inst); + case Opcode::BUFFER_STORE_FORMAT_XYZ: + return BUFFER_STORE_FORMAT(3, false, true, inst); case Opcode::BUFFER_STORE_FORMAT_XYZW: + return BUFFER_STORE_FORMAT(4, false, true, inst); + + case Opcode::BUFFER_STORE_DWORD: + return BUFFER_STORE_FORMAT(1, false, false, inst); + case Opcode::BUFFER_STORE_DWORDX2: + return BUFFER_STORE_FORMAT(2, false, false, inst); + case Opcode::BUFFER_STORE_DWORDX3: + return BUFFER_STORE_FORMAT(3, false, false, inst); case Opcode::BUFFER_STORE_DWORDX4: - return BUFFER_STORE_FORMAT(4, false, inst); + return BUFFER_STORE_FORMAT(4, false, false, inst); default: LogMissingOpcode(inst); } @@ -359,7 +371,8 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma } } -void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst) { +void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, + const GcnInst& inst) { const auto& mtbuf = inst.control.mtbuf; const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; @@ -410,7 +423,11 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnIns const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); - ir.StoreBuffer(num_dwords, handle, address, value, info); + if (is_format) { + ir.StoreBufferFormat(num_dwords, handle, address, value, info); + } else { + ir.StoreBuffer(num_dwords, handle, address, value, info); + } } void Translator::IMAGE_GET_LOD(const GcnInst& inst) { diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 3ff347fb..4271ac35 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -347,6 +347,26 @@ void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& ad } } +void IREmitter::StoreBufferFormat(int num_dwords, const Value& handle, const Value& address, + const Value& data, BufferInstInfo info) { + switch (num_dwords) { + case 1: + Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data); + break; + case 2: + Inst(Opcode::StoreBufferFormatF32x2, Flags{info}, handle, address, data); + break; + case 3: + Inst(Opcode::StoreBufferFormatF32x3, Flags{info}, handle, address, data); + break; + case 4: + Inst(Opcode::StoreBufferFormatF32x4, Flags{info}, handle, address, data); + break; + default: + UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords); + } +} + U32 IREmitter::LaneId() { return Inst(Opcode::LaneId); } diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index c226edac..59ced93e 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -93,6 +93,8 @@ public: BufferInstInfo info); void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); + void StoreBufferFormat(int num_dwords, const Value& handle, const Value& address, + const Value& data, BufferInstInfo info); [[nodiscard]] U32 LaneId(); [[nodiscard]] U32 WarpId(); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index 5d413c8a..a8166125 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -55,6 +55,10 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferF32x2: case Opcode::StoreBufferF32x3: case Opcode::StoreBufferF32x4: + case Opcode::StoreBufferFormatF32: + case Opcode::StoreBufferFormatF32x2: + case Opcode::StoreBufferFormatF32x3: + case Opcode::StoreBufferFormatF32x4: case Opcode::StoreBufferU32: case Opcode::WriteSharedU128: case Opcode::WriteSharedU64: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 0e25b777..4c6122a8 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -82,6 +82,10 @@ OPCODE(StoreBufferF32, Void, Opaq OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, ) OPCODE(StoreBufferF32x4, Void, Opaque, Opaque, F32x4, ) +OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32, ) +OPCODE(StoreBufferFormatF32x2, Void, Opaque, Opaque, F32x2, ) +OPCODE(StoreBufferFormatF32x3, Void, Opaque, Opaque, F32x3, ) +OPCODE(StoreBufferFormatF32x4, Void, Opaque, Opaque, F32x4, ) OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, ) // Vector utility diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index b3d2311e..97fc5b99 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -37,6 +37,10 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: case IR::Opcode::StoreBufferF32x4: + case IR::Opcode::StoreBufferFormatF32: + case IR::Opcode::StoreBufferFormatF32x2: + case IR::Opcode::StoreBufferFormatF32x3: + case IR::Opcode::StoreBufferFormatF32x4: case IR::Opcode::StoreBufferU32: return true; default: @@ -73,6 +77,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { case IR::Opcode::LoadBufferFormatF32x2: case IR::Opcode::LoadBufferFormatF32x3: case IR::Opcode::LoadBufferFormatF32x4: + case IR::Opcode::StoreBufferFormatF32: + case IR::Opcode::StoreBufferFormatF32x2: + case IR::Opcode::StoreBufferFormatF32x3: + case IR::Opcode::StoreBufferFormatF32x4: switch (num_format) { case AmdGpu::NumberFormat::Unorm: case AmdGpu::NumberFormat::Snorm: @@ -112,6 +120,10 @@ bool IsBufferStore(const IR::Inst& inst) { case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: case IR::Opcode::StoreBufferF32x4: + case IR::Opcode::StoreBufferFormatF32: + case IR::Opcode::StoreBufferFormatF32x2: + case IR::Opcode::StoreBufferFormatF32x3: + case IR::Opcode::StoreBufferFormatF32x4: case IR::Opcode::StoreBufferU32: return true; default: