From 63801cfa35a5b5367d23e40d59809a04dd391cde Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Wed, 3 Jul 2024 05:43:36 +0300 Subject: [PATCH] shader_recompiler: Inline constant buffer impl --- .../backend/spirv/emit_spirv.cpp | 8 ++ .../backend/spirv/emit_spirv_instructions.h | 3 + .../backend/spirv/emit_spirv_integer.cpp | 4 + .../frontend/structured_control_flow.cpp | 2 +- .../frontend/translate/scalar_alu.cpp | 12 +++ .../frontend/translate/scalar_memory.cpp | 3 +- .../frontend/translate/translate.cpp | 9 ++- .../frontend/translate/translate.h | 4 +- .../frontend/translate/vector_memory.cpp | 8 +- src/shader_recompiler/ir/ir_emitter.cpp | 20 +++++ src/shader_recompiler/ir/ir_emitter.h | 5 +- src/shader_recompiler/ir/opcodes.inc | 3 + .../ir/passes/resource_tracking_pass.cpp | 74 +++++++++++++++---- .../ir/passes/ssa_rewrite_pass.cpp | 21 +++++- src/shader_recompiler/runtime_info.h | 11 ++- src/video_core/amdgpu/liverpool.h | 18 ++--- src/video_core/amdgpu/resource.h | 10 +++ .../renderer_vulkan/vk_compute_pipeline.cpp | 2 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 2 +- .../renderer_vulkan/vk_pipeline_cache.cpp | 12 ++- 20 files changed, 189 insertions(+), 42 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index f53b24ca..2564f4ce 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -288,6 +288,10 @@ void EmitGetVcc(EmitContext& ctx) { UNREACHABLE_MSG("Unreachable instruction"); } +void EmitGetSccLo(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + void EmitGetVccLo(EmitContext& ctx) { UNREACHABLE_MSG("Unreachable instruction"); } @@ -308,6 +312,10 @@ void EmitSetVcc(EmitContext& ctx) { UNREACHABLE_MSG("Unreachable instruction"); } +void EmitSetSccLo(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + void EmitSetVccLo(EmitContext& ctx) { UNREACHABLE_MSG("Unreachable instruction"); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 97130bf5..abd5f8a3 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -33,11 +33,13 @@ void EmitDeviceMemoryBarrier(EmitContext& ctx); void EmitGetScc(EmitContext& ctx); void EmitGetExec(EmitContext& ctx); void EmitGetVcc(EmitContext& ctx); +void EmitGetSccLo(EmitContext& ctx); void EmitGetVccLo(EmitContext& ctx); void EmitGetVccHi(EmitContext& ctx); void EmitSetScc(EmitContext& ctx); void EmitSetExec(EmitContext& ctx); void EmitSetVcc(EmitContext& ctx); +void EmitSetSccLo(EmitContext& ctx); void EmitSetVccLo(EmitContext& ctx); void EmitSetVccHi(EmitContext& ctx); void EmitPrologue(EmitContext& ctx); @@ -245,6 +247,7 @@ Id EmitFPIsInf32(EmitContext& ctx, Id value); Id EmitFPIsInf64(EmitContext& ctx, Id value); Id EmitIAdd32(EmitContext& ctx, IR::Inst* inst, Id a, Id b); Id EmitIAdd64(EmitContext& ctx, Id a, Id b); +Id EmitIAddCary32(EmitContext& ctx, Id a, Id b); Id EmitISub32(EmitContext& ctx, Id a, Id b); Id EmitISub64(EmitContext& ctx, Id a, Id b); Id EmitSMulExt(EmitContext& ctx, Id a, Id b); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index 1d52a3ed..d5a0f276 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -60,6 +60,10 @@ Id EmitIAdd64(EmitContext& ctx, Id a, Id b) { return ctx.OpIAdd(ctx.U64, a, b); } +Id EmitIAddCary32(EmitContext& ctx, Id a, Id b) { + return ctx.OpIAddCarry(ctx.full_result_u32x2, a, b); +} + Id EmitISub32(EmitContext& ctx, Id a, Id b) { return ctx.OpISub(ctx.U32[1], a, b); } diff --git a/src/shader_recompiler/frontend/structured_control_flow.cpp b/src/shader_recompiler/frontend/structured_control_flow.cpp index 9b5cc3e6..3f59cb94 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.cpp +++ b/src/shader_recompiler/frontend/structured_control_flow.cpp @@ -633,7 +633,7 @@ private: if (!stmt.block->is_dummy) { const u32 start = stmt.block->begin_index; const u32 size = stmt.block->end_index - start + 1; - Translate(current_block, inst_list.subspan(start, size), info); + Translate(current_block, stmt.block->begin, inst_list.subspan(start, size), info); } break; } diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 1b762c3a..03b4af7e 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -318,4 +318,16 @@ void Translator::S_SUB_U32(const GcnInst& inst) { ir.SetScc(ir.Imm1(false)); } +void Translator::S_GETPC_B64(u32 pc, const GcnInst& inst) { + // This only really exists to let resource tracking pass know + // there is an inline cbuf. + SetDst(inst.dst[0], ir.Imm32(pc)); +} + +void Translator::S_ADDC_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), ir.GetSccLo())); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index 2cf5c5b2..25003ef2 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -30,7 +30,8 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { } return ir.ShiftRightLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), ir.Imm32(2)); }(); - const IR::Value vsharp = ir.GetScalarReg(sbase); + const IR::Value vsharp = ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1), + ir.GetScalarReg(sbase + 2), ir.GetScalarReg(sbase + 3)); IR::ScalarReg dst_reg{inst.dst[0].code}; for (u32 i = 0; i < num_dwords; i++) { const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i)); diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 58fcbd2f..87dd114a 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -236,7 +236,7 @@ void Translator::EmitFetch(const GcnInst& inst) { } } -void Translate(IR::Block* block, std::span inst_list, Info& info) { +void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info) { if (inst_list.empty()) { return; } @@ -833,6 +833,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_ADD_U32: translator.S_ADD_U32(inst); break; + case Opcode::S_ADDC_U32: + translator.S_ADDC_U32(inst); + break; case Opcode::S_SUB_U32: case Opcode::S_SUB_I32: translator.S_SUB_U32(inst); @@ -878,6 +881,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_READFIRSTLANE_B32: translator.V_READFIRSTLANE_B32(inst); break; + case Opcode::S_GETPC_B64: + translator.S_GETPC_B64(block_base, inst); + break; case Opcode::S_NOP: case Opcode::S_CBRANCH_EXECZ: case Opcode::S_CBRANCH_SCC0: @@ -895,6 +901,7 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) magic_enum::enum_name(inst.opcode), opcode); info.translation_failed = true; } + block_base += inst.length; } } diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index fa86719f..e1f72e5f 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -80,6 +80,8 @@ public: void S_BREV_B32(const GcnInst& inst); void S_ADD_U32(const GcnInst& inst); void S_SUB_U32(const GcnInst& inst); + void S_GETPC_B64(u32 pc, const GcnInst& inst); + void S_ADDC_U32(const GcnInst& inst); // Scalar Memory void S_LOAD_DWORD(int num_dwords, const GcnInst& inst); @@ -192,6 +194,6 @@ private: static std::array exec_contexts; }; -void Translate(IR::Block* block, std::span inst_list, Info& info); +void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info); } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 704fca5b..8d0dd262 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -250,7 +250,9 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst info.nfmt.Assign(static_cast(mtbuf.nfmt)); } - const IR::Value value = ir.LoadBuffer(num_dwords, ir.GetScalarReg(sharp), address, info); + const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), + ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); + const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info); const IR::VectorReg dst_reg{inst.src[1].code}; if (num_dwords == 1) { ir.SetVectorReg(dst_reg, IR::F32{value}); @@ -309,7 +311,9 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnIns ir.GetVectorReg(src_reg + 3)); break; } - ir.StoreBuffer(num_dwords, ir.GetScalarReg(sharp), address, value, info); + const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), + ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); + ir.StoreBuffer(num_dwords, handle, address, value, info); } void Translator::IMAGE_GET_LOD(const GcnInst& inst) { diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 7e5e46eb..2b126472 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -212,6 +212,10 @@ U1 IREmitter::GetVcc() { return Inst(Opcode::GetVcc); } +U32 IREmitter::GetSccLo() { + return Inst(Opcode::GetSccLo); +} + U32 IREmitter::GetVccLo() { return Inst(Opcode::GetVccLo); } @@ -232,6 +236,10 @@ void IREmitter::SetVcc(const U1& value) { Inst(Opcode::SetVcc, value); } +void IREmitter::SetSccLo(const U32& value) { + Inst(Opcode::SetSccLo, value); +} + void IREmitter::SetVccLo(const U32& value) { Inst(Opcode::SetVccLo, value); } @@ -898,6 +906,18 @@ U32U64 IREmitter::IAdd(const U32U64& a, const U32U64& b) { } } +Value IREmitter::IAddCary(const U32& a, const U32& b) { + if (a.Type() != b.Type()) { + UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); + } + switch (a.Type()) { + case Type::U32: + return Inst(Opcode::IAddCary32, a, b); + default: + ThrowInvalidType(a.Type()); + } +} + U32U64 IREmitter::ISub(const U32U64& a, const U32U64& b) { if (a.Type() != b.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index a78785d6..c3342530 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -64,11 +64,13 @@ public: [[nodiscard]] U1 GetScc(); [[nodiscard]] U1 GetExec(); [[nodiscard]] U1 GetVcc(); + [[nodiscard]] U32 GetSccLo(); [[nodiscard]] U32 GetVccLo(); [[nodiscard]] U32 GetVccHi(); void SetScc(const U1& value); void SetExec(const U1& value); void SetVcc(const U1& value); + void SetSccLo(const U32& value); void SetVccLo(const U32& value); void SetVccHi(const U32& value); @@ -151,8 +153,9 @@ public: [[nodiscard]] F32F64 FPMin(const F32F64& lhs, const F32F64& rhs); [[nodiscard]] U32U64 IAdd(const U32U64& a, const U32U64& b); + [[nodiscard]] Value IAddCary(const U32& a, const U32& b); [[nodiscard]] U32U64 ISub(const U32U64& a, const U32U64& b); - [[nodiscard]] IR::Value IMulExt(const U32& a, const U32& b, bool is_signed = false); + [[nodiscard]] Value IMulExt(const U32& a, const U32& b, bool is_signed = false); [[nodiscard]] U32 IMul(const U32& a, const U32& b); [[nodiscard]] U32 IDiv(const U32& a, const U32& b, bool is_signed = false); [[nodiscard]] U32U64 INeg(const U32U64& value); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index f7519fb4..bed6766e 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -56,11 +56,13 @@ OPCODE(SetAttribute, Void, Attr OPCODE(GetScc, U1, Void, ) OPCODE(GetExec, U1, Void, ) OPCODE(GetVcc, U1, Void, ) +OPCODE(GetSccLo, U32, Void, ) OPCODE(GetVccLo, U32, Void, ) OPCODE(GetVccHi, U32, Void, ) OPCODE(SetScc, Void, U1, ) OPCODE(SetExec, Void, U1, ) OPCODE(SetVcc, Void, U1, ) +OPCODE(SetSccLo, Void, U32, ) OPCODE(SetVccLo, Void, U32, ) OPCODE(SetVccHi, Void, U32, ) @@ -216,6 +218,7 @@ OPCODE(FPIsInf64, U1, F64, // Integer operations OPCODE(IAdd32, U32, U32, U32, ) OPCODE(IAdd64, U64, U64, U64, ) +OPCODE(IAddCary32, U32x2, U32, U32, ) OPCODE(ISub32, U32, U32, U32, ) OPCODE(ISub64, U64, U64, U64, ) OPCODE(IMul32, U32, U32, U32, ) diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 10e8a31a..d0389410 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -138,7 +138,8 @@ public: u32 Add(const BufferResource& desc) { const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) { return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + desc.dword_offset == existing.dword_offset && + desc.inline_cbuf == existing.inline_cbuf; })}; auto& buffer = buffer_resources[index]; ASSERT(buffer.stride == desc.stride && buffer.num_records == desc.num_records); @@ -219,20 +220,64 @@ SharpLocation TrackSharp(const IR::Inst* inst) { }; } +static constexpr size_t MaxUboSize = 65536; + +s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, AmdGpu::Buffer& cbuf) { + /** + * Assert for the following pattern + * s_getpc_b64 s[32:33] + * s_add_u32 s32, , s32 + * s_addc_u32 s33, 0, s33 + * s_mov_b32 s35, + * s_movk_i32 s34, + * buffer_load_format_xyz v[8:10], v1, s[32:35], 0 ... + **/ + IR::Inst* handle = inst.Arg(0).InstRecursive(); + IR::Inst* p0 = handle->Arg(0).InstRecursive(); + if (p0->GetOpcode() != IR::Opcode::IAdd32 || !p0->Arg(0).IsImmediate()) { + return -1; + } + IR::Inst* p1 = handle->Arg(1).InstRecursive(); + if (p1->GetOpcode() != IR::Opcode::IAdd32) { + return -1; + } + if (!handle->Arg(3).IsImmediate() || !handle->Arg(2).IsImmediate()) { + return -1; + } + // We have found this pattern. Build the sharp and assign a binding to it. + cbuf.raw0 = info.pgm_base + p0->Arg(0).U32() + p0->Arg(1).U32(); + cbuf.num_records = handle->Arg(2).U32(); + cbuf.raw11 = handle->Arg(3).U32(); + return descriptors.Add(BufferResource{ + .sgpr_base = std::numeric_limits::max(), + .dword_offset = 0, + .stride = cbuf.GetStride(), + .num_records = u32(cbuf.num_records), + .used_types = BufferDataType(inst), + .inline_cbuf = cbuf, + .is_storage = IsBufferStore(inst) || cbuf.GetSize() > MaxUboSize, + }); +} + void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { - static constexpr size_t MaxUboSize = 65536; - IR::Inst* producer = inst.Arg(0).InstRecursive(); - const auto sharp = TrackSharp(producer); - const auto buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); - const u32 binding = descriptors.Add(BufferResource{ - .sgpr_base = sharp.sgpr_base, - .dword_offset = sharp.dword_offset, - .stride = buffer.GetStride(), - .num_records = u32(buffer.num_records), - .used_types = BufferDataType(inst), - .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize, - }); + s32 binding{}; + AmdGpu::Buffer buffer; + if (binding = TryHandleInlineCbuf(inst, info, descriptors, buffer); binding == -1) { + IR::Inst* handle = inst.Arg(0).InstRecursive(); + IR::Inst* producer = handle->Arg(0).InstRecursive(); + const auto sharp = TrackSharp(producer); + buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); + binding = descriptors.Add(BufferResource{ + .sgpr_base = sharp.sgpr_base, + .dword_offset = sharp.dword_offset, + .stride = buffer.GetStride(), + .num_records = u32(buffer.num_records), + .used_types = BufferDataType(inst), + .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize, + }); + } + const auto inst_info = inst.Flags(); IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; // Replace handle with binding index in buffer resource list. @@ -240,7 +285,8 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable); if (inst_info.is_typed) { ASSERT(inst_info.nfmt == AmdGpu::NumberFormat::Float && - inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32); + (inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32 || + inst_info.dmft == AmdGpu::DataFormat::Format32_32_32)); } if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer || inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) { diff --git a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp index 9ee01953..8a24a68b 100644 --- a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp +++ b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp @@ -32,6 +32,7 @@ struct SccFlagTag : FlagTag {}; struct ExecFlagTag : FlagTag {}; struct VccFlagTag : FlagTag {}; struct VccLoTag : FlagTag {}; +struct SccLoTag : FlagTag {}; struct VccHiTag : FlagTag {}; struct GotoVariable : FlagTag { @@ -44,7 +45,7 @@ struct GotoVariable : FlagTag { }; using Variant = std::variant; + VccFlagTag, SccLoTag, VccLoTag, VccHiTag>; using ValueMap = std::unordered_map; struct DefTable { @@ -83,6 +84,13 @@ struct DefTable { exec_flag.insert_or_assign(block, value); } + const IR::Value& Def(IR::Block* block, SccLoTag) { + return scc_lo_flag[block]; + } + void SetDef(IR::Block* block, SccLoTag, const IR::Value& value) { + scc_lo_flag.insert_or_assign(block, value); + } + const IR::Value& Def(IR::Block* block, VccLoTag) { return vcc_lo_flag[block]; } @@ -108,6 +116,7 @@ struct DefTable { ValueMap scc_flag; ValueMap exec_flag; ValueMap vcc_flag; + ValueMap scc_lo_flag; ValueMap vcc_lo_flag; ValueMap vcc_hi_flag; }; @@ -124,6 +133,10 @@ IR::Opcode UndefOpcode(const VccLoTag&) noexcept { return IR::Opcode::UndefU32; } +IR::Opcode UndefOpcode(const SccLoTag&) noexcept { + return IR::Opcode::UndefU32; +} + IR::Opcode UndefOpcode(const VccHiTag&) noexcept { return IR::Opcode::UndefU32; } @@ -321,6 +334,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::SetVcc: pass.WriteVariable(VccFlagTag{}, block, inst.Arg(0)); break; + case IR::Opcode::SetSccLo: + pass.WriteVariable(SccLoTag{}, block, inst.Arg(0)); + break; case IR::Opcode::SetVccLo: pass.WriteVariable(VccLoTag{}, block, inst.Arg(0)); break; @@ -350,6 +366,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::GetVcc: inst.ReplaceUsesWith(pass.ReadVariable(VccFlagTag{}, block)); break; + case IR::Opcode::GetSccLo: + inst.ReplaceUsesWith(pass.ReadVariable(SccLoTag{}, block)); + break; case IR::Opcode::GetVccLo: inst.ReplaceUsesWith(pass.ReadVariable(VccLoTag{}, block)); break; diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index eeb5db63..cb24f0a2 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -4,7 +4,6 @@ #pragma once #include -#include #include #include "common/assert.h" #include "common/types.h" @@ -69,15 +68,18 @@ enum class VsOutput : u32 { }; using VsOutputMap = std::array; +struct Info; + struct BufferResource { u32 sgpr_base; u32 dword_offset; u32 stride; u32 num_records; IR::Type used_types; + AmdGpu::Buffer inline_cbuf; bool is_storage; - auto operator<=>(const BufferResource&) const = default; + constexpr AmdGpu::Buffer GetVsharp(const Info& info) const noexcept; }; using BufferResourceList = boost::container::static_vector; @@ -162,6 +164,7 @@ struct Info { std::span user_data; Stage stage; + uintptr_t pgm_base{}; u32 shared_memory_size{}; bool uses_group_quad{}; bool uses_shared_u8{}; @@ -180,6 +183,10 @@ struct Info { } }; +constexpr AmdGpu::Buffer BufferResource::GetVsharp(const Info& info) const noexcept { + return inline_cbuf ? inline_cbuf : info.ReadUd(sgpr_base, dword_offset); +} + } // namespace Shader template <> diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 608cc470..c59244e4 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -85,14 +85,14 @@ struct Liverpool { } settings; UserData user_data; - template - const T* Address() const { + template + const T Address() const { const uintptr_t addr = uintptr_t(address_hi) << 40 | uintptr_t(address_lo) << 8; - return reinterpret_cast(addr); + return reinterpret_cast(addr); } std::span Code() const { - const u32* code = Address(); + const u32* code = Address(); BinaryInfo bininfo; std::memcpy(&bininfo, code + (code[1] + 1) * 2, sizeof(bininfo)); const u32 num_dwords = bininfo.length / sizeof(u32); @@ -128,10 +128,10 @@ struct Liverpool { INSERT_PADDING_WORDS(0x2A); UserData user_data; - template - const T* Address() const { + template + const T Address() const { const uintptr_t addr = uintptr_t(address_hi) << 40 | uintptr_t(address_lo) << 8; - return reinterpret_cast(addr); + return reinterpret_cast(addr); } u32 SharedMemSize() const noexcept { @@ -140,7 +140,7 @@ struct Liverpool { } std::span Code() const { - const u32* code = Address(); + const u32* code = Address(); BinaryInfo bininfo; std::memcpy(&bininfo, code + (code[1] + 1) * 2, sizeof(bininfo)); const u32 num_dwords = bininfo.length / sizeof(u32); @@ -150,7 +150,7 @@ struct Liverpool { template static constexpr auto* GetBinaryInfo(const Shader& sh) { - const auto* code = sh.template Address(); + const auto* code = sh.template Address(); const auto* bininfo = std::bit_cast(code + (code[1] + 1) * 2); ASSERT_MSG(bininfo->Valid(), "Invalid shader binary header"); return bininfo; diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index 17686b7a..b2315b6e 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -22,6 +22,7 @@ enum class CompSwizzle : u32 { // Table 8.5 Buffer Resource Descriptor [Sea Islands Series Instruction Set Architecture] struct Buffer { union { + u64 raw0; BitField<0, 44, u64> base_address; BitField<48, 14, u64> stride; BitField<62, 1, u64> cache_swizzle; @@ -29,6 +30,7 @@ struct Buffer { }; u32 num_records; union { + u32 raw11; BitField<0, 3, u32> dst_sel_x; BitField<3, 3, u32> dst_sel_y; BitField<6, 3, u32> dst_sel_z; @@ -41,6 +43,14 @@ struct Buffer { BitField<23, 1, u32> add_tid_enable; }; + operator bool() const noexcept { + return base_address != 0; + } + + bool operator==(const Buffer& other) const noexcept { + return std::memcmp(this, &other, sizeof(Buffer)) == 0; + } + CompSwizzle GetSwizzle(u32 comp) const noexcept { return static_cast((dst_sel.Value() >> (comp * 3)) & 0x7); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 0f5a64ce..dbcbc343 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -91,7 +91,7 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s u32 binding{}; for (const auto& buffer : info.buffers) { - const auto vsharp = info.ReadUd(buffer.sgpr_base, buffer.dword_offset); + const auto vsharp = buffer.GetVsharp(info); const u32 size = vsharp.GetSize(); const VAddr address = vsharp.base_address.Value(); texture_cache.OnCpuWrite(address); diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 5d638a69..87fda51b 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -326,7 +326,7 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& for (const auto& stage : stages) { for (const auto& buffer : stage.buffers) { - const auto vsharp = stage.ReadUd(buffer.sgpr_base, buffer.dword_offset); + const auto vsharp = buffer.GetVsharp(stage); const VAddr address = vsharp.base_address.Value(); const u32 size = vsharp.GetSize(); const u32 offset = staging.Copy(address, size, diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 5ba9e6a1..58765a5f 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -198,7 +198,7 @@ void PipelineCache::RefreshGraphicsKey() { for (u32 i = 0; i < MaxShaderStages; i++) { auto* pgm = regs.ProgramForStage(i); - if (!pgm || !pgm->Address()) { + if (!pgm || !pgm->Address()) { key.stage_hashes[i] = 0; continue; } @@ -248,17 +248,14 @@ std::unique_ptr PipelineCache::CreateGraphicsPipeline() { DumpShader(code, hash, stage, "bin"); } - if (hash == 0xcafe3773 || hash == 0xc6602df2) { - return nullptr; - } - block_pool.ReleaseContents(); inst_pool.ReleaseContents(); // Recompile shader to IR. try { LOG_INFO(Render_Vulkan, "Compiling {} shader {:#x}", stage, hash); - const Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs); + Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs); + info.pgm_base = pgm->Address(); programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); // Compile IR to SPIR-V @@ -296,8 +293,9 @@ std::unique_ptr PipelineCache::CreateComputePipeline() { // Recompile shader to IR. try { LOG_INFO(Render_Vulkan, "Compiling cs shader {:#x}", compute_key); - const Shader::Info info = + Shader::Info info = MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs); + info.pgm_base = cs_pgm.Address(); auto program = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); // Compile IR to SPIR-V