From 1d1c88ad312456f4bc3c1eb00c2680c2137d095a Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Fri, 16 Aug 2024 20:05:37 +0300 Subject: [PATCH] control_flow_graph: Initial divergence handling (#434) * control_flow_graph: Initial divergence handling * cfg: Handle additional case * spirv: Handle tgid enable bits * clang format * spirv: Use proper format * translator: Add more instructions --- src/core/libraries/network/net.cpp | 2 +- .../spirv/emit_spirv_context_get_set.cpp | 10 +- .../backend/spirv/spirv_emit_context.cpp | 4 +- .../backend/spirv/spirv_emit_context.h | 4 +- .../frontend/control_flow_graph.cpp | 120 ++++++++++++++---- .../frontend/control_flow_graph.h | 19 +++ .../frontend/translate/scalar_alu.cpp | 2 + .../frontend/translate/translate.cpp | 12 +- .../frontend/translate/vector_memory.cpp | 5 + src/shader_recompiler/runtime_info.h | 1 + src/video_core/amdgpu/liverpool.h | 5 + src/video_core/buffer_cache/buffer_cache.cpp | 2 +- src/video_core/buffer_cache/buffer_cache.h | 1 - .../renderer_vulkan/vk_pipeline_cache.cpp | 3 + 14 files changed, 154 insertions(+), 36 deletions(-) diff --git a/src/core/libraries/network/net.cpp b/src/core/libraries/network/net.cpp index 958f9264..2c03dde3 100644 --- a/src/core/libraries/network/net.cpp +++ b/src/core/libraries/network/net.cpp @@ -10,7 +10,7 @@ #include #endif -#include +#include "common/assert.h" #include "common/logging/log.h" #include "core/libraries/error_codes.h" #include "core/libraries/libs.h" diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index bbf259fe..f933ed3c 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -323,7 +323,7 @@ static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offs static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) { auto& buffer = ctx.buffers[handle]; - const auto format = buffer.buffer.GetDataFmt(); + const auto format = buffer.dfmt; switch (format) { case AmdGpu::DataFormat::FormatInvalid: return ctx.f32_zero_value; @@ -348,7 +348,7 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com // uint index = address / 4; Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); - const u32 stride = buffer.buffer.GetStride(); + const u32 stride = buffer.stride; if (stride > 4) { const u32 index_offset = u32(AmdGpu::ComponentOffset(format, comp) / 32); if (index_offset > 0) { @@ -360,7 +360,7 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com const u32 bit_offset = AmdGpu::ComponentOffset(format, comp) % 32; const u32 bit_width = AmdGpu::ComponentBits(format, comp); - const auto num_format = buffer.buffer.GetNumberFmt(); + const auto num_format = buffer.nfmt; if (num_format == AmdGpu::NumberFormat::Float) { if (bit_width == 32) { return ctx.OpLoad(ctx.F32[1], ptr); @@ -486,8 +486,8 @@ static Id ConvertF32ToFormat(EmitContext& ctx, Id value, AmdGpu::NumberFormat fo template static void EmitStoreBufferFormatF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { auto& buffer = ctx.buffers[handle]; - const auto format = buffer.buffer.GetDataFmt(); - const auto num_format = buffer.buffer.GetNumberFmt(); + const auto format = buffer.dfmt; + const auto num_format = buffer.nfmt; switch (format) { case AmdGpu::DataFormat::FormatInvalid: diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 4b732ecd..8b99482f 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -363,7 +363,9 @@ void EmitContext::DefineBuffers() { .binding = binding++, .data_types = data_types, .pointer_type = pointer_type, - .buffer = buffer.GetVsharp(info), + .dfmt = buffer.dfmt, + .nfmt = buffer.nfmt, + .stride = buffer.GetVsharp(info).GetStride(), }); interfaces.push_back(id); i++; diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 81237a9a..768b591f 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -207,7 +207,9 @@ public: u32 binding; const VectorIds* data_types; Id pointer_type; - AmdGpu::Buffer buffer; + AmdGpu::DataFormat dfmt; + AmdGpu::NumberFormat nfmt; + u32 stride; }; u32& binding; diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp index 2925c05d..4f3ab86e 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.cpp +++ b/src/shader_recompiler/frontend/control_flow_graph.cpp @@ -35,15 +35,22 @@ static IR::Condition MakeCondition(Opcode opcode) { return IR::Condition::Execz; case Opcode::S_CBRANCH_EXECNZ: return IR::Condition::Execnz; + case Opcode::S_AND_SAVEEXEC_B64: + case Opcode::S_ANDN2_B64: + return IR::Condition::Execnz; default: return IR::Condition::True; } } +static constexpr size_t LabelReserveSize = 32; + CFG::CFG(Common::ObjectPool& block_pool_, std::span inst_list_) : block_pool{block_pool_}, inst_list{inst_list_} { index_to_pc.resize(inst_list.size() + 1); + labels.reserve(LabelReserveSize); EmitLabels(); + EmitDivergenceLabels(); EmitBlocks(); LinkBlocks(); } @@ -51,14 +58,7 @@ CFG::CFG(Common::ObjectPool& block_pool_, std::span inst_l void CFG::EmitLabels() { // Always set a label at entry point. u32 pc = 0; - labels.push_back(pc); - - const auto add_label = [this](u32 address) { - const auto it = std::ranges::find(labels, address); - if (it == labels.end()) { - labels.push_back(address); - } - }; + AddLabel(pc); // Iterate instruction list and add labels to branch targets. for (u32 i = 0; i < inst_list.size(); i++) { @@ -66,15 +66,15 @@ void CFG::EmitLabels() { const GcnInst inst = inst_list[i]; if (inst.IsUnconditionalBranch()) { const u32 target = inst.BranchTarget(pc); - add_label(target); + AddLabel(target); } else if (inst.IsConditionalBranch()) { const u32 true_label = inst.BranchTarget(pc); const u32 false_label = pc + inst.length; - add_label(true_label); - add_label(false_label); + AddLabel(true_label); + AddLabel(false_label); } else if (inst.opcode == Opcode::S_ENDPGM) { const u32 next_label = pc + inst.length; - add_label(next_label); + AddLabel(next_label); } pc += inst.length; } @@ -84,16 +84,70 @@ void CFG::EmitLabels() { std::ranges::sort(labels); } -void CFG::EmitBlocks() { - const auto get_index = [this](Label label) -> size_t { - if (label == 0) { - return 0ULL; - } - const auto it_index = std::ranges::lower_bound(index_to_pc, label); - ASSERT(it_index != index_to_pc.end() || label > index_to_pc.back()); - return std::distance(index_to_pc.begin(), it_index); +void CFG::EmitDivergenceLabels() { + const auto is_open_scope = [](const GcnInst& inst) { + // An open scope instruction is an instruction that modifies EXEC + // but also saves the previous value to restore later. This indicates + // we are entering a scope. + return inst.opcode == Opcode::S_AND_SAVEEXEC_B64 || + // While this instruction does not save EXEC it is often used paired + // with SAVEEXEC to mask the threads that didn't pass the condition + // of initial branch. + inst.opcode == Opcode::S_ANDN2_B64; + }; + const auto is_close_scope = [](const GcnInst& inst) { + // Closing an EXEC scope can be either a branch instruction + // (typical case when S_AND_SAVEEXEC_B64 is right before a branch) + // or by a move instruction to EXEC that restores the backup. + return (inst.opcode == Opcode::S_MOV_B64 && inst.dst[0].field == OperandField::ExecLo) || + // Sometimes compiler might insert instructions between the SAVEEXEC and the branch. + // Those instructions need to be wrapped in the condition as well so allow branch + // as end scope instruction. + inst.opcode == Opcode::S_CBRANCH_EXECZ || inst.opcode == Opcode::S_ANDN2_B64; }; + // Since we will be adding new labels, avoid iterating those as well. + const size_t end_size = labels.size(); + for (u32 l = 0; l < end_size; l++) { + const Label start = labels[l]; + // Stop if we reached end of existing labels. + if (l == end_size - 1) { + break; + } + const Label end = labels[l + 1]; + const size_t end_index = GetIndex(end); + + s32 curr_begin = -1; + for (size_t index = GetIndex(start); index < end_index; index++) { + const auto& inst = inst_list[index]; + if (is_close_scope(inst) && curr_begin != -1) { + // If there are no instructions inside scope don't do anything. + if (index - curr_begin == 1) { + curr_begin = -1; + continue; + } + // Add a label to the instruction right after the open scope call. + // It is the start of a new basic block. + const auto& save_inst = inst_list[curr_begin]; + const Label label = index_to_pc[curr_begin] + save_inst.length; + AddLabel(label); + // Add a label to the close scope instruction as well. + AddLabel(index_to_pc[index]); + // Reset scope begin. + curr_begin = -1; + } + // Mark a potential start of an exec scope. + if (is_open_scope(inst)) { + curr_begin = index; + } + } + } + + // Sort labels to make sure block insertion is correct. + std::ranges::sort(labels); +} + +void CFG::EmitBlocks() { for (auto it = labels.begin(); it != labels.end(); it++) { const Label start = *it; const auto next_it = std::next(it); @@ -102,8 +156,10 @@ void CFG::EmitBlocks() { // Last label is special. return; } + // The end label is the start instruction of next block. + // The end instruction of this block is the previous one. const Label end = *next_it; - const size_t end_index = get_index(end) - 1; + const size_t end_index = GetIndex(end) - 1; const auto& end_inst = inst_list[end_index]; // Insert block between the labels using the last instruction @@ -111,7 +167,7 @@ void CFG::EmitBlocks() { Block* block = block_pool.Create(); block->begin = start; block->end = end; - block->begin_index = get_index(start); + block->begin_index = GetIndex(start); block->end_index = end_index; block->end_inst = end_inst; block->cond = MakeCondition(end_inst.opcode); @@ -126,8 +182,26 @@ void CFG::LinkBlocks() { return &*it; }; - for (auto& block : blocks) { + for (auto it = blocks.begin(); it != blocks.end(); it++) { + auto& block = *it; const auto end_inst{block.end_inst}; + // Handle divergence block inserted here. + if (end_inst.opcode == Opcode::S_AND_SAVEEXEC_B64 || + end_inst.opcode == Opcode::S_ANDN2_B64) { + // Blocks are stored ordered by address in the set + auto next_it = std::next(it); + auto* target_block = &(*next_it); + ++target_block->num_predecessors; + block.branch_true = target_block; + + auto merge_it = std::next(next_it); + auto* merge_block = &(*merge_it); + ++merge_block->num_predecessors; + block.branch_false = merge_block; + block.end_class = EndClass::Branch; + continue; + } + // If the block doesn't end with a branch we simply // need to link with the next block. if (!end_inst.IsTerminateInstruction()) { diff --git a/src/shader_recompiler/frontend/control_flow_graph.h b/src/shader_recompiler/frontend/control_flow_graph.h index ebe614ee..d98d4b05 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.h +++ b/src/shader_recompiler/frontend/control_flow_graph.h @@ -3,11 +3,13 @@ #pragma once +#include #include #include #include #include +#include "common/assert.h" #include "common/object_pool.h" #include "common/types.h" #include "shader_recompiler/frontend/instruction.h" @@ -55,9 +57,26 @@ public: private: void EmitLabels(); + void EmitDivergenceLabels(); void EmitBlocks(); void LinkBlocks(); + void AddLabel(Label address) { + const auto it = std::ranges::find(labels, address); + if (it == labels.end()) { + labels.push_back(address); + } + }; + + size_t GetIndex(Label label) { + if (label == 0) { + return 0ULL; + } + const auto it_index = std::ranges::lower_bound(index_to_pc, label); + ASSERT(it_index != index_to_pc.end() || label > index_to_pc.back()); + return std::distance(index_to_pc.begin(), it_index); + }; + public: Common::ObjectPool& block_pool; std::span inst_list; diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 795b148d..812d93ba 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -29,6 +29,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) { return S_CMP(ConditionOp::LG, true, inst); case Opcode::S_CMP_GT_I32: return S_CMP(ConditionOp::GT, true, inst); + case Opcode::S_CMP_LE_I32: + return S_CMP(ConditionOp::LE, true, inst); case Opcode::S_CMP_GE_I32: return S_CMP(ConditionOp::GE, true, inst); case Opcode::S_CMP_EQ_I32: diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index d48e4def..4070560a 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -64,9 +64,15 @@ void Translator::EmitPrologue() { ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 1)); ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 2)); - ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 0)); - ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 1)); - ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 2)); + if (info.tgid_enable[0]) { + ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 0)); + } + if (info.tgid_enable[1]) { + ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 1)); + } + if (info.tgid_enable[2]) { + ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 2)); + } break; default: throw NotImplementedException("Unknown shader stage"); diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 63f6c3b4..01a549f4 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -91,6 +91,11 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { case Opcode::BUFFER_STORE_FORMAT_XYZW: return BUFFER_STORE_FORMAT(4, false, true, inst); + case Opcode::TBUFFER_STORE_FORMAT_X: + return BUFFER_STORE_FORMAT(1, true, true, inst); + case Opcode::TBUFFER_STORE_FORMAT_XYZ: + return BUFFER_STORE_FORMAT(3, true, true, inst); + case Opcode::BUFFER_STORE_DWORD: return BUFFER_STORE_FORMAT(1, false, false, inst); case Opcode::BUFFER_STORE_DWORDX2: diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 674d7c5f..b1eb6aea 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -180,6 +180,7 @@ struct Info { SamplerResourceList samplers; std::array workgroup_size{}; + std::array tgid_enable; u32 num_user_data; u32 num_input_vgprs; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index e888bdcc..92a24795 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -130,6 +130,7 @@ struct Liverpool { BitField<0, 6, u64> num_vgprs; BitField<6, 4, u64> num_sgprs; BitField<33, 5, u64> num_user_regs; + BitField<39, 3, u64> tgid_enable; BitField<47, 9, u64> lds_dwords; } settings; INSERT_PADDING_WORDS(1); @@ -148,6 +149,10 @@ struct Liverpool { return settings.lds_dwords.Value() * 128 * 4; } + bool IsTgidEnabled(u32 i) const noexcept { + return (settings.tgid_enable.Value() >> i) & 1; + } + std::span Code() const { const u32* code = Address(); BinaryInfo bininfo; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 2246807a..02d6b2ce 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -13,7 +13,7 @@ namespace VideoCore { -static constexpr size_t StagingBufferSize = 256_MB; +static constexpr size_t StagingBufferSize = 512_MB; static constexpr size_t UboStreamBufferSize = 64_MB; BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 33ea3f86..2bcc4f0e 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -3,7 +3,6 @@ #pragma once -#include #include #include #include diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 6974d850..c11705e7 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -93,6 +93,8 @@ Shader::Info MakeShaderInfo(Shader::Stage stage, std::span user_d info.num_user_data = cs_pgm.settings.num_user_regs; info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full, cs_pgm.num_thread_z.full}; + info.tgid_enable = {cs_pgm.IsTgidEnabled(0), cs_pgm.IsTgidEnabled(1), + cs_pgm.IsTgidEnabled(2)}; info.shared_memory_size = cs_pgm.SharedMemSize(); break; } @@ -324,6 +326,7 @@ std::unique_ptr PipelineCache::CreateComputePipeline() { Shader::Info info = MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs); info.pgm_base = cs_pgm.Address(); + info.pgm_hash = compute_key; auto program = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile);