From e8a2cb74749760f39ba3d7a2780e11e13cc60371 Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Tue, 2 Jul 2024 02:04:45 +0300 Subject: [PATCH] shader_recompiler: Implement shared memory load/store --- src/common/div_ceil.h | 25 +++ .../backend/spirv/emit_spirv_barriers.cpp | 36 ++++ .../backend/spirv/emit_spirv_convert.cpp | 6 +- .../backend/spirv/emit_spirv_instructions.h | 16 +- .../spirv/emit_spirv_shared_memory.cpp | 165 ++++++++++++++++++ .../backend/spirv/spirv_emit_context.cpp | 52 +++++- .../backend/spirv/spirv_emit_context.h | 15 ++ .../frontend/translate/data_share.cpp | 19 +- .../frontend/translate/translate.cpp | 18 ++ .../frontend/translate/translate.h | 1 + src/shader_recompiler/ir/ir_emitter.cpp | 37 ++-- src/shader_recompiler/ir/ir_emitter.h | 6 +- src/shader_recompiler/ir/opcodes.inc | 19 ++ .../ir/passes/shader_info_collection_pass.cpp | 10 ++ src/shader_recompiler/profile.h | 37 +--- src/shader_recompiler/runtime_info.h | 3 + src/video_core/amdgpu/liverpool.h | 6 + .../renderer_vulkan/vk_pipeline_cache.cpp | 2 + src/video_core/texture_cache/tile_manager.cpp | 7 +- 19 files changed, 410 insertions(+), 70 deletions(-) create mode 100755 src/common/div_ceil.h create mode 100644 src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp create mode 100644 src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp diff --git a/src/common/div_ceil.h b/src/common/div_ceil.h new file mode 100755 index 00000000..de275e76 --- /dev/null +++ b/src/common/div_ceil.h @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +namespace Common { + +/// Ceiled integer division. +template + requires std::is_integral_v && std::is_unsigned_v +[[nodiscard]] constexpr N DivCeil(N number, D divisor) { + return static_cast((static_cast(number) + divisor - 1) / divisor); +} + +/// Ceiled integer division with logarithmic divisor in base 2 +template + requires std::is_integral_v && std::is_unsigned_v +[[nodiscard]] constexpr N DivCeilLog2(N value, D alignment_log2) { + return static_cast((static_cast(value) + (D(1) << alignment_log2) - 1) >> alignment_log2); +} + +} // namespace Common diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp new file mode 100644 index 00000000..9c32f53f --- /dev/null +++ b/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#include "shader_recompiler/backend/spirv/spirv_emit_context.h" + +namespace Shader::Backend::SPIRV { +namespace { +void MemoryBarrier(EmitContext& ctx, spv::Scope scope) { + const auto semantics{ + spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory | + spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AtomicCounterMemory | + spv::MemorySemanticsMask::ImageMemory}; + ctx.OpMemoryBarrier(ctx.ConstU32(static_cast(scope)), ctx.ConstU32(static_cast(semantics))); +} +} // Anonymous namespace + +void EmitBarrier(EmitContext& ctx) { + const auto execution{spv::Scope::Workgroup}; + const auto memory{spv::Scope::Workgroup}; + const auto memory_semantics{spv::MemorySemanticsMask::AcquireRelease | + spv::MemorySemanticsMask::WorkgroupMemory}; + ctx.OpControlBarrier(ctx.ConstU32(static_cast(execution)), + ctx.ConstU32(static_cast(memory)), + ctx.ConstU32(static_cast(memory_semantics))); +} + +void EmitWorkgroupMemoryBarrier(EmitContext& ctx) { + MemoryBarrier(ctx, spv::Scope::Workgroup); +} + +void EmitDeviceMemoryBarrier(EmitContext& ctx) { + MemoryBarrier(ctx, spv::Scope::Device); +} + +} // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp index b033f91b..7c8012c9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp @@ -68,11 +68,7 @@ Id EmitConvertS32F16(EmitContext& ctx, Id value) { } Id EmitConvertS32F32(EmitContext& ctx, Id value) { - if (ctx.profile.has_broken_signed_operations) { - return ctx.OpBitcast(ctx.U32[1], ctx.OpConvertFToS(ctx.S32[1], value)); - } else { - return ctx.OpConvertFToS(ctx.U32[1], value); - } + return ctx.OpConvertFToS(ctx.U32[1], value); } Id EmitConvertS32F64(EmitContext& ctx, Id value) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 52791b00..705c555b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -43,6 +43,9 @@ void EmitSetVccHi(EmitContext& ctx); void EmitPrologue(EmitContext& ctx); void EmitEpilogue(EmitContext& ctx); void EmitDiscard(EmitContext& ctx); +void EmitBarrier(EmitContext& ctx); +void EmitWorkgroupMemoryBarrier(EmitContext& ctx); +void EmitDeviceMemoryBarrier(EmitContext& ctx); Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg); void EmitGetThreadBitScalarReg(EmitContext& ctx); void EmitSetThreadBitScalarReg(EmitContext& ctx); @@ -82,12 +85,13 @@ Id EmitUndefU8(EmitContext& ctx); Id EmitUndefU16(EmitContext& ctx); Id EmitUndefU32(EmitContext& ctx); Id EmitUndefU64(EmitContext& ctx); -Id EmitReadSharedU8(EmitContext& ctx, Id offset); -Id EmitReadSharedS8(EmitContext& ctx, Id offset); -Id EmitReadSharedU16(EmitContext& ctx, Id offset); -Id EmitReadSharedS16(EmitContext& ctx, Id offset); -Id EmitReadSharedU32(EmitContext& ctx, Id offset); -Id EmitReadSharedU64(EmitContext& ctx, Id offset); +Id EmitLoadSharedU8(EmitContext& ctx, Id offset); +Id EmitLoadSharedS8(EmitContext& ctx, Id offset); +Id EmitLoadSharedU16(EmitContext& ctx, Id offset); +Id EmitLoadSharedS16(EmitContext& ctx, Id offset); +Id EmitLoadSharedU32(EmitContext& ctx, Id offset); +Id EmitLoadSharedU64(EmitContext& ctx, Id offset); +Id EmitLoadSharedU128(EmitContext& ctx, Id offset); void EmitWriteSharedU8(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp new file mode 100644 index 00000000..9c9bfb93 --- /dev/null +++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp @@ -0,0 +1,165 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#include "shader_recompiler/backend/spirv/spirv_emit_context.h" + +namespace Shader::Backend::SPIRV { +namespace { +Id Pointer(EmitContext& ctx, Id pointer_type, Id array, Id offset, u32 shift) { + const Id shift_id{ctx.ConstU32(shift)}; + const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + return ctx.OpAccessChain(pointer_type, array, ctx.u32_zero_value, index); +} + +Id Word(EmitContext& ctx, Id offset) { + const Id shift_id{ctx.ConstU32(2U)}; + const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + return ctx.OpLoad(ctx.U32[1], pointer); +} + +std::pair ExtractArgs(EmitContext& ctx, Id offset, u32 mask, u32 count) { + const Id shift{ctx.OpShiftLeftLogical(ctx.U32[1], offset, ctx.ConstU32(3U))}; + const Id bit{ctx.OpBitwiseAnd(ctx.U32[1], shift, ctx.ConstU32(mask))}; + const Id count_id{ctx.ConstU32(count)}; + return {bit, count_id}; +} +} // Anonymous namespace + +Id EmitLoadSharedU8(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; + return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, pointer)); + } else { + const auto [bit, count]{ExtractArgs(ctx, offset, 24, 8)}; + return ctx.OpBitFieldUExtract(ctx.U32[1], Word(ctx, offset), bit, count); + } +} + +Id EmitLoadSharedS8(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; + return ctx.OpSConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, pointer)); + } else { + const auto [bit, count]{ExtractArgs(ctx, offset, 24, 8)}; + return ctx.OpBitFieldSExtract(ctx.U32[1], Word(ctx, offset), bit, count); + } +} + +Id EmitLoadSharedU16(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; + return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); + } else { + const auto [bit, count]{ExtractArgs(ctx, offset, 16, 16)}; + return ctx.OpBitFieldUExtract(ctx.U32[1], Word(ctx, offset), bit, count); + } +} + +Id EmitLoadSharedS16(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; + return ctx.OpSConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); + } else { + const auto [bit, count]{ExtractArgs(ctx, offset, 16, 16)}; + return ctx.OpBitFieldSExtract(ctx.U32[1], Word(ctx, offset), bit, count); + } +} + +Id EmitLoadSharedU32(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32, ctx.shared_memory_u32, offset, 2)}; + return ctx.OpLoad(ctx.U32[1], pointer); + } else { + return Word(ctx, offset); + } +} + +Id EmitLoadSharedU64(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32x2, ctx.shared_memory_u32x2, offset, 3)}; + return ctx.OpLoad(ctx.U32[2], pointer); + } else { + const Id shift_id{ctx.ConstU32(2U)}; + const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))}; + const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)}; + const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)}; + return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer), + ctx.OpLoad(ctx.U32[1], rhs_pointer)); + } +} + +Id EmitLoadSharedU128(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32x4, ctx.shared_memory_u32x4, offset, 4)}; + return ctx.OpLoad(ctx.U32[4], pointer); + } + const Id shift_id{ctx.ConstU32(2U)}; + const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + std::array values{}; + for (u32 i = 0; i < 4; ++i) { + const Id index{i == 0 ? base_index : ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(i))}; + const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + values[i] = ctx.OpLoad(ctx.U32[1], pointer); + } + return ctx.OpCompositeConstruct(ctx.U32[4], values); +} + +void EmitWriteSharedU8(EmitContext& ctx, Id offset, Id value) { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; + ctx.OpStore(pointer, ctx.OpUConvert(ctx.U8, value)); +} + +void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) { + const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; + ctx.OpStore(pointer, ctx.OpUConvert(ctx.U16, value)); +} + +void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) { + Id pointer{}; + if (ctx.profile.support_explicit_workgroup_layout) { + pointer = Pointer(ctx, ctx.shared_u32, ctx.shared_memory_u32, offset, 2); + } else { + const Id shift{ctx.ConstU32(2U)}; + const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; + pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset); + } + ctx.OpStore(pointer, value); +} + +void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32x2, ctx.shared_memory_u32x2, offset, 3)}; + ctx.OpStore(pointer, value); + return; + } + const Id shift{ctx.ConstU32(2U)}; + const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; + const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))}; + const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)}; + const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)}; + ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U)); + ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U)); +} + +void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32x4, ctx.shared_memory_u32x4, offset, 4)}; + ctx.OpStore(pointer, value); + return; + } + const Id shift{ctx.ConstU32(2U)}; + const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; + for (u32 i = 0; i < 4; ++i) { + const Id index{i == 0 ? base_index : ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(i))}; + const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + ctx.OpStore(pointer, ctx.OpCompositeExtract(ctx.U32[1], value, i)); + } +} + +} // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 330ee87d..27040813 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -3,6 +3,7 @@ #include #include +#include "common/div_ceil.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { @@ -41,8 +42,9 @@ EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& bin AddCapability(spv::Capability::Shader); DefineArithmeticTypes(); DefineInterfaces(program); - DefineBuffers(program.info); - DefineImagesAndSamplers(program.info); + DefineBuffers(info); + DefineImagesAndSamplers(info); + DefineSharedMemory(info); } EmitContext::~EmitContext() = default; @@ -358,4 +360,50 @@ void EmitContext::DefineImagesAndSamplers(const Info& info) { } } +void EmitContext::DefineSharedMemory(const Info& info) { + if (info.shared_memory_size == 0) { + return; + } + const auto make{[&](Id element_type, u32 element_size) { + const u32 num_elements{Common::DivCeil(info.shared_memory_size, element_size)}; + const Id array_type{TypeArray(element_type, ConstU32(num_elements))}; + Decorate(array_type, spv::Decoration::ArrayStride, element_size); + + const Id struct_type{TypeStruct(array_type)}; + MemberDecorate(struct_type, 0U, spv::Decoration::Offset, 0U); + Decorate(struct_type, spv::Decoration::Block); + + const Id pointer{TypePointer(spv::StorageClass::Workgroup, struct_type)}; + const Id element_pointer{TypePointer(spv::StorageClass::Workgroup, element_type)}; + const Id variable{AddGlobalVariable(pointer, spv::StorageClass::Workgroup)}; + Decorate(variable, spv::Decoration::Aliased); + interfaces.push_back(variable); + + return std::make_tuple(variable, element_pointer, pointer); + }}; + if (profile.support_explicit_workgroup_layout) { + AddExtension("SPV_KHR_workgroup_memory_explicit_layout"); + AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR); + if (info.uses_shared_u8) { + AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout8BitAccessKHR); + std::tie(shared_memory_u8, shared_u8, std::ignore) = make(U8, 1); + } + if (info.uses_shared_u16) { + AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR); + std::tie(shared_memory_u16, shared_u16, std::ignore) = make(U16, 2); + } + std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make(U32[1], 4); + std::tie(shared_memory_u32x2, shared_u32x2, std::ignore) = make(U32[2], 8); + std::tie(shared_memory_u32x4, shared_u32x4, std::ignore) = make(U32[4], 16); + return; + } + const u32 num_elements{Common::DivCeil(info.shared_memory_size, 4U)}; + const Id type{TypeArray(U32[1], ConstU32(num_elements))}; + shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type); + + shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]); + shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup); + interfaces.push_back(shared_memory_u32); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 335d40c4..08e2bf8b 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -147,6 +147,12 @@ public: Id u32_zero_value{}; Id f32_zero_value{}; + Id shared_u8{}; + Id shared_u16{}; + Id shared_u32{}; + Id shared_u32x2{}; + Id shared_u32x4{}; + Id input_u32{}; Id input_f32{}; Id input_s32{}; @@ -169,6 +175,14 @@ public: Id subgroup_local_invocation_id{}; Id image_u32{}; + Id shared_memory_u8{}; + Id shared_memory_u16{}; + Id shared_memory_u32{}; + Id shared_memory_u32x2{}; + Id shared_memory_u32x4{}; + + Id shared_memory_u32_type{}; + struct TextureDefinition { Id id; Id sampled_type; @@ -206,6 +220,7 @@ private: void DefineOutputs(const Info& info); void DefineBuffers(const Info& info); void DefineImagesAndSamplers(const Info& info); + void DefineSharedMemory(const Info& info); SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id); }; diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 99883015..1c782997 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -22,16 +22,18 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; const IR::VectorReg dst_reg{inst.dst[0].code}; if (is_pair) { + // Pair loads are either 32 or 64-bit. We assume 32-bit for now. + ASSERT(bit_size == 32); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); - ir.SetVectorReg(dst_reg, ir.ReadShared(32, is_signed, addr0)); + ir.SetVectorReg(dst_reg, IR::U32{ir.LoadShared(32, is_signed, addr0)}); const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); - ir.SetVectorReg(dst_reg + 1, ir.ReadShared(32, is_signed, addr1)); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.LoadShared(32, is_signed, addr1)}); } else if (bit_size == 64) { - const IR::Value data = ir.UnpackUint2x32(ir.ReadShared(bit_size, is_signed, addr)); + const IR::Value data = ir.LoadShared(bit_size, is_signed, addr); ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)}); ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)}); } else { - const IR::U32 data = ir.ReadShared(bit_size, is_signed, addr); + const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr)}; ir.SetVectorReg(dst_reg, data); } } @@ -41,17 +43,22 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; if (is_pair) { + ASSERT(bit_size == 32); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); ir.WriteShared(32, ir.GetVectorReg(data0), addr0); const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); ir.WriteShared(32, ir.GetVectorReg(data1), addr1); } else if (bit_size == 64) { - const IR::U64 data = ir.PackUint2x32( - ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1))); + const IR::Value data = ir.CompositeConstruct(ir.GetVectorReg(data0), + ir.GetVectorReg(data0 + 1)); ir.WriteShared(bit_size, data, addr); } else { ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr); } } +void Translator::S_BARRIER() { + ir.Barrier(); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 1a5f9a49..28aee56a 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -466,6 +466,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_CMP_LT_U32: translator.S_CMP(ConditionOp::LT, false, inst); break; + case Opcode::S_CMP_LE_U32: + translator.S_CMP(ConditionOp::LE, false, inst); + break; case Opcode::S_CMP_LG_U32: translator.S_CMP(ConditionOp::LG, false, inst); break; @@ -816,9 +819,24 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::IMAGE_GET_RESINFO: translator.IMAGE_GET_RESINFO(inst); break; + case Opcode::S_BARRIER: + translator.S_BARRIER(); + break; case Opcode::S_TTRACEDATA: LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!"); break; + case Opcode::DS_READ_B32: + translator.DS_READ(32, false, false, inst); + break; + case Opcode::DS_READ2_B32: + translator.DS_READ(32, false, true, inst); + break; + case Opcode::DS_WRITE_B32: + translator.DS_WRITE(32, false, false, inst); + break; + case Opcode::DS_WRITE2_B32: + translator.DS_WRITE(32, false, true, inst); + break; case Opcode::S_NOP: case Opcode::S_CBRANCH_EXECZ: case Opcode::S_CBRANCH_SCC0: diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index c86712a2..bcc65f9a 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -165,6 +165,7 @@ public: void DS_SWIZZLE_B32(const GcnInst& inst); void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); + void S_BARRIER(); // MIMG void IMAGE_GET_RESINFO(const GcnInst& inst); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index b54f32d4..8c982c98 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -115,6 +115,18 @@ void IREmitter::Discard() { Inst(Opcode::Discard); } +void IREmitter::Barrier() { + Inst(Opcode::Barrier); +} + +void IREmitter::WorkgroupMemoryBarrier() { + Inst(Opcode::WorkgroupMemoryBarrier); +} + +void IREmitter::DeviceMemoryBarrier() { + Inst(Opcode::DeviceMemoryBarrier); +} + U32 IREmitter::GetUserData(IR::ScalarReg reg) { return Inst(Opcode::GetUserData, reg); } @@ -240,22 +252,24 @@ void IREmitter::SetAttribute(IR::Attribute attribute, const F32& value, u32 comp Inst(Opcode::SetAttribute, attribute, value, Imm32(comp)); } -U32U64 IREmitter::ReadShared(int bit_size, bool is_signed, const U32& offset) { - /*switch (bit_size) { +Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { + switch (bit_size) { case 8: - return Inst(is_signed ? Opcode::ReadSharedS8 : Opcode::ReadSharedU8, offset); + return Inst(is_signed ? Opcode::LoadSharedS8 : Opcode::LoadSharedU8, offset); case 16: - return Inst(is_signed ? Opcode::ReadSharedS16 : Opcode::ReadSharedU16, offset); + return Inst(is_signed ? Opcode::LoadSharedS16 : Opcode::LoadSharedU16, offset); case 32: - return Inst(Opcode::ReadSharedU32, offset); + return Inst(Opcode::LoadSharedU32, offset); case 64: - return Inst(Opcode::ReadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, offset); + case 128: + return Inst(Opcode::LoadSharedU128, offset); } - UNREACHABLE_MSG("Invalid bit size {}", bit_size);*/ + UNREACHABLE_MSG("Invalid bit size {}", bit_size); } void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) { - /*switch (bit_size) { + switch (bit_size) { case 8: Inst(Opcode::WriteSharedU8, offset, value); break; @@ -268,9 +282,12 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) case 64: Inst(Opcode::WriteSharedU64, offset, value); break; + case 128: + Inst(Opcode::WriteSharedU128, offset, value); + break; default: - UNREACHABLE_MSG("Invalid bit size {}", bit_size); - }*/ + throw InvalidArgument("Invalid bit size {}", bit_size); + } } U32 IREmitter::ReadConst(const Value& base, const U32& offset) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 2bf10ce8..745e0f4f 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -43,6 +43,10 @@ public: void Epilogue(); void Discard(); + void Barrier(); + void WorkgroupMemoryBarrier(); + void DeviceMemoryBarrier(); + [[nodiscard]] U32 GetUserData(IR::ScalarReg reg); [[nodiscard]] U1 GetThreadBitScalarReg(IR::ScalarReg reg); void SetThreadBitScalarReg(IR::ScalarReg reg, const U1& value); @@ -74,7 +78,7 @@ public: [[nodiscard]] U32 GetAttributeU32(Attribute attribute, u32 comp = 0); void SetAttribute(Attribute attribute, const F32& value, u32 comp = 0); - [[nodiscard]] U32U64 ReadShared(int bit_size, bool is_signed, const U32& offset); + [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset); void WriteShared(int bit_size, const Value& value, const U32& offset); [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 559443b8..32cdc326 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -19,6 +19,25 @@ OPCODE(ReadConst, U32, U32x OPCODE(ReadConstBuffer, F32, Opaque, U32, ) OPCODE(ReadConstBufferU32, U32, Opaque, U32, ) +// Barriers +OPCODE(Barrier, Void, ) +OPCODE(WorkgroupMemoryBarrier, Void, ) +OPCODE(DeviceMemoryBarrier, Void, ) + +// Shared memory operations +OPCODE(LoadSharedU8, U32, U32, ) +OPCODE(LoadSharedS8, U32, U32, ) +OPCODE(LoadSharedU16, U32, U32, ) +OPCODE(LoadSharedS16, U32, U32, ) +OPCODE(LoadSharedU32, U32, U32, ) +OPCODE(LoadSharedU64, U32x2, U32, ) +OPCODE(LoadSharedU128, U32x4, U32, ) +OPCODE(WriteSharedU8, Void, U32, U32, ) +OPCODE(WriteSharedU16, Void, U32, U32, ) +OPCODE(WriteSharedU32, Void, U32, U32, ) +OPCODE(WriteSharedU64, Void, U32, U32x2, ) +OPCODE(WriteSharedU128, Void, U32, U32x4, ) + // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) OPCODE(GetThreadBitScalarReg, U1, ScalarReg, ) diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index ac1cb060..25d8b937 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -16,6 +16,16 @@ void Visit(Info& info, IR::Inst& inst) { info.stores.Set(inst.Arg(0).Attribute(), inst.Arg(2).U32()); break; } + case IR::Opcode::LoadSharedS8: + case IR::Opcode::LoadSharedU8: + case IR::Opcode::WriteSharedU8: + info.uses_shared_u8 = true; + break; + case IR::Opcode::LoadSharedS16: + case IR::Opcode::LoadSharedU16: + case IR::Opcode::WriteSharedU16: + info.uses_shared_u16 = true; + break; case IR::Opcode::QuadShuffle: info.uses_group_quad = true; break; diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index f3c33c81..54b34730 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -26,44 +26,9 @@ struct Profile { bool support_fp32_signed_zero_nan_preserve{}; bool support_fp64_signed_zero_nan_preserve{}; bool support_explicit_workgroup_layout{}; - bool support_vote{}; - bool support_viewport_mask{}; - bool support_typeless_image_loads{}; - bool support_derivative_control{}; - bool support_geometry_shader_passthrough{}; - bool support_native_ndc{}; - bool support_scaled_attributes{}; - bool support_multi_viewport{}; - bool support_geometry_streams{}; - - bool warp_size_potentially_larger_than_guest{}; - - bool lower_left_origin_mode{}; - /// Fragment outputs have to be declared even if they are not written to avoid undefined values. - /// See Ori and the Blind Forest's main menu for reference. - bool need_declared_frag_colors{}; - /// Prevents fast math optimizations that may cause inaccuracies - bool need_fastmath_off{}; - - /// OpFClamp is broken and OpFMax + OpFMin should be used instead bool has_broken_spirv_clamp{}; - /// The Position builtin needs to be wrapped in a struct when used as an input - bool has_broken_spirv_position_input{}; - /// Offset image operands with an unsigned type do not work - bool has_broken_unsigned_image_offsets{}; - /// Signed instructions with unsigned data types are misinterpreted - bool has_broken_signed_operations{}; - /// Float controls break when fp16 is enabled - bool has_broken_fp16_float_controls{}; - /// Ignores SPIR-V ordered vs unordered using GLSL semantics - bool ignore_nan_fp_comparisons{}; - - /// Maxwell and earlier nVidia architectures have broken robust support - bool has_broken_robust{}; - + bool lower_left_origin_mode{}; u64 min_ssbo_alignment{}; - - u32 max_user_clip_distances{}; }; } // namespace Shader diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 21f3602f..9d893002 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -134,7 +134,10 @@ struct Info { std::span user_data; Stage stage; + u32 shared_memory_size{}; bool uses_group_quad{}; + bool uses_shared_u8{}; + bool uses_shared_u16{}; bool translation_failed{}; // indicates that shader has unsupported instructions template diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 2233fa0c..2552cc61 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -121,6 +121,7 @@ struct Liverpool { BitField<0, 6, u64> num_vgprs; BitField<6, 4, u64> num_sgprs; BitField<33, 5, u64> num_user_regs; + BitField<47, 9, u64> lds_dwords; } settings; INSERT_PADDING_WORDS(1); u32 resource_limits; @@ -133,6 +134,11 @@ struct Liverpool { return reinterpret_cast(addr); } + u32 SharedMemSize() const noexcept { + // lds_dwords is in units of 128 dwords. We return bytes. + return settings.lds_dwords.Value() * 128 * 4; + } + std::span Code() const { const u32* code = Address(); BinaryInfo bininfo; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 8c1170ca..ab3585d3 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -45,6 +45,7 @@ Shader::Info MakeShaderInfo(Shader::Stage stage, std::span user_d info.num_user_data = cs_pgm.settings.num_user_regs; info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full, cs_pgm.num_thread_z.full}; + info.shared_memory_size = cs_pgm.SharedMemSize(); break; } default: @@ -60,6 +61,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, pipeline_cache = instance.GetDevice().createPipelineCacheUnique({}); profile = Shader::Profile{ .supported_spirv = 0x00010600U, + .support_explicit_workgroup_layout = true, }; } diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index e9818d75..0b194dca 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -179,19 +179,18 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) { case vk::Format::eR8Unorm: return vk::Format::eR8Uint; case vk::Format::eR8G8Unorm: + case vk::Format::eR16Sfloat: return vk::Format::eR8G8Uint; case vk::Format::eR8G8B8A8Srgb: - [[fallthrough]]; case vk::Format::eB8G8R8A8Srgb: - [[fallthrough]]; case vk::Format::eB8G8R8A8Unorm: - [[fallthrough]]; case vk::Format::eR8G8B8A8Unorm: + case vk::Format::eR32Uint: return vk::Format::eR32Uint; case vk::Format::eBc1RgbaUnormBlock: + case vk::Format::eR32G32Sfloat: return vk::Format::eR32G32Uint; case vk::Format::eBc3SrgbBlock: - [[fallthrough]]; case vk::Format::eBc3UnormBlock: case vk::Format::eBc7SrgbBlock: case vk::Format::eBc7UnormBlock: