From 66e96dd94439be3dd4684d7a983a7b10745d7bc7 Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Thu, 29 Aug 2024 19:29:54 +0300 Subject: [PATCH] video_core: Account of runtime state changes when compiling shaders (#575) * video_core: Compile shader permutations * spirv: Only specific storage image format for atomics * ir: Avoid cube coord patching for storage image * spirv: Fix default attributes * data_share: Add more instructions * video_core: Query storage flag with runtime state * kernel: Use std::list for semaphore * video_core: Use texture buffers for untyped format load/store * buffer_cache: Limit view usage * vk_pipeline_cache: Fix invalid iterator * image_view: Reduce log spam when alpha=1 in storage swizzle * video_core: More features and proper spirv feature detection * video_core: Attempt no2 for specialization * spirv: Remove conflict * vk_shader_cache: Small cleanup --- CMakeLists.txt | 2 + src/core/libraries/avplayer/avplayer.cpp | 2 +- .../libraries/kernel/threads/semaphore.cpp | 33 +- .../backend/spirv/emit_spirv.cpp | 41 ++- .../backend/spirv/emit_spirv.h | 2 +- .../spirv/emit_spirv_context_get_set.cpp | 269 +---------------- .../backend/spirv/spirv_emit_context.cpp | 156 ++++++---- .../backend/spirv/spirv_emit_context.h | 17 +- .../frontend/translate/data_share.cpp | 35 ++- .../frontend/translate/export.cpp | 2 - .../frontend/translate/translate.cpp | 4 +- .../frontend/translate/translate.h | 12 +- .../frontend/translate/vector_alu.cpp | 14 +- .../frontend/translate/vector_memory.cpp | 165 ++++++---- src/shader_recompiler/ir/ir_emitter.cpp | 37 +-- src/shader_recompiler/ir/ir_emitter.h | 6 +- src/shader_recompiler/ir/microinstruction.cpp | 3 - src/shader_recompiler/ir/opcodes.inc | 10 +- .../ir/passes/resource_tracking_pass.cpp | 190 +++++------- .../ir/passes/shader_info_collection_pass.cpp | 9 + src/shader_recompiler/ir/program.h | 4 +- src/shader_recompiler/ir/reg.h | 3 - src/shader_recompiler/recompiler.cpp | 5 +- src/shader_recompiler/recompiler.h | 2 +- src/shader_recompiler/runtime_info.h | 53 +++- src/video_core/amdgpu/liverpool.h | 2 +- src/video_core/amdgpu/pixel_format.h | 4 + src/video_core/amdgpu/resource.h | 5 + src/video_core/buffer_cache/buffer.cpp | 39 +-- src/video_core/buffer_cache/buffer.h | 15 +- src/video_core/buffer_cache/buffer_cache.cpp | 7 +- .../renderer_vulkan/vk_compute_pipeline.cpp | 106 +++++-- .../renderer_vulkan/vk_compute_pipeline.h | 11 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 91 ++++-- .../renderer_vulkan/vk_graphics_pipeline.h | 3 +- .../renderer_vulkan/vk_instance.cpp | 10 +- src/video_core/renderer_vulkan/vk_instance.h | 5 + .../renderer_vulkan/vk_pipeline_cache.cpp | 285 +++--------------- .../renderer_vulkan/vk_pipeline_cache.h | 19 +- .../renderer_vulkan/vk_shader_cache.cpp | 192 ++++++++++++ .../renderer_vulkan/vk_shader_cache.h | 156 ++++++++++ src/video_core/texture_cache/image_view.cpp | 6 +- src/video_core/texture_cache/tile_manager.cpp | 2 + 43 files changed, 1058 insertions(+), 976 deletions(-) create mode 100644 src/video_core/renderer_vulkan/vk_shader_cache.cpp create mode 100644 src/video_core/renderer_vulkan/vk_shader_cache.h diff --git a/CMakeLists.txt b/CMakeLists.txt index dfc6528d..d2d1c170 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -521,6 +521,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/renderer_vulkan/vk_resource_pool.h src/video_core/renderer_vulkan/vk_scheduler.cpp src/video_core/renderer_vulkan/vk_scheduler.h + 
src/video_core/renderer_vulkan/vk_shader_cache.cpp + src/video_core/renderer_vulkan/vk_shader_cache.h src/video_core/renderer_vulkan/vk_shader_util.cpp src/video_core/renderer_vulkan/vk_shader_util.h src/video_core/renderer_vulkan/vk_swapchain.cpp diff --git a/src/core/libraries/avplayer/avplayer.cpp b/src/core/libraries/avplayer/avplayer.cpp index 406583a8..23e1e987 100644 --- a/src/core/libraries/avplayer/avplayer.cpp +++ b/src/core/libraries/avplayer/avplayer.cpp @@ -325,4 +325,4 @@ void RegisterlibSceAvPlayer(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("yN7Jhuv8g24", "libSceAvPlayer", 1, "libSceAvPlayer", 1, 0, sceAvPlayerVprintf); }; -} // namespace Libraries::AvPlayer \ No newline at end of file +} // namespace Libraries::AvPlayer diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp index 5304dc57..e2f43803 100644 --- a/src/core/libraries/kernel/threads/semaphore.cpp +++ b/src/core/libraries/kernel/threads/semaphore.cpp @@ -2,9 +2,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include -#include -#include #include #include "common/assert.h" #include "common/logging/log.h" @@ -13,9 +12,6 @@ namespace Libraries::Kernel { -using ListBaseHook = - boost::intrusive::list_base_hook>; - class Semaphore { public: Semaphore(s32 init_count, s32 max_count, std::string_view name, bool is_fifo) @@ -37,7 +33,7 @@ public: // Create waiting thread object and add it into the list of waiters. WaitingThread waiter{need_count, is_fifo}; - AddWaiter(waiter); + AddWaiter(&waiter); // Perform the wait. return waiter.Wait(lk, timeout); @@ -52,14 +48,14 @@ public: // Wake up threads in order of priority. for (auto it = wait_list.begin(); it != wait_list.end();) { - auto& waiter = *it; - if (waiter.need_count > token_count) { + auto* waiter = *it; + if (waiter->need_count > token_count) { it++; continue; } it = wait_list.erase(it); - token_count -= waiter.need_count; - waiter.cv.notify_one(); + token_count -= waiter->need_count; + waiter->cv.notify_one(); } return true; @@ -70,9 +66,9 @@ public: if (num_waiters) { *num_waiters = wait_list.size(); } - for (auto& waiter : wait_list) { - waiter.was_cancled = true; - waiter.cv.notify_one(); + for (auto* waiter : wait_list) { + waiter->was_cancled = true; + waiter->cv.notify_one(); } wait_list.clear(); token_count = set_count < 0 ? init_count : set_count; @@ -80,7 +76,7 @@ public: } public: - struct WaitingThread : public ListBaseHook { + struct WaitingThread { std::condition_variable cv; u32 priority; s32 need_count; @@ -132,7 +128,7 @@ public: } }; - void AddWaiter(WaitingThread& waiter) { + void AddWaiter(WaitingThread* waiter) { // Insert at the end of the list for FIFO order. if (is_fifo) { wait_list.push_back(waiter); @@ -140,16 +136,13 @@ public: } // Find the first with priority less then us and insert right before it. 
auto it = wait_list.begin(); - while (it != wait_list.end() && it->priority > waiter.priority) { + while (it != wait_list.end() && (*it)->priority > waiter->priority) { it++; } wait_list.insert(it, waiter); } - using WaitingThreads = - boost::intrusive::list, - boost::intrusive::constant_time_size>; - WaitingThreads wait_list; + std::list wait_list; std::string name; std::atomic token_count; std::mutex mutex; diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 09a9fd62..161d4ec9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -99,7 +99,7 @@ Id TypeId(const EmitContext& ctx, IR::Type type) { } } -void Traverse(EmitContext& ctx, IR::Program& program) { +void Traverse(EmitContext& ctx, const IR::Program& program) { IR::Block* current_block{}; for (const IR::AbstractSyntaxNode& node : program.syntax_list) { switch (node.type) { @@ -162,7 +162,7 @@ void Traverse(EmitContext& ctx, IR::Program& program) { } } -Id DefineMain(EmitContext& ctx, IR::Program& program) { +Id DefineMain(EmitContext& ctx, const IR::Program& program) { const Id void_function{ctx.TypeFunction(ctx.void_id)}; const Id main{ctx.OpFunction(ctx.void_id, spv::FunctionControlMask::MaskNone, void_function)}; for (IR::Block* const block : program.blocks) { @@ -185,8 +185,27 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { ctx.AddCapability(spv::Capability::Int16); } ctx.AddCapability(spv::Capability::Int64); - if (info.has_storage_images) { + if (info.has_storage_images || info.has_image_buffers) { ctx.AddCapability(spv::Capability::StorageImageExtendedFormats); + ctx.AddCapability(spv::Capability::StorageImageWriteWithoutFormat); + } + if (info.has_texel_buffers) { + ctx.AddCapability(spv::Capability::SampledBuffer); + } + if (info.has_image_buffers) { + ctx.AddCapability(spv::Capability::ImageBuffer); + } + if (info.has_image_gather) { + ctx.AddCapability(spv::Capability::ImageGatherExtended); + } + if (info.has_image_query) { + ctx.AddCapability(spv::Capability::ImageQuery); + } + if (info.uses_lane_id) { + ctx.AddCapability(spv::Capability::GroupNonUniform); + } + if (info.uses_group_quad) { + ctx.AddCapability(spv::Capability::GroupNonUniformQuad); } switch (program.info.stage) { case Stage::Compute: { @@ -206,19 +225,9 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { } else { ctx.AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft); } - ctx.AddCapability(spv::Capability::GroupNonUniform); - if (info.uses_group_quad) { - ctx.AddCapability(spv::Capability::GroupNonUniformQuad); - } if (info.has_discard) { ctx.AddCapability(spv::Capability::DemoteToHelperInvocationEXT); } - if (info.has_image_gather) { - ctx.AddCapability(spv::Capability::ImageGatherExtended); - } - if (info.has_image_query) { - ctx.AddCapability(spv::Capability::ImageQuery); - } if (info.stores.Get(IR::Attribute::Depth)) { ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); } @@ -229,7 +238,7 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { ctx.AddEntryPoint(execution_model, main, "main", interfaces); } -void PatchPhiNodes(IR::Program& program, EmitContext& ctx) { +void PatchPhiNodes(const IR::Program& program, EmitContext& ctx) { auto inst{program.blocks.front()->begin()}; size_t block_index{0}; ctx.PatchDeferredPhi([&](size_t phi_arg) { @@ -248,8 +257,8 @@ void PatchPhiNodes(IR::Program& program, 
EmitContext& ctx) { } } // Anonymous namespace -std::vector EmitSPIRV(const Profile& profile, IR::Program& program, u32& binding) { - EmitContext ctx{profile, program, binding}; +std::vector EmitSPIRV(const Profile& profile, const IR::Program& program, u32& binding) { + EmitContext ctx{profile, program.info, binding}; const Id main{DefineMain(ctx, program)}; DefineEntryPoint(program, ctx, main); if (program.info.stage == Stage::Vertex) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.h b/src/shader_recompiler/backend/spirv/emit_spirv.h index e513975b..4c862185 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv.h @@ -9,7 +9,7 @@ namespace Shader::Backend::SPIRV { -[[nodiscard]] std::vector EmitSPIRV(const Profile& profile, IR::Program& program, +[[nodiscard]] std::vector EmitSPIRV(const Profile& profile, const IR::Program& program, u32& binding); } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 03fc52ff..4566439c 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -262,171 +262,15 @@ Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { return EmitLoadBufferF32xN<4>(ctx, handle, address); } -static bool IsSignedInteger(AmdGpu::NumberFormat format) { - switch (format) { - case AmdGpu::NumberFormat::Unorm: - case AmdGpu::NumberFormat::Uscaled: - case AmdGpu::NumberFormat::Uint: - return false; - case AmdGpu::NumberFormat::Snorm: - case AmdGpu::NumberFormat::Sscaled: - case AmdGpu::NumberFormat::Sint: - case AmdGpu::NumberFormat::SnormNz: - return true; - case AmdGpu::NumberFormat::Float: - default: - UNREACHABLE(); - } -} - -static u32 UXBitsMax(u32 bit_width) { - return (1u << bit_width) - 1u; -} - -static u32 SXBitsMax(u32 bit_width) { - return (1u << (bit_width - 1u)) - 1u; -} - -static Id ConvertValue(EmitContext& ctx, Id value, AmdGpu::NumberFormat format, u32 bit_width) { - switch (format) { - case AmdGpu::NumberFormat::Unorm: - return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(UXBitsMax(bit_width)))); - case AmdGpu::NumberFormat::Snorm: - return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width)))); - case AmdGpu::NumberFormat::SnormNz: - // (x * 2 + 1) / (Format::SMAX * 2) - value = ctx.OpFMul(ctx.F32[1], value, ctx.ConstF32(2.f)); - value = ctx.OpFAdd(ctx.F32[1], value, ctx.ConstF32(1.f)); - return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width) * 2))); - case AmdGpu::NumberFormat::Uscaled: - case AmdGpu::NumberFormat::Sscaled: - case AmdGpu::NumberFormat::Uint: - case AmdGpu::NumberFormat::Sint: - case AmdGpu::NumberFormat::Float: - return value; - default: - UNREACHABLE_MSG("Unsupported number format for conversion: {}", - magic_enum::enum_name(format)); - } -} - -static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offset) { - Id comp_offset = ctx.ConstU32(bit_offset); - if (stride < 4) { - // comp_offset += (address % 4) * 8; - const Id byte_offset = ctx.OpUMod(ctx.U32[1], address, ctx.ConstU32(4u)); - const Id bit_offset = ctx.OpShiftLeftLogical(ctx.U32[1], byte_offset, ctx.ConstU32(3u)); - comp_offset = ctx.OpIAdd(ctx.U32[1], comp_offset, bit_offset); - } - return comp_offset; -} - -static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) { 
- auto& buffer = ctx.buffers[handle]; - const auto format = buffer.dfmt; - switch (format) { - case AmdGpu::DataFormat::FormatInvalid: - return ctx.f32_zero_value; - case AmdGpu::DataFormat::Format8: - case AmdGpu::DataFormat::Format16: - case AmdGpu::DataFormat::Format32: - case AmdGpu::DataFormat::Format8_8: - case AmdGpu::DataFormat::Format16_16: - case AmdGpu::DataFormat::Format10_11_11: - case AmdGpu::DataFormat::Format11_11_10: - case AmdGpu::DataFormat::Format10_10_10_2: - case AmdGpu::DataFormat::Format2_10_10_10: - case AmdGpu::DataFormat::Format8_8_8_8: - case AmdGpu::DataFormat::Format32_32: - case AmdGpu::DataFormat::Format16_16_16_16: - case AmdGpu::DataFormat::Format32_32_32: - case AmdGpu::DataFormat::Format32_32_32_32: { - const u32 num_components = AmdGpu::NumComponents(format); - if (comp >= num_components) { - return ctx.f32_zero_value; - } - - // uint index = address / 4; - Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); - const u32 stride = buffer.stride; - if (stride > 4) { - const u32 index_offset = u32(AmdGpu::ComponentOffset(format, comp) / 32); - if (index_offset > 0) { - // index += index_offset; - index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(index_offset)); - } - } - const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index); - - const u32 bit_offset = AmdGpu::ComponentOffset(format, comp) % 32; - const u32 bit_width = AmdGpu::ComponentBits(format, comp); - const auto num_format = buffer.nfmt; - if (num_format == AmdGpu::NumberFormat::Float) { - if (bit_width == 32) { - return ctx.OpLoad(ctx.F32[1], ptr); - } else if (bit_width == 16) { - const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset); - Id value = ctx.OpLoad(ctx.U32[1], ptr); - value = - ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset, ctx.ConstU32(bit_width)); - value = ctx.OpSConvert(ctx.U16, value); - value = ctx.OpBitcast(ctx.F16[1], value); - return ctx.OpFConvert(ctx.F32[1], value); - } else { - UNREACHABLE_MSG("Invalid float bit width {}", bit_width); - } - } else { - Id value = ctx.OpLoad(ctx.U32[1], ptr); - const bool is_signed = IsSignedInteger(num_format); - if (bit_width < 32) { - const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset); - if (is_signed) { - value = ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset, - ctx.ConstU32(bit_width)); - } else { - value = ctx.OpBitFieldUExtract(ctx.U32[1], value, comp_offset, - ctx.ConstU32(bit_width)); - } - } - value = ctx.OpBitcast(ctx.F32[1], value); - return ConvertValue(ctx, value, num_format, bit_width); - } - break; - } - default: - UNREACHABLE_MSG("Invalid format for conversion: {}", magic_enum::enum_name(format)); - } -} - -template -static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - auto& buffer = ctx.buffers[handle]; - address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); - if constexpr (N == 1) { - return GetBufferFormatValue(ctx, handle, address, 0); - } else { - boost::container::static_vector ids; - for (u32 i = 0; i < N; i++) { - ids.push_back(GetBufferFormatValue(ctx, handle, address, i)); - } - return ctx.OpCompositeConstruct(ctx.F32[N], ids); - } -} - Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - return EmitLoadBufferFormatF32xN<1>(ctx, inst, handle, address); -} - -Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - return EmitLoadBufferFormatF32xN<2>(ctx, inst, handle, address); -} - -Id 
EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - return EmitLoadBufferFormatF32xN<3>(ctx, inst, handle, address); -} - -Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - return EmitLoadBufferFormatF32xN<4>(ctx, inst, handle, address); + const auto& buffer = ctx.texture_buffers[handle]; + const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id); + const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset); + Id texel = ctx.OpImageFetch(buffer.result_type, tex_buffer, coord); + if (buffer.is_integer) { + texel = ctx.OpBitcast(ctx.F32[4], texel); + } + return texel; } template @@ -467,97 +311,14 @@ void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address EmitStoreBufferF32xN<1>(ctx, handle, address, value); } -static Id ConvertF32ToFormat(EmitContext& ctx, Id value, AmdGpu::NumberFormat format, - u32 bit_width) { - switch (format) { - case AmdGpu::NumberFormat::Unorm: - return ctx.OpConvertFToU( - ctx.U32[1], ctx.OpFMul(ctx.F32[1], value, ctx.ConstF32(float(UXBitsMax(bit_width))))); - case AmdGpu::NumberFormat::Uint: - return ctx.OpBitcast(ctx.U32[1], value); - case AmdGpu::NumberFormat::Float: - return value; - default: - UNREACHABLE_MSG("Unsupported number format for conversion: {}", - magic_enum::enum_name(format)); - } -} - -template -static void EmitStoreBufferFormatF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { - auto& buffer = ctx.buffers[handle]; - const auto format = buffer.dfmt; - const auto num_format = buffer.nfmt; - - switch (format) { - case AmdGpu::DataFormat::FormatInvalid: - return; - case AmdGpu::DataFormat::Format8_8_8_8: - case AmdGpu::DataFormat::Format16: - case AmdGpu::DataFormat::Format32: - case AmdGpu::DataFormat::Format32_32: - case AmdGpu::DataFormat::Format32_32_32_32: { - ASSERT(N == AmdGpu::NumComponents(format)); - - address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); - const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index); - - Id packed_value{}; - for (u32 i = 0; i < N; i++) { - const u32 bit_width = AmdGpu::ComponentBits(format, i); - const u32 bit_offset = AmdGpu::ComponentOffset(format, i) % 32; - - const Id comp{ConvertF32ToFormat( - ctx, N == 1 ? 
value : ctx.OpCompositeExtract(ctx.F32[1], value, i), num_format, - bit_width)}; - - if (bit_width == 32) { - if constexpr (N == 1) { - ctx.OpStore(ptr, comp); - } else { - const Id index_i = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); - const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, - ctx.u32_zero_value, index_i); - ctx.OpStore(ptr, comp); - } - } else { - if (i == 0) { - packed_value = comp; - } else { - packed_value = - ctx.OpBitFieldInsert(ctx.U32[1], packed_value, comp, - ctx.ConstU32(bit_offset), ctx.ConstU32(bit_width)); - } - - if (i == N - 1) { - ctx.OpStore(ptr, packed_value); - } - } - } - } break; - default: - UNREACHABLE_MSG("Invalid format for conversion: {}", magic_enum::enum_name(format)); - } -} - void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { - EmitStoreBufferFormatF32xN<1>(ctx, handle, address, value); -} - -void EmitStoreBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, - Id value) { - EmitStoreBufferFormatF32xN<2>(ctx, handle, address, value); -} - -void EmitStoreBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, - Id value) { - EmitStoreBufferFormatF32xN<3>(ctx, handle, address, value); -} - -void EmitStoreBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, - Id value) { - EmitStoreBufferFormatF32xN<4>(ctx, handle, address, value); + const auto& buffer = ctx.texture_buffers[handle]; + const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id); + const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset); + if (buffer.is_integer) { + value = ctx.OpBitcast(ctx.U32[4], value); + } + ctx.OpImageWrite(tex_buffer, coord, value); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 994c2847..e4019604 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -41,13 +41,14 @@ void Name(EmitContext& ctx, Id object, std::string_view format_str, Args&&... 
ar } // Anonymous namespace -EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& binding_) - : Sirit::Module(profile_.supported_spirv), info{program.info}, profile{profile_}, - stage{program.info.stage}, binding{binding_} { +EmitContext::EmitContext(const Profile& profile_, const Shader::Info& info_, u32& binding_) + : Sirit::Module(profile_.supported_spirv), info{info_}, profile{profile_}, stage{info.stage}, + binding{binding_} { AddCapability(spv::Capability::Shader); DefineArithmeticTypes(); DefineInterfaces(); DefineBuffers(); + DefineTextureBuffers(); DefineImagesAndSamplers(); DefineSharedMemory(); } @@ -123,25 +124,24 @@ void EmitContext::DefineInterfaces() { DefineOutputs(); } -Id GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) { +const VectorIds& GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) { switch (fmt) { case AmdGpu::NumberFormat::Float: case AmdGpu::NumberFormat::Unorm: case AmdGpu::NumberFormat::Snorm: case AmdGpu::NumberFormat::SnormNz: - return ctx.F32[4]; - case AmdGpu::NumberFormat::Sint: - return ctx.S32[4]; - case AmdGpu::NumberFormat::Uint: - return ctx.U32[4]; case AmdGpu::NumberFormat::Sscaled: - return ctx.F32[4]; case AmdGpu::NumberFormat::Uscaled: - return ctx.F32[4]; + case AmdGpu::NumberFormat::Srgb: + return ctx.F32; + case AmdGpu::NumberFormat::Sint: + return ctx.S32; + case AmdGpu::NumberFormat::Uint: + return ctx.U32; default: break; } - throw InvalidArgument("Invalid attribute type {}", fmt); + UNREACHABLE_MSG("Invalid attribute type {}", fmt); } EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id) { @@ -162,7 +162,7 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f default: break; } - throw InvalidArgument("Invalid attribute type {}", fmt); + UNREACHABLE_MSG("Invalid attribute type {}", fmt); } void EmitContext::DefineBufferOffsets() { @@ -177,6 +177,16 @@ void EmitContext::DefineBufferOffsets() { buffer.offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, ConstU32(2U)); } + for (auto& tex_buffer : texture_buffers) { + const u32 binding = tex_buffer.binding; + const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4); + const u32 comp = (binding & 0xf) >> 2; + const u32 offset = (binding & 0x3) << 3; + const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), + push_data_block, ConstU32(half), ConstU32(comp))}; + const Id value{OpLoad(U32[1], ptr)}; + tex_buffer.coord_offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); + } } Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { @@ -195,6 +205,11 @@ Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { } void EmitContext::DefineInputs() { + if (info.uses_lane_id) { + subgroup_local_invocation_id = DefineVariable( + U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); + Decorate(subgroup_local_invocation_id, spv::Decoration::Flat); + } switch (stage) { case Stage::Vertex: { vertex_index = DefineVariable(U32[1], spv::BuiltIn::VertexIndex, spv::StorageClass::Input); @@ -202,7 +217,7 @@ void EmitContext::DefineInputs() { instance_id = DefineVariable(U32[1], spv::BuiltIn::InstanceIndex, spv::StorageClass::Input); for (const auto& input : info.vs_inputs) { - const Id type{GetAttributeType(*this, input.fmt)}; + const Id type{GetAttributeType(*this, input.fmt)[4]}; if (input.instance_step_rate == 
Info::VsInput::InstanceIdType::OverStepRate0 || input.instance_step_rate == Info::VsInput::InstanceIdType::OverStepRate1) { @@ -229,15 +244,12 @@ void EmitContext::DefineInputs() { break; } case Stage::Fragment: - subgroup_local_invocation_id = DefineVariable( - U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); - Decorate(subgroup_local_invocation_id, spv::Decoration::Flat); frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input); frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output); front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input); for (const auto& input : info.ps_inputs) { const u32 semantic = input.param_index; - if (input.is_default) { + if (input.is_default && !input.is_flat) { input_params[semantic] = {MakeDefaultValue(*this, input.default_value), F32[1], F32[1], 4, true}; continue; @@ -328,47 +340,74 @@ void EmitContext::DefinePushDataBlock() { void EmitContext::DefineBuffers() { boost::container::small_vector type_ids; - for (u32 i = 0; const auto& buffer : info.buffers) { - const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32; - const Id data_type = (*data_types)[1]; - const Id record_array_type{buffer.is_storage - ? TypeRuntimeArray(data_type) - : TypeArray(data_type, ConstU32(buffer.length))}; + const auto define_struct = [&](Id record_array_type, bool is_instance_data) { const Id struct_type{TypeStruct(record_array_type)}; - if (std::ranges::find(type_ids, record_array_type.value, &Id::value) == type_ids.end()) { - Decorate(record_array_type, spv::Decoration::ArrayStride, 4); - const auto name = - buffer.is_instance_data - ? fmt::format("{}_instance_data{}_{}{}", stage, i, 'f', - sizeof(float) * CHAR_BIT) - : fmt::format("{}_cbuf_block_{}{}", stage, 'f', sizeof(float) * CHAR_BIT); - Name(struct_type, name); - Decorate(struct_type, spv::Decoration::Block); - MemberName(struct_type, 0, "data"); - MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U); - type_ids.push_back(record_array_type); + if (std::ranges::find(type_ids, record_array_type.value, &Id::value) != type_ids.end()) { + return struct_type; } + Decorate(record_array_type, spv::Decoration::ArrayStride, 4); + const auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage) + : fmt::format("{}_cbuf_block_f32", stage); + Name(struct_type, name); + Decorate(struct_type, spv::Decoration::Block); + MemberName(struct_type, 0, "data"); + MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U); + type_ids.push_back(record_array_type); + return struct_type; + }; + + for (const auto& desc : info.buffers) { + const auto sharp = desc.GetSharp(info); + const bool is_storage = desc.IsStorage(sharp); + const auto* data_types = True(desc.used_types & IR::Type::F32) ? &F32 : &U32; + const Id data_type = (*data_types)[1]; + const Id record_array_type{is_storage ? TypeRuntimeArray(data_type) + : TypeArray(data_type, ConstU32(sharp.NumDwords()))}; + const Id struct_type{define_struct(record_array_type, desc.is_instance_data)}; const auto storage_class = - buffer.is_storage ? spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform; + is_storage ? 
spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform; const Id struct_pointer_type{TypePointer(storage_class, struct_type)}; const Id pointer_type = TypePointer(storage_class, data_type); const Id id{AddGlobalVariable(struct_pointer_type, storage_class)}; Decorate(id, spv::Decoration::Binding, binding); Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}_{}", buffer.is_storage ? "ssbo" : "cbuf", buffer.sgpr_base)); + if (is_storage && !desc.is_written) { + Decorate(id, spv::Decoration::NonWritable); + } + Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sgpr_base)); buffers.push_back({ .id = id, .binding = binding++, .data_types = data_types, .pointer_type = pointer_type, - .dfmt = buffer.dfmt, - .nfmt = buffer.nfmt, - .stride = buffer.GetVsharp(info).GetStride(), }); interfaces.push_back(id); - i++; + } +} + +void EmitContext::DefineTextureBuffers() { + for (const auto& desc : info.texture_buffers) { + const bool is_integer = + desc.nfmt == AmdGpu::NumberFormat::Uint || desc.nfmt == AmdGpu::NumberFormat::Sint; + const VectorIds& sampled_type{GetAttributeType(*this, desc.nfmt)}; + const u32 sampled = desc.is_written ? 2 : 1; + const Id image_type{TypeImage(sampled_type[1], spv::Dim::Buffer, false, false, false, + sampled, spv::ImageFormat::Unknown)}; + const Id pointer_type{TypePointer(spv::StorageClass::UniformConstant, image_type)}; + const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)}; + Decorate(id, spv::Decoration::Binding, binding); + Decorate(id, spv::Decoration::DescriptorSet, 0U); + Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sgpr_base)); + texture_buffers.push_back({ + .id = id, + .binding = binding++, + .image_type = image_type, + .result_type = sampled_type[4], + .is_integer = is_integer, + }); + interfaces.push_back(id); } } @@ -447,7 +486,7 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) { Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { const auto image = ctx.info.ReadUd(desc.sgpr_base, desc.dword_offset); - const auto format = desc.is_storage ? GetFormat(image) : spv::ImageFormat::Unknown; + const auto format = desc.is_atomic ? GetFormat(image) : spv::ImageFormat::Unknown; const u32 sampled = desc.is_storage ? 2 : 1; switch (desc.type) { case AmdGpu::ImageType::Color1D: @@ -470,17 +509,8 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { void EmitContext::DefineImagesAndSamplers() { for (const auto& image_desc : info.images) { - const VectorIds* data_types = [&] { - switch (image_desc.nfmt) { - case AmdGpu::NumberFormat::Uint: - return &U32; - case AmdGpu::NumberFormat::Sint: - return &S32; - default: - return &F32; - } - }(); - const Id sampled_type = data_types->Get(1); + const VectorIds& data_types = GetAttributeType(*this, image_desc.nfmt); + const Id sampled_type = data_types[1]; const Id image_type{ImageType(*this, image_desc, sampled_type)}; const Id pointer_type{TypePointer(spv::StorageClass::UniformConstant, image_type)}; const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)}; @@ -489,7 +519,7 @@ void EmitContext::DefineImagesAndSamplers() { Name(id, fmt::format("{}_{}{}_{:02x}", stage, "img", image_desc.sgpr_base, image_desc.dword_offset)); images.push_back({ - .data_types = data_types, + .data_types = &data_types, .id = id, .sampled_type = image_desc.is_storage ? 
sampled_type : TypeSampledImage(image_type), .pointer_type = pointer_type, @@ -498,13 +528,12 @@ void EmitContext::DefineImagesAndSamplers() { interfaces.push_back(id); ++binding; } - - image_u32 = TypePointer(spv::StorageClass::Image, U32[1]); - + if (std::ranges::any_of(info.images, &ImageResource::is_atomic)) { + image_u32 = TypePointer(spv::StorageClass::Image, U32[1]); + } if (info.samplers.empty()) { return; } - sampler_type = TypeSampler(); sampler_pointer_type = TypePointer(spv::StorageClass::UniformConstant, sampler_type); for (const auto& samp_desc : info.samplers) { @@ -520,14 +549,15 @@ void EmitContext::DefineImagesAndSamplers() { } void EmitContext::DefineSharedMemory() { - static constexpr size_t DefaultSharedMemSize = 16_KB; + static constexpr size_t DefaultSharedMemSize = 2_KB; if (!info.uses_shared) { return; } - if (info.shared_memory_size == 0) { - info.shared_memory_size = DefaultSharedMemSize; + u32 shared_memory_size = info.shared_memory_size; + if (shared_memory_size == 0) { + shared_memory_size = DefaultSharedMemSize; } - const u32 num_elements{Common::DivCeil(info.shared_memory_size, 4U)}; + const u32 num_elements{Common::DivCeil(shared_memory_size, 4U)}; const Id type{TypeArray(U32[1], ConstU32(num_elements))}; shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type); shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 5a09c411..5391108f 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -36,7 +36,7 @@ struct VectorIds { class EmitContext final : public Sirit::Module { public: - explicit EmitContext(const Profile& profile, IR::Program& program, u32& binding); + explicit EmitContext(const Profile& profile, const Shader::Info& info, u32& binding); ~EmitContext(); Id Def(const IR::Value& value); @@ -124,7 +124,7 @@ public: return ConstantComposite(type, constituents); } - Info& info; + const Info& info; const Profile& profile; Stage stage{}; @@ -207,13 +207,19 @@ public: u32 binding; const VectorIds* data_types; Id pointer_type; - AmdGpu::DataFormat dfmt; - AmdGpu::NumberFormat nfmt; - u32 stride; + }; + struct TextureBufferDefinition { + Id id; + Id coord_offset; + u32 binding; + Id image_type; + Id result_type; + bool is_integer; }; u32& binding; boost::container::small_vector buffers; + boost::container::small_vector texture_buffers; boost::container::small_vector images; boost::container::small_vector samplers; @@ -238,6 +244,7 @@ private: void DefineOutputs(); void DefinePushDataBlock(); void DefineBuffers(); + void DefineTextureBuffers(); void DefineImagesAndSamplers(); void DefineSharedMemory(); diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 7580f744..aa9b49b6 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -18,25 +18,31 @@ void Translator::EmitDataShare(const GcnInst& inst) { case Opcode::DS_READ2_B64: return DS_READ(64, false, true, inst); case Opcode::DS_WRITE_B32: - return DS_WRITE(32, false, false, inst); + return DS_WRITE(32, false, false, false, inst); + case Opcode::DS_WRITE2ST64_B32: + return DS_WRITE(32, false, true, true, inst); case Opcode::DS_WRITE_B64: - return DS_WRITE(64, false, false, inst); + return DS_WRITE(64, false, false, 
false, inst); case Opcode::DS_WRITE2_B32: - return DS_WRITE(32, false, true, inst); + return DS_WRITE(32, false, true, false, inst); case Opcode::DS_WRITE2_B64: - return DS_WRITE(64, false, true, inst); + return DS_WRITE(64, false, true, false, inst); case Opcode::DS_ADD_U32: return DS_ADD_U32(inst, false); case Opcode::DS_MIN_U32: - return DS_MIN_U32(inst, false); + return DS_MIN_U32(inst, false, false); + case Opcode::DS_MIN_I32: + return DS_MIN_U32(inst, true, false); case Opcode::DS_MAX_U32: - return DS_MAX_U32(inst, false); + return DS_MAX_U32(inst, false, false); + case Opcode::DS_MAX_I32: + return DS_MAX_U32(inst, true, false); case Opcode::DS_ADD_RTN_U32: return DS_ADD_U32(inst, true); case Opcode::DS_MIN_RTN_U32: - return DS_MIN_U32(inst, true); + return DS_MIN_U32(inst, false, true); case Opcode::DS_MAX_RTN_U32: - return DS_MAX_U32(inst, true); + return DS_MAX_U32(inst, false, true); default: LogMissingOpcode(inst); } @@ -89,12 +95,13 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn } } -void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst) { +void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, + const GcnInst& inst) { const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; if (is_pair) { - const u32 adj = bit_size == 32 ? 4 : 8; + const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data0), addr0); @@ -133,23 +140,23 @@ void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) { } } -void Translator::DS_MIN_U32(const GcnInst& inst, bool rtn) { +void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) { const IR::U32 addr{GetSrc(inst.src[0])}; const IR::U32 data{GetSrc(inst.src[1])}; const IR::U32 offset = ir.Imm32(u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, false); + const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, is_signed); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); } } -void Translator::DS_MAX_U32(const GcnInst& inst, bool rtn) { +void Translator::DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn) { const IR::U32 addr{GetSrc(inst.src[0])}; const IR::U32 data{GetSrc(inst.src[1])}; const IR::U32 offset = ir.Imm32(u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, false); + const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, is_signed); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); } diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp index 889de21b..d80de002 100644 --- a/src/shader_recompiler/frontend/translate/export.cpp +++ b/src/shader_recompiler/frontend/translate/export.cpp @@ -1,14 +1,12 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include "common/logging/log.h" #include "shader_recompiler/frontend/translate/translate.h" namespace Shader::Gcn { void Translator::EmitExport(const GcnInst& inst) { if (ir.block->has_multiple_predecessors && info.stage == Stage::Fragment) { - 
LOG_WARNING(Render_Recompiler, "An ambiguous export appeared in translation"); ir.Discard(ir.LogicalNot(ir.GetExec())); } diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index e59cd565..e3482546 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -354,7 +354,7 @@ void Translator::EmitFetch(const GcnInst& inst) { if (!std::filesystem::exists(dump_dir)) { std::filesystem::create_directories(dump_dir); } - const auto filename = fmt::format("vs_fetch_{:#018x}.bin", info.pgm_hash); + const auto filename = fmt::format("vs_{:#018x}_fetch.bin", info.pgm_hash); const auto file = IOFile{dump_dir / filename, FileAccessMode::Write}; file.WriteRaw(code, fetch_size); } @@ -399,9 +399,7 @@ void Translator::EmitFetch(const GcnInst& inst) { info.buffers.push_back({ .sgpr_base = attrib.sgpr_base, .dword_offset = attrib.dword_offset, - .length = buffer.num_records, .used_types = IR::Type::F32, - .is_storage = true, // we may not fit into UBO with large meshes .is_instance_data = true, }); instance_buf_handle = s32(info.buffers.size() - 1); diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 8cbf7357..8d418421 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -191,8 +191,10 @@ public: void V_MBCNT_U32_B32(bool is_low, const GcnInst& inst); // Vector Memory - void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); - void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); + void BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst); + void BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst); + void BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst); + void BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst); void BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst); // Vector interpolation @@ -202,10 +204,10 @@ public: // Data share void DS_SWIZZLE_B32(const GcnInst& inst); void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); - void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); + void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); void DS_ADD_U32(const GcnInst& inst, bool rtn); - void DS_MIN_U32(const GcnInst& inst, bool rtn); - void DS_MAX_U32(const GcnInst& inst, bool rtn); + void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn); + void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn); void V_READFIRSTLANE_B32(const GcnInst& inst); void V_READLANE_B32(const GcnInst& inst); void V_WRITELANE_B32(const GcnInst& inst); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 274dcff1..13a8342d 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -415,14 +415,20 @@ void Translator::V_ADDC_U32(const GcnInst& inst) { const auto src0 = GetSrc(inst.src[0]); const auto src1 = GetSrc(inst.src[1]); - IR::U32 scarry; + IR::U1 carry; if (inst.src_count == 3) { // VOP3 - IR::U1 thread_bit{ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[2].code))}; - scarry = IR::U32{ir.Select(thread_bit, ir.Imm32(1), ir.Imm32(0))}; + if (inst.src[2].field == 
OperandField::VccLo) { + carry = ir.GetVcc(); + } else if (inst.src[2].field == OperandField::ScalarGPR) { + carry = ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[2].code)); + } else { + UNREACHABLE(); + } } else { // VOP2 - scarry = ir.GetVccLo(); + carry = ir.GetVcc(); } + const IR::U32 scarry = IR::U32{ir.Select(carry, ir.Imm32(1), ir.Imm32(0))}; const IR::U32 result = ir.IAdd(ir.IAdd(src0, src1), scarry); const IR::VectorReg dst_reg{inst.dst[0].code}; diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index b88cfc46..73530dad 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -56,57 +56,57 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { // Buffer load operations case Opcode::TBUFFER_LOAD_FORMAT_X: - return BUFFER_LOAD_FORMAT(1, true, true, inst); + return BUFFER_LOAD(1, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XY: - return BUFFER_LOAD_FORMAT(2, true, true, inst); + return BUFFER_LOAD(2, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XYZ: - return BUFFER_LOAD_FORMAT(3, true, true, inst); + return BUFFER_LOAD(3, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XYZW: - return BUFFER_LOAD_FORMAT(4, true, true, inst); + return BUFFER_LOAD(4, true, inst); case Opcode::BUFFER_LOAD_FORMAT_X: - return BUFFER_LOAD_FORMAT(1, false, true, inst); + return BUFFER_LOAD_FORMAT(1, inst); case Opcode::BUFFER_LOAD_FORMAT_XY: - return BUFFER_LOAD_FORMAT(2, false, true, inst); + return BUFFER_LOAD_FORMAT(2, inst); case Opcode::BUFFER_LOAD_FORMAT_XYZ: - return BUFFER_LOAD_FORMAT(3, false, true, inst); + return BUFFER_LOAD_FORMAT(3, inst); case Opcode::BUFFER_LOAD_FORMAT_XYZW: - return BUFFER_LOAD_FORMAT(4, false, true, inst); + return BUFFER_LOAD_FORMAT(4, inst); case Opcode::BUFFER_LOAD_DWORD: - return BUFFER_LOAD_FORMAT(1, false, false, inst); + return BUFFER_LOAD(1, false, inst); case Opcode::BUFFER_LOAD_DWORDX2: - return BUFFER_LOAD_FORMAT(2, false, false, inst); + return BUFFER_LOAD(2, false, inst); case Opcode::BUFFER_LOAD_DWORDX3: - return BUFFER_LOAD_FORMAT(3, false, false, inst); + return BUFFER_LOAD(3, false, inst); case Opcode::BUFFER_LOAD_DWORDX4: - return BUFFER_LOAD_FORMAT(4, false, false, inst); + return BUFFER_LOAD(4, false, inst); // Buffer store operations case Opcode::BUFFER_STORE_FORMAT_X: - return BUFFER_STORE_FORMAT(1, false, true, inst); + return BUFFER_STORE_FORMAT(1, inst); case Opcode::BUFFER_STORE_FORMAT_XY: - return BUFFER_STORE_FORMAT(2, false, true, inst); + return BUFFER_STORE_FORMAT(2, inst); case Opcode::BUFFER_STORE_FORMAT_XYZ: - return BUFFER_STORE_FORMAT(3, false, true, inst); + return BUFFER_STORE_FORMAT(3, inst); case Opcode::BUFFER_STORE_FORMAT_XYZW: - return BUFFER_STORE_FORMAT(4, false, true, inst); + return BUFFER_STORE_FORMAT(4, inst); case Opcode::TBUFFER_STORE_FORMAT_X: - return BUFFER_STORE_FORMAT(1, true, true, inst); + return BUFFER_STORE(1, true, inst); case Opcode::TBUFFER_STORE_FORMAT_XY: - return BUFFER_STORE_FORMAT(2, true, true, inst); + return BUFFER_STORE(2, true, inst); case Opcode::TBUFFER_STORE_FORMAT_XYZ: - return BUFFER_STORE_FORMAT(3, true, true, inst); + return BUFFER_STORE(3, true, inst); case Opcode::BUFFER_STORE_DWORD: - return BUFFER_STORE_FORMAT(1, false, false, inst); + return BUFFER_STORE(1, false, inst); case Opcode::BUFFER_STORE_DWORDX2: - return BUFFER_STORE_FORMAT(2, false, false, inst); + return BUFFER_STORE(2, false, inst); case 
Opcode::BUFFER_STORE_DWORDX3: - return BUFFER_STORE_FORMAT(3, false, false, inst); + return BUFFER_STORE(3, false, inst); case Opcode::BUFFER_STORE_DWORDX4: - return BUFFER_STORE_FORMAT(4, false, false, inst); + return BUFFER_STORE(4, false, inst); // Buffer atomic operations case Opcode::BUFFER_ATOMIC_ADD: @@ -349,8 +349,7 @@ void Translator::IMAGE_STORE(const GcnInst& inst) { ir.ImageWrite(handle, body, value, {}); } -void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, - const GcnInst& inst) { +void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) { const auto& mtbuf = inst.control.mtbuf; const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; @@ -370,22 +369,19 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma info.index_enable.Assign(mtbuf.idxen); info.offset_enable.Assign(mtbuf.offen); info.inst_offset.Assign(mtbuf.offset); - info.is_typed.Assign(is_typed); if (is_typed) { - info.dmft.Assign(static_cast(mtbuf.dfmt)); - info.nfmt.Assign(static_cast(mtbuf.nfmt)); - ASSERT(info.nfmt == AmdGpu::NumberFormat::Float && - (info.dmft == AmdGpu::DataFormat::Format32_32_32_32 || - info.dmft == AmdGpu::DataFormat::Format32_32_32 || - info.dmft == AmdGpu::DataFormat::Format32_32 || - info.dmft == AmdGpu::DataFormat::Format32)); + const auto dmft = static_cast(mtbuf.dfmt); + const auto nfmt = static_cast(mtbuf.nfmt); + ASSERT(nfmt == AmdGpu::NumberFormat::Float && + (dmft == AmdGpu::DataFormat::Format32_32_32_32 || + dmft == AmdGpu::DataFormat::Format32_32_32 || + dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32)); } const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); - const IR::Value value = is_format ? 
ir.LoadBufferFormat(num_dwords, handle, address, info) - : ir.LoadBuffer(num_dwords, handle, address, info); + const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info); const IR::VectorReg dst_reg{inst.src[1].code}; if (num_dwords == 1) { ir.SetVectorReg(dst_reg, IR::F32{value}); @@ -396,8 +392,34 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma } } -void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, - const GcnInst& inst) { +void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) { + const auto& mubuf = inst.control.mubuf; + const IR::VectorReg vaddr{inst.src[0].code}; + const IR::ScalarReg sharp{inst.src[2].code * 4}; + ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported"); + const IR::Value address = [&] -> IR::Value { + if (mubuf.idxen) { + return ir.GetVectorReg(vaddr); + } + return {}; + }(); + const IR::Value soffset{GetSrc(inst.src[3])}; + ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported"); + + IR::BufferInstInfo info{}; + info.index_enable.Assign(mubuf.idxen); + + const IR::Value handle = + ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), + ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); + const IR::Value value = ir.LoadBufferFormat(handle, address, info); + const IR::VectorReg dst_reg{inst.src[1].code}; + for (u32 i = 0; i < num_dwords; i++) { + ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)}); + } +} + +void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) { const auto& mtbuf = inst.control.mtbuf; const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; @@ -417,45 +439,76 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_form info.index_enable.Assign(mtbuf.idxen); info.offset_enable.Assign(mtbuf.offen); info.inst_offset.Assign(mtbuf.offset); - info.is_typed.Assign(is_typed); if (is_typed) { - info.dmft.Assign(static_cast(mtbuf.dfmt)); - info.nfmt.Assign(static_cast(mtbuf.nfmt)); + const auto dmft = static_cast(mtbuf.dfmt); + const auto nfmt = static_cast(mtbuf.nfmt); + ASSERT(nfmt == AmdGpu::NumberFormat::Float && + (dmft == AmdGpu::DataFormat::Format32_32_32_32 || + dmft == AmdGpu::DataFormat::Format32_32_32 || + dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32)); } IR::Value value{}; const IR::VectorReg src_reg{inst.src[1].code}; switch (num_dwords) { case 1: - value = ir.GetVectorReg(src_reg); + value = ir.GetVectorReg(src_reg); break; case 2: - value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), - ir.GetVectorReg(src_reg + 1)); + value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), + ir.GetVectorReg(src_reg + 1)); break; case 3: - value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), - ir.GetVectorReg(src_reg + 1), - ir.GetVectorReg(src_reg + 2)); + value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), + ir.GetVectorReg(src_reg + 1), + ir.GetVectorReg(src_reg + 2)); break; case 4: - value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), - ir.GetVectorReg(src_reg + 1), - ir.GetVectorReg(src_reg + 2), - ir.GetVectorReg(src_reg + 3)); + value = ir.CompositeConstruct( + ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1), + ir.GetVectorReg(src_reg + 2), ir.GetVectorReg(src_reg + 3)); break; } const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), 
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); - if (is_format) { - ir.StoreBufferFormat(num_dwords, handle, address, value, info); - } else { - ir.StoreBuffer(num_dwords, handle, address, value, info); - } + ir.StoreBuffer(num_dwords, handle, address, value, info); +} + +void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) { + const auto& mubuf = inst.control.mubuf; + const IR::VectorReg vaddr{inst.src[0].code}; + const IR::ScalarReg sharp{inst.src[2].code * 4}; + ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported"); + const IR::Value address = [&] -> IR::Value { + if (mubuf.idxen) { + return ir.GetVectorReg(vaddr); + } + return {}; + }(); + const IR::Value soffset{GetSrc(inst.src[3])}; + ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported"); + + IR::BufferInstInfo info{}; + info.index_enable.Assign(mubuf.idxen); + + const IR::VectorReg src_reg{inst.src[1].code}; + + std::array comps{}; + for (u32 i = 0; i < num_dwords; i++) { + comps[i] = ir.GetVectorReg(src_reg + i); + } + for (u32 i = num_dwords; i < 4; i++) { + comps[i] = ir.Imm32(0.f); + } + + const IR::Value value = ir.CompositeConstruct(comps[0], comps[1], comps[2], comps[3]); + const IR::Value handle = + ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), + ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); + ir.StoreBufferFormat(handle, address, value, info); } -// TODO: U64 void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) { const auto& mubuf = inst.control.mubuf; const IR::VectorReg vaddr{inst.src[0].code}; diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 65de98b7..473ae4f6 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -325,20 +325,8 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad } } -Value IREmitter::LoadBufferFormat(int num_dwords, const Value& handle, const Value& address, - BufferInstInfo info) { - switch (num_dwords) { - case 1: - return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address); - case 2: - return Inst(Opcode::LoadBufferFormatF32x2, Flags{info}, handle, address); - case 3: - return Inst(Opcode::LoadBufferFormatF32x3, Flags{info}, handle, address); - case 4: - return Inst(Opcode::LoadBufferFormatF32x4, Flags{info}, handle, address); - default: - UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords); - } +Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address); } void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address, @@ -409,24 +397,9 @@ Value IREmitter::BufferAtomicSwap(const Value& handle, const Value& address, con return Inst(Opcode::BufferAtomicSwap32, Flags{info}, handle, address, value); } -void IREmitter::StoreBufferFormat(int num_dwords, const Value& handle, const Value& address, - const Value& data, BufferInstInfo info) { - switch (num_dwords) { - case 1: - Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data); - break; - case 2: - Inst(Opcode::StoreBufferFormatF32x2, Flags{info}, handle, address, data); - break; - case 3: - Inst(Opcode::StoreBufferFormatF32x3, Flags{info}, handle, address, data); - break; - case 4: - Inst(Opcode::StoreBufferFormatF32x4, Flags{info}, handle, address, data); - break; - default: - 
UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords); - } +void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, const Value& data, + BufferInstInfo info) { + Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data); } U32 IREmitter::LaneId() { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index a60f4c28..de8fe450 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -92,12 +92,12 @@ public: [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info); - [[nodiscard]] Value LoadBufferFormat(int num_dwords, const Value& handle, const Value& address, + [[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info); void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); - void StoreBufferFormat(int num_dwords, const Value& handle, const Value& address, - const Value& data, BufferInstInfo info); + void StoreBufferFormat(const Value& handle, const Value& address, const Value& data, + BufferInstInfo info); [[nodiscard]] Value BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index a8c8b073..d6ef49cf 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -56,9 +56,6 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferF32x3: case Opcode::StoreBufferF32x4: case Opcode::StoreBufferFormatF32: - case Opcode::StoreBufferFormatF32x2: - case Opcode::StoreBufferFormatF32x3: - case Opcode::StoreBufferFormatF32x4: case Opcode::StoreBufferU32: case Opcode::BufferAtomicIAdd32: case Opcode::BufferAtomicSMin32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index a49ea1c7..1e33d6d4 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -79,19 +79,13 @@ OPCODE(LoadBufferF32, F32, Opaq OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, ) OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, ) OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, ) -OPCODE(LoadBufferFormatF32, F32, Opaque, Opaque, ) -OPCODE(LoadBufferFormatF32x2, F32x2, Opaque, Opaque, ) -OPCODE(LoadBufferFormatF32x3, F32x3, Opaque, Opaque, ) -OPCODE(LoadBufferFormatF32x4, F32x4, Opaque, Opaque, ) +OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, ) OPCODE(LoadBufferU32, U32, Opaque, Opaque, ) OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, ) OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, ) OPCODE(StoreBufferF32x4, Void, Opaque, Opaque, F32x4, ) -OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32, ) -OPCODE(StoreBufferFormatF32x2, Void, Opaque, Opaque, F32x2, ) -OPCODE(StoreBufferFormatF32x3, Void, Opaque, Opaque, F32x3, ) -OPCODE(StoreBufferFormatF32x4, Void, Opaque, Opaque, F32x4, ) +OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32x4, ) OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, ) // Buffer atomic operations diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index ace6a37d..f446ac47 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ 
b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -3,6 +3,7 @@ #include #include +#include "common/alignment.h" #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/breadth_first_search.h" #include "shader_recompiler/ir/ir_emitter.h" @@ -45,10 +46,6 @@ bool IsBufferStore(const IR::Inst& inst) { case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: case IR::Opcode::StoreBufferF32x4: - case IR::Opcode::StoreBufferFormatF32: - case IR::Opcode::StoreBufferFormatF32x2: - case IR::Opcode::StoreBufferFormatF32x3: - case IR::Opcode::StoreBufferFormatF32x4: case IR::Opcode::StoreBufferU32: return true; default: @@ -62,10 +59,6 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: case IR::Opcode::LoadBufferF32x4: - case IR::Opcode::LoadBufferFormatF32: - case IR::Opcode::LoadBufferFormatF32x2: - case IR::Opcode::LoadBufferFormatF32x3: - case IR::Opcode::LoadBufferFormatF32x4: case IR::Opcode::LoadBufferU32: case IR::Opcode::ReadConstBuffer: case IR::Opcode::ReadConstBufferU32: @@ -75,6 +68,11 @@ bool IsBufferInstruction(const IR::Inst& inst) { } } +bool IsTextureBufferInstruction(const IR::Inst& inst) { + return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 || + inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32; +} + static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) { switch (num_format) { case AmdGpu::NumberFormat::Float: @@ -100,28 +98,6 @@ static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_for IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { switch (inst.GetOpcode()) { - case IR::Opcode::LoadBufferFormatF32: - case IR::Opcode::LoadBufferFormatF32x2: - case IR::Opcode::LoadBufferFormatF32x3: - case IR::Opcode::LoadBufferFormatF32x4: - case IR::Opcode::StoreBufferFormatF32: - case IR::Opcode::StoreBufferFormatF32x2: - case IR::Opcode::StoreBufferFormatF32x3: - case IR::Opcode::StoreBufferFormatF32x4: - switch (num_format) { - case AmdGpu::NumberFormat::Unorm: - case AmdGpu::NumberFormat::Snorm: - case AmdGpu::NumberFormat::Uscaled: - case AmdGpu::NumberFormat::Sscaled: - case AmdGpu::NumberFormat::Uint: - case AmdGpu::NumberFormat::Sint: - case AmdGpu::NumberFormat::SnormNz: - return IR::Type::U32; - case AmdGpu::NumberFormat::Float: - return IR::Type::F32; - default: - UNREACHABLE(); - } case IR::Opcode::LoadBufferF32: case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: @@ -143,20 +119,8 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { } } -bool IsImageInstruction(const IR::Inst& inst) { +bool IsImageAtomicInstruction(const IR::Inst& inst) { switch (inst.GetOpcode()) { - case IR::Opcode::ImageSampleExplicitLod: - case IR::Opcode::ImageSampleImplicitLod: - case IR::Opcode::ImageSampleDrefExplicitLod: - case IR::Opcode::ImageSampleDrefImplicitLod: - case IR::Opcode::ImageFetch: - case IR::Opcode::ImageGather: - case IR::Opcode::ImageGatherDref: - case IR::Opcode::ImageQueryDimensions: - case IR::Opcode::ImageQueryLod: - case IR::Opcode::ImageGradient: - case IR::Opcode::ImageRead: - case IR::Opcode::ImageWrite: case IR::Opcode::ImageAtomicIAdd32: case IR::Opcode::ImageAtomicSMin32: case IR::Opcode::ImageAtomicUMin32: @@ -178,20 +142,27 @@ bool IsImageStorageInstruction(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::ImageWrite: case IR::Opcode::ImageRead: - case IR::Opcode::ImageAtomicIAdd32: - case 
IR::Opcode::ImageAtomicSMin32: - case IR::Opcode::ImageAtomicUMin32: - case IR::Opcode::ImageAtomicSMax32: - case IR::Opcode::ImageAtomicUMax32: - case IR::Opcode::ImageAtomicInc32: - case IR::Opcode::ImageAtomicDec32: - case IR::Opcode::ImageAtomicAnd32: - case IR::Opcode::ImageAtomicOr32: - case IR::Opcode::ImageAtomicXor32: - case IR::Opcode::ImageAtomicExchange32: return true; default: - return false; + return IsImageAtomicInstruction(inst); + } +} + +bool IsImageInstruction(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::ImageSampleExplicitLod: + case IR::Opcode::ImageSampleImplicitLod: + case IR::Opcode::ImageSampleDrefExplicitLod: + case IR::Opcode::ImageSampleDrefImplicitLod: + case IR::Opcode::ImageFetch: + case IR::Opcode::ImageGather: + case IR::Opcode::ImageGatherDref: + case IR::Opcode::ImageQueryDimensions: + case IR::Opcode::ImageQueryLod: + case IR::Opcode::ImageGradient: + return true; + default: + return IsImageStorageInstruction(inst); } } @@ -214,7 +185,8 @@ u32 ImageOffsetArgumentPosition(const IR::Inst& inst) { class Descriptors { public: explicit Descriptors(Info& info_) - : info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images}, + : info{info_}, buffer_resources{info_.buffers}, + texture_buffer_resources{info_.texture_buffers}, image_resources{info_.images}, sampler_resources{info_.samplers} {} u32 Add(const BufferResource& desc) { @@ -224,13 +196,21 @@ public: desc.inline_cbuf == existing.inline_cbuf; })}; auto& buffer = buffer_resources[index]; - ASSERT(buffer.length == desc.length); - buffer.is_storage |= desc.is_storage; buffer.used_types |= desc.used_types; buffer.is_written |= desc.is_written; return index; } + u32 Add(const TextureBufferResource& desc) { + const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) { + return desc.sgpr_base == existing.sgpr_base && + desc.dword_offset == existing.dword_offset; + })}; + auto& buffer = texture_buffer_resources[index]; + buffer.is_written |= desc.is_written; + return index; + } + u32 Add(const ImageResource& desc) { const u32 index{Add(image_resources, desc, [&desc](const auto& existing) { return desc.sgpr_base == existing.sgpr_base && @@ -247,7 +227,7 @@ public: return true; } // Samplers with different bindings might still be the same. 
- return existing.GetSsharp(info) == desc.GetSsharp(info); + return existing.GetSharp(info) == desc.GetSharp(info); })}; return index; } @@ -265,6 +245,7 @@ private: const Info& info; BufferResourceList& buffer_resources; + TextureBufferResourceList& texture_buffer_resources; ImageResourceList& image_resources; SamplerResourceList& sampler_resources; }; @@ -361,33 +342,6 @@ SharpLocation TrackSharp(const IR::Inst* inst) { }; } -static constexpr size_t MaxUboSize = 65536; - -static bool IsLoadBufferFormat(const IR::Inst& inst) { - switch (inst.GetOpcode()) { - case IR::Opcode::LoadBufferFormatF32: - case IR::Opcode::LoadBufferFormatF32x2: - case IR::Opcode::LoadBufferFormatF32x3: - case IR::Opcode::LoadBufferFormatF32x4: - return true; - default: - return false; - } -} - -static u32 BufferLength(const AmdGpu::Buffer& buffer) { - const auto stride = buffer.GetStride(); - if (stride < sizeof(f32)) { - ASSERT(sizeof(f32) % stride == 0); - return (((buffer.num_records - 1) / sizeof(f32)) + 1) * stride; - } else if (stride == sizeof(f32)) { - return buffer.num_records; - } else { - ASSERT(stride % sizeof(f32) == 0); - return buffer.num_records * (stride / sizeof(f32)); - } -} - s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, AmdGpu::Buffer& cbuf) { @@ -414,10 +368,8 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, return descriptors.Add(BufferResource{ .sgpr_base = std::numeric_limits::max(), .dword_offset = 0, - .length = BufferLength(cbuf), .used_types = BufferDataType(inst, cbuf.GetNumberFmt()), .inline_cbuf = cbuf, - .is_storage = IsBufferStore(inst) || cbuf.GetSize() > MaxUboSize, }); } @@ -429,28 +381,17 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, IR::Inst* handle = inst.Arg(0).InstRecursive(); IR::Inst* producer = handle->Arg(0).InstRecursive(); const auto sharp = TrackSharp(producer); - const bool is_store = IsBufferStore(inst); buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); binding = descriptors.Add(BufferResource{ .sgpr_base = sharp.sgpr_base, .dword_offset = sharp.dword_offset, - .length = BufferLength(buffer), .used_types = BufferDataType(inst, buffer.GetNumberFmt()), - .is_storage = is_store || buffer.GetSize() > MaxUboSize, - .is_written = is_store, + .is_written = IsBufferStore(inst), }); } // Update buffer descriptor format. const auto inst_info = inst.Flags(); - auto& buffer_desc = info.buffers[binding]; - if (inst_info.is_typed) { - buffer_desc.dfmt = inst_info.dmft; - buffer_desc.nfmt = inst_info.nfmt; - } else { - buffer_desc.dfmt = buffer.GetDataFmt(); - buffer_desc.nfmt = buffer.GetNumberFmt(); - } // Replace handle with binding index in buffer resource list. IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; @@ -463,20 +404,7 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, return; } - if (IsLoadBufferFormat(inst)) { - if (UseFP16(buffer.GetDataFmt(), buffer.GetNumberFmt())) { - info.uses_fp16 = true; - } - } else { - const u32 stride = buffer.GetStride(); - if (stride < 4) { - LOG_WARNING(Render_Vulkan, - "non-formatting load_buffer_* is not implemented for stride {}", stride); - } - } - // Compute address of the buffer using the stride. - // Todo: What if buffer is rebound with different stride? IR::U32 address = ir.Imm32(inst_info.inst_offset.Value()); if (inst_info.index_enable) { const IR::U32 index = inst_info.offset_enable ? 
IR::U32{ir.CompositeExtract(inst.Arg(1), 0)} @@ -491,8 +419,31 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, inst.SetArg(1, address); } +void PatchTextureBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, + Descriptors& descriptors) { + const IR::Inst* handle = inst.Arg(0).InstRecursive(); + const IR::Inst* producer = handle->Arg(0).InstRecursive(); + const auto sharp = TrackSharp(producer); + const auto buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset); + const s32 binding = descriptors.Add(TextureBufferResource{ + .sgpr_base = sharp.sgpr_base, + .dword_offset = sharp.dword_offset, + .nfmt = buffer.GetNumberFmt(), + .is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32, + }); + + // Replace handle with binding index in texture buffer resource list. + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + inst.SetArg(0, ir.Imm32(binding)); + ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable); +} + IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& t, - const IR::Value& z) { + const IR::Value& z, bool is_storage) { + // When a cubemap is written with imageStore, it is treated like a 2D array. + if (is_storage) { + return ir.CompositeConstruct(s, t, z); + } // We need to fix the x and y coordinates, // because the s and t coordinates will be scaled and offset by 1.5 by v_madak_f32. // We already force the scale value to be 1.0 when handling v_cubema_f32, @@ -530,13 +481,15 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip return; } ASSERT(image.GetType() != AmdGpu::ImageType::Invalid); + const bool is_storage = IsImageStorageInstruction(inst); u32 image_binding = descriptors.Add(ImageResource{ .sgpr_base = tsharp.sgpr_base, .dword_offset = tsharp.dword_offset, .type = image.GetType(), .nfmt = static_cast(image.GetNumberFmt()), - .is_storage = IsImageStorageInstruction(inst), + .is_storage = is_storage, .is_depth = bool(inst_info.is_depth), + .is_atomic = IsImageAtomicInstruction(inst), }); // Read sampler sharp. 
This doesn't exist for IMAGE_LOAD/IMAGE_STORE instructions @@ -593,7 +546,8 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip case AmdGpu::ImageType::Color3D: // x, y, z return {ir.CompositeConstruct(body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)}; case AmdGpu::ImageType::Cube: // x, y, face - return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)}; + return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2), is_storage), + body->Arg(3)}; default: UNREACHABLE_MSG("Unknown image type {}", image.GetType()); } @@ -668,6 +622,10 @@ void ResourceTrackingPass(IR::Program& program) { PatchBufferInstruction(*block, inst, info, descriptors); continue; } + if (IsTextureBufferInstruction(inst)) { + PatchTextureBufferInstruction(*block, inst, info, descriptors); + continue; + } if (IsImageInstruction(inst)) { PatchImageInstruction(*block, inst, info, descriptors); } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index 52087a65..7105f01f 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -29,6 +29,12 @@ void Visit(Info& info, IR::Inst& inst) { case IR::Opcode::ImageWrite: info.has_storage_images = true; break; + case IR::Opcode::LoadBufferFormatF32: + info.has_texel_buffers = true; + break; + case IR::Opcode::StoreBufferFormatF32: + info.has_image_buffers = true; + break; case IR::Opcode::QuadShuffle: info.uses_group_quad = true; break; @@ -44,6 +50,9 @@ void Visit(Info& info, IR::Inst& inst) { case IR::Opcode::ImageQueryLod: info.has_image_query = true; break; + case IR::Opcode::LaneId: + info.uses_lane_id = true; + break; default: break; } diff --git a/src/shader_recompiler/ir/program.h b/src/shader_recompiler/ir/program.h index eff933f2..f7abba64 100644 --- a/src/shader_recompiler/ir/program.h +++ b/src/shader_recompiler/ir/program.h @@ -12,11 +12,13 @@ namespace Shader::IR { struct Program { + explicit Program(Info& info_) : info{info_} {} + AbstractSyntaxList syntax_list; BlockList blocks; BlockList post_order_blocks; std::vector ins_list; - Info info; + Info& info; }; [[nodiscard]] std::string DumpProgram(const Program& program); diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h index 7868a5a3..fba04f33 100644 --- a/src/shader_recompiler/ir/reg.h +++ b/src/shader_recompiler/ir/reg.h @@ -66,9 +66,6 @@ union BufferInstInfo { BitField<0, 1, u32> index_enable; BitField<1, 1, u32> offset_enable; BitField<2, 12, u32> inst_offset; - BitField<14, 4, AmdGpu::DataFormat> dmft; - BitField<18, 3, AmdGpu::NumberFormat> nfmt; - BitField<21, 1, u32> is_typed; }; enum class ScalarReg : u32 { diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 0efac4ff..dfcf9ed1 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -29,7 +29,7 @@ IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) { IR::Program TranslateProgram(Common::ObjectPool& inst_pool, Common::ObjectPool& block_pool, std::span token, - const Info&& info, const Profile& profile) { + Info& info, const Profile& profile) { // Ensure first instruction is expected. 
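A minimal usage sketch of the changed TranslateProgram contract in this hunk: the Info is now owned by the caller and only referenced by the returned Program, so the caller can keep refining it with runtime state. Variable names, pool sizes and the MakeShaderInfo helper below are illustrative assumptions, not code from this patch.
    // Caller-owned Info must outlive the Program that references it.
    Shader::Info info = MakeShaderInfo(stage, user_data, regs); // assumed helper
    Common::ObjectPool<Shader::IR::Inst> inst_pool{8192};
    Common::ObjectPool<Shader::IR::Block> block_pool{512};
    Shader::IR::Program program =
        Shader::TranslateProgram(inst_pool, block_pool, code, info, profile);
    // program.info refers to 'info'; keep it alive for as long as the program is used.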
constexpr u32 token_mov_vcchi = 0xBEEB03FF; ASSERT_MSG(token[0] == token_mov_vcchi, "First instruction is not s_mov_b32 vcc_hi, #imm"); @@ -38,7 +38,7 @@ IR::Program TranslateProgram(Common::ObjectPool& inst_pool, Gcn::GcnDecodeContext decoder; // Decode and save instructions - IR::Program program; + IR::Program program{info}; program.ins_list.reserve(token.size()); while (!slice.atEnd()) { program.ins_list.emplace_back(decoder.decodeInstruction(slice)); @@ -49,7 +49,6 @@ IR::Program TranslateProgram(Common::ObjectPool& inst_pool, Gcn::CFG cfg{gcn_block_pool, program.ins_list}; // Structurize control flow graph and create program. - program.info = std::move(info); program.syntax_list = Shader::Gcn::BuildASL(inst_pool, block_pool, cfg, program.info, profile); program.blocks = GenerateBlocks(program.syntax_list); program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front()); diff --git a/src/shader_recompiler/recompiler.h b/src/shader_recompiler/recompiler.h index 34e958a1..3a229518 100644 --- a/src/shader_recompiler/recompiler.h +++ b/src/shader_recompiler/recompiler.h @@ -13,7 +13,7 @@ struct Profile; [[nodiscard]] IR::Program TranslateProgram(Common::ObjectPool& inst_pool, Common::ObjectPool& block_pool, - std::span code, const Info&& info, + std::span code, Info& info, const Profile& profile); } // namespace Shader diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 7d36dbe1..77c57e94 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include "common/assert.h" #include "common/types.h" @@ -74,18 +75,29 @@ struct Info; struct BufferResource { u32 sgpr_base; u32 dword_offset; - u32 length; IR::Type used_types; AmdGpu::Buffer inline_cbuf; - AmdGpu::DataFormat dfmt; - AmdGpu::NumberFormat nfmt; - bool is_storage{}; bool is_instance_data{}; bool is_written{}; - constexpr AmdGpu::Buffer GetVsharp(const Info& info) const noexcept; + bool IsStorage(AmdGpu::Buffer buffer) const noexcept { + static constexpr size_t MaxUboSize = 65536; + return buffer.GetSize() > MaxUboSize || is_written; + } + + constexpr AmdGpu::Buffer GetSharp(const Info& info) const noexcept; }; -using BufferResourceList = boost::container::static_vector; +using BufferResourceList = boost::container::small_vector; + +struct TextureBufferResource { + u32 sgpr_base; + u32 dword_offset; + AmdGpu::NumberFormat nfmt; + bool is_written{}; + + constexpr AmdGpu::Buffer GetSharp(const Info& info) const noexcept; +}; +using TextureBufferResourceList = boost::container::small_vector; struct ImageResource { u32 sgpr_base; @@ -94,8 +106,11 @@ struct ImageResource { AmdGpu::NumberFormat nfmt; bool is_storage; bool is_depth; + bool is_atomic{}; + + constexpr AmdGpu::Image GetSharp(const Info& info) const noexcept; }; -using ImageResourceList = boost::container::static_vector; +using ImageResourceList = boost::container::small_vector; struct SamplerResource { u32 sgpr_base; @@ -104,9 +119,9 @@ struct SamplerResource { u32 associated_image : 4; u32 disable_aniso : 1; - constexpr AmdGpu::Sampler GetSsharp(const Info& info) const noexcept; + constexpr AmdGpu::Sampler GetSharp(const Info& info) const noexcept; }; -using SamplerResourceList = boost::container::static_vector; +using SamplerResourceList = boost::container::small_vector; struct PushData { static constexpr size_t BufOffsetIndex = 2; @@ -179,6 +194,7 @@ struct Info { s8 instance_offset_sgpr = -1; BufferResourceList buffers; + 
TextureBufferResourceList texture_buffers; ImageResourceList images; SamplerResourceList samplers; @@ -194,9 +210,12 @@ struct Info { u64 pgm_hash{}; u32 shared_memory_size{}; bool has_storage_images{}; + bool has_image_buffers{}; + bool has_texel_buffers{}; bool has_discard{}; bool has_image_gather{}; bool has_image_query{}; + bool uses_lane_id{}; bool uses_group_quad{}; bool uses_shared{}; bool uses_fp16{}; @@ -214,6 +233,10 @@ struct Info { return data; } + size_t NumBindings() const noexcept { + return buffers.size() + texture_buffers.size() + images.size() + samplers.size(); + } + [[nodiscard]] std::pair GetDrawOffsets() const noexcept { u32 vertex_offset = 0; u32 instance_offset = 0; @@ -227,11 +250,19 @@ struct Info { } }; -constexpr AmdGpu::Buffer BufferResource::GetVsharp(const Info& info) const noexcept { +constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept { return inline_cbuf ? inline_cbuf : info.ReadUd(sgpr_base, dword_offset); } -constexpr AmdGpu::Sampler SamplerResource::GetSsharp(const Info& info) const noexcept { +constexpr AmdGpu::Buffer TextureBufferResource::GetSharp(const Info& info) const noexcept { + return info.ReadUd(sgpr_base, dword_offset); +} + +constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept { + return info.ReadUd(sgpr_base, dword_offset); +} + +constexpr AmdGpu::Sampler SamplerResource::GetSharp(const Info& info) const noexcept { return inline_sampler ? inline_sampler : info.ReadUd(sgpr_base, dword_offset); } diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 7b38ca79..7f262e1f 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -167,7 +167,7 @@ struct Liverpool { static constexpr auto* GetBinaryInfo(const Shader& sh) { const auto* code = sh.template Address(); const auto* bininfo = std::bit_cast(code + (code[1] + 1) * 2); - ASSERT_MSG(bininfo->Valid(), "Invalid shader binary header"); + // ASSERT_MSG(bininfo->Valid(), "Invalid shader binary header"); return bininfo; } diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index 1004ed7d..53d30a7f 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -61,6 +61,10 @@ enum class NumberFormat : u32 { Ubscaled = 13, }; +[[nodiscard]] constexpr bool IsInteger(NumberFormat nfmt) { + return nfmt == AmdGpu::NumberFormat::Sint || nfmt == AmdGpu::NumberFormat::Uint; +} + [[nodiscard]] std::string_view NameOf(DataFormat fmt); [[nodiscard]] std::string_view NameOf(NumberFormat fmt); diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index 8c3b675e..b85a3788 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -3,6 +3,7 @@ #pragma once +#include "common/alignment.h" #include "common/assert.h" #include "common/bit_field.h" #include "common/types.h" @@ -68,6 +69,10 @@ struct Buffer { return stride == 0 ? 
1U : stride; } + u32 NumDwords() const noexcept { + return Common::AlignUp(GetSize(), sizeof(u32)) >> 2; + } + u32 GetSize() const noexcept { return GetStride() * num_records; } diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp index e9dceb62..68a4aa52 100644 --- a/src/video_core/buffer_cache/buffer.cpp +++ b/src/video_core/buffer_cache/buffer.cpp @@ -13,13 +13,6 @@ namespace VideoCore { -constexpr vk::BufferUsageFlags AllFlags = - vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst | - vk::BufferUsageFlagBits::eUniformTexelBuffer | vk::BufferUsageFlagBits::eStorageTexelBuffer | - vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer | - vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer | - vk::BufferUsageFlagBits::eIndirectBuffer; - std::string_view BufferTypeName(MemoryUsage type) { switch (type) { case MemoryUsage::Upload: @@ -96,13 +89,13 @@ void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usa } Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_addr_, - u64 size_bytes_) + vk::BufferUsageFlags flags, u64 size_bytes_) : cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, usage{usage_}, buffer{instance->GetDevice(), instance->GetAllocator()} { // Create buffer object. const vk::BufferCreateInfo buffer_ci = { .size = size_bytes, - .usage = AllFlags, + .usage = flags, }; VmaAllocationInfo alloc_info{}; buffer.Create(buffer_ci, usage, &alloc_info); @@ -119,25 +112,33 @@ Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_ is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } -vk::BufferView Buffer::View(u32 offset, u32 size, AmdGpu::DataFormat dfmt, +vk::BufferView Buffer::View(u32 offset, u32 size, bool is_written, AmdGpu::DataFormat dfmt, AmdGpu::NumberFormat nfmt) { - const auto it{std::ranges::find_if(views, [offset, size, dfmt, nfmt](const BufferView& view) { - return offset == view.offset && size == view.size && dfmt == view.dfmt && nfmt == view.nfmt; + const auto it{std::ranges::find_if(views, [=](const BufferView& view) { + return offset == view.offset && size == view.size && is_written == view.is_written && + dfmt == view.dfmt && nfmt == view.nfmt; })}; if (it != views.end()) { return it->handle; } + const vk::BufferUsageFlags2CreateInfoKHR usage_flags = { + .usage = is_written ? 
vk::BufferUsageFlagBits2KHR::eStorageTexelBuffer + : vk::BufferUsageFlagBits2KHR::eUniformTexelBuffer, + }; + const vk::BufferViewCreateInfo view_ci = { + .pNext = &usage_flags, + .buffer = buffer.buffer, + .format = Vulkan::LiverpoolToVK::SurfaceFormat(dfmt, nfmt), + .offset = offset, + .range = size, + }; views.push_back({ .offset = offset, .size = size, + .is_written = is_written, .dfmt = dfmt, .nfmt = nfmt, - .handle = instance->GetDevice().createBufferView({ - .buffer = buffer.buffer, - .format = Vulkan::LiverpoolToVK::SurfaceFormat(dfmt, nfmt), - .offset = offset, - .range = size, - }), + .handle = instance->GetDevice().createBufferView(view_ci), }); return views.back().handle; } @@ -147,7 +148,7 @@ constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler_, MemoryUsage usage, u64 size_bytes) - : Buffer{instance, usage, 0, size_bytes}, scheduler{scheduler_} { + : Buffer{instance, usage, 0, AllFlags, size_bytes}, scheduler{scheduler_} { ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); const auto device = instance.GetDevice(); diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h index 87293c76..7bcd0609 100644 --- a/src/video_core/buffer_cache/buffer.h +++ b/src/video_core/buffer_cache/buffer.h @@ -31,6 +31,15 @@ enum class MemoryUsage { Stream, ///< Requests device local host visible buffer, falling back host memory. }; +constexpr vk::BufferUsageFlags ReadFlags = + vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eUniformTexelBuffer | + vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eIndexBuffer | + vk::BufferUsageFlagBits::eVertexBuffer | vk::BufferUsageFlagBits::eIndirectBuffer; + +constexpr vk::BufferUsageFlags AllFlags = ReadFlags | vk::BufferUsageFlagBits::eTransferDst | + vk::BufferUsageFlagBits::eStorageTexelBuffer | + vk::BufferUsageFlagBits::eStorageBuffer; + struct UniqueBuffer { explicit UniqueBuffer(vk::Device device, VmaAllocator allocator); ~UniqueBuffer(); @@ -65,7 +74,7 @@ struct UniqueBuffer { class Buffer { public: explicit Buffer(const Vulkan::Instance& instance, MemoryUsage usage, VAddr cpu_addr_, - u64 size_bytes_); + vk::BufferUsageFlags flags, u64 size_bytes_); Buffer& operator=(const Buffer&) = delete; Buffer(const Buffer&) = delete; @@ -73,7 +82,8 @@ public: Buffer& operator=(Buffer&&) = default; Buffer(Buffer&&) = default; - vk::BufferView View(u32 offset, u32 size, AmdGpu::DataFormat dfmt, AmdGpu::NumberFormat nfmt); + vk::BufferView View(u32 offset, u32 size, bool is_written, AmdGpu::DataFormat dfmt, + AmdGpu::NumberFormat nfmt); /// Increases the likeliness of this being a stream buffer void IncreaseStreamScore(int score) noexcept { @@ -121,6 +131,7 @@ public: struct BufferView { u32 offset; u32 size; + bool is_written; AmdGpu::DataFormat dfmt; AmdGpu::NumberFormat nfmt; vk::BufferView handle; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 02d6b2ce..b838989f 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -23,7 +23,7 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize}, memory_tracker{&tracker} { // Ensure the first slot is used for the null buffer - void(slot_buffers.insert(instance, MemoryUsage::DeviceLocal, 
0, 1)); + void(slot_buffers.insert(instance, MemoryUsage::DeviceLocal, 0, ReadFlags, 1)); } BufferCache::~BufferCache() = default; @@ -421,7 +421,7 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) { const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size); const u32 size = static_cast(overlap.end - overlap.begin); const BufferId new_buffer_id = - slot_buffers.insert(instance, MemoryUsage::DeviceLocal, overlap.begin, size); + slot_buffers.insert(instance, MemoryUsage::DeviceLocal, overlap.begin, AllFlags, size); auto& new_buffer = slot_buffers[new_buffer_id]; const size_t size_bytes = new_buffer.SizeBytes(); const auto cmdbuf = scheduler.CommandBuffer(); @@ -495,7 +495,8 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size) } else { // For large one time transfers use a temporary host buffer. // RenderDoc can lag quite a bit if the stream buffer is too large. - Buffer temp_buffer{instance, MemoryUsage::Upload, 0, total_size_bytes}; + Buffer temp_buffer{instance, MemoryUsage::Upload, 0, vk::BufferUsageFlagBits::eTransferSrc, + total_size_bytes}; src_buffer = temp_buffer.Handle(); u8* const staging = temp_buffer.mapped_data.data(); for (auto& copy : copies) { diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 81cf9c02..e62c1040 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -13,22 +13,31 @@ namespace Vulkan { ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler_, vk::PipelineCache pipeline_cache, u64 compute_key_, - const Program* program) - : instance{instance_}, scheduler{scheduler_}, compute_key{compute_key_}, - info{&program->pgm.info} { + const Shader::Info& info_, vk::ShaderModule module) + : instance{instance_}, scheduler{scheduler_}, compute_key{compute_key_}, info{&info_} { const vk::PipelineShaderStageCreateInfo shader_ci = { .stage = vk::ShaderStageFlagBits::eCompute, - .module = program->module, + .module = module, .pName = "main", }; u32 binding{}; boost::container::small_vector bindings; for (const auto& buffer : info->buffers) { + const auto sharp = buffer.GetSharp(*info); bindings.push_back({ .binding = binding++, - .descriptorType = buffer.is_storage ? vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, + .descriptorType = buffer.IsStorage(sharp) ? vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }); + } + for (const auto& tex_buffer : info->texture_buffers) { + bindings.push_back({ + .binding = binding++, + .descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer + : vk::DescriptorType::eUniformTexelBuffer, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute, }); @@ -91,22 +100,24 @@ ComputePipeline::~ComputePipeline() = default; bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const { // Bind resource buffers and textures. 
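The buffer and texel-buffer binding loops below all rely on the same offset-splitting idea: the descriptor is bound at the offset rounded down to the required alignment, and the leftover is handed to the shader through push constants so it can be re-added there (divided by the element stride for texel buffers). A minimal sketch of that split; the helper name is illustrative and not part of this patch:
    // Returns {offset to bind the descriptor at, remainder to re-apply in the shader}.
    static std::pair<u32, u32> SplitBufferOffset(u32 offset, u32 alignment) {
        const u32 offset_aligned = Common::AlignDown(offset, alignment);
        return {offset_aligned, offset - offset_aligned};
    }
    // e.g. auto [offset_aligned, adjust] = SplitBufferOffset(offset, instance.TexelBufferMinAlignment());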
+ boost::container::static_vector buffer_views; boost::container::static_vector buffer_infos; boost::container::static_vector image_infos; boost::container::small_vector set_writes; Shader::PushData push_data{}; u32 binding{}; - for (const auto& buffer : info->buffers) { - const auto vsharp = buffer.GetVsharp(*info); + for (const auto& desc : info->buffers) { + const auto vsharp = desc.GetSharp(*info); + const bool is_storage = desc.IsStorage(vsharp); const VAddr address = vsharp.base_address; // Most of the time when a metadata is updated with a shader it gets cleared. It means we // can skip the whole dispatch and update the tracked state instead. Also, it is not // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we will // need its full emulation anyways. For cases of metadata read a warning will be logged. - if (buffer.is_storage) { + if (desc.is_written) { if (texture_cache.TouchMeta(address, true)) { - LOG_WARNING(Render_Vulkan, "Metadata update skipped"); + LOG_TRACE(Render_Vulkan, "Metadata update skipped"); return false; } } else { @@ -115,13 +126,12 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, } } const u32 size = vsharp.GetSize(); - if (buffer.is_written) { + if (desc.is_written) { texture_cache.InvalidateMemory(address, size, true); } const u32 alignment = - buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); - const auto [vk_buffer, offset] = - buffer_cache.ObtainBuffer(address, size, buffer.is_written); + is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); + const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(address, size, desc.is_written); const u32 offset_aligned = Common::AlignDown(offset, alignment); const u32 adjust = offset - offset_aligned; if (adjust != 0) { @@ -134,20 +144,68 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, .dstBinding = binding++, .dstArrayElement = 0, .descriptorCount = 1, - .descriptorType = buffer.is_storage ? vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, + .descriptorType = is_storage ? 
vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, .pBufferInfo = &buffer_infos.back(), }); } + for (const auto& desc : info->texture_buffers) { + const auto vsharp = desc.GetSharp(*info); + vk::BufferView& buffer_view = buffer_views.emplace_back(VK_NULL_HANDLE); + if (vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) { + const VAddr address = vsharp.base_address; + const u32 size = vsharp.GetSize(); + if (desc.is_written) { + if (texture_cache.TouchMeta(address, true)) { + LOG_TRACE(Render_Vulkan, "Metadata update skipped"); + return false; + } + } else { + if (texture_cache.IsMeta(address)) { + LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)"); + } + } + if (desc.is_written) { + texture_cache.InvalidateMemory(address, size, true); + } + const u32 alignment = instance.TexelBufferMinAlignment(); + const auto [vk_buffer, offset] = + buffer_cache.ObtainBuffer(address, size, desc.is_written); + const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3; + ASSERT_MSG(fmt_stride == vsharp.GetStride(), + "Texel buffer stride must match format stride"); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + if (adjust != 0) { + ASSERT(adjust % fmt_stride == 0); + push_data.AddOffset(binding, adjust / fmt_stride); + } + buffer_view = vk_buffer->View(offset_aligned, size + adjust, desc.is_written, + vsharp.GetDataFmt(), vsharp.GetNumberFmt()); + } + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = desc.is_written ? vk::DescriptorType::eStorageTexelBuffer + : vk::DescriptorType::eUniformTexelBuffer, + .pTexelBufferView = &buffer_view, + }); + } + for (const auto& image_desc : info->images) { - const auto tsharp = - info->ReadUd(image_desc.sgpr_base, image_desc.dword_offset); - VideoCore::ImageInfo image_info{tsharp}; - VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage}; - const auto& image_view = texture_cache.FindTexture(image_info, view_info); - const auto& image = texture_cache.GetImage(image_view.image_id); - image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, image.layout); + const auto tsharp = image_desc.GetSharp(*info); + if (tsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) { + VideoCore::ImageInfo image_info{tsharp}; + VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage}; + const auto& image_view = texture_cache.FindTexture(image_info, view_info); + const auto& image = texture_cache.GetImage(image_view.image_id); + image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, image.layout); + } else { + image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); + } set_writes.push_back({ .dstSet = VK_NULL_HANDLE, .dstBinding = binding++, @@ -163,7 +221,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, } } for (const auto& sampler : info->samplers) { - const auto ssharp = sampler.GetSsharp(*info); + const auto ssharp = sampler.GetSharp(*info); const auto vk_sampler = texture_cache.GetSampler(ssharp); image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); set_writes.push_back({ diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 5da9dc7e..0132066c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ 
b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -3,7 +3,7 @@ #pragma once -#include "shader_recompiler/ir/program.h" +#include #include "shader_recompiler/runtime_info.h" #include "video_core/renderer_vulkan/vk_common.h" @@ -17,18 +17,11 @@ namespace Vulkan { class Instance; class Scheduler; -struct Program { - Shader::IR::Program pgm; - std::vector spv; - vk::ShaderModule module; - u32 end_binding; -}; - class ComputePipeline { public: explicit ComputePipeline(const Instance& instance, Scheduler& scheduler, vk::PipelineCache pipeline_cache, u64 compute_key, - const Program* program); + const Shader::Info& info, vk::ShaderModule module); ~ComputePipeline(); [[nodiscard]] vk::Pipeline Handle() const noexcept { diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 5623e431..719a911a 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -19,15 +19,11 @@ namespace Vulkan { GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& scheduler_, const GraphicsPipelineKey& key_, vk::PipelineCache pipeline_cache, - std::span programs) + std::span infos, + std::span modules) : instance{instance_}, scheduler{scheduler_}, key{key_} { const vk::Device device = instance.GetDevice(); - for (u32 i = 0; i < MaxShaderStages; i++) { - if (!programs[i]) { - continue; - } - stages[i] = &programs[i]->pgm.info; - } + std::ranges::copy(infos, stages.begin()); BuildDescSetLayout(); const vk::PushConstantRange push_constants = { @@ -194,16 +190,18 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul auto stage = u32(Shader::Stage::Vertex); boost::container::static_vector shader_stages; - shader_stages.emplace_back(vk::PipelineShaderStageCreateInfo{ - .stage = vk::ShaderStageFlagBits::eVertex, - .module = programs[stage]->module, - .pName = "main", - }); + if (infos[stage]) { + shader_stages.emplace_back(vk::PipelineShaderStageCreateInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = modules[stage], + .pName = "main", + }); + } stage = u32(Shader::Stage::Fragment); - if (programs[stage]) { + if (infos[stage]) { shader_stages.emplace_back(vk::PipelineShaderStageCreateInfo{ .stage = vk::ShaderStageFlagBits::eFragment, - .module = programs[stage]->module, + .module = modules[stage], .pName = "main", }); } @@ -309,14 +307,24 @@ void GraphicsPipeline::BuildDescSetLayout() { continue; } for (const auto& buffer : stage->buffers) { + const auto sharp = buffer.GetSharp(*stage); bindings.push_back({ .binding = binding++, - .descriptorType = buffer.is_storage ? vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, + .descriptorType = buffer.IsStorage(sharp) ? vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, }); } + for (const auto& tex_buffer : stage->texture_buffers) { + bindings.push_back({ + .binding = binding++, + .descriptorType = tex_buffer.is_written ? 
vk::DescriptorType::eStorageTexelBuffer + : vk::DescriptorType::eUniformTexelBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, + }); + } for (const auto& image : stage->images) { bindings.push_back({ .binding = binding++, @@ -347,7 +355,8 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const { // Bind resource buffers and textures. - boost::container::static_vector buffer_infos; + boost::container::static_vector buffer_views; + boost::container::static_vector buffer_infos; boost::container::static_vector image_infos; boost::container::small_vector set_writes; Shader::PushData push_data{}; @@ -362,15 +371,16 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, push_data.step1 = regs.vgt_instance_step_rate_1; } for (const auto& buffer : stage->buffers) { - const auto vsharp = buffer.GetVsharp(*stage); + const auto vsharp = buffer.GetSharp(*stage); + const bool is_storage = buffer.IsStorage(vsharp); if (vsharp) { const VAddr address = vsharp.base_address; if (texture_cache.IsMeta(address)) { LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a PS shader (buffer)"); } const u32 size = vsharp.GetSize(); - const u32 alignment = buffer.is_storage ? instance.StorageMinAlignment() - : instance.UniformMinAlignment(); + const u32 alignment = + is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(address, size, buffer.is_written); const u32 offset_aligned = Common::AlignDown(offset, alignment); @@ -388,16 +398,47 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, .dstBinding = binding++, .dstArrayElement = 0, .descriptorCount = 1, - .descriptorType = buffer.is_storage ? vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, + .descriptorType = is_storage ? vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, .pBufferInfo = &buffer_infos.back(), }); } + for (const auto& tex_buffer : stage->texture_buffers) { + const auto vsharp = tex_buffer.GetSharp(*stage); + vk::BufferView& buffer_view = buffer_views.emplace_back(VK_NULL_HANDLE); + if (vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) { + const VAddr address = vsharp.base_address; + const u32 size = vsharp.GetSize(); + const u32 alignment = instance.TexelBufferMinAlignment(); + const auto [vk_buffer, offset] = + buffer_cache.ObtainBuffer(address, size, tex_buffer.is_written); + const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3; + ASSERT_MSG(fmt_stride == vsharp.GetStride(), + "Texel buffer stride must match format stride"); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + if (adjust != 0) { + ASSERT(adjust % fmt_stride == 0); + push_data.AddOffset(binding, adjust / fmt_stride); + } + buffer_view = vk_buffer->View(offset_aligned, size + adjust, tex_buffer.is_written, + vsharp.GetDataFmt(), vsharp.GetNumberFmt()); + } + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = tex_buffer.is_written ? 
vk::DescriptorType::eStorageTexelBuffer + : vk::DescriptorType::eUniformTexelBuffer, + .pTexelBufferView = &buffer_view, + }); + } + boost::container::static_vector tsharps; for (const auto& image_desc : stage->images) { - const auto tsharp = - stage->ReadUd(image_desc.sgpr_base, image_desc.dword_offset); + const auto tsharp = image_desc.GetSharp(*stage); if (tsharp) { tsharps.emplace_back(tsharp); VideoCore::ImageInfo image_info{tsharp}; @@ -423,7 +464,7 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, } } for (const auto& sampler : stage->samplers) { - auto ssharp = sampler.GetSsharp(*stage); + auto ssharp = sampler.GetSharp(*stage); if (sampler.disable_aniso) { const auto& tsharp = tsharps[sampler.associated_image]; if (tsharp.base_level == 0 && tsharp.last_level == 0) { diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 548e7d45..3e51e652 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -59,7 +59,8 @@ class GraphicsPipeline { public: explicit GraphicsPipeline(const Instance& instance, Scheduler& scheduler, const GraphicsPipelineKey& key, vk::PipelineCache pipeline_cache, - std::span programs); + std::span stages, + std::span modules); ~GraphicsPipeline(); void BindResources(const Liverpool::Regs& regs, VideoCore::BufferCache& buffer_cache, diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 19939a32..e1a5cb41 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -178,7 +178,7 @@ bool Instance::CreateDevice() { return false; } - boost::container::static_vector enabled_extensions; + boost::container::static_vector enabled_extensions; const auto add_extension = [&](std::string_view extension) -> bool { const auto result = std::find_if(available_extensions.begin(), available_extensions.end(), @@ -217,6 +217,7 @@ bool Instance::CreateDevice() { // with extensions. 
tooling_info = add_extension(VK_EXT_TOOLING_INFO_EXTENSION_NAME); const bool maintenance4 = add_extension(VK_KHR_MAINTENANCE_4_EXTENSION_NAME); + const bool maintenance5 = add_extension(VK_KHR_MAINTENANCE_5_EXTENSION_NAME); add_extension(VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME); add_extension(VK_EXT_SHADER_DEMOTE_TO_HELPER_INVOCATION_EXTENSION_NAME); const bool has_sync2 = add_extension(VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME); @@ -277,6 +278,7 @@ bool Instance::CreateDevice() { .depthBiasClamp = features.depthBiasClamp, .multiViewport = features.multiViewport, .samplerAnisotropy = features.samplerAnisotropy, + .vertexPipelineStoresAndAtomics = features.vertexPipelineStoresAndAtomics, .fragmentStoresAndAtomics = features.fragmentStoresAndAtomics, .shaderImageGatherExtended = features.shaderImageGatherExtended, .shaderStorageImageExtendedFormats = features.shaderStorageImageExtendedFormats, @@ -299,6 +301,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceMaintenance4FeaturesKHR{ .maintenance4 = true, }, + vk::PhysicalDeviceMaintenance5FeaturesKHR{ + .maintenance5 = true, + }, vk::PhysicalDeviceDynamicRenderingFeaturesKHR{ .dynamicRendering = true, }, @@ -344,6 +349,9 @@ bool Instance::CreateDevice() { if (!maintenance4) { device_chain.unlink(); } + if (!maintenance5) { + device_chain.unlink(); + } if (!custom_border_color) { device_chain.unlink(); } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 4cb4741a..5f985d4a 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -192,6 +192,11 @@ public: return properties.limits.minStorageBufferOffsetAlignment; } + /// Returns the minimum required alignment for texel buffers + vk::DeviceSize TexelBufferMinAlignment() const { + return properties.limits.minTexelBufferOffsetAlignment; + } + /// Returns the minimum alignment required for accessing host-mapped device memory vk::DeviceSize NonCoherentAtomSize() const { return properties.limits.nonCoherentAtomSize; } diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 139edcf7..7e880657 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -1,147 +1,59 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include "common/config.h" -#include "common/io_file.h" -#include "common/path_util.h" -#include "shader_recompiler/backend/spirv/emit_spirv.h" -#include "shader_recompiler/exception.h" -#include "shader_recompiler/recompiler.h" #include "shader_recompiler/runtime_info.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_shader_util.h" +#include "video_core/renderer_vulkan/vk_shader_cache.h" extern std::unique_ptr renderer; namespace Vulkan { -using Shader::VsOutput; - -[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) { - return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2)); -} - -void BuildVsOutputs(Shader::Info& info, const AmdGpu::Liverpool::VsOutputControl& ctl) { - const auto add_output = [&](VsOutput x, VsOutput y, VsOutput z, VsOutput w) { - if (x != VsOutput::None || y != VsOutput::None || z != VsOutput::None || - w != VsOutput::None) { 
- info.vs_outputs.emplace_back(Shader::VsOutputMap{x, y, z, w}); - } - }; - // VS_OUT_MISC_VEC - add_output(ctl.use_vtx_point_size ? VsOutput::PointSprite : VsOutput::None, - ctl.use_vtx_edge_flag - ? VsOutput::EdgeFlag - : (ctl.use_vtx_gs_cut_flag ? VsOutput::GsCutFlag : VsOutput::None), - ctl.use_vtx_kill_flag - ? VsOutput::KillFlag - : (ctl.use_vtx_render_target_idx ? VsOutput::GsMrtIndex : VsOutput::None), - ctl.use_vtx_viewport_idx ? VsOutput::GsVpIndex : VsOutput::None); - // VS_OUT_CCDIST0 - add_output(ctl.IsClipDistEnabled(0) - ? VsOutput::ClipDist0 - : (ctl.IsCullDistEnabled(0) ? VsOutput::CullDist0 : VsOutput::None), - ctl.IsClipDistEnabled(1) - ? VsOutput::ClipDist1 - : (ctl.IsCullDistEnabled(1) ? VsOutput::CullDist1 : VsOutput::None), - ctl.IsClipDistEnabled(2) - ? VsOutput::ClipDist2 - : (ctl.IsCullDistEnabled(2) ? VsOutput::CullDist2 : VsOutput::None), - ctl.IsClipDistEnabled(3) - ? VsOutput::ClipDist3 - : (ctl.IsCullDistEnabled(3) ? VsOutput::CullDist3 : VsOutput::None)); - // VS_OUT_CCDIST1 - add_output(ctl.IsClipDistEnabled(4) - ? VsOutput::ClipDist4 - : (ctl.IsCullDistEnabled(4) ? VsOutput::CullDist4 : VsOutput::None), - ctl.IsClipDistEnabled(5) - ? VsOutput::ClipDist5 - : (ctl.IsCullDistEnabled(5) ? VsOutput::CullDist5 : VsOutput::None), - ctl.IsClipDistEnabled(6) - ? VsOutput::ClipDist6 - : (ctl.IsCullDistEnabled(6) ? VsOutput::CullDist6 : VsOutput::None), - ctl.IsClipDistEnabled(7) - ? VsOutput::ClipDist7 - : (ctl.IsCullDistEnabled(7) ? VsOutput::CullDist7 : VsOutput::None)); -} - -Shader::Info MakeShaderInfo(Shader::Stage stage, std::span user_data, - const AmdGpu::Liverpool::Regs& regs) { - Shader::Info info{}; - info.user_data = user_data; - info.stage = stage; - switch (stage) { - case Shader::Stage::Vertex: { - info.num_user_data = regs.vs_program.settings.num_user_regs; - info.num_input_vgprs = regs.vs_program.settings.vgpr_comp_cnt; - BuildVsOutputs(info, regs.vs_output_control); - break; - } - case Shader::Stage::Fragment: { - info.num_user_data = regs.ps_program.settings.num_user_regs; - for (u32 i = 0; i < regs.num_interp; i++) { - info.ps_inputs.push_back({ - .param_index = regs.ps_inputs[i].input_offset.Value(), - .is_default = bool(regs.ps_inputs[i].use_default), - .is_flat = bool(regs.ps_inputs[i].flat_shade), - .default_value = regs.ps_inputs[i].default_value, - }); - } - break; - } - case Shader::Stage::Compute: { - const auto& cs_pgm = regs.cs_program; - info.num_user_data = cs_pgm.settings.num_user_regs; - info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full, - cs_pgm.num_thread_z.full}; - info.tgid_enable = {cs_pgm.IsTgidEnabled(0), cs_pgm.IsTgidEnabled(1), - cs_pgm.IsTgidEnabled(2)}; - info.shared_memory_size = cs_pgm.SharedMemSize(); - break; - } - default: - break; - } - return info; -} - PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, AmdGpu::Liverpool* liverpool_) - : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, inst_pool{8192}, - block_pool{512} { + : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, + shader_cache{std::make_unique(instance, liverpool)} { pipeline_cache = instance.GetDevice().createPipelineCacheUnique({}); - profile = Shader::Profile{ - .supported_spirv = 0x00010600U, - .subgroup_size = instance.SubgroupSize(), - .support_explicit_workgroup_layout = true, - }; } +PipelineCache::~PipelineCache() = default; + const GraphicsPipeline* PipelineCache::GetGraphicsPipeline() { + const auto& regs = liverpool->regs; // Tessellation is unsupported 
so skip the draw to avoid locking up the driver. - if (liverpool->regs.primitive_type == Liverpool::PrimitiveType::PatchPrimitive) { + if (regs.primitive_type == Liverpool::PrimitiveType::PatchPrimitive) { + return nullptr; + } + // There are several cases (e.g. FCE, FMask/HTile decompression) where we don't need to do an + // actual draw hence can skip pipeline creation. + if (regs.color_control.mode == Liverpool::ColorControl::OperationMode::EliminateFastClear) { + LOG_TRACE(Render_Vulkan, "FCE pass skipped"); + return nullptr; + } + if (regs.color_control.mode == Liverpool::ColorControl::OperationMode::FmaskDecompress) { + // TODO: check for a valid MRT1 to promote the draw to the resolve pass. + LOG_TRACE(Render_Vulkan, "FMask decompression pass skipped"); return nullptr; } RefreshGraphicsKey(); const auto [it, is_new] = graphics_pipelines.try_emplace(graphics_key); if (is_new) { - it.value() = CreateGraphicsPipeline(); + it.value() = std::make_unique(instance, scheduler, graphics_key, + *pipeline_cache, infos, modules); } const GraphicsPipeline* pipeline = it->second.get(); return pipeline; } const ComputePipeline* PipelineCache::GetComputePipeline() { - const auto& cs_pgm = liverpool->regs.cs_program; - ASSERT(cs_pgm.Address() != nullptr); - const auto* bininfo = Liverpool::GetBinaryInfo(cs_pgm); - compute_key = bininfo->shader_hash; + RefreshComputeKey(); const auto [it, is_new] = compute_pipelines.try_emplace(compute_key); if (is_new) { - it.value() = CreateComputePipeline(); + it.value() = std::make_unique(instance, scheduler, *pipeline_cache, + compute_key, *infos[0], modules[0]); } const ComputePipeline* pipeline = it->second.get(); return pipeline; @@ -229,164 +141,37 @@ void PipelineCache::RefreshGraphicsKey() { ++remapped_cb; } + u32 binding{}; for (u32 i = 0; i < MaxShaderStages; i++) { if (!regs.stage_enable.IsStageEnabled(i)) { key.stage_hashes[i] = 0; + infos[i] = nullptr; continue; } auto* pgm = regs.ProgramForStage(i); if (!pgm || !pgm->Address()) { key.stage_hashes[i] = 0; + infos[i] = nullptr; continue; } const auto* bininfo = Liverpool::GetBinaryInfo(*pgm); if (!bininfo->Valid()) { key.stage_hashes[i] = 0; + infos[i] = nullptr; continue; } - key.stage_hashes[i] = bininfo->shader_hash; - } -} - -std::unique_ptr PipelineCache::CreateGraphicsPipeline() { - const auto& regs = liverpool->regs; - - // There are several cases (e.g. FCE, FMask/HTile decompression) where we don't need to do an - // actual draw hence can skip pipeline creation. - if (regs.color_control.mode == Liverpool::ColorControl::OperationMode::EliminateFastClear) { - LOG_TRACE(Render_Vulkan, "FCE pass skipped"); - return {}; - } - - if (regs.color_control.mode == Liverpool::ColorControl::OperationMode::FmaskDecompress) { - // TODO: check for a valid MRT1 to promote the draw to the resolve pass. - LOG_TRACE(Render_Vulkan, "FMask decompression pass skipped"); - return {}; - } - - u32 binding{}; - for (u32 i = 0; i < MaxShaderStages; i++) { - if (!graphics_key.stage_hashes[i]) { - programs[i] = nullptr; - continue; - } - auto* pgm = regs.ProgramForStage(i); - const auto code = pgm->Code(); - - // Dump shader code if requested. const auto stage = Shader::Stage{i}; - const u64 hash = graphics_key.stage_hashes[i]; - if (Config::dumpShaders()) { - DumpShader(code, hash, stage, "bin"); - } - - if (stage != Shader::Stage::Fragment && stage != Shader::Stage::Vertex) { - LOG_ERROR(Render_Vulkan, "Unsupported shader stage {}. 
PL creation skipped.", stage); - return {}; - } - - const u64 lookup_hash = HashCombine(hash, binding); - auto it = program_cache.find(lookup_hash); - if (it != program_cache.end()) { - const Program* program = it.value().get(); - ASSERT(program->pgm.info.stage == stage); - programs[i] = program; - binding = program->end_binding; - continue; - } - - // Recompile shader to IR. - try { - auto program = std::make_unique(); - block_pool.ReleaseContents(); - inst_pool.ReleaseContents(); - - LOG_INFO(Render_Vulkan, "Compiling {} shader {:#x}", stage, hash); - Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs); - info.pgm_base = pgm->Address(); - info.pgm_hash = hash; - program->pgm = - Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile); - - // Compile IR to SPIR-V - program->spv = Shader::Backend::SPIRV::EmitSPIRV(profile, program->pgm, binding); - if (Config::dumpShaders()) { - DumpShader(program->spv, hash, stage, "spv"); - } - - // Compile module and set name to hash in renderdoc - program->end_binding = binding; - program->module = CompileSPV(program->spv, instance.GetDevice()); - const auto name = fmt::format("{}_{:#x}", stage, hash); - Vulkan::SetObjectName(instance.GetDevice(), program->module, name); - - // Cache program - const auto [it, _] = program_cache.emplace(lookup_hash, std::move(program)); - programs[i] = it.value().get(); - } catch (const Shader::Exception& e) { - UNREACHABLE_MSG("{}", e.what()); - } - } - - return std::make_unique(instance, scheduler, graphics_key, *pipeline_cache, - programs); -} - -std::unique_ptr PipelineCache::CreateComputePipeline() { - const auto& cs_pgm = liverpool->regs.cs_program; - const auto code = cs_pgm.Code(); - - // Dump shader code if requested. - if (Config::dumpShaders()) { - DumpShader(code, compute_key, Shader::Stage::Compute, "bin"); - } - - block_pool.ReleaseContents(); - inst_pool.ReleaseContents(); - - // Recompile shader to IR. 
- try { - auto program = std::make_unique(); - LOG_INFO(Render_Vulkan, "Compiling cs shader {:#x}", compute_key); - Shader::Info info = - MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs); - info.pgm_base = cs_pgm.Address(); - info.pgm_hash = compute_key; - program->pgm = - Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile); - - // Compile IR to SPIR-V - u32 binding{}; - program->spv = Shader::Backend::SPIRV::EmitSPIRV(profile, program->pgm, binding); - if (Config::dumpShaders()) { - DumpShader(program->spv, compute_key, Shader::Stage::Compute, "spv"); - } - - // Compile module and set name to hash in renderdoc - program->module = CompileSPV(program->spv, instance.GetDevice()); - const auto name = fmt::format("cs_{:#x}", compute_key); - Vulkan::SetObjectName(instance.GetDevice(), program->module, name); - - // Cache program - const auto [it, _] = program_cache.emplace(compute_key, std::move(program)); - return std::make_unique(instance, scheduler, *pipeline_cache, compute_key, - it.value().get()); - } catch (const Shader::Exception& e) { - UNREACHABLE_MSG("{}", e.what()); - return nullptr; + const GuestProgram guest_pgm{pgm, stage}; + std::tie(infos[i], modules[i], key.stage_hashes[i]) = + shader_cache->GetProgram(guest_pgm, binding); } } -void PipelineCache::DumpShader(std::span code, u64 hash, Shader::Stage stage, - std::string_view ext) { - using namespace Common::FS; - const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; - if (!std::filesystem::exists(dump_dir)) { - std::filesystem::create_directories(dump_dir); - } - const auto filename = fmt::format("{}_{:#018x}.{}", stage, hash, ext); - const auto file = IOFile{dump_dir / filename, FileAccessMode::Write}; - file.WriteSpan(code); +void PipelineCache::RefreshComputeKey() { + u32 binding{}; + const auto* cs_pgm = &liverpool->regs.cs_program; + const GuestProgram guest_pgm{cs_pgm, Shader::Stage::Compute}; + std::tie(infos[0], modules[0], compute_key) = shader_cache->GetProgram(guest_pgm, binding); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 8f3b806c..d0eb0c66 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -4,9 +4,6 @@ #pragma once #include -#include "shader_recompiler/ir/basic_block.h" -#include "shader_recompiler/ir/program.h" -#include "shader_recompiler/profile.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" @@ -18,6 +15,7 @@ namespace Vulkan { class Instance; class Scheduler; +class ShaderCache; class PipelineCache { static constexpr size_t MaxShaderStages = 5; @@ -25,7 +23,7 @@ class PipelineCache { public: explicit PipelineCache(const Instance& instance, Scheduler& scheduler, AmdGpu::Liverpool* liverpool); - ~PipelineCache() = default; + ~PipelineCache(); const GraphicsPipeline* GetGraphicsPipeline(); @@ -33,10 +31,7 @@ public: private: void RefreshGraphicsKey(); - void DumpShader(std::span code, u64 hash, Shader::Stage stage, std::string_view ext); - - std::unique_ptr CreateGraphicsPipeline(); - std::unique_ptr CreateComputePipeline(); + void RefreshComputeKey(); private: const Instance& instance; @@ -44,15 +39,13 @@ private: AmdGpu::Liverpool* liverpool; vk::UniquePipelineCache pipeline_cache; vk::UniquePipelineLayout pipeline_layout; - tsl::robin_map> program_cache; + std::unique_ptr shader_cache; 
tsl::robin_map> compute_pipelines; tsl::robin_map> graphics_pipelines; - std::array programs{}; - Shader::Profile profile{}; + std::array infos{}; + std::array modules{}; GraphicsPipelineKey graphics_key{}; u64 compute_key{}; - Common::ObjectPool inst_pool; - Common::ObjectPool block_pool; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_cache.cpp b/src/video_core/renderer_vulkan/vk_shader_cache.cpp new file mode 100644 index 00000000..76255712 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_shader_cache.cpp @@ -0,0 +1,192 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/config.h" +#include "common/io_file.h" +#include "common/path_util.h" +#include "shader_recompiler/backend/spirv/emit_spirv.h" +#include "shader_recompiler/recompiler.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_platform.h" +#include "video_core/renderer_vulkan/vk_shader_cache.h" +#include "video_core/renderer_vulkan/vk_shader_util.h" + +namespace Vulkan { + +using Shader::VsOutput; + +void BuildVsOutputs(Shader::Info& info, const AmdGpu::Liverpool::VsOutputControl& ctl) { + const auto add_output = [&](VsOutput x, VsOutput y, VsOutput z, VsOutput w) { + if (x != VsOutput::None || y != VsOutput::None || z != VsOutput::None || + w != VsOutput::None) { + info.vs_outputs.emplace_back(Shader::VsOutputMap{x, y, z, w}); + } + }; + // VS_OUT_MISC_VEC + add_output(ctl.use_vtx_point_size ? VsOutput::PointSprite : VsOutput::None, + ctl.use_vtx_edge_flag + ? VsOutput::EdgeFlag + : (ctl.use_vtx_gs_cut_flag ? VsOutput::GsCutFlag : VsOutput::None), + ctl.use_vtx_kill_flag + ? VsOutput::KillFlag + : (ctl.use_vtx_render_target_idx ? VsOutput::GsMrtIndex : VsOutput::None), + ctl.use_vtx_viewport_idx ? VsOutput::GsVpIndex : VsOutput::None); + // VS_OUT_CCDIST0 + add_output(ctl.IsClipDistEnabled(0) + ? VsOutput::ClipDist0 + : (ctl.IsCullDistEnabled(0) ? VsOutput::CullDist0 : VsOutput::None), + ctl.IsClipDistEnabled(1) + ? VsOutput::ClipDist1 + : (ctl.IsCullDistEnabled(1) ? VsOutput::CullDist1 : VsOutput::None), + ctl.IsClipDistEnabled(2) + ? VsOutput::ClipDist2 + : (ctl.IsCullDistEnabled(2) ? VsOutput::CullDist2 : VsOutput::None), + ctl.IsClipDistEnabled(3) + ? VsOutput::ClipDist3 + : (ctl.IsCullDistEnabled(3) ? VsOutput::CullDist3 : VsOutput::None)); + // VS_OUT_CCDIST1 + add_output(ctl.IsClipDistEnabled(4) + ? VsOutput::ClipDist4 + : (ctl.IsCullDistEnabled(4) ? VsOutput::CullDist4 : VsOutput::None), + ctl.IsClipDistEnabled(5) + ? VsOutput::ClipDist5 + : (ctl.IsCullDistEnabled(5) ? VsOutput::CullDist5 : VsOutput::None), + ctl.IsClipDistEnabled(6) + ? VsOutput::ClipDist6 + : (ctl.IsCullDistEnabled(6) ? VsOutput::CullDist6 : VsOutput::None), + ctl.IsClipDistEnabled(7) + ? VsOutput::ClipDist7 + : (ctl.IsCullDistEnabled(7) ? 
VsOutput::CullDist7 : VsOutput::None)); +} + +Shader::Info MakeShaderInfo(const GuestProgram& pgm, const AmdGpu::Liverpool::Regs& regs) { + Shader::Info info{}; + info.user_data = pgm.user_data; + info.pgm_base = VAddr(pgm.code.data()); + info.pgm_hash = pgm.hash; + info.stage = pgm.stage; + switch (pgm.stage) { + case Shader::Stage::Vertex: { + info.num_user_data = regs.vs_program.settings.num_user_regs; + info.num_input_vgprs = regs.vs_program.settings.vgpr_comp_cnt; + BuildVsOutputs(info, regs.vs_output_control); + break; + } + case Shader::Stage::Fragment: { + info.num_user_data = regs.ps_program.settings.num_user_regs; + for (u32 i = 0; i < regs.num_interp; i++) { + info.ps_inputs.push_back({ + .param_index = regs.ps_inputs[i].input_offset.Value(), + .is_default = bool(regs.ps_inputs[i].use_default), + .is_flat = bool(regs.ps_inputs[i].flat_shade), + .default_value = regs.ps_inputs[i].default_value, + }); + } + break; + } + case Shader::Stage::Compute: { + const auto& cs_pgm = regs.cs_program; + info.num_user_data = cs_pgm.settings.num_user_regs; + info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full, + cs_pgm.num_thread_z.full}; + info.tgid_enable = {cs_pgm.IsTgidEnabled(0), cs_pgm.IsTgidEnabled(1), + cs_pgm.IsTgidEnabled(2)}; + info.shared_memory_size = cs_pgm.SharedMemSize(); + break; + } + default: + break; + } + return info; +} + +[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) { + return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +} + +ShaderCache::ShaderCache(const Instance& instance_, AmdGpu::Liverpool* liverpool_) + : instance{instance_}, liverpool{liverpool_}, inst_pool{8192}, block_pool{512} { + profile = Shader::Profile{ + .supported_spirv = 0x00010600U, + .subgroup_size = instance.SubgroupSize(), + .support_explicit_workgroup_layout = true, + }; +} + +vk::ShaderModule ShaderCache::CompileModule(Shader::Info& info, std::span code, + size_t perm_idx, u32& binding) { + LOG_INFO(Render_Vulkan, "Compiling {} shader {:#x} {}", info.stage, info.pgm_hash, + perm_idx != 0 ? 
"(permutation)" : ""); + + if (Config::dumpShaders()) { + DumpShader(code, info.pgm_hash, info.stage, perm_idx, "bin"); + } + + block_pool.ReleaseContents(); + inst_pool.ReleaseContents(); + const auto ir_program = Shader::TranslateProgram(inst_pool, block_pool, code, info, profile); + + // Compile IR to SPIR-V + const auto spv = Shader::Backend::SPIRV::EmitSPIRV(profile, ir_program, binding); + if (Config::dumpShaders()) { + DumpShader(spv, info.pgm_hash, info.stage, perm_idx, "spv"); + } + + // Create module and set name to hash in renderdoc + const auto module = CompileSPV(spv, instance.GetDevice()); + ASSERT(module != VK_NULL_HANDLE); + const auto name = fmt::format("{}_{:#x}_{}", info.stage, info.pgm_hash, perm_idx); + Vulkan::SetObjectName(instance.GetDevice(), module, name); + return module; +} + +Program* ShaderCache::CreateProgram(const GuestProgram& pgm, u32& binding) { + Program* program = program_pool.Create(MakeShaderInfo(pgm, liverpool->regs)); + u32 start_binding = binding; + const auto module = CompileModule(program->info, pgm.code, 0, binding); + program->modules.emplace_back(module, StageSpecialization{program->info, start_binding}); + return program; +} + +std::tuple ShaderCache::GetProgram( + const GuestProgram& pgm, u32& binding) { + auto [it_pgm, new_program] = program_cache.try_emplace(pgm.hash); + if (new_program) { + auto program = CreateProgram(pgm, binding); + const auto module = program->modules.back().module; + it_pgm.value() = program; + return std::make_tuple(&program->info, module, HashCombine(pgm.hash, 0)); + } + + Program* program = it_pgm->second; + const auto& info = program->info; + size_t perm_idx = program->modules.size(); + StageSpecialization spec{info, binding}; + vk::ShaderModule module{}; + + const auto it = std::ranges::find(program->modules, spec, &Program::Module::spec); + if (it == program->modules.end()) { + auto new_info = MakeShaderInfo(pgm, liverpool->regs); + module = CompileModule(new_info, pgm.code, perm_idx, binding); + program->modules.emplace_back(module, std::move(spec)); + } else { + binding += info.NumBindings(); + module = it->module; + perm_idx = std::distance(program->modules.begin(), it); + } + return std::make_tuple(&info, module, HashCombine(pgm.hash, perm_idx)); +} + +void ShaderCache::DumpShader(std::span code, u64 hash, Shader::Stage stage, + size_t perm_idx, std::string_view ext) { + using namespace Common::FS; + const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; + if (!std::filesystem::exists(dump_dir)) { + std::filesystem::create_directories(dump_dir); + } + const auto filename = fmt::format("{}_{:#018x}_{}.{}", stage, hash, perm_idx, ext); + const auto file = IOFile{dump_dir / filename, FileAccessMode::Write}; + file.WriteSpan(code); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_cache.h b/src/video_core/renderer_vulkan/vk_shader_cache.h new file mode 100644 index 00000000..191e1b08 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_shader_cache.h @@ -0,0 +1,156 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include "common/object_pool.h" +#include "shader_recompiler/ir/basic_block.h" +#include "shader_recompiler/profile.h" +#include "shader_recompiler/runtime_info.h" +#include "video_core/amdgpu/liverpool.h" +#include "video_core/renderer_vulkan/vk_common.h" + +namespace Vulkan { + +class Instance; + +struct BufferSpecialization { + u16 stride : 
14; + u16 is_storage : 1; + + auto operator<=>(const BufferSpecialization&) const = default; +}; + +struct TextureBufferSpecialization { + bool is_integer; + + auto operator<=>(const TextureBufferSpecialization&) const = default; +}; + +struct ImageSpecialization { + AmdGpu::ImageType type; + bool is_integer; + + auto operator<=>(const ImageSpecialization&) const = default; +}; + +struct StageSpecialization { + static constexpr size_t MaxStageResources = 32; + + const Shader::Info* info; + std::bitset bitset{}; + boost::container::small_vector buffers; + boost::container::small_vector tex_buffers; + boost::container::small_vector images; + u32 start_binding{}; + + void ForEachSharp(u32& binding, auto& spec_list, auto& desc_list, auto&& func) { + for (const auto& desc : desc_list) { + auto& spec = spec_list.emplace_back(); + const auto sharp = desc.GetSharp(*info); + if (!sharp) { + binding++; + continue; + } + bitset.set(binding++); + func(spec, desc, sharp); + } + } + + StageSpecialization(const Shader::Info& info_, u32 start_binding_) + : info{&info_}, start_binding{start_binding_} { + u32 binding{}; + ForEachSharp(binding, buffers, info->buffers, + [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { + spec.stride = sharp.GetStride(); + spec.is_storage = desc.IsStorage(sharp); + }); + ForEachSharp(binding, tex_buffers, info->texture_buffers, + [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { + spec.is_integer = AmdGpu::IsInteger(sharp.GetNumberFmt()); + }); + ForEachSharp(binding, images, info->images, + [](auto& spec, const auto& desc, AmdGpu::Image sharp) { + spec.type = sharp.GetType(); + spec.is_integer = AmdGpu::IsInteger(sharp.GetNumberFmt()); + }); + } + + bool operator==(const StageSpecialization& other) const { + if (start_binding != other.start_binding) { + return false; + } + u32 binding{}; + for (u32 i = 0; i < buffers.size(); i++) { + if (other.bitset[binding++] && buffers[i] != other.buffers[i]) { + return false; + } + } + for (u32 i = 0; i < tex_buffers.size(); i++) { + if (other.bitset[binding++] && tex_buffers[i] != other.tex_buffers[i]) { + return false; + } + } + for (u32 i = 0; i < images.size(); i++) { + if (other.bitset[binding++] && images[i] != other.images[i]) { + return false; + } + } + return true; + } +}; + +struct Program { + struct Module { + vk::ShaderModule module; + StageSpecialization spec; + }; + + Shader::Info info; + boost::container::small_vector modules; + + explicit Program(const Shader::Info& info_) : info{info_} {} +}; + +struct GuestProgram { + Shader::Stage stage; + std::span user_data; + std::span code; + u64 hash; + + explicit GuestProgram(const auto* pgm, Shader::Stage stage_) + : stage{stage_}, user_data{pgm->user_data}, code{pgm->Code()} { + const auto* bininfo = AmdGpu::Liverpool::GetBinaryInfo(*pgm); + hash = bininfo->shader_hash; + } +}; + +class ShaderCache { +public: + explicit ShaderCache(const Instance& instance, AmdGpu::Liverpool* liverpool); + ~ShaderCache() = default; + + std::tuple GetProgram(const GuestProgram& pgm, + u32& binding); + +private: + void DumpShader(std::span code, u64 hash, Shader::Stage stage, size_t perm_idx, + std::string_view ext); + vk::ShaderModule CompileModule(Shader::Info& info, std::span code, size_t perm_idx, + u32& binding); + Program* CreateProgram(const GuestProgram& pgm, u32& binding); + +private: + const Instance& instance; + AmdGpu::Liverpool* liverpool; + Shader::Profile profile{}; + tsl::robin_map program_cache; + Common::ObjectPool inst_pool; + Common::ObjectPool block_pool; + 
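+    // inst_pool and block_pool are recycled via ReleaseContents() before each
+    // Shader::TranslateProgram() call in CompileModule(); program_pool owns the Program
+    // entries that program_cache points at for the lifetime of the cache.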
Common::ObjectPool<Program> program_pool;
+};
+
+} // namespace Vulkan
diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp
index 00c3833f..bcdc11ad 100644
--- a/src/video_core/texture_cache/image_view.cpp
+++ b/src/video_core/texture_cache/image_view.cpp
@@ -50,9 +50,9 @@ vk::ComponentSwizzle ConvertComponentSwizzle(u32 dst_sel) {
 }
 
 bool IsIdentityMapping(u32 dst_sel, u32 num_components) {
-    return (num_components == 1 && dst_sel == 0b100) ||
-           (num_components == 2 && dst_sel == 0b101'100) ||
-           (num_components == 3 && dst_sel == 0b110'101'100) ||
+    return (num_components == 1 && dst_sel == 0b001'000'000'100) ||
+           (num_components == 2 && dst_sel == 0b001'000'101'100) ||
+           (num_components == 3 && dst_sel == 0b001'110'101'100) ||
            (num_components == 4 && dst_sel == 0b111'110'101'100);
 }
 
diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp
index 0bed5adc..8b022762 100644
--- a/src/video_core/texture_cache/tile_manager.cpp
+++ b/src/video_core/texture_cache/tile_manager.cpp
@@ -187,6 +187,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
     case vk::Format::eR32Uint:
     case vk::Format::eR16G16Sfloat:
     case vk::Format::eR16G16Unorm:
+    case vk::Format::eB10G11R11UfloatPack32:
         return vk::Format::eR32Uint;
     case vk::Format::eBc1RgbaSrgbBlock:
     case vk::Format::eBc1RgbaUnormBlock:
@@ -202,6 +203,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
     case vk::Format::eBc3SrgbBlock:
     case vk::Format::eBc3UnormBlock:
     case vk::Format::eBc5UnormBlock:
+    case vk::Format::eBc5SnormBlock:
     case vk::Format::eBc7SrgbBlock:
     case vk::Format::eBc7UnormBlock:
     case vk::Format::eBc6HUfloatBlock:
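// A minimal standalone sketch (not part of the patch) of the permutation lookup performed by
// ShaderCache::GetProgram() above: snapshot the runtime state into a specialization key, reuse a
// previously compiled module whose key matches, otherwise compile a new permutation, and fold the
// permutation index into the shader hash so every permutation yields a distinct pipeline-key hash.
// Spec, Module and find_or_compile are illustrative stand-ins, not names from the patch.
#include <algorithm>
#include <cstdint>
#include <vector>

struct Spec {
    uint32_t stride;
    bool is_storage;
    bool operator==(const Spec&) const = default;
};

struct Module {
    uint64_t handle; // stand-in for vk::ShaderModule
    Spec spec;
};

// The same mixing function the patch uses (HashCombine) to derive per-permutation hashes.
uint64_t HashCombine(uint64_t seed, uint64_t hash) {
    return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

uint64_t find_or_compile(std::vector<Module>& modules, uint64_t shader_hash, const Spec& wanted) {
    const auto it = std::ranges::find(modules, wanted, &Module::spec);
    const size_t perm_idx = it == modules.end() ? modules.size() : size_t(it - modules.begin());
    if (it == modules.end()) {
        // Placeholder for CompileModule(): translate the guest code and emit SPIR-V here.
        modules.push_back({perm_idx + 1, wanted});
    }
    return HashCombine(shader_hash, perm_idx);
}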