From 1fb0da9b897b9a2b9d31a42898b660ef7d01a848 Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:21:48 +0300 Subject: [PATCH] video_core: Crucial buffer cache fixes + proper GPU clears (#414) * translator: Use templates for stronger type guarantees * spirv: Define buffer offsets upfront * Saves a lot of shader instructions * buffer_cache: Use dynamic vertex input when available * Fixes issues when games like Dark Souls rebind vertex buffers with a different stride * externals: Update boost * spirv: Use runtime array for ssbos * SSBOs can be large and their sizes typically vary, especially in generic copy/clear compute shaders * fs: Lock when doing case-insensitive search * Dark Souls does fs lookups from different threads * texture_cache: More precise invalidation from compute * Fixes unrelated render targets being cleared * texture_cache: Use hashes to protect GPU-modified images from reupload * translator: Treat V_CNDMASK as float * Sometimes it can have input modifiers. The worst this can cause is some extra calls to uintBitsToFloat and its inverse, but most often the value is used as a float anyway * translator: Small optimization for V_SAD_U32 * Fix review * clang format --- externals/ext-boost | 2 +- src/core/file_sys/fs.cpp | 1 + .../libraries/kernel/threads/semaphore.cpp | 3 - .../spirv/emit_spirv_context_get_set.cpp | 15 +- .../backend/spirv/emit_spirv_special.cpp | 4 +- .../backend/spirv/spirv_emit_context.cpp | 26 +- .../backend/spirv/spirv_emit_context.h | 5 +- .../frontend/translate/translate.cpp | 329 ++++++++---------- .../frontend/translate/translate.h | 8 +- .../frontend/translate/vector_alu.cpp | 147 ++++---- src/video_core/buffer_cache/buffer_cache.cpp | 41 +++ src/video_core/buffer_cache/buffer_cache.h | 6 + .../renderer_vulkan/renderer_vulkan.h | 4 +- .../renderer_vulkan/vk_compute_pipeline.cpp | 5 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 3 + .../renderer_vulkan/vk_instance.cpp | 9 +- src/video_core/renderer_vulkan/vk_instance.h | 6 + .../renderer_vulkan/vk_pipeline_cache.cpp | 4 + src/video_core/texture_cache/image.cpp | 1 + src/video_core/texture_cache/image.h | 1 + .../texture_cache/texture_cache.cpp | 82 +++-- src/video_core/texture_cache/texture_cache.h | 15 +- src/video_core/texture_cache/tile_manager.cpp | 1 - 23 files changed, 372 insertions(+), 346 deletions(-) diff --git a/externals/ext-boost b/externals/ext-boost index 147b2de7..a04136ad 160000 --- a/externals/ext-boost +++ b/externals/ext-boost @@ -1 +1 @@ -Subproject commit 147b2de7734f5dc3b9aeb1f4135ae15fcd44b9d7 +Subproject commit a04136add1e469f46d8ae8d3e8307779240a5c53 diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp index 2bcff191..a6d5c3ea 100644 --- a/src/core/file_sys/fs.cpp +++ b/src/core/file_sys/fs.cpp @@ -54,6 +54,7 @@ std::filesystem::path MntPoints::GetHostPath(const std::string& guest_directory) // If the path does not exist attempt to verify this. // Retrieve parent path until we find one that exists. 
+ std::scoped_lock lk{m_mutex}; path_parts.clear(); auto current_path = host_path; while (!std::filesystem::exists(current_path)) { diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp index 5441c641..5304dc57 100644 --- a/src/core/libraries/kernel/threads/semaphore.cpp +++ b/src/core/libraries/kernel/threads/semaphore.cpp @@ -9,7 +9,6 @@ #include "common/assert.h" #include "common/logging/log.h" #include "core/libraries/error_codes.h" -#include "core/libraries/kernel/thread_management.h" #include "core/libraries/libs.h" namespace Libraries::Kernel { @@ -82,7 +81,6 @@ public: public: struct WaitingThread : public ListBaseHook { - std::string name; std::condition_variable cv; u32 priority; s32 need_count; @@ -90,7 +88,6 @@ public: bool was_cancled{}; explicit WaitingThread(s32 need_count, bool is_fifo) : need_count{need_count} { - name = scePthreadSelf()->name; if (is_fifo) { return; } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index bd34ed3d..5eae058a 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -128,11 +128,7 @@ Id EmitReadConst(EmitContext& ctx) { Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } - const Id offset_dwords{ctx.OpShiftRightLogical(ctx.U32[1], buffer.offset, ctx.ConstU32(2U))}; - index = ctx.OpIAdd(ctx.U32[1], index, offset_dwords); + index = ctx.OpIAdd(ctx.U32[1], index, buffer.offset_dwords); const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; return ctx.OpLoad(buffer.data_types->Get(1), ptr); } @@ -229,9 +225,6 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { template <u32 N> static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { @@ -404,9 +397,6 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp template <u32 N> static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); if constexpr (N == 1) { return GetBufferFormatValue(ctx, handle, address, 0); @@ -438,9 +428,6 @@ Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id ad template <u32 N> static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp index 891e41df..3ed89692 100644 --- 
a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp @@ -6,7 +6,9 @@ namespace Shader::Backend::SPIRV { -void EmitPrologue(EmitContext& ctx) {} +void EmitPrologue(EmitContext& ctx) { + ctx.DefineBufferOffsets(); +} void EmitEpilogue(EmitContext& ctx) {} diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 61b55437..55754d45 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -165,14 +165,18 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f throw InvalidArgument("Invalid attribute type {}", fmt); } -Id EmitContext::GetBufferOffset(u32 binding) { - const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4); - const u32 comp = (binding & 0xf) >> 2; - const u32 offset = (binding & 0x3) << 3; - const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), - push_data_block, ConstU32(half), ConstU32(comp))}; - const Id value{OpLoad(U32[1], ptr)}; - return OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); +void EmitContext::DefineBufferOffsets() { + for (auto& buffer : buffers) { + const u32 binding = buffer.binding; + const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4); + const u32 comp = (binding & 0xf) >> 2; + const u32 offset = (binding & 0x3) << 3; + const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), + push_data_block, ConstU32(half), ConstU32(comp))}; + const Id value{OpLoad(U32[1], ptr)}; + buffer.offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); + buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, ConstU32(2U)); + } } Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { @@ -327,7 +331,9 @@ void EmitContext::DefineBuffers() { for (u32 i = 0; const auto& buffer : info.buffers) { const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32; const Id data_type = (*data_types)[1]; - const Id record_array_type{TypeArray(data_type, ConstU32(buffer.length))}; + const Id record_array_type{buffer.is_storage + ? 
TypeRuntimeArray(data_type) + : TypeArray(data_type, ConstU32(buffer.length))}; const Id struct_type{TypeStruct(record_array_type)}; if (std::ranges::find(type_ids, record_array_type.value, &Id::value) == type_ids.end()) { Decorate(record_array_type, spv::Decoration::ArrayStride, 4); @@ -354,7 +360,7 @@ void EmitContext::DefineBuffers() { buffers.push_back({ .id = id, - .global_binding = binding++, + .binding = binding++, .data_types = data_types, .pointer_type = pointer_type, .buffer = buffer.GetVsharp(info), diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 0d090eb3..81237a9a 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -40,7 +40,7 @@ public: ~EmitContext(); Id Def(const IR::Value& value); - Id GetBufferOffset(u32 binding); + void DefineBufferOffsets(); [[nodiscard]] Id DefineInput(Id type, u32 location) { const Id input_id{DefineVar(type, spv::StorageClass::Input)}; @@ -203,7 +203,8 @@ public: struct BufferDefinition { Id id; Id offset; - u32 global_binding; + Id offset_dwords; + u32 binding; const VectorIds* data_types; Id pointer_type; AmdGpu::Buffer buffer; diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index b295c1be..8ffde7fb 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -73,101 +73,190 @@ void Translator::EmitPrologue() { } } -template <> -IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { - IR::U32F32 value{}; +template <typename T> +T Translator::GetSrc(const InstOperand& operand) { + constexpr bool is_float = std::is_same_v<T, IR::F32>; - const bool is_float = operand.type == ScalarType::Float32 || force_flt; + const auto get_imm = [&](auto value) -> T { + if constexpr (is_float) { + return ir.Imm32(std::bit_cast<float>(value)); + } else { + return ir.Imm32(std::bit_cast<u32>(value)); + } + }; + + T value{}; switch (operand.field) { case OperandField::ScalarGPR: - if (is_float) { - value = ir.GetScalarReg<IR::F32>(IR::ScalarReg(operand.code)); - } else { - value = ir.GetScalarReg<IR::U32>(IR::ScalarReg(operand.code)); - } + value = ir.GetScalarReg<T>(IR::ScalarReg(operand.code)); break; case OperandField::VectorGPR: - if (is_float) { - value = ir.GetVectorReg<IR::F32>(IR::VectorReg(operand.code)); - } else { - value = ir.GetVectorReg<IR::U32>(IR::VectorReg(operand.code)); - } + value = ir.GetVectorReg<T>(IR::VectorReg(operand.code)); break; case OperandField::ConstZero: - if (is_float) { - value = ir.Imm32(0.f); - } else { - value = ir.Imm32(0U); - } + value = get_imm(0U); break; case OperandField::SignedConstIntPos: - ASSERT(!force_flt); - value = ir.Imm32(operand.code - SignedConstIntPosMin + 1); + value = get_imm(operand.code - SignedConstIntPosMin + 1); break; case OperandField::SignedConstIntNeg: - ASSERT(!force_flt); - value = ir.Imm32(-s32(operand.code) + SignedConstIntNegMin - 1); + value = get_imm(-s32(operand.code) + SignedConstIntNegMin - 1); break; case OperandField::LiteralConst: - if (is_float) { - value = ir.Imm32(std::bit_cast<f32>(operand.code)); - } else { - value = ir.Imm32(operand.code); - } + value = get_imm(operand.code); break; case OperandField::ConstFloatPos_1_0: - if (is_float) { - value = ir.Imm32(1.f); - } else { - value = ir.Imm32(std::bit_cast<u32>(1.f)); - } + value = get_imm(1.f); break; case OperandField::ConstFloatPos_0_5: - value = ir.Imm32(0.5f); + value = get_imm(0.5f); 
break; case OperandField::ConstFloatPos_2_0: - value = ir.Imm32(2.0f); + value = get_imm(2.0f); break; case OperandField::ConstFloatPos_4_0: - value = ir.Imm32(4.0f); + value = get_imm(4.0f); break; case OperandField::ConstFloatNeg_0_5: - value = ir.Imm32(-0.5f); + value = get_imm(-0.5f); break; case OperandField::ConstFloatNeg_1_0: - if (is_float) { - value = ir.Imm32(-1.0f); - } else { - value = ir.Imm32(std::bit_cast<u32>(-1.0f)); - } + value = get_imm(-1.0f); break; case OperandField::ConstFloatNeg_2_0: - value = ir.Imm32(-2.0f); + value = get_imm(-2.0f); break; case OperandField::ConstFloatNeg_4_0: - value = ir.Imm32(-4.0f); + value = get_imm(-4.0f); break; case OperandField::VccLo: - if (force_flt) { + if constexpr (is_float) { value = ir.BitCast<IR::F32>(ir.GetVccLo()); } else { value = ir.GetVccLo(); } break; case OperandField::VccHi: - if (force_flt) { + if constexpr (is_float) { value = ir.BitCast<IR::F32>(ir.GetVccHi()); } else { value = ir.GetVccHi(); } break; case OperandField::M0: - return m0_value; + if constexpr (is_float) { + UNREACHABLE(); + } else { + return m0_value; + } default: UNREACHABLE(); } - if (is_float) { + if constexpr (is_float) { if (operand.input_modifier.abs) { value = ir.FPAbs(value); } if (operand.input_modifier.neg) { value = ir.FPNeg(value); } + } else { + if (operand.input_modifier.abs) { + UNREACHABLE(); + } + if (operand.input_modifier.neg) { + UNREACHABLE(); + } + } return value; } +template IR::U32 Translator::GetSrc<IR::U32>(const InstOperand&); +template IR::F32 Translator::GetSrc<IR::F32>(const InstOperand&); + +template <typename T> +T Translator::GetSrc64(const InstOperand& operand) { + constexpr bool is_float = std::is_same_v<T, IR::F64>; + + const auto get_imm = [&](auto value) -> T { + if constexpr (is_float) { + return ir.Imm64(std::bit_cast<f64>(value)); + } else { + return ir.Imm64(std::bit_cast<u64>(value)); + } + }; + + T value{}; + switch (operand.field) { + case OperandField::ScalarGPR: { + const auto value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); + const auto value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); + if constexpr (is_float) { + UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(value_lo, value_hi)); + } + break; + } + case OperandField::VectorGPR: { + const auto value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); + const auto value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); + if constexpr (is_float) { + UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(value_lo, value_hi)); + } + break; + } + case OperandField::ConstZero: + value = get_imm(0ULL); + break; + case OperandField::SignedConstIntPos: + value = get_imm(s64(operand.code) - SignedConstIntPosMin + 1); + break; + case OperandField::SignedConstIntNeg: + value = get_imm(-s64(operand.code) + SignedConstIntNegMin - 1); + break; + case OperandField::LiteralConst: + value = get_imm(u64(operand.code)); + break; + case OperandField::ConstFloatPos_1_0: + value = get_imm(1.0); + break; + case OperandField::ConstFloatPos_0_5: + value = get_imm(0.5); + break; + case OperandField::ConstFloatPos_2_0: + value = get_imm(2.0); + break; + case OperandField::ConstFloatPos_4_0: + value = get_imm(4.0); + break; + case OperandField::ConstFloatNeg_0_5: + value = get_imm(-0.5); + break; + case OperandField::ConstFloatNeg_1_0: + value = get_imm(-1.0); + break; + case OperandField::ConstFloatNeg_2_0: + value = get_imm(-2.0); + break; + case OperandField::ConstFloatNeg_4_0: + value = get_imm(-4.0); + break; + case OperandField::VccLo: + if constexpr (is_float) { 
UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(ir.GetVccLo(), ir.GetVccHi())); + } + break; + case OperandField::VccHi: + default: + UNREACHABLE(); + } + + if constexpr (is_float) { if (operand.input_modifier.abs) { value = ir.FPAbs(value); } @@ -178,148 +267,8 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { return value; } -template <> -IR::U32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { - return GetSrc<IR::U32F32>(operand, force_flt); -} - -template <> -IR::F32 Translator::GetSrc(const InstOperand& operand, bool) { - return GetSrc<IR::U32F32>(operand, true); -} - -template <> -IR::U64F64 Translator::GetSrc64(const InstOperand& operand, bool force_flt) { - IR::Value value_hi{}; - IR::Value value_lo{}; - - bool immediate = false; - const bool is_float = operand.type == ScalarType::Float64 || force_flt; - switch (operand.field) { - case OperandField::ScalarGPR: - if (is_float) { - value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); - value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); - } else if (operand.type == ScalarType::Uint64 || operand.type == ScalarType::Sint64) { - value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); - value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); - } else { - UNREACHABLE(); - } - break; - case OperandField::VectorGPR: - if (is_float) { - value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); - value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); - } else if (operand.type == ScalarType::Uint64 || operand.type == ScalarType::Sint64) { - value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); - value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); - } else { - UNREACHABLE(); - } - break; - case OperandField::ConstZero: - immediate = true; - if (force_flt) { - value_lo = ir.Imm64(0.0); - } else { - value_lo = ir.Imm64(u64(0U)); - } - break; - case OperandField::SignedConstIntPos: - ASSERT(!force_flt); - immediate = true; - value_lo = ir.Imm64(s64(operand.code) - SignedConstIntPosMin + 1); - break; - case OperandField::SignedConstIntNeg: - ASSERT(!force_flt); - immediate = true; - value_lo = ir.Imm64(-s64(operand.code) + SignedConstIntNegMin - 1); - break; - case OperandField::LiteralConst: - immediate = true; - if (force_flt) { - UNREACHABLE(); // There is a literal double? 
- } else { - value_lo = ir.Imm64(u64(operand.code)); - } - break; - case OperandField::ConstFloatPos_1_0: - immediate = true; - if (force_flt) { - value_lo = ir.Imm64(1.0); - } else { - value_lo = ir.Imm64(std::bit_cast<u64>(f64(1.0))); - } - break; - case OperandField::ConstFloatPos_0_5: - immediate = true; - value_lo = ir.Imm64(0.5); - break; - case OperandField::ConstFloatPos_2_0: - immediate = true; - value_lo = ir.Imm64(2.0); - break; - case OperandField::ConstFloatPos_4_0: - immediate = true; - value_lo = ir.Imm64(4.0); - break; - case OperandField::ConstFloatNeg_0_5: - immediate = true; - value_lo = ir.Imm64(-0.5); - break; - case OperandField::ConstFloatNeg_1_0: - immediate = true; - value_lo = ir.Imm64(-1.0); - break; - case OperandField::ConstFloatNeg_2_0: - immediate = true; - value_lo = ir.Imm64(-2.0); - break; - case OperandField::ConstFloatNeg_4_0: - immediate = true; - value_lo = ir.Imm64(-4.0); - break; - case OperandField::VccLo: { - value_lo = ir.GetVccLo(); - value_hi = ir.GetVccHi(); - } break; - case OperandField::VccHi: - UNREACHABLE(); - default: - UNREACHABLE(); - } - - IR::Value value; - - if (immediate) { - value = value_lo; - } else if (is_float) { - throw NotImplementedException("required OpPackDouble2x32 implementation"); - } else { - IR::Value packed = ir.CompositeConstruct(value_lo, value_hi); - value = ir.PackUint2x32(packed); - } - - if (is_float) { - if (operand.input_modifier.abs) { - value = ir.FPAbs(IR::F32F64(value)); - } - if (operand.input_modifier.neg) { - value = ir.FPNeg(IR::F32F64(value)); - } - } - return IR::U64F64(value); -} - -template <> -IR::U64 Translator::GetSrc64(const InstOperand& operand, bool force_flt) { - return GetSrc64<IR::U64F64>(operand, force_flt); -} -template <> -IR::F64 Translator::GetSrc64(const InstOperand& operand, bool) { - return GetSrc64<IR::U64F64>(operand, true); -} +template IR::U64 Translator::GetSrc64<IR::U64>(const InstOperand&); +template IR::F64 Translator::GetSrc64<IR::F64>(const InstOperand&); void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) { IR::U32F32 result = value; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index fe4457d2..2e12209d 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -211,10 +211,10 @@ public: void IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst); private: - template <typename T = IR::U32F32> - [[nodiscard]] T GetSrc(const InstOperand& operand, bool flt_zero = false); - template <typename T = IR::U64F64> - [[nodiscard]] T GetSrc64(const InstOperand& operand, bool flt_zero = false); + template <typename T = IR::U32> + [[nodiscard]] T GetSrc(const InstOperand& operand); + template <typename T = IR::U64> + [[nodiscard]] T GetSrc64(const InstOperand& operand); void SetDst(const InstOperand& operand, const IR::U32F32& value); void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 89428c44..1bbc3c16 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "shader_recompiler/frontend/translate/translate.h" -#include "shader_recompiler/profile.h" namespace Shader::Gcn { @@ -312,7 +311,7 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { } void Translator::V_MOV(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[0])); + SetDst(inst.dst[0], 
GetSrc<IR::F32>(inst.src[0])); } void Translator::V_SAD(const GcnInst& inst) { @@ -321,14 +320,14 @@ } void Translator::V_MAC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], ir.FPFma(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true), - GetSrc(inst.dst[0], true))); + SetDst(inst.dst[0], ir.FPFma(GetSrc<IR::F32>(inst.src[0]), GetSrc<IR::F32>(inst.src[1]), + GetSrc<IR::F32>(inst.dst[0]))); } void Translator::V_CVT_PKRTZ_F16_F32(const GcnInst& inst) { const IR::VectorReg dst_reg{inst.dst[0].code}; const IR::Value vec_f32 = - ir.CompositeConstruct(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true)); + ir.CompositeConstruct(GetSrc<IR::F32>(inst.src[0]), GetSrc<IR::F32>(inst.src[1])); ir.SetVectorReg(dst_reg, ir.PackHalf2x16(vec_f32)); } @@ -339,13 +338,13 @@ void Translator::V_CVT_F32_F16(const GcnInst& inst) { } void Translator::V_CVT_F16_F32(const GcnInst& inst) { - const IR::F32 src0 = GetSrc(inst.src[0], true); + const IR::F32 src0 = GetSrc<IR::F32>(inst.src[0]); const IR::F16 src0fp16 = ir.FPConvert(16, src0); SetDst(inst.dst[0], ir.UConvert(32, ir.BitCast<IR::U16>(src0fp16))); } void Translator::V_MUL_F32(const GcnInst& inst) { - SetDst(inst.dst[0], ir.FPMul(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true))); + SetDst(inst.dst[0], ir.FPMul(GetSrc<IR::F32>(inst.src[0]), GetSrc<IR::F32>(inst.src[1]))); } void Translator::V_CNDMASK_B32(const GcnInst& inst) { @@ -354,24 +353,8 @@ void Translator::V_CNDMASK_B32(const GcnInst& inst) { const IR::U1 flag = inst.src[2].field == OperandField::ScalarGPR ? ir.GetThreadBitScalarReg(flag_reg) : ir.GetVcc(); - - // We can treat the instruction as integer most of the time, but when a source is - // a floating point constant we will force the other as float for better readability - // The other operand is also higly likely to be float as well. - const auto is_float_const = [](OperandField field) { - return field >= OperandField::ConstFloatPos_0_5 && field <= OperandField::ConstFloatNeg_4_0; - }; - const bool has_flt_source = - is_float_const(inst.src[0].field) || is_float_const(inst.src[1].field); - IR::U32F32 src0 = GetSrc(inst.src[0], has_flt_source); - IR::U32F32 src1 = GetSrc(inst.src[1], has_flt_source); - if (src0.Type() == IR::Type::F32 && src1.Type() == IR::Type::U32) { - src1 = ir.BitCast<IR::F32>(src1); - } - if (src1.Type() == IR::Type::F32 && src0.Type() == IR::Type::U32) { - src0 = ir.BitCast<IR::F32>(src0); - } - const IR::Value result = ir.Select(flag, src1, src0); + const IR::Value result = + ir.Select(flag, GetSrc<IR::F32>(inst.src[1]), GetSrc<IR::F32>(inst.src[0])); ir.SetVectorReg(dst_reg, IR::U32F32{result}); } @@ -448,21 +431,21 @@ void Translator::V_CVT_F32_U32(const GcnInst& inst) { } void Translator::V_MAD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; + const IR::F32 src2{GetSrc<IR::F32>(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } void Translator::V_FRACT_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.Fract(src0)); } void Translator::V_ADD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; SetDst(inst.dst[0], ir.FPAdd(src0, src1)); } @@ -476,9 +459,9 @@ void 
Translator::V_CVT_OFF_F32_I4(const GcnInst& inst) { } void Translator::V_MED3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; + const IR::F32 src2{GetSrc<IR::F32>(inst.src[2])}; const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2); SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx)); } @@ -492,32 +475,32 @@ void Translator::V_MED3_I32(const GcnInst& inst) { } void Translator::V_FLOOR_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.FPFloor(src0)); } void Translator::V_SUB_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; SetDst(inst.dst[0], ir.FPSub(src0, src1)); } void Translator::V_RCP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPRecip(src0)); } void Translator::V_FMA_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; + const IR::F32 src2{GetSrc<IR::F32>(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; const IR::U1 result = [&] { switch (op) { case ConditionOp::F: @@ -557,8 +540,8 @@ void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) { } void Translator::V_MAX_F32(const GcnInst& inst, bool is_legacy) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; SetDst(inst.dst[0], ir.FPMax(src0, src1, is_legacy)); } @@ -569,40 +552,40 @@ void Translator::V_MAX_U32(bool is_signed, const GcnInst& inst) { } void Translator::V_RSQ_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPRecipSqrt(src0)); } void Translator::V_SIN_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPSin(src0)); } void Translator::V_LOG_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPLog2(src0)); } void Translator::V_EXP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPExp2(src0)); } void Translator::V_SQRT_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPSqrt(src0)); } void Translator::V_MIN_F32(const GcnInst& inst, bool is_legacy) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], 
true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; SetDst(inst.dst[0], ir.FPMin(src0, src1, is_legacy)); } void Translator::V_MIN3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; + const IR::F32 src2{GetSrc<IR::F32>(inst.src[2])}; SetDst(inst.dst[0], ir.FPMin(src0, ir.FPMin(src1, src2))); } @@ -614,9 +597,9 @@ void Translator::V_MIN3_I32(const GcnInst& inst) { } void Translator::V_MADMK_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 k{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; + const IR::F32 k{GetSrc<IR::F32>(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, k, src1)); } @@ -625,25 +608,25 @@ void Translator::V_CUBEMA_F32(const GcnInst& inst) { } void Translator::V_CUBESC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[0], true)); + SetDst(inst.dst[0], GetSrc<IR::F32>(inst.src[0])); } void Translator::V_CUBETC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[1], true)); + SetDst(inst.dst[0], GetSrc<IR::F32>(inst.src[1])); } void Translator::V_CUBEID_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[2], true)); + SetDst(inst.dst[0], GetSrc<IR::F32>(inst.src[2])); } void Translator::V_CVT_U32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToU(32, src0)); } void Translator::V_SUBREV_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; SetDst(inst.dst[0], ir.FPSub(src1, src0)); } @@ -727,9 +710,17 @@ void Translator::V_SAD_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 src2{GetSrc(inst.src[2])}; - const IR::U32 max{ir.IMax(src0, src1, false)}; - const IR::U32 min{ir.IMin(src0, src1, false)}; - SetDst(inst.dst[0], ir.IAdd(ir.ISub(max, min), src2)); + IR::U32 result; + if (src0.IsImmediate() && src0.U32() == 0U) { + result = src1; + } else if (src1.IsImmediate() && src1.U32() == 0U) { + result = src0; + } else { + const IR::U32 max{ir.IMax(src0, src1, false)}; + const IR::U32 min{ir.IMin(src0, src1, false)}; + result = ir.ISub(max, min); + } + SetDst(inst.dst[0], ir.IAdd(result, src2)); } void Translator::V_BFE_U32(bool is_signed, const GcnInst& inst) { @@ -783,7 +774,7 @@ void Translator::V_MAD_U32_U24(const GcnInst& inst) { } void Translator::V_RNDNE_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPRoundEven(src0)); } @@ -794,14 +785,14 @@ void Translator::V_BCNT_U32_B32(const GcnInst& inst) { } void Translator::V_COS_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPCos(src0)); } void Translator::V_MAX3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; + const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])}; + 
const IR::F32 src2{GetSrc<IR::F32>(inst.src[2])}; SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2))); } @@ -813,7 +804,7 @@ void Translator::V_MAX3_U32(const GcnInst& inst) { } void Translator::V_CVT_I32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToS(32, src0)); } @@ -830,12 +821,12 @@ void Translator::V_MUL_LO_U32(const GcnInst& inst) { } void Translator::V_TRUNC_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPTrunc(src0)); } void Translator::V_CEIL_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.FPCeil(src0)); } @@ -899,18 +890,18 @@ void Translator::V_BFREV_B32(const GcnInst& inst) { } void Translator::V_LDEXP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPLdexp(src0, src1)); } void Translator::V_CVT_FLR_I32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToI(32, true, ir.FPFloor(src0))); } void Translator::V_CMP_CLASS_F32(const GcnInst& inst) { - const IR::F32F64 src0{GetSrc(inst.src[0])}; + const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; IR::U1 value; if (src1.IsImmediate()) { diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 7ab0d817..2246807a 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -87,6 +87,15 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si } bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { + boost::container::small_vector<vk::VertexInputAttributeDescription2EXT, 16> attributes; + boost::container::small_vector<vk::VertexInputBindingDescription2EXT, 16> bindings; + SCOPE_EXIT { + if (instance.IsVertexInputDynamicState()) { + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.setVertexInputEXT(bindings, attributes); + } + }; + if (vs_info.vs_inputs.empty()) { return false; } @@ -122,6 +131,21 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { } guest_buffers.emplace_back(buffer); ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize()); + attributes.push_back({ + .location = input.binding, + .binding = input.binding, + .format = + Vulkan::LiverpoolToVK::SurfaceFormat(buffer.GetDataFmt(), buffer.GetNumberFmt()), + .offset = 0, + }); + bindings.push_back({ + .binding = input.binding, + .stride = buffer.GetStride(), + .inputRate = input.instance_step_rate == Shader::Info::VsInput::None + ? 
vk::VertexInputRate::eVertex + : vk::VertexInputRate::eInstance, + .divisor = 1, + }); } std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) { @@ -224,6 +248,19 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b return {&buffer, buffer.Offset(device_addr)}; } +std::pair<const Buffer*, u32> BufferCache::ObtainTempBuffer(VAddr gpu_addr, u32 size) { + const u64 page = gpu_addr >> CACHING_PAGEBITS; + const BufferId buffer_id = page_table[page]; + if (buffer_id) { + const Buffer& buffer = slot_buffers[buffer_id]; + if (buffer.IsInBounds(gpu_addr, size)) { + return {&buffer, buffer.Offset(gpu_addr)}; + } + } + const u32 offset = staging_buffer.Copy(gpu_addr, size, 16); + return {&staging_buffer, offset}; +} + bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) { const VAddr end_addr = addr + size; const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); @@ -248,6 +285,10 @@ return memory_tracker.IsRegionCpuModified(addr, size); } +bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) { + return memory_tracker.IsRegionGpuModified(addr, size); +} + BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) { if (device_addr == 0) { return NULL_BUFFER_ID; } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 0dee87cf..33ea3f86 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -69,12 +69,18 @@ public: /// Obtains a buffer for the specified region. [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written); + /// Obtains a temporary buffer for usage in texture cache. + [[nodiscard]] std::pair<const Buffer*, u32> ObtainTempBuffer(VAddr gpu_addr, u32 size); + /// Return true when a region is registered on the cache [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); /// Return true when a CPU region is modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + private: template <typename Func> void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) { diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 8178c88d..113b380e 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -47,7 +47,7 @@ public: Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address, bool is_eop) { const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, false); + const auto image_id = texture_cache.FindImage(info); auto& image = texture_cache.GetImage(image_id); return PrepareFrameInternal(image, is_eop); } @@ -61,7 +61,7 @@ public: const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address) { vo_buffers_addr.emplace_back(cpu_address); const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, false); + const auto image_id = texture_cache.FindImage(info); return texture_cache.GetImage(image_id); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 21710a76..62b50eeb 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ 
b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -96,7 +96,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, Shader::PushData push_data{}; u32 binding{}; - for (u32 i = 0; const auto& buffer : info.buffers) { + for (const auto& buffer : info.buffers) { const auto vsharp = buffer.GetVsharp(info); const VAddr address = vsharp.base_address; // Most of the time when a metadata is updated with a shader it gets cleared. It means we @@ -115,7 +115,7 @@ } const u32 size = vsharp.GetSize(); if (buffer.is_written) { - texture_cache.InvalidateMemory(address, size); + texture_cache.InvalidateMemory(address, size, true); } const u32 alignment = buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); @@ -137,7 +137,6 @@ : vk::DescriptorType::eUniformBuffer, .pBufferInfo = &buffer_infos.back(), }); - i++; } for (const auto& image_desc : info.images) { diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 5d87a1ca..cf23ade2 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -145,6 +145,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul dynamic_states.push_back(vk::DynamicState::eColorWriteEnableEXT); dynamic_states.push_back(vk::DynamicState::eColorWriteMaskEXT); } + if (instance.IsVertexInputDynamicState()) { + dynamic_states.push_back(vk::DynamicState::eVertexInputEXT); + } const vk::PipelineDynamicStateCreateInfo dynamic_info = { .dynamicStateCount = static_cast<u32>(dynamic_states.size()), diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 5beb57c4..b60b78e1 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -202,6 +202,8 @@ bool Instance::CreateDevice() { add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); workgroup_memory_explicit_layout = add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); + vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); + // The next two extensions are required to be available together in order to support write masks color_write_en = add_extension(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME); color_write_en &= add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); @@ -319,6 +321,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceSynchronization2Features{ .synchronization2 = true, }, + vk::PhysicalDeviceVertexInputDynamicStateFeaturesEXT{ + .vertexInputDynamicState = true, + }, }; if (!color_write_en) { @@ -331,8 +336,8 @@ } else { device_chain.unlink(); } - if (!has_sync2) { - device_chain.unlink<vk::PhysicalDeviceSynchronization2Features>(); + if (!vertex_input_dynamic_state) { + device_chain.unlink<vk::PhysicalDeviceVertexInputDynamicStateFeaturesEXT>(); } try { diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 2f2397d6..4cb4741a 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -132,6 +132,11 @@ public: return color_write_en; } + /// Returns true when VK_EXT_vertex_input_dynamic_state is supported. 
+ bool IsVertexInputDynamicState() const { + return vertex_input_dynamic_state; + } + /// Returns the vendor ID of the physical device u32 GetVendorID() const { return properties.vendorID; } @@ -257,6 +262,7 @@ private: bool external_memory_host{}; bool workgroup_memory_explicit_layout{}; bool color_write_en{}; + bool vertex_input_dynamic_state{}; u64 min_imported_host_pointer_alignment{}; u32 subgroup_size{}; bool tooling_info{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 8d27d252..8a22b925 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -209,6 +209,10 @@ void PipelineCache::RefreshGraphicsKey() { continue; } const auto* bininfo = Liverpool::GetBinaryInfo(*pgm); + if (!bininfo->Valid()) { + key.stage_hashes[i] = 0; + continue; + } key.stage_hashes[i] = bininfo->shader_hash; } } diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index f1148760..bae4b89d 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -117,6 +117,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, : instance{&instance_}, scheduler{&scheduler_}, info{info_}, image{instance->GetDevice(), instance->GetAllocator()}, cpu_addr{info.guest_address}, cpu_addr_end{cpu_addr + info.guest_size_bytes} { + mip_hashes.resize(info.resources.levels); ASSERT(info.pixel_format != vk::Format::eUndefined); // Here we force `eExtendedUsage` as don't know all image usage cases beforehand. In normal case // the texture cache should re-create the resource with the usage requested diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index b18f1002..5a888346 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -111,6 +111,7 @@ struct Image { vk::Flags<vk::PipelineStageFlagBits> pl_stage = vk::PipelineStageFlagBits::eAllCommands; vk::Flags<vk::AccessFlagBits> access_mask = vk::AccessFlagBits::eNone; vk::ImageLayout layout = vk::ImageLayout::eUndefined; + boost::container::small_vector<u64, 14> mip_hashes; }; } // namespace VideoCore diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 53596f8e..6b14faac 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -3,6 +3,7 @@ #include <xxhash.h> #include "common/assert.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -11,13 +12,11 @@ namespace VideoCore { -static constexpr u64 StreamBufferSize = 512_MB; static constexpr u64 PageShift = 12; TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, BufferCache& buffer_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_}, - staging{instance, scheduler, MemoryUsage::Upload, StreamBufferSize}, tile_manager{instance, scheduler} { ImageInfo info; info.pixel_format = vk::Format::eR8G8B8A8Unorm; @@ -31,9 +30,12 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& TextureCache::~TextureCache() = default; -void TextureCache::InvalidateMemory(VAddr address, size_t size) { +void TextureCache::InvalidateMemory(VAddr address, size_t size, bool from_compute) { std::unique_lock 
lock{mutex}; ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) { + if (from_compute && !image.Overlaps(address, size)) { + return; + } // Ensure image is reuploaded when accessed again. image.flags |= ImageFlagBits::CpuModified; // Untrack image, so the range is unprotected and the guest can write freely. @@ -57,7 +59,7 @@ void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) { } } -ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) { +ImageId TextureCache::FindImage(const ImageInfo& info) { if (info.guest_address == 0) [[unlikely]] { return NULL_IMAGE_VIEW_ID; } @@ -87,12 +89,6 @@ ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) { image_id = image_ids[image_ids.size() > 1 ? 1 : 0]; } - Image& image = slot_images[image_id]; - if (True(image.flags & ImageFlagBits::CpuModified) && refresh_on_create) { - RefreshImage(image); - TrackImage(image, image_id); - } - return image_id; } @@ -119,6 +115,7 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) { const ImageId image_id = FindImage(info); + UpdateImage(image_id); Image& image = slot_images[image_id]; auto& usage = image.info.usage; @@ -165,7 +162,8 @@ ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info, const ImageViewInfo& view_info) { const ImageId image_id = FindImage(image_info); Image& image = slot_images[image_id]; - image.flags &= ~ImageFlagBits::CpuModified; + image.flags |= ImageFlagBits::GpuModified; + UpdateImage(image_id); image.Transit(vk::ImageLayout::eColorAttachmentOptimal, vk::AccessFlagBits::eColorAttachmentWrite | @@ -198,8 +196,9 @@ ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info, ImageView& TextureCache::FindDepthTarget(const ImageInfo& image_info, const ImageViewInfo& view_info) { - const ImageId image_id = FindImage(image_info, false); + const ImageId image_id = FindImage(image_info); Image& image = slot_images[image_id]; + image.flags |= ImageFlagBits::GpuModified; image.flags &= ~ImageFlagBits::CpuModified; const auto new_layout = view_info.is_storage ? vk::ImageLayout::eDepthStencilAttachmentOptimal @@ -228,22 +227,6 @@ void TextureCache::RefreshImage(Image& image) { // Mark image as validated. image.flags &= ~ImageFlagBits::CpuModified; - scheduler.EndRendering(); - - const auto cmdbuf = scheduler.CommandBuffer(); - image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); - - vk::Buffer buffer{staging.Handle()}; - u32 offset{0}; - - auto upload_buffer = tile_manager.TryDetile(image); - if (upload_buffer) { - buffer = *upload_buffer; - } else { - // Upload data to the staging buffer. - offset = staging.Copy(image.info.guest_address, image.info.guest_size_bytes, 16); - } - const auto& num_layers = image.info.resources.layers; const auto& num_mips = image.info.resources.levels; ASSERT(num_mips == image.info.mips_layout.size()); @@ -254,12 +237,23 @@ void TextureCache::RefreshImage(Image& image) { const u32 height = std::max(image.info.size.height >> m, 1u); const u32 depth = image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u; - const auto& [_, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; + const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; + + // Protect GPU modified resources from accidental reuploads. 
+ if (True(image.flags & ImageFlagBits::GpuModified) && + !buffer_cache.IsRegionGpuModified(image.info.guest_address + mip_ofs, mip_size)) { + const u8* addr = std::bit_cast<const u8*>(image.info.guest_address); + const u64 hash = XXH3_64bits(addr + mip_ofs, mip_size); + if (image.mip_hashes[m] == hash) { + continue; + } + image.mip_hashes[m] = hash; + } image_copy.push_back({ - .bufferOffset = offset + mip_ofs * num_layers, - .bufferRowLength = static_cast<u32>(mip_pitch), - .bufferImageHeight = static_cast<u32>(mip_height), + .bufferOffset = mip_ofs * num_layers, + .bufferRowLength = static_cast<u32>(mip_pitch), + .bufferImageHeight = static_cast<u32>(mip_height), .imageSubresource{ .aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = m, @@ -271,6 +265,30 @@ } + if (image_copy.empty()) { + return; + } + + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite, cmdbuf); + + const VAddr image_addr = image.info.guest_address; + const size_t image_size = image.info.guest_size_bytes; + vk::Buffer buffer{}; + u32 offset{}; + if (auto upload_buffer = tile_manager.TryDetile(image); upload_buffer) { + buffer = *upload_buffer; + } else { + const auto [vk_buffer, buf_offset] = buffer_cache.ObtainTempBuffer(image_addr, image_size); + buffer = vk_buffer->Handle(); + offset = buf_offset; + } + + for (auto& copy : image_copy) { + copy.bufferOffset += offset; + } + cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy); } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 17a09898..b3af0ff1 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -38,13 +38,13 @@ public: ~TextureCache(); /// Invalidates any image in the logical page range. - void InvalidateMemory(VAddr address, size_t size); + void InvalidateMemory(VAddr address, size_t size, bool from_compute = false); /// Evicts any images that overlap the unmapped range. void UnmapMemory(VAddr cpu_addr, size_t size); /// Retrieves the image handle of the image with the provided attributes. - [[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true); + [[nodiscard]] ImageId FindImage(const ImageInfo& info); /// Retrieves an image view with the properties of the specified image descriptor. [[nodiscard]] ImageView& FindTexture(const ImageInfo& image_info, @@ -58,6 +58,16 @@ public: [[nodiscard]] ImageView& FindDepthTarget(const ImageInfo& image_info, const ImageViewInfo& view_info); + /// Updates image contents if it was modified by CPU. + void UpdateImage(ImageId image_id) { + Image& image = slot_images[image_id]; + if (False(image.flags & ImageFlagBits::CpuModified)) { + return; + } + RefreshImage(image); + TrackImage(image, image_id); + } + /// Reuploads image contents. 
void RefreshImage(Image& image); @@ -170,7 +180,6 @@ private: Vulkan::Scheduler& scheduler; BufferCache& buffer_cache; PageManager& tracker; - StreamBuffer staging; TileManager tile_manager; Common::SlotVector<Image> slot_images; Common::SlotVector<ImageView> slot_image_views; diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index 75fa378c..6447fde1 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -5,7 +5,6 @@ #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/texture_cache/image_view.h" -#include "video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/tile_manager.h" #include "video_core/host_shaders/detile_m32x1_comp.h"
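
A note on the GetSrc/GetSrc64 rewrite in translate.cpp: the old runtime force_flt flag becomes a compile-time choice on the requested IR type, so an illegal combination (for example, input modifiers on an integer read) now trips UNREACHABLE() at the offending call site. Below is a self-contained sketch of the underlying idiom with the IR wrappers reduced to plain scalars; the names are illustrative, not from the patch.

    #include <bit>
    #include <cstdint>
    #include <iostream>

    // Stand-in for ir.Imm32(): the same 32-bit pattern is produced either as a
    // float immediate or as an integer immediate, selected at compile time.
    template <typename T>
    T MakeImm32(auto value) {
        static_assert(sizeof(value) == 4 && sizeof(T) == 4);
        return std::bit_cast<T>(value);
    }

    int main() {
        const float f = MakeImm32<float>(1.0f);       // float view: 1.0f
        const uint32_t u = MakeImm32<uint32_t>(1.0f); // integer view: 0x3f800000
        std::cout << f << " 0x" << std::hex << u << '\n';
    }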
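
A note on DefineBufferOffsets() in spirv_emit_context.cpp: per-buffer byte offsets travel in push constants, one byte per binding, and are now unpacked once in the shader prologue instead of on every buffer access. The following is a hedged host-side mirror of that packing; the exact PushData layout is an assumption inferred from the index math in the patch.

    #include <array>
    #include <cassert>
    #include <cstdint>

    struct PushData {
        static constexpr uint32_t BufOffsetIndex = 2; // assumed: first word after step0/step1
        uint32_t step0;
        uint32_t step1;
        std::array<uint8_t, 32> buf_offsets; // one byte offset per buffer binding

        void AddOffset(uint32_t binding, uint32_t offset) {
            assert(offset < 256 && binding < buf_offsets.size());
            buf_offsets[binding] = static_cast<uint8_t>(offset);
        }
    };

    // The SPIR-V side addresses the same bytes as uvec4 members:
    //   half = BufOffsetIndex + (binding >> 4)  -> which uvec4 member
    //   comp = (binding & 0xf) >> 2             -> which u32 inside it
    //   bit  = (binding & 0x3) << 3             -> which byte inside that u32
    // extracted with OpBitFieldUExtract(value, bit, 8) once per buffer in EmitPrologue().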
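
A note on the VK_EXT_vertex_input_dynamic_state path in buffer_cache.cpp: vertex stride is normally baked into the graphics pipeline, so a title that rebinds the same shaders with a different stride would otherwise need, or incorrectly reuse, another pipeline. With the extension enabled, vertex input is re-described per draw. A sketch of the call shape, with placeholder values and a real but minimally configured Vulkan-Hpp setup:

    // Assumes VULKAN_HPP_NO_CONSTRUCTORS (designated initializers, as in the codebase)
    // and a dispatcher with VK_EXT_vertex_input_dynamic_state loaded.
    #include <vulkan/vulkan.hpp>

    void SetGuestVertexInput(vk::CommandBuffer cmdbuf, uint32_t stride, vk::Format format) {
        const vk::VertexInputBindingDescription2EXT binding{
            .binding = 0,
            .stride = stride, // re-read from the guest V# on every draw
            .inputRate = vk::VertexInputRate::eVertex,
            .divisor = 1,
        };
        const vk::VertexInputAttributeDescription2EXT attribute{
            .location = 0,
            .binding = 0,
            .format = format, // translated from the V# data/number format
            .offset = 0,
        };
        // Overrides the pipeline's fixed vertex input state; no pipeline rebuild needed.
        cmdbuf.setVertexInputEXT(binding, attribute);
    }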
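
A note on the V_SAD_U32 change in vector_alu.cpp: sad(a, b, c) = |a - b| + c, with |a - b| computed as max(a, b) - min(a, b) to stay in unsigned arithmetic; when either source is a literal zero, the min/max pair folds away to the other operand. A plain C++ model of the lowering:

    #include <algorithm>
    #include <cstdint>

    uint32_t sad_u32(uint32_t a, uint32_t b, uint32_t c) {
        if (a == 0) return b + c; // fast paths matching the new immediate-zero checks
        if (b == 0) return a + c;
        return (std::max(a, b) - std::min(a, b)) + c; // |a - b| without signed overflow
    }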
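
A note on the hash guard in RefreshImage() (texture_cache.cpp): once an image is flagged GpuModified, a CPU-triggered refresh only rewrites mips whose guest bytes actually changed, so GPU-side clears are no longer clobbered by stale reuploads. A sketch of the check, assuming xxhash's XXH3_64bits() as used in the patch; MipNeedsUpload and the reduced struct are hypothetical helpers, not part of the patch.

    #include <cstdint>
    #include <vector>
    #include <xxhash.h>

    struct MipHashes {
        std::vector<uint64_t> hashes; // one entry per mip level, sized at image creation
    };

    bool MipNeedsUpload(MipHashes& image, uint32_t mip, const uint8_t* guest, uint64_t size) {
        const uint64_t hash = XXH3_64bits(guest, size);
        if (image.hashes[mip] == hash) {
            return false; // unchanged since the last upload; keep the GPU copy (e.g. a clear)
        }
        image.hashes[mip] = hash;
        return true;
    }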
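
A note on the FindImage()/UpdateImage() split in the texture cache: lookup no longer refreshes as a side effect, so render and depth targets can be marked GpuModified without being overwritten from guest memory, while sampling paths opt in explicitly. A hedged sketch of the call shape at a sampling site (names follow the patch, the call site itself is illustrative):

    const ImageId image_id = texture_cache.FindImage(info); // pure lookup, no upload
    texture_cache.UpdateImage(image_id); // RefreshImage + TrackImage only if CpuModified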