From efcabbfd2a1389871ec413c688c48cfe5aeaacb8 Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:29:24 +0300 Subject: [PATCH] video_core: Introduce buffer cache --- CMakeLists.txt | 11 + src/common/unique_function.h | 61 +++ .../libraries/kernel/thread_management.cpp | 9 + src/core/memory.cpp | 96 +--- src/core/memory.h | 30 +- src/core/module.cpp | 1 + .../spirv/emit_spirv_context_get_set.cpp | 30 +- .../backend/spirv/spirv_emit_context.cpp | 82 +-- .../backend/spirv/spirv_emit_context.h | 20 +- .../frontend/translate/translate.cpp | 1 + .../frontend/translate/vector_memory.cpp | 5 + .../ir/passes/resource_tracking_pass.cpp | 105 ++-- src/shader_recompiler/recompiler.cpp | 2 +- src/shader_recompiler/runtime_info.h | 21 +- src/video_core/amdgpu/resource.h | 4 + src/video_core/buffer_cache/buffer.cpp | 227 ++++++++ src/video_core/buffer_cache/buffer.h | 173 ++++++ src/video_core/buffer_cache/buffer_cache.cpp | 497 ++++++++++++++++++ src/video_core/buffer_cache/buffer_cache.h | 121 +++++ .../buffer_cache/memory_tracker_base.h | 175 ++++++ src/video_core/buffer_cache/range_set.cpp | 0 src/video_core/buffer_cache/range_set.h | 159 ++++++ src/video_core/buffer_cache/word_manager.h | 398 ++++++++++++++ src/video_core/page_manager.cpp | 209 ++++++++ src/video_core/page_manager.h | 39 ++ .../renderer_vulkan/renderer_vulkan.cpp | 4 +- .../renderer_vulkan/renderer_vulkan.h | 6 +- .../renderer_vulkan/vk_compute_pipeline.cpp | 59 ++- .../renderer_vulkan/vk_compute_pipeline.h | 9 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 115 +--- .../renderer_vulkan/vk_graphics_pipeline.h | 14 +- .../renderer_vulkan/vk_rasterizer.cpp | 86 +-- .../renderer_vulkan/vk_rasterizer.h | 26 +- src/video_core/renderer_vulkan/vk_scheduler.h | 7 +- .../renderer_vulkan/vk_stream_buffer.h | 2 - src/video_core/texture_cache/image_info.cpp | 2 +- src/video_core/texture_cache/image_view.cpp | 15 +- .../texture_cache/texture_cache.cpp | 183 ++----- src/video_core/texture_cache/texture_cache.h | 46 +- src/video_core/texture_cache/tile_manager.cpp | 56 +- src/video_core/texture_cache/tile_manager.h | 4 +- 41 files changed, 2491 insertions(+), 619 deletions(-) create mode 100755 src/common/unique_function.h create mode 100644 src/video_core/buffer_cache/buffer.cpp create mode 100644 src/video_core/buffer_cache/buffer.h create mode 100644 src/video_core/buffer_cache/buffer_cache.cpp create mode 100644 src/video_core/buffer_cache/buffer_cache.h create mode 100644 src/video_core/buffer_cache/memory_tracker_base.h create mode 100644 src/video_core/buffer_cache/range_set.cpp create mode 100644 src/video_core/buffer_cache/range_set.h create mode 100644 src/video_core/buffer_cache/word_manager.h create mode 100644 src/video_core/page_manager.cpp create mode 100644 src/video_core/page_manager.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 90ba4d83..4a05acf5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,6 +294,7 @@ set(COMMON src/common/logging/backend.cpp src/common/thread.h src/common/types.h src/common/uint128.h + src/common/unique_function.h src/common/version.h src/common/ntapi.h src/common/ntapi.cpp @@ -451,6 +452,14 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/amdgpu/pm4_cmds.h src/video_core/amdgpu/pm4_opcodes.h src/video_core/amdgpu/resource.h + src/video_core/buffer_cache/buffer.cpp + src/video_core/buffer_cache/buffer.h + src/video_core/buffer_cache/buffer_cache.cpp + src/video_core/buffer_cache/buffer_cache.h + 
src/video_core/buffer_cache/memory_tracker_base.h + src/video_core/buffer_cache/range_set.cpp + src/video_core/buffer_cache/range_set.h + src/video_core/buffer_cache/word_manager.h src/video_core/renderer_vulkan/liverpool_to_vk.cpp src/video_core/renderer_vulkan/liverpool_to_vk.h src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -496,6 +505,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/texture_cache/tile_manager.cpp src/video_core/texture_cache/tile_manager.h src/video_core/texture_cache/types.h + src/video_core/page_manager.cpp + src/video_core/page_manager.h src/video_core/renderdoc.cpp src/video_core/renderdoc.h ) diff --git a/src/common/unique_function.h b/src/common/unique_function.h new file mode 100755 index 00000000..1891ec3c --- /dev/null +++ b/src/common/unique_function.h @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +namespace Common { + +/// General purpose function wrapper similar to std::function. +/// Unlike std::function, the captured values don't have to be copyable. +/// This class can be moved but not copied. +template +class UniqueFunction { + class CallableBase { + public: + virtual ~CallableBase() = default; + virtual ResultType operator()(Args&&...) = 0; + }; + + template + class Callable final : public CallableBase { + public: + Callable(Functor&& functor_) : functor{std::move(functor_)} {} + ~Callable() override = default; + + ResultType operator()(Args&&... args) override { + return functor(std::forward(args)...); + } + + private: + Functor functor; + }; + +public: + UniqueFunction() = default; + + template + UniqueFunction(Functor&& functor) + : callable{std::make_unique>(std::move(functor))} {} + + UniqueFunction& operator=(UniqueFunction&& rhs) noexcept = default; + UniqueFunction(UniqueFunction&& rhs) noexcept = default; + + UniqueFunction& operator=(const UniqueFunction&) = delete; + UniqueFunction(const UniqueFunction&) = delete; + + ResultType operator()(Args&&... 
args) const { + return (*callable)(std::forward(args)...); + } + + explicit operator bool() const noexcept { + return static_cast(callable); + } + +private: + std::unique_ptr callable; +}; + +} // namespace Common diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 3393138d..1b4c48fb 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -1061,7 +1061,16 @@ ScePthread PThreadPool::Create() { } } +#ifdef _WIN64 auto* ret = new PthreadInternal{}; +#else + // TODO: Linux specific hack + static u8* hint_address = reinterpret_cast(0x7FFFFC000ULL); + auto* ret = reinterpret_cast( + mmap(hint_address, sizeof(PthreadInternal), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0)); + hint_address += Common::AlignUp(sizeof(PthreadInternal), 4_KB); +#endif ret->is_free = false; ret->is_detached = false; ret->is_almost_done = false; diff --git a/src/core/memory.cpp b/src/core/memory.cpp index aa552d51..dc5ded41 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -7,7 +7,7 @@ #include "core/libraries/error_codes.h" #include "core/libraries/kernel/memory_management.h" #include "core/memory.h" -#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" namespace Core { @@ -172,7 +172,7 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M if (type == VMAType::Direct) { new_vma.phys_base = phys_addr; - MapVulkanMemory(mapped_addr, size); + rasterizer->MapMemory(mapped_addr, size); } if (type == VMAType::Flexible) { flexible_usage += size; @@ -222,7 +222,7 @@ void MemoryManager::UnmapMemory(VAddr virtual_addr, size_t size) { const auto type = it->second.type; const bool has_backing = type == VMAType::Direct || type == VMAType::File; if (type == VMAType::Direct) { - UnmapVulkanMemory(virtual_addr, size); + rasterizer->UnmapMemory(virtual_addr, size); } if (type == VMAType::Flexible) { flexible_usage -= size; @@ -263,7 +263,7 @@ int MemoryManager::QueryProtection(VAddr addr, void** start, void** end, u32* pr } int MemoryManager::VirtualQuery(VAddr addr, int flags, - Libraries::Kernel::OrbisVirtualQueryInfo* info) { + ::Libraries::Kernel::OrbisVirtualQueryInfo* info) { std::scoped_lock lk{mutex}; auto it = FindVMA(addr); @@ -293,7 +293,7 @@ int MemoryManager::VirtualQuery(VAddr addr, int flags, } int MemoryManager::DirectMemoryQuery(PAddr addr, bool find_next, - Libraries::Kernel::OrbisQueryInfo* out_info) { + ::Libraries::Kernel::OrbisQueryInfo* out_info) { std::scoped_lock lk{mutex}; auto dmem_area = FindDmemArea(addr); @@ -333,13 +333,6 @@ int MemoryManager::DirectQueryAvailable(PAddr search_start, PAddr search_end, si return ORBIS_OK; } -std::pair MemoryManager::GetVulkanBuffer(VAddr addr) { - auto it = mapped_memories.upper_bound(addr); - it = std::prev(it); - ASSERT(it != mapped_memories.end() && it->first <= addr); - return std::make_pair(*it->second.buffer, addr - it->first); -} - void MemoryManager::NameVirtualRange(VAddr virtual_addr, size_t size, std::string_view name) { auto it = FindVMA(virtual_addr); @@ -455,85 +448,6 @@ MemoryManager::DMemHandle MemoryManager::Split(DMemHandle dmem_handle, size_t of return dmem_map.emplace_hint(std::next(dmem_handle), new_area.base, new_area); }; -void MemoryManager::MapVulkanMemory(VAddr addr, size_t size) { - return; - const vk::Device device = instance->GetDevice(); - const auto memory_props = 
instance->GetPhysicalDevice().getMemoryProperties(); - void* host_pointer = reinterpret_cast(addr); - const auto host_mem_props = device.getMemoryHostPointerPropertiesEXT( - vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, host_pointer); - ASSERT(host_mem_props.memoryTypeBits != 0); - - int mapped_memory_type = -1; - auto find_mem_type_with_flag = [&](const vk::MemoryPropertyFlags flags) { - u32 host_mem_types = host_mem_props.memoryTypeBits; - while (host_mem_types != 0) { - // Try to find a cached memory type - mapped_memory_type = std::countr_zero(host_mem_types); - host_mem_types -= (1 << mapped_memory_type); - - if ((memory_props.memoryTypes[mapped_memory_type].propertyFlags & flags) == flags) { - return; - } - } - - mapped_memory_type = -1; - }; - - // First try to find a memory that is both coherent and cached - find_mem_type_with_flag(vk::MemoryPropertyFlagBits::eHostCoherent | - vk::MemoryPropertyFlagBits::eHostCached); - if (mapped_memory_type == -1) - // Then only coherent (lower performance) - find_mem_type_with_flag(vk::MemoryPropertyFlagBits::eHostCoherent); - - if (mapped_memory_type == -1) { - LOG_CRITICAL(Render_Vulkan, "No coherent memory available for memory mapping"); - mapped_memory_type = std::countr_zero(host_mem_props.memoryTypeBits); - } - - const vk::StructureChain alloc_info = { - vk::MemoryAllocateInfo{ - .allocationSize = size, - .memoryTypeIndex = static_cast(mapped_memory_type), - }, - vk::ImportMemoryHostPointerInfoEXT{ - .handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, - .pHostPointer = host_pointer, - }, - }; - - const auto [it, new_memory] = mapped_memories.try_emplace(addr); - ASSERT_MSG(new_memory, "Attempting to remap already mapped vulkan memory"); - - auto& memory = it->second; - memory.backing = device.allocateMemoryUnique(alloc_info.get()); - - constexpr vk::BufferUsageFlags MapFlags = - vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer | - vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst | - vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer; - - const vk::StructureChain buffer_info = { - vk::BufferCreateInfo{ - .size = size, - .usage = MapFlags, - .sharingMode = vk::SharingMode::eExclusive, - }, - vk::ExternalMemoryBufferCreateInfoKHR{ - .handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, - }}; - memory.buffer = device.createBufferUnique(buffer_info.get()); - device.bindBufferMemory(*memory.buffer, *memory.backing, 0); -} - -void MemoryManager::UnmapVulkanMemory(VAddr addr, size_t size) { - return; - const auto it = mapped_memories.find(addr); - ASSERT(it != mapped_memories.end() && it->second.buffer_size == size); - mapped_memories.erase(it); -} - int MemoryManager::GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut, void** directMemoryStartOut, void** directMemoryEndOut) { std::scoped_lock lk{mutex}; diff --git a/src/core/memory.h b/src/core/memory.h index 2b3d07a7..6d0a977f 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -3,20 +3,17 @@ #pragma once -#include +#include #include #include -#include -#include #include "common/enum.h" #include "common/singleton.h" #include "common/types.h" #include "core/address_space.h" #include "core/libraries/kernel/memory_management.h" -#include "video_core/renderer_vulkan/vk_common.h" namespace Vulkan { -class Instance; +class Rasterizer; } namespace Libraries::Kernel { @@ -128,8 +125,8 @@ public: explicit MemoryManager(); ~MemoryManager(); - void 
SetInstance(const Vulkan::Instance* instance_) { - instance = instance_; + void SetRasterizer(Vulkan::Rasterizer* rasterizer_) { + rasterizer = rasterizer_; } void SetTotalFlexibleSize(u64 size) { @@ -140,9 +137,7 @@ public: return total_flexible_size - flexible_usage; } - /// Returns the offset of the mapped virtual system managed memory base from where it usually - /// would be mapped. - [[nodiscard]] VAddr SystemReservedVirtualBase() noexcept { + VAddr SystemReservedVirtualBase() noexcept { return impl.SystemReservedVirtualBase(); } @@ -172,8 +167,6 @@ public: int DirectQueryAvailable(PAddr search_start, PAddr search_end, size_t alignment, PAddr* phys_addr_out, size_t* size_out); - std::pair GetVulkanBuffer(VAddr addr); - int GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut, void** directMemoryStartOut, void** directMemoryEndOut); @@ -218,10 +211,6 @@ private: DMemHandle Split(DMemHandle dmem_handle, size_t offset_in_area); - void MapVulkanMemory(VAddr addr, size_t size); - - void UnmapVulkanMemory(VAddr addr, size_t size); - private: AddressSpace impl; DMemMap dmem_map; @@ -229,14 +218,7 @@ private: std::recursive_mutex mutex; size_t total_flexible_size = 448_MB; size_t flexible_usage{}; - - struct MappedMemory { - vk::UniqueBuffer buffer; - vk::UniqueDeviceMemory backing; - size_t buffer_size; - }; - std::map mapped_memories; - const Vulkan::Instance* instance{}; + Vulkan::Rasterizer* rasterizer{}; }; using Memory = Common::Singleton; diff --git a/src/core/module.cpp b/src/core/module.cpp index d885b917..775e1ef1 100644 --- a/src/core/module.cpp +++ b/src/core/module.cpp @@ -88,6 +88,7 @@ void Module::LoadModuleToMemory(u32& max_tls_index) { aligned_base_size + TrampolineSize, MemoryProt::CpuReadWrite, MemoryMapFlags::Fixed, VMAType::Code, name, true); LoadOffset += CODE_BASE_INCR * (1 + aligned_base_size / CODE_BASE_INCR); + LOG_INFO(Core_Linker, "Loading module {} to {}", name, fmt::ptr(*out_addr)); // Initialize trampoline generator. 
void* trampoline_addr = std::bit_cast(base_virtual_addr + aligned_base_size); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 02480303..e85272e9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -21,6 +21,7 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) { case VsOutput::ClipDist7: { const u32 index = u32(output) - u32(VsOutput::ClipDist0); const Id clip_num{ctx.ConstU32(index)}; + ASSERT_MSG(Sirit::ValidId(ctx.clip_distances), "Clip distance used but not defined"); return ctx.OpAccessChain(ctx.output_f32, ctx.clip_distances, clip_num); } case VsOutput::CullDist0: @@ -33,6 +34,7 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) { case VsOutput::CullDist7: { const u32 index = u32(output) - u32(VsOutput::CullDist0); const Id cull_num{ctx.ConstU32(index)}; + ASSERT_MSG(Sirit::ValidId(ctx.cull_distances), "Cull distance used but not defined"); return ctx.OpAccessChain(ctx.output_f32, ctx.cull_distances, cull_num); } default: @@ -125,7 +127,12 @@ Id EmitReadConst(EmitContext& ctx) { } Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { - const auto& buffer = ctx.buffers[handle]; + auto& buffer = ctx.buffers[handle]; + if (!Sirit::ValidId(buffer.offset)) { + buffer.offset = ctx.GetBufferOffset(buffer.global_binding); + } + const Id offset_dwords{ctx.OpShiftRightLogical(ctx.U32[1], buffer.offset, ctx.ConstU32(2U))}; + index = ctx.OpIAdd(ctx.U32[1], index, offset_dwords); const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; return ctx.OpLoad(buffer.data_types->Get(1), ptr); } @@ -137,7 +144,7 @@ Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index) { Id EmitReadStepRate(EmitContext& ctx, int rate_idx) { return ctx.OpLoad( ctx.U32[1], ctx.OpAccessChain(ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1]), - ctx.instance_step_rates, + ctx.push_data_block, rate_idx == 0 ? 
ctx.u32_zero_value : ctx.u32_one_value)); } @@ -221,7 +228,11 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { template static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) { - const auto& buffer = ctx.buffers[handle]; + auto& buffer = ctx.buffers[handle]; + if (!Sirit::ValidId(buffer.offset)) { + buffer.offset = ctx.GetBufferOffset(buffer.global_binding); + } + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; @@ -314,7 +325,7 @@ static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offs } static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) { - const auto& buffer = ctx.buffers[handle]; + auto& buffer = ctx.buffers[handle]; const auto format = buffer.buffer.GetDataFmt(); switch (format) { case AmdGpu::DataFormat::FormatInvalid: @@ -399,6 +410,11 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com template static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + auto& buffer = ctx.buffers[handle]; + if (!Sirit::ValidId(buffer.offset)) { + buffer.offset = ctx.GetBufferOffset(buffer.global_binding); + } + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); if constexpr (N == 1) { return GetBufferFormatValue(ctx, handle, address, 0); } else { @@ -428,7 +444,11 @@ Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id ad template static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { - const auto& buffer = ctx.buffers[handle]; + auto& buffer = ctx.buffers[handle]; + if (!Sirit::ValidId(buffer.offset)) { + buffer.offset = ctx.GetBufferOffset(buffer.global_binding); + } + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 8ca8b7a3..61b55437 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -46,9 +46,9 @@ EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& bin stage{program.info.stage}, binding{binding_} { AddCapability(spv::Capability::Shader); DefineArithmeticTypes(); - DefineInterfaces(program); - DefineBuffers(info); - DefineImagesAndSamplers(info); + DefineInterfaces(); + DefineBuffers(); + DefineImagesAndSamplers(); DefineSharedMemory(); } @@ -117,9 +117,10 @@ void EmitContext::DefineArithmeticTypes() { full_result_u32x2 = Name(TypeStruct(U32[1], U32[1]), "full_result_u32x2"); } -void EmitContext::DefineInterfaces(const IR::Program& program) { - DefineInputs(program.info); - DefineOutputs(program.info); +void EmitContext::DefineInterfaces() { + DefinePushDataBlock(); + DefineInputs(); + DefineOutputs(); } Id GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) { @@ -164,6 +165,16 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f throw InvalidArgument("Invalid attribute type {}", fmt); } +Id EmitContext::GetBufferOffset(u32 binding) { + const u32 half = 
Shader::PushData::BufOffsetIndex + (binding >> 4); + const u32 comp = (binding & 0xf) >> 2; + const u32 offset = (binding & 0x3) << 3; + const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), + push_data_block, ConstU32(half), ConstU32(comp))}; + const Id value{OpLoad(U32[1], ptr)}; + return OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); +} + Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { switch (default_value) { case 0: @@ -179,24 +190,13 @@ Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { } } -void EmitContext::DefineInputs(const Info& info) { +void EmitContext::DefineInputs() { switch (stage) { case Stage::Vertex: { vertex_index = DefineVariable(U32[1], spv::BuiltIn::VertexIndex, spv::StorageClass::Input); base_vertex = DefineVariable(U32[1], spv::BuiltIn::BaseVertex, spv::StorageClass::Input); instance_id = DefineVariable(U32[1], spv::BuiltIn::InstanceIndex, spv::StorageClass::Input); - // Create push constants block for instance steps rates - const Id struct_type{Name(TypeStruct(U32[1], U32[1]), "instance_step_rates")}; - Decorate(struct_type, spv::Decoration::Block); - MemberName(struct_type, 0, "sr0"); - MemberName(struct_type, 1, "sr1"); - MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U); - MemberDecorate(struct_type, 1, spv::Decoration::Offset, 4U); - instance_step_rates = DefineVar(struct_type, spv::StorageClass::PushConstant); - Name(instance_step_rates, "step_rates"); - interfaces.push_back(instance_step_rates); - for (const auto& input : info.vs_inputs) { const Id type{GetAttributeType(*this, input.fmt)}; if (input.instance_step_rate == Info::VsInput::InstanceIdType::OverStepRate0 || @@ -260,19 +260,20 @@ void EmitContext::DefineInputs(const Info& info) { } } -void EmitContext::DefineOutputs(const Info& info) { +void EmitContext::DefineOutputs() { switch (stage) { case Stage::Vertex: { output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); - const std::array zero{f32_zero_value, f32_zero_value, f32_zero_value, - f32_zero_value, f32_zero_value, f32_zero_value, - f32_zero_value, f32_zero_value}; - const Id type{TypeArray(F32[1], ConstU32(8U))}; - const Id initializer{ConstantComposite(type, zero)}; - clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output, - initializer); - cull_distances = DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output, - initializer); + const bool has_extra_pos_stores = info.stores.Get(IR::Attribute::Position1) || + info.stores.Get(IR::Attribute::Position2) || + info.stores.Get(IR::Attribute::Position3); + if (has_extra_pos_stores) { + const Id type{TypeArray(F32[1], ConstU32(8U))}; + clip_distances = + DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output); + cull_distances = + DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output); + } for (u32 i = 0; i < IR::NumParams; i++) { const IR::Attribute param{IR::Attribute::Param0 + i}; if (!info.stores.GetAny(param)) { @@ -304,7 +305,24 @@ void EmitContext::DefineOutputs(const Info& info) { } } -void EmitContext::DefineBuffers(const Info& info) { +void EmitContext::DefinePushDataBlock() { + // Create push constants block for instance steps rates + const Id struct_type{Name(TypeStruct(U32[1], U32[1], U32[4], U32[4]), "AuxData")}; + Decorate(struct_type, spv::Decoration::Block); + MemberName(struct_type, 0, "sr0"); + MemberName(struct_type, 1, "sr1"); + MemberName(struct_type, 2, "buf_offsets0"); 
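+    // Note: each uvec4 component packs four 8-bit buffer offsets (32 bindings in total
+    // across buf_offsets0/buf_offsets1); GetBufferOffset() above selects the component
+    // with (binding & 0xf) >> 2 and unpacks the byte with OpBitFieldUExtract.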
+ MemberName(struct_type, 3, "buf_offsets1"); + MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U); + MemberDecorate(struct_type, 1, spv::Decoration::Offset, 4U); + MemberDecorate(struct_type, 2, spv::Decoration::Offset, 8U); + MemberDecorate(struct_type, 3, spv::Decoration::Offset, 24U); + push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant); + Name(push_data_block, "push_data"); + interfaces.push_back(push_data_block); +} + +void EmitContext::DefineBuffers() { boost::container::small_vector type_ids; for (u32 i = 0; const auto& buffer : info.buffers) { const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32; @@ -322,8 +340,8 @@ void EmitContext::DefineBuffers(const Info& info) { Decorate(struct_type, spv::Decoration::Block); MemberName(struct_type, 0, "data"); MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U); + type_ids.push_back(record_array_type); } - type_ids.push_back(record_array_type); const auto storage_class = buffer.is_storage ? spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform; @@ -334,9 +352,9 @@ void EmitContext::DefineBuffers(const Info& info) { Decorate(id, spv::Decoration::DescriptorSet, 0U); Name(id, fmt::format("{}_{}", buffer.is_storage ? "ssbo" : "cbuf", buffer.sgpr_base)); - binding++; buffers.push_back({ .id = id, + .global_binding = binding++, .data_types = data_types, .pointer_type = pointer_type, .buffer = buffer.GetVsharp(info), @@ -430,7 +448,7 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { throw InvalidArgument("Invalid texture type {}", desc.type); } -void EmitContext::DefineImagesAndSamplers(const Info& info) { +void EmitContext::DefineImagesAndSamplers() { for (const auto& image_desc : info.images) { const VectorIds* data_types = [&] { switch (image_desc.nfmt) { diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 2aa1bf78..0d090eb3 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -40,6 +40,7 @@ public: ~EmitContext(); Id Def(const IR::Value& value); + Id GetBufferOffset(u32 binding); [[nodiscard]] Id DefineInput(Id type, u32 location) { const Id input_id{DefineVar(type, spv::StorageClass::Input)}; @@ -168,7 +169,7 @@ public: Id output_position{}; Id vertex_index{}; Id instance_id{}; - Id instance_step_rates{}; + Id push_data_block{}; Id base_vertex{}; Id frag_coord{}; Id front_facing{}; @@ -201,14 +202,16 @@ public: struct BufferDefinition { Id id; + Id offset; + u32 global_binding; const VectorIds* data_types; Id pointer_type; AmdGpu::Buffer buffer; }; u32& binding; - boost::container::small_vector buffers; - boost::container::small_vector images; + boost::container::small_vector buffers; + boost::container::small_vector images; boost::container::small_vector samplers; Id sampler_type{}; @@ -227,11 +230,12 @@ public: private: void DefineArithmeticTypes(); - void DefineInterfaces(const IR::Program& program); - void DefineInputs(const Info& info); - void DefineOutputs(const Info& info); - void DefineBuffers(const Info& info); - void DefineImagesAndSamplers(const Info& info); + void DefineInterfaces(); + void DefineInputs(); + void DefineOutputs(); + void DefinePushDataBlock(); + void DefineBuffers(); + void DefineImagesAndSamplers(); void DefineSharedMemory(); SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id); diff --git 
a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index e8c2a31c..b295c1be 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -447,6 +447,7 @@ void Translator::EmitFetch(const GcnInst& inst) { .is_instance_data = true, }); instance_buf_handle = s32(info.buffers.size() - 1); + info.uses_step_rates = true; } const u32 num_components = AmdGpu::NumComponents(buffer.GetDataFmt()); diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index c667968a..3c6dfbda 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -338,6 +338,11 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma if (is_typed) { info.dmft.Assign(static_cast(mtbuf.dfmt)); info.nfmt.Assign(static_cast(mtbuf.nfmt)); + ASSERT(info.nfmt == AmdGpu::NumberFormat::Float && + (info.dmft == AmdGpu::DataFormat::Format32_32_32_32 || + info.dmft == AmdGpu::DataFormat::Format32_32_32 || + info.dmft == AmdGpu::DataFormat::Format32_32 || + info.dmft == AmdGpu::DataFormat::Format32)); } const IR::Value handle = diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index eaca8ce8..169f6da0 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -173,10 +173,9 @@ bool IsImageStorageInstruction(const IR::Inst& inst) { class Descriptors { public: - explicit Descriptors(BufferResourceList& buffer_resources_, ImageResourceList& image_resources_, - SamplerResourceList& sampler_resources_) - : buffer_resources{buffer_resources_}, image_resources{image_resources_}, - sampler_resources{sampler_resources_} {} + explicit Descriptors(Info& info_) + : info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images}, + sampler_resources{info_.samplers} {} u32 Add(const BufferResource& desc) { const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) { @@ -188,6 +187,7 @@ public: ASSERT(buffer.length == desc.length); buffer.is_storage |= desc.is_storage; buffer.used_types |= desc.used_types; + buffer.is_written |= desc.is_written; return index; } @@ -201,9 +201,16 @@ public: } u32 Add(const SamplerResource& desc) { - const u32 index{Add(sampler_resources, desc, [&desc](const auto& existing) { - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) { + if (desc.sgpr_base == existing.sgpr_base && + desc.dword_offset == existing.dword_offset) { + return true; + } + // Samplers with different bindings might still be the same. 
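+            // Compare the raw S# descriptors; Sampler::operator== (added to
+            // amdgpu/resource.h in this patch) does a bytewise memcmp of the sharp.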
+ const auto old_sharp = + info.ReadUd(existing.sgpr_base, existing.dword_offset); + const auto new_sharp = info.ReadUd(desc.sgpr_base, desc.dword_offset); + return old_sharp == new_sharp; })}; return index; } @@ -219,6 +226,7 @@ private: return static_cast(descriptors.size()) - 1; } + const Info& info; BufferResourceList& buffer_resources; ImageResourceList& image_resources; SamplerResourceList& sampler_resources; @@ -328,16 +336,6 @@ static bool IsLoadBufferFormat(const IR::Inst& inst) { } } -static bool IsReadConstBuffer(const IR::Inst& inst) { - switch (inst.GetOpcode()) { - case IR::Opcode::ReadConstBuffer: - case IR::Opcode::ReadConstBufferU32: - return true; - default: - return false; - } -} - static u32 BufferLength(const AmdGpu::Buffer& buffer) { const auto stride = buffer.GetStride(); if (stride < sizeof(f32)) { @@ -401,43 +399,42 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, IR::Inst* handle = inst.Arg(0).InstRecursive(); IR::Inst* producer = handle->Arg(0).InstRecursive(); const auto sharp = TrackSharp(producer); + const bool is_store = IsBufferStore(inst); buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); binding = descriptors.Add(BufferResource{ .sgpr_base = sharp.sgpr_base, .dword_offset = sharp.dword_offset, .length = BufferLength(buffer), .used_types = BufferDataType(inst, buffer.GetNumberFmt()), - .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize, + .is_storage = true || is_store || buffer.GetSize() > MaxUboSize, + .is_written = is_store, }); } + // Update buffer descriptor format. const auto inst_info = inst.Flags(); - IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - // Replace handle with binding index in buffer resource list. - inst.SetArg(0, ir.Imm32(binding)); - ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable); + auto& buffer_desc = info.buffers[binding]; if (inst_info.is_typed) { - ASSERT(inst_info.nfmt == AmdGpu::NumberFormat::Float && - (inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32 || - inst_info.dmft == AmdGpu::DataFormat::Format32_32_32 || - inst_info.dmft == AmdGpu::DataFormat::Format32_32 || - inst_info.dmft == AmdGpu::DataFormat::Format32)); + buffer_desc.dfmt = inst_info.dmft; + buffer_desc.nfmt = inst_info.nfmt; + } else { + buffer_desc.dfmt = buffer.GetDataFmt(); + buffer_desc.nfmt = buffer.GetNumberFmt(); } - if (IsReadConstBuffer(inst)) { + // Replace handle with binding index in buffer resource list. + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + inst.SetArg(0, ir.Imm32(binding)); + ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable); + + // Address of constant buffer reads can be calculated at IR emittion time. + if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer || + inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) { return; } - if (IsLoadBufferFormat(inst)) { - if (UseFP16(buffer.GetDataFmt(), buffer.GetNumberFmt())) { - info.uses_fp16 = true; - } - } else { - const u32 stride = buffer.GetStride(); - ASSERT_MSG(stride >= 4, "non-formatting load_buffer_* is not implemented for stride {}", - stride); - } - + // Compute address of the buffer using the stride. + // Todo: What if buffer is rebound with different stride? IR::U32 address = ir.Imm32(inst_info.inst_offset.Value()); if (inst_info.index_enable) { const IR::U32 index = inst_info.offset_enable ? 
IR::U32{ir.CompositeExtract(inst.Arg(1), 0)} @@ -587,39 +584,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip } void ResourceTrackingPass(IR::Program& program) { - // When loading data from untyped buffer we don't have if it is float or integer. - // Most of the time it is float so that is the default. This pass detects float buffer loads - // combined with bitcasts and patches them to be integer loads. - for (IR::Block* const block : program.post_order_blocks) { - break; - for (IR::Inst& inst : block->Instructions()) { - if (inst.GetOpcode() != IR::Opcode::BitCastU32F32) { - continue; - } - // Replace the bitcast with a typed buffer read - IR::Inst* const arg_inst{inst.Arg(0).TryInstRecursive()}; - if (!arg_inst) { - continue; - } - const auto replace{[&](IR::Opcode new_opcode) { - inst.ReplaceOpcode(new_opcode); - inst.SetArg(0, arg_inst->Arg(0)); - inst.SetArg(1, arg_inst->Arg(1)); - inst.SetFlags(arg_inst->Flags()); - arg_inst->Invalidate(); - }}; - if (arg_inst->GetOpcode() == IR::Opcode::ReadConstBuffer) { - replace(IR::Opcode::ReadConstBufferU32); - } - if (arg_inst->GetOpcode() == IR::Opcode::LoadBufferF32) { - replace(IR::Opcode::LoadBufferU32); - } - } - } - // Iterate resource instructions and patch them after finding the sharp. auto& info = program.info; - Descriptors descriptors{info.buffers, info.images, info.samplers}; + Descriptors descriptors{info}; for (IR::Block* const block : program.blocks) { for (IR::Inst& inst : block->Instructions()) { if (IsBufferInstruction(inst)) { diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index d747c016..392ec772 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -61,7 +61,7 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool; +struct PushData { + static constexpr size_t BufOffsetIndex = 2; + + u32 step0; + u32 step1; + std::array buf_offsets; + + void AddOffset(u32 binding, u32 offset) { + ASSERT(offset < 64 && binding < 32); + buf_offsets[binding] = offset; + } +}; + struct Info { struct VsInput { enum InstanceIdType : u8 { @@ -182,6 +198,7 @@ struct Info { bool uses_shared_u8{}; bool uses_shared_u16{}; bool uses_fp16{}; + bool uses_step_rates{}; bool translation_failed{}; // indicates that shader has unsupported instructions template diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index 01271792..ef5bf1b6 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -363,6 +363,10 @@ struct Sampler { return raw0 != 0 || raw1 != 0; } + bool operator==(const Sampler& other) const noexcept { + return std::memcmp(this, &other, sizeof(Sampler)) == 0; + } + float LodBias() const noexcept { return static_cast(static_cast((lod_bias.Value() ^ 0x2000u) - 0x2000u)) / 256.0f; diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp new file mode 100644 index 00000000..e9498b35 --- /dev/null +++ b/src/video_core/buffer_cache/buffer.cpp @@ -0,0 +1,227 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/alignment.h" +#include "common/assert.h" +#include "video_core/buffer_cache/buffer.h" +#include "video_core/renderer_vulkan/liverpool_to_vk.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_platform.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" + +#include + +namespace VideoCore { + 
+constexpr vk::BufferUsageFlags AllFlags = + vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst | + vk::BufferUsageFlagBits::eUniformTexelBuffer | vk::BufferUsageFlagBits::eStorageTexelBuffer | + vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer; + +std::string_view BufferTypeName(MemoryUsage type) { + switch (type) { + case MemoryUsage::Upload: + return "Upload"; + case MemoryUsage::Download: + return "Download"; + case MemoryUsage::Stream: + return "Stream"; + case MemoryUsage::DeviceLocal: + return "DeviceLocal"; + default: + return "Invalid"; + } +} + +[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferredVmaFlags(MemoryUsage usage) { + return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT + : VkMemoryPropertyFlagBits{}; +} + +[[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::Upload: + case MemoryUsage::Stream: + return VMA_ALLOCATION_CREATE_MAPPED_BIT | + VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; + case MemoryUsage::Download: + return VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; + case MemoryUsage::DeviceLocal: + return {}; + } + return {}; +} + +[[nodiscard]] VmaMemoryUsage MemoryUsageVma(MemoryUsage usage) { + switch (usage) { + case MemoryUsage::DeviceLocal: + case MemoryUsage::Stream: + return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + case MemoryUsage::Upload: + case MemoryUsage::Download: + return VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + } + return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; +} + +UniqueBuffer::UniqueBuffer(vk::Device device_, VmaAllocator allocator_) + : device{device_}, allocator{allocator_} {} + +UniqueBuffer::~UniqueBuffer() { + if (buffer) { + vmaDestroyBuffer(allocator, buffer, allocation); + } +} + +void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage, + VmaAllocationInfo* out_alloc_info) { + const VmaAllocationCreateInfo alloc_ci = { + .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage), + .usage = MemoryUsageVma(usage), + .requiredFlags = 0, + .preferredFlags = MemoryUsagePreferredVmaFlags(usage), + .pool = VK_NULL_HANDLE, + .pUserData = nullptr, + }; + + const VkBufferCreateInfo buffer_ci_unsafe = static_cast(buffer_ci); + VkBuffer unsafe_buffer{}; + VkResult result = vmaCreateBuffer(allocator, &buffer_ci_unsafe, &alloc_ci, &unsafe_buffer, + &allocation, out_alloc_info); + ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}", + vk::to_string(vk::Result{result})); + buffer = vk::Buffer{unsafe_buffer}; +} + +Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_addr_, + u64 size_bytes_) + : cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, usage{usage_}, + buffer{instance->GetDevice(), instance->GetAllocator()} { + // Create buffer object. + const vk::BufferCreateInfo buffer_ci = { + .size = size_bytes, + .usage = AllFlags, + }; + VmaAllocationInfo alloc_info{}; + buffer.Create(buffer_ci, usage, &alloc_info); + + if (instance->HasDebuggingToolAttached()) { + const auto device = instance->GetDevice(); + Vulkan::SetObjectName(device, Handle(), "Buffer {:#x} {} KiB", cpu_addr, size_bytes / 1024); + } + + // Map it if it is host visible. 
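+    // VMA persistently maps host-visible allocations (MAPPED_BIT in MemoryUsageVmaFlags),
+    // so pMappedData stays valid for the buffer's lifetime; is_coherent later decides
+    // whether StreamBuffer::Commit() must flush/invalidate the range explicitly.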
+ VkMemoryPropertyFlags property_flags{}; + vmaGetAllocationMemoryProperties(instance->GetAllocator(), buffer.allocation, &property_flags); + if (alloc_info.pMappedData) { + mapped_data = std::span{std::bit_cast(alloc_info.pMappedData), size_bytes}; + } + is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; +} + +vk::BufferView Buffer::View(u32 offset, u32 size, AmdGpu::DataFormat dfmt, + AmdGpu::NumberFormat nfmt) { + const auto it{std::ranges::find_if(views, [offset, size, dfmt, nfmt](const BufferView& view) { + return offset == view.offset && size == view.size && dfmt == view.dfmt && nfmt == view.nfmt; + })}; + if (it != views.end()) { + return it->handle; + } + views.push_back({ + .offset = offset, + .size = size, + .dfmt = dfmt, + .nfmt = nfmt, + .handle = instance->GetDevice().createBufferView({ + .buffer = buffer.buffer, + .format = Vulkan::LiverpoolToVK::SurfaceFormat(dfmt, nfmt), + .offset = offset, + .range = size, + }), + }); + return views.back().handle; +} + +constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; +constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; + +StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler_, + MemoryUsage usage, u64 size_bytes) + : Buffer{instance, usage, 0, size_bytes}, scheduler{scheduler_} { + ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); + ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); + const auto device = instance.GetDevice(); + if (instance.HasDebuggingToolAttached()) { + Vulkan::SetObjectName(device, Handle(), "StreamBuffer({}): {} KiB", BufferTypeName(usage), + size_bytes / 1024); + } +} + +std::pair StreamBuffer::Map(u64 size, u64 alignment) { + if (!is_coherent && usage == MemoryUsage::Stream) { + size = Common::AlignUp(size, instance->NonCoherentAtomSize()); + } + + ASSERT(size <= this->size_bytes); + mapped_size = size; + + if (alignment > 0) { + offset = Common::AlignUp(offset, alignment); + } + + if (offset + size > this->size_bytes) { + // The buffer would overflow, save the amount of used watches and reset the state. + invalidation_mark = current_watch_cursor; + current_watch_cursor = 0; + offset = 0; + + // Swap watches and reset waiting cursors. + std::swap(previous_watches, current_watches); + wait_cursor = 0; + wait_bound = 0; + } + + const u64 mapped_upper_bound = offset + size; + WaitPendingOperations(mapped_upper_bound); + return std::make_pair(mapped_data.data() + offset, offset); +} + +void StreamBuffer::Commit() { + if (!is_coherent) { + if (usage == MemoryUsage::Download) { + vmaInvalidateAllocation(instance->GetAllocator(), buffer.allocation, offset, + mapped_size); + } else { + vmaFlushAllocation(instance->GetAllocator(), buffer.allocation, offset, mapped_size); + } + } + + offset += mapped_size; + if (current_watch_cursor + 1 >= current_watches.size()) { + // Ensure that there are enough watches. 
+ ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK); + } + + auto& watch = current_watches[current_watch_cursor++]; + watch.upper_bound = offset; + watch.tick = scheduler.CurrentTick(); +} + +void StreamBuffer::ReserveWatches(std::vector& watches, std::size_t grow_size) { + watches.resize(watches.size() + grow_size); +} + +void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) { + if (!invalidation_mark) { + return; + } + while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) { + auto& watch = previous_watches[wait_cursor]; + wait_bound = watch.upper_bound; + scheduler.Wait(watch.tick); + ++wait_cursor; + } +} + +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h new file mode 100644 index 00000000..e0d9da08 --- /dev/null +++ b/src/video_core/buffer_cache/buffer.h @@ -0,0 +1,173 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include "common/types.h" +#include "video_core/amdgpu/resource.h" +#include "video_core/renderer_vulkan/vk_common.h" + +namespace Vulkan { +class Instance; +class Scheduler; +} // namespace Vulkan + +VK_DEFINE_HANDLE(VmaAllocation) +VK_DEFINE_HANDLE(VmaAllocator) + +struct VmaAllocationInfo; + +namespace VideoCore { + +/// Hints and requirements for the backing memory type of a commit +enum class MemoryUsage { + DeviceLocal, ///< Requests device local buffer. + Upload, ///< Requires a host visible memory type optimized for CPU to GPU uploads + Download, ///< Requires a host visible memory type optimized for GPU to CPU readbacks + Stream, ///< Requests device local host visible buffer, falling back host memory. 
+}; + +struct UniqueBuffer { + explicit UniqueBuffer(vk::Device device, VmaAllocator allocator); + ~UniqueBuffer(); + + UniqueBuffer(const UniqueBuffer&) = delete; + UniqueBuffer& operator=(const UniqueBuffer&) = delete; + + UniqueBuffer(UniqueBuffer&& other) + : buffer{std::exchange(other.buffer, VK_NULL_HANDLE)}, + allocator{std::exchange(other.allocator, VK_NULL_HANDLE)}, + allocation{std::exchange(other.allocation, VK_NULL_HANDLE)} {} + UniqueBuffer& operator=(UniqueBuffer&& other) { + buffer = std::exchange(other.buffer, VK_NULL_HANDLE); + allocator = std::exchange(other.allocator, VK_NULL_HANDLE); + allocation = std::exchange(other.allocation, VK_NULL_HANDLE); + return *this; + } + + void Create(const vk::BufferCreateInfo& image_ci, MemoryUsage usage, + VmaAllocationInfo* out_alloc_info); + + operator vk::Buffer() const { + return buffer; + } + + vk::Device device; + VmaAllocator allocator; + VmaAllocation allocation; + vk::Buffer buffer{}; +}; + +class Buffer { +public: + explicit Buffer(const Vulkan::Instance& instance, MemoryUsage usage, VAddr cpu_addr_, + u64 size_bytes_); + + Buffer& operator=(const Buffer&) = delete; + Buffer(const Buffer&) = delete; + + Buffer& operator=(Buffer&&) = default; + Buffer(Buffer&&) = default; + + vk::BufferView View(u32 offset, u32 size, AmdGpu::DataFormat dfmt, AmdGpu::NumberFormat nfmt); + + /// Increases the likeliness of this being a stream buffer + void IncreaseStreamScore(int score) noexcept { + stream_score += score; + } + + /// Returns the likeliness of this being a stream buffer + [[nodiscard]] int StreamScore() const noexcept { + return stream_score; + } + + /// Returns true when vaddr -> vaddr+size is fully contained in the buffer + [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept { + return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes(); + } + + /// Returns the base CPU address of the buffer + [[nodiscard]] VAddr CpuAddr() const noexcept { + return cpu_addr; + } + + /// Returns the offset relative to the given CPU address + [[nodiscard]] u32 Offset(VAddr other_cpu_addr) const noexcept { + return static_cast(other_cpu_addr - cpu_addr); + } + + size_t SizeBytes() const { + return size_bytes; + } + + vk::Buffer Handle() const noexcept { + return buffer; + } + +public: + VAddr cpu_addr = 0; + bool is_picked{}; + bool is_coherent{}; + int stream_score = 0; + size_t size_bytes = 0; + std::span mapped_data; + const Vulkan::Instance* instance{}; + MemoryUsage usage; + UniqueBuffer buffer; + struct BufferView { + u32 offset; + u32 size; + AmdGpu::DataFormat dfmt; + AmdGpu::NumberFormat nfmt; + vk::BufferView handle; + }; + std::vector views; +}; + +class StreamBuffer : public Buffer { +public: + explicit StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, + MemoryUsage usage, u64 size_bytes_); + + /// Reserves a region of memory from the stream buffer. + std::pair Map(u64 size, u64 alignment = 0); + + /// Ensures that reserved bytes of memory are available to the GPU. + void Commit(); + + /// Maps and commits a memory region with user provided data + u64 Copy(VAddr src, size_t size, size_t alignment = 0) { + const auto [data, offset] = Map(size, alignment); + std::memcpy(data, reinterpret_cast(src), size); + Commit(); + return offset; + } + +private: + struct Watch { + u64 tick{}; + u64 upper_bound{}; + }; + + /// Increases the amount of watches available. + void ReserveWatches(std::vector& watches, std::size_t grow_size); + + /// Waits pending watches until requested upper bound. 
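+    /// Each watch pairs a buffer offset with the scheduler tick recorded at Commit() time,
+    /// so reusing a wrapped-around region blocks until the GPU has caught up.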
+ void WaitPendingOperations(u64 requested_upper_bound); + +private: + Vulkan::Scheduler& scheduler; + u64 offset{}; + u64 mapped_size{}; + std::vector current_watches; + std::size_t current_watch_cursor{}; + std::optional invalidation_mark; + std::vector previous_watches; + std::size_t wait_cursor{}; + u64 wait_bound{}; +}; + +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp new file mode 100644 index 00000000..a89107ba --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -0,0 +1,497 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later +#pragma clang optimize off +#include +#include "common/alignment.h" +#include "common/scope_exit.h" +#include "shader_recompiler/runtime_info.h" +#include "video_core/amdgpu/liverpool.h" +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_vulkan/liverpool_to_vk.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" + +namespace VideoCore { + +static constexpr size_t StagingBufferSize = 256_MB; +static constexpr size_t UboStreamBufferSize = 64_MB; + +BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, + const AmdGpu::Liverpool* liverpool_, PageManager& tracker_) + : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, tracker{tracker_}, + staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize}, + stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize}, + memory_tracker{&tracker} { + // Ensure the first slot is used for the null buffer + void(slot_buffers.insert(instance, MemoryUsage::DeviceLocal, 0, 1)); +} + +BufferCache::~BufferCache() = default; + +void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { + std::scoped_lock lk{mutex}; + const bool is_tracked = IsRegionRegistered(device_addr, size); + if (!is_tracked) { + return; + } + // Mark the page as CPU modified to stop tracking writes. + SCOPE_EXIT { + memory_tracker.MarkRegionAsCpuModified(device_addr, size); + }; + if (!memory_tracker.IsRegionGpuModified(device_addr, size)) { + // Page has not been modified by the GPU, nothing to do. 
+ return; + } +} + +void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + memory_tracker.ForEachDownloadRange( + device_addr, size, [&](u64 device_addr_out, u64 range_size) { + const VAddr buffer_addr = buffer.CpuAddr(); + const auto add_download = [&](VAddr start, VAddr end, u64) { + const u64 new_offset = start - buffer_addr; + const u64 new_size = end - start; + copies.push_back(vk::BufferCopy{ + .srcOffset = new_offset, + .dstOffset = total_size_bytes, + .size = new_size, + }); + // Align up to avoid cache conflicts + constexpr u64 align = 64ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (new_size + align - 1) & mask; + largest_copy = std::max(largest_copy, new_size); + }; + }); + if (total_size_bytes == 0) { + return; + } + const auto [staging, offset] = staging_buffer.Map(total_size_bytes); + for (auto& copy : copies) { + // Modify copies to have the staging offset in mind + copy.dstOffset += offset; + } + staging_buffer.Commit(); + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.copyBuffer(buffer.buffer, staging_buffer.Handle(), copies); + scheduler.Finish(); + for (const auto& copy : copies) { + const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset; + const u64 dst_offset = copy.dstOffset - offset; + std::memcpy(std::bit_cast(copy_device_addr), staging + dst_offset, copy.size); + } +} + +bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { + if (vs_info.vs_inputs.empty()) { + return false; + } + + std::array host_buffers; + std::array host_offsets; + boost::container::static_vector guest_buffers; + + struct BufferRange { + VAddr base_address; + VAddr end_address; + vk::Buffer vk_buffer; + u64 offset; + + size_t GetSize() const { + return end_address - base_address; + } + }; + + // Calculate buffers memory overlaps + bool has_step_rate = false; + boost::container::static_vector ranges{}; + for (const auto& input : vs_info.vs_inputs) { + if (input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate0 || + input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate1) { + has_step_rate = true; + continue; + } + + const auto& buffer = vs_info.ReadUd(input.sgpr_base, input.dword_offset); + if (buffer.GetSize() == 0) { + continue; + } + guest_buffers.emplace_back(buffer); + ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize()); + } + + std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) { + return lhv.base_address < rhv.base_address; + }); + + boost::container::static_vector ranges_merged{ranges[0]}; + for (auto range : ranges) { + auto& prev_range = ranges_merged.back(); + if (prev_range.end_address < range.base_address) { + ranges_merged.emplace_back(range); + } else { + prev_range.end_address = std::max(prev_range.end_address, range.end_address); + } + } + + // Map buffers + for (auto& range : ranges_merged) { + const auto [buffer, offset] = ObtainBuffer(range.base_address, range.GetSize(), false); + range.vk_buffer = buffer->buffer; + range.offset = offset; + } + + // Bind vertex buffers + const size_t num_buffers = guest_buffers.size(); + for (u32 i = 0; i < num_buffers; ++i) { + const auto& buffer = guest_buffers[i]; + const auto host_buffer = std::ranges::find_if(ranges_merged, [&](const BufferRange& range) { + return (buffer.base_address >= range.base_address && + buffer.base_address < 
range.end_address); + }); + ASSERT(host_buffer != ranges_merged.cend()); + + host_buffers[i] = host_buffer->vk_buffer; + host_offsets[i] = host_buffer->offset + buffer.base_address - host_buffer->base_address; + } + + if (num_buffers > 0) { + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.bindVertexBuffers(0, num_buffers, host_buffers.data(), host_offsets.data()); + } + + return has_step_rate; +} + +u32 BufferCache::BindIndexBuffer(bool& is_indexed, u32 index_offset) { + // Emulate QuadList primitive type with CPU made index buffer. + const auto& regs = liverpool->regs; + if (regs.primitive_type == AmdGpu::Liverpool::PrimitiveType::QuadList) { + is_indexed = true; + + // Emit indices. + const u32 index_size = 3 * regs.num_indices; + const auto [data, offset] = stream_buffer.Map(index_size); + Vulkan::LiverpoolToVK::EmitQuadToTriangleListIndices(data, regs.num_indices); + stream_buffer.Commit(); + + // Bind index buffer. + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.bindIndexBuffer(stream_buffer.Handle(), offset, vk::IndexType::eUint16); + return index_size / sizeof(u16); + } + if (!is_indexed) { + return regs.num_indices; + } + + // Figure out index type and size. + const bool is_index16 = + regs.index_buffer_type.index_type == AmdGpu::Liverpool::IndexType::Index16; + const vk::IndexType index_type = is_index16 ? vk::IndexType::eUint16 : vk::IndexType::eUint32; + const u32 index_size = is_index16 ? sizeof(u16) : sizeof(u32); + VAddr index_address = regs.index_base_address.Address(); + index_address += index_offset * index_size; + + // Bind index buffer. + const u32 index_buffer_size = regs.num_indices * index_size; + const auto [vk_buffer, offset] = ObtainBuffer(index_address, index_buffer_size, false); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.bindIndexBuffer(vk_buffer->Handle(), offset, index_type); + return regs.num_indices; +} + +std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written) { + std::scoped_lock lk{mutex}; + static constexpr u64 StreamThreshold = CACHING_PAGESIZE; + const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size); + if (!is_written && size < StreamThreshold && !is_gpu_dirty) { + // For small uniform buffers that have not been modified by gpu + // use device local stream buffer to reduce renderpass breaks. 
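+        // StreamThreshold equals one caching page, so only sub-page ranges that are not
+        // written and have not been modified by the GPU take this path; everything else
+        // falls through to a cached buffer below.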
+ const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment()); + return {&stream_buffer, offset}; + } + + const BufferId buffer_id = FindBuffer(device_addr, size); + Buffer& buffer = slot_buffers[buffer_id]; + SynchronizeBuffer(buffer, device_addr, size); + if (is_written) { + memory_tracker.MarkRegionAsGpuModified(device_addr, size); + } + return {&buffer, buffer.Offset(device_addr)}; +} + +bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) { + const VAddr end_addr = addr + size; + const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); + for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; + continue; + } + Buffer& buffer = slot_buffers[buffer_id]; + const VAddr buf_start_addr = buffer.CpuAddr(); + const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes(); + if (buf_start_addr < end_addr && addr < buf_end_addr) { + return true; + } + page = Common::DivCeil(end_addr, CACHING_PAGESIZE); + } + return false; +} + +bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) { + return memory_tracker.IsRegionCpuModified(addr, size); +} + +BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) { + if (device_addr == 0) { + return NULL_BUFFER_ID; + } + const u64 page = device_addr >> CACHING_PAGEBITS; + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + return CreateBuffer(device_addr, size); + } + const Buffer& buffer = slot_buffers[buffer_id]; + if (buffer.IsInBounds(device_addr, size)) { + return buffer_id; + } + return CreateBuffer(device_addr, size); +} + +BufferCache::OverlapResult BufferCache::ResolveOverlaps(VAddr device_addr, u32 wanted_size) { + static constexpr int STREAM_LEAP_THRESHOLD = 16; + boost::container::small_vector overlap_ids; + VAddr begin = device_addr; + VAddr end = device_addr + wanted_size; + int stream_score = 0; + bool has_stream_leap = false; + const auto expand_begin = [&](VAddr add_value) { + static constexpr VAddr min_page = CACHING_PAGESIZE + DEVICE_PAGESIZE; + if (add_value > begin - min_page) { + begin = min_page; + device_addr = DEVICE_PAGESIZE; + return; + } + begin -= add_value; + device_addr = begin - CACHING_PAGESIZE; + }; + const auto expand_end = [&](VAddr add_value) { + static constexpr VAddr max_page = 1ULL << MemoryTracker::MAX_CPU_PAGE_BITS; + if (add_value > max_page - end) { + end = max_page; + return; + } + end += add_value; + }; + if (begin == 0) { + return OverlapResult{ + .ids = std::move(overlap_ids), + .begin = begin, + .end = end, + .has_stream_leap = has_stream_leap, + }; + } + for (; device_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE); + device_addr += CACHING_PAGESIZE) { + const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS]; + if (!overlap_id) { + continue; + } + Buffer& overlap = slot_buffers[overlap_id]; + if (overlap.is_picked) { + continue; + } + overlap_ids.push_back(overlap_id); + overlap.is_picked = true; + const VAddr overlap_device_addr = overlap.CpuAddr(); + const bool expands_left = overlap_device_addr < begin; + if (expands_left) { + begin = overlap_device_addr; + } + const VAddr overlap_end = overlap_device_addr + overlap.SizeBytes(); + const bool expands_right = overlap_end > end; + if (overlap_end > end) { + end = overlap_end; + } + stream_score += overlap.StreamScore(); + if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) { + // When this memory region has been joined a bunch of times, we assume it's being 
used + // as a stream buffer. Increase the size to skip constantly recreating buffers. + has_stream_leap = true; + if (expands_right) { + expand_begin(CACHING_PAGESIZE * 128); + } + if (expands_left) { + expand_end(CACHING_PAGESIZE * 128); + } + } + } + return OverlapResult{ + .ids = std::move(overlap_ids), + .begin = begin, + .end = end, + .has_stream_leap = has_stream_leap, + }; +} + +void BufferCache::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, + bool accumulate_stream_score) { + Buffer& new_buffer = slot_buffers[new_buffer_id]; + Buffer& overlap = slot_buffers[overlap_id]; + if (accumulate_stream_score) { + new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); + } + const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); + const vk::BufferCopy copy = { + .srcOffset = 0, + .dstOffset = dst_base_offset, + .size = overlap.SizeBytes(), + }; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + static constexpr vk::MemoryBarrier READ_BARRIER{ + .srcAccessMask = vk::AccessFlagBits::eMemoryWrite, + .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite, + }; + static constexpr vk::MemoryBarrier WRITE_BARRIER{ + .srcAccessMask = vk::AccessFlagBits::eTransferWrite, + .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, + }; + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, + vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion, + READ_BARRIER, {}, {}); + cmdbuf.copyBuffer(overlap.buffer, new_buffer.buffer, copy); + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eAllCommands, + vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {}); + DeleteBuffer(overlap_id, true); +} + +BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) { + const VAddr device_addr_end = Common::AlignUp(device_addr + wanted_size, CACHING_PAGESIZE); + device_addr = Common::AlignDown(device_addr, CACHING_PAGESIZE); + wanted_size = static_cast(device_addr_end - device_addr); + const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size); + const u32 size = static_cast(overlap.end - overlap.begin); + const BufferId new_buffer_id = + slot_buffers.insert(instance, MemoryUsage::DeviceLocal, overlap.begin, size); + auto& new_buffer = slot_buffers[new_buffer_id]; + const size_t size_bytes = new_buffer.SizeBytes(); + const auto cmdbuf = scheduler.CommandBuffer(); + scheduler.EndRendering(); + cmdbuf.fillBuffer(new_buffer.buffer, 0, size_bytes, 0); + for (const BufferId overlap_id : overlap.ids) { + JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); + } + Register(new_buffer_id); + return new_buffer_id; +} + +void BufferCache::Register(BufferId buffer_id) { + ChangeRegister(buffer_id); +} + +void BufferCache::Unregister(BufferId buffer_id) { + ChangeRegister(buffer_id); +} + +template +void BufferCache::ChangeRegister(BufferId buffer_id) { + Buffer& buffer = slot_buffers[buffer_id]; + const auto size = buffer.SizeBytes(); + const VAddr device_addr_begin = buffer.CpuAddr(); + const VAddr device_addr_end = device_addr_begin + size; + const u64 page_begin = device_addr_begin / CACHING_PAGESIZE; + const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE); + for (u64 page = page_begin; page != page_end; ++page) { + if constexpr (insert) { + page_table[page] = buffer_id; + } else { + page_table[page] = BufferId{}; + } + } +} + +bool BufferCache::SynchronizeBuffer(Buffer& buffer, 
VAddr device_addr, u32 size) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + VAddr buffer_start = buffer.CpuAddr(); + const auto add_copy = [&](VAddr device_addr_out, u64 range_size) { + copies.push_back(vk::BufferCopy{ + .srcOffset = total_size_bytes, + .dstOffset = device_addr_out - buffer_start, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }; + memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) { + add_copy(device_addr_out, range_size); + // Prevent uploading to gpu modified regions. + // gpu_modified_ranges.ForEachNotInRange(device_addr_out, range_size, add_copy); + }); + if (total_size_bytes == 0) { + return true; + } + vk::Buffer src_buffer = staging_buffer.Handle(); + if (total_size_bytes < StagingBufferSize) { + const auto [staging, offset] = staging_buffer.Map(total_size_bytes); + for (auto& copy : copies) { + u8* const src_pointer = staging + copy.srcOffset; + const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset; + std::memcpy(src_pointer, std::bit_cast(device_addr), copy.size); + // Apply the staging offset + copy.srcOffset += offset; + } + staging_buffer.Commit(); + } else { + // For large one time transfers use a temporary host buffer. + // RenderDoc can lag quite a bit if the stream buffer is too large. + Buffer temp_buffer{instance, MemoryUsage::Upload, 0, total_size_bytes}; + src_buffer = temp_buffer.Handle(); + u8* const staging = temp_buffer.mapped_data.data(); + for (auto& copy : copies) { + u8* const src_pointer = staging + copy.srcOffset; + const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset; + std::memcpy(src_pointer, std::bit_cast(device_addr), copy.size); + } + scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {}); + } + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + static constexpr vk::MemoryBarrier READ_BARRIER{ + .srcAccessMask = vk::AccessFlagBits::eMemoryWrite, + .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite, + }; + static constexpr vk::MemoryBarrier WRITE_BARRIER{ + .srcAccessMask = vk::AccessFlagBits::eTransferWrite, + .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, + }; + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, + vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion, + READ_BARRIER, {}, {}); + cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies); + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eAllCommands, + vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {}); + return false; +} + +void BufferCache::DeleteBuffer(BufferId buffer_id, bool do_not_mark) { + // Mark the whole buffer as CPU written to stop tracking CPU writes + if (!do_not_mark) { + Buffer& buffer = slot_buffers[buffer_id]; + memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + } + Unregister(buffer_id); + scheduler.DeferOperation([this, buffer_id] { slot_buffers.erase(buffer_id); }); +} + +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h new file mode 100644 index 00000000..1a99b2b3 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -0,0 +1,121 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include 
+#include +#include +#include +#include +#include "common/div_ceil.h" +#include "common/slot_vector.h" +#include "common/types.h" +#include "video_core/buffer_cache/buffer.h" +#include "video_core/buffer_cache/memory_tracker_base.h" +#include "video_core/buffer_cache/range_set.h" + +namespace AmdGpu { +struct Liverpool; +} + +namespace Shader { +struct Info; +} + +namespace VideoCore { + +using BufferId = Common::SlotId; + +static constexpr BufferId NULL_BUFFER_ID{0}; + +static constexpr u32 NUM_VERTEX_BUFFERS = 32; + +class BufferCache { +public: + static constexpr u32 CACHING_PAGEBITS = 12; + static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS; + static constexpr u64 DEVICE_PAGESIZE = 4_KB; + + struct OverlapResult { + boost::container::small_vector ids; + VAddr begin; + VAddr end; + bool has_stream_leap = false; + }; + +public: + explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, + const AmdGpu::Liverpool* liverpool, PageManager& tracker); + ~BufferCache(); + + /// Invalidates any buffer in the logical page range. + void InvalidateMemory(VAddr device_addr, u64 size); + + /// Binds host vertex buffers for the current draw. + bool BindVertexBuffers(const Shader::Info& vs_info); + + /// Bind host index buffer for the current draw. + u32 BindIndexBuffer(bool& is_indexed, u32 index_offset); + + /// Obtains a buffer for the specified region. + [[nodiscard]] std::pair ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written); + + /// Return true when a region is registered on the cache + [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); + + /// Return true when a CPU region is modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + +private: + template + void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) { + const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE); + for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; + continue; + } + Buffer& buffer = slot_buffers[buffer_id]; + func(buffer_id, buffer); + + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, CACHING_PAGESIZE); + } + } + + void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size); + + [[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size); + + [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size); + + void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); + + [[nodiscard]] BufferId CreateBuffer(VAddr device_addr, u32 wanted_size); + + void Register(BufferId buffer_id); + + void Unregister(BufferId buffer_id); + + template + void ChangeRegister(BufferId buffer_id); + + bool SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size); + + void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false); + + const Vulkan::Instance& instance; + Vulkan::Scheduler& scheduler; + const AmdGpu::Liverpool* liverpool; + PageManager& tracker; + StreamBuffer staging_buffer; + StreamBuffer stream_buffer; + std::recursive_mutex mutex; + Common::SlotVector slot_buffers; + MemoryTracker memory_tracker; + std::array> CACHING_PAGEBITS)> page_table; +}; + +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h new file mode 100644 index 00000000..62439447 --- /dev/null +++ 
b/src/video_core/buffer_cache/memory_tracker_base.h @@ -0,0 +1,175 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include +#include "common/types.h" +#include "video_core/buffer_cache/word_manager.h" + +namespace VideoCore { + +class MemoryTracker { +public: + static constexpr size_t MAX_CPU_PAGE_BITS = 39; + static constexpr size_t HIGHER_PAGE_BITS = 22; + static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; + static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; + static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS); + static constexpr size_t MANAGER_POOL_SIZE = 32; + static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD; + using Manager = WordManager; + +public: + explicit MemoryTracker(PageManager* tracker_) : tracker{tracker_} {} + ~MemoryTracker() = default; + + /// Returns true if a region has been modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified(offset, size); + }); + } + + /// Returns true if a region has been modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified(offset, size); + }); + } + + /// Mark region as CPU modified, notifying the device_tracker about this change + void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as CPU modified, notifying the device_tracker about this change + void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the host GPU + void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as modified from the host GPU + void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Call 'func' for each CPU modified range and unmark those pages as CPU modified + template + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); + }); + } + + /// Call 'func' for each GPU modified range and unmark those pages as GPU modified + template + void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages(query_cpu_range, query_size, + 
[&func](Manager* manager, u64 offset, size_t size) { + if constexpr (clear) { + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); + } else { + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); + } + }); + } + +private: + /** + * @brief IteratePages Iterates L2 word manager page table. + * @param cpu_address Start byte cpu address + * @param size Size in bytes of the region of iterate. + * @param func Callback for each word manager. + * @return + */ + template + bool IteratePages(VAddr cpu_address, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result::type; + static constexpr bool BOOL_BREAK = std::is_same_v; + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + if (manager) { + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + return false; + } + + void CreateRegion(std::size_t page_index) { + const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS; + if (free_managers.empty()) { + manager_pool.emplace_back(); + auto& last_pool = manager_pool.back(); + for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) { + std::construct_at(&last_pool[i], tracker, 0, HIGHER_PAGE_SIZE); + free_managers.push_back(&last_pool[i]); + } + } + // Each manager tracks a 4_MB virtual address space. 
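+        // Pop a pre-constructed manager from the free list and rebase it onto this
+        // high page; pooling the managers avoids a heap allocation per tracked region.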
+ auto* new_manager = free_managers.back(); + new_manager->SetCpuAddress(base_cpu_addr); + free_managers.pop_back(); + top_tier[page_index] = new_manager; + } + + PageManager* tracker; + std::deque> manager_pool; + std::vector free_managers; + std::array top_tier{}; +}; + +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/range_set.cpp b/src/video_core/buffer_cache/range_set.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/video_core/buffer_cache/range_set.h b/src/video_core/buffer_cache/range_set.h new file mode 100644 index 00000000..fe54aff8 --- /dev/null +++ b/src/video_core/buffer_cache/range_set.h @@ -0,0 +1,159 @@ +// SPDX-FileCopyrightText: 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include +#include "common/types.h" + +namespace VideoCore { + +template +using RangeSetsAllocator = + boost::fast_pool_allocator; + +struct RangeSet { + using IntervalSet = + boost::icl::interval_set; + using IntervalType = typename IntervalSet::interval_type; + + explicit RangeSet() = default; + ~RangeSet() = default; + + void Add(VAddr base_address, size_t size) { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + m_ranges_set.add(interval); + } + + void Subtract(VAddr base_address, size_t size) { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + m_ranges_set.subtract(interval); + } + + template + void ForEach(Func&& func) const { + if (m_ranges_set.empty()) { + return; + } + auto it = m_ranges_set.begin(); + auto end_it = m_ranges_set.end(); + for (; it != end_it; it++) { + const VAddr inter_addr_end = it->upper(); + const VAddr inter_addr = it->lower(); + func(inter_addr, inter_addr_end); + } + } + + template + void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const { + if (m_ranges_set.empty()) { + return; + } + const VAddr start_address = base_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = m_ranges_set.lower_bound(search_interval); + if (it == m_ranges_set.end()) { + return; + } + auto end_it = m_ranges_set.upper_bound(search_interval); + for (; it != end_it; it++) { + VAddr inter_addr_end = it->upper(); + VAddr inter_addr = it->lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } + } + + IntervalSet m_ranges_set; +}; + +class RangeMap { +public: + using IntervalMap = + boost::icl::interval_map; + using IntervalType = typename IntervalMap::interval_type; + +public: + RangeMap() = default; + ~RangeMap() = default; + + RangeMap(RangeMap const&) = delete; + RangeMap& operator=(RangeMap const&) = delete; + + RangeMap(RangeMap&& other); + RangeMap& operator=(RangeMap&& other); + + void Add(VAddr base_address, size_t size, u64 value) { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + m_ranges_map.add({interval, value}); + } + + void Subtract(VAddr base_address, size_t size) { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + m_ranges_map -= interval; + } + + template + void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const { + if (m_ranges_map.empty()) { + return; + } + const VAddr start_address = base_addr; + const VAddr 
end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = m_ranges_map.lower_bound(search_interval); + if (it == m_ranges_map.end()) { + return; + } + auto end_it = m_ranges_map.upper_bound(search_interval); + for (; it != end_it; it++) { + VAddr inter_addr_end = it->first.upper(); + VAddr inter_addr = it->first.lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end, it->second); + } + } + + template + void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const { + const VAddr end_addr = base_addr + size; + ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, u64) { + if (size_t gap_size = range_addr - base_addr; gap_size != 0) { + func(base_addr, gap_size); + } + base_addr = range_end; + }); + if (base_addr != end_addr) { + func(base_addr, end_addr - base_addr); + } + } + +private: + IntervalMap m_ranges_map; +}; + +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h new file mode 100644 index 00000000..e7aaf207 --- /dev/null +++ b/src/video_core/buffer_cache/word_manager.h @@ -0,0 +1,398 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include "common/div_ceil.h" +#include "common/types.h" +#include "video_core/page_manager.h" + +namespace VideoCore { + +constexpr u64 PAGES_PER_WORD = 64; +constexpr u64 BYTES_PER_PAGE = 4_KB; +constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; + +enum class Type { + CPU, + GPU, + Untracked, +}; + +/// Vector tracking modified pages tightly packed with small vector optimization +template +struct WordsArray { + /// Returns the pointer to the words state + [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { + return is_short ? stack.data() : heap; + } + + /// Returns the pointer to the words state + [[nodiscard]] u64* Pointer(bool is_short) noexcept { + return is_short ? 
stack.data() : heap; + } + + std::array stack{}; ///< Small buffers storage + u64* heap; ///< Not-small buffers pointer to the storage +}; + +template +struct Words { + explicit Words() = default; + explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { + num_words = Common::DivCeil(size_bytes, BYTES_PER_WORD); + if (IsShort()) { + cpu.stack.fill(~u64{0}); + gpu.stack.fill(0); + untracked.stack.fill(~u64{0}); + } else { + // Share allocation between CPU and GPU pages and set their default values + u64* const alloc = new u64[num_words * 3]; + cpu.heap = alloc; + gpu.heap = alloc + num_words; + untracked.heap = alloc + num_words * 2; + std::fill_n(cpu.heap, num_words, ~u64{0}); + std::fill_n(gpu.heap, num_words, 0); + std::fill_n(untracked.heap, num_words, ~u64{0}); + } + // Clean up tailing bits + const u64 last_word_size = size_bytes % BYTES_PER_WORD; + const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); + const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; + const u64 last_word = (~u64{0} << shift) >> shift; + cpu.Pointer(IsShort())[NumWords() - 1] = last_word; + untracked.Pointer(IsShort())[NumWords() - 1] = last_word; + } + + ~Words() { + Release(); + } + + Words& operator=(Words&& rhs) noexcept { + Release(); + size_bytes = rhs.size_bytes; + num_words = rhs.num_words; + cpu = rhs.cpu; + gpu = rhs.gpu; + untracked = rhs.untracked; + rhs.cpu.heap = nullptr; + return *this; + } + + Words(Words&& rhs) noexcept + : size_bytes{rhs.size_bytes}, num_words{rhs.num_words}, cpu{rhs.cpu}, gpu{rhs.gpu}, + untracked{rhs.untracked} { + rhs.cpu.heap = nullptr; + } + + Words& operator=(const Words&) = delete; + Words(const Words&) = delete; + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return num_words <= stack_words; + } + + /// Returns the number of words of the buffer + [[nodiscard]] size_t NumWords() const noexcept { + return num_words; + } + + /// Release buffer resources + void Release() { + if (!IsShort()) { + // CPU written words is the base for the heap allocation + delete[] cpu.heap; + } + } + + template + std::span Span() noexcept { + if constexpr (type == Type::CPU) { + return std::span(cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::GPU) { + return std::span(gpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::Untracked) { + return std::span(untracked.Pointer(IsShort()), num_words); + } + } + + template + std::span Span() const noexcept { + if constexpr (type == Type::CPU) { + return std::span(cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::GPU) { + return std::span(gpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::Untracked) { + return std::span(untracked.Pointer(IsShort()), num_words); + } + } + + u64 size_bytes = 0; + size_t num_words = 0; + WordsArray cpu; + WordsArray gpu; + WordsArray untracked; +}; + +template +class WordManager { +public: + explicit WordManager(PageManager* tracker_, VAddr cpu_addr_, u64 size_bytes) + : tracker{tracker_}, cpu_addr{cpu_addr_}, words{size_bytes} {} + + explicit WordManager() = default; + + void SetCpuAddress(VAddr new_cpu_addr) { + cpu_addr = new_cpu_addr; + } + + VAddr GetCpuAddr() const { + return cpu_addr; + } + + static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) { + constexpr size_t number_bits = sizeof(u64) * 8; + const size_t limit_page_end = number_bits - std::min(page_end, number_bits); + u64 bits = (word 
>> page_start) << page_start; + bits = (bits << limit_page_end) >> limit_page_end; + return bits; + } + + static std::pair GetWordPage(VAddr address) { + const size_t converted_address = static_cast(address); + const size_t word_number = converted_address / BYTES_PER_WORD; + const size_t amount_pages = converted_address % BYTES_PER_WORD; + return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE); + } + + template + void IterateWords(size_t offset, size_t size, Func&& func) const { + using FuncReturn = std::invoke_result_t; + static constexpr bool BOOL_BREAK = std::is_same_v; + const size_t start = static_cast(std::max(static_cast(offset), 0LL)); + const size_t end = static_cast(std::max(static_cast(offset + size), 0LL)); + if (start >= SizeBytes() || end <= start) { + return; + } + auto [start_word, start_page] = GetWordPage(start); + auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL); + const size_t num_words = NumWords(); + start_word = std::min(start_word, num_words); + end_word = std::min(end_word, num_words); + const size_t diff = end_word - start_word; + end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD; + end_word = std::min(end_word, num_words); + end_page += diff * PAGES_PER_WORD; + constexpr u64 base_mask{~0ULL}; + for (size_t word_index = start_word; word_index < end_word; word_index++) { + const u64 mask = ExtractBits(base_mask, start_page, end_page); + start_page = 0; + end_page -= PAGES_PER_WORD; + if constexpr (BOOL_BREAK) { + if (func(word_index, mask)) { + return; + } + } else { + func(word_index, mask); + } + } + } + + template + void IteratePages(u64 mask, Func&& func) const { + size_t offset = 0; + while (mask != 0) { + const size_t empty_bits = std::countr_zero(mask); + offset += empty_bits; + mask = mask >> empty_bits; + + const size_t continuous_bits = std::countr_one(mask); + func(offset, continuous_bits); + mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0; + offset += continuous_bits; + } + } + + /** + * Change the state of a range of pages + * + * @param dirty_addr Base address to mark or unmark as modified + * @param size Size in bytes to mark or unmark as modified + */ + template + void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) { + std::span state_words = words.template Span(); + [[maybe_unused]] std::span untracked_words = words.template Span(); + IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) { + if constexpr (type == Type::CPU) { + NotifyPageTracker(index, untracked_words[index], mask); + } + if constexpr (enable) { + state_words[index] |= mask; + if constexpr (type == Type::CPU) { + untracked_words[index] |= mask; + } + } else { + state_words[index] &= ~mask; + if constexpr (type == Type::CPU) { + untracked_words[index] &= ~mask; + } + } + }); + } + + /** + * Loop over each page in the given range, turn off those bits and notify the tracker if + * needed. Call the given function on each turned off range. 
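+     * Bits are only turned off when the 'clear' template parameter is true; otherwise
+     * the matching ranges are merely visited.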
+ * + * @param query_cpu_range Base CPU address to loop over + * @param size Size in bytes of the CPU range to loop over + * @param func Function to call for each turned off region + */ + template + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + static_assert(type != Type::Untracked); + + std::span state_words = words.template Span(); + [[maybe_unused]] std::span untracked_words = words.template Span(); + const size_t offset = query_cpu_range - cpu_addr; + bool pending = false; + size_t pending_offset{}; + size_t pending_pointer{}; + const auto release = [&]() { + func(cpu_addr + pending_offset * BYTES_PER_PAGE, + (pending_pointer - pending_offset) * BYTES_PER_PAGE); + }; + IterateWords(offset, size, [&](size_t index, u64 mask) { + if constexpr (type == Type::GPU) { + mask &= ~untracked_words[index]; + } + const u64 word = state_words[index] & mask; + if constexpr (clear) { + if constexpr (type == Type::CPU) { + NotifyPageTracker(index, untracked_words[index], mask); + } + state_words[index] &= ~mask; + if constexpr (type == Type::CPU) { + untracked_words[index] &= ~mask; + } + } + const size_t base_offset = index * PAGES_PER_WORD; + IteratePages(word, [&](size_t pages_offset, size_t pages_size) { + const auto reset = [&]() { + pending_offset = base_offset + pages_offset; + pending_pointer = base_offset + pages_offset + pages_size; + }; + if (!pending) { + reset(); + pending = true; + return; + } + if (pending_pointer == base_offset + pages_offset) { + pending_pointer += pages_size; + return; + } + release(); + reset(); + }); + }); + if (pending) { + release(); + } + } + + /** + * Returns true when a region has been modified + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + + const std::span state_words = words.template Span(); + [[maybe_unused]] const std::span untracked_words = + words.template Span(); + bool result = false; + IterateWords(offset, size, [&](size_t index, u64 mask) { + if constexpr (type == Type::GPU) { + mask &= ~untracked_words[index]; + } + const u64 word = state_words[index] & mask; + if (word != 0) { + result = true; + return true; + } + return false; + }); + return result; + } + + /// Returns the number of words of the manager + [[nodiscard]] size_t NumWords() const noexcept { + return words.NumWords(); + } + + /// Returns the size in bytes of the manager + [[nodiscard]] u64 SizeBytes() const noexcept { + return words.size_bytes; + } + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return words.IsShort(); + } + +private: + template + u64* Array() noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + /** + * Notify tracker about changes in the CPU tracking state of a word in the buffer + * + * @param word_index Index 
to the word to notify to the tracker + * @param current_bits Current state of the word + * @param new_bits New state of the word + * + * @tparam add_to_tracker True when the tracker should start tracking the new pages + */ + template + void NotifyPageTracker(u64 word_index, u64 current_bits, u64 new_bits) const { + u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits; + VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; + IteratePages(changed_bits, [&](size_t offset, size_t size) { + tracker->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, size * BYTES_PER_PAGE, + add_to_tracker ? 1 : -1); + }); + } + + PageManager* tracker; + VAddr cpu_addr = 0; + Words words; +}; + +} // namespace VideoCore diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp new file mode 100644 index 00000000..3ec12203 --- /dev/null +++ b/src/video_core/page_manager.cpp @@ -0,0 +1,209 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include "common/alignment.h" +#include "common/assert.h" +#include "common/error.h" +#include "video_core/page_manager.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" + +#ifndef _WIN64 +#include +#include +#include +#include +#include +#include +#endif + +namespace VideoCore { + +constexpr size_t PAGESIZE = 4_KB; +constexpr size_t PAGEBITS = 12; + +#ifdef SHADPS4_USERFAULTFD +struct PageManager::Impl { + Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} { + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + ASSERT_MSG(uffd != -1, "{}", Common::GetLastErrorMsg()); + + // Request uffdio features from kernel. + uffdio_api api; + api.api = UFFD_API; + api.features = UFFD_FEATURE_THREAD_ID; + const int ret = ioctl(uffd, UFFDIO_API, &api); + ASSERT(ret == 0 && api.api == UFFD_API); + + // Create uffd handler thread + ufd_thread = std::jthread([&](std::stop_token token) { UffdHandler(token); }); + } + + void OnMap(VAddr address, size_t size) { + uffdio_register reg; + reg.range.start = address; + reg.range.len = size; + reg.mode = UFFDIO_REGISTER_MODE_WP; + const int ret = ioctl(uffd, UFFDIO_REGISTER, ®); + ASSERT_MSG(ret != -1, "Uffdio register failed"); + } + + void OnUnmap(VAddr address, size_t size) { + uffdio_range range; + range.start = address; + range.len = size; + const int ret = ioctl(uffd, UFFDIO_UNREGISTER, &range); + ASSERT_MSG(ret != -1, "Uffdio unregister failed"); + } + + void Protect(VAddr address, size_t size, bool allow_write) { + uffdio_writeprotect wp; + wp.range.start = address; + wp.range.len = size; + wp.mode = allow_write ? 0 : UFFDIO_WRITEPROTECT_MODE_WP; + const int ret = ioctl(uffd, UFFDIO_WRITEPROTECT, &wp); + ASSERT_MSG(ret != -1, "Uffdio writeprotect failed with error: {}", + Common::GetLastErrorMsg()); + } + + void UffdHandler(std::stop_token token) { + while (!token.stop_requested()) { + pollfd pollfd; + pollfd.fd = uffd; + pollfd.events = POLLIN; + + // Block until the descriptor is ready for data reads. + const int pollres = poll(&pollfd, 1, -1); + switch (pollres) { + case -1: + perror("Poll userfaultfd"); + continue; + break; + case 0: + continue; + case 1: + break; + default: + UNREACHABLE_MSG("Unexpected number of descriptors {} out of poll", pollres); + } + + // We don't want an error condition to have occured. + ASSERT_MSG(!(pollfd.revents & POLLERR), "POLLERR on userfaultfd"); + + // We waited until there is data to read, we don't care about anything else. 
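+            // Anything other than POLLIN (e.g. a spurious wakeup) simply loops back to poll().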
+ if (!(pollfd.revents & POLLIN)) { + continue; + } + + // Read message from kernel. + uffd_msg msg; + const int readret = read(uffd, &msg, sizeof(msg)); + ASSERT_MSG(readret != -1 || errno == EAGAIN, "Unexpected result of uffd read"); + if (errno == EAGAIN) { + continue; + } + ASSERT_MSG(readret == sizeof(msg), "Unexpected short read, exiting"); + ASSERT(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP); + + // Notify rasterizer about the fault. + const VAddr addr = msg.arg.pagefault.address; + const VAddr addr_page = Common::AlignDown(addr, PAGESIZE); + rasterizer->InvalidateMemory(addr_page, PAGESIZE); + } + } + + Vulkan::Rasterizer* rasterizer; + std::jthread ufd_thread; + int uffd; +}; +#else +struct PageManager::Impl { + Impl(Vulkan::Rasterizer* rasterizer_) { + rasterizer = rasterizer_; + + sigset_t signal_mask; + sigemptyset(&signal_mask); + sigaddset(&signal_mask, SIGSEGV); + + using HandlerType = decltype(sigaction::sa_sigaction); + + struct sigaction guest_access_fault {}; + guest_access_fault.sa_flags = SA_SIGINFO | SA_ONSTACK; + guest_access_fault.sa_sigaction = &GuestFaultSignalHandler; + guest_access_fault.sa_mask = signal_mask; + sigaction(SIGSEGV, &guest_access_fault, nullptr); + } + + void OnMap(VAddr address, size_t size) {} + + void OnUnmap(VAddr address, size_t size) {} + + void Protect(VAddr address, size_t size, bool allow_write) { + mprotect(reinterpret_cast(address), size, + PROT_READ | (allow_write ? PROT_WRITE : 0)); + } + + static void GuestFaultSignalHandler(int sig, siginfo_t* info, void* raw_context) { + ucontext_t* ctx = reinterpret_cast(raw_context); + const VAddr address = reinterpret_cast(info->si_addr); + const greg_t err = ctx->uc_mcontext.gregs[REG_ERR]; + if (err & 0x2) { + rasterizer->InvalidateMemory(address, PAGESIZE); + } else { + // Read not supported! 
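+            // Dirty tracking only ever write-protects pages, so a faulting read means
+            // the access hit memory the cache does not manage and cannot be serviced here.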
+ UNREACHABLE(); + } + } + + inline static Vulkan::Rasterizer* rasterizer; +}; +#endif + +PageManager::PageManager(Vulkan::Rasterizer* rasterizer_) + : impl{std::make_unique(rasterizer_)}, rasterizer{rasterizer_} {} + +PageManager::~PageManager() = default; + +void PageManager::OnGpuMap(VAddr address, size_t size) { + impl->OnMap(address, size); +} + +void PageManager::OnGpuUnmap(VAddr address, size_t size) { + impl->OnUnmap(address, size); +} + +void PageManager::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) { + static constexpr u64 PageShift = 12; + + std::scoped_lock lk{mutex}; + const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1; + const u64 page_start = addr >> PageShift; + const u64 page_end = page_start + num_pages; + + const auto pages_interval = + decltype(cached_pages)::interval_type::right_open(page_start, page_end); + if (delta > 0) { + cached_pages.add({pages_interval, delta}); + } + + const auto& range = cached_pages.equal_range(pages_interval); + for (const auto& [range, count] : boost::make_iterator_range(range)) { + const auto interval = range & pages_interval; + const VAddr interval_start_addr = boost::icl::first(interval) << PageShift; + const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift; + const u32 interval_size = interval_end_addr - interval_start_addr; + if (delta > 0 && count == delta) { + impl->Protect(interval_start_addr, interval_size, false); + } else if (delta < 0 && count == -delta) { + impl->Protect(interval_start_addr, interval_size, true); + } else { + ASSERT(count >= 0); + } + } + + if (delta < 0) { + cached_pages.add({pages_interval, delta}); + } +} + +} // namespace VideoCore diff --git a/src/video_core/page_manager.h b/src/video_core/page_manager.h new file mode 100644 index 00000000..0dc022aa --- /dev/null +++ b/src/video_core/page_manager.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include "common/types.h" + +namespace Vulkan { +class Rasterizer; +} + +namespace VideoCore { + +class PageManager { +public: + explicit PageManager(Vulkan::Rasterizer* rasterizer); + ~PageManager(); + + /// Register a range of mapped gpu memory. + void OnGpuMap(VAddr address, size_t size); + + /// Unregister a range of gpu memory that was unmapped. 
+ void OnGpuUnmap(VAddr address, size_t size); + + /// Increase/decrease the number of surface in pages touching the specified region + void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta); + +private: + struct Impl; + std::unique_ptr impl; + Vulkan::Rasterizer* rasterizer; + std::mutex mutex; + boost::icl::interval_map cached_pages; +}; + +} // namespace VideoCore diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 6810bf34..c78d629e 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -67,8 +67,8 @@ RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* : window{window_}, liverpool{liverpool_}, instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, draw_scheduler{instance}, present_scheduler{instance}, flip_scheduler{instance}, swapchain{instance, window}, - texture_cache{instance, draw_scheduler} { - rasterizer = std::make_unique(instance, draw_scheduler, texture_cache, liverpool); + rasterizer{std::make_unique(instance, draw_scheduler, liverpool)}, + texture_cache{rasterizer->GetTextureCache()} { const u32 num_images = swapchain.GetImageCount(); const vk::Device device = instance.GetDevice(); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 3fe9267f..8178c88d 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -47,7 +47,7 @@ public: Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address, bool is_eop) { const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, cpu_address); + const auto image_id = texture_cache.FindImage(info, false); auto& image = texture_cache.GetImage(image_id); return PrepareFrameInternal(image, is_eop); } @@ -61,7 +61,7 @@ public: const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address) { vo_buffers_addr.emplace_back(cpu_address); const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, cpu_address); + const auto image_id = texture_cache.FindImage(info, false); return texture_cache.GetImage(image_id); } @@ -88,7 +88,7 @@ private: Scheduler flip_scheduler; Swapchain swapchain; std::unique_ptr rasterizer; - VideoCore::TextureCache texture_cache; + VideoCore::TextureCache& texture_cache; vk::UniqueCommandPool command_pool; std::vector present_frames; std::queue free_queue; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index d8e5f7fa..cc519374 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -3,7 +3,7 @@ #include #include "common/alignment.h" -#include "core/memory.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -51,6 +51,12 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler }); } + const vk::PushConstantRange push_constants = { + .stageFlags = vk::ShaderStageFlagBits::eCompute, + .offset = 0, + .size = sizeof(Shader::PushData), + }; + const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { .flags = 
vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, .bindingCount = static_cast(bindings.size()), @@ -62,8 +68,8 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler const vk::PipelineLayoutCreateInfo layout_info = { .setLayoutCount = 1U, .pSetLayouts = &set_layout, - .pushConstantRangeCount = 0, - .pPushConstantRanges = nullptr, + .pushConstantRangeCount = 1U, + .pPushConstantRanges = &push_constants, }; pipeline_layout = instance.GetDevice().createPipelineLayoutUnique(layout_info); @@ -82,35 +88,18 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler ComputePipeline::~ComputePipeline() = default; -bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging, +bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const { // Bind resource buffers and textures. boost::container::static_vector buffer_infos; boost::container::static_vector image_infos; boost::container::small_vector set_writes; + Shader::PushData push_data{}; u32 binding{}; for (const auto& buffer : info.buffers) { const auto vsharp = buffer.GetVsharp(info); - const u32 size = vsharp.GetSize(); const VAddr address = vsharp.base_address; - if (buffer.is_storage) { - texture_cache.OnCpuWrite(address); - } - const u32 offset = staging.Copy(address, size, - buffer.is_storage ? instance.StorageMinAlignment() - : instance.UniformMinAlignment()); - buffer_infos.emplace_back(staging.Handle(), offset, size); - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = buffer.is_storage ? vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, - .pBufferInfo = &buffer_infos.back(), - }); - // Most of the time when a metadata is updated with a shader it gets cleared. It means we // can skip the whole dispatch and update the tracked state instead. Also, it is not // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we will @@ -125,6 +114,30 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)"); } } + const u32 size = vsharp.GetSize(); + if (buffer.is_written) { + texture_cache.InvalidateMemory(address, size); + } + const u32 alignment = + buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); + const auto [vk_buffer, offset] = + buffer_cache.ObtainBuffer(address, size, buffer.is_written); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + if (adjust != 0) { + ASSERT(adjust % 4 == 0); + push_data.AddOffset(binding, adjust); + } + buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = buffer.is_storage ? 
vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &buffer_infos.back(), + }); } for (const auto& image_desc : info.images) { @@ -168,6 +181,8 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s } const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pushConstants(*pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(push_data), + &push_data); cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, set_writes); return true; } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 4cdcccfc..752f8c39 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -6,13 +6,10 @@ #include "shader_recompiler/runtime_info.h" #include "video_core/renderer_vulkan/vk_common.h" -namespace Core { -class MemoryManager; -} - namespace VideoCore { +class BufferCache; class TextureCache; -} +} // namespace VideoCore namespace Vulkan { @@ -31,7 +28,7 @@ public: return *pipeline; } - bool BindResources(Core::MemoryManager* memory, StreamBuffer& staging, + bool BindResources(VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const; private: diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 7b00a911..c9f76124 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -1,13 +1,14 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later - +#pragma clang optimize off #include #include #include +#include "common/alignment.h" #include "common/assert.h" -#include "core/memory.h" #include "video_core/amdgpu/resource.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -32,9 +33,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul BuildDescSetLayout(); const vk::PushConstantRange push_constants = { - .stageFlags = vk::ShaderStageFlagBits::eVertex, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, .offset = 0, - .size = 2 * sizeof(u32), + .size = sizeof(Shader::PushData), }; const vk::DescriptorSetLayout set_layout = *desc_layout; @@ -328,25 +329,36 @@ void GraphicsPipeline::BuildDescSetLayout() { desc_layout = instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci); } -void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging, +void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, + VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const { - BindVertexBuffers(staging); - // Bind resource buffers and textures. 
boost::container::static_vector buffer_infos; boost::container::static_vector image_infos; boost::container::small_vector set_writes; + Shader::PushData push_data{}; u32 binding{}; for (const auto& stage : stages) { + if (stage.uses_step_rates) { + push_data.step0 = regs.vgt_instance_step_rate_0; + push_data.step1 = regs.vgt_instance_step_rate_1; + } for (const auto& buffer : stage.buffers) { const auto vsharp = buffer.GetVsharp(stage); const VAddr address = vsharp.base_address; const u32 size = vsharp.GetSize(); - const u32 offset = staging.Copy(address, size, - buffer.is_storage ? instance.StorageMinAlignment() - : instance.UniformMinAlignment()); - buffer_infos.emplace_back(staging.Handle(), offset, size); + const u32 alignment = + buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); + const auto [vk_buffer, offset] = + buffer_cache.ObtainBuffer(address, size, buffer.is_written); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + if (adjust != 0) { + ASSERT(adjust % 4 == 0); + push_data.AddOffset(binding, adjust); + } + buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust); set_writes.push_back({ .dstSet = VK_NULL_HANDLE, .dstBinding = binding++, @@ -406,86 +418,15 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& } } + const auto cmdbuf = scheduler.CommandBuffer(); if (!set_writes.empty()) { - const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0, set_writes); } -} - -void GraphicsPipeline::BindVertexBuffers(StreamBuffer& staging) const { - const auto& vs_info = stages[u32(Shader::Stage::Vertex)]; - if (vs_info.vs_inputs.empty()) { - return; - } - - std::array host_buffers; - std::array host_offsets; - boost::container::static_vector guest_buffers; - - struct BufferRange { - VAddr base_address; - VAddr end_address; - u64 offset; // offset in the mapped memory - - size_t GetSize() const { - return end_address - base_address; - } - }; - - // Calculate buffers memory overlaps - boost::container::static_vector ranges{}; - for (const auto& input : vs_info.vs_inputs) { - if (input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate0 || - input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate1) { - continue; - } - - const auto& buffer = vs_info.ReadUd(input.sgpr_base, input.dword_offset); - if (buffer.GetSize() == 0) { - continue; - } - guest_buffers.emplace_back(buffer); - ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize()); - } - std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) { - return lhv.base_address < rhv.base_address; - }); - - boost::container::static_vector ranges_merged{ranges[0]}; - for (auto range : ranges) { - auto& prev_range = ranges_merged.back(); - if (prev_range.end_address < range.base_address) { - ranges_merged.emplace_back(range); - } else { - prev_range.end_address = std::max(prev_range.end_address, range.end_address); - } - } - - // Map buffers - for (auto& range : ranges_merged) { - range.offset = staging.Copy(range.base_address, range.GetSize(), 4); - } - - // Bind vertex buffers - const size_t num_buffers = guest_buffers.size(); - for (u32 i = 0; i < num_buffers; ++i) { - const auto& buffer = guest_buffers[i]; - const auto& host_buffer = std::ranges::find_if( - ranges_merged.cbegin(), ranges_merged.cend(), [&](const BufferRange& range) { - 
return (buffer.base_address >= range.base_address && - buffer.base_address < range.end_address); - }); - assert(host_buffer != ranges_merged.cend()); - - host_buffers[i] = staging.Handle(); - host_offsets[i] = host_buffer->offset + buffer.base_address - host_buffer->base_address; - } - - if (num_buffers > 0) { - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindVertexBuffers(0, num_buffers, host_buffers.data(), host_offsets.data()); - } + cmdbuf.pushConstants(*pipeline_layout, + vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, 0U, + sizeof(push_data), &push_data); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, Handle()); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index e1564f8f..2d8d9847 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -7,13 +7,10 @@ #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_common.h" -namespace Core { -class MemoryManager; -} - namespace VideoCore { +class BufferCache; class TextureCache; -} +} // namespace VideoCore namespace Vulkan { @@ -64,7 +61,7 @@ public: std::array modules); ~GraphicsPipeline(); - void BindResources(Core::MemoryManager* memory, StreamBuffer& staging, + void BindResources(const Liverpool::Regs& regs, VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const; vk::Pipeline Handle() const noexcept { @@ -75,6 +72,10 @@ public: return *pipeline_layout; } + const Shader::Info& GetStage(Shader::Stage stage) const noexcept { + return stages[u32(stage)]; + } + bool IsEmbeddedVs() const noexcept { static constexpr size_t EmbeddedVsHash = 0x9b2da5cf47f8c29f; return key.stage_hashes[u32(Shader::Stage::Vertex)] == EmbeddedVsHash; @@ -90,7 +91,6 @@ public: private: void BuildDescSetLayout(); - void BindVertexBuffers(StreamBuffer& staging) const; private: const Instance& instance; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index ff5e97d5..34807323 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -13,22 +13,16 @@ namespace Vulkan { -static constexpr vk::BufferUsageFlags VertexIndexFlags = - vk::BufferUsageFlagBits::eVertexBuffer | vk::BufferUsageFlagBits::eIndexBuffer | - vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eUniformBuffer | - vk::BufferUsageFlagBits::eStorageBuffer; - Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, - VideoCore::TextureCache& texture_cache_, AmdGpu::Liverpool* liverpool_) - : instance{instance_}, scheduler{scheduler_}, texture_cache{texture_cache_}, - liverpool{liverpool_}, memory{Core::Memory::Instance()}, - pipeline_cache{instance, scheduler, liverpool}, - vertex_index_buffer{instance, scheduler, VertexIndexFlags, 2_GB, BufferType::Upload} { + AmdGpu::Liverpool* liverpool_) + : instance{instance_}, scheduler{scheduler_}, page_manager{this}, + buffer_cache{instance, scheduler, liverpool_, page_manager}, + texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_}, + memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} { if (!Config::nullGpu()) { liverpool->BindRasterizer(this); } - - memory->SetInstance(&instance); + memory->SetRasterizer(this); } Rasterizer::~Rasterizer() = default; @@ -38,29 +32,24 @@ 
void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
     const auto cmdbuf = scheduler.CommandBuffer();
     const auto& regs = liverpool->regs;
-    const u32 num_indices = SetupIndexBuffer(is_indexed, index_offset);
     const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline();
     if (!pipeline) {
         return;
     }
 
     try {
-        pipeline->BindResources(memory, vertex_index_buffer, texture_cache);
+        pipeline->BindResources(regs, buffer_cache, texture_cache);
     } catch (...) {
         UNREACHABLE();
     }
 
+    const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex);
+    buffer_cache.BindVertexBuffers(vs_info);
+    const u32 num_indices = buffer_cache.BindIndexBuffer(is_indexed, index_offset);
+
     BeginRendering();
     UpdateDynamicState(*pipeline);
-    cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle());
-
-    const u32 step_rates[] = {
-        regs.vgt_instance_step_rate_0,
-        regs.vgt_instance_step_rate_1,
-    };
-    cmdbuf.pushConstants(pipeline->GetLayout(), vk::ShaderStageFlagBits::eVertex, 0u,
-                         sizeof(step_rates), &step_rates);
 
     if (is_indexed) {
         cmdbuf.drawIndexed(num_indices, regs.num_instances.NumInstances(), 0, 0, 0);
     } else {
@@ -82,8 +71,7 @@ void Rasterizer::DispatchDirect() {
     }
 
     try {
-        const auto has_resources =
-            pipeline->BindResources(memory, vertex_index_buffer, texture_cache);
+        const auto has_resources = pipeline->BindResources(buffer_cache, texture_cache);
         if (!has_resources) {
             return;
         }
@@ -131,7 +119,7 @@ void Rasterizer::BeginRendering() {
         state.color_images[state.num_color_attachments] = image.image;
         state.color_attachments[state.num_color_attachments++] = {
             .imageView = *image_view.image_view,
-            .imageLayout = vk::ImageLayout::eGeneral,
+            .imageLayout = vk::ImageLayout::eColorAttachmentOptimal,
             .loadOp = is_clear ? vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad,
             .storeOp = vk::AttachmentStoreOp::eStore,
             .clearValue =
@@ -168,45 +156,19 @@ void Rasterizer::BeginRendering() {
     scheduler.BeginRendering(state);
 }
 
-u32 Rasterizer::SetupIndexBuffer(bool& is_indexed, u32 index_offset) {
-    // Emulate QuadList primitive type with CPU made index buffer.
-    const auto& regs = liverpool->regs;
-    if (liverpool->regs.primitive_type == Liverpool::PrimitiveType::QuadList) {
-        // ASSERT_MSG(!is_indexed, "Using QuadList primitive with indexed draw");
-        is_indexed = true;
+void Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
+    buffer_cache.InvalidateMemory(addr, size);
+    texture_cache.InvalidateMemory(addr, size);
+}
 
-        // Emit indices.
-        const u32 index_size = 3 * regs.num_indices;
-        const auto [data, offset, _] = vertex_index_buffer.Map(index_size);
-        LiverpoolToVK::EmitQuadToTriangleListIndices(data, regs.num_indices);
-        vertex_index_buffer.Commit(index_size);
+void Rasterizer::MapMemory(VAddr addr, u64 size) {
+    page_manager.OnGpuMap(addr, size);
+}
 
-        // Bind index buffer.
-        const auto cmdbuf = scheduler.CommandBuffer();
-        cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset, vk::IndexType::eUint16);
-        return index_size / sizeof(u16);
-    }
-    if (!is_indexed) {
-        return regs.num_indices;
-    }
-
-    // Figure out index type and size.
-    const bool is_index16 = regs.index_buffer_type.index_type == Liverpool::IndexType::Index16;
-    const vk::IndexType index_type = is_index16 ? vk::IndexType::eUint16 : vk::IndexType::eUint32;
-    const u32 index_size = is_index16 ? sizeof(u16) : sizeof(u32);
-
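
Note on the QuadList path removed above: the index emission performed by LiverpoolToVK::EmitQuadToTriangleListIndices (now driven from BufferCache::BindIndexBuffer per the Draw() changes earlier) expands each guest quad into two triangles, which is consistent with the index_size = 3 * regs.num_indices byte count for 16-bit indices. A minimal standalone sketch of that expansion; the (v0,v1,v2)/(v0,v2,v3) winding is an assumption, not necessarily the emulator's exact order:

    // quad_to_tris.cpp - each quad (v0,v1,v2,v3) becomes triangles (v0,v1,v2) and (v0,v2,v3).
    #include <cstdint>
    #include <vector>

    std::vector<uint16_t> QuadListToTriangleList(uint32_t num_quad_vertices) {
        std::vector<uint16_t> indices;
        indices.reserve(num_quad_vertices / 4 * 6); // 6 indices per 4 quad vertices
        for (uint32_t v = 0; v + 3 < num_quad_vertices; v += 4) {
            const uint16_t v0 = static_cast<uint16_t>(v);
            indices.insert(indices.end(), {v0, uint16_t(v0 + 1), uint16_t(v0 + 2),
                                           v0, uint16_t(v0 + 2), uint16_t(v0 + 3)});
        }
        return indices; // 1.5 * num_quad_vertices entries, i.e. 3 bytes per quad vertex as u16
    }
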
-    // Upload index data to stream buffer.
-    const auto index_address = regs.index_base_address.Address();
-    const u32 index_buffer_size = (index_offset + regs.num_indices) * index_size;
-    const auto [data, offset, _] = vertex_index_buffer.Map(index_buffer_size);
-    std::memcpy(data, index_address, index_buffer_size);
-    vertex_index_buffer.Commit(index_buffer_size);
-
-    // Bind index buffer.
-    const auto cmdbuf = scheduler.CommandBuffer();
-    cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset + index_offset * index_size,
-                           index_type);
-    return regs.num_indices;
+void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
+    buffer_cache.InvalidateMemory(addr, size);
+    texture_cache.UnmapMemory(addr, size);
+    page_manager.OnGpuUnmap(addr, size);
 }
 
 void Rasterizer::UpdateDynamicState(const GraphicsPipeline& pipeline) {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 64dc87ef..7a2d105b 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -3,8 +3,10 @@
 
 #pragma once
 
+#include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/page_manager.h"
 #include "video_core/renderer_vulkan/vk_pipeline_cache.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
+#include "video_core/texture_cache/texture_cache.h"
 
 namespace AmdGpu {
 struct Liverpool;
@@ -14,10 +16,6 @@ namespace Core {
 class MemoryManager;
 }
 
-namespace VideoCore {
-class TextureCache;
-}
-
 namespace Vulkan {
 
 class Scheduler;
@@ -26,9 +24,13 @@ class GraphicsPipeline;
 
 class Rasterizer {
 public:
     explicit Rasterizer(const Instance& instance, Scheduler& scheduler,
-                        VideoCore::TextureCache& texture_cache, AmdGpu::Liverpool* liverpool);
+                        AmdGpu::Liverpool* liverpool);
     ~Rasterizer();
 
+    [[nodiscard]] VideoCore::TextureCache& GetTextureCache() noexcept {
+        return texture_cache;
+    }
+
     void Draw(bool is_indexed, u32 index_offset = 0);
 
     void DispatchDirect();
 
@@ -36,12 +38,13 @@ public:
     void ScopeMarkerBegin(const std::string& str);
     void ScopeMarkerEnd();
 
+    void InvalidateMemory(VAddr addr, u64 size);
+    void MapMemory(VAddr addr, u64 size);
+    void UnmapMemory(VAddr addr, u64 size);
+
     u64 Flush();
 
 private:
-    u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset);
-    void MapMemory(VAddr addr, size_t size);
-
     void BeginRendering();
     void UpdateDynamicState(const GraphicsPipeline& pipeline);
 
@@ -51,11 +54,12 @@ private:
 private:
     const Instance& instance;
     Scheduler& scheduler;
-    VideoCore::TextureCache& texture_cache;
+    VideoCore::PageManager page_manager;
+    VideoCore::BufferCache buffer_cache;
+    VideoCore::TextureCache texture_cache;
     AmdGpu::Liverpool* liverpool;
     Core::MemoryManager* memory;
     PipelineCache pipeline_cache;
-    StreamBuffer vertex_index_buffer;
 };
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 48c3af7a..b82d558c 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -6,6 +6,7 @@
 #include
 #include
 #include "common/types.h"
+#include "common/unique_function.h"
 #include "video_core/renderer_vulkan/vk_master_semaphore.h"
 #include "video_core/renderer_vulkan/vk_resource_pool.h"
 
@@ -97,8 +98,8 @@ public:
     }
 
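
The vk_scheduler.h hunk that follows switches DeferOperation to a move-only Common::UniqueFunction. The deferral pattern it feeds is tick-gated: each callback is queued together with the CPU tick current at submission and is run once the GPU is known to have passed that tick. A rough sketch of the draining side, with names other than pending_ops and gpu_tick assumed rather than taken from the real scheduler:

    #include <cstdint>
    #include <queue>
    #include "common/unique_function.h"

    struct PendingOp {
        Common::UniqueFunction<void> callback;
        uint64_t gpu_tick;
    };

    // Run every deferred operation whose tick the GPU has already completed.
    void DrainPendingOps(std::queue<PendingOp>& pending_ops, uint64_t completed_gpu_tick) {
        while (!pending_ops.empty() && pending_ops.front().gpu_tick <= completed_gpu_tick) {
            pending_ops.front().callback();
            pending_ops.pop();
        }
    }
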
     /// Defers an operation until the gpu has reached the current cpu tick.
-    void DeferOperation(auto&& func) {
-        pending_ops.emplace(func, CurrentTick());
+    void DeferOperation(Common::UniqueFunction<void>&& func) {
+        pending_ops.emplace(std::move(func), CurrentTick());
     }
 
     static std::mutex submit_mutex;
@@ -115,7 +116,7 @@ private:
     vk::CommandBuffer current_cmdbuf;
     std::condition_variable_any event_cv;
     struct PendingOp {
-        std::function<void()> callback;
+        Common::UniqueFunction<void> callback;
         u64 gpu_tick;
     };
     std::queue<PendingOp> pending_ops;
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index f7957ac0..e29728d1 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -22,8 +22,6 @@ class Instance;
 class Scheduler;
 
 class StreamBuffer final {
-    static constexpr std::size_t MAX_BUFFER_VIEWS = 3;
-
 public:
     explicit StreamBuffer(const Instance& instance, Scheduler& scheduler,
                           vk::BufferUsageFlags usage, u64 size,
diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp
index e01a61ae..94917be0 100644
--- a/src/video_core/texture_cache/image_info.cpp
+++ b/src/video_core/texture_cache/image_info.cpp
@@ -260,7 +260,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept {
     case AmdGpu::TilingMode::Display_MacroTiled:
     case AmdGpu::TilingMode::Texture_MacroTiled:
     case AmdGpu::TilingMode::Depth_MacroTiled: {
-        ASSERT(!props.is_cube && !props.is_block);
+        // ASSERT(!props.is_cube && !props.is_block);
         ASSERT(num_samples == 1);
         std::tie(mip_info.pitch, mip_info.size) =
             ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index);
diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp
index 04bedaff..ef6163c4 100644
--- a/src/video_core/texture_cache/image_view.cpp
+++ b/src/video_core/texture_cache/image_view.cpp
@@ -61,23 +61,24 @@ vk::Format TrySwizzleFormat(vk::Format format, u32 dst_sel) {
     return format;
 }
 
-ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept
-    : is_storage{is_storage} {
+ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage_) noexcept
+    : is_storage{is_storage_} {
     type = ConvertImageViewType(image.GetType());
     format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt());
     range.base.level = image.base_level;
     range.base.layer = image.base_array;
     range.extent.levels = image.last_level + 1;
     range.extent.layers = image.last_array + 1;
-    mapping.r = ConvertComponentSwizzle(image.dst_sel_x);
-    mapping.g = ConvertComponentSwizzle(image.dst_sel_y);
-    mapping.b = ConvertComponentSwizzle(image.dst_sel_z);
-    mapping.a = ConvertComponentSwizzle(image.dst_sel_w);
+    if (!is_storage) {
+        mapping.r = ConvertComponentSwizzle(image.dst_sel_x);
+        mapping.g = ConvertComponentSwizzle(image.dst_sel_y);
+        mapping.b = ConvertComponentSwizzle(image.dst_sel_z);
+        mapping.a = ConvertComponentSwizzle(image.dst_sel_w);
+    }
     // Check for unfortunate case of storage images being swizzled
     const u32 num_comps = AmdGpu::NumComponents(image.GetDataFmt());
     const u32 dst_sel = image.DstSelect();
     if (is_storage && !IsIdentityMapping(dst_sel, num_comps)) {
-        mapping = vk::ComponentMapping{};
         if (auto new_format = TrySwizzleFormat(format, dst_sel); new_format != format) {
             format = new_format;
             return;
diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp
index 7b8a5554..bffa6eff 100644
--- a/src/video_core/texture_cache/texture_cache.cpp
+++ b/src/video_core/texture_cache/texture_cache.cpp
@@ -3,103 +3,23 @@
 
 #include
 #include "common/assert.h"
-#include "common/config.h"
-#include "core/virtual_memory.h"
+#include "video_core/page_manager.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/texture_cache/texture_cache.h"
 #include "video_core/texture_cache/tile_manager.h"
 
-#ifndef _WIN64
-#include
-#include
-
-#define PAGE_NOACCESS PROT_NONE
-#define PAGE_READWRITE (PROT_READ | PROT_WRITE)
-#define PAGE_READONLY PROT_READ
-#else
-#include
-
-void mprotect(void* addr, size_t len, int prot) {
-    DWORD old_prot{};
-    BOOL result = VirtualProtect(addr, len, prot, &old_prot);
-    ASSERT_MSG(result != 0, "Region protection failed");
-}
-
-#endif
-
 namespace VideoCore {
 
-static TextureCache* g_texture_cache = nullptr;
-
-#ifndef _WIN64
-void GuestFaultSignalHandler(int sig, siginfo_t* info, void* raw_context) {
-    ucontext_t* ctx = reinterpret_cast<ucontext_t*>(raw_context);
-    const VAddr address = reinterpret_cast<VAddr>(info->si_addr);
-
-#ifdef __APPLE__
-    const u32 err = ctx->uc_mcontext->__es.__err;
-#else
-    const greg_t err = ctx->uc_mcontext.gregs[REG_ERR];
-#endif
-
-    if (err & 0x2) {
-        g_texture_cache->OnCpuWrite(address);
-    } else {
-        // Read not supported!
-        UNREACHABLE();
-    }
-}
-#else
-LONG WINAPI GuestFaultSignalHandler(EXCEPTION_POINTERS* pExp) noexcept {
-    const u32 ec = pExp->ExceptionRecord->ExceptionCode;
-    if (ec == EXCEPTION_ACCESS_VIOLATION) {
-        const auto info = pExp->ExceptionRecord->ExceptionInformation;
-        if (info[0] == 1) { // Write violation
-            g_texture_cache->OnCpuWrite(info[1]);
-            return EXCEPTION_CONTINUE_EXECUTION;
-        } /* else {
-            UNREACHABLE();
-        }*/
-    }
-    return EXCEPTION_CONTINUE_SEARCH; // pass further
-}
-#endif
-
 static constexpr u64 StreamBufferSize = 512_MB;
 static constexpr u64 PageShift = 12;
 
-TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_)
-    : instance{instance_}, scheduler{scheduler_},
+TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
+                           BufferCache& buffer_cache_, PageManager& tracker_)
+    : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_},
       staging{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, StreamBufferSize,
              Vulkan::BufferType::Upload},
       tile_manager{instance, scheduler} {
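
The platform fault handlers removed above (together with the mprotect shim and the setup removed just below) move behind the new PageManager, whose implementation is not shown in this file. The following is therefore only a hedged sketch of the expected POSIX shape: a process-wide handler resolves the faulting address and forwards the write to the rasterizer, which fans out to both caches via Rasterizer::InvalidateMemory as seen earlier. All names other than InvalidateMemory are illustrative.

    #include <cstdint>
    #include <signal.h>

    using VAddr = std::uintptr_t;

    // Callback supplied by the rasterizer; stands in for PageManager's rasterizer pointer.
    static void (*g_invalidate)(VAddr addr, std::uint64_t size) = nullptr;

    static void GuestFaultHandler(int /*sig*/, siginfo_t* info, void* /*ctx*/) {
        // The real handler also verifies that the fault is a write, as the removed code did.
        const VAddr addr = reinterpret_cast<VAddr>(info->si_addr);
        // Invalidate the 4 KiB page containing the write; the caches untrack it, the page is
        // unprotected again, and the guest write retries successfully on return.
        g_invalidate(addr & ~VAddr(0xFFF), 0x1000);
    }

    static void InstallGuestFaultHandler(void (*invalidate)(VAddr, std::uint64_t)) {
        g_invalidate = invalidate;
        struct sigaction action {};
        action.sa_flags = SA_SIGINFO | SA_ONSTACK;
        action.sa_sigaction = &GuestFaultHandler;
        sigaction(SIGSEGV, &action, nullptr);
    }
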
-
-#ifndef _WIN64
-#ifdef __APPLE__
-    // Read-only memory write results in SIGBUS on Apple.
-    static constexpr int SignalType = SIGBUS;
-#else
-    static constexpr int SignalType = SIGSEGV;
-#endif
-
-    sigset_t signal_mask;
-    sigemptyset(&signal_mask);
-    sigaddset(&signal_mask, SignalType);
-
-    using HandlerType = decltype(sigaction::sa_sigaction);
-
-    struct sigaction guest_access_fault {};
-    guest_access_fault.sa_flags = SA_SIGINFO | SA_ONSTACK;
-    guest_access_fault.sa_sigaction = &GuestFaultSignalHandler;
-    guest_access_fault.sa_mask = signal_mask;
-    sigaction(SignalType, &guest_access_fault, nullptr);
-#else
-    veh_handle = AddVectoredExceptionHandler(0, GuestFaultSignalHandler);
-    ASSERT_MSG(veh_handle, "Failed to register an exception handler");
-#endif
-    g_texture_cache = this;
-
     ImageInfo info;
     info.pixel_format = vk::Format::eR8G8B8A8Unorm;
     info.type = vk::ImageType::e2D;
@@ -110,15 +30,11 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler&
     void(slot_image_views.insert(instance, view_info, slot_images[null_id], null_id));
 }
 
-TextureCache::~TextureCache() {
-#if _WIN64
-    RemoveVectoredExceptionHandler(veh_handle);
-#endif
-}
+TextureCache::~TextureCache() = default;
 
-void TextureCache::OnCpuWrite(VAddr address) {
-    std::unique_lock lock{m_page_table};
-    ForEachImageInRegion(address, 1 << PageShift, [&](ImageId image_id, Image& image) {
+void TextureCache::InvalidateMemory(VAddr address, size_t size) {
+    std::unique_lock lock{mutex};
+    ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) {
         // Ensure image is reuploaded when accessed again.
         image.flags |= ImageFlagBits::CpuModified;
         // Untrack image, so the range is unprotected and the guest can write freely.
@@ -126,8 +42,28 @@ void TextureCache::OnCpuWrite(VAddr address) {
     });
 }
 
+void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) {
+    std::scoped_lock lk{mutex};
+
+    boost::container::small_vector deleted_images;
+    ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); });
+    for (const ImageId id : deleted_images) {
+        Image& image = slot_images[id];
+        if (True(image.flags & ImageFlagBits::Tracked)) {
+            UntrackImage(image, id);
+        }
+        // TODO: Download image data back to host.
+        UnregisterImage(id);
+        DeleteImage(id);
+    }
+}
+
 ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) {
-    std::unique_lock lock{m_page_table};
+    if (info.guest_address == 0) [[unlikely]] {
+        return NULL_IMAGE_VIEW_ID;
+    }
+
+    std::unique_lock lock{mutex};
     boost::container::small_vector image_ids;
     ForEachImageInRegion(
         info.guest_address, info.guest_size_bytes, [&](ImageId image_id, Image& image) {
@@ -183,10 +119,6 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo
 }
 
 ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) {
-    if (info.guest_address == 0) [[unlikely]] {
-        return slot_image_views[NULL_IMAGE_VIEW_ID];
-    }
-
     const ImageId image_id = FindImage(info);
     Image& image = slot_images[image_id];
     auto& usage = image.info.usage;
@@ -344,9 +276,6 @@ void TextureCache::RefreshImage(Image& image) {
     }
     cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal,
                              image_copy);
-
-    image.Transit(vk::ImageLayout::eGeneral,
-                  vk::AccessFlagBits::eMemoryWrite | vk::AccessFlagBits::eMemoryRead);
 }
 
 vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) {
@@ -362,8 +291,6 @@ void TextureCache::RegisterImage(ImageId image_id) {
     image.flags |= ImageFlagBits::Registered;
     ForEachPage(image.cpu_addr, image.info.guest_size_bytes,
                 [this, image_id](u64 page) { page_table[page].push_back(image_id); });
-
-    image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eNone);
 }
 
 void TextureCache::UnregisterImage(ImageId image_id) {
@@ -393,7 +320,7 @@ void TextureCache::TrackImage(Image& image, ImageId image_id) {
         return;
     }
     image.flags |= ImageFlagBits::Tracked;
-    UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, 1);
+    tracker.UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, 1);
 }
 
 void TextureCache::UntrackImage(Image& image, ImageId image_id) {
@@ -401,40 +328,34 @@ void TextureCache::UntrackImage(Image& image, ImageId image_id) {
         return;
     }
     image.flags &= ~ImageFlagBits::Tracked;
-    UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, -1);
+    tracker.UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, -1);
 }
 
-void TextureCache::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) {
-    std::scoped_lock lk{mutex};
-    const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1;
-    const u64 page_start = addr >> PageShift;
-    const u64 page_end = page_start + num_pages;
+void TextureCache::DeleteImage(ImageId image_id) {
+    Image& image = slot_images[image_id];
+    ASSERT_MSG(False(image.flags & ImageFlagBits::Tracked), "Image was not untracked");
+    ASSERT_MSG(False(image.flags & ImageFlagBits::Registered), "Image was not unregistered");
 
-    const auto pages_interval =
-        decltype(cached_pages)::interval_type::right_open(page_start, page_end);
-    if (delta > 0) {
-        cached_pages.add({pages_interval, delta});
+    // Remove any registered meta areas.
+    const auto& meta_info = image.info.meta_info;
+    if (meta_info.cmask_addr) {
+        surface_metas.erase(meta_info.cmask_addr);
+    }
+    if (meta_info.fmask_addr) {
+        surface_metas.erase(meta_info.fmask_addr);
+    }
+    if (meta_info.htile_addr) {
+        surface_metas.erase(meta_info.htile_addr);
     }
 
-    const auto& range = cached_pages.equal_range(pages_interval);
-    for (const auto& [range, count] : boost::make_iterator_range(range)) {
-        const auto interval = range & pages_interval;
-        const VAddr interval_start_addr = boost::icl::first(interval) << PageShift;
-        const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift;
-        const u32 interval_size = interval_end_addr - interval_start_addr;
-        void* addr = reinterpret_cast<void*>(interval_start_addr);
-        if (delta > 0 && count == delta) {
-            mprotect(addr, interval_size, PAGE_READONLY);
-        } else if (delta < 0 && count == -delta) {
-            mprotect(addr, interval_size, PAGE_READWRITE);
-        } else {
-            ASSERT(count >= 0);
+    // Reclaim image and any image views it references.
+    scheduler.DeferOperation([this, image_id] {
+        Image& image = slot_images[image_id];
+        for (const ImageViewId image_view_id : image.image_view_ids) {
+            slot_image_views.erase(image_view_id);
         }
-    }
-
-    if (delta < 0) {
-        cached_pages.add({pages_interval, delta});
-    }
+        slot_images.erase(image_id);
+    });
 }
 
 } // namespace VideoCore
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index aef33bcf..a5e3210d 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -21,31 +21,27 @@ struct BufferAttributeGroup;
 
 namespace VideoCore {
 
+class BufferCache;
+class PageManager;
+
 class TextureCache {
     // This is the page shift for adding images into the hash map. It isn't related to
     // the page size of the guest or the host and is chosen for convenience. A number too
     // small will increase the number of hash map lookups per image, while too large will
     // increase the number of images per page.
-    static constexpr u64 PageBits = 20;
+    static constexpr u64 PageBits = 22;
     static constexpr u64 PageMask = (1ULL << PageBits) - 1;
 
-    struct MetaDataInfo {
-        enum class Type {
-            CMask,
-            FMask,
-            HTile,
-        };
-
-        Type type;
-        bool is_cleared;
-    };
-
 public:
-    explicit TextureCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler);
+    explicit TextureCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
+                          BufferCache& buffer_cache, PageManager& tracker);
     ~TextureCache();
 
     /// Invalidates any image in the logical page range.
-    void OnCpuWrite(VAddr address);
+    void InvalidateMemory(VAddr address, size_t size);
+
+    /// Evicts any images that overlap the unmapped range.
+    void UnmapMemory(VAddr cpu_addr, size_t size);
 
     /// Retrieves the image handle of the image with the provided attributes.
     [[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true);
@@ -166,12 +162,14 @@ private:
     /// Stop tracking CPU reads and writes for image
     void UntrackImage(Image& image, ImageId image_id);
 
-    /// Increase/decrease the number of surface in pages touching the specified region
-    void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta);
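
The UpdatePagesCachedCount member removed above now lives behind PageManager (called as tracker.UpdatePagesCachedCount from TrackImage/UntrackImage). The essential idea is per-page reference counting: the first tracked resource touching a page write-protects it, and the last one leaving restores write access. A simplified, self-contained sketch of that counting; the real version works on boost::icl intervals and calls mprotect/VirtualProtect:

    #include <cstdint>
    #include <map>

    enum class Protection { ReadWrite, ReadOnly };

    void UpdatePagesCachedCount(std::map<std::uint64_t, int>& counts, std::uint64_t page_start,
                                std::uint64_t page_end, int delta,
                                void (*protect)(std::uint64_t page, Protection prot)) {
        for (std::uint64_t page = page_start; page < page_end; ++page) {
            const int old_count = counts[page];
            const int new_count = old_count + delta;
            counts[page] = new_count;
            if (old_count == 0 && new_count > 0) {
                protect(page, Protection::ReadOnly);  // first tracker: catch guest writes
            } else if (old_count > 0 && new_count == 0) {
                protect(page, Protection::ReadWrite); // last tracker gone: writes are free again
            }
        }
    }
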
+    /// Removes the image and any views/surface metas that reference it.
+    void DeleteImage(ImageId image_id);
 
 private:
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
+    BufferCache& buffer_cache;
+    PageManager& tracker;
     Vulkan::StreamBuffer staging;
     TileManager tile_manager;
     Common::SlotVector slot_images;
@@ -179,12 +177,18 @@ private:
     tsl::robin_map samplers;
     tsl::robin_pg_map> page_table;
     boost::icl::interval_map cached_pages;
-    tsl::robin_map surface_metas;
     std::mutex mutex;
-#ifdef _WIN64
-    void* veh_handle{};
-#endif
-    std::mutex m_page_table;
+
+    struct MetaDataInfo {
+        enum class Type {
+            CMask,
+            FMask,
+            HTile,
+        };
+        Type type;
+        bool is_cleared;
+    };
+    tsl::robin_map surface_metas;
 };
 
 } // namespace VideoCore
diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp
index 4f199f81..bb7ad22e 100644
--- a/src/video_core/texture_cache/tile_manager.cpp
+++ b/src/video_core/texture_cache/tile_manager.cpp
@@ -183,10 +183,12 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
     case vk::Format::eB8G8R8A8Srgb:
     case vk::Format::eB8G8R8A8Unorm:
     case vk::Format::eR8G8B8A8Unorm:
+    case vk::Format::eR8G8B8A8Uint:
     case vk::Format::eR32Sfloat:
     case vk::Format::eR32Uint:
     case vk::Format::eR16G16Sfloat:
         return vk::Format::eR32Uint;
+    case vk::Format::eBc1RgbaSrgbBlock:
     case vk::Format::eBc1RgbaUnormBlock:
     case vk::Format::eBc4UnormBlock:
     case vk::Format::eR32G32Sfloat:
@@ -200,6 +202,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
     case vk::Format::eBc5UnormBlock:
     case vk::Format::eBc7SrgbBlock:
     case vk::Format::eBc7UnormBlock:
+    case vk::Format::eR32G32B32A32Sfloat:
         return vk::Format::eR32G32B32A32Uint;
     default:
         break;
@@ -236,8 +239,11 @@ struct DetilerParams {
     u32 sizes[14];
 };
 
+static constexpr size_t StreamBufferSize = 128_MB;
+
 TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler)
-    : instance{instance}, scheduler{scheduler} {
+    : instance{instance}, scheduler{scheduler},
+      stream_buffer{instance, scheduler, MemoryUsage::Stream, StreamBufferSize} {
     static const std::array detiler_shaders{
         HostShaders::DETILE_M8X1_COMP,  HostShaders::DETILE_M8X2_COMP,
         HostShaders::DETILE_M32X1_COMP, HostShaders::DETILE_M32X2_COMP,
@@ -336,8 +342,7 @@ TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /*
         .flags = !is_storage ? VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT |
                                    VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT
                              : static_cast(0),
-        .usage = is_large_buffer ? VMA_MEMORY_USAGE_AUTO_PREFER_HOST
-                                 : VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
+        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
         .requiredFlags = !is_storage ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT : static_cast(0),
     };
@@ -373,37 +378,46 @@ std::optional TileManager::TryDetile(Image& image) {
     const auto* detiler = GetDetiler(image);
     if (!detiler) {
-        LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
-                  vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
+        if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled) {
+            LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
+                      vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
+        }
         return std::nullopt;
     }
 
     // Prepare input buffer
-    auto in_buffer = AllocBuffer(image.info.guest_size_bytes);
-    Upload(in_buffer, reinterpret_cast(image.info.guest_address),
-           image.info.guest_size_bytes);
+    const u32 image_size = image.info.guest_size_bytes;
+    const auto [in_buffer, in_offset] = [&] -> std::pair {
+        // Use stream buffer for smaller textures.
+        if (image_size <= StreamBufferSize) {
+            u32 offset = stream_buffer.Copy(image.info.guest_address, image_size);
+            return {stream_buffer.Handle(), offset};
+        }
+        // Request temporary host buffer for larger sizes.
+        auto in_buffer = AllocBuffer(image_size);
+        const auto addr = reinterpret_cast(image.info.guest_address);
+        Upload(in_buffer, addr, image_size);
+        scheduler.DeferOperation([=, this]() { FreeBuffer(in_buffer); });
+        return {in_buffer.first, 0};
+    }();
 
     // Prepare output buffer
-    auto out_buffer = AllocBuffer(image.info.guest_size_bytes, true);
-
-    scheduler.DeferOperation([=, this]() {
-        FreeBuffer(in_buffer);
-        FreeBuffer(out_buffer);
-    });
+    auto out_buffer = AllocBuffer(image_size, true);
+    scheduler.DeferOperation([=, this]() { FreeBuffer(out_buffer); });
 
     auto cmdbuf = scheduler.CommandBuffer();
     cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *detiler->pl);
 
     const vk::DescriptorBufferInfo input_buffer_info{
-        .buffer = in_buffer.first,
-        .offset = 0,
-        .range = image.info.guest_size_bytes,
+        .buffer = in_buffer,
+        .offset = in_offset,
+        .range = image_size,
     };
     const vk::DescriptorBufferInfo output_buffer_info{
         .buffer = out_buffer.first,
         .offset = 0,
-        .range = image.info.guest_size_bytes,
+        .range = image_size,
     };
 
     std::vector set_writes{
@@ -442,16 +456,16 @@ std::optional TileManager::TryDetile(Image& image) {
     cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u,
                          sizeof(params), &params);
 
-    ASSERT((image.info.guest_size_bytes % 64) == 0);
+    ASSERT((image_size % 64) == 0);
     const auto bpp = image.info.num_bits * (image.info.props.is_block ? 16u : 1u);
-    const auto num_tiles = image.info.guest_size_bytes / (64 * (bpp / 8));
+    const auto num_tiles = image_size / (64 * (bpp / 8));
     cmdbuf.dispatch(num_tiles, 1, 1);
 
     const vk::BufferMemoryBarrier post_barrier{
         .srcAccessMask = vk::AccessFlagBits::eShaderWrite,
         .dstAccessMask = vk::AccessFlagBits::eTransferRead,
         .buffer = out_buffer.first,
-        .size = image.info.guest_size_bytes,
+        .size = image_size,
     };
     cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
                            vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
diff --git a/src/video_core/texture_cache/tile_manager.h b/src/video_core/texture_cache/tile_manager.h
index 9102da08..542c6bac 100644
--- a/src/video_core/texture_cache/tile_manager.h
+++ b/src/video_core/texture_cache/tile_manager.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "common/types.h"
+#include "video_core/buffer_cache/buffer.h"
 #include "video_core/renderer_vulkan/vk_stream_buffer.h"
 #include "video_core/texture_cache/image.h"
 
@@ -34,7 +35,7 @@ struct DetilerContext {
 class TileManager {
 public:
-    using ScratchBuffer = std::pair;
+    using ScratchBuffer = std::pair;
 
     TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler);
     ~TileManager();
 
@@ -51,6 +52,7 @@ private:
 private:
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
+    StreamBuffer stream_buffer;
     std::array detilers;
 };
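
A final note on the DeferOperation calls used by the tile manager above: the switch from std::function to Common::UniqueFunction in vk_scheduler.h matters because std::function requires a copyable callable, so a lambda that captures move-only state cannot be deferred through it. Minimal illustration, using only the UniqueFunction wrapper introduced by this patch:

    #include <functional>
    #include <memory>
    #include "common/unique_function.h"

    void Example() {
        auto resource = std::make_unique<int>(42);
        // std::function<void()> f = [r = std::move(resource)] {}; // ill-formed: not copyable
        Common::UniqueFunction<void> g = [r = std::move(resource)] {
            // release or consume *r once the GPU has caught up
        };
        g();
    }
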