diff --git a/CMakeLists.txt b/CMakeLists.txt index 1237432d..181b39db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -530,6 +530,13 @@ endif() target_include_directories(shadps4 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +# Shaders sources +set(HOST_SHADERS_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/src/video_core/host_shaders) + +add_subdirectory(${HOST_SHADERS_INCLUDE}) +add_dependencies(shadps4 host_shaders) +target_include_directories(shadps4 PRIVATE ${HOST_SHADERS_INCLUDE}) + if (ENABLE_QT_GUI) set_target_properties(shadps4 PROPERTIES WIN32_EXECUTABLE ON diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 4e43544a..4df2709f 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -1462,7 +1462,7 @@ s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[ if (Config::dumpPM4()) { static auto last_frame_num = frames_submitted; static u32 seq_num{}; - if (last_frame_num == frames_submitted) { + if (last_frame_num && last_frame_num == frames_submitted) { ++seq_num; } else { last_frame_num = frames_submitted; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 2d645c9d..bbaac339 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -199,7 +199,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spantype3.count; - if (nop_offset == 0x0e) { + if (nop_offset == 0x0e || nop_offset == 0x0d) { ASSERT_MSG(payload[nop_offset] == 0xc0001000, "NOP hint is missing in CB setup sequence"); last_cb_extent[col_buf_id].raw = payload[nop_offset + 1]; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 56f695ca..b561b712 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -6,6 +6,7 @@ #include "common/assert.h" #include "common/bit_field.h" #include "common/types.h" +#include "resource.h" 
#include "video_core/amdgpu/pixel_format.h" #include @@ -622,7 +623,7 @@ struct Liverpool { BitField<19, 1, u32> cmask_is_linear; } info; union { - BitField<0, 5, u32> tile_mode_index; + BitField<0, 5, TilingMode> tile_mode_index; BitField<5, 5, u32> fmask_tile_mode_index; BitField<12, 3, u32> num_samples_log2; BitField<15, 3, u32> num_fragments_log2; @@ -661,6 +662,22 @@ struct Liverpool { return u64(cmask_base_address) << 8; } + [[nodiscard]] size_t GetSizeAligned() const { + const auto num_bytes_per_element = NumBits(info.format) / 8u; + const auto slice_size = (slice.tile_max + 1) * 64u; + const auto total_size = slice_size * (view.slice_max + 1) * num_bytes_per_element; + ASSERT(total_size > 0); + return total_size; + } + + [[nodiscard]] TilingMode GetTilingMode() const { + return attrib.tile_mode_index; + } + + [[nodiscard]] bool IsTiled() const { + return !info.linear_general; + } + NumberFormat NumFormat() const { // There is a small difference between T# and CB number types, account for it. return info.number_type == AmdGpu::NumberFormat::Uscaled ? 
AmdGpu::NumberFormat::Srgb @@ -834,7 +851,9 @@ private: static constexpr std::suspend_always final_suspend() noexcept { return {}; } - void unhandled_exception() {} + void unhandled_exception() { + UNREACHABLE(); + } void return_void() {} struct empty {}; std::suspend_always yield_value(empty&&) { diff --git a/src/video_core/amdgpu/pixel_format.cpp b/src/video_core/amdgpu/pixel_format.cpp index f963370d..5f6eb903 100644 --- a/src/video_core/amdgpu/pixel_format.cpp +++ b/src/video_core/amdgpu/pixel_format.cpp @@ -40,17 +40,30 @@ std::string_view NameOf(NumberFormat fmt) { } } -u32 NumComponents(DataFormat format) { - constexpr std::array numComponentsPerElement = { +int NumComponents(DataFormat format) { + constexpr std::array num_components_per_element = { 0, 1, 1, 2, 1, 2, 3, 3, 4, 4, 4, 2, 4, 3, 4, -1, 3, 4, 4, 4, 2, 2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, 3, 4, 4, 4, 1, 2, 3, 4, -1, -1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1, 1}; const u32 index = static_cast(format); - if (index >= numComponentsPerElement.size()) { + if (index >= num_components_per_element.size()) { return 0; } - return numComponentsPerElement[index]; + return num_components_per_element[index]; +} + +int NumBits(DataFormat format) { + const std::array num_bits_per_element = { + 0, 8, 16, 16, 32, 32, 32, 32, 32, 32, 32, 64, 64, 96, 128, -1, 16, 16, 16, 16, 32, + 32, 64, -1, -1, -1, -1, -1, -1, -1, -1, -1, 16, 16, 32, 4, 8, 8, 4, 8, 8, 8, + -1, -1, 8, 8, 8, 8, 8, 8, 16, 16, 32, 32, 32, 64, 64, 8, 16, 1, 1}; + + const u32 index = static_cast(format); + if (index >= num_bits_per_element.size()) { + return 0; + } + return num_bits_per_element[index]; } } // namespace AmdGpu diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index 7555cdb3..22d102af 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -63,7 +63,8 @@ enum class NumberFormat : u32 { [[nodiscard]] std::string_view NameOf(NumberFormat fmt); 
-u32 NumComponents(DataFormat format); +int NumComponents(DataFormat format); +int NumBits(DataFormat format); } // namespace AmdGpu diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index ef78c2f4..e9b7a553 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -85,6 +85,12 @@ constexpr std::string_view NameOf(ImageType type) { } } +enum class TilingMode : u32 { + Display_Linear = 0x8u, + Display_MacroTiled = 0xAu, + Texture_MicroTiled = 0xDu, +}; + struct Image { union { BitField<0, 38, u64> base_address; @@ -122,7 +128,7 @@ struct Image { } u32 Pitch() const { - return pitch; + return pitch + 1; } u32 NumLayers() const { @@ -140,6 +146,19 @@ struct Image { NumberFormat GetNumberFmt() const noexcept { return static_cast(num_format.Value()); } + + [[nodiscard]] TilingMode GetTilingMode() const { + return static_cast(tiling_index.Value()); + } + + [[nodiscard]] bool IsTiled() const { + return GetTilingMode() != TilingMode::Display_Linear; + } + + [[nodiscard]] size_t GetSizeAligned() const { + // TODO: Derive this properly from tiling params + return (width + 1) * (height + 1) * NumComponents(GetDataFmt()); + } }; // 8.2.7. 
Image Sampler [RDNA 2 Instruction Set Architecture] diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt new file mode 100644 index 00000000..42ff482a --- /dev/null +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright 2023 Citra Emulator Project +# SPDX-License-Identifier: GPL-2.0-or-later + +set(SHADER_FILES + detile_m8x1.comp + detile_m8x4.comp +) + +set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) +set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders) +set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE) + +set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in) +set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake) + +foreach(FILENAME IN ITEMS ${SHADER_FILES}) + string(REPLACE "." "_" SHADER_NAME ${FILENAME}) + set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}) + set(SOURCE_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h) + add_custom_command( + OUTPUT + ${SOURCE_HEADER_FILE} + COMMAND + ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${SOURCE_HEADER_FILE} ${INPUT_FILE} + MAIN_DEPENDENCY + ${SOURCE_FILE} + DEPENDS + ${INPUT_FILE} + # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified + ) + set(SHADER_HEADERS ${SHADER_HEADERS} ${SOURCE_HEADER_FILE}) +endforeach() + +set(SHADER_SOURCES ${SHADER_FILES}) +list(APPEND SHADER_SOURCES ${GLSL_INCLUDES}) + +add_custom_target(host_shaders + DEPENDS + ${SHADER_HEADERS} + SOURCES + ${SHADER_SOURCES} +) diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake new file mode 100644 index 00000000..9f752553 --- /dev/null +++ b/src/video_core/host_shaders/StringShaderHeader.cmake @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: 2020 yuzu Emulator Project +# SPDX-License-Identifier: GPL-2.0-or-later + +set(SOURCE_FILE ${CMAKE_ARGV3}) +set(HEADER_FILE ${CMAKE_ARGV4}) +set(INPUT_FILE 
${CMAKE_ARGV5}) + +get_filename_component(CONTENTS_NAME ${SOURCE_FILE} NAME) +string(REPLACE "." "_" CONTENTS_NAME ${CONTENTS_NAME}) +string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) + +FILE(READ ${SOURCE_FILE} line_contents) + +# Replace double quotes with single quotes, +# as double quotes will be used to wrap the lines +STRING(REGEX REPLACE "\"" "'" line_contents "${line_contents}") + +# CMake separates list elements with semicolons, but semicolons +# are used extensively in the shader code. +# Replace with a temporary marker, to be reverted later. +STRING(REGEX REPLACE ";" "{{SEMICOLON}}" line_contents "${line_contents}") + +# Make every line an individual element in the CMake list. +STRING(REGEX REPLACE "\n" ";" line_contents "${line_contents}") + +# Build the shader string, wrapping each line in double quotes. +foreach(line IN LISTS line_contents) + string(CONCAT CONTENTS "${CONTENTS}" \"${line}\\n\"\n) +endforeach() + +# Revert the original semicolons in the source. +STRING(REGEX REPLACE "{{SEMICOLON}}" ";" CONTENTS "${CONTENTS}") + +get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) +make_directory(${OUTPUT_DIR}) +configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY) diff --git a/src/video_core/host_shaders/detile_m8x1.comp b/src/video_core/host_shaders/detile_m8x1.comp new file mode 100644 index 00000000..b4d920e6 --- /dev/null +++ b/src/video_core/host_shaders/detile_m8x1.comp @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 +#extension GL_KHR_shader_subgroup_shuffle : require + +// NOTE: Current subgroup utilization is suboptimal on most GPUs, so +// it will be nice to process two tiles at once here. 
+layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in; + +layout(std430, binding = 0) buffer input_buf { + uint in_data[]; +}; +layout(r8ui, binding = 1) uniform writeonly uimage2D output_img; + +layout(push_constant) uniform image_info { + uint pitch; +} info; + +#define MICRO_TILE_DIM 8 +#define TEXELS_PER_ELEMENT 4 + +void main() { + uint p0 = in_data[gl_GlobalInvocationID.x]; + uint p1 = subgroupShuffleXor(p0, 1); + uint hword = gl_LocalInvocationID.x & 1; + uint dst_tx = (hword == 1) + ? (p0 & 0xffff0000) | (p1 >> 16) + : (p0 & 0x0000ffff) | (p1 << 16); + + uint col = (gl_LocalInvocationID.x >> 2) & 1; + uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT) + + TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3); + + uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM) + uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; + uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; + uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col; + uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; + + ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y); + + #pragma unroll + for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) { + imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff)); + dst_tx >>= 8; + } +} \ No newline at end of file diff --git a/src/video_core/host_shaders/detile_m8x4.comp b/src/video_core/host_shaders/detile_m8x4.comp new file mode 100644 index 00000000..25f7fef6 --- /dev/null +++ b/src/video_core/host_shaders/detile_m8x4.comp @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 + +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +layout(std430, binding = 0) buffer input_buf { + uint in_data[]; +}; +layout(rgba8ui, binding = 1) uniform writeonly uimage2D output_img; + +layout(push_constant) uniform image_info { + uint pitch; +} info; + +// Inverse morton LUT, small enough to fit 
into K$ +uint lut_8x4[16] = { + 0x11011000, 0x31213020, + 0x13031202, 0x33233222, + 0x51415040, 0x71617060, + 0x53435242, 0x73637262, + + 0x15051404, 0x35253424, + 0x17071606, 0x37273626, + 0x55455444, 0x75657464, + 0x57475646, 0x77677666, +}; + +#define MICRO_TILE_DIM 8 +#define TEXELS_PER_ELEMENT 1 + +void main() { + uint src_tx = in_data[gl_GlobalInvocationID.x]; + + uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); + uint packed_pos = lut_8x4[gl_LocalInvocationID.x >> 2] >> bit_ofs; + uint col = bitfieldExtract(packed_pos, 4, 4); + uint row = bitfieldExtract(packed_pos, 0, 4); + + uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM) + uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; + uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; + + uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col; + uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; + + ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y); + + uvec4 dst_tx = uvec4( + bitfieldExtract(src_tx, 0, 8), + bitfieldExtract(src_tx, 8, 8), + bitfieldExtract(src_tx, 16, 8), + bitfieldExtract(src_tx, 24, 8) + ); + imageStore(output_img, img_pos, dst_tx); +} \ No newline at end of file diff --git a/src/video_core/host_shaders/source_shader.h.in b/src/video_core/host_shaders/source_shader.h.in new file mode 100644 index 00000000..43bf5b0c --- /dev/null +++ b/src/video_core/host_shaders/source_shader.h.in @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright 2022 Citra Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +namespace HostShaders { + +constexpr std::string_view @CONTENTS_NAME@ = { +@CONTENTS@ +}; + +} // namespace HostShaders diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 17353715..dc10ec6c 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -150,7 +150,6 @@ bool Instance::CreateDevice() { 
external_memory_host = add_extension(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME); tooling_info = add_extension(VK_EXT_TOOLING_INFO_EXTENSION_NAME); custom_border_color = add_extension(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); - index_type_uint8 = add_extension(VK_KHR_INDEX_TYPE_UINT8_EXTENSION_NAME); add_extension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); add_extension(VK_KHR_MAINTENANCE_4_EXTENSION_NAME); // The next two extensions are required to be available together in order to support write masks @@ -219,9 +218,6 @@ bool Instance::CreateDevice() { .customBorderColors = true, .customBorderColorWithoutFormat = true, }, - vk::PhysicalDeviceIndexTypeUint8FeaturesEXT{ - .indexTypeUint8 = true, - }, vk::PhysicalDeviceColorWriteEnableFeaturesEXT{ .colorWriteEnable = true, }, @@ -230,10 +226,6 @@ bool Instance::CreateDevice() { }, }; - if (!index_type_uint8) { - device_chain.unlink(); - } - if (!color_write_en) { device_chain.unlink(); device_chain.unlink(); diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index e4ee9aa4..28af5405 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -81,11 +81,6 @@ public: return custom_border_color; } - /// Returns true when VK_EXT_index_type_uint8 is supported - bool IsIndexTypeUint8Supported() const { - return index_type_uint8; - } - /// Returns true when VK_EXT_fragment_shader_interlock is supported bool IsFragmentShaderInterlockSupported() const { return fragment_shader_interlock; @@ -216,7 +211,6 @@ private: bool image_view_reinterpretation{true}; bool timeline_semaphores{}; bool custom_border_color{}; - bool index_type_uint8{}; bool fragment_shader_interlock{}; bool image_format_list{}; bool pipeline_creation_cache_control{}; diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 3334c4a7..7aa3062b 100644 --- a/src/video_core/texture_cache/image.cpp +++ 
b/src/video_core/texture_cache/image.cpp @@ -7,6 +7,7 @@ #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/texture_cache/image.h" +#include "video_core/texture_cache/tile_manager.h" #include @@ -16,7 +17,7 @@ using namespace Vulkan; using VideoOutFormat = Libraries::VideoOut::PixelFormat; using Libraries::VideoOut::TilingMode; -[[nodiscard]] vk::Format ConvertPixelFormat(const VideoOutFormat format) { +static vk::Format ConvertPixelFormat(const VideoOutFormat format) { switch (format) { case VideoOutFormat::A8R8G8B8Srgb: return vk::Format::eB8G8R8A8Srgb; @@ -32,7 +33,7 @@ using Libraries::VideoOut::TilingMode; return {}; } -[[nodiscard]] vk::ImageUsageFlags ImageUsageFlags(const vk::Format format) { +static vk::ImageUsageFlags ImageUsageFlags(const vk::Format format) { vk::ImageUsageFlags usage = vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; @@ -46,7 +47,7 @@ using Libraries::VideoOut::TilingMode; return usage; } -[[nodiscard]] vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept { +static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept { switch (type) { case AmdGpu::ImageType::Color1D: return vk::ImageType::e1D; @@ -86,18 +87,19 @@ ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group) noe ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, const AmdGpu::Liverpool::CbDbExtent& hint /*= {}*/) noexcept { - is_tiled = true; + is_tiled = buffer.IsTiled(); pixel_format = LiverpoolToVK::SurfaceFormat(buffer.info.format, buffer.NumFormat()); type = vk::ImageType::e2D; size.width = hint.Valid() ? hint.width : buffer.Pitch(); size.height = hint.Valid() ? 
hint.height : buffer.Height(); size.depth = 1; pitch = size.width; - guest_size_bytes = buffer.slice.tile_max * (buffer.view.slice_max + 1); + guest_size_bytes = buffer.GetSizeAligned(); } ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { - is_tiled = false; + is_tiled = image.IsTiled(); + tiling_mode = image.GetTilingMode(); pixel_format = LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); type = ConvertImageType(image.type); size.width = image.width + 1; @@ -106,8 +108,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { pitch = image.Pitch(); resources.levels = image.NumLevels(); resources.layers = image.NumLayers(); - // TODO: Derive this properly from tiling params - guest_size_bytes = size.width * size.height * 4; + guest_size_bytes = image.GetSizeAligned(); } UniqueImage::UniqueImage(vk::Device device_, VmaAllocator allocator_) @@ -151,6 +152,18 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, if (info.type == vk::ImageType::e3D) { flags |= vk::ImageCreateFlagBits::e2DArrayCompatible; } + if (info.is_tiled) { + flags |= vk::ImageCreateFlagBits::eExtendedUsage; + if (false) { // IsBlockCodedFormat() + flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible; + } + } + + info.usage = ImageUsageFlags(info.pixel_format); + if (info.is_tiled || info.is_storage) { + info.usage |= vk::ImageUsageFlagBits::eStorage; + } + const vk::ImageCreateInfo image_ci = { .flags = flags, .imageType = info.type, @@ -163,12 +176,20 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, .mipLevels = static_cast(info.resources.levels), .arrayLayers = static_cast(info.resources.layers), .tiling = vk::ImageTiling::eOptimal, - .usage = ImageUsageFlags(info.pixel_format), + .usage = info.usage, .initialLayout = vk::ImageLayout::eUndefined, }; image.Create(image_ci); + // Create a special view for detiler + if (info.is_tiled) { + ImageViewInfo view_info; + view_info.format = 
DemoteImageFormatForDetiling(info.pixel_format); + view_info.used_for_detiling = true; + view_for_detiler.emplace(*instance, view_info, image); + } + Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eNone); } diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index cc3adff4..c357f8a2 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -12,6 +12,8 @@ #include "video_core/texture_cache/image_view.h" #include "video_core/texture_cache/types.h" +#include + namespace Vulkan { class Instance; class Scheduler; @@ -39,12 +41,15 @@ struct ImageInfo { explicit ImageInfo(const AmdGpu::Image& image) noexcept; bool is_tiled = false; + bool is_storage = false; vk::Format pixel_format = vk::Format::eUndefined; vk::ImageType type = vk::ImageType::e1D; + vk::ImageUsageFlags usage; SubresourceExtent resources; Extent3D size{1, 1, 1}; u32 pitch = 0; u32 guest_size_bytes = 0; + AmdGpu::TilingMode tiling_mode{AmdGpu::TilingMode::Display_Linear}; }; struct UniqueImage { @@ -114,6 +119,7 @@ struct Image { VAddr cpu_addr_end = 0; std::vector image_view_infos; std::vector image_view_ids; + std::optional view_for_detiler; // Resource state tracking vk::Flags pl_stage = vk::PipelineStageFlagBits::eAllCommands; diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 353e4e7f..919415e8 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -58,10 +58,16 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image) noexcept { mapping.a = ConvertComponentSwizzle(image.dst_sel_w); } -ImageView::ImageView(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, - const ImageViewInfo& info_, vk::Image image) +ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info_, vk::Image image, + std::optional usage_override /*= {}*/) : info{info_} { + vk::ImageViewUsageCreateInfo 
usage_ci{}; + if (usage_override) { + usage_ci.usage = usage_override.value(); + } + const vk::ImageViewCreateInfo image_view_ci = { + .pNext = usage_override.has_value() ? &usage_ci : nullptr, .image = image, .viewType = info.type, .format = info.format, diff --git a/src/video_core/texture_cache/image_view.h b/src/video_core/texture_cache/image_view.h index 7f98e8ec..aa4ec8ee 100644 --- a/src/video_core/texture_cache/image_view.h +++ b/src/video_core/texture_cache/image_view.h @@ -7,6 +7,8 @@ #include "video_core/renderer_vulkan/vk_common.h" #include "video_core/texture_cache/types.h" +#include + namespace Vulkan { class Instance; class Scheduler; @@ -22,13 +24,14 @@ struct ImageViewInfo { vk::Format format = vk::Format::eR8G8B8A8Unorm; SubresourceRange range; vk::ComponentMapping mapping{}; + bool used_for_detiling = false; auto operator<=>(const ImageViewInfo&) const = default; }; struct ImageView { - explicit ImageView(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, - const ImageViewInfo& info, vk::Image image); + explicit ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info, vk::Image image, + std::optional usage_override = {}); ~ImageView(); ImageView(const ImageView&) = delete; diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 3e2a7dea..d3c6b678 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -5,6 +5,7 @@ #include "common/assert.h" #include "common/config.h" #include "core/virtual_memory.h" +#include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/tile_manager.h" @@ -64,7 +65,8 @@ static constexpr u64 PageShift = 12; TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_) : instance{instance_}, scheduler{scheduler_}, 
staging{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, StreamBufferSize, - Vulkan::BufferType::Upload} { + Vulkan::BufferType::Upload}, + tile_manager{instance, scheduler} { #ifndef _WIN64 sigset_t signal_mask; @@ -91,7 +93,7 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& ASSERT(null_id.index == 0); ImageViewInfo view_info; - void(slot_image_views.insert(instance, scheduler, view_info, slot_images[null_id].image)); + void(slot_image_views.insert(instance, view_info, slot_images[null_id].image)); } TextureCache::~TextureCache() { @@ -138,21 +140,33 @@ Image& TextureCache::FindImage(const ImageInfo& info, VAddr cpu_address) { return image; } -ImageView& TextureCache::FindImageView(const AmdGpu::Image& desc) { - Image& image = FindImage(ImageInfo{desc}, desc.Address()); - - const ImageViewInfo view_info{desc}; +ImageView& TextureCache::RegisterImageView(Image& image, const ImageViewInfo& view_info) { if (const ImageViewId view_id = image.FindView(view_info); view_id) { return slot_image_views[view_id]; } + // All tiled images are created with storage usage flag. This makes a set of formats (e.g. sRGB) + // impossible to use. However, during view creation, if an image isn't used as storage and not a + // target for the detiler, we can temporarily remove its storage bit. 
+ std::optional usage_override; + if (!image.info.is_storage && !view_info.used_for_detiling) { + usage_override = image.info.usage & ~vk::ImageUsageFlagBits::eStorage; + } + const ImageViewId view_id = - slot_image_views.insert(instance, scheduler, view_info, image.image); + slot_image_views.insert(instance, view_info, image.image, usage_override); image.image_view_infos.emplace_back(view_info); image.image_view_ids.emplace_back(view_id); return slot_image_views[view_id]; } +ImageView& TextureCache::FindImageView(const AmdGpu::Image& desc) { + Image& image = FindImage(ImageInfo{desc}, desc.Address()); + + const ImageViewInfo view_info{desc}; + return RegisterImageView(image, view_info); +} + ImageView& TextureCache::RenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer, const AmdGpu::Liverpool::CbDbExtent& hint) { const ImageInfo info{buffer, hint}; @@ -160,15 +174,7 @@ ImageView& TextureCache::RenderTarget(const AmdGpu::Liverpool::ColorBuffer& buff ImageViewInfo view_info; view_info.format = info.pixel_format; - if (const ImageViewId view_id = image.FindView(view_info); view_id) { - return slot_image_views[view_id]; - } - - const ImageViewId view_id = - slot_image_views.insert(instance, scheduler, view_info, image.image); - image.image_view_infos.emplace_back(view_info); - image.image_view_ids.emplace_back(view_id); - return slot_image_views[view_id]; + return RegisterImageView(image, view_info); } void TextureCache::RefreshImage(Image& image) { @@ -176,52 +182,48 @@ void TextureCache::RefreshImage(Image& image) { image.flags &= ~ImageFlagBits::CpuModified; { - - // Upload data to the staging buffer. 
- const auto [data, offset, _] = staging.Map(image.info.guest_size_bytes, 4); - const u8* image_data = reinterpret_cast(image.cpu_addr); - if (image.info.is_tiled) { - ConvertTileToLinear(data, image_data, image.info.size.width, image.info.size.height, - Config::isNeoMode()); - } else { + if (!tile_manager.TryDetile(image)) { + // Upload data to the staging buffer. + const auto& [data, offset, _] = staging.Map(image.info.guest_size_bytes, 4); + const u8* image_data = reinterpret_cast(image.cpu_addr); std::memcpy(data, image_data, image.info.guest_size_bytes); + staging.Commit(image.info.guest_size_bytes); + + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); + + // Copy to the image. + const vk::BufferImageCopy image_copy = { + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource{ + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {image.info.size.width, image.info.size.height, 1}, + }; + + cmdbuf.copyBufferToImage(staging.Handle(), image.image, + vk::ImageLayout::eTransferDstOptimal, image_copy); } - staging.Commit(image.info.guest_size_bytes); - - // Copy to the image. 
- const vk::BufferImageCopy image_copy = { - .bufferOffset = offset, - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource{ - .aspectMask = vk::ImageAspectFlagBits::eColor, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = 1, - }, - .imageOffset = {0, 0, 0}, - .imageExtent = {image.info.size.width, image.info.size.height, 1}, - }; - - const auto cmdbuf = scheduler.CommandBuffer(); - const vk::ImageSubresourceRange range = { - .aspectMask = vk::ImageAspectFlagBits::eColor, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }; - - image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); - - cmdbuf.copyBufferToImage(staging.Handle(), image.image, - vk::ImageLayout::eTransferDstOptimal, image_copy); image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); return; } + const vk::ImageSubresourceRange range = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }; + const u8* image_data = reinterpret_cast(image.cpu_addr); for (u32 l = 0; l < image.info.resources.layers; l++) { // Upload data to the staging buffer. diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 94c49929..a4dbff73 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -13,6 +13,7 @@ #include "video_core/texture_cache/image_view.h" #include "video_core/texture_cache/sampler.h" #include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/tile_manager.h" namespace Core::Libraries::VideoOut { struct BufferAttributeGroup; @@ -36,22 +37,24 @@ public: void OnCpuWrite(VAddr address); /// Retrieves the image handle of the image with the provided attributes and address. 
- Image& FindImage(const ImageInfo& info, VAddr cpu_address); + [[nodiscard]] Image& FindImage(const ImageInfo& info, VAddr cpu_address); /// Retrieves an image view with the properties of the specified image descriptor. - ImageView& FindImageView(const AmdGpu::Image& image); + [[nodiscard]] ImageView& FindImageView(const AmdGpu::Image& image); /// Retrieves the render target with specified properties - ImageView& RenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer, - const AmdGpu::Liverpool::CbDbExtent& hint); + [[nodiscard]] ImageView& RenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer, + const AmdGpu::Liverpool::CbDbExtent& hint); /// Reuploads image contents. void RefreshImage(Image& image); /// Retrieves the sampler that matches the provided S# descriptor. - vk::Sampler GetSampler(const AmdGpu::Sampler& sampler); + [[nodiscard]] vk::Sampler GetSampler(const AmdGpu::Sampler& sampler); private: + ImageView& RegisterImageView(Image& image, const ImageViewInfo& view_info); + /// Iterate over all page indices in a range template static void ForEachPage(PAddr addr, size_t size, Func&& func) { @@ -128,6 +131,7 @@ private: const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; Vulkan::StreamBuffer staging; + TileManager tile_manager; SlotVector slot_images; SlotVector slot_image_views; tsl::robin_map samplers; diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index 7d961921..54cbc5da 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -1,10 +1,20 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include "common/assert.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_shader_util.h" +#include "video_core/texture_cache/image_view.h" +#include 
"video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/tile_manager.h" +#include "video_core/host_shaders/detile_m8x1_comp.h" +#include "video_core/host_shaders/detile_m8x4_comp.h" + +#include +#include +#include + namespace VideoCore { static u32 IntLog2(u32 i) { @@ -162,4 +172,188 @@ void ConvertTileToLinear(u8* dst, const u8* src, u32 width, u32 height, bool is_ } } +vk::Format DemoteImageFormatForDetiling(vk::Format format) { + switch (format) { + case vk::Format::eB8G8R8A8Srgb: + case vk::Format::eR8G8B8A8Unorm: + return vk::Format::eR8G8B8A8Uint; + case vk::Format::eR8Unorm: + return vk::Format::eR8Uint; + default: + LOG_ERROR(Render_Vulkan, "Unexpected format for demotion {}", vk::to_string(format)); + break; + } + return format; +} + +const DetilerContext* TileManager::GetDetiler(const Image& image) const { + const auto format = DemoteImageFormatForDetiling(image.info.pixel_format); + + if (image.info.tiling_mode == AmdGpu::TilingMode::Texture_MicroTiled) { + switch (format) { + case vk::Format::eR8Uint: + return &detilers[DetilerType::Micro8x1]; + case vk::Format::eR8G8B8A8Uint: + return &detilers[DetilerType::Micro8x4]; + default: + return nullptr; + } + } + return nullptr; +} + +static constexpr vk::BufferUsageFlags StagingFlags = vk::BufferUsageFlagBits::eTransferDst | + vk::BufferUsageFlagBits::eUniformBuffer | + vk::BufferUsageFlagBits::eStorageBuffer; + +TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler) + : instance{instance}, scheduler{scheduler}, staging{instance, scheduler, StagingFlags, 64_MB} { + + static const std::array detiler_shaders{ + HostShaders::DETILE_M8X1_COMP, + HostShaders::DETILE_M8X4_COMP, + }; + + for (int pl_id = 0; pl_id < DetilerType::Max; ++pl_id) { + auto& ctx = detilers[pl_id]; + + const auto& module = Vulkan::Compile( + detiler_shaders[pl_id], vk::ShaderStageFlagBits::eCompute, instance.GetDevice()); + + // Set module debug name + auto module_name = 
magic_enum::enum_name(static_cast(pl_id)); + const vk::DebugUtilsObjectNameInfoEXT name_info = { + .objectType = vk::ObjectType::eShaderModule, + .objectHandle = std::bit_cast(module), + .pObjectName = module_name.data(), + }; + instance.GetDevice().setDebugUtilsObjectNameEXT(name_info); + + const vk::PipelineShaderStageCreateInfo shader_ci = { + .stage = vk::ShaderStageFlagBits::eCompute, + .module = module, + .pName = "main", + }; + + boost::container::static_vector bindings{ + { + .binding = 0, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }, + { + .binding = 1, + .descriptorType = vk::DescriptorType::eStorageImage, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }, + }; + + const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { + .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, + .bindingCount = static_cast(bindings.size()), + .pBindings = bindings.data(), + }; + static auto desc_layout = + instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci); + + const vk::PushConstantRange push_constants = { + .stageFlags = vk::ShaderStageFlagBits::eCompute, + .offset = 0, + .size = sizeof(u32), + }; + + const vk::DescriptorSetLayout set_layout = *desc_layout; + const vk::PipelineLayoutCreateInfo layout_info = { + .setLayoutCount = 1U, + .pSetLayouts = &set_layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &push_constants, + }; + ctx.pl_layout = instance.GetDevice().createPipelineLayoutUnique(layout_info); + + const vk::ComputePipelineCreateInfo compute_pipeline_ci = { + .stage = shader_ci, + .layout = *ctx.pl_layout, + }; + auto result = instance.GetDevice().createComputePipelineUnique( + /*pipeline_cache*/ {}, compute_pipeline_ci); + if (result.result == vk::Result::eSuccess) { + ctx.pl = std::move(result.value); + } else { + UNREACHABLE_MSG("Detiler pipeline creation failed!"); + } + + // Once pipeline is 
compiled, we don't need the shader module anymore + instance.GetDevice().destroyShaderModule(module); + } +} + +TileManager::~TileManager() = default; + +bool TileManager::TryDetile(Image& image) { + if (!image.info.is_tiled) { + return false; + } + + const auto* detiler = GetDetiler(image); + if (!detiler) { + LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} {}", + vk::to_string(image.info.pixel_format), static_cast(image.info.tiling_mode)); + return false; + } + + const auto& [data, offset, _] = staging.Map(image.info.guest_size_bytes, 4); + const u8* image_data = reinterpret_cast(image.cpu_addr); + std::memcpy(data, image_data, image.info.guest_size_bytes); + staging.Commit(image.info.guest_size_bytes); + + auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *detiler->pl); + + image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderWrite); + + const vk::DescriptorBufferInfo input_buffer_info{ + .buffer = staging.Handle(), + .offset = offset, + .range = image.info.guest_size_bytes, + }; + + ASSERT(image.view_for_detiler.has_value()); + const vk::DescriptorImageInfo output_image_info{ + .imageView = *image.view_for_detiler->image_view, + .imageLayout = image.layout, + }; + + std::vector set_writes{ + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &input_buffer_info, + }, + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageImage, + .pImageInfo = &output_image_info, + }, + }; + cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *detiler->pl_layout, 0, + set_writes); + + cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u, + sizeof(image.info.pitch), &image.info.pitch); + + cmdbuf.dispatch((image.info.size.width * image.info.size.height) / 64, 1, + 1); // 
round to 64 + + return true; +} + } // namespace VideoCore diff --git a/src/video_core/texture_cache/tile_manager.h b/src/video_core/texture_cache/tile_manager.h index 7903114e..c630004c 100644 --- a/src/video_core/texture_cache/tile_manager.h +++ b/src/video_core/texture_cache/tile_manager.h @@ -4,10 +4,46 @@ #pragma once #include "common/types.h" +#include "video_core/renderer_vulkan/vk_stream_buffer.h" +#include "video_core/texture_cache/image.h" namespace VideoCore { +class TextureCache; + /// Converts tiled texture data to linear format. void ConvertTileToLinear(u8* dst, const u8* src, u32 width, u32 height, bool neo); +/// Converts image format to the one used internally by detiler. +vk::Format DemoteImageFormatForDetiling(vk::Format format); + +enum DetilerType : u32 { + Micro8x1, + Micro8x4, + + Max +}; + +struct DetilerContext { + vk::UniquePipeline pl; + vk::UniquePipelineLayout pl_layout; +}; + +class TileManager { +public: + TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler); + ~TileManager(); + + bool TryDetile(Image& image); + +private: + const DetilerContext* GetDetiler(const Image& image) const; + +private: + const Vulkan::Instance& instance; + Vulkan::Scheduler& scheduler; + Vulkan::StreamBuffer staging; + std::array detilers; +}; + } // namespace VideoCore