diff --git a/src/video_core/host_shaders/detile_m8x1.comp b/src/video_core/host_shaders/detile_m8x1.comp new file mode 100644 index 00000000..1b84b402 --- /dev/null +++ b/src/video_core/host_shaders/detile_m8x1.comp @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 +#extension GL_KHR_shader_subgroup_shuffle : require + +// NOTE: Current subgroup utilization is suboptimal on most GPUs, so +// it will be nice to process two tiles at once here. +layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in; + +layout(std430, binding = 0) buffer input_buf { + uint in_data[]; +}; +layout(r8ui, binding = 1) uniform writeonly uimage2D output_img; + +layout(push_constant) uniform image_info { + uint pitch; +} info; + +#define MICRO_TILE_DIM 8 +#define TEXELS_PER_ELEMENT 4 + +void main() { + uint p0 = in_data[gl_GlobalInvocationID.x]; + uint p1 = subgroupShuffleXor(p0, 1); + uint hword = gl_LocalInvocationID.x & 1; + uint dst_tx = (hword == 1) + ? 
(p0 & 0xffff0000) | (p1 >> 16) + : (p0 & 0x0000ffff) | (p1 << 16); + + uint col = (gl_LocalInvocationID.x >> 2) & 1; + uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT) + + TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3); + + uint tiles_per_pitch = info.pitch / MICRO_TILE_DIM; + uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; + uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; + uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col; + uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; + + ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y); + + #pragma unroll + for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) { + imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff)); + dst_tx >>= 8; + } +} \ No newline at end of file diff --git a/src/video_core/host_shaders/detile_m8x4.comp b/src/video_core/host_shaders/detile_m8x4.comp new file mode 100644 index 00000000..97438fe9 --- /dev/null +++ b/src/video_core/host_shaders/detile_m8x4.comp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 +#extension GL_KHR_shader_subgroup_shuffle : require + +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +layout(std430, binding = 0) buffer input_buf { + uint in_data[]; +}; +layout(rgba8ui, binding = 1) uniform writeonly uimage2D output_img; + +layout(push_constant) uniform image_info { + uint pitch; +} info; + +// Inverse morton LUT, small enough to fit into K$ +uint lut_8x4[16] = { + 0x11011000, 0x31213020, + 0x13031202, 0x33233222, + 0x51415040, 0x71617060, + 0x53435242, 0x73637262, + + 0x15051404, 0x35253424, + 0x17071606, 0x37273626, + 0x55455444, 0x75657464, + 0x57475646, 0x77677666, +}; + +#define MICRO_TILE_DIM 8 +#define TEXELS_PER_ELEMENT 1 + +void main() { + uint src_tx = in_data[gl_GlobalInvocationID.x]; + + uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); + uint packed_pos = lut_8x4[gl_LocalInvocationID.x 
>> 2] >> bit_ofs; + uint col = bitfieldExtract(packed_pos, 4, 4); + uint row = bitfieldExtract(packed_pos, 0, 4); + + uint tiles_per_pitch = info.pitch / MICRO_TILE_DIM; + uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; + uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; + + uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col; + uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; + + ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y); + + uvec4 dst_tx = uvec4( + bitfieldExtract(src_tx, 0, 8), + bitfieldExtract(src_tx, 8, 8), + bitfieldExtract(src_tx, 16, 8), + bitfieldExtract(src_tx, 24, 8) + ); + imageStore(output_img, img_pos, dst_tx); +} \ No newline at end of file diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index d33427db..e36c40f4 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -4,10 +4,14 @@ #include "boost/container/static_vector.hpp" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/texture_cache/image_view.h" #include "video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/tile_manager.h" +#include "video_core/host_shaders/detile_m8x1_comp.h" +#include "video_core/host_shaders/detile_m8x4_comp.h" + #include namespace VideoCore { @@ -201,26 +205,20 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc TextureCache& texture_cache, Vulkan::StreamBuffer& staging) : instance{instance}, scheduler{scheduler}, texture_cache{texture_cache}, staging{staging} { + static const std::array detiler_shaders{ + HostShaders::DETILE_M8X1_COMP, + HostShaders::DETILE_M8X4_COMP, + }; + for (int pl_id = 0; pl_id < DetilerType::Max; ++pl_id) { auto& ctx = detilers[pl_id]; - const std::vector shader_code{}; - - const vk::ShaderModuleCreateInfo 
shader_info = { - .codeSize = shader_code.size(), - .pCode = shader_code.data(), - }; - - vk::UniqueShaderModule module; - try { - module = instance.GetDevice().createShaderModuleUnique(shader_info); - } catch (vk::SystemError& err) { - UNREACHABLE_MSG("{}", err.what()); - } + const auto& module = Vulkan::Compile( + detiler_shaders[pl_id], vk::ShaderStageFlagBits::eCompute, instance.GetDevice()); const vk::PipelineShaderStageCreateInfo shader_ci = { .stage = vk::ShaderStageFlagBits::eCompute, - .module = *module, + .module = module, .pName = "main", };