texture_cache: detiler: m8x1 and m8x4 shaders
This commit is contained in:
parent
184b7b7fc2
commit
440a60a43b
|
@ -0,0 +1,48 @@
|
||||||
|
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
#version 450
|
||||||
|
#extension GL_KHR_shader_subgroup_shuffle : require
|
||||||
|
|
||||||
|
// NOTE: Current subgroup utilization is suboptimal on most GPUs, so
|
||||||
|
// it would be nice to process two tiles at once here.
|
||||||
|
layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout(std430, binding = 0) buffer input_buf {
|
||||||
|
uint in_data[];
|
||||||
|
};
|
||||||
|
layout(r8ui, binding = 1) uniform writeonly uimage2D output_img;
|
||||||
|
|
||||||
|
layout(push_constant) uniform image_info {
|
||||||
|
uint pitch;
|
||||||
|
} info;
|
||||||
|
|
||||||
|
#define MICRO_TILE_DIM 8
|
||||||
|
#define TEXELS_PER_ELEMENT 4
|
||||||
|
|
||||||
|
// Entry point: each of the 16 invocations in a workgroup reads one 32-bit word
// from in_data and writes four 8-bit texels to output_img, so a workgroup
// covers one MICRO_TILE_DIM x MICRO_TILE_DIM (8x8) tile of the image.
void main() {
|
||||||
|
// One 32-bit word (four packed 8-bit texels) per invocation.
uint p0 = in_data[gl_GlobalInvocationID.x];
|
||||||
|
// Fetch the word held by the neighbouring invocation (subgroup lane id XOR 1).
uint p1 = subgroupShuffleXor(p0, 1);
|
||||||
|
// Parity of the invocation within the pair: 0 for even, 1 for odd.
uint hword = gl_LocalInvocationID.x & 1;
|
||||||
|
// Odd invocations combine the upper 16 bits of both words (neighbour's half
// goes to the low bits); even invocations combine the lower 16 bits of both
// words (neighbour's half goes to the high bits).
uint dst_tx = (hword == 1)
|
||||||
|
? (p0 & 0xffff0000) | (p1 >> 16)
|
||||||
|
: (p0 & 0x0000ffff) | (p1 << 16);
|
||||||
|
|
||||||
|
// Bit 2 of the local id selects the left (0) or right (1) 4-texel half of the tile row.
uint col = (gl_LocalInvocationID.x >> 2) & 1;
|
||||||
|
// Low two bits of the local id pick a row within a group of four; bit 3
// selects the upper or lower group, giving a row in the range 0..7.
uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT)
|
||||||
|
+ TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3);
|
||||||
|
|
||||||
|
// Number of tiles that fit in one row of `pitch` texels.
uint tiles_per_pitch = info.pitch / MICRO_TILE_DIM;
|
||||||
|
// The linear workgroup index is mapped to a 2D tile coordinate.
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
|
||||||
|
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
|
||||||
|
// Texel coordinates of the first of the four texels written by this invocation.
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
|
||||||
|
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
|
||||||
|
|
||||||
|
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
// Store the four bytes of dst_tx, lowest byte first, at consecutive x positions.
for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
|
||||||
|
imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff));
|
||||||
|
// Shift the next byte into the low 8 bits.
dst_tx >>= 8;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
#version 450
|
||||||
|
#extension GL_KHR_shader_subgroup_shuffle : require
|
||||||
|
|
||||||
|
layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout(std430, binding = 0) buffer input_buf {
|
||||||
|
uint in_data[];
|
||||||
|
};
|
||||||
|
layout(rgba8ui, binding = 1) uniform writeonly uimage2D output_img;
|
||||||
|
|
||||||
|
layout(push_constant) uniform image_info {
|
||||||
|
uint pitch;
|
||||||
|
} info;
|
||||||
|
|
||||||
|
// Inverse morton LUT, small enough to fit into K$
|
||||||
|
// Each 32-bit entry packs four texel positions, one per byte (lowest byte
// first). Within a byte the high nibble is the column and the low nibble is
// the row, as decoded in main(). 16 entries x 4 positions cover the 64
// invocations of a workgroup.
uint lut_8x4[16] = {
|
||||||
|
0x11011000, 0x31213020,
|
||||||
|
0x13031202, 0x33233222,
|
||||||
|
0x51415040, 0x71617060,
|
||||||
|
0x53435242, 0x73637262,
|
||||||
|
|
||||||
|
0x15051404, 0x35253424,
|
||||||
|
0x17071606, 0x37273626,
|
||||||
|
0x55455444, 0x75657464,
|
||||||
|
0x57475646, 0x77677666,
|
||||||
|
};
|
||||||
|
|
||||||
|
#define MICRO_TILE_DIM 8
|
||||||
|
#define TEXELS_PER_ELEMENT 1
|
||||||
|
|
||||||
|
// Entry point: each of the 64 invocations in a workgroup reads one 32-bit word
// from in_data, looks up its position inside the 8x8 tile via lut_8x4, and
// stores the word's four bytes as one 4-component texel of output_img.
void main() {
|
||||||
|
// One 32-bit source texel (four 8-bit components) per invocation.
uint src_tx = in_data[gl_GlobalInvocationID.x];
|
||||||
|
|
||||||
|
// Each LUT word holds four packed positions; pick the byte for this invocation.
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
|
||||||
|
uint packed_pos = lut_8x4[gl_LocalInvocationID.x >> 2] >> bit_ofs;
|
||||||
|
// High nibble of the selected byte is the column, low nibble is the row.
uint col = bitfieldExtract(packed_pos, 4, 4);
|
||||||
|
uint row = bitfieldExtract(packed_pos, 0, 4);
|
||||||
|
|
||||||
|
// Number of tiles that fit in one row of `pitch` texels.
uint tiles_per_pitch = info.pitch / MICRO_TILE_DIM;
|
||||||
|
// The linear workgroup index is mapped to a 2D tile coordinate.
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
|
||||||
|
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
|
||||||
|
|
||||||
|
// Destination texel coordinates: tile origin plus the position from the LUT.
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
|
||||||
|
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
|
||||||
|
|
||||||
|
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
|
||||||
|
|
||||||
|
// Unpack the word into four 8-bit components, lowest byte into .x.
uvec4 dst_tx = uvec4(
|
||||||
|
bitfieldExtract(src_tx, 0, 8),
|
||||||
|
bitfieldExtract(src_tx, 8, 8),
|
||||||
|
bitfieldExtract(src_tx, 16, 8),
|
||||||
|
bitfieldExtract(src_tx, 24, 8)
|
||||||
|
);
|
||||||
|
imageStore(output_img, img_pos, dst_tx);
|
||||||
|
}
|
|
@ -4,10 +4,14 @@
|
||||||
#include "boost/container/static_vector.hpp"
|
#include "boost/container/static_vector.hpp"
|
||||||
#include "video_core/renderer_vulkan/vk_instance.h"
|
#include "video_core/renderer_vulkan/vk_instance.h"
|
||||||
#include "video_core/renderer_vulkan/vk_scheduler.h"
|
#include "video_core/renderer_vulkan/vk_scheduler.h"
|
||||||
|
#include "video_core/renderer_vulkan/vk_shader_util.h"
|
||||||
#include "video_core/texture_cache/image_view.h"
|
#include "video_core/texture_cache/image_view.h"
|
||||||
#include "video_core/texture_cache/texture_cache.h"
|
#include "video_core/texture_cache/texture_cache.h"
|
||||||
#include "video_core/texture_cache/tile_manager.h"
|
#include "video_core/texture_cache/tile_manager.h"
|
||||||
|
|
||||||
|
#include "video_core/host_shaders/detile_m8x1_comp.h"
|
||||||
|
#include "video_core/host_shaders/detile_m8x4_comp.h"
|
||||||
|
|
||||||
#include <vulkan/vulkan_to_string.hpp>
|
#include <vulkan/vulkan_to_string.hpp>
|
||||||
|
|
||||||
namespace VideoCore {
|
namespace VideoCore {
|
||||||
|
@ -201,26 +205,20 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc
|
||||||
TextureCache& texture_cache, Vulkan::StreamBuffer& staging)
|
TextureCache& texture_cache, Vulkan::StreamBuffer& staging)
|
||||||
: instance{instance}, scheduler{scheduler}, texture_cache{texture_cache}, staging{staging} {
|
: instance{instance}, scheduler{scheduler}, texture_cache{texture_cache}, staging{staging} {
|
||||||
|
|
||||||
|
static const std::array detiler_shaders{
|
||||||
|
HostShaders::DETILE_M8X1_COMP,
|
||||||
|
HostShaders::DETILE_M8X4_COMP,
|
||||||
|
};
|
||||||
|
|
||||||
for (int pl_id = 0; pl_id < DetilerType::Max; ++pl_id) {
|
for (int pl_id = 0; pl_id < DetilerType::Max; ++pl_id) {
|
||||||
auto& ctx = detilers[pl_id];
|
auto& ctx = detilers[pl_id];
|
||||||
|
|
||||||
const std::vector<u32> shader_code{};
|
const auto& module = Vulkan::Compile(
|
||||||
|
detiler_shaders[pl_id], vk::ShaderStageFlagBits::eCompute, instance.GetDevice());
|
||||||
const vk::ShaderModuleCreateInfo shader_info = {
|
|
||||||
.codeSize = shader_code.size(),
|
|
||||||
.pCode = shader_code.data(),
|
|
||||||
};
|
|
||||||
|
|
||||||
vk::UniqueShaderModule module;
|
|
||||||
try {
|
|
||||||
module = instance.GetDevice().createShaderModuleUnique(shader_info);
|
|
||||||
} catch (vk::SystemError& err) {
|
|
||||||
UNREACHABLE_MSG("{}", err.what());
|
|
||||||
}
|
|
||||||
|
|
||||||
const vk::PipelineShaderStageCreateInfo shader_ci = {
|
const vk::PipelineShaderStageCreateInfo shader_ci = {
|
||||||
.stage = vk::ShaderStageFlagBits::eCompute,
|
.stage = vk::ShaderStageFlagBits::eCompute,
|
||||||
.module = *module,
|
.module = module,
|
||||||
.pName = "main",
|
.pName = "main",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue