Surface management rework (2/3) (#329)

* texture_cache: interface refactoring

* a few fixes and improvements

* texture_cache: macro tile extents for bpp 128

* texture_cache: detiler: prefer host memory for large buffers upload
This commit is contained in:
psucien 2024-07-28 17:20:42 +02:00 committed by GitHub
parent 0d6edaa0a0
commit 30198d5ffc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 478 additions and 322 deletions

View File

@ -377,9 +377,13 @@ struct Liverpool {
return 1u << z_info.num_samples; // spec doesn't say it is a log2
}
/// Bit width reported for the Z format: 32 when the format is Z32Float,
/// 16 for every other ZFormat value.
u32 NumBits() const {
    if (z_info.format == ZFormat::Z32Float) {
        return 32;
    }
    return 16;
}
size_t GetDepthSliceSize() const {
ASSERT(z_info.format != ZFormat::Invalid);
const auto bpe = z_info.format == ZFormat::Z32Float ? 4 : 2;
const auto bpe = NumBits() >> 3; // in bytes
return (depth_slice.tile_max + 1) * 64 * bpe * NumSamples();
}
};

View File

@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
layout(r32ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
// Inverse morton LUT, small enough to fit into K$
@ -31,20 +35,22 @@ uint rmort[16] = {
#define TEXELS_PER_ELEMENT (1)
void main() {
uint tile_base = gl_GlobalInvocationID.x - gl_LocalInvocationID.x; // WG*16
uint p0 = in_data[gl_GlobalInvocationID.x];
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint p0 = in_data[gl_GlobalInvocationID.x];
uint mip = 0;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1);
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
imageStore(output_img, img_pos, uvec4(p0, 0, 0, 0));
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM;
out_data[dw_ofs_x + dw_ofs_y] = p0;
}

View File

@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
layout(rg32ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
// Inverse morton LUT, small enough to fit into K$
@ -30,19 +34,25 @@ uint rmort[16] = {
#define MICRO_TILE_DIM (8)
void main() {
uint block_ofs = 2 * gl_GlobalInvocationID.x;
uint p0 = in_data[block_ofs + 0];
uint p1 = in_data[block_ofs + 1];
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint block_ofs = 2 * gl_GlobalInvocationID.x;
uint p0 = in_data[block_ofs + 0];
uint p1 = in_data[block_ofs + 1];
uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4
ivec2 img_pos = MICRO_TILE_DIM * ivec2(
gl_WorkGroupID.x % tiles_per_pitch,
gl_WorkGroupID.x / tiles_per_pitch
);
imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, 0, 0));
uint mip = 0;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 8) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1) * 2;
uint target_tile_x = 2 * gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = 2 * gl_WorkGroupID.x / tiles_per_pitch;
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col * 2;
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM;
out_data[dw_ofs_x + dw_ofs_y] = p0;
out_data[dw_ofs_x + dw_ofs_y + 1] = p1;
}

View File

@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
layout(rgba32ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
// Inverse morton LUT, small enough to fit into K$
@ -30,21 +34,29 @@ uint rmort[16] = {
#define MICRO_TILE_DIM (8)
void main() {
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint block_ofs = 4 * gl_GlobalInvocationID.x;
uint p0 = in_data[block_ofs + 0];
uint p1 = in_data[block_ofs + 1];
uint p2 = in_data[block_ofs + 2];
uint p3 = in_data[block_ofs + 3];
uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4
ivec2 img_pos = MICRO_TILE_DIM * ivec2(
gl_WorkGroupID.x % tiles_per_pitch,
gl_WorkGroupID.x / tiles_per_pitch
);
imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, p2, p3));
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint mip = 0;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 16) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = max(((info.pitch >> mip) / MICRO_TILE_DIM), 1u) * 4;
uint target_tile_x = 4 * gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = 4 * gl_WorkGroupID.x / tiles_per_pitch;
uint dw_ofs_x = (target_tile_x * MICRO_TILE_DIM) + 4 * col;
uint dw_ofs_y = ((target_tile_y * tiles_per_pitch) * 64u) + ((row * tiles_per_pitch) * MICRO_TILE_DIM);
out_data[dw_ofs_x + dw_ofs_y] = p0;
out_data[dw_ofs_x + dw_ofs_y + 1] = p1;
out_data[dw_ofs_x + dw_ofs_y + 2] = p2;
out_data[dw_ofs_x + dw_ofs_y + 3] = p3;
}

View File

@ -11,10 +11,14 @@ layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
layout(r8ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
#define MICRO_TILE_DIM 8
@ -32,17 +36,15 @@ void main() {
uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT)
+ TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3);
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
uint mip = 0;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = max((info.pitch >> mip) / 8, 1);
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
#pragma unroll
for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff));
dst_tx >>= 8;
}
uint dw_ofs_x = target_tile_x * 2 + col; // 2 = uints
uint dw_ofs_y = (target_tile_y * MICRO_TILE_DIM + row) * tiles_per_pitch * 2; // 2 = uints
out_data[dw_ofs_x + dw_ofs_y] = dst_tx;
}

View File

@ -10,10 +10,14 @@ layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
layout(rg8ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
#define MICRO_TILE_DIM 8
@ -44,18 +48,14 @@ void main() {
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
uint mip = 0u;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = max(((info.pitch >> mip) / 8u), 1u);
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
#pragma unroll
for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
uint p0 = (p[ofs] >> 8) & 0xff;
uint p1 = p[ofs] & 0xff;
imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(p1, p0, 0, 0));
}
uint dw_ofs_x = target_tile_x * 8 + col;
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * 8;
out_data[(dw_ofs_x + dw_ofs_y) / 2] = src_tx;
}

View File

@ -392,6 +392,10 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu
num_format == AmdGpu::NumberFormat::Float) {
return vk::Format::eR16G16Sfloat;
}
if (data_format == AmdGpu::DataFormat::Format16_16 &&
num_format == AmdGpu::NumberFormat::Unorm) {
return vk::Format::eR16G16Unorm;
}
if (data_format == AmdGpu::DataFormat::Format10_11_11 &&
num_format == AmdGpu::NumberFormat::Float) {
return vk::Format::eB10G11R11UfloatPack32;

View File

@ -128,7 +128,9 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s
for (const auto& image_desc : info.images) {
const auto tsharp =
info.ReadUd<AmdGpu::Image>(image_desc.sgpr_base, image_desc.dword_offset);
const auto& image_view = texture_cache.FindTexture(tsharp, image_desc.is_storage);
VideoCore::ImageInfo image_info{tsharp};
VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage};
const auto& image_view = texture_cache.FindTexture(image_info, view_info);
const auto& image = texture_cache.GetImage(image_view.image_id);
image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, image.layout);
set_writes.push_back({

View File

@ -366,7 +366,9 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer&
for (const auto& image_desc : stage.images) {
const auto& tsharp = tsharps.emplace_back(
stage.ReadUd<AmdGpu::Image>(image_desc.sgpr_base, image_desc.dword_offset));
const auto& image_view = texture_cache.FindTexture(tsharp, image_desc.is_storage);
VideoCore::ImageInfo image_info{tsharp};
VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage};
const auto& image_view = texture_cache.FindTexture(image_info, view_info);
const auto& image = texture_cache.GetImage(image_view.image_id);
image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, image.layout);
set_writes.push_back({

View File

@ -191,7 +191,7 @@ void PipelineCache::RefreshGraphicsKey() {
LiverpoolToVK::SurfaceFormat(col_buf.info.format, col_buf.NumFormat());
const auto is_vo_surface = renderer->IsVideoOutSurface(col_buf);
key.color_formats[remapped_cb] = LiverpoolToVK::AdjustColorBufferFormat(
base_format, col_buf.info.comp_swap.Value(), is_vo_surface);
base_format, col_buf.info.comp_swap.Value(), false /*is_vo_surface*/);
key.blend_controls[remapped_cb] = regs.blend_control[cb];
key.blend_controls[remapped_cb].enable.Assign(key.blend_controls[remapped_cb].enable &&
!col_buf.info.blend_bypass);

View File

@ -32,6 +32,7 @@ static VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback(
switch (static_cast<u32>(callback_data->messageIdNumber)) {
case 0x609a13b: // Vertex attribute at location not consumed by shader
case 0xc81ad50e:
case 0x92d66fc1: // `pMultisampleState is NULL` for depth only passes (confirmed VL error)
return VK_FALSE;
default:
break;

View File

@ -120,7 +120,9 @@ void Rasterizer::BeginRendering() {
}
const auto& hint = liverpool->last_cb_extent[col_buf_id];
const auto& image_view = texture_cache.FindRenderTarget(col_buf, hint);
VideoCore::ImageInfo image_info{col_buf, hint};
VideoCore::ImageViewInfo view_info{col_buf, false /*!!image.info.usage.vo_buffer*/};
const auto& image_view = texture_cache.FindRenderTarget(image_info, view_info);
const auto& image = texture_cache.GetImage(image_view.image_id);
state.width = std::min<u32>(state.width, image.info.size.width);
state.height = std::min<u32>(state.height, image.info.size.height);
@ -143,9 +145,10 @@ void Rasterizer::BeginRendering() {
const bool is_clear = regs.depth_render_control.depth_clear_enable ||
texture_cache.IsMetaCleared(htile_address);
const auto& hint = liverpool->last_db_extent;
const auto& image_view = texture_cache.FindDepthTarget(
regs.depth_buffer, regs.depth_view.NumSlices(), htile_address, hint,
regs.depth_control.depth_write_enable);
VideoCore::ImageInfo image_info{regs.depth_buffer, regs.depth_view.NumSlices(),
htile_address, hint};
VideoCore::ImageViewInfo view_info{regs.depth_buffer, regs.depth_view, regs.depth_control};
const auto& image_view = texture_cache.FindDepthTarget(image_info, view_info);
const auto& image = texture_cache.GetImage(image_view.image_id);
state.width = std::min<u32>(state.width, image.info.size.width);
state.height = std::min<u32>(state.height, image.info.size.height);

View File

@ -117,18 +117,15 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
image{instance->GetDevice(), instance->GetAllocator()}, cpu_addr{info.guest_address},
cpu_addr_end{cpu_addr + info.guest_size_bytes} {
ASSERT(info.pixel_format != vk::Format::eUndefined);
// Here we force `eExtendedUsage` as don't know all image usage cases beforehand. In normal case
// the texture cache should re-create the resource with the usage requested
vk::ImageCreateFlags flags{vk::ImageCreateFlagBits::eMutableFormat |
vk::ImageCreateFlagBits::eExtendedUsage};
if (info.type == vk::ImageType::e2D && info.resources.layers >= 6 &&
info.size.width == info.size.height) {
if (info.props.is_cube) {
flags |= vk::ImageCreateFlagBits::eCubeCompatible;
}
if (info.type == vk::ImageType::e3D) {
} else if (info.props.is_volume) {
flags |= vk::ImageCreateFlagBits::e2DArrayCompatible;
}
if (info.IsBlockCoded()) {
flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible;
}
usage = ImageUsageFlags(info);
@ -157,15 +154,6 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
};
image.Create(image_ci);
// Create a special view for detiler
if (info.is_tiled) {
ImageViewInfo view_info;
view_info.format = DemoteImageFormatForDetiling(info.pixel_format);
view_for_detiler.emplace(*instance, view_info, *this, ImageId{});
}
Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eNone);
}
void Image::Transit(vk::ImageLayout dst_layout, vk::Flags<vk::AccessFlagBits> dst_mask,

View File

@ -105,7 +105,6 @@ struct Image {
VAddr cpu_addr_end = 0;
std::vector<ImageViewInfo> image_view_infos;
std::vector<ImageViewId> image_view_ids;
std::optional<ImageView> view_for_detiler;
// Resource state tracking
vk::ImageUsageFlags usage;

View File

@ -47,33 +47,33 @@ static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept {
// clang-format off
// The table of macro tiles parameters for given tiling index (row) and bpp (column)
static constexpr std::array macro_tile_extents{
std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, // 00
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, // 01
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 02
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 03
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 04
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 05
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, // 06
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 07
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 08
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 09
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 0A
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 0B
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 0C
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 0D
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 0E
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 0F
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 10
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 11
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 12
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 13
std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 14
std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 15
std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 16
std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 17
std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 18
std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 19
std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 1A
std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, // 00
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, // 01
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 02
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 03
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 04
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 05
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, // 06
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 07
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 08
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 09
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 0A
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 0B
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 0C
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 0D
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 0E
std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 0F
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 10
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 11
std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 12
std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 13
std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 14
std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 15
std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 16
std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 17
std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{128u, 64u}, // 18
std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 19
std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 1A
};
// clang-format on
@ -82,62 +82,65 @@ static constexpr auto hw_pipe_interleave = 256u;
static constexpr std::pair<u32, u32> GetMacroTileExtents(u32 tiling_idx, u32 bpp, u32 num_samples) {
ASSERT(num_samples == 1);
const auto row = tiling_idx * 4;
const auto column = std::bit_width(bpp) - 4; // bpps are 8, 16, 32, 64
const auto row = tiling_idx * 5;
const auto column = std::bit_width(bpp) - 4; // bpps are 8, 16, 32, 64, 128
return macro_tile_extents[row + column];
}
static constexpr size_t ImageSizeLinearAligned(u32 pitch, u32 height, u32 bpp, u32 num_samples) {
static constexpr std::pair<u32, size_t> ImageSizeLinearAligned(u32 pitch, u32 height, u32 bpp,
u32 num_samples) {
const auto pitch_align = std::max(8u, 64u / ((bpp + 7) / 8));
auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1);
const auto height_aligned = height;
size_t log_sz = 1;
const auto slice_align = std::max(64u, hw_pipe_interleave / (bpp + 7) / 8);
size_t log_sz = pitch_aligned * height_aligned * num_samples;
const auto slice_align = std::max(64u, 256u / ((bpp + 7) / 8));
while (log_sz % slice_align) {
log_sz = pitch_aligned * height_aligned * num_samples;
pitch_aligned += pitch_align;
log_sz = pitch_aligned * height_aligned * num_samples;
}
return (log_sz * bpp + 7) / 8;
return {pitch_aligned, (log_sz * bpp + 7) / 8};
}
static constexpr size_t ImageSizeMicroTiled(u32 pitch, u32 height, u32 bpp, u32 num_samples) {
static constexpr std::pair<u32, size_t> ImageSizeMicroTiled(u32 pitch, u32 height, u32 bpp,
u32 num_samples) {
const auto& [pitch_align, height_align] = micro_tile_extent;
auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1);
const auto height_aligned = (height + height_align - 1) & ~(height_align - 1);
size_t log_sz = 1;
size_t log_sz = (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8;
while (log_sz % 256) {
log_sz = (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8;
pitch_aligned += 8;
log_sz = (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8;
}
return log_sz;
return {pitch_aligned, log_sz};
}
static constexpr size_t ImageSizeMacroTiled(u32 pitch, u32 height, u32 bpp, u32 num_samples,
u32 tiling_idx) {
static constexpr std::pair<u32, size_t> ImageSizeMacroTiled(u32 pitch, u32 height, u32 bpp,
u32 num_samples, u32 tiling_idx) {
const auto& [pitch_align, height_align] = GetMacroTileExtents(tiling_idx, bpp, num_samples);
ASSERT(pitch_align != 0 && height_align != 0);
const auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1);
const auto height_aligned = (height + height_align - 1) & ~(height_align - 1);
return (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8;
const auto log_sz = pitch_aligned * height_aligned * num_samples;
return {pitch_aligned, (log_sz * bpp + 7) / 8};
}
ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group,
VAddr cpu_address) noexcept {
const auto& attrib = group.attrib;
is_tiled = attrib.tiling_mode == TilingMode::Tile;
tiling_mode =
is_tiled ? AmdGpu::TilingMode::Display_MacroTiled : AmdGpu::TilingMode::Display_Linear;
props.is_tiled = attrib.tiling_mode == TilingMode::Tile;
tiling_mode = props.is_tiled ? AmdGpu::TilingMode::Display_MacroTiled
: AmdGpu::TilingMode::Display_Linear;
pixel_format = ConvertPixelFormat(attrib.pixel_format);
type = vk::ImageType::e2D;
size.width = attrib.width;
size.height = attrib.height;
pitch = attrib.tiling_mode == TilingMode::Linear ? size.width : (size.width + 127) & (~127);
usage.vo_buffer = true;
const bool is_32bpp = attrib.pixel_format != VideoOutFormat::A16R16G16B16Float;
ASSERT(is_32bpp);
num_bits = attrib.pixel_format != VideoOutFormat::A16R16G16B16Float ? 32 : 64;
ASSERT(num_bits == 32);
guest_address = cpu_address;
if (!is_tiled) {
if (!props.is_tiled) {
guest_size_bytes = pitch * size.height * 4;
} else {
if (Config::isNeoMode()) {
@ -146,15 +149,16 @@ ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group,
guest_size_bytes = pitch * ((size.height + 63) & (~63)) * 4;
}
}
mips_layout.emplace_back(0, guest_size_bytes);
mips_layout.emplace_back(guest_size_bytes, pitch, 0);
}
ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer,
const AmdGpu::Liverpool::CbDbExtent& hint /*= {}*/) noexcept {
is_tiled = buffer.IsTiled();
props.is_tiled = buffer.IsTiled();
tiling_mode = buffer.GetTilingMode();
pixel_format = LiverpoolToVK::SurfaceFormat(buffer.info.format, buffer.NumFormat());
num_samples = 1 << buffer.attrib.num_fragments_log2;
num_bits = NumBits(buffer.info.format);
type = vk::ImageType::e2D;
size.width = hint.Valid() ? hint.width : buffer.Pitch();
size.height = hint.Valid() ? hint.height : buffer.Height();
@ -168,15 +172,16 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer,
guest_address = buffer.Address();
const auto color_slice_sz = buffer.GetColorSliceSize();
guest_size_bytes = color_slice_sz * buffer.NumSlices();
mips_layout.emplace_back(0, color_slice_sz);
mips_layout.emplace_back(color_slice_sz, pitch, 0);
}
ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slices,
VAddr htile_address, const AmdGpu::Liverpool::CbDbExtent& hint) noexcept {
is_tiled = false;
props.is_tiled = false;
pixel_format = LiverpoolToVK::DepthFormat(buffer.z_info.format, buffer.stencil_info.format);
type = vk::ImageType::e2D;
num_samples = 1 << buffer.z_info.num_samples; // spec doesn't say it is a log2
num_bits = buffer.NumBits();
size.width = hint.Valid() ? hint.width : buffer.Pitch();
size.height = hint.Valid() ? hint.height : buffer.Height();
size.depth = 1;
@ -188,37 +193,38 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slice
guest_address = buffer.Address();
const auto depth_slice_sz = buffer.GetDepthSliceSize();
guest_size_bytes = depth_slice_sz * num_slices;
mips_layout.emplace_back(0, depth_slice_sz);
mips_layout.emplace_back(depth_slice_sz, pitch, 0);
}
ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept {
is_tiled = image.IsTiled();
tiling_mode = image.GetTilingMode();
pixel_format = LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt());
type = ConvertImageType(image.GetType());
is_cube = image.GetType() == AmdGpu::ImageType::Cube;
is_volume = image.GetType() == AmdGpu::ImageType::Color3D;
props.is_tiled = image.IsTiled();
props.is_cube = image.GetType() == AmdGpu::ImageType::Cube;
props.is_volume = image.GetType() == AmdGpu::ImageType::Color3D;
props.is_pow2 = image.pow2pad;
props.is_block = IsBlockCoded();
size.width = image.width + 1;
size.height = image.height + 1;
size.depth = is_volume ? image.depth + 1 : 1;
size.depth = props.is_volume ? image.depth + 1 : 1;
pitch = image.Pitch();
resources.levels = image.NumLevels();
resources.layers = image.NumLayers();
num_bits = NumBits(image.GetDataFmt());
usage.texture = true;
guest_address = image.Address();
mips_layout.reserve(resources.levels);
const auto num_bits = NumBits(image.GetDataFmt());
const auto is_block = IsBlockCoded();
const auto is_pow2 = image.pow2pad;
MipInfo mip_info{};
guest_size_bytes = 0;
for (auto mip = 0u; mip < resources.levels; ++mip) {
auto bpp = num_bits;
auto mip_w = pitch >> mip;
auto mip_h = size.height >> mip;
if (is_block) {
if (props.is_block) {
mip_w = (mip_w + 3) / 4;
mip_h = (mip_h + 3) / 4;
bpp *= 16;
@ -227,40 +233,48 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept {
mip_h = std::max(mip_h, 1u);
auto mip_d = std::max(size.depth >> mip, 1u);
if (is_pow2) {
if (props.is_pow2) {
mip_w = std::bit_ceil(mip_w);
mip_h = std::bit_ceil(mip_h);
mip_d = std::bit_ceil(mip_d);
}
size_t mip_size = 0;
switch (tiling_mode) {
case AmdGpu::TilingMode::Display_Linear: {
ASSERT(!is_cube);
mip_size = ImageSizeLinearAligned(mip_w, mip_h, bpp, num_samples);
ASSERT(!props.is_cube);
std::tie(mip_info.pitch, mip_info.size) =
ImageSizeLinearAligned(mip_w, mip_h, bpp, num_samples);
mip_info.height = mip_h;
break;
}
case AmdGpu::TilingMode::Texture_MicroTiled: {
mip_size = ImageSizeMicroTiled(mip_w, mip_h, bpp, num_samples);
std::tie(mip_info.pitch, mip_info.size) =
ImageSizeMicroTiled(mip_w, mip_h, bpp, num_samples);
mip_info.height = std::max(mip_h, 8u);
if (props.is_block) {
mip_info.pitch = std::max(mip_info.pitch * 4, 32u);
mip_info.height = std::max(mip_info.height * 4, 32u);
}
break;
}
case AmdGpu::TilingMode::Display_MacroTiled:
case AmdGpu::TilingMode::Texture_MacroTiled:
case AmdGpu::TilingMode::Depth_MacroTiled: {
ASSERT(!is_cube && !is_block);
ASSERT(!props.is_cube && !props.is_block);
ASSERT(num_samples == 1);
ASSERT(num_bits <= 64);
mip_size = ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index);
std::tie(mip_info.pitch, mip_info.size) =
ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index);
break;
}
default: {
UNREACHABLE();
}
}
mip_size *= mip_d;
mip_info.size *= mip_d;
mips_layout.emplace_back(guest_size_bytes, mip_size);
guest_size_bytes += mip_size;
mip_info.offset = guest_size_bytes;
mips_layout.emplace_back(mip_info);
guest_size_bytes += mip_info.size;
}
guest_size_bytes *= resources.layers;
}

View File

@ -9,6 +9,8 @@
#include "video_core/amdgpu/liverpool.h"
#include "video_core/texture_cache/types.h"
#include <boost/container/small_vector.hpp>
namespace VideoCore {
struct ImageInfo {
@ -42,18 +44,29 @@ struct ImageInfo {
u32 vo_buffer : 1;
} usage{}; // Usage data tracked during image lifetime
bool is_cube = false;
bool is_volume = false;
bool is_tiled = false;
bool is_read_only = false;
// One-bit surface flags. They are filled in by the ImageInfo constructors and consulted
// when computing mip sizes and when choosing Vulkan image creation flags.
struct {
// Set when the source image type is Cube; such images are created cube-compatible.
u32 is_cube : 1;
// Set when the source image type is Color3D; only then is the depth extent taken from the image.
u32 is_volume : 1;
// Result of the buffer/image IsTiled() query (or the video-out tiling mode); depth buffers set it to false.
u32 is_tiled : 1;
// Copied from AmdGpu::Image::pow2pad; when set, mip extents are rounded up with std::bit_ceil.
u32 is_pow2 : 1;
// Result of IsBlockCoded(); when set, mip extents are counted in 4x4 blocks and bpp is scaled by 16.
u32 is_block : 1;
} props{}; // Surface properties with impact on various calculation factors
vk::Format pixel_format = vk::Format::eUndefined;
vk::ImageType type = vk::ImageType::e1D;
SubresourceExtent resources;
Extent3D size{1, 1, 1};
u32 num_bits{};
u32 num_samples = 1;
u32 pitch = 0;
AmdGpu::TilingMode tiling_mode{AmdGpu::TilingMode::Display_Linear};
std::vector<std::pair<u32, u32>> mips_layout;
// Guest memory layout of a single mip level. Kept as a plain aggregate: entries are
// created both field-by-field and through emplace_back(size, pitch, height), so the
// member order below is part of the contract.
struct MipInfo {
// Size of the level in bytes (already multiplied by the mip depth for sampled images).
u32 size;
// Aligned pitch as returned by the ImageSize* helpers (scaled by 4 for block-coded micro-tiled mips).
u32 pitch;
// Height of the level; NOTE(review): the macro-tiled path does not appear to assign it — confirm.
u32 height;
// Byte offset of the level from the start of the image, i.e. the sum of the preceding level sizes.
u32 offset;
};
boost::container::small_vector<MipInfo, 14> mips_layout;
VAddr guest_address{0};
u32 guest_size_bytes{0};
};

View File

@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/logging/log.h"
#include "video_core/renderer_vulkan/liverpool_to_vk.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/texture_cache/image.h"
@ -50,15 +51,18 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexce
: is_storage{is_storage} {
type = ConvertImageViewType(image.GetType());
format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt());
range.base.level = static_cast<u32>(image.base_level);
range.base.layer = static_cast<u32>(image.base_array);
range.extent.levels = image.NumLevels();
range.extent.layers = image.NumLayers();
if (!is_storage) {
range.base.level = image.base_level;
range.base.layer = image.base_array;
range.extent.levels = image.last_level + 1;
range.extent.layers = image.last_array + 1;
mapping.r = ConvertComponentSwizzle(image.dst_sel_x);
mapping.g = ConvertComponentSwizzle(image.dst_sel_y);
mapping.b = ConvertComponentSwizzle(image.dst_sel_z);
mapping.a = ConvertComponentSwizzle(image.dst_sel_w);
// Check for unfortunate case of storage images being swizzled
if (is_storage && (mapping != vk::ComponentMapping{})) {
LOG_ERROR(Render_Vulkan, "Storage image requires swizzling");
mapping = vk::ComponentMapping{};
}
}
@ -70,6 +74,16 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer,
base_format, col_buffer.info.comp_swap.Value(), is_vo_surface);
}
/// Describes a view onto a depth target using the depth buffer, depth view and depth control
/// register state.
ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::DepthBuffer& depth_buffer,
                             AmdGpu::Liverpool::DepthView view,
                             AmdGpu::Liverpool::DepthControl ctl) {
    // Layer range is taken from the depth view registers.
    range.base.layer = view.slice_start;
    range.extent.layers = view.NumSlices();

    // For depth views `is_storage` carries the depth-write-enable state.
    is_storage = ctl.depth_write_enable;

    // Host format is selected from the combined depth and stencil guest formats.
    const auto& z_info = depth_buffer.z_info;
    const auto& stencil_info = depth_buffer.stencil_info;
    format = Vulkan::LiverpoolToVK::DepthFormat(z_info.format, stencil_info.format);
}
ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info_, Image& image,
ImageId image_id_, std::optional<vk::ImageUsageFlags> usage_override /*= {}*/)
: info{info_}, image_id{image_id_} {
@ -93,10 +107,10 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info
.components = instance.GetSupportedComponentSwizzle(format, info.mapping),
.subresourceRange{
.aspectMask = aspect,
.baseMipLevel = 0U,
.levelCount = 1,
.baseMipLevel = info.range.base.level,
.levelCount = info.range.extent.levels - info.range.base.level,
.baseArrayLayer = info_.range.base.layer,
.layerCount = image.info.IsBlockCoded() ? 1 : VK_REMAINING_ARRAY_LAYERS,
.layerCount = info.range.extent.layers - info.range.base.layer,
},
};
image_view = instance.GetDevice().createImageViewUnique(image_view_ci);

View File

@ -18,10 +18,11 @@ class Scheduler;
namespace VideoCore {
struct ImageViewInfo {
explicit ImageViewInfo() = default;
explicit ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept;
explicit ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer,
bool is_vo_surface) noexcept;
ImageViewInfo() = default;
ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept;
ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer, bool is_vo_surface) noexcept;
ImageViewInfo(const AmdGpu::Liverpool::DepthBuffer& depth_buffer,
AmdGpu::Liverpool::DepthView view, AmdGpu::Liverpool::DepthControl ctl);
vk::ImageViewType type = vk::ImageViewType::e2D;
vk::Format format = vk::Format::eR8G8B8A8Unorm;

View File

@ -152,8 +152,6 @@ ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) {
image_id = image_ids[0];
}
RegisterMeta(info, image_id);
Image& image = slot_images[image_id];
if (True(image.flags & ImageFlagBits::CpuModified) && refresh_on_create) {
RefreshImage(image);
@ -184,13 +182,12 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo
return slot_image_views[view_id];
}
ImageView& TextureCache::FindTexture(const AmdGpu::Image& desc, bool is_storage) {
const ImageInfo info{desc};
ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) {
const ImageId image_id = FindImage(info);
Image& image = slot_images[image_id];
auto& usage = image.info.usage;
if (is_storage) {
if (view_info.is_storage) {
image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderWrite);
usage.storage = true;
} else {
@ -201,14 +198,36 @@ ImageView& TextureCache::FindTexture(const AmdGpu::Image& desc, bool is_storage)
usage.texture = true;
}
const ImageViewInfo view_info{desc, is_storage};
return RegisterImageView(image_id, view_info);
// These changes are temporary and should be removed once texture cache will handle subresources
// merging
auto view_info_tmp = view_info;
if (view_info_tmp.range.base.level > image.info.resources.levels - 1 ||
view_info_tmp.range.base.layer > image.info.resources.layers - 1 ||
view_info_tmp.range.extent.levels > image.info.resources.levels ||
view_info_tmp.range.extent.layers > image.info.resources.layers) {
LOG_ERROR(Render_Vulkan,
"Subresource range ({}~{},{}~{}) exceeds base image extents ({},{})",
view_info_tmp.range.base.level, view_info_tmp.range.extent.levels,
view_info_tmp.range.base.layer, view_info_tmp.range.extent.layers,
image.info.resources.levels, image.info.resources.layers);
view_info_tmp.range.base.level =
std::min(view_info_tmp.range.base.level, image.info.resources.levels - 1);
view_info_tmp.range.base.layer =
std::min(view_info_tmp.range.base.layer, image.info.resources.layers - 1);
view_info_tmp.range.extent.levels =
std::min(view_info_tmp.range.extent.levels, image.info.resources.levels);
view_info_tmp.range.extent.layers =
std::min(view_info_tmp.range.extent.layers, image.info.resources.layers);
}
ImageView& TextureCache::FindRenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer,
const AmdGpu::Liverpool::CbDbExtent& hint) {
const ImageInfo info{buffer, hint};
const ImageId image_id = FindImage(info);
return RegisterImageView(image_id, view_info_tmp);
}
ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info,
const ImageViewInfo& view_info) {
const ImageId image_id = FindImage(image_info);
Image& image = slot_images[image_id];
image.flags &= ~ImageFlagBits::CpuModified;
@ -216,30 +235,56 @@ ImageView& TextureCache::FindRenderTarget(const AmdGpu::Liverpool::ColorBuffer&
vk::AccessFlagBits::eColorAttachmentWrite |
vk::AccessFlagBits::eColorAttachmentRead);
// Register meta data for this color buffer
if (!(image.flags & ImageFlagBits::MetaRegistered)) {
if (image_info.meta_info.cmask_addr) {
surface_metas.emplace(
image_info.meta_info.cmask_addr,
MetaDataInfo{.type = MetaDataInfo::Type::CMask, .is_cleared = true});
image.info.meta_info.cmask_addr = image_info.meta_info.cmask_addr;
image.flags |= ImageFlagBits::MetaRegistered;
}
if (image_info.meta_info.fmask_addr) {
surface_metas.emplace(
image_info.meta_info.fmask_addr,
MetaDataInfo{.type = MetaDataInfo::Type::FMask, .is_cleared = true});
image.info.meta_info.fmask_addr = image_info.meta_info.fmask_addr;
image.flags |= ImageFlagBits::MetaRegistered;
}
}
// Update tracked image usage
image.info.usage.render_target = true;
ImageViewInfo view_info{buffer, !!image.info.usage.vo_buffer};
return RegisterImageView(image_id, view_info);
}
/// Resolves the image backing a depth target through FindImage (with refresh_on_create = false),
/// moves it to a depth-stencil layout, registers its HTile metadata once and returns the
/// image view registered for it.
// NOTE(review): this span shows two signatures back to back - one taking the raw DepthBuffer
// registers and one taking prebuilt ImageInfo/ImageViewInfo; the header declares only the latter.
ImageView& TextureCache::FindDepthTarget(const AmdGpu::Liverpool::DepthBuffer& buffer,
u32 num_slices, VAddr htile_address,
const AmdGpu::Liverpool::CbDbExtent& hint,
bool write_enabled) {
const ImageInfo info{buffer, num_slices, htile_address, hint};
const ImageId image_id = FindImage(info, false);
ImageView& TextureCache::FindDepthTarget(const ImageInfo& image_info,
const ImageViewInfo& view_info) {
const ImageId image_id = FindImage(image_info, false);
Image& image = slot_images[image_id];
// Drop the CPU-modified mark on the image.
image.flags &= ~ImageFlagBits::CpuModified;
// Attachment-optimal layout when writes are enabled, read-only depth-stencil layout otherwise.
const auto new_layout = write_enabled ? vk::ImageLayout::eDepthStencilAttachmentOptimal
const auto new_layout = view_info.is_storage ? vk::ImageLayout::eDepthStencilAttachmentOptimal
: vk::ImageLayout::eDepthStencilReadOnlyOptimal;
image.Transit(new_layout, vk::AccessFlagBits::eDepthStencilAttachmentWrite |
vk::AccessFlagBits::eDepthStencilAttachmentRead);
// Register meta data for this depth buffer
// Done at most once per image: the MetaRegistered flag is set after the HTile entry is added,
// and the entry is inserted in the "cleared" state.
if (!(image.flags & ImageFlagBits::MetaRegistered)) {
if (image_info.meta_info.htile_addr) {
surface_metas.emplace(
image_info.meta_info.htile_addr,
MetaDataInfo{.type = MetaDataInfo::Type::HTile, .is_cleared = true});
image.info.meta_info.htile_addr = image_info.meta_info.htile_addr;
image.flags |= ImageFlagBits::MetaRegistered;
}
}
// Update tracked image usage
image.info.usage.depth_target = true;
ImageViewInfo view_info;
view_info.format = info.pixel_format;
return RegisterImageView(image_id, view_info);
}
@ -247,65 +292,57 @@ void TextureCache::RefreshImage(Image& image) {
// Mark image as validated.
image.flags &= ~ImageFlagBits::CpuModified;
{
if (!tile_manager.TryDetile(image)) {
// Upload data to the staging buffer.
const auto offset = staging.Copy(image.cpu_addr, image.info.guest_size_bytes, 4);
// Copy to the image.
image.Upload(staging.Handle(), offset);
}
image.Transit(vk::ImageLayout::eGeneral,
vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead);
return;
}
ASSERT(image.info.resources.levels == image.info.mips_layout.size());
const u8* image_data = reinterpret_cast<const u8*>(image.cpu_addr);
for (u32 m = 0; m < image.info.resources.levels; m++) {
const u32 width = std::max(image.info.size.width >> m, 1u);
const u32 height = std::max(image.info.size.height >> m, 1u);
const u32 depth = image.info.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
const u32 map_size = image.info.mips_layout[m].second * image.info.resources.layers;
// Upload data to the staging buffer.
const auto [data, offset, _] = staging.Map(map_size, 16);
if (image.info.is_tiled) {
ConvertTileToLinear(data, image_data, width, height, Config::isNeoMode());
} else {
std::memcpy(data,
image_data + image.info.mips_layout[m].first * image.info.resources.layers,
map_size);
}
staging.Commit(map_size);
// Copy to the image.
const vk::BufferImageCopy image_copy = {
.bufferOffset = offset,
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource{
.aspectMask = vk::ImageAspectFlagBits::eColor,
.mipLevel = m,
.baseArrayLayer = 0,
.layerCount = u32(image.info.resources.layers),
},
.imageOffset = {0, 0, 0},
.imageExtent = {width, height, depth},
};
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite);
cmdbuf.copyBufferToImage(staging.Handle(), image.image,
vk::ImageLayout::eTransferDstOptimal, image_copy);
vk::Buffer buffer{staging.Handle()};
u32 offset{0};
auto upload_buffer = tile_manager.TryDetile(image);
if (upload_buffer) {
buffer = *upload_buffer;
} else {
// Upload data to the staging buffer.
const auto [data, offset_, _] = staging.Map(image.info.guest_size_bytes, 16);
std::memcpy(data, (void*)image.info.guest_address, image.info.guest_size_bytes);
staging.Commit(image.info.guest_size_bytes);
offset = offset_;
}
const auto& num_layers = image.info.resources.layers;
const auto& num_mips = image.info.resources.levels;
ASSERT(num_mips == image.info.mips_layout.size());
boost::container::small_vector<vk::BufferImageCopy, 14> image_copy{};
for (u32 m = 0; m < num_mips; m++) {
const u32 width = std::max(image.info.size.width >> m, 1u);
const u32 height = std::max(image.info.size.height >> m, 1u);
const u32 depth =
image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
const auto& [_, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
image_copy.push_back({
.bufferOffset = offset + mip_ofs * num_layers,
.bufferRowLength = static_cast<uint32_t>(mip_pitch),
.bufferImageHeight = static_cast<uint32_t>(mip_height),
.imageSubresource{
.aspectMask = vk::ImageAspectFlagBits::eColor,
.mipLevel = m,
.baseArrayLayer = 0,
.layerCount = num_layers,
},
.imageOffset = {0, 0, 0},
.imageExtent = {width, height, depth},
});
}
cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy);
image.Transit(vk::ImageLayout::eGeneral,
vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead);
}
}
vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) {
const u64 hash = XXH3_64bits(&sampler, sizeof(sampler));
@ -320,47 +357,8 @@ void TextureCache::RegisterImage(ImageId image_id) {
image.flags |= ImageFlagBits::Registered;
ForEachPage(image.cpu_addr, image.info.guest_size_bytes,
[this, image_id](u64 page) { page_table[page].push_back(image_id); });
}
/// Registers the metadata surfaces (CMask/FMask for render targets, HTile for depth targets)
/// referenced by `info` in `surface_metas` and records their addresses on the cached image.
/// Does nothing when the image already has the MetaRegistered flag set.
void TextureCache::RegisterMeta(const ImageInfo& info, ImageId image_id) {
Image& image = slot_images[image_id];
if (image.flags & ImageFlagBits::MetaRegistered) {
return;
}
// Stays true unless the image is neither a render target nor a depth target.
bool registered = true;
// The current resource tracking implementation lets us detect usage of meta only at the last
// moment, so we will likely miss its first clear. To avoid this and make the first frame in
// which the meta is encountered look correct, its state is set to "cleared" at registration
// time.
if (info.usage.render_target) {
if (info.meta_info.cmask_addr) {
surface_metas.emplace(
info.meta_info.cmask_addr,
MetaDataInfo{.type = MetaDataInfo::Type::CMask, .is_cleared = true});
image.info.meta_info.cmask_addr = info.meta_info.cmask_addr;
}
if (info.meta_info.fmask_addr) {
surface_metas.emplace(
info.meta_info.fmask_addr,
MetaDataInfo{.type = MetaDataInfo::Type::FMask, .is_cleared = true});
image.info.meta_info.fmask_addr = info.meta_info.fmask_addr;
}
} else if (info.usage.depth_target) {
if (info.meta_info.htile_addr) {
surface_metas.emplace(
info.meta_info.htile_addr,
MetaDataInfo{.type = MetaDataInfo::Type::HTile, .is_cleared = true});
image.info.meta_info.htile_addr = info.meta_info.htile_addr;
}
} else {
registered = false;
}
// The flag is set for any render/depth target, even when no meta address was present.
if (registered) {
image.flags |= ImageFlagBits::MetaRegistered;
}
image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eNone);
}
void TextureCache::UnregisterImage(ImageId image_id) {

View File

@ -51,17 +51,16 @@ public:
[[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true);
/// Retrieves an image view with the properties of the specified image descriptor.
[[nodiscard]] ImageView& FindTexture(const AmdGpu::Image& image, bool is_storage);
[[nodiscard]] ImageView& FindTexture(const ImageInfo& image_info,
const ImageViewInfo& view_info);
/// Retrieves the render target with specified properties
[[nodiscard]] ImageView& FindRenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer,
const AmdGpu::Liverpool::CbDbExtent& hint);
[[nodiscard]] ImageView& FindRenderTarget(const ImageInfo& image_info,
const ImageViewInfo& view_info);
/// Retrieves the depth target with specified properties
[[nodiscard]] ImageView& FindDepthTarget(const AmdGpu::Liverpool::DepthBuffer& buffer,
u32 num_slices, VAddr htile_address,
const AmdGpu::Liverpool::CbDbExtent& hint,
bool write_enabled);
[[nodiscard]] ImageView& FindDepthTarget(const ImageInfo& image_info,
const ImageViewInfo& view_info);
/// Reuploads image contents.
void RefreshImage(Image& image);
@ -158,9 +157,6 @@ private:
/// Register image in the page table
void RegisterImage(ImageId image);
/// Register metadata surfaces attached to the image
void RegisterMeta(const ImageInfo& info, ImageId image);
/// Unregister image from the page table
void UnregisterImage(ImageId image);

View File

@ -16,6 +16,7 @@
#include <boost/container/static_vector.hpp>
#include <magic_enum.hpp>
#include <vk_mem_alloc.h>
namespace VideoCore {
@ -176,6 +177,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
return vk::Format::eR8Uint;
case vk::Format::eR8G8Unorm:
case vk::Format::eR16Sfloat:
case vk::Format::eR16Unorm:
return vk::Format::eR8G8Uint;
case vk::Format::eR8G8B8A8Srgb:
case vk::Format::eB8G8R8A8Srgb:
@ -183,10 +185,13 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
case vk::Format::eR8G8B8A8Unorm:
case vk::Format::eR32Sfloat:
case vk::Format::eR32Uint:
case vk::Format::eR16G16Sfloat:
return vk::Format::eR32Uint;
case vk::Format::eBc1RgbaUnormBlock:
case vk::Format::eBc4UnormBlock:
case vk::Format::eR32G32Sfloat:
case vk::Format::eR32G32Uint:
case vk::Format::eR16G16B16A16Unorm:
return vk::Format::eR32G32Uint;
case vk::Format::eBc2SrgbBlock:
case vk::Format::eBc2UnormBlock:
@ -225,14 +230,14 @@ const DetilerContext* TileManager::GetDetiler(const Image& image) const {
return nullptr;
}
static constexpr vk::BufferUsageFlags StagingFlags = vk::BufferUsageFlagBits::eTransferDst |
vk::BufferUsageFlagBits::eUniformBuffer |
vk::BufferUsageFlagBits::eStorageBuffer;
struct DetilerParams {
u32 num_levels;
u32 pitch0;
u32 sizes[14];
};
TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler)
: instance{instance}, scheduler{scheduler},
staging{instance, scheduler, StagingFlags, 256_MB, Vulkan::BufferType::Upload} {
: instance{instance}, scheduler{scheduler} {
static const std::array detiler_shaders{
HostShaders::DETILE_M8X1_COMP, HostShaders::DETILE_M8X2_COMP,
HostShaders::DETILE_M32X1_COMP, HostShaders::DETILE_M32X2_COMP,
@ -264,7 +269,7 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc
},
{
.binding = 1,
.descriptorType = vk::DescriptorType::eStorageImage,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
@ -281,7 +286,7 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc
const vk::PushConstantRange push_constants = {
.stageFlags = vk::ShaderStageFlagBits::eCompute,
.offset = 0,
.size = sizeof(u32),
.size = sizeof(DetilerParams),
};
const vk::DescriptorSetLayout set_layout = *desc_layout;
@ -312,35 +317,88 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc
TileManager::~TileManager() = default;
bool TileManager::TryDetile(Image& image) {
if (!image.info.is_tiled) {
return false;
/// Creates a scratch buffer of `size` bytes through VMA.
/// @param size       Buffer size in bytes.
/// @param is_storage When true the buffer is a transfer source with no host-access request;
///                   otherwise it is a host-visible, sequentially written transfer destination.
/// @return The raw buffer handle paired with its VMA allocation.
TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /*= false*/) {
    // Every scratch buffer is usable as a storage buffer; the transfer bit depends on its role.
    vk::BufferUsageFlags buffer_usage = vk::BufferUsageFlagBits::eStorageBuffer;
    if (is_storage) {
        buffer_usage |= vk::BufferUsageFlagBits::eTransferSrc;
    } else {
        buffer_usage |= vk::BufferUsageFlagBits::eTransferDst;
    }
    const vk::BufferCreateInfo create_info{
        .size = size,
        .usage = buffer_usage,
    };
    const auto raw_create_info = static_cast<VkBufferCreateInfo>(create_info);

    VmaAllocationCreateInfo vma_info{};
    // Buffers above 128 MB prefer host memory, everything else prefers device memory.
    if (size > 128_MB) {
        vma_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
    } else {
        vma_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
    }
    if (!is_storage) {
        // The non-storage buffer is written from the CPU, so it has to be host visible.
        vma_info.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT |
                         VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
        vma_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
    }

    VkBuffer handle;
    VmaAllocation vma_allocation;
    const auto result = vmaCreateBuffer(instance.GetAllocator(), &raw_create_info, &vma_info,
                                        &handle, &vma_allocation, nullptr);
    ASSERT(result == VK_SUCCESS);
    return {handle, vma_allocation};
}
/// Copies `size` bytes from `data` into the start of the given scratch buffer.
/// @param buffer Scratch buffer whose allocation is host-mappable.
/// @param data   Source bytes on the host.
/// @param size   Number of bytes to copy; must not exceed the allocation size.
void TileManager::Upload(ScratchBuffer buffer, const void* data, size_t size) {
    const auto allocator = instance.GetAllocator();

    VmaAllocationInfo alloc_info{};
    vmaGetAllocationInfo(allocator, buffer.second, &alloc_info);
    ASSERT(size <= alloc_info.size);

    void* ptr{};
    const auto result = vmaMapMemory(allocator, buffer.second, &ptr);
    ASSERT(result == VK_SUCCESS);
    std::memcpy(ptr, data, size);

    // The allocation is only required to be HOST_VISIBLE, not HOST_COHERENT, so the write has to
    // be flushed explicitly to become visible to the device. VMA turns this into a no-op for
    // coherent memory types.
    const auto flush_result = vmaFlushAllocation(allocator, buffer.second, 0, size);
    ASSERT(flush_result == VK_SUCCESS);

    vmaUnmapMemory(allocator, buffer.second);
}
/// Destroys a scratch buffer together with its VMA allocation.
void TileManager::FreeBuffer(ScratchBuffer buffer) {
    const auto [handle, vma_allocation] = buffer;
    vmaDestroyBuffer(instance.GetAllocator(), handle, vma_allocation);
}
std::optional<vk::Buffer> TileManager::TryDetile(Image& image) {
if (!image.info.props.is_tiled) {
return std::nullopt;
}
const auto* detiler = GetDetiler(image);
if (!detiler) {
LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
return false;
return std::nullopt;
}
const auto offset =
staging.Copy(image.cpu_addr, image.info.guest_size_bytes, instance.StorageMinAlignment());
image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderWrite);
// Prepare input buffer
auto in_buffer = AllocBuffer(image.info.guest_size_bytes);
Upload(in_buffer, reinterpret_cast<const void*>(image.info.guest_address),
image.info.guest_size_bytes);
// Prepare output buffer
auto out_buffer = AllocBuffer(image.info.guest_size_bytes, true);
scheduler.DeferOperation([=, this]() {
FreeBuffer(in_buffer);
FreeBuffer(out_buffer);
});
auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *detiler->pl);
const vk::DescriptorBufferInfo input_buffer_info{
.buffer = staging.Handle(),
.offset = offset,
.buffer = in_buffer.first,
.offset = 0,
.range = image.info.guest_size_bytes,
};
ASSERT(image.view_for_detiler.has_value());
const vk::DescriptorImageInfo output_image_info{
.imageView = *image.view_for_detiler->image_view,
.imageLayout = image.layout,
const vk::DescriptorBufferInfo output_buffer_info{
.buffer = out_buffer.first,
.offset = 0,
.range = image.info.guest_size_bytes,
};
std::vector<vk::WriteDescriptorSet> set_writes{
@ -357,20 +415,44 @@ bool TileManager::TryDetile(Image& image) {
.dstBinding = 1,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageImage,
.pImageInfo = &output_image_info,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &output_buffer_info,
},
};
cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *detiler->pl_layout, 0,
set_writes);
cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u,
sizeof(image.info.pitch), &image.info.pitch);
DetilerParams params;
params.pitch0 = image.info.pitch >> (image.info.props.is_block ? 2u : 0u);
params.num_levels = image.info.resources.levels;
cmdbuf.dispatch((image.info.size.width * image.info.size.height) / 64, 1,
1); // round to 64
ASSERT(image.info.resources.levels <= 14);
std::memset(&params.sizes, 0, sizeof(params.sizes));
for (int m = 0; m < image.info.resources.levels; ++m) {
params.sizes[m] = image.info.mips_layout[m].size * image.info.resources.layers +
(m > 0 ? params.sizes[m - 1] : 0);
}
return true;
auto pitch = image.info.pitch;
cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(params),
&params);
ASSERT((image.info.guest_size_bytes % 64) == 0);
const auto bpp = image.info.num_bits * (image.info.props.is_block ? 16u : 1u);
const auto num_tiles = image.info.guest_size_bytes / (64 * (bpp / 8));
cmdbuf.dispatch(num_tiles, 1, 1);
const vk::BufferMemoryBarrier post_barrier{
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
.dstAccessMask = vk::AccessFlagBits::eTransferRead,
.buffer = out_buffer.first,
.size = image.info.guest_size_bytes,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
{}, post_barrier, {});
return {out_buffer.first};
}
} // namespace VideoCore

View File

@ -34,10 +34,16 @@ struct DetilerContext {
class TileManager {
public:
using ScratchBuffer = std::pair<VkBuffer, VmaAllocation>;
TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler);
~TileManager();
bool TryDetile(Image& image);
std::optional<vk::Buffer> TryDetile(Image& image);
ScratchBuffer AllocBuffer(u32 size, bool is_storage = false);
void Upload(ScratchBuffer buffer, const void* data, size_t size);
void FreeBuffer(ScratchBuffer buffer);
private:
const DetilerContext* GetDetiler(const Image& image) const;
@ -45,7 +51,6 @@ private:
private:
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
Vulkan::StreamBuffer staging;
std::array<DetilerContext, DetilerType::Max> detilers;
};