Implemented load_buffer_format_* conversions (#295)
* Implemented load_buffer_format_* conversions * clang-format insists on ugly things
This commit is contained in:
parent
c6cdfcfb0b
commit
f9e96793cc
|
@ -4,6 +4,8 @@
|
|||
#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
|
||||
#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
|
||||
|
||||
#include <magic_enum.hpp>
|
||||
|
||||
namespace Shader::Backend::SPIRV {
|
||||
namespace {
|
||||
|
||||
|
@ -209,57 +211,216 @@ void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 elemen
|
|||
ctx.OpStore(pointer, value);
|
||||
}
|
||||
|
||||
/// Loads one element from the buffer bound at `handle`, using `address` directly as the
/// array index. Only the index-only addressing mode is handled; any other combination of
/// the index/offset flags is treated as unreachable.
Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
    const auto flags = inst->Flags<IR::BufferInstInfo>();
    const auto& buf = ctx.buffers[handle];
    const bool index_only = flags.index_enable && !flags.offset_enable;
    if (!index_only) {
        UNREACHABLE();
    }
    // Member 0 of the buffer struct is the data array.
    const Id elem_ptr{ctx.OpAccessChain(buf.pointer_type, buf.id, ctx.u32_zero_value, address)};
    return ctx.OpLoad(buf.data_types->Get(1), elem_ptr);
}
|
||||
|
||||
/// U32 buffer load; forwards unchanged to the F32 loader, which picks the element type
/// from the buffer's own `data_types`.
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
    const Id loaded = EmitLoadBufferF32(ctx, inst, handle, address);
    return loaded;
}
|
||||
|
||||
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
const auto info = inst->Flags<IR::BufferInstInfo>();
|
||||
template <int N>
|
||||
static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) {
|
||||
const auto& buffer = ctx.buffers[handle];
|
||||
boost::container::static_vector<Id, 2> ids;
|
||||
for (u32 i = 0; i < 2; i++) {
|
||||
const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))};
|
||||
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
|
||||
Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
|
||||
if constexpr (N == 1) {
|
||||
const Id ptr{
|
||||
ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, address)};
|
||||
return ctx.OpLoad(buffer.data_types->Get(1), ptr);
|
||||
} else {
|
||||
boost::container::static_vector<Id, N> ids;
|
||||
for (u32 i = 0; i < N; i++) {
|
||||
index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i));
|
||||
const Id ptr{
|
||||
ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
|
||||
ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
|
||||
}
|
||||
return ctx.OpCompositeConstruct(buffer.data_types->Get(2), ids);
|
||||
return ctx.OpCompositeConstruct(buffer.data_types->Get(N), ids);
|
||||
}
|
||||
}
|
||||
|
||||
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
const auto info = inst->Flags<IR::BufferInstInfo>();
|
||||
const auto& buffer = ctx.buffers[handle];
|
||||
boost::container::static_vector<Id, 3> ids;
|
||||
for (u32 i = 0; i < 3; i++) {
|
||||
const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))};
|
||||
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
|
||||
ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
|
||||
}
|
||||
return ctx.OpCompositeConstruct(buffer.data_types->Get(3), ids);
|
||||
/// Raw single-element buffer load; the IR instruction argument is not used.
Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
    constexpr int num_elements = 1;
    return EmitLoadBufferF32xN<num_elements>(ctx, handle, address);
}
|
||||
|
||||
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
const auto info = inst->Flags<IR::BufferInstInfo>();
|
||||
const auto& buffer = ctx.buffers[handle];
|
||||
boost::container::static_vector<Id, 4> ids;
|
||||
for (u32 i = 0; i < 4; i++) {
|
||||
const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))};
|
||||
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
|
||||
ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
|
||||
/// Raw two-element buffer load; the IR instruction argument is not used.
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
    constexpr int num_elements = 2;
    return EmitLoadBufferF32xN<num_elements>(ctx, handle, address);
}
|
||||
|
||||
/// Raw three-element buffer load; the IR instruction argument is not used.
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
    constexpr int num_elements = 3;
    return EmitLoadBufferF32xN<num_elements>(ctx, handle, address);
}
|
||||
|
||||
/// Raw four-element buffer load; the IR instruction argument is not used.
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
    constexpr int num_elements = 4;
    return EmitLoadBufferF32xN<num_elements>(ctx, handle, address);
}
|
||||
|
||||
/// Reports whether an integer-based number format stores signed values.
///
/// @param format Number format of the buffer.
/// @return false for Unorm/Uscaled/Uint, true for Snorm/Sscaled/Sint/SnormNz.
/// Float and any other value are not expected here and hit UNREACHABLE().
static bool IsSignedInteger(AmdGpu::NumberFormat format) {
    switch (format) {
    case AmdGpu::NumberFormat::Unorm:
    case AmdGpu::NumberFormat::Uscaled:
    case AmdGpu::NumberFormat::Uint:
        return false;
    case AmdGpu::NumberFormat::Snorm:
    case AmdGpu::NumberFormat::Sscaled:
    case AmdGpu::NumberFormat::Sint:
    case AmdGpu::NumberFormat::SnormNz:
        return true;
    case AmdGpu::NumberFormat::Float:
    default:
        UNREACHABLE();
    }
}
|
||||
|
||||
/// Largest value representable by an unsigned integer of `bit_width` bits.
///
/// @param bit_width Width of the component in bits.
/// @return 2^bit_width - 1, saturating to all ones for widths of 32 bits or more.
static u32 UXBitsMax(u32 bit_width) {
    // Shifting a 32-bit operand by 32 or more is undefined behaviour, so handle it explicitly.
    if (bit_width >= 32u) {
        return ~0u;
    }
    return (1u << bit_width) - 1u;
}
|
||||
|
||||
/// Largest positive value of a signed integer of `bit_width` bits, i.e. 2^(bit_width-1) - 1.
static u32 SXBitsMax(u32 bit_width) {
    const u32 sign_bit = 1u << (bit_width - 1u);
    return sign_bit - 1u;
}
|
||||
|
||||
/// Applies the number-format specific scaling to a component that has already been
/// converted to a 32-bit float.
///
/// @param ctx       Emit context receiving the SPIR-V ops.
/// @param value     Float value of the raw component.
/// @param format    Number format of the buffer.
/// @param bit_width Width of the component in bits, used to derive the normalization divisor.
/// @return Id of the scaled value; scaled, integer and float formats are returned unchanged.
/// Any other format hits UNREACHABLE_MSG.
static Id ConvertValue(EmitContext& ctx, Id value, AmdGpu::NumberFormat format, u32 bit_width) {
    switch (format) {
    case AmdGpu::NumberFormat::Unorm:
        // x / UMAX
        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(UXBitsMax(bit_width))));
    case AmdGpu::NumberFormat::Snorm:
        // x / SMAX
        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width))));
    case AmdGpu::NumberFormat::SnormNz:
        // (x * 2 + 1) / (Format::SMAX * 2)
        value = ctx.OpFMul(ctx.F32[1], value, ctx.ConstF32(2.f));
        value = ctx.OpFAdd(ctx.F32[1], value, ctx.ConstF32(1.f));
        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width) * 2)));
    case AmdGpu::NumberFormat::Uscaled:
    case AmdGpu::NumberFormat::Sscaled:
    case AmdGpu::NumberFormat::Uint:
    case AmdGpu::NumberFormat::Sint:
    case AmdGpu::NumberFormat::Float:
        // No scaling is applied for these formats.
        return value;
    default:
        UNREACHABLE_MSG("Unsupported number format for conversion: {}",
                        magic_enum::enum_name(format));
    }
}
|
||||
|
||||
/// Builds the bit offset of a component inside the loaded 32-bit word.
/// For strides of at least four bytes this is just the constant `bit_offset`; for smaller
/// strides the position of the element within the word, (address % 4) * 8, is added.
static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offset) {
    const Id static_offset = ctx.ConstU32(bit_offset);
    if (stride >= 4) {
        return static_offset;
    }
    // comp_offset += (address % 4) * 8;
    const Id byte_in_word = ctx.OpUMod(ctx.U32[1], address, ctx.ConstU32(4u));
    const Id bits_in_word = ctx.OpShiftLeftLogical(ctx.U32[1], byte_in_word, ctx.ConstU32(3u));
    return ctx.OpIAdd(ctx.U32[1], static_offset, bits_in_word);
}
|
||||
|
||||
/// Emits the SPIR-V that fetches component `comp` of the element at `address` from the
/// buffer bound at `handle` and converts it to a 32-bit float according to the buffer's
/// data format and number format.
///
/// FormatInvalid and components the data format does not provide yield the constant zero.
/// Data formats outside the handled list, and float components that are neither 16 nor 32
/// bits wide, hit UNREACHABLE_MSG.
static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) {
    const auto& buffer = ctx.buffers[handle];
    const auto format = buffer.buffer.GetDataFmt();
    switch (format) {
    case AmdGpu::DataFormat::FormatInvalid:
        return ctx.f32_zero_value;
    case AmdGpu::DataFormat::Format8:
    case AmdGpu::DataFormat::Format16:
    case AmdGpu::DataFormat::Format32:
    case AmdGpu::DataFormat::Format8_8:
    case AmdGpu::DataFormat::Format16_16:
    case AmdGpu::DataFormat::Format10_11_11:
    case AmdGpu::DataFormat::Format11_11_10:
    case AmdGpu::DataFormat::Format10_10_10_2:
    case AmdGpu::DataFormat::Format2_10_10_10:
    case AmdGpu::DataFormat::Format8_8_8_8:
    case AmdGpu::DataFormat::Format32_32:
    case AmdGpu::DataFormat::Format16_16_16_16:
    case AmdGpu::DataFormat::Format32_32_32:
    case AmdGpu::DataFormat::Format32_32_32_32: {
        const u32 component_count = AmdGpu::NumComponents(format);
        if (comp >= component_count) {
            return ctx.f32_zero_value;
        }

        // uint index = address / 4;
        Id word_index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
        const u32 stride = buffer.buffer.GetStride();
        // Bit position of the component relative to the start of the element.
        const s32 comp_bit_pos = AmdGpu::ComponentOffset(format, comp);
        if (stride > 4) {
            // Elements wider than one word: step to the word that holds this component.
            const u32 word_skip = u32(comp_bit_pos / 32);
            if (word_skip > 0) {
                // index += index_offset;
                word_index = ctx.OpIAdd(ctx.U32[1], word_index, ctx.ConstU32(word_skip));
            }
        }
        const Id ptr =
            ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, word_index);

        const u32 bit_offset = comp_bit_pos % 32;
        const u32 bit_width = AmdGpu::ComponentBits(format, comp);
        const auto num_format = buffer.buffer.GetNumberFmt();
        if (num_format == AmdGpu::NumberFormat::Float) {
            if (bit_width == 32) {
                return ctx.OpLoad(ctx.F32[1], ptr);
            } else if (bit_width == 16) {
                // Extract the half from its word, reinterpret it as F16 and widen to F32.
                const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset);
                const Id word = ctx.OpLoad(ctx.U32[1], ptr);
                const Id extracted =
                    ctx.OpBitFieldSExtract(ctx.S32[1], word, comp_offset, ctx.ConstU32(bit_width));
                const Id narrowed = ctx.OpSConvert(ctx.U16, extracted);
                const Id half = ctx.OpBitcast(ctx.F16[1], narrowed);
                return ctx.OpFConvert(ctx.F32[1], half);
            } else {
                UNREACHABLE_MSG("Invalid float bit width {}", bit_width);
            }
        } else {
            Id value = ctx.OpLoad(ctx.U32[1], ptr);
            const bool is_signed = IsSignedInteger(num_format);
            if (bit_width < 32) {
                // Narrow components are first isolated from the containing word.
                const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset);
                value = is_signed ? ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset,
                                                           ctx.ConstU32(bit_width))
                                  : ctx.OpBitFieldUExtract(ctx.U32[1], value, comp_offset,
                                                           ctx.ConstU32(bit_width));
            }
            value = is_signed ? ctx.OpConvertSToF(ctx.F32[1], value)
                              : ctx.OpConvertUToF(ctx.F32[1], value);
            return ConvertValue(ctx, value, num_format, bit_width);
        }
        break;
    }
    default:
        UNREACHABLE_MSG("Invalid format for conversion: {}", magic_enum::enum_name(format));
    }
}
|
||||
|
||||
template <int N>
|
||||
static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
|
||||
if constexpr (N == 1) {
|
||||
return GetBufferFormatValue(ctx, handle, address, 0);
|
||||
} else {
|
||||
boost::container::static_vector<Id, N> ids;
|
||||
for (u32 i = 0; i < N; i++) {
|
||||
ids.push_back(GetBufferFormatValue(ctx, handle, address, i));
|
||||
}
|
||||
return ctx.OpCompositeConstruct(ctx.F32[N], ids);
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatted buffer load of a single component.
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
    constexpr int num_components = 1;
    return EmitLoadBufferFormatF32xN<num_components>(ctx, inst, handle, address);
}
|
||||
|
||||
/// Formatted buffer load of two components.
Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
    constexpr int num_components = 2;
    return EmitLoadBufferFormatF32xN<num_components>(ctx, inst, handle, address);
}
|
||||
|
||||
/// Formatted buffer load of three components.
Id EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
    constexpr int num_components = 3;
    return EmitLoadBufferFormatF32xN<num_components>(ctx, inst, handle, address);
}
|
||||
|
||||
/// Formatted buffer load of four components.
Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
    constexpr int num_components = 4;
    return EmitLoadBufferFormatF32xN<num_components>(ctx, inst, handle, address);
}
|
||||
|
||||
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
|
||||
|
|
|
@ -66,6 +66,10 @@ Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
|||
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
|
||||
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
|
||||
|
|
|
@ -301,9 +301,7 @@ void EmitContext::DefineBuffers(const Info& info) {
|
|||
for (u32 i = 0; const auto& buffer : info.buffers) {
|
||||
const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32;
|
||||
const Id data_type = (*data_types)[1];
|
||||
const u32 stride = buffer.stride == 0 ? 1 : buffer.stride;
|
||||
const u32 num_elements = stride * buffer.num_records;
|
||||
const Id record_array_type{TypeArray(data_type, ConstU32(num_elements))};
|
||||
const Id record_array_type{TypeArray(data_type, ConstU32(buffer.length))};
|
||||
const Id struct_type{TypeStruct(record_array_type)};
|
||||
if (std::ranges::find(type_ids, record_array_type.value, &Id::value) == type_ids.end()) {
|
||||
Decorate(record_array_type, spv::Decoration::ArrayStride, 4);
|
||||
|
@ -333,6 +331,7 @@ void EmitContext::DefineBuffers(const Info& info) {
|
|||
.id = id,
|
||||
.data_types = data_types,
|
||||
.pointer_type = pointer_type,
|
||||
.buffer = buffer.GetVsharp(info),
|
||||
});
|
||||
interfaces.push_back(id);
|
||||
i++;
|
||||
|
|
|
@ -201,6 +201,7 @@ public:
|
|||
Id id;
|
||||
const VectorIds* data_types;
|
||||
Id pointer_type;
|
||||
AmdGpu::Buffer buffer;
|
||||
};
|
||||
|
||||
u32& binding;
|
||||
|
|
|
@ -254,8 +254,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
|
|||
info.buffers.push_back({
|
||||
.sgpr_base = attrib.sgpr_base,
|
||||
.dword_offset = attrib.dword_offset,
|
||||
.stride = buffer.GetStride(),
|
||||
.num_records = buffer.num_records,
|
||||
.length = buffer.num_records,
|
||||
.used_types = IR::Type::F32,
|
||||
.is_storage = true, // we may not fit into UBO with large meshes
|
||||
.is_instance_data = true,
|
||||
|
@ -571,28 +570,40 @@ void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_l
|
|||
translator.V_CNDMASK_B32(inst);
|
||||
break;
|
||||
case Opcode::TBUFFER_LOAD_FORMAT_X:
|
||||
translator.BUFFER_LOAD_FORMAT(1, true, inst);
|
||||
translator.BUFFER_LOAD_FORMAT(1, true, true, inst);
|
||||
break;
|
||||
case Opcode::TBUFFER_LOAD_FORMAT_XY:
|
||||
translator.BUFFER_LOAD_FORMAT(2, true, inst);
|
||||
translator.BUFFER_LOAD_FORMAT(2, true, true, inst);
|
||||
break;
|
||||
case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
|
||||
translator.BUFFER_LOAD_FORMAT(3, true, inst);
|
||||
translator.BUFFER_LOAD_FORMAT(3, true, true, inst);
|
||||
break;
|
||||
case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
|
||||
translator.BUFFER_LOAD_FORMAT(4, true, inst);
|
||||
translator.BUFFER_LOAD_FORMAT(4, true, true, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_LOAD_FORMAT_X:
|
||||
case Opcode::BUFFER_LOAD_DWORD:
|
||||
translator.BUFFER_LOAD_FORMAT(1, false, inst);
|
||||
translator.BUFFER_LOAD_FORMAT(1, false, true, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_LOAD_FORMAT_XY:
|
||||
translator.BUFFER_LOAD_FORMAT(2, false, true, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_LOAD_FORMAT_XYZ:
|
||||
case Opcode::BUFFER_LOAD_DWORDX3:
|
||||
translator.BUFFER_LOAD_FORMAT(3, false, inst);
|
||||
translator.BUFFER_LOAD_FORMAT(3, false, true, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_LOAD_FORMAT_XYZW:
|
||||
translator.BUFFER_LOAD_FORMAT(4, false, true, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_LOAD_DWORD:
|
||||
translator.BUFFER_LOAD_FORMAT(1, false, false, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_LOAD_DWORDX2:
|
||||
translator.BUFFER_LOAD_FORMAT(2, false, false, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_LOAD_DWORDX3:
|
||||
translator.BUFFER_LOAD_FORMAT(3, false, false, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_LOAD_DWORDX4:
|
||||
translator.BUFFER_LOAD_FORMAT(4, false, inst);
|
||||
translator.BUFFER_LOAD_FORMAT(4, false, false, inst);
|
||||
break;
|
||||
case Opcode::BUFFER_STORE_FORMAT_X:
|
||||
case Opcode::BUFFER_STORE_DWORD:
|
||||
|
|
|
@ -160,7 +160,7 @@ public:
|
|||
void V_CMP_CLASS_F32(const GcnInst& inst);
|
||||
|
||||
// Vector Memory
|
||||
void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);
|
||||
void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst);
|
||||
void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);
|
||||
|
||||
// Vector interpolation
|
||||
|
|
|
@ -225,7 +225,8 @@ void Translator::IMAGE_STORE(const GcnInst& inst) {
|
|||
ir.ImageWrite(handle, body, value, {});
|
||||
}
|
||||
|
||||
void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst) {
|
||||
void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format,
|
||||
const GcnInst& inst) {
|
||||
const auto& mtbuf = inst.control.mtbuf;
|
||||
const IR::VectorReg vaddr{inst.src[0].code};
|
||||
const IR::ScalarReg sharp{inst.src[2].code * 4};
|
||||
|
@ -254,7 +255,8 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst
|
|||
const IR::Value handle =
|
||||
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
|
||||
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
|
||||
const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info);
|
||||
const IR::Value value = is_format ? ir.LoadBufferFormat(num_dwords, handle, address, info)
|
||||
: ir.LoadBuffer(num_dwords, handle, address, info);
|
||||
const IR::VectorReg dst_reg{inst.src[1].code};
|
||||
if (num_dwords == 1) {
|
||||
ir.SetVectorReg(dst_reg, IR::F32{value});
|
||||
|
|
|
@ -327,6 +327,22 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad
|
|||
}
|
||||
}
|
||||
|
||||
/// Emits the LoadBufferFormatF32* instruction matching `num_dwords` (1 to 4), carrying
/// `info` as instruction flags. Any other count hits UNREACHABLE_MSG.
Value IREmitter::LoadBufferFormat(int num_dwords, const Value& handle, const Value& address,
                                  BufferInstInfo info) {
    Opcode opcode{};
    switch (num_dwords) {
    case 1:
        opcode = Opcode::LoadBufferFormatF32;
        break;
    case 2:
        opcode = Opcode::LoadBufferFormatF32x2;
        break;
    case 3:
        opcode = Opcode::LoadBufferFormatF32x3;
        break;
    case 4:
        opcode = Opcode::LoadBufferFormatF32x4;
        break;
    default:
        UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
    }
    return Inst(opcode, Flags{info}, handle, address);
}
|
||||
|
||||
void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address,
|
||||
const Value& data, BufferInstInfo info) {
|
||||
switch (num_dwords) {
|
||||
|
|
|
@ -89,6 +89,8 @@ public:
|
|||
|
||||
[[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address,
|
||||
BufferInstInfo info);
|
||||
[[nodiscard]] Value LoadBufferFormat(int num_dwords, const Value& handle, const Value& address,
|
||||
BufferInstInfo info);
|
||||
void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data,
|
||||
BufferInstInfo info);
|
||||
|
||||
|
|
|
@ -79,6 +79,10 @@ OPCODE(LoadBufferF32, F32, Opaq
|
|||
OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferFormatF32, F32, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferFormatF32x2, F32x2, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferFormatF32x3, F32x3, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferFormatF32x4, F32x4, Opaque, Opaque, )
|
||||
OPCODE(LoadBufferU32, U32, Opaque, Opaque, )
|
||||
OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, )
|
||||
OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, )
|
||||
|
|
|
@ -27,6 +27,10 @@ bool IsBufferInstruction(const IR::Inst& inst) {
|
|||
case IR::Opcode::LoadBufferF32x2:
|
||||
case IR::Opcode::LoadBufferF32x3:
|
||||
case IR::Opcode::LoadBufferF32x4:
|
||||
case IR::Opcode::LoadBufferFormatF32:
|
||||
case IR::Opcode::LoadBufferFormatF32x2:
|
||||
case IR::Opcode::LoadBufferFormatF32x3:
|
||||
case IR::Opcode::LoadBufferFormatF32x4:
|
||||
case IR::Opcode::LoadBufferU32:
|
||||
case IR::Opcode::ReadConstBuffer:
|
||||
case IR::Opcode::ReadConstBufferU32:
|
||||
|
@ -41,8 +45,49 @@ bool IsBufferInstruction(const IR::Inst& inst) {
|
|||
}
|
||||
}
|
||||
|
||||
IR::Type BufferDataType(const IR::Inst& inst) {
|
||||
/// Returns true only for float number formats whose data format is built from 16-bit
/// components (Format16, Format16_16, Format16_16_16_16).
static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
    if (num_format != AmdGpu::NumberFormat::Float) {
        // Non-float number formats never use half floats.
        return false;
    }
    switch (data_format) {
    case AmdGpu::DataFormat::Format16:
    case AmdGpu::DataFormat::Format16_16:
    case AmdGpu::DataFormat::Format16_16_16_16:
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
|
||||
switch (inst.GetOpcode()) {
|
||||
case IR::Opcode::LoadBufferFormatF32:
|
||||
case IR::Opcode::LoadBufferFormatF32x2:
|
||||
case IR::Opcode::LoadBufferFormatF32x3:
|
||||
case IR::Opcode::LoadBufferFormatF32x4:
|
||||
switch (num_format) {
|
||||
case AmdGpu::NumberFormat::Unorm:
|
||||
case AmdGpu::NumberFormat::Snorm:
|
||||
case AmdGpu::NumberFormat::Uscaled:
|
||||
case AmdGpu::NumberFormat::Sscaled:
|
||||
case AmdGpu::NumberFormat::Uint:
|
||||
case AmdGpu::NumberFormat::Sint:
|
||||
case AmdGpu::NumberFormat::SnormNz:
|
||||
return IR::Type::U32;
|
||||
case AmdGpu::NumberFormat::Float:
|
||||
return IR::Type::F32;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
}
|
||||
case IR::Opcode::LoadBufferF32:
|
||||
case IR::Opcode::LoadBufferF32x2:
|
||||
case IR::Opcode::LoadBufferF32x3:
|
||||
|
@ -141,7 +186,7 @@ public:
|
|||
desc.inline_cbuf == existing.inline_cbuf;
|
||||
})};
|
||||
auto& buffer = buffer_resources[index];
|
||||
ASSERT(buffer.stride == desc.stride && buffer.num_records == desc.num_records);
|
||||
ASSERT(buffer.length == desc.length);
|
||||
buffer.is_storage |= desc.is_storage;
|
||||
buffer.used_types |= desc.used_types;
|
||||
return index;
|
||||
|
@ -263,6 +308,41 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
|
|||
|
||||
static constexpr size_t MaxUboSize = 65536;
|
||||
|
||||
/// True when the instruction is one of the four LoadBufferFormatF32* opcodes.
static bool IsLoadBufferFormat(const IR::Inst& inst) {
    const IR::Opcode opcode = inst.GetOpcode();
    return opcode == IR::Opcode::LoadBufferFormatF32 ||
           opcode == IR::Opcode::LoadBufferFormatF32x2 ||
           opcode == IR::Opcode::LoadBufferFormatF32x3 ||
           opcode == IR::Opcode::LoadBufferFormatF32x4;
}
|
||||
|
||||
/// True when the instruction is ReadConstBuffer or ReadConstBufferU32.
static bool IsReadConstBuffer(const IR::Inst& inst) {
    const IR::Opcode opcode = inst.GetOpcode();
    return opcode == IR::Opcode::ReadConstBuffer || opcode == IR::Opcode::ReadConstBufferU32;
}
|
||||
|
||||
/// Computes the buffer length from its stride and record count, treating sizeof(f32) as
/// the element size. The stride must divide, or be a multiple of, sizeof(f32); this is
/// asserted.
static u32 BufferLength(const AmdGpu::Buffer& buffer) {
    constexpr size_t element_size = sizeof(f32);
    const auto stride = buffer.GetStride();
    if (stride == element_size) {
        return buffer.num_records;
    }
    if (stride < element_size) {
        ASSERT(element_size % stride == 0);
        // NOTE(review): if num_records can be 0 and is unsigned, the subtraction wraps —
        // confirm callers never pass an empty buffer.
        return (((buffer.num_records - 1) / element_size) + 1) * stride;
    }
    ASSERT(stride % element_size == 0);
    return buffer.num_records * (stride / element_size);
}
|
||||
|
||||
s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
|
||||
AmdGpu::Buffer& cbuf) {
|
||||
|
||||
|
@ -298,9 +378,8 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
|
|||
return descriptors.Add(BufferResource{
|
||||
.sgpr_base = std::numeric_limits<u32>::max(),
|
||||
.dword_offset = 0,
|
||||
.stride = cbuf.GetStride(),
|
||||
.num_records = u32(cbuf.num_records),
|
||||
.used_types = BufferDataType(inst),
|
||||
.length = BufferLength(cbuf),
|
||||
.used_types = BufferDataType(inst, cbuf.GetNumberFmt()),
|
||||
.inline_cbuf = cbuf,
|
||||
.is_storage = IsBufferStore(inst) || cbuf.GetSize() > MaxUboSize,
|
||||
});
|
||||
|
@ -318,9 +397,8 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
|
|||
binding = descriptors.Add(BufferResource{
|
||||
.sgpr_base = sharp.sgpr_base,
|
||||
.dword_offset = sharp.dword_offset,
|
||||
.stride = buffer.GetStride(),
|
||||
.num_records = u32(buffer.num_records),
|
||||
.used_types = BufferDataType(inst),
|
||||
.length = BufferLength(buffer),
|
||||
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
|
||||
.is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize,
|
||||
});
|
||||
}
|
||||
|
@ -337,24 +415,31 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
|
|||
inst_info.dmft == AmdGpu::DataFormat::Format32_32 ||
|
||||
inst_info.dmft == AmdGpu::DataFormat::Format32));
|
||||
}
|
||||
if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer ||
|
||||
inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) {
|
||||
|
||||
if (IsReadConstBuffer(inst)) {
|
||||
return;
|
||||
}
|
||||
// Calculate buffer address.
|
||||
const u32 dword_stride = buffer.GetStrideElements(sizeof(u32));
|
||||
const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32);
|
||||
IR::U32 address = ir.Imm32(dword_offset);
|
||||
if (inst_info.index_enable && inst_info.offset_enable) {
|
||||
const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 1)};
|
||||
const IR::U32 index{ir.CompositeExtract(inst.Arg(1), 0)};
|
||||
address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address);
|
||||
address = ir.IAdd(address, ir.ShiftRightLogical(offset, ir.Imm32(2)));
|
||||
} else if (inst_info.index_enable) {
|
||||
const IR::U32 index{inst.Arg(1)};
|
||||
address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address);
|
||||
} else if (inst_info.offset_enable) {
|
||||
const IR::U32 offset{inst.Arg(1)};
|
||||
|
||||
if (IsLoadBufferFormat(inst)) {
|
||||
if (UseFP16(buffer.GetDataFmt(), buffer.GetNumberFmt())) {
|
||||
info.uses_fp16 = true;
|
||||
}
|
||||
} else {
|
||||
const u32 stride = buffer.GetStride();
|
||||
ASSERT_MSG(stride >= 4, "non-formatting load_buffer_* is not implemented for stride {}",
|
||||
stride);
|
||||
}
|
||||
|
||||
IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
|
||||
if (inst_info.index_enable) {
|
||||
const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
|
||||
: IR::U32{inst.Arg(1)};
|
||||
address = ir.IAdd(address, ir.IMul(index, ir.Imm32(buffer.GetStride())));
|
||||
}
|
||||
if (inst_info.offset_enable) {
|
||||
const IR::U32 offset = inst_info.index_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
|
||||
: IR::U32{inst.Arg(1)};
|
||||
address = ir.IAdd(address, offset);
|
||||
}
|
||||
inst.SetArg(1, address);
|
||||
}
|
||||
|
|
|
@ -74,8 +74,7 @@ struct Info;
|
|||
struct BufferResource {
|
||||
u32 sgpr_base;
|
||||
u32 dword_offset;
|
||||
u32 stride;
|
||||
u32 num_records;
|
||||
u32 length;
|
||||
IR::Type used_types;
|
||||
AmdGpu::Buffer inline_cbuf;
|
||||
bool is_storage{false};
|
||||
|
|
|
@ -66,4 +66,110 @@ int NumBits(DataFormat format) {
|
|||
return num_bits_per_element[index];
|
||||
}
|
||||
|
||||
// Bit width of each of the (up to four) components, indexed by the numeric DataFormat value.
// A zero entry means the component is absent or the format has no entry in this table.
static constexpr std::array<std::array<int, 4>, 42> component_bits = {{
    {0, 0, 0, 0},     //  0 FormatInvalid
    {8, 0, 0, 0},     //  1 Format8
    {16, 0, 0, 0},    //  2 Format16
    {8, 8, 0, 0},     //  3 Format8_8
    {32, 0, 0, 0},    //  4 Format32
    {16, 16, 0, 0},   //  5 Format16_16
    {10, 11, 11, 0},  //  6 Format10_11_11
    {11, 11, 10, 0},  //  7 Format11_11_10
    {10, 10, 10, 2},  //  8 Format10_10_10_2
    {2, 10, 10, 10},  //  9 Format2_10_10_10
    {8, 8, 8, 8},     // 10 Format8_8_8_8
    {32, 32, 0, 0},   // 11 Format32_32
    {16, 16, 16, 16}, // 12 Format16_16_16_16
    {32, 32, 32, 0},  // 13 Format32_32_32
    {32, 32, 32, 32}, // 14 Format32_32_32_32
    {0, 0, 0, 0},     // 15
    {5, 6, 5, 0},     // 16 Format5_6_5
    {1, 5, 5, 5},     // 17 Format1_5_5_5
    {5, 5, 5, 1},     // 18 Format5_5_5_1
    {4, 4, 4, 4},     // 19 Format4_4_4_4
    {8, 24, 0, 0},    // 20 Format8_24
    {24, 8, 0, 0},    // 21 Format24_8
    {24, 8, 0, 0},    // 22 FormatX24_8_32
    {0, 0, 0, 0},     // 23
    {0, 0, 0, 0},     // 24
    {0, 0, 0, 0},     // 25
    {0, 0, 0, 0},     // 26
    {0, 0, 0, 0},     // 27
    {0, 0, 0, 0},     // 28
    {0, 0, 0, 0},     // 29
    {0, 0, 0, 0},     // 30
    {0, 0, 0, 0},     // 31
    {0, 0, 0, 0},     // 32 FormatGB_GR
    {0, 0, 0, 0},     // 33 FormatBG_RG
    {0, 0, 0, 0},     // 34 Format5_9_9_9
    {0, 0, 0, 0},     // 35 FormatBc1
    {0, 0, 0, 0},     // 36 FormatBc2
    {0, 0, 0, 0},     // 37 FormatBc3
    {0, 0, 0, 0},     // 38 FormatBc4
    {0, 0, 0, 0},     // 39 FormatBc5
    {0, 0, 0, 0},     // 40 FormatBc6
    {0, 0, 0, 0},     // 41 FormatBc7
}};
|
||||
|
||||
/// Looks up the bit width of component `comp` of `format` in `component_bits`.
/// Returns 0 when the format is outside the table or `comp` is not in 0..3.
u32 ComponentBits(DataFormat format, u32 comp) {
    const auto row = static_cast<u32>(format);
    const bool in_range = row < component_bits.size() && comp < 4;
    return in_range ? component_bits[row][comp] : 0;
}
|
||||
|
||||
// Bit offset of each of the (up to four) components from the start of an element, indexed by
// the numeric DataFormat value. -1 marks an absent component or a format without an entry.
static constexpr std::array<std::array<int, 4>, 42> component_offset = {{
    {-1, -1, -1, -1}, //  0 FormatInvalid
    {0, -1, -1, -1},  //  1 Format8
    {0, -1, -1, -1},  //  2 Format16
    {0, 8, -1, -1},   //  3 Format8_8
    {0, -1, -1, -1},  //  4 Format32
    {0, 16, -1, -1},  //  5 Format16_16
    {0, 10, 21, -1},  //  6 Format10_11_11
    {0, 11, 22, -1},  //  7 Format11_11_10
    {0, 10, 20, 30},  //  8 Format10_10_10_2
    {0, 2, 12, 22},   //  9 Format2_10_10_10
    {0, 8, 16, 24},   // 10 Format8_8_8_8
    {0, 32, -1, -1},  // 11 Format32_32
    {0, 16, 32, 48},  // 12 Format16_16_16_16
    {0, 32, 64, -1},  // 13 Format32_32_32
    {0, 32, 64, 96},  // 14 Format32_32_32_32
    {-1, -1, -1, -1}, // 15
    {0, 5, 11, -1},   // 16 Format5_6_5
    {0, 1, 6, 11},    // 17 Format1_5_5_5
    {0, 5, 10, 15},   // 18 Format5_5_5_1
    {0, 4, 8, 12},    // 19 Format4_4_4_4
    {0, 8, -1, -1},   // 20 Format8_24
    {0, 24, -1, -1},  // 21 Format24_8
    {0, 24, -1, -1},  // 22 FormatX24_8_32
    {-1, -1, -1, -1}, // 23
    {-1, -1, -1, -1}, // 24
    {-1, -1, -1, -1}, // 25
    {-1, -1, -1, -1}, // 26
    {-1, -1, -1, -1}, // 27
    {-1, -1, -1, -1}, // 28
    {-1, -1, -1, -1}, // 29
    {-1, -1, -1, -1}, // 30
    {-1, -1, -1, -1}, // 31
    {-1, -1, -1, -1}, // 32 FormatGB_GR
    {-1, -1, -1, -1}, // 33 FormatBG_RG
    {-1, -1, -1, -1}, // 34 Format5_9_9_9
    {-1, -1, -1, -1}, // 35 FormatBc1
    {-1, -1, -1, -1}, // 36 FormatBc2
    {-1, -1, -1, -1}, // 37 FormatBc3
    {-1, -1, -1, -1}, // 38 FormatBc4
    {-1, -1, -1, -1}, // 39 FormatBc5
    {-1, -1, -1, -1}, // 40 FormatBc6
    {-1, -1, -1, -1}, // 41 FormatBc7
}};
|
||||
|
||||
/// Looks up the bit offset of component `comp` of `format` in `component_offset`.
/// Returns -1 when the format is outside the table or `comp` is not in 0..3.
s32 ComponentOffset(DataFormat format, u32 comp) {
    const auto row = static_cast<u32>(format);
    const bool in_range = row < component_offset.size() && comp < 4;
    return in_range ? component_offset[row][comp] : -1;
}
|
||||
|
||||
} // namespace AmdGpu
|
||||
|
|
|
@ -65,6 +65,8 @@ enum class NumberFormat : u32 {
|
|||
|
||||
int NumComponents(DataFormat format);
|
||||
int NumBits(DataFormat format);
|
||||
u32 ComponentBits(DataFormat format, u32 comp);
|
||||
s32 ComponentOffset(DataFormat format, u32 comp);
|
||||
|
||||
} // namespace AmdGpu
|
||||
|
||||
|
|
|
@ -62,14 +62,6 @@ struct Buffer {
|
|||
return stride == 0 ? 1U : stride;
|
||||
}
|
||||
|
||||
/// Stride expressed in elements of `element_size` bytes; a zero stride counts as one element.
/// Asserts that the stride is a whole multiple of `element_size`.
u32 GetStrideElements(u32 element_size) const noexcept {
    if (stride != 0) {
        ASSERT(stride % element_size == 0);
        return stride / element_size;
    }
    return 1U;
}
|
||||
|
||||
/// Product of the effective stride (GetStride()) and the number of records.
u32 GetSize() const noexcept {
    const auto effective_stride = GetStride();
    return effective_stride * num_records;
}
|
||||
|
|
Loading…
Reference in New Issue