shader_recompiler: Better branch detection + more opcodes

raphaelthegreat 2024-06-01 20:25:31 +03:00
parent f624f7749c
commit 02a50265f8
31 changed files with 772 additions and 120 deletions

View File

@ -9,6 +9,7 @@
#include "shader_recompiler/backend/spirv/emit_spirv.h"
#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
#include "shader_recompiler/frontend/translate/translate.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/program.h"
@ -28,6 +29,8 @@ ArgType Arg(EmitContext& ctx, const IR::Value& arg) {
return arg;
} else if constexpr (std::is_same_v<ArgType, u32>) {
return arg.U32();
} else if constexpr (std::is_same_v<ArgType, u64>) {
return arg.U64();
} else if constexpr (std::is_same_v<ArgType, IR::Attribute>) {
return arg.Attribute();
} else if constexpr (std::is_same_v<ArgType, IR::ScalarReg>) {
@ -279,6 +282,10 @@ void EmitGetVccLo(EmitContext& ctx) {
throw LogicError("Unreachable instruction");
}
void EmitGetVccHi(EmitContext& ctx) {
throw LogicError("Unreachable instruction");
}
void EmitSetScc(EmitContext& ctx) {
throw LogicError("Unreachable instruction");
}
@ -295,4 +302,8 @@ void EmitSetVccLo(EmitContext& ctx) {
throw LogicError("Unreachable instruction");
}
void EmitSetVccHi(EmitContext& ctx) {
throw LogicError("Unreachable instruction");
}
} // namespace Shader::Backend::SPIRV

View File

@ -33,6 +33,14 @@ Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) {
return ctx.ConstU32(ctx.info.user_data[static_cast<size_t>(reg)]);
}
void EmitGetThreadBitScalarReg(EmitContext& ctx) {
throw LogicError("Unreachable instruction");
}
void EmitSetThreadBitScalarReg(EmitContext& ctx) {
throw LogicError("Unreachable instruction");
}
void EmitGetScalarRegister(EmitContext&) {
throw LogicError("Unreachable instruction");
}
@ -68,7 +76,7 @@ Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
}
Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index) {
return ctx.OpBitcast(ctx.U32[1], EmitReadConstBuffer(ctx, handle, index));
}
Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) {
@ -86,7 +94,13 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) {
return ctx.OpLoad(param.component_type, param.id);
}
}
switch (attr) {
case IR::Attribute::FragCoord:
return ctx.OpLoad(ctx.F32[1],
ctx.OpAccessChain(ctx.input_f32, ctx.frag_coord, ctx.ConstU32(comp)));
default:
throw NotImplementedException("Read attribute {}", attr);
}
}
Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
@ -98,6 +112,9 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
case IR::Attribute::LocalInvocationId:
return ctx.OpCompositeExtract(ctx.U32[1], ctx.OpLoad(ctx.U32[3], ctx.local_invocation_id),
comp);
case IR::Attribute::IsFrontFace:
return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1[1], ctx.front_facing), ctx.u32_one_value,
ctx.u32_zero_value);
default:
throw NotImplementedException("Read U32 attribute {}", attr);
}
@ -136,19 +153,13 @@ Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address)
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
const auto info = inst->Flags<IR::BufferInstInfo>();
const auto& buffer = ctx.buffers[handle];
boost::container::static_vector<Id, 4> ids;
for (u32 i = 0; i < 4; i++) {
const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))};
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
}
return ctx.OpCompositeConstruct(buffer.data_types->Get(4), ids);
}
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {

View File

@ -34,14 +34,18 @@ void EmitGetScc(EmitContext& ctx);
void EmitGetExec(EmitContext& ctx);
void EmitGetVcc(EmitContext& ctx);
void EmitGetVccLo(EmitContext& ctx);
void EmitGetVccHi(EmitContext& ctx);
void EmitSetScc(EmitContext& ctx);
void EmitSetExec(EmitContext& ctx);
void EmitSetVcc(EmitContext& ctx);
void EmitSetVccLo(EmitContext& ctx);
void EmitSetVccHi(EmitContext& ctx);
void EmitPrologue(EmitContext& ctx);
void EmitEpilogue(EmitContext& ctx);
void EmitDiscard(EmitContext& ctx);
Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg);
void EmitGetThreadBitScalarReg(EmitContext& ctx);
void EmitSetThreadBitScalarReg(EmitContext& ctx);
void EmitGetScalarRegister(EmitContext& ctx);
void EmitSetScalarRegister(EmitContext& ctx);
void EmitGetVectorRegister(EmitContext& ctx);

View File

@ -94,6 +94,7 @@ void EmitContext::DefineArithmeticTypes() {
true_value = ConstantTrue(U1[1]);
false_value = ConstantFalse(U1[1]);
u32_one_value = ConstU32(1U);
u32_zero_value = ConstU32(0U);
f32_zero_value = ConstF32(0.0f);
@ -177,21 +178,24 @@ void EmitContext::DefineInputs(const Info& info) {
}
break;
case Stage::Fragment:
frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input);
for (const auto& input : info.ps_inputs) {
const u32 semantic = input.param_index;
if (input.is_default) {
input_params[semantic] = {MakeDefaultValue(*this, input.default_value), input_f32,
F32[1]};
continue;
}
const IR::Attribute param{IR::Attribute::Param0 + input.param_index};
const u32 num_components = info.loads.NumComponents(param);
const Id type{F32[num_components]};
const Id id{DefineInput(type, semantic)};
if (input.is_flat) {
Decorate(id, spv::Decoration::Flat);
}
Name(id, fmt::format("fs_in_attr{}", input.semantic));
input_params[input.semantic] = {id, input_f32, F32[1], num_components};
Name(id, fmt::format("fs_in_attr{}", semantic));
input_params[semantic] = {id, input_f32, F32[1], num_components};
interfaces.push_back(id);
}
break;
@ -260,7 +264,7 @@ void EmitContext::DefineBuffers(const Info& info) {
const Id id{AddGlobalVariable(struct_pointer_type, storage_class)};
Decorate(id, spv::Decoration::Binding, binding);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, fmt::format("{}{}", buffer.is_storage ? "ssbo" : "cbuf", i));
Name(id, fmt::format("{}_{}", buffer.is_storage ? "ssbo" : "cbuf", buffer.sgpr_base));
binding++;
buffers.push_back({
@ -318,7 +322,9 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) {
case AmdGpu::ImageType::Color2DArray:
return ctx.TypeImage(sampled_type, spv::Dim::Dim2D, false, true, false, 1, format);
case AmdGpu::ImageType::Color3D:
return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, 1, format);
case AmdGpu::ImageType::Cube:
return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, false, false, 1, format);
case AmdGpu::ImageType::Buffer:
throw NotImplementedException("Image buffer");
default:

View File

@ -140,6 +140,7 @@ public:
Id true_value{};
Id false_value{};
Id u32_one_value{};
Id u32_zero_value{};
Id f32_zero_value{};
@ -154,6 +155,8 @@ public:
Id output_position{};
Id vertex_index{};
Id base_vertex{};
Id frag_coord{};
Id front_facing{};
std::array<Id, 8> frag_color{};
Id workgroup_id{};

View File

@ -38,8 +38,145 @@ void Translator::S_CMP(ConditionOp cond, bool is_signed, const GcnInst& inst) {
}
void Translator::S_ANDN2_B64(const GcnInst& inst) {
// TODO: What if this is used for something other than EXEC masking?
const auto get_src = [&](const InstOperand& operand) {
switch (operand.field) {
case OperandField::VccLo:
return ir.GetVcc();
case OperandField::ExecLo:
return ir.GetExec();
case OperandField::ScalarGPR:
return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
default:
UNREACHABLE();
}
};
const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])};
const IR::U1 result{ir.LogicalAnd(src0, ir.LogicalNot(src1))};
SetDst(inst.dst[0], result);
ir.SetScc(result);
}
void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) {
// This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs).
// However, here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination
// SGPR we have a special IR opcode for SGPRs that act as thread masks.
const IR::U1 exec{ir.GetExec()};
// Mark the destination SGPR as an EXEC context. This means we will use the
// 1-bit IR instruction whenever it is loaded.
ASSERT(inst.dst[0].field == OperandField::ScalarGPR);
const u32 reg = inst.dst[0].code;
exec_contexts[reg] = true;
ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec);
// Update EXEC.
ASSERT(inst.src[0].field == OperandField::VccLo);
ir.SetExec(ir.LogicalAnd(exec, ir.GetVcc()));
}
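// For reference, a minimal model of the 64-bit semantics being flattened here
// (an illustrative sketch; WaveState and AndSaveExec are our names, not IR):
//
//     struct WaveState {
//         u64 exec; // active-lane mask
//         u64 vcc;  // vector condition mask
//     };
//
//     // s_and_saveexec_b64 sdst, vcc: sdst = EXEC, then EXEC &= VCC.
//     u64 AndSaveExec(WaveState& w) {
//         const u64 saved = w.exec;
//         w.exec &= w.vcc;
//         return saved; // written to the destination SGPR pair
//     }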
void Translator::S_MOV_B64(const GcnInst& inst) {
// TODO: Handle S_MOV_B64 with VCC as a source or destination (VCC used as an EXEC context).
if (inst.src[0].field == OperandField::VccLo || inst.dst[0].field == OperandField::VccLo) {
return;
}
const IR::U1 src0{GetSrc(inst.src[0])};
if (inst.dst[0].field == OperandField::ScalarGPR && inst.src[0].field == OperandField::ExecLo) {
// Exec context push
exec_contexts[inst.dst[0].code] = true;
} else if (inst.dst[0].field == OperandField::ExecLo &&
inst.src[0].field == OperandField::ScalarGPR) {
// Exec context pop
exec_contexts[inst.src[0].code] = false;
} else if (inst.src[0].field != OperandField::ConstZero) {
UNREACHABLE();
}
SetDst(inst.dst[0], src0);
}
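// Illustrative GCN sequence being tracked (hypothetical shader code, not from
// this commit):
//
//     s_mov_b64 s[2:3], exec   ; push: exec_contexts[2] becomes true
//     ...                      ; divergent section
//     s_mov_b64 exec, s[2:3]   ; pop:  exec_contexts[2] becomes false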
void Translator::S_OR_B64(bool negate, const GcnInst& inst) {
const auto get_src = [&](const InstOperand& operand) {
switch (operand.field) {
case OperandField::VccLo:
return ir.GetVcc();
case OperandField::ScalarGPR:
return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
default:
UNREACHABLE();
}
};
const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])};
IR::U1 result = ir.LogicalOr(src0, src1);
if (negate) {
result = ir.LogicalNot(result);
}
ASSERT(inst.dst[0].field == OperandField::VccLo);
ir.SetVcc(result);
ir.SetScc(result);
}
void Translator::S_AND_B64(const GcnInst& inst) {
const auto get_src = [&](const InstOperand& operand) {
switch (operand.field) {
case OperandField::VccLo:
return ir.GetVcc();
case OperandField::ExecLo:
return ir.GetExec();
case OperandField::ScalarGPR:
return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
default:
UNREACHABLE();
}
};
const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])};
const IR::U1 result = ir.LogicalAnd(src0, src1);
ASSERT(inst.dst[0].field == OperandField::VccLo);
ir.SetVcc(result);
ir.SetScc(result);
}
void Translator::S_ADD_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.IAdd(src0, src1));
// TODO: Overflow flag
}
void Translator::S_AND_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 result{ir.BitwiseAnd(src0, src1)};
SetDst(inst.dst[0], result);
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
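// The scalar ALU convention used here and below: SCC = (result != 0).
// A plain sketch (helper name is ours):
//
//     u32 SAndB32(u32 a, u32 b, bool& scc) {
//         const u32 result = a & b;
//         scc = result != 0;
//         return result;
//     }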
void Translator::S_LSHR_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 result{ir.ShiftRightLogical(src0, src1)};
SetDst(inst.dst[0], result);
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
void Translator::S_CSELECT_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], IR::U32{ir.Select(ir.GetScc(), src0, src1)});
}
void Translator::S_BFE_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 offset{ir.BitwiseAnd(src1, ir.Imm32(0x1F))};
const IR::U32 count{ir.BitFieldExtract(src1, ir.Imm32(16), ir.Imm32(7))};
const IR::U32 result{ir.BitFieldExtract(src0, offset, count)};
SetDst(inst.dst[0], result);
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
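// Operand encoding assumed above: src1[4:0] holds the bit offset and
// src1[22:16] holds the field width. A plain-C++ sketch of the extract:
//
//     u32 SBfeU32(u32 src0, u32 src1) {
//         const u32 offset = src1 & 0x1F;
//         const u32 width = (src1 >> 16) & 0x7F;
//         return width >= 32 ? src0 >> offset
//                            : (src0 >> offset) & ((1u << width) - 1u);
//     }
//
// Example: src1 = (8 << 16) | 4 extracts 8 bits starting at bit 4.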
} // namespace Shader::Gcn

View File

@ -9,11 +9,15 @@
namespace Shader::Gcn {
std::array<bool, IR::NumScalarRegs> Translator::exec_contexts{};
Translator::Translator(IR::Block* block_, Info& info_)
: ir{*block_, block_->begin()}, info{info_} {}
void Translator::EmitPrologue() {
exec_contexts.fill(false);
ir.Prologue();
ir.SetExec(ir.Imm1(true));
// Initialize user data.
IR::ScalarReg dst_sreg = IR::ScalarReg::S0;
@ -54,10 +58,16 @@ void Translator::EmitPrologue() {
}
}
IR::U1U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
// Input modifiers work on float values.
force_flt |= operand.input_modifier.abs | operand.input_modifier.neg;
IR::U1U32F32 value{};
switch (operand.field) {
case OperandField::ScalarGPR:
if (exec_contexts[operand.code]) {
value = ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
} else if (operand.type == ScalarType::Float32 || force_flt) {
value = ir.GetScalarReg<IR::F32>(IR::ScalarReg(operand.code));
} else {
@ -114,9 +124,15 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
case OperandField::ConstFloatNeg_2_0:
value = ir.Imm32(-2.0f);
break;
case OperandField::ExecLo:
value = ir.GetExec();
break;
case OperandField::VccLo:
value = ir.GetVccLo();
break;
case OperandField::VccHi:
value = ir.GetVccHi();
break;
default:
UNREACHABLE();
}
@ -130,8 +146,8 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
return value;
}
void Translator::SetDst(const InstOperand& operand, const IR::U1U32F32& value) {
IR::U1U32F32 result = value;
if (operand.output_modifier.multiplier != 0.f) {
result = ir.FPMul(result, ir.Imm32(operand.output_modifier.multiplier));
}
@ -140,14 +156,20 @@ void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) {
}
switch (operand.field) {
case OperandField::ScalarGPR:
if (value.Type() == IR::Type::U1) {
return ir.SetThreadBitScalarReg(IR::ScalarReg(operand.code), result);
}
return ir.SetScalarReg(IR::ScalarReg(operand.code), result);
case OperandField::VectorGPR:
return ir.SetVectorReg(IR::VectorReg(operand.code), result);
case OperandField::ExecLo:
return ir.SetExec(result);
case OperandField::VccLo:
return ir.SetVccLo(result);
case OperandField::VccHi:
return ir.SetVccHi(result);
case OperandField::M0:
break;
default:
UNREACHABLE();
}
@ -279,11 +301,32 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
case Opcode::IMAGE_SAMPLE:
translator.IMAGE_SAMPLE(inst);
break;
case Opcode::V_CMP_EQ_I32:
translator.V_CMP_U32(ConditionOp::EQ, true, false, inst);
break;
case Opcode::V_CMP_NE_U32:
translator.V_CMP_U32(ConditionOp::LG, false, false, inst);
break;
case Opcode::V_CMP_EQ_U32:
translator.V_CMP_U32(ConditionOp::EQ, false, false, inst);
break;
case Opcode::V_CMP_F_U32:
translator.V_CMP_U32(ConditionOp::F, false, false, inst);
break;
case Opcode::V_CMP_LT_U32:
translator.V_CMP_U32(ConditionOp::LT, false, false, inst);
break;
case Opcode::V_CMP_GT_U32:
translator.V_CMP_U32(ConditionOp::GT, false, false, inst);
break;
case Opcode::V_CMP_GE_U32:
translator.V_CMP_U32(ConditionOp::GE, false, false, inst);
break;
case Opcode::V_CMP_TRU_U32:
translator.V_CMP_U32(ConditionOp::TRU, false, false, inst);
break;
case Opcode::V_CMP_NEQ_F32:
translator.V_CMP_F32(ConditionOp::LG, inst);
break;
case Opcode::V_CMP_F_F32:
translator.V_CMP_F32(ConditionOp::F, inst);
@ -309,6 +352,9 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
case Opcode::S_CMP_LG_U32:
translator.S_CMP(ConditionOp::LG, false, inst);
break;
case Opcode::S_CMP_EQ_I32:
translator.S_CMP(ConditionOp::EQ, true, inst);
break;
case Opcode::V_CNDMASK_B32:
translator.V_CNDMASK_B32(inst);
break;
@ -348,13 +394,125 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
case Opcode::V_MIN3_F32:
translator.V_MIN3_F32(inst);
break;
case Opcode::V_MADMK_F32:
translator.V_MADMK_F32(inst);
break;
case Opcode::V_CUBEMA_F32:
translator.V_CUBEMA_F32(inst);
break;
case Opcode::V_CUBESC_F32:
translator.V_CUBESC_F32(inst);
break;
case Opcode::V_CUBETC_F32:
translator.V_CUBETC_F32(inst);
break;
case Opcode::V_CUBEID_F32:
translator.V_CUBEID_F32(inst);
break;
case Opcode::V_CVT_U32_F32:
translator.V_CVT_U32_F32(inst);
break;
case Opcode::V_SUBREV_F32:
translator.V_SUBREV_F32(inst);
break;
case Opcode::S_AND_SAVEEXEC_B64:
translator.S_AND_SAVEEXEC_B64(inst);
break;
case Opcode::S_MOV_B64:
translator.S_MOV_B64(inst);
break;
case Opcode::V_SUBREV_I32:
translator.V_SUBREV_I32(inst);
break;
case Opcode::V_CMP_LE_U32:
translator.V_CMP_U32(ConditionOp::LE, false, false, inst);
break;
case Opcode::V_CMP_GT_I32:
translator.V_CMP_U32(ConditionOp::GT, true, false, inst);
break;
case Opcode::V_CMPX_F_U32:
translator.V_CMP_U32(ConditionOp::F, false, true, inst);
break;
case Opcode::V_CMPX_LT_U32:
translator.V_CMP_U32(ConditionOp::LT, false, true, inst);
break;
case Opcode::V_CMPX_EQ_U32:
translator.V_CMP_U32(ConditionOp::EQ, false, true, inst);
break;
case Opcode::V_CMPX_LE_U32:
translator.V_CMP_U32(ConditionOp::LE, false, true, inst);
break;
case Opcode::V_CMPX_GT_U32:
translator.V_CMP_U32(ConditionOp::GT, false, true, inst);
break;
case Opcode::V_CMPX_NE_U32:
translator.V_CMP_U32(ConditionOp::LG, false, true, inst);
break;
case Opcode::V_CMPX_GE_U32:
translator.V_CMP_U32(ConditionOp::GE, false, true, inst);
break;
case Opcode::V_CMPX_TRU_U32:
translator.V_CMP_U32(ConditionOp::TRU, false, true, inst);
break;
case Opcode::S_OR_B64:
translator.S_OR_B64(false, inst);
break;
case Opcode::S_NOR_B64:
translator.S_OR_B64(true, inst);
break;
case Opcode::S_AND_B64:
translator.S_AND_B64(inst);
break;
case Opcode::V_LSHRREV_B32:
translator.V_LSHRREV_B32(inst);
break;
case Opcode::S_ADD_I32:
translator.S_ADD_I32(inst);
break;
case Opcode::V_MUL_LO_I32:
translator.V_MUL_LO_I32(inst);
break;
case Opcode::V_SAD_U32:
translator.V_SAD_U32(inst);
break;
case Opcode::V_BFE_U32:
translator.V_BFE_U32(inst);
break;
case Opcode::V_MAD_I32_I24:
translator.V_MAD_I32_I24(inst);
break;
case Opcode::V_MUL_I32_I24:
translator.V_MUL_I32_I24(inst);
break;
case Opcode::V_SUB_I32:
translator.V_SUB_I32(inst);
break;
case Opcode::V_LSHR_B32:
translator.V_LSHR_B32(inst);
break;
case Opcode::V_ASHRREV_I32:
translator.V_ASHRREV_I32(inst);
break;
case Opcode::V_MAD_U32_U24:
translator.V_MAD_U32_U24(inst);
break;
case Opcode::S_AND_B32:
translator.S_AND_B32(inst);
break;
case Opcode::S_LSHR_B32:
translator.S_LSHR_B32(inst);
break;
case Opcode::S_CSELECT_B32:
translator.S_CSELECT_B32(inst);
break;
case Opcode::S_BFE_U32:
translator.S_BFE_U32(inst);
break;
case Opcode::S_NOP:
case Opcode::S_CBRANCH_EXECZ:
case Opcode::S_CBRANCH_SCC0:
case Opcode::S_CBRANCH_SCC1:
case Opcode::S_BRANCH:
case Opcode::S_WQM_B64:
case Opcode::V_INTERP_P1_F32:
case Opcode::S_ENDPGM:

View File

@ -23,6 +23,7 @@ enum class ConditionOp : u32 {
GE,
LT,
LE,
TRU,
};
class Translator {
@ -37,6 +38,15 @@ public:
void S_MUL_I32(const GcnInst& inst);
void S_CMP(ConditionOp cond, bool is_signed, const GcnInst& inst);
void S_ANDN2_B64(const GcnInst& inst);
void S_AND_SAVEEXEC_B64(const GcnInst& inst);
void S_MOV_B64(const GcnInst& inst);
void S_OR_B64(bool negate, const GcnInst& inst);
void S_AND_B64(const GcnInst& inst);
void S_ADD_I32(const GcnInst& inst);
void S_AND_B32(const GcnInst& inst);
void S_LSHR_B32(const GcnInst& inst);
void S_CSELECT_B32(const GcnInst& inst);
void S_BFE_U32(const GcnInst& inst);
// Scalar Memory
void S_LOAD_DWORD(int num_dwords, const GcnInst& inst);
@ -48,7 +58,6 @@ public:
void V_MAC_F32(const GcnInst& inst);
void V_CVT_PKRTZ_F16_F32(const GcnInst& inst);
void V_MUL_F32(const GcnInst& inst);
void V_CNDMASK_B32(const GcnInst& inst);
void V_AND_B32(const GcnInst& inst);
void V_LSHLREV_B32(const GcnInst& inst);
@ -63,7 +72,6 @@ public:
void V_FLOOR_F32(const GcnInst& inst);
void V_SUB_F32(const GcnInst& inst);
void V_RCP_F32(const GcnInst& inst);
void V_FMA_F32(const GcnInst& inst);
void V_CMP_F32(ConditionOp op, const GcnInst& inst);
void V_MAX_F32(const GcnInst& inst);
@ -74,6 +82,25 @@ public:
void V_SQRT_F32(const GcnInst& inst);
void V_MIN_F32(const GcnInst& inst);
void V_MIN3_F32(const GcnInst& inst);
void V_MADMK_F32(const GcnInst& inst);
void V_CUBEMA_F32(const GcnInst& inst);
void V_CUBESC_F32(const GcnInst& inst);
void V_CUBETC_F32(const GcnInst& inst);
void V_CUBEID_F32(const GcnInst& inst);
void V_CVT_U32_F32(const GcnInst& inst);
void V_SUBREV_F32(const GcnInst& inst);
void V_SUBREV_I32(const GcnInst& inst);
void V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst);
void V_LSHRREV_B32(const GcnInst& inst);
void V_MUL_LO_I32(const GcnInst& inst);
void V_SAD_U32(const GcnInst& inst);
void V_BFE_U32(const GcnInst& inst);
void V_MAD_I32_I24(const GcnInst& inst);
void V_MUL_I32_I24(const GcnInst& inst);
void V_SUB_I32(const GcnInst& inst);
void V_LSHR_B32(const GcnInst& inst);
void V_ASHRREV_I32(const GcnInst& inst);
void V_MAD_U32_U24(const GcnInst& inst);
// Vector Memory
void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);
@ -94,12 +121,13 @@ public:
void EXP(const GcnInst& inst);
private:
IR::U1U32F32 GetSrc(const InstOperand& operand, bool flt_zero = false);
void SetDst(const InstOperand& operand, const IR::U1U32F32& value);
private:
IR::IREmitter ir;
Info& info;
static std::array<bool, IR::NumScalarRegs> exec_contexts;
};
void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info);

View File

@ -29,17 +29,6 @@ void Translator::V_MUL_F32(const GcnInst& inst) {
ir.SetVectorReg(dst_reg, ir.FPMul(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true)));
}
void Translator::V_CNDMASK_B32(const GcnInst& inst) {
const IR::VectorReg dst_reg{inst.dst[0].code};
const IR::ScalarReg flag_reg{inst.src[2].code};
@ -70,9 +59,9 @@ void Translator::V_AND_B32(const GcnInst& inst) {
void Translator::V_LSHLREV_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::VectorReg dst_reg{inst.dst[0].code};
ir.SetVectorReg(dst_reg, ir.ShiftLeftLogical(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F))));
}
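// "REV" opcodes swap the operand roles, so src1 is shifted by src0; only the
// low 5 bits of the shift amount are honored, hence the 0x1F mask:
//
//     u32 LshlRevB32(u32 src0, u32 src1) {
//         return src1 << (src0 & 0x1F);
//     }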
void Translator::V_ADD_I32(const GcnInst& inst) {
@ -148,14 +137,6 @@ void Translator::V_RCP_F32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.FPRecip(src0));
}
void Translator::V_FMA_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1], true)};
@ -182,6 +163,8 @@ void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) {
return ir.FPLessThanEqual(src0, src1);
case ConditionOp::GE:
return ir.FPGreaterThanEqual(src0, src1);
default:
UNREACHABLE();
}
}();
ir.SetVcc(result);
@ -231,4 +214,147 @@ void Translator::V_MIN3_F32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.FPMin(src0, ir.FPMin(src1, src2)));
}
void Translator::V_MADMK_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1], true)};
const IR::F32 k{GetSrc(inst.src[2], true)};
SetDst(inst.dst[0], ir.FPFma(src0, k, src1));
}
void Translator::V_CUBEMA_F32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.Imm32(1.f));
}
void Translator::V_CUBESC_F32(const GcnInst& inst) {
SetDst(inst.dst[0], GetSrc(inst.src[0], true));
}
void Translator::V_CUBETC_F32(const GcnInst& inst) {
SetDst(inst.dst[0], GetSrc(inst.src[1], true));
}
void Translator::V_CUBEID_F32(const GcnInst& inst) {
SetDst(inst.dst[0], GetSrc(inst.src[2], true));
}
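// The V_CUBE* stubs above intentionally pass the inputs through with the
// scale forced to 1.0 so that PatchCubeCoord in the resource tracking pass
// can later undo the guest's coordinate preparation.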
void Translator::V_CVT_U32_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
SetDst(inst.dst[0], ir.ConvertFToU(32, src0));
}
void Translator::V_SUBREV_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1], true)};
SetDst(inst.dst[0], ir.FPSub(src1, src0));
}
void Translator::V_SUBREV_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.ISub(src1, src0));
// TODO: Carry-out
}
void Translator::V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U1 result = [&] {
switch (op) {
case ConditionOp::F:
return ir.Imm1(false);
case ConditionOp::TRU:
return ir.Imm1(true);
case ConditionOp::EQ:
return ir.IEqual(src0, src1);
case ConditionOp::LG:
return ir.INotEqual(src0, src1);
case ConditionOp::GT:
return ir.IGreaterThan(src0, src1, is_signed);
case ConditionOp::LT:
return ir.ILessThan(src0, src1, is_signed);
case ConditionOp::LE:
return ir.ILessThanEqual(src0, src1, is_signed);
case ConditionOp::GE:
return ir.IGreaterThanEqual(src0, src1, is_signed);
default:
UNREACHABLE();
}
}();
if (set_exec) {
ir.SetExec(result);
}
switch (inst.dst[1].field) {
case OperandField::VccLo:
return ir.SetVcc(result);
case OperandField::ScalarGPR:
return ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
default:
UNREACHABLE();
}
}
void Translator::V_LSHRREV_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.ShiftRightLogical(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F))));
}
void Translator::V_MUL_LO_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.IMul(src0, src1));
}
void Translator::V_SAD_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 src2{GetSrc(inst.src[2])};
const IR::U32 max{ir.IMax(src0, src1, false)};
const IR::U32 min{ir.IMin(src0, src1, false)};
SetDst(inst.dst[0], ir.IAdd(ir.ISub(max, min), src2));
}
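// The max/min form computes |src0 - src1| without signed arithmetic:
// max(a, b) - min(a, b) == |a - b| for unsigned a and b.
// Example: a = 3, b = 10 -> 10 - 3 = 7 = |3 - 10|.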
void Translator::V_BFE_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{ir.BitwiseAnd(GetSrc(inst.src[1]), ir.Imm32(0x1F))};
const IR::U32 src2{ir.BitwiseAnd(GetSrc(inst.src[2]), ir.Imm32(0x1F))};
SetDst(inst.dst[0], ir.BitFieldExtract(src0, src1, src2));
}
void Translator::V_MAD_I32_I24(const GcnInst& inst) {
const IR::U32 src0{ir.BitFieldExtract(GetSrc(inst.src[0]), ir.Imm32(0), ir.Imm32(24), true)};
const IR::U32 src1{ir.BitFieldExtract(GetSrc(inst.src[1]), ir.Imm32(0), ir.Imm32(24), true)};
const IR::U32 src2{GetSrc(inst.src[2])};
SetDst(inst.dst[0], ir.IAdd(ir.IMul(src0, src1), src2));
}
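// The signed BitFieldExtract of bits [23:0] is the 24-bit sign extension.
// An equivalent plain-C++ sketch:
//
//     s32 SignExtend24(u32 v) {
//         return static_cast<s32>(v << 8) >> 8;
//     }
//     // result = SignExtend24(src0) * SignExtend24(src1) + s32(src2)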
void Translator::V_MUL_I32_I24(const GcnInst& inst) {
const IR::U32 src0{ir.BitFieldExtract(GetSrc(inst.src[0]), ir.Imm32(0), ir.Imm32(24), true)};
const IR::U32 src1{ir.BitFieldExtract(GetSrc(inst.src[1]), ir.Imm32(0), ir.Imm32(24), true)};
SetDst(inst.dst[0], ir.IMul(src0, src1));
}
void Translator::V_SUB_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.ISub(src0, src1));
}
void Translator::V_LSHR_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.ShiftRightLogical(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F))));
}
void Translator::V_ASHRREV_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.ShiftRightArithmetic(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F))));
}
void Translator::V_MAD_U32_U24(const GcnInst& inst) {
// TODO: Handle unsigned operands properly.
V_MAD_I32_I24(inst);
}
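// A proper unsigned variant would mask instead of sign-extend, e.g.
// (src0 & 0xFFFFFF) * (src1 & 0xFFFFFF) + src2; reusing the signed path is
// only correct while bit 23 of both operands is clear.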
} // namespace Shader::Gcn

View File

@ -8,7 +8,6 @@ namespace Shader::Gcn {
void Translator::V_INTERP_P2_F32(const GcnInst& inst) {
const IR::VectorReg dst_reg{inst.dst[0].code};
auto& attr = info.ps_inputs.at(inst.control.vintrp.attr);
const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index};
ir.SetVectorReg(dst_reg, ir.GetAttribute(attrib, inst.control.vintrp.chan));
}

View File

@ -119,6 +119,14 @@ U32 IREmitter::GetUserData(IR::ScalarReg reg) {
return Inst<U32>(Opcode::GetUserData, reg);
}
U1 IREmitter::GetThreadBitScalarReg(IR::ScalarReg reg) {
return Inst<U1>(Opcode::GetThreadBitScalarReg, reg);
}
void IREmitter::SetThreadBitScalarReg(IR::ScalarReg reg, const U1& value) {
Inst(Opcode::SetThreadBitScalarReg, reg, value);
}
template <>
U32 IREmitter::GetScalarReg(IR::ScalarReg reg) {
return Inst<U32>(Opcode::GetScalarRegister, reg);
@ -196,6 +204,10 @@ U32 IREmitter::GetVccLo() {
return Inst<U32>(Opcode::GetVccLo);
}
U32 IREmitter::GetVccHi() {
return Inst<U32>(Opcode::GetVccHi);
}
void IREmitter::SetScc(const U1& value) {
Inst(Opcode::SetScc, value);
}
@ -212,6 +224,10 @@ void IREmitter::SetVccLo(const U32& value) {
Inst(Opcode::SetVccLo, value);
}
void IREmitter::SetVccHi(const U32& value) {
Inst(Opcode::SetVccHi, value);
}
F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp) {
return Inst<F32>(Opcode::GetAttribute, attribute, Imm32(comp));
}

View File

@ -43,7 +43,9 @@ public:
void Epilogue();
void Discard();
[[nodiscard]] U32 GetUserData(IR::ScalarReg reg);
[[nodiscard]] U1 GetThreadBitScalarReg(IR::ScalarReg reg);
void SetThreadBitScalarReg(IR::ScalarReg reg, const U1& value);
template <typename T = U32>
[[nodiscard]] T GetScalarReg(IR::ScalarReg reg);
@ -59,10 +61,12 @@ public:
[[nodiscard]] U1 GetExec();
[[nodiscard]] U1 GetVcc();
[[nodiscard]] U32 GetVccLo();
[[nodiscard]] U32 GetVccHi();
void SetScc(const U1& value);
void SetExec(const U1& value);
void SetVcc(const U1& value);
void SetVccLo(const U32& value);
void SetVccHi(const U32& value);
[[nodiscard]] U1 Condition(IR::Condition cond);

View File

@ -21,6 +21,8 @@ OPCODE(ReadConstBufferU32, U32, Opaq
// Context getters/setters
OPCODE(GetUserData, U32, ScalarReg, )
OPCODE(GetThreadBitScalarReg, U1, ScalarReg, )
OPCODE(SetThreadBitScalarReg, Void, ScalarReg, U1, )
OPCODE(GetScalarRegister, U32, ScalarReg, )
OPCODE(SetScalarRegister, Void, ScalarReg, U32, )
OPCODE(GetVectorRegister, U32, VectorReg, )
@ -36,10 +38,12 @@ OPCODE(GetScc, U1, Void,
OPCODE(GetExec, U1, Void, )
OPCODE(GetVcc, U1, Void, )
OPCODE(GetVccLo, U32, Void, )
OPCODE(GetVccHi, U32, Void, )
OPCODE(SetScc, Void, U1, )
OPCODE(SetExec, Void, U1, )
OPCODE(SetVcc, Void, U1, )
OPCODE(SetVccLo, Void, U32, )
OPCODE(SetVccHi, Void, U32, )
// Undefined
OPCODE(UndefU1, U1, )

View File

@ -206,9 +206,12 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32);
IR::U32 address = ir.Imm32(dword_offset);
if (inst_info.index_enable && inst_info.offset_enable) {
const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 0)};
const IR::U32 index{ir.CompositeExtract(inst.Arg(1), 1)};
address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address);
address = ir.IAdd(address, ir.ShiftRightLogical(offset, ir.Imm32(2)));
} else if (inst_info.index_enable) {
const IR::U32 index{inst.Arg(1)};
address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address);
} else if (inst_info.offset_enable) {
const IR::U32 offset{inst.Arg(1)};
@ -216,6 +219,17 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
inst.SetArg(1, address);
}
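// The buffer is addressed in dwords, so the combined index+offset form above
// reduces to (sketch; names are ours):
//
//     u32 DwordAddress(u32 inst_offset_bytes, u32 index, u32 dword_stride,
//                      u32 offset_bytes) {
//         return inst_offset_bytes / 4 + index * dword_stride + offset_bytes / 4;
//     }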
IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& t,
const IR::Value& z) {
// Fix up the x and y coordinates: the guest scales the s and t coordinates
// and adds 1.5 to them via v_madak_f32. Since we force the scale to 1.0 when
// handling v_cubema_f32, subtracting 1.5 here recovers the original values.
const IR::Value x = ir.FPSub(IR::F32{s}, ir.Imm32(1.5f));
const IR::Value y = ir.FPSub(IR::F32{t}, ir.Imm32(1.5f));
return ir.CompositeConstruct(x, y, z);
}
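// Worked example: with the scale forced to 1.0, the guest computes
// s' = s * 1.0 + 1.5, so s' - 1.5 == s and the face coordinate round-trips.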
void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
IR::Inst* producer = inst.Arg(0).InstRecursive();
ASSERT(producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2);
@ -256,8 +270,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
return {ir.CompositeConstruct(body->Arg(0), body->Arg(1)), body->Arg(2)};
case AmdGpu::ImageType::Color2DArray:
case AmdGpu::ImageType::Color3D:
return {ir.CompositeConstruct(body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)};
case AmdGpu::ImageType::Cube:
return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)};
default:
UNREACHABLE();
}
@ -276,6 +291,7 @@ void ResourceTrackingPass(IR::Program& program) {
// Most of the time it is float so that is the default. This pass detects float buffer loads
// combined with bitcasts and patches them to be integer loads.
for (IR::Block* const block : program.post_order_blocks) {
break; // Skip the integer-load detection for now.
for (IR::Inst& inst : block->Instructions()) {
if (inst.GetOpcode() != IR::Opcode::BitCastU32F32) {
continue;

View File

@ -32,6 +32,7 @@ struct SccFlagTag : FlagTag {};
struct ExecFlagTag : FlagTag {};
struct VccFlagTag : FlagTag {};
struct VccLoTag : FlagTag {};
struct VccHiTag : FlagTag {};
struct GotoVariable : FlagTag {
GotoVariable() = default;
@ -43,7 +44,7 @@ struct GotoVariable : FlagTag {
};
using Variant = std::variant<IR::ScalarReg, IR::VectorReg, GotoVariable, SccFlagTag, ExecFlagTag,
VccFlagTag, VccLoTag, VccHiTag>;
using ValueMap = std::unordered_map<IR::Block*, IR::Value>;
struct DefTable {
@ -89,6 +90,13 @@ struct DefTable {
vcc_lo_flag.insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, VccHiTag) {
return vcc_hi_flag[block];
}
void SetDef(IR::Block* block, VccHiTag, const IR::Value& value) {
vcc_hi_flag.insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, VccFlagTag) {
return vcc_flag[block];
}
@ -101,6 +109,7 @@ struct DefTable {
ValueMap exec_flag;
ValueMap vcc_flag;
ValueMap vcc_lo_flag;
ValueMap vcc_hi_flag;
};
IR::Opcode UndefOpcode(IR::ScalarReg) noexcept {
@ -111,6 +120,14 @@ IR::Opcode UndefOpcode(IR::VectorReg) noexcept {
return IR::Opcode::UndefU32;
}
IR::Opcode UndefOpcode(const VccLoTag&) noexcept {
return IR::Opcode::UndefU32;
}
IR::Opcode UndefOpcode(const VccHiTag&) noexcept {
return IR::Opcode::UndefU32;
}
IR::Opcode UndefOpcode(const FlagTag&) noexcept {
return IR::Opcode::UndefU1;
}
@ -281,6 +298,7 @@ private:
void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
const IR::Opcode opcode{inst.GetOpcode()};
switch (opcode) {
case IR::Opcode::SetThreadBitScalarReg:
case IR::Opcode::SetScalarRegister: {
const IR::ScalarReg reg{inst.Arg(0).ScalarReg()};
pass.WriteVariable(reg, block, inst.Arg(1));
@ -306,6 +324,10 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
case IR::Opcode::SetVccLo:
pass.WriteVariable(VccLoTag{}, block, inst.Arg(0));
break;
case IR::Opcode::SetVccHi:
pass.WriteVariable(VccHiTag{}, block, inst.Arg(0));
break;
case IR::Opcode::GetThreadBitScalarReg:
case IR::Opcode::GetScalarRegister: {
const IR::ScalarReg reg{inst.Arg(0).ScalarReg()};
inst.ReplaceUsesWith(pass.ReadVariable(reg, block));
@ -331,6 +353,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
case IR::Opcode::GetVccLo:
inst.ReplaceUsesWith(pass.ReadVariable(VccLoTag{}, block));
break;
case IR::Opcode::GetVccHi:
inst.ReplaceUsesWith(pass.ReadVariable(VccHiTag{}, block));
break;
default:
break;
}

View File

@ -219,6 +219,7 @@ using U64 = TypedValue<Type::U64>;
using F16 = TypedValue<Type::F16>;
using F32 = TypedValue<Type::F32>;
using F64 = TypedValue<Type::F64>;
using U1U32F32 = TypedValue<Type::U1 | Type::U32 | Type::F32>;
using U32F32 = TypedValue<Type::U32 | Type::F32>;
using U32U64 = TypedValue<Type::U32 | Type::U64>;
using F32F64 = TypedValue<Type::F32 | Type::F64>;

View File

@ -61,7 +61,7 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
Shader::Optimization::DeadCodeEliminationPass(program.blocks);
Shader::Optimization::CollectShaderInfoPass(program);
fmt::print("{}\n", Shader::IR::DumpProgram(program));
fmt::print("Post passes\n\n{}\n", Shader::IR::DumpProgram(program));
std::fflush(stdout);
return program;

View File

@ -4,6 +4,7 @@
#pragma once
#include <span>
#include <vector>
#include <boost/container/static_vector.hpp>
#include "common/assert.h"
#include "common/types.h"
@ -81,7 +82,6 @@ struct Info {
struct PsInput {
u32 param_index;
bool is_default;
bool is_flat;
u32 default_value;

View File

@ -2,7 +2,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
#include "common/io_file.h"
#include "common/thread.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/amdgpu/pm4_cmds.h"

View File

@ -374,10 +374,16 @@ struct Liverpool {
FrontAndBack = 3,
};
enum class FrontFace : u32 {
CounterClockwise = 0,
Clockwise = 1,
};
union PolygonControl {
u32 raw;
BitField<0, 1, u32> cull_front;
BitField<1, 1, u32> cull_back;
BitField<2, 1, FrontFace> front_face;
BitField<3, 2, u32> enable_polygon_mode;
BitField<5, 3, PolygonMode> polygon_mode_front;
BitField<8, 3, PolygonMode> polygon_mode_back;

View File

@ -110,11 +110,29 @@ struct Image {
BitField<59, 1, u64> atc;
BitField<60, 4, ImageType> type;
};
union {
BitField<0, 13, u64> depth;
BitField<13, 14, u64> pitch;
BitField<32, 13, u64> base_array;
BitField<45, 13, u64> last_array;
};
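// Note: base_address is stored in units of 256 bytes, hence the << 8 below;
// base_array/last_array and last_level are inclusive bounds, hence the + 1
// in the counts.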
VAddr Address() const {
return base_address << 8;
}
u32 Pitch() const {
return pitch;
}
u32 NumLayers() const {
return last_array - base_array + 1;
}
u32 NumLevels() const {
return last_level + 1;
}
DataFormat GetDataFmt() const noexcept {
return static_cast<DataFormat>(data_format.Value());
}

View File

@ -287,7 +287,7 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu
}
if (data_format == AmdGpu::DataFormat::Format8_8_8_8 &&
num_format == AmdGpu::NumberFormat::Srgb) {
return vk::Format::eB8G8R8A8Srgb;
}
if (data_format == AmdGpu::DataFormat::Format32_32_32 &&
num_format == AmdGpu::NumberFormat::Float) {
@ -304,6 +304,9 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu
if (data_format == AmdGpu::DataFormat::Format8 && num_format == AmdGpu::NumberFormat::Unorm) {
return vk::Format::eR8Unorm;
}
if (data_format == AmdGpu::DataFormat::FormatBc3 && num_format == AmdGpu::NumberFormat::Srgb) {
return vk::Format::eBc3SrgbBlock;
}
UNREACHABLE();
}

View File

@ -75,8 +75,10 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul
.depthClampEnable = false,
.rasterizerDiscardEnable = false,
.polygonMode = LiverpoolToVK::PolygonMode(key.polygon_mode),
.cullMode = vk::CullModeFlagBits::eNone, /*LiverpoolToVK::CullMode(key.cull_mode),*/
.frontFace = key.front_face == Liverpool::FrontFace::Clockwise
? vk::FrontFace::eClockwise
: vk::FrontFace::eCounterClockwise,
.depthBiasEnable = false,
.lineWidth = 1.0f,
};
@ -177,14 +179,23 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul
std::array<vk::PipelineColorBlendAttachmentState, Liverpool::NumColorBuffers> attachments;
for (u32 i = 0; i < num_color_formats; i++) {
const auto& control = key.blend_controls[i];
const auto src_color = LiverpoolToVK::BlendFactor(control.color_src_factor);
const auto dst_color = LiverpoolToVK::BlendFactor(control.color_dst_factor);
const auto color_blend = LiverpoolToVK::BlendOp(control.color_func);
attachments[i] = vk::PipelineColorBlendAttachmentState{
.blendEnable = key.blend_controls[i].enable,
.srcColorBlendFactor = src_color,
.dstColorBlendFactor = dst_color,
.colorBlendOp = color_blend,
.srcAlphaBlendFactor = control.separate_alpha_blend
? LiverpoolToVK::BlendFactor(control.alpha_src_factor)
: src_color,
.dstAlphaBlendFactor = control.separate_alpha_blend
? LiverpoolToVK::BlendFactor(control.alpha_dst_factor)
: dst_color,
.alphaBlendOp = control.separate_alpha_blend
? LiverpoolToVK::BlendOp(control.alpha_func)
: color_blend,
.colorWriteMask =
instance.IsColorWriteEnableSupported()
? vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG |

View File

@ -38,6 +38,8 @@ struct GraphicsPipelineKey {
Liverpool::PrimitiveType prim_type;
Liverpool::PolygonMode polygon_mode;
Liverpool::CullMode cull_mode;
Liverpool::FrontFace front_face;
u32 pad{};
std::array<Liverpool::BlendControl, Liverpool::NumColorBuffers> blend_controls;
std::array<vk::ColorComponentFlags, Liverpool::NumColorBuffers> write_masks;

View File

@ -207,6 +207,7 @@ bool Instance::CreateDevice() {
.shaderDrawParameters = true,
},
vk::PhysicalDeviceVulkan12Features{
.scalarBlockLayout = true,
.timelineSemaphore = true,
},
vk::PhysicalDeviceVulkan13Features{

View File

@ -94,6 +94,7 @@ void PipelineCache::RefreshGraphicsKey() {
key.prim_type = regs.primitive_type;
key.polygon_mode = regs.polygon_control.PolyMode();
key.cull_mode = regs.polygon_control.CullingMode();
key.front_face = regs.polygon_control.front_face;
const auto& db = regs.depth_buffer;
key.depth_format = key.depth.depth_enable
@ -163,10 +164,19 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline() {
programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info));
// Compile IR to SPIR-V
auto spv_code = Shader::Backend::SPIRV::EmitSPIRV(profile, programs[i], binding);
stages[i] = CompileSPV(spv_code, instance.GetDevice());
infos[i] = &programs[i].info;
// Set module name to hash in renderdoc
const auto name = fmt::format("{}_{:#x}", stage, hash);
const vk::DebugUtilsObjectNameInfoEXT name_info = {
.objectType = vk::ObjectType::eShaderModule,
.objectHandle = std::bit_cast<u64>(stages[i]),
.pObjectName = name.c_str(),
};
instance.GetDevice().setDebugUtilsObjectNameEXT(name_info);
if (Config::dumpShaders()) {
DumpShader(spv_code, hash, stage, "spv");
}

View File

@ -85,6 +85,7 @@ void Rasterizer::Draw(bool is_indexed) {
}
void Rasterizer::DispatchDirect() {
compute_done = true;
return; // Compute dispatches are currently skipped.
const auto cmdbuf = scheduler.CommandBuffer();
const auto& cs_program = liverpool->regs.cs_program;

View File

@ -49,6 +49,7 @@ private:
Core::MemoryManager* memory;
PipelineCache pipeline_cache;
StreamBuffer vertex_index_buffer;
bool compute_done{};
};
} // namespace Vulkan

View File

@ -39,8 +39,10 @@ using Libraries::VideoOut::TilingMode;
if (false /*&& IsDepthStencilFormat(format)*/) {
usage |= vk::ImageUsageFlagBits::eDepthStencilAttachment;
} else {
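// Block-compressed formats cannot be used as color attachments.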
if (format != vk::Format::eBc3SrgbBlock) {
usage |= vk::ImageUsageFlagBits::eColorAttachment;
}
}
return usage;
}
@ -101,8 +103,10 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept {
size.width = image.width + 1;
size.height = image.height + 1;
size.depth = 1;
pitch = image.Pitch();
resources.levels = image.NumLevels();
resources.layers = image.NumLayers();
guest_size_bytes = size.width * size.height * 4;
}
@ -183,7 +187,7 @@ void Image::Transit(vk::ImageLayout dst_layout, vk::Flags<vk::AccessFlagBits> ds
.subresourceRange{
.aspectMask = aspect_mask,
.baseMipLevel = 0,
.levelCount = VK_REMAINING_MIP_LEVELS,
.baseArrayLayer = 0,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
}};

View File

@ -14,8 +14,9 @@ vk::ImageViewType ConvertImageViewType(AmdGpu::ImageType type) {
case AmdGpu::ImageType::Color1DArray:
return vk::ImageViewType::e1DArray;
case AmdGpu::ImageType::Color2D:
return vk::ImageViewType::e2D;
case AmdGpu::ImageType::Cube:
return vk::ImageViewType::eCube;
case AmdGpu::ImageType::Color2DArray:
return vk::ImageViewType::e2DArray;
case AmdGpu::ImageType::Color3D:
@ -47,10 +48,10 @@ vk::ComponentSwizzle ConvertComponentSwizzle(u32 dst_sel) {
ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image) noexcept {
type = ConvertImageViewType(image.type);
format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt());
range.base.level = 0;
range.base.layer = 0;
range.extent.levels = image.NumLevels();
range.extent.layers = image.NumLayers();
mapping.r = ConvertComponentSwizzle(image.dst_sel_x);
mapping.g = ConvertComponentSwizzle(image.dst_sel_y);
mapping.b = ConvertComponentSwizzle(image.dst_sel_z);

View File

@ -175,6 +175,8 @@ void TextureCache::RefreshImage(Image& image) {
// Mark image as validated.
image.flags &= ~ImageFlagBits::CpuModified;
{
// Upload data to the staging buffer.
const auto [data, offset, _] = staging.Map(image.info.guest_size_bytes, 4);
const u8* image_data = reinterpret_cast<const u8*>(image.cpu_addr);
@ -212,11 +214,55 @@ void TextureCache::RefreshImage(Image& image) {
image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite);
cmdbuf.copyBufferToImage(staging.Handle(), image.image,
vk::ImageLayout::eTransferDstOptimal, image_copy);
image.Transit(vk::ImageLayout::eGeneral,
vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead);
return;
}
const u8* image_data = reinterpret_cast<const u8*>(image.cpu_addr);
for (u32 l = 0; l < image.info.resources.layers; l++) {
// Upload data to the staging buffer.
for (u32 m = 0; m < image.info.resources.levels; m++) {
const u32 width = image.info.size.width >> m;
const u32 height = image.info.size.height >> m;
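// NOTE: this assumes one byte per texel; bufferRowLength = 0 in the copy
// below means the buffer data is tightly packed.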
const u32 map_size = width * height;
const auto [data, offset, _] = staging.Map(map_size, 16);
if (image.info.is_tiled) {
ConvertTileToLinear(data, image_data, width, height, Config::isNeoMode());
} else {
std::memcpy(data, image_data, map_size);
}
staging.Commit(map_size);
image_data += map_size;
// Copy to the image.
const vk::BufferImageCopy image_copy = {
.bufferOffset = offset,
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource{
.aspectMask = vk::ImageAspectFlagBits::eColor,
.mipLevel = m,
.baseArrayLayer = l,
.layerCount = 1,
},
.imageOffset = {0, 0, 0},
.imageExtent = {width, height, 1},
};
const auto cmdbuf = scheduler.CommandBuffer();
image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite);
cmdbuf.copyBufferToImage(staging.Handle(), image.image,
vk::ImageLayout::eTransferDstOptimal, image_copy);
image.Transit(vk::ImageLayout::eGeneral,
vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead);
}
}
}
vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) {