shader_recompiler: Add more instructions and fix a few thinhs

This commit is contained in:
raphaelthegreat 2024-06-05 22:22:34 +03:00
parent 728249f58d
commit ae7e6dafd5
18 changed files with 245 additions and 78 deletions

View File

@ -95,9 +95,14 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) {
} }
} }
switch (attr) { switch (attr) {
case IR::Attribute::FragCoord: case IR::Attribute::FragCoord: {
return ctx.OpLoad(ctx.F32[1], const Id coord = ctx.OpLoad(
ctx.OpAccessChain(ctx.input_f32, ctx.frag_coord, ctx.ConstU32(comp))); ctx.F32[1], ctx.OpAccessChain(ctx.input_f32, ctx.frag_coord, ctx.ConstU32(comp)));
if (comp == 3) {
return ctx.OpFDiv(ctx.F32[1], ctx.ConstF32(1.f), coord);
}
return coord;
}
default: default:
throw NotImplementedException("Read attribute {}", attr); throw NotImplementedException("Read attribute {}", attr);
} }

View File

@ -55,26 +55,48 @@ void Translator::S_ANDN2_B64(const GcnInst& inst) {
const IR::U1 src0{get_src(inst.src[0])}; const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])}; const IR::U1 src1{get_src(inst.src[1])};
const IR::U1 result{ir.LogicalAnd(src0, ir.LogicalNot(src1))}; const IR::U1 result{ir.LogicalAnd(src0, ir.LogicalNot(src1))};
SetDst(inst.dst[0], result);
ir.SetScc(result); ir.SetScc(result);
switch (inst.dst[0].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ExecLo:
ir.SetExec(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
break;
default:
UNREACHABLE();
}
} }
void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) {
// This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs) // This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs)
// However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination // However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination
// SGPR we have a special IR opcode for SPGRs that act as thread masks. // SGPR we have a special IR opcode for SPGRs that act as thread masks.
ASSERT(inst.src[0].field == OperandField::VccLo);
const IR::U1 exec{ir.GetExec()}; const IR::U1 exec{ir.GetExec()};
const IR::U1 vcc{ir.GetVcc()};
// Mark destination SPGR as an EXEC context. This means we will use 1-bit // Mark destination SPGR as an EXEC context. This means we will use 1-bit
// IR instruction whenever it's loaded. // IR instruction whenever it's loaded.
ASSERT(inst.dst[0].field == OperandField::ScalarGPR); switch (inst.dst[0].field) {
case OperandField::ScalarGPR: {
const u32 reg = inst.dst[0].code; const u32 reg = inst.dst[0].code;
exec_contexts[reg] = true; exec_contexts[reg] = true;
ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec); ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec);
break;
}
case OperandField::VccLo:
ir.SetVcc(exec);
break;
default:
UNREACHABLE();
}
// Update EXEC. // Update EXEC.
ASSERT(inst.src[0].field == OperandField::VccLo); ir.SetExec(ir.LogicalAnd(exec, vcc));
ir.SetExec(ir.LogicalAnd(exec, ir.GetVcc()));
} }
void Translator::S_MOV_B64(const GcnInst& inst) { void Translator::S_MOV_B64(const GcnInst& inst) {
@ -114,9 +136,17 @@ void Translator::S_OR_B64(bool negate, const GcnInst& inst) {
if (negate) { if (negate) {
result = ir.LogicalNot(result); result = ir.LogicalNot(result);
} }
ASSERT(inst.dst[0].field == OperandField::VccLo);
ir.SetVcc(result);
ir.SetScc(result); ir.SetScc(result);
switch (inst.dst[0].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
break;
default:
UNREACHABLE();
}
} }
void Translator::S_AND_B64(const GcnInst& inst) { void Translator::S_AND_B64(const GcnInst& inst) {
@ -135,9 +165,17 @@ void Translator::S_AND_B64(const GcnInst& inst) {
const IR::U1 src0{get_src(inst.src[0])}; const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])}; const IR::U1 src1{get_src(inst.src[1])};
const IR::U1 result = ir.LogicalAnd(src0, src1); const IR::U1 result = ir.LogicalAnd(src0, src1);
ASSERT(inst.dst[0].field == OperandField::VccLo);
ir.SetVcc(result);
ir.SetScc(result); ir.SetScc(result);
switch (inst.dst[0].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
break;
default:
UNREACHABLE();
}
} }
void Translator::S_ADD_I32(const GcnInst& inst) { void Translator::S_ADD_I32(const GcnInst& inst) {
@ -169,6 +207,36 @@ void Translator::S_CSELECT_B32(const GcnInst& inst) {
SetDst(inst.dst[0], IR::U32{ir.Select(ir.GetScc(), src0, src1)}); SetDst(inst.dst[0], IR::U32{ir.Select(ir.GetScc(), src0, src1)});
} }
void Translator::S_CSELECT_B64(const GcnInst& inst) {
const auto get_src = [&](const InstOperand& operand) {
switch (operand.field) {
case OperandField::VccLo:
return ir.GetVcc();
case OperandField::ExecLo:
return ir.GetExec();
case OperandField::ScalarGPR:
return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
case OperandField::ConstZero:
return ir.Imm1(false);
default:
UNREACHABLE();
}
};
const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])};
const IR::U1 result{ir.Select(ir.GetScc(), src0, src1)};
switch (inst.dst[0].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
break;
default:
UNREACHABLE();
}
}
void Translator::S_BFE_U32(const GcnInst& inst) { void Translator::S_BFE_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 src1{GetSrc(inst.src[1])};
@ -179,4 +247,12 @@ void Translator::S_BFE_U32(const GcnInst& inst) {
ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
} }
void Translator::S_LSHL_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 result = ir.ShiftLeftLogical(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F)));
SetDst(inst.dst[0], result);
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
} // namespace Shader::Gcn } // namespace Shader::Gcn

View File

@ -5,30 +5,16 @@
namespace Shader::Gcn { namespace Shader::Gcn {
void Load(IR::IREmitter& ir, int num_dwords, const IR::Value& handle, IR::ScalarReg dst_reg,
const IR::U32U64& address) {
for (u32 i = 0; i < num_dwords; i++) {
if (handle.IsEmpty()) {
ir.SetScalarReg(dst_reg++, ir.ReadConst(address, ir.Imm32(i)));
} else {
const IR::U32 index = ir.IAdd(address, ir.Imm32(i));
ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(handle, index));
}
}
}
void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) { void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
const auto& smrd = inst.control.smrd; const auto& smrd = inst.control.smrd;
ASSERT_MSG(smrd.imm, "Bindless texture loads unsupported");
const IR::ScalarReg sbase{inst.src[0].code * 2}; const IR::ScalarReg sbase{inst.src[0].code * 2};
const IR::U32 offset = const IR::Value base =
smrd.imm ? ir.Imm32(smrd.offset * 4) ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1));
: IR::U32{ir.ShiftLeftLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), IR::ScalarReg dst_reg{inst.dst[0].code};
ir.Imm32(2))}; for (u32 i = 0; i < num_dwords; i++) {
const IR::U64 base = ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(smrd.offset + i)));
ir.PackUint2x32(ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1))); }
const IR::U64 address = ir.IAdd(base, offset);
const IR::ScalarReg dst_reg{inst.dst[0].code};
Load(ir, num_dwords, {}, dst_reg, address);
} }
void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
@ -37,8 +23,11 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
const IR::U32 dword_offset = const IR::U32 dword_offset =
smrd.imm ? ir.Imm32(smrd.offset) : ir.GetScalarReg(IR::ScalarReg(smrd.offset)); smrd.imm ? ir.Imm32(smrd.offset) : ir.GetScalarReg(IR::ScalarReg(smrd.offset));
const IR::Value vsharp = ir.GetScalarReg(sbase); const IR::Value vsharp = ir.GetScalarReg(sbase);
const IR::ScalarReg dst_reg{inst.dst[0].code}; IR::ScalarReg dst_reg{inst.dst[0].code};
Load(ir, num_dwords, vsharp, dst_reg, dword_offset); for (u32 i = 0; i < num_dwords; i++) {
const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i));
ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(vsharp, index));
}
} }
} // namespace Shader::Gcn } // namespace Shader::Gcn

View File

@ -128,7 +128,11 @@ IR::U1U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
value = ir.GetExec(); value = ir.GetExec();
break; break;
case OperandField::VccLo: case OperandField::VccLo:
if (force_flt) {
value = ir.BitCast<IR::F32>(ir.GetVccLo());
} else {
value = ir.GetVccLo(); value = ir.GetVccLo();
}
break; break;
case OperandField::VccHi: case OperandField::VccHi:
value = ir.GetVccHi(); value = ir.GetVccHi();
@ -252,6 +256,12 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
break; break;
case Opcode::S_WAITCNT: case Opcode::S_WAITCNT:
break; break;
case Opcode::S_LOAD_DWORDX4:
translator.S_LOAD_DWORD(4, inst);
break;
case Opcode::S_LOAD_DWORDX8:
translator.S_LOAD_DWORD(8, inst);
break;
case Opcode::S_BUFFER_LOAD_DWORD: case Opcode::S_BUFFER_LOAD_DWORD:
translator.S_BUFFER_LOAD_DWORD(1, inst); translator.S_BUFFER_LOAD_DWORD(1, inst);
break; break;
@ -352,9 +362,18 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
case Opcode::S_CMP_LG_U32: case Opcode::S_CMP_LG_U32:
translator.S_CMP(ConditionOp::LG, false, inst); translator.S_CMP(ConditionOp::LG, false, inst);
break; break;
case Opcode::S_CMP_LG_I32:
translator.S_CMP(ConditionOp::LG, true, inst);
break;
case Opcode::S_CMP_EQ_I32: case Opcode::S_CMP_EQ_I32:
translator.S_CMP(ConditionOp::EQ, true, inst); translator.S_CMP(ConditionOp::EQ, true, inst);
break; break;
case Opcode::S_CMP_EQ_U32:
translator.S_CMP(ConditionOp::EQ, false, inst);
break;
case Opcode::S_LSHL_B32:
translator.S_LSHL_B32(inst);
break;
case Opcode::V_CNDMASK_B32: case Opcode::V_CNDMASK_B32:
translator.V_CNDMASK_B32(inst); translator.V_CNDMASK_B32(inst);
break; break;
@ -505,13 +524,21 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
case Opcode::S_CSELECT_B32: case Opcode::S_CSELECT_B32:
translator.S_CSELECT_B32(inst); translator.S_CSELECT_B32(inst);
break; break;
case Opcode::S_CSELECT_B64:
translator.S_CSELECT_B64(inst);
break;
case Opcode::S_BFE_U32: case Opcode::S_BFE_U32:
translator.S_BFE_U32(inst); translator.S_BFE_U32(inst);
break; break;
case Opcode::V_RNDNE_F32:
translator.V_RNDNE_F32(inst);
break;
case Opcode::S_NOP: case Opcode::S_NOP:
case Opcode::S_CBRANCH_EXECZ: case Opcode::S_CBRANCH_EXECZ:
case Opcode::S_CBRANCH_SCC0: case Opcode::S_CBRANCH_SCC0:
case Opcode::S_CBRANCH_SCC1: case Opcode::S_CBRANCH_SCC1:
case Opcode::S_CBRANCH_VCCNZ:
case Opcode::S_CBRANCH_VCCZ:
case Opcode::S_BRANCH: case Opcode::S_BRANCH:
case Opcode::S_WQM_B64: case Opcode::S_WQM_B64:
case Opcode::V_INTERP_P1_F32: case Opcode::V_INTERP_P1_F32:

View File

@ -46,7 +46,9 @@ public:
void S_AND_B32(const GcnInst& inst); void S_AND_B32(const GcnInst& inst);
void S_LSHR_B32(const GcnInst& inst); void S_LSHR_B32(const GcnInst& inst);
void S_CSELECT_B32(const GcnInst& inst); void S_CSELECT_B32(const GcnInst& inst);
void S_CSELECT_B64(const GcnInst& inst);
void S_BFE_U32(const GcnInst& inst); void S_BFE_U32(const GcnInst& inst);
void S_LSHL_B32(const GcnInst& inst);
// Scalar Memory // Scalar Memory
void S_LOAD_DWORD(int num_dwords, const GcnInst& inst); void S_LOAD_DWORD(int num_dwords, const GcnInst& inst);
@ -101,6 +103,7 @@ public:
void V_LSHR_B32(const GcnInst& inst); void V_LSHR_B32(const GcnInst& inst);
void V_ASHRREV_I32(const GcnInst& inst); void V_ASHRREV_I32(const GcnInst& inst);
void V_MAD_U32_U24(const GcnInst& inst); void V_MAD_U32_U24(const GcnInst& inst);
void V_RNDNE_F32(const GcnInst& inst);
// Vector Memory // Vector Memory
void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);

View File

@ -33,7 +33,7 @@ void Translator::V_CNDMASK_B32(const GcnInst& inst) {
const IR::VectorReg dst_reg{inst.dst[0].code}; const IR::VectorReg dst_reg{inst.dst[0].code};
const IR::ScalarReg flag_reg{inst.src[2].code}; const IR::ScalarReg flag_reg{inst.src[2].code};
const IR::U1 flag = inst.src[2].field == OperandField::ScalarGPR const IR::U1 flag = inst.src[2].field == OperandField::ScalarGPR
? ir.INotEqual(ir.GetScalarReg(flag_reg), ir.Imm32(0U)) ? ir.GetThreadBitScalarReg(flag_reg)
: ir.GetVcc(); : ir.GetVcc();
// We can treat the instruction as integer most of the time, but when a source is // We can treat the instruction as integer most of the time, but when a source is
@ -85,21 +85,21 @@ void Translator::V_CVT_F32_U32(const GcnInst& inst) {
} }
void Translator::V_MAD_F32(const GcnInst& inst) { void Translator::V_MAD_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0])}; const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1])}; const IR::F32 src1{GetSrc(inst.src[1], true)};
const IR::F32 src2{GetSrc(inst.src[2])}; const IR::F32 src2{GetSrc(inst.src[2], true)};
SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); SetDst(inst.dst[0], ir.FPFma(src0, src1, src2));
} }
void Translator::V_FRACT_F32(const GcnInst& inst) { void Translator::V_FRACT_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0])}; const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::VectorReg dst_reg{inst.dst[0].code}; const IR::VectorReg dst_reg{inst.dst[0].code};
ir.SetVectorReg(dst_reg, ir.Fract(src0)); ir.SetVectorReg(dst_reg, ir.Fract(src0));
} }
void Translator::V_ADD_F32(const GcnInst& inst) { void Translator::V_ADD_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0])}; const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1])}; const IR::F32 src1{GetSrc(inst.src[1], true)};
SetDst(inst.dst[0], ir.FPAdd(src0, src1)); SetDst(inst.dst[0], ir.FPAdd(src0, src1));
} }
@ -114,14 +114,14 @@ void Translator::V_CVT_OFF_F32_I4(const GcnInst& inst) {
void Translator::V_MED3_F32(const GcnInst& inst) { void Translator::V_MED3_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1])}; const IR::F32 src1{GetSrc(inst.src[1], true)};
const IR::F32 src2{GetSrc(inst.src[2])}; const IR::F32 src2{GetSrc(inst.src[2], true)};
const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2); const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2);
SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx)); SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx));
} }
void Translator::V_FLOOR_F32(const GcnInst& inst) { void Translator::V_FLOOR_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0])}; const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::VectorReg dst_reg{inst.dst[0].code}; const IR::VectorReg dst_reg{inst.dst[0].code};
ir.SetVectorReg(dst_reg, ir.FPFloor(src0)); ir.SetVectorReg(dst_reg, ir.FPFloor(src0));
} }
@ -167,7 +167,17 @@ void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) {
UNREACHABLE(); UNREACHABLE();
} }
}(); }();
switch (inst.dst[1].field) {
case OperandField::VccLo:
ir.SetVcc(result); ir.SetVcc(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), result);
break;
default:
UNREACHABLE();
}
} }
void Translator::V_MAX_F32(const GcnInst& inst) { void Translator::V_MAX_F32(const GcnInst& inst) {
@ -357,4 +367,9 @@ void Translator::V_MAD_U32_U24(const GcnInst& inst) {
V_MAD_I32_I24(inst); V_MAD_I32_I24(inst);
} }
void Translator::V_RNDNE_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
SetDst(inst.dst[0], ir.FPRoundEven(src0));
}
} // namespace Shader::Gcn } // namespace Shader::Gcn

View File

@ -273,8 +273,8 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset)
}*/ }*/
} }
U32 IREmitter::ReadConst(const U64& address, const U32& offset) { U32 IREmitter::ReadConst(const Value& base, const U32& offset) {
return Inst<U32>(Opcode::ReadConst, address, offset); return Inst<U32>(Opcode::ReadConst, base, offset);
} }
F32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) { F32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) {

View File

@ -77,7 +77,7 @@ public:
[[nodiscard]] U32U64 ReadShared(int bit_size, bool is_signed, const U32& offset); [[nodiscard]] U32U64 ReadShared(int bit_size, bool is_signed, const U32& offset);
void WriteShared(int bit_size, const Value& value, const U32& offset); void WriteShared(int bit_size, const Value& value, const U32& offset);
[[nodiscard]] U32 ReadConst(const U64& address, const U32& offset); [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset);
[[nodiscard]] F32 ReadConstBuffer(const Value& handle, const U32& index); [[nodiscard]] F32 ReadConstBuffer(const Value& handle, const U32& index);
[[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address, [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address,

View File

@ -15,7 +15,7 @@ OPCODE(Epilogue, Void,
OPCODE(Discard, Void, ) OPCODE(Discard, Void, )
// Constant memory operations // Constant memory operations
OPCODE(ReadConst, U32, U64, U32, ) OPCODE(ReadConst, U32, U32x2, U32, )
OPCODE(ReadConstBuffer, F32, Opaque, U32, ) OPCODE(ReadConstBuffer, F32, Opaque, U32, )
OPCODE(ReadConstBufferU32, U32, Opaque, U32, ) OPCODE(ReadConstBufferU32, U32, Opaque, U32, )

View File

@ -157,16 +157,16 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, "Sharp load not from constant memory"); ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, "Sharp load not from constant memory");
// Retrieve offset from base. // Retrieve offset from base.
IR::Inst* addr = inst->Arg(0).InstRecursive(); const u32 dword_offset = inst->Arg(1).U32();
u32 dword_offset = addr->Arg(1).U32(); const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive();
addr = addr->Arg(0).InstRecursive();
ASSERT_MSG(addr->Arg(1).IsImmediate(), "Bindless not supported");
dword_offset += addr->Arg(1).U32() >> 2;
// Retrieve SGPR that holds sbase // Retrieve SGPR pair that holds sbase
inst = addr->Arg(0).InstRecursive()->Arg(0).InstRecursive(); const IR::Inst* sbase0 = spgpr_base->Arg(0).InstRecursive();
ASSERT_MSG(inst->GetOpcode() == IR::Opcode::GetUserData, "Nested resource loads not supported"); const IR::Inst* sbase1 = spgpr_base->Arg(1).InstRecursive();
const IR::ScalarReg base = inst->Arg(0).ScalarReg(); ASSERT_MSG(sbase0->GetOpcode() == IR::Opcode::GetUserData &&
sbase1->GetOpcode() == IR::Opcode::GetUserData,
"Nested resource loads not supported");
const IR::ScalarReg base = sbase0->Arg(0).ScalarReg();
// Return retrieved location. // Return retrieved location.
return SharpLocation{ return SharpLocation{
@ -186,7 +186,7 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
.stride = buffer.GetStride(), .stride = buffer.GetStride(),
.num_records = u32(buffer.num_records), .num_records = u32(buffer.num_records),
.used_types = BufferDataType(inst), .used_types = BufferDataType(inst),
.is_storage = true || IsBufferStore(inst), .is_storage = IsBufferStore(inst),
}); });
const auto inst_info = inst.Flags<IR::BufferInstInfo>(); const auto inst_info = inst.Flags<IR::BufferInstInfo>();
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
@ -206,8 +206,8 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32); const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32);
IR::U32 address = ir.Imm32(dword_offset); IR::U32 address = ir.Imm32(dword_offset);
if (inst_info.index_enable && inst_info.offset_enable) { if (inst_info.index_enable && inst_info.offset_enable) {
const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 0)}; const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 1)};
const IR::U32 index{ir.CompositeExtract(inst.Arg(1), 1)}; const IR::U32 index{ir.CompositeExtract(inst.Arg(1), 0)};
address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address); address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address);
address = ir.IAdd(address, ir.ShiftRightLogical(offset, ir.Imm32(2))); address = ir.IAdd(address, ir.ShiftRightLogical(offset, ir.Imm32(2)));
} else if (inst_info.index_enable) { } else if (inst_info.index_enable) {

View File

@ -252,6 +252,16 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
} }
break; break;
} }
case PM4ItOpcode::DrawIndexOffset2: {
const auto* draw_index_off = reinterpret_cast<const PM4CmdDrawIndexOffset2*>(header);
regs.max_index_size = draw_index_off->max_size;
regs.num_indices = draw_index_off->index_count;
regs.draw_initiator = draw_index_off->draw_initiator;
if (rasterizer) {
rasterizer->Draw(true, draw_index_off->index_offset);
}
break;
}
case PM4ItOpcode::DrawIndexAuto: { case PM4ItOpcode::DrawIndexAuto: {
const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndexAuto*>(header); const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndexAuto*>(header);
regs.num_indices = draw_index->index_count; regs.num_indices = draw_index->index_count;
@ -272,6 +282,17 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
} }
break; break;
} }
case PM4ItOpcode::NumInstances: {
const auto* num_instances = reinterpret_cast<const PM4CmdDrawNumInstances*>(header);
regs.num_instances.num_instances = num_instances->num_instances;
break;
}
case PM4ItOpcode::IndexBase: {
const auto* index_base = reinterpret_cast<const PM4CmdDrawIndexBase*>(header);
regs.index_base_address.base_addr_lo = index_base->addr_lo;
regs.index_base_address.base_addr_hi.Assign(index_base->addr_hi);
break;
}
case PM4ItOpcode::EventWrite: { case PM4ItOpcode::EventWrite: {
// const auto* event = reinterpret_cast<const PM4CmdEventWrite*>(header); // const auto* event = reinterpret_cast<const PM4CmdEventWrite*>(header);
break; break;

View File

@ -548,4 +548,15 @@ struct PM4CmdDispatchDirect {
u32 dispatch_initiator; ///< Dispatch Initiator Register u32 dispatch_initiator; ///< Dispatch Initiator Register
}; };
struct PM4CmdDrawNumInstances {
PM4Type3Header header;
u32 num_instances;
};
struct PM4CmdDrawIndexBase {
PM4Type3Header header;
u32 addr_lo;
u32 addr_hi;
};
} // namespace AmdGpu } // namespace AmdGpu

View File

@ -14,6 +14,8 @@ vk::StencilOp StencilOp(Liverpool::StencilFunc op) {
return vk::StencilOp::eKeep; return vk::StencilOp::eKeep;
case Liverpool::StencilFunc::Zero: case Liverpool::StencilFunc::Zero:
return vk::StencilOp::eZero; return vk::StencilOp::eZero;
case Liverpool::StencilFunc::ReplaceTest:
return vk::StencilOp::eReplace;
case Liverpool::StencilFunc::AddClamp: case Liverpool::StencilFunc::AddClamp:
return vk::StencilOp::eIncrementAndClamp; return vk::StencilOp::eIncrementAndClamp;
case Liverpool::StencilFunc::SubClamp: case Liverpool::StencilFunc::SubClamp:
@ -307,6 +309,13 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu
if (data_format == AmdGpu::DataFormat::FormatBc3 && num_format == AmdGpu::NumberFormat::Srgb) { if (data_format == AmdGpu::DataFormat::FormatBc3 && num_format == AmdGpu::NumberFormat::Srgb) {
return vk::Format::eBc3SrgbBlock; return vk::Format::eBc3SrgbBlock;
} }
if (data_format == AmdGpu::DataFormat::Format16_16_16_16 &&
num_format == AmdGpu::NumberFormat::Sint) {
return vk::Format::eR16G16B16A16Sint;
}
if (data_format == AmdGpu::DataFormat::FormatBc7 && num_format == AmdGpu::NumberFormat::Srgb) {
return vk::Format::eBc7SrgbBlock;
}
UNREACHABLE(); UNREACHABLE();
} }

View File

@ -81,8 +81,17 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
ComputePipeline::~ComputePipeline() = default; ComputePipeline::~ComputePipeline() = default;
void ComputePipeline::BindResources(Core::MemoryManager* memory, void ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging,
VideoCore::TextureCache& texture_cache) const { VideoCore::TextureCache& texture_cache) const {
static constexpr u64 MinUniformAlignment = 64;
const auto map_staging = [&](auto src, size_t size) {
const auto [data, offset, _] = staging.Map(size, MinUniformAlignment);
std::memcpy(data, reinterpret_cast<const void*>(src), size);
staging.Commit(size);
return offset;
};
// Bind resource buffers and textures. // Bind resource buffers and textures.
boost::container::static_vector<vk::DescriptorBufferInfo, 4> buffer_infos; boost::container::static_vector<vk::DescriptorBufferInfo, 4> buffer_infos;
boost::container::static_vector<vk::DescriptorImageInfo, 8> image_infos; boost::container::static_vector<vk::DescriptorImageInfo, 8> image_infos;
@ -94,8 +103,9 @@ void ComputePipeline::BindResources(Core::MemoryManager* memory,
const u32 size = vsharp.GetSize(); const u32 size = vsharp.GetSize();
const VAddr addr = vsharp.base_address.Value(); const VAddr addr = vsharp.base_address.Value();
texture_cache.OnCpuWrite(addr); texture_cache.OnCpuWrite(addr);
const auto [vk_buffer, offset] = memory->GetVulkanBuffer(addr); const u32 offset = map_staging(addr, size);
buffer_infos.emplace_back(vk_buffer, offset, size); // const auto [vk_buffer, offset] = memory->GetVulkanBuffer(addr);
buffer_infos.emplace_back(staging.Handle(), offset, size);
set_writes.push_back({ set_writes.push_back({
.dstSet = VK_NULL_HANDLE, .dstSet = VK_NULL_HANDLE,
.dstBinding = binding++, .dstBinding = binding++,

View File

@ -31,7 +31,8 @@ public:
return *pipeline; return *pipeline;
} }
void BindResources(Core::MemoryManager* memory, VideoCore::TextureCache& texture_cache) const; void BindResources(Core::MemoryManager* memory, StreamBuffer& staging,
VideoCore::TextureCache& texture_cache) const;
private: private:
const Instance& instance; const Instance& instance;

View File

@ -32,10 +32,10 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
Rasterizer::~Rasterizer() = default; Rasterizer::~Rasterizer() = default;
void Rasterizer::Draw(bool is_indexed) { void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
const auto cmdbuf = scheduler.CommandBuffer(); const auto cmdbuf = scheduler.CommandBuffer();
const auto& regs = liverpool->regs; const auto& regs = liverpool->regs;
const u32 num_indices = SetupIndexBuffer(is_indexed); const u32 num_indices = SetupIndexBuffer(is_indexed, index_offset);
const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline();
pipeline->BindResources(memory, vertex_index_buffer, texture_cache); pipeline->BindResources(memory, vertex_index_buffer, texture_cache);
@ -85,17 +85,16 @@ void Rasterizer::Draw(bool is_indexed) {
} }
void Rasterizer::DispatchDirect() { void Rasterizer::DispatchDirect() {
return;
const auto cmdbuf = scheduler.CommandBuffer(); const auto cmdbuf = scheduler.CommandBuffer();
const auto& cs_program = liverpool->regs.cs_program; const auto& cs_program = liverpool->regs.cs_program;
const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline();
pipeline->BindResources(memory, texture_cache); pipeline->BindResources(memory, vertex_index_buffer, texture_cache);
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle()); cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z); cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z);
} }
u32 Rasterizer::SetupIndexBuffer(bool& is_indexed) { u32 Rasterizer::SetupIndexBuffer(bool& is_indexed, u32 index_offset) {
// Emulate QuadList primitive type with CPU made index buffer. // Emulate QuadList primitive type with CPU made index buffer.
const auto& regs = liverpool->regs; const auto& regs = liverpool->regs;
if (liverpool->regs.primitive_type == Liverpool::PrimitiveType::QuadList) { if (liverpool->regs.primitive_type == Liverpool::PrimitiveType::QuadList) {
@ -131,7 +130,8 @@ u32 Rasterizer::SetupIndexBuffer(bool& is_indexed) {
// Bind index buffer. // Bind index buffer.
const auto cmdbuf = scheduler.CommandBuffer(); const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset, index_type); cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset + index_offset * index_size,
index_type);
return regs.num_indices; return regs.num_indices;
} }

View File

@ -29,12 +29,12 @@ public:
VideoCore::TextureCache& texture_cache, AmdGpu::Liverpool* liverpool); VideoCore::TextureCache& texture_cache, AmdGpu::Liverpool* liverpool);
~Rasterizer(); ~Rasterizer();
void Draw(bool is_indexed); void Draw(bool is_indexed, u32 index_offset = 0);
void DispatchDirect(); void DispatchDirect();
private: private:
u32 SetupIndexBuffer(bool& is_indexed); u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset);
void MapMemory(VAddr addr, size_t size); void MapMemory(VAddr addr, size_t size);
void UpdateDynamicState(const GraphicsPipeline& pipeline); void UpdateDynamicState(const GraphicsPipeline& pipeline);

View File

@ -116,7 +116,7 @@ Image& TextureCache::FindImage(const ImageInfo& info, VAddr cpu_address) {
std::unique_lock lock{m_page_table}; std::unique_lock lock{m_page_table};
boost::container::small_vector<ImageId, 2> image_ids; boost::container::small_vector<ImageId, 2> image_ids;
ForEachImageInRegion(cpu_address, info.guest_size_bytes, [&](ImageId image_id, Image& image) { ForEachImageInRegion(cpu_address, info.guest_size_bytes, [&](ImageId image_id, Image& image) {
if (image.cpu_addr == cpu_address) { if (image.cpu_addr == cpu_address && image.info.size.width == info.size.width) {
image_ids.push_back(image_id); image_ids.push_back(image_id);
} }
}); });