diff --git a/src/core/libraries/kernel/libkernel.h b/src/core/libraries/kernel/libkernel.h index ff1f8439..56148286 100644 --- a/src/core/libraries/kernel/libkernel.h +++ b/src/core/libraries/kernel/libkernel.h @@ -14,8 +14,8 @@ namespace Libraries::Kernel { struct OrbisTimesec { time_t t; - u64 west_sec; - u64 dst_sec; + u32 west_sec; + u32 dst_sec; }; int32_t PS4_SYSV_ABI sceKernelReleaseDirectMemory(off_t start, size_t len); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index f653c2dd..d4c4afb3 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -95,9 +95,14 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) { } } switch (attr) { - case IR::Attribute::FragCoord: - return ctx.OpLoad(ctx.F32[1], - ctx.OpAccessChain(ctx.input_f32, ctx.frag_coord, ctx.ConstU32(comp))); + case IR::Attribute::FragCoord: { + const Id coord = ctx.OpLoad( + ctx.F32[1], ctx.OpAccessChain(ctx.input_f32, ctx.frag_coord, ctx.ConstU32(comp))); + if (comp == 3) { + return ctx.OpFDiv(ctx.F32[1], ctx.ConstF32(1.f), coord); + } + return coord; + } default: throw NotImplementedException("Read attribute {}", attr); } diff --git a/src/shader_recompiler/frontend/control_flow_graph.h b/src/shader_recompiler/frontend/control_flow_graph.h index b9eb12aa..d343ca7d 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.h +++ b/src/shader_recompiler/frontend/control_flow_graph.h @@ -41,6 +41,7 @@ struct Block : Hook { EndClass end_class{}; Block* branch_true{}; Block* branch_false{}; + bool is_dummy{}; }; class CFG { diff --git a/src/shader_recompiler/frontend/structured_control_flow.cpp b/src/shader_recompiler/frontend/structured_control_flow.cpp index 79be146a..49fe2052 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.cpp +++ b/src/shader_recompiler/frontend/structured_control_flow.cpp @@ -630,9 +630,11 @@ private: break; case StatementType::Code: { ensure_block(); - const u32 start = stmt.block->begin_index; - const u32 size = stmt.block->end_index - start + 1; - Translate(current_block, inst_list.subspan(start, size), info); + if (!stmt.block->is_dummy) { + const u32 start = stmt.block->begin_index; + const u32 size = stmt.block->end_index - start + 1; + Translate(current_block, inst_list.subspan(start, size), info); + } break; } case StatementType::SetVariable: { @@ -808,7 +810,7 @@ private: ObjectPool& inst_pool; ObjectPool& block_pool; IR::AbstractSyntaxList& syntax_list; - const Block dummy_flow_block{}; + const Block dummy_flow_block{.is_dummy = true}; std::span inst_list; Info& info; }; diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index ac915734..8c4c90be 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -55,26 +55,48 @@ void Translator::S_ANDN2_B64(const GcnInst& inst) { const IR::U1 src0{get_src(inst.src[0])}; const IR::U1 src1{get_src(inst.src[1])}; const IR::U1 result{ir.LogicalAnd(src0, ir.LogicalNot(src1))}; - SetDst(inst.dst[0], result); ir.SetScc(result); + switch (inst.dst[0].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ExecLo: + ir.SetExec(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result); + break; + default: + UNREACHABLE(); + } } void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { // This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs) // However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination // SGPR we have a special IR opcode for SPGRs that act as thread masks. + ASSERT(inst.src[0].field == OperandField::VccLo); const IR::U1 exec{ir.GetExec()}; + const IR::U1 vcc{ir.GetVcc()}; // Mark destination SPGR as an EXEC context. This means we will use 1-bit // IR instruction whenever it's loaded. - ASSERT(inst.dst[0].field == OperandField::ScalarGPR); - const u32 reg = inst.dst[0].code; - exec_contexts[reg] = true; - ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec); + switch (inst.dst[0].field) { + case OperandField::ScalarGPR: { + const u32 reg = inst.dst[0].code; + exec_contexts[reg] = true; + ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec); + break; + } + case OperandField::VccLo: + ir.SetVcc(exec); + break; + default: + UNREACHABLE(); + } // Update EXEC. - ASSERT(inst.src[0].field == OperandField::VccLo); - ir.SetExec(ir.LogicalAnd(exec, ir.GetVcc())); + ir.SetExec(ir.LogicalAnd(exec, vcc)); } void Translator::S_MOV_B64(const GcnInst& inst) { @@ -82,18 +104,21 @@ void Translator::S_MOV_B64(const GcnInst& inst) { if (inst.src[0].field == OperandField::VccLo || inst.dst[0].field == OperandField::VccLo) { return; } - const IR::U1 src0{GetSrc(inst.src[0])}; if (inst.dst[0].field == OperandField::ScalarGPR && inst.src[0].field == OperandField::ExecLo) { // Exec context push exec_contexts[inst.dst[0].code] = true; + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), ir.GetExec()); } else if (inst.dst[0].field == OperandField::ExecLo && inst.src[0].field == OperandField::ScalarGPR) { // Exec context pop exec_contexts[inst.src[0].code] = false; - } else if (inst.src[0].field != OperandField::ConstZero) { + ir.SetExec(ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code))); + } else if (inst.dst[0].field == OperandField::ExecLo && + inst.src[0].field == OperandField::ConstZero) { + ir.SetExec(ir.Imm1(false)); + } else { UNREACHABLE(); } - SetDst(inst.dst[0], src0); } void Translator::S_OR_B64(bool negate, const GcnInst& inst) { @@ -114,9 +139,17 @@ void Translator::S_OR_B64(bool negate, const GcnInst& inst) { if (negate) { result = ir.LogicalNot(result); } - ASSERT(inst.dst[0].field == OperandField::VccLo); - ir.SetVcc(result); ir.SetScc(result); + switch (inst.dst[0].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result); + break; + default: + UNREACHABLE(); + } } void Translator::S_AND_B64(const GcnInst& inst) { @@ -135,9 +168,17 @@ void Translator::S_AND_B64(const GcnInst& inst) { const IR::U1 src0{get_src(inst.src[0])}; const IR::U1 src1{get_src(inst.src[1])}; const IR::U1 result = ir.LogicalAnd(src0, src1); - ASSERT(inst.dst[0].field == OperandField::VccLo); - ir.SetVcc(result); ir.SetScc(result); + switch (inst.dst[0].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result); + break; + default: + UNREACHABLE(); + } } void Translator::S_ADD_I32(const GcnInst& inst) { @@ -169,6 +210,36 @@ void Translator::S_CSELECT_B32(const GcnInst& inst) { SetDst(inst.dst[0], IR::U32{ir.Select(ir.GetScc(), src0, src1)}); } +void Translator::S_CSELECT_B64(const GcnInst& inst) { + const auto get_src = [&](const InstOperand& operand) { + switch (operand.field) { + case OperandField::VccLo: + return ir.GetVcc(); + case OperandField::ExecLo: + return ir.GetExec(); + case OperandField::ScalarGPR: + return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code)); + case OperandField::ConstZero: + return ir.Imm1(false); + default: + UNREACHABLE(); + } + }; + const IR::U1 src0{get_src(inst.src[0])}; + const IR::U1 src1{get_src(inst.src[1])}; + const IR::U1 result{ir.Select(ir.GetScc(), src0, src1)}; + switch (inst.dst[0].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result); + break; + default: + UNREACHABLE(); + } +} + void Translator::S_BFE_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; @@ -179,4 +250,12 @@ void Translator::S_BFE_U32(const GcnInst& inst) { ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); } +void Translator::S_LSHL_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 result = ir.ShiftLeftLogical(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F))); + SetDst(inst.dst[0], result); + ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index e76950b7..14028b76 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -5,30 +5,16 @@ namespace Shader::Gcn { -void Load(IR::IREmitter& ir, int num_dwords, const IR::Value& handle, IR::ScalarReg dst_reg, - const IR::U32U64& address) { - for (u32 i = 0; i < num_dwords; i++) { - if (handle.IsEmpty()) { - ir.SetScalarReg(dst_reg++, ir.ReadConst(address, ir.Imm32(i))); - } else { - const IR::U32 index = ir.IAdd(address, ir.Imm32(i)); - ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(handle, index)); - } - } -} - void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) { const auto& smrd = inst.control.smrd; + ASSERT_MSG(smrd.imm, "Bindless texture loads unsupported"); const IR::ScalarReg sbase{inst.src[0].code * 2}; - const IR::U32 offset = - smrd.imm ? ir.Imm32(smrd.offset * 4) - : IR::U32{ir.ShiftLeftLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), - ir.Imm32(2))}; - const IR::U64 base = - ir.PackUint2x32(ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1))); - const IR::U64 address = ir.IAdd(base, offset); - const IR::ScalarReg dst_reg{inst.dst[0].code}; - Load(ir, num_dwords, {}, dst_reg, address); + const IR::Value base = + ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1)); + IR::ScalarReg dst_reg{inst.dst[0].code}; + for (u32 i = 0; i < num_dwords; i++) { + ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(smrd.offset + i))); + } } void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { @@ -37,8 +23,11 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { const IR::U32 dword_offset = smrd.imm ? ir.Imm32(smrd.offset) : ir.GetScalarReg(IR::ScalarReg(smrd.offset)); const IR::Value vsharp = ir.GetScalarReg(sbase); - const IR::ScalarReg dst_reg{inst.dst[0].code}; - Load(ir, num_dwords, vsharp, dst_reg, dword_offset); + IR::ScalarReg dst_reg{inst.dst[0].code}; + for (u32 i = 0; i < num_dwords; i++) { + const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i)); + ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(vsharp, index)); + } } } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index d5ea8c48..2abc87a6 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -58,16 +58,13 @@ void Translator::EmitPrologue() { } } -IR::U1U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { +IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { // Input modifiers work on float values. force_flt |= operand.input_modifier.abs | operand.input_modifier.neg; - IR::U1U32F32 value{}; + IR::U32F32 value{}; switch (operand.field) { case OperandField::ScalarGPR: - if (exec_contexts[operand.code]) { - value = ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code)); - } if (operand.type == ScalarType::Float32 || force_flt) { value = ir.GetScalarReg(IR::ScalarReg(operand.code)); } else { @@ -124,11 +121,12 @@ IR::U1U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { case OperandField::ConstFloatNeg_2_0: value = ir.Imm32(-2.0f); break; - case OperandField::ExecLo: - value = ir.GetExec(); - break; case OperandField::VccLo: - value = ir.GetVccLo(); + if (force_flt) { + value = ir.BitCast(ir.GetVccLo()); + } else { + value = ir.GetVccLo(); + } break; case OperandField::VccHi: value = ir.GetVccHi(); @@ -146,8 +144,8 @@ IR::U1U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { return value; } -void Translator::SetDst(const InstOperand& operand, const IR::U1U32F32& value) { - IR::U1U32F32 result = value; +void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) { + IR::U32F32 result = value; if (operand.output_modifier.multiplier != 0.f) { result = ir.FPMul(result, ir.Imm32(operand.output_modifier.multiplier)); } @@ -156,14 +154,9 @@ void Translator::SetDst(const InstOperand& operand, const IR::U1U32F32& value) { } switch (operand.field) { case OperandField::ScalarGPR: - if (value.Type() == IR::Type::U1) { - return ir.SetThreadBitScalarReg(IR::ScalarReg(operand.code), result); - } return ir.SetScalarReg(IR::ScalarReg(operand.code), result); case OperandField::VectorGPR: return ir.SetVectorReg(IR::VectorReg(operand.code), result); - case OperandField::ExecLo: - return ir.SetExec(result); case OperandField::VccLo: return ir.SetVccLo(result); case OperandField::VccHi: @@ -252,6 +245,12 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) break; case Opcode::S_WAITCNT: break; + case Opcode::S_LOAD_DWORDX4: + translator.S_LOAD_DWORD(4, inst); + break; + case Opcode::S_LOAD_DWORDX8: + translator.S_LOAD_DWORD(8, inst); + break; case Opcode::S_BUFFER_LOAD_DWORD: translator.S_BUFFER_LOAD_DWORD(1, inst); break; @@ -352,9 +351,18 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_CMP_LG_U32: translator.S_CMP(ConditionOp::LG, false, inst); break; + case Opcode::S_CMP_LG_I32: + translator.S_CMP(ConditionOp::LG, true, inst); + break; case Opcode::S_CMP_EQ_I32: translator.S_CMP(ConditionOp::EQ, true, inst); break; + case Opcode::S_CMP_EQ_U32: + translator.S_CMP(ConditionOp::EQ, false, inst); + break; + case Opcode::S_LSHL_B32: + translator.S_LSHL_B32(inst); + break; case Opcode::V_CNDMASK_B32: translator.V_CNDMASK_B32(inst); break; @@ -505,13 +513,21 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_CSELECT_B32: translator.S_CSELECT_B32(inst); break; + case Opcode::S_CSELECT_B64: + translator.S_CSELECT_B64(inst); + break; case Opcode::S_BFE_U32: translator.S_BFE_U32(inst); break; + case Opcode::V_RNDNE_F32: + translator.V_RNDNE_F32(inst); + break; case Opcode::S_NOP: case Opcode::S_CBRANCH_EXECZ: case Opcode::S_CBRANCH_SCC0: case Opcode::S_CBRANCH_SCC1: + case Opcode::S_CBRANCH_VCCNZ: + case Opcode::S_CBRANCH_VCCZ: case Opcode::S_BRANCH: case Opcode::S_WQM_B64: case Opcode::V_INTERP_P1_F32: diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index d1efb724..6fd8e3f5 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -46,7 +46,9 @@ public: void S_AND_B32(const GcnInst& inst); void S_LSHR_B32(const GcnInst& inst); void S_CSELECT_B32(const GcnInst& inst); + void S_CSELECT_B64(const GcnInst& inst); void S_BFE_U32(const GcnInst& inst); + void S_LSHL_B32(const GcnInst& inst); // Scalar Memory void S_LOAD_DWORD(int num_dwords, const GcnInst& inst); @@ -101,6 +103,7 @@ public: void V_LSHR_B32(const GcnInst& inst); void V_ASHRREV_I32(const GcnInst& inst); void V_MAD_U32_U24(const GcnInst& inst); + void V_RNDNE_F32(const GcnInst& inst); // Vector Memory void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); @@ -121,8 +124,8 @@ public: void EXP(const GcnInst& inst); private: - IR::U1U32F32 GetSrc(const InstOperand& operand, bool flt_zero = false); - void SetDst(const InstOperand& operand, const IR::U1U32F32& value); + IR::U32F32 GetSrc(const InstOperand& operand, bool flt_zero = false); + void SetDst(const InstOperand& operand, const IR::U32F32& value); private: IR::IREmitter ir; diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 085d8694..7484da57 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -33,7 +33,7 @@ void Translator::V_CNDMASK_B32(const GcnInst& inst) { const IR::VectorReg dst_reg{inst.dst[0].code}; const IR::ScalarReg flag_reg{inst.src[2].code}; const IR::U1 flag = inst.src[2].field == OperandField::ScalarGPR - ? ir.INotEqual(ir.GetScalarReg(flag_reg), ir.Imm32(0U)) + ? ir.GetThreadBitScalarReg(flag_reg) : ir.GetVcc(); // We can treat the instruction as integer most of the time, but when a source is @@ -85,21 +85,21 @@ void Translator::V_CVT_F32_U32(const GcnInst& inst) { } void Translator::V_MAD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0])}; - const IR::F32 src1{GetSrc(inst.src[1])}; - const IR::F32 src2{GetSrc(inst.src[2])}; + const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src2{GetSrc(inst.src[2], true)}; SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } void Translator::V_FRACT_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.Fract(src0)); } void Translator::V_ADD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0])}; - const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; SetDst(inst.dst[0], ir.FPAdd(src0, src1)); } @@ -114,14 +114,14 @@ void Translator::V_CVT_OFF_F32_I4(const GcnInst& inst) { void Translator::V_MED3_F32(const GcnInst& inst) { const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1])}; - const IR::F32 src2{GetSrc(inst.src[2])}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src2{GetSrc(inst.src[2], true)}; const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2); SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx)); } void Translator::V_FLOOR_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.FPFloor(src0)); } @@ -167,7 +167,17 @@ void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) { UNREACHABLE(); } }(); - ir.SetVcc(result); + + switch (inst.dst[1].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), result); + break; + default: + UNREACHABLE(); + } } void Translator::V_MAX_F32(const GcnInst& inst) { @@ -357,4 +367,9 @@ void Translator::V_MAD_U32_U24(const GcnInst& inst) { V_MAD_I32_I24(inst); } +void Translator::V_RNDNE_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0], true)}; + SetDst(inst.dst[0], ir.FPRoundEven(src0)); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index bd41d587..cf57939d 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -273,8 +273,8 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) }*/ } -U32 IREmitter::ReadConst(const U64& address, const U32& offset) { - return Inst(Opcode::ReadConst, address, offset); +U32 IREmitter::ReadConst(const Value& base, const U32& offset) { + return Inst(Opcode::ReadConst, base, offset); } F32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 3394c9b6..707c127e 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -77,7 +77,7 @@ public: [[nodiscard]] U32U64 ReadShared(int bit_size, bool is_signed, const U32& offset); void WriteShared(int bit_size, const Value& value, const U32& offset); - [[nodiscard]] U32 ReadConst(const U64& address, const U32& offset); + [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] F32 ReadConstBuffer(const Value& handle, const U32& index); [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address, diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index a3009575..bd506f44 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -15,7 +15,7 @@ OPCODE(Epilogue, Void, OPCODE(Discard, Void, ) // Constant memory operations -OPCODE(ReadConst, U32, U64, U32, ) +OPCODE(ReadConst, U32, U32x2, U32, ) OPCODE(ReadConstBuffer, F32, Opaque, U32, ) OPCODE(ReadConstBufferU32, U32, Opaque, U32, ) diff --git a/src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp b/src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp index 24c6b548..32479730 100644 --- a/src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp +++ b/src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp @@ -5,10 +5,10 @@ namespace Shader::Optimization { -void DeadCodeEliminationPass(IR::BlockList& program) { +void DeadCodeEliminationPass(IR::Program& program) { // We iterate over the instructions in reverse order. // This is because removing an instruction reduces the number of uses for earlier instructions. - for (IR::Block* const block : program) { + for (IR::Block* const block : program.post_order_blocks) { auto it{block->end()}; while (it != block->begin()) { --it; @@ -20,4 +20,4 @@ void DeadCodeEliminationPass(IR::BlockList& program) { } } -} // namespace Shader::Optimization \ No newline at end of file +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 915bb80e..bf2ba4d6 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -10,7 +10,7 @@ namespace Shader::Optimization { void SsaRewritePass(IR::BlockList& program); void IdentityRemovalPass(IR::BlockList& program); -void DeadCodeEliminationPass(IR::BlockList& program); +void DeadCodeEliminationPass(IR::Program& program); void ConstantPropagationPass(IR::BlockList& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index c8e8d9cf..68b4fb11 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -157,16 +157,16 @@ SharpLocation TrackSharp(const IR::Inst* inst) { ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, "Sharp load not from constant memory"); // Retrieve offset from base. - IR::Inst* addr = inst->Arg(0).InstRecursive(); - u32 dword_offset = addr->Arg(1).U32(); - addr = addr->Arg(0).InstRecursive(); - ASSERT_MSG(addr->Arg(1).IsImmediate(), "Bindless not supported"); - dword_offset += addr->Arg(1).U32() >> 2; + const u32 dword_offset = inst->Arg(1).U32(); + const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive(); - // Retrieve SGPR that holds sbase - inst = addr->Arg(0).InstRecursive()->Arg(0).InstRecursive(); - ASSERT_MSG(inst->GetOpcode() == IR::Opcode::GetUserData, "Nested resource loads not supported"); - const IR::ScalarReg base = inst->Arg(0).ScalarReg(); + // Retrieve SGPR pair that holds sbase + const IR::Inst* sbase0 = spgpr_base->Arg(0).InstRecursive(); + const IR::Inst* sbase1 = spgpr_base->Arg(1).InstRecursive(); + ASSERT_MSG(sbase0->GetOpcode() == IR::Opcode::GetUserData && + sbase1->GetOpcode() == IR::Opcode::GetUserData, + "Nested resource loads not supported"); + const IR::ScalarReg base = sbase0->Arg(0).ScalarReg(); // Return retrieved location. return SharpLocation{ @@ -186,7 +186,7 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, .stride = buffer.GetStride(), .num_records = u32(buffer.num_records), .used_types = BufferDataType(inst), - .is_storage = true || IsBufferStore(inst), + .is_storage = IsBufferStore(inst), }); const auto inst_info = inst.Flags(); IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; @@ -206,8 +206,8 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32); IR::U32 address = ir.Imm32(dword_offset); if (inst_info.index_enable && inst_info.offset_enable) { - const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 0)}; - const IR::U32 index{ir.CompositeExtract(inst.Arg(1), 1)}; + const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 1)}; + const IR::U32 index{ir.CompositeExtract(inst.Arg(1), 0)}; address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address); address = ir.IAdd(address, ir.ShiftRightLogical(offset, ir.Imm32(2))); } else if (inst_info.index_enable) { diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index 82d40a9b..8c97f495 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -219,7 +219,6 @@ using U64 = TypedValue; using F16 = TypedValue; using F32 = TypedValue; using F64 = TypedValue; -using U1U32F32 = TypedValue; using U32F32 = TypedValue; using U32U64 = TypedValue; using F32F64 = TypedValue; diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 189a2ab1..f2834abf 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -58,7 +58,7 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool dcb, std::span(header); + regs.max_index_size = draw_index_off->max_size; + regs.num_indices = draw_index_off->index_count; + regs.draw_initiator = draw_index_off->draw_initiator; + if (rasterizer) { + rasterizer->Draw(true, draw_index_off->index_offset); + } + break; + } case PM4ItOpcode::DrawIndexAuto: { const auto* draw_index = reinterpret_cast(header); regs.num_indices = draw_index->index_count; @@ -272,6 +282,17 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + regs.num_instances.num_instances = num_instances->num_instances; + break; + } + case PM4ItOpcode::IndexBase: { + const auto* index_base = reinterpret_cast(header); + regs.index_base_address.base_addr_lo = index_base->addr_lo; + regs.index_base_address.base_addr_hi.Assign(index_base->addr_hi); + break; + } case PM4ItOpcode::EventWrite: { // const auto* event = reinterpret_cast(header); break; diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index c7b1452b..4883834f 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -548,4 +548,15 @@ struct PM4CmdDispatchDirect { u32 dispatch_initiator; ///< Dispatch Initiator Register }; +struct PM4CmdDrawNumInstances { + PM4Type3Header header; + u32 num_instances; +}; + +struct PM4CmdDrawIndexBase { + PM4Type3Header header; + u32 addr_lo; + u32 addr_hi; +}; + } // namespace AmdGpu diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index ccbb400d..eb319f09 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -14,6 +14,8 @@ vk::StencilOp StencilOp(Liverpool::StencilFunc op) { return vk::StencilOp::eKeep; case Liverpool::StencilFunc::Zero: return vk::StencilOp::eZero; + case Liverpool::StencilFunc::ReplaceTest: + return vk::StencilOp::eReplace; case Liverpool::StencilFunc::AddClamp: return vk::StencilOp::eIncrementAndClamp; case Liverpool::StencilFunc::SubClamp: @@ -307,6 +309,13 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu if (data_format == AmdGpu::DataFormat::FormatBc3 && num_format == AmdGpu::NumberFormat::Srgb) { return vk::Format::eBc3SrgbBlock; } + if (data_format == AmdGpu::DataFormat::Format16_16_16_16 && + num_format == AmdGpu::NumberFormat::Sint) { + return vk::Format::eR16G16B16A16Sint; + } + if (data_format == AmdGpu::DataFormat::FormatBc7 && num_format == AmdGpu::NumberFormat::Srgb) { + return vk::Format::eBc7SrgbBlock; + } UNREACHABLE(); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 3227a232..66fee434 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -81,8 +81,17 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler ComputePipeline::~ComputePipeline() = default; -void ComputePipeline::BindResources(Core::MemoryManager* memory, +void ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging, VideoCore::TextureCache& texture_cache) const { + static constexpr u64 MinUniformAlignment = 64; + + const auto map_staging = [&](auto src, size_t size) { + const auto [data, offset, _] = staging.Map(size, MinUniformAlignment); + std::memcpy(data, reinterpret_cast(src), size); + staging.Commit(size); + return offset; + }; + // Bind resource buffers and textures. boost::container::static_vector buffer_infos; boost::container::static_vector image_infos; @@ -94,8 +103,9 @@ void ComputePipeline::BindResources(Core::MemoryManager* memory, const u32 size = vsharp.GetSize(); const VAddr addr = vsharp.base_address.Value(); texture_cache.OnCpuWrite(addr); - const auto [vk_buffer, offset] = memory->GetVulkanBuffer(addr); - buffer_infos.emplace_back(vk_buffer, offset, size); + const u32 offset = map_staging(addr, size); + // const auto [vk_buffer, offset] = memory->GetVulkanBuffer(addr); + buffer_infos.emplace_back(staging.Handle(), offset, size); set_writes.push_back({ .dstSet = VK_NULL_HANDLE, .dstBinding = binding++, diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index df9743c2..781bd81b 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -31,7 +31,8 @@ public: return *pipeline; } - void BindResources(Core::MemoryManager* memory, VideoCore::TextureCache& texture_cache) const; + void BindResources(Core::MemoryManager* memory, StreamBuffer& staging, + VideoCore::TextureCache& texture_cache) const; private: const Instance& instance; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 37d6f72b..d49e7138 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -32,10 +32,10 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, Rasterizer::~Rasterizer() = default; -void Rasterizer::Draw(bool is_indexed) { +void Rasterizer::Draw(bool is_indexed, u32 index_offset) { const auto cmdbuf = scheduler.CommandBuffer(); const auto& regs = liverpool->regs; - const u32 num_indices = SetupIndexBuffer(is_indexed); + const u32 num_indices = SetupIndexBuffer(is_indexed, index_offset); const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); pipeline->BindResources(memory, vertex_index_buffer, texture_cache); @@ -85,17 +85,16 @@ void Rasterizer::Draw(bool is_indexed) { } void Rasterizer::DispatchDirect() { - return; const auto cmdbuf = scheduler.CommandBuffer(); const auto& cs_program = liverpool->regs.cs_program; const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); - pipeline->BindResources(memory, texture_cache); + pipeline->BindResources(memory, vertex_index_buffer, texture_cache); cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle()); cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z); } -u32 Rasterizer::SetupIndexBuffer(bool& is_indexed) { +u32 Rasterizer::SetupIndexBuffer(bool& is_indexed, u32 index_offset) { // Emulate QuadList primitive type with CPU made index buffer. const auto& regs = liverpool->regs; if (liverpool->regs.primitive_type == Liverpool::PrimitiveType::QuadList) { @@ -131,7 +130,8 @@ u32 Rasterizer::SetupIndexBuffer(bool& is_indexed) { // Bind index buffer. const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset, index_type); + cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset + index_offset * index_size, + index_type); return regs.num_indices; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index a1b6a5a6..2ff4c244 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -29,12 +29,12 @@ public: VideoCore::TextureCache& texture_cache, AmdGpu::Liverpool* liverpool); ~Rasterizer(); - void Draw(bool is_indexed); + void Draw(bool is_indexed, u32 index_offset = 0); void DispatchDirect(); private: - u32 SetupIndexBuffer(bool& is_indexed); + u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset); void MapMemory(VAddr addr, size_t size); void UpdateDynamicState(const GraphicsPipeline& pipeline); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index bf39963c..39f89878 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -116,7 +116,7 @@ Image& TextureCache::FindImage(const ImageInfo& info, VAddr cpu_address) { std::unique_lock lock{m_page_table}; boost::container::small_vector image_ids; ForEachImageInRegion(cpu_address, info.guest_size_bytes, [&](ImageId image_id, Image& image) { - if (image.cpu_addr == cpu_address) { + if (image.cpu_addr == cpu_address && image.info.size.width == info.size.width) { image_ids.push_back(image_id); } }); @@ -216,54 +216,45 @@ void TextureCache::RefreshImage(Image& image) { return; } - const vk::ImageSubresourceRange range = { - .aspectMask = vk::ImageAspectFlagBits::eColor, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }; - const u8* image_data = reinterpret_cast(image.cpu_addr); - for (u32 l = 0; l < image.info.resources.layers; l++) { + for (u32 m = 0; m < image.info.resources.levels; m++) { + const u32 width = image.info.size.width >> m; + const u32 height = image.info.size.height >> m; + const u32 map_size = width * height * image.info.resources.layers; + // Upload data to the staging buffer. - for (u32 m = 0; m < image.info.resources.levels; m++) { - const u32 width = image.info.size.width >> m; - const u32 height = image.info.size.height >> m; - const u32 map_size = width * height; - const auto [data, offset, _] = staging.Map(map_size, 16); - if (image.info.is_tiled) { - ConvertTileToLinear(data, image_data, width, height, Config::isNeoMode()); - } else { - std::memcpy(data, image_data, map_size); - } - staging.Commit(map_size); - image_data += map_size; - - // Copy to the image. - const vk::BufferImageCopy image_copy = { - .bufferOffset = offset, - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource{ - .aspectMask = vk::ImageAspectFlagBits::eColor, - .mipLevel = m, - .baseArrayLayer = l, - .layerCount = 1, - }, - .imageOffset = {0, 0, 0}, - .imageExtent = {width, height, 1}, - }; - - const auto cmdbuf = scheduler.CommandBuffer(); - image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); - - cmdbuf.copyBufferToImage(staging.Handle(), image.image, - vk::ImageLayout::eTransferDstOptimal, image_copy); - - image.Transit(vk::ImageLayout::eGeneral, - vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); + const auto [data, offset, _] = staging.Map(map_size, 16); + if (image.info.is_tiled) { + ConvertTileToLinear(data, image_data, width, height, Config::isNeoMode()); + } else { + std::memcpy(data, image_data, map_size); } + staging.Commit(map_size); + image_data += map_size; + + // Copy to the image. + const vk::BufferImageCopy image_copy = { + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource{ + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = m, + .baseArrayLayer = 0, + .layerCount = u32(image.info.resources.layers), + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {width, height, 1}, + }; + + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); + + cmdbuf.copyBufferToImage(staging.Handle(), image.image, + vk::ImageLayout::eTransferDstOptimal, image_copy); + + image.Transit(vk::ImageLayout::eGeneral, + vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); } }