diff --git a/CMakeLists.txt b/CMakeLists.txt index 721f1128..43ad23e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -488,6 +488,7 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/amdgpu/pm4_cmds.h src/video_core/amdgpu/pm4_opcodes.h src/video_core/amdgpu/resource.h + src/video_core/amdgpu/default_context.cpp src/video_core/buffer_cache/buffer.cpp src/video_core/buffer_cache/buffer.h src/video_core/buffer_cache/buffer_cache.cpp diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index c2ee6d59..95821a03 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -55,6 +55,10 @@ static constexpr auto HwInitPacketSize = 0x100u; // clang-format off static constexpr std::array InitSequence{ + // A fake preamble to mimic context reset sent by FW + 0xc0001200u, 0u, // IT_CLEAR_STATE + + // Actual init state sequence 0xc0017600u, 0x216u, 0xffffffffu, 0xc0017600u, 0x217u, 0xffffffffu, 0xc0017600u, 0x215u, 0u, @@ -94,9 +98,13 @@ static constexpr std::array InitSequence{ 0xc0036900u, 0x295u, 0x100u, 0x100u, 4u, 0xc0017900u, 0x200u, 0xe0000000u, }; -static_assert(InitSequence.size() == 0x73); +static_assert(InitSequence.size() == 0x73 + 2); static constexpr std::array InitSequence175{ + // A fake preamble to mimic context reset sent by FW + 0xc0001200u, 0u, // IT_CLEAR_STATE + + // Actual init state sequence 0xc0017600u, 0x216u, 0xffffffffu, 0xc0017600u, 0x217u, 0xffffffffu, 0xc0017600u, 0x215u, 0u, @@ -136,9 +144,13 @@ static constexpr std::array InitSequence175{ 0xc0036900u, 0x295u, 0x100u, 0x100u, 4u, 0xc0017900u, 0x200u, 0xe0000000u, }; -static_assert(InitSequence175.size() == 0x73); +static_assert(InitSequence175.size() == 0x73 + 2); static constexpr std::array InitSequence200{ + // A fake preamble to mimic context reset sent by FW + 0xc0001200u, 0u, // IT_CLEAR_STATE + + // Actual init state sequence 0xc0017600u, 0x216u, 0xffffffffu, 0xc0017600u, 
0x217u, 0xffffffffu, 0xc0017600u, 0x215u, 0u, @@ -179,9 +191,13 @@ static constexpr std::array InitSequence200{ 0xc0036900u, 0x295u, 0x100u, 0x100u, 4u, 0xc0017900u, 0x200u, 0xe0000000u, }; -static_assert(InitSequence200.size() == 0x76); +static_assert(InitSequence200.size() == 0x76 + 2); static constexpr std::array InitSequence350{ + // A fake preamble to mimic context reset sent by FW + 0xc0001200u, 0u, // IT_CLEAR_STATE + + // Actual init state sequence 0xc0017600u, 0x216u, 0xffffffffu, 0xc0017600u, 0x217u, 0xffffffffu, 0xc0017600u, 0x215u, 0u, @@ -224,7 +240,7 @@ static constexpr std::array InitSequence350{ 0xc0017900u, 0x200u, 0xe0000000u, 0xc0016900u, 0x2aau, 0xffu, }; -static_assert(InitSequence350.size() == 0x7c); +static_assert(InitSequence350.size() == 0x7c + 2); static constexpr std::array CtxInitSequence{ 0xc0012800u, 0x80000000u, 0x80000000u, @@ -735,11 +751,11 @@ u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState(u32* cmdbuf, u32 size) { cmdbuf = ClearContextState(cmdbuf); } - std::memcpy(cmdbuf, InitSequence.data(), InitSequence.size() * 4); - cmdbuf += InitSequence.size(); + std::memcpy(cmdbuf, &InitSequence[2], (InitSequence.size() - 2) * 4); + cmdbuf += InitSequence.size() - 2; const auto cmdbuf_left = - HwInitPacketSize - InitSequence.size() - (clear_state ? 0xc : 0) - 1; + HwInitPacketSize - (InitSequence.size() - 2) - (clear_state ? 
0xc : 0) - 1; cmdbuf = WriteHeader(cmdbuf, cmdbuf_left); cmdbuf = WriteBody(cmdbuf, 0u); @@ -757,10 +773,10 @@ u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState175(u32* cmdbuf, u32 size) { } cmdbuf = ClearContextState(cmdbuf); - std::memcpy(cmdbuf, InitSequence175.data(), InitSequence175.size() * 4); - cmdbuf += InitSequence175.size(); + std::memcpy(cmdbuf, &InitSequence175[2], (InitSequence175.size() - 2) * 4); + cmdbuf += InitSequence175.size() - 2; - constexpr auto cmdbuf_left = HwInitPacketSize - InitSequence175.size() - 0xc - 1; + constexpr auto cmdbuf_left = HwInitPacketSize - (InitSequence175.size() - 2) - 0xc - 1; WriteTrailingNop(cmdbuf); return HwInitPacketSize; @@ -778,11 +794,11 @@ u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState200(u32* cmdbuf, u32 size) { cmdbuf = ClearContextState(cmdbuf); } - std::memcpy(cmdbuf, InitSequence200.data(), InitSequence200.size() * 4); - cmdbuf += InitSequence200.size(); + std::memcpy(cmdbuf, &InitSequence200[2], (InitSequence200.size() - 2) * 4); + cmdbuf += InitSequence200.size() - 2; const auto cmdbuf_left = - HwInitPacketSize - InitSequence200.size() - (clear_state ? 0xc : 0) - 1; + HwInitPacketSize - (InitSequence200.size() - 2) - (clear_state ? 0xc : 0) - 1; cmdbuf = WriteHeader(cmdbuf, cmdbuf_left); cmdbuf = WriteBody(cmdbuf, 0u); @@ -804,11 +820,11 @@ u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState350(u32* cmdbuf, u32 size) { cmdbuf = ClearContextState(cmdbuf); } - std::memcpy(cmdbuf, InitSequence350.data(), InitSequence350.size() * 4); - cmdbuf += InitSequence350.size(); + std::memcpy(cmdbuf, &InitSequence350[2], (InitSequence350.size() - 2) * 4); + cmdbuf += InitSequence350.size() - 2; const auto cmdbuf_left = - HwInitPacketSize - InitSequence350.size() - (clear_state ? 0xc : 0) - 1; + HwInitPacketSize - (InitSequence350.size() - 2) - (clear_state ? 
0xc : 0) - 1; cmdbuf = WriteHeader(cmdbuf, cmdbuf_left); cmdbuf = WriteBody(cmdbuf, 0u); @@ -1743,7 +1759,7 @@ s32 PS4_SYSV_ABI sceGnmSetVsShader(u32* cmdbuf, u32 size, const u32* vs_regs, u3 return -1; } - const u32 var = shader_modifier == 0 ? vs_regs[2] : (vs_regs[2] & 0xfcfffc3f | shader_modifier); + const u32 var = shader_modifier == 0 ? vs_regs[2] : (vs_regs[2] & 0xfcfffc3f) | shader_modifier; cmdbuf = PM4CmdSetData::SetShReg(cmdbuf, 0x48u, vs_regs[0], 0u); // SPI_SHADER_PGM_LO_VS cmdbuf = PM4CmdSetData::SetShReg(cmdbuf, 0x4au, var, vs_regs[3]); // SPI_SHADER_PGM_RSRC1_VS cmdbuf = PM4CmdSetData::SetContextReg(cmdbuf, 0x207u, vs_regs[6]); // PA_CL_VS_OUT_CNTL diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp index d13062cd..a6c1a762 100644 --- a/src/core/libraries/videoout/video_out.cpp +++ b/src/core/libraries/videoout/video_out.cpp @@ -185,14 +185,16 @@ s32 PS4_SYSV_ABI sceVideoOutGetFlipStatus(s32 handle, FlipStatus* status) { return ORBIS_VIDEO_OUT_ERROR_INVALID_HANDLE; } - std::unique_lock lock{port->port_mutex}; - *status = port->flip_status; + { + std::unique_lock lock{port->port_mutex}; + *status = port->flip_status; + } - LOG_INFO(Lib_VideoOut, - "count = {}, processTime = {}, tsc = {}, submitTsc = {}, flipArg = {}, gcQueueNum = " - "{}, flipPendingNum = {}, currentBuffer = {}", - status->count, status->processTime, status->tsc, status->submitTsc, status->flipArg, - status->gcQueueNum, status->flipPendingNum, status->currentBuffer); + LOG_TRACE(Lib_VideoOut, + "count = {}, processTime = {}, tsc = {}, submitTsc = {}, flipArg = {}, gcQueueNum = " + "{}, flipPendingNum = {}, currentBuffer = {}", + status->count, status->processTime, status->tsc, status->submitTsc, status->flipArg, + status->gcQueueNum, status->flipPendingNum, status->currentBuffer); return ORBIS_OK; } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 
c7042763..09a9fd62 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -179,6 +179,7 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { spv::ExecutionModel execution_model{}; ctx.AddCapability(spv::Capability::Image1D); ctx.AddCapability(spv::Capability::Sampled1D); + ctx.AddCapability(spv::Capability::ImageQuery); if (info.uses_fp16) { ctx.AddCapability(spv::Capability::Float16); ctx.AddCapability(spv::Capability::Int16); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index e48b89f4..994c2847 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -405,6 +405,10 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) { image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { return spv::ImageFormat::Rg16f; } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format16_16 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Snorm) { + return spv::ImageFormat::Rg16Snorm; + } if (image.GetDataFmt() == AmdGpu::DataFormat::Format8_8 && image.GetNumberFmt() == AmdGpu::NumberFormat::Unorm) { return spv::ImageFormat::Rg8; diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp index 3faf8665..276bd9db 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.cpp +++ b/src/shader_recompiler/frontend/control_flow_graph.cpp @@ -21,8 +21,13 @@ struct Compare { } }; -static IR::Condition MakeCondition(Opcode opcode) { - switch (opcode) { +static IR::Condition MakeCondition(const GcnInst& inst) { + if (inst.IsCmpx()) { + ASSERT(inst.opcode == Opcode::V_CMPX_NE_U32); + return IR::Condition::Execnz; + } + + switch (inst.opcode) { case Opcode::S_CBRANCH_SCC0: return IR::Condition::Scc0; case Opcode::S_CBRANCH_SCC1: @@ -37,7 +42,6 @@ static 
IR::Condition MakeCondition(Opcode opcode) { return IR::Condition::Execnz; case Opcode::S_AND_SAVEEXEC_B64: case Opcode::S_ANDN2_B64: - case Opcode::V_CMPX_NE_U32: return IR::Condition::Execnz; default: return IR::Condition::True; @@ -94,7 +98,8 @@ void CFG::EmitDivergenceLabels() { // While this instruction does not save EXEC it is often used paired // with SAVEEXEC to mask the threads that didn't pass the condition // of initial branch. - inst.opcode == Opcode::S_ANDN2_B64 || inst.opcode == Opcode::V_CMPX_NE_U32; + (inst.opcode == Opcode::S_ANDN2_B64 && inst.dst[0].field == OperandField::ExecLo) || + inst.opcode == Opcode::V_CMPX_NE_U32; }; const auto is_close_scope = [](const GcnInst& inst) { // Closing an EXEC scope can be either a branch instruction @@ -104,7 +109,8 @@ void CFG::EmitDivergenceLabels() { // Sometimes compiler might insert instructions between the SAVEEXEC and the branch. // Those instructions need to be wrapped in the condition as well so allow branch // as end scope instruction. - inst.opcode == Opcode::S_CBRANCH_EXECZ || inst.opcode == Opcode::S_ANDN2_B64; + inst.opcode == Opcode::S_CBRANCH_EXECZ || + (inst.opcode == Opcode::S_ANDN2_B64 && inst.dst[0].field == OperandField::ExecLo); }; // Since we will be adding new labels, avoid iterating those as well. 
@@ -171,7 +177,7 @@ void CFG::EmitBlocks() { block->begin_index = GetIndex(start); block->end_index = end_index; block->end_inst = end_inst; - block->cond = MakeCondition(end_inst.opcode); + block->cond = MakeCondition(end_inst); blocks.insert(*block); } } diff --git a/src/shader_recompiler/frontend/instruction.cpp b/src/shader_recompiler/frontend/instruction.cpp index 756d3b4e..a0c13205 100644 --- a/src/shader_recompiler/frontend/instruction.cpp +++ b/src/shader_recompiler/frontend/instruction.cpp @@ -47,4 +47,18 @@ bool GcnInst::IsConditionalBranch() const { return false; } +bool GcnInst::IsCmpx() const { + if ((opcode >= Opcode::V_CMPX_F_F32 && opcode <= Opcode::V_CMPX_T_F32) || + (opcode >= Opcode::V_CMPX_F_F64 && opcode <= Opcode::V_CMPX_T_F64) || + (opcode >= Opcode::V_CMPSX_F_F32 && opcode <= Opcode::V_CMPSX_T_F32) || + (opcode >= Opcode::V_CMPSX_F_F64 && opcode <= Opcode::V_CMPSX_T_F64) || + (opcode >= Opcode::V_CMPX_F_I32 && opcode <= Opcode::V_CMPX_CLASS_F32) || + (opcode >= Opcode::V_CMPX_F_I64 && opcode <= Opcode::V_CMPX_CLASS_F64) || + (opcode >= Opcode::V_CMPX_F_U32 && opcode <= Opcode::V_CMPX_T_U32) || + (opcode >= Opcode::V_CMPX_F_U64 && opcode <= Opcode::V_CMPX_T_U64)) { + return true; + } + return false; +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/instruction.h b/src/shader_recompiler/frontend/instruction.h index f83f43db..7c2e0bd1 100644 --- a/src/shader_recompiler/frontend/instruction.h +++ b/src/shader_recompiler/frontend/instruction.h @@ -203,6 +203,7 @@ struct GcnInst { bool IsUnconditionalBranch() const; bool IsConditionalBranch() const; bool IsFork() const; + bool IsCmpx() const; }; } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 1bbc3c16..0216238a 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -280,6 +280,8 @@ 
void Translator::EmitVectorAlu(const GcnInst& inst) { return V_CMP_U32(ConditionOp::GT, true, false, inst); case Opcode::V_CMP_LT_I32: return V_CMP_U32(ConditionOp::LT, true, false, inst); + case Opcode::V_CMPX_GT_I32: + return V_CMP_U32(ConditionOp::GT, true, true, inst); case Opcode::V_CMPX_LT_I32: return V_CMP_U32(ConditionOp::LT, true, true, inst); case Opcode::V_CMPX_F_U32: diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index efee710d..016ba366 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -246,10 +246,7 @@ public: return true; } // Samplers with different bindings might still be the same. - const auto old_sharp = - info.ReadUd(existing.sgpr_base, existing.dword_offset); - const auto new_sharp = info.ReadUd(desc.sgpr_base, desc.dword_offset); - return old_sharp == new_sharp; + return existing.GetSsharp(info) == desc.GetSsharp(info); })}; return index; } diff --git a/src/video_core/amdgpu/default_context.cpp b/src/video_core/amdgpu/default_context.cpp new file mode 100644 index 00000000..01229e7b --- /dev/null +++ b/src/video_core/amdgpu/default_context.cpp @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/types.h" +#include "video_core/amdgpu/liverpool.h" + +#include <array> + +namespace AmdGpu { + +// The following values are taken from fpPS4: +// https://github.com/red-prig/fpPS4/blob/436b43064be4c78229500f3d3c054fc76639247d/chip/pm4_pfp.pas#L410 +// +static constexpr std::array reg_array_default{ + 0x00000000u, 0x80000000u, 0x40004000u, 0xdeadbeefu, 0x00000000u, 0x40004000u, 0x00000000u, + 0x40004000u, 0x00000000u, 0x40004000u, 0x00000000u, 0x40004000u, 0xaa99aaaau, 0x00000000u, + 0xdeadbeefu, 0xdeadbeefu, 0x80000000u, 0x40004000u, 0x00000000u, 0x00000000u, 0x80000000u, + 0x40004000u, 
0x80000000u, 0x40004000u, 0x80000000u, 0x40004000u, 0x80000000u, 0x40004000u, + 0x80000000u, 0x40004000u, 0x80000000u, 0x40004000u, 0x80000000u, 0x40004000u, 0x80000000u, + 0x40004000u, 0x80000000u, 0x40004000u, 0x80000000u, 0x40004000u, 0x80000000u, 0x40004000u, + 0x80000000u, 0x40004000u, 0x80000000u, 0x40004000u, 0x80000000u, 0x40004000u, 0x80000000u, + 0x40004000u, 0x80000000u, 0x40004000u, 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, + 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, 0x00000000u, + 0x3f800000u, 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, + 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, 0x00000000u, + 0x3f800000u, 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, 0x00000000u, 0x3f800000u, + 0x2a00161au, +}; + +void Liverpool::Regs::SetDefaults() { + std::memset(reg_array.data(), 0, reg_array.size() * sizeof(u32)); + + std::memcpy(&reg_array[ContextRegWordOffset + 0x80], reg_array_default.data(), + reg_array_default.size() * sizeof(u32)); + + // Individual context regs values + reg_array[ContextRegWordOffset + 0x000d] = 0x40004000u; + reg_array[ContextRegWordOffset + 0x01b6] = 0x00000002u; + reg_array[ContextRegWordOffset + 0x0204] = 0x00090000u; + reg_array[ContextRegWordOffset + 0x0205] = 0x00000004u; + reg_array[ContextRegWordOffset + 0x0295] = 0x00000100u; + reg_array[ContextRegWordOffset + 0x0296] = 0x00000080u; + reg_array[ContextRegWordOffset + 0x0297] = 0x00000002u; + reg_array[ContextRegWordOffset + 0x02aa] = 0x00001000u; + reg_array[ContextRegWordOffset + 0x02f7] = 0x00001000u; + reg_array[ContextRegWordOffset + 0x02f9] = 0x00000005u; + reg_array[ContextRegWordOffset + 0x02fa] = 0x3f800000u; + reg_array[ContextRegWordOffset + 0x02fb] = 0x3f800000u; + reg_array[ContextRegWordOffset + 0x02fc] = 0x3f800000u; + reg_array[ContextRegWordOffset + 0x02fd] = 0x3f800000u; + reg_array[ContextRegWordOffset + 0x0316] = 0x0000000eu; + 
reg_array[ContextRegWordOffset + 0x0317] = 0x00000010u; +} + +} // namespace AmdGpu diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index dce2d4b4..89ab7fe4 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -216,6 +216,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span usage_override /*= {}*/) - : info{info_}, image_id{image_id_} { + : image_id{image_id_}, info{info_} { vk::ImageViewUsageCreateInfo usage_ci{}; if (usage_override) { usage_ci.usage = usage_override.value(); diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index 6bb104a6..0bed5adc 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -342,12 +342,6 @@ TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /* .usage = usage, }; -#ifdef __APPLE__ - // Fix for detiler artifacts on macOS - const bool is_large_buffer = true; -#else - const bool is_large_buffer = size > 128_MB; -#endif VmaAllocationCreateInfo alloc_info{ .flags = !is_storage ? VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT @@ -462,7 +456,6 @@ std::optional TileManager::TryDetile(Image& image) { (m > 0 ? params.sizes[m - 1] : 0); } - auto pitch = image.info.pitch; cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(params), &params);