diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index dbe454b2..921babfa 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -27,7 +27,7 @@ template static inline u32* WriteTrailingNop(u32* cmdbuf) { auto* nop = reinterpret_cast(cmdbuf); nop->header = PM4Type3Header{PM4ItOpcode::Nop, data_block_size - 1}; - nop->data_block[0] = 0; // only one out of `data_block_size` is initialized + nop->data_block[0] = 0u; // only one out of `data_block_size` is initialized return cmdbuf + data_block_size + 1 /* header */; } @@ -48,9 +48,8 @@ s32 PS4_SYSV_ABI sceGnmAddEqEvent(SceKernelEqueue eq, u64 id, void* udata) { kernel_event.event.udata = udata; eq->addEvent(kernel_event); - liverpool->eop_callback = [=]() { - eq->triggerEvent(SceKernelEvent::Type::GfxEop, EVFILT_GRAPHICS_CORE, nullptr); - }; + liverpool->SetEopCallback( + [=]() { eq->triggerEvent(SceKernelEvent::Type::GfxEop, EVFILT_GRAPHICS_CORE, nullptr); }); return ORBIS_OK; } @@ -82,7 +81,7 @@ s32 PS4_SYSV_ABI sceGnmComputeWaitOnAddress(u32* cmdbuf, u32 size, uintptr_t add wait_reg_mem->poll_addr_hi = u32(addr >> 32u); wait_reg_mem->ref = ref; wait_reg_mem->mask = mask; - wait_reg_mem->poll_interval = 10; + wait_reg_mem->poll_interval = 10u; WriteTrailingNop<2>(cmdbuf + 7); return ORBIS_OK; @@ -652,10 +651,10 @@ s32 PS4_SYSV_ABI sceGnmInsertWaitFlipDone(u32* cmdbuf, u32 size, s32 vo_handle, auto* wait_reg_mem = reinterpret_cast(cmdbuf); wait_reg_mem->header = PM4Type3Header{PM4ItOpcode::WaitRegMem, 5}; - wait_reg_mem->function.Assign(3u); - wait_reg_mem->mem_space.Assign(1u); + wait_reg_mem->function.Assign(PM4CmdWaitRegMem::Function::Equal); + wait_reg_mem->mem_space.Assign(PM4CmdWaitRegMem::MemSpace::Memory); *reinterpret_cast(&wait_reg_mem->poll_addr_lo) = - (label_addr + buf_idx * sizeof(uintptr_t)) & 0xffff'fffcu; + (label_addr + buf_idx * sizeof(uintptr_t)) & ~0x3ull; wait_reg_mem->ref = 0u; 
wait_reg_mem->mask = 0xffff'ffffu; wait_reg_mem->poll_interval = 10u; @@ -1303,7 +1302,7 @@ static inline s32 PatchFlipRequest(u32* cmdbuf, u32 size, u32 vo_handle, u32 buf write_lock->header = PM4Type3Header{PM4ItOpcode::WriteData, 3}; write_lock->dst_sel.Assign(5u); *reinterpret_cast(&write_lock->dst_addr_lo) = - (label_addr + buf_idx * sizeof(uintptr_t)) & 0xffff'fffcu; + (label_addr + buf_idx * sizeof(uintptr_t)) & ~0x3ull; write_lock->data[0] = 1; auto* nop = reinterpret_cast(cmdbuf + 5); @@ -1405,7 +1404,7 @@ s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[], } } - liverpool->ProcessCmdList(reinterpret_cast(dcb_gpu_addrs[0]), dcb_sizes_in_bytes[0]); + liverpool->Submit(reinterpret_cast(dcb_gpu_addrs[0]), dcb_sizes_in_bytes[0]); return ORBIS_OK; } @@ -1416,7 +1415,10 @@ int PS4_SYSV_ABI sceGnmSubmitCommandBuffersForWorkload() { } int PS4_SYSV_ABI sceGnmSubmitDone() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); + LOG_INFO(Lib_GnmDriver, "called"); + + liverpool->SubmitDone(); + return ORBIS_OK; } diff --git a/src/core/libraries/videoout/driver.cpp b/src/core/libraries/videoout/driver.cpp index e1a8b0e9..5e093c20 100644 --- a/src/core/libraries/videoout/driver.cpp +++ b/src/core/libraries/videoout/driver.cpp @@ -196,6 +196,9 @@ void VideoOutDriver::Flip(std::chrono::microseconds timeout) { reinterpret_cast(req.flip_arg)); } } + + // Reset flip label + req.port->buffer_labels[req.index] = 0; } bool VideoOutDriver::SubmitFlip(VideoOutPort* port, s32 index, s64 flip_arg) { diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index c7db16ce..5fbb1acb 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -117,6 +117,14 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { } case PM4ItOpcode::WriteData: { const auto* write_data = reinterpret_cast(header); + ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5); + const u32 
data_size = (header->type3.count.Value() - 2) * 4; + if (!write_data->wr_one_addr.Value()) { + std::memcpy(reinterpret_cast(write_data->Address()), write_data->data, + data_size); + } else { + UNREACHABLE(); + } break; } case PM4ItOpcode::AcquireMem: { @@ -125,6 +133,13 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { } case PM4ItOpcode::WaitRegMem: { const auto* wait_reg_mem = reinterpret_cast(header); + ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); + ASSERT(wait_reg_mem->function.Value() == PM4CmdWaitRegMem::Function::Equal); + + { + std::unique_lock lock{m_reg_mem}; + cv_reg_mem.wait(lock, [&]() { return wait_reg_mem->Test(); }); + } break; } default: diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 44c2a526..34cac432 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -3,11 +3,14 @@ #pragma once +#include "common/assert.h" #include "common/bit_field.h" #include "common/types.h" #include +#include #include +#include namespace AmdGpu { @@ -612,9 +615,26 @@ struct Liverpool { public: Liverpool(); + void Submit(u32* cmdbuf, u32 size_in_bytes) { + ASSERT_MSG(!cp.valid(), "Trying to submit while previous submission is pending"); + cp = std::async(&Liverpool::ProcessCmdList, this, cmdbuf, size_in_bytes); + } + void SubmitDone() { + // This is wrong as `submitDone()` should never be blocking. 
The behavior will be + // reworked once multiple queues are introduced. NOTE(review): get() on a future that is not valid() (no prior Submit) is UB + cp.get(); + } + void SetEopCallback(auto const& cb) { + eop_callback = cb; + } + +private: void ProcessCmdList(u32* cmdbuf, u32 size_in_bytes); std::function eop_callback{}; + std::future cp{}; + std::condition_variable cv_reg_mem{}; + std::mutex m_reg_mem{}; }; static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08); diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index 6ce06750..762897fb 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -320,6 +320,19 @@ struct PM4DmaData { }; struct PM4CmdWaitRegMem { + enum Engine : u32 { Me = 0u, Pfp = 1u }; + enum MemSpace : u32 { Register = 0u, Memory = 1u }; + enum Function : u32 { + Always = 0u, + LessThan = 1u, + LessThanEqual = 2u, + Equal = 3u, + NotEqual = 4u, + GreaterThanEqual = 5u, + GreaterThan = 6u, + Reserved = 7u + }; + PM4Type3Header header; union { BitField<0, 3, u32> function; @@ -332,6 +345,41 @@ struct PM4CmdWaitRegMem { u32 ref; u32 mask; u32 poll_interval; + + u32* Address() const { + return reinterpret_cast((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo); + } + + bool Test() const { + switch (function.Value()) { + case Function::Always: { + return true; + } + case Function::LessThan: { + return (*Address() & mask) < ref; + } + case Function::LessThanEqual: { + return (*Address() & mask) <= ref; + } + case Function::Equal: { + return (*Address() & mask) == ref; + } + case Function::NotEqual: { + return (*Address() & mask) != ref; + } + case Function::GreaterThanEqual: { + return (*Address() & mask) >= ref; + } + case Function::GreaterThan: { + return (*Address() & mask) > ref; + } + case Function::Reserved: + [[fallthrough]]; + default: { + UNREACHABLE(); + } + } + } }; struct PM4CmdWriteData { @@ -346,6 +394,10 @@ struct PM4CmdWriteData { u32 dst_addr_lo; u32 dst_addr_hi; u32 data[0]; + + uintptr_t Address() const { + return (uintptr_t(dst_addr_hi) << 32) | 
dst_addr_lo; + } }; } // namespace AmdGpu