From 2963790e0d934426e1952603af97349f871c3716 Mon Sep 17 00:00:00 2001
From: psucien
Date: Sun, 26 May 2024 12:39:30 +0200
Subject: [PATCH 1/2] video_core: initial support for CE and ASC queues

---
 src/core/libraries/gnmdriver/gnmdriver.cpp |   4 +-
 src/video_core/amdgpu/liverpool.cpp        | 434 ++++++++++++++-------
 src/video_core/amdgpu/liverpool.h          |  80 +++-
 src/video_core/amdgpu/pm4_cmds.h           |  46 +++
 src/video_core/amdgpu/pm4_opcodes.h        |   8 +-
 5 files changed, 413 insertions(+), 159 deletions(-)

diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp
index a3cf2a98..008ae189 100644
--- a/src/core/libraries/gnmdriver/gnmdriver.cpp
+++ b/src/core/libraries/gnmdriver/gnmdriver.cpp
@@ -1447,7 +1447,9 @@ int PS4_SYSV_ABI sceGnmSubmitCommandBuffersForWorkload() {
 
 int PS4_SYSV_ABI sceGnmSubmitDone() {
     LOG_INFO(Lib_GnmDriver, "called");
-    submission_lock = true;
+    if (!liverpool->IsGpuIdle()) {
+        submission_lock = true;
+    }
     return ORBIS_OK;
 }
 
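Note on the gnmdriver change: submission_lock is now armed only while the GPU still has queued work, so a sceGnmSubmitDone() on an idle GPU no longer gates subsequent submissions. A simplified model of the new behavior follows (submission_lock and the submit counter are the emulator's globals, reduced to locals here for illustration; names beyond those in the diff are assumptions):

    #include <atomic>
    #include <cassert>

    std::atomic<unsigned> num_submits{0}; // incremented on submit, decremented when a queue drains
    bool submission_lock = false;

    bool IsGpuIdle() {
        return num_submits.load() == 0;
    }

    // Mirrors the patched sceGnmSubmitDone(): only arm the lock when work is in flight.
    int SubmitDone() {
        if (!IsGpuIdle()) {
            submission_lock = true;
        }
        return 0; // ORBIS_OK
    }

    int main() {
        assert(SubmitDone() == 0);
        assert(!submission_lock); // idle GPU: lock stays clear
        num_submits = 1;
        SubmitDone();
        assert(submission_lock); // busy GPU: further submissions are gated
    }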
diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index d43f749b..bd10b302 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -10,6 +10,8 @@
 
 namespace AmdGpu {
 
+std::array<u8, 48_KB> Liverpool::ConstantEngine::constants_heap;
+
 Liverpool::Liverpool() {
     process_thread = std::jthread{std::bind_front(&Liverpool::Process, this)};
 }
@@ -20,169 +22,311 @@
 
 void Liverpool::Process(std::stop_token stoken) {
+    Common::SetCurrentThreadName("GPU_CommandProcessor");
+
     while (!stoken.stop_requested()) {
-        std::span<const u32> dcb{};
         {
-            std::unique_lock lock{m_ring_access};
-            cv_submit.wait(lock, stoken, [&]() { return !gfx_ring.empty(); });
-
-            if (stoken.stop_requested()) {
-                break;
-            }
-
-            dcb = gfx_ring.front();
-            gfx_ring.pop();
+            std::unique_lock lock{m_submit};
+            cv_submit.wait(lock, stoken, [this]() { return num_submits != 0; });
         }
 
-        ASSERT_MSG(!dcb.empty(), "Empty command list received");
-        ProcessCmdList(dcb.data(), dcb.size_bytes());
+        if (stoken.stop_requested()) {
+            break;
+        }
 
-        {
-            std::unique_lock lock{m_ring_access};
-            if (gfx_ring.empty()) {
-                cv_complete.notify_all();
+        int qid = -1;
+
+        while (num_submits) {
+            qid = (qid + 1) % NumTotalQueues;
+
+            auto& queue = mapped_queues[qid];
+
+            Task::Handle task{};
+            {
+                std::scoped_lock lock{queue.m_access};
+
+                if (queue.submits.empty()) {
+                    continue;
+                }
+
+                task = queue.submits.front();
+            }
+            task.resume();
+
+            if (task.done()) {
+                std::scoped_lock lock{queue.m_access};
+                queue.submits.pop();
+
+                --num_submits;
             }
         }
+        cv_complete.notify_all(); // Notify GPU idle
     }
 }
 
 void Liverpool::WaitGpuIdle() {
-    std::unique_lock lock{m_ring_access};
-    cv_complete.wait(lock, [this]() { return gfx_ring.empty(); });
+    std::unique_lock lock{m_submit};
+    cv_complete.wait(lock, [this]() { return num_submits == 0; });
 }
 
-void Liverpool::ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes) {
-    Common::SetCurrentThreadName("CommandProcessor_Gfx");
-
-    auto* header = reinterpret_cast<const PM4Header*>(cmdbuf);
-    u32 processed_cmd_size = 0;
-
-    while (processed_cmd_size < size_in_bytes) {
-        const PM4Header* next_header{};
-        const u32 type = header->type;
-        switch (type) {
-        case 3: {
-            const PM4ItOpcode opcode = header->type3.opcode;
-            const u32 count = header->type3.NumWords();
-            switch (opcode) {
-            case PM4ItOpcode::Nop: {
-                const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
-                if (nop->header.count.Value() == 0) {
-                    break;
-                }
-
-                switch (nop->data_block[0]) {
-                case PM4CmdNop::PayloadType::PatchedFlip: {
-                    // There is no evidence that GPU CP drives flip events by parsing
-                    // special NOP packets. For convenience lets assume that it does.
-                    Platform::IrqC::Instance()->Signal(Platform::InterruptId::GfxFlip);
-                    break;
-                }
-                default:
-                    break;
-                }
-                break;
-            }
-            case PM4ItOpcode::SetContextReg: {
-                const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
-                std::memcpy(&regs.reg_array[ContextRegWordOffset + set_data->reg_offset],
-                            header + 2, (count - 1) * sizeof(u32));
-                break;
-            }
-            case PM4ItOpcode::SetShReg: {
-                const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
-                std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
-                            (count - 1) * sizeof(u32));
-                break;
-            }
-            case PM4ItOpcode::SetUconfigReg: {
-                const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
-                std::memcpy(&regs.reg_array[UconfigRegWordOffset + set_data->reg_offset],
-                            header + 2, (count - 1) * sizeof(u32));
-                break;
-            }
-            case PM4ItOpcode::IndexType: {
-                const auto* index_type = reinterpret_cast<const PM4CmdDrawIndexType*>(header);
-                regs.index_buffer_type.raw = index_type->raw;
-                break;
-            }
-            case PM4ItOpcode::DrawIndex2: {
-                const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndex2*>(header);
-                regs.max_index_size = draw_index->max_size;
-                regs.index_base_address.base_addr_lo = draw_index->index_base_lo;
-                regs.index_base_address.base_addr_hi.Assign(draw_index->index_base_hi);
-                regs.num_indices = draw_index->index_count;
-                regs.draw_initiator = draw_index->draw_initiator;
-                if (rasterizer) {
-                    rasterizer->Draw(true);
-                }
-                break;
-            }
-            case PM4ItOpcode::DrawIndexAuto: {
-                const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndexAuto*>(header);
-                regs.num_indices = draw_index->index_count;
-                regs.draw_initiator = draw_index->draw_initiator;
-                if (rasterizer) {
-                    rasterizer->Draw(false);
-                }
-                break;
-            }
-            case PM4ItOpcode::DispatchDirect: {
-                // const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
-                break;
-            }
-            case PM4ItOpcode::EventWriteEos: {
-                const auto* event_eos = reinterpret_cast<const PM4CmdEventWriteEos*>(header);
-                event_eos->SignalFence();
-                break;
-            }
-            case PM4ItOpcode::EventWriteEop: {
-                const auto* event_eop = reinterpret_cast<const PM4CmdEventWriteEop*>(header);
-                event_eop->SignalFence();
-                break;
-            }
-            case PM4ItOpcode::DmaData: {
-                const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
-                break;
-            }
-            case PM4ItOpcode::WriteData: {
-                const auto* write_data = reinterpret_cast<const PM4CmdWriteData*>(header);
-                ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
-                const u32 data_size = (header->type3.count.Value() - 2) * 4;
-                if (!write_data->wr_one_addr.Value()) {
-                    std::memcpy(write_data->Address(), write_data->data, data_size);
-                } else {
-                    UNREACHABLE();
-                }
-                break;
-            }
-            case PM4ItOpcode::AcquireMem: {
-                // const auto* acquire_mem = reinterpret_cast<const PM4CmdAcquireMem*>(header);
-                break;
-            }
-            case PM4ItOpcode::WaitRegMem: {
-                const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
-                ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
-                while (!wait_reg_mem->Test()) {
-                    using namespace std::chrono_literals;
-                    std::this_thread::sleep_for(1ms);
-                }
-                break;
-            }
-            default:
-                UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
-                                static_cast<u32>(opcode), count);
-            }
-            next_header = header + header->type3.NumWords() + 1;
-            break;
-        }
-        default:
-            UNREACHABLE_MSG("Invalid PM4 type {}", type);
-        }
-
-        processed_cmd_size += uintptr_t(next_header) - uintptr_t(header);
-        header = next_header;
-    }
-}
+Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
+    while (!ccb.empty()) {
+        const auto* header = reinterpret_cast<const PM4Header*>(ccb.data());
+        const u32 type = header->type;
+        if (type != 3) {
+            // No other types of packets were spotted so far
+            UNREACHABLE_MSG("Invalid PM4 type {}", type);
+        }
+
+        const PM4ItOpcode opcode = header->type3.opcode;
+        const auto* it_body = reinterpret_cast<const u32*>(header) + 1;
+        switch (opcode) {
+        case PM4ItOpcode::Nop: {
+            const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
+            break;
+        }
+        case PM4ItOpcode::WriteConstRam: {
+            const auto* write_const = reinterpret_cast<const PM4WriteConstRam*>(header);
+            memcpy(cblock.constants_heap.data() + write_const->Offset(), &write_const->data,
+                   write_const->Size());
+            break;
+        }
+        case PM4ItOpcode::DumpConstRam: {
+            const auto* dump_const = reinterpret_cast<const PM4DumpConstRam*>(header);
+            memcpy(dump_const->Address<void>(),
+                   cblock.constants_heap.data() + dump_const->Offset(), dump_const->Size());
+            break;
+        }
+        case PM4ItOpcode::IncrementCeCounter: {
+            ++cblock.ce_count;
+            break;
+        }
+        case PM4ItOpcode::WaitOnDeCounterDiff: {
+            const auto diff = it_body[0];
+            while ((cblock.de_count - cblock.ce_count) >= diff) {
+                co_yield {};
+            }
+            break;
+        }
+        default:
+            const u32 count = header->type3.NumWords();
+            UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
+                            static_cast<u32>(opcode), count);
+        }
+        ccb = ccb.subspan(header->type3.NumWords() + 1);
+    }
+}
+
+Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb) {
+    cblock.Reset();
+
+    // TODO: ASCs can potentially also depend on the CE; in that case, the CE task
+    // should be moved into a more global scope
+    Task ce_task{};
+
+    if (!ccb.empty()) {
+        // If a CCB is provided, kick off the CE ASAP so the constant heap is ready to use
+        ce_task = ProcessCeUpdate(ccb);
+        ce_task.handle.resume();
+    }
+
+    while (!dcb.empty()) {
+        const auto* header = reinterpret_cast<const PM4Header*>(dcb.data());
+        const u32 type = header->type;
+        if (type != 3) {
+            // No other types of packets were spotted so far
+            UNREACHABLE_MSG("Invalid PM4 type {}", type);
+        }
+
+        const u32 count = header->type3.NumWords();
+        const PM4ItOpcode opcode = header->type3.opcode;
+        switch (opcode) {
+        case PM4ItOpcode::Nop: {
+            const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
+            if (nop->header.count.Value() == 0) {
+                break;
+            }
+
+            switch (nop->data_block[0]) {
+            case PM4CmdNop::PayloadType::PatchedFlip: {
+                // There is no evidence that the GPU CP drives flip events by parsing
+                // special NOP packets. For convenience, let's assume that it does.
+                Platform::IrqC::Instance()->Signal(Platform::InterruptId::GfxFlip);
+                break;
+            }
+            default:
+                break;
+            }
+            break;
+        }
+        case PM4ItOpcode::SetContextReg: {
+            const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
+            std::memcpy(&regs.reg_array[ContextRegWordOffset + set_data->reg_offset], header + 2,
+                        (count - 1) * sizeof(u32));
+            break;
+        }
+        case PM4ItOpcode::SetShReg: {
+            const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
+            std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
+                        (count - 1) * sizeof(u32));
+            break;
+        }
+        case PM4ItOpcode::SetUconfigReg: {
+            const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
+            std::memcpy(&regs.reg_array[UconfigRegWordOffset + set_data->reg_offset], header + 2,
+                        (count - 1) * sizeof(u32));
+            break;
+        }
+        case PM4ItOpcode::IndexType: {
+            const auto* index_type = reinterpret_cast<const PM4CmdDrawIndexType*>(header);
+            regs.index_buffer_type.raw = index_type->raw;
+            break;
+        }
+        case PM4ItOpcode::DrawIndex2: {
+            const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndex2*>(header);
+            regs.max_index_size = draw_index->max_size;
+            regs.index_base_address.base_addr_lo = draw_index->index_base_lo;
+            regs.index_base_address.base_addr_hi.Assign(draw_index->index_base_hi);
+            regs.num_indices = draw_index->index_count;
+            regs.draw_initiator = draw_index->draw_initiator;
+            if (rasterizer) {
+                rasterizer->Draw(true);
+            }
+            break;
+        }
+        case PM4ItOpcode::DrawIndexAuto: {
+            const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndexAuto*>(header);
+            regs.num_indices = draw_index->index_count;
+            regs.draw_initiator = draw_index->draw_initiator;
+            if (rasterizer) {
+                rasterizer->Draw(false);
+            }
+            break;
+        }
+        case PM4ItOpcode::DispatchDirect: {
+            // const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
+            break;
+        }
+        case PM4ItOpcode::EventWrite: {
+            // const auto* event = reinterpret_cast<const PM4CmdEventWrite*>(header);
+            break;
+        }
+        case PM4ItOpcode::EventWriteEos: {
+            const auto* event_eos = reinterpret_cast<const PM4CmdEventWriteEos*>(header);
+            event_eos->SignalFence();
+            break;
+        }
+        case PM4ItOpcode::EventWriteEop: {
+            const auto* event_eop = reinterpret_cast<const PM4CmdEventWriteEop*>(header);
+            event_eop->SignalFence();
+            break;
+        }
+        case PM4ItOpcode::DmaData: {
+            const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
+            break;
+        }
+        case PM4ItOpcode::WriteData: {
+            const auto* write_data = reinterpret_cast<const PM4CmdWriteData*>(header);
+            ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
+            const u32 data_size = (header->type3.count.Value() - 2) * 4;
+            if (!write_data->wr_one_addr.Value()) {
+                std::memcpy(write_data->Address(), write_data->data, data_size);
+            } else {
+                UNREACHABLE();
+            }
+            break;
+        }
+        case PM4ItOpcode::AcquireMem: {
+            // const auto* acquire_mem = reinterpret_cast<const PM4CmdAcquireMem*>(header);
+            break;
+        }
+        case PM4ItOpcode::WaitRegMem: {
+            const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
+            ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
+            while (!wait_reg_mem->Test()) {
+                co_yield {};
+            }
+            break;
+        }
+        case PM4ItOpcode::IncrementDeCounter: {
+            ++cblock.de_count;
+            break;
+        }
+        case PM4ItOpcode::WaitOnCeCounter: {
+            while (cblock.ce_count <= cblock.de_count) {
+                ce_task.handle.resume();
+            }
+            break;
+        }
+        default:
+            UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
+                            static_cast<u32>(opcode), count);
+        }
+
+        dcb = dcb.subspan(header->type3.NumWords() + 1);
+    }
+
+    if (ce_task.handle) {
+        ASSERT_MSG(ce_task.handle.done(), "Partially processed CCB");
+    }
+}
+
+Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb) {
+    while (!acb.empty()) {
+        const auto* header = reinterpret_cast<const PM4Header*>(acb.data());
+        const u32 type = header->type;
+        if (type != 3) {
+            // No other types of packets were spotted so far
+            UNREACHABLE_MSG("Invalid PM4 type {}", type);
+        }
+
+        const u32 count = header->type3.NumWords();
+        const PM4ItOpcode opcode = header->type3.opcode;
+        const auto* it_body = reinterpret_cast<const u32*>(header) + 1;
+        switch (opcode) {
+        default:
+            UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
+                            static_cast<u32>(opcode), count);
+        }
+
+        acb = acb.subspan(header->type3.NumWords() + 1);
+    }
+
+    return {}; // Not a coroutine yet
+}
+
+void Liverpool::SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
+    static constexpr u32 GfxQueueId = 0u;
+    auto& queue = mapped_queues[GfxQueueId];
+
+    auto task = ProcessGraphics(dcb, ccb);
+    {
+        std::unique_lock lock{queue.m_access};
+        queue.submits.emplace(task.handle);
+    }
+
+    {
+        std::unique_lock lock{m_submit};
+        ++num_submits;
+    }
+    cv_submit.notify_one();
+}
+
+void Liverpool::SubmitAsc(u32 vqid, std::span<const u32> acb) {
+    ASSERT_MSG(vqid > 0 && vqid < NumTotalQueues, "Invalid virtual ASC queue index");
+    auto& queue = mapped_queues[vqid];
+
+    const auto& task = ProcessCompute(acb);
+    {
+        std::unique_lock lock{queue.m_access};
+        queue.submits.emplace(task.handle);
+    }
+
+    {
+        std::unique_lock lock{m_submit};
+        ++num_submits;
+    }
+    cv_submit.notify_one();
+}
+
 } // namespace AmdGpu
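For reviewers unfamiliar with the coroutine plumbing above: each submit is now a lazily started coroutine whose handle is queued per virtual queue and resumed round-robin by Process(), replacing the old blocking sleep_for() loop in WaitRegMem. A minimal self-contained sketch of the same pattern (names are illustrative, not the emulator's API):

    #include <coroutine>
    #include <cstdio>

    // Minimal lazily started task: suspended on creation, driven by resume(),
    // and suspended again at the final point so done() can be observed.
    struct MiniTask {
        struct promise_type {
            MiniTask get_return_object() {
                return {std::coroutine_handle<promise_type>::from_promise(*this)};
            }
            std::suspend_always initial_suspend() noexcept { return {}; }
            std::suspend_always final_suspend() noexcept { return {}; }
            std::suspend_always yield_value(int) { return {}; }
            void return_void() {}
            void unhandled_exception() {}
        };
        std::coroutine_handle<promise_type> handle;
    };

    MiniTask Count() {
        for (int i = 0; i < 3; ++i) {
            std::printf("step %d\n", i);
            co_yield i; // suspension point, like co_yield {} in the PM4 parsers
        }
    }

    int main() {
        MiniTask task = Count();
        while (!task.handle.done()) {
            task.handle.resume(); // what Process() does per queue slot
        }
        task.handle.destroy(); // required: the frame outlives completion (see PATCH 2/2)
    }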
diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h
index 83fd2494..ad1984eb 100644
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -10,6 +10,7 @@
 
 #include <array>
 #include <condition_variable>
+#include <coroutine>
 #include <mutex>
 #include <queue>
 #include <span>
 
@@ -30,6 +31,12 @@ [[maybe_unused]] std::array<u32, num_words> CONCAT2(pad, __LINE__)
 
 struct Liverpool {
+    static constexpr u32 NumGfxRings = 1u;     // actually 2, but HP is reserved by system software
+    static constexpr u32 NumComputePipes = 7u; // actually 8, but #7 is reserved by system software
+    static constexpr u32 NumQueuesPerPipe = 8u;
+    static constexpr u32 NumTotalQueues = NumGfxRings + (NumComputePipes * NumQueuesPerPipe);
+    static_assert(NumTotalQueues < 64u); // need to fit into u64 bitmap for ffs
+
     static constexpr u32 NumColorBuffers = 8;
     static constexpr u32 NumViewports = 16;
     static constexpr u32 NumClipPlanes = 6;
@@ -631,32 +638,81 @@ public:
     Liverpool();
     ~Liverpool();
 
-    void SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
-        {
-            std::scoped_lock lock{m_ring_access};
-            gfx_ring.emplace(dcb);
-
-            ASSERT_MSG(ccb.size() == 0, "CCBs are not supported yet");
-        }
-        cv_submit.notify_one();
-    }
+    void SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb);
+    void SubmitAsc(u32 vqid, std::span<const u32> acb);
 
     void WaitGpuIdle();
+    bool IsGpuIdle() const {
+        return num_submits == 0;
+    }
 
     void BindRasterizer(Vulkan::Rasterizer* rasterizer_) {
         rasterizer = rasterizer_;
     }
 
 private:
-    void ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes);
+    struct Task {
+        struct promise_type {
+            auto get_return_object() {
+                Task task{};
+                task.handle = std::coroutine_handle<promise_type>::from_promise(*this);
+                return task;
+            }
+            static constexpr std::suspend_always initial_suspend() noexcept {
+                // We want the task to be suspended at start
+                return {};
+            }
+            static constexpr std::suspend_always final_suspend() noexcept {
+                return {};
+            }
+            void unhandled_exception() {}
+            void return_void() {}
+            struct empty {};
+            std::suspend_always yield_value(empty&&) {
+                return {};
+            }
+        };
+
+        using Handle = std::coroutine_handle<promise_type>;
+        Handle handle;
+    };
+
+    Task ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb);
+    Task ProcessCeUpdate(std::span<const u32> ccb);
+    Task ProcessCompute(std::span<const u32> acb);
+
     void Process(std::stop_token stoken);
 
+    struct GpuQueue {
+        std::mutex m_access{};
+        std::queue<Task::Handle> submits{};
+    };
+    std::array<GpuQueue, NumTotalQueues> mapped_queues{};
+
+    struct ConstantEngine {
+        void Reset() {
+            ce_count = 0;
+            de_count = 0;
+            ce_compare_count = 0;
+        }
+
+        [[nodiscard]] u32 Diff() const {
+            ASSERT_MSG(ce_count >= de_count, "DE counter is ahead of CE");
+            return ce_count - de_count;
+        }
+
+        u32 ce_compare_count{};
+        u32 ce_count{};
+        u32 de_count{};
+        static std::array<u8, 48_KB> constants_heap;
+    } cblock{};
+
     Vulkan::Rasterizer* rasterizer{};
     std::jthread process_thread{};
-    std::queue<std::span<const u32>> gfx_ring{};
     std::condition_variable_any cv_submit{};
     std::condition_variable cv_complete{};
-    std::mutex m_ring_access{};
+    std::mutex m_submit{};
+    std::atomic<u32> num_submits{};
 };
 
 static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08);
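The ce_count/de_count pair in ConstantEngine implements a classic producer-consumer handshake between the constant engine and the draw engine: IncrementCeCounter/IncrementDeCounter bump the counters, WaitOnCeCounter suspends the DE until the CE has produced, and WaitOnDeCounterDiff throttles the CE when it runs too far ahead. A toy model of the protocol (single-threaded and names illustrative; the real parsers suspend with co_yield or resume the CE task instead of spinning):

    #include <cstdint>
    #include <cstdio>

    // Toy CE/DE counter handshake: the CE increments ce_count after writing
    // constants, the DE increments de_count after consuming them.
    struct Counters {
        uint32_t ce_count = 0;
        uint32_t de_count = 0;
    };

    int main() {
        Counters c;
        const uint32_t max_diff = 2; // CE may run at most 2 batches ahead
        for (int batch = 0; batch < 4; ++batch) {
            // CE side: WaitOnDeCounterDiff, then produce
            while (c.ce_count - c.de_count >= max_diff) {
                /* co_yield {} back to the scheduler in the real parser */
            }
            ++c.ce_count; // IncrementCeCounter
            // DE side: WaitOnCeCounter, then consume
            while (c.ce_count <= c.de_count) {
                /* resume the CE task in the real parser */
            }
            ++c.de_count; // IncrementDeCounter
            std::printf("batch %d: ce=%u de=%u\n", batch, c.ce_count, c.de_count);
        }
    }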
diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h
index e26830cd..3ac5382a 100644
--- a/src/video_core/amdgpu/pm4_cmds.h
+++ b/src/video_core/amdgpu/pm4_cmds.h
@@ -494,4 +494,50 @@ struct PM4CmdEventWriteEos {
     }
 };
 
+struct PM4WriteConstRam {
+    PM4Type3Header header;
+    union {
+        BitField<0, 16, u32> offset; // in DWs
+        u32 dw1;
+    };
+    u32 data[0];
+
+    [[nodiscard]] u32 Offset() const {
+        return offset.Value() << 2u;
+    }
+
+    [[nodiscard]] u32 Size() const {
+        return header.count << 2u;
+    }
+};
+
+struct PM4DumpConstRam {
+    PM4Type3Header header;
+    union {
+        BitField<0, 16, u32> offset; ///< Starting byte offset into the Constant RAM. The minimum
+                                     ///< granularity is 4 bytes
+        u32 dw1;
+    };
+    union {
+        BitField<0, 15, u32>
+            num_dw; ///< Number of DWs to read from the constant RAM. The minimum granularity is one DW
+        u32 dw2;
+    };
+    u32 addr_lo;
+    u32 addr_hi;
+
+    template <typename T>
+    T* Address() const {
+        return reinterpret_cast<T*>((u64(addr_hi) << 32u) | addr_lo);
+    }
+
+    [[nodiscard]] u32 Offset() const {
+        return offset.Value();
+    }
+
+    [[nodiscard]] u32 Size() const {
+        return num_dw.Value() << 2u;
+    }
+};
+
 } // namespace AmdGpu
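A note on the Size()/Offset() arithmetic above: the type-3 header's count field encodes the number of payload DWs minus one, and WriteConstRam's payload is one offset DW followed by the data, so header.count is exactly the number of data DWs; shifting left by 2 converts DWs to bytes. WriteConstRam's offset field is DW-granular (hence the same shift in its Offset()), while DumpConstRam's offset is already in bytes and is returned unshifted. A small self-check sketch over the same layout:

    #include <cassert>
    #include <cstdint>

    int main() {
        // A WriteConstRam carrying 3 data DWs at DW offset 8:
        // payload = 1 offset DW + 3 data DWs = 4 DWs -> header.count = 4 - 1 = 3
        const uint32_t header_count = 3;
        const uint32_t offset_in_dws = 8;

        const uint32_t byte_offset = offset_in_dws << 2u; // PM4WriteConstRam::Offset()
        const uint32_t byte_size = header_count << 2u;    // PM4WriteConstRam::Size()

        assert(byte_offset == 32);
        assert(byte_size == 12); // exactly the 3 data DWs, offset DW excluded
        return 0;
    }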
diff --git a/src/video_core/amdgpu/pm4_opcodes.h b/src/video_core/amdgpu/pm4_opcodes.h
index fb3fc8c5..1d2ab431 100644
--- a/src/video_core/amdgpu/pm4_opcodes.h
+++ b/src/video_core/amdgpu/pm4_opcodes.h
@@ -58,7 +58,13 @@ enum class PM4ItOpcode : u32 {
     SetContextRegIndirect = 0x73,
     SetShReg = 0x76,
     SetShRegOffset = 0x77,
-    SetUconfigReg = 0x79
+    SetUconfigReg = 0x79,
+    WriteConstRam = 0x81,
+    DumpConstRam = 0x83,
+    IncrementCeCounter = 0x84,
+    IncrementDeCounter = 0x85,
+    WaitOnCeCounter = 0x86,
+    WaitOnDeCounterDiff = 0x88,
 };
 
 } // namespace AmdGpu
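All three parsers in this patch advance through the ring with header->type3.NumWords() + 1. For reference, a freestanding decoder over the same type-3 bit layout (type in bits 30-31, count in bits 16-29 encoding payload DWs minus one, IT opcode in bits 8-15; this mirrors what the emulator's PM4Header BitFields expose):

    #include <cstdint>
    #include <cstdio>

    // Walks a stream of PM4 type-3 packets, mirroring what ProcessCeUpdate/
    // ProcessGraphics do via header->type3.NumWords() and subspan().
    int main() {
        // NOP (IT opcode 0x10) with two payload DWs: count = num_dw - 1 = 1
        const uint32_t stream[] = {
            (3u << 30) | (1u << 16) | (0x10u << 8),
            0xdeadbeef,
            0xcafebabe,
        };
        size_t pos = 0;
        const size_t len = sizeof(stream) / sizeof(stream[0]);
        while (pos < len) {
            const uint32_t header = stream[pos];
            const uint32_t type = header >> 30;            // must be 3 here
            const uint32_t opcode = (header >> 8) & 0xff;  // IT opcode
            const uint32_t num_words = ((header >> 16) & 0x3fff) + 1;
            std::printf("type=%u opcode=%#x payload=%u DWs\n", type, opcode, num_words);
            pos += num_words + 1; // skip header + payload, like subspan()
        }
    }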
From 09c7379fe072735f05c4609849b614840f8ef9f1 Mon Sep 17 00:00:00 2001
From: psucien
Date: Sun, 26 May 2024 18:18:32 +0200
Subject: [PATCH 2/2] fix for leaks of coroutine handle

---
 src/video_core/amdgpu/liverpool.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index bd10b302..50e5398f 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -54,6 +54,8 @@ void Liverpool::Process(std::stop_token stoken) {
             task.resume();
 
             if (task.done()) {
+                task.destroy();
+
                 std::scoped_lock lock{queue.m_access};
                 queue.submits.pop();
 
@@ -268,6 +270,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
 
     if (ce_task.handle) {
         ASSERT_MSG(ce_task.handle.done(), "Partially processed CCB");
+        ce_task.handle.destroy();
     }
 }
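Why PATCH 2/2 is needed: Task::promise_type::final_suspend() returns std::suspend_always, so a finished coroutine parks at its final suspend point instead of freeing its frame. Every handle that reaches done() must therefore be destroyed explicitly, both the queued submits drained by Process() and the nested CE task in ProcessGraphics(). A minimal illustration of the leak being fixed (names are illustrative):

    #include <coroutine>

    struct LeakyTask {
        struct promise_type {
            LeakyTask get_return_object() {
                return {std::coroutine_handle<promise_type>::from_promise(*this)};
            }
            std::suspend_always initial_suspend() noexcept { return {}; }
            // suspend_always here means the frame is NOT freed when the
            // coroutine runs off the end...
            std::suspend_always final_suspend() noexcept { return {}; }
            void return_void() {}
            void unhandled_exception() {}
        };
        std::coroutine_handle<promise_type> handle;
    };

    LeakyTask Work() {
        co_return;
    }

    int main() {
        auto task = Work();
        task.handle.resume(); // runs to completion, frame still allocated
        if (task.handle.done()) {
            task.handle.destroy(); // ...so it must be destroyed explicitly
        }
    }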