Merge pull request #137 from shadps4-emu/video_core/multi_submits
amdgpu: multiple GFX submits
This commit is contained in:
commit
3f692a4d7d
|
@ -1359,13 +1359,13 @@ static inline s32 PatchFlipRequest(u32* cmdbuf, u32 size, u32 vo_handle, u32 buf
|
||||||
return ORBIS_OK;
|
return ORBIS_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addrs[],
|
s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, u32* dcb_gpu_addrs[],
|
||||||
u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[],
|
u32* dcb_sizes_in_bytes, u32* ccb_gpu_addrs[],
|
||||||
u32* ccb_sizes_in_bytes, u32 vo_handle,
|
u32* ccb_sizes_in_bytes, u32 vo_handle,
|
||||||
u32 buf_idx, u32 flip_mode, u32 flip_arg) {
|
u32 buf_idx, u32 flip_mode, u32 flip_arg) {
|
||||||
LOG_INFO(Lib_GnmDriver, "called [buf = {}]", buf_idx);
|
LOG_INFO(Lib_GnmDriver, "called [buf = {}]", buf_idx);
|
||||||
|
|
||||||
auto* cmdbuf = reinterpret_cast<u32*>(dcb_gpu_addrs[count - 1]);
|
auto* cmdbuf = dcb_gpu_addrs[count - 1];
|
||||||
const auto size_dw = dcb_sizes_in_bytes[count - 1] / 4;
|
const auto size_dw = dcb_sizes_in_bytes[count - 1] / 4;
|
||||||
|
|
||||||
const s32 patch_result =
|
const s32 patch_result =
|
||||||
|
@ -1374,7 +1374,8 @@ s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addr
|
||||||
return patch_result;
|
return patch_result;
|
||||||
}
|
}
|
||||||
|
|
||||||
return sceGnmSubmitCommandBuffers(count, dcb_gpu_addrs, dcb_sizes_in_bytes, ccb_gpu_addrs,
|
return sceGnmSubmitCommandBuffers(count, const_cast<const u32**>(dcb_gpu_addrs),
|
||||||
|
dcb_sizes_in_bytes, const_cast<const u32**>(ccb_gpu_addrs),
|
||||||
ccb_sizes_in_bytes);
|
ccb_sizes_in_bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1383,11 +1384,10 @@ int PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffersForWorkload() {
|
||||||
return ORBIS_OK;
|
return ORBIS_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[],
|
s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[],
|
||||||
u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[],
|
u32* dcb_sizes_in_bytes, const u32* ccb_gpu_addrs[],
|
||||||
u32* ccb_sizes_in_bytes) {
|
u32* ccb_sizes_in_bytes) {
|
||||||
LOG_INFO(Lib_GnmDriver, "called");
|
LOG_INFO(Lib_GnmDriver, "called");
|
||||||
ASSERT_MSG(count == 1, "Multiple command buffer submission is unsupported!");
|
|
||||||
|
|
||||||
if (!dcb_gpu_addrs || !dcb_sizes_in_bytes) {
|
if (!dcb_gpu_addrs || !dcb_sizes_in_bytes) {
|
||||||
LOG_ERROR(Lib_GnmDriver, "dcbGpuAddrs and dcbSizesInBytes must not be NULL");
|
LOG_ERROR(Lib_GnmDriver, "dcbGpuAddrs and dcbSizesInBytes must not be NULL");
|
||||||
|
@ -1411,7 +1411,12 @@ s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
liverpool->Submit(reinterpret_cast<u32*>(dcb_gpu_addrs[0]), dcb_sizes_in_bytes[0]);
|
for (auto cbpair = 0u; cbpair < count; ++cbpair) {
|
||||||
|
const auto* ccb = ccb_gpu_addrs ? ccb_gpu_addrs[cbpair] : nullptr;
|
||||||
|
const auto ccb_size = ccb_sizes_in_bytes ? ccb_sizes_in_bytes[cbpair] : 0;
|
||||||
|
|
||||||
|
liverpool->SubmitGfx({dcb_gpu_addrs[cbpair], dcb_sizes_in_bytes[cbpair]}, {ccb, ccb_size});
|
||||||
|
}
|
||||||
|
|
||||||
return ORBIS_OK;
|
return ORBIS_OK;
|
||||||
}
|
}
|
||||||
|
|
|
@ -194,13 +194,13 @@ int PS4_SYSV_ABI sceGnmSqttStopTrace();
|
||||||
int PS4_SYSV_ABI sceGnmSqttSwitchTraceBuffer();
|
int PS4_SYSV_ABI sceGnmSqttSwitchTraceBuffer();
|
||||||
int PS4_SYSV_ABI sceGnmSqttSwitchTraceBuffer2();
|
int PS4_SYSV_ABI sceGnmSqttSwitchTraceBuffer2();
|
||||||
int PS4_SYSV_ABI sceGnmSqttWaitForEvent();
|
int PS4_SYSV_ABI sceGnmSqttWaitForEvent();
|
||||||
s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addrs[],
|
s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, u32* dcb_gpu_addrs[],
|
||||||
u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[],
|
u32* dcb_sizes_in_bytes, u32* ccb_gpu_addrs[],
|
||||||
u32* ccb_sizes_in_bytes, u32 vo_handle,
|
u32* ccb_sizes_in_bytes, u32 vo_handle,
|
||||||
u32 buf_idx, u32 flip_mode, u32 flip_arg);
|
u32 buf_idx, u32 flip_mode, u32 flip_arg);
|
||||||
int PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffersForWorkload();
|
int PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffersForWorkload();
|
||||||
s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[],
|
s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[],
|
||||||
u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[],
|
u32* dcb_sizes_in_bytes, const u32* ccb_gpu_addrs[],
|
||||||
u32* ccb_sizes_in_bytes);
|
u32* ccb_sizes_in_bytes);
|
||||||
int PS4_SYSV_ABI sceGnmSubmitCommandBuffersForWorkload();
|
int PS4_SYSV_ABI sceGnmSubmitCommandBuffersForWorkload();
|
||||||
int PS4_SYSV_ABI sceGnmSubmitDone();
|
int PS4_SYSV_ABI sceGnmSubmitDone();
|
||||||
|
|
|
@ -9,16 +9,50 @@
|
||||||
|
|
||||||
namespace AmdGpu {
|
namespace AmdGpu {
|
||||||
|
|
||||||
Liverpool::Liverpool() = default;
|
Liverpool::Liverpool() {
|
||||||
|
process_thread = std::jthread{std::bind_front(&Liverpool::Process, this)};
|
||||||
|
}
|
||||||
|
|
||||||
void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) {
|
Liverpool::~Liverpool() {
|
||||||
|
process_thread.request_stop();
|
||||||
|
cv_submit.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Liverpool::Process(std::stop_token stoken) {
|
||||||
|
while (!stoken.stop_requested()) {
|
||||||
|
std::span<const u32> dcb{};
|
||||||
|
{
|
||||||
|
std::unique_lock lock{m_ring_access};
|
||||||
|
cv_submit.wait(lock, stoken, [&]() { return !gfx_ring.empty(); });
|
||||||
|
|
||||||
|
if (stoken.stop_requested()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
dcb = gfx_ring.front();
|
||||||
|
gfx_ring.pop();
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_MSG(dcb.size() != 0, "Empty command list received");
|
||||||
|
ProcessCmdList(dcb.data(), dcb.size());
|
||||||
|
|
||||||
|
cv_complete.notify_all();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Liverpool::WaitGpuIdle() {
|
||||||
|
std::unique_lock lock{m_ring_access};
|
||||||
|
cv_complete.wait(lock, [this]() { return gfx_ring.empty(); });
|
||||||
|
}
|
||||||
|
|
||||||
|
void Liverpool::ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes) {
|
||||||
Common::SetCurrentThreadName("CommandProcessor_Gfx");
|
Common::SetCurrentThreadName("CommandProcessor_Gfx");
|
||||||
|
|
||||||
auto* header = reinterpret_cast<PM4Header*>(cmdbuf);
|
auto* header = reinterpret_cast<const PM4Header*>(cmdbuf);
|
||||||
u32 processed_cmd_size = 0;
|
u32 processed_cmd_size = 0;
|
||||||
|
|
||||||
while (processed_cmd_size < size_in_bytes) {
|
while (processed_cmd_size < size_in_bytes) {
|
||||||
PM4Header* next_header{};
|
const PM4Header* next_header{};
|
||||||
const u32 type = header->type;
|
const u32 type = header->type;
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case 3: {
|
case 3: {
|
||||||
|
@ -26,7 +60,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) {
|
||||||
const u32 count = header->type3.NumWords();
|
const u32 count = header->type3.NumWords();
|
||||||
switch (opcode) {
|
switch (opcode) {
|
||||||
case PM4ItOpcode::Nop: {
|
case PM4ItOpcode::Nop: {
|
||||||
const auto* nop = reinterpret_cast<PM4CmdNop*>(header);
|
const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
|
||||||
if (nop->header.count.Value() == 0) {
|
if (nop->header.count.Value() == 0) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -44,30 +78,30 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::SetContextReg: {
|
case PM4ItOpcode::SetContextReg: {
|
||||||
const auto* set_data = reinterpret_cast<PM4CmdSetData*>(header);
|
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
|
||||||
std::memcpy(&regs.reg_array[ContextRegWordOffset + set_data->reg_offset],
|
std::memcpy(&regs.reg_array[ContextRegWordOffset + set_data->reg_offset],
|
||||||
header + 2, (count - 1) * sizeof(u32));
|
header + 2, (count - 1) * sizeof(u32));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::SetShReg: {
|
case PM4ItOpcode::SetShReg: {
|
||||||
const auto* set_data = reinterpret_cast<PM4CmdSetData*>(header);
|
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
|
||||||
std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
|
std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
|
||||||
(count - 1) * sizeof(u32));
|
(count - 1) * sizeof(u32));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::SetUconfigReg: {
|
case PM4ItOpcode::SetUconfigReg: {
|
||||||
const auto* set_data = reinterpret_cast<PM4CmdSetData*>(header);
|
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
|
||||||
std::memcpy(&regs.reg_array[UconfigRegWordOffset + set_data->reg_offset],
|
std::memcpy(&regs.reg_array[UconfigRegWordOffset + set_data->reg_offset],
|
||||||
header + 2, (count - 1) * sizeof(u32));
|
header + 2, (count - 1) * sizeof(u32));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::IndexType: {
|
case PM4ItOpcode::IndexType: {
|
||||||
const auto* index_type = reinterpret_cast<PM4CmdDrawIndexType*>(header);
|
const auto* index_type = reinterpret_cast<const PM4CmdDrawIndexType*>(header);
|
||||||
regs.index_buffer_type.raw = index_type->raw;
|
regs.index_buffer_type.raw = index_type->raw;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::DrawIndex2: {
|
case PM4ItOpcode::DrawIndex2: {
|
||||||
const auto* draw_index = reinterpret_cast<PM4CmdDrawIndex2*>(header);
|
const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndex2*>(header);
|
||||||
regs.max_index_size = draw_index->max_size;
|
regs.max_index_size = draw_index->max_size;
|
||||||
regs.index_base_address.base_addr_lo = draw_index->index_base_lo;
|
regs.index_base_address.base_addr_lo = draw_index->index_base_lo;
|
||||||
regs.index_base_address.base_addr_hi.Assign(draw_index->index_base_hi);
|
regs.index_base_address.base_addr_hi.Assign(draw_index->index_base_hi);
|
||||||
|
@ -77,7 +111,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::DrawIndexAuto: {
|
case PM4ItOpcode::DrawIndexAuto: {
|
||||||
const auto* draw_index = reinterpret_cast<PM4CmdDrawIndexAuto*>(header);
|
const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndexAuto*>(header);
|
||||||
regs.num_indices = draw_index->index_count;
|
regs.num_indices = draw_index->index_count;
|
||||||
regs.draw_initiator = draw_index->draw_initiator;
|
regs.draw_initiator = draw_index->draw_initiator;
|
||||||
// rasterizer->DrawIndex();
|
// rasterizer->DrawIndex();
|
||||||
|
@ -88,21 +122,21 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::EventWriteEos: {
|
case PM4ItOpcode::EventWriteEos: {
|
||||||
const auto* event_eos = reinterpret_cast<PM4CmdEventWriteEos*>(header);
|
const auto* event_eos = reinterpret_cast<const PM4CmdEventWriteEos*>(header);
|
||||||
event_eos->SignalFence();
|
event_eos->SignalFence();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::EventWriteEop: {
|
case PM4ItOpcode::EventWriteEop: {
|
||||||
const auto* event_eop = reinterpret_cast<PM4CmdEventWriteEop*>(header);
|
const auto* event_eop = reinterpret_cast<const PM4CmdEventWriteEop*>(header);
|
||||||
event_eop->SignalFence();
|
event_eop->SignalFence();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::DmaData: {
|
case PM4ItOpcode::DmaData: {
|
||||||
const auto* dma_data = reinterpret_cast<PM4DmaData*>(header);
|
const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::WriteData: {
|
case PM4ItOpcode::WriteData: {
|
||||||
const auto* write_data = reinterpret_cast<PM4CmdWriteData*>(header);
|
const auto* write_data = reinterpret_cast<const PM4CmdWriteData*>(header);
|
||||||
ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
|
ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
|
||||||
const u32 data_size = (header->type3.count.Value() - 2) * 4;
|
const u32 data_size = (header->type3.count.Value() - 2) * 4;
|
||||||
if (!write_data->wr_one_addr.Value()) {
|
if (!write_data->wr_one_addr.Value()) {
|
||||||
|
@ -117,7 +151,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::WaitRegMem: {
|
case PM4ItOpcode::WaitRegMem: {
|
||||||
const auto* wait_reg_mem = reinterpret_cast<PM4CmdWaitRegMem*>(header);
|
const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
|
||||||
ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
|
ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
|
||||||
while (!wait_reg_mem->Test()) {
|
while (!wait_reg_mem->Test()) {
|
||||||
using namespace std::chrono_literals;
|
using namespace std::chrono_literals;
|
||||||
|
|
|
@ -11,6 +11,9 @@
|
||||||
#include <condition_variable>
|
#include <condition_variable>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <future>
|
#include <future>
|
||||||
|
#include <span>
|
||||||
|
#include <thread>
|
||||||
|
#include <queue>
|
||||||
|
|
||||||
namespace AmdGpu {
|
namespace AmdGpu {
|
||||||
|
|
||||||
|
@ -614,23 +617,41 @@ struct Liverpool {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Liverpool();
|
Liverpool();
|
||||||
|
~Liverpool();
|
||||||
|
|
||||||
void Submit(u32* cmdbuf, u32 size_in_bytes) {
|
void SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
|
||||||
ASSERT_MSG(!cp.valid(), "Trying to submit while previous submission is pending");
|
if (submission_lock) {
|
||||||
cp = std::async(&Liverpool::ProcessCmdList, this, cmdbuf, size_in_bytes);
|
WaitGpuIdle();
|
||||||
|
|
||||||
|
// Suspend logic goes here
|
||||||
|
|
||||||
|
submission_lock = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
std::scoped_lock lock{m_ring_access};
|
||||||
|
gfx_ring.emplace(dcb);
|
||||||
|
|
||||||
|
ASSERT_MSG(ccb.size() == 0, "CCBs are not supported yet");
|
||||||
|
}
|
||||||
|
cv_submit.notify_one();
|
||||||
}
|
}
|
||||||
void SubmitDone() {
|
void SubmitDone() {
|
||||||
// This is wrong as `submitDone()` should never be blocking. The behavior will be
|
submission_lock = true;
|
||||||
// reworked with mutiple queues introduction
|
|
||||||
if (cp.valid()) {
|
|
||||||
cp.get();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void ProcessCmdList(u32* cmdbuf, u32 size_in_bytes);
|
void ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes);
|
||||||
|
void Process(std::stop_token stoken);
|
||||||
|
void WaitGpuIdle();
|
||||||
|
|
||||||
std::future<void> cp{};
|
std::jthread process_thread{};
|
||||||
|
std::queue<std::span<const u32>> gfx_ring{};
|
||||||
|
std::condition_variable_any cv_submit{};
|
||||||
|
std::condition_variable cv_complete{};
|
||||||
|
std::mutex m_ring_access{};
|
||||||
|
|
||||||
|
bool submission_lock{};
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08);
|
static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08);
|
||||||
|
|
Loading…
Reference in New Issue