video_core: added support for indirect draws (#678)
* video_core: added support for indirect draws * barriers simplified
This commit is contained in:
parent
3d375a28eb
commit
ca1613258f
|
@ -650,12 +650,12 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexAuto(u32* cmdbuf, u32 size, u32 index_count, u32
|
||||||
}
|
}
|
||||||
|
|
||||||
s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
|
s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
|
||||||
u32 vertex_sgpr_offset, u32 instance_vgpr_offset,
|
u32 vertex_sgpr_offset, u32 instance_sgpr_offset,
|
||||||
u32 flags) {
|
u32 flags) {
|
||||||
LOG_TRACE(Lib_GnmDriver, "called");
|
LOG_TRACE(Lib_GnmDriver, "called");
|
||||||
|
|
||||||
if (cmdbuf && (size == 9) && (shader_stage < ShaderStages::Max) &&
|
if (cmdbuf && (size == 9) && (shader_stage < ShaderStages::Max) &&
|
||||||
(vertex_sgpr_offset < 0x10u) && (instance_vgpr_offset < 0x10u)) {
|
(vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u)) {
|
||||||
|
|
||||||
const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable;
|
const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable;
|
||||||
cmdbuf = WriteHeader<PM4ItOpcode::DrawIndexIndirect>(
|
cmdbuf = WriteHeader<PM4ItOpcode::DrawIndexIndirect>(
|
||||||
|
@ -665,7 +665,7 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset,
|
||||||
|
|
||||||
cmdbuf[0] = data_offset;
|
cmdbuf[0] = data_offset;
|
||||||
cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset;
|
cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset;
|
||||||
cmdbuf[2] = instance_vgpr_offset == 0 ? 0 : (instance_vgpr_offset & 0xffffu) + sgpr_offset;
|
cmdbuf[2] = instance_sgpr_offset == 0 ? 0 : (instance_sgpr_offset & 0xffffu) + sgpr_offset;
|
||||||
cmdbuf[3] = 0;
|
cmdbuf[3] = 0;
|
||||||
|
|
||||||
cmdbuf += 4;
|
cmdbuf += 4;
|
||||||
|
@ -707,11 +707,11 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset,
|
||||||
}
|
}
|
||||||
|
|
||||||
s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
|
s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
|
||||||
u32 vertex_sgpr_offset, u32 instance_vgpr_offset, u32 flags) {
|
u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags) {
|
||||||
LOG_TRACE(Lib_GnmDriver, "called");
|
LOG_TRACE(Lib_GnmDriver, "called");
|
||||||
|
|
||||||
if (cmdbuf && (size == 9) && (shader_stage < ShaderStages::Max) &&
|
if (cmdbuf && (size == 9) && (shader_stage < ShaderStages::Max) &&
|
||||||
(vertex_sgpr_offset < 0x10u) && (instance_vgpr_offset < 0x10u)) {
|
(vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u)) {
|
||||||
|
|
||||||
const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable;
|
const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable;
|
||||||
cmdbuf = WriteHeader<PM4ItOpcode::DrawIndirect>(cmdbuf, 4, PM4ShaderType::ShaderGraphics,
|
cmdbuf = WriteHeader<PM4ItOpcode::DrawIndirect>(cmdbuf, 4, PM4ShaderType::ShaderGraphics,
|
||||||
|
@ -721,7 +721,7 @@ s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32
|
||||||
|
|
||||||
cmdbuf[0] = data_offset;
|
cmdbuf[0] = data_offset;
|
||||||
cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset;
|
cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset;
|
||||||
cmdbuf[2] = instance_vgpr_offset == 0 ? 0 : (instance_vgpr_offset & 0xffffu) + sgpr_offset;
|
cmdbuf[2] = instance_sgpr_offset == 0 ? 0 : (instance_sgpr_offset & 0xffffu) + sgpr_offset;
|
||||||
cmdbuf[3] = 2; // auto index
|
cmdbuf[3] = 2; // auto index
|
||||||
|
|
||||||
cmdbuf += 4;
|
cmdbuf += 4;
|
||||||
|
|
|
@ -45,7 +45,7 @@ s32 PS4_SYSV_ABI sceGnmDrawIndex(u32* cmdbuf, u32 size, u32 index_count, uintptr
|
||||||
u32 flags, u32 type);
|
u32 flags, u32 type);
|
||||||
s32 PS4_SYSV_ABI sceGnmDrawIndexAuto(u32* cmdbuf, u32 size, u32 index_count, u32 flags);
|
s32 PS4_SYSV_ABI sceGnmDrawIndexAuto(u32* cmdbuf, u32 size, u32 index_count, u32 flags);
|
||||||
s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
|
s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
|
||||||
u32 vertex_sgpr_offset, u32 instance_vgpr_offset,
|
u32 vertex_sgpr_offset, u32 instance_sgpr_offset,
|
||||||
u32 flags);
|
u32 flags);
|
||||||
int PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti();
|
int PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti();
|
||||||
int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti();
|
int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti();
|
||||||
|
@ -53,7 +53,7 @@ int PS4_SYSV_ABI sceGnmDrawIndexMultiInstanced();
|
||||||
s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset, u32 index_count,
|
s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset, u32 index_count,
|
||||||
u32 flags);
|
u32 flags);
|
||||||
s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
|
s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
|
||||||
u32 vertex_sgpr_offset, u32 instance_vgpr_offset, u32 flags);
|
u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags);
|
||||||
int PS4_SYSV_ABI sceGnmDrawIndirectCountMulti();
|
int PS4_SYSV_ABI sceGnmDrawIndirectCountMulti();
|
||||||
int PS4_SYSV_ABI sceGnmDrawIndirectMulti();
|
int PS4_SYSV_ABI sceGnmDrawIndirectMulti();
|
||||||
u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState(u32* cmdbuf, u32 size);
|
u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState(u32* cmdbuf, u32 size);
|
||||||
|
|
|
@ -368,6 +368,36 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case PM4ItOpcode::DrawIndirect: {
|
||||||
|
const auto* draw_indirect = reinterpret_cast<const PM4CmdDrawIndirect*>(header);
|
||||||
|
const auto offset = draw_indirect->data_offset;
|
||||||
|
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
|
||||||
|
const auto size = sizeof(PM4CmdDrawIndirect::DrawInstancedArgs);
|
||||||
|
if (rasterizer) {
|
||||||
|
const auto cmd_address = reinterpret_cast<const void*>(header);
|
||||||
|
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndirect", cmd_address));
|
||||||
|
rasterizer->Breadcrumb(u64(cmd_address));
|
||||||
|
rasterizer->DrawIndirect(false, ib_address, offset, size);
|
||||||
|
rasterizer->ScopeMarkerEnd();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case PM4ItOpcode::DrawIndexIndirect: {
|
||||||
|
const auto* draw_index_indirect =
|
||||||
|
reinterpret_cast<const PM4CmdDrawIndexIndirect*>(header);
|
||||||
|
const auto offset = draw_index_indirect->data_offset;
|
||||||
|
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
|
||||||
|
const auto size = sizeof(PM4CmdDrawIndexIndirect::DrawIndexInstancedArgs);
|
||||||
|
if (rasterizer) {
|
||||||
|
const auto cmd_address = reinterpret_cast<const void*>(header);
|
||||||
|
rasterizer->ScopeMarkerBegin(
|
||||||
|
fmt::format("dcb:{}:DrawIndexIndirect", cmd_address));
|
||||||
|
rasterizer->Breadcrumb(u64(cmd_address));
|
||||||
|
rasterizer->DrawIndirect(true, ib_address, offset, size);
|
||||||
|
rasterizer->ScopeMarkerEnd();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
case PM4ItOpcode::DispatchDirect: {
|
case PM4ItOpcode::DispatchDirect: {
|
||||||
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
|
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
|
||||||
regs.cs_program.dim_x = dispatch_direct->dim_x;
|
regs.cs_program.dim_x = dispatch_direct->dim_x;
|
||||||
|
@ -488,6 +518,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PM4ItOpcode::PfpSyncMe: {
|
case PM4ItOpcode::PfpSyncMe: {
|
||||||
|
rasterizer->CpSync();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|
|
@ -253,20 +253,6 @@ struct PM4CmdDrawIndexAuto {
|
||||||
u32 draw_initiator;
|
u32 draw_initiator;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct PM4CmdDrawIndirect {
|
|
||||||
PM4Type3Header header; ///< header
|
|
||||||
u32 data_offset; ///< DWORD aligned offset
|
|
||||||
union {
|
|
||||||
u32 dw2;
|
|
||||||
BitField<0, 16, u32> base_vtx_loc; ///< base vertex location
|
|
||||||
};
|
|
||||||
union {
|
|
||||||
u32 dw3;
|
|
||||||
BitField<0, 16, u32> start_inst_loc; ///< start instance location
|
|
||||||
};
|
|
||||||
u32 draw_initiator; ///< Draw Initiator Register
|
|
||||||
};
|
|
||||||
|
|
||||||
enum class DataSelect : u32 {
|
enum class DataSelect : u32 {
|
||||||
None = 0,
|
None = 0,
|
||||||
Data32Low = 1,
|
Data32Low = 1,
|
||||||
|
@ -740,4 +726,51 @@ struct PM4CmdDispatchIndirect {
|
||||||
u32 dispatch_initiator; ///< Dispatch Initiator Register
|
u32 dispatch_initiator; ///< Dispatch Initiator Register
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct PM4CmdDrawIndirect {
|
||||||
|
struct DrawInstancedArgs {
|
||||||
|
u32 vertex_count_per_instance;
|
||||||
|
u32 instance_count;
|
||||||
|
u32 start_vertex_location;
|
||||||
|
u32 start_instance_location;
|
||||||
|
};
|
||||||
|
|
||||||
|
PM4Type3Header header; ///< header
|
||||||
|
u32 data_offset; ///< Byte aligned offset where the required data structure starts
|
||||||
|
union {
|
||||||
|
u32 dw2;
|
||||||
|
BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the
|
||||||
|
///< BaseVertexLocation it fetched from memory
|
||||||
|
};
|
||||||
|
union {
|
||||||
|
u32 dw3;
|
||||||
|
BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the
|
||||||
|
///< StartInstanceLocation it fetched from memory
|
||||||
|
};
|
||||||
|
u32 draw_initiator; ///< Draw Initiator Register
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PM4CmdDrawIndexIndirect {
|
||||||
|
struct DrawIndexInstancedArgs {
|
||||||
|
u32 index_count_per_instance;
|
||||||
|
u32 instance_count;
|
||||||
|
u32 start_index_location;
|
||||||
|
u32 base_vertex_location;
|
||||||
|
u32 start_instance_location;
|
||||||
|
};
|
||||||
|
|
||||||
|
PM4Type3Header header; ///< header
|
||||||
|
u32 data_offset; ///< Byte aligned offset where the required data structure starts
|
||||||
|
union {
|
||||||
|
u32 dw2;
|
||||||
|
BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the
|
||||||
|
///< BaseVertexLocation it fetched from memory
|
||||||
|
};
|
||||||
|
union { // NOTE: this one is undocumented in AMD spec, but Gnm driver writes this field
|
||||||
|
u32 dw3;
|
||||||
|
BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the
|
||||||
|
///< StartInstanceLocation it fetched from memory
|
||||||
|
};
|
||||||
|
u32 draw_initiator; ///< Draw Initiator Register
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace AmdGpu
|
} // namespace AmdGpu
|
||||||
|
|
|
@ -29,6 +29,19 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
|
||||||
|
|
||||||
Rasterizer::~Rasterizer() = default;
|
Rasterizer::~Rasterizer() = default;
|
||||||
|
|
||||||
|
void Rasterizer::CpSync() {
|
||||||
|
scheduler.EndRendering();
|
||||||
|
auto cmdbuf = scheduler.CommandBuffer();
|
||||||
|
|
||||||
|
const vk::MemoryBarrier ib_barrier{
|
||||||
|
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
|
||||||
|
.dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead,
|
||||||
|
};
|
||||||
|
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
|
||||||
|
vk::PipelineStageFlagBits::eDrawIndirect,
|
||||||
|
vk::DependencyFlagBits::eByRegion, ib_barrier, {}, {});
|
||||||
|
}
|
||||||
|
|
||||||
void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
|
void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
|
||||||
RENDERER_TRACE;
|
RENDERER_TRACE;
|
||||||
|
|
||||||
|
@ -66,6 +79,45 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Rasterizer::DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size) {
|
||||||
|
RENDERER_TRACE;
|
||||||
|
|
||||||
|
const auto cmdbuf = scheduler.CommandBuffer();
|
||||||
|
const auto& regs = liverpool->regs;
|
||||||
|
const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline();
|
||||||
|
if (!pipeline) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ASSERT_MSG(regs.primitive_type != AmdGpu::Liverpool::PrimitiveType::RectList,
|
||||||
|
"Unsupported primitive type for indirect draw");
|
||||||
|
|
||||||
|
try {
|
||||||
|
pipeline->BindResources(regs, buffer_cache, texture_cache);
|
||||||
|
} catch (...) {
|
||||||
|
UNREACHABLE();
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex);
|
||||||
|
buffer_cache.BindVertexBuffers(vs_info);
|
||||||
|
const u32 num_indices = buffer_cache.BindIndexBuffer(is_indexed, 0);
|
||||||
|
|
||||||
|
BeginRendering();
|
||||||
|
UpdateDynamicState(*pipeline);
|
||||||
|
|
||||||
|
const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true);
|
||||||
|
const auto total_offset = base + offset;
|
||||||
|
|
||||||
|
// We can safely ignore both SGPR UD indices and results of fetch shader parsing, as vertex and
|
||||||
|
// instance offsets will be automatically applied by Vulkan from indirect args buffer.
|
||||||
|
|
||||||
|
if (is_indexed) {
|
||||||
|
cmdbuf.drawIndexedIndirect(buffer->Handle(), total_offset, 1, 0);
|
||||||
|
} else {
|
||||||
|
cmdbuf.drawIndirect(buffer->Handle(), total_offset, 1, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void Rasterizer::DispatchDirect() {
|
void Rasterizer::DispatchDirect() {
|
||||||
RENDERER_TRACE;
|
RENDERER_TRACE;
|
||||||
|
|
||||||
|
@ -113,19 +165,6 @@ void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) {
|
||||||
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
|
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
|
||||||
const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true);
|
const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true);
|
||||||
const auto total_offset = base + offset;
|
const auto total_offset = base + offset;
|
||||||
|
|
||||||
// Emulate PFP-to-ME sync packet
|
|
||||||
const vk::BufferMemoryBarrier ib_barrier{
|
|
||||||
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
|
|
||||||
.dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead,
|
|
||||||
.buffer = buffer->Handle(),
|
|
||||||
.offset = total_offset,
|
|
||||||
.size = size,
|
|
||||||
};
|
|
||||||
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
|
|
||||||
vk::PipelineStageFlagBits::eDrawIndirect,
|
|
||||||
vk::DependencyFlagBits::eByRegion, {}, ib_barrier, {});
|
|
||||||
|
|
||||||
cmdbuf.dispatchIndirect(buffer->Handle(), total_offset);
|
cmdbuf.dispatchIndirect(buffer->Handle(), total_offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
void Draw(bool is_indexed, u32 index_offset = 0);
|
void Draw(bool is_indexed, u32 index_offset = 0);
|
||||||
|
void DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size);
|
||||||
|
|
||||||
void DispatchDirect();
|
void DispatchDirect();
|
||||||
void DispatchIndirect(VAddr address, u32 offset, u32 size);
|
void DispatchIndirect(VAddr address, u32 offset, u32 size);
|
||||||
|
@ -45,6 +46,7 @@ public:
|
||||||
void MapMemory(VAddr addr, u64 size);
|
void MapMemory(VAddr addr, u64 size);
|
||||||
void UnmapMemory(VAddr addr, u64 size);
|
void UnmapMemory(VAddr addr, u64 size);
|
||||||
|
|
||||||
|
void CpSync();
|
||||||
u64 Flush();
|
u64 Flush();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
Loading…
Reference in New Issue