diff --git a/CMakeLists.txt b/CMakeLists.txt
index a79c1bd6..2400ea81 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -337,11 +337,13 @@ set(SHADER_RECOMPILER
     src/shader_recompiler/exception.h
     src/shader_recompiler/frontend/opcodes.h
     src/shader_recompiler/frontend/structured_control_flow.cpp
     src/shader_recompiler/frontend/structured_control_flow.h
-    src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
-    src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
     src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
-    src/shader_recompiler/ir/passes/info_collection.cpp
-    src/shader_recompiler/ir/passes/passes.h
+    src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp
+    src/shader_recompiler/ir/passes/identity_removal_pass.cpp
+    src/shader_recompiler/ir/passes/ir_passes.h
+    src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+    src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+    src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
     src/shader_recompiler/ir/abstract_syntax_list.h
     src/shader_recompiler/ir/attribute.cpp
     src/shader_recompiler/ir/attribute.h
@@ -378,6 +380,8 @@ set(VIDEO_CORE
     src/video_core/amdgpu/liverpool.cpp
     src/video_core/renderer_vulkan/renderer_vulkan.h
     src/video_core/renderer_vulkan/vk_common.cpp
     src/video_core/renderer_vulkan/vk_common.h
+    src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+    src/video_core/renderer_vulkan/vk_compute_pipeline.h
     src/video_core/renderer_vulkan/vk_descriptor_update_queue.cpp
     src/video_core/renderer_vulkan/vk_descriptor_update_queue.h
     src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
diff --git a/externals/sirit b/externals/sirit
index 9c12a07e..8c281cc0 160000
--- a/externals/sirit
+++ b/externals/sirit
@@ -1 +1 @@
-Subproject commit 9c12a07e62dfa404727e7fc85dd83bba84cc830d
+Subproject commit 8c281cc0b7cd638d3853a5aa2fc35b969fcbb599
diff --git a/src/core/libraries/kernel/event_queue.cpp b/src/core/libraries/kernel/event_queue.cpp
index faa29541..6392d078 100644
--- a/src/core/libraries/kernel/event_queue.cpp
+++ b/src/core/libraries/kernel/event_queue.cpp
@@ -36,17 +36,12 @@ int EqueueInternal::waitForEvents(SceKernelEvent* ev, int num, u32 micros) {
         ret = getTriggeredEvents(ev, num);
         return ret > 0;
     };
-#ifndef _WIN64
-    char buf[128];
-    pthread_getname_np(pthread_self(), buf, 128);
-    fmt::print("Thread {} waiting for events (micros = {})\n", buf, micros);
-#endif // !_WIN64
+
     if (micros == 0) {
         m_cond.wait(lock, predicate);
     } else {
         m_cond.wait_for(lock, std::chrono::microseconds(micros), predicate);
    }
-    fmt::print("Wait done\n");
     return ret;
 }
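A note on the `waitForEvents` change above: with the predicate overloads used here, `std::condition_variable::wait_for` returns the predicate's value at wake-up rather than a `std::cv_status`, and spurious wakeups are retried internally, so dropping the debug prints does not change the blocking semantics. A minimal standalone sketch of the same pattern (all names invented for illustration, not the emulator's code):

```cpp
#include <chrono>
#include <condition_variable>
#include <mutex>

// Wait until `ready` becomes true, or until `micros` microseconds elapse.
// wait_for with a predicate returns the predicate result, not a timeout flag,
// which is why the caller derives its result from shared state (`ret` above).
bool WaitReady(std::mutex& m, std::condition_variable& cv, const bool& ready, unsigned micros) {
    std::unique_lock lock{m};
    if (micros == 0) {
        cv.wait(lock, [&] { return ready; }); // block until the predicate holds
        return true;
    }
    return cv.wait_for(lock, std::chrono::microseconds(micros), [&] { return ready; });
}
```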
diff --git a/src/core/libraries/kernel/libkernel.cpp b/src/core/libraries/kernel/libkernel.cpp
index a6c2231a..cad5dd97 100644
--- a/src/core/libraries/kernel/libkernel.cpp
+++ b/src/core/libraries/kernel/libkernel.cpp
@@ -52,7 +52,7 @@ int PS4_SYSV_ABI sceKernelMunmap(void* addr, size_t len) {
     return SCE_OK;
 }
 
-void PS4_SYSV_ABI sceKernelUsleep(unsigned int microseconds) {
+void PS4_SYSV_ABI sceKernelUsleep(u32 microseconds) {
     std::this_thread::sleep_for(std::chrono::microseconds(microseconds));
 }
 
diff --git a/src/core/libraries/np_trophy/np_trophy.cpp b/src/core/libraries/np_trophy/np_trophy.cpp
index fda0aa42..f7fb6e09 100644
--- a/src/core/libraries/np_trophy/np_trophy.cpp
+++ b/src/core/libraries/np_trophy/np_trophy.cpp
@@ -71,7 +71,7 @@ int PS4_SYSV_ABI sceNpTrophyCreateContext() {
 
 int PS4_SYSV_ABI sceNpTrophyCreateHandle() {
     LOG_ERROR(Lib_NpTrophy, "(STUBBED) called");
-    return ORBIS_OK;
+    return -1;
 }
 
 int PS4_SYSV_ABI sceNpTrophyDestroyContext() {
@@ -617,4 +617,4 @@ void RegisterlibSceNpTrophy(Core::Loader::SymbolsResolver* sym) {
     LIB_FUNCTION("+not13BEdVI", "libSceNpTrophy", 1, "libSceNpTrophy", 1, 1,
                  Func_FA7A2DD770447552);
 };
-} // namespace Libraries::NpTrophy
\ No newline at end of file
+} // namespace Libraries::NpTrophy
diff --git a/src/core/libraries/videoout/driver.cpp b/src/core/libraries/videoout/driver.cpp
index a91d95c2..b6e1ff73 100644
--- a/src/core/libraries/videoout/driver.cpp
+++ b/src/core/libraries/videoout/driver.cpp
@@ -202,7 +202,6 @@ void VideoOutDriver::Flip(std::chrono::microseconds timeout) {
 
     // Reset flip label
     req.port->buffer_labels[req.index] = 0;
-    LOG_INFO(Lib_VideoOut, "Flip done [buf = {}]", req.index);
 }
 
 bool VideoOutDriver::SubmitFlip(VideoOutPort* port, s32 index, s64 flip_arg,
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 7838b9cc..83758688 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -199,7 +199,6 @@ MemoryManager::VMAHandle MemoryManager::MergeAdjacent(VMAHandle iter) {
 }
 
 void MemoryManager::MapVulkanMemory(VAddr addr, size_t size) {
-    return;
     const vk::Device device = instance->GetDevice();
     const auto memory_props = instance->GetPhysicalDevice().getMemoryProperties();
     void* host_pointer = reinterpret_cast<void*>(addr);
@@ -271,7 +270,6 @@ void MemoryManager::MapVulkanMemory(VAddr addr, size_t size) {
 }
 
 void MemoryManager::UnmapVulkanMemory(VAddr addr, size_t size) {
-    return;
     const auto it = mapped_memories.find(addr);
     ASSERT(it != mapped_memories.end() && it->second.buffer_size == size);
     mapped_memories.erase(it);
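For reviewers unfamiliar with `MapVulkanMemory`: removing the early `return` re-enables the host-pointer import path below it. The underlying mechanism is `VK_EXT_external_memory_host`; in Vulkan-Hpp the core of such an import looks roughly like the sketch below. This is a hedged sketch under stated assumptions — `host_pointer`, `aligned_size` and `host_visible_index` are placeholder names, not the emulator's actual variables.

```cpp
#include <cstdint>
#include <vulkan/vulkan.hpp>

// Sketch: import a page-aligned host allocation as Vulkan device memory.
// VK_EXT_external_memory_host must be enabled on `device`.
vk::DeviceMemory ImportHostMemory(vk::Device device, void* host_pointer,
                                  vk::DeviceSize aligned_size, std::uint32_t host_visible_index) {
    vk::ImportMemoryHostPointerInfoEXT import_info{};
    import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
    import_info.pHostPointer = host_pointer;

    vk::MemoryAllocateInfo alloc_info{};
    alloc_info.pNext = &import_info;          // chain the import struct
    alloc_info.allocationSize = aligned_size; // multiple of minImportedHostPointerAlignment
    alloc_info.memoryTypeIndex = host_visible_index; // chosen from getMemoryProperties()
    return device.allocateMemory(alloc_info);
}
```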
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index 39aea9c9..c211be25 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -173,10 +173,10 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) {
     spv::ExecutionModel execution_model{};
     switch (program.info.stage) {
     case Stage::Compute: {
-        // const std::array<u32, 3> workgroup_size{program.workgroup_size};
-        // execution_model = spv::ExecutionModel::GLCompute;
-        // ctx.AddExecutionMode(main, spv::ExecutionMode::LocalSize, workgroup_size[0],
-        //                      workgroup_size[1], workgroup_size[2]);
+        const std::array<u32, 3> workgroup_size{program.info.workgroup_size};
+        execution_model = spv::ExecutionModel::GLCompute;
+        ctx.AddExecutionMode(main, spv::ExecutionMode::LocalSize, workgroup_size[0],
+                             workgroup_size[1], workgroup_size[2]);
         break;
     }
     case Stage::Vertex:
@@ -189,6 +189,7 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) {
     } else {
         ctx.AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft);
     }
+    ctx.AddCapability(spv::Capability::DemoteToHelperInvocationEXT);
     // if (program.info.stores_frag_depth) {
     //     ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing);
     // }
@@ -249,7 +250,11 @@ Id EmitIdentity(EmitContext& ctx, const IR::Value& value) {
 }
 
 Id EmitConditionRef(EmitContext& ctx, const IR::Value& value) {
-    throw NotImplementedException("Forward identity declaration");
+    const Id id{ctx.Def(value)};
+    if (!Sirit::ValidId(id)) {
+        throw NotImplementedException("Forward identity declaration");
+    }
+    return id;
 }
 
 void EmitReference(EmitContext&) {}
@@ -258,23 +263,11 @@ void EmitPhiMove(EmitContext&) {
     throw LogicError("Unreachable instruction");
 }
 
-void EmitGetZeroFromOp(EmitContext&) {
+void EmitGetScc(EmitContext& ctx) {
     throw LogicError("Unreachable instruction");
 }
 
-void EmitGetSignFromOp(EmitContext&) {
-    throw LogicError("Unreachable instruction");
-}
-
-void EmitGetCarryFromOp(EmitContext&) {
-    throw LogicError("Unreachable instruction");
-}
-
-void EmitGetOverflowFromOp(EmitContext&) {
-    throw LogicError("Unreachable instruction");
-}
-
-void EmitSetVcc(EmitContext& ctx) {
+void EmitGetExec(EmitContext& ctx) {
     throw LogicError("Unreachable instruction");
 }
 
@@ -282,4 +275,24 @@ void EmitGetVcc(EmitContext& ctx) {
     throw LogicError("Unreachable instruction");
 }
 
+void EmitGetVccLo(EmitContext& ctx) {
+    throw LogicError("Unreachable instruction");
+}
+
+void EmitSetScc(EmitContext& ctx) {
+    throw LogicError("Unreachable instruction");
+}
+
+void EmitSetExec(EmitContext& ctx) {
+    throw LogicError("Unreachable instruction");
+}
+
+void EmitSetVcc(EmitContext& ctx) {
+    throw LogicError("Unreachable instruction");
+}
+
+void EmitSetVccLo(EmitContext& ctx) {
+    throw LogicError("Unreachable instruction");
+}
+
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index b5011218..d6c67ee9 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -29,8 +29,8 @@ Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) {
     }
 } // Anonymous namespace
 
-void EmitGetUserData(EmitContext&) {
-    throw LogicError("Unreachable instruction");
+Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) {
+    return ctx.ConstU32(ctx.info.user_data[static_cast<size_t>(reg)]);
 }
 
 void EmitGetScalarRegister(EmitContext&) {
@@ -62,10 +62,13 @@ Id EmitReadConst(EmitContext& ctx) {
 }
 
 Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
-    const Id buffer = ctx.buffers[handle];
-    const Id type = ctx.info.buffers[handle].is_storage ? ctx.storage_f32 : ctx.uniform_f32;
-    const Id ptr{ctx.OpAccessChain(type, buffer, ctx.ConstU32(0U), index)};
-    return ctx.OpLoad(ctx.F32[1], ptr);
+    const auto& buffer = ctx.buffers[handle];
+    const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
+    return ctx.OpLoad(buffer.data_types->Get(1), ptr);
+}
+
+Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index) {
+    return EmitReadConstBuffer(ctx, handle, index);
 }
 
 Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) {
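The new `EmitGetUserData` above turns user-data SGPR reads into SPIR-V constants: the user-data dwords are known when the shader is recompiled, so a read of, say, `s2` becomes an `OpConstant` instead of a runtime register access. A toy model of the folding, with invented values:

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// User-data SGPRs captured when the shader binary is parsed (values invented).
constexpr std::array<std::uint32_t, 16> user_data{0x0000'0010, 0x0bad'f00d, 0x0000'0042};

// Equivalent of EmitGetUserData: the register index selects a compile-time
// constant, which the backend emits as OpConstant in the module.
constexpr std::uint32_t GetUserData(std::size_t sreg) {
    return user_data[sreg];
}

static_assert(GetUserData(2) == 0x42);
```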
@@ -76,8 +79,12 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) {
         // Attribute is disabled or varying component is not written
         return ctx.ConstF32(comp == 3 ? 1.0f : 0.0f);
     }
-    const Id pointer{ctx.OpAccessChain(param.pointer_type, param.id, ctx.ConstU32(comp))};
-    return ctx.OpLoad(param.component_type, pointer);
+    if (param.num_components > 1) {
+        const Id pointer{ctx.OpAccessChain(param.pointer_type, param.id, ctx.ConstU32(comp))};
+        return ctx.OpLoad(param.component_type, pointer);
+    } else {
+        return ctx.OpLoad(param.component_type, param.id);
+    }
 }
 throw NotImplementedException("Read attribute {}", attr);
 }
@@ -86,6 +93,11 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
     switch (attr) {
     case IR::Attribute::VertexId:
         return ctx.OpLoad(ctx.U32[1], ctx.vertex_index);
+    case IR::Attribute::WorkgroupId:
+        return ctx.OpCompositeExtract(ctx.U32[1], ctx.OpLoad(ctx.U32[3], ctx.workgroup_id), comp);
+    case IR::Attribute::LocalInvocationId:
+        return ctx.OpCompositeExtract(ctx.U32[1], ctx.OpLoad(ctx.U32[3], ctx.local_invocation_id),
+                                      comp);
     default:
         throw NotImplementedException("Read U32 attribute {}", attr);
     }
@@ -97,9 +109,22 @@ void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 elemen
 }
 
 Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    const auto info = inst->Flags<IR::BufferInstInfo>();
+    const auto& buffer = ctx.buffers[handle];
+    if (info.index_enable && info.offset_enable) {
+        UNREACHABLE();
+    } else if (info.index_enable) {
+        const Id ptr{
+            ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, address)};
+        return ctx.OpLoad(buffer.data_types->Get(1), ptr);
+    }
     UNREACHABLE();
 }
 
+Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    return EmitLoadBufferF32(ctx, inst, handle, address);
+}
+
 Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     UNREACHABLE();
 }
@@ -110,18 +135,48 @@ Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address)
 
 Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     const auto info = inst->Flags<IR::BufferInstInfo>();
-    const Id buffer = ctx.buffers[handle];
-    const Id type = ctx.info.buffers[handle].is_storage ? ctx.storage_f32 : ctx.uniform_f32;
+    const auto& buffer = ctx.buffers[handle];
     if (info.index_enable && info.offset_enable) {
         UNREACHABLE();
     } else if (info.index_enable) {
         boost::container::static_vector<Id, 4> ids;
         for (u32 i = 0; i < 4; i++) {
             const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))};
-            const Id ptr{ctx.OpAccessChain(type, buffer, ctx.ConstU32(0U), index)};
-            ids.push_back(ctx.OpLoad(ctx.F32[1], ptr));
+            const Id ptr{
+                ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
+            ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
         }
-        return ctx.OpCompositeConstruct(ctx.F32[4], ids);
+        return ctx.OpCompositeConstruct(buffer.data_types->Get(4), ids);
+    }
+    UNREACHABLE();
+}
+
+void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
+    UNREACHABLE();
+}
+
+void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
+    UNREACHABLE();
+}
+
+void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
+    UNREACHABLE();
+}
+
+void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
+    UNREACHABLE();
+}
+
+void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
+    const auto info = inst->Flags<IR::BufferInstInfo>();
+    const auto& buffer = ctx.buffers[handle];
+    if (info.index_enable && info.offset_enable) {
+        UNREACHABLE();
+    } else if (info.index_enable) {
+        const Id ptr{
+            ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, address)};
+        ctx.OpStore(ptr, value);
+        return;
     }
     UNREACHABLE();
 }
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp
index 148eee0c..e56eb916 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp
@@ -30,6 +30,10 @@ Id EmitFPAdd64(EmitContext& ctx, IR::Inst* inst, Id a, Id b) {
     return ctx.OpFAdd(ctx.F64[1], a, b);
 }
 
+Id EmitFPSub32(EmitContext& ctx, IR::Inst* inst, Id a, Id b) {
+    return ctx.OpFSub(ctx.F32[1], a, b);
+}
+
 Id EmitFPFma16(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c) {
     return ctx.OpFma(ctx.F16[1], a, b, c);
 }
@@ -196,6 +200,10 @@ Id EmitFPTrunc64(EmitContext& ctx, Id value) {
     return ctx.OpTrunc(ctx.F64[1], value);
 }
 
+Id EmitFPFract(EmitContext& ctx, Id value) {
+    return ctx.OpFract(ctx.F32[1], value);
+}
+
 Id EmitFPOrdEqual16(EmitContext& ctx, Id lhs, Id rhs) {
     return ctx.OpFOrdEqual(ctx.U1[1], lhs, rhs);
 }
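`EmitFPFract` maps to the GLSL.std.450 `Fract`, defined as `x - floor(x)`; the result is always in `[0, 1)`, including for negative inputs, which matches GCN's `V_FRACT_F32` semantics. A quick standalone reference:

```cpp
#include <cassert>
#include <cmath>

// fract(x) = x - floor(x), always in [0, 1).
float Fract(float x) {
    return x - std::floor(x);
}

int main() {
    assert(Fract(1.25f) == 0.25f);
    assert(Fract(-0.25f) == 0.75f); // floor(-0.25) == -1, so the result wraps up
}
```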
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 77416e7f..2192b054 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -8,7 +8,7 @@
 namespace Shader::IR {
 enum class Attribute : u64;
-enum class Patch : u64;
+enum class ScalarReg : u32;
 class Inst;
 class Value;
 } // namespace Shader::IR
@@ -30,11 +30,18 @@ void EmitJoin(EmitContext& ctx);
 void EmitBarrier(EmitContext& ctx);
 void EmitWorkgroupMemoryBarrier(EmitContext& ctx);
 void EmitDeviceMemoryBarrier(EmitContext& ctx);
+void EmitGetScc(EmitContext& ctx);
+void EmitGetExec(EmitContext& ctx);
 void EmitGetVcc(EmitContext& ctx);
+void EmitGetVccLo(EmitContext& ctx);
+void EmitSetScc(EmitContext& ctx);
+void EmitSetExec(EmitContext& ctx);
 void EmitSetVcc(EmitContext& ctx);
+void EmitSetVccLo(EmitContext& ctx);
 void EmitPrologue(EmitContext& ctx);
 void EmitEpilogue(EmitContext& ctx);
-void EmitGetUserData(EmitContext& ctx);
+void EmitDiscard(EmitContext& ctx);
+Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg);
 void EmitGetScalarRegister(EmitContext& ctx);
 void EmitSetScalarRegister(EmitContext& ctx);
 void EmitGetVectorRegister(EmitContext& ctx);
@@ -44,10 +51,17 @@ void EmitGetGotoVariable(EmitContext& ctx);
 void EmitSetScc(EmitContext& ctx);
 Id EmitReadConst(EmitContext& ctx);
 Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index);
+Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index);
 Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
+Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
+void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp);
 Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp);
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 comp);
@@ -137,6 +151,7 @@ Id EmitFPAbs64(EmitContext& ctx, Id value);
 Id EmitFPAdd16(EmitContext& ctx, IR::Inst* inst, Id a, Id b);
 Id EmitFPAdd32(EmitContext& ctx, IR::Inst* inst, Id a, Id b);
 Id EmitFPAdd64(EmitContext& ctx, IR::Inst* inst, Id a, Id b);
+Id EmitFPSub32(EmitContext& ctx, IR::Inst* inst, Id a, Id b);
 Id EmitFPFma16(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c);
 Id EmitFPFma32(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c);
 Id EmitFPFma64(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c);
@@ -177,6 +192,7 @@ Id EmitFPCeil64(EmitContext& ctx, Id value);
 Id EmitFPTrunc16(EmitContext& ctx, Id value);
 Id EmitFPTrunc32(EmitContext& ctx, Id value);
 Id EmitFPTrunc64(EmitContext& ctx, Id value);
+Id EmitFPFract(EmitContext& ctx, Id value);
 Id EmitFPOrdEqual16(EmitContext& ctx, Id lhs, Id rhs);
 Id EmitFPOrdEqual32(EmitContext& ctx, Id lhs, Id rhs);
 Id EmitFPOrdEqual64(EmitContext& ctx, Id lhs, Id rhs);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
index 43f1a581..0ef985a9 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
@@ -10,6 +10,10 @@ void EmitPrologue(EmitContext& ctx) {}
 
 void EmitEpilogue(EmitContext& ctx) {}
 
+void EmitDiscard(EmitContext& ctx) {
+    ctx.OpDemoteToHelperInvocationEXT();
+}
+
 void EmitEmitVertex(EmitContext& ctx, const IR::Value& stream) {
     throw NotImplementedException("Geometry streams");
 }
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index a2a0fc9a..de0fedd4 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -194,6 +194,12 @@ void EmitContext::DefineInputs(const Info& info) {
             input_params[input.semantic] = {id, input_f32, F32[1], num_components};
             interfaces.push_back(id);
         }
+        break;
+    case Stage::Compute:
+        workgroup_id = DefineVariable(U32[3], spv::BuiltIn::WorkgroupId, spv::StorageClass::Input);
+        local_invocation_id =
+            DefineVariable(U32[3], spv::BuiltIn::LocalInvocationId, spv::StorageClass::Input);
+        break;
     default:
         break;
     }
@@ -233,10 +239,11 @@ void EmitContext::DefineOutputs(const Info& info) {
 
 void EmitContext::DefineBuffers(const Info& info) {
     for (u32 i = 0; const auto& buffer : info.buffers) {
-        ASSERT(True(buffer.used_types & IR::Type::F32));
-        ASSERT(buffer.stride % sizeof(float) == 0);
-        const u32 num_elements = buffer.stride * buffer.num_records / sizeof(float);
-        const Id record_array_type{TypeArray(F32[1], ConstU32(num_elements))};
+        const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32;
+        const Id data_type = (*data_types)[1];
+        const u32 stride = buffer.stride == 0 ? 1 : buffer.stride;
+        const u32 num_elements = stride * buffer.num_records;
+        const Id record_array_type{TypeArray(data_type, ConstU32(num_elements))};
         const Id struct_type{TypeStruct(record_array_type)};
         Decorate(record_array_type, spv::Decoration::ArrayStride, 4);
@@ -249,18 +256,18 @@ void EmitContext::DefineBuffers(const Info& info) {
         const auto storage_class =
             buffer.is_storage ? spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform;
         const Id struct_pointer_type{TypePointer(storage_class, struct_type)};
-        if (buffer.is_storage) {
-            storage_f32 = TypePointer(storage_class, F32[1]);
-        } else {
-            uniform_f32 = TypePointer(storage_class, F32[1]);
-        }
+        const Id pointer_type = TypePointer(storage_class, data_type);
         const Id id{AddGlobalVariable(struct_pointer_type, storage_class)};
         Decorate(id, spv::Decoration::Binding, binding);
         Decorate(id, spv::Decoration::DescriptorSet, 0U);
-        Name(id, fmt::format("c{}", i));
+        Name(id, fmt::format("{}{}", buffer.is_storage ? "ssbo" : "cbuf", i));
         binding++;
-        buffers.push_back(id);
+        buffers.push_back({
+            .id = id,
+            .data_types = data_types,
+            .pointer_type = pointer_type,
+        });
         interfaces.push_back(id);
         i++;
     }
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index ba0deca2..53d59f43 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -23,6 +23,14 @@ struct VectorIds {
         return ids[index - 1];
     }
 
+    [[nodiscard]] Id& Get(u32 index) {
+        return ids[index - 1];
+    }
+
+    [[nodiscard]] const Id& Get(u32 index) const {
+        return ids[index - 1];
+    }
+
     std::array<Id, 4> ids;
 };
@@ -141,9 +149,6 @@ public:
     Id output_u32{};
     Id output_f32{};
 
-    Id uniform_f32{};
-    Id storage_f32{};
-
     boost::container::small_vector<Id, 32> interfaces;
 
     Id output_position{};
@@ -151,6 +156,9 @@ public:
     Id base_vertex{};
     std::array<Id, 8> frag_color{};
 
+    Id workgroup_id{};
+    Id local_invocation_id{};
+
     struct TextureDefinition {
         Id id;
         Id sampled_type;
         Id pointer_type;
         Id image_type;
     };
 
+    struct BufferDefinition {
+        Id id;
+        const VectorIds* data_types;
+        Id pointer_type;
+    };
+
     u32& binding;
-    boost::container::small_vector<Id, 4> buffers;
+    boost::container::small_vector<BufferDefinition, 4> buffers;
     boost::container::small_vector<TextureDefinition, 4> images;
     boost::container::small_vector<Id, 4> samplers;
diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp
index 475732c1..bd289ffb 100644
--- a/src/shader_recompiler/frontend/control_flow_graph.cpp
+++ b/src/shader_recompiler/frontend/control_flow_graph.cpp
@@ -42,7 +42,7 @@ static IR::Condition MakeCondition(Opcode opcode) {
 
 CFG::CFG(ObjectPool<Block>& block_pool_, std::span<const GcnInst> inst_list_)
     : block_pool{block_pool_}, inst_list{inst_list_} {
-    index_to_pc.resize(inst_list.size());
+    index_to_pc.resize(inst_list.size() + 1);
     EmitLabels();
     EmitBlocks();
     LinkBlocks();
@@ -78,6 +78,7 @@ void CFG::EmitLabels() {
         }
         pc += inst.length;
     }
+    index_to_pc[inst_list.size()] = pc;
 
     // Sort labels to make sure block insertion is correct.
     std::ranges::sort(labels);
@@ -90,7 +91,7 @@ void CFG::EmitBlocks() {
         }
         const auto it_index = std::ranges::lower_bound(index_to_pc, label);
         ASSERT(it_index != index_to_pc.end() || label > index_to_pc.back());
-        return std::distance(index_to_pc.begin(), std::prev(it_index));
+        return std::distance(index_to_pc.begin(), it_index);
     };
 
     for (auto it = labels.begin(); it != labels.end(); it++) {
@@ -102,7 +103,7 @@ void CFG::EmitBlocks() {
             return;
         }
         const Label end = *next_it;
-        const size_t end_index = get_index(end);
+        const size_t end_index = get_index(end) - 1;
         const auto& end_inst = inst_list[end_index];
 
         // Insert block between the labels using the last instruction
@@ -146,9 +147,15 @@ void CFG::LinkBlocks() {
             block.branch_true = get_block(target_pc);
             block.branch_false = get_block(block.end);
             block.end_class = EndClass::Branch;
+        } else if (end_inst.opcode == Opcode::S_ENDPGM) {
+            const auto& prev_inst = inst_list[block.end_index - 1];
+            if (prev_inst.opcode == Opcode::EXP && prev_inst.control.exp.en == 0) {
+                block.end_class = EndClass::Kill;
+            } else {
+                block.end_class = EndClass::Exit;
+            }
         } else {
-            // Exit blocks don't link to anything.
-            block.end_class = EndClass::Exit;
+            UNREACHABLE();
         }
     }
 }
@@ -187,12 +194,12 @@ std::string CFG::Dot() const {
                 fmt::format("\t\tN{} [label=\"Exit\"][shape=square][style=stripped];\n", node_uid);
             ++node_uid;
             break;
-        // case EndClass::Kill:
-        //     dot += fmt::format("\t\t{}->N{};\n", name, node_uid);
-        //     dot += fmt::format("\t\tN{} [label=\"Kill\"][shape=square][style=stripped];\n",
-        //                        node_uid);
-        //     ++node_uid;
-        //     break;
+        case EndClass::Kill:
+            dot += fmt::format("\t\t{}->N{};\n", name, node_uid);
+            dot +=
+                fmt::format("\t\tN{} [label=\"Kill\"][shape=square][style=stripped];\n", node_uid);
+            ++node_uid;
+            break;
         }
     }
     dot += "\t\tlabel = \"main\";\n\t}\n";
diff --git a/src/shader_recompiler/frontend/control_flow_graph.h b/src/shader_recompiler/frontend/control_flow_graph.h
index f3c00793..b9eb12aa 100644
--- a/src/shader_recompiler/frontend/control_flow_graph.h
+++ b/src/shader_recompiler/frontend/control_flow_graph.h
@@ -21,6 +21,7 @@ using Hook =
 enum class EndClass {
     Branch, ///< Block ends with a (un)conditional branch.
     Exit,   ///< Block ends with an exit instruction.
+    Kill,   ///< Block ends with a discard instruction.
 };
 
 /// A block represents a linear range of instructions.
diff --git a/src/shader_recompiler/frontend/decode.cpp b/src/shader_recompiler/frontend/decode.cpp
index dcc0495d..b5c02d74 100644
--- a/src/shader_recompiler/frontend/decode.cpp
+++ b/src/shader_recompiler/frontend/decode.cpp
@@ -684,7 +684,7 @@ void GcnDecodeContext::decodeInstructionVOP3(uint64_t hexInstruction) {
     outputMod.clamp = static_cast<bool>(control.clmp);
     switch (control.omod) {
     case 0:
-        outputMod.multiplier = std::numeric_limits<float>::quiet_NaN();
+        outputMod.multiplier = 0.f;
         break;
     case 1:
         outputMod.multiplier = 2.0f;
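Context on the sentinel swap: VOP3's `omod` field only encodes ×2, ×4 and ÷2, so `0.f` can never be a legitimate multiplier and is a cheaper "disabled" marker than `NaN` (which compares false to everything, including itself, and would need `std::isnan` to test). What the consumer of this field effectively does:

```cpp
// omod encoding: 0 = none, 1 = x2, 2 = x4, 3 = /2.
float ApplyOmod(float value, unsigned omod) {
    constexpr float multipliers[4] = {0.f, 2.0f, 4.0f, 0.5f};
    const float mul = multipliers[omod & 3];
    return mul != 0.f ? value * mul : value; // 0.f sentinel means "leave as-is"
}
```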
diff --git a/src/shader_recompiler/frontend/instruction.h b/src/shader_recompiler/frontend/instruction.h
index 22c2146c..d1d10efb 100644
--- a/src/shader_recompiler/frontend/instruction.h
+++ b/src/shader_recompiler/frontend/instruction.h
@@ -33,7 +33,7 @@ struct InputModifiers {
 /// These are applied before storing an operand register.
 struct OutputModifiers {
     bool clamp = false;
-    float multiplier = std::numeric_limits<float>::quiet_NaN();
+    float multiplier = 0.f;
 };
 
 struct InstOperand {
diff --git a/src/shader_recompiler/frontend/structured_control_flow.cpp b/src/shader_recompiler/frontend/structured_control_flow.cpp
index 2064c6a5..79be146a 100644
--- a/src/shader_recompiler/frontend/structured_control_flow.cpp
+++ b/src/shader_recompiler/frontend/structured_control_flow.cpp
@@ -409,9 +409,9 @@ private:
             case EndClass::Exit:
                 root.insert(ip, *pool.Create(Return{}, &root_stmt));
                 break;
-            // case EndClass::Kill:
-            //     root.insert(ip, *pool.Create(Kill{}, &root_stmt));
-            //     break;
+            case EndClass::Kill:
+                root.insert(ip, *pool.Create(Kill{}, &root_stmt));
+                break;
             }
         }
     }
@@ -606,8 +606,7 @@ public:
         Visit(root_stmt, nullptr, nullptr);
 
         IR::Block& first_block{*syntax_list.front().data.block};
-        IR::IREmitter ir(first_block, first_block.begin());
-        ir.Prologue();
+        Translator{&first_block, info}.EmitPrologue();
     }
 
 private:
@@ -767,7 +766,7 @@ private:
         case StatementType::Kill: {
             ensure_block();
             IR::Block* demote_block{MergeBlock(parent, stmt)};
-            // IR::IREmitter{*current_block}.DemoteToHelperInvocation();
+            IR::IREmitter{*current_block}.Discard();
             current_block->AddBranch(demote_block);
             current_block = demote_block;
diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
index c920f936..62d3a378 100644
--- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
@@ -30,9 +30,16 @@ void Translator::S_CMP(ConditionOp cond, bool is_signed, const GcnInst& inst) {
             return ir.ILessThan(lhs, rhs, is_signed);
         case ConditionOp::LE:
             return ir.ILessThanEqual(lhs, rhs, is_signed);
+        default:
+            UNREACHABLE();
         }
     }();
-    // ir.SetScc(result);
+    ir.SetScc(result);
+}
+
+void Translator::S_ANDN2_B64(const GcnInst& inst) {
+    // TODO: Actually implement this.
+    ir.SetScc(ir.GetVcc());
 }
 
 } // namespace Shader::Gcn
diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp
index a3a2a676..e76950b7 100644
--- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp
@@ -34,13 +34,11 @@ void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
 
 void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
     const auto& smrd = inst.control.smrd;
     const IR::ScalarReg sbase{inst.src[0].code * 2};
-    const IR::U32 offset =
-        smrd.imm ? ir.Imm32(smrd.offset * 4)
-                 : IR::U32{ir.ShiftLeftLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)),
-                                               ir.Imm32(2))};
+    const IR::U32 dword_offset =
+        smrd.imm ? ir.Imm32(smrd.offset) : ir.GetScalarReg(IR::ScalarReg(smrd.offset));
     const IR::Value vsharp = ir.GetScalarReg(sbase);
     const IR::ScalarReg dst_reg{inst.dst[0].code};
-    Load(ir, num_dwords, vsharp, dst_reg, offset);
+    Load(ir, num_dwords, vsharp, dst_reg, dword_offset);
 }
 
 } // namespace Shader::Gcn
diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index cd42d1e0..1e9925fc 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -9,7 +9,18 @@
 
 namespace Shader::Gcn {
 
-Translator::Translator(IR::Block* block_, Info& info_) : block{block_}, ir{*block}, info{info_} {
+Translator::Translator(IR::Block* block_, Info& info_)
+    : ir{*block_, block_->begin()}, info{info_} {}
+
+void Translator::EmitPrologue() {
+    ir.Prologue();
+
+    // Initialize user data.
+    IR::ScalarReg dst_sreg = IR::ScalarReg::S0;
+    for (u32 i = 0; i < info.num_user_data; i++) {
+        ir.SetScalarReg(dst_sreg++, ir.GetUserData(dst_sreg));
+    }
+
     IR::VectorReg dst_vreg = IR::VectorReg::V0;
     switch (info.stage) {
     case Stage::Vertex:
@@ -29,69 +40,108 @@ Translator::Translator(IR::Block* block_, Info& info_) : block{block_}, ir{*bloc
         }
         ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::IsFrontFace));
         break;
+    case Stage::Compute:
+        ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 0));
+        ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 1));
+        ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 2));
+
+        ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 0));
+        ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 1));
+        ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 2));
+        break;
     default:
         throw NotImplementedException("Unknown shader stage");
     }
-
-    // Initialize user data.
-    IR::ScalarReg dst_sreg = IR::ScalarReg::S0;
-    for (u32 i = 0; i < 16; i++) {
-        ir.SetScalarReg(dst_sreg++, ir.GetUserData(dst_sreg));
-    }
 }
 
 IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
+    IR::U32F32 value{};
     switch (operand.field) {
     case OperandField::ScalarGPR:
         if (operand.type == ScalarType::Float32 || force_flt) {
-            return ir.GetScalarReg(IR::ScalarReg(operand.code));
+            value = ir.GetScalarReg(IR::ScalarReg(operand.code));
         } else {
-            return ir.GetScalarReg(IR::ScalarReg(operand.code));
+            value = ir.GetScalarReg(IR::ScalarReg(operand.code));
        }
+        break;
     case OperandField::VectorGPR:
         if (operand.type == ScalarType::Float32 || force_flt) {
-            return ir.GetVectorReg(IR::VectorReg(operand.code));
+            value = ir.GetVectorReg(IR::VectorReg(operand.code));
         } else {
-            return ir.GetVectorReg(IR::VectorReg(operand.code));
+            value = ir.GetVectorReg(IR::VectorReg(operand.code));
         }
+        break;
     case OperandField::ConstZero:
         if (force_flt) {
-            return ir.Imm32(0.f);
+            value = ir.Imm32(0.f);
         } else {
-            return ir.Imm32(0U);
+            value = ir.Imm32(0U);
         }
+        break;
     case OperandField::SignedConstIntPos:
         ASSERT(!force_flt);
-        return ir.Imm32(operand.code - SignedConstIntPosMin + 1);
+        value = ir.Imm32(operand.code - SignedConstIntPosMin + 1);
+        break;
     case OperandField::SignedConstIntNeg:
         ASSERT(!force_flt);
-        return ir.Imm32(-s32(operand.code) + SignedConstIntNegMin - 1);
+        value = ir.Imm32(-s32(operand.code) + SignedConstIntNegMin - 1);
+        break;
     case OperandField::LiteralConst:
-        ASSERT(!force_flt);
-        return ir.Imm32(operand.code);
+        if (force_flt) {
+            value = ir.Imm32(std::bit_cast<float>(operand.code));
+        } else {
+            value = ir.Imm32(operand.code);
+        }
+        break;
     case OperandField::ConstFloatPos_1_0:
-        return ir.Imm32(1.f);
+        value = ir.Imm32(1.f);
+        break;
     case OperandField::ConstFloatPos_0_5:
-        return ir.Imm32(0.5f);
+        value = ir.Imm32(0.5f);
+        break;
     case OperandField::ConstFloatPos_2_0:
-        return ir.Imm32(2.0f);
+        value = ir.Imm32(2.0f);
+        break;
     case OperandField::ConstFloatPos_4_0:
-        return ir.Imm32(4.0f);
+        value = ir.Imm32(4.0f);
+        break;
     case OperandField::ConstFloatNeg_0_5:
-        return ir.Imm32(-0.5f);
+        value = ir.Imm32(-0.5f);
+        break;
     case OperandField::ConstFloatNeg_1_0:
-        return ir.Imm32(-1.0f);
+        value = ir.Imm32(-1.0f);
+        break;
+    case OperandField::VccLo:
+        value = ir.GetVccLo();
+        break;
     default:
         UNREACHABLE();
     }
+
+    if (operand.input_modifier.abs) {
+        value = ir.FPAbs(value);
+    }
+    if (operand.input_modifier.neg) {
+        value = ir.FPNeg(value);
+    }
+    return value;
 }
 
 void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) {
+    IR::U32F32 result = value;
+    if (operand.output_modifier.multiplier != 0.f) {
+        result = ir.FPMul(result, ir.Imm32(operand.output_modifier.multiplier));
+    }
+    if (operand.output_modifier.clamp) {
+        result = ir.FPSaturate(value);
+    }
     switch (operand.field) {
     case OperandField::ScalarGPR:
-        return ir.SetScalarReg(IR::ScalarReg(operand.code), value);
+        return ir.SetScalarReg(IR::ScalarReg(operand.code), result);
     case OperandField::VectorGPR:
-        return ir.SetVectorReg(IR::VectorReg(operand.code), value);
+        return ir.SetVectorReg(IR::VectorReg(operand.code), result);
+    case OperandField::VccLo:
+        return ir.SetVccLo(result);
     case OperandField::VccHi:
     case OperandField::M0:
         break; // Ignore for now
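The `LiteralConst` branch is the important fix in `GetSrc`: GCN encodes float literals as raw dwords, so when the operand is consumed as a float the 32 bits have to be reinterpreted, not numerically converted. Concretely:

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
    const std::uint32_t code = 0x3f800000; // literal dword from the instruction stream
    assert(std::bit_cast<float>(code) == 1.0f);        // reinterpret: the intended operand
    assert(static_cast<float>(code) == 1065353216.0f); // convert: a nonsense operand
}
```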
@@ -168,6 +218,9 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::V_CVT_F32_U32:
             translator.V_CVT_F32_U32(inst);
             break;
+        case Opcode::V_RCP_F32:
+            translator.V_RCP_F32(inst);
+            break;
         case Opcode::S_SWAPPC_B64:
             ASSERT(info.stage == Stage::Vertex);
             translator.EmitFetch(inst);
@@ -198,18 +251,81 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::V_CVT_PKRTZ_F16_F32:
             translator.V_CVT_PKRTZ_F16_F32(inst);
             break;
+        case Opcode::V_FRACT_F32:
+            translator.V_FRACT_F32(inst);
+            break;
+        case Opcode::V_ADD_F32:
+            translator.V_ADD_F32(inst);
+            break;
+        case Opcode::V_CVT_OFF_F32_I4:
+            translator.V_CVT_OFF_F32_I4(inst);
+            break;
+        case Opcode::V_MED3_F32:
+            translator.V_MED3_F32(inst);
+            break;
+        case Opcode::V_FLOOR_F32:
+            translator.V_FLOOR_F32(inst);
+            break;
+        case Opcode::V_SUB_F32:
+            translator.V_SUB_F32(inst);
+            break;
+        case Opcode::V_FMA_F32:
+        case Opcode::V_MADAK_F32: // Yes these can share the opcode
+            translator.V_FMA_F32(inst);
+            break;
         case Opcode::IMAGE_SAMPLE:
             translator.IMAGE_SAMPLE(inst);
             break;
         case Opcode::V_CMP_EQ_U32:
             translator.V_CMP_EQ_U32(inst);
             break;
+        case Opcode::V_CMPX_GT_U32:
+            translator.V_CMPX_GT_U32(inst);
+            break;
+        case Opcode::V_CMP_F_F32:
+            translator.V_CMP_F32(ConditionOp::F, inst);
+            break;
+        case Opcode::V_CMP_LT_F32:
+            translator.V_CMP_F32(ConditionOp::LT, inst);
+            break;
+        case Opcode::V_CMP_EQ_F32:
+            translator.V_CMP_F32(ConditionOp::EQ, inst);
+            break;
+        case Opcode::V_CMP_LE_F32:
+            translator.V_CMP_F32(ConditionOp::LE, inst);
+            break;
+        case Opcode::V_CMP_GT_F32:
+            translator.V_CMP_F32(ConditionOp::GT, inst);
+            break;
+        case Opcode::V_CMP_LG_F32:
+            translator.V_CMP_F32(ConditionOp::LG, inst);
+            break;
+        case Opcode::V_CMP_GE_F32:
+            translator.V_CMP_F32(ConditionOp::GE, inst);
+            break;
+        case Opcode::S_CMP_LG_U32:
+            translator.S_CMP(ConditionOp::LG, false, inst);
+            break;
         case Opcode::V_CNDMASK_B32:
             translator.V_CNDMASK_B32(inst);
             break;
         case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
-            translator.TBUFFER_LOAD_FORMAT_XYZW(inst);
+            translator.BUFFER_LOAD_FORMAT(4, true, inst);
             break;
+        case Opcode::BUFFER_LOAD_FORMAT_X:
+            translator.BUFFER_LOAD_FORMAT(1, false, inst);
+            break;
+        case Opcode::BUFFER_STORE_FORMAT_X:
+            translator.BUFFER_STORE_FORMAT(1, false, inst);
+            break;
+        case Opcode::V_MAX_F32:
+            translator.V_MAX_F32(inst);
+            break;
+        case Opcode::S_ANDN2_B64:
+            translator.S_ANDN2_B64(inst);
+            break;
+        case Opcode::S_CBRANCH_EXECZ:
+        case Opcode::S_CBRANCH_SCC0:
         case Opcode::S_MOV_B64:
         case Opcode::S_WQM_B64:
         case Opcode::V_INTERP_P1_F32:
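On the shared `V_FMA_F32`/`V_MADAK_F32` handler above: MADAK is `dst = src0 * src1 + K` with `K` supplied as a literal constant, so once `GetSrc` can materialize literal float operands, both instructions present the same three-operand multiply-add to the translator (this implementation folds the MAD-vs-FMA rounding distinction into one fused op). In effect:

```cpp
#include <cassert>
#include <cmath>

// Both V_FMA_F32 and V_MADAK_F32 reduce to dst = a * b + c;
// for MADAK, c is the literal K decoded from the instruction stream.
float Madak(float a, float b, float k) {
    return std::fma(a, b, k);
}

int main() {
    assert(Madak(2.0f, 3.0f, 1.0f) == 7.0f);
}
```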
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index a837f3a1..8a027e9f 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -16,6 +16,7 @@ struct Info;
 namespace Shader::Gcn {
 
 enum class ConditionOp : u32 {
+    F,
     EQ,
     LG,
     GT,
@@ -28,12 +29,14 @@ class Translator {
 public:
     explicit Translator(IR::Block* block_, Info& info);
 
+    void EmitPrologue();
     void EmitFetch(const GcnInst& inst);
 
     // Scalar ALU
     void S_MOV(const GcnInst& inst);
     void S_MUL_I32(const GcnInst& inst);
     void S_CMP(ConditionOp cond, bool is_signed, const GcnInst& inst);
+    void S_ANDN2_B64(const GcnInst& inst);
 
     // Scalar Memory
     void S_LOAD_DWORD(int num_dwords, const GcnInst& inst);
@@ -53,9 +56,21 @@ public:
     void V_CVT_F32_I32(const GcnInst& inst);
     void V_CVT_F32_U32(const GcnInst& inst);
     void V_MAD_F32(const GcnInst& inst);
+    void V_FRACT_F32(const GcnInst& inst);
+    void V_ADD_F32(const GcnInst& inst);
+    void V_CVT_OFF_F32_I4(const GcnInst& inst);
+    void V_MED3_F32(const GcnInst& inst);
+    void V_FLOOR_F32(const GcnInst& inst);
+    void V_SUB_F32(const GcnInst& inst);
+    void V_RCP_F32(const GcnInst& inst);
+    void V_CMPX_GT_U32(const GcnInst& inst);
+    void V_FMA_F32(const GcnInst& inst);
+    void V_CMP_F32(ConditionOp op, const GcnInst& inst);
+    void V_MAX_F32(const GcnInst& inst);
 
     // Vector Memory
-    void TBUFFER_LOAD_FORMAT_XYZW(const GcnInst& inst);
+    void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);
+    void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);
 
     // Vector interpolation
     void V_INTERP_P2_F32(const GcnInst& inst);
@@ -76,7 +91,6 @@ private:
     void SetDst(const InstOperand& operand, const IR::U32F32& value);
 
 private:
-    IR::Block* block;
     IR::IREmitter ir;
     Info& info;
 };
diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp
index 9e36cdc3..7bb97f01 100644
--- a/src/shader_recompiler/frontend/translate/vector_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp
@@ -102,4 +102,95 @@ void Translator::V_MAD_F32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.FPFma(src0, src1, src2));
 }
 
+void Translator::V_FRACT_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0])};
+    const IR::VectorReg dst_reg{inst.dst[0].code};
+    ir.SetVectorReg(dst_reg, ir.Fract(src0));
+}
+
+void Translator::V_ADD_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0])};
+    const IR::F32 src1{GetSrc(inst.src[1])};
+    SetDst(inst.dst[0], ir.FPAdd(src0, src1));
+}
+
+void Translator::V_CVT_OFF_F32_I4(const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])};
+    const IR::VectorReg dst_reg{inst.dst[0].code};
+    ir.SetVectorReg(
+        dst_reg,
+        ir.FPMul(ir.ConvertUToF(32, 32, ir.ISub(ir.BitwiseAnd(src0, ir.Imm32(0xF)), ir.Imm32(8))),
+                 ir.Imm32(1.f / 16.f)));
+}
+
+void Translator::V_MED3_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0], true)};
+    const IR::F32 src1{GetSrc(inst.src[1])};
+    const IR::F32 src2{GetSrc(inst.src[2])};
+    const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2);
+    SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx));
+}
+
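`V_MED3_F32` above uses the standard median-of-three composition: `med3(a, b, c) = max(min(a, b), min(max(a, b), c))`. A quick sanity check of the identity:

```cpp
#include <algorithm>
#include <cassert>

float Med3(float a, float b, float c) {
    return std::max(std::min(a, b), std::min(std::max(a, b), c));
}

int main() {
    assert(Med3(1.f, 2.f, 3.f) == 2.f);
    assert(Med3(3.f, 1.f, 2.f) == 2.f); // order independent
    assert(Med3(2.f, 3.f, 1.f) == 2.f);
}
```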
+void Translator::V_FLOOR_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0])};
+    const IR::VectorReg dst_reg{inst.dst[0].code};
+    ir.SetVectorReg(dst_reg, ir.FPFloor(src0));
+}
+
+void Translator::V_SUB_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0])};
+    const IR::F32 src1{GetSrc(inst.src[1])};
+    SetDst(inst.dst[0], ir.FPSub(src0, src1));
+}
+
+void Translator::V_RCP_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0])};
+    SetDst(inst.dst[0], ir.FPRecip(src0));
+}
+
+void Translator::V_CMPX_GT_U32(const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])};
+    const IR::U32 src1{GetSrc(inst.src[1])};
+    const IR::U1 result = ir.IGreaterThan(src0, src1, false);
+    ir.SetVcc(result);
+    ir.SetExec(result);
+}
+
+void Translator::V_FMA_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0], true)};
+    const IR::F32 src1{GetSrc(inst.src[1], true)};
+    const IR::F32 src2{GetSrc(inst.src[2], true)};
+    SetDst(inst.dst[0], ir.FPFma(src0, src1, src2));
+}
+
+void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0], true)};
+    const IR::F32 src1{GetSrc(inst.src[1], true)};
+    const IR::U1 result = [&] {
+        switch (op) {
+        case ConditionOp::F:
+            return ir.Imm1(false);
+        case ConditionOp::EQ:
+            return ir.FPEqual(src0, src1);
+        case ConditionOp::LG:
+            return ir.FPNotEqual(src0, src1);
+        case ConditionOp::GT:
+            return ir.FPGreaterThan(src0, src1);
+        case ConditionOp::LT:
+            return ir.FPLessThan(src0, src1);
+        case ConditionOp::LE:
+            return ir.FPLessThanEqual(src0, src1);
+        case ConditionOp::GE:
+            return ir.FPGreaterThanEqual(src0, src1);
+        }
+    }();
+    ir.SetVcc(result);
+}
+
+void Translator::V_MAX_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0], true)};
+    const IR::F32 src1{GetSrc(inst.src[1], true)};
+    SetDst(inst.dst[0], ir.FPMax(src0, src1));
+}
+
 } // namespace Shader::Gcn
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index a5fb50b9..9694b06c 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -107,7 +107,7 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) {
     }
 }
 
-void Translator::TBUFFER_LOAD_FORMAT_XYZW(const GcnInst& inst) {
+void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst) {
     const auto& mtbuf = inst.control.mtbuf;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
@@ -127,15 +127,68 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst
     info.index_enable.Assign(mtbuf.idxen);
     info.offset_enable.Assign(mtbuf.offen);
     info.inst_offset.Assign(mtbuf.offset);
-    info.dmft.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
-    info.nfmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
-    info.is_typed.Assign(1);
+    info.is_typed.Assign(is_typed);
+    if (is_typed) {
+        info.dmft.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
+        info.nfmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
+    }
 
-    const IR::Value value = ir.LoadBuffer(4, ir.GetScalarReg(sharp), address, info);
+    const IR::Value value = ir.LoadBuffer(num_dwords, ir.GetScalarReg(sharp), address, info);
     const IR::VectorReg dst_reg{inst.src[1].code};
-    for (u32 i = 0; i < 4; i++) {
+    if (num_dwords == 1) {
+        ir.SetVectorReg(dst_reg, IR::F32{value});
+        return;
+    }
+    for (u32 i = 0; i < num_dwords; i++) {
         ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
     }
 }
 
+void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst) {
+    const auto& mtbuf = inst.control.mtbuf;
+    const IR::VectorReg vaddr{inst.src[0].code};
+    const IR::ScalarReg sharp{inst.src[2].code * 4};
+    const IR::Value address = [&] -> IR::Value {
+        if (mtbuf.idxen && mtbuf.offen) {
+            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
+        }
+        if (mtbuf.idxen || mtbuf.offen) {
+            return ir.GetVectorReg(vaddr);
+        }
+        return {};
+    }();
+    const IR::Value soffset{GetSrc(inst.src[3])};
+    ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
+
+    IR::BufferInstInfo info{};
+    info.index_enable.Assign(mtbuf.idxen);
+    info.offset_enable.Assign(mtbuf.offen);
+    info.inst_offset.Assign(mtbuf.offset);
+    info.is_typed.Assign(is_typed);
+    if (is_typed) {
+        info.dmft.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
+        info.nfmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
+    }
+
+    IR::Value value{};
+    const IR::VectorReg src_reg{inst.src[1].code};
+    switch (num_dwords) {
+    case 1:
+        value = ir.GetVectorReg(src_reg);
+        break;
+    case 2:
+        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1));
+        break;
+    case 3:
+        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
+                                      ir.GetVectorReg(src_reg + 2));
+        break;
+    case 4:
+        value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
+                                      ir.GetVectorReg(src_reg + 2), ir.GetVectorReg(src_reg + 3));
+        break;
+    }
+    ir.StoreBuffer(num_dwords, ir.GetScalarReg(sharp), address, value, info);
+}
+
 } // namespace Shader::Gcn
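`BUFFER_STORE_FORMAT` gathers `num_dwords` consecutive VGPRs starting at `src_reg` into a single composite before emitting one `StoreBuffer` op, mirroring how the loads scatter components back out. A simplified picture of the register-to-value packing:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: an N-dword store reads v[base] .. v[base + N - 1] and packs them
// into one N-component value (the IR CompositeConstruct).
std::vector<std::uint32_t> PackStoreValue(const std::vector<std::uint32_t>& vgprs,
                                          std::size_t base, std::size_t num_dwords) {
    return {vgprs.begin() + base, vgprs.begin() + base + num_dwords};
}

int main() {
    const std::vector<std::uint32_t> vgprs{10, 11, 12, 13, 14};
    const auto value = PackStoreValue(vgprs, 1, 3); // v1..v3
    assert((value == std::vector<std::uint32_t>{11, 12, 13}));
}
```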
diff --git a/src/shader_recompiler/ir/attribute.cpp b/src/shader_recompiler/ir/attribute.cpp
index 8e0db125..540cb8af 100644
--- a/src/shader_recompiler/ir/attribute.cpp
+++ b/src/shader_recompiler/ir/attribute.cpp
@@ -10,6 +10,10 @@ bool IsParam(Attribute attribute) noexcept {
     return attribute >= Attribute::Param0 && attribute <= Attribute::Param31;
 }
 
+bool IsMrt(Attribute attribute) noexcept {
+    return attribute >= Attribute::RenderTarget0 && attribute <= Attribute::RenderTarget7;
+}
+
 std::string NameOf(Attribute attribute) {
     switch (attribute) {
     case Attribute::RenderTarget0:
@@ -112,6 +116,12 @@ std::string NameOf(Attribute attribute) {
         return "FragCoord";
     case Attribute::IsFrontFace:
         return "IsFrontFace";
+    case Attribute::WorkgroupId:
+        return "WorkgroupId";
+    case Attribute::LocalInvocationId:
+        return "LocalInvocationId";
+    case Attribute::LocalInvocationIndex:
+        return "LocalInvocationIndex";
     default:
         break;
     }
diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h
index 687d3ad4..b148578f 100644
--- a/src/shader_recompiler/ir/attribute.h
+++ b/src/shader_recompiler/ir/attribute.h
@@ -81,6 +81,8 @@ constexpr size_t NumParams = 32;
 
 [[nodiscard]] bool IsParam(Attribute attribute) noexcept;
 
+[[nodiscard]] bool IsMrt(Attribute attribute) noexcept;
+
 [[nodiscard]] std::string NameOf(Attribute attribute);
 
 [[nodiscard]] constexpr Attribute operator+(Attribute attr, int num) {
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 69ad9a3c..43e8e439 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -111,6 +111,10 @@ void IREmitter::Epilogue() {
     Inst(Opcode::Epilogue);
 }
 
+void IREmitter::Discard() {
+    Inst(Opcode::Discard);
+}
+
 U32 IREmitter::GetUserData(IR::ScalarReg reg) {
     return Inst<U32>(Opcode::GetUserData, reg);
 }
@@ -156,11 +160,17 @@ U1 IREmitter::Condition(IR::Condition cond) {
     case IR::Condition::True:
         return Imm1(true);
     case IR::Condition::Scc0:
+        return LogicalNot(GetScc());
     case IR::Condition::Scc1:
+        return GetScc();
     case IR::Condition::Vccz:
+        return LogicalNot(GetVcc());
     case IR::Condition::Vccnz:
+        return GetVcc();
     case IR::Condition::Execz:
+        return LogicalNot(GetExec());
     case IR::Condition::Execnz:
+        return GetExec();
     default:
         throw NotImplementedException("");
     }
@@ -170,14 +180,38 @@ void IREmitter::SetGotoVariable(u32 id, const U1& value) {
     Inst(Opcode::SetGotoVariable, id, value);
 }
 
+U1 IREmitter::GetScc() {
+    return Inst<U1>(Opcode::GetScc);
+}
+
+U1 IREmitter::GetExec() {
+    return Inst<U1>(Opcode::GetExec);
+}
+
 U1 IREmitter::GetVcc() {
     return Inst<U1>(Opcode::GetVcc);
 }
 
+U32 IREmitter::GetVccLo() {
+    return Inst<U32>(Opcode::GetVccLo);
+}
+
+void IREmitter::SetScc(const U1& value) {
+    Inst(Opcode::SetScc, value);
+}
+
+void IREmitter::SetExec(const U1& value) {
+    Inst(Opcode::SetExec, value);
+}
+
 void IREmitter::SetVcc(const U1& value) {
     Inst(Opcode::SetVcc, value);
 }
 
+void IREmitter::SetVccLo(const U32& value) {
+    Inst(Opcode::SetVccLo, value);
+}
+
 F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp) {
     return Inst<F32>(Opcode::GetAttribute, attribute, Imm32(comp));
 }
@@ -247,6 +281,27 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad
     }
 }
 
+void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address,
+                            const Value& data, BufferInstInfo info) {
+    switch (num_dwords) {
+    case 1:
+        Inst(data.Type() == Type::F32 ? Opcode::StoreBufferF32 : Opcode::StoreBufferU32,
+             Flags{info}, handle, address, data);
+        break;
+    case 2:
+        Inst(Opcode::StoreBufferF32x2, Flags{info}, handle, address, data);
+        break;
+    case 3:
+        Inst(Opcode::StoreBufferF32x3, Flags{info}, handle, address, data);
+        break;
+    case 4:
+        Inst(Opcode::StoreBufferF32x4, Flags{info}, handle, address, data);
+        break;
+    default:
+        throw InvalidArgument("Invalid number of dwords {}", num_dwords);
+    }
+}
+
 F32F64 IREmitter::FPAdd(const F32F64& a, const F32F64& b) {
     if (a.Type() != b.Type()) {
         throw InvalidArgument("Mismatching types {} and {}", a.Type(), b.Type());
@@ -261,6 +316,18 @@ F32F64 IREmitter::FPAdd(const F32F64& a, const F32F64& b) {
     }
 }
 
+F32F64 IREmitter::FPSub(const F32F64& a, const F32F64& b) {
+    if (a.Type() != b.Type()) {
+        throw InvalidArgument("Mismatching types {} and {}", a.Type(), b.Type());
+    }
+    switch (a.Type()) {
+    case Type::F32:
+        return Inst<F32>(Opcode::FPSub32, a, b);
+    default:
+        ThrowInvalidType(a.Type());
+    }
+}
+
 Value IREmitter::CompositeConstruct(const Value& e1, const Value& e2) {
     if (e1.Type() != e2.Type()) {
         throw InvalidArgument("Mismatching types {} and {}", e1.Type(), e2.Type());
@@ -612,6 +679,10 @@ F32F64 IREmitter::FPTrunc(const F32F64& value) {
     }
 }
 
+F32 IREmitter::Fract(const F32& value) {
+    return Inst<F32>(Opcode::FPFract, value);
+}
+
 U1 IREmitter::FPEqual(const F32F64& lhs, const F32F64& rhs, bool ordered) {
     if (lhs.Type() != rhs.Type()) {
         throw InvalidArgument("Mismatching types {} and {}", lhs.Type(), rhs.Type());
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index a6023646..a52437a9 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -41,6 +41,7 @@ public:
 
     void Prologue();
     void Epilogue();
+    void Discard();
 
     U32 GetUserData(IR::ScalarReg reg);
 
@@ -54,9 +55,14 @@ public:
     [[nodiscard]] U1 GetGotoVariable(u32 id);
     void SetGotoVariable(u32 id, const U1& value);
 
+    [[nodiscard]] U1 GetScc();
+    [[nodiscard]] U1 GetExec();
     [[nodiscard]] U1 GetVcc();
-
+    [[nodiscard]] U32 GetVccLo();
+    void SetScc(const U1& value);
+    void SetExec(const U1& value);
     void SetVcc(const U1& value);
+    void SetVccLo(const U32& value);
 
     [[nodiscard]] U1 Condition(IR::Condition cond);
 
@@ -72,6 +78,8 @@ public:
 
     [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address,
                                    BufferInstInfo info);
+    void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data,
+                     BufferInstInfo info);
 
     [[nodiscard]] U1 GetZeroFromOp(const Value& op);
     [[nodiscard]] U1 GetSignFromOp(const Value& op);
@@ -100,6 +108,7 @@ public:
     [[nodiscard]] Value UnpackHalf2x16(const U32& value);
 
     [[nodiscard]] F32F64 FPAdd(const F32F64& a, const F32F64& b);
+    [[nodiscard]] F32F64 FPSub(const F32F64& a, const F32F64& b);
     [[nodiscard]] F32F64 FPMul(const F32F64& a, const F32F64& b);
     [[nodiscard]] F32F64 FPFma(const F32F64& a, const F32F64& b, const F32F64& c);
 
@@ -121,6 +130,7 @@ public:
     [[nodiscard]] F32F64 FPFloor(const F32F64& value);
     [[nodiscard]] F32F64 FPCeil(const F32F64& value);
     [[nodiscard]] F32F64 FPTrunc(const F32F64& value);
+    [[nodiscard]] F32 Fract(const F32& value);
 
     [[nodiscard]] U1 FPEqual(const F32F64& lhs, const F32F64& rhs, bool ordered = true);
     [[nodiscard]] U1 FPNotEqual(const F32F64& lhs, const F32F64& rhs, bool ordered = true);
diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp
index 5e616b53..fdbda06f 100644
--- a/src/shader_recompiler/ir/microinstruction.cpp
+++ b/src/shader_recompiler/ir/microinstruction.cpp
@@ -45,15 +45,13 @@ bool Inst::MayHaveSideEffects() const noexcept {
     case Opcode::PhiMove:
     case Opcode::Prologue:
    case Opcode::Epilogue:
-    // case Opcode::Join:
-    // case Opcode::Barrier:
-    // case Opcode::WorkgroupMemoryBarrier:
-    // case Opcode::DeviceMemoryBarrier:
-    // case Opcode::EmitVertex:
-    // case Opcode::EndPrimitive:
+    case Opcode::Discard:
     case Opcode::SetAttribute:
-    // case Opcode::SetFragColor:
-    // case Opcode::SetFragDepth:
+    case Opcode::StoreBufferF32:
+    case Opcode::StoreBufferF32x2:
+    case Opcode::StoreBufferF32x3:
+    case Opcode::StoreBufferF32x4:
+    case Opcode::StoreBufferU32:
         return true;
     default:
         return false;
diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc
index 4a6e0447..5fb4dd0f 100644
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@@ -12,10 +12,12 @@ OPCODE(PhiMove, Void, Opaq
 // Special operations
 OPCODE(Prologue, Void, )
 OPCODE(Epilogue, Void, )
+OPCODE(Discard, Void, )
 
 // Constant memory operations
 OPCODE(ReadConst, U32, U64, U32, )
 OPCODE(ReadConstBuffer, F32, Opaque, U32, )
+OPCODE(ReadConstBufferU32, U32, Opaque, U32, )
 
 // Context getters/setters
 OPCODE(GetUserData, U32, ScalarReg, )
@@ -30,10 +32,14 @@ OPCODE(GetAttributeU32, U32, Attr
 OPCODE(SetAttribute, Void, Attribute, F32, U32, )
 
 // Flags
-//OPCODE(GetScc, U1, Void, )
-OPCODE(GetVcc, U1, Void, )
-//OPCODE(SetScc, Void, U1, )
-OPCODE(SetVcc, Void, U1, )
+OPCODE(GetScc, U1, Void, )
+OPCODE(GetExec, U1, Void, )
+OPCODE(GetVcc, U1, Void, )
+OPCODE(GetVccLo, U32, Void, )
+OPCODE(SetScc, Void, U1, )
+OPCODE(SetExec, Void, U1, )
+OPCODE(SetVcc, Void, U1, )
+OPCODE(SetVccLo, Void, U32, )
 
 // Undefined
 OPCODE(UndefU1, U1, )
@@ -47,6 +53,12 @@ OPCODE(LoadBufferF32, F32, Opaq
 OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, )
 OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, )
 OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, )
+OPCODE(LoadBufferU32, U32, Opaque, Opaque, )
+OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, )
+OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, )
+OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, )
+OPCODE(StoreBufferF32x4, Void, Opaque, Opaque, F32x4, )
+OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, )
 
 // Vector utility
 OPCODE(CompositeConstructU32x2, U32x2, U32, U32, )
@@ -114,6 +126,7 @@ OPCODE(FPAbs32, F32, F32,
 OPCODE(FPAbs64, F64, F64, )
 OPCODE(FPAdd32, F32, F32, F32, )
 OPCODE(FPAdd64, F64, F64, F64, )
+OPCODE(FPSub32, F32, F32, F32, )
 OPCODE(FPFma32, F32, F32, F32, F32, )
 OPCODE(FPFma64, F64, F64, F64, F64, )
 OPCODE(FPMax32, F32, F32, F32, )
@@ -145,6 +158,7 @@ OPCODE(FPCeil32, F32, F32,
 OPCODE(FPCeil64, F64, F64, )
 OPCODE(FPTrunc32, F32, F32, )
 OPCODE(FPTrunc64, F64, F64, )
+OPCODE(FPFract, F32, F32, )
 
 OPCODE(FPOrdEqual32, U1, F32, F32, )
 OPCODE(FPOrdEqual64, U1, F64, F64, )
diff --git a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
index 82f074dd..b715bcd9 100644
--- a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
+++ b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
@@ -88,15 +88,6 @@ void FoldBitCast(IR::Inst& inst, IR::Opcode reverse) {
         inst.ReplaceUsesWith(arg_inst->Arg(0));
         return;
     }
-    // if constexpr (op == IR::Opcode::BitCastF32U32) {
-    //     if (arg_inst->GetOpcode() == IR::Opcode::ReadConstBuffer) {
-    //         // Replace the bitcast with a typed constant buffer read
inst.ReplaceOpcode(IR::Opcode::ReadConstBufferF32); - // inst.SetArg(0, arg_inst->Arg(0)); - // inst.SetArg(1, arg_inst->Arg(1)); - // return; - // } - // } } std::optional FoldCompositeExtractImpl(IR::Value inst_value, IR::Opcode insert, @@ -249,6 +240,12 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::IAdd32: return FoldAdd(block, inst); + case IR::Opcode::ISub32: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a - b; }); + return; + case IR::Opcode::ConvertF32U32: + FoldWhenAllImmediates(inst, [](u32 a) { return static_cast(a); }); + return; case IR::Opcode::IMul32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; }); return; diff --git a/src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp b/src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp new file mode 100644 index 00000000..24c6b548 --- /dev/null +++ b/src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/ir/program.h" + +namespace Shader::Optimization { + +void DeadCodeEliminationPass(IR::BlockList& program) { + // We iterate over the instructions in reverse order. + // This is because removing an instruction reduces the number of uses for earlier instructions. + for (IR::Block* const block : program) { + auto it{block->end()}; + while (it != block->begin()) { + --it; + if (!it->HasUses() && !it->MayHaveSideEffects()) { + it->Invalidate(); + it = block->Instructions().erase(it); + } + } + } +} + +} // namespace Shader::Optimization \ No newline at end of file diff --git a/src/shader_recompiler/ir/passes/identity_removal_pass.cpp b/src/shader_recompiler/ir/passes/identity_removal_pass.cpp new file mode 100644 index 00000000..b7014a96 --- /dev/null +++ b/src/shader_recompiler/ir/passes/identity_removal_pass.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include "shader_recompiler/ir/program.h" + +namespace Shader::Optimization { + +void IdentityRemovalPass(IR::BlockList& program) { + std::vector to_invalidate; + for (IR::Block* const block : program) { + for (auto inst = block->begin(); inst != block->end();) { + const size_t num_args{inst->NumArgs()}; + for (size_t i = 0; i < num_args; ++i) { + IR::Value arg; + while ((arg = inst->Arg(i)).IsIdentity()) { + inst->SetArg(i, arg.Inst()->Arg(0)); + } + } + if (inst->GetOpcode() == IR::Opcode::Identity || + inst->GetOpcode() == IR::Opcode::Void) { + to_invalidate.push_back(&*inst); + inst = block->Instructions().erase(inst); + } else { + ++inst; + } + } + } + for (IR::Inst* const inst : to_invalidate) { + inst->Invalidate(); + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/passes.h b/src/shader_recompiler/ir/passes/ir_passes.h similarity index 100% rename from src/shader_recompiler/ir/passes/passes.h rename to src/shader_recompiler/ir/passes/ir_passes.h diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index e0a66232..95ebdf1c 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -2,8 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include #include #include 
"shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/ir_emitter.h" @@ -27,26 +25,54 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: case IR::Opcode::LoadBufferF32x4: + case IR::Opcode::LoadBufferU32: case IR::Opcode::ReadConstBuffer: + case IR::Opcode::ReadConstBufferU32: + case IR::Opcode::StoreBufferF32: + case IR::Opcode::StoreBufferF32x2: + case IR::Opcode::StoreBufferF32x3: + case IR::Opcode::StoreBufferF32x4: + case IR::Opcode::StoreBufferU32: return true; default: return false; } } -IR::Type BufferLoadType(const IR::Inst& inst) { +IR::Type BufferDataType(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::LoadBufferF32: case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: case IR::Opcode::LoadBufferF32x4: case IR::Opcode::ReadConstBuffer: + case IR::Opcode::StoreBufferF32: + case IR::Opcode::StoreBufferF32x2: + case IR::Opcode::StoreBufferF32x3: + case IR::Opcode::StoreBufferF32x4: return IR::Type::F32; + case IR::Opcode::LoadBufferU32: + case IR::Opcode::ReadConstBufferU32: + case IR::Opcode::StoreBufferU32: + return IR::Type::U32; default: UNREACHABLE(); } } +bool IsBufferStore(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::StoreBufferF32: + case IR::Opcode::StoreBufferF32x2: + case IR::Opcode::StoreBufferF32x3: + case IR::Opcode::StoreBufferF32x4: + case IR::Opcode::StoreBufferU32: + return true; + default: + return false; + } +} + bool IsImageInstruction(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::ImageSampleExplicitLod: @@ -157,10 +183,10 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, const u32 binding = descriptors.Add(BufferResource{ .sgpr_base = sharp.sgpr_base, .dword_offset = sharp.dword_offset, - .stride = u32(buffer.stride), + .stride = buffer.GetStride(), .num_records = u32(buffer.num_records), - .used_types = BufferLoadType(inst), - .is_storage = /*buffer.base_address % 64 != 0*/ true, + .used_types = BufferDataType(inst), + .is_storage = true || IsBufferStore(inst), }); const auto inst_info = inst.Flags(); IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; @@ -171,17 +197,18 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, ASSERT(inst_info.nfmt == AmdGpu::NumberFormat::Float && inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32); } - if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) { + if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer || + inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) { return; } // Calculate buffer address. - const u32 dword_stride = buffer.stride / sizeof(u32); + const u32 dword_stride = buffer.GetStrideElements(sizeof(u32)); const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32); IR::U32 address = ir.Imm32(dword_offset); if (inst_info.index_enable && inst_info.offset_enable) { UNREACHABLE(); } else if (inst_info.index_enable) { - const IR::U32 index{inst.Arg(1)}; + IR::U32 index{inst.Arg(1)}; address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address); } else if (inst_info.offset_enable) { const IR::U32 offset{inst.Arg(1)}; @@ -245,6 +272,36 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip } void ResourceTrackingPass(IR::Program& program) { + // When loading data from untyped buffer we don't have if it is float or integer. + // Most of the time it is float so that is the default. 
This pass detects float buffer loads + combined with bitcasts and patches them to be integer loads. + for (IR::Block* const block : program.post_order_blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (inst.GetOpcode() != IR::Opcode::BitCastU32F32) { + continue; + } + // Replace the bitcast with a typed buffer read + IR::Inst* const arg_inst{inst.Arg(0).TryInstRecursive()}; + if (!arg_inst) { + continue; + } + const auto replace{[&](IR::Opcode new_opcode) { + inst.ReplaceOpcode(new_opcode); + inst.SetArg(0, arg_inst->Arg(0)); + inst.SetArg(1, arg_inst->Arg(1)); + inst.SetFlags(arg_inst->Flags()); + arg_inst->Invalidate(); + }}; + if (arg_inst->GetOpcode() == IR::Opcode::ReadConstBuffer) { + replace(IR::Opcode::ReadConstBufferU32); + } + if (arg_inst->GetOpcode() == IR::Opcode::LoadBufferF32) { + replace(IR::Opcode::LoadBufferU32); + } + } + } + + // Iterate resource instructions and patch them after finding the sharp. auto& info = program.info; Descriptors descriptors{info.buffers, info.images, info.samplers}; for (IR::Block* const block : program.post_order_blocks) { diff --git a/src/shader_recompiler/ir/passes/info_collection.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp similarity index 100% rename from src/shader_recompiler/ir/passes/info_collection.cpp rename to src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp diff --git a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp index d267465f..2958d3d1 100644 --- a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp +++ b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include "shader_recompiler/ir/basic_block.h" -#include "shader_recompiler/ir/ir_emitter.h" #include "shader_recompiler/ir/opcodes.h" #include "shader_recompiler/ir/reg.h" #include "shader_recompiler/ir/value.h" @@ -30,11 +28,10 @@ namespace { struct FlagTag { auto operator<=>(const FlagTag&) const noexcept = default; }; -struct ZeroFlagTag : FlagTag {}; -struct SignFlagTag : FlagTag {}; -struct CarryFlagTag : FlagTag {}; -struct OverflowFlagTag : FlagTag {}; +struct SccFlagTag : FlagTag {}; +struct ExecFlagTag : FlagTag {}; struct VccFlagTag : FlagTag {}; +struct VccLoTag : FlagTag {}; struct GotoVariable : FlagTag { GotoVariable() = default; @@ -45,8 +42,8 @@ struct GotoVariable : FlagTag { u32 index; }; -using Variant = std::variant<IR::ScalarReg, IR::VectorReg, GotoVariable, ZeroFlagTag, SignFlagTag, CarryFlagTag, OverflowFlagTag, VccFlagTag>; +using Variant = std::variant<IR::ScalarReg, IR::VectorReg, GotoVariable, SccFlagTag, ExecFlagTag, VccFlagTag, VccLoTag>; using ValueMap = std::unordered_map<IR::Block*, IR::Value>; struct DefTable { @@ -71,32 +68,25 @@ goto_vars[variable.index].insert_or_assign(block, value); } - const IR::Value& Def(IR::Block* block, ZeroFlagTag) { - return zero_flag[block]; + const IR::Value& Def(IR::Block* block, SccFlagTag) { + return scc_flag[block]; } - void SetDef(IR::Block* block, ZeroFlagTag, const IR::Value& value) { - zero_flag.insert_or_assign(block, value); + void SetDef(IR::Block* block, SccFlagTag, const IR::Value& value) { + scc_flag.insert_or_assign(block, value); } - const IR::Value& Def(IR::Block* block, SignFlagTag) { - return sign_flag[block]; + const IR::Value& Def(IR::Block* block, ExecFlagTag) { + return exec_flag[block]; } - void SetDef(IR::Block* block, SignFlagTag, const IR::Value& value) { - sign_flag.insert_or_assign(block, value); + void SetDef(IR::Block* block, ExecFlagTag, const IR::Value& value) { + exec_flag.insert_or_assign(block, value); } - const IR::Value& Def(IR::Block* block, CarryFlagTag) { - return carry_flag[block]; + const IR::Value& 
Def(IR::Block* block, VccLoTag) { + return vcc_lo_flag[block]; } - void SetDef(IR::Block* block, CarryFlagTag, const IR::Value& value) { - carry_flag.insert_or_assign(block, value); - } - - const IR::Value& Def(IR::Block* block, OverflowFlagTag) { - return overflow_flag[block]; - } - void SetDef(IR::Block* block, OverflowFlagTag, const IR::Value& value) { - overflow_flag.insert_or_assign(block, value); + void SetDef(IR::Block* block, VccLoTag, const IR::Value& value) { + vcc_lo_flag.insert_or_assign(block, value); } const IR::Value& Def(IR::Block* block, VccFlagTag) { @@ -107,12 +97,10 @@ } std::unordered_map<u32, ValueMap> goto_vars; - ValueMap indirect_branch_var; - ValueMap zero_flag; - ValueMap sign_flag; - ValueMap carry_flag; - ValueMap overflow_flag; + ValueMap scc_flag; + ValueMap exec_flag; ValueMap vcc_flag; + ValueMap vcc_lo_flag; }; IR::Opcode UndefOpcode(IR::ScalarReg) noexcept { @@ -306,18 +294,18 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::SetGotoVariable: pass.WriteVariable(GotoVariable{inst.Arg(0).U32()}, block, inst.Arg(1)); break; + case IR::Opcode::SetExec: + pass.WriteVariable(ExecFlagTag{}, block, inst.Arg(0)); + break; + case IR::Opcode::SetScc: + pass.WriteVariable(SccFlagTag{}, block, inst.Arg(0)); + break; case IR::Opcode::SetVcc: pass.WriteVariable(VccFlagTag{}, block, inst.Arg(0)); break; - // case IR::Opcode::SetSFlag: - // pass.WriteVariable(SignFlagTag{}, block, inst.Arg(0)); - // break; - // case IR::Opcode::SetCFlag: - // pass.WriteVariable(CarryFlagTag{}, block, inst.Arg(0)); - // break; - // case IR::Opcode::SetOFlag: - // pass.WriteVariable(OverflowFlagTag{}, block, inst.Arg(0)); - // break; + case IR::Opcode::SetVccLo: + pass.WriteVariable(VccLoTag{}, block, inst.Arg(0)); + break; case IR::Opcode::GetScalarRegister: { const IR::ScalarReg reg{inst.Arg(0).ScalarReg()}; inst.ReplaceUsesWith(pass.ReadVariable(reg, block)); @@ -331,18 +319,18 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::GetGotoVariable: inst.ReplaceUsesWith(pass.ReadVariable(GotoVariable{inst.Arg(0).U32()}, block)); break; + case IR::Opcode::GetExec: + inst.ReplaceUsesWith(pass.ReadVariable(ExecFlagTag{}, block)); + break; + case IR::Opcode::GetScc: + inst.ReplaceUsesWith(pass.ReadVariable(SccFlagTag{}, block)); + break; case IR::Opcode::GetVcc: inst.ReplaceUsesWith(pass.ReadVariable(VccFlagTag{}, block)); break; - // case IR::Opcode::GetSFlag: - // inst.ReplaceUsesWith(pass.ReadVariable(SignFlagTag{}, block)); - // break; - // case IR::Opcode::GetCFlag: - // inst.ReplaceUsesWith(pass.ReadVariable(CarryFlagTag{}, block)); - // break; - // case IR::Opcode::GetOFlag: - // inst.ReplaceUsesWith(pass.ReadVariable(OverflowFlagTag{}, block)); - // break; + case IR::Opcode::GetVccLo: + inst.ReplaceUsesWith(pass.ReadVariable(VccLoTag{}, block)); + break; default: break; } @@ -365,44 +353,4 @@ void SsaRewritePass(IR::BlockList& program) { } } -void IdentityRemovalPass(IR::BlockList& program) { - std::vector<IR::Inst*> to_invalidate; - for (IR::Block* const block : program) { - for (auto inst = block->begin(); inst != block->end();) { - const size_t num_args{inst->NumArgs()}; - for (size_t i = 0; i < num_args; ++i) { - IR::Value arg; - while ((arg = inst->Arg(i)).IsIdentity()) { - inst->SetArg(i, arg.Inst()->Arg(0)); - } - } - if (inst->GetOpcode() == IR::Opcode::Identity || - inst->GetOpcode() == IR::Opcode::Void) { - to_invalidate.push_back(&*inst); - inst = block->Instructions().erase(inst); - } else { - ++inst; - } - } - } - for 
(IR::Inst* const inst : to_invalidate) { - inst->Invalidate(); - } -} - -void DeadCodeEliminationPass(IR::BlockList& program) { - // We iterate over the instructions in reverse order. - // This is because removing an instruction reduces the number of uses for earlier instructions. - for (IR::Block* const block : program) { - auto it{block->end()}; - while (it != block->begin()) { - --it; - if (!it->HasUses() && !it->MayHaveSideEffects()) { - it->Invalidate(); - it = block->Instructions().erase(it); - } - } - } -} - } // namespace Shader::Optimization diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 97fff8fe..0b9c2079 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -4,9 +4,8 @@ #include "shader_recompiler/frontend/control_flow_graph.h" #include "shader_recompiler/frontend/decode.h" #include "shader_recompiler/frontend/structured_control_flow.h" -#include "shader_recompiler/ir/passes/passes.h" +#include "shader_recompiler/ir/passes/ir_passes.h" #include "shader_recompiler/ir/post_order.h" -#include "shader_recompiler/recompiler.h" namespace Shader { @@ -62,9 +61,8 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool workgroup_size{}; + + u32 num_user_data; std::span<const u32> user_data; Stage stage; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 163c1e31..0bcc3486 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -206,7 +206,14 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header); + regs.cs_program.dim_x = dispatch_direct->dim_x; + regs.cs_program.dim_y = dispatch_direct->dim_y; + regs.cs_program.dim_z = dispatch_direct->dim_z; + regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; + if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { + rasterizer->DispatchDirect(); + } break; } case PM4ItOpcode::EventWrite: { diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index ed9899f8..ac507667 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -48,9 +48,28 @@ struct Liverpool { using UserData = std::array; + struct BinaryInfo { + u8 signature[7]; + u8 version; + u32 pssl_or_cg : 1; + u32 cached : 1; + u32 type : 4; + u32 source_type : 2; + u32 length : 24; + u8 chunk_usage_base_offset_in_dw; + u8 num_input_usage_slots; + u8 is_srt : 1; + u8 is_srt_used_info_valid : 1; + u8 is_extended_usage_info : 1; + u8 reserved2 : 5; + u8 reserved3; + u64 shader_hash; + u32 crc32; + }; + struct ShaderProgram { u32 address_lo; - u32 address_hi; + BitField<0, 8, u32> address_hi; union { BitField<0, 6, u64> num_vgprs; BitField<6, 4, u64> num_sgprs; @@ -65,13 +84,53 @@ } std::span<const u32> Code() const { - u32 code_size = 0; const u32* code = Address<u32>(); - static constexpr std::string_view PostHeader = "OrbShdr"; - while (std::memcmp(code + code_size, PostHeader.data(), PostHeader.size()) != 0) { - code_size++; - } - return std::span{code, code_size} + BinaryInfo bininfo; + std::memcpy(&bininfo, code + (code[1] + 1) * 2, sizeof(bininfo)); + const u32 num_dwords = bininfo.length / sizeof(u32); + return std::span{code, num_dwords}; + } + }; + + struct ComputeProgram { + u32 dispatch_initiator; + u32 dim_x; + u32 dim_y; + u32 dim_z; + u32 start_x; + u32 start_y; + u32 start_z; + struct { + u16 full; + u16 partial; + } num_thread_x, num_thread_y, 
num_thread_z; + INSERT_PADDING_WORDS(1); + BitField<0, 12, u32> max_wave_id; + u32 address_lo; + BitField<0, 8, u32> address_hi; + INSERT_PADDING_WORDS(4); + union { + BitField<0, 6, u64> num_vgprs; + BitField<6, 4, u64> num_sgprs; + BitField<33, 5, u64> num_user_regs; + } settings; + INSERT_PADDING_WORDS(1); + u32 resource_limits; + INSERT_PADDING_WORDS(0x2A); + UserData user_data; + + template <typename T = u8> + const T* Address() const { + const uintptr_t addr = uintptr_t(address_hi) << 40 | uintptr_t(address_lo) << 8; + return reinterpret_cast<const T*>(addr); + } + + std::span<const u32> Code() const { + const u32* code = Address<u32>(); + BinaryInfo bininfo; + std::memcpy(&bininfo, code + (code[1] + 1) * 2, sizeof(bininfo)); + const u32 num_dwords = bininfo.length / sizeof(u32); + return std::span{code, num_dwords}; } }; @@ -621,7 +680,9 @@ struct Liverpool { ShaderProgram ps_program; INSERT_PADDING_WORDS(0x2C); ShaderProgram vs_program; - INSERT_PADDING_WORDS(0xA008 - 0x2C4C - 16); + INSERT_PADDING_WORDS(0x2E00 - 0x2C4C - 16); + ComputeProgram cs_program; + INSERT_PADDING_WORDS(0xA008 - 0x2E00 - 80); u32 depth_bounds_min; u32 depth_bounds_max; u32 stencil_clear; @@ -777,6 +838,10 @@ private: static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08); static_assert(GFX6_3D_REG_INDEX(vs_program) == 0x2C48); static_assert(GFX6_3D_REG_INDEX(vs_program.user_data) == 0x2C4C); +static_assert(GFX6_3D_REG_INDEX(cs_program) == 0x2E00); +static_assert(GFX6_3D_REG_INDEX(cs_program.dim_z) == 0x2E03); +static_assert(GFX6_3D_REG_INDEX(cs_program.address_lo) == 0x2E0C); +static_assert(GFX6_3D_REG_INDEX(cs_program.user_data) == 0x2E40); static_assert(GFX6_3D_REG_INDEX(screen_scissor) == 0xA00C); static_assert(GFX6_3D_REG_INDEX(depth_buffer.depth_slice) == 0xA017); static_assert(GFX6_3D_REG_INDEX(color_target_mask) == 0xA08E); diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index 3ac5382a..c7b1452b 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -540,4 +540,12 @@ struct PM4DumpConstRam { } }; +struct PM4CmdDispatchDirect { + PM4Type3Header header; + u32 dim_x; ///< X dimension of the array of thread groups to be dispatched + u32 dim_y; ///< Y dimension of the array of thread groups to be dispatched + u32 dim_z; ///< Z dimension of the array of thread groups to be dispatched + u32 dispatch_initiator; ///< Dispatch Initiator Register +}; + } // namespace AmdGpu diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index 71469005..cc7b9722 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -3,6 +3,7 @@ #pragma once +#include "common/assert.h" #include "common/bit_field.h" #include "common/types.h" #include "video_core/amdgpu/pixel_format.h" @@ -29,6 +30,22 @@ struct Buffer { BitField<21, 2, u32> index_stride; BitField<23, 1, u32> add_tid_enable; }; + + u32 GetStride() const noexcept { + return stride == 0 ? 
1U : stride; + } + + u32 GetStrideElements(u32 element_size) const noexcept { + if (stride == 0) { + return 1U; + } + ASSERT(stride % element_size == 0); + return stride / element_size; + } + + u32 GetSize() const noexcept { + return GetStride() * num_records; + } }; enum class ImageType : u64 { @@ -70,7 +87,7 @@ constexpr std::string_view NameOf(ImageType type) { struct Image { union { - BitField<0, 40, u64> base_address; + BitField<0, 38, u64> base_address; BitField<40, 12, u64> min_lod; BitField<52, 6, u64> data_format; BitField<58, 4, u64> num_format; diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index b15760ba..f13d5f8c 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -297,6 +297,13 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu num_format == AmdGpu::NumberFormat::Float) { return vk::Format::eR32G32Sfloat; } + if (data_format == AmdGpu::DataFormat::Format5_6_5 && + num_format == AmdGpu::NumberFormat::Unorm) { + return vk::Format::eB5G6R5UnormPack16; + } + if (data_format == AmdGpu::DataFormat::Format8 && num_format == AmdGpu::NumberFormat::Unorm) { + return vk::Format::eR8Unorm; + } UNREACHABLE(); } @@ -305,6 +312,10 @@ vk::Format DepthFormat(DepthBuffer::ZFormat z_format, DepthBuffer::StencilFormat stencil_format == DepthBuffer::StencilFormat::Stencil8) { return vk::Format::eD32SfloatS8Uint; } + if (z_format == DepthBuffer::ZFormat::Z32Float && + stencil_format == DepthBuffer::StencilFormat::Invalid) { + return vk::Format::eD32Sfloat; + } UNREACHABLE(); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp new file mode 100644 index 00000000..3227a232 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -0,0 +1,144 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include "common/alignment.h" +#include "core/memory.h" +#include "video_core/renderer_vulkan/vk_compute_pipeline.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_stream_buffer.h" +#include "video_core/texture_cache/texture_cache.h" + +namespace Vulkan { + +ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler_, + vk::PipelineCache pipeline_cache, const Shader::Info* info_, + vk::ShaderModule module) + : instance{instance_}, scheduler{scheduler_}, info{*info_} { + const vk::PipelineShaderStageCreateInfo shader_ci = { + .stage = vk::ShaderStageFlagBits::eCompute, + .module = module, + .pName = "main", + }; + + u32 binding{}; + boost::container::small_vector bindings; + for (const auto& buffer : info.buffers) { + bindings.push_back({ + .binding = binding++, + .descriptorType = buffer.is_storage ? 
vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }); + } + for (const auto& image : info.images) { + bindings.push_back({ + .binding = binding++, + .descriptorType = vk::DescriptorType::eSampledImage, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }); + } + for (const auto& sampler : info.samplers) { + bindings.push_back({ + .binding = binding++, + .descriptorType = vk::DescriptorType::eSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }); + } + + const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { + .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, + .bindingCount = static_cast<u32>(bindings.size()), + .pBindings = bindings.data(), + }; + desc_layout = instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci); + + const vk::DescriptorSetLayout set_layout = *desc_layout; + const vk::PipelineLayoutCreateInfo layout_info = { + .setLayoutCount = 1U, + .pSetLayouts = &set_layout, + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }; + pipeline_layout = instance.GetDevice().createPipelineLayoutUnique(layout_info); + + const vk::ComputePipelineCreateInfo compute_pipeline_ci = { + .stage = shader_ci, + .layout = *pipeline_layout, + }; + auto result = + instance.GetDevice().createComputePipelineUnique(pipeline_cache, compute_pipeline_ci); + if (result.result == vk::Result::eSuccess) { + pipeline = std::move(result.value); + } else { + UNREACHABLE_MSG("Compute pipeline creation failed!"); + } +} + +ComputePipeline::~ComputePipeline() = default; + +void ComputePipeline::BindResources(Core::MemoryManager* memory, + VideoCore::TextureCache& texture_cache) const { + // Bind resource buffers and textures. + boost::container::static_vector buffer_infos; + boost::container::static_vector image_infos; + boost::container::small_vector set_writes; + u32 binding{}; + + for (const auto& buffer : info.buffers) { + const auto vsharp = info.ReadUd<AmdGpu::Buffer>(buffer.sgpr_base, buffer.dword_offset); + const u32 size = vsharp.GetSize(); + const VAddr addr = vsharp.base_address.Value(); + texture_cache.OnCpuWrite(addr); + const auto [vk_buffer, offset] = memory->GetVulkanBuffer(addr); + buffer_infos.emplace_back(vk_buffer, offset, size); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = buffer.is_storage ? 
vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &buffer_infos.back(), + }); + } + + for (const auto& image : info.images) { + const auto tsharp = info.ReadUd<AmdGpu::Image>(image.sgpr_base, image.dword_offset); + const auto& image_view = texture_cache.FindImageView(tsharp); + image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, vk::ImageLayout::eGeneral); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eSampledImage, + .pImageInfo = &image_infos.back(), + }); + } + for (const auto& sampler : info.samplers) { + const auto ssharp = info.ReadUd<AmdGpu::Sampler>(sampler.sgpr_base, sampler.dword_offset); + const auto vk_sampler = texture_cache.GetSampler(ssharp); + image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eSampler, + .pImageInfo = &image_infos.back(), + }); + } + + if (!set_writes.empty()) { + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, + set_writes); + } +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h new file mode 100644 index 00000000..df9743c2 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "shader_recompiler/runtime_info.h" +#include "video_core/renderer_vulkan/vk_common.h" + +namespace Core { +class MemoryManager; +} + +namespace VideoCore { +class TextureCache; +} + +namespace Vulkan { + +class Instance; +class Scheduler; +class StreamBuffer; + +class ComputePipeline { +public: + explicit ComputePipeline(const Instance& instance, Scheduler& scheduler, + vk::PipelineCache pipeline_cache, const Shader::Info* info, + vk::ShaderModule module); + ~ComputePipeline(); + + [[nodiscard]] vk::Pipeline Handle() const noexcept { + return *pipeline; + } + + void BindResources(Core::MemoryManager* memory, VideoCore::TextureCache& texture_cache) const; + +private: + const Instance& instance; + Scheduler& scheduler; + vk::UniquePipeline pipeline; + vk::UniquePipelineLayout pipeline_layout; + vk::UniqueDescriptorSetLayout desc_layout; + Shader::Info info{}; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 1815224c..4b65fe07 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -16,7 +16,8 @@ namespace Vulkan { GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& scheduler_, - const PipelineKey& key_, vk::PipelineCache pipeline_cache, + const GraphicsPipelineKey& key_, + vk::PipelineCache pipeline_cache, std::span infos, std::array modules) : instance{instance_}, scheduler{scheduler_}, key{key_} { @@ -50,7 +51,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul }); bindings.push_back({ .binding = input.binding, - .stride = u32(buffer.stride), + .stride = buffer.GetStride(), .inputRate = vk::VertexInputRate::eVertex, }); } @@ -275,8 +276,7 @@ void 
GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& const auto& input = vs_info.vs_inputs[i]; const auto buffer = vs_info.ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset); if (i == 0) { - start_offset = - map_staging(buffer.base_address.Value(), buffer.stride * buffer.num_records); + start_offset = map_staging(buffer.base_address.Value(), buffer.GetSize()); base_address = buffer.base_address; } buffers[i] = staging.Handle(); @@ -297,7 +297,7 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& for (const auto& stage : stages) { for (const auto& buffer : stage.buffers) { const auto vsharp = stage.ReadUd<AmdGpu::Buffer>(buffer.sgpr_base, buffer.dword_offset); - const u32 size = vsharp.stride * vsharp.num_records; + const u32 size = vsharp.GetSize(); const u32 offset = map_staging(vsharp.base_address.Value(), size); buffer_infos.emplace_back(staging.Handle(), offset, size); set_writes.push_back({ diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index fc8b4fa1..c7e773ad 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -26,7 +26,7 @@ class StreamBuffer; using Liverpool = AmdGpu::Liverpool; -struct PipelineKey { +struct GraphicsPipelineKey { std::array stage_hashes; std::array color_formats; vk::Format depth_format; @@ -40,16 +40,16 @@ struct PipelineKey { Liverpool::CullMode cull_mode; std::array blend_controls; - bool operator==(const PipelineKey& key) const noexcept { - return std::memcmp(this, &key, sizeof(PipelineKey)) == 0; + bool operator==(const GraphicsPipelineKey& key) const noexcept { + return std::memcmp(this, &key, sizeof(GraphicsPipelineKey)) == 0; } }; -static_assert(std::has_unique_object_representations_v<PipelineKey>); +static_assert(std::has_unique_object_representations_v<GraphicsPipelineKey>); class GraphicsPipeline { public: explicit GraphicsPipeline(const Instance& instance, Scheduler& scheduler, - const PipelineKey& key, vk::PipelineCache pipeline_cache, + const GraphicsPipelineKey& key, vk::PipelineCache pipeline_cache, std::span infos, std::array modules); ~GraphicsPipeline(); @@ -76,14 +76,14 @@ private: vk::UniquePipelineLayout pipeline_layout; vk::UniqueDescriptorSetLayout desc_layout; std::array stages{}; - PipelineKey key; + GraphicsPipelineKey key; }; } // namespace Vulkan template <> -struct std::hash<Vulkan::PipelineKey> { - std::size_t operator()(const Vulkan::PipelineKey& key) const noexcept { +struct std::hash<Vulkan::GraphicsPipelineKey> { + std::size_t operator()(const Vulkan::GraphicsPipelineKey& key) const noexcept { return XXH3_64bits(&key, sizeof(key)); } }; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 658d7a36..d35d35d1 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -205,6 +205,7 @@ bool Instance::CreateDevice() { .timelineSemaphore = true, }, vk::PhysicalDeviceVulkan13Features{ + .shaderDemoteToHelperInvocation = true, .dynamicRendering = true, .maintenance4 = true, }, diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 54f81267..9b4e6856 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -21,7 +21,12 @@ Shader::Info MakeShaderInfo(Shader::Stage stage, std::span user_d info.user_data = user_data; info.stage = stage; switch (stage) { + case Shader::Stage::Vertex: { 
+ info.num_user_data = regs.vs_program.settings.num_user_regs; + break; + } case Shader::Stage::Fragment: { + info.num_user_data = regs.ps_program.settings.num_user_regs; for (u32 i = 0; i < regs.num_interp; i++) { info.ps_inputs.push_back({ .param_index = regs.ps_inputs[i].input_offset.Value(), @@ -32,6 +37,13 @@ } break; } + case Shader::Stage::Compute: { + const auto& cs_pgm = regs.cs_program; + info.num_user_data = cs_pgm.settings.num_user_regs; + info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full, + cs_pgm.num_thread_z.full}; + break; + } default: break; } @@ -48,17 +60,30 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, }; } -const GraphicsPipeline* PipelineCache::GetPipeline() { - RefreshKey(); +const GraphicsPipeline* PipelineCache::GetGraphicsPipeline() { + RefreshGraphicsKey(); const auto [it, is_new] = graphics_pipelines.try_emplace(graphics_key); if (is_new) { - it.value() = CreatePipeline(); + it.value() = CreateGraphicsPipeline(); } const GraphicsPipeline* pipeline = it->second.get(); return pipeline; } -void PipelineCache::RefreshKey() { +const ComputePipeline* PipelineCache::GetComputePipeline() { + const auto& cs_pgm = liverpool->regs.cs_program; + ASSERT(cs_pgm.Address() != nullptr); + const auto code = cs_pgm.Code(); + compute_key = XXH3_64bits(code.data(), code.size_bytes()); + const auto [it, is_new] = compute_pipelines.try_emplace(compute_key); + if (is_new) { + it.value() = CreateComputePipeline(); + } + const ComputePipeline* pipeline = it->second.get(); + return pipeline; +} + +void PipelineCache::RefreshGraphicsKey() { auto& regs = liverpool->regs; auto& key = graphics_key; @@ -92,7 +117,7 @@ } } -std::unique_ptr<GraphicsPipeline> PipelineCache::CreatePipeline() { +std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline() { const auto& regs = liverpool->regs; u32 binding{}; @@ -141,6 +166,36 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreatePipeline() { infos, stages); } +std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline() { + const auto& cs_pgm = liverpool->regs.cs_program; + const auto code = cs_pgm.Code(); + + // Dump shader code if requested. + if (Config::dumpShaders()) { + DumpShader(code, compute_key, Shader::Stage::Compute, "bin"); + } + + block_pool.ReleaseContents(); + inst_pool.ReleaseContents(); + + // Recompile shader to IR. 
+ const Shader::Info info = + MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs); + auto program = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); + + // Compile IR to SPIR-V + u32 binding{}; + const auto spv_code = Shader::Backend::SPIRV::EmitSPIRV(profile, program, binding); + const auto module = CompileSPV(spv_code, instance.GetDevice()); + + if (Config::dumpShaders()) { + DumpShader(spv_code, compute_key, Shader::Stage::Compute, "spv"); + } + + return std::make_unique<ComputePipeline>(instance, scheduler, *pipeline_cache, &program.info, + module); +} + void PipelineCache::DumpShader(std::span<const u32> code, u64 hash, Shader::Stage stage, std::string_view ext) { using namespace Common::FS; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index 514adcbb..a77b298b 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -7,6 +7,7 @@ #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/object_pool.h" #include "shader_recompiler/profile.h" +#include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" namespace Shader { @@ -26,15 +27,17 @@ public: AmdGpu::Liverpool* liverpool); ~PipelineCache() = default; - const GraphicsPipeline* GetPipeline(); + const GraphicsPipeline* GetGraphicsPipeline(); + + const ComputePipeline* GetComputePipeline(); private: - void RefreshKey(); - - std::unique_ptr<GraphicsPipeline> CreatePipeline(); - + void RefreshGraphicsKey(); void DumpShader(std::span<const u32> code, u64 hash, Shader::Stage stage, std::string_view ext); + std::unique_ptr<GraphicsPipeline> CreateGraphicsPipeline(); + std::unique_ptr<ComputePipeline> CreateComputePipeline(); + private: const Instance& instance; Scheduler& scheduler; @@ -43,9 +46,11 @@ private: vk::UniquePipelineLayout pipeline_layout; tsl::robin_map module_map; std::array stages{}; - tsl::robin_map<PipelineKey, std::unique_ptr<GraphicsPipeline>> graphics_pipelines; + tsl::robin_map<u64, std::unique_ptr<ComputePipeline>> compute_pipelines; + tsl::robin_map<GraphicsPipelineKey, std::unique_ptr<GraphicsPipeline>> graphics_pipelines; Shader::Profile profile{}; - PipelineKey graphics_key{}; + GraphicsPipelineKey graphics_key{}; + u64 compute_key{}; Shader::ObjectPool<Shader::IR::Inst> inst_pool; Shader::ObjectPool<Shader::IR::Block> block_pool; }; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index aea93487..ded491c4 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -36,7 +36,7 @@ void Rasterizer::Draw(bool is_indexed) { const auto cmdbuf = scheduler.CommandBuffer(); const auto& regs = liverpool->regs; const u32 num_indices = SetupIndexBuffer(is_indexed); - const GraphicsPipeline* pipeline = pipeline_cache.GetPipeline(); + const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); pipeline->BindResources(memory, vertex_index_buffer, texture_cache); const auto& image_view = texture_cache.RenderTarget(regs.color_buffers[0]); @@ -49,8 +49,13 @@ void Rasterizer::Draw(bool is_indexed) { }; // TODO: Don't restart renderpass every draw + const auto& scissor = regs.screen_scissor; const vk::RenderingInfo rendering_info = { - .renderArea = {.offset = {0, 0}, .extent = {1920, 1080}}, + .renderArea = + { + .offset = {scissor.top_left_x, scissor.top_left_y}, + .extent = {scissor.GetWidth(), scissor.GetHeight()}, + }, .layerCount = 1, .colorAttachmentCount = 1, .pColorAttachments = &color_info, @@ -69,6 +74,17 @@ void Rasterizer::Draw(bool is_indexed) { cmdbuf.endRendering(); } +void 
Rasterizer::DispatchDirect() { + return; // TODO: Compute dispatch is stubbed out for now; drop this early return to enable it. + const auto cmdbuf = scheduler.CommandBuffer(); + const auto& cs_program = liverpool->regs.cs_program; + const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); + pipeline->BindResources(memory, texture_cache); + + cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle()); + cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z); +} + u32 Rasterizer::SetupIndexBuffer(bool& is_indexed) { // Emulate QuadList primitive type with CPU made index buffer. const auto& regs = liverpool->regs; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index a8386c25..2678a480 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -31,6 +31,8 @@ public: void Draw(bool is_indexed); + void DispatchDirect(); + private: u32 SetupIndexBuffer(bool& is_indexed); void MapMemory(VAddr addr, size_t size); diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index cb0bccda..f81514d6 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -55,7 +55,7 @@ void Swapchain::Create(u32 width_, u32 height_, vk::SurfaceKHR surface_) { .pQueueFamilyIndices = queue_family_indices.data(), .preTransform = transform, .compositeAlpha = composite_alpha, - .presentMode = vk::PresentModeKHR::eMailbox, + .presentMode = vk::PresentModeKHR::eFifo, .clipped = true, .oldSwapchain = nullptr, }; diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index 74c9cf5d..2128d098 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -98,7 +98,7 @@ struct Image { if (it == image_view_infos.end()) { return {}; } - return image_view_ids[std::distance(it, image_view_infos.begin())]; + return image_view_ids[std::distance(image_view_infos.begin(), it)]; } void Transit(vk::ImageLayout dst_layout, vk::Flags<vk::AccessFlagBits> dst_mask); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 35bbb3f7..658cfa41 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -100,8 +100,7 @@ TextureCache::~TextureCache() { } void TextureCache::OnCpuWrite(VAddr address) { - const VAddr address_aligned = address & ~((1 << PageShift) - 1); - ForEachImageInRegion(address_aligned, 1 << PageShift, [&](ImageId image_id, Image& image) { // Ensure image is reuploaded when accessed again. image.flags |= ImageFlagBits::CpuModified; // Untrack image, so the range is unprotected and the guest can write freely. 
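A note on the texture cache hunks above and below: OnCpuWrite invalidates any image overlapping the written page, and UpdatePagesCachedCount keeps a per-page reference count so a range stays write-protected only while at least one tracked image covers it; switching the protection from PAGE_NOACCESS to PAGE_READONLY keeps CPU reads serviceable while writes still fault and untrack the image. A minimal sketch of that counting scheme, assuming an illustrative protect callback in place of the emulator's mprotect/VirtualProtect wrapper:

// Sketch: reference-counted page write-protection (illustrative types, not the emulator's API).
#include <cstdint>
#include <functional>
#include <map>

using VAddr = std::uint64_t;
constexpr std::uint64_t PageShift = 12;

struct PageTracker {
    std::map<std::uint64_t, int> counts; // page index -> number of images tracking it
    std::function<void(VAddr, std::uint64_t, bool)> protect; // (addr, size, read_only)

    // delta is +1 when an image starts tracking [addr, addr + size), -1 when it stops.
    void UpdatePagesCachedCount(VAddr addr, std::uint64_t size, int delta) {
        const std::uint64_t page_start = addr >> PageShift;
        const std::uint64_t page_end = ((addr + size - 1) >> PageShift) + 1;
        for (std::uint64_t page = page_start; page < page_end; ++page) {
            const int count = (counts[page] += delta);
            if (delta > 0 && count == delta) {
                // First tracker: writes must fault so the cache notices them,
                // while reads stay allowed (the PAGE_READONLY choice above).
                protect(page << PageShift, 1ULL << PageShift, true);
            } else if (delta < 0 && count == 0) {
                // Last tracker gone: restore read-write access.
                protect(page << PageShift, 1ULL << PageShift, false);
            }
        }
    }
};

The real implementation batches runs of adjacent pages into single protection calls and, as of this diff, guards the counter map with a mutex because GPU and CPU threads can reach it concurrently.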
@@ -270,6 +269,7 @@ void TextureCache::UntrackImage(Image& image, ImageId image_id) { } void TextureCache::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) { + std::scoped_lock lk{mutex}; const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1; const u64 page_start = addr >> PageShift; const u64 page_end = page_start + num_pages; @@ -288,7 +288,7 @@ void TextureCache::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) { const u32 interval_size = interval_end_addr - interval_start_addr; void* addr = reinterpret_cast(interval_start_addr); if (delta > 0 && count == delta) { - mprotect(addr, interval_size, PAGE_NOACCESS); + mprotect(addr, interval_size, PAGE_READONLY); } else if (delta < 0 && count == -delta) { mprotect(addr, interval_size, PAGE_READWRITE); } else { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index a09d1194..bb37677d 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -132,6 +132,7 @@ private: tsl::robin_map samplers; tsl::robin_pg_map> page_table; boost::icl::interval_map cached_pages; + std::mutex mutex; #ifdef _WIN64 void* veh_handle{}; #endif
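DeadCodeEliminationPass earlier in this patch relies on walking each block backwards: erasing a use-less instruction releases its operands, which can make earlier instructions dead within the same sweep. A standalone sketch of that idea over a toy IR (the Inst type below is hypothetical, not the recompiler's):

// Sketch: reverse-order dead code elimination over a toy IR (hypothetical types).
#include <list>
#include <vector>

struct Inst {
    std::vector<Inst*> args; // operands this instruction reads
    int uses = 0;            // number of instructions reading this one
    bool side_effects = false;
};

void DeadCodeElimination(std::list<Inst>& block) {
    // Walk backwards so that freeing an instruction's operands can expose
    // earlier instructions as dead before the sweep reaches them.
    auto it = block.end();
    while (it != block.begin()) {
        --it;
        if (it->uses == 0 && !it->side_effects) {
            for (Inst* arg : it->args) {
                --arg->uses; // operand loses a user; it may become dead too
            }
            it = block.erase(it);
        }
    }
}

A forward walk would have to iterate to a fixed point to catch the same cases; the backward walk retires an entire dead dependency chain in a single pass.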
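GetComputePipeline above keys the pipeline cache on an XXH3 hash of the raw shader dwords and compiles only on a miss. The same lookup pattern in isolation, with std::unordered_map standing in for tsl::robin_map and a simple FNV-1a stand-in for XXH3_64bits:

// Sketch: hash-keyed pipeline cache lookup (stand-in types, not the emulator's).
#include <cstdint>
#include <memory>
#include <span>
#include <unordered_map>

struct Pipeline {}; // placeholder for the compiled pipeline object

// Stand-in for XXH3_64bits: any stable 64-bit hash of the code bytes works.
std::uint64_t HashCode(std::span<const std::uint32_t> code) {
    std::uint64_t h = 14695981039346656037ull; // FNV-1a offset basis
    for (const std::uint32_t w : code) {
        h = (h ^ w) * 1099511628211ull; // FNV-1a prime
    }
    return h;
}

std::unordered_map<std::uint64_t, std::unique_ptr<Pipeline>> cache;

const Pipeline* GetComputePipeline(std::span<const std::uint32_t> code) {
    const std::uint64_t key = HashCode(code);
    const auto [it, is_new] = cache.try_emplace(key); // inserts a null entry on miss
    if (is_new) {
        it->second = std::make_unique<Pipeline>(); // compile only on first sight
    }
    return it->second.get();
}

Hashing the binary rather than its GPU address means relocated or re-uploaded copies of the same shader still hit the cache.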