diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6724bff9..66dbe119 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -349,6 +349,8 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/runtime_info.h
     src/shader_recompiler/backend/spirv/emit_spirv.cpp
     src/shader_recompiler/backend/spirv/emit_spirv.h
+    src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+    src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp
     src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp
     src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp
     src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -359,6 +361,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp
     src/shader_recompiler/backend/spirv/emit_spirv_logical.cpp
     src/shader_recompiler/backend/spirv/emit_spirv_select.cpp
+    src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
     src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
     src/shader_recompiler/backend/spirv/emit_spirv_undefined.cpp
     src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
diff --git a/externals/sirit b/externals/sirit
index 505cc66a..8db09231 160000
--- a/externals/sirit
+++ b/externals/sirit
@@ -1 +1 @@
-Subproject commit 505cc66a2be70b268c1700fef4d5327a5fe46494
+Subproject commit 8db09231c448b913ae905d5237ce2eca46e3fe87
diff --git a/src/common/div_ceil.h b/src/common/div_ceil.h
new file mode 100755
index 00000000..de275e76
--- /dev/null
+++ b/src/common/div_ceil.h
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+namespace Common {
+
+/// Ceiled integer division.
+template <typename N, typename D>
+    requires std::is_integral_v<N> && std::is_unsigned_v<D>
+[[nodiscard]] constexpr N DivCeil(N number, D divisor) {
+    return static_cast<N>((static_cast<D>(number) + divisor - 1) / divisor);
+}
+
+/// Ceiled integer division with logarithmic divisor in base 2
+template <typename N, typename D>
+    requires std::is_integral_v<N> && std::is_unsigned_v<D>
+[[nodiscard]] constexpr N DivCeilLog2(N value, D alignment_log2) {
+    return static_cast<N>((static_cast<D>(value) + (D(1) << alignment_log2) - 1) >> alignment_log2);
+}
+
+} // namespace Common
diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp
index e38fd2af..29e6aeb4 100644
--- a/src/common/string_util.cpp
+++ b/src/common/string_util.cpp
@@ -14,6 +14,12 @@
 
 namespace Common {
 
+std::string ToLower(std::string str) {
+    std::transform(str.begin(), str.end(), str.begin(),
+                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+    return str;
+}
+
 std::vector<std::string> SplitString(const std::string& str, char delimiter) {
     std::istringstream iss(str);
     std::vector<std::string> output(1);
diff --git a/src/common/string_util.h b/src/common/string_util.h
index ec3f1dca..8dae6c75 100644
--- a/src/common/string_util.h
+++ b/src/common/string_util.h
@@ -9,6 +9,9 @@
 
 namespace Common {
 
+/// Make a string lowercase
+[[nodiscard]] std::string ToLower(std::string str);
+
 std::vector<std::string> SplitString(const std::string& str, char delimiter);
 
 #ifdef _WIN32
diff --git a/src/core/address_space.h b/src/core/address_space.h
index 5bb553ae..b979481f 100644
--- a/src/core/address_space.h
+++ b/src/core/address_space.h
@@ -28,7 +28,7 @@ constexpr VAddr USER_MAX = 0xFBFFFFFFFFULL;
 // User area size is normally larger than this. However games are unlikely to map to high
 // regions of that area, so by default we allocate a smaller virtual address space (about 1/4th).
// to save space on page tables. -static constexpr size_t UserSize = 1ULL << 38; +static constexpr size_t UserSize = 1ULL << 39; static constexpr size_t SystemSize = USER_MIN - SYSTEM_MANAGED_MIN; /** diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp index 912c74bf..b4ede62b 100644 --- a/src/core/file_sys/fs.cpp +++ b/src/core/file_sys/fs.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include "common/string_util.h" #include "core/file_sys/fs.h" namespace Core::FileSys { @@ -13,6 +14,7 @@ void MntPoints::Mount(const std::filesystem::path& host_folder, const std::strin MntPair pair; pair.host_path = host_folder.string(); + std::replace(pair.host_path.begin(), pair.host_path.end(), '\\', '/'); pair.guest_path = guest_folder; m_mnt_pairs.push_back(pair); @@ -46,11 +48,24 @@ std::string MntPoints::GetHostFile(const std::string& guest_file) { for (auto& pair : m_mnt_pairs) { // horrible code but it works :D int find = guest_file.find(pair.guest_path); - if (find == 0) { - std::string npath = guest_file.substr(pair.guest_path.size(), guest_file.size() - 1); - std::replace(pair.host_path.begin(), pair.host_path.end(), '\\', '/'); - return pair.host_path + npath; + if (find != 0) { + continue; } + std::string npath = guest_file.substr(pair.guest_path.size(), guest_file.size() - 1); + const auto host_path = pair.host_path + npath; +#ifndef _WIN64 + const std::filesystem::path path{host_path}; + if (!std::filesystem::exists(path)) { + const auto filename = Common::ToLower(path.filename()); + for (const auto& file : std::filesystem::directory_iterator(path.parent_path())) { + const auto exist_filename = Common::ToLower(file.path().filename()); + if (filename == exist_filename) { + return file.path(); + } + } + } +#endif + return host_path; } return ""; } diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index cf162db1..3fc79abe 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -803,9 +803,9 @@ int PS4_SYSV_ABI sceGnmDrawOpaqueAuto() { return ORBIS_OK; } -int PS4_SYSV_ABI sceGnmDriverCaptureInProgress() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; +bool PS4_SYSV_ABI sceGnmDriverCaptureInProgress() { + LOG_TRACE(Lib_GnmDriver, "called"); + return false; } int PS4_SYSV_ABI sceGnmDriverInternalRetrieveGnmInterface() { @@ -1962,7 +1962,7 @@ s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[ if (Config::dumpPM4()) { static auto last_frame_num = -1LL; static u32 seq_num{}; - if (last_frame_num == frames_submitted) { + if (last_frame_num == frames_submitted && cbpair == 0) { ++seq_num; } else { last_frame_num = frames_submitted; diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index 2971d66b..08099bcc 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -63,7 +63,7 @@ u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState350(u32* cmdbuf, u32 size); u32 PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState(u32* cmdbuf, u32 size); u32 PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState400(u32* cmdbuf, u32 size); int PS4_SYSV_ABI sceGnmDrawOpaqueAuto(); -int PS4_SYSV_ABI sceGnmDriverCaptureInProgress(); +bool PS4_SYSV_ABI sceGnmDriverCaptureInProgress(); int PS4_SYSV_ABI sceGnmDriverInternalRetrieveGnmInterface(); int PS4_SYSV_ABI sceGnmDriverInternalRetrieveGnmInterfaceForGpuDebugger(); int PS4_SYSV_ABI 
sceGnmDriverInternalRetrieveGnmInterfaceForGpuException(); diff --git a/src/core/libraries/kernel/memory_management.cpp b/src/core/libraries/kernel/memory_management.cpp index 153467a6..b6e3054c 100644 --- a/src/core/libraries/kernel/memory_management.cpp +++ b/src/core/libraries/kernel/memory_management.cpp @@ -161,7 +161,6 @@ s32 PS4_SYSV_ABI sceKernelMapFlexibleMemory(void** addr_in_out, std::size_t len, } int PS4_SYSV_ABI sceKernelQueryMemoryProtection(void* addr, void** start, void** end, u32* prot) { - LOG_WARNING(Kernel_Vmm, "called"); auto* memory = Core::Memory::Instance(); return memory->QueryProtection(std::bit_cast(addr), start, end, prot); } diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 2b526eed..8cf46ffc 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -4,6 +4,7 @@ #include #include #include +#include "common/alignment.h" #include "common/assert.h" #include "common/error.h" #include "common/logging/log.h" @@ -16,6 +17,8 @@ #include "core/linker.h" #ifdef _WIN64 #include +#else +#include #endif namespace Libraries::Kernel { @@ -46,7 +49,8 @@ void init_pthreads() { } void pthreadInitSelfMainThread() { - g_pthread_self = new PthreadInternal{}; + auto* pthread_pool = g_pthread_cxt->GetPthreadPool(); + g_pthread_self = pthread_pool->Create(); scePthreadAttrInit(&g_pthread_self->attr); g_pthread_self->pth = pthread_self(); g_pthread_self->name = "Main_Thread"; @@ -926,31 +930,25 @@ int PS4_SYSV_ABI scePthreadCreate(ScePthread* thread, const ScePthreadAttr* attr if ((*thread)->attr != nullptr) { scePthreadAttrDestroy(&(*thread)->attr); } - scePthreadAttrInit(&(*thread)->attr); int result = pthread_copy_attributes(&(*thread)->attr, attr); + ASSERT(result == 0); - if (result == 0) { - if (name != NULL) { - (*thread)->name = name; - } else { - (*thread)->name = "no-name"; - } - (*thread)->entry = start_routine; - (*thread)->arg = arg; - (*thread)->is_almost_done = false; - (*thread)->is_detached = (*attr)->detached; - (*thread)->is_started = false; - - result = pthread_create(&(*thread)->pth, &(*attr)->pth_attr, run_thread, *thread); + if (name != NULL) { + (*thread)->name = name; + } else { + (*thread)->name = "no-name"; } + (*thread)->entry = start_routine; + (*thread)->arg = arg; + (*thread)->is_almost_done = false; + (*thread)->is_detached = (*attr)->detached; + (*thread)->is_started = false; + + pthread_attr_setstacksize(&(*attr)->pth_attr, 2_MB); + result = pthread_create(&(*thread)->pth, &(*attr)->pth_attr, run_thread, *thread); - if (result == 0) { - while (!(*thread)->is_started) { - std::this_thread::sleep_for(std::chrono::microseconds(1000)); - } - } LOG_INFO(Kernel_Pthread, "thread create name = {}", (*thread)->name); switch (result) { @@ -979,7 +977,15 @@ ScePthread PThreadPool::Create() { } } +#ifndef _WIN64 auto* ret = new PthreadInternal{}; +#else + static u8* hint_address = reinterpret_cast(0x7FFFFC000ULL); + auto* ret = reinterpret_cast( + mmap(hint_address, sizeof(PthreadInternal), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0)); + hint_address += Common::AlignUp(sizeof(PthreadInternal), 4_KB); +#endif ret->is_free = false; ret->is_detached = false; diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp index ba8d6300..bfa6a68d 100644 --- a/src/core/libraries/kernel/threads/semaphore.cpp +++ b/src/core/libraries/kernel/threads/semaphore.cpp @@ 
-129,7 +129,11 @@ public: const auto end = std::chrono::high_resolution_clock::now(); const auto time = std::chrono::duration_cast(end - start).count(); - *timeout -= time; + if (status == std::cv_status::timeout) { + *timeout = 0; + } else { + *timeout -= time; + } return GetResult(status == std::cv_status::timeout); } }; diff --git a/src/core/libraries/save_data/savedata.cpp b/src/core/libraries/save_data/savedata.cpp index d4f04a25..9b8799bb 100644 --- a/src/core/libraries/save_data/savedata.cpp +++ b/src/core/libraries/save_data/savedata.cpp @@ -341,6 +341,7 @@ s32 saveDataMount(u32 user_id, std::string dir_name, u32 mount_mode, switch (mount_mode) { case ORBIS_SAVE_DATA_MOUNT_MODE_RDONLY: case ORBIS_SAVE_DATA_MOUNT_MODE_RDWR: + case ORBIS_SAVE_DATA_MOUNT_MODE_RDWR | ORBIS_SAVE_DATA_MOUNT_MODE_DESTRUCT_OFF: case ORBIS_SAVE_DATA_MOUNT_MODE_RDONLY | ORBIS_SAVE_DATA_MOUNT_MODE_DESTRUCT_OFF: { if (!std::filesystem::exists(mount_dir)) { return ORBIS_SAVE_DATA_ERROR_NOT_FOUND; @@ -349,11 +350,14 @@ s32 saveDataMount(u32 user_id, std::string dir_name, u32 mount_mode, mnt->Mount(mount_dir, g_mount_point); mount_result->mount_status = 0; - strncpy(mount_result->mount_point.data, g_mount_point.c_str(), 16); - } break; + std::strncpy(mount_result->mount_point.data, g_mount_point.c_str(), 16); + break; + } case ORBIS_SAVE_DATA_MOUNT_MODE_CREATE: case ORBIS_SAVE_DATA_MOUNT_MODE_CREATE | ORBIS_SAVE_DATA_MOUNT_MODE_RDONLY: case ORBIS_SAVE_DATA_MOUNT_MODE_CREATE | ORBIS_SAVE_DATA_MOUNT_MODE_RDWR: + case ORBIS_SAVE_DATA_MOUNT_MODE_CREATE | ORBIS_SAVE_DATA_MOUNT_MODE_RDWR | + ORBIS_SAVE_DATA_MOUNT_MODE_DESTRUCT_OFF: case ORBIS_SAVE_DATA_MOUNT_MODE_CREATE | ORBIS_SAVE_DATA_MOUNT_MODE_RDWR | ORBIS_SAVE_DATA_MOUNT_MODE_COPY_ICON: case ORBIS_SAVE_DATA_MOUNT_MODE_CREATE | ORBIS_SAVE_DATA_MOUNT_MODE_DESTRUCT_OFF | diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index e1931a9c..b1a0845f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -85,7 +85,7 @@ void EmitInst(EmitContext& ctx, IR::Inst* inst) { #include "shader_recompiler/ir/opcodes.inc" #undef OPCODE } - throw LogicError("Invalid opcode {}", inst->GetOpcode()); + UNREACHABLE_MSG("Invalid opcode {}", inst->GetOpcode()); } Id TypeId(const EmitContext& ctx, IR::Type type) { @@ -176,7 +176,12 @@ Id DefineMain(EmitContext& ctx, IR::Program& program) { void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { const std::span interfaces(ctx.interfaces.data(), ctx.interfaces.size()); spv::ExecutionModel execution_model{}; + ctx.AddCapability(spv::Capability::Image1D); + ctx.AddCapability(spv::Capability::Sampled1D); + ctx.AddCapability(spv::Capability::Float16); + ctx.AddCapability(spv::Capability::Int16); ctx.AddCapability(spv::Capability::StorageImageWriteWithoutFormat); + ctx.AddCapability(spv::Capability::StorageImageExtendedFormats); switch (program.info.stage) { case Stage::Compute: { const std::array workgroup_size{program.info.workgroup_size}; @@ -272,47 +277,55 @@ Id EmitConditionRef(EmitContext& ctx, const IR::Value& value) { void EmitReference(EmitContext&) {} void EmitPhiMove(EmitContext&) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitGetScc(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitGetExec(EmitContext& ctx) { - throw LogicError("Unreachable 
instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitGetVcc(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetSccLo(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); } void EmitGetVccLo(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitGetVccHi(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetScc(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetExec(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetVcc(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetSccLo(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetVccLo(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetVccHi(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp new file mode 100644 index 00000000..e0bc4b77 --- /dev/null +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#include "shader_recompiler/backend/spirv/spirv_emit_context.h" + +namespace Shader::Backend::SPIRV { +namespace { +std::pair AtomicArgs(EmitContext& ctx) { + const Id scope{ctx.ConstU32(static_cast(spv::Scope::Device))}; + const Id semantics{ctx.u32_zero_value}; + return {scope, semantics}; +} + +Id ImageAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { + const auto& texture = ctx.images[handle & 0xFFFF]; + const Id pointer{ctx.OpImageTexelPointer(ctx.image_u32, texture.id, coords, ctx.ConstU32(0U))}; + const auto [scope, semantics]{AtomicArgs(ctx)}; + return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value); +} +} // Anonymous namespace + +Id EmitImageAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicIAdd); +} + +Id EmitImageAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicSMin); +} + +Id EmitImageAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicUMin); +} + +Id EmitImageAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicSMax); +} + +Id EmitImageAtomicUMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicUMax); +} + +Id EmitImageAtomicInc32(EmitContext&, IR::Inst*, u32, Id, Id) { + 
// TODO: This is not yet implemented + throw NotImplementedException("SPIR-V Instruction"); +} + +Id EmitImageAtomicDec32(EmitContext&, IR::Inst*, u32, Id, Id) { + // TODO: This is not yet implemented + throw NotImplementedException("SPIR-V Instruction"); +} + +Id EmitImageAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicAnd); +} + +Id EmitImageAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicOr); +} + +Id EmitImageAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicXor); +} + +Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { + return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicExchange); +} + +} // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp new file mode 100644 index 00000000..22b3523a --- /dev/null +++ b/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#include "shader_recompiler/backend/spirv/spirv_emit_context.h" + +namespace Shader::Backend::SPIRV { +namespace { +void MemoryBarrier(EmitContext& ctx, spv::Scope scope) { + const auto semantics{ + spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory | + spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AtomicCounterMemory | + spv::MemorySemanticsMask::ImageMemory}; + ctx.OpMemoryBarrier(ctx.ConstU32(static_cast(scope)), + ctx.ConstU32(static_cast(semantics))); +} +} // Anonymous namespace + +void EmitBarrier(EmitContext& ctx) { + const auto execution{spv::Scope::Workgroup}; + const auto memory{spv::Scope::Workgroup}; + const auto memory_semantics{spv::MemorySemanticsMask::AcquireRelease | + spv::MemorySemanticsMask::WorkgroupMemory}; + ctx.OpControlBarrier(ctx.ConstU32(static_cast(execution)), + ctx.ConstU32(static_cast(memory)), + ctx.ConstU32(static_cast(memory_semantics))); +} + +void EmitWorkgroupMemoryBarrier(EmitContext& ctx) { + MemoryBarrier(ctx, spv::Scope::Workgroup); +} + +void EmitDeviceMemoryBarrier(EmitContext& ctx) { + MemoryBarrier(ctx, spv::Scope::Device); +} + +} // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp index a5268211..da29f392 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp @@ -18,8 +18,8 @@ void EmitBitCastU64F64(EmitContext&) { UNREACHABLE_MSG("SPIR-V Instruction"); } -void EmitBitCastF16U16(EmitContext&) { - UNREACHABLE_MSG("SPIR-V Instruction"); +Id EmitBitCastF16U16(EmitContext& ctx, Id value) { + return ctx.OpBitcast(ctx.F16[1], value); } Id EmitBitCastF32U32(EmitContext& ctx, Id value) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index ccddbff5..8d8a1488 100644 --- 
a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -7,6 +7,37 @@ namespace Shader::Backend::SPIRV { namespace { +Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) { + switch (output) { + case VsOutput::ClipDist0: + case VsOutput::ClipDist1: + case VsOutput::ClipDist2: + case VsOutput::ClipDist3: + case VsOutput::ClipDist4: + case VsOutput::ClipDist5: + case VsOutput::ClipDist6: + case VsOutput::ClipDist7: { + const u32 index = u32(output) - u32(VsOutput::ClipDist0); + const Id clip_num{ctx.ConstU32(index)}; + return ctx.OpAccessChain(ctx.output_f32, ctx.clip_distances, clip_num); + } + case VsOutput::CullDist0: + case VsOutput::CullDist1: + case VsOutput::CullDist2: + case VsOutput::CullDist3: + case VsOutput::CullDist4: + case VsOutput::CullDist5: + case VsOutput::CullDist6: + case VsOutput::CullDist7: { + const u32 index = u32(output) - u32(VsOutput::CullDist0); + const Id cull_num{ctx.ConstU32(index)}; + return ctx.OpAccessChain(ctx.output_f32, ctx.cull_distances, cull_num); + } + default: + UNREACHABLE(); + } +} + Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { if (IR::IsParam(attr)) { const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; @@ -20,10 +51,20 @@ Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { switch (attr) { case IR::Attribute::Position0: { return ctx.OpAccessChain(ctx.output_f32, ctx.output_position, ctx.ConstU32(element)); + case IR::Attribute::Position1: + case IR::Attribute::Position2: + case IR::Attribute::Position3: { + const u32 index = u32(attr) - u32(IR::Attribute::Position1); + return VsOutputAttrPointer(ctx, ctx.info.vs_outputs[index][element]); + } case IR::Attribute::RenderTarget0: case IR::Attribute::RenderTarget1: case IR::Attribute::RenderTarget2: - case IR::Attribute::RenderTarget3: { + case IR::Attribute::RenderTarget3: + case IR::Attribute::RenderTarget4: + case IR::Attribute::RenderTarget5: + case IR::Attribute::RenderTarget6: + case IR::Attribute::RenderTarget7: { const u32 index = u32(attr) - u32(IR::Attribute::RenderTarget0); if (ctx.frag_num_comp[index] > 1) { return ctx.OpAccessChain(ctx.output_f32, ctx.frag_color[index], ctx.ConstU32(element)); @@ -45,39 +86,39 @@ Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) { } void EmitGetThreadBitScalarReg(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetThreadBitScalarReg(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitGetScalarRegister(EmitContext&) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetScalarRegister(EmitContext&) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitGetVectorRegister(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetVectorRegister(EmitContext& ctx) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitSetGotoVariable(EmitContext&) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } void EmitGetGotoVariable(EmitContext&) { - throw LogicError("Unreachable instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } Id EmitReadConst(EmitContext& ctx) { - throw LogicError("Unreachable 
instruction"); + UNREACHABLE_MSG("Unreachable instruction"); } Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { @@ -159,7 +200,15 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { } Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - UNREACHABLE(); + const auto info = inst->Flags(); + const auto& buffer = ctx.buffers[handle]; + boost::container::static_vector ids; + for (u32 i = 0; i < 2; i++) { + const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))}; + const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; + ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr)); + } + return ctx.OpCompositeConstruct(buffer.data_types->Get(2), ids); } Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp index b033f91b..ede592e0 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp @@ -68,11 +68,7 @@ Id EmitConvertS32F16(EmitContext& ctx, Id value) { } Id EmitConvertS32F32(EmitContext& ctx, Id value) { - if (ctx.profile.has_broken_signed_operations) { - return ctx.OpBitcast(ctx.U32[1], ctx.OpConvertFToS(ctx.S32[1], value)); - } else { - return ctx.OpConvertFToS(ctx.U32[1], value); - } + return ctx.OpConvertFToS(ctx.U32[1], value); } Id EmitConvertS32F64(EmitContext& ctx, Id value) { @@ -259,4 +255,8 @@ Id EmitConvertF64U64(EmitContext& ctx, Id value) { return ctx.OpConvertUToF(ctx.F64[1], value); } +Id EmitConvertU16U32(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U16, value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp index e56eb916..04b0b96e 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp @@ -6,6 +6,11 @@ namespace Shader::Backend::SPIRV { +Id Decorate(EmitContext& ctx, IR::Inst* inst, Id op) { + ctx.Decorate(op, spv::Decoration::NoContraction); + return op; +} + Id EmitFPAbs16(EmitContext& ctx, Id value) { return ctx.OpFAbs(ctx.F16[1], value); } @@ -19,31 +24,31 @@ Id EmitFPAbs64(EmitContext& ctx, Id value) { } Id EmitFPAdd16(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { - return ctx.OpFAdd(ctx.F16[1], a, b); + return Decorate(ctx, inst, ctx.OpFAdd(ctx.F16[1], a, b)); } Id EmitFPAdd32(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { - return ctx.OpFAdd(ctx.F32[1], a, b); + return Decorate(ctx, inst, ctx.OpFAdd(ctx.F32[1], a, b)); } Id EmitFPAdd64(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { - return ctx.OpFAdd(ctx.F64[1], a, b); + return Decorate(ctx, inst, ctx.OpFAdd(ctx.F64[1], a, b)); } Id EmitFPSub32(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { - return ctx.OpFSub(ctx.F32[1], a, b); + return Decorate(ctx, inst, ctx.OpFSub(ctx.F32[1], a, b)); } Id EmitFPFma16(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c) { - return ctx.OpFma(ctx.F16[1], a, b, c); + return Decorate(ctx, inst, ctx.OpFma(ctx.F16[1], a, b, c)); } Id EmitFPFma32(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c) { - return ctx.OpFma(ctx.F32[1], a, b, c); + return Decorate(ctx, inst, ctx.OpFma(ctx.F32[1], a, b, c)); } Id EmitFPFma64(EmitContext& ctx, IR::Inst* inst, Id a, Id b, Id c) { - return 
ctx.OpFma(ctx.F64[1], a, b, c); + return Decorate(ctx, inst, ctx.OpFma(ctx.F64[1], a, b, c)); } Id EmitFPMax32(EmitContext& ctx, Id a, Id b) { @@ -63,15 +68,15 @@ Id EmitFPMin64(EmitContext& ctx, Id a, Id b) { } Id EmitFPMul16(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { - return ctx.OpFMul(ctx.F16[1], a, b); + return Decorate(ctx, inst, ctx.OpFMul(ctx.F16[1], a, b)); } Id EmitFPMul32(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { - return ctx.OpFMul(ctx.F32[1], a, b); + return Decorate(ctx, inst, ctx.OpFMul(ctx.F32[1], a, b)); } Id EmitFPMul64(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { - return ctx.OpFMul(ctx.F64[1], a, b); + return Decorate(ctx, inst, ctx.OpFMul(ctx.F64[1], a, b)); } Id EmitFPNeg16(EmitContext& ctx, Id value) { @@ -98,6 +103,10 @@ Id EmitFPExp2(EmitContext& ctx, Id value) { return ctx.OpExp2(ctx.F32[1], value); } +Id EmitFPLdexp(EmitContext& ctx, Id value, Id exp) { + return ctx.OpLdexp(ctx.F32[1], value, exp); +} + Id EmitFPLog2(EmitContext& ctx, Id value) { return ctx.OpLog2(ctx.F32[1], value); } @@ -360,4 +369,12 @@ Id EmitFPIsNan64(EmitContext& ctx, Id value) { return ctx.OpIsNan(ctx.U1[1], value); } +Id EmitFPIsInf32(EmitContext& ctx, Id value) { + return ctx.OpIsInf(ctx.U1[1], value); +} + +Id EmitFPIsInf64(EmitContext& ctx, Id value) { + return ctx.OpIsInf(ctx.U1[1], value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 7a54f31c..2d35b97c 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -79,10 +79,12 @@ Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id of Id ms) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); + const Id result_type = texture.data_types->Get(4); if (Sirit::ValidId(lod)) { - return ctx.OpImageFetch(ctx.F32[4], image, coords, spv::ImageOperandsMask::Lod, lod); + return ctx.OpBitcast(ctx.F32[4], ctx.OpImageFetch(result_type, image, coords, + spv::ImageOperandsMask::Lod, lod)); } else { - return ctx.OpImageFetch(ctx.F32[4], image, coords); + return ctx.OpBitcast(ctx.F32[4], ctx.OpImageFetch(result_type, image, coords)); } } @@ -134,7 +136,8 @@ Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id co void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id color) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); - ctx.OpImageWrite(image, ctx.OpBitcast(ctx.S32[2], coords), color); + const Id color_type = texture.data_types->Get(4); + ctx.OpImageWrite(image, coords, ctx.OpBitcast(color_type, color)); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 246d7c44..acbaf996 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -33,16 +33,21 @@ void EmitDeviceMemoryBarrier(EmitContext& ctx); void EmitGetScc(EmitContext& ctx); void EmitGetExec(EmitContext& ctx); void EmitGetVcc(EmitContext& ctx); +void EmitGetSccLo(EmitContext& ctx); void EmitGetVccLo(EmitContext& ctx); void EmitGetVccHi(EmitContext& ctx); void EmitSetScc(EmitContext& ctx); void EmitSetExec(EmitContext& ctx); void EmitSetVcc(EmitContext& ctx); +void 
EmitSetSccLo(EmitContext& ctx); void EmitSetVccLo(EmitContext& ctx); void EmitSetVccHi(EmitContext& ctx); void EmitPrologue(EmitContext& ctx); void EmitEpilogue(EmitContext& ctx); void EmitDiscard(EmitContext& ctx); +void EmitBarrier(EmitContext& ctx); +void EmitWorkgroupMemoryBarrier(EmitContext& ctx); +void EmitDeviceMemoryBarrier(EmitContext& ctx); Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg); void EmitGetThreadBitScalarReg(EmitContext& ctx); void EmitSetThreadBitScalarReg(EmitContext& ctx); @@ -82,12 +87,13 @@ Id EmitUndefU8(EmitContext& ctx); Id EmitUndefU16(EmitContext& ctx); Id EmitUndefU32(EmitContext& ctx); Id EmitUndefU64(EmitContext& ctx); -Id EmitReadSharedU8(EmitContext& ctx, Id offset); -Id EmitReadSharedS8(EmitContext& ctx, Id offset); -Id EmitReadSharedU16(EmitContext& ctx, Id offset); -Id EmitReadSharedS16(EmitContext& ctx, Id offset); -Id EmitReadSharedU32(EmitContext& ctx, Id offset); -Id EmitReadSharedU64(EmitContext& ctx, Id offset); +Id EmitLoadSharedU8(EmitContext& ctx, Id offset); +Id EmitLoadSharedS8(EmitContext& ctx, Id offset); +Id EmitLoadSharedU16(EmitContext& ctx, Id offset); +Id EmitLoadSharedS16(EmitContext& ctx, Id offset); +Id EmitLoadSharedU32(EmitContext& ctx, Id offset); +Id EmitLoadSharedU64(EmitContext& ctx, Id offset); +Id EmitLoadSharedU128(EmitContext& ctx, Id offset); void EmitWriteSharedU8(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value); @@ -140,7 +146,7 @@ Id EmitSelectF64(EmitContext& ctx, Id cond, Id true_value, Id false_value); void EmitBitCastU16F16(EmitContext& ctx); Id EmitBitCastU32F32(EmitContext& ctx, Id value); void EmitBitCastU64F64(EmitContext& ctx); -void EmitBitCastF16U16(EmitContext&); +Id EmitBitCastF16U16(EmitContext& ctx, Id value); Id EmitBitCastF32U32(EmitContext& ctx, Id value); void EmitBitCastF64U64(EmitContext& ctx); Id EmitPackUint2x32(EmitContext& ctx, Id value); @@ -172,6 +178,7 @@ Id EmitFPNeg64(EmitContext& ctx, Id value); Id EmitFPSin(EmitContext& ctx, Id value); Id EmitFPCos(EmitContext& ctx, Id value); Id EmitFPExp2(EmitContext& ctx, Id value); +Id EmitFPLdexp(EmitContext& ctx, Id value, Id exp); Id EmitFPLog2(EmitContext& ctx, Id value); Id EmitFPRecip32(EmitContext& ctx, Id value); Id EmitFPRecip64(EmitContext& ctx, Id value); @@ -236,8 +243,11 @@ Id EmitFPUnordGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs); Id EmitFPIsNan16(EmitContext& ctx, Id value); Id EmitFPIsNan32(EmitContext& ctx, Id value); Id EmitFPIsNan64(EmitContext& ctx, Id value); +Id EmitFPIsInf32(EmitContext& ctx, Id value); +Id EmitFPIsInf64(EmitContext& ctx, Id value); Id EmitIAdd32(EmitContext& ctx, IR::Inst* inst, Id a, Id b); Id EmitIAdd64(EmitContext& ctx, Id a, Id b); +Id EmitIAddCary32(EmitContext& ctx, Id a, Id b); Id EmitISub32(EmitContext& ctx, Id a, Id b); Id EmitISub64(EmitContext& ctx, Id a, Id b); Id EmitSMulExt(EmitContext& ctx, Id a, Id b); @@ -333,6 +343,7 @@ Id EmitConvertF64U8(EmitContext& ctx, Id value); Id EmitConvertF64U16(EmitContext& ctx, Id value); Id EmitConvertF64U32(EmitContext& ctx, Id value); Id EmitConvertF64U64(EmitContext& ctx, Id value); +Id EmitConvertU16U32(EmitContext& ctx, Id value); Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, Id offset); @@ -355,6 +366,18 @@ Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, I Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id 
coords); void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id color); +Id EmitImageAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicUMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); +Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); + Id EmitLaneId(EmitContext& ctx); Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index 1d52a3ed..d5a0f276 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -60,6 +60,10 @@ Id EmitIAdd64(EmitContext& ctx, Id a, Id b) { return ctx.OpIAdd(ctx.U64, a, b); } +Id EmitIAddCary32(EmitContext& ctx, Id a, Id b) { + return ctx.OpIAddCarry(ctx.full_result_u32x2, a, b); +} + Id EmitISub32(EmitContext& ctx, Id a, Id b) { return ctx.OpISub(ctx.U32[1], a, b); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp new file mode 100644 index 00000000..1582d9dd --- /dev/null +++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp @@ -0,0 +1,165 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#include "shader_recompiler/backend/spirv/spirv_emit_context.h" + +namespace Shader::Backend::SPIRV { +namespace { +Id Pointer(EmitContext& ctx, Id pointer_type, Id array, Id offset, u32 shift) { + const Id shift_id{ctx.ConstU32(shift)}; + const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + return ctx.OpAccessChain(pointer_type, array, ctx.u32_zero_value, index); +} + +Id Word(EmitContext& ctx, Id offset) { + const Id shift_id{ctx.ConstU32(2U)}; + const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + return ctx.OpLoad(ctx.U32[1], pointer); +} + +std::pair ExtractArgs(EmitContext& ctx, Id offset, u32 mask, u32 count) { + const Id shift{ctx.OpShiftLeftLogical(ctx.U32[1], offset, ctx.ConstU32(3U))}; + const Id bit{ctx.OpBitwiseAnd(ctx.U32[1], shift, ctx.ConstU32(mask))}; + const Id count_id{ctx.ConstU32(count)}; + return {bit, count_id}; +} +} // Anonymous namespace + +Id EmitLoadSharedU8(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; + return ctx.OpUConvert(ctx.U32[1], 
ctx.OpLoad(ctx.U8, pointer)); + } else { + const auto [bit, count]{ExtractArgs(ctx, offset, 24, 8)}; + return ctx.OpBitFieldUExtract(ctx.U32[1], Word(ctx, offset), bit, count); + } +} + +Id EmitLoadSharedS8(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; + return ctx.OpSConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, pointer)); + } else { + const auto [bit, count]{ExtractArgs(ctx, offset, 24, 8)}; + return ctx.OpBitFieldSExtract(ctx.U32[1], Word(ctx, offset), bit, count); + } +} + +Id EmitLoadSharedU16(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; + return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); + } else { + const auto [bit, count]{ExtractArgs(ctx, offset, 16, 16)}; + return ctx.OpBitFieldUExtract(ctx.U32[1], Word(ctx, offset), bit, count); + } +} + +Id EmitLoadSharedS16(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; + return ctx.OpSConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); + } else { + const auto [bit, count]{ExtractArgs(ctx, offset, 16, 16)}; + return ctx.OpBitFieldSExtract(ctx.U32[1], Word(ctx, offset), bit, count); + } +} + +Id EmitLoadSharedU32(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32, ctx.shared_memory_u32, offset, 2)}; + return ctx.OpLoad(ctx.U32[1], pointer); + } else { + return Word(ctx, offset); + } +} + +Id EmitLoadSharedU64(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32x2, ctx.shared_memory_u32x2, offset, 3)}; + return ctx.OpLoad(ctx.U32[2], pointer); + } else { + const Id shift_id{ctx.ConstU32(2U)}; + const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))}; + const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)}; + const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)}; + return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer), + ctx.OpLoad(ctx.U32[1], rhs_pointer)); + } +} + +Id EmitLoadSharedU128(EmitContext& ctx, Id offset) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32x4, ctx.shared_memory_u32x4, offset, 4)}; + return ctx.OpLoad(ctx.U32[4], pointer); + } + const Id shift_id{ctx.ConstU32(2U)}; + const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + std::array values{}; + for (u32 i = 0; i < 4; ++i) { + const Id index{i == 0 ? 
base_index : ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(i))}; + const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + values[i] = ctx.OpLoad(ctx.U32[1], pointer); + } + return ctx.OpCompositeConstruct(ctx.U32[4], values); +} + +void EmitWriteSharedU8(EmitContext& ctx, Id offset, Id value) { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; + ctx.OpStore(pointer, ctx.OpUConvert(ctx.U8, value)); +} + +void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) { + const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; + ctx.OpStore(pointer, ctx.OpUConvert(ctx.U16, value)); +} + +void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) { + Id pointer{}; + if (ctx.profile.support_explicit_workgroup_layout) { + pointer = Pointer(ctx, ctx.shared_u32, ctx.shared_memory_u32, offset, 2); + } else { + const Id shift{ctx.ConstU32(2U)}; + const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; + pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset); + } + ctx.OpStore(pointer, value); +} + +void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32x2, ctx.shared_memory_u32x2, offset, 3)}; + ctx.OpStore(pointer, value); + return; + } + const Id shift{ctx.ConstU32(2U)}; + const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; + const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))}; + const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)}; + const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)}; + ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U)); + ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U)); +} + +void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value) { + if (ctx.profile.support_explicit_workgroup_layout) { + const Id pointer{Pointer(ctx, ctx.shared_u32x4, ctx.shared_memory_u32x4, offset, 4)}; + ctx.OpStore(pointer, value); + return; + } + const Id shift{ctx.ConstU32(2U)}; + const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; + for (u32 i = 0; i < 4; ++i) { + const Id index{i == 0 ? 
base_index : ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(i))}; + const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + ctx.OpStore(pointer, ctx.OpCompositeExtract(ctx.U32[1], value, i)); + } +} + +} // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 2fe6ae2c..b4a67ebf 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -3,6 +3,7 @@ #include #include +#include "common/div_ceil.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { @@ -41,8 +42,9 @@ EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& bin AddCapability(spv::Capability::Shader); DefineArithmeticTypes(); DefineInterfaces(program); - DefineBuffers(program.info); - DefineImagesAndSamplers(program.info); + DefineBuffers(info); + DefineImagesAndSamplers(info); + DefineSharedMemory(info); } EmitContext::~EmitContext() = default; @@ -72,19 +74,19 @@ Id EmitContext::Def(const IR::Value& value) { void EmitContext::DefineArithmeticTypes() { void_id = Name(TypeVoid(), "void_id"); U1[1] = Name(TypeBool(), "bool_id"); - // F16[1] = Name(TypeFloat(16), "f16_id"); + F16[1] = Name(TypeFloat(16), "f16_id"); F32[1] = Name(TypeFloat(32), "f32_id"); // F64[1] = Name(TypeFloat(64), "f64_id"); S32[1] = Name(TypeSInt(32), "i32_id"); U32[1] = Name(TypeUInt(32), "u32_id"); // U8 = Name(TypeSInt(8), "u8"); // S8 = Name(TypeUInt(8), "s8"); - // U16 = Name(TypeUInt(16), "u16_id"); + U16 = Name(TypeUInt(16), "u16_id"); // S16 = Name(TypeSInt(16), "s16_id"); // U64 = Name(TypeUInt(64), "u64_id"); for (u32 i = 2; i <= 4; i++) { - // F16[i] = Name(TypeVector(F16[1], i), fmt::format("f16vec{}_id", i)); + F16[i] = Name(TypeVector(F16[1], i), fmt::format("f16vec{}_id", i)); F32[i] = Name(TypeVector(F32[1], i), fmt::format("f32vec{}_id", i)); // F64[i] = Name(TypeVector(F64[1], i), fmt::format("f64vec{}_id", i)); S32[i] = Name(TypeVector(S32[1], i), fmt::format("i32vec{}_id", i)); @@ -222,8 +224,17 @@ void EmitContext::DefineInputs(const Info& info) { void EmitContext::DefineOutputs(const Info& info) { switch (stage) { - case Stage::Vertex: + case Stage::Vertex: { output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); + const std::array zero{f32_zero_value, f32_zero_value, f32_zero_value, + f32_zero_value, f32_zero_value, f32_zero_value, + f32_zero_value, f32_zero_value}; + const Id type{TypeArray(F32[1], ConstU32(8U))}; + const Id initializer{ConstantComposite(type, zero)}; + clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output, + initializer); + cull_distances = DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output, + initializer); for (u32 i = 0; i < IR::NumParams; i++) { const IR::Attribute param{IR::Attribute::Param0 + i}; if (!info.stores.GetAny(param)) { @@ -236,6 +247,7 @@ void EmitContext::DefineOutputs(const Info& info) { interfaces.push_back(id); } break; + } case Stage::Fragment: for (u32 i = 0; i < IR::NumRenderTargets; i++) { const IR::Attribute mrt{IR::Attribute::RenderTarget0 + i}; @@ -294,8 +306,49 @@ void EmitContext::DefineBuffers(const Info& info) { } } +spv::ImageFormat GetFormat(const AmdGpu::Image& image) { + if (image.GetDataFmt() == AmdGpu::DataFormat::Format32 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Uint) { + return 
spv::ImageFormat::R32ui; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format32 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::R32f; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format32_32 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::Rg32f; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format16 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::R16f; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format16_16 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::Rg16f; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format8_8 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Unorm) { + return spv::ImageFormat::Rg8; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format16_16_16_16 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::Rgba16f; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format8 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Unorm) { + return spv::ImageFormat::R8; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format8_8_8_8 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Unorm) { + return spv::ImageFormat::Rgba8; + } + UNREACHABLE(); +} + Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { - const auto format = spv::ImageFormat::Unknown; + const auto image = ctx.info.ReadUd(desc.sgpr_base, desc.dword_offset); + const auto format = desc.is_storage ? GetFormat(image) : spv::ImageFormat::Unknown; const u32 sampled = desc.is_storage ? 2 : 1; switch (desc.type) { case AmdGpu::ImageType::Color1D: @@ -320,7 +373,17 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { void EmitContext::DefineImagesAndSamplers(const Info& info) { for (const auto& image_desc : info.images) { - const Id sampled_type{image_desc.nfmt == AmdGpu::NumberFormat::Uint ? U32[1] : F32[1]}; + const VectorIds* data_types = [&] { + switch (image_desc.nfmt) { + case AmdGpu::NumberFormat::Uint: + return &U32; + case AmdGpu::NumberFormat::Sint: + return &S32; + default: + return &F32; + } + }(); + const Id sampled_type = data_types->Get(1); const Id image_type{ImageType(*this, image_desc, sampled_type)}; const Id pointer_type{TypePointer(spv::StorageClass::UniformConstant, image_type)}; const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)}; @@ -330,6 +393,7 @@ void EmitContext::DefineImagesAndSamplers(const Info& info) { image_desc.dword_offset)); images.push_back({ .id = id, + .data_types = data_types, .sampled_type = image_desc.is_storage ? 
sampled_type : TypeSampledImage(image_type), .pointer_type = pointer_type, .image_type = image_type, @@ -338,6 +402,8 @@ void EmitContext::DefineImagesAndSamplers(const Info& info) { ++binding; } + image_u32 = TypePointer(spv::StorageClass::Image, U32[1]); + if (info.samplers.empty()) { return; } @@ -356,4 +422,50 @@ void EmitContext::DefineImagesAndSamplers(const Info& info) { } } +void EmitContext::DefineSharedMemory(const Info& info) { + if (info.shared_memory_size == 0) { + return; + } + const auto make{[&](Id element_type, u32 element_size) { + const u32 num_elements{Common::DivCeil(info.shared_memory_size, element_size)}; + const Id array_type{TypeArray(element_type, ConstU32(num_elements))}; + Decorate(array_type, spv::Decoration::ArrayStride, element_size); + + const Id struct_type{TypeStruct(array_type)}; + MemberDecorate(struct_type, 0U, spv::Decoration::Offset, 0U); + Decorate(struct_type, spv::Decoration::Block); + + const Id pointer{TypePointer(spv::StorageClass::Workgroup, struct_type)}; + const Id element_pointer{TypePointer(spv::StorageClass::Workgroup, element_type)}; + const Id variable{AddGlobalVariable(pointer, spv::StorageClass::Workgroup)}; + Decorate(variable, spv::Decoration::Aliased); + interfaces.push_back(variable); + + return std::make_tuple(variable, element_pointer, pointer); + }}; + if (profile.support_explicit_workgroup_layout) { + AddExtension("SPV_KHR_workgroup_memory_explicit_layout"); + AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR); + if (info.uses_shared_u8) { + AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout8BitAccessKHR); + std::tie(shared_memory_u8, shared_u8, std::ignore) = make(U8, 1); + } + if (info.uses_shared_u16) { + AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR); + std::tie(shared_memory_u16, shared_u16, std::ignore) = make(U16, 2); + } + std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make(U32[1], 4); + std::tie(shared_memory_u32x2, shared_u32x2, std::ignore) = make(U32[2], 8); + std::tie(shared_memory_u32x4, shared_u32x4, std::ignore) = make(U32[4], 16); + return; + } + const u32 num_elements{Common::DivCeil(info.shared_memory_size, 4U)}; + const Id type{TypeArray(U32[1], ConstU32(num_elements))}; + shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type); + + shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]); + shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup); + interfaces.push_back(shared_memory_u32); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 941b3d30..d143be4b 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -66,15 +66,17 @@ public: } template - [[nodiscard]] Id DefineVar(Id type, spv::StorageClass storage_class) { + [[nodiscard]] Id DefineVar(Id type, spv::StorageClass storage_class, + std::optional initializer = std::nullopt) { const Id pointer_type_id{TypePointer(storage_class, type)}; - return global ? AddGlobalVariable(pointer_type_id, storage_class) - : AddLocalVariable(pointer_type_id, storage_class); + return global ? 
AddGlobalVariable(pointer_type_id, storage_class, initializer) + : AddLocalVariable(pointer_type_id, storage_class, initializer); } [[nodiscard]] Id DefineVariable(Id type, std::optional builtin, - spv::StorageClass storage_class) { - const Id id{DefineVar(type, storage_class)}; + spv::StorageClass storage_class, + std::optional initializer = std::nullopt) { + const Id id{DefineVar(type, storage_class, initializer)}; if (builtin) { Decorate(id, spv::Decoration::BuiltIn, *builtin); } @@ -147,6 +149,12 @@ public: Id u32_zero_value{}; Id f32_zero_value{}; + Id shared_u8{}; + Id shared_u16{}; + Id shared_u32{}; + Id shared_u32x2{}; + Id shared_u32x4{}; + Id input_u32{}; Id input_f32{}; Id input_s32{}; @@ -163,13 +171,25 @@ public: Id frag_depth{}; std::array frag_color{}; std::array frag_num_comp{}; + Id clip_distances{}; + Id cull_distances{}; Id workgroup_id{}; Id local_invocation_id{}; Id subgroup_local_invocation_id{}; + Id image_u32{}; + + Id shared_memory_u8{}; + Id shared_memory_u16{}; + Id shared_memory_u32{}; + Id shared_memory_u32x2{}; + Id shared_memory_u32x4{}; + + Id shared_memory_u32_type{}; struct TextureDefinition { Id id; + const VectorIds* data_types; Id sampled_type; Id pointer_type; Id image_type; @@ -205,6 +225,7 @@ private: void DefineOutputs(const Info& info); void DefineBuffers(const Info& info); void DefineImagesAndSamplers(const Info& info); + void DefineSharedMemory(const Info& info); SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id); }; diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp index 893df1e9..03af1515 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.cpp +++ b/src/shader_recompiler/frontend/control_flow_graph.cpp @@ -149,9 +149,15 @@ void CFG::LinkBlocks() { block.end_class = EndClass::Branch; } else if (end_inst.opcode == Opcode::S_ENDPGM) { const auto& prev_inst = inst_list[block.end_index - 1]; - if (prev_inst.opcode == Opcode::EXP && prev_inst.control.exp.en == 0 && - prev_inst.control.exp.target != 9) { - block.end_class = EndClass::Kill; + if (prev_inst.opcode == Opcode::EXP && prev_inst.control.exp.en == 0) { + if (prev_inst.control.exp.target != 9) { + block.end_class = EndClass::Kill; + } else if (const auto& exec_mask = inst_list[block.end_index - 2]; + exec_mask.src[0].field == OperandField::ConstZero) { + block.end_class = EndClass::Kill; + } else { + block.end_class = EndClass::Exit; + } } else { block.end_class = EndClass::Exit; } diff --git a/src/shader_recompiler/frontend/fetch_shader.cpp b/src/shader_recompiler/frontend/fetch_shader.cpp index 80917e0a..11567c1f 100644 --- a/src/shader_recompiler/frontend/fetch_shader.cpp +++ b/src/shader_recompiler/frontend/fetch_shader.cpp @@ -32,7 +32,7 @@ namespace Shader::Gcn { * We take the reverse way, extract the original input semantics from these instructions. 
**/ -std::vector ParseFetchShader(const u32* code) { +std::vector ParseFetchShader(const u32* code, u32* out_size) { std::vector attributes; GcnCodeSlice code_slice(code, code + std::numeric_limits::max()); GcnDecodeContext decoder; @@ -47,6 +47,8 @@ std::vector ParseFetchShader(const u32* code) { u32 semantic_index = 0; while (!code_slice.atEnd()) { const auto inst = decoder.decodeInstruction(code_slice); + *out_size += inst.length; + if (inst.opcode == Opcode::S_SETPC_B64) { break; } diff --git a/src/shader_recompiler/frontend/fetch_shader.h b/src/shader_recompiler/frontend/fetch_shader.h index 14f2bf4d..0858061a 100644 --- a/src/shader_recompiler/frontend/fetch_shader.h +++ b/src/shader_recompiler/frontend/fetch_shader.h @@ -17,6 +17,6 @@ struct VertexAttribute { u8 instance_data; ///< Indicates that the buffer will be accessed in instance rate }; -std::vector ParseFetchShader(const u32* code); +std::vector ParseFetchShader(const u32* code, u32* out_size); } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/format.cpp b/src/shader_recompiler/frontend/format.cpp index 91417d5b..634566fa 100644 --- a/src/shader_recompiler/frontend/format.cpp +++ b/src/shader_recompiler/frontend/format.cpp @@ -3429,48 +3429,48 @@ constexpr std::array InstructionFormatMIMG = {{ {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, ScalarType::Undefined}, // 17 = IMAGE_ATOMIC_ADD - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, // 18 = IMAGE_ATOMIC_SUB - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, {}, // 20 = IMAGE_ATOMIC_SMIN - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Sint32, + ScalarType::Sint32}, // 21 = IMAGE_ATOMIC_UMIN - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, // 22 = IMAGE_ATOMIC_SMAX - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Sint32, + ScalarType::Sint32}, // 23 = IMAGE_ATOMIC_UMAX - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, // 24 = IMAGE_ATOMIC_AND - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, // 25 = IMAGE_ATOMIC_OR - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, // 26 = IMAGE_ATOMIC_XOR - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - 
ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, // 27 = IMAGE_ATOMIC_INC - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, // 28 = IMAGE_ATOMIC_DEC - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32, + ScalarType::Uint32}, // 29 = IMAGE_ATOMIC_FCMPSWAP - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Float32, + ScalarType::Float32}, // 30 = IMAGE_ATOMIC_FMIN - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Float32, + ScalarType::Float32}, // 31 = IMAGE_ATOMIC_FMAX - {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Float32, + ScalarType::Float32}, // 32 = IMAGE_SAMPLE {InstClass::VectorMemImgSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Float32, ScalarType::Float32}, diff --git a/src/shader_recompiler/frontend/structured_control_flow.cpp b/src/shader_recompiler/frontend/structured_control_flow.cpp index df9fe8b6..6d78448b 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.cpp +++ b/src/shader_recompiler/frontend/structured_control_flow.cpp @@ -187,7 +187,7 @@ std::string DumpExpr(const Statement* stmt) { case StatementType::Not: case StatementType::Or: case StatementType::Variable: - throw LogicError("Statement can't be printed"); + UNREACHABLE_MSG("Statement can't be printed"); } } return ret; @@ -335,7 +335,7 @@ private: } // Expensive operation: if (!AreSiblings(goto_stmt, label_stmt)) { - throw LogicError("Goto is not a sibling with the label"); + UNREACHABLE_MSG("Goto is not a sibling with the label"); } // goto_stmt and label_stmt are guaranteed to be siblings, eliminate if (std::next(goto_stmt) == label_stmt) { @@ -451,7 +451,7 @@ private: case StatementType::Loop: return MoveOutwardLoop(goto_stmt); default: - throw LogicError("Invalid outward movement"); + UNREACHABLE_MSG("Invalid outward movement"); } } @@ -486,7 +486,7 @@ private: case StatementType::Loop: break; default: - throw LogicError("Invalid inward movement"); + UNREACHABLE_MSG("Invalid inward movement"); } Tree& nested_tree{label_nested_stmt->children}; Statement* const new_goto{pool.Create(Goto{}, variable, label, &*label_nested_stmt)}; @@ -633,7 +633,8 @@ private: if (!stmt.block->is_dummy) { const u32 start = stmt.block->begin_index; const u32 size = stmt.block->end_index - start + 1; - Translate(current_block, inst_list.subspan(start, size), info); + Translate(current_block, stmt.block->begin, inst_list.subspan(start, size), + info); } break; } diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 99883015..c5d9f0ec 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -22,16 +22,18 @@ void 
Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; const IR::VectorReg dst_reg{inst.dst[0].code}; if (is_pair) { + // Pair loads are either 32 or 64-bit. We assume 32-bit for now. + ASSERT(bit_size == 32); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); - ir.SetVectorReg(dst_reg, ir.ReadShared(32, is_signed, addr0)); + ir.SetVectorReg(dst_reg, IR::U32{ir.LoadShared(32, is_signed, addr0)}); const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); - ir.SetVectorReg(dst_reg + 1, ir.ReadShared(32, is_signed, addr1)); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.LoadShared(32, is_signed, addr1)}); } else if (bit_size == 64) { - const IR::Value data = ir.UnpackUint2x32(ir.ReadShared(bit_size, is_signed, addr)); + const IR::Value data = ir.LoadShared(bit_size, is_signed, addr); ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)}); ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)}); } else { - const IR::U32 data = ir.ReadShared(bit_size, is_signed, addr); + const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr)}; ir.SetVectorReg(dst_reg, data); } } @@ -41,17 +43,26 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; if (is_pair) { + ASSERT(bit_size == 32); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); ir.WriteShared(32, ir.GetVectorReg(data0), addr0); const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); ir.WriteShared(32, ir.GetVectorReg(data1), addr1); } else if (bit_size == 64) { - const IR::U64 data = ir.PackUint2x32( - ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1))); + const IR::Value data = + ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); ir.WriteShared(bit_size, data, addr); } else { ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr); } } +void Translator::S_BARRIER() { + ir.Barrier(); +} + +void Translator::V_READFIRSTLANE_B32(const GcnInst& inst) { + UNREACHABLE(); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 1b762c3a..03b4af7e 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -318,4 +318,16 @@ void Translator::S_SUB_U32(const GcnInst& inst) { ir.SetScc(ir.Imm1(false)); } +void Translator::S_GETPC_B64(u32 pc, const GcnInst& inst) { + // This only really exists to let resource tracking pass know + // there is an inline cbuf. 
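+    // `pc` is the byte offset of the instruction following S_GETPC_B64, matching the
+    // hardware semantics. Keeping it as an immediate means both operands of the
+    // subsequent s_add_u32 stay constant, which is what lets the resource tracking
+    // pass rebuild the absolute address as info.pgm_base + pc + literal.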
+ SetDst(inst.dst[0], ir.Imm32(pc)); +} + +void Translator::S_ADDC_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), ir.GetSccLo())); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index 2cf5c5b2..3c80764c 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -5,20 +5,29 @@ namespace Shader::Gcn { +static constexpr u32 SQ_SRC_LITERAL = 0xFF; + void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) { const auto& smrd = inst.control.smrd; - ASSERT_MSG(smrd.imm, "Bindless texture loads unsupported"); + const u32 dword_offset = [&] -> u32 { + if (smrd.imm) { + return smrd.offset; + } + if (smrd.offset == SQ_SRC_LITERAL) { + return inst.src[1].code; + } + UNREACHABLE(); + }(); const IR::ScalarReg sbase{inst.src[0].code * 2}; const IR::Value base = ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1)); IR::ScalarReg dst_reg{inst.dst[0].code}; for (u32 i = 0; i < num_dwords; i++) { - ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(smrd.offset + i))); + ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(dword_offset + i))); } } void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { - static constexpr u32 SQ_SRC_LITERAL = 0xFF; const auto& smrd = inst.control.smrd; const IR::ScalarReg sbase{inst.src[0].code * 2}; const IR::U32 dword_offset = [&] -> IR::U32 { @@ -30,7 +39,9 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { } return ir.ShiftRightLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), ir.Imm32(2)); }(); - const IR::Value vsharp = ir.GetScalarReg(sbase); + const IR::Value vsharp = + ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1), + ir.GetScalarReg(sbase + 2), ir.GetScalarReg(sbase + 3)); IR::ScalarReg dst_reg{inst.dst[0].code}; for (u32 i = 0; i < num_dwords; i++) { const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i)); diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 407ee399..3d0857c0 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -1,6 +1,9 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/config.h" +#include "common/io_file.h" +#include "common/path_util.h" #include "shader_recompiler/exception.h" #include "shader_recompiler/frontend/fetch_shader.h" #include "shader_recompiler/frontend/translate/translate.h" @@ -190,7 +193,20 @@ void Translator::EmitFetch(const GcnInst& inst) { std::memcpy(&code, &info.user_data[sgpr_base], sizeof(code)); // Parse the assembly to generate a list of attributes. 
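// ParseFetchShader now also reports how many bytes of code it consumed, so when
// shader dumping is enabled the raw fetch stub can be written out below as
// vs_fetch_<hash>.bin for offline inspection.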
- const auto attribs = ParseFetchShader(code); + u32 fetch_size{}; + const auto attribs = ParseFetchShader(code, &fetch_size); + + if (Config::dumpShaders()) { + using namespace Common::FS; + const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; + if (!std::filesystem::exists(dump_dir)) { + std::filesystem::create_directories(dump_dir); + } + const auto filename = fmt::format("vs_fetch_{:#018x}.bin", info.pgm_hash); + const auto file = IOFile{dump_dir / filename, FileAccessMode::Write}; + file.WriteRaw(code, fetch_size); + } + for (const auto& attrib : attribs) { const IR::Attribute attr{IR::Attribute::Param0 + attrib.semantic}; IR::VectorReg dst_reg{attrib.dest_vgpr}; @@ -224,9 +240,9 @@ void Translator::EmitFetch(const GcnInst& inst) { attrib.instance_data); } - const u32 num_components = AmdGpu::NumComponents(buffer.data_format); + const u32 num_components = AmdGpu::NumComponents(buffer.GetDataFmt()); info.vs_inputs.push_back({ - .fmt = buffer.num_format, + .fmt = buffer.GetNumberFmt(), .binding = attrib.semantic, .num_components = std::min(attrib.num_elements, num_components), .sgpr_base = attrib.sgpr_base, @@ -236,12 +252,13 @@ void Translator::EmitFetch(const GcnInst& inst) { } } -void Translate(IR::Block* block, std::span inst_list, Info& info) { +void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info) { if (inst_list.empty()) { return; } Translator translator{block, info}; for (const auto& inst : inst_list) { + block_base += inst.length; switch (inst.opcode) { case Opcode::S_MOVK_I32: translator.S_MOVK(inst); @@ -345,6 +362,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_BFREV_B32: translator.V_BFREV_B32(inst); break; + case Opcode::V_LDEXP_F32: + translator.V_LDEXP_F32(inst); + break; case Opcode::V_FRACT_F32: translator.V_FRACT_F32(inst); break; @@ -374,8 +394,40 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::IMAGE_SAMPLE_LZ: case Opcode::IMAGE_SAMPLE: case Opcode::IMAGE_SAMPLE_L: + case Opcode::IMAGE_SAMPLE_C_O: + case Opcode::IMAGE_SAMPLE_B: translator.IMAGE_SAMPLE(inst); break; + case Opcode::IMAGE_ATOMIC_ADD: + translator.IMAGE_ATOMIC(AtomicOp::Add, inst); + break; + case Opcode::IMAGE_ATOMIC_AND: + translator.IMAGE_ATOMIC(AtomicOp::And, inst); + break; + case Opcode::IMAGE_ATOMIC_OR: + translator.IMAGE_ATOMIC(AtomicOp::Or, inst); + break; + case Opcode::IMAGE_ATOMIC_XOR: + translator.IMAGE_ATOMIC(AtomicOp::Xor, inst); + break; + case Opcode::IMAGE_ATOMIC_UMAX: + translator.IMAGE_ATOMIC(AtomicOp::Umax, inst); + break; + case Opcode::IMAGE_ATOMIC_SMAX: + translator.IMAGE_ATOMIC(AtomicOp::Smax, inst); + break; + case Opcode::IMAGE_ATOMIC_UMIN: + translator.IMAGE_ATOMIC(AtomicOp::Umin, inst); + break; + case Opcode::IMAGE_ATOMIC_SMIN: + translator.IMAGE_ATOMIC(AtomicOp::Smin, inst); + break; + case Opcode::IMAGE_ATOMIC_INC: + translator.IMAGE_ATOMIC(AtomicOp::Inc, inst); + break; + case Opcode::IMAGE_ATOMIC_DEC: + translator.IMAGE_ATOMIC(AtomicOp::Dec, inst); + break; case Opcode::IMAGE_GET_LOD: translator.IMAGE_GET_LOD(inst); break; @@ -457,9 +509,15 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_CMP_NGT_F32: translator.V_CMP_F32(ConditionOp::LE, false, inst); break; + case Opcode::V_CMP_NGE_F32: + translator.V_CMP_F32(ConditionOp::LT, false, inst); + break; case Opcode::S_CMP_LT_U32: translator.S_CMP(ConditionOp::LT, false, inst); break; + case Opcode::S_CMP_LE_U32: + translator.S_CMP(ConditionOp::LE, false, inst); + break; case 
Opcode::S_CMP_LG_U32: translator.S_CMP(ConditionOp::LG, false, inst); break; @@ -487,6 +545,12 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_CNDMASK_B32: translator.V_CNDMASK_B32(inst); break; + case Opcode::TBUFFER_LOAD_FORMAT_X: + translator.BUFFER_LOAD_FORMAT(1, true, inst); + break; + case Opcode::TBUFFER_LOAD_FORMAT_XY: + translator.BUFFER_LOAD_FORMAT(2, true, inst); + break; case Opcode::TBUFFER_LOAD_FORMAT_XYZ: translator.BUFFER_LOAD_FORMAT(3, true, inst); break; @@ -581,6 +645,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_CVT_I32_F32: translator.V_CVT_I32_F32(inst); break; + case Opcode::V_CVT_FLR_I32_F32: + translator.V_CVT_FLR_I32_F32(inst); + break; case Opcode::V_SUBREV_F32: translator.V_SUBREV_F32(inst); break; @@ -715,6 +782,7 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) translator.V_MAD_I32_I24(inst); break; case Opcode::V_MUL_I32_I24: + case Opcode::V_MUL_U32_U24: translator.V_MUL_I32_I24(inst); break; case Opcode::V_SUB_I32: @@ -771,6 +839,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_CMP_NE_U64: translator.V_CMP_NE_U64(inst); break; + case Opcode::V_CMP_CLASS_F32: + translator.V_CMP_CLASS_F32(inst); + break; case Opcode::V_TRUNC_F32: translator.V_TRUNC_F32(inst); break; @@ -786,7 +857,11 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_ADD_U32: translator.S_ADD_U32(inst); break; + case Opcode::S_ADDC_U32: + translator.S_ADDC_U32(inst); + break; case Opcode::S_SUB_U32: + case Opcode::S_SUB_I32: translator.S_SUB_U32(inst); break; // TODO: Separate implementation for legacy variants. @@ -809,9 +884,30 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::IMAGE_GET_RESINFO: translator.IMAGE_GET_RESINFO(inst); break; + case Opcode::S_BARRIER: + translator.S_BARRIER(); + break; case Opcode::S_TTRACEDATA: LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!"); break; + case Opcode::DS_READ_B32: + translator.DS_READ(32, false, false, inst); + break; + case Opcode::DS_READ2_B32: + translator.DS_READ(32, false, true, inst); + break; + case Opcode::DS_WRITE_B32: + translator.DS_WRITE(32, false, false, inst); + break; + case Opcode::DS_WRITE2_B32: + translator.DS_WRITE(32, false, true, inst); + break; + case Opcode::V_READFIRSTLANE_B32: + translator.V_READFIRSTLANE_B32(inst); + break; + case Opcode::S_GETPC_B64: + translator.S_GETPC_B64(block_base, inst); + break; case Opcode::S_NOP: case Opcode::S_CBRANCH_EXECZ: case Opcode::S_CBRANCH_SCC0: diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 1145de59..e1f72e5f 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -26,6 +26,25 @@ enum class ConditionOp : u32 { TRU, }; +enum class AtomicOp : u32 { + Swap, + CmpSwap, + Add, + Sub, + Smin, + Umin, + Smax, + Umax, + And, + Or, + Xor, + Inc, + Dec, + FCmpSwap, + Fmin, + Fmax, +}; + enum class NegateMode : u32 { None, Src1, @@ -61,6 +80,8 @@ public: void S_BREV_B32(const GcnInst& inst); void S_ADD_U32(const GcnInst& inst); void S_SUB_U32(const GcnInst& inst); + void S_GETPC_B64(u32 pc, const GcnInst& inst); + void S_ADDC_U32(const GcnInst& inst); // Scalar Memory void S_LOAD_DWORD(int num_dwords, const GcnInst& inst); @@ -133,6 +154,9 @@ public: void V_NOT_B32(const GcnInst& inst); void V_CVT_F32_UBYTE(u32 index, const GcnInst& inst); void 
V_BFREV_B32(const GcnInst& inst); + void V_LDEXP_F32(const GcnInst& inst); + void V_CVT_FLR_I32_F32(const GcnInst& inst); + void V_CMP_CLASS_F32(const GcnInst& inst); // Vector Memory void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); @@ -145,6 +169,8 @@ public: void DS_SWIZZLE_B32(const GcnInst& inst); void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); + void V_READFIRSTLANE_B32(const GcnInst& inst); + void S_BARRIER(); // MIMG void IMAGE_GET_RESINFO(const GcnInst& inst); @@ -153,6 +179,7 @@ public: void IMAGE_STORE(const GcnInst& inst); void IMAGE_LOAD(bool has_mip, const GcnInst& inst); void IMAGE_GET_LOD(const GcnInst& inst); + void IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst); // Export void EXP(const GcnInst& inst); @@ -167,6 +194,6 @@ private: static std::array exec_contexts; }; -void Translate(IR::Block* block, std::span inst_list, Info& info); +void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info); } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 72b2d76a..a434567a 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -28,7 +28,8 @@ void Translator::V_CVT_PKRTZ_F16_F32(const GcnInst& inst) { void Translator::V_CVT_F32_F16(const GcnInst& inst) { const IR::U32 src0 = GetSrc(inst.src[0]); - SetDst(inst.dst[0], ir.ConvertUToF(32, 16, src0)); + const IR::U16 src0l = ir.UConvert(16, src0); + SetDst(inst.dst[0], ir.FPConvert(32, ir.BitCast(src0l))); } void Translator::V_MUL_F32(const GcnInst& inst) { @@ -50,11 +51,14 @@ void Translator::V_CNDMASK_B32(const GcnInst& inst) { }; const bool has_flt_source = is_float_const(inst.src[0].field) || is_float_const(inst.src[1].field); - const IR::U32F32 src0 = GetSrc(inst.src[0], has_flt_source); + IR::U32F32 src0 = GetSrc(inst.src[0], has_flt_source); IR::U32F32 src1 = GetSrc(inst.src[1], has_flt_source); if (src0.Type() == IR::Type::F32 && src1.Type() == IR::Type::U32) { src1 = ir.BitCast(src1); } + if (src1.Type() == IR::Type::F32 && src0.Type() == IR::Type::U32) { + src0 = ir.BitCast(src0); + } const IR::Value result = ir.Select(flag, src1, src0); ir.SetVectorReg(dst_reg, IR::U32F32{result}); } @@ -502,4 +506,19 @@ void Translator::V_BFREV_B32(const GcnInst& inst) { SetDst(inst.dst[0], ir.BitReverse(src0)); } +void Translator::V_LDEXP_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.FPLdexp(src0, src1)); +} + +void Translator::V_CVT_FLR_I32_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0], true)}; + SetDst(inst.dst[0], ir.ConvertFToI(32, true, ir.FPFloor(src0))); +} + +void Translator::V_CMP_CLASS_F32(const GcnInst& inst) { + UNREACHABLE(); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index f12b4e2f..21f3abca 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -212,10 +212,15 @@ void Translator::IMAGE_STORE(const GcnInst& inst) { ir.CompositeConstruct(ir.GetVectorReg(addr_reg), ir.GetVectorReg(addr_reg + 1), ir.GetVectorReg(addr_reg + 2), ir.GetVectorReg(addr_reg + 3)); - 
ASSERT(mimg.dmask == 0xF); - const IR::Value value = ir.CompositeConstruct( - ir.GetVectorReg(data_reg), ir.GetVectorReg(data_reg + 1), - ir.GetVectorReg(data_reg + 2), ir.GetVectorReg(data_reg + 3)); + boost::container::static_vector comps; + for (u32 i = 0; i < 4; i++) { + if (((mimg.dmask >> i) & 1) == 0) { + comps.push_back(ir.Imm32(0.f)); + continue; + } + comps.push_back(ir.GetVectorReg(data_reg++)); + } + const IR::Value value = ir.CompositeConstruct(comps[0], comps[1], comps[2], comps[3]); ir.ImageWrite(handle, body, value, {}); } @@ -245,7 +250,10 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst info.nfmt.Assign(static_cast(mtbuf.nfmt)); } - const IR::Value value = ir.LoadBuffer(num_dwords, ir.GetScalarReg(sharp), address, info); + const IR::Value handle = + ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), + ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); + const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info); const IR::VectorReg dst_reg{inst.src[1].code}; if (num_dwords == 1) { ir.SetVectorReg(dst_reg, IR::F32{value}); @@ -304,7 +312,10 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnIns ir.GetVectorReg(src_reg + 3)); break; } - ir.StoreBuffer(num_dwords, ir.GetScalarReg(sharp), address, value, info); + const IR::Value handle = + ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), + ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); + ir.StoreBuffer(num_dwords, handle, address, value, info); } void Translator::IMAGE_GET_LOD(const GcnInst& inst) { @@ -322,4 +333,48 @@ void Translator::IMAGE_GET_LOD(const GcnInst& inst) { ir.SetVectorReg(dst_reg++, IR::F32{ir.CompositeExtract(lod, 1)}); } +void Translator::IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst) { + const auto& mimg = inst.control.mimg; + IR::VectorReg val_reg{inst.dst[0].code}; + IR::VectorReg addr_reg{inst.src[0].code}; + const IR::ScalarReg tsharp_reg{inst.src[2].code * 4}; + + const IR::Value value = ir.GetVectorReg(val_reg); + const IR::Value handle = ir.GetScalarReg(tsharp_reg); + const IR::Value body = + ir.CompositeConstruct(ir.GetVectorReg(addr_reg), ir.GetVectorReg(addr_reg + 1), + ir.GetVectorReg(addr_reg + 2), ir.GetVectorReg(addr_reg + 3)); + const IR::Value prev = [&] { + switch (op) { + case AtomicOp::Swap: + return ir.ImageAtomicExchange(handle, body, value, {}); + case AtomicOp::Add: + return ir.ImageAtomicIAdd(handle, body, value, {}); + case AtomicOp::Smin: + return ir.ImageAtomicIMin(handle, body, value, true, {}); + case AtomicOp::Umin: + return ir.ImageAtomicUMin(handle, body, value, {}); + case AtomicOp::Smax: + return ir.ImageAtomicIMax(handle, body, value, true, {}); + case AtomicOp::Umax: + return ir.ImageAtomicUMax(handle, body, value, {}); + case AtomicOp::And: + return ir.ImageAtomicAnd(handle, body, value, {}); + case AtomicOp::Or: + return ir.ImageAtomicOr(handle, body, value, {}); + case AtomicOp::Xor: + return ir.ImageAtomicXor(handle, body, value, {}); + case AtomicOp::Inc: + return ir.ImageAtomicInc(handle, body, value, {}); + case AtomicOp::Dec: + return ir.ImageAtomicDec(handle, body, value, {}); + default: + UNREACHABLE(); + } + }(); + if (mimg.glc) { + ir.SetVectorReg(val_reg, IR::U32{prev}); + } +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h index b148578f..0cfbc421 100644 --- a/src/shader_recompiler/ir/attribute.h +++ b/src/shader_recompiler/ir/attribute.h @@ -4,8 
+4,8 @@ #pragma once #include +#include "common/assert.h" #include "common/types.h" -#include "shader_recompiler/exception.h" namespace Shader::IR { @@ -88,10 +88,10 @@ constexpr size_t NumParams = 32; [[nodiscard]] constexpr Attribute operator+(Attribute attr, int num) { const int result{static_cast(attr) + num}; if (result > static_cast(Attribute::Param31)) { - throw LogicError("Overflow on register arithmetic"); + UNREACHABLE_MSG("Overflow on register arithmetic"); } if (result < static_cast(Attribute::RenderTarget0)) { - throw LogicError("Underflow on register arithmetic"); + UNREACHABLE_MSG("Underflow on register arithmetic"); } return static_cast(result); } diff --git a/src/shader_recompiler/ir/basic_block.cpp b/src/shader_recompiler/ir/basic_block.cpp index 39174c56..622a6249 100644 --- a/src/shader_recompiler/ir/basic_block.cpp +++ b/src/shader_recompiler/ir/basic_block.cpp @@ -39,10 +39,10 @@ Block::iterator Block::PrependNewInst(iterator insertion_point, Opcode op, void Block::AddBranch(Block* block) { if (std::ranges::find(imm_successors, block) != imm_successors.end()) { - throw LogicError("Successor already inserted"); + UNREACHABLE_MSG("Successor already inserted"); } if (std::ranges::find(block->imm_predecessors, this) != block->imm_predecessors.end()) { - throw LogicError("Predecessor already inserted"); + UNREACHABLE_MSG("Predecessor already inserted"); } imm_successors.push_back(block); block->imm_predecessors.push_back(this); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 09bb3580..7f0fa741 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -115,6 +115,18 @@ void IREmitter::Discard() { Inst(Opcode::Discard); } +void IREmitter::Barrier() { + Inst(Opcode::Barrier); +} + +void IREmitter::WorkgroupMemoryBarrier() { + Inst(Opcode::WorkgroupMemoryBarrier); +} + +void IREmitter::DeviceMemoryBarrier() { + Inst(Opcode::DeviceMemoryBarrier); +} + U32 IREmitter::GetUserData(IR::ScalarReg reg) { return Inst(Opcode::GetUserData, reg); } @@ -200,6 +212,10 @@ U1 IREmitter::GetVcc() { return Inst(Opcode::GetVcc); } +U32 IREmitter::GetSccLo() { + return Inst(Opcode::GetSccLo); +} + U32 IREmitter::GetVccLo() { return Inst(Opcode::GetVccLo); } @@ -220,6 +236,10 @@ void IREmitter::SetVcc(const U1& value) { Inst(Opcode::SetVcc, value); } +void IREmitter::SetSccLo(const U32& value) { + Inst(Opcode::SetSccLo, value); +} + void IREmitter::SetVccLo(const U32& value) { Inst(Opcode::SetVccLo, value); } @@ -240,22 +260,25 @@ void IREmitter::SetAttribute(IR::Attribute attribute, const F32& value, u32 comp Inst(Opcode::SetAttribute, attribute, value, Imm32(comp)); } -U32U64 IREmitter::ReadShared(int bit_size, bool is_signed, const U32& offset) { - /*switch (bit_size) { +Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { + switch (bit_size) { case 8: - return Inst(is_signed ? Opcode::ReadSharedS8 : Opcode::ReadSharedU8, offset); + return Inst(is_signed ? Opcode::LoadSharedS8 : Opcode::LoadSharedU8, offset); case 16: - return Inst(is_signed ? Opcode::ReadSharedS16 : Opcode::ReadSharedU16, offset); + return Inst(is_signed ? 
Opcode::LoadSharedS16 : Opcode::LoadSharedU16, offset); case 32: - return Inst(Opcode::ReadSharedU32, offset); + return Inst(Opcode::LoadSharedU32, offset); case 64: - return Inst(Opcode::ReadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, offset); + case 128: + return Inst(Opcode::LoadSharedU128, offset); + default: + UNREACHABLE_MSG("Invalid bit size {}", bit_size); } - UNREACHABLE_MSG("Invalid bit size {}", bit_size);*/ } void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) { - /*switch (bit_size) { + switch (bit_size) { case 8: Inst(Opcode::WriteSharedU8, offset, value); break; @@ -268,9 +291,12 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) case 64: Inst(Opcode::WriteSharedU64, offset, value); break; + case 128: + Inst(Opcode::WriteSharedU128, offset, value); + break; default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); - }*/ + } } U32 IREmitter::ReadConst(const Value& base, const U32& offset) { @@ -603,6 +629,10 @@ F32 IREmitter::FPExp2(const F32& value) { return Inst(Opcode::FPExp2, value); } +F32 IREmitter::FPLdexp(const F32& value, const U32& exp) { + return Inst(Opcode::FPLdexp, value, exp); +} + F32 IREmitter::FPLog2(const F32& value) { return Inst(Opcode::FPLog2, value); } @@ -810,6 +840,17 @@ U1 IREmitter::FPIsNan(const F32F64& value) { } } +U1 IREmitter::FPIsInf(const F32F64& value) { + switch (value.Type()) { + case Type::F32: + return Inst(Opcode::FPIsInf32, value); + case Type::F64: + return Inst(Opcode::FPIsInf64, value); + default: + ThrowInvalidType(value.Type()); + } +} + U1 IREmitter::FPOrdered(const F32F64& lhs, const F32F64& rhs) { if (lhs.Type() != rhs.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type()); @@ -866,6 +907,18 @@ U32U64 IREmitter::IAdd(const U32U64& a, const U32U64& b) { } } +Value IREmitter::IAddCary(const U32& a, const U32& b) { + if (a.Type() != b.Type()) { + UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); + } + switch (a.Type()) { + case Type::U32: + return Inst(Opcode::IAddCary32, a, b); + default: + ThrowInvalidType(a.Type()); + } +} + U32U64 IREmitter::ISub(const U32U64& a, const U32U64& b) { if (a.Type() != b.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); @@ -1142,6 +1195,13 @@ F32F64 IREmitter::ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_s } U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) { + switch (result_bitsize) { + case 16: + switch (value.Type()) { + case Type::U32: + return Inst(Opcode::ConvertU16U32, value); + } + } throw NotImplementedException("Conversion from {} to {} bits", value.Type(), result_bitsize); } @@ -1163,6 +1223,73 @@ F16F32F64 IREmitter::FPConvert(size_t result_bitsize, const F16F32F64& value) { throw NotImplementedException("Conversion from {} to {} bits", value.Type(), result_bitsize); } +Value IREmitter::ImageAtomicIAdd(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicIAdd32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicSMin(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicSMin32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicUMin(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicUMin32, Flags{info}, handle, coords, value); +} + +Value 
IREmitter::ImageAtomicIMin(const Value& handle, const Value& coords, const Value& value, + bool is_signed, TextureInstInfo info) { + return is_signed ? ImageAtomicSMin(handle, coords, value, info) + : ImageAtomicUMin(handle, coords, value, info); +} + +Value IREmitter::ImageAtomicSMax(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicSMax32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicUMax(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicUMax32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicIMax(const Value& handle, const Value& coords, const Value& value, + bool is_signed, TextureInstInfo info) { + return is_signed ? ImageAtomicSMax(handle, coords, value, info) + : ImageAtomicUMax(handle, coords, value, info); +} + +Value IREmitter::ImageAtomicInc(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicInc32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicDec(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicDec32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicAnd(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicAnd32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicOr(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicOr32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicXor(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicXor32, Flags{info}, handle, coords, value); +} + +Value IREmitter::ImageAtomicExchange(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info) { + return Inst(Opcode::ImageAtomicExchange32, Flags{info}, handle, coords, value); +} + Value IREmitter::ImageSampleImplicitLod(const Value& handle, const Value& coords, const F32& bias, const Value& offset, const F32& lod_clamp, TextureInstInfo info) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index cf74afc0..c3342530 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -43,6 +43,10 @@ public: void Epilogue(); void Discard(); + void Barrier(); + void WorkgroupMemoryBarrier(); + void DeviceMemoryBarrier(); + [[nodiscard]] U32 GetUserData(IR::ScalarReg reg); [[nodiscard]] U1 GetThreadBitScalarReg(IR::ScalarReg reg); void SetThreadBitScalarReg(IR::ScalarReg reg, const U1& value); @@ -60,11 +64,13 @@ public: [[nodiscard]] U1 GetScc(); [[nodiscard]] U1 GetExec(); [[nodiscard]] U1 GetVcc(); + [[nodiscard]] U32 GetSccLo(); [[nodiscard]] U32 GetVccLo(); [[nodiscard]] U32 GetVccHi(); void SetScc(const U1& value); void SetExec(const U1& value); void SetVcc(const U1& value); + void SetSccLo(const U32& value); void SetVccLo(const U32& value); void SetVccHi(const U32& value); @@ -74,7 +80,7 @@ public: [[nodiscard]] U32 GetAttributeU32(Attribute attribute, u32 comp = 0); void SetAttribute(Attribute attribute, const F32& value, u32 comp = 0); - [[nodiscard]] U32U64 ReadShared(int bit_size, bool is_signed, const U32& offset); + [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const 
U32& offset); void WriteShared(int bit_size, const Value& value, const U32& offset); [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); @@ -120,6 +126,7 @@ public: [[nodiscard]] F32 FPSin(const F32& value); [[nodiscard]] F32 FPExp2(const F32& value); [[nodiscard]] F32 FPLog2(const F32& value); + [[nodiscard]] F32 FPLdexp(const F32& value, const U32& exp); [[nodiscard]] F32F64 FPRecip(const F32F64& value); [[nodiscard]] F32F64 FPRecipSqrt(const F32F64& value); [[nodiscard]] F32 FPSqrt(const F32& value); @@ -139,14 +146,16 @@ public: [[nodiscard]] U1 FPLessThan(const F32F64& lhs, const F32F64& rhs, bool ordered = true); [[nodiscard]] U1 FPGreaterThan(const F32F64& lhs, const F32F64& rhs, bool ordered = true); [[nodiscard]] U1 FPIsNan(const F32F64& value); + [[nodiscard]] U1 FPIsInf(const F32F64& value); [[nodiscard]] U1 FPOrdered(const F32F64& lhs, const F32F64& rhs); [[nodiscard]] U1 FPUnordered(const F32F64& lhs, const F32F64& rhs); [[nodiscard]] F32F64 FPMax(const F32F64& lhs, const F32F64& rhs); [[nodiscard]] F32F64 FPMin(const F32F64& lhs, const F32F64& rhs); [[nodiscard]] U32U64 IAdd(const U32U64& a, const U32U64& b); + [[nodiscard]] Value IAddCary(const U32& a, const U32& b); [[nodiscard]] U32U64 ISub(const U32U64& a, const U32U64& b); - [[nodiscard]] IR::Value IMulExt(const U32& a, const U32& b, bool is_signed = false); + [[nodiscard]] Value IMulExt(const U32& a, const U32& b, bool is_signed = false); [[nodiscard]] U32 IMul(const U32& a, const U32& b); [[nodiscard]] U32 IDiv(const U32& a, const U32& b, bool is_signed = false); [[nodiscard]] U32U64 INeg(const U32U64& value); @@ -199,6 +208,33 @@ public: [[nodiscard]] U16U32U64 UConvert(size_t result_bitsize, const U16U32U64& value); [[nodiscard]] F16F32F64 FPConvert(size_t result_bitsize, const F16F32F64& value); + [[nodiscard]] Value ImageAtomicIAdd(const Value& handle, const Value& coords, + const Value& value, TextureInstInfo info); + [[nodiscard]] Value ImageAtomicSMin(const Value& handle, const Value& coords, + const Value& value, TextureInstInfo info); + [[nodiscard]] Value ImageAtomicUMin(const Value& handle, const Value& coords, + const Value& value, TextureInstInfo info); + [[nodiscard]] Value ImageAtomicIMin(const Value& handle, const Value& coords, + const Value& value, bool is_signed, TextureInstInfo info); + [[nodiscard]] Value ImageAtomicSMax(const Value& handle, const Value& coords, + const Value& value, TextureInstInfo info); + [[nodiscard]] Value ImageAtomicUMax(const Value& handle, const Value& coords, + const Value& value, TextureInstInfo info); + [[nodiscard]] Value ImageAtomicIMax(const Value& handle, const Value& coords, + const Value& value, bool is_signed, TextureInstInfo info); + [[nodiscard]] Value ImageAtomicInc(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info); + [[nodiscard]] Value ImageAtomicDec(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info); + [[nodiscard]] Value ImageAtomicAnd(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info); + [[nodiscard]] Value ImageAtomicOr(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info); + [[nodiscard]] Value ImageAtomicXor(const Value& handle, const Value& coords, const Value& value, + TextureInstInfo info); + [[nodiscard]] Value ImageAtomicExchange(const Value& handle, const Value& coords, + const Value& value, TextureInstInfo info); + [[nodiscard]] Value ImageSampleImplicitLod(const Value& handle, const Value& 
coords, const F32& bias, const Value& offset, const F32& lod_clamp, TextureInstInfo info); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index da4e2e75..f823980a 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -40,6 +40,9 @@ Inst::~Inst() { bool Inst::MayHaveSideEffects() const noexcept { switch (op) { + case Opcode::Barrier: + case Opcode::WorkgroupMemoryBarrier: + case Opcode::DeviceMemoryBarrier: case Opcode::ConditionRef: case Opcode::Reference: case Opcode::PhiMove: @@ -52,7 +55,23 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferF32x3: case Opcode::StoreBufferF32x4: case Opcode::StoreBufferU32: + case Opcode::WriteSharedU128: + case Opcode::WriteSharedU64: + case Opcode::WriteSharedU32: + case Opcode::WriteSharedU16: + case Opcode::WriteSharedU8: case Opcode::ImageWrite: + case Opcode::ImageAtomicIAdd32: + case Opcode::ImageAtomicSMin32: + case Opcode::ImageAtomicUMin32: + case Opcode::ImageAtomicSMax32: + case Opcode::ImageAtomicUMax32: + case Opcode::ImageAtomicInc32: + case Opcode::ImageAtomicDec32: + case Opcode::ImageAtomicAnd32: + case Opcode::ImageAtomicOr32: + case Opcode::ImageAtomicXor32: + case Opcode::ImageAtomicExchange32: return true; default: return false; @@ -61,7 +80,7 @@ bool Inst::MayHaveSideEffects() const noexcept { bool Inst::AreAllArgsImmediates() const { if (op == Opcode::Phi) { - throw LogicError("Testing for all arguments are immediates on phi instruction"); + UNREACHABLE_MSG("Testing for all arguments are immediates on phi instruction"); } return std::all_of(args.begin(), args.begin() + NumArgs(), [](const IR::Value& value) { return value.IsImmediate(); }); @@ -91,7 +110,7 @@ void Inst::SetArg(size_t index, Value value) { Block* Inst::PhiBlock(size_t index) const { if (op != Opcode::Phi) { - throw LogicError("{} is not a Phi instruction", op); + UNREACHABLE_MSG("{} is not a Phi instruction", op); } if (index >= phi_args.size()) { throw InvalidArgument("Out of bounds argument index {} in phi instruction"); @@ -143,7 +162,7 @@ void Inst::ReplaceUsesWith(Value replacement) { void Inst::ReplaceOpcode(IR::Opcode opcode) { if (opcode == IR::Opcode::Phi) { - throw LogicError("Cannot transition into Phi"); + UNREACHABLE_MSG("Cannot transition into Phi"); } if (op == Opcode::Phi) { // Transition out of phi arguments into non-phi diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index a9b895d2..47bc4248 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -19,6 +19,25 @@ OPCODE(ReadConst, U32, U32x OPCODE(ReadConstBuffer, F32, Opaque, U32, ) OPCODE(ReadConstBufferU32, U32, Opaque, U32, ) +// Barriers +OPCODE(Barrier, Void, ) +OPCODE(WorkgroupMemoryBarrier, Void, ) +OPCODE(DeviceMemoryBarrier, Void, ) + +// Shared memory operations +OPCODE(LoadSharedU8, U32, U32, ) +OPCODE(LoadSharedS8, U32, U32, ) +OPCODE(LoadSharedU16, U32, U32, ) +OPCODE(LoadSharedS16, U32, U32, ) +OPCODE(LoadSharedU32, U32, U32, ) +OPCODE(LoadSharedU64, U32x2, U32, ) +OPCODE(LoadSharedU128, U32x4, U32, ) +OPCODE(WriteSharedU8, Void, U32, U32, ) +OPCODE(WriteSharedU16, Void, U32, U32, ) +OPCODE(WriteSharedU32, Void, U32, U32, ) +OPCODE(WriteSharedU64, Void, U32, U32x2, ) +OPCODE(WriteSharedU128, Void, U32, U32x4, ) + // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) OPCODE(GetThreadBitScalarReg, U1, ScalarReg, ) @@ -37,11 +56,13 @@ OPCODE(SetAttribute, 
Void, Attr OPCODE(GetScc, U1, Void, ) OPCODE(GetExec, U1, Void, ) OPCODE(GetVcc, U1, Void, ) +OPCODE(GetSccLo, U32, Void, ) OPCODE(GetVccLo, U32, Void, ) OPCODE(GetVccHi, U32, Void, ) OPCODE(SetScc, Void, U1, ) OPCODE(SetExec, Void, U1, ) OPCODE(SetVcc, Void, U1, ) +OPCODE(SetSccLo, Void, U32, ) OPCODE(SetVccLo, Void, U32, ) OPCODE(SetVccHi, Void, U32, ) @@ -148,6 +169,7 @@ OPCODE(FPRecipSqrt64, F64, F64, OPCODE(FPSqrt, F32, F32, ) OPCODE(FPSin, F32, F32, ) OPCODE(FPExp2, F32, F32, ) +OPCODE(FPLdexp, F32, F32, U32, ) OPCODE(FPCos, F32, F32, ) OPCODE(FPLog2, F32, F32, ) OPCODE(FPSaturate32, F32, F32, ) @@ -190,10 +212,13 @@ OPCODE(FPUnordGreaterThanEqual32, U1, F32, OPCODE(FPUnordGreaterThanEqual64, U1, F64, F64, ) OPCODE(FPIsNan32, U1, F32, ) OPCODE(FPIsNan64, U1, F64, ) +OPCODE(FPIsInf32, U1, F32, ) +OPCODE(FPIsInf64, U1, F64, ) // Integer operations OPCODE(IAdd32, U32, U32, U32, ) OPCODE(IAdd64, U64, U64, U64, ) +OPCODE(IAddCary32, U32x2, U32, U32, ) OPCODE(ISub32, U32, U32, U32, ) OPCODE(ISub64, U64, U64, U64, ) OPCODE(IMul32, U32, U32, U32, ) @@ -258,6 +283,7 @@ OPCODE(ConvertF32U32, F32, U32, OPCODE(ConvertF64S32, F64, U32, ) OPCODE(ConvertF64U32, F64, U32, ) OPCODE(ConvertF32U16, F32, U16, ) +OPCODE(ConvertU16U32, U16, U32, ) // Image operations OPCODE(ImageSampleImplicitLod, F32x4, Opaque, Opaque, Opaque, Opaque, ) @@ -273,6 +299,19 @@ OPCODE(ImageGradient, F32x4, Opaq OPCODE(ImageRead, U32x4, Opaque, Opaque, ) OPCODE(ImageWrite, Void, Opaque, Opaque, U32x4, ) +// Image atomic operations +OPCODE(ImageAtomicIAdd32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicSMin32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicUMin32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicSMax32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicUMax32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicInc32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicDec32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicAnd32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicOr32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicXor32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicExchange32, U32, Opaque, Opaque, U32, ) + // Warp operations OPCODE(LaneId, U32, ) OPCODE(QuadShuffle, U32, U32, U32 ) diff --git a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp index b715bcd9..7cd896fb 100644 --- a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp @@ -324,8 +324,8 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { case IR::Opcode::BitFieldUExtract: FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) { if (static_cast(shift) + static_cast(count) > 32) { - throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract, - base, shift, count); + UNREACHABLE_MSG("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract, + base, shift, count); } return (base >> shift) & ((1U << count) - 1); }); @@ -336,8 +336,8 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { const size_t left_shift{32 - back_shift}; const size_t right_shift{static_cast(32 - count)}; if (back_shift > 32 || left_shift >= 32 || right_shift >= 32) { - throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldSExtract, - base, shift, count); + UNREACHABLE_MSG("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldSExtract, + base, shift, count); } return static_cast((base << left_shift) >> right_shift); }); @@ -345,8 +345,8 @@ 
void ConstantPropagation(IR::Block& block, IR::Inst& inst) { case IR::Opcode::BitFieldInsert: FoldWhenAllImmediates(inst, [](u32 base, u32 insert, u32 offset, u32 bits) { if (bits >= 32 || offset >= 32) { - throw LogicError("Undefined result in {}({}, {}, {}, {})", - IR::Opcode::BitFieldInsert, base, insert, offset, bits); + UNREACHABLE_MSG("Undefined result in {}({}, {}, {}, {})", + IR::Opcode::BitFieldInsert, base, insert, offset, bits); } return (base & ~(~(~0u << bits) << offset)) | (insert << offset); }); diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 7f91a63c..4c26c996 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -89,6 +89,17 @@ bool IsImageInstruction(const IR::Inst& inst) { case IR::Opcode::ImageGradient: case IR::Opcode::ImageRead: case IR::Opcode::ImageWrite: + case IR::Opcode::ImageAtomicIAdd32: + case IR::Opcode::ImageAtomicSMin32: + case IR::Opcode::ImageAtomicUMin32: + case IR::Opcode::ImageAtomicSMax32: + case IR::Opcode::ImageAtomicUMax32: + case IR::Opcode::ImageAtomicInc32: + case IR::Opcode::ImageAtomicDec32: + case IR::Opcode::ImageAtomicAnd32: + case IR::Opcode::ImageAtomicOr32: + case IR::Opcode::ImageAtomicXor32: + case IR::Opcode::ImageAtomicExchange32: return true; default: return false; @@ -99,6 +110,17 @@ bool IsImageStorageInstruction(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::ImageWrite: case IR::Opcode::ImageRead: + case IR::Opcode::ImageAtomicIAdd32: + case IR::Opcode::ImageAtomicSMin32: + case IR::Opcode::ImageAtomicUMin32: + case IR::Opcode::ImageAtomicSMax32: + case IR::Opcode::ImageAtomicUMax32: + case IR::Opcode::ImageAtomicInc32: + case IR::Opcode::ImageAtomicDec32: + case IR::Opcode::ImageAtomicAnd32: + case IR::Opcode::ImageAtomicOr32: + case IR::Opcode::ImageAtomicXor32: + case IR::Opcode::ImageAtomicExchange32: return true; default: return false; @@ -115,7 +137,8 @@ public: u32 Add(const BufferResource& desc) { const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) { return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + desc.dword_offset == existing.dword_offset && + desc.inline_cbuf == existing.inline_cbuf; })}; auto& buffer = buffer_resources[index]; ASSERT(buffer.stride == desc.stride && buffer.num_records == desc.num_records); @@ -196,20 +219,70 @@ SharpLocation TrackSharp(const IR::Inst* inst) { }; } +static constexpr size_t MaxUboSize = 65536; + +s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, + AmdGpu::Buffer& cbuf) { + + // Assuming V# is in UD s[32:35] + // The next pattern: + // s_getpc_b64 s[32:33] + // s_add_u32 s32, , s32 + // s_addc_u32 s33, 0, s33 + // s_mov_b32 s35, + // s_movk_i32 s34, + // buffer_load_format_xyz v[8:10], v1, s[32:35], 0 ... + // is used to define an inline constant buffer + + IR::Inst* handle = inst.Arg(0).InstRecursive(); + IR::Inst* p0 = handle->Arg(0).InstRecursive(); + if (p0->GetOpcode() != IR::Opcode::IAdd32 || !p0->Arg(0).IsImmediate() || + !p0->Arg(1).IsImmediate()) { + return -1; + } + IR::Inst* p1 = handle->Arg(1).InstRecursive(); + if (p1->GetOpcode() != IR::Opcode::IAdd32) { + return -1; + } + if (!handle->Arg(3).IsImmediate() || !handle->Arg(2).IsImmediate()) { + return -1; + } + // We have found this pattern. Build the sharp. 
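+    // handle is the CompositeConstruct of s[32:35]: args 0/1 come from the constant
+    // s_getpc/s_add chain (only the low half matters, info.pgm_base supplies the real
+    // host high bits), arg 2 is num_records from s_movk_i32 and arg 3 is the
+    // dst_sel/num_format/data_format dword from s_mov_b32.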
+ std::array buffer; + buffer[0] = info.pgm_base + p0->Arg(0).U32() + p0->Arg(1).U32(); + buffer[1] = handle->Arg(2).U32() | handle->Arg(3).U64() << 32; + cbuf = std::bit_cast(buffer); + // Assign a binding to this sharp. + return descriptors.Add(BufferResource{ + .sgpr_base = std::numeric_limits::max(), + .dword_offset = 0, + .stride = cbuf.GetStride(), + .num_records = u32(cbuf.num_records), + .used_types = BufferDataType(inst), + .inline_cbuf = cbuf, + .is_storage = IsBufferStore(inst) || cbuf.GetSize() > MaxUboSize, + }); +} + void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { - static constexpr size_t MaxUboSize = 65536; - IR::Inst* producer = inst.Arg(0).InstRecursive(); - const auto sharp = TrackSharp(producer); - const auto buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); - const u32 binding = descriptors.Add(BufferResource{ - .sgpr_base = sharp.sgpr_base, - .dword_offset = sharp.dword_offset, - .stride = buffer.GetStride(), - .num_records = u32(buffer.num_records), - .used_types = BufferDataType(inst), - .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize, - }); + s32 binding{}; + AmdGpu::Buffer buffer; + if (binding = TryHandleInlineCbuf(inst, info, descriptors, buffer); binding == -1) { + IR::Inst* handle = inst.Arg(0).InstRecursive(); + IR::Inst* producer = handle->Arg(0).InstRecursive(); + const auto sharp = TrackSharp(producer); + buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); + binding = descriptors.Add(BufferResource{ + .sgpr_base = sharp.sgpr_base, + .dword_offset = sharp.dword_offset, + .stride = buffer.GetStride(), + .num_records = u32(buffer.num_records), + .used_types = BufferDataType(inst), + .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize, + }); + } + const auto inst_info = inst.Flags(); IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; // Replace handle with binding index in buffer resource list. 
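    // Consumers later rebuild the V# from this binding via BufferResource::GetVsharp(),
    // which returns either the user-data sharp at (sgpr_base, dword_offset) or the
    // captured inline_cbuf.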
@@ -217,7 +290,10 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable); if (inst_info.is_typed) { ASSERT(inst_info.nfmt == AmdGpu::NumberFormat::Float && - inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32); + (inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32 || + inst_info.dmft == AmdGpu::DataFormat::Format32_32_32 || + inst_info.dmft == AmdGpu::DataFormat::Format32_32 || + inst_info.dmft == AmdGpu::DataFormat::Format32)); } if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer || inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) { diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index ac1cb060..25d8b937 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -16,6 +16,16 @@ void Visit(Info& info, IR::Inst& inst) { info.stores.Set(inst.Arg(0).Attribute(), inst.Arg(2).U32()); break; } + case IR::Opcode::LoadSharedS8: + case IR::Opcode::LoadSharedU8: + case IR::Opcode::WriteSharedU8: + info.uses_shared_u8 = true; + break; + case IR::Opcode::LoadSharedS16: + case IR::Opcode::LoadSharedU16: + case IR::Opcode::WriteSharedU16: + info.uses_shared_u16 = true; + break; case IR::Opcode::QuadShuffle: info.uses_group_quad = true; break; diff --git a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp index 9ee01953..8a24a68b 100644 --- a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp +++ b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp @@ -32,6 +32,7 @@ struct SccFlagTag : FlagTag {}; struct ExecFlagTag : FlagTag {}; struct VccFlagTag : FlagTag {}; struct VccLoTag : FlagTag {}; +struct SccLoTag : FlagTag {}; struct VccHiTag : FlagTag {}; struct GotoVariable : FlagTag { @@ -44,7 +45,7 @@ struct GotoVariable : FlagTag { }; using Variant = std::variant; + VccFlagTag, SccLoTag, VccLoTag, VccHiTag>; using ValueMap = std::unordered_map; struct DefTable { @@ -83,6 +84,13 @@ struct DefTable { exec_flag.insert_or_assign(block, value); } + const IR::Value& Def(IR::Block* block, SccLoTag) { + return scc_lo_flag[block]; + } + void SetDef(IR::Block* block, SccLoTag, const IR::Value& value) { + scc_lo_flag.insert_or_assign(block, value); + } + const IR::Value& Def(IR::Block* block, VccLoTag) { return vcc_lo_flag[block]; } @@ -108,6 +116,7 @@ struct DefTable { ValueMap scc_flag; ValueMap exec_flag; ValueMap vcc_flag; + ValueMap scc_lo_flag; ValueMap vcc_lo_flag; ValueMap vcc_hi_flag; }; @@ -124,6 +133,10 @@ IR::Opcode UndefOpcode(const VccLoTag&) noexcept { return IR::Opcode::UndefU32; } +IR::Opcode UndefOpcode(const SccLoTag&) noexcept { + return IR::Opcode::UndefU32; +} + IR::Opcode UndefOpcode(const VccHiTag&) noexcept { return IR::Opcode::UndefU32; } @@ -321,6 +334,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::SetVcc: pass.WriteVariable(VccFlagTag{}, block, inst.Arg(0)); break; + case IR::Opcode::SetSccLo: + pass.WriteVariable(SccLoTag{}, block, inst.Arg(0)); + break; case IR::Opcode::SetVccLo: pass.WriteVariable(VccLoTag{}, block, inst.Arg(0)); break; @@ -350,6 +366,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::GetVcc: inst.ReplaceUsesWith(pass.ReadVariable(VccFlagTag{}, block)); break; + case IR::Opcode::GetSccLo: + inst.ReplaceUsesWith(pass.ReadVariable(SccLoTag{}, block)); + break; case 
IR::Opcode::GetVccLo: inst.ReplaceUsesWith(pass.ReadVariable(VccLoTag{}, block)); break; diff --git a/src/shader_recompiler/ir/post_order.cpp b/src/shader_recompiler/ir/post_order.cpp index 5ab72aa2..9f588690 100644 --- a/src/shader_recompiler/ir/post_order.cpp +++ b/src/shader_recompiler/ir/post_order.cpp @@ -14,7 +14,7 @@ BlockList PostOrder(const AbstractSyntaxNode& root) { BlockList post_order_blocks; if (root.type != AbstractSyntaxNode::Type::Block) { - throw LogicError("First node in abstract syntax list root is not a block"); + UNREACHABLE_MSG("First node in abstract syntax list root is not a block"); } Block* const first_block{root.data.block}; visited.insert(first_block); diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h index f3000528..ae38ecf3 100644 --- a/src/shader_recompiler/ir/reg.h +++ b/src/shader_recompiler/ir/reg.h @@ -3,9 +3,9 @@ #pragma once +#include "common/assert.h" #include "common/bit_field.h" #include "common/types.h" -#include "shader_recompiler/exception.h" #include "video_core/amdgpu/pixel_format.h" namespace Shader::IR { @@ -428,10 +428,10 @@ template [[nodiscard]] constexpr Reg operator+(Reg reg, int num) { const int result{static_cast(reg) + num}; if (result >= static_cast(Reg::Max)) { - throw LogicError("Overflow on register arithmetic"); + UNREACHABLE_MSG("Overflow on register arithmetic"); } if (result < 0) { - throw LogicError("Underflow on register arithmetic"); + UNREACHABLE_MSG("Underflow on register arithmetic"); } return static_cast(result); } diff --git a/src/shader_recompiler/ir/value.cpp b/src/shader_recompiler/ir/value.cpp index a455f8b1..9cbb9e7c 100644 --- a/src/shader_recompiler/ir/value.cpp +++ b/src/shader_recompiler/ir/value.cpp @@ -83,7 +83,7 @@ bool Value::operator==(const Value& other) const { case Type::F64x4: break; } - throw LogicError("Invalid type {}", type); + UNREACHABLE_MSG("Invalid type {}", type); } bool Value::operator!=(const Value& other) const { diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index f3c33c81..54b34730 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -26,44 +26,9 @@ struct Profile { bool support_fp32_signed_zero_nan_preserve{}; bool support_fp64_signed_zero_nan_preserve{}; bool support_explicit_workgroup_layout{}; - bool support_vote{}; - bool support_viewport_mask{}; - bool support_typeless_image_loads{}; - bool support_derivative_control{}; - bool support_geometry_shader_passthrough{}; - bool support_native_ndc{}; - bool support_scaled_attributes{}; - bool support_multi_viewport{}; - bool support_geometry_streams{}; - - bool warp_size_potentially_larger_than_guest{}; - - bool lower_left_origin_mode{}; - /// Fragment outputs have to be declared even if they are not written to avoid undefined values. - /// See Ori and the Blind Forest's main menu for reference. 
- bool need_declared_frag_colors{}; - /// Prevents fast math optimizations that may cause inaccuracies - bool need_fastmath_off{}; - - /// OpFClamp is broken and OpFMax + OpFMin should be used instead bool has_broken_spirv_clamp{}; - /// The Position builtin needs to be wrapped in a struct when used as an input - bool has_broken_spirv_position_input{}; - /// Offset image operands with an unsigned type do not work - bool has_broken_unsigned_image_offsets{}; - /// Signed instructions with unsigned data types are misinterpreted - bool has_broken_signed_operations{}; - /// Float controls break when fp16 is enabled - bool has_broken_fp16_float_controls{}; - /// Ignores SPIR-V ordered vs unordered using GLSL semantics - bool ignore_nan_fp_comparisons{}; - - /// Maxwell and earlier nVidia architectures have broken robust support - bool has_broken_robust{}; - + bool lower_left_origin_mode{}; u64 min_ssbo_alignment{}; - - u32 max_user_clip_distances{}; }; } // namespace Shader diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 21f3602f..29841315 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -4,7 +4,6 @@ #pragma once #include -#include #include #include "common/assert.h" #include "common/types.h" @@ -42,15 +41,45 @@ enum class TextureType : u32 { }; constexpr u32 NUM_TEXTURE_TYPES = 7; +enum class VsOutput : u32 { + None, + PointSprite, + EdgeFlag, + KillFlag, + GsCutFlag, + GsMrtIndex, + GsVpIndex, + CullDist0, + CullDist1, + CullDist2, + CullDist3, + CullDist4, + CullDist5, + CullDist6, + CullDist7, + ClipDist0, + ClipDist1, + ClipDist2, + ClipDist3, + ClipDist4, + ClipDist5, + ClipDist6, + ClipDist7, +}; +using VsOutputMap = std::array; + +struct Info; + struct BufferResource { u32 sgpr_base; u32 dword_offset; u32 stride; u32 num_records; IR::Type used_types; + AmdGpu::Buffer inline_cbuf; bool is_storage; - auto operator<=>(const BufferResource&) const = default; + constexpr AmdGpu::Buffer GetVsharp(const Info& info) const noexcept; }; using BufferResourceList = boost::container::static_vector; @@ -123,6 +152,7 @@ struct Info { }; AttributeFlags loads{}; AttributeFlags stores{}; + boost::container::static_vector vs_outputs; BufferResourceList buffers; ImageResourceList images; @@ -134,7 +164,12 @@ struct Info { std::span user_data; Stage stage; + uintptr_t pgm_base{}; + u64 pgm_hash{}; + u32 shared_memory_size{}; bool uses_group_quad{}; + bool uses_shared_u8{}; + bool uses_shared_u16{}; bool translation_failed{}; // indicates that shader has unsupported instructions template @@ -149,6 +184,10 @@ struct Info { } }; +constexpr AmdGpu::Buffer BufferResource::GetVsharp(const Info& info) const noexcept { + return inline_cbuf ? 
inline_cbuf : info.ReadUd(sgpr_base, dword_offset); +} + } // namespace Shader template <> diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index f97f2d6c..2c3590fe 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -323,6 +323,11 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanaddr_hi); break; } + case PM4ItOpcode::IndexBufferSize: { + const auto* index_size = reinterpret_cast(header); + regs.num_indices = index_size->num_indices; + break; + } case PM4ItOpcode::EventWrite: { // const auto* event = reinterpret_cast(header); break; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 2233fa0c..6e19f55d 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -85,14 +85,14 @@ struct Liverpool { } settings; UserData user_data; - template - const T* Address() const { + template + const T Address() const { const uintptr_t addr = uintptr_t(address_hi) << 40 | uintptr_t(address_lo) << 8; - return reinterpret_cast(addr); + return reinterpret_cast(addr); } std::span Code() const { - const u32* code = Address(); + const u32* code = Address(); BinaryInfo bininfo; std::memcpy(&bininfo, code + (code[1] + 1) * 2, sizeof(bininfo)); const u32 num_dwords = bininfo.length / sizeof(u32); @@ -121,20 +121,26 @@ struct Liverpool { BitField<0, 6, u64> num_vgprs; BitField<6, 4, u64> num_sgprs; BitField<33, 5, u64> num_user_regs; + BitField<47, 9, u64> lds_dwords; } settings; INSERT_PADDING_WORDS(1); u32 resource_limits; INSERT_PADDING_WORDS(0x2A); UserData user_data; - template - const T* Address() const { + template + const T Address() const { const uintptr_t addr = uintptr_t(address_hi) << 40 | uintptr_t(address_lo) << 8; - return reinterpret_cast(addr); + return reinterpret_cast(addr); + } + + u32 SharedMemSize() const noexcept { + // lds_dwords is in units of 128 dwords. We return bytes. 
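// Worked example (illustrative only, not part of the diff): lds_dwords == 2 encodes
// 2 * 128 = 256 dwords of LDS, so the return below reports 256 * 4 = 1024 bytes of
// shared memory.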
+ return settings.lds_dwords.Value() * 128 * 4; } std::span Code() const { - const u32* code = Address(); + const u32* code = Address(); BinaryInfo bininfo; std::memcpy(&bininfo, code + (code[1] + 1) * 2, sizeof(bininfo)); const u32 num_dwords = bininfo.length / sizeof(u32); @@ -144,7 +150,7 @@ struct Liverpool { template static constexpr auto* GetBinaryInfo(const Shader& sh) { - const auto* code = sh.template Address(); + const auto* code = sh.template Address(); const auto* bininfo = std::bit_cast(code + (code[1] + 1) * 2); ASSERT_MSG(bininfo->Valid(), "Invalid shader binary header"); return bininfo; @@ -208,6 +214,10 @@ struct Liverpool { BitField<18, 1, u32> use_vtx_render_target_idx; BitField<19, 1, u32> use_vtx_viewport_idx; BitField<20, 1, u32> use_vtx_kill_flag; + BitField<21, 1, u32> vs_out_misc_enable; + BitField<22, 1, u32> vs_out_ccdist0_enable; + BitField<23, 1, u32> vs_out_ccdist1_enable; + BitField<25, 1, u32> use_vtx_gs_cut_flag; bool IsClipDistEnabled(u32 index) const { return (clip_distance_enable.Value() >> index) & 1; @@ -469,7 +479,7 @@ struct Liverpool { template T Address() const { - return reinterpret_cast(base_addr_lo | u64(base_addr_hi) << 32); + return reinterpret_cast((base_addr_lo & ~1U) | u64(base_addr_hi) << 32); } }; @@ -1021,6 +1031,7 @@ static_assert(GFX6_3D_REG_INDEX(cs_program.user_data) == 0x2E40); static_assert(GFX6_3D_REG_INDEX(depth_render_control) == 0xA000); static_assert(GFX6_3D_REG_INDEX(depth_htile_data_base) == 0xA005); static_assert(GFX6_3D_REG_INDEX(screen_scissor) == 0xA00C); +static_assert(GFX6_3D_REG_INDEX(depth_buffer.z_info) == 0xA010); static_assert(GFX6_3D_REG_INDEX(depth_buffer.depth_slice) == 0xA017); static_assert(GFX6_3D_REG_INDEX(color_target_mask) == 0xA08E); static_assert(GFX6_3D_REG_INDEX(color_shader_mask) == 0xA08F); diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index 5b49157a..4d80ae29 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -549,8 +549,8 @@ struct PM4DumpConstRam { u32 addr_hi; template - T* Address() const { - return reinterpret_cast((u64(addr_hi) << 32u) | addr_lo); + T Address() const { + return reinterpret_cast((u64(addr_hi) << 32u) | addr_lo); } [[nodiscard]] u32 Offset() const { @@ -581,6 +581,11 @@ struct PM4CmdDrawIndexBase { u32 addr_hi; }; +struct PM4CmdDrawIndexBufferSize { + PM4Type3Header header; + u32 num_indices; +}; + struct PM4CmdIndirectBuffer { PM4Type3Header header; u32 ibase_lo; ///< Indirect buffer base address, must be 4 byte aligned diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index 17686b7a..ba3de154 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -21,32 +21,45 @@ enum class CompSwizzle : u32 { // Table 8.5 Buffer Resource Descriptor [Sea Islands Series Instruction Set Architecture] struct Buffer { - union { - BitField<0, 44, u64> base_address; - BitField<48, 14, u64> stride; - BitField<62, 1, u64> cache_swizzle; - BitField<63, 1, u64> swizzle_enable; - }; + u64 base_address : 44; + u64 : 4; + u64 stride : 14; + u64 cache_swizzle : 1; + u64 swizzle_enable : 1; u32 num_records; - union { - BitField<0, 3, u32> dst_sel_x; - BitField<3, 3, u32> dst_sel_y; - BitField<6, 3, u32> dst_sel_z; - BitField<9, 3, u32> dst_sel_w; - BitField<0, 12, u32> dst_sel; - BitField<12, 3, NumberFormat> num_format; - BitField<15, 4, DataFormat> data_format; - BitField<19, 2, u32> element_size; - BitField<21, 2, u32> index_stride; - BitField<23, 1, u32> add_tid_enable; 
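// Aside on the repacked descriptor in this hunk (illustrative, not part of the diff):
// the V# stays 128 bits wide, and the helpers kept below preserve the old semantics,
// e.g. stride == 16 with num_records == 1024 still gives GetStride() == 16 and
// GetSize() == 16 * 1024 == 16384 bytes, while a zero stride is clamped to 1 so
// GetSize() degenerates to num_records.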
- }; + u32 dst_sel_x : 3; + u32 dst_sel_y : 3; + u32 dst_sel_z : 3; + u32 dst_sel_w : 3; + u32 num_format : 3; + u32 data_format : 4; + u32 element_size : 2; + u32 index_stride : 2; + u32 add_tid_enable : 1; + + operator bool() const noexcept { + return base_address != 0; + } + + bool operator==(const Buffer& other) const noexcept { + return std::memcmp(this, &other, sizeof(Buffer)) == 0; + } CompSwizzle GetSwizzle(u32 comp) const noexcept { - return static_cast((dst_sel.Value() >> (comp * 3)) & 0x7); + const std::array select{dst_sel_x, dst_sel_y, dst_sel_z, dst_sel_w}; + return static_cast(select[comp]); + } + + NumberFormat GetNumberFmt() const noexcept { + return static_cast(num_format); + } + + DataFormat GetDataFmt() const noexcept { + return static_cast(data_format); } u32 GetStride() const noexcept { - return stride == 0 ? 1U : stride.Value(); + return stride == 0 ? 1U : stride; } u32 GetStrideElements(u32 element_size) const noexcept { @@ -61,6 +74,7 @@ struct Buffer { return GetStride() * num_records; } }; +static_assert(sizeof(Buffer) == 16); // 128bits enum class ImageType : u64 { Buffer = 0, diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index abcbe8d4..2509467f 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -392,6 +392,36 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu num_format == AmdGpu::NumberFormat::Float) { return vk::Format::eR16G16Sfloat; } + if (data_format == AmdGpu::DataFormat::Format16_16_16_16 && + num_format == AmdGpu::NumberFormat::Snorm) { + return vk::Format::eR16G16B16A16Snorm; + } + if (data_format == AmdGpu::DataFormat::Format32_32 && + num_format == AmdGpu::NumberFormat::Uint) { + return vk::Format::eR32G32Uint; + } + if (data_format == AmdGpu::DataFormat::Format4_4_4_4 && + num_format == AmdGpu::NumberFormat::Unorm) { + return vk::Format::eR4G4B4A4UnormPack16; + } + if (data_format == AmdGpu::DataFormat::Format16_16_16_16 && + num_format == AmdGpu::NumberFormat::Uint) { + return vk::Format::eR16G16B16A16Uint; + } + if (data_format == AmdGpu::DataFormat::Format32_32_32_32 && + num_format == AmdGpu::NumberFormat::Uint) { + return vk::Format::eR32G32B32A32Uint; + } + if (data_format == AmdGpu::DataFormat::Format8 && num_format == AmdGpu::NumberFormat::Sint) { + return vk::Format::eR8Sint; + } + if (data_format == AmdGpu::DataFormat::FormatBc1 && num_format == AmdGpu::NumberFormat::Srgb) { + return vk::Format::eBc1RgbaSrgbBlock; + } + if (data_format == AmdGpu::DataFormat::Format16_16 && + num_format == AmdGpu::NumberFormat::Sint) { + return vk::Format::eR16G16Sint; + } UNREACHABLE_MSG("Unknown data_format={} and num_format={}", u32(data_format), u32(num_format)); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index cfbacd48..f70e49a7 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -14,8 +14,8 @@ namespace Vulkan { ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler_, vk::PipelineCache pipeline_cache, const Shader::Info* info_, - vk::ShaderModule module) - : instance{instance_}, scheduler{scheduler_}, info{*info_} { + u64 compute_key_, vk::ShaderModule module) + : instance{instance_}, scheduler{scheduler_}, compute_key{compute_key_}, info{*info_} { const vk::PipelineShaderStageCreateInfo 
shader_ci = { .stage = vk::ShaderStageFlagBits::eCompute, .module = module, @@ -85,15 +85,15 @@ ComputePipeline::~ComputePipeline() = default; bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging, VideoCore::TextureCache& texture_cache) const { // Bind resource buffers and textures. - boost::container::static_vector buffer_infos; - boost::container::static_vector image_infos; + boost::container::static_vector buffer_infos; + boost::container::static_vector image_infos; boost::container::small_vector set_writes; u32 binding{}; for (const auto& buffer : info.buffers) { - const auto vsharp = info.ReadUd(buffer.sgpr_base, buffer.dword_offset); + const auto vsharp = buffer.GetVsharp(info); const u32 size = vsharp.GetSize(); - const VAddr address = vsharp.base_address.Value(); + const VAddr address = vsharp.base_address; texture_cache.OnCpuWrite(address); const u32 offset = staging.Copy(address, size, buffer.is_storage ? instance.StorageMinAlignment() diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 1d074814..4cdcccfc 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -24,7 +24,7 @@ class ComputePipeline { public: explicit ComputePipeline(const Instance& instance, Scheduler& scheduler, vk::PipelineCache pipeline_cache, const Shader::Info* info, - vk::ShaderModule module); + u64 compute_key, vk::ShaderModule module); ~ComputePipeline(); [[nodiscard]] vk::Pipeline Handle() const noexcept { @@ -40,6 +40,7 @@ private: vk::UniquePipeline pipeline; vk::UniquePipelineLayout pipeline_layout; vk::UniqueDescriptorSetLayout desc_layout; + u64 compute_key; Shader::Info info{}; }; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 5d638a69..8b9dcc0f 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -47,7 +47,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul attributes.push_back({ .location = input.binding, .binding = input.binding, - .format = LiverpoolToVK::SurfaceFormat(buffer.data_format, buffer.num_format), + .format = LiverpoolToVK::SurfaceFormat(buffer.GetDataFmt(), buffer.GetNumberFmt()), .offset = 0, }); bindings.push_back({ @@ -326,8 +326,8 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& for (const auto& stage : stages) { for (const auto& buffer : stage.buffers) { - const auto vsharp = stage.ReadUd(buffer.sgpr_base, buffer.dword_offset); - const VAddr address = vsharp.base_address.Value(); + const auto vsharp = buffer.GetVsharp(stage); + const VAddr address = vsharp.base_address; const u32 size = vsharp.GetSize(); const u32 offset = staging.Copy(address, size, buffer.is_storage ? 
instance.StorageMinAlignment() @@ -419,8 +419,7 @@ void GraphicsPipeline::BindVertexBuffers(StreamBuffer& staging) const { continue; } guest_buffers.emplace_back(buffer); - ranges.emplace_back(buffer.base_address.Value(), - buffer.base_address.Value() + buffer.GetSize()); + ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize()); } std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) { return lhv.base_address < rhv.base_address; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 1fc7790d..c071cc2f 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -74,12 +74,12 @@ Instance::Instance(Frontend::WindowSDL& window, s32 physical_device_index, available_extensions = GetSupportedExtensions(physical_device); properties = physical_device.getProperties(); + CollectDeviceParameters(); ASSERT_MSG(properties.apiVersion >= TargetVulkanApiVersion, "Vulkan {}.{} is required, but only {}.{} is supported by device!", VK_VERSION_MAJOR(TargetVulkanApiVersion), VK_VERSION_MINOR(TargetVulkanApiVersion), VK_VERSION_MAJOR(properties.apiVersion), VK_VERSION_MINOR(properties.apiVersion)); - CollectDeviceParameters(); CreateDevice(); CollectToolingInfo(); } @@ -156,6 +156,7 @@ bool Instance::CreateDevice() { add_extension(VK_KHR_MAINTENANCE_4_EXTENSION_NAME); add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); + add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); // The next two extensions are required to be available together in order to support write masks color_write_en = add_extension(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME); color_write_en &= add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); @@ -208,12 +209,14 @@ bool Instance::CreateDevice() { .shaderImageGatherExtended = true, .shaderStorageImageMultisample = true, .shaderClipDistance = features.shaderClipDistance, + .shaderInt16 = true, }, }, vk::PhysicalDeviceVulkan11Features{ .shaderDrawParameters = true, }, vk::PhysicalDeviceVulkan12Features{ + .shaderFloat16 = true, .scalarBlockLayout = true, .uniformBufferStandardLayout = true, .hostQueryReset = true, @@ -237,7 +240,12 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceDepthClipControlFeaturesEXT{ .depthClipControl = true, }, - }; + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR{ + .workgroupMemoryExplicitLayout = true, + .workgroupMemoryExplicitLayoutScalarBlockLayout = true, + .workgroupMemoryExplicitLayout8BitAccess = true, + .workgroupMemoryExplicitLayout16BitAccess = true, + }}; if (!color_write_en) { device_chain.unlink(); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index e0b53322..50274604 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -18,6 +18,52 @@ extern std::unique_ptr renderer; namespace Vulkan { +using Shader::VsOutput; + +void BuildVsOutputs(Shader::Info& info, const AmdGpu::Liverpool::VsOutputControl& ctl) { + const auto add_output = [&](VsOutput x, VsOutput y, VsOutput z, VsOutput w) { + if (x != VsOutput::None || y != VsOutput::None || z != VsOutput::None || + w != VsOutput::None) { + info.vs_outputs.emplace_back(Shader::VsOutputMap{x, y, z, w}); + } + }; + // VS_OUT_MISC_VEC + add_output(ctl.use_vtx_point_size ? 
VsOutput::PointSprite : VsOutput::None, + ctl.use_vtx_edge_flag + ? VsOutput::EdgeFlag + : (ctl.use_vtx_gs_cut_flag ? VsOutput::GsCutFlag : VsOutput::None), + ctl.use_vtx_kill_flag + ? VsOutput::KillFlag + : (ctl.use_vtx_render_target_idx ? VsOutput::GsMrtIndex : VsOutput::None), + ctl.use_vtx_viewport_idx ? VsOutput::GsVpIndex : VsOutput::None); + // VS_OUT_CCDIST0 + add_output(ctl.IsClipDistEnabled(0) + ? VsOutput::ClipDist0 + : (ctl.IsCullDistEnabled(0) ? VsOutput::CullDist0 : VsOutput::None), + ctl.IsClipDistEnabled(1) + ? VsOutput::ClipDist1 + : (ctl.IsCullDistEnabled(1) ? VsOutput::CullDist1 : VsOutput::None), + ctl.IsClipDistEnabled(2) + ? VsOutput::ClipDist2 + : (ctl.IsCullDistEnabled(2) ? VsOutput::CullDist2 : VsOutput::None), + ctl.IsClipDistEnabled(3) + ? VsOutput::ClipDist3 + : (ctl.IsCullDistEnabled(3) ? VsOutput::CullDist3 : VsOutput::None)); + // VS_OUT_CCDIST1 + add_output(ctl.IsClipDistEnabled(4) + ? VsOutput::ClipDist4 + : (ctl.IsCullDistEnabled(4) ? VsOutput::CullDist4 : VsOutput::None), + ctl.IsClipDistEnabled(5) + ? VsOutput::ClipDist5 + : (ctl.IsCullDistEnabled(5) ? VsOutput::CullDist5 : VsOutput::None), + ctl.IsClipDistEnabled(6) + ? VsOutput::ClipDist6 + : (ctl.IsCullDistEnabled(6) ? VsOutput::CullDist6 : VsOutput::None), + ctl.IsClipDistEnabled(7) + ? VsOutput::ClipDist7 + : (ctl.IsCullDistEnabled(7) ? VsOutput::CullDist7 : VsOutput::None)); +} + Shader::Info MakeShaderInfo(Shader::Stage stage, std::span user_data, const AmdGpu::Liverpool::Regs& regs) { Shader::Info info{}; @@ -26,6 +72,7 @@ Shader::Info MakeShaderInfo(Shader::Stage stage, std::span user_d switch (stage) { case Shader::Stage::Vertex: { info.num_user_data = regs.vs_program.settings.num_user_regs; + BuildVsOutputs(info, regs.vs_output_control); break; } case Shader::Stage::Fragment: { @@ -45,6 +92,7 @@ Shader::Info MakeShaderInfo(Shader::Stage stage, std::span user_d info.num_user_data = cs_pgm.settings.num_user_regs; info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full, cs_pgm.num_thread_z.full}; + info.shared_memory_size = cs_pgm.SharedMemSize(); break; } default: @@ -60,6 +108,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, pipeline_cache = instance.GetDevice().createPipelineCacheUnique({}); profile = Shader::Profile{ .supported_spirv = 0x00010600U, + .support_explicit_workgroup_layout = true, }; } @@ -153,7 +202,7 @@ void PipelineCache::RefreshGraphicsKey() { for (u32 i = 0; i < MaxShaderStages; i++) { auto* pgm = regs.ProgramForStage(i); - if (!pgm || !pgm->Address()) { + if (!pgm || !pgm->Address()) { key.stage_hashes[i] = 0; continue; } @@ -209,7 +258,9 @@ std::unique_ptr PipelineCache::CreateGraphicsPipeline() { // Recompile shader to IR. try { LOG_INFO(Render_Vulkan, "Compiling {} shader {:#x}", stage, hash); - const Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs); + Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs); + info.pgm_base = pgm->Address(); + info.pgm_hash = hash; programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); // Compile IR to SPIR-V @@ -247,8 +298,9 @@ std::unique_ptr PipelineCache::CreateComputePipeline() { // Recompile shader to IR. 
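// Illustration of BuildVsOutputs above, using made-up register state (editorial): with
// use_vtx_point_size set and clip distances 0 and 1 enabled, vs_outputs receives two
// packed rows, { PointSprite, None, None, None } for VS_OUT_MISC_VEC and
// { ClipDist0, ClipDist1, None, None } for VS_OUT_CCDIST0; rows whose four slots are
// all None (here VS_OUT_CCDIST1) are skipped entirely.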
try { LOG_INFO(Render_Vulkan, "Compiling cs shader {:#x}", compute_key); - const Shader::Info info = + Shader::Info info = MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs); + info.pgm_base = cs_pgm.Address(); auto program = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); // Compile IR to SPIR-V @@ -258,8 +310,11 @@ std::unique_ptr PipelineCache::CreateComputePipeline() { DumpShader(spv_code, compute_key, Shader::Stage::Compute, "spv"); } const auto module = CompileSPV(spv_code, instance.GetDevice()); + // Set module name to hash in renderdoc + const auto name = fmt::format("cs_{:#x}", compute_key); + Vulkan::SetObjectName(instance.GetDevice(), module, name); return std::make_unique(instance, scheduler, *pipeline_cache, - &program.info, module); + &program.info, compute_key, module); } catch (const Shader::Exception& e) { UNREACHABLE_MSG("{}", e.what()); return nullptr; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index cabec162..6440ebc7 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -23,7 +23,7 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, : instance{instance_}, scheduler{scheduler_}, texture_cache{texture_cache_}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool}, - vertex_index_buffer{instance, scheduler, VertexIndexFlags, 512_MB, BufferType::Upload} { + vertex_index_buffer{instance, scheduler, VertexIndexFlags, 3_GB, BufferType::Upload} { if (!Config::nullGpu()) { liverpool->BindRasterizer(this); } @@ -44,11 +44,14 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) { return; } - UpdateDynamicState(*pipeline); - - pipeline->BindResources(memory, vertex_index_buffer, texture_cache); + try { + pipeline->BindResources(memory, vertex_index_buffer, texture_cache); + } catch (...) { + UNREACHABLE(); + } BeginRendering(); + UpdateDynamicState(*pipeline); cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle()); if (is_indexed) { @@ -71,9 +74,14 @@ void Rasterizer::DispatchDirect() { return; } - const auto has_resources = pipeline->BindResources(memory, vertex_index_buffer, texture_cache); - if (!has_resources) { - return; + try { + const auto has_resources = + pipeline->BindResources(memory, vertex_index_buffer, texture_cache); + if (!has_resources) { + return; + } + } catch (...) { + UNREACHABLE(); } scheduler.EndRendering(); @@ -163,7 +171,7 @@ u32 Rasterizer::SetupIndexBuffer(bool& is_indexed, u32 index_offset) { // Upload index data to stream buffer. 
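// Sizing example (editorial): the upload below now also covers the draw's starting
// offset, presumably because the draw reads from index_offset within the same buffer;
// with index_offset == 100, regs.num_indices == 300 and 16-bit indices that is
// (100 + 300) * sizeof(u16) == 800 bytes instead of the previous 300 * 2 == 600.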
const auto index_address = regs.index_base_address.Address(); - const u32 index_buffer_size = regs.num_indices * index_size; + const u32 index_buffer_size = (index_offset + regs.num_indices) * index_size; const auto [data, offset, _] = vertex_index_buffer.Map(index_buffer_size); std::memcpy(data, index_address, index_buffer_size); vertex_index_buffer.Commit(index_buffer_size); diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index 2a025e22..116f7896 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -226,7 +226,7 @@ void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) { while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) { auto& watch = previous_watches[wait_cursor]; wait_bound = watch.upper_bound; - // scheduler.Wait(watch.tick); + scheduler.Wait(watch.tick); ++wait_cursor; } } diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 427f7e4b..f06492ef 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -321,12 +321,15 @@ void Image::Upload(vk::Buffer buffer, u64 offset) { Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); // Copy to the image. + const auto aspect = aspect_mask & vk::ImageAspectFlagBits::eStencil + ? vk::ImageAspectFlagBits::eDepth + : aspect_mask; const vk::BufferImageCopy image_copy = { .bufferOffset = offset, .bufferRowLength = info.pitch, .bufferImageHeight = info.size.height, .imageSubresource{ - .aspectMask = aspect_mask, + .aspectMask = aspect, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1, diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 4fc0589c..8c168000 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -77,7 +77,6 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info if (usage_override) { usage_ci.usage = usage_override.value(); } - // When sampling D32 texture from shader, the T# specifies R32 Float format so adjust it. vk::Format format = info.format; vk::ImageAspectFlags aspect = image.aspect_mask; diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 02811735..8cd6f893 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -134,13 +134,13 @@ ImageId TextureCache::FindImage(const ImageInfo& info, VAddr cpu_address, bool r image_id = slot_images.insert(instance, scheduler, info, cpu_address); RegisterImage(image_id); } else { - image_id = image_ids.size() > 1 ? 
image_ids[1] : image_ids[0]; + image_id = image_ids[0]; } RegisterMeta(info, image_id); Image& image = slot_images[image_id]; - if (True(image.flags & ImageFlagBits::CpuModified)) { + if (True(image.flags & ImageFlagBits::CpuModified) && refresh_on_create) { RefreshImage(image); TrackImage(image, image_id); } @@ -193,7 +193,7 @@ ImageView& TextureCache::FindImageView(const AmdGpu::Image& desc, bool is_storag ImageView& TextureCache::RenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer, const AmdGpu::Liverpool::CbDbExtent& hint) { const ImageInfo info{buffer, hint}; - const ImageId image_id = FindImage(info, buffer.Address(), false); + const ImageId image_id = FindImage(info, buffer.Address()); Image& image = slot_images[image_id]; image.flags &= ~ImageFlagBits::CpuModified; diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index e9818d75..51af4ad8 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -179,19 +179,19 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) { case vk::Format::eR8Unorm: return vk::Format::eR8Uint; case vk::Format::eR8G8Unorm: + case vk::Format::eR16Sfloat: return vk::Format::eR8G8Uint; case vk::Format::eR8G8B8A8Srgb: - [[fallthrough]]; case vk::Format::eB8G8R8A8Srgb: - [[fallthrough]]; case vk::Format::eB8G8R8A8Unorm: - [[fallthrough]]; case vk::Format::eR8G8B8A8Unorm: + case vk::Format::eR32Sfloat: + case vk::Format::eR32Uint: return vk::Format::eR32Uint; case vk::Format::eBc1RgbaUnormBlock: + case vk::Format::eR32G32Sfloat: return vk::Format::eR32G32Uint; case vk::Format::eBc3SrgbBlock: - [[fallthrough]]; case vk::Format::eBc3UnormBlock: case vk::Format::eBc7SrgbBlock: case vk::Format::eBc7UnormBlock:
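// Editorial note on the demotion table above: formats appear to be grouped by bytes per
// texel (or per block for BC), e.g. the newly added eR16Sfloat is 2 bytes per texel like
// eR8G8Unorm and therefore demotes to eR8G8Uint, while the 4-byte eR32Sfloat/eR32Uint
// cases join the eR32Uint group used by the detiler.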