diff --git a/CMakeLists.txt b/CMakeLists.txt index 4df3db2b..9153197c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,8 @@ if(ENABLE_QT_GUI) find_package(Qt6 REQUIRED COMPONENTS Widgets Concurrent) qt_standard_project_setup() set(CMAKE_AUTORCC ON) + set(CMAKE_AUTOMOC ON) + set(CMAKE_AUTOUIC ON) endif() set(AUDIO_CORE src/audio_core/sdl_audio.cpp @@ -419,6 +421,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/ir_passes.h + src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp src/shader_recompiler/ir/passes/resource_tracking_pass.cpp src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp @@ -546,10 +549,13 @@ set(QT_GUI src/qt_gui/elf_viewer.h src/qt_gui/main_window_themes.cpp src/qt_gui/main_window_themes.h + src/qt_gui/settings_dialog.cpp + src/qt_gui/settings_dialog.h + src/qt_gui/settings_dialog.ui src/qt_gui/main.cpp ${EMULATOR} ${RESOURCE_FILES} - ) +) endif() if (ENABLE_QT_GUI) diff --git a/externals/ext-boost b/externals/ext-boost index 147b2de7..a04136ad 160000 --- a/externals/ext-boost +++ b/externals/ext-boost @@ -1 +1 @@ -Subproject commit 147b2de7734f5dc3b9aeb1f4135ae15fcd44b9d7 +Subproject commit a04136add1e469f46d8ae8d3e8307779240a5c53 diff --git a/src/common/config.cpp b/src/common/config.cpp index f676ab94..a65a5b59 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -25,7 +25,9 @@ static bool shouldDumpPM4 = false; static u32 vblankDivider = 1; static bool vkValidation = false; static bool vkValidationSync = false; +static bool vkValidationGpu = false; static bool rdocEnable = false; +static bool rdocMarkersEnable = false; // Gui std::string settings_install_dir = ""; u32 main_window_geometry_x = 400; @@ -102,6 +104,10 @@ bool isRdocEnabled() { return rdocEnable; } +bool 
isMarkersEnabled() { + return rdocMarkersEnable; +} + u32 vblankDiv() { return vblankDivider; } @@ -114,6 +120,78 @@ bool vkValidationSyncEnabled() { return vkValidationSync; } +bool vkValidationGpuEnabled() { + return vkValidationGpu; +} + +void setGpuId(s32 selectedGpuId) { + gpuId = selectedGpuId; +} + +void setScreenWidth(u32 width) { + screenWidth = width; +} + +void setScreenHeight(u32 height) { + screenHeight = height; +} + +void setDebugDump(bool enable) { + isDebugDump = enable; +} + +void setShowSplash(bool enable) { + isShowSplash = enable; +} + +void setNullGpu(bool enable) { + isNullGpu = enable; +} + +void setDumpShaders(bool enable) { + shouldDumpShaders = enable; +} + +void setDumpPM4(bool enable) { + shouldDumpPM4 = enable; +} + +void setVkValidation(bool enable) { + vkValidation = enable; +} + +void setVkSyncValidation(bool enable) { + vkValidationSync = enable; +} + +void setRdocEnabled(bool enable) { + rdocEnable = enable; +} + +void setVblankDiv(u32 value) { + vblankDivider = value; +} + +void setFullscreenMode(bool enable) { + isFullscreen = enable; +} + +void setLanguage(u32 language) { + m_language = language; +} + +void setNeoMode(bool enable) { + isNeo = enable; +} + +void setLogType(std::string type) { + logType = type; +} + +void setLogFilter(std::string type) { + logFilter = type; +} + void setMainWindowGeometry(u32 x, u32 y, u32 w, u32 h) { main_window_geometry_x = x; main_window_geometry_y = y; @@ -255,7 +333,9 @@ void load(const std::filesystem::path& path) { gpuId = toml::find_or(vk, "gpuId", -1); vkValidation = toml::find_or(vk, "validation", false); vkValidationSync = toml::find_or(vk, "validation_sync", false); + vkValidationGpu = toml::find_or(vk, "validation_gpu", true); rdocEnable = toml::find_or(vk, "rdocEnable", false); + rdocMarkersEnable = toml::find_or(vk, "rdocMarkersEnable", false); } if (data.contains("Debug")) { @@ -330,7 +410,9 @@ void save(const std::filesystem::path& path) { data["Vulkan"]["gpuId"] = gpuId; 
data["Vulkan"]["validation"] = vkValidation; data["Vulkan"]["validation_sync"] = vkValidationSync; + data["Vulkan"]["validation_gpu"] = vkValidationGpu; data["Vulkan"]["rdocEnable"] = rdocEnable; + data["Vulkan"]["rdocMarkersEnable"] = rdocMarkersEnable; data["Debug"]["DebugDump"] = isDebugDump; data["LLE"]["libc"] = isLibc; data["GUI"]["theme"] = mw_themes; @@ -356,4 +438,24 @@ void save(const std::filesystem::path& path) { file << data; file.close(); } + +void setDefaultValues() { + isNeo = false; + isFullscreen = false; + screenWidth = 1280; + screenHeight = 720; + logFilter = ""; + logType = "async"; + isDebugDump = false; + isShowSplash = false; + isNullGpu = false; + shouldDumpShaders = false; + shouldDumpPM4 = false; + vblankDivider = 1; + vkValidation = false; + rdocEnable = false; + m_language = 1; + gpuId = -1; +} + } // namespace Config diff --git a/src/common/config.h b/src/common/config.h index 53c88ec9..97055028 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -27,10 +27,32 @@ bool nullGpu(); bool dumpShaders(); bool dumpPM4(); bool isRdocEnabled(); +bool isMarkersEnabled(); u32 vblankDiv(); +void setDebugDump(bool enable); +void setShowSplash(bool enable); +void setNullGpu(bool enable); +void setDumpShaders(bool enable); +void setDumpPM4(bool enable); +void setVblankDiv(u32 value); +void setGpuId(s32 selectedGpuId); +void setScreenWidth(u32 width); +void setScreenHeight(u32 height); +void setFullscreenMode(bool enable); +void setLanguage(u32 language); +void setNeoMode(bool enable); + +void setLogType(std::string type); +void setLogFilter(std::string type); + +void setVkValidation(bool enable); +void setVkSyncValidation(bool enable); +void setRdocEnabled(bool enable); + bool vkValidationEnabled(); bool vkValidationSyncEnabled(); +bool vkValidationGpuEnabled(); // Gui void setMainWindowGeometry(u32 x, u32 y, u32 w, u32 h); @@ -64,7 +86,8 @@ std::vector getPkgViewer(); std::vector getElfViewer(); std::vector getRecentFiles(); +void 
setDefaultValues(); + // settings u32 GetLanguage(); - }; // namespace Config diff --git a/src/core/address_space.cpp b/src/core/address_space.cpp index 4dd06e99..47860b25 100644 --- a/src/core/address_space.cpp +++ b/src/core/address_space.cpp @@ -459,8 +459,28 @@ void* AddressSpace::MapFile(VAddr virtual_addr, size_t size, size_t offset, u32 #endif } -void AddressSpace::Unmap(VAddr virtual_addr, size_t size, bool has_backing) { - return impl->Unmap(virtual_addr, size, has_backing); +void AddressSpace::Unmap(VAddr virtual_addr, size_t size, VAddr start_in_vma, VAddr end_in_vma, + PAddr phys_base, bool is_exec, bool has_backing) { +#ifdef _WIN32 + // There does not appear to be comparable support for partial unmapping on Windows. + // Unfortunately, at least one title was found to require this. The workaround is to unmap + // the entire allocation and remap the portions outside of the requested unmapping range. + impl->Unmap(virtual_addr, size, has_backing); + + // TODO: Determine if any titles require partial unmapping support for flexible allocations. + ASSERT_MSG(has_backing || (start_in_vma == 0 && end_in_vma == size), + "Partial unmapping of flexible allocations is not supported"); + + if (start_in_vma != 0) { + Map(virtual_addr, start_in_vma, 0, phys_base, is_exec); + } + + if (end_in_vma != size) { + Map(virtual_addr + end_in_vma, size - end_in_vma, 0, phys_base + end_in_vma, is_exec); + } +#else + impl->Unmap(virtual_addr + start_in_vma, end_in_vma - start_in_vma, has_backing); +#endif } void AddressSpace::Protect(VAddr virtual_addr, size_t size, MemoryPermission perms) { diff --git a/src/core/address_space.h b/src/core/address_space.h index 29f74f56..53041bcc 100644 --- a/src/core/address_space.h +++ b/src/core/address_space.h @@ -91,7 +91,8 @@ public: void* MapFile(VAddr virtual_addr, size_t size, size_t offset, u32 prot, uintptr_t fd); /// Unmaps specified virtual memory area. 
- void Unmap(VAddr virtual_addr, size_t size, bool has_backing); + void Unmap(VAddr virtual_addr, size_t size, VAddr start_in_vma, VAddr end_in_vma, + PAddr phys_base, bool is_exec, bool has_backing); void Protect(VAddr virtual_addr, size_t size, MemoryPermission perms); diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp index 2bcff191..a6d5c3ea 100644 --- a/src/core/file_sys/fs.cpp +++ b/src/core/file_sys/fs.cpp @@ -54,6 +54,7 @@ std::filesystem::path MntPoints::GetHostPath(const std::string& guest_directory) // If the path does not exist attempt to verify this. // Retrieve parent path until we find one that exists. + std::scoped_lock lk{m_mutex}; path_parts.clear(); auto current_path = host_path; while (!std::filesystem::exists(current_path)) { diff --git a/src/core/libraries/audio/audioout.cpp b/src/core/libraries/audio/audioout.cpp index cc7ce342..eac3845f 100644 --- a/src/core/libraries/audio/audioout.cpp +++ b/src/core/libraries/audio/audioout.cpp @@ -235,6 +235,9 @@ int PS4_SYSV_ABI sceAudioOutGetSystemState() { } int PS4_SYSV_ABI sceAudioOutInit() { + if (audio != nullptr) { + return ORBIS_AUDIO_OUT_ERROR_ALREADY_INIT; + } audio = std::make_unique(); LOG_INFO(Lib_AudioOut, "called"); return ORBIS_OK; diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 650252f9..c2ee6d59 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -956,9 +956,9 @@ int PS4_SYSV_ABI sceGnmGetGpuBlockStatus() { return ORBIS_OK; } -int PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency() { - LOG_DEBUG(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; +u32 PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency() { + LOG_TRACE(Lib_GnmDriver, "called"); + return Config::isNeoMode() ? 
911'000'000 : 800'000'000; } int PS4_SYSV_ABI sceGnmGetGpuInfoStatus() { @@ -1706,8 +1706,18 @@ int PS4_SYSV_ABI sceGnmSetupMipStatsReport() { return ORBIS_OK; } -int PS4_SYSV_ABI sceGnmSetVgtControl() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); +s32 PS4_SYSV_ABI sceGnmSetVgtControl(u32* cmdbuf, u32 size, u32 prim_group_sz_minus_one, + u32 partial_vs_wave_mode, u32 wd_switch_only_on_eop_mode) { + LOG_TRACE(Lib_GnmDriver, "called"); + + if (!cmdbuf || size != 3 || (prim_group_sz_minus_one >= 0x100) || + ((wd_switch_only_on_eop_mode | partial_vs_wave_mode) >= 2)) { + return -1; + } + + const u32 reg_value = + ((partial_vs_wave_mode & 1) << 0x10) | (prim_group_sz_minus_one & 0xffffu); + PM4CmdSetData::SetContextReg(cmdbuf, 0x2aau, reg_value); // IA_MULTI_VGT_PARAM return ORBIS_OK; } diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index 8100b116..84872297 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -85,7 +85,7 @@ int PS4_SYSV_ABI sceGnmGetDebugTimestamp(); int PS4_SYSV_ABI sceGnmGetEqEventType(); int PS4_SYSV_ABI sceGnmGetEqTimeStamp(); int PS4_SYSV_ABI sceGnmGetGpuBlockStatus(); -int PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency(); +u32 PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency(); int PS4_SYSV_ABI sceGnmGetGpuInfoStatus(); int PS4_SYSV_ABI sceGnmGetLastWaitedAddress(); int PS4_SYSV_ABI sceGnmGetNumTcaUnits(); @@ -161,7 +161,8 @@ int PS4_SYSV_ABI sceGnmSetResourceUserData(); int PS4_SYSV_ABI sceGnmSetSpiEnableSqCounters(); int PS4_SYSV_ABI sceGnmSetSpiEnableSqCountersForUnitInstance(); int PS4_SYSV_ABI sceGnmSetupMipStatsReport(); -int PS4_SYSV_ABI sceGnmSetVgtControl(); +s32 PS4_SYSV_ABI sceGnmSetVgtControl(u32* cmdbuf, u32 size, u32 prim_group_sz_minus_one, + u32 partial_vs_wave_mode, u32 wd_switch_only_on_eop_mode); s32 PS4_SYSV_ABI sceGnmSetVsShader(u32* cmdbuf, u32 size, const u32* vs_regs, u32 shader_modifier); int PS4_SYSV_ABI 
sceGnmSetWaveLimitMultiplier(); int PS4_SYSV_ABI sceGnmSetWaveLimitMultipliers(); diff --git a/src/core/libraries/kernel/libkernel.cpp b/src/core/libraries/kernel/libkernel.cpp index 4b9565b6..460fc427 100644 --- a/src/core/libraries/kernel/libkernel.cpp +++ b/src/core/libraries/kernel/libkernel.cpp @@ -360,7 +360,6 @@ int PS4_SYSV_ABI posix_connect() { } int PS4_SYSV_ABI _sigprocmask() { - LOG_DEBUG(Lib_Kernel, "STUBBED"); return ORBIS_OK; } diff --git a/src/core/libraries/kernel/memory_management.cpp b/src/core/libraries/kernel/memory_management.cpp index 11e8a74c..189c7387 100644 --- a/src/core/libraries/kernel/memory_management.cpp +++ b/src/core/libraries/kernel/memory_management.cpp @@ -75,13 +75,22 @@ s32 PS4_SYSV_ABI sceKernelAvailableDirectMemorySize(u64 searchStart, u64 searchE size_t* sizeOut) { LOG_WARNING(Kernel_Vmm, "called searchStart = {:#x}, searchEnd = {:#x}, alignment = {:#x}", searchStart, searchEnd, alignment); + + if (searchEnd <= searchStart) { + return ORBIS_KERNEL_ERROR_EINVAL; + } + if (searchEnd > SCE_KERNEL_MAIN_DMEM_SIZE) { + return ORBIS_KERNEL_ERROR_EINVAL; + } + auto* memory = Core::Memory::Instance(); PAddr physAddr; - s32 size = memory->DirectQueryAvailable(searchStart, searchEnd, alignment, &physAddr, sizeOut); + s32 result = + memory->DirectQueryAvailable(searchStart, searchEnd, alignment, &physAddr, sizeOut); *physAddrOut = static_cast(physAddr); - return size; + return result; } s32 PS4_SYSV_ABI sceKernelVirtualQuery(const void* addr, int flags, OrbisVirtualQueryInfo* info, @@ -244,9 +253,9 @@ s32 PS4_SYSV_ABI sceKernelAvailableFlexibleMemorySize(size_t* out_size) { return ORBIS_OK; } -void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func) { +void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func[]) { auto* linker = Common::Singleton::Instance(); - linker->SetHeapApiFunc(func); + linker->SetHeapAPI(func); } int PS4_SYSV_ABI sceKernelGetDirectMemoryType(u64 addr, int* directMemoryTypeOut, diff --git 
a/src/core/libraries/kernel/memory_management.h b/src/core/libraries/kernel/memory_management.h index 0c775246..440685c3 100644 --- a/src/core/libraries/kernel/memory_management.h +++ b/src/core/libraries/kernel/memory_management.h @@ -102,7 +102,7 @@ int PS4_SYSV_ABI sceKernelMTypeProtect(void* addr, size_t size, int mtype, int p int PS4_SYSV_ABI sceKernelDirectMemoryQuery(u64 offset, int flags, OrbisQueryInfo* query_info, size_t infoSize); s32 PS4_SYSV_ABI sceKernelAvailableFlexibleMemorySize(size_t* sizeOut); -void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func); +void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func[]); int PS4_SYSV_ABI sceKernelGetDirectMemoryType(u64 addr, int* directMemoryTypeOut, void** directMemoryStartOut, void** directMemoryEndOut); diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 85e2d0e6..6319b7c2 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -421,13 +421,21 @@ ScePthreadMutex* createMutex(ScePthreadMutex* addr) { return addr; } -int PS4_SYSV_ABI scePthreadMutexInit(ScePthreadMutex* mutex, const ScePthreadMutexattr* attr, +int PS4_SYSV_ABI scePthreadMutexInit(ScePthreadMutex* mutex, const ScePthreadMutexattr* mutex_attr, const char* name) { + const ScePthreadMutexattr* attr; + if (mutex == nullptr) { return SCE_KERNEL_ERROR_EINVAL; } - if (attr == nullptr) { + if (mutex_attr == nullptr) { attr = g_pthread_cxt->getDefaultMutexattr(); + } else { + if (*mutex_attr == nullptr) { + attr = g_pthread_cxt->getDefaultMutexattr(); + } else { + attr = mutex_attr; + } } *mutex = new PthreadMutexInternal{}; @@ -1086,6 +1094,19 @@ int PS4_SYSV_ABI scePthreadAttrGetstack(ScePthreadAttr* attr, void** addr, size_ return SCE_KERNEL_ERROR_EINVAL; } +int PS4_SYSV_ABI scePthreadAttrSetstack(ScePthreadAttr* attr, void* addr, size_t size) { + if (attr == nullptr || *attr == nullptr || addr == 
nullptr || size < 0x4000) { + return ORBIS_KERNEL_ERROR_EINVAL; + } + int result = pthread_attr_setstack(&(*attr)->pth_attr, addr, size); + LOG_INFO(Kernel_Pthread, "scePthreadAttrSetstack: result = {}", result); + + if (result == 0) { + return ORBIS_OK; + } + return ORBIS_KERNEL_ERROR_EINVAL; +} + int PS4_SYSV_ABI scePthreadJoin(ScePthread thread, void** res) { int result = pthread_join(thread->pth, res); LOG_INFO(Kernel_Pthread, "scePthreadJoin result = {}", result); @@ -1542,6 +1563,7 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("B5GmVDKwpn0", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_yield); LIB_FUNCTION("-quPa4SEJUw", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrGetstack); + LIB_FUNCTION("Bvn74vj6oLo", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetstack); LIB_FUNCTION("Ru36fiTtJzA", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrGetstackaddr); LIB_FUNCTION("-fA+7ZlGDQs", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrGetstacksize); LIB_FUNCTION("14bOACANTBo", "libkernel", 1, "libkernel", 1, 1, scePthreadOnce); diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp index 370dba44..5304dc57 100644 --- a/src/core/libraries/kernel/threads/semaphore.cpp +++ b/src/core/libraries/kernel/threads/semaphore.cpp @@ -9,7 +9,6 @@ #include "common/assert.h" #include "common/logging/log.h" #include "core/libraries/error_codes.h" -#include "core/libraries/kernel/thread_management.h" #include "core/libraries/libs.h" namespace Libraries::Kernel { @@ -82,7 +81,6 @@ public: public: struct WaitingThread : public ListBaseHook { - std::string name; std::condition_variable cv; u32 priority; s32 need_count; @@ -90,7 +88,6 @@ public: bool was_cancled{}; explicit WaitingThread(s32 need_count, bool is_fifo) : need_count{need_count} { - name = scePthreadSelf()->name; if (is_fifo) { return; } @@ -174,10 +171,16 @@ s32 PS4_SYSV_ABI sceKernelCreateSema(OrbisKernelSema* sem, 
const char* pName, u3 } s32 PS4_SYSV_ABI sceKernelWaitSema(OrbisKernelSema sem, s32 needCount, u32* pTimeout) { + if (!sem) { + return ORBIS_KERNEL_ERROR_ESRCH; + } return sem->Wait(true, needCount, pTimeout); } s32 PS4_SYSV_ABI sceKernelSignalSema(OrbisKernelSema sem, s32 signalCount) { + if (!sem) { + return ORBIS_KERNEL_ERROR_ESRCH; + } if (!sem->Signal(signalCount)) { return ORBIS_KERNEL_ERROR_EINVAL; } @@ -185,10 +188,16 @@ s32 PS4_SYSV_ABI sceKernelSignalSema(OrbisKernelSema sem, s32 signalCount) { } s32 PS4_SYSV_ABI sceKernelPollSema(OrbisKernelSema sem, s32 needCount) { + if (!sem) { + return ORBIS_KERNEL_ERROR_ESRCH; + } return sem->Wait(false, needCount, nullptr); } int PS4_SYSV_ABI sceKernelCancelSema(OrbisKernelSema sem, s32 setCount, s32* pNumWaitThreads) { + if (!sem) { + return ORBIS_KERNEL_ERROR_ESRCH; + } return sem->Cancel(setCount, pNumWaitThreads); } diff --git a/src/core/libraries/np_manager/np_manager.cpp b/src/core/libraries/np_manager/np_manager.cpp index 33308abc..fd4e31f5 100644 --- a/src/core/libraries/np_manager/np_manager.cpp +++ b/src/core/libraries/np_manager/np_manager.cpp @@ -974,8 +974,11 @@ int PS4_SYSV_ABI sceNpGetGamePresenceStatusA() { return ORBIS_OK; } -int PS4_SYSV_ABI sceNpGetNpId() { - LOG_ERROR(Lib_NpManager, "(STUBBED) called"); +int PS4_SYSV_ABI sceNpGetNpId(OrbisUserServiceUserId userId, OrbisNpId* npId) { + LOG_ERROR(Lib_NpManager, "(DUMMY) called"); + + std::string name = "shadps4"; + strcpy(npId->handle.data, name.c_str()); return ORBIS_OK; } diff --git a/src/core/libraries/np_manager/np_manager.h b/src/core/libraries/np_manager/np_manager.h index 5b11355a..5955a40b 100644 --- a/src/core/libraries/np_manager/np_manager.h +++ b/src/core/libraries/np_manager/np_manager.h @@ -11,6 +11,22 @@ class SymbolsResolver; namespace Libraries::NpManager { +constexpr int ORBIS_NP_ONLINEID_MAX_LENGTH = 16; + +typedef int OrbisUserServiceUserId; + +struct OrbisNpOnlineId { + char data[ORBIS_NP_ONLINEID_MAX_LENGTH]; + char term; + 
char dummy[3]; +}; + +struct OrbisNpId { + OrbisNpOnlineId handle; + u8 opt[8]; + u8 reserved[8]; +}; + int PS4_SYSV_ABI Func_EF4378573542A508(); int PS4_SYSV_ABI _sceNpIpcCreateMemoryFromKernel(); int PS4_SYSV_ABI _sceNpIpcCreateMemoryFromPool(); @@ -204,7 +220,7 @@ int PS4_SYSV_ABI sceNpGetAccountLanguage2(); int PS4_SYSV_ABI sceNpGetAccountLanguageA(); int PS4_SYSV_ABI sceNpGetGamePresenceStatus(); int PS4_SYSV_ABI sceNpGetGamePresenceStatusA(); -int PS4_SYSV_ABI sceNpGetNpId(); +int PS4_SYSV_ABI sceNpGetNpId(OrbisUserServiceUserId userId, OrbisNpId* npId); int PS4_SYSV_ABI sceNpGetNpReachabilityState(); int PS4_SYSV_ABI sceNpGetOnlineId(); int PS4_SYSV_ABI sceNpGetParentalControlInfo(); diff --git a/src/core/libraries/pad/pad.cpp b/src/core/libraries/pad/pad.cpp index d3993550..c9e332d2 100644 --- a/src/core/libraries/pad/pad.cpp +++ b/src/core/libraries/pad/pad.cpp @@ -419,8 +419,14 @@ int PS4_SYSV_ABI scePadSetForceIntercepted() { } int PS4_SYSV_ABI scePadSetLightBar(s32 handle, const OrbisPadLightBarParam* pParam) { - LOG_ERROR(Lib_Pad, "(STUBBED) called"); - return ORBIS_OK; + if (pParam != nullptr) { + LOG_INFO(Lib_Pad, "scePadSetLightBar called handle = {} rgb = {} {} {}", handle, pParam->r, + pParam->g, pParam->b); + auto* controller = Common::Singleton::Instance(); + controller->SetLightBarRGB(pParam->r, pParam->g, pParam->b); + return ORBIS_OK; + } + return ORBIS_PAD_ERROR_INVALID_ARG; } int PS4_SYSV_ABI scePadSetLightBarBaseBrightness() { @@ -479,8 +485,14 @@ int PS4_SYSV_ABI scePadSetUserColor() { } int PS4_SYSV_ABI scePadSetVibration(s32 handle, const OrbisPadVibrationParam* pParam) { - LOG_DEBUG(Lib_Pad, "(STUBBED) called"); - return ORBIS_OK; + if (pParam != nullptr) { + LOG_INFO(Lib_Pad, "scePadSetVibration called handle = {} data = {} , {}", handle, + pParam->smallMotor, pParam->largeMotor); + auto* controller = Common::Singleton::Instance(); + controller->SetVibration(pParam->smallMotor, pParam->largeMotor); + return ORBIS_OK; + } + return 
ORBIS_PAD_ERROR_INVALID_ARG; } int PS4_SYSV_ABI scePadSetVibrationForce() { diff --git a/src/core/libraries/videoout/driver.cpp b/src/core/libraries/videoout/driver.cpp index 97b1816e..25de48a4 100644 --- a/src/core/libraries/videoout/driver.cpp +++ b/src/core/libraries/videoout/driver.cpp @@ -9,6 +9,7 @@ #include "core/libraries/error_codes.h" #include "core/libraries/kernel/time_management.h" #include "core/libraries/videoout/driver.h" +#include "core/platform.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" extern std::unique_ptr renderer; @@ -173,14 +174,19 @@ std::chrono::microseconds VideoOutDriver::Flip(const Request& req) { // Update flip status. auto* port = req.port; - auto& flip_status = port->flip_status; - flip_status.count++; - flip_status.processTime = Libraries::Kernel::sceKernelGetProcessTime(); - flip_status.tsc = Libraries::Kernel::sceKernelReadTsc(); - flip_status.submitTsc = Libraries::Kernel::sceKernelReadTsc(); - flip_status.flipArg = req.flip_arg; - flip_status.currentBuffer = req.index; - flip_status.flipPendingNum = static_cast(requests.size()); + { + std::unique_lock lock{port->port_mutex}; + auto& flip_status = port->flip_status; + flip_status.count++; + flip_status.processTime = Libraries::Kernel::sceKernelGetProcessTime(); + flip_status.tsc = Libraries::Kernel::sceKernelReadTsc(); + flip_status.flipArg = req.flip_arg; + flip_status.currentBuffer = req.index; + if (req.eop) { + --flip_status.gcQueueNum; + } + --flip_status.flipPendingNum; + } // Trigger flip events for the port. 
for (auto& event : port->flip_events) { @@ -202,34 +208,54 @@ std::chrono::microseconds VideoOutDriver::Flip(const Request& req) { bool VideoOutDriver::SubmitFlip(VideoOutPort* port, s32 index, s64 flip_arg, bool is_eop /*= false*/) { + { + std::unique_lock lock{port->port_mutex}; + if (index != -1 && port->flip_status.flipPendingNum >= port->NumRegisteredBuffers()) { + LOG_ERROR(Lib_VideoOut, "Flip queue is full"); + return false; + } + + if (is_eop) { + ++port->flip_status.gcQueueNum; + } + ++port->flip_status.flipPendingNum; // integral GPU and CPU pending flips counter + port->flip_status.submitTsc = Libraries::Kernel::sceKernelReadTsc(); + } + + if (!is_eop) { + // Before processing the flip we need to ask GPU thread to flush command list as at this + // point VO surface is ready to be presented, and we will need to have an actual state of + // Vulkan image at the time of frame presentation. + liverpool->SendCommand([=, this]() { + renderer->FlushDraw(); + SubmitFlipInternal(port, index, flip_arg, is_eop); + }); + } else { + SubmitFlipInternal(port, index, flip_arg, is_eop); + } + + return true; +} + +void VideoOutDriver::SubmitFlipInternal(VideoOutPort* port, s32 index, s64 flip_arg, + bool is_eop /*= false*/) { Vulkan::Frame* frame; if (index == -1) { - frame = renderer->PrepareBlankFrame(); + frame = renderer->PrepareBlankFrame(is_eop); } else { const auto& buffer = port->buffer_slots[index]; const auto& group = port->groups[buffer.group_index]; frame = renderer->PrepareFrame(group, buffer.address_left, is_eop); } - if (index != -1 && requests.size() >= port->NumRegisteredBuffers()) { - LOG_ERROR(Lib_VideoOut, "Flip queue is full"); - return false; - } - std::scoped_lock lock{mutex}; requests.push({ .frame = frame, .port = port, .index = index, .flip_arg = flip_arg, - .submit_tsc = Libraries::Kernel::sceKernelReadTsc(), .eop = is_eop, }); - - port->flip_status.flipPendingNum = static_cast(requests.size()); - port->flip_status.gcQueueNum = 0; - - return true; } 
void VideoOutDriver::PresentThread(std::stop_token token) { diff --git a/src/core/libraries/videoout/driver.h b/src/core/libraries/videoout/driver.h index 104056de..bee80060 100644 --- a/src/core/libraries/videoout/driver.h +++ b/src/core/libraries/videoout/driver.h @@ -29,6 +29,7 @@ struct VideoOutPort { std::vector flip_events; std::vector vblank_events; std::mutex vo_mutex; + std::mutex port_mutex; std::condition_variable vo_cv; std::condition_variable vblank_cv; int flip_rate = 0; @@ -93,7 +94,6 @@ private: VideoOutPort* port; s32 index; s64 flip_arg; - u64 submit_tsc; bool eop; operator bool() const noexcept { @@ -102,6 +102,7 @@ private: }; std::chrono::microseconds Flip(const Request& req); + void SubmitFlipInternal(VideoOutPort* port, s32 index, s64 flip_arg, bool is_eop = false); void PresentThread(std::stop_token token); std::mutex mutex; diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp index 15e14662..acfcbad4 100644 --- a/src/core/libraries/videoout/video_out.cpp +++ b/src/core/libraries/videoout/video_out.cpp @@ -113,7 +113,9 @@ s32 PS4_SYSV_ABI sceVideoOutSetFlipRate(s32 handle, s32 rate) { s32 PS4_SYSV_ABI sceVideoOutIsFlipPending(s32 handle) { LOG_INFO(Lib_VideoOut, "called"); - s32 pending = driver->GetPort(handle)->flip_status.flipPendingNum; + auto* port = driver->GetPort(handle); + std::unique_lock lock{port->port_mutex}; + s32 pending = port->flip_status.flipPendingNum; return pending; } @@ -161,6 +163,7 @@ s32 PS4_SYSV_ABI sceVideoOutGetFlipStatus(s32 handle, FlipStatus* status) { return ORBIS_VIDEO_OUT_ERROR_INVALID_HANDLE; } + std::unique_lock lock{port->port_mutex}; *status = port->flip_status; LOG_INFO(Lib_VideoOut, diff --git a/src/core/linker.cpp b/src/core/linker.cpp index e4cbe573..d4a15825 100644 --- a/src/core/linker.cpp +++ b/src/core/linker.cpp @@ -305,7 +305,8 @@ void* Linker::TlsGetAddr(u64 module_index, u64 offset) { // Module was just loaded by above code. 
Allocate TLS block for it. Module* module = m_modules[module_index - 1].get(); const u32 init_image_size = module->tls.init_image_size; - u8* dest = reinterpret_cast(heap_api_func(module->tls.image_size)); + // TODO: Determine if Windows will crash from this + u8* dest = reinterpret_cast(heap_api->heap_malloc(module->tls.image_size)); const u8* src = reinterpret_cast(module->tls.image_virtual_addr); std::memcpy(dest, src, init_image_size); std::memset(dest + init_image_size, 0, module->tls.image_size - init_image_size); @@ -335,10 +336,23 @@ void Linker::InitTlsForThread(bool is_primary) { &addr_out, tls_aligned, 3, 0, "SceKernelPrimaryTcbTls"); ASSERT_MSG(ret == 0, "Unable to allocate TLS+TCB for the primary thread"); } else { - if (heap_api_func) { - addr_out = heap_api_func(total_tls_size); + if (heap_api) { +#ifndef WIN32 + addr_out = heap_api->heap_malloc(total_tls_size); } else { addr_out = std::malloc(total_tls_size); +#else + // TODO: Windows tls malloc replacement, refer to rtld_tls_block_malloc + LOG_ERROR(Core_Linker, "TLS user malloc called, using std::malloc"); + addr_out = std::malloc(total_tls_size); + if (!addr_out) { + auto pth_id = pthread_self(); + auto handle = pthread_gethandle(pth_id); + ASSERT_MSG(addr_out, + "Cannot allocate TLS block defined for handle=%x, index=%d size=%d", + handle, pth_id, total_tls_size); + } +#endif } } diff --git a/src/core/linker.h b/src/core/linker.h index aee8c8fd..ed1fe400 100644 --- a/src/core/linker.h +++ b/src/core/linker.h @@ -46,7 +46,21 @@ struct EntryParams { const char* argv[3]; }; -using HeapApiFunc = PS4_SYSV_ABI void* (*)(size_t); +struct HeapAPI { + PS4_SYSV_ABI void* (*heap_malloc)(size_t); + PS4_SYSV_ABI void (*heap_free)(void*); + PS4_SYSV_ABI void* (*heap_calloc)(size_t, size_t); + PS4_SYSV_ABI void* (*heap_realloc)(void*, size_t); + PS4_SYSV_ABI void* (*heap_memalign)(size_t, size_t); + PS4_SYSV_ABI int (*heap_posix_memalign)(void**, size_t, size_t); + // NOTE: Fields below may be inaccurate + 
PS4_SYSV_ABI int (*heap_reallocalign)(void); + PS4_SYSV_ABI void (*heap_malloc_stats)(void); + PS4_SYSV_ABI int (*heap_malloc_stats_fast)(void); + PS4_SYSV_ABI size_t (*heap_malloc_usable_size)(void*); +}; + +using AppHeapAPI = HeapAPI*; class Linker { public: @@ -75,8 +89,8 @@ public: } } - void SetHeapApiFunc(void* func) { - heap_api_func = *reinterpret_cast(func); + void SetHeapAPI(void* func[]) { + heap_api = reinterpret_cast(func); } void AdvanceGenerationCounter() noexcept { @@ -104,7 +118,7 @@ private: size_t static_tls_size{}; u32 max_tls_index{}; u32 num_static_modules{}; - HeapApiFunc heap_api_func{}; + AppHeapAPI heap_api{}; std::vector> m_modules; Loader::SymbolsResolver m_hle_symbols{}; }; diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 41bb60e3..931009e2 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -55,7 +55,7 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size, free_addr = alignment > 0 ? Common::AlignUp(free_addr, alignment) : free_addr; // Add the allocated region to the list and commit its pages. - auto& area = CarveDmemArea(free_addr, size); + auto& area = CarveDmemArea(free_addr, size)->second; area.memory_type = memory_type; area.is_free = false; return free_addr; @@ -64,9 +64,8 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size, void MemoryManager::Free(PAddr phys_addr, size_t size) { std::scoped_lock lk{mutex}; - const auto dmem_area = FindDmemArea(phys_addr); - ASSERT(dmem_area != dmem_map.end() && dmem_area->second.base == phys_addr && - dmem_area->second.size == size); + auto dmem_area = CarveDmemArea(phys_addr, size); + ASSERT(dmem_area != dmem_map.end() && dmem_area->second.size >= size); // Release any dmem mappings that reference this physical block. 
std::vector> remove_list; @@ -75,10 +74,11 @@ void MemoryManager::Free(PAddr phys_addr, size_t size) { continue; } if (mapping.phys_base <= phys_addr && phys_addr < mapping.phys_base + mapping.size) { - LOG_INFO(Kernel_Vmm, "Unmaping direct mapping {:#x} with size {:#x}", addr, - mapping.size); + auto vma_segment_start_addr = phys_addr - mapping.phys_base + addr; + LOG_INFO(Kernel_Vmm, "Unmaping direct mapping {:#x} with size {:#x}", + vma_segment_start_addr, size); // Unmaping might erase from vma_map. We can't do it here. - remove_list.emplace_back(addr, mapping.size); + remove_list.emplace_back(vma_segment_start_addr, size); } } for (const auto& [addr, size] : remove_list) { @@ -105,8 +105,6 @@ int MemoryManager::Reserve(void** out_addr, VAddr virtual_addr, size_t size, Mem const auto& vma = FindVMA(mapped_addr)->second; // If the VMA is mapped, unmap the region first. if (vma.IsMapped()) { - ASSERT_MSG(vma.base == mapped_addr && vma.size == size, - "Region must match when reserving a mapped region"); UnmapMemory(mapped_addr, size); } const size_t remaining_size = vma.base + vma.size - mapped_addr; @@ -170,6 +168,7 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M new_vma.prot = prot; new_vma.name = name; new_vma.type = type; + new_vma.is_exec = is_exec; if (type == VMAType::Direct) { new_vma.phys_base = phys_addr; @@ -217,10 +216,16 @@ void MemoryManager::UnmapMemory(VAddr virtual_addr, size_t size) { std::scoped_lock lk{mutex}; const auto it = FindVMA(virtual_addr); - ASSERT_MSG(it->second.Contains(virtual_addr, size), + const auto& vma_base = it->second; + ASSERT_MSG(vma_base.Contains(virtual_addr, size), "Existing mapping does not contain requested unmap range"); - const auto type = it->second.type; + const auto vma_base_addr = vma_base.base; + const auto vma_base_size = vma_base.size; + const auto phys_base = vma_base.phys_base; + const bool is_exec = vma_base.is_exec; + const auto start_in_vma = virtual_addr - 
vma_base_addr; + const auto type = vma_base.type; const bool has_backing = type == VMAType::Direct || type == VMAType::File; if (type == VMAType::Direct) { rasterizer->UnmapMemory(virtual_addr, size); @@ -240,7 +245,8 @@ void MemoryManager::UnmapMemory(VAddr virtual_addr, size_t size) { MergeAdjacent(vma_map, new_it); // Unmap the memory region. - impl.Unmap(virtual_addr, size, has_backing); + impl.Unmap(vma_base_addr, vma_base_size, start_in_vma, start_in_vma + size, phys_base, is_exec, + has_backing); TRACK_FREE(virtual_addr, "VMEM"); } @@ -364,10 +370,10 @@ int MemoryManager::VirtualQuery(VAddr addr, int flags, std::scoped_lock lk{mutex}; auto it = FindVMA(addr); - if (!it->second.IsMapped() && flags == 1) { + if (it->second.type == VMAType::Free && flags == 1) { it++; } - if (!it->second.IsMapped()) { + if (it->second.type == VMAType::Free) { LOG_WARNING(Kernel_Vmm, "VirtualQuery on free memory region"); return ORBIS_KERNEL_ERROR_EACCES; } @@ -494,13 +500,12 @@ MemoryManager::VMAHandle MemoryManager::CarveVMA(VAddr virtual_addr, size_t size return vma_handle; } -DirectMemoryArea& MemoryManager::CarveDmemArea(PAddr addr, size_t size) { +MemoryManager::DMemHandle MemoryManager::CarveDmemArea(PAddr addr, size_t size) { auto dmem_handle = FindDmemArea(addr); ASSERT_MSG(dmem_handle != dmem_map.end(), "Physical address not in dmem_map"); const DirectMemoryArea& area = dmem_handle->second; - ASSERT_MSG(area.is_free && area.base <= addr, - "Adding an allocation to already allocated region"); + ASSERT_MSG(area.base <= addr, "Adding an allocation to already allocated region"); const PAddr start_in_area = addr - area.base; const PAddr end_in_vma = start_in_area + size; @@ -515,7 +520,7 @@ DirectMemoryArea& MemoryManager::CarveDmemArea(PAddr addr, size_t size) { dmem_handle = Split(dmem_handle, start_in_area); } - return dmem_handle->second; + return dmem_handle; } MemoryManager::VMAHandle MemoryManager::Split(VMAHandle vma_handle, size_t offset_in_vma) { diff --git 
a/src/core/memory.h b/src/core/memory.h index 17902897..9c046c25 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -84,6 +84,7 @@ struct VirtualMemoryArea { bool disallow_merge = false; std::string name = ""; uintptr_t fd = 0; + bool is_exec = false; bool Contains(VAddr addr, size_t size) const { return addr >= base && (addr + size) <= (base + this->size); @@ -210,7 +211,7 @@ private: VMAHandle CarveVMA(VAddr virtual_addr, size_t size); - DirectMemoryArea& CarveDmemArea(PAddr addr, size_t size); + DMemHandle CarveDmemArea(PAddr addr, size_t size); VMAHandle Split(VMAHandle vma_handle, size_t offset_in_vma); diff --git a/src/input/controller.cpp b/src/input/controller.cpp index 247e08ce..4a3db163 100644 --- a/src/input/controller.cpp +++ b/src/input/controller.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include #include "core/libraries/kernel/time_management.h" #include "core/libraries/pad/pad.h" #include "input/controller.h" @@ -117,4 +118,29 @@ void GameController::Axis(int id, Input::Axis axis, int value) { AddState(state); } +void GameController::SetLightBarRGB(u8 r, u8 g, u8 b) { + if (m_sdl_gamepad != nullptr) { + SDL_SetGamepadLED(m_sdl_gamepad, r, g, b); + } +} + +bool GameController::SetVibration(u8 smallMotor, u8 largeMotor) { + if (m_sdl_gamepad != nullptr) { + return SDL_RumbleGamepad(m_sdl_gamepad, (smallMotor / 255.0f) * 0xFFFF, + (largeMotor / 255.0f) * 0xFFFF, -1) == 0; + } + return true; +} + +void GameController::TryOpenSDLController() { + if (m_sdl_gamepad == nullptr || !SDL_GamepadConnected(m_sdl_gamepad)) { + int gamepad_count; + SDL_JoystickID* gamepads = SDL_GetGamepads(&gamepad_count); + m_sdl_gamepad = gamepad_count > 0 ? 
SDL_OpenGamepad(gamepads[0]) : nullptr; + SDL_free(gamepads); + } + + SetLightBarRGB(0, 0, 255); +} + } // namespace Input diff --git a/src/input/controller.h b/src/input/controller.h index a16f7dd0..ef099156 100644 --- a/src/input/controller.h +++ b/src/input/controller.h @@ -6,6 +6,8 @@ #include #include "common/types.h" +struct SDL_Gamepad; + namespace Input { enum class Axis { @@ -43,6 +45,9 @@ public: void CheckButton(int id, u32 button, bool isPressed); void AddState(const State& state); void Axis(int id, Input::Axis axis, int value); + void SetLightBarRGB(u8 r, u8 g, u8 b); + bool SetVibration(u8 smallMotor, u8 largeMotor); + void TryOpenSDLController(); private: struct StateInternal { @@ -57,6 +62,8 @@ private: u32 m_first_state = 0; std::array m_states; std::array m_private; + + SDL_Gamepad* m_sdl_gamepad = nullptr; }; } // namespace Input diff --git a/src/qt_gui/main_window.cpp b/src/qt_gui/main_window.cpp index 646433ee..aec2e7a5 100644 --- a/src/qt_gui/main_window.cpp +++ b/src/qt_gui/main_window.cpp @@ -15,6 +15,8 @@ #include "core/loader.h" #include "game_install_dialog.h" #include "main_window.h" +#include "settings_dialog.h" +#include "video_core/renderer_vulkan/vk_instance.h" MainWindow::MainWindow(QWidget* parent) : QMainWindow(parent), ui(new Ui::MainWindow) { ui->setupUi(this); @@ -38,6 +40,7 @@ bool MainWindow::Init() { CreateConnects(); SetLastUsedTheme(); SetLastIconSizeBullet(); + GetPhysicalDevices(); // show ui setMinimumSize(350, minimumSizeHint().height()); setWindowTitle(QString::fromStdString("shadPS4 v" + std::string(Common::VERSION))); @@ -157,6 +160,19 @@ void MainWindow::LoadGameLists() { } } +void MainWindow::GetPhysicalDevices() { + Vulkan::Instance instance(false, false); + auto physical_devices = instance.GetPhysicalDevices(); + for (const vk::PhysicalDevice physical_device : physical_devices) { + auto prop = physical_device.getProperties(); + QString name = QString::fromUtf8(prop.deviceName, -1); + if (prop.apiVersion < 
Vulkan::TargetVulkanApiVersion) { + name += " * Unsupported Vulkan Version"; + } + m_physical_devices.push_back(name); + } +} + void MainWindow::CreateConnects() { connect(this, &MainWindow::WindowResized, this, &MainWindow::HandleResize); connect(ui->mw_searchbar, &QLineEdit::textChanged, this, &MainWindow::SearchGameTable); @@ -185,6 +201,11 @@ void MainWindow::CreateConnects() { connect(m_game_list_frame.get(), &QTableWidget::cellDoubleClicked, this, &MainWindow::StartGame); + connect(ui->settingsButton, &QPushButton::clicked, this, [this]() { + auto settingsDialog = new SettingsDialog(m_physical_devices, this); + settingsDialog->exec(); + }); + connect(ui->setIconSizeTinyAct, &QAction::triggered, this, [this]() { if (isTableList) { m_game_list_frame->icon_size = diff --git a/src/qt_gui/main_window.h b/src/qt_gui/main_window.h index 39a5d049..35fd0bf6 100644 --- a/src/qt_gui/main_window.h +++ b/src/qt_gui/main_window.h @@ -54,6 +54,7 @@ private: void CreateActions(); void CreateRecentGameActions(); void CreateDockWindows(); + void GetPhysicalDevices(); void LoadGameLists(); void CreateConnects(); void SetLastUsedTheme(); @@ -79,6 +80,8 @@ private: QScopedPointer m_elf_viewer; // Status Bar. 
QScopedPointer statusBar; + // Available GPU devices + std::vector m_physical_devices; PSF psf; diff --git a/src/qt_gui/settings_dialog.cpp b/src/qt_gui/settings_dialog.cpp new file mode 100644 index 00000000..bde0eada --- /dev/null +++ b/src/qt_gui/settings_dialog.cpp @@ -0,0 +1,135 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "settings_dialog.h" +#include "ui_settings_dialog.h" + +SettingsDialog::SettingsDialog(std::span physical_devices, QWidget* parent) + : QDialog(parent), ui(new Ui::SettingsDialog) { + ui->setupUi(this); + ui->tabWidgetSettings->setUsesScrollButtons(false); + const auto config_dir = Common::FS::GetUserPath(Common::FS::PathType::UserDir); + + ui->buttonBox->button(QDialogButtonBox::StandardButton::Close)->setFocus(); + + // Add list of available GPUs + ui->graphicsAdapterBox->addItem("Auto Select"); // -1, auto selection + for (const auto& device : physical_devices) { + ui->graphicsAdapterBox->addItem(device); + } + + LoadValuesFromConfig(); + + connect(ui->buttonBox, &QDialogButtonBox::rejected, this, &QWidget::close); + + connect(ui->buttonBox, &QDialogButtonBox::clicked, this, + [this, config_dir](QAbstractButton* button) { + if (button == ui->buttonBox->button(QDialogButtonBox::Save)) { + Config::save(config_dir / "config.toml"); + QWidget::close(); + } else if (button == ui->buttonBox->button(QDialogButtonBox::Apply)) { + Config::save(config_dir / "config.toml"); + } else if (button == ui->buttonBox->button(QDialogButtonBox::RestoreDefaults)) { + Config::setDefaultValues(); + LoadValuesFromConfig(); + } + }); + + connect(ui->tabWidgetSettings, &QTabWidget::currentChanged, this, [this]() { + ui->buttonBox->button(QDialogButtonBox::StandardButton::Close)->setFocus(); + }); + + // EMULATOR TAB + { + connect(ui->consoleLanguageComboBox, &QComboBox::currentIndexChanged, this, + [](int index) { Config::setLanguage(index); }); + } + + // GPU TAB + { + // First 
options is auto selection -1, so gpuId on the GUI will always have to subtract 1 + // when setting and add 1 when getting to select the correct gpu in Qt + connect(ui->graphicsAdapterBox, &QComboBox::currentIndexChanged, this, + [](int index) { Config::setGpuId(index - 1); }); + + connect(ui->widthSpinBox, &QSpinBox::valueChanged, this, + [](int val) { Config::setScreenWidth(val); }); + + connect(ui->heightSpinBox, &QSpinBox::valueChanged, this, + [](int val) { Config::setScreenHeight(val); }); + + connect(ui->vblankSpinBox, &QSpinBox::valueChanged, this, + [](int val) { Config::setVblankDiv(val); }); + + connect(ui->dumpShadersCheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setDumpShaders(val); }); + + connect(ui->nullGpuCheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setNullGpu(val); }); + + connect(ui->dumpPM4CheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setDumpPM4(val); }); + } + + // GENERAL TAB + { + connect(ui->fullscreenCheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setFullscreenMode(val); }); + + connect(ui->showSplashCheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setShowSplash(val); }); + + connect(ui->ps4proCheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setNeoMode(val); }); + + connect(ui->logTypeComboBox, &QComboBox::currentTextChanged, this, + [](const QString& text) { Config::setLogType(text.toStdString()); }); + + connect(ui->logFilterLineEdit, &QLineEdit::textChanged, this, + [](const QString& text) { Config::setLogFilter(text.toStdString()); }); + } + + // DEBUG TAB + { + connect(ui->debugDump, &QCheckBox::stateChanged, this, + [](int val) { Config::setDebugDump(val); }); + + connect(ui->vkValidationCheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setVkValidation(val); }); + + connect(ui->vkSyncValidationCheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setVkSyncValidation(val); }); + + 
connect(ui->rdocCheckBox, &QCheckBox::stateChanged, this, + [](int val) { Config::setRdocEnabled(val); }); + } +} + +void SettingsDialog::LoadValuesFromConfig() { + ui->consoleLanguageComboBox->setCurrentIndex(Config::GetLanguage()); + + ui->graphicsAdapterBox->setCurrentIndex(Config::getGpuId() + 1); + ui->widthSpinBox->setValue(Config::getScreenWidth()); + ui->heightSpinBox->setValue(Config::getScreenHeight()); + ui->vblankSpinBox->setValue(Config::vblankDiv()); + ui->dumpShadersCheckBox->setChecked(Config::dumpShaders()); + ui->nullGpuCheckBox->setChecked(Config::nullGpu()); + ui->dumpPM4CheckBox->setChecked(Config::dumpPM4()); + + ui->fullscreenCheckBox->setChecked(Config::isFullscreenMode()); + ui->showSplashCheckBox->setChecked(Config::showSplash()); + ui->ps4proCheckBox->setChecked(Config::isNeoMode()); + ui->logTypeComboBox->setCurrentText(QString::fromStdString(Config::getLogType())); + ui->logFilterLineEdit->setText(QString::fromStdString(Config::getLogFilter())); + + ui->debugDump->setChecked(Config::debugDump()); + ui->vkValidationCheckBox->setChecked(Config::vkValidationEnabled()); + ui->vkSyncValidationCheckBox->setChecked(Config::vkValidationSyncEnabled()); + ui->rdocCheckBox->setChecked(Config::isRdocEnabled()); +} + +int SettingsDialog::exec() { + return QDialog::exec(); +} + +SettingsDialog::~SettingsDialog() {} \ No newline at end of file diff --git a/src/qt_gui/settings_dialog.h b/src/qt_gui/settings_dialog.h new file mode 100644 index 00000000..7d870109 --- /dev/null +++ b/src/qt_gui/settings_dialog.h @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include + +#include "common/config.h" +#include "common/path_util.h" + +namespace Ui { +class SettingsDialog; +} + +class SettingsDialog : public QDialog { + Q_OBJECT +public: + explicit SettingsDialog(std::span physical_devices, QWidget* parent = nullptr); + ~SettingsDialog(); + 
+ int exec() override; + +private: + void LoadValuesFromConfig(); + + std::unique_ptr ui; +}; diff --git a/src/qt_gui/settings_dialog.ui b/src/qt_gui/settings_dialog.ui new file mode 100644 index 00000000..4893bd61 --- /dev/null +++ b/src/qt_gui/settings_dialog.ui @@ -0,0 +1,908 @@ + + + + + SettingsDialog + + + Qt::WindowModality::WindowModal + + + + 0 + 0 + 1024 + 768 + + + + + 0 + 0 + + + + Settings + + + + :/images/shadps4.ico:/images/shadps4.ico + + + + + + QFrame::Shape::NoFrame + + + true + + + + true + + + + 0 + 0 + 1006 + 720 + + + + + 0 + 0 + + + + 0 + + + + Emulator + + + + + + + + + + Console Language + + + + + + + Japanese + + + + + English (United States) + + + + + French (France) + + + + + Spanish (Spain) + + + + + German + + + + + Italian + + + + + Dutch + + + + + Portuguese (Portugal) + + + + + Russian + + + + + Korean + + + + + Traditional Chinese + + + + + Simplified Chinese + + + + + Finnish + + + + + Swedish + + + + + Danish + + + + + Norwegian + + + + + Polish + + + + + Portuguese (Brazil) + + + + + English (United Kingdom) + + + + + Turkish + + + + + Spanish (Latin America) + + + + + Arabic + + + + + French (Canada) + + + + + Czech + + + + + Hungarian + + + + + Greek + + + + + Romanian + + + + + Thai + + + + + Vietnamese + + + + + Indonesian + + + + + + + + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + + + + + 12 + + + 12 + + + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + GPU + + + + + + + + + + Graphics Device + + + + + + + + + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + + + + 6 + + + 0 + + + + + + + Width + + + + + + true + + + 
QAbstractSpinBox::CorrectionMode::CorrectToNearestValue + + + false + + + 0 + + + 9999 + + + 1280 + + + + + + + + + + Height + + + + + + true + + + true + + + QAbstractSpinBox::CorrectionMode::CorrectToNearestValue + + + false + + + 0 + + + 9999 + + + 720 + + + + + + + + + + + + + + 6 + + + 0 + + + + + + + Vblank Divider + + + + + + true + + + true + + + QAbstractSpinBox::CorrectionMode::CorrectToNearestValue + + + false + + + 1 + + + 9999 + + + 1 + + + + + + + + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + + 12 + + + 12 + + + + + Additional Settings + + + Qt::AlignmentFlag::AlignLeading|Qt::AlignmentFlag::AlignLeft|Qt::AlignmentFlag::AlignVCenter + + + + + + Enable Shaders Dumping + + + + + + + Enable NULL GPU + + + + + + + Enable PM4 Dumping + + + + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + General + + + + + + + + + + Emulator Settings + + + + + + Enable Fullscreen + + + + + + + Show Splash + + + + + + + Is PS4 Pro + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + + + + + + + Logger Settings + + + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + Log Type + + + + + + + async + + + + + sync + + + + + + + + + + + + + + 6 + + + 0 + + + + + + + Log Filter + + + + + + + + + + + + + + + + + + + + + + + Additional Settings + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + Debug + + + + + + + + true + + + General + + + Qt::AlignmentFlag::AlignLeading|Qt::AlignmentFlag::AlignLeft|Qt::AlignmentFlag::AlignTop + + + + + + Enable Debug Dumping + + + + + + + Qt::Orientation::Vertical + + + 
QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + Enable Vulkan Validation Layers + + + + + + + Enable Vulkan Synchronization Validation + + + + + + + Enable RenderDoc Debugging + + + + + + + + + + + + Qt::Orientation::Vertical + + + QSizePolicy::Policy::MinimumExpanding + + + + 0 + 0 + + + + + + + + + + + + + QDialogButtonBox::StandardButton::Apply|QDialogButtonBox::StandardButton::Close|QDialogButtonBox::StandardButton::RestoreDefaults|QDialogButtonBox::StandardButton::Save + + + + + + + + diff --git a/src/sdl_window.cpp b/src/sdl_window.cpp index 5e1a4c95..9fd59669 100644 --- a/src/sdl_window.cpp +++ b/src/sdl_window.cpp @@ -43,6 +43,9 @@ WindowSDL::WindowSDL(s32 width_, s32 height_, Input::GameController* controller_ SDL_SetWindowFullscreen(window, Config::isFullscreenMode()); + SDL_InitSubSystem(SDL_INIT_GAMEPAD); + controller->TryOpenSDLController(); + #if defined(SDL_PLATFORM_WIN32) window_info.type = WindowSystemType::Windows; window_info.render_surface = SDL_GetPointerProperty(SDL_GetWindowProperties(window), @@ -92,6 +95,11 @@ void WindowSDL::waitEvent() { case SDL_EVENT_KEY_UP: onKeyPress(&event); break; + case SDL_EVENT_GAMEPAD_BUTTON_DOWN: + case SDL_EVENT_GAMEPAD_BUTTON_UP: + case SDL_EVENT_GAMEPAD_AXIS_MOTION: + onGamepadEvent(&event); + break; case SDL_EVENT_QUIT: is_open = false; break; @@ -276,4 +284,71 @@ void WindowSDL::onKeyPress(const SDL_Event* event) { } } +void WindowSDL::onGamepadEvent(const SDL_Event* event) { + using Libraries::Pad::OrbisPadButtonDataOffset; + + u32 button = 0; + Input::Axis axis = Input::Axis::AxisMax; + switch (event->type) { + case SDL_EVENT_GAMEPAD_BUTTON_DOWN: + case SDL_EVENT_GAMEPAD_BUTTON_UP: + button = sdlGamepadToOrbisButton(event->gbutton.button); + if (button != 0) { + controller->CheckButton(0, button, event->type == SDL_EVENT_GAMEPAD_BUTTON_DOWN); + } + break; + case SDL_EVENT_GAMEPAD_AXIS_MOTION: + axis = event->gaxis.axis == SDL_GAMEPAD_AXIS_LEFTX ? 
Input::Axis::LeftX + : event->gaxis.axis == SDL_GAMEPAD_AXIS_LEFTY ? Input::Axis::LeftY + : event->gaxis.axis == SDL_GAMEPAD_AXIS_RIGHTX ? Input::Axis::RightX + : event->gaxis.axis == SDL_GAMEPAD_AXIS_RIGHTY ? Input::Axis::RightY + : event->gaxis.axis == SDL_GAMEPAD_AXIS_LEFT_TRIGGER ? Input::Axis::TriggerLeft + : event->gaxis.axis == SDL_GAMEPAD_AXIS_RIGHT_TRIGGER ? Input::Axis::TriggerRight + : Input::Axis::AxisMax; + if (axis != Input::Axis::AxisMax) { + controller->Axis(0, axis, Input::GetAxis(-0x8000, 0x8000, event->gaxis.value)); + } + break; + } +} + +int WindowSDL::sdlGamepadToOrbisButton(u8 button) { + using Libraries::Pad::OrbisPadButtonDataOffset; + + switch (button) { + case SDL_GAMEPAD_BUTTON_DPAD_DOWN: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_DOWN; + case SDL_GAMEPAD_BUTTON_DPAD_UP: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_UP; + case SDL_GAMEPAD_BUTTON_DPAD_LEFT: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_LEFT; + case SDL_GAMEPAD_BUTTON_DPAD_RIGHT: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_RIGHT; + case SDL_GAMEPAD_BUTTON_SOUTH: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_CROSS; + case SDL_GAMEPAD_BUTTON_NORTH: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_TRIANGLE; + case SDL_GAMEPAD_BUTTON_WEST: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_SQUARE; + case SDL_GAMEPAD_BUTTON_EAST: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_CIRCLE; + case SDL_GAMEPAD_BUTTON_START: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_OPTIONS; + case SDL_GAMEPAD_BUTTON_TOUCHPAD: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_TOUCH_PAD; + case SDL_GAMEPAD_BUTTON_BACK: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_TOUCH_PAD; + case SDL_GAMEPAD_BUTTON_LEFT_SHOULDER: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_L1; + case SDL_GAMEPAD_BUTTON_RIGHT_SHOULDER: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_R1; + case SDL_GAMEPAD_BUTTON_LEFT_STICK: + return 
OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_L3; + case SDL_GAMEPAD_BUTTON_RIGHT_STICK: + return OrbisPadButtonDataOffset::ORBIS_PAD_BUTTON_R3; + default: + return 0; + } +} + } // namespace Frontend diff --git a/src/sdl_window.h b/src/sdl_window.h index 02d01128..cf6c3711 100644 --- a/src/sdl_window.h +++ b/src/sdl_window.h @@ -7,6 +7,7 @@ #include "common/types.h" struct SDL_Window; +struct SDL_Gamepad; union SDL_Event; namespace Input { @@ -66,6 +67,9 @@ public: private: void onResize(); void onKeyPress(const SDL_Event* event); + void onGamepadEvent(const SDL_Event* event); + + int sdlGamepadToOrbisButton(u8 button); private: s32 width; diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index e85272e9..bbf259fe 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -128,11 +128,7 @@ Id EmitReadConst(EmitContext& ctx) { Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } - const Id offset_dwords{ctx.OpShiftRightLogical(ctx.U32[1], buffer.offset, ctx.ConstU32(2U))}; - index = ctx.OpIAdd(ctx.U32[1], index, offset_dwords); + index = ctx.OpIAdd(ctx.U32[1], index, buffer.offset_dwords); const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; return ctx.OpLoad(buffer.data_types->Get(1), ptr); } @@ -218,6 +214,10 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { } void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 element) { + if (attr == IR::Attribute::Position1) { + LOG_WARNING(Render_Vulkan, "Ignoring pos1 export"); + return; + } const Id pointer{OutputAttrPointer(ctx, attr, element)}; ctx.OpStore(pointer, ctx.OpBitcast(ctx.F32[1], 
value)); } @@ -229,9 +229,6 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { template static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { @@ -386,19 +383,12 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com if (is_signed) { value = ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset, ctx.ConstU32(bit_width)); - value = ctx.OpConvertSToF(ctx.F32[1], value); } else { value = ctx.OpBitFieldUExtract(ctx.U32[1], value, comp_offset, ctx.ConstU32(bit_width)); - value = ctx.OpConvertUToF(ctx.F32[1], value); - } - } else { - if (is_signed) { - value = ctx.OpConvertSToF(ctx.F32[1], value); - } else { - value = ctx.OpConvertUToF(ctx.F32[1], value); } } + value = ctx.OpBitcast(ctx.F32[1], value); return ConvertValue(ctx, value, num_format, bit_width); } break; @@ -411,9 +401,6 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com template static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); if constexpr (N == 1) { return GetBufferFormatValue(ctx, handle, address, 0); @@ -445,9 +432,6 @@ Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id ad template static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = 
ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { @@ -483,4 +467,96 @@ void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address EmitStoreBufferF32xN<1>(ctx, handle, address, value); } +static Id ConvertF32ToFormat(EmitContext& ctx, Id value, AmdGpu::NumberFormat format, + u32 bit_width) { + switch (format) { + case AmdGpu::NumberFormat::Unorm: + return ctx.OpConvertFToU( + ctx.U32[1], ctx.OpFMul(ctx.F32[1], value, ctx.ConstF32(float(UXBitsMax(bit_width))))); + case AmdGpu::NumberFormat::Uint: + return ctx.OpBitcast(ctx.U32[1], value); + case AmdGpu::NumberFormat::Float: + return value; + default: + UNREACHABLE_MSG("Unsupported number fromat for conversion: {}", + magic_enum::enum_name(format)); + } +} + +template +static void EmitStoreBufferFormatF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { + auto& buffer = ctx.buffers[handle]; + const auto format = buffer.buffer.GetDataFmt(); + const auto num_format = buffer.buffer.GetNumberFmt(); + + switch (format) { + case AmdGpu::DataFormat::FormatInvalid: + return; + case AmdGpu::DataFormat::Format8_8_8_8: + case AmdGpu::DataFormat::Format16: + case AmdGpu::DataFormat::Format32: + case AmdGpu::DataFormat::Format32_32_32_32: { + ASSERT(N == AmdGpu::NumComponents(format)); + + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); + const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index); + + Id packed_value{}; + for (u32 i = 0; i < N; i++) { + const u32 bit_width = AmdGpu::ComponentBits(format, i); + const u32 bit_offset = AmdGpu::ComponentOffset(format, i) % 32; + + const Id comp{ConvertF32ToFormat( + ctx, N == 1 ? 
value : ctx.OpCompositeExtract(ctx.F32[1], value, i), num_format, + bit_width)}; + + if (bit_width == 32) { + if constexpr (N == 1) { + ctx.OpStore(ptr, comp); + } else { + const Id index_i = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); + const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, + ctx.u32_zero_value, index_i); + ctx.OpStore(ptr, comp); + } + } else { + if (i == 0) { + packed_value = comp; + } else { + packed_value = + ctx.OpBitFieldInsert(ctx.U32[1], packed_value, comp, + ctx.ConstU32(bit_offset), ctx.ConstU32(bit_width)); + } + + if (i == N - 1) { + ctx.OpStore(ptr, packed_value); + } + } + } + } break; + default: + UNREACHABLE_MSG("Invalid format for conversion: {}", magic_enum::enum_name(format)); + } +} + +void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + EmitStoreBufferFormatF32xN<1>(ctx, handle, address, value); +} + +void EmitStoreBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, + Id value) { + EmitStoreBufferFormatF32xN<2>(ctx, handle, address, value); +} + +void EmitStoreBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, + Id value) { + EmitStoreBufferFormatF32xN<3>(ctx, handle, address, value); +} + +void EmitStoreBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, + Id value) { + EmitStoreBufferFormatF32xN<4>(ctx, handle, address, value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 030d3948..5526e541 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -17,86 +17,133 @@ struct ImageOperands { operands.push_back(value); } + void AddOffset(EmitContext& ctx, const IR::Value& offset, + bool can_use_runtime_offsets = false) { + if (offset.IsEmpty()) { + return; + } + if (offset.IsImmediate()) { 
+ const s32 operand = offset.U32(); + Add(spv::ImageOperandsMask::ConstOffset, ctx.ConstS32(operand)); + return; + } + IR::Inst* const inst{offset.InstRecursive()}; + if (inst->AreAllArgsImmediates()) { + switch (inst->GetOpcode()) { + case IR::Opcode::CompositeConstructU32x2: + Add(spv::ImageOperandsMask::ConstOffset, + ctx.ConstS32(static_cast(inst->Arg(0).U32()), + static_cast(inst->Arg(1).U32()))); + return; + case IR::Opcode::CompositeConstructU32x3: + Add(spv::ImageOperandsMask::ConstOffset, + ctx.ConstS32(static_cast(inst->Arg(0).U32()), + static_cast(inst->Arg(1).U32()), + static_cast(inst->Arg(2).U32()))); + return; + default: + break; + } + } + if (can_use_runtime_offsets) { + Add(spv::ImageOperandsMask::Offset, ctx.Def(offset)); + } else { + LOG_WARNING(Render_Vulkan, + "Runtime offset provided to unsupported image sample instruction"); + } + } + spv::ImageOperandsMask mask{}; boost::container::static_vector operands; }; -Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, - Id offset) { +Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias, + const IR::Value& offset) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); ImageOperands operands; - operands.Add(spv::ImageOperandsMask::Offset, offset); + operands.Add(spv::ImageOperandsMask::Bias, bias); + operands.AddOffset(ctx, offset); return ctx.OpImageSampleImplicitLod(ctx.F32[4], sampled_image, coords, operands.mask, operands.operands); } -Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, - Id offset) { +Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id lod, + const IR::Value& offset) { const auto& texture = 
ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); - return ctx.OpImageSampleExplicitLod(ctx.F32[4], sampled_image, coords, - spv::ImageOperandsMask::Lod, ctx.ConstF32(0.f)); + ImageOperands operands; + operands.Add(spv::ImageOperandsMask::Lod, lod); + operands.AddOffset(ctx, offset); + return ctx.OpImageSampleExplicitLod(ctx.F32[4], sampled_image, coords, operands.mask, + operands.operands); } Id EmitImageSampleDrefImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, - Id bias_lc, const IR::Value& offset) { + Id bias, const IR::Value& offset) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); - return ctx.OpImageSampleDrefImplicitLod(ctx.F32[1], sampled_image, coords, dref); + ImageOperands operands; + operands.Add(spv::ImageOperandsMask::Bias, bias); + operands.AddOffset(ctx, offset); + return ctx.OpImageSampleDrefImplicitLod(ctx.F32[1], sampled_image, coords, dref, operands.mask, + operands.operands); } Id EmitImageSampleDrefExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, - Id bias_lc, Id offset) { + Id lod, const IR::Value& offset) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); - return ctx.OpImageSampleDrefExplicitLod(ctx.F32[1], sampled_image, coords, dref, - spv::ImageOperandsMask::Lod, ctx.ConstF32(0.f)); + ImageOperands operands; + operands.AddOffset(ctx, offset); + 
operands.Add(spv::ImageOperandsMask::Lod, lod); + return ctx.OpImageSampleDrefExplicitLod(ctx.F32[1], sampled_image, coords, dref, operands.mask, + operands.operands); } -Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id offset2) { +Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, + const IR::Value& offset) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); const u32 comp = inst->Flags().gather_comp.Value(); ImageOperands operands; - operands.Add(spv::ImageOperandsMask::Offset, offset); + operands.AddOffset(ctx, offset); return ctx.OpImageGather(ctx.F32[4], sampled_image, coords, ctx.ConstU32(comp), operands.mask, operands.operands); } -Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, - Id offset2, Id dref) { +Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, + const IR::Value& offset, Id dref) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); ImageOperands operands; - operands.Add(spv::ImageOperandsMask::Offset, offset); + operands.AddOffset(ctx, offset); return ctx.OpImageDrefGather(ctx.F32[4], sampled_image, coords, dref, operands.mask, operands.operands); } -Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id lod, - Id ms) { +Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, const IR::Value& offset, + Id lod, Id ms) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, 
texture.id); const Id result_type = texture.data_types->Get(4); - if (Sirit::ValidId(lod)) { - return ctx.OpBitcast(ctx.F32[4], ctx.OpImageFetch(result_type, image, coords, - spv::ImageOperandsMask::Lod, lod)); - } else { - return ctx.OpBitcast(ctx.F32[4], ctx.OpImageFetch(result_type, image, coords)); - } + ImageOperands operands; + operands.AddOffset(ctx, offset); + operands.Add(spv::ImageOperandsMask::Lod, lod); + return ctx.OpBitcast( + ctx.F32[4], ctx.OpImageFetch(result_type, image, coords, operands.mask, operands.operands)); } Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, u32 handle, Id lod, bool skip_mips) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 51899eb4..8a0fcd4b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -76,6 +76,10 @@ void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp); Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp); @@ -93,15 +97,9 @@ Id 
EmitUndefU8(EmitContext& ctx); Id EmitUndefU16(EmitContext& ctx); Id EmitUndefU32(EmitContext& ctx); Id EmitUndefU64(EmitContext& ctx); -Id EmitLoadSharedU8(EmitContext& ctx, Id offset); -Id EmitLoadSharedS8(EmitContext& ctx, Id offset); -Id EmitLoadSharedU16(EmitContext& ctx, Id offset); -Id EmitLoadSharedS16(EmitContext& ctx, Id offset); Id EmitLoadSharedU32(EmitContext& ctx, Id offset); Id EmitLoadSharedU64(EmitContext& ctx, Id offset); Id EmitLoadSharedU128(EmitContext& ctx, Id offset); -void EmitWriteSharedU8(EmitContext& ctx, Id offset, Id value); -void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value); @@ -357,19 +355,20 @@ Id EmitConvertF64U64(EmitContext& ctx, Id value); Id EmitConvertU16U32(EmitContext& ctx, Id value); Id EmitConvertU32U16(EmitContext& ctx, Id value); -Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, - Id offset); -Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, - Id offset); +Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias, + const IR::Value& offset); +Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id lod, + const IR::Value& offset); Id EmitImageSampleDrefImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, - Id bias_lc, const IR::Value& offset); + Id bias, const IR::Value& offset); Id EmitImageSampleDrefExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, - Id bias_lc, Id offset); -Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id offset2); -Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, - Id offset2, Id dref); -Id 
EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id lod, - Id ms); + Id lod, const IR::Value& offset); +Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, + const IR::Value& offset); +Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, + const IR::Value& offset, Id dref); +Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, const IR::Value& offset, + Id lod, Id ms); Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, u32 handle, Id lod, bool skip_mips); Id EmitImageQueryLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords); Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp index 1582d9dd..57ea476f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp @@ -5,99 +5,25 @@ #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { -namespace { -Id Pointer(EmitContext& ctx, Id pointer_type, Id array, Id offset, u32 shift) { - const Id shift_id{ctx.ConstU32(shift)}; - const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - return ctx.OpAccessChain(pointer_type, array, ctx.u32_zero_value, index); -} -Id Word(EmitContext& ctx, Id offset) { +Id EmitLoadSharedU32(EmitContext& ctx, Id offset) { const Id shift_id{ctx.ConstU32(2U)}; const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; return ctx.OpLoad(ctx.U32[1], pointer); } -std::pair ExtractArgs(EmitContext& ctx, Id offset, u32 mask, u32 count) { - const Id shift{ctx.OpShiftLeftLogical(ctx.U32[1], offset, ctx.ConstU32(3U))}; - const Id 
bit{ctx.OpBitwiseAnd(ctx.U32[1], shift, ctx.ConstU32(mask))}; - const Id count_id{ctx.ConstU32(count)}; - return {bit, count_id}; -} -} // Anonymous namespace - -Id EmitLoadSharedU8(EmitContext& ctx, Id offset) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id pointer{ - ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; - return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, pointer)); - } else { - const auto [bit, count]{ExtractArgs(ctx, offset, 24, 8)}; - return ctx.OpBitFieldUExtract(ctx.U32[1], Word(ctx, offset), bit, count); - } -} - -Id EmitLoadSharedS8(EmitContext& ctx, Id offset) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id pointer{ - ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; - return ctx.OpSConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, pointer)); - } else { - const auto [bit, count]{ExtractArgs(ctx, offset, 24, 8)}; - return ctx.OpBitFieldSExtract(ctx.U32[1], Word(ctx, offset), bit, count); - } -} - -Id EmitLoadSharedU16(EmitContext& ctx, Id offset) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; - return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); - } else { - const auto [bit, count]{ExtractArgs(ctx, offset, 16, 16)}; - return ctx.OpBitFieldUExtract(ctx.U32[1], Word(ctx, offset), bit, count); - } -} - -Id EmitLoadSharedS16(EmitContext& ctx, Id offset) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; - return ctx.OpSConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); - } else { - const auto [bit, count]{ExtractArgs(ctx, offset, 16, 16)}; - return ctx.OpBitFieldSExtract(ctx.U32[1], Word(ctx, offset), bit, count); - } -} - -Id EmitLoadSharedU32(EmitContext& ctx, Id offset) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id 
pointer{Pointer(ctx, ctx.shared_u32, ctx.shared_memory_u32, offset, 2)}; - return ctx.OpLoad(ctx.U32[1], pointer); - } else { - return Word(ctx, offset); - } -} - Id EmitLoadSharedU64(EmitContext& ctx, Id offset) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id pointer{Pointer(ctx, ctx.shared_u32x2, ctx.shared_memory_u32x2, offset, 3)}; - return ctx.OpLoad(ctx.U32[2], pointer); - } else { - const Id shift_id{ctx.ConstU32(2U)}; - const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))}; - const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)}; - const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)}; - return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer), - ctx.OpLoad(ctx.U32[1], rhs_pointer)); - } + const Id shift_id{ctx.ConstU32(2U)}; + const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; + const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))}; + const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)}; + const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)}; + return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer), + ctx.OpLoad(ctx.U32[1], rhs_pointer)); } Id EmitLoadSharedU128(EmitContext& ctx, Id offset) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id pointer{Pointer(ctx, ctx.shared_u32x4, ctx.shared_memory_u32x4, offset, 4)}; - return ctx.OpLoad(ctx.U32[4], pointer); - } const Id shift_id{ctx.ConstU32(2U)}; const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; std::array values{}; @@ -109,35 +35,14 @@ Id EmitLoadSharedU128(EmitContext& ctx, Id offset) { return ctx.OpCompositeConstruct(ctx.U32[4], values); } -void EmitWriteSharedU8(EmitContext& ctx, Id offset, Id 
value) { - const Id pointer{ - ctx.OpAccessChain(ctx.shared_u8, ctx.shared_memory_u8, ctx.u32_zero_value, offset)}; - ctx.OpStore(pointer, ctx.OpUConvert(ctx.U8, value)); -} - -void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) { - const Id pointer{Pointer(ctx, ctx.shared_u16, ctx.shared_memory_u16, offset, 1)}; - ctx.OpStore(pointer, ctx.OpUConvert(ctx.U16, value)); -} - void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) { - Id pointer{}; - if (ctx.profile.support_explicit_workgroup_layout) { - pointer = Pointer(ctx, ctx.shared_u32, ctx.shared_memory_u32, offset, 2); - } else { - const Id shift{ctx.ConstU32(2U)}; - const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; - pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset); - } + const Id shift{ctx.ConstU32(2U)}; + const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; + const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset); ctx.OpStore(pointer, value); } void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id pointer{Pointer(ctx, ctx.shared_u32x2, ctx.shared_memory_u32x2, offset, 3)}; - ctx.OpStore(pointer, value); - return; - } const Id shift{ctx.ConstU32(2U)}; const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))}; @@ -148,11 +53,6 @@ void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) { } void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value) { - if (ctx.profile.support_explicit_workgroup_layout) { - const Id pointer{Pointer(ctx, ctx.shared_u32x4, ctx.shared_memory_u32x4, offset, 4)}; - ctx.OpStore(pointer, value); - return; - } const Id shift{ctx.ConstU32(2U)}; const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; for (u32 i = 0; i < 4; ++i) { diff --git 
a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp index 891e41df..3ed89692 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp @@ -6,7 +6,9 @@ namespace Shader::Backend::SPIRV { -void EmitPrologue(EmitContext& ctx) {} +void EmitPrologue(EmitContext& ctx) { + ctx.DefineBufferOffsets(); +} void EmitEpilogue(EmitContext& ctx) {} diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 61b55437..4b732ecd 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -165,14 +165,18 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f throw InvalidArgument("Invalid attribute type {}", fmt); } -Id EmitContext::GetBufferOffset(u32 binding) { - const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4); - const u32 comp = (binding & 0xf) >> 2; - const u32 offset = (binding & 0x3) << 3; - const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), - push_data_block, ConstU32(half), ConstU32(comp))}; - const Id value{OpLoad(U32[1], ptr)}; - return OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); +void EmitContext::DefineBufferOffsets() { + for (auto& buffer : buffers) { + const u32 binding = buffer.binding; + const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4); + const u32 comp = (binding & 0xf) >> 2; + const u32 offset = (binding & 0x3) << 3; + const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), + push_data_block, ConstU32(half), ConstU32(comp))}; + const Id value{OpLoad(U32[1], ptr)}; + buffer.offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); + buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, 
ConstU32(2U)); + } } Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { @@ -327,7 +331,9 @@ void EmitContext::DefineBuffers() { for (u32 i = 0; const auto& buffer : info.buffers) { const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32; const Id data_type = (*data_types)[1]; - const Id record_array_type{TypeArray(data_type, ConstU32(buffer.length))}; + const Id record_array_type{buffer.is_storage + ? TypeRuntimeArray(data_type) + : TypeArray(data_type, ConstU32(buffer.length))}; const Id struct_type{TypeStruct(record_array_type)}; if (std::ranges::find(type_ids, record_array_type.value, &Id::value) == type_ids.end()) { Decorate(record_array_type, spv::Decoration::ArrayStride, 4); @@ -354,7 +360,7 @@ void EmitContext::DefineBuffers() { buffers.push_back({ .id = id, - .global_binding = binding++, + .binding = binding++, .data_types = data_types, .pointer_type = pointer_type, .buffer = buffer.GetVsharp(info), @@ -401,6 +407,10 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) { image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { return spv::ImageFormat::Rgba16f; } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format16_16_16_16 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Unorm) { + return spv::ImageFormat::Rgba16; + } if (image.GetDataFmt() == AmdGpu::DataFormat::Format8 && image.GetNumberFmt() == AmdGpu::NumberFormat::Unorm) { return spv::ImageFormat::R8; @@ -507,43 +517,9 @@ void EmitContext::DefineSharedMemory() { if (info.shared_memory_size == 0) { info.shared_memory_size = DefaultSharedMemSize; } - const auto make{[&](Id element_type, u32 element_size) { - const u32 num_elements{Common::DivCeil(info.shared_memory_size, element_size)}; - const Id array_type{TypeArray(element_type, ConstU32(num_elements))}; - Decorate(array_type, spv::Decoration::ArrayStride, element_size); - - const Id struct_type{TypeStruct(array_type)}; - MemberDecorate(struct_type, 0U, spv::Decoration::Offset, 0U); - Decorate(struct_type, 
spv::Decoration::Block); - - const Id pointer{TypePointer(spv::StorageClass::Workgroup, struct_type)}; - const Id element_pointer{TypePointer(spv::StorageClass::Workgroup, element_type)}; - const Id variable{AddGlobalVariable(pointer, spv::StorageClass::Workgroup)}; - Decorate(variable, spv::Decoration::Aliased); - interfaces.push_back(variable); - - return std::make_tuple(variable, element_pointer, pointer); - }}; - if (profile.support_explicit_workgroup_layout) { - AddExtension("SPV_KHR_workgroup_memory_explicit_layout"); - AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR); - if (info.uses_shared_u8) { - AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout8BitAccessKHR); - std::tie(shared_memory_u8, shared_u8, std::ignore) = make(U8, 1); - } - if (info.uses_shared_u16) { - AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR); - std::tie(shared_memory_u16, shared_u16, std::ignore) = make(U16, 2); - } - std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make(U32[1], 4); - std::tie(shared_memory_u32x2, shared_u32x2, std::ignore) = make(U32[2], 8); - std::tie(shared_memory_u32x4, shared_u32x4, std::ignore) = make(U32[4], 16); - return; - } const u32 num_elements{Common::DivCeil(info.shared_memory_size, 4U)}; const Id type{TypeArray(U32[1], ConstU32(num_elements))}; shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type); - shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]); shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup); interfaces.push_back(shared_memory_u32); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 0d090eb3..81237a9a 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -40,7 +40,7 @@ public: ~EmitContext(); Id Def(const IR::Value& value); - Id 
GetBufferOffset(u32 binding); + void DefineBufferOffsets(); [[nodiscard]] Id DefineInput(Id type, u32 location) { const Id input_id{DefineVar(type, spv::StorageClass::Input)}; @@ -203,7 +203,8 @@ public: struct BufferDefinition { Id id; Id offset; - u32 global_binding; + Id offset_dwords; + u32 binding; const VectorIds* data_types; Id pointer_type; AmdGpu::Buffer buffer; diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index b295c1be..d48e4def 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -73,101 +73,190 @@ void Translator::EmitPrologue() { } } -template <> -IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { - IR::U32F32 value{}; +template +T Translator::GetSrc(const InstOperand& operand) { + constexpr bool is_float = std::is_same_v; - const bool is_float = operand.type == ScalarType::Float32 || force_flt; + const auto get_imm = [&](auto value) -> T { + if constexpr (is_float) { + return ir.Imm32(std::bit_cast(value)); + } else { + return ir.Imm32(std::bit_cast(value)); + } + }; + + T value{}; switch (operand.field) { case OperandField::ScalarGPR: - if (is_float) { - value = ir.GetScalarReg(IR::ScalarReg(operand.code)); - } else { - value = ir.GetScalarReg(IR::ScalarReg(operand.code)); - } + value = ir.GetScalarReg(IR::ScalarReg(operand.code)); break; case OperandField::VectorGPR: - if (is_float) { - value = ir.GetVectorReg(IR::VectorReg(operand.code)); - } else { - value = ir.GetVectorReg(IR::VectorReg(operand.code)); - } + value = ir.GetVectorReg(IR::VectorReg(operand.code)); break; case OperandField::ConstZero: - if (is_float) { - value = ir.Imm32(0.f); - } else { - value = ir.Imm32(0U); - } + value = get_imm(0U); break; case OperandField::SignedConstIntPos: - ASSERT(!force_flt); - value = ir.Imm32(operand.code - SignedConstIntPosMin + 1); + value = get_imm(operand.code - 
SignedConstIntPosMin + 1); break; case OperandField::SignedConstIntNeg: - ASSERT(!force_flt); - value = ir.Imm32(-s32(operand.code) + SignedConstIntNegMin - 1); + value = get_imm(-s32(operand.code) + SignedConstIntNegMin - 1); break; case OperandField::LiteralConst: - if (is_float) { - value = ir.Imm32(std::bit_cast(operand.code)); - } else { - value = ir.Imm32(operand.code); - } + value = get_imm(operand.code); break; case OperandField::ConstFloatPos_1_0: - if (is_float) { - value = ir.Imm32(1.f); - } else { - value = ir.Imm32(std::bit_cast(1.f)); - } + value = get_imm(1.f); break; case OperandField::ConstFloatPos_0_5: - value = ir.Imm32(0.5f); + value = get_imm(0.5f); break; case OperandField::ConstFloatPos_2_0: - value = ir.Imm32(2.0f); + value = get_imm(2.0f); break; case OperandField::ConstFloatPos_4_0: - value = ir.Imm32(4.0f); + value = get_imm(4.0f); break; case OperandField::ConstFloatNeg_0_5: - value = ir.Imm32(-0.5f); + value = get_imm(-0.5f); break; case OperandField::ConstFloatNeg_1_0: - if (is_float) { - value = ir.Imm32(-1.0f); - } else { - value = ir.Imm32(std::bit_cast(-1.0f)); - } + value = get_imm(-1.0f); break; case OperandField::ConstFloatNeg_2_0: - value = ir.Imm32(-2.0f); + value = get_imm(-2.0f); break; case OperandField::ConstFloatNeg_4_0: - value = ir.Imm32(-4.0f); + value = get_imm(-4.0f); break; case OperandField::VccLo: - if (force_flt) { + if constexpr (is_float) { value = ir.BitCast(ir.GetVccLo()); } else { value = ir.GetVccLo(); } break; case OperandField::VccHi: - if (force_flt) { + if constexpr (is_float) { value = ir.BitCast(ir.GetVccHi()); } else { value = ir.GetVccHi(); } break; case OperandField::M0: - return m0_value; + if constexpr (is_float) { + UNREACHABLE(); + } else { + return m0_value; + } default: UNREACHABLE(); } - if (is_float) { + if constexpr (is_float) { + if (operand.input_modifier.abs) { + value = ir.FPAbs(value); + } + if (operand.input_modifier.neg) { + value = ir.FPNeg(value); + } + } else { + if 
(operand.input_modifier.abs) { + LOG_WARNING(Render_Vulkan, "Input abs modifier on integer instruction"); + } + if (operand.input_modifier.neg) { + UNREACHABLE(); + } + } + return value; +} + +template IR::U32 Translator::GetSrc(const InstOperand&); +template IR::F32 Translator::GetSrc(const InstOperand&); + +template +T Translator::GetSrc64(const InstOperand& operand) { + constexpr bool is_float = std::is_same_v; + + const auto get_imm = [&](auto value) -> T { + if constexpr (is_float) { + return ir.Imm64(std::bit_cast(value)); + } else { + return ir.Imm64(std::bit_cast(value)); + } + }; + + T value{}; + switch (operand.field) { + case OperandField::ScalarGPR: { + const auto value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); + const auto value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); + if constexpr (is_float) { + UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(value_lo, value_hi)); + } + break; + } + case OperandField::VectorGPR: { + const auto value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); + const auto value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); + if constexpr (is_float) { + UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(value_lo, value_hi)); + } + break; + } + case OperandField::ConstZero: + value = get_imm(0ULL); + break; + case OperandField::SignedConstIntPos: + value = get_imm(s64(operand.code) - SignedConstIntPosMin + 1); + break; + case OperandField::SignedConstIntNeg: + value = get_imm(-s64(operand.code) + SignedConstIntNegMin - 1); + break; + case OperandField::LiteralConst: + value = get_imm(u64(operand.code)); + break; + case OperandField::ConstFloatPos_1_0: + value = get_imm(1.0); + break; + case OperandField::ConstFloatPos_0_5: + value = get_imm(0.5); + break; + case OperandField::ConstFloatPos_2_0: + value = get_imm(2.0); + break; + case OperandField::ConstFloatPos_4_0: + value = get_imm(4.0); + break; + case OperandField::ConstFloatNeg_0_5: + value = 
get_imm(-0.5); + break; + case OperandField::ConstFloatNeg_1_0: + value = get_imm(-1.0); + break; + case OperandField::ConstFloatNeg_2_0: + value = get_imm(-2.0); + break; + case OperandField::ConstFloatNeg_4_0: + value = get_imm(-4.0); + break; + case OperandField::VccLo: + if constexpr (is_float) { + UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(ir.GetVccLo(), ir.GetVccHi())); + } + break; + case OperandField::VccHi: + default: + UNREACHABLE(); + } + + if constexpr (is_float) { if (operand.input_modifier.abs) { value = ir.FPAbs(value); } @@ -178,148 +267,8 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { return value; } -template <> -IR::U32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { - return GetSrc(operand, force_flt); -} - -template <> -IR::F32 Translator::GetSrc(const InstOperand& operand, bool) { - return GetSrc(operand, true); -} - -template <> -IR::U64F64 Translator::GetSrc64(const InstOperand& operand, bool force_flt) { - IR::Value value_hi{}; - IR::Value value_lo{}; - - bool immediate = false; - const bool is_float = operand.type == ScalarType::Float64 || force_flt; - switch (operand.field) { - case OperandField::ScalarGPR: - if (is_float) { - value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); - value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); - } else if (operand.type == ScalarType::Uint64 || operand.type == ScalarType::Sint64) { - value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); - value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); - } else { - UNREACHABLE(); - } - break; - case OperandField::VectorGPR: - if (is_float) { - value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); - value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); - } else if (operand.type == ScalarType::Uint64 || operand.type == ScalarType::Sint64) { - value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); - value_hi = ir.GetVectorReg(IR::VectorReg(operand.code 
+ 1)); - } else { - UNREACHABLE(); - } - break; - case OperandField::ConstZero: - immediate = true; - if (force_flt) { - value_lo = ir.Imm64(0.0); - } else { - value_lo = ir.Imm64(u64(0U)); - } - break; - case OperandField::SignedConstIntPos: - ASSERT(!force_flt); - immediate = true; - value_lo = ir.Imm64(s64(operand.code) - SignedConstIntPosMin + 1); - break; - case OperandField::SignedConstIntNeg: - ASSERT(!force_flt); - immediate = true; - value_lo = ir.Imm64(-s64(operand.code) + SignedConstIntNegMin - 1); - break; - case OperandField::LiteralConst: - immediate = true; - if (force_flt) { - UNREACHABLE(); // There is a literal double? - } else { - value_lo = ir.Imm64(u64(operand.code)); - } - break; - case OperandField::ConstFloatPos_1_0: - immediate = true; - if (force_flt) { - value_lo = ir.Imm64(1.0); - } else { - value_lo = ir.Imm64(std::bit_cast(f64(1.0))); - } - break; - case OperandField::ConstFloatPos_0_5: - immediate = true; - value_lo = ir.Imm64(0.5); - break; - case OperandField::ConstFloatPos_2_0: - immediate = true; - value_lo = ir.Imm64(2.0); - break; - case OperandField::ConstFloatPos_4_0: - immediate = true; - value_lo = ir.Imm64(4.0); - break; - case OperandField::ConstFloatNeg_0_5: - immediate = true; - value_lo = ir.Imm64(-0.5); - break; - case OperandField::ConstFloatNeg_1_0: - immediate = true; - value_lo = ir.Imm64(-1.0); - break; - case OperandField::ConstFloatNeg_2_0: - immediate = true; - value_lo = ir.Imm64(-2.0); - break; - case OperandField::ConstFloatNeg_4_0: - immediate = true; - value_lo = ir.Imm64(-4.0); - break; - case OperandField::VccLo: { - value_lo = ir.GetVccLo(); - value_hi = ir.GetVccHi(); - } break; - case OperandField::VccHi: - UNREACHABLE(); - default: - UNREACHABLE(); - } - - IR::Value value; - - if (immediate) { - value = value_lo; - } else if (is_float) { - throw NotImplementedException("required OpPackDouble2x32 implementation"); - } else { - IR::Value packed = ir.CompositeConstruct(value_lo, value_hi); - value = 
ir.PackUint2x32(packed); - } - - if (is_float) { - if (operand.input_modifier.abs) { - value = ir.FPAbs(IR::F32F64(value)); - } - if (operand.input_modifier.neg) { - value = ir.FPNeg(IR::F32F64(value)); - } - } - return IR::U64F64(value); -} - -template <> -IR::U64 Translator::GetSrc64(const InstOperand& operand, bool force_flt) { - return GetSrc64(operand, force_flt); -} -template <> -IR::F64 Translator::GetSrc64(const InstOperand& operand, bool) { - return GetSrc64(operand, true); -} +template IR::U64 Translator::GetSrc64(const InstOperand&); +template IR::F64 Translator::GetSrc64(const InstOperand&); void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) { IR::U32F32 result = value; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index fe4457d2..9ebcb116 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -186,7 +186,7 @@ public: // Vector Memory void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); - void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); + void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); // Vector interpolation void V_INTERP_P2_F32(const GcnInst& inst); @@ -211,10 +211,10 @@ public: void IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst); private: - template - [[nodiscard]] T GetSrc(const InstOperand& operand, bool flt_zero = false); - template - [[nodiscard]] T GetSrc64(const InstOperand& operand, bool flt_zero = false); + template + [[nodiscard]] T GetSrc(const InstOperand& operand); + template + [[nodiscard]] T GetSrc64(const InstOperand& operand); void SetDst(const InstOperand& operand, const IR::U32F32& value); void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp 
b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 89428c44..1bbc3c16 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "shader_recompiler/frontend/translate/translate.h" -#include "shader_recompiler/profile.h" namespace Shader::Gcn { @@ -312,7 +311,7 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { } void Translator::V_MOV(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[0])); + SetDst(inst.dst[0], GetSrc(inst.src[0])); } void Translator::V_SAD(const GcnInst& inst) { @@ -321,14 +320,14 @@ void Translator::V_SAD(const GcnInst& inst) { } void Translator::V_MAC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], ir.FPFma(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true), - GetSrc(inst.dst[0], true))); + SetDst(inst.dst[0], ir.FPFma(GetSrc(inst.src[0]), GetSrc(inst.src[1]), + GetSrc(inst.dst[0]))); } void Translator::V_CVT_PKRTZ_F16_F32(const GcnInst& inst) { const IR::VectorReg dst_reg{inst.dst[0].code}; const IR::Value vec_f32 = - ir.CompositeConstruct(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true)); + ir.CompositeConstruct(GetSrc(inst.src[0]), GetSrc(inst.src[1])); ir.SetVectorReg(dst_reg, ir.PackHalf2x16(vec_f32)); } @@ -339,13 +338,13 @@ void Translator::V_CVT_F32_F16(const GcnInst& inst) { } void Translator::V_CVT_F16_F32(const GcnInst& inst) { - const IR::F32 src0 = GetSrc(inst.src[0], true); + const IR::F32 src0 = GetSrc(inst.src[0]); const IR::F16 src0fp16 = ir.FPConvert(16, src0); SetDst(inst.dst[0], ir.UConvert(32, ir.BitCast(src0fp16))); } void Translator::V_MUL_F32(const GcnInst& inst) { - SetDst(inst.dst[0], ir.FPMul(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true))); + SetDst(inst.dst[0], ir.FPMul(GetSrc(inst.src[0]), GetSrc(inst.src[1]))); } void Translator::V_CNDMASK_B32(const GcnInst& inst) { @@ -354,24 +353,8 @@ void Translator::V_CNDMASK_B32(const GcnInst& 
inst) { const IR::U1 flag = inst.src[2].field == OperandField::ScalarGPR ? ir.GetThreadBitScalarReg(flag_reg) : ir.GetVcc(); - - // We can treat the instruction as integer most of the time, but when a source is - // a floating point constant we will force the other as float for better readability - // The other operand is also higly likely to be float as well. - const auto is_float_const = [](OperandField field) { - return field >= OperandField::ConstFloatPos_0_5 && field <= OperandField::ConstFloatNeg_4_0; - }; - const bool has_flt_source = - is_float_const(inst.src[0].field) || is_float_const(inst.src[1].field); - IR::U32F32 src0 = GetSrc(inst.src[0], has_flt_source); - IR::U32F32 src1 = GetSrc(inst.src[1], has_flt_source); - if (src0.Type() == IR::Type::F32 && src1.Type() == IR::Type::U32) { - src1 = ir.BitCast(src1); - } - if (src1.Type() == IR::Type::F32 && src0.Type() == IR::Type::U32) { - src0 = ir.BitCast(src0); - } - const IR::Value result = ir.Select(flag, src1, src0); + const IR::Value result = + ir.Select(flag, GetSrc(inst.src[1]), GetSrc(inst.src[0])); ir.SetVectorReg(dst_reg, IR::U32F32{result}); } @@ -448,21 +431,21 @@ void Translator::V_CVT_F32_U32(const GcnInst& inst) { } void Translator::V_MAD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } void Translator::V_FRACT_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.Fract(src0)); } void Translator::V_ADD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 
src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPAdd(src0, src1)); } @@ -476,9 +459,9 @@ void Translator::V_CVT_OFF_F32_I4(const GcnInst& inst) { } void Translator::V_MED3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2); SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx)); } @@ -492,32 +475,32 @@ void Translator::V_MED3_I32(const GcnInst& inst) { } void Translator::V_FLOOR_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.FPFloor(src0)); } void Translator::V_SUB_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPSub(src0, src1)); } void Translator::V_RCP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPRecip(src0)); } void Translator::V_FMA_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 
src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; const IR::U1 result = [&] { switch (op) { case ConditionOp::F: @@ -557,8 +540,8 @@ void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) { } void Translator::V_MAX_F32(const GcnInst& inst, bool is_legacy) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPMax(src0, src1, is_legacy)); } @@ -569,40 +552,40 @@ void Translator::V_MAX_U32(bool is_signed, const GcnInst& inst) { } void Translator::V_RSQ_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPRecipSqrt(src0)); } void Translator::V_SIN_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPSin(src0)); } void Translator::V_LOG_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPLog2(src0)); } void Translator::V_EXP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPExp2(src0)); } void Translator::V_SQRT_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPSqrt(src0)); } void Translator::V_MIN_F32(const GcnInst& inst, bool is_legacy) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPMin(src0, src1, is_legacy)); } void Translator::V_MIN3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], 
true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPMin(src0, ir.FPMin(src1, src2))); } @@ -614,9 +597,9 @@ void Translator::V_MIN3_I32(const GcnInst& inst) { } void Translator::V_MADMK_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 k{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 k{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, k, src1)); } @@ -625,25 +608,25 @@ void Translator::V_CUBEMA_F32(const GcnInst& inst) { } void Translator::V_CUBESC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[0], true)); + SetDst(inst.dst[0], GetSrc(inst.src[0])); } void Translator::V_CUBETC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[1], true)); + SetDst(inst.dst[0], GetSrc(inst.src[1])); } void Translator::V_CUBEID_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[2], true)); + SetDst(inst.dst[0], GetSrc(inst.src[2])); } void Translator::V_CVT_U32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToU(32, src0)); } void Translator::V_SUBREV_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPSub(src1, src0)); } @@ -727,9 +710,17 @@ void Translator::V_SAD_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 src2{GetSrc(inst.src[2])}; - const IR::U32 max{ir.IMax(src0, src1, false)}; - const IR::U32 min{ir.IMin(src0, src1, false)}; - 
SetDst(inst.dst[0], ir.IAdd(ir.ISub(max, min), src2)); + IR::U32 result; + if (src0.IsImmediate() && src0.U32() == 0U) { + result = src1; + } else if (src1.IsImmediate() && src1.U32() == 0U) { + result = src0; + } else { + const IR::U32 max{ir.IMax(src0, src1, false)}; + const IR::U32 min{ir.IMin(src0, src1, false)}; + result = ir.ISub(max, min); + } + SetDst(inst.dst[0], ir.IAdd(result, src2)); } void Translator::V_BFE_U32(bool is_signed, const GcnInst& inst) { @@ -783,7 +774,7 @@ void Translator::V_MAD_U32_U24(const GcnInst& inst) { } void Translator::V_RNDNE_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPRoundEven(src0)); } @@ -794,14 +785,14 @@ void Translator::V_BCNT_U32_B32(const GcnInst& inst) { } void Translator::V_COS_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPCos(src0)); } void Translator::V_MAX3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2))); } @@ -813,7 +804,7 @@ void Translator::V_MAX3_U32(const GcnInst& inst) { } void Translator::V_CVT_I32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToS(32, src0)); } @@ -830,12 +821,12 @@ void Translator::V_MUL_LO_U32(const GcnInst& inst) { } void Translator::V_TRUNC_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPTrunc(src0)); } void Translator::V_CEIL_F32(const GcnInst& inst) { - const IR::F32 
src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPCeil(src0)); } @@ -899,18 +890,18 @@ void Translator::V_BFREV_B32(const GcnInst& inst) { } void Translator::V_LDEXP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPLdexp(src0, src1)); } void Translator::V_CVT_FLR_I32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToI(32, true, ir.FPFloor(src0))); } void Translator::V_CMP_CLASS_F32(const GcnInst& inst) { - const IR::F32F64 src0{GetSrc(inst.src[0])}; + const IR::F32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; IR::U1 value; if (src1.IsImmediate()) { diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 3c6dfbda..63f6c3b4 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -53,6 +53,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { case Opcode::IMAGE_GET_RESINFO: return IMAGE_GET_RESINFO(inst); + // Buffer load operations case Opcode::TBUFFER_LOAD_FORMAT_X: return BUFFER_LOAD_FORMAT(1, true, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XY: @@ -61,6 +62,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { return BUFFER_LOAD_FORMAT(3, true, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XYZW: return BUFFER_LOAD_FORMAT(4, true, true, inst); + case Opcode::BUFFER_LOAD_FORMAT_X: return BUFFER_LOAD_FORMAT(1, false, true, inst); case Opcode::BUFFER_LOAD_FORMAT_XY: @@ -69,6 +71,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { return BUFFER_LOAD_FORMAT(3, false, true, inst); case Opcode::BUFFER_LOAD_FORMAT_XYZW: return BUFFER_LOAD_FORMAT(4, false, true, inst); + case 
Opcode::BUFFER_LOAD_DWORD: return BUFFER_LOAD_FORMAT(1, false, false, inst); case Opcode::BUFFER_LOAD_DWORDX2: @@ -77,16 +80,25 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { return BUFFER_LOAD_FORMAT(3, false, false, inst); case Opcode::BUFFER_LOAD_DWORDX4: return BUFFER_LOAD_FORMAT(4, false, false, inst); + + // Buffer store operations case Opcode::BUFFER_STORE_FORMAT_X: - case Opcode::BUFFER_STORE_DWORD: - return BUFFER_STORE_FORMAT(1, false, inst); - case Opcode::BUFFER_STORE_DWORDX2: - return BUFFER_STORE_FORMAT(2, false, inst); - case Opcode::BUFFER_STORE_DWORDX3: - return BUFFER_STORE_FORMAT(3, false, inst); + return BUFFER_STORE_FORMAT(1, false, true, inst); + case Opcode::BUFFER_STORE_FORMAT_XY: + return BUFFER_STORE_FORMAT(2, false, true, inst); + case Opcode::BUFFER_STORE_FORMAT_XYZ: + return BUFFER_STORE_FORMAT(3, false, true, inst); case Opcode::BUFFER_STORE_FORMAT_XYZW: + return BUFFER_STORE_FORMAT(4, false, true, inst); + + case Opcode::BUFFER_STORE_DWORD: + return BUFFER_STORE_FORMAT(1, false, false, inst); + case Opcode::BUFFER_STORE_DWORDX2: + return BUFFER_STORE_FORMAT(2, false, false, inst); + case Opcode::BUFFER_STORE_DWORDX3: + return BUFFER_STORE_FORMAT(3, false, false, inst); case Opcode::BUFFER_STORE_DWORDX4: - return BUFFER_STORE_FORMAT(4, false, inst); + return BUFFER_STORE_FORMAT(4, false, false, inst); default: LogMissingOpcode(inst); } @@ -135,8 +147,8 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) { // Load first address components as denoted in 8.2.4 VGPR Usage Sea Islands Series Instruction // Set Architecture - const IR::Value offset = - flags.test(MimgModifier::Offset) ? ir.GetVectorReg(addr_reg++) : IR::Value{}; + const IR::U32 offset = + flags.test(MimgModifier::Offset) ? ir.GetVectorReg(addr_reg++) : IR::U32{}; const IR::F32 bias = flags.test(MimgModifier::LodBias) ? 
ir.GetVectorReg(addr_reg++) : IR::F32{}; const IR::F32 dref = @@ -168,18 +180,17 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) { // Issue IR instruction, leaving unknown fields blank to patch later. const IR::Value texel = [&]() -> IR::Value { - const IR::F32 lod = flags.test(MimgModifier::Level0) ? ir.Imm32(0.f) : IR::F32{}; if (!flags.test(MimgModifier::Pcf)) { if (explicit_lod) { - return ir.ImageSampleExplicitLod(handle, body, lod, offset, info); + return ir.ImageSampleExplicitLod(handle, body, offset, info); } else { - return ir.ImageSampleImplicitLod(handle, body, bias, offset, {}, info); + return ir.ImageSampleImplicitLod(handle, body, bias, offset, info); } } if (explicit_lod) { - return ir.ImageSampleDrefExplicitLod(handle, body, dref, lod, offset, info); + return ir.ImageSampleDrefExplicitLod(handle, body, dref, offset, info); } - return ir.ImageSampleDrefImplicitLod(handle, body, dref, bias, offset, {}, info); + return ir.ImageSampleDrefImplicitLod(handle, body, dref, bias, offset, info); }(); for (u32 i = 0; i < 4; i++) { @@ -251,10 +262,10 @@ void Translator::IMAGE_GATHER(const GcnInst& inst) { const IR::Value texel = [&]() -> IR::Value { const IR::F32 lod = flags.test(MimgModifier::Level0) ? 
ir.Imm32(0.f) : IR::F32{}; if (!flags.test(MimgModifier::Pcf)) { - return ir.ImageGather(handle, body, offset, {}, info); + return ir.ImageGather(handle, body, offset, info); } ASSERT(mimg.dmask & 1); // should be always 1st (R) component - return ir.ImageGatherDref(handle, body, offset, {}, dref, info); + return ir.ImageGatherDref(handle, body, offset, dref, info); }(); // For gather4 instructions dmask selects which component to read and must have @@ -360,7 +371,8 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma } } -void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst) { +void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, + const GcnInst& inst) { const auto& mtbuf = inst.control.mtbuf; const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; @@ -411,7 +423,11 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnIns const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); - ir.StoreBuffer(num_dwords, handle, address, value, info); + if (is_format) { + ir.StoreBufferFormat(num_dwords, handle, address, value, info); + } else { + ir.StoreBuffer(num_dwords, handle, address, value, info); + } } void Translator::IMAGE_GET_LOD(const GcnInst& inst) { diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 03404aca..4271ac35 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -16,18 +16,6 @@ namespace { UNREACHABLE_MSG("Invalid type = {}, functionName = {}, line = {}", u32(type), functionName, lineNumber); } - -Value MakeLodClampPair(IREmitter& ir, const F32& bias_lod, const F32& lod_clamp) { - if (!bias_lod.IsEmpty() && !lod_clamp.IsEmpty()) { - return ir.CompositeConstruct(bias_lod, lod_clamp); - } else if 
(!bias_lod.IsEmpty()) { - return bias_lod; - } else if (!lod_clamp.IsEmpty()) { - return lod_clamp; - } else { - return Value{}; - } -} } // Anonymous namespace U1 IREmitter::Imm1(bool value) const { @@ -271,10 +259,6 @@ void IREmitter::SetAttribute(IR::Attribute attribute, const F32& value, u32 comp Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { switch (bit_size) { - case 8: - return Inst(is_signed ? Opcode::LoadSharedS8 : Opcode::LoadSharedU8, offset); - case 16: - return Inst(is_signed ? Opcode::LoadSharedS16 : Opcode::LoadSharedU16, offset); case 32: return Inst(Opcode::LoadSharedU32, offset); case 64: @@ -288,12 +272,6 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) { switch (bit_size) { - case 8: - Inst(Opcode::WriteSharedU8, offset, value); - break; - case 16: - Inst(Opcode::WriteSharedU16, offset, value); - break; case 32: Inst(Opcode::WriteSharedU32, offset, value); break; @@ -369,6 +347,26 @@ void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& ad } } +void IREmitter::StoreBufferFormat(int num_dwords, const Value& handle, const Value& address, + const Value& data, BufferInstInfo info) { + switch (num_dwords) { + case 1: + Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data); + break; + case 2: + Inst(Opcode::StoreBufferFormatF32x2, Flags{info}, handle, address, data); + break; + case 3: + Inst(Opcode::StoreBufferFormatF32x3, Flags{info}, handle, address, data); + break; + case 4: + Inst(Opcode::StoreBufferFormatF32x4, Flags{info}, handle, address, data); + break; + default: + UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords); + } +} + U32 IREmitter::LaneId() { return Inst(Opcode::LaneId); } @@ -1386,41 +1384,37 @@ Value IREmitter::ImageAtomicExchange(const Value& handle, const Value& coords, c return Inst(Opcode::ImageAtomicExchange32, Flags{info}, handle, 
coords, value); } -Value IREmitter::ImageSampleImplicitLod(const Value& handle, const Value& coords, const F32& bias, - const Value& offset, const F32& lod_clamp, +Value IREmitter::ImageSampleImplicitLod(const Value& handle, const Value& body, const F32& bias, + const U32& offset, TextureInstInfo info) { + return Inst(Opcode::ImageSampleImplicitLod, Flags{info}, handle, body, bias, offset); +} + +Value IREmitter::ImageSampleExplicitLod(const Value& handle, const Value& body, const U32& offset, TextureInstInfo info) { - const Value bias_lc{MakeLodClampPair(*this, bias, lod_clamp)}; - return Inst(Opcode::ImageSampleImplicitLod, Flags{info}, handle, coords, bias_lc, offset); + return Inst(Opcode::ImageSampleExplicitLod, Flags{info}, handle, body, IR::F32{}, offset); } -Value IREmitter::ImageSampleExplicitLod(const Value& handle, const Value& coords, const F32& lod, - const Value& offset, TextureInstInfo info) { - return Inst(Opcode::ImageSampleExplicitLod, Flags{info}, handle, coords, lod, offset); -} - -F32 IREmitter::ImageSampleDrefImplicitLod(const Value& handle, const Value& coords, const F32& dref, - const F32& bias, const Value& offset, - const F32& lod_clamp, TextureInstInfo info) { - const Value bias_lc{MakeLodClampPair(*this, bias, lod_clamp)}; - return Inst(Opcode::ImageSampleDrefImplicitLod, Flags{info}, handle, coords, dref, bias_lc, +F32 IREmitter::ImageSampleDrefImplicitLod(const Value& handle, const Value& body, const F32& dref, + const F32& bias, const U32& offset, + TextureInstInfo info) { + return Inst(Opcode::ImageSampleDrefImplicitLod, Flags{info}, handle, body, dref, bias, offset); } -F32 IREmitter::ImageSampleDrefExplicitLod(const Value& handle, const Value& coords, const F32& dref, - const F32& lod, const Value& offset, - TextureInstInfo info) { - return Inst(Opcode::ImageSampleDrefExplicitLod, Flags{info}, handle, coords, dref, lod, +F32 IREmitter::ImageSampleDrefExplicitLod(const Value& handle, const Value& body, const F32& dref, + const U32& 
offset, TextureInstInfo info) { + return Inst(Opcode::ImageSampleDrefExplicitLod, Flags{info}, handle, body, dref, IR::F32{}, offset); } Value IREmitter::ImageGather(const Value& handle, const Value& coords, const Value& offset, - const Value& offset2, TextureInstInfo info) { - return Inst(Opcode::ImageGather, Flags{info}, handle, coords, offset, offset2); + TextureInstInfo info) { + return Inst(Opcode::ImageGather, Flags{info}, handle, coords, offset); } Value IREmitter::ImageGatherDref(const Value& handle, const Value& coords, const Value& offset, - const Value& offset2, const F32& dref, TextureInstInfo info) { - return Inst(Opcode::ImageGatherDref, Flags{info}, handle, coords, offset, offset2, dref); + const F32& dref, TextureInstInfo info) { + return Inst(Opcode::ImageGatherDref, Flags{info}, handle, coords, offset, dref); } Value IREmitter::ImageFetch(const Value& handle, const Value& coords, const Value& offset, diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index a65e4613..59ced93e 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -93,6 +93,8 @@ public: BufferInstInfo info); void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); + void StoreBufferFormat(int num_dwords, const Value& handle, const Value& address, + const Value& data, BufferInstInfo info); [[nodiscard]] U32 LaneId(); [[nodiscard]] U32 WarpId(); @@ -241,31 +243,32 @@ public: [[nodiscard]] Value ImageAtomicExchange(const Value& handle, const Value& coords, const Value& value, TextureInstInfo info); - [[nodiscard]] Value ImageSampleImplicitLod(const Value& handle, const Value& coords, - const F32& bias, const Value& offset, - const F32& lod_clamp, TextureInstInfo info); - [[nodiscard]] Value ImageSampleExplicitLod(const Value& handle, const Value& coords, - const F32& lod, const Value& offset, + [[nodiscard]] Value ImageSampleImplicitLod(const 
Value& handle, const Value& body, + const F32& bias, const U32& offset, TextureInstInfo info); - [[nodiscard]] F32 ImageSampleDrefImplicitLod(const Value& handle, const Value& coords, + + [[nodiscard]] Value ImageSampleExplicitLod(const Value& handle, const Value& body, + const U32& offset, TextureInstInfo info); + + [[nodiscard]] F32 ImageSampleDrefImplicitLod(const Value& handle, const Value& body, const F32& dref, const F32& bias, - const Value& offset, const F32& lod_clamp, + const U32& offset, TextureInstInfo info); + + [[nodiscard]] F32 ImageSampleDrefExplicitLod(const Value& handle, const Value& body, + const F32& dref, const U32& offset, TextureInstInfo info); - [[nodiscard]] F32 ImageSampleDrefExplicitLod(const Value& handle, const Value& coords, - const F32& dref, const F32& lod, - const Value& offset, TextureInstInfo info); - [[nodiscard]] Value ImageQueryDimension(const Value& handle, const IR::U32& lod, - const IR::U1& skip_mips); - [[nodiscard]] Value ImageQueryDimension(const Value& handle, const IR::U32& lod, - const IR::U1& skip_mips, TextureInstInfo info); + + [[nodiscard]] Value ImageQueryDimension(const Value& handle, const U32& lod, + const U1& skip_mips); + [[nodiscard]] Value ImageQueryDimension(const Value& handle, const U32& lod, + const U1& skip_mips, TextureInstInfo info); [[nodiscard]] Value ImageQueryLod(const Value& handle, const Value& coords, TextureInstInfo info); [[nodiscard]] Value ImageGather(const Value& handle, const Value& coords, const Value& offset, - const Value& offset2, TextureInstInfo info); + TextureInstInfo info); [[nodiscard]] Value ImageGatherDref(const Value& handle, const Value& coords, - const Value& offset, const Value& offset2, const F32& dref, - TextureInstInfo info); + const Value& offset, const F32& dref, TextureInstInfo info); [[nodiscard]] Value ImageFetch(const Value& handle, const Value& coords, const Value& offset, const U32& lod, const U32& multisampling, TextureInstInfo info); [[nodiscard]] Value 
ImageGradient(const Value& handle, const Value& coords, diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index aa03e3d6..a8166125 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -55,12 +55,14 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferF32x2: case Opcode::StoreBufferF32x3: case Opcode::StoreBufferF32x4: + case Opcode::StoreBufferFormatF32: + case Opcode::StoreBufferFormatF32x2: + case Opcode::StoreBufferFormatF32x3: + case Opcode::StoreBufferFormatF32x4: case Opcode::StoreBufferU32: case Opcode::WriteSharedU128: case Opcode::WriteSharedU64: case Opcode::WriteSharedU32: - case Opcode::WriteSharedU16: - case Opcode::WriteSharedU8: case Opcode::ImageWrite: case Opcode::ImageAtomicIAdd32: case Opcode::ImageAtomicSMin32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index aa2fd3f8..4c6122a8 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -26,15 +26,9 @@ OPCODE(WorkgroupMemoryBarrier, Void, OPCODE(DeviceMemoryBarrier, Void, ) // Shared memory operations -OPCODE(LoadSharedU8, U32, U32, ) -OPCODE(LoadSharedS8, U32, U32, ) -OPCODE(LoadSharedU16, U32, U32, ) -OPCODE(LoadSharedS16, U32, U32, ) OPCODE(LoadSharedU32, U32, U32, ) OPCODE(LoadSharedU64, U32x2, U32, ) OPCODE(LoadSharedU128, U32x4, U32, ) -OPCODE(WriteSharedU8, Void, U32, U32, ) -OPCODE(WriteSharedU16, Void, U32, U32, ) OPCODE(WriteSharedU32, Void, U32, U32, ) OPCODE(WriteSharedU64, Void, U32, U32x2, ) OPCODE(WriteSharedU128, Void, U32, U32x4, ) @@ -88,6 +82,10 @@ OPCODE(StoreBufferF32, Void, Opaq OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, ) OPCODE(StoreBufferF32x4, Void, Opaque, Opaque, F32x4, ) +OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32, ) +OPCODE(StoreBufferFormatF32x2, Void, Opaque, Opaque, 
F32x2, ) +OPCODE(StoreBufferFormatF32x3, Void, Opaque, Opaque, F32x3, ) +OPCODE(StoreBufferFormatF32x4, Void, Opaque, Opaque, F32x4, ) OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, ) // Vector utility @@ -298,12 +296,12 @@ OPCODE(ConvertU16U32, U16, U32, OPCODE(ConvertU32U16, U32, U16, ) // Image operations -OPCODE(ImageSampleImplicitLod, F32x4, Opaque, Opaque, Opaque, Opaque, ) -OPCODE(ImageSampleExplicitLod, F32x4, Opaque, Opaque, Opaque, Opaque, ) -OPCODE(ImageSampleDrefImplicitLod, F32, Opaque, Opaque, F32, Opaque, Opaque, ) -OPCODE(ImageSampleDrefExplicitLod, F32, Opaque, Opaque, F32, Opaque, Opaque, ) -OPCODE(ImageGather, F32x4, Opaque, Opaque, Opaque, Opaque, ) -OPCODE(ImageGatherDref, F32x4, Opaque, Opaque, Opaque, Opaque, F32, ) +OPCODE(ImageSampleImplicitLod, F32x4, Opaque, Opaque, F32, Opaque, ) +OPCODE(ImageSampleExplicitLod, F32x4, Opaque, Opaque, U32, Opaque, ) +OPCODE(ImageSampleDrefImplicitLod, F32, Opaque, Opaque, Opaque, F32, Opaque, ) +OPCODE(ImageSampleDrefExplicitLod, F32, Opaque, Opaque, Opaque, U32, Opaque, ) +OPCODE(ImageGather, F32x4, Opaque, Opaque, Opaque, ) +OPCODE(ImageGatherDref, F32x4, Opaque, Opaque, Opaque, F32, ) OPCODE(ImageFetch, F32x4, Opaque, Opaque, Opaque, U32, Opaque, ) OPCODE(ImageQueryDimensions, U32x4, Opaque, U32, U1, ) OPCODE(ImageQueryLod, F32x4, Opaque, Opaque, ) diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index bf2ba4d6..7e2b962b 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -14,5 +14,6 @@ void DeadCodeEliminationPass(IR::Program& program); void ConstantPropagationPass(IR::BlockList& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); +void LowerSharedMemToRegisters(IR::Program& program); } // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp 
b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp new file mode 100644 index 00000000..a87cf31b --- /dev/null +++ b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include "shader_recompiler/ir/program.h" + +namespace Shader::Optimization { + +void LowerSharedMemToRegisters(IR::Program& program) { + boost::container::small_vector ds_writes; + Info& info{program.info}; + for (IR::Block* const block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + const auto opcode = inst.GetOpcode(); + if (opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64) { + ds_writes.emplace_back(&inst); + continue; + } + if (opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64) { + // Search for write instruction with same offset + const IR::Inst* prod = inst.Arg(0).InstRecursive(); + const auto it = std::ranges::find_if(ds_writes, [&](const IR::Inst* write) { + const IR::Inst* write_prod = write->Arg(0).InstRecursive(); + return write_prod->Arg(1).U32() == prod->Arg(1).U32() && + write_prod->Arg(0) == prod->Arg(0); + }); + ASSERT(it != ds_writes.end()); + // Replace data read with value written. + inst.ReplaceUsesWith((*it)->Arg(1)); + } + } + } + // We should have eliminated everything. Invalidate data write instructions. 
+ for (const auto inst : ds_writes) { + inst->Invalidate(); + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 97438f80..97fc5b99 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -37,6 +37,10 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: case IR::Opcode::StoreBufferF32x4: + case IR::Opcode::StoreBufferFormatF32: + case IR::Opcode::StoreBufferFormatF32x2: + case IR::Opcode::StoreBufferFormatF32x3: + case IR::Opcode::StoreBufferFormatF32x4: case IR::Opcode::StoreBufferU32: return true; default: @@ -73,6 +77,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { case IR::Opcode::LoadBufferFormatF32x2: case IR::Opcode::LoadBufferFormatF32x3: case IR::Opcode::LoadBufferFormatF32x4: + case IR::Opcode::StoreBufferFormatF32: + case IR::Opcode::StoreBufferFormatF32x2: + case IR::Opcode::StoreBufferFormatF32x3: + case IR::Opcode::StoreBufferFormatF32x4: switch (num_format) { case AmdGpu::NumberFormat::Unorm: case AmdGpu::NumberFormat::Snorm: @@ -112,6 +120,10 @@ bool IsBufferStore(const IR::Inst& inst) { case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: case IR::Opcode::StoreBufferF32x4: + case IR::Opcode::StoreBufferFormatF32: + case IR::Opcode::StoreBufferFormatF32x2: + case IR::Opcode::StoreBufferFormatF32x3: + case IR::Opcode::StoreBufferFormatF32x4: case IR::Opcode::StoreBufferU32: return true; default: @@ -171,6 +183,22 @@ bool IsImageStorageInstruction(const IR::Inst& inst) { } } +u32 ImageOffsetArgumentPosition(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::ImageGather: + case IR::Opcode::ImageGatherDref: + return 2; + case IR::Opcode::ImageSampleExplicitLod: + case 
IR::Opcode::ImageSampleImplicitLod: + return 3; + case IR::Opcode::ImageSampleDrefExplicitLod: + case IR::Opcode::ImageSampleDrefImplicitLod: + return 4; + default: + UNREACHABLE(); + } +} + class Descriptors { public: explicit Descriptors(Info& info_) @@ -376,9 +404,11 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, return -1; } // We have found this pattern. Build the sharp. - std::array buffer; + std::array buffer; buffer[0] = info.pgm_base + p0->Arg(0).U32() + p0->Arg(1).U32(); - buffer[1] = handle->Arg(2).U32() | handle->Arg(3).U64() << 32; + buffer[1] = 0; + buffer[2] = handle->Arg(2).U32(); + buffer[3] = handle->Arg(3).U32(); cbuf = std::bit_cast(buffer); // Assign a binding to this sharp. return descriptors.Add(BufferResource{ @@ -492,6 +522,13 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip const auto tsharp = TrackSharp(tsharp_handle); const auto image = info.ReadUd(tsharp.sgpr_base, tsharp.dword_offset); const auto inst_info = inst.Flags(); + if (!image.Valid()) { + LOG_ERROR(Render_Vulkan, "Shader compiled with unbound image!"); + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + inst.ReplaceUsesWith( + ir.CompositeConstruct(ir.Imm32(0.f), ir.Imm32(0.f), ir.Imm32(0.f), ir.Imm32(0.f))); + return; + } ASSERT(image.GetType() != AmdGpu::ImageType::Invalid); u32 image_binding = descriptors.Add(ImageResource{ .sgpr_base = tsharp.sgpr_base, @@ -565,25 +602,43 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip if (inst_info.has_offset) { // The offsets are six-bit signed integers: X=[5:0], Y=[13:8], and Z=[21:16]. - const bool is_gather = inst.GetOpcode() == IR::Opcode::ImageGather || - inst.GetOpcode() == IR::Opcode::ImageGatherDref; - const u32 arg_pos = is_gather ? 2 : (inst_info.is_depth ? 
4 : 3); + const u32 arg_pos = ImageOffsetArgumentPosition(inst); const IR::Value arg = inst.Arg(arg_pos); ASSERT_MSG(arg.Type() == IR::Type::U32, "Unexpected offset type"); - const auto sign_ext = [&](u32 value) { return ir.Imm32(s32(value << 24) >> 24); }; - union { - u32 raw; - BitField<0, 6, u32> x; - BitField<8, 6, u32> y; - BitField<16, 6, u32> z; - } offset{arg.U32()}; - const IR::Value value = ir.CompositeConstruct(sign_ext(offset.x), sign_ext(offset.y)); - inst.SetArg(arg_pos, value); + + const auto read = [&](u32 offset) -> auto { + return ir.BitFieldExtract(IR::U32{arg}, ir.Imm32(offset), ir.Imm32(6), true); + }; + + switch (image.GetType()) { + case AmdGpu::ImageType::Color1D: + case AmdGpu::ImageType::Color1DArray: + inst.SetArg(arg_pos, read(0)); + break; + case AmdGpu::ImageType::Color2D: + case AmdGpu::ImageType::Color2DArray: + inst.SetArg(arg_pos, ir.CompositeConstruct(read(0), read(8))); + break; + case AmdGpu::ImageType::Color3D: + inst.SetArg(arg_pos, ir.CompositeConstruct(read(0), read(8), read(16))); + break; + default: + UNREACHABLE(); + } } if (inst_info.has_lod_clamp) { - // Final argument contains lod_clamp - const u32 arg_pos = inst_info.is_depth ? 5 : 4; + const u32 arg_pos = [&]() -> u32 { + switch (inst.GetOpcode()) { + case IR::Opcode::ImageSampleImplicitLod: + return 2; + case IR::Opcode::ImageSampleDrefImplicitLod: + return 3; + default: + break; + } + return inst_info.is_depth ? 5 : 4; + }(); inst.SetArg(arg_pos, arg); } if (inst_info.explicit_lod) { @@ -591,7 +646,8 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip inst.GetOpcode() == IR::Opcode::ImageSampleExplicitLod || inst.GetOpcode() == IR::Opcode::ImageSampleDrefExplicitLod); const u32 pos = inst.GetOpcode() == IR::Opcode::ImageSampleExplicitLod ? 2 : 3; - inst.SetArg(pos, arg); + const IR::Value value = inst_info.force_level0 ? 
ir.Imm32(0.f) : arg; + inst.SetArg(pos, value); } } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index 7100b384..52087a65 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -16,18 +16,6 @@ void Visit(Info& info, IR::Inst& inst) { info.stores.Set(inst.Arg(0).Attribute(), inst.Arg(2).U32()); break; } - case IR::Opcode::LoadSharedS8: - case IR::Opcode::LoadSharedU8: - case IR::Opcode::WriteSharedU8: - info.uses_shared_u8 = true; - info.uses_shared = true; - break; - case IR::Opcode::LoadSharedS16: - case IR::Opcode::LoadSharedU16: - case IR::Opcode::WriteSharedU16: - info.uses_shared_u16 = true; - info.uses_shared = true; - break; case IR::Opcode::LoadSharedU32: case IR::Opcode::LoadSharedU64: case IR::Opcode::WriteSharedU32: diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 69eec50f..0f9fd6d4 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -58,6 +58,9 @@ IR::Program TranslateProgram(Common::ObjectPool& inst_pool, Shader::Optimization::SsaRewritePass(program.post_order_blocks); Shader::Optimization::ResourceTrackingPass(program); Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); + if (program.info.stage != Stage::Compute) { + Shader::Optimization::LowerSharedMemToRegisters(program); + } Shader::Optimization::IdentityRemovalPass(program.blocks); Shader::Optimization::DeadCodeEliminationPass(program); Shader::Optimization::CollectShaderInfoPass(program); diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 4ab71c3b..9b592e12 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -116,7 +116,7 @@ struct PushData { std::array buf_offsets; void AddOffset(u32 binding, u32 offset) { - 
ASSERT(offset < 64 && binding < 32); + ASSERT(offset < 256 && binding < buf_offsets.size()); buf_offsets[binding] = offset; } }; @@ -195,8 +195,6 @@ struct Info { bool has_image_query{}; bool uses_group_quad{}; bool uses_shared{}; - bool uses_shared_u8{}; - bool uses_shared_u16{}; bool uses_fp16{}; bool uses_step_rates{}; bool translation_failed{}; // indicates that shader has unsupported instructions diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index af1963ee..dce2d4b4 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -35,7 +35,7 @@ void Liverpool::Process(std::stop_token stoken) { { std::unique_lock lk{submit_mutex}; Common::CondvarWait(submit_cv, lk, stoken, - [this] { return num_submits != 0 || submit_done; }); + [this] { return num_commands || num_submits || submit_done; }); } if (stoken.stop_requested()) { break; @@ -45,7 +45,23 @@ void Liverpool::Process(std::stop_token stoken) { int qid = -1; - while (num_submits) { + while (num_submits || num_commands) { + + // Process incoming commands with high priority, oldest first (FIFO) + while (num_commands) { + + Common::UniqueFunction callback{}; + { + std::unique_lock lk{submit_mutex}; + callback = std::move(command_queue.front()); + command_queue.pop(); + } + + callback(); + + --num_commands; + } + qid = (qid + 1) % NumTotalQueues; auto& queue = mapped_queues[qid]; @@ -180,6 +196,17 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanSignal(Platform::InterruptId::GfxFlip); break; } + case PM4CmdNop::PayloadType::DebugMarkerPush: { + const auto marker_sz = nop->header.count.Value() * 2; + const std::string_view label{reinterpret_cast(&nop->data_block[1]), + marker_sz}; + rasterizer->ScopeMarkerBegin(label); + break; + } + case PM4CmdNop::PayloadType::DebugMarkerPop: { + rasterizer->ScopeMarkerEnd(); + break; + } default: break; } @@ -208,7 +235,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span dcb, 
std::spantype3.count; - if (nop_offset == 0x0e || nop_offset == 0x0d) { + if (nop_offset == 0x0e || nop_offset == 0x0d || nop_offset == 0x0b) { ASSERT_MSG(payload[nop_offset] == 0xc0001000, "NOP hint is missing in CB setup sequence"); last_cb_extent[col_buf_id].raw = payload[nop_offset + 1]; @@ -295,8 +322,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index->draw_initiator; if (rasterizer) { - rasterizer->ScopeMarkerBegin( - fmt::format("dcb:{}:DrawIndex2", reinterpret_cast(dcb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndex2", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->Draw(true); rasterizer->ScopeMarkerEnd(); } @@ -308,8 +336,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index_off->draw_initiator; if (rasterizer) { - rasterizer->ScopeMarkerBegin(fmt::format( - "dcb:{}:DrawIndexOffset2", reinterpret_cast(dcb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndexOffset2", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->Draw(true, draw_index_off->index_offset); rasterizer->ScopeMarkerEnd(); } @@ -320,8 +349,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index->draw_initiator; if (rasterizer) { - rasterizer->ScopeMarkerBegin( - fmt::format("dcb:{}:DrawIndexAuto", reinterpret_cast(dcb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndexAuto", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->Draw(false); rasterizer->ScopeMarkerEnd(); } @@ -334,8 +364,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spandim_z; regs.cs_program.dispatch_initiator = 
dispatch_direct->dispatch_initiator; if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { - rasterizer->ScopeMarkerBegin( - fmt::format("dcb:{}:Dispatch", reinterpret_cast(dcb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:Dispatch", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->DispatchDirect(); rasterizer->ScopeMarkerEnd(); } @@ -393,7 +424,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); - ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); + // ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); // Optimization: VO label waits are special because the emulator // will write to the label when presentation is finished. So if // there are no other submits to yield to we can sleep the thread @@ -486,8 +517,9 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { regs.cs_program.dim_z = dispatch_direct->dim_z; regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { - rasterizer->ScopeMarkerBegin(fmt::format( - "acb[{}]:{}:Dispatch", vqid, reinterpret_cast(acb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("acb[{}]:{}:Dispatch", vqid, cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->DispatchDirect(); rasterizer->ScopeMarkerEnd(); } diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 3ebd9a97..778bd7a5 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -11,10 +11,12 @@ #include #include #include + #include "common/assert.h" #include "common/bit_field.h" #include "common/polyfill_thread.h" #include "common/types.h" +#include "common/unique_function.h" #include "video_core/amdgpu/pixel_format.h" #include "video_core/amdgpu/resource.h" @@ -766,7 +768,8 
@@ struct Liverpool { } TilingMode GetTilingMode() const { - return attrib.tile_mode_index; + return info.linear_general ? TilingMode::Display_Linear + : attrib.tile_mode_index.Value(); } bool IsTiled() const { @@ -866,6 +869,33 @@ struct Liverpool { } }; + union ShaderStageEnable { + u32 raw; + BitField<0, 2, u32> ls_en; + BitField<2, 1, u32> hs_en; + BitField<3, 2, u32> es_en; + BitField<5, 1, u32> gs_en; + BitField<6, 1, u32> vs_en; + + bool IsStageEnabled(u32 stage) { + switch (stage) { + case 0: + case 1: + return true; + case 2: + return gs_en.Value(); + case 3: + return es_en.Value(); + case 4: + return hs_en.Value(); + case 5: + return ls_en.Value(); + default: + UNREACHABLE(); + } + } + }; + union Regs { struct { INSERT_PADDING_WORDS(0x2C08); @@ -944,7 +974,9 @@ struct Liverpool { INSERT_PADDING_WORDS(0xA2A8 - 0xA2A1 - 1); u32 vgt_instance_step_rate_0; u32 vgt_instance_step_rate_1; - INSERT_PADDING_WORDS(0xA2DF - 0xA2A9 - 1); + INSERT_PADDING_WORDS(0xA2D5 - 0xA2A9 - 1); + ShaderStageEnable stage_enable; + INSERT_PADDING_WORDS(9); PolygonOffset poly_offset; INSERT_PADDING_WORDS(0xA2F8 - 0xA2DF - 5); AaConfig aa_config; @@ -1024,6 +1056,13 @@ public: rasterizer = rasterizer_; } + void SendCommand(Common::UniqueFunction&& func) { + std::scoped_lock lk{submit_mutex}; + command_queue.emplace(std::move(func)); + ++num_commands; + submit_cv.notify_one(); + } + private: struct Task { struct promise_type { @@ -1092,9 +1131,11 @@ private: Libraries::VideoOut::VideoOutPort* vo_port{}; std::jthread process_thread{}; std::atomic num_submits{}; + std::atomic num_commands{}; std::atomic submit_done{}; std::mutex submit_mutex; std::condition_variable_any submit_cv; + std::queue> command_queue{}; }; static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08); @@ -1139,6 +1180,7 @@ static_assert(GFX6_3D_REG_INDEX(index_buffer_type) == 0xA29F); static_assert(GFX6_3D_REG_INDEX(enable_primitive_id) == 0xA2A1); static_assert(GFX6_3D_REG_INDEX(vgt_instance_step_rate_0) == 0xA2A8); 
static_assert(GFX6_3D_REG_INDEX(vgt_instance_step_rate_1) == 0xA2A9); +static_assert(GFX6_3D_REG_INDEX(stage_enable) == 0xA2D5); static_assert(GFX6_3D_REG_INDEX(poly_offset) == 0xA2DF); static_assert(GFX6_3D_REG_INDEX(aa_config) == 0xA2F8); static_assert(GFX6_3D_REG_INDEX(color_buffers[0].base_address) == 0xA318); diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index e5f618cc..5ab233fd 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -282,6 +282,13 @@ enum class InterruptSelect : u32 { IrqUndocumented = 3, }; +static u64 GetGpuClock64() { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + auto ticks = std::chrono::duration_cast(duration).count(); + return static_cast(ticks); +} + struct PM4CmdEventWriteEop { PM4Type3Header header; union { @@ -325,6 +332,10 @@ struct PM4CmdEventWriteEop { *Address() = DataQWord(); break; } + case DataSelect::GpuClock64: { + *Address() = GetGpuClock64(); + break; + } case DataSelect::PerfCounter: { *Address() = Common::FencedRDTSC(); break; @@ -652,13 +663,6 @@ struct PM4CmdReleaseMem { return data_lo | u64(data_hi) << 32; } - uint64_t GetGpuClock64() const { - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - auto ticks = std::chrono::duration_cast(duration).count(); - return static_cast(ticks); - } - void SignalFence(Platform::InterruptId irq_id) const { switch (data_sel.Value()) { case DataSelect::Data32Low: { diff --git a/src/video_core/amdgpu/pm4_opcodes.h b/src/video_core/amdgpu/pm4_opcodes.h index 8922c4ea..fba0cbb9 100644 --- a/src/video_core/amdgpu/pm4_opcodes.h +++ b/src/video_core/amdgpu/pm4_opcodes.h @@ -41,6 +41,7 @@ enum class PM4ItOpcode : u32 { CondIndirectBuffer = 0x3F, CopyData = 0x40, CommandProcessorDma = 0x41, + PfpSyncMe = 0x42, SurfaceSync = 0x43, CondWrite = 0x45, EventWrite = 0x46, diff --git a/src/video_core/buffer_cache/buffer.cpp 
b/src/video_core/buffer_cache/buffer.cpp index e9498b35..d112864d 100644 --- a/src/video_core/buffer_cache/buffer.cpp +++ b/src/video_core/buffer_cache/buffer.cpp @@ -106,10 +106,8 @@ Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_ VmaAllocationInfo alloc_info{}; buffer.Create(buffer_ci, usage, &alloc_info); - if (instance->HasDebuggingToolAttached()) { - const auto device = instance->GetDevice(); - Vulkan::SetObjectName(device, Handle(), "Buffer {:#x} {} KiB", cpu_addr, size_bytes / 1024); - } + const auto device = instance->GetDevice(); + Vulkan::SetObjectName(device, Handle(), "Buffer {:#x}:{:#x}", cpu_addr, size_bytes); // Map it if it is host visible. VkMemoryPropertyFlags property_flags{}; @@ -152,10 +150,8 @@ StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); const auto device = instance.GetDevice(); - if (instance.HasDebuggingToolAttached()) { - Vulkan::SetObjectName(device, Handle(), "StreamBuffer({}): {} KiB", BufferTypeName(usage), - size_bytes / 1024); - } + Vulkan::SetObjectName(device, Handle(), "StreamBuffer({}):{:#x}", BufferTypeName(usage), + size_bytes); } std::pair StreamBuffer::Map(u64 size, u64 alignment) { diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h index e0d9da08..d373fbff 100644 --- a/src/video_core/buffer_cache/buffer.h +++ b/src/video_core/buffer_cache/buffer.h @@ -146,6 +146,10 @@ public: return offset; } + u64 GetFreeSize() const { + return size_bytes - offset - mapped_size; + } + private: struct Watch { u64 tick{}; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 7ab0d817..2246807a 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -87,6 +87,15 @@ void BufferCache::DownloadBufferMemory(Buffer& 
buffer, VAddr device_addr, u64 si } bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { + boost::container::small_vector attributes; + boost::container::small_vector bindings; + SCOPE_EXIT { + if (instance.IsVertexInputDynamicState()) { + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.setVertexInputEXT(bindings, attributes); + } + }; + if (vs_info.vs_inputs.empty()) { return false; } @@ -122,6 +131,21 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { } guest_buffers.emplace_back(buffer); ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize()); + attributes.push_back({ + .location = input.binding, + .binding = input.binding, + .format = + Vulkan::LiverpoolToVK::SurfaceFormat(buffer.GetDataFmt(), buffer.GetNumberFmt()), + .offset = 0, + }); + bindings.push_back({ + .binding = input.binding, + .stride = buffer.GetStride(), + .inputRate = input.instance_step_rate == Shader::Info::VsInput::None + ? vk::VertexInputRate::eVertex + : vk::VertexInputRate::eInstance, + .divisor = 1, + }); } std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) { @@ -224,6 +248,19 @@ std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b return {&buffer, buffer.Offset(device_addr)}; } +std::pair BufferCache::ObtainTempBuffer(VAddr gpu_addr, u32 size) { + const u64 page = gpu_addr >> CACHING_PAGEBITS; + const BufferId buffer_id = page_table[page]; + if (buffer_id) { + const Buffer& buffer = slot_buffers[buffer_id]; + if (buffer.IsInBounds(gpu_addr, size)) { + return {&buffer, buffer.Offset(gpu_addr)}; + } + } + const u32 offset = staging_buffer.Copy(gpu_addr, size, 16); + return {&staging_buffer, offset}; +} + bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) { const VAddr end_addr = addr + size; const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); @@ -248,6 +285,10 @@ bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) { return 
memory_tracker.IsRegionCpuModified(addr, size); } +bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) { + return memory_tracker.IsRegionGpuModified(addr, size); +} + BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) { if (device_addr == 0) { return NULL_BUFFER_ID; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 0dee87cf..33ea3f86 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -69,12 +69,18 @@ public: /// Obtains a buffer for the specified region. [[nodiscard]] std::pair ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written); + /// Obtains a temporary buffer for usage in texture cache. + [[nodiscard]] std::pair ObtainTempBuffer(VAddr gpu_addr, u32 size); + /// Return true when a region is registered on the cache [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); /// Return true when a CPU region is modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + private: template void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) { diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index 01526265..4fc32ab2 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -81,6 +81,8 @@ vk::PrimitiveTopology PrimitiveType(Liverpool::PrimitiveType type) { return vk::PrimitiveTopology::eTriangleListWithAdjacency; case Liverpool::PrimitiveType::AdjTriangleStrip: return vk::PrimitiveTopology::eTriangleStripWithAdjacency; + case Liverpool::PrimitiveType::PatchPrimitive: + return vk::PrimitiveTopology::ePatchList; case Liverpool::PrimitiveType::QuadList: // Needs to generate index buffer on the fly. 
return vk::PrimitiveTopology::eTriangleList; @@ -339,6 +341,7 @@ std::span GetAllFormats() { vk::Format::eR32Sint, vk::Format::eR32Uint, vk::Format::eBc6HUfloatBlock, + vk::Format::eBc6HSfloatBlock, vk::Format::eR16G16Unorm, vk::Format::eR16G16B16A16Sscaled, vk::Format::eR16G16Sscaled, @@ -540,6 +543,9 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu if (data_format == AmdGpu::DataFormat::FormatBc6 && num_format == AmdGpu::NumberFormat::Unorm) { return vk::Format::eBc6HUfloatBlock; } + if (data_format == AmdGpu::DataFormat::FormatBc6 && num_format == AmdGpu::NumberFormat::Snorm) { + return vk::Format::eBc6HSfloatBlock; + } if (data_format == AmdGpu::DataFormat::Format8_8_8_8 && num_format == AmdGpu::NumberFormat::Sint) { return vk::Format::eR8G8B8A8Sint; diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 8178c88d..eab9d527 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -47,21 +47,22 @@ public: Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address, bool is_eop) { const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, false); + const auto image_id = texture_cache.FindImage(info); + texture_cache.UpdateImage(image_id, is_eop ? 
nullptr : &flip_scheduler); auto& image = texture_cache.GetImage(image_id); return PrepareFrameInternal(image, is_eop); } - Frame* PrepareBlankFrame() { + Frame* PrepareBlankFrame(bool is_eop) { auto& image = texture_cache.GetImage(VideoCore::NULL_IMAGE_ID); - return PrepareFrameInternal(image, true); + return PrepareFrameInternal(image, is_eop); } VideoCore::Image& RegisterVideoOutSurface( const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address) { vo_buffers_addr.emplace_back(cpu_address); const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, false); + const auto image_id = texture_cache.FindImage(info); return texture_cache.GetImage(image_id); } @@ -75,6 +76,11 @@ public: void Present(Frame* frame); void RecreateFrame(Frame* frame, u32 width, u32 height); + void FlushDraw() { + SubmitInfo info{}; + draw_scheduler.Flush(info); + } + private: Frame* PrepareFrameInternal(VideoCore::Image& image, bool is_eop = true); Frame* GetRenderFrame(); diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 21710a76..62b50eeb 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -96,7 +96,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, Shader::PushData push_data{}; u32 binding{}; - for (u32 i = 0; const auto& buffer : info.buffers) { + for (const auto& buffer : info.buffers) { const auto vsharp = buffer.GetVsharp(info); const VAddr address = vsharp.base_address; // Most of the time when a metadata is updated with a shader it gets cleared. 
It means we @@ -115,7 +115,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, } const u32 size = vsharp.GetSize(); if (buffer.is_written) { - texture_cache.InvalidateMemory(address, size); + texture_cache.InvalidateMemory(address, size, true); } const u32 alignment = buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); @@ -137,7 +137,6 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, : vk::DescriptorType::eUniformBuffer, .pBufferInfo = &buffer_infos.back(), }); - i++; } for (const auto& image_desc : info.images) { diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 5d87a1ca..cf23ade2 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -145,6 +145,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul dynamic_states.push_back(vk::DynamicState::eColorWriteEnableEXT); dynamic_states.push_back(vk::DynamicState::eColorWriteMaskEXT); } + if (instance.IsVertexInputDynamicState()) { + dynamic_states.push_back(vk::DynamicState::eVertexInputEXT); + } const vk::PipelineDynamicStateCreateInfo dynamic_info = { .dynamicStateCount = static_cast(dynamic_states.size()), diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 2d396daf..66da030f 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -8,6 +8,7 @@ #include #include "common/assert.h" +#include "common/config.h" #include "sdl_window.h" #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" @@ -163,7 +164,8 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceColorWriteEnableFeaturesEXT, vk::PhysicalDeviceVulkan12Features, vk::PhysicalDeviceVulkan13Features, 
vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR, - vk::PhysicalDeviceDepthClipControlFeaturesEXT>(); + vk::PhysicalDeviceDepthClipControlFeaturesEXT, vk::PhysicalDeviceRobustness2FeaturesEXT, + vk::PhysicalDevicePortabilitySubsetFeaturesKHR>(); const vk::StructureChain properties_chain = physical_device.getProperties2< vk::PhysicalDeviceProperties2, vk::PhysicalDevicePortabilitySubsetPropertiesKHR, vk::PhysicalDeviceExternalMemoryHostPropertiesEXT, vk::PhysicalDeviceVulkan11Properties>(); @@ -197,10 +199,12 @@ bool Instance::CreateDevice() { external_memory_host = add_extension(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME); custom_border_color = add_extension(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); add_extension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); - add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); + const bool depth_clip_control = add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); workgroup_memory_explicit_layout = add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); + vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); + // The next two extensions are required to be available together in order to support write masks color_write_en = add_extension(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME); color_write_en &= add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); @@ -210,9 +214,21 @@ bool Instance::CreateDevice() { // These extensions are promoted by Vulkan 1.3, but for greater compatibility we use Vulkan 1.2 // with extensions. 
tooling_info = add_extension(VK_EXT_TOOLING_INFO_EXTENSION_NAME); - add_extension(VK_KHR_MAINTENANCE_4_EXTENSION_NAME); + const bool maintenance4 = add_extension(VK_KHR_MAINTENANCE_4_EXTENSION_NAME); add_extension(VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME); add_extension(VK_EXT_SHADER_DEMOTE_TO_HELPER_INVOCATION_EXTENSION_NAME); + const bool has_sync2 = add_extension(VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME); + + if (has_sync2) { + has_nv_checkpoints = Config::isMarkersEnabled() + ? add_extension(VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME) + : false; + } + +#ifdef __APPLE__ + // Required by Vulkan spec if supported. + add_extension(VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME); +#endif const auto family_properties = physical_device.getQueueFamilyProperties(); if (family_properties.empty()) { @@ -308,21 +324,51 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceRobustness2FeaturesEXT{ .nullDescriptor = true, }, + vk::PhysicalDeviceSynchronization2Features{ + .synchronization2 = true, + }, + vk::PhysicalDeviceVertexInputDynamicStateFeaturesEXT{ + .vertexInputDynamicState = true, + }, +#ifdef __APPLE__ + feature_chain.get(), +#endif }; + if (!maintenance4) { + device_chain.unlink(); + } + if (!custom_border_color) { + device_chain.unlink(); + } if (!color_write_en) { device_chain.unlink(); device_chain.unlink(); } - if (!robustness) { + if (!depth_clip_control) { + device_chain.unlink(); + } + if (!workgroup_memory_explicit_layout) { + device_chain.unlink(); + } + if (robustness) { + device_chain.get().nullDescriptor = + feature_chain.get().nullDescriptor; + } else { device_chain.unlink(); } + if (!vertex_input_dynamic_state) { + device_chain.unlink(); + } try { device = physical_device.createDeviceUnique(device_chain.get()); } catch (vk::ExtensionNotPresentError& err) { LOG_CRITICAL(Render_Vulkan, "Some required extensions are not available {}", err.what()); return false; + } catch (vk::FeatureNotPresentError& err) { + LOG_CRITICAL(Render_Vulkan, "Some required 
features are not available {}", err.what()); + return false; } VULKAN_HPP_DEFAULT_DISPATCHER.init(*device); diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index a8c0dcf4..4cb4741a 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -88,6 +88,10 @@ public: return profiler_context; } + bool HasNvCheckpoints() const { + return has_nv_checkpoints; + } + /// Returns true when a known debugging tool is attached. bool HasDebuggingToolAttached() const { return has_renderdoc || has_nsight_graphics; @@ -128,6 +132,11 @@ public: return color_write_en; } + /// Returns true when VK_EXT_vertex_input_dynamic_state is supported. + bool IsVertexInputDynamicState() const { + return vertex_input_dynamic_state; + } + /// Returns the vendor ID of the physical device u32 GetVendorID() const { return properties.vendorID; @@ -253,12 +262,14 @@ private: bool external_memory_host{}; bool workgroup_memory_explicit_layout{}; bool color_write_en{}; + bool vertex_input_dynamic_state{}; u64 min_imported_host_pointer_alignment{}; u32 subgroup_size{}; bool tooling_info{}; bool debug_utils_supported{}; bool has_nsight_graphics{}; bool has_renderdoc{}; + bool has_nv_checkpoints{}; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 8d27d252..38d1f51b 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -115,6 +115,10 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, } const GraphicsPipeline* PipelineCache::GetGraphicsPipeline() { + // Tessellation is unsupported so skip the draw to avoid locking up the driver. 
+ if (liverpool->regs.primitive_type == Liverpool::PrimitiveType::PatchPrimitive) { + return nullptr; + } RefreshGraphicsKey(); const auto [it, is_new] = graphics_pipelines.try_emplace(graphics_key); if (is_new) { @@ -203,12 +207,20 @@ void PipelineCache::RefreshGraphicsKey() { } for (u32 i = 0; i < MaxShaderStages; i++) { + if (!regs.stage_enable.IsStageEnabled(i)) { + key.stage_hashes[i] = 0; + continue; + } auto* pgm = regs.ProgramForStage(i); if (!pgm || !pgm->Address()) { key.stage_hashes[i] = 0; continue; } const auto* bininfo = Liverpool::GetBinaryInfo(*pgm); + if (!bininfo->Valid()) { + key.stage_hashes[i] = 0; + continue; + } key.stage_hashes[i] = bininfo->shader_hash; } } diff --git a/src/video_core/renderer_vulkan/vk_platform.cpp b/src/video_core/renderer_vulkan/vk_platform.cpp index 0915514b..c73a8139 100644 --- a/src/video_core/renderer_vulkan/vk_platform.cpp +++ b/src/video_core/renderer_vulkan/vk_platform.cpp @@ -157,6 +157,10 @@ std::vector GetInstanceExtensions(Frontend::WindowSystemType window break; } +#ifdef __APPLE__ + extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); +#endif + if (window_type != Frontend::WindowSystemType::Headless) { extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); } @@ -221,12 +225,61 @@ vk::UniqueInstance CreateInstance(vk::DynamicLoader& dl, Frontend::WindowSystemT vk::Bool32 enable_sync = enable_validation && Config::vkValidationSyncEnabled() ? vk::True : vk::False; - vk::LayerSettingEXT layer_set = { - .pLayerName = VALIDATION_LAYER_NAME, - .pSettingName = "validate_sync", - .type = vk::LayerSettingTypeEXT::eBool32, - .valueCount = 1, - .pValues = &enable_sync, + vk::Bool32 enable_gpuav = + enable_validation && Config::vkValidationGpuEnabled() ? vk::True : vk::False; + const char* gpuav_mode = enable_validation && Config::vkValidationGpuEnabled() + ?
"GPU_BASED_GPU_ASSISTED" + : "GPU_BASED_NONE"; + const std::array layer_setings = { + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "validate_sync", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_sync, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "sync_queue_submit", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_sync, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "validate_gpu_based", + .type = vk::LayerSettingTypeEXT::eString, + .valueCount = 1, + .pValues = &gpuav_mode, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "gpuav_reserve_binding_slot", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_gpuav, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "gpuav_descriptor_checks", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_gpuav, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "gpuav_validate_indirect_buffer", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_gpuav, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "gpuav_buffer_copies", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_gpuav, + }, }; vk::StructureChain instance_ci_chain = { @@ -236,10 +289,13 @@ vk::UniqueInstance CreateInstance(vk::DynamicLoader& dl, Frontend::WindowSystemT .ppEnabledLayerNames = layers.data(), .enabledExtensionCount = static_cast(extensions.size()), .ppEnabledExtensionNames = extensions.data(), +#ifdef __APPLE__ + .flags = vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR, +#endif }, vk::LayerSettingsCreateInfoEXT{ - .settingCount = 1, - .pSettings = &layer_set, + .settingCount = layer_setings.size(), + 
.pSettings = layer_setings.data(), }, }; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 51de09f7..542624a0 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -152,7 +152,8 @@ void Rasterizer::BeginRendering() { .stencil = regs.stencil_clear}}, }; texture_cache.TouchMeta(htile_address, false); - state.num_depth_attachments++; + state.has_depth = true; + state.has_stencil = image.info.usage.stencil; } scheduler.BeginRendering(state); } @@ -230,16 +231,42 @@ void Rasterizer::UpdateDepthStencilState() { cmdbuf.setDepthBoundsTestEnable(depth.depth_bounds_enable); } -void Rasterizer::ScopeMarkerBegin(const std::string& str) { +void Rasterizer::ScopeMarkerBegin(const std::string_view& str) { + if (!Config::isMarkersEnabled()) { + return; + } + const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.beginDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ - .pLabelName = str.c_str(), + .pLabelName = str.data(), }); } void Rasterizer::ScopeMarkerEnd() { + if (!Config::isMarkersEnabled()) { + return; + } + const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.endDebugUtilsLabelEXT(); } +void Rasterizer::ScopedMarkerInsert(const std::string_view& str) { + if (!Config::isMarkersEnabled()) { + return; + } + + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.insertDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ + .pLabelName = str.data(), + }); +} + +void Rasterizer::Breadcrumb(u64 id) { + if (!instance.HasNvCheckpoints()) { + return; + } + scheduler.CommandBuffer().setCheckpointNV(id); +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 685ba6e0..a151ebc2 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -35,8 +35,10 @@ public: void DispatchDirect(); - void ScopeMarkerBegin(const std::string& 
str); + void ScopeMarkerBegin(const std::string_view& str); void ScopeMarkerEnd(); + void ScopedMarkerInsert(const std::string_view& str); + void Breadcrumb(u64 id); void InvalidateMemory(VAddr addr, u64 size); void MapMemory(VAddr addr, u64 size); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index fb64285f..a6c2536b 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -38,8 +38,7 @@ void Scheduler::BeginRendering(const RenderState& new_state) { .layerCount = 1, .colorAttachmentCount = render_state.num_color_attachments, .pColorAttachments = render_state.color_attachments.data(), - .pDepthAttachment = - render_state.num_depth_attachments ? &render_state.depth_attachment : nullptr, + .pDepthAttachment = render_state.has_depth ? &render_state.depth_attachment : nullptr, }; current_cmdbuf.beginRendering(rendering_info); @@ -50,6 +49,8 @@ void Scheduler::EndRendering() { return; } is_rendering = false; + current_cmdbuf.endRendering(); + boost::container::static_vector barriers; for (size_t i = 0; i < render_state.num_color_attachments; ++i) { barriers.push_back(vk::ImageMemoryBarrier{ @@ -70,10 +71,35 @@ void Scheduler::EndRendering() { }, }); } - current_cmdbuf.endRendering(); + if (render_state.has_depth) { + barriers.push_back(vk::ImageMemoryBarrier{ + .srcAccessMask = vk::AccessFlagBits::eDepthStencilAttachmentWrite, + .dstAccessMask = vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite, + .oldLayout = render_state.depth_attachment.imageLayout, + .newLayout = render_state.depth_attachment.imageLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = render_state.depth_image, + .subresourceRange = + { + .aspectMask = vk::ImageAspectFlagBits::eDepth | + (render_state.has_stencil ? 
vk::ImageAspectFlagBits::eStencil + : vk::ImageAspectFlagBits::eNone), + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }); + } + if (!barriers.empty()) { - current_cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eColorAttachmentOutput, - vk::PipelineStageFlagBits::eFragmentShader, + const auto src_stages = + vk::PipelineStageFlagBits::eColorAttachmentOutput | + (render_state.has_depth ? vk::PipelineStageFlagBits::eLateFragmentTests | + vk::PipelineStageFlagBits::eEarlyFragmentTests + : vk::PipelineStageFlagBits::eNone); + current_cmdbuf.pipelineBarrier(src_stages, vk::PipelineStageFlagBits::eFragmentShader, vk::DependencyFlagBits::eByRegion, {}, {}, barriers); } } @@ -158,6 +184,13 @@ void Scheduler::SubmitExecution(SubmitInfo& info) { try { instance.GetGraphicsQueue().submit(submit_info, info.fence); } catch (vk::DeviceLostError& err) { + if (instance.HasNvCheckpoints()) { + const auto checkpoint_data = instance.GetGraphicsQueue().getCheckpointData2NV(); + for (const auto& cp : checkpoint_data) { + LOG_CRITICAL(Render_Vulkan, "{}: {:#x}", vk::to_string(cp.stage), + reinterpret_cast(cp.pCheckpointMarker)); + } + } UNREACHABLE_MSG("Device lost during submit: {}", err.what()); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index b82d558c..1140bfbc 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -20,7 +20,8 @@ struct RenderState { vk::RenderingAttachmentInfo depth_attachment{}; vk::Image depth_image{}; u32 num_color_attachments{}; - u32 num_depth_attachments{}; + bool has_depth{}; + bool has_stencil{}; u32 width = std::numeric_limits::max(); u32 height = std::numeric_limits::max(); diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index 20c99e30..dcc19bf3 100644 --- 
a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -37,6 +37,16 @@ void Swapchain::Create(u32 width_, u32 height_, vk::SurfaceKHR surface_) { instance.GetPresentQueueFamilyIndex(), }; + const auto modes = instance.GetPhysicalDevice().getSurfacePresentModesKHR(surface); + const auto find_mode = [&modes](vk::PresentModeKHR requested) { + const auto it = + std::find_if(modes.begin(), modes.end(), + [&requested](vk::PresentModeKHR mode) { return mode == requested; }); + + return it != modes.end(); + }; + const bool has_mailbox = find_mode(vk::PresentModeKHR::eMailbox); + const bool exclusive = queue_family_indices[0] == queue_family_indices[1]; const u32 queue_family_indices_count = exclusive ? 1u : 2u; const vk::SharingMode sharing_mode = @@ -55,7 +65,7 @@ void Swapchain::Create(u32 width_, u32 height_, vk::SurfaceKHR surface_) { .pQueueFamilyIndices = queue_family_indices.data(), .preTransform = transform, .compositeAlpha = composite_alpha, - .presentMode = vk::PresentModeKHR::eMailbox, + .presentMode = has_mailbox ? 
vk::PresentModeKHR::eMailbox : (find_mode(vk::PresentModeKHR::eImmediate) ? vk::PresentModeKHR::eImmediate : vk::PresentModeKHR::eFifo), .clipped = true, .oldSwapchain = nullptr, }; @@ -83,6 +93,7 @@ bool Swapchain::AcquireNextImage() { case vk::Result::eSuboptimalKHR: case vk::Result::eErrorSurfaceLostKHR: case vk::Result::eErrorOutOfDateKHR: + case vk::Result::eErrorUnknown: needs_recreation = true; break; default: diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index f7aef847..bae4b89d 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" +#include "common/config.h" #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -116,6 +117,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, : instance{&instance_}, scheduler{&scheduler_}, info{info_}, image{instance->GetDevice(), instance->GetAllocator()}, cpu_addr{info.guest_address}, cpu_addr_end{cpu_addr + info.guest_size_bytes} { + mip_hashes.resize(info.resources.levels); ASSERT(info.pixel_format != vk::Format::eUndefined); // Here we force `eExtendedUsage` as don't know all image usage cases beforehand.
In normal case // the texture cache should re-create the resource with the usage requested @@ -154,6 +156,9 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, }; image.Create(image_ci); + + Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {:#x}:{:#x}", + info.guest_address, info.guest_size_bytes); } void Image::Transit(vk::ImageLayout dst_layout, vk::Flags dst_mask, diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index b18f1002..5a888346 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -111,6 +111,7 @@ struct Image { vk::Flags pl_stage = vk::PipelineStageFlagBits::eAllCommands; vk::Flags access_mask = vk::AccessFlagBits::eNone; vk::ImageLayout layout = vk::ImageLayout::eUndefined; + boost::container::small_vector mip_hashes; }; } // namespace VideoCore diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 94917be0..17b78a6d 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -189,6 +189,8 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slice resources.layers = num_slices; meta_info.htile_addr = buffer.z_info.tile_surface_en ? 
htile_address : 0; usage.depth_target = true; + usage.stencil = + buffer.stencil_info.format != AmdGpu::Liverpool::DepthBuffer::StencilFormat::Invalid; guest_address = buffer.Address(); const auto depth_slice_sz = buffer.GetDepthSliceSize(); @@ -260,7 +262,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { case AmdGpu::TilingMode::Display_MacroTiled: case AmdGpu::TilingMode::Texture_MacroTiled: case AmdGpu::TilingMode::Depth_MacroTiled: { - // ASSERT(!props.is_cube && !props.is_block); + ASSERT(!props.is_block); ASSERT(num_samples == 1); std::tie(mip_info.pitch, mip_info.size) = ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index); diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index ef6163c4..cbf77f2d 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -92,6 +92,8 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer, bool is_vo_surface) noexcept { const auto base_format = Vulkan::LiverpoolToVK::SurfaceFormat(col_buffer.info.format, col_buffer.NumFormat()); + range.base.layer = col_buffer.view.slice_start; + range.extent.layers = col_buffer.NumSlices(); format = Vulkan::LiverpoolToVK::AdjustColorBufferFormat( base_format, col_buffer.info.comp_swap.Value(), is_vo_surface); } diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 53596f8e..6bc893b0 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -3,6 +3,7 @@ #include #include "common/assert.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -11,13 +12,11 @@ namespace VideoCore { -static constexpr u64 StreamBufferSize = 512_MB; static constexpr u64 PageShift = 12; 
TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, BufferCache& buffer_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_}, - staging{instance, scheduler, MemoryUsage::Upload, StreamBufferSize}, tile_manager{instance, scheduler} { ImageInfo info; info.pixel_format = vk::Format::eR8G8B8A8Unorm; @@ -31,9 +30,12 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& TextureCache::~TextureCache() = default; -void TextureCache::InvalidateMemory(VAddr address, size_t size) { +void TextureCache::InvalidateMemory(VAddr address, size_t size, bool from_compute) { std::unique_lock lock{mutex}; ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) { + if (from_compute && !image.Overlaps(address, size)) { + return; + } // Ensure image is reuploaded when accessed again. image.flags |= ImageFlagBits::CpuModified; // Untrack image, so the range is unprotected and the guest can write freely. @@ -57,7 +59,7 @@ void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) { } } -ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) { +ImageId TextureCache::FindImage(const ImageInfo& info) { if (info.guest_address == 0) [[unlikely]] { return NULL_IMAGE_VIEW_ID; } @@ -87,12 +89,6 @@ ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) { image_id = image_ids[image_ids.size() > 1 ? 
1 : 0]; } - Image& image = slot_images[image_id]; - if (True(image.flags & ImageFlagBits::CpuModified) && refresh_on_create) { - RefreshImage(image); - TrackImage(image, image_id); - } - return image_id; } @@ -119,6 +115,7 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) { const ImageId image_id = FindImage(info); + UpdateImage(image_id); Image& image = slot_images[image_id]; auto& usage = image.info.usage; @@ -165,7 +162,8 @@ ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info, const ImageViewInfo& view_info) { const ImageId image_id = FindImage(image_info); Image& image = slot_images[image_id]; - image.flags &= ~ImageFlagBits::CpuModified; + image.flags |= ImageFlagBits::GpuModified; + UpdateImage(image_id); image.Transit(vk::ImageLayout::eColorAttachmentOptimal, vk::AccessFlagBits::eColorAttachmentWrite | @@ -198,8 +196,9 @@ ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info, ImageView& TextureCache::FindDepthTarget(const ImageInfo& image_info, const ImageViewInfo& view_info) { - const ImageId image_id = FindImage(image_info, false); + const ImageId image_id = FindImage(image_info); Image& image = slot_images[image_id]; + image.flags |= ImageFlagBits::GpuModified; image.flags &= ~ImageFlagBits::CpuModified; const auto new_layout = view_info.is_storage ? vk::ImageLayout::eDepthStencilAttachmentOptimal @@ -224,26 +223,10 @@ ImageView& TextureCache::FindDepthTarget(const ImageInfo& image_info, return RegisterImageView(image_id, view_info); } -void TextureCache::RefreshImage(Image& image) { +void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_scheduler /*= nullptr*/) { // Mark image as validated. 
image.flags &= ~ImageFlagBits::CpuModified; - scheduler.EndRendering(); - - const auto cmdbuf = scheduler.CommandBuffer(); - image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); - - vk::Buffer buffer{staging.Handle()}; - u32 offset{0}; - - auto upload_buffer = tile_manager.TryDetile(image); - if (upload_buffer) { - buffer = *upload_buffer; - } else { - // Upload data to the staging buffer. - offset = staging.Copy(image.info.guest_address, image.info.guest_size_bytes, 16); - } - const auto& num_layers = image.info.resources.layers; const auto& num_mips = image.info.resources.levels; ASSERT(num_mips == image.info.mips_layout.size()); @@ -254,12 +237,23 @@ void TextureCache::RefreshImage(Image& image) { const u32 height = std::max(image.info.size.height >> m, 1u); const u32 depth = image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u; - const auto& [_, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; + const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; + + // Protect GPU modified resources from accidental reuploads. 
+ if (True(image.flags & ImageFlagBits::GpuModified) && + !buffer_cache.IsRegionGpuModified(image.info.guest_address + mip_ofs, mip_size)) { + const u8* addr = std::bit_cast(image.info.guest_address); + const u64 hash = XXH3_64bits(addr + mip_ofs, mip_size); + if (image.mip_hashes[m] == hash) { + continue; + } + image.mip_hashes[m] = hash; + } image_copy.push_back({ - .bufferOffset = offset + mip_ofs * num_layers, - .bufferRowLength = static_cast(mip_pitch), - .bufferImageHeight = static_cast(mip_height), + .bufferOffset = mip_ofs * num_layers, + .bufferRowLength = static_cast(mip_pitch), + .bufferImageHeight = static_cast(mip_height), .imageSubresource{ .aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = m, @@ -271,6 +265,32 @@ void TextureCache::RefreshImage(Image& image) { }); } + if (image_copy.empty()) { + return; + } + + auto* sched_ptr = custom_scheduler ? custom_scheduler : &scheduler; + sched_ptr->EndRendering(); + + const auto cmdbuf = sched_ptr->CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite, cmdbuf); + + const VAddr image_addr = image.info.guest_address; + const size_t image_size = image.info.guest_size_bytes; + vk::Buffer buffer{}; + u32 offset{}; + if (auto upload_buffer = tile_manager.TryDetile(image); upload_buffer) { + buffer = *upload_buffer; + } else { + const auto [vk_buffer, buf_offset] = buffer_cache.ObtainTempBuffer(image_addr, image_size); + buffer = vk_buffer->Handle(); + offset = buf_offset; + } + + for (auto& copy : image_copy) { + copy.bufferOffset += offset; + } + cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy); } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 17a09898..137b6014 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -38,13 +38,13 @@ public: ~TextureCache(); /// Invalidates any image in the 
logical page range. - void InvalidateMemory(VAddr address, size_t size); + void InvalidateMemory(VAddr address, size_t size, bool from_compute = false); /// Evicts any images that overlap the unmapped range. void UnmapMemory(VAddr cpu_addr, size_t size); /// Retrieves the image handle of the image with the provided attributes. - [[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true); + [[nodiscard]] ImageId FindImage(const ImageInfo& info); /// Retrieves an image view with the properties of the specified image descriptor. [[nodiscard]] ImageView& FindTexture(const ImageInfo& image_info, @@ -58,8 +58,18 @@ public: [[nodiscard]] ImageView& FindDepthTarget(const ImageInfo& image_info, const ImageViewInfo& view_info); + /// Updates image contents if it was modified by CPU. + void UpdateImage(ImageId image_id, Vulkan::Scheduler* custom_scheduler = nullptr) { + Image& image = slot_images[image_id]; + if (False(image.flags & ImageFlagBits::CpuModified)) { + return; + } + RefreshImage(image, custom_scheduler); + TrackImage(image, image_id); + } + /// Reuploads image contents. - void RefreshImage(Image& image); + void RefreshImage(Image& image, Vulkan::Scheduler* custom_scheduler = nullptr); /// Retrieves the sampler that matches the provided S# descriptor. 
[[nodiscard]] vk::Sampler GetSampler(const AmdGpu::Sampler& sampler); @@ -170,7 +180,6 @@ private: Vulkan::Scheduler& scheduler; BufferCache& buffer_cache; PageManager& tracker; - StreamBuffer staging; TileManager tile_manager; Common::SlotVector slot_images; Common::SlotVector slot_image_views; diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index d3a7d796..f08f2094 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -5,7 +5,6 @@ #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/texture_cache/image_view.h" -#include "video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/tile_manager.h" #include "video_core/host_shaders/detile_m32x1_comp.h" @@ -187,6 +186,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) { case vk::Format::eR32Sfloat: case vk::Format::eR32Uint: case vk::Format::eR16G16Sfloat: + case vk::Format::eR16G16Unorm: return vk::Format::eR32Uint; case vk::Format::eBc1RgbaSrgbBlock: case vk::Format::eBc1RgbaUnormBlock: @@ -194,6 +194,8 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) { case vk::Format::eR32G32Sfloat: case vk::Format::eR32G32Uint: case vk::Format::eR16G16B16A16Unorm: + case vk::Format::eR16G16B16A16Uint: + case vk::Format::eR16G16B16A16Sfloat: return vk::Format::eR32G32Uint; case vk::Format::eBc2SrgbBlock: case vk::Format::eBc2UnormBlock: @@ -397,7 +399,7 @@ std::optional TileManager::TryDetile(Image& image) { const u32 image_size = image.info.guest_size_bytes; const auto [in_buffer, in_offset] = [&] -> std::pair { // Use stream buffer for smaller textures. - if (image_size <= StreamBufferSize) { + if (image_size <= stream_buffer.GetFreeSize()) { u32 offset = stream_buffer.Copy(image.info.guest_address, image_size); return {stream_buffer.Handle(), offset}; }