From ea2e4f7b5c7eb9b1e33c745839ff7df18e607c7d Mon Sep 17 00:00:00 2001 From: raphaelthegreat <47210458+raphaelthegreat@users.noreply.github.com> Date: Mon, 3 Jun 2024 18:52:50 +0300 Subject: [PATCH] Various fixes --- src/audio_core/sdl_audio.cpp | 2 +- src/core/libraries/audio/audioout.cpp | 8 +-- .../libraries/kernel/memory_management.cpp | 2 +- .../libraries/kernel/thread_management.cpp | 38 ++++++++++- src/core/libraries/kernel/thread_management.h | 1 + src/core/linker.cpp | 50 +++++++------- src/core/linker.h | 17 +---- src/core/memory.cpp | 4 +- src/core/module.h | 7 +- src/core/tls.cpp | 4 ++ src/core/tls.h | 14 ++++ src/main.cpp | 5 +- .../frontend/translate/scalar_alu.cpp | 68 +++++++++++++++++-- .../frontend/translate/scalar_memory.cpp | 34 +++------- .../frontend/translate/translate.cpp | 19 +++++- .../frontend/translate/translate.h | 1 + .../frontend/translate/vector_alu.cpp | 30 +++++--- src/shader_recompiler/ir/ir_emitter.cpp | 4 +- src/shader_recompiler/ir/ir_emitter.h | 2 +- src/shader_recompiler/ir/opcodes.inc | 2 +- .../ir/passes/resource_tracking_pass.cpp | 17 +++-- .../renderer_vulkan/vk_pipeline_cache.cpp | 9 ++- .../texture_cache/texture_cache.cpp | 2 +- 23 files changed, 231 insertions(+), 109 deletions(-) diff --git a/src/audio_core/sdl_audio.cpp b/src/audio_core/sdl_audio.cpp index 59e83f8f..6a36c694 100644 --- a/src/audio_core/sdl_audio.cpp +++ b/src/audio_core/sdl_audio.cpp @@ -66,7 +66,7 @@ int SDLAudio::AudioOutOpen(int type, u32 samples_num, u32 freq, port.sample_size = 4; break; default: - UNREACHABLE_MSG("Unknown format"); + UNREACHABLE_MSG("Unknown format {}", u32(format)); } for (int i = 0; i < port.channels_num; i++) { diff --git a/src/core/libraries/audio/audioout.cpp b/src/core/libraries/audio/audioout.cpp index 993970c6..5b778fdf 100644 --- a/src/core/libraries/audio/audioout.cpp +++ b/src/core/libraries/audio/audioout.cpp @@ -234,7 +234,7 @@ s32 PS4_SYSV_ABI sceAudioOutOpen(UserService::OrbisUserServiceUserId user_id, "AudioOutOpen id = {} port_type = {} index = {} lenght= {} sample_rate = {} " "param_type = {}", user_id, GetAudioOutPort(port_type), index, length, sample_rate, - GetAudioOutParam(param_type)); + GetAudioOutParam(param_type & 0xFF)); if ((port_type < 0 || port_type > 4) && (port_type != 127)) { LOG_ERROR(Lib_AudioOut, "Invalid port type"); return ORBIS_AUDIO_OUT_ERROR_INVALID_PORT_TYPE; @@ -243,10 +243,6 @@ s32 PS4_SYSV_ABI sceAudioOutOpen(UserService::OrbisUserServiceUserId user_id, LOG_ERROR(Lib_AudioOut, "Invalid sample rate"); return ORBIS_AUDIO_OUT_ERROR_INVALID_SAMPLE_FREQ; } - if (param_type < 0 || param_type > 7) { - LOG_ERROR(Lib_AudioOut, "Invalid format"); - return ORBIS_AUDIO_OUT_ERROR_INVALID_FORMAT; - } if (length != 256 && length != 512 && length != 768 && length != 1024 && length != 1280 && length != 1536 && length != 1792 && length != 2048) { LOG_ERROR(Lib_AudioOut, "Invalid length"); @@ -255,7 +251,7 @@ s32 PS4_SYSV_ABI sceAudioOutOpen(UserService::OrbisUserServiceUserId user_id, if (index != 0) { LOG_ERROR(Lib_AudioOut, "index is not valid !=0 {}", index); } - int result = audio->AudioOutOpen(port_type, length, sample_rate, param_type); + int result = audio->AudioOutOpen(port_type, length, sample_rate, OrbisAudioOutParam(param_type & 0xFF)); if (result == -1) { LOG_ERROR(Lib_AudioOut, "Audio ports are full"); return ORBIS_AUDIO_OUT_ERROR_PORT_FULL; diff --git a/src/core/libraries/kernel/memory_management.cpp b/src/core/libraries/kernel/memory_management.cpp index 250920cd..b369c2ed 100644 --- a/src/core/libraries/kernel/memory_management.cpp +++ b/src/core/libraries/kernel/memory_management.cpp @@ -28,7 +28,7 @@ int PS4_SYSV_ABI sceKernelAllocateDirectMemory(s64 searchStart, s64 searchEnd, u LOG_ERROR(Kernel_Vmm, "Provided address range is invalid!"); return SCE_KERNEL_ERROR_EINVAL; } - if ((alignment != 0 || Common::Is16KBAligned(alignment)) && !std::has_single_bit(alignment)) { + if (alignment != 0 && !Common::Is16KBAligned(alignment)) { LOG_ERROR(Kernel_Vmm, "Alignment value is invalid!"); return SCE_KERNEL_ERROR_EINVAL; } diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 745293ae..9472ce4b 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -10,7 +10,6 @@ #include "core/libraries/error_codes.h" #include "core/libraries/kernel/thread_management.h" #include "core/libraries/libs.h" -#include "core/tls.h" #include "core/linker.h" #ifdef _WIN64 #include @@ -516,7 +515,7 @@ int PS4_SYSV_ABI scePthreadMutexLock(ScePthreadMutex* mutex) { int result = pthread_mutex_lock(&(*mutex)->pth_mutex); if (result != 0) { - LOG_INFO(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result); + //LOG_INFO(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result); } switch (result) { case 0: @@ -539,7 +538,7 @@ int PS4_SYSV_ABI scePthreadMutexUnlock(ScePthreadMutex* mutex) { int result = pthread_mutex_unlock(&(*mutex)->pth_mutex); if (result != 0) { - LOG_INFO(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result); + //LOG_INFO(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result); } switch (result) { case 0: @@ -1122,6 +1121,34 @@ void* PS4_SYSV_ABI __tls_get_addr(TlsIndex* index) { return linker->TlsGetAddr(index->ti_module, index->ti_offset); } +int PS4_SYSV_ABI posix_sem_init(sem_t *sem, int pshared, unsigned int value) { + return sem_init(sem, pshared, value); +} + +int PS4_SYSV_ABI posix_sem_wait(sem_t *sem) { + return sem_wait(sem); +} + +int PS4_SYSV_ABI posix_sem_post(sem_t *sem) { + return sem_post(sem); +} + +int PS4_SYSV_ABI posix_pthread_mutex_destroy(ScePthreadMutex* mutex) { + // LOG_INFO(Kernel_Pthread, "posix pthread_mutex_init redirect to scePthreadMutexInit"); + int result = scePthreadMutexDestroy(mutex); + if (result < 0) { + int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP + ? result + -SCE_KERNEL_ERROR_UNKNOWN + : POSIX_EOTHER; + return rt; + } + return result; +} + +int PS4_SYSV_ABI posix_pthread_join(ScePthread thread, void** value_ptr) { + return pthread_join(thread->pth, value_ptr); +} + void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("4+h9EzwKF4I", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetschedpolicy); LIB_FUNCTION("-Wreprtu0Qs", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetdetachstate); @@ -1173,9 +1200,14 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("dQHWEsJtoE4", "libScePosix", 1, "libkernel", 1, 1, pthread_mutexattr_init); LIB_FUNCTION("mDmgMOGVUqg", "libScePosix", 1, "libkernel", 1, 1, pthread_mutexattr_settype); LIB_FUNCTION("ttHNfU+qDBU", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_init); + LIB_FUNCTION("ltCfaGr2JGE", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_destroy); LIB_FUNCTION("7H0iTOciTLo", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_lock); LIB_FUNCTION("2Z+PpY6CaJg", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_unlock); LIB_FUNCTION("mkx2fVhNMsg", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_cond_broadcast); + LIB_FUNCTION("h9CcP3J0oVM", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_join); + LIB_FUNCTION("pDuPEf3m4fI", "libScePosix", 1, "libkernel", 1, 1, posix_sem_init); + LIB_FUNCTION("YCV5dGGBcCo", "libScePosix", 1, "libkernel", 1, 1, posix_sem_wait); + LIB_FUNCTION("IKP8typ0QUk", "libScePosix", 1, "libkernel", 1, 1, posix_sem_post); LIB_FUNCTION("QBi7HCK03hw", "libkernel", 1, "libkernel", 1, 1, sceKernelClockGettime); LIB_FUNCTION("lLMT9vJAck0", "libkernel", 1, "libkernel", 1, 1, clock_gettime); diff --git a/src/core/libraries/kernel/thread_management.h b/src/core/libraries/kernel/thread_management.h index b086a7e4..3a72992e 100644 --- a/src/core/libraries/kernel/thread_management.h +++ b/src/core/libraries/kernel/thread_management.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "common/types.h" diff --git a/src/core/linker.cpp b/src/core/linker.cpp index d5ecb793..287f3b6b 100644 --- a/src/core/linker.cpp +++ b/src/core/linker.cpp @@ -57,24 +57,24 @@ void Linker::Execute() { } // Calculate static TLS size. - static constexpr size_t StOff = 0x80; // TODO: What is this offset? - static_tls_size = std::ranges::fold_left(m_modules, StOff, [&](u32 size, auto& module) { - const size_t new_size = size + module->tls.image_size; - module->tls.distance_from_fs = new_size; - return new_size; - }); - - Common::SetCurrentThreadName("GAME_MainThread"); - Libraries::Kernel::pthreadInitSelfMainThread(); - - // Init primary thread TLS. - InitTlsForThread(true); + for (const auto& module : m_modules) { + if (module->tls.image_size != 0) { + module->tls.modid = ++max_tls_index; + } + static_tls_size += module->tls.image_size; + module->tls.offset = static_tls_size; + } // Relocate all modules - for (u32 i = 1; const auto& m : m_modules) { - Relocate(i, m.get()); + for (const auto& m : m_modules) { + Relocate(m.get()); } + // Init primary thread. + Common::SetCurrentThreadName("GAME_MainThread"); + Libraries::Kernel::pthreadInitSelfMainThread(); + InitTlsForThread(true); + // Start shared library modules for (auto& m : m_modules) { if (m->IsSharedLib()) { @@ -113,7 +113,7 @@ Module* Linker::LoadModule(const std::filesystem::path& elf_name) { return m_modules.emplace_back(std::move(module)).get(); } -void Linker::Relocate(u32 index, Module* module) { +void Linker::Relocate(Module* module) { module->ForEachRelocation([&](elf_relocation* rel, bool isJmpRel) { auto type = rel->GetType(); auto symbol = rel->GetSymbol(); @@ -134,7 +134,7 @@ void Linker::Relocate(u32 index, Module* module) { rel_is_resolved = true; break; case R_X86_64_DTPMOD64: - rel_value = static_cast(index); + rel_value = static_cast(module->tls.modid); rel_is_resolved = true; rel_sym_type = Loader::SymbolType::Tls; break; @@ -254,10 +254,11 @@ void Linker::Resolve(const std::string& name, Loader::SymbolType sym_type, Modul } void* Linker::TlsGetAddr(u64 module_index, u64 offset) { + DtvEntry* dtv_table = GetTcbBase()->tcb_dtv; ASSERT_MSG(dtv_table[0].counter == dtv_generation_counter, "Reallocation of DTV table is not supported"); - void* module = dtv_table[module_index + 1].pointer; + void* module = (u8*)dtv_table[module_index + 1].pointer + offset; ASSERT_MSG(module, "DTV allocation is not supported"); return module; } @@ -286,26 +287,29 @@ void Linker::InitTlsForThread(bool is_primary) { } // Initialize allocated memory and allocate DTV table. - const u32 num_dtvs = m_modules.size() - 1; + const u32 num_dtvs = max_tls_index; std::memset(addr_out, 0, total_tls_size); - dtv_table.resize(num_dtvs + 2); + DtvEntry* dtv_table = new DtvEntry[num_dtvs + 2]; // Initialize thread control block u8* addr = reinterpret_cast(addr_out); Tcb* tcb = reinterpret_cast(addr + static_tls_size); tcb->tcb_self = tcb; - tcb->tcb_dtv = dtv_table.data(); + tcb->tcb_dtv = dtv_table; // Dtv[0] is the generation counter. libkernel puts their number into dtv[1] (why?) dtv_table[0].counter = dtv_generation_counter; dtv_table[1].counter = num_dtvs; // Copy init images to TLS thread blocks and map them to DTV slots. - for (u32 i = 2; const auto& module : m_modules) { - u8* dest = reinterpret_cast(addr + static_tls_size - module->tls.distance_from_fs); + for (const auto& module : m_modules) { + if (module->tls.image_size == 0) { + continue; + } + u8* dest = reinterpret_cast(addr + static_tls_size - module->tls.offset); const u8* src = reinterpret_cast(module->tls.image_virtual_addr); std::memcpy(dest, src, module->tls.init_image_size); - tcb->tcb_dtv[i++].pointer = dest; + tcb->tcb_dtv[module->tls.modid + 1].pointer = dest; } // Set pointer to FS base diff --git a/src/core/linker.h b/src/core/linker.h index 7de851bd..76d153c6 100644 --- a/src/core/linker.h +++ b/src/core/linker.h @@ -17,19 +17,6 @@ struct EntryParams { const char* argv[3]; }; -union DtvEntry { - struct { - size_t counter; - }; - void* pointer; -}; - -struct Tcb { - Tcb* tcb_self; - DtvEntry* tcb_dtv; - void* tcb_thread; -}; - using HeapApiFunc = PS4_SYSV_ABI void*(*)(size_t); class Linker { @@ -54,7 +41,7 @@ public: Module* LoadModule(const std::filesystem::path& elf_name); - void Relocate(u32 index, Module* module); + void Relocate(Module* module); void Resolve(const std::string& name, Loader::SymbolType type, Module* module, Loader::SymbolRecord* return_info); void Execute(); @@ -64,9 +51,9 @@ private: const Module* FindExportedModule(const ModuleInfo& m, const LibraryInfo& l); void InitTls(); - std::vector dtv_table; u32 dtv_generation_counter{1}; size_t static_tls_size{}; + size_t max_tls_index{}; HeapApiFunc heap_api_func{}; std::vector> m_modules; Loader::SymbolsResolver m_hle_symbols{}; diff --git a/src/core/memory.cpp b/src/core/memory.cpp index c109dfbe..735c8ee4 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -35,7 +35,9 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size, } // Align free position - free_addr = Common::AlignUp(free_addr, alignment); + if (alignment > 0) { + free_addr = Common::AlignUp(free_addr, alignment); + } ASSERT(free_addr >= search_start && free_addr + size <= search_end); // Add the allocated region to the list and commit its pages. diff --git a/src/core/module.h b/src/core/module.h index ccc3f6b3..a09de2d2 100644 --- a/src/core/module.h +++ b/src/core/module.h @@ -47,10 +47,11 @@ struct LibraryInfo { struct ThreadLocalImage { u64 align; + u64 image_size; + u64 offset; + u32 modid; VAddr image_virtual_addr; u64 init_image_size; - u64 image_size; - u64 distance_from_fs; }; struct DynamicModuleInfo { @@ -166,7 +167,7 @@ public: std::vector m_dynamic_data; Loader::SymbolsResolver export_sym; Loader::SymbolsResolver import_sym; - ThreadLocalImage tls; + ThreadLocalImage tls{}; }; } // namespace Core diff --git a/src/core/tls.cpp b/src/core/tls.cpp index 3cebed4a..de0ba9bf 100644 --- a/src/core/tls.cpp +++ b/src/core/tls.cpp @@ -49,6 +49,10 @@ void SetTcbBase(void* image_address) { ASSERT(result != 0); } +Tcb* GetTcbBase() { + return reinterpret_cast(TlsGetValue(slot)); +} + void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) { using namespace Xbyak::util; diff --git a/src/core/tls.h b/src/core/tls.h index 311d85f0..58ffd26e 100644 --- a/src/core/tls.h +++ b/src/core/tls.h @@ -11,9 +11,23 @@ class CodeGenerator; namespace Core { +union DtvEntry { + size_t counter; + void* pointer; +}; + +struct Tcb { + Tcb* tcb_self; + DtvEntry* tcb_dtv; + void* tcb_thread; +}; + /// Sets the data pointer to the TCB block. void SetTcbBase(void* image_address); +/// Retrieves Tcb structure for the calling thread. +Tcb* GetTcbBase(); + /// Patches any instructions that access guest TLS to use provided storage. void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c); diff --git a/src/main.cpp b/src/main.cpp index 6f43d941..dc3225c8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -87,6 +87,7 @@ int main(int argc, char* argv[]) { linker->LoadModule(entry.path().string().c_str()); } } + // Check if there is a libc.prx in sce_module folder bool found = false; if (Config::isLleLibc()) { @@ -94,7 +95,8 @@ int main(int argc, char* argv[]) { if (std::filesystem::is_directory(sce_module_folder)) { for (const auto& entry : std::filesystem::directory_iterator(sce_module_folder)) { if (entry.path().filename() == "libc.prx" || - entry.path().filename() == "libSceFios2.prx") { + entry.path().filename() == "libSceFios2.prx" || + entry.path().filename() == "libSceNpToolkit2.prx") { found = true; LOG_INFO(Loader, "Loading {}", entry.path().string().c_str()); linker->LoadModule(entry.path().string().c_str()); @@ -105,6 +107,7 @@ int main(int argc, char* argv[]) { if (!found) { Libraries::LibC::libcSymbolsRegister(&linker->GetHLESymbols()); } + std::thread mainthread([linker]() { linker->Execute(); }); Discord::RPC discordRPC; discordRPC.init(); diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 0fb945e3..eef822b3 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -55,8 +55,20 @@ void Translator::S_ANDN2_B64(const GcnInst& inst) { const IR::U1 src0{get_src(inst.src[0])}; const IR::U1 src1{get_src(inst.src[1])}; const IR::U1 result{ir.LogicalAnd(src0, ir.LogicalNot(src1))}; - SetDst(inst.dst[0], result); ir.SetScc(result); + switch (inst.dst[0].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ExecLo: + ir.SetExec(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result); + break; + default: + UNREACHABLE(); + } } void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { @@ -124,9 +136,17 @@ void Translator::S_OR_B64(bool negate, const GcnInst& inst) { if (negate) { result = ir.LogicalNot(result); } - ASSERT(inst.dst[0].field == OperandField::VccLo); - ir.SetVcc(result); ir.SetScc(result); + switch (inst.dst[0].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result); + break; + default: + UNREACHABLE(); + } } void Translator::S_AND_B64(const GcnInst& inst) { @@ -145,9 +165,17 @@ void Translator::S_AND_B64(const GcnInst& inst) { const IR::U1 src0{get_src(inst.src[0])}; const IR::U1 src1{get_src(inst.src[1])}; const IR::U1 result = ir.LogicalAnd(src0, src1); - ASSERT(inst.dst[0].field == OperandField::VccLo); - ir.SetVcc(result); ir.SetScc(result); + switch (inst.dst[0].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result); + break; + default: + UNREACHABLE(); + } } void Translator::S_ADD_I32(const GcnInst& inst) { @@ -179,6 +207,36 @@ void Translator::S_CSELECT_B32(const GcnInst& inst) { SetDst(inst.dst[0], IR::U32{ir.Select(ir.GetScc(), src0, src1)}); } +void Translator::S_CSELECT_B64(const GcnInst& inst) { + const auto get_src = [&](const InstOperand& operand) { + switch (operand.field) { + case OperandField::VccLo: + return ir.GetVcc(); + case OperandField::ExecLo: + return ir.GetExec(); + case OperandField::ScalarGPR: + return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code)); + case OperandField::ConstZero: + return ir.Imm1(false); + default: + UNREACHABLE(); + } + }; + const IR::U1 src0{get_src(inst.src[0])}; + const IR::U1 src1{get_src(inst.src[1])}; + const IR::U1 result{ir.Select(ir.GetScc(), src0, src1)}; + switch (inst.dst[0].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result); + break; + default: + UNREACHABLE(); + } +} + void Translator::S_BFE_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index e76950b7..07abc3b8 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -5,30 +5,15 @@ namespace Shader::Gcn { -void Load(IR::IREmitter& ir, int num_dwords, const IR::Value& handle, IR::ScalarReg dst_reg, - const IR::U32U64& address) { - for (u32 i = 0; i < num_dwords; i++) { - if (handle.IsEmpty()) { - ir.SetScalarReg(dst_reg++, ir.ReadConst(address, ir.Imm32(i))); - } else { - const IR::U32 index = ir.IAdd(address, ir.Imm32(i)); - ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(handle, index)); - } - } -} - void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) { const auto& smrd = inst.control.smrd; + ASSERT_MSG(smrd.imm, "Bindless texture loads unsupported"); const IR::ScalarReg sbase{inst.src[0].code * 2}; - const IR::U32 offset = - smrd.imm ? ir.Imm32(smrd.offset * 4) - : IR::U32{ir.ShiftLeftLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), - ir.Imm32(2))}; - const IR::U64 base = - ir.PackUint2x32(ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1))); - const IR::U64 address = ir.IAdd(base, offset); - const IR::ScalarReg dst_reg{inst.dst[0].code}; - Load(ir, num_dwords, {}, dst_reg, address); + const IR::Value base = ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1)); + IR::ScalarReg dst_reg{inst.dst[0].code}; + for (u32 i = 0; i < num_dwords; i++) { + ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(smrd.offset + i))); + } } void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { @@ -37,8 +22,11 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { const IR::U32 dword_offset = smrd.imm ? ir.Imm32(smrd.offset) : ir.GetScalarReg(IR::ScalarReg(smrd.offset)); const IR::Value vsharp = ir.GetScalarReg(sbase); - const IR::ScalarReg dst_reg{inst.dst[0].code}; - Load(ir, num_dwords, vsharp, dst_reg, dword_offset); + IR::ScalarReg dst_reg{inst.dst[0].code}; + for (u32 i = 0; i < num_dwords; i++) { + const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i)); + ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(vsharp, index)); + } } } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 77b4d742..fa43a9e1 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -256,6 +256,12 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) break; case Opcode::S_WAITCNT: break; + case Opcode::S_LOAD_DWORDX4: + translator.S_LOAD_DWORD(4, inst); + break; + case Opcode::S_LOAD_DWORDX8: + translator.S_LOAD_DWORD(8, inst); + break; case Opcode::S_BUFFER_LOAD_DWORD: translator.S_BUFFER_LOAD_DWORD(1, inst); break; @@ -356,9 +362,15 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_CMP_LG_U32: translator.S_CMP(ConditionOp::LG, false, inst); break; + case Opcode::S_CMP_LG_I32: + translator.S_CMP(ConditionOp::LG, true, inst); + break; case Opcode::S_CMP_EQ_I32: translator.S_CMP(ConditionOp::EQ, true, inst); break; + case Opcode::S_CMP_EQ_U32: + translator.S_CMP(ConditionOp::EQ, false, inst); + break; case Opcode::V_CNDMASK_B32: translator.V_CNDMASK_B32(inst); break; @@ -509,6 +521,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_CSELECT_B32: translator.S_CSELECT_B32(inst); break; + case Opcode::S_CSELECT_B64: + translator.S_CSELECT_B64(inst); + break; case Opcode::S_BFE_U32: translator.S_BFE_U32(inst); break; @@ -516,6 +531,8 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_CBRANCH_EXECZ: case Opcode::S_CBRANCH_SCC0: case Opcode::S_CBRANCH_SCC1: + case Opcode::S_CBRANCH_VCCNZ: + case Opcode::S_CBRANCH_VCCZ: case Opcode::S_BRANCH: case Opcode::S_WQM_B64: case Opcode::V_INTERP_P1_F32: @@ -523,7 +540,7 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) break; default: const u32 opcode = u32(inst.opcode); - UNREACHABLE_MSG("Unknown opcode {}", opcode); + throw NotImplementedException("Opcode {}", opcode); } } } diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index d1efb724..8910f9fa 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -46,6 +46,7 @@ public: void S_AND_B32(const GcnInst& inst); void S_LSHR_B32(const GcnInst& inst); void S_CSELECT_B32(const GcnInst& inst); + void S_CSELECT_B64(const GcnInst& inst); void S_BFE_U32(const GcnInst& inst); // Scalar Memory diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 56f4c82f..10c5c1a1 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -85,21 +85,21 @@ void Translator::V_CVT_F32_U32(const GcnInst& inst) { } void Translator::V_MAD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0])}; - const IR::F32 src1{GetSrc(inst.src[1])}; - const IR::F32 src2{GetSrc(inst.src[2])}; + const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src2{GetSrc(inst.src[2], true)}; SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } void Translator::V_FRACT_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.Fract(src0)); } void Translator::V_ADD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0])}; - const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; SetDst(inst.dst[0], ir.FPAdd(src0, src1)); } @@ -114,14 +114,14 @@ void Translator::V_CVT_OFF_F32_I4(const GcnInst& inst) { void Translator::V_MED3_F32(const GcnInst& inst) { const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1])}; - const IR::F32 src2{GetSrc(inst.src[2])}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src2{GetSrc(inst.src[2], true)}; const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2); SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx)); } void Translator::V_FLOOR_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.FPFloor(src0)); } @@ -167,7 +167,17 @@ void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) { UNREACHABLE(); } }(); - ir.SetVcc(result); + + switch (inst.dst[1].field) { + case OperandField::VccLo: + ir.SetVcc(result); + break; + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), result); + break; + default: + UNREACHABLE(); + } } void Translator::V_MAX_F32(const GcnInst& inst) { diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index b5c067cf..7f3112b4 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -273,8 +273,8 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) }*/ } -U32 IREmitter::ReadConst(const U64& address, const U32& offset) { - return Inst(Opcode::ReadConst, address, offset); +U32 IREmitter::ReadConst(const Value& base, const U32& offset) { + return Inst(Opcode::ReadConst, base, offset); } F32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 3394c9b6..707c127e 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -77,7 +77,7 @@ public: [[nodiscard]] U32U64 ReadShared(int bit_size, bool is_signed, const U32& offset); void WriteShared(int bit_size, const Value& value, const U32& offset); - [[nodiscard]] U32 ReadConst(const U64& address, const U32& offset); + [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] F32 ReadConstBuffer(const Value& handle, const U32& index); [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address, diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index a3009575..bd506f44 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -15,7 +15,7 @@ OPCODE(Epilogue, Void, OPCODE(Discard, Void, ) // Constant memory operations -OPCODE(ReadConst, U32, U64, U32, ) +OPCODE(ReadConst, U32, U32x2, U32, ) OPCODE(ReadConstBuffer, F32, Opaque, U32, ) OPCODE(ReadConstBufferU32, U32, Opaque, U32, ) diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index c8e8d9cf..41da28aa 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -157,16 +157,15 @@ SharpLocation TrackSharp(const IR::Inst* inst) { ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, "Sharp load not from constant memory"); // Retrieve offset from base. - IR::Inst* addr = inst->Arg(0).InstRecursive(); - u32 dword_offset = addr->Arg(1).U32(); - addr = addr->Arg(0).InstRecursive(); - ASSERT_MSG(addr->Arg(1).IsImmediate(), "Bindless not supported"); - dword_offset += addr->Arg(1).U32() >> 2; + const u32 dword_offset = inst->Arg(1).U32(); + const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive(); - // Retrieve SGPR that holds sbase - inst = addr->Arg(0).InstRecursive()->Arg(0).InstRecursive(); - ASSERT_MSG(inst->GetOpcode() == IR::Opcode::GetUserData, "Nested resource loads not supported"); - const IR::ScalarReg base = inst->Arg(0).ScalarReg(); + // Retrieve SGPR pair that holds sbase + const IR::Inst* sbase0 = spgpr_base->Arg(0).InstRecursive(); + const IR::Inst* sbase1 = spgpr_base->Arg(1).InstRecursive(); + ASSERT_MSG(sbase0->GetOpcode() == IR::Opcode::GetUserData && + sbase1->GetOpcode() == IR::Opcode::GetUserData, "Nested resource loads not supported"); + const IR::ScalarReg base = sbase0->Arg(0).ScalarReg(); // Return retrieved location. return SharpLocation{ diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 3a14a02e..9acb5784 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -160,8 +160,13 @@ std::unique_ptr PipelineCache::CreateGraphicsPipeline() { inst_pool.ReleaseContents(); // Recompile shader to IR. - const Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs); - programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); + try { + const Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs); + programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); + } catch (const Shader::Exception& e) { + LOG_ERROR(Render_Vulkan, "{}", e.what()); + std::abort(); + } // Compile IR to SPIR-V auto spv_code = Shader::Backend::SPIRV::EmitSPIRV(profile, programs[i], binding); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 85891ec4..f3fe43a9 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -58,7 +58,7 @@ LONG WINAPI GuestFaultSignalHandler(EXCEPTION_POINTERS* pExp) noexcept { } #endif -static constexpr u64 StreamBufferSize = 128_MB; +static constexpr u64 StreamBufferSize = 512_MB; static constexpr u64 PageShift = 12; TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_)