From a17150960f01b728d17522791c714d2d5ff10c79 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Fri, 19 Jul 2024 01:09:07 -0700 Subject: [PATCH] Add patches for F16C instructions under Rosetta 2. --- .gitmodules | 5 +- CMakeLists.txt | 3 + externals/CMakeLists.txt | 16 +- externals/half | 1 + src/core/cpu_patches.cpp | 291 ++++++++++++++++-- src/core/cpu_patches.h | 6 + .../libraries/kernel/thread_management.cpp | 3 + src/core/linker.cpp | 4 + 8 files changed, 292 insertions(+), 37 deletions(-) create mode 160000 externals/half diff --git a/.gitmodules b/.gitmodules index 94996586..95b0fc0b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -81,4 +81,7 @@ [submodule "externals/ffmpeg-core"] path = externals/ffmpeg-core url = https://github.com/shadps4-emu/ext-ffmpeg-core.git - shallow = true \ No newline at end of file + shallow = true +[submodule "externals/half"] + path = externals/half + url = https://github.com/ROCm/half.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 04740784..5601556d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -638,6 +638,9 @@ if (APPLE) # Replacement for std::chrono::time_zone target_link_libraries(shadps4 PRIVATE date::date-tz) + + # Half float conversions for F16C patches + target_link_libraries(shadps4 PRIVATE half) endif() if (NOT ENABLE_QT_GUI) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 6fe73a29..de0317ff 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -142,11 +142,17 @@ if (WIN32) target_compile_options(sirit PUBLIC "-Wno-error=unused-command-line-argument") endif() -# date -if (APPLE AND NOT TARGET date::date-tz) - option(BUILD_TZ_LIB "" ON) - option(USE_SYSTEM_TZ_DB "" ON) - add_subdirectory(date) +if (APPLE) + # half + add_library(half INTERFACE) + target_include_directories(half INTERFACE half/include) + + # date + if (NOT TARGET date::date-tz) + option(BUILD_TZ_LIB "" ON) + option(USE_SYSTEM_TZ_DB "" ON) + add_subdirectory(date) + endif() endif() # Tracy diff --git a/externals/half b/externals/half new file mode 160000 index 00000000..1ddada22 --- /dev/null +++ b/externals/half @@ -0,0 +1 @@ +Subproject commit 1ddada225144cac0de8f6b5c0dd9acffd99a2e68 diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 42318822..55bbf23b 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -15,6 +15,7 @@ #else #include #ifdef __APPLE__ +#include #include #endif #endif @@ -30,6 +31,12 @@ static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) { if (reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15) { return Xbyak::Reg64(reg - ZYDIS_REGISTER_RAX + Xbyak::Operand::RAX); } + if (reg >= ZYDIS_REGISTER_XMM0 && reg <= ZYDIS_REGISTER_XMM31) { + return Xbyak::Xmm(reg - ZYDIS_REGISTER_XMM0 + xmm0.getIdx()); + } + if (reg >= ZYDIS_REGISTER_YMM0 && reg <= ZYDIS_REGISTER_YMM31) { + return Xbyak::Ymm(reg - ZYDIS_REGISTER_YMM0 + ymm0.getIdx()); + } UNREACHABLE_MSG("Unsupported register: {}", static_cast(reg)); } @@ -66,6 +73,12 @@ static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& opera return ptr[expression]; } +static u64 ZydisToXbyakImmediateOperand(const ZydisDecodedOperand& operand) { + ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_IMMEDIATE, + "Expected immediate operand, got type: {}", static_cast(operand.type)); + return operand.imm.value.u; +} + static std::unique_ptr ZydisToXbyakOperand(const ZydisDecodedOperand& operand) { switch (operand.type) { case ZYDIS_OPERAND_TYPE_REGISTER: { @@ -110,51 +123,135 @@ static Xbyak::Reg AllocateScratchRegister( #ifdef __APPLE__ -static constexpr u32 MaxSavedRegisters = 3; -static pthread_key_t register_save_slots[MaxSavedRegisters]; -static std::once_flag register_save_init_flag; +static pthread_key_t stack_pointer_slot; +static pthread_key_t patch_stack_slot; +static std::once_flag patch_context_slots_init_flag; static_assert(sizeof(void*) == sizeof(u64), "Cannot fit a register inside a thread local storage slot."); -static void InitializeRegisterSaveSlots() { - for (u32 i = 0; i < MaxSavedRegisters; i++) { - ASSERT_MSG(pthread_key_create(®ister_save_slots[i], nullptr) == 0, - "Unable to allocate thread-local register save slot {}", i); +static void InitializePatchContextSlots() { + ASSERT_MSG(pthread_key_create(&stack_pointer_slot, nullptr) == 0, + "Unable to allocate thread-local register for stack pointer."); + ASSERT_MSG(pthread_key_create(&patch_stack_slot, nullptr) == 0, + "Unable to allocate thread-local register for patch stack."); +} + +void InitializeThreadPatchStack() { + std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots); + + const auto* patch_stack = std::malloc(0x1000); + pthread_setspecific(patch_stack_slot, patch_stack); +} + +void CleanupThreadPatchStack() { + std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots); + + auto* patch_stack = pthread_getspecific(patch_stack_slot); + if (patch_stack != nullptr) { + std::free(patch_stack); + pthread_setspecific(patch_stack_slot, nullptr); } } +/// Saves the stack pointer to thread local storage and loads the patch stack. +static void SaveStack(Xbyak::CodeGenerator& c) { + std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots); + + // Save stack pointer and load patch stack. + c.putSeg(gs); + c.mov(qword[reinterpret_cast(stack_pointer_slot * sizeof(void*))], rsp); + c.putSeg(gs); + c.mov(rsp, qword[reinterpret_cast(patch_stack_slot * sizeof(void*))]); +} + +/// Restores the stack pointer from thread local storage. +static void RestoreStack(Xbyak::CodeGenerator& c) { + std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots); + + // Save patch stack pointer and load original stack. + c.putSeg(gs); + c.mov(qword[reinterpret_cast(patch_stack_slot * sizeof(void*))], rsp); + c.putSeg(gs); + c.mov(rsp, qword[reinterpret_cast(stack_pointer_slot * sizeof(void*))]); +} + +#else + +// These utilities are not implemented as we can't save anything to thread local storage without +// temporary registers. +void InitializeThreadPatchStack() { + // No-op +} + +void CleanupThreadPatchStack() { + // No-op +} + +/// Saves the stack pointer to thread local storage and loads the patch stack. +static void SaveStack(Xbyak::CodeGenerator& c) { + UNIMPLEMENTED(); +} + +/// Restores the stack pointer from thread local storage. +static void RestoreStack(Xbyak::CodeGenerator& c) { + UNIMPLEMENTED(); +} + +#endif + +/// Switches to the patch stack, saves registers, and restores the original stack. static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { - ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to save {} registers.", - regs.size()); - - std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots); - - u32 index = 0; + SaveStack(c); for (const auto& reg : regs) { - const auto offset = reinterpret_cast(register_save_slots[index++] * sizeof(void*)); - - c.putSeg(gs); - c.mov(qword[offset], reg.cvt64()); + c.push(reg.cvt64()); } + RestoreStack(c); } +/// Switches to the patch stack, restores registers, and restores the original stack. static void RestoreRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { - ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to restore {} registers.", - regs.size()); - - std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots); - - u32 index = 0; + SaveStack(c); for (const auto& reg : regs) { - const auto offset = reinterpret_cast(register_save_slots[index++] * sizeof(void*)); + c.pop(reg.cvt64()); + } + RestoreStack(c); +} - c.putSeg(gs); - c.mov(reg.cvt64(), qword[offset]); +/// Switches to the patch stack and stores all registers. +static void SaveContext(Xbyak::CodeGenerator& c) { + SaveStack(c); + for (int reg = Xbyak::Operand::RAX; reg <= Xbyak::Operand::R15; reg++) { + c.push(Xbyak::Reg64(reg)); + } + for (int reg = 0; reg <= 7; reg++) { + c.sub(rsp, 32); + c.vmovdqu(ptr[rsp], Xbyak::Ymm(reg)); } } +/// Restores all registers and restores the original stack. +/// If the destination is a register, it is not restored to preserve the output. +static void RestoreContext(Xbyak::CodeGenerator& c, const Xbyak::Operand& dst) { + for (int reg = 7; reg >= 0; reg--) { + if ((!dst.isXMM() && !dst.isYMM()) || dst.getIdx() != reg) { + c.vmovdqu(Xbyak::Ymm(reg), ptr[rsp]); + } + c.add(rsp, 32); + } + for (int reg = Xbyak::Operand::R15; reg >= Xbyak::Operand::RAX; reg--) { + if (!dst.isREG() || dst.getIdx() != reg) { + c.pop(Xbyak::Reg64(reg)); + } else { + c.add(rsp, 4); + } + } + RestoreStack(c); +} + +#ifdef __APPLE__ + static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src1 = ZydisToXbyakRegisterOperand(operands[1]); @@ -204,9 +301,9 @@ static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenera c.and_(dst, scratch2); if (dst.getIdx() == shift.getIdx()) { - RestoreRegisters(c, {scratch1, scratch2}); + RestoreRegisters(c, {scratch2, scratch1}); } else { - RestoreRegisters(c, {scratch1, scratch2, shift}); + RestoreRegisters(c, {shift, scratch2, scratch1}); } } @@ -258,10 +355,138 @@ static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerat RestoreRegisters(c, {scratch}); } -bool FilterRosetta2Only(const ZydisDecodedOperand*) { +static __attribute__((sysv_abi)) void PerformVCVTPH2PS(float* out, const half_float::half* in, + const u32 count) { + for (u32 i = 0; i < count; i++) { + out[i] = half_float::half_cast(in[i]); + } +} + +static void GenerateVCVTPH2PS(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakRegisterOperand(operands[0]); + const auto src = ZydisToXbyakOperand(operands[1]); + + const auto float_count = dst.getBit() / 32; + const auto byte_count = float_count * 4; + + SaveContext(c); + + // Allocate stack space for outputs and load into first parameter. + c.sub(rsp, byte_count); + c.mov(rdi, rsp); + + if (src->isXMM()) { + // Allocate stack space for inputs and load into second parameter. + c.sub(rsp, byte_count); + c.mov(rsi, rsp); + + // Move input to the allocated space. + c.movdqu(ptr[rsp], *reinterpret_cast(src.get())); + } else { + c.lea(rsi, src->getAddress()); + } + + // Load float count into third parameter. + c.mov(rdx, float_count); + + c.mov(rax, reinterpret_cast(PerformVCVTPH2PS)); + c.call(rax); + + if (src->isXMM()) { + // Clean up after inputs space. + c.add(rsp, byte_count); + } + + // Load outputs into destination register and clean up space. + if (dst.isYMM()) { + c.vmovdqu(*reinterpret_cast(&dst), ptr[rsp]); + } else { + c.movdqu(*reinterpret_cast(&dst), ptr[rsp]); + } + c.add(rsp, byte_count); + + RestoreContext(c, dst); +} + +using SingleToHalfFloatConverter = half_float::half (*)(float); +static const SingleToHalfFloatConverter SingleToHalfFloatConverters[4] = { + half_float::half_cast, + half_float::half_cast, + half_float::half_cast, + half_float::half_cast, +}; + +static __attribute__((sysv_abi)) void PerformVCVTPS2PH(half_float::half* out, const float* in, + const u32 count, const u8 rounding_mode) { + const auto conversion_func = SingleToHalfFloatConverters[rounding_mode]; + + for (u32 i = 0; i < count; i++) { + out[i] = conversion_func(in[i]); + } +} + +static void GenerateVCVTPS2PH(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakOperand(operands[0]); + const auto src = ZydisToXbyakRegisterOperand(operands[1]); + const auto ctrl = ZydisToXbyakImmediateOperand(operands[2]); + + const auto float_count = src.getBit() / 32; + const auto byte_count = float_count * 4; + + SaveContext(c); + + if (dst->isXMM()) { + // Allocate stack space for outputs and load into first parameter. + c.sub(rsp, byte_count); + c.mov(rdi, rsp); + } else { + c.lea(rdi, dst->getAddress()); + } + + // Allocate stack space for inputs and load into second parameter. + c.sub(rsp, byte_count); + c.mov(rsi, rsp); + + // Move input to the allocated space. + if (src.isYMM()) { + c.vmovdqu(ptr[rsp], *reinterpret_cast(&src)); + } else { + c.movdqu(ptr[rsp], *reinterpret_cast(&src)); + } + + // Load float count into third parameter. + c.mov(rdx, float_count); + + // Load rounding mode into fourth parameter. + if (ctrl & 4) { + // Load from MXCSR.RC. + c.stmxcsr(ptr[rsp - 4]); + c.mov(rcx, ptr[rsp - 4]); + c.shr(rcx, 13); + c.and_(rcx, 3); + } else { + c.mov(rcx, ctrl & 3); + } + + c.mov(rax, reinterpret_cast(PerformVCVTPS2PH)); + c.call(rax); + + // Clean up after inputs space. + c.add(rsp, byte_count); + + if (dst->isXMM()) { + // Load outputs into destination register and clean up space. + c.movdqu(*reinterpret_cast(dst.get()), ptr[rsp]); + c.add(rsp, byte_count); + } + + RestoreContext(c, *dst); +} + +static bool FilterRosetta2Only(const ZydisDecodedOperand*) { int ret = 0; size_t size = sizeof(ret); - if (sysctlbyname("sysctl.proc_translated", &ret, &size, NULL, 0) != 0) { + if (sysctlbyname("sysctl.proc_translated", &ret, &size, nullptr, 0) != 0) { return false; } return ret; @@ -339,12 +564,16 @@ static const std::unordered_map Patches = { #endif #ifdef __APPLE__ - // BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon. + // Patches for instruction sets not supported by Rosetta 2. + // BMI1 {ZYDIS_MNEMONIC_ANDN, {FilterRosetta2Only, GenerateANDN, true}}, {ZYDIS_MNEMONIC_BEXTR, {FilterRosetta2Only, GenerateBEXTR, true}}, {ZYDIS_MNEMONIC_BLSI, {FilterRosetta2Only, GenerateBLSI, true}}, {ZYDIS_MNEMONIC_BLSMSK, {FilterRosetta2Only, GenerateBLSMSK, true}}, {ZYDIS_MNEMONIC_BLSR, {FilterRosetta2Only, GenerateBLSR, true}}, + // F16C + {ZYDIS_MNEMONIC_VCVTPH2PS, {FilterRosetta2Only, GenerateVCVTPH2PS, true}}, + {ZYDIS_MNEMONIC_VCVTPS2PH, {FilterRosetta2Only, GenerateVCVTPS2PH, true}}, #endif }; diff --git a/src/core/cpu_patches.h b/src/core/cpu_patches.h index 45adbeda..9126074e 100644 --- a/src/core/cpu_patches.h +++ b/src/core/cpu_patches.h @@ -9,6 +9,12 @@ class CodeGenerator; namespace Core { +/// Initializes a stack for the current thread for use by patch implementations. +void InitializeThreadPatchStack(); + +/// Cleans up the patch stack for the current thread. +void CleanupThreadPatchStack(); + /// Patches CPU instructions that cannot run as-is on the host. void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c); diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 605d0d29..ac7c8da8 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -10,6 +10,7 @@ #include "common/logging/log.h" #include "common/singleton.h" #include "common/thread.h" +#include "core/cpu_patches.h" #include "core/libraries/error_codes.h" #include "core/libraries/kernel/libkernel.h" #include "core/libraries/kernel/thread_management.h" @@ -985,6 +986,7 @@ static void cleanup_thread(void* arg) { destructor(value); } } + Core::CleanupThreadPatchStack(); thread->is_almost_done = true; } @@ -993,6 +995,7 @@ static void* run_thread(void* arg) { Common::SetCurrentThreadName(thread->name.c_str()); auto* linker = Common::Singleton::Instance(); linker->InitTlsForThread(false); + Core::InitializeThreadPatchStack(); void* ret = nullptr; g_pthread_self = thread; pthread_cleanup_push(cleanup_thread, thread); diff --git a/src/core/linker.cpp b/src/core/linker.cpp index 9783ad96..2e242129 100644 --- a/src/core/linker.cpp +++ b/src/core/linker.cpp @@ -10,6 +10,7 @@ #include "common/thread.h" #include "core/aerolib/aerolib.h" #include "core/aerolib/stubs.h" +#include "core/cpu_patches.h" #include "core/libraries/kernel/memory_management.h" #include "core/libraries/kernel/thread_management.h" #include "core/linker.h" @@ -86,6 +87,7 @@ void Linker::Execute() { Common::SetCurrentThreadName("GAME_MainThread"); Libraries::Kernel::pthreadInitSelfMainThread(); InitTlsForThread(true); + InitializeThreadPatchStack(); // Start shared library modules for (auto& m : m_modules) { @@ -104,6 +106,8 @@ void Linker::Execute() { RunMainEntry(m->GetEntryAddress(), &p, ProgramExitFunc); } } + + CleanupThreadPatchStack(); } s32 Linker::LoadModule(const std::filesystem::path& elf_name, bool is_dynamic) {