From d9231b239c59e547e796e34ea384d77bcefdb51e Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Sat, 13 Jul 2024 21:38:20 -0700 Subject: [PATCH] Migrate TLS patches to new patching system. --- CMakeLists.txt | 4 +- src/core/cpu_patches.cpp | 393 ++++++++++++++++++ .../{instruction_emulator.h => cpu_patches.h} | 1 + src/core/instruction_emulator.cpp | 291 ------------- src/core/module.cpp | 4 +- src/core/tls.cpp | 178 +------- src/core/tls.h | 6 +- 7 files changed, 420 insertions(+), 457 deletions(-) create mode 100644 src/core/cpu_patches.cpp rename src/core/{instruction_emulator.h => cpu_patches.h} (82%) delete mode 100644 src/core/instruction_emulator.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c2a7265..f1fd0a99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -292,6 +292,8 @@ set(CORE src/core/aerolib/stubs.cpp src/core/aerolib/aerolib.h src/core/address_space.cpp src/core/address_space.h + src/core/cpu_patches.cpp + src/core/cpu_patches.h src/core/crypto/crypto.cpp src/core/crypto/crypto.h src/core/crypto/keys.h @@ -308,8 +310,6 @@ set(CORE src/core/aerolib/stubs.cpp src/core/file_format/splash.cpp src/core/file_sys/fs.cpp src/core/file_sys/fs.h - src/core/instruction_emulator.cpp - src/core/instruction_emulator.h src/core/loader.cpp src/core/loader.h src/core/loader/dwarf.cpp diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp new file mode 100644 index 00000000..81a775da --- /dev/null +++ b/src/core/cpu_patches.cpp @@ -0,0 +1,393 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include "common/assert.h" +#include "common/types.h" +#include "core/tls.h" +#include "cpu_patches.h" + +#ifdef _WIN32 +#include +#else +#include +#endif + +namespace Core { + +static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) { + if (reg >= ZYDIS_REGISTER_EAX && reg <= ZYDIS_REGISTER_R15D) { 
+ return Xbyak::Reg32(reg - ZYDIS_REGISTER_EAX + Xbyak::Operand::EAX); + } + if (reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15) { + return Xbyak::Reg64(reg - ZYDIS_REGISTER_RAX + Xbyak::Operand::RAX); + } + UNREACHABLE_MSG("Unsupported register: {}", static_cast(reg)); +} + +static Xbyak::Reg ZydisToXbyakRegisterOperand(const ZydisDecodedOperand& operand) { + ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_REGISTER, + "Expected register operand, got type: {}", static_cast(operand.type)); + + return ZydisToXbyakRegister(operand.reg.value); +} + +static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& operand) { + ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_MEMORY, "Expected memory operand, got type: {}", + static_cast(operand.type)); + + Xbyak::RegExp expression{}; + if (operand.mem.base != ZYDIS_REGISTER_NONE) { + expression = expression + ZydisToXbyakRegister(operand.mem.base); + } + if (operand.mem.index != ZYDIS_REGISTER_NONE) { + if (operand.mem.scale != 0) { + expression = expression + ZydisToXbyakRegister(operand.mem.index) * operand.mem.scale; + } else { + expression = expression + ZydisToXbyakRegister(operand.mem.index); + } + } + if (operand.mem.disp.size != 0 && operand.mem.disp.value != 0) { + expression = expression + operand.mem.disp.value; + } + + return Xbyak::util::ptr[expression]; +} + +static std::unique_ptr ZydisToXbyakOperand(const ZydisDecodedOperand& operand) { + switch (operand.type) { + case ZYDIS_OPERAND_TYPE_REGISTER: { + return std::make_unique(ZydisToXbyakRegisterOperand(operand)); + } + case ZYDIS_OPERAND_TYPE_MEMORY: { + return std::make_unique(ZydisToXbyakMemoryOperand(operand)); + } + default: + UNREACHABLE_MSG("Unsupported operand type: {}", static_cast(operand.type)); + } +} + +static bool OperandUsesRegister(const Xbyak::Operand* operand, int index) { + if (operand->isREG()) { + return operand->getIdx() == index; + } + if (operand->isMEM()) { + const Xbyak::RegExp& reg_exp = 
operand->getAddress().getRegExp(); + return reg_exp.getBase().getIdx() == index || reg_exp.getIndex().getIdx() == index; + } + UNREACHABLE_MSG("Unsupported operand kind: {}", static_cast(operand->getKind())); +} + +static bool IsRegisterAllocated( + const std::initializer_list& allocated_registers, const int index) { + return std::ranges::find_if(allocated_registers.begin(), allocated_registers.end(), + [index](const Xbyak::Operand* operand) { + return OperandUsesRegister(operand, index); + }) != allocated_registers.end(); +} + +static Xbyak::Reg AllocateScratchRegister( + const std::initializer_list allocated_registers, const u32 bits) { + for (int index = Xbyak::Operand::R8; index <= Xbyak::Operand::R15; index++) { + if (!IsRegisterAllocated(allocated_registers, index)) { + return Xbyak::Reg32e(index, static_cast(bits)); + } + } + UNREACHABLE_MSG("Out of scratch registers!"); +} + +#ifdef __APPLE__ + +static constexpr u32 MaxSavedRegisters = 3; +static pthread_key_t register_save_slots[MaxSavedRegisters]; +static std::once_flag register_save_init_flag; + +static_assert(sizeof(void*) == sizeof(u64), + "Cannot fit a register inside a thread local storage slot."); + +static void InitializeRegisterSaveSlots() { + for (u32 i = 0; i < MaxSavedRegisters; i++) { + ASSERT_MSG(pthread_key_create(&register_save_slots[i], nullptr) == 0, + "Unable to allocate thread-local register save slot {}", i); + } +} + +static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { + ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to save {} registers.", + regs.size()); + + std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots); + + u32 index = 0; + for (const auto& reg : regs) { + const auto offset = reinterpret_cast(register_save_slots[index++] * sizeof(void*)); + + c.putSeg(Xbyak::util::gs); + c.mov(Xbyak::util::qword[offset], reg.cvt64()); + } +} + +static void RestoreRegisters(Xbyak::CodeGenerator& c, + const std::initializer_list 
regs) { + ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to restore {} registers.", + regs.size()); + + std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots); + + u32 index = 0; + for (const auto& reg : regs) { + const auto offset = reinterpret_cast(register_save_slots[index++] * sizeof(void*)); + + c.putSeg(Xbyak::util::gs); + c.mov(reg.cvt64(), Xbyak::util::qword[offset]); + } +} + +static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakRegisterOperand(operands[0]); + const auto src1 = ZydisToXbyakRegisterOperand(operands[1]); + const auto src2 = ZydisToXbyakOperand(operands[2]); + + const auto scratch = AllocateScratchRegister({&dst, &src1, src2.get()}, dst.getBit()); + + SaveRegisters(c, {scratch}); + + c.mov(scratch, src1); + c.not_(scratch); + c.and_(scratch, *src2); + c.mov(dst, scratch); + + RestoreRegisters(c, {scratch}); +} + +static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakRegisterOperand(operands[0]); + const auto src = ZydisToXbyakOperand(operands[1]); + const auto start_len = ZydisToXbyakRegisterOperand(operands[2]); + + const Xbyak::Reg32e shift(Xbyak::Operand::RCX, static_cast(start_len.getBit())); + const auto scratch1 = + AllocateScratchRegister({&dst, src.get(), &start_len, &shift}, dst.getBit()); + const auto scratch2 = + AllocateScratchRegister({&dst, src.get(), &start_len, &shift, &scratch1}, dst.getBit()); + + if (dst.getIdx() == shift.getIdx()) { + SaveRegisters(c, {scratch1, scratch2}); + } else { + SaveRegisters(c, {scratch1, scratch2, shift}); + } + + c.mov(scratch1, *src); + if (shift.getIdx() != start_len.getIdx()) { + c.mov(shift, start_len); + } + + c.shr(scratch1, shift.cvt8()); + c.shr(shift, 8); + c.mov(scratch2, 1); + c.shl(scratch2, shift.cvt8()); + c.dec(scratch2); + + c.mov(dst, scratch1); + c.and_(dst, scratch2); + + if (dst.getIdx() == shift.getIdx()) { + 
RestoreRegisters(c, {scratch1, scratch2}); + } else { + RestoreRegisters(c, {scratch1, scratch2, shift}); + } +} + +static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakRegisterOperand(operands[0]); + const auto src = ZydisToXbyakOperand(operands[1]); + + const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); + + SaveRegisters(c, {scratch}); + + c.mov(scratch, *src); + c.neg(scratch); + c.and_(scratch, *src); + c.mov(dst, scratch); + + RestoreRegisters(c, {scratch}); +} + +static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakRegisterOperand(operands[0]); + const auto src = ZydisToXbyakOperand(operands[1]); + + const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); + + SaveRegisters(c, {scratch}); + + c.mov(scratch, *src); + c.dec(scratch); + c.xor_(scratch, *src); + c.mov(dst, scratch); + + RestoreRegisters(c, {scratch}); +} + +static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakRegisterOperand(operands[0]); + const auto src = ZydisToXbyakOperand(operands[1]); + + const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); + + SaveRegisters(c, {scratch}); + + c.mov(scratch, *src); + c.dec(scratch); + c.and_(scratch, *src); + c.mov(dst, scratch); + + RestoreRegisters(c, {scratch}); +} + +#endif // __APPLE__ + +static bool FilterTcbAccess(const ZydisDecodedOperand* operands) { + const auto& dst_op = operands[0]; + const auto& src_op = operands[1]; + + // Patch only 'mov (64-bit register), fs:[0]' + return src_op.type == ZYDIS_OPERAND_TYPE_MEMORY && src_op.mem.segment == ZYDIS_REGISTER_FS && + src_op.mem.base == ZYDIS_REGISTER_NONE && src_op.mem.index == ZYDIS_REGISTER_NONE && + src_op.mem.disp.value == 0 && dst_op.reg.value >= ZYDIS_REGISTER_RAX && + dst_op.reg.value <= ZYDIS_REGISTER_R15; +} + 
+static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakRegisterOperand(operands[0]); + const auto slot = GetTcbKey(); + +#if defined(_WIN32) + // The following logic is based on the wine implementation of TlsGetValue + // https://github.com/wine-mirror/wine/blob/a27b9551/dlls/kernelbase/thread.c#L719 + static constexpr u32 TlsSlotsOffset = 0x1480; + static constexpr u32 TlsExpansionSlotsOffset = 0x1780; + static constexpr u32 TlsMinimumAvailable = 64; + + const u32 teb_offset = slot < TlsMinimumAvailable ? TlsSlotsOffset : TlsExpansionSlotsOffset; + const u32 tls_index = slot < TlsMinimumAvailable ? slot : slot - TlsMinimumAvailable; + + // Load the pointer to the table of TLS slots. + c.putSeg(Xbyak::util::gs); + c.mov(dst, Xbyak::util::ptr[reinterpret_cast(teb_offset)]); + // Load the pointer to our buffer. + c.mov(dst, Xbyak::util::qword[dst + tls_index * sizeof(LPVOID)]); +#elif defined(__APPLE__) + // The following logic is based on the Darwin implementation of _os_tsd_get_direct, used by + // pthread_getspecific https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L89-L96 + c.putSeg(Xbyak::util::gs); + c.mov(dst, Xbyak::util::qword[reinterpret_cast(slot * sizeof(void*))]); +#else + const auto src = ZydisToXbyakMemoryOperand(operands[1]); + + // Replace fs read with gs read. + c.putSeg(Xbyak::util::gs); + c.mov(dst, src); +#endif +} + +bool FilterAlwaysTrue(const ZydisDecodedOperand* operands) { + return true; +} + +using PatchFilter = bool (*)(const ZydisDecodedOperand*); +using InstructionGenerator = void (*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&); +struct PatchInfo { + /// Filter for more granular patch conditions past just the instruction mnemonic. + PatchFilter filter; + + /// Generator for the patch/trampoline. + InstructionGenerator generator; + + /// Whether to use a trampoline for this patch. 
+ bool trampoline; +}; + +static const std::unordered_map Patches = { +#if defined(_WIN32) || defined(__APPLE__) + // Windows and Apple need a trampoline. + {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}}, +#else + {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}}, +#endif + +#ifdef __APPLE__ + // BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon. + {ZYDIS_MNEMONIC_ANDN, {FilterAlwaysTrue, GenerateANDN, true}}, + {ZYDIS_MNEMONIC_BEXTR, {FilterAlwaysTrue, GenerateBEXTR, true}}, + {ZYDIS_MNEMONIC_BLSI, {FilterAlwaysTrue, GenerateBLSI, true}}, + {ZYDIS_MNEMONIC_BLSMSK, {FilterAlwaysTrue, GenerateBLSMSK, true}}, + {ZYDIS_MNEMONIC_BLSR, {FilterAlwaysTrue, GenerateBLSR, true}}, +#endif +}; + +void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) { + if (Patches.empty()) { + // Nothing to patch on this platform. + return; + } + + ZydisDecoder instr_decoder; + ZydisDecodedInstruction instruction; + ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; + ZydisDecoderInit(&instr_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); + + u8* code = reinterpret_cast(segment_addr); + u8* end = code + segment_size; + while (code < end) { + ZyanStatus status = + ZydisDecoderDecodeFull(&instr_decoder, code, end - code, &instruction, operands); + if (!ZYAN_SUCCESS(status)) { + code++; + continue; + } + + if (Patches.contains(instruction.mnemonic)) { + auto patch_info = Patches.at(instruction.mnemonic); + if (patch_info.filter(operands)) { + auto patch_gen = Xbyak::CodeGenerator(instruction.length, code); + + if (patch_info.trampoline) { + const auto trampoline_ptr = c.getCurr(); + + patch_info.generator(operands, c); + + // Return to the following instruction at the end of the trampoline. + c.jmp(code + instruction.length); + + // Replace instruction with near jump to the trampoline. 
+ patch_gen.jmp(trampoline_ptr, Xbyak::CodeGenerator::LabelType::T_NEAR); + } else { + patch_info.generator(operands, patch_gen); + } + + const auto patch_size = patch_gen.getCurr() - code; + if (patch_size > 0) { + ASSERT_MSG(instruction.length >= patch_size, + "Instruction {} with length {} is too short to replace at: {}", + ZydisMnemonicGetString(instruction.mnemonic), instruction.length, + fmt::ptr(code)); + + // Fill remaining space with nops. + patch_gen.nop(instruction.length - patch_size); + + LOG_DEBUG(Core, "Patched instruction '{}' at: {}", + ZydisMnemonicGetString(instruction.mnemonic), fmt::ptr(code)); + } + } + } + + code += instruction.length; + } +} + +} // namespace Core diff --git a/src/core/instruction_emulator.h b/src/core/cpu_patches.h similarity index 82% rename from src/core/instruction_emulator.h rename to src/core/cpu_patches.h index 023fd62b..45adbeda 100644 --- a/src/core/instruction_emulator.h +++ b/src/core/cpu_patches.h @@ -9,6 +9,7 @@ class CodeGenerator; namespace Core { +/// Patches CPU instructions that cannot run as-is on the host. 
void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c); } // namespace Core diff --git a/src/core/instruction_emulator.cpp b/src/core/instruction_emulator.cpp deleted file mode 100644 index b0bf4146..00000000 --- a/src/core/instruction_emulator.cpp +++ /dev/null @@ -1,291 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include -#include -#include "common/assert.h" -#include "common/types.h" -#include "instruction_emulator.h" - -namespace Core { - -static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) { - if (reg >= ZYDIS_REGISTER_EAX && reg <= ZYDIS_REGISTER_R15D) { - return Xbyak::Reg32(reg - ZYDIS_REGISTER_EAX); - } else if (reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15) { - return Xbyak::Reg64(reg - ZYDIS_REGISTER_RAX); - } else { - UNREACHABLE_MSG("Unsupported register: {}", static_cast(reg)); - } -} - -static Xbyak::Reg ZydisToXbyakRegisterOperand(const ZydisDecodedOperand& operand) { - ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_REGISTER, "Expected register operand, got type: {}", static_cast(operand.type)); - - return ZydisToXbyakRegister(operand.reg.value); -} - -static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& operand) { - ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_MEMORY, "Expected memory operand, got type: {}", static_cast(operand.type)); - - Xbyak::RegExp expression{}; - if (operand.mem.base != ZYDIS_REGISTER_NONE) { - expression = expression + ZydisToXbyakRegister(operand.mem.base); - } - if (operand.mem.index != ZYDIS_REGISTER_NONE) { - if (operand.mem.scale != 0) { - expression = expression + ZydisToXbyakRegister(operand.mem.index) * operand.mem.scale; - } else { - expression = expression + ZydisToXbyakRegister(operand.mem.index); - } - } - if (operand.mem.disp.size != 0 && operand.mem.disp.value != 0) { - expression = expression + operand.mem.disp.value; - } - - return 
Xbyak::util::ptr[expression]; -} - -static std::unique_ptr ZydisToXbyakOperand(const ZydisDecodedOperand& operand) { - switch (operand.type) { - case ZYDIS_OPERAND_TYPE_REGISTER: { - return std::make_unique(ZydisToXbyakRegisterOperand(operand)); - } - case ZYDIS_OPERAND_TYPE_MEMORY: { - return std::make_unique(ZydisToXbyakMemoryOperand(operand)); - } - default: - UNREACHABLE_MSG("Unsupported operand type: {}", static_cast(operand.type)); - } -} - -#ifdef __APPLE__ - -static bool OperandUsesRegister(const Xbyak::Operand* operand, int index) { - if (operand->isREG()) { - return operand->getIdx() == index; - } - if (operand->isMEM()) { - const Xbyak::RegExp& reg_exp = operand->getAddress().getRegExp(); - return reg_exp.getBase().getIdx() == index || reg_exp.getIndex().getIdx() == index; - } - UNREACHABLE_MSG("Unsupported operand kind: {}", static_cast(operand->getKind())); -} - -static bool IsRegisterAllocated(const std::initializer_list& allocated_registers, const int index) { - return std::ranges::find_if( - allocated_registers.begin(), allocated_registers.end(), - [index](const Xbyak::Operand* operand) { return OperandUsesRegister(operand, index); }) != allocated_registers.end(); -} - -static Xbyak::Reg AllocateScratchRegister(const std::initializer_list allocated_registers, const u32 bits) { - for (int index = Xbyak::Operand::R8; index <= Xbyak::Operand::R15; index++) { - if (!IsRegisterAllocated(allocated_registers, index)) { - return Xbyak::Reg32e(index, static_cast(bits)); - } - } - UNREACHABLE_MSG("Out of scratch registers!"); -} - -static constexpr u32 MaxSavedRegisters = 3; -static pthread_key_t register_save_slots[MaxSavedRegisters]; -static std::once_flag register_save_init_flag; - -static_assert(sizeof(void*) == sizeof(u64), "Cannot fit a register inside a thread local storage slot."); - -static void InitializeRegisterSaveSlots() { - for (u32 i = 0; i < MaxSavedRegisters; i++) { - ASSERT_MSG(pthread_key_create(&register_save_slots[i], nullptr) == 0, - 
"Unable to allocate thread-local register save slot {}", i); - } -} - -static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { - ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to save {} registers.", regs.size()); - - std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots); - - u32 index = 0; - for (const auto& reg : regs) { - const auto offset = reinterpret_cast(register_save_slots[index++] * sizeof(void*)); - - c.putSeg(Xbyak::util::gs); - c.mov(Xbyak::util::qword[offset], reg.cvt64()); - } -} - -static void RestoreRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { - ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to restore {} registers.", regs.size()); - - std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots); - - u32 index = 0; - for (const auto& reg : regs) { - const auto offset = reinterpret_cast(register_save_slots[index++] * sizeof(void*)); - - c.putSeg(Xbyak::util::gs); - c.mov(reg.cvt64(), Xbyak::util::qword[offset]); - } -} - -static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src1 = ZydisToXbyakRegisterOperand(operands[1]); - const auto src2 = ZydisToXbyakOperand(operands[2]); - - const auto scratch = AllocateScratchRegister({&dst, &src1, src2.get()}, dst.getBit()); - - SaveRegisters(c, {scratch}); - - c.mov(scratch, src1); - c.not_(scratch); - c.and_(scratch, *src2); - c.mov(dst, scratch); - - RestoreRegisters(c, {scratch}); -} - -static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - const auto start_len = ZydisToXbyakRegisterOperand(operands[2]); - - const Xbyak::Reg32e shift(Xbyak::Operand::RCX, static_cast(start_len.getBit())); - const auto scratch1 = 
AllocateScratchRegister({&dst, src.get(), &start_len, &shift}, dst.getBit()); - const auto scratch2 = AllocateScratchRegister({&dst, src.get(), &start_len, &shift, &scratch1}, dst.getBit()); - - if (dst.getIdx() == shift.getIdx()) { - SaveRegisters(c, {scratch1, scratch2}); - } else { - SaveRegisters(c, {scratch1, scratch2, shift}); - } - - c.mov(scratch1, *src); - if (shift.getIdx() != start_len.getIdx()) { - c.mov(shift, start_len); - } - - c.shr(scratch1, shift.cvt8()); - c.shr(shift, 8); - c.mov(scratch2, 1); - c.shl(scratch2, shift.cvt8()); - c.dec(scratch2); - - c.mov(dst, scratch1); - c.and_(dst, scratch2); - - if (dst.getIdx() == shift.getIdx()) { - RestoreRegisters(c, {scratch1, scratch2}); - } else { - RestoreRegisters(c, {scratch1, scratch2, shift}); - } -} - -static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - - const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); - - SaveRegisters(c, {scratch}); - - c.mov(scratch, *src); - c.neg(scratch); - c.and_(scratch, *src); - c.mov(dst, scratch); - - RestoreRegisters(c, {scratch}); -} - -static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - - const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); - - SaveRegisters(c, {scratch}); - - c.mov(scratch, *src); - c.dec(scratch); - c.xor_(scratch, *src); - c.mov(dst, scratch); - - RestoreRegisters(c, {scratch}); -} - -static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - - const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); - - SaveRegisters(c, 
{scratch}); - - c.mov(scratch, *src); - c.dec(scratch); - c.and_(scratch, *src); - c.mov(dst, scratch); - - RestoreRegisters(c, {scratch}); -} - -#endif - -using InstructionGenerator = void(*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&); -static const std::unordered_map InstructionGenerators = { -#ifdef __APPLE__ - // BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon. - {ZYDIS_MNEMONIC_ANDN, &GenerateANDN}, - {ZYDIS_MNEMONIC_BEXTR, &GenerateBEXTR}, - {ZYDIS_MNEMONIC_BLSI, &GenerateBLSI}, - {ZYDIS_MNEMONIC_BLSMSK, &GenerateBLSMSK}, - {ZYDIS_MNEMONIC_BLSR, &GenerateBLSR}, -#endif -}; - -void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) { - if (InstructionGenerators.empty()) { - // Nothing to patch on this platform. - return; - } - - ZydisDecoder instr_decoder; - ZydisDecodedInstruction instruction; - ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; - ZydisDecoderInit(&instr_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); - - u8* code = reinterpret_cast(segment_addr); - u8* end = code + segment_size; - while (code < end) { - ZyanStatus status = - ZydisDecoderDecodeFull(&instr_decoder, code, end - code, &instruction, operands); - if (!ZYAN_SUCCESS(status)) { - code++; - continue; - } - - if (InstructionGenerators.contains(instruction.mnemonic)) { - LOG_DEBUG(Core, "Replacing instruction '{}' at: {}", ZydisMnemonicGetString(instruction.mnemonic), - fmt::ptr(code)); - - // Replace instruction with near jump to the trampoline. 
- static constexpr u32 NearJmpSize = 5; - ASSERT_MSG(instruction.length >= NearJmpSize, "Instruction {} with length {} is too short to replace at: {}", - ZydisMnemonicGetString(instruction.mnemonic), instruction.length, fmt::ptr(code)); - - auto patch = Xbyak::CodeGenerator(instruction.length, code); - patch.jmp(c.getCurr(), Xbyak::CodeGenerator::LabelType::T_NEAR); - patch.nop(instruction.length - NearJmpSize); - - auto generator = InstructionGenerators.at(instruction.mnemonic); - generator(operands, c); - c.jmp(code + instruction.length); // Return to the following instruction. - } - - code += instruction.length; - } -} - -} // namespace Loader diff --git a/src/core/module.cpp b/src/core/module.cpp index 70ae4425..3eff2ef6 100644 --- a/src/core/module.cpp +++ b/src/core/module.cpp @@ -7,11 +7,10 @@ #include "common/logging/log.h" #include "common/string_util.h" #include "core/aerolib/aerolib.h" -#include "core/instruction_emulator.h" +#include "core/cpu_patches.h" #include "core/loader/dwarf.h" #include "core/memory.h" #include "core/module.h" -#include "core/tls.h" namespace Core { @@ -132,7 +131,6 @@ void Module::LoadModuleToMemory(u32& max_tls_index) { add_segment(elf_pheader[i]); if (elf_pheader[i].p_flags & PF_EXEC) { - PatchTLS(segment_addr, segment_file_size, c); PatchInstructions(segment_addr, segment_file_size, c); } break; diff --git a/src/core/tls.cpp b/src/core/tls.cpp index d220638e..3216d0fe 100644 --- a/src/core/tls.cpp +++ b/src/core/tls.cpp @@ -1,141 +1,58 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include +#include #include "common/assert.h" #include "common/types.h" #include "core/tls.h" #ifdef _WIN32 #include -#elif !defined(__APPLE__) -#include /* Definition of ARCH_* constants */ -#include /* Definition of SYS_* constants */ +#elif defined(__APPLE__) +#include #endif namespace Core { -struct TLSPattern { - u8 pattern[5]; - u8 pattern_size; - u8 imm_size; - u8 
target_reg; -}; - -constexpr static TLSPattern TlsPatterns[] = { - // 64 48 A1 | 00 00 00 00 00 00 00 00 # mov rax, qword ptr fs:[64b imm] - {{0x64, 0x48, 0xA1}, 3, 8, 0}, - // 64 48 8B 04 25 | 00 00 00 00 # mov rax,qword ptr fs:[0] - {{0x64, 0x48, 0x8B, 0x4, 0x25}, 5, 4, 0}, // rax - {{0x64, 0x48, 0x8B, 0xC, 0x25}, 5, 4, 1}, // rcx - {{0x64, 0x48, 0x8B, 0x14, 0x25}, 5, 4, 2}, // rdx - {{0x64, 0x48, 0x8B, 0x1C, 0x25}, 5, 4, 3}, // rbx - {{0x64, 0x48, 0x8B, 0x24, 0x25}, 5, 4, 4}, // rsp - {{0x64, 0x48, 0x8B, 0x2C, 0x25}, 5, 4, 5}, // rbp - {{0x64, 0x48, 0x8B, 0x34, 0x25}, 5, 4, 6}, // rsi - {{0x64, 0x48, 0x8B, 0x3C, 0x25}, 5, 4, 7}, // rdi - {{0x64, 0x4C, 0x8B, 0x4, 0x25}, 5, 4, 8}, // r8 - {{0x64, 0x4C, 0x8B, 0xC, 0x25}, 5, 4, 9}, // r9 - {{0x64, 0x4C, 0x8B, 0x14, 0x25}, 5, 4, 10}, // r10 - {{0x64, 0x4C, 0x8B, 0x1C, 0x25}, 5, 4, 11}, // r11 - {{0x64, 0x4C, 0x8B, 0x24, 0x25}, 5, 4, 12}, // r12 - {{0x64, 0x4C, 0x8B, 0x2C, 0x25}, 5, 4, 13}, // r13 - {{0x64, 0x4C, 0x8B, 0x34, 0x25}, 5, 4, 14}, // r14 - {{0x64, 0x4C, 0x8B, 0x3C, 0x25}, 5, 4, 15}, // r15 -}; - #ifdef _WIN32 + static DWORD slot = 0; -void SetTcbBase(void* image_address) { - const BOOL result = TlsSetValue(slot, image_address); - ASSERT(result != 0); -} - -Tcb* GetTcbBase() { - return reinterpret_cast(TlsGetValue(slot)); -} - static void AllocTcbKey() { slot = TlsAlloc(); } -static void PatchFsAccess(u8* code, const TLSPattern& tls_pattern, Xbyak::CodeGenerator& c) { - using namespace Xbyak::util; - const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size; +void SetTcbBase(void* image_address) { + const BOOL result = TlsSetValue(GetTcbKey(), image_address); + ASSERT(result != 0); +} - // Replace mov instruction with near jump to the trampoline. - static constexpr u32 NearJmpSize = 5; - auto patch = Xbyak::CodeGenerator(total_size, code); - patch.jmp(c.getCurr(), Xbyak::CodeGenerator::LabelType::T_NEAR); - patch.nop(total_size - NearJmpSize); - - // Write the trampoline. 
- // The following logic is based on the wine implementation of TlsGetValue - // https://github.com/wine-mirror/wine/blob/a27b9551/dlls/kernelbase/thread.c#L719 - static constexpr u32 TlsSlotsOffset = 0x1480; - static constexpr u32 TlsExpansionSlotsOffset = 0x1780; - static constexpr u32 TlsMinimumAvailable = 64; - const u32 teb_offset = slot < TlsMinimumAvailable ? TlsSlotsOffset : TlsExpansionSlotsOffset; - const u32 tls_index = slot < TlsMinimumAvailable ? slot : slot - TlsMinimumAvailable; - - const auto target_reg = Xbyak::Reg64(tls_pattern.target_reg); - c.mov(target_reg, teb_offset); - c.putSeg(gs); - c.mov(target_reg, ptr[target_reg]); // Load the pointer to the table of tls slots. - c.mov(target_reg, - qword[target_reg + tls_index * sizeof(LPVOID)]); // Load the pointer to our buffer. - c.jmp(code + total_size); // Return to the instruction right after the mov. +Tcb* GetTcbBase() { + return reinterpret_cast(TlsGetValue(GetTcbKey())); } #elif defined(__APPLE__) static pthread_key_t slot = 0; -static std::once_flag slot_alloc_flag; static void AllocTcbKey() { ASSERT(pthread_key_create(&slot, nullptr) == 0); } void SetTcbBase(void* image_address) { - std::call_once(slot_alloc_flag, &AllocTcbKey); - ASSERT(pthread_setspecific(slot, image_address) == 0); + ASSERT(pthread_setspecific(GetTcbKey(), image_address) == 0); } Tcb* GetTcbBase() { - std::call_once(slot_alloc_flag, &AllocTcbKey); - return reinterpret_cast(pthread_getspecific(slot)); -} - -static void PatchFsAccess(u8* code, const TLSPattern& tls_pattern, Xbyak::CodeGenerator& c) { - using namespace Xbyak::util; - const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size; - - // Allocate slot in the process if not done already. - std::call_once(slot_alloc_flag, &AllocTcbKey); - - static constexpr u32 NearJmpSize = 5; - - // Replace fs read with gs read. 
- auto patch = Xbyak::CodeGenerator(total_size, code); - patch.jmp(c.getCurr(), Xbyak::CodeGenerator::LabelType::T_NEAR); - patch.nop(total_size - NearJmpSize); - - // Write the trampoline. - const auto target_reg = Xbyak::Reg64(tls_pattern.target_reg); - - // The following logic is based on the Darwin implementation of _os_tsd_get_direct, used by pthread_getspecific - // https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L89-L96 - c.putSeg(gs); - c.mov(target_reg, qword[reinterpret_cast(slot * sizeof(void*))]); // Load the slot data. - - // Return to the instruction right after the mov. - c.jmp(code + total_size); + return reinterpret_cast(pthread_getspecific(GetTcbKey())); } #else -static u32 slot = 0; +// Placeholder for code compatibility. +static constexpr u32 slot = 0; + +static void AllocTcbKey() {} void SetTcbBase(void* image_address) { asm volatile("wrgsbase %0" ::"r"(image_address) : "memory"); @@ -147,68 +64,13 @@ Tcb* GetTcbBase() { return tcb; } -static void AllocTcbKey() {} - -static void PatchFsAccess(u8* code, const TLSPattern& tls_pattern, Xbyak::CodeGenerator& c) { - using namespace Xbyak::util; - const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size; - - // Replace fs read with gs read. - auto patch = Xbyak::CodeGenerator(total_size, code); - patch.putSeg(gs); -} - #endif -void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) { - u8* code = reinterpret_cast(segment_addr); - auto remaining_size = segment_size; +static std::once_flag slot_alloc_flag; - // Sometimes loads from the FS segment are prefixed with useless operand size prefix bytes like: - // |66 66 66| 64 48 8b 04 25 00 # mov rax, qword ptr fs:[0x0] - // These are probably ignored by the processor but when patching the instruction to a jump - // they cause issues. So look for them and patch them to nop to avoid problems. 
- static constexpr std::array BadPrefix = {0x66, 0x66, 0x66}; - - while (remaining_size) { - for (const auto& tls_pattern : TlsPatterns) { - const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size; - if (remaining_size < total_size) { - continue; - } - if (std::memcmp(code, tls_pattern.pattern, tls_pattern.pattern_size) != 0) { - continue; - } - u64 offset = 0; - if (tls_pattern.imm_size == 4) { - std::memcpy(&offset, code + tls_pattern.pattern_size, sizeof(u32)); - LOG_TRACE(Core_Linker, "PATTERN32 FOUND at {}, reg: {} offset: {:#x}", - fmt::ptr(code), tls_pattern.target_reg, offset); - } else { - std::memcpy(&offset, code + tls_pattern.pattern_size, sizeof(u64)); - LOG_ERROR(Core_Linker, "PATTERN64 FOUND at {}, reg: {} offset: {:#x}", - fmt::ptr(code), tls_pattern.target_reg, offset); - continue; - } - ASSERT(offset == 0); - - // Replace bogus instruction prefix with nops if it exists. - if (std::memcmp(code - BadPrefix.size(), BadPrefix.data(), sizeof(BadPrefix)) == 0) { - auto patch = Xbyak::CodeGenerator(BadPrefix.size(), code - BadPrefix.size()); - patch.nop(BadPrefix.size()); - } - - // Patch access to FS register to a trampoline. - PatchFsAccess(code, tls_pattern, c); - - // Move ahead in module. - code += total_size - 1; - remaining_size -= total_size - 1; - break; - } - code++; - remaining_size--; - } +u32 GetTcbKey() { + std::call_once(slot_alloc_flag, &AllocTcbKey); + return slot; } } // namespace Core diff --git a/src/core/tls.h b/src/core/tls.h index 8e546935..9829c8d9 100644 --- a/src/core/tls.h +++ b/src/core/tls.h @@ -22,13 +22,13 @@ struct Tcb { void* tcb_thread; }; +/// Gets the thread local storage key for the TCB block. +u32 GetTcbKey(); + /// Sets the data pointer to the TCB block. void SetTcbBase(void* image_address); /// Retrieves Tcb structure for the calling thread. Tcb* GetTcbBase(); -/// Patches any instructions that access guest TLS to use provided storage. 
-void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c); - } // namespace Core