Migrate TLS patches to new patching system.
This commit is contained in:
parent
6a6d5bad42
commit
d9231b239c
|
@ -292,6 +292,8 @@ set(CORE src/core/aerolib/stubs.cpp
|
||||||
src/core/aerolib/aerolib.h
|
src/core/aerolib/aerolib.h
|
||||||
src/core/address_space.cpp
|
src/core/address_space.cpp
|
||||||
src/core/address_space.h
|
src/core/address_space.h
|
||||||
|
src/core/cpu_patches.cpp
|
||||||
|
src/core/cpu_patches.h
|
||||||
src/core/crypto/crypto.cpp
|
src/core/crypto/crypto.cpp
|
||||||
src/core/crypto/crypto.h
|
src/core/crypto/crypto.h
|
||||||
src/core/crypto/keys.h
|
src/core/crypto/keys.h
|
||||||
|
@ -308,8 +310,6 @@ set(CORE src/core/aerolib/stubs.cpp
|
||||||
src/core/file_format/splash.cpp
|
src/core/file_format/splash.cpp
|
||||||
src/core/file_sys/fs.cpp
|
src/core/file_sys/fs.cpp
|
||||||
src/core/file_sys/fs.h
|
src/core/file_sys/fs.h
|
||||||
src/core/instruction_emulator.cpp
|
|
||||||
src/core/instruction_emulator.h
|
|
||||||
src/core/loader.cpp
|
src/core/loader.cpp
|
||||||
src/core/loader.h
|
src/core/loader.h
|
||||||
src/core/loader/dwarf.cpp
|
src/core/loader/dwarf.cpp
|
||||||
|
|
|
@ -0,0 +1,393 @@
|
||||||
|
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <mutex>
|
||||||
|
#include <Zydis/Zydis.h>
|
||||||
|
#include <xbyak/xbyak.h>
|
||||||
|
#include "common/assert.h"
|
||||||
|
#include "common/types.h"
|
||||||
|
#include "core/tls.h"
|
||||||
|
#include "cpu_patches.h"
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
#include <pthread.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace Core {
|
||||||
|
|
||||||
|
/// Maps a Zydis general-purpose register onto the matching Xbyak register.
/// Only the 32-bit (EAX..R15D) and 64-bit (RAX..R15) families are supported;
/// any other register is reported through UNREACHABLE_MSG.
static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) {
    const bool is_gpr32 = reg >= ZYDIS_REGISTER_EAX && reg <= ZYDIS_REGISTER_R15D;
    const bool is_gpr64 = reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15;

    if (is_gpr32) {
        // Position inside the 32-bit family, rebased onto Xbyak's numbering.
        const auto xbyak_index = reg - ZYDIS_REGISTER_EAX + Xbyak::Operand::EAX;
        return Xbyak::Reg32(xbyak_index);
    } else if (is_gpr64) {
        // Position inside the 64-bit family, rebased onto Xbyak's numbering.
        const auto xbyak_index = reg - ZYDIS_REGISTER_RAX + Xbyak::Operand::RAX;
        return Xbyak::Reg64(xbyak_index);
    } else {
        UNREACHABLE_MSG("Unsupported register: {}", static_cast<u32>(reg));
    }
}
|
||||||
|
|
||||||
|
/// Converts a decoded operand that must be a register into its Xbyak register.
/// Asserts when the operand is of any other type.
static Xbyak::Reg ZydisToXbyakRegisterOperand(const ZydisDecodedOperand& operand) {
    const bool is_register = operand.type == ZYDIS_OPERAND_TYPE_REGISTER;
    ASSERT_MSG(is_register, "Expected register operand, got type: {}",
               static_cast<u32>(operand.type));

    const ZydisRegister decoded_reg = operand.reg.value;
    return ZydisToXbyakRegister(decoded_reg);
}
|
||||||
|
|
||||||
|
/// Rebuilds a decoded memory operand as an Xbyak address expression made of the
/// optional base register, optional (scaled) index register and displacement.
/// Asserts when the operand is not a memory operand.
/// NOTE(review): the segment override of the operand is not carried over here;
/// callers that need one emit it themselves (see the putSeg calls in this file).
static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& operand) {
    ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_MEMORY, "Expected memory operand, got type: {}",
               static_cast<u32>(operand.type));

    const auto& mem = operand.mem;
    Xbyak::RegExp address{};

    if (mem.base != ZYDIS_REGISTER_NONE) {
        address = address + ZydisToXbyakRegister(mem.base);
    }

    if (mem.index != ZYDIS_REGISTER_NONE) {
        const auto index_reg = ZydisToXbyakRegister(mem.index);
        // A scale of zero is treated as "no scaling": the index is added as-is.
        address = mem.scale == 0 ? address + index_reg : address + index_reg * mem.scale;
    }

    const bool has_displacement = mem.disp.size != 0 && mem.disp.value != 0;
    if (has_displacement) {
        address = address + mem.disp.value;
    }

    return Xbyak::util::ptr[address];
}
|
||||||
|
|
||||||
|
static std::unique_ptr<Xbyak::Operand> ZydisToXbyakOperand(const ZydisDecodedOperand& operand) {
|
||||||
|
switch (operand.type) {
|
||||||
|
case ZYDIS_OPERAND_TYPE_REGISTER: {
|
||||||
|
return std::make_unique<Xbyak::Reg>(ZydisToXbyakRegisterOperand(operand));
|
||||||
|
}
|
||||||
|
case ZYDIS_OPERAND_TYPE_MEMORY: {
|
||||||
|
return std::make_unique<Xbyak::Address>(ZydisToXbyakMemoryOperand(operand));
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
UNREACHABLE_MSG("Unsupported operand type: {}", static_cast<u32>(operand.type));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reports whether `operand` involves the register with encoding index `index`:
/// a register operand matches on its own index, a memory operand matches when
/// either its base or its index register has that index.
static bool OperandUsesRegister(const Xbyak::Operand* operand, int index) {
    if (operand->isREG()) {
        return operand->getIdx() == index;
    }
    if (!operand->isMEM()) {
        UNREACHABLE_MSG("Unsupported operand kind: {}", static_cast<u32>(operand->getKind()));
    }

    const Xbyak::RegExp& address = operand->getAddress().getRegExp();
    if (address.getBase().getIdx() == index) {
        return true;
    }
    return address.getIndex().getIdx() == index;
}
|
||||||
|
|
||||||
|
static bool IsRegisterAllocated(
|
||||||
|
const std::initializer_list<const Xbyak::Operand*>& allocated_registers, const int index) {
|
||||||
|
return std::ranges::find_if(allocated_registers.begin(), allocated_registers.end(),
|
||||||
|
[index](const Xbyak::Operand* operand) {
|
||||||
|
return OperandUsesRegister(operand, index);
|
||||||
|
}) != allocated_registers.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Picks the first register in R8..R15 that none of `allocated_registers` uses
/// and returns it with a width of `bits`. Reports through UNREACHABLE_MSG when
/// all eight candidates are taken.
static Xbyak::Reg AllocateScratchRegister(
    const std::initializer_list<const Xbyak::Operand*> allocated_registers, const u32 bits) {
    for (int candidate = Xbyak::Operand::R8; candidate <= Xbyak::Operand::R15; ++candidate) {
        if (IsRegisterAllocated(allocated_registers, candidate)) {
            continue;
        }
        return Xbyak::Reg32e(candidate, static_cast<int>(bits));
    }
    UNREACHABLE_MSG("Out of scratch registers!");
}
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
|
||||||
|
static constexpr u32 MaxSavedRegisters = 3;
|
||||||
|
static pthread_key_t register_save_slots[MaxSavedRegisters];
|
||||||
|
static std::once_flag register_save_init_flag;
|
||||||
|
|
||||||
|
static_assert(sizeof(void*) == sizeof(u64),
|
||||||
|
"Cannot fit a register inside a thread local storage slot.");
|
||||||
|
|
||||||
|
static void InitializeRegisterSaveSlots() {
|
||||||
|
for (u32 i = 0; i < MaxSavedRegisters; i++) {
|
||||||
|
ASSERT_MSG(pthread_key_create(®ister_save_slots[i], nullptr) == 0,
|
||||||
|
"Unable to allocate thread-local register save slot {}", i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Emits code into `c` that stores each register of `regs` (widened to 64 bits)
/// into its own thread-local save slot, addressed through the GS segment at
/// `key * sizeof(void*)`. At most MaxSavedRegisters registers may be passed.
static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) {
    ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to save {} registers.",
               regs.size());

    // The pthread keys are created lazily, the first time any patch needs them.
    std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots);

    const pthread_key_t* slot = register_save_slots;
    for (auto reg = regs.begin(); reg != regs.end(); ++reg, ++slot) {
        const auto offset = reinterpret_cast<void*>(*slot * sizeof(void*));

        c.putSeg(Xbyak::util::gs);
        c.mov(Xbyak::util::qword[offset], reg->cvt64());
    }
}
|
||||||
|
|
||||||
|
/// Emits code into `c` that reloads each register of `regs` (as 64-bit) from
/// the thread-local save slot with the same position, addressed through the GS
/// segment. Must be given the same list, in the same order, as SaveRegisters.
static void RestoreRegisters(Xbyak::CodeGenerator& c,
                             const std::initializer_list<Xbyak::Reg> regs) {
    ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to restore {} registers.",
               regs.size());

    // The pthread keys are created lazily, the first time any patch needs them.
    std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots);

    const pthread_key_t* slot = register_save_slots;
    for (auto reg = regs.begin(); reg != regs.end(); ++reg, ++slot) {
        const auto offset = reinterpret_cast<void*>(*slot * sizeof(void*));

        c.putSeg(Xbyak::util::gs);
        c.mov(reg->cvt64(), Xbyak::util::qword[offset]);
    }
}
|
||||||
|
|
||||||
|
static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||||
|
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||||
|
const auto src1 = ZydisToXbyakRegisterOperand(operands[1]);
|
||||||
|
const auto src2 = ZydisToXbyakOperand(operands[2]);
|
||||||
|
|
||||||
|
const auto scratch = AllocateScratchRegister({&dst, &src1, src2.get()}, dst.getBit());
|
||||||
|
|
||||||
|
SaveRegisters(c, {scratch});
|
||||||
|
|
||||||
|
c.mov(scratch, src1);
|
||||||
|
c.not_(scratch);
|
||||||
|
c.and_(scratch, *src2);
|
||||||
|
c.mov(dst, scratch);
|
||||||
|
|
||||||
|
RestoreRegisters(c, {scratch});
|
||||||
|
}
|
||||||
|
|
||||||
|
static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||||
|
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||||
|
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||||
|
const auto start_len = ZydisToXbyakRegisterOperand(operands[2]);
|
||||||
|
|
||||||
|
const Xbyak::Reg32e shift(Xbyak::Operand::RCX, static_cast<int>(start_len.getBit()));
|
||||||
|
const auto scratch1 =
|
||||||
|
AllocateScratchRegister({&dst, src.get(), &start_len, &shift}, dst.getBit());
|
||||||
|
const auto scratch2 =
|
||||||
|
AllocateScratchRegister({&dst, src.get(), &start_len, &shift, &scratch1}, dst.getBit());
|
||||||
|
|
||||||
|
if (dst.getIdx() == shift.getIdx()) {
|
||||||
|
SaveRegisters(c, {scratch1, scratch2});
|
||||||
|
} else {
|
||||||
|
SaveRegisters(c, {scratch1, scratch2, shift});
|
||||||
|
}
|
||||||
|
|
||||||
|
c.mov(scratch1, *src);
|
||||||
|
if (shift.getIdx() != start_len.getIdx()) {
|
||||||
|
c.mov(shift, start_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
c.shr(scratch1, shift.cvt8());
|
||||||
|
c.shr(shift, 8);
|
||||||
|
c.mov(scratch2, 1);
|
||||||
|
c.shl(scratch2, shift.cvt8());
|
||||||
|
c.dec(scratch2);
|
||||||
|
|
||||||
|
c.mov(dst, scratch1);
|
||||||
|
c.and_(dst, scratch2);
|
||||||
|
|
||||||
|
if (dst.getIdx() == shift.getIdx()) {
|
||||||
|
RestoreRegisters(c, {scratch1, scratch2});
|
||||||
|
} else {
|
||||||
|
RestoreRegisters(c, {scratch1, scratch2, shift});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||||
|
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||||
|
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||||
|
|
||||||
|
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
||||||
|
|
||||||
|
SaveRegisters(c, {scratch});
|
||||||
|
|
||||||
|
c.mov(scratch, *src);
|
||||||
|
c.neg(scratch);
|
||||||
|
c.and_(scratch, *src);
|
||||||
|
c.mov(dst, scratch);
|
||||||
|
|
||||||
|
RestoreRegisters(c, {scratch});
|
||||||
|
}
|
||||||
|
|
||||||
|
static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||||
|
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||||
|
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||||
|
|
||||||
|
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
||||||
|
|
||||||
|
SaveRegisters(c, {scratch});
|
||||||
|
|
||||||
|
c.mov(scratch, *src);
|
||||||
|
c.dec(scratch);
|
||||||
|
c.xor_(scratch, *src);
|
||||||
|
c.mov(dst, scratch);
|
||||||
|
|
||||||
|
RestoreRegisters(c, {scratch});
|
||||||
|
}
|
||||||
|
|
||||||
|
static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||||
|
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||||
|
const auto src = ZydisToXbyakOperand(operands[1]);
|
||||||
|
|
||||||
|
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
||||||
|
|
||||||
|
SaveRegisters(c, {scratch});
|
||||||
|
|
||||||
|
c.mov(scratch, *src);
|
||||||
|
c.dec(scratch);
|
||||||
|
c.and_(scratch, *src);
|
||||||
|
c.mov(dst, scratch);
|
||||||
|
|
||||||
|
RestoreRegisters(c, {scratch});
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // __APPLE__
|
||||||
|
|
||||||
|
static bool FilterTcbAccess(const ZydisDecodedOperand* operands) {
|
||||||
|
const auto& dst_op = operands[0];
|
||||||
|
const auto& src_op = operands[1];
|
||||||
|
|
||||||
|
// Patch only 'mov (64-bit register), fs:[0]'
|
||||||
|
return src_op.type == ZYDIS_OPERAND_TYPE_MEMORY && src_op.mem.segment == ZYDIS_REGISTER_FS &&
|
||||||
|
src_op.mem.base == ZYDIS_REGISTER_NONE && src_op.mem.index == ZYDIS_REGISTER_NONE &&
|
||||||
|
src_op.mem.disp.value == 0 && dst_op.reg.value >= ZYDIS_REGISTER_RAX &&
|
||||||
|
dst_op.reg.value <= ZYDIS_REGISTER_R15;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
||||||
|
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
||||||
|
const auto slot = GetTcbKey();
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
// The following logic is based on the wine implementation of TlsGetValue
|
||||||
|
// https://github.com/wine-mirror/wine/blob/a27b9551/dlls/kernelbase/thread.c#L719
|
||||||
|
static constexpr u32 TlsSlotsOffset = 0x1480;
|
||||||
|
static constexpr u32 TlsExpansionSlotsOffset = 0x1780;
|
||||||
|
static constexpr u32 TlsMinimumAvailable = 64;
|
||||||
|
|
||||||
|
const u32 teb_offset = slot < TlsMinimumAvailable ? TlsSlotsOffset : TlsExpansionSlotsOffset;
|
||||||
|
const u32 tls_index = slot < TlsMinimumAvailable ? slot : slot - TlsMinimumAvailable;
|
||||||
|
|
||||||
|
// Load the pointer to the table of TLS slots.
|
||||||
|
c.putSeg(Xbyak::util::gs);
|
||||||
|
c.mov(dst, Xbyak::util::ptr[reinterpret_cast<void*>(teb_offset)]);
|
||||||
|
// Load the pointer to our buffer.
|
||||||
|
c.mov(dst, Xbyak::util::qword[dst + tls_index * sizeof(LPVOID)]);
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
// The following logic is based on the Darwin implementation of _os_tsd_get_direct, used by
|
||||||
|
// pthread_getspecific https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L89-L96
|
||||||
|
c.putSeg(Xbyak::util::gs);
|
||||||
|
c.mov(dst, Xbyak::util::qword[reinterpret_cast<void*>(slot * sizeof(void*))]);
|
||||||
|
#else
|
||||||
|
const auto src = ZydisToXbyakMemoryOperand(operands[1]);
|
||||||
|
|
||||||
|
// Replace fs read with gs read.
|
||||||
|
c.putSeg(Xbyak::util::gs);
|
||||||
|
c.mov(dst, src);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Patch filter that accepts every instruction with a matching mnemonic.
/// File-local like the other helpers here; it is only referenced from the
/// patch table on some platforms, hence [[maybe_unused]].
[[maybe_unused]] static bool FilterAlwaysTrue(const ZydisDecodedOperand* /*operands*/) {
    return true;
}
|
||||||
|
|
||||||
|
/// Predicate over the decoded operands of an instruction; returns true when the
/// instruction should be patched.
using PatchFilter = bool (*)(const ZydisDecodedOperand*);
/// Emits the replacement code for an instruction, given its decoded operands,
/// into the supplied code generator.
using InstructionGenerator = void (*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&);
/// Describes how instructions with one particular mnemonic are patched.
/// Initialized positionally, so the member order is significant.
struct PatchInfo {
    /// Filter for more granular patch conditions past just the instruction mnemonic.
    PatchFilter filter;

    /// Generator for the patch/trampoline.
    InstructionGenerator generator;

    /// Whether to use a trampoline for this patch.
    /// When false, the generator writes directly over the original instruction.
    bool trampoline;
};
|
||||||
|
|
||||||
|
static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = {
|
||||||
|
#if defined(_WIN32) || defined(__APPLE__)
|
||||||
|
// Windows and Apple need a trampoline.
|
||||||
|
{ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}},
|
||||||
|
#else
|
||||||
|
{ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}},
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
// BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon.
|
||||||
|
{ZYDIS_MNEMONIC_ANDN, {FilterAlwaysTrue, GenerateANDN, true}},
|
||||||
|
{ZYDIS_MNEMONIC_BEXTR, {FilterAlwaysTrue, GenerateBEXTR, true}},
|
||||||
|
{ZYDIS_MNEMONIC_BLSI, {FilterAlwaysTrue, GenerateBLSI, true}},
|
||||||
|
{ZYDIS_MNEMONIC_BLSMSK, {FilterAlwaysTrue, GenerateBLSMSK, true}},
|
||||||
|
{ZYDIS_MNEMONIC_BLSR, {FilterAlwaysTrue, GenerateBLSR, true}},
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Scans the executable segment [segment_addr, segment_addr + segment_size) and
/// rewrites, in place, every instruction that has an entry in Patches and
/// passes that entry's filter.
///
/// @param segment_addr Address of the first byte of the segment to scan.
/// @param segment_size Number of bytes to scan.
/// @param c            Code generator that receives trampoline code for patches
///                     that need one.
///
/// For trampoline patches the replacement is emitted into `c`, followed by a
/// jump back to the instruction after the patched one, and the original
/// instruction is overwritten with a near jump to that trampoline. Otherwise
/// the replacement is written directly over the original instruction. Leftover
/// bytes of the original instruction are filled with nops.
void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) {
    if (Patches.empty()) {
        // Nothing to patch on this platform.
        return;
    }

    ZydisDecoder instr_decoder;
    ZydisDecodedInstruction instruction;
    ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
    ZydisDecoderInit(&instr_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);

    u8* code = reinterpret_cast<u8*>(segment_addr);
    u8* const end = code + segment_size;
    while (code < end) {
        const ZyanStatus status =
            ZydisDecoderDecodeFull(&instr_decoder, code, end - code, &instruction, operands);
        if (!ZYAN_SUCCESS(status)) {
            // Could not decode at this position: advance one byte and retry.
            code++;
            continue;
        }

        // Single lookup per decoded instruction; the entry is used by reference.
        const auto patch_it = Patches.find(instruction.mnemonic);
        if (patch_it != Patches.end() && patch_it->second.filter(operands)) {
            const PatchInfo& patch_info = patch_it->second;

            // Generator that writes over the bytes of the original instruction.
            auto patch_gen = Xbyak::CodeGenerator(instruction.length, code);

            if (patch_info.trampoline) {
                const auto trampoline_ptr = c.getCurr();

                patch_info.generator(operands, c);

                // Return to the following instruction at the end of the trampoline.
                c.jmp(code + instruction.length);

                // Replace instruction with near jump to the trampoline.
                patch_gen.jmp(trampoline_ptr, Xbyak::CodeGenerator::LabelType::T_NEAR);
            } else {
                patch_info.generator(operands, patch_gen);
            }

            const auto patch_size = patch_gen.getCurr() - code;
            if (patch_size > 0) {
                ASSERT_MSG(instruction.length >= patch_size,
                           "Instruction {} with length {} is too short to replace at: {}",
                           ZydisMnemonicGetString(instruction.mnemonic), instruction.length,
                           fmt::ptr(code));

                // Fill remaining space with nops.
                patch_gen.nop(instruction.length - patch_size);

                LOG_DEBUG(Core, "Patched instruction '{}' at: {}",
                          ZydisMnemonicGetString(instruction.mnemonic), fmt::ptr(code));
            }
        }

        code += instruction.length;
    }
}
|
||||||
|
|
||||||
|
} // namespace Core
|
|
@ -9,6 +9,7 @@ class CodeGenerator;
|
||||||
|
|
||||||
namespace Core {
|
namespace Core {
|
||||||
|
|
||||||
|
/// Patches CPU instructions that cannot run as-is on the host.
|
||||||
void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c);
|
void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c);
|
||||||
|
|
||||||
} // namespace Core
|
} // namespace Core
|
|
@ -1,291 +0,0 @@
|
||||||
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
|
||||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
||||||
|
|
||||||
#include <memory>
|
|
||||||
#include <mutex>
|
|
||||||
#include <Zydis/Zydis.h>
|
|
||||||
#include <xbyak/xbyak.h>
|
|
||||||
#include "common/assert.h"
|
|
||||||
#include "common/types.h"
|
|
||||||
#include "instruction_emulator.h"
|
|
||||||
|
|
||||||
namespace Core {
|
|
||||||
|
|
||||||
static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) {
|
|
||||||
if (reg >= ZYDIS_REGISTER_EAX && reg <= ZYDIS_REGISTER_R15D) {
|
|
||||||
return Xbyak::Reg32(reg - ZYDIS_REGISTER_EAX);
|
|
||||||
} else if (reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15) {
|
|
||||||
return Xbyak::Reg64(reg - ZYDIS_REGISTER_RAX);
|
|
||||||
} else {
|
|
||||||
UNREACHABLE_MSG("Unsupported register: {}", static_cast<u32>(reg));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static Xbyak::Reg ZydisToXbyakRegisterOperand(const ZydisDecodedOperand& operand) {
|
|
||||||
ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_REGISTER, "Expected register operand, got type: {}", static_cast<u32>(operand.type));
|
|
||||||
|
|
||||||
return ZydisToXbyakRegister(operand.reg.value);
|
|
||||||
}
|
|
||||||
|
|
||||||
static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& operand) {
|
|
||||||
ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_MEMORY, "Expected memory operand, got type: {}", static_cast<u32>(operand.type));
|
|
||||||
|
|
||||||
Xbyak::RegExp expression{};
|
|
||||||
if (operand.mem.base != ZYDIS_REGISTER_NONE) {
|
|
||||||
expression = expression + ZydisToXbyakRegister(operand.mem.base);
|
|
||||||
}
|
|
||||||
if (operand.mem.index != ZYDIS_REGISTER_NONE) {
|
|
||||||
if (operand.mem.scale != 0) {
|
|
||||||
expression = expression + ZydisToXbyakRegister(operand.mem.index) * operand.mem.scale;
|
|
||||||
} else {
|
|
||||||
expression = expression + ZydisToXbyakRegister(operand.mem.index);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (operand.mem.disp.size != 0 && operand.mem.disp.value != 0) {
|
|
||||||
expression = expression + operand.mem.disp.value;
|
|
||||||
}
|
|
||||||
|
|
||||||
return Xbyak::util::ptr[expression];
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::unique_ptr<Xbyak::Operand> ZydisToXbyakOperand(const ZydisDecodedOperand& operand) {
|
|
||||||
switch (operand.type) {
|
|
||||||
case ZYDIS_OPERAND_TYPE_REGISTER: {
|
|
||||||
return std::make_unique<Xbyak::Reg>(ZydisToXbyakRegisterOperand(operand));
|
|
||||||
}
|
|
||||||
case ZYDIS_OPERAND_TYPE_MEMORY: {
|
|
||||||
return std::make_unique<Xbyak::Address>(ZydisToXbyakMemoryOperand(operand));
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
UNREACHABLE_MSG("Unsupported operand type: {}", static_cast<u32>(operand.type));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef __APPLE__
|
|
||||||
|
|
||||||
static bool OperandUsesRegister(const Xbyak::Operand* operand, int index) {
|
|
||||||
if (operand->isREG()) {
|
|
||||||
return operand->getIdx() == index;
|
|
||||||
}
|
|
||||||
if (operand->isMEM()) {
|
|
||||||
const Xbyak::RegExp& reg_exp = operand->getAddress().getRegExp();
|
|
||||||
return reg_exp.getBase().getIdx() == index || reg_exp.getIndex().getIdx() == index;
|
|
||||||
}
|
|
||||||
UNREACHABLE_MSG("Unsupported operand kind: {}", static_cast<u32>(operand->getKind()));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool IsRegisterAllocated(const std::initializer_list<const Xbyak::Operand*>& allocated_registers, const int index) {
|
|
||||||
return std::ranges::find_if(
|
|
||||||
allocated_registers.begin(), allocated_registers.end(),
|
|
||||||
[index](const Xbyak::Operand* operand) { return OperandUsesRegister(operand, index); }) != allocated_registers.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
static Xbyak::Reg AllocateScratchRegister(const std::initializer_list<const Xbyak::Operand*> allocated_registers, const u32 bits) {
|
|
||||||
for (int index = Xbyak::Operand::R8; index <= Xbyak::Operand::R15; index++) {
|
|
||||||
if (!IsRegisterAllocated(allocated_registers, index)) {
|
|
||||||
return Xbyak::Reg32e(index, static_cast<int>(bits));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
UNREACHABLE_MSG("Out of scratch registers!");
|
|
||||||
}
|
|
||||||
|
|
||||||
static constexpr u32 MaxSavedRegisters = 3;
|
|
||||||
static pthread_key_t register_save_slots[MaxSavedRegisters];
|
|
||||||
static std::once_flag register_save_init_flag;
|
|
||||||
|
|
||||||
static_assert(sizeof(void*) == sizeof(u64), "Cannot fit a register inside a thread local storage slot.");
|
|
||||||
|
|
||||||
static void InitializeRegisterSaveSlots() {
|
|
||||||
for (u32 i = 0; i < MaxSavedRegisters; i++) {
|
|
||||||
ASSERT_MSG(pthread_key_create(®ister_save_slots[i], nullptr) == 0,
|
|
||||||
"Unable to allocate thread-local register save slot {}", i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) {
|
|
||||||
ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to save {} registers.", regs.size());
|
|
||||||
|
|
||||||
std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots);
|
|
||||||
|
|
||||||
u32 index = 0;
|
|
||||||
for (const auto& reg : regs) {
|
|
||||||
const auto offset = reinterpret_cast<void*>(register_save_slots[index++] * sizeof(void*));
|
|
||||||
|
|
||||||
c.putSeg(Xbyak::util::gs);
|
|
||||||
c.mov(Xbyak::util::qword[offset], reg.cvt64());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void RestoreRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) {
|
|
||||||
ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to restore {} registers.", regs.size());
|
|
||||||
|
|
||||||
std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots);
|
|
||||||
|
|
||||||
u32 index = 0;
|
|
||||||
for (const auto& reg : regs) {
|
|
||||||
const auto offset = reinterpret_cast<void*>(register_save_slots[index++] * sizeof(void*));
|
|
||||||
|
|
||||||
c.putSeg(Xbyak::util::gs);
|
|
||||||
c.mov(reg.cvt64(), Xbyak::util::qword[offset]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
|
||||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
|
||||||
const auto src1 = ZydisToXbyakRegisterOperand(operands[1]);
|
|
||||||
const auto src2 = ZydisToXbyakOperand(operands[2]);
|
|
||||||
|
|
||||||
const auto scratch = AllocateScratchRegister({&dst, &src1, src2.get()}, dst.getBit());
|
|
||||||
|
|
||||||
SaveRegisters(c, {scratch});
|
|
||||||
|
|
||||||
c.mov(scratch, src1);
|
|
||||||
c.not_(scratch);
|
|
||||||
c.and_(scratch, *src2);
|
|
||||||
c.mov(dst, scratch);
|
|
||||||
|
|
||||||
RestoreRegisters(c, {scratch});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
|
||||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
|
||||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
|
||||||
const auto start_len = ZydisToXbyakRegisterOperand(operands[2]);
|
|
||||||
|
|
||||||
const Xbyak::Reg32e shift(Xbyak::Operand::RCX, static_cast<int>(start_len.getBit()));
|
|
||||||
const auto scratch1 = AllocateScratchRegister({&dst, src.get(), &start_len, &shift}, dst.getBit());
|
|
||||||
const auto scratch2 = AllocateScratchRegister({&dst, src.get(), &start_len, &shift, &scratch1}, dst.getBit());
|
|
||||||
|
|
||||||
if (dst.getIdx() == shift.getIdx()) {
|
|
||||||
SaveRegisters(c, {scratch1, scratch2});
|
|
||||||
} else {
|
|
||||||
SaveRegisters(c, {scratch1, scratch2, shift});
|
|
||||||
}
|
|
||||||
|
|
||||||
c.mov(scratch1, *src);
|
|
||||||
if (shift.getIdx() != start_len.getIdx()) {
|
|
||||||
c.mov(shift, start_len);
|
|
||||||
}
|
|
||||||
|
|
||||||
c.shr(scratch1, shift.cvt8());
|
|
||||||
c.shr(shift, 8);
|
|
||||||
c.mov(scratch2, 1);
|
|
||||||
c.shl(scratch2, shift.cvt8());
|
|
||||||
c.dec(scratch2);
|
|
||||||
|
|
||||||
c.mov(dst, scratch1);
|
|
||||||
c.and_(dst, scratch2);
|
|
||||||
|
|
||||||
if (dst.getIdx() == shift.getIdx()) {
|
|
||||||
RestoreRegisters(c, {scratch1, scratch2});
|
|
||||||
} else {
|
|
||||||
RestoreRegisters(c, {scratch1, scratch2, shift});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
|
||||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
|
||||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
|
||||||
|
|
||||||
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
|
||||||
|
|
||||||
SaveRegisters(c, {scratch});
|
|
||||||
|
|
||||||
c.mov(scratch, *src);
|
|
||||||
c.neg(scratch);
|
|
||||||
c.and_(scratch, *src);
|
|
||||||
c.mov(dst, scratch);
|
|
||||||
|
|
||||||
RestoreRegisters(c, {scratch});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
|
||||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
|
||||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
|
||||||
|
|
||||||
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
|
||||||
|
|
||||||
SaveRegisters(c, {scratch});
|
|
||||||
|
|
||||||
c.mov(scratch, *src);
|
|
||||||
c.dec(scratch);
|
|
||||||
c.xor_(scratch, *src);
|
|
||||||
c.mov(dst, scratch);
|
|
||||||
|
|
||||||
RestoreRegisters(c, {scratch});
|
|
||||||
}
|
|
||||||
|
|
||||||
static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
|
|
||||||
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
|
|
||||||
const auto src = ZydisToXbyakOperand(operands[1]);
|
|
||||||
|
|
||||||
const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit());
|
|
||||||
|
|
||||||
SaveRegisters(c, {scratch});
|
|
||||||
|
|
||||||
c.mov(scratch, *src);
|
|
||||||
c.dec(scratch);
|
|
||||||
c.and_(scratch, *src);
|
|
||||||
c.mov(dst, scratch);
|
|
||||||
|
|
||||||
RestoreRegisters(c, {scratch});
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
using InstructionGenerator = void(*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&);
|
|
||||||
static const std::unordered_map<ZydisMnemonic, InstructionGenerator> InstructionGenerators = {
|
|
||||||
#ifdef __APPLE__
|
|
||||||
// BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon.
|
|
||||||
{ZYDIS_MNEMONIC_ANDN, &GenerateANDN},
|
|
||||||
{ZYDIS_MNEMONIC_BEXTR, &GenerateBEXTR},
|
|
||||||
{ZYDIS_MNEMONIC_BLSI, &GenerateBLSI},
|
|
||||||
{ZYDIS_MNEMONIC_BLSMSK, &GenerateBLSMSK},
|
|
||||||
{ZYDIS_MNEMONIC_BLSR, &GenerateBLSR},
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
|
|
||||||
void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) {
|
|
||||||
if (InstructionGenerators.empty()) {
|
|
||||||
// Nothing to patch on this platform.
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ZydisDecoder instr_decoder;
|
|
||||||
ZydisDecodedInstruction instruction;
|
|
||||||
ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
|
|
||||||
ZydisDecoderInit(&instr_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);
|
|
||||||
|
|
||||||
u8* code = reinterpret_cast<u8*>(segment_addr);
|
|
||||||
u8* end = code + segment_size;
|
|
||||||
while (code < end) {
|
|
||||||
ZyanStatus status =
|
|
||||||
ZydisDecoderDecodeFull(&instr_decoder, code, end - code, &instruction, operands);
|
|
||||||
if (!ZYAN_SUCCESS(status)) {
|
|
||||||
code++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (InstructionGenerators.contains(instruction.mnemonic)) {
|
|
||||||
LOG_DEBUG(Core, "Replacing instruction '{}' at: {}", ZydisMnemonicGetString(instruction.mnemonic),
|
|
||||||
fmt::ptr(code));
|
|
||||||
|
|
||||||
// Replace instruction with near jump to the trampoline.
|
|
||||||
static constexpr u32 NearJmpSize = 5;
|
|
||||||
ASSERT_MSG(instruction.length >= NearJmpSize, "Instruction {} with length {} is too short to replace at: {}",
|
|
||||||
ZydisMnemonicGetString(instruction.mnemonic), instruction.length, fmt::ptr(code));
|
|
||||||
|
|
||||||
auto patch = Xbyak::CodeGenerator(instruction.length, code);
|
|
||||||
patch.jmp(c.getCurr(), Xbyak::CodeGenerator::LabelType::T_NEAR);
|
|
||||||
patch.nop(instruction.length - NearJmpSize);
|
|
||||||
|
|
||||||
auto generator = InstructionGenerators.at(instruction.mnemonic);
|
|
||||||
generator(operands, c);
|
|
||||||
c.jmp(code + instruction.length); // Return to the following instruction.
|
|
||||||
}
|
|
||||||
|
|
||||||
code += instruction.length;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace Core
|
|
|
@ -7,11 +7,10 @@
|
||||||
#include "common/logging/log.h"
|
#include "common/logging/log.h"
|
||||||
#include "common/string_util.h"
|
#include "common/string_util.h"
|
||||||
#include "core/aerolib/aerolib.h"
|
#include "core/aerolib/aerolib.h"
|
||||||
#include "core/instruction_emulator.h"
|
#include "core/cpu_patches.h"
|
||||||
#include "core/loader/dwarf.h"
|
#include "core/loader/dwarf.h"
|
||||||
#include "core/memory.h"
|
#include "core/memory.h"
|
||||||
#include "core/module.h"
|
#include "core/module.h"
|
||||||
#include "core/tls.h"
|
|
||||||
|
|
||||||
namespace Core {
|
namespace Core {
|
||||||
|
|
||||||
|
@ -132,7 +131,6 @@ void Module::LoadModuleToMemory(u32& max_tls_index) {
|
||||||
|
|
||||||
add_segment(elf_pheader[i]);
|
add_segment(elf_pheader[i]);
|
||||||
if (elf_pheader[i].p_flags & PF_EXEC) {
|
if (elf_pheader[i].p_flags & PF_EXEC) {
|
||||||
PatchTLS(segment_addr, segment_file_size, c);
|
|
||||||
PatchInstructions(segment_addr, segment_file_size, c);
|
PatchInstructions(segment_addr, segment_file_size, c);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
178
src/core/tls.cpp
178
src/core/tls.cpp
|
@ -1,141 +1,58 @@
|
||||||
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
||||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
#include <xbyak/xbyak.h>
|
#include <mutex>
|
||||||
#include "common/assert.h"
|
#include "common/assert.h"
|
||||||
#include "common/types.h"
|
#include "common/types.h"
|
||||||
#include "core/tls.h"
|
#include "core/tls.h"
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#elif !defined(__APPLE__)
|
#elif defined(__APPLE__)
|
||||||
#include <asm/prctl.h> /* Definition of ARCH_* constants */
|
#include <pthread.h>
|
||||||
#include <sys/syscall.h> /* Definition of SYS_* constants */
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace Core {
|
namespace Core {
|
||||||
|
|
||||||
struct TLSPattern {
|
|
||||||
u8 pattern[5];
|
|
||||||
u8 pattern_size;
|
|
||||||
u8 imm_size;
|
|
||||||
u8 target_reg;
|
|
||||||
};
|
|
||||||
|
|
||||||
constexpr static TLSPattern TlsPatterns[] = {
|
|
||||||
// 64 48 A1 | 00 00 00 00 00 00 00 00 # mov rax, qword ptr fs:[64b imm]
|
|
||||||
{{0x64, 0x48, 0xA1}, 3, 8, 0},
|
|
||||||
// 64 48 8B 04 25 | 00 00 00 00 # mov rax,qword ptr fs:[0]
|
|
||||||
{{0x64, 0x48, 0x8B, 0x4, 0x25}, 5, 4, 0}, // rax
|
|
||||||
{{0x64, 0x48, 0x8B, 0xC, 0x25}, 5, 4, 1}, // rcx
|
|
||||||
{{0x64, 0x48, 0x8B, 0x14, 0x25}, 5, 4, 2}, // rdx
|
|
||||||
{{0x64, 0x48, 0x8B, 0x1C, 0x25}, 5, 4, 3}, // rbx
|
|
||||||
{{0x64, 0x48, 0x8B, 0x24, 0x25}, 5, 4, 4}, // rsp
|
|
||||||
{{0x64, 0x48, 0x8B, 0x2C, 0x25}, 5, 4, 5}, // rbp
|
|
||||||
{{0x64, 0x48, 0x8B, 0x34, 0x25}, 5, 4, 6}, // rsi
|
|
||||||
{{0x64, 0x48, 0x8B, 0x3C, 0x25}, 5, 4, 7}, // rdi
|
|
||||||
{{0x64, 0x4C, 0x8B, 0x4, 0x25}, 5, 4, 8}, // r8
|
|
||||||
{{0x64, 0x4C, 0x8B, 0xC, 0x25}, 5, 4, 9}, // r9
|
|
||||||
{{0x64, 0x4C, 0x8B, 0x14, 0x25}, 5, 4, 10}, // r10
|
|
||||||
{{0x64, 0x4C, 0x8B, 0x1C, 0x25}, 5, 4, 11}, // r11
|
|
||||||
{{0x64, 0x4C, 0x8B, 0x24, 0x25}, 5, 4, 12}, // r12
|
|
||||||
{{0x64, 0x4C, 0x8B, 0x2C, 0x25}, 5, 4, 13}, // r13
|
|
||||||
{{0x64, 0x4C, 0x8B, 0x34, 0x25}, 5, 4, 14}, // r14
|
|
||||||
{{0x64, 0x4C, 0x8B, 0x3C, 0x25}, 5, 4, 15}, // r15
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
|
|
||||||
static DWORD slot = 0;
|
static DWORD slot = 0;
|
||||||
|
|
||||||
void SetTcbBase(void* image_address) {
|
|
||||||
const BOOL result = TlsSetValue(slot, image_address);
|
|
||||||
ASSERT(result != 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
Tcb* GetTcbBase() {
|
|
||||||
return reinterpret_cast<Tcb*>(TlsGetValue(slot));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void AllocTcbKey() {
|
static void AllocTcbKey() {
|
||||||
slot = TlsAlloc();
|
slot = TlsAlloc();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void PatchFsAccess(u8* code, const TLSPattern& tls_pattern, Xbyak::CodeGenerator& c) {
|
void SetTcbBase(void* image_address) {
|
||||||
using namespace Xbyak::util;
|
const BOOL result = TlsSetValue(GetTcbKey(), image_address);
|
||||||
const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size;
|
ASSERT(result != 0);
|
||||||
|
}
|
||||||
|
|
||||||
// Replace mov instruction with near jump to the trampoline.
|
Tcb* GetTcbBase() {
|
||||||
static constexpr u32 NearJmpSize = 5;
|
return reinterpret_cast<Tcb*>(TlsGetValue(GetTcbKey()));
|
||||||
auto patch = Xbyak::CodeGenerator(total_size, code);
|
|
||||||
patch.jmp(c.getCurr(), Xbyak::CodeGenerator::LabelType::T_NEAR);
|
|
||||||
patch.nop(total_size - NearJmpSize);
|
|
||||||
|
|
||||||
// Write the trampoline.
|
|
||||||
// The following logic is based on the wine implementation of TlsGetValue
|
|
||||||
// https://github.com/wine-mirror/wine/blob/a27b9551/dlls/kernelbase/thread.c#L719
|
|
||||||
static constexpr u32 TlsSlotsOffset = 0x1480;
|
|
||||||
static constexpr u32 TlsExpansionSlotsOffset = 0x1780;
|
|
||||||
static constexpr u32 TlsMinimumAvailable = 64;
|
|
||||||
const u32 teb_offset = slot < TlsMinimumAvailable ? TlsSlotsOffset : TlsExpansionSlotsOffset;
|
|
||||||
const u32 tls_index = slot < TlsMinimumAvailable ? slot : slot - TlsMinimumAvailable;
|
|
||||||
|
|
||||||
const auto target_reg = Xbyak::Reg64(tls_pattern.target_reg);
|
|
||||||
c.mov(target_reg, teb_offset);
|
|
||||||
c.putSeg(gs);
|
|
||||||
c.mov(target_reg, ptr[target_reg]); // Load the pointer to the table of tls slots.
|
|
||||||
c.mov(target_reg,
|
|
||||||
qword[target_reg + tls_index * sizeof(LPVOID)]); // Load the pointer to our buffer.
|
|
||||||
c.jmp(code + total_size); // Return to the instruction right after the mov.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(__APPLE__)
|
#elif defined(__APPLE__)
|
||||||
|
|
||||||
static pthread_key_t slot = 0;
|
static pthread_key_t slot = 0;
|
||||||
static std::once_flag slot_alloc_flag;
|
|
||||||
|
|
||||||
static void AllocTcbKey() {
|
static void AllocTcbKey() {
|
||||||
ASSERT(pthread_key_create(&slot, nullptr) == 0);
|
ASSERT(pthread_key_create(&slot, nullptr) == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SetTcbBase(void* image_address) {
|
void SetTcbBase(void* image_address) {
|
||||||
std::call_once(slot_alloc_flag, &AllocTcbKey);
|
ASSERT(pthread_setspecific(GetTcbKey(), image_address) == 0);
|
||||||
ASSERT(pthread_setspecific(slot, image_address) == 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Tcb* GetTcbBase() {
|
Tcb* GetTcbBase() {
|
||||||
std::call_once(slot_alloc_flag, &AllocTcbKey);
|
return reinterpret_cast<Tcb*>(pthread_getspecific(GetTcbKey()));
|
||||||
return reinterpret_cast<Tcb*>(pthread_getspecific(slot));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void PatchFsAccess(u8* code, const TLSPattern& tls_pattern, Xbyak::CodeGenerator& c) {
|
|
||||||
using namespace Xbyak::util;
|
|
||||||
const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size;
|
|
||||||
|
|
||||||
// Allocate slot in the process if not done already.
|
|
||||||
std::call_once(slot_alloc_flag, &AllocTcbKey);
|
|
||||||
|
|
||||||
static constexpr u32 NearJmpSize = 5;
|
|
||||||
|
|
||||||
// Replace fs read with gs read.
|
|
||||||
auto patch = Xbyak::CodeGenerator(total_size, code);
|
|
||||||
patch.jmp(c.getCurr(), Xbyak::CodeGenerator::LabelType::T_NEAR);
|
|
||||||
patch.nop(total_size - NearJmpSize);
|
|
||||||
|
|
||||||
// Write the trampoline.
|
|
||||||
const auto target_reg = Xbyak::Reg64(tls_pattern.target_reg);
|
|
||||||
|
|
||||||
// The following logic is based on the Darwin implementation of _os_tsd_get_direct, used by pthread_getspecific
|
|
||||||
// https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L89-L96
|
|
||||||
c.putSeg(gs);
|
|
||||||
c.mov(target_reg, qword[reinterpret_cast<void*>(slot * sizeof(void*))]); // Load the slot data.
|
|
||||||
|
|
||||||
// Return to the instruction right after the mov.
|
|
||||||
c.jmp(code + total_size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
static u32 slot = 0;
|
// Placeholder for code compatibility.
|
||||||
|
static constexpr u32 slot = 0;
|
||||||
|
|
||||||
|
static void AllocTcbKey() {}
|
||||||
|
|
||||||
void SetTcbBase(void* image_address) {
|
void SetTcbBase(void* image_address) {
|
||||||
asm volatile("wrgsbase %0" ::"r"(image_address) : "memory");
|
asm volatile("wrgsbase %0" ::"r"(image_address) : "memory");
|
||||||
|
@ -147,68 +64,13 @@ Tcb* GetTcbBase() {
|
||||||
return tcb;
|
return tcb;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void AllocTcbKey() {}
|
|
||||||
|
|
||||||
static void PatchFsAccess(u8* code, const TLSPattern& tls_pattern, Xbyak::CodeGenerator& c) {
|
|
||||||
using namespace Xbyak::util;
|
|
||||||
const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size;
|
|
||||||
|
|
||||||
// Replace fs read with gs read.
|
|
||||||
auto patch = Xbyak::CodeGenerator(total_size, code);
|
|
||||||
patch.putSeg(gs);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) {
|
static std::once_flag slot_alloc_flag;
|
||||||
u8* code = reinterpret_cast<u8*>(segment_addr);
|
|
||||||
auto remaining_size = segment_size;
|
|
||||||
|
|
||||||
// Sometimes loads from the FS segment are prefixed with useless operand size prefix bytes like:
|
u32 GetTcbKey() {
|
||||||
// |66 66 66| 64 48 8b 04 25 00 # mov rax, qword ptr fs:[0x0]
|
std::call_once(slot_alloc_flag, &AllocTcbKey);
|
||||||
// These are probably ignored by the processor but when patching the instruction to a jump
|
return slot;
|
||||||
// they cause issues. So look for them and patch them to nop to avoid problems.
|
|
||||||
static constexpr std::array<u8, 3> BadPrefix = {0x66, 0x66, 0x66};
|
|
||||||
|
|
||||||
while (remaining_size) {
|
|
||||||
for (const auto& tls_pattern : TlsPatterns) {
|
|
||||||
const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size;
|
|
||||||
if (remaining_size < total_size) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (std::memcmp(code, tls_pattern.pattern, tls_pattern.pattern_size) != 0) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
u64 offset = 0;
|
|
||||||
if (tls_pattern.imm_size == 4) {
|
|
||||||
std::memcpy(&offset, code + tls_pattern.pattern_size, sizeof(u32));
|
|
||||||
LOG_TRACE(Core_Linker, "PATTERN32 FOUND at {}, reg: {} offset: {:#x}",
|
|
||||||
fmt::ptr(code), tls_pattern.target_reg, offset);
|
|
||||||
} else {
|
|
||||||
std::memcpy(&offset, code + tls_pattern.pattern_size, sizeof(u64));
|
|
||||||
LOG_ERROR(Core_Linker, "PATTERN64 FOUND at {}, reg: {} offset: {:#x}",
|
|
||||||
fmt::ptr(code), tls_pattern.target_reg, offset);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
ASSERT(offset == 0);
|
|
||||||
|
|
||||||
// Replace bogus instruction prefix with nops if it exists.
|
|
||||||
if (std::memcmp(code - BadPrefix.size(), BadPrefix.data(), sizeof(BadPrefix)) == 0) {
|
|
||||||
auto patch = Xbyak::CodeGenerator(BadPrefix.size(), code - BadPrefix.size());
|
|
||||||
patch.nop(BadPrefix.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Patch access to FS register to a trampoline.
|
|
||||||
PatchFsAccess(code, tls_pattern, c);
|
|
||||||
|
|
||||||
// Move ahead in module.
|
|
||||||
code += total_size - 1;
|
|
||||||
remaining_size -= total_size - 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
code++;
|
|
||||||
remaining_size--;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace Core
|
} // namespace Core
|
||||||
|
|
|
@ -22,13 +22,13 @@ struct Tcb {
|
||||||
void* tcb_thread;
|
void* tcb_thread;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Gets the thread local storage key for the TCB block.
|
||||||
|
u32 GetTcbKey();
|
||||||
|
|
||||||
/// Sets the data pointer to the TCB block.
|
/// Sets the data pointer to the TCB block.
|
||||||
void SetTcbBase(void* image_address);
|
void SetTcbBase(void* image_address);
|
||||||
|
|
||||||
/// Retrieves Tcb structure for the calling thread.
|
/// Retrieves Tcb structure for the calling thread.
|
||||||
Tcb* GetTcbBase();
|
Tcb* GetTcbBase();
|
||||||
|
|
||||||
/// Patches any instructions that access guest TLS to use provided storage.
|
|
||||||
void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c);
|
|
||||||
|
|
||||||
} // namespace Core
|
} // namespace Core
|
||||||
|
|
Loading…
Reference in New Issue