Add patches for F16C instructions under Rosetta 2.

This commit is contained in:
squidbus 2024-07-19 01:09:07 -07:00
parent 905d49fd96
commit a17150960f
8 changed files with 292 additions and 37 deletions

3
.gitmodules vendored
View File

@ -82,3 +82,6 @@
path = externals/ffmpeg-core
url = https://github.com/shadps4-emu/ext-ffmpeg-core.git
shallow = true
[submodule "externals/half"]
path = externals/half
url = https://github.com/ROCm/half.git

View File

@ -638,6 +638,9 @@ if (APPLE)
# Replacement for std::chrono::time_zone
target_link_libraries(shadps4 PRIVATE date::date-tz)
# Half float conversions for F16C patches
target_link_libraries(shadps4 PRIVATE half)
endif()
if (NOT ENABLE_QT_GUI)

View File

@ -142,11 +142,17 @@ if (WIN32)
target_compile_options(sirit PUBLIC "-Wno-error=unused-command-line-argument")
endif()
# date
if (APPLE AND NOT TARGET date::date-tz)
option(BUILD_TZ_LIB "" ON)
option(USE_SYSTEM_TZ_DB "" ON)
add_subdirectory(date)
if (APPLE)
# half
add_library(half INTERFACE)
target_include_directories(half INTERFACE half/include)
# date
if (NOT TARGET date::date-tz)
option(BUILD_TZ_LIB "" ON)
option(USE_SYSTEM_TZ_DB "" ON)
add_subdirectory(date)
endif()
endif()
# Tracy

1
externals/half vendored Submodule

@ -0,0 +1 @@
Subproject commit 1ddada225144cac0de8f6b5c0dd9acffd99a2e68

View File

@ -15,6 +15,7 @@
#else
#include <pthread.h>
#ifdef __APPLE__
#include <half.hpp>
#include <sys/sysctl.h>
#endif
#endif
@ -30,6 +31,12 @@ static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) {
if (reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15) {
return Xbyak::Reg64(reg - ZYDIS_REGISTER_RAX + Xbyak::Operand::RAX);
}
if (reg >= ZYDIS_REGISTER_XMM0 && reg <= ZYDIS_REGISTER_XMM31) {
return Xbyak::Xmm(reg - ZYDIS_REGISTER_XMM0 + xmm0.getIdx());
}
if (reg >= ZYDIS_REGISTER_YMM0 && reg <= ZYDIS_REGISTER_YMM31) {
return Xbyak::Ymm(reg - ZYDIS_REGISTER_YMM0 + ymm0.getIdx());
}
UNREACHABLE_MSG("Unsupported register: {}", static_cast<u32>(reg));
}
@ -66,6 +73,12 @@ static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& opera
return ptr[expression];
}
static u64 ZydisToXbyakImmediateOperand(const ZydisDecodedOperand& operand) {
ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_IMMEDIATE,
"Expected immediate operand, got type: {}", static_cast<u32>(operand.type));
return operand.imm.value.u;
}
static std::unique_ptr<Xbyak::Operand> ZydisToXbyakOperand(const ZydisDecodedOperand& operand) {
switch (operand.type) {
case ZYDIS_OPERAND_TYPE_REGISTER: {
@ -110,51 +123,135 @@ static Xbyak::Reg AllocateScratchRegister(
#ifdef __APPLE__
static constexpr u32 MaxSavedRegisters = 3;
static pthread_key_t register_save_slots[MaxSavedRegisters];
static std::once_flag register_save_init_flag;
static pthread_key_t stack_pointer_slot;
static pthread_key_t patch_stack_slot;
static std::once_flag patch_context_slots_init_flag;
static_assert(sizeof(void*) == sizeof(u64),
"Cannot fit a register inside a thread local storage slot.");
static void InitializeRegisterSaveSlots() {
for (u32 i = 0; i < MaxSavedRegisters; i++) {
ASSERT_MSG(pthread_key_create(&register_save_slots[i], nullptr) == 0,
"Unable to allocate thread-local register save slot {}", i);
static void InitializePatchContextSlots() {
ASSERT_MSG(pthread_key_create(&stack_pointer_slot, nullptr) == 0,
"Unable to allocate thread-local register for stack pointer.");
ASSERT_MSG(pthread_key_create(&patch_stack_slot, nullptr) == 0,
"Unable to allocate thread-local register for patch stack.");
}
void InitializeThreadPatchStack() {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
const auto* patch_stack = std::malloc(0x1000);
pthread_setspecific(patch_stack_slot, patch_stack);
}
void CleanupThreadPatchStack() {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
auto* patch_stack = pthread_getspecific(patch_stack_slot);
if (patch_stack != nullptr) {
std::free(patch_stack);
pthread_setspecific(patch_stack_slot, nullptr);
}
}
/// Saves the stack pointer to thread local storage and loads the patch stack.
static void SaveStack(Xbyak::CodeGenerator& c) {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
// Save stack pointer and load patch stack.
c.putSeg(gs);
c.mov(qword[reinterpret_cast<void*>(stack_pointer_slot * sizeof(void*))], rsp);
c.putSeg(gs);
c.mov(rsp, qword[reinterpret_cast<void*>(patch_stack_slot * sizeof(void*))]);
}
/// Restores the stack pointer from thread local storage.
static void RestoreStack(Xbyak::CodeGenerator& c) {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
// Save patch stack pointer and load original stack.
c.putSeg(gs);
c.mov(qword[reinterpret_cast<void*>(patch_stack_slot * sizeof(void*))], rsp);
c.putSeg(gs);
c.mov(rsp, qword[reinterpret_cast<void*>(stack_pointer_slot * sizeof(void*))]);
}
#else
// These utilities are not implemented as we can't save anything to thread local storage without
// temporary registers.
void InitializeThreadPatchStack() {
// No-op
}
void CleanupThreadPatchStack() {
// No-op
}
/// Saves the stack pointer to thread local storage and loads the patch stack.
static void SaveStack(Xbyak::CodeGenerator& c) {
UNIMPLEMENTED();
}
/// Restores the stack pointer from thread local storage.
static void RestoreStack(Xbyak::CodeGenerator& c) {
UNIMPLEMENTED();
}
#endif
/// Switches to the patch stack, saves registers, and restores the original stack.
static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) {
ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to save {} registers.",
regs.size());
std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots);
u32 index = 0;
SaveStack(c);
for (const auto& reg : regs) {
const auto offset = reinterpret_cast<void*>(register_save_slots[index++] * sizeof(void*));
c.putSeg(gs);
c.mov(qword[offset], reg.cvt64());
c.push(reg.cvt64());
}
RestoreStack(c);
}
/// Switches to the patch stack, restores registers, and restores the original stack.
static void RestoreRegisters(Xbyak::CodeGenerator& c,
const std::initializer_list<Xbyak::Reg> regs) {
ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to restore {} registers.",
regs.size());
std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots);
u32 index = 0;
SaveStack(c);
for (const auto& reg : regs) {
const auto offset = reinterpret_cast<void*>(register_save_slots[index++] * sizeof(void*));
c.pop(reg.cvt64());
}
RestoreStack(c);
}
c.putSeg(gs);
c.mov(reg.cvt64(), qword[offset]);
/// Switches to the patch stack and stores all registers.
static void SaveContext(Xbyak::CodeGenerator& c) {
SaveStack(c);
for (int reg = Xbyak::Operand::RAX; reg <= Xbyak::Operand::R15; reg++) {
c.push(Xbyak::Reg64(reg));
}
for (int reg = 0; reg <= 7; reg++) {
c.sub(rsp, 32);
c.vmovdqu(ptr[rsp], Xbyak::Ymm(reg));
}
}
/// Restores all registers and restores the original stack.
/// If the destination is a register, it is not restored to preserve the output.
static void RestoreContext(Xbyak::CodeGenerator& c, const Xbyak::Operand& dst) {
for (int reg = 7; reg >= 0; reg--) {
if ((!dst.isXMM() && !dst.isYMM()) || dst.getIdx() != reg) {
c.vmovdqu(Xbyak::Ymm(reg), ptr[rsp]);
}
c.add(rsp, 32);
}
for (int reg = Xbyak::Operand::R15; reg >= Xbyak::Operand::RAX; reg--) {
if (!dst.isREG() || dst.getIdx() != reg) {
c.pop(Xbyak::Reg64(reg));
} else {
c.add(rsp, 4);
}
}
RestoreStack(c);
}
#ifdef __APPLE__
static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
const auto src1 = ZydisToXbyakRegisterOperand(operands[1]);
@ -204,9 +301,9 @@ static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenera
c.and_(dst, scratch2);
if (dst.getIdx() == shift.getIdx()) {
RestoreRegisters(c, {scratch1, scratch2});
RestoreRegisters(c, {scratch2, scratch1});
} else {
RestoreRegisters(c, {scratch1, scratch2, shift});
RestoreRegisters(c, {shift, scratch2, scratch1});
}
}
@ -258,10 +355,138 @@ static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerat
RestoreRegisters(c, {scratch});
}
bool FilterRosetta2Only(const ZydisDecodedOperand*) {
static __attribute__((sysv_abi)) void PerformVCVTPH2PS(float* out, const half_float::half* in,
const u32 count) {
for (u32 i = 0; i < count; i++) {
out[i] = half_float::half_cast<float>(in[i]);
}
}
static void GenerateVCVTPH2PS(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
const auto src = ZydisToXbyakOperand(operands[1]);
const auto float_count = dst.getBit() / 32;
const auto byte_count = float_count * 4;
SaveContext(c);
// Allocate stack space for outputs and load into first parameter.
c.sub(rsp, byte_count);
c.mov(rdi, rsp);
if (src->isXMM()) {
// Allocate stack space for inputs and load into second parameter.
c.sub(rsp, byte_count);
c.mov(rsi, rsp);
// Move input to the allocated space.
c.movdqu(ptr[rsp], *reinterpret_cast<Xbyak::Xmm*>(src.get()));
} else {
c.lea(rsi, src->getAddress());
}
// Load float count into third parameter.
c.mov(rdx, float_count);
c.mov(rax, reinterpret_cast<u64>(PerformVCVTPH2PS));
c.call(rax);
if (src->isXMM()) {
// Clean up after inputs space.
c.add(rsp, byte_count);
}
// Load outputs into destination register and clean up space.
if (dst.isYMM()) {
c.vmovdqu(*reinterpret_cast<const Xbyak::Ymm*>(&dst), ptr[rsp]);
} else {
c.movdqu(*reinterpret_cast<const Xbyak::Xmm*>(&dst), ptr[rsp]);
}
c.add(rsp, byte_count);
RestoreContext(c, dst);
}
using SingleToHalfFloatConverter = half_float::half (*)(float);
static const SingleToHalfFloatConverter SingleToHalfFloatConverters[4] = {
half_float::half_cast<half_float::half, std::round_to_nearest, float>,
half_float::half_cast<half_float::half, std::round_toward_neg_infinity, float>,
half_float::half_cast<half_float::half, std::round_toward_infinity, float>,
half_float::half_cast<half_float::half, std::round_toward_zero, float>,
};
static __attribute__((sysv_abi)) void PerformVCVTPS2PH(half_float::half* out, const float* in,
const u32 count, const u8 rounding_mode) {
const auto conversion_func = SingleToHalfFloatConverters[rounding_mode];
for (u32 i = 0; i < count; i++) {
out[i] = conversion_func(in[i]);
}
}
static void GenerateVCVTPS2PH(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
const auto dst = ZydisToXbyakOperand(operands[0]);
const auto src = ZydisToXbyakRegisterOperand(operands[1]);
const auto ctrl = ZydisToXbyakImmediateOperand(operands[2]);
const auto float_count = src.getBit() / 32;
const auto byte_count = float_count * 4;
SaveContext(c);
if (dst->isXMM()) {
// Allocate stack space for outputs and load into first parameter.
c.sub(rsp, byte_count);
c.mov(rdi, rsp);
} else {
c.lea(rdi, dst->getAddress());
}
// Allocate stack space for inputs and load into second parameter.
c.sub(rsp, byte_count);
c.mov(rsi, rsp);
// Move input to the allocated space.
if (src.isYMM()) {
c.vmovdqu(ptr[rsp], *reinterpret_cast<const Xbyak::Ymm*>(&src));
} else {
c.movdqu(ptr[rsp], *reinterpret_cast<const Xbyak::Xmm*>(&src));
}
// Load float count into third parameter.
c.mov(rdx, float_count);
// Load rounding mode into fourth parameter.
if (ctrl & 4) {
// Load from MXCSR.RC.
c.stmxcsr(ptr[rsp - 4]);
c.mov(rcx, ptr[rsp - 4]);
c.shr(rcx, 13);
c.and_(rcx, 3);
} else {
c.mov(rcx, ctrl & 3);
}
c.mov(rax, reinterpret_cast<u64>(PerformVCVTPS2PH));
c.call(rax);
// Clean up after inputs space.
c.add(rsp, byte_count);
if (dst->isXMM()) {
// Load outputs into destination register and clean up space.
c.movdqu(*reinterpret_cast<Xbyak::Xmm*>(dst.get()), ptr[rsp]);
c.add(rsp, byte_count);
}
RestoreContext(c, *dst);
}
static bool FilterRosetta2Only(const ZydisDecodedOperand*) {
int ret = 0;
size_t size = sizeof(ret);
if (sysctlbyname("sysctl.proc_translated", &ret, &size, NULL, 0) != 0) {
if (sysctlbyname("sysctl.proc_translated", &ret, &size, nullptr, 0) != 0) {
return false;
}
return ret;
@ -339,12 +564,16 @@ static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = {
#endif
#ifdef __APPLE__
// BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon.
// Patches for instruction sets not supported by Rosetta 2.
// BMI1
{ZYDIS_MNEMONIC_ANDN, {FilterRosetta2Only, GenerateANDN, true}},
{ZYDIS_MNEMONIC_BEXTR, {FilterRosetta2Only, GenerateBEXTR, true}},
{ZYDIS_MNEMONIC_BLSI, {FilterRosetta2Only, GenerateBLSI, true}},
{ZYDIS_MNEMONIC_BLSMSK, {FilterRosetta2Only, GenerateBLSMSK, true}},
{ZYDIS_MNEMONIC_BLSR, {FilterRosetta2Only, GenerateBLSR, true}},
// F16C
{ZYDIS_MNEMONIC_VCVTPH2PS, {FilterRosetta2Only, GenerateVCVTPH2PS, true}},
{ZYDIS_MNEMONIC_VCVTPS2PH, {FilterRosetta2Only, GenerateVCVTPS2PH, true}},
#endif
};

View File

@ -9,6 +9,12 @@ class CodeGenerator;
namespace Core {
/// Initializes a stack for the current thread for use by patch implementations.
void InitializeThreadPatchStack();
/// Cleans up the patch stack for the current thread.
void CleanupThreadPatchStack();
/// Patches CPU instructions that cannot run as-is on the host.
void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c);

View File

@ -10,6 +10,7 @@
#include "common/logging/log.h"
#include "common/singleton.h"
#include "common/thread.h"
#include "core/cpu_patches.h"
#include "core/libraries/error_codes.h"
#include "core/libraries/kernel/libkernel.h"
#include "core/libraries/kernel/thread_management.h"
@ -985,6 +986,7 @@ static void cleanup_thread(void* arg) {
destructor(value);
}
}
Core::CleanupThreadPatchStack();
thread->is_almost_done = true;
}
@ -993,6 +995,7 @@ static void* run_thread(void* arg) {
Common::SetCurrentThreadName(thread->name.c_str());
auto* linker = Common::Singleton<Core::Linker>::Instance();
linker->InitTlsForThread(false);
Core::InitializeThreadPatchStack();
void* ret = nullptr;
g_pthread_self = thread;
pthread_cleanup_push(cleanup_thread, thread);

View File

@ -10,6 +10,7 @@
#include "common/thread.h"
#include "core/aerolib/aerolib.h"
#include "core/aerolib/stubs.h"
#include "core/cpu_patches.h"
#include "core/libraries/kernel/memory_management.h"
#include "core/libraries/kernel/thread_management.h"
#include "core/linker.h"
@ -86,6 +87,7 @@ void Linker::Execute() {
Common::SetCurrentThreadName("GAME_MainThread");
Libraries::Kernel::pthreadInitSelfMainThread();
InitTlsForThread(true);
InitializeThreadPatchStack();
// Start shared library modules
for (auto& m : m_modules) {
@ -104,6 +106,8 @@ void Linker::Execute() {
RunMainEntry(m->GetEntryAddress(), &p, ProgramExitFunc);
}
}
CleanupThreadPatchStack();
}
s32 Linker::LoadModule(const std::filesystem::path& elf_name, bool is_dynamic) {