Merge pull request #638 from squidbus/f16c

Add patches for F16C instructions under Rosetta 2.
This commit is contained in:
georgemoralis 2024-08-30 11:16:21 +03:00 committed by GitHub
commit cdd193d5b1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 292 additions and 37 deletions

5
.gitmodules vendored
View File

@ -81,4 +81,7 @@
[submodule "externals/ffmpeg-core"] [submodule "externals/ffmpeg-core"]
path = externals/ffmpeg-core path = externals/ffmpeg-core
url = https://github.com/shadps4-emu/ext-ffmpeg-core.git url = https://github.com/shadps4-emu/ext-ffmpeg-core.git
shallow = true shallow = true
[submodule "externals/half"]
path = externals/half
url = https://github.com/ROCm/half.git

View File

@ -644,6 +644,9 @@ if (APPLE)
# Replacement for std::chrono::time_zone # Replacement for std::chrono::time_zone
target_link_libraries(shadps4 PRIVATE date::date-tz) target_link_libraries(shadps4 PRIVATE date::date-tz)
# Half float conversions for F16C patches
target_link_libraries(shadps4 PRIVATE half)
endif() endif()
if (NOT ENABLE_QT_GUI) if (NOT ENABLE_QT_GUI)

View File

@ -142,11 +142,17 @@ if (WIN32)
target_compile_options(sirit PUBLIC "-Wno-error=unused-command-line-argument") target_compile_options(sirit PUBLIC "-Wno-error=unused-command-line-argument")
endif() endif()
# date if (APPLE)
if (APPLE AND NOT TARGET date::date-tz) # half
option(BUILD_TZ_LIB "" ON) add_library(half INTERFACE)
option(USE_SYSTEM_TZ_DB "" ON) target_include_directories(half INTERFACE half/include)
add_subdirectory(date)
# date
if (NOT TARGET date::date-tz)
option(BUILD_TZ_LIB "" ON)
option(USE_SYSTEM_TZ_DB "" ON)
add_subdirectory(date)
endif()
endif() endif()
# Tracy # Tracy

1
externals/half vendored Submodule

@ -0,0 +1 @@
Subproject commit 1ddada225144cac0de8f6b5c0dd9acffd99a2e68

View File

@ -15,6 +15,7 @@
#else #else
#include <pthread.h> #include <pthread.h>
#ifdef __APPLE__ #ifdef __APPLE__
#include <half.hpp>
#include <sys/sysctl.h> #include <sys/sysctl.h>
#endif #endif
#endif #endif
@ -30,6 +31,12 @@ static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) {
if (reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15) { if (reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15) {
return Xbyak::Reg64(reg - ZYDIS_REGISTER_RAX + Xbyak::Operand::RAX); return Xbyak::Reg64(reg - ZYDIS_REGISTER_RAX + Xbyak::Operand::RAX);
} }
if (reg >= ZYDIS_REGISTER_XMM0 && reg <= ZYDIS_REGISTER_XMM31) {
return Xbyak::Xmm(reg - ZYDIS_REGISTER_XMM0 + xmm0.getIdx());
}
if (reg >= ZYDIS_REGISTER_YMM0 && reg <= ZYDIS_REGISTER_YMM31) {
return Xbyak::Ymm(reg - ZYDIS_REGISTER_YMM0 + ymm0.getIdx());
}
UNREACHABLE_MSG("Unsupported register: {}", static_cast<u32>(reg)); UNREACHABLE_MSG("Unsupported register: {}", static_cast<u32>(reg));
} }
@ -66,6 +73,12 @@ static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& opera
return ptr[expression]; return ptr[expression];
} }
static u64 ZydisToXbyakImmediateOperand(const ZydisDecodedOperand& operand) {
ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_IMMEDIATE,
"Expected immediate operand, got type: {}", static_cast<u32>(operand.type));
return operand.imm.value.u;
}
static std::unique_ptr<Xbyak::Operand> ZydisToXbyakOperand(const ZydisDecodedOperand& operand) { static std::unique_ptr<Xbyak::Operand> ZydisToXbyakOperand(const ZydisDecodedOperand& operand) {
switch (operand.type) { switch (operand.type) {
case ZYDIS_OPERAND_TYPE_REGISTER: { case ZYDIS_OPERAND_TYPE_REGISTER: {
@ -110,51 +123,135 @@ static Xbyak::Reg AllocateScratchRegister(
#ifdef __APPLE__ #ifdef __APPLE__
static constexpr u32 MaxSavedRegisters = 3; static pthread_key_t stack_pointer_slot;
static pthread_key_t register_save_slots[MaxSavedRegisters]; static pthread_key_t patch_stack_slot;
static std::once_flag register_save_init_flag; static std::once_flag patch_context_slots_init_flag;
static_assert(sizeof(void*) == sizeof(u64), static_assert(sizeof(void*) == sizeof(u64),
"Cannot fit a register inside a thread local storage slot."); "Cannot fit a register inside a thread local storage slot.");
static void InitializeRegisterSaveSlots() { static void InitializePatchContextSlots() {
for (u32 i = 0; i < MaxSavedRegisters; i++) { ASSERT_MSG(pthread_key_create(&stack_pointer_slot, nullptr) == 0,
ASSERT_MSG(pthread_key_create(&register_save_slots[i], nullptr) == 0, "Unable to allocate thread-local register for stack pointer.");
"Unable to allocate thread-local register save slot {}", i); ASSERT_MSG(pthread_key_create(&patch_stack_slot, nullptr) == 0,
"Unable to allocate thread-local register for patch stack.");
}
void InitializeThreadPatchStack() {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
const auto* patch_stack = std::malloc(0x1000);
pthread_setspecific(patch_stack_slot, patch_stack);
}
void CleanupThreadPatchStack() {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
auto* patch_stack = pthread_getspecific(patch_stack_slot);
if (patch_stack != nullptr) {
std::free(patch_stack);
pthread_setspecific(patch_stack_slot, nullptr);
} }
} }
/// Saves the stack pointer to thread local storage and loads the patch stack.
static void SaveStack(Xbyak::CodeGenerator& c) {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
// Save stack pointer and load patch stack.
c.putSeg(gs);
c.mov(qword[reinterpret_cast<void*>(stack_pointer_slot * sizeof(void*))], rsp);
c.putSeg(gs);
c.mov(rsp, qword[reinterpret_cast<void*>(patch_stack_slot * sizeof(void*))]);
}
/// Restores the stack pointer from thread local storage.
static void RestoreStack(Xbyak::CodeGenerator& c) {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
// Save patch stack pointer and load original stack.
c.putSeg(gs);
c.mov(qword[reinterpret_cast<void*>(patch_stack_slot * sizeof(void*))], rsp);
c.putSeg(gs);
c.mov(rsp, qword[reinterpret_cast<void*>(stack_pointer_slot * sizeof(void*))]);
}
#else
// These utilities are not implemented as we can't save anything to thread local storage without
// temporary registers.
void InitializeThreadPatchStack() {
// No-op
}
void CleanupThreadPatchStack() {
// No-op
}
/// Saves the stack pointer to thread local storage and loads the patch stack.
static void SaveStack(Xbyak::CodeGenerator& c) {
UNIMPLEMENTED();
}
/// Restores the stack pointer from thread local storage.
static void RestoreStack(Xbyak::CodeGenerator& c) {
UNIMPLEMENTED();
}
#endif
/// Switches to the patch stack, saves registers, and restores the original stack.
static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) { static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) {
ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to save {} registers.", SaveStack(c);
regs.size());
std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots);
u32 index = 0;
for (const auto& reg : regs) { for (const auto& reg : regs) {
const auto offset = reinterpret_cast<void*>(register_save_slots[index++] * sizeof(void*)); c.push(reg.cvt64());
c.putSeg(gs);
c.mov(qword[offset], reg.cvt64());
} }
RestoreStack(c);
} }
/// Switches to the patch stack, restores registers, and restores the original stack.
static void RestoreRegisters(Xbyak::CodeGenerator& c, static void RestoreRegisters(Xbyak::CodeGenerator& c,
const std::initializer_list<Xbyak::Reg> regs) { const std::initializer_list<Xbyak::Reg> regs) {
ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to restore {} registers.", SaveStack(c);
regs.size());
std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots);
u32 index = 0;
for (const auto& reg : regs) { for (const auto& reg : regs) {
const auto offset = reinterpret_cast<void*>(register_save_slots[index++] * sizeof(void*)); c.pop(reg.cvt64());
}
RestoreStack(c);
}
c.putSeg(gs); /// Switches to the patch stack and stores all registers.
c.mov(reg.cvt64(), qword[offset]); static void SaveContext(Xbyak::CodeGenerator& c) {
SaveStack(c);
for (int reg = Xbyak::Operand::RAX; reg <= Xbyak::Operand::R15; reg++) {
c.push(Xbyak::Reg64(reg));
}
for (int reg = 0; reg <= 7; reg++) {
c.sub(rsp, 32);
c.vmovdqu(ptr[rsp], Xbyak::Ymm(reg));
} }
} }
/// Restores all registers and restores the original stack.
/// If the destination is a register, it is not restored to preserve the output.
static void RestoreContext(Xbyak::CodeGenerator& c, const Xbyak::Operand& dst) {
for (int reg = 7; reg >= 0; reg--) {
if ((!dst.isXMM() && !dst.isYMM()) || dst.getIdx() != reg) {
c.vmovdqu(Xbyak::Ymm(reg), ptr[rsp]);
}
c.add(rsp, 32);
}
for (int reg = Xbyak::Operand::R15; reg >= Xbyak::Operand::RAX; reg--) {
if (!dst.isREG() || dst.getIdx() != reg) {
c.pop(Xbyak::Reg64(reg));
} else {
c.add(rsp, 4);
}
}
RestoreStack(c);
}
#ifdef __APPLE__
static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
const auto src1 = ZydisToXbyakRegisterOperand(operands[1]); const auto src1 = ZydisToXbyakRegisterOperand(operands[1]);
@ -204,9 +301,9 @@ static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenera
c.and_(dst, scratch2); c.and_(dst, scratch2);
if (dst.getIdx() == shift.getIdx()) { if (dst.getIdx() == shift.getIdx()) {
RestoreRegisters(c, {scratch1, scratch2}); RestoreRegisters(c, {scratch2, scratch1});
} else { } else {
RestoreRegisters(c, {scratch1, scratch2, shift}); RestoreRegisters(c, {shift, scratch2, scratch1});
} }
} }
@ -258,10 +355,138 @@ static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerat
RestoreRegisters(c, {scratch}); RestoreRegisters(c, {scratch});
} }
bool FilterRosetta2Only(const ZydisDecodedOperand*) { static __attribute__((sysv_abi)) void PerformVCVTPH2PS(float* out, const half_float::half* in,
const u32 count) {
for (u32 i = 0; i < count; i++) {
out[i] = half_float::half_cast<float>(in[i]);
}
}
static void GenerateVCVTPH2PS(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
const auto src = ZydisToXbyakOperand(operands[1]);
const auto float_count = dst.getBit() / 32;
const auto byte_count = float_count * 4;
SaveContext(c);
// Allocate stack space for outputs and load into first parameter.
c.sub(rsp, byte_count);
c.mov(rdi, rsp);
if (src->isXMM()) {
// Allocate stack space for inputs and load into second parameter.
c.sub(rsp, byte_count);
c.mov(rsi, rsp);
// Move input to the allocated space.
c.movdqu(ptr[rsp], *reinterpret_cast<Xbyak::Xmm*>(src.get()));
} else {
c.lea(rsi, src->getAddress());
}
// Load float count into third parameter.
c.mov(rdx, float_count);
c.mov(rax, reinterpret_cast<u64>(PerformVCVTPH2PS));
c.call(rax);
if (src->isXMM()) {
// Clean up after inputs space.
c.add(rsp, byte_count);
}
// Load outputs into destination register and clean up space.
if (dst.isYMM()) {
c.vmovdqu(*reinterpret_cast<const Xbyak::Ymm*>(&dst), ptr[rsp]);
} else {
c.movdqu(*reinterpret_cast<const Xbyak::Xmm*>(&dst), ptr[rsp]);
}
c.add(rsp, byte_count);
RestoreContext(c, dst);
}
using SingleToHalfFloatConverter = half_float::half (*)(float);
static const SingleToHalfFloatConverter SingleToHalfFloatConverters[4] = {
half_float::half_cast<half_float::half, std::round_to_nearest, float>,
half_float::half_cast<half_float::half, std::round_toward_neg_infinity, float>,
half_float::half_cast<half_float::half, std::round_toward_infinity, float>,
half_float::half_cast<half_float::half, std::round_toward_zero, float>,
};
static __attribute__((sysv_abi)) void PerformVCVTPS2PH(half_float::half* out, const float* in,
const u32 count, const u8 rounding_mode) {
const auto conversion_func = SingleToHalfFloatConverters[rounding_mode];
for (u32 i = 0; i < count; i++) {
out[i] = conversion_func(in[i]);
}
}
static void GenerateVCVTPS2PH(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
const auto dst = ZydisToXbyakOperand(operands[0]);
const auto src = ZydisToXbyakRegisterOperand(operands[1]);
const auto ctrl = ZydisToXbyakImmediateOperand(operands[2]);
const auto float_count = src.getBit() / 32;
const auto byte_count = float_count * 4;
SaveContext(c);
if (dst->isXMM()) {
// Allocate stack space for outputs and load into first parameter.
c.sub(rsp, byte_count);
c.mov(rdi, rsp);
} else {
c.lea(rdi, dst->getAddress());
}
// Allocate stack space for inputs and load into second parameter.
c.sub(rsp, byte_count);
c.mov(rsi, rsp);
// Move input to the allocated space.
if (src.isYMM()) {
c.vmovdqu(ptr[rsp], *reinterpret_cast<const Xbyak::Ymm*>(&src));
} else {
c.movdqu(ptr[rsp], *reinterpret_cast<const Xbyak::Xmm*>(&src));
}
// Load float count into third parameter.
c.mov(rdx, float_count);
// Load rounding mode into fourth parameter.
if (ctrl & 4) {
// Load from MXCSR.RC.
c.stmxcsr(ptr[rsp - 4]);
c.mov(rcx, ptr[rsp - 4]);
c.shr(rcx, 13);
c.and_(rcx, 3);
} else {
c.mov(rcx, ctrl & 3);
}
c.mov(rax, reinterpret_cast<u64>(PerformVCVTPS2PH));
c.call(rax);
// Clean up after inputs space.
c.add(rsp, byte_count);
if (dst->isXMM()) {
// Load outputs into destination register and clean up space.
c.movdqu(*reinterpret_cast<Xbyak::Xmm*>(dst.get()), ptr[rsp]);
c.add(rsp, byte_count);
}
RestoreContext(c, *dst);
}
static bool FilterRosetta2Only(const ZydisDecodedOperand*) {
int ret = 0; int ret = 0;
size_t size = sizeof(ret); size_t size = sizeof(ret);
if (sysctlbyname("sysctl.proc_translated", &ret, &size, NULL, 0) != 0) { if (sysctlbyname("sysctl.proc_translated", &ret, &size, nullptr, 0) != 0) {
return false; return false;
} }
return ret; return ret;
@ -339,12 +564,16 @@ static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = {
#endif #endif
#ifdef __APPLE__ #ifdef __APPLE__
// BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon. // Patches for instruction sets not supported by Rosetta 2.
// BMI1
{ZYDIS_MNEMONIC_ANDN, {FilterRosetta2Only, GenerateANDN, true}}, {ZYDIS_MNEMONIC_ANDN, {FilterRosetta2Only, GenerateANDN, true}},
{ZYDIS_MNEMONIC_BEXTR, {FilterRosetta2Only, GenerateBEXTR, true}}, {ZYDIS_MNEMONIC_BEXTR, {FilterRosetta2Only, GenerateBEXTR, true}},
{ZYDIS_MNEMONIC_BLSI, {FilterRosetta2Only, GenerateBLSI, true}}, {ZYDIS_MNEMONIC_BLSI, {FilterRosetta2Only, GenerateBLSI, true}},
{ZYDIS_MNEMONIC_BLSMSK, {FilterRosetta2Only, GenerateBLSMSK, true}}, {ZYDIS_MNEMONIC_BLSMSK, {FilterRosetta2Only, GenerateBLSMSK, true}},
{ZYDIS_MNEMONIC_BLSR, {FilterRosetta2Only, GenerateBLSR, true}}, {ZYDIS_MNEMONIC_BLSR, {FilterRosetta2Only, GenerateBLSR, true}},
// F16C
{ZYDIS_MNEMONIC_VCVTPH2PS, {FilterRosetta2Only, GenerateVCVTPH2PS, true}},
{ZYDIS_MNEMONIC_VCVTPS2PH, {FilterRosetta2Only, GenerateVCVTPS2PH, true}},
#endif #endif
}; };

View File

@ -9,6 +9,12 @@ class CodeGenerator;
namespace Core { namespace Core {
/// Initializes a stack for the current thread for use by patch implementations.
void InitializeThreadPatchStack();
/// Cleans up the patch stack for the current thread.
void CleanupThreadPatchStack();
/// Patches CPU instructions that cannot run as-is on the host. /// Patches CPU instructions that cannot run as-is on the host.
void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c); void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c);

View File

@ -10,6 +10,7 @@
#include "common/logging/log.h" #include "common/logging/log.h"
#include "common/singleton.h" #include "common/singleton.h"
#include "common/thread.h" #include "common/thread.h"
#include "core/cpu_patches.h"
#include "core/libraries/error_codes.h" #include "core/libraries/error_codes.h"
#include "core/libraries/kernel/libkernel.h" #include "core/libraries/kernel/libkernel.h"
#include "core/libraries/kernel/thread_management.h" #include "core/libraries/kernel/thread_management.h"
@ -985,6 +986,7 @@ static void cleanup_thread(void* arg) {
destructor(value); destructor(value);
} }
} }
Core::CleanupThreadPatchStack();
thread->is_almost_done = true; thread->is_almost_done = true;
} }
@ -993,6 +995,7 @@ static void* run_thread(void* arg) {
Common::SetCurrentThreadName(thread->name.c_str()); Common::SetCurrentThreadName(thread->name.c_str());
auto* linker = Common::Singleton<Core::Linker>::Instance(); auto* linker = Common::Singleton<Core::Linker>::Instance();
linker->InitTlsForThread(false); linker->InitTlsForThread(false);
Core::InitializeThreadPatchStack();
void* ret = nullptr; void* ret = nullptr;
g_pthread_self = thread; g_pthread_self = thread;
pthread_cleanup_push(cleanup_thread, thread); pthread_cleanup_push(cleanup_thread, thread);

View File

@ -10,6 +10,7 @@
#include "common/thread.h" #include "common/thread.h"
#include "core/aerolib/aerolib.h" #include "core/aerolib/aerolib.h"
#include "core/aerolib/stubs.h" #include "core/aerolib/stubs.h"
#include "core/cpu_patches.h"
#include "core/libraries/kernel/memory_management.h" #include "core/libraries/kernel/memory_management.h"
#include "core/libraries/kernel/thread_management.h" #include "core/libraries/kernel/thread_management.h"
#include "core/linker.h" #include "core/linker.h"
@ -86,6 +87,7 @@ void Linker::Execute() {
Common::SetCurrentThreadName("GAME_MainThread"); Common::SetCurrentThreadName("GAME_MainThread");
Libraries::Kernel::pthreadInitSelfMainThread(); Libraries::Kernel::pthreadInitSelfMainThread();
InitTlsForThread(true); InitTlsForThread(true);
InitializeThreadPatchStack();
// Start shared library modules // Start shared library modules
for (auto& m : m_modules) { for (auto& m : m_modules) {
@ -104,6 +106,8 @@ void Linker::Execute() {
RunMainEntry(m->GetEntryAddress(), &p, ProgramExitFunc); RunMainEntry(m->GetEntryAddress(), &p, ProgramExitFunc);
} }
} }
CleanupThreadPatchStack();
} }
s32 Linker::LoadModule(const std::filesystem::path& elf_name, bool is_dynamic) { s32 Linker::LoadModule(const std::filesystem::path& elf_name, bool is_dynamic) {