From 8e19ccd5077569e87c9e86be1dda0f1ffee838c5 Mon Sep 17 00:00:00 2001 From: offtkp Date: Wed, 28 Aug 2024 19:23:31 +0300 Subject: [PATCH 1/5] Patch `insertq` --- src/core/cpu_patches.cpp | 149 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 3 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 42318822..d16a3a7c 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -40,6 +40,15 @@ static Xbyak::Reg ZydisToXbyakRegisterOperand(const ZydisDecodedOperand& operand return ZydisToXbyakRegister(operand.reg.value); } +static Xbyak::Xmm ZydisToXbyakVectorOperand(const ZydisDecodedOperand& operand) { + const ZydisRegister reg = operand.reg.value; + + if (reg >= ZYDIS_REGISTER_XMM0 && reg <= ZYDIS_REGISTER_XMM15) { + return Xbyak::Xmm(reg - ZYDIS_REGISTER_XMM0); + } + UNREACHABLE_MSG("Unsupported vector register: {}", static_cast(reg)); +} + static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& operand) { ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_MEMORY, "Expected memory operand, got type: {}", static_cast(operand.type)); @@ -108,9 +117,7 @@ static Xbyak::Reg AllocateScratchRegister( UNREACHABLE_MSG("Out of scratch registers!"); } -#ifdef __APPLE__ - -static constexpr u32 MaxSavedRegisters = 3; +static constexpr u32 MaxSavedRegisters = 4; static pthread_key_t register_save_slots[MaxSavedRegisters]; static std::once_flag register_save_init_flag; @@ -155,6 +162,8 @@ static void RestoreRegisters(Xbyak::CodeGenerator& c, } } +#ifdef __APPLE__ + static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src1 = ZydisToXbyakRegisterOperand(operands[1]); @@ -280,6 +289,11 @@ static bool FilterTcbAccess(const ZydisDecodedOperand* operands) { dst_op.reg.value <= ZYDIS_REGISTER_R15; } +// For instructions that always need to be patched +static bool FilterAlwaysTrue(const ZydisDecodedOperand*) { + return true; +} + static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto slot = GetTcbKey(); @@ -317,6 +331,133 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe #endif } +static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + // INSERTQ Instruction Reference + // Inserts bits from the lower 64 bits of the source operand into the lower 64 bits of the destination operand + // No other bits in the lower 64 bits of the destination are modified. The upper 64 bits of the destination are undefined. + + // There's two forms of the instruction: + // INSERTQ xmm1, xmm2, imm8, imm8 + // INSERTQ xmm1, xmm2 + + // For the immediate form: + // Insert field starting at bit 0 of xmm2 with the length + // specified by [5:0] of the first immediate byte. This + // field is inserted into xmm1 starting at the bit position + // specified by [5:0] of the second immediate byte. + + // For the register form: + // Insert field starting at bit 0 of xmm2 with the length + // specified by xmm2[69:64]. This field is inserted into + // xmm1 starting at the bit position specified by + // xmm2[77:72]. + + // A value of zero in the field length is defined as a length of 64. If the length field is 0 and the bit index + // is 0, bits 63:0 of the source operand are inserted. For any other value of the bit index, the results are + // undefined. + + bool immediateForm = operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && + operands[3].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; + + if (operands[0].type != ZYDIS_OPERAND_TYPE_REGISTER || operands[1].type != ZYDIS_OPERAND_TYPE_REGISTER) { + ASSERT_MSG("operands 0 and 1 must be registers."); + } + + const Xbyak::Xmm dst = ZydisToXbyakVectorOperand(operands[0]); + const Xbyak::Xmm src = ZydisToXbyakVectorOperand(operands[1]); + + if (immediateForm) { + u8 length = operands[2].imm.value.u & 0x3F; + u8 index = operands[3].imm.value.u & 0x3F; + if (length == 0) { + length = 64; + } + + if (length + index > 64) { + ASSERT_MSG("length + index must be less than or equal to 64."); + } + + const Xbyak::Reg64 scratch1 = AllocateScratchRegister({}, 64).cvt64(); + const Xbyak::Reg64 scratch2 = AllocateScratchRegister({&scratch1}, 64).cvt64(); + const Xbyak::Reg64 mask = AllocateScratchRegister({&scratch1, &scratch2}, 64).cvt64(); + + u64 maskValue = (1ULL << length) - 1; + + SaveRegisters(c, {scratch1, scratch2, mask}); + + c.movq(scratch1, src); + c.movq(scratch2, dst); + c.mov(mask, maskValue); + + // src &= mask + c.and_(scratch1, mask); + + // src <<= index + c.shl(scratch1, index); + + // dst &= ~(mask << index) + maskValue = ~(maskValue << index); + c.mov(mask, maskValue); + c.and_(scratch2, mask); + + // dst |= src + c.or_(scratch2, scratch1); + + // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected + c.pinsrq(dst, scratch2, 0); + + RestoreRegisters(c, {scratch1, scratch2, mask}); + } else { + if (operands[2].type != ZYDIS_OPERAND_TYPE_UNUSED || operands[3].type != ZYDIS_OPERAND_TYPE_UNUSED) { + ASSERT_MSG("operands 2 and 3 must be unused for register form."); + } + + const Xbyak::Reg64 scratch1 = AllocateScratchRegister({}, 64).cvt64(); + const Xbyak::Reg64 scratch2 = AllocateScratchRegister({&scratch1}, 64).cvt64(); + const Xbyak::Reg64 index = AllocateScratchRegister({&scratch1, &scratch2}, 64).cvt64(); + const Xbyak::Reg64 mask = AllocateScratchRegister({&scratch1, &scratch2, &index}, 64).cvt64(); + + SaveRegisters(c, {scratch1, scratch2, index, mask}); + + // Get upper 64 bits of src + c.pextrq(index, src, 1); + c.mov(mask, index); + + c.mov(scratch1, 64); // for the cmovz below + c.and_(mask, 0x3F); // mask now holds the length + c.cmovz(mask, scratch1); // Check if length is 0, if so, set to 64 + + // Get index to insert at + c.shr(index, 8); + c.and_(index, 0x3F); + + // Create a mask out of the length + c.mov(scratch1, 1); + c.shlx(mask, scratch1, mask); + c.sub(mask, 1); + + c.movq(scratch1, src); + c.movq(scratch2, dst); + + // src &= mask + c.and_(scratch1, mask); + + // dst &= ~(mask << index) + c.shlx(mask, mask, index); + c.not_(mask); + c.and_(scratch2, mask); + + // dst |= (src << index) + c.shlx(scratch1, scratch1, index); + c.or_(scratch2, scratch1); + + // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected + c.pinsrq(dst, scratch2, 0); + + RestoreRegisters(c, {scratch1, scratch2, index, mask}); + } +} + using PatchFilter = bool (*)(const ZydisDecodedOperand*); using InstructionGenerator = void (*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&); struct PatchInfo { @@ -338,6 +479,8 @@ static const std::unordered_map Patches = { {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}}, #endif + {ZYDIS_MNEMONIC_INSERTQ, {FilterAlwaysTrue, GenerateINSERTQ, true}}, + #ifdef __APPLE__ // BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon. {ZYDIS_MNEMONIC_ANDN, {FilterRosetta2Only, GenerateANDN, true}}, From 96afa3d8ee5e6449b3988f0db8dda078a77a21a4 Mon Sep 17 00:00:00 2001 From: offtkp Date: Wed, 28 Aug 2024 23:08:42 +0300 Subject: [PATCH 2/5] Format code --- src/core/cpu_patches.cpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index d16a3a7c..857c1537 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -333,8 +333,9 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { // INSERTQ Instruction Reference - // Inserts bits from the lower 64 bits of the source operand into the lower 64 bits of the destination operand - // No other bits in the lower 64 bits of the destination are modified. The upper 64 bits of the destination are undefined. + // Inserts bits from the lower 64 bits of the source operand into the lower 64 bits of the + // destination operand No other bits in the lower 64 bits of the destination are modified. The + // upper 64 bits of the destination are undefined. // There's two forms of the instruction: // INSERTQ xmm1, xmm2, imm8, imm8 @@ -352,14 +353,15 @@ static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGene // xmm1 starting at the bit position specified by // xmm2[77:72]. - // A value of zero in the field length is defined as a length of 64. If the length field is 0 and the bit index - // is 0, bits 63:0 of the source operand are inserted. For any other value of the bit index, the results are - // undefined. + // A value of zero in the field length is defined as a length of 64. If the length field is 0 + // and the bit index is 0, bits 63:0 of the source operand are inserted. For any other value of + // the bit index, the results are undefined. bool immediateForm = operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && operands[3].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; - if (operands[0].type != ZYDIS_OPERAND_TYPE_REGISTER || operands[1].type != ZYDIS_OPERAND_TYPE_REGISTER) { + if (operands[0].type != ZYDIS_OPERAND_TYPE_REGISTER || + operands[1].type != ZYDIS_OPERAND_TYPE_REGISTER) { ASSERT_MSG("operands 0 and 1 must be registers."); } @@ -408,14 +410,16 @@ static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGene RestoreRegisters(c, {scratch1, scratch2, mask}); } else { - if (operands[2].type != ZYDIS_OPERAND_TYPE_UNUSED || operands[3].type != ZYDIS_OPERAND_TYPE_UNUSED) { + if (operands[2].type != ZYDIS_OPERAND_TYPE_UNUSED || + operands[3].type != ZYDIS_OPERAND_TYPE_UNUSED) { ASSERT_MSG("operands 2 and 3 must be unused for register form."); } - + const Xbyak::Reg64 scratch1 = AllocateScratchRegister({}, 64).cvt64(); const Xbyak::Reg64 scratch2 = AllocateScratchRegister({&scratch1}, 64).cvt64(); const Xbyak::Reg64 index = AllocateScratchRegister({&scratch1, &scratch2}, 64).cvt64(); - const Xbyak::Reg64 mask = AllocateScratchRegister({&scratch1, &scratch2, &index}, 64).cvt64(); + const Xbyak::Reg64 mask = + AllocateScratchRegister({&scratch1, &scratch2, &index}, 64).cvt64(); SaveRegisters(c, {scratch1, scratch2, index, mask}); @@ -423,8 +427,8 @@ static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGene c.pextrq(index, src, 1); c.mov(mask, index); - c.mov(scratch1, 64); // for the cmovz below - c.and_(mask, 0x3F); // mask now holds the length + c.mov(scratch1, 64); // for the cmovz below + c.and_(mask, 0x3F); // mask now holds the length c.cmovz(mask, scratch1); // Check if length is 0, if so, set to 64 // Get index to insert at From 4f6fe8293640347b910bc9555185e12b6cb4c009 Mon Sep 17 00:00:00 2001 From: offtkp Date: Wed, 28 Aug 2024 23:18:05 +0300 Subject: [PATCH 3/5] Make filter check for SSE4a --- src/core/cpu_patches.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 857c1537..29c9fa88 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "common/assert.h" #include "common/types.h" #include "core/tls.h" @@ -289,9 +290,9 @@ static bool FilterTcbAccess(const ZydisDecodedOperand* operands) { dst_op.reg.value <= ZYDIS_REGISTER_R15; } -// For instructions that always need to be patched -static bool FilterAlwaysTrue(const ZydisDecodedOperand*) { - return true; +static bool FilterNoSSE4a(const ZydisDecodedOperand*) { + Xbyak::util::Cpu cpu; + return !cpu.has(Xbyak::util::Cpu::tSSE4a); } static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { @@ -483,7 +484,7 @@ static const std::unordered_map Patches = { {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}}, #endif - {ZYDIS_MNEMONIC_INSERTQ, {FilterAlwaysTrue, GenerateINSERTQ, true}}, + {ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}}, #ifdef __APPLE__ // BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon. From 1d073fe56d6e93cea2aeac2e179a616f0252ec13 Mon Sep 17 00:00:00 2001 From: offtkp Date: Wed, 28 Aug 2024 23:42:43 +0300 Subject: [PATCH 4/5] Get rid of shlx --- src/core/cpu_patches.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 29c9fa88..a961caec 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -424,6 +424,9 @@ static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGene SaveRegisters(c, {scratch1, scratch2, index, mask}); + // Backup rcx temporarily since we need it to shift + c.mov(scratch2, rcx); + // Get upper 64 bits of src c.pextrq(index, src, 1); c.mov(mask, index); @@ -437,23 +440,28 @@ static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGene c.and_(index, 0x3F); // Create a mask out of the length - c.mov(scratch1, 1); - c.shlx(mask, scratch1, mask); - c.sub(mask, 1); + c.mov(cl, mask.cvt8()); + c.mov(mask, 1); + c.shl(mask, cl); c.movq(scratch1, src); - c.movq(scratch2, dst); // src &= mask c.and_(scratch1, mask); // dst &= ~(mask << index) - c.shlx(mask, mask, index); + c.mov(cl, index.cvt8()); + c.shl(mask, cl); c.not_(mask); - c.and_(scratch2, mask); - // dst |= (src << index) - c.shlx(scratch1, scratch1, index); + // src <<= index + c.shl(scratch1, cl); + + // Restore rcx + c.mov(rcx, scratch2); + + c.movq(scratch2, dst); + c.and_(scratch2, mask); c.or_(scratch2, scratch1); // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected From 384ea6e6b43a84eaadd4cc0cfc1ac5270893f1fc Mon Sep 17 00:00:00 2001 From: offtkp Date: Wed, 28 Aug 2024 23:50:42 +0300 Subject: [PATCH 5/5] Include pthread.h on windows too --- src/core/cpu_patches.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index a961caec..8e601c3b 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include "common/assert.h" @@ -14,7 +15,6 @@ #ifdef _WIN32 #include #else -#include #ifdef __APPLE__ #include #endif