diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp
index 42318822..d16a3a7c 100644
--- a/src/core/cpu_patches.cpp
+++ b/src/core/cpu_patches.cpp
@@ -40,6 +40,17 @@ static Xbyak::Reg ZydisToXbyakRegisterOperand(const ZydisDecodedOperand& operand
     return ZydisToXbyakRegister(operand.reg.value);
 }
 
+// Converts a decoded XMM register operand into the matching Xbyak register.
+// Only XMM0-XMM15 are supported; any other register is reported as unreachable.
+static Xbyak::Xmm ZydisToXbyakVectorOperand(const ZydisDecodedOperand& operand) {
+    const ZydisRegister reg = operand.reg.value;
+
+    if (reg >= ZYDIS_REGISTER_XMM0 && reg <= ZYDIS_REGISTER_XMM15) {
+        return Xbyak::Xmm(reg - ZYDIS_REGISTER_XMM0);
+    }
+    UNREACHABLE_MSG("Unsupported vector register: {}", static_cast<u32>(reg));
+}
+
 static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& operand) {
     ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_MEMORY, "Expected memory operand, got type: {}",
                static_cast<u32>(operand.type));
@@ -108,9 +119,7 @@ static Xbyak::Reg AllocateScratchRegister(
     UNREACHABLE_MSG("Out of scratch registers!");
 }
 
-#ifdef __APPLE__
-
-static constexpr u32 MaxSavedRegisters = 3;
+static constexpr u32 MaxSavedRegisters = 4;
 static pthread_key_t register_save_slots[MaxSavedRegisters];
 static std::once_flag register_save_init_flag;
 
@@ -155,6 +164,8 @@ static void RestoreRegisters(Xbyak::CodeGenerator& c,
     }
 }
 
+#ifdef __APPLE__
+
 static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
     const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
     const auto src1 = ZydisToXbyakRegisterOperand(operands[1]);
@@ -280,6 +291,11 @@ static bool FilterTcbAccess(const ZydisDecodedOperand* operands) {
            dst_op.reg.value <= ZYDIS_REGISTER_R15;
 }
 
+// For instructions that always need to be patched
+static bool FilterAlwaysTrue(const ZydisDecodedOperand*) {
+    return true;
+}
+
 static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
     const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
     const auto slot = GetTcbKey();
@@ -317,6 +333,130 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe
 #endif
 }
 
+// Emits a software replacement for the SSE4a INSERTQ instruction.
+//
+// INSERTQ copies a bit field from the low 64 bits of the source register into the low 64 bits of
+// the destination register without modifying the other low-quadword bits of the destination.
+// The upper 64 bits of the destination are architecturally undefined; this patch leaves them
+// unchanged.
+//
+// There are two forms of the instruction:
+//   INSERTQ xmm1, xmm2, imm8, imm8
+//     The field starts at bit 0 of xmm2 and its length is given by bits [5:0] of the first
+//     immediate. It is inserted into xmm1 at the bit position given by bits [5:0] of the second
+//     immediate.
+//   INSERTQ xmm1, xmm2
+//     Same operation, with the length taken from xmm2[69:64] and the bit position taken from
+//     xmm2[77:72].
+//
+// A field length of zero is defined as a length of 64. If the length is 0 and the bit index is
+// 0, bits 63:0 of the source are inserted; for any other bit index the results are undefined.
+//
+// NOTE(review): the generated sequence uses flag-writing integer instructions; confirm that
+// clobbering rFLAGS is acceptable at every patch site.
+static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
+    ASSERT_MSG(operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER &&
+                   operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER,
+               "operands 0 and 1 must be registers.");
+
+    const bool immediate_form = operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE &&
+                                operands[3].type == ZYDIS_OPERAND_TYPE_IMMEDIATE;
+
+    const Xbyak::Xmm dst = ZydisToXbyakVectorOperand(operands[0]);
+    const Xbyak::Xmm src = ZydisToXbyakVectorOperand(operands[1]);
+
+    if (immediate_form) {
+        u8 length = operands[2].imm.value.u & 0x3F;
+        const u8 index = operands[3].imm.value.u & 0x3F;
+        if (length == 0) {
+            length = 64;
+        }
+
+        ASSERT_MSG(length + index <= 64, "length + index must be less than or equal to 64.");
+
+        const Xbyak::Reg64 scratch1 = AllocateScratchRegister({}, 64).cvt64();
+        const Xbyak::Reg64 scratch2 = AllocateScratchRegister({&scratch1}, 64).cvt64();
+        const Xbyak::Reg64 mask = AllocateScratchRegister({&scratch1, &scratch2}, 64).cvt64();
+
+        // Both masks are known when the patch is generated. A 64-bit field is special-cased
+        // because shifting a 64-bit value by 64 is undefined behavior in C++.
+        const u64 field_mask = length == 64 ? ~u64{0} : (u64{1} << length) - 1;
+        const u64 keep_mask = ~(field_mask << index);
+
+        SaveRegisters(c, {scratch1, scratch2, mask});
+
+        c.movq(scratch1, src);
+        c.movq(scratch2, dst);
+
+        // src = (src & field_mask) << index
+        c.mov(mask, field_mask);
+        c.and_(scratch1, mask);
+        c.shl(scratch1, index);
+
+        // dst &= ~(field_mask << index)
+        c.mov(mask, keep_mask);
+        c.and_(scratch2, mask);
+
+        // dst |= src
+        c.or_(scratch2, scratch1);
+
+        // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected
+        c.pinsrq(dst, scratch2, 0);
+
+        RestoreRegisters(c, {scratch1, scratch2, mask});
+    } else {
+        ASSERT_MSG(operands[2].type == ZYDIS_OPERAND_TYPE_UNUSED &&
+                       operands[3].type == ZYDIS_OPERAND_TYPE_UNUSED,
+                   "operands 2 and 3 must be unused for register form.");
+
+        const Xbyak::Reg64 scratch1 = AllocateScratchRegister({}, 64).cvt64();
+        const Xbyak::Reg64 scratch2 = AllocateScratchRegister({&scratch1}, 64).cvt64();
+        const Xbyak::Reg64 index = AllocateScratchRegister({&scratch1, &scratch2}, 64).cvt64();
+        const Xbyak::Reg64 mask =
+            AllocateScratchRegister({&scratch1, &scratch2, &index}, 64).cvt64();
+        const u64 all_ones = ~u64{0};
+
+        SaveRegisters(c, {scratch1, scratch2, index, mask});
+
+        // Upper 64 bits of src: field length in bits 5:0, bit index in bits 13:8
+        c.pextrq(index, src, 1);
+
+        // mask = ~0 >> ((64 - length) % 64). SHRX only uses the low 6 bits of its count, so
+        // negating the control word gives that shift amount directly, and a length of 0
+        // (meaning 64) yields a shift of 0 and therefore an all-ones mask. Computing
+        // (1 << length) - 1 with SHLX would not work for a length of 64, because the count
+        // wraps to 0 and the mask ends up empty.
+        c.mov(scratch1, index);
+        c.neg(scratch1);
+        c.mov(mask, all_ones);
+        c.shrx(mask, mask, scratch1);
+
+        // Get index to insert at
+        c.shr(index, 8);
+        c.and_(index, 0x3F);
+
+        c.movq(scratch1, src);
+        c.movq(scratch2, dst);
+
+        // src &= mask
+        c.and_(scratch1, mask);
+
+        // dst &= ~(mask << index)
+        c.shlx(mask, mask, index);
+        c.not_(mask);
+        c.and_(scratch2, mask);
+
+        // dst |= (src << index)
+        c.shlx(scratch1, scratch1, index);
+        c.or_(scratch2, scratch1);
+
+        // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected
+        c.pinsrq(dst, scratch2, 0);
+
+        RestoreRegisters(c, {scratch1, scratch2, index, mask});
+    }
+}
+
 using PatchFilter = bool (*)(const ZydisDecodedOperand*);
 using InstructionGenerator = void (*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&);
 struct PatchInfo {
@@ -338,6 +478,8 @@ static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = {
     {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}},
 #endif
 
+    {ZYDIS_MNEMONIC_INSERTQ, {FilterAlwaysTrue, GenerateINSERTQ, true}},
+
 #ifdef __APPLE__
     // BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon.
     {ZYDIS_MNEMONIC_ANDN, {FilterRosetta2Only, GenerateANDN, true}},