diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 29c9fa88..a961caec 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -424,6 +424,9 @@ static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGene SaveRegisters(c, {scratch1, scratch2, index, mask}); + // Backup rcx temporarily since we need it to shift + c.mov(scratch2, rcx); + // Get upper 64 bits of src c.pextrq(index, src, 1); c.mov(mask, index); @@ -437,23 +440,28 @@ static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGene c.and_(index, 0x3F); // Create a mask out of the length - c.mov(scratch1, 1); - c.shlx(mask, scratch1, mask); - c.sub(mask, 1); + c.mov(cl, mask.cvt8()); + c.mov(mask, 1); + c.shl(mask, cl); c.movq(scratch1, src); - c.movq(scratch2, dst); // src &= mask c.and_(scratch1, mask); // dst &= ~(mask << index) - c.shlx(mask, mask, index); + c.mov(cl, index.cvt8()); + c.shl(mask, cl); c.not_(mask); - c.and_(scratch2, mask); - // dst |= (src << index) - c.shlx(scratch1, scratch1, index); + // src <<= index + c.shl(scratch1, cl); + + // Restore rcx + c.mov(rcx, scratch2); + + c.movq(scratch2, dst); + c.and_(scratch2, mask); c.or_(scratch2, scratch1); // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected