Common: Switch SIMD padd/pmul instructions to auto SSE/AVX

This commit is contained in:
TellowKrinkle 2025-06-01 21:30:20 -05:00 committed by TellowKrinkle
parent 10ed797881
commit 8ad9d7d047
3 changed files with 89 additions and 41 deletions

View File

@ -51,32 +51,32 @@ namespace x86Emitter
//
struct xImplSimd_AddSub
{
const xImplSimd_DestRegEither B;
const xImplSimd_DestRegEither W;
const xImplSimd_DestRegEither D;
const xImplSimd_DestRegEither Q;
const xImplSimd_3Arg B;
const xImplSimd_3Arg W;
const xImplSimd_3Arg D;
const xImplSimd_3Arg Q;
// Add/Sub packed signed byte [8bit] integers from src into dest, and saturate the results.
const xImplSimd_DestRegEither SB;
const xImplSimd_3Arg SB;
// Add/Sub packed signed word [16bit] integers from src into dest, and saturate the results.
const xImplSimd_DestRegEither SW;
const xImplSimd_3Arg SW;
// Add/Sub packed unsigned byte [8bit] integers from src into dest, and saturate the results.
const xImplSimd_DestRegEither USB;
const xImplSimd_3Arg USB;
// Add/Sub packed unsigned word [16bit] integers from src into dest, and saturate the results.
const xImplSimd_DestRegEither USW;
const xImplSimd_3Arg USW;
};
//////////////////////////////////////////////////////////////////////////////////////////
//
struct xImplSimd_PMul
{
const xImplSimd_DestRegEither LW;
const xImplSimd_DestRegEither HW;
const xImplSimd_DestRegEither HUW;
const xImplSimd_DestRegEither UDQ;
const xImplSimd_3Arg LW;
const xImplSimd_3Arg HW;
const xImplSimd_3Arg HUW;
const xImplSimd_3Arg UDQ;
// [SSSE-3] PMULHRSW multiplies vertically each signed 16-bit integer from dest with the
// corresponding signed 16-bit integer of source, producing intermediate signed 32-bit
@ -88,14 +88,14 @@ namespace x86Emitter
//
// Both operands can be MMX or XMM registers. Source can be register or memory.
//
const xImplSimd_DestRegEither HRSW;
const xImplSimd_3Arg HRSW;
// [SSE-4.1] Multiply the packed dword signed integers in dest with src, and store
// the low 32 bits of each product in dest.
const xImplSimd_DestRegSSE LD;
const xImplSimd_3Arg LD;
// [SSE-4.1] Multiply the packed signed dword integers in dest with src.
const xImplSimd_DestRegSSE DQ;
const xImplSimd_3Arg DQ;
};
//////////////////////////////////////////////////////////////////////////////////////////

View File

@ -339,41 +339,41 @@ namespace x86Emitter
};
const xImplSimd_AddSub xPADD =
{
{0x66, 0xdc + 0x20}, // B
{0x66, 0xdc + 0x21}, // W
{0x66, 0xdc + 0x22}, // D
{0x66, 0xd4}, // Q
{
{SIMDInstructionInfo(0xfc).p66().i().commutative()}, // B
{SIMDInstructionInfo(0xfd).p66().i().commutative()}, // W
{SIMDInstructionInfo(0xfe).p66().i().commutative()}, // D
{SIMDInstructionInfo(0xd4).p66().i().commutative()}, // Q
{0x66, 0xdc + 0x10}, // SB
{0x66, 0xdc + 0x11}, // SW
{0x66, 0xdc}, // USB
{0x66, 0xdc + 1}, // USW
{SIMDInstructionInfo(0xec).p66().i().commutative()}, // SB
{SIMDInstructionInfo(0xed).p66().i().commutative()}, // SW
{SIMDInstructionInfo(0xdc).p66().i().commutative()}, // USB
{SIMDInstructionInfo(0xdd).p66().i().commutative()}, // USW
};
const xImplSimd_AddSub xPSUB =
{
{0x66, 0xd8 + 0x20}, // B
{0x66, 0xd8 + 0x21}, // W
{0x66, 0xd8 + 0x22}, // D
{0x66, 0xfb}, // Q
{
{SIMDInstructionInfo(0xf8).p66().i()}, // B
{SIMDInstructionInfo(0xf9).p66().i()}, // W
{SIMDInstructionInfo(0xfa).p66().i()}, // D
{SIMDInstructionInfo(0xfb).p66().i()}, // Q
{0x66, 0xd8 + 0x10}, // SB
{0x66, 0xd8 + 0x11}, // SW
{0x66, 0xd8}, // USB
{0x66, 0xd8 + 1}, // USW
{SIMDInstructionInfo(0xe8).p66().i()}, // SB
{SIMDInstructionInfo(0xe9).p66().i()}, // SW
{SIMDInstructionInfo(0xd8).p66().i()}, // USB
{SIMDInstructionInfo(0xd9).p66().i()}, // USW
};
const xImplSimd_PMul xPMUL =
{
{0x66, 0xd5}, // LW
{0x66, 0xe5}, // HW
{0x66, 0xe4}, // HUW
{0x66, 0xf4}, // UDQ
{
{SIMDInstructionInfo(0xd5).p66().i().commutative()}, // LW
{SIMDInstructionInfo(0xe5).p66().i().commutative()}, // HW
{SIMDInstructionInfo(0xe4).p66().i().commutative()}, // HUW
{SIMDInstructionInfo(0xf4).p66().i().commutative()}, // UDQ
{0x66, 0x0b38}, // HRSW
{0x66, 0x4038}, // LD
{0x66, 0x2838}, // DQ
{SIMDInstructionInfo(0x0b).p66().m0f38().i().commutative()}, // HRSW
{SIMDInstructionInfo(0x40).p66().m0f38().i().commutative()}, // LD
{SIMDInstructionInfo(0x28).p66().m0f38().i().commutative()}, // DQ
};
const xImplSimd_rSqrt xRSQRT =

View File

@ -187,6 +187,30 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST(xPSRL.Q(xmm7, 4), "66 0f 73 d7 04");
CODEGEN_TEST(xPSRL.DQ(xmm8, 5), "66 41 0f 73 d8 05");
CODEGEN_TEST(xPADD.B(xmm1, xmm8), "66 41 0f fc c8");
CODEGEN_TEST(xPADD.W(xmm4, xmm7), "66 0f fd e7");
CODEGEN_TEST(xPADD.D(xmm2, ptr[rcx]), "66 0f fe 11");
CODEGEN_TEST(xPADD.Q(xmm8, xmm2), "66 44 0f d4 c2");
CODEGEN_TEST(xPADD.SB(xmm9, xmm8), "66 45 0f ec c8");
CODEGEN_TEST(xPADD.SW(xmm2, ptr[r8]), "66 41 0f ed 10");
CODEGEN_TEST(xPADD.USB(xmm3, xmm3), "66 0f dc db");
CODEGEN_TEST(xPADD.USW(xmm2, xmm9), "66 41 0f dd d1");
CODEGEN_TEST(xPSUB.B(xmm1, xmm8), "66 41 0f f8 c8");
CODEGEN_TEST(xPSUB.W(xmm4, xmm7), "66 0f f9 e7");
CODEGEN_TEST(xPSUB.D(xmm2, ptr[rcx]), "66 0f fa 11");
CODEGEN_TEST(xPSUB.Q(xmm8, xmm2), "66 44 0f fb c2");
CODEGEN_TEST(xPSUB.SB(xmm9, xmm8), "66 45 0f e8 c8");
CODEGEN_TEST(xPSUB.SW(xmm2, ptr[r8]), "66 41 0f e9 10");
CODEGEN_TEST(xPSUB.USB(xmm3, xmm3), "66 0f d8 db");
CODEGEN_TEST(xPSUB.USW(xmm2, xmm9), "66 41 0f d9 d1");
CODEGEN_TEST(xPMUL.LW(xmm2, xmm8), "66 41 0f d5 d0");
CODEGEN_TEST(xPMUL.HW(xmm9, ptr[r9]), "66 45 0f e5 09");
CODEGEN_TEST(xPMUL.HUW(xmm4, xmm3), "66 0f e4 e3");
CODEGEN_TEST(xPMUL.UDQ(xmm1, xmm7), "66 0f f4 cf");
CODEGEN_TEST(xPMUL.HRSW(xmm2, xmm4), "66 0f 38 0b d4");
CODEGEN_TEST(xPMUL.LD(xmm1, xmm8), "66 41 0f 38 40 c8");
CODEGEN_TEST(xPMUL.DQ(xmm4, xmm9), "66 41 0f 38 28 e1");
CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
@ -255,6 +279,30 @@ TEST(CodegenTests, AVXTest)
CODEGEN_TEST(xPSRL.Q(xmm7, 4), "c5 c1 73 d7 04");
CODEGEN_TEST(xPSRL.DQ(xmm8, 5), "c4 c1 39 73 d8 05");
CODEGEN_TEST(xPADD.B(xmm1, xmm8), "c5 b9 fc c9"); // => vpaddb xmm1, xmm8, xmm1
CODEGEN_TEST(xPADD.W(xmm4, xmm7), "c5 d9 fd e7");
CODEGEN_TEST(xPADD.D(xmm2, ptr[rcx]), "c5 e9 fe 11");
CODEGEN_TEST(xPADD.Q(xmm8, xmm2), "c5 39 d4 c2");
CODEGEN_TEST(xPADD.SB(xmm9, xmm8), "c4 41 31 ec c8");
CODEGEN_TEST(xPADD.SW(xmm2, ptr[r8]), "c4 c1 69 ed 10");
CODEGEN_TEST(xPADD.USB(xmm3, xmm3), "c5 e1 dc db");
CODEGEN_TEST(xPADD.USW(xmm2, xmm9), "c5 b1 dd d2"); // => vpaddusw xmm2, xmm9, xmm2
CODEGEN_TEST(xPSUB.B(xmm1, xmm8), "c4 c1 71 f8 c8");
CODEGEN_TEST(xPSUB.W(xmm4, xmm7), "c5 d9 f9 e7");
CODEGEN_TEST(xPSUB.D(xmm2, ptr[rcx]), "c5 e9 fa 11");
CODEGEN_TEST(xPSUB.Q(xmm8, xmm2), "c5 39 fb c2");
CODEGEN_TEST(xPSUB.SB(xmm9, xmm8), "c4 41 31 e8 c8");
CODEGEN_TEST(xPSUB.SW(xmm2, ptr[r8]), "c4 c1 69 e9 10");
CODEGEN_TEST(xPSUB.USB(xmm3, xmm3), "c5 e1 d8 db");
CODEGEN_TEST(xPSUB.USW(xmm2, xmm9), "c4 c1 69 d9 d1");
CODEGEN_TEST(xPMUL.LW(xmm2, xmm8), "c5 b9 d5 d2"); // => vpmullw xmm2, xmm8, xmm2
CODEGEN_TEST(xPMUL.HW(xmm9, ptr[r9]), "c4 41 31 e5 09");
CODEGEN_TEST(xPMUL.HUW(xmm4, xmm3), "c5 d9 e4 e3");
CODEGEN_TEST(xPMUL.UDQ(xmm1, xmm7), "c5 f1 f4 cf");
CODEGEN_TEST(xPMUL.HRSW(xmm2, xmm4), "c4 e2 69 0b d4");
CODEGEN_TEST(xPMUL.LD(xmm1, xmm8), "c4 c2 71 40 c8");
CODEGEN_TEST(xPMUL.DQ(xmm4, xmm9), "c4 c2 59 28 e1");
CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");