mirror of
https://github.com/PCSX2/pcsx2.git
synced 2025-12-16 04:08:48 +00:00
Common: Switch SIMD padd/pmul instructions to auto SSE/AVX
This commit is contained in:
parent
10ed797881
commit
8ad9d7d047
@ -51,32 +51,32 @@ namespace x86Emitter
|
||||
//
|
||||
struct xImplSimd_AddSub
|
||||
{
|
||||
const xImplSimd_DestRegEither B;
|
||||
const xImplSimd_DestRegEither W;
|
||||
const xImplSimd_DestRegEither D;
|
||||
const xImplSimd_DestRegEither Q;
|
||||
const xImplSimd_3Arg B;
|
||||
const xImplSimd_3Arg W;
|
||||
const xImplSimd_3Arg D;
|
||||
const xImplSimd_3Arg Q;
|
||||
|
||||
// Add/Sub packed signed byte [8bit] integers from src into dest, and saturate the results.
|
||||
const xImplSimd_DestRegEither SB;
|
||||
const xImplSimd_3Arg SB;
|
||||
|
||||
// Add/Sub packed signed word [16bit] integers from src into dest, and saturate the results.
|
||||
const xImplSimd_DestRegEither SW;
|
||||
const xImplSimd_3Arg SW;
|
||||
|
||||
// Add/Sub packed unsigned byte [8bit] integers from src into dest, and saturate the results.
|
||||
const xImplSimd_DestRegEither USB;
|
||||
const xImplSimd_3Arg USB;
|
||||
|
||||
// Add/Sub packed unsigned word [16bit] integers from src into dest, and saturate the results.
|
||||
const xImplSimd_DestRegEither USW;
|
||||
const xImplSimd_3Arg USW;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
struct xImplSimd_PMul
|
||||
{
|
||||
const xImplSimd_DestRegEither LW;
|
||||
const xImplSimd_DestRegEither HW;
|
||||
const xImplSimd_DestRegEither HUW;
|
||||
const xImplSimd_DestRegEither UDQ;
|
||||
const xImplSimd_3Arg LW;
|
||||
const xImplSimd_3Arg HW;
|
||||
const xImplSimd_3Arg HUW;
|
||||
const xImplSimd_3Arg UDQ;
|
||||
|
||||
// [SSSE-3] PMULHRSW multiplies vertically each signed 16-bit integer from dest with the
|
||||
// corresponding signed 16-bit integer of source, producing intermediate signed 32-bit
|
||||
@ -88,14 +88,14 @@ namespace x86Emitter
|
||||
//
|
||||
// Both operands can be MMX or XMM registers. Source can be register or memory.
|
||||
//
|
||||
const xImplSimd_DestRegEither HRSW;
|
||||
const xImplSimd_3Arg HRSW;
|
||||
|
||||
// [SSE-4.1] Multiply the packed dword signed integers in dest with src, and store
|
||||
// the low 32 bits of each product in dest.
|
||||
const xImplSimd_DestRegSSE LD;
|
||||
const xImplSimd_3Arg LD;
|
||||
|
||||
// [SSE-4.1] Multiply the low signed dword of each qword in dest with src, storing the 64-bit products.
|
||||
const xImplSimd_DestRegSSE DQ;
|
||||
const xImplSimd_3Arg DQ;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -339,41 +339,41 @@ namespace x86Emitter
|
||||
};
|
||||
|
||||
const xImplSimd_AddSub xPADD =
|
||||
{
|
||||
{0x66, 0xdc + 0x20}, // B
|
||||
{0x66, 0xdc + 0x21}, // W
|
||||
{0x66, 0xdc + 0x22}, // D
|
||||
{0x66, 0xd4}, // Q
|
||||
{
|
||||
{SIMDInstructionInfo(0xfc).p66().i().commutative()}, // B
|
||||
{SIMDInstructionInfo(0xfd).p66().i().commutative()}, // W
|
||||
{SIMDInstructionInfo(0xfe).p66().i().commutative()}, // D
|
||||
{SIMDInstructionInfo(0xd4).p66().i().commutative()}, // Q
|
||||
|
||||
{0x66, 0xdc + 0x10}, // SB
|
||||
{0x66, 0xdc + 0x11}, // SW
|
||||
{0x66, 0xdc}, // USB
|
||||
{0x66, 0xdc + 1}, // USW
|
||||
{SIMDInstructionInfo(0xec).p66().i().commutative()}, // SB
|
||||
{SIMDInstructionInfo(0xed).p66().i().commutative()}, // SW
|
||||
{SIMDInstructionInfo(0xdc).p66().i().commutative()}, // USB
|
||||
{SIMDInstructionInfo(0xdd).p66().i().commutative()}, // USW
|
||||
};
|
||||
|
||||
const xImplSimd_AddSub xPSUB =
|
||||
{
|
||||
{0x66, 0xd8 + 0x20}, // B
|
||||
{0x66, 0xd8 + 0x21}, // W
|
||||
{0x66, 0xd8 + 0x22}, // D
|
||||
{0x66, 0xfb}, // Q
|
||||
{
|
||||
{SIMDInstructionInfo(0xf8).p66().i()}, // B
|
||||
{SIMDInstructionInfo(0xf9).p66().i()}, // W
|
||||
{SIMDInstructionInfo(0xfa).p66().i()}, // D
|
||||
{SIMDInstructionInfo(0xfb).p66().i()}, // Q
|
||||
|
||||
{0x66, 0xd8 + 0x10}, // SB
|
||||
{0x66, 0xd8 + 0x11}, // SW
|
||||
{0x66, 0xd8}, // USB
|
||||
{0x66, 0xd8 + 1}, // USW
|
||||
{SIMDInstructionInfo(0xe8).p66().i()}, // SB
|
||||
{SIMDInstructionInfo(0xe9).p66().i()}, // SW
|
||||
{SIMDInstructionInfo(0xd8).p66().i()}, // USB
|
||||
{SIMDInstructionInfo(0xd9).p66().i()}, // USW
|
||||
};
|
||||
|
||||
const xImplSimd_PMul xPMUL =
|
||||
{
|
||||
{0x66, 0xd5}, // LW
|
||||
{0x66, 0xe5}, // HW
|
||||
{0x66, 0xe4}, // HUW
|
||||
{0x66, 0xf4}, // UDQ
|
||||
{
|
||||
{SIMDInstructionInfo(0xd5).p66().i().commutative()}, // LW
|
||||
{SIMDInstructionInfo(0xe5).p66().i().commutative()}, // HW
|
||||
{SIMDInstructionInfo(0xe4).p66().i().commutative()}, // HUW
|
||||
{SIMDInstructionInfo(0xf4).p66().i().commutative()}, // UDQ
|
||||
|
||||
{0x66, 0x0b38}, // HRSW
|
||||
{0x66, 0x4038}, // LD
|
||||
{0x66, 0x2838}, // DQ
|
||||
{SIMDInstructionInfo(0x0b).p66().m0f38().i().commutative()}, // HRSW
|
||||
{SIMDInstructionInfo(0x40).p66().m0f38().i().commutative()}, // LD
|
||||
{SIMDInstructionInfo(0x28).p66().m0f38().i().commutative()}, // DQ
|
||||
};
|
||||
|
||||
const xImplSimd_rSqrt xRSQRT =
|
||||
|
||||
@ -187,6 +187,30 @@ TEST(CodegenTests, SSETest)
|
||||
CODEGEN_TEST(xPSRL.Q(xmm7, 4), "66 0f 73 d7 04");
|
||||
CODEGEN_TEST(xPSRL.DQ(xmm8, 5), "66 41 0f 73 d8 05");
|
||||
|
||||
CODEGEN_TEST(xPADD.B(xmm1, xmm8), "66 41 0f fc c8");
|
||||
CODEGEN_TEST(xPADD.W(xmm4, xmm7), "66 0f fd e7");
|
||||
CODEGEN_TEST(xPADD.D(xmm2, ptr[rcx]), "66 0f fe 11");
|
||||
CODEGEN_TEST(xPADD.Q(xmm8, xmm2), "66 44 0f d4 c2");
|
||||
CODEGEN_TEST(xPADD.SB(xmm9, xmm8), "66 45 0f ec c8");
|
||||
CODEGEN_TEST(xPADD.SW(xmm2, ptr[r8]), "66 41 0f ed 10");
|
||||
CODEGEN_TEST(xPADD.USB(xmm3, xmm3), "66 0f dc db");
|
||||
CODEGEN_TEST(xPADD.USW(xmm2, xmm9), "66 41 0f dd d1");
|
||||
CODEGEN_TEST(xPSUB.B(xmm1, xmm8), "66 41 0f f8 c8");
|
||||
CODEGEN_TEST(xPSUB.W(xmm4, xmm7), "66 0f f9 e7");
|
||||
CODEGEN_TEST(xPSUB.D(xmm2, ptr[rcx]), "66 0f fa 11");
|
||||
CODEGEN_TEST(xPSUB.Q(xmm8, xmm2), "66 44 0f fb c2");
|
||||
CODEGEN_TEST(xPSUB.SB(xmm9, xmm8), "66 45 0f e8 c8");
|
||||
CODEGEN_TEST(xPSUB.SW(xmm2, ptr[r8]), "66 41 0f e9 10");
|
||||
CODEGEN_TEST(xPSUB.USB(xmm3, xmm3), "66 0f d8 db");
|
||||
CODEGEN_TEST(xPSUB.USW(xmm2, xmm9), "66 41 0f d9 d1");
|
||||
CODEGEN_TEST(xPMUL.LW(xmm2, xmm8), "66 41 0f d5 d0");
|
||||
CODEGEN_TEST(xPMUL.HW(xmm9, ptr[r9]), "66 45 0f e5 09");
|
||||
CODEGEN_TEST(xPMUL.HUW(xmm4, xmm3), "66 0f e4 e3");
|
||||
CODEGEN_TEST(xPMUL.UDQ(xmm1, xmm7), "66 0f f4 cf");
|
||||
CODEGEN_TEST(xPMUL.HRSW(xmm2, xmm4), "66 0f 38 0b d4");
|
||||
CODEGEN_TEST(xPMUL.LD(xmm1, xmm8), "66 41 0f 38 40 c8");
|
||||
CODEGEN_TEST(xPMUL.DQ(xmm4, xmm9), "66 41 0f 38 28 e1");
|
||||
|
||||
CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
|
||||
CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
|
||||
CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
|
||||
@ -255,6 +279,30 @@ TEST(CodegenTests, AVXTest)
|
||||
CODEGEN_TEST(xPSRL.Q(xmm7, 4), "c5 c1 73 d7 04");
|
||||
CODEGEN_TEST(xPSRL.DQ(xmm8, 5), "c4 c1 39 73 d8 05");
|
||||
|
||||
CODEGEN_TEST(xPADD.B(xmm1, xmm8), "c5 b9 fc c9"); // => vpaddb xmm1, xmm8, xmm1
|
||||
CODEGEN_TEST(xPADD.W(xmm4, xmm7), "c5 d9 fd e7");
|
||||
CODEGEN_TEST(xPADD.D(xmm2, ptr[rcx]), "c5 e9 fe 11");
|
||||
CODEGEN_TEST(xPADD.Q(xmm8, xmm2), "c5 39 d4 c2");
|
||||
CODEGEN_TEST(xPADD.SB(xmm9, xmm8), "c4 41 31 ec c8");
|
||||
CODEGEN_TEST(xPADD.SW(xmm2, ptr[r8]), "c4 c1 69 ed 10");
|
||||
CODEGEN_TEST(xPADD.USB(xmm3, xmm3), "c5 e1 dc db");
|
||||
CODEGEN_TEST(xPADD.USW(xmm2, xmm9), "c5 b1 dd d2"); // => vpaddusw xmm2, xmm9, xmm2
|
||||
CODEGEN_TEST(xPSUB.B(xmm1, xmm8), "c4 c1 71 f8 c8");
|
||||
CODEGEN_TEST(xPSUB.W(xmm4, xmm7), "c5 d9 f9 e7");
|
||||
CODEGEN_TEST(xPSUB.D(xmm2, ptr[rcx]), "c5 e9 fa 11");
|
||||
CODEGEN_TEST(xPSUB.Q(xmm8, xmm2), "c5 39 fb c2");
|
||||
CODEGEN_TEST(xPSUB.SB(xmm9, xmm8), "c4 41 31 e8 c8");
|
||||
CODEGEN_TEST(xPSUB.SW(xmm2, ptr[r8]), "c4 c1 69 e9 10");
|
||||
CODEGEN_TEST(xPSUB.USB(xmm3, xmm3), "c5 e1 d8 db");
|
||||
CODEGEN_TEST(xPSUB.USW(xmm2, xmm9), "c4 c1 69 d9 d1");
|
||||
CODEGEN_TEST(xPMUL.LW(xmm2, xmm8), "c5 b9 d5 d2"); // => vpmullw xmm2, xmm8, xmm2
|
||||
CODEGEN_TEST(xPMUL.HW(xmm9, ptr[r9]), "c4 41 31 e5 09");
|
||||
CODEGEN_TEST(xPMUL.HUW(xmm4, xmm3), "c5 d9 e4 e3");
|
||||
CODEGEN_TEST(xPMUL.UDQ(xmm1, xmm7), "c5 f1 f4 cf");
|
||||
CODEGEN_TEST(xPMUL.HRSW(xmm2, xmm4), "c4 e2 69 0b d4");
|
||||
CODEGEN_TEST(xPMUL.LD(xmm1, xmm8), "c4 c2 71 40 c8");
|
||||
CODEGEN_TEST(xPMUL.DQ(xmm4, xmm9), "c4 c2 59 28 e1");
|
||||
|
||||
CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
|
||||
CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
|
||||
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");
|
||||
|
||||
Loading…
Reference in New Issue
Block a user