Common: Switch hadd/dp/round instructions to auto SSE/AVX

This commit is contained in:
TellowKrinkle 2025-06-01 22:55:09 -05:00 committed by TellowKrinkle
parent ddefb8a393
commit 01a1b017e8
3 changed files with 37 additions and 19 deletions

View File

@ -216,14 +216,14 @@ namespace x86Emitter
// stores the result in the second dword of dest.
// * Adds single-precision floating-point values in the first and second dword of *src*
// and stores the result in the third dword of dest.
const xImplSimd_DestRegSSE PS;
const xImplSimd_3Arg PS;
// [SSE-3] Horizontal Add of Packed Data. A two step process:
// * Adds the double-precision floating-point values in the high and low quadwords of
// dest and stores the result in the low quadword of dest.
// * Adds the double-precision floating-point values in the high and low quadwords of
// *src* and stores the result in the high quadword of dest.
const xImplSimd_DestRegSSE PD;
const xImplSimd_3Arg PD;
};
//////////////////////////////////////////////////////////////////////////////////////////
@ -244,10 +244,10 @@ namespace x86Emitter
// element in dest. If a broadcast mask bit is zero, the corresponding element in
// the destination is set to zero.
//
xImplSimd_DestRegImmSSE PS;
xImplSimd_3ArgImm PS;
// [SSE-4.1]
xImplSimd_DestRegImmSSE PD;
xImplSimd_3ArgImm PD;
};
//////////////////////////////////////////////////////////////////////////////////////////
@ -265,7 +265,7 @@ namespace x86Emitter
// Rounding Mode Reference:
// 0 - Nearest, 1 - Negative Infinity, 2 - Positive Infinity, 3 - Truncate.
//
const xImplSimd_DestRegImmSSE PS;
const xImplSimd_2ArgImm PS;
// [SSE-4.1] Rounds the 2 packed double-precision src values and stores them in dest.
//
@ -277,7 +277,7 @@ namespace x86Emitter
// Rounding Mode Reference:
// 0 - Nearest, 1 - Negative Infinity, 2 - Positive Infinity, 3 - Truncate.
//
const xImplSimd_DestRegImmSSE PD;
const xImplSimd_2ArgImm PD;
// [SSE-4.1] Rounds the single-precision src value and stores in dest.
//
@ -289,7 +289,7 @@ namespace x86Emitter
// Rounding Mode Reference:
// 0 - Nearest, 1 - Negative Infinity, 2 - Positive Infinity, 3 - Truncate.
//
const xImplSimd_DestRegImmSSE SS;
const xImplSimd_3ArgImm SS;
// [SSE-4.1] Rounds the double-precision src value and stores in dest.
//
@ -301,7 +301,7 @@ namespace x86Emitter
// Rounding Mode Reference:
// 0 - Nearest, 1 - Negative Infinity, 2 - Positive Infinity, 3 - Truncate.
//
const xImplSimd_DestRegImmSSE SD;
const xImplSimd_3ArgImm SD;
};
} // End namespace x86Emitter

View File

@ -423,23 +423,23 @@ namespace x86Emitter
};
const xImplSimd_HorizAdd xHADD =
{
{0xf2, 0x7c}, // PS
{0x66, 0x7c}, // PD
{
{SIMDInstructionInfo(0x7c).pf2()}, // PS
{SIMDInstructionInfo(0x7c).p66()}, // PD
};
const xImplSimd_DotProduct xDP =
{
{0x66, 0x403a}, // PS
{0x66, 0x413a}, // PD
{
{SIMDInstructionInfo(0x40).p66().m0f3a().commutative()}, // PS
{SIMDInstructionInfo(0x41).p66().m0f3a().commutative()}, // PD
};
const xImplSimd_Round xROUND =
{
{0x66, 0x083a}, // PS
{0x66, 0x093a}, // PD
{0x66, 0x0a3a}, // SS
{0x66, 0x0b3a}, // SD
{
{SIMDInstructionInfo(0x08).p66().m0f3a()}, // PS
{SIMDInstructionInfo(0x09).p66().m0f3a()}, // PD
{SIMDInstructionInfo(0x0a).p66().m0f3a()}, // SS
{SIMDInstructionInfo(0x0b).p66().m0f3a()}, // SD
};
// =====================================================================================================

View File

@ -231,6 +231,15 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST(xPMADD.WD(xmm0, xmm8), "66 41 0f f5 c0");
CODEGEN_TEST(xPMADD.UBSW(xmm0, xmm8), "66 41 0f 38 04 c0");
CODEGEN_TEST(xHADD.PS(xmm1, xmm8), "f2 41 0f 7c c8");
CODEGEN_TEST(xHADD.PD(xmm4, ptr[r8]), "66 41 0f 7c 20");
CODEGEN_TEST(xDP.PS(xmm3, xmm9, 0xf7), "66 41 0f 3a 40 d9 f7");
CODEGEN_TEST(xDP.PD(xmm8, xmm4, 0x33), "66 44 0f 3a 41 c4 33");
CODEGEN_TEST(xROUND.PS(xmm1, xmm3, 0), "66 0f 3a 08 cb 00");
CODEGEN_TEST(xROUND.PD(xmm3, xmm9, 1), "66 41 0f 3a 09 d9 01");
CODEGEN_TEST(xROUND.SS(xmm5, xmm2, 2), "66 0f 3a 0a ea 02");
CODEGEN_TEST(xROUND.SD(xmm8, xmm2, 3), "66 44 0f 3a 0b c2 03");
CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
@ -343,6 +352,15 @@ TEST(CodegenTests, AVXTest)
CODEGEN_TEST(xPMADD.WD(xmm0, xmm8), "c5 b9 f5 c0"); // => vpmaddwd xmm0, xmm8, xmm0
CODEGEN_TEST(xPMADD.UBSW(xmm0, xmm8), "c4 c2 79 04 c0");
CODEGEN_TEST(xHADD.PS(xmm1, xmm8), "c4 c1 73 7c c8");
CODEGEN_TEST(xHADD.PD(xmm4, ptr[r8]), "c4 c1 59 7c 20");
CODEGEN_TEST(xDP.PS(xmm3, xmm9, 0xf7), "c4 c3 61 40 d9 f7");
CODEGEN_TEST(xDP.PD(xmm8, xmm4, 0x33), "c4 63 39 41 c4 33");
CODEGEN_TEST(xROUND.PS(xmm1, xmm3, 0), "c4 e3 79 08 cb 00");
CODEGEN_TEST(xROUND.PD(xmm3, xmm9, 1), "c4 c3 79 09 d9 01");
CODEGEN_TEST(xROUND.SS(xmm5, xmm2, 2), "c4 e3 51 0a ea 02");
CODEGEN_TEST(xROUND.SD(xmm8, xmm2, 3), "c4 63 39 0b c2 03");
CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");