Common: Switch simd mov to auto SSE/AVX

This commit is contained in:
TellowKrinkle 2025-08-09 19:48:33 -05:00 committed by TellowKrinkle
parent a052a43b84
commit a045c917e7
4 changed files with 129 additions and 76 deletions

View File

@ -44,7 +44,7 @@ namespace x86Emitter
// --------------------------------------------------------------------------------------
// xImplSimd_MoveSSE
// --------------------------------------------------------------------------------------
// Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD
// Legends in their own right: MOVAPS / MOVAPD / MOVUPS / MOVUPD / MOVDQA / MOVDQU
//
// All implementations of Unaligned Movs will, when possible, use aligned movs instead.
// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
@ -52,27 +52,10 @@ namespace x86Emitter
//
struct xImplSimd_MoveSSE
{
u8 Prefix;
bool isAligned;
void operator()(const xRegisterSSE& to, const xRegisterSSE& from) const;
void operator()(const xRegisterSSE& to, const xIndirectVoid& from) const;
void operator()(const xIndirectVoid& to, const xRegisterSSE& from) const;
};
// --------------------------------------------------------------------------------------
// xImplSimd_MoveDQ
// --------------------------------------------------------------------------------------
// Implementations for MOVDQA / MOVDQU
//
// All implementations of Unaligned Movs will, when possible, use aligned movs instead.
// This happens when using Mem,Reg or Reg,Mem forms where the address is simple displacement
// which can be checked for alignment at runtime.
struct xImplSimd_MoveDQ
{
u8 Prefix;
bool isAligned;
SIMDInstructionInfo aligned_load;
SIMDInstructionInfo aligned_store;
SIMDInstructionInfo unaligned_load;
SIMDInstructionInfo unaligned_store;
void operator()(const xRegisterSSE& to, const xRegisterSSE& from) const;
void operator()(const xRegisterSSE& to, const xIndirectVoid& from) const;

View File

@ -486,14 +486,8 @@ namespace x86Emitter
extern const xImplSimd_MoveSSE xMOVUPS;
extern const xImplSimd_MoveSSE xMOVAPD;
extern const xImplSimd_MoveSSE xMOVUPD;
#ifdef ALWAYS_USE_MOVAPS
extern const xImplSimd_MoveSSE xMOVDQA;
extern const xImplSimd_MoveSSE xMOVDQU;
#else
extern const xImplSimd_MoveDQ xMOVDQA;
extern const xImplSimd_MoveDQ xMOVDQU;
#endif
extern const xImplSimd_MovHL xMOVH;
extern const xImplSimd_MovHL xMOVL;

View File

@ -647,53 +647,55 @@ namespace x86Emitter
// PS form: emits with this op's `info` exactly as stored.
void xImplSimd_MovHL_RtoR::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const
{
	EmitSIMD(info, dst, src1, src2);
}
// PD form: same operands, but emits with `info.p66()`
// (presumably the 0x66-prefixed variant of the encoding -- confirm against SIMDInstructionInfo).
void xImplSimd_MovHL_RtoR::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const
{
	EmitSIMD(info.p66(), dst, src1, src2);
}
static const u16 MovPS_OpAligned = 0x28; // Aligned [aps] form
static const u16 MovPS_OpUnaligned = 0x10; // unaligned [ups] form
void xImplSimd_MoveSSE::operator()(const xRegisterSSE& to, const xRegisterSSE& from) const
// Returns true when `mem` can be treated as aligned for `reg` at emit time:
// the address has neither a base nor an index register, and its displacement has
// no bits set under the mask (operand size of `reg` minus one).
static bool IsAligned(const xRegisterSSE& reg, const xIndirectVoid& mem)
{
// NOTE(review): `to`, `from` and `Prefix` are not declared in this function; the next two
// lines look like removed lines of the old register-to-register mov that this diff view
// shows without -/+ markers -- confirm against the commit.
if (to != from)
xOpWrite0F(Prefix, MovPS_OpAligned, to, from);
// Low-bit mask for the register width (assumes GetOperandSize() is a power of two -- TODO confirm).
u32 mask = reg.GetOperandSize() - 1;
// Aligned if it's displacement-only and the displacement is aligned
if (mem.Displacement & mask)
return false;
return mem.Index.IsEmpty() && mem.Base.IsEmpty();
}
void xImplSimd_MoveSSE::operator()(const xRegisterSSE& to, const xIndirectVoid& from) const
// Chooses the encoding table to use for memory loads/stores.
// When x86Emitter::use_avx is set, `op` is returned unchanged. Otherwise `op` is replaced
// by xMOVAPS if its aligned and unaligned load encodings are bit-identical, else by xMOVUPS.
static const xImplSimd_MoveSSE& GetLoadStoreOp(const xImplSimd_MoveSSE* op)
{
// NOTE(review): `isAligned`, `from`, `to` and `Prefix` are not declared in this function; the
// next three lines look like removed lines of the old reg<-mem mov that this diff view shows
// without -/+ markers -- confirm against the commit.
// ModSib form is aligned if it's displacement-only and the displacement is aligned:
bool isReallyAligned = isAligned || (((from.Displacement & 0x0f) == 0) && from.Index.IsEmpty() && from.Base.IsEmpty());
xOpWrite0F(Prefix, isReallyAligned ? MovPS_OpAligned : MovPS_OpUnaligned, to, from);
if (!x86Emitter::use_avx)
{
// movaps is shorter, and no processor differentiates between the various movs for load/store
// An op whose unaligned_load equals its aligned_load never selects an unaligned encoding,
// so it maps to xMOVAPS; any other op maps to xMOVUPS.
const bool aligned = std::bit_cast<u32>(op->aligned_load) == std::bit_cast<u32>(op->unaligned_load);
return aligned ? xMOVAPS : xMOVUPS;
}
return *op;
}
void xImplSimd_MoveSSE::operator()(const xIndirectVoid& to, const xRegisterSSE& from) const
// Register-to-register move.
// Emits nothing when `dst` and `src` have the same id and operand size. Otherwise emits
// with the aligned_load encoding, except in the AVX case handled below.
void xImplSimd_MoveSSE::operator()(const xRegisterSSE& dst, const xRegisterSSE& src) const
{
// NOTE(review): `isAligned`, `to`, `from` and `Prefix` are not declared in this function; the
// next three lines look like removed lines of the old mem<-reg mov that this diff view shows
// without -/+ markers -- confirm against the commit.
// ModSib form is aligned if it's displacement-only and the displacement is aligned:
bool isReallyAligned = isAligned || ((to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty());
xOpWrite0F(Prefix, isReallyAligned ? MovPS_OpAligned + 1 : MovPS_OpUnaligned + 1, from, to);
// Moving a register onto itself is a no-op, so nothing is emitted.
if (dst.GetId() == src.GetId() && dst.GetOperandSize() == src.GetOperandSize())
return;
SIMDInstructionInfo info = aligned_load;
const xRegisterSSE* arg0 = &dst;
const xRegisterSSE* arg1 = &src;
if (x86Emitter::use_avx)
{
// Only when the source is an extended register and the destination is not:
// switch to the store encoding and swap the operand order.
if (arg1->IsExtended() && !arg0->IsExtended())
{
// Can save a byte by using the store opcode
info = aligned_store;
std::swap(arg0, arg1);
}
}
// arg0 is passed in both of the first two register slots
// (presumably destination and first source of the three-operand form -- confirm against EmitSIMD).
EmitSIMD(info, *arg0, *arg0, *arg1);
}
static const u8 MovDQ_PrefixAligned = 0x66; // Aligned [dqa] form
static const u8 MovDQ_PrefixUnaligned = 0xf3; // unaligned [dqu] form
void xImplSimd_MoveDQ::operator()(const xRegisterSSE& to, const xRegisterSSE& from) const
// Load from memory `src` into register `dst`.
// The encoding table comes from GetLoadStoreOp(this); its aligned_load is used when
// IsAligned(dst, src) holds and its unaligned_load otherwise.
void xImplSimd_MoveSSE::operator()(const xRegisterSSE& dst, const xIndirectVoid& src) const
{
// NOTE(review): `to` and `from` are not declared in this function; the next two lines look
// like removed lines of the old xImplSimd_MoveDQ register-to-register mov that this diff
// view shows without -/+ markers -- confirm against the commit.
if (to != from)
xOpWrite0F(MovDQ_PrefixAligned, 0x6f, to, from);
const xImplSimd_MoveSSE& op = GetLoadStoreOp(this);
EmitSIMD(IsAligned(dst, src) ? op.aligned_load : op.unaligned_load, dst, dst, src);
}
void xImplSimd_MoveDQ::operator()(const xRegisterSSE& to, const xIndirectVoid& from) const
// Store register `src` to memory `dst`.
// The encoding table comes from GetLoadStoreOp(this); its aligned_store is used when
// IsAligned(src, dst) holds and its unaligned_store otherwise.
void xImplSimd_MoveSSE::operator()(const xIndirectVoid& dst, const xRegisterSSE& src) const
{
// NOTE(review): from here down to the second xOpWrite0F call, the lines reference names that
// are not declared in this function (`isAligned`, `from`, `to`) and include a full
// xImplSimd_MoveDQ definition; they look like removed lines that this diff view shows
// without -/+ markers -- confirm against the commit.
// ModSib form is aligned if it's displacement-only and the displacement is aligned:
bool isReallyAligned = isAligned || ((from.Displacement & 0x0f) == 0 && from.Index.IsEmpty() && from.Base.IsEmpty());
xOpWrite0F(isReallyAligned ? MovDQ_PrefixAligned : MovDQ_PrefixUnaligned, 0x6f, to, from);
}
void xImplSimd_MoveDQ::operator()(const xIndirectVoid& to, const xRegisterSSE& from) const
{
// ModSib form is aligned if it's displacement-only and the displacement is aligned:
bool isReallyAligned = isAligned || ((to.Displacement & 0x0f) == 0 && to.Index.IsEmpty() && to.Base.IsEmpty());
// use opcode 0x7f : alternate ModRM encoding (reverse src/dst)
xOpWrite0F(isReallyAligned ? MovDQ_PrefixAligned : MovDQ_PrefixUnaligned, 0x7f, from, to);
const xImplSimd_MoveSSE& op = GetLoadStoreOp(this);
// Both branches read from `op`, mirroring the load overload. Taking aligned_store from
// `this` would skip the xMOVAPS/xMOVUPS substitution that GetLoadStoreOp performs when
// AVX is off, but only for addresses IsAligned accepts.
EmitSIMD(IsAligned(src, dst) ? op.aligned_store : op.unaligned_store, src, src, dst);
}
void xImplSimd_PMove::BW(const xRegisterSSE& to, const xRegisterSSE& from) const { OpWriteSSE(0x66, OpcodeBase); }
@ -715,21 +717,39 @@ namespace x86Emitter
void xImplSimd_PMove::DQ(const xRegisterSSE& to, const xIndirect64& from) const { OpWriteSSE(0x66, OpcodeBase + 0x500); }
const xImplSimd_MoveSSE xMOVAPS = {0x00, true};
const xImplSimd_MoveSSE xMOVUPS = {0x00, false};
// Initializers follow the struct's member order:
//   aligned_load, aligned_store,
//   unaligned_load, unaligned_store
// xMOVAPS uses opcodes 0x28 (load) / 0x29 (store) in all four slots.
const xImplSimd_MoveSSE xMOVAPS = {
SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
};
// xMOVUPS keeps 0x28/0x29 in the aligned slots and uses 0x10 (load) / 0x11 (store)
// in the unaligned slots.
const xImplSimd_MoveSSE xMOVUPS = {
SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
SIMDInstructionInfo(0x10).mov(), SIMDInstructionInfo(0x11).mov(),
};
#ifdef ALWAYS_USE_MOVAPS
const xImplSimd_MoveSSE xMOVDQA = {0x00, true};
const xImplSimd_MoveSSE xMOVAPD = {0x00, true};
const xImplSimd_MoveSSE xMOVDQA = xMOVAPS;
const xImplSimd_MoveSSE xMOVAPD = xMOVAPS;
const xImplSimd_MoveSSE xMOVDQU = {0x00, false};
const xImplSimd_MoveSSE xMOVUPD = {0x00, false};
const xImplSimd_MoveSSE xMOVDQU = xMOVUPS;
const xImplSimd_MoveSSE xMOVUPD = xMOVUPS;
#else
const xImplSimd_MoveDQ xMOVDQA = {0x66, true};
const xImplSimd_MoveSSE xMOVAPD = {0x66, true};
// Variant used when ALWAYS_USE_MOVAPS is not defined.
// Slot order: aligned_load, aligned_store, unaligned_load, unaligned_store.
// xMOVDQA uses opcodes 0x6f (load) / 0x7f (store) with p66() in all four slots
// (p66 presumably selects the 0x66 prefix -- confirm against SIMDInstructionInfo).
const xImplSimd_MoveSSE xMOVDQA = {
SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
};
// xMOVDQU shares the aligned slots with xMOVDQA; its unaligned slots use pf3() instead
// (presumably the 0xf3 prefix -- confirm against SIMDInstructionInfo).
const xImplSimd_MoveSSE xMOVDQU = {
SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
SIMDInstructionInfo(0x6f).pf3().mov(), SIMDInstructionInfo(0x7f).pf3().mov(),
};
const xImplSimd_MoveDQ xMOVDQU = {0xf3, false};
const xImplSimd_MoveSSE xMOVUPD = {0x66, false};
// Variant used when ALWAYS_USE_MOVAPS is not defined.
// Slot order: aligned_load, aligned_store, unaligned_load, unaligned_store.
// xMOVAPD uses opcodes 0x28 (load) / 0x29 (store) with p66() in all four slots.
const xImplSimd_MoveSSE xMOVAPD = {
SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
};
// xMOVUPD shares the aligned slots with xMOVAPD; its unaligned slots use
// 0x10 (load) / 0x11 (store), also with p66().
const xImplSimd_MoveSSE xMOVUPD = {
SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
SIMDInstructionInfo(0x10).p66().mov(), SIMDInstructionInfo(0x11).p66().mov(),
};
#endif

View File

@ -337,10 +337,32 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST(xMOVHL.PS(xmm4, xmm9), "41 0f 12 e1");
CODEGEN_TEST(xMOVLH.PS(xmm2, xmm1), "0f 16 d1");
CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
CODEGEN_TEST(xMOVAPS(ptr128[rax+r9], xmm8), "46 0f 29 04 08");
CODEGEN_TEST(xMOVAPS(xmm0, xmm8), "41 0f 28 c0");
CODEGEN_TEST(xMOVUPS(xmm8, xmm3), "44 0f 28 c3");
CODEGEN_TEST(xMOVAPS(ptr[r8], xmm4), "41 0f 29 20");
CODEGEN_TEST(xMOVUPS(ptr[rax], xmm5), "0f 11 28");
CODEGEN_TEST(xMOVAPS(xmm8, ptr[r8]), "45 0f 28 00");
CODEGEN_TEST(xMOVUPS(xmm5, ptr[r9]), "41 0f 10 29");
CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "44 0f 29 01");
CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "45 0f 11 18");
CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "45 0f 28 39");
CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "0f 10 08");
CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0), "41 0f 29 01");
CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3), "41 0f 11 18");
CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "44 0f 28 06");
CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "0f 10 39");
#ifdef ALWAYS_USE_MOVAPS
CODEGEN_TEST(xMOVAPD(xmm4, xmm8), "41 0f 28 e0");
CODEGEN_TEST(xMOVUPD(xmm1, xmm4), "0f 28 cc");
CODEGEN_TEST(xMOVDQA(xmm9, xmm11), "45 0f 28 cb");
CODEGEN_TEST(xMOVDQU(xmm7, xmm10), "41 0f 28 fa");
#else
CODEGEN_TEST(xMOVAPD(xmm4, xmm8), "66 41 0f 28 e0");
CODEGEN_TEST(xMOVUPD(xmm1, xmm4), "66 0f 28 cc");
CODEGEN_TEST(xMOVDQA(xmm9, xmm11), "66 45 0f 6f cb");
CODEGEN_TEST(xMOVDQU(xmm7, xmm10), "66 41 0f 6f fa");
#endif
CODEGEN_TEST(xBLEND.PS(xmm0, xmm1, 0x55), "66 0f 3a 0c c1 55");
CODEGEN_TEST(xBLEND.PD(xmm8, xmm9, 0xaa), "66 45 0f 3a 0d c1 aa");
CODEGEN_TEST(xPBLEND.W(xmm0, xmm1, 0x55), "66 0f 3a 0e c1 55");
@ -545,6 +567,40 @@ TEST(CodegenTests, AVXTest)
CODEGEN_TEST(xMOVHL.PS(xmm4, xmm9), "c4 c1 58 12 e1");
CODEGEN_TEST(xMOVLH.PS(xmm2, xmm1), "c5 e8 16 d1");
CODEGEN_TEST(xMOVAPS(xmm0, xmm8), "c5 78 29 c0");
CODEGEN_TEST(xMOVUPS(xmm8, xmm3), "c5 78 28 c3");
CODEGEN_TEST(xMOVAPS(ptr[r8], xmm4), "c4 c1 78 29 20");
CODEGEN_TEST(xMOVUPS(ptr[rax], xmm5), "c5 f8 11 28");
CODEGEN_TEST(xMOVAPS(xmm8, ptr[r8]), "c4 41 78 28 00");
CODEGEN_TEST(xMOVUPS(xmm5, ptr[r9]), "c4 c1 78 10 29");
#ifdef ALWAYS_USE_MOVAPS
CODEGEN_TEST(xMOVAPD(xmm4, xmm8), "c5 78 29 c4");
CODEGEN_TEST(xMOVUPD(xmm1, xmm4), "c5 f8 28 cc");
CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "c5 78 29 01");
CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "c4 41 78 11 18");
CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "c4 41 78 28 39");
CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "c5 f8 10 08");
CODEGEN_TEST(xMOVDQA(xmm9, xmm11), "c4 41 78 28 cb");
CODEGEN_TEST(xMOVDQU(xmm7, xmm10), "c5 78 29 d7");
CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0), "c4 c1 78 29 01");
CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3), "c4 c1 78 11 18");
CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "c5 78 28 06");
CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "c5 f8 10 39");
#else
CODEGEN_TEST(xMOVAPD(xmm4, xmm8), "c5 79 29 c4");
CODEGEN_TEST(xMOVUPD(xmm1, xmm4), "c5 f9 28 cc");
CODEGEN_TEST(xMOVAPD(ptr[rcx], xmm8), "c5 79 29 01");
CODEGEN_TEST(xMOVUPD(ptr[r8], xmm11), "c4 41 79 11 18");
CODEGEN_TEST(xMOVAPD(xmm15, ptr[r9]), "c4 41 79 28 39");
CODEGEN_TEST(xMOVUPD(xmm1, ptr[rax]), "c5 f9 10 08");
CODEGEN_TEST(xMOVDQA(xmm9, xmm11), "c4 41 79 6f cb");
CODEGEN_TEST(xMOVDQU(xmm7, xmm10), "c5 79 7f d7");
CODEGEN_TEST(xMOVDQA(ptr[r9], xmm0), "c4 c1 79 7f 01");
CODEGEN_TEST(xMOVDQU(ptr[r8], xmm3), "c4 c1 7a 7f 18");
CODEGEN_TEST(xMOVDQA(xmm8, ptr[rsi]), "c5 79 6f 06");
CODEGEN_TEST(xMOVDQU(xmm7, ptr[rcx]), "c5 fa 6f 39");
#endif
CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");