// SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
// SPDX-License-Identifier: GPL-3.0+
#include "common/emitter/internal.h"
#include "common/VectorIntrin.h"
namespace x86Emitter
{
__emitinline static u8 getSSE(SIMDInstructionInfo::Prefix prefix)
{
	switch (prefix)
	{
		case SIMDInstructionInfo::Prefix::P66: return 0x66;
		case SIMDInstructionInfo::Prefix::PF3: return 0xf3;
		case SIMDInstructionInfo::Prefix::PF2: return 0xf2;
		case SIMDInstructionInfo::Prefix::None:
		default:
			pxAssert(0);
			return 0;
	}
}

__emitinline static u16 getSSE(SIMDInstructionInfo::Map map)
{
	switch (map)
	{
		case SIMDInstructionInfo::Map::M0F38: return 0x380f;
		case SIMDInstructionInfo::Map::M0F3A: return 0x3a0f;
		case SIMDInstructionInfo::Map::M0F:
		default:
			pxAssert(0);
			return 0;
	}
}

__emitinline static SIMDInstructionInfo getMov(SIMDInstructionInfo::Type type)
{
	switch (type)
	{
		case SIMDInstructionInfo::Type::Integer:
			return SIMDInstructionInfo(0x6f).p66().mov();
		case SIMDInstructionInfo::Type::Double:
			return SIMDInstructionInfo(0x28).p66().mov();
		default:
		case SIMDInstructionInfo::Type::Float:
			return SIMDInstructionInfo(0x28).mov();
	}
}
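// Illustrative mapping of the helper above: Integer selects movdqa (66 0F 6F),
// Double selects movapd (66 0F 28), and Float (also the default) selects
// movaps (0F 28). These movs are only used to synthesize the destructive
// two-operand SSE forms in the emit helpers below.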
template <typename T1, typename T2>
__emitinline static void xOpWrite0F(SIMDInstructionInfo info, T1 dst, const T2& src, int extraRIPOffset)
{
	if (info.prefix != SIMDInstructionInfo::Prefix::None)
		xWrite8(getSSE(info.prefix));
	pxAssert(!info.w_bit); // Only used by AVX
	EmitRex(info, dst, src);
	if (info.map == SIMDInstructionInfo::Map::M0F)
	{
		xWrite16(0x0F | (info.opcode << 8));
	}
	else
	{
		xWrite16(getSSE(info.map));
		xWrite8(info.opcode);
	}
	EmitSibMagic(dst, src, extraRIPOffset);
}
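// Resulting legacy (non-VEX) byte layout, shown for illustration with
// PTEST xmm0, xmm1 (66 prefix, 0F 38 map, opcode 0x17):
//   66 [REX] 0F 38 17 C1
// The 16-bit writes rely on little-endian ordering to land the 0F escape
// byte ahead of the map/opcode bytes.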
template <typename S2>
__emitinline static void EmitSimdOp(SIMDInstructionInfo info, const xRegisterBase& dst, const xRegisterBase& src1, const S2& src2, int extraRIPOffset)
{
	if (x86Emitter::use_avx)
	{
		EmitVEX(info, dst, info.is_mov ? 0 : src1.GetId(), src2, extraRIPOffset);
	}
	else
	{
		if (dst.GetId() != src1.GetId())
		{
			pxAssert(!info.is_mov);
			// Generate a mov to copy from src1 to dst
			xOpWrite0F(getMov(info.type), dst, src1, 0);
		}
		xOpWrite0F(info, dst, src2, extraRIPOffset);
	}
}
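// Lowering sketch (illustrative; the xmm aliases come from the emitter headers):
//   with AVX:    xPADD.D(xmm0, xmm1, xmm2) -> vpaddd xmm0, xmm1, xmm2
//   without AVX: xPADD.D(xmm0, xmm1, xmm2) -> movdqa xmm0, xmm1
//                                             paddd  xmm0, xmm2
// When dst already aliases src1, the SSE path emits the op alone.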
void EmitSIMDImpl(SIMDInstructionInfo info, const xRegisterBase& dst, const xRegisterBase& src1, int extraRipOffset)
{
	pxAssert(!info.is_mov);
	if (x86Emitter::use_avx)
	{
		EmitVEX(info, info.ext, dst.GetId(), src1, extraRipOffset);
	}
	else
	{
		if (dst.GetId() != src1.GetId())
		{
			// Generate a mov to copy from src1 to dst
			xOpWrite0F(getMov(info.type), dst, src1, 0);
		}
		xOpWrite0F(info, info.ext, dst, extraRipOffset);
	}
}
void EmitSIMDImpl(SIMDInstructionInfo info, const xRegisterBase& dst, const xRegisterBase& src1, const xRegisterBase& src2, int extraRipOffset)
{
	pxAssert(!info.is_mov || dst.GetId() == src1.GetId());
	const xRegisterBase* ps1 = &src1;
	const xRegisterBase* ps2 = &src2;
	if (x86Emitter::use_avx)
	{
		if (info.is_commutative && info.map == SIMDInstructionInfo::Map::M0F && src2.IsExtended() && !src1.IsExtended())
		{
			// We can use a C5 op instead of a C4 op if we swap the inputs
			std::swap(ps1, ps2);
		}
	}
	else if (dst.GetId() != src1.GetId() && dst.GetId() == src2.GetId())
	{
		if (info.is_commutative)
			std::swap(ps1, ps2);
		else
			pxAssertRel(0, "SSE4 auto mov would destroy the second source!");
	}
	EmitSimdOp(info, dst, *ps1, *ps2, extraRipOffset);
}
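// Example of the SSE swap above (illustrative): xPADD.D(xmm1, xmm0, xmm1) has
// dst aliasing the second source, so the commutative sources are exchanged and
// a single "paddd xmm1, xmm0" comes out with no copy. A non-commutative op in
// the same situation has no safe two-operand lowering, hence the assert.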
void EmitSIMDImpl(SIMDInstructionInfo info, const xRegisterBase& dst, const xRegisterBase& src1, const xIndirectVoid& src2, int extraRipOffset)
{
	pxAssert(!info.is_mov || dst.GetId() == src1.GetId());
	if (!x86Emitter::use_avx && info.is_commutative && dst.GetId() != src1.GetId())
	{
		// Do load, op instead of mov, op+load
		// No processors differentiate between loads, so always use movaps
		EmitSimdOp(getMov(SIMDInstructionInfo::Type::Float), dst, dst, src2, 0);
		EmitSimdOp(info, dst, dst, src1, extraRipOffset);
	}
	else
	{
		EmitSimdOp(info, dst, src1, src2, extraRipOffset);
	}
}
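// Example (illustrative, hypothetical call site): without AVX,
// xPADD.D(xmm0, xmm1, ptr[rax]) becomes "movaps xmm0, [rax]" followed by
// "paddd xmm0, xmm1" instead of a register copy plus a load-op. movaps is
// safe here because the original op's m128 operand already had to be
// 16-byte aligned under legacy SSE rules.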
void EmitSIMD(SIMDInstructionInfo info, const xRegisterBase& dst, const xRegisterBase& src1, const xRegisterBase& src2, const xRegisterBase& src3)
{
	pxAssert(!info.is_mov);
	pxAssertMsg(!info.is_commutative, "I don't think any blend instructions are commutative...");
	if (x86Emitter::use_avx)
	{
		EmitSimdOp(info, dst, src1, src2, 1);
		xWrite8(src3.GetId() << 4); // /is4 immediate: register id goes in bits [7:4]
	}
	else
	{
		pxAssertRel(src3.GetId() == 0, "SSE4 requires the third source to be xmm0!");
		if (dst.GetId() != src1.GetId() && dst.GetId() == src2.GetId())
			pxAssertRel(0, "SSE4 auto mov would destroy the second source!");
		EmitSimdOp(info, dst, src1, src2, 0);
	}
}
void EmitSIMD(SIMDInstructionInfo info, const xRegisterBase& dst, const xRegisterBase& src1, const xIndirectVoid& src2, const xRegisterBase& src3)
{
	pxAssert(!info.is_mov);
	pxAssertMsg(!info.is_commutative, "I don't think any blend instructions are commutative...");
	if (x86Emitter::use_avx)
	{
		EmitSimdOp(info, dst, src1, src2, 1);
		xWrite8(src3.GetId() << 4); // /is4 immediate: register id goes in bits [7:4]
	}
	else
	{
		pxAssertRel(src3.GetId() == 0, "SSE4 requires the third source to be xmm0!");
		EmitSimdOp(info, dst, src1, src2, 0);
	}
}
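// Immediate-encoding sketch (illustrative): with AVX,
// xBLEND.VPS(xmm1, xmm2, xmm3, xmm4) ends in an /is4 byte of 0x40 (xmm4
// encoded in bits [7:4]); the SSE4 BLENDVPS form carries no immediate and
// implicitly reads xmm0 as the blend mask, which the asserts above enforce.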
// ------------------------------------------------------------------------
// SimdPrefix - If the lower byte of the opcode is 0x38 or 0x3a, the opcode is
// treated as a 16 bit value (in SSE, 0x38 and 0x3a denote prefixes for extended
// SSE3/4 instructions). For any other lower byte, the upper byte is assumed to
// be 0 and is ignored; a non-zero upper byte in that case generates an assertion.
//
__emitinline void SimdPrefix(u8 prefix, u16 opcode)
{
	pxAssertMsg(prefix == 0, "REX prefix must be just before the opcode");
	const bool is16BitOpcode = ((opcode & 0xff) == 0x38) || ((opcode & 0xff) == 0x3a);
	// If the lower byte is not a valid prefix and the upper byte is non-zero it
	// means we made a mistake!
	if (!is16BitOpcode)
		pxAssert((opcode >> 8) == 0);
	if (prefix != 0)
	{
		if (is16BitOpcode)
		{
			xWrite32((opcode << 16) | 0x0f00 | prefix);
		}
		else
		{
			xWrite16(0x0f00 | prefix);
			xWrite8(opcode);
		}
	}
	else
	{
		if (is16BitOpcode)
		{
			xWrite8(0x0f);
			xWrite16(opcode);
		}
		else
		{
			xWrite16((opcode << 8) | 0x0f);
		}
	}
}
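// Example (illustrative): SimdPrefix(0x66, 0x1738) packs everything into one
// little-endian u32 write and emits 66 0F 38 17, the escape sequence for
// PTEST; SimdPrefix(0, 0x58) emits just 0F 58 (ADDPS).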
// clang-format off
const xImplSimd_3Arg xPAND = {SIMDInstructionInfo(0xdb).i().p66().commutative()};
const xImplSimd_3Arg xPANDN = {SIMDInstructionInfo(0xdf).i().p66()};
const xImplSimd_3Arg xPOR = {SIMDInstructionInfo(0xeb).i().p66().commutative()};
const xImplSimd_3Arg xPXOR = {SIMDInstructionInfo(0xef).i().p66().commutative()};
// [SSE-4.1] Performs a bitwise AND of dest against src, and sets the ZF flag
// only if all bits in the result are 0. PTEST also sets the CF flag according
// to the following condition: (xmm2/m128 AND NOT xmm1) == 0;
const xImplSimd_2Arg xPTEST = {SIMDInstructionInfo(0x17).p66().m0f38()};
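// Usage sketch (hypothetical call site): xPTEST(xmm0, xmm1) followed by a
// JZ-style branch implements "branch if (xmm0 AND xmm1) == 0".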
// =====================================================================================================
// SSE Conversion Operations, as looney as they are.
// =====================================================================================================
// These enforce pointer strictness for Indirect forms, due to the otherwise completely confusing
// nature of the functions. (so if a function expects an m32, you must use (u32*) or ptr32[]).
//
__fi void xCVTDQ2PD(const xRegisterSSE& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0xe6).pf3()); }
__fi void xCVTDQ2PD(const xRegisterSSE& to, const xIndirect64& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0xe6).pf3()); }
__fi void xCVTDQ2PS(const xRegisterSSE& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5b)); }
__fi void xCVTDQ2PS(const xRegisterSSE& to, const xIndirect128& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5b)); }
__fi void xCVTPD2DQ(const xRegisterSSE& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0xe6).pf2()); }
__fi void xCVTPD2DQ(const xRegisterSSE& to, const xIndirect128& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0xe6).pf2()); }
__fi void xCVTPD2PS(const xRegisterSSE& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5a).p66()); }
__fi void xCVTPD2PS(const xRegisterSSE& to, const xIndirect128& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5a).p66()); }
__fi void xCVTPI2PD(const xRegisterSSE& to, const xIndirect64& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2a).p66()); }
__fi void xCVTPI2PS(const xRegisterSSE& to, const xIndirect64& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2a)); }
__fi void xCVTPS2DQ(const xRegisterSSE& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5b).p66()); }
__fi void xCVTPS2DQ(const xRegisterSSE& to, const xIndirect128& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5b).p66()); }
__fi void xCVTPS2PD(const xRegisterSSE& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5a)); }
__fi void xCVTPS2PD(const xRegisterSSE& to, const xIndirect64& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5a)); }
__fi void xCVTSD2SI(const xRegister32or64& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2d).dstw().pf2()); }
__fi void xCVTSD2SI(const xRegister32or64& to, const xIndirect64& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2d).dstw().pf2()); }
__fi void xCVTSD2SS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) { EmitSIMD(SIMDInstructionInfo(0x5a).pf2(), dst, src1, src2); }
__fi void xCVTSD2SS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect64& src2) { EmitSIMD(SIMDInstructionInfo(0x5a).pf2(), dst, src1, src2); }
__fi void xCVTSI2SS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32or64& src2) { EmitSIMD(SIMDInstructionInfo(0x2a).srcw().pf3(), dst, src1, src2); }
__fi void xCVTSI2SS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2) { EmitSIMD(SIMDInstructionInfo(0x2a).srcw().pf3(), dst, src1, src2); }
__fi void xCVTSI2SS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect64& src2) { EmitSIMD(SIMDInstructionInfo(0x2a).srcw().pf3(), dst, src1, src2); }
__fi void xCVTSS2SI(const xRegister32or64& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2d).dstw().pf3()); }
__fi void xCVTSS2SI(const xRegister32or64& to, const xIndirect32& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2d).dstw().pf3()); }
__fi void xCVTSS2SD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) { EmitSIMD(SIMDInstructionInfo(0x5a).pf3(), dst, src1, src2); }
__fi void xCVTSS2SD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2) { EmitSIMD(SIMDInstructionInfo(0x5a).pf3(), dst, src1, src2); }
__fi void xCVTTPD2DQ(const xRegisterSSE& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0xe6).p66()); }
__fi void xCVTTPD2DQ(const xRegisterSSE& to, const xIndirect128& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0xe6).p66()); }
__fi void xCVTTPS2DQ(const xRegisterSSE& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5b).pf3()); }
__fi void xCVTTPS2DQ(const xRegisterSSE& to, const xIndirect128& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x5b).pf3()); }
__fi void xCVTTSD2SI(const xRegister32or64& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2c).dstw().pf2()); }
__fi void xCVTTSD2SI(const xRegister32or64& to, const xIndirect64& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2c).dstw().pf2()); }
__fi void xCVTTSS2SI(const xRegister32or64& to, const xRegisterSSE& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2c).dstw().pf3()); }
__fi void xCVTTSS2SI(const xRegister32or64& to, const xIndirect32& from) { OpWriteSIMDMovOp(SIMDInstructionInfo(0x2c).dstw().pf3()); }
// ------------------------------------------------------------------------
void xImplSimd_2Arg::operator()(const xRegisterSSE& dst, const xRegisterSSE& src) const { EmitSIMD(info, dst, dst, src); }
void xImplSimd_2Arg::operator()(const xRegisterSSE& dst, const xIndirectVoid& src) const { EmitSIMD(info, dst, dst, src); }
void xImplSimd_2ArgImm::operator()(const xRegisterSSE& dst, const xRegisterSSE& src, u8 imm) const { EmitSIMD(info, dst, dst, src, imm); }
void xImplSimd_2ArgImm::operator()(const xRegisterSSE& dst, const xIndirectVoid& src, u8 imm) const { EmitSIMD(info, dst, dst, src, imm); }
void xImplSimd_3Arg::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(info, dst, src1, src2); }
void xImplSimd_3Arg::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2) const { EmitSIMD(info, dst, src1, src2); }
void xImplSimd_3ArgImm::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 imm) const { EmitSIMD(info, dst, src1, src2, imm); }
void xImplSimd_3ArgImm::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, u8 imm) const { EmitSIMD(info, dst, src1, src2, imm); }
void xImplSimd_3ArgCmp::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, SSE2_ComparisonType imm) const { EmitSIMD(info, dst, src1, src2, imm); }
void xImplSimd_3ArgCmp::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, SSE2_ComparisonType imm) const { EmitSIMD(info, dst, src1, src2, imm); }
void xImplSimd_4ArgBlend::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, const xRegisterSSE& src3) const { EmitSIMD(x86Emitter::use_avx ? avx : sse, dst, src1, src2, src3); }
void xImplSimd_4ArgBlend::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, const xRegisterSSE& src3) const { EmitSIMD(x86Emitter::use_avx ? avx : sse, dst, src1, src2, src3); }
// =====================================================================================================
// SIMD Arithmetic Instructions
// =====================================================================================================
void _SimdShiftHelper::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(info, dst, src1, src2); }
void _SimdShiftHelper::operator()(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2) const { EmitSIMD(info, dst, src1, src2); }
void _SimdShiftHelper::operator()(const xRegisterSSE& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(infoImm, dst, src, imm8); }
void xImplSimd_Shift::DQ(const xRegisterSSE& dst, const xRegisterSSE& src, u8 imm8) const
{
	SIMDInstructionInfo info = Q.infoImm;
	info.ext += 1;
	EmitSIMD(info, dst, src, imm8);
}
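// The ext bump works because the byte-shift forms share an opcode with the
// quadword immediate shifts and differ only in the ModRM reg field:
// PSRLQ imm is 66 0F 73 /2 vs PSRLDQ /3, and PSLLQ is 66 0F 73 /6 vs PSLLDQ /7.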
const xImplSimd_ShiftWithoutQ xPSRA =
{
	{SIMDInstructionInfo(0xe1).p66().i(), SIMDInstructionInfo(0x71, 4).p66().i()}, // W
	{SIMDInstructionInfo(0xe2).p66().i(), SIMDInstructionInfo(0x72, 4).p66().i()}, // D
};

const xImplSimd_Shift xPSRL =
{
	{SIMDInstructionInfo(0xd1).p66().i(), SIMDInstructionInfo(0x71, 2).p66().i()}, // W
	{SIMDInstructionInfo(0xd2).p66().i(), SIMDInstructionInfo(0x72, 2).p66().i()}, // D
	{SIMDInstructionInfo(0xd3).p66().i(), SIMDInstructionInfo(0x73, 2).p66().i()}, // Q
};

const xImplSimd_Shift xPSLL =
{
	{SIMDInstructionInfo(0xf1).p66().i(), SIMDInstructionInfo(0x71, 6).p66().i()}, // W
	{SIMDInstructionInfo(0xf2).p66().i(), SIMDInstructionInfo(0x72, 6).p66().i()}, // D
	{SIMDInstructionInfo(0xf3).p66().i(), SIMDInstructionInfo(0x73, 6).p66().i()}, // Q
};

const xImplSimd_AddSub xPADD =
{
	{SIMDInstructionInfo(0xfc).p66().i().commutative()}, // B
	{SIMDInstructionInfo(0xfd).p66().i().commutative()}, // W
	{SIMDInstructionInfo(0xfe).p66().i().commutative()}, // D
	{SIMDInstructionInfo(0xd4).p66().i().commutative()}, // Q
	{SIMDInstructionInfo(0xec).p66().i().commutative()}, // SB
	{SIMDInstructionInfo(0xed).p66().i().commutative()}, // SW
	{SIMDInstructionInfo(0xdc).p66().i().commutative()}, // USB
	{SIMDInstructionInfo(0xdd).p66().i().commutative()}, // USW
};

const xImplSimd_AddSub xPSUB =
{
	{SIMDInstructionInfo(0xf8).p66().i()}, // B
	{SIMDInstructionInfo(0xf9).p66().i()}, // W
	{SIMDInstructionInfo(0xfa).p66().i()}, // D
	{SIMDInstructionInfo(0xfb).p66().i()}, // Q
	{SIMDInstructionInfo(0xe8).p66().i()}, // SB
	{SIMDInstructionInfo(0xe9).p66().i()}, // SW
	{SIMDInstructionInfo(0xd8).p66().i()}, // USB
	{SIMDInstructionInfo(0xd9).p66().i()}, // USW
};

const xImplSimd_PMul xPMUL =
{
	{SIMDInstructionInfo(0xd5).p66().i().commutative()}, // LW
	{SIMDInstructionInfo(0xe5).p66().i().commutative()}, // HW
	{SIMDInstructionInfo(0xe4).p66().i().commutative()}, // HUW
	{SIMDInstructionInfo(0xf4).p66().i().commutative()}, // UDQ
	{SIMDInstructionInfo(0x0b).p66().m0f38().i().commutative()}, // HRSW
	{SIMDInstructionInfo(0x40).p66().m0f38().i().commutative()}, // LD
	{SIMDInstructionInfo(0x28).p66().m0f38().i().commutative()}, // DQ
};

const xImplSimd_rSqrt xRSQRT =
{
	{SIMDInstructionInfo(0x52)}, // PS
	{SIMDInstructionInfo(0x52).pf3()}, // SS
};

const xImplSimd_rSqrt xRCP =
{
	{SIMDInstructionInfo(0x53)}, // PS
	{SIMDInstructionInfo(0x53).pf3()}, // SS
};
const xImplSimd_Sqrt xSQRT =
{
	{SIMDInstructionInfo(0x51)}, // PS
	{SIMDInstructionInfo(0x51).pf3()}, // SS
	{SIMDInstructionInfo(0x51).p66()}, // PD
	{SIMDInstructionInfo(0x51).pf2()}, // SD
};
const xImplSimd_AndNot xANDN =
{
	{SIMDInstructionInfo(0x55)}, // PS
	{SIMDInstructionInfo(0x55).p66()}, // PD
};

const xImplSimd_PAbsolute xPABS =
{
	{SIMDInstructionInfo(0x1c).p66().m0f38().i()}, // B
	{SIMDInstructionInfo(0x1d).p66().m0f38().i()}, // W
	{SIMDInstructionInfo(0x1e).p66().m0f38().i()}, // D
};

const xImplSimd_PSign xPSIGN =
{
	{SIMDInstructionInfo(0x08).p66().m0f38().i()}, // B
	{SIMDInstructionInfo(0x09).p66().m0f38().i()}, // W
	{SIMDInstructionInfo(0x0a).p66().m0f38().i()}, // D
};

const xImplSimd_PMultAdd xPMADD =
{
	{SIMDInstructionInfo(0xf5).p66().i().commutative()}, // WD
	{SIMDInstructionInfo(0x04).p66().m0f38().i().commutative()}, // UBSW
};

const xImplSimd_HorizAdd xHADD =
{
	{SIMDInstructionInfo(0x7c).pf2()}, // PS
	{SIMDInstructionInfo(0x7c).p66()}, // PD
};

const xImplSimd_DotProduct xDP =
{
	{SIMDInstructionInfo(0x40).p66().m0f3a().commutative()}, // PS
	{SIMDInstructionInfo(0x41).p66().m0f3a().commutative()}, // PD
};

const xImplSimd_Round xROUND =
{
	{SIMDInstructionInfo(0x08).p66().m0f3a()}, // PS
	{SIMDInstructionInfo(0x09).p66().m0f3a()}, // PD
	{SIMDInstructionInfo(0x0a).p66().m0f3a()}, // SS
	{SIMDInstructionInfo(0x0b).p66().m0f3a()}, // SD
};
// =====================================================================================================
// SIMD Comparison Instructions
// =====================================================================================================
void xImplSimd_Compare::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(SIMDInstructionInfo(0xc2).f(), dst, src1, src2, CType); }
void xImplSimd_Compare::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2) const { EmitSIMD(SIMDInstructionInfo(0xc2).f(), dst, src1, src2, CType); }
void xImplSimd_Compare::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(SIMDInstructionInfo(0xc2).d().p66(), dst, src1, src2, CType); }
void xImplSimd_Compare::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2) const { EmitSIMD(SIMDInstructionInfo(0xc2).d().p66(), dst, src1, src2, CType); }
void xImplSimd_Compare::SS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(SIMDInstructionInfo(0xc2).f().pf3(), dst, src1, src2, CType); }
void xImplSimd_Compare::SS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2) const { EmitSIMD(SIMDInstructionInfo(0xc2).f().pf3(), dst, src1, src2, CType); }
void xImplSimd_Compare::SD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(SIMDInstructionInfo(0xc2).d().pf2(), dst, src1, src2, CType); }
void xImplSimd_Compare::SD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2) const { EmitSIMD(SIMDInstructionInfo(0xc2).d().pf2(), dst, src1, src2, CType); }
const xImplSimd_MinMax xMIN =
{
	{SIMDInstructionInfo(0x5d).f()}, // PS
	{SIMDInstructionInfo(0x5d).d().p66()}, // PD
	{SIMDInstructionInfo(0x5d).f().pf3()}, // SS
	{SIMDInstructionInfo(0x5d).d().pf2()}, // SD
};

const xImplSimd_MinMax xMAX =
{
	{SIMDInstructionInfo(0x5f).f()}, // PS
	{SIMDInstructionInfo(0x5f).d().p66()}, // PD
	{SIMDInstructionInfo(0x5f).f().pf3()}, // SS
	{SIMDInstructionInfo(0x5f).d().pf2()}, // SD
};
// [TODO] : Merge this into the xCMP class, so that they are notated as: xCMP.EQ
const xImplSimd_Compare xCMPEQ = {SSE2_Equal};
const xImplSimd_Compare xCMPLT = {SSE2_Less};
const xImplSimd_Compare xCMPLE = {SSE2_LessOrEqual};
const xImplSimd_Compare xCMPUNORD = {SSE2_Unordered};
const xImplSimd_Compare xCMPNE = {SSE2_NotEqual};
const xImplSimd_Compare xCMPNLT = {SSE2_NotLess};
const xImplSimd_Compare xCMPNLE = {SSE2_NotLessOrEqual};
const xImplSimd_Compare xCMPORD = {SSE2_Ordered};
const xImplSimd_COMI xCOMI =
{
	{SIMDInstructionInfo(0x2f)}, // SS
	{SIMDInstructionInfo(0x2f).p66()}, // SD
};

const xImplSimd_COMI xUCOMI =
{
	{SIMDInstructionInfo(0x2e)}, // SS
	{SIMDInstructionInfo(0x2e).p66()}, // SD
};
const xImplSimd_PCompare xPCMP =
{
	{SIMDInstructionInfo(0x74).i().p66().commutative()}, // EQB
	{SIMDInstructionInfo(0x75).i().p66().commutative()}, // EQW
	{SIMDInstructionInfo(0x76).i().p66().commutative()}, // EQD
	{SIMDInstructionInfo(0x64).i().p66()}, // GTB
	{SIMDInstructionInfo(0x65).i().p66()}, // GTW
	{SIMDInstructionInfo(0x66).i().p66()}, // GTD
};

const xImplSimd_PMinMax xPMIN =
{
	{SIMDInstructionInfo(0xda).i().p66().commutative()}, // UB
	{SIMDInstructionInfo(0xea).i().p66().commutative()}, // SW
	{SIMDInstructionInfo(0x38).i().p66().m0f38().commutative()}, // SB
	{SIMDInstructionInfo(0x39).i().p66().m0f38().commutative()}, // SD
	{SIMDInstructionInfo(0x3a).i().p66().m0f38().commutative()}, // UW
	{SIMDInstructionInfo(0x3b).i().p66().m0f38().commutative()}, // UD
};

const xImplSimd_PMinMax xPMAX =
{
	{SIMDInstructionInfo(0xde).i().p66().commutative()}, // UB
	{SIMDInstructionInfo(0xee).i().p66().commutative()}, // SW
	{SIMDInstructionInfo(0x3c).i().p66().m0f38().commutative()}, // SB
	{SIMDInstructionInfo(0x3d).i().p66().m0f38().commutative()}, // SD
	{SIMDInstructionInfo(0x3e).i().p66().m0f38().commutative()}, // UW
	{SIMDInstructionInfo(0x3f).i().p66().m0f38().commutative()}, // UD
};
// =====================================================================================================
// SIMD Shuffle/Pack (Shuffle puck?)
// =====================================================================================================
__fi void xImplSimd_Shuffle::_selector_assertion_check(u8 selector) const
{
	pxAssertMsg((selector & ~3) == 0,
		"Invalid immediate operand on SSE Shuffle: Upper 6 bits of the SSE Shuffle-PD Selector are reserved and must be zero.");
}
void xImplSimd_Shuffle::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 selector) const
{
	EmitSIMD(SIMDInstructionInfo(0xc6), dst, src1, src2, selector);
}

void xImplSimd_Shuffle::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, u8 selector) const
{
	EmitSIMD(SIMDInstructionInfo(0xc6), dst, src1, src2, selector);
}

void xImplSimd_Shuffle::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 selector) const
{
	_selector_assertion_check(selector);
	EmitSIMD(SIMDInstructionInfo(0xc6).d().p66(), dst, src1, src2, selector);
}

void xImplSimd_Shuffle::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, u8 selector) const
{
	_selector_assertion_check(selector);
	EmitSIMD(SIMDInstructionInfo(0xc6).d().p66(), dst, src1, src2, selector);
}
void xImplSimd_PInsert::B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x20).i().p66().m0f3a(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::B(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect8& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x20).i().p66().m0f3a(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc4).i().p66(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::W(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect16& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc4).i().p66(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::D(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegister64& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void xImplSimd_PInsert::Q(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect64& src2, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x22).i().p66().m0f3a().srcw(), dst, src1, src2, imm8); }
void SimdImpl_PExtract::B(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x14).mov().p66().m0f3a(), src, src, dst, imm8); }
void SimdImpl_PExtract::B(const xIndirect8& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x14).mov().p66().m0f3a(), src, src, dst, imm8); }
void SimdImpl_PExtract::W(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0xc5).mov().p66(), dst, dst, src, imm8); }
void SimdImpl_PExtract::W(const xIndirect16& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x15).mov().p66().m0f3a(), src, src, dst, imm8); }
void SimdImpl_PExtract::D(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
void SimdImpl_PExtract::D(const xIndirect32& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
void SimdImpl_PExtract::Q(const xRegister64& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
void SimdImpl_PExtract::Q(const xIndirect64& dst, const xRegisterSSE& src, u8 imm8) const { EmitSIMD(SIMDInstructionInfo(0x16).mov().p66().m0f3a().srcw(), src, src, dst, imm8); }
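// Note on the argument order above: for the 0F 3A-map extracts the XMM source
// lives in ModRM.reg, so the register/memory destination travels through the
// "src2" slot, and .mov() suppresses the SSE copy fixup that would otherwise
// try to move src1 into dst. PEXTRW (0xc5) is the exception: its GPR
// destination sits in ModRM.reg, hence the (dst, dst, src) ordering.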
const xImplSimd_Shuffle xSHUF = {};
const xImplSimd_PShuffle xPSHUF =
{
	{SIMDInstructionInfo(0x70).i().p66()}, // D
	{SIMDInstructionInfo(0x70).i().pf2()}, // LW
	{SIMDInstructionInfo(0x70).i().pf3()}, // HW
	{SIMDInstructionInfo(0x00).i().p66().m0f38()}, // B
};

const SimdImpl_PUnpack xPUNPCK =
{
	{SIMDInstructionInfo(0x60).i().p66()}, // LBW
	{SIMDInstructionInfo(0x61).i().p66()}, // LWD
	{SIMDInstructionInfo(0x62).i().p66()}, // LDQ
	{SIMDInstructionInfo(0x6c).i().p66()}, // LQDQ
	{SIMDInstructionInfo(0x68).i().p66()}, // HBW
	{SIMDInstructionInfo(0x69).i().p66()}, // HWD
	{SIMDInstructionInfo(0x6a).i().p66()}, // HDQ
	{SIMDInstructionInfo(0x6d).i().p66()}, // HQDQ
};

const SimdImpl_Pack xPACK =
{
	{SIMDInstructionInfo(0x63).i().p66()}, // SSWB
	{SIMDInstructionInfo(0x6b).i().p66()}, // SSDW
	{SIMDInstructionInfo(0x67).i().p66()}, // USWB
	{SIMDInstructionInfo(0x2b).i().p66().m0f38()}, // USDW
};

const xImplSimd_Unpack xUNPCK =
{
	{SIMDInstructionInfo(0x15).f()}, // HPS
	{SIMDInstructionInfo(0x15).d().p66()}, // HPD
	{SIMDInstructionInfo(0x14).f()}, // LPS
	{SIMDInstructionInfo(0x14).d().p66()}, // LPD
};
const xImplSimd_PInsert xPINSR;
const SimdImpl_PExtract xPEXTR;
static SIMDInstructionInfo nextop(SIMDInstructionInfo op, u32 offset = 1)
{
	op.opcode += offset;
	return op;
}
// =====================================================================================================
// SIMD Move And Blend Instructions
// =====================================================================================================
void xImplSimd_MovHL::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2) const { EmitSIMD(info, dst, src1, src2); }
void xImplSimd_MovHL::PS(const xIndirectVoid& dst, const xRegisterSSE& src) const { EmitSIMD(nextop(info).mov(), src, src, dst); }
void xImplSimd_MovHL::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2) const { EmitSIMD(info.p66(), dst, src1, src2); }
void xImplSimd_MovHL::PD(const xIndirectVoid& dst, const xRegisterSSE& src) const { EmitSIMD(nextop(info).p66().mov(), src, src, dst); }
void xImplSimd_MovHL_RtoR::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(info, dst, src1, src2); }
void xImplSimd_MovHL_RtoR::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) const { EmitSIMD(info.p66(), dst, src1, src2); }
static bool IsAligned(const xRegisterSSE& reg, const xIndirectVoid& mem)
{
	u32 mask = reg.GetOperandSize() - 1;
	// Aligned if it's displacement-only and the displacement is aligned
	if (mem.Displacement & mask)
		return false;
	return mem.Index.IsEmpty() && mem.Base.IsEmpty();
}
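// Example (illustrative; the global name is hypothetical): a displacement-only
// operand such as ptr128[&g_some_aligned_global] passes when the address is a
// multiple of the operand size; anything with a base or index register,
// e.g. ptr[rax], is conservatively treated as unaligned.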
static const xImplSimd_MoveSSE& GetLoadStoreOp(const xImplSimd_MoveSSE* op)
{
	if (!x86Emitter::use_avx)
	{
		// movaps is shorter, and no processor differentiates between the various movs for load/store
		const bool aligned = std::bit_cast<u32>(op->aligned_load) == std::bit_cast<u32>(op->unaligned_load);
		return aligned ? xMOVAPS : xMOVUPS;
	}
	return *op;
}
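// Consequence (illustrative): without AVX, xMOVDQA(xmm0, ptr[rax]) is
// redirected through xMOVAPS/xMOVUPS (its aligned and unaligned loads match),
// dropping the 66 prefix byte; since IsAligned() can't prove alignment for a
// base-register address, the bytes actually emitted are a plain movups.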
void xImplSimd_MoveSSE::operator()(const xRegisterSSE& dst, const xRegisterSSE& src) const
{
	if (dst.GetId() == src.GetId() && dst.GetOperandSize() == src.GetOperandSize())
		return;
	SIMDInstructionInfo info = aligned_load;
	const xRegisterSSE* arg0 = &dst;
	const xRegisterSSE* arg1 = &src;
	if (x86Emitter::use_avx)
	{
		if (arg1->IsExtended() && !arg0->IsExtended())
		{
			// Can save a byte by using the store opcode
			info = aligned_store;
			std::swap(arg0, arg1);
		}
	}
	EmitSIMD(info, *arg0, *arg0, *arg1);
}
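// Example of the byte saving above (illustrative): vmovaps xmm0, xmm8 encoded
// as a load puts xmm8 in ModRM.rm and needs the 3-byte C4 VEX prefix; flipping
// to the store opcode puts xmm8 in ModRM.reg, which the 2-byte C5 prefix can
// express.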
void xImplSimd_MoveSSE::operator()(const xRegisterSSE& dst, const xIndirectVoid& src) const
{
	const xImplSimd_MoveSSE& op = GetLoadStoreOp(this);
	EmitSIMD(IsAligned(dst, src) ? op.aligned_load : op.unaligned_load, dst, dst, src);
}

void xImplSimd_MoveSSE::operator()(const xIndirectVoid& dst, const xRegisterSSE& src) const
{
	const xImplSimd_MoveSSE& op = GetLoadStoreOp(this);
	EmitSIMD(IsAligned(src, dst) ? op.aligned_store : op.unaligned_store, src, src, dst);
}
void xImplSimd_PMove::BW(const xRegisterSSE& dst, const xRegisterSSE& src) const { EmitSIMD(info, dst, dst, src); }
void xImplSimd_PMove::BW(const xRegisterSSE& dst, const xIndirectVoid& src) const { EmitSIMD(info, dst, dst, src); }
void xImplSimd_PMove::BD(const xRegisterSSE& dst, const xRegisterSSE& src) const { EmitSIMD(nextop(info), dst, dst, src); }
void xImplSimd_PMove::BD(const xRegisterSSE& dst, const xIndirectVoid& src) const { EmitSIMD(nextop(info), dst, dst, src); }
void xImplSimd_PMove::BQ(const xRegisterSSE& dst, const xRegisterSSE& src) const { EmitSIMD(nextop(info, 2), dst, dst, src); }
void xImplSimd_PMove::BQ(const xRegisterSSE& dst, const xIndirectVoid& src) const { EmitSIMD(nextop(info, 2), dst, dst, src); }
void xImplSimd_PMove::WD(const xRegisterSSE& dst, const xRegisterSSE& src) const { EmitSIMD(nextop(info, 3), dst, dst, src); }
void xImplSimd_PMove::WD(const xRegisterSSE& dst, const xIndirectVoid& src) const { EmitSIMD(nextop(info, 3), dst, dst, src); }
void xImplSimd_PMove::WQ(const xRegisterSSE& dst, const xRegisterSSE& src) const { EmitSIMD(nextop(info, 4), dst, dst, src); }
void xImplSimd_PMove::WQ(const xRegisterSSE& dst, const xIndirectVoid& src) const { EmitSIMD(nextop(info, 4), dst, dst, src); }
void xImplSimd_PMove::DQ(const xRegisterSSE& dst, const xRegisterSSE& src) const { EmitSIMD(nextop(info, 5), dst, dst, src); }
void xImplSimd_PMove::DQ(const xRegisterSSE& dst, const xIndirectVoid& src) const { EmitSIMD(nextop(info, 5), dst, dst, src); }
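// Field order in the initializers below (inferred from the accessors above):
// {aligned_load, aligned_store, unaligned_load, unaligned_store}. The
// "unaligned" slots of the aligned-only movs repeat the aligned opcodes, and
// xMOVUPS's aligned slots deliberately name the movaps encodings.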
const xImplSimd_MoveSSE xMOVAPS = {
	SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
	SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
};

const xImplSimd_MoveSSE xMOVUPS = {
	SIMDInstructionInfo(0x28).mov(), SIMDInstructionInfo(0x29).mov(),
	SIMDInstructionInfo(0x10).mov(), SIMDInstructionInfo(0x11).mov(),
};

const xImplSimd_MoveSSE xMOVDQA = {
	SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
	SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
};

const xImplSimd_MoveSSE xMOVDQU = {
	SIMDInstructionInfo(0x6f).p66().mov(), SIMDInstructionInfo(0x7f).p66().mov(),
	SIMDInstructionInfo(0x6f).pf3().mov(), SIMDInstructionInfo(0x7f).pf3().mov(),
};

const xImplSimd_MoveSSE xMOVAPD = {
	SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
	SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
};

const xImplSimd_MoveSSE xMOVUPD = {
	SIMDInstructionInfo(0x28).p66().mov(), SIMDInstructionInfo(0x29).p66().mov(),
	SIMDInstructionInfo(0x10).p66().mov(), SIMDInstructionInfo(0x11).p66().mov(),
};
const xImplSimd_MovHL xMOVH = {SIMDInstructionInfo(0x16)};
const xImplSimd_MovHL xMOVL = {SIMDInstructionInfo(0x12)};
const xImplSimd_MovHL_RtoR xMOVLH = {SIMDInstructionInfo(0x16)};
const xImplSimd_MovHL_RtoR xMOVHL = {SIMDInstructionInfo(0x12)};
const xImplSimd_PBlend xPBLEND =
{
	{SIMDInstructionInfo(0x0e).i().p66().m0f3a()}, // W
	{SIMDInstructionInfo(0x10).i().p66().m0f38(), SIMDInstructionInfo(0x4c).i().p66().m0f3a()}, // VB
};

const xImplSimd_Blend xBLEND =
{
	{SIMDInstructionInfo(0x0c).p66().f().m0f3a()}, // PS
	{SIMDInstructionInfo(0x0d).p66().d().m0f3a()}, // PD
	{SIMDInstructionInfo(0x14).p66().f().m0f38(), SIMDInstructionInfo(0x4a).f().p66().m0f3a()}, // VPS
	{SIMDInstructionInfo(0x15).p66().d().m0f38(), SIMDInstructionInfo(0x4b).d().p66().m0f3a()}, // VPD
};
const xImplSimd_PMove xPMOVSX = {SIMDInstructionInfo(0x20).p66().m0f38().mov()};
const xImplSimd_PMove xPMOVZX = {SIMDInstructionInfo(0x30).p66().m0f38().mov()};
// [SSE-3]
const xImplSimd_2Arg xMOVSLDUP = {SIMDInstructionInfo(0x12).pf3()};
// [SSE-3]
const xImplSimd_2Arg xMOVSHDUP = {SIMDInstructionInfo(0x16).pf3()};
//////////////////////////////////////////////////////////////////////////////////////////
// MMX Mov Instructions (MOVD, MOVQ, MOVSS).
//
// Notes:
// * Some of the functions have been renamed to more clearly reflect what they actually
// do. Namely we've affixed "ZX" to several MOVs that take a register as a destination
// since that's what they do (MOVD clears upper 32/96 bits, etc).
//
// * MOVD has valid forms for MMX and XMM registers.
//
__fi void xMOVDZX(const xRegisterSSE& dst, const xRegister32or64& src) { EmitSIMD(SIMDInstructionInfo(0x6e).p66().srcw().mov(), dst, dst, src); }
__fi void xMOVDZX(const xRegisterSSE& dst, const xIndirectVoid& src) { EmitSIMD(SIMDInstructionInfo(0x6e).p66().mov(), dst, dst, src); }
__fi void xMOVD(const xRegister32or64& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0x7e).p66().srcw().mov(), src, src, dst); }
__fi void xMOVD(const xIndirectVoid& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0x7e).p66().mov(), src, src, dst); }
__fi void xMOVQZX(const xRegisterSSE& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0x7e).pf3().mov(), dst, dst, src); }
__fi void xMOVQZX(const xRegisterSSE& dst, const xIndirectVoid& src) { EmitSIMD(SIMDInstructionInfo(0x7e).pf3().mov(), dst, dst, src); }
// Moves lower quad of XMM to ptr64 (no bits are cleared)
__fi void xMOVQ(const xIndirectVoid& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0xd6).p66().mov(), src, src, dst); }
//////////////////////////////////////////////////////////////////////////////////////////
//
#define IMPLEMENT_xMOVS(ssd, prefix) \
	__fi void xMOV##ssd(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2) \
	{ \
		if (src1 == src2) \
			return xMOVAPS(dst, src1); \
		SIMDInstructionInfo op = SIMDInstructionInfo(0x10).prefix(); \
		const xRegisterSSE* psrc = &src2; \
		const xRegisterSSE* pdst = &dst; \
		if (x86Emitter::use_avx && src2.IsExtended() && !dst.IsExtended()) \
		{ \
			op.opcode = 0x11; \
			std::swap(psrc, pdst); \
		} \
		EmitSIMD(op, *pdst, src1, *psrc); \
	} \
	__fi void xMOV##ssd##ZX(const xRegisterSSE& dst, const xIndirectVoid& src) { EmitSIMD(SIMDInstructionInfo(0x10).prefix().mov(), dst, dst, src); } \
	__fi void xMOV##ssd(const xIndirectVoid& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0x11).prefix().mov(), src, src, dst); }
IMPLEMENT_xMOVS(SS, pf3)
IMPLEMENT_xMOVS(SD, pf2)
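// Usage sketch (hypothetical call site): xMOVSS(xmm1, xmm2, xmm3) produces the
// three-operand merge form (low scalar from xmm3, upper elements from xmm2);
// when both sources are the same register the macro above emits a plain
// movaps instead, and under AVX it can flip to the 0x11 store opcode so an
// extended source still fits the 2-byte VEX prefix.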
__fi void xMOVNTDQA(const xRegisterSSE& dst, const xIndirectVoid& src) { EmitSIMD(SIMDInstructionInfo(0x2a).p66().m0f38().mov(), dst, dst, src); }
__fi void xMOVNTDQA(const xIndirectVoid& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0xe7).p66().mov(), src, src, dst); }
__fi void xMOVNTPD(const xIndirectVoid& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0x2b).p66().mov(), src, src, dst); }
__fi void xMOVNTPS(const xIndirectVoid& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0x2b).mov(), src, src, dst); }
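// Note: the indirect-destination overload named xMOVNTDQA above actually emits
// MOVNTDQ (66 0F E7), the streaming store; the true SSE4.1 streaming load
// MOVNTDQA (66 0F 38 2A) is the register-destination overload.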
// ------------------------------------------------------------------------
__fi void xMOVMSKPS(const xRegister32& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0x50).mov(), dst, dst, src); }
__fi void xMOVMSKPD(const xRegister32& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0x50).mov().p66(), dst, dst, src); }
// xMASKMOV:
// Selectively write bytes from mm1/xmm1 to memory location using the byte mask in mm2/xmm2.
// The default memory location is specified by DS:EDI. The most significant bit in each byte
// of the mask operand determines whether the corresponding byte in the source operand is
// written to the corresponding byte location in memory.
__fi void xMASKMOV(const xRegisterSSE& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0xf7).mov().p66(), dst, dst, src); }
// xPMOVMSKB:
// Creates a mask made up of the most significant bit of each byte of the source
// operand and stores the result in the low byte or word of the destination operand.
// Upper bits of the destination are cleared to zero.
//
// When operating on a 64-bit (MMX) source, the byte mask is 8 bits; when operating on
// 128-bit (SSE) source, the byte mask is 16-bits.
//
__fi void xPMOVMSKB(const xRegister32or64& dst, const xRegisterSSE& src) { EmitSIMD(SIMDInstructionInfo(0xd7).mov().p66(), dst, dst, src); }
// [SSSE-3] Concatenates dest and source operands into an intermediate composite,
// shifts the composite at byte granularity to the right by a constant immediate,
// and extracts the right-aligned result into the destination.
//
__fi void xPALIGNR(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x0f).i().p66().m0f3a(), dst, src1, src2, imm8); }
// --------------------------------------------------------------------------------------
// INSERTPS / EXTRACTPS [SSE4.1 only!]
// --------------------------------------------------------------------------------------
// [TODO] these might be served better as classes, especially if other instructions use
// the M32,sse,imm form (I forget offhand if any do).
// [SSE-4.1] Insert a single-precision floating-point value from src into a specified
// location in dest, and selectively zero out the data elements in dest according to
// the mask field in the immediate byte. The source operand can be a memory location
// (32 bits) or an XMM register (lower 32 bits used).
//
// Imm8 provides three fields:
// * COUNT_S: The value of Imm8[7:6] selects the dword element from src. It is 0 if
// the source is a memory operand.
// * COUNT_D: The value of Imm8[5:4] selects the target dword element in dest.
// * ZMASK: Each bit of Imm8[3:0] selects a dword element in dest to be written
// with 0.0 if set to 1.
//
__emitinline void xINSERTPS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x21).p66().m0f3a(), dst, src1, src2, imm8); }
__emitinline void xINSERTPS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x21).p66().m0f3a(), dst, src1, src2, imm8); }
// [SSE-4.1] Extract a single-precision floating-point value from src at an offset
// determined by imm8[1:0]*32. The extracted single precision floating-point value
// is stored into the low 32-bits of dest (or at a 32-bit memory pointer).
//
__emitinline void xEXTRACTPS(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x17).mov().p66().m0f3a(), src, src, dst, imm8); }
__emitinline void xEXTRACTPS(const xIndirect32& dst, const xRegisterSSE& src, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x17).mov().p66().m0f3a(), src, src, dst, imm8); }
// =====================================================================================================
// Ungrouped Instructions!
// =====================================================================================================
// Store Streaming SIMD Extension Control/Status to Mem32.
__emitinline void xSTMXCSR(const xIndirect32& dest)
{
	xOpWrite0F(0, 0xae, 3, dest);
}

// Load Streaming SIMD Extension Control/Status from Mem32.
__emitinline void xLDMXCSR(const xIndirect32& src)
{
	xOpWrite0F(0, 0xae, 2, src);
}

// Save x87 FPU, MMX Technology, and SSE State to buffer
// Target buffer must be at least 512 bytes in length to hold the result.
__emitinline void xFXSAVE(const xIndirectVoid& dest)
{
	xOpWrite0F(0, 0xae, 0, dest);
}

// Restore x87 FPU, MMX, XMM, and MXCSR State.
// Source buffer should be 512 bytes in length.
__emitinline void xFXRSTOR(const xIndirectVoid& src)
{
	xOpWrite0F(0, 0xae, 1, src);
}
void xVZEROUPPER()
{
	// Rather than dealing with nonexistent operands, write the fixed
	// encoding directly: 2-byte VEX prefix (C5 F8) plus opcode 77.
	xWrite8(0xc5);
	xWrite8(0xf8);
	xWrite8(0x77);
}
} // namespace x86Emitter