Common: Remove dedicated AVX instructions

The main instructions *are* AVX instructions now
This commit is contained in:
TellowKrinkle 2025-08-10 01:10:01 -05:00 committed by TellowKrinkle
parent d9ff9d7aa1
commit cc5f594384
7 changed files with 52 additions and 284 deletions

View File

@ -85,7 +85,6 @@ target_sources(common PRIVATE
if(_M_X86)
target_sources(common PRIVATE
emitter/avx.cpp
emitter/bmi.cpp
emitter/fpu.cpp
emitter/groups.cpp

View File

@ -80,9 +80,6 @@
<ClCompile Include="Windows\WinThreads.cpp" />
<ClCompile Include="HostSys.cpp" />
<ClCompile Include="Semaphore.cpp" />
<ClCompile Include="emitter\avx.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="emitter\bmi.cpp">
<ExcludedFromBuild Condition="'$(Platform)'=='ARM64'">true</ExcludedFromBuild>
</ClCompile>

View File

@ -1,172 +0,0 @@
// SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
// SPDX-License-Identifier: GPL-3.0+
#include "common/emitter/internal.h"
// warning: suggest braces around initialization of subobject [-Wmissing-braces]
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-braces"
#endif
namespace x86Emitter
{
// Packed-single register/memory moves. xImplAVX_Move::operator() reads these values as
// Prefix, LoadOpcode and StoreOpcode (presumably in that order: the codegen tests expect
// opcode 28/10 for loads and 29/11 for stores).
const xImplAVX_Move xVMOVAPS = {0x00, 0x28, 0x29};
const xImplAVX_Move xVMOVUPS = {0x00, 0x10, 0x11};
// Floating-point arithmetic families. Each row appears to be a {prefix, opcode} pair for one
// of the PS/PD/SS/SD forms: the opcode byte is shared within a family and only the prefix
// byte differs between forms.
const xImplAVX_ArithFloat xVADD = {
{0x00, 0x58}, // VADDPS
{0x66, 0x58}, // VADDPD
{0xF3, 0x58}, // VADDSS
{0xF2, 0x58}, // VADDSD
};
const xImplAVX_ArithFloat xVSUB = {
{0x00, 0x5C}, // VSUBPS
{0x66, 0x5C}, // VSUBPD
{0xF3, 0x5C}, // VSUBSS
{0xF2, 0x5C}, // VSUBSD
};
const xImplAVX_ArithFloat xVMUL = {
{0x00, 0x59}, // VMULPS
{0x66, 0x59}, // VMULPD
{0xF3, 0x59}, // VMULSS
{0xF2, 0x59}, // VMULSD
};
const xImplAVX_ArithFloat xVDIV = {
{0x00, 0x5E}, // VDIVPS
{0x66, 0x5E}, // VDIVPD
{0xF3, 0x5E}, // VDIVSS
{0xF2, 0x5E}, // VDIVSD
};
// Floating-point compares: one helper per comparison predicate. Each helper presumably keeps
// its predicate as CType, which xImplAVX_CmpFloatHelper writes as the byte following the
// instruction.
const xImplAVX_CmpFloat xVCMP = {
{SSE2_Equal},
{SSE2_Less},
{SSE2_LessOrEqual},
{SSE2_Unordered},
{SSE2_NotEqual},
{SSE2_NotLess},
{SSE2_NotLessOrEqual},
{SSE2_Ordered},
};
// Integer bitwise logic; the two values are used as Prefix and Opcode by
// xImplAVX_ThreeArgYMM::operator().
const xImplAVX_ThreeArgYMM xVPAND = {0x66, 0xDB};
const xImplAVX_ThreeArgYMM xVPANDN = {0x66, 0xDF};
const xImplAVX_ThreeArgYMM xVPOR = {0x66, 0xEB};
const xImplAVX_ThreeArgYMM xVPXOR = {0x66, 0xEF};
// Integer compares (equal / greater-than) for byte, word and dword element sizes.
const xImplAVX_CmpInt xVPCMP = {
{0x66, 0x74}, // VPCMPEQB
{0x66, 0x75}, // VPCMPEQW
{0x66, 0x76}, // VPCMPEQD
{0x66, 0x64}, // VPCMPGTB
{0x66, 0x65}, // VPCMPGTW
{0x66, 0x66}, // VPCMPGTD
};
// Emits VPMOVMSKB `to`, `from` (prefix 0x66, opcode 0xD7) through the C5 VEX writer.
// The default-constructed xRegister32 occupies the middle operand position, which this
// instruction presumably leaves unused.
void xVPMOVMSKB(const xRegister32& to, const xRegisterSSE& from)
{
xOpWriteC5(0x66, 0xd7, to, xRegister32(), from);
}
// Emits VMOVMSKPS `to`, `from` (no mandatory prefix, opcode 0x50) through the C5 VEX writer.
// The default-constructed xRegister32 occupies the middle operand position, which this
// instruction presumably leaves unused.
void xVMOVMSKPS(const xRegister32& to, const xRegisterSSE& from)
{
xOpWriteC5(0x00, 0x50, to, xRegister32(), from);
}
// Emits VMOVMSKPD `to`, `from` (prefix 0x66, opcode 0x50) through the C5 VEX writer.
// The default-constructed xRegister32 occupies the middle operand position, which this
// instruction presumably leaves unused.
void xVMOVMSKPD(const xRegister32& to, const xRegisterSSE& from)
{
xOpWriteC5(0x66, 0x50, to, xRegister32(), from);
}
void xVZEROUPPER()
{
	// VZEROUPPER has no operands, so its fixed three-byte encoding is written
	// out directly instead of going through the operand-based VEX writer.
	constexpr int encoding[] = {0xc5, 0xf8, 0x77};
	for (const int byte : encoding)
		xWrite8(byte);
}
// Register <- register move; emits nothing when source and destination are the same register.
void xImplAVX_Move::operator()(const xRegisterSSE& to, const xRegisterSSE& from) const
{
	const bool is_self_move = !(to != from);
	if (is_self_move)
		return;

	xOpWriteC5(Prefix, LoadOpcode, to, xRegisterSSE(), from);
}

// Register <- memory load, always emitted with the load opcode.
void xImplAVX_Move::operator()(const xRegisterSSE& to, const xIndirectVoid& from) const
{
	xOpWriteC5(Prefix, LoadOpcode, to, xRegisterSSE(), from);
}

// Memory <- register store: uses the store opcode, with the source register passed first
// and the memory destination passed last.
void xImplAVX_Move::operator()(const xIndirectVoid& to, const xRegisterSSE& from) const
{
	xOpWriteC5(Prefix, StoreOpcode, from, xRegisterSSE(), to);
}
// Three-operand form (to = from1 op from2) restricted to non-wide registers: asserts that
// none of the register operands reports IsWideSIMD() (presumably 256-bit ymm registers),
// then emits Prefix/Opcode through the C5 VEX writer.
void xImplAVX_ThreeArg::operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
pxAssert(!to.IsWideSIMD() && !from1.IsWideSIMD() && !from2.IsWideSIMD());
xOpWriteC5(Prefix, Opcode, to, from1, from2);
}
// Same as above with a memory second source; only the two register operands are checked.
void xImplAVX_ThreeArg::operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
pxAssert(!to.IsWideSIMD() && !from1.IsWideSIMD());
xOpWriteC5(Prefix, Opcode, to, from1, from2);
}
// Three-operand form (to = from1 op from2) with no register-width assertion, unlike
// xImplAVX_ThreeArg; the codegen tests call it with both xmm and ymm operands.
void xImplAVX_ThreeArgYMM::operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(Prefix, Opcode, to, from1, from2);
}
// Same as above with a memory second source.
void xImplAVX_ThreeArgYMM::operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(Prefix, Opcode, to, from1, from2);
}
void xImplAVX_CmpFloatHelper::PS(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(0x00, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::PS(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(0x00, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::PD(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(0x66, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::PD(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(0x66, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::SS(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(0xF3, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::SS(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(0xF3, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::SD(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(0xF2, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::SD(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(0xF2, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
} // namespace x86Emitter

View File

@ -621,22 +621,6 @@ namespace x86Emitter
// ------------------------------------------------------------------------
extern const xImplAVX_Move xVMOVAPS;
extern const xImplAVX_Move xVMOVUPS;
extern const xImplAVX_ArithFloat xVADD;
extern const xImplAVX_ArithFloat xVSUB;
extern const xImplAVX_ArithFloat xVMUL;
extern const xImplAVX_ArithFloat xVDIV;
extern const xImplAVX_CmpFloat xVCMP;
extern const xImplAVX_ThreeArgYMM xVPAND;
extern const xImplAVX_ThreeArgYMM xVPANDN;
extern const xImplAVX_ThreeArgYMM xVPOR;
extern const xImplAVX_ThreeArgYMM xVPXOR;
extern const xImplAVX_CmpInt xVPCMP;
extern void xVPMOVMSKB(const xRegister32& to, const xRegisterSSE& from);
extern void xVMOVMSKPS(const xRegister32& to, const xRegisterSSE& from);
extern void xVMOVMSKPD(const xRegister32& to, const xRegisterSSE& from);
extern void xVZEROUPPER();
} // namespace x86Emitter

View File

@ -910,4 +910,12 @@ namespace x86Emitter
{
xOpWrite0F(0, 0xae, 1, src);
}
void xVZEROUPPER()
{
	// VZEROUPPER has no operands, so its fixed three-byte encoding is written
	// out directly instead of going through the operand-based VEX writer.
	constexpr int encoding[] = {0xc5, 0xf8, 0x77};
	for (const int byte : encoding)
		xWrite8(byte);
}
} // namespace x86Emitter

View File

@ -209,13 +209,13 @@ static void mVUGenerateCopyPipelineState(mV)
if (g_cpu.vectorISA >= ProcessorFeatures::VectorISA::AVX)
{
xVMOVAPS(ymm0, ptr[rax]);
xVMOVAPS(ymm1, ptr[rax + 32u]);
xVMOVAPS(ymm2, ptr[rax + 64u]);
xMOVAPS(ymm0, ptr[rax]);
xMOVAPS(ymm1, ptr[rax + 32u]);
xMOVAPS(ymm2, ptr[rax + 64u]);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], ymm0);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], ymm1);
xVMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], ymm2);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState)], ymm0);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 32u], ymm1);
xMOVUPS(ptr[reinterpret_cast<u8*>(&mVU.prog.lpState) + 64u], ymm2);
xVZEROUPPER();
}
@ -285,19 +285,19 @@ static void mVUGenerateCompareState(mV)
else
{
// We have to use unaligned loads here, because the blocks are only 16 byte aligned.
xVMOVUPS(ymm0, ptr[arg1reg]);
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg]);
xVPMOVMSKB(eax, ymm0);
xMOVUPS(ymm0, ptr[arg1reg]);
xPCMP.EQD(ymm0, ymm0, ptr[arg2reg]);
xPMOVMSKB(eax, ymm0);
xXOR(eax, 0xffffffff);
xForwardJNZ8 exitPoint;
xVMOVUPS(ymm0, ptr[arg1reg + 0x20]);
xVMOVUPS(ymm1, ptr[arg1reg + 0x40]);
xVPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
xVPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
xVPAND(ymm0, ymm0, ymm1);
xMOVUPS(ymm0, ptr[arg1reg + 0x20]);
xMOVUPS(ymm1, ptr[arg1reg + 0x40]);
xPCMP.EQD(ymm0, ymm0, ptr[arg2reg + 0x20]);
xPCMP.EQD(ymm1, ymm1, ptr[arg2reg + 0x40]);
xPAND(ymm0, ymm0, ymm1);
xVPMOVMSKB(eax, ymm0);
xPMOVMSKB(eax, ymm0);
xNOT(eax);
exitPoint.SetTarget();

View File

@ -747,96 +747,48 @@ TEST(CodegenTests, AVXTest)
CODEGEN_TEST(xPMOVMSKB(eax, xmm2), "c5 f9 d7 c2");
CODEGEN_TEST(xPALIGNR(xmm4, xmm8, 1), "c4 c3 59 0f e0 01");
CODEGEN_TEST(xMASKMOV(xmm2, xmm9), "c4 c1 79 f7 d1");
CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");
CODEGEN_TEST(xVMOVUPS(xmm0, ptr32[rdi]), "c5 f8 10 07");
CODEGEN_TEST(xVMOVUPS(ptr32[rdi], xmm0), "c5 f8 11 07");
CODEGEN_TEST(xVADD.PS(xmm0, xmm1, xmm2), "c5 f0 58 c2");
CODEGEN_TEST(xVADD.PD(xmm0, xmm1, xmm2), "c5 f1 58 c2");
CODEGEN_TEST(xVADD.SS(xmm0, xmm1, xmm2), "c5 f2 58 c2");
CODEGEN_TEST(xVADD.SD(xmm0, xmm1, xmm2), "c5 f3 58 c2");
CODEGEN_TEST(xVSUB.PS(xmm0, xmm1, xmm2), "c5 f0 5c c2");
CODEGEN_TEST(xVSUB.PD(xmm0, xmm1, xmm2), "c5 f1 5c c2");
CODEGEN_TEST(xVSUB.SS(xmm0, xmm1, xmm2), "c5 f2 5c c2");
CODEGEN_TEST(xVSUB.SD(xmm0, xmm1, xmm2), "c5 f3 5c c2");
CODEGEN_TEST(xVMUL.PS(xmm0, xmm1, xmm2), "c5 f0 59 c2");
CODEGEN_TEST(xVMUL.PD(xmm0, xmm1, xmm2), "c5 f1 59 c2");
CODEGEN_TEST(xVMUL.SS(xmm0, xmm1, xmm2), "c5 f2 59 c2");
CODEGEN_TEST(xVMUL.SD(xmm0, xmm1, xmm2), "c5 f3 59 c2");
CODEGEN_TEST(xVDIV.PS(xmm0, xmm1, xmm2), "c5 f0 5e c2");
CODEGEN_TEST(xVDIV.PD(xmm0, xmm1, xmm2), "c5 f1 5e c2");
CODEGEN_TEST(xVDIV.SS(xmm0, xmm1, xmm2), "c5 f2 5e c2");
CODEGEN_TEST(xVDIV.SD(xmm0, xmm1, xmm2), "c5 f3 5e c2");
// Don't need to test all variants, since they just change the condition immediate.
CODEGEN_TEST(xVCMP.EQ.PS(xmm0, xmm1, xmm2), "c5 f0 c2 c2 00");
CODEGEN_TEST(xVCMP.EQ.PD(xmm0, xmm1, xmm2), "c5 f1 c2 c2 00");
CODEGEN_TEST(xVCMP.EQ.SS(xmm0, xmm1, xmm2), "c5 f2 c2 c2 00");
CODEGEN_TEST(xVCMP.EQ.SD(xmm0, xmm1, xmm2), "c5 f3 c2 c2 00");
CODEGEN_TEST(xVCMP.LE.PS(xmm0, xmm1, xmm2), "c5 f0 c2 c2 02");
CODEGEN_TEST(xVCMP.LE.PD(xmm0, xmm1, xmm2), "c5 f1 c2 c2 02");
CODEGEN_TEST(xVCMP.LE.SS(xmm0, xmm1, xmm2), "c5 f2 c2 c2 02");
CODEGEN_TEST(xVCMP.LE.SD(xmm0, xmm1, xmm2), "c5 f3 c2 c2 02");
CODEGEN_TEST(xVPCMP.EQB(xmm0, xmm1, xmm2), "c5 f1 74 c2");
CODEGEN_TEST(xVPCMP.EQW(xmm0, xmm1, xmm2), "c5 f1 75 c2");
CODEGEN_TEST(xVPCMP.EQD(xmm0, xmm1, xmm2), "c5 f1 76 c2");
CODEGEN_TEST(xVPCMP.GTB(xmm0, xmm1, xmm2), "c5 f1 64 c2");
CODEGEN_TEST(xVPCMP.GTW(xmm0, xmm1, xmm2), "c5 f1 65 c2");
CODEGEN_TEST(xVPCMP.GTD(xmm0, xmm1, xmm2), "c5 f1 66 c2");
CODEGEN_TEST(xVPAND(xmm0, xmm1, xmm2), "c5 f1 db c2");
CODEGEN_TEST(xVPANDN(xmm0, xmm1, xmm2), "c5 f1 df c2");
CODEGEN_TEST(xVPOR(xmm0, xmm1, xmm2), "c5 f1 eb c2");
CODEGEN_TEST(xVPXOR(xmm0, xmm1, xmm2), "c5 f1 ef c2");
CODEGEN_TEST(xVMOVMSKPS(eax, xmm1), "c5 f8 50 c1");
CODEGEN_TEST(xVMOVMSKPD(eax, xmm1), "c5 f9 50 c1");
}
// Encoding checks for 256-bit (ymm) operands with x86Emitter::use_avx forced on.
// Each CODEGEN_TEST pairs an emitter call with a hex byte string; presumably the macro
// compares the bytes the call emits against that string.
TEST(CodegenTests, AVX256Test)
{
x86Emitter::use_avx = true;
// Aligned / unaligned moves: register-register, load and store forms.
CODEGEN_TEST(xVMOVAPS(ymm0, ymm1), "c5 fc 28 c1");
CODEGEN_TEST(xVMOVAPS(ymm0, ptr32[rdi]), "c5 fc 28 07");
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], ymm0), "c5 fc 29 07");
CODEGEN_TEST(xVMOVUPS(ymm0, ptr32[rdi]), "c5 fc 10 07");
CODEGEN_TEST(xVMOVUPS(ptr32[rdi], ymm0), "c5 fc 11 07");
CODEGEN_TEST(xMOVAPS(ymm0, ymm1), "c5 fc 28 c1");
CODEGEN_TEST(xMOVAPS(ymm0, ptr32[rdi]), "c5 fc 28 07");
CODEGEN_TEST(xMOVAPS(ptr32[rdi], ymm0), "c5 fc 29 07");
CODEGEN_TEST(xMOVUPS(ymm0, ptr32[rdi]), "c5 fc 10 07");
CODEGEN_TEST(xMOVUPS(ptr32[rdi], ymm0), "c5 fc 11 07");
// Operand-less instruction with a fixed encoding.
CODEGEN_TEST(xVZEROUPPER(), "c5 f8 77");
// Packed float arithmetic (PS / PD forms only).
CODEGEN_TEST(xVADD.PS(ymm0, ymm1, ymm2), "c5 f4 58 c2");
CODEGEN_TEST(xVADD.PD(ymm0, ymm1, ymm2), "c5 f5 58 c2");
CODEGEN_TEST(xVSUB.PS(ymm0, ymm1, ymm2), "c5 f4 5c c2");
CODEGEN_TEST(xVSUB.PD(ymm0, ymm1, ymm2), "c5 f5 5c c2");
CODEGEN_TEST(xVMUL.PS(ymm0, ymm1, ymm2), "c5 f4 59 c2");
CODEGEN_TEST(xVMUL.PD(ymm0, ymm1, ymm2), "c5 f5 59 c2");
CODEGEN_TEST(xVDIV.PS(ymm0, ymm1, ymm2), "c5 f4 5e c2");
CODEGEN_TEST(xVDIV.PD(ymm0, ymm1, ymm2), "c5 f5 5e c2");
CODEGEN_TEST(xADD.PS(ymm0, ymm1, ymm2), "c5 f4 58 c2");
CODEGEN_TEST(xADD.PD(ymm0, ymm1, ymm2), "c5 f5 58 c2");
CODEGEN_TEST(xSUB.PS(ymm0, ymm1, ymm2), "c5 f4 5c c2");
CODEGEN_TEST(xSUB.PD(ymm0, ymm1, ymm2), "c5 f5 5c c2");
CODEGEN_TEST(xMUL.PS(ymm0, ymm1, ymm2), "c5 f4 59 c2");
CODEGEN_TEST(xMUL.PD(ymm0, ymm1, ymm2), "c5 f5 59 c2");
CODEGEN_TEST(xDIV.PS(ymm0, ymm1, ymm2), "c5 f4 5e c2");
CODEGEN_TEST(xDIV.PD(ymm0, ymm1, ymm2), "c5 f5 5e c2");
// Packed float compares; the final expected byte is the comparison predicate (00 for EQ, 02 for LE).
CODEGEN_TEST(xVCMP.EQ.PS(ymm0, ymm1, ymm2), "c5 f4 c2 c2 00");
CODEGEN_TEST(xVCMP.EQ.PD(ymm0, ymm1, ymm2), "c5 f5 c2 c2 00");
CODEGEN_TEST(xVCMP.LE.PS(ymm0, ymm1, ymm2), "c5 f4 c2 c2 02");
CODEGEN_TEST(xVCMP.LE.PD(ymm0, ymm1, ymm2), "c5 f5 c2 c2 02");
CODEGEN_TEST(xCMPEQ.PS(ymm0, ymm1, ymm2), "c5 f4 c2 c2 00");
CODEGEN_TEST(xCMPEQ.PD(ymm0, ymm1, ymm2), "c5 f5 c2 c2 00");
CODEGEN_TEST(xCMPLE.PS(ymm0, ymm1, ymm2), "c5 f4 c2 c2 02");
CODEGEN_TEST(xCMPLE.PD(ymm0, ymm1, ymm2), "c5 f5 c2 c2 02");
// Packed integer compares (equal / greater-than for byte, word, dword).
CODEGEN_TEST(xVPCMP.EQB(ymm0, ymm1, ymm2), "c5 f5 74 c2");
CODEGEN_TEST(xVPCMP.EQW(ymm0, ymm1, ymm2), "c5 f5 75 c2");
CODEGEN_TEST(xVPCMP.EQD(ymm0, ymm1, ymm2), "c5 f5 76 c2");
CODEGEN_TEST(xVPCMP.GTB(ymm0, ymm1, ymm2), "c5 f5 64 c2");
CODEGEN_TEST(xVPCMP.GTW(ymm0, ymm1, ymm2), "c5 f5 65 c2");
CODEGEN_TEST(xVPCMP.GTD(ymm0, ymm1, ymm2), "c5 f5 66 c2");
CODEGEN_TEST(xPCMP.EQB(ymm0, ymm1, ymm2), "c5 f5 74 c2");
CODEGEN_TEST(xPCMP.EQW(ymm0, ymm1, ymm2), "c5 f5 75 c2");
CODEGEN_TEST(xPCMP.EQD(ymm0, ymm1, ymm2), "c5 f5 76 c2");
CODEGEN_TEST(xPCMP.GTB(ymm0, ymm1, ymm2), "c5 f5 64 c2");
CODEGEN_TEST(xPCMP.GTW(ymm0, ymm1, ymm2), "c5 f5 65 c2");
CODEGEN_TEST(xPCMP.GTD(ymm0, ymm1, ymm2), "c5 f5 66 c2");
// Packed integer bitwise logic.
CODEGEN_TEST(xVPAND(ymm0, ymm1, ymm2), "c5 f5 db c2");
CODEGEN_TEST(xVPANDN(ymm0, ymm1, ymm2), "c5 f5 df c2");
CODEGEN_TEST(xVPOR(ymm0, ymm1, ymm2), "c5 f5 eb c2");
CODEGEN_TEST(xVPXOR(ymm0, ymm1, ymm2), "c5 f5 ef c2");
CODEGEN_TEST(xPAND(ymm0, ymm1, ymm2), "c5 f5 db c2");
CODEGEN_TEST(xPANDN(ymm0, ymm1, ymm2), "c5 f5 df c2");
CODEGEN_TEST(xPOR(ymm0, ymm1, ymm2), "c5 f5 eb c2");
CODEGEN_TEST(xPXOR(ymm0, ymm1, ymm2), "c5 f5 ef c2");
// Sign-mask extraction into a general-purpose register.
CODEGEN_TEST(xVMOVMSKPS(eax, ymm1), "c5 fc 50 c1");
CODEGEN_TEST(xVMOVMSKPD(eax, ymm1), "c5 fd 50 c1");
CODEGEN_TEST(xMOVMSKPS(eax, ymm1), "c5 fc 50 c1");
CODEGEN_TEST(xMOVMSKPD(eax, ymm1), "c5 fd 50 c1");
}
TEST(CodegenTests, Extended8BitTest)