Mirror of https://github.com/dolphin-emu/dolphin.git

Merge pull request #13900 from JosJuice/jit-fma-double-rounding
Jit: Implement error-free transformation for single-precision FMA
Commit 3221e982d3
@ -3156,6 +3156,10 @@ void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
|
||||
|
||||
EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn);
|
||||
}
|
||||
void ARM64FloatEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||
{
|
||||
EmitThreeSame(1, 0, 3, Rd, Rn, Rm);
|
||||
}
|
||||
void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
||||
{
|
||||
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn);
|
||||
@ -3505,6 +3509,53 @@ void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
|
||||
EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn);
|
||||
}
|
||||
|
||||
// Comparison
|
||||
void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||
{
|
||||
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm);
|
||||
}
|
||||
void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
||||
{
|
||||
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn);
|
||||
}
|
||||
void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||
{
|
||||
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm);
|
||||
}
|
||||
void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
||||
{
|
||||
Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn);
|
||||
}
|
||||
void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||
{
|
||||
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm);
|
||||
}
|
||||
void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
||||
{
|
||||
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn);
|
||||
}
|
||||
void ARM64FloatEmitter::CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||
{
|
||||
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm);
|
||||
}
|
||||
void ARM64FloatEmitter::CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||
{
|
||||
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm);
|
||||
}
|
||||
void ARM64FloatEmitter::CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
||||
{
|
||||
Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn);
|
||||
}
|
||||
void ARM64FloatEmitter::CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
||||
{
|
||||
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0xA, Rd, Rn);
|
||||
}
|
||||
void ARM64FloatEmitter::CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||
{
|
||||
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm);
|
||||
}
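All of these integer-comparison emitters derive the NEON size field the same way, so a quick standalone sanity check of the MathUtil::IntLog2(size) - 3 mapping may help when reading the encodings above. IntLog2Ref below is a local stand-in for MathUtil::IntLog2, assumed to be a plain integer log2; this is illustrative, not Dolphin code.

#include <cassert>
#include <cstdint>

constexpr int IntLog2Ref(uint64_t value)  // stand-in for MathUtil::IntLog2
{
  int log = 0;
  while (value >>= 1)
    ++log;
  return log;
}

int main()
{
  assert(IntLog2Ref(8) - 3 == 0);   // byte lanes
  assert(IntLog2Ref(16) - 3 == 1);  // halfword lanes
  assert(IntLog2Ref(32) - 3 == 2);  // word lanes
  assert(IntLog2Ref(64) - 3 == 3);  // doubleword lanes
}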
|
||||
|
||||
// Float comparison
|
||||
void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm)
|
||||
{
|
||||
EmitCompare(0, 0, 0, 0, Rn, Rm);
|
||||
@ -3664,7 +3715,7 @@ void ARM64FloatEmitter::SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
|
||||
{
|
||||
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
|
||||
shift, src_size);
|
||||
EmitShiftImm(1, 0, src_size | shift, 0b01010, Rd, Rn);
|
||||
EmitShiftImm(IsQuad(Rd), 0, src_size | shift, 0b01010, Rd, Rn);
|
||||
}
|
||||
|
||||
void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
|
||||
@ -3674,11 +3725,18 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift,
|
||||
EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn);
|
||||
}
|
||||
|
||||
void ARM64FloatEmitter::SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
|
||||
{
|
||||
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
|
||||
shift, src_size);
|
||||
EmitShiftImm(IsQuad(Rd), 0, src_size * 2 - shift, 0b00000, Rd, Rn);
|
||||
}
|
||||
|
||||
void ARM64FloatEmitter::URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
|
||||
{
|
||||
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
|
||||
shift, src_size);
|
||||
EmitShiftImm(1, 1, src_size * 2 - shift, 0b00100, Rd, Rn);
|
||||
EmitShiftImm(IsQuad(Rd), 1, src_size * 2 - shift, 0b00100, Rd, Rn);
|
||||
}
|
||||
|
||||
void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
|
||||
|
||||
@ -800,6 +800,7 @@ public:
|
||||
ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR;
|
||||
CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
|
||||
}
|
||||
void CNEG(ARM64Reg Rd, ARM64Reg Rn, CCFlags cond) { CSNEG(Rd, Rn, Rn, (CCFlags)((u32)cond ^ 1)); }
|
||||
void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
|
||||
void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
|
||||
{
|
||||
@ -1281,6 +1282,7 @@ public:
|
||||
void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
|
||||
void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||
void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
@ -1342,6 +1344,19 @@ public:
|
||||
void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
|
||||
void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
|
||||
|
||||
// Comparison
|
||||
void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||
void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||
void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||
void CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
void CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||
void CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||
void CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||
|
||||
// Float comparison
|
||||
void FCMP(ARM64Reg Rn, ARM64Reg Rm);
|
||||
void FCMP(ARM64Reg Rn);
|
||||
@ -1380,6 +1395,7 @@ public:
|
||||
void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||
void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||
void SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||
void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||
void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||
|
||||
@ -2519,19 +2519,19 @@ void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg)
|
||||
WriteSSEOp(0x66, 0x6C, dest, arg);
|
||||
}
|
||||
|
||||
void XEmitter::PSRLW(X64Reg reg, int shift)
|
||||
void XEmitter::PSRLW(X64Reg reg, u8 shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
void XEmitter::PSRLD(X64Reg reg, int shift)
|
||||
void XEmitter::PSRLD(X64Reg reg, u8 shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
void XEmitter::PSRLQ(X64Reg reg, int shift)
|
||||
void XEmitter::PSRLQ(X64Reg reg, u8 shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
|
||||
Write8(shift);
|
||||
@ -2542,38 +2542,38 @@ void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg)
|
||||
WriteSSEOp(0x66, 0xd3, reg, arg);
|
||||
}
|
||||
|
||||
void XEmitter::PSRLDQ(X64Reg reg, int shift)
|
||||
void XEmitter::PSRLDQ(X64Reg reg, u8 shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
void XEmitter::PSLLW(X64Reg reg, int shift)
|
||||
void XEmitter::PSLLW(X64Reg reg, u8 shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
void XEmitter::PSLLD(X64Reg reg, int shift)
|
||||
void XEmitter::PSLLD(X64Reg reg, u8 shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
void XEmitter::PSLLQ(X64Reg reg, int shift)
|
||||
void XEmitter::PSLLQ(X64Reg reg, u8 shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
void XEmitter::PSLLDQ(X64Reg reg, int shift)
|
||||
void XEmitter::PSLLDQ(X64Reg reg, u8 shift)
|
||||
{
|
||||
WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// WARNING not REX compatible
|
||||
void XEmitter::PSRAW(X64Reg reg, int shift)
|
||||
void XEmitter::PSRAW(X64Reg reg, u8 shift)
|
||||
{
|
||||
if (reg > 7)
|
||||
PanicAlertFmt("The PSRAW-emitter does not support regs above 7");
|
||||
@ -2585,7 +2585,7 @@ void XEmitter::PSRAW(X64Reg reg, int shift)
|
||||
}
|
||||
|
||||
// WARNING not REX compatible
|
||||
void XEmitter::PSRAD(X64Reg reg, int shift)
|
||||
void XEmitter::PSRAD(X64Reg reg, u8 shift)
|
||||
{
|
||||
if (reg > 7)
|
||||
PanicAlertFmt("The PSRAD-emitter does not support regs above 7");
|
||||
@ -2695,6 +2695,11 @@ void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend)
|
||||
Write8(blend);
|
||||
}
|
||||
|
||||
void XEmitter::PCMPEQQ(X64Reg dest, const OpArg& arg)
|
||||
{
|
||||
WriteSSE41Op(0x66, 0x3829, dest, arg);
|
||||
}
|
||||
|
||||
void XEmitter::PAND(X64Reg dest, const OpArg& arg)
|
||||
{
|
||||
WriteSSEOp(0x66, 0xDB, dest, arg);
|
||||
@ -3038,6 +3043,12 @@ void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
|
||||
WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg);
|
||||
}
|
||||
|
||||
void XEmitter::VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift)
|
||||
{
|
||||
WriteAVXOp(0x66, 0x73, (X64Reg)6, regOp1, R(regOp2));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
void XEmitter::VMOVAPS(const OpArg& arg, X64Reg regOp)
|
||||
{
|
||||
WriteAVXOp(0x00, 0x29, regOp, X64Reg::INVALID_REG, arg);
|
||||
|
||||
@ -801,19 +801,19 @@ public:
|
||||
void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle);
|
||||
void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle);
|
||||
|
||||
void PSRLW(X64Reg reg, int shift);
|
||||
void PSRLD(X64Reg reg, int shift);
|
||||
void PSRLQ(X64Reg reg, int shift);
|
||||
void PSRLW(X64Reg reg, u8 shift);
|
||||
void PSRLD(X64Reg reg, u8 shift);
|
||||
void PSRLQ(X64Reg reg, u8 shift);
|
||||
void PSRLQ(X64Reg reg, const OpArg& arg);
|
||||
void PSRLDQ(X64Reg reg, int shift);
|
||||
void PSRLDQ(X64Reg reg, u8 shift);
|
||||
|
||||
void PSLLW(X64Reg reg, int shift);
|
||||
void PSLLD(X64Reg reg, int shift);
|
||||
void PSLLQ(X64Reg reg, int shift);
|
||||
void PSLLDQ(X64Reg reg, int shift);
|
||||
void PSLLW(X64Reg reg, u8 shift);
|
||||
void PSLLD(X64Reg reg, u8 shift);
|
||||
void PSLLQ(X64Reg reg, u8 shift);
|
||||
void PSLLDQ(X64Reg reg, u8 shift);
|
||||
|
||||
void PSRAW(X64Reg reg, int shift);
|
||||
void PSRAD(X64Reg reg, int shift);
|
||||
void PSRAW(X64Reg reg, u8 shift);
|
||||
void PSRAD(X64Reg reg, u8 shift);
|
||||
|
||||
// SSE4: data type conversions
|
||||
void PMOVSXBW(X64Reg dest, const OpArg& arg);
|
||||
@ -836,6 +836,9 @@ public:
|
||||
void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend);
|
||||
void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend);
|
||||
|
||||
// SSE4: compare instructions
|
||||
void PCMPEQQ(X64Reg dest, const OpArg& arg);
|
||||
|
||||
// AVX
|
||||
void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
|
||||
void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
|
||||
@ -878,6 +881,8 @@ public:
|
||||
void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
|
||||
void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
|
||||
|
||||
void VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift);
|
||||
|
||||
void VMOVAPS(const OpArg& arg, X64Reg regOp);
|
||||
|
||||
void VZEROUPPER();
|
||||
|
||||
@ -222,6 +222,7 @@ const Info<bool> MAIN_DIVIDE_BY_ZERO_EXCEPTIONS{{System::Main, "Core", "DivByZer
|
||||
false};
|
||||
const Info<bool> MAIN_FPRF{{System::Main, "Core", "FPRF"}, false};
|
||||
const Info<bool> MAIN_ACCURATE_NANS{{System::Main, "Core", "AccurateNaNs"}, false};
|
||||
const Info<bool> MAIN_ACCURATE_FMADDS{{System::Main, "Core", "AccurateFmadds"}, true};
|
||||
const Info<bool> MAIN_DISABLE_ICACHE{{System::Main, "Core", "DisableICache"}, false};
|
||||
const Info<float> MAIN_EMULATION_SPEED{{System::Main, "Core", "EmulationSpeed"}, 1.0f};
|
||||
#if defined(ANDROID)
|
||||
|
||||
@ -128,6 +128,7 @@ extern const Info<bool> MAIN_FLOAT_EXCEPTIONS;
|
||||
extern const Info<bool> MAIN_DIVIDE_BY_ZERO_EXCEPTIONS;
|
||||
extern const Info<bool> MAIN_FPRF;
|
||||
extern const Info<bool> MAIN_ACCURATE_NANS;
|
||||
extern const Info<bool> MAIN_ACCURATE_FMADDS;
|
||||
extern const Info<bool> MAIN_DISABLE_ICACHE;
|
||||
extern const Info<float> MAIN_EMULATION_SPEED;
|
||||
extern const Info<bool> MAIN_PRECISION_FRAME_TIMING;
|
||||
|
||||
@ -80,6 +80,7 @@ public:
|
||||
layer->Set(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS, m_settings.divide_by_zero_exceptions);
|
||||
layer->Set(Config::MAIN_FPRF, m_settings.fprf);
|
||||
layer->Set(Config::MAIN_ACCURATE_NANS, m_settings.accurate_nans);
|
||||
layer->Set(Config::MAIN_ACCURATE_FMADDS, m_settings.accurate_fmadds);
|
||||
layer->Set(Config::MAIN_DISABLE_ICACHE, m_settings.disable_icache);
|
||||
layer->Set(Config::MAIN_SYNC_ON_SKIP_IDLE, m_settings.sync_on_skip_idle);
|
||||
layer->Set(Config::MAIN_SYNC_GPU, m_settings.sync_gpu);
|
||||
|
||||
@ -68,6 +68,7 @@ struct NetSettings
|
||||
bool divide_by_zero_exceptions = false;
|
||||
bool fprf = false;
|
||||
bool accurate_nans = false;
|
||||
bool accurate_fmadds = false;
|
||||
bool disable_icache = false;
|
||||
bool sync_on_skip_idle = false;
|
||||
bool sync_gpu = false;
|
||||
|
||||
@ -1425,6 +1425,7 @@ bool NetPlayServer::SetupNetSettings()
|
||||
settings.divide_by_zero_exceptions = Config::Get(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS);
|
||||
settings.fprf = Config::Get(Config::MAIN_FPRF);
|
||||
settings.accurate_nans = Config::Get(Config::MAIN_ACCURATE_NANS);
|
||||
settings.accurate_fmadds = Config::Get(Config::MAIN_ACCURATE_FMADDS);
|
||||
settings.disable_icache = Config::Get(Config::MAIN_DISABLE_ICACHE);
|
||||
settings.sync_on_skip_idle = Config::Get(Config::MAIN_SYNC_ON_SKIP_IDLE);
|
||||
settings.sync_gpu = Config::Get(Config::MAIN_SYNC_GPU);
|
||||
|
||||
@@ -342,12 +342,12 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
 //      - This will cause `d` to round to 100...00, meaning it will tie then round upwards.
 // 3. Tying up to even because `c` is too small
 //   a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties)
-//   b. The lowest bit of `f` is 1 (this means it ties to even downwards)
+//   b. The lowest bit of `f` is 1 (this means it ties to even upwards)
 //   c. `c` is negative and does not round `d` downwards
 //      - This is similar to the first one but in reverse, rounding up instead of down.
 // 4. Tying down because `d` rounded down
 //   a. The highest and lowest bits of `d` are 1, the rest of the bits of `d` are 0
-//   b. The lowest bit of `f` is 0 (this means it ties to even upwards)
+//   b. The lowest bit of `f` is 0 (this means it ties to even downwards)
 //   c. `c` is negative, and the highest bit of c is 1,
 //      and at least one other bit of c is nonzero
 //      - The backwards counterpart to case 2, this will cause `d` to round back down to 100..00,
@@ -375,12 +375,6 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
 // - Correct ordering of NaN checking (for both double and single precision)
 // - Rounding frC up
 // - Rounding only once for single precision inputs (this will be the large majority of cases!)
-//   - Currently this is interpreter-only.
-//     This can be implemented in the JIT just as easily, though.
-//     Eventually the JITs should hopefully support detecting back to back
-//     single-precision operations, which will lead to no overhead at all.
-//     In the cases where JITs can't do this, an alternative method is used, as
-//     is done in the interpreter as well.
 // - Rounding only once for double precision inputs
 //   - This is a side effect of how we handle single-precision inputs: By doing
 //     error calculations rather than checking if every input is a float, we ensure that we know
@@ -421,7 +415,7 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
   const double b_sign = sub ? -b : b;
   result.value = std::fma(a, c_round, b_sign);
 
-  // We then check if we're currently tying in rounding directioh
+  // We then check if we're currently tying in rounding direction
   const u64 result_bits = std::bit_cast<u64>(result.value);
 
   // The mask of the `d` bits as shown in the above comments
@@ -432,9 +426,8 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
 
   // Because we check this entire mask which includes a 1 bit, we can be sure that
   // if this result passes, the input is not an infinity that would become a NaN.
-  // This means that, for the JITs, if they only wanted to check for a subset of these
-  // bits (e.g. only checking if the last one was 0), then using the zero flag for a branch,
-  // they would have to check if the result was NaN before here.
+  // If we had only checked for a subset of these bits (e.g. only checking if the last
+  // one was 0), we would have needed to also check if the exponent was all ones.
   if ((result_bits & D_MASK) == EVEN_TIE)
   {
     // Because we have a tie, we now compute any error in the FMA calculation
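For readers who want the shape of the whole scheme in one place, here is a standalone sketch of the idea these comments describe. It is illustrative only, not Dolphin's NI_madd_msub: it ignores NaNs, infinities, FPSCR effects and the 25-bit rounding of frC, and assumes the default round-to-nearest mode with the product a*c exactly representable (which holds for single-precision inputs).

#include <bit>
#include <cmath>
#include <cstdint>

// Compute float(a*c + b) with only one effective rounding, using a double FMA plus a
// tie-breaking correction. D_MASK covers the 29 mantissa bits discarded by double->float;
// EVEN_TIE is the pattern exactly halfway between two representable singles.
float FmaddsSingleRounding(double a, double c, double b)
{
  const double d = std::fma(a, c, b);
  const uint64_t bits = std::bit_cast<uint64_t>(d);
  constexpr uint64_t D_MASK = 0x1FFFFFFF;
  constexpr uint64_t EVEN_TIE = 0x10000000;

  if ((bits & D_MASK) == EVEN_TIE)
  {
    // 2Sum error-free transformation: recover the rounding error of d exactly
    // (valid when a*c is exactly representable, e.g. for single-precision inputs).
    const double a_prime = d - b;
    const double b_prime = d - a_prime;
    const double da = std::fma(a, c, -a_prime);  // a*c - a'
    const double db = b - b_prime;
    const double err = da + db;
    if (err != 0.0)
    {
      // Nudge d by one ulp towards the exact value so the final double->float
      // conversion breaks the tie in the right direction.
      const uint64_t step = std::signbit(err) == std::signbit(d) ? 1 : ~uint64_t{0};
      return static_cast<float>(std::bit_cast<double>(bits + step));
    }
  }
  return static_cast<float>(d);
}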
|
||||
|
||||
@ -1284,9 +1284,9 @@ BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const
|
||||
return cb.m_gqr_used & ~cb.m_gqr_modified;
|
||||
}
|
||||
|
||||
BitSet32 Jit64::CallerSavedRegistersInUse() const
|
||||
BitSet32 Jit64::CallerSavedRegistersInUse(BitSet32 additional_registers) const
|
||||
{
|
||||
BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16);
|
||||
BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16) | additional_registers;
|
||||
return in_use & ABI_ALL_CALLER_SAVED;
|
||||
}
|
||||
|
||||
|
||||
@ -77,7 +77,7 @@ public:
|
||||
// Returns false if no free memory region can be found for either of the two.
|
||||
bool SetEmitterStateToFreeCodeRegion();
|
||||
|
||||
BitSet32 CallerSavedRegistersInUse() const;
|
||||
BitSet32 CallerSavedRegistersInUse(BitSet32 additional_registers = {}) const;
|
||||
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;
|
||||
|
||||
void IntializeSpeculativeConstants();
|
||||
@ -153,9 +153,10 @@ public:
|
||||
void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true,
|
||||
bool duplicate = false);
|
||||
void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
|
||||
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, Gen::X64Reg clobber,
|
||||
std::optional<Gen::OpArg> Ra, std::optional<Gen::OpArg> Rb,
|
||||
std::optional<Gen::OpArg> Rc);
|
||||
[[nodiscard]] Gen::FixupBranch HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm,
|
||||
Gen::X64Reg clobber, std::optional<Gen::OpArg> Ra,
|
||||
std::optional<Gen::OpArg> Rb,
|
||||
std::optional<Gen::OpArg> Rc);
|
||||
|
||||
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
||||
|
||||
|
||||
@ -265,6 +265,10 @@ void Jit64AsmRoutineManager::GenerateCommon()
|
||||
GenMfcr();
|
||||
cdts = AlignCode4();
|
||||
GenConvertDoubleToSingle();
|
||||
fmadds_eft = AlignCode4();
|
||||
GenerateFmaddsEft();
|
||||
ps_madd_eft = AlignCode4();
|
||||
GeneratePsMaddEft();
|
||||
|
||||
GenQuantizedLoads();
|
||||
GenQuantizedSingleLoads();
|
||||
|
||||
@ -93,8 +93,9 @@ void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input)
|
||||
SetFPRFIfNeeded(input, false);
|
||||
}
|
||||
|
||||
void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::optional<OpArg> Ra,
|
||||
std::optional<OpArg> Rb, std::optional<OpArg> Rc)
|
||||
FixupBranch Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber,
|
||||
std::optional<OpArg> Ra, std::optional<OpArg> Rb,
|
||||
std::optional<OpArg> Rc)
|
||||
{
|
||||
// | PowerPC | x86
|
||||
// ---------------------+----------+---------
|
||||
@ -104,9 +105,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
|
||||
// Dragon Ball: Revenge of King Piccolo requires generated NaNs
|
||||
// to be positive, so we'll have to handle them manually.
|
||||
|
||||
if (!m_accurate_nans)
|
||||
return;
|
||||
|
||||
if (inst.OPCD != 4)
|
||||
{
|
||||
// not paired-single
|
||||
@ -140,7 +138,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
|
||||
|
||||
FixupBranch done = J(Jump::Near);
|
||||
SwitchToNearCode();
|
||||
SetJumpTarget(done);
|
||||
return done;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -217,7 +215,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
|
||||
|
||||
FixupBranch done = J(Jump::Near);
|
||||
SwitchToNearCode();
|
||||
SetJumpTarget(done);
|
||||
return done;
|
||||
}
|
||||
}
|
||||
|
||||
@ -329,14 +327,21 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
||||
}
|
||||
}
|
||||
|
||||
switch (inst.SUBOP5)
|
||||
if (m_accurate_nans)
|
||||
{
|
||||
case 18:
|
||||
HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt);
|
||||
break;
|
||||
case 25:
|
||||
HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2);
|
||||
break;
|
||||
std::optional<FixupBranch> handled_nans;
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18:
|
||||
handled_nans = HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt);
|
||||
break;
|
||||
case 25:
|
||||
handled_nans = HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2);
|
||||
break;
|
||||
}
|
||||
|
||||
if (handled_nans)
|
||||
SetJumpTarget(*handled_nans);
|
||||
}
|
||||
|
||||
if (single)
|
||||
@ -368,51 +373,87 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
const bool use_fma = Config::Get(Config::SESSION_USE_FMA);
|
||||
const bool software_fma = use_fma && !cpu_info.bFMA;
|
||||
|
||||
int a = inst.FA;
|
||||
int b = inst.FB;
|
||||
int c = inst.FC;
|
||||
int d = inst.FD;
|
||||
bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
||||
bool round_input = single && !js.op->fprIsSingle[c];
|
||||
bool preserve_inputs = m_accurate_nans;
|
||||
bool preserve_d = preserve_inputs && (a == d || b == d || c == d);
|
||||
bool packed =
|
||||
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
|
||||
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
||||
const int a = inst.FA;
|
||||
const int b = inst.FB;
|
||||
const int c = inst.FC;
|
||||
const int d = inst.FD;
|
||||
|
||||
const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30; // msub, nmsub
|
||||
const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31; // nmsub, nmadd
|
||||
const bool madds0 = inst.SUBOP5 == 14;
|
||||
const bool madds1 = inst.SUBOP5 == 15;
|
||||
const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1);
|
||||
const bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
||||
const bool round_input = single && !js.op->fprIsSingle[c];
|
||||
|
||||
const bool error_free_transformation = single && m_accurate_fmadds;
|
||||
const bool packed =
|
||||
inst.OPCD == 4 ||
|
||||
(!cpu_info.bAtom && !software_fma && !error_free_transformation && single &&
|
||||
js.op->fprIsDuplicated[a] && js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
||||
|
||||
const bool want_rc_rounded =
|
||||
(error_free_transformation || (software_fma && packed)) && round_input;
|
||||
const bool error_free_transformation_wants_rc_duplicated =
|
||||
(error_free_transformation && !want_rc_rounded) && (madds0 || madds1);
|
||||
const bool accurate_nans_wants_rc_duplicated = m_accurate_nans && (madds0 || madds1);
|
||||
const bool want_rc_duplicated =
|
||||
error_free_transformation_wants_rc_duplicated || accurate_nans_wants_rc_duplicated;
|
||||
|
||||
const bool preserve_d_due_to_a_or_b =
|
||||
(m_accurate_nans || error_free_transformation) && (a == d || b == d);
|
||||
const bool preserve_d_due_to_c =
|
||||
c == d && ((m_accurate_nans && (!want_rc_duplicated || software_fma)) ||
|
||||
(error_free_transformation && !want_rc_rounded));
|
||||
const bool preserve_d = preserve_d_due_to_a_or_b || preserve_d_due_to_c;
|
||||
|
||||
X64Reg scratch_xmm = XMM0;
|
||||
X64Reg result_xmm = XMM1;
|
||||
X64Reg Rc_duplicated = XMM2;
|
||||
X64Reg Rc_rounded = XMM3;
|
||||
|
||||
BitSet32 scratch_registers{XMM0 + 16, XMM1 + 16};
|
||||
|
||||
RCX64Reg xmm2_guard;
|
||||
RCX64Reg xmm3_guard;
|
||||
if (error_free_transformation)
|
||||
{
|
||||
xmm2_guard = fpr.Scratch(XMM2);
|
||||
xmm3_guard = fpr.Scratch(XMM3);
|
||||
RegCache::Realize(xmm2_guard, xmm3_guard);
|
||||
scratch_registers[XMM2 + 16] = true;
|
||||
scratch_registers[XMM3 + 16] = true;
|
||||
}
|
||||
else if (software_fma)
|
||||
{
|
||||
xmm2_guard = fpr.Scratch(XMM2);
|
||||
RegCache::Realize(xmm2_guard);
|
||||
scratch_registers[XMM2 + 16] = true;
|
||||
}
|
||||
|
||||
RCOpArg Ra;
|
||||
RCOpArg Rb;
|
||||
RCOpArg Rc;
|
||||
RCX64Reg Rd;
|
||||
RCX64Reg xmm2_guard;
|
||||
RCX64Reg result_xmm_guard;
|
||||
RCX64Reg Rc_duplicated_guard;
|
||||
if (software_fma)
|
||||
{
|
||||
xmm2_guard = fpr.Scratch(XMM2);
|
||||
Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
|
||||
Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||
Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
|
||||
Ra = packed || error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
|
||||
Rb = packed || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||
Rc = packed || (error_free_transformation && !want_rc_rounded && !want_rc_duplicated) ?
|
||||
fpr.Bind(c, RCMode::Read) :
|
||||
fpr.Use(c, RCMode::Read);
|
||||
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||
if (preserve_d && packed)
|
||||
{
|
||||
result_xmm_guard = fpr.Scratch();
|
||||
RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard);
|
||||
RegCache::Realize(Ra, Rb, Rc, Rd, result_xmm_guard);
|
||||
result_xmm = Gen::X64Reg(result_xmm_guard);
|
||||
scratch_registers[result_xmm + 16] = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard);
|
||||
RegCache::Realize(Ra, Rb, Rc, Rd);
|
||||
result_xmm = packed ? Gen::X64Reg(Rd) : XMM0;
|
||||
}
|
||||
}
|
||||
@ -421,48 +462,88 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
// For use_fma == true:
|
||||
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
||||
// if we have to pick one of a or b to bind, let's make it b.
|
||||
Ra = fpr.Use(a, RCMode::Read);
|
||||
Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||
Rc = fpr.Use(c, RCMode::Read);
|
||||
Ra = error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
|
||||
Rb =
|
||||
use_fma || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||
Rc = error_free_transformation && !want_rc_rounded && !want_rc_duplicated ?
|
||||
fpr.Bind(c, RCMode::Read) :
|
||||
fpr.Use(c, RCMode::Read);
|
||||
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||
RegCache::Realize(Ra, Rb, Rc, Rd);
|
||||
|
||||
if (madds_accurate_nans)
|
||||
{
|
||||
Rc_duplicated_guard = fpr.Scratch();
|
||||
RegCache::Realize(Rc_duplicated_guard);
|
||||
Rc_duplicated = Rc_duplicated_guard;
|
||||
}
|
||||
}
|
||||
|
||||
if (error_free_transformation_wants_rc_duplicated ||
|
||||
(accurate_nans_wants_rc_duplicated &&
|
||||
((!software_fma && !error_free_transformation) || (error_free_transformation && packed))))
|
||||
{
|
||||
Rc_duplicated_guard = fpr.Scratch();
|
||||
RegCache::Realize(Rc_duplicated_guard);
|
||||
Rc_duplicated = Rc_duplicated_guard;
|
||||
scratch_registers[Rc_duplicated + 16] = true;
|
||||
}
|
||||
|
||||
const auto registers_to_save = [&](BitSet32 scratch_registers_to_save) {
|
||||
const BitSet32 scratch_registers_not_to_save = scratch_registers & ~scratch_registers_to_save;
|
||||
return CallerSavedRegistersInUse(scratch_registers_to_save) & ~scratch_registers_not_to_save;
|
||||
};
|
||||
|
||||
if (software_fma)
|
||||
{
|
||||
if (want_rc_rounded)
|
||||
{
|
||||
if (error_free_transformation && madds0)
|
||||
{
|
||||
MOVDDUP(Rc_rounded, Rc);
|
||||
Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2);
|
||||
}
|
||||
else if (error_free_transformation && madds1)
|
||||
{
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_rounded, Rc, Rc, 3);
|
||||
Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2);
|
||||
}
|
||||
else
|
||||
{
|
||||
Force25BitPrecision(Rc_rounded, Rc, XMM2);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
|
||||
{
|
||||
if ((i == 0 || madds0) && !madds1)
|
||||
if (madds0 || (i == 0 && !madds1) || (want_rc_rounded && error_free_transformation && madds1))
|
||||
{
|
||||
if (round_input)
|
||||
if (want_rc_rounded)
|
||||
MOVAPD(XMM1, R(Rc_rounded));
|
||||
else if (round_input)
|
||||
Force25BitPrecision(XMM1, Rc, XMM2);
|
||||
else if (Rc.IsSimpleReg())
|
||||
MOVAPD(XMM1, Rc);
|
||||
else
|
||||
MOVSD(XMM1, Rc);
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVHLPS(XMM1, Rc.GetSimpleReg());
|
||||
if (round_input)
|
||||
MOVHLPS(XMM1, want_rc_rounded ? Rc_rounded : Rc.GetSimpleReg());
|
||||
if (round_input && !want_rc_rounded)
|
||||
Force25BitPrecision(XMM1, R(XMM1), XMM2);
|
||||
}
|
||||
|
||||
// Write the result from the previous loop iteration into result_xmm so we don't lose it.
|
||||
// It's important that this is done after reading Rc above, in case we have madds1 and
|
||||
// result_xmm == Rd == Rc.
|
||||
// !want_rc_rounded and result_xmm == Rd == Rc.
|
||||
if (packed && i == 0)
|
||||
MOVLHPS(result_xmm, XMM0);
|
||||
|
||||
if (i == 0)
|
||||
{
|
||||
MOVSD(XMM0, Ra);
|
||||
MOVSD(XMM2, Rb);
|
||||
if (Ra.IsSimpleReg())
|
||||
MOVAPD(XMM0, Ra);
|
||||
else
|
||||
MOVSD(XMM0, Ra);
|
||||
|
||||
if (Rb.IsSimpleReg())
|
||||
MOVAPD(XMM2, Rb);
|
||||
else
|
||||
MOVSD(XMM2, Rb);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -473,23 +554,36 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
if (subtract)
|
||||
XORPS(XMM2, MConst(psSignBits));
|
||||
|
||||
BitSet32 registers_in_use = CallerSavedRegistersInUse();
|
||||
BitSet32 scratch_registers_to_save{};
|
||||
if (packed && i == 0)
|
||||
scratch_registers_to_save[result_xmm + 16] = true;
|
||||
if (want_rc_rounded && (error_free_transformation || i == 1))
|
||||
scratch_registers_to_save[Rc_rounded + 16] = true;
|
||||
|
||||
const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save);
|
||||
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
|
||||
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
|
||||
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
|
||||
}
|
||||
|
||||
if (packed)
|
||||
{
|
||||
// result_xmm's upper lane has the result of the first loop iteration
|
||||
MOVSD(R(result_xmm), XMM0);
|
||||
}
|
||||
else
|
||||
{
|
||||
DEBUG_ASSERT(result_xmm == XMM0);
|
||||
}
|
||||
|
||||
if (madds_accurate_nans)
|
||||
if (want_rc_duplicated)
|
||||
{
|
||||
if (madds0)
|
||||
MOVDDUP(Rc_duplicated, Rc);
|
||||
else
|
||||
else if (madds1)
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_duplicated, Rc, Rc, 3);
|
||||
else
|
||||
DEBUG_ASSERT(false);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -497,7 +591,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
if (madds0)
|
||||
{
|
||||
MOVDDUP(result_xmm, Rc);
|
||||
if (madds_accurate_nans)
|
||||
if (want_rc_duplicated)
|
||||
MOVAPD(R(Rc_duplicated), result_xmm);
|
||||
if (round_input)
|
||||
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
|
||||
@ -505,18 +599,21 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
else if (madds1)
|
||||
{
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
|
||||
if (madds_accurate_nans)
|
||||
if (want_rc_duplicated)
|
||||
MOVAPD(R(Rc_duplicated), result_xmm);
|
||||
if (round_input)
|
||||
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
|
||||
}
|
||||
else
|
||||
{
|
||||
DEBUG_ASSERT(!want_rc_duplicated);
|
||||
if (round_input)
|
||||
Force25BitPrecision(result_xmm, Rc, scratch_xmm);
|
||||
else
|
||||
MOVAPD(result_xmm, Rc);
|
||||
}
|
||||
if (want_rc_rounded)
|
||||
MOVAPD(R(Rc_rounded), result_xmm);
|
||||
|
||||
if (use_fma)
|
||||
{
|
||||
@ -556,6 +653,160 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
}
|
||||
}
|
||||
|
||||
if (m_accurate_nans && result_xmm == XMM0)
|
||||
{
|
||||
// HandleNaNs needs to clobber XMM0
|
||||
result_xmm = error_free_transformation ? XMM1 : Rd;
|
||||
MOVAPD(result_xmm, R(XMM0));
|
||||
DEBUG_ASSERT(!preserve_d);
|
||||
}
|
||||
|
||||
std::optional<FixupBranch> handled_nans;
|
||||
if (!packed && m_accurate_nans)
|
||||
{
|
||||
// The clobber register is unused when not packed.
|
||||
handled_nans =
|
||||
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc);
|
||||
}
|
||||
|
||||
// Read the comment in the interpreter function NI_madd_msub to find out what's going on here.
|
||||
if (error_free_transformation)
|
||||
{
|
||||
if (result_xmm != XMM1)
|
||||
{
|
||||
MOVAPD(XMM1, R(result_xmm));
|
||||
result_xmm = XMM1;
|
||||
}
|
||||
|
||||
X64Reg Rc_rounded_duplicated = Rc.GetSimpleReg();
|
||||
BitSet32 scratch_registers_to_save = {XMM1 + 16, XMM2 + 16};
|
||||
if (want_rc_rounded)
|
||||
{
|
||||
Rc_rounded_duplicated = Rc_rounded;
|
||||
scratch_registers_to_save[Rc_rounded + 16] = true;
|
||||
}
|
||||
else if (want_rc_duplicated)
|
||||
{
|
||||
Rc_rounded_duplicated = Rc_duplicated;
|
||||
scratch_registers_to_save[Rc_duplicated + 16] = true;
|
||||
}
|
||||
|
||||
// We've calculated s := a + b, with a = Ra * Rc_rounded_duplicated, b = subtract ? -Rb : Rb
|
||||
|
||||
if (packed)
|
||||
{
|
||||
// a' := s - b
|
||||
if (subtract)
|
||||
avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM0, R(XMM1), Rb);
|
||||
else
|
||||
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM1), Rb);
|
||||
|
||||
// b' := s - a'
|
||||
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM1), R(XMM0));
|
||||
|
||||
// da := a - a'
|
||||
if (software_fma)
|
||||
{
|
||||
scratch_registers_to_save[XMM0 + 16] = true;
|
||||
const BitSet32 registers_in_use_1 = registers_to_save(scratch_registers_to_save);
|
||||
ABI_PushRegistersAndAdjustStack(registers_in_use_1, 0);
|
||||
|
||||
avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits));
|
||||
MOVAPD(XMM0, R(Rc_rounded_duplicated));
|
||||
MOVAPD(XMM1, Ra);
|
||||
|
||||
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
|
||||
|
||||
// We will read from the upper lane of Rc_rounded_duplicated later,
|
||||
// so we need to make sure that that lane isn't overwritten.
|
||||
if (Rc_rounded_duplicated == XMM3)
|
||||
MOVSD(XMM3, R(XMM0));
|
||||
else
|
||||
MOVAPD(XMM3, R(XMM0));
|
||||
|
||||
ABI_PopRegistersAndAdjustStack(registers_in_use_1, 0);
|
||||
|
||||
scratch_registers_to_save[XMM0 + 16] = false;
|
||||
scratch_registers_to_save[XMM3 + 16] = true;
|
||||
const BitSet32 registers_in_use_2 = registers_to_save(scratch_registers_to_save);
|
||||
ABI_PushRegistersAndAdjustStack(registers_in_use_2, 0);
|
||||
|
||||
MOVHLPS(XMM2, XMM0);
|
||||
XORPS(XMM2, MConst(psSignBits));
|
||||
MOVHLPS(XMM0, Rc_rounded_duplicated);
|
||||
MOVHLPS(XMM1, Ra.GetSimpleReg());
|
||||
|
||||
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
|
||||
|
||||
ABI_PopRegistersAndAdjustStack(registers_in_use_2, 0);
|
||||
|
||||
UNPCKLPD(XMM0, R(XMM3));
|
||||
}
|
||||
else if (use_fma)
|
||||
{
|
||||
VFMSUB231PD(XMM0, Rc_rounded_duplicated, Ra);
|
||||
}
|
||||
else
|
||||
{
|
||||
avx_op(&XEmitter::VMULPD, &XEmitter::MULPD, XMM3, R(Rc_rounded_duplicated), Ra);
|
||||
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM3), R(XMM0), true, false, XMM3);
|
||||
}
|
||||
|
||||
// db := b - b'
|
||||
// (Transformed into -db := b' - b)
|
||||
if (subtract)
|
||||
avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM2, R(XMM2), Rb);
|
||||
else
|
||||
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM2), Rb);
|
||||
|
||||
CALL(GetAsmRoutines()->ps_madd_eft);
|
||||
}
|
||||
else
|
||||
{
|
||||
// a' := s - b
|
||||
if (subtract)
|
||||
avx_op(&XEmitter::VADDSD, &XEmitter::ADDSD, XMM0, R(XMM1), Rb, false);
|
||||
else
|
||||
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM1), Rb, false);
|
||||
|
||||
// b' := s - a'
|
||||
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM2, R(XMM1), R(XMM0), false);
|
||||
|
||||
// da := a - a'
|
||||
if (software_fma)
|
||||
{
|
||||
const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save);
|
||||
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
|
||||
|
||||
avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits));
|
||||
MOVAPD(XMM0, R(Rc_rounded_duplicated));
|
||||
MOVAPD(XMM1, Ra);
|
||||
|
||||
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
|
||||
|
||||
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
|
||||
}
|
||||
else if (use_fma)
|
||||
{
|
||||
VFMSUB231SD(XMM0, Rc_rounded_duplicated, Ra);
|
||||
}
|
||||
else
|
||||
{
|
||||
avx_op(&XEmitter::VMULSD, &XEmitter::MULSD, XMM3, R(Rc_rounded_duplicated), Ra, false);
|
||||
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM3), R(XMM0), false, false, XMM3);
|
||||
}
|
||||
|
||||
// db := b - b'
|
||||
// (Transformed into -db := b' - b)
|
||||
if (subtract)
|
||||
ADDSD(XMM2, Rb);
|
||||
else
|
||||
SUBSD(XMM2, Rb);
|
||||
|
||||
CALL(GetAsmRoutines()->fmadds_eft);
|
||||
}
|
||||
}
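The a' / b' / da / db steps above are the classical 2Sum error-free transformation. For reference, the identity being relied on (a standard floating-point fact, stated here under round-to-nearest, no overflow, and with the product a = Ra * Rc exactly representable, which is the single-precision-input case the interpreter comments single out):

\[
\begin{aligned}
s &= \operatorname{fl}(a + b), \qquad a' = \operatorname{fl}(s - b), \qquad b' = \operatorname{fl}(s - a'),\\
d_a &= a - a', \qquad d_b = b - b', \qquad a + b = s + (d_a + d_b)\ \text{exactly}.
\end{aligned}
\]

Only the sign of d_a + d_b matters here: when s is an even tie for the later double-to-single rounding, that sign says which neighbouring single the exact result is closer to, and the helper routine nudges s by one ulp accordingly.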
|
||||
|
||||
// Using x64's nmadd/nmsub would require us to swap the sign of the addend
|
||||
// (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes.
|
||||
// Also, PowerPC's nmadd/nmsub round before the final negation unlike x64's nmadd/nmsub.
|
||||
@ -563,16 +814,19 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
if (negate)
|
||||
XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
|
||||
|
||||
if (m_accurate_nans && result_xmm == XMM0)
|
||||
if (packed && m_accurate_nans)
|
||||
{
|
||||
// HandleNaNs needs to clobber XMM0
|
||||
MOVAPD(Rd, R(result_xmm));
|
||||
result_xmm = Rd;
|
||||
DEBUG_ASSERT(!preserve_d);
|
||||
// If packed, the clobber register must be XMM0.
|
||||
handled_nans =
|
||||
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc);
|
||||
}
|
||||
|
||||
// If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.
|
||||
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, madds_accurate_nans ? R(Rc_duplicated) : Rc);
|
||||
// If the handled_nans branch was taken in the non-packed case, that means the result is NaN,
|
||||
// so we can skip the XORPD and the error-free transformation. If the handled_nans branch was
|
||||
// taken in the packed case, we don't know if both of the results were NaN or only one, so we
|
||||
// can't skip anything.
|
||||
if (handled_nans)
|
||||
SetJumpTarget(*handled_nans);
|
||||
|
||||
if (single)
|
||||
FinalizeSingleResult(Rd, R(result_xmm), packed, true);
|
||||
|
||||
@ -100,12 +100,19 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||
default:
|
||||
PanicAlertFmt("ps_muls WTF!!!");
|
||||
}
|
||||
|
||||
if (round_input)
|
||||
Force25BitPrecision(XMM1, R(Rc_duplicated), XMM0);
|
||||
else if (XMM1 != Rc_duplicated)
|
||||
MOVAPD(XMM1, Rc_duplicated);
|
||||
MULPD(XMM1, Ra);
|
||||
HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated);
|
||||
|
||||
if (m_accurate_nans)
|
||||
{
|
||||
const FixupBranch handled_nans = HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated);
|
||||
SetJumpTarget(handled_nans);
|
||||
}
|
||||
|
||||
FinalizeSingleResult(Rd, R(XMM1));
|
||||
}
|
||||
|
||||
|
||||
@ -741,7 +741,8 @@ void EmuCodeBlock::JitClearCA()
|
||||
// Abstract between AVX and SSE: automatically handle 3-operand instructions
|
||||
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
|
||||
void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp,
|
||||
const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible)
|
||||
const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible,
|
||||
X64Reg scratch)
|
||||
{
|
||||
if (arg1.IsSimpleReg(regOp))
|
||||
{
|
||||
@ -778,19 +779,19 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
|
||||
else
|
||||
{
|
||||
// The ugly case: Not reversible, and we have regOp == arg2 without AVX or with arg1 == memory
|
||||
if (!arg1.IsSimpleReg(XMM0))
|
||||
MOVAPD(XMM0, arg1);
|
||||
if (!arg1.IsSimpleReg(scratch))
|
||||
MOVAPD(scratch, arg1);
|
||||
if (cpu_info.bAVX)
|
||||
{
|
||||
(this->*avxOp)(regOp, XMM0, arg2);
|
||||
(this->*avxOp)(regOp, scratch, arg2);
|
||||
}
|
||||
else
|
||||
{
|
||||
(this->*sseOp)(XMM0, arg2);
|
||||
(this->*sseOp)(scratch, arg2);
|
||||
if (packed)
|
||||
MOVAPD(regOp, R(XMM0));
|
||||
MOVAPD(regOp, R(scratch));
|
||||
else
|
||||
MOVSD(regOp, R(XMM0));
|
||||
MOVSD(regOp, R(scratch));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -798,7 +799,7 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
|
||||
// Abstract between AVX and SSE: automatically handle 3-operand instructions
|
||||
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&, u8),
|
||||
void (XEmitter::*sseOp)(X64Reg, const OpArg&, u8), X64Reg regOp,
|
||||
const OpArg& arg1, const OpArg& arg2, u8 imm)
|
||||
const OpArg& arg1, const OpArg& arg2, u8 imm, X64Reg scratch)
|
||||
{
|
||||
if (arg1.IsSimpleReg(regOp))
|
||||
{
|
||||
@ -816,21 +817,40 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
|
||||
else
|
||||
{
|
||||
// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
|
||||
if (!arg1.IsSimpleReg(XMM0))
|
||||
MOVAPD(XMM0, arg1);
|
||||
if (!arg1.IsSimpleReg(scratch))
|
||||
MOVAPD(scratch, arg1);
|
||||
if (cpu_info.bAVX)
|
||||
{
|
||||
(this->*avxOp)(regOp, XMM0, arg2, imm);
|
||||
(this->*avxOp)(regOp, scratch, arg2, imm);
|
||||
}
|
||||
else
|
||||
{
|
||||
(this->*sseOp)(XMM0, arg2, imm);
|
||||
if (regOp != XMM0)
|
||||
MOVAPD(regOp, R(XMM0));
|
||||
(this->*sseOp)(scratch, arg2, imm);
|
||||
if (regOp != scratch)
|
||||
MOVAPD(regOp, R(scratch));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Abstract between AVX and SSE: automatically handle 3-operand instructions
|
||||
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, u8),
|
||||
void (XEmitter::*sseOp)(X64Reg, u8), X64Reg regOp1, X64Reg regOp2, u8 imm)
|
||||
{
|
||||
if (regOp1 == regOp2)
|
||||
{
|
||||
(this->*sseOp)(regOp1, imm);
|
||||
}
|
||||
else if (cpu_info.bAVX)
|
||||
{
|
||||
(this->*avxOp)(regOp1, regOp2, imm);
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVAPD(regOp1, R(regOp2));
|
||||
(this->*sseOp)(regOp1, imm);
|
||||
}
|
||||
}
|
||||
|
||||
alignas(16) static const u64 psMantissaTruncate[2] = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
|
||||
alignas(16) static const u64 psRoundBit[2] = {0x8000000, 0x8000000};
|
||||
|
||||
@ -842,8 +862,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
|
||||
{
|
||||
if (m_jit.jo.accurateSinglePrecision)
|
||||
{
|
||||
DEBUG_ASSERT(output != tmp);
|
||||
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
|
||||
if (input.IsSimpleReg() && cpu_info.bAVX)
|
||||
if (input.IsSimpleReg() && !input.IsSimpleReg(tmp) && cpu_info.bAVX)
|
||||
{
|
||||
VPAND(tmp, input.GetSimpleReg(), MConst(psRoundBit));
|
||||
VPAND(output, input.GetSimpleReg(), MConst(psMantissaTruncate));
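The two constants above implement the formula from the code comment for rounding frC to 25 significant bits. A standalone scalar equivalent of what the vector code does per lane (illustrative only; assumes a finite input, where a carry out of the mantissa still yields the correctly rounded value):

#include <bit>
#include <cstdint>

// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
// Clearing the low 28 bits keeps 25 significant bits; adding the doubled guard bit rounds
// upwards when bit 27 is set, matching the "rounding frC up" behaviour noted in the interpreter.
double Force25BitPrecisionRef(double value)
{
  const uint64_t bits = std::bit_cast<uint64_t>(value);
  const uint64_t rounded = (bits & ~uint64_t{0xFFFFFFF}) + ((bits & (uint64_t{1} << 27)) << 1);
  return std::bit_cast<double>(rounded);
}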
|
||||
|
||||
@ -113,10 +113,14 @@ public:
|
||||
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
|
||||
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), Gen::X64Reg regOp,
|
||||
const Gen::OpArg& arg1, const Gen::OpArg& arg2, bool packed = true,
|
||||
bool reversible = false);
|
||||
bool reversible = false, Gen::X64Reg scratch = Gen::XMM0);
|
||||
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&, u8),
|
||||
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp,
|
||||
const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm);
|
||||
const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm,
|
||||
Gen::X64Reg scratch = Gen::XMM0);
|
||||
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, u8),
|
||||
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, u8), Gen::X64Reg regOp1, Gen::X64Reg regOp2,
|
||||
u8 imm);
|
||||
|
||||
void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp);
|
||||
|
||||
|
||||
@ -326,6 +326,98 @@ void CommonAsmRoutines::GenMfcr()
|
||||
Common::JitRegister::Register(start, GetCodePtr(), "JIT_Mfcr");
|
||||
}
|
||||
|
||||
// Inputs:
// XMM0: First error term
// XMM1: Result with potentially incorrect rounding
// XMM2: Second error term, negated
//
// Outputs result with corrected rounding in XMM1.
// Clobbers RSCRATCH, RSCRATCH2, XMM0, XMM2, and flags.
void CommonAsmRoutines::GenerateFmaddsEft()
{
  // Check if XMM1 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
  MOVQ_xmm(R(RSCRATCH), XMM1);
  MOV(32, R(RSCRATCH2), Imm32(0x80000000));
  LEA(32, RSCRATCH2, MComplex(RSCRATCH2, RSCRATCH, SCALE_8, 0));
  TEST(32, R(RSCRATCH2), R(RSCRATCH2));
  FixupBranch even_tie = J_CC(CCFlags::CC_Z);

  const u8* ret = GetCodePtr();
  RET();

  // Check if the error is 0
  SetJumpTarget(even_tie);
  SUBSD(XMM0, R(XMM2));
  XORPD(XMM2, R(XMM2));
  UCOMISD(XMM0, R(XMM2));
  J_CC(CCFlags::CC_E, ret);

  // Round XMM1 up or down
  MOVQ_xmm(R(RSCRATCH2), XMM0);
  XOR(64, R(RSCRATCH2), R(RSCRATCH));
  SAR(64, R(RSCRATCH2), Imm8(63));
  OR(64, R(RSCRATCH2), Imm8(1));
  ADD(64, R(RSCRATCH), R(RSCRATCH2));
  MOVQ_xmm(XMM1, R(RSCRATCH));
  RET();
}
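The MOV/LEA/TEST sequence above folds the tie test into a single 32-bit flag-setting computation: the low 32 bits of bits * 8 + 0x80000000 are zero exactly when (bits & 0x1fffffff) == 0x10000000. A small standalone check of that equivalence (illustrative, not Dolphin code):

#include <cstdint>

bool IsEvenTieReference(uint64_t bits)
{
  return (bits & 0x1FFFFFFF) == 0x10000000;
}

bool IsEvenTieLeaTrick(uint64_t bits)
{
  // Mirrors LEA(32, RSCRATCH2, [0x80000000 + RSCRATCH * 8]) followed by TEST(32, ...),
  // where RSCRATCH holds the result bits; the branch is taken when the 32-bit result is zero.
  const uint32_t low32 = static_cast<uint32_t>(UINT64_C(0x80000000) + bits * 8);
  return low32 == 0;
}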

alignas(16) static const __m128i double_msb = _mm_set_epi64x(0x8000000000000000,
                                                             0x8000000000000000);
alignas(16) static const __m128i double_lsb = _mm_set_epi64x(1, 1);

// Inputs:
// XMM0: First error terms
// XMM1: Results with potentially incorrect rounding
// XMM2: Second error terms, negated
//
// Outputs results with corrected rounding in XMM1. Clobbers RSCRATCH, XMM0-XMM3, and flags.
void CommonAsmRoutines::GeneratePsMaddEft()
{
  // Check if XMM1 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
  avx_op(&XEmitter::VPSLLQ, &XEmitter::PSLLQ, XMM3, XMM1, 35);
  if (cpu_info.bSSE4_1)
  {
    PCMPEQQ(XMM3, MConst(double_msb));
  }
  else
  {
    PCMPEQW(XMM3, MConst(double_msb));
    PSHUFD(XMM3, R(XMM3), 0xF5);
  }

  // Just for performance, exit early if there is no even tie
  if (cpu_info.bSSE4_1)
  {
    PTEST(XMM3, R(XMM3));
  }
  else
  {
    PMOVMSKB(RSCRATCH, R(XMM3));
    TEST(32, R(RSCRATCH), R(RSCRATCH));
  }
  FixupBranch even_tie = J_CC(CCFlags::CC_NZ);
  RET();
  SetJumpTarget(even_tie);

  // Check if the error is zero
  SUBPD(XMM0, R(XMM2));
  XORPD(XMM2, R(XMM2));
  CMPPD(XMM2, R(XMM0), CMP_EQ);

  // Store -1 or 1 in XMM0 depending on whether we're rounding down or up
  PXOR(XMM0, R(XMM1));
  PSRAD(XMM0, 31);
  PSHUFD(XMM0, R(XMM0), 0xF5);
  POR(XMM0, MConst(double_lsb));

  // Round the elements that have both a non-zero error and an even tie
  PANDN(XMM2, R(XMM3));
  PAND(XMM0, R(XMM2));
  PADDQ(XMM1, R(XMM0));
  RET();
}
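A per-lane reference for the intended behaviour of the packed routine (illustrative only; the real code keeps both lanes in XMM registers, derives err as the difference of the two error terms it receives, and builds the tie mask with the compare sequence above):

#include <bit>
#include <cmath>
#include <cstdint>

void PsMaddEftRef(double result[2], const double err[2])
{
  for (int lane = 0; lane < 2; ++lane)
  {
    const uint64_t bits = std::bit_cast<uint64_t>(result[lane]);
    const uint64_t tie_mask = (bits & 0x1FFFFFFF) == 0x10000000 ? ~uint64_t{0} : 0;
    const uint64_t err_mask = err[lane] != 0.0 ? ~uint64_t{0} : 0;
    // +1 when the error and the result share a sign (the exact value is larger in magnitude),
    // otherwise -1; the masks zero the step out whenever no correction is needed.
    const uint64_t step =
        std::signbit(err[lane]) == std::signbit(result[lane]) ? 1 : ~uint64_t{0};
    result[lane] = std::bit_cast<double>(bits + (step & tie_mask & err_mask));
  }
}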
|
||||
|
||||
// Safe + Fast Quantizers, originally from JITIL by magumagu
|
||||
alignas(16) static const float m_65535[4] = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
|
||||
alignas(16) static const float m_32767 = 32767.0f;
|
||||
|
||||
@ -33,6 +33,8 @@ public:
|
||||
|
||||
protected:
|
||||
void GenConvertDoubleToSingle();
|
||||
void GenerateFmaddsEft();
|
||||
void GeneratePsMaddEft();
|
||||
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
|
||||
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
|
||||
void GenQuantizedLoads();
|
||||
|
||||
@ -324,6 +324,8 @@ protected:
|
||||
void GenerateConvertDoubleToSingle();
|
||||
void GenerateConvertSingleToDouble();
|
||||
void GenerateFPRF(bool single);
|
||||
void GenerateFmaddsEft();
|
||||
void GeneratePsMaddEft();
|
||||
void GenerateQuantizedLoads();
|
||||
void GenerateQuantizedStores();
|
||||
|
||||
|
||||
@ -79,9 +79,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||
const bool use_b = op5 != 25; // fmul uses no B
|
||||
const bool fma = use_b && use_c;
|
||||
const bool negate_result = (op5 & ~0x1) == 30;
|
||||
const bool negate_b = op5 == 28 || op5 == 30;
|
||||
|
||||
const bool output_is_single = inst.OPCD == 59;
|
||||
const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
|
||||
const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
|
||||
const bool error_free_transformation_requested = fma && m_accurate_fmadds;
|
||||
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[c];
|
||||
|
||||
const auto inputs_are_singles_func = [&] {
|
||||
@ -89,13 +91,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||
(!use_c || fpr.IsSingle(c, true));
|
||||
};
|
||||
|
||||
const bool single = inputs_are_singles_func() && output_is_single && !inaccurate_fma;
|
||||
const bool single = inputs_are_singles_func() && output_is_single &&
|
||||
(error_free_transformation_requested || !nonfused_requested);
|
||||
const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
|
||||
const RegType type_out = output_is_single ?
|
||||
(single ? RegType::DuplicatedSingle : RegType::Duplicated) :
|
||||
RegType::LowerPair;
|
||||
const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
|
||||
|
||||
const bool nonfused = nonfused_requested && !single;
|
||||
const bool error_free_transformation =
|
||||
error_free_transformation_requested && !single && output_is_single;
|
||||
|
||||
if (error_free_transformation)
|
||||
{
|
||||
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
|
||||
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2);
|
||||
}
|
||||
|
||||
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
|
||||
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
|
||||
const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
|
||||
@ -103,33 +116,47 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||
|
||||
{
|
||||
Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
||||
Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG;
|
||||
|
||||
ARM64Reg rounded_c_reg = VC;
|
||||
if (round_c)
|
||||
{
|
||||
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
|
||||
|
||||
V0Q = fpr.GetScopedReg();
|
||||
rounded_c_reg = reg_encoder(V0Q);
|
||||
Force25BitPrecision(rounded_c_reg, VC);
|
||||
}
|
||||
|
||||
ARM64Reg inaccurate_fma_reg = VD;
|
||||
if (fma && inaccurate_fma && VD == VB)
|
||||
{
|
||||
if (V0Q == ARM64Reg::INVALID_REG)
|
||||
V0Q = fpr.GetScopedReg();
|
||||
inaccurate_fma_reg = reg_encoder(V0Q);
|
||||
}
|
||||
|
||||
ARM64Reg result_reg = VD;
|
||||
const bool preserve_d =
|
||||
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
|
||||
if (preserve_d)
|
||||
ARM64Reg nonfused_reg = VD;
|
||||
if (error_free_transformation)
|
||||
{
|
||||
V1Q = fpr.GetScopedReg();
|
||||
result_reg = reg_encoder(V1Q);
|
||||
result_reg = reg_encoder(ARM64Reg::Q0);
|
||||
nonfused_reg = reg_encoder(ARM64Reg::Q0);
|
||||
|
||||
if (nonfused && V0Q == ARM64Reg::INVALID_REG)
|
||||
V0Q = fpr.GetScopedReg();
|
||||
}
|
||||
else
|
||||
{
|
||||
const bool preserve_d =
|
||||
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
|
||||
if (preserve_d)
|
||||
{
|
||||
if (V0Q == ARM64Reg::INVALID_REG)
|
||||
V0Q = fpr.GetScopedReg();
|
||||
result_reg = reg_encoder(V0Q);
|
||||
nonfused_reg = reg_encoder(V0Q);
|
||||
}
|
||||
else if (fma && nonfused && VD == VB)
|
||||
{
|
||||
if (V0Q == ARM64Reg::INVALID_REG)
|
||||
V0Q = fpr.GetScopedReg();
|
||||
nonfused_reg = reg_encoder(V0Q);
|
||||
}
|
||||
}
|
||||
|
||||
if (round_c)
|
||||
{
|
||||
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
|
||||
Force25BitPrecision(rounded_c_reg, VC);
|
||||
}
|
||||
|
||||
switch (op5)
|
||||
@ -152,10 +179,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||
// So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
|
||||
case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
|
||||
case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
|
||||
if (inaccurate_fma)
|
||||
if (nonfused)
|
||||
{
|
||||
m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
|
||||
m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB);
|
||||
m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
|
||||
m_float_emit.FSUB(result_reg, nonfused_reg, VB);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -164,10 +191,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||
break;
|
||||
case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
|
||||
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
|
||||
if (inaccurate_fma)
|
||||
if (nonfused)
|
||||
{
|
||||
m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
|
||||
m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB);
|
||||
m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
|
||||
m_float_emit.FADD(result_reg, nonfused_reg, VB);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -180,6 +207,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||
}
|
||||
|
||||
Common::SmallVector<FixupBranch, 4> nan_fixups;
|
||||
std::optional<FixupBranch> nan_early_fixup;
|
||||
if (m_accurate_nans)
|
||||
{
|
||||
// Check if we need to handle NaNs
|
||||
@ -216,7 +244,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||
SetJumpTarget(skip);
|
||||
}
|
||||
|
||||
std::optional<FixupBranch> nan_early_fixup;
|
||||
if (negate_result)
|
||||
{
|
||||
// If we have a NaN, we must not execute FNEG.
|
||||
@ -230,11 +257,46 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||
}
|
||||
|
||||
SwitchToNearCode();
|
||||
|
||||
if (nan_early_fixup)
|
||||
SetJumpTarget(*nan_early_fixup);
|
||||
}
|
||||
|
||||
// Read the comment in the interpreter function NI_madd_msub to find out what's going on here
|
||||
if (error_free_transformation)
|
||||
{
|
||||
// We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? -VB : VB)
|
||||
|
||||
// a' := s - b
|
||||
if (negate_b)
|
||||
m_float_emit.FADD(ARM64Reg::D1, result_reg, VB);
|
||||
else
|
||||
m_float_emit.FSUB(ARM64Reg::D1, result_reg, VB);
|
||||
|
||||
// b' := s - a'
|
||||
m_float_emit.FSUB(ARM64Reg::D2, result_reg, ARM64Reg::D1);
|
||||
|
||||
// da := a - a'
|
||||
if (nonfused)
|
||||
{
|
||||
m_float_emit.FMUL(EncodeRegToDouble(V0Q), VA, rounded_c_reg);
|
||||
m_float_emit.FSUB(ARM64Reg::D1, EncodeRegToDouble(V0Q), ARM64Reg::D1);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_float_emit.FNMSUB(ARM64Reg::D1, VA, rounded_c_reg, ARM64Reg::D1);
|
||||
}
|
||||
|
||||
// db := b - b'
|
||||
// (Transformed into -db := b' - b)
|
||||
if (negate_b)
|
||||
m_float_emit.FADD(ARM64Reg::D2, ARM64Reg::D2, VB);
|
||||
else
|
||||
m_float_emit.FSUB(ARM64Reg::D2, ARM64Reg::D2, VB);
|
||||
|
||||
BL(GetAsmRoutines()->fmadds_eft);
|
||||
}
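On the AArch64 side, the exact product residual da comes from a single fused instruction. For reference, the standard scalar semantics being relied on (not specific to this change) are:

\[
\mathrm{FNMSUB}\; D_d, D_n, D_m, D_a:\quad D_d \leftarrow \operatorname{fl}(D_n \cdot D_m - D_a),
\]

so FNMSUB(D1, VA, rounded_c_reg, D1) yields VA * c - a' with a single rounding; in the single-precision-input case the interpreter comments target, that subtraction is exact, which is why one FMA is enough to recover the error term da.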
|
||||
|
if (nan_early_fixup)
SetJumpTarget(*nan_early_fixup);

// PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
// for any of AArch64's FMA instructions, so we negate using a separate instruction.
if (negate_result)
@ -254,7 +316,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
fpr.FixSinglePrecision(d);
}

if (error_free_transformation)
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);

SetFPRFIfNeeded(output_is_single, VD);

if (error_free_transformation)
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2);
}

void JitArm64::fp_logic(UGeckoInstruction inst)

@ -92,20 +92,31 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
const bool duplicated_c = muls || madds;
const bool fma = use_b && use_c;
const bool negate_result = (op5 & ~0x1) == 30;
const bool msub = op5 == 28 || op5 == 30;
const bool negate_b = op5 == 28 || op5 == 30;

const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool error_free_transformation_requested = fma && m_accurate_fmadds;
const bool round_c = use_c && !js.op->fprIsSingle[c];

const auto inputs_are_singles_func = [&] {
return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
};

const bool single = inputs_are_singles_func() && !inaccurate_fma;
const bool single =
inputs_are_singles_func() && (error_free_transformation_requested || !nonfused_requested);
const RegType type = single ? RegType::Single : RegType::Register;
const u8 size = single ? 32 : 64;
const auto reg_encoder = single ? EncodeRegToDouble : EncodeRegToQuad;

const bool nonfused = nonfused_requested && !single;
const bool error_free_transformation = error_free_transformation_requested && !single;

if (error_free_transformation)
{
gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
}

const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
@ -119,41 +130,77 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
ARM64Reg rounded_c_reg = VC;
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");

V0Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V0Q);
Force25BitPrecision(rounded_c_reg, VC);
}

ARM64Reg inaccurate_fma_reg = VD;
if (fma && inaccurate_fma && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
if (error_free_transformation)
{
// This register happens to be free, so we can skip allocating one
rounded_c_reg = ARM64Reg::Q3;
}
else
{
V0Q = fpr.GetScopedReg();
inaccurate_fma_reg = reg_encoder(V0Q);
rounded_c_reg = reg_encoder(V0Q);
}
}

ARM64Reg result_reg = VD;
const bool need_accurate_fma_reg =
fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (need_accurate_fma_reg || preserve_d)
ARM64Reg nonfused_reg = VD;
if (error_free_transformation)
{
V1Q = fpr.GetScopedReg();
result_reg = reg_encoder(V1Q);
result_reg = reg_encoder(ARM64Reg::Q0);
nonfused_reg = reg_encoder(ARM64Reg::Q0);
}
else
{
const bool need_fused_fma_reg =
fma && !nonfused && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (need_fused_fma_reg || preserve_d)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
result_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);

if (need_fused_fma_reg && round_c)
{
V1Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V1Q);
}
}
else if (fma && nonfused && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
nonfused_reg = reg_encoder(V0Q);
}
}

if (m_accurate_nans)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
if (error_free_transformation)
{
// These registers happen to be free, so we can skip allocating new ones
V1Q = ARM64Reg::Q1;
V2Q = ARM64Reg::Q2;
}
else
{
if (V1Q == ARM64Reg::INVALID_REG)
V1Q = fpr.GetScopedReg();

if (duplicated_c || VD == result_reg)
V2Q = fpr.GetScopedReg();
if (duplicated_c || VD == result_reg)
V2Q = fpr.GetScopedReg();
}
}

if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
Force25BitPrecision(rounded_c_reg, VC);
}

std::optional<ARM64Reg> negated_b_reg;
switch (op5)
{
case 12: // ps_muls0: d = a * c.ps0
@ -163,10 +210,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
break;
case 14: // ps_madds0: d = a * c.ps0 + b
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 0);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -176,10 +223,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
}
break;
case 15: // ps_madds1: d = a * c.ps1 + b
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 1);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -202,23 +249,28 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
case 28: // ps_msub: d = a * c - b
case 30: // ps_nmsub: d = -(a * c - b)
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
m_float_emit.FSUB(size, result_reg, nonfused_reg, VB);
}
else
{
m_float_emit.FNEG(size, result_reg, VB);
if (error_free_transformation)
{
m_float_emit.MOV(ARM64Reg::Q4, result_reg);
negated_b_reg = ARM64Reg::Q4;
}
m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
}
break;
case 29: // ps_madd: d = a * c + b
case 31: // ps_nmadd: d = -(a * c + b)
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -232,11 +284,80 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
}

// Read the comment in the interpreter function NI_madd_msub to find out what's going on here
if (error_free_transformation)
{
// We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? -VB : VB)

// a' := s - b
// (Transformed into -a' := b - s)
if (negate_b)
{
if (!negated_b_reg)
{
m_float_emit.FNEG(size, ARM64Reg::Q4, VB);
negated_b_reg = ARM64Reg::Q4;
}
m_float_emit.FSUB(size, ARM64Reg::Q1, *negated_b_reg, result_reg);
}
else
{
m_float_emit.FSUB(size, ARM64Reg::Q1, VB, result_reg);
}

// b' := s - a'
// (Transformed into b' := s + -a')
m_float_emit.FADD(size, ARM64Reg::Q2, result_reg, ARM64Reg::Q1);

// da := a - a'
// (Transformed into da := a + -a')
if (nonfused)
{
switch (op5)
{
case 14: // ps_madds0: d = a * c.ps0 + b
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 0);
break;
case 15: // ps_madds1: d = a * c.ps1 + b
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 1);
break;
default:
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg);
break;
}
m_float_emit.FADD(size, ARM64Reg::Q1, ARM64Reg::Q3, ARM64Reg::Q1);
}
else
{
switch (op5)
{
case 14: // ps_madds0: d = a * c.ps0 + b
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 0);
break;
case 15: // ps_madds1: d = a * c.ps1 + b
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 1);
break;
default:
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg);
break;
}
}

// db := b - b'
// (Transformed into -db := b' - b)
if (negate_b)
m_float_emit.FADD(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
else
m_float_emit.FSUB(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);

BL(GetAsmRoutines()->ps_madd_eft);
}

FixupBranch nan_fixup;
if (m_accurate_nans)
{
const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V1Q) : EncodeRegToDouble(V1Q);
const ARM64Reg nan_temp_reg_paired = reg_encoder(V1Q);

// Check if we need to handle NaNs

@ -306,7 +427,13 @@ void JitArm64::ps_arith(UGeckoInstruction inst)

fpr.FixSinglePrecision(d);

if (error_free_transformation)
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);

SetFPRFIfNeeded(true, VD);

if (error_free_transformation)
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
}

void JitArm64::ps_sel(UGeckoInstruction inst)

@ -8,6 +8,7 @@
#include <utility>

#include "Common/Arm64Emitter.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/Config/Config.h"
#include "Common/FloatUtils.h"
@ -265,6 +266,14 @@ void JitArm64::GenerateCommonAsm()
GenerateFPRF(false);
Common::JitRegister::Register(GetAsmRoutines()->fprf_single, GetCodePtr(), "JIT_FPRF");

GetAsmRoutines()->fmadds_eft = GetCodePtr();
GenerateFmaddsEft();
Common::JitRegister::Register(GetAsmRoutines()->fmadds_eft, GetCodePtr(), "JIT_fmadds_eft");

GetAsmRoutines()->ps_madd_eft = GetCodePtr();
GeneratePsMaddEft();
Common::JitRegister::Register(GetAsmRoutines()->ps_madd_eft, GetCodePtr(), "JIT_ps_madd_eft");

GenerateQuantizedLoads();
GenerateQuantizedStores();
}
@ -514,6 +523,90 @@ void JitArm64::GenerateFPRF(bool single)
B(write_fprf_and_ret);
}

// Inputs:
// D0: Result with potentially incorrect rounding
// D1: First error term
// D2: Second error term, negated
//
// Outputs result with corrected rounding in D0. Clobbers X0-X1, D1, and flags.
void JitArm64::GenerateFmaddsEft()
{
// Check if D0 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
MOVI2R(ARM64Reg::W1, 0x80000000);
CMP(ARM64Reg::W1, ARM64Reg::W0, ArithOption(ARM64Reg::W0, ShiftType::LSL, 3));
FixupBranch even_tie = B(CCFlags::CC_EQ);

const u8* ret = GetCodePtr();
RET();

// Check if the error is 0
SetJumpTarget(even_tie);
m_float_emit.FSUB(ARM64Reg::D1, ARM64Reg::D1, ARM64Reg::D2);
m_float_emit.FCMP(ARM64Reg::D1);
B(CCFlags::CC_EQ, ret);

// Round D0 up or down
MOVZ(ARM64Reg::X1, 1);
CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
CMP(ARM64Reg::X0, 0);
CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
ADD(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1);
m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0);
RET();
}

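For readability, here is a rough scalar C++ rendering of the fix-up this routine performs. It is a sketch under the assumptions stated in the comments above, not Dolphin code, and the helper name is made up:

#include <cstdint>
#include <cstring>

// s: the double-precision FMA result; err: the exact error recovered by the
// error-free transformation (first error term minus the negated second term).
static double CorrectSingleRounding(double s, double err)
{
  uint64_t bits;
  std::memcpy(&bits, &s, sizeof(bits));

  // A double exactly halfway between two adjacent singles has bit 28 set and
  // bits 0-27 clear (a double carries 29 more mantissa bits than a single).
  if ((bits & 0x1FFFFFFF) != 0x10000000 || err == 0.0)
    return s;  // no harmful double rounding can happen here

  // Nudge the bit pattern one step toward the true result. The integer
  // encoding of negative doubles runs backwards, so the step is flipped.
  int64_t step = (err < 0.0) ? -1 : 1;
  if ((bits >> 63) != 0)
    step = -step;
  bits += static_cast<uint64_t>(step);

  std::memcpy(&s, &bits, sizeof(s));
  return s;
}

The idea is that once the tie has been nudged off-centre, the later round to single precision lands on the value that a single-rounded fmadds would have produced.
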
// Inputs:
// Q0: Results with potentially incorrect rounding
// Q1: First error terms
// Q2: Second error terms, negated
//
// Outputs results with corrected rounding in Q0. Clobbers X0, Q1-Q4, and flags.
void JitArm64::GeneratePsMaddEft()
{
// Check if Q0 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
MOVI2R(ARM64Reg::X0, 0x8000'0000'0000'0000);
m_float_emit.SHL(64, ARM64Reg::Q3, ARM64Reg::Q0, 35);
m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
m_float_emit.CMEQ(64, ARM64Reg::Q3, ARM64Reg::Q3, ARM64Reg::Q4);

// Just for performance, exit early if there is no even tie
m_float_emit.XTN(32, ARM64Reg::D4, ARM64Reg::Q3);
FixupBranch even_tie;
if (cpu_info.bAFP)
{
m_float_emit.FCMP(ARM64Reg::D4);
even_tie = B(CCFlags::CC_NEQ);
}
else
{
// If we don't have AFP and the emulated software has NI set, subnormals will compare equal to
// zero, so we can't use FCMP unless we were to put some shuffle instruction before it.
// FMOV is a little slower than FCMP, but it's faster than adding an extra instruction.
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D4);
even_tie = CBNZ(ARM64Reg::X0);
}
RET();
SetJumpTarget(even_tie);

// Check if the error is zero
m_float_emit.FSUB(64, ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
MOVZ(ARM64Reg::X0, 1);
m_float_emit.FCMEQ(64, ARM64Reg::Q2, ARM64Reg::Q1);

// Store -1 or 1 in Q1 depending on whether we're rounding down or up
m_float_emit.EOR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q0);
m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
m_float_emit.SSHR(64, ARM64Reg::Q1, ARM64Reg::Q1, 63);
m_float_emit.ORR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q4);

// Round the elements that have both a non-zero error and an even tie
m_float_emit.BIC(ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q2);
m_float_emit.AND(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
m_float_emit.ADD(64, ARM64Reg::Q0, ARM64Reg::Q0, ARM64Reg::Q1);
RET();
}

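The branchless step selection above (EOR of the sign bits, SSHR by 63, ORR with 1) reduces to the following per-lane logic. This is an illustrative scalar rendering, not Dolphin code, and it assumes the usual arithmetic right shift of signed integers:

#include <cstdint>

// err_bits / result_bits: raw IEEE-754 bit patterns of the error term and of
// the potentially mis-rounded result for one lane.
static int64_t RoundingStep(uint64_t err_bits, uint64_t result_bits)
{
  // Arithmetic shift of the XORed sign bits yields 0 (signs match) or -1.
  const int64_t sign_diff = static_cast<int64_t>(err_bits ^ result_bits) >> 63;
  // OR with 1 turns that into +1 (move the bit pattern up) or -1 (move it down).
  return sign_diff | 1;
}

The BIC/AND pair then zeroes this step for lanes that either are not an even tie or have a zero error, so the final ADD only adjusts the lanes that actually need the correction.
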
void JitArm64::GenerateQuantizedLoads()
{
// X0 is a temporary

@ -30,6 +30,8 @@ struct CommonAsmRoutinesBase
const u8* cstd;
const u8* fprf_single;
const u8* fprf_double;
const u8* fmadds_eft;
const u8* ps_madd_eft;

// In: array index: GQR to use.
// In: ECX: Address to read from.

@ -57,7 +57,7 @@
// After resetting the stack to the top, we call _resetstkoflw() to restore
// the guard page at the 256kb mark.

const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitBase::JIT_SETTINGS{{
const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JitBase::JIT_SETTINGS{{
{&JitBase::bJITOff, &Config::MAIN_DEBUG_JIT_OFF},
{&JitBase::bJITLoadStoreOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_OFF},
{&JitBase::bJITLoadStorelXzOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_LXZ_OFF},
@ -79,6 +79,7 @@ const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitB
{&JitBase::m_low_dcbz_hack, &Config::MAIN_LOW_DCBZ_HACK},
{&JitBase::m_fprf, &Config::MAIN_FPRF},
{&JitBase::m_accurate_nans, &Config::MAIN_ACCURATE_NANS},
{&JitBase::m_accurate_fmadds, &Config::MAIN_ACCURATE_FMADDS},
{&JitBase::m_fastmem_enabled, &Config::MAIN_FASTMEM},
{&JitBase::m_accurate_cpu_cache_enabled, &Config::MAIN_ACCURATE_CPU_CACHE},
}};

@ -158,6 +158,7 @@ protected:
bool m_low_dcbz_hack = false;
bool m_fprf = false;
bool m_accurate_nans = false;
bool m_accurate_fmadds = false;
bool m_fastmem_enabled = false;
bool m_accurate_cpu_cache_enabled = false;

@ -165,7 +166,7 @@ protected:
bool m_cleanup_after_stackfault = false;
u8* m_stack_guard = nullptr;

static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JIT_SETTINGS;
static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JIT_SETTINGS;

bool DoesConfigNeedRefresh() const;
void RefreshConfig();