Merge pull request #13900 from JosJuice/jit-fma-double-rounding

Jit: Implement error-free transformation for single-precision FMA
This commit is contained in:
JosJuice 2026-01-23 21:43:18 +01:00 committed by GitHub
commit 3221e982d3
GPG Key ID: B5690EEEBB952194
26 changed files with 958 additions and 191 deletions
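For context, the problem being solved: emulating PPC's single-precision fmadds with a double-precision std::fma followed by a conversion to single rounds twice, and the two roundings can disagree exactly when the double result lands on a halfway point between two singles. This commit detects that halfway pattern and uses an error-free transformation of the FMA to decide which way the tie should really go. A condensed sketch of the idea in plain C++ (illustrative only; the function name is made up, and the real code additionally rounds frC via Force25BitPrecision and handles NaNs, FPSCR and the paired-single forms):

#include <bit>
#include <cmath>
#include <cstdint>

// PPC fmadds computes frA * frC + frB with a single rounding to single precision.
float EmulatedFmadds(double a, double c, double b)
{
  const double d = std::fma(a, c, b);  // first rounding (to double)
  const std::uint64_t bits = std::bit_cast<std::uint64_t>(d);

  // A double keeps 29 more mantissa bits than a single. Double rounding can only
  // go wrong when those 29 bits are exactly 100...0, i.e. an even tie.
  if (std::isfinite(d) && (bits & 0x1fffffff) == 0x10000000)
  {
    // Error-free transformation: recover the part of a*c + b that the double
    // rounding discarded (the decomposition used by the interpreter below).
    const double a_part = d - b;                // a' := s - b
    const double b_part = d - a_part;           // b' := s - a'
    const double da = std::fma(a, c, -a_part);  // da := a*c - a'
    const double db = b - b_part;               // db := b - b'
    const double error = da + db;

    if (error != 0.0 && !std::isnan(error))
    {
      // Nudge d off the tie towards the true result, so the final conversion
      // to single rounds the way a single fused operation would have.
      const bool same_sign = std::signbit(error) == std::signbit(d);
      return static_cast<float>(std::bit_cast<double>(same_sign ? bits + 1 : bits - 1));
    }
  }
  return static_cast<float>(d);  // second rounding (to single)
}

The JITs below emit the same decomposition inline and branch to shared routines (fmadds_eft for scalars, ps_madd_eft for paired singles) that perform the tie check and the one-ulp adjustment.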

View File

@ -3156,6 +3156,10 @@ void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn);
}
void ARM64FloatEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, 0, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn);
@ -3505,6 +3509,53 @@ void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn);
}
// Comparison
void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn);
}
void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn);
}
void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn);
}
void ARM64FloatEmitter::CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn);
}
void ARM64FloatEmitter::CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0xA, Rd, Rn);
}
void ARM64FloatEmitter::CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm);
}
// Float comparison
void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm)
{
EmitCompare(0, 0, 0, 0, Rn, Rm);
@ -3664,7 +3715,7 @@ void ARM64FloatEmitter::SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
shift, src_size);
EmitShiftImm(1, 0, src_size | shift, 0b01010, Rd, Rn);
EmitShiftImm(IsQuad(Rd), 0, src_size | shift, 0b01010, Rd, Rn);
}
void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
@ -3674,11 +3725,18 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift,
EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn);
}
void ARM64FloatEmitter::SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
shift, src_size);
EmitShiftImm(IsQuad(Rd), 0, src_size * 2 - shift, 0b00000, Rd, Rn);
}
void ARM64FloatEmitter::URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
shift, src_size);
EmitShiftImm(1, 1, src_size * 2 - shift, 0b00100, Rd, Rn);
EmitShiftImm(IsQuad(Rd), 1, src_size * 2 - shift, 0b00100, Rd, Rn);
}
void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)

View File

@ -800,6 +800,7 @@ public:
ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR;
CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
}
void CNEG(ARM64Reg Rd, ARM64Reg Rn, CCFlags cond) { CSNEG(Rd, Rn, Rn, (CCFlags)((u32)cond ^ 1)); }
void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
{
@ -1281,6 +1282,7 @@ public:
void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@ -1342,6 +1344,19 @@ public:
void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
// Comparison
void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
// Float comparison
void FCMP(ARM64Reg Rn, ARM64Reg Rm);
void FCMP(ARM64Reg Rn);
@ -1380,6 +1395,7 @@ public:
void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);

View File

@ -2519,19 +2519,19 @@ void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg)
WriteSSEOp(0x66, 0x6C, dest, arg);
}
void XEmitter::PSRLW(X64Reg reg, int shift)
void XEmitter::PSRLW(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
Write8(shift);
}
void XEmitter::PSRLD(X64Reg reg, int shift)
void XEmitter::PSRLD(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
Write8(shift);
}
void XEmitter::PSRLQ(X64Reg reg, int shift)
void XEmitter::PSRLQ(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
Write8(shift);
@ -2542,38 +2542,38 @@ void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg)
WriteSSEOp(0x66, 0xd3, reg, arg);
}
void XEmitter::PSRLDQ(X64Reg reg, int shift)
void XEmitter::PSRLDQ(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
Write8(shift);
}
void XEmitter::PSLLW(X64Reg reg, int shift)
void XEmitter::PSLLW(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
Write8(shift);
}
void XEmitter::PSLLD(X64Reg reg, int shift)
void XEmitter::PSLLD(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
Write8(shift);
}
void XEmitter::PSLLQ(X64Reg reg, int shift)
void XEmitter::PSLLQ(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
Write8(shift);
}
void XEmitter::PSLLDQ(X64Reg reg, int shift)
void XEmitter::PSLLDQ(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
Write8(shift);
}
// WARNING not REX compatible
void XEmitter::PSRAW(X64Reg reg, int shift)
void XEmitter::PSRAW(X64Reg reg, u8 shift)
{
if (reg > 7)
PanicAlertFmt("The PSRAW-emitter does not support regs above 7");
@ -2585,7 +2585,7 @@ void XEmitter::PSRAW(X64Reg reg, int shift)
}
// WARNING not REX compatible
void XEmitter::PSRAD(X64Reg reg, int shift)
void XEmitter::PSRAD(X64Reg reg, u8 shift)
{
if (reg > 7)
PanicAlertFmt("The PSRAD-emitter does not support regs above 7");
@ -2695,6 +2695,11 @@ void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend)
Write8(blend);
}
void XEmitter::PCMPEQQ(X64Reg dest, const OpArg& arg)
{
WriteSSE41Op(0x66, 0x3829, dest, arg);
}
void XEmitter::PAND(X64Reg dest, const OpArg& arg)
{
WriteSSEOp(0x66, 0xDB, dest, arg);
@ -3038,6 +3043,12 @@ void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg);
}
void XEmitter::VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift)
{
WriteAVXOp(0x66, 0x73, (X64Reg)6, regOp1, R(regOp2));
Write8(shift);
}
void XEmitter::VMOVAPS(const OpArg& arg, X64Reg regOp)
{
WriteAVXOp(0x00, 0x29, regOp, X64Reg::INVALID_REG, arg);

View File

@ -801,19 +801,19 @@ public:
void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle);
void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle);
void PSRLW(X64Reg reg, int shift);
void PSRLD(X64Reg reg, int shift);
void PSRLQ(X64Reg reg, int shift);
void PSRLW(X64Reg reg, u8 shift);
void PSRLD(X64Reg reg, u8 shift);
void PSRLQ(X64Reg reg, u8 shift);
void PSRLQ(X64Reg reg, const OpArg& arg);
void PSRLDQ(X64Reg reg, int shift);
void PSRLDQ(X64Reg reg, u8 shift);
void PSLLW(X64Reg reg, int shift);
void PSLLD(X64Reg reg, int shift);
void PSLLQ(X64Reg reg, int shift);
void PSLLDQ(X64Reg reg, int shift);
void PSLLW(X64Reg reg, u8 shift);
void PSLLD(X64Reg reg, u8 shift);
void PSLLQ(X64Reg reg, u8 shift);
void PSLLDQ(X64Reg reg, u8 shift);
void PSRAW(X64Reg reg, int shift);
void PSRAD(X64Reg reg, int shift);
void PSRAW(X64Reg reg, u8 shift);
void PSRAD(X64Reg reg, u8 shift);
// SSE4: data type conversions
void PMOVSXBW(X64Reg dest, const OpArg& arg);
@ -836,6 +836,9 @@ public:
void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend);
void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend);
// SSE4: compare instructions
void PCMPEQQ(X64Reg dest, const OpArg& arg);
// AVX
void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
@ -878,6 +881,8 @@ public:
void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift);
void VMOVAPS(const OpArg& arg, X64Reg regOp);
void VZEROUPPER();

View File

@ -222,6 +222,7 @@ const Info<bool> MAIN_DIVIDE_BY_ZERO_EXCEPTIONS{{System::Main, "Core", "DivByZer
false};
const Info<bool> MAIN_FPRF{{System::Main, "Core", "FPRF"}, false};
const Info<bool> MAIN_ACCURATE_NANS{{System::Main, "Core", "AccurateNaNs"}, false};
const Info<bool> MAIN_ACCURATE_FMADDS{{System::Main, "Core", "AccurateFmadds"}, true};
const Info<bool> MAIN_DISABLE_ICACHE{{System::Main, "Core", "DisableICache"}, false};
const Info<float> MAIN_EMULATION_SPEED{{System::Main, "Core", "EmulationSpeed"}, 1.0f};
#if defined(ANDROID)
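Like the neighbouring options, the new setting should surface as a key in the [Core] section of Dolphin.ini (assumed mapping, derived from the System::Main / "Core" location above; True is the default per this diff):

[Core]
AccurateFmadds = True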

View File

@ -128,6 +128,7 @@ extern const Info<bool> MAIN_FLOAT_EXCEPTIONS;
extern const Info<bool> MAIN_DIVIDE_BY_ZERO_EXCEPTIONS;
extern const Info<bool> MAIN_FPRF;
extern const Info<bool> MAIN_ACCURATE_NANS;
extern const Info<bool> MAIN_ACCURATE_FMADDS;
extern const Info<bool> MAIN_DISABLE_ICACHE;
extern const Info<float> MAIN_EMULATION_SPEED;
extern const Info<bool> MAIN_PRECISION_FRAME_TIMING;

View File

@ -80,6 +80,7 @@ public:
layer->Set(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS, m_settings.divide_by_zero_exceptions);
layer->Set(Config::MAIN_FPRF, m_settings.fprf);
layer->Set(Config::MAIN_ACCURATE_NANS, m_settings.accurate_nans);
layer->Set(Config::MAIN_ACCURATE_FMADDS, m_settings.accurate_fmadds);
layer->Set(Config::MAIN_DISABLE_ICACHE, m_settings.disable_icache);
layer->Set(Config::MAIN_SYNC_ON_SKIP_IDLE, m_settings.sync_on_skip_idle);
layer->Set(Config::MAIN_SYNC_GPU, m_settings.sync_gpu);

View File

@ -68,6 +68,7 @@ struct NetSettings
bool divide_by_zero_exceptions = false;
bool fprf = false;
bool accurate_nans = false;
bool accurate_fmadds = false;
bool disable_icache = false;
bool sync_on_skip_idle = false;
bool sync_gpu = false;

View File

@ -1425,6 +1425,7 @@ bool NetPlayServer::SetupNetSettings()
settings.divide_by_zero_exceptions = Config::Get(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS);
settings.fprf = Config::Get(Config::MAIN_FPRF);
settings.accurate_nans = Config::Get(Config::MAIN_ACCURATE_NANS);
settings.accurate_fmadds = Config::Get(Config::MAIN_ACCURATE_FMADDS);
settings.disable_icache = Config::Get(Config::MAIN_DISABLE_ICACHE);
settings.sync_on_skip_idle = Config::Get(Config::MAIN_SYNC_ON_SKIP_IDLE);
settings.sync_gpu = Config::Get(Config::MAIN_SYNC_GPU);

View File

@ -342,12 +342,12 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
// - This will cause `d` to round to 100...00, meaning it will tie then round upwards.
// 3. Tying up to even because `c` is too small
// a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties)
// b. The lowest bit of `f` is 1 (this means it ties to even downwards)
// b. The lowest bit of `f` is 1 (this means it ties to even upwards)
// c. `c` is negative and does not round `d` downwards
// - This is similar to the first one but in reverse, rounding up instead of down.
// 4. Tying down because `d` rounded down
// a. The highest and lowest bits of `d` are 1, the rest of the bits of `d` are 0
// b. The lowest bit of `f` is 0 (this means it ties to even upwards)
// b. The lowest bit of `f` is 0 (this means it ties to even downwards)
// c. `c` is negative, and the highest bit of c is 1,
// and at least one other bit of c is nonzero
// - The backwards counterpart to case 2, this will cause `d` to round back down to 100..00,
@ -375,12 +375,6 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
// - Correct ordering of NaN checking (for both double and single precision)
// - Rounding frC up
// - Rounding only once for single precision inputs (this will be the large majority of cases!)
// - Currently this is interpreter-only.
// This can be implemented in the JIT just as easily, though.
// Eventually the JITs should hopefully support detecting back to back
// single-precision operations, which will lead to no overhead at all.
// In the cases where JITs can't do this, an alternative method is used, as
// is done in the interpreter as well.
// - Rounding only once for double precision inputs
// - This is a side effect of how we handle single-precision inputs: By doing
// error calculations rather than checking if every input is a float, we ensure that we know
@ -421,7 +415,7 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
const double b_sign = sub ? -b : b;
result.value = std::fma(a, c_round, b_sign);
// We then check if we're currently tying in rounding directioh
// We then check if we're currently tying in rounding direction
const u64 result_bits = std::bit_cast<u64>(result.value);
// The mask of the `d` bits as shown in the above comments
@ -432,9 +426,8 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
// Because we check this entire mask which includes a 1 bit, we can be sure that
// if this result passes, the input is not an infinity that would become a NaN.
// This means that, for the JITs, if they only wanted to check for a subset of these
// bits (e.g. only checking if the last one was 0), then using the zero flag for a branch,
// they would have to check if the result was NaN before here.
// If we had only checked for a subset of these bits (e.g. only checking if the last
// one was 0), we would have needed to also check if the exponent was all ones.
if ((result_bits & D_MASK) == EVEN_TIE)
{
// Because we have a tie, we now compute any error in the FMA calculation
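A concrete instance of the tie problem described above, runnable on any IEEE 754 host (the inputs are chosen here purely for illustration):

#include <cmath>
#include <cstdio>

int main()
{
  const double a = 1.0;
  const double c = 1.0;
  const double b = 0x1.000000001p-24;  // 2^-24 + 2^-60

  // Exact a*c + b is 1 + 2^-24 + 2^-60, just above the midpoint between the
  // singles 1.0 and 1 + 2^-23, so a correctly rounded fmadds must round up.
  const double d = std::fma(a, c, b);  // first rounding drops the 2^-60 term
  std::printf("double-then-single: %.9g\n", static_cast<double>(static_cast<float>(d)));  // 1
  std::printf("single rounding:    %.9g\n", static_cast<double>(1.0f + 0x1p-23f));        // 1.00000012
}

Here the double result is exactly 1 + 2^-24, an even tie, and converting it to single ties down to 1.0; the error terms computed above are what lets the tie be broken upwards instead.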

View File

@ -1284,9 +1284,9 @@ BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const
return cb.m_gqr_used & ~cb.m_gqr_modified;
}
BitSet32 Jit64::CallerSavedRegistersInUse() const
BitSet32 Jit64::CallerSavedRegistersInUse(BitSet32 additional_registers) const
{
BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16);
BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16) | additional_registers;
return in_use & ABI_ALL_CALLER_SAVED;
}

View File

@ -77,7 +77,7 @@ public:
// Returns false if no free memory region can be found for either of the two.
bool SetEmitterStateToFreeCodeRegion();
BitSet32 CallerSavedRegistersInUse() const;
BitSet32 CallerSavedRegistersInUse(BitSet32 additional_registers = {}) const;
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;
void IntializeSpeculativeConstants();
@ -153,9 +153,10 @@ public:
void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true,
bool duplicate = false);
void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, Gen::X64Reg clobber,
std::optional<Gen::OpArg> Ra, std::optional<Gen::OpArg> Rb,
std::optional<Gen::OpArg> Rc);
[[nodiscard]] Gen::FixupBranch HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm,
Gen::X64Reg clobber, std::optional<Gen::OpArg> Ra,
std::optional<Gen::OpArg> Rb,
std::optional<Gen::OpArg> Rc);
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);

View File

@ -265,6 +265,10 @@ void Jit64AsmRoutineManager::GenerateCommon()
GenMfcr();
cdts = AlignCode4();
GenConvertDoubleToSingle();
fmadds_eft = AlignCode4();
GenerateFmaddsEft();
ps_madd_eft = AlignCode4();
GeneratePsMaddEft();
GenQuantizedLoads();
GenQuantizedSingleLoads();

View File

@ -93,8 +93,9 @@ void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input)
SetFPRFIfNeeded(input, false);
}
void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::optional<OpArg> Ra,
std::optional<OpArg> Rb, std::optional<OpArg> Rc)
FixupBranch Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber,
std::optional<OpArg> Ra, std::optional<OpArg> Rb,
std::optional<OpArg> Rc)
{
// | PowerPC | x86
// ---------------------+----------+---------
@ -104,9 +105,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
// Dragon Ball: Revenge of King Piccolo requires generated NaNs
// to be positive, so we'll have to handle them manually.
if (!m_accurate_nans)
return;
if (inst.OPCD != 4)
{
// not paired-single
@ -140,7 +138,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
FixupBranch done = J(Jump::Near);
SwitchToNearCode();
SetJumpTarget(done);
return done;
}
else
{
@ -217,7 +215,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
FixupBranch done = J(Jump::Near);
SwitchToNearCode();
SetJumpTarget(done);
return done;
}
}
@ -329,14 +327,21 @@ void Jit64::fp_arith(UGeckoInstruction inst)
}
}
switch (inst.SUBOP5)
if (m_accurate_nans)
{
case 18:
HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt);
break;
case 25:
HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2);
break;
std::optional<FixupBranch> handled_nans;
switch (inst.SUBOP5)
{
case 18:
handled_nans = HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt);
break;
case 25:
handled_nans = HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2);
break;
}
if (handled_nans)
SetJumpTarget(*handled_nans);
}
if (single)
@ -368,51 +373,87 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
const bool use_fma = Config::Get(Config::SESSION_USE_FMA);
const bool software_fma = use_fma && !cpu_info.bFMA;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
bool single = inst.OPCD == 4 || inst.OPCD == 59;
bool round_input = single && !js.op->fprIsSingle[c];
bool preserve_inputs = m_accurate_nans;
bool preserve_d = preserve_inputs && (a == d || b == d || c == d);
bool packed =
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
const int a = inst.FA;
const int b = inst.FB;
const int c = inst.FC;
const int d = inst.FD;
const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30; // msub, nmsub
const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31; // nmsub, nmadd
const bool madds0 = inst.SUBOP5 == 14;
const bool madds1 = inst.SUBOP5 == 15;
const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1);
const bool single = inst.OPCD == 4 || inst.OPCD == 59;
const bool round_input = single && !js.op->fprIsSingle[c];
const bool error_free_transformation = single && m_accurate_fmadds;
const bool packed =
inst.OPCD == 4 ||
(!cpu_info.bAtom && !software_fma && !error_free_transformation && single &&
js.op->fprIsDuplicated[a] && js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
const bool want_rc_rounded =
(error_free_transformation || (software_fma && packed)) && round_input;
const bool error_free_transformation_wants_rc_duplicated =
(error_free_transformation && !want_rc_rounded) && (madds0 || madds1);
const bool accurate_nans_wants_rc_duplicated = m_accurate_nans && (madds0 || madds1);
const bool want_rc_duplicated =
error_free_transformation_wants_rc_duplicated || accurate_nans_wants_rc_duplicated;
const bool preserve_d_due_to_a_or_b =
(m_accurate_nans || error_free_transformation) && (a == d || b == d);
const bool preserve_d_due_to_c =
c == d && ((m_accurate_nans && (!want_rc_duplicated || software_fma)) ||
(error_free_transformation && !want_rc_rounded));
const bool preserve_d = preserve_d_due_to_a_or_b || preserve_d_due_to_c;
X64Reg scratch_xmm = XMM0;
X64Reg result_xmm = XMM1;
X64Reg Rc_duplicated = XMM2;
X64Reg Rc_rounded = XMM3;
BitSet32 scratch_registers{XMM0 + 16, XMM1 + 16};
RCX64Reg xmm2_guard;
RCX64Reg xmm3_guard;
if (error_free_transformation)
{
xmm2_guard = fpr.Scratch(XMM2);
xmm3_guard = fpr.Scratch(XMM3);
RegCache::Realize(xmm2_guard, xmm3_guard);
scratch_registers[XMM2 + 16] = true;
scratch_registers[XMM3 + 16] = true;
}
else if (software_fma)
{
xmm2_guard = fpr.Scratch(XMM2);
RegCache::Realize(xmm2_guard);
scratch_registers[XMM2 + 16] = true;
}
RCOpArg Ra;
RCOpArg Rb;
RCOpArg Rc;
RCX64Reg Rd;
RCX64Reg xmm2_guard;
RCX64Reg result_xmm_guard;
RCX64Reg Rc_duplicated_guard;
if (software_fma)
{
xmm2_guard = fpr.Scratch(XMM2);
Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
Ra = packed || error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
Rb = packed || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = packed || (error_free_transformation && !want_rc_rounded && !want_rc_duplicated) ?
fpr.Bind(c, RCMode::Read) :
fpr.Use(c, RCMode::Read);
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
if (preserve_d && packed)
{
result_xmm_guard = fpr.Scratch();
RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard);
RegCache::Realize(Ra, Rb, Rc, Rd, result_xmm_guard);
result_xmm = Gen::X64Reg(result_xmm_guard);
scratch_registers[result_xmm + 16] = true;
}
else
{
RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard);
RegCache::Realize(Ra, Rb, Rc, Rd);
result_xmm = packed ? Gen::X64Reg(Rd) : XMM0;
}
}
@ -421,48 +462,88 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// For use_fma == true:
// Statistics suggests b is a lot less likely to be unbound in practice, so
// if we have to pick one of a or b to bind, let's make it b.
Ra = fpr.Use(a, RCMode::Read);
Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = fpr.Use(c, RCMode::Read);
Ra = error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
Rb =
use_fma || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = error_free_transformation && !want_rc_rounded && !want_rc_duplicated ?
fpr.Bind(c, RCMode::Read) :
fpr.Use(c, RCMode::Read);
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
RegCache::Realize(Ra, Rb, Rc, Rd);
if (madds_accurate_nans)
{
Rc_duplicated_guard = fpr.Scratch();
RegCache::Realize(Rc_duplicated_guard);
Rc_duplicated = Rc_duplicated_guard;
}
}
if (error_free_transformation_wants_rc_duplicated ||
(accurate_nans_wants_rc_duplicated &&
((!software_fma && !error_free_transformation) || (error_free_transformation && packed))))
{
Rc_duplicated_guard = fpr.Scratch();
RegCache::Realize(Rc_duplicated_guard);
Rc_duplicated = Rc_duplicated_guard;
scratch_registers[Rc_duplicated + 16] = true;
}
const auto registers_to_save = [&](BitSet32 scratch_registers_to_save) {
const BitSet32 scratch_registers_not_to_save = scratch_registers & ~scratch_registers_to_save;
return CallerSavedRegistersInUse(scratch_registers_to_save) & ~scratch_registers_not_to_save;
};
if (software_fma)
{
if (want_rc_rounded)
{
if (error_free_transformation && madds0)
{
MOVDDUP(Rc_rounded, Rc);
Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2);
}
else if (error_free_transformation && madds1)
{
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_rounded, Rc, Rc, 3);
Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2);
}
else
{
Force25BitPrecision(Rc_rounded, Rc, XMM2);
}
}
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
{
if ((i == 0 || madds0) && !madds1)
if (madds0 || (i == 0 && !madds1) || (want_rc_rounded && error_free_transformation && madds1))
{
if (round_input)
if (want_rc_rounded)
MOVAPD(XMM1, R(Rc_rounded));
else if (round_input)
Force25BitPrecision(XMM1, Rc, XMM2);
else if (Rc.IsSimpleReg())
MOVAPD(XMM1, Rc);
else
MOVSD(XMM1, Rc);
}
else
{
MOVHLPS(XMM1, Rc.GetSimpleReg());
if (round_input)
MOVHLPS(XMM1, want_rc_rounded ? Rc_rounded : Rc.GetSimpleReg());
if (round_input && !want_rc_rounded)
Force25BitPrecision(XMM1, R(XMM1), XMM2);
}
// Write the result from the previous loop iteration into result_xmm so we don't lose it.
// It's important that this is done after reading Rc above, in case we have madds1 and
// result_xmm == Rd == Rc.
// !want_rc_rounded and result_xmm == Rd == Rc.
if (packed && i == 0)
MOVLHPS(result_xmm, XMM0);
if (i == 0)
{
MOVSD(XMM0, Ra);
MOVSD(XMM2, Rb);
if (Ra.IsSimpleReg())
MOVAPD(XMM0, Ra);
else
MOVSD(XMM0, Ra);
if (Rb.IsSimpleReg())
MOVAPD(XMM2, Rb);
else
MOVSD(XMM2, Rb);
}
else
{
@ -473,23 +554,36 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
if (subtract)
XORPS(XMM2, MConst(psSignBits));
BitSet32 registers_in_use = CallerSavedRegistersInUse();
BitSet32 scratch_registers_to_save{};
if (packed && i == 0)
scratch_registers_to_save[result_xmm + 16] = true;
if (want_rc_rounded && (error_free_transformation || i == 1))
scratch_registers_to_save[Rc_rounded + 16] = true;
const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save);
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
}
if (packed)
{
// result_xmm's upper lane has the result of the first loop iteration
MOVSD(R(result_xmm), XMM0);
}
else
{
DEBUG_ASSERT(result_xmm == XMM0);
}
if (madds_accurate_nans)
if (want_rc_duplicated)
{
if (madds0)
MOVDDUP(Rc_duplicated, Rc);
else
else if (madds1)
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_duplicated, Rc, Rc, 3);
else
DEBUG_ASSERT(false);
}
}
else
@ -497,7 +591,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
if (madds0)
{
MOVDDUP(result_xmm, Rc);
if (madds_accurate_nans)
if (want_rc_duplicated)
MOVAPD(R(Rc_duplicated), result_xmm);
if (round_input)
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
@ -505,18 +599,21 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
else if (madds1)
{
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
if (madds_accurate_nans)
if (want_rc_duplicated)
MOVAPD(R(Rc_duplicated), result_xmm);
if (round_input)
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
}
else
{
DEBUG_ASSERT(!want_rc_duplicated);
if (round_input)
Force25BitPrecision(result_xmm, Rc, scratch_xmm);
else
MOVAPD(result_xmm, Rc);
}
if (want_rc_rounded)
MOVAPD(R(Rc_rounded), result_xmm);
if (use_fma)
{
@ -556,6 +653,160 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
}
}
if (m_accurate_nans && result_xmm == XMM0)
{
// HandleNaNs needs to clobber XMM0
result_xmm = error_free_transformation ? XMM1 : Rd;
MOVAPD(result_xmm, R(XMM0));
DEBUG_ASSERT(!preserve_d);
}
std::optional<FixupBranch> handled_nans;
if (!packed && m_accurate_nans)
{
// The clobber register is unused when not packed.
handled_nans =
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc);
}
// Read the comment in the interpreter function NI_madd_msub to find out what's going on here.
if (error_free_transformation)
{
if (result_xmm != XMM1)
{
MOVAPD(XMM1, R(result_xmm));
result_xmm = XMM1;
}
X64Reg Rc_rounded_duplicated = Rc.GetSimpleReg();
BitSet32 scratch_registers_to_save = {XMM1 + 16, XMM2 + 16};
if (want_rc_rounded)
{
Rc_rounded_duplicated = Rc_rounded;
scratch_registers_to_save[Rc_rounded] = true;
}
else if (want_rc_duplicated)
{
Rc_rounded_duplicated = Rc_duplicated;
scratch_registers_to_save[want_rc_duplicated] = true;
}
// We've calculated s := a + b, with a = Ra * Rc_rounded_duplicated, b = subtract ? -Rb : Rb
if (packed)
{
// a' := s - b
if (subtract)
avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM0, R(XMM1), Rb);
else
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM1), Rb);
// b' := s - a'
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM1), R(XMM0));
// da := a - a'
if (software_fma)
{
scratch_registers_to_save[XMM0 + 16] = true;
const BitSet32 registers_in_use_1 = registers_to_save(scratch_registers_to_save);
ABI_PushRegistersAndAdjustStack(registers_in_use_1, 0);
avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits));
MOVAPD(XMM0, R(Rc_rounded_duplicated));
MOVAPD(XMM1, Ra);
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
// We will read from the upper lane of Rc_rounded_duplicated later,
// so we need to make sure that that lane isn't overwritten.
if (Rc_rounded_duplicated == XMM3)
MOVSD(XMM3, R(XMM0));
else
MOVAPD(XMM3, R(XMM0));
ABI_PopRegistersAndAdjustStack(registers_in_use_1, 0);
scratch_registers_to_save[XMM0 + 16] = false;
scratch_registers_to_save[XMM3 + 16] = true;
const BitSet32 registers_in_use_2 = registers_to_save(scratch_registers_to_save);
ABI_PushRegistersAndAdjustStack(registers_in_use_2, 0);
MOVHLPS(XMM2, XMM0);
XORPS(XMM2, MConst(psSignBits));
MOVHLPS(XMM0, Rc_rounded_duplicated);
MOVHLPS(XMM1, Ra.GetSimpleReg());
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
ABI_PopRegistersAndAdjustStack(registers_in_use_2, 0);
UNPCKLPD(XMM0, R(XMM3));
}
else if (use_fma)
{
VFMSUB231PD(XMM0, Rc_rounded_duplicated, Ra);
}
else
{
avx_op(&XEmitter::VMULPD, &XEmitter::MULPD, XMM3, R(Rc_rounded_duplicated), Ra);
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM3), R(XMM0), true, false, XMM3);
}
// db := b - b'
// (Transformed into -db := b' - b)
if (subtract)
avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM2, R(XMM2), Rb);
else
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM2), Rb);
CALL(GetAsmRoutines()->ps_madd_eft);
}
else
{
// a' := s - b
if (subtract)
avx_op(&XEmitter::VADDSD, &XEmitter::ADDSD, XMM0, R(XMM1), Rb, false);
else
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM1), Rb, false);
// b' := s - a'
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM2, R(XMM1), R(XMM0), false);
// da := a - a'
if (software_fma)
{
const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save);
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits));
MOVAPD(XMM0, R(Rc_rounded_duplicated));
MOVAPD(XMM1, Ra);
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
}
else if (use_fma)
{
VFMSUB231SD(XMM0, Rc_rounded_duplicated, Ra);
}
else
{
avx_op(&XEmitter::VMULSD, &XEmitter::MULSD, XMM3, R(Rc_rounded_duplicated), Ra, false);
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM3), R(XMM0), false, false, XMM3);
}
// db := b - b'
// (Transformed into -db := b' - b)
if (subtract)
ADDSD(XMM2, Rb);
else
SUBSD(XMM2, Rb);
CALL(GetAsmRoutines()->fmadds_eft);
}
}
// Using x64's nmadd/nmsub would require us to swap the sign of the addend
// (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes.
// Also, PowerPC's nmadd/nmsub round before the final negation unlike x64's nmadd/nmsub.
@ -563,16 +814,19 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
if (negate)
XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
if (m_accurate_nans && result_xmm == XMM0)
if (packed && m_accurate_nans)
{
// HandleNaNs needs to clobber XMM0
MOVAPD(Rd, R(result_xmm));
result_xmm = Rd;
DEBUG_ASSERT(!preserve_d);
// If packed, the clobber register must be XMM0.
handled_nans =
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc);
}
// If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, madds_accurate_nans ? R(Rc_duplicated) : Rc);
// If the handled_nans branch was taken in the non-packed case, that means the result is NaN,
// so we can skip the XORPD and the error-free transformation. If the handled_nans branch was
// taken in the packed case, we don't know if both of the results were NaN or only one, so we
// can't skip anything.
if (handled_nans)
SetJumpTarget(*handled_nans);
if (single)
FinalizeSingleResult(Rd, R(result_xmm), packed, true);
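In scalar form, the values this block hands to fmadds_eft (added to the asm routines further down) are just the classic FMA error decomposition; a sketch with illustrative names, where b_signed already carries the msub negation and c_rounded is frC after Force25BitPrecision:

#include <cmath>

struct FmaddsEftInputs
{
  double s;    // FMA result, possibly double-rounded  (XMM1)
  double da;   // first error term                     (XMM0)
  double ndb;  // second error term, negated           (XMM2)
};

inline FmaddsEftInputs ComputeEftTerms(double a, double c_rounded, double b_signed)
{
  const double s = std::fma(a, c_rounded, b_signed);
  const double a_prime = s - b_signed;                 // a' := s - b
  const double b_prime = s - a_prime;                  // b' := s - a'
  const double da = std::fma(a, c_rounded, -a_prime);  // da := a*c - a'
  const double ndb = b_prime - b_signed;               // -db := b' - b
  return {s, da, ndb};
}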

View File

@ -100,12 +100,19 @@ void Jit64::ps_muls(UGeckoInstruction inst)
default:
PanicAlertFmt("ps_muls WTF!!!");
}
if (round_input)
Force25BitPrecision(XMM1, R(Rc_duplicated), XMM0);
else if (XMM1 != Rc_duplicated)
MOVAPD(XMM1, Rc_duplicated);
MULPD(XMM1, Ra);
HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated);
if (m_accurate_nans)
{
const FixupBranch handled_nans = HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated);
SetJumpTarget(handled_nans);
}
FinalizeSingleResult(Rd, R(XMM1));
}

View File

@ -741,7 +741,8 @@ void EmuCodeBlock::JitClearCA()
// Abstract between AVX and SSE: automatically handle 3-operand instructions
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp,
const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible)
const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible,
X64Reg scratch)
{
if (arg1.IsSimpleReg(regOp))
{
@ -778,19 +779,19 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
else
{
// The ugly case: Not reversible, and we have regOp == arg2 without AVX or with arg1 == memory
if (!arg1.IsSimpleReg(XMM0))
MOVAPD(XMM0, arg1);
if (!arg1.IsSimpleReg(scratch))
MOVAPD(scratch, arg1);
if (cpu_info.bAVX)
{
(this->*avxOp)(regOp, XMM0, arg2);
(this->*avxOp)(regOp, scratch, arg2);
}
else
{
(this->*sseOp)(XMM0, arg2);
(this->*sseOp)(scratch, arg2);
if (packed)
MOVAPD(regOp, R(XMM0));
MOVAPD(regOp, R(scratch));
else
MOVSD(regOp, R(XMM0));
MOVSD(regOp, R(scratch));
}
}
}
@ -798,7 +799,7 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
// Abstract between AVX and SSE: automatically handle 3-operand instructions
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&, u8),
void (XEmitter::*sseOp)(X64Reg, const OpArg&, u8), X64Reg regOp,
const OpArg& arg1, const OpArg& arg2, u8 imm)
const OpArg& arg1, const OpArg& arg2, u8 imm, X64Reg scratch)
{
if (arg1.IsSimpleReg(regOp))
{
@ -816,21 +817,40 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
else
{
// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
if (!arg1.IsSimpleReg(XMM0))
MOVAPD(XMM0, arg1);
if (!arg1.IsSimpleReg(scratch))
MOVAPD(scratch, arg1);
if (cpu_info.bAVX)
{
(this->*avxOp)(regOp, XMM0, arg2, imm);
(this->*avxOp)(regOp, scratch, arg2, imm);
}
else
{
(this->*sseOp)(XMM0, arg2, imm);
if (regOp != XMM0)
MOVAPD(regOp, R(XMM0));
(this->*sseOp)(scratch, arg2, imm);
if (regOp != scratch)
MOVAPD(regOp, R(scratch));
}
}
}
// Abstract between AVX and SSE: automatically handle 3-operand instructions
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, u8),
void (XEmitter::*sseOp)(X64Reg, u8), X64Reg regOp1, X64Reg regOp2, u8 imm)
{
if (regOp1 == regOp2)
{
(this->*sseOp)(regOp1, imm);
}
else if (cpu_info.bAVX)
{
(this->*avxOp)(regOp1, regOp2, imm);
}
else
{
MOVAPD(regOp1, R(regOp2));
(this->*sseOp)(regOp1, imm);
}
}
alignas(16) static const u64 psMantissaTruncate[2] = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
alignas(16) static const u64 psRoundBit[2] = {0x8000000, 0x8000000};
@ -842,8 +862,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
{
if (m_jit.jo.accurateSinglePrecision)
{
DEBUG_ASSERT(output != tmp);
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
if (input.IsSimpleReg() && cpu_info.bAVX)
if (input.IsSimpleReg() && !input.IsSimpleReg(tmp) && cpu_info.bAVX)
{
VPAND(tmp, input.GetSimpleReg(), MConst(psRoundBit));
VPAND(output, input.GetSimpleReg(), MConst(psMantissaTruncate));

View File

@ -113,10 +113,14 @@ public:
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), Gen::X64Reg regOp,
const Gen::OpArg& arg1, const Gen::OpArg& arg2, bool packed = true,
bool reversible = false);
bool reversible = false, Gen::X64Reg scratch = Gen::XMM0);
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&, u8),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp,
const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm);
const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm,
Gen::X64Reg scratch = Gen::XMM0);
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, u8),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, u8), Gen::X64Reg regOp1, Gen::X64Reg regOp2,
u8 imm);
void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp);

View File

@ -326,6 +326,98 @@ void CommonAsmRoutines::GenMfcr()
Common::JitRegister::Register(start, GetCodePtr(), "JIT_Mfcr");
}
// Inputs:
// XMM0: First error term
// XMM1: Result with potentially incorrect rounding
// XMM2: Second error term, negated
//
// Outputs result with corrected rounding in XMM1.
// Clobbers RSCRATCH, RSCRATCH2, XMM0, XMM2, and flags.
void CommonAsmRoutines::GenerateFmaddsEft()
{
// Check if XMM1 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
MOVQ_xmm(R(RSCRATCH), XMM1);
MOV(32, R(RSCRATCH2), Imm32(0x80000000));
LEA(32, RSCRATCH2, MComplex(RSCRATCH2, RSCRATCH, SCALE_8, 0));
TEST(32, R(RSCRATCH2), R(RSCRATCH2));
FixupBranch even_tie = J_CC(CCFlags::CC_Z);
const u8* ret = GetCodePtr();
RET();
// Check if the error is 0
SetJumpTarget(even_tie);
SUBSD(XMM0, R(XMM2));
XORPD(XMM2, R(XMM2));
UCOMISD(XMM0, R(XMM2));
J_CC(CCFlags::CC_E, ret);
// Round XMM1 up or down
MOVQ_xmm(R(RSCRATCH2), XMM0);
XOR(64, R(RSCRATCH2), R(RSCRATCH));
SAR(64, R(RSCRATCH2), Imm8(63));
OR(64, R(RSCRATCH2), Imm8(1));
ADD(64, R(RSCRATCH), R(RSCRATCH2));
MOVQ_xmm(XMM1, R(RSCRATCH));
RET();
}
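The same routine expressed as plain C++, to make the bit tricks readable (a sketch of the intent, not of the emitted instructions; the LEA/TEST pair is just a way of testing (bits & 0x1fffffff) == 0x10000000 without a 64-bit immediate):

#include <bit>
#include <cmath>
#include <cstdint>

double FmaddsEft(double s /*XMM1*/, double da /*XMM0*/, double ndb /*XMM2*/)
{
  const std::uint64_t bits = std::bit_cast<std::uint64_t>(s);
  if ((bits & 0x1fffffff) != 0x10000000)
    return s;  // not an even tie

  const double error = da - ndb;  // SUBSD: da + db
  if (error == 0.0 || std::isnan(error))
    return s;  // UCOMISD sets ZF for both "equal" and "unordered"

  // XOR/SAR/OR/ADD: step the mantissa by one ulp, away from zero when the
  // error has the same sign as the result, towards zero otherwise.
  const bool same_sign = std::signbit(error) == std::signbit(s);
  return std::bit_cast<double>(same_sign ? bits + 1 : bits - 1);
}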
alignas(16) static const __m128i double_msb = _mm_set_epi64x(0x8000000000000000,
0x8000000000000000);
alignas(16) static const __m128i double_lsb = _mm_set_epi64x(1, 1);
// Inputs:
// XMM0: First error terms
// XMM1: Results with potentially incorrect rounding
// XMM2: Second error terms, negated
//
// Outputs results with corrected rounding in XMM1. Clobbers RSCRATCH, XMM0-XMM3, and flags.
void CommonAsmRoutines::GeneratePsMaddEft()
{
// Check if XMM1 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
avx_op(&XEmitter::VPSLLQ, &XEmitter::PSLLQ, XMM3, XMM1, 35);
if (cpu_info.bSSE4_1)
{
PCMPEQQ(XMM3, MConst(double_msb));
}
else
{
PCMPEQW(XMM3, MConst(double_msb));
PSHUFD(XMM3, R(XMM3), 0xF5);
}
// Just for performance, exit early if there is no even tie
if (cpu_info.bSSE4_1)
{
PTEST(XMM3, R(XMM3));
}
else
{
PMOVMSKB(RSCRATCH, R(XMM3));
TEST(32, R(RSCRATCH), R(RSCRATCH));
}
FixupBranch even_tie = J_CC(CCFlags::CC_NZ);
RET();
SetJumpTarget(even_tie);
// Check if the error is zero
SUBPD(XMM0, R(XMM2));
XORPD(XMM2, R(XMM2));
CMPPD(XMM2, R(XMM0), CMP_EQ);
// Store -1 or 1 in XMM0 depending on whether we're rounding down or up
PXOR(XMM0, R(XMM1));
PSRAD(XMM0, 31);
PSHUFD(XMM0, R(XMM0), 0xF5);
POR(XMM0, MConst(double_lsb));
// Round the elements that have both a non-zero error and an even tie
PANDN(XMM2, R(XMM3));
PAND(XMM0, R(XMM2));
PADDQ(XMM1, R(XMM0));
RET();
}
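Per lane, the branchless paired version computes the following (a C++ sketch of the intent; the shift-by-35 compare against the sign bit is the packed equivalent of the (bits & 0x1fffffff) == 0x10000000 test, and masks replace the scalar routine's early return):

#include <array>
#include <bit>
#include <cstddef>
#include <cstdint>

std::array<double, 2> PsMaddEft(std::array<double, 2> s,    // XMM1
                                std::array<double, 2> da,   // XMM0
                                std::array<double, 2> ndb)  // XMM2
{
  for (std::size_t i = 0; i < 2; ++i)
  {
    const std::uint64_t bits = std::bit_cast<std::uint64_t>(s[i]);

    // All-ones if this lane is an even tie (PSLLQ by 35 + PCMPEQQ against the MSB).
    const std::uint64_t tie = (bits << 35) == 0x8000000000000000ULL ? ~0ULL : 0ULL;

    // All-ones if the lane's error is non-zero (CMPPD equal-to-zero, inverted by PANDN).
    const double error = da[i] - ndb[i];
    const std::uint64_t nonzero = error != 0.0 ? ~0ULL : 0ULL;

    // +1 or -1 depending on whether the error and the result have the same sign
    // (PXOR of the sign bits, PSRAD/PSHUFD broadcast, POR with 1).
    const std::uint64_t step =
        ((std::bit_cast<std::uint64_t>(error) ^ bits) >> 63) != 0 ? ~0ULL : 1ULL;

    s[i] = std::bit_cast<double>(bits + (step & tie & nonzero));
  }
  return s;
}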
// Safe + Fast Quantizers, originally from JITIL by magumagu
alignas(16) static const float m_65535[4] = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
alignas(16) static const float m_32767 = 32767.0f;

View File

@ -33,6 +33,8 @@ public:
protected:
void GenConvertDoubleToSingle();
void GenerateFmaddsEft();
void GeneratePsMaddEft();
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
void GenQuantizedLoads();

View File

@ -324,6 +324,8 @@ protected:
void GenerateConvertDoubleToSingle();
void GenerateConvertSingleToDouble();
void GenerateFPRF(bool single);
void GenerateFmaddsEft();
void GeneratePsMaddEft();
void GenerateQuantizedLoads();
void GenerateQuantizedStores();

View File

@ -79,9 +79,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
const bool use_b = op5 != 25; // fmul uses no B
const bool fma = use_b && use_c;
const bool negate_result = (op5 & ~0x1) == 30;
const bool negate_b = op5 == 28 || op5 == 30;
const bool output_is_single = inst.OPCD == 59;
const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool error_free_transformation_requested = fma && m_accurate_fmadds;
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[c];
const auto inputs_are_singles_func = [&] {
@ -89,13 +91,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
(!use_c || fpr.IsSingle(c, true));
};
const bool single = inputs_are_singles_func() && output_is_single && !inaccurate_fma;
const bool single = inputs_are_singles_func() && output_is_single &&
(error_free_transformation_requested || !nonfused_requested);
const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
const RegType type_out = output_is_single ?
(single ? RegType::DuplicatedSingle : RegType::Duplicated) :
RegType::LowerPair;
const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
const bool nonfused = nonfused_requested && !single;
const bool error_free_transformation =
error_free_transformation_requested && !single && output_is_single;
if (error_free_transformation)
{
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2);
}
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
@ -103,33 +116,47 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
{
Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG;
Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG;
ARM64Reg rounded_c_reg = VC;
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V0Q);
Force25BitPrecision(rounded_c_reg, VC);
}
ARM64Reg inaccurate_fma_reg = VD;
if (fma && inaccurate_fma && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
inaccurate_fma_reg = reg_encoder(V0Q);
}
ARM64Reg result_reg = VD;
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (preserve_d)
ARM64Reg nonfused_reg = VD;
if (error_free_transformation)
{
V1Q = fpr.GetScopedReg();
result_reg = reg_encoder(V1Q);
result_reg = reg_encoder(ARM64Reg::Q0);
nonfused_reg = reg_encoder(ARM64Reg::Q0);
if (nonfused && V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
}
else
{
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (preserve_d)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
result_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);
}
else if (fma && nonfused && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
nonfused_reg = reg_encoder(V0Q);
}
}
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
Force25BitPrecision(rounded_c_reg, VC);
}
switch (op5)
@ -152,10 +179,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
// So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
m_float_emit.FSUB(result_reg, nonfused_reg, VB);
}
else
{
@ -164,10 +191,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
break;
case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
m_float_emit.FADD(result_reg, nonfused_reg, VB);
}
else
{
@ -180,6 +207,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}
Common::SmallVector<FixupBranch, 4> nan_fixups;
std::optional<FixupBranch> nan_early_fixup;
if (m_accurate_nans)
{
// Check if we need to handle NaNs
@ -216,7 +244,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
SetJumpTarget(skip);
}
std::optional<FixupBranch> nan_early_fixup;
if (negate_result)
{
// If we have a NaN, we must not execute FNEG.
@ -230,11 +257,46 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}
SwitchToNearCode();
if (nan_early_fixup)
SetJumpTarget(*nan_early_fixup);
}
// Read the comment in the interpreter function NI_madd_msub to find out what's going on here
if (error_free_transformation)
{
// We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? -VB : VB)
// a' := s - b
if (negate_b)
m_float_emit.FADD(ARM64Reg::D1, result_reg, VB);
else
m_float_emit.FSUB(ARM64Reg::D1, result_reg, VB);
// b' := s - a'
m_float_emit.FSUB(ARM64Reg::D2, result_reg, ARM64Reg::D1);
// da := a - a'
if (nonfused)
{
m_float_emit.FMUL(EncodeRegToDouble(V0Q), VA, rounded_c_reg);
m_float_emit.FSUB(ARM64Reg::D1, EncodeRegToDouble(V0Q), ARM64Reg::D1);
}
else
{
m_float_emit.FNMSUB(ARM64Reg::D1, VA, rounded_c_reg, ARM64Reg::D1);
}
// db := b - b'
// (Transformed into -db := b' - b)
if (negate_b)
m_float_emit.FADD(ARM64Reg::D2, ARM64Reg::D2, VB);
else
m_float_emit.FSUB(ARM64Reg::D2, ARM64Reg::D2, VB);
BL(GetAsmRoutines()->fmadds_eft);
}
if (nan_early_fixup)
SetJumpTarget(*nan_early_fixup);
// PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
// for any of AArch64's FMA instructions, so we negate using a separate instruction.
if (negate_result)
@ -254,7 +316,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
fpr.FixSinglePrecision(d);
}
if (error_free_transformation)
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
SetFPRFIfNeeded(output_is_single, VD);
if (error_free_transformation)
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2);
}
void JitArm64::fp_logic(UGeckoInstruction inst)

View File

@ -92,20 +92,31 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
const bool duplicated_c = muls || madds;
const bool fma = use_b && use_c;
const bool negate_result = (op5 & ~0x1) == 30;
const bool msub = op5 == 28 || op5 == 30;
const bool negate_b = op5 == 28 || op5 == 30;
const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool error_free_transformation_requested = fma && m_accurate_fmadds;
const bool round_c = use_c && !js.op->fprIsSingle[c];
const auto inputs_are_singles_func = [&] {
return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
};
const bool single = inputs_are_singles_func() && !inaccurate_fma;
const bool single =
inputs_are_singles_func() && (error_free_transformation_requested || !nonfused_requested);
const RegType type = single ? RegType::Single : RegType::Register;
const u8 size = single ? 32 : 64;
const auto reg_encoder = single ? EncodeRegToDouble : EncodeRegToQuad;
const bool nonfused = nonfused_requested && !single;
const bool error_free_transformation = error_free_transformation_requested && !single;
if (error_free_transformation)
{
gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
}
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
@ -119,41 +130,77 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
ARM64Reg rounded_c_reg = VC;
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V0Q);
Force25BitPrecision(rounded_c_reg, VC);
}
ARM64Reg inaccurate_fma_reg = VD;
if (fma && inaccurate_fma && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
if (error_free_transformation)
{
// This register happens to be free, so we can skip allocating one
rounded_c_reg = ARM64Reg::Q3;
}
else
{
V0Q = fpr.GetScopedReg();
inaccurate_fma_reg = reg_encoder(V0Q);
rounded_c_reg = reg_encoder(V0Q);
}
}
ARM64Reg result_reg = VD;
const bool need_accurate_fma_reg =
fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (need_accurate_fma_reg || preserve_d)
ARM64Reg nonfused_reg = VD;
if (error_free_transformation)
{
V1Q = fpr.GetScopedReg();
result_reg = reg_encoder(V1Q);
result_reg = reg_encoder(ARM64Reg::Q0);
nonfused_reg = reg_encoder(ARM64Reg::Q0);
}
else
{
const bool need_fused_fma_reg =
fma && !nonfused && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (need_fused_fma_reg || preserve_d)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
result_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);
if (need_fused_fma_reg && round_c)
{
V1Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V1Q);
}
}
else if (fma && nonfused && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
nonfused_reg = reg_encoder(V0Q);
}
}
if (m_accurate_nans)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
if (error_free_transformation)
{
// These registers happen to be free, so we can skip allocating new ones
V1Q = ARM64Reg::Q1;
V2Q = ARM64Reg::Q2;
}
else
{
if (V1Q == ARM64Reg::INVALID_REG)
V1Q = fpr.GetScopedReg();
if (duplicated_c || VD == result_reg)
V2Q = fpr.GetScopedReg();
if (duplicated_c || VD == result_reg)
V2Q = fpr.GetScopedReg();
}
}
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
Force25BitPrecision(rounded_c_reg, VC);
}
std::optional<ARM64Reg> negated_b_reg;
switch (op5)
{
case 12: // ps_muls0: d = a * c.ps0
@ -163,10 +210,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
break;
case 14: // ps_madds0: d = a * c.ps0 + b
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 0);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -176,10 +223,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
}
break;
case 15: // ps_madds1: d = a * c.ps1 + b
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 1);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -202,23 +249,28 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
case 28: // ps_msub: d = a * c - b
case 30: // ps_nmsub: d = -(a * c - b)
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
m_float_emit.FSUB(size, result_reg, nonfused_reg, VB);
}
else
{
m_float_emit.FNEG(size, result_reg, VB);
if (error_free_transformation)
{
m_float_emit.MOV(ARM64Reg::Q4, result_reg);
negated_b_reg = ARM64Reg::Q4;
}
m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
}
break;
case 29: // ps_madd: d = a * c + b
case 31: // ps_nmadd: d = -(a * c + b)
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -232,11 +284,80 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
}
// Read the comment in the interpreter function NI_madd_msub to find out what's going on here
if (error_free_transformation)
{
// We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? -VB : VB)
// a' := s - b
// (Transformed into -a' := b - s)
if (negate_b)
{
if (!negated_b_reg)
{
m_float_emit.FNEG(size, ARM64Reg::Q4, VB);
negated_b_reg = ARM64Reg::Q4;
}
m_float_emit.FSUB(size, ARM64Reg::Q1, *negated_b_reg, result_reg);
}
else
{
m_float_emit.FSUB(size, ARM64Reg::Q1, VB, result_reg);
}
// b' := s - a'
// (Transformed into b' := s + -a')
m_float_emit.FADD(size, ARM64Reg::Q2, result_reg, ARM64Reg::Q1);
// da := a - a'
// (Transformed into da := a + -a')
if (nonfused)
{
switch (op5)
{
case 14: // ps_madds0: d = a * c.ps0 + b
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 0);
break;
case 15: // ps_madds1: d = a * c.ps1 + b
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 1);
break;
default:
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg);
break;
}
m_float_emit.FADD(size, ARM64Reg::Q1, ARM64Reg::Q3, ARM64Reg::Q1);
}
else
{
switch (op5)
{
case 14: // ps_madds0: d = a * c.ps0 + b
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 0);
break;
case 15: // ps_madds1: d = a * c.ps1 + b
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 1);
break;
default:
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg);
break;
}
}
// db := b - b'
// (Transformed into -db := b' - b)
if (negate_b)
m_float_emit.FADD(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
else
m_float_emit.FSUB(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
BL(GetAsmRoutines()->ps_madd_eft);
}
FixupBranch nan_fixup;
if (m_accurate_nans)
{
const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V1Q) : EncodeRegToDouble(V1Q);
const ARM64Reg nan_temp_reg_paired = reg_encoder(V1Q);
// Check if we need to handle NaNs
@ -306,7 +427,13 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
fpr.FixSinglePrecision(d);
if (error_free_transformation)
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
SetFPRFIfNeeded(true, VD);
if (error_free_transformation)
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
}
void JitArm64::ps_sel(UGeckoInstruction inst)

View File

@ -8,6 +8,7 @@
#include <utility>
#include "Common/Arm64Emitter.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/Config/Config.h"
#include "Common/FloatUtils.h"
@ -265,6 +266,14 @@ void JitArm64::GenerateCommonAsm()
GenerateFPRF(false);
Common::JitRegister::Register(GetAsmRoutines()->fprf_single, GetCodePtr(), "JIT_FPRF");
GetAsmRoutines()->fmadds_eft = GetCodePtr();
GenerateFmaddsEft();
Common::JitRegister::Register(GetAsmRoutines()->fmadds_eft, GetCodePtr(), "JIT_fmadds_eft");
GetAsmRoutines()->ps_madd_eft = GetCodePtr();
GeneratePsMaddEft();
Common::JitRegister::Register(GetAsmRoutines()->ps_madd_eft, GetCodePtr(), "JIT_ps_madd_eft");
GenerateQuantizedLoads();
GenerateQuantizedStores();
}
@ -514,6 +523,90 @@ void JitArm64::GenerateFPRF(bool single)
B(write_fprf_and_ret);
}
// Inputs:
// D0: Result with potentially incorrect rounding
// D1: First error term
// D2: Second error term, negated
//
// Outputs result with corrected rounding in D0. Clobbers X0-X1, D1, and flags.
void JitArm64::GenerateFmaddsEft()
{
// Check if D0 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
MOVI2R(ARM64Reg::W1, 0x80000000);
CMP(ARM64Reg::W1, ARM64Reg::W0, ArithOption(ARM64Reg::W0, ShiftType::LSL, 3));
FixupBranch even_tie = B(CCFlags::CC_EQ);
const u8* ret = GetCodePtr();
RET();
// Check if the error is 0
SetJumpTarget(even_tie);
m_float_emit.FSUB(ARM64Reg::D1, ARM64Reg::D1, ARM64Reg::D2);
m_float_emit.FCMP(ARM64Reg::D1);
B(CCFlags::CC_EQ, ret);
// Round D0 up or down
MOVZ(ARM64Reg::X1, 1);
CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
CMP(ARM64Reg::X0, 0);
CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
ADD(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1);
m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0);
RET();
}
// Inputs:
// Q0: Results with potentially incorrect rounding
// Q1: First error terms
// Q2: Second error terms, negated
//
// Outputs results with corrected rounding in Q0. Clobbers X0, Q1-Q4, and flags.
void JitArm64::GeneratePsMaddEft()
{
// Check if Q0 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
MOVI2R(ARM64Reg::X0, 0x8000'0000'0000'0000);
m_float_emit.SHL(64, ARM64Reg::Q3, ARM64Reg::Q0, 35);
m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
m_float_emit.CMEQ(64, ARM64Reg::Q3, ARM64Reg::Q3, ARM64Reg::Q4);
// Just for performance, exit early if there is no even tie
m_float_emit.XTN(32, ARM64Reg::D4, ARM64Reg::Q3);
FixupBranch even_tie;
if (cpu_info.bAFP)
{
m_float_emit.FCMP(ARM64Reg::D4);
even_tie = B(CCFlags::CC_NEQ);
}
else
{
// If we don't have AFP and the emulated software has NI set, subnormals will compare equal to
// zero, so we can't use FCMP unless we were to put some shuffle instruction before it.
// FMOV is a little slower than FCMP, but it's faster than adding an extra instruction.
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D4);
even_tie = CBNZ(ARM64Reg::X0);
}
RET();
SetJumpTarget(even_tie);
// Check if the error is zero
m_float_emit.FSUB(64, ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
MOVZ(ARM64Reg::X0, 1);
m_float_emit.FCMEQ(64, ARM64Reg::Q2, ARM64Reg::Q1);
// Store -1 or 1 in Q1 depending on whether we're rounding down or up
m_float_emit.EOR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q0);
m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
m_float_emit.SSHR(64, ARM64Reg::Q1, ARM64Reg::Q1, 63);
m_float_emit.ORR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q4);
// Round the elements that have both a non-zero error and an even tie
m_float_emit.BIC(ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q2);
m_float_emit.AND(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
m_float_emit.ADD(64, ARM64Reg::Q0, ARM64Reg::Q0, ARM64Reg::Q1);
RET();
}
void JitArm64::GenerateQuantizedLoads()
{
// X0 is a temporary

View File

@ -30,6 +30,8 @@ struct CommonAsmRoutinesBase
const u8* cstd;
const u8* fprf_single;
const u8* fprf_double;
const u8* fmadds_eft;
const u8* ps_madd_eft;
// In: array index: GQR to use.
// In: ECX: Address to read from.

View File

@ -57,7 +57,7 @@
// After resetting the stack to the top, we call _resetstkoflw() to restore
// the guard page at the 256kb mark.
const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitBase::JIT_SETTINGS{{
const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JitBase::JIT_SETTINGS{{
{&JitBase::bJITOff, &Config::MAIN_DEBUG_JIT_OFF},
{&JitBase::bJITLoadStoreOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_OFF},
{&JitBase::bJITLoadStorelXzOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_LXZ_OFF},
@ -79,6 +79,7 @@ const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitB
{&JitBase::m_low_dcbz_hack, &Config::MAIN_LOW_DCBZ_HACK},
{&JitBase::m_fprf, &Config::MAIN_FPRF},
{&JitBase::m_accurate_nans, &Config::MAIN_ACCURATE_NANS},
{&JitBase::m_accurate_fmadds, &Config::MAIN_ACCURATE_FMADDS},
{&JitBase::m_fastmem_enabled, &Config::MAIN_FASTMEM},
{&JitBase::m_accurate_cpu_cache_enabled, &Config::MAIN_ACCURATE_CPU_CACHE},
}};

View File

@ -158,6 +158,7 @@ protected:
bool m_low_dcbz_hack = false;
bool m_fprf = false;
bool m_accurate_nans = false;
bool m_accurate_fmadds = false;
bool m_fastmem_enabled = false;
bool m_accurate_cpu_cache_enabled = false;
@ -165,7 +166,7 @@ protected:
bool m_cleanup_after_stackfault = false;
u8* m_stack_guard = nullptr;
static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JIT_SETTINGS;
static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JIT_SETTINGS;
bool DoesConfigNeedRefresh() const;
void RefreshConfig();