Merge pull request #13900 from JosJuice/jit-fma-double-rounding

Jit: Implement error-free transformation for single-precision FMA
This commit is contained in:
JosJuice 2026-01-23 21:43:18 +01:00 committed by GitHub
commit 3221e982d3
GPG Key ID: B5690EEEBB952194
26 changed files with 958 additions and 191 deletions
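For context, the problem being solved: emulating PPC's single-precision fmadds with a double-precision std::fma followed by a conversion to single rounds twice, and the two roundings can disagree exactly when the double result lands on a halfway point between two singles. This commit detects that halfway pattern and uses an error-free transformation of the FMA to decide which way the tie should really go. A condensed sketch of the idea in plain C++ (illustrative only; the function name is made up, and the real code additionally rounds frC via Force25BitPrecision and handles NaNs, FPSCR and the paired-single forms):

#include <bit>
#include <cmath>
#include <cstdint>

// PPC fmadds computes frA * frC + frB with a single rounding to single precision.
float EmulatedFmadds(double a, double c, double b)
{
  const double d = std::fma(a, c, b);  // first rounding (to double)
  const std::uint64_t bits = std::bit_cast<std::uint64_t>(d);

  // A double keeps 29 more mantissa bits than a single. Double rounding can only
  // go wrong when those 29 bits are exactly 100...0, i.e. an even tie.
  if (std::isfinite(d) && (bits & 0x1fffffff) == 0x10000000)
  {
    // Error-free transformation: recover the part of a*c + b that the double
    // rounding discarded (the decomposition used by the interpreter below).
    const double a_part = d - b;                // a' := s - b
    const double b_part = d - a_part;           // b' := s - a'
    const double da = std::fma(a, c, -a_part);  // da := a*c - a'
    const double db = b - b_part;               // db := b - b'
    const double error = da + db;

    if (error != 0.0 && !std::isnan(error))
    {
      // Nudge d off the tie towards the true result, so the final conversion
      // to single rounds the way a single fused operation would have.
      const bool same_sign = std::signbit(error) == std::signbit(d);
      return static_cast<float>(std::bit_cast<double>(same_sign ? bits + 1 : bits - 1));
    }
  }
  return static_cast<float>(d);  // second rounding (to single)
}

The JITs below emit the same decomposition inline and branch to shared routines (fmadds_eft for scalars, ps_madd_eft for paired singles) that perform the tie check and the one-ulp adjustment.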

View File

@ -3156,6 +3156,10 @@ void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn);
}
void ARM64FloatEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, 0, 3, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn);
@ -3505,6 +3509,53 @@ void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale)
EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn);
}
// Comparison
void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn);
}
void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn);
}
void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn);
}
void ARM64FloatEmitter::CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm);
}
void ARM64FloatEmitter::CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn);
}
void ARM64FloatEmitter::CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0xA, Rd, Rn);
}
void ARM64FloatEmitter::CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm);
}
// Float comparison
void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm)
{
EmitCompare(0, 0, 0, 0, Rn, Rm);
@ -3664,7 +3715,7 @@ void ARM64FloatEmitter::SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
shift, src_size);
EmitShiftImm(1, 0, src_size | shift, 0b01010, Rd, Rn);
EmitShiftImm(IsQuad(Rd), 0, src_size | shift, 0b01010, Rd, Rn);
}
void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
@ -3674,11 +3725,18 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift,
EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn);
}
void ARM64FloatEmitter::SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
shift, src_size);
EmitShiftImm(IsQuad(Rd), 0, src_size * 2 - shift, 0b00000, Rd, Rn);
}
void ARM64FloatEmitter::URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}",
shift, src_size);
EmitShiftImm(1, 1, src_size * 2 - shift, 0b00100, Rd, Rn);
EmitShiftImm(IsQuad(Rd), 1, src_size * 2 - shift, 0b00100, Rd, Rn);
}
void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)

View File

@ -800,6 +800,7 @@ public:
ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR;
CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
}
void CNEG(ARM64Reg Rd, ARM64Reg Rn, CCFlags cond) { CSNEG(Rd, Rn, Rn, (CCFlags)((u32)cond ^ 1)); }
void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
{
@ -1281,6 +1282,7 @@ public:
void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@ -1342,6 +1344,19 @@ public:
void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
// Comparison
void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
// Float comparison
void FCMP(ARM64Reg Rn, ARM64Reg Rm);
void FCMP(ARM64Reg Rn);
@ -1380,6 +1395,7 @@ public:
void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);

View File

@ -2519,19 +2519,19 @@ void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg)
WriteSSEOp(0x66, 0x6C, dest, arg);
}
void XEmitter::PSRLW(X64Reg reg, int shift)
void XEmitter::PSRLW(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
Write8(shift);
}
void XEmitter::PSRLD(X64Reg reg, int shift)
void XEmitter::PSRLD(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
Write8(shift);
}
void XEmitter::PSRLQ(X64Reg reg, int shift)
void XEmitter::PSRLQ(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
Write8(shift);
@ -2542,38 +2542,38 @@ void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg)
WriteSSEOp(0x66, 0xd3, reg, arg);
}
void XEmitter::PSRLDQ(X64Reg reg, int shift)
void XEmitter::PSRLDQ(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
Write8(shift);
}
void XEmitter::PSLLW(X64Reg reg, int shift)
void XEmitter::PSLLW(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
Write8(shift);
}
void XEmitter::PSLLD(X64Reg reg, int shift)
void XEmitter::PSLLD(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
Write8(shift);
}
void XEmitter::PSLLQ(X64Reg reg, int shift)
void XEmitter::PSLLQ(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
Write8(shift);
}
void XEmitter::PSLLDQ(X64Reg reg, int shift)
void XEmitter::PSLLDQ(X64Reg reg, u8 shift)
{
WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
Write8(shift);
}
// WARNING not REX compatible
void XEmitter::PSRAW(X64Reg reg, int shift)
void XEmitter::PSRAW(X64Reg reg, u8 shift)
{
if (reg > 7)
PanicAlertFmt("The PSRAW-emitter does not support regs above 7");
@ -2585,7 +2585,7 @@ void XEmitter::PSRAW(X64Reg reg, int shift)
}
// WARNING not REX compatible
void XEmitter::PSRAD(X64Reg reg, int shift)
void XEmitter::PSRAD(X64Reg reg, u8 shift)
{
if (reg > 7)
PanicAlertFmt("The PSRAD-emitter does not support regs above 7");
@ -2695,6 +2695,11 @@ void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend)
Write8(blend);
}
void XEmitter::PCMPEQQ(X64Reg dest, const OpArg& arg)
{
WriteSSE41Op(0x66, 0x3829, dest, arg);
}
void XEmitter::PAND(X64Reg dest, const OpArg& arg)
{
WriteSSEOp(0x66, 0xDB, dest, arg);
@ -3038,6 +3043,12 @@ void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg);
}
void XEmitter::VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift)
{
WriteAVXOp(0x66, 0x73, (X64Reg)6, regOp1, R(regOp2));
Write8(shift);
}
void XEmitter::VMOVAPS(const OpArg& arg, X64Reg regOp)
{
WriteAVXOp(0x00, 0x29, regOp, X64Reg::INVALID_REG, arg);

View File

@ -801,19 +801,19 @@ public:
void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle);
void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle);
void PSRLW(X64Reg reg, int shift);
void PSRLD(X64Reg reg, int shift);
void PSRLQ(X64Reg reg, int shift);
void PSRLW(X64Reg reg, u8 shift);
void PSRLD(X64Reg reg, u8 shift);
void PSRLQ(X64Reg reg, u8 shift);
void PSRLQ(X64Reg reg, const OpArg& arg);
void PSRLDQ(X64Reg reg, int shift);
void PSRLDQ(X64Reg reg, u8 shift);
void PSLLW(X64Reg reg, int shift);
void PSLLD(X64Reg reg, int shift);
void PSLLQ(X64Reg reg, int shift);
void PSLLDQ(X64Reg reg, int shift);
void PSLLW(X64Reg reg, u8 shift);
void PSLLD(X64Reg reg, u8 shift);
void PSLLQ(X64Reg reg, u8 shift);
void PSLLDQ(X64Reg reg, u8 shift);
void PSRAW(X64Reg reg, int shift);
void PSRAD(X64Reg reg, int shift);
void PSRAW(X64Reg reg, u8 shift);
void PSRAD(X64Reg reg, u8 shift);
// SSE4: data type conversions
void PMOVSXBW(X64Reg dest, const OpArg& arg);
@ -836,6 +836,9 @@ public:
void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend);
void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend);
// SSE4: compare instructions
void PCMPEQQ(X64Reg dest, const OpArg& arg);
// AVX
void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
@ -878,6 +881,8 @@ public:
void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift);
void VMOVAPS(const OpArg& arg, X64Reg regOp);
void VZEROUPPER();

View File

@ -222,6 +222,7 @@ const Info<bool> MAIN_DIVIDE_BY_ZERO_EXCEPTIONS{{System::Main, "Core", "DivByZer
false};
const Info<bool> MAIN_FPRF{{System::Main, "Core", "FPRF"}, false};
const Info<bool> MAIN_ACCURATE_NANS{{System::Main, "Core", "AccurateNaNs"}, false};
const Info<bool> MAIN_ACCURATE_FMADDS{{System::Main, "Core", "AccurateFmadds"}, true};
const Info<bool> MAIN_DISABLE_ICACHE{{System::Main, "Core", "DisableICache"}, false};
const Info<float> MAIN_EMULATION_SPEED{{System::Main, "Core", "EmulationSpeed"}, 1.0f};
#if defined(ANDROID)
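Like the neighbouring options, the new setting should surface as a key in the [Core] section of Dolphin.ini (assumed mapping, derived from the System::Main / "Core" location above; True is the default per this diff):

[Core]
AccurateFmadds = True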

View File

@ -128,6 +128,7 @@ extern const Info<bool> MAIN_FLOAT_EXCEPTIONS;
extern const Info<bool> MAIN_DIVIDE_BY_ZERO_EXCEPTIONS;
extern const Info<bool> MAIN_FPRF;
extern const Info<bool> MAIN_ACCURATE_NANS;
extern const Info<bool> MAIN_ACCURATE_FMADDS;
extern const Info<bool> MAIN_DISABLE_ICACHE;
extern const Info<float> MAIN_EMULATION_SPEED;
extern const Info<bool> MAIN_PRECISION_FRAME_TIMING;

View File

@ -80,6 +80,7 @@ public:
layer->Set(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS, m_settings.divide_by_zero_exceptions);
layer->Set(Config::MAIN_FPRF, m_settings.fprf);
layer->Set(Config::MAIN_ACCURATE_NANS, m_settings.accurate_nans);
layer->Set(Config::MAIN_ACCURATE_FMADDS, m_settings.accurate_fmadds);
layer->Set(Config::MAIN_DISABLE_ICACHE, m_settings.disable_icache);
layer->Set(Config::MAIN_SYNC_ON_SKIP_IDLE, m_settings.sync_on_skip_idle);
layer->Set(Config::MAIN_SYNC_GPU, m_settings.sync_gpu);

View File

@ -68,6 +68,7 @@ struct NetSettings
bool divide_by_zero_exceptions = false;
bool fprf = false;
bool accurate_nans = false;
bool accurate_fmadds = false;
bool disable_icache = false;
bool sync_on_skip_idle = false;
bool sync_gpu = false;

View File

@ -1425,6 +1425,7 @@ bool NetPlayServer::SetupNetSettings()
settings.divide_by_zero_exceptions = Config::Get(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS);
settings.fprf = Config::Get(Config::MAIN_FPRF);
settings.accurate_nans = Config::Get(Config::MAIN_ACCURATE_NANS);
settings.accurate_fmadds = Config::Get(Config::MAIN_ACCURATE_FMADDS);
settings.disable_icache = Config::Get(Config::MAIN_DISABLE_ICACHE);
settings.sync_on_skip_idle = Config::Get(Config::MAIN_SYNC_ON_SKIP_IDLE);
settings.sync_gpu = Config::Get(Config::MAIN_SYNC_GPU);

View File

@ -342,12 +342,12 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
// - This will cause `d` to round to 100...00, meaning it will tie then round upwards.
// 3. Tying up to even because `c` is too small
// a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties)
// b. The lowest bit of `f` is 1 (this means it ties to even downwards)
// b. The lowest bit of `f` is 1 (this means it ties to even upwards)
// c. `c` is negative and does not round `d` downwards
// - This is similar to the first one but in reverse, rounding up instead of down.
// 4. Tying down because `d` rounded down
// a. The highest and lowest bits of `d` are 1, the rest of the bits of `d` are 0
// b. The lowest bit of `f` is 0 (this means it ties to even upwards)
// b. The lowest bit of `f` is 0 (this means it ties to even downwards)
// c. `c` is negative, and the highest bit of c is 1,
// and at least one other bit of c is nonzero
// - The backwards counterpart to case 2, this will cause `d` to round back down to 100..00,
@ -375,12 +375,6 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
// - Correct ordering of NaN checking (for both double and single precision)
// - Rounding frC up
// - Rounding only once for single precision inputs (this will be the large majority of cases!)
// - Currently this is interpreter-only.
// This can be implemented in the JIT just as easily, though.
// Eventually the JITs should hopefully support detecting back to back
// single-precision operations, which will lead to no overhead at all.
// In the cases where JITs can't do this, an alternative method is used, as
// is done in the interpreter as well.
// - Rounding only once for double precision inputs
// - This is a side effect of how we handle single-precision inputs: By doing
// error calculations rather than checking if every input is a float, we ensure that we know
@ -421,7 +415,7 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
const double b_sign = sub ? -b : b;
result.value = std::fma(a, c_round, b_sign);
// We then check if we're currently tying in rounding directioh
// We then check if we're currently tying in rounding direction
const u64 result_bits = std::bit_cast<u64>(result.value);
// The mask of the `d` bits as shown in the above comments
@ -432,9 +426,8 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double
// Because we check this entire mask which includes a 1 bit, we can be sure that
// if this result passes, the input is not an infinity that would become a NaN.
// This means that, for the JITs, if they only wanted to check for a subset of these
// bits (e.g. only checking if the last one was 0), then using the zero flag for a branch,
// they would have to check if the result was NaN before here.
// If we had only checked for a subset of these bits (e.g. only checking if the last
// one was 0), we would have needed to also check if the exponent was all ones.
if ((result_bits & D_MASK) == EVEN_TIE)
{
// Because we have a tie, we now compute any error in the FMA calculation
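A concrete instance of the tie problem described above, runnable on any IEEE 754 host (the inputs are chosen here purely for illustration):

#include <cmath>
#include <cstdio>

int main()
{
  const double a = 1.0;
  const double c = 1.0;
  const double b = 0x1.000000001p-24;  // 2^-24 + 2^-60

  // Exact a*c + b is 1 + 2^-24 + 2^-60, just above the midpoint between the
  // singles 1.0 and 1 + 2^-23, so a correctly rounded fmadds must round up.
  const double d = std::fma(a, c, b);  // first rounding drops the 2^-60 term
  std::printf("double-then-single: %.9g\n", static_cast<double>(static_cast<float>(d)));  // 1
  std::printf("single rounding:    %.9g\n", static_cast<double>(1.0f + 0x1p-23f));        // 1.00000012
}

Here the double result is exactly 1 + 2^-24, an even tie, and converting it to single ties down to 1.0; the error terms computed above are what lets the tie be broken upwards instead.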

View File

@ -1284,9 +1284,9 @@ BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const
return cb.m_gqr_used & ~cb.m_gqr_modified;
}
BitSet32 Jit64::CallerSavedRegistersInUse() const
BitSet32 Jit64::CallerSavedRegistersInUse(BitSet32 additional_registers) const
{
BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16);
BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16) | additional_registers;
return in_use & ABI_ALL_CALLER_SAVED;
}

View File

@ -77,7 +77,7 @@ public:
// Returns false if no free memory region can be found for either of the two.
bool SetEmitterStateToFreeCodeRegion();
BitSet32 CallerSavedRegistersInUse() const;
BitSet32 CallerSavedRegistersInUse(BitSet32 additional_registers = {}) const;
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;
void IntializeSpeculativeConstants();
@ -153,9 +153,10 @@ public:
void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true,
bool duplicate = false);
void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, Gen::X64Reg clobber,
std::optional<Gen::OpArg> Ra, std::optional<Gen::OpArg> Rb,
std::optional<Gen::OpArg> Rc);
[[nodiscard]] Gen::FixupBranch HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm,
Gen::X64Reg clobber, std::optional<Gen::OpArg> Ra,
std::optional<Gen::OpArg> Rb,
std::optional<Gen::OpArg> Rc);
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);

View File

@ -265,6 +265,10 @@ void Jit64AsmRoutineManager::GenerateCommon()
GenMfcr();
cdts = AlignCode4();
GenConvertDoubleToSingle();
fmadds_eft = AlignCode4();
GenerateFmaddsEft();
ps_madd_eft = AlignCode4();
GeneratePsMaddEft();
GenQuantizedLoads();
GenQuantizedSingleLoads();

View File

@ -93,8 +93,9 @@ void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input)
SetFPRFIfNeeded(input, false);
}
void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::optional<OpArg> Ra,
std::optional<OpArg> Rb, std::optional<OpArg> Rc)
FixupBranch Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber,
std::optional<OpArg> Ra, std::optional<OpArg> Rb,
std::optional<OpArg> Rc)
{
// | PowerPC | x86
// ---------------------+----------+---------
@ -104,9 +105,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
// Dragon Ball: Revenge of King Piccolo requires generated NaNs
// to be positive, so we'll have to handle them manually.
if (!m_accurate_nans)
return;
if (inst.OPCD != 4)
{
// not paired-single
@ -140,7 +138,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
FixupBranch done = J(Jump::Near);
SwitchToNearCode();
SetJumpTarget(done);
return done;
}
else
{
@ -217,7 +215,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
FixupBranch done = J(Jump::Near);
SwitchToNearCode();
SetJumpTarget(done);
return done;
}
}
@ -329,14 +327,21 @@ void Jit64::fp_arith(UGeckoInstruction inst)
}
}
switch (inst.SUBOP5)
if (m_accurate_nans)
{
case 18:
HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt);
break;
case 25:
HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2);
break;
std::optional<FixupBranch> handled_nans;
switch (inst.SUBOP5)
{
case 18:
handled_nans = HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt);
break;
case 25:
handled_nans = HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2);
break;
}
if (handled_nans)
SetJumpTarget(*handled_nans);
}
if (single)
@ -368,51 +373,87 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
const bool use_fma = Config::Get(Config::SESSION_USE_FMA);
const bool software_fma = use_fma && !cpu_info.bFMA;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
bool single = inst.OPCD == 4 || inst.OPCD == 59;
bool round_input = single && !js.op->fprIsSingle[c];
bool preserve_inputs = m_accurate_nans;
bool preserve_d = preserve_inputs && (a == d || b == d || c == d);
bool packed =
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
const int a = inst.FA;
const int b = inst.FB;
const int c = inst.FC;
const int d = inst.FD;
const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30; // msub, nmsub
const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31; // nmsub, nmadd
const bool madds0 = inst.SUBOP5 == 14;
const bool madds1 = inst.SUBOP5 == 15;
const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1);
const bool single = inst.OPCD == 4 || inst.OPCD == 59;
const bool round_input = single && !js.op->fprIsSingle[c];
const bool error_free_transformation = single && m_accurate_fmadds;
const bool packed =
inst.OPCD == 4 ||
(!cpu_info.bAtom && !software_fma && !error_free_transformation && single &&
js.op->fprIsDuplicated[a] && js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
const bool want_rc_rounded =
(error_free_transformation || (software_fma && packed)) && round_input;
const bool error_free_transformation_wants_rc_duplicated =
(error_free_transformation && !want_rc_rounded) && (madds0 || madds1);
const bool accurate_nans_wants_rc_duplicated = m_accurate_nans && (madds0 || madds1);
const bool want_rc_duplicated =
error_free_transformation_wants_rc_duplicated || accurate_nans_wants_rc_duplicated;
const bool preserve_d_due_to_a_or_b =
(m_accurate_nans || error_free_transformation) && (a == d || b == d);
const bool preserve_d_due_to_c =
c == d && ((m_accurate_nans && (!want_rc_duplicated || software_fma)) ||
(error_free_transformation && !want_rc_rounded));
const bool preserve_d = preserve_d_due_to_a_or_b || preserve_d_due_to_c;
X64Reg scratch_xmm = XMM0;
X64Reg result_xmm = XMM1;
X64Reg Rc_duplicated = XMM2;
X64Reg Rc_rounded = XMM3;
BitSet32 scratch_registers{XMM0 + 16, XMM1 + 16};
RCX64Reg xmm2_guard;
RCX64Reg xmm3_guard;
if (error_free_transformation)
{
xmm2_guard = fpr.Scratch(XMM2);
xmm3_guard = fpr.Scratch(XMM3);
RegCache::Realize(xmm2_guard, xmm3_guard);
scratch_registers[XMM2 + 16] = true;
scratch_registers[XMM3 + 16] = true;
}
else if (software_fma)
{
xmm2_guard = fpr.Scratch(XMM2);
RegCache::Realize(xmm2_guard);
scratch_registers[XMM2 + 16] = true;
}
RCOpArg Ra;
RCOpArg Rb;
RCOpArg Rc;
RCX64Reg Rd;
RCX64Reg xmm2_guard;
RCX64Reg result_xmm_guard;
RCX64Reg Rc_duplicated_guard;
if (software_fma)
{
xmm2_guard = fpr.Scratch(XMM2);
Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
Ra = packed || error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
Rb = packed || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = packed || (error_free_transformation && !want_rc_rounded && !want_rc_duplicated) ?
fpr.Bind(c, RCMode::Read) :
fpr.Use(c, RCMode::Read);
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
if (preserve_d && packed)
{
result_xmm_guard = fpr.Scratch();
RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard);
RegCache::Realize(Ra, Rb, Rc, Rd, result_xmm_guard);
result_xmm = Gen::X64Reg(result_xmm_guard);
scratch_registers[result_xmm + 16] = true;
}
else
{
RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard);
RegCache::Realize(Ra, Rb, Rc, Rd);
result_xmm = packed ? Gen::X64Reg(Rd) : XMM0;
}
}
@ -421,48 +462,88 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// For use_fma == true:
// Statistics suggests b is a lot less likely to be unbound in practice, so
// if we have to pick one of a or b to bind, let's make it b.
Ra = fpr.Use(a, RCMode::Read);
Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = fpr.Use(c, RCMode::Read);
Ra = error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
Rb =
use_fma || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = error_free_transformation && !want_rc_rounded && !want_rc_duplicated ?
fpr.Bind(c, RCMode::Read) :
fpr.Use(c, RCMode::Read);
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
RegCache::Realize(Ra, Rb, Rc, Rd);
if (madds_accurate_nans)
{
Rc_duplicated_guard = fpr.Scratch();
RegCache::Realize(Rc_duplicated_guard);
Rc_duplicated = Rc_duplicated_guard;
}
}
if (error_free_transformation_wants_rc_duplicated ||
(accurate_nans_wants_rc_duplicated &&
((!software_fma && !error_free_transformation) || (error_free_transformation && packed))))
{
Rc_duplicated_guard = fpr.Scratch();
RegCache::Realize(Rc_duplicated_guard);
Rc_duplicated = Rc_duplicated_guard;
scratch_registers[Rc_duplicated + 16] = true;
}
const auto registers_to_save = [&](BitSet32 scratch_registers_to_save) {
const BitSet32 scratch_registers_not_to_save = scratch_registers & ~scratch_registers_to_save;
return CallerSavedRegistersInUse(scratch_registers_to_save) & ~scratch_registers_not_to_save;
};
if (software_fma)
{
if (want_rc_rounded)
{
if (error_free_transformation && madds0)
{
MOVDDUP(Rc_rounded, Rc);
Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2);
}
else if (error_free_transformation && madds1)
{
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_rounded, Rc, Rc, 3);
Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2);
}
else
{
Force25BitPrecision(Rc_rounded, Rc, XMM2);
}
}
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
{
if ((i == 0 || madds0) && !madds1)
if (madds0 || (i == 0 && !madds1) || (want_rc_rounded && error_free_transformation && madds1))
{
if (round_input)
if (want_rc_rounded)
MOVAPD(XMM1, R(Rc_rounded));
else if (round_input)
Force25BitPrecision(XMM1, Rc, XMM2);
else if (Rc.IsSimpleReg())
MOVAPD(XMM1, Rc);
else
MOVSD(XMM1, Rc);
}
else
{
MOVHLPS(XMM1, Rc.GetSimpleReg());
if (round_input)
MOVHLPS(XMM1, want_rc_rounded ? Rc_rounded : Rc.GetSimpleReg());
if (round_input && !want_rc_rounded)
Force25BitPrecision(XMM1, R(XMM1), XMM2);
}
// Write the result from the previous loop iteration into result_xmm so we don't lose it.
// It's important that this is done after reading Rc above, in case we have madds1 and
// result_xmm == Rd == Rc.
// !want_rc_rounded and result_xmm == Rd == Rc.
if (packed && i == 0)
MOVLHPS(result_xmm, XMM0);
if (i == 0)
{
MOVSD(XMM0, Ra);
MOVSD(XMM2, Rb);
if (Ra.IsSimpleReg())
MOVAPD(XMM0, Ra);
else
MOVSD(XMM0, Ra);
if (Rb.IsSimpleReg())
MOVAPD(XMM2, Rb);
else
MOVSD(XMM2, Rb);
}
else
{
@ -473,23 +554,36 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
if (subtract)
XORPS(XMM2, MConst(psSignBits));
BitSet32 registers_in_use = CallerSavedRegistersInUse();
BitSet32 scratch_registers_to_save{};
if (packed && i == 0)
scratch_registers_to_save[result_xmm + 16] = true;
if (want_rc_rounded && (error_free_transformation || i == 1))
scratch_registers_to_save[Rc_rounded + 16] = true;
const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save);
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
}
if (packed)
{
// result_xmm's upper lane has the result of the first loop iteration
MOVSD(R(result_xmm), XMM0);
}
else
{
DEBUG_ASSERT(result_xmm == XMM0);
}
if (madds_accurate_nans)
if (want_rc_duplicated)
{
if (madds0)
MOVDDUP(Rc_duplicated, Rc);
else
else if (madds1)
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_duplicated, Rc, Rc, 3);
else
DEBUG_ASSERT(false);
}
}
else
@ -497,7 +591,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
if (madds0)
{
MOVDDUP(result_xmm, Rc);
if (madds_accurate_nans)
if (want_rc_duplicated)
MOVAPD(R(Rc_duplicated), result_xmm);
if (round_input)
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
@ -505,18 +599,21 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
else if (madds1)
{
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
if (madds_accurate_nans)
if (want_rc_duplicated)
MOVAPD(R(Rc_duplicated), result_xmm);
if (round_input)
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
}
else
{
DEBUG_ASSERT(!want_rc_duplicated);
if (round_input)
Force25BitPrecision(result_xmm, Rc, scratch_xmm);
else
MOVAPD(result_xmm, Rc);
}
if (want_rc_rounded)
MOVAPD(R(Rc_rounded), result_xmm);
if (use_fma)
{
@ -556,6 +653,160 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
}
}
if (m_accurate_nans && result_xmm == XMM0)
{
// HandleNaNs needs to clobber XMM0
result_xmm = error_free_transformation ? XMM1 : Rd;
MOVAPD(result_xmm, R(XMM0));
DEBUG_ASSERT(!preserve_d);
}
std::optional<FixupBranch> handled_nans;
if (!packed && m_accurate_nans)
{
// The clobber register is unused when not packed.
handled_nans =
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc);
}
// Read the comment in the interpreter function NI_madd_msub to find out what's going on here.
if (error_free_transformation)
{
if (result_xmm != XMM1)
{
MOVAPD(XMM1, R(result_xmm));
result_xmm = XMM1;
}
X64Reg Rc_rounded_duplicated = Rc.GetSimpleReg();
BitSet32 scratch_registers_to_save = {XMM1 + 16, XMM2 + 16};
if (want_rc_rounded)
{
Rc_rounded_duplicated = Rc_rounded;
scratch_registers_to_save[Rc_rounded] = true;
}
else if (want_rc_duplicated)
{
Rc_rounded_duplicated = Rc_duplicated;
scratch_registers_to_save[want_rc_duplicated] = true;
}
// We've calculated s := a + b, with a = Ra * Rc_rounded_duplicated, b = subtract ? -Rb : Rb
if (packed)
{
// a' := s - b
if (subtract)
avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM0, R(XMM1), Rb);
else
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM1), Rb);
// b' := s - a'
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM1), R(XMM0));
// da := a - a'
if (software_fma)
{
scratch_registers_to_save[XMM0 + 16] = true;
const BitSet32 registers_in_use_1 = registers_to_save(scratch_registers_to_save);
ABI_PushRegistersAndAdjustStack(registers_in_use_1, 0);
avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits));
MOVAPD(XMM0, R(Rc_rounded_duplicated));
MOVAPD(XMM1, Ra);
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
// We will read from the upper lane of Rc_rounded_duplicated later,
// so we need to make sure that that lane isn't overwritten.
if (Rc_rounded_duplicated == XMM3)
MOVSD(XMM3, R(XMM0));
else
MOVAPD(XMM3, R(XMM0));
ABI_PopRegistersAndAdjustStack(registers_in_use_1, 0);
scratch_registers_to_save[XMM0 + 16] = false;
scratch_registers_to_save[XMM3 + 16] = true;
const BitSet32 registers_in_use_2 = registers_to_save(scratch_registers_to_save);
ABI_PushRegistersAndAdjustStack(registers_in_use_2, 0);
MOVHLPS(XMM2, XMM0);
XORPS(XMM2, MConst(psSignBits));
MOVHLPS(XMM0, Rc_rounded_duplicated);
MOVHLPS(XMM1, Ra.GetSimpleReg());
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
ABI_PopRegistersAndAdjustStack(registers_in_use_2, 0);
UNPCKLPD(XMM0, R(XMM3));
}
else if (use_fma)
{
VFMSUB231PD(XMM0, Rc_rounded_duplicated, Ra);
}
else
{
avx_op(&XEmitter::VMULPD, &XEmitter::MULPD, XMM3, R(Rc_rounded_duplicated), Ra);
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM3), R(XMM0), true, false, XMM3);
}
// db := b - b'
// (Transformed into -db := b' - b)
if (subtract)
avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM2, R(XMM2), Rb);
else
avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM2), Rb);
CALL(GetAsmRoutines()->ps_madd_eft);
}
else
{
// a' := s - b
if (subtract)
avx_op(&XEmitter::VADDSD, &XEmitter::ADDSD, XMM0, R(XMM1), Rb, false);
else
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM1), Rb, false);
// b' := s - a'
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM2, R(XMM1), R(XMM0), false);
// da := a - a'
if (software_fma)
{
const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save);
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits));
MOVAPD(XMM0, R(Rc_rounded_duplicated));
MOVAPD(XMM1, Ra);
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
}
else if (use_fma)
{
VFMSUB231SD(XMM0, Rc_rounded_duplicated, Ra);
}
else
{
avx_op(&XEmitter::VMULSD, &XEmitter::MULSD, XMM3, R(Rc_rounded_duplicated), Ra, false);
avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM3), R(XMM0), false, false, XMM3);
}
// db := b - b'
// (Transformed into -db := b' - b)
if (subtract)
ADDSD(XMM2, Rb);
else
SUBSD(XMM2, Rb);
CALL(GetAsmRoutines()->fmadds_eft);
}
}
// Using x64's nmadd/nmsub would require us to swap the sign of the addend
// (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes.
// Also, PowerPC's nmadd/nmsub round before the final negation unlike x64's nmadd/nmsub.
@ -563,16 +814,19 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
if (negate)
XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
if (m_accurate_nans && result_xmm == XMM0)
if (packed && m_accurate_nans)
{
// HandleNaNs needs to clobber XMM0
MOVAPD(Rd, R(result_xmm));
result_xmm = Rd;
DEBUG_ASSERT(!preserve_d);
// If packed, the clobber register must be XMM0.
handled_nans =
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc);
}
// If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, madds_accurate_nans ? R(Rc_duplicated) : Rc);
// If the handled_nans branch was taken in the non-packed case, that means the result is NaN,
// so we can skip the XORPD and the error-free transformation. If the handled_nans branch was
// taken in the packed case, we don't know if both of the results were NaN or only one, so we
// can't skip anything.
if (handled_nans)
SetJumpTarget(*handled_nans);
if (single)
FinalizeSingleResult(Rd, R(result_xmm), packed, true);
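In scalar form, the values this block hands to fmadds_eft (added to the asm routines further down) are just the classic FMA error decomposition; a sketch with illustrative names, where b_signed already carries the msub negation and c_rounded is frC after Force25BitPrecision:

#include <cmath>

struct FmaddsEftInputs
{
  double s;    // FMA result, possibly double-rounded  (XMM1)
  double da;   // first error term                     (XMM0)
  double ndb;  // second error term, negated           (XMM2)
};

inline FmaddsEftInputs ComputeEftTerms(double a, double c_rounded, double b_signed)
{
  const double s = std::fma(a, c_rounded, b_signed);
  const double a_prime = s - b_signed;                 // a' := s - b
  const double b_prime = s - a_prime;                  // b' := s - a'
  const double da = std::fma(a, c_rounded, -a_prime);  // da := a*c - a'
  const double ndb = b_prime - b_signed;               // -db := b' - b
  return {s, da, ndb};
}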

View File

@ -100,12 +100,19 @@ void Jit64::ps_muls(UGeckoInstruction inst)
default:
PanicAlertFmt("ps_muls WTF!!!");
}
if (round_input)
Force25BitPrecision(XMM1, R(Rc_duplicated), XMM0);
else if (XMM1 != Rc_duplicated)
MOVAPD(XMM1, Rc_duplicated);
MULPD(XMM1, Ra);
HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated);
if (m_accurate_nans)
{
const FixupBranch handled_nans = HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated);
SetJumpTarget(handled_nans);
}
FinalizeSingleResult(Rd, R(XMM1));
}

View File

@ -741,7 +741,8 @@ void EmuCodeBlock::JitClearCA()
// Abstract between AVX and SSE: automatically handle 3-operand instructions
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp,
const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible)
const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible,
X64Reg scratch)
{
if (arg1.IsSimpleReg(regOp))
{
@ -778,19 +779,19 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
else
{
// The ugly case: Not reversible, and we have regOp == arg2 without AVX or with arg1 == memory
if (!arg1.IsSimpleReg(XMM0))
MOVAPD(XMM0, arg1);
if (!arg1.IsSimpleReg(scratch))
MOVAPD(scratch, arg1);
if (cpu_info.bAVX)
{
(this->*avxOp)(regOp, XMM0, arg2);
(this->*avxOp)(regOp, scratch, arg2);
}
else
{
(this->*sseOp)(XMM0, arg2);
(this->*sseOp)(scratch, arg2);
if (packed)
MOVAPD(regOp, R(XMM0));
MOVAPD(regOp, R(scratch));
else
MOVSD(regOp, R(XMM0));
MOVSD(regOp, R(scratch));
}
}
}
@ -798,7 +799,7 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
// Abstract between AVX and SSE: automatically handle 3-operand instructions
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&, u8),
void (XEmitter::*sseOp)(X64Reg, const OpArg&, u8), X64Reg regOp,
const OpArg& arg1, const OpArg& arg2, u8 imm)
const OpArg& arg1, const OpArg& arg2, u8 imm, X64Reg scratch)
{
if (arg1.IsSimpleReg(regOp))
{
@ -816,21 +817,40 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
else
{
// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
if (!arg1.IsSimpleReg(XMM0))
MOVAPD(XMM0, arg1);
if (!arg1.IsSimpleReg(scratch))
MOVAPD(scratch, arg1);
if (cpu_info.bAVX)
{
(this->*avxOp)(regOp, XMM0, arg2, imm);
(this->*avxOp)(regOp, scratch, arg2, imm);
}
else
{
(this->*sseOp)(XMM0, arg2, imm);
if (regOp != XMM0)
MOVAPD(regOp, R(XMM0));
(this->*sseOp)(scratch, arg2, imm);
if (regOp != scratch)
MOVAPD(regOp, R(scratch));
}
}
}
// Abstract between AVX and SSE: automatically handle 3-operand instructions
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, u8),
void (XEmitter::*sseOp)(X64Reg, u8), X64Reg regOp1, X64Reg regOp2, u8 imm)
{
if (regOp1 == regOp2)
{
(this->*sseOp)(regOp1, imm);
}
else if (cpu_info.bAVX)
{
(this->*avxOp)(regOp1, regOp2, imm);
}
else
{
MOVAPD(regOp1, R(regOp2));
(this->*sseOp)(regOp1, imm);
}
}
alignas(16) static const u64 psMantissaTruncate[2] = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL};
alignas(16) static const u64 psRoundBit[2] = {0x8000000, 0x8000000};
@ -842,8 +862,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
{
if (m_jit.jo.accurateSinglePrecision)
{
DEBUG_ASSERT(output != tmp);
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
if (input.IsSimpleReg() && cpu_info.bAVX)
if (input.IsSimpleReg() && !input.IsSimpleReg(tmp) && cpu_info.bAVX)
{
VPAND(tmp, input.GetSimpleReg(), MConst(psRoundBit));
VPAND(output, input.GetSimpleReg(), MConst(psMantissaTruncate));

View File

@ -113,10 +113,14 @@ public:
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), Gen::X64Reg regOp,
const Gen::OpArg& arg1, const Gen::OpArg& arg2, bool packed = true,
bool reversible = false);
bool reversible = false, Gen::X64Reg scratch = Gen::XMM0);
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&, u8),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp,
const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm);
const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm,
Gen::X64Reg scratch = Gen::XMM0);
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, u8),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, u8), Gen::X64Reg regOp1, Gen::X64Reg regOp2,
u8 imm);
void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp);

View File

@ -326,6 +326,98 @@ void CommonAsmRoutines::GenMfcr()
Common::JitRegister::Register(start, GetCodePtr(), "JIT_Mfcr");
}
// Inputs:
// XMM0: First error term
// XMM1: Result with potentially incorrect rounding
// XMM2: Second error term, negated
//
// Outputs result with corrected rounding in XMM1.
// Clobbers RSCRATCH, RSCRATCH2, XMM0, XMM2, and flags.
void CommonAsmRoutines::GenerateFmaddsEft()
{
// Check if XMM1 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
MOVQ_xmm(R(RSCRATCH), XMM1);
MOV(32, R(RSCRATCH2), Imm32(0x80000000));
LEA(32, RSCRATCH2, MComplex(RSCRATCH2, RSCRATCH, SCALE_8, 0));
TEST(32, R(RSCRATCH2), R(RSCRATCH2));
FixupBranch even_tie = J_CC(CCFlags::CC_Z);
const u8* ret = GetCodePtr();
RET();
// Check if the error is 0
SetJumpTarget(even_tie);
SUBSD(XMM0, R(XMM2));
XORPD(XMM2, R(XMM2));
UCOMISD(XMM0, R(XMM2));
J_CC(CCFlags::CC_E, ret);
// Round XMM1 up or down
MOVQ_xmm(R(RSCRATCH2), XMM0);
XOR(64, R(RSCRATCH2), R(RSCRATCH));
SAR(64, R(RSCRATCH2), Imm8(63));
OR(64, R(RSCRATCH2), Imm8(1));
ADD(64, R(RSCRATCH), R(RSCRATCH2));
MOVQ_xmm(XMM1, R(RSCRATCH));
RET();
}
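The same routine expressed as plain C++, to make the bit tricks readable (a sketch of the intent, not of the emitted instructions; the LEA/TEST pair is just a way of testing (bits & 0x1fffffff) == 0x10000000 without a 64-bit immediate):

#include <bit>
#include <cmath>
#include <cstdint>

double FmaddsEft(double s /*XMM1*/, double da /*XMM0*/, double ndb /*XMM2*/)
{
  const std::uint64_t bits = std::bit_cast<std::uint64_t>(s);
  if ((bits & 0x1fffffff) != 0x10000000)
    return s;  // not an even tie

  const double error = da - ndb;  // SUBSD: da + db
  if (error == 0.0 || std::isnan(error))
    return s;  // UCOMISD sets ZF for both "equal" and "unordered"

  // XOR/SAR/OR/ADD: step the mantissa by one ulp, away from zero when the
  // error has the same sign as the result, towards zero otherwise.
  const bool same_sign = std::signbit(error) == std::signbit(s);
  return std::bit_cast<double>(same_sign ? bits + 1 : bits - 1);
}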
alignas(16) static const __m128i double_msb = _mm_set_epi64x(0x8000000000000000,
0x8000000000000000);
alignas(16) static const __m128i double_lsb = _mm_set_epi64x(1, 1);
// Inputs:
// XMM0: First error terms
// XMM1: Results with potentially incorrect rounding
// XMM2: Second error terms, negated
//
// Outputs results with corrected rounding in XMM1. Clobbers RSCRATCH, XMM0-XMM3, and flags.
void CommonAsmRoutines::GeneratePsMaddEft()
{
// Check if XMM1 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
avx_op(&XEmitter::VPSLLQ, &XEmitter::PSLLQ, XMM3, XMM1, 35);
if (cpu_info.bSSE4_1)
{
PCMPEQQ(XMM3, MConst(double_msb));
}
else
{
PCMPEQW(XMM3, MConst(double_msb));
PSHUFD(XMM3, R(XMM3), 0xF5);
}
// Just for performance, exit early if there is no even tie
if (cpu_info.bSSE4_1)
{
PTEST(XMM3, R(XMM3));
}
else
{
PMOVMSKB(RSCRATCH, R(XMM3));
TEST(32, R(RSCRATCH), R(RSCRATCH));
}
FixupBranch even_tie = J_CC(CCFlags::CC_NZ);
RET();
SetJumpTarget(even_tie);
// Check if the error is zero
SUBPD(XMM0, R(XMM2));
XORPD(XMM2, R(XMM2));
CMPPD(XMM2, R(XMM0), CMP_EQ);
// Store -1 or 1 in XMM0 depending on whether we're rounding down or up
PXOR(XMM0, R(XMM1));
PSRAD(XMM0, 31);
PSHUFD(XMM0, R(XMM0), 0xF5);
POR(XMM0, MConst(double_lsb));
// Round the elements that have both a non-zero error and an even tie
PANDN(XMM2, R(XMM3));
PAND(XMM0, R(XMM2));
PADDQ(XMM1, R(XMM0));
RET();
}
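Per lane, the branchless paired version computes the following (a C++ sketch of the intent; the shift-by-35 compare against the sign bit is the packed equivalent of the (bits & 0x1fffffff) == 0x10000000 test, and masks replace the scalar routine's early return):

#include <array>
#include <bit>
#include <cstddef>
#include <cstdint>

std::array<double, 2> PsMaddEft(std::array<double, 2> s,    // XMM1
                                std::array<double, 2> da,   // XMM0
                                std::array<double, 2> ndb)  // XMM2
{
  for (std::size_t i = 0; i < 2; ++i)
  {
    const std::uint64_t bits = std::bit_cast<std::uint64_t>(s[i]);

    // All-ones if this lane is an even tie (PSLLQ by 35 + PCMPEQQ against the MSB).
    const std::uint64_t tie = (bits << 35) == 0x8000000000000000ULL ? ~0ULL : 0ULL;

    // All-ones if the lane's error is non-zero (CMPPD equal-to-zero, inverted by PANDN).
    const double error = da[i] - ndb[i];
    const std::uint64_t nonzero = error != 0.0 ? ~0ULL : 0ULL;

    // +1 or -1 depending on whether the error and the result have the same sign
    // (PXOR of the sign bits, PSRAD/PSHUFD broadcast, POR with 1).
    const std::uint64_t step =
        ((std::bit_cast<std::uint64_t>(error) ^ bits) >> 63) != 0 ? ~0ULL : 1ULL;

    s[i] = std::bit_cast<double>(bits + (step & tie & nonzero));
  }
  return s;
}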
// Safe + Fast Quantizers, originally from JITIL by magumagu
alignas(16) static const float m_65535[4] = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
alignas(16) static const float m_32767 = 32767.0f;

View File

@ -33,6 +33,8 @@ public:
protected:
void GenConvertDoubleToSingle();
void GenerateFmaddsEft();
void GeneratePsMaddEft();
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
void GenQuantizedLoads();

View File

@ -324,6 +324,8 @@ protected:
void GenerateConvertDoubleToSingle();
void GenerateConvertSingleToDouble();
void GenerateFPRF(bool single);
void GenerateFmaddsEft();
void GeneratePsMaddEft();
void GenerateQuantizedLoads();
void GenerateQuantizedStores();

View File

@ -79,9 +79,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
const bool use_b = op5 != 25; // fmul uses no B
const bool fma = use_b && use_c;
const bool negate_result = (op5 & ~0x1) == 30;
const bool negate_b = op5 == 28 || op5 == 30;
const bool output_is_single = inst.OPCD == 59;
const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool error_free_transformation_requested = fma && m_accurate_fmadds;
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[c];
const auto inputs_are_singles_func = [&] {
@ -89,13 +91,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
(!use_c || fpr.IsSingle(c, true));
};
const bool single = inputs_are_singles_func() && output_is_single && !inaccurate_fma;
const bool single = inputs_are_singles_func() && output_is_single &&
(error_free_transformation_requested || !nonfused_requested);
const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
const RegType type_out = output_is_single ?
(single ? RegType::DuplicatedSingle : RegType::Duplicated) :
RegType::LowerPair;
const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
const bool nonfused = nonfused_requested && !single;
const bool error_free_transformation =
error_free_transformation_requested && !single && output_is_single;
if (error_free_transformation)
{
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2);
}
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
@ -103,33 +116,47 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
{
Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG;
Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG;
ARM64Reg rounded_c_reg = VC;
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V0Q);
Force25BitPrecision(rounded_c_reg, VC);
}
ARM64Reg inaccurate_fma_reg = VD;
if (fma && inaccurate_fma && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
inaccurate_fma_reg = reg_encoder(V0Q);
}
ARM64Reg result_reg = VD;
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (preserve_d)
ARM64Reg nonfused_reg = VD;
if (error_free_transformation)
{
V1Q = fpr.GetScopedReg();
result_reg = reg_encoder(V1Q);
result_reg = reg_encoder(ARM64Reg::Q0);
nonfused_reg = reg_encoder(ARM64Reg::Q0);
if (nonfused && V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
}
else
{
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (preserve_d)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
result_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);
}
else if (fma && nonfused && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
nonfused_reg = reg_encoder(V0Q);
}
}
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
Force25BitPrecision(rounded_c_reg, VC);
}
switch (op5)
@ -152,10 +179,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
// So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
m_float_emit.FSUB(result_reg, nonfused_reg, VB);
}
else
{
@ -164,10 +191,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
break;
case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg);
m_float_emit.FADD(result_reg, nonfused_reg, VB);
}
else
{
@ -180,6 +207,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}
Common::SmallVector<FixupBranch, 4> nan_fixups;
std::optional<FixupBranch> nan_early_fixup;
if (m_accurate_nans)
{
// Check if we need to handle NaNs
@ -216,7 +244,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
SetJumpTarget(skip);
}
std::optional<FixupBranch> nan_early_fixup;
if (negate_result)
{
// If we have a NaN, we must not execute FNEG.
@ -230,11 +257,46 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}
SwitchToNearCode();
if (nan_early_fixup)
SetJumpTarget(*nan_early_fixup);
}
// Read the comment in the interpreter function NI_madd_msub to find out what's going on here
if (error_free_transformation)
{
// We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? -VB : VB)
// a' := s - b
if (negate_b)
m_float_emit.FADD(ARM64Reg::D1, result_reg, VB);
else
m_float_emit.FSUB(ARM64Reg::D1, result_reg, VB);
// b' := s - a'
m_float_emit.FSUB(ARM64Reg::D2, result_reg, ARM64Reg::D1);
// da := a - a'
if (nonfused)
{
m_float_emit.FMUL(EncodeRegToDouble(V0Q), VA, rounded_c_reg);
m_float_emit.FSUB(ARM64Reg::D1, EncodeRegToDouble(V0Q), ARM64Reg::D1);
}
else
{
m_float_emit.FNMSUB(ARM64Reg::D1, VA, rounded_c_reg, ARM64Reg::D1);
}
// db := b - b'
// (Transformed into -db := b' - b)
if (negate_b)
m_float_emit.FADD(ARM64Reg::D2, ARM64Reg::D2, VB);
else
m_float_emit.FSUB(ARM64Reg::D2, ARM64Reg::D2, VB);
BL(GetAsmRoutines()->fmadds_eft);
}
if (nan_early_fixup)
SetJumpTarget(*nan_early_fixup);
// PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
// for any of AArch64's FMA instructions, so we negate using a separate instruction.
if (negate_result)
@ -254,7 +316,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
fpr.FixSinglePrecision(d);
}
if (error_free_transformation)
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
SetFPRFIfNeeded(output_is_single, VD);
if (error_free_transformation)
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2);
}
void JitArm64::fp_logic(UGeckoInstruction inst)

View File

@ -92,20 +92,31 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
const bool duplicated_c = muls || madds;
const bool fma = use_b && use_c;
const bool negate_result = (op5 & ~0x1) == 30;
const bool msub = op5 == 28 || op5 == 30;
const bool negate_b = op5 == 28 || op5 == 30;
const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
const bool error_free_transformation_requested = fma && m_accurate_fmadds;
const bool round_c = use_c && !js.op->fprIsSingle[c];
const auto inputs_are_singles_func = [&] {
return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
};
const bool single = inputs_are_singles_func() && !inaccurate_fma;
const bool single =
inputs_are_singles_func() && (error_free_transformation_requested || !nonfused_requested);
const RegType type = single ? RegType::Single : RegType::Register;
const u8 size = single ? 32 : 64;
const auto reg_encoder = single ? EncodeRegToDouble : EncodeRegToQuad;
const bool nonfused = nonfused_requested && !single;
const bool error_free_transformation = error_free_transformation_requested && !single;
if (error_free_transformation)
{
gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
}
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
@ -119,41 +130,77 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
ARM64Reg rounded_c_reg = VC;
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V0Q);
Force25BitPrecision(rounded_c_reg, VC);
}
ARM64Reg inaccurate_fma_reg = VD;
if (fma && inaccurate_fma && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
if (error_free_transformation)
{
// This register happens to be free, so we can skip allocating one
rounded_c_reg = ARM64Reg::Q3;
}
else
{
V0Q = fpr.GetScopedReg();
inaccurate_fma_reg = reg_encoder(V0Q);
rounded_c_reg = reg_encoder(V0Q);
}
}
ARM64Reg result_reg = VD;
const bool need_accurate_fma_reg =
fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (need_accurate_fma_reg || preserve_d)
ARM64Reg nonfused_reg = VD;
if (error_free_transformation)
{
V1Q = fpr.GetScopedReg();
result_reg = reg_encoder(V1Q);
result_reg = reg_encoder(ARM64Reg::Q0);
nonfused_reg = reg_encoder(ARM64Reg::Q0);
}
else
{
const bool need_fused_fma_reg =
fma && !nonfused && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
const bool preserve_d =
m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
if (need_fused_fma_reg || preserve_d)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
result_reg = reg_encoder(V0Q);
nonfused_reg = reg_encoder(V0Q);
if (need_fused_fma_reg && round_c)
{
V1Q = fpr.GetScopedReg();
rounded_c_reg = reg_encoder(V1Q);
}
}
else if (fma && nonfused && VD == VB)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
nonfused_reg = reg_encoder(V0Q);
}
}
if (m_accurate_nans)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetScopedReg();
if (error_free_transformation)
{
// These registers happen to be free, so we can skip allocating new ones
V1Q = ARM64Reg::Q1;
V2Q = ARM64Reg::Q2;
}
else
{
if (V1Q == ARM64Reg::INVALID_REG)
V1Q = fpr.GetScopedReg();
if (duplicated_c || VD == result_reg)
V2Q = fpr.GetScopedReg();
if (duplicated_c || VD == result_reg)
V2Q = fpr.GetScopedReg();
}
}
if (round_c)
{
ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
Force25BitPrecision(rounded_c_reg, VC);
}
std::optional<ARM64Reg> negated_b_reg;
switch (op5)
{
case 12: // ps_muls0: d = a * c.ps0
@ -163,10 +210,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
break;
case 14: // ps_madds0: d = a * c.ps0 + b
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 0);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -176,10 +223,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
}
break;
case 15: // ps_madds1: d = a * c.ps1 + b
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 1);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -202,23 +249,28 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
case 28: // ps_msub: d = a * c - b
case 30: // ps_nmsub: d = -(a * c - b)
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
m_float_emit.FSUB(size, result_reg, nonfused_reg, VB);
}
else
{
m_float_emit.FNEG(size, result_reg, VB);
if (error_free_transformation)
{
m_float_emit.MOV(ARM64Reg::Q4, result_reg);
negated_b_reg = ARM64Reg::Q4;
}
m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
}
break;
case 29: // ps_madd: d = a * c + b
case 31: // ps_nmadd: d = -(a * c + b)
if (inaccurate_fma)
if (nonfused)
{
m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
}
else
{
@ -232,11 +284,80 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
break;
}
// Read the comment in the interpreter function NI_madd_msub to find out what's going on here
if (error_free_transformation)
{
// We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? -VB : VB)
// a' := s - b
// (Transformed into -a' := b - s)
if (negate_b)
{
if (!negated_b_reg)
{
m_float_emit.FNEG(size, ARM64Reg::Q4, VB);
negated_b_reg = ARM64Reg::Q4;
}
m_float_emit.FSUB(size, ARM64Reg::Q1, *negated_b_reg, result_reg);
}
else
{
m_float_emit.FSUB(size, ARM64Reg::Q1, VB, result_reg);
}
// b' := s - a'
// (Transformed into b' := s + -a')
m_float_emit.FADD(size, ARM64Reg::Q2, result_reg, ARM64Reg::Q1);
// da := a - a'
// (Transformed into da := a + -a')
if (nonfused)
{
switch (op5)
{
case 14: // ps_madds0: d = a * c.ps0 + b
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 0);
break;
case 15: // ps_madds1: d = a * c.ps1 + b
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 1);
break;
default:
m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg);
break;
}
m_float_emit.FADD(size, ARM64Reg::Q1, ARM64Reg::Q3, ARM64Reg::Q1);
}
else
{
switch (op5)
{
case 14: // ps_madds0: d = a * c.ps0 + b
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 0);
break;
case 15: // ps_madds1: d = a * c.ps1 + b
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 1);
break;
default:
m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg);
break;
}
}
// db := b - b'
// (Transformed into -db := b' - b)
if (negate_b)
m_float_emit.FADD(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
else
m_float_emit.FSUB(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
BL(GetAsmRoutines()->ps_madd_eft);
}
FixupBranch nan_fixup;
if (m_accurate_nans)
{
const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V1Q) : EncodeRegToDouble(V1Q);
const ARM64Reg nan_temp_reg_paired = reg_encoder(V1Q);
// Check if we need to handle NaNs
@ -306,7 +427,13 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
fpr.FixSinglePrecision(d);
if (error_free_transformation)
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
SetFPRFIfNeeded(true, VD);
if (error_free_transformation)
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
}
void JitArm64::ps_sel(UGeckoInstruction inst)

View File

@ -8,6 +8,7 @@
#include <utility>
#include "Common/Arm64Emitter.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/Config/Config.h"
#include "Common/FloatUtils.h"
@ -265,6 +266,14 @@ void JitArm64::GenerateCommonAsm()
GenerateFPRF(false);
Common::JitRegister::Register(GetAsmRoutines()->fprf_single, GetCodePtr(), "JIT_FPRF");
GetAsmRoutines()->fmadds_eft = GetCodePtr();
GenerateFmaddsEft();
Common::JitRegister::Register(GetAsmRoutines()->fmadds_eft, GetCodePtr(), "JIT_fmadds_eft");
GetAsmRoutines()->ps_madd_eft = GetCodePtr();
GeneratePsMaddEft();
Common::JitRegister::Register(GetAsmRoutines()->ps_madd_eft, GetCodePtr(), "JIT_ps_madd_eft");
GenerateQuantizedLoads();
GenerateQuantizedStores();
}
@ -514,6 +523,90 @@ void JitArm64::GenerateFPRF(bool single)
B(write_fprf_and_ret);
}
// Inputs:
// D0: Result with potentially incorrect rounding
// D1: First error term
// D2: Second error term, negated
//
// Outputs result with corrected rounding in D0. Clobbers X0-X1, D1, and flags.
void JitArm64::GenerateFmaddsEft()
{
// Check if D0 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
MOVI2R(ARM64Reg::W1, 0x80000000);
CMP(ARM64Reg::W1, ARM64Reg::W0, ArithOption(ARM64Reg::W0, ShiftType::LSL, 3));
FixupBranch even_tie = B(CCFlags::CC_EQ);
const u8* ret = GetCodePtr();
RET();
// Check if the error is 0
SetJumpTarget(even_tie);
m_float_emit.FSUB(ARM64Reg::D1, ARM64Reg::D1, ARM64Reg::D2);
m_float_emit.FCMP(ARM64Reg::D1);
B(CCFlags::CC_EQ, ret);
// Round D0 up or down
MOVZ(ARM64Reg::X1, 1);
CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
CMP(ARM64Reg::X0, 0);
CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
ADD(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1);
m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0);
RET();
}
// Inputs:
// Q0: Results with potentially incorrect rounding
// Q1: First error terms
// Q2: Second error terms, negated
//
// Outputs results with corrected rounding in Q0. Clobbers X0, Q1-Q4, and flags.
void JitArm64::GeneratePsMaddEft()
{
// Check if Q0 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
MOVI2R(ARM64Reg::X0, 0x8000'0000'0000'0000);
m_float_emit.SHL(64, ARM64Reg::Q3, ARM64Reg::Q0, 35);
m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
m_float_emit.CMEQ(64, ARM64Reg::Q3, ARM64Reg::Q3, ARM64Reg::Q4);
// Just for performance, exit early if there is no even tie
m_float_emit.XTN(32, ARM64Reg::D4, ARM64Reg::Q3);
FixupBranch even_tie;
if (cpu_info.bAFP)
{
m_float_emit.FCMP(ARM64Reg::D4);
even_tie = B(CCFlags::CC_NEQ);
}
else
{
// If we don't have AFP and the emulated software has NI set, subnormals will compare equal to
// zero, so we can't use FCMP unless we were to put some shuffle instruction before it.
// FMOV is a little slower than FCMP, but it's faster than adding an extra instruction.
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D4);
even_tie = CBNZ(ARM64Reg::X0);
}
RET();
SetJumpTarget(even_tie);
// Check if the error is zero
m_float_emit.FSUB(64, ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
MOVZ(ARM64Reg::X0, 1);
m_float_emit.FCMEQ(64, ARM64Reg::Q2, ARM64Reg::Q1);
// Store -1 or 1 in Q1 depending on whether we're rounding down or up
m_float_emit.EOR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q0);
m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
m_float_emit.SSHR(64, ARM64Reg::Q1, ARM64Reg::Q1, 63);
m_float_emit.ORR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q4);
// Round the elements that have both a non-zero error and an even tie
m_float_emit.BIC(ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q2);
m_float_emit.AND(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
m_float_emit.ADD(64, ARM64Reg::Q0, ARM64Reg::Q0, ARM64Reg::Q1);
RET();
}
void JitArm64::GenerateQuantizedLoads()
{
// X0 is a temporary

View File

@ -30,6 +30,8 @@ struct CommonAsmRoutinesBase
const u8* cstd;
const u8* fprf_single;
const u8* fprf_double;
const u8* fmadds_eft;
const u8* ps_madd_eft;
// In: array index: GQR to use.
// In: ECX: Address to read from.

View File

@ -57,7 +57,7 @@
// After resetting the stack to the top, we call _resetstkoflw() to restore
// the guard page at the 256kb mark.
const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitBase::JIT_SETTINGS{{
const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JitBase::JIT_SETTINGS{{
{&JitBase::bJITOff, &Config::MAIN_DEBUG_JIT_OFF},
{&JitBase::bJITLoadStoreOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_OFF},
{&JitBase::bJITLoadStorelXzOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_LXZ_OFF},
@ -79,6 +79,7 @@ const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitB
{&JitBase::m_low_dcbz_hack, &Config::MAIN_LOW_DCBZ_HACK},
{&JitBase::m_fprf, &Config::MAIN_FPRF},
{&JitBase::m_accurate_nans, &Config::MAIN_ACCURATE_NANS},
{&JitBase::m_accurate_fmadds, &Config::MAIN_ACCURATE_FMADDS},
{&JitBase::m_fastmem_enabled, &Config::MAIN_FASTMEM},
{&JitBase::m_accurate_cpu_cache_enabled, &Config::MAIN_ACCURATE_CPU_CACHE},
}};

View File

@ -158,6 +158,7 @@ protected:
bool m_low_dcbz_hack = false;
bool m_fprf = false;
bool m_accurate_nans = false;
bool m_accurate_fmadds = false;
bool m_fastmem_enabled = false;
bool m_accurate_cpu_cache_enabled = false;
@ -165,7 +166,7 @@ protected:
bool m_cleanup_after_stackfault = false;
u8* m_stack_guard = nullptr;
static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JIT_SETTINGS;
static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JIT_SETTINGS;
bool DoesConfigNeedRefresh() const;
void RefreshConfig();