diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index d78257cf4fc..cdb0b7e625d 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -3156,6 +3156,10 @@ void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); } +void ARM64FloatEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 0, 3, Rd, Rn, Rm); +} void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) { Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); @@ -3505,6 +3509,53 @@ void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); } +// Comparison +void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn); +} +void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn); +} +void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn); +} +void ARM64FloatEmitter::CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn); +} +void ARM64FloatEmitter::CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0xA, Rd, Rn); +} +void ARM64FloatEmitter::CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm); +} + +// Float comparison void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) { EmitCompare(0, 0, 0, 0, Rn, Rm); @@ -3664,7 +3715,7 @@ void ARM64FloatEmitter::SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", shift, src_size); - EmitShiftImm(1, 0, src_size | shift, 0b01010, Rd, Rn); + EmitShiftImm(IsQuad(Rd), 0, src_size | shift, 0b01010, Rd, Rn); } void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) @@ -3674,11 +3725,18 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn); } +void ARM64FloatEmitter::SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", + shift, src_size); + EmitShiftImm(IsQuad(Rd), 0, src_size * 2 - shift, 0b00000, Rd, Rn); +} + void ARM64FloatEmitter::URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! 
{} {}", shift, src_size); - EmitShiftImm(1, 1, src_size * 2 - shift, 0b00100, Rd, Rn); + EmitShiftImm(IsQuad(Rd), 1, src_size * 2 - shift, 0b00100, Rd, Rn); } void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 134f9b64cce..d0c91abd619 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -800,6 +800,7 @@ public: ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR; CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); } + void CNEG(ARM64Reg Rd, ARM64Reg Rn, CCFlags cond) { CSNEG(Rd, Rn, Rn, (CCFlags)((u32)cond ^ 1)); } void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); } void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option) { @@ -1281,6 +1282,7 @@ public: void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); @@ -1342,6 +1344,19 @@ public: void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + // Comparison + void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + // Float comparison void FCMP(ARM64Reg Rn, ARM64Reg Rm); void FCMP(ARM64Reg Rn); @@ -1380,6 +1395,7 @@ public: void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index 0a122e11218..f9a8a015f92 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -2519,19 +2519,19 @@ void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg) WriteSSEOp(0x66, 0x6C, dest, arg); } -void XEmitter::PSRLW(X64Reg reg, int shift) +void XEmitter::PSRLW(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); Write8(shift); } -void XEmitter::PSRLD(X64Reg reg, int shift) +void XEmitter::PSRLD(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); Write8(shift); } -void XEmitter::PSRLQ(X64Reg reg, int shift) +void XEmitter::PSRLQ(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); Write8(shift); @@ -2542,38 +2542,38 @@ void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg) WriteSSEOp(0x66, 0xd3, reg, arg); } -void XEmitter::PSRLDQ(X64Reg reg, int shift) +void XEmitter::PSRLDQ(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); 
Write8(shift); } -void XEmitter::PSLLW(X64Reg reg, int shift) +void XEmitter::PSLLW(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); Write8(shift); } -void XEmitter::PSLLD(X64Reg reg, int shift) +void XEmitter::PSLLD(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); Write8(shift); } -void XEmitter::PSLLQ(X64Reg reg, int shift) +void XEmitter::PSLLQ(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); Write8(shift); } -void XEmitter::PSLLDQ(X64Reg reg, int shift) +void XEmitter::PSLLDQ(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); Write8(shift); } // WARNING not REX compatible -void XEmitter::PSRAW(X64Reg reg, int shift) +void XEmitter::PSRAW(X64Reg reg, u8 shift) { if (reg > 7) PanicAlertFmt("The PSRAW-emitter does not support regs above 7"); @@ -2585,7 +2585,7 @@ void XEmitter::PSRAW(X64Reg reg, int shift) } // WARNING not REX compatible -void XEmitter::PSRAD(X64Reg reg, int shift) +void XEmitter::PSRAD(X64Reg reg, u8 shift) { if (reg > 7) PanicAlertFmt("The PSRAD-emitter does not support regs above 7"); @@ -2695,6 +2695,11 @@ void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) Write8(blend); } +void XEmitter::PCMPEQQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3829, dest, arg); +} + void XEmitter::PAND(X64Reg dest, const OpArg& arg) { WriteSSEOp(0x66, 0xDB, dest, arg); @@ -3038,6 +3043,12 @@ void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); } +void XEmitter::VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift) +{ + WriteAVXOp(0x66, 0x73, (X64Reg)6, regOp1, R(regOp2)); + Write8(shift); +} + void XEmitter::VMOVAPS(const OpArg& arg, X64Reg regOp) { WriteAVXOp(0x00, 0x29, regOp, X64Reg::INVALID_REG, arg); diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index 69537709869..35d88a46bce 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -801,19 +801,19 @@ public: void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle); void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle); - void PSRLW(X64Reg reg, int shift); - void PSRLD(X64Reg reg, int shift); - void PSRLQ(X64Reg reg, int shift); + void PSRLW(X64Reg reg, u8 shift); + void PSRLD(X64Reg reg, u8 shift); + void PSRLQ(X64Reg reg, u8 shift); void PSRLQ(X64Reg reg, const OpArg& arg); - void PSRLDQ(X64Reg reg, int shift); + void PSRLDQ(X64Reg reg, u8 shift); - void PSLLW(X64Reg reg, int shift); - void PSLLD(X64Reg reg, int shift); - void PSLLQ(X64Reg reg, int shift); - void PSLLDQ(X64Reg reg, int shift); + void PSLLW(X64Reg reg, u8 shift); + void PSLLD(X64Reg reg, u8 shift); + void PSLLQ(X64Reg reg, u8 shift); + void PSLLDQ(X64Reg reg, u8 shift); - void PSRAW(X64Reg reg, int shift); - void PSRAD(X64Reg reg, int shift); + void PSRAW(X64Reg reg, u8 shift); + void PSRAD(X64Reg reg, u8 shift); // SSE4: data type conversions void PMOVSXBW(X64Reg dest, const OpArg& arg); @@ -836,6 +836,9 @@ public: void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); + // SSE4: compare instructions + void PCMPEQQ(X64Reg dest, const OpArg& arg); + // AVX void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); @@ -878,6 +881,8 @@ public: void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift); + void VMOVAPS(const OpArg& 
arg, X64Reg regOp); void VZEROUPPER(); diff --git a/Source/Core/Core/Config/MainSettings.cpp b/Source/Core/Core/Config/MainSettings.cpp index e42796e8646..3ef8872274c 100644 --- a/Source/Core/Core/Config/MainSettings.cpp +++ b/Source/Core/Core/Config/MainSettings.cpp @@ -222,6 +222,7 @@ const Info MAIN_DIVIDE_BY_ZERO_EXCEPTIONS{{System::Main, "Core", "DivByZer false}; const Info MAIN_FPRF{{System::Main, "Core", "FPRF"}, false}; const Info MAIN_ACCURATE_NANS{{System::Main, "Core", "AccurateNaNs"}, false}; +const Info MAIN_ACCURATE_FMADDS{{System::Main, "Core", "AccurateFmadds"}, true}; const Info MAIN_DISABLE_ICACHE{{System::Main, "Core", "DisableICache"}, false}; const Info MAIN_EMULATION_SPEED{{System::Main, "Core", "EmulationSpeed"}, 1.0f}; #if defined(ANDROID) diff --git a/Source/Core/Core/Config/MainSettings.h b/Source/Core/Core/Config/MainSettings.h index b6a7094c933..27756d2e2a5 100644 --- a/Source/Core/Core/Config/MainSettings.h +++ b/Source/Core/Core/Config/MainSettings.h @@ -128,6 +128,7 @@ extern const Info MAIN_FLOAT_EXCEPTIONS; extern const Info MAIN_DIVIDE_BY_ZERO_EXCEPTIONS; extern const Info MAIN_FPRF; extern const Info MAIN_ACCURATE_NANS; +extern const Info MAIN_ACCURATE_FMADDS; extern const Info MAIN_DISABLE_ICACHE; extern const Info MAIN_EMULATION_SPEED; extern const Info MAIN_PRECISION_FRAME_TIMING; diff --git a/Source/Core/Core/ConfigLoaders/NetPlayConfigLoader.cpp b/Source/Core/Core/ConfigLoaders/NetPlayConfigLoader.cpp index 855cc181b92..7966cb32d27 100644 --- a/Source/Core/Core/ConfigLoaders/NetPlayConfigLoader.cpp +++ b/Source/Core/Core/ConfigLoaders/NetPlayConfigLoader.cpp @@ -80,6 +80,7 @@ public: layer->Set(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS, m_settings.divide_by_zero_exceptions); layer->Set(Config::MAIN_FPRF, m_settings.fprf); layer->Set(Config::MAIN_ACCURATE_NANS, m_settings.accurate_nans); + layer->Set(Config::MAIN_ACCURATE_FMADDS, m_settings.accurate_fmadds); layer->Set(Config::MAIN_DISABLE_ICACHE, m_settings.disable_icache); layer->Set(Config::MAIN_SYNC_ON_SKIP_IDLE, m_settings.sync_on_skip_idle); layer->Set(Config::MAIN_SYNC_GPU, m_settings.sync_gpu); diff --git a/Source/Core/Core/NetPlayProto.h b/Source/Core/Core/NetPlayProto.h index 62a9aba9a4e..d085a7cf2e7 100644 --- a/Source/Core/Core/NetPlayProto.h +++ b/Source/Core/Core/NetPlayProto.h @@ -68,6 +68,7 @@ struct NetSettings bool divide_by_zero_exceptions = false; bool fprf = false; bool accurate_nans = false; + bool accurate_fmadds = false; bool disable_icache = false; bool sync_on_skip_idle = false; bool sync_gpu = false; diff --git a/Source/Core/Core/NetPlayServer.cpp b/Source/Core/Core/NetPlayServer.cpp index 948dabebcd5..9f432486e23 100644 --- a/Source/Core/Core/NetPlayServer.cpp +++ b/Source/Core/Core/NetPlayServer.cpp @@ -1425,6 +1425,7 @@ bool NetPlayServer::SetupNetSettings() settings.divide_by_zero_exceptions = Config::Get(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS); settings.fprf = Config::Get(Config::MAIN_FPRF); settings.accurate_nans = Config::Get(Config::MAIN_ACCURATE_NANS); + settings.accurate_fmadds = Config::Get(Config::MAIN_ACCURATE_FMADDS); settings.disable_icache = Config::Get(Config::MAIN_DISABLE_ICACHE); settings.sync_on_skip_idle = Config::Get(Config::MAIN_SYNC_ON_SKIP_IDLE); settings.sync_gpu = Config::Get(Config::MAIN_SYNC_GPU); diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index 0727df26ae8..d01087fe1df 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ 
b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -342,12 +342,12 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double // - This will cause `d` to round to 100...00, meaning it will tie then round upwards. // 3. Tying up to even because `c` is too small // a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties) - // b. The lowest bit of `f` is 1 (this means it ties to even downwards) + // b. The lowest bit of `f` is 1 (this means it ties to even upwards) // c. `c` is negative and does not round `d` downwards // - This is similar to the first one but in reverse, rounding up instead of down. // 4. Tying down because `d` rounded down // a. The highest and lowest bits of `d` are 1, the rest of the bits of `d` are 0 - // b. The lowest bit of `f` is 0 (this means it ties to even upwards) + // b. The lowest bit of `f` is 0 (this means it ties to even downwards) // c. `c` is negative, and the highest bit of c is 1, // and at least one other bit of c is nonzero // - The backwards counterpart to case 2, this will cause `d` to round back down to 100..00, @@ -375,12 +375,6 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double // - Correct ordering of NaN checking (for both double and single precision) // - Rounding frC up // - Rounding only once for single precision inputs (this will be the large majority of cases!) - // - Currently this is interpreter-only. - // This can be implemented in the JIT just as easily, though. - // Eventually the JITs should hopefully support detecting back to back - // single-precision operations, which will lead to no overhead at all. - // In the cases where JITs can't do this, an alternative method is used, as - // is done in the interpreter as well. // - Rounding only once for double precision inputs // - This is a side effect of how we handle single-precision inputs: By doing // error calculations rather than checking if every input is a float, we ensure that we know @@ -421,7 +415,7 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double const double b_sign = sub ? -b : b; result.value = std::fma(a, c_round, b_sign); - // We then check if we're currently tying in rounding directioh + // We then check if we're currently tying in rounding direction const u64 result_bits = std::bit_cast(result.value); // The mask of the `d` bits as shown in the above comments @@ -432,9 +426,8 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double // Because we check this entire mask which includes a 1 bit, we can be sure that // if this result passes, the input is not an infinity that would become a NaN. - // This means that, for the JITs, if they only wanted to check for a subset of these - // bits (e.g. only checking if the last one was 0), then using the zero flag for a branch, - // they would have to check if the result was NaN before here. + // If we had only checked for a subset of these bits (e.g. only checking if the last + // one was 0), we would have needed to also check if the exponent was all ones. 
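For concreteness, a minimal standalone sketch of the tie test that follows, assuming D_MASK and EVEN_TIE are the same 0x1fffffff / 0x10000000 pair that the JIT asm routines later in this diff check (the 29 mantissa bits a double loses when it is subsequently rounded to single); the constant and helper names here are illustrative, not Dolphin identifiers:

#include <bit>
#include <cstdint>

// Assumed values: D_MASK covers the 29 bits discarded by the later
// round-to-single, EVEN_TIE is the halfway pattern within them.
constexpr std::uint64_t kAssumedDMask = 0x1FFFFFFF;
constexpr std::uint64_t kAssumedEvenTie = 0x10000000;

static bool IsPotentialTie(double fma_result)
{
  const std::uint64_t bits = std::bit_cast<std::uint64_t>(fma_result);
  // Requiring a set bit inside the mask also rules out infinities,
  // whose mantissa bits are all zero.
  return (bits & kAssumedDMask) == kAssumedEvenTie;
}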
if ((result_bits & D_MASK) == EVEN_TIE) { // Because we have a tie, we now compute any error in the FMA calculation diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index c0eda2f8f51..484be2bb286 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -1284,9 +1284,9 @@ BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const return cb.m_gqr_used & ~cb.m_gqr_modified; } -BitSet32 Jit64::CallerSavedRegistersInUse() const +BitSet32 Jit64::CallerSavedRegistersInUse(BitSet32 additional_registers) const { - BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16); + BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16) | additional_registers; return in_use & ABI_ALL_CALLER_SAVED; } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index ad5db1fa103..a14bc1dace0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -77,7 +77,7 @@ public: // Returns false if no free memory region can be found for either of the two. bool SetEmitterStateToFreeCodeRegion(); - BitSet32 CallerSavedRegistersInUse() const; + BitSet32 CallerSavedRegistersInUse(BitSet32 additional_registers = {}) const; BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const; void IntializeSpeculativeConstants(); @@ -153,9 +153,10 @@ public: void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true, bool duplicate = false); void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input); - void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, Gen::X64Reg clobber, - std::optional Ra, std::optional Rb, - std::optional Rc); + [[nodiscard]] Gen::FixupBranch HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, + Gen::X64Reg clobber, std::optional Ra, + std::optional Rb, + std::optional Rc); void MultiplyImmediate(u32 imm, int a, int d, bool overflow); diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp index 05d08f767ee..b7735595125 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp @@ -265,6 +265,10 @@ void Jit64AsmRoutineManager::GenerateCommon() GenMfcr(); cdts = AlignCode4(); GenConvertDoubleToSingle(); + fmadds_eft = AlignCode4(); + GenerateFmaddsEft(); + ps_madd_eft = AlignCode4(); + GeneratePsMaddEft(); GenQuantizedLoads(); GenQuantizedSingleLoads(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 75cfbed3d63..4cffa573c70 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -93,8 +93,9 @@ void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input) SetFPRFIfNeeded(input, false); } -void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::optional Ra, - std::optional Rb, std::optional Rc) +FixupBranch Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, + std::optional Ra, std::optional Rb, + std::optional Rc) { // | PowerPC | x86 // ---------------------+----------+--------- @@ -104,9 +105,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: // Dragon Ball: Revenge of King Piccolo requires generated NaNs // to be positive, so we'll have to handle them manually. 
- if (!m_accurate_nans) - return; - if (inst.OPCD != 4) { // not paired-single @@ -140,7 +138,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: FixupBranch done = J(Jump::Near); SwitchToNearCode(); - SetJumpTarget(done); + return done; } else { @@ -217,7 +215,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: FixupBranch done = J(Jump::Near); SwitchToNearCode(); - SetJumpTarget(done); + return done; } } @@ -329,14 +327,21 @@ void Jit64::fp_arith(UGeckoInstruction inst) } } - switch (inst.SUBOP5) + if (m_accurate_nans) { - case 18: - HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt); - break; - case 25: - HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2); - break; + std::optional handled_nans; + switch (inst.SUBOP5) + { + case 18: + handled_nans = HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt); + break; + case 25: + handled_nans = HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2); + break; + } + + if (handled_nans) + SetJumpTarget(*handled_nans); } if (single) @@ -368,51 +373,87 @@ void Jit64::fmaddXX(UGeckoInstruction inst) const bool use_fma = Config::Get(Config::SESSION_USE_FMA); const bool software_fma = use_fma && !cpu_info.bFMA; - int a = inst.FA; - int b = inst.FB; - int c = inst.FC; - int d = inst.FD; - bool single = inst.OPCD == 4 || inst.OPCD == 59; - bool round_input = single && !js.op->fprIsSingle[c]; - bool preserve_inputs = m_accurate_nans; - bool preserve_d = preserve_inputs && (a == d || b == d || c == d); - bool packed = - inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] && - js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]); + const int a = inst.FA; + const int b = inst.FB; + const int c = inst.FC; + const int d = inst.FD; const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30; // msub, nmsub const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31; // nmsub, nmadd const bool madds0 = inst.SUBOP5 == 14; const bool madds1 = inst.SUBOP5 == 15; - const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1); + const bool single = inst.OPCD == 4 || inst.OPCD == 59; + const bool round_input = single && !js.op->fprIsSingle[c]; + + const bool error_free_transformation = single && m_accurate_fmadds; + const bool packed = + inst.OPCD == 4 || + (!cpu_info.bAtom && !software_fma && !error_free_transformation && single && + js.op->fprIsDuplicated[a] && js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]); + + const bool want_rc_rounded = + (error_free_transformation || (software_fma && packed)) && round_input; + const bool error_free_transformation_wants_rc_duplicated = + (error_free_transformation && !want_rc_rounded) && (madds0 || madds1); + const bool accurate_nans_wants_rc_duplicated = m_accurate_nans && (madds0 || madds1); + const bool want_rc_duplicated = + error_free_transformation_wants_rc_duplicated || accurate_nans_wants_rc_duplicated; + + const bool preserve_d_due_to_a_or_b = + (m_accurate_nans || error_free_transformation) && (a == d || b == d); + const bool preserve_d_due_to_c = + c == d && ((m_accurate_nans && (!want_rc_duplicated || software_fma)) || + (error_free_transformation && !want_rc_rounded)); + const bool preserve_d = preserve_d_due_to_a_or_b || preserve_d_due_to_c; X64Reg scratch_xmm = XMM0; X64Reg result_xmm = XMM1; X64Reg Rc_duplicated = XMM2; + X64Reg Rc_rounded = XMM3; + + BitSet32 scratch_registers{XMM0 + 16, XMM1 + 16}; + + RCX64Reg xmm2_guard; + RCX64Reg xmm3_guard; + if (error_free_transformation) 
+ { + xmm2_guard = fpr.Scratch(XMM2); + xmm3_guard = fpr.Scratch(XMM3); + RegCache::Realize(xmm2_guard, xmm3_guard); + scratch_registers[XMM2 + 16] = true; + scratch_registers[XMM3 + 16] = true; + } + else if (software_fma) + { + xmm2_guard = fpr.Scratch(XMM2); + RegCache::Realize(xmm2_guard); + scratch_registers[XMM2 + 16] = true; + } RCOpArg Ra; RCOpArg Rb; RCOpArg Rc; RCX64Reg Rd; - RCX64Reg xmm2_guard; RCX64Reg result_xmm_guard; RCX64Reg Rc_duplicated_guard; if (software_fma) { - xmm2_guard = fpr.Scratch(XMM2); - Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read); - Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); - Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read); + Ra = packed || error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read); + Rb = packed || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); + Rc = packed || (error_free_transformation && !want_rc_rounded && !want_rc_duplicated) ? + fpr.Bind(c, RCMode::Read) : + fpr.Use(c, RCMode::Read); Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite); if (preserve_d && packed) { result_xmm_guard = fpr.Scratch(); - RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard); + RegCache::Realize(Ra, Rb, Rc, Rd, result_xmm_guard); result_xmm = Gen::X64Reg(result_xmm_guard); + scratch_registers[result_xmm + 16] = true; } else { - RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard); + RegCache::Realize(Ra, Rb, Rc, Rd); result_xmm = packed ? Gen::X64Reg(Rd) : XMM0; } } @@ -421,48 +462,88 @@ void Jit64::fmaddXX(UGeckoInstruction inst) // For use_fma == true: // Statistics suggests b is a lot less likely to be unbound in practice, so // if we have to pick one of a or b to bind, let's make it b. - Ra = fpr.Use(a, RCMode::Read); - Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); - Rc = fpr.Use(c, RCMode::Read); + Ra = error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read); + Rb = + use_fma || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); + Rc = error_free_transformation && !want_rc_rounded && !want_rc_duplicated ? + fpr.Bind(c, RCMode::Read) : + fpr.Use(c, RCMode::Read); Rd = fpr.Bind(d, single ? 
RCMode::Write : RCMode::ReadWrite); RegCache::Realize(Ra, Rb, Rc, Rd); - - if (madds_accurate_nans) - { - Rc_duplicated_guard = fpr.Scratch(); - RegCache::Realize(Rc_duplicated_guard); - Rc_duplicated = Rc_duplicated_guard; - } } + if (error_free_transformation_wants_rc_duplicated || + (accurate_nans_wants_rc_duplicated && + ((!software_fma && !error_free_transformation) || (error_free_transformation && packed)))) + { + Rc_duplicated_guard = fpr.Scratch(); + RegCache::Realize(Rc_duplicated_guard); + Rc_duplicated = Rc_duplicated_guard; + scratch_registers[Rc_duplicated + 16] = true; + } + + const auto registers_to_save = [&](BitSet32 scratch_registers_to_save) { + const BitSet32 scratch_registers_not_to_save = scratch_registers & ~scratch_registers_to_save; + return CallerSavedRegistersInUse(scratch_registers_to_save) & ~scratch_registers_not_to_save; + }; + if (software_fma) { + if (want_rc_rounded) + { + if (error_free_transformation && madds0) + { + MOVDDUP(Rc_rounded, Rc); + Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2); + } + else if (error_free_transformation && madds1) + { + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_rounded, Rc, Rc, 3); + Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2); + } + else + { + Force25BitPrecision(Rc_rounded, Rc, XMM2); + } + } + for (size_t i = (packed ? 1 : 0); i != std::numeric_limits::max(); --i) { - if ((i == 0 || madds0) && !madds1) + if (madds0 || (i == 0 && !madds1) || (want_rc_rounded && error_free_transformation && madds1)) { - if (round_input) + if (want_rc_rounded) + MOVAPD(XMM1, R(Rc_rounded)); + else if (round_input) Force25BitPrecision(XMM1, Rc, XMM2); + else if (Rc.IsSimpleReg()) + MOVAPD(XMM1, Rc); else MOVSD(XMM1, Rc); } else { - MOVHLPS(XMM1, Rc.GetSimpleReg()); - if (round_input) + MOVHLPS(XMM1, want_rc_rounded ? Rc_rounded : Rc.GetSimpleReg()); + if (round_input && !want_rc_rounded) Force25BitPrecision(XMM1, R(XMM1), XMM2); } // Write the result from the previous loop iteration into result_xmm so we don't lose it. // It's important that this is done after reading Rc above, in case we have madds1 and - // result_xmm == Rd == Rc. + // !want_rc_rounded and result_xmm == Rd == Rc. 
if (packed && i == 0) MOVLHPS(result_xmm, XMM0); if (i == 0) { - MOVSD(XMM0, Ra); - MOVSD(XMM2, Rb); + if (Ra.IsSimpleReg()) + MOVAPD(XMM0, Ra); + else + MOVSD(XMM0, Ra); + + if (Rb.IsSimpleReg()) + MOVAPD(XMM2, Rb); + else + MOVSD(XMM2, Rb); } else { @@ -473,23 +554,36 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (subtract) XORPS(XMM2, MConst(psSignBits)); - BitSet32 registers_in_use = CallerSavedRegistersInUse(); + BitSet32 scratch_registers_to_save{}; + if (packed && i == 0) + scratch_registers_to_save[result_xmm + 16] = true; + if (want_rc_rounded && (error_free_transformation || i == 1)) + scratch_registers_to_save[Rc_rounded + 16] = true; + + const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save); ABI_PushRegistersAndAdjustStack(registers_in_use, 0); ABI_CallFunction(static_cast(&std::fma)); ABI_PopRegistersAndAdjustStack(registers_in_use, 0); } if (packed) + { + // result_xmm's upper lane has the result of the first loop iteration MOVSD(R(result_xmm), XMM0); + } else + { DEBUG_ASSERT(result_xmm == XMM0); + } - if (madds_accurate_nans) + if (want_rc_duplicated) { if (madds0) MOVDDUP(Rc_duplicated, Rc); - else + else if (madds1) avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_duplicated, Rc, Rc, 3); + else + DEBUG_ASSERT(false); } } else @@ -497,7 +591,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (madds0) { MOVDDUP(result_xmm, Rc); - if (madds_accurate_nans) + if (want_rc_duplicated) MOVAPD(R(Rc_duplicated), result_xmm); if (round_input) Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm); @@ -505,18 +599,21 @@ void Jit64::fmaddXX(UGeckoInstruction inst) else if (madds1) { avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3); - if (madds_accurate_nans) + if (want_rc_duplicated) MOVAPD(R(Rc_duplicated), result_xmm); if (round_input) Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm); } else { + DEBUG_ASSERT(!want_rc_duplicated); if (round_input) Force25BitPrecision(result_xmm, Rc, scratch_xmm); else MOVAPD(result_xmm, Rc); } + if (want_rc_rounded) + MOVAPD(R(Rc_rounded), result_xmm); if (use_fma) { @@ -556,6 +653,160 @@ void Jit64::fmaddXX(UGeckoInstruction inst) } } + if (m_accurate_nans && result_xmm == XMM0) + { + // HandleNaNs needs to clobber XMM0 + result_xmm = error_free_transformation ? XMM1 : Rd; + MOVAPD(result_xmm, R(XMM0)); + DEBUG_ASSERT(!preserve_d); + } + + std::optional handled_nans; + if (!packed && m_accurate_nans) + { + // The clobber register is unused when not packed. + handled_nans = + HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc); + } + + // Read the comment in the interpreter function NI_madd_msub to find out what's going on here. + if (error_free_transformation) + { + if (result_xmm != XMM1) + { + MOVAPD(XMM1, R(result_xmm)); + result_xmm = XMM1; + } + + X64Reg Rc_rounded_duplicated = Rc.GetSimpleReg(); + BitSet32 scratch_registers_to_save = {XMM1 + 16, XMM2 + 16}; + if (want_rc_rounded) + { + Rc_rounded_duplicated = Rc_rounded; + scratch_registers_to_save[Rc_rounded] = true; + } + else if (want_rc_duplicated) + { + Rc_rounded_duplicated = Rc_duplicated; + scratch_registers_to_save[want_rc_duplicated] = true; + } + + // We've calculated s := a + b, with a = Ra * Rc_rounded_duplicated, b = subtract ? 
-Rb : Rb + + if (packed) + { + // a' := s - b + if (subtract) + avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM0, R(XMM1), Rb); + else + avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM1), Rb); + + // b' := s - a' + avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM1), R(XMM0)); + + // da := a - a' + if (software_fma) + { + scratch_registers_to_save[XMM0 + 16] = true; + const BitSet32 registers_in_use_1 = registers_to_save(scratch_registers_to_save); + ABI_PushRegistersAndAdjustStack(registers_in_use_1, 0); + + avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits)); + MOVAPD(XMM0, R(Rc_rounded_duplicated)); + MOVAPD(XMM1, Ra); + + ABI_CallFunction(static_cast(&std::fma)); + + // We will read from the upper lane of Rc_rounded_duplicated later, + // so we need to make sure that that lane isn't overwritten. + if (Rc_rounded_duplicated == XMM3) + MOVSD(XMM3, R(XMM0)); + else + MOVAPD(XMM3, R(XMM0)); + + ABI_PopRegistersAndAdjustStack(registers_in_use_1, 0); + + scratch_registers_to_save[XMM0 + 16] = false; + scratch_registers_to_save[XMM3 + 16] = true; + const BitSet32 registers_in_use_2 = registers_to_save(scratch_registers_to_save); + ABI_PushRegistersAndAdjustStack(registers_in_use_2, 0); + + MOVHLPS(XMM2, XMM0); + XORPS(XMM2, MConst(psSignBits)); + MOVHLPS(XMM0, Rc_rounded_duplicated); + MOVHLPS(XMM1, Ra.GetSimpleReg()); + + ABI_CallFunction(static_cast(&std::fma)); + + ABI_PopRegistersAndAdjustStack(registers_in_use_2, 0); + + UNPCKLPD(XMM0, R(XMM3)); + } + else if (use_fma) + { + VFMSUB231PD(XMM0, Rc_rounded_duplicated, Ra); + } + else + { + avx_op(&XEmitter::VMULPD, &XEmitter::MULPD, XMM3, R(Rc_rounded_duplicated), Ra); + avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM3), R(XMM0), true, false, XMM3); + } + + // db := b - b' + // (Transformed into -db := b' - b) + if (subtract) + avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM2, R(XMM2), Rb); + else + avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM2), Rb); + + CALL(GetAsmRoutines()->ps_madd_eft); + } + else + { + // a' := s - b + if (subtract) + avx_op(&XEmitter::VADDSD, &XEmitter::ADDSD, XMM0, R(XMM1), Rb, false); + else + avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM1), Rb, false); + + // b' := s - a' + avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM2, R(XMM1), R(XMM0), false); + + // da := a - a' + if (software_fma) + { + const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save); + ABI_PushRegistersAndAdjustStack(registers_in_use, 0); + + avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits)); + MOVAPD(XMM0, R(Rc_rounded_duplicated)); + MOVAPD(XMM1, Ra); + + ABI_CallFunction(static_cast(&std::fma)); + + ABI_PopRegistersAndAdjustStack(registers_in_use, 0); + } + else if (use_fma) + { + VFMSUB231SD(XMM0, Rc_rounded_duplicated, Ra); + } + else + { + avx_op(&XEmitter::VMULSD, &XEmitter::MULSD, XMM3, R(Rc_rounded_duplicated), Ra, false); + avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM3), R(XMM0), false, false, XMM3); + } + + // db := b - b' + // (Transformed into -db := b' - b) + if (subtract) + ADDSD(XMM2, Rb); + else + SUBSD(XMM2, Rb); + + CALL(GetAsmRoutines()->fmadds_eft); + } + } + // Using x64's nmadd/nmsub would require us to swap the sign of the addend // (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes. // Also, PowerPC's nmadd/nmsub round before the final negation unlike x64's nmadd/nmsub. 
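To make the emitted sequence above easier to follow, here is a scalar C++ sketch of the error-free transformation it implements (illustrative names, not Dolphin code); the combined error da + db is what the fmadds_eft routine later compares against zero to decide whether a tied result needs its last bit nudged:

#include <cmath>

// s = fl(a + b), where a = ra * rc (conceptually the exact product) and
// b = subtract ? -rb : rb. The a'/b'/da/db steps mirror the comments above.
static double MaddWithErrorTerms(double ra, double rc, double rb, bool subtract,
                                 double* error_out)
{
  const double b = subtract ? -rb : rb;
  const double s = std::fma(ra, rc, b);          // result, possibly mis-rounded for
                                                 // the later round-to-single
  const double a_prime = s - b;                  // a' := s - b
  const double b_prime = s - a_prime;            // b' := s - a'
  const double da = std::fma(ra, rc, -a_prime);  // da := a - a' (fused form keeps the
                                                 // product unrounded; the non-FMA path
                                                 // approximates this with mul + sub)
  const double db = b - b_prime;                 // db := b - b'
  *error_out = da + db;                          // sign says which way s should round
  return s;
}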
@@ -563,16 +814,19 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (negate) XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits)); - if (m_accurate_nans && result_xmm == XMM0) + if (packed && m_accurate_nans) { - // HandleNaNs needs to clobber XMM0 - MOVAPD(Rd, R(result_xmm)); - result_xmm = Rd; - DEBUG_ASSERT(!preserve_d); + // If packed, the clobber register must be XMM0. + handled_nans = + HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc); } - // If packed, the clobber register must be XMM0. If not packed, the clobber register is unused. - HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, madds_accurate_nans ? R(Rc_duplicated) : Rc); + // If the handled_nans branch was taken in the non-packed case, that means the result is NaN, + // so we can skip the XORPD and the error-free transformation. If the handled_nans branch was + // taken in the packed case, we don't know if both of the results were NaN or only one, so we + // can't skip anything. + if (handled_nans) + SetJumpTarget(*handled_nans); if (single) FinalizeSingleResult(Rd, R(result_xmm), packed, true); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 930757cdadd..8e44eb78b40 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -100,12 +100,19 @@ void Jit64::ps_muls(UGeckoInstruction inst) default: PanicAlertFmt("ps_muls WTF!!!"); } + if (round_input) Force25BitPrecision(XMM1, R(Rc_duplicated), XMM0); else if (XMM1 != Rc_duplicated) MOVAPD(XMM1, Rc_duplicated); MULPD(XMM1, Ra); - HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated); + + if (m_accurate_nans) + { + const FixupBranch handled_nans = HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated); + SetJumpTarget(handled_nans); + } + FinalizeSingleResult(Rd, R(XMM1)); } diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp index 0c186d0972f..d1415c09c40 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp @@ -741,7 +741,8 @@ void EmuCodeBlock::JitClearCA() // Abstract between AVX and SSE: automatically handle 3-operand instructions void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp, - const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible) + const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible, + X64Reg scratch) { if (arg1.IsSimpleReg(regOp)) { @@ -778,19 +779,19 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), else { // The ugly case: Not reversible, and we have regOp == arg2 without AVX or with arg1 == memory - if (!arg1.IsSimpleReg(XMM0)) - MOVAPD(XMM0, arg1); + if (!arg1.IsSimpleReg(scratch)) + MOVAPD(scratch, arg1); if (cpu_info.bAVX) { - (this->*avxOp)(regOp, XMM0, arg2); + (this->*avxOp)(regOp, scratch, arg2); } else { - (this->*sseOp)(XMM0, arg2); + (this->*sseOp)(scratch, arg2); if (packed) - MOVAPD(regOp, R(XMM0)); + MOVAPD(regOp, R(scratch)); else - MOVSD(regOp, R(XMM0)); + MOVSD(regOp, R(scratch)); } } } @@ -798,7 +799,7 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), // Abstract between AVX and SSE: automatically handle 3-operand instructions void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&, u8), void (XEmitter::*sseOp)(X64Reg, const 
OpArg&, u8), X64Reg regOp, - const OpArg& arg1, const OpArg& arg2, u8 imm) + const OpArg& arg1, const OpArg& arg2, u8 imm, X64Reg scratch) { if (arg1.IsSimpleReg(regOp)) { @@ -816,21 +817,40 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&, else { // The ugly case: regOp == arg2 without AVX, or with arg1 == memory - if (!arg1.IsSimpleReg(XMM0)) - MOVAPD(XMM0, arg1); + if (!arg1.IsSimpleReg(scratch)) + MOVAPD(scratch, arg1); if (cpu_info.bAVX) { - (this->*avxOp)(regOp, XMM0, arg2, imm); + (this->*avxOp)(regOp, scratch, arg2, imm); } else { - (this->*sseOp)(XMM0, arg2, imm); - if (regOp != XMM0) - MOVAPD(regOp, R(XMM0)); + (this->*sseOp)(scratch, arg2, imm); + if (regOp != scratch) + MOVAPD(regOp, R(scratch)); } } } +// Abstract between AVX and SSE: automatically handle 3-operand instructions +void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, u8), + void (XEmitter::*sseOp)(X64Reg, u8), X64Reg regOp1, X64Reg regOp2, u8 imm) +{ + if (regOp1 == regOp2) + { + (this->*sseOp)(regOp1, imm); + } + else if (cpu_info.bAVX) + { + (this->*avxOp)(regOp1, regOp2, imm); + } + else + { + MOVAPD(regOp1, R(regOp2)); + (this->*sseOp)(regOp1, imm); + } +} + alignas(16) static const u64 psMantissaTruncate[2] = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL}; alignas(16) static const u64 psRoundBit[2] = {0x8000000, 0x8000000}; @@ -842,8 +862,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg { if (m_jit.jo.accurateSinglePrecision) { + DEBUG_ASSERT(output != tmp); // mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1); - if (input.IsSimpleReg() && cpu_info.bAVX) + if (input.IsSimpleReg() && !input.IsSimpleReg(tmp) && cpu_info.bAVX) { VPAND(tmp, input.GetSimpleReg(), MConst(psRoundBit)); VPAND(output, input.GetSimpleReg(), MConst(psMantissaTruncate)); diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h index 54cade23c5a..b53b5da67b4 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h @@ -113,10 +113,14 @@ public: void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), Gen::X64Reg regOp, const Gen::OpArg& arg1, const Gen::OpArg& arg2, bool packed = true, - bool reversible = false); + bool reversible = false, Gen::X64Reg scratch = Gen::XMM0); void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp, - const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm); + const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm, + Gen::X64Reg scratch = Gen::XMM0); + void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, u8), + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, u8), Gen::X64Reg regOp1, Gen::X64Reg regOp2, + u8 imm); void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp); diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp index de5527d2d8e..437c6c01795 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -326,6 +326,98 @@ void CommonAsmRoutines::GenMfcr() Common::JitRegister::Register(start, GetCodePtr(), "JIT_Mfcr"); } +// Inputs: +// XMM0: First error term +// XMM1: Result with 
potentially incorrect rounding +// XMM2: Second error term, negated +// +// Outputs result with corrected rounding in XMM1. +// Clobbers RSCRATCH, RSCRATCH2, XMM0, XMM2, and flags. +void CommonAsmRoutines::GenerateFmaddsEft() +{ + // Check if XMM1 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000 + MOVQ_xmm(R(RSCRATCH), XMM1); + MOV(32, R(RSCRATCH2), Imm32(0x80000000)); + LEA(32, RSCRATCH2, MComplex(RSCRATCH2, RSCRATCH, SCALE_8, 0)); + TEST(32, R(RSCRATCH2), R(RSCRATCH2)); + FixupBranch even_tie = J_CC(CCFlags::CC_Z); + + const u8* ret = GetCodePtr(); + RET(); + + // Check if the error is 0 + SetJumpTarget(even_tie); + SUBSD(XMM0, R(XMM2)); + XORPD(XMM2, R(XMM2)); + UCOMISD(XMM0, R(XMM2)); + J_CC(CCFlags::CC_E, ret); + + // Round XMM1 up or down + MOVQ_xmm(R(RSCRATCH2), XMM0); + XOR(64, R(RSCRATCH2), R(RSCRATCH)); + SAR(64, R(RSCRATCH2), Imm8(63)); + OR(64, R(RSCRATCH2), Imm8(1)); + ADD(64, R(RSCRATCH), R(RSCRATCH2)); + MOVQ_xmm(XMM1, R(RSCRATCH)); + RET(); +} + +alignas(16) static const __m128i double_msb = _mm_set_epi64x(0x8000000000000000, + 0x8000000000000000); +alignas(16) static const __m128i double_lsb = _mm_set_epi64x(1, 1); + +// Inputs: +// XMM0: First error terms +// XMM1: Results with potentially incorrect rounding +// XMM2: Second error terms, negated +// +// Outputs results with corrected rounding in XMM1. Clobbers RSCRATCH, XMM0-XMM3, and flags. +void CommonAsmRoutines::GeneratePsMaddEft() +{ + // Check if XMM1 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000 + avx_op(&XEmitter::VPSLLQ, &XEmitter::PSLLQ, XMM3, XMM1, 35); + if (cpu_info.bSSE4_1) + { + PCMPEQQ(XMM3, MConst(double_msb)); + } + else + { + PCMPEQW(XMM3, MConst(double_msb)); + PSHUFD(XMM3, R(XMM3), 0xF5); + } + + // Just for performance, exit early if there is no even tie + if (cpu_info.bSSE4_1) + { + PTEST(XMM3, R(XMM3)); + } + else + { + PMOVMSKB(RSCRATCH, R(XMM3)); + TEST(32, R(RSCRATCH), R(RSCRATCH)); + } + FixupBranch even_tie = J_CC(CCFlags::CC_NZ); + RET(); + SetJumpTarget(even_tie); + + // Check if the error is zero + SUBPD(XMM0, R(XMM2)); + XORPD(XMM2, R(XMM2)); + CMPPD(XMM2, R(XMM0), CMP_EQ); + + // Store -1 or 1 in XMM0 depending on whether we're rounding down or up + PXOR(XMM0, R(XMM1)); + PSRAD(XMM0, 31); + PSHUFD(XMM0, R(XMM0), 0xF5); + POR(XMM0, MConst(double_lsb)); + + // Round the elements that have both a non-zero error and an even tie + PANDN(XMM2, R(XMM3)); + PAND(XMM0, R(XMM2)); + PADDQ(XMM1, R(XMM0)); + RET(); +} + // Safe + Fast Quantizers, originally from JITIL by magumagu alignas(16) static const float m_65535[4] = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; alignas(16) static const float m_32767 = 32767.0f; diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h index ffac2da1a1c..8d60b005b35 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h @@ -33,6 +33,8 @@ public: protected: void GenConvertDoubleToSingle(); + void GenerateFmaddsEft(); + void GeneratePsMaddEft(); const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type); const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type); void GenQuantizedLoads(); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 78288adb131..89de1e5d6e2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -324,6 +324,8 @@ protected: void GenerateConvertDoubleToSingle(); void 
GenerateConvertSingleToDouble(); void GenerateFPRF(bool single); + void GenerateFmaddsEft(); + void GeneratePsMaddEft(); void GenerateQuantizedLoads(); void GenerateQuantizedStores(); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index e03cb50c2f6..f2ac356fc56 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -79,9 +79,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst) const bool use_b = op5 != 25; // fmul uses no B const bool fma = use_b && use_c; const bool negate_result = (op5 & ~0x1) == 30; + const bool negate_b = op5 == 28 || op5 == 30; const bool output_is_single = inst.OPCD == 59; - const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA); + const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA); + const bool error_free_transformation_requested = fma && m_accurate_fmadds; const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[c]; const auto inputs_are_singles_func = [&] { @@ -89,13 +91,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst) (!use_c || fpr.IsSingle(c, true)); }; - const bool single = inputs_are_singles_func() && output_is_single && !inaccurate_fma; + const bool single = inputs_are_singles_func() && output_is_single && + (error_free_transformation_requested || !nonfused_requested); const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair; const RegType type_out = output_is_single ? (single ? RegType::DuplicatedSingle : RegType::Duplicated) : RegType::LowerPair; const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble; + const bool nonfused = nonfused_requested && !single; + const bool error_free_transformation = + error_free_transformation_requested && !single && output_is_single; + + if (error_free_transformation) + { + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); + fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2); + } + const ARM64Reg VA = reg_encoder(fpr.R(a, type)); const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; const ARM64Reg VC = use_c ? 
reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; @@ -103,33 +116,47 @@ void JitArm64::fp_arith(UGeckoInstruction inst) { Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG; - Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG; ARM64Reg rounded_c_reg = VC; if (round_c) { - ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single"); - V0Q = fpr.GetScopedReg(); rounded_c_reg = reg_encoder(V0Q); - Force25BitPrecision(rounded_c_reg, VC); - } - - ARM64Reg inaccurate_fma_reg = VD; - if (fma && inaccurate_fma && VD == VB) - { - if (V0Q == ARM64Reg::INVALID_REG) - V0Q = fpr.GetScopedReg(); - inaccurate_fma_reg = reg_encoder(V0Q); } ARM64Reg result_reg = VD; - const bool preserve_d = - m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); - if (preserve_d) + ARM64Reg nonfused_reg = VD; + if (error_free_transformation) { - V1Q = fpr.GetScopedReg(); - result_reg = reg_encoder(V1Q); + result_reg = reg_encoder(ARM64Reg::Q0); + nonfused_reg = reg_encoder(ARM64Reg::Q0); + + if (nonfused && V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + } + else + { + const bool preserve_d = + m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); + if (preserve_d) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + result_reg = reg_encoder(V0Q); + nonfused_reg = reg_encoder(V0Q); + } + else if (fma && nonfused && VD == VB) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + nonfused_reg = reg_encoder(V0Q); + } + } + + if (round_c) + { + ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single"); + Force25BitPrecision(rounded_c_reg, VC); } switch (op5) @@ -152,10 +179,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst) // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub. case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm" case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)" - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); - m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg); + m_float_emit.FSUB(result_reg, nonfused_reg, VB); } else { @@ -164,10 +191,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst) break; case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)" - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); - m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg); + m_float_emit.FADD(result_reg, nonfused_reg, VB); } else { @@ -180,6 +207,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst) } Common::SmallVector nan_fixups; + std::optional nan_early_fixup; if (m_accurate_nans) { // Check if we need to handle NaNs @@ -216,7 +244,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst) SetJumpTarget(skip); } - std::optional nan_early_fixup; if (negate_result) { // If we have a NaN, we must not execute FNEG. @@ -230,11 +257,46 @@ void JitArm64::fp_arith(UGeckoInstruction inst) } SwitchToNearCode(); - - if (nan_early_fixup) - SetJumpTarget(*nan_early_fixup); } + // Read the comment in the interpreter function NI_madd_msub to find out what's going on here + if (error_free_transformation) + { + // We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? 
-VB : VB)
+
+    // a' := s - b
+    if (negate_b)
+      m_float_emit.FADD(ARM64Reg::D1, result_reg, VB);
+    else
+      m_float_emit.FSUB(ARM64Reg::D1, result_reg, VB);
+
+    // b' := s - a'
+    m_float_emit.FSUB(ARM64Reg::D2, result_reg, ARM64Reg::D1);
+
+    // da := a - a'
+    if (nonfused)
+    {
+      m_float_emit.FMUL(EncodeRegToDouble(V0Q), VA, rounded_c_reg);
+      m_float_emit.FSUB(ARM64Reg::D1, EncodeRegToDouble(V0Q), ARM64Reg::D1);
+    }
+    else
+    {
+      m_float_emit.FNMSUB(ARM64Reg::D1, VA, rounded_c_reg, ARM64Reg::D1);
+    }
+
+    // db := b - b'
+    // (Transformed into -db := b' - b)
+    if (negate_b)
+      m_float_emit.FADD(ARM64Reg::D2, ARM64Reg::D2, VB);
+    else
+      m_float_emit.FSUB(ARM64Reg::D2, ARM64Reg::D2, VB);
+
+    BL(GetAsmRoutines()->fmadds_eft);
+  }
+
+  if (nan_early_fixup)
+    SetJumpTarget(*nan_early_fixup);
+
   // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
   // for any of AArch64's FMA instructions, so we negate using a separate instruction.
   if (negate_result)
@@ -254,7 +316,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     fpr.FixSinglePrecision(d);
   }
 
+  if (error_free_transformation)
+    gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
+
   SetFPRFIfNeeded(output_is_single, VD);
+
+  if (error_free_transformation)
+    fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2);
 }
 
 void JitArm64::fp_logic(UGeckoInstruction inst)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index e1f3f096629..88ba86c2aff 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -92,20 +92,31 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   const bool duplicated_c = muls || madds;
   const bool fma = use_b && use_c;
   const bool negate_result = (op5 & ~0x1) == 30;
-  const bool msub = op5 == 28 || op5 == 30;
+  const bool negate_b = op5 == 28 || op5 == 30;
 
-  const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
+  const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
+  const bool error_free_transformation_requested = fma && m_accurate_fmadds;
 
   const bool round_c = use_c && !js.op->fprIsSingle[c];
   const auto inputs_are_singles_func = [&] {
     return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
   };
-  const bool single = inputs_are_singles_func() && !inaccurate_fma;
+  const bool single =
+      inputs_are_singles_func() && (error_free_transformation_requested || !nonfused_requested);
 
   const RegType type = single ? RegType::Single : RegType::Register;
   const u8 size = single ? 32 : 64;
   const auto reg_encoder = single ? EncodeRegToDouble : EncodeRegToQuad;
 
+  const bool nonfused = nonfused_requested && !single;
+  const bool error_free_transformation = error_free_transformation_requested && !single;
+
+  if (error_free_transformation)
+  {
+    gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+    fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
+  }
+
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
   const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
   const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
@@ -119,41 +130,77 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   ARM64Reg rounded_c_reg = VC;
   if (round_c)
   {
-    ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
-
-    V0Q = fpr.GetScopedReg();
-    rounded_c_reg = reg_encoder(V0Q);
-    Force25BitPrecision(rounded_c_reg, VC);
-  }
-
-  ARM64Reg inaccurate_fma_reg = VD;
-  if (fma && inaccurate_fma && VD == VB)
-  {
-    if (V0Q == ARM64Reg::INVALID_REG)
+    if (error_free_transformation)
+    {
+      // This register happens to be free, so we can skip allocating one
+      rounded_c_reg = ARM64Reg::Q3;
+    }
+    else
+    {
       V0Q = fpr.GetScopedReg();
-      inaccurate_fma_reg = reg_encoder(V0Q);
+      rounded_c_reg = reg_encoder(V0Q);
+    }
   }
 
   ARM64Reg result_reg = VD;
-  const bool need_accurate_fma_reg =
-      fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
-  const bool preserve_d =
-      m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
-  if (need_accurate_fma_reg || preserve_d)
+  ARM64Reg nonfused_reg = VD;
+  if (error_free_transformation)
   {
-    V1Q = fpr.GetScopedReg();
-    result_reg = reg_encoder(V1Q);
+    result_reg = reg_encoder(ARM64Reg::Q0);
+    nonfused_reg = reg_encoder(ARM64Reg::Q0);
+  }
+  else
+  {
+    const bool need_fused_fma_reg =
+        fma && !nonfused && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
+    const bool preserve_d =
+        m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
+    if (need_fused_fma_reg || preserve_d)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      result_reg = reg_encoder(V0Q);
+      nonfused_reg = reg_encoder(V0Q);
+
+      if (need_fused_fma_reg && round_c)
+      {
+        V1Q = fpr.GetScopedReg();
+        rounded_c_reg = reg_encoder(V1Q);
+      }
+    }
+    else if (fma && nonfused && VD == VB)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      nonfused_reg = reg_encoder(V0Q);
+    }
   }
 
   if (m_accurate_nans)
   {
-    if (V0Q == ARM64Reg::INVALID_REG)
-      V0Q = fpr.GetScopedReg();
+    if (error_free_transformation)
+    {
+      // These registers happen to be free, so we can skip allocating new ones
+      V1Q = ARM64Reg::Q1;
+      V2Q = ARM64Reg::Q2;
+    }
+    else
+    {
+      if (V1Q == ARM64Reg::INVALID_REG)
+        V1Q = fpr.GetScopedReg();
 
-    if (duplicated_c || VD == result_reg)
-      V2Q = fpr.GetScopedReg();
+      if (duplicated_c || VD == result_reg)
+        V2Q = fpr.GetScopedReg();
+    }
   }
 
+  if (round_c)
+  {
+    ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
+    Force25BitPrecision(rounded_c_reg, VC);
+  }
+
+  std::optional<ARM64Reg> negated_b_reg;
+
   switch (op5)
   {
   case 12:  // ps_muls0: d = a * c.ps0
@@ -163,10 +210,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
     break;
   case 14:  // ps_madds0: d = a * c.ps0 + b
-    if (inaccurate_fma)
+    if (nonfused)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 0);
+      m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
     }
     else
     {
@@ -176,10 +223,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     }
     break;
   case 15:  // ps_madds1: d = a * c.ps1 + b
-    if (inaccurate_fma)
+    if (nonfused)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 1);
+      m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
     }
     else
     {
@@ -202,23 +249,28 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     break;
   case 28:  // ps_msub: d = a * c - b
   case 30:  // ps_nmsub: d = -(a * c - b)
-    if (inaccurate_fma)
+    if (nonfused)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
-      m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
+      m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
+      m_float_emit.FSUB(size, result_reg, nonfused_reg, VB);
     }
     else
     {
       m_float_emit.FNEG(size, result_reg, VB);
+      if (error_free_transformation)
+      {
+        m_float_emit.MOV(ARM64Reg::Q4, result_reg);
+        negated_b_reg = ARM64Reg::Q4;
+      }
       m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
     }
     break;
   case 29:  // ps_madd: d = a * c + b
   case 31:  // ps_nmadd: d = -(a * c + b)
-    if (inaccurate_fma)
+    if (nonfused)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
+      m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
     }
     else
     {
@@ -232,11 +284,80 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     break;
   }
 
+  // Read the comment in the interpreter function NI_madd_msub to find out what's going on here
+  if (error_free_transformation)
+  {
+    // We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? -VB : VB)
+
+    // a' := s - b
+    // (Transformed into -a' := b - s)
+    if (negate_b)
+    {
+      if (!negated_b_reg)
+      {
+        m_float_emit.FNEG(size, ARM64Reg::Q4, VB);
+        negated_b_reg = ARM64Reg::Q4;
+      }
+      m_float_emit.FSUB(size, ARM64Reg::Q1, *negated_b_reg, result_reg);
+    }
+    else
+    {
+      m_float_emit.FSUB(size, ARM64Reg::Q1, VB, result_reg);
+    }
+
+    // b' := s - a'
+    // (Transformed into b' := s + -a')
+    m_float_emit.FADD(size, ARM64Reg::Q2, result_reg, ARM64Reg::Q1);
+
+    // da := a - a'
+    // (Transformed into da := a + -a')
+    if (nonfused)
+    {
+      switch (op5)
+      {
+      case 14:  // ps_madds0: d = a * c.ps0 + b
+        m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 0);
+        break;
+      case 15:  // ps_madds1: d = a * c.ps1 + b
+        m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 1);
+        break;
+      default:
+        m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg);
+        break;
+      }
+      m_float_emit.FADD(size, ARM64Reg::Q1, ARM64Reg::Q3, ARM64Reg::Q1);
+    }
+    else
+    {
+      switch (op5)
+      {
+      case 14:  // ps_madds0: d = a * c.ps0 + b
+        m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 0);
+        break;
+      case 15:  // ps_madds1: d = a * c.ps1 + b
+        m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 1);
+        break;
+      default:
+        m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg);
+        break;
+      }
+    }
+
+    // db := b - b'
+    // (Transformed into -db := b' - b)
+    if (negate_b)
+      m_float_emit.FADD(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
+    else
+      m_float_emit.FSUB(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
+
+    BL(GetAsmRoutines()->ps_madd_eft);
+  }
+
   FixupBranch nan_fixup;
   if (m_accurate_nans)
   {
-    const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
-    const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
+    const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V1Q) : EncodeRegToDouble(V1Q);
+    const ARM64Reg nan_temp_reg_paired = reg_encoder(V1Q);
 
     // Check if we need to handle NaNs
@@ -306,7 +427,13 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
 
   fpr.FixSinglePrecision(d);
 
+  if (error_free_transformation)
+    gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
+
   SetFPRFIfNeeded(true, VD);
+
+  if (error_free_transformation)
+    fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
 }
 
 void JitArm64::ps_sel(UGeckoInstruction inst)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 89ed9a2b053..a65fd33a8f4 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -8,6 +8,7 @@
 #include
 
 #include "Common/Arm64Emitter.h"
+#include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
 #include "Common/Config/Config.h"
 #include "Common/FloatUtils.h"
@@ -265,6 +266,14 @@ void JitArm64::GenerateCommonAsm()
   GenerateFPRF(false);
   Common::JitRegister::Register(GetAsmRoutines()->fprf_single, GetCodePtr(), "JIT_FPRF");
 
+  GetAsmRoutines()->fmadds_eft = GetCodePtr();
+  GenerateFmaddsEft();
+  Common::JitRegister::Register(GetAsmRoutines()->fmadds_eft, GetCodePtr(), "JIT_fmadds_eft");
+
+  GetAsmRoutines()->ps_madd_eft = GetCodePtr();
+  GeneratePsMaddEft();
+  Common::JitRegister::Register(GetAsmRoutines()->ps_madd_eft, GetCodePtr(), "JIT_ps_madd_eft");
+
   GenerateQuantizedLoads();
   GenerateQuantizedStores();
 }
@@ -514,6 +523,90 @@ void JitArm64::GenerateFPRF(bool single)
   B(write_fprf_and_ret);
 }
 
+// Inputs:
+// D0: Result with potentially incorrect rounding
+// D1: First error term
+// D2: Second error term, negated
+//
+// Outputs result with corrected rounding in D0. Clobbers X0-X1, D1, and flags.
+void JitArm64::GenerateFmaddsEft()
+{
+  // Check if D0 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
+  m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
+  MOVI2R(ARM64Reg::W1, 0x80000000);
+  CMP(ARM64Reg::W1, ARM64Reg::W0, ArithOption(ARM64Reg::W0, ShiftType::LSL, 3));
+  FixupBranch even_tie = B(CCFlags::CC_EQ);
+
+  const u8* ret = GetCodePtr();
+  RET();
+
+  // Check if the error is 0
+  SetJumpTarget(even_tie);
+  m_float_emit.FSUB(ARM64Reg::D1, ARM64Reg::D1, ARM64Reg::D2);
+  m_float_emit.FCMP(ARM64Reg::D1);
+  B(CCFlags::CC_EQ, ret);
+
+  // Round D0 up or down
+  MOVZ(ARM64Reg::X1, 1);
+  CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
+  CMP(ARM64Reg::X0, 0);
+  CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
+  ADD(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1);
+  m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0);
+  RET();
+}
+
+// Inputs:
+// Q0: Results with potentially incorrect rounding
+// Q1: First error terms
+// Q2: Second error terms, negated
+//
+// Outputs results with corrected rounding in Q0. Clobbers X0, Q1-Q4, and flags.
+void JitArm64::GeneratePsMaddEft()
+{
+  // Check if Q0 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
+  MOVI2R(ARM64Reg::X0, 0x8000'0000'0000'0000);
+  m_float_emit.SHL(64, ARM64Reg::Q3, ARM64Reg::Q0, 35);
+  m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
+  m_float_emit.CMEQ(64, ARM64Reg::Q3, ARM64Reg::Q3, ARM64Reg::Q4);
+
+  // Just for performance, exit early if there is no even tie
+  m_float_emit.XTN(32, ARM64Reg::D4, ARM64Reg::Q3);
+  FixupBranch even_tie;
+  if (cpu_info.bAFP)
+  {
+    m_float_emit.FCMP(ARM64Reg::D4);
+    even_tie = B(CCFlags::CC_NEQ);
+  }
+  else
+  {
+    // If we don't have AFP and the emulated software has NI set, subnormals will compare equal to
+    // zero, so we can't use FCMP unless we were to put some shuffle instruction before it.
+    // FMOV is a little slower than FCMP, but it's faster than adding an extra instruction.
+    m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D4);
+    even_tie = CBNZ(ARM64Reg::X0);
+  }
+  RET();
+  SetJumpTarget(even_tie);
+
+  // Check if the error is zero
+  m_float_emit.FSUB(64, ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
+  MOVZ(ARM64Reg::X0, 1);
+  m_float_emit.FCMEQ(64, ARM64Reg::Q2, ARM64Reg::Q1);
+
+  // Store -1 or 1 in Q1 depending on whether we're rounding down or up
+  m_float_emit.EOR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q0);
+  m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
+  m_float_emit.SSHR(64, ARM64Reg::Q1, ARM64Reg::Q1, 63);
+  m_float_emit.ORR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q4);
+
+  // Round the elements that have both a non-zero error and an even tie
+  m_float_emit.BIC(ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q2);
+  m_float_emit.AND(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
+  m_float_emit.ADD(64, ARM64Reg::Q0, ARM64Reg::Q0, ARM64Reg::Q1);
+  RET();
+}
+
 void JitArm64::GenerateQuantizedLoads()
 {
   // X0 is a temporary
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
index 4fd58bc8973..88132bdd6ab 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@@ -30,6 +30,8 @@ struct CommonAsmRoutinesBase
   const u8* cstd;
   const u8* fprf_single;
   const u8* fprf_double;
+  const u8* fmadds_eft;
+  const u8* ps_madd_eft;
 
   // In: array index: GQR to use.
   // In: ECX: Address to read from.
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp
index d2de2895a75..a8709c4f6e9 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp
@@ -57,7 +57,7 @@
 // After resetting the stack to the top, we call _resetstkoflw() to restore
 // the guard page at the 256kb mark.
 
-const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitBase::JIT_SETTINGS{{
+const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JitBase::JIT_SETTINGS{{
     {&JitBase::bJITOff, &Config::MAIN_DEBUG_JIT_OFF},
     {&JitBase::bJITLoadStoreOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_OFF},
     {&JitBase::bJITLoadStorelXzOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_LXZ_OFF},
@@ -79,6 +79,7 @@ const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitB
     {&JitBase::m_low_dcbz_hack, &Config::MAIN_LOW_DCBZ_HACK},
     {&JitBase::m_fprf, &Config::MAIN_FPRF},
     {&JitBase::m_accurate_nans, &Config::MAIN_ACCURATE_NANS},
+    {&JitBase::m_accurate_fmadds, &Config::MAIN_ACCURATE_FMADDS},
     {&JitBase::m_fastmem_enabled, &Config::MAIN_FASTMEM},
     {&JitBase::m_accurate_cpu_cache_enabled, &Config::MAIN_ACCURATE_CPU_CACHE},
 }};
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
index cb78fcc6fe2..468874a9836 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
@@ -158,6 +158,7 @@ protected:
   bool m_low_dcbz_hack = false;
   bool m_fprf = false;
   bool m_accurate_nans = false;
+  bool m_accurate_fmadds = false;
   bool m_fastmem_enabled = false;
   bool m_accurate_cpu_cache_enabled = false;
 
@@ -165,7 +166,7 @@ protected:
   bool m_cleanup_after_stackfault = false;
   u8* m_stack_guard = nullptr;
 
-  static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JIT_SETTINGS;
+  static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JIT_SETTINGS;
 
   bool DoesConfigNeedRefresh() const;
   void RefreshConfig();
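For illustration only (this sketch is not part of the patch above): a scalar C++ approximation of the rounding fix-up that GenerateFmaddsEft applies before the double-precision result is rounded to single. The helper name FmaddSingleResult, the plain double parameters, and the omission of NaN/denormal and FPSCR handling are assumptions made for this sketch; the real routine works on raw register bit patterns inside the JIT-generated code.

// Scalar sketch of the error-free-transformation fix-up (assumptions: round-to-nearest,
// no NaN/denormal special cases; FmaddSingleResult is a made-up name for this example).
#include <bit>
#include <cmath>
#include <cstdint>

static float FmaddSingleResult(double a, double c, double b)
{
  const double s = std::fma(a, c, b);          // s := a * c + b, rounded once to double
  const double a_prime = s - b;                // a' := s - b
  const double b_prime = s - a_prime;          // b' := s - a'
  const double da = std::fma(a, c, -a_prime);  // da := a - a'  (FNMSUB in the JIT routine)
  const double db = b - b_prime;               // db := b - b'
  const double err = da + db;                  // its sign says on which side of s the exact result lies

  uint64_t bits = std::bit_cast<uint64_t>(s);
  // Only an exact tie at the single-precision boundary can be double-rounded incorrectly:
  // the low 29 mantissa bits of the double equal 0x10000000.
  const bool tie = (bits & 0x1fffffff) == 0x10000000;
  if (tie && err != 0.0)
  {
    // Nudge s by one double ulp towards the exact result so that the final conversion
    // to single rounds the way PowerPC's fmadds would.
    int64_t step = (err < 0.0) ? -1 : 1;
    if (std::bit_cast<int64_t>(bits) < 0)  // negative value: the bit pattern moves the other way
      step = -step;
    bits += step;
  }
  return static_cast<float>(std::bit_cast<double>(bits));
}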