From caad84c636f30b47a30303ea1994bb61db63cc02 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sat, 16 Aug 2025 11:51:07 +0200
Subject: [PATCH] JitArm64: Reduce register pressure for inaccurate FMA with
 accurate NaNs

If result_reg is already set to a temporary register instead of VD
because of accurate NaNs, inaccurate FMA can reuse that register
rather than allocating a second temporary register of its own.
---
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 22 +++++-----
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 40 +++++++++++--------
 2 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index e03cb50c2f6..6fc3c23531d 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -103,7 +103,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
 
   {
     Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG;
-    Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG;
 
     ARM64Reg rounded_c_reg = VC;
     if (round_c)
@@ -115,21 +114,22 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
       Force25BitPrecision(rounded_c_reg, VC);
     }
 
-    ARM64Reg inaccurate_fma_reg = VD;
-    if (fma && inaccurate_fma && VD == VB)
-    {
-      if (V0Q == ARM64Reg::INVALID_REG)
-        V0Q = fpr.GetScopedReg();
-      inaccurate_fma_reg = reg_encoder(V0Q);
-    }
-
     ARM64Reg result_reg = VD;
+    ARM64Reg inaccurate_fma_reg = VD;
     const bool preserve_d =
         m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
     if (preserve_d)
     {
-      V1Q = fpr.GetScopedReg();
-      result_reg = reg_encoder(V1Q);
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      result_reg = reg_encoder(V0Q);
+      inaccurate_fma_reg = reg_encoder(V0Q);
+    }
+    else if (fma && inaccurate_fma && VD == VB)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      inaccurate_fma_reg = reg_encoder(V0Q);
     }
 
     switch (op5)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index e1f3f096629..1504b300441 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -120,40 +120,48 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     if (round_c)
     {
       ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
-
       V0Q = fpr.GetScopedReg();
       rounded_c_reg = reg_encoder(V0Q);
-      Force25BitPrecision(rounded_c_reg, VC);
-    }
-
-    ARM64Reg inaccurate_fma_reg = VD;
-    if (fma && inaccurate_fma && VD == VB)
-    {
-      if (V0Q == ARM64Reg::INVALID_REG)
-        V0Q = fpr.GetScopedReg();
-      inaccurate_fma_reg = reg_encoder(V0Q);
     }
 
     ARM64Reg result_reg = VD;
+    ARM64Reg inaccurate_fma_reg = VD;
     const bool need_accurate_fma_reg =
         fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
     const bool preserve_d =
         m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
     if (need_accurate_fma_reg || preserve_d)
     {
-      V1Q = fpr.GetScopedReg();
-      result_reg = reg_encoder(V1Q);
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      result_reg = reg_encoder(V0Q);
+      inaccurate_fma_reg = reg_encoder(V0Q);
+
+      if (need_accurate_fma_reg && round_c)
+      {
+        V1Q = fpr.GetScopedReg();
+        rounded_c_reg = reg_encoder(V1Q);
+      }
+    }
+    else if (fma && inaccurate_fma && VD == VB)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      inaccurate_fma_reg = reg_encoder(V0Q);
     }
 
     if (m_accurate_nans)
     {
-      if (V0Q == ARM64Reg::INVALID_REG)
-        V0Q = fpr.GetScopedReg();
+      if (V1Q == ARM64Reg::INVALID_REG)
+        V1Q = fpr.GetScopedReg();
 
       if (duplicated_c || VD == result_reg)
         V2Q = fpr.GetScopedReg();
     }
 
+    if (round_c)
+      Force25BitPrecision(rounded_c_reg, VC);
+
     switch (op5)
     {
     case 12:  // ps_muls0: d = a * c.ps0
@@ -235,8 +243,8 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     FixupBranch nan_fixup;
     if (m_accurate_nans)
     {
-      const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
-      const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
+      const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V1Q) : EncodeRegToDouble(V1Q);
+      const ARM64Reg nan_temp_reg_paired = reg_encoder(V1Q);
 
       // Check if we need to handle NaNs