JitArm64: Reduce register pressure for inaccurate FMA with accurate NaNs

If result_reg is already set to a temporary register instead of VD because of accurate NaNs, inaccurate FMA can use that same temporary register for its intermediate result, so there's no need to allocate a second temporary register for it.
2026-02-04 21:45:59 +00:00 · 2025-08-16 11:51:07 +02:00 · 2025-08-16 11:51:07 +02:00 · caad84c636
commit caad84c636
parent 84261cfc23
2 changed files with 35 additions and 27 deletions
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -103,7 +103,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)

  {
    Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG;
-    Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG;

    ARM64Reg rounded_c_reg = VC;
    if (round_c)
@@ -115,21 +114,22 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
      Force25BitPrecision(rounded_c_reg, VC);
    }

-    ARM64Reg inaccurate_fma_reg = VD;
-    if (fma && inaccurate_fma && VD == VB)
-    {
-      if (V0Q == ARM64Reg::INVALID_REG)
-        V0Q = fpr.GetScopedReg();
-      inaccurate_fma_reg = reg_encoder(V0Q);
-    }
-
    ARM64Reg result_reg = VD;
+    ARM64Reg inaccurate_fma_reg = VD;
    const bool preserve_d =
        m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
    if (preserve_d)
    {
-      V1Q = fpr.GetScopedReg();
-      result_reg = reg_encoder(V1Q);
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      result_reg = reg_encoder(V0Q);
+      inaccurate_fma_reg = reg_encoder(V0Q);
+    }
+    else if (fma && inaccurate_fma && VD == VB)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      inaccurate_fma_reg = reg_encoder(V0Q);
    }

    switch (op5)
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -120,40 +120,48 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
    if (round_c)
    {
      ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
-
      V0Q = fpr.GetScopedReg();
      rounded_c_reg = reg_encoder(V0Q);
-      Force25BitPrecision(rounded_c_reg, VC);
-    }
-
-    ARM64Reg inaccurate_fma_reg = VD;
-    if (fma && inaccurate_fma && VD == VB)
-    {
-      if (V0Q == ARM64Reg::INVALID_REG)
-        V0Q = fpr.GetScopedReg();
-      inaccurate_fma_reg = reg_encoder(V0Q);
    }

    ARM64Reg result_reg = VD;
+    ARM64Reg inaccurate_fma_reg = VD;
    const bool need_accurate_fma_reg =
        fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
    const bool preserve_d =
        m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
    if (need_accurate_fma_reg || preserve_d)
    {
-      V1Q = fpr.GetScopedReg();
-      result_reg = reg_encoder(V1Q);
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      result_reg = reg_encoder(V0Q);
+      inaccurate_fma_reg = reg_encoder(V0Q);
+
+      if (need_accurate_fma_reg && round_c)
+      {
+        V1Q = fpr.GetScopedReg();
+        rounded_c_reg = reg_encoder(V1Q);
+      }
+    }
+    else if (fma && inaccurate_fma && VD == VB)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      inaccurate_fma_reg = reg_encoder(V0Q);
    }

    if (m_accurate_nans)
    {
-      if (V0Q == ARM64Reg::INVALID_REG)
-        V0Q = fpr.GetScopedReg();
+      if (V1Q == ARM64Reg::INVALID_REG)
+        V1Q = fpr.GetScopedReg();

      if (duplicated_c || VD == result_reg)
        V2Q = fpr.GetScopedReg();
    }

+    if (round_c)
+      Force25BitPrecision(rounded_c_reg, VC);
+
    switch (op5)
    {
    case 12:  // ps_muls0: d = a * c.ps0
@@ -235,8 +243,8 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
    FixupBranch nan_fixup;
    if (m_accurate_nans)
    {
-      const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
-      const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
+      const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V1Q) : EncodeRegToDouble(V1Q);
+      const ARM64Reg nan_temp_reg_paired = reg_encoder(V1Q);

      // Check if we need to handle NaNs