From caad84c636f30b47a30303ea1994bb61db63cc02 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 16 Aug 2025 11:51:07 +0200 Subject: [PATCH] JitArm64: Reduce register pressure for inaccurate FMA with accurate NaNs If result_reg is set to a temporary register instead of VD because of accurate NaNs, there's no need to allocate a secondary temporary register because of inaccurate FMA. --- .../JitArm64/JitArm64_FloatingPoint.cpp | 22 +++++----- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 40 +++++++++++-------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index e03cb50c2f6..6fc3c23531d 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -103,7 +103,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst) { Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG; - Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG; ARM64Reg rounded_c_reg = VC; if (round_c) @@ -115,21 +114,22 @@ void JitArm64::fp_arith(UGeckoInstruction inst) Force25BitPrecision(rounded_c_reg, VC); } - ARM64Reg inaccurate_fma_reg = VD; - if (fma && inaccurate_fma && VD == VB) - { - if (V0Q == ARM64Reg::INVALID_REG) - V0Q = fpr.GetScopedReg(); - inaccurate_fma_reg = reg_encoder(V0Q); - } - ARM64Reg result_reg = VD; + ARM64Reg inaccurate_fma_reg = VD; const bool preserve_d = m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); if (preserve_d) { - V1Q = fpr.GetScopedReg(); - result_reg = reg_encoder(V1Q); + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + result_reg = reg_encoder(V0Q); + inaccurate_fma_reg = reg_encoder(V0Q); + } + else if (fma && inaccurate_fma && VD == VB) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + inaccurate_fma_reg = reg_encoder(V0Q); } switch (op5) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index e1f3f096629..1504b300441 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -120,40 +120,48 @@ void JitArm64::ps_arith(UGeckoInstruction inst) if (round_c) { ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single"); - V0Q = fpr.GetScopedReg(); rounded_c_reg = reg_encoder(V0Q); - Force25BitPrecision(rounded_c_reg, VC); - } - - ARM64Reg inaccurate_fma_reg = VD; - if (fma && inaccurate_fma && VD == VB) - { - if (V0Q == ARM64Reg::INVALID_REG) - V0Q = fpr.GetScopedReg(); - inaccurate_fma_reg = reg_encoder(V0Q); } ARM64Reg result_reg = VD; + ARM64Reg inaccurate_fma_reg = VD; const bool need_accurate_fma_reg = fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg); const bool preserve_d = m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); if (need_accurate_fma_reg || preserve_d) { - V1Q = fpr.GetScopedReg(); - result_reg = reg_encoder(V1Q); + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + result_reg = reg_encoder(V0Q); + inaccurate_fma_reg = reg_encoder(V0Q); + + if (need_accurate_fma_reg && round_c) + { + V1Q = fpr.GetScopedReg(); + rounded_c_reg = reg_encoder(V1Q); + } + } + else if (fma && inaccurate_fma && VD == VB) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + inaccurate_fma_reg = reg_encoder(V0Q); } if (m_accurate_nans) { - if (V0Q == ARM64Reg::INVALID_REG) - V0Q = fpr.GetScopedReg(); + if (V1Q == ARM64Reg::INVALID_REG) + V1Q = fpr.GetScopedReg(); if (duplicated_c || VD == result_reg) V2Q = fpr.GetScopedReg(); } + if (round_c) + Force25BitPrecision(rounded_c_reg, VC); + switch (op5) { case 12: // ps_muls0: d = a * c.ps0 @@ -235,8 +243,8 @@ void JitArm64::ps_arith(UGeckoInstruction inst) FixupBranch nan_fixup; if (m_accurate_nans) { - const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q); - const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q); + const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V1Q) : EncodeRegToDouble(V1Q); + const ARM64Reg nan_temp_reg_paired = reg_encoder(V1Q); // Check if we need to handle NaNs