diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index d78257cf4fc..cdb0b7e625d 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -3156,6 +3156,10 @@ void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) EmitCopy(IsQuad(Rd), 0, imm5, 0, Rd, Rn); } +void ARM64FloatEmitter::EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 0, 3, Rd, Rn, Rm); +} void ARM64FloatEmitter::FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn) { Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xF, Rd, Rn); @@ -3505,6 +3509,53 @@ void ARM64FloatEmitter::UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale) EmitConversion2(sf, 0, false, type, 0, 3, 64 - scale, Rd, Rn); } +// Comparison +void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn); +} +void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn); +} +void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0x8, Rd, Rn); +} +void ARM64FloatEmitter::CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x6, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, MathUtil::IntLog2(size) - 3, 0x7, Rd, Rn, Rm); +} +void ARM64FloatEmitter::CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 1, MathUtil::IntLog2(size) - 3, 0x9, Rd, Rn); +} +void ARM64FloatEmitter::CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) +{ + Emit2RegMisc(IsQuad(Rd), 0, MathUtil::IntLog2(size) - 3, 0xA, Rd, Rn); +} +void ARM64FloatEmitter::CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(0, MathUtil::IntLog2(size) - 3, 0x11, Rd, Rn, Rm); +} + +// Float comparison void ARM64FloatEmitter::FCMP(ARM64Reg Rn, ARM64Reg Rm) { EmitCompare(0, 0, 0, 0, Rn, Rm); @@ -3664,7 +3715,7 @@ void ARM64FloatEmitter::SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", shift, src_size); - EmitShiftImm(1, 0, src_size | shift, 0b01010, Rd, Rn); + EmitShiftImm(IsQuad(Rd), 0, src_size | shift, 0b01010, Rd, Rn); } void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) @@ -3674,11 +3725,18 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, EmitShiftImm(upper, 0, src_size | shift, 0b10100, Rd, Rn); } +void ARM64FloatEmitter::SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) +{ + ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! {} {}", + shift, src_size); + EmitShiftImm(IsQuad(Rd), 0, src_size * 2 - shift, 0b00000, Rd, Rn); +} + void ARM64FloatEmitter::URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { ASSERT_MSG(DYNA_REC, shift < src_size, "Shift amount must be less than the element size! 
{} {}", shift, src_size); - EmitShiftImm(1, 1, src_size * 2 - shift, 0b00100, Rd, Rn); + EmitShiftImm(IsQuad(Rd), 1, src_size * 2 - shift, 0b00100, Rd, Rn); } void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 134f9b64cce..d0c91abd619 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -800,6 +800,7 @@ public: ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR; CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); } + void CNEG(ARM64Reg Rd, ARM64Reg Rn, CCFlags cond) { CSNEG(Rd, Rn, Rn, (CCFlags)((u32)cond ^ 1)); } void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); } void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option) { @@ -1281,6 +1282,7 @@ public: void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index); + void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); @@ -1342,6 +1344,19 @@ public: void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale); + // Comparison + void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + // Float comparison void FCMP(ARM64Reg Rn, ARM64Reg Rm); void FCMP(ARM64Reg Rn); @@ -1380,6 +1395,7 @@ public: void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); + void SSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index 0a122e11218..f9a8a015f92 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -2519,19 +2519,19 @@ void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg& arg) WriteSSEOp(0x66, 0x6C, dest, arg); } -void XEmitter::PSRLW(X64Reg reg, int shift) +void XEmitter::PSRLW(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); Write8(shift); } -void XEmitter::PSRLD(X64Reg reg, int shift) +void XEmitter::PSRLD(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); Write8(shift); } -void XEmitter::PSRLQ(X64Reg reg, int shift) +void XEmitter::PSRLQ(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); Write8(shift); @@ -2542,38 +2542,38 @@ void XEmitter::PSRLQ(X64Reg reg, const OpArg& arg) WriteSSEOp(0x66, 0xd3, reg, arg); } -void XEmitter::PSRLDQ(X64Reg reg, int shift) +void XEmitter::PSRLDQ(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); 
Write8(shift); } -void XEmitter::PSLLW(X64Reg reg, int shift) +void XEmitter::PSLLW(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); Write8(shift); } -void XEmitter::PSLLD(X64Reg reg, int shift) +void XEmitter::PSLLD(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); Write8(shift); } -void XEmitter::PSLLQ(X64Reg reg, int shift) +void XEmitter::PSLLQ(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); Write8(shift); } -void XEmitter::PSLLDQ(X64Reg reg, int shift) +void XEmitter::PSLLDQ(X64Reg reg, u8 shift) { WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); Write8(shift); } // WARNING not REX compatible -void XEmitter::PSRAW(X64Reg reg, int shift) +void XEmitter::PSRAW(X64Reg reg, u8 shift) { if (reg > 7) PanicAlertFmt("The PSRAW-emitter does not support regs above 7"); @@ -2585,7 +2585,7 @@ void XEmitter::PSRAW(X64Reg reg, int shift) } // WARNING not REX compatible -void XEmitter::PSRAD(X64Reg reg, int shift) +void XEmitter::PSRAD(X64Reg reg, u8 shift) { if (reg > 7) PanicAlertFmt("The PSRAD-emitter does not support regs above 7"); @@ -2695,6 +2695,11 @@ void XEmitter::BLENDPD(X64Reg dest, const OpArg& arg, u8 blend) Write8(blend); } +void XEmitter::PCMPEQQ(X64Reg dest, const OpArg& arg) +{ + WriteSSE41Op(0x66, 0x3829, dest, arg); +} + void XEmitter::PAND(X64Reg dest, const OpArg& arg) { WriteSSEOp(0x66, 0xDB, dest, arg); @@ -3038,6 +3043,12 @@ void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg) WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg); } +void XEmitter::VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift) +{ + WriteAVXOp(0x66, 0x73, (X64Reg)6, regOp1, R(regOp2)); + Write8(shift); +} + void XEmitter::VMOVAPS(const OpArg& arg, X64Reg regOp) { WriteAVXOp(0x00, 0x29, regOp, X64Reg::INVALID_REG, arg); diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index 69537709869..35d88a46bce 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -801,19 +801,19 @@ public: void PSHUFLW(X64Reg dest, const OpArg& arg, u8 shuffle); void PSHUFHW(X64Reg dest, const OpArg& arg, u8 shuffle); - void PSRLW(X64Reg reg, int shift); - void PSRLD(X64Reg reg, int shift); - void PSRLQ(X64Reg reg, int shift); + void PSRLW(X64Reg reg, u8 shift); + void PSRLD(X64Reg reg, u8 shift); + void PSRLQ(X64Reg reg, u8 shift); void PSRLQ(X64Reg reg, const OpArg& arg); - void PSRLDQ(X64Reg reg, int shift); + void PSRLDQ(X64Reg reg, u8 shift); - void PSLLW(X64Reg reg, int shift); - void PSLLD(X64Reg reg, int shift); - void PSLLQ(X64Reg reg, int shift); - void PSLLDQ(X64Reg reg, int shift); + void PSLLW(X64Reg reg, u8 shift); + void PSLLD(X64Reg reg, u8 shift); + void PSLLQ(X64Reg reg, u8 shift); + void PSLLDQ(X64Reg reg, u8 shift); - void PSRAW(X64Reg reg, int shift); - void PSRAD(X64Reg reg, int shift); + void PSRAW(X64Reg reg, u8 shift); + void PSRAD(X64Reg reg, u8 shift); // SSE4: data type conversions void PMOVSXBW(X64Reg dest, const OpArg& arg); @@ -836,6 +836,9 @@ public: void BLENDPS(X64Reg dest, const OpArg& arg, u8 blend); void BLENDPD(X64Reg dest, const OpArg& arg, u8 blend); + // SSE4: compare instructions + void PCMPEQQ(X64Reg dest, const OpArg& arg); + // AVX void VADDSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); void VSUBSS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); @@ -878,6 +881,8 @@ public: void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); + void VPSLLQ(X64Reg regOp1, X64Reg regOp2, u8 shift); + void VMOVAPS(const OpArg& 
arg, X64Reg regOp); void VZEROUPPER(); diff --git a/Source/Core/Core/Config/MainSettings.cpp b/Source/Core/Core/Config/MainSettings.cpp index e42796e8646..3ef8872274c 100644 --- a/Source/Core/Core/Config/MainSettings.cpp +++ b/Source/Core/Core/Config/MainSettings.cpp @@ -222,6 +222,7 @@ const Info MAIN_DIVIDE_BY_ZERO_EXCEPTIONS{{System::Main, "Core", "DivByZer false}; const Info MAIN_FPRF{{System::Main, "Core", "FPRF"}, false}; const Info MAIN_ACCURATE_NANS{{System::Main, "Core", "AccurateNaNs"}, false}; +const Info MAIN_ACCURATE_FMADDS{{System::Main, "Core", "AccurateFmadds"}, true}; const Info MAIN_DISABLE_ICACHE{{System::Main, "Core", "DisableICache"}, false}; const Info MAIN_EMULATION_SPEED{{System::Main, "Core", "EmulationSpeed"}, 1.0f}; #if defined(ANDROID) diff --git a/Source/Core/Core/Config/MainSettings.h b/Source/Core/Core/Config/MainSettings.h index b6a7094c933..27756d2e2a5 100644 --- a/Source/Core/Core/Config/MainSettings.h +++ b/Source/Core/Core/Config/MainSettings.h @@ -128,6 +128,7 @@ extern const Info MAIN_FLOAT_EXCEPTIONS; extern const Info MAIN_DIVIDE_BY_ZERO_EXCEPTIONS; extern const Info MAIN_FPRF; extern const Info MAIN_ACCURATE_NANS; +extern const Info MAIN_ACCURATE_FMADDS; extern const Info MAIN_DISABLE_ICACHE; extern const Info MAIN_EMULATION_SPEED; extern const Info MAIN_PRECISION_FRAME_TIMING; diff --git a/Source/Core/Core/ConfigLoaders/NetPlayConfigLoader.cpp b/Source/Core/Core/ConfigLoaders/NetPlayConfigLoader.cpp index 855cc181b92..7966cb32d27 100644 --- a/Source/Core/Core/ConfigLoaders/NetPlayConfigLoader.cpp +++ b/Source/Core/Core/ConfigLoaders/NetPlayConfigLoader.cpp @@ -80,6 +80,7 @@ public: layer->Set(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS, m_settings.divide_by_zero_exceptions); layer->Set(Config::MAIN_FPRF, m_settings.fprf); layer->Set(Config::MAIN_ACCURATE_NANS, m_settings.accurate_nans); + layer->Set(Config::MAIN_ACCURATE_FMADDS, m_settings.accurate_fmadds); layer->Set(Config::MAIN_DISABLE_ICACHE, m_settings.disable_icache); layer->Set(Config::MAIN_SYNC_ON_SKIP_IDLE, m_settings.sync_on_skip_idle); layer->Set(Config::MAIN_SYNC_GPU, m_settings.sync_gpu); diff --git a/Source/Core/Core/NetPlayProto.h b/Source/Core/Core/NetPlayProto.h index 62a9aba9a4e..d085a7cf2e7 100644 --- a/Source/Core/Core/NetPlayProto.h +++ b/Source/Core/Core/NetPlayProto.h @@ -68,6 +68,7 @@ struct NetSettings bool divide_by_zero_exceptions = false; bool fprf = false; bool accurate_nans = false; + bool accurate_fmadds = false; bool disable_icache = false; bool sync_on_skip_idle = false; bool sync_gpu = false; diff --git a/Source/Core/Core/NetPlayServer.cpp b/Source/Core/Core/NetPlayServer.cpp index 948dabebcd5..9f432486e23 100644 --- a/Source/Core/Core/NetPlayServer.cpp +++ b/Source/Core/Core/NetPlayServer.cpp @@ -1425,6 +1425,7 @@ bool NetPlayServer::SetupNetSettings() settings.divide_by_zero_exceptions = Config::Get(Config::MAIN_DIVIDE_BY_ZERO_EXCEPTIONS); settings.fprf = Config::Get(Config::MAIN_FPRF); settings.accurate_nans = Config::Get(Config::MAIN_ACCURATE_NANS); + settings.accurate_fmadds = Config::Get(Config::MAIN_ACCURATE_FMADDS); settings.disable_icache = Config::Get(Config::MAIN_DISABLE_ICACHE); settings.sync_on_skip_idle = Config::Get(Config::MAIN_SYNC_ON_SKIP_IDLE); settings.sync_gpu = Config::Get(Config::MAIN_SYNC_GPU); diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index 0727df26ae8..d01087fe1df 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ 
b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -342,12 +342,12 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double // - This will cause `d` to round to 100...00, meaning it will tie then round upwards. // 3. Tying up to even because `c` is too small // a. The highest bit of `d` is 1, the rest of the bits of `d` are 0 (this means it ties) - // b. The lowest bit of `f` is 1 (this means it ties to even downwards) + // b. The lowest bit of `f` is 1 (this means it ties to even upwards) // c. `c` is negative and does not round `d` downwards // - This is similar to the first one but in reverse, rounding up instead of down. // 4. Tying down because `d` rounded down // a. The highest and lowest bits of `d` are 1, the rest of the bits of `d` are 0 - // b. The lowest bit of `f` is 0 (this means it ties to even upwards) + // b. The lowest bit of `f` is 0 (this means it ties to even downwards) // c. `c` is negative, and the highest bit of c is 1, // and at least one other bit of c is nonzero // - The backwards counterpart to case 2, this will cause `d` to round back down to 100..00, @@ -375,12 +375,6 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double // - Correct ordering of NaN checking (for both double and single precision) // - Rounding frC up // - Rounding only once for single precision inputs (this will be the large majority of cases!) - // - Currently this is interpreter-only. - // This can be implemented in the JIT just as easily, though. - // Eventually the JITs should hopefully support detecting back to back - // single-precision operations, which will lead to no overhead at all. - // In the cases where JITs can't do this, an alternative method is used, as - // is done in the interpreter as well. // - Rounding only once for double precision inputs // - This is a side effect of how we handle single-precision inputs: By doing // error calculations rather than checking if every input is a float, we ensure that we know @@ -421,7 +415,7 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double const double b_sign = sub ? -b : b; result.value = std::fma(a, c_round, b_sign); - // We then check if we're currently tying in rounding directioh + // We then check if we're currently tying in rounding direction const u64 result_bits = std::bit_cast(result.value); // The mask of the `d` bits as shown in the above comments @@ -432,9 +426,8 @@ inline FPResult NI_madd_msub(PowerPC::PowerPCState& ppc_state, double a, double // Because we check this entire mask which includes a 1 bit, we can be sure that // if this result passes, the input is not an infinity that would become a NaN. - // This means that, for the JITs, if they only wanted to check for a subset of these - // bits (e.g. only checking if the last one was 0), then using the zero flag for a branch, - // they would have to check if the result was NaN before here. + // If we had only checked for a subset of these bits (e.g. only checking if the last + // one was 0), we would have needed to also check if the exponent was all ones. 
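For concreteness, a minimal standalone sketch of the tie test that follows, assuming D_MASK and EVEN_TIE are the same 0x1fffffff / 0x10000000 pair that the JIT asm routines later in this diff check (the 29 mantissa bits a double loses when it is subsequently rounded to single); the constant and helper names here are illustrative, not Dolphin identifiers:

#include <bit>
#include <cstdint>

// Assumed values: D_MASK covers the 29 bits discarded by the later
// round-to-single, EVEN_TIE is the halfway pattern within them.
constexpr std::uint64_t kAssumedDMask = 0x1FFFFFFF;
constexpr std::uint64_t kAssumedEvenTie = 0x10000000;

static bool IsPotentialTie(double fma_result)
{
  const std::uint64_t bits = std::bit_cast<std::uint64_t>(fma_result);
  // Requiring a set bit inside the mask also rules out infinities,
  // whose mantissa bits are all zero.
  return (bits & kAssumedDMask) == kAssumedEvenTie;
}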
if ((result_bits & D_MASK) == EVEN_TIE) { // Because we have a tie, we now compute any error in the FMA calculation diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index c0eda2f8f51..484be2bb286 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -1284,9 +1284,9 @@ BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const return cb.m_gqr_used & ~cb.m_gqr_modified; } -BitSet32 Jit64::CallerSavedRegistersInUse() const +BitSet32 Jit64::CallerSavedRegistersInUse(BitSet32 additional_registers) const { - BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16); + BitSet32 in_use = gpr.RegistersInUse() | (fpr.RegistersInUse() << 16) | additional_registers; return in_use & ABI_ALL_CALLER_SAVED; } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index ad5db1fa103..a14bc1dace0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -77,7 +77,7 @@ public: // Returns false if no free memory region can be found for either of the two. bool SetEmitterStateToFreeCodeRegion(); - BitSet32 CallerSavedRegistersInUse() const; + BitSet32 CallerSavedRegistersInUse(BitSet32 additional_registers = {}) const; BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const; void IntializeSpeculativeConstants(); @@ -153,9 +153,10 @@ public: void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true, bool duplicate = false); void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input); - void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, Gen::X64Reg clobber, - std::optional Ra, std::optional Rb, - std::optional Rc); + [[nodiscard]] Gen::FixupBranch HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm, + Gen::X64Reg clobber, std::optional Ra, + std::optional Rb, + std::optional Rc); void MultiplyImmediate(u32 imm, int a, int d, bool overflow); diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp index 05d08f767ee..b7735595125 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp @@ -265,6 +265,10 @@ void Jit64AsmRoutineManager::GenerateCommon() GenMfcr(); cdts = AlignCode4(); GenConvertDoubleToSingle(); + fmadds_eft = AlignCode4(); + GenerateFmaddsEft(); + ps_madd_eft = AlignCode4(); + GeneratePsMaddEft(); GenQuantizedLoads(); GenQuantizedSingleLoads(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 75cfbed3d63..4cffa573c70 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -93,8 +93,9 @@ void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input) SetFPRFIfNeeded(input, false); } -void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::optional Ra, - std::optional Rb, std::optional Rc) +FixupBranch Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, + std::optional Ra, std::optional Rb, + std::optional Rc) { // | PowerPC | x86 // ---------------------+----------+--------- @@ -104,9 +105,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: // Dragon Ball: Revenge of King Piccolo requires generated NaNs // to be positive, so we'll have to handle them manually. 
- if (!m_accurate_nans) - return; - if (inst.OPCD != 4) { // not paired-single @@ -140,7 +138,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: FixupBranch done = J(Jump::Near); SwitchToNearCode(); - SetJumpTarget(done); + return done; } else { @@ -217,7 +215,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std:: FixupBranch done = J(Jump::Near); SwitchToNearCode(); - SetJumpTarget(done); + return done; } } @@ -329,14 +327,21 @@ void Jit64::fp_arith(UGeckoInstruction inst) } } - switch (inst.SUBOP5) + if (m_accurate_nans) { - case 18: - HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt); - break; - case 25: - HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2); - break; + std::optional handled_nans; + switch (inst.SUBOP5) + { + case 18: + handled_nans = HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt); + break; + case 25: + handled_nans = HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2); + break; + } + + if (handled_nans) + SetJumpTarget(*handled_nans); } if (single) @@ -368,51 +373,87 @@ void Jit64::fmaddXX(UGeckoInstruction inst) const bool use_fma = Config::Get(Config::SESSION_USE_FMA); const bool software_fma = use_fma && !cpu_info.bFMA; - int a = inst.FA; - int b = inst.FB; - int c = inst.FC; - int d = inst.FD; - bool single = inst.OPCD == 4 || inst.OPCD == 59; - bool round_input = single && !js.op->fprIsSingle[c]; - bool preserve_inputs = m_accurate_nans; - bool preserve_d = preserve_inputs && (a == d || b == d || c == d); - bool packed = - inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] && - js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]); + const int a = inst.FA; + const int b = inst.FB; + const int c = inst.FC; + const int d = inst.FD; const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30; // msub, nmsub const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31; // nmsub, nmadd const bool madds0 = inst.SUBOP5 == 14; const bool madds1 = inst.SUBOP5 == 15; - const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1); + const bool single = inst.OPCD == 4 || inst.OPCD == 59; + const bool round_input = single && !js.op->fprIsSingle[c]; + + const bool error_free_transformation = single && m_accurate_fmadds; + const bool packed = + inst.OPCD == 4 || + (!cpu_info.bAtom && !software_fma && !error_free_transformation && single && + js.op->fprIsDuplicated[a] && js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]); + + const bool want_rc_rounded = + (error_free_transformation || (software_fma && packed)) && round_input; + const bool error_free_transformation_wants_rc_duplicated = + (error_free_transformation && !want_rc_rounded) && (madds0 || madds1); + const bool accurate_nans_wants_rc_duplicated = m_accurate_nans && (madds0 || madds1); + const bool want_rc_duplicated = + error_free_transformation_wants_rc_duplicated || accurate_nans_wants_rc_duplicated; + + const bool preserve_d_due_to_a_or_b = + (m_accurate_nans || error_free_transformation) && (a == d || b == d); + const bool preserve_d_due_to_c = + c == d && ((m_accurate_nans && (!want_rc_duplicated || software_fma)) || + (error_free_transformation && !want_rc_rounded)); + const bool preserve_d = preserve_d_due_to_a_or_b || preserve_d_due_to_c; X64Reg scratch_xmm = XMM0; X64Reg result_xmm = XMM1; X64Reg Rc_duplicated = XMM2; + X64Reg Rc_rounded = XMM3; + + BitSet32 scratch_registers{XMM0 + 16, XMM1 + 16}; + + RCX64Reg xmm2_guard; + RCX64Reg xmm3_guard; + if (error_free_transformation) 
+ { + xmm2_guard = fpr.Scratch(XMM2); + xmm3_guard = fpr.Scratch(XMM3); + RegCache::Realize(xmm2_guard, xmm3_guard); + scratch_registers[XMM2 + 16] = true; + scratch_registers[XMM3 + 16] = true; + } + else if (software_fma) + { + xmm2_guard = fpr.Scratch(XMM2); + RegCache::Realize(xmm2_guard); + scratch_registers[XMM2 + 16] = true; + } RCOpArg Ra; RCOpArg Rb; RCOpArg Rc; RCX64Reg Rd; - RCX64Reg xmm2_guard; RCX64Reg result_xmm_guard; RCX64Reg Rc_duplicated_guard; if (software_fma) { - xmm2_guard = fpr.Scratch(XMM2); - Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read); - Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); - Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read); + Ra = packed || error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read); + Rb = packed || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); + Rc = packed || (error_free_transformation && !want_rc_rounded && !want_rc_duplicated) ? + fpr.Bind(c, RCMode::Read) : + fpr.Use(c, RCMode::Read); Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite); if (preserve_d && packed) { result_xmm_guard = fpr.Scratch(); - RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard); + RegCache::Realize(Ra, Rb, Rc, Rd, result_xmm_guard); result_xmm = Gen::X64Reg(result_xmm_guard); + scratch_registers[result_xmm + 16] = true; } else { - RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard); + RegCache::Realize(Ra, Rb, Rc, Rd); result_xmm = packed ? Gen::X64Reg(Rd) : XMM0; } } @@ -421,48 +462,88 @@ void Jit64::fmaddXX(UGeckoInstruction inst) // For use_fma == true: // Statistics suggests b is a lot less likely to be unbound in practice, so // if we have to pick one of a or b to bind, let's make it b. - Ra = fpr.Use(a, RCMode::Read); - Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); - Rc = fpr.Use(c, RCMode::Read); + Ra = error_free_transformation ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read); + Rb = + use_fma || error_free_transformation ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); + Rc = error_free_transformation && !want_rc_rounded && !want_rc_duplicated ? + fpr.Bind(c, RCMode::Read) : + fpr.Use(c, RCMode::Read); Rd = fpr.Bind(d, single ? 
RCMode::Write : RCMode::ReadWrite); RegCache::Realize(Ra, Rb, Rc, Rd); - - if (madds_accurate_nans) - { - Rc_duplicated_guard = fpr.Scratch(); - RegCache::Realize(Rc_duplicated_guard); - Rc_duplicated = Rc_duplicated_guard; - } } + if (error_free_transformation_wants_rc_duplicated || + (accurate_nans_wants_rc_duplicated && + ((!software_fma && !error_free_transformation) || (error_free_transformation && packed)))) + { + Rc_duplicated_guard = fpr.Scratch(); + RegCache::Realize(Rc_duplicated_guard); + Rc_duplicated = Rc_duplicated_guard; + scratch_registers[Rc_duplicated + 16] = true; + } + + const auto registers_to_save = [&](BitSet32 scratch_registers_to_save) { + const BitSet32 scratch_registers_not_to_save = scratch_registers & ~scratch_registers_to_save; + return CallerSavedRegistersInUse(scratch_registers_to_save) & ~scratch_registers_not_to_save; + }; + if (software_fma) { + if (want_rc_rounded) + { + if (error_free_transformation && madds0) + { + MOVDDUP(Rc_rounded, Rc); + Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2); + } + else if (error_free_transformation && madds1) + { + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_rounded, Rc, Rc, 3); + Force25BitPrecision(Rc_rounded, R(Rc_rounded), XMM2); + } + else + { + Force25BitPrecision(Rc_rounded, Rc, XMM2); + } + } + for (size_t i = (packed ? 1 : 0); i != std::numeric_limits::max(); --i) { - if ((i == 0 || madds0) && !madds1) + if (madds0 || (i == 0 && !madds1) || (want_rc_rounded && error_free_transformation && madds1)) { - if (round_input) + if (want_rc_rounded) + MOVAPD(XMM1, R(Rc_rounded)); + else if (round_input) Force25BitPrecision(XMM1, Rc, XMM2); + else if (Rc.IsSimpleReg()) + MOVAPD(XMM1, Rc); else MOVSD(XMM1, Rc); } else { - MOVHLPS(XMM1, Rc.GetSimpleReg()); - if (round_input) + MOVHLPS(XMM1, want_rc_rounded ? Rc_rounded : Rc.GetSimpleReg()); + if (round_input && !want_rc_rounded) Force25BitPrecision(XMM1, R(XMM1), XMM2); } // Write the result from the previous loop iteration into result_xmm so we don't lose it. // It's important that this is done after reading Rc above, in case we have madds1 and - // result_xmm == Rd == Rc. + // !want_rc_rounded and result_xmm == Rd == Rc. 
if (packed && i == 0) MOVLHPS(result_xmm, XMM0); if (i == 0) { - MOVSD(XMM0, Ra); - MOVSD(XMM2, Rb); + if (Ra.IsSimpleReg()) + MOVAPD(XMM0, Ra); + else + MOVSD(XMM0, Ra); + + if (Rb.IsSimpleReg()) + MOVAPD(XMM2, Rb); + else + MOVSD(XMM2, Rb); } else { @@ -473,23 +554,36 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (subtract) XORPS(XMM2, MConst(psSignBits)); - BitSet32 registers_in_use = CallerSavedRegistersInUse(); + BitSet32 scratch_registers_to_save{}; + if (packed && i == 0) + scratch_registers_to_save[result_xmm + 16] = true; + if (want_rc_rounded && (error_free_transformation || i == 1)) + scratch_registers_to_save[Rc_rounded + 16] = true; + + const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save); ABI_PushRegistersAndAdjustStack(registers_in_use, 0); ABI_CallFunction(static_cast(&std::fma)); ABI_PopRegistersAndAdjustStack(registers_in_use, 0); } if (packed) + { + // result_xmm's upper lane has the result of the first loop iteration MOVSD(R(result_xmm), XMM0); + } else + { DEBUG_ASSERT(result_xmm == XMM0); + } - if (madds_accurate_nans) + if (want_rc_duplicated) { if (madds0) MOVDDUP(Rc_duplicated, Rc); - else + else if (madds1) avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_duplicated, Rc, Rc, 3); + else + DEBUG_ASSERT(false); } } else @@ -497,7 +591,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (madds0) { MOVDDUP(result_xmm, Rc); - if (madds_accurate_nans) + if (want_rc_duplicated) MOVAPD(R(Rc_duplicated), result_xmm); if (round_input) Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm); @@ -505,18 +599,21 @@ void Jit64::fmaddXX(UGeckoInstruction inst) else if (madds1) { avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3); - if (madds_accurate_nans) + if (want_rc_duplicated) MOVAPD(R(Rc_duplicated), result_xmm); if (round_input) Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm); } else { + DEBUG_ASSERT(!want_rc_duplicated); if (round_input) Force25BitPrecision(result_xmm, Rc, scratch_xmm); else MOVAPD(result_xmm, Rc); } + if (want_rc_rounded) + MOVAPD(R(Rc_rounded), result_xmm); if (use_fma) { @@ -556,6 +653,160 @@ void Jit64::fmaddXX(UGeckoInstruction inst) } } + if (m_accurate_nans && result_xmm == XMM0) + { + // HandleNaNs needs to clobber XMM0 + result_xmm = error_free_transformation ? XMM1 : Rd; + MOVAPD(result_xmm, R(XMM0)); + DEBUG_ASSERT(!preserve_d); + } + + std::optional handled_nans; + if (!packed && m_accurate_nans) + { + // The clobber register is unused when not packed. + handled_nans = + HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc); + } + + // Read the comment in the interpreter function NI_madd_msub to find out what's going on here. + if (error_free_transformation) + { + if (result_xmm != XMM1) + { + MOVAPD(XMM1, R(result_xmm)); + result_xmm = XMM1; + } + + X64Reg Rc_rounded_duplicated = Rc.GetSimpleReg(); + BitSet32 scratch_registers_to_save = {XMM1 + 16, XMM2 + 16}; + if (want_rc_rounded) + { + Rc_rounded_duplicated = Rc_rounded; + scratch_registers_to_save[Rc_rounded] = true; + } + else if (want_rc_duplicated) + { + Rc_rounded_duplicated = Rc_duplicated; + scratch_registers_to_save[want_rc_duplicated] = true; + } + + // We've calculated s := a + b, with a = Ra * Rc_rounded_duplicated, b = subtract ? 
-Rb : Rb + + if (packed) + { + // a' := s - b + if (subtract) + avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM0, R(XMM1), Rb); + else + avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM1), Rb); + + // b' := s - a' + avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM1), R(XMM0)); + + // da := a - a' + if (software_fma) + { + scratch_registers_to_save[XMM0 + 16] = true; + const BitSet32 registers_in_use_1 = registers_to_save(scratch_registers_to_save); + ABI_PushRegistersAndAdjustStack(registers_in_use_1, 0); + + avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits)); + MOVAPD(XMM0, R(Rc_rounded_duplicated)); + MOVAPD(XMM1, Ra); + + ABI_CallFunction(static_cast(&std::fma)); + + // We will read from the upper lane of Rc_rounded_duplicated later, + // so we need to make sure that that lane isn't overwritten. + if (Rc_rounded_duplicated == XMM3) + MOVSD(XMM3, R(XMM0)); + else + MOVAPD(XMM3, R(XMM0)); + + ABI_PopRegistersAndAdjustStack(registers_in_use_1, 0); + + scratch_registers_to_save[XMM0 + 16] = false; + scratch_registers_to_save[XMM3 + 16] = true; + const BitSet32 registers_in_use_2 = registers_to_save(scratch_registers_to_save); + ABI_PushRegistersAndAdjustStack(registers_in_use_2, 0); + + MOVHLPS(XMM2, XMM0); + XORPS(XMM2, MConst(psSignBits)); + MOVHLPS(XMM0, Rc_rounded_duplicated); + MOVHLPS(XMM1, Ra.GetSimpleReg()); + + ABI_CallFunction(static_cast(&std::fma)); + + ABI_PopRegistersAndAdjustStack(registers_in_use_2, 0); + + UNPCKLPD(XMM0, R(XMM3)); + } + else if (use_fma) + { + VFMSUB231PD(XMM0, Rc_rounded_duplicated, Ra); + } + else + { + avx_op(&XEmitter::VMULPD, &XEmitter::MULPD, XMM3, R(Rc_rounded_duplicated), Ra); + avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM0, R(XMM3), R(XMM0), true, false, XMM3); + } + + // db := b - b' + // (Transformed into -db := b' - b) + if (subtract) + avx_op(&XEmitter::VADDPD, &XEmitter::ADDPD, XMM2, R(XMM2), Rb); + else + avx_op(&XEmitter::VSUBPD, &XEmitter::SUBPD, XMM2, R(XMM2), Rb); + + CALL(GetAsmRoutines()->ps_madd_eft); + } + else + { + // a' := s - b + if (subtract) + avx_op(&XEmitter::VADDSD, &XEmitter::ADDSD, XMM0, R(XMM1), Rb, false); + else + avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM1), Rb, false); + + // b' := s - a' + avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM2, R(XMM1), R(XMM0), false); + + // da := a - a' + if (software_fma) + { + const BitSet32 registers_in_use = registers_to_save(scratch_registers_to_save); + ABI_PushRegistersAndAdjustStack(registers_in_use, 0); + + avx_op(&XEmitter::VXORPS, &XEmitter::XORPS, XMM2, R(XMM0), MConst(psSignBits)); + MOVAPD(XMM0, R(Rc_rounded_duplicated)); + MOVAPD(XMM1, Ra); + + ABI_CallFunction(static_cast(&std::fma)); + + ABI_PopRegistersAndAdjustStack(registers_in_use, 0); + } + else if (use_fma) + { + VFMSUB231SD(XMM0, Rc_rounded_duplicated, Ra); + } + else + { + avx_op(&XEmitter::VMULSD, &XEmitter::MULSD, XMM3, R(Rc_rounded_duplicated), Ra, false); + avx_op(&XEmitter::VSUBSD, &XEmitter::SUBSD, XMM0, R(XMM3), R(XMM0), false, false, XMM3); + } + + // db := b - b' + // (Transformed into -db := b' - b) + if (subtract) + ADDSD(XMM2, Rb); + else + SUBSD(XMM2, Rb); + + CALL(GetAsmRoutines()->fmadds_eft); + } + } + // Using x64's nmadd/nmsub would require us to swap the sign of the addend // (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes. // Also, PowerPC's nmadd/nmsub round before the final negation unlike x64's nmadd/nmsub. 
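To make the emitted sequence above easier to follow, here is a scalar C++ sketch of the error-free transformation it implements (illustrative names, not Dolphin code); the combined error da + db is what the fmadds_eft routine later compares against zero to decide whether a tied result needs its last bit nudged:

#include <cmath>

// s = fl(a + b), where a = ra * rc (conceptually the exact product) and
// b = subtract ? -rb : rb. The a'/b'/da/db steps mirror the comments above.
static double MaddWithErrorTerms(double ra, double rc, double rb, bool subtract,
                                 double* error_out)
{
  const double b = subtract ? -rb : rb;
  const double s = std::fma(ra, rc, b);          // result, possibly mis-rounded for
                                                 // the later round-to-single
  const double a_prime = s - b;                  // a' := s - b
  const double b_prime = s - a_prime;            // b' := s - a'
  const double da = std::fma(ra, rc, -a_prime);  // da := a - a' (fused form keeps the
                                                 // product unrounded; the non-FMA path
                                                 // approximates this with mul + sub)
  const double db = b - b_prime;                 // db := b - b'
  *error_out = da + db;                          // sign says which way s should round
  return s;
}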
@@ -563,16 +814,19 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (negate) XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits)); - if (m_accurate_nans && result_xmm == XMM0) + if (packed && m_accurate_nans) { - // HandleNaNs needs to clobber XMM0 - MOVAPD(Rd, R(result_xmm)); - result_xmm = Rd; - DEBUG_ASSERT(!preserve_d); + // If packed, the clobber register must be XMM0. + handled_nans = + HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, want_rc_duplicated ? R(Rc_duplicated) : Rc); } - // If packed, the clobber register must be XMM0. If not packed, the clobber register is unused. - HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, madds_accurate_nans ? R(Rc_duplicated) : Rc); + // If the handled_nans branch was taken in the non-packed case, that means the result is NaN, + // so we can skip the XORPD and the error-free transformation. If the handled_nans branch was + // taken in the packed case, we don't know if both of the results were NaN or only one, so we + // can't skip anything. + if (handled_nans) + SetJumpTarget(*handled_nans); if (single) FinalizeSingleResult(Rd, R(result_xmm), packed, true); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 930757cdadd..8e44eb78b40 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -100,12 +100,19 @@ void Jit64::ps_muls(UGeckoInstruction inst) default: PanicAlertFmt("ps_muls WTF!!!"); } + if (round_input) Force25BitPrecision(XMM1, R(Rc_duplicated), XMM0); else if (XMM1 != Rc_duplicated) MOVAPD(XMM1, Rc_duplicated); MULPD(XMM1, Ra); - HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated); + + if (m_accurate_nans) + { + const FixupBranch handled_nans = HandleNaNs(inst, XMM1, XMM0, Ra, std::nullopt, Rc_duplicated); + SetJumpTarget(handled_nans); + } + FinalizeSingleResult(Rd, R(XMM1)); } diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp index 0c186d0972f..d1415c09c40 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp @@ -741,7 +741,8 @@ void EmuCodeBlock::JitClearCA() // Abstract between AVX and SSE: automatically handle 3-operand instructions void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp, - const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible) + const OpArg& arg1, const OpArg& arg2, bool packed, bool reversible, + X64Reg scratch) { if (arg1.IsSimpleReg(regOp)) { @@ -778,19 +779,19 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), else { // The ugly case: Not reversible, and we have regOp == arg2 without AVX or with arg1 == memory - if (!arg1.IsSimpleReg(XMM0)) - MOVAPD(XMM0, arg1); + if (!arg1.IsSimpleReg(scratch)) + MOVAPD(scratch, arg1); if (cpu_info.bAVX) { - (this->*avxOp)(regOp, XMM0, arg2); + (this->*avxOp)(regOp, scratch, arg2); } else { - (this->*sseOp)(XMM0, arg2); + (this->*sseOp)(scratch, arg2); if (packed) - MOVAPD(regOp, R(XMM0)); + MOVAPD(regOp, R(scratch)); else - MOVSD(regOp, R(XMM0)); + MOVSD(regOp, R(scratch)); } } } @@ -798,7 +799,7 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), // Abstract between AVX and SSE: automatically handle 3-operand instructions void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&, u8), void (XEmitter::*sseOp)(X64Reg, const 
OpArg&, u8), X64Reg regOp, - const OpArg& arg1, const OpArg& arg2, u8 imm) + const OpArg& arg1, const OpArg& arg2, u8 imm, X64Reg scratch) { if (arg1.IsSimpleReg(regOp)) { @@ -816,21 +817,40 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&, else { // The ugly case: regOp == arg2 without AVX, or with arg1 == memory - if (!arg1.IsSimpleReg(XMM0)) - MOVAPD(XMM0, arg1); + if (!arg1.IsSimpleReg(scratch)) + MOVAPD(scratch, arg1); if (cpu_info.bAVX) { - (this->*avxOp)(regOp, XMM0, arg2, imm); + (this->*avxOp)(regOp, scratch, arg2, imm); } else { - (this->*sseOp)(XMM0, arg2, imm); - if (regOp != XMM0) - MOVAPD(regOp, R(XMM0)); + (this->*sseOp)(scratch, arg2, imm); + if (regOp != scratch) + MOVAPD(regOp, R(scratch)); } } } +// Abstract between AVX and SSE: automatically handle 3-operand instructions +void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, u8), + void (XEmitter::*sseOp)(X64Reg, u8), X64Reg regOp1, X64Reg regOp2, u8 imm) +{ + if (regOp1 == regOp2) + { + (this->*sseOp)(regOp1, imm); + } + else if (cpu_info.bAVX) + { + (this->*avxOp)(regOp1, regOp2, imm); + } + else + { + MOVAPD(regOp1, R(regOp2)); + (this->*sseOp)(regOp1, imm); + } +} + alignas(16) static const u64 psMantissaTruncate[2] = {0xFFFFFFFFF8000000ULL, 0xFFFFFFFFF8000000ULL}; alignas(16) static const u64 psRoundBit[2] = {0x8000000, 0x8000000}; @@ -842,8 +862,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg { if (m_jit.jo.accurateSinglePrecision) { + DEBUG_ASSERT(output != tmp); // mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1); - if (input.IsSimpleReg() && cpu_info.bAVX) + if (input.IsSimpleReg() && !input.IsSimpleReg(tmp) && cpu_info.bAVX) { VPAND(tmp, input.GetSimpleReg(), MConst(psRoundBit)); VPAND(output, input.GetSimpleReg(), MConst(psMantissaTruncate)); diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h index 54cade23c5a..b53b5da67b4 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h @@ -113,10 +113,14 @@ public: void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&), Gen::X64Reg regOp, const Gen::OpArg& arg1, const Gen::OpArg& arg2, bool packed = true, - bool reversible = false); + bool reversible = false, Gen::X64Reg scratch = Gen::XMM0); void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, const Gen::OpArg&, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp, - const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm); + const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm, + Gen::X64Reg scratch = Gen::XMM0); + void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, u8), + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, u8), Gen::X64Reg regOp1, Gen::X64Reg regOp2, + u8 imm); void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp); diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp index de5527d2d8e..437c6c01795 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -326,6 +326,98 @@ void CommonAsmRoutines::GenMfcr() Common::JitRegister::Register(start, GetCodePtr(), "JIT_Mfcr"); } +// Inputs: +// XMM0: First error term +// XMM1: Result with 
potentially incorrect rounding +// XMM2: Second error term, negated +// +// Outputs result with corrected rounding in XMM1. +// Clobbers RSCRATCH, RSCRATCH2, XMM0, XMM2, and flags. +void CommonAsmRoutines::GenerateFmaddsEft() +{ + // Check if XMM1 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000 + MOVQ_xmm(R(RSCRATCH), XMM1); + MOV(32, R(RSCRATCH2), Imm32(0x80000000)); + LEA(32, RSCRATCH2, MComplex(RSCRATCH2, RSCRATCH, SCALE_8, 0)); + TEST(32, R(RSCRATCH2), R(RSCRATCH2)); + FixupBranch even_tie = J_CC(CCFlags::CC_Z); + + const u8* ret = GetCodePtr(); + RET(); + + // Check if the error is 0 + SetJumpTarget(even_tie); + SUBSD(XMM0, R(XMM2)); + XORPD(XMM2, R(XMM2)); + UCOMISD(XMM0, R(XMM2)); + J_CC(CCFlags::CC_E, ret); + + // Round XMM1 up or down + MOVQ_xmm(R(RSCRATCH2), XMM0); + XOR(64, R(RSCRATCH2), R(RSCRATCH)); + SAR(64, R(RSCRATCH2), Imm8(63)); + OR(64, R(RSCRATCH2), Imm8(1)); + ADD(64, R(RSCRATCH), R(RSCRATCH2)); + MOVQ_xmm(XMM1, R(RSCRATCH)); + RET(); +} + +alignas(16) static const __m128i double_msb = _mm_set_epi64x(0x8000000000000000, + 0x8000000000000000); +alignas(16) static const __m128i double_lsb = _mm_set_epi64x(1, 1); + +// Inputs: +// XMM0: First error terms +// XMM1: Results with potentially incorrect rounding +// XMM2: Second error terms, negated +// +// Outputs results with corrected rounding in XMM1. Clobbers RSCRATCH, XMM0-XMM3, and flags. +void CommonAsmRoutines::GeneratePsMaddEft() +{ + // Check if XMM1 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000 + avx_op(&XEmitter::VPSLLQ, &XEmitter::PSLLQ, XMM3, XMM1, 35); + if (cpu_info.bSSE4_1) + { + PCMPEQQ(XMM3, MConst(double_msb)); + } + else + { + PCMPEQW(XMM3, MConst(double_msb)); + PSHUFD(XMM3, R(XMM3), 0xF5); + } + + // Just for performance, exit early if there is no even tie + if (cpu_info.bSSE4_1) + { + PTEST(XMM3, R(XMM3)); + } + else + { + PMOVMSKB(RSCRATCH, R(XMM3)); + TEST(32, R(RSCRATCH), R(RSCRATCH)); + } + FixupBranch even_tie = J_CC(CCFlags::CC_NZ); + RET(); + SetJumpTarget(even_tie); + + // Check if the error is zero + SUBPD(XMM0, R(XMM2)); + XORPD(XMM2, R(XMM2)); + CMPPD(XMM2, R(XMM0), CMP_EQ); + + // Store -1 or 1 in XMM0 depending on whether we're rounding down or up + PXOR(XMM0, R(XMM1)); + PSRAD(XMM0, 31); + PSHUFD(XMM0, R(XMM0), 0xF5); + POR(XMM0, MConst(double_lsb)); + + // Round the elements that have both a non-zero error and an even tie + PANDN(XMM2, R(XMM3)); + PAND(XMM0, R(XMM2)); + PADDQ(XMM1, R(XMM0)); + RET(); +} + // Safe + Fast Quantizers, originally from JITIL by magumagu alignas(16) static const float m_65535[4] = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; alignas(16) static const float m_32767 = 32767.0f; diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h index ffac2da1a1c..8d60b005b35 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h @@ -33,6 +33,8 @@ public: protected: void GenConvertDoubleToSingle(); + void GenerateFmaddsEft(); + void GeneratePsMaddEft(); const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type); const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type); void GenQuantizedLoads(); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 78288adb131..89de1e5d6e2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -324,6 +324,8 @@ protected: void GenerateConvertDoubleToSingle(); void 
GenerateConvertSingleToDouble(); void GenerateFPRF(bool single); + void GenerateFmaddsEft(); + void GeneratePsMaddEft(); void GenerateQuantizedLoads(); void GenerateQuantizedStores(); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index e03cb50c2f6..f2ac356fc56 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -79,9 +79,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst) const bool use_b = op5 != 25; // fmul uses no B const bool fma = use_b && use_c; const bool negate_result = (op5 & ~0x1) == 30; + const bool negate_b = op5 == 28 || op5 == 30; const bool output_is_single = inst.OPCD == 59; - const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA); + const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA); + const bool error_free_transformation_requested = fma && m_accurate_fmadds; const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[c]; const auto inputs_are_singles_func = [&] { @@ -89,13 +91,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst) (!use_c || fpr.IsSingle(c, true)); }; - const bool single = inputs_are_singles_func() && output_is_single && !inaccurate_fma; + const bool single = inputs_are_singles_func() && output_is_single && + (error_free_transformation_requested || !nonfused_requested); const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair; const RegType type_out = output_is_single ? (single ? RegType::DuplicatedSingle : RegType::Duplicated) : RegType::LowerPair; const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble; + const bool nonfused = nonfused_requested && !single; + const bool error_free_transformation = + error_free_transformation_requested && !single && output_is_single; + + if (error_free_transformation) + { + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); + fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2); + } + const ARM64Reg VA = reg_encoder(fpr.R(a, type)); const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; const ARM64Reg VC = use_c ? 
reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; @@ -103,33 +116,47 @@ void JitArm64::fp_arith(UGeckoInstruction inst) { Arm64FPRCache::ScopedARM64Reg V0Q = ARM64Reg::INVALID_REG; - Arm64FPRCache::ScopedARM64Reg V1Q = ARM64Reg::INVALID_REG; ARM64Reg rounded_c_reg = VC; if (round_c) { - ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single"); - V0Q = fpr.GetScopedReg(); rounded_c_reg = reg_encoder(V0Q); - Force25BitPrecision(rounded_c_reg, VC); - } - - ARM64Reg inaccurate_fma_reg = VD; - if (fma && inaccurate_fma && VD == VB) - { - if (V0Q == ARM64Reg::INVALID_REG) - V0Q = fpr.GetScopedReg(); - inaccurate_fma_reg = reg_encoder(V0Q); } ARM64Reg result_reg = VD; - const bool preserve_d = - m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); - if (preserve_d) + ARM64Reg nonfused_reg = VD; + if (error_free_transformation) { - V1Q = fpr.GetScopedReg(); - result_reg = reg_encoder(V1Q); + result_reg = reg_encoder(ARM64Reg::Q0); + nonfused_reg = reg_encoder(ARM64Reg::Q0); + + if (nonfused && V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + } + else + { + const bool preserve_d = + m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); + if (preserve_d) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + result_reg = reg_encoder(V0Q); + nonfused_reg = reg_encoder(V0Q); + } + else if (fma && nonfused && VD == VB) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetScopedReg(); + nonfused_reg = reg_encoder(V0Q); + } + } + + if (round_c) + { + ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single"); + Force25BitPrecision(rounded_c_reg, VC); } switch (op5) @@ -152,10 +179,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst) // So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub. case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm" case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)" - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); - m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg); + m_float_emit.FSUB(result_reg, nonfused_reg, VB); } else { @@ -164,10 +191,10 @@ void JitArm64::fp_arith(UGeckoInstruction inst) break; case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)" - if (inaccurate_fma) + if (nonfused) { - m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); - m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB); + m_float_emit.FMUL(nonfused_reg, VA, rounded_c_reg); + m_float_emit.FADD(result_reg, nonfused_reg, VB); } else { @@ -180,6 +207,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst) } Common::SmallVector nan_fixups; + std::optional nan_early_fixup; if (m_accurate_nans) { // Check if we need to handle NaNs @@ -216,7 +244,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst) SetJumpTarget(skip); } - std::optional nan_early_fixup; if (negate_result) { // If we have a NaN, we must not execute FNEG. @@ -230,11 +257,46 @@ void JitArm64::fp_arith(UGeckoInstruction inst) } SwitchToNearCode(); - - if (nan_early_fixup) - SetJumpTarget(*nan_early_fixup); } + // Read the comment in the interpreter function NI_madd_msub to find out what's going on here + if (error_free_transformation) + { + // We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? 
-VB : VB)
+
+    // a' := s - b
+    if (negate_b)
+      m_float_emit.FADD(ARM64Reg::D1, result_reg, VB);
+    else
+      m_float_emit.FSUB(ARM64Reg::D1, result_reg, VB);
+
+    // b' := s - a'
+    m_float_emit.FSUB(ARM64Reg::D2, result_reg, ARM64Reg::D1);
+
+    // da := a - a'
+    if (nonfused)
+    {
+      m_float_emit.FMUL(EncodeRegToDouble(V0Q), VA, rounded_c_reg);
+      m_float_emit.FSUB(ARM64Reg::D1, EncodeRegToDouble(V0Q), ARM64Reg::D1);
+    }
+    else
+    {
+      m_float_emit.FNMSUB(ARM64Reg::D1, VA, rounded_c_reg, ARM64Reg::D1);
+    }
+
+    // db := b - b'
+    // (Transformed into -db := b' - b)
+    if (negate_b)
+      m_float_emit.FADD(ARM64Reg::D2, ARM64Reg::D2, VB);
+    else
+      m_float_emit.FSUB(ARM64Reg::D2, ARM64Reg::D2, VB);
+
+    BL(GetAsmRoutines()->fmadds_eft);
+  }
+
+  if (nan_early_fixup)
+    SetJumpTarget(*nan_early_fixup);
+
   // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
   // for any of AArch64's FMA instructions, so we negate using a separate instruction.
   if (negate_result)
@@ -254,7 +316,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     fpr.FixSinglePrecision(d);
   }
 
+  if (error_free_transformation)
+    gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
+
   SetFPRFIfNeeded(output_is_single, VD);
+
+  if (error_free_transformation)
+    fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2);
 }
 
 void JitArm64::fp_logic(UGeckoInstruction inst)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index e1f3f096629..88ba86c2aff 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -92,20 +92,31 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   const bool duplicated_c = muls || madds;
   const bool fma = use_b && use_c;
   const bool negate_result = (op5 & ~0x1) == 30;
-  const bool msub = op5 == 28 || op5 == 30;
+  const bool negate_b = op5 == 28 || op5 == 30;
 
-  const bool inaccurate_fma = fma && !Config::Get(Config::SESSION_USE_FMA);
+  const bool nonfused_requested = fma && !Config::Get(Config::SESSION_USE_FMA);
+  const bool error_free_transformation_requested = fma && m_accurate_fmadds;
 
   const bool round_c = use_c && !js.op->fprIsSingle[c];
   const auto inputs_are_singles_func = [&] {
     return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
   };
-  const bool single = inputs_are_singles_func() && !inaccurate_fma;
+  const bool single =
+      inputs_are_singles_func() && (error_free_transformation_requested || !nonfused_requested);
 
   const RegType type = single ? RegType::Single : RegType::Register;
   const u8 size = single ? 32 : 64;
   const auto reg_encoder = single ? EncodeRegToDouble : EncodeRegToQuad;
 
+  const bool nonfused = nonfused_requested && !single;
+  const bool error_free_transformation = error_free_transformation_requested && !single;
+
+  if (error_free_transformation)
+  {
+    gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+    fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
+  }
+
   const ARM64Reg VA = reg_encoder(fpr.R(a, type));
   const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
   const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
@@ -119,41 +130,77 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
   ARM64Reg rounded_c_reg = VC;
   if (round_c)
   {
-    ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
-
-    V0Q = fpr.GetScopedReg();
-    rounded_c_reg = reg_encoder(V0Q);
-    Force25BitPrecision(rounded_c_reg, VC);
-  }
-
-  ARM64Reg inaccurate_fma_reg = VD;
-  if (fma && inaccurate_fma && VD == VB)
-  {
-    if (V0Q == ARM64Reg::INVALID_REG)
+    if (error_free_transformation)
+    {
+      // This register happens to be free, so we can skip allocating one
+      rounded_c_reg = ARM64Reg::Q3;
+    }
+    else
+    {
       V0Q = fpr.GetScopedReg();
-      inaccurate_fma_reg = reg_encoder(V0Q);
+      rounded_c_reg = reg_encoder(V0Q);
+    }
   }
 
   ARM64Reg result_reg = VD;
-  const bool need_accurate_fma_reg =
-      fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg);
-  const bool preserve_d =
-      m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
-  if (need_accurate_fma_reg || preserve_d)
+  ARM64Reg nonfused_reg = VD;
+  if (error_free_transformation)
   {
-    V1Q = fpr.GetScopedReg();
-    result_reg = reg_encoder(V1Q);
+    result_reg = reg_encoder(ARM64Reg::Q0);
+    nonfused_reg = reg_encoder(ARM64Reg::Q0);
+  }
+  else
+  {
+    const bool need_fused_fma_reg =
+        fma && !nonfused && (negate_b || VD != VB) && (VD == VA || VD == rounded_c_reg);
+    const bool preserve_d =
+        m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC));
+    if (need_fused_fma_reg || preserve_d)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      result_reg = reg_encoder(V0Q);
+      nonfused_reg = reg_encoder(V0Q);
+
+      if (need_fused_fma_reg && round_c)
+      {
+        V1Q = fpr.GetScopedReg();
+        rounded_c_reg = reg_encoder(V1Q);
+      }
+    }
+    else if (fma && nonfused && VD == VB)
+    {
+      if (V0Q == ARM64Reg::INVALID_REG)
+        V0Q = fpr.GetScopedReg();
+      nonfused_reg = reg_encoder(V0Q);
+    }
   }
 
   if (m_accurate_nans)
   {
-    if (V0Q == ARM64Reg::INVALID_REG)
-      V0Q = fpr.GetScopedReg();
+    if (error_free_transformation)
+    {
+      // These registers happen to be free, so we can skip allocating new ones
+      V1Q = ARM64Reg::Q1;
+      V2Q = ARM64Reg::Q2;
+    }
+    else
+    {
+      if (V1Q == ARM64Reg::INVALID_REG)
+        V1Q = fpr.GetScopedReg();
 
-    if (duplicated_c || VD == result_reg)
-      V2Q = fpr.GetScopedReg();
+      if (duplicated_c || VD == result_reg)
+        V2Q = fpr.GetScopedReg();
+    }
   }
 
+  if (round_c)
+  {
+    ASSERT_MSG(DYNA_REC, !single, "Tried to apply 25-bit precision to single");
+    Force25BitPrecision(rounded_c_reg, VC);
+  }
+
+  std::optional<ARM64Reg> negated_b_reg;
+
   switch (op5)
   {
   case 12:  // ps_muls0: d = a * c.ps0
@@ -163,10 +210,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1);
     break;
   case 14:  // ps_madds0: d = a * c.ps0 + b
-    if (inaccurate_fma)
+    if (nonfused)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 0);
+      m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
     }
     else
     {
@@ -176,10 +223,10 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     }
     break;
   case 15:  // ps_madds1: d = a * c.ps1 + b
-    if (inaccurate_fma)
+    if (nonfused)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg, 1);
+      m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
     }
     else
     {
@@ -202,23 +249,28 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     break;
   case 28:  // ps_msub: d = a * c - b
   case 30:  // ps_nmsub: d = -(a * c - b)
-    if (inaccurate_fma)
+    if (nonfused)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
-      m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB);
+      m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
+      m_float_emit.FSUB(size, result_reg, nonfused_reg, VB);
     }
     else
     {
       m_float_emit.FNEG(size, result_reg, VB);
+      if (error_free_transformation)
+      {
+        m_float_emit.MOV(ARM64Reg::Q4, result_reg);
+        negated_b_reg = ARM64Reg::Q4;
+      }
       m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg);
     }
     break;
   case 29:  // ps_madd: d = a * c + b
   case 31:  // ps_nmadd: d = -(a * c + b)
-    if (inaccurate_fma)
+    if (nonfused)
     {
-      m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg);
-      m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB);
+      m_float_emit.FMUL(size, nonfused_reg, VA, rounded_c_reg);
+      m_float_emit.FADD(size, result_reg, nonfused_reg, VB);
     }
     else
     {
@@ -232,11 +284,80 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
     break;
   }
 
+  // Read the comment in the interpreter function NI_madd_msub to find out what's going on here
+  if (error_free_transformation)
+  {
+    // We've calculated s := a + b (with a = VA * rounded_c_reg, b = negate_b ? -VB : VB)
+
+    // a' := s - b
+    // (Transformed into -a' := b - s)
+    if (negate_b)
+    {
+      if (!negated_b_reg)
+      {
+        m_float_emit.FNEG(size, ARM64Reg::Q4, VB);
+        negated_b_reg = ARM64Reg::Q4;
+      }
+      m_float_emit.FSUB(size, ARM64Reg::Q1, *negated_b_reg, result_reg);
+    }
+    else
+    {
+      m_float_emit.FSUB(size, ARM64Reg::Q1, VB, result_reg);
+    }
+
+    // b' := s - a'
+    // (Transformed into b' := s + -a')
+    m_float_emit.FADD(size, ARM64Reg::Q2, result_reg, ARM64Reg::Q1);
+
+    // da := a - a'
+    // (Transformed into da := a + -a')
+    if (nonfused)
+    {
+      switch (op5)
+      {
+      case 14:  // ps_madds0: d = a * c.ps0 + b
+        m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 0);
+        break;
+      case 15:  // ps_madds1: d = a * c.ps1 + b
+        m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg, 1);
+        break;
+      default:
+        m_float_emit.FMUL(size, ARM64Reg::Q3, VA, rounded_c_reg);
+        break;
+      }
+      m_float_emit.FADD(size, ARM64Reg::Q1, ARM64Reg::Q3, ARM64Reg::Q1);
+    }
+    else
+    {
+      switch (op5)
+      {
+      case 14:  // ps_madds0: d = a * c.ps0 + b
+        m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 0);
+        break;
+      case 15:  // ps_madds1: d = a * c.ps1 + b
+        m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg, 1);
+        break;
+      default:
+        m_float_emit.FMLA(size, ARM64Reg::Q1, VA, rounded_c_reg);
+        break;
+      }
+    }
+
+    // db := b - b'
+    // (Transformed into -db := b' - b)
+    if (negate_b)
+      m_float_emit.FADD(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
+    else
+      m_float_emit.FSUB(size, ARM64Reg::Q2, ARM64Reg::Q2, VB);
+
+    BL(GetAsmRoutines()->ps_madd_eft);
+  }
+
   FixupBranch nan_fixup;
   if (m_accurate_nans)
   {
-    const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q);
-    const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q);
+    const ARM64Reg nan_temp_reg = single ? EncodeRegToSingle(V1Q) : EncodeRegToDouble(V1Q);
+    const ARM64Reg nan_temp_reg_paired = reg_encoder(V1Q);
 
     // Check if we need to handle NaNs
@@ -306,7 +427,13 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
 
   fpr.FixSinglePrecision(d);
 
+  if (error_free_transformation)
+    gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
+
   SetFPRFIfNeeded(true, VD);
+
+  if (error_free_transformation)
+    fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1, ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q4);
 }
 
 void JitArm64::ps_sel(UGeckoInstruction inst)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 89ed9a2b053..a65fd33a8f4 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -8,6 +8,7 @@
 #include
 
 #include "Common/Arm64Emitter.h"
+#include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
 #include "Common/Config/Config.h"
 #include "Common/FloatUtils.h"
@@ -265,6 +266,14 @@ void JitArm64::GenerateCommonAsm()
   GenerateFPRF(false);
   Common::JitRegister::Register(GetAsmRoutines()->fprf_single, GetCodePtr(), "JIT_FPRF");
 
+  GetAsmRoutines()->fmadds_eft = GetCodePtr();
+  GenerateFmaddsEft();
+  Common::JitRegister::Register(GetAsmRoutines()->fmadds_eft, GetCodePtr(), "JIT_fmadds_eft");
+
+  GetAsmRoutines()->ps_madd_eft = GetCodePtr();
+  GeneratePsMaddEft();
+  Common::JitRegister::Register(GetAsmRoutines()->ps_madd_eft, GetCodePtr(), "JIT_ps_madd_eft");
+
   GenerateQuantizedLoads();
   GenerateQuantizedStores();
 }
@@ -514,6 +523,90 @@ void JitArm64::GenerateFPRF(bool single)
   B(write_fprf_and_ret);
 }
 
+// Inputs:
+// D0: Result with potentially incorrect rounding
+// D1: First error term
+// D2: Second error term, negated
+//
+// Outputs result with corrected rounding in D0. Clobbers X0-X1, D1, and flags.
+void JitArm64::GenerateFmaddsEft()
+{
+  // Check if D0 is an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
+  m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
+  MOVI2R(ARM64Reg::W1, 0x80000000);
+  CMP(ARM64Reg::W1, ARM64Reg::W0, ArithOption(ARM64Reg::W0, ShiftType::LSL, 3));
+  FixupBranch even_tie = B(CCFlags::CC_EQ);
+
+  const u8* ret = GetCodePtr();
+  RET();
+
+  // Check if the error is 0
+  SetJumpTarget(even_tie);
+  m_float_emit.FSUB(ARM64Reg::D1, ARM64Reg::D1, ARM64Reg::D2);
+  m_float_emit.FCMP(ARM64Reg::D1);
+  B(CCFlags::CC_EQ, ret);
+
+  // Round D0 up or down
+  MOVZ(ARM64Reg::X1, 1);
+  CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
+  CMP(ARM64Reg::X0, 0);
+  CNEG(ARM64Reg::X1, ARM64Reg::X1, CCFlags::CC_LT);
+  ADD(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1);
+  m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0);
+  RET();
+}
+
+// Inputs:
+// Q0: Results with potentially incorrect rounding
+// Q1: First error terms
+// Q2: Second error terms, negated
+//
+// Outputs results with corrected rounding in Q0. Clobbers X0, Q1-Q4, and flags.
+void JitArm64::GeneratePsMaddEft()
+{
+  // Check if Q0 has an even tie, i.e. check (input & 0x1fffffff) == 0x10000000
+  MOVI2R(ARM64Reg::X0, 0x8000'0000'0000'0000);
+  m_float_emit.SHL(64, ARM64Reg::Q3, ARM64Reg::Q0, 35);
+  m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
+  m_float_emit.CMEQ(64, ARM64Reg::Q3, ARM64Reg::Q3, ARM64Reg::Q4);
+
+  // Just for performance, exit early if there is no even tie
+  m_float_emit.XTN(32, ARM64Reg::D4, ARM64Reg::Q3);
+  FixupBranch even_tie;
+  if (cpu_info.bAFP)
+  {
+    m_float_emit.FCMP(ARM64Reg::D4);
+    even_tie = B(CCFlags::CC_NEQ);
+  }
+  else
+  {
+    // If we don't have AFP and the emulated software has NI set, subnormals will compare equal to
+    // zero, so we can't use FCMP unless we were to put some shuffle instruction before it.
+    // FMOV is a little slower than FCMP, but it's faster than adding an extra instruction.
+    m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D4);
+    even_tie = CBNZ(ARM64Reg::X0);
+  }
+  RET();
+  SetJumpTarget(even_tie);
+
+  // Check if the error is zero
+  m_float_emit.FSUB(64, ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
+  MOVZ(ARM64Reg::X0, 1);
+  m_float_emit.FCMEQ(64, ARM64Reg::Q2, ARM64Reg::Q1);
+
+  // Store -1 or 1 in Q1 depending on whether we're rounding down or up
+  m_float_emit.EOR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q0);
+  m_float_emit.DUP(64, ARM64Reg::Q4, ARM64Reg::X0);
+  m_float_emit.SSHR(64, ARM64Reg::Q1, ARM64Reg::Q1, 63);
+  m_float_emit.ORR(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q4);
+
+  // Round the elements that have both a non-zero error and an even tie
+  m_float_emit.BIC(ARM64Reg::Q2, ARM64Reg::Q3, ARM64Reg::Q2);
+  m_float_emit.AND(ARM64Reg::Q1, ARM64Reg::Q1, ARM64Reg::Q2);
+  m_float_emit.ADD(64, ARM64Reg::Q0, ARM64Reg::Q0, ARM64Reg::Q1);
+  RET();
+}
+
 void JitArm64::GenerateQuantizedLoads()
 {
   // X0 is a temporary
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
index 4fd58bc8973..88132bdd6ab 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@@ -30,6 +30,8 @@ struct CommonAsmRoutinesBase
   const u8* cstd;
   const u8* fprf_single;
   const u8* fprf_double;
+  const u8* fmadds_eft;
+  const u8* ps_madd_eft;
 
   // In: array index: GQR to use.
   // In: ECX: Address to read from.
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp
index d2de2895a75..a8709c4f6e9 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp
@@ -57,7 +57,7 @@
 // After resetting the stack to the top, we call _resetstkoflw() to restore
 // the guard page at the 256kb mark.
 
-const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitBase::JIT_SETTINGS{{
+const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JitBase::JIT_SETTINGS{{
     {&JitBase::bJITOff, &Config::MAIN_DEBUG_JIT_OFF},
     {&JitBase::bJITLoadStoreOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_OFF},
     {&JitBase::bJITLoadStorelXzOff, &Config::MAIN_DEBUG_JIT_LOAD_STORE_LXZ_OFF},
@@ -79,6 +79,7 @@ const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JitB
     {&JitBase::m_low_dcbz_hack, &Config::MAIN_LOW_DCBZ_HACK},
     {&JitBase::m_fprf, &Config::MAIN_FPRF},
     {&JitBase::m_accurate_nans, &Config::MAIN_ACCURATE_NANS},
+    {&JitBase::m_accurate_fmadds, &Config::MAIN_ACCURATE_FMADDS},
     {&JitBase::m_fastmem_enabled, &Config::MAIN_FASTMEM},
     {&JitBase::m_accurate_cpu_cache_enabled, &Config::MAIN_ACCURATE_CPU_CACHE},
 }};
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
index cb78fcc6fe2..468874a9836 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
@@ -158,6 +158,7 @@ protected:
   bool m_low_dcbz_hack = false;
   bool m_fprf = false;
   bool m_accurate_nans = false;
+  bool m_accurate_fmadds = false;
   bool m_fastmem_enabled = false;
   bool m_accurate_cpu_cache_enabled = false;
 
@@ -165,7 +166,7 @@ protected:
   bool m_cleanup_after_stackfault = false;
   u8* m_stack_guard = nullptr;
 
-  static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 23> JIT_SETTINGS;
+  static const std::array<std::pair<bool JitBase::*, const Config::Info<bool>*>, 24> JIT_SETTINGS;
 
   bool DoesConfigNeedRefresh() const;
   void RefreshConfig();
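For illustration only (this sketch is not part of the patch above): a scalar C++ approximation of the rounding fix-up that GenerateFmaddsEft applies before the double-precision result is rounded to single. The helper name FmaddSingleResult, the plain double parameters, and the omission of NaN/denormal and FPSCR handling are assumptions made for this sketch; the real routine works on raw register bit patterns inside the JIT-generated code.

// Scalar sketch of the error-free-transformation fix-up (assumptions: round-to-nearest,
// no NaN/denormal special cases; FmaddSingleResult is a made-up name for this example).
#include <bit>
#include <cmath>
#include <cstdint>

static float FmaddSingleResult(double a, double c, double b)
{
  const double s = std::fma(a, c, b);          // s := a * c + b, rounded once to double
  const double a_prime = s - b;                // a' := s - b
  const double b_prime = s - a_prime;          // b' := s - a'
  const double da = std::fma(a, c, -a_prime);  // da := a - a'  (FNMSUB in the JIT routine)
  const double db = b - b_prime;               // db := b - b'
  const double err = da + db;                  // its sign says on which side of s the exact result lies

  uint64_t bits = std::bit_cast<uint64_t>(s);
  // Only an exact tie at the single-precision boundary can be double-rounded incorrectly:
  // the low 29 mantissa bits of the double equal 0x10000000.
  const bool tie = (bits & 0x1fffffff) == 0x10000000;
  if (tie && err != 0.0)
  {
    // Nudge s by one double ulp towards the exact result so that the final conversion
    // to single rounds the way PowerPC's fmadds would.
    int64_t step = (err < 0.0) ? -1 : 1;
    if (std::bit_cast<int64_t>(bits) < 0)  // negative value: the bit pattern moves the other way
      step = -step;
    bits += step;
  }
  return static_cast<float>(std::bit_cast<double>(bits));
}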