From 5893851029134f303b933a6b4b42fc71bfbd0ce6 Mon Sep 17 00:00:00 2001 From: Elad <18193363+elad335@users.noreply.github.com> Date: Sun, 16 Nov 2025 19:07:59 +0200 Subject: [PATCH] Core: Remove Intel-TSX ISA Extension based code --- rpcs3/Emu/Cell/PPUThread.cpp | 300 +---------- rpcs3/Emu/Cell/SPULLVMRecompiler.cpp | 2 +- rpcs3/Emu/Cell/SPUThread.cpp | 721 +-------------------------- rpcs3/Emu/Memory/vm_reservation.h | 187 +------ rpcs3/Emu/System.cpp | 20 - rpcs3/Emu/System.h | 4 - rpcs3/Emu/system_config.cpp | 5 - rpcs3/Emu/system_config.h | 8 - rpcs3/Emu/system_config_types.cpp | 16 - rpcs3/Emu/system_config_types.h | 7 - rpcs3/rpcs3qt/emu_settings.cpp | 8 - rpcs3/rpcs3qt/emu_settings_type.h | 2 - rpcs3/rpcs3qt/settings_dialog.cpp | 73 --- rpcs3/rpcs3qt/settings_dialog.ui | 18 - rpcs3/rpcs3qt/tooltips.h | 1 - rpcs3/util/asm.hpp | 67 --- 16 files changed, 14 insertions(+), 1425 deletions(-) diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 456715ccf9..f0b9b4d462 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -2308,7 +2308,7 @@ void ppu_thread::cpu_sleep() raddr = 0; // Setup wait flag and memory flags to relock itself - state += g_use_rtm ? cpu_flag::wait : cpu_flag::wait + cpu_flag::memory; + state += cpu_flag::wait + cpu_flag::memory; if (auto ptr = vm::g_tls_locked) { @@ -2454,10 +2454,8 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3 // Trigger the scheduler state += cpu_flag::suspend; - if (!g_use_rtm) - { - state += cpu_flag::memory; - } + // Acquire memory passive lock + state += cpu_flag::memory; call_history.data.resize(g_cfg.core.ppu_call_history ? call_history_max_size : 1); syscall_history.data.resize(g_cfg.core.ppu_call_history ? syscall_history_max_size : 1); @@ -2703,11 +2701,7 @@ ppu_thread::ppu_thread(utils::serial& ar) // Trigger the scheduler state += cpu_flag::suspend; - - if (!g_use_rtm) - { - state += cpu_flag::memory; - } + state += cpu_flag::memory; ppu_tname = make_single(ar.pop()); @@ -3191,221 +3185,6 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr) return ppu_load_acquire_reservation(ppu, addr); } -const auto ppu_stcx_accurate_tx = build_function_asm("ppu_stcx_accurate_tx", [](native_asm& c, auto& args) -{ - using namespace asmjit; - -#if defined(ARCH_X64) - Label fall = c.newLabel(); - Label fail = c.newLabel(); - Label _ret = c.newLabel(); - Label load = c.newLabel(); - - //if (utils::has_avx() && !s_tsx_avx) - //{ - // c.vzeroupper(); - //} - - // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers) - c.push(x86::rbp); - c.push(x86::r13); - c.push(x86::r14); - c.sub(x86::rsp, 48); -#ifdef _WIN32 - if (!s_tsx_avx) - { - c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6); - c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7); - } -#endif - - // Prepare registers - build_swap_rdx_with(c, args, x86::r10); - c.movabs(x86::rbp, reinterpret_cast(&vm::g_sudo_addr)); - c.mov(x86::rbp, x86::qword_ptr(x86::rbp)); - c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); - c.and_(x86::rbp, -128); - c.prefetchw(x86::byte_ptr(x86::rbp, 0)); - c.prefetchw(x86::byte_ptr(x86::rbp, 64)); - c.movzx(args[0].r32(), args[0].r16()); - c.shr(args[0].r32(), 1); - c.movabs(x86::r11, reinterpret_cast(+vm::g_reservations)); - c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); - c.and_(x86::r11, -128 / 2); - c.and_(args[0].r32(), 63); - - // Prepare data - if (s_tsx_avx) - { - c.vmovups(x86::ymm0, x86::ymmword_ptr(args[2], 0)); - c.vmovups(x86::ymm1, x86::ymmword_ptr(args[2], 32)); - c.vmovups(x86::ymm2, x86::ymmword_ptr(args[2], 64)); - c.vmovups(x86::ymm3, x86::ymmword_ptr(args[2], 96)); - } - else - { - c.movaps(x86::xmm0, x86::oword_ptr(args[2], 0)); - c.movaps(x86::xmm1, x86::oword_ptr(args[2], 16)); - c.movaps(x86::xmm2, x86::oword_ptr(args[2], 32)); - c.movaps(x86::xmm3, x86::oword_ptr(args[2], 48)); - c.movaps(x86::xmm4, x86::oword_ptr(args[2], 64)); - c.movaps(x86::xmm5, x86::oword_ptr(args[2], 80)); - c.movaps(x86::xmm6, x86::oword_ptr(args[2], 96)); - c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112)); - } - - // Alloc r14 to stamp0 - const auto stamp0 = x86::r14; - build_get_tsc(c, stamp0); - - Label fail2 = c.newLabel(); - - Label tx1 = build_transaction_enter(c, fall, [&]() - { - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.movabs(x86::r13, reinterpret_cast(&g_rtm_tx_limit2)); - c.cmp(x86::rax, x86::qword_ptr(x86::r13)); - c.jae(fall); - }); - - // Check pause flag - c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast(cpu_flag::pause)); - c.jc(fall); - c.xbegin(tx1); - - if (s_tsx_avx) - { - c.vxorps(x86::ymm0, x86::ymm0, x86::ymmword_ptr(x86::rbp, 0)); - c.vxorps(x86::ymm1, x86::ymm1, x86::ymmword_ptr(x86::rbp, 32)); - c.vxorps(x86::ymm2, x86::ymm2, x86::ymmword_ptr(x86::rbp, 64)); - c.vxorps(x86::ymm3, x86::ymm3, x86::ymmword_ptr(x86::rbp, 96)); - c.vorps(x86::ymm0, x86::ymm0, x86::ymm1); - c.vorps(x86::ymm1, x86::ymm2, x86::ymm3); - c.vorps(x86::ymm0, x86::ymm1, x86::ymm0); - c.vptest(x86::ymm0, x86::ymm0); - } - else - { - c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0)); - c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16)); - c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32)); - c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48)); - c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64)); - c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80)); - c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96)); - c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112)); - c.orps(x86::xmm0, x86::xmm1); - c.orps(x86::xmm2, x86::xmm3); - c.orps(x86::xmm4, x86::xmm5); - c.orps(x86::xmm6, x86::xmm7); - c.orps(x86::xmm0, x86::xmm2); - c.orps(x86::xmm4, x86::xmm6); - c.orps(x86::xmm0, x86::xmm4); - c.ptest(x86::xmm0, x86::xmm0); - } - - c.jnz(fail); - - // Store 8 bytes - c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]); - - c.xend(); - c.lock().add(x86::qword_ptr(x86::r11), 64); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.jmp(_ret); - - // XABORT is expensive so try to finish with xend instead - c.bind(fail); - - // Load old data to store back in rdata - if (s_tsx_avx) - { - c.vmovaps(x86::ymm0, x86::ymmword_ptr(x86::rbp, 0)); - c.vmovaps(x86::ymm1, x86::ymmword_ptr(x86::rbp, 32)); - c.vmovaps(x86::ymm2, x86::ymmword_ptr(x86::rbp, 64)); - c.vmovaps(x86::ymm3, x86::ymmword_ptr(x86::rbp, 96)); - } - else - { - c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0)); - c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16)); - c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32)); - c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48)); - c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64)); - c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80)); - c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96)); - c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112)); - } - - c.xend(); - c.jmp(fail2); - - c.bind(fall); - c.mov(x86::rax, -1); - c.jmp(_ret); - - c.bind(fail2); - c.lock().sub(x86::qword_ptr(x86::r11), 64); - c.bind(load); - - // Store previous data back to rdata - if (s_tsx_avx) - { - c.vmovaps(x86::ymmword_ptr(args[2], 0), x86::ymm0); - c.vmovaps(x86::ymmword_ptr(args[2], 32), x86::ymm1); - c.vmovaps(x86::ymmword_ptr(args[2], 64), x86::ymm2); - c.vmovaps(x86::ymmword_ptr(args[2], 96), x86::ymm3); - } - else - { - c.movaps(x86::oword_ptr(args[2], 0), x86::xmm0); - c.movaps(x86::oword_ptr(args[2], 16), x86::xmm1); - c.movaps(x86::oword_ptr(args[2], 32), x86::xmm2); - c.movaps(x86::oword_ptr(args[2], 48), x86::xmm3); - c.movaps(x86::oword_ptr(args[2], 64), x86::xmm4); - c.movaps(x86::oword_ptr(args[2], 80), x86::xmm5); - c.movaps(x86::oword_ptr(args[2], 96), x86::xmm6); - c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7); - } - - c.mov(x86::rax, -1); - c.mov(x86::qword_ptr(args[2], ::offset32(&ppu_thread::last_ftime) - ::offset32(&ppu_thread::rdata)), x86::rax); - c.xor_(x86::eax, x86::eax); - //c.jmp(_ret); - - c.bind(_ret); - -#ifdef _WIN32 - if (!s_tsx_avx) - { - c.vmovups(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); - c.vmovups(x86::xmm7, x86::oword_ptr(x86::rsp, 16)); - } -#endif - - if (s_tsx_avx) - { - c.vzeroupper(); - } - - c.add(x86::rsp, 48); - c.pop(x86::r14); - c.pop(x86::r13); - c.pop(x86::rbp); - - maybe_flush_lbr(c); - c.ret(); -#else - UNUSED(args); - - // Unimplemented should fail. - c.brk(Imm(0x42)); - c.ret(a64::x30); -#endif -}); - template static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) { @@ -3486,77 +3265,6 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) return false; } - if (g_use_rtm) [[likely]] - { - switch (u64 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast(new_data))) - { - case umax: - { - auto& all_data = *vm::get_super_ptr(addr & -128); - auto& sdata = *vm::get_super_ptr>(addr & -8); - - const bool ok = cpu_thread::suspend_all<+3>(&ppu, {all_data, all_data + 64, &res}, [&] - { - if ((res & -128) == rtime && cmp_rdata(ppu.rdata, all_data)) - { - sdata.release(new_data); - res += 64; - return true; - } - - mov_rdata_nt(ppu.rdata, all_data); - res -= 64; - return false; - }); - - if (ok) - { - break; - } - - ppu.last_ftime = -1; - [[fallthrough]]; - } - case 0: - { - if (ppu.last_faddr == addr) - { - ppu.last_fail++; - } - - if (ppu.last_ftime != umax) - { - ppu.last_faddr = 0; - return false; - } - - utils::prefetch_read(ppu.rdata); - utils::prefetch_read(ppu.rdata + 64); - ppu.last_faddr = addr; - ppu.last_ftime = res.load() & -128; - ppu.last_ftsc = utils::get_tsc(); - return false; - } - default: - { - if (count > 20000 && g_cfg.core.perf_report) [[unlikely]] - { - perf_log.warning("STCX: took too long: %.3fus (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count); - } - - break; - } - } - - if (ppu.last_faddr == addr) - { - ppu.last_succ++; - } - - ppu.last_faddr = 0; - return true; - } - // Align address: we do not need the lower 7 bits anymore addr &= -128; diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index b275773e8b..8b820ca600 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -3991,7 +3991,7 @@ public: bool must_use_cpp_functions = !!g_cfg.core.spu_accurate_dma; - if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); g_cfg.core.rsx_fifo_accuracy || g_cfg.video.strict_rendering_mode || !g_use_rtm) + if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); g_cfg.core.rsx_fifo_accuracy || g_cfg.video.strict_rendering_mode || /*!g_use_rtm*/ true) { // TODO: don't require TSX (current implementation is TSX-only) if (cmdh == MFC_PUT_CMD || cmdh == MFC_SNDSIG_CMD) diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index a441f29e6b..7a6c931795 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -639,549 +639,6 @@ std::array op_branch_targets(u32 pc, spu_opcode_t op) return res; } -const auto spu_putllc_tx = build_function_asm("spu_putllc_tx", [](native_asm& c, auto& args) -{ - using namespace asmjit; - -#if defined(ARCH_X64) - Label fall = c.newLabel(); - Label fail = c.newLabel(); - Label _ret = c.newLabel(); - Label load = c.newLabel(); - - //if (utils::has_avx() && !s_tsx_avx) - //{ - // c.vzeroupper(); - //} - - // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers) - c.push(x86::rbp); - c.push(x86::rbx); -#ifdef _WIN32 - c.sub(x86::rsp, 168); - if (s_tsx_avx) - { - c.vmovups(x86::oword_ptr(x86::rsp, 0), x86::xmm6); - c.vmovups(x86::oword_ptr(x86::rsp, 16), x86::xmm7); - } - else - { - c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6); - c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7); - c.movups(x86::oword_ptr(x86::rsp, 32), x86::xmm8); - c.movups(x86::oword_ptr(x86::rsp, 48), x86::xmm9); - c.movups(x86::oword_ptr(x86::rsp, 64), x86::xmm10); - c.movups(x86::oword_ptr(x86::rsp, 80), x86::xmm11); - c.movups(x86::oword_ptr(x86::rsp, 96), x86::xmm12); - c.movups(x86::oword_ptr(x86::rsp, 112), x86::xmm13); - c.movups(x86::oword_ptr(x86::rsp, 128), x86::xmm14); - c.movups(x86::oword_ptr(x86::rsp, 144), x86::xmm15); - } -#else - c.sub(x86::rsp, 40); -#endif - - // Prepare registers - build_swap_rdx_with(c, args, x86::r10); - c.movabs(args[1], reinterpret_cast(&vm::g_sudo_addr)); - c.mov(args[1], x86::qword_ptr(args[1])); - c.lea(args[1], x86::qword_ptr(args[1], args[0])); - c.prefetchw(x86::byte_ptr(args[1], 0)); - c.prefetchw(x86::byte_ptr(args[1], 64)); - c.and_(args[0].r32(), 0xff80); - c.shr(args[0].r32(), 1); - c.movabs(x86::r11, reinterpret_cast(+vm::g_reservations)); - c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); - - // Prepare data - if (s_tsx_avx) - { - c.vmovups(x86::ymm0, x86::ymmword_ptr(args[2], 0)); - c.vmovups(x86::ymm1, x86::ymmword_ptr(args[2], 32)); - c.vmovups(x86::ymm2, x86::ymmword_ptr(args[2], 64)); - c.vmovups(x86::ymm3, x86::ymmword_ptr(args[2], 96)); - c.vmovups(x86::ymm4, x86::ymmword_ptr(args[3], 0)); - c.vmovups(x86::ymm5, x86::ymmword_ptr(args[3], 32)); - c.vmovups(x86::ymm6, x86::ymmword_ptr(args[3], 64)); - c.vmovups(x86::ymm7, x86::ymmword_ptr(args[3], 96)); - } - else - { - c.movaps(x86::xmm0, x86::oword_ptr(args[2], 0)); - c.movaps(x86::xmm1, x86::oword_ptr(args[2], 16)); - c.movaps(x86::xmm2, x86::oword_ptr(args[2], 32)); - c.movaps(x86::xmm3, x86::oword_ptr(args[2], 48)); - c.movaps(x86::xmm4, x86::oword_ptr(args[2], 64)); - c.movaps(x86::xmm5, x86::oword_ptr(args[2], 80)); - c.movaps(x86::xmm6, x86::oword_ptr(args[2], 96)); - c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112)); - c.movaps(x86::xmm8, x86::oword_ptr(args[3], 0)); - c.movaps(x86::xmm9, x86::oword_ptr(args[3], 16)); - c.movaps(x86::xmm10, x86::oword_ptr(args[3], 32)); - c.movaps(x86::xmm11, x86::oword_ptr(args[3], 48)); - c.movaps(x86::xmm12, x86::oword_ptr(args[3], 64)); - c.movaps(x86::xmm13, x86::oword_ptr(args[3], 80)); - c.movaps(x86::xmm14, x86::oword_ptr(args[3], 96)); - c.movaps(x86::xmm15, x86::oword_ptr(args[3], 112)); - } - - // Alloc args[0] to stamp0 - const auto stamp0 = args[0]; - build_get_tsc(c, stamp0); - - Label fail2 = c.newLabel(); - - Label tx1 = build_transaction_enter(c, fall, [&]() - { - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.movabs(x86::rbx, reinterpret_cast(&g_rtm_tx_limit2)); - c.cmp(x86::rax, x86::qword_ptr(x86::rbx)); - c.jae(fall); - }); - - // Check pause flag - c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast(cpu_flag::pause)); - c.jc(fall); - c.xbegin(tx1); - - if (s_tsx_avx) - { - c.vxorps(x86::ymm0, x86::ymm0, x86::ymmword_ptr(args[1], 0)); - c.vxorps(x86::ymm1, x86::ymm1, x86::ymmword_ptr(args[1], 32)); - c.vxorps(x86::ymm2, x86::ymm2, x86::ymmword_ptr(args[1], 64)); - c.vxorps(x86::ymm3, x86::ymm3, x86::ymmword_ptr(args[1], 96)); - c.vorps(x86::ymm0, x86::ymm0, x86::ymm1); - c.vorps(x86::ymm1, x86::ymm2, x86::ymm3); - c.vorps(x86::ymm0, x86::ymm1, x86::ymm0); - c.vptest(x86::ymm0, x86::ymm0); - } - else - { - c.xorps(x86::xmm0, x86::oword_ptr(args[1], 0)); - c.xorps(x86::xmm1, x86::oword_ptr(args[1], 16)); - c.xorps(x86::xmm2, x86::oword_ptr(args[1], 32)); - c.xorps(x86::xmm3, x86::oword_ptr(args[1], 48)); - c.xorps(x86::xmm4, x86::oword_ptr(args[1], 64)); - c.xorps(x86::xmm5, x86::oword_ptr(args[1], 80)); - c.xorps(x86::xmm6, x86::oword_ptr(args[1], 96)); - c.xorps(x86::xmm7, x86::oword_ptr(args[1], 112)); - c.orps(x86::xmm0, x86::xmm1); - c.orps(x86::xmm2, x86::xmm3); - c.orps(x86::xmm4, x86::xmm5); - c.orps(x86::xmm6, x86::xmm7); - c.orps(x86::xmm0, x86::xmm2); - c.orps(x86::xmm4, x86::xmm6); - c.orps(x86::xmm0, x86::xmm4); - c.ptest(x86::xmm0, x86::xmm0); - } - - c.jnz(fail); - - if (s_tsx_avx) - { - c.vmovaps(x86::ymmword_ptr(args[1], 0), x86::ymm4); - c.vmovaps(x86::ymmword_ptr(args[1], 32), x86::ymm5); - c.vmovaps(x86::ymmword_ptr(args[1], 64), x86::ymm6); - c.vmovaps(x86::ymmword_ptr(args[1], 96), x86::ymm7); - } - else - { - c.movaps(x86::oword_ptr(args[1], 0), x86::xmm8); - c.movaps(x86::oword_ptr(args[1], 16), x86::xmm9); - c.movaps(x86::oword_ptr(args[1], 32), x86::xmm10); - c.movaps(x86::oword_ptr(args[1], 48), x86::xmm11); - c.movaps(x86::oword_ptr(args[1], 64), x86::xmm12); - c.movaps(x86::oword_ptr(args[1], 80), x86::xmm13); - c.movaps(x86::oword_ptr(args[1], 96), x86::xmm14); - c.movaps(x86::oword_ptr(args[1], 112), x86::xmm15); - } - - c.xend(); - c.lock().add(x86::qword_ptr(x86::r11), 64); - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.jmp(_ret); - - // XABORT is expensive so try to finish with xend instead - c.bind(fail); - - // Load previous data to store back to rdata - if (s_tsx_avx) - { - c.vmovaps(x86::ymm0, x86::ymmword_ptr(args[1], 0)); - c.vmovaps(x86::ymm1, x86::ymmword_ptr(args[1], 32)); - c.vmovaps(x86::ymm2, x86::ymmword_ptr(args[1], 64)); - c.vmovaps(x86::ymm3, x86::ymmword_ptr(args[1], 96)); - } - else - { - c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0)); - c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16)); - c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32)); - c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48)); - c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64)); - c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80)); - c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96)); - c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112)); - } - - c.xend(); - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1); - c.jmp(fail2); - - c.bind(fall); - c.mov(x86::rax, -1); - c.jmp(_ret); - - c.bind(fail2); - c.lock().sub(x86::qword_ptr(x86::r11), 64); - c.bind(load); - - // Store previous data back to rdata - if (s_tsx_avx) - { - c.vmovaps(x86::ymmword_ptr(args[2], 0), x86::ymm0); - c.vmovaps(x86::ymmword_ptr(args[2], 32), x86::ymm1); - c.vmovaps(x86::ymmword_ptr(args[2], 64), x86::ymm2); - c.vmovaps(x86::ymmword_ptr(args[2], 96), x86::ymm3); - } - else - { - c.movaps(x86::oword_ptr(args[2], 0), x86::xmm0); - c.movaps(x86::oword_ptr(args[2], 16), x86::xmm1); - c.movaps(x86::oword_ptr(args[2], 32), x86::xmm2); - c.movaps(x86::oword_ptr(args[2], 48), x86::xmm3); - c.movaps(x86::oword_ptr(args[2], 64), x86::xmm4); - c.movaps(x86::oword_ptr(args[2], 80), x86::xmm5); - c.movaps(x86::oword_ptr(args[2], 96), x86::xmm6); - c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7); - } - - c.mov(x86::rax, -1); - c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax); - c.xor_(x86::eax, x86::eax); - //c.jmp(_ret); - - c.bind(_ret); - -#ifdef _WIN32 - if (s_tsx_avx) - { - c.vmovups(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); - c.vmovups(x86::xmm7, x86::oword_ptr(x86::rsp, 16)); - } - else - { - c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); - c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16)); - c.movups(x86::xmm8, x86::oword_ptr(x86::rsp, 32)); - c.movups(x86::xmm9, x86::oword_ptr(x86::rsp, 48)); - c.movups(x86::xmm10, x86::oword_ptr(x86::rsp, 64)); - c.movups(x86::xmm11, x86::oword_ptr(x86::rsp, 80)); - c.movups(x86::xmm12, x86::oword_ptr(x86::rsp, 96)); - c.movups(x86::xmm13, x86::oword_ptr(x86::rsp, 112)); - c.movups(x86::xmm14, x86::oword_ptr(x86::rsp, 128)); - c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144)); - } - c.add(x86::rsp, 168); -#else - c.add(x86::rsp, 40); -#endif - - c.pop(x86::rbx); - c.pop(x86::rbp); - - if (s_tsx_avx) - { - c.vzeroupper(); - } - - maybe_flush_lbr(c); - c.ret(); -#else - UNUSED(args); - - c.brk(Imm(0x42)); - c.ret(a64::x30); -#endif -}); - -const auto spu_putlluc_tx = build_function_asm("spu_putlluc_tx", [](native_asm& c, auto& args) -{ - using namespace asmjit; - -#if defined(ARCH_X64) - Label fall = c.newLabel(); - Label _ret = c.newLabel(); - - //if (utils::has_avx() && !s_tsx_avx) - //{ - // c.vzeroupper(); - //} - - // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers) - c.push(x86::rbp); - c.push(x86::rbx); - c.sub(x86::rsp, 40); -#ifdef _WIN32 - if (!s_tsx_avx) - { - c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6); - c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7); - } -#endif - // Prepare registers - build_swap_rdx_with(c, args, x86::r10); - c.movabs(x86::r11, reinterpret_cast(&vm::g_sudo_addr)); - c.mov(x86::r11, x86::qword_ptr(x86::r11)); - c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); - c.prefetchw(x86::byte_ptr(x86::r11, 0)); - c.prefetchw(x86::byte_ptr(x86::r11, 64)); - - // Prepare data - if (s_tsx_avx) - { - c.vmovups(x86::ymm0, x86::ymmword_ptr(args[1], 0)); - c.vmovups(x86::ymm1, x86::ymmword_ptr(args[1], 32)); - c.vmovups(x86::ymm2, x86::ymmword_ptr(args[1], 64)); - c.vmovups(x86::ymm3, x86::ymmword_ptr(args[1], 96)); - } - else - { - c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0)); - c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16)); - c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32)); - c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48)); - c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64)); - c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80)); - c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96)); - c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112)); - } - - c.and_(args[0].r32(), 0xff80); - c.shr(args[0].r32(), 1); - c.movabs(args[1], reinterpret_cast(+vm::g_reservations)); - c.lea(args[1], x86::qword_ptr(args[1], args[0])); - - // Alloc args[0] to stamp0 - const auto stamp0 = args[0]; - build_get_tsc(c, stamp0); - - Label tx1 = build_transaction_enter(c, fall, [&]() - { - // ftx++; - c.add(x86::qword_ptr(args[3]), 1); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.movabs(x86::rbx, reinterpret_cast(&g_rtm_tx_limit2)); - c.cmp(x86::rax, x86::qword_ptr(x86::rbx)); - c.jae(fall); - }); - - c.xbegin(tx1); - - if (s_tsx_avx) - { - c.vmovaps(x86::ymmword_ptr(x86::r11, 0), x86::ymm0); - c.vmovaps(x86::ymmword_ptr(x86::r11, 32), x86::ymm1); - c.vmovaps(x86::ymmword_ptr(x86::r11, 64), x86::ymm2); - c.vmovaps(x86::ymmword_ptr(x86::r11, 96), x86::ymm3); - } - else - { - c.movaps(x86::oword_ptr(x86::r11, 0), x86::xmm0); - c.movaps(x86::oword_ptr(x86::r11, 16), x86::xmm1); - c.movaps(x86::oword_ptr(x86::r11, 32), x86::xmm2); - c.movaps(x86::oword_ptr(x86::r11, 48), x86::xmm3); - c.movaps(x86::oword_ptr(x86::r11, 64), x86::xmm4); - c.movaps(x86::oword_ptr(x86::r11, 80), x86::xmm5); - c.movaps(x86::oword_ptr(x86::r11, 96), x86::xmm6); - c.movaps(x86::oword_ptr(x86::r11, 112), x86::xmm7); - } - - c.xend(); - c.lock().add(x86::qword_ptr(args[1]), 32); - // stx++ - c.add(x86::qword_ptr(args[2]), 1); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.jmp(_ret); - - c.bind(fall); - c.xor_(x86::eax, x86::eax); - //c.jmp(_ret); - - c.bind(_ret); - -#ifdef _WIN32 - if (!s_tsx_avx) - { - c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); - c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16)); - } - c.add(x86::rsp, 40); -#endif - - if (s_tsx_avx) - { - c.vzeroupper(); - } - - c.add(x86::rsp, 40); - c.pop(x86::rbx); - c.pop(x86::rbp); - - maybe_flush_lbr(c); - c.ret(); -#else - UNUSED(args); - - c.brk(Imm(0x42)); - c.ret(a64::x30); -#endif -}); - -const auto spu_getllar_tx = build_function_asm("spu_getllar_tx", [](native_asm& c, auto& args) -{ - using namespace asmjit; - -#if defined(ARCH_X64) - Label fall = c.newLabel(); - Label _ret = c.newLabel(); - - //if (utils::has_avx() && !s_tsx_avx) - //{ - // c.vzeroupper(); - //} - - // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers) - c.push(x86::rbp); - c.push(x86::rbx); - c.sub(x86::rsp, 40); -#ifdef _WIN32 - if (!s_tsx_avx) - { - c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6); - c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7); - } -#endif - - // Prepare registers - build_swap_rdx_with(c, args, x86::r10); - c.movabs(x86::rbp, reinterpret_cast(&vm::g_sudo_addr)); - c.mov(x86::rbp, x86::qword_ptr(x86::rbp)); - c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0])); - c.and_(args[0].r32(), 0xff80); - c.shr(args[0].r32(), 1); - c.movabs(x86::r11, reinterpret_cast(+vm::g_reservations)); - c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0])); - - // Alloc args[0] to stamp0 - const auto stamp0 = args[0]; - build_get_tsc(c, stamp0); - - // Begin transaction - Label tx0 = build_transaction_enter(c, fall, [&]() - { - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - c.movabs(x86::rbx, reinterpret_cast(&g_rtm_tx_limit1)); - c.cmp(x86::rax, x86::qword_ptr(x86::rbx)); - c.jae(fall); - }); - - // Check pause flag - c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast(cpu_flag::pause)); - c.jc(fall); - c.mov(x86::rax, x86::qword_ptr(x86::r11)); - c.and_(x86::rax, -128); - c.cmp(x86::rax, args[3]); - c.jne(fall); - c.xbegin(tx0); - - // Just read data to registers - if (s_tsx_avx) - { - c.vmovups(x86::ymm0, x86::ymmword_ptr(x86::rbp, 0)); - c.vmovups(x86::ymm1, x86::ymmword_ptr(x86::rbp, 32)); - c.vmovups(x86::ymm2, x86::ymmword_ptr(x86::rbp, 64)); - c.vmovups(x86::ymm3, x86::ymmword_ptr(x86::rbp, 96)); - } - else - { - c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0)); - c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16)); - c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32)); - c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48)); - c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64)); - c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80)); - c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96)); - c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112)); - } - - c.xend(); - c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1); - build_get_tsc(c); - c.sub(x86::rax, stamp0); - - // Store data - if (s_tsx_avx) - { - c.vmovaps(x86::ymmword_ptr(args[1], 0), x86::ymm0); - c.vmovaps(x86::ymmword_ptr(args[1], 32), x86::ymm1); - c.vmovaps(x86::ymmword_ptr(args[1], 64), x86::ymm2); - c.vmovaps(x86::ymmword_ptr(args[1], 96), x86::ymm3); - } - else - { - c.movaps(x86::oword_ptr(args[1], 0), x86::xmm0); - c.movaps(x86::oword_ptr(args[1], 16), x86::xmm1); - c.movaps(x86::oword_ptr(args[1], 32), x86::xmm2); - c.movaps(x86::oword_ptr(args[1], 48), x86::xmm3); - c.movaps(x86::oword_ptr(args[1], 64), x86::xmm4); - c.movaps(x86::oword_ptr(args[1], 80), x86::xmm5); - c.movaps(x86::oword_ptr(args[1], 96), x86::xmm6); - c.movaps(x86::oword_ptr(args[1], 112), x86::xmm7); - } - - c.jmp(_ret); - c.bind(fall); - c.xor_(x86::eax, x86::eax); - //c.jmp(_ret); - - c.bind(_ret); - -#ifdef _WIN32 - if (!s_tsx_avx) - { - c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); - c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16)); - } -#endif - - if (s_tsx_avx) - { - c.vzeroupper(); - } - - c.add(x86::rsp, 40); - c.pop(x86::rbx); - c.pop(x86::rbp); - - maybe_flush_lbr(c); - c.ret(); -#else - UNUSED(args); - - c.brk(Imm(0x42)); - c.ret(a64::x30); -#endif -}); - void spu_int_ctrl_t::set(u64 ints) { // leave only enabled interrupts @@ -2396,60 +1853,6 @@ void spu_thread::push_snr(u32 number, u32 value) const u32 event_bit = SPU_EVENT_S1 >> (number & 1); const bool bitor_bit = !!((snr_config >> number) & 1); - // Redundant, g_use_rtm is checked inside tx_start now. - if (g_use_rtm && false) - { - bool channel_notify = false; - bool thread_notify = false; - - const bool ok = utils::tx_start([&] - { - channel_notify = (channel->data.raw() == spu_channel::bit_wait); - thread_notify = (channel->data.raw() & spu_channel::bit_count) == 0; - - if (channel_notify) - { - ensure(channel->jostling_value.raw() == spu_channel::bit_wait); - channel->jostling_value.raw() = value; - channel->data.raw() = 0; - } - else if (bitor_bit) - { - channel->data.raw() &= ~spu_channel::bit_wait; - channel->data.raw() |= spu_channel::bit_count | value; - } - else - { - channel->data.raw() = spu_channel::bit_count | value; - } - - if (thread_notify) - { - ch_events.raw().events |= event_bit; - - if (ch_events.raw().mask & event_bit) - { - ch_events.raw().count = 1; - thread_notify = ch_events.raw().waiting != 0; - } - else - { - thread_notify = false; - } - } - }); - - if (ok) - { - if (channel_notify) - channel->data.notify_one(); - if (thread_notify) - this->notify(); - - return; - } - } - // Lock event channel in case it needs event notification ch_events.atomic_op([](ch_events_t& ev) { @@ -2590,7 +1993,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* rsx::reservation_lock rsx_lock(eal, args.size, !is_get && (g_cfg.video.strict_rendering_mode || (g_cfg.core.rsx_fifo_accuracy && !g_cfg.core.spu_accurate_dma && eal < rsx::constants::local_mem_base))); - if ((!g_use_rtm && !is_get) || g_cfg.core.spu_accurate_dma) [[unlikely]] + if (!is_get || g_cfg.core.spu_accurate_dma) [[unlikely]] { perf_meter<"ADMA_GET"_u64> perf_get = perf_; perf_meter<"ADMA_PUT"_u64> perf_put = perf_; @@ -3697,10 +3100,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) { rsx_lock.update_if_enabled(addr, size, range_lock); - if (!g_use_rtm) - { - vm::range_lock(range_lock, addr & -128, utils::align(addr + size, 128) - (addr & -128)); - } + vm::range_lock(range_lock, addr & -128, utils::align(addr + size, 128) - (addr & -128)); } else { @@ -3912,90 +3312,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) return true; } } - else if (!g_use_rtm) + else { - *vm::_ptr>(addr) += 0; - } - - if (g_use_rtm) [[likely]] - { - switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write)) - { - case umax: - { - auto& data = *vm::get_super_ptr(addr); - - const bool ok = cpu_thread::suspend_all<+3>(this, {data, data + 64, &res}, [&]() - { - if ((res & -128) == rtime) - { - if (cmp_rdata(rdata, data)) - { - mov_rdata(data, to_write); - res += 64; - return true; - } - } - - // Save previous data - mov_rdata_nt(rdata, data); - res -= 64; - return false; - }); - - const u64 count2 = utils::get_tsc() - perf2.get(); - - if (count2 > 20000 && g_cfg.core.perf_report) [[unlikely]] - { - perf_log.warning("PUTLLC: took too long: %.3fus (%u c) (addr=0x%x) (S)", count2 / (utils::get_tsc_freq() / 1000'000.), count2, addr); - } - - if (ok) - { - break; - } - - last_ftime = -1; - [[fallthrough]]; - } - case 0: - { - if (addr == last_faddr) - { - last_fail++; - } - - if (last_ftime != umax) - { - last_faddr = 0; - return false; - } - - utils::prefetch_read(rdata); - utils::prefetch_read(rdata + 64); - last_faddr = addr; - last_ftime = res.load() & -128; - last_ftsc = utils::get_tsc(); - return false; - } - default: - { - if (count > 20000 && g_cfg.core.perf_report) [[unlikely]] - { - perf_log.warning("PUTLLC: took too long: %.3fus (%u c) (addr = 0x%x)", count / (utils::get_tsc_freq() / 1000'000.), count, addr); - } - - break; - } - } - - if (addr == last_faddr) - { - last_succ++; - } - - last_faddr = 0; - return true; + utils::trigger_write_page_fault(vm::base(addr)); } auto& super_data = *vm::get_super_ptr(addr); @@ -4189,7 +3508,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) { result = 0; } - else if (!g_use_rtm) + else { // Provoke page fault utils::trigger_write_page_fault(vm::base(addr)); @@ -4200,16 +3519,6 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) mov_rdata(sdata, *static_cast(to_write)); vm::reservation_acquire(addr) += 32; } - else if (cpu->get_class() != thread_class::spu) - { - u64 stx, ftx; - result = spu_putlluc_tx(addr, to_write, &stx, &ftx); - } - else - { - auto _spu = static_cast(cpu); - result = spu_putlluc_tx(addr, to_write, &_spu->stx, &_spu->ftx); - } if (result == 0) { @@ -5104,29 +4413,15 @@ bool spu_thread::process_mfc_cmd() { ntime = vm::reservation_acquire(addr); - if (ntime & vm::rsrv_unique_lock) + if (ntime & 127) { // There's an on-going reservation store, wait continue; } - u64 test_mask = -1; + mov_rdata(rdata, data); - if (ntime & 127) - { - // Try to use TSX to obtain data atomically - if (!g_use_rtm || !spu_getllar_tx(addr, rdata, this, ntime & -128)) - { - // See previous ntime check. - continue; - } - } - else - { - mov_rdata(rdata, data); - } - - if (u64 time0 = vm::reservation_acquire(addr); (ntime & test_mask) != (time0 & test_mask)) + if (u64 time0 = vm::reservation_acquire(addr); ntime != time0) { // Reservation data has been modified recently if (time0 & vm::rsrv_unique_lock) i += 12; diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h index 6ff9aa2376..d21a593959 100644 --- a/rpcs3/Emu/Memory/vm_reservation.h +++ b/rpcs3/Emu/Memory/vm_reservation.h @@ -6,9 +6,6 @@ #include "util/tsc.hpp" #include -extern bool g_use_rtm; -extern u64 g_rtm_tx_limit2; - #ifdef _MSC_VER extern "C" { @@ -143,7 +140,7 @@ namespace vm void reservation_op_internal(u32 addr, std::function func); template - inline SAFE_BUFFERS(auto) reservation_op(CPU& cpu, _ptr_base ptr, F op) + inline SAFE_BUFFERS(auto) reservation_op(CPU& /*cpu*/, _ptr_base ptr, F op) { // Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_op: unsupported type"); @@ -162,188 +159,6 @@ namespace vm auto& res = vm::reservation_acquire(addr); //_m_prefetchw(&res); -#if defined(ARCH_X64) - if (g_use_rtm) - { - // Stage 1: single optimistic transaction attempt - unsigned status = -1; - u64 _old = 0; - - auto stamp0 = utils::get_tsc(), stamp1 = stamp0, stamp2 = stamp0; - -#ifndef _MSC_VER - __asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2); -#else - status = _xbegin(); - if (status == umax) -#endif - { - if (res & rsrv_unique_lock) - { -#ifndef _MSC_VER - __asm__ volatile ("xend; mov $-1, %%eax;" ::: "memory"); -#else - _xend(); -#endif - goto stage2; - } - - if constexpr (std::is_void_v>) - { - std::invoke(op, *sptr); - const u64 old_time = res.fetch_add(128); -#ifndef _MSC_VER - __asm__ volatile ("xend;" ::: "memory"); -#else - _xend(); -#endif - if constexpr (Ack) - reservation_notifier_notify(addr, old_time); - return; - } - else - { - if (auto result = std::invoke(op, *sptr)) - { - const u64 old_time = res.fetch_add(128); -#ifndef _MSC_VER - __asm__ volatile ("xend;" ::: "memory"); -#else - _xend(); -#endif - if constexpr (Ack) - reservation_notifier_notify(addr, old_time); - return result; - } - else - { -#ifndef _MSC_VER - __asm__ volatile ("xend;" ::: "memory"); -#else - _xend(); -#endif - return result; - } - } - } - - stage2: -#ifndef _MSC_VER - __asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory"); -#endif - stamp1 = utils::get_tsc(); - - // Stage 2: try to lock reservation first - _old = res.fetch_add(1); - - // Compute stamps excluding memory touch - stamp2 = utils::get_tsc() - (stamp1 - stamp0); - - // Start lightened transaction - for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = utils::get_tsc()) - { - if (cpu.has_pause_flag()) - { - break; - } - -#ifndef _MSC_VER - __asm__ goto ("xbegin %l[retry];" ::: "memory" : retry); -#else - status = _xbegin(); - - if (status != umax) [[unlikely]] - { - goto retry; - } -#endif - if constexpr (std::is_void_v>) - { - std::invoke(op, *sptr); -#ifndef _MSC_VER - __asm__ volatile ("xend;" ::: "memory"); -#else - _xend(); -#endif - res += 127; - if (Ack) - reservation_notifier_notify(addr, _old); - return; - } - else - { - if (auto result = std::invoke(op, *sptr)) - { -#ifndef _MSC_VER - __asm__ volatile ("xend;" ::: "memory"); -#else - _xend(); -#endif - res += 127; - if (Ack) - reservation_notifier_notify(addr, _old); - return result; - } - else - { -#ifndef _MSC_VER - __asm__ volatile ("xend;" ::: "memory"); -#else - _xend(); -#endif - return result; - } - } - - retry: -#ifndef _MSC_VER - __asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory"); -#endif - - if (!status) - { - break; - } - } - - // Stage 3: all failed, heavyweight fallback (see comments at the bottom) - if constexpr (std::is_void_v>) - { - vm::reservation_op_internal(addr, [&] - { - std::invoke(op, *sptr); - return true; - }); - - if constexpr (Ack) - reservation_notifier_notify(addr, _old); - return; - } - else - { - auto result = std::invoke_result_t(); - - vm::reservation_op_internal(addr, [&] - { - if ((result = std::invoke(op, *sptr))) - { - return true; - } - else - { - return false; - } - }); - - if (Ack && result) - reservation_notifier_notify(addr, _old); - return result; - } - } -#else - static_cast(cpu); -#endif /* ARCH_X64 */ - // Lock reservation and perform heavyweight lock reservation_shared_lock_internal(res); diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index b4e0699fbc..6a4d941ecc 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -68,10 +68,6 @@ LOG_CHANNEL(sys_log, "SYS"); // Preallocate 32 MiB stx::manual_typemap g_fixed_typemap; -bool g_use_rtm = false; -u64 g_rtm_tx_limit1 = 0; -u64 g_rtm_tx_limit2 = 0; - std::string g_cfg_defaults; atomic_t g_watchdog_hold_ctr{0}; @@ -1540,9 +1536,6 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch, m_localized_title = std::string(psf::get_string(_psf, fmt::format("TITLE_%02d", static_cast(g_cfg.sys.language.get())), m_title)); sys_log.notice("Localized Title: %s", GetLocalizedTitle()); - // Set RTM usage - g_use_rtm = utils::has_rtm() && (((utils::has_mpx() && !utils::has_tsx_force_abort()) && g_cfg.core.enable_TSX == tsx_usage::enabled) || g_cfg.core.enable_TSX == tsx_usage::forced); - { // Log some extra info in case of boot #if defined(HAVE_VULKAN) @@ -1553,11 +1546,6 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch, #endif sys_log.notice("Used configuration:\n%s\n", g_cfg.to_string()); - if (g_use_rtm && (!utils::has_mpx() || utils::has_tsx_force_abort())) - { - sys_log.warning("TSX forced by User"); - } - // Initialize patch engine g_fxo->need(); @@ -1566,14 +1554,6 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch, g_fxo->get().append_title_patches(m_title_id); } - if (g_use_rtm) - { - // Update supplementary settings - const f64 _1ns = utils::get_tsc_freq() / 1000'000'000.; - g_rtm_tx_limit1 = static_cast(g_cfg.core.tx_limit1_ns * _1ns); - g_rtm_tx_limit2 = static_cast(g_cfg.core.tx_limit2_ns * _1ns); - } - // Set bdvd_dir std::string bdvd_dir = g_cfg_vfs.get(g_cfg_vfs.dev_bdvd, rpcs3::utils::get_emu_dir()); { diff --git a/rpcs3/Emu/System.h b/rpcs3/Emu/System.h index 41dd6229bd..0c26d09a4b 100644 --- a/rpcs3/Emu/System.h +++ b/rpcs3/Emu/System.h @@ -478,7 +478,3 @@ public: }; extern Emulator Emu; - -extern bool g_use_rtm; -extern u64 g_rtm_tx_limit1; -extern u64 g_rtm_tx_limit2; diff --git a/rpcs3/Emu/system_config.cpp b/rpcs3/Emu/system_config.cpp index 49b949a9b8..cac546162d 100644 --- a/rpcs3/Emu/system_config.cpp +++ b/rpcs3/Emu/system_config.cpp @@ -8,11 +8,6 @@ cfg_root g_cfg{}; cfg_root g_backup_cfg{}; -bool cfg_root::node_core::enable_tsx_by_default() -{ - return utils::has_rtm() && utils::has_mpx() && !utils::has_tsx_force_abort(); -} - std::string cfg_root::node_sys::get_random_system_name() { std::srand(static_cast(std::time(nullptr))); diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index dd3182cd11..bfb4cc8ba0 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -12,11 +12,6 @@ struct cfg_root : cfg::node { struct node_core : cfg::node { - private: - /** We don't wanna include the sysinfo header here */ - static bool enable_tsx_by_default(); - - public: node_core(cfg::node* _this) : cfg::node(_this, "Core") {} cfg::_enum ppu_decoder{ this, "PPU Decoder", ppu_decoder_type::llvm }; @@ -65,7 +60,6 @@ struct cfg_root : cfg::node cfg::uint<0, 16> mfc_transfers_shuffling{ this, "MFC Commands Shuffling Limit", 0 }; cfg::uint<0, 10000> mfc_transfers_timeout{ this, "MFC Commands Timeout", 0, true }; cfg::_bool mfc_shuffling_in_steps{ this, "MFC Commands Shuffling In Steps", false, true }; - cfg::_enum enable_TSX{ this, "Enable TSX", enable_tsx_by_default() ? tsx_usage::enabled : tsx_usage::disabled }; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully cfg::_enum spu_xfloat_accuracy{ this, "XFloat Accuracy", xfloat_accuracy::approximate, false }; cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip) @@ -84,8 +78,6 @@ struct cfg_root : cfg::node cfg::_bool hle_lwmutex{ this, "HLE lwmutex" }; // Force alternative lwmutex/lwcond implementation cfg::uint64 spu_llvm_lower_bound{ this, "SPU LLVM Lower Bound" }; cfg::uint64 spu_llvm_upper_bound{ this, "SPU LLVM Upper Bound", 0xffffffffffffffff }; - cfg::uint64 tx_limit1_ns{this, "TSX Transaction First Limit", 800}; // In nanoseconds - cfg::uint64 tx_limit2_ns{this, "TSX Transaction Second Limit", 2000}; // In nanoseconds cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100 }; // Changing this from 100 (percentage) may affect game speed in unexpected ways cfg::uint<0, 3000> spu_wakeup_delay{ this, "SPU Wake-Up Delay", 0, true }; diff --git a/rpcs3/Emu/system_config_types.cpp b/rpcs3/Emu/system_config_types.cpp index 20c6f7151c..52564c7d06 100644 --- a/rpcs3/Emu/system_config_types.cpp +++ b/rpcs3/Emu/system_config_types.cpp @@ -196,22 +196,6 @@ void fmt_class_string::format(std::string& out, u64 arg) }); } -template <> -void fmt_class_string::format(std::string& out, u64 arg) -{ - format_enum(out, arg, [](tsx_usage value) - { - switch (value) - { - case tsx_usage::disabled: return "Disabled"; - case tsx_usage::enabled: return "Enabled"; - case tsx_usage::forced: return "Forced"; - } - - return unknown; - }); -} - template <> void fmt_class_string::format(std::string& out, u64 arg) { diff --git a/rpcs3/Emu/system_config_types.h b/rpcs3/Emu/system_config_types.h index 6099b700a9..c88555efaa 100644 --- a/rpcs3/Emu/system_config_types.h +++ b/rpcs3/Emu/system_config_types.h @@ -248,13 +248,6 @@ enum class rsx_fifo_mode : unsigned as_ps3, }; -enum class tsx_usage -{ - disabled, - enabled, - forced, -}; - enum class enter_button_assign { circle, // CELL_SYSUTIL_ENTER_BUTTON_ASSIGN_CIRCLE diff --git a/rpcs3/rpcs3qt/emu_settings.cpp b/rpcs3/rpcs3qt/emu_settings.cpp index dc837569e3..0c99cfc119 100644 --- a/rpcs3/rpcs3qt/emu_settings.cpp +++ b/rpcs3/rpcs3qt/emu_settings.cpp @@ -988,14 +988,6 @@ QString emu_settings::GetLocalizedSetting(const QString& original, emu_settings_ case thread_scheduler_mode::os: return tr("Operating System", "Thread Scheduler Mode"); } break; - case emu_settings_type::EnableTSX: - switch (static_cast(index)) - { - case tsx_usage::disabled: return tr("Disabled", "Enable TSX"); - case tsx_usage::enabled: return tr("Enabled", "Enable TSX"); - case tsx_usage::forced: return tr("Forced", "Enable TSX"); - } - break; case emu_settings_type::Renderer: switch (static_cast(index)) { diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h index 7a68b65d2a..c398b07a28 100644 --- a/rpcs3/rpcs3qt/emu_settings_type.h +++ b/rpcs3/rpcs3qt/emu_settings_type.h @@ -20,7 +20,6 @@ enum class emu_settings_type MFCDebug, MaxLLVMThreads, LLVMPrecompilation, - EnableTSX, AccurateSpuDMA, AccurateClineStores, AccurateRSXAccess, @@ -233,7 +232,6 @@ inline static const std::map settings_location { emu_settings_type::MFCDebug, { "Core", "MFC Debug"}}, { emu_settings_type::MaxLLVMThreads, { "Core", "Max LLVM Compile Threads"}}, { emu_settings_type::LLVMPrecompilation, { "Core", "LLVM Precompilation"}}, - { emu_settings_type::EnableTSX, { "Core", "Enable TSX"}}, { emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}}, { emu_settings_type::AccurateClineStores, { "Core", "Accurate Cache Line Stores"}}, { emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index 8798e95ae6..2e3964bbe7 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -288,79 +288,6 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std SubscribeTooltip(ui->gb_spu_threads, tooltips.settings.preferred_spu_threads); ui->preferredSPUThreads->setItemText(ui->preferredSPUThreads->findData(0), tr("Auto", "Preferred SPU threads")); - if (utils::has_rtm()) - { - m_emu_settings->EnhanceComboBox(ui->enableTSX, emu_settings_type::EnableTSX); - SubscribeTooltip(ui->gb_tsx, tooltips.settings.enable_tsx); - - if (!utils::has_mpx() || utils::has_tsx_force_abort()) - { - remove_item(ui->enableTSX, static_cast(tsx_usage::enabled), static_cast(g_cfg.core.enable_TSX.def)); - } - - connect(ui->enableTSX, QOverload::of(&QComboBox::currentIndexChanged), this, [this](int index) - { - if (index < 0) return; - if (const auto [text, value] = get_data(ui->enableTSX, index); value == static_cast(tsx_usage::forced) && - (!utils::has_mpx() || utils::has_tsx_force_abort())) - { - QString title; - QString message; - if (!utils::has_mpx()) - { - title = tr("Haswell/Broadwell TSX Warning"); - message = gui::utils::make_paragraph(tr( - "RPCS3 has detected that you are using TSX functions on a Haswell or Broadwell CPU.\n" - "Intel has deactivated these functions in newer Microcode revisions, since they can lead to unpredicted behaviour.\n" - "That means using TSX may break games or even damage your data.\n" - "We recommend to disable this feature and update your computer BIOS.\n" - "\n" - "Do you wish to use TSX anyway?" - )); - } - else - { - title = tr("TSX-FA Warning"); - message = gui::utils::make_paragraph(tr( - "RPCS3 has detected your CPU only supports TSX-FA.\n" - "That means using TSX may break games or even damage your data.\n" - "We recommend to disable this feature.\n" - "\n" - "Do you wish to use TSX anyway?" - )); - } - - QMessageBox mb; - mb.setWindowModality(Qt::WindowModal); - mb.setWindowTitle(title); - mb.setIcon(QMessageBox::Critical); - mb.setTextFormat(Qt::RichText); - mb.setStandardButtons(QMessageBox::Yes | QMessageBox::No); - mb.setDefaultButton(QMessageBox::No); - mb.setText(message); - mb.layout()->setSizeConstraint(QLayout::SetFixedSize); - - if (mb.exec() == QMessageBox::No) - { - // Reset if the messagebox was answered with no. This prevents the currentIndexChanged signal in EnhanceComboBox - ui->enableTSX->setCurrentIndex(find_item(ui->enableTSX, static_cast(g_cfg.core.enable_TSX.def))); - } - } - }); - } - else - { - ui->enableTSX->setEnabled(false); - ui->enableTSX->setPlaceholderText(tr("Not supported", "Enable TSX")); - SubscribeTooltip(ui->enableTSX, tr("Unfortunately, your CPU model does not support this instruction set.", "Enable TSX")); - - m_emu_settings->SetSetting(emu_settings_type::EnableTSX, fmt::format("%s", tsx_usage::disabled)); - connect(this, &settings_dialog::signal_restore_dependant_defaults, [this]() - { - m_emu_settings->SetSetting(emu_settings_type::EnableTSX, fmt::format("%s", tsx_usage::disabled)); - }); - } - // PPU tool tips SubscribeTooltip(ui->ppu__static, tooltips.settings.ppu__static); SubscribeTooltip(ui->ppu_llvm, tooltips.settings.ppu_llvm); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index 267c140e44..4556ae9356 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui +++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -266,24 +266,6 @@ - - - - - 0 - 0 - - - - TSX Instructions - - - - - - - - diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h index ccd3e1cdf8..f7ec927332 100644 --- a/rpcs3/rpcs3qt/tooltips.h +++ b/rpcs3/rpcs3qt/tooltips.h @@ -91,7 +91,6 @@ public: const QString xfloat = tr("Control accuracy to SPU float vectors processing.\nFixes bugs in various games at the cost of performance.\nThis setting is only applied when SPU Decoder is set to Dynamic or LLVM."); const QString enable_thread_scheduler = tr("Control how RPCS3 utilizes the threads of your system.\nEach option heavily depends on the game and on your CPU. It's recommended to try each option to find out which performs the best.\nChanging the thread scheduler is not supported on CPUs with less than 12 threads."); const QString spu_loop_detection = tr("Try to detect loop conditions in SPU kernels and use them as scheduling hints.\nImproves performance and reduces CPU usage.\nMay cause severe audio stuttering in rare cases."); - const QString enable_tsx = tr("Enable usage of TSX instructions.\nNeeds to be forced on some Haswell or Broadwell CPUs or CPUs with the TSX-FA instruction set.\nForcing TSX in these cases may lead to system and performance instability, use it with caution."); const QString spu_block_size = tr("This option controls the SPU analyser, particularly the size of compiled units. The Mega and Giga modes may improve performance by tying smaller units together, decreasing the number of compiled units but increasing their size.\nUse the Safe mode for maximum compatibility."); const QString preferred_spu_threads = tr("Some SPU stages are sensitive to race conditions and allowing a limited number at a time helps alleviate performance stalls.\nSetting this to a smaller value might improve performance and reduce stuttering in some games.\nLeave this on auto if performance is negatively affected when setting a small value."); const QString max_cpu_preempt = tr("Reduces CPU usage and power consumption, improving battery life on mobile devices. (0 means disabled)\nHigher values cause a more pronounced effect, but may cause audio or performance issues. A value of 50 or less is recommended.\nThis option forces an FPS limit because it's active when framerate is stable.\nThe lighter the game is on the hardware, the more power is saved by it. (until the preemption count barrier is reached)"); diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp index 2e09ad6dd0..deca38b413 100644 --- a/rpcs3/util/asm.hpp +++ b/rpcs3/util/asm.hpp @@ -5,9 +5,6 @@ #include "util/atomic.hpp" #include -extern bool g_use_rtm; -extern u64 g_rtm_tx_limit1; - #ifdef ARCH_X64 #ifdef _MSC_VER #include @@ -19,70 +16,6 @@ extern u64 g_rtm_tx_limit1; namespace utils { - // Transaction helper (result = pair of success and op result, or just bool) - template > - inline auto tx_start(F op) - { -#if defined(ARCH_X64) - uint status = -1; - - for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc()) - { -#ifndef _MSC_VER - __asm__ goto ("xbegin %l[retry];" ::: "memory" : retry); -#else - status = _xbegin(); - - if (status != _XBEGIN_STARTED) [[unlikely]] - { - goto retry; - } -#endif - - if constexpr (std::is_void_v) - { - std::invoke(op); -#ifndef _MSC_VER - __asm__ volatile ("xend;" ::: "memory"); -#else - _xend(); -#endif - return true; - } - else - { - auto result = std::invoke(op); -#ifndef _MSC_VER - __asm__ volatile ("xend;" ::: "memory"); -#else - _xend(); -#endif - return std::make_pair(true, std::move(result)); - } - - retry: -#ifndef _MSC_VER - __asm__ volatile ("movl %%eax, %0;" : "=r" (status) :: "memory"); -#endif - if (!status) [[unlikely]] - { - break; - } - } -#else - static_cast(op); -#endif - - if constexpr (std::is_void_v) - { - return false; - } - else - { - return std::make_pair(false, R()); - } - }; - // Try to prefetch to Level 2 cache since it's not split to data/code on most processors template constexpr void prefetch_exec(T func)