From b76e05d5ddaa93039a6e9565fb3bf237e77e95a8 Mon Sep 17 00:00:00 2001
From: AlpinDale
Date: Tue, 9 Dec 2025 07:16:22 +0430
Subject: [PATCH] better block linking with delinkers; memory operand optims

Block linking
- BlockManager gains `block_links`, an ordered map keyed by
  (guest destination, link record) whose values are delinker callbacks.
  InvalidateBlock, InvalidateRange and Clear run and erase the matching
  delinkers before dropping the blocks themselves.
- ExecutionEngine::LinkBlock saves the instruction it overwrites, patches
  the branch through std::atomic_ref, and registers DirectBlockDelinker,
  which writes the saved instruction back and deletes the link record.

Memory operands
- Arm64CodeGenerator::add gets a four-operand overload that encodes
  ADD (shifted register) with an LSL amount of 0..3.
- X86_64Translator::CalculateMemoryAddress uses that overload for index
  scales 2/4/8, add_imm for displacements in [-256, 256), and
  materialises larger displacements in SCRATCH_REG.
- A missing base register is encoded as register 31 so the ADD reads
  XZR rather than x0.
---
 src/core/jit/arm64_codegen.cpp     |  5 ++
 src/core/jit/arm64_codegen.h       |  1 +
 src/core/jit/block_manager.cpp     | 32 ++++++++++++
 src/core/jit/block_manager.h       | 30 ++++++++++++
 src/core/jit/execution_engine.cpp  | 55 ++++++++++++++++-----
 src/core/jit/x86_64_translator.cpp | 78 ++++++++++++++++++++++++------
 6 files changed, 174 insertions(+), 27 deletions(-)

diff --git a/src/core/jit/arm64_codegen.cpp b/src/core/jit/arm64_codegen.cpp
index 516240295..bbcab2809 100644
--- a/src/core/jit/arm64_codegen.cpp
+++ b/src/core/jit/arm64_codegen.cpp
@@ -228,6 +228,11 @@ void Arm64CodeGenerator::add(int dst, int src1, int src2) {
     emit32(0x8B000000 | (dst << 0) | (src1 << 5) | (src2 << 16));
 }
 
+void Arm64CodeGenerator::add(int dst, int src1, int src2, int shift) { // dst = src1 + (src2 LSL shift)
+    ASSERT_MSG(shift >= 0 && shift <= 3, "Invalid shift amount");
+    emit32(0x8B000000 | (dst << 0) | (src1 << 5) | (src2 << 16) | (shift << 10)); // imm6 = bits [15:10]
+}
+
 void Arm64CodeGenerator::add_imm(int dst, int src1, s32 imm) {
     if (imm >= 0 && imm < 4096) {
         emit32(0x91000000 | (dst << 0) | (src1 << 5) | (imm << 10));
diff --git a/src/core/jit/arm64_codegen.h b/src/core/jit/arm64_codegen.h
index 519802c44..82151ac45 100644
--- a/src/core/jit/arm64_codegen.h
+++ b/src/core/jit/arm64_codegen.h
@@ -43,6 +43,7 @@ public:
 
     // Arithmetic operations
     void add(int dst, int src1, int src2);
+    void add(int dst, int src1, int src2, int shift); // src2 is shifted left by `shift` (0..3)
     void add_imm(int dst, int src1, s32 imm);
     void sub(int dst, int src1, int src2);
     void sub_imm(int dst, int src1, s32 imm);
diff --git a/src/core/jit/block_manager.cpp b/src/core/jit/block_manager.cpp
index 172a817ca..dd6b6e6cb 100644
--- a/src/core/jit/block_manager.cpp
+++ b/src/core/jit/block_manager.cpp
@@ -37,6 +37,16 @@ CodeBlock* BlockManager::CreateBlock(VAddr ps4_address, void* arm64_code, size_t
 
 void BlockManager::InvalidateBlock(VAddr ps4_address) {
     std::lock_guard lock(mutex);
+
+    // Delink all links pointing to this block
+    auto lower = block_links.lower_bound({ps4_address, nullptr}); // keys sort by destination, then pointer
+    auto upper = block_links.upper_bound(
+        {ps4_address, reinterpret_cast(UINTPTR_MAX)});
+    for (auto it = lower; it != upper;) {
+        it->second(it->first.host_link); // run the delinker registered for this link
+        it = block_links.erase(it);
+    }
+
     blocks.erase(ps4_address);
     LOG_DEBUG(Core, "Invalidated code block at PS4 address {:#x}", ps4_address);
 }
@@ -44,6 +54,17 @@ void BlockManager::InvalidateBlock(VAddr ps4_address) {
 void BlockManager::InvalidateRange(VAddr start, VAddr end) {
     std::lock_guard lock(mutex);
 
+    // Delink all links pointing to blocks in this range
+    auto link_it = block_links.begin();
+    while (link_it != block_links.end()) {
+        if (link_it->first.guest_destination >= start && link_it->first.guest_destination < end) {
+            link_it->second(link_it->first.host_link);
+            link_it = block_links.erase(link_it); // erase returns the next valid iterator
+        } else {
+            ++link_it;
+        }
+    }
+
     auto it = blocks.begin();
     while (it != blocks.end()) {
         VAddr block_addr = it->first;
@@ -77,8 +98,19 @@ void BlockManager::AddDependency(VAddr block_address, VAddr dependency) {
     }
 }
 
+void BlockManager::AddBlockLink(VAddr guest_dest, ExitFunctionLinkData* link_data,
+                                BlockDelinkerFunc delinker) {
+    std::lock_guard lock(mutex);
+    block_links[{guest_dest, link_data}] = delinker; // an existing entry with the same key is overwritten
+}
+
 void BlockManager::Clear() {
     std::lock_guard lock(mutex);
+    // Delink all links before clearing
+    for (auto& [tag, delinker] : block_links) {
+        delinker(tag.host_link);
+    }
+    block_links.clear();
     blocks.clear();
 }
 
diff --git a/src/core/jit/block_manager.h b/src/core/jit/block_manager.h
index 07d6f80fa..90a8436dc 100644
--- a/src/core/jit/block_manager.h
+++ b/src/core/jit/block_manager.h
@@ -3,6 +3,9 @@
 
 #pragma once
 
+#include
+#include
+#include
 #include
 #include
 #include
@@ -11,6 +14,30 @@
 
 namespace Core::Jit {
 
+struct ExitFunctionLinkData { // one patched branch; allocated in LinkBlock, freed by its delinker
+    void* host_code; // arm64 code of the link target
+    VAddr guest_rip; // guest address of the link target
+    void* caller_address; // location of the patched branch instruction
+    u32 original_instruction; // value written back when the link is removed
+};
+
+using BlockDelinkerFunc = std::function;
+
+struct BlockLinkTag { // map key: orders by guest_destination, then by host_link pointer
+    VAddr guest_destination;
+    ExitFunctionLinkData* host_link;
+
+    bool operator<(const BlockLinkTag& other) const {
+        if (guest_destination < other.guest_destination) {
+            return true;
+        } else if (guest_destination == other.guest_destination) {
+            return host_link < other.host_link;
+        } else {
+            return false;
+        }
+    }
+};
+
 struct CodeBlock {
     VAddr ps4_address;
     void* arm64_code;
@@ -41,6 +68,8 @@ public:
     void InvalidateBlock(VAddr ps4_address);
     void InvalidateRange(VAddr start, VAddr end);
     void AddDependency(VAddr block_address, VAddr dependency);
+    void AddBlockLink(VAddr guest_dest, ExitFunctionLinkData* link_data,
+                      BlockDelinkerFunc delinker);
     void Clear();
 
     size_t GetBlockCount() const {
@@ -49,6 +78,7 @@ public:
     size_t GetTotalCodeSize() const;
 
     std::unordered_map> blocks;
+    std::map block_links;
     mutable std::mutex mutex;
 };
 
diff --git a/src/core/jit/execution_engine.cpp b/src/core/jit/execution_engine.cpp
index ed7c986e2..65f2ffb06 100644
--- a/src/core/jit/execution_engine.cpp
+++ b/src/core/jit/execution_engine.cpp
@@ -250,6 +250,18 @@ CodeBlock* ExecutionEngine::TranslateBlock(VAddr ps4_address) {
     return new_block;
 }
 
+static void DirectBlockDelinker(ExitFunctionLinkData* record, bool is_call) { // is_call is currently unused
+    void* caller_addr = record->caller_address;
+    u32 original_inst = record->original_instruction;
+
+    std::atomic_ref(*reinterpret_cast(caller_addr))
+        .store(original_inst, std::memory_order::relaxed); // NOTE(review): no pthread_jit_write_protect_np(0) here, unlike LinkBlock - confirm the page is writable
+#if defined(__APPLE__) && defined(ARCH_ARM64)
+    __builtin___clear_cache(static_cast(caller_addr), static_cast(caller_addr) + 4);
+#endif
+    delete record; // record was allocated with new in LinkBlock
+}
+
 void ExecutionEngine::LinkBlock(CodeBlock* block, VAddr target_address) {
     CodeBlock* target_block = block_manager->GetBlock(target_address);
     if (!target_block) {
@@ -261,25 +273,33 @@ void ExecutionEngine::LinkBlock(CodeBlock* block, VAddr target_address) {
 #if defined(__APPLE__) && defined(ARCH_ARM64)
         pthread_jit_write_protect_np(0);
 #endif
-        // Calculate offset from patch location to target
-        s64 offset = reinterpret_cast(target_block->arm64_code) -
-                     reinterpret_cast(block->branch_patch_location);
+        void* caller_address = block->branch_patch_location;
+        s64 offset =
+            reinterpret_cast(target_block->arm64_code) - reinterpret_cast(caller_address);
 
         // Check if we can use a relative branch (within ±128MB)
         if (offset >= -0x8000000 && offset < 0x8000000) {
             s32 imm26 = static_cast(offset / 4);
-            u32* patch_ptr = reinterpret_cast(block->branch_patch_location);
-            // Patch the branch instruction: 0x14000000 | (imm26 & 0x3FFFFFF)
-            *patch_ptr = 0x14000000 | (imm26 & 0x3FFFFFF);
+            u32* patch_ptr = reinterpret_cast(caller_address);
+            u32 branch_inst = 0x14000000 | (imm26 & 0x3FFFFFF);
+
+            u32 original_inst = *patch_ptr; // saved so the delinker can undo the patch
+            std::atomic_ref(*patch_ptr).store(branch_inst, std::memory_order::relaxed);
+
+            // Register delinker
+            ExitFunctionLinkData* link_data = new ExitFunctionLinkData{
+                target_block->arm64_code, target_address, caller_address, original_inst};
+            block_manager->AddBlockLink(target_address, link_data, [](ExitFunctionLinkData* r) {
+                DirectBlockDelinker(r, false);
+            });
         } else {
-            // Far branch - need to use indirect branch
-            // For now, leave as-is (will use the placeholder branch)
+            // Far branch - need to use indirect branch via thunk
             LOG_DEBUG(Core, "Branch target too far for direct linking: offset={}", offset);
         }
 #if defined(__APPLE__) && defined(ARCH_ARM64)
         pthread_jit_write_protect_np(1);
-        __builtin___clear_cache(static_cast(block->branch_patch_location),
-                                static_cast(block->branch_patch_location) + 4);
+        __builtin___clear_cache(static_cast(caller_address),
+                                static_cast(caller_address) + 4);
 #endif
         block->is_linked = true;
         LOG_DEBUG(Core, "Linked block {:#x} to {:#x}", block->ps4_address, target_address);
@@ -295,8 +315,19 @@ void ExecutionEngine::LinkBlock(CodeBlock* block, VAddr target_address) {
         if (offset >= -0x8000000 && offset < 0x8000000) {
             s32 imm26 = static_cast(offset / 4);
             u32* patch_ptr = reinterpret_cast(link_location);
-            *patch_ptr = 0x14000000 | (imm26 & 0x3FFFFFF);
-            block->code_size += 4; // Update block size
+            u32 branch_inst = 0x14000000 | (imm26 & 0x3FFFFFF);
+            u32 original_inst = 0x14000002; // fixed restore value (B with imm26 = 2), not read from *patch_ptr
+
+            std::atomic_ref(*patch_ptr).store(branch_inst, std::memory_order::relaxed);
+
+            // Register delinker
+            ExitFunctionLinkData* link_data = new ExitFunctionLinkData{
+                target_block->arm64_code, target_address, link_location, original_inst};
+            block_manager->AddBlockLink(target_address, link_data, [](ExitFunctionLinkData* r) {
+                DirectBlockDelinker(r, false);
+            });
+
+            block->code_size += 4;
         }
 #if defined(__APPLE__) && defined(ARCH_ARM64)
         pthread_jit_write_protect_np(1);
diff --git a/src/core/jit/x86_64_translator.cpp b/src/core/jit/x86_64_translator.cpp
index 84fe8f7ec..d6391266c 100644
--- a/src/core/jit/x86_64_translator.cpp
+++ b/src/core/jit/x86_64_translator.cpp
@@ -116,29 +116,77 @@ void X86_64Translator::CalculateMemoryAddress(int dst_reg, const ZydisDecodedOpe
         }
     }
 
-    if (base_reg == -1 && index_reg == -1 && mem.disp.value == 0) {
+    s64 displacement = mem.disp.value;
+
+    if (base_reg == -1 && index_reg == -1 && displacement == 0) {
         codegen.mov(dst_reg, 0);
         return;
     }
 
-    if (base_reg != -1) {
-        codegen.mov(dst_reg, base_reg);
-    } else {
-        codegen.mov(dst_reg, 0);
-    }
-
-    if (index_reg != -1) {
-        if (mem.scale > 0 && mem.scale <= 8) {
-            codegen.mov(RegisterMapper::SCRATCH_REG, static_cast(mem.scale));
-            codegen.mul(RegisterMapper::SCRATCH_REG, index_reg, RegisterMapper::SCRATCH_REG);
-            codegen.add(dst_reg, dst_reg, RegisterMapper::SCRATCH_REG);
+    if (index_reg == -1) { // no index: address is base + displacement, or displacement alone
+        if (base_reg != -1) {
+            if (displacement == 0) {
+                codegen.mov(dst_reg, base_reg);
+            } else if (displacement >= -256 && displacement < 256) {
+                codegen.mov(dst_reg, base_reg);
+                codegen.add_imm(dst_reg, dst_reg, static_cast(displacement));
+            } else {
+                codegen.mov(dst_reg, base_reg);
+                codegen.mov_imm(RegisterMapper::SCRATCH_REG, displacement);
+                codegen.add(dst_reg, dst_reg, RegisterMapper::SCRATCH_REG);
+            }
         } else {
-            codegen.add(dst_reg, dst_reg, index_reg);
+            codegen.mov_imm(dst_reg, displacement);
         }
+        return;
     }
 
-    if (mem.disp.value != 0) {
-        codegen.add(dst_reg, dst_reg, static_cast(mem.disp.value));
+    if (base_reg == -1) {
+        base_reg = 31; // 31 is XZR as Rn in ADD (shifted register); register 0 would read x0
+    }
+
+    int scale = mem.scale;
+    if (scale == 0) {
+        scale = 1;
+    }
+
+    if (scale == 1) {
+        if (displacement == 0) {
+            codegen.add(dst_reg, base_reg, index_reg);
+        } else if (displacement >= -256 && displacement < 256) {
+            codegen.add(dst_reg, base_reg, index_reg);
+            codegen.add_imm(dst_reg, dst_reg, static_cast(displacement));
+        } else {
+            codegen.add(dst_reg, base_reg, index_reg);
+            codegen.mov_imm(RegisterMapper::SCRATCH_REG, displacement);
+            codegen.add(dst_reg, dst_reg, RegisterMapper::SCRATCH_REG);
+        }
+    } else if (scale == 2 || scale == 4 || scale == 8) {
+        int shift = (scale == 2) ? 1 : (scale == 4) ? 2 : 3; // log2(scale)
+        if (displacement == 0) {
+            codegen.add(dst_reg, base_reg, index_reg, shift);
+        } else {
+            codegen.add(dst_reg, base_reg, index_reg, shift);
+            if (displacement >= -256 && displacement < 256) {
+                codegen.add_imm(dst_reg, dst_reg, static_cast(displacement));
+            } else {
+                codegen.mov_imm(RegisterMapper::SCRATCH_REG, displacement);
+                codegen.add(dst_reg, dst_reg, RegisterMapper::SCRATCH_REG);
+            }
+        }
+    } else { // only reached for a scale outside {1, 2, 4, 8}
+        codegen.mov(dst_reg, base_reg); // NOTE(review): with base_reg == 31 this relies on mov treating 31 as XZR - confirm
+        codegen.mov_imm(RegisterMapper::SCRATCH_REG, scale);
+        codegen.mul(RegisterMapper::SCRATCH_REG, index_reg, RegisterMapper::SCRATCH_REG);
+        codegen.add(dst_reg, dst_reg, RegisterMapper::SCRATCH_REG);
+        if (displacement != 0) {
+            if (displacement >= -256 && displacement < 256) {
+                codegen.add_imm(dst_reg, dst_reg, static_cast(displacement));
+            } else {
+                codegen.mov_imm(RegisterMapper::SCRATCH_REG, displacement);
+                codegen.add(dst_reg, dst_reg, RegisterMapper::SCRATCH_REG);
+            }
+        }
     }
 }
 