From 316e01995b5543287f47e5df445083f76e339130 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 7 Dec 2025 19:16:24 +0300 Subject: [PATCH] rsx/fp: Re-implement ROP output resolve --- rpcs3/Emu/CMakeLists.txt | 1 - .../RSX/Program/FragmentProgramDecompiler.cpp | 154 ++++++-------- .../RSX/Program/FragmentProgramDecompiler.h | 4 - .../RSX/Program/FragmentProgramRegister.cpp | 196 ------------------ .../Emu/RSX/Program/FragmentProgramRegister.h | 111 ---------- rpcs3/emucore.vcxproj | 2 - rpcs3/emucore.vcxproj.filters | 6 - 7 files changed, 69 insertions(+), 405 deletions(-) delete mode 100644 rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp delete mode 100644 rpcs3/Emu/RSX/Program/FragmentProgramRegister.h diff --git a/rpcs3/Emu/CMakeLists.txt b/rpcs3/Emu/CMakeLists.txt index 395babec48..48674612c7 100644 --- a/rpcs3/Emu/CMakeLists.txt +++ b/rpcs3/Emu/CMakeLists.txt @@ -525,7 +525,6 @@ target_sources(rpcs3_emu PRIVATE RSX/Program/CgBinaryFragmentProgram.cpp RSX/Program/CgBinaryVertexProgram.cpp RSX/Program/FragmentProgramDecompiler.cpp - RSX/Program/FragmentProgramRegister.cpp RSX/Program/GLSLCommon.cpp RSX/Program/ProgramStateCache.cpp RSX/Program/program_util.cpp diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index b03f534fa9..c8b6156065 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -14,6 +14,8 @@ namespace rsx { namespace fragment_program { + using namespace rsx::assembler; + static const std::string reg_table[] = { "wpos", @@ -22,6 +24,28 @@ namespace rsx "tc0", "tc1", "tc2", "tc3", "tc4", "tc5", "tc6", "tc7", "tc8", "tc9", "ssa" }; + + static const std::vector s_fp32_output_set = + { + {.reg {.id = 0, .f16 = false }, .mask = 0xf }, + {.reg {.id = 2, .f16 = false }, .mask = 0xf }, + {.reg {.id = 3, .f16 = false }, .mask = 0xf }, + {.reg {.id = 4, .f16 = false }, .mask = 0xf }, + }; + + static const std::vector s_fp16_output_set = + { + {.reg {.id = 0, .f16 = true }, .mask = 0xf }, + {.reg {.id = 4, .f16 = true }, .mask = 0xf }, + {.reg {.id = 6, .f16 = true }, .mask = 0xf }, + {.reg {.id = 8, .f16 = true }, .mask = 0xf }, + }; + + static const RegisterRef s_z_export_reg = + { + .reg {.id = 1, .f16 = false }, + .mask = (1u << 2) + }; } } @@ -37,6 +61,26 @@ enum VectorLane : u8 W = 3, }; +std::vector get_fragment_program_output_set(u32 ctrl, u32 mrt_count) +{ + std::vector result; + if (mrt_count > 0) + { + result = (ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS) + ? s_fp32_output_set + : s_fp16_output_set; + + result.resize(mrt_count); + } + + if (ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT) + { + result.push_back(s_z_export_reg); + } + + return result; +} + FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size) : m_size(size) , m_prog(prog) @@ -157,8 +201,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags) } const u32 reg_index = dst.fp16 ? (dst.dest_reg >> 1) : dst.dest_reg; - ensure(reg_index < temp_registers.size()); - if (dst.opcode == RSX_FP_OPCODE_MOV && src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP && src0.tmp_reg_index == reg_index) @@ -171,8 +213,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags) return; } } - - temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w); } void FragmentProgramDecompiler::AddFlowOp(const std::string& code) @@ -528,26 +568,7 @@ template std::string FragmentProgramDecompiler::GetSRC(T src) switch (src.reg_type) { case RSX_FP_REGISTER_TYPE_TEMP: - - if (!src.fp16) - { - if (dst.opcode == RSX_FP_OPCODE_UP16 || - dst.opcode == RSX_FP_OPCODE_UP2 || - dst.opcode == RSX_FP_OPCODE_UP4 || - dst.opcode == RSX_FP_OPCODE_UPB || - dst.opcode == RSX_FP_OPCODE_UPG) - { - auto ® = temp_registers[src.tmp_reg_index]; - if (reg.requires_gather(src.swizzle_x)) - { - properties.has_gather_op = true; - AddReg(src.tmp_reg_index, src.fp16); - ret = getFloatTypeName(4) + reg.gather_r(); - break; - } - } - } - else if (precision_modifier == RSX_FP_PRECISION_HALF) + if (src.fp16 && precision_modifier == RSX_FP_PRECISION_HALF) { // clamp16() is not a cheap operation when emulated; avoid at all costs precision_modifier = RSX_FP_PRECISION_REAL; @@ -778,17 +799,6 @@ std::string FragmentProgramDecompiler::BuildCode() { // Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!! m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value); - - auto& r1 = temp_registers[1]; - if (r1.requires_gather(VectorLane::Z)) - { - // r1.zw was not written to - properties.has_gather_op = true; - main_epilogue << " r1.z = " << float4_type << r1.gather_r() << ".z;\n"; - - // Emit debug warning. Useful to diagnose regressions, but should be removed in future. - rsx_log.warning("ROP reads from shader depth without writing to it. Final value will be gathered."); - } } // Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z) @@ -816,33 +826,6 @@ std::string FragmentProgramDecompiler::BuildCode() continue; } - const auto block_index = ouput_register_indices[n]; - auto& r = temp_registers[block_index]; - - if (fp16_out) - { - // Check if we need a split/extract op - if (r.requires_split(0)) - { - main_epilogue << " " << reg_name << " = " << float4_type << r.split_h0() << ";\n"; - - // Emit debug warning. Useful to diagnose regressions, but should be removed in future. - rsx_log.warning("ROP reads from %s without writing to it. Final value will be extracted from the 32-bit register.", reg_name); - } - - continue; - } - - if (!r.requires_gather128()) - { - // Nothing to do - continue; - } - - // We need to gather the data from existing registers - main_epilogue << " " << reg_name << " = " << float4_type << r.gather_r() << ";\n"; - properties.has_gather_op = true; - // Emit debug warning. Useful to diagnose regressions, but should be removed in future. rsx_log.warning("ROP reads from %s without writing to it. Final value will be gathered.", reg_name); } @@ -1030,28 +1013,6 @@ std::string FragmentProgramDecompiler::BuildCode() OS << Format(divsq_func); } - // Declare register gather/merge if needed - if (properties.has_gather_op) - { - std::string float2 = getFloatTypeName(2); - - OS << float4 << " gather(" << float4 << " _h0, " << float4 << " _h1)\n"; - OS << "{\n"; - OS << " float x = uintBitsToFloat(packHalf2x16(_h0.xy));\n"; - OS << " float y = uintBitsToFloat(packHalf2x16(_h0.zw));\n"; - OS << " float z = uintBitsToFloat(packHalf2x16(_h1.xy));\n"; - OS << " float w = uintBitsToFloat(packHalf2x16(_h1.zw));\n"; - OS << " return " << float4 << "(x, y, z, w);\n"; - OS << "}\n\n"; - - OS << float2 << " gather(" << float4 << " _h)\n"; - OS << "{\n"; - OS << " float x = uintBitsToFloat(packHalf2x16(_h.xy));\n"; - OS << " float y = uintBitsToFloat(packHalf2x16(_h.zw));\n"; - OS << " return " << float2 << "(x, y);\n"; - OS << "}\n\n"; - } - if (properties.has_dynamic_register_load) { OS << @@ -1303,8 +1264,28 @@ std::string FragmentProgramDecompiler::Decompile() { auto graph = deconstruct_fragment_program(m_prog); - if (g_cfg.video.shader_precision != gpu_preset_level::low) + if (!graph.blocks.empty()) { + // The RSX CFG is missing the output block. We inject a fake tail block that ingests the ROP outputs. + BasicBlock* rop_block = nullptr; + BasicBlock* tail_block = &graph.blocks.back(); + if (tail_block->instructions.size() == 0) + { + // Merge block. Use this directly + rop_block = tail_block; + } + else + { + graph.blocks.push_back({}); + rop_block = &graph.blocks.back(); + + tail_block->insert_succ(rop_block); + rop_block->insert_pred(tail_block); + } + + const auto rop_inputs = get_fragment_program_output_set(m_prog.ctrl, m_prog.mrt_buffers_count); + rop_block->input_list.insert(rop_block->input_list.end(), rop_inputs.begin(), rop_inputs.end()); + FP::RegisterAnnotationPass annotation_pass{ m_prog }; FP::RegisterDependencyPass dependency_pass{}; @@ -1376,6 +1357,9 @@ std::string FragmentProgramDecompiler::Decompile() case EdgeType::ENDLOOP: // Pure merge block? break; + case EdgeType::NONE: + ensure(block.instructions.empty()); + break; default: fmt::throw_exception("Unhandled edge type %d", static_cast(pred.type)); break; diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h index b68750bdfc..09a02804c3 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h @@ -1,6 +1,5 @@ #pragma once #include "ShaderParam.h" -#include "FragmentProgramRegister.h" #include "RSXFragmentProgram.h" #include "Assembler/CFG.h" @@ -53,8 +52,6 @@ class FragmentProgramDecompiler int m_code_level; std::unordered_map m_constant_offsets; - std::array temp_registers; - std::string GetMask() const; void SetDst(std::string code, u32 flags = 0); @@ -175,7 +172,6 @@ public: // Decoded properties (out) bool has_lit_op = false; - bool has_gather_op = false; bool has_no_output = false; bool has_discard_op = false; bool has_tex_op = false; diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp deleted file mode 100644 index a14b142df6..0000000000 --- a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp +++ /dev/null @@ -1,196 +0,0 @@ -#include "stdafx.h" -#include "FragmentProgramRegister.h" - -namespace rsx -{ - MixedPrecisionRegister::MixedPrecisionRegister() - { - std::fill(content_mask.begin(), content_mask.end(), data_type_bits::undefined); - } - - void MixedPrecisionRegister::tag_h0(bool x, bool y, bool z, bool w) - { - if (x) content_mask[0] = data_type_bits::f16; - if (y) content_mask[1] = data_type_bits::f16; - if (z) content_mask[2] = data_type_bits::f16; - if (w) content_mask[3] = data_type_bits::f16; - } - - void MixedPrecisionRegister::tag_h1(bool x, bool y, bool z, bool w) - { - if (x) content_mask[4] = data_type_bits::f16; - if (y) content_mask[5] = data_type_bits::f16; - if (z) content_mask[6] = data_type_bits::f16; - if (w) content_mask[7] = data_type_bits::f16; - } - - void MixedPrecisionRegister::tag_r(bool x, bool y, bool z, bool w) - { - if (x) content_mask[0] = content_mask[1] = data_type_bits::f32; - if (y) content_mask[2] = content_mask[3] = data_type_bits::f32; - if (z) content_mask[4] = content_mask[5] = data_type_bits::f32; - if (w) content_mask[6] = content_mask[7] = data_type_bits::f32; - } - - void MixedPrecisionRegister::tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w) - { - if (file_index == umax) - { - // First-time use. Initialize... - const u32 real_index = is_fp16 ? (index >> 1) : index; - file_index = real_index; - } - - if (is_fp16) - { - ensure((index / 2) == file_index); - - if (index & 1) - { - tag_h1(x, y, z, w); - return; - } - - tag_h0(x, y, z, w); - return; - } - - tag_r(x, y, z, w); - } - - std::string MixedPrecisionRegister::gather_r() const - { - const auto half_index = file_index << 1; - const std::string reg = "r" + std::to_string(file_index); - const std::string gather_half_regs[] = { - "gather(h" + std::to_string(half_index) + ")", - "gather(h" + std::to_string(half_index + 1) + ")" - }; - - std::string outputs[4]; - for (int ch = 0; ch < 4; ++ch) - { - // FIXME: This approach ignores mixed register bits. Not ideal!!!! - const auto channel0 = content_mask[ch * 2]; - const auto is_fp16_ch = channel0 == content_mask[ch * 2 + 1] && channel0 == data_type_bits::f16; - outputs[ch] = is_fp16_ch ? gather_half_regs[ch / 2] : reg; - } - - // Grouping. Only replace relevant bits... - if (outputs[0] == outputs[1]) outputs[0] = ""; - if (outputs[2] == outputs[3]) outputs[2] = ""; - - // Assemble - bool group = false; - std::string result = ""; - constexpr std::string_view swz_mask = "xyzw"; - - for (int ch = 0; ch < 4; ++ch) - { - if (outputs[ch].empty()) - { - group = true; - continue; - } - - if (!result.empty()) - { - result += ", "; - } - - if (group) - { - ensure(ch > 0); - group = false; - - if (outputs[ch] == reg) - { - result += reg + "." + swz_mask[ch - 1] + swz_mask[ch]; - continue; - } - - result += outputs[ch]; - continue; - } - - const int subch = outputs[ch] == reg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles - result += outputs[ch] + "." + swz_mask[subch]; - } - - // Optimize dual-gather (128-bit gather) to use special function - const std::string double_gather = gather_half_regs[0] + ", " + gather_half_regs[1]; - if (result == double_gather) - { - result = "gather(h" + std::to_string(half_index) + ", h" + std::to_string(half_index + 1) + ")"; - } - - return "(" + result + ")"; - } - - std::string MixedPrecisionRegister::fetch_halfreg(u32 word_index) const - { - // Reads half-word 0 (H16x4) from a full real (R32x4) register - constexpr std::string_view swz_mask = "xyzw"; - const std::string reg = "r" + std::to_string(file_index); - const std::string hreg = "h" + std::to_string(file_index * 2 + word_index); - - const std::string word0_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2] + ")"; - const std::string word1_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2 + 1] + ")"; - const std::string words[] = { - "unpackHalf2x16(" + word0_bits + ")", - "unpackHalf2x16(" + word1_bits + ")" - }; - - // Assemble - std::string outputs[4]; - - ensure(word_index <= 1); - const int word_offset = word_index * 4; - for (int ch = 0; ch < 4; ++ch) - { - outputs[ch] = content_mask[ch + word_offset] == data_type_bits::f32 - ? words[ch / 2] - : hreg; - } - - // Grouping. Only replace relevant bits... - if (outputs[0] == outputs[1]) outputs[0] = ""; - if (outputs[2] == outputs[3]) outputs[2] = ""; - - // Assemble - bool group = false; - std::string result = ""; - - for (int ch = 0; ch < 4; ++ch) - { - if (outputs[ch].empty()) - { - group = true; - continue; - } - - if (!result.empty()) - { - result += ", "; - } - - if (group) - { - ensure(ch > 0); - group = false; - result += outputs[ch]; - - if (outputs[ch] == hreg) - { - result += std::string(".") + swz_mask[ch - 1] + swz_mask[ch]; - } - continue; - } - - const int subch = outputs[ch] == hreg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles - result += outputs[ch] + "." + swz_mask[subch]; - } - - return "(" + result + ")"; - } -} diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h deleted file mode 100644 index 6cfc8e76c3..0000000000 --- a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h +++ /dev/null @@ -1,111 +0,0 @@ -#pragma once - -#include - -namespace rsx -{ - class MixedPrecisionRegister - { - enum data_type_bits - { - undefined = 0, - f16 = 1, - f32 = 2 - }; - - std::array content_mask; // Content details for each half-word - u32 file_index = umax; - - void tag_h0(bool x, bool y, bool z, bool w); - - void tag_h1(bool x, bool y, bool z, bool w); - - void tag_r(bool x, bool y, bool z, bool w); - - std::string fetch_halfreg(u32 word_index) const; - - public: - MixedPrecisionRegister(); - - void tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w); - - std::string gather_r() const; - - std::string split_h0() const - { - return fetch_halfreg(0); - } - - std::string split_h1() const - { - return fetch_halfreg(1); - } - - // Getters - - // Return true if all values are unwritten to (undefined) - bool floating() const - { - return file_index == umax; - } - - // Return true if the first half register is all undefined - bool floating_h0() const - { - return content_mask[0] == content_mask[1] && - content_mask[1] == content_mask[2] && - content_mask[2] == content_mask[3] && - content_mask[3] == data_type_bits::undefined; - } - - // Return true if the second half register is all undefined - bool floating_h1() const - { - return content_mask[4] == content_mask[5] && - content_mask[5] == content_mask[6] && - content_mask[6] == content_mask[7] && - content_mask[7] == data_type_bits::undefined; - } - - // Return true if any of the half-words are 16-bit - bool requires_gather(u8 channel) const - { - // Data fetched from the single precision register requires merging of the two half registers - const auto channel_offset = channel * 2; - ensure(channel_offset <= 6); - - return (content_mask[channel_offset] == data_type_bits::f16 || content_mask[channel_offset + 1] == data_type_bits::f16); - } - - // Return true if the entire 128-bit register is filled with 2xfp16x4 data words - bool requires_gather128() const - { - // Full 128-bit check - for (const auto& ch : content_mask) - { - if (ch == data_type_bits::f16) - { - return true; - } - } - - return false; - } - - // Return true if the half-register is polluted with fp32 data - bool requires_split(u32 word_index) const - { - const u32 content_offset = word_index * 4; - for (u32 i = 0; i < 4; ++i) - { - if (content_mask[content_offset + i] == data_type_bits::f32) - { - return true; - } - } - - return false; - } - }; -} - diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 761f42dd07..77deb6088e 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -161,7 +161,6 @@ - @@ -710,7 +709,6 @@ - diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index 937b22f5cc..6b15f662e5 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -1360,9 +1360,6 @@ Emu\Cell - - Emu\GPU\RSX\Program - Utilities @@ -2764,9 +2761,6 @@ Emu\Audio - - Emu\GPU\RSX\Program - Utilities