rsx/fp: Re-implement ROP output resolve

2025-12-16 04:09:07 +00:00 · 2025-12-07 19:16:24 +03:00 · 2025-12-07 19:16:24 +03:00 · 316e01995b
commit 316e01995b
parent f2913e4692
7 changed files with 69 additions and 405 deletions
--- a/rpcs3/Emu/CMakeLists.txt
+++ b/rpcs3/Emu/CMakeLists.txt
@ -525,7 +525,6 @@ target_sources(rpcs3_emu PRIVATE
    RSX/Program/CgBinaryFragmentProgram.cpp
    RSX/Program/CgBinaryVertexProgram.cpp
    RSX/Program/FragmentProgramDecompiler.cpp
    RSX/Program/FragmentProgramRegister.cpp
    RSX/Program/GLSLCommon.cpp
    RSX/Program/ProgramStateCache.cpp
    RSX/Program/program_util.cpp
--- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp
+++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp
@ -14,6 +14,8 @@ namespace rsx
 {
 	namespace fragment_program
 	{
 		using namespace rsx::assembler;
 		static const std::string reg_table[] =
 		{
 			"wpos",
@ -22,6 +24,28 @@ namespace rsx
 			"tc0", "tc1", "tc2", "tc3", "tc4", "tc5", "tc6", "tc7", "tc8", "tc9",
 			"ssa"
 		};
 		// Colour output registers used when the program exports 32-bit values
 		// (selected by CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS in get_fragment_program_output_set).
 		// Entries are full-precision registers 0, 2, 3 and 4 with all four mask bits set.
 		// Order matters: the consumer keeps only the leading mrt_count entries.
 		static const std::vector<RegisterRef> s_fp32_output_set =
 		{
 			{.reg {.id = 0, .f16 = false }, .mask = 0xf },
 			{.reg {.id = 2, .f16 = false }, .mask = 0xf },
 			{.reg {.id = 3, .f16 = false }, .mask = 0xf },
 			{.reg {.id = 4, .f16 = false }, .mask = 0xf },
 		};
 		// Colour output registers used when the program exports 16-bit values.
 		// Half-register ids 0, 4, 6 and 8 map (id >> 1, the same mapping SetDst applies to
 		// fp16 destinations) onto register-file slots 0, 2, 3 and 4, i.e. the same slots as
 		// the fp32 set above.
 		static const std::vector<RegisterRef> s_fp16_output_set =
 		{
 			{.reg {.id = 0, .f16 = true }, .mask = 0xf },
 			{.reg {.id = 4, .f16 = true }, .mask = 0xf },
 			{.reg {.id = 6, .f16 = true }, .mask = 0xf },
 			{.reg {.id = 8, .f16 = true }, .mask = 0xf },
 		};
 		// Depth export register: full-precision register 1 with only mask bit 2 set.
 		// Appended to the output set when CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT is set.
 		// NOTE(review): bit 2 presumably selects the Z lane (BuildCode talks about r1.z for
 		// depth) - confirm against the RegisterRef mask convention.
 		static const RegisterRef s_z_export_reg =
 		{
 			.reg {.id = 1, .f16 = false },
 			.mask = (1u << 2)
 		};
 	}
 }
@ -37,6 +61,26 @@ enum VectorLane : u8
 	W = 3,
 };
 // Builds the list of registers that the ROP stage reads from a fragment program.
 //   ctrl      - shader control word. CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS selects the
 //               fp32 colour set over the fp16 one; CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT
 //               appends the depth export register.
 //   mrt_count - number of colour targets. Only the leading mrt_count colour registers are
 //               kept; 0 means no colour registers at all.
 // Returns the colour registers (if any) followed by the depth register (if exported).
 std::vector<RegisterRef> get_fragment_program_output_set(u32 ctrl, u32 mrt_count)
 {
 	std::vector<RegisterRef> result;
 	if (mrt_count > 0)
 	{
 		result = (ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS)
 			? s_fp32_output_set
 			: s_fp16_output_set;
 		// Only ever shrink. An unconditional resize() would *grow* the list with
 		// value-initialized RegisterRef entries if mrt_count exceeded the table size,
 		// feeding bogus registers to the dependency passes.
 		if (mrt_count < result.size())
 		{
 			result.resize(mrt_count);
 		}
 	}
 	if (ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
 	{
 		result.push_back(s_z_export_reg);
 	}
 	return result;
 }
 FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size)
 	: m_size(size)
 	, m_prog(prog)
@ -157,8 +201,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
 	}
 	const u32 reg_index = dst.fp16 ? (dst.dest_reg >> 1) : dst.dest_reg;
 	ensure(reg_index < temp_registers.size());
 	if (dst.opcode == RSX_FP_OPCODE_MOV &&
 		src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP &&
 		src0.tmp_reg_index == reg_index)
@ -171,8 +213,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
 			return;
 		}
 	}
 	temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w);
 }
 void FragmentProgramDecompiler::AddFlowOp(const std::string& code)
@ -528,26 +568,7 @@ template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
 	switch (src.reg_type)
 	{
 	case RSX_FP_REGISTER_TYPE_TEMP:
-
+		if (src.fp16 && precision_modifier == RSX_FP_PRECISION_HALF)
 		if (!src.fp16)
 		{
 			if (dst.opcode == RSX_FP_OPCODE_UP16 ||
 				dst.opcode == RSX_FP_OPCODE_UP2 ||
 				dst.opcode == RSX_FP_OPCODE_UP4 ||
 				dst.opcode == RSX_FP_OPCODE_UPB ||
 				dst.opcode == RSX_FP_OPCODE_UPG)
 			{
 				auto &reg = temp_registers[src.tmp_reg_index];
 				if (reg.requires_gather(src.swizzle_x))
 				{
 					properties.has_gather_op = true;
 					AddReg(src.tmp_reg_index, src.fp16);
 					ret = getFloatTypeName(4) + reg.gather_r();
 					break;
 				}
 			}
 		}
 		else if (precision_modifier == RSX_FP_PRECISION_HALF)
 		{
 			// clamp16() is not a cheap operation when emulated; avoid at all costs
 			precision_modifier = RSX_FP_PRECISION_REAL;
@ -778,17 +799,6 @@ std::string FragmentProgramDecompiler::BuildCode()
 	{
 		// Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!!
 		m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value);
 		auto& r1 = temp_registers[1];
 		if (r1.requires_gather(VectorLane::Z))
 		{
 			// r1.zw was not written to
 			properties.has_gather_op = true;
 			main_epilogue << "	r1.z = " << float4_type << r1.gather_r() << ".z;\n";
 			// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
 			rsx_log.warning("ROP reads from shader depth without writing to it. Final value will be gathered.");
 		}
 	}
 	// Add the color output registers. They are statically written to and have guaranteed initialization (r1.z is the exception: per the hardware-test note above it defaults to 0, not wpos.z)
@ -816,33 +826,6 @@ std::string FragmentProgramDecompiler::BuildCode()
 			continue;
 		}
 		const auto block_index = ouput_register_indices[n];
 		auto& r = temp_registers[block_index];
 		if (fp16_out)
 		{
 			// Check if we need a split/extract op
 			if (r.requires_split(0))
 			{
 				main_epilogue << "	" << reg_name << " = " << float4_type << r.split_h0() << ";\n";
 				// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
 				rsx_log.warning("ROP reads from %s without writing to it. Final value will be extracted from the 32-bit register.", reg_name);
 			}
 			continue;
 		}
 		if (!r.requires_gather128())
 		{
 			// Nothing to do
 			continue;
 		}
 		// We need to gather the data from existing registers
 		main_epilogue << "	" << reg_name << " = " << float4_type << r.gather_r() << ";\n";
 		properties.has_gather_op = true;
 		// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
 		rsx_log.warning("ROP reads from %s without writing to it. Final value will be gathered.", reg_name);
 	}
@ -1030,28 +1013,6 @@ std::string FragmentProgramDecompiler::BuildCode()
 		OS << Format(divsq_func);
 	}
 	// Declare register gather/merge if needed
 	if (properties.has_gather_op)
 	{
 		std::string float2 = getFloatTypeName(2);
 		OS << float4 << " gather(" << float4 << " _h0, " << float4 << " _h1)\n";
 		OS << "{\n";
 		OS << "	float x = uintBitsToFloat(packHalf2x16(_h0.xy));\n";
 		OS << "	float y = uintBitsToFloat(packHalf2x16(_h0.zw));\n";
 		OS << "	float z = uintBitsToFloat(packHalf2x16(_h1.xy));\n";
 		OS << "	float w = uintBitsToFloat(packHalf2x16(_h1.zw));\n";
 		OS << "	return " << float4 << "(x, y, z, w);\n";
 		OS << "}\n\n";
 		OS << float2 << " gather(" << float4 << " _h)\n";
 		OS << "{\n";
 		OS << "	float x = uintBitsToFloat(packHalf2x16(_h.xy));\n";
 		OS << "	float y = uintBitsToFloat(packHalf2x16(_h.zw));\n";
 		OS << "	return " << float2 << "(x, y);\n";
 		OS << "}\n\n";
 	}
 	if (properties.has_dynamic_register_load)
 	{
 		OS <<
@ -1303,8 +1264,28 @@ std::string FragmentProgramDecompiler::Decompile()
 {
 	auto graph = deconstruct_fragment_program(m_prog);
-	if (g_cfg.video.shader_precision != gpu_preset_level::low)
+	if (!graph.blocks.empty())
 	{
 		// The RSX CFG is missing the output block. We inject a fake tail block that ingests the ROP outputs.
 		BasicBlock* rop_block = nullptr;
 		BasicBlock* tail_block = &graph.blocks.back();
 		if (tail_block->instructions.size() == 0)
 		{
 			// Merge block. Use this directly
 			rop_block = tail_block;
 		}
 		else
 		{
 			graph.blocks.push_back({});
 			rop_block = &graph.blocks.back();
 			tail_block->insert_succ(rop_block);
 			rop_block->insert_pred(tail_block);
 		}
 		const auto rop_inputs = get_fragment_program_output_set(m_prog.ctrl, m_prog.mrt_buffers_count);
 		rop_block->input_list.insert(rop_block->input_list.end(), rop_inputs.begin(), rop_inputs.end());
 		FP::RegisterAnnotationPass annotation_pass{ m_prog };
 		FP::RegisterDependencyPass dependency_pass{};
@ -1376,6 +1357,9 @@ std::string FragmentProgramDecompiler::Decompile()
 			case EdgeType::ENDLOOP:
 				// Pure merge block?
 				break;
 			case EdgeType::NONE:
 				ensure(block.instructions.empty());
 				break;
 			default:
 				fmt::throw_exception("Unhandled edge type %d", static_cast<int>(pred.type));
 				break;
--- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h
+++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h
@ -1,6 +1,5 @@
 #pragma once
 #include "ShaderParam.h"
 #include "FragmentProgramRegister.h"
 #include "RSXFragmentProgram.h"
 #include "Assembler/CFG.h"
@ -53,8 +52,6 @@ class FragmentProgramDecompiler
 	int m_code_level;
 	std::unordered_map<u32, u32> m_constant_offsets;
 	std::array<rsx::MixedPrecisionRegister, 64> temp_registers;
 	std::string GetMask() const;
 	void SetDst(std::string code, u32 flags = 0);
@ -175,7 +172,6 @@ public:
 		// Decoded properties (out)
 		bool has_lit_op = false;
 		bool has_gather_op = false;
 		bool has_no_output = false;
 		bool has_discard_op = false;
 		bool has_tex_op = false;
--- a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp
+++ b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.cpp
@ -1,196 +0,0 @@
 #include "stdafx.h"
 #include "FragmentProgramRegister.h"
 namespace rsx
 {
 	// A fresh register has no known contents: every half-word starts as 'undefined'.
 	MixedPrecisionRegister::MixedPrecisionRegister()
 	{
 		content_mask.fill(data_type_bits::undefined);
 	}
 	void MixedPrecisionRegister::tag_h0(bool x, bool y, bool z, bool w)
 	{
 		if (x) content_mask[0] = data_type_bits::f16;
 		if (y) content_mask[1] = data_type_bits::f16;
 		if (z) content_mask[2] = data_type_bits::f16;
 		if (w) content_mask[3] = data_type_bits::f16;
 	}
 	void MixedPrecisionRegister::tag_h1(bool x, bool y, bool z, bool w)
 	{
 		if (x) content_mask[4] = data_type_bits::f16;
 		if (y) content_mask[5] = data_type_bits::f16;
 		if (z) content_mask[6] = data_type_bits::f16;
 		if (w) content_mask[7] = data_type_bits::f16;
 	}
 	void MixedPrecisionRegister::tag_r(bool x, bool y, bool z, bool w)
 	{
 		if (x) content_mask[0] = content_mask[1] = data_type_bits::f32;
 		if (y) content_mask[2] = content_mask[3] = data_type_bits::f32;
 		if (z) content_mask[4] = content_mask[5] = data_type_bits::f32;
 		if (w) content_mask[6] = content_mask[7] = data_type_bits::f32;
 	}
 	// Records a write to this register.
 	//   index   - destination register index as encoded in the instruction; for fp16
 	//             writes this is the half-register index (two per register-file slot).
 	//   is_fp16 - true for a half-precision destination.
 	//   x..w    - which lanes the instruction writes.
 	// The first call latches the register-file slot; later fp16 writes are checked against it.
 	void MixedPrecisionRegister::tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w)
 	{
 		if (file_index == umax)
 		{
 			// First-time use: remember which register-file slot this object describes
 			file_index = is_fp16 ? (index >> 1) : index;
 		}
 		if (!is_fp16)
 		{
 			tag_r(x, y, z, w);
 			return;
 		}
 		ensure((index / 2) == file_index);
 		// Odd half-register indices address the upper half of the slot
 		const bool upper_half = (index & 1) != 0;
 		if (upper_half)
 		{
 			tag_h1(x, y, z, w);
 		}
 		else
 		{
 			tag_h0(x, y, z, w);
 		}
 	}
 	// Builds the (parenthesised) shader expression list that reconstructs the four
 	// 32-bit channels of this register. A channel whose two half-words are both tagged
 	// f16 is sourced from a gather() call on the matching half register; every other
 	// channel (f32, undefined or mixed) is read straight from the real register.
 	// Examples: with file_index 2 and no fp16 content the result is "(r2.xy, r2.zw)";
 	// with every half-word tagged f16 it is "(gather(h4, h5))".
 	std::string MixedPrecisionRegister::gather_r() const
 	{
 		// Two half registers per real register: h(2n) and h(2n + 1)
 		const auto half_index = file_index << 1;
 		const std::string reg = "r" + std::to_string(file_index);
 		const std::string gather_half_regs[] = {
 			"gather(h" + std::to_string(half_index) + ")",
 			"gather(h" + std::to_string(half_index + 1) + ")"
 		};
 		// Pick a source expression per output channel
 		std::string outputs[4];
 		for (int ch = 0; ch < 4; ++ch)
 		{
 			// FIXME: This approach ignores mixed register bits. Not ideal!!!!
 			const auto channel0 = content_mask[ch * 2];
 			const auto is_fp16_ch = channel0 == content_mask[ch * 2 + 1] && channel0 == data_type_bits::f16;
 			// Channels 0,1 live in the first half register, channels 2,3 in the second
 			outputs[ch] = is_fp16_ch ? gather_half_regs[ch / 2] : reg;
 		}
 		// Grouping. Only replace relevant bits...
 		// When both channels of a pair share a source, blank the first one so the pair
 		// is emitted as a single two-component term below.
 		if (outputs[0] == outputs[1]) outputs[0] = "";
 		if (outputs[2] == outputs[3]) outputs[2] = "";
 		// Assemble
 		bool group = false;
 		std::string result = "";
 		constexpr std::string_view swz_mask = "xyzw";
 		for (int ch = 0; ch < 4; ++ch)
 		{
 			if (outputs[ch].empty())
 			{
 				// First half of a grouped pair; the next channel emits both components
 				group = true;
 				continue;
 			}
 			if (!result.empty())
 			{
 				result += ", ";
 			}
 			if (group)
 			{
 				ensure(ch > 0);
 				group = false;
 				if (outputs[ch] == reg)
 				{
 					// Pair read directly from the real register: rN.xy or rN.zw
 					result += reg + "." + swz_mask[ch - 1] + swz_mask[ch];
 					continue;
 				}
 				// Pair read through gather(hN); emitted without a swizzle
 				result += outputs[ch];
 				continue;
 			}
 			const int subch = outputs[ch] == reg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles
 			result += outputs[ch] + "." + swz_mask[subch];
 		}
 		// Optimize dual-gather (128-bit gather) to use special function
 		const std::string double_gather = gather_half_regs[0] + ", " + gather_half_regs[1];
 		if (result == double_gather)
 		{
 			result = "gather(h" + std::to_string(half_index) + ", h" + std::to_string(half_index + 1) + ")";
 		}
 		return "(" + result + ")";
 	}
 	// Builds the (parenthesised) shader expression list that reconstructs one half
 	// register (four fp16 lanes) of this register-file slot.
 	//   word_index - 0 for the lower half register h(2n), 1 for the upper one h(2n + 1).
 	//                Any other value trips ensure().
 	// Lanes whose half-word is tagged f32 are extracted from the real register with
 	// unpackHalf2x16(floatBitsToUint(...)); all other lanes are read from the half
 	// register itself. Example: file_index 1, word_index 0, no f32 content ->
 	// "(h2.xy, h2.zw)".
 	std::string MixedPrecisionRegister::fetch_halfreg(u32 word_index) const
 	{
 		// Validate before word_index is used to index the swizzle table below
 		ensure(word_index <= 1);
 		constexpr std::string_view swz_mask = "xyzw";
 		const std::string reg = "r" + std::to_string(file_index);
 		const std::string hreg = "h" + std::to_string(file_index * 2 + word_index);
 		// Each half register overlaps two 32-bit channels of the real register
 		const std::string word0_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2] + ")";
 		const std::string word1_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2 + 1] + ")";
 		const std::string words[] = {
 			"unpackHalf2x16(" + word0_bits + ")",
 			"unpackHalf2x16(" + word1_bits + ")"
 		};
 		// Pick a source expression per output lane
 		std::string outputs[4];
 		const int word_offset = word_index * 4;
 		for (int ch = 0; ch < 4; ++ch)
 		{
 			outputs[ch] = content_mask[ch + word_offset] == data_type_bits::f32
 				? words[ch / 2]
 				: hreg;
 		}
 		// Grouping. Only replace relevant bits...
 		// When both lanes of a pair share a source, blank the first one so the pair is
 		// emitted as a single two-component term below.
 		if (outputs[0] == outputs[1]) outputs[0] = "";
 		if (outputs[2] == outputs[3]) outputs[2] = "";
 		// Assemble
 		bool group = false;
 		std::string result = "";
 		for (int ch = 0; ch < 4; ++ch)
 		{
 			if (outputs[ch].empty())
 			{
 				// First half of a grouped pair; the next lane emits both components
 				group = true;
 				continue;
 			}
 			if (!result.empty())
 			{
 				result += ", ";
 			}
 			if (group)
 			{
 				ensure(ch > 0);
 				group = false;
 				result += outputs[ch];
 				if (outputs[ch] == hreg)
 				{
 					// Pair read from the half register: hN.xy or hN.zw
 					result += std::string(".") + swz_mask[ch - 1] + swz_mask[ch];
 				}
 				continue;
 			}
 			const int subch = outputs[ch] == hreg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles
 			result += outputs[ch] + "." + swz_mask[subch];
 		}
 		return "(" + result + ")";
 	}
 }
--- a/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h
+++ b/rpcs3/Emu/RSX/Program/FragmentProgramRegister.h
@ -1,111 +0,0 @@
 #pragma once
 #include <util/types.hpp>
 namespace rsx
 {
 	class MixedPrecisionRegister
 	{
 		enum data_type_bits
 		{
 			undefined = 0,
 			f16 = 1,
 			f32 = 2
 		};
 		std::array<data_type_bits, 8> content_mask; // Content details for each half-word
 		u32 file_index = umax;
 		void tag_h0(bool x, bool y, bool z, bool w);
 		void tag_h1(bool x, bool y, bool z, bool w);
 		void tag_r(bool x, bool y, bool z, bool w);
 		std::string fetch_halfreg(u32 word_index) const;
 	public:
 		MixedPrecisionRegister();
 		void tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w);
 		std::string gather_r() const;
 		std::string split_h0() const
 		{
 			return fetch_halfreg(0);
 		}
 		std::string split_h1() const
 		{
 			return fetch_halfreg(1);
 		}
 		// Getters
 		// Return true if all values are unwritten to (undefined)
 		bool floating() const
 		{
 			return file_index == umax;
 		}
 		// Return true if the first half register is all undefined
 		bool floating_h0() const
 		{
 			return content_mask[0] == content_mask[1] &&
 				content_mask[1] == content_mask[2] &&
 				content_mask[2] == content_mask[3] &&
 				content_mask[3] == data_type_bits::undefined;
 		}
 		// Return true if the second half register is all undefined
 		bool floating_h1() const
 		{
 			return content_mask[4] == content_mask[5] &&
 				content_mask[5] == content_mask[6] &&
 				content_mask[6] == content_mask[7] &&
 				content_mask[7] == data_type_bits::undefined;
 		}
 		// Return true if any of the half-words are 16-bit
 		bool requires_gather(u8 channel) const
 		{
 			// Data fetched from the single precision register requires merging of the two half registers
 			const auto channel_offset = channel * 2;
 			ensure(channel_offset <= 6);
 			return (content_mask[channel_offset] == data_type_bits::f16 || content_mask[channel_offset + 1] == data_type_bits::f16);
 		}
 		// Return true if the entire 128-bit register is filled with 2xfp16x4 data words
 		bool requires_gather128() const
 		{
 			// Full 128-bit check
 			for (const auto& ch : content_mask)
 			{
 				if (ch == data_type_bits::f16)
 				{
 					return true;
 				}
 			}
 			return false;
 		}
 		// Return true if the half-register is polluted with fp32 data
 		bool requires_split(u32 word_index) const
 		{
 			const u32 content_offset = word_index * 4;
 			for (u32 i = 0; i < 4; ++i)
 			{
 				if (content_mask[content_offset + i] == data_type_bits::f32)
 				{
 					return true;
 				}
 			}
 			return false;
 		}
 	};
 }
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@ -161,7 +161,6 @@
    <ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp" />
    <ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.cpp" />
    <ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.cpp" />
    <ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp" />
    <ClCompile Include="Emu\RSX\Program\ProgramStateCache.cpp" />
    <ClCompile Include="Emu\RSX\Program\program_util.cpp" />
    <ClCompile Include="Emu\RSX\Program\SPIRVCommon.cpp" />
@ -710,7 +709,6 @@
    <ClInclude Include="Emu\RSX\Program\Assembler\IR.h" />
    <ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.h" />
    <ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.h" />
    <ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h" />
    <ClInclude Include="Emu\RSX\Program\GLSLTypes.h" />
    <ClInclude Include="Emu\RSX\Program\ProgramStateCache.h" />
    <ClInclude Include="Emu\RSX\Program\program_util.h" />
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@ -1360,9 +1360,6 @@
    <ClCompile Include="Emu\Cell\ErrorCodes.cpp">
      <Filter>Emu\Cell</Filter>
    </ClCompile>
    <ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp">
      <Filter>Emu\GPU\RSX\Program</Filter>
    </ClCompile>
    <ClCompile Include="util\emu_utils.cpp">
      <Filter>Utilities</Filter>
    </ClCompile>
@ -2764,9 +2761,6 @@
    <ClInclude Include="Emu\Audio\audio_utils.h">
      <Filter>Emu\Audio</Filter>
    </ClInclude>
    <ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h">
      <Filter>Emu\GPU\RSX\Program</Filter>
    </ClInclude>
    <ClInclude Include="util\video_source.h">
      <Filter>Utilities</Filter>
    </ClInclude>