rsx/fp: Re-implement ROP output resolve

This commit is contained in:
kd-11 2025-12-07 19:16:24 +03:00 committed by kd-11
parent f2913e4692
commit 316e01995b
7 changed files with 69 additions and 405 deletions

View File

@ -525,7 +525,6 @@ target_sources(rpcs3_emu PRIVATE
RSX/Program/CgBinaryFragmentProgram.cpp RSX/Program/CgBinaryFragmentProgram.cpp
RSX/Program/CgBinaryVertexProgram.cpp RSX/Program/CgBinaryVertexProgram.cpp
RSX/Program/FragmentProgramDecompiler.cpp RSX/Program/FragmentProgramDecompiler.cpp
RSX/Program/FragmentProgramRegister.cpp
RSX/Program/GLSLCommon.cpp RSX/Program/GLSLCommon.cpp
RSX/Program/ProgramStateCache.cpp RSX/Program/ProgramStateCache.cpp
RSX/Program/program_util.cpp RSX/Program/program_util.cpp

View File

@ -14,6 +14,8 @@ namespace rsx
{ {
namespace fragment_program namespace fragment_program
{ {
using namespace rsx::assembler;
static const std::string reg_table[] = static const std::string reg_table[] =
{ {
"wpos", "wpos",
@ -22,6 +24,28 @@ namespace rsx
"tc0", "tc1", "tc2", "tc3", "tc4", "tc5", "tc6", "tc7", "tc8", "tc9", "tc0", "tc1", "tc2", "tc3", "tc4", "tc5", "tc6", "tc7", "tc8", "tc9",
"ssa" "ssa"
}; };
// Color output registers used when the program exports 32-bit values
// (selected by get_fragment_program_output_set when
// CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS is set in the control word).
// Entries are ordered by output slot; callers take the first mrt_count of them.
// Register id 1 is intentionally absent here: it is the depth export register
// (see s_z_export_reg).
// NOTE(review): mask 0xf presumably selects all four lanes (xyzw) - confirm
// against the RegisterRef definition.
static const std::vector<RegisterRef> s_fp32_output_set =
{
{.reg {.id = 0, .f16 = false }, .mask = 0xf },
{.reg {.id = 2, .f16 = false }, .mask = 0xf },
{.reg {.id = 3, .f16 = false }, .mask = 0xf },
{.reg {.id = 4, .f16 = false }, .mask = 0xf },
};
// Color output registers used when the program exports 16-bit values
// (selected by get_fragment_program_output_set when
// CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS is NOT set in the control word).
// Entries are ordered by output slot; callers take the first mrt_count of them.
// NOTE(review): the ids are presumably half-register indices, where h(2n) is
// the lower half of r(n); under that reading 0, 4, 6, 8 alias r0, r2, r3, r4,
// i.e. the same slots as the 32-bit table - confirm against RegisterRef.
// NOTE(review): mask 0xf presumably selects all four lanes (xyzw) - confirm.
static const std::vector<RegisterRef> s_fp16_output_set =
{
{.reg {.id = 0, .f16 = true }, .mask = 0xf },
{.reg {.id = 4, .f16 = true }, .mask = 0xf },
{.reg {.id = 6, .f16 = true }, .mask = 0xf },
{.reg {.id = 8, .f16 = true }, .mask = 0xf },
};
// Depth export register reference: 32-bit register id 1 with only mask bit 2 set.
// It is appended to the output set when CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT
// is set (see get_fragment_program_output_set).
// NOTE(review): bit 2 presumably corresponds to the Z lane (r1.z) - confirm
// against the RegisterRef mask convention.
static const RegisterRef s_z_export_reg =
{
.reg {.id = 1, .f16 = false },
.mask = (1u << 2)
};
} }
} }
@ -37,6 +61,26 @@ enum VectorLane : u8
W = 3, W = 3,
}; };
// Builds the list of registers that the ROP stage consumes for a fragment program.
//
// ctrl       Shader control word. CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS selects
//            the 32-bit color table over the 16-bit one, and
//            CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT appends the depth export register.
// mrt_count  Number of color outputs in use. Values larger than the color table
//            are clamped to the table size.
//
// Returns the first mrt_count color registers (in slot order), followed by the
// depth export register when depth export is enabled. The result may be empty.
std::vector<RegisterRef> get_fragment_program_output_set(u32 ctrl, u32 mrt_count)
{
	std::vector<RegisterRef> result;

	if (mrt_count > 0)
	{
		result = (ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS)
			? s_fp32_output_set
			: s_fp16_output_set;

		// Only ever truncate. Growing the vector here would pad it with
		// value-initialized RegisterRef entries that do not describe any real output.
		if (mrt_count < result.size())
		{
			result.resize(mrt_count);
		}
	}

	if (ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
	{
		result.push_back(s_z_export_reg);
	}

	return result;
}
FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size) FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size)
: m_size(size) : m_size(size)
, m_prog(prog) , m_prog(prog)
@ -157,8 +201,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
} }
const u32 reg_index = dst.fp16 ? (dst.dest_reg >> 1) : dst.dest_reg; const u32 reg_index = dst.fp16 ? (dst.dest_reg >> 1) : dst.dest_reg;
ensure(reg_index < temp_registers.size());
if (dst.opcode == RSX_FP_OPCODE_MOV && if (dst.opcode == RSX_FP_OPCODE_MOV &&
src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP && src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP &&
src0.tmp_reg_index == reg_index) src0.tmp_reg_index == reg_index)
@ -171,8 +213,6 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
return; return;
} }
} }
temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w);
} }
void FragmentProgramDecompiler::AddFlowOp(const std::string& code) void FragmentProgramDecompiler::AddFlowOp(const std::string& code)
@ -528,26 +568,7 @@ template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
switch (src.reg_type) switch (src.reg_type)
{ {
case RSX_FP_REGISTER_TYPE_TEMP: case RSX_FP_REGISTER_TYPE_TEMP:
if (src.fp16 && precision_modifier == RSX_FP_PRECISION_HALF)
if (!src.fp16)
{
if (dst.opcode == RSX_FP_OPCODE_UP16 ||
dst.opcode == RSX_FP_OPCODE_UP2 ||
dst.opcode == RSX_FP_OPCODE_UP4 ||
dst.opcode == RSX_FP_OPCODE_UPB ||
dst.opcode == RSX_FP_OPCODE_UPG)
{
auto &reg = temp_registers[src.tmp_reg_index];
if (reg.requires_gather(src.swizzle_x))
{
properties.has_gather_op = true;
AddReg(src.tmp_reg_index, src.fp16);
ret = getFloatTypeName(4) + reg.gather_r();
break;
}
}
}
else if (precision_modifier == RSX_FP_PRECISION_HALF)
{ {
// clamp16() is not a cheap operation when emulated; avoid at all costs // clamp16() is not a cheap operation when emulated; avoid at all costs
precision_modifier = RSX_FP_PRECISION_REAL; precision_modifier = RSX_FP_PRECISION_REAL;
@ -778,17 +799,6 @@ std::string FragmentProgramDecompiler::BuildCode()
{ {
// Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!! // Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!!
m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value); m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value);
auto& r1 = temp_registers[1];
if (r1.requires_gather(VectorLane::Z))
{
// r1.zw was not written to
properties.has_gather_op = true;
main_epilogue << " r1.z = " << float4_type << r1.gather_r() << ".z;\n";
// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
rsx_log.warning("ROP reads from shader depth without writing to it. Final value will be gathered.");
}
} }
// Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z) // Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z)
@ -816,33 +826,6 @@ std::string FragmentProgramDecompiler::BuildCode()
continue; continue;
} }
const auto block_index = ouput_register_indices[n];
auto& r = temp_registers[block_index];
if (fp16_out)
{
// Check if we need a split/extract op
if (r.requires_split(0))
{
main_epilogue << " " << reg_name << " = " << float4_type << r.split_h0() << ";\n";
// Emit debug warning. Useful to diagnose regressions, but should be removed in future.
rsx_log.warning("ROP reads from %s without writing to it. Final value will be extracted from the 32-bit register.", reg_name);
}
continue;
}
if (!r.requires_gather128())
{
// Nothing to do
continue;
}
// We need to gather the data from existing registers
main_epilogue << " " << reg_name << " = " << float4_type << r.gather_r() << ";\n";
properties.has_gather_op = true;
// Emit debug warning. Useful to diagnose regressions, but should be removed in future. // Emit debug warning. Useful to diagnose regressions, but should be removed in future.
rsx_log.warning("ROP reads from %s without writing to it. Final value will be gathered.", reg_name); rsx_log.warning("ROP reads from %s without writing to it. Final value will be gathered.", reg_name);
} }
@ -1030,28 +1013,6 @@ std::string FragmentProgramDecompiler::BuildCode()
OS << Format(divsq_func); OS << Format(divsq_func);
} }
// Declare register gather/merge if needed
if (properties.has_gather_op)
{
std::string float2 = getFloatTypeName(2);
OS << float4 << " gather(" << float4 << " _h0, " << float4 << " _h1)\n";
OS << "{\n";
OS << " float x = uintBitsToFloat(packHalf2x16(_h0.xy));\n";
OS << " float y = uintBitsToFloat(packHalf2x16(_h0.zw));\n";
OS << " float z = uintBitsToFloat(packHalf2x16(_h1.xy));\n";
OS << " float w = uintBitsToFloat(packHalf2x16(_h1.zw));\n";
OS << " return " << float4 << "(x, y, z, w);\n";
OS << "}\n\n";
OS << float2 << " gather(" << float4 << " _h)\n";
OS << "{\n";
OS << " float x = uintBitsToFloat(packHalf2x16(_h.xy));\n";
OS << " float y = uintBitsToFloat(packHalf2x16(_h.zw));\n";
OS << " return " << float2 << "(x, y);\n";
OS << "}\n\n";
}
if (properties.has_dynamic_register_load) if (properties.has_dynamic_register_load)
{ {
OS << OS <<
@ -1303,8 +1264,28 @@ std::string FragmentProgramDecompiler::Decompile()
{ {
auto graph = deconstruct_fragment_program(m_prog); auto graph = deconstruct_fragment_program(m_prog);
if (g_cfg.video.shader_precision != gpu_preset_level::low) if (!graph.blocks.empty())
{ {
// The RSX CFG is missing the output block. We inject a fake tail block that ingests the ROP outputs.
BasicBlock* rop_block = nullptr;
BasicBlock* tail_block = &graph.blocks.back();
if (tail_block->instructions.size() == 0)
{
// Merge block. Use this directly
rop_block = tail_block;
}
else
{
graph.blocks.push_back({});
rop_block = &graph.blocks.back();
tail_block->insert_succ(rop_block);
rop_block->insert_pred(tail_block);
}
const auto rop_inputs = get_fragment_program_output_set(m_prog.ctrl, m_prog.mrt_buffers_count);
rop_block->input_list.insert(rop_block->input_list.end(), rop_inputs.begin(), rop_inputs.end());
FP::RegisterAnnotationPass annotation_pass{ m_prog }; FP::RegisterAnnotationPass annotation_pass{ m_prog };
FP::RegisterDependencyPass dependency_pass{}; FP::RegisterDependencyPass dependency_pass{};
@ -1376,6 +1357,9 @@ std::string FragmentProgramDecompiler::Decompile()
case EdgeType::ENDLOOP: case EdgeType::ENDLOOP:
// Pure merge block? // Pure merge block?
break; break;
case EdgeType::NONE:
ensure(block.instructions.empty());
break;
default: default:
fmt::throw_exception("Unhandled edge type %d", static_cast<int>(pred.type)); fmt::throw_exception("Unhandled edge type %d", static_cast<int>(pred.type));
break; break;

View File

@ -1,6 +1,5 @@
#pragma once #pragma once
#include "ShaderParam.h" #include "ShaderParam.h"
#include "FragmentProgramRegister.h"
#include "RSXFragmentProgram.h" #include "RSXFragmentProgram.h"
#include "Assembler/CFG.h" #include "Assembler/CFG.h"
@ -53,8 +52,6 @@ class FragmentProgramDecompiler
int m_code_level; int m_code_level;
std::unordered_map<u32, u32> m_constant_offsets; std::unordered_map<u32, u32> m_constant_offsets;
std::array<rsx::MixedPrecisionRegister, 64> temp_registers;
std::string GetMask() const; std::string GetMask() const;
void SetDst(std::string code, u32 flags = 0); void SetDst(std::string code, u32 flags = 0);
@ -175,7 +172,6 @@ public:
// Decoded properties (out) // Decoded properties (out)
bool has_lit_op = false; bool has_lit_op = false;
bool has_gather_op = false;
bool has_no_output = false; bool has_no_output = false;
bool has_discard_op = false; bool has_discard_op = false;
bool has_tex_op = false; bool has_tex_op = false;

View File

@ -1,196 +0,0 @@
#include "stdafx.h"
#include "FragmentProgramRegister.h"
namespace rsx
{
// A fresh register has no written half-words: every slot starts out undefined.
MixedPrecisionRegister::MixedPrecisionRegister()
{
	content_mask.fill(data_type_bits::undefined);
}
// Marks the selected lanes of the first half register (slots 0..3) as holding f16 data.
// Lanes whose flag is false keep their previous tag.
void MixedPrecisionRegister::tag_h0(bool x, bool y, bool z, bool w)
{
	const bool written[4] = { x, y, z, w };

	for (int lane = 0; lane < 4; ++lane)
	{
		if (written[lane])
		{
			content_mask[lane] = data_type_bits::f16;
		}
	}
}
// Marks the selected lanes of the second half register (slots 4..7) as holding f16 data.
// Lanes whose flag is false keep their previous tag.
void MixedPrecisionRegister::tag_h1(bool x, bool y, bool z, bool w)
{
	constexpr int first_slot = 4;
	const bool written[4] = { x, y, z, w };

	for (int lane = 0; lane < 4; ++lane)
	{
		if (written[lane])
		{
			content_mask[first_slot + lane] = data_type_bits::f16;
		}
	}
}
// Marks the selected 32-bit lanes as holding f32 data.
// Each 32-bit lane covers two consecutive half-word slots (x -> 0,1; y -> 2,3; ...).
void MixedPrecisionRegister::tag_r(bool x, bool y, bool z, bool w)
{
	const bool written[4] = { x, y, z, w };

	for (int lane = 0; lane < 4; ++lane)
	{
		if (!written[lane])
		{
			continue;
		}

		const int first_slot = lane * 2;
		content_mask[first_slot] = data_type_bits::f32;
		content_mask[first_slot + 1] = data_type_bits::f32;
	}
}
// Records a write to this register.
//
// index    Register index as encoded by the writer: a half-register index when
//          is_fp16 is true, a full-register index otherwise.
// is_fp16  Whether the write targets a 16-bit half register.
// x..w     Which lanes of the destination were written.
//
// The first call binds this object to its full-register file index. Later f16
// writes must target one of the two halves of that same register.
void MixedPrecisionRegister::tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w)
{
	if (file_index == umax)
	{
		// First-time use: two half registers share one full register slot
		file_index = is_fp16 ? (index >> 1) : index;
	}

	if (!is_fp16)
	{
		tag_r(x, y, z, w);
		return;
	}

	ensure((index / 2) == file_index);

	// Odd half-register indices address the upper half, even ones the lower half
	if (index & 1)
	{
		tag_h1(x, y, z, w);
	}
	else
	{
		tag_h0(x, y, z, w);
	}
}
// Builds a parenthesized, comma-separated shader expression list that reconstructs
// the contents of the full register r<file_index>.
// Lanes whose two half-words are both tagged f16 are sourced from the emitted shader
// helper gather(h<n>) applied to the matching half register (x,y from h<2*file_index>,
// z,w from h<2*file_index + 1>); all other lanes are read directly from the full register.
// The caller is expected to wrap the result in a vector constructor.
std::string MixedPrecisionRegister::gather_r() const
{
// Index of the lower half register that aliases this full register
const auto half_index = file_index << 1;
const std::string reg = "r" + std::to_string(file_index);
// Source expressions for the lower (x,y) and upper (z,w) halves
const std::string gather_half_regs[] = {
"gather(h" + std::to_string(half_index) + ")",
"gather(h" + std::to_string(half_index + 1) + ")"
};
// Per-lane source expression (x, y, z, w)
std::string outputs[4];
for (int ch = 0; ch < 4; ++ch)
{
// FIXME: This approach ignores mixed register bits. Not ideal!!!!
// A lane only counts as f16 when BOTH of its half-word slots are tagged f16
const auto channel0 = content_mask[ch * 2];
const auto is_fp16_ch = channel0 == content_mask[ch * 2 + 1] && channel0 == data_type_bits::f16;
outputs[ch] = is_fp16_ch ? gather_half_regs[ch / 2] : reg;
}
// Grouping. Only replace relevant bits...
// When both lanes of a pair (xy or zw) share a source, the first lane is blanked
// so that the second lane emits one two-component expression for the pair.
if (outputs[0] == outputs[1]) outputs[0] = "";
if (outputs[2] == outputs[3]) outputs[2] = "";
// Assemble
bool group = false;
std::string result = "";
constexpr std::string_view swz_mask = "xyzw";
for (int ch = 0; ch < 4; ++ch)
{
if (outputs[ch].empty())
{
// Blanked first lane of a pair; the next lane emits the whole pair
group = true;
continue;
}
if (!result.empty())
{
result += ", ";
}
if (group)
{
ensure(ch > 0);
group = false;
if (outputs[ch] == reg)
{
// Pair read from the full register: two-component swizzle (.xy or .zw)
result += reg + "." + swz_mask[ch - 1] + swz_mask[ch];
continue;
}
// Pair read from a half register: the gather(...) call is emitted unswizzled
result += outputs[ch];
continue;
}
// Single lane: full register lanes keep their own swizzle, gathered lanes
// select .x or .y of the gather(...) result
const int subch = outputs[ch] == reg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles
result += outputs[ch] + "." + swz_mask[subch];
}
// Optimize dual-gather (128-bit gather) to use special function
// i.e. "gather(hA), gather(hB)" collapses into the two-argument "gather(hA, hB)"
const std::string double_gather = gather_half_regs[0] + ", " + gather_half_regs[1];
if (result == double_gather)
{
result = "gather(h" + std::to_string(half_index) + ", h" + std::to_string(half_index + 1) + ")";
}
return "(" + result + ")";
}
// Builds a parenthesized, comma-separated shader expression list that reconstructs
// one half register (H16x4) of this register.
//
// word_index  0 selects the lower half (h<2*file_index>), 1 selects the upper half
//             (h<2*file_index + 1>). Any other value is rejected.
//
// Lanes whose half-word slot is tagged f32 are extracted from the full register
// (R32x4) with unpackHalf2x16(floatBitsToUint(...)); all other lanes are read
// directly from the half register. The caller is expected to wrap the result in a
// vector constructor.
std::string MixedPrecisionRegister::fetch_halfreg(u32 word_index) const
{
	// Validate up front: word_index is used to index swz_mask and content_mask below
	ensure(word_index <= 1);

	constexpr std::string_view swz_mask = "xyzw";
	const std::string reg = "r" + std::to_string(file_index);
	const std::string hreg = "h" + std::to_string(file_index * 2 + word_index);

	// Each half register overlaps two 32-bit lanes of the full register
	const std::string word0_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2] + ")";
	const std::string word1_bits = "floatBitsToUint(" + reg + "." + swz_mask[word_index * 2 + 1] + ")";
	const std::string words[] = {
		"unpackHalf2x16(" + word0_bits + ")",
		"unpackHalf2x16(" + word1_bits + ")"
	};

	// Pick the source expression for each of the four half-word lanes
	std::string outputs[4];
	const int word_offset = word_index * 4;
	for (int ch = 0; ch < 4; ++ch)
	{
		outputs[ch] = content_mask[ch + word_offset] == data_type_bits::f32
			? words[ch / 2]
			: hreg;
	}

	// Grouping. When both lanes of a pair share a source, blank the first lane so the
	// second lane emits a single two-component expression for the pair.
	if (outputs[0] == outputs[1]) outputs[0] = "";
	if (outputs[2] == outputs[3]) outputs[2] = "";

	// Assemble
	bool group = false;
	std::string result = "";
	for (int ch = 0; ch < 4; ++ch)
	{
		if (outputs[ch].empty())
		{
			// Blanked first lane of a pair; the next lane emits the whole pair
			group = true;
			continue;
		}

		if (!result.empty())
		{
			result += ", ";
		}

		if (group)
		{
			ensure(ch > 0);
			group = false;
			result += outputs[ch];

			if (outputs[ch] == hreg)
			{
				// Pair read from the half register: two-component swizzle (.xy or .zw)
				result += std::string(".") + swz_mask[ch - 1] + swz_mask[ch];
			}
			continue;
		}

		// Single lane: half register lanes keep their own swizzle, unpacked lanes
		// select .x or .y of the unpackHalf2x16(...) result
		const int subch = outputs[ch] == hreg ? ch : (ch % 2); // Avoid .xyxy.z and other such ugly swizzles
		result += outputs[ch] + "." + swz_mask[subch];
	}

	return "(" + result + ")";
}
}

View File

@ -1,111 +0,0 @@
#pragma once
#include <util/types.hpp>
namespace rsx
{
class MixedPrecisionRegister
{
enum data_type_bits
{
undefined = 0,
f16 = 1,
f32 = 2
};
std::array<data_type_bits, 8> content_mask; // Content details for each half-word
u32 file_index = umax;
void tag_h0(bool x, bool y, bool z, bool w);
void tag_h1(bool x, bool y, bool z, bool w);
void tag_r(bool x, bool y, bool z, bool w);
std::string fetch_halfreg(u32 word_index) const;
public:
MixedPrecisionRegister();
void tag(u32 index, bool is_fp16, bool x, bool y, bool z, bool w);
std::string gather_r() const;
std::string split_h0() const
{
return fetch_halfreg(0);
}
std::string split_h1() const
{
return fetch_halfreg(1);
}
// Getters
// Return true if all values are unwritten to (undefined)
bool floating() const
{
return file_index == umax;
}
// Return true if the first half register is all undefined
bool floating_h0() const
{
return content_mask[0] == content_mask[1] &&
content_mask[1] == content_mask[2] &&
content_mask[2] == content_mask[3] &&
content_mask[3] == data_type_bits::undefined;
}
// Return true if the second half register is all undefined
bool floating_h1() const
{
return content_mask[4] == content_mask[5] &&
content_mask[5] == content_mask[6] &&
content_mask[6] == content_mask[7] &&
content_mask[7] == data_type_bits::undefined;
}
// Return true if any of the half-words are 16-bit
bool requires_gather(u8 channel) const
{
// Data fetched from the single precision register requires merging of the two half registers
const auto channel_offset = channel * 2;
ensure(channel_offset <= 6);
return (content_mask[channel_offset] == data_type_bits::f16 || content_mask[channel_offset + 1] == data_type_bits::f16);
}
// Return true if the entire 128-bit register is filled with 2xfp16x4 data words
bool requires_gather128() const
{
// Full 128-bit check
for (const auto& ch : content_mask)
{
if (ch == data_type_bits::f16)
{
return true;
}
}
return false;
}
// Return true if the half-register is polluted with fp32 data
bool requires_split(u32 word_index) const
{
const u32 content_offset = word_index * 4;
for (u32 i = 0; i < 4; ++i)
{
if (content_mask[content_offset + i] == data_type_bits::f32)
{
return true;
}
}
return false;
}
};
}

View File

@ -161,7 +161,6 @@
<ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp" /> <ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp" />
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.cpp" /> <ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.cpp" />
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.cpp" /> <ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.cpp" />
<ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp" />
<ClCompile Include="Emu\RSX\Program\ProgramStateCache.cpp" /> <ClCompile Include="Emu\RSX\Program\ProgramStateCache.cpp" />
<ClCompile Include="Emu\RSX\Program\program_util.cpp" /> <ClCompile Include="Emu\RSX\Program\program_util.cpp" />
<ClCompile Include="Emu\RSX\Program\SPIRVCommon.cpp" /> <ClCompile Include="Emu\RSX\Program\SPIRVCommon.cpp" />
@ -710,7 +709,6 @@
<ClInclude Include="Emu\RSX\Program\Assembler\IR.h" /> <ClInclude Include="Emu\RSX\Program\Assembler\IR.h" />
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.h" /> <ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.h" />
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.h" /> <ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.h" />
<ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h" />
<ClInclude Include="Emu\RSX\Program\GLSLTypes.h" /> <ClInclude Include="Emu\RSX\Program\GLSLTypes.h" />
<ClInclude Include="Emu\RSX\Program\ProgramStateCache.h" /> <ClInclude Include="Emu\RSX\Program\ProgramStateCache.h" />
<ClInclude Include="Emu\RSX\Program\program_util.h" /> <ClInclude Include="Emu\RSX\Program\program_util.h" />

View File

@ -1360,9 +1360,6 @@
<ClCompile Include="Emu\Cell\ErrorCodes.cpp"> <ClCompile Include="Emu\Cell\ErrorCodes.cpp">
<Filter>Emu\Cell</Filter> <Filter>Emu\Cell</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp">
<Filter>Emu\GPU\RSX\Program</Filter>
</ClCompile>
<ClCompile Include="util\emu_utils.cpp"> <ClCompile Include="util\emu_utils.cpp">
<Filter>Utilities</Filter> <Filter>Utilities</Filter>
</ClCompile> </ClCompile>
@ -2764,9 +2761,6 @@
<ClInclude Include="Emu\Audio\audio_utils.h"> <ClInclude Include="Emu\Audio\audio_utils.h">
<Filter>Emu\Audio</Filter> <Filter>Emu\Audio</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h">
<Filter>Emu\GPU\RSX\Program</Filter>
</ClInclude>
<ClInclude Include="util\video_source.h"> <ClInclude Include="util\video_source.h">
<Filter>Utilities</Filter> <Filter>Utilities</Filter>
</ClInclude> </ClInclude>