rsx/cfg: Implement partial barriers for 32-bit register channels

This commit is contained in:
kd-11 2025-12-07 22:59:12 +03:00 committed by kd-11
parent 81d657a960
commit 8d2f7ae85f
7 changed files with 217 additions and 31 deletions

View File

@ -263,13 +263,26 @@ namespace rsx::assembler
{
ensure(reg.length() > 1, "Invalid register specifier");
const auto index = std::stoi(reg.substr(1));
const auto parts = fmt::split(reg, { "." });
ensure(parts.size() > 0 && parts.size() <= 2);
const auto index = std::stoi(parts[0].substr(1));
RegisterRef ref
{
.reg { .id = index, .f16 = false },
.mask = 0x0F
};
if (parts.size() > 1 && parts[1].length() > 0)
{
// FIXME: No swizzles for now, just lane masking
ref.mask = 0;
if (parts[1].find("x") != std::string::npos) ref.mask |= (1u << 0);
if (parts[1].find("y") != std::string::npos) ref.mask |= (1u << 1);
if (parts[1].find("z") != std::string::npos) ref.mask |= (1u << 2);
if (parts[1].find("w") != std::string::npos) ref.mask |= (1u << 3);
}
if (reg[0] == 'H' || reg[0] == 'h')
{
ref.reg.f16 = true;
@ -325,7 +338,7 @@ namespace rsx::assembler
do { \
inst->opcode = encoding.op; \
d0.opcode = encoding.op & 0x3F; \
s1.opcode_is_branch = (encoding.op > 0x3F)? 1 : 0; \
s1.opcode_hi = (encoding.op > 0x3F)? 1 : 0; \
s0.exec_if_eq = encoding.exec_if_eq ? 1 : 0; \
s0.exec_if_gr = encoding.exec_if_gt ? 1 : 0; \
s0.exec_if_lt = encoding.exec_if_lt ? 1 : 0; \

View File

@ -75,7 +75,12 @@ namespace rsx::assembler
RSX_FP_OPCODE_IFE = 0x42, // If
RSX_FP_OPCODE_LOOP = 0x43, // Loop
RSX_FP_OPCODE_REP = 0x44, // Repeat
RSX_FP_OPCODE_RET = 0x45 // Return
RSX_FP_OPCODE_RET = 0x45, // Return
// Custom opcodes for dependency injection
RSX_FP_OPCODE_OR16_LO = 0x46, // Performs a 16-bit OR, taking one register channel as input and overwriting low 16 bits of the output
RSX_FP_OPCODE_OR16_HI = 0x47, // Same as the lo variant but now overwrites the high 16-bit block
};
namespace FP

View File

@ -159,7 +159,7 @@ namespace rsx::assembler
src2.HEX = decoded._u32[3];
end = !!dst.end;
const u32 opcode = dst.opcode | (src1.opcode_is_branch << 6);
const u32 opcode = dst.opcode | (src1.opcode_hi << 6);
if (opcode == RSX_FP_OPCODE_NOP)
{

View File

@ -22,6 +22,20 @@ namespace rsx::assembler::FP
std::unordered_map<BasicBlock*, register_file_t> sync_register_map;
};
// Per-channel bit flags describing which 16-bit half (word) of a 32-bit
// register channel needs to be re-synchronised by a barrier.
enum Register32BarrierFlags
{
    NONE     = 0,                     // No half selected; build_barrier32 treats this as an error
    OR_WORD0 = (1 << 0),              // Only the first 16-bit word of the channel (selects OR16_LO)
    OR_WORD1 = (1 << 1),              // Only the second 16-bit word of the channel (selects OR16_HI)
    DEFAULT  = (OR_WORD0 | OR_WORD1)  // Both words; the whole channel is repacked (selects PK2)
};
// Barrier request for one 32-bit register.
// Produced by decode_lanes32 and consumed by build_barrier32.
struct RegisterBarrier32
{
// Target register; ref.mask has one bit per channel that needs a barrier.
RegisterRef ref;
// Register32BarrierFlags bits for each channel, indexed by channel number.
// Only entries whose channel bit is set in ref.mask are meaningful.
u32 flags[4];
};
std::vector<RegisterRef> decode_lanes16(const std::unordered_set<u32>& lanes)
{
std::vector<RegisterRef> result;
@ -47,34 +61,45 @@ namespace rsx::assembler::FP
return result;
}
std::vector<RegisterRef> decode_lanes32(const std::unordered_set<u32>& lanes)
std::vector<RegisterBarrier32> decode_lanes32(const std::unordered_set<u32>& lanes)
{
std::vector<RegisterRef> result;
std::vector<RegisterBarrier32> result;
for (u32 index = 0, file_offset = 0; index < 48; ++index, file_offset += 16)
{
// Each register has 8 16-bit lanes
RegisterBarrier32 barrier{};
auto& ref = barrier.ref;
u32 mask = 0;
if (lanes.contains(file_offset + 0) || lanes.contains(file_offset + 2)) mask |= (1u << 0);
if (lanes.contains(file_offset + 4) || lanes.contains(file_offset + 6)) mask |= (1u << 1);
if (lanes.contains(file_offset + 8) || lanes.contains(file_offset + 10)) mask |= (1u << 2);
if (lanes.contains(file_offset + 12) || lanes.contains(file_offset + 14)) mask |= (1u << 3);
for (u32 lane = 0; lane < 16; lane += 2)
{
if (!lanes.contains(file_offset + lane))
{
continue;
}
if (mask == 0)
const u32 ch = (lane / 4);
const u32 flags = (lane & 3)
? Register32BarrierFlags::OR_WORD1
: Register32BarrierFlags::OR_WORD0;
ref.mask |= (1u << ch);
barrier.flags[ch] |= flags;
}
if (ref.mask == 0)
{
continue;
}
RegisterRef ref{ .reg{.id = static_cast<int>(index), .f16 = false } };
ref.mask = mask;
result.push_back(ref);
ref.reg = {.id = static_cast<int>(index), .f16 = false };
result.push_back(barrier);
}
return result;
}
std::vector<Instruction> build_barrier32(const RegisterRef& reg)
std::vector<Instruction> build_barrier32(const RegisterBarrier32& barrier)
{
// Up to 4 instructions are needed per 32-bit register
// R0.x = packHalf2x16(H0.xy)
@ -84,28 +109,27 @@ namespace rsx::assembler::FP
std::vector<Instruction> result;
for (u32 mask = reg.mask, ch = 0; mask > 0; mask >>= 1, ++ch)
for (u32 mask = barrier.ref.mask, ch = 0; mask > 0; mask >>= 1, ++ch)
{
if (!(mask & 1))
{
continue;
}
const auto& reg = barrier.ref.reg;
const auto reg_id = reg.id;
Instruction instruction{};
OPDEST dst{};
dst.opcode = RSX_FP_OPCODE_PK2;
dst.prec = RSX_FP_PRECISION_REAL;
dst.fp16 = 0;
dst.dest_reg = reg.reg.id;
dst.dest_reg = reg_id;
dst.write_mask = (1u << ch);
const u32 src_reg_id = (ch / 2) + (reg.reg.id * 2);
const u32 src_reg_id = (ch / 2) + (reg_id * 2);
const bool is_word0 = !(ch & 1); // Only even
SRC0 src0{};
src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1;
src0.fp16 = 1;
if (is_word0)
{
src0.swizzle_x = 0;
@ -121,14 +145,50 @@ namespace rsx::assembler::FP
src0.swizzle_w = 3;
src0.reg_type = RSX_FP_REGISTER_TYPE_TEMP;
src0.tmp_reg_index = src_reg_id;
src0.fp16 = 1;
instruction.opcode = dst.opcode;
// Prepare source 1 to match the output in case we need to encode an OR
SRC1 src1{};
src1.reg_type = RSX_FP_REGISTER_TYPE_TEMP;
src1.tmp_reg_index = reg_id;
src1.swizzle_x = ch;
src1.swizzle_y = ch;
src1.swizzle_z = ch;
src1.swizzle_w = ch;
u32 opcode = 0;
switch (barrier.flags[ch])
{
case Register32BarrierFlags::DEFAULT:
opcode = RSX_FP_OPCODE_PK2;
break;
case Register32BarrierFlags::OR_WORD0:
opcode = RSX_FP_OPCODE_OR16_LO;
// Swap inputs
std::swap(src0.HEX, src1.HEX);
break;
case Register32BarrierFlags::OR_WORD1:
opcode = RSX_FP_OPCODE_OR16_HI;
src0.swizzle_x = src0.swizzle_y;
std::swap(src0.HEX, src1.HEX);
break;
case Register32BarrierFlags::NONE:
default:
fmt::throw_exception("Unexpected lane barrier with no mask.");
}
dst.opcode = opcode & 0x3F;
src1.opcode_hi = (opcode > 0x3F) ? 1 : 0;
src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1;
instruction.opcode = opcode;
instruction.bytecode[0] = dst.HEX;
instruction.bytecode[1] = src0.HEX;
instruction.bytecode[2] = src1.HEX;
Register src_reg{ .id = static_cast<int>(src_reg_id), .f16 = true };
instruction.srcs.push_back({ .reg=src_reg, .mask=0xF });
instruction.dsts.push_back({ .reg{ .id = reg.reg.id, .f16 = false }, .mask = (1u << ch) });
instruction.srcs.push_back({ .reg = src_reg, .mask = 0xF });
instruction.dsts.push_back({ .reg{ .id = reg_id, .f16 = false }, .mask = (1u << ch) });
result.push_back(instruction);
}
@ -207,10 +267,22 @@ namespace rsx::assembler::FP
{
std::vector<Instruction> result;
const auto regs = (f16 ? decode_lanes16 : decode_lanes32)(lanes);
for (const auto& ref : regs)
if (f16)
{
auto instructions = (f16 ? build_barrier16 : build_barrier32)(ref);
const auto regs = decode_lanes16(lanes);
for (const auto& ref : regs)
{
auto instructions = build_barrier16(ref);
result.insert(result.end(), instructions.begin(), instructions.end());
}
return result;
}
const auto barriers = decode_lanes32(lanes);
for (const auto& barrier : barriers)
{
auto instructions = build_barrier32(barrier);
result.insert(result.end(), instructions.begin(), instructions.end());
}

View File

@ -273,7 +273,7 @@ void CgBinaryDisasm::TaskFP()
src2.HEX = GetData(data[3]);
m_step = 4 * sizeof(u32);
m_opcode = dst.opcode | (src1.opcode_is_branch << 6);
m_opcode = dst.opcode | (src1.opcode_hi << 6);
auto SCT = [&]()
{

View File

@ -102,7 +102,7 @@ union SRC1
u32 src1_prec_mod : 3; // Precision modifier for src1 (CoD:MW series)
u32 src2_prec_mod : 3; // Precision modifier for src2 (unproven, should affect MAD instruction)
u32 scale : 3;
u32 opcode_is_branch : 1;
u32 opcode_hi : 1; // Opcode high bit
};
struct

View File

@ -472,4 +472,100 @@ namespace rsx::assembler
EXPECT_EQ(bb4->epilogue.size(), 0);
EXPECT_EQ(bb5->epilogue.size(), 0);
}
// Verifies that a write to only H1.x produces a partial (OR16_LO) barrier on R0.z
// instead of a full repack of the channel.
TEST(TestFPIR, RegisterDependencyPass_Partial32_0)
{
// Instruction 2 partially clobbers H1 which in turn clobbers R0.
// Instruction 3 reads from R0 so a partial barrier32 is needed between them.
auto graph = CFG_from_source(R"(
ADD R1, R0, R1;
MOV H1.x, R1.x;
MOV R2, R0;
)");
ASSERT_EQ(graph.blocks.size(), 1);
ASSERT_EQ(graph.blocks.front().instructions.size(), 3);
auto& block = graph.blocks.front();
RSXFragmentProgram prog{};
FP::RegisterAnnotationPass annotation_pass{ prog };
FP::RegisterDependencyPass deps_pass{};
// Annotation must run before the dependency pass.
annotation_pass.run(graph);
deps_pass.run(graph);
// Exactly one barrier instruction is expected to be injected, at index 2.
ASSERT_EQ(block.instructions.size(), 4);
// Decode the three bytecode words of the injected instruction.
OPDEST dst{ .HEX = block.instructions[2].bytecode[0] };
SRC0 src0{ .HEX = block.instructions[2].bytecode[1] };
SRC1 src1{ .HEX = block.instructions[2].bytecode[2] };
// The opcode is split: low bits live in OPDEST, bit 6 lives in SRC1.opcode_hi.
const u32 opcode = dst.opcode | (src1.opcode_hi << 6);
// Expected barrier: OR16_LO writing R0.z, with src0 = R0.z and src1 = H1.x.
EXPECT_EQ(opcode, RSX_FP_OPCODE_OR16_LO);
EXPECT_EQ(dst.fp16, 0);
EXPECT_EQ(dst.dest_reg, 0);
EXPECT_EQ(dst.mask_x, false);
EXPECT_EQ(dst.mask_y, false);
EXPECT_EQ(dst.mask_z, true);
EXPECT_EQ(dst.mask_w, false);
// src0 is the 32-bit destination register itself (R0), reading channel z.
EXPECT_EQ(src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
EXPECT_EQ(src0.tmp_reg_index, 0);
EXPECT_EQ(src0.fp16, 0);
EXPECT_EQ(src0.swizzle_x, 2);
// src1 is the 16-bit register that was written (H1), reading channel x.
EXPECT_EQ(src1.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
EXPECT_EQ(src1.tmp_reg_index, 1);
EXPECT_EQ(src1.fp16, 1);
EXPECT_EQ(src1.swizzle_x, 0);
}
// Verifies that a write to only H1.y produces a partial (OR16_HI) barrier on R0.z
// instead of a full repack of the channel.
TEST(TestFPIR, RegisterDependencyPass_Partial32_1)
{
// Instruction 2 partially clobbers H1 which in turn clobbers R0.
// Instruction 3 reads from R0 so a partial barrier32 is needed between them.
auto graph = CFG_from_source(R"(
ADD R1, R0, R1;
MOV H1.y, R1.y;
MOV R2, R0;
)");
ASSERT_EQ(graph.blocks.size(), 1);
ASSERT_EQ(graph.blocks.front().instructions.size(), 3);
auto& block = graph.blocks.front();
RSXFragmentProgram prog{};
FP::RegisterAnnotationPass annotation_pass{ prog };
FP::RegisterDependencyPass deps_pass{};
// Annotation must run before the dependency pass.
annotation_pass.run(graph);
deps_pass.run(graph);
// Exactly one barrier instruction is expected to be injected, at index 2.
ASSERT_EQ(block.instructions.size(), 4);
// Decode the three bytecode words of the injected instruction.
OPDEST dst{ .HEX = block.instructions[2].bytecode[0] };
SRC0 src0{ .HEX = block.instructions[2].bytecode[1] };
SRC1 src1{ .HEX = block.instructions[2].bytecode[2] };
// The opcode is split: low bits live in OPDEST, bit 6 lives in SRC1.opcode_hi.
const u32 opcode = dst.opcode | (src1.opcode_hi << 6);
// Expected barrier: OR16_HI writing R0.z, with src0 = R0.z and src1 = H1.y.
EXPECT_EQ(opcode, RSX_FP_OPCODE_OR16_HI);
EXPECT_EQ(dst.fp16, 0);
EXPECT_EQ(dst.dest_reg, 0);
EXPECT_EQ(dst.mask_x, false);
EXPECT_EQ(dst.mask_y, false);
EXPECT_EQ(dst.mask_z, true);
EXPECT_EQ(dst.mask_w, false);
// src0 is the 32-bit destination register itself (R0), reading channel z.
EXPECT_EQ(src0.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
EXPECT_EQ(src0.tmp_reg_index, 0);
EXPECT_EQ(src0.fp16, 0);
EXPECT_EQ(src0.swizzle_x, 2);
// src1 is the 16-bit register that was written (H1), reading channel y.
EXPECT_EQ(src1.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
EXPECT_EQ(src1.tmp_reg_index, 1);
EXPECT_EQ(src1.fp16, 1);
EXPECT_EQ(src1.swizzle_x, 1);
}
}