diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp
index 33bad149ea..0f528f66f3 100644
--- a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp
+++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp
@@ -3,6 +3,7 @@
 #include "Emu/RSX/Program/Assembler/FPOpcodes.h"
 #include "Emu/RSX/Program/RSXFragmentProgram.h"
 
+#include <unordered_map>
 #include <unordered_set>
 
 namespace rsx::assembler::FP
@@ -13,6 +14,18 @@ namespace rsx::assembler::FP
 	static constexpr char content_float16 = 'H';
 	static constexpr char content_dual = 'D';
 
+	// Snapshot of the register file: one content tag (content_* above) per lane.
+	using register_file_t = std::array<char, register_file_length>;
+
+	// State shared by all stages of the dependency pass.
+	struct DependencyPassContext
+	{
+		// Register file of each block as left by insert_dependency_barriers.
+		std::unordered_map<BasicBlock*, register_file_t> exec_register_map;
+		// Per block, the lanes for which a barrier was already appended to the block's epilogue.
+		std::unordered_map<BasicBlock*, register_file_t> sync_register_map;
+	};
+
 	std::vector<RegisterRef> decode_lanes16(const std::unordered_set<u32>& lanes)
 	{
 		std::vector<RegisterRef> result;
@@ -193,9 +206,26 @@ namespace rsx::assembler::FP
 		return result;
 	}
 
-	void insert_dependency_barriers(BasicBlock* block)
+	// Builds the barrier instructions for `lanes`, read as f16 (f16 == true) or f32 (f16 == false).
+	// The lanes are decoded into register references and one barrier sequence is built per reference.
+	std::vector<Instruction> resolve_dependencies(const std::unordered_set<u32>& lanes, bool f16)
 	{
-		std::array<char, register_file_length> register_file;
+		std::vector<Instruction> result;
+
+		const auto regs = (f16 ? decode_lanes16 : decode_lanes32)(lanes);
+		for (const auto& ref : regs)
+		{
+			auto instructions = (f16 ? build_barrier16 : build_barrier32)(ref);
+			result.insert(result.end(), instructions.begin(), instructions.end());
+		}
+
+		return result;
+	}
+
+	// The register file is stored in ctx.exec_register_map (keyed by block) so later stages can read it.
+	void insert_dependency_barriers(DependencyPassContext& ctx, BasicBlock* block)
+	{
+		register_file_t& register_file = ctx.exec_register_map[block];
 		std::memset(register_file.data(), content_unknown, register_file_length);
 
 		std::unordered_set<u32> barrier16;
@@ -275,14 +305,121 @@ namespace rsx::assembler::FP
 			}
 		}
 	}
 
+	// Emits cross-block barriers for `lanes`, which are read as f16 (f16 == true) or f32 (f16 == false).
+	// Every predecessor of `block` is inspected using the register state stored in ctx.exec_register_map:
+	// - lanes the predecessor left in the other precision get barriers appended to that predecessor's epilogue
+	// - lanes the predecessor left as content_unknown are searched for further up the graph
+	// ctx.sync_register_map marks lanes that were already handled so a barrier is not emitted twice.
+	void insert_block_register_dependency(DependencyPassContext& ctx, BasicBlock* block, const std::unordered_set<u32>& lanes, bool f16)
+	{
+		if (block->pred.empty())
+		{
+			return;
+		}
+
+		std::unordered_set<u32> clobbered_lanes;
+		std::unordered_set<u32> lanes_to_search;
+
+		for (auto& back_edge : block->pred)
+		{
+			auto target = back_edge.from;
+
+			// Did this target even clobber our register?
+			ensure(ctx.exec_register_map.find(target) != ctx.exec_register_map.end(), "Block has not been pre-processed");
+
+			// Create the sync state for this predecessor on first use
+			if (ctx.sync_register_map.find(target) == ctx.sync_register_map.end())
+			{
+				auto& blob = ctx.sync_register_map[target];
+				std::memset(blob.data(), content_unknown, register_file_length);
+			}
+
+			auto& sync_register_file = ctx.sync_register_map[target];
+			const auto& exec_register_file = ctx.exec_register_map[target];
+			// A f16 read is clobbered by a lane left as f32 and a f32 read by a lane left as f16
+			const auto clobber_type = f16 ? content_float32 : content_float16;
+
+			lanes_to_search.clear();
+			clobbered_lanes.clear();
+
+			for (auto& lane : lanes)
+			{
+				if (exec_register_file[lane] == clobber_type &&
+					sync_register_file[lane] == content_unknown)
+				{
+					clobbered_lanes.insert(lane);
+					sync_register_file[lane] = content_dual;
+					continue;
+				}
+
+				if (exec_register_file[lane] == content_unknown)
+				{
+					lanes_to_search.insert(lane);
+				}
+			}
+
+			if (!clobbered_lanes.empty())
+			{
+				const auto instructions = resolve_dependencies(clobbered_lanes, f16);
+				target->epilogue.insert(target->epilogue.end(), instructions.begin(), instructions.end());
+			}
+
+			if (lanes_to_search.empty())
+			{
+				// Nothing left to look up through this predecessor. Do not leave the loop here:
+				// the remaining predecessors are separate paths into this block and must be checked too.
+				continue;
+			}
+
+			// We have some missing lanes. Search upwards
+			if (!target->pred.empty())
+			{
+				// We only need to search the last predecessor which is the true "root" of the branch
+				// NOTE(review): the recursive call inspects the predecessors of 'parent'; 'parent' itself is not checked here - confirm this is intended.
+				auto parent = target->pred.back().from;
+				insert_block_register_dependency(ctx, parent, lanes_to_search, f16);
+			}
+		}
+	}
+
+	// Runs the cross-block dependency search for every register reference in block->input_list.
+	void insert_block_dependencies(DependencyPassContext& ctx, BasicBlock* block)
+	{
+		// Copies the register file range of a reference into a set of lanes
+		auto range_from_ref = [](const RegisterRef& ref)
+		{
+			const auto range = get_register_file_range(ref);
+
+			std::unordered_set<u32> result;
+			for (const auto& value : range)
+			{
+				result.insert(value);
+			}
+			return result;
+		};
+
+		for (auto& ref : block->input_list)
+		{
+			const auto range = range_from_ref(ref);
+			insert_block_register_dependency(ctx, block, range, ref.reg.f16);
+		}
+	}
+
 	void RegisterDependencyPass::run(FlowGraph& graph)
 	{
+		DependencyPassContext ctx{};
+
 		// First, run intra-block dependency
 		for (auto& block : graph.blocks)
 		{
-			insert_dependency_barriers(&block);
+			insert_dependency_barriers(ctx, &block);
 		}
 
-		// TODO: Create prologue/epilogue instructions
+		// Then, create prologue/epilogue instructions
+		// Traverse the list in reverse order to bubble up dependencies correctly.
+		for (auto it = graph.blocks.rbegin(); it != graph.blocks.rend(); ++it)
+		{
+			insert_block_dependencies(ctx, &(*it));
+		}
 	}
 }
diff --git a/rpcs3/tests/test_rsx_fp_asm.cpp b/rpcs3/tests/test_rsx_fp_asm.cpp
index 61202aa3e1..fc28525f94 100644
--- a/rpcs3/tests/test_rsx_fp_asm.cpp
+++ b/rpcs3/tests/test_rsx_fp_asm.cpp
@@ -48,6 +48,18 @@ namespace rsx::assembler
 		bb.instructions = ir.build();
 		return graph;
 	}
+
+	// Appends a new block to `graph`, fills it with the instructions built from `asm_`
+	// and returns a pointer to the new block.
+	// NOTE(review): callers hold the returned pointer across later push_back calls; that is only safe if FlowGraph::blocks never relocates its elements - confirm.
+	static BasicBlock* BB_from_source(FlowGraph* graph, const std::string& asm_)
+	{
+		auto ir = FPIR::from_source(asm_);
+		graph->blocks.push_back({});
+		BasicBlock& bb = graph->blocks.back();
+		bb.instructions = ir.build();
+		return &bb;
+	}
 	TEST(TestFPIR, FromSource)
 	{
 		auto ir = FPIR::from_source(R"(
@@ -232,8 +244,83 @@ namespace rsx::assembler
 		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.swizzle_y, 3);
 	}
 
-	TEST(TestFPIR, RegisterDependencyPass_Complex)
+	TEST(TestFPIR, RegisterDependencyPass_Complex_IF_BothPredecessorsClobber)
 	{
-		// TODO: Multi-level block structure with nested IFs/LOOPs
+		// Multi-level but only single IF
+		// Mockup of a simple lighting function, R0 = Light vector, R1 = Decompressed normal. DP4 used for simplicity.
+		// Data hazards sprinkled in for testing. R3 is clobbered in the ancestor and the IF branch.
+		// Barrier should go in the IF branch here.
+		FlowGraph graph;
+		BasicBlock* bb0 = BB_from_source(&graph, R"(
+			DP4 R2, R0, R1
+			SFL R3
+			SGT R3, R2, R0
+			IF.GE
+		)");
+
+		BasicBlock* bb1 = BB_from_source(&graph, R"(
+			ADD R0, R0, R2
+			MOV H6, #{ 0.25 }
+		)");
+
+		BasicBlock* bb2 = BB_from_source(&graph, R"(
+			ADD R0, R0, R3
+			MOV R1, R0
+		)");
+
+		// Front edges
+		bb0->insert_succ(bb1, EdgeType::IF);
+		bb0->insert_succ(bb2, EdgeType::ENDIF);
+		bb1->insert_succ(bb2, EdgeType::ENDIF);
+
+		// Back edges
+		bb2->insert_pred(bb1, EdgeType::ENDIF);
+		bb2->insert_pred(bb0, EdgeType::ENDIF);
+		bb1->insert_pred(bb0, EdgeType::IF);
+
+		RSXFragmentProgram prog{};
+
+		FP::RegisterAnnotationPass annotation_pass{ prog };
+		FP::RegisterDependencyPass deps_pass{};
+
+		annotation_pass.run(graph);
+		deps_pass.run(graph);
+
+		ASSERT_EQ(bb0->instructions.size(), 4);
+		ASSERT_EQ(bb1->instructions.size(), 2);
+		ASSERT_EQ(bb2->instructions.size(), 2);
+
+		// bb1 has an epilogue
+		ASSERT_EQ(bb1->epilogue.size(), 2);
+
+		// bb1 epilogue updates R3.xy
+
+		// R3.x = packHalf2(H6.xy)
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.fp16, 0);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.dest_reg, 3);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_x, true);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_y, false);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_z, false);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[0].bytecode[0] }.mask_w, false);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.tmp_reg_index, 6);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.fp16, 1);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_x, 0);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[0].bytecode[1] }.swizzle_y, 1);
+
+		// R3.y = packHalf2(H6.zw)
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.fp16, 0);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.dest_reg, 3);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_x, false);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_y, true);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_z, false);
+		EXPECT_EQ(OPDEST{ .HEX = bb1->epilogue[1].bytecode[0] }.mask_w, false);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.tmp_reg_index, 6);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.fp16, 1);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_x, 2);
+		EXPECT_EQ(SRC0{ .HEX = bb1->epilogue[1].bytecode[1] }.swizzle_y, 3);
 	}
 }