From 9d30716aa8633cea4153c29bb7dbcfbcf6444e35 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sat, 29 Nov 2025 23:56:01 +0300 Subject: [PATCH] rsx: Implement register annotation pass --- rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.cpp | 428 ++++++++++++++++++ rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h | 106 +++++ rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp | 1 + rpcs3/Emu/RSX/Program/Assembler/IR.h | 10 +- .../Passes/FP/RegisterAnnotationPass.cpp | 181 ++++++++ .../Passes/{ => FP}/RegisterAnnotationPass.h | 14 +- .../{ => FP}/RegisterDependencyPass.cpp | 2 +- .../Passes/{ => FP}/RegisterDependencyPass.h | 4 +- .../Passes/RegisterAnnotationPass.cpp | 10 - rpcs3/Emu/RSX/Program/RSXFragmentProgram.h | 95 +--- rpcs3/emucore.vcxproj | 10 +- rpcs3/emucore.vcxproj.filters | 25 +- 12 files changed, 788 insertions(+), 98 deletions(-) create mode 100644 rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.cpp create mode 100644 rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h create mode 100644 rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp rename rpcs3/Emu/RSX/Program/Assembler/Passes/{ => FP}/RegisterAnnotationPass.h (54%) rename rpcs3/Emu/RSX/Program/Assembler/Passes/{ => FP}/RegisterDependencyPass.cpp (76%) rename rpcs3/Emu/RSX/Program/Assembler/Passes/{ => FP}/RegisterDependencyPass.h (89%) delete mode 100644 rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterAnnotationPass.cpp diff --git a/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.cpp b/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.cpp new file mode 100644 index 0000000000..401bfd5492 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.cpp @@ -0,0 +1,428 @@ +#include "stdafx.h" +#include "FPOpcodes.h" + +#include "Emu/RSX/Common/simple_array.hpp" +#include "Emu/RSX/Program/RSXFragmentProgram.h" + +#include + +namespace rsx::assembler::FP +{ + // Returns the number of source operands the annotation pass should decode for the given opcode. + u8 get_operand_count(FP_opcode opcode) + { + switch (opcode) + { + case RSX_FP_OPCODE_NOP: + return 0; + case RSX_FP_OPCODE_MOV: + return 1; // MOV only reads SRC0; counting SRC1 would decode an unused word as a register read + case RSX_FP_OPCODE_MUL: + case
RSX_FP_OPCODE_ADD: + return 2; + case RSX_FP_OPCODE_MAD: + return 3; + case RSX_FP_OPCODE_DP3: + case RSX_FP_OPCODE_DP4: + return 2; + case RSX_FP_OPCODE_DST: + return 2; + case RSX_FP_OPCODE_MIN: + case RSX_FP_OPCODE_MAX: + return 2; + case RSX_FP_OPCODE_SLT: + case RSX_FP_OPCODE_SGE: + case RSX_FP_OPCODE_SLE: + case RSX_FP_OPCODE_SGT: + case RSX_FP_OPCODE_SNE: + case RSX_FP_OPCODE_SEQ: + return 2; + case RSX_FP_OPCODE_FRC: + case RSX_FP_OPCODE_FLR: + return 1; + case RSX_FP_OPCODE_KIL: + return 0; + case RSX_FP_OPCODE_PK4: + case RSX_FP_OPCODE_UP4: + return 1; + case RSX_FP_OPCODE_DDX: + case RSX_FP_OPCODE_DDY: + return 1; + case RSX_FP_OPCODE_TEX: + case RSX_FP_OPCODE_TXD: + case RSX_FP_OPCODE_TXP: + return 1; + case RSX_FP_OPCODE_RCP: + case RSX_FP_OPCODE_RSQ: + case RSX_FP_OPCODE_EX2: + case RSX_FP_OPCODE_LG2: + return 1; + case RSX_FP_OPCODE_LIT: + return 1; + case RSX_FP_OPCODE_LRP: + return 3; + case RSX_FP_OPCODE_STR: + case RSX_FP_OPCODE_SFL: + return 0; + case RSX_FP_OPCODE_COS: + case RSX_FP_OPCODE_SIN: + return 1; + case RSX_FP_OPCODE_PK2: + case RSX_FP_OPCODE_UP2: + return 1; + case RSX_FP_OPCODE_PKB: + case RSX_FP_OPCODE_UPB: + case RSX_FP_OPCODE_PK16: + case RSX_FP_OPCODE_UP16: + case RSX_FP_OPCODE_PKG: + case RSX_FP_OPCODE_UPG: + return 1; + case RSX_FP_OPCODE_DP2A: + return 3; + case RSX_FP_OPCODE_TXL: + case RSX_FP_OPCODE_TXB: + return 2; + case RSX_FP_OPCODE_DP2: + return 2; + case RSX_FP_OPCODE_NRM: + return 1; + case RSX_FP_OPCODE_DIV: + case RSX_FP_OPCODE_DIVSQ: + return 2; + case RSX_FP_OPCODE_LIF: + return 1; + case RSX_FP_OPCODE_FENCT: + case RSX_FP_OPCODE_FENCB: + case RSX_FP_OPCODE_BRK: + case RSX_FP_OPCODE_CAL: + case RSX_FP_OPCODE_IFE: + case RSX_FP_OPCODE_LOOP: + case RSX_FP_OPCODE_REP: + case RSX_FP_OPCODE_RET: + // Flow control. Special registers are provided for these outside the common file + return 0; + + // The rest are unimplemented and not encountered in real software.
+ // TODO: Probe these on real PS3 and figure out what they actually do. + case RSX_FP_OPCODE_POW: + fmt::throw_exception("Unimplemented POW instruction."); // Unused + case RSX_FP_OPCODE_BEM: + case RSX_FP_OPCODE_TEXBEM: + case RSX_FP_OPCODE_TXPBEM: + case RSX_FP_OPCODE_BEMLUM: + fmt::throw_exception("Unimplemented BEM class instruction"); // Unused + case RSX_FP_OPCODE_REFL: + return 2; + case RSX_FP_OPCODE_TIMESWTEX: + fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused + default: + break; + } + + return 0; + } + + // Returns a lane mask for the given operand. + // The lane mask is the fixed function hardware lane so swizzles need to be applied on top to resolve the real data channel. + u32 get_src_vector_lane_mask(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand) + { + constexpr u32 x = 0b0001; + constexpr u32 y = 0b0010; + constexpr u32 z = 0b0100; + constexpr u32 w = 0b1000; + constexpr u32 xy = 0b0011; + constexpr u32 xyz = 0b0111; + constexpr u32 xyzw = 0b1111; + + const auto decode = [&](const rsx::simple_array& masks) -> u32 + { + return operand < masks.size() + ? masks[operand] + : 0u; + }; + + auto opcode = static_cast(instruction->opcode); + if (operand >= get_operand_count(opcode)) + { + return 0; + } + + OPDEST d0 { .HEX = instruction->bytecode[0] }; + const u32 dst_write_mask = d0.no_dest ? 
0 : d0.write_mask; + + switch (opcode) + { + case RSX_FP_OPCODE_NOP: + return 0; + case RSX_FP_OPCODE_MOV: + case RSX_FP_OPCODE_MUL: + case RSX_FP_OPCODE_ADD: + case RSX_FP_OPCODE_MAD: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_DP3: + return xyz; + case RSX_FP_OPCODE_DP4: + return xyzw; + case RSX_FP_OPCODE_DST: + return decode({ y | z, y | w }); + case RSX_FP_OPCODE_MIN: + case RSX_FP_OPCODE_MAX: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_SLT: + case RSX_FP_OPCODE_SGE: + case RSX_FP_OPCODE_SLE: + case RSX_FP_OPCODE_SGT: + case RSX_FP_OPCODE_SNE: + case RSX_FP_OPCODE_SEQ: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_FRC: + case RSX_FP_OPCODE_FLR: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_KIL: + return 0; + case RSX_FP_OPCODE_PK4: + return xyzw; + case RSX_FP_OPCODE_UP4: + return x; + case RSX_FP_OPCODE_DDX: + case RSX_FP_OPCODE_DDY: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_TEX: + case RSX_FP_OPCODE_TXD: + switch (prog.get_texture_dimension(d0.tex_num)) + { + case rsx::texture_dimension_extended::texture_dimension_1d: + return x; + case rsx::texture_dimension_extended::texture_dimension_2d: + return xy; + case rsx::texture_dimension_extended::texture_dimension_3d: + case rsx::texture_dimension_extended::texture_dimension_cubemap: + return xyz; + default: + return 0; + } + case RSX_FP_OPCODE_TXP: + switch (prog.get_texture_dimension(d0.tex_num)) + { + case rsx::texture_dimension_extended::texture_dimension_1d: + return xy; + case rsx::texture_dimension_extended::texture_dimension_2d: + return xyz; + case rsx::texture_dimension_extended::texture_dimension_3d: + case rsx::texture_dimension_extended::texture_dimension_cubemap: + return xyzw; + default: + return 0; + } + case RSX_FP_OPCODE_RCP: + case RSX_FP_OPCODE_RSQ: + case RSX_FP_OPCODE_EX2: + case RSX_FP_OPCODE_LG2: + return x; + case RSX_FP_OPCODE_LIT: + return xyzw; + case RSX_FP_OPCODE_LRP: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_STR: + case 
RSX_FP_OPCODE_SFL: + return xyzw & dst_write_mask; + case RSX_FP_OPCODE_COS: + case RSX_FP_OPCODE_SIN: + return x; + case RSX_FP_OPCODE_PK2: + return xy; + case RSX_FP_OPCODE_UP2: + return x; + case RSX_FP_OPCODE_PKB: + return xyzw; + case RSX_FP_OPCODE_UPB: + return x; + case RSX_FP_OPCODE_PK16: + return xy; + case RSX_FP_OPCODE_UP16: + return x; + case RSX_FP_OPCODE_PKG: + return xyzw; + case RSX_FP_OPCODE_UPG: + return x; + case RSX_FP_OPCODE_DP2A: + return decode({ xy, xy, x }); + case RSX_FP_OPCODE_TXL: + case RSX_FP_OPCODE_TXB: + return decode({ xy, x }); + case RSX_FP_OPCODE_REFL: + return xyzw; + case RSX_FP_OPCODE_DP2: + return xy; + case RSX_FP_OPCODE_NRM: + return xyz; + case RSX_FP_OPCODE_DIV: + case RSX_FP_OPCODE_DIVSQ: + return decode({ xyzw, x }); + case RSX_FP_OPCODE_LIF: + return decode({ y | w }); + case RSX_FP_OPCODE_FENCT: + case RSX_FP_OPCODE_FENCB: + case RSX_FP_OPCODE_BRK: + case RSX_FP_OPCODE_CAL: + case RSX_FP_OPCODE_IFE: + case RSX_FP_OPCODE_LOOP: + case RSX_FP_OPCODE_REP: + case RSX_FP_OPCODE_RET: + // Flow control. Special registers are provided for these outside the common file + return 0; + + case RSX_FP_OPCODE_POW: + fmt::throw_exception("Unimplemented POW instruction."); // Unused ?? + case RSX_FP_OPCODE_BEM: + case RSX_FP_OPCODE_TEXBEM: + case RSX_FP_OPCODE_TXPBEM: + case RSX_FP_OPCODE_BEMLUM: + fmt::throw_exception("Unimplemented BEM class instruction"); // Unused + case RSX_FP_OPCODE_TIMESWTEX: + fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused + default: + break; + } + + return 0; + } + + // Resolved vector lane mask with swizzles applied. + u32 get_src_vector_lane_mask_shuffled(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand) + { + // Brute-force this. There's only 16 permutations. 
+ constexpr u32 x = 0b0001; // Lane bits; must use the same encoding as get_src_vector_lane_mask + constexpr u32 y = 0b0010; + constexpr u32 z = 0b0100; + constexpr u32 w = 0b1000; + + const u32 lane_mask = get_src_vector_lane_mask(prog, instruction, operand); + if (!lane_mask) + { + return lane_mask; + } + + // Now we resolve matching lanes. + // This sequence can be drastically sped up using lookup tables but that will come later. + std::unordered_set inputs; + SRC_Common src { .HEX = instruction->bytecode[operand + 1] }; + + if (src.reg_type != RSX_FP_REGISTER_TYPE_TEMP) + { + return 0; + } + + if (lane_mask & x) inputs.insert(src.swizzle_x); // Collect the source channel feeding each consumed lane + if (lane_mask & y) inputs.insert(src.swizzle_y); + if (lane_mask & z) inputs.insert(src.swizzle_z); + if (lane_mask & w) inputs.insert(src.swizzle_w); + + u32 result = 0; + if (inputs.contains(0)) result |= x; // Swizzle selector 0..3 maps to channel x..w + if (inputs.contains(1)) result |= y; + if (inputs.contains(2)) result |= z; + if (inputs.contains(3)) result |= w; + + return result; + } + + bool is_delay_slot(const Instruction* instruction) + { + OPDEST dst { .HEX = instruction->bytecode[0] }; + SRC0 src0 { .HEX = instruction->bytecode[1] }; + SRC1 src1{ .HEX = instruction->bytecode[2] }; + + if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV + dst.no_dest || // Must have a sink + src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg + dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self + dst.fp16 || // Always full lane. 
We need to collect more data on this but it won't matter + dst.saturate || // Precision modifier + (dst.prec != RSX_FP_PRECISION_REAL && + dst.prec != RSX_FP_PRECISION_UNKNOWN)) // Cannot have precision modifiers + { + return false; + } + + // Check if we have precision modifiers on the source + if (src0.abs || src0.neg || src1.scale) + { + return false; + } + + if (dst.mask_x && src0.swizzle_x != 0) return false; + if (dst.mask_y && src0.swizzle_y != 1) return false; + if (dst.mask_z && src0.swizzle_z != 2) return false; + if (dst.mask_w && src0.swizzle_w != 3) return false; + + return true; + } + + RegisterRef get_src_register(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand) + { + SRC_Common src{ .HEX = instruction->bytecode[operand + 1] }; + if (src.reg_type != RSX_FP_REGISTER_TYPE_TEMP) + { + return {}; + } + + const u32 read_lanes = get_src_vector_lane_mask_shuffled(prog, instruction, operand); + if (!read_lanes) + { + return {}; + } + + RegisterRef ref{ .mask = read_lanes }; + Register& reg = ref.reg; + + reg.f16 = !!src.fp16; + reg.id = src.tmp_reg_index; + return ref; + } + + RegisterRef get_dst_register(const Instruction* instruction) + { + OPDEST dst { .HEX = instruction->bytecode[0] }; + if (dst.no_dest) + { + return {}; + } + + RegisterRef ref{ .mask = dst.write_mask }; + ref.reg.f16 = dst.fp16; + ref.reg.id = dst.dest_reg; + return ref; + } + + // Convert vector mask to file range + rsx::simple_array get_register_file_range(const RegisterRef& reg) + { + if (!reg.mask) + { + return {}; + } + + constexpr u32 register_file_max_len = 48 * 8; // H0 - H47, R0 - R23 + + const u32 lane_width = reg.reg.f16 ? 
2 : 4; + const u32 file_offset = reg.reg.id * lane_width * 4; + + ensure(file_offset < register_file_max_len, "Invalid register index"); + + rsx::simple_array result{}; + auto insert_lane = [&](u32 word_offset) + { + for (u32 i = 0; i < lane_width; ++i) + { + result.push_back(file_offset + (word_offset * lane_width) + i); + } + }; + + if (reg.x) insert_lane(0); + if (reg.y) insert_lane(1); + if (reg.z) insert_lane(2); + if (reg.w) insert_lane(3); + + return result; + } +} diff --git a/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h b/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h new file mode 100644 index 0000000000..b2297a24ab --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/FPOpcodes.h @@ -0,0 +1,106 @@ +#pragma once + +#include "IR.h" +#include "Emu/RSX/Common/simple_array.hpp" + +struct RSXFragmentProgram; + +namespace rsx::assembler +{ + enum FP_opcode + { + RSX_FP_OPCODE_NOP = 0x00, // No-Operation + RSX_FP_OPCODE_MOV = 0x01, // Move + RSX_FP_OPCODE_MUL = 0x02, // Multiply + RSX_FP_OPCODE_ADD = 0x03, // Add + RSX_FP_OPCODE_MAD = 0x04, // Multiply-Add + RSX_FP_OPCODE_DP3 = 0x05, // 3-component Dot Product + RSX_FP_OPCODE_DP4 = 0x06, // 4-component Dot Product + RSX_FP_OPCODE_DST = 0x07, // Distance + RSX_FP_OPCODE_MIN = 0x08, // Minimum + RSX_FP_OPCODE_MAX = 0x09, // Maximum + RSX_FP_OPCODE_SLT = 0x0A, // Set-If-LessThan + RSX_FP_OPCODE_SGE = 0x0B, // Set-If-GreaterEqual + RSX_FP_OPCODE_SLE = 0x0C, // Set-If-LessEqual + RSX_FP_OPCODE_SGT = 0x0D, // Set-If-GreaterThan + RSX_FP_OPCODE_SNE = 0x0E, // Set-If-NotEqual + RSX_FP_OPCODE_SEQ = 0x0F, // Set-If-Equal + RSX_FP_OPCODE_FRC = 0x10, // Fraction (fract) + RSX_FP_OPCODE_FLR = 0x11, // Floor + RSX_FP_OPCODE_KIL = 0x12, // Kill fragment + RSX_FP_OPCODE_PK4 = 0x13, // Pack four signed 8-bit values + RSX_FP_OPCODE_UP4 = 0x14, // Unpack four signed 8-bit values + RSX_FP_OPCODE_DDX = 0x15, // Partial-derivative in x (Screen space derivative w.r.t. 
x) + RSX_FP_OPCODE_DDY = 0x16, // Partial-derivative in y (Screen space derivative w.r.t. y) + RSX_FP_OPCODE_TEX = 0x17, // Texture lookup + RSX_FP_OPCODE_TXP = 0x18, // Texture sample with projection (Projective texture lookup) + RSX_FP_OPCODE_TXD = 0x19, // Texture sample with partial differentiation (Texture lookup with derivatives) + RSX_FP_OPCODE_RCP = 0x1A, // Reciprocal + RSX_FP_OPCODE_RSQ = 0x1B, // Reciprocal Square Root + RSX_FP_OPCODE_EX2 = 0x1C, // Exponentiation base 2 + RSX_FP_OPCODE_LG2 = 0x1D, // Log base 2 + RSX_FP_OPCODE_LIT = 0x1E, // Lighting coefficients + RSX_FP_OPCODE_LRP = 0x1F, // Linear Interpolation + RSX_FP_OPCODE_STR = 0x20, // Set-If-True + RSX_FP_OPCODE_SFL = 0x21, // Set-If-False + RSX_FP_OPCODE_COS = 0x22, // Cosine + RSX_FP_OPCODE_SIN = 0x23, // Sine + RSX_FP_OPCODE_PK2 = 0x24, // Pack two 16-bit floats + RSX_FP_OPCODE_UP2 = 0x25, // Unpack two 16-bit floats + RSX_FP_OPCODE_POW = 0x26, // Power + RSX_FP_OPCODE_PKB = 0x27, // Pack bytes + RSX_FP_OPCODE_UPB = 0x28, // Unpack bytes + RSX_FP_OPCODE_PK16 = 0x29, // Pack 16 bits + RSX_FP_OPCODE_UP16 = 0x2A, // Unpack 16 + RSX_FP_OPCODE_BEM = 0x2B, // Bump-environment map (a.k.a. 2D coordinate transform) + RSX_FP_OPCODE_PKG = 0x2C, // Pack with sRGB transformation + RSX_FP_OPCODE_UPG = 0x2D, // Unpack gamma + RSX_FP_OPCODE_DP2A = 0x2E, // 2-component dot product with scalar addition + RSX_FP_OPCODE_TXL = 0x2F, // Texture sample with explicit LOD + RSX_FP_OPCODE_TXB = 0x31, // Texture sample with bias + RSX_FP_OPCODE_TEXBEM = 0x33, + RSX_FP_OPCODE_TXPBEM = 0x34, + RSX_FP_OPCODE_BEMLUM = 0x35, + RSX_FP_OPCODE_REFL = 0x36, // Reflection vector + RSX_FP_OPCODE_TIMESWTEX = 0x37, + RSX_FP_OPCODE_DP2 = 0x38, // 2-component dot product + RSX_FP_OPCODE_NRM = 0x39, // Normalize + RSX_FP_OPCODE_DIV = 0x3A, // Division + RSX_FP_OPCODE_DIVSQ = 0x3B, // Divide by Square Root + RSX_FP_OPCODE_LIF = 0x3C, // Final part of LIT + RSX_FP_OPCODE_FENCT = 0x3D, // Fence T? 
+ RSX_FP_OPCODE_FENCB = 0x3E, // Fence B? + RSX_FP_OPCODE_BRK = 0x40, // Break + RSX_FP_OPCODE_CAL = 0x41, // Subroutine call + RSX_FP_OPCODE_IFE = 0x42, // If + RSX_FP_OPCODE_LOOP = 0x43, // Loop + RSX_FP_OPCODE_REP = 0x44, // Repeat + RSX_FP_OPCODE_RET = 0x45 // Return + }; + + namespace FP + { + // Returns number of operands consumed by an instruction + u8 get_operand_count(FP_opcode opcode); + + // Returns a lane mask for the given operand. + // The lane mask is the fixed function hardware lane so swizzles need to be applied on top to resolve the real data channel. + u32 get_src_vector_lane_mask(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand); + + // Resolved vector lane mask with swizzles applied. + u32 get_src_vector_lane_mask_shuffled(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand); + + // Returns true on delay slot instructions. + bool is_delay_slot(const Instruction* instruction); + + // Generate register references + RegisterRef get_src_register(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand); + RegisterRef get_dst_register(const Instruction* instruction); + + // Convert vector mask to file ranges + rsx::simple_array get_register_file_range(const RegisterRef& reg); + + // Compile a register file annotated blob to register references + std::vector compile_register_file(const std::array& file); + } +} diff --git a/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp b/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp index d8de4eda0b..055d74a88c 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp +++ b/rpcs3/Emu/RSX/Program/Assembler/FPToCFG.cpp @@ -126,6 +126,7 @@ namespace rsx::assembler std::memcpy(ir_inst.bytecode, &decoded._u32[0], 16); ir_inst.length = 4; ir_inst.addr = pc * 16; + ir_inst.opcode = opcode; switch (opcode) { diff --git a/rpcs3/Emu/RSX/Program/Assembler/IR.h b/rpcs3/Emu/RSX/Program/Assembler/IR.h index 65960f3d99..0fbd1f98ae 100644 --- 
a/rpcs3/Emu/RSX/Program/Assembler/IR.h +++ b/rpcs3/Emu/RSX/Program/Assembler/IR.h @@ -19,7 +19,7 @@ namespace rsx::assembler // Vector information union { - u32 mask; + u32 mask = 0; struct { @@ -29,6 +29,11 @@ namespace rsx::assembler bool w : 1; }; }; + + operator bool() const + { + return !!mask; + } }; struct Instruction @@ -78,6 +83,9 @@ namespace rsx::assembler std::vector prologue; // Prologue, created by passes std::vector epilogue; // Epilogue, created by passes + std::vector input_list; // Register inputs. + std::vector clobber_list; // Clobbered outputs + FlowEdge* insert_succ(BasicBlock* b, EdgeType type = EdgeType::NONE) { FlowEdge e{ .type = type, .from = this, .to = b }; diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp new file mode 100644 index 0000000000..196bc19361 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp @@ -0,0 +1,181 @@ +#include "stdafx.h" +#include "RegisterAnnotationPass.h" +#include "Emu/RSX/Program/Assembler/FPOpcodes.h" + +#include +#include + +namespace rsx::assembler::FP +{ + static constexpr u32 register_file_length = 48 * 8; // 24 F32 or 48 F16 registers + static constexpr char content_unknown = 0; + static constexpr char content_float32 = 'F'; + static constexpr char content_float16 = 'H'; + static constexpr char content_dual = 'D'; + + std::vector compile_register_file(const std::array& file) + { + std::vector results; + + // F16 register processing + for (int reg16 = 0; reg16 < 48; ++reg16) + { + const u32 offset = reg16 * 8; + auto word = *reinterpret_cast(&file[offset]); + + if (!word) [[ likely ]] + { + // Trivial rejection, very commonly hit. 
+ continue; + } + + RegisterRef ref{ .reg {.id = reg16, .f16 = true } }; + ref.x = (file[offset] == content_dual || file[offset] == content_float16); + ref.y = (file[offset + 2] == content_dual || file[offset + 2] == content_float16); + ref.z = (file[offset + 4] == content_dual || file[offset + 4] == content_float16); + ref.w = (file[offset + 6] == content_dual || file[offset + 6] == content_float16); + + if (ref) + { + results.push_back(ref); + } + } + + // Helper to check a span for 32-bit access + auto match_any_32 = [](const std::span lanes) + { + for (const auto& c : lanes) + { + if (c == content_dual || c == content_float32) + { + return true; + } + } + return false; + }; + + // F32 register processing + for (int reg32 = 0; reg32 < 24; ++reg32) + { + const u32 offset = reg32 * 16; + auto word0 = *reinterpret_cast(&file[offset]); + auto word1 = *reinterpret_cast(&file[offset + 8]); + + if (!word0 && !word1) [[ likely ]] + { + // Trivial rejection, very commonly hit. + continue; + } + + RegisterRef ref{ .reg {.id = reg32, .f16 = false } }; + if (word0) + { + ref.x = match_any_32({ &file[offset], 4 }); + ref.y = match_any_32({ &file[offset + 4], 4 }); + } + + if (word1) + { + ref.z = match_any_32({ &file[offset + 8], 4 }); + ref.w = match_any_32({ &file[offset + 12], 4 }); + } + + if (ref) + { + results.push_back(ref); + } + } + + return results; + } + + // Decay instructions into register references + void annotate_instructions(BasicBlock* block, const RSXFragmentProgram& prog) + { + for (auto& instruction : block->instructions) + { + const u32 operand_count = get_operand_count(static_cast(instruction.opcode)); + for (u32 i = 0; i < operand_count; i++) + { + RegisterRef reg = get_src_register(prog, &instruction, i); + if (!reg.mask) continue; // get_src_register returns an empty ref for non-temp operands or zero-lane reads; nothing to record + + instruction.srcs.push_back(reg); + } + + RegisterRef dst = get_dst_register(&instruction); + if (dst) + { + instruction.dsts.push_back(dst); + } + } + } + + // Annotate each block with input and
output lanes (read and clobber list) + void annotate_block_io(BasicBlock* block) + { + alignas(16) std::array output_register_file; + alignas(16) std::array input_register_file; // We'll eventually replace with a bitfield mask, but for ease of debugging, we use char for now + + std::memset(output_register_file.data(), content_unknown, register_file_length); + std::memset(input_register_file.data(), content_unknown, register_file_length); + + for (const auto& instruction : block->instructions) + { + for (const auto& src : instruction.srcs) + { + const auto read_bytes = get_register_file_range(src); + const char expected_type = src.reg.f16 ? content_float16 : content_float32; // Tag bytes with the width they were read as + for (const auto& index : read_bytes) + { + if (output_register_file[index] != content_unknown) + { + // Something already wrote to this lane + continue; + } + + if (input_register_file[index] == expected_type) + { + // We already know about this input + continue; + } + + if (input_register_file[index] == 0) + { + // Not known, tag as input + input_register_file[index] = expected_type; + continue; + } + + // Collision on the lane + input_register_file[index] = content_dual; + } + } + + if (!instruction.dsts.empty()) + { + const auto& dst = instruction.dsts.front(); + const auto write_bytes = get_register_file_range(dst); + const char expected_type = dst.reg.f16 ? 
content_float16 : content_float32; // Tag bytes with the width they were written as + + for (const auto& index : write_bytes) + { + output_register_file[index] = expected_type; + } + } + } + + // Compile the input and output refs into register references + block->clobber_list = compile_register_file(output_register_file); + block->input_list = compile_register_file(input_register_file); + } + + void RegisterAnnotationPass::run(FlowGraph& graph) + { + for (auto& block : graph.blocks) + { + annotate_instructions(&block, m_prog); + annotate_block_io(&block); + } + } +} diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterAnnotationPass.h b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h similarity index 54% rename from rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterAnnotationPass.h rename to rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h index f08f0d6e8e..8856ecb44e 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterAnnotationPass.h +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h @@ -1,16 +1,26 @@ #pragma once -#include "../CFG.h" +#include "../../CFG.h" -namespace rsx::assembler +struct RSXFragmentProgram; + +namespace rsx::assembler::FP { // The annotation pass annotates each basic block with 2 pieces of information: // 1. The "input" register list for a block. // 2. The "output" register list for a block (clobber list). // The information can be used by other passes to set up prologue/epilogue on each block. + // The pass also populates register reference members of each instruction, such as the input and output lanes. 
class RegisterAnnotationPass : public CFGPass { public: + RegisterAnnotationPass(RSXFragmentProgram& prog) + : m_prog(prog) + {} + void run(FlowGraph& graph) override; + + private: + const RSXFragmentProgram& m_prog; }; } diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterDependencyPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp similarity index 76% rename from rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterDependencyPass.cpp rename to rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp index 290f1a4022..1e60c85519 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterDependencyPass.cpp +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp @@ -1,7 +1,7 @@ #include "stdafx.h" #include "RegisterDependencyPass.h" -namespace rsx::assembler +namespace rsx::assembler::FP { void RegisterDependencyPass::run(FlowGraph& graph) { diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterDependencyPass.h b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h similarity index 89% rename from rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterDependencyPass.h rename to rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h index b7af86761b..48068691e1 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterDependencyPass.h +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h @@ -1,8 +1,8 @@ #pragma once -#include "../CFG.h" +#include "../../CFG.h" -namespace rsx::assembler +namespace rsx::assembler::FP { // The register dependency pass identifies data hazards for each basic block and injects barrier instructions. // Real PS3 does not have explicit barriers, but does instead often use delay slots or fence instructions to stall until a specific hardware unit clears the fence to advance. 
diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterAnnotationPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterAnnotationPass.cpp deleted file mode 100644 index 8d23080daa..0000000000 --- a/rpcs3/Emu/RSX/Program/Assembler/Passes/RegisterAnnotationPass.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include "stdafx.h" -#include "RegisterAnnotationPass.h" - -namespace rsx::assembler -{ - void RegisterAnnotationPass::run(FlowGraph& graph) - { - // TODO - } -} diff --git a/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h b/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h index f834b7c7f5..e20098ff57 100644 --- a/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h +++ b/rpcs3/Emu/RSX/Program/RSXFragmentProgram.h @@ -1,6 +1,7 @@ #pragma once #include "program_util.h" +#include "Assembler/FPOpcodes.h" #include #include @@ -23,76 +24,7 @@ enum register_precision RSX_FP_PRECISION_UNKNOWN = 5 // Unknown what this actually does; seems to do nothing on hwtests but then why would their compiler emit it? }; -enum fp_opcode -{ - RSX_FP_OPCODE_NOP = 0x00, // No-Operation - RSX_FP_OPCODE_MOV = 0x01, // Move - RSX_FP_OPCODE_MUL = 0x02, // Multiply - RSX_FP_OPCODE_ADD = 0x03, // Add - RSX_FP_OPCODE_MAD = 0x04, // Multiply-Add - RSX_FP_OPCODE_DP3 = 0x05, // 3-component Dot Product - RSX_FP_OPCODE_DP4 = 0x06, // 4-component Dot Product - RSX_FP_OPCODE_DST = 0x07, // Distance - RSX_FP_OPCODE_MIN = 0x08, // Minimum - RSX_FP_OPCODE_MAX = 0x09, // Maximum - RSX_FP_OPCODE_SLT = 0x0A, // Set-If-LessThan - RSX_FP_OPCODE_SGE = 0x0B, // Set-If-GreaterEqual - RSX_FP_OPCODE_SLE = 0x0C, // Set-If-LessEqual - RSX_FP_OPCODE_SGT = 0x0D, // Set-If-GreaterThan - RSX_FP_OPCODE_SNE = 0x0E, // Set-If-NotEqual - RSX_FP_OPCODE_SEQ = 0x0F, // Set-If-Equal - RSX_FP_OPCODE_FRC = 0x10, // Fraction (fract) - RSX_FP_OPCODE_FLR = 0x11, // Floor - RSX_FP_OPCODE_KIL = 0x12, // Kill fragment - RSX_FP_OPCODE_PK4 = 0x13, // Pack four signed 8-bit values - RSX_FP_OPCODE_UP4 = 0x14, // Unpack four signed 8-bit values - 
RSX_FP_OPCODE_DDX = 0x15, // Partial-derivative in x (Screen space derivative w.r.t. x) - RSX_FP_OPCODE_DDY = 0x16, // Partial-derivative in y (Screen space derivative w.r.t. y) - RSX_FP_OPCODE_TEX = 0x17, // Texture lookup - RSX_FP_OPCODE_TXP = 0x18, // Texture sample with projection (Projective texture lookup) - RSX_FP_OPCODE_TXD = 0x19, // Texture sample with partial differentiation (Texture lookup with derivatives) - RSX_FP_OPCODE_RCP = 0x1A, // Reciprocal - RSX_FP_OPCODE_RSQ = 0x1B, // Reciprocal Square Root - RSX_FP_OPCODE_EX2 = 0x1C, // Exponentiation base 2 - RSX_FP_OPCODE_LG2 = 0x1D, // Log base 2 - RSX_FP_OPCODE_LIT = 0x1E, // Lighting coefficients - RSX_FP_OPCODE_LRP = 0x1F, // Linear Interpolation - RSX_FP_OPCODE_STR = 0x20, // Set-If-True - RSX_FP_OPCODE_SFL = 0x21, // Set-If-False - RSX_FP_OPCODE_COS = 0x22, // Cosine - RSX_FP_OPCODE_SIN = 0x23, // Sine - RSX_FP_OPCODE_PK2 = 0x24, // Pack two 16-bit floats - RSX_FP_OPCODE_UP2 = 0x25, // Unpack two 16-bit floats - RSX_FP_OPCODE_POW = 0x26, // Power - RSX_FP_OPCODE_PKB = 0x27, // Pack bytes - RSX_FP_OPCODE_UPB = 0x28, // Unpack bytes - RSX_FP_OPCODE_PK16 = 0x29, // Pack 16 bits - RSX_FP_OPCODE_UP16 = 0x2A, // Unpack 16 - RSX_FP_OPCODE_BEM = 0x2B, // Bump-environment map (a.k.a. 
2D coordinate transform) - RSX_FP_OPCODE_PKG = 0x2C, // Pack with sRGB transformation - RSX_FP_OPCODE_UPG = 0x2D, // Unpack gamma - RSX_FP_OPCODE_DP2A = 0x2E, // 2-component dot product with scalar addition - RSX_FP_OPCODE_TXL = 0x2F, // Texture sample with explicit LOD - RSX_FP_OPCODE_TXB = 0x31, // Texture sample with bias - RSX_FP_OPCODE_TEXBEM = 0x33, - RSX_FP_OPCODE_TXPBEM = 0x34, - RSX_FP_OPCODE_BEMLUM = 0x35, - RSX_FP_OPCODE_REFL = 0x36, // Reflection vector - RSX_FP_OPCODE_TIMESWTEX = 0x37, - RSX_FP_OPCODE_DP2 = 0x38, // 2-component dot product - RSX_FP_OPCODE_NRM = 0x39, // Normalize - RSX_FP_OPCODE_DIV = 0x3A, // Division - RSX_FP_OPCODE_DIVSQ = 0x3B, // Divide by Square Root - RSX_FP_OPCODE_LIF = 0x3C, // Final part of LIT - RSX_FP_OPCODE_FENCT = 0x3D, // Fence T? - RSX_FP_OPCODE_FENCB = 0x3E, // Fence B? - RSX_FP_OPCODE_BRK = 0x40, // Break - RSX_FP_OPCODE_CAL = 0x41, // Subroutine call - RSX_FP_OPCODE_IFE = 0x42, // If - RSX_FP_OPCODE_LOOP = 0x43, // Loop - RSX_FP_OPCODE_REP = 0x44, // Repeat - RSX_FP_OPCODE_RET = 0x45 // Return -}; +using enum rsx::assembler::FP_opcode; union OPDEST { @@ -116,6 +48,12 @@ union OPDEST u32 no_dest : 1; u32 saturate : 1; // _sat }; + + struct + { + u32 : 9; + u32 write_mask : 4; + }; }; union SRC0 @@ -207,6 +145,23 @@ union SRC2 }; }; +union SRC_Common +{ + u32 HEX; + + struct + { + u32 reg_type : 2; + u32 tmp_reg_index : 6; + u32 fp16 : 1; + u32 swizzle_x : 2; + u32 swizzle_y : 2; + u32 swizzle_z : 2; + u32 swizzle_w : 2; + u32 neg : 1; + }; +}; + constexpr const char* rsx_fp_input_attr_regs[] = { "WPOS", "COL0", "COL1", "FOGC", "TEX0", diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index 5e8971fab2..1b46634019 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -156,9 +156,10 @@ + - - + + @@ -703,9 +704,10 @@ + - - + + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index d0b2a25233..94cc8c5aed 100644 --- a/rpcs3/emucore.vcxproj.filters +++ 
b/rpcs3/emucore.vcxproj.filters @@ -139,6 +139,9 @@ {d13db076-47e4-45b9-bb8a-6b711ea40622} + + {7fb59544-9761-4b4a-bb04-07deb43cf3c2} + @@ -1381,11 +1384,14 @@ Emu\GPU\RSX\Program\Assembler - - Emu\GPU\RSX\Program\Assembler\Passes + + Emu\GPU\RSX\Program\Assembler\Passes\FP - - Emu\GPU\RSX\Program\Assembler\Passes + + Emu\GPU\RSX\Program\Assembler\Passes\FP + + + Emu\GPU\RSX\Program\Assembler @@ -2785,11 +2791,14 @@ Emu\GPU\RSX\Program\Assembler - - Emu\GPU\RSX\Program\Assembler\Passes + + Emu\GPU\RSX\Program\Assembler - - Emu\GPU\RSX\Program\Assembler\Passes + + Emu\GPU\RSX\Program\Assembler\Passes\FP + + + Emu\GPU\RSX\Program\Assembler\Passes\FP