rsx: Implement register annotation pass

This commit is contained in:
kd-11 2025-11-29 23:56:01 +03:00 committed by kd-11
parent e20bae3cd7
commit 9d30716aa8
12 changed files with 787 additions and 98 deletions

View File

@ -0,0 +1,427 @@
#include "stdafx.h"
#include "FPOpcodes.h"
#include "Emu/RSX/Common/simple_array.hpp"
#include "Emu/RSX/Program/RSXFragmentProgram.h"
#include <unordered_set>
namespace rsx::assembler::FP
{
// Returns the number of source operands the given opcode consumes.
// Opcodes that ignore their sources (NOP, KIL, STR, SFL) and flow-control opcodes report 0.
// Opcodes marked unimplemented raise through fmt::throw_exception instead of returning.
// Unknown opcodes fall through to 0.
u8 get_operand_count(FP_opcode opcode)
{
	switch (opcode)
	{
	case RSX_FP_OPCODE_NOP:
		return 0;
	case RSX_FP_OPCODE_MOV:
		// MOV is unary. Only SRC0 is read; reporting 2 here would make callers decode the unused SRC1 slot as a register read.
		return 1;
	case RSX_FP_OPCODE_MUL:
	case RSX_FP_OPCODE_ADD:
		return 2;
	case RSX_FP_OPCODE_MAD:
		return 3;
	case RSX_FP_OPCODE_DP3:
	case RSX_FP_OPCODE_DP4:
		return 2;
	case RSX_FP_OPCODE_DST:
		return 2;
	case RSX_FP_OPCODE_MIN:
	case RSX_FP_OPCODE_MAX:
		return 2;
	case RSX_FP_OPCODE_SLT:
	case RSX_FP_OPCODE_SGE:
	case RSX_FP_OPCODE_SLE:
	case RSX_FP_OPCODE_SGT:
	case RSX_FP_OPCODE_SNE:
	case RSX_FP_OPCODE_SEQ:
		return 2;
	case RSX_FP_OPCODE_FRC:
	case RSX_FP_OPCODE_FLR:
		return 1;
	case RSX_FP_OPCODE_KIL:
		return 0;
	case RSX_FP_OPCODE_PK4:
	case RSX_FP_OPCODE_UP4:
		return 1;
	case RSX_FP_OPCODE_DDX:
	case RSX_FP_OPCODE_DDY:
		return 1;
	case RSX_FP_OPCODE_TEX:
	case RSX_FP_OPCODE_TXD:
	case RSX_FP_OPCODE_TXP:
		return 1;
	case RSX_FP_OPCODE_RCP:
	case RSX_FP_OPCODE_RSQ:
	case RSX_FP_OPCODE_EX2:
	case RSX_FP_OPCODE_LG2:
		return 1;
	case RSX_FP_OPCODE_LIT:
		return 1;
	case RSX_FP_OPCODE_LRP:
		return 3;
	case RSX_FP_OPCODE_STR:
	case RSX_FP_OPCODE_SFL:
		return 0;
	case RSX_FP_OPCODE_COS:
	case RSX_FP_OPCODE_SIN:
		return 1;
	case RSX_FP_OPCODE_PK2:
	case RSX_FP_OPCODE_UP2:
		return 1;
	case RSX_FP_OPCODE_PKB:
	case RSX_FP_OPCODE_UPB:
	case RSX_FP_OPCODE_PK16:
	case RSX_FP_OPCODE_UP16:
	case RSX_FP_OPCODE_PKG:
	case RSX_FP_OPCODE_UPG:
		return 1;
	case RSX_FP_OPCODE_DP2A:
		return 3;
	case RSX_FP_OPCODE_TXL:
	case RSX_FP_OPCODE_TXB:
		return 2;
	case RSX_FP_OPCODE_DP2:
		return 2;
	case RSX_FP_OPCODE_NRM:
		return 1;
	case RSX_FP_OPCODE_DIV:
	case RSX_FP_OPCODE_DIVSQ:
		return 2;
	case RSX_FP_OPCODE_LIF:
		return 1;
	case RSX_FP_OPCODE_FENCT:
	case RSX_FP_OPCODE_FENCB:
	case RSX_FP_OPCODE_BRK:
	case RSX_FP_OPCODE_CAL:
	case RSX_FP_OPCODE_IFE:
	case RSX_FP_OPCODE_LOOP:
	case RSX_FP_OPCODE_REP:
	case RSX_FP_OPCODE_RET:
		// Flow control. Special registers are provided for these outside the common file
		return 0;

	// The rest are unimplemented and not encountered in real software.
	// TODO: Probe these on real PS3 and figure out what they actually do.
	case RSX_FP_OPCODE_POW:
		fmt::throw_exception("Unimplemented POW instruction."); // Unused
	case RSX_FP_OPCODE_BEM:
	case RSX_FP_OPCODE_TEXBEM:
	case RSX_FP_OPCODE_TXPBEM:
	case RSX_FP_OPCODE_BEMLUM:
		fmt::throw_exception("Unimplemented BEM class instruction"); // Unused
	case RSX_FP_OPCODE_REFL:
		return 2;
	case RSX_FP_OPCODE_TIMESWTEX:
		fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused
	default:
		break;
	}

	// Opcode value not present in FP_opcode
	return 0;
}
// Reports which lanes of source operand `operand` the instruction consumes.
// Lanes are in fixed-function hardware order (bit 0 = x ... bit 3 = w), i.e. before the operand swizzle is applied.
// The caller must apply the swizzle to find out which register channels are actually fetched.
// Returns 0 when the opcode does not consume that operand.
u32 get_src_vector_lane_mask(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand)
{
	constexpr u32 x = 0b0001;
	constexpr u32 y = 0b0010;
	constexpr u32 z = 0b0100;
	constexpr u32 w = 0b1000;
	constexpr u32 xy = 0b0011;
	constexpr u32 xyz = 0b0111;
	constexpr u32 xyzw = 0b1111;

	const auto opcode = static_cast<FP_opcode>(instruction->opcode);
	if (operand >= get_operand_count(opcode))
	{
		// Operand slot is not used by this opcode
		return 0;
	}

	const OPDEST d0 { .HEX = instruction->bytecode[0] };
	const u32 written_lanes = d0.no_dest ? 0 : d0.write_mask;

	// Looks up the mask for the current operand in a per-operand table; missing entries read nothing.
	const auto per_operand = [operand](const rsx::simple_array<u32>& table) -> u32
	{
		if (operand < table.size())
		{
			return table[operand];
		}
		return 0u;
	};

	// Picks the coordinate lanes according to the dimensionality of the sampled texture.
	const auto texcoord_lanes = [&](u32 lanes_1d, u32 lanes_2d, u32 lanes_3d) -> u32
	{
		switch (prog.get_texture_dimension(d0.tex_num))
		{
		case rsx::texture_dimension_extended::texture_dimension_1d:
			return lanes_1d;
		case rsx::texture_dimension_extended::texture_dimension_2d:
			return lanes_2d;
		case rsx::texture_dimension_extended::texture_dimension_3d:
		case rsx::texture_dimension_extended::texture_dimension_cubemap:
			return lanes_3d;
		default:
			return 0;
		}
	};

	switch (opcode)
	{
	// Nothing is read from the register file
	case RSX_FP_OPCODE_NOP:
	case RSX_FP_OPCODE_KIL:
		return 0;

	// Component-wise operations: each written lane reads the matching source lane
	case RSX_FP_OPCODE_MOV:
	case RSX_FP_OPCODE_MUL:
	case RSX_FP_OPCODE_ADD:
	case RSX_FP_OPCODE_MAD:
	case RSX_FP_OPCODE_MIN:
	case RSX_FP_OPCODE_MAX:
	case RSX_FP_OPCODE_SLT:
	case RSX_FP_OPCODE_SGE:
	case RSX_FP_OPCODE_SLE:
	case RSX_FP_OPCODE_SGT:
	case RSX_FP_OPCODE_SNE:
	case RSX_FP_OPCODE_SEQ:
	case RSX_FP_OPCODE_FRC:
	case RSX_FP_OPCODE_FLR:
	case RSX_FP_OPCODE_DDX:
	case RSX_FP_OPCODE_DDY:
	case RSX_FP_OPCODE_LRP:
	case RSX_FP_OPCODE_STR:
	case RSX_FP_OPCODE_SFL:
		return xyzw & written_lanes;

	// Fixed single-lane reads
	case RSX_FP_OPCODE_UP4:
	case RSX_FP_OPCODE_RCP:
	case RSX_FP_OPCODE_RSQ:
	case RSX_FP_OPCODE_EX2:
	case RSX_FP_OPCODE_LG2:
	case RSX_FP_OPCODE_COS:
	case RSX_FP_OPCODE_SIN:
	case RSX_FP_OPCODE_UP2:
	case RSX_FP_OPCODE_UPB:
	case RSX_FP_OPCODE_UP16:
	case RSX_FP_OPCODE_UPG:
		return x;

	// Fixed two-lane reads
	case RSX_FP_OPCODE_PK2:
	case RSX_FP_OPCODE_PK16:
	case RSX_FP_OPCODE_DP2:
		return xy;

	// Fixed three-lane reads
	case RSX_FP_OPCODE_DP3:
	case RSX_FP_OPCODE_NRM:
		return xyz;

	// Fixed full-vector reads
	case RSX_FP_OPCODE_DP4:
	case RSX_FP_OPCODE_PK4:
	case RSX_FP_OPCODE_LIT:
	case RSX_FP_OPCODE_PKB:
	case RSX_FP_OPCODE_PKG:
	case RSX_FP_OPCODE_REFL:
		return xyzw;

	// Reads that differ per operand
	case RSX_FP_OPCODE_DST:
		return per_operand({ y | z, y | w });
	case RSX_FP_OPCODE_DP2A:
		return per_operand({ xy, xy, x });
	case RSX_FP_OPCODE_TXL:
	case RSX_FP_OPCODE_TXB:
		return per_operand({ xy, x });
	case RSX_FP_OPCODE_DIV:
	case RSX_FP_OPCODE_DIVSQ:
		return per_operand({ xyzw, x });
	case RSX_FP_OPCODE_LIF:
		return per_operand({ y | w });

	// Texture fetches: lane count follows the texture dimension
	case RSX_FP_OPCODE_TEX:
	case RSX_FP_OPCODE_TXD:
		return texcoord_lanes(x, xy, xyz);
	case RSX_FP_OPCODE_TXP:
		// The projective variant reads one lane more than TEX for every dimension
		return texcoord_lanes(xy, xyz, xyzw);

	case RSX_FP_OPCODE_FENCT:
	case RSX_FP_OPCODE_FENCB:
	case RSX_FP_OPCODE_BRK:
	case RSX_FP_OPCODE_CAL:
	case RSX_FP_OPCODE_IFE:
	case RSX_FP_OPCODE_LOOP:
	case RSX_FP_OPCODE_REP:
	case RSX_FP_OPCODE_RET:
		// Flow control. Special registers are provided for these outside the common file
		return 0;

	case RSX_FP_OPCODE_POW:
		fmt::throw_exception("Unimplemented POW instruction."); // Unused ??
	case RSX_FP_OPCODE_BEM:
	case RSX_FP_OPCODE_TEXBEM:
	case RSX_FP_OPCODE_TXPBEM:
	case RSX_FP_OPCODE_BEMLUM:
		fmt::throw_exception("Unimplemented BEM class instruction"); // Unused
	case RSX_FP_OPCODE_TIMESWTEX:
		fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused
	default:
		break;
	}

	return 0;
}
// Resolved vector lane mask with swizzles applied.
// Takes the hardware lane mask from get_src_vector_lane_mask and maps every consumed lane through the
// operand's swizzle selector, yielding the set of register channels (bit 0 = x ... bit 3 = w) actually read.
// Returns 0 when no lane is consumed or when the operand is not a temp register.
u32 get_src_vector_lane_mask_shuffled(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand)
{
	// One bit per lane. These must match the layout used by get_src_vector_lane_mask.
	constexpr u32 x = 0b0001;
	constexpr u32 y = 0b0010;
	constexpr u32 z = 0b0100;
	constexpr u32 w = 0b1000;

	const u32 lane_mask = get_src_vector_lane_mask(prog, instruction, operand);
	if (!lane_mask)
	{
		return lane_mask;
	}

	// Source words follow the destination word in the bytecode
	SRC_Common src { .HEX = instruction->bytecode[operand + 1] };
	if (src.reg_type != RSX_FP_REGISTER_TYPE_TEMP)
	{
		return 0;
	}

	// Each swizzle selector is a 2-bit channel index (0 = x ... 3 = w),
	// so the channel fetched for a consumed lane is simply bit (1 << selector).
	u32 result = 0;
	if (lane_mask & x) result |= (1u << src.swizzle_x);
	if (lane_mask & y) result |= (1u << src.swizzle_y);
	if (lane_mask & z) result |= (1u << src.swizzle_z);
	if (lane_mask & w) result |= (1u << src.swizzle_w);
	return result;
}
bool is_delay_slot(const Instruction* instruction)
{
OPDEST dst { .HEX = instruction->bytecode[0] };
SRC0 src0 { .HEX = instruction->bytecode[1] };
SRC1 src1{ .HEX = instruction->bytecode[2] };
if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV
dst.no_dest || // Must have a sink
src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg
dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self
dst.fp16 || // Always full lane. We need to collect more data on this but it won't matter
dst.saturate || // Precision modifier
(dst.prec != RSX_FP_PRECISION_REAL &&
dst.prec != RSX_FP_PRECISION_UNKNOWN)) // Cannot have precision modifiers
{
return false;
}
// Check if we have precision modifiers on the source
if (src0.abs || src0.neg || src1.scale)
{
return false;
}
if (dst.mask_x && src0.swizzle_x != 0) return false;
if (dst.mask_y && src0.swizzle_y != 1) return false;
if (dst.mask_z && src0.swizzle_z != 2) return false;
if (dst.mask_w && src0.swizzle_w != 3) return false;
return true;
}
// Builds a register reference for source operand `operand`.
// The result is empty (zero mask) when the operand is not a temp register or when no channel of it is read.
RegisterRef get_src_register(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand)
{
	const SRC_Common operand_word{ .HEX = instruction->bytecode[operand + 1] };

	// Only temp registers live in the register file
	const bool is_temp = (operand_word.reg_type == RSX_FP_REGISTER_TYPE_TEMP);
	if (is_temp)
	{
		const u32 channels = get_src_vector_lane_mask_shuffled(prog, instruction, operand);
		if (channels != 0)
		{
			RegisterRef out{ .mask = channels };
			out.reg.f16 = !!operand_word.fp16;
			out.reg.id = operand_word.tmp_reg_index;
			return out;
		}
	}

	return {};
}
// Builds a register reference for the instruction's destination.
// The result is empty (zero mask) when the instruction has no destination register.
RegisterRef get_dst_register(const Instruction* instruction)
{
	const OPDEST dest { .HEX = instruction->bytecode[0] };

	if (!dest.no_dest)
	{
		// The write mask doubles as the vector lane mask of the reference
		RegisterRef out{ .mask = dest.write_mask };
		out.reg.f16 = dest.fp16;
		out.reg.id = dest.dest_reg;
		return out;
	}

	return {};
}
// Expands a register reference into the list of register file indices its selected lanes occupy.
// A half register lane covers 2 consecutive entries, a full register lane covers 4.
// Returns an empty array when no lane is selected.
rsx::simple_array<u32> get_register_file_range(const RegisterRef& reg)
{
	if (reg.mask == 0)
	{
		return {};
	}

	constexpr u32 register_file_max_len = 48 * 8; // H0 - H47, R0 - R23

	const u32 lane_width = reg.reg.f16 ? 2 : 4;
	const u32 file_offset = reg.reg.id * lane_width * 4;
	ensure(file_offset < register_file_max_len, "Invalid register index");

	rsx::simple_array<u32> result{};

	// Lane selection flags in x, y, z, w order
	const bool selected[4] = { reg.x, reg.y, reg.z, reg.w };
	for (u32 lane = 0; lane < 4; ++lane)
	{
		if (!selected[lane])
		{
			continue;
		}

		// First entry of this lane inside the register
		const u32 lane_base = file_offset + (lane * lane_width);
		for (u32 i = 0; i < lane_width; ++i)
		{
			result.push_back(lane_base + i);
		}
	}

	return result;
}
}

View File

@ -0,0 +1,106 @@
#pragma once
#include "IR.h"
#include "Emu/RSX/Common/simple_array.hpp"
struct RSXFragmentProgram;
namespace rsx::assembler
{
// Fragment program opcode values as they appear in the instruction's opcode field.
// The numbering is not contiguous: no enumerator is defined for 0x30, 0x32 or 0x3F.
// Kept as an unscoped enum so the enumerators can be re-exported with `using enum`.
enum FP_opcode
{
RSX_FP_OPCODE_NOP = 0x00, // No-Operation
RSX_FP_OPCODE_MOV = 0x01, // Move
RSX_FP_OPCODE_MUL = 0x02, // Multiply
RSX_FP_OPCODE_ADD = 0x03, // Add
RSX_FP_OPCODE_MAD = 0x04, // Multiply-Add
RSX_FP_OPCODE_DP3 = 0x05, // 3-component Dot Product
RSX_FP_OPCODE_DP4 = 0x06, // 4-component Dot Product
RSX_FP_OPCODE_DST = 0x07, // Distance
RSX_FP_OPCODE_MIN = 0x08, // Minimum
RSX_FP_OPCODE_MAX = 0x09, // Maximum
RSX_FP_OPCODE_SLT = 0x0A, // Set-If-LessThan
RSX_FP_OPCODE_SGE = 0x0B, // Set-If-GreaterEqual
RSX_FP_OPCODE_SLE = 0x0C, // Set-If-LessEqual
RSX_FP_OPCODE_SGT = 0x0D, // Set-If-GreaterThan
RSX_FP_OPCODE_SNE = 0x0E, // Set-If-NotEqual
RSX_FP_OPCODE_SEQ = 0x0F, // Set-If-Equal
RSX_FP_OPCODE_FRC = 0x10, // Fraction (fract)
RSX_FP_OPCODE_FLR = 0x11, // Floor
RSX_FP_OPCODE_KIL = 0x12, // Kill fragment
RSX_FP_OPCODE_PK4 = 0x13, // Pack four signed 8-bit values
RSX_FP_OPCODE_UP4 = 0x14, // Unpack four signed 8-bit values
RSX_FP_OPCODE_DDX = 0x15, // Partial-derivative in x (Screen space derivative w.r.t. x)
RSX_FP_OPCODE_DDY = 0x16, // Partial-derivative in y (Screen space derivative w.r.t. y)
RSX_FP_OPCODE_TEX = 0x17, // Texture lookup
RSX_FP_OPCODE_TXP = 0x18, // Texture sample with projection (Projective texture lookup)
RSX_FP_OPCODE_TXD = 0x19, // Texture sample with partial differentiation (Texture lookup with derivatives)
RSX_FP_OPCODE_RCP = 0x1A, // Reciprocal
RSX_FP_OPCODE_RSQ = 0x1B, // Reciprocal Square Root
RSX_FP_OPCODE_EX2 = 0x1C, // Exponentiation base 2
RSX_FP_OPCODE_LG2 = 0x1D, // Log base 2
RSX_FP_OPCODE_LIT = 0x1E, // Lighting coefficients
RSX_FP_OPCODE_LRP = 0x1F, // Linear Interpolation
RSX_FP_OPCODE_STR = 0x20, // Set-If-True
RSX_FP_OPCODE_SFL = 0x21, // Set-If-False
RSX_FP_OPCODE_COS = 0x22, // Cosine
RSX_FP_OPCODE_SIN = 0x23, // Sine
RSX_FP_OPCODE_PK2 = 0x24, // Pack two 16-bit floats
RSX_FP_OPCODE_UP2 = 0x25, // Unpack two 16-bit floats
RSX_FP_OPCODE_POW = 0x26, // Power
RSX_FP_OPCODE_PKB = 0x27, // Pack bytes
RSX_FP_OPCODE_UPB = 0x28, // Unpack bytes
RSX_FP_OPCODE_PK16 = 0x29, // Pack 16 bits
RSX_FP_OPCODE_UP16 = 0x2A, // Unpack 16
RSX_FP_OPCODE_BEM = 0x2B, // Bump-environment map (a.k.a. 2D coordinate transform)
RSX_FP_OPCODE_PKG = 0x2C, // Pack with sRGB transformation
RSX_FP_OPCODE_UPG = 0x2D, // Unpack gamma
RSX_FP_OPCODE_DP2A = 0x2E, // 2-component dot product with scalar addition
RSX_FP_OPCODE_TXL = 0x2F, // Texture sample with explicit LOD
RSX_FP_OPCODE_TXB = 0x31, // Texture sample with bias
RSX_FP_OPCODE_TEXBEM = 0x33, // BEM-class instruction; handled as unimplemented alongside BEM in FPOpcodes.cpp
RSX_FP_OPCODE_TXPBEM = 0x34, // BEM-class instruction; handled as unimplemented alongside BEM in FPOpcodes.cpp
RSX_FP_OPCODE_BEMLUM = 0x35, // BEM-class instruction; handled as unimplemented alongside BEM in FPOpcodes.cpp
RSX_FP_OPCODE_REFL = 0x36, // Reflection vector
RSX_FP_OPCODE_TIMESWTEX = 0x37, // Handled as unimplemented in FPOpcodes.cpp; exact semantics not documented here
RSX_FP_OPCODE_DP2 = 0x38, // 2-component dot product
RSX_FP_OPCODE_NRM = 0x39, // Normalize
RSX_FP_OPCODE_DIV = 0x3A, // Division
RSX_FP_OPCODE_DIVSQ = 0x3B, // Divide by Square Root
RSX_FP_OPCODE_LIF = 0x3C, // Final part of LIT
RSX_FP_OPCODE_FENCT = 0x3D, // Fence T?
RSX_FP_OPCODE_FENCB = 0x3E, // Fence B?
RSX_FP_OPCODE_BRK = 0x40, // Break
RSX_FP_OPCODE_CAL = 0x41, // Subroutine call
RSX_FP_OPCODE_IFE = 0x42, // If
RSX_FP_OPCODE_LOOP = 0x43, // Loop
RSX_FP_OPCODE_REP = 0x44, // Repeat
RSX_FP_OPCODE_RET = 0x45 // Return
};
// Fragment program helpers that decode instructions into register references.
namespace FP
{
// Returns number of operands consumed by an instruction.
// Opcodes handled as unimplemented raise through fmt::throw_exception instead of returning.
u8 get_operand_count(FP_opcode opcode);
// Returns a lane mask for the given operand (bit 0 = x ... bit 3 = w).
// The lane mask is the fixed function hardware lane so swizzles need to be applied on top to resolve the real data channel.
// Returns 0 when the operand index is not consumed by the instruction's opcode.
u32 get_src_vector_lane_mask(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand);
// Resolved vector lane mask with swizzles applied.
// Returns 0 when no lane is consumed or when the operand is not a temp register.
u32 get_src_vector_lane_mask_shuffled(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand);
// Returns true on delay slot instructions.
// These are detected as a MOV of a temp register onto itself with no modifiers and an identity swizzle on the written lanes.
bool is_delay_slot(const Instruction* instruction);
// Generate register references.
// Both return an empty reference (zero mask) when there is nothing to reference:
// the source is not a temp register or reads no lanes, or the instruction has no destination.
RegisterRef get_src_register(const RSXFragmentProgram& prog, const Instruction* instruction, u32 operand);
RegisterRef get_dst_register(const Instruction* instruction);
// Convert vector mask to file ranges.
// The result lists the indices into the 48 * 8 entry register file covered by the selected lanes.
rsx::simple_array<u32> get_register_file_range(const RegisterRef& reg);
// Compile a register file annotated blob to register references.
// Half (16-bit) register references are emitted first, followed by full (32-bit) register references.
std::vector<RegisterRef> compile_register_file(const std::array<char, 48 * 8>& file);
}
}

View File

@ -126,6 +126,7 @@ namespace rsx::assembler
std::memcpy(ir_inst.bytecode, &decoded._u32[0], 16); std::memcpy(ir_inst.bytecode, &decoded._u32[0], 16);
ir_inst.length = 4; ir_inst.length = 4;
ir_inst.addr = pc * 16; ir_inst.addr = pc * 16;
ir_inst.opcode = opcode;
switch (opcode) switch (opcode)
{ {

View File

@ -19,7 +19,7 @@ namespace rsx::assembler
// Vector information // Vector information
union union
{ {
u32 mask; u32 mask = 0;
struct struct
{ {
@ -29,6 +29,11 @@ namespace rsx::assembler
bool w : 1; bool w : 1;
}; };
}; };
operator bool() const
{
return !!mask;
}
}; };
struct Instruction struct Instruction
@ -78,6 +83,9 @@ namespace rsx::assembler
std::vector<Instruction> prologue; // Prologue, created by passes std::vector<Instruction> prologue; // Prologue, created by passes
std::vector<Instruction> epilogue; // Epilogue, created by passes std::vector<Instruction> epilogue; // Epilogue, created by passes
std::vector<RegisterRef> input_list; // Register inputs.
std::vector<RegisterRef> clobber_list; // Clobbered outputs
FlowEdge* insert_succ(BasicBlock* b, EdgeType type = EdgeType::NONE) FlowEdge* insert_succ(BasicBlock* b, EdgeType type = EdgeType::NONE)
{ {
FlowEdge e{ .type = type, .from = this, .to = b }; FlowEdge e{ .type = type, .from = this, .to = b };

View File

@ -0,0 +1,181 @@
#include "stdafx.h"
#include "RegisterAnnotationPass.h"
#include "Emu/RSX/Program/Assembler/FPOpcodes.h"
#include <span>
#include <unordered_map>
namespace rsx::assembler::FP
{
// Number of entries in the flattened register file used by this pass.
static constexpr u32 register_file_length = 48 * 8; // 24 F32 or 48 F16 registers
// Per-entry tags stored in the register file arrays built by annotate_block_io and decoded by compile_register_file.
static constexpr char content_unknown = 0; // Entry has not been tagged
static constexpr char content_float32 = 'F'; // Entry is matched as part of a 32-bit register lane
static constexpr char content_float16 = 'H'; // Entry is matched as part of a 16-bit register lane
static constexpr char content_dual = 'D'; // Entry matches both widths; written when two different tags collide on one entry
std::vector<RegisterRef> compile_register_file(const std::array<char, 48 * 8>& file)
{
std::vector<RegisterRef> results;
// F16 register processing
for (int reg16 = 0; reg16 < 48; ++reg16)
{
const u32 offset = reg16 * 8;
auto word = *reinterpret_cast<const u64*>(&file[offset]);
if (!word) [[ likely ]]
{
// Trivial rejection, very commonly hit.
continue;
}
RegisterRef ref{ .reg {.id = reg16, .f16 = true } };
ref.x = (file[offset] == content_dual || file[offset] == content_float16);
ref.y = (file[offset + 2] == content_dual || file[offset + 2] == content_float16);
ref.z = (file[offset + 4] == content_dual || file[offset + 4] == content_float16);
ref.w = (file[offset + 6] == content_dual || file[offset + 6] == content_float16);
if (ref)
{
results.push_back(ref);
}
}
// Helper to check a span for 32-bit access
auto match_any_32 = [](const std::span<const char> lanes)
{
for (const auto& c : lanes)
{
if (c == content_dual || c == content_float32)
{
return true;
}
}
return false;
};
// F32 register processing
for (int reg32 = 0; reg32 < 24; ++reg32)
{
const u32 offset = reg32 * 16;
auto word0 = *reinterpret_cast<const u64*>(&file[offset]);
auto word1 = *reinterpret_cast<const u64*>(&file[offset + 8]);
if (!word0 && !word1) [[ likely ]]
{
// Trivial rejection, very commonly hit.
continue;
}
RegisterRef ref{ .reg {.id = reg32, .f16 = false } };
if (word0)
{
ref.x = match_any_32({ &file[offset], 4 });
ref.y = match_any_32({ &file[offset + 4], 4 });
}
if (word1)
{
ref.z = match_any_32({ &file[offset + 8], 4 });
ref.w = match_any_32({ &file[offset + 12], 4 });
}
if (ref)
{
results.push_back(ref);
}
}
return results;
}
// Decay instructions into register references.
// For every instruction in the block, appends one entry to `srcs` per operand that reads a temp register
// and one entry to `dsts` when the instruction writes a destination register.
void annotate_instructions(BasicBlock* block, const RSXFragmentProgram& prog)
{
	for (auto& instruction : block->instructions)
	{
		const u32 operand_count = get_operand_count(static_cast<FP_opcode>(instruction.opcode));
		for (u32 i = 0; i < operand_count; i++)
		{
			const RegisterRef reg = get_src_register(prog, &instruction, i);
			if (!reg.mask)
			{
				// get_src_register returns an empty reference for operands that are not temp registers
				// and for operands with no lanes read. Those are valid instructions, not errors.
				continue;
			}

			instruction.srcs.push_back(reg);
		}

		RegisterRef dst = get_dst_register(&instruction);
		if (dst)
		{
			instruction.dsts.push_back(dst);
		}
	}
}
// Annotate each block with input and output lanes (read and clobber list).
// Walks the block's instructions in order, using the `srcs` and `dsts` references already attached to them:
//  - a read of an entry that nothing in this block has written yet is recorded as a block input;
//  - every written entry is recorded as clobbered.
// Entries are tagged with the access width (half or full register). An input entry read with both widths is tagged dual.
void annotate_block_io(BasicBlock* block)
{
	alignas(16) std::array<char, register_file_length> output_register_file;
	alignas(16) std::array<char, register_file_length> input_register_file; // We'll eventually replace with a bitfield mask, but for ease of debugging, we use char for now

	std::memset(output_register_file.data(), content_unknown, register_file_length);
	std::memset(input_register_file.data(), content_unknown, register_file_length);

	// Tag matching the width the reference accesses its register with
	const auto width_tag = [](const RegisterRef& ref) -> char
	{
		return ref.reg.f16 ? content_float16 : content_float32;
	};

	for (const auto& instruction : block->instructions)
	{
		for (const auto& src : instruction.srcs)
		{
			const auto read_bytes = get_register_file_range(src);
			const char expected_type = width_tag(src);

			for (const auto& index : read_bytes)
			{
				if (output_register_file[index] != content_unknown)
				{
					// Something already wrote to this lane
					continue;
				}

				if (input_register_file[index] == expected_type)
				{
					// We already know about this input
					continue;
				}

				if (input_register_file[index] == content_unknown)
				{
					// Not known, tag as input
					input_register_file[index] = expected_type;
					continue;
				}

				// Collision on the lane
				input_register_file[index] = content_dual;
			}
		}

		if (!instruction.dsts.empty())
		{
			const auto& dst = instruction.dsts.front();
			const auto write_bytes = get_register_file_range(dst);
			const char expected_type = width_tag(dst);

			for (const auto& index : write_bytes)
			{
				output_register_file[index] = expected_type;
			}
		}
	}

	// Compile the input and output refs into register references
	block->clobber_list = compile_register_file(output_register_file);
	block->input_list = compile_register_file(input_register_file);
}
// Runs the pass over every block of the graph.
// Instructions are decoded into register references first, because the block-level
// input/clobber lists are derived from those references.
void RegisterAnnotationPass::run(FlowGraph& graph)
{
	for (auto& entry : graph.blocks)
	{
		auto* const current = &entry;

		annotate_instructions(current, m_prog);
		annotate_block_io(current);
	}
}
}

View File

@ -1,16 +1,26 @@
#pragma once #pragma once
#include "../CFG.h" #include "../../CFG.h"
namespace rsx::assembler struct RSXFragmentProgram;
namespace rsx::assembler::FP
{ {
// The annotation pass annotates each basic block with 2 pieces of information: // The annotation pass annotates each basic block with 2 pieces of information:
// 1. The "input" register list for a block. // 1. The "input" register list for a block.
// 2. The "output" register list for a block (clobber list). // 2. The "output" register list for a block (clobber list).
// The information can be used by other passes to set up prologue/epilogue on each block. // The information can be used by other passes to set up prologue/epilogue on each block.
// The pass also populates register reference members of each instruction, such as the input and output lanes.
class RegisterAnnotationPass : public CFGPass class RegisterAnnotationPass : public CFGPass
{ {
public: public:
RegisterAnnotationPass(RSXFragmentProgram& prog)
: m_prog(prog)
{}
void run(FlowGraph& graph) override; void run(FlowGraph& graph) override;
private:
const RSXFragmentProgram& m_prog;
}; };
} }

View File

@ -1,7 +1,7 @@
#include "stdafx.h" #include "stdafx.h"
#include "RegisterDependencyPass.h" #include "RegisterDependencyPass.h"
namespace rsx::assembler namespace rsx::assembler::FP
{ {
void RegisterDependencyPass::run(FlowGraph& graph) void RegisterDependencyPass::run(FlowGraph& graph)
{ {

View File

@ -1,8 +1,8 @@
#pragma once #pragma once
#include "../CFG.h" #include "../../CFG.h"
namespace rsx::assembler namespace rsx::assembler::FP
{ {
// The register dependency pass identifies data hazards for each basic block and injects barrier instructions. // The register dependency pass identifies data hazards for each basic block and injects barrier instructions.
// Real PS3 does not have explicit barriers, but does instead often use delay slots or fence instructions to stall until a specific hardware unit clears the fence to advance. // Real PS3 does not have explicit barriers, but does instead often use delay slots or fence instructions to stall until a specific hardware unit clears the fence to advance.

View File

@ -1,10 +0,0 @@
#include "stdafx.h"
#include "RegisterAnnotationPass.h"
namespace rsx::assembler
{
void RegisterAnnotationPass::run(FlowGraph& graph)
{
// TODO
}
}

View File

@ -1,6 +1,7 @@
#pragma once #pragma once
#include "program_util.h" #include "program_util.h"
#include "Assembler/FPOpcodes.h"
#include <string> #include <string>
#include <vector> #include <vector>
@ -23,76 +24,7 @@ enum register_precision
RSX_FP_PRECISION_UNKNOWN = 5 // Unknown what this actually does; seems to do nothing on hwtests but then why would their compiler emit it? RSX_FP_PRECISION_UNKNOWN = 5 // Unknown what this actually does; seems to do nothing on hwtests but then why would their compiler emit it?
}; };
enum fp_opcode using enum rsx::assembler::FP_opcode;
{
RSX_FP_OPCODE_NOP = 0x00, // No-Operation
RSX_FP_OPCODE_MOV = 0x01, // Move
RSX_FP_OPCODE_MUL = 0x02, // Multiply
RSX_FP_OPCODE_ADD = 0x03, // Add
RSX_FP_OPCODE_MAD = 0x04, // Multiply-Add
RSX_FP_OPCODE_DP3 = 0x05, // 3-component Dot Product
RSX_FP_OPCODE_DP4 = 0x06, // 4-component Dot Product
RSX_FP_OPCODE_DST = 0x07, // Distance
RSX_FP_OPCODE_MIN = 0x08, // Minimum
RSX_FP_OPCODE_MAX = 0x09, // Maximum
RSX_FP_OPCODE_SLT = 0x0A, // Set-If-LessThan
RSX_FP_OPCODE_SGE = 0x0B, // Set-If-GreaterEqual
RSX_FP_OPCODE_SLE = 0x0C, // Set-If-LessEqual
RSX_FP_OPCODE_SGT = 0x0D, // Set-If-GreaterThan
RSX_FP_OPCODE_SNE = 0x0E, // Set-If-NotEqual
RSX_FP_OPCODE_SEQ = 0x0F, // Set-If-Equal
RSX_FP_OPCODE_FRC = 0x10, // Fraction (fract)
RSX_FP_OPCODE_FLR = 0x11, // Floor
RSX_FP_OPCODE_KIL = 0x12, // Kill fragment
RSX_FP_OPCODE_PK4 = 0x13, // Pack four signed 8-bit values
RSX_FP_OPCODE_UP4 = 0x14, // Unpack four signed 8-bit values
RSX_FP_OPCODE_DDX = 0x15, // Partial-derivative in x (Screen space derivative w.r.t. x)
RSX_FP_OPCODE_DDY = 0x16, // Partial-derivative in y (Screen space derivative w.r.t. y)
RSX_FP_OPCODE_TEX = 0x17, // Texture lookup
RSX_FP_OPCODE_TXP = 0x18, // Texture sample with projection (Projective texture lookup)
RSX_FP_OPCODE_TXD = 0x19, // Texture sample with partial differentiation (Texture lookup with derivatives)
RSX_FP_OPCODE_RCP = 0x1A, // Reciprocal
RSX_FP_OPCODE_RSQ = 0x1B, // Reciprocal Square Root
RSX_FP_OPCODE_EX2 = 0x1C, // Exponentiation base 2
RSX_FP_OPCODE_LG2 = 0x1D, // Log base 2
RSX_FP_OPCODE_LIT = 0x1E, // Lighting coefficients
RSX_FP_OPCODE_LRP = 0x1F, // Linear Interpolation
RSX_FP_OPCODE_STR = 0x20, // Set-If-True
RSX_FP_OPCODE_SFL = 0x21, // Set-If-False
RSX_FP_OPCODE_COS = 0x22, // Cosine
RSX_FP_OPCODE_SIN = 0x23, // Sine
RSX_FP_OPCODE_PK2 = 0x24, // Pack two 16-bit floats
RSX_FP_OPCODE_UP2 = 0x25, // Unpack two 16-bit floats
RSX_FP_OPCODE_POW = 0x26, // Power
RSX_FP_OPCODE_PKB = 0x27, // Pack bytes
RSX_FP_OPCODE_UPB = 0x28, // Unpack bytes
RSX_FP_OPCODE_PK16 = 0x29, // Pack 16 bits
RSX_FP_OPCODE_UP16 = 0x2A, // Unpack 16
RSX_FP_OPCODE_BEM = 0x2B, // Bump-environment map (a.k.a. 2D coordinate transform)
RSX_FP_OPCODE_PKG = 0x2C, // Pack with sRGB transformation
RSX_FP_OPCODE_UPG = 0x2D, // Unpack gamma
RSX_FP_OPCODE_DP2A = 0x2E, // 2-component dot product with scalar addition
RSX_FP_OPCODE_TXL = 0x2F, // Texture sample with explicit LOD
RSX_FP_OPCODE_TXB = 0x31, // Texture sample with bias
RSX_FP_OPCODE_TEXBEM = 0x33,
RSX_FP_OPCODE_TXPBEM = 0x34,
RSX_FP_OPCODE_BEMLUM = 0x35,
RSX_FP_OPCODE_REFL = 0x36, // Reflection vector
RSX_FP_OPCODE_TIMESWTEX = 0x37,
RSX_FP_OPCODE_DP2 = 0x38, // 2-component dot product
RSX_FP_OPCODE_NRM = 0x39, // Normalize
RSX_FP_OPCODE_DIV = 0x3A, // Division
RSX_FP_OPCODE_DIVSQ = 0x3B, // Divide by Square Root
RSX_FP_OPCODE_LIF = 0x3C, // Final part of LIT
RSX_FP_OPCODE_FENCT = 0x3D, // Fence T?
RSX_FP_OPCODE_FENCB = 0x3E, // Fence B?
RSX_FP_OPCODE_BRK = 0x40, // Break
RSX_FP_OPCODE_CAL = 0x41, // Subroutine call
RSX_FP_OPCODE_IFE = 0x42, // If
RSX_FP_OPCODE_LOOP = 0x43, // Loop
RSX_FP_OPCODE_REP = 0x44, // Repeat
RSX_FP_OPCODE_RET = 0x45 // Return
};
union OPDEST union OPDEST
{ {
@ -116,6 +48,12 @@ union OPDEST
u32 no_dest : 1; u32 no_dest : 1;
u32 saturate : 1; // _sat u32 saturate : 1; // _sat
}; };
struct
{
u32 : 9;
u32 write_mask : 4;
};
}; };
union SRC0 union SRC0
@ -207,6 +145,23 @@ union SRC2
}; };
}; };
union SRC_Common
{
u32 HEX;
struct
{
u32 reg_type : 2;
u32 tmp_reg_index : 6;
u32 fp16 : 1;
u32 swizzle_x : 2;
u32 swizzle_y : 2;
u32 swizzle_z : 2;
u32 swizzle_w : 2;
u32 neg : 1;
};
};
constexpr const char* rsx_fp_input_attr_regs[] = constexpr const char* rsx_fp_input_attr_regs[] =
{ {
"WPOS", "COL0", "COL1", "FOGC", "TEX0", "WPOS", "COL0", "COL1", "FOGC", "TEX0",

View File

@ -156,9 +156,10 @@
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog.cpp" /> <ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog.cpp" />
<ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog_native.cpp" /> <ClCompile Include="Emu\RSX\Overlays\Shaders\shader_loading_dialog_native.cpp" />
<ClCompile Include="Emu\RSX\Overlays\Trophies\overlay_trophy_list_dialog.cpp" /> <ClCompile Include="Emu\RSX\Overlays\Trophies\overlay_trophy_list_dialog.cpp" />
<ClCompile Include="Emu\RSX\Program\Assembler\FPOpcodes.cpp" />
<ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp" /> <ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp" />
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\RegisterAnnotationPass.cpp" /> <ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.cpp" />
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\RegisterDependencyPass.cpp" /> <ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.cpp" />
<ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp" /> <ClCompile Include="Emu\RSX\Program\FragmentProgramRegister.cpp" />
<ClCompile Include="Emu\RSX\Program\ProgramStateCache.cpp" /> <ClCompile Include="Emu\RSX\Program\ProgramStateCache.cpp" />
<ClCompile Include="Emu\RSX\Program\program_util.cpp" /> <ClCompile Include="Emu\RSX\Program\program_util.cpp" />
@ -703,9 +704,10 @@
<ClInclude Include="Emu\RSX\Overlays\overlay_video.h" /> <ClInclude Include="Emu\RSX\Overlays\overlay_video.h" />
<ClInclude Include="Emu\RSX\Overlays\Trophies\overlay_trophy_list_dialog.h" /> <ClInclude Include="Emu\RSX\Overlays\Trophies\overlay_trophy_list_dialog.h" />
<ClInclude Include="Emu\RSX\Program\Assembler\CFG.h" /> <ClInclude Include="Emu\RSX\Program\Assembler\CFG.h" />
<ClInclude Include="Emu\RSX\Program\Assembler\FPOpcodes.h" />
<ClInclude Include="Emu\RSX\Program\Assembler\IR.h" /> <ClInclude Include="Emu\RSX\Program\Assembler\IR.h" />
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\RegisterAnnotationPass.h" /> <ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.h" />
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\RegisterDependencyPass.h" /> <ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.h" />
<ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h" /> <ClInclude Include="Emu\RSX\Program\FragmentProgramRegister.h" />
<ClInclude Include="Emu\RSX\Program\GLSLTypes.h" /> <ClInclude Include="Emu\RSX\Program\GLSLTypes.h" />
<ClInclude Include="Emu\RSX\Program\ProgramStateCache.h" /> <ClInclude Include="Emu\RSX\Program\ProgramStateCache.h" />

View File

@ -139,6 +139,9 @@
<Filter Include="Emu\GPU\RSX\Program\Assembler\Passes"> <Filter Include="Emu\GPU\RSX\Program\Assembler\Passes">
<UniqueIdentifier>{d13db076-47e4-45b9-bb8a-6b711ea40622}</UniqueIdentifier> <UniqueIdentifier>{d13db076-47e4-45b9-bb8a-6b711ea40622}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Emu\GPU\RSX\Program\Assembler\Passes\FP">
<UniqueIdentifier>{7fb59544-9761-4b4a-bb04-07deb43cf3c2}</UniqueIdentifier>
</Filter>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClCompile Include="Crypto\aes.cpp"> <ClCompile Include="Crypto\aes.cpp">
@ -1381,11 +1384,14 @@
<ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp"> <ClCompile Include="Emu\RSX\Program\Assembler\FPToCFG.cpp">
<Filter>Emu\GPU\RSX\Program\Assembler</Filter> <Filter>Emu\GPU\RSX\Program\Assembler</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\RegisterAnnotationPass.cpp"> <ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.cpp">
<Filter>Emu\GPU\RSX\Program\Assembler\Passes</Filter> <Filter>Emu\GPU\RSX\Program\Assembler\Passes\FP</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="Emu\RSX\Program\Assembler\Passes\RegisterDependencyPass.cpp"> <ClCompile Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.cpp">
<Filter>Emu\GPU\RSX\Program\Assembler\Passes</Filter> <Filter>Emu\GPU\RSX\Program\Assembler\Passes\FP</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\Program\Assembler\FPOpcodes.cpp">
<Filter>Emu\GPU\RSX\Program\Assembler</Filter>
</ClCompile> </ClCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
@ -2785,11 +2791,14 @@
<ClInclude Include="Emu\RSX\Program\Assembler\IR.h"> <ClInclude Include="Emu\RSX\Program\Assembler\IR.h">
<Filter>Emu\GPU\RSX\Program\Assembler</Filter> <Filter>Emu\GPU\RSX\Program\Assembler</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\RegisterAnnotationPass.h"> <ClInclude Include="Emu\RSX\Program\Assembler\FPOpcodes.h">
<Filter>Emu\GPU\RSX\Program\Assembler\Passes</Filter> <Filter>Emu\GPU\RSX\Program\Assembler</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\RegisterDependencyPass.h"> <ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterAnnotationPass.h">
<Filter>Emu\GPU\RSX\Program\Assembler\Passes</Filter> <Filter>Emu\GPU\RSX\Program\Assembler\Passes\FP</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Program\Assembler\Passes\FP\RegisterDependencyPass.h">
<Filter>Emu\GPU\RSX\Program\Assembler\Passes\FP</Filter>
</ClInclude> </ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>