This commit is contained in:
TJnotJT 2025-12-14 20:12:35 +07:00 committed by GitHub
commit e6aef219cc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
32 changed files with 2650 additions and 409 deletions

View File

@ -1,6 +1,9 @@
// SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
// SPDX-License-Identifier: GPL-3.0+
#define ACCURATE_LINES 1
#define ACCURATE_TRIANGLES 2
#define FMT_32 0
#define FMT_24 1
#define FMT_16 2
@ -21,6 +24,11 @@
#define GS_FORWARD_PRIMID 0
#endif
#ifndef ZTST_GEQUAL
#define ZTST_GEQUAL 2
#define ZTST_GREATER 3
#endif
#ifndef PS_FST
#define PS_IIP 0
#define PS_FST 0
@ -84,6 +92,7 @@
#define SW_BLEND_NEEDS_RT (SW_BLEND && (PS_BLEND_A == 1 || PS_BLEND_B == 1 || PS_BLEND_C == 1 || PS_BLEND_D == 1))
#define SW_AD_TO_HW (PS_BLEND_C == 1 && PS_A_MASKED)
#define NEEDS_RT_FOR_AFAIL (PS_AFAIL == 3 && PS_NO_COLOR1)
#define NEEDS_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP)
struct VS_INPUT
{
@ -94,6 +103,9 @@ struct VS_INPUT
uint z : POSITION1;
uint2 uv : TEXCOORD2;
float4 f : COLOR1;
#ifdef VS_ACCURATE_PRIMS
uint vertex_id : SV_VertexID;
#endif
};
struct VS_OUTPUT
@ -107,6 +119,12 @@ struct VS_OUTPUT
#else
nointerpolation float4 c : COLOR0;
#endif
#if VS_ACCURATE_PRIMS
nointerpolation uint accurate_prims_index : TEXCOORD3;
#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
nointerpolation uint accurate_triangles_interior : TEXCOORD4;
#endif
#endif
};
struct PS_INPUT
@ -122,6 +140,38 @@ struct PS_INPUT
#if (PS_DATE >= 1 && PS_DATE <= 3) || GS_FORWARD_PRIMID
uint primid : SV_PrimitiveID;
#endif
#if PS_ACCURATE_PRIMS
nointerpolation uint accurate_prims_index : TEXCOORD3;
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
nointerpolation uint accurate_triangles_interior : TEXCOORD4;
#endif
#endif
};
// One element of the accurate_prims_data structured buffer: both endpoints of a
// line (or of one triangle edge) plus the flags the pixel shader uses to
// rasterize and shade that primitive manually.
// The number after each member is its byte offset inside the 208-byte record.
// NOTE(review): the layout presumably has to match whatever the CPU side uploads - confirm before reordering or resizing members.
struct AccuratePrimsEdgeData
{
// Interpolated attributes
float4 t_float0; // 0
float4 t_float1; // 16
float4 t_int0; // 32
float4 t_int1; // 48
float4 c0; // 64
float4 c1; // 80
// Endpoint positions; only .z is read by the manual interpolation.
float4 p0; // 96
float4 p1; // 112
// Coefficients dotted with (x, y, 1, 0) to reject pixels on the wrong side of the other two triangle edges.
int4 edge0; // 128
int4 edge1; // 144
// Endpoint coordinates in 4-bit fixed point (16 subpixels per pixel).
int2 xy0; // 160
int2 xy1; // 168
// Non-zero when X is the major axis, zero when Y is.
uint step_x; // 176
// Lines only: non-zero when the pixel at endpoint 0 / endpoint 1 is drawn.
uint draw0; // 180
uint draw1; // 184
// Triangle edges only: control which pixel next to the edge is kept and its coverage.
uint top_left; // 188
uint side; // 192
uint _pad0; // 196
uint _pad1; // 200
uint _pad2; // 204
// Total 208
};
#ifdef PIXEL_SHADER
@ -147,6 +197,8 @@ Texture2D<float4> Texture : register(t0);
Texture2D<float4> Palette : register(t1);
Texture2D<float4> RtTexture : register(t2);
Texture2D<float> PrimMinTexture : register(t3);
Texture2D<float> DepthTexture : register(t4);
StructuredBuffer<AccuratePrimsEdgeData> accurate_prims_data : register(t5);
SamplerState TextureSampler : register(s0);
#ifdef DX12
@ -172,6 +224,12 @@ cbuffer cb1
float4x4 DitherMatrix;
float ScaledScaleFactor;
float RcpScaleFactor;
uint _pad0;
uint _pad1;
uint accurate_prims_base_index;
uint _pad2;
uint _pad3;
uint _pad4;
};
float4 sample_c(float2 uv, float uv_w, int2 xy)
@ -1015,9 +1073,242 @@ void ps_blend(inout float4 Color, inout float4 As_rgba, float2 pos_xy)
}
}
#if PS_ACCURATE_PRIMS
// Manually interpolates the vertex attributes of a line/edge between its two
// endpoints and stores the result in `input`.
//   data    - edge record holding the attributes of both endpoints.
//   weight0 - integer weight applied to endpoint 0's values.
//   weight1 - integer weight applied to endpoint 1's values.
//   input   - receives t, ti, c and p.z.
void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1, inout PS_INPUT input)
{
	float w0 = float(weight0);
	float w1 = float(weight1);
	float w_sum = float(weight0 + weight1);

	// 1.0 in every component where both endpoints carry the same value, 0.0 elsewhere.
	float4 same_t = float4(data.t_float1 == data.t_float0);
	float4 same_ti = float4(data.t_int1 == data.t_int0);
	float4 same_c = float4(data.c1 == data.c0);

	// Weighted averages of the endpoint values.
	float4 avg_t = (w1 * data.t_float1 + w0 * data.t_float0) / w_sum;
	float4 avg_ti = (w1 * data.t_int1 + w0 * data.t_int0) / w_sum;
	float4 avg_c = (w1 * data.c1 + w0 * data.c0) / w_sum;
	float avg_z = (w1 * data.p1.z + w0 * data.p0.z) / w_sum;

	// Constant attributes are taken from endpoint 1 instead of being interpolated.
	input.t = lerp(avg_t, data.t_float1, same_t);
	input.ti = lerp(avg_ti, data.t_int1, same_ti);

	// Colors are clamped to the 0..255 range.
	input.c = clamp(lerp(avg_c, data.c1, same_c), 0.0f, 255.0f);

	// Fog (t.z) and Z are normalized, so both are clamped to 0..1.
	input.t.z = clamp(input.t.z, 0.0f, 1.0f);

	float z;
	if (data.p1.z == data.p0.z)
		z = data.p1.z;
	else
		z = avg_z;
	input.p.z = clamp(z, 0.0f, 1.0f);
}
#endif
#if PS_ACCURATE_PRIMS == ACCURATE_LINES
// Decides in the pixel shader whether the current pixel belongs to a line and, if
// so, fills in its attributes and coverage.
//   input          - pixel shader input; p.xy selects the pixel, accurate_prims_index
//                    selects the line record. t, ti, c and p.z are overwritten.
//   alpha_coverage - receives the coverage: 0..127 when PS_ACCURATE_PRIMS_AA is set,
//                    otherwise 128.
// Pixels that are not part of the line are discarded.
void HandleAccurateLines(inout PS_INPUT input, out float alpha_coverage)
{
AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + input.accurate_prims_index];
int2 xy0 = data.xy0;
int2 xy1 = data.xy1;
int2 dxy = xy1 - xy0;
// Endpoints with 8 subpixels (half a pixel) added and the 4 fractional bits cleared.
int2 xy0_i = (xy0 + 8) & ~0xF;
int2 xy1_i = (xy1 + 8) & ~0xF;
bool step_x = bool(data.step_x);
bool draw0 = bool(data.draw0);
bool draw1 = bool(data.draw1);
// 4-bit fixed point: 16 subpixels per pixel
int2 xy_i = 16 * int2(floor(input.p.xy)); // Subtract half-integer pixel center.
// Determine major/minor axes
int major0 = step_x ? xy0.x : xy0.y;
int major1 = step_x ? xy1.x : xy1.y;
int minor0 = step_x ? xy0.y : xy0.x;
int minor1 = step_x ? xy1.y : xy1.x;
int major_i = step_x ? xy_i.x : xy_i.y;
int minor_i = step_x ? xy_i.y : xy_i.x;
int d_major = step_x ? dxy.x : dxy.y;
int d_major_scaled = 16 * d_major;
int major0_i = step_x ? xy0_i.x : xy0_i.y;
int major1_i = step_x ? xy1_i.x : xy1_i.y;
// Discard if outside line range
if (major_i < min(major0_i, major1_i) ||
major_i > max(major0_i, major1_i))
discard;
// Discard an endpoint pixel whose draw flag is cleared.
if ((major_i == major0_i && !draw0) ||
(major_i == major1_i && !draw1))
discard;
// Interpolation weights along the major axis; weight0 + weight1 == d_major.
int weight0 = major1 - major_i;
int weight1 = major_i - major0;
// Compute minor axis line in fixed-point
// (this is the interpolated minor coordinate multiplied by d_major).
int minor_line = weight1 * minor1 + weight0 * minor0;
#if PS_ACCURATE_PRIMS_AA
// Proper fixed-point AA rounding
// The line covers two neighbouring pixels on the minor axis: minor_i_expected_0 and the one after it.
int minor_i_expected_0 = (minor_line / d_major) & ~0xF;
int minor_i_expected_1 = minor_i_expected_0 + 16;
// Coverage weights of the two pixels; together they add up to d_major_scaled.
int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
int alpha_i_1 = d_major_scaled - alpha_i_0;
int alpha_i;
if (minor_i == minor_i_expected_0)
alpha_i = alpha_i_0;
else if (minor_i == minor_i_expected_1)
alpha_i = alpha_i_1;
else
{
alpha_i = 0; // Prevent compiler warning.
discard;
}
// Make sure that the output alpha is always <= 127 for AA.
alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
#else
// Non-AA: fixed-point rounding and 4-bit alignment
// Adds half a pixel (d_major_scaled / (2 * d_major) == 8 subpixels) before clearing the fractional bits.
int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF;
if (minor_i != minor_i_expected)
discard;
alpha_coverage = 128.0f;
#endif
// Interpolate attributes
InterpolateAttributesManual(data, weight0, weight1, input);
}
#endif
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
// Decides in the pixel shader whether the current pixel belongs to one triangle
// edge and, if so, fills in its attributes and coverage.
//   input          - pixel shader input; p.xy selects the pixel, accurate_prims_index
//                    selects the edge record. t, ti, c and p.z are overwritten.
//   alpha_coverage - receives the coverage: 0..127 when PS_ACCURATE_PRIMS_AA is set,
//                    otherwise 128.
// Pixels that are not part of the edge are discarded.
void HandleAccurateTrianglesEdge(inout PS_INPUT input, out float alpha_coverage)
{
AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + input.accurate_prims_index];
int2 xy0 = data.xy0;
int2 xy1 = data.xy1;
int2 dxy = xy1 - xy0;
// Endpoints with 8 subpixels (half a pixel) added and the 4 fractional bits cleared.
int2 xy0_i = (xy0 + 8) & ~0xF;
int2 xy1_i = (xy1 + 8) & ~0xF;
bool step_x = bool(data.step_x);
bool side = bool(data.side);
bool top_left = bool(data.top_left);
// 4-bit fixed point: 16 subpixels per pixel
int2 xy_i = 16 * int2(floor(input.p.xy)); // Subtract half-integer pixel center.
// Determine major/minor axes
int major0 = step_x ? xy0.x : xy0.y;
int major1 = step_x ? xy1.x : xy1.y;
int minor0 = step_x ? xy0.y : xy0.x;
int minor1 = step_x ? xy1.y : xy1.x;
int major_i = step_x ? xy_i.x : xy_i.y;
int minor_i = step_x ? xy_i.y : xy_i.x;
int d_major = step_x ? dxy.x : dxy.y;
int d_major_scaled = 16 * d_major;
int major0_i = step_x ? xy0_i.x : xy0_i.y;
int major1_i = step_x ? xy1_i.x : xy1_i.y;
// Discard if outside edge range.
// Note: this is not exactly what the SW rasterizer does.
// See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking.
if (major_i < min(major0_i, major1_i) ||
major_i > max(major0_i, major1_i))
discard;
// Discard if on wrong side of other edges
if (dot(data.edge0, int4(xy_i, 1, 0)) <= 0 ||
dot(data.edge1, int4(xy_i, 1, 0)) <= 0)
discard;
// Interpolation weights along the major axis; weight0 + weight1 == d_major.
int weight0 = major1 - major_i;
int weight1 = major_i - major0;
// Compute minor axis line in fixed-point
// (this is the interpolated minor coordinate multiplied by d_major).
int minor_line = weight1 * minor1 + weight0 * minor0;
int minor_i_expected = minor_line / d_major;
// The two neighbouring pixels on the minor axis that the edge passes between.
int minor_i_expected_0 = minor_i_expected & ~0xF;
int minor_i_expected_1 = minor_i_expected_0 + 16;
// Coverage weights of those two pixels; together they add up to d_major_scaled.
int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
int alpha_i_1 = d_major_scaled - alpha_i_0;
// Proper fixed-point AA rounding
int alpha_i;
if ((minor_i_expected & 0xF) == 0)
{
// On a pixel center
// top_left: zero coverage and the kept pixel moves one pixel (direction chosen by side);
// otherwise: full coverage on the pixel the edge passes through.
alpha_i = top_left ? 0 : d_major_scaled;
minor_i_expected += top_left ? (side ? -16 : 16) : 0;
}
else if (side)
{
// side set: keep the first of the two pixels.
minor_i_expected = minor_i_expected_0;
alpha_i = alpha_i_0;
}
else
{
// side clear: keep the second of the two pixels.
minor_i_expected = minor_i_expected_1;
alpha_i = alpha_i_1;
}
if (minor_i != minor_i_expected)
discard;
#if PS_ACCURATE_PRIMS_AA
// Make sure that the output alpha is always <= 127 for AA.
alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
#else
alpha_coverage = 128.0f;
#endif
// Interpolate attributes
InterpolateAttributesManual(data, weight0, weight1, input);
}
#endif
PS_OUTPUT ps_main(PS_INPUT input)
{
#if PS_ACCURATE_PRIMS
float alpha_coverage;
#if PS_ACCURATE_PRIMS == ACCURATE_LINES
HandleAccurateLines(input, alpha_coverage);
#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
if (bool(input.accurate_triangles_interior))
{
alpha_coverage = 128.0f;
}
else
{
HandleAccurateTrianglesEdge(input, alpha_coverage);
}
#endif
#endif // PS_ACCURATE_PRIMS
#if NEEDS_DEPTH
float current_depth = DepthTexture.Load(int3(floor(input.p.xy), 0)).r;
#endif
#if PS_ZCLAMP && (PS_ZTST == ZTST_GEQUAL || PS_ZTST == ZTST_GREATER)
#if PS_ZTST == ZTST_GEQUAL
if (input.p.z < current_depth)
discard;
#elif PS_ZTST == ZTST_GREATER
if (input.p.z <= current_depth)
discard;
#endif
#endif // PS_ZTST
float4 C = ps_color(input);
#if PS_FIXED_ONE_A
// AA (Fixed one) will output a coverage of 1.0 as alpha
C.a = 128.0f;
#elif PS_ACCURATE_PRIMS_AA
// AA: coverage is computed in alpha_coverage
#if PS_ACCURATE_PRIMS_AA_ABE
if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128.
C.a = alpha_coverage;
#else
C.a = alpha_coverage;
#endif
#endif
bool atst_pass = atst(C);
#if PS_AFAIL == 0 // KEEP or ATST off
@ -1034,14 +1325,6 @@ PS_OUTPUT ps_main(PS_INPUT input)
discard;
}
// Must be done before alpha correction
// AA (Fixed one) will output a coverage of 1.0 as alpha
if (PS_FIXED_ONE_A)
{
C.a = 128.0f;
}
float4 alpha_blend = (float4)0.0f;
if (SW_AD_TO_HW)
{
@ -1210,7 +1493,14 @@ PS_OUTPUT ps_main(PS_INPUT input)
#endif // PS_DATE != 1/2
#if PS_ZCLAMP
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
if (bool(input.accurate_triangles_interior))
output.depth = min(input.p.z, MaxDepthPS);
else
output.depth = current_depth; // No depth update for triangle edges.
#else
output.depth = min(input.p.z, MaxDepthPS);
#endif
#endif
return output;
@ -1236,7 +1526,9 @@ cbuffer cb0
float2 TextureOffset;
float2 PointSize;
uint MaxDepth;
uint BaseVertex; // Only used in DX11.
uint pad_cb0;
uint BaseVertex;
uint pad_cb0_2;
};
VS_OUTPUT vs_main(VS_INPUT input)
@ -1256,6 +1548,28 @@ VS_OUTPUT vs_main(VS_INPUT input)
output.p.xy = output.p.xy * float2(VertexScale.x, -VertexScale.y) - float2(VertexOffset.x, -VertexOffset.y);
output.p.z *= exp2(-32.0f); // integer->float depth
#if VS_ACCURATE_PRIMS == ACCURATE_LINES
output.accurate_prims_index = input.vertex_id / 6;
output.t = 0.0f;
output.ti = 0.0f;
output.c = 0.0f;
return output; // Don't send line vertex attributes - they are interpolated manually in the pixel shader.
#elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
uint prim_id = input.vertex_id / 21;
output.accurate_triangles_interior = uint((input.vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior.
if (!bool(output.accurate_triangles_interior))
{
uint edge = (input.vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge.
output.accurate_prims_index = 3 * prim_id + edge;
output.t = 0.0f;
output.ti = 0.0f;
output.c = 0.0f;
return output; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader.
}
output.accurate_prims_index = 0;
// Send the interior vertex attributes for fixed function interpolation.
#endif
if(VS_TME)
{
float2 uv = input.uv - TextureOffset;

View File

@ -3,6 +3,9 @@
//#version 420 // Keep it for text editor detection
#define ACCURATE_LINES 1
#define ACCURATE_TRIANGLES 2
#define FMT_32 0
#define FMT_24 1
#define FMT_16 2
@ -11,6 +14,11 @@
#define SHUFFLE_WRITE 2
#define SHUFFLE_READWRITE 3
#ifndef ZTST_GEQUAL
#define ZTST_GEQUAL 2
#define ZTST_GREATER 3
#endif
// TEX_COORD_DEBUG output the uv coordinate as color. It is useful
// to detect bad sampling due to upscaling
//#define TEX_COORD_DEBUG
@ -28,6 +36,9 @@
#define NEEDS_RT_FOR_AFAIL (PS_AFAIL == 3 && PS_NO_COLOR1)
#define NEEDS_RT (NEEDS_RT_EARLY || NEEDS_RT_FOR_AFAIL || (!PS_PRIMID_INIT && (PS_FBMASK || SW_BLEND_NEEDS_RT || SW_AD_TO_HW)))
#define NEEDS_TEX (PS_TFX != 4)
#define NEEDS_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP)
vec4 FragCoord;
layout(std140, binding = 0) uniform cb21
{
@ -57,8 +68,71 @@ layout(std140, binding = 0) uniform cb21
float ScaledScaleFactor;
float RcpScaleFactor;
uint _pad0;
uint _pad1;
uint accurate_prims_base_index;
uint _pad2;
uint _pad3;
uint _pad4;
};
#if PS_ACCURATE_PRIMS
struct
{
vec4 t_float;
vec4 t_int;
vec4 c;
} PSin;
in SHADER
{
vec4 t_float;
vec4 t_int;
#if PS_IIP != 0
vec4 c;
#else
flat vec4 c;
#endif
} PSinReal;
flat in uint accurate_prims_index;
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
flat in uint accurate_triangles_interior;
#endif
// One element of the accurate_prims_data buffer: both endpoints of a line (or of
// one triangle edge) plus the flags the fragment shader uses to rasterize and
// shade that primitive manually.
// The number after each member is its byte offset inside the 208-byte record.
// NOTE(review): the layout presumably has to match whatever the CPU side uploads - confirm before reordering or resizing members.
struct AccuratePrimsEdgeData
{
// Interpolated attributes
vec4 t_float0; // 0
vec4 t_float1; // 16
vec4 t_int0; // 32
vec4 t_int1; // 48
vec4 c0; // 64
vec4 c1; // 80
// Endpoint positions; only .z is read by the manual interpolation.
vec4 p0; // 96
vec4 p1; // 112
// Coefficients dotted with (x, y, 1, 0) to reject pixels on the wrong side of the other two triangle edges.
ivec4 edge0; // 128
ivec4 edge1; // 144
// Endpoint coordinates in 4-bit fixed point (16 subpixels per pixel).
ivec2 xy0; // 160
ivec2 xy1; // 168
// Non-zero when X is the major axis, zero when Y is.
uint step_x; // 176
// Lines only: non-zero when the pixel at endpoint 0 / endpoint 1 is drawn.
uint draw0; // 180
uint draw1; // 184
// Triangle edges only: control which pixel next to the edge is kept and its coverage.
uint top_left; // 188
uint side; // 192
uint _pad0; // 196
uint _pad1; // 200
uint _pad2; // 204
// Total 208
};
layout (std140, binding = 3) buffer AccuratePrimsEdgeDataBuffer {
AccuratePrimsEdgeData accurate_prims_data[];
};
#else
in SHADER
{
vec4 t_float;
@ -70,6 +144,7 @@ in SHADER
flat vec4 c;
#endif
} PSin;
#endif
#define TARGET_0_QUALIFIER out
@ -107,9 +182,10 @@ layout(binding = 2) uniform sampler2D RtSampler; // note 2 already use by the im
#if PS_DATE == 3
layout(binding = 3) uniform sampler2D img_prim_min;
#endif
// I don't remember why I set this parameter but it is surely useless
//layout(pixel_center_integer) in vec4 gl_FragCoord;
#if NEEDS_DEPTH
layout(binding = 4) uniform sampler2D DepthSampler;
#endif
vec4 sample_from_rt()
@ -119,7 +195,16 @@ vec4 sample_from_rt()
#elif HAS_FRAMEBUFFER_FETCH
return LAST_FRAG_COLOR;
#else
return texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0);
return texelFetch(RtSampler, ivec2(FragCoord.xy), 0);
#endif
}
// Fetches the texel of DepthSampler that lies under the current fragment
// (addressed by FragCoord.xy). When NEEDS_DEPTH is not set the sampler is not
// declared, so a zero vector is returned instead.
vec4 sample_from_depth()
{
#if NEEDS_DEPTH
	return texelFetch(DepthSampler, ivec2(FragCoord.xy), 0);
#else
	return vec4(0.0);
#endif
}
@ -315,7 +400,7 @@ int fetch_raw_depth()
#if PS_TEX_IS_FB == 1
return int(sample_from_rt().r * multiplier);
#else
return int(texelFetch(TextureSampler, ivec2(gl_FragCoord.xy), 0).r * multiplier);
return int(texelFetch(TextureSampler, ivec2(FragCoord.xy), 0).r * multiplier);
#endif
}
@ -324,7 +409,7 @@ vec4 fetch_raw_color()
#if PS_TEX_IS_FB == 1
return sample_from_rt();
#else
return texelFetch(TextureSampler, ivec2(gl_FragCoord.xy), 0);
return texelFetch(TextureSampler, ivec2(FragCoord.xy), 0);
#endif
}
@ -724,9 +809,9 @@ void ps_dither(inout vec3 C, float As)
{
#if PS_DITHER > 0 && PS_DITHER < 3
#if PS_DITHER == 2
ivec2 fpos = ivec2(gl_FragCoord.xy);
ivec2 fpos = ivec2(FragCoord.xy);
#else
ivec2 fpos = ivec2(gl_FragCoord.xy * RcpScaleFactor);
ivec2 fpos = ivec2(FragCoord.xy * RcpScaleFactor);
#endif
float value = DitherMatrix[fpos.y&3][fpos.x&3];
@ -967,11 +1052,233 @@ float As = As_rgba.a;
#endif
}
#if PS_ACCURATE_PRIMS
// Manually interpolates the vertex attributes of a line/edge between its two
// endpoints. The results are written to the PSin shadow struct and to FragCoord.z.
//   data    - edge record holding the attributes of both endpoints.
//   weight0 - integer weight applied to endpoint 0's values.
//   weight1 - integer weight applied to endpoint 1's values.
void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1)
{
	float w0 = float(weight0);
	float w1 = float(weight1);
	float w_sum = float(weight0 + weight1);

	// True in every component where both endpoints carry the same value.
	bvec4 same_t = equal(data.t_float1, data.t_float0);
	bvec4 same_ti = equal(data.t_int1, data.t_int0);
	bvec4 same_c = equal(data.c1, data.c0);

	// Weighted averages of the endpoint values.
	vec4 avg_t = (w1 * data.t_float1 + w0 * data.t_float0) / w_sum;
	vec4 avg_ti = (w1 * data.t_int1 + w0 * data.t_int0) / w_sum;
	vec4 avg_c = (w1 * data.c1 + w0 * data.c0) / w_sum;
	float avg_z = (w1 * data.p1.z + w0 * data.p0.z) / w_sum;

	// Constant attributes are taken from endpoint 1 instead of being interpolated.
	PSin.t_float = mix(avg_t, data.t_float1, same_t);
	PSin.t_int = mix(avg_ti, data.t_int1, same_ti);

	// Colors are clamped to the 0..255 range.
	PSin.c = clamp(mix(avg_c, data.c1, same_c), 0.0f, 255.0f);

	// Fog (t_float.z) and Z are normalized, so both are clamped to 0..1.
	PSin.t_float.z = clamp(PSin.t_float.z, 0.0f, 1.0f);

	float z = avg_z;
	if (data.p1.z == data.p0.z)
		z = data.p1.z;
	FragCoord.z = clamp(z, 0.0f, 1.0f);
}
#endif
#if PS_ACCURATE_PRIMS == ACCURATE_LINES
// Decides in the fragment shader whether the current fragment belongs to a line
// and, if so, fills in its attributes and coverage.
//   alpha_coverage - receives the coverage: 0..127 when PS_ACCURATE_PRIMS_AA is set,
//                    otherwise 128.
// The fragment is located through FragCoord.xy and the line record through
// accurate_prims_base_index + accurate_prims_index. Fragments that are not part
// of the line are discarded; otherwise PSin and FragCoord.z are overwritten by
// InterpolateAttributesManual().
void HandleAccurateLines(out float alpha_coverage)
{
	AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index];
	ivec2 xy0 = data.xy0;
	ivec2 xy1 = data.xy1;
	ivec2 dxy = xy1 - xy0;
	// Endpoints with 8 subpixels (half a pixel) added and the 4 fractional bits cleared.
	ivec2 xy0_i = (xy0 + 8) & ~0xF;
	ivec2 xy1_i = (xy1 + 8) & ~0xF;
	bool step_x = bool(data.step_x);
	bool draw0 = bool(data.draw0);
	bool draw1 = bool(data.draw1);

	// 4-bit fixed point: 16 subpixels per pixel
	ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // Subtract half-integer pixel center.

	// Determine major/minor axes
	int major0 = step_x ? xy0.x : xy0.y;
	int major1 = step_x ? xy1.x : xy1.y;
	int minor0 = step_x ? xy0.y : xy0.x;
	int minor1 = step_x ? xy1.y : xy1.x;
	int major_i = step_x ? xy_i.x : xy_i.y;
	int minor_i = step_x ? xy_i.y : xy_i.x;
	int d_major = step_x ? dxy.x : dxy.y;
	int d_major_scaled = 16 * d_major;
	int major0_i = step_x ? xy0_i.x : xy0_i.y;
	int major1_i = step_x ? xy1_i.x : xy1_i.y;

	// Discard if outside line range
	if (major_i < min(major0_i, major1_i) ||
		major_i > max(major0_i, major1_i))
		discard;

	// Discard an endpoint pixel whose draw flag is cleared.
	if ((major_i == major0_i && !draw0) ||
		(major_i == major1_i && !draw1))
		discard;

	// Interpolation weights along the major axis; weight0 + weight1 == d_major.
	int weight0 = major1 - major_i;
	int weight1 = major_i - major0;

	// Compute minor axis line in fixed-point
	// (this is the interpolated minor coordinate multiplied by d_major).
	int minor_line = weight1 * minor1 + weight0 * minor0;

#if PS_ACCURATE_PRIMS_AA
	// Proper fixed-point AA rounding
	// The line covers two neighbouring pixels on the minor axis: minor_i_expected_0 and the one after it.
	int minor_i_expected_0 = (minor_line / d_major) & ~0xF;
	int minor_i_expected_1 = minor_i_expected_0 + 16;
	// Coverage weights of the two pixels; together they add up to d_major_scaled.
	int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
	int alpha_i_1 = d_major_scaled - alpha_i_0;
	int alpha_i;
	if (minor_i == minor_i_expected_0)
	{
		alpha_i = alpha_i_0;
	}
	else if (minor_i == minor_i_expected_1)
	{
		alpha_i = alpha_i_1;
	}
	else
	{
		alpha_i = 0; // Keep alpha_i defined on every path; prevents uninitialized-variable warnings.
		discard;
	}

	// Make sure that the output alpha is always <= 127 for AA.
	alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
#else
	// Non-AA: fixed-point rounding and 4-bit alignment
	// Adds half a pixel (d_major_scaled / (2 * d_major) == 8 subpixels) before clearing the fractional bits.
	int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF;
	if (minor_i != minor_i_expected)
		discard;
	alpha_coverage = 128.0f;
#endif

	// Interpolate attributes
	InterpolateAttributesManual(data, weight0, weight1);
}
#endif
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
// Decides in the fragment shader whether the current fragment belongs to one
// triangle edge and, if so, fills in its attributes and coverage.
//   alpha_coverage - receives the coverage: 0..127 when PS_ACCURATE_PRIMS_AA is set,
//                    otherwise 128.
// The fragment is located through FragCoord.xy and the edge record through
// accurate_prims_base_index + accurate_prims_index. Fragments that are not part
// of the edge are discarded; otherwise PSin and FragCoord.z are overwritten by
// InterpolateAttributesManual().
void HandleAccurateTrianglesEdge(out float alpha_coverage)
{
	AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index];
	ivec2 xy0 = data.xy0;
	ivec2 xy1 = data.xy1;
	ivec2 dxy = xy1 - xy0;
	// Endpoints with 8 subpixels (half a pixel) added and the 4 fractional bits cleared.
	ivec2 xy0_i = (xy0 + 8) & ~0xF;
	ivec2 xy1_i = (xy1 + 8) & ~0xF;
	bool step_x = bool(data.step_x);
	bool side = bool(data.side);
	bool top_left = bool(data.top_left);

	// 4-bit fixed point: 16 subpixels per pixel
	ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // Subtract half-integer pixel center.

	// Determine major/minor axes
	int major0 = step_x ? xy0.x : xy0.y;
	int major1 = step_x ? xy1.x : xy1.y;
	int minor0 = step_x ? xy0.y : xy0.x;
	int minor1 = step_x ? xy1.y : xy1.x;
	int major_i = step_x ? xy_i.x : xy_i.y;
	int minor_i = step_x ? xy_i.y : xy_i.x;
	int d_major = step_x ? dxy.x : dxy.y;
	int d_major_scaled = 16 * d_major;
	int major0_i = step_x ? xy0_i.x : xy0_i.y;
	int major1_i = step_x ? xy1_i.x : xy1_i.y;

	// Discard if outside edge range.
	// Note: this is not exactly what the SW rasterizer does.
	// See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking.
	if (major_i < min(major0_i, major1_i) ||
		major_i > max(major0_i, major1_i))
		discard;

	// Discard if on wrong side of other edges
	// NOTE(review): GLSL declares dot() for floating-point vectors, so these ivec4 operands are
	// presumably converted to float implicitly - confirm the products stay exactly representable.
	if (dot(data.edge0, ivec4(xy_i, 1, 0)) <= 0 ||
		dot(data.edge1, ivec4(xy_i, 1, 0)) <= 0)
		discard;

	// Interpolation weights along the major axis; weight0 + weight1 == d_major.
	int weight0 = major1 - major_i;
	int weight1 = major_i - major0;

	// Compute minor axis line in fixed-point
	// (this is the interpolated minor coordinate multiplied by d_major).
	int minor_line = weight1 * minor1 + weight0 * minor0;
	int minor_i_expected = minor_line / d_major;
	// The two neighbouring pixels on the minor axis that the edge passes between.
	int minor_i_expected_0 = minor_i_expected & ~0xF;
	int minor_i_expected_1 = minor_i_expected_0 + 16;
	// Coverage weights of those two pixels; together they add up to d_major_scaled.
	int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
	int alpha_i_1 = d_major_scaled - alpha_i_0;

	// Proper fixed-point AA rounding
	int alpha_i;
	if ((minor_i_expected & 0xF) == 0)
	{
		// On a pixel center
		// top_left: zero coverage and the kept pixel moves one pixel (direction chosen by side);
		// otherwise: full coverage on the pixel the edge passes through.
		alpha_i = top_left ? 0 : d_major_scaled;
		minor_i_expected += top_left ? (side ? -16 : 16) : 0;
	}
	else if (side)
	{
		// side set: keep the first of the two pixels.
		minor_i_expected = minor_i_expected_0;
		alpha_i = alpha_i_0;
	}
	else
	{
		// side clear: keep the second of the two pixels.
		minor_i_expected = minor_i_expected_1;
		alpha_i = alpha_i_1;
	}

	if (minor_i != minor_i_expected)
		discard;

#if PS_ACCURATE_PRIMS_AA
	// Make sure that the output alpha is always <= 127 for AA.
	alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
#else
	alpha_coverage = 128.0f;
#endif

	// Interpolate attributes
	InterpolateAttributesManual(data, weight0, weight1);
}
#endif
void ps_main()
{
FragCoord = gl_FragCoord;
#if PS_ACCURATE_PRIMS
float alpha_coverage;
#if PS_ACCURATE_PRIMS == ACCURATE_LINES
HandleAccurateLines(alpha_coverage);
#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
if (bool(accurate_triangles_interior))
{
alpha_coverage = 128.0f;
PSin.t_float = PSinReal.t_float;
PSin.t_int = PSinReal.t_int;
PSin.c = PSinReal.c;
}
else
{
HandleAccurateTrianglesEdge(alpha_coverage);
}
#endif
#endif // PS_ACCURATE_PRIMS
#if NEEDS_DEPTH
float current_depth = sample_from_depth().r;
#endif
#if PS_ZCLAMP && (PS_ZTST == ZTST_GEQUAL || PS_ZTST == ZTST_GREATER)
#if PS_ZTST == ZTST_GEQUAL
if (FragCoord.z < current_depth)
discard;
#elif PS_ZTST == ZTST_GREATER
if (FragCoord.z <= current_depth)
discard;
#endif
#endif // PS_ZTST
#if PS_SCANMSK & 2
// fail depth test on prohibited lines
if ((int(gl_FragCoord.y) & 1) == (PS_SCANMSK & 1))
if ((int(FragCoord.y) & 1) == (PS_SCANMSK & 1))
discard;
#endif
@ -1007,7 +1314,7 @@ void ps_main()
#endif
#if PS_DATE == 3
int stencil_ceil = int(texelFetch(img_prim_min, ivec2(gl_FragCoord.xy), 0).r);
int stencil_ceil = int(texelFetch(img_prim_min, ivec2(FragCoord.xy), 0).r);
// Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
// the bad alpha value so we must keep it.
@ -1017,6 +1324,20 @@ void ps_main()
#endif
vec4 C = ps_color();
#if PS_FIXED_ONE_A
// AA (Fixed one) will output a coverage of 1.0 as alpha
C.a = 128.0f;
#elif PS_ACCURATE_PRIMS_AA
// AA: coverage is computed in alpha_coverage
#if PS_ACCURATE_PRIMS_AA_ABE
if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128.
C.a = alpha_coverage;
#else
C.a = alpha_coverage;
#endif
#endif
bool atst_pass = atst(C);
#if PS_AFAIL == 0 // KEEP or ATST off
@ -1024,13 +1345,6 @@ void ps_main()
discard;
#endif
// Must be done before alpha correction
// AA (Fixed one) will output a coverage of 1.0 as alpha
#if PS_FIXED_ONE_A
C.a = 128.0f;
#endif
#if SW_AD_TO_HW
#if PS_RTA_CORRECTION
vec4 RT = trunc(sample_from_rt() * 128.0f + 0.1f);
@ -1144,6 +1458,13 @@ void ps_main()
#endif
#if PS_ZCLAMP
gl_FragDepth = min(gl_FragCoord.z, MaxDepthPS);
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
if (bool(accurate_triangles_interior))
gl_FragDepth = min(FragCoord.z, MaxDepthPS);
else
gl_FragDepth = current_depth; // No depth update for triangle edges.
#else
gl_FragDepth = min(FragCoord.z, MaxDepthPS);
#endif
#endif
}

View File

@ -3,6 +3,16 @@
//#version 420 // Keep it for text editor detection
#define ACCURATE_LINES 1
#define ACCURATE_TRIANGLES 2
#if VS_ACCURATE_PRIMS
flat out uint accurate_prims_index;
#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
flat out uint accurate_triangles_interior;
#endif
#endif
layout(std140, binding = 1) uniform cb20
{
vec2 VertexScale;
@ -14,6 +24,8 @@ layout(std140, binding = 1) uniform cb20
vec2 PointSize;
uint MaxDepth;
uint pad_cb20;
uint BaseVertex;
uint pad_cb20_2;
};
#ifdef VERTEX_SHADER
@ -75,6 +87,28 @@ void vs_main()
gl_Position.z = float(z) * exp_min32;
gl_Position.w = 1.0f;
#if VS_ACCURATE_PRIMS == ACCURATE_LINES
accurate_prims_index = (gl_VertexID - BaseVertex) / 6;
VSout.t_float = vec4(0.0f);
VSout.t_int = vec4(0.0f);
VSout.c = vec4(0.0f);
return; // Don't send line vertex attributes - they are interpolated manually in the fragment shader.
#elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
uint vertex_id = gl_VertexID - BaseVertex;
uint prim_id = vertex_id / 21;
accurate_triangles_interior = uint((vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior.
if (!bool(accurate_triangles_interior))
{
uint edge = (vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge.
accurate_prims_index = 3 * prim_id + edge;
VSout.t_float = vec4(0.0f);
VSout.t_int = vec4(0.0f);
VSout.c = vec4(0.0f);
return; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader.
}
// Send the interior vertex attributes for fixed function interpolation.
#endif
texture_coord();
VSout.c = i_c;

View File

@ -1,12 +1,23 @@
// SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
// SPDX-License-Identifier: GPL-3.0+
#define ACCURATE_LINES 1
#define ACCURATE_TRIANGLES 2
//////////////////////////////////////////////////////////////////////
// Vertex Shader
//////////////////////////////////////////////////////////////////////
#if defined(VERTEX_SHADER)
#if VS_ACCURATE_PRIMS
layout(location = 7) flat out uint accurate_prims_index;
#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
layout(location = 8) flat out uint accurate_triangles_interior;
#endif
#endif
layout(std140, set = 0, binding = 0) uniform cb0
{
vec2 VertexScale;
@ -16,6 +27,8 @@ layout(std140, set = 0, binding = 0) uniform cb0
vec2 PointSize;
uint MaxDepth;
uint pad_cb0;
uint BaseVertex;
uint pad_cb0_2;
};
layout(location = 0) out VSOutput
@ -55,6 +68,28 @@ void main()
gl_Position.z *= exp2(-32.0f); // integer->float depth
gl_Position.y = -gl_Position.y;
#if VS_ACCURATE_PRIMS == ACCURATE_LINES
accurate_prims_index = (gl_VertexIndex - BaseVertex) / 6;
vsOut.t = vec4(0.0f);
vsOut.ti = vec4(0.0f);
vsOut.c = vec4(0.0f);
return; // Don't send line vertex attributes - they are interpolated manually in the fragment shader.
#elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
uint vertex_id = gl_VertexIndex - BaseVertex;
uint prim_id = vertex_id / 21;
accurate_triangles_interior = uint((vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior.
if (!bool(accurate_triangles_interior))
{
uint edge = (vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge.
accurate_prims_index = 3 * prim_id + edge;
vsOut.t = vec4(0.0f);
vsOut.ti = vec4(0.0f);
vsOut.c = vec4(0.0f);
return; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader.
}
// Send the interior vertex attributes for fixed function interpolation.
#endif
#if VS_TME
vec2 uv = a_uv - TextureOffset;
vec2 st = a_st - TextureOffset;
@ -245,6 +280,11 @@ void main()
#define GS_LINE 0
#endif
#ifndef ZTST_GEQUAL
#define ZTST_GEQUAL 2
#define ZTST_GREATER 3
#endif
#ifndef PS_FST
#define PS_FST 0
#define PS_WMS 0
@ -298,9 +338,12 @@ void main()
#define AFAIL_NEEDS_RT (PS_AFAIL == 3 && PS_NO_COLOR1)
#define PS_FEEDBACK_LOOP_IS_NEEDED (PS_TEX_IS_FB == 1 || AFAIL_NEEDS_RT || PS_FBMASK || SW_BLEND_NEEDS_RT || SW_AD_TO_HW || (PS_DATE >= 5))
#define PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP)
#define NEEDS_TEX (PS_TFX != 4)
vec4 FragCoord;
layout(std140, set = 0, binding = 1) uniform cb1
{
vec3 FogColor;
@ -320,8 +363,71 @@ layout(std140, set = 0, binding = 1) uniform cb1
mat4 DitherMatrix;
float ScaledScaleFactor;
float RcpScaleFactor;
uint _pad0;
uint _pad1;
uint accurate_prims_base_index;
uint _pad2;
uint _pad3;
uint _pad4;
};
#if PS_ACCURATE_PRIMS
struct
{
vec4 t;
vec4 ti;
vec4 c;
} vsIn;
layout(location = 0) in VSOutput
{
vec4 t;
vec4 ti;
#if PS_IIP != 0
vec4 c;
#else
flat vec4 c;
#endif
} vsInReal;
layout(location = 7) flat in uint accurate_prims_index;
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
layout(location = 8) flat in uint accurate_triangles_interior;
#endif
// One element of the accurate_prims_data buffer: both endpoints of a line (or of
// one triangle edge) plus per-primitive flags.
// The number after each member is its byte offset inside the 208-byte record.
// NOTE(review): the layout presumably has to match whatever the CPU side uploads - confirm before reordering or resizing members.
struct AccuratePrimsEdgeData
{
// Interpolated attributes
vec4 t_float0; // 0
vec4 t_float1; // 16
vec4 t_int0; // 32
vec4 t_int1; // 48
vec4 c0; // 64
vec4 c1; // 80
// Endpoint positions; only .z is read by the manual interpolation.
vec4 p0; // 96
vec4 p1; // 112
ivec4 edge0; // 128
ivec4 edge1; // 144
// Endpoint coordinates; NOTE(review): presumably 4-bit fixed point (16 subpixels per pixel) - confirm.
ivec2 xy0; // 160
ivec2 xy1; // 168
uint step_x; // 176
uint draw0; // 180
uint draw1; // 184
uint top_left; // 188
uint side; // 192
uint _pad0; // 196
uint _pad1; // 200
uint _pad2; // 204
// Total 208
};
layout (std140, set = 0, binding = 3) readonly buffer AccuratePrimsEdgeDataBuffer {
AccuratePrimsEdgeData accurate_prims_data[];
};
#else // PS_ACCURATE_PRIMS
layout(location = 0) in VSOutput
{
vec4 t;
@ -333,6 +439,8 @@ layout(location = 0) in VSOutput
#endif
} vsIn;
#endif
#if !PS_NO_COLOR && !PS_NO_COLOR1
layout(location = 0, index = 0) out vec4 o_col0;
layout(location = 0, index = 1) out vec4 o_col1;
@ -345,13 +453,21 @@ layout(set = 1, binding = 0) uniform sampler2D Texture;
layout(set = 1, binding = 1) uniform texture2D Palette;
#endif
#if PS_FEEDBACK_LOOP_IS_NEEDED
#if PS_FEEDBACK_LOOP_IS_NEEDED || PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
#if defined(DISABLE_TEXTURE_BARRIER) || defined(HAS_FEEDBACK_LOOP_LAYOUT)
layout(set = 1, binding = 2) uniform texture2D RtSampler;
vec4 sample_from_rt() { return texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0); }
vec4 sample_from_rt() { return texelFetch(RtSampler, ivec2(FragCoord.xy), 0); }
#if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
layout(set = 1, binding = 4) uniform texture2D DepthSampler;
vec4 sample_from_depth() { return texelFetch(DepthSampler, ivec2(FragCoord.xy), 0); }
#endif
#else
layout(input_attachment_index = 0, set = 1, binding = 2) uniform subpassInput RtSampler;
vec4 sample_from_rt() { return subpassLoad(RtSampler); }
#if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
layout(input_attachment_index = 1, set = 1, binding = 4) uniform subpassInput DepthSampler;
vec4 sample_from_depth() { return subpassLoad(DepthSampler); }
#endif
#endif
#endif
@ -925,19 +1041,19 @@ vec4 ps_color()
#if !NEEDS_TEX
vec4 T = vec4(0.0f);
#elif PS_CHANNEL_FETCH == 1
vec4 T = fetch_red(ivec2(gl_FragCoord.xy));
vec4 T = fetch_red(ivec2(FragCoord.xy));
#elif PS_CHANNEL_FETCH == 2
vec4 T = fetch_green(ivec2(gl_FragCoord.xy));
vec4 T = fetch_green(ivec2(FragCoord.xy));
#elif PS_CHANNEL_FETCH == 3
vec4 T = fetch_blue(ivec2(gl_FragCoord.xy));
vec4 T = fetch_blue(ivec2(FragCoord.xy));
#elif PS_CHANNEL_FETCH == 4
vec4 T = fetch_alpha(ivec2(gl_FragCoord.xy));
vec4 T = fetch_alpha(ivec2(FragCoord.xy));
#elif PS_CHANNEL_FETCH == 5
vec4 T = fetch_rgb(ivec2(gl_FragCoord.xy));
vec4 T = fetch_rgb(ivec2(FragCoord.xy));
#elif PS_CHANNEL_FETCH == 6
vec4 T = fetch_gXbY(ivec2(gl_FragCoord.xy));
vec4 T = fetch_gXbY(ivec2(FragCoord.xy));
#elif PS_DEPTH_FMT > 0
vec4 T = sample_depth(st_int, ivec2(gl_FragCoord.xy));
vec4 T = sample_depth(st_int, ivec2(FragCoord.xy));
#else
vec4 T = sample_color(st);
#endif
@ -985,9 +1101,9 @@ void ps_dither(inout vec3 C, float As)
ivec2 fpos;
#if PS_DITHER == 2
fpos = ivec2(gl_FragCoord.xy);
fpos = ivec2(FragCoord.xy);
#else
fpos = ivec2(gl_FragCoord.xy * RcpScaleFactor);
fpos = ivec2(FragCoord.xy * RcpScaleFactor);
#endif
float value = DitherMatrix[fpos.y & 3][fpos.x & 3];
@ -1228,11 +1344,232 @@ void ps_blend(inout vec4 Color, inout vec4 As_rgba)
#endif
}
#if PS_ACCURATE_PRIMS
// Interpolate vertex attributes over a line/edge manually.
//
// Blends the two endpoint attribute sets stored in `data` as
//   (weight1 * attr1 + weight0 * attr0) / (weight0 + weight1)
// and writes the results to the globals vsIn.t, vsIn.ti, vsIn.c and FragCoord.z.
// weight1 scales endpoint 1 and weight0 scales endpoint 0.
// NOTE(review): weight0 + weight1 is used as a divisor; this assumes callers never
// pass weights that sum to zero (i.e. degenerate edges are filtered earlier) - confirm.
void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1)
{
float weight0_f = float(weight0);
float weight1_f = float(weight1);
float weight_total = float(weight0 + weight1);
vec4 t_float_interp = (weight1_f * data.t_float1 + weight0_f * data.t_float0) / weight_total;
vec4 t_int_interp = (weight1_f * data.t_int1 + weight0_f * data.t_int0) / weight_total;
vec4 c_interp = (weight1_f * data.c1 + weight0_f * data.c0) / weight_total;
float z_interp = (weight1_f * data.p1.z + weight0_f * data.p0.z) / weight_total;
// No interpolation for constant attributes.
// Per component: where both endpoints hold the same value, take the endpoint value
// directly instead of the blended one (presumably to avoid float rounding drift).
vsIn.t = mix(t_float_interp, data.t_float1, equal(data.t_float1, data.t_float0));
vsIn.ti = mix(t_int_interp, data.t_int1, equal(data.t_int1, data.t_int0));
vsIn.c = mix(c_interp, data.c1, equal(data.c1, data.c0));
FragCoord.z = (data.p1.z == data.p0.z) ? data.p1.z : z_interp;
// Clamp attributes. Fog/Z are normalized.
// Color is kept in [0, 255]; vsIn.t.z (presumably the fog term) and depth in [0, 1].
vsIn.c = clamp(vsIn.c, 0.0f, 255.0f);
vsIn.t.z = clamp(vsIn.t.z, 0.0f, 1.0f);
FragCoord.z = clamp(FragCoord.z, 0.0f, 1.0f);
}
#endif
#if PS_ACCURATE_PRIMS == ACCURATE_LINES
// Manual line rasterization test for the current fragment.
//
// Loads the edge record for this primitive from accurate_prims_data, decides in 4-bit
// fixed point (16 subpixels per pixel) whether the fragment's pixel belongs to the line,
// and discards the fragment if it does not. For surviving fragments:
//   - alpha_coverage receives the coverage value: 0..127 when PS_ACCURATE_PRIMS_AA is set,
//     otherwise the constant 128.
//   - vertex attributes and depth are re-interpolated via InterpolateAttributesManual().
// NOTE(review): d_major is used as a divisor below; this assumes the major-axis extent of
// the line is never zero - confirm that degenerate lines are rejected before reaching here.
void HandleAccurateLines(out float alpha_coverage)
{
AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index];
ivec2 xy0 = data.xy0;
ivec2 xy1 = data.xy1;
ivec2 dxy = xy1 - xy0;
// Endpoints rounded to the nearest whole pixel (add half a pixel, clear the 4 fraction bits).
ivec2 xy0_i = (xy0 + 8) & ~0xF;
ivec2 xy1_i = (xy1 + 8) & ~0xF;
bool step_x = bool(data.step_x);
bool draw0 = bool(data.draw0);
bool draw1 = bool(data.draw1);
// 4-bit fixed point: 16 subpixels per pixel
ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // floor() drops the half-integer pixel center offset.
// Determine major/minor axes
// step_x selects X as the major (stepping) axis; otherwise Y is the major axis.
int major0 = step_x ? xy0.x : xy0.y;
int major1 = step_x ? xy1.x : xy1.y;
int minor0 = step_x ? xy0.y : xy0.x;
int minor1 = step_x ? xy1.y : xy1.x;
int major_i = step_x ? xy_i.x : xy_i.y;
int minor_i = step_x ? xy_i.y : xy_i.x;
int d_major = step_x ? dxy.x : dxy.y;
int d_major_scaled = 16 * d_major;
int major0_i = step_x ? xy0_i.x : xy0_i.y;
int major1_i = step_x ? xy1_i.x : xy1_i.y;
// Discard if outside line range
if (major_i < min(major0_i, major1_i) ||
major_i > max(major0_i, major1_i))
discard;
// Endpoint pixels are only drawn when the corresponding draw0/draw1 flag is set.
if ((major_i == major0_i && !draw0) ||
(major_i == major1_i && !draw1))
discard;
// Unnormalized interpolation weights along the major axis; they sum to d_major.
int weight0 = major1 - major_i;
int weight1 = major_i - major0;
// Compute minor axis line in fixed-point
// (minor coordinate of the line at this major position, scaled by d_major).
int minor_line = weight1 * minor1 + weight0 * minor0;
#if PS_ACCURATE_PRIMS_AA
// Proper fixed-point AA rounding
// The line touches two adjacent pixels on the minor axis; each receives the
// complementary share of the coverage.
int minor_i_expected_0 = (minor_line / d_major) & ~0xF;
int minor_i_expected_1 = minor_i_expected_0 + 16;
int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
int alpha_i_1 = d_major_scaled - alpha_i_0;
int alpha_i;
if (minor_i == minor_i_expected_0)
alpha_i = alpha_i_0;
else if (minor_i == minor_i_expected_1)
alpha_i = alpha_i_1;
else
discard;
// Make sure that the output alpha is always <= 127 for AA.
alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
#else
// Non-AA: fixed-point rounding and 4-bit alignment
// Only the single nearest pixel on the minor axis is kept, with full coverage.
int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF;
if (minor_i != minor_i_expected)
discard;
alpha_coverage = 128.0f;
#endif
// Interpolate attributes
InterpolateAttributesManual(data, weight0, weight1);
}
#endif
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
// Manual rasterization test for a fragment belonging to a triangle edge primitive.
//
// Loads the edge record for this primitive from accurate_prims_data and decides in 4-bit
// fixed point (16 subpixels per pixel) whether the fragment's pixel is the edge pixel for
// its major-axis position; every other fragment is discarded. For surviving fragments:
//   - alpha_coverage receives the coverage value: 0..127 when PS_ACCURATE_PRIMS_AA is set,
//     otherwise the constant 128.
//   - vertex attributes and depth are re-interpolated via InterpolateAttributesManual().
// NOTE(review): d_major is used as a divisor below; this assumes the major-axis extent of
// the edge is never zero - confirm that degenerate edges are rejected before reaching here.
void HandleAccurateTrianglesEdge(out float alpha_coverage)
{
AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index];
ivec2 xy0 = data.xy0;
ivec2 xy1 = data.xy1;
ivec2 dxy = xy1 - xy0;
// Endpoints rounded to the nearest whole pixel (add half a pixel, clear the 4 fraction bits).
ivec2 xy0_i = (xy0 + 8) & ~0xF;
ivec2 xy1_i = (xy1 + 8) & ~0xF;
bool step_x = bool(data.step_x);
bool side = bool(data.side);
bool top_left = bool(data.top_left);
// 4-bit fixed point: 16 subpixels per pixel
ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // floor() drops the half-integer pixel center offset.
// Determine major/minor axes
// step_x selects X as the major (stepping) axis; otherwise Y is the major axis.
int major0 = step_x ? xy0.x : xy0.y;
int major1 = step_x ? xy1.x : xy1.y;
int minor0 = step_x ? xy0.y : xy0.x;
int minor1 = step_x ? xy1.y : xy1.x;
int major_i = step_x ? xy_i.x : xy_i.y;
int minor_i = step_x ? xy_i.y : xy_i.x;
int d_major = step_x ? dxy.x : dxy.y;
int d_major_scaled = 16 * d_major;
int major0_i = step_x ? xy0_i.x : xy0_i.y;
int major1_i = step_x ? xy1_i.x : xy1_i.y;
// Discard if outside edge range.
// Note: this is not exactly what the SW rasterizer does.
// See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking.
if (major_i < min(major0_i, major1_i) ||
major_i > max(major0_i, major1_i))
discard;
// Discard if on wrong side of other edges
// edge0/edge1 are evaluated as a*x + b*y + c at the pixel position; non-positive means outside.
if (dot(data.edge0, ivec4(xy_i, 1, 0)) <= 0 ||
dot(data.edge1, ivec4(xy_i, 1, 0)) <= 0)
discard;
// Unnormalized interpolation weights along the major axis; they sum to d_major.
int weight0 = major1 - major_i;
int weight1 = major_i - major0;
// Compute minor axis line in fixed-point
// (minor coordinate of the edge at this major position, scaled by d_major).
int minor_line = weight1 * minor1 + weight0 * minor0;
int minor_i_expected = minor_line / d_major;
int minor_i_expected_0 = minor_i_expected & ~0xF;
int minor_i_expected_1 = minor_i_expected_0 + 16;
int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
int alpha_i_1 = d_major_scaled - alpha_i_0;
// Proper fixed-point AA rounding
int alpha_i;
if ((minor_i_expected & 0xF) == 0)
{
// On a pixel center
// top_left edges get zero coverage and are moved one pixel along the minor axis
// (direction chosen by `side`); other edges keep the pixel with full coverage.
alpha_i = top_left ? 0 : d_major_scaled;
minor_i_expected += top_left ? (side ? -16 : 16) : 0;
}
else if (side)
{
// Between pixel centers: `side` picks the lower of the two candidate pixels...
minor_i_expected = minor_i_expected_0;
alpha_i = alpha_i_0;
}
else
{
// ...otherwise the upper candidate pixel is used.
minor_i_expected = minor_i_expected_1;
alpha_i = alpha_i_1;
}
if (minor_i != minor_i_expected)
discard;
#if PS_ACCURATE_PRIMS_AA
// Make sure that the output alpha is always <= 127 for AA.
alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
#else
alpha_coverage = 128.0f;
#endif
// Interpolate attributes
InterpolateAttributesManual(data, weight0, weight1);
}
#endif
void main()
{
FragCoord = gl_FragCoord;
#if PS_ACCURATE_PRIMS
float alpha_coverage;
#if PS_ACCURATE_PRIMS == ACCURATE_LINES
HandleAccurateLines(alpha_coverage);
#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
if (bool(accurate_triangles_interior))
{
alpha_coverage = 128.0f;
vsIn.t = vsInReal.t;
vsIn.ti = vsInReal.ti;
vsIn.c = vsInReal.c;
}
else
{
HandleAccurateTrianglesEdge(alpha_coverage);
}
#endif
#endif // PS_ACCURATE_PRIMS
#if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
float current_depth = sample_from_depth().r;
#endif
#if PS_ZCLAMP && (PS_ZTST == ZTST_GEQUAL || PS_ZTST == ZTST_GREATER)
#if PS_ZTST == ZTST_GEQUAL
if (FragCoord.z < current_depth)
discard;
#elif PS_ZTST == ZTST_GREATER
if (FragCoord.z <= current_depth)
discard;
#endif
#endif // PS_ZTST
#if PS_SCANMSK & 2
// fail depth test on prohibited lines
if ((int(gl_FragCoord.y) & 1) == (PS_SCANMSK & 1))
if ((int(FragCoord.y) & 1) == (PS_SCANMSK & 1))
discard;
#endif
#if PS_DATE >= 5
@ -1267,7 +1604,7 @@ void main()
#endif // PS_DATE >= 5
#if PS_DATE == 3
int stencil_ceil = int(texelFetch(PrimMinTexture, ivec2(gl_FragCoord.xy), 0).r);
int stencil_ceil = int(texelFetch(PrimMinTexture, ivec2(FragCoord.xy), 0).r);
// Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
// the bad alpha value so we must keep it.
@ -1277,6 +1614,20 @@ void main()
#endif
vec4 C = ps_color();
#if PS_FIXED_ONE_A
// AA (Fixed one) will output a coverage of 1.0 as alpha
C.a = 128.0f;
#elif PS_ACCURATE_PRIMS_AA
// AA: coverage is computed in alpha_coverage
#if PS_ACCURATE_PRIMS_AA_ABE
if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128.
C.a = alpha_coverage;
#else
C.a = alpha_coverage;
#endif
#endif
bool atst_pass = atst(C);
#if PS_AFAIL == 0 // KEEP or ATST off
@ -1284,13 +1635,6 @@ void main()
discard;
#endif
// Must be done before alpha correction
// AA (Fixed one) will output a coverage of 1.0 as alpha
#if PS_FIXED_ONE_A
C.a = 128.0f;
#endif
#if SW_AD_TO_HW
#if PS_RTA_CORRECTION
vec4 RT = trunc(sample_from_rt() * 128.0f + 0.1f);
@ -1401,9 +1745,15 @@ void main()
#endif
#if PS_ZCLAMP
gl_FragDepth = min(gl_FragCoord.z, MaxDepthPS);
#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
if (bool(accurate_triangles_interior))
gl_FragDepth = min(FragCoord.z, MaxDepthPS);
else
gl_FragDepth = current_depth; // No depth update for triangle edges.
#else
gl_FragDepth = min(FragCoord.z, MaxDepthPS);
#endif
#endif
#endif // PS_DATE
}

View File

@ -757,6 +757,7 @@ struct Pcsx2Config
PreloadFrameWithGSData : 1,
Mipmap : 1,
HWMipmap : 1,
HWAccuratePrims: 1,
ManualUserHacks : 1,
UserHacks_AlignSpriteX : 1,
UserHacks_CPUFBConversion : 1,

View File

@ -431,6 +431,10 @@ const char* GSState::GetFlushReasonString(GSFlushReason reason)
return "VSYNC";
case GSFlushReason::GSREOPEN:
return "GS REOPEN";
case GSFlushReason::VERTEXCOUNT:
return "VERTEX COUNT";
case GSFlushReason::VERTEXCOUNTEXPANDED:
return "VERTEX COUNT EXPANDED";
case GSFlushReason::UNKNOWN:
default:
return "UNKNOWN";
@ -3265,6 +3269,20 @@ void GSState::UpdateVertexKick()
m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim];
m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = m_fpGIFPackedRegHandlerSTQRGBAXYZ2[prim];
if (UsingAccuratePrims())
{
if (GSUtil::GetPrimClass(prim) == GS_LINE_CLASS)
m_vertex_expansion_factor = 3;
else if (GSUtil::GetPrimClass(prim) == GS_TRIANGLE_CLASS)
m_vertex_expansion_factor = 7;
else
pxFail("Wrong primitive class."); // Impossible.
}
else
{
m_vertex_expansion_factor = 1;
}
}
void GSState::GrowVertexBuffer()
@ -4891,6 +4909,12 @@ __forceinline void GSState::VertexKick(u32 skip)
constexpr u32 max_vertices = MaxVerticesForPrim(prim);
if (max_vertices != 0 && m_vertex.tail >= max_vertices)
Flush(VERTEXCOUNT);
if (m_vertex_expansion_factor != 1)
{
if (max_vertices != 0 && (m_vertex_expansion_factor * m_index.tail) >= max_vertices)
Flush(VERTEXCOUNTEXPANDED);
}
}
/// Checks if region repeat is used (applying it does something to at least one of the values in min...max)
@ -5227,12 +5251,15 @@ void GSState::CalcAlphaMinMax(const int tex_alpha_min, const int tex_alpha_max)
// Limit max to 255 as we send 500 when we don't know, makes calculating 24/16bit easier.
int min = tex_alpha_min, max = std::min(tex_alpha_max, 255);
if (IsCoverageAlpha())
if (IsCoverageAlphaFixedOne())
{
// HW renderer doesn't currently support AA, so its min is 128.
// If we add AA support to the HW renderer, this will need to be changed.
// (Will probably only be supported with ROV/FBFetch so we would want to check for that.)
min = GSIsHardwareRenderer() ? 128 : 0;
// HW renderer doesn't support AA1, assume alpha is constant 128.
min = 128;
max = 128;
}
else if (IsCoverageAlphaSupported())
{
min = 0;
max = 128;
}
else
@ -5527,7 +5554,24 @@ bool GSState::IsMipMapActive()
bool GSState::IsCoverageAlpha()
{
return !PRIM->ABE && PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS);
return PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS);
}
/// Returns true when the draw uses coverage alpha (see IsCoverageAlpha()), alpha
/// blending (PRIM->ABE) is off, and the renderer does not report real coverage
/// support via IsCoverageAlphaSupported().
bool GSState::IsCoverageAlphaFixedOne()
{
	// Not a coverage-alpha draw, or blending is enabled: never the fixed-one case.
	if (!IsCoverageAlpha() || PRIM->ABE)
		return false;

	// Fixed-one only applies when real coverage is not supported.
	return !IsCoverageAlphaSupported();
}
/// Base implementation: reports that coverage alpha is not supported.
/// Declared virtual in the class, so derived renderers may override this.
bool GSState::IsCoverageAlphaSupported()
{
return false;
}
/// Returns true when the current primitive should go through the accurate-prims path:
/// the device must advertise the feature, and the primitive must be either a line,
/// or a triangle with AA1 enabled.
bool GSState::UsingAccuratePrims()
{
	if (!g_gs_device->Features().accurate_prims)
		return false;

	const auto prim_class = GSUtil::GetPrimClass(PRIM->PRIM);

	// Lines always qualify.
	if (prim_class == GS_LINE_CLASS)
		return true;

	// Triangles qualify only with AA1.
	return prim_class == GS_TRIANGLE_CLASS && PRIM->AA1;
}
GIFRegTEX0 GSState::GetTex0Layer(u32 lod)

View File

@ -165,6 +165,8 @@ protected:
u32 tail;
} m_draw_index = {};
int m_vertex_expansion_factor = 1;
void UpdateContext();
void UpdateScissor();
@ -207,6 +209,9 @@ protected:
bool IsMipMapDraw();
bool IsMipMapActive();
bool IsCoverageAlpha();
bool IsCoverageAlphaFixedOne();
virtual bool IsCoverageAlphaSupported();
bool UsingAccuratePrims();
void CalcAlphaMinMax(const int tex_min, const int tex_max);
void CorrectATEAlphaMinMax(const u32 atst, const int aref);
@ -327,6 +332,7 @@ public:
VSYNC = 1 << 13,
GSREOPEN = 1 << 14,
VERTEXCOUNT = 1 << 15,
VERTEXCOUNTEXPANDED = 1 << 16,
};
GSFlushReason m_state_flush_reason = UNKNOWN;

View File

@ -57,6 +57,16 @@ public:
return (std::memcmp(this, &v, sizeof(*this)) != 0);
}
/// Component-wise addition: returns {x + v.x, y + v.y}.
constexpr GSVector2T operator+(const GSVector2T& v) const
{
return {x + v.x, y + v.y};
}
/// Component-wise subtraction: returns {x - v.x, y - v.y}.
constexpr GSVector2T operator-(const GSVector2T& v) const
{
return {x - v.x, y - v.y};
}
constexpr GSVector2T operator*(const GSVector2T& v) const
{
return { x * v.x, y * v.y };
@ -81,6 +91,11 @@ public:
typedef GSVector2T<float> GSVector2;
typedef GSVector2T<int> GSVector2i;
/// Component-wise bitwise AND of two integer 2-vectors: returns {v0.x & v1.x, v0.y & v1.y}.
constexpr GSVector2i operator&(const GSVector2i& v0, const GSVector2i& v1)
{
return {v0.x & v1.x, v0.y & v1.y};
}
class GSVector4;
class GSVector4i;

View File

@ -289,6 +289,41 @@ struct HWBlend
BlendFactor src, dst;
};
/// Per-edge/per-line record consumed by the accurate-prims pixel shader path.
/// The shader declares a struct of the same name with the same member order, so the
/// byte offsets noted on each member (and the 208-byte total enforced by the
/// static_assert below) must stay in sync with the shader-side definition.
/// Members suffixed 0/1 hold the values at the first/second endpoint of the edge.
struct alignas(16) AccuratePrimsEdgeData
{
// Interpolated attributes
GSVector4 t_float0; // 0
GSVector4 t_float1; // 16
GSVector4 t_int0; // 32
GSVector4 t_int1; // 48
GSVector4 c0; // 64
GSVector4 c1; // 80
GSVector4 p0; // 96
GSVector4 p1; // 112
// Edge equations of the other two triangle edges (used by the triangle-edge shader path).
GSVector4i edge0; // 128
GSVector4i edge1; // 144
// Endpoint positions; the shader treats these as 4-bit fixed point (16 subpixels per pixel).
GSVector2i xy0; // 160
GSVector2i xy1; // 168
// Boolean flags stored as 32-bit values; the shader converts each with bool().
u32 step_x; // 176
u32 draw0; // 180
u32 draw1; // 184
u32 top_left; // 188
u32 side; // 192
// Explicit padding up to a multiple of 16 bytes.
u32 _pad0; // 196
u32 _pad1; // 200
u32 _pad2; // 204
// Total 208
};
static_assert(sizeof(AccuratePrimsEdgeData) == 208);
enum
{
ACCURATE_PRIMS_DISABLE = 0,
ACCURATE_PRIMS_LINE = 1,
ACCURATE_PRIMS_TRIANGLE = 2
};
struct alignas(16) GSHWDrawConfig
{
enum class Topology: u8
@ -316,7 +351,7 @@ struct alignas(16) GSHWDrawConfig
u8 iip : 1;
u8 point_size : 1; ///< Set when points need to be expanded without VS expanding.
VSExpand expand : 2;
u8 _free : 2;
u8 accurate_prims : 2; // 0 - disables; 1 - lines; 2 - triangles.
};
u8 key;
};
@ -354,6 +389,7 @@ struct alignas(16) GSHWDrawConfig
u32 date : 3;
u32 atst : 3;
u32 afail : 2;
u32 ztst : 2;
// Color sampling
u32 fst : 1; // Investigate to do it on the VS
u32 tfx : 3;
@ -414,6 +450,11 @@ struct alignas(16) GSHWDrawConfig
// Scan mask
u32 scanmsk : 2;
// Accurate lines
u32 accurate_prims : 2; // 0 - disabled; 1 - lines; 2 - triangles
u32 accurate_prims_aa : 1;
u32 accurate_prims_aa_abe : 1;
};
struct
@ -435,6 +476,13 @@ struct alignas(16) GSHWDrawConfig
return channel_fb || tex_is_fb || fbmask || (date >= 5) || sw_blend_needs_rt;
}
/// Returns true when the pixel shader needs to read the current depth value itself:
/// accurate triangles with AA and zclamp set. This is the same condition as the
/// NEEDS_DEPTH macro on the shader side.
__fi bool IsFeedbackLoopDepth() const
{
// Note: Manual depth testing/interpolation for accurate prims is bundled with zclamp to reduce pipeline combinations.
// The zclamp is used to indicate that either Z write or Z testing is enabled.
return (accurate_prims == ACCURATE_PRIMS_TRIANGLE) && accurate_prims_aa && zclamp;
}
/// Disables color output from the pixel shader, this is done when all channels are masked.
__fi void DisableColorOutput()
{
@ -579,6 +627,7 @@ struct alignas(16) GSHWDrawConfig
GSVector2 texture_offset;
GSVector2 point_size;
GSVector2i max_depth;
GSVector2i base_vertex;
__fi VSConstantBuffer()
{
memset(static_cast<void*>(this), 0, sizeof(*this));
@ -628,6 +677,8 @@ struct alignas(16) GSHWDrawConfig
GSVector4 ScaleFactor;
GSVector4i accurate_prims_base_index;
__fi PSConstantBuffer()
{
memset(static_cast<void*>(this), 0, sizeof(*this));
@ -745,6 +796,9 @@ struct alignas(16) GSHWDrawConfig
SetDATM datm : 2;
bool line_expand : 1;
bool accurate_prims;
std::vector<AccuratePrimsEdgeData>* accurate_prims_edge_data;
struct AlphaPass
{
alignas(8) PSSelector ps;
@ -843,6 +897,7 @@ public:
bool stencil_buffer : 1; ///< Supports stencil buffer, and can use for DATE.
bool cas_sharpening : 1; ///< Supports sufficient functionality for contrast adaptive sharpening.
bool test_and_sample_depth: 1; ///< Supports concurrently binding the depth-stencil buffer for sampling and depth testing.
bool accurate_prims : 1; ///< Supports AA1 triangles/lines and accurate lines shaders.
FeatureSupport()
{
memset(this, 0, sizeof(*this));

View File

@ -14,6 +14,7 @@
#include "common/Error.h"
#include "common/Path.h"
#include "common/StringUtil.h"
#include "common/ScopedGuard.h"
#include "imgui.h"
#include "IconsFontAwesome6.h"
@ -395,6 +396,39 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
}
}
bd = {};
if (m_features.accurate_prims)
{
bd.Usage = D3D11_USAGE_DEFAULT;
bd.CPUAccessFlags = 0;
bd.ByteWidth = ACCURATE_PRIMS_BUFFER_SIZE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
bd.StructureByteStride = sizeof(AccuratePrimsEdgeData);
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
if (FAILED(m_dev->CreateBuffer(&bd, nullptr, m_accurate_prims_b.put())))
{
Console.Error("D3D11: Failed to create accurate prims buffer.");
return false;
}
const CD3D11_SHADER_RESOURCE_VIEW_DESC accurate_prims_b_srv_desc(
D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0,
ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData));
if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc,
m_accurate_prims_b_srv.put())))
{
Console.Error("D3D11: Failed to create accurate prims buffer SRV.");
return false;
}
// If MAX_TEXTURES changes, please change the register for this buffer in the shader.
static_assert(MAX_TEXTURES == 5);
m_ctx->PSSetShaderResources(5, 1, m_accurate_prims_b_srv.addressof());
}
// rasterizer
memset(&rd, 0, sizeof(rd));
@ -541,6 +575,8 @@ void GSDevice11::Destroy()
m_expand_vb_srv.reset();
m_expand_vb.reset();
m_expand_ib.reset();
m_accurate_prims_b.reset();
m_accurate_prims_b_srv.reset();
m_vs.clear();
m_vs_cb.reset();
@ -599,6 +635,8 @@ void GSDevice11::SetFeatures(IDXGIAdapter1* adapter)
m_max_texture_size = (m_feature_level >= D3D_FEATURE_LEVEL_11_0) ?
D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION :
D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION;
m_features.accurate_prims = GSConfig.HWAccuratePrims;
}
bool GSDevice11::HasSurface() const
@ -1665,6 +1703,7 @@ void GSDevice11::SetupVS(VSSelector sel, const GSHWDrawConfig::VSConstantBuffer*
sm.AddMacro("VS_FST", sel.fst);
sm.AddMacro("VS_IIP", sel.iip);
sm.AddMacro("VS_EXPAND", static_cast<int>(sel.expand));
sm.AddMacro("VS_ACCURATE_PRIMS", static_cast<int>(sel.accurate_prims));
static constexpr const D3D11_INPUT_ELEMENT_DESC layout[] =
{
@ -1766,6 +1805,10 @@ void GSDevice11::SetupPS(const PSSelector& sel, const GSHWDrawConfig::PSConstant
sm.AddMacro("PS_TEX_IS_FB", sel.tex_is_fb);
sm.AddMacro("PS_NO_COLOR", sel.no_color);
sm.AddMacro("PS_NO_COLOR1", sel.no_color1);
sm.AddMacro("PS_ACCURATE_PRIMS", sel.accurate_prims);
sm.AddMacro("PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa);
sm.AddMacro("PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe);
sm.AddMacro("PS_ZTST", sel.ztst);
wil::com_ptr_nothrow<ID3D11PixelShader> ps = m_shader_cache.GetPixelShader(m_dev.get(), m_tfx_source, sm.GetPtr(), "ps_main");
i = m_ps.try_emplace(sel, std::move(ps)).first;
@ -2280,6 +2323,32 @@ bool GSDevice11::IASetExpandVertexBuffer(const void* vertex, u32 stride, u32 cou
return true;
}
/// Uploads the accurate-prims edge data of `config` into the structured buffer read by
/// the pixel shader and resets the base index constant.
///
/// Does nothing (and succeeds) when `config.accurate_prims` is not set.
///
/// @param config Draw configuration; `accurate_prims_edge_data` is read and
///               `cb_ps.accurate_prims_base_index.x` is written.
/// @return false if the edge data does not fit into ACCURATE_PRIMS_BUFFER_SIZE, true otherwise.
bool GSDevice11::SetupAccuratePrims(GSHWDrawConfig& config)
{
	if (!config.accurate_prims)
		return true;

	// Compute the byte size in size_t. Doing this in 32 bits could wrap around for a very
	// large edge list and make the capacity check below pass with a bogus size.
	const size_t size = config.accurate_prims_edge_data->size() * sizeof(AccuratePrimsEdgeData);
	if (size > ACCURATE_PRIMS_BUFFER_SIZE)
		return false;

	// Performance note: UpdateSubresource() copies data to a temp staging buffer to avoid stalling the GPU,
	// so a manual ring buffer is not needed here like VK/DX12.
	// For buffers the box is expressed in bytes along the "left/right" axis; the checked
	// size is known to fit in 32 bits at this point.
	D3D11_BOX dst_region{};
	dst_region.left = 0;
	dst_region.right = static_cast<UINT>(size);
	dst_region.top = 0;
	dst_region.bottom = 1;
	dst_region.front = 0;
	dst_region.back = 1;

	m_ctx->UpdateSubresource(m_accurate_prims_b.get(), 0, &dst_region,
		config.accurate_prims_edge_data->data(), static_cast<UINT>(size), 0);

	config.cb_ps.accurate_prims_base_index.x = 0; // No offsetting needed like DX12/VK since we don't use a ring buffer.

	return true;
}
u16* GSDevice11::IAMapIndexBuffer(u32 count)
{
if (count > (INDEX_BUFFER_SIZE / sizeof(u16)))
@ -2583,6 +2652,18 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
{
const GSVector2i rtsize = (config.rt ? config.rt : config.ds)->GetSize();
GSTexture* colclip_rt = g_gs_device->GetColorClipTexture();
GSTexture* draw_rt_clone = nullptr;
GSTexture* draw_ds_clone = nullptr;
GSTexture* primid_texture = nullptr;
ScopedGuard recycle_temp_textures([&]() {
if (draw_rt_clone)
Recycle(draw_rt_clone);
if (draw_ds_clone)
Recycle(draw_ds_clone);
if (primid_texture)
Recycle(primid_texture);
});
if (colclip_rt)
{
@ -2627,7 +2708,6 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
// Destination Alpha Setup
const bool multidraw_fb_copy = m_features.multidraw_fb_copy && (config.require_one_barrier || config.require_full_barrier);
GSTexture* primid_texture = nullptr;
if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking)
{
primid_texture = CreateRenderTarget(rtsize.x, rtsize.y, GSTexture::Format::PrimID, false);
@ -2652,7 +2732,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
return;
}
config.cb_vs.max_depth.y = m_vertex.start;
config.cb_vs.base_vertex = m_vertex.start;
}
else
{
@ -2663,6 +2743,12 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
}
}
if (!SetupAccuratePrims(config))
{
Console.Error("D3D11: Failed to setup accurate prims");
return;
}
if (config.vs.UseExpandIndexBuffer())
{
IASetIndexBuffer(m_expand_ib.get());
@ -2742,8 +2828,6 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
draw_ds = m_state.cached_dsv;
}
GSTexture* draw_rt_clone = nullptr;
if (draw_rt && (config.require_one_barrier || (config.require_full_barrier && m_features.multidraw_fb_copy) || (config.tex && config.tex == config.rt)))
{
// Requires a copy of the RT.
@ -2754,6 +2838,15 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
Console.Warning("D3D11: Failed to allocate temp texture for RT copy.");
}
if (draw_ds && config.require_full_barrier && m_features.multidraw_fb_copy && config.ps.IsFeedbackLoopDepth())
{
	// Requires a copy of the DS.
	// The shader reads the current depth from this copy while draw_ds stays bound as the depth target.
	draw_ds_clone = CreateTexture(rtsize.x, rtsize.y, 1, draw_ds->GetFormat(), true);
	// Check the texture that was just requested (the DS clone, not the RT clone).
	if (!draw_ds_clone)
		Console.Warning("D3D11: Failed to allocate temp texture for DS copy.");
}
OMSetRenderTargets(draw_rt, draw_ds, &config.scissor, read_only_dsv);
SetupOM(config.depth, OMBlendSelector(config.colormask, config.blend), config.blend.constant);
@ -2761,7 +2854,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::StencilOne && multidraw_fb_copy)
m_ctx->ClearDepthStencilView(*static_cast<GSTexture11*>(draw_ds), D3D11_CLEAR_STENCIL, 0.0f, 1);
SendHWDraw(config, draw_rt_clone, draw_rt, config.require_one_barrier, config.require_full_barrier, false);
SendHWDraw(config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds, config.require_one_barrier, config.require_full_barrier, false);
if (config.blend_multi_pass.enable)
{
@ -2787,15 +2880,10 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
}
SetupOM(config.alpha_second_pass.depth, OMBlendSelector(config.alpha_second_pass.colormask, config.blend), config.blend.constant);
SendHWDraw(config, draw_rt_clone, draw_rt, config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true);
SendHWDraw(config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds,
config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true);
}
if (draw_rt_clone)
Recycle(draw_rt_clone);
if (primid_texture)
Recycle(primid_texture);
if (colclip_rt)
{
config.colclip_update_area = config.colclip_update_area.runion(config.drawarea);
@ -2814,19 +2902,29 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
}
}
void GSDevice11::SendHWDraw(const GSHWDrawConfig& config, GSTexture* draw_rt_clone, GSTexture* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier)
void GSDevice11::SendHWDraw(const GSHWDrawConfig& config,
GSTexture* draw_rt_clone, GSTexture* draw_rt, GSTexture* draw_ds_clone, GSTexture* draw_ds,
const bool one_barrier, const bool full_barrier, const bool skip_first_barrier)
{
if (draw_rt_clone)
if (draw_rt_clone || draw_ds_clone)
{
#ifdef PCSX2_DEVBUILD
if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]]
if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]]
Console.Warning("D3D11: Possible unnecessary copy detected.");
#endif
auto CopyAndBind = [&](GSVector4i drawarea) {
if (draw_rt_clone)
CopyRect(draw_rt, draw_rt_clone, drawarea, drawarea.left, drawarea.top);
if (draw_ds_clone)
CopyRect(draw_ds, draw_ds_clone, drawarea, drawarea.left, drawarea.top);
if (one_barrier || full_barrier)
{
if (draw_rt_clone)
PSSetShaderResource(2, draw_rt_clone);
if (draw_ds_clone)
PSSetShaderResource(4, draw_ds_clone);
}
if (config.tex && config.tex == config.rt)
PSSetShaderResource(0, draw_rt_clone);
};

View File

@ -83,10 +83,14 @@ public:
private:
enum : u32
{
MAX_TEXTURES = 4,
MAX_TEXTURES = 5,
MAX_SAMPLERS = 1,
VERTEX_BUFFER_SIZE = 32 * 1024 * 1024,
INDEX_BUFFER_SIZE = 16 * 1024 * 1024,
// Structured buffer size must be multiple of element size.
ACCURATE_PRIMS_BUFFER_SIZE = (32 * 1024 * 1024 / sizeof(AccuratePrimsEdgeData)) * sizeof(AccuratePrimsEdgeData),
NUM_TIMESTAMP_QUERIES = 5,
};
@ -126,6 +130,8 @@ private:
wil::com_ptr_nothrow<ID3D11Buffer> m_expand_vb;
wil::com_ptr_nothrow<ID3D11Buffer> m_expand_ib;
wil::com_ptr_nothrow<ID3D11ShaderResourceView> m_expand_vb_srv;
wil::com_ptr_nothrow<ID3D11Buffer> m_accurate_prims_b;
wil::com_ptr_nothrow<ID3D11ShaderResourceView> m_accurate_prims_b_srv;
D3D_FEATURE_LEVEL m_feature_level = D3D_FEATURE_LEVEL_10_0;
u32 m_vb_pos = 0; // bytes
@ -317,6 +323,7 @@ public:
void IAUnmapVertexBuffer(u32 stride, u32 count);
bool IASetVertexBuffer(const void* vertex, u32 stride, u32 count);
bool IASetExpandVertexBuffer(const void* vertex, u32 stride, u32 count);
bool SetupAccuratePrims(GSHWDrawConfig& config);
u16* IAMapIndexBuffer(u32 count);
void IAUnmapIndexBuffer(u32 count);
@ -345,7 +352,9 @@ public:
void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, u8 afix);
void RenderHW(GSHWDrawConfig& config) override;
void SendHWDraw(const GSHWDrawConfig& config, GSTexture* draw_rt_clone, GSTexture* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier);
void SendHWDraw(const GSHWDrawConfig& config,
GSTexture* draw_rt_clone, GSTexture* draw_rt, GSTexture* draw_ds_clone, GSTexture* draw_ds,
const bool one_barrier, const bool full_barrier, const bool skip_first_barrier);
void ClearSamplerCache() override;

View File

@ -20,29 +20,33 @@ D3D12StreamBuffer::~D3D12StreamBuffer()
Destroy();
}
bool D3D12StreamBuffer::Create(u32 size)
bool D3D12StreamBuffer::Create(u32 size, bool default_heap)
{
const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN,
{1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
D3D12MA::ALLOCATION_DESC allocationDesc = {};
allocationDesc.Flags = D3D12MA::ALLOCATION_FLAG_COMMITTED;
allocationDesc.HeapType = D3D12_HEAP_TYPE_UPLOAD;
allocationDesc.HeapType = default_heap ? D3D12_HEAP_TYPE_DEFAULT : D3D12_HEAP_TYPE_UPLOAD;
wil::com_ptr_nothrow<ID3D12Resource> buffer;
wil::com_ptr_nothrow<D3D12MA::Allocation> allocation;
HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocationDesc, &resource_desc,
D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(buffer.put()));
default_heap ? D3D12_RESOURCE_STATE_COMMON : D3D12_RESOURCE_STATE_GENERIC_READ,
nullptr, allocation.put(), IID_PPV_ARGS(buffer.put()));
pxAssertMsg(SUCCEEDED(hr), "Allocate buffer");
if (FAILED(hr))
return false;
static const D3D12_RANGE read_range = {};
u8* host_pointer;
u8* host_pointer = nullptr;
if (!default_heap)
{
hr = buffer->Map(0, &read_range, reinterpret_cast<void**>(&host_pointer));
pxAssertMsg(SUCCEEDED(hr), "Map buffer");
if (FAILED(hr))
return false;
}
Destroy(true);
@ -51,6 +55,7 @@ bool D3D12StreamBuffer::Create(u32 size)
m_host_pointer = host_pointer;
m_size = size;
m_gpu_pointer = m_buffer->GetGPUVirtualAddress();
m_default_heap = default_heap;
return true;
}
@ -148,6 +153,7 @@ void D3D12StreamBuffer::Destroy(bool defer)
m_current_offset = 0;
m_current_space = 0;
m_current_gpu_position = 0;
m_default_heap = false;
m_tracked_fences.clear();
}

View File

@ -22,7 +22,7 @@ public:
D3D12StreamBuffer();
~D3D12StreamBuffer();
bool Create(u32 size);
bool Create(u32 size, bool default_heap = false);
__fi bool IsValid() const { return static_cast<bool>(m_buffer); }
__fi ID3D12Resource* GetBuffer() const { return m_buffer.get(); }
@ -54,7 +54,8 @@ private:
wil::com_ptr_nothrow<ID3D12Resource> m_buffer;
wil::com_ptr_nothrow<D3D12MA::Allocation> m_allocation;
D3D12_GPU_VIRTUAL_ADDRESS m_gpu_pointer = {};
u8* m_host_pointer = nullptr;
u8* m_host_pointer = nullptr; // Only used for upload heaps.
bool m_default_heap = false; // False for upload heap; true for default heap.
// List of fences and the corresponding positions in the buffer
std::deque<std::pair<u64, u32>> m_tracked_fences;

View File

@ -624,52 +624,91 @@ bool GSDevice12::SetGPUTimingEnabled(bool enabled)
bool GSDevice12::AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer,
D3D12MA::Allocation** gpu_allocation, const std::function<void(void*)>& fill_callback)
{
// Try to place the fixed index buffer in GPU local memory.
// Use the staging buffer to copy into it.
// Allocate and fill staging buffer
ID3D12Resource* cpu_buffer = AllocateUploadStagingBuffer(size, fill_callback);
// Create GPU buffer
const D3D12_RESOURCE_DESC rd = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN, {1, 0},
D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
const D3D12MA::ALLOCATION_DESC cpu_ad = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD};
ComPtr<ID3D12Resource> cpu_buffer;
ComPtr<D3D12MA::Allocation> cpu_allocation;
HRESULT hr = m_allocator->CreateResource(
&cpu_ad, &rd, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, cpu_allocation.put(), IID_PPV_ARGS(cpu_buffer.put()));
pxAssertMsg(SUCCEEDED(hr), "Allocate CPU buffer");
if (FAILED(hr))
return false;
static constexpr const D3D12_RANGE read_range = {};
const D3D12_RANGE write_range = {0, size};
void* mapped;
hr = cpu_buffer->Map(0, &read_range, &mapped);
pxAssertMsg(SUCCEEDED(hr), "Map CPU buffer");
if (FAILED(hr))
return false;
fill_callback(mapped);
cpu_buffer->Unmap(0, &write_range);
const D3D12MA::ALLOCATION_DESC gpu_ad = {D3D12MA::ALLOCATION_FLAG_COMMITTED, D3D12_HEAP_TYPE_DEFAULT};
hr = m_allocator->CreateResource(
HRESULT hr = m_allocator->CreateResource(
&gpu_ad, &rd, D3D12_RESOURCE_STATE_COMMON, nullptr, gpu_allocation, IID_PPV_ARGS(gpu_buffer));
pxAssertMsg(SUCCEEDED(hr), "Allocate GPU buffer");
if (FAILED(hr))
return false;
GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer.get(), 0, size);
// Copy the data
GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer, 0, size);
// Transition the GPU buffer from COPY_DEST to INDEX_BUFFER after the copy has been recorded.
D3D12_RESOURCE_BARRIER rb = {D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, D3D12_RESOURCE_BARRIER_FLAG_NONE};
rb.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
rb.Transition.pResource = *gpu_buffer;
rb.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST; // COMMON -> COPY_DEST at first use.
rb.Transition.StateAfter = D3D12_RESOURCE_STATE_INDEX_BUFFER;
GetInitCommandList()->ResourceBarrier(1, &rb);
DeferResourceDestruction(cpu_allocation.get(), cpu_buffer.get());
return true;
}
/// Reserves `size` bytes in the texture stream buffer, lets `write_data` fill them,
/// and commits the range.
///
/// @param size       Number of bytes to reserve and commit.
/// @param write_data Callback invoked with the host pointer of the reserved range.
/// @param offset_out Receives the offset of the reserved range within the returned buffer.
/// @return The stream buffer's resource, or nullptr if the space could not be reserved
///         even after executing the current command list.
ID3D12Resource* GSDevice12::WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out)
{
	bool reserved = m_texture_stream_buffer.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT);
	if (!reserved)
	{
		// Out of space: execute the pending command list, then retry exactly once.
		GSDevice12::GetInstance()->ExecuteCommandList(
			false, "While waiting for %u bytes in texture upload buffer", size);
		reserved = m_texture_stream_buffer.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT);
	}

	if (!reserved)
	{
		Console.Error("Failed to reserve texture upload memory (%u bytes).", size);
		return nullptr;
	}

	// The offset must be captured before committing the reserved range.
	offset_out = m_texture_stream_buffer.GetCurrentOffset();
	write_data(m_texture_stream_buffer.GetCurrentHostPointer());
	m_texture_stream_buffer.CommitMemory(size);

	return m_texture_stream_buffer.GetBuffer();
}
/// Creates a one-shot upload-heap buffer of `size` bytes and fills it through `write_data`.
///
/// The buffer is handed to DeferResourceDestruction() before returning, so the returned
/// pointer is non-owning: it is only meant to be used as a copy source in the command
/// buffer currently being recorded.
///
/// @param size       Size of the staging buffer in bytes.
/// @param write_data Callback invoked with the mapped pointer to fill the buffer.
/// @return The staging resource, or nullptr if allocation or mapping failed.
ID3D12Resource* GSDevice12::AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data)
{
	wil::com_ptr_nothrow<ID3D12Resource> staging;
	wil::com_ptr_nothrow<D3D12MA::Allocation> staging_alloc;

	// A plain linear buffer living in an upload heap.
	const D3D12_RESOURCE_DESC buffer_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1,
		DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
	const D3D12MA::ALLOCATION_DESC heap_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD};

	HRESULT hr = GetAllocator()->CreateResource(&heap_desc, &buffer_desc,
		D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, staging_alloc.put(), IID_PPV_ARGS(staging.put()));
	if (FAILED(hr))
	{
		Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr);
		return nullptr;
	}

	// Map with an empty read range: the CPU only writes to this buffer.
	static constexpr const D3D12_RANGE read_range = {};
	void* mapped;
	hr = staging->Map(0, &read_range, &mapped);
	if (FAILED(hr))
	{
		Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr);
		return nullptr;
	}

	// Let the caller fill the buffer, then unmap declaring the whole range as written.
	write_data(mapped);
	const D3D12_RANGE write_range = {0, size};
	staging->Unmap(0, &write_range);

	// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
	// This adds the reference needed to keep the buffer alive.
	DeferResourceDestruction(staging_alloc.get(), staging.get());
	return staging.get();
}
RenderAPI GSDevice12::GetRenderAPI() const
{
return RenderAPI::D3D12;
@ -1250,6 +1289,8 @@ bool GSDevice12::CheckFeatures(const u32& vendor_id)
DXGI_FEATURE_PRESENT_ALLOW_TEARING, &allow_tearing_supported, sizeof(allow_tearing_supported));
m_allow_tearing_supported = (SUCCEEDED(hr) && allow_tearing_supported == TRUE);
m_features.accurate_prims = GSConfig.HWAccuratePrims;
return true;
}
@ -2178,6 +2219,93 @@ void GSDevice12::IASetIndexBuffer(const void* index, size_t count)
m_index_stream_buffer.CommitMemory(size);
}
/// Uploads the draw's accurate-prims edge data into the GPU-side accurate prims buffer.
///
/// Does nothing unless `config.accurate_prims` is set. Otherwise it reserves a region in
/// m_accurate_prims_stream_buffer, stages the edge data in an upload buffer, records a copy into the
/// reserved region (bracketed by PIXEL_SHADER_RESOURCE <-> COPY_DEST transitions), and stores the
/// region's byte offset in m_accurate_prims_stream_buffer_offset. Any active render pass is ended.
/// If no upload buffer can be obtained, an error is logged and the function returns without copying.
void GSDevice12::SetupAccuratePrimsBuffer(GSHWDrawConfig& config)
{
	if (!config.accurate_prims)
		return;

	// Mark the accurate prims SRV binding dirty so it gets re-applied.
	m_dirty_flags |= DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING;

	const u32 count = config.accurate_prims_edge_data->size();
	const u32 size = count * sizeof(AccuratePrimsEdgeData);

	// Reserve the destination region, executing the command list and retrying once if it is full.
	const auto reserve_gpu_region = [this, size]() {
		return m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData));
	};
	if (!reserve_gpu_region())
	{
		ExecuteCommandListAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer");
		if (!reserve_gpu_region())
			pxFailRel("Failed to reserve space for accurate prims");
	}
	const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset();

	// The buffer copy is recorded outside of any render pass.
	if (InRenderPass())
		EndRenderPass();

	// Stage the edge data in an upload buffer.
	const auto upload_data = [&](void* map_ptr) {
		std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size);
	};

	// Data larger than half of the texture streaming buffer goes through a dedicated staging buffer;
	// otherwise allocation will either fail, or require lots of cmdbuffer submissions.
	const bool use_dedicated_buffer = size > m_texture_stream_buffer.GetSize() / 2;
	u32 upload_buffer_offset = 0;
	ID3D12Resource* const upload_buffer = use_dedicated_buffer ?
		AllocateUploadStagingBuffer(size, upload_data) :
		WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset);
	if (!upload_buffer)
	{
		Console.Error("Failed to get upload buffer for accurate prims data.");
		return;
	}

	// Builds a transition barrier for the accurate prims buffer.
	ID3D12Resource* const gpu_buffer = m_accurate_prims_stream_buffer.GetBuffer();
	const auto make_transition = [gpu_buffer](D3D12_RESOURCE_STATES before, D3D12_RESOURCE_STATES after) {
		const D3D12_RESOURCE_BARRIER barrier = {
			D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
			D3D12_RESOURCE_BARRIER_FLAG_NONE,
			{{gpu_buffer, 0, before, after}}};
		return barrier;
	};

	// Shader resource -> copy destination, then copy from the upload buffer.
	const D3D12_RESOURCE_BARRIER barrier_sr_to_dst =
		make_transition(D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, D3D12_RESOURCE_STATE_COPY_DEST);
	GetCommandList()->ResourceBarrier(1, &barrier_sr_to_dst);
	GetCommandList()->CopyBufferRegion(gpu_buffer, offset, upload_buffer, upload_buffer_offset, size);

	// Commit the GPU region.
	m_accurate_prims_stream_buffer.CommitMemory(size);

	// Back to shader resource, since the data is read by the next draw.
	const D3D12_RESOURCE_BARRIER barrier_dst_to_sr =
		make_transition(D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
	GetCommandList()->ResourceBarrier(1, &barrier_dst_to_sr);

	// Remembered for SetupAccuratePrimsConstants(), which turns it into an element index.
	m_accurate_prims_stream_buffer_offset = offset;
}
/// Fills in the accurate-prims related shader constants for a draw and re-submits both constant buffers.
///
/// Does nothing unless `config.accurate_prims` is set. The VS base vertex is taken from m_vertex.start,
/// and the PS base index is the saved accurate prims buffer byte offset converted to an element index.
void GSDevice12::SetupAccuratePrimsConstants(GSHWDrawConfig& config)
{
	if (!config.accurate_prims)
		return;

	config.cb_vs.base_vertex = m_vertex.start;

	// Byte offset -> index of the first AccuratePrimsEdgeData element for this draw.
	config.cb_ps.accurate_prims_base_index.x =
		m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData);

	SetVSConstantBuffer(config.cb_vs);
	SetPSConstantBuffer(config.cb_ps);
}
void GSDevice12::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor)
{
GSTexture12* vkRt = static_cast<GSTexture12*>(rt);
@ -2305,9 +2433,9 @@ bool GSDevice12::GetTextureGroupDescriptors(
}
D3D12_CPU_DESCRIPTOR_HANDLE dst_handle = *gpu_handle;
D3D12_CPU_DESCRIPTOR_HANDLE src_handles[NUM_TFX_TEXTURES];
UINT src_sizes[NUM_TFX_TEXTURES];
pxAssert(count <= NUM_TFX_TEXTURES);
D3D12_CPU_DESCRIPTOR_HANDLE src_handles[NUM_TOTAL_TFX_TEXTURES];
UINT src_sizes[NUM_TOTAL_TFX_TEXTURES];
pxAssert(count <= NUM_TOTAL_TFX_TEXTURES);
for (u32 i = 0; i < count; i++)
{
src_handles[i] = cpu_handles[i];
@ -2365,6 +2493,39 @@ bool GSDevice12::CreateBuffers()
return false;
}
if (!m_accurate_prims_stream_buffer.Create(
m_features.accurate_prims ? ACCURATE_PRIMS_BUFFER_SIZE : sizeof(AccuratePrimsEdgeData), true))
{
Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer");
return false;
}
if (!m_descriptor_heap_manager.Allocate(&m_accurate_prims_srv_descriptor_cpu))
{
Console.Error("Failed to allocate accurate prims CPU descriptor");
return false;
}
if (m_features.accurate_prims)
{
// Transition the accurate prims buffer to pixel shader resource and create the shader resource view.
const D3D12_RESOURCE_BARRIER barrier = {
D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
D3D12_RESOURCE_BARRIER_FLAG_NONE,
{{m_accurate_prims_stream_buffer.GetBuffer(), 0,
D3D12_RESOURCE_STATE_COMMON,
D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}};
GetInitCommandList()->ResourceBarrier(1, &barrier);
D3D12_SHADER_RESOURCE_VIEW_DESC desc = {
DXGI_FORMAT_UNKNOWN, D3D12_SRV_DIMENSION_BUFFER, D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING};
desc.Buffer.FirstElement = 0;
desc.Buffer.NumElements = ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData);
desc.Buffer.StructureByteStride = sizeof(AccuratePrimsEdgeData);
m_device->CreateShaderResourceView(m_accurate_prims_stream_buffer.GetBuffer(), &desc,
m_accurate_prims_srv_descriptor_cpu.cpu_handle);
}
if (!m_vertex_constant_buffer.Create(VERTEX_UNIFORM_BUFFER_SIZE))
{
Host::ReportErrorAsync("GS", "Failed to allocate vertex uniform buffer");
@ -2415,9 +2576,11 @@ bool GSDevice12::CreateRootSignatures()
rsb.AddCBVParameter(0, D3D12_SHADER_VISIBILITY_ALL);
rsb.AddCBVParameter(1, D3D12_SHADER_VISIBILITY_PIXEL);
rsb.AddSRVParameter(0, D3D12_SHADER_VISIBILITY_VERTEX);
rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 0, 2, D3D12_SHADER_VISIBILITY_PIXEL);
rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 0, 2, D3D12_SHADER_VISIBILITY_PIXEL); // Source / Palette
rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER, 0, NUM_TFX_SAMPLERS, D3D12_SHADER_VISIBILITY_PIXEL);
rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 2, D3D12_SHADER_VISIBILITY_PIXEL);
rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 2, D3D12_SHADER_VISIBILITY_PIXEL); // RT / PrimID
rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 4, 1, D3D12_SHADER_VISIBILITY_PIXEL); // Depth
rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 5, 1, D3D12_SHADER_VISIBILITY_PIXEL); // Accurate Prims
if (!(m_tfx_root_signature = rsb.Create()))
return false;
D3D12::SetObjectName(m_tfx_root_signature.get(), "TFX root signature");
@ -2805,6 +2968,7 @@ void GSDevice12::DestroyResources()
m_vertex_constant_buffer.Destroy(false);
m_index_stream_buffer.Destroy(false);
m_vertex_stream_buffer.Destroy(false);
m_accurate_prims_stream_buffer.Destroy(false);
m_utility_root_signature.reset();
m_tfx_root_signature.reset();
@ -2818,6 +2982,7 @@ void GSDevice12::DestroyResources()
m_shader_cache.Close();
m_descriptor_heap_manager.Free(&m_null_srv_descriptor);
m_descriptor_heap_manager.Free(&m_accurate_prims_srv_descriptor_cpu);
m_timestamp_query_buffer.reset();
m_timestamp_query_allocation.reset();
m_sampler_heap_manager.Destroy();
@ -2851,6 +3016,7 @@ const ID3DBlob* GSDevice12::GetTFXVertexShader(GSHWDrawConfig::VSSelector sel)
sm.AddMacro("VS_FST", sel.fst);
sm.AddMacro("VS_IIP", sel.iip);
sm.AddMacro("VS_EXPAND", static_cast<int>(sel.expand));
sm.AddMacro("VS_ACCURATE_PRIMS", static_cast<int>(sel.accurate_prims));
const char* entry_point = (sel.expand != GSHWDrawConfig::VSExpand::None) ? "vs_main_expand" : "vs_main";
ComPtr<ID3DBlob> vs(m_shader_cache.GetVertexShader(m_tfx_source, sm.GetPtr(), entry_point));
@ -2922,6 +3088,10 @@ const ID3DBlob* GSDevice12::GetTFXPixelShader(const GSHWDrawConfig::PSSelector&
sm.AddMacro("PS_TEX_IS_FB", sel.tex_is_fb);
sm.AddMacro("PS_NO_COLOR", sel.no_color);
sm.AddMacro("PS_NO_COLOR1", sel.no_color1);
sm.AddMacro("PS_ACCURATE_PRIMS", sel.accurate_prims);
sm.AddMacro("PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa);
sm.AddMacro("PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe);
sm.AddMacro("PS_ZTST", sel.ztst);
ComPtr<ID3DBlob> ps(m_shader_cache.GetPixelShader(m_tfx_source, sm.GetPtr(), "ps_main"));
it = m_tfx_pixel_shaders.emplace(sel, std::move(ps)).first;
@ -3155,6 +3325,7 @@ void GSDevice12::InvalidateCachedState()
m_tfx_textures_handle_gpu.Clear();
m_tfx_samplers_handle_gpu.Clear();
m_tfx_rt_textures_handle_gpu.Clear();
m_tfx_depth_textures_handle_gpu.Clear();
}
void GSDevice12::SetVertexBuffer(D3D12_GPU_VIRTUAL_ADDRESS buffer, size_t size, size_t stride)
@ -3236,7 +3407,11 @@ void GSDevice12::PSSetShaderResource(int i, GSTexture* sr, bool check_state)
return;
m_tfx_textures[i] = handle;
m_dirty_flags |= (i < 2) ? DIRTY_FLAG_TFX_TEXTURES : DIRTY_FLAG_TFX_RT_TEXTURES;
m_dirty_flags |=
(i < 2) ? DIRTY_FLAG_TFX_TEXTURES :
(i < 4) ? DIRTY_FLAG_TFX_RT_TEXTURES :
(i < 5) ? DIRTY_FLAG_TFX_DEPTH_TEXTURES :
0;
}
void GSDevice12::PSSetSampler(GSHWDrawConfig::SamplerSelector sel)
@ -3642,6 +3817,17 @@ bool GSDevice12::ApplyTFXState(bool already_execed)
flags |= DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2;
}
if (flags & DIRTY_FLAG_TFX_DEPTH_TEXTURES)
{
if (!GetTextureGroupDescriptors(&m_tfx_depth_textures_handle_gpu, m_tfx_textures.data() + 4, 1))
{
ExecuteCommandListAndRestartRenderPass(false, "Ran out of TFX depth descriptor descriptor groups");
return ApplyTFXState(true);
}
flags |= DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3;
}
ID3D12GraphicsCommandList* cmdlist = GetCommandList();
if (m_current_root_signature != RootSignature::TFX)
@ -3649,7 +3835,8 @@ bool GSDevice12::ApplyTFXState(bool already_execed)
m_current_root_signature = RootSignature::TFX;
flags |= DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING | DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING |
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE | DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE |
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_PIPELINE;
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 |
DIRTY_FLAG_PIPELINE;
cmdlist->SetGraphicsRootSignature(m_tfx_root_signature.get());
}
@ -3662,12 +3849,28 @@ bool GSDevice12::ApplyTFXState(bool already_execed)
cmdlist->SetGraphicsRootShaderResourceView(TFX_ROOT_SIGNATURE_PARAM_VS_SRV,
m_vertex_stream_buffer.GetGPUPointer() + m_vertex.start * sizeof(GSVertex));
}
if (flags & DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING)
{
if (!GetDescriptorAllocator().Allocate(1, &m_accurate_prims_srv_descriptor_gpu))
{
Console.Error("Failed to allocate accurate prims GPU descriptor");
return false;
}
m_device.get()->CopyDescriptorsSimple(
1, m_accurate_prims_srv_descriptor_gpu, m_accurate_prims_srv_descriptor_cpu, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_ACCURATE_PRIMS_SRV, m_accurate_prims_srv_descriptor_gpu);
}
if (flags & DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE)
cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_TEXTURES, m_tfx_textures_handle_gpu);
if (flags & DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE)
cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_SAMPLERS, m_tfx_samplers_handle_gpu);
if (flags & DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2)
cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_RT_TEXTURES, m_tfx_rt_textures_handle_gpu);
if (flags & DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3)
cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_DEPTH_TEXTURES, m_tfx_depth_textures_handle_gpu);
ApplyBaseState(flags, cmdlist);
return true;
@ -3832,12 +4035,26 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
GSTexture12* draw_rt = static_cast<GSTexture12*>(config.rt);
GSTexture12* draw_ds = static_cast<GSTexture12*>(config.ds);
GSTexture12* draw_rt_clone = nullptr;
GSTexture12* draw_ds_clone = nullptr;
GSTexture12* date_image = nullptr;
ScopedGuard recycle_temp_textures([&]() {
if (draw_rt_clone)
Recycle(draw_rt_clone);
if (draw_ds_clone)
Recycle(draw_ds_clone);
if (date_image)
Recycle(date_image);
});
// Align the render area to 128x128, hopefully avoiding render pass restarts for small render area changes (e.g. Ratchet and Clank).
const GSVector2i rtsize(config.rt ? config.rt->GetSize() : config.ds->GetSize());
PipelineSelector& pipe = m_pipeline_selector;
// Copying buffers needs to be done outside of a render pass, so do this early.
SetupAccuratePrimsBuffer(config);
// figure out the pipeline
UpdateHWPipelineSelector(config);
@ -3906,7 +4123,6 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
}
// Primitive ID tracking DATE setup.
GSTexture12* date_image = nullptr;
if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking)
{
GSTexture* backup_rt = config.rt;
@ -3994,6 +4210,15 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
Console.Warning("D3D12: Failed to allocate temp texture for RT copy.");
}
if (draw_ds && config.require_full_barrier && m_features.multidraw_fb_copy && config.ps.IsFeedbackLoopDepth())
{
	// Requires a copy of the DS.
	// Used as "bind ds" flag when texture barrier is unsupported for tex is fb.
	draw_ds_clone = static_cast<GSTexture12*>(CreateTexture(rtsize.x, rtsize.y, 1, draw_ds->GetFormat(), true));
	// Check the texture that was just allocated (the DS clone, not the RT clone).
	if (!draw_ds_clone)
		Console.Warning("D3D12: Failed to allocate temp texture for DS copy.");
}
OMSetRenderTargets(draw_rt, draw_ds, config.scissor);
// Begin render pass if new target or out of the area.
@ -4040,7 +4265,8 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
UploadHWDrawVerticesAndIndices(config);
// now we can do the actual draw
SendHWDraw(pipe, config, draw_rt_clone, draw_rt, config.require_one_barrier, config.require_full_barrier, false);
SendHWDraw(pipe, config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds,
config.require_one_barrier, config.require_full_barrier, false);
// blend second pass
if (config.blend_multi_pass.enable)
@ -4070,15 +4296,10 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
pipe.cms = config.alpha_second_pass.colormask;
pipe.dss = config.alpha_second_pass.depth;
pipe.bs = config.blend;
SendHWDraw(pipe, config, draw_rt_clone, draw_rt, config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true);
SendHWDraw(pipe, config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds,
config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true);
}
if (draw_rt_clone)
Recycle(draw_rt_clone);
if (date_image)
Recycle(date_image);
// now blit the colclip texture back to the original target
if (colclip_rt)
{
@ -4113,23 +4334,40 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
}
}
void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config, GSTexture12* draw_rt_clone, GSTexture12* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier)
void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config,
GSTexture12* draw_rt_clone, GSTexture12* draw_rt,
GSTexture12* draw_ds_clone, GSTexture12* draw_ds,
const bool one_barrier, const bool full_barrier, const bool skip_first_barrier)
{
if (draw_rt_clone)
if (draw_rt_clone || draw_ds_clone)
{
#ifdef PCSX2_DEVBUILD
if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]]
if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]]
Console.Warning("D3D12: Possible unnecessary copy detected.");
#endif
auto CopyAndBind = [&](GSVector4i drawarea) {
EndRenderPass();
if (draw_rt_clone)
{
CopyRect(draw_rt, draw_rt_clone, drawarea, drawarea.left, drawarea.top);
draw_rt->TransitionToState(D3D12_RESOURCE_STATE_RENDER_TARGET);
}
if (draw_ds_clone)
{
CopyRect(draw_ds, draw_ds_clone, drawarea, drawarea.left, drawarea.top);
draw_ds->TransitionToState(D3D12_RESOURCE_STATE_DEPTH_WRITE);
}
if (one_barrier || full_barrier)
{
if (draw_rt_clone)
PSSetShaderResource(2, draw_rt_clone, true);
if (draw_ds_clone)
PSSetShaderResource(4, draw_ds_clone, true);
}
if (config.tex && config.tex == config.rt)
PSSetShaderResource(0, draw_rt_clone, true);
};
@ -4158,7 +4396,6 @@ void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig&
return;
}
// Optimization: For alpha second pass we can reuse the copy snapshot from the first pass.
if (!skip_first_barrier)
CopyAndBind(config.drawarea);
@ -4182,7 +4419,7 @@ void GSDevice12::UpdateHWPipelineSelector(GSHWDrawConfig& config)
m_pipeline_selector.ds = config.ds != nullptr;
}
void GSDevice12::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config)
void GSDevice12::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config)
{
IASetVertexBuffer(config.verts, sizeof(GSVertex), config.nverts);
@ -4200,4 +4437,7 @@ void GSDevice12::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config)
{
IASetIndexBuffer(config.indices, config.nindices);
}
// Needs to be done after vertex offset is set.
SetupAccuratePrimsConstants(config);
}

View File

@ -129,6 +129,8 @@ public:
// Allocates a temporary CPU staging buffer, fires the callback with it to populate, then copies to a GPU buffer.
bool AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer, D3D12MA::Allocation** gpu_allocation,
const std::function<void(void*)>& fill_callback);
ID3D12Resource* AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data);
ID3D12Resource* WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out);
private:
struct CommandListResources
@ -256,7 +258,8 @@ public:
NUM_TFX_CONSTANT_BUFFERS = 2,
NUM_TFX_TEXTURES = 2,
NUM_TFX_RT_TEXTURES = 2,
NUM_TOTAL_TFX_TEXTURES = NUM_TFX_TEXTURES + NUM_TFX_RT_TEXTURES,
NUM_TFX_DEPTH_TEXTURES = 1,
NUM_TOTAL_TFX_TEXTURES = NUM_TFX_TEXTURES + NUM_TFX_RT_TEXTURES + NUM_TFX_DEPTH_TEXTURES,
NUM_TFX_SAMPLERS = 1,
NUM_UTILITY_TEXTURES = 1,
NUM_UTILITY_SAMPLERS = 1,
@ -264,6 +267,10 @@ public:
VERTEX_BUFFER_SIZE = 32 * 1024 * 1024,
INDEX_BUFFER_SIZE = 16 * 1024 * 1024,
// Structured buffer size must be multiple of element size.
ACCURATE_PRIMS_BUFFER_SIZE = (32 * 1024 * 1024 / sizeof(AccuratePrimsEdgeData)) * sizeof(AccuratePrimsEdgeData),
VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024,
FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024,
@ -273,6 +280,8 @@ public:
TFX_ROOT_SIGNATURE_PARAM_PS_TEXTURES = 3,
TFX_ROOT_SIGNATURE_PARAM_PS_SAMPLERS = 4,
TFX_ROOT_SIGNATURE_PARAM_PS_RT_TEXTURES = 5,
TFX_ROOT_SIGNATURE_PARAM_PS_DEPTH_TEXTURES = 6,
TFX_ROOT_SIGNATURE_PARAM_PS_ACCURATE_PRIMS_SRV = 7,
UTILITY_ROOT_SIGNATURE_PARAM_PUSH_CONSTANTS = 0,
UTILITY_ROOT_SIGNATURE_PARAM_PS_TEXTURES = 1,
@ -299,6 +308,10 @@ private:
D3D12StreamBuffer m_vertex_stream_buffer;
D3D12StreamBuffer m_index_stream_buffer;
D3D12StreamBuffer m_accurate_prims_stream_buffer;
u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw.
D3D12DescriptorHandle m_accurate_prims_srv_descriptor_cpu;
D3D12DescriptorHandle m_accurate_prims_srv_descriptor_gpu;
D3D12StreamBuffer m_vertex_constant_buffer;
D3D12StreamBuffer m_pixel_constant_buffer;
D3D12StreamBuffer m_texture_stream_buffer;
@ -455,6 +468,8 @@ public:
void IASetVertexBuffer(const void* vertex, size_t stride, size_t count);
void IASetIndexBuffer(const void* index, size_t count);
void SetupAccuratePrimsBuffer(GSHWDrawConfig& config);
void SetupAccuratePrimsConstants(GSHWDrawConfig& config);
void PSSetShaderResource(int i, GSTexture* sr, bool check_state);
void PSSetSampler(GSHWDrawConfig::SamplerSelector sel);
@ -466,10 +481,13 @@ public:
bool BindDrawPipeline(const PipelineSelector& p);
void RenderHW(GSHWDrawConfig& config) override;
void SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config, GSTexture12* draw_rt_clone, GSTexture12* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier);
void SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config,
GSTexture12* draw_rt_clone, GSTexture12* draw_rt,
GSTexture12* draw_ds_clone, GSTexture12* draw_ds,
const bool one_barrier, const bool full_barrier, const bool skip_first_barrier);
void UpdateHWPipelineSelector(GSHWDrawConfig& config);
void UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config);
void UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config);
public:
/// Ends any render pass, executes the command buffer, and invalidates cached state.
@ -527,33 +545,37 @@ private:
DIRTY_FLAG_TFX_TEXTURES = (1 << 2),
DIRTY_FLAG_TFX_SAMPLERS = (1 << 3),
DIRTY_FLAG_TFX_RT_TEXTURES = (1 << 4),
DIRTY_FLAG_TFX_DEPTH_TEXTURES = (1 << 5),
DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING = (1 << 5),
DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING = (1 << 6),
DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING = (1 << 7),
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE = (1 << 8),
DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE = (1 << 9),
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 = (1 << 10),
DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING = (1 << 6),
DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING = (1 << 7),
DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING = (1 << 8),
DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING = (1 << 9),
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE = (1 << 10),
DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE = (1 << 11),
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 = (1 << 12),
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 = (1 << 13),
DIRTY_FLAG_VERTEX_BUFFER = (1 << 11),
DIRTY_FLAG_INDEX_BUFFER = (1 << 12),
DIRTY_FLAG_PRIMITIVE_TOPOLOGY = (1 << 13),
DIRTY_FLAG_VIEWPORT = (1 << 14),
DIRTY_FLAG_SCISSOR = (1 << 15),
DIRTY_FLAG_RENDER_TARGET = (1 << 16),
DIRTY_FLAG_PIPELINE = (1 << 17),
DIRTY_FLAG_BLEND_CONSTANTS = (1 << 18),
DIRTY_FLAG_STENCIL_REF = (1 << 19),
DIRTY_FLAG_VERTEX_BUFFER = (1 << 14),
DIRTY_FLAG_INDEX_BUFFER = (1 << 15),
DIRTY_FLAG_PRIMITIVE_TOPOLOGY = (1 << 16),
DIRTY_FLAG_VIEWPORT = (1 << 17),
DIRTY_FLAG_SCISSOR = (1 << 18),
DIRTY_FLAG_RENDER_TARGET = (1 << 19),
DIRTY_FLAG_PIPELINE = (1 << 20),
DIRTY_FLAG_BLEND_CONSTANTS = (1 << 21),
DIRTY_FLAG_STENCIL_REF = (1 << 22),
DIRTY_BASE_STATE = DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING | DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING |
DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE |
DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 |
DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING | DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING |
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE | DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE |
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 |
DIRTY_FLAG_VERTEX_BUFFER | DIRTY_FLAG_INDEX_BUFFER | DIRTY_FLAG_PRIMITIVE_TOPOLOGY |
DIRTY_FLAG_VIEWPORT | DIRTY_FLAG_SCISSOR | DIRTY_FLAG_RENDER_TARGET | DIRTY_FLAG_PIPELINE |
DIRTY_FLAG_BLEND_CONSTANTS | DIRTY_FLAG_STENCIL_REF,
DIRTY_TFX_STATE =
DIRTY_BASE_STATE | DIRTY_FLAG_TFX_TEXTURES | DIRTY_FLAG_TFX_SAMPLERS | DIRTY_FLAG_TFX_RT_TEXTURES,
DIRTY_TFX_STATE = DIRTY_BASE_STATE | DIRTY_FLAG_TFX_TEXTURES | DIRTY_FLAG_TFX_SAMPLERS |
DIRTY_FLAG_TFX_RT_TEXTURES | DIRTY_FLAG_TFX_DEPTH_TEXTURES,
DIRTY_UTILITY_STATE = DIRTY_BASE_STATE,
DIRTY_CONSTANT_BUFFER_STATE = DIRTY_FLAG_VS_CONSTANT_BUFFER | DIRTY_FLAG_PS_CONSTANT_BUFFER,
};
@ -594,6 +616,7 @@ private:
D3D12DescriptorHandle m_tfx_textures_handle_gpu;
D3D12DescriptorHandle m_tfx_samplers_handle_gpu;
D3D12DescriptorHandle m_tfx_rt_textures_handle_gpu;
D3D12DescriptorHandle m_tfx_depth_textures_handle_gpu;
D3D12DescriptorHandle m_utility_texture_cpu;
D3D12DescriptorHandle m_utility_texture_gpu;

View File

@ -350,43 +350,6 @@ ID3D12GraphicsCommandList* GSTexture12::GetCommandBufferForUpdate()
return dev->GetInitCommandList();
}
ID3D12Resource* GSTexture12::AllocateUploadStagingBuffer(
const void* data, u32 pitch, u32 upload_pitch, u32 height) const
{
const u32 buffer_size = CalcUploadSize(height, upload_pitch);
wil::com_ptr_nothrow<ID3D12Resource> resource;
wil::com_ptr_nothrow<D3D12MA::Allocation> allocation;
const D3D12MA::ALLOCATION_DESC allocation_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD};
const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, buffer_size, 1, 1, 1,
DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocation_desc, &resource_desc,
D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(resource.put()));
if (FAILED(hr))
{
Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr);
return nullptr;
}
void* map_ptr;
hr = resource->Map(0, nullptr, &map_ptr);
if (FAILED(hr))
{
Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr);
return nullptr;
}
CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);
const D3D12_RANGE write_range = {0, buffer_size};
resource->Unmap(0, &write_range);
// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
// This adds the reference needed to keep the buffer alive.
GSDevice12::GetInstance()->DeferResourceDestruction(allocation.get(), resource.get());
return resource.get();
}
void GSTexture12::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const
{
const u32 block_size = GetCompressedBlockSize();
@ -406,7 +369,7 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l
const u32 width = Common::AlignUpPow2(r.width(), block_size);
const u32 height = Common::AlignUpPow2(r.height(), block_size);
const u32 upload_pitch = Common::AlignUpPow2<u32>(pitch, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
const u32 required_size = CalcUploadSize(r.height(), upload_pitch);
const u32 required_size = CalcUploadSize(height, upload_pitch);
D3D12_TEXTURE_COPY_LOCATION srcloc;
srcloc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
@ -416,35 +379,25 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l
srcloc.PlacedFootprint.Footprint.Format = m_dxgi_format;
srcloc.PlacedFootprint.Footprint.RowPitch = upload_pitch;
const auto upload_data = [&](void* map_ptr) {
CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);
};
// If the texture is larger than half our streaming buffer size, use a separate buffer.
// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
if (required_size > (GSDevice12::GetInstance()->GetTextureStreamBuffer().GetSize() / 2))
{
srcloc.pResource = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height);
if (!srcloc.pResource)
return false;
srcloc.pResource = GSDevice12::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data);
srcloc.PlacedFootprint.Offset = 0;
}
else
{
D3D12StreamBuffer& sbuffer = GSDevice12::GetInstance()->GetTextureStreamBuffer();
if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
{
GSDevice12::GetInstance()->ExecuteCommandList(
false, "While waiting for %u bytes in texture upload buffer", required_size);
if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
{
Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size);
u32 offset;
srcloc.pResource = GSDevice12::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, offset);
srcloc.PlacedFootprint.Offset = offset;
}
if (!srcloc.pResource)
return false;
}
}
srcloc.pResource = sbuffer.GetBuffer();
srcloc.PlacedFootprint.Offset = sbuffer.GetCurrentOffset();
CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height);
sbuffer.CommitMemory(required_size);
}
ID3D12GraphicsCommandList* cmdlist = GetCommandBufferForUpdate();
GL_PUSH("GSTexture12::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer);

View File

@ -79,7 +79,6 @@ private:
static bool CreateUAVDescriptor(ID3D12Resource* resource, DXGI_FORMAT format, D3D12DescriptorHandle* dh);
ID3D12GraphicsCommandList* GetCommandBufferForUpdate();
ID3D12Resource* AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const;
void CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const;
wil::com_ptr_nothrow<ID3D12Resource> m_resource;

View File

@ -291,6 +291,360 @@ void GSRendererHW::Lines2Sprites()
}
}
/// Builds a quad (as two triangles, six vertices) that covers the line segment v0 -> v1 with a margin.
///
/// The input coordinates are divided by 16 to get the direction, and the margin is 2 units of that
/// divided space (32 input units) both along the line direction and along its normal.
/// NOTE(review): this looks like 1/16-pixel fixed point, i.e. a 2 pixel margin - confirm.
///
/// Only XYZ.X / XYZ.Y of the output vertices are set (clamped to [0, 0xFFFF]); every other field is zero.
/// `out` must have room for 6 vertices, written as triangles (q0, q1, q2) and (q1, q2, q3).
///
/// A zero-length segment (v0 == v1) has no direction; an arbitrary +X direction is used so the result is
/// a well-defined square around the point instead of dividing by zero.
static __forceinline void GetCoveringQuad(const GSVector2i& v0, const GSVector2i& v1, GSVertex* out)
{
	const float x0 = static_cast<float>(v0.x) / 16.0f;
	const float y0 = static_cast<float>(v0.y) / 16.0f;
	const float x1 = static_cast<float>(v1.x) / 16.0f;
	const float y1 = static_cast<float>(v1.y) / 16.0f;

	float dx = x1 - x0;
	float dy = y1 - y0;
	const float d_len = sqrtf(dx * dx + dy * dy);
	if (d_len > 0.0f)
	{
		// Direction vector scaled to length 2.
		dx = 2.0f * dx / d_len;
		dy = 2.0f * dy / d_len;
	}
	else
	{
		// Degenerate line: dividing by d_len would produce NaN, and converting NaN to int is undefined.
		dx = 2.0f;
		dy = 0.0f;
	}

	// Normal: the direction rotated by 90 degrees.
	const float nx = -dy;
	const float ny = dx;

	// Back to the input's integer units.
	const int dxi = static_cast<int>(16.0f * dx);
	const int dyi = static_cast<int>(16.0f * dy);
	const int nxi = static_cast<int>(16.0f * nx);
	const int nyi = static_cast<int>(16.0f * ny);

	GSVertex v[4];
	std::memset(v, 0, sizeof(v));

	// Corners: each endpoint pushed outwards along the direction, then to either side along the normal.
	v[0].XYZ.X = static_cast<u32>(std::clamp<int>(v0.x - dxi - nxi, 0, 0xFFFF));
	v[0].XYZ.Y = static_cast<u32>(std::clamp<int>(v0.y - dyi - nyi, 0, 0xFFFF));

	v[1].XYZ.X = static_cast<u32>(std::clamp<int>(v0.x - dxi + nxi, 0, 0xFFFF));
	v[1].XYZ.Y = static_cast<u32>(std::clamp<int>(v0.y - dyi + nyi, 0, 0xFFFF));

	v[2].XYZ.X = static_cast<u32>(std::clamp<int>(v1.x + dxi - nxi, 0, 0xFFFF));
	v[2].XYZ.Y = static_cast<u32>(std::clamp<int>(v1.y + dyi - nyi, 0, 0xFFFF));

	v[3].XYZ.X = static_cast<u32>(std::clamp<int>(v1.x + dxi + nxi, 0, 0xFFFF));
	v[3].XYZ.Y = static_cast<u32>(std::clamp<int>(v1.y + dyi + nyi, 0, 0xFFFF));

	// Two triangles sharing the v[1]-v[2] diagonal.
	out[0] = v[0];
	out[1] = v[1];
	out[2] = v[2];
	out[3] = v[1];
	out[4] = v[2];
	out[5] = v[3];
}
void GSRendererHW::GetAccuratePrimsEdgeVertexAttributes(const GSVertex& vtx0, const GSVertex& vtx1, const GSVertex* vtx_provoking, AccuratePrimsEdgeData& data)
{
GSVector2i v0 = { static_cast<int>(vtx0.XYZ.X), static_cast<int>(vtx0.XYZ.Y) };
GSVector2i v1 = { static_cast<int>(vtx1.XYZ.X), static_cast<int>(vtx1.XYZ.Y) };
// Interpolated attributes - mimicks transformations done in vertex shader.
GSVector2 uv0 = GSVector2(static_cast<float>(vtx0.U), static_cast<float>(vtx0.V)) - m_conf.cb_vs.texture_offset;
GSVector2 uv1 = GSVector2(static_cast<float>(vtx1.U), static_cast<float>(vtx1.V)) - m_conf.cb_vs.texture_offset;
GSVector2 uv0_scale = uv0 * m_conf.cb_vs.texture_scale;
GSVector2 uv1_scale = uv1 * m_conf.cb_vs.texture_scale;
GSVector2 st0 = GSVector2(vtx0.ST.S, vtx0.ST.T) - m_conf.cb_vs.texture_offset;
GSVector2 st1 = GSVector2(vtx1.ST.S, vtx1.ST.T) - m_conf.cb_vs.texture_offset;
GSVector2 st0_scale = PRIM->TME ? st0 / m_conf.cb_vs.texture_scale : GSVector2(0);
GSVector2 st1_scale = PRIM->TME ? st1 / m_conf.cb_vs.texture_scale : GSVector2(0);
float fog0;
float fog1;
if (vtx_provoking)
{
fog0 = fog1 = static_cast<float>(vtx_provoking->FOG) / 255.0f;
}
else
{
fog0 = static_cast<float>(vtx0.FOG) / 255.0f;
fog1 = static_cast<float>(vtx1.FOG) / 255.0f;
}
data.t_float0 = GSVector4(st0.x, st0.y, fog0, vtx0.RGBAQ.Q);
data.t_float1 = GSVector4(st1.x, st1.y, fog1, vtx1.RGBAQ.Q);
data.t_int0 = GSVector4(uv0_scale.x, uv0_scale.y);
data.t_int1 = GSVector4(uv1_scale.x, uv1_scale.y);
if (m_conf.vs.fst)
{
data.t_int0.z = uv0.x;
data.t_int0.w = uv0.y;
data.t_int1.z = uv1.x;
data.t_int1.w = uv1.y;
}
else
{
data.t_int0.z = st0_scale.x;
data.t_int0.w = st0_scale.y;
data.t_int1.z = st1_scale.x;
data.t_int1.w = st1_scale.y;
}
constexpr float exp_min32 = 0x1p-32f;
float z0 = static_cast<float>(std::min(vtx0.XYZ.Z, static_cast<u32>(m_conf.cb_vs.max_depth.x)));
float z1 = static_cast<float>(std::min(vtx1.XYZ.Z, static_cast<u32>(m_conf.cb_vs.max_depth.x)));
GSVector2 xy0 = GSVector2(v0.x, v0.y) - GSVector2(0.05f);
GSVector2 xy1 = GSVector2(v1.x, v1.y) - GSVector2(0.05f);
xy0 = xy0 * m_conf.cb_vs.vertex_scale - m_conf.cb_vs.vertex_offset;
xy1 = xy1 * m_conf.cb_vs.vertex_scale - m_conf.cb_vs.vertex_offset;
GSRendererType renderer = GSGetCurrentRenderer();
float y_sign = (renderer == GSRendererType::DX11 || renderer == GSRendererType::DX12) ? -1.0f : 1.0f;
data.p0 = GSVector4(xy0.x, y_sign * xy0.y, z0 * exp_min32, 1.0f);
data.p1 = GSVector4(xy1.x, y_sign * xy1.y, z1 * exp_min32, 1.0f);
if (vtx_provoking)
{
data.c0 = data.c1 = GSVector4(
static_cast<float>(vtx_provoking->RGBAQ.R),
static_cast<float>(vtx_provoking->RGBAQ.G),
static_cast<float>(vtx_provoking->RGBAQ.B),
static_cast<float>(vtx_provoking->RGBAQ.A));
}
else
{
data.c0 = GSVector4(
static_cast<float>(vtx0.RGBAQ.R),
static_cast<float>(vtx0.RGBAQ.G),
static_cast<float>(vtx0.RGBAQ.B),
static_cast<float>(vtx0.RGBAQ.A));
data.c1 = GSVector4(
static_cast<float>(vtx1.RGBAQ.R),
static_cast<float>(vtx1.RGBAQ.G),
static_cast<float>(vtx1.RGBAQ.B),
static_cast<float>(vtx1.RGBAQ.A));
}
}
void GSRendererHW::ExpandAccurateTrianglesEdge(
const GSVertex& vtx0,
const GSVertex& vtx1,
const GSVertex* vtx_provoking,
const GSVector4i& edge0,
const GSVector4i& edge1,
bool top_left,
AccuratePrimsEdgeData& data,
GSVertex* vertex_out)
{
const GSVector2i v0 = { static_cast<int>(vtx0.XYZ.X), static_cast<int>(vtx0.XYZ.Y) };
const GSVector2i v1 = { static_cast<int>(vtx1.XYZ.X), static_cast<int>(vtx1.XYZ.Y) };
const GSVector4i& xyof = m_context->scissor.xyof;
data.xy0 = GSVector2i(v0.x - xyof.x, v0.y - xyof.y);
data.xy1 = GSVector2i(v1.x - xyof.x, v1.y - xyof.y);
const GSVector2i dxy = data.xy1 - data.xy0;
const bool pos_x = dxy.x >= 0;
const bool pos_y = dxy.y >= 0;
data.edge0 = edge0;
data.edge1 = edge1;
data.step_x = std::abs(dxy.x) >= std::abs(dxy.y);
data.side = top_left != (data.step_x && (dxy.y != 0) && (pos_x == pos_y));
GetAccuratePrimsEdgeVertexAttributes(vtx0, vtx1, vtx_provoking, data);
GetCoveringQuad(v0, v1, vertex_out);
}
// Vertex permutation table used to order a triangle's three vertices by Y.
// ExpandAccurateTrianglesVertices indexes it with the low three bits of the comparison
// mask (y0 > y1, y0 > y2, y1 > y2), i.e. bit 0, bit 1 and bit 2 respectively.
// The first three entries of a row are the original vertex indices in sorted order;
// the fourth entry is always zero and is not read by that function.
static const u8 s_ysort[8][4] =
{
{0, 1, 2, 0}, // y0 <= y1 <= y2
{1, 0, 2, 0}, // y1 < y0 <= y2
{0, 0, 0, 0}, // impossible combination (y0 <= y1 <= y2 < y0)
{1, 2, 0, 0}, // y1 <= y2 < y0
{0, 2, 1, 0}, // y0 <= y2 < y1
{0, 0, 0, 0}, // impossible combination (y2 < y1 < y0 <= y2)
{2, 0, 1, 0}, // y2 < y0 <= y1
{2, 1, 0, 0}, // y2 < y1 < y0
};
void GSRendererHW::ExpandAccurateTrianglesVertices()
{
constexpr int verts_per_prim = 21; // 3 verts for triangle interior; 3 x 6 verts for the edges.
const int prims = m_index.tail / 3;
while (m_vertex.maxcount < static_cast<u32>(prims * verts_per_prim))
GrowVertexBuffer();
m_accurate_prims_edge_data.clear();
m_accurate_prims_edge_data.resize(3 * prims);
const GSVector4i& xyof = m_context->scissor.xyof;
const bool flat_shade = !PRIM->IIP;
const int provoking_offset = g_gs_device->Features().provoking_vertex_last ? 2 : 0;
for (int i = 0; i < prims; i++)
{
// Code from GSRasterizer
const GSVertex& vtx0_orig = m_vertex.buff[m_index.buff[3 * i + 0]];
const GSVertex& vtx1_orig = m_vertex.buff[m_index.buff[3 * i + 1]];
const GSVertex& vtx2_orig = m_vertex.buff[m_index.buff[3 * i + 2]];
const GSVector2i v0_orig = { static_cast<int>(vtx0_orig.XYZ.X) - xyof.x, static_cast<int>(vtx0_orig.XYZ.Y) - xyof.y };
const GSVector2i v1_orig = { static_cast<int>(vtx1_orig.XYZ.X) - xyof.x, static_cast<int>(vtx1_orig.XYZ.Y) - xyof.y };
const GSVector2i v2_orig = { static_cast<int>(vtx2_orig.XYZ.X) - xyof.x, static_cast<int>(vtx2_orig.XYZ.Y) - xyof.y };
GSVector4i y0011(v0_orig.y, v0_orig.y, v1_orig.y, v1_orig.y);
GSVector4i y1221(v1_orig.y, v2_orig.y, v2_orig.y, v1_orig.y);
int m1 = GSVector4::cast(y0011 > y1221).mask() & 7;
const u8* idx = s_ysort[m1];
const GSVertex* vtx[3] = { &vtx0_orig, &vtx1_orig, &vtx2_orig };
const GSVector2i* v[3] = { &v0_orig, &v1_orig, &v2_orig };
const GSVertex& vtx0 = *vtx[idx[0]];
const GSVertex& vtx1 = *vtx[idx[1]];
const GSVertex& vtx2 = *vtx[idx[2]];
const GSVertex* vtx_provoking = flat_shade ? vtx[idx[provoking_offset]] : nullptr;
const GSVector2i& v0 = *v[idx[0]];
const GSVector2i& v1 = *v[idx[1]];
const GSVector2i& v2 = *v[idx[2]];
y0011 = GSVector4i(v0.y, v0.y, v1.y, v1.y);
y1221 = GSVector4i(v1.y, v2.y, v2.y, v1.y);
m1 = GSVector4::cast(y0011 == y1221).mask() & 7;
if (m1 == 7)
continue; // Degenerate triangle.
GSVector2i dv0 = v1 - v0;
GSVector2i dv1 = v2 - v0;
GSVector2i dv2 = v2 - v1;
int cross = dv0.y * dv1.x - dv0.x * dv1.y;
if (cross == 0)
continue; // Degenerate triangle
bool clockwise = cross < 0;
const bool tl0 = (v0.y == v1.y) || !clockwise;
const bool tl1 = clockwise;
const bool tl2 = (v1.y != v2.y) && !clockwise;
GSVector4i edge0 = GSVector4i( dv0.y, -dv0.x, 0, 0);
GSVector4i edge1 = GSVector4i(-dv1.y, dv1.x, 0, 0);
GSVector4i edge2 = GSVector4i( dv2.y, -dv2.x, 0, 0);
edge0.z = v1.x * v0.y - v0.x * v1.y;
edge1.z = v0.x * v2.y - v2.x * v0.y;
edge2.z = v2.x * v1.y - v1.x * v2.y;
if (clockwise)
{
edge0 = GSVector4i(0) - edge0;
edge1 = GSVector4i(0) - edge1;
edge2 = GSVector4i(0) - edge2;
}
// Bias for top-left edges.
edge0.z += tl0 ? 1 : 0;
edge1.z += tl1 ? 1 : 0;
edge2.z += tl2 ? 1 : 0;
// Interior triangle
m_vertex.buff_copy[verts_per_prim * i + 0] = vtx0;
m_vertex.buff_copy[verts_per_prim * i + 1] = vtx1;
m_vertex.buff_copy[verts_per_prim * i + 2] = vtx2;
// Edges
ExpandAccurateTrianglesEdge(vtx0, vtx1, vtx_provoking, edge1, edge2, tl0, m_accurate_prims_edge_data[3 * i + 0],
&m_vertex.buff_copy[verts_per_prim * i + 3]);
ExpandAccurateTrianglesEdge(vtx0, vtx2, vtx_provoking, edge2, edge0, tl1, m_accurate_prims_edge_data[3 * i + 1],
&m_vertex.buff_copy[verts_per_prim * i + 9]);
ExpandAccurateTrianglesEdge(vtx1, vtx2, vtx_provoking, edge0, edge1, tl2, m_accurate_prims_edge_data[3 * i + 2],
&m_vertex.buff_copy[verts_per_prim * i + 15]);
}
m_index.tail = prims * verts_per_prim;
for (std::size_t i = 0; i < m_index.tail; i++)
{
m_index.buff[i] = i;
}
m_vertex.next = m_vertex.tail = m_vertex.head = m_index.tail;
std::swap(m_vertex.buff, m_vertex.buff_copy);
}
void GSRendererHW::ExpandAccurateLinesVertices()
{
constexpr int verts_per_prim = 6; // 6 verts to form quad covering each line.
const int prims = m_index.tail / 2;
const bool flat_shade = !PRIM->IIP;
const int provoking_offset = g_gs_device->Features().provoking_vertex_last ? 1 : 0;
const auto ExitRule = [](const GSVector2i& d, bool step_x, bool pos_step) {
int dist = std::abs(d.x) + std::abs(d.y);
if (dist < 8)
return false;
if (step_x)
{
bool x_good = pos_step ? (d.x > 0) : (d.x < 0);
return x_good && (dist > 8 || d.y >= 0);
}
else
{
bool y_good = pos_step ? (d.y > 0) : (d.y < 0);
return y_good && (dist > 8 || d.x >= 0);
}
};
while (m_vertex.maxcount < static_cast<u32>(verts_per_prim * prims))
GrowVertexBuffer();
m_accurate_prims_edge_data.clear();
m_accurate_prims_edge_data.resize(prims);
const GSVector4i& xyof = m_context->scissor.xyof;
for (int i = 0; i < prims; i++)
{
const GSVertex& vtx0 = m_vertex.buff[m_index.buff[2 * i + 0]];
const GSVertex& vtx1 = m_vertex.buff[m_index.buff[2 * i + 1]];
const GSVertex* vtx_provoking = flat_shade ? &m_vertex.buff[m_index.buff[2 * i + provoking_offset]] : nullptr;
const GSVector2i v0 = { static_cast<int>(vtx0.XYZ.X), static_cast<int>(vtx0.XYZ.Y) };
const GSVector2i v1 = { static_cast<int>(vtx1.XYZ.X), static_cast<int>(vtx1.XYZ.Y) };
AccuratePrimsEdgeData& data = m_accurate_prims_edge_data[i];
data.xy0 = GSVector2i(v0.x - xyof.x, v0.y - xyof.y);
data.xy1 = GSVector2i(v1.x - xyof.x, v1.y - xyof.y);
const GSVector2i dxy = data.xy1 - data.xy0;
const GSVector2i xy0_i = (data.xy0 + 8) & GSVector2i(~0xF);
const GSVector2i xy1_i = (data.xy1 + 8) & GSVector2i(~0xF);
data.step_x = std::abs(dxy.x) >= std::abs(dxy.y);
bool pos_step = data.step_x ? dxy.x >= 0 : dxy.y >= 0;
data.draw0 = !ExitRule(data.xy0 - xy0_i, data.step_x, pos_step);
data.draw1 = ExitRule(data.xy1 - xy1_i, data.step_x, pos_step);
GetAccuratePrimsEdgeVertexAttributes(vtx0, vtx1, vtx_provoking, data);
GetCoveringQuad(v0, v1, &m_vertex.buff_copy[i * verts_per_prim]);
}
m_index.tail = prims * verts_per_prim;
for (std::size_t i = 0; i < m_index.tail; i++)
{
m_index.buff[i] = i;
}
m_vertex.next = m_vertex.tail = m_vertex.head = m_index.tail;
std::swap(m_vertex.buff, m_vertex.buff_copy);
}
void GSRendererHW::ExpandLineIndices()
{
const u32 process_count = (m_index.tail + 7) / 8 * 8;
@ -2471,7 +2825,7 @@ void GSRendererHW::Draw()
// Need to fix the alpha test, since the alpha will be fixed to 1.0 if ABE is disabled and AA1 is enabled
// So if it doesn't meet the condition, always fail, if it does, always pass (turn off the test).
if (IsCoverageAlpha() && m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > 1)
if (IsCoverageAlphaFixedOne() && m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > 1)
{
const float aref = static_cast<float>(m_cached_ctx.TEST.AREF);
const int old_ATST = m_cached_ctx.TEST.ATST;
@ -5017,6 +5371,21 @@ void GSRendererHW::SetupIA(float target_scale, float sx, float sy, bool req_vert
break;
case GS_LINE_CLASS:
{
if (features.accurate_prims)
{
GL_INS("HW: Using accurate lines");
ExpandAccurateLinesVertices();
m_conf.accurate_prims = true;
m_conf.accurate_prims_edge_data = &m_accurate_prims_edge_data;
m_conf.vs.accurate_prims = ACCURATE_PRIMS_LINE;
m_conf.ps.accurate_prims = ACCURATE_PRIMS_LINE;
m_conf.ps.accurate_prims_aa = (PRIM->AA1 != 0);
m_conf.ps.accurate_prims_aa_abe = (PRIM->ABE != 0);
m_conf.topology = GSHWDrawConfig::Topology::Triangle;
m_conf.indices_per_prim = 6;
}
else
{
m_conf.topology = GSHWDrawConfig::Topology::Line;
m_conf.indices_per_prim = 2;
@ -5036,6 +5405,7 @@ void GSRendererHW::SetupIA(float target_scale, float sx, float sy, bool req_vert
}
}
}
}
break;
case GS_SPRITE_CLASS:
@ -5076,6 +5446,20 @@ void GSRendererHW::SetupIA(float target_scale, float sx, float sy, bool req_vert
break;
case GS_TRIANGLE_CLASS:
if (features.accurate_prims && PRIM->AA1)
{
GL_INS("HW: Using accurate triangles");
ExpandAccurateTrianglesVertices();
m_conf.accurate_prims = true;
m_conf.accurate_prims_edge_data = &m_accurate_prims_edge_data;
m_conf.vs.accurate_prims = ACCURATE_PRIMS_TRIANGLE;
m_conf.ps.accurate_prims = ACCURATE_PRIMS_TRIANGLE;
m_conf.ps.accurate_prims_aa = (PRIM->AA1 != 0);
m_conf.ps.accurate_prims_aa_abe = (PRIM->ABE != 0);
m_conf.topology = GSHWDrawConfig::Topology::Triangle;
m_conf.indices_per_prim = 21;
}
else
{
m_conf.topology = GSHWDrawConfig::Topology::Triangle;
m_conf.indices_per_prim = 3;
@ -5130,6 +5514,10 @@ void GSRendererHW::EmulateZbuffer(const GSTextureCache::Target* ds)
m_conf.depth.ztst = ZTST_ALWAYS;
}
// Accurate prims requires a manual depth interpolation in the pixel shader.
// Piggy-back on Z clamp to avoid creating more pipeline combinations.
bool accurate_prims_clamp_z = UsingAccuratePrims() && (m_conf.depth.zwe || m_conf.depth.ztst != ZTST_ALWAYS);
// On the real GS we appear to do clamping on the max z value the format allows.
// Clamping is done after rasterization.
const u32 max_z = 0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8);
@ -5139,16 +5527,23 @@ void GSRendererHW::EmulateZbuffer(const GSTextureCache::Target* ds)
//ps_cb.MaxDepth = GSVector4(0.0f, 0.0f, 0.0f, 1.0f);
m_conf.ps.zclamp = 0;
if (clamp_z)
if (clamp_z || accurate_prims_clamp_z)
{
if (m_vt.m_primclass == GS_SPRITE_CLASS || m_vt.m_primclass == GS_POINT_CLASS)
{
m_conf.cb_vs.max_depth = GSVector2i(max_z);
}
else if (!m_cached_ctx.ZBUF.ZMSK)
else if (!m_cached_ctx.ZBUF.ZMSK || accurate_prims_clamp_z)
{
m_conf.cb_ps.TA_MaxDepth_Af.z = static_cast<float>(max_z) * 0x1p-32f;
m_conf.ps.zclamp = 1;
if (accurate_prims_clamp_z && m_vt.m_primclass == GS_TRIANGLE_CLASS && PRIM->AA1 &&
m_cached_ctx.TEST.ZTE && (m_conf.depth.ztst == ZTST_GEQUAL || m_conf.depth.ztst == ZTST_GREATER))
{
// For HW AA1 with triangles we must do Z test in the shader to get proper
// updating of the Z buffer (interior triangle points update the Z buffer but edges should not).
m_conf.ps.ztst = m_conf.depth.ztst;
}
}
}
}
@ -5619,15 +6014,13 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
{
const GIFRegALPHA& ALPHA = m_context->ALPHA;
{
// AA1: Blending needs to be enabled on draw.
const bool AA1 = PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS);
// PABE: Check condition early as an optimization, no blending when As < 128.
// For Cs*As + Cd*(1 - As) if As is 128 then blending can be disabled as well.
const bool PABE_skip = m_draw_env->PABE.PABE &&
((GetAlphaMinMax().max < 128) || (GetAlphaMinMax().max == 128 && ALPHA.A == 0 && ALPHA.B == 1 && ALPHA.C == 0 && ALPHA.D == 1));
// No blending or coverage anti-aliasing so early exit
if (PABE_skip || !(NeedsBlending() || AA1))
if (PABE_skip || !(NeedsBlending() || IsCoverageAlpha()))
{
m_conf.blend = {};
m_conf.ps.no_color1 = true;
@ -7315,8 +7708,8 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
const bool is_overlap_alpha = m_prim_overlap != PRIM_OVERLAP_NO && !(m_cached_ctx.FRAME.FBMSK & 0x80000000);
if (m_cached_ctx.TEST.DATM == 0)
{
// Some pixles are >= 1 so some fail, or some pixels get written but the written alpha matches or exceeds 1 (so overlap doesn't always pass).
DATE = rt->m_alpha_max >= 128 || (is_overlap_alpha && rt->m_alpha_min < 128 && (GetAlphaMinMax().max >= 128 || (m_context->FBA.FBA || IsCoverageAlpha())));
// Some pixels are >= 1 so some fail, or some pixels get written but the written alpha matches or exceeds 1 (so overlap doesn't always pass).
DATE = rt->m_alpha_max >= 128 || (is_overlap_alpha && rt->m_alpha_min < 128 && (GetAlphaMinMax().max >= 128 || (m_context->FBA.FBA || IsCoverageAlphaFixedOne())));
// All pixels fail.
if (DATE && rt->m_alpha_min >= 128)
@ -7324,8 +7717,8 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
}
else
{
// Some pixles are < 1 so some fail, or some pixels get written but the written alpha goes below 1 (so overlap doesn't always pass).
DATE = rt->m_alpha_min < 128 || (is_overlap_alpha && rt->m_alpha_max >= 128 && (GetAlphaMinMax().min < 128 && !(m_context->FBA.FBA || IsCoverageAlpha())));
// Some pixels are < 1 so some fail, or some pixels get written but the written alpha goes below 1 (so overlap doesn't always pass).
DATE = rt->m_alpha_min < 128 || (is_overlap_alpha && rt->m_alpha_max >= 128 && (GetAlphaMinMax().min < 128 && !(m_context->FBA.FBA || IsCoverageAlphaFixedOne())));
// All pixels fail.
if (DATE && rt->m_alpha_max < 128)
@ -7477,7 +7870,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
}
// When Blending is disabled and Edge Anti Aliasing is enabled,
// the output alpha is Coverage (which we force to 128) so DATE will fail/pass guaranteed on second pass.
else if (m_conf.colormask.wa && (m_context->FBA.FBA || IsCoverageAlpha()) && features.stencil_buffer)
else if (m_conf.colormask.wa && (m_context->FBA.FBA || IsCoverageAlphaFixedOne()) && features.stencil_buffer)
{
GL_PERF("DATE: Fast with FBA, all pixels will be >= 128");
DATE_one = !m_cached_ctx.TEST.DATM;
@ -7663,7 +8056,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
}
// AA1: Set alpha source to coverage 128 when there is no alpha blending.
m_conf.ps.fixed_one_a = IsCoverageAlpha();
m_conf.ps.fixed_one_a = IsCoverageAlphaFixedOne();
if ((!IsOpaque() || m_context->ALPHA.IsBlack()) && rt && ((m_conf.colormask.wrgba & 0x7) || (m_texture_shuffle && !m_copy_16bit_to_target_shuffle && !m_same_group_texture_shuffle)))
{
@ -8030,6 +8423,23 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
m_conf.require_full_barrier = false;
}
if ((features.texture_barrier || features.multidraw_fb_copy) && UsingAccuratePrims() &&
(m_vt.m_primclass == GS_TRIANGLE_CLASS) && PRIM->AA1 && m_conf.ps.zclamp)
{
// Manual depth test in the shader requires full barrier.
if (m_prim_overlap == PRIM_OVERLAP_NO)
m_conf.require_one_barrier = true;
else
m_conf.require_full_barrier = true;
}
if (m_conf.require_full_barrier && (g_gs_device->Features().texture_barrier || g_gs_device->Features().multidraw_fb_copy))
{
ComputeDrawlistGetSize(rt->m_scale);
m_conf.drawlist = &m_drawlist;
m_conf.drawlist_bbox = &m_drawlist_bbox;
}
// rs
const GSVector4i hacked_scissor = m_channel_shuffle ? GSVector4i::cxpr(0, 0, 1024, 1024) : m_context->scissor.in;
const GSVector4i scissor(GSVector4i(GSVector4(rtscale) * GSVector4(hacked_scissor)).rintersect(GSVector4i::loadh(rtsize)));
@ -8125,13 +8535,6 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
m_conf.alpha_second_pass.enable = false;
}
if (m_conf.require_full_barrier && (g_gs_device->Features().texture_barrier || g_gs_device->Features().multidraw_fb_copy))
{
ComputeDrawlistGetSize(rt->m_scale);
m_conf.drawlist = &m_drawlist;
m_conf.drawlist_bbox = &m_drawlist_bbox;
}
if (!m_channel_shuffle_width)
g_gs_device->RenderHW(m_conf);
else
@ -9574,3 +9977,10 @@ std::size_t GSRendererHW::ComputeDrawlistGetSize(float scale)
}
return m_drawlist.size();
}
// Returns true only when IsCoverageAlpha() holds, the current primitive class is a line
// or a triangle, and the device reports the accurate_prims feature.
bool GSRendererHW::IsCoverageAlphaSupported()
{
	if (!IsCoverageAlpha())
		return false;

	const bool line_or_triangle =
		(m_vt.m_primclass == GS_LINE_CLASS) || (m_vt.m_primclass == GS_TRIANGLE_CLASS);
	if (!line_or_triangle)
		return false;

	return g_gs_device->Features().accurate_prims;
}

View File

@ -137,6 +137,21 @@ private:
bool IsUsingCsInBlend();
bool IsUsingAsInBlend();
void GetAccuratePrimsEdgeVertexAttributes(
const GSVertex& vtx0,
const GSVertex& vtx1,
const GSVertex* vtx_provoking,
AccuratePrimsEdgeData& data);
void ExpandAccurateTrianglesEdge(
const GSVertex& vtx0,
const GSVertex& vtx1,
const GSVertex* vtx_provoking,
const GSVector4i& edge0,
const GSVector4i& edge1,
bool top_left,
AccuratePrimsEdgeData& data,
GSVertex* vertex_out);
// We modify some of the context registers to optimize away unnecessary operations.
// Instead of messing with the real context, we copy them and use those instead.
struct HWCachedCtx
@ -205,6 +220,8 @@ private:
std::unique_ptr<GSTextureCacheSW::Texture> m_sw_texture[7 + 1];
std::unique_ptr<GSVirtualAlignedClass<32>> m_sw_rasterizer;
std::vector<AccuratePrimsEdgeData> m_accurate_prims_edge_data;
public:
GSRendererHW();
virtual ~GSRendererHW() override;
@ -221,6 +238,8 @@ public:
void Lines2Sprites();
bool VerifyIndices();
void ExpandLineIndices();
void ExpandAccurateLinesVertices();
void ExpandAccurateTrianglesVertices();
void ConvertSpriteTextureShuffle(u32& process_rg, u32& process_ba, bool& shuffle_across, GSTextureCache::Target* rt, GSTextureCache::Source* tex);
GSVector4 RealignTargetTextureCoordinate(const GSTextureCache::Source* tex);
GSVector4i ComputeBoundingBox(const GSVector2i& rtsize, float rtscale);
@ -273,4 +292,6 @@ public:
/// Compute the drawlist (if not already present) and bounding boxes for the current draw.
std::size_t ComputeDrawlistGetSize(float scale);
bool IsCoverageAlphaSupported() override;
};

View File

@ -94,6 +94,11 @@ struct GSMTLMainVSUniform
vector_float2 texture_offset;
vector_float2 point_size;
uint max_depth;
uint _pad0;
uint base_vertex;
uint _pad1;
uint _pad2;
uint _pad3;
};
struct GSMTLMainPSUniform
@ -134,6 +139,8 @@ struct GSMTLMainPSUniform
matrix_float4x4 dither_matrix;
vector_float4 scale_factor;
vector_uint4 accurate_prims_base_index;
};
enum GSMTLAttributes

View File

@ -310,10 +310,10 @@ namespace
};
} // namespace
std::unique_ptr<GLStreamBuffer> GLStreamBuffer::Create(GLenum target, u32 size)
std::unique_ptr<GLStreamBuffer> GLStreamBuffer::Create(GLenum target, u32 size, bool nonsyncing)
{
std::unique_ptr<GLStreamBuffer> buf;
if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage)
if (!nonsyncing && (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage))
{
buf = BufferStorageStreamBuffer::Create(target, size);
if (buf)

View File

@ -38,7 +38,7 @@ public:
/// Returns the minimum granularity of blocks which sync objects will be created around.
virtual u32 GetChunkSize() const = 0;
static std::unique_ptr<GLStreamBuffer> Create(GLenum target, u32 size);
static std::unique_ptr<GLStreamBuffer> Create(GLenum target, u32 size, bool nonsyncing = false);
protected:
GLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);

View File

@ -26,6 +26,7 @@ static constexpr u32 g_ps_cb_index = 0;
static constexpr u32 VERTEX_BUFFER_SIZE = 32 * 1024 * 1024;
static constexpr u32 INDEX_BUFFER_SIZE = 16 * 1024 * 1024;
static constexpr u32 ACCURATE_PRIMS_BUFFER_SIZE = 32 * 1024 * 1024;
static constexpr u32 VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024;
static constexpr u32 FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024;
static constexpr u32 TEXTURE_UPLOAD_BUFFER_SIZE = 128 * 1024 * 1024;
@ -258,10 +259,18 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
m_vertex_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE);
m_index_stream_buffer = GLStreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE);
if (m_features.accurate_prims)
{
// Performance note: prefer a non-syncing buffer for accurate prims so that it is more likely to be GPU local.
// Rationale: we expect this buffer to be updated relatively rarely and it's used as a pixel shader resource.
m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE, true);
}
m_vertex_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, VERTEX_UNIFORM_BUFFER_SIZE);
m_fragment_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, FRAGMENT_UNIFORM_BUFFER_SIZE);
glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &m_uniform_buffer_alignment);
if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer)
if (!m_vertex_stream_buffer || !m_index_stream_buffer ||
(m_features.accurate_prims && !m_accurate_prims_stream_buffer) ||
!m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer)
{
Host::ReportErrorAsync("GS", "Failed to create vertex/index/uniform streaming buffers");
return false;
@ -303,6 +312,11 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
glBufferData(GL_ELEMENT_ARRAY_BUFFER, EXPAND_BUFFER_SIZE, expand_data.get(), GL_STATIC_DRAW);
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, m_vertex_stream_buffer->GetGLBufferId(), 0, VERTEX_BUFFER_SIZE);
}
if (m_features.accurate_prims)
{
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 3, m_accurate_prims_stream_buffer->GetGLBufferId(), 0, ACCURATE_PRIMS_BUFFER_SIZE);
}
}
// ****************************************************************
@ -770,6 +784,8 @@ bool GSDeviceOGL::CheckFeatures()
m_features.line_expand ? "hardware" : (m_features.vs_expand ? "vertex expanding" : "UNSUPPORTED"),
m_features.vs_expand ? "vertex expanding" : "CPU");
m_features.accurate_prims = GSConfig.HWAccuratePrims;
return true;
}
@ -840,6 +856,7 @@ void GSDeviceOGL::DestroyResources()
m_fragment_uniform_stream_buffer.reset();
m_vertex_uniform_stream_buffer.reset();
m_accurate_prims_stream_buffer.reset();
glBindVertexArray(0);
if (m_expand_ibo != 0)
@ -1330,8 +1347,9 @@ std::string GSDeviceOGL::GetVSSource(VSSelector sel)
std::string macro = fmt::format("#define VS_FST {}\n", static_cast<u32>(sel.fst))
+ fmt::format("#define VS_IIP {}\n", static_cast<u32>(sel.iip))
+ fmt::format("#define VS_POINT_SIZE {}\n", static_cast<u32>(sel.point_size))
+ fmt::format("#define VS_EXPAND {}\n", static_cast<int>(sel.expand));
+ fmt::format("#define VS_EXPAND {}\n", static_cast<int>(sel.expand))
+ fmt::format("#define VS_ACCURATE_PRIMS {}\n", static_cast<int>(sel.accurate_prims))
;
std::string src = GenGlslHeader("vs_main", GL_VERTEX_SHADER, macro);
src += m_shader_tfx_vgs;
return src;
@ -1396,6 +1414,10 @@ std::string GSDeviceOGL::GetPSSource(const PSSelector& sel)
+ fmt::format("#define PS_SCANMSK {}\n", sel.scanmsk)
+ fmt::format("#define PS_NO_COLOR {}\n", sel.no_color)
+ fmt::format("#define PS_NO_COLOR1 {}\n", sel.no_color1)
+ fmt::format("#define PS_ACCURATE_PRIMS {}\n", sel.accurate_prims)
+ fmt::format("#define PS_ACCURATE_PRIMS_AA {}\n", sel.accurate_prims_aa)
+ fmt::format("#define PS_ACCURATE_PRIMS_AA_ABE {}\n", sel.accurate_prims_aa_abe)
+ fmt::format("#define PS_ZTST {}\n", sel.ztst)
;
std::string src = GenGlslHeader("ps_main", GL_FRAGMENT_SHADER, macro);
@ -2012,6 +2034,21 @@ void GSDeviceOGL::ClearSamplerCache()
}
}
// Copies the draw's accurate-prims edge data into m_accurate_prims_stream_buffer and
// records the base offsets in the draw's constant buffers:
//   cb_vs.base_vertex.x               <- m_vertex.start
//   cb_ps.accurate_prims_base_index.x <- aligned index returned by the buffer mapping
// Does nothing when config.accurate_prims is not set.
void GSDeviceOGL::SetupAccuratePrims(GSHWDrawConfig& config)
{
	if (!config.accurate_prims)
		return;

	const u32 count = config.accurate_prims_edge_data->size();
	const u32 size = count * sizeof(AccuratePrimsEdgeData);

	// Map with the record size as alignment so index_aligned counts whole records.
	auto mapping = m_accurate_prims_stream_buffer->Map(sizeof(AccuratePrimsEdgeData), size);
	std::memcpy(mapping.pointer, config.accurate_prims_edge_data->data(), size);
	m_accurate_prims_stream_buffer->Unmap(size);

	config.cb_vs.base_vertex.x = m_vertex.start;
	config.cb_ps.accurate_prims_base_index.x = mapping.index_aligned;
}
bool GSDeviceOGL::CreateCASPrograms()
{
std::optional<std::string> cas_source = ReadShaderSource("shaders/opengl/cas.glsl");
@ -2525,6 +2562,8 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config)
IASetVertexBuffer(config.verts, config.nverts, GetVertexAlignment(config.vs.expand));
m_vertex.start *= GetExpansionFactor(config.vs.expand);
SetupAccuratePrims(config);
if (config.vs.UseExpandIndexBuffer())
{
IASetVAO(m_expand_vao);
@ -2554,6 +2593,8 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config)
PSSetShaderResource(2, draw_rt_clone);
else if (config.require_one_barrier || config.require_full_barrier)
PSSetShaderResource(2, colclip_rt ? colclip_rt : config.rt);
if ((config.require_one_barrier || config.require_full_barrier) && config.ps.IsFeedbackLoopDepth())
PSSetShaderResource(4, config.ds);
SetupSampler(config.sampler);
@ -2761,7 +2802,7 @@ void GSDeviceOGL::SendHWDraw(const GSHWDrawConfig& config, bool one_barrier, boo
}
#ifdef PCSX2_DEVBUILD
if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]]
if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]]
Console.Warning("OpenGL: Possible unnecessary barrier detected.");
#endif

View File

@ -157,6 +157,7 @@ private:
std::unique_ptr<GLStreamBuffer> m_vertex_stream_buffer;
std::unique_ptr<GLStreamBuffer> m_index_stream_buffer;
std::unique_ptr<GLStreamBuffer> m_accurate_prims_stream_buffer;
GLuint m_expand_ibo = 0;
GLuint m_vao = 0;
GLuint m_expand_vao = 0;
@ -346,6 +347,7 @@ public:
void IASetPrimitiveTopology(GLenum topology);
void IASetVertexBuffer(const void* vertices, size_t count, size_t align_multiplier = 1);
void IASetIndexBuffer(const void* index, size_t count);
void SetupAccuratePrims(GSHWDrawConfig& config);
void PSSetShaderResource(int i, GSTexture* sr);
void PSSetSamplerState(GLuint ss);

View File

@ -82,6 +82,8 @@ protected:
template <u32 primclass>
void RewriteVerticesIfSTOverflow();
bool IsCoverageAlphaSupported() override { return true; }
public:
GSRendererSW(int threads);
~GSRendererSW() override;

View File

@ -41,6 +41,7 @@ enum : u32
VERTEX_BUFFER_SIZE = 32 * 1024 * 1024,
INDEX_BUFFER_SIZE = 16 * 1024 * 1024,
ACCURATE_PRIMS_BUFFER_SIZE = 32 * 1024 * 1024,
VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024,
FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024,
TEXTURE_BUFFER_SIZE = 64 * 1024 * 1024,
@ -932,7 +933,7 @@ bool GSDeviceVK::CreateGlobalDescriptorPool()
{
static constexpr const VkDescriptorPoolSize pool_sizes[] = {
{VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 2},
{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2},
{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3},
};
VkDescriptorPoolCreateInfo pool_create_info = {VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr,
@ -1501,12 +1502,13 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key)
VkAttachmentReference* color_reference_ptr = nullptr;
VkAttachmentReference depth_reference;
VkAttachmentReference* depth_reference_ptr = nullptr;
VkAttachmentReference input_reference;
VkAttachmentReference* input_reference_ptr = nullptr;
VkSubpassDependency subpass_dependency;
VkSubpassDependency* subpass_dependency_ptr = nullptr;
std::array<VkAttachmentReference, 2> input_reference;
u32 num_subpass_inputs = 0;
std::array<VkSubpassDependency, 2> subpass_dependency;
u32 num_subpass_dependencies = 0;
std::array<VkAttachmentDescription, 2> attachments;
u32 num_attachments = 0;
bool actual_color_feedback_loop = false;
if (key.color_format != VK_FORMAT_UNDEFINED)
{
const VkImageLayout layout =
@ -1522,28 +1524,32 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key)
if (key.color_feedback_loop)
{
actual_color_feedback_loop = true;
if (!UseFeedbackLoopLayout())
{
input_reference.attachment = num_attachments;
input_reference.layout = layout;
input_reference_ptr = &input_reference;
pxAssert(num_subpass_inputs == 0); // Must always have the color input first.
input_reference[num_subpass_inputs].attachment = num_attachments;
input_reference[num_subpass_inputs].layout = layout;
num_subpass_inputs++;
}
if (!m_features.framebuffer_fetch)
{
pxAssert(num_subpass_dependencies == 0); // Must always have the color input first.
// don't need the framebuffer-local dependency when we have rasterization order attachment access
subpass_dependency.srcSubpass = 0;
subpass_dependency.dstSubpass = 0;
subpass_dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
subpass_dependency.dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
subpass_dependency.srcAccessMask =
subpass_dependency[num_subpass_dependencies].srcSubpass = 0;
subpass_dependency[num_subpass_dependencies].dstSubpass = 0;
subpass_dependency[num_subpass_dependencies].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
subpass_dependency[num_subpass_dependencies].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
subpass_dependency[num_subpass_dependencies].srcAccessMask =
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
subpass_dependency.dstAccessMask =
subpass_dependency[num_subpass_dependencies].dstAccessMask =
UseFeedbackLoopLayout() ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
subpass_dependency.dependencyFlags =
subpass_dependency[num_subpass_dependencies].dependencyFlags =
UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) :
VK_DEPENDENCY_BY_REGION_BIT;
subpass_dependency_ptr = &subpass_dependency;
num_subpass_dependencies++;
}
}
@ -1562,6 +1568,41 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key)
depth_reference.attachment = num_attachments;
depth_reference.layout = layout;
depth_reference_ptr = &depth_reference;
if (actual_color_feedback_loop && key.depth_sampling)
{
// Note: We only allow depth to be bound in a feedback loop if color is already bound as such.
// This is partly because it doesn't seem likely that we will ever need a depth feedback loop
// without a color feedback loop and to simplify the indices for subpass inputs (0 for color; 1 for depth);
if (!UseFeedbackLoopLayout())
{
pxAssert(num_subpass_inputs == 1); // Must always have the color input first.
input_reference[num_subpass_inputs].attachment = num_attachments;
input_reference[num_subpass_inputs].layout = layout;
num_subpass_inputs++;
}
if (!m_features.framebuffer_fetch)
{
pxAssert(num_subpass_dependencies == 1); // Must always have the color input first.
// don't need the framebuffer-local dependency when we have rasterization order attachment access
subpass_dependency[num_subpass_dependencies].srcSubpass = 0;
subpass_dependency[num_subpass_dependencies].dstSubpass = 0;
subpass_dependency[num_subpass_dependencies].srcStageMask =
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
subpass_dependency[num_subpass_dependencies].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
subpass_dependency[num_subpass_dependencies].srcAccessMask =
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
subpass_dependency[num_subpass_dependencies].dstAccessMask =
UseFeedbackLoopLayout() ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
subpass_dependency[num_subpass_dependencies].dependencyFlags =
UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) :
VK_DEPENDENCY_BY_REGION_BIT;
num_subpass_dependencies++;
}
}
num_attachments++;
}
@ -1569,11 +1610,11 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key)
(key.color_feedback_loop && m_optional_extensions.vk_ext_rasterization_order_attachment_access) ?
VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT :
0;
const VkSubpassDescription subpass = {subpass_flags, VK_PIPELINE_BIND_POINT_GRAPHICS, input_reference_ptr ? 1u : 0u,
input_reference_ptr ? input_reference_ptr : nullptr, color_reference_ptr ? 1u : 0u,
const VkSubpassDescription subpass = {subpass_flags, VK_PIPELINE_BIND_POINT_GRAPHICS, num_subpass_inputs,
num_subpass_inputs ? input_reference.data() : nullptr, color_reference_ptr ? 1u : 0u,
color_reference_ptr ? color_reference_ptr : nullptr, nullptr, depth_reference_ptr, 0, nullptr};
const VkRenderPassCreateInfo pass_info = {VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, nullptr, 0u, num_attachments,
attachments.data(), 1u, &subpass, subpass_dependency_ptr ? 1u : 0u, subpass_dependency_ptr};
attachments.data(), 1u, &subpass, num_subpass_dependencies, num_subpass_dependencies ? subpass_dependency.data() : nullptr};
VkRenderPass pass;
const VkResult res = vkCreateRenderPass(m_device, &pass_info, nullptr, &pass);
@ -2679,6 +2720,8 @@ bool GSDeviceVK::CheckFeatures()
m_max_texture_size = m_device_properties.limits.maxImageDimension2D;
m_features.accurate_prims = GSConfig.HWAccuratePrims;
return true;
}
@ -3363,6 +3406,135 @@ void GSDeviceVK::IASetIndexBuffer(const void* index, size_t count)
SetIndexBuffer(m_index_stream_buffer.GetBuffer());
}
/// Uploads this draw's accurate-prims edge data into the accurate-prims stream buffer.
///
/// Does nothing unless config.accurate_prims is set and there is edge data to upload.
/// The data is staged through an upload buffer and copied with vkCmdCopyBuffer(), so any
/// active render pass is ended first. On success the offset of the uploaded region is stored
/// in m_accurate_prims_stream_buffer_offset, which SetupAccuratePrimsConstants() reads later.
void GSDeviceVK::SetupAccuratePrimsBuffer(GSHWDrawConfig& config)
{
	if (!config.accurate_prims)
		return;

	const u32 count = static_cast<u32>(config.accurate_prims_edge_data->size());
	const u32 size = count * static_cast<u32>(sizeof(AccuratePrimsEdgeData));

	// Nothing to upload. Vulkan requires VkBufferCopy::size to be greater than zero,
	// so bail out before reserving memory or recording any commands.
	if (size == 0)
		return;

	// Reserve the GPU region, executing the command buffer once if the first attempt fails.
	if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData)))
	{
		ExecuteCommandBufferAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer");
		if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData)))
			pxFailRel("Failed to reserve space for accurate prims");
	}
	const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset();

	// The buffer copy below is recorded outside of a render pass.
	if (InRenderPass())
		EndRenderPass();

	// Copy data to an upload buffer.
	VkBuffer upload_buffer;
	u32 upload_buffer_offset;
	const auto upload_data = [&](void* map_ptr) {
		std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size);
	};

	// If the edge data is larger than half the texture streaming buffer, use a separate staging buffer.
	// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
	if (size > m_texture_stream_buffer.GetCurrentSize() / 2)
	{
		upload_buffer_offset = 0;
		upload_buffer = AllocateUploadStagingBuffer(size, upload_data);
	}
	else
	{
		upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset);
	}

	if (upload_buffer == VK_NULL_HANDLE)
	{
		Console.Error("Failed to get upload buffer for accurate prims data.");
		return;
	}

	// Copy data from upload to GPU buffer.
	const VkBufferCopy copy_region = {upload_buffer_offset, offset, size};
	vkCmdCopyBuffer(GetCurrentCommandBuffer(), upload_buffer, m_accurate_prims_stream_buffer.GetBuffer(), 1, &copy_region);

	// Commit the GPU region.
	m_accurate_prims_stream_buffer.CommitMemory(size);

	// Issue the barrier since this will be used next draw.
	const VkBufferMemoryBarrier barrier = {
		VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, nullptr,
		VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
		VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED,
		m_accurate_prims_stream_buffer.GetBuffer(), offset, size};
	vkCmdPipelineBarrier(GetCurrentCommandBuffer(),
		VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
		0, 0, nullptr, 1, &barrier, 0, nullptr);

	m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer.
}
/// Writes the per-draw accurate-prims values into the VS/PS constant blocks of `config` and
/// hands both blocks to SetVSConstantBuffer()/SetPSConstantBuffer().
/// Does nothing unless config.accurate_prims is set.
void GSDeviceVK::SetupAccuratePrimsConstants(GSHWDrawConfig& config)
{
	if (!config.accurate_prims)
		return;

	// This is kept apart from SetupAccuratePrimsBuffer(): the values below depend on
	// m_vertex.start and on the stream buffer offset saved by the buffer upload.

	// Index of the first edge-data element for this draw inside the accurate prims buffer.
	config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData);

	// First vertex of this draw in the vertex stream.
	config.cb_vs.base_vertex = m_vertex.start;

	SetVSConstantBuffer(config.cb_vs);
	SetPSConstantBuffer(config.cb_ps);
}
/// Stages `size` bytes in the texture upload stream buffer.
///
/// @param size        Number of bytes to reserve and commit.
/// @param write_data  Called once with the host pointer of the reserved region; it should fill it.
/// @param offset_out  Receives the offset of the reserved region within the returned buffer.
/// @return The stream buffer's VkBuffer, or VK_NULL_HANDLE (offset_out untouched) if the space
///         could not be reserved even after ExecuteCommandBuffer().
VkBuffer GSDeviceVK::WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out)
{
	const auto try_reserve = [&]() {
		return m_texture_stream_buffer.ReserveMemory(size, GetBufferCopyOffsetAlignment());
	};

	if (!try_reserve())
	{
		// Out of space: execute the current command buffer, then retry once.
		ExecuteCommandBuffer(false, "While waiting for %u bytes in texture upload buffer", size);

		if (!try_reserve())
		{
			Console.Error("Failed to reserve texture upload memory (%u bytes).", size);
			return VK_NULL_HANDLE;
		}
	}

	// Report where the data lands, let the caller fill it, then commit the region.
	offset_out = m_texture_stream_buffer.GetCurrentOffset();
	write_data(m_texture_stream_buffer.GetCurrentHostPointer());
	m_texture_stream_buffer.CommitMemory(size);

	return m_texture_stream_buffer.GetBuffer();
}
/// Creates a one-off, host-mapped transfer-source buffer of `size` bytes and fills it.
///
/// @param size        Size of the buffer in bytes.
/// @param write_data  Called once with the mapped pointer of the new buffer; it should fill it.
/// @return The new buffer, already queued for deferred destruction, or VK_NULL_HANDLE if
///         vmaCreateBuffer() failed.
VkBuffer GSDeviceVK::AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data)
{
	GSDeviceVK* const dev = GSDeviceVK::GetInstance();

	VkBufferCreateInfo buffer_info = {};
	buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
	buffer_info.pNext = nullptr;
	buffer_info.flags = 0;
	buffer_info.size = static_cast<VkDeviceSize>(size);
	buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
	buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
	buffer_info.queueFamilyIndexCount = 0;
	buffer_info.pQueueFamilyIndices = nullptr;

	// The coherent bit is intentionally not requested here. It was set in StreamBuffer mainly
	// for MoltenVK, which would upload the whole buffer on smaller uploads; this buffer is
	// written in full anyway.
	VmaAllocationCreateInfo alloc_create_info = {};
	alloc_create_info.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
	alloc_create_info.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;

	VmaAllocationInfo alloc_info;
	VkBuffer buffer;
	VmaAllocation allocation;
	const VkResult res =
		vmaCreateBuffer(dev->GetAllocator(), &buffer_info, &alloc_create_info, &buffer, &allocation, &alloc_info);
	if (res != VK_SUCCESS)
	{
		LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: ");
		return VK_NULL_HANDLE;
	}

	// Only needed for the copy, so queue it for freeing once the command buffer finishes.
	dev->DeferBufferDestruction(buffer, allocation);

	// Fill the mapped memory and flush the written range.
	write_data(alloc_info.pMappedData);
	vmaFlushAllocation(dev->GetAllocator(), allocation, 0, size);

	return buffer;
}
void GSDeviceVK::OMSetRenderTargets(
GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop)
{
@ -3379,12 +3551,15 @@ void GSDeviceVK::OMSetRenderTargets(
if (vkRt)
{
m_current_framebuffer =
vkRt->GetLinkedFramebuffer(vkDs, (feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) != 0);
vkRt->GetLinkedFramebuffer(vkDs,
(feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) != 0,
(feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth) != 0);
}
else
{
pxAssert(!(feedback_loop & FeedbackLoopFlag_ReadAndWriteRT));
m_current_framebuffer = vkDs->GetLinkedFramebuffer(nullptr, false);
pxAssert(!(feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) &&
!(feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth));
m_current_framebuffer = vkDs->GetLinkedFramebuffer(nullptr, false, false);
}
}
else if (InRenderPass())
@ -3494,7 +3669,21 @@ void GSDeviceVK::OMSetRenderTargets(
if (vkDs)
{
// need to update descriptors to reflect the new layout
if (feedback_loop & FeedbackLoopFlag_ReadDS)
if (feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth)
{
// NVIDIA drivers appear to return random garbage when sampling the RT via a feedback loop, if the load op for
// the render pass is CLEAR. Using vkCmdClearAttachments() doesn't work, so we have to clear the image instead.
// Note: DS feedback loop was added later - we will assume that the same issue is relevant.
if (vkDs->GetState() == GSTexture::State::Cleared && IsDeviceNVIDIA())
vkDs->CommitClear();
if (vkDs->GetLayout() != GSTextureVK::Layout::FeedbackLoop)
{
m_dirty_flags |= (DIRTY_FLAG_TFX_TEXTURE_0 << TFX_TEXTURE_DEPTH);
vkDs->TransitionToLayout(GSTextureVK::Layout::FeedbackLoop);
}
}
else if (feedback_loop & FeedbackLoopFlag_ReadDepth)
{
if (vkDs->GetLayout() != GSTextureVK::Layout::FeedbackLoop)
{
@ -3675,6 +3864,16 @@ bool GSDeviceVK::CreateBuffers()
return false;
}
if (m_features.accurate_prims)
{
if (!m_accurate_prims_stream_buffer.Create(
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, ACCURATE_PRIMS_BUFFER_SIZE, true))
{
Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer");
return false;
}
}
if (!m_vertex_uniform_stream_buffer.Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VERTEX_UNIFORM_BUFFER_SIZE))
{
Host::ReportErrorAsync("GS", "Failed to allocate vertex uniform buffer");
@ -3734,6 +3933,8 @@ bool GSDeviceVK::CreatePipelineLayouts()
dslb.AddBinding(1, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
if (m_features.vs_expand)
dslb.AddBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_VERTEX_BIT);
if (m_features.accurate_prims)
dslb.AddBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
if ((m_tfx_ubo_ds_layout = dslb.Create(dev)) == VK_NULL_HANDLE)
return false;
Vulkan::SetObjectName(dev, m_tfx_ubo_ds_layout, "TFX UBO descriptor layout");
@ -3746,6 +3947,10 @@ bool GSDeviceVK::CreatePipelineLayouts()
VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
1, VK_SHADER_STAGE_FRAGMENT_BIT);
dslb.AddBinding(TFX_TEXTURE_PRIMID, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
dslb.AddBinding(TFX_TEXTURE_DEPTH,
(m_features.texture_barrier && !UseFeedbackLoopLayout()) ? VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT :
VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
1, VK_SHADER_STAGE_FRAGMENT_BIT);
if ((m_tfx_texture_ds_layout = dslb.Create(dev)) == VK_NULL_HANDLE)
return false;
Vulkan::SetObjectName(dev, m_tfx_texture_ds_layout, "TFX texture descriptor layout");
@ -4603,6 +4808,7 @@ void GSDeviceVK::DestroyResources()
m_fragment_uniform_stream_buffer.Destroy(false);
m_vertex_uniform_stream_buffer.Destroy(false);
m_index_stream_buffer.Destroy(false);
m_accurate_prims_stream_buffer.Destroy(false);
m_vertex_stream_buffer.Destroy(false);
if (m_expand_index_buffer != VK_NULL_HANDLE)
vmaDestroyBuffer(m_allocator, m_expand_index_buffer, m_expand_index_buffer_allocation);
@ -4670,6 +4876,7 @@ VkShaderModule GSDeviceVK::GetTFXVertexShader(GSHWDrawConfig::VSSelector sel)
AddMacro(ss, "VS_POINT_SIZE", sel.point_size);
AddMacro(ss, "VS_EXPAND", static_cast<int>(sel.expand));
AddMacro(ss, "VS_PROVOKING_VERTEX_LAST", static_cast<int>(m_features.provoking_vertex_last));
AddMacro(ss, "VS_ACCURATE_PRIMS", static_cast<int>(sel.accurate_prims));
ss << m_tfx_source;
VkShaderModule mod = g_vulkan_shader_cache->GetVertexShader(ss.str());
@ -4744,6 +4951,10 @@ VkShaderModule GSDeviceVK::GetTFXFragmentShader(const GSHWDrawConfig::PSSelector
AddMacro(ss, "PS_TEX_IS_FB", sel.tex_is_fb);
AddMacro(ss, "PS_NO_COLOR", sel.no_color);
AddMacro(ss, "PS_NO_COLOR1", sel.no_color1);
AddMacro(ss, "PS_ACCURATE_PRIMS", sel.accurate_prims);
AddMacro(ss, "PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa);
AddMacro(ss, "PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe);
AddMacro(ss, "PS_ZTST", sel.ztst);
ss << m_tfx_source;
VkShaderModule mod = g_vulkan_shader_cache->GetFragmentShader(ss.str());
@ -4945,6 +5156,11 @@ bool GSDeviceVK::CreatePersistentDescriptorSets()
dsub.AddBufferDescriptorWrite(m_tfx_ubo_descriptor_set, 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
m_vertex_stream_buffer.GetBuffer(), 0, VERTEX_BUFFER_SIZE);
}
if (m_features.accurate_prims)
{
dsub.AddBufferDescriptorWrite(m_tfx_ubo_descriptor_set, 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
m_accurate_prims_stream_buffer.GetBuffer(), 0, ACCURATE_PRIMS_BUFFER_SIZE);
}
dsub.Update(dev);
Vulkan::SetObjectName(dev, m_tfx_ubo_descriptor_set, "Persistent TFX UBO set");
return true;
@ -5341,11 +5557,15 @@ bool GSDeviceVK::ApplyTFXState(bool already_execed)
m_current_pipeline_layout = PipelineLayout::TFX;
flags |= DIRTY_FLAG_TFX_UBO | DIRTY_FLAG_TFX_TEXTURES;
// Clear out the RT binding if feedback loop isn't on, because it'll be in the wrong state and make
// Clear out the RT/DS binding if feedback loop isn't on, because it'll be in the wrong state and make
// the validation layer cranky. Not a big deal since we need to write it anyway.
const GSTextureVK::Layout rt_tex_layout = m_tfx_textures[TFX_TEXTURE_RT]->GetLayout();
if (rt_tex_layout != GSTextureVK::Layout::FeedbackLoop && rt_tex_layout != GSTextureVK::Layout::ShaderReadOnly)
m_tfx_textures[TFX_TEXTURE_RT] = m_null_texture.get();
std::array<TFX_TEXTURES, 2> texture_types = { TFX_TEXTURE_RT, TFX_TEXTURE_DEPTH };
for (u32 texture_type : texture_types)
{
const GSTextureVK::Layout tex_layout = m_tfx_textures[texture_type]->GetLayout();
if (tex_layout != GSTextureVK::Layout::FeedbackLoop && tex_layout != GSTextureVK::Layout::ShaderReadOnly)
m_tfx_textures[texture_type] = m_null_texture.get();
}
}
if (flags & DIRTY_FLAG_TFX_UBO)
@ -5386,6 +5606,19 @@ bool GSDeviceVK::ApplyTFXState(bool already_execed)
dsub.AddImageDescriptorWrite(VK_NULL_HANDLE, TFX_TEXTURE_PRIMID,
m_tfx_textures[TFX_TEXTURE_PRIMID]->GetView(), m_tfx_textures[TFX_TEXTURE_PRIMID]->GetVkLayout());
}
if (flags & DIRTY_FLAG_TFX_TEXTURE_DEPTH)
{
if (m_features.texture_barrier && !UseFeedbackLoopLayout())
{
dsub.AddInputAttachmentDescriptorWrite(
VK_NULL_HANDLE, TFX_TEXTURE_DEPTH, m_tfx_textures[TFX_TEXTURE_DEPTH]->GetView(), VK_IMAGE_LAYOUT_GENERAL);
}
else
{
dsub.AddImageDescriptorWrite(VK_NULL_HANDLE, TFX_TEXTURE_DEPTH, m_tfx_textures[TFX_TEXTURE_DEPTH]->GetView(),
m_tfx_textures[TFX_TEXTURE_DEPTH]->GetVkLayout());
}
}
dsub.PushUpdate(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, m_tfx_pipeline_layout, TFX_DESCRIPTOR_SET_TEXTURES);
}
@ -5545,13 +5778,15 @@ GSTextureVK* GSDeviceVK::SetupPrimitiveTrackingDATE(GSHWDrawConfig& config)
void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
{
const GSVector2i rtsize(config.rt ? config.rt->GetSize() : config.ds->GetSize());
GSTextureVK* draw_rt = static_cast<GSTextureVK*>(config.rt);
GSTextureVK* draw_ds = static_cast<GSTextureVK*>(config.ds);
GSTextureVK* draw_rt_clone = nullptr;
GSTextureVK* colclip_rt = static_cast<GSTextureVK*>(g_gs_device->GetColorClipTexture());
// Copying buffers needs to be done outside of a render pass, so do this early.
SetupAccuratePrimsBuffer(config);
// stream buffer in first, in case we need to exec
SetVSConstantBuffer(config.cb_vs);
SetPSConstantBuffer(config.cb_ps);
@ -5597,8 +5832,12 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
UpdateHWPipelineSelector(config, pipe);
// If we don't have a barrier but the texture was drawn to last draw, end the pass to insert a barrier.
if (InRenderPass() && !pipe.IsRTFeedbackLoop() && (config.tex == m_current_render_target || config.tex == m_current_depth_target))
if (InRenderPass())
{
if ((!pipe.IsRTFeedbackLoop() && config.tex == m_current_render_target) ||
(!pipe.IsDepthFeedbackLoop() && config.tex == m_current_depth_target))
EndRenderPass();
}
// now blit the colclip texture back to the original target
if (colclip_rt)
@ -5781,20 +6020,31 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
// Despite the layout changing enforcing the execution dependency between previous draws and the first
// input attachment read, it still wants the region/fragment-local barrier...
const bool skip_first_barrier =
(draw_rt && draw_rt->GetLayout() != GSTextureVK::Layout::FeedbackLoop && !pipe.ps.colclip_hw && !IsDeviceAMD());
bool skip_first_barrier = !pipe.ps.colclip_hw && !IsDeviceAMD();
if (draw_rt)
skip_first_barrier = skip_first_barrier && draw_rt->GetLayout() != GSTextureVK::Layout::FeedbackLoop;
if (draw_ds)
skip_first_barrier = skip_first_barrier && draw_ds->GetLayout() != GSTextureVK::Layout::FeedbackLoop;
OMSetRenderTargets(draw_rt, draw_ds, config.scissor, static_cast<FeedbackLoopFlag>(pipe.feedback_loop_flags));
if (pipe.IsRTFeedbackLoop())
{
pxAssertMsg(m_features.texture_barrier, "Texture barriers enabled");
PSSetShaderResource(2, draw_rt, false);
PSSetShaderResource(TFX_TEXTURE_RT, draw_rt, false);
// If this is the first draw to the target as a feedback loop, make sure we re-generate the texture descriptor.
// Otherwise, we might have a previous descriptor left over, that has the RT in a different state.
m_dirty_flags |= (skip_first_barrier ? static_cast<u32>(DIRTY_FLAG_TFX_TEXTURE_RT) : 0);
}
if (pipe.IsDepthFeedbackLoop())
{
pxAssertMsg(m_features.texture_barrier, "Texture barriers enabled");
PSSetShaderResource(TFX_TEXTURE_DEPTH, draw_ds, false);
// If this is the first draw to the target as a feedback loop, make sure we re-generate the texture descriptor.
// Otherwise, we might have a previous descriptor left over, that has the DS in a different state.
m_dirty_flags |= (skip_first_barrier ? static_cast<u32>(DIRTY_FLAG_TFX_TEXTURE_DEPTH) : 0);
}
// Begin render pass if new target or out of the area.
if (!InRenderPass())
{
@ -5868,7 +6118,8 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
// now we can do the actual draw
if (BindDrawPipeline(pipe))
SendHWDraw(config, draw_rt, config.require_one_barrier, config.require_full_barrier, skip_first_barrier);
SendHWDraw(config, draw_rt, pipe.IsDepthFeedbackLoop() ? draw_ds : nullptr,
config.require_one_barrier, config.require_full_barrier, skip_first_barrier);
// blend second pass
if (config.blend_multi_pass.enable)
@ -5903,8 +6154,8 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
pipe.bs = config.blend;
if (BindDrawPipeline(pipe))
{
SendHWDraw(config, draw_rt, config.alpha_second_pass.require_one_barrier,
config.alpha_second_pass.require_full_barrier, false);
SendHWDraw(config, draw_rt, pipe.IsDepthFeedbackLoop() ? draw_ds : nullptr,
config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, false);
}
}
@ -5981,19 +6232,24 @@ void GSDeviceVK::UpdateHWPipelineSelector(GSHWDrawConfig& config, PipelineSelect
pipe.rt = config.rt != nullptr;
pipe.ds = config.ds != nullptr;
pipe.line_width = config.line_expand;
pipe.feedback_loop_flags =
(m_features.texture_barrier &&
(config.ps.IsFeedbackLoop() || config.require_one_barrier || config.require_full_barrier)) ?
FeedbackLoopFlag_ReadAndWriteRT :
FeedbackLoopFlag_None;
pipe.feedback_loop_flags |=
(config.tex && config.tex == config.ds) ? FeedbackLoopFlag_ReadDS : FeedbackLoopFlag_None;
pipe.feedback_loop_flags = FeedbackLoopFlag_None;
if (m_features.texture_barrier && (config.ps.IsFeedbackLoop() || config.require_one_barrier || config.require_full_barrier))
{
pipe.feedback_loop_flags |= FeedbackLoopFlag_ReadAndWriteRT;
// We only allow DS feedback loop if RT is already in a feedback loop.
pipe.feedback_loop_flags |= (pipe.ds && config.ps.IsFeedbackLoopDepth()) ? FeedbackLoopFlag_ReadAndWriteDepth : FeedbackLoopFlag_None;
}
if (!(pipe.feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteDepth))
{
pipe.feedback_loop_flags |= (config.tex && config.tex == config.ds) ? FeedbackLoopFlag_ReadDepth : FeedbackLoopFlag_None;
}
// enable point size in the vertex shader if we're rendering points regardless of upscaling.
pipe.vs.point_size |= (config.topology == GSHWDrawConfig::Topology::Point);
}
void GSDeviceVK::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config)
void GSDeviceVK::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config)
{
IASetVertexBuffer(config.verts, sizeof(GSVertex), config.nverts, GetVertexAlignment(config.vs.expand));
m_vertex.start *= GetExpansionFactor(config.vs.expand);
@ -6008,6 +6264,9 @@ void GSDeviceVK::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config)
{
IASetIndexBuffer(config.indices, config.nindices);
}
// Needs to be done after vertex offset is set.
SetupAccuratePrimsConstants(config);
}
VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const
@ -6021,13 +6280,31 @@ VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const
VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, rt->GetImage(), {VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u}};
}
/// Builds the image memory barrier used between depth/stencil attachment access and a
/// subsequent shader read of the same image `ds`.
/// The image layout is not changed (old and new layout are identical); both the layout and the
/// destination access mask depend on UseFeedbackLoopLayout().
VkImageMemoryBarrier GSDeviceVK::GetDepthStencilBufferBarrier(GSTextureVK* ds) const
{
	const bool feedback_layout = UseFeedbackLoopLayout();
	const VkImageLayout layout =
		feedback_layout ? VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT : VK_IMAGE_LAYOUT_GENERAL;

	VkImageMemoryBarrier barrier = {};
	barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
	barrier.pNext = nullptr;
	barrier.srcAccessMask =
		VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
	barrier.dstAccessMask = feedback_layout ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
	barrier.oldLayout = layout;
	barrier.newLayout = layout;
	barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
	barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
	barrier.image = ds->GetImage();

	// Both aspects, first mip level, first array layer.
	barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
	barrier.subresourceRange.baseMipLevel = 0u;
	barrier.subresourceRange.levelCount = 1u;
	barrier.subresourceRange.baseArrayLayer = 0u;
	barrier.subresourceRange.layerCount = 1u;

	return barrier;
}
/// Dependency flags for color buffer barriers: always by-region, plus the feedback-loop flag
/// when UseFeedbackLoopLayout() is true.
VkDependencyFlags GSDeviceVK::GetColorBufferBarrierFlags() const
{
	if (UseFeedbackLoopLayout())
		return VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT;

	return VK_DEPENDENCY_BY_REGION_BIT;
}
void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,
/// Dependency flags for depth/stencil buffer barriers: always by-region, plus the feedback-loop
/// flag when UseFeedbackLoopLayout() is true.
VkDependencyFlags GSDeviceVK::GetDepthStencilBufferBarrierFlags() const
{
	VkDependencyFlags flags = VK_DEPENDENCY_BY_REGION_BIT;
	if (UseFeedbackLoopLayout())
		flags |= VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT;

	return flags;
}
void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, GSTextureVK* draw_ds,
bool one_barrier, bool full_barrier, bool skip_first_barrier)
{
if (!m_features.texture_barrier) [[unlikely]]
@ -6037,21 +6314,48 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,
}
#ifdef PCSX2_DEVBUILD
if ((one_barrier || full_barrier) && !m_pipeline_selector.ps.IsFeedbackLoop()) [[unlikely]]
if ((one_barrier || full_barrier) && !(m_pipeline_selector.ps.IsFeedbackLoop() || m_pipeline_selector.ps.IsFeedbackLoopDepth())) [[unlikely]]
Console.Warning("VK: Possible unnecessary barrier detected.");
#endif
const VkDependencyFlags barrier_flags = GetColorBufferBarrierFlags();
std::array<VkDependencyFlags, 2> barrier_flags = {
GetColorBufferBarrierFlags(),
GetDepthStencilBufferBarrierFlags(),
};
std::array<VkImageMemoryBarrier, 2> barrier;
u32 barriers_per_draw = 0;
if (full_barrier || one_barrier)
{
if (draw_rt)
barrier[barriers_per_draw++] = GetColorBufferBarrier(draw_rt);
if (draw_ds)
barrier[barriers_per_draw++] = GetDepthStencilBufferBarrier(draw_ds);
}
const auto IssueBarriers = [&]() {
if (draw_rt)
{
vkCmdPipelineBarrier(GetCurrentCommandBuffer(),
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags[0], 0, nullptr, 0, nullptr, 1, &barrier[0]);
}
if (draw_ds)
{
vkCmdPipelineBarrier(GetCurrentCommandBuffer(),
VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags[1], 0, nullptr, 0, nullptr, 1, &barrier[1]);
}
};
if (full_barrier)
{
pxAssert(config.drawlist && !config.drawlist->empty());
const VkImageMemoryBarrier barrier = GetColorBufferBarrier(draw_rt);
const u32 indices_per_prim = config.indices_per_prim;
const u32 draw_list_size = static_cast<u32>(config.drawlist->size());
GL_PUSH("Split the draw");
g_perfmon.Put(
GSPerfMon::Barriers, static_cast<u32>(draw_list_size) - static_cast<u32>(skip_first_barrier));
g_perfmon.Put(GSPerfMon::Barriers,
barriers_per_draw * (static_cast<u32>(draw_list_size) - static_cast<u32>(skip_first_barrier)));
u32 p = 0;
u32 n = 0;
@ -6066,8 +6370,7 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,
for (; n < draw_list_size; n++)
{
vkCmdPipelineBarrier(GetCurrentCommandBuffer(), VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags, 0, nullptr, 0, nullptr, 1, &barrier);
IssueBarriers();
const u32 count = (*config.drawlist)[n] * indices_per_prim;
DrawIndexedPrimitive(p, count);
@ -6079,11 +6382,8 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,
if (one_barrier && !skip_first_barrier)
{
g_perfmon.Put(GSPerfMon::Barriers, 1);
const VkImageMemoryBarrier barrier = GetColorBufferBarrier(draw_rt);
vkCmdPipelineBarrier(GetCurrentCommandBuffer(), VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags, 0, nullptr, 0, nullptr, 1, &barrier);
g_perfmon.Put(GSPerfMon::Barriers, barriers_per_draw);
IssueBarriers();
}
DrawIndexedPrimitive();

View File

@ -98,6 +98,8 @@ public:
__fi VkCommandBuffer GetCurrentCommandBuffer() const { return m_current_command_buffer; }
__fi VKStreamBuffer& GetTextureUploadBuffer() { return m_texture_stream_buffer; }
VkCommandBuffer GetCurrentInitCommandBuffer();
VkBuffer AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data);
VkBuffer WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out);
/// Allocates a descriptor set from the pool reserved for the current frame.
VkDescriptorSet AllocatePersistentDescriptorSet(VkDescriptorSetLayout set_layout);
@ -293,7 +295,8 @@ public:
{
FeedbackLoopFlag_None = 0,
FeedbackLoopFlag_ReadAndWriteRT = 1,
FeedbackLoopFlag_ReadDS = 2,
FeedbackLoopFlag_ReadDepth = 2,
FeedbackLoopFlag_ReadAndWriteDepth = 4,
};
struct alignas(8) PipelineSelector
@ -308,7 +311,7 @@ public:
u32 rt : 1;
u32 ds : 1;
u32 line_width : 1;
u32 feedback_loop_flags : 2;
u32 feedback_loop_flags : 3;
};
u32 key;
@ -326,7 +329,8 @@ public:
__fi PipelineSelector() { std::memset(this, 0, sizeof(*this)); }
__fi bool IsRTFeedbackLoop() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteRT) != 0); }
__fi bool IsTestingAndSamplingDepth() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadDS) != 0); }
__fi bool IsDepthFeedbackLoop() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteDepth) != 0); }
__fi bool IsTestingAndSamplingDepth() const { return ((feedback_loop_flags & (FeedbackLoopFlag_ReadDepth | FeedbackLoopFlag_ReadAndWriteDepth)) != 0); }
};
static_assert(sizeof(PipelineSelector) == 24, "Pipeline selector is 24 bytes");
@ -357,10 +361,11 @@ public:
};
enum TFX_TEXTURES : u32
{
TFX_TEXTURE_TEXTURE,
TFX_TEXTURE_TEXTURE = 0,
TFX_TEXTURE_PALETTE,
TFX_TEXTURE_RT,
TFX_TEXTURE_PRIMID,
TFX_TEXTURE_DEPTH,
NUM_TFX_TEXTURES
};
@ -377,6 +382,8 @@ private:
VKStreamBuffer m_vertex_stream_buffer;
VKStreamBuffer m_index_stream_buffer;
VKStreamBuffer m_accurate_prims_stream_buffer;
u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw.
VKStreamBuffer m_vertex_uniform_stream_buffer;
VKStreamBuffer m_fragment_uniform_stream_buffer;
VKStreamBuffer m_texture_stream_buffer;
@ -559,6 +566,9 @@ public:
void PSSetShaderResource(int i, GSTexture* sr, bool check_state);
void PSSetSampler(GSHWDrawConfig::SamplerSelector sel);
void SetupAccuratePrimsBuffer(GSHWDrawConfig& config);
void SetupAccuratePrimsConstants(GSHWDrawConfig& config);
void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor,
FeedbackLoopFlag feedback_loop = FeedbackLoopFlag_None);
@ -568,10 +578,12 @@ public:
void RenderHW(GSHWDrawConfig& config) override;
void UpdateHWPipelineSelector(GSHWDrawConfig& config, PipelineSelector& pipe);
void UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config);
void UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config);
VkImageMemoryBarrier GetColorBufferBarrier(GSTextureVK* rt) const;
VkDependencyFlags GetColorBufferBarrierFlags() const;
void SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,
VkImageMemoryBarrier GetDepthStencilBufferBarrier(GSTextureVK* ds) const;
VkDependencyFlags GetDepthStencilBufferBarrierFlags() const;
void SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, GSTextureVK* draw_ds,
bool one_barrier, bool full_barrier, bool skip_first_barrier);
//////////////////////////////////////////////////////////////////////////
@ -621,25 +633,27 @@ public:
private:
enum DIRTY_FLAG : u32
{
DIRTY_FLAG_TFX_TEXTURE_0 = (1 << 0), // 0, 1, 2, 3
DIRTY_FLAG_TFX_UBO = (1 << 4),
DIRTY_FLAG_UTILITY_TEXTURE = (1 << 5),
DIRTY_FLAG_BLEND_CONSTANTS = (1 << 6),
DIRTY_FLAG_LINE_WIDTH = (1 << 7),
DIRTY_FLAG_INDEX_BUFFER = (1 << 8),
DIRTY_FLAG_VIEWPORT = (1 << 9),
DIRTY_FLAG_SCISSOR = (1 << 10),
DIRTY_FLAG_PIPELINE = (1 << 11),
DIRTY_FLAG_VS_CONSTANT_BUFFER = (1 << 12),
DIRTY_FLAG_PS_CONSTANT_BUFFER = (1 << 13),
DIRTY_FLAG_TFX_TEXTURE_0 = (1 << 0), // 0, 1, 2, 3, 4
DIRTY_FLAG_TFX_UBO = (1 << 5),
DIRTY_FLAG_UTILITY_TEXTURE = (1 << 6),
DIRTY_FLAG_BLEND_CONSTANTS = (1 << 7),
DIRTY_FLAG_LINE_WIDTH = (1 << 8),
DIRTY_FLAG_INDEX_BUFFER = (1 << 9),
DIRTY_FLAG_VIEWPORT = (1 << 10),
DIRTY_FLAG_SCISSOR = (1 << 11),
DIRTY_FLAG_PIPELINE = (1 << 12),
DIRTY_FLAG_VS_CONSTANT_BUFFER = (1 << 13),
DIRTY_FLAG_PS_CONSTANT_BUFFER = (1 << 14),
DIRTY_FLAG_TFX_TEXTURE_TEX = (DIRTY_FLAG_TFX_TEXTURE_0 << 0),
DIRTY_FLAG_TFX_TEXTURE_PALETTE = (DIRTY_FLAG_TFX_TEXTURE_0 << 1),
DIRTY_FLAG_TFX_TEXTURE_RT = (DIRTY_FLAG_TFX_TEXTURE_0 << 2),
DIRTY_FLAG_TFX_TEXTURE_PRIMID = (DIRTY_FLAG_TFX_TEXTURE_0 << 3),
DIRTY_FLAG_TFX_TEXTURE_DEPTH = (DIRTY_FLAG_TFX_TEXTURE_0 << 4),
DIRTY_FLAG_TFX_TEXTURES = DIRTY_FLAG_TFX_TEXTURE_TEX | DIRTY_FLAG_TFX_TEXTURE_PALETTE |
DIRTY_FLAG_TFX_TEXTURE_RT | DIRTY_FLAG_TFX_TEXTURE_PRIMID,
DIRTY_FLAG_TFX_TEXTURE_RT | DIRTY_FLAG_TFX_TEXTURE_PRIMID |
DIRTY_FLAG_TFX_TEXTURE_DEPTH,
DIRTY_BASE_STATE = DIRTY_FLAG_INDEX_BUFFER | DIRTY_FLAG_PIPELINE | DIRTY_FLAG_VIEWPORT | DIRTY_FLAG_SCISSOR |
DIRTY_FLAG_BLEND_CONSTANTS | DIRTY_FLAG_LINE_WIDTH,

View File

@ -114,7 +114,7 @@ std::unique_ptr<GSTextureVK> GSTextureVK::Create(Type type, Format format, int w
VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT |
VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT |
(GSDeviceVK::GetInstance()->UseFeedbackLoopLayout() ? VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT
: 0);
: VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT);
vci.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
}
break;
@ -198,7 +198,7 @@ void GSTextureVK::Destroy(bool defer)
if (m_type == Type::RenderTarget || m_type == Type::DepthStencil)
{
for (const auto& [other_tex, fb, feedback] : m_framebuffers)
for (const auto& [other_tex, fb, feedback_color, feedback_depth] : m_framebuffers)
{
if (other_tex)
{
@ -270,38 +270,6 @@ void GSTextureVK::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch
StringUtil::StrideMemCpy(dst, upload_pitch, src, pitch, std::min(upload_pitch, pitch), count);
}
/// Creates a one-off, host-mapped transfer-source buffer of `upload_pitch * height` bytes and
/// fills it from `data` via CopyTextureDataForUpload().
///
/// @return The new buffer, already queued for deferred destruction, or VK_NULL_HANDLE if
///         vmaCreateBuffer() failed.
VkBuffer GSTextureVK::AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const
{
	GSDeviceVK* const dev = GSDeviceVK::GetInstance();
	const u32 size = upload_pitch * height;

	VkBufferCreateInfo buffer_info = {};
	buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
	buffer_info.pNext = nullptr;
	buffer_info.flags = 0;
	buffer_info.size = static_cast<VkDeviceSize>(size);
	buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
	buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
	buffer_info.queueFamilyIndexCount = 0;
	buffer_info.pQueueFamilyIndices = nullptr;

	// The coherent bit is intentionally not requested here. It was set in StreamBuffer mainly
	// for MoltenVK, which would upload the whole buffer on smaller uploads; this buffer is
	// written in full anyway.
	VmaAllocationCreateInfo alloc_create_info = {};
	alloc_create_info.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
	alloc_create_info.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;

	VmaAllocationInfo alloc_info;
	VkBuffer buffer;
	VmaAllocation allocation;
	const VkResult res =
		vmaCreateBuffer(dev->GetAllocator(), &buffer_info, &alloc_create_info, &buffer, &allocation, &alloc_info);
	if (res != VK_SUCCESS)
	{
		LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: ");
		return VK_NULL_HANDLE;
	}

	// Only needed for the copy, so queue it for freeing once the command buffer finishes.
	dev->DeferBufferDestruction(buffer, allocation);

	// Copy the texture rows into the mapped memory and flush the written range.
	CopyTextureDataForUpload(alloc_info.pMappedData, data, pitch, upload_pitch, height);
	vmaFlushAllocation(dev->GetAllocator(), allocation, 0, size);

	return buffer;
}
void GSTextureVK::UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height,
u32 buffer_height, u32 row_length, VkBuffer buffer, u32 buffer_offset)
{
@ -333,6 +301,10 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l
const u32 upload_pitch = Common::AlignUpPow2(pitch, GSDeviceVK::GetInstance()->GetBufferCopyRowPitchAlignment());
const u32 required_size = CalcUploadSize(height, upload_pitch);
const auto upload_data = [&](void* map_ptr) {
CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);
};
// If the texture is larger than half our streaming buffer size, use a separate buffer.
// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
VkBuffer buffer;
@ -340,29 +312,14 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l
if (required_size > (GSDeviceVK::GetInstance()->GetTextureUploadBuffer().GetCurrentSize() / 2))
{
buffer_offset = 0;
buffer = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height);
if (buffer == VK_NULL_HANDLE)
return false;
buffer = GSDeviceVK::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data);
}
else
{
VKStreamBuffer& sbuffer = GSDeviceVK::GetInstance()->GetTextureUploadBuffer();
if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment()))
{
GSDeviceVK::GetInstance()->ExecuteCommandBuffer(
false, "While waiting for %u bytes in texture upload buffer", required_size);
if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment()))
{
Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size);
buffer = GSDeviceVK::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, buffer_offset);
}
if (buffer == VK_NULL_HANDLE)
return false;
}
}
buffer = sbuffer.GetBuffer();
buffer_offset = sbuffer.GetCurrentOffset();
CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height);
sbuffer.CommitMemory(required_size);
}
const VkCommandBuffer cmdbuf = GetCommandBufferForUpdate();
GL_PUSH("GSTextureVK::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer);
@ -738,16 +695,16 @@ void GSTextureVK::TransitionSubresourcesToLayout(
// Returns the framebuffer for this texture with no linked depth texture, by delegating to
// GetLinkedFramebuffer() with a null depth_texture.
VkFramebuffer GSTextureVK::GetFramebuffer(bool feedback_loop)
{
// NOTE(review): two forms of the same call are visible in this view; only the three-argument one
// matches the GetLinkedFramebuffer(depth_texture, feedback_loop_color, feedback_loop_depth) signature.
return GetLinkedFramebuffer(nullptr, feedback_loop);
// feedback_loop is passed as feedback_loop_color; feedback_loop_depth is fixed to false.
return GetLinkedFramebuffer(nullptr, feedback_loop, false);
}
VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop)
VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop_color, bool feedback_loop_depth)
{
pxAssertRel(m_type != Type::Texture, "Texture is a render target");
for (const auto& [other_tex, fb, other_feedback_loop] : m_framebuffers)
for (const auto& [other_tex, fb, other_feedback_loop_color, other_feedback_loop_depth] : m_framebuffers)
{
if (other_tex == depth_texture && other_feedback_loop == feedback_loop)
if (other_tex == depth_texture && other_feedback_loop_color == feedback_loop_color && other_feedback_loop_depth == feedback_loop_depth)
return fb;
}
@ -756,7 +713,7 @@ VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool
(m_type != GSTexture::Type::DepthStencil) ? (depth_texture ? depth_texture->m_vk_format : VK_FORMAT_UNDEFINED) :
m_vk_format,
VK_ATTACHMENT_LOAD_OP_LOAD, VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_LOAD,
VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_STORE_OP_DONT_CARE, feedback_loop);
VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_STORE_OP_DONT_CARE, feedback_loop_color, feedback_loop_depth);
if (!rp)
return VK_NULL_HANDLE;
@ -771,9 +728,9 @@ VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool
if (!fb)
return VK_NULL_HANDLE;
m_framebuffers.emplace_back(depth_texture, fb, feedback_loop);
m_framebuffers.emplace_back(depth_texture, fb, feedback_loop_color, feedback_loop_depth);
if (depth_texture)
depth_texture->m_framebuffers.emplace_back(this, fb, feedback_loop);
depth_texture->m_framebuffers.emplace_back(this, fb, feedback_loop_color, feedback_loop_depth);
return fb;
}

View File

@ -73,7 +73,7 @@ public:
/// Framebuffers are lazily allocated.
VkFramebuffer GetFramebuffer(bool feedback_loop);
VkFramebuffer GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop);
VkFramebuffer GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop_color, bool feedback_loop_depth);
// Call when the texture is bound to the pipeline, or read from in a copy.
__fi void SetUseFenceCounter(u64 counter) { m_use_fence_counter = counter; }
@ -84,7 +84,6 @@ private:
VkCommandBuffer GetCommandBufferForUpdate();
void CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const;
VkBuffer AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const;
void UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height, u32 buffer_height,
u32 row_length, VkBuffer buffer, u32 buffer_offset);
@ -103,7 +102,7 @@ private:
// linked framebuffer is combined with depth texture
// list of color textures this depth texture is linked to or vice versa
std::vector<std::tuple<GSTextureVK*, VkFramebuffer, bool>> m_framebuffers;
std::vector<std::tuple<GSTextureVK*, VkFramebuffer, bool, bool>> m_framebuffers;
};
class GSDownloadTextureVK final : public GSDownloadTexture

View File

@ -19,6 +19,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move)
, m_allocation(move.m_allocation)
, m_buffer(move.m_buffer)
, m_host_pointer(move.m_host_pointer)
, m_device_local(move.m_device_local)
, m_tracked_fences(std::move(move.m_tracked_fences))
{
move.m_size = 0;
@ -28,6 +29,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move)
move.m_allocation = VK_NULL_HANDLE;
move.m_buffer = VK_NULL_HANDLE;
move.m_host_pointer = nullptr;
move.m_device_local = false;
}
VKStreamBuffer::~VKStreamBuffer()
@ -48,19 +50,29 @@ VKStreamBuffer& VKStreamBuffer::operator=(VKStreamBuffer&& move)
std::swap(m_buffer, move.m_buffer);
std::swap(m_host_pointer, move.m_host_pointer);
std::swap(m_tracked_fences, move.m_tracked_fences);
std::swap(m_device_local, move.m_device_local);
return *this;
}
bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size)
bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size, bool device_local)
{
const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast<VkDeviceSize>(size),
usage, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr};
VmaAllocationCreateInfo aci = {};
if (device_local)
{
// GPU default buffer
aci.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
}
else
{
// CPU upload buffer
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
}
VmaAllocationInfo ai = {};
VkBuffer new_buffer = VK_NULL_HANDLE;
@ -83,7 +95,8 @@ bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size)
m_tracked_fences.clear();
m_allocation = new_allocation;
m_buffer = new_buffer;
m_host_pointer = static_cast<u8*>(ai.pMappedData);
m_host_pointer = device_local ? nullptr : static_cast<u8*>(ai.pMappedData);
m_device_local = device_local;
return true;
}
@ -104,6 +117,7 @@ void VKStreamBuffer::Destroy(bool defer)
m_buffer = VK_NULL_HANDLE;
m_allocation = VK_NULL_HANDLE;
m_host_pointer = nullptr;
m_device_local = false;
}
bool VKStreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment)
@ -180,8 +194,11 @@ void VKStreamBuffer::CommitMemory(u32 final_num_bytes)
pxAssert((m_current_offset + final_num_bytes) <= m_size);
pxAssert(final_num_bytes <= m_current_space);
if (!m_device_local)
{
// For non-coherent mappings, flush the memory range
vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), m_allocation, m_current_offset, final_num_bytes);
}
m_current_offset += final_num_bytes;
m_current_space -= final_num_bytes;

View File

@ -30,14 +30,13 @@ public:
__fi u32 GetCurrentSpace() const { return m_current_space; }
__fi u32 GetCurrentOffset() const { return m_current_offset; }
bool Create(VkBufferUsageFlags usage, u32 size);
bool Create(VkBufferUsageFlags usage, u32 size, bool device_local = false);
void Destroy(bool defer);
bool ReserveMemory(u32 num_bytes, u32 alignment);
void CommitMemory(u32 final_num_bytes);
private:
bool AllocateBuffer(VkBufferUsageFlags usage, u32 size);
void UpdateCurrentFencePosition();
void UpdateGPUPosition();
@ -51,7 +50,8 @@ private:
VmaAllocation m_allocation = VK_NULL_HANDLE;
VkBuffer m_buffer = VK_NULL_HANDLE;
u8* m_host_pointer = nullptr;
u8* m_host_pointer = nullptr; // Only used for upload buffers.
bool m_device_local = false; // False for upload buffer; true for default buffer.
// List of fences and the corresponding positions in the buffer
std::deque<std::pair<u64, u32>> m_tracked_fences;

View File

@ -751,6 +751,7 @@ Pcsx2Config::GSOptions::GSOptions()
PreloadFrameWithGSData = false;
Mipmap = true;
HWMipmap = true;
HWAccuratePrims = false;
ManualUserHacks = false;
UserHacks_AlignSpriteX = false;
@ -1021,6 +1022,7 @@ void Pcsx2Config::GSOptions::LoadSave(SettingsWrapper& wrap)
SettingsWrapEntryEx(UpscaleMultiplier, "upscale_multiplier");
SettingsWrapBitBoolEx(HWMipmap, "hw_mipmap");
SettingsWrapBitBoolEx(HWAccuratePrims, "HWAccuratePrims");
SettingsWrapIntEnumEx(AccurateBlendingUnit, "accurate_blending_unit");
SettingsWrapIntEnumEx(TextureFiltering, "filter");
SettingsWrapIntEnumEx(TexturePreloading, "texture_preloading");