From 16a7cfebdde63c67b7176024bf15839a781f2e2e Mon Sep 17 00:00:00 2001 From: TJnotJT Date: Thu, 6 Nov 2025 20:19:23 -0500 Subject: [PATCH 1/2] GS/VK/GL/DX12/DX11: Implement accurate drawing/AA1 for lines and triangles. Also manual depth testing in shader and depth feedback loop. --- bin/resources/shaders/dx11/tfx.fx | 334 ++++++++++++- bin/resources/shaders/opengl/tfx_fs.glsl | 355 +++++++++++++- bin/resources/shaders/opengl/tfx_vgs.glsl | 34 ++ bin/resources/shaders/vulkan/tfx.glsl | 396 ++++++++++++++- pcsx2/Config.h | 1 + pcsx2/GS/GSState.cpp | 56 ++- pcsx2/GS/GSState.h | 6 + pcsx2/GS/GSVector.h | 15 + pcsx2/GS/Renderers/Common/GSDevice.h | 57 ++- pcsx2/GS/Renderers/DX11/GSDevice11.cpp | 136 +++++- pcsx2/GS/Renderers/DX11/GSDevice11.h | 14 +- pcsx2/GS/Renderers/DX12/GSDevice12.cpp | 175 ++++++- pcsx2/GS/Renderers/DX12/GSDevice12.h | 63 ++- pcsx2/GS/Renderers/HW/GSRendererHW.cpp | 476 +++++++++++++++++-- pcsx2/GS/Renderers/HW/GSRendererHW.h | 21 + pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h | 7 + pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp | 45 +- pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h | 2 + pcsx2/GS/Renderers/SW/GSRendererSW.h | 2 + pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp | 306 +++++++++--- pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h | 46 +- pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp | 18 +- pcsx2/GS/Renderers/Vulkan/GSTextureVK.h | 4 +- pcsx2/Pcsx2Config.cpp | 2 + 24 files changed, 2325 insertions(+), 246 deletions(-) diff --git a/bin/resources/shaders/dx11/tfx.fx b/bin/resources/shaders/dx11/tfx.fx index 1312131212..97daa14150 100644 --- a/bin/resources/shaders/dx11/tfx.fx +++ b/bin/resources/shaders/dx11/tfx.fx @@ -1,6 +1,9 @@ // SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team // SPDX-License-Identifier: GPL-3.0+ +#define ACCURATE_LINES 1 +#define ACCURATE_TRIANGLES 2 + #define FMT_32 0 #define FMT_24 1 #define FMT_16 2 @@ -21,6 +24,11 @@ #define GS_FORWARD_PRIMID 0 #endif +#ifndef ZTST_GEQUAL +#define ZTST_GEQUAL 2 +#define ZTST_GREATER 3 +#endif + #ifndef 
PS_FST #define PS_IIP 0 #define PS_FST 0 @@ -84,6 +92,7 @@ #define SW_BLEND_NEEDS_RT (SW_BLEND && (PS_BLEND_A == 1 || PS_BLEND_B == 1 || PS_BLEND_C == 1 || PS_BLEND_D == 1)) #define SW_AD_TO_HW (PS_BLEND_C == 1 && PS_A_MASKED) #define NEEDS_RT_FOR_AFAIL (PS_AFAIL == 3 && PS_NO_COLOR1) +#define NEEDS_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP) struct VS_INPUT { @@ -94,6 +103,9 @@ struct VS_INPUT uint z : POSITION1; uint2 uv : TEXCOORD2; float4 f : COLOR1; +#ifdef VS_ACCURATE_PRIMS + uint vertex_id : SV_VertexID; +#endif }; struct VS_OUTPUT @@ -107,6 +119,12 @@ struct VS_OUTPUT #else nointerpolation float4 c : COLOR0; #endif +#if VS_ACCURATE_PRIMS + nointerpolation uint accurate_prims_index : TEXCOORD3; +#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + nointerpolation uint accurate_triangles_interior : TEXCOORD4; +#endif +#endif }; struct PS_INPUT @@ -122,6 +140,38 @@ struct PS_INPUT #if (PS_DATE >= 1 && PS_DATE <= 3) || GS_FORWARD_PRIMID uint primid : SV_PrimitiveID; #endif +#if PS_ACCURATE_PRIMS + nointerpolation uint accurate_prims_index : TEXCOORD3; +#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + nointerpolation uint accurate_triangles_interior : TEXCOORD4; +#endif +#endif +}; + +struct AccuratePrimsEdgeData +{ + // Interpolated attributes + float4 t_float0; // 0 + float4 t_float1; // 16 + float4 t_int0; // 32 + float4 t_int1; // 48 + float4 c0; // 64 + float4 c1; // 80 + float4 p0; // 96 + float4 p1; // 112 + int4 edge0; // 128 + int4 edge1; // 144 + int2 xy0; // 160 + int2 xy1; // 168 + uint step_x; // 176 + uint draw0; // 180 + uint draw1; // 184 + uint top_left; // 188 + uint side; // 192 + uint _pad0; // 196 + uint _pad1; // 200 + uint _pad2; // 204 + // Total 208 }; #ifdef PIXEL_SHADER @@ -147,6 +197,8 @@ Texture2D Texture : register(t0); Texture2D Palette : register(t1); Texture2D RtTexture : register(t2); Texture2D PrimMinTexture : register(t3); +Texture2D DepthTexture : register(t4); +StructuredBuffer 
accurate_prims_data : register(t5); SamplerState TextureSampler : register(s0); #ifdef DX12 @@ -172,6 +224,12 @@ cbuffer cb1 float4x4 DitherMatrix; float ScaledScaleFactor; float RcpScaleFactor; + uint _pad0; + uint _pad1; + uint accurate_prims_base_index; + uint _pad2; + uint _pad3; + uint _pad4; }; float4 sample_c(float2 uv, float uv_w, int2 xy) @@ -1015,9 +1073,242 @@ void ps_blend(inout float4 Color, inout float4 As_rgba, float2 pos_xy) } } +#if PS_ACCURATE_PRIMS +// Interpolate vertex attributes over a line/edge manually. +void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1, inout PS_INPUT input) +{ + float weight0_f = float(weight0); + float weight1_f = float(weight1); + float weight_total = float(weight0 + weight1); + + float4 t_float_interp = (weight1_f * data.t_float1 + weight0_f * data.t_float0) / weight_total; + float4 t_int_interp = (weight1_f * data.t_int1 + weight0_f * data.t_int0) / weight_total; + float4 c_interp = (weight1_f * data.c1 + weight0_f * data.c0) / weight_total; + float z_interp = (weight1_f * data.p1.z + weight0_f * data.p0.z) / weight_total; + + // No interpolation for constant attributes. + input.t = lerp(t_float_interp, data.t_float1, float4(data.t_float1 == data.t_float0)); + input.ti = lerp(t_int_interp, data.t_int1, float4(data.t_int1 == data.t_int0)); + input.c = lerp(c_interp, data.c1, float4(data.c1 == data.c0)); + input.p.z = (data.p1.z == data.p0.z) ? data.p1.z : z_interp; + + // Clamp attributes. Fog/Z are normalized. 
+ input.c = clamp(input.c, 0.0f, 255.0f); + input.t.z = clamp(input.t.z, 0.0f, 1.0f); + input.p.z = clamp(input.p.z, 0.0f, 1.0f); +} +#endif + +#if PS_ACCURATE_PRIMS == ACCURATE_LINES +void HandleAccurateLines(inout PS_INPUT input, out float alpha_coverage) +{ + AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + input.accurate_prims_index]; + + int2 xy0 = data.xy0; + int2 xy1 = data.xy1; + int2 dxy = xy1 - xy0; + int2 xy0_i = (xy0 + 8) & ~0xF; + int2 xy1_i = (xy1 + 8) & ~0xF; + bool step_x = bool(data.step_x); + bool draw0 = bool(data.draw0); + bool draw1 = bool(data.draw1); + + // 4-bit fixed point: 16 subpixels per pixel + int2 xy_i = 16 * int2(floor(input.p.xy)); // Subtract half-integer pixel center. + + // Determine major/minor axes + int major0 = step_x ? xy0.x : xy0.y; + int major1 = step_x ? xy1.x : xy1.y; + int minor0 = step_x ? xy0.y : xy0.x; + int minor1 = step_x ? xy1.y : xy1.x; + int major_i = step_x ? xy_i.x : xy_i.y; + int minor_i = step_x ? xy_i.y : xy_i.x; + int d_major = step_x ? dxy.x : dxy.y; + int d_major_scaled = 16 * d_major; + + int major0_i = step_x ? xy0_i.x : xy0_i.y; + int major1_i = step_x ? 
xy1_i.x : xy1_i.y; + + // Discard if outside line range + if (major_i < min(major0_i, major1_i) || + major_i > max(major0_i, major1_i)) + discard; + + if ((major_i == major0_i && !draw0) || + (major_i == major1_i && !draw1)) + discard; + + int weight0 = major1 - major_i; + int weight1 = major_i - major0; + + // Compute minor axis line in fixed-point + int minor_line = weight1 * minor1 + weight0 * minor0; + +#if PS_ACCURATE_PRIMS_AA + // Proper fixed-point AA rounding + int minor_i_expected_0 = (minor_line / d_major) & ~0xF; + int minor_i_expected_1 = minor_i_expected_0 + 16; + int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0); + int alpha_i_1 = d_major_scaled - alpha_i_0; + + int alpha_i; + if (minor_i == minor_i_expected_0) + alpha_i = alpha_i_0; + else if (minor_i == minor_i_expected_1) + alpha_i = alpha_i_1; + else + { + alpha_i = 0; // Prevent compiler warning. + discard; + } + // Make sure that the output alpha is always <= 127 for AA. + alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f)); +#else + // Non-AA: fixed-point rounding and 4-bit alignment + int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF; + if (minor_i != minor_i_expected) + discard; + alpha_coverage = 128.0f; +#endif + + // Interpolate attributes + InterpolateAttributesManual(data, weight0, weight1, input); +} +#endif + +#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES +void HandleAccurateTrianglesEdge(inout PS_INPUT input, out float alpha_coverage) +{ + AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + input.accurate_prims_index]; + + int2 xy0 = data.xy0; + int2 xy1 = data.xy1; + int2 dxy = xy1 - xy0; + int2 xy0_i = (xy0 + 8) & ~0xF; + int2 xy1_i = (xy1 + 8) & ~0xF; + bool step_x = bool(data.step_x); + bool side = bool(data.side); + bool top_left = bool(data.top_left); + + // 4-bit fixed point: 16 subpixels per pixel + int2 xy_i = 16 * int2(floor(input.p.xy)); // Subtract 
half-integer pixel center. + + // Determine major/minor axes + int major0 = step_x ? xy0.x : xy0.y; + int major1 = step_x ? xy1.x : xy1.y; + int minor0 = step_x ? xy0.y : xy0.x; + int minor1 = step_x ? xy1.y : xy1.x; + int major_i = step_x ? xy_i.x : xy_i.y; + int minor_i = step_x ? xy_i.y : xy_i.x; + int d_major = step_x ? dxy.x : dxy.y; + int d_major_scaled = 16 * d_major; + + int major0_i = step_x ? xy0_i.x : xy0_i.y; + int major1_i = step_x ? xy1_i.x : xy1_i.y; + + // Discard if outside edge range. + // Note: this is not exactly what the SW rasterizer does. + // See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking. + if (major_i < min(major0_i, major1_i) || + major_i > max(major0_i, major1_i)) + discard; + + // Discard if on wrong side of other edges + if (dot(data.edge0, int4(xy_i, 1, 0)) <= 0 || + dot(data.edge1, int4(xy_i, 1, 0)) <= 0) + discard; + + int weight0 = major1 - major_i; + int weight1 = major_i - major0; + + // Compute minor axis line in fixed-point + int minor_line = weight1 * minor1 + weight0 * minor0; + int minor_i_expected = minor_line / d_major; + int minor_i_expected_0 = minor_i_expected & ~0xF; + int minor_i_expected_1 = minor_i_expected_0 + 16; + int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0); + int alpha_i_1 = d_major_scaled - alpha_i_0; + + // Proper fixed-point AA rounding + int alpha_i; + if ((minor_i_expected & 0xF) == 0) + { + // On a pixel center + alpha_i = top_left ? 0 : d_major_scaled; + minor_i_expected += top_left ? (side ? -16 : 16) : 0; + } + else if (side) + { + minor_i_expected = minor_i_expected_0; + alpha_i = alpha_i_0; + } + else + { + minor_i_expected = minor_i_expected_1; + alpha_i = alpha_i_1; + } + if (minor_i != minor_i_expected) + discard; + +#if PS_ACCURATE_PRIMS_AA + // Make sure that the output alpha is always <= 127 for AA. 
+ alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f)); +#else + alpha_coverage = 128.0f; +#endif + + // Interpolate attributes + InterpolateAttributesManual(data, weight0, weight1, input); +} +#endif + PS_OUTPUT ps_main(PS_INPUT input) { +#if PS_ACCURATE_PRIMS + float alpha_coverage; +#if PS_ACCURATE_PRIMS == ACCURATE_LINES + HandleAccurateLines(input, alpha_coverage); +#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + if (bool(input.accurate_triangles_interior)) + { + alpha_coverage = 128.0f; + } + else + { + HandleAccurateTrianglesEdge(input, alpha_coverage); + } +#endif +#endif // PS_ACCURATE_PRIMS + +#if NEEDS_DEPTH + float current_depth = DepthTexture.Load(int3(floor(input.p.xy), 0)).r; +#endif + +#if PS_ZCLAMP && (PS_ZTST == ZTST_GEQUAL || PS_ZTST == ZTST_GREATER) + #if PS_ZTST == ZTST_GEQUAL + if (input.p.z < current_depth) + discard; + #elif PS_ZTST == ZTST_GREATER + if (input.p.z <= current_depth) + discard; + #endif +#endif // PS_ZTST + float4 C = ps_color(input); + +#if PS_FIXED_ONE_A + // AA (Fixed one) will output a coverage of 1.0 as alpha + C.a = 128.0f; +#elif PS_ACCURATE_PRIMS_AA + // AA: coverage is computed in alpha_coverage + #if PS_ACCURATE_PRIMS_AA_ABE + if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128. 
+ C.a = alpha_coverage; + #else + C.a = alpha_coverage; + #endif +#endif + bool atst_pass = atst(C); #if PS_AFAIL == 0 // KEEP or ATST off @@ -1034,14 +1325,6 @@ PS_OUTPUT ps_main(PS_INPUT input) discard; } - // Must be done before alpha correction - - // AA (Fixed one) will output a coverage of 1.0 as alpha - if (PS_FIXED_ONE_A) - { - C.a = 128.0f; - } - float4 alpha_blend = (float4)0.0f; if (SW_AD_TO_HW) { @@ -1210,7 +1493,14 @@ PS_OUTPUT ps_main(PS_INPUT input) #endif // PS_DATE != 1/2 #if PS_ZCLAMP - output.depth = min(input.p.z, MaxDepthPS); + #if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + if (bool(input.accurate_triangles_interior)) + output.depth = min(input.p.z, MaxDepthPS); + else + output.depth = current_depth; // No depth update for triangle edges. + #else + output.depth = min(input.p.z, MaxDepthPS); + #endif #endif return output; @@ -1236,7 +1526,9 @@ cbuffer cb0 float2 TextureOffset; float2 PointSize; uint MaxDepth; - uint BaseVertex; // Only used in DX11. + uint pad_cb0; + uint BaseVertex; + uint pad_cb0_2; }; VS_OUTPUT vs_main(VS_INPUT input) @@ -1256,6 +1548,28 @@ VS_OUTPUT vs_main(VS_INPUT input) output.p.xy = output.p.xy * float2(VertexScale.x, -VertexScale.y) - float2(VertexOffset.x, -VertexOffset.y); output.p.z *= exp2(-32.0f); // integer->float depth + #if VS_ACCURATE_PRIMS == ACCURATE_LINES + output.accurate_prims_index = input.vertex_id / 6; + output.t = 0.0f; + output.ti = 0.0f; + output.c = 0.0f; + return output; // Don't send line vertex attributes - they are interpolated manually in the pixel shader. + #elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + uint prim_id = input.vertex_id / 21; + output.accurate_triangles_interior = uint((input.vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior. + if (!bool(output.accurate_triangles_interior)) + { + uint edge = (input.vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge. 
+ output.accurate_prims_index = 3 * prim_id + edge; + output.t = 0.0f; + output.ti = 0.0f; + output.c = 0.0f; + return output; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader. + } + output.accurate_prims_index = 0; + // Send the interior vertex attributes for fixed function interpolation. + #endif + if(VS_TME) { float2 uv = input.uv - TextureOffset; diff --git a/bin/resources/shaders/opengl/tfx_fs.glsl b/bin/resources/shaders/opengl/tfx_fs.glsl index afa0b9a1de..9c5ab3552f 100644 --- a/bin/resources/shaders/opengl/tfx_fs.glsl +++ b/bin/resources/shaders/opengl/tfx_fs.glsl @@ -3,6 +3,9 @@ //#version 420 // Keep it for text editor detection +#define ACCURATE_LINES 1 +#define ACCURATE_TRIANGLES 2 + #define FMT_32 0 #define FMT_24 1 #define FMT_16 2 @@ -11,6 +14,11 @@ #define SHUFFLE_WRITE 2 #define SHUFFLE_READWRITE 3 +#ifndef ZTST_GEQUAL +#define ZTST_GEQUAL 2 +#define ZTST_GREATER 3 +#endif + // TEX_COORD_DEBUG output the uv coordinate as color. 
It is useful // to detect bad sampling due to upscaling //#define TEX_COORD_DEBUG @@ -28,6 +36,9 @@ #define NEEDS_RT_FOR_AFAIL (PS_AFAIL == 3 && PS_NO_COLOR1) #define NEEDS_RT (NEEDS_RT_EARLY || NEEDS_RT_FOR_AFAIL || (!PS_PRIMID_INIT && (PS_FBMASK || SW_BLEND_NEEDS_RT || SW_AD_TO_HW))) #define NEEDS_TEX (PS_TFX != 4) +#define NEEDS_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP) + +vec4 FragCoord; layout(std140, binding = 0) uniform cb21 { @@ -57,8 +68,71 @@ layout(std140, binding = 0) uniform cb21 float ScaledScaleFactor; float RcpScaleFactor; + uint _pad0; + uint _pad1; + + uint accurate_prims_base_index; + uint _pad2; + uint _pad3; + uint _pad4; }; +#if PS_ACCURATE_PRIMS +struct +{ + vec4 t_float; + vec4 t_int; + vec4 c; +} PSin; + +in SHADER +{ + vec4 t_float; + vec4 t_int; + + #if PS_IIP != 0 + vec4 c; + #else + flat vec4 c; + #endif +} PSinReal; + +flat in uint accurate_prims_index; +#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES +flat in uint accurate_triangles_interior; +#endif + +struct AccuratePrimsEdgeData +{ + // Interpolated attributes + vec4 t_float0; // 0 + vec4 t_float1; // 16 + vec4 t_int0; // 32 + vec4 t_int1; // 48 + vec4 c0; // 64 + vec4 c1; // 80 + vec4 p0; // 96 + vec4 p1; // 112 + ivec4 edge0; // 128 + ivec4 edge1; // 144 + ivec2 xy0; // 160 + ivec2 xy1; // 168 + uint step_x; // 176 + uint draw0; // 180 + uint draw1; // 184 + uint top_left; // 188 + uint side; // 192 + uint _pad0; // 196 + uint _pad1; // 200 + uint _pad2; // 204 + // Total 208 +}; + +layout (std140, binding = 3) buffer AccuratePrimsEdgeDataBuffer { + AccuratePrimsEdgeData accurate_prims_data[]; +}; + +#else in SHADER { vec4 t_float; @@ -70,6 +144,7 @@ in SHADER flat vec4 c; #endif } PSin; +#endif #define TARGET_0_QUALIFIER out @@ -107,9 +182,10 @@ layout(binding = 2) uniform sampler2D RtSampler; // note 2 already use by the im #if PS_DATE == 3 layout(binding = 3) uniform sampler2D img_prim_min; +#endif -// I don't remember why I set this 
parameter but it is surely useless -//layout(pixel_center_integer) in vec4 gl_FragCoord; +#if NEEDS_DEPTH +layout(binding = 4) uniform sampler2D DepthSampler; #endif vec4 sample_from_rt() @@ -119,7 +195,16 @@ vec4 sample_from_rt() #elif HAS_FRAMEBUFFER_FETCH return LAST_FRAG_COLOR; #else - return texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0); + return texelFetch(RtSampler, ivec2(FragCoord.xy), 0); +#endif +} + +vec4 sample_from_depth() +{ +#if !NEEDS_DEPTH + return vec4(0.0); +#else + return texelFetch(DepthSampler, ivec2(FragCoord.xy), 0); #endif } @@ -315,7 +400,7 @@ int fetch_raw_depth() #if PS_TEX_IS_FB == 1 return int(sample_from_rt().r * multiplier); #else - return int(texelFetch(TextureSampler, ivec2(gl_FragCoord.xy), 0).r * multiplier); + return int(texelFetch(TextureSampler, ivec2(FragCoord.xy), 0).r * multiplier); #endif } @@ -324,7 +409,7 @@ vec4 fetch_raw_color() #if PS_TEX_IS_FB == 1 return sample_from_rt(); #else - return texelFetch(TextureSampler, ivec2(gl_FragCoord.xy), 0); + return texelFetch(TextureSampler, ivec2(FragCoord.xy), 0); #endif } @@ -724,9 +809,9 @@ void ps_dither(inout vec3 C, float As) { #if PS_DITHER > 0 && PS_DITHER < 3 #if PS_DITHER == 2 - ivec2 fpos = ivec2(gl_FragCoord.xy); + ivec2 fpos = ivec2(FragCoord.xy); #else - ivec2 fpos = ivec2(gl_FragCoord.xy * RcpScaleFactor); + ivec2 fpos = ivec2(FragCoord.xy * RcpScaleFactor); #endif float value = DitherMatrix[fpos.y&3][fpos.x&3]; @@ -967,11 +1052,233 @@ float As = As_rgba.a; #endif } +#if PS_ACCURATE_PRIMS +// Interpolate vertex attributes over a line/edge manually. 
+void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1) +{ + float weight0_f = float(weight0); + float weight1_f = float(weight1); + float weight_total = float(weight0 + weight1); + + vec4 t_float_interp = (weight1_f * data.t_float1 + weight0_f * data.t_float0) / weight_total; + vec4 t_int_interp = (weight1_f * data.t_int1 + weight0_f * data.t_int0) / weight_total; + vec4 c_interp = (weight1_f * data.c1 + weight0_f * data.c0) / weight_total; + float z_interp = (weight1_f * data.p1.z + weight0_f * data.p0.z) / weight_total; + + // No interpolation for constant attributes. + PSin.t_float = mix(t_float_interp, data.t_float1, equal(data.t_float1, data.t_float0)); + PSin.t_int = mix(t_int_interp, data.t_int1, equal(data.t_int1, data.t_int0)); + PSin.c = mix(c_interp, data.c1, equal(data.c1, data.c0)); + FragCoord.z = (data.p1.z == data.p0.z) ? data.p1.z : z_interp; + + // Clamp attributes. Fog/Z are normalized. + PSin.c = clamp(PSin.c, 0.0f, 255.0f); + PSin.t_float.z = clamp(PSin.t_float.z, 0.0f, 1.0f); + FragCoord.z = clamp(FragCoord.z, 0.0f, 1.0f); +} +#endif + +#if PS_ACCURATE_PRIMS == ACCURATE_LINES +void HandleAccurateLines(out float alpha_coverage) +{ + AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index]; + + ivec2 xy0 = data.xy0; + ivec2 xy1 = data.xy1; + ivec2 dxy = xy1 - xy0; + ivec2 xy0_i = (xy0 + 8) & ~0xF; + ivec2 xy1_i = (xy1 + 8) & ~0xF; + bool step_x = bool(data.step_x); + bool draw0 = bool(data.draw0); + bool draw1 = bool(data.draw1); + + // 4-bit fixed point: 16 subpixels per pixel + ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // Subtract half-integer pixel center. + + // Determine major/minor axes + int major0 = step_x ? xy0.x : xy0.y; + int major1 = step_x ? xy1.x : xy1.y; + int minor0 = step_x ? xy0.y : xy0.x; + int minor1 = step_x ? xy1.y : xy1.x; + int major_i = step_x ? xy_i.x : xy_i.y; + int minor_i = step_x ? xy_i.y : xy_i.x; + int d_major = step_x ? 
dxy.x : dxy.y; + int d_major_scaled = 16 * d_major; + + int major0_i = step_x ? xy0_i.x : xy0_i.y; + int major1_i = step_x ? xy1_i.x : xy1_i.y; + + // Discard if outside line range + if (major_i < min(major0_i, major1_i) || + major_i > max(major0_i, major1_i)) + discard; + + if ((major_i == major0_i && !draw0) || + (major_i == major1_i && !draw1)) + discard; + + int weight0 = major1 - major_i; + int weight1 = major_i - major0; + + // Compute minor axis line in fixed-point + int minor_line = weight1 * minor1 + weight0 * minor0; + +#if PS_ACCURATE_PRIMS_AA + // Proper fixed-point AA rounding + int minor_i_expected_0 = (minor_line / d_major) & ~0xF; + int minor_i_expected_1 = minor_i_expected_0 + 16; + int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0); + int alpha_i_1 = d_major_scaled - alpha_i_0; + + int alpha_i; + if (minor_i == minor_i_expected_0) + alpha_i = alpha_i_0; + else if (minor_i == minor_i_expected_1) + alpha_i = alpha_i_1; + else + discard; + // Make sure that the output alpha is always <= 127 for AA. 
+ alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f)); +#else + // Non-AA: fixed-point rounding and 4-bit alignment + int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF; + if (minor_i != minor_i_expected) + discard; + alpha_coverage = 128.0f; +#endif + + // Interpolate attributes + InterpolateAttributesManual(data, weight0, weight1); +} +#endif + +#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES +void HandleAccurateTrianglesEdge(out float alpha_coverage) +{ + AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index]; + + ivec2 xy0 = data.xy0; + ivec2 xy1 = data.xy1; + ivec2 dxy = xy1 - xy0; + ivec2 xy0_i = (xy0 + 8) & ~0xF; + ivec2 xy1_i = (xy1 + 8) & ~0xF; + bool step_x = bool(data.step_x); + bool side = bool(data.side); + bool top_left = bool(data.top_left); + + // 4-bit fixed point: 16 subpixels per pixel + ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // Subtract half-integer pixel center. + + // Determine major/minor axes + int major0 = step_x ? xy0.x : xy0.y; + int major1 = step_x ? xy1.x : xy1.y; + int minor0 = step_x ? xy0.y : xy0.x; + int minor1 = step_x ? xy1.y : xy1.x; + int major_i = step_x ? xy_i.x : xy_i.y; + int minor_i = step_x ? xy_i.y : xy_i.x; + int d_major = step_x ? dxy.x : dxy.y; + int d_major_scaled = 16 * d_major; + + int major0_i = step_x ? xy0_i.x : xy0_i.y; + int major1_i = step_x ? xy1_i.x : xy1_i.y; + + // Discard if outside edge range. + // Note: this is not exactly what the SW rasterizer does. + // See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking. 
+ if (major_i < min(major0_i, major1_i) || + major_i > max(major0_i, major1_i)) + discard; + + // Discard if on wrong side of other edges + if (dot(data.edge0, ivec4(xy_i, 1, 0)) <= 0 || + dot(data.edge1, ivec4(xy_i, 1, 0)) <= 0) + discard; + + int weight0 = major1 - major_i; + int weight1 = major_i - major0; + + // Compute minor axis line in fixed-point + int minor_line = weight1 * minor1 + weight0 * minor0; + int minor_i_expected = minor_line / d_major; + int minor_i_expected_0 = minor_i_expected & ~0xF; + int minor_i_expected_1 = minor_i_expected_0 + 16; + bool minor_i_pixel_center = ((minor_line - d_major * minor_i_expected_0) & 0xF) == 0; + int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0); + int alpha_i_1 = d_major_scaled - alpha_i_0; + + // Proper fixed-point AA rounding + int alpha_i; + if ((minor_i_expected & 0xF) == 0) + { + // On a pixel center + alpha_i = top_left ? 0 : d_major_scaled; + minor_i_expected += top_left ? (side ? -16 : 16) : 0; + } + else if (side) + { + minor_i_expected = minor_i_expected_0; + alpha_i = alpha_i_0; + } + else + { + minor_i_expected = minor_i_expected_1; + alpha_i = alpha_i_1; + } + if (minor_i != minor_i_expected) + discard; + +#if PS_ACCURATE_PRIMS_AA + // Make sure that the output alpha is always <= 127 for AA. 
+ alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f)); +#else + alpha_coverage = 128.0f; +#endif + + // Interpolate attributes + InterpolateAttributesManual(data, weight0, weight1); +} +#endif + void ps_main() { + FragCoord = gl_FragCoord; + +#if PS_ACCURATE_PRIMS + float alpha_coverage; +#if PS_ACCURATE_PRIMS == ACCURATE_LINES + HandleAccurateLines(alpha_coverage); +#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + if (bool(accurate_triangles_interior)) + { + alpha_coverage = 128.0f; + PSin.t_float = PSinReal.t_float; + PSin.t_int = PSinReal.t_int; + PSin.c = PSinReal.c; + } + else + { + HandleAccurateTrianglesEdge(alpha_coverage); + } +#endif +#endif // PS_ACCURATE_PRIMS + +#if NEEDS_DEPTH + float current_depth = sample_from_depth().r; +#endif + +#if PS_ZCLAMP && (PS_ZTST == ZTST_GEQUAL || PS_ZTST == ZTST_GREATER) + #if PS_ZTST == ZTST_GEQUAL + if (FragCoord.z < current_depth) + discard; + #elif PS_ZTST == ZTST_GREATER + if (FragCoord.z <= current_depth) + discard; + #endif +#endif // PS_ZTST + #if PS_SCANMSK & 2 // fail depth test on prohibited lines - if ((int(gl_FragCoord.y) & 1) == (PS_SCANMSK & 1)) + if ((int(FragCoord.y) & 1) == (PS_SCANMSK & 1)) discard; #endif @@ -1007,7 +1314,7 @@ void ps_main() #endif #if PS_DATE == 3 - int stencil_ceil = int(texelFetch(img_prim_min, ivec2(gl_FragCoord.xy), 0).r); + int stencil_ceil = int(texelFetch(img_prim_min, ivec2(FragCoord.xy), 0).r); // Note gl_PrimitiveID == stencil_ceil will be the primitive that will update // the bad alpha value so we must keep it. @@ -1017,6 +1324,20 @@ void ps_main() #endif vec4 C = ps_color(); + +#if PS_FIXED_ONE_A + // AA (Fixed one) will output a coverage of 1.0 as alpha + C.a = 128.0f; +#elif PS_ACCURATE_PRIMS_AA + // AA: coverage is computed in alpha_coverage + #if PS_ACCURATE_PRIMS_AA_ABE + if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128. 
+ C.a = alpha_coverage; + #else + C.a = alpha_coverage; + #endif +#endif + bool atst_pass = atst(C); #if PS_AFAIL == 0 // KEEP or ATST off @@ -1024,13 +1345,6 @@ void ps_main() discard; #endif - // Must be done before alpha correction - - // AA (Fixed one) will output a coverage of 1.0 as alpha -#if PS_FIXED_ONE_A - C.a = 128.0f; -#endif - #if SW_AD_TO_HW #if PS_RTA_CORRECTION vec4 RT = trunc(sample_from_rt() * 128.0f + 0.1f); @@ -1144,6 +1458,13 @@ void ps_main() #endif #if PS_ZCLAMP - gl_FragDepth = min(gl_FragCoord.z, MaxDepthPS); + #if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + if (bool(accurate_triangles_interior)) + gl_FragDepth = min(FragCoord.z, MaxDepthPS); + else + gl_FragDepth = current_depth; // No depth update for triangle edges. + #else + gl_FragDepth = min(FragCoord.z, MaxDepthPS); + #endif #endif } diff --git a/bin/resources/shaders/opengl/tfx_vgs.glsl b/bin/resources/shaders/opengl/tfx_vgs.glsl index 4cc2e85ce5..9f1a2dcc02 100644 --- a/bin/resources/shaders/opengl/tfx_vgs.glsl +++ b/bin/resources/shaders/opengl/tfx_vgs.glsl @@ -3,6 +3,16 @@ //#version 420 // Keep it for text editor detection +#define ACCURATE_LINES 1 +#define ACCURATE_TRIANGLES 2 + +#if VS_ACCURATE_PRIMS +flat out uint accurate_prims_index; +#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES +flat out uint accurate_triangles_interior; +#endif +#endif + layout(std140, binding = 1) uniform cb20 { vec2 VertexScale; @@ -14,6 +24,8 @@ layout(std140, binding = 1) uniform cb20 vec2 PointSize; uint MaxDepth; uint pad_cb20; + uint BaseVertex; + uint pad_cb20_2; }; #ifdef VERTEX_SHADER @@ -75,6 +87,28 @@ void vs_main() gl_Position.z = float(z) * exp_min32; gl_Position.w = 1.0f; + #if VS_ACCURATE_PRIMS == ACCURATE_LINES + accurate_prims_index = (gl_VertexID - BaseVertex) / 6; + VSout.t_float = vec4(0.0f); + VSout.t_int = vec4(0.0f); + VSout.c = vec4(0.0f); + return; // Don't send line vertex attributes - they are interpolated manually in the fragment shader. 
+ #elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + uint vertex_id = gl_VertexID - BaseVertex; + uint prim_id = vertex_id / 21; + accurate_triangles_interior = uint((vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior. + if (!bool(accurate_triangles_interior)) + { + uint edge = (vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge. + accurate_prims_index = 3 * prim_id + edge; + VSout.t_float = vec4(0.0f); + VSout.t_int = vec4(0.0f); + VSout.c = vec4(0.0f); + return; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader. + } + // Send the interior vertex attributes for fixed function interpolation. + #endif + texture_coord(); VSout.c = i_c; diff --git a/bin/resources/shaders/vulkan/tfx.glsl b/bin/resources/shaders/vulkan/tfx.glsl index aba5993ca4..77838c158a 100644 --- a/bin/resources/shaders/vulkan/tfx.glsl +++ b/bin/resources/shaders/vulkan/tfx.glsl @@ -1,12 +1,23 @@ // SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team // SPDX-License-Identifier: GPL-3.0+ +#define ACCURATE_LINES 1 +#define ACCURATE_TRIANGLES 2 + ////////////////////////////////////////////////////////////////////// // Vertex Shader ////////////////////////////////////////////////////////////////////// + #if defined(VERTEX_SHADER) +#if VS_ACCURATE_PRIMS +layout(location = 7) flat out uint accurate_prims_index; +#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES +layout(location = 8) flat out uint accurate_triangles_interior; +#endif +#endif + layout(std140, set = 0, binding = 0) uniform cb0 { vec2 VertexScale; @@ -16,6 +27,8 @@ layout(std140, set = 0, binding = 0) uniform cb0 vec2 PointSize; uint MaxDepth; uint pad_cb0; + uint BaseVertex; + uint pad_cb0_2; }; layout(location = 0) out VSOutput @@ -55,6 +68,28 @@ void main() gl_Position.z *= exp2(-32.0f); // integer->float depth gl_Position.y = -gl_Position.y; + #if VS_ACCURATE_PRIMS == ACCURATE_LINES + accurate_prims_index = (gl_VertexIndex - BaseVertex) 
/ 6; + vsOut.t = vec4(0.0f); + vsOut.ti = vec4(0.0f); + vsOut.c = vec4(0.0f); + return; // Don't send line vertex attributes - they are interpolated manually in the fragment shader. + #elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + uint vertex_id = gl_VertexIndex - BaseVertex; + uint prim_id = vertex_id / 21; + accurate_triangles_interior = uint((vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior. + if (!bool(accurate_triangles_interior)) + { + uint edge = (vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge. + accurate_prims_index = 3 * prim_id + edge; + vsOut.t = vec4(0.0f); + vsOut.ti = vec4(0.0f); + vsOut.c = vec4(0.0f); + return; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader. + } + // Send the interior vertex attributes for fixed function interpolation. + #endif + #if VS_TME vec2 uv = a_uv - TextureOffset; vec2 st = a_st - TextureOffset; @@ -245,6 +280,11 @@ void main() #define GS_LINE 0 #endif +#ifndef ZTST_GEQUAL +#define ZTST_GEQUAL 2 +#define ZTST_GREATER 3 +#endif + #ifndef PS_FST #define PS_FST 0 #define PS_WMS 0 @@ -298,9 +338,12 @@ void main() #define AFAIL_NEEDS_RT (PS_AFAIL == 3 && PS_NO_COLOR1) #define PS_FEEDBACK_LOOP_IS_NEEDED (PS_TEX_IS_FB == 1 || AFAIL_NEEDS_RT || PS_FBMASK || SW_BLEND_NEEDS_RT || SW_AD_TO_HW || (PS_DATE >= 5)) +#define PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP) #define NEEDS_TEX (PS_TFX != 4) +vec4 FragCoord; + layout(std140, set = 0, binding = 1) uniform cb1 { vec3 FogColor; @@ -320,8 +363,71 @@ layout(std140, set = 0, binding = 1) uniform cb1 mat4 DitherMatrix; float ScaledScaleFactor; float RcpScaleFactor; + uint _pad0; + uint _pad1; + + uint accurate_prims_base_index; + uint _pad2; + uint _pad3; + uint _pad4; }; +#if PS_ACCURATE_PRIMS +struct +{ + vec4 t; + vec4 ti; + vec4 c; +} vsIn; + +layout(location = 0) in VSOutput +{ + vec4 t; + vec4 
ti; + #if PS_IIP != 0 + vec4 c; + #else + flat vec4 c; + #endif +} vsInReal; + +layout(location = 7) flat in uint accurate_prims_index; +#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES +layout(location = 8) flat in uint accurate_triangles_interior; +#endif + +struct AccuratePrimsEdgeData +{ + // Interpolated attributes + vec4 t_float0; // 0 + vec4 t_float1; // 16 + vec4 t_int0; // 32 + vec4 t_int1; // 48 + vec4 c0; // 64 + vec4 c1; // 80 + vec4 p0; // 96 + vec4 p1; // 112 + ivec4 edge0; // 128 + ivec4 edge1; // 144 + ivec2 xy0; // 160 + ivec2 xy1; // 168 + uint step_x; // 176 + uint draw0; // 180 + uint draw1; // 184 + uint top_left; // 188 + uint side; // 192 + uint _pad0; // 196 + uint _pad1; // 200 + uint _pad2; // 204 + // Total 208 +}; + +layout (std140, set = 0, binding = 3) readonly buffer AccuratePrimsEdgeDataBuffer { + AccuratePrimsEdgeData accurate_prims_data[]; +}; + +#else // PS_ACCURATE_PRIMS + layout(location = 0) in VSOutput { vec4 t; @@ -333,6 +439,8 @@ layout(location = 0) in VSOutput #endif } vsIn; +#endif + #if !PS_NO_COLOR && !PS_NO_COLOR1 layout(location = 0, index = 0) out vec4 o_col0; layout(location = 0, index = 1) out vec4 o_col1; @@ -345,13 +453,21 @@ layout(set = 1, binding = 0) uniform sampler2D Texture; layout(set = 1, binding = 1) uniform texture2D Palette; #endif -#if PS_FEEDBACK_LOOP_IS_NEEDED +#if PS_FEEDBACK_LOOP_IS_NEEDED || PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH #if defined(DISABLE_TEXTURE_BARRIER) || defined(HAS_FEEDBACK_LOOP_LAYOUT) layout(set = 1, binding = 2) uniform texture2D RtSampler; - vec4 sample_from_rt() { return texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0); } + vec4 sample_from_rt() { return texelFetch(RtSampler, ivec2(FragCoord.xy), 0); } + #if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH + layout(set = 1, binding = 4) uniform texture2D DepthSampler; + vec4 sample_from_depth() { return texelFetch(DepthSampler, ivec2(FragCoord.xy), 0); } + #endif #else layout(input_attachment_index = 0, set = 1, binding = 2) uniform subpassInput 
RtSampler; vec4 sample_from_rt() { return subpassLoad(RtSampler); } + #if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH + layout(input_attachment_index = 1, set = 1, binding = 4) uniform subpassInput DepthSampler; + vec4 sample_from_depth() { return subpassLoad(DepthSampler); } + #endif #endif #endif @@ -925,19 +1041,19 @@ vec4 ps_color() #if !NEEDS_TEX vec4 T = vec4(0.0f); #elif PS_CHANNEL_FETCH == 1 - vec4 T = fetch_red(ivec2(gl_FragCoord.xy)); + vec4 T = fetch_red(ivec2(FragCoord.xy)); #elif PS_CHANNEL_FETCH == 2 - vec4 T = fetch_green(ivec2(gl_FragCoord.xy)); + vec4 T = fetch_green(ivec2(FragCoord.xy)); #elif PS_CHANNEL_FETCH == 3 - vec4 T = fetch_blue(ivec2(gl_FragCoord.xy)); + vec4 T = fetch_blue(ivec2(FragCoord.xy)); #elif PS_CHANNEL_FETCH == 4 - vec4 T = fetch_alpha(ivec2(gl_FragCoord.xy)); + vec4 T = fetch_alpha(ivec2(FragCoord.xy)); #elif PS_CHANNEL_FETCH == 5 - vec4 T = fetch_rgb(ivec2(gl_FragCoord.xy)); + vec4 T = fetch_rgb(ivec2(FragCoord.xy)); #elif PS_CHANNEL_FETCH == 6 - vec4 T = fetch_gXbY(ivec2(gl_FragCoord.xy)); + vec4 T = fetch_gXbY(ivec2(FragCoord.xy)); #elif PS_DEPTH_FMT > 0 - vec4 T = sample_depth(st_int, ivec2(gl_FragCoord.xy)); + vec4 T = sample_depth(st_int, ivec2(FragCoord.xy)); #else vec4 T = sample_color(st); #endif @@ -985,9 +1101,9 @@ void ps_dither(inout vec3 C, float As) ivec2 fpos; #if PS_DITHER == 2 - fpos = ivec2(gl_FragCoord.xy); + fpos = ivec2(FragCoord.xy); #else - fpos = ivec2(gl_FragCoord.xy * RcpScaleFactor); + fpos = ivec2(FragCoord.xy * RcpScaleFactor); #endif float value = DitherMatrix[fpos.y & 3][fpos.x & 3]; @@ -1228,11 +1344,232 @@ void ps_blend(inout vec4 Color, inout vec4 As_rgba) #endif } +#if PS_ACCURATE_PRIMS +// Interpolate vertex attributes over a line/edge manually. 
+void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1) +{ + float weight0_f = float(weight0); + float weight1_f = float(weight1); + float weight_total = float(weight0 + weight1); + + vec4 t_float_interp = (weight1_f * data.t_float1 + weight0_f * data.t_float0) / weight_total; + vec4 t_int_interp = (weight1_f * data.t_int1 + weight0_f * data.t_int0) / weight_total; + vec4 c_interp = (weight1_f * data.c1 + weight0_f * data.c0) / weight_total; + float z_interp = (weight1_f * data.p1.z + weight0_f * data.p0.z) / weight_total; + + // No interpolation for constant attributes. + vsIn.t = mix(t_float_interp, data.t_float1, equal(data.t_float1, data.t_float0)); + vsIn.ti = mix(t_int_interp, data.t_int1, equal(data.t_int1, data.t_int0)); + vsIn.c = mix(c_interp, data.c1, equal(data.c1, data.c0)); + FragCoord.z = (data.p1.z == data.p0.z) ? data.p1.z : z_interp; + + // Clamp attributes. Fog/Z are normalized. + vsIn.c = clamp(vsIn.c, 0.0f, 255.0f); + vsIn.t.z = clamp(vsIn.t.z, 0.0f, 1.0f); + FragCoord.z = clamp(FragCoord.z, 0.0f, 1.0f); +} +#endif + +#if PS_ACCURATE_PRIMS == ACCURATE_LINES +void HandleAccurateLines(out float alpha_coverage) +{ + AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index]; + + ivec2 xy0 = data.xy0; + ivec2 xy1 = data.xy1; + ivec2 dxy = xy1 - xy0; + ivec2 xy0_i = (xy0 + 8) & ~0xF; + ivec2 xy1_i = (xy1 + 8) & ~0xF; + bool step_x = bool(data.step_x); + bool draw0 = bool(data.draw0); + bool draw1 = bool(data.draw1); + + // 4-bit fixed point: 16 subpixels per pixel + ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // Subtract half-integer pixel center. + + // Determine major/minor axes + int major0 = step_x ? xy0.x : xy0.y; + int major1 = step_x ? xy1.x : xy1.y; + int minor0 = step_x ? xy0.y : xy0.x; + int minor1 = step_x ? xy1.y : xy1.x; + int major_i = step_x ? xy_i.x : xy_i.y; + int minor_i = step_x ? xy_i.y : xy_i.x; + int d_major = step_x ? 
dxy.x : dxy.y; + int d_major_scaled = 16 * d_major; + + int major0_i = step_x ? xy0_i.x : xy0_i.y; + int major1_i = step_x ? xy1_i.x : xy1_i.y; + + // Discard if outside line range + if (major_i < min(major0_i, major1_i) || + major_i > max(major0_i, major1_i)) + discard; + + if ((major_i == major0_i && !draw0) || + (major_i == major1_i && !draw1)) + discard; + + int weight0 = major1 - major_i; + int weight1 = major_i - major0; + + // Compute minor axis line in fixed-point + int minor_line = weight1 * minor1 + weight0 * minor0; + +#if PS_ACCURATE_PRIMS_AA + // Proper fixed-point AA rounding + int minor_i_expected_0 = (minor_line / d_major) & ~0xF; + int minor_i_expected_1 = minor_i_expected_0 + 16; + int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0); + int alpha_i_1 = d_major_scaled - alpha_i_0; + + int alpha_i; + if (minor_i == minor_i_expected_0) + alpha_i = alpha_i_0; + else if (minor_i == minor_i_expected_1) + alpha_i = alpha_i_1; + else + discard; + // Make sure that the output alpha is always <= 127 for AA. 
+ alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f)); +#else + // Non-AA: fixed-point rounding and 4-bit alignment + int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF; + if (minor_i != minor_i_expected) + discard; + alpha_coverage = 128.0f; +#endif + + // Interpolate attributes + InterpolateAttributesManual(data, weight0, weight1); +} +#endif + +#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES +void HandleAccurateTrianglesEdge(out float alpha_coverage) +{ + AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index]; + + ivec2 xy0 = data.xy0; + ivec2 xy1 = data.xy1; + ivec2 dxy = xy1 - xy0; + ivec2 xy0_i = (xy0 + 8) & ~0xF; + ivec2 xy1_i = (xy1 + 8) & ~0xF; + bool step_x = bool(data.step_x); + bool side = bool(data.side); + bool top_left = bool(data.top_left); + + // 4-bit fixed point: 16 subpixels per pixel + ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // Subtract half-integer pixel center. + + // Determine major/minor axes + int major0 = step_x ? xy0.x : xy0.y; + int major1 = step_x ? xy1.x : xy1.y; + int minor0 = step_x ? xy0.y : xy0.x; + int minor1 = step_x ? xy1.y : xy1.x; + int major_i = step_x ? xy_i.x : xy_i.y; + int minor_i = step_x ? xy_i.y : xy_i.x; + int d_major = step_x ? dxy.x : dxy.y; + int d_major_scaled = 16 * d_major; + + int major0_i = step_x ? xy0_i.x : xy0_i.y; + int major1_i = step_x ? xy1_i.x : xy1_i.y; + + // Discard if outside edge range. + // Note: this is not exactly what the SW rasterizer does. + // See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking. 
+ if (major_i < min(major0_i, major1_i) || + major_i > max(major0_i, major1_i)) + discard; + + // Discard if on wrong side of other edges + if (dot(data.edge0, ivec4(xy_i, 1, 0)) <= 0 || + dot(data.edge1, ivec4(xy_i, 1, 0)) <= 0) + discard; + + int weight0 = major1 - major_i; + int weight1 = major_i - major0; + + // Compute minor axis line in fixed-point + int minor_line = weight1 * minor1 + weight0 * minor0; + int minor_i_expected = minor_line / d_major; + int minor_i_expected_0 = minor_i_expected & ~0xF; + int minor_i_expected_1 = minor_i_expected_0 + 16; + int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0); + int alpha_i_1 = d_major_scaled - alpha_i_0; + + // Proper fixed-point AA rounding + int alpha_i; + if ((minor_i_expected & 0xF) == 0) + { + // On a pixel center + alpha_i = top_left ? 0 : d_major_scaled; + minor_i_expected += top_left ? (side ? -16 : 16) : 0; + } + else if (side) + { + minor_i_expected = minor_i_expected_0; + alpha_i = alpha_i_0; + } + else + { + minor_i_expected = minor_i_expected_1; + alpha_i = alpha_i_1; + } + if (minor_i != minor_i_expected) + discard; + +#if PS_ACCURATE_PRIMS_AA + // Make sure that the output alpha is always <= 127 for AA. 
+ alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f)); +#else + alpha_coverage = 128.0f; +#endif + + // Interpolate attributes + InterpolateAttributesManual(data, weight0, weight1); +} +#endif + void main() { + FragCoord = gl_FragCoord; + +#if PS_ACCURATE_PRIMS + float alpha_coverage; +#if PS_ACCURATE_PRIMS == ACCURATE_LINES + HandleAccurateLines(alpha_coverage); +#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + if (bool(accurate_triangles_interior)) + { + alpha_coverage = 128.0f; + vsIn.t = vsInReal.t; + vsIn.ti = vsInReal.ti; + vsIn.c = vsInReal.c; + } + else + { + HandleAccurateTrianglesEdge(alpha_coverage); + } +#endif +#endif // PS_ACCURATE_PRIMS + +#if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH + float current_depth = sample_from_depth().r; +#endif + +#if PS_ZCLAMP && (PS_ZTST == ZTST_GEQUAL || PS_ZTST == ZTST_GREATER) + #if PS_ZTST == ZTST_GEQUAL + if (FragCoord.z < current_depth) + discard; + #elif PS_ZTST == ZTST_GREATER + if (FragCoord.z <= current_depth) + discard; + #endif +#endif // PS_ZTST + #if PS_SCANMSK & 2 // fail depth test on prohibited lines - if ((int(gl_FragCoord.y) & 1) == (PS_SCANMSK & 1)) + if ((int(FragCoord.y) & 1) == (PS_SCANMSK & 1)) discard; #endif #if PS_DATE >= 5 @@ -1267,7 +1604,7 @@ void main() #endif // PS_DATE >= 5 #if PS_DATE == 3 - int stencil_ceil = int(texelFetch(PrimMinTexture, ivec2(gl_FragCoord.xy), 0).r); + int stencil_ceil = int(texelFetch(PrimMinTexture, ivec2(FragCoord.xy), 0).r); // Note gl_PrimitiveID == stencil_ceil will be the primitive that will update // the bad alpha value so we must keep it. @@ -1277,6 +1614,20 @@ void main() #endif vec4 C = ps_color(); + +#if PS_FIXED_ONE_A + // AA (Fixed one) will output a coverage of 1.0 as alpha + C.a = 128.0f; +#elif PS_ACCURATE_PRIMS_AA + // AA: coverage is computed in alpha_coverage + #if PS_ACCURATE_PRIMS_AA_ABE + if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128. 
+ C.a = alpha_coverage; + #else + C.a = alpha_coverage; + #endif +#endif + bool atst_pass = atst(C); #if PS_AFAIL == 0 // KEEP or ATST off @@ -1284,13 +1635,6 @@ void main() discard; #endif - // Must be done before alpha correction - - // AA (Fixed one) will output a coverage of 1.0 as alpha -#if PS_FIXED_ONE_A - C.a = 128.0f; -#endif - #if SW_AD_TO_HW #if PS_RTA_CORRECTION vec4 RT = trunc(sample_from_rt() * 128.0f + 0.1f); @@ -1327,7 +1671,7 @@ void main() #else ps_blend(C, alpha_blend); -#if PS_SHUFFLE + #if PS_SHUFFLE #if !PS_READ16_SRC && !PS_SHUFFLE_SAME && !(PS_PROCESS_BA == SHUFFLE_READWRITE && PS_PROCESS_RG == SHUFFLE_READWRITE) uvec4 denorm_c_after = uvec4(C); #if (PS_PROCESS_BA & SHUFFLE_READ) @@ -1401,9 +1745,15 @@ void main() #endif #if PS_ZCLAMP - gl_FragDepth = min(gl_FragCoord.z, MaxDepthPS); + #if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES + if (bool(accurate_triangles_interior)) + gl_FragDepth = min(FragCoord.z, MaxDepthPS); + else + gl_FragDepth = current_depth; // No depth update for triangle edges. 
+ #else + gl_FragDepth = min(FragCoord.z, MaxDepthPS); + #endif #endif - #endif // PS_DATE } diff --git a/pcsx2/Config.h b/pcsx2/Config.h index 220101e6f6..6ee8b36d2f 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -756,6 +756,7 @@ struct Pcsx2Config PreloadFrameWithGSData : 1, Mipmap : 1, HWMipmap : 1, + HWAccuratePrims: 1, ManualUserHacks : 1, UserHacks_AlignSpriteX : 1, UserHacks_CPUFBConversion : 1, diff --git a/pcsx2/GS/GSState.cpp b/pcsx2/GS/GSState.cpp index 63bd069648..935d4bb9af 100644 --- a/pcsx2/GS/GSState.cpp +++ b/pcsx2/GS/GSState.cpp @@ -431,6 +431,10 @@ const char* GSState::GetFlushReasonString(GSFlushReason reason) return "VSYNC"; case GSFlushReason::GSREOPEN: return "GS REOPEN"; + case GSFlushReason::VERTEXCOUNT: + return "VERTEX COUNT"; + case GSFlushReason::VERTEXCOUNTEXPANDED: + return "VERTEX COUNT EXPANDED"; case GSFlushReason::UNKNOWN: default: return "UNKNOWN"; @@ -3265,6 +3269,20 @@ void GSState::UpdateVertexKick() m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim]; m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = m_fpGIFPackedRegHandlerSTQRGBAXYZ2[prim]; + + if (UsingAccuratePrims()) + { + if (GSUtil::GetPrimClass(prim) == GS_LINE_CLASS) + m_vertex_expansion_factor = 3; + else if (GSUtil::GetPrimClass(prim) == GS_TRIANGLE_CLASS) + m_vertex_expansion_factor = 7; + else + pxFail("Wrong primitive class."); // Impossible. 
+ } + else + { + m_vertex_expansion_factor = 1; + } } void GSState::GrowVertexBuffer() @@ -4632,6 +4650,12 @@ __forceinline void GSState::VertexKick(u32 skip) constexpr u32 max_vertices = MaxVerticesForPrim(prim); if (max_vertices != 0 && m_vertex.tail >= max_vertices) Flush(VERTEXCOUNT); + + if (m_vertex_expansion_factor != 1) + { + if (max_vertices != 0 && (m_vertex_expansion_factor * m_index.tail) >= max_vertices) + Flush(VERTEXCOUNTEXPANDED); + } } /// Checks if region repeat is used (applying it does something to at least one of the values in min...max) @@ -4968,12 +4992,15 @@ void GSState::CalcAlphaMinMax(const int tex_alpha_min, const int tex_alpha_max) // Limit max to 255 as we send 500 when we don't know, makes calculating 24/16bit easier. int min = tex_alpha_min, max = std::min(tex_alpha_max, 255); - if (IsCoverageAlpha()) + if (IsCoverageAlphaFixedOne()) { - // HW renderer doesn't currently support AA, so its min is 128. - // If we add AA support to the HW renderer, this will need to be changed. - // (Will probably only be supported with ROV/FBFetch so we would want to check for that.) - min = GSIsHardwareRenderer() ? 128 : 0; + // HW renderer doesn't support AA1, assume alpha is constant 128. 
+ min = 128; + max = 128; + } + else if (IsCoverageAlphaSupported()) + { + min = 0; max = 128; } else @@ -5268,7 +5295,24 @@ bool GSState::IsMipMapActive() bool GSState::IsCoverageAlpha() { - return !PRIM->ABE && PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS); + return PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS); +} + +bool GSState::IsCoverageAlphaFixedOne() +{ + return IsCoverageAlpha() && !PRIM->ABE && !IsCoverageAlphaSupported(); +} + +bool GSState::IsCoverageAlphaSupported() +{ + return false; +} + +bool GSState::UsingAccuratePrims() +{ + return g_gs_device->Features().accurate_prims && + (GSUtil::GetPrimClass(PRIM->PRIM) == GS_LINE_CLASS || + (GSUtil::GetPrimClass(PRIM->PRIM) == GS_TRIANGLE_CLASS && PRIM->AA1)); } GIFRegTEX0 GSState::GetTex0Layer(u32 lod) diff --git a/pcsx2/GS/GSState.h b/pcsx2/GS/GSState.h index 8f30d12249..2a74363c17 100644 --- a/pcsx2/GS/GSState.h +++ b/pcsx2/GS/GSState.h @@ -165,6 +165,8 @@ protected: u32 tail; } m_draw_index = {}; + int m_vertex_expansion_factor = 1; + void UpdateContext(); void UpdateScissor(); @@ -206,6 +208,9 @@ protected: bool IsMipMapDraw(); bool IsMipMapActive(); bool IsCoverageAlpha(); + bool IsCoverageAlphaFixedOne(); + virtual bool IsCoverageAlphaSupported(); + bool UsingAccuratePrims(); void CalcAlphaMinMax(const int tex_min, const int tex_max); void CorrectATEAlphaMinMax(const u32 atst, const int aref); @@ -312,6 +317,7 @@ public: VSYNC = 1 << 13, GSREOPEN = 1 << 14, VERTEXCOUNT = 1 << 15, + VERTEXCOUNTEXPANDED = 1 << 16, }; GSFlushReason m_state_flush_reason = UNKNOWN; diff --git a/pcsx2/GS/GSVector.h b/pcsx2/GS/GSVector.h index 924a039adb..cd09f6ac83 100644 --- a/pcsx2/GS/GSVector.h +++ b/pcsx2/GS/GSVector.h @@ -57,6 +57,16 @@ public: return (std::memcmp(this, &v, sizeof(*this)) != 0); } + constexpr GSVector2T operator+(const GSVector2T& v) const + { + return {x + v.x, y + v.y}; + } + + constexpr GSVector2T 
operator-(const GSVector2T& v) const + { + return {x - v.x, y - v.y}; + } + constexpr GSVector2T operator*(const GSVector2T& v) const { return { x * v.x, y * v.y }; @@ -81,6 +91,11 @@ public: typedef GSVector2T GSVector2; typedef GSVector2T GSVector2i; +constexpr GSVector2i operator&(const GSVector2i& v0, const GSVector2i& v1) +{ + return {v0.x & v1.x, v0.y & v1.y}; +} + class GSVector4; class GSVector4i; diff --git a/pcsx2/GS/Renderers/Common/GSDevice.h b/pcsx2/GS/Renderers/Common/GSDevice.h index 0ca09becd6..b4562c246f 100644 --- a/pcsx2/GS/Renderers/Common/GSDevice.h +++ b/pcsx2/GS/Renderers/Common/GSDevice.h @@ -290,6 +290,41 @@ struct HWBlend BlendFactor src, dst; }; +struct alignas(16) AccuratePrimsEdgeData +{ + // Interpolated attributes + GSVector4 t_float0; // 0 + GSVector4 t_float1; // 16 + GSVector4 t_int0; // 32 + GSVector4 t_int1; // 48 + GSVector4 c0; // 64 + GSVector4 c1; // 80 + GSVector4 p0; // 96 + GSVector4 p1; // 112 + GSVector4i edge0; // 128 + GSVector4i edge1; // 144 + GSVector2i xy0; // 160 + GSVector2i xy1; // 168 + u32 step_x; // 176 + u32 draw0; // 180 + u32 draw1; // 184 + u32 top_left; // 188 + u32 side; // 192 + u32 _pad0; // 196 + u32 _pad1; // 200 + u32 _pad2; // 204 + // Total 208 +}; + +static_assert(sizeof(AccuratePrimsEdgeData) == 208); + +enum +{ + ACCURATE_PRIMS_DISABLE = 0, + ACCURATE_PRIMS_LINE = 1, + ACCURATE_PRIMS_TRIANGLE = 2 +}; + struct alignas(16) GSHWDrawConfig { enum class Topology: u8 @@ -317,7 +352,7 @@ struct alignas(16) GSHWDrawConfig u8 iip : 1; u8 point_size : 1; ///< Set when points need to be expanded without VS expanding. VSExpand expand : 2; - u8 _free : 2; + u8 accurate_prims : 2; // 0 - disables; 1 - lines; 2 - triangles. 
}; u8 key; }; @@ -355,6 +390,7 @@ struct alignas(16) GSHWDrawConfig u32 date : 3; u32 atst : 3; u32 afail : 2; + u32 ztst : 2; // Color sampling u32 fst : 1; // Investigate to do it on the VS u32 tfx : 3; @@ -415,6 +451,11 @@ struct alignas(16) GSHWDrawConfig // Scan mask u32 scanmsk : 2; + + // Accurate prims (lines/triangles) + u32 accurate_prims : 2; // 0 - disabled; 1 - lines; 2 - triangles + u32 accurate_prims_aa : 1; + u32 accurate_prims_aa_abe : 1; }; struct @@ -436,6 +477,13 @@ struct alignas(16) GSHWDrawConfig return channel_fb || tex_is_fb || fbmask || (date > 0 && date != 3) || sw_blend_needs_rt; } + __fi bool IsFeedbackLoopDepth() const + { + // Note: Manual depth testing/interpolation for accurate prims is bundled with zclamp to reduce pipeline combinations. + // The zclamp is used to indicate that either Z write or Z testing is enabled. + return (accurate_prims == ACCURATE_PRIMS_TRIANGLE) && accurate_prims_aa && zclamp; + } + /// Disables color output from the pixel shader, this is done when all channels are masked. __fi void DisableColorOutput() { @@ -580,6 +628,7 @@ struct alignas(16) GSHWDrawConfig GSVector2 texture_offset; GSVector2 point_size; GSVector2i max_depth; + GSVector2i base_vertex; __fi VSConstantBuffer() { memset(static_cast(this), 0, sizeof(*this)); @@ -629,6 +678,8 @@ struct alignas(16) GSHWDrawConfig GSVector4 ScaleFactor; + GSVector4i accurate_prims_base_index; + __fi PSConstantBuffer() { memset(static_cast(this), 0, sizeof(*this)); @@ -746,6 +797,9 @@ struct alignas(16) GSHWDrawConfig SetDATM datm : 2; bool line_expand : 1; + bool accurate_prims; + std::vector* accurate_prims_edge_data; + struct AlphaPass { alignas(8) PSSelector ps; @@ -843,6 +897,7 @@ public: bool stencil_buffer : 1; ///< Supports stencil buffer, and can use for DATE. bool cas_sharpening : 1; ///< Supports sufficient functionality for contrast adaptive sharpening.
bool test_and_sample_depth: 1; ///< Supports concurrently binding the depth-stencil buffer for sampling and depth testing. + bool accurate_prims : 1; ///< Supports AA1 triangles/lines and accurate lines shaders. FeatureSupport() { memset(this, 0, sizeof(*this)); diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp index 796d4e17e3..d95e22a249 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp @@ -14,6 +14,7 @@ #include "common/Error.h" #include "common/Path.h" #include "common/StringUtil.h" +#include "common/ScopedGuard.h" #include "imgui.h" #include "IconsFontAwesome6.h" @@ -395,6 +396,32 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) } } + if (m_features.accurate_prims) + { + bd.ByteWidth = ACCURATE_PRIMS_BUFFER_SIZE; + bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; + bd.StructureByteStride = sizeof(AccuratePrimsEdgeData); + bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; + + if (FAILED(m_dev->CreateBuffer(&bd, nullptr, m_accurate_prims_b.put()))) + { + Console.Error("D3D11: Failed to create accurate prims buffer."); + return false; + } + + const CD3D11_SHADER_RESOURCE_VIEW_DESC accurate_prims_b_srv_desc( + D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0, ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData)); + if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc, m_accurate_prims_b_srv.put()))) + { + Console.Error("D3D11: Failed to create accurate prims buffer SRV."); + return false; + } + + // If MAX_TEXTURES changes, please change the register for this buffer in the shader. 
+ static_assert(MAX_TEXTURES == 5); + m_ctx->PSSetShaderResources(MAX_TEXTURES, 1, m_accurate_prims_b_srv.addressof()); + } + // rasterizer memset(&rd, 0, sizeof(rd)); @@ -541,6 +568,8 @@ void GSDevice11::Destroy() m_expand_vb_srv.reset(); m_expand_vb.reset(); m_expand_ib.reset(); + m_accurate_prims_b.reset(); + m_accurate_prims_b_srv.reset(); m_vs.clear(); m_vs_cb.reset(); @@ -599,6 +628,8 @@ void GSDevice11::SetFeatures(IDXGIAdapter1* adapter) m_max_texture_size = (m_feature_level >= D3D_FEATURE_LEVEL_11_0) ? D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION : D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION; + + m_features.accurate_prims = GSConfig.HWAccuratePrims; } bool GSDevice11::HasSurface() const @@ -1665,6 +1696,7 @@ void GSDevice11::SetupVS(VSSelector sel, const GSHWDrawConfig::VSConstantBuffer* sm.AddMacro("VS_FST", sel.fst); sm.AddMacro("VS_IIP", sel.iip); sm.AddMacro("VS_EXPAND", static_cast(sel.expand)); + sm.AddMacro("VS_ACCURATE_PRIMS", static_cast(sel.accurate_prims)); static constexpr const D3D11_INPUT_ELEMENT_DESC layout[] = { @@ -1766,6 +1798,10 @@ void GSDevice11::SetupPS(const PSSelector& sel, const GSHWDrawConfig::PSConstant sm.AddMacro("PS_TEX_IS_FB", sel.tex_is_fb); sm.AddMacro("PS_NO_COLOR", sel.no_color); sm.AddMacro("PS_NO_COLOR1", sel.no_color1); + sm.AddMacro("PS_ACCURATE_PRIMS", sel.accurate_prims); + sm.AddMacro("PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa); + sm.AddMacro("PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe); + sm.AddMacro("PS_ZTST", sel.ztst); wil::com_ptr_nothrow ps = m_shader_cache.GetPixelShader(m_dev.get(), m_tfx_source, sm.GetPtr(), "ps_main"); i = m_ps.try_emplace(sel, std::move(ps)).first; @@ -2280,6 +2316,43 @@ bool GSDevice11::IASetExpandVertexBuffer(const void* vertex, u32 stride, u32 cou return true; } +bool GSDevice11::SetupAccuratePrims(GSHWDrawConfig& config) +{ + if (config.accurate_prims) + { + const u32 count = config.accurate_prims_edge_data->size(); + const u32 size = count * sizeof(AccuratePrimsEdgeData); + + if (size 
> ACCURATE_PRIMS_BUFFER_SIZE) + return false; + + D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE; + + pxAssert(m_accurate_prims_b_pos % sizeof(AccuratePrimsEdgeData) == 0); + + if (m_accurate_prims_b_pos + size > ACCURATE_PRIMS_BUFFER_SIZE) + { + m_accurate_prims_b_pos = 0; + type = D3D11_MAP_WRITE_DISCARD; + } + + D3D11_MAPPED_SUBRESOURCE m; + if (FAILED(m_ctx->Map(m_accurate_prims_b.get(), 0, type, 0, &m))) + return false; + + void* map = static_cast(m.pData) + m_accurate_prims_b_pos; + + GSVector4i::storent(map, config.accurate_prims_edge_data->data(), size); + + m_ctx->Unmap(m_accurate_prims_b.get(), 0); + + config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_b_pos / sizeof(AccuratePrimsEdgeData); + + m_accurate_prims_b_pos += size; + } + return true; +} + u16* GSDevice11::IAMapIndexBuffer(u32 count) { if (count > (INDEX_BUFFER_SIZE / sizeof(u16))) @@ -2583,6 +2656,18 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) { const GSVector2i rtsize = (config.rt ? config.rt : config.ds)->GetSize(); GSTexture* colclip_rt = g_gs_device->GetColorClipTexture(); + GSTexture* draw_rt_clone = nullptr; + GSTexture* draw_ds_clone = nullptr; + GSTexture* primid_texture = nullptr; + + ScopedGuard recycle_temp_textures([&]() { + if (draw_rt_clone) + Recycle(draw_rt_clone); + if (draw_ds_clone) + Recycle(draw_ds_clone); + if (primid_texture) + Recycle(primid_texture); + }); if (colclip_rt) { @@ -2627,7 +2712,6 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) // Destination Alpha Setup const bool multidraw_fb_copy = m_features.multidraw_fb_copy && (config.require_one_barrier || config.require_full_barrier); - GSTexture* primid_texture = nullptr; if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking) { primid_texture = CreateRenderTarget(rtsize.x, rtsize.y, GSTexture::Format::PrimID, false); @@ -2652,7 +2736,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) return; } - config.cb_vs.max_depth.y = m_vertex.start; + 
config.cb_vs.base_vertex = m_vertex.start; } else { @@ -2663,6 +2747,12 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) } } + if (!SetupAccuratePrims(config)) + { + Console.Error("D3D11: Failed to setup accurate prims"); + return; + } + if (config.vs.UseExpandIndexBuffer()) { IASetIndexBuffer(m_expand_ib.get()); @@ -2742,8 +2832,6 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) draw_ds = m_state.cached_dsv; } - GSTexture* draw_rt_clone = nullptr; - if (draw_rt && (config.require_one_barrier || (config.require_full_barrier && m_features.multidraw_fb_copy) || (config.tex && config.tex == config.rt))) { // Requires a copy of the RT. @@ -2754,6 +2842,15 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) Console.Warning("D3D11: Failed to allocate temp texture for RT copy."); } + if (draw_ds && config.require_full_barrier && m_features.multidraw_fb_copy && config.ps.IsFeedbackLoopDepth()) + { + // Requires a copy of the DS. + // Used as "bind ds" flag when texture barrier is unsupported for tex is fb. 
+ draw_ds_clone = CreateTexture(rtsize.x, rtsize.y, 1, draw_ds->GetFormat(), true); + if (!draw_ds_clone) + Console.Warning("D3D11: Failed to allocate temp texture for DS copy."); + } + OMSetRenderTargets(draw_rt, draw_ds, &config.scissor, read_only_dsv); SetupOM(config.depth, OMBlendSelector(config.colormask, config.blend), config.blend.constant); @@ -2761,7 +2858,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::StencilOne && multidraw_fb_copy) m_ctx->ClearDepthStencilView(*static_cast(draw_ds), D3D11_CLEAR_STENCIL, 0.0f, 1); - SendHWDraw(config, draw_rt_clone, draw_rt, config.require_one_barrier, config.require_full_barrier, false); + SendHWDraw(config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds, config.require_one_barrier, config.require_full_barrier, false); if (config.blend_multi_pass.enable) { @@ -2786,15 +2883,10 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) } SetupOM(config.alpha_second_pass.depth, OMBlendSelector(config.alpha_second_pass.colormask, config.blend), config.blend.constant); - SendHWDraw(config, draw_rt_clone, draw_rt, config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true); + SendHWDraw(config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds, + config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true); } - if (draw_rt_clone) - Recycle(draw_rt_clone); - - if (primid_texture) - Recycle(primid_texture); - if (colclip_rt) { config.colclip_update_area = config.colclip_update_area.runion(config.drawarea); @@ -2813,19 +2905,29 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config) } } -void GSDevice11::SendHWDraw(const GSHWDrawConfig& config, GSTexture* draw_rt_clone, GSTexture* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier) +void GSDevice11::SendHWDraw(const GSHWDrawConfig& config, + GSTexture* draw_rt_clone, GSTexture* draw_rt, GSTexture*
draw_ds_clone, GSTexture* draw_ds, + const bool one_barrier, const bool full_barrier, const bool skip_first_barrier) { - if (draw_rt_clone) + if (draw_rt_clone || draw_ds_clone) { #ifdef PCSX2_DEVBUILD - if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]] + if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]] Console.Warning("D3D11: Possible unnecessary copy detected."); #endif auto CopyAndBind = [&](GSVector4i drawarea) { - CopyRect(draw_rt, draw_rt_clone, drawarea, drawarea.left, drawarea.top); + if (draw_rt_clone) + CopyRect(draw_rt, draw_rt_clone, drawarea, drawarea.left, drawarea.top); + if (draw_ds_clone) + CopyRect(draw_ds, draw_ds_clone, drawarea, drawarea.left, drawarea.top); if (one_barrier || full_barrier) - PSSetShaderResource(2, draw_rt_clone); + { + if (draw_rt_clone) + PSSetShaderResource(2, draw_rt_clone); + if (draw_ds_clone) + PSSetShaderResource(4, draw_ds_clone); + } if (config.tex && config.tex == config.rt) PSSetShaderResource(0, draw_rt_clone); }; diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.h b/pcsx2/GS/Renderers/DX11/GSDevice11.h index 761646e52b..b790e3eed9 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.h +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.h @@ -83,10 +83,14 @@ public: private: enum : u32 { - MAX_TEXTURES = 4, + MAX_TEXTURES = 5, MAX_SAMPLERS = 1, VERTEX_BUFFER_SIZE = 32 * 1024 * 1024, INDEX_BUFFER_SIZE = 16 * 1024 * 1024, + + // Structured buffer size must be multiple of element size. 
+ ACCURATE_PRIMS_BUFFER_SIZE = (32 * 1024 * 1024 / sizeof(AccuratePrimsEdgeData)) * sizeof(AccuratePrimsEdgeData), + NUM_TIMESTAMP_QUERIES = 5, }; @@ -126,11 +130,14 @@ private: wil::com_ptr_nothrow m_expand_vb; wil::com_ptr_nothrow m_expand_ib; wil::com_ptr_nothrow m_expand_vb_srv; + wil::com_ptr_nothrow m_accurate_prims_b; + wil::com_ptr_nothrow m_accurate_prims_b_srv; D3D_FEATURE_LEVEL m_feature_level = D3D_FEATURE_LEVEL_10_0; u32 m_vb_pos = 0; // bytes u32 m_ib_pos = 0; // indices/sizeof(u32) u32 m_structured_vb_pos = 0; // bytes + u32 m_accurate_prims_b_pos = 0; // bytes (always a multiple of sizeof(AccuratePrimsEdgeData)) bool m_allow_tearing_supported = false; bool m_using_flip_model_swap_chain = true; @@ -317,6 +324,7 @@ public: void IAUnmapVertexBuffer(u32 stride, u32 count); bool IASetVertexBuffer(const void* vertex, u32 stride, u32 count); bool IASetExpandVertexBuffer(const void* vertex, u32 stride, u32 count); + bool SetupAccuratePrims(GSHWDrawConfig& config); u16* IAMapIndexBuffer(u32 count); void IAUnmapIndexBuffer(u32 count); @@ -345,7 +353,9 @@ public: void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, u8 afix); void RenderHW(GSHWDrawConfig& config) override; - void SendHWDraw(const GSHWDrawConfig& config, GSTexture* draw_rt_clone, GSTexture* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier); + void SendHWDraw(const GSHWDrawConfig& config, + GSTexture* draw_rt_clone, GSTexture* draw_rt, GSTexture* draw_ds_clone, GSTexture* draw_ds, + const bool one_barrier, const bool full_barrier, const bool skip_first_barrier); void ClearSamplerCache() override; diff --git a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp index 7c5f1f490d..4f6c991f91 100644 --- a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp +++ b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp @@ -1250,6 +1250,8 @@ bool GSDevice12::CheckFeatures(const u32& vendor_id) DXGI_FEATURE_PRESENT_ALLOW_TEARING, &allow_tearing_supported,
sizeof(allow_tearing_supported)); m_allow_tearing_supported = (SUCCEEDED(hr) && allow_tearing_supported == TRUE); + m_features.accurate_prims = GSConfig.HWAccuratePrims; + return true; } @@ -2178,6 +2180,33 @@ void GSDevice12::IASetIndexBuffer(const void* index, size_t count) m_index_stream_buffer.CommitMemory(size); } +void GSDevice12::SetupAccuratePrims(GSHWDrawConfig& config) +{ + if (config.accurate_prims) + { + m_dirty_flags |= DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING; + + const u32 count = config.accurate_prims_edge_data->size(); + const u32 size = count * sizeof(AccuratePrimsEdgeData); + + if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData))) + { + ExecuteCommandListAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer"); + if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData))) + pxFailRel("Failed to reserve space for accurate prims"); + } + + config.cb_vs.base_vertex = m_vertex.start; + config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer.GetCurrentOffset() / sizeof(AccuratePrimsEdgeData); + + SetVSConstantBuffer(config.cb_vs); + SetPSConstantBuffer(config.cb_ps); + + std::memcpy(m_accurate_prims_stream_buffer.GetCurrentHostPointer(), config.accurate_prims_edge_data->data(), size); + m_accurate_prims_stream_buffer.CommitMemory(size); + } +} + void GSDevice12::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor) { GSTexture12* vkRt = static_cast(rt); @@ -2305,9 +2334,9 @@ bool GSDevice12::GetTextureGroupDescriptors( } D3D12_CPU_DESCRIPTOR_HANDLE dst_handle = *gpu_handle; - D3D12_CPU_DESCRIPTOR_HANDLE src_handles[NUM_TFX_TEXTURES]; - UINT src_sizes[NUM_TFX_TEXTURES]; - pxAssert(count <= NUM_TFX_TEXTURES); + D3D12_CPU_DESCRIPTOR_HANDLE src_handles[NUM_TOTAL_TFX_TEXTURES]; + UINT src_sizes[NUM_TOTAL_TFX_TEXTURES]; + pxAssert(count <= NUM_TOTAL_TFX_TEXTURES); for (u32 i = 0; i < count; i++) { src_handles[i] = cpu_handles[i]; @@ 
-2365,6 +2394,29 @@ bool GSDevice12::CreateBuffers() return false; } + if (!m_accurate_prims_stream_buffer.Create(ACCURATE_PRIMS_BUFFER_SIZE)) + { + Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer"); + return false; + } + + if (!m_descriptor_heap_manager.Allocate(&m_accurate_prims_srv_descriptor_cpu)) + { + Console.Error("Failed to allocate accurate prims CPU descriptor"); + return false; + } + + // Create the shader resource view for the accurate prims buffer. + { + D3D12_SHADER_RESOURCE_VIEW_DESC desc = { + DXGI_FORMAT_UNKNOWN, D3D12_SRV_DIMENSION_BUFFER, D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING}; + desc.Buffer.FirstElement = 0; + desc.Buffer.NumElements = ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData); + desc.Buffer.StructureByteStride = sizeof(AccuratePrimsEdgeData); + m_device->CreateShaderResourceView(m_accurate_prims_stream_buffer.GetBuffer(), &desc, + m_accurate_prims_srv_descriptor_cpu.cpu_handle); + } + if (!m_vertex_constant_buffer.Create(VERTEX_UNIFORM_BUFFER_SIZE)) { Host::ReportErrorAsync("GS", "Failed to allocate vertex uniform buffer"); @@ -2415,9 +2467,11 @@ bool GSDevice12::CreateRootSignatures() rsb.AddCBVParameter(0, D3D12_SHADER_VISIBILITY_ALL); rsb.AddCBVParameter(1, D3D12_SHADER_VISIBILITY_PIXEL); rsb.AddSRVParameter(0, D3D12_SHADER_VISIBILITY_VERTEX); - rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 0, 2, D3D12_SHADER_VISIBILITY_PIXEL); + rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 0, 2, D3D12_SHADER_VISIBILITY_PIXEL); // Source / Palette rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER, 0, NUM_TFX_SAMPLERS, D3D12_SHADER_VISIBILITY_PIXEL); - rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 2, D3D12_SHADER_VISIBILITY_PIXEL); + rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 2, D3D12_SHADER_VISIBILITY_PIXEL); // RT / PrimID + rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 4, 1, D3D12_SHADER_VISIBILITY_PIXEL); // Depth + 
rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 5, 1, D3D12_SHADER_VISIBILITY_PIXEL); // Accurate Prims if (!(m_tfx_root_signature = rsb.Create())) return false; D3D12::SetObjectName(m_tfx_root_signature.get(), "TFX root signature"); @@ -2805,6 +2859,7 @@ void GSDevice12::DestroyResources() m_vertex_constant_buffer.Destroy(false); m_index_stream_buffer.Destroy(false); m_vertex_stream_buffer.Destroy(false); + m_accurate_prims_stream_buffer.Destroy(false); m_utility_root_signature.reset(); m_tfx_root_signature.reset(); @@ -2818,6 +2873,7 @@ void GSDevice12::DestroyResources() m_shader_cache.Close(); m_descriptor_heap_manager.Free(&m_null_srv_descriptor); + m_descriptor_heap_manager.Free(&m_accurate_prims_srv_descriptor_cpu); m_timestamp_query_buffer.reset(); m_timestamp_query_allocation.reset(); m_sampler_heap_manager.Destroy(); @@ -2851,6 +2907,7 @@ const ID3DBlob* GSDevice12::GetTFXVertexShader(GSHWDrawConfig::VSSelector sel) sm.AddMacro("VS_FST", sel.fst); sm.AddMacro("VS_IIP", sel.iip); sm.AddMacro("VS_EXPAND", static_cast(sel.expand)); + sm.AddMacro("VS_ACCURATE_PRIMS", static_cast(sel.accurate_prims)); const char* entry_point = (sel.expand != GSHWDrawConfig::VSExpand::None) ? 
"vs_main_expand" : "vs_main"; ComPtr vs(m_shader_cache.GetVertexShader(m_tfx_source, sm.GetPtr(), entry_point)); @@ -2922,6 +2979,10 @@ const ID3DBlob* GSDevice12::GetTFXPixelShader(const GSHWDrawConfig::PSSelector& sm.AddMacro("PS_TEX_IS_FB", sel.tex_is_fb); sm.AddMacro("PS_NO_COLOR", sel.no_color); sm.AddMacro("PS_NO_COLOR1", sel.no_color1); + sm.AddMacro("PS_ACCURATE_PRIMS", sel.accurate_prims); + sm.AddMacro("PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa); + sm.AddMacro("PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe); + sm.AddMacro("PS_ZTST", sel.ztst); ComPtr ps(m_shader_cache.GetPixelShader(m_tfx_source, sm.GetPtr(), "ps_main")); it = m_tfx_pixel_shaders.emplace(sel, std::move(ps)).first; @@ -3155,6 +3216,7 @@ void GSDevice12::InvalidateCachedState() m_tfx_textures_handle_gpu.Clear(); m_tfx_samplers_handle_gpu.Clear(); m_tfx_rt_textures_handle_gpu.Clear(); + m_tfx_depth_textures_handle_gpu.Clear(); } void GSDevice12::SetVertexBuffer(D3D12_GPU_VIRTUAL_ADDRESS buffer, size_t size, size_t stride) @@ -3236,7 +3298,11 @@ void GSDevice12::PSSetShaderResource(int i, GSTexture* sr, bool check_state) return; m_tfx_textures[i] = handle; - m_dirty_flags |= (i < 2) ? DIRTY_FLAG_TFX_TEXTURES : DIRTY_FLAG_TFX_RT_TEXTURES; + m_dirty_flags |= + (i < 2) ? DIRTY_FLAG_TFX_TEXTURES : + (i < 4) ? DIRTY_FLAG_TFX_RT_TEXTURES : + (i < 5) ? 
DIRTY_FLAG_TFX_DEPTH_TEXTURES : + 0; } void GSDevice12::PSSetSampler(GSHWDrawConfig::SamplerSelector sel) @@ -3639,6 +3705,17 @@ bool GSDevice12::ApplyTFXState(bool already_execed) flags |= DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2; } + if (flags & DIRTY_FLAG_TFX_DEPTH_TEXTURES) + { + if (!GetTextureGroupDescriptors(&m_tfx_depth_textures_handle_gpu, m_tfx_textures.data() + 4, 1)) + { + ExecuteCommandListAndRestartRenderPass(false, "Ran out of TFX depth descriptor descriptor groups"); + return ApplyTFXState(true); + } + + flags |= DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3; + } + ID3D12GraphicsCommandList* cmdlist = GetCommandList(); if (m_current_root_signature != RootSignature::TFX) @@ -3646,7 +3723,8 @@ bool GSDevice12::ApplyTFXState(bool already_execed) m_current_root_signature = RootSignature::TFX; flags |= DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING | DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE | DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE | - DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_PIPELINE; + DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 | + DIRTY_FLAG_PIPELINE; cmdlist->SetGraphicsRootSignature(m_tfx_root_signature.get()); } @@ -3659,12 +3737,28 @@ bool GSDevice12::ApplyTFXState(bool already_execed) cmdlist->SetGraphicsRootShaderResourceView(TFX_ROOT_SIGNATURE_PARAM_VS_SRV, m_vertex_stream_buffer.GetGPUPointer() + m_vertex.start * sizeof(GSVertex)); } + if (flags & DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING) + { + if (!GetDescriptorAllocator().Allocate(1, &m_accurate_prims_srv_descriptor_gpu)) + { + Console.Error("Failed to allocate accurate prims GPU descriptor"); + return false; + } + + m_device.get()->CopyDescriptorsSimple( + 1, m_accurate_prims_srv_descriptor_gpu, m_accurate_prims_srv_descriptor_cpu, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); + + cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_ACCURATE_PRIMS_SRV, m_accurate_prims_srv_descriptor_gpu); + + } if (flags & 
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE) cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_TEXTURES, m_tfx_textures_handle_gpu); if (flags & DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE) cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_SAMPLERS, m_tfx_samplers_handle_gpu); if (flags & DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2) cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_RT_TEXTURES, m_tfx_rt_textures_handle_gpu); + if (flags & DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3) + cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_DEPTH_TEXTURES, m_tfx_depth_textures_handle_gpu); ApplyBaseState(flags, cmdlist); return true; @@ -3829,6 +3923,17 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config) GSTexture12* draw_rt = static_cast(config.rt); GSTexture12* draw_ds = static_cast(config.ds); GSTexture12* draw_rt_clone = nullptr; + GSTexture12* draw_ds_clone = nullptr; + GSTexture12* date_image = nullptr; + + ScopedGuard recycle_temp_textures([&]() { + if (draw_rt_clone) + Recycle(draw_rt_clone); + if (draw_ds_clone) + Recycle(draw_ds_clone); + if (date_image) + Recycle(date_image); + }); // Align the render area to 128x128, hopefully avoiding render pass restarts for small render area changes (e.g. Ratchet and Clank). const GSVector2i rtsize(config.rt ? config.rt->GetSize() : config.ds->GetSize()); @@ -3903,7 +4008,6 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config) } // Primitive ID tracking DATE setup. - GSTexture12* date_image = nullptr; if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking) { GSTexture* backup_rt = config.rt; @@ -3991,6 +4095,15 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config) Console.Warning("D3D12: Failed to allocate temp texture for RT copy."); } + if (draw_ds && config.require_full_barrier && m_features.multidraw_fb_copy && config.ps.IsFeedbackLoopDepth()) + { + // Requires a copy of the DS. 
+ // Used as "bind ds" flag when texture barrier is unsupported for tex is fb. + draw_ds_clone = static_cast(CreateTexture(rtsize.x, rtsize.y, 1, draw_ds->GetFormat(), true)); + if (!draw_ds_clone) + Console.Warning("D3D12: Failed to allocate temp texture for DS copy."); + } + OMSetRenderTargets(draw_rt, draw_ds, config.scissor); // Begin render pass if new target or out of the area. @@ -4036,7 +4149,8 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config) UploadHWDrawVerticesAndIndices(config); // now we can do the actual draw - SendHWDraw(pipe, config, draw_rt_clone, draw_rt, config.require_one_barrier, config.require_full_barrier, false); + SendHWDraw(pipe, config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds, + config.require_one_barrier, config.require_full_barrier, false); // blend second pass if (config.blend_multi_pass.enable) @@ -4065,15 +4179,10 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config) pipe.cms = config.alpha_second_pass.colormask; pipe.dss = config.alpha_second_pass.depth; pipe.bs = config.blend; - SendHWDraw(pipe, config, draw_rt_clone, draw_rt, config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true); + SendHWDraw(pipe, config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds, + config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true); } - if (draw_rt_clone) - Recycle(draw_rt_clone); - - if (date_image) - Recycle(date_image); - // now blit the colclip texture back to the original target if (colclip_rt) { @@ -4108,23 +4217,40 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config) } } -void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config, GSTexture12* draw_rt_clone, GSTexture12* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier) +void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config, + GSTexture12* draw_rt_clone, GSTexture12* draw_rt, + GSTexture12* 
draw_ds_clone, GSTexture12* draw_ds, + const bool one_barrier, const bool full_barrier, const bool skip_first_barrier) { - if (draw_rt_clone) + if (draw_rt_clone || draw_ds_clone) { #ifdef PCSX2_DEVBUILD - if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]] + if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]] Console.Warning("D3D12: Possible unnecessary copy detected."); #endif auto CopyAndBind = [&](GSVector4i drawarea) { EndRenderPass(); - CopyRect(draw_rt, draw_rt_clone, drawarea, drawarea.left, drawarea.top); - draw_rt->TransitionToState(D3D12_RESOURCE_STATE_RENDER_TARGET); + if (draw_rt_clone) + { + CopyRect(draw_rt, draw_rt_clone, drawarea, drawarea.left, drawarea.top); + draw_rt->TransitionToState(D3D12_RESOURCE_STATE_RENDER_TARGET); + } + + if (draw_ds_clone) + { + CopyRect(draw_ds, draw_ds_clone, drawarea, drawarea.left, drawarea.top); + draw_ds->TransitionToState(D3D12_RESOURCE_STATE_DEPTH_WRITE); + } if (one_barrier || full_barrier) - PSSetShaderResource(2, draw_rt_clone, true); + { + if (draw_rt_clone) + PSSetShaderResource(2, draw_rt_clone, true); + if (draw_ds_clone) + PSSetShaderResource(4, draw_ds_clone, true); + } if (config.tex && config.tex == config.rt) PSSetShaderResource(0, draw_rt_clone, true); }; @@ -4153,7 +4279,6 @@ void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& return; } - // Optimization: For alpha second pass we can reuse the copy snapshot from the first pass. 
if (!skip_first_barrier) CopyAndBind(config.drawarea); @@ -4177,7 +4302,7 @@ void GSDevice12::UpdateHWPipelineSelector(GSHWDrawConfig& config) m_pipeline_selector.ds = config.ds != nullptr; } -void GSDevice12::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config) +void GSDevice12::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config) { IASetVertexBuffer(config.verts, sizeof(GSVertex), config.nverts); @@ -4195,4 +4320,6 @@ void GSDevice12::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config) { IASetIndexBuffer(config.indices, config.nindices); } + + SetupAccuratePrims(config); } diff --git a/pcsx2/GS/Renderers/DX12/GSDevice12.h b/pcsx2/GS/Renderers/DX12/GSDevice12.h index bd5af07dad..7b452a9f4f 100644 --- a/pcsx2/GS/Renderers/DX12/GSDevice12.h +++ b/pcsx2/GS/Renderers/DX12/GSDevice12.h @@ -256,7 +256,8 @@ public: NUM_TFX_CONSTANT_BUFFERS = 2, NUM_TFX_TEXTURES = 2, NUM_TFX_RT_TEXTURES = 2, - NUM_TOTAL_TFX_TEXTURES = NUM_TFX_TEXTURES + NUM_TFX_RT_TEXTURES, + NUM_TFX_DEPTH_TEXTURES = 1, + NUM_TOTAL_TFX_TEXTURES = NUM_TFX_TEXTURES + NUM_TFX_RT_TEXTURES + NUM_TFX_DEPTH_TEXTURES, NUM_TFX_SAMPLERS = 1, NUM_UTILITY_TEXTURES = 1, NUM_UTILITY_SAMPLERS = 1, @@ -264,6 +265,10 @@ public: VERTEX_BUFFER_SIZE = 32 * 1024 * 1024, INDEX_BUFFER_SIZE = 16 * 1024 * 1024, + + // Structured buffer size must be multiple of element size. 
+ ACCURATE_PRIMS_BUFFER_SIZE = (32 * 1024 * 1024 / sizeof(AccuratePrimsEdgeData)) * sizeof(AccuratePrimsEdgeData), + VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024, FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024, @@ -273,6 +278,8 @@ public: TFX_ROOT_SIGNATURE_PARAM_PS_TEXTURES = 3, TFX_ROOT_SIGNATURE_PARAM_PS_SAMPLERS = 4, TFX_ROOT_SIGNATURE_PARAM_PS_RT_TEXTURES = 5, + TFX_ROOT_SIGNATURE_PARAM_PS_DEPTH_TEXTURES = 6, + TFX_ROOT_SIGNATURE_PARAM_PS_ACCURATE_PRIMS_SRV = 7, UTILITY_ROOT_SIGNATURE_PARAM_PUSH_CONSTANTS = 0, UTILITY_ROOT_SIGNATURE_PARAM_PS_TEXTURES = 1, @@ -299,6 +306,9 @@ private: D3D12StreamBuffer m_vertex_stream_buffer; D3D12StreamBuffer m_index_stream_buffer; + D3D12StreamBuffer m_accurate_prims_stream_buffer; + D3D12DescriptorHandle m_accurate_prims_srv_descriptor_cpu; + D3D12DescriptorHandle m_accurate_prims_srv_descriptor_gpu; D3D12StreamBuffer m_vertex_constant_buffer; D3D12StreamBuffer m_pixel_constant_buffer; D3D12StreamBuffer m_texture_stream_buffer; @@ -455,6 +465,7 @@ public: void IASetVertexBuffer(const void* vertex, size_t stride, size_t count); void IASetIndexBuffer(const void* index, size_t count); + void SetupAccuratePrims(GSHWDrawConfig& config); void PSSetShaderResource(int i, GSTexture* sr, bool check_state); void PSSetSampler(GSHWDrawConfig::SamplerSelector sel); @@ -466,10 +477,13 @@ public: bool BindDrawPipeline(const PipelineSelector& p); void RenderHW(GSHWDrawConfig& config) override; - void SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config, GSTexture12* draw_rt_clone, GSTexture12* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier); + void SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config, + GSTexture12* draw_rt_clone, GSTexture12* draw_rt, + GSTexture12* draw_ds_clone, GSTexture12* draw_ds, + const bool one_barrier, const bool full_barrier, const bool skip_first_barrier); void UpdateHWPipelineSelector(GSHWDrawConfig& config); - void 
UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config); + void UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config); public: /// Ends any render pass, executes the command buffer, and invalidates cached state. @@ -527,33 +541,37 @@ private: DIRTY_FLAG_TFX_TEXTURES = (1 << 2), DIRTY_FLAG_TFX_SAMPLERS = (1 << 3), DIRTY_FLAG_TFX_RT_TEXTURES = (1 << 4), + DIRTY_FLAG_TFX_DEPTH_TEXTURES = (1 << 5), - DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING = (1 << 5), - DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING = (1 << 6), - DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING = (1 << 7), - DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE = (1 << 8), - DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE = (1 << 9), - DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 = (1 << 10), + DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING = (1 << 6), + DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING = (1 << 7), + DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING = (1 << 8), + DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING = (1 << 9), + DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE = (1 << 10), + DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE = (1 << 11), + DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 = (1 << 12), + DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 = (1 << 13), - DIRTY_FLAG_VERTEX_BUFFER = (1 << 11), - DIRTY_FLAG_INDEX_BUFFER = (1 << 12), - DIRTY_FLAG_PRIMITIVE_TOPOLOGY = (1 << 13), - DIRTY_FLAG_VIEWPORT = (1 << 14), - DIRTY_FLAG_SCISSOR = (1 << 15), - DIRTY_FLAG_RENDER_TARGET = (1 << 16), - DIRTY_FLAG_PIPELINE = (1 << 17), - DIRTY_FLAG_BLEND_CONSTANTS = (1 << 18), - DIRTY_FLAG_STENCIL_REF = (1 << 19), + DIRTY_FLAG_VERTEX_BUFFER = (1 << 14), + DIRTY_FLAG_INDEX_BUFFER = (1 << 15), + DIRTY_FLAG_PRIMITIVE_TOPOLOGY = (1 << 16), + DIRTY_FLAG_VIEWPORT = (1 << 17), + DIRTY_FLAG_SCISSOR = (1 << 18), + DIRTY_FLAG_RENDER_TARGET = (1 << 19), + DIRTY_FLAG_PIPELINE = (1 << 20), + DIRTY_FLAG_BLEND_CONSTANTS = (1 << 21), + DIRTY_FLAG_STENCIL_REF = (1 << 22), DIRTY_BASE_STATE = DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING | DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING | - DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING | 
DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE | - DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | + DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING | DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING | + DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE | DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE | + DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 | DIRTY_FLAG_VERTEX_BUFFER | DIRTY_FLAG_INDEX_BUFFER | DIRTY_FLAG_PRIMITIVE_TOPOLOGY | DIRTY_FLAG_VIEWPORT | DIRTY_FLAG_SCISSOR | DIRTY_FLAG_RENDER_TARGET | DIRTY_FLAG_PIPELINE | DIRTY_FLAG_BLEND_CONSTANTS | DIRTY_FLAG_STENCIL_REF, - DIRTY_TFX_STATE = - DIRTY_BASE_STATE | DIRTY_FLAG_TFX_TEXTURES | DIRTY_FLAG_TFX_SAMPLERS | DIRTY_FLAG_TFX_RT_TEXTURES, + DIRTY_TFX_STATE = DIRTY_BASE_STATE | DIRTY_FLAG_TFX_TEXTURES | DIRTY_FLAG_TFX_SAMPLERS | + DIRTY_FLAG_TFX_RT_TEXTURES | DIRTY_FLAG_TFX_DEPTH_TEXTURES, DIRTY_UTILITY_STATE = DIRTY_BASE_STATE, DIRTY_CONSTANT_BUFFER_STATE = DIRTY_FLAG_VS_CONSTANT_BUFFER | DIRTY_FLAG_PS_CONSTANT_BUFFER, }; @@ -594,6 +612,7 @@ private: D3D12DescriptorHandle m_tfx_textures_handle_gpu; D3D12DescriptorHandle m_tfx_samplers_handle_gpu; D3D12DescriptorHandle m_tfx_rt_textures_handle_gpu; + D3D12DescriptorHandle m_tfx_depth_textures_handle_gpu; D3D12DescriptorHandle m_utility_texture_cpu; D3D12DescriptorHandle m_utility_texture_gpu; diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp index 17c846f71c..bfa223d134 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp @@ -291,6 +291,360 @@ void GSRendererHW::Lines2Sprites() } } +static __forceinline void GetCoveringQuad(const GSVector2i& v0, const GSVector2i& v1, GSVertex* out) +{ + float x0 = static_cast(v0.x) / 16.0f; + float y0 = static_cast(v0.y) / 16.0f; + float x1 = static_cast(v1.x) / 16.0f; + float y1 = static_cast(v1.y) / 16.0f; + + float dx = x1 - x0; + float dy = y1 - y0; + float d_len = sqrtf(dx * dx + dy * dy); + dx = (d_len > 0.0f) ? 2.0f * dx / d_len : 2.0f; // Zero-length line: fall back to a fixed horizontal axis so the quad stays finite (avoids 0/0 NaN). + 
dy = (d_len > 0.0f) ? 2.0f * dy / d_len : 0.0f; + + float nx = -dy; + float ny = dx; + + int dxi = static_cast(16.0f * dx); + int dyi = static_cast(16.0f * dy); + int nxi = static_cast(16.0f * nx); + int nyi = static_cast(16.0f * ny); + + GSVertex v[4]; + std::memset(v, 0, sizeof(v)); + + v[0].XYZ.X = static_cast(std::clamp(v0.x - dxi - nxi, 0, 0xFFFF)); + v[0].XYZ.Y = static_cast(std::clamp(v0.y - dyi - nyi, 0, 0xFFFF)); + + v[1].XYZ.X = static_cast(std::clamp(v0.x - dxi + nxi, 0, 0xFFFF)); + v[1].XYZ.Y = static_cast(std::clamp(v0.y - dyi + nyi, 0, 0xFFFF)); + + v[2].XYZ.X = static_cast(std::clamp(v1.x + dxi - nxi, 0, 0xFFFF)); + v[2].XYZ.Y = static_cast(std::clamp(v1.y + dyi - nyi, 0, 0xFFFF)); + + v[3].XYZ.X = static_cast(std::clamp(v1.x + dxi + nxi, 0, 0xFFFF)); + v[3].XYZ.Y = static_cast(std::clamp(v1.y + dyi + nyi, 0, 0xFFFF)); + + out[0] = v[0]; + out[1] = v[1]; + out[2] = v[2]; + + out[3] = v[1]; + out[4] = v[2]; + out[5] = v[3]; +} + +void GSRendererHW::GetAccuratePrimsEdgeVertexAttributes(const GSVertex& vtx0, const GSVertex& vtx1, const GSVertex* vtx_provoking, AccuratePrimsEdgeData& data) +{ + GSVector2i v0 = { static_cast(vtx0.XYZ.X), static_cast(vtx0.XYZ.Y) }; + GSVector2i v1 = { static_cast(vtx1.XYZ.X), static_cast(vtx1.XYZ.Y) }; + + // Interpolated attributes - mimics transformations done in vertex shader. + GSVector2 uv0 = GSVector2(static_cast(vtx0.U), static_cast(vtx0.V)) - m_conf.cb_vs.texture_offset; + GSVector2 uv1 = GSVector2(static_cast(vtx1.U), static_cast(vtx1.V)) - m_conf.cb_vs.texture_offset; + GSVector2 uv0_scale = uv0 * m_conf.cb_vs.texture_scale; + GSVector2 uv1_scale = uv1 * m_conf.cb_vs.texture_scale; + GSVector2 st0 = GSVector2(vtx0.ST.S, vtx0.ST.T) - m_conf.cb_vs.texture_offset; + GSVector2 st1 = GSVector2(vtx1.ST.S, vtx1.ST.T) - m_conf.cb_vs.texture_offset; + GSVector2 st0_scale = PRIM->TME ? st0 / m_conf.cb_vs.texture_scale : GSVector2(0); + GSVector2 st1_scale = PRIM->TME ? 
st1 / m_conf.cb_vs.texture_scale : GSVector2(0); + + float fog0; + float fog1; + if (vtx_provoking) + { + fog0 = fog1 = static_cast(vtx_provoking->FOG) / 255.0f; + } + else + { + fog0 = static_cast(vtx0.FOG) / 255.0f; + fog1 = static_cast(vtx1.FOG) / 255.0f; + } + + data.t_float0 = GSVector4(st0.x, st0.y, fog0, vtx0.RGBAQ.Q); + data.t_float1 = GSVector4(st1.x, st1.y, fog1, vtx1.RGBAQ.Q); + data.t_int0 = GSVector4(uv0_scale.x, uv0_scale.y); + data.t_int1 = GSVector4(uv1_scale.x, uv1_scale.y); + + if (m_conf.vs.fst) + { + data.t_int0.z = uv0.x; + data.t_int0.w = uv0.y; + data.t_int1.z = uv1.x; + data.t_int1.w = uv1.y; + } + else + { + data.t_int0.z = st0_scale.x; + data.t_int0.w = st0_scale.y; + data.t_int1.z = st1_scale.x; + data.t_int1.w = st1_scale.y; + } + + constexpr float exp_min32 = 0x1p-32f; + float z0 = static_cast(std::min(vtx0.XYZ.Z, static_cast(m_conf.cb_vs.max_depth.x))); + float z1 = static_cast(std::min(vtx1.XYZ.Z, static_cast(m_conf.cb_vs.max_depth.x))); + + GSVector2 xy0 = GSVector2(v0.x, v0.y) - GSVector2(0.05f); + GSVector2 xy1 = GSVector2(v1.x, v1.y) - GSVector2(0.05f); + + xy0 = xy0 * m_conf.cb_vs.vertex_scale - m_conf.cb_vs.vertex_offset; + xy1 = xy1 * m_conf.cb_vs.vertex_scale - m_conf.cb_vs.vertex_offset; + + GSRendererType renderer = GSGetCurrentRenderer(); + float y_sign = (renderer == GSRendererType::DX11 || renderer == GSRendererType::DX12) ? 
-1.0f : 1.0f; + data.p0 = GSVector4(xy0.x, y_sign * xy0.y, z0 * exp_min32, 1.0f); + data.p1 = GSVector4(xy1.x, y_sign * xy1.y, z1 * exp_min32, 1.0f); + + if (vtx_provoking) + { + data.c0 = data.c1 = GSVector4( + static_cast(vtx_provoking->RGBAQ.R), + static_cast(vtx_provoking->RGBAQ.G), + static_cast(vtx_provoking->RGBAQ.B), + static_cast(vtx_provoking->RGBAQ.A)); + } + else + { + data.c0 = GSVector4( + static_cast(vtx0.RGBAQ.R), + static_cast(vtx0.RGBAQ.G), + static_cast(vtx0.RGBAQ.B), + static_cast(vtx0.RGBAQ.A)); + data.c1 = GSVector4( + static_cast(vtx1.RGBAQ.R), + static_cast(vtx1.RGBAQ.G), + static_cast(vtx1.RGBAQ.B), + static_cast(vtx1.RGBAQ.A)); + } +} + +void GSRendererHW::ExpandAccurateTrianglesEdge( + const GSVertex& vtx0, + const GSVertex& vtx1, + const GSVertex* vtx_provoking, + const GSVector4i& edge0, + const GSVector4i& edge1, + bool top_left, + AccuratePrimsEdgeData& data, + GSVertex* vertex_out) +{ + const GSVector2i v0 = { static_cast(vtx0.XYZ.X), static_cast(vtx0.XYZ.Y) }; + const GSVector2i v1 = { static_cast(vtx1.XYZ.X), static_cast(vtx1.XYZ.Y) }; + + const GSVector4i& xyof = m_context->scissor.xyof; + + data.xy0 = GSVector2i(v0.x - xyof.x, v0.y - xyof.y); + data.xy1 = GSVector2i(v1.x - xyof.x, v1.y - xyof.y); + const GSVector2i dxy = data.xy1 - data.xy0; + const bool pos_x = dxy.x >= 0; + const bool pos_y = dxy.y >= 0; + data.edge0 = edge0; + data.edge1 = edge1; + data.step_x = std::abs(dxy.x) >= std::abs(dxy.y); + data.side = top_left != (data.step_x && (dxy.y != 0) && (pos_x == pos_y)); + + GetAccuratePrimsEdgeVertexAttributes(vtx0, vtx1, vtx_provoking, data); + + GetCoveringQuad(v0, v1, vertex_out); +} + +static const u8 s_ysort[8][4] = +{ + {0, 1, 2, 0}, // y0 <= y1 <= y2 + {1, 0, 2, 0}, // y1 < y0 <= y2 + {0, 0, 0, 0}, + {1, 2, 0, 0}, // y1 <= y2 < y0 + {0, 2, 1, 0}, // y0 <= y2 < y1 + {0, 0, 0, 0}, + {2, 0, 1, 0}, // y2 < y0 <= y1 + {2, 1, 0, 0}, // y2 < y1 < y0 +}; + +void GSRendererHW::ExpandAccurateTrianglesVertices() +{ + constexpr 
int verts_per_prim = 21; // 3 verts for triangle interior; 3 x 6 verts for the edges. + const int prims = m_index.tail / 3; + + while (m_vertex.maxcount < static_cast(prims * verts_per_prim)) + GrowVertexBuffer(); + + m_accurate_prims_edge_data.clear(); + m_accurate_prims_edge_data.resize(3 * prims); + + const GSVector4i& xyof = m_context->scissor.xyof; + + const bool flat_shade = !PRIM->IIP; + const int provoking_offset = g_gs_device->Features().provoking_vertex_last ? 2 : 0; + + for (int i = 0; i < prims; i++) + { + // Code from GSRasterizer + const GSVertex& vtx0_orig = m_vertex.buff[m_index.buff[3 * i + 0]]; + const GSVertex& vtx1_orig = m_vertex.buff[m_index.buff[3 * i + 1]]; + const GSVertex& vtx2_orig = m_vertex.buff[m_index.buff[3 * i + 2]]; + + const GSVector2i v0_orig = { static_cast(vtx0_orig.XYZ.X) - xyof.x, static_cast(vtx0_orig.XYZ.Y) - xyof.y }; + const GSVector2i v1_orig = { static_cast(vtx1_orig.XYZ.X) - xyof.x, static_cast(vtx1_orig.XYZ.Y) - xyof.y }; + const GSVector2i v2_orig = { static_cast(vtx2_orig.XYZ.X) - xyof.x, static_cast(vtx2_orig.XYZ.Y) - xyof.y }; + + GSVector4i y0011(v0_orig.y, v0_orig.y, v1_orig.y, v1_orig.y); + GSVector4i y1221(v1_orig.y, v2_orig.y, v2_orig.y, v1_orig.y); + + int m1 = GSVector4::cast(y0011 > y1221).mask() & 7; + + const u8* idx = s_ysort[m1]; + + const GSVertex* vtx[3] = { &vtx0_orig, &vtx1_orig, &vtx2_orig }; + const GSVector2i* v[3] = { &v0_orig, &v1_orig, &v2_orig }; + + const GSVertex& vtx0 = *vtx[idx[0]]; + const GSVertex& vtx1 = *vtx[idx[1]]; + const GSVertex& vtx2 = *vtx[idx[2]]; + const GSVertex* vtx_provoking = flat_shade ? vtx[idx[provoking_offset]] : nullptr; + + const GSVector2i& v0 = *v[idx[0]]; + const GSVector2i& v1 = *v[idx[1]]; + const GSVector2i& v2 = *v[idx[2]]; + + y0011 = GSVector4i(v0.y, v0.y, v1.y, v1.y); + y1221 = GSVector4i(v1.y, v2.y, v2.y, v1.y); + + m1 = GSVector4::cast(y0011 == y1221).mask() & 7; + + if (m1 == 7) + continue; // Degenerate triangle. 
+ + GSVector2i dv0 = v1 - v0; + GSVector2i dv1 = v2 - v0; + GSVector2i dv2 = v2 - v1; + + int cross = dv0.y * dv1.x - dv0.x * dv1.y; + + if (cross == 0) + continue; // Degenerate triangle + + bool clockwise = cross < 0; + + const bool tl0 = (v0.y == v1.y) || !clockwise; + const bool tl1 = clockwise; + const bool tl2 = (v1.y != v2.y) && !clockwise; + + GSVector4i edge0 = GSVector4i( dv0.y, -dv0.x, 0, 0); + GSVector4i edge1 = GSVector4i(-dv1.y, dv1.x, 0, 0); + GSVector4i edge2 = GSVector4i( dv2.y, -dv2.x, 0, 0); + + edge0.z = v1.x * v0.y - v0.x * v1.y; + edge1.z = v0.x * v2.y - v2.x * v0.y; + edge2.z = v2.x * v1.y - v1.x * v2.y; + + if (clockwise) + { + edge0 = GSVector4i(0) - edge0; + edge1 = GSVector4i(0) - edge1; + edge2 = GSVector4i(0) - edge2; + } + + // Bias for top-left edges. + edge0.z += tl0 ? 1 : 0; + edge1.z += tl1 ? 1 : 0; + edge2.z += tl2 ? 1 : 0; + + // Interior triangle + m_vertex.buff_copy[verts_per_prim * i + 0] = vtx0; + m_vertex.buff_copy[verts_per_prim * i + 1] = vtx1; + m_vertex.buff_copy[verts_per_prim * i + 2] = vtx2; + + // Edges + ExpandAccurateTrianglesEdge(vtx0, vtx1, vtx_provoking, edge1, edge2, tl0, m_accurate_prims_edge_data[3 * i + 0], + &m_vertex.buff_copy[verts_per_prim * i + 3]); + ExpandAccurateTrianglesEdge(vtx0, vtx2, vtx_provoking, edge2, edge0, tl1, m_accurate_prims_edge_data[3 * i + 1], + &m_vertex.buff_copy[verts_per_prim * i + 9]); + ExpandAccurateTrianglesEdge(vtx1, vtx2, vtx_provoking, edge0, edge1, tl2, m_accurate_prims_edge_data[3 * i + 2], + &m_vertex.buff_copy[verts_per_prim * i + 15]); + } + + m_index.tail = prims * verts_per_prim; + for (std::size_t i = 0; i < m_index.tail; i++) + { + m_index.buff[i] = i; + } + m_vertex.next = m_vertex.tail = m_vertex.head = m_index.tail; + + std::swap(m_vertex.buff, m_vertex.buff_copy); +} + +void GSRendererHW::ExpandAccurateLinesVertices() +{ + constexpr int verts_per_prim = 6; // 6 verts to form quad covering each line. 
+ const int prims = m_index.tail / 2; + + const bool flat_shade = !PRIM->IIP; + const int provoking_offset = g_gs_device->Features().provoking_vertex_last ? 1 : 0; + + const auto ExitRule = [](const GSVector2i& d, bool step_x, bool pos_step) { + int dist = std::abs(d.x) + std::abs(d.y); + if (dist < 8) + return false; + + if (step_x) + { + bool x_good = pos_step ? (d.x > 0) : (d.x < 0); + return x_good && (dist > 8 || d.y >= 0); + } + else + { + bool y_good = pos_step ? (d.y > 0) : (d.y < 0); + return y_good && (dist > 8 || d.x >= 0); + } + }; + + while (m_vertex.maxcount < static_cast(verts_per_prim * prims)) + GrowVertexBuffer(); + + m_accurate_prims_edge_data.clear(); + m_accurate_prims_edge_data.resize(prims); + + const GSVector4i& xyof = m_context->scissor.xyof; + + for (int i = 0; i < prims; i++) + { + const GSVertex& vtx0 = m_vertex.buff[m_index.buff[2 * i + 0]]; + const GSVertex& vtx1 = m_vertex.buff[m_index.buff[2 * i + 1]]; + const GSVertex* vtx_provoking = flat_shade ? &m_vertex.buff[m_index.buff[2 * i + provoking_offset]] : nullptr; + + const GSVector2i v0 = { static_cast(vtx0.XYZ.X), static_cast(vtx0.XYZ.Y) }; + const GSVector2i v1 = { static_cast(vtx1.XYZ.X), static_cast(vtx1.XYZ.Y) }; + + AccuratePrimsEdgeData& data = m_accurate_prims_edge_data[i]; + + data.xy0 = GSVector2i(v0.x - xyof.x, v0.y - xyof.y); + data.xy1 = GSVector2i(v1.x - xyof.x, v1.y - xyof.y); + const GSVector2i dxy = data.xy1 - data.xy0; + const GSVector2i xy0_i = (data.xy0 + 8) & GSVector2i(~0xF); + const GSVector2i xy1_i = (data.xy1 + 8) & GSVector2i(~0xF); + data.step_x = std::abs(dxy.x) >= std::abs(dxy.y); + bool pos_step = data.step_x ? 
dxy.x >= 0 : dxy.y >= 0; + data.draw0 = !ExitRule(data.xy0 - xy0_i, data.step_x, pos_step); + data.draw1 = ExitRule(data.xy1 - xy1_i, data.step_x, pos_step); + + GetAccuratePrimsEdgeVertexAttributes(vtx0, vtx1, vtx_provoking, data); + + GetCoveringQuad(v0, v1, &m_vertex.buff_copy[i * verts_per_prim]); + } + + m_index.tail = prims * verts_per_prim; + for (std::size_t i = 0; i < m_index.tail; i++) + { + m_index.buff[i] = i; + } + m_vertex.next = m_vertex.tail = m_vertex.head = m_index.tail; + + std::swap(m_vertex.buff, m_vertex.buff_copy); +} + void GSRendererHW::ExpandLineIndices() { const u32 process_count = (m_index.tail + 7) / 8 * 8; @@ -2471,7 +2825,7 @@ void GSRendererHW::Draw() // Need to fix the alpha test, since the alpha will be fixed to 1.0 if ABE is disabled and AA1 is enabled // So if it doesn't meet the condition, always fail, if it does, always pass (turn off the test). - if (IsCoverageAlpha() && m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > 1) + if (IsCoverageAlphaFixedOne() && m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > 1) { const float aref = static_cast(m_cached_ctx.TEST.AREF); const int old_ATST = m_cached_ctx.TEST.ATST; @@ -5018,21 +5372,37 @@ void GSRendererHW::SetupIA(float target_scale, float sx, float sy, bool req_vert case GS_LINE_CLASS: { - m_conf.topology = GSHWDrawConfig::Topology::Line; - m_conf.indices_per_prim = 2; - if (unscale_pt_ln) + if (features.accurate_prims) { - if (features.line_expand) + GL_INS("HW: Using accurate lines"); + ExpandAccurateLinesVertices(); + m_conf.accurate_prims = true; + m_conf.accurate_prims_edge_data = &m_accurate_prims_edge_data; + m_conf.vs.accurate_prims = ACCURATE_PRIMS_LINE; + m_conf.ps.accurate_prims = ACCURATE_PRIMS_LINE; + m_conf.ps.accurate_prims_aa = (PRIM->AA1 != 0); + m_conf.ps.accurate_prims_aa_abe = (PRIM->ABE != 0); + m_conf.topology = GSHWDrawConfig::Topology::Triangle; + m_conf.indices_per_prim = 6; + } + else + { + m_conf.topology = GSHWDrawConfig::Topology::Line; + 
m_conf.indices_per_prim = 2; + if (unscale_pt_ln) { - m_conf.line_expand = true; - } - else if (features.vs_expand) - { - m_conf.vs.expand = GSHWDrawConfig::VSExpand::Line; - m_conf.cb_vs.point_size = GSVector2(16.0f * sx, 16.0f * sy); - m_conf.topology = GSHWDrawConfig::Topology::Triangle; - m_conf.indices_per_prim = 6; - ExpandLineIndices(); + if (features.line_expand) + { + m_conf.line_expand = true; + } + else if (features.vs_expand) + { + m_conf.vs.expand = GSHWDrawConfig::VSExpand::Line; + m_conf.cb_vs.point_size = GSVector2(16.0f * sx, 16.0f * sy); + m_conf.topology = GSHWDrawConfig::Topology::Triangle; + m_conf.indices_per_prim = 6; + ExpandLineIndices(); + } } } } @@ -5076,6 +5446,20 @@ void GSRendererHW::SetupIA(float target_scale, float sx, float sy, bool req_vert break; case GS_TRIANGLE_CLASS: + if (features.accurate_prims && PRIM->AA1) + { + GL_INS("HW: Using accurate triangles"); + ExpandAccurateTrianglesVertices(); + m_conf.accurate_prims = true; + m_conf.accurate_prims_edge_data = &m_accurate_prims_edge_data; + m_conf.vs.accurate_prims = ACCURATE_PRIMS_TRIANGLE; + m_conf.ps.accurate_prims = ACCURATE_PRIMS_TRIANGLE; + m_conf.ps.accurate_prims_aa = (PRIM->AA1 != 0); + m_conf.ps.accurate_prims_aa_abe = (PRIM->ABE != 0); + m_conf.topology = GSHWDrawConfig::Topology::Triangle; + m_conf.indices_per_prim = 21; + } + else { m_conf.topology = GSHWDrawConfig::Topology::Triangle; m_conf.indices_per_prim = 3; @@ -5130,6 +5514,10 @@ void GSRendererHW::EmulateZbuffer(const GSTextureCache::Target* ds) m_conf.depth.ztst = ZTST_ALWAYS; } + // Accurate prims requires a manual depth interpolation in the pixel shader. + // Piggy-back on Z clamp to avoid creating more pipeline combinations. + bool accurate_prims_clamp_z = UsingAccuratePrims() && (m_conf.depth.zwe || m_conf.depth.ztst != ZTST_ALWAYS); + // On the real GS we appear to do clamping on the max z value the format allows. // Clamping is done after rasterization. 
const u32 max_z = 0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8); @@ -5139,16 +5527,23 @@ void GSRendererHW::EmulateZbuffer(const GSTextureCache::Target* ds) //ps_cb.MaxDepth = GSVector4(0.0f, 0.0f, 0.0f, 1.0f); m_conf.ps.zclamp = 0; - if (clamp_z) + if (clamp_z || accurate_prims_clamp_z) { if (m_vt.m_primclass == GS_SPRITE_CLASS || m_vt.m_primclass == GS_POINT_CLASS) { m_conf.cb_vs.max_depth = GSVector2i(max_z); } - else if (!m_cached_ctx.ZBUF.ZMSK) + else if (!m_cached_ctx.ZBUF.ZMSK || accurate_prims_clamp_z) { m_conf.cb_ps.TA_MaxDepth_Af.z = static_cast(max_z) * 0x1p-32f; m_conf.ps.zclamp = 1; + if (accurate_prims_clamp_z && m_vt.m_primclass == GS_TRIANGLE_CLASS && PRIM->AA1 && + m_cached_ctx.TEST.ZTE && (m_conf.depth.ztst == ZTST_GEQUAL || m_conf.depth.ztst == ZTST_GREATER)) + { + // For HW AA1 with triangles we must do Z test in the shader to get proper + // updating of the Z buffer (interior triangle points update the Z buffer but edges should not). + m_conf.ps.ztst = m_conf.depth.ztst; + } } } } @@ -5619,15 +6014,13 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo { const GIFRegALPHA& ALPHA = m_context->ALPHA; { - // AA1: Blending needs to be enabled on draw. - const bool AA1 = PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS); // PABE: Check condition early as an optimization, no blending when As < 128. // For Cs*As + Cd*(1 - As) if As is 128 then blending can be disabled as well. 
const bool PABE_skip = m_draw_env->PABE.PABE && ((GetAlphaMinMax().max < 128) || (GetAlphaMinMax().max == 128 && ALPHA.A == 0 && ALPHA.B == 1 && ALPHA.C == 0 && ALPHA.D == 1)); // No blending or coverage anti-aliasing so early exit - if (PABE_skip || !(NeedsBlending() || AA1)) + if (PABE_skip || !(NeedsBlending() || IsCoverageAlpha())) { m_conf.blend = {}; m_conf.ps.no_color1 = true; @@ -7310,8 +7703,8 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta const bool is_overlap_alpha = m_prim_overlap != PRIM_OVERLAP_NO && !(m_cached_ctx.FRAME.FBMSK & 0x80000000); if (m_cached_ctx.TEST.DATM == 0) { - // Some pixles are >= 1 so some fail, or some pixels get written but the written alpha matches or exceeds 1 (so overlap doesn't always pass). - DATE = rt->m_alpha_max >= 128 || (is_overlap_alpha && rt->m_alpha_min < 128 && (GetAlphaMinMax().max >= 128 || (m_context->FBA.FBA || IsCoverageAlpha()))); + // Some pixels are >= 1 so some fail, or some pixels get written but the written alpha matches or exceeds 1 (so overlap doesn't always pass). + DATE = rt->m_alpha_max >= 128 || (is_overlap_alpha && rt->m_alpha_min < 128 && (GetAlphaMinMax().max >= 128 || (m_context->FBA.FBA || IsCoverageAlphaFixedOne()))); // All pixels fail. if (DATE && rt->m_alpha_min >= 128) @@ -7319,8 +7712,8 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta } else { - // Some pixles are < 1 so some fail, or some pixels get written but the written alpha goes below 1 (so overlap doesn't always pass). - DATE = rt->m_alpha_min < 128 || (is_overlap_alpha && rt->m_alpha_max >= 128 && (GetAlphaMinMax().min < 128 && !(m_context->FBA.FBA || IsCoverageAlpha()))); + // Some pixels are < 1 so some fail, or some pixels get written but the written alpha goes below 1 (so overlap doesn't always pass). 
+ DATE = rt->m_alpha_min < 128 || (is_overlap_alpha && rt->m_alpha_max >= 128 && (GetAlphaMinMax().min < 128 && !(m_context->FBA.FBA || IsCoverageAlphaFixedOne()))); // All pixels fail. if (DATE && rt->m_alpha_max < 128) @@ -7472,7 +7865,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta } // When Blending is disabled and Edge Anti Aliasing is enabled, // the output alpha is Coverage (which we force to 128) so DATE will fail/pass guaranteed on second pass. - else if (m_conf.colormask.wa && (m_context->FBA.FBA || IsCoverageAlpha()) && features.stencil_buffer) + else if (m_conf.colormask.wa && (m_context->FBA.FBA || IsCoverageAlphaFixedOne()) && features.stencil_buffer) { GL_PERF("DATE: Fast with FBA, all pixels will be >= 128"); DATE_one = !m_cached_ctx.TEST.DATM; @@ -7658,7 +8051,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta } // AA1: Set alpha source to coverage 128 when there is no alpha blending. - m_conf.ps.fixed_one_a = IsCoverageAlpha(); + m_conf.ps.fixed_one_a = IsCoverageAlphaFixedOne(); if ((!IsOpaque() || m_context->ALPHA.IsBlack()) && rt && ((m_conf.colormask.wrgba & 0x7) || (m_texture_shuffle && !m_copy_16bit_to_target_shuffle && !m_same_group_texture_shuffle))) { @@ -8024,6 +8417,23 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta m_conf.require_full_barrier = false; } + if ((features.texture_barrier || features.multidraw_fb_copy) && UsingAccuratePrims() && + (m_vt.m_primclass == GS_TRIANGLE_CLASS) && PRIM->AA1 && m_conf.ps.zclamp) + { + // Manual depth test in the shader requires a barrier: one barrier if prims don't overlap, otherwise a full barrier.
+ if (m_prim_overlap == PRIM_OVERLAP_NO) + m_conf.require_one_barrier = true; + else + m_conf.require_full_barrier = true; + } + + if (m_conf.require_full_barrier && (g_gs_device->Features().texture_barrier || g_gs_device->Features().multidraw_fb_copy)) + { + ComputeDrawlistGetSize(rt->m_scale); + m_conf.drawlist = &m_drawlist; + m_conf.drawlist_bbox = &m_drawlist_bbox; + } + // rs const GSVector4i hacked_scissor = m_channel_shuffle ? GSVector4i::cxpr(0, 0, 1024, 1024) : m_context->scissor.in; const GSVector4i scissor(GSVector4i(GSVector4(rtscale) * GSVector4(hacked_scissor)).rintersect(GSVector4i::loadh(rtsize))); @@ -8118,14 +8528,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta m_conf.cb_ps.FogColor_AREF.a = m_conf.alpha_second_pass.ps_aref; m_conf.alpha_second_pass.enable = false; } - - if (m_conf.require_full_barrier && (g_gs_device->Features().texture_barrier || g_gs_device->Features().multidraw_fb_copy)) - { - ComputeDrawlistGetSize(rt->m_scale); - m_conf.drawlist = &m_drawlist; - m_conf.drawlist_bbox = &m_drawlist_bbox; - } - + if (!m_channel_shuffle_width) g_gs_device->RenderHW(m_conf); else @@ -9568,3 +9971,10 @@ std::size_t GSRendererHW::ComputeDrawlistGetSize(float scale) } return m_drawlist.size(); } + +bool GSRendererHW::IsCoverageAlphaSupported() +{ + return IsCoverageAlpha() && + ((m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS) && + g_gs_device->Features().accurate_prims); +} diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.h b/pcsx2/GS/Renderers/HW/GSRendererHW.h index ab02574707..64db504ce9 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.h +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.h @@ -137,6 +137,21 @@ private: bool IsUsingCsInBlend(); bool IsUsingAsInBlend(); + void GetAccuratePrimsEdgeVertexAttributes( + const GSVertex& vtx0, + const GSVertex& vtx1, + const GSVertex* vtx_provoking, + AccuratePrimsEdgeData& data); + void ExpandAccurateTrianglesEdge( + const GSVertex& vtx0, + const 
GSVertex& vtx1, + const GSVertex* vtx_provoking, + const GSVector4i& edge0, + const GSVector4i& edge1, + bool top_left, + AccuratePrimsEdgeData& data, + GSVertex* vertex_out); + // We modify some of the context registers to optimize away unnecessary operations. // Instead of messing with the real context, we copy them and use those instead. struct HWCachedCtx @@ -205,6 +220,8 @@ private: std::unique_ptr m_sw_texture[7 + 1]; std::unique_ptr> m_sw_rasterizer; + std::vector m_accurate_prims_edge_data; + public: GSRendererHW(); virtual ~GSRendererHW() override; @@ -221,6 +238,8 @@ public: void Lines2Sprites(); bool VerifyIndices(); void ExpandLineIndices(); + void ExpandAccurateLinesVertices(); + void ExpandAccurateTrianglesVertices(); void ConvertSpriteTextureShuffle(u32& process_rg, u32& process_ba, bool& shuffle_across, GSTextureCache::Target* rt, GSTextureCache::Source* tex); GSVector4 RealignTargetTextureCoordinate(const GSTextureCache::Source* tex); GSVector4i ComputeBoundingBox(const GSVector2i& rtsize, float rtscale); @@ -273,4 +292,6 @@ public: /// Compute the drawlist (if not already present) and bounding boxes for the current draw. 
std::size_t ComputeDrawlistGetSize(float scale); + + bool IsCoverageAlphaSupported() override; }; diff --git a/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h b/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h index f7fb7f72e1..0cca9172a4 100644 --- a/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h +++ b/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h @@ -94,6 +94,11 @@ struct GSMTLMainVSUniform vector_float2 texture_offset; vector_float2 point_size; uint max_depth; + uint _pad0; + uint base_vertex; + uint _pad1; + uint _pad2; + uint _pad3; }; struct GSMTLMainPSUniform @@ -134,6 +139,8 @@ struct GSMTLMainPSUniform matrix_float4x4 dither_matrix; vector_float4 scale_factor; + + vector_uint4 accurate_prims_base_index; }; enum GSMTLAttributes diff --git a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp index b0e3359fee..174f69174f 100644 --- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp +++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp @@ -26,6 +26,7 @@ static constexpr u32 g_ps_cb_index = 0; static constexpr u32 VERTEX_BUFFER_SIZE = 32 * 1024 * 1024; static constexpr u32 INDEX_BUFFER_SIZE = 16 * 1024 * 1024; +static constexpr u32 ACCURATE_PRIMS_BUFFER_SIZE = 32 * 1024 * 1024; static constexpr u32 VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024; static constexpr u32 FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024; static constexpr u32 TEXTURE_UPLOAD_BUFFER_SIZE = 128 * 1024 * 1024; @@ -259,10 +260,12 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) m_vertex_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE); m_index_stream_buffer = GLStreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE); + m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE); m_vertex_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, VERTEX_UNIFORM_BUFFER_SIZE); m_fragment_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, 
FRAGMENT_UNIFORM_BUFFER_SIZE); glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &m_uniform_buffer_alignment); - if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer) + if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_accurate_prims_stream_buffer || + !m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer) { Host::ReportErrorAsync("GS", "Failed to create vertex/index/uniform streaming buffers"); return false; @@ -304,6 +307,11 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) glBufferData(GL_ELEMENT_ARRAY_BUFFER, EXPAND_BUFFER_SIZE, expand_data.get(), GL_STATIC_DRAW); glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, m_vertex_stream_buffer->GetGLBufferId(), 0, VERTEX_BUFFER_SIZE); } + + if (m_features.accurate_prims) + { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 3, m_accurate_prims_stream_buffer->GetGLBufferId(), 0, ACCURATE_PRIMS_BUFFER_SIZE); + } } // **************************************************************** @@ -768,6 +776,8 @@ bool GSDeviceOGL::CheckFeatures(bool& buggy_pbo) m_features.line_expand ? "hardware" : (m_features.vs_expand ? "vertex expanding" : "UNSUPPORTED"), m_features.vs_expand ? 
"vertex expanding" : "CPU"); + m_features.accurate_prims = GSConfig.HWAccuratePrims; + return true; } @@ -838,6 +848,7 @@ void GSDeviceOGL::DestroyResources() m_fragment_uniform_stream_buffer.reset(); m_vertex_uniform_stream_buffer.reset(); + m_accurate_prims_stream_buffer.reset(); glBindVertexArray(0); if (m_expand_ibo != 0) @@ -1328,8 +1339,9 @@ std::string GSDeviceOGL::GetVSSource(VSSelector sel) std::string macro = fmt::format("#define VS_FST {}\n", static_cast(sel.fst)) + fmt::format("#define VS_IIP {}\n", static_cast(sel.iip)) + fmt::format("#define VS_POINT_SIZE {}\n", static_cast(sel.point_size)) - + fmt::format("#define VS_EXPAND {}\n", static_cast(sel.expand)); - + + fmt::format("#define VS_EXPAND {}\n", static_cast(sel.expand)) + + fmt::format("#define VS_ACCURATE_PRIMS {}\n", static_cast(sel.accurate_prims)) + ; std::string src = GenGlslHeader("vs_main", GL_VERTEX_SHADER, macro); src += m_shader_tfx_vgs; return src; @@ -1394,6 +1406,10 @@ std::string GSDeviceOGL::GetPSSource(const PSSelector& sel) + fmt::format("#define PS_SCANMSK {}\n", sel.scanmsk) + fmt::format("#define PS_NO_COLOR {}\n", sel.no_color) + fmt::format("#define PS_NO_COLOR1 {}\n", sel.no_color1) + + fmt::format("#define PS_ACCURATE_PRIMS {}\n", sel.accurate_prims) + + fmt::format("#define PS_ACCURATE_PRIMS_AA {}\n", sel.accurate_prims_aa) + + fmt::format("#define PS_ACCURATE_PRIMS_AA_ABE {}\n", sel.accurate_prims_aa_abe) + + fmt::format("#define PS_ZTST {}\n", sel.ztst) ; std::string src = GenGlslHeader("ps_main", GL_FRAGMENT_SHADER, macro); @@ -2010,6 +2026,21 @@ void GSDeviceOGL::ClearSamplerCache() } } +void GSDeviceOGL::SetupAccuratePrims(GSHWDrawConfig& config) +{ + if (config.accurate_prims) + { + const u32 count = config.accurate_prims_edge_data->size(); + const u32 size = count * sizeof(AccuratePrimsEdgeData); + auto res = m_accurate_prims_stream_buffer->Map(sizeof(AccuratePrimsEdgeData), size); + std::memcpy(res.pointer, config.accurate_prims_edge_data->data(), size); + 
m_accurate_prims_stream_buffer->Unmap(size); + + config.cb_vs.base_vertex.x = m_vertex.start; + config.cb_ps.accurate_prims_base_index.x = res.index_aligned; + } +} + bool GSDeviceOGL::CreateCASPrograms() { std::optional cas_source = ReadShaderSource("shaders/opengl/cas.glsl"); @@ -2514,6 +2545,8 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config) IASetVertexBuffer(config.verts, config.nverts, GetVertexAlignment(config.vs.expand)); m_vertex.start *= GetExpansionFactor(config.vs.expand); + SetupAccuratePrims(config); + if (config.vs.UseExpandIndexBuffer()) { IASetVAO(m_expand_vao); @@ -2543,6 +2576,8 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config) PSSetShaderResource(2, draw_rt_clone); else if (config.require_one_barrier || config.require_full_barrier) PSSetShaderResource(2, colclip_rt ? colclip_rt : config.rt); + if ((config.require_one_barrier || config.require_full_barrier) && config.ps.IsFeedbackLoopDepth()) + PSSetShaderResource(4, config.ds); SetupSampler(config.sampler); @@ -2647,7 +2682,7 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config) OMSetRenderTargets(draw_rt, draw_ds, &config.scissor); OMSetColorMaskState(config.colormask); SetupOM(config.depth); - + // Clear stencil as close as possible to the RT bind, to avoid framebuffer swaps. 
if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::StencilOne && m_features.texture_barrier) { @@ -2736,7 +2771,7 @@ void GSDeviceOGL::SendHWDraw(const GSHWDrawConfig& config, bool one_barrier, boo } #ifdef PCSX2_DEVBUILD - if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]] + if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]] Console.Warning("OpenGL: Possible unnecessary barrier detected."); #endif diff --git a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h index d1ef40d11f..7b67ea24ef 100644 --- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h +++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h @@ -151,6 +151,7 @@ private: std::unique_ptr m_vertex_stream_buffer; std::unique_ptr m_index_stream_buffer; + std::unique_ptr m_accurate_prims_stream_buffer; GLuint m_expand_ibo = 0; GLuint m_vao = 0; GLuint m_expand_vao = 0; @@ -340,6 +341,7 @@ public: void IASetPrimitiveTopology(GLenum topology); void IASetVertexBuffer(const void* vertices, size_t count, size_t align_multiplier = 1); void IASetIndexBuffer(const void* index, size_t count); + void SetupAccuratePrims(GSHWDrawConfig& config); void PSSetShaderResource(int i, GSTexture* sr); void PSSetSamplerState(GLuint ss); diff --git a/pcsx2/GS/Renderers/SW/GSRendererSW.h b/pcsx2/GS/Renderers/SW/GSRendererSW.h index b724fd5575..2e6c9920d7 100644 --- a/pcsx2/GS/Renderers/SW/GSRendererSW.h +++ b/pcsx2/GS/Renderers/SW/GSRendererSW.h @@ -82,6 +82,8 @@ protected: template void RewriteVerticesIfSTOverflow(); + bool IsCoverageAlphaSupported() override { return true; } + public: GSRendererSW(int threads); ~GSRendererSW() override; diff --git a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp index fb9cc45e3d..0881bdb7e0 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp +++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp @@ -41,6 +41,7 @@ enum : u32 VERTEX_BUFFER_SIZE 
= 32 * 1024 * 1024, INDEX_BUFFER_SIZE = 16 * 1024 * 1024, + ACCURATE_PRIMS_BUFFER_SIZE = 32 * 1024 * 1024, VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024, FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024, TEXTURE_BUFFER_SIZE = 64 * 1024 * 1024, @@ -932,7 +933,7 @@ bool GSDeviceVK::CreateGlobalDescriptorPool() { static constexpr const VkDescriptorPoolSize pool_sizes[] = { {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 2}, - {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2}, + {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3}, }; VkDescriptorPoolCreateInfo pool_create_info = {VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr, @@ -1501,12 +1502,13 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key) VkAttachmentReference* color_reference_ptr = nullptr; VkAttachmentReference depth_reference; VkAttachmentReference* depth_reference_ptr = nullptr; - VkAttachmentReference input_reference; - VkAttachmentReference* input_reference_ptr = nullptr; - VkSubpassDependency subpass_dependency; - VkSubpassDependency* subpass_dependency_ptr = nullptr; + std::array input_reference; + u32 num_subpass_inputs = 0; + std::array subpass_dependency; + u32 num_subpass_dependencies = 0; std::array attachments; u32 num_attachments = 0; + bool actual_color_feedback_loop = false; if (key.color_format != VK_FORMAT_UNDEFINED) { const VkImageLayout layout = @@ -1522,28 +1524,32 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key) if (key.color_feedback_loop) { + actual_color_feedback_loop = true; + if (!UseFeedbackLoopLayout()) { - input_reference.attachment = num_attachments; - input_reference.layout = layout; - input_reference_ptr = &input_reference; + pxAssert(num_subpass_inputs == 0); // Must always have the color input first. 
+ input_reference[num_subpass_inputs].attachment = num_attachments; + input_reference[num_subpass_inputs].layout = layout; + num_subpass_inputs++; } if (!m_features.framebuffer_fetch) { + pxAssert(num_subpass_dependencies == 0); // Must always have the color input first. // don't need the framebuffer-local dependency when we have rasterization order attachment access - subpass_dependency.srcSubpass = 0; - subpass_dependency.dstSubpass = 0; - subpass_dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - subpass_dependency.dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; - subpass_dependency.srcAccessMask = + subpass_dependency[num_subpass_dependencies].srcSubpass = 0; + subpass_dependency[num_subpass_dependencies].dstSubpass = 0; + subpass_dependency[num_subpass_dependencies].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + subpass_dependency[num_subpass_dependencies].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + subpass_dependency[num_subpass_dependencies].srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - subpass_dependency.dstAccessMask = + subpass_dependency[num_subpass_dependencies].dstAccessMask = UseFeedbackLoopLayout() ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; - subpass_dependency.dependencyFlags = + subpass_dependency[num_subpass_dependencies].dependencyFlags = UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) : VK_DEPENDENCY_BY_REGION_BIT; - subpass_dependency_ptr = &subpass_dependency; + num_subpass_dependencies++; } } @@ -1562,6 +1568,41 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key) depth_reference.attachment = num_attachments; depth_reference.layout = layout; depth_reference_ptr = &depth_reference; + + if (actual_color_feedback_loop && key.depth_sampling) + { + // Note: We only allow depth to be bound in a feedback loop if color is already bound as such. 
+ // This is partly because it doesn't seem likely that we will ever need a depth feedback loop + // without a color feedback loop, and partly to simplify the indices for subpass inputs (0 for color; 1 for depth). + + if (!UseFeedbackLoopLayout()) + { + pxAssert(num_subpass_inputs == 1); // Must always have the color input first. + input_reference[num_subpass_inputs].attachment = num_attachments; + input_reference[num_subpass_inputs].layout = layout; + num_subpass_inputs++; + } + + if (!m_features.framebuffer_fetch) + { + pxAssert(num_subpass_dependencies == 1); // Must always have the color input first. + // don't need the framebuffer-local dependency when we have rasterization order attachment access + subpass_dependency[num_subpass_dependencies].srcSubpass = 0; + subpass_dependency[num_subpass_dependencies].dstSubpass = 0; + subpass_dependency[num_subpass_dependencies].srcStageMask = + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; + subpass_dependency[num_subpass_dependencies].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + subpass_dependency[num_subpass_dependencies].srcAccessMask = + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + subpass_dependency[num_subpass_dependencies].dstAccessMask = + UseFeedbackLoopLayout() ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; + subpass_dependency[num_subpass_dependencies].dependencyFlags = + UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) : + VK_DEPENDENCY_BY_REGION_BIT; + num_subpass_dependencies++; + } + } + num_attachments++; } @@ -1569,11 +1610,11 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key) (key.color_feedback_loop && m_optional_extensions.vk_ext_rasterization_order_attachment_access) ?
VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT : 0; - const VkSubpassDescription subpass = {subpass_flags, VK_PIPELINE_BIND_POINT_GRAPHICS, input_reference_ptr ? 1u : 0u, - input_reference_ptr ? input_reference_ptr : nullptr, color_reference_ptr ? 1u : 0u, + const VkSubpassDescription subpass = {subpass_flags, VK_PIPELINE_BIND_POINT_GRAPHICS, num_subpass_inputs, + num_subpass_inputs ? input_reference.data() : nullptr, color_reference_ptr ? 1u : 0u, color_reference_ptr ? color_reference_ptr : nullptr, nullptr, depth_reference_ptr, 0, nullptr}; const VkRenderPassCreateInfo pass_info = {VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, nullptr, 0u, num_attachments, - attachments.data(), 1u, &subpass, subpass_dependency_ptr ? 1u : 0u, subpass_dependency_ptr}; + attachments.data(), 1u, &subpass, num_subpass_dependencies, num_subpass_dependencies ? subpass_dependency.data() : nullptr}; VkRenderPass pass; const VkResult res = vkCreateRenderPass(m_device, &pass_info, nullptr, &pass); @@ -2679,6 +2720,8 @@ bool GSDeviceVK::CheckFeatures() m_max_texture_size = m_device_properties.limits.maxImageDimension2D; + m_features.accurate_prims = GSConfig.HWAccuratePrims; + return true; } @@ -3363,6 +3406,31 @@ void GSDeviceVK::IASetIndexBuffer(const void* index, size_t count) SetIndexBuffer(m_index_stream_buffer.GetBuffer()); } +void GSDeviceVK::SetupAccuratePrims(GSHWDrawConfig& config) +{ + if (config.accurate_prims) + { + const u32 count = config.accurate_prims_edge_data->size(); + const u32 size = count * sizeof(AccuratePrimsEdgeData); + + if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData))) + { + ExecuteCommandBufferAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer"); + if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData))) + pxFailRel("Failed to reserve space for accurate prims"); + } + + config.cb_vs.base_vertex = m_vertex.start; + 
config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer.GetCurrentOffset() / sizeof(AccuratePrimsEdgeData); + + SetVSConstantBuffer(config.cb_vs); + SetPSConstantBuffer(config.cb_ps); + + std::memcpy(m_accurate_prims_stream_buffer.GetCurrentHostPointer(), config.accurate_prims_edge_data->data(), size); + m_accurate_prims_stream_buffer.CommitMemory(size); + } +} + void GSDeviceVK::OMSetRenderTargets( GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop) { @@ -3379,12 +3447,15 @@ void GSDeviceVK::OMSetRenderTargets( if (vkRt) { m_current_framebuffer = - vkRt->GetLinkedFramebuffer(vkDs, (feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) != 0); + vkRt->GetLinkedFramebuffer(vkDs, + (feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) != 0, + (feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth) != 0); } else { - pxAssert(!(feedback_loop & FeedbackLoopFlag_ReadAndWriteRT)); - m_current_framebuffer = vkDs->GetLinkedFramebuffer(nullptr, false); + pxAssert(!(feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) && + !(feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth)); + m_current_framebuffer = vkDs->GetLinkedFramebuffer(nullptr, false, false); } } else if (InRenderPass()) @@ -3494,7 +3565,21 @@ void GSDeviceVK::OMSetRenderTargets( if (vkDs) { // need to update descriptors to reflect the new layout - if (feedback_loop & FeedbackLoopFlag_ReadDS) + if (feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth) + { + // NVIDIA drivers appear to return random garbage when sampling the RT via a feedback loop, if the load op for + // the render pass is CLEAR. Using vkCmdClearAttachments() doesn't work, so we have to clear the image instead. + // Note: DS feedback loop was added later - we will assume that the same issue is relevant. 
+ if (vkDs->GetState() == GSTexture::State::Cleared && IsDeviceNVIDIA()) + vkDs->CommitClear(); + + if (vkDs->GetLayout() != GSTextureVK::Layout::FeedbackLoop) + { + m_dirty_flags |= (DIRTY_FLAG_TFX_TEXTURE_0 << TFX_TEXTURE_DEPTH); + vkDs->TransitionToLayout(GSTextureVK::Layout::FeedbackLoop); + } + } + else if (feedback_loop & FeedbackLoopFlag_ReadDepth) { if (vkDs->GetLayout() != GSTextureVK::Layout::FeedbackLoop) { @@ -3675,6 +3760,15 @@ bool GSDeviceVK::CreateBuffers() return false; } + if (m_features.accurate_prims) + { + if (!m_accurate_prims_stream_buffer.Create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ACCURATE_PRIMS_BUFFER_SIZE)) + { + Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer"); + return false; + } + } + if (!m_vertex_uniform_stream_buffer.Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VERTEX_UNIFORM_BUFFER_SIZE)) { Host::ReportErrorAsync("GS", "Failed to allocate vertex uniform buffer"); @@ -3734,6 +3828,8 @@ bool GSDeviceVK::CreatePipelineLayouts() dslb.AddBinding(1, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1, VK_SHADER_STAGE_FRAGMENT_BIT); if (m_features.vs_expand) dslb.AddBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_VERTEX_BIT); + if (m_features.accurate_prims) + dslb.AddBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_FRAGMENT_BIT); if ((m_tfx_ubo_ds_layout = dslb.Create(dev)) == VK_NULL_HANDLE) return false; Vulkan::SetObjectName(dev, m_tfx_ubo_ds_layout, "TFX UBO descriptor layout"); @@ -3743,9 +3839,13 @@ bool GSDeviceVK::CreatePipelineLayouts() dslb.AddBinding(TFX_TEXTURE_PALETTE, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1, VK_SHADER_STAGE_FRAGMENT_BIT); dslb.AddBinding(TFX_TEXTURE_RT, (m_features.texture_barrier && !UseFeedbackLoopLayout()) ? 
VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT : - VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1, VK_SHADER_STAGE_FRAGMENT_BIT); dslb.AddBinding(TFX_TEXTURE_PRIMID, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1, VK_SHADER_STAGE_FRAGMENT_BIT); + dslb.AddBinding(TFX_TEXTURE_DEPTH, + (m_features.texture_barrier && !UseFeedbackLoopLayout()) ? VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT : + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + 1, VK_SHADER_STAGE_FRAGMENT_BIT); if ((m_tfx_texture_ds_layout = dslb.Create(dev)) == VK_NULL_HANDLE) return false; Vulkan::SetObjectName(dev, m_tfx_texture_ds_layout, "TFX texture descriptor layout"); @@ -4603,6 +4703,7 @@ void GSDeviceVK::DestroyResources() m_fragment_uniform_stream_buffer.Destroy(false); m_vertex_uniform_stream_buffer.Destroy(false); m_index_stream_buffer.Destroy(false); + m_accurate_prims_stream_buffer.Destroy(false); m_vertex_stream_buffer.Destroy(false); if (m_expand_index_buffer != VK_NULL_HANDLE) vmaDestroyBuffer(m_allocator, m_expand_index_buffer, m_expand_index_buffer_allocation); @@ -4670,6 +4771,7 @@ VkShaderModule GSDeviceVK::GetTFXVertexShader(GSHWDrawConfig::VSSelector sel) AddMacro(ss, "VS_POINT_SIZE", sel.point_size); AddMacro(ss, "VS_EXPAND", static_cast(sel.expand)); AddMacro(ss, "VS_PROVOKING_VERTEX_LAST", static_cast(m_features.provoking_vertex_last)); + AddMacro(ss, "VS_ACCURATE_PRIMS", static_cast(sel.accurate_prims)); ss << m_tfx_source; VkShaderModule mod = g_vulkan_shader_cache->GetVertexShader(ss.str()); @@ -4744,6 +4846,10 @@ VkShaderModule GSDeviceVK::GetTFXFragmentShader(const GSHWDrawConfig::PSSelector AddMacro(ss, "PS_TEX_IS_FB", sel.tex_is_fb); AddMacro(ss, "PS_NO_COLOR", sel.no_color); AddMacro(ss, "PS_NO_COLOR1", sel.no_color1); + AddMacro(ss, "PS_ACCURATE_PRIMS", sel.accurate_prims); + AddMacro(ss, "PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa); + AddMacro(ss, "PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe); + AddMacro(ss, "PS_ZTST", sel.ztst); ss << m_tfx_source; VkShaderModule mod = 
g_vulkan_shader_cache->GetFragmentShader(ss.str()); @@ -4945,6 +5051,11 @@ bool GSDeviceVK::CreatePersistentDescriptorSets() dsub.AddBufferDescriptorWrite(m_tfx_ubo_descriptor_set, 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_vertex_stream_buffer.GetBuffer(), 0, VERTEX_BUFFER_SIZE); } + if (m_features.accurate_prims) + { + dsub.AddBufferDescriptorWrite(m_tfx_ubo_descriptor_set, 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + m_accurate_prims_stream_buffer.GetBuffer(), 0, ACCURATE_PRIMS_BUFFER_SIZE); + } dsub.Update(dev); Vulkan::SetObjectName(dev, m_tfx_ubo_descriptor_set, "Persistent TFX UBO set"); return true; @@ -5341,11 +5452,15 @@ bool GSDeviceVK::ApplyTFXState(bool already_execed) m_current_pipeline_layout = PipelineLayout::TFX; flags |= DIRTY_FLAG_TFX_UBO | DIRTY_FLAG_TFX_TEXTURES; - // Clear out the RT binding if feedback loop isn't on, because it'll be in the wrong state and make + // Clear out the RT/DS binding if feedback loop isn't on, because it'll be in the wrong state and make // the validation layer cranky. Not a big deal since we need to write it anyway. 
- const GSTextureVK::Layout rt_tex_layout = m_tfx_textures[TFX_TEXTURE_RT]->GetLayout(); - if (rt_tex_layout != GSTextureVK::Layout::FeedbackLoop && rt_tex_layout != GSTextureVK::Layout::ShaderReadOnly) - m_tfx_textures[TFX_TEXTURE_RT] = m_null_texture.get(); + std::array texture_types = { TFX_TEXTURE_RT, TFX_TEXTURE_DEPTH }; + for (u32 texture_type : texture_types) + { + const GSTextureVK::Layout tex_layout = m_tfx_textures[texture_type]->GetLayout(); + if (tex_layout != GSTextureVK::Layout::FeedbackLoop && tex_layout != GSTextureVK::Layout::ShaderReadOnly) + m_tfx_textures[texture_type] = m_null_texture.get(); + } } if (flags & DIRTY_FLAG_TFX_UBO) @@ -5386,6 +5501,19 @@ bool GSDeviceVK::ApplyTFXState(bool already_execed) dsub.AddImageDescriptorWrite(VK_NULL_HANDLE, TFX_TEXTURE_PRIMID, m_tfx_textures[TFX_TEXTURE_PRIMID]->GetView(), m_tfx_textures[TFX_TEXTURE_PRIMID]->GetVkLayout()); } + if (flags & DIRTY_FLAG_TFX_TEXTURE_DEPTH) + { + if (m_features.texture_barrier && !UseFeedbackLoopLayout()) + { + dsub.AddInputAttachmentDescriptorWrite( + VK_NULL_HANDLE, TFX_TEXTURE_DEPTH, m_tfx_textures[TFX_TEXTURE_DEPTH]->GetView(), VK_IMAGE_LAYOUT_GENERAL); + } + else + { + dsub.AddImageDescriptorWrite(VK_NULL_HANDLE, TFX_TEXTURE_DEPTH, m_tfx_textures[TFX_TEXTURE_DEPTH]->GetView(), + m_tfx_textures[TFX_TEXTURE_DEPTH]->GetVkLayout()); + } + } dsub.PushUpdate(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, m_tfx_pipeline_layout, TFX_DESCRIPTOR_SET_TEXTURES); } @@ -5597,8 +5725,12 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config) UpdateHWPipelineSelector(config, pipe); // If we don't have a barrier but the texture was drawn to last draw, end the pass to insert a barrier. 
- if (InRenderPass() && !pipe.IsRTFeedbackLoop() && (config.tex == m_current_render_target || config.tex == m_current_depth_target)) - EndRenderPass(); + if (InRenderPass()) + { + if ((!pipe.IsRTFeedbackLoop() && config.tex == m_current_render_target) || + (!pipe.IsDepthFeedbackLoop() && config.tex == m_current_depth_target)) + EndRenderPass(); + } // now blit the colclip texture back to the original target if (colclip_rt) @@ -5781,20 +5913,31 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config) // Despite the layout changing enforcing the execution dependency between previous draws and the first // input attachment read, it still wants the region/fragment-local barrier... - const bool skip_first_barrier = - (draw_rt && draw_rt->GetLayout() != GSTextureVK::Layout::FeedbackLoop && !pipe.ps.colclip_hw && !IsDeviceAMD()); + bool skip_first_barrier = !pipe.ps.colclip_hw && !IsDeviceAMD(); + if (draw_rt) + skip_first_barrier = skip_first_barrier && draw_rt->GetLayout() != GSTextureVK::Layout::FeedbackLoop; + if (draw_ds) + skip_first_barrier = skip_first_barrier && draw_ds->GetLayout() != GSTextureVK::Layout::FeedbackLoop; OMSetRenderTargets(draw_rt, draw_ds, config.scissor, static_cast(pipe.feedback_loop_flags)); if (pipe.IsRTFeedbackLoop()) { pxAssertMsg(m_features.texture_barrier, "Texture barriers enabled"); - PSSetShaderResource(2, draw_rt, false); + PSSetShaderResource(TFX_TEXTURE_RT, draw_rt, false); // If this is the first draw to the target as a feedback loop, make sure we re-generate the texture descriptor. // Otherwise, we might have a previous descriptor left over, that has the RT in a different state. m_dirty_flags |= (skip_first_barrier ? 
static_cast(DIRTY_FLAG_TFX_TEXTURE_RT) : 0); } + if (pipe.IsDepthFeedbackLoop()) + { + pxAssertMsg(m_features.texture_barrier, "Texture barriers enabled"); + PSSetShaderResource(TFX_TEXTURE_DEPTH, draw_ds, false); + // If this is the first draw to the target as a feedback loop, make sure we re-generate the texture descriptor. + // Otherwise, we might have a previous descriptor left over, that has the DS in a different state. + m_dirty_flags |= (skip_first_barrier ? static_cast(DIRTY_FLAG_TFX_TEXTURE_DEPTH) : 0); + } // Begin render pass if new target or out of the area. if (!InRenderPass()) { @@ -5868,7 +6011,8 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config) // now we can do the actual draw if (BindDrawPipeline(pipe)) - SendHWDraw(config, draw_rt, config.require_one_barrier, config.require_full_barrier, skip_first_barrier); + SendHWDraw(config, draw_rt, pipe.IsDepthFeedbackLoop() ? draw_ds : nullptr, + config.require_one_barrier, config.require_full_barrier, skip_first_barrier); // blend second pass if (config.blend_multi_pass.enable) @@ -5902,8 +6046,8 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config) pipe.bs = config.blend; if (BindDrawPipeline(pipe)) { - SendHWDraw(config, draw_rt, config.alpha_second_pass.require_one_barrier, - config.alpha_second_pass.require_full_barrier, false); + SendHWDraw(config, draw_rt, pipe.IsDepthFeedbackLoop() ? draw_ds : nullptr, + config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, false); } } @@ -5980,19 +6124,24 @@ void GSDeviceVK::UpdateHWPipelineSelector(GSHWDrawConfig& config, PipelineSelect pipe.rt = config.rt != nullptr; pipe.ds = config.ds != nullptr; pipe.line_width = config.line_expand; - pipe.feedback_loop_flags = - (m_features.texture_barrier && - (config.ps.IsFeedbackLoop() || config.require_one_barrier || config.require_full_barrier)) ? - FeedbackLoopFlag_ReadAndWriteRT : - FeedbackLoopFlag_None; - pipe.feedback_loop_flags |= - (config.tex && config.tex == config.ds) ? 
FeedbackLoopFlag_ReadDS : FeedbackLoopFlag_None; + pipe.feedback_loop_flags = FeedbackLoopFlag_None; + if (m_features.texture_barrier && (config.ps.IsFeedbackLoop() || config.require_one_barrier || config.require_full_barrier)) + { + pipe.feedback_loop_flags |= FeedbackLoopFlag_ReadAndWriteRT; + + // We only allow DS feedback loop if RT is already in a feedback loop. + pipe.feedback_loop_flags |= (pipe.ds && config.ps.IsFeedbackLoopDepth()) ? FeedbackLoopFlag_ReadAndWriteDepth : FeedbackLoopFlag_None; + } + if (!(pipe.feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteDepth)) + { + pipe.feedback_loop_flags |= (config.tex && config.tex == config.ds) ? FeedbackLoopFlag_ReadDepth : FeedbackLoopFlag_None; + } // enable point size in the vertex shader if we're rendering points regardless of upscaling. pipe.vs.point_size |= (config.topology == GSHWDrawConfig::Topology::Point); } -void GSDeviceVK::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config) +void GSDeviceVK::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config) { IASetVertexBuffer(config.verts, sizeof(GSVertex), config.nverts, GetVertexAlignment(config.vs.expand)); m_vertex.start *= GetExpansionFactor(config.vs.expand); @@ -6007,6 +6156,8 @@ void GSDeviceVK::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config) { IASetIndexBuffer(config.indices, config.nindices); } + + SetupAccuratePrims(config); } VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const @@ -6020,13 +6171,31 @@ VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, rt->GetImage(), {VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u}}; } +VkImageMemoryBarrier GSDeviceVK::GetDepthStencilBufferBarrier(GSTextureVK* ds) const +{ + const VkImageLayout layout = + UseFeedbackLoopLayout() ? VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT : VK_IMAGE_LAYOUT_GENERAL; + const VkAccessFlags dst_access = + UseFeedbackLoopLayout() ? 
VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; + return {VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, nullptr, + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, dst_access, layout, layout, + VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, ds->GetImage(), + {VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0u, 1u, 0u, 1u}}; +} + VkDependencyFlags GSDeviceVK::GetColorBufferBarrierFlags() const { return UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) : VK_DEPENDENCY_BY_REGION_BIT; } -void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, +VkDependencyFlags GSDeviceVK::GetDepthStencilBufferBarrierFlags() const +{ + return UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) : + VK_DEPENDENCY_BY_REGION_BIT; +} + +void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, GSTextureVK* draw_ds, bool one_barrier, bool full_barrier, bool skip_first_barrier) { if (!m_features.texture_barrier) [[unlikely]] @@ -6036,21 +6205,48 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, } #ifdef PCSX2_DEVBUILD - if ((one_barrier || full_barrier) && !m_pipeline_selector.ps.IsFeedbackLoop()) [[unlikely]] + if ((one_barrier || full_barrier) && !(m_pipeline_selector.ps.IsFeedbackLoop() || m_pipeline_selector.ps.IsFeedbackLoopDepth())) [[unlikely]] Console.Warning("VK: Possible unnecessary barrier detected."); #endif - const VkDependencyFlags barrier_flags = GetColorBufferBarrierFlags(); + std::array barrier_flags = { + GetColorBufferBarrierFlags(), + GetDepthStencilBufferBarrierFlags(), + }; + std::array barrier; + u32 barriers_per_draw = 0; + if (full_barrier || one_barrier) + { + if (draw_rt) + barrier[barriers_per_draw++] = GetColorBufferBarrier(draw_rt); + if (draw_ds) + barrier[barriers_per_draw++] = GetDepthStencilBufferBarrier(draw_ds); + } + + const auto 
IssueBarriers = [&]() { + if (draw_rt) + { + vkCmdPipelineBarrier(GetCurrentCommandBuffer(), + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags[0], 0, nullptr, 0, nullptr, 1, &barrier[0]); + } + if (draw_ds) + { + vkCmdPipelineBarrier(GetCurrentCommandBuffer(), + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags[1], 0, nullptr, 0, nullptr, 1, &barrier[draw_rt ? 1 : 0]); // barriers are packed: DS is in slot 1 only when the RT barrier took slot 0 + } + }; + if (full_barrier) { pxAssert(config.drawlist && !config.drawlist->empty()); - const VkImageMemoryBarrier barrier = GetColorBufferBarrier(draw_rt); const u32 indices_per_prim = config.indices_per_prim; const u32 draw_list_size = static_cast(config.drawlist->size()); GL_PUSH("Split the draw"); - g_perfmon.Put( - GSPerfMon::Barriers, static_cast(draw_list_size) - static_cast(skip_first_barrier)); + g_perfmon.Put(GSPerfMon::Barriers, + barriers_per_draw * (static_cast(draw_list_size) - static_cast(skip_first_barrier))); u32 p = 0; u32 n = 0; @@ -6065,8 +6261,7 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, for (; n < draw_list_size; n++) { - vkCmdPipelineBarrier(GetCurrentCommandBuffer(), VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags, 0, nullptr, 0, nullptr, 1, &barrier); + IssueBarriers(); const u32 count = (*config.drawlist)[n] * indices_per_prim; DrawIndexedPrimitive(p, count); @@ -6078,11 +6273,8 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, if (one_barrier && !skip_first_barrier) { - g_perfmon.Put(GSPerfMon::Barriers, 1); - - const VkImageMemoryBarrier barrier = GetColorBufferBarrier(draw_rt); - vkCmdPipelineBarrier(GetCurrentCommandBuffer(), VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags, 0, nullptr, 0, nullptr, 1, &barrier); + g_perfmon.Put(GSPerfMon::Barriers,
barriers_per_draw); + IssueBarriers(); } DrawIndexedPrimitive(); diff --git a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h index d696d4addd..e8eb0f8ff5 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h +++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h @@ -293,7 +293,8 @@ public: { FeedbackLoopFlag_None = 0, FeedbackLoopFlag_ReadAndWriteRT = 1, - FeedbackLoopFlag_ReadDS = 2, + FeedbackLoopFlag_ReadDepth = 2, + FeedbackLoopFlag_ReadAndWriteDepth = 4, }; struct alignas(8) PipelineSelector @@ -308,7 +309,7 @@ public: u32 rt : 1; u32 ds : 1; u32 line_width : 1; - u32 feedback_loop_flags : 2; + u32 feedback_loop_flags : 3; }; u32 key; @@ -326,7 +327,8 @@ public: __fi PipelineSelector() { std::memset(this, 0, sizeof(*this)); } __fi bool IsRTFeedbackLoop() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteRT) != 0); } - __fi bool IsTestingAndSamplingDepth() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadDS) != 0); } + __fi bool IsDepthFeedbackLoop() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteDepth) != 0); } + __fi bool IsTestingAndSamplingDepth() const { return ((feedback_loop_flags & (FeedbackLoopFlag_ReadDepth | FeedbackLoopFlag_ReadAndWriteDepth)) != 0); } }; static_assert(sizeof(PipelineSelector) == 24, "Pipeline selector is 24 bytes"); @@ -357,10 +359,11 @@ public: }; enum TFX_TEXTURES : u32 { - TFX_TEXTURE_TEXTURE, + TFX_TEXTURE_TEXTURE = 0, TFX_TEXTURE_PALETTE, TFX_TEXTURE_RT, TFX_TEXTURE_PRIMID, + TFX_TEXTURE_DEPTH, NUM_TFX_TEXTURES }; @@ -377,6 +380,7 @@ private: VKStreamBuffer m_vertex_stream_buffer; VKStreamBuffer m_index_stream_buffer; + VKStreamBuffer m_accurate_prims_stream_buffer; VKStreamBuffer m_vertex_uniform_stream_buffer; VKStreamBuffer m_fragment_uniform_stream_buffer; VKStreamBuffer m_texture_stream_buffer; @@ -559,6 +563,8 @@ public: void PSSetShaderResource(int i, GSTexture* sr, bool check_state); void PSSetSampler(GSHWDrawConfig::SamplerSelector sel); + void 
SetupAccuratePrims(GSHWDrawConfig& config); + void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop = FeedbackLoopFlag_None); @@ -568,10 +574,12 @@ public: void RenderHW(GSHWDrawConfig& config) override; void UpdateHWPipelineSelector(GSHWDrawConfig& config, PipelineSelector& pipe); - void UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config); + void UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config); VkImageMemoryBarrier GetColorBufferBarrier(GSTextureVK* rt) const; VkDependencyFlags GetColorBufferBarrierFlags() const; - void SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, + VkImageMemoryBarrier GetDepthStencilBufferBarrier(GSTextureVK* ds) const; + VkDependencyFlags GetDepthStencilBufferBarrierFlags() const; + void SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, GSTextureVK* draw_ds, bool one_barrier, bool full_barrier, bool skip_first_barrier); ////////////////////////////////////////////////////////////////////////// @@ -621,25 +629,27 @@ public: private: enum DIRTY_FLAG : u32 { - DIRTY_FLAG_TFX_TEXTURE_0 = (1 << 0), // 0, 1, 2, 3 - DIRTY_FLAG_TFX_UBO = (1 << 4), - DIRTY_FLAG_UTILITY_TEXTURE = (1 << 5), - DIRTY_FLAG_BLEND_CONSTANTS = (1 << 6), - DIRTY_FLAG_LINE_WIDTH = (1 << 7), - DIRTY_FLAG_INDEX_BUFFER = (1 << 8), - DIRTY_FLAG_VIEWPORT = (1 << 9), - DIRTY_FLAG_SCISSOR = (1 << 10), - DIRTY_FLAG_PIPELINE = (1 << 11), - DIRTY_FLAG_VS_CONSTANT_BUFFER = (1 << 12), - DIRTY_FLAG_PS_CONSTANT_BUFFER = (1 << 13), + DIRTY_FLAG_TFX_TEXTURE_0 = (1 << 0), // 0, 1, 2, 3, 4 + DIRTY_FLAG_TFX_UBO = (1 << 5), + DIRTY_FLAG_UTILITY_TEXTURE = (1 << 6), + DIRTY_FLAG_BLEND_CONSTANTS = (1 << 7), + DIRTY_FLAG_LINE_WIDTH = (1 << 8), + DIRTY_FLAG_INDEX_BUFFER = (1 << 9), + DIRTY_FLAG_VIEWPORT = (1 << 10), + DIRTY_FLAG_SCISSOR = (1 << 11), + DIRTY_FLAG_PIPELINE = (1 << 12), + DIRTY_FLAG_VS_CONSTANT_BUFFER = (1 << 13), + DIRTY_FLAG_PS_CONSTANT_BUFFER = (1 << 14), DIRTY_FLAG_TFX_TEXTURE_TEX = 
(DIRTY_FLAG_TFX_TEXTURE_0 << 0), DIRTY_FLAG_TFX_TEXTURE_PALETTE = (DIRTY_FLAG_TFX_TEXTURE_0 << 1), DIRTY_FLAG_TFX_TEXTURE_RT = (DIRTY_FLAG_TFX_TEXTURE_0 << 2), DIRTY_FLAG_TFX_TEXTURE_PRIMID = (DIRTY_FLAG_TFX_TEXTURE_0 << 3), + DIRTY_FLAG_TFX_TEXTURE_DEPTH = (DIRTY_FLAG_TFX_TEXTURE_0 << 4), DIRTY_FLAG_TFX_TEXTURES = DIRTY_FLAG_TFX_TEXTURE_TEX | DIRTY_FLAG_TFX_TEXTURE_PALETTE | - DIRTY_FLAG_TFX_TEXTURE_RT | DIRTY_FLAG_TFX_TEXTURE_PRIMID, + DIRTY_FLAG_TFX_TEXTURE_RT | DIRTY_FLAG_TFX_TEXTURE_PRIMID | + DIRTY_FLAG_TFX_TEXTURE_DEPTH, DIRTY_BASE_STATE = DIRTY_FLAG_INDEX_BUFFER | DIRTY_FLAG_PIPELINE | DIRTY_FLAG_VIEWPORT | DIRTY_FLAG_SCISSOR | DIRTY_FLAG_BLEND_CONSTANTS | DIRTY_FLAG_LINE_WIDTH, diff --git a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp index a0aa8bcca5..d7aec5b8af 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp +++ b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp @@ -114,7 +114,7 @@ std::unique_ptr GSTextureVK::Create(Type type, Format format, int w VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | (GSDeviceVK::GetInstance()->UseFeedbackLoopLayout() ? 
VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT - : 0); + : VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT); vci.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; } break; @@ -198,7 +198,7 @@ void GSTextureVK::Destroy(bool defer) if (m_type == Type::RenderTarget || m_type == Type::DepthStencil) { - for (const auto& [other_tex, fb, feedback] : m_framebuffers) + for (const auto& [other_tex, fb, feedback_color, feedback_depth] : m_framebuffers) { if (other_tex) { @@ -738,16 +738,16 @@ void GSTextureVK::TransitionSubresourcesToLayout( VkFramebuffer GSTextureVK::GetFramebuffer(bool feedback_loop) { - return GetLinkedFramebuffer(nullptr, feedback_loop); + return GetLinkedFramebuffer(nullptr, feedback_loop, false); } -VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop) +VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop_color, bool feedback_loop_depth) { pxAssertRel(m_type != Type::Texture, "Texture is a render target"); - for (const auto& [other_tex, fb, other_feedback_loop] : m_framebuffers) + for (const auto& [other_tex, fb, other_feedback_loop_color, other_feedback_loop_depth] : m_framebuffers) { - if (other_tex == depth_texture && other_feedback_loop == feedback_loop) + if (other_tex == depth_texture && other_feedback_loop_color == feedback_loop_color && other_feedback_loop_depth == feedback_loop_depth) return fb; } @@ -756,7 +756,7 @@ VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool (m_type != GSTexture::Type::DepthStencil) ? (depth_texture ? 
depth_texture->m_vk_format : VK_FORMAT_UNDEFINED) : m_vk_format, VK_ATTACHMENT_LOAD_OP_LOAD, VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_LOAD, - VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_STORE_OP_DONT_CARE, feedback_loop); + VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_STORE_OP_DONT_CARE, feedback_loop_color, feedback_loop_depth); if (!rp) return VK_NULL_HANDLE; @@ -771,9 +771,9 @@ VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool if (!fb) return VK_NULL_HANDLE; - m_framebuffers.emplace_back(depth_texture, fb, feedback_loop); + m_framebuffers.emplace_back(depth_texture, fb, feedback_loop_color, feedback_loop_depth); if (depth_texture) - depth_texture->m_framebuffers.emplace_back(this, fb, feedback_loop); + depth_texture->m_framebuffers.emplace_back(this, fb, feedback_loop_color, feedback_loop_depth); return fb; } diff --git a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h index e798514365..41cd2edbb2 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h +++ b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h @@ -73,7 +73,7 @@ public: /// Framebuffers are lazily allocated. VkFramebuffer GetFramebuffer(bool feedback_loop); - VkFramebuffer GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop); + VkFramebuffer GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop_color, bool feedback_loop_depth); // Call when the texture is bound to the pipeline, or read from in a copy. 
__fi void SetUseFenceCounter(u64 counter) { m_use_fence_counter = counter; } @@ -103,7 +103,7 @@ private: // linked framebuffer is combined with depth texture // list of color textures this depth texture is linked to or vice versa - std::vector> m_framebuffers; + std::vector> m_framebuffers; }; class GSDownloadTextureVK final : public GSDownloadTexture diff --git a/pcsx2/Pcsx2Config.cpp b/pcsx2/Pcsx2Config.cpp index 142fa8d011..6344544e73 100644 --- a/pcsx2/Pcsx2Config.cpp +++ b/pcsx2/Pcsx2Config.cpp @@ -750,6 +750,7 @@ Pcsx2Config::GSOptions::GSOptions() PreloadFrameWithGSData = false; Mipmap = true; HWMipmap = true; + HWAccuratePrims = false; ManualUserHacks = false; UserHacks_AlignSpriteX = false; @@ -1019,6 +1020,7 @@ void Pcsx2Config::GSOptions::LoadSave(SettingsWrapper& wrap) SettingsWrapEntryEx(UpscaleMultiplier, "upscale_multiplier"); SettingsWrapBitBoolEx(HWMipmap, "hw_mipmap"); + SettingsWrapBitBoolEx(HWAccuratePrims, "HWAccuratePrims"); SettingsWrapIntEnumEx(AccurateBlendingUnit, "accurate_blending_unit"); SettingsWrapIntEnumEx(TextureFiltering, "filter"); SettingsWrapIntEnumEx(TexturePreloading, "texture_preloading"); From cbd4a9c92f98b406a2ca33013ae78c48e847ffca Mon Sep 17 00:00:00 2001 From: TJnotJT Date: Tue, 25 Nov 2025 18:25:57 -0500 Subject: [PATCH 2/2] GS/VK/GL/DX12/DX11: Use default buffer instead of upload buffer for accurate prims data. Should hopefully give better performance. Also refactor some upload/staging buffer handling in VK/DX12. 
--- pcsx2/GS/Renderers/DX11/GSDevice11.cpp | 46 ++--- pcsx2/GS/Renderers/DX11/GSDevice11.h | 1 - pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp | 22 ++- pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h | 5 +- pcsx2/GS/Renderers/DX12/GSDevice12.cpp | 185 ++++++++++++++---- pcsx2/GS/Renderers/DX12/GSDevice12.h | 6 +- pcsx2/GS/Renderers/DX12/GSTexture12.cpp | 69 ++----- pcsx2/GS/Renderers/DX12/GSTexture12.h | 1 - pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp | 4 +- pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h | 2 +- pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp | 10 +- pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp | 124 +++++++++++- pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h | 6 +- pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp | 59 +----- pcsx2/GS/Renderers/Vulkan/GSTextureVK.h | 1 - pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp | 31 ++- pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h | 6 +- 17 files changed, 370 insertions(+), 208 deletions(-) diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp index d95e22a249..56bb0a06c6 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp @@ -396,8 +396,12 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) } } + bd = {}; + if (m_features.accurate_prims) { + bd.Usage = D3D11_USAGE_DEFAULT; + bd.CPUAccessFlags = 0; bd.ByteWidth = ACCURATE_PRIMS_BUFFER_SIZE; bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; bd.StructureByteStride = sizeof(AccuratePrimsEdgeData); @@ -410,8 +414,11 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) } const CD3D11_SHADER_RESOURCE_VIEW_DESC accurate_prims_b_srv_desc( - D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0, ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData)); - if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc, m_accurate_prims_b_srv.put()))) + D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0, + ACCURATE_PRIMS_BUFFER_SIZE / 
sizeof(AccuratePrimsEdgeData)); + + if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc, + m_accurate_prims_b_srv.put()))) { Console.Error("D3D11: Failed to create accurate prims buffer SRV."); return false; @@ -419,7 +426,7 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) // If MAX_TEXTURES changes, please change the register for this buffer in the shader. static_assert(MAX_TEXTURES == 5); - m_ctx->PSSetShaderResources(MAX_TEXTURES, 1, m_accurate_prims_b_srv.addressof()); + m_ctx->PSSetShaderResources(5, 1, m_accurate_prims_b_srv.addressof()); } // rasterizer @@ -2326,29 +2333,18 @@ bool GSDevice11::SetupAccuratePrims(GSHWDrawConfig& config) if (size > ACCURATE_PRIMS_BUFFER_SIZE) return false; - D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE; + // Performance note: UpdateSubresource() copies data to a temp staging buffer to avoid stalling the GPU, + // so a manual ring buffer is not needed here like VK/DX12. + D3D11_BOX dst_region{}; + dst_region.left = 0; + dst_region.right = size; + dst_region.top = 0; + dst_region.bottom = 1; + dst_region.front = 0; + dst_region.back = 1; + m_ctx->UpdateSubresource(m_accurate_prims_b.get(), 0, &dst_region, config.accurate_prims_edge_data->data(), size, 0); - pxAssert(m_accurate_prims_b_pos % sizeof(AccuratePrimsEdgeData) == 0); - - if (m_accurate_prims_b_pos + size > ACCURATE_PRIMS_BUFFER_SIZE) - { - m_accurate_prims_b_pos = 0; - type = D3D11_MAP_WRITE_DISCARD; - } - - D3D11_MAPPED_SUBRESOURCE m; - if (FAILED(m_ctx->Map(m_accurate_prims_b.get(), 0, type, 0, &m))) - return false; - - void* map = static_cast(m.pData) + m_accurate_prims_b_pos; - - GSVector4i::storent(map, config.accurate_prims_edge_data->data(), size); - - m_ctx->Unmap(m_accurate_prims_b.get(), 0); - - config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_b_pos / sizeof(AccuratePrimsEdgeData); - - m_accurate_prims_b_pos += size; + config.cb_ps.accurate_prims_base_index.x = 0; // No 
offsetting needed like DX12/VK since we don't use a ring buffer. } return true; } diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.h b/pcsx2/GS/Renderers/DX11/GSDevice11.h index b790e3eed9..a90296da05 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.h +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.h @@ -137,7 +137,6 @@ private: u32 m_vb_pos = 0; // bytes u32 m_ib_pos = 0; // indices/sizeof(u32) u32 m_structured_vb_pos = 0; // bytes - u32 m_accurate_prims_b_pos = 0; // bytes/sizeof(AccuratePrimsEdgeData) bool m_allow_tearing_supported = false; bool m_using_flip_model_swap_chain = true; diff --git a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp index 7219dc48f1..da87429193 100644 --- a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp +++ b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp @@ -20,29 +20,33 @@ D3D12StreamBuffer::~D3D12StreamBuffer() Destroy(); } -bool D3D12StreamBuffer::Create(u32 size) +bool D3D12StreamBuffer::Create(u32 size, bool default_heap) { const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE}; D3D12MA::ALLOCATION_DESC allocationDesc = {}; allocationDesc.Flags = D3D12MA::ALLOCATION_FLAG_COMMITTED; - allocationDesc.HeapType = D3D12_HEAP_TYPE_UPLOAD; + allocationDesc.HeapType = default_heap ? D3D12_HEAP_TYPE_DEFAULT : D3D12_HEAP_TYPE_UPLOAD; wil::com_ptr_nothrow buffer; wil::com_ptr_nothrow allocation; HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocationDesc, &resource_desc, - D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(buffer.put())); + default_heap ? 
D3D12_RESOURCE_STATE_COMMON : D3D12_RESOURCE_STATE_GENERIC_READ, + nullptr, allocation.put(), IID_PPV_ARGS(buffer.put())); pxAssertMsg(SUCCEEDED(hr), "Allocate buffer"); if (FAILED(hr)) return false; static const D3D12_RANGE read_range = {}; - u8* host_pointer; - hr = buffer->Map(0, &read_range, reinterpret_cast(&host_pointer)); - pxAssertMsg(SUCCEEDED(hr), "Map buffer"); - if (FAILED(hr)) - return false; + u8* host_pointer = nullptr; + if (!default_heap) + { + hr = buffer->Map(0, &read_range, reinterpret_cast(&host_pointer)); + pxAssertMsg(SUCCEEDED(hr), "Map buffer"); + if (FAILED(hr)) + return false; + } Destroy(true); @@ -51,6 +55,7 @@ bool D3D12StreamBuffer::Create(u32 size) m_host_pointer = host_pointer; m_size = size; m_gpu_pointer = m_buffer->GetGPUVirtualAddress(); + m_default_heap = default_heap; return true; } @@ -148,6 +153,7 @@ void D3D12StreamBuffer::Destroy(bool defer) m_current_offset = 0; m_current_space = 0; m_current_gpu_position = 0; + m_default_heap = false; m_tracked_fences.clear(); } diff --git a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h index da1655f4bd..f5164db4f3 100644 --- a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h +++ b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h @@ -22,7 +22,7 @@ public: D3D12StreamBuffer(); ~D3D12StreamBuffer(); - bool Create(u32 size); + bool Create(u32 size, bool default_heap = false); __fi bool IsValid() const { return static_cast(m_buffer); } __fi ID3D12Resource* GetBuffer() const { return m_buffer.get(); } @@ -54,7 +54,8 @@ private: wil::com_ptr_nothrow m_buffer; wil::com_ptr_nothrow m_allocation; D3D12_GPU_VIRTUAL_ADDRESS m_gpu_pointer = {}; - u8* m_host_pointer = nullptr; + u8* m_host_pointer = nullptr; // Only used for upload heaps. + bool m_default_heap = false; // False for upload heap; true for default heap. 
// List of fences and the corresponding positions in the buffer std::deque> m_tracked_fences; diff --git a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp index 4f6c991f91..1416e8369d 100644 --- a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp +++ b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp @@ -624,52 +624,91 @@ bool GSDevice12::SetGPUTimingEnabled(bool enabled) bool GSDevice12::AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer, D3D12MA::Allocation** gpu_allocation, const std::function& fill_callback) { - // Try to place the fixed index buffer in GPU local memory. - // Use the staging buffer to copy into it. + // Allocate and fill staging buffer + ID3D12Resource* cpu_buffer = AllocateUploadStagingBuffer(size, fill_callback); + + // Create GPU buffer const D3D12_RESOURCE_DESC rd = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE}; - - const D3D12MA::ALLOCATION_DESC cpu_ad = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD}; - - ComPtr cpu_buffer; - ComPtr cpu_allocation; - HRESULT hr = m_allocator->CreateResource( - &cpu_ad, &rd, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, cpu_allocation.put(), IID_PPV_ARGS(cpu_buffer.put())); - pxAssertMsg(SUCCEEDED(hr), "Allocate CPU buffer"); - if (FAILED(hr)) - return false; - - static constexpr const D3D12_RANGE read_range = {}; - const D3D12_RANGE write_range = {0, size}; - void* mapped; - hr = cpu_buffer->Map(0, &read_range, &mapped); - pxAssertMsg(SUCCEEDED(hr), "Map CPU buffer"); - if (FAILED(hr)) - return false; - fill_callback(mapped); - cpu_buffer->Unmap(0, &write_range); - const D3D12MA::ALLOCATION_DESC gpu_ad = {D3D12MA::ALLOCATION_FLAG_COMMITTED, D3D12_HEAP_TYPE_DEFAULT}; - - hr = m_allocator->CreateResource( + HRESULT hr = m_allocator->CreateResource( &gpu_ad, &rd, D3D12_RESOURCE_STATE_COMMON, nullptr, gpu_allocation, IID_PPV_ARGS(gpu_buffer)); pxAssertMsg(SUCCEEDED(hr), "Allocate 
GPU buffer"); if (FAILED(hr)) return false; - GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer.get(), 0, size); + // Copy the data + GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer, 0, size); + // Transition GPU buffer from COPY_DEST to INDEX_BUFFER D3D12_RESOURCE_BARRIER rb = {D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, D3D12_RESOURCE_BARRIER_FLAG_NONE}; rb.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; rb.Transition.pResource = *gpu_buffer; rb.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST; // COMMON -> COPY_DEST at first use. rb.Transition.StateAfter = D3D12_RESOURCE_STATE_INDEX_BUFFER; GetInitCommandList()->ResourceBarrier(1, &rb); - - DeferResourceDestruction(cpu_allocation.get(), cpu_buffer.get()); return true; } +ID3D12Resource* GSDevice12::WriteTextureUploadBuffer(u32 size, std::function write_data, u32& offset_out) +{ + if (!m_texture_stream_buffer.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)) + { + GSDevice12::GetInstance()->ExecuteCommandList( + false, "While waiting for %u bytes in texture upload buffer", size); + if (!m_texture_stream_buffer.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)) + { + Console.Error("Failed to reserve texture upload memory (%u bytes).", size); + return nullptr; + } + } + + offset_out = m_texture_stream_buffer.GetCurrentOffset(); + write_data(m_texture_stream_buffer.GetCurrentHostPointer()); + m_texture_stream_buffer.CommitMemory(size); + return m_texture_stream_buffer.GetBuffer(); +} + +ID3D12Resource* GSDevice12::AllocateUploadStagingBuffer(u32 size, std::function write_data) +{ + wil::com_ptr_nothrow resource; + wil::com_ptr_nothrow allocation; + + // Allocate staging buffer + const D3D12MA::ALLOCATION_DESC allocation_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD}; + const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, + DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
D3D12_RESOURCE_FLAG_NONE}; + HRESULT hr = GetAllocator()->CreateResource(&allocation_desc, &resource_desc, + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(resource.put())); + if (FAILED(hr)) + { + Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr); + return nullptr; + } + + // Map + static constexpr const D3D12_RANGE read_range = {}; + void* map_ptr; + hr = resource->Map(0, &read_range, &map_ptr); + if (FAILED(hr)) + { + Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr); + return nullptr; + } + + // Write data + write_data(map_ptr); + + // Unmap + const D3D12_RANGE write_range = {0, size}; + resource->Unmap(0, &write_range); + + // Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy. + // This adds the reference needed to keep the buffer alive. + DeferResourceDestruction(allocation.get(), resource.get()); + return resource.get(); +} + RenderAPI GSDevice12::GetRenderAPI() const { return RenderAPI::D3D12; @@ -2180,15 +2219,17 @@ void GSDevice12::IASetIndexBuffer(const void* index, size_t count) m_index_stream_buffer.CommitMemory(size); } -void GSDevice12::SetupAccuratePrims(GSHWDrawConfig& config) +void GSDevice12::SetupAccuratePrimsBuffer(GSHWDrawConfig& config) { if (config.accurate_prims) { + // Unbind the buffer. m_dirty_flags |= DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING; const u32 count = config.accurate_prims_edge_data->size(); const u32 size = count * sizeof(AccuratePrimsEdgeData); + // Reserve the GPU region. 
if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData))) { ExecuteCommandListAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer"); @@ -2196,14 +2237,72 @@ void GSDevice12::SetupAccuratePrims(GSHWDrawConfig& config) pxFailRel("Failed to reserve space for accurate prims"); } + const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset(); + + if (InRenderPass()) + EndRenderPass(); + + // Copy data to an upload buffer. + ID3D12Resource* upload_buffer; + u32 upload_buffer_offset; + + const auto upload_data = [&](void* map_ptr) { + std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size); + }; + + // If the texture is larger than half our streaming buffer size, use a separate buffer. + // Otherwise allocation will either fail, or require lots of cmdbuffer submissions. + if (size > m_texture_stream_buffer.GetSize() / 2) + { + upload_buffer_offset = 0; + upload_buffer = AllocateUploadStagingBuffer(size, upload_data); + } + else + { + upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset); + } + if (!upload_buffer) + { + Console.Error("Failed to get upload buffer for accurate prims data."); + return; + } + + // Copy data from upload to GPU buffer. + const D3D12_RESOURCE_BARRIER barrier_sr_to_dst = { + D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, + D3D12_RESOURCE_BARRIER_FLAG_NONE, + {{m_accurate_prims_stream_buffer.GetBuffer(), 0, + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, + D3D12_RESOURCE_STATE_COPY_DEST}}}; + GetCommandList()->ResourceBarrier(1, &barrier_sr_to_dst); + GetCommandList()->CopyBufferRegion( + m_accurate_prims_stream_buffer.GetBuffer(), offset, upload_buffer, upload_buffer_offset, size); + + // Commit the GPU region. + m_accurate_prims_stream_buffer.CommitMemory(size); + + // Issue the barrier since this will be used next draw. 
+ const D3D12_RESOURCE_BARRIER barrier_dst_to_sr = { + D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, + D3D12_RESOURCE_BARRIER_FLAG_NONE, + {{m_accurate_prims_stream_buffer.GetBuffer(), 0, + D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}}; + GetCommandList()->ResourceBarrier(1, &barrier_dst_to_sr); + + m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer. + } +} + +void GSDevice12::SetupAccuratePrimsConstants(GSHWDrawConfig& config) +{ + if (config.accurate_prims) + { config.cb_vs.base_vertex = m_vertex.start; - config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer.GetCurrentOffset() / sizeof(AccuratePrimsEdgeData); + config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData); SetVSConstantBuffer(config.cb_vs); SetPSConstantBuffer(config.cb_ps); - - std::memcpy(m_accurate_prims_stream_buffer.GetCurrentHostPointer(), config.accurate_prims_edge_data->data(), size); - m_accurate_prims_stream_buffer.CommitMemory(size); } } @@ -2394,7 +2493,8 @@ bool GSDevice12::CreateBuffers() return false; } - if (!m_accurate_prims_stream_buffer.Create(ACCURATE_PRIMS_BUFFER_SIZE)) + if (!m_accurate_prims_stream_buffer.Create( + m_features.accurate_prims ? ACCURATE_PRIMS_BUFFER_SIZE : sizeof(AccuratePrimsEdgeData), true)) { Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer"); return false; @@ -2406,8 +2506,17 @@ bool GSDevice12::CreateBuffers() return false; } - // Create the shader resource view for the accurate prims buffer. + if (m_features.accurate_prims) { + // Transition to accurate prims buffer to pixel shader resource and create the shader resource view. 
+ const D3D12_RESOURCE_BARRIER barrier = { + D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, + D3D12_RESOURCE_BARRIER_FLAG_NONE, + {{m_accurate_prims_stream_buffer.GetBuffer(), 0, + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}}; + GetInitCommandList()->ResourceBarrier(1, &barrier); + D3D12_SHADER_RESOURCE_VIEW_DESC desc = { DXGI_FORMAT_UNKNOWN, D3D12_SRV_DIMENSION_BUFFER, D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING}; desc.Buffer.FirstElement = 0; @@ -3940,6 +4049,9 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config) PipelineSelector& pipe = m_pipeline_selector; + // Copying buffers needs to be done outside render pass so do this early. + SetupAccuratePrimsBuffer(config); + // figure out the pipeline UpdateHWPipelineSelector(config); @@ -4321,5 +4433,6 @@ void GSDevice12::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config) IASetIndexBuffer(config.indices, config.nindices); } - SetupAccuratePrims(config); + // Needs to be done after vertex offset is set. + SetupAccuratePrimsConstants(config); } diff --git a/pcsx2/GS/Renderers/DX12/GSDevice12.h b/pcsx2/GS/Renderers/DX12/GSDevice12.h index 7b452a9f4f..f479ae3b12 100644 --- a/pcsx2/GS/Renderers/DX12/GSDevice12.h +++ b/pcsx2/GS/Renderers/DX12/GSDevice12.h @@ -129,6 +129,8 @@ public: // Allocates a temporary CPU staging buffer, fires the callback with it to populate, then copies to a GPU buffer.
bool AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer, D3D12MA::Allocation** gpu_allocation, const std::function& fill_callback); + ID3D12Resource* AllocateUploadStagingBuffer(u32 size, std::function write_data); + ID3D12Resource* WriteTextureUploadBuffer(u32 size, std::function write_data, u32& offset_out); private: struct CommandListResources @@ -307,6 +309,7 @@ private: D3D12StreamBuffer m_vertex_stream_buffer; D3D12StreamBuffer m_index_stream_buffer; D3D12StreamBuffer m_accurate_prims_stream_buffer; + u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw. D3D12DescriptorHandle m_accurate_prims_srv_descriptor_cpu; D3D12DescriptorHandle m_accurate_prims_srv_descriptor_gpu; D3D12StreamBuffer m_vertex_constant_buffer; @@ -465,7 +468,8 @@ public: void IASetVertexBuffer(const void* vertex, size_t stride, size_t count); void IASetIndexBuffer(const void* index, size_t count); - void SetupAccuratePrims(GSHWDrawConfig& config); + void SetupAccuratePrimsBuffer(GSHWDrawConfig& config); + void SetupAccuratePrimsConstants(GSHWDrawConfig& config); void PSSetShaderResource(int i, GSTexture* sr, bool check_state); void PSSetSampler(GSHWDrawConfig::SamplerSelector sel); diff --git a/pcsx2/GS/Renderers/DX12/GSTexture12.cpp b/pcsx2/GS/Renderers/DX12/GSTexture12.cpp index e21f749254..348b5a17c4 100644 --- a/pcsx2/GS/Renderers/DX12/GSTexture12.cpp +++ b/pcsx2/GS/Renderers/DX12/GSTexture12.cpp @@ -350,43 +350,6 @@ ID3D12GraphicsCommandList* GSTexture12::GetCommandBufferForUpdate() return dev->GetInitCommandList(); } -ID3D12Resource* GSTexture12::AllocateUploadStagingBuffer( - const void* data, u32 pitch, u32 upload_pitch, u32 height) const -{ - const u32 buffer_size = CalcUploadSize(height, upload_pitch); - wil::com_ptr_nothrow resource; - wil::com_ptr_nothrow allocation; - - const D3D12MA::ALLOCATION_DESC allocation_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD}; - const D3D12_RESOURCE_DESC resource_desc = 
{D3D12_RESOURCE_DIMENSION_BUFFER, 0, buffer_size, 1, 1, 1, - DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE}; - HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocation_desc, &resource_desc, - D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(resource.put())); - if (FAILED(hr)) - { - Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr); - return nullptr; - } - - void* map_ptr; - hr = resource->Map(0, nullptr, &map_ptr); - if (FAILED(hr)) - { - Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr); - return nullptr; - } - - CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height); - - const D3D12_RANGE write_range = {0, buffer_size}; - resource->Unmap(0, &write_range); - - // Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy. - // This adds the reference needed to keep the buffer alive. 
- GSDevice12::GetInstance()->DeferResourceDestruction(allocation.get(), resource.get()); - return resource.get(); -} - void GSTexture12::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const { const u32 block_size = GetCompressedBlockSize(); @@ -406,7 +369,7 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l const u32 width = Common::AlignUpPow2(r.width(), block_size); const u32 height = Common::AlignUpPow2(r.height(), block_size); const u32 upload_pitch = Common::AlignUpPow2(pitch, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT); - const u32 required_size = CalcUploadSize(r.height(), upload_pitch); + const u32 required_size = CalcUploadSize(height, upload_pitch); D3D12_TEXTURE_COPY_LOCATION srcloc; srcloc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; @@ -416,35 +379,25 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l srcloc.PlacedFootprint.Footprint.Format = m_dxgi_format; srcloc.PlacedFootprint.Footprint.RowPitch = upload_pitch; + const auto upload_data = [&](void* map_ptr) { + CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height); + }; + // If the texture is larger than half our streaming buffer size, use a separate buffer. // Otherwise allocation will either fail, or require lots of cmdbuffer submissions. 
if (required_size > (GSDevice12::GetInstance()->GetTextureStreamBuffer().GetSize() / 2)) { - srcloc.pResource = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height); - if (!srcloc.pResource) - return false; - + srcloc.pResource = GSDevice12::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data); srcloc.PlacedFootprint.Offset = 0; } else { - D3D12StreamBuffer& sbuffer = GSDevice12::GetInstance()->GetTextureStreamBuffer(); - if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)) - { - GSDevice12::GetInstance()->ExecuteCommandList( - false, "While waiting for %u bytes in texture upload buffer", required_size); - if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)) - { - Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size); - return false; - } - } - - srcloc.pResource = sbuffer.GetBuffer(); - srcloc.PlacedFootprint.Offset = sbuffer.GetCurrentOffset(); - CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height); - sbuffer.CommitMemory(required_size); + u32 offset; + srcloc.pResource = GSDevice12::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, offset); + srcloc.PlacedFootprint.Offset = offset; } + if (!srcloc.pResource) + return false; ID3D12GraphicsCommandList* cmdlist = GetCommandBufferForUpdate(); GL_PUSH("GSTexture12::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer); diff --git a/pcsx2/GS/Renderers/DX12/GSTexture12.h b/pcsx2/GS/Renderers/DX12/GSTexture12.h index 49c82d034f..ced6fa545f 100644 --- a/pcsx2/GS/Renderers/DX12/GSTexture12.h +++ b/pcsx2/GS/Renderers/DX12/GSTexture12.h @@ -79,7 +79,6 @@ private: static bool CreateUAVDescriptor(ID3D12Resource* resource, DXGI_FORMAT format, D3D12DescriptorHandle* dh); ID3D12GraphicsCommandList* GetCommandBufferForUpdate(); - ID3D12Resource* AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const; void 
CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const; wil::com_ptr_nothrow m_resource; diff --git a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp index 0271500956..0b7086cb4f 100644 --- a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp +++ b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp @@ -310,10 +310,10 @@ namespace }; } // namespace -std::unique_ptr GLStreamBuffer::Create(GLenum target, u32 size) +std::unique_ptr GLStreamBuffer::Create(GLenum target, u32 size, bool nonsyncing) { std::unique_ptr buf; - if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) + if (!nonsyncing && (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage)) { buf = BufferStorageStreamBuffer::Create(target, size); if (buf) diff --git a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h index 3da57e125e..bd63ec760c 100644 --- a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h +++ b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h @@ -38,7 +38,7 @@ public: /// Returns the minimum granularity of blocks which sync objects will be created around. 
virtual u32 GetChunkSize() const = 0; - static std::unique_ptr Create(GLenum target, u32 size); + static std::unique_ptr Create(GLenum target, u32 size, bool nonsyncing = false); protected: GLStreamBuffer(GLenum target, GLuint buffer_id, u32 size); diff --git a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp index 174f69174f..019ec607fc 100644 --- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp +++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp @@ -260,11 +260,17 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) m_vertex_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE); m_index_stream_buffer = GLStreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE); - m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE); + if (m_features.accurate_prims) + { + // Performance note: prefer a non-syncing buffer for accurate prims so that it is more likely to be GPU local. + // Rationale: we expect this buffer to be updated relatively rarely and it's used as a pixel shader resource. 
+ m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE, true); + } m_vertex_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, VERTEX_UNIFORM_BUFFER_SIZE); m_fragment_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, FRAGMENT_UNIFORM_BUFFER_SIZE); glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &m_uniform_buffer_alignment); - if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_accurate_prims_stream_buffer || + if (!m_vertex_stream_buffer || !m_index_stream_buffer || + (m_features.accurate_prims && !m_accurate_prims_stream_buffer) || !m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer) { Host::ReportErrorAsync("GS", "Failed to create vertex/index/uniform streaming buffers"); diff --git a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp index 0881bdb7e0..59a9f3acbb 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp +++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp @@ -3406,13 +3406,14 @@ void GSDeviceVK::IASetIndexBuffer(const void* index, size_t count) SetIndexBuffer(m_index_stream_buffer.GetBuffer()); } -void GSDeviceVK::SetupAccuratePrims(GSHWDrawConfig& config) +void GSDeviceVK::SetupAccuratePrimsBuffer(GSHWDrawConfig& config) { if (config.accurate_prims) { const u32 count = config.accurate_prims_edge_data->size(); const u32 size = count * sizeof(AccuratePrimsEdgeData); + // Reserve the GPU region. if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData))) { ExecuteCommandBufferAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer"); @@ -3420,17 +3421,120 @@ void GSDeviceVK::SetupAccuratePrims(GSHWDrawConfig& config) pxFailRel("Failed to reserve space for accurate prims"); } + const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset(); + + if (InRenderPass()) + EndRenderPass(); + + // Copy data to an upload buffer. 
+ VkBuffer upload_buffer; + u32 upload_buffer_offset; + + const auto upload_data = [&](void* map_ptr) { + std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size); + }; + + // If the texture is larger than half our streaming buffer size, use a separate buffer. + // Otherwise allocation will either fail, or require lots of cmdbuffer submissions. + if (size > m_texture_stream_buffer.GetCurrentSize() / 2) + { + upload_buffer_offset = 0; + upload_buffer = AllocateUploadStagingBuffer(size, upload_data); + } + else + { + upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset); + } + if (upload_buffer == VK_NULL_HANDLE) + { + Console.Error("Failed to get upload buffer for accurate prims data."); + return; + } + + // Copy data from upload to GPU buffer. + VkBufferCopy copyRegion = {upload_buffer_offset, offset, size}; + vkCmdCopyBuffer(GetCurrentCommandBuffer(), upload_buffer, m_accurate_prims_stream_buffer.GetBuffer(), 1, &copyRegion); + + // Commit the GPU region. + m_accurate_prims_stream_buffer.CommitMemory(size); + + // Issue the barrier since this will be used next draw. + VkBufferMemoryBarrier barrier = { + VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, nullptr, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, + VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, + m_accurate_prims_stream_buffer.GetBuffer(), offset, size}; + vkCmdPipelineBarrier(GetCurrentCommandBuffer(), + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + 0, 0, nullptr, 1, &barrier, 0, nullptr); + + m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer. + } +} + +void GSDeviceVK::SetupAccuratePrimsConstants(GSHWDrawConfig& config) +{ + if (config.accurate_prims) + { + // We separate this from setting up the buffer to mirror Vulkan, which requires it.
config.cb_vs.base_vertex = m_vertex.start; - config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer.GetCurrentOffset() / sizeof(AccuratePrimsEdgeData); + config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData); SetVSConstantBuffer(config.cb_vs); SetPSConstantBuffer(config.cb_ps); - - std::memcpy(m_accurate_prims_stream_buffer.GetCurrentHostPointer(), config.accurate_prims_edge_data->data(), size); - m_accurate_prims_stream_buffer.CommitMemory(size); } } +VkBuffer GSDeviceVK::WriteTextureUploadBuffer(u32 size, std::function write_data, u32& offset_out) +{ + if (!m_texture_stream_buffer.ReserveMemory(size, GetBufferCopyOffsetAlignment())) + { + ExecuteCommandBuffer( + false, "While waiting for %u bytes in texture upload buffer", size); + if (!m_texture_stream_buffer.ReserveMemory(size, GetBufferCopyOffsetAlignment())) + { + Console.Error("Failed to reserve texture upload memory (%u bytes).", size); + return VK_NULL_HANDLE; + } + } + + offset_out = m_texture_stream_buffer.GetCurrentOffset(); + write_data(m_texture_stream_buffer.GetCurrentHostPointer()); + m_texture_stream_buffer.CommitMemory(size); + return m_texture_stream_buffer.GetBuffer(); +} + +VkBuffer GSDeviceVK::AllocateUploadStagingBuffer(u32 size, std::function write_data) +{ + const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast(size), + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr}; + + // Don't worry about setting the coherent bit for this upload, the main reason we had + // that set in StreamBuffer was for MoltenVK, which would upload the whole buffer on + // smaller uploads, but we're writing to the whole thing anyway. 
+ VmaAllocationCreateInfo aci = {}; + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + + VmaAllocationInfo ai; + VkBuffer buffer; + VmaAllocation allocation; + VkResult res = vmaCreateBuffer(GSDeviceVK::GetInstance()->GetAllocator(), &bci, &aci, &buffer, &allocation, &ai); + if (res != VK_SUCCESS) + { + LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: "); + return VK_NULL_HANDLE; + } + + // Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy. + GSDeviceVK::GetInstance()->DeferBufferDestruction(buffer, allocation); + + // And write the data. + write_data(ai.pMappedData); + vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), allocation, 0, size); + return buffer; +} + void GSDeviceVK::OMSetRenderTargets( GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop) { @@ -3762,7 +3866,8 @@ bool GSDeviceVK::CreateBuffers() if (m_features.accurate_prims) { - if (!m_accurate_prims_stream_buffer.Create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ACCURATE_PRIMS_BUFFER_SIZE)) + if (!m_accurate_prims_stream_buffer.Create( + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, ACCURATE_PRIMS_BUFFER_SIZE, true)) { Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer"); return false; @@ -5673,13 +5778,15 @@ GSTextureVK* GSDeviceVK::SetupPrimitiveTrackingDATE(GSHWDrawConfig& config) void GSDeviceVK::RenderHW(GSHWDrawConfig& config) { - const GSVector2i rtsize(config.rt ? config.rt->GetSize() : config.ds->GetSize()); GSTextureVK* draw_rt = static_cast(config.rt); GSTextureVK* draw_ds = static_cast(config.ds); GSTextureVK* draw_rt_clone = nullptr; GSTextureVK* colclip_rt = static_cast(g_gs_device->GetColorClipTexture()); + // Copying buffers needs to be done outside render pass so do this early.
+ SetupAccuratePrimsBuffer(config); + // stream buffer in first, in case we need to exec SetVSConstantBuffer(config.cb_vs); SetPSConstantBuffer(config.cb_ps); @@ -6157,7 +6264,8 @@ void GSDeviceVK::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config) IASetIndexBuffer(config.indices, config.nindices); } - SetupAccuratePrims(config); + // Needs to be done after vertex offset is set. + SetupAccuratePrimsConstants(config); } VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const diff --git a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h index e8eb0f8ff5..17ac9d9f3d 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h +++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h @@ -98,6 +98,8 @@ public: __fi VkCommandBuffer GetCurrentCommandBuffer() const { return m_current_command_buffer; } __fi VKStreamBuffer& GetTextureUploadBuffer() { return m_texture_stream_buffer; } VkCommandBuffer GetCurrentInitCommandBuffer(); + VkBuffer AllocateUploadStagingBuffer(u32 size, std::function write_data); + VkBuffer WriteTextureUploadBuffer(u32 size, std::function write_data, u32& offset_out); /// Allocates a descriptor set from the pool reserved for the current frame. VkDescriptorSet AllocatePersistentDescriptorSet(VkDescriptorSetLayout set_layout); @@ -381,6 +383,7 @@ private: VKStreamBuffer m_vertex_stream_buffer; VKStreamBuffer m_index_stream_buffer; VKStreamBuffer m_accurate_prims_stream_buffer; + u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw. 
VKStreamBuffer m_vertex_uniform_stream_buffer; VKStreamBuffer m_fragment_uniform_stream_buffer; VKStreamBuffer m_texture_stream_buffer; @@ -563,7 +566,8 @@ public: void PSSetShaderResource(int i, GSTexture* sr, bool check_state); void PSSetSampler(GSHWDrawConfig::SamplerSelector sel); - void SetupAccuratePrims(GSHWDrawConfig& config); + void SetupAccuratePrimsBuffer(GSHWDrawConfig& config); + void SetupAccuratePrimsConstants(GSHWDrawConfig& config); void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop = FeedbackLoopFlag_None); diff --git a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp index d7aec5b8af..ff681e525a 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp +++ b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp @@ -270,38 +270,6 @@ void GSTextureVK::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch StringUtil::StrideMemCpy(dst, upload_pitch, src, pitch, std::min(upload_pitch, pitch), count); } -VkBuffer GSTextureVK::AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const -{ - const u32 size = upload_pitch * height; - const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast(size), - VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr}; - - // Don't worry about setting the coherent bit for this upload, the main reason we had - // that set in StreamBuffer was for MoltenVK, which would upload the whole buffer on - // smaller uploads, but we're writing to the whole thing anyway. 
- VmaAllocationCreateInfo aci = {}; - aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; - - VmaAllocationInfo ai; - VkBuffer buffer; - VmaAllocation allocation; - VkResult res = vmaCreateBuffer(GSDeviceVK::GetInstance()->GetAllocator(), &bci, &aci, &buffer, &allocation, &ai); - if (res != VK_SUCCESS) - { - LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: "); - return VK_NULL_HANDLE; - } - - // Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy. - GSDeviceVK::GetInstance()->DeferBufferDestruction(buffer, allocation); - - // And write the data. - CopyTextureDataForUpload(ai.pMappedData, data, pitch, upload_pitch, height); - vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), allocation, 0, size); - return buffer; -} - void GSTextureVK::UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height, u32 buffer_height, u32 row_length, VkBuffer buffer, u32 buffer_offset) { @@ -333,6 +301,10 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l const u32 upload_pitch = Common::AlignUpPow2(pitch, GSDeviceVK::GetInstance()->GetBufferCopyRowPitchAlignment()); const u32 required_size = CalcUploadSize(height, upload_pitch); + const auto upload_data = [&](void* map_ptr) { + CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height); + }; + // If the texture is larger than half our streaming buffer size, use a separate buffer. // Otherwise allocation will either fail, or require lots of cmdbuffer submissions. 
VkBuffer buffer; @@ -340,29 +312,14 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l if (required_size > (GSDeviceVK::GetInstance()->GetTextureUploadBuffer().GetCurrentSize() / 2)) { buffer_offset = 0; - buffer = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height); - if (buffer == VK_NULL_HANDLE) - return false; + buffer = GSDeviceVK::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data); } else { - VKStreamBuffer& sbuffer = GSDeviceVK::GetInstance()->GetTextureUploadBuffer(); - if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment())) - { - GSDeviceVK::GetInstance()->ExecuteCommandBuffer( - false, "While waiting for %u bytes in texture upload buffer", required_size); - if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment())) - { - Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size); - return false; - } - } - - buffer = sbuffer.GetBuffer(); - buffer_offset = sbuffer.GetCurrentOffset(); - CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height); - sbuffer.CommitMemory(required_size); + buffer = GSDeviceVK::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, buffer_offset); } + if (buffer == VK_NULL_HANDLE) + return false; const VkCommandBuffer cmdbuf = GetCommandBufferForUpdate(); GL_PUSH("GSTextureVK::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer); diff --git a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h index 41cd2edbb2..121946eb01 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h +++ b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h @@ -84,7 +84,6 @@ private: VkCommandBuffer GetCommandBufferForUpdate(); void CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const; - VkBuffer AllocateUploadStagingBuffer(const void* data, u32 pitch, 
u32 upload_pitch, u32 height) const; void UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height, u32 buffer_height, u32 row_length, VkBuffer buffer, u32 buffer_offset); diff --git a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp index 8f6323d6fa..ad4e581f85 100644 --- a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp +++ b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp @@ -19,6 +19,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move) , m_allocation(move.m_allocation) , m_buffer(move.m_buffer) , m_host_pointer(move.m_host_pointer) + , m_device_local(move.m_device_local) , m_tracked_fences(std::move(move.m_tracked_fences)) { move.m_size = 0; @@ -28,6 +29,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move) move.m_allocation = VK_NULL_HANDLE; move.m_buffer = VK_NULL_HANDLE; move.m_host_pointer = nullptr; + move.m_device_local = false; } VKStreamBuffer::~VKStreamBuffer() @@ -48,19 +50,29 @@ VKStreamBuffer& VKStreamBuffer::operator=(VKStreamBuffer&& move) std::swap(m_buffer, move.m_buffer); std::swap(m_host_pointer, move.m_host_pointer); std::swap(m_tracked_fences, move.m_tracked_fences); + std::swap(m_device_local, move.m_device_local); return *this; } -bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size) +bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size, bool device_local) { const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast(size), usage, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr}; VmaAllocationCreateInfo aci = {}; - aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; - aci.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + if (device_local) + { + // GPU default buffer + aci.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + } + else + { + // CPU upload buffer + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + aci.preferredFlags = 
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + } VmaAllocationInfo ai = {}; VkBuffer new_buffer = VK_NULL_HANDLE; @@ -83,7 +95,8 @@ bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size) m_tracked_fences.clear(); m_allocation = new_allocation; m_buffer = new_buffer; - m_host_pointer = static_cast(ai.pMappedData); + m_host_pointer = device_local ? nullptr : static_cast(ai.pMappedData); + m_device_local = device_local; return true; } @@ -104,6 +117,7 @@ void VKStreamBuffer::Destroy(bool defer) m_buffer = VK_NULL_HANDLE; m_allocation = VK_NULL_HANDLE; m_host_pointer = nullptr; + m_device_local = false; } bool VKStreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment) @@ -180,8 +194,11 @@ void VKStreamBuffer::CommitMemory(u32 final_num_bytes) pxAssert((m_current_offset + final_num_bytes) <= m_size); pxAssert(final_num_bytes <= m_current_space); - // For non-coherent mappings, flush the memory range - vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), m_allocation, m_current_offset, final_num_bytes); + if (!m_device_local) + { + // For non-coherent mappings, flush the memory range + vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), m_allocation, m_current_offset, final_num_bytes); + } m_current_offset += final_num_bytes; m_current_space -= final_num_bytes; diff --git a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h index e0839a838e..07de25a8cb 100644 --- a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h +++ b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h @@ -30,14 +30,13 @@ public: __fi u32 GetCurrentSpace() const { return m_current_space; } __fi u32 GetCurrentOffset() const { return m_current_offset; } - bool Create(VkBufferUsageFlags usage, u32 size); + bool Create(VkBufferUsageFlags usage, u32 size, bool device_local = false); void Destroy(bool defer); bool ReserveMemory(u32 num_bytes, u32 alignment); void CommitMemory(u32 final_num_bytes); private: - bool AllocateBuffer(VkBufferUsageFlags usage, u32 size); 
void UpdateCurrentFencePosition(); void UpdateGPUPosition(); @@ -51,7 +50,8 @@ private: VmaAllocation m_allocation = VK_NULL_HANDLE; VkBuffer m_buffer = VK_NULL_HANDLE; - u8* m_host_pointer = nullptr; + u8* m_host_pointer = nullptr; // Only used for upload buffers. + bool m_device_local = false; // False for upload buffer; true for default buffer. // List of fences and the corresponding positions in the buffer std::deque> m_tracked_fences;