Merge cbd4a9c92f into cf4412ecbe

2025-12-16 04:08:48 +00:00 · 2025-12-14 20:12:35 +07:00 · 2025-12-14 20:12:35 +07:00 · e6aef219cc
commit e6aef219cc
parent cf4412ecbe cbd4a9c92f
32 changed files with 2650 additions and 409 deletions
--- a/bin/resources/shaders/dx11/tfx.fx
+++ b/bin/resources/shaders/dx11/tfx.fx
@ -1,6 +1,9 @@
 // SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
 // SPDX-License-Identifier: GPL-3.0+

+#define ACCURATE_LINES 1
+#define ACCURATE_TRIANGLES 2
+
 #define FMT_32 0
 #define FMT_24 1
 #define FMT_16 2
@ -21,6 +24,11 @@
 #define GS_FORWARD_PRIMID 0
 #endif

+#ifndef ZTST_GEQUAL
+#define ZTST_GEQUAL 2
+#define ZTST_GREATER 3
+#endif
+
 #ifndef PS_FST
 #define PS_IIP 0
 #define PS_FST 0
@ -84,6 +92,7 @@
 #define SW_BLEND_NEEDS_RT (SW_BLEND && (PS_BLEND_A == 1 || PS_BLEND_B == 1 || PS_BLEND_C == 1 || PS_BLEND_D == 1))
 #define SW_AD_TO_HW (PS_BLEND_C == 1 && PS_A_MASKED)
 #define NEEDS_RT_FOR_AFAIL (PS_AFAIL == 3 && PS_NO_COLOR1)
+#define NEEDS_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP)

 struct VS_INPUT
 {
@ -94,6 +103,9 @@ struct VS_INPUT
 	uint z : POSITION1;
 	uint2 uv : TEXCOORD2;
 	float4 f : COLOR1;
+#ifdef VS_ACCURATE_PRIMS
+	uint vertex_id : SV_VertexID;
+#endif
 };

 struct VS_OUTPUT
@ -107,6 +119,12 @@ struct VS_OUTPUT
 #else
 	nointerpolation float4 c : COLOR0;
 #endif
+#if VS_ACCURATE_PRIMS
+	nointerpolation uint accurate_prims_index : TEXCOORD3;
+#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+	nointerpolation uint accurate_triangles_interior : TEXCOORD4;
+#endif
+#endif
 };

 struct PS_INPUT
@ -122,6 +140,38 @@ struct PS_INPUT
 #if (PS_DATE >= 1 && PS_DATE <= 3) || GS_FORWARD_PRIMID
 	uint primid : SV_PrimitiveID;
 #endif
+#if PS_ACCURATE_PRIMS
+	nointerpolation uint accurate_prims_index : TEXCOORD3;
+#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+	nointerpolation uint accurate_triangles_interior : TEXCOORD4;
+#endif
+#endif
+};
+
+struct AccuratePrimsEdgeData // One record per accurate line / triangle edge; the pixel shader reads it from accurate_prims_data.
+{
+	// Endpoint attributes blended by InterpolateAttributesManual(). The number in each comment is the field's byte offset.
+	float4 t_float0; // 0 - endpoint 0 value blended into input.t
+	float4 t_float1; // 16 - endpoint 1 value blended into input.t
+	float4 t_int0; // 32 - endpoint 0 value blended into input.ti
+	float4 t_int1; // 48 - endpoint 1 value blended into input.ti
+	float4 c0; // 64 - endpoint 0 value blended into input.c (result clamped to [0, 255])
+	float4 c1; // 80 - endpoint 1 value blended into input.c
+	float4 p0; // 96 - endpoint 0 position; only .z is read by the shader (blended into input.p.z)
+	float4 p1; // 112 - endpoint 1 position; only .z is read by the shader
+	int4 edge0; // 128 - dotted with (x, y, 1, 0) of the pixel; triangle-edge pixels with a result <= 0 are discarded
+	int4 edge1; // 144 - second such test, same convention as edge0
+	int2 xy0; // 160 - endpoint 0 in fixed point, 16 subpixels per pixel
+	int2 xy1; // 168 - endpoint 1 in fixed point, 16 subpixels per pixel
+	uint step_x; // 176 - non-zero: X is the major axis, zero: Y is the major axis
+	uint draw0; // 180 - lines only: zero discards pixels at endpoint 0's rounded major coordinate
+	uint draw1; // 184 - lines only: zero discards pixels at endpoint 1's rounded major coordinate
+	uint top_left; // 188 - triangle edges only: selects coverage/pixel when the edge lands exactly on a pixel boundary value
+	uint side; // 192 - triangle edges only: selects which of the two candidate minor-axis pixels is kept
+	uint _pad0; // 196 - padding
+	uint _pad1; // 200 - padding
+	uint _pad2; // 204 - padding
+	// Total 208 bytes
 };

 #ifdef PIXEL_SHADER
@ -147,6 +197,8 @@ Texture2D<float4> Texture : register(t0);
 Texture2D<float4> Palette : register(t1);
 Texture2D<float4> RtTexture : register(t2);
 Texture2D<float> PrimMinTexture : register(t3);
+Texture2D<float> DepthTexture : register(t4);
+StructuredBuffer<AccuratePrimsEdgeData> accurate_prims_data : register(t5);
 SamplerState TextureSampler : register(s0);

 #ifdef DX12
@ -172,6 +224,12 @@ cbuffer cb1
 	float4x4 DitherMatrix;
 	float ScaledScaleFactor;
 	float RcpScaleFactor;
+	uint _pad0;
+	uint _pad1;
+	uint accurate_prims_base_index;
+	uint _pad2;
+	uint _pad3;
+	uint _pad4;
 };

 float4 sample_c(float2 uv, float uv_w, int2 xy)
@ -1015,9 +1073,242 @@ void ps_blend(inout float4 Color, inout float4 As_rgba, float2 pos_xy)
 	}
 }

+#if PS_ACCURATE_PRIMS
+// Manually blend the two endpoint attribute sets of a line/edge into `input` (weight0 scales endpoint 0, weight1 scales endpoint 1).
+void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1, inout PS_INPUT input)
+{
+	const float w0 = float(weight0);
+	const float w1 = float(weight1);
+	const float w_sum = float(weight0 + weight1);
+
+	const float4 t_blend = (w1 * data.t_float1 + w0 * data.t_float0) / w_sum;
+	const float4 ti_blend = (w1 * data.t_int1 + w0 * data.t_int0) / w_sum;
+	const float4 c_blend = (w1 * data.c1 + w0 * data.c0) / w_sum;
+	const float z_blend = (w1 * data.p1.z + w0 * data.p0.z) / w_sum;
+
+	// Components that are identical at both endpoints use a lerp factor of 1, steering them to endpoint 1's value.
+	input.t = lerp(t_blend, data.t_float1, float4(data.t_float0 == data.t_float1));
+	input.ti = lerp(ti_blend, data.t_int1, float4(data.t_int0 == data.t_int1));
+	input.c = lerp(c_blend, data.c1, float4(data.c0 == data.c1));
+	input.p.z = (data.p0.z != data.p1.z) ? z_blend : data.p1.z;
+
+	// Range limits: color to [0, 255]; t.z and depth are normalized, so [0, 1].
+	input.p.z = clamp(input.p.z, 0.0f, 1.0f);
+	input.t.z = clamp(input.t.z, 0.0f, 1.0f);
+	input.c = clamp(input.c, 0.0f, 255.0f);
+}
+#endif
+
+#if PS_ACCURATE_PRIMS == ACCURATE_LINES
+void HandleAccurateLines(inout PS_INPUT input, out float alpha_coverage) // Tests this pixel against the line record selected by input.accurate_prims_index: discards it if the line does not cover it, otherwise writes alpha_coverage and the manually interpolated attributes into input.
+{
+	AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + input.accurate_prims_index];
+
+	int2 xy0 = data.xy0;
+	int2 xy1 = data.xy1;
+	int2 dxy = xy1 - xy0;
+	int2 xy0_i = (xy0 + 8) & ~0xF; // Add half a pixel (8/16) and clear the 4 fractional bits.
+	int2 xy1_i = (xy1 + 8) & ~0xF; // Same rounding for endpoint 1.
+	bool step_x = bool(data.step_x);
+	bool draw0 = bool(data.draw0);
+	bool draw1 = bool(data.draw1);
+
+	// 4-bit fixed point: 16 subpixels per pixel
+	int2 xy_i = 16 * int2(floor(input.p.xy)); // floor() drops the half-integer pixel center; x16 converts the pixel index to subpixel units.
+
+	// Determine major/minor axes: step_x selects X as the major axis, otherwise Y.
+	int major0 = step_x ? xy0.x : xy0.y;
+	int major1 = step_x ? xy1.x : xy1.y;
+	int minor0 = step_x ? xy0.y : xy0.x;
+	int minor1 = step_x ? xy1.y : xy1.x;
+	int major_i = step_x ? xy_i.x : xy_i.y;
+	int minor_i = step_x ? xy_i.y : xy_i.x;
+	int d_major = step_x ? dxy.x : dxy.y; // NOTE(review): used as a divisor below; assumes the host never submits a line with d_major == 0 - confirm.
+	int d_major_scaled = 16 * d_major;
+
+	int major0_i = step_x ? xy0_i.x : xy0_i.y;
+	int major1_i = step_x ? xy1_i.x : xy1_i.y;
+
+	// Discard if outside line range (rounded endpoints, inclusive on both ends)
+	if (major_i < min(major0_i, major1_i) ||
+		major_i > max(major0_i, major1_i))
+		discard;
+
+	if ((major_i == major0_i && !draw0) || // Endpoint pixels are only kept when their draw flag is set.
+		(major_i == major1_i && !draw1))
+		discard;
+
+	int weight0 = major1 - major_i; // Weight of endpoint 0; weight0 + weight1 == d_major.
+	int weight1 = major_i - major0; // Weight of endpoint 1.
+
+	// Compute minor axis line in fixed-point: minor_line is the line's minor coordinate at major_i, scaled by d_major.
+	int minor_line = weight1 * minor1 + weight0 * minor0;
+
+#if PS_ACCURATE_PRIMS_AA
+	// Proper fixed-point AA rounding: the line touches the pixel at minor_i_expected_0 and the next one (+16); alpha_i_0 + alpha_i_1 == d_major_scaled.
+	int minor_i_expected_0 = (minor_line / d_major) & ~0xF;
+	int minor_i_expected_1 = minor_i_expected_0 + 16;
+	int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
+	int alpha_i_1 = d_major_scaled - alpha_i_0;
+
+	int alpha_i;
+	if (minor_i == minor_i_expected_0)
+		alpha_i = alpha_i_0;
+	else if (minor_i == minor_i_expected_1)
+		alpha_i = alpha_i_1;
+	else
+	{
+		alpha_i = 0; // Prevent compiler warning.
+		discard;
+	}
+	// Make sure that the output alpha is always <= 127 for AA. The ratio alpha_i / d_major_scaled is scaled to 128 before clamping.
+	alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
+#else
+	// Non-AA: fixed-point rounding and 4-bit alignment (adds half a pixel before dividing, then keeps only the whole-pixel part)
+	int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF;
+	if (minor_i != minor_i_expected)
+		discard;
+	alpha_coverage = 128.0f;
+#endif
+
+	// Interpolate attributes along the major axis with the same integer weights used for the position.
+	InterpolateAttributesManual(data, weight0, weight1, input);
+}
+#endif
+
+#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+void HandleAccurateTrianglesEdge(inout PS_INPUT input, out float alpha_coverage) // Tests this pixel against the triangle-edge record selected by input.accurate_prims_index: discards it if the edge does not own it, otherwise writes alpha_coverage and the manually interpolated attributes into input.
+{
+	AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + input.accurate_prims_index];
+
+	int2 xy0 = data.xy0;
+	int2 xy1 = data.xy1;
+	int2 dxy = xy1 - xy0;
+	int2 xy0_i = (xy0 + 8) & ~0xF; // Add half a pixel (8/16) and clear the 4 fractional bits.
+	int2 xy1_i = (xy1 + 8) & ~0xF; // Same rounding for endpoint 1.
+	bool step_x = bool(data.step_x);
+	bool side = bool(data.side);
+	bool top_left = bool(data.top_left);
+
+	// 4-bit fixed point: 16 subpixels per pixel
+	int2 xy_i = 16 * int2(floor(input.p.xy)); // floor() drops the half-integer pixel center; x16 converts the pixel index to subpixel units.
+
+	// Determine major/minor axes: step_x selects X as the major axis, otherwise Y.
+	int major0 = step_x ? xy0.x : xy0.y;
+	int major1 = step_x ? xy1.x : xy1.y;
+	int minor0 = step_x ? xy0.y : xy0.x;
+	int minor1 = step_x ? xy1.y : xy1.x;
+	int major_i = step_x ? xy_i.x : xy_i.y;
+	int minor_i = step_x ? xy_i.y : xy_i.x;
+	int d_major = step_x ? dxy.x : dxy.y; // NOTE(review): used as a divisor below; assumes the host never submits an edge with d_major == 0 - confirm.
+	int d_major_scaled = 16 * d_major;
+
+	int major0_i = step_x ? xy0_i.x : xy0_i.y;
+	int major1_i = step_x ? xy1_i.x : xy1_i.y;
+
+	// Discard if outside edge range.
+	// Note: this is not exactly what the SW rasterizer does.
+	// See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking.
+	if (major_i < min(major0_i, major1_i) ||
+		major_i > max(major0_i, major1_i))
+		discard;
+
+	// Discard if on wrong side of other edges: each test evaluates edge.x * x + edge.y * y + edge.z and rejects results <= 0.
+	if (dot(data.edge0, int4(xy_i, 1, 0)) <= 0 ||
+		dot(data.edge1, int4(xy_i, 1, 0)) <= 0)
+		discard;
+
+	int weight0 = major1 - major_i; // Weight of endpoint 0; weight0 + weight1 == d_major.
+	int weight1 = major_i - major0; // Weight of endpoint 1.
+
+	// Compute minor axis line in fixed-point: minor_line is the edge's minor coordinate at major_i, scaled by d_major.
+	int minor_line = weight1 * minor1 + weight0 * minor0;
+	int minor_i_expected = minor_line / d_major;
+	int minor_i_expected_0 = minor_i_expected & ~0xF; // Whole-pixel part of the crossing.
+	int minor_i_expected_1 = minor_i_expected_0 + 16; // Next pixel along the minor axis.
+	int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
+	int alpha_i_1 = d_major_scaled - alpha_i_0; // alpha_i_0 + alpha_i_1 == d_major_scaled.
+
+	// Proper fixed-point AA rounding: pick the single pixel this edge owns at major_i and its coverage numerator.
+	int alpha_i;
+	if ((minor_i_expected & 0xF) == 0)
+	{
+		// On a pixel center: top_left edges get zero coverage and move one pixel (direction chosen by side); other edges get full coverage in place.
+		alpha_i = top_left ? 0 : d_major_scaled;
+		minor_i_expected += top_left ? (side ? -16 : 16) : 0;
+	}
+	else if (side)
+	{
+		minor_i_expected = minor_i_expected_0;
+		alpha_i = alpha_i_0;
+	}
+	else
+	{
+		minor_i_expected = minor_i_expected_1;
+		alpha_i = alpha_i_1;
+	}
+	if (minor_i != minor_i_expected)
+		discard;
+
+#if PS_ACCURATE_PRIMS_AA
+	// Make sure that the output alpha is always <= 127 for AA. The ratio alpha_i / d_major_scaled is scaled to 128 before clamping.
+	alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
+#else
+	alpha_coverage = 128.0f;
+#endif
+
+	// Interpolate attributes along the major axis with the same integer weights used for the position.
+	InterpolateAttributesManual(data, weight0, weight1, input);
+}
+#endif
+
 PS_OUTPUT ps_main(PS_INPUT input)
 {
+#if PS_ACCURATE_PRIMS
+	float alpha_coverage;
+#if PS_ACCURATE_PRIMS == ACCURATE_LINES
+	HandleAccurateLines(input, alpha_coverage);
+#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+	if (bool(input.accurate_triangles_interior))
+	{
+		alpha_coverage = 128.0f;
+	}
+	else
+	{
+		HandleAccurateTrianglesEdge(input, alpha_coverage);
+	}
+#endif
+#endif // PS_ACCURATE_PRIMS
+
+#if NEEDS_DEPTH
+	float current_depth = DepthTexture.Load(int3(floor(input.p.xy), 0)).r;
+#endif
+
+#if PS_ZCLAMP && (PS_ZTST == ZTST_GEQUAL || PS_ZTST == ZTST_GREATER)
+	#if PS_ZTST == ZTST_GEQUAL
+		if (input.p.z < current_depth)
+			discard;
+	#elif PS_ZTST == ZTST_GREATER
+	if (input.p.z <= current_depth)
+		discard;
+	#endif
+#endif // PS_ZTST
+
 	float4 C = ps_color(input);
+
+#if PS_FIXED_ONE_A
+	// AA (Fixed one) will output a coverage of 1.0 as alpha
+	C.a = 128.0f;
+#elif PS_ACCURATE_PRIMS_AA
+	// AA: coverage is computed in alpha_coverage
+	#if PS_ACCURATE_PRIMS_AA_ABE
+		if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128.
+			C.a = alpha_coverage;
+	#else
+		C.a = alpha_coverage;
+	#endif
+#endif
+
 	bool atst_pass = atst(C);

 #if PS_AFAIL == 0 // KEEP or ATST off
@ -1034,14 +1325,6 @@ PS_OUTPUT ps_main(PS_INPUT input)
 			discard;
 	}

-	// Must be done before alpha correction
-
-	// AA (Fixed one) will output a coverage of 1.0 as alpha
-	if (PS_FIXED_ONE_A)
-	{
-		C.a = 128.0f;
-	}
-
 	float4 alpha_blend = (float4)0.0f;
 	if (SW_AD_TO_HW)
 	{
@ -1210,7 +1493,14 @@ PS_OUTPUT ps_main(PS_INPUT input)
 #endif // PS_DATE != 1/2

 #if PS_ZCLAMP
+	#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+		if (bool(input.accurate_triangles_interior))
 			output.depth = min(input.p.z, MaxDepthPS);
+		else
+			output.depth = current_depth; // No depth update for triangle edges.
+	#else
+		output.depth = min(input.p.z, MaxDepthPS);
+	#endif
 #endif

 	return output;
@ -1236,7 +1526,9 @@ cbuffer cb0
 	float2 TextureOffset;
 	float2 PointSize;
 	uint MaxDepth;
-	uint BaseVertex; // Only used in DX11.
+	uint pad_cb0;
+	uint BaseVertex;
+	uint pad_cb0_2;
 };

 VS_OUTPUT vs_main(VS_INPUT input)
@ -1256,6 +1548,28 @@ VS_OUTPUT vs_main(VS_INPUT input)
 	output.p.xy = output.p.xy * float2(VertexScale.x, -VertexScale.y) - float2(VertexOffset.x, -VertexOffset.y);
 	output.p.z *= exp2(-32.0f);		// integer->float depth

+	#if VS_ACCURATE_PRIMS == ACCURATE_LINES
+		output.accurate_prims_index = input.vertex_id / 6;
+		output.t = 0.0f;
+		output.ti = 0.0f;
+		output.c = 0.0f;
+		return output; // Don't send line vertex attributes - they are interpolated manually in the pixel shader.
+	#elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+		uint prim_id = input.vertex_id / 21;
+		output.accurate_triangles_interior = uint((input.vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior.
+		if (!bool(output.accurate_triangles_interior))
+		{
+			uint edge = (input.vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge.
+			output.accurate_prims_index = 3 * prim_id + edge;
+			output.t = 0.0f;
+			output.ti = 0.0f;
+			output.c = 0.0f;
+			return output; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader.
+		}
+		output.accurate_prims_index = 0;
+		// Send the interior vertex attributes for fixed function interpolation.
+	#endif
+
 	if(VS_TME)
 	{
 		float2 uv = input.uv - TextureOffset;
--- a/bin/resources/shaders/opengl/tfx_fs.glsl
+++ b/bin/resources/shaders/opengl/tfx_fs.glsl
@ -3,6 +3,9 @@

 //#version 420 // Keep it for text editor detection

+#define ACCURATE_LINES 1
+#define ACCURATE_TRIANGLES 2
+
 #define FMT_32 0
 #define FMT_24 1
 #define FMT_16 2
@ -11,6 +14,11 @@
 #define SHUFFLE_WRITE 2
 #define SHUFFLE_READWRITE 3

+#ifndef ZTST_GEQUAL
+#define ZTST_GEQUAL 2
+#define ZTST_GREATER 3
+#endif
+
 // TEX_COORD_DEBUG output the uv coordinate as color. It is useful
 // to detect bad sampling due to upscaling
 //#define TEX_COORD_DEBUG
@ -28,6 +36,9 @@
 #define NEEDS_RT_FOR_AFAIL (PS_AFAIL == 3 && PS_NO_COLOR1)
 #define NEEDS_RT (NEEDS_RT_EARLY || NEEDS_RT_FOR_AFAIL || (!PS_PRIMID_INIT && (PS_FBMASK || SW_BLEND_NEEDS_RT || SW_AD_TO_HW)))
 #define NEEDS_TEX (PS_TFX != 4)
+#define NEEDS_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP)
+
+vec4 FragCoord;

 layout(std140, binding = 0) uniform cb21
 {
@ -57,8 +68,71 @@ layout(std140, binding = 0) uniform cb21

 	float ScaledScaleFactor;
 	float RcpScaleFactor;
+	uint _pad0;
+	uint _pad1;
+
+	uint accurate_prims_base_index;
+	uint _pad2;
+	uint _pad3;
+	uint _pad4;
 };

+#if PS_ACCURATE_PRIMS
+struct // Writable stand-in for the stage inputs, used when PS_ACCURATE_PRIMS is enabled.
+{
+	vec4 t_float; // Set by InterpolateAttributesManual() or copied from PSinReal.t_float in ps_main().
+	vec4 t_int; // Set by InterpolateAttributesManual() or copied from PSinReal.t_int in ps_main().
+	vec4 c; // Set by InterpolateAttributesManual() (clamped to [0, 255]) or copied from PSinReal.c in ps_main().
+} PSin;
+
+in SHADER // The real stage-input block; in this variant it is instanced as PSinReal because PSin is a plain struct.
+{
+	vec4 t_float;
+	vec4 t_int;
+
+	#if PS_IIP != 0
+		vec4 c; // Interpolated color when PS_IIP is non-zero.
+	#else
+		flat vec4 c; // Flat (non-interpolated) color otherwise.
+	#endif
+} PSinReal; // ps_main() copies these into PSin for triangle-interior fragments.
+
+flat in uint accurate_prims_index;
+#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+flat in uint accurate_triangles_interior;
+#endif
+
+struct AccuratePrimsEdgeData // One record per accurate line / triangle edge; the fragment shader reads it from accurate_prims_data.
+{
+	// Endpoint attributes blended by InterpolateAttributesManual(). The number in each comment is the field's byte offset.
+	vec4 t_float0; // 0 - endpoint 0 value blended into PSin.t_float
+	vec4 t_float1; // 16 - endpoint 1 value blended into PSin.t_float
+	vec4 t_int0; // 32 - endpoint 0 value blended into PSin.t_int
+	vec4 t_int1; // 48 - endpoint 1 value blended into PSin.t_int
+	vec4 c0; // 64 - endpoint 0 value blended into PSin.c (result clamped to [0, 255])
+	vec4 c1; // 80 - endpoint 1 value blended into PSin.c
+	vec4 p0; // 96 - endpoint 0 position; only .z is read by the shader (blended into FragCoord.z)
+	vec4 p1; // 112 - endpoint 1 position; only .z is read by the shader
+	ivec4 edge0; // 128 - dotted with (x, y, 1, 0) of the pixel; triangle-edge pixels with a result <= 0 are discarded
+	ivec4 edge1; // 144 - second such test, same convention as edge0
+	ivec2 xy0; // 160 - endpoint 0 in fixed point, 16 subpixels per pixel
+	ivec2 xy1; // 168 - endpoint 1 in fixed point, 16 subpixels per pixel
+	uint step_x; // 176 - non-zero: X is the major axis, zero: Y is the major axis
+	uint draw0; // 180 - lines only: zero discards pixels at endpoint 0's rounded major coordinate
+	uint draw1; // 184 - lines only: zero discards pixels at endpoint 1's rounded major coordinate
+	uint top_left; // 188 - triangle edges only: selects coverage/pixel when the edge lands exactly on a pixel boundary value
+	uint side; // 192 - triangle edges only: selects which of the two candidate minor-axis pixels is kept
+	uint _pad0; // 196 - padding
+	uint _pad1; // 200 - padding
+	uint _pad2; // 204 - padding
+	// Total 208 bytes
+};
+
+layout (std140, binding = 3) readonly buffer AccuratePrimsEdgeDataBuffer { // The shader only ever reads this storage buffer, so it is declared readonly.
+	AccuratePrimsEdgeData accurate_prims_data[]; // Indexed with accurate_prims_base_index + accurate_prims_index.
+};
+
+#else
 in SHADER
 {
 	vec4 t_float;
@ -70,6 +144,7 @@ in SHADER
 		flat vec4 c;
 	#endif
 } PSin;
+#endif

 #define TARGET_0_QUALIFIER out

@ -107,9 +182,10 @@ layout(binding = 2) uniform sampler2D RtSampler; // note 2 already use by the im

 #if PS_DATE == 3
 layout(binding = 3) uniform sampler2D img_prim_min;
+#endif

-// I don't remember why I set this parameter but it is surely useless
-//layout(pixel_center_integer) in vec4 gl_FragCoord;
+#if NEEDS_DEPTH
+layout(binding = 4) uniform sampler2D DepthSampler;
 #endif

 vec4 sample_from_rt()
@ -119,7 +195,16 @@ vec4 sample_from_rt()
 #elif HAS_FRAMEBUFFER_FETCH
 	return LAST_FRAG_COLOR;
 #else
-	return texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0);
+	return texelFetch(RtSampler, ivec2(FragCoord.xy), 0);
+#endif
+}
+
+vec4 sample_from_depth() // Depth texel under the current fragment, or all zeros in variants built without NEEDS_DEPTH.
+{
+#if NEEDS_DEPTH
+	return texelFetch(DepthSampler, ivec2(FragCoord.xy), 0);
+#else
+	return vec4(0.0);
 #endif
 }

@ -315,7 +400,7 @@ int fetch_raw_depth()
 #if PS_TEX_IS_FB == 1
 	return int(sample_from_rt().r * multiplier);
 #else
-	return int(texelFetch(TextureSampler, ivec2(gl_FragCoord.xy), 0).r * multiplier);
+	return int(texelFetch(TextureSampler, ivec2(FragCoord.xy), 0).r * multiplier);
 #endif
 }

@ -324,7 +409,7 @@ vec4 fetch_raw_color()
 #if PS_TEX_IS_FB == 1
 	return sample_from_rt();
 #else
-	return texelFetch(TextureSampler, ivec2(gl_FragCoord.xy), 0);
+	return texelFetch(TextureSampler, ivec2(FragCoord.xy), 0);
 #endif
 }

@ -724,9 +809,9 @@ void ps_dither(inout vec3 C, float As)
 {
 #if PS_DITHER > 0 && PS_DITHER < 3
 	#if PS_DITHER == 2
-		ivec2 fpos = ivec2(gl_FragCoord.xy);
+		ivec2 fpos = ivec2(FragCoord.xy);
 	#else
-		ivec2 fpos = ivec2(gl_FragCoord.xy * RcpScaleFactor);
+		ivec2 fpos = ivec2(FragCoord.xy * RcpScaleFactor);
 	#endif
 		float value = DitherMatrix[fpos.y&3][fpos.x&3];

@ -967,11 +1052,233 @@ float As = As_rgba.a;
 #endif
 }

+#if PS_ACCURATE_PRIMS
+// Manually blend the two endpoint attribute sets of a line/edge into PSin and FragCoord.z (weight0 scales endpoint 0, weight1 scales endpoint 1).
+void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1)
+{
+	float w0 = float(weight0);
+	float w1 = float(weight1);
+	float w_sum = float(weight0 + weight1);
+
+	vec4 t_float_blend = (w1 * data.t_float1 + w0 * data.t_float0) / w_sum;
+	vec4 t_int_blend = (w1 * data.t_int1 + w0 * data.t_int0) / w_sum;
+	vec4 c_blend = (w1 * data.c1 + w0 * data.c0) / w_sum;
+	float z_blend = (w1 * data.p1.z + w0 * data.p0.z) / w_sum;
+
+	// Components that are identical at both endpoints take endpoint 1's value directly (a boolean mix selects per component).
+	PSin.t_float = mix(t_float_blend, data.t_float1, equal(data.t_float0, data.t_float1));
+	PSin.t_int = mix(t_int_blend, data.t_int1, equal(data.t_int0, data.t_int1));
+	PSin.c = mix(c_blend, data.c1, equal(data.c0, data.c1));
+	FragCoord.z = (data.p0.z != data.p1.z) ? z_blend : data.p1.z;
+
+	// Range limits: color to [0, 255]; t_float.z and depth are normalized, so [0, 1].
+	FragCoord.z = clamp(FragCoord.z, 0.0f, 1.0f);
+	PSin.t_float.z = clamp(PSin.t_float.z, 0.0f, 1.0f);
+	PSin.c = clamp(PSin.c, 0.0f, 255.0f);
+}
+#endif
+
+#if PS_ACCURATE_PRIMS == ACCURATE_LINES
+void HandleAccurateLines(out float alpha_coverage) // Tests this pixel against the line record selected by accurate_prims_index: discards it if the line does not cover it, otherwise writes alpha_coverage and the manually interpolated attributes (PSin, FragCoord.z).
+{
+	AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index];
+
+	ivec2 xy0 = data.xy0;
+	ivec2 xy1 = data.xy1;
+	ivec2 dxy = xy1 - xy0;
+	ivec2 xy0_i = (xy0 + 8) & ~0xF; // Add half a pixel (8/16) and clear the 4 fractional bits.
+	ivec2 xy1_i = (xy1 + 8) & ~0xF; // Same rounding for endpoint 1.
+	bool step_x = bool(data.step_x);
+	bool draw0 = bool(data.draw0);
+	bool draw1 = bool(data.draw1);
+
+	// 4-bit fixed point: 16 subpixels per pixel
+	ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // floor() drops the half-integer pixel center; x16 converts the pixel index to subpixel units.
+
+	// Determine major/minor axes: step_x selects X as the major axis, otherwise Y.
+	int major0 = step_x ? xy0.x : xy0.y;
+	int major1 = step_x ? xy1.x : xy1.y;
+	int minor0 = step_x ? xy0.y : xy0.x;
+	int minor1 = step_x ? xy1.y : xy1.x;
+	int major_i = step_x ? xy_i.x : xy_i.y;
+	int minor_i = step_x ? xy_i.y : xy_i.x;
+	int d_major = step_x ? dxy.x : dxy.y; // NOTE(review): used as a divisor below; assumes the host never submits a line with d_major == 0 - confirm.
+	int d_major_scaled = 16 * d_major;
+
+	int major0_i = step_x ? xy0_i.x : xy0_i.y;
+	int major1_i = step_x ? xy1_i.x : xy1_i.y;
+
+	// Discard if outside line range (rounded endpoints, inclusive on both ends)
+	if (major_i < min(major0_i, major1_i) ||
+		major_i > max(major0_i, major1_i))
+		discard;
+
+	if ((major_i == major0_i && !draw0) || // Endpoint pixels are only kept when their draw flag is set.
+		(major_i == major1_i && !draw1))
+		discard;
+
+	int weight0 = major1 - major_i; // Weight of endpoint 0; weight0 + weight1 == d_major.
+	int weight1 = major_i - major0; // Weight of endpoint 1.
+
+	// Compute minor axis line in fixed-point: minor_line is the line's minor coordinate at major_i, scaled by d_major.
+	int minor_line = weight1 * minor1 + weight0 * minor0;
+
+#if PS_ACCURATE_PRIMS_AA
+	// Proper fixed-point AA rounding: the line touches the pixel at minor_i_expected_0 and the next one (+16); alpha_i_0 + alpha_i_1 == d_major_scaled.
+	int minor_i_expected_0 = (minor_line / d_major) & ~0xF;
+	int minor_i_expected_1 = minor_i_expected_0 + 16;
+	int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
+	int alpha_i_1 = d_major_scaled - alpha_i_0;
+
+	int alpha_i = 0; // Initialized so the discard path below never leaves it undefined (avoids compiler warnings).
+	if (minor_i == minor_i_expected_0)
+		alpha_i = alpha_i_0;
+	else if (minor_i == minor_i_expected_1)
+		alpha_i = alpha_i_1;
+	else
+		discard;
+	// Make sure that the output alpha is always <= 127 for AA. The ratio alpha_i / d_major_scaled is scaled to 128 before clamping.
+	alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
+#else
+	// Non-AA: fixed-point rounding and 4-bit alignment (adds half a pixel before dividing, then keeps only the whole-pixel part)
+	int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF;
+	if (minor_i != minor_i_expected)
+		discard;
+	alpha_coverage = 128.0f;
+#endif
+
+	// Interpolate attributes along the major axis with the same integer weights used for the position.
+	InterpolateAttributesManual(data, weight0, weight1);
+}
+#endif
+
+#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+void HandleAccurateTrianglesEdge(out float alpha_coverage) // Tests this pixel against the triangle-edge record selected by accurate_prims_index: discards it if the edge does not own it, otherwise writes alpha_coverage and the manually interpolated attributes (PSin, FragCoord.z).
+{
+	AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index];
+
+	ivec2 xy0 = data.xy0;
+	ivec2 xy1 = data.xy1;
+	ivec2 dxy = xy1 - xy0;
+	ivec2 xy0_i = (xy0 + 8) & ~0xF; // Add half a pixel (8/16) and clear the 4 fractional bits.
+	ivec2 xy1_i = (xy1 + 8) & ~0xF; // Same rounding for endpoint 1.
+	bool step_x = bool(data.step_x);
+	bool side = bool(data.side);
+	bool top_left = bool(data.top_left);
+
+	// 4-bit fixed point: 16 subpixels per pixel
+	ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // floor() drops the half-integer pixel center; x16 converts the pixel index to subpixel units.
+
+	// Determine major/minor axes: step_x selects X as the major axis, otherwise Y.
+	int major0 = step_x ? xy0.x : xy0.y;
+	int major1 = step_x ? xy1.x : xy1.y;
+	int minor0 = step_x ? xy0.y : xy0.x;
+	int minor1 = step_x ? xy1.y : xy1.x;
+	int major_i = step_x ? xy_i.x : xy_i.y;
+	int minor_i = step_x ? xy_i.y : xy_i.x;
+	int d_major = step_x ? dxy.x : dxy.y; // NOTE(review): used as a divisor below; assumes the host never submits an edge with d_major == 0 - confirm.
+	int d_major_scaled = 16 * d_major;
+
+	int major0_i = step_x ? xy0_i.x : xy0_i.y;
+	int major1_i = step_x ? xy1_i.x : xy1_i.y;
+
+	// Discard if outside edge range.
+	// Note: this is not exactly what the SW rasterizer does.
+	// See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking.
+	if (major_i < min(major0_i, major1_i) ||
+		major_i > max(major0_i, major1_i))
+		discard;
+
+	// Discard if on wrong side of other edges. Written out in integer math: GLSL dot() has no integer overload, so ivec4 operands would be converted to float and could lose precision.
+	if (data.edge0.x * xy_i.x + data.edge0.y * xy_i.y + data.edge0.z <= 0 ||
+		data.edge1.x * xy_i.x + data.edge1.y * xy_i.y + data.edge1.z <= 0)
+		discard;
+
+	int weight0 = major1 - major_i; // Weight of endpoint 0; weight0 + weight1 == d_major.
+	int weight1 = major_i - major0; // Weight of endpoint 1.
+
+	// Compute minor axis line in fixed-point: minor_line is the edge's minor coordinate at major_i, scaled by d_major.
+	int minor_line = weight1 * minor1 + weight0 * minor0;
+	int minor_i_expected = minor_line / d_major;
+	int minor_i_expected_0 = minor_i_expected & ~0xF; // Whole-pixel part of the crossing.
+	int minor_i_expected_1 = minor_i_expected_0 + 16; // Next pixel along the minor axis.
+	// Coverage numerators of the two candidate pixels; alpha_i_0 + alpha_i_1 == d_major_scaled.
+	int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
+	int alpha_i_1 = d_major_scaled - alpha_i_0;
+
+	// Proper fixed-point AA rounding: pick the single pixel this edge owns at major_i and its coverage numerator.
+	int alpha_i;
+	if ((minor_i_expected & 0xF) == 0)
+	{
+		// On a pixel center: top_left edges get zero coverage and move one pixel (direction chosen by side); other edges get full coverage in place.
+		alpha_i = top_left ? 0 : d_major_scaled;
+		minor_i_expected += top_left ? (side ? -16 : 16) : 0;
+	}
+	else if (side)
+	{
+		minor_i_expected = minor_i_expected_0;
+		alpha_i = alpha_i_0;
+	}
+	else
+	{
+		minor_i_expected = minor_i_expected_1;
+		alpha_i = alpha_i_1;
+	}
+	if (minor_i != minor_i_expected)
+		discard;
+
+#if PS_ACCURATE_PRIMS_AA
+	// Make sure that the output alpha is always <= 127 for AA. The ratio alpha_i / d_major_scaled is scaled to 128 before clamping.
+	alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
+#else
+	alpha_coverage = 128.0f;
+#endif
+
+	// Interpolate attributes along the major axis with the same integer weights used for the position.
+	InterpolateAttributesManual(data, weight0, weight1);
+}
+#endif
+
 void ps_main()
 {
+	FragCoord = gl_FragCoord;
+
+#if PS_ACCURATE_PRIMS
+	float alpha_coverage;
+#if PS_ACCURATE_PRIMS == ACCURATE_LINES
+	HandleAccurateLines(alpha_coverage);
+#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+	if (bool(accurate_triangles_interior))
+	{
+		alpha_coverage = 128.0f;
+		PSin.t_float = PSinReal.t_float;
+		PSin.t_int = PSinReal.t_int;
+		PSin.c = PSinReal.c;
+	}
+	else
+	{
+		HandleAccurateTrianglesEdge(alpha_coverage);
+	}
+#endif
+#endif // PS_ACCURATE_PRIMS
+
+#if NEEDS_DEPTH
+	float current_depth = sample_from_depth().r;
+#endif
+
+#if PS_ZCLAMP && (PS_ZTST == ZTST_GEQUAL || PS_ZTST == ZTST_GREATER)
+	#if PS_ZTST == ZTST_GEQUAL
+		if (FragCoord.z < current_depth)
+			discard;
+	#elif PS_ZTST == ZTST_GREATER
+		if (FragCoord.z <= current_depth)
+			discard;
+	#endif
+#endif // PS_ZTST
+
 #if PS_SCANMSK & 2
 	// fail depth test on prohibited lines
-	if ((int(gl_FragCoord.y) & 1) == (PS_SCANMSK & 1))
+	if ((int(FragCoord.y) & 1) == (PS_SCANMSK & 1))
 		discard;
 #endif

@ -1007,7 +1314,7 @@ void ps_main()
 #endif

 #if PS_DATE == 3
-	int stencil_ceil = int(texelFetch(img_prim_min, ivec2(gl_FragCoord.xy), 0).r);
+	int stencil_ceil = int(texelFetch(img_prim_min, ivec2(FragCoord.xy), 0).r);
 	// Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
 	// the bad alpha value so we must keep it.

@ -1017,6 +1324,20 @@ void ps_main()
 #endif

 	vec4 C = ps_color();
+
+#if PS_FIXED_ONE_A
+	// AA (Fixed one) will output a coverage of 1.0 as alpha
+	C.a = 128.0f;
+#elif PS_ACCURATE_PRIMS_AA
+	// AA: coverage is computed in alpha_coverage
+	#if PS_ACCURATE_PRIMS_AA_ABE
+		if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128.
+			C.a = alpha_coverage;
+	#else
+		C.a = alpha_coverage;
+	#endif
+#endif
+
 	bool atst_pass = atst(C);

 #if PS_AFAIL == 0 // KEEP or ATST off
@ -1024,13 +1345,6 @@ void ps_main()
 		discard;
 #endif

-	// Must be done before alpha correction
-
-	// AA (Fixed one) will output a coverage of 1.0 as alpha
-#if PS_FIXED_ONE_A
-	C.a = 128.0f;
-#endif
-
 #if SW_AD_TO_HW
 	#if PS_RTA_CORRECTION
 		vec4 RT = trunc(sample_from_rt() * 128.0f + 0.1f);
@ -1144,6 +1458,13 @@ void ps_main()
 #endif

 #if PS_ZCLAMP
-	gl_FragDepth = min(gl_FragCoord.z, MaxDepthPS);
+	#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+		if (bool(accurate_triangles_interior))
+			gl_FragDepth = min(FragCoord.z, MaxDepthPS);
+		else
+			gl_FragDepth = current_depth; // No depth update for triangle edges.
+	#else
+		gl_FragDepth = min(FragCoord.z, MaxDepthPS);
+	#endif
 #endif
 }
--- a/bin/resources/shaders/opengl/tfx_vgs.glsl
+++ b/bin/resources/shaders/opengl/tfx_vgs.glsl
@ -3,6 +3,16 @@

 //#version 420 // Keep it for text editor detection

+#define ACCURATE_LINES 1
+#define ACCURATE_TRIANGLES 2
+
+#if VS_ACCURATE_PRIMS
+flat out uint accurate_prims_index;
+#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+flat out uint accurate_triangles_interior;
+#endif
+#endif
+
 layout(std140, binding = 1) uniform cb20
 {
 	vec2  VertexScale;
@ -14,6 +24,8 @@ layout(std140, binding = 1) uniform cb20
 	vec2  PointSize;
 	uint  MaxDepth;
 	uint  pad_cb20;
+	uint  BaseVertex;
+	uint  pad_cb20_2;
 };

 #ifdef VERTEX_SHADER
@ -75,6 +87,28 @@ void vs_main()
 	gl_Position.z = float(z) * exp_min32;
 	gl_Position.w = 1.0f;

+	#if VS_ACCURATE_PRIMS == ACCURATE_LINES
+		accurate_prims_index = (gl_VertexID - BaseVertex) / 6;
+		VSout.t_float = vec4(0.0f);
+		VSout.t_int = vec4(0.0f);
+		VSout.c = vec4(0.0f);
+		return; // Don't send line vertex attributes - they are interpolated manually in the fragment shader.
+	#elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+		uint vertex_id = gl_VertexID - BaseVertex;
+		uint prim_id = vertex_id / 21;
+		accurate_triangles_interior = uint((vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior.
+		if (!bool(accurate_triangles_interior))
+		{
+			uint edge = (vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge.
+			accurate_prims_index = 3 * prim_id + edge;
+			VSout.t_float = vec4(0.0f);
+			VSout.t_int = vec4(0.0f);
+			VSout.c = vec4(0.0f);
+			return; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader.
+		}
+		// Send the interior vertex attributes for fixed function interpolation.
+	#endif
+
 	texture_coord();

 	VSout.c = i_c;
--- a/bin/resources/shaders/vulkan/tfx.glsl
+++ b/bin/resources/shaders/vulkan/tfx.glsl
@ -1,12 +1,23 @@
 // SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
 // SPDX-License-Identifier: GPL-3.0+

+#define ACCURATE_LINES 1
+#define ACCURATE_TRIANGLES 2
+
 //////////////////////////////////////////////////////////////////////
 // Vertex Shader
 //////////////////////////////////////////////////////////////////////

+
 #if defined(VERTEX_SHADER)

+#if VS_ACCURATE_PRIMS
+layout(location = 7) flat out uint accurate_prims_index;
+#if VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+layout(location = 8) flat out uint accurate_triangles_interior;
+#endif
+#endif
+
 layout(std140, set = 0, binding = 0) uniform cb0
 {
 	vec2 VertexScale;
@ -16,6 +27,8 @@ layout(std140, set = 0, binding = 0) uniform cb0
 	vec2 PointSize;
 	uint MaxDepth;
 	uint pad_cb0;
+	uint BaseVertex;
+	uint pad_cb0_2;
 };

 layout(location = 0) out VSOutput
@ -55,6 +68,28 @@ void main()
 	gl_Position.z *= exp2(-32.0f);		// integer->float depth
 	gl_Position.y = -gl_Position.y;

+	#if VS_ACCURATE_PRIMS == ACCURATE_LINES
+		accurate_prims_index = (gl_VertexIndex - BaseVertex) / 6;
+		vsOut.t = vec4(0.0f);
+		vsOut.ti = vec4(0.0f);
+		vsOut.c = vec4(0.0f);
+		return; // Don't send line vertex attributes - they are interpolated manually in the fragment shader.
+	#elif VS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+		uint vertex_id = gl_VertexIndex - BaseVertex;
+		uint prim_id = vertex_id / 21;
+		accurate_triangles_interior = uint((vertex_id - 21 * prim_id) < 3); // First 3 vertices in each group of 21 is interior.
+		if (!bool(accurate_triangles_interior))
+		{
+			uint edge = (vertex_id - 21 * prim_id - 3) / 6; // Each group of 6 vertices after first 3 is one edge.
+			accurate_prims_index = 3 * prim_id + edge;
+			vsOut.t = vec4(0.0f);
+			vsOut.ti = vec4(0.0f);
+			vsOut.c = vec4(0.0f);
+			return; // Don't send edge vertex attributes - they are interpolated manually in the fragment shader.
+		}
+		// Send the interior vertex attributes for fixed function interpolation.
+	#endif
+
 	#if VS_TME
 		vec2 uv = a_uv - TextureOffset;
 		vec2 st = a_st - TextureOffset;
@ -245,6 +280,11 @@ void main()
 #define GS_LINE 0
 #endif

+#ifndef ZTST_GEQUAL
+#define ZTST_GEQUAL 2
+#define ZTST_GREATER 3
+#endif
+
 #ifndef PS_FST
 #define PS_FST 0
 #define PS_WMS 0
@ -298,9 +338,12 @@ void main()
 #define AFAIL_NEEDS_RT (PS_AFAIL == 3 && PS_NO_COLOR1)

 #define PS_FEEDBACK_LOOP_IS_NEEDED (PS_TEX_IS_FB == 1 || AFAIL_NEEDS_RT || PS_FBMASK || SW_BLEND_NEEDS_RT || SW_AD_TO_HW || (PS_DATE >= 5))
+#define PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH ((PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES) && PS_ACCURATE_PRIMS_AA && PS_ZCLAMP)

 #define NEEDS_TEX (PS_TFX != 4)

+vec4 FragCoord;
+
 layout(std140, set = 0, binding = 1) uniform cb1
 {
 	vec3 FogColor;
@ -320,8 +363,71 @@ layout(std140, set = 0, binding = 1) uniform cb1
 	mat4 DitherMatrix;
 	float ScaledScaleFactor;
 	float RcpScaleFactor;
+	uint _pad0;
+	uint _pad1;
+
+	uint accurate_prims_base_index;
+	uint _pad2;
+	uint _pad3;
+	uint _pad4;
 };

+#if PS_ACCURATE_PRIMS
+struct // Plain-struct replacement for the stage-input instance when PS_ACCURATE_PRIMS is enabled.
+{
+	vec4 t; // NOTE(review): presumably filled from vsInReal or by manual interpolation; the writing code is outside this chunk - confirm.
+	vec4 ti;
+	vec4 c;
+} vsIn;
+
+layout(location = 0) in VSOutput // The real stage-input block; in this variant it is instanced as vsInReal because vsIn is a plain struct.
+{
+	vec4 t;
+	vec4 ti;
+	#if PS_IIP != 0
+		vec4 c; // Interpolated color when PS_IIP is non-zero.
+	#else
+		flat vec4 c; // Flat (non-interpolated) color otherwise.
+	#endif
+} vsInReal;
+
+layout(location = 7) flat in uint accurate_prims_index;
+#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+layout(location = 8) flat in uint accurate_triangles_interior;
+#endif
+
+struct AccuratePrimsEdgeData // One element of the accurate_prims_data storage buffer. NOTE(review): the code consuming these fields is outside this chunk; the field notes below are assumptions to confirm.
+{
+	// Interpolated attributes (the number in each comment is the field's byte offset)
+	vec4 t_float0; // 0
+	vec4 t_float1; // 16
+	vec4 t_int0; // 32
+	vec4 t_int1; // 48
+	vec4 c0; // 64
+	vec4 c1; // 80
+	vec4 p0; // 96
+	vec4 p1; // 112
+	ivec4 edge0; // 128
+	ivec4 edge1; // 144
+	ivec2 xy0; // 160 - presumably endpoint 0 in fixed point - TODO confirm
+	ivec2 xy1; // 168 - presumably endpoint 1 in fixed point - TODO confirm
+	uint step_x; // 176
+	uint draw0; // 180
+	uint draw1; // 184
+	uint top_left; // 188
+	uint side; // 192
+	uint _pad0; // 196 - padding
+	uint _pad1; // 200 - padding
+	uint _pad2; // 204 - padding
+	// Total 208 bytes
+};
+
+layout (std140, set = 0, binding = 3) readonly buffer AccuratePrimsEdgeDataBuffer { // Read-only storage buffer of AccuratePrimsEdgeData records.
+	AccuratePrimsEdgeData accurate_prims_data[]; // Unsized array: its length comes from the bound buffer.
+};
+
+#else // PS_ACCURATE_PRIMS
+
 layout(location = 0) in VSOutput
 {
 	vec4 t;
@ -333,6 +439,8 @@ layout(location = 0) in VSOutput
 	#endif
 } vsIn;

+#endif
+
 #if !PS_NO_COLOR && !PS_NO_COLOR1
 layout(location = 0, index = 0) out vec4 o_col0;
 layout(location = 0, index = 1) out vec4 o_col1;
@ -345,13 +453,21 @@ layout(set = 1, binding = 0) uniform sampler2D Texture;
 layout(set = 1, binding = 1) uniform texture2D Palette;
 #endif

-#if PS_FEEDBACK_LOOP_IS_NEEDED
+#if PS_FEEDBACK_LOOP_IS_NEEDED || PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
 	#if defined(DISABLE_TEXTURE_BARRIER) || defined(HAS_FEEDBACK_LOOP_LAYOUT)
 		layout(set = 1, binding = 2) uniform texture2D RtSampler;
-		vec4 sample_from_rt() { return texelFetch(RtSampler, ivec2(gl_FragCoord.xy), 0); }
+		vec4 sample_from_rt() { return texelFetch(RtSampler, ivec2(FragCoord.xy), 0); }
+		#if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
+			layout(set = 1, binding = 4) uniform texture2D DepthSampler;
+			vec4 sample_from_depth() { return texelFetch(DepthSampler, ivec2(FragCoord.xy), 0); }
+		#endif
 	#else
 		layout(input_attachment_index = 0, set = 1, binding = 2) uniform subpassInput RtSampler;
 		vec4 sample_from_rt() { return subpassLoad(RtSampler); }
+		#if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
+			layout(input_attachment_index = 1, set = 1, binding = 4) uniform subpassInput DepthSampler;
+			vec4 sample_from_depth() { return subpassLoad(DepthSampler); }
+		#endif
 	#endif
 #endif

@ -925,19 +1041,19 @@ vec4 ps_color()
 #if !NEEDS_TEX
 	vec4 T = vec4(0.0f);
 #elif PS_CHANNEL_FETCH == 1
-	vec4 T = fetch_red(ivec2(gl_FragCoord.xy));
+	vec4 T = fetch_red(ivec2(FragCoord.xy));
 #elif PS_CHANNEL_FETCH == 2
-	vec4 T = fetch_green(ivec2(gl_FragCoord.xy));
+	vec4 T = fetch_green(ivec2(FragCoord.xy));
 #elif PS_CHANNEL_FETCH == 3
-	vec4 T = fetch_blue(ivec2(gl_FragCoord.xy));
+	vec4 T = fetch_blue(ivec2(FragCoord.xy));
 #elif PS_CHANNEL_FETCH == 4
-	vec4 T = fetch_alpha(ivec2(gl_FragCoord.xy));
+	vec4 T = fetch_alpha(ivec2(FragCoord.xy));
 #elif PS_CHANNEL_FETCH == 5
-	vec4 T = fetch_rgb(ivec2(gl_FragCoord.xy));
+	vec4 T = fetch_rgb(ivec2(FragCoord.xy));
 #elif PS_CHANNEL_FETCH == 6
-	vec4 T = fetch_gXbY(ivec2(gl_FragCoord.xy));
+	vec4 T = fetch_gXbY(ivec2(FragCoord.xy));
 #elif PS_DEPTH_FMT > 0
-	vec4 T = sample_depth(st_int, ivec2(gl_FragCoord.xy));
+	vec4 T = sample_depth(st_int, ivec2(FragCoord.xy));
 #else
 	vec4 T = sample_color(st);
 #endif
@ -985,9 +1101,9 @@ void ps_dither(inout vec3 C, float As)
 		ivec2 fpos;

 		#if PS_DITHER == 2
-			fpos = ivec2(gl_FragCoord.xy);
+			fpos = ivec2(FragCoord.xy);
 		#else
-			fpos = ivec2(gl_FragCoord.xy * RcpScaleFactor);
+			fpos = ivec2(FragCoord.xy * RcpScaleFactor);
 		#endif

 		float value = DitherMatrix[fpos.y & 3][fpos.x & 3];
@ -1228,11 +1344,232 @@ void ps_blend(inout vec4 Color, inout vec4 As_rgba)
 	#endif
 }

+#if PS_ACCURATE_PRIMS
+// Interpolate vertex attributes over a line/edge manually: blends the endpoint-0 and endpoint-1 values stored in 'data' using weight0 (applied to endpoint 0) and weight1 (applied to endpoint 1), then writes vsIn.t, vsIn.ti, vsIn.c and FragCoord.z.
+void InterpolateAttributesManual(AccuratePrimsEdgeData data, int weight0, int weight1)
+{
+	float weight0_f = float(weight0);
+	float weight1_f = float(weight1);
+	float weight_total = float(weight0 + weight1); // Normalisation divisor; the callers in this file pass weights that sum to (major1 - major0).
+
+	vec4 t_float_interp = (weight1_f * data.t_float1 + weight0_f * data.t_float0) / weight_total;
+	vec4 t_int_interp = (weight1_f * data.t_int1 + weight0_f * data.t_int0) / weight_total;
+	vec4 c_interp = (weight1_f * data.c1 + weight0_f * data.c0) / weight_total;
+	float z_interp = (weight1_f * data.p1.z + weight0_f * data.p0.z) / weight_total;
+
+	// No interpolation for constant attributes: mix() with a boolean selector returns the endpoint-1 value for every component where both endpoints are equal, so the divided result is not used there.
+	vsIn.t = mix(t_float_interp, data.t_float1, equal(data.t_float1, data.t_float0));
+	vsIn.ti = mix(t_int_interp, data.t_int1, equal(data.t_int1, data.t_int0));
+	vsIn.c = mix(c_interp, data.c1, equal(data.c1, data.c0));
+	FragCoord.z = (data.p1.z == data.p0.z) ? data.p1.z : z_interp;
+
+	// Clamp attributes. Fog/Z are normalized.
+	vsIn.c = clamp(vsIn.c, 0.0f, 255.0f);
+	vsIn.t.z = clamp(vsIn.t.z, 0.0f, 1.0f);
+	FragCoord.z = clamp(FragCoord.z, 0.0f, 1.0f);
+}
+#endif
+
+#if PS_ACCURATE_PRIMS == ACCURATE_LINES
+void HandleAccurateLines(out float alpha_coverage) // Discards fragments that are not on the line; otherwise writes the coverage to alpha_coverage and fills the interpolated attributes.
+{
+	AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index]; // Record for this primitive: uniform base offset plus the flat accurate_prims_index input.
+
+	ivec2 xy0 = data.xy0;
+	ivec2 xy1 = data.xy1;
+	ivec2 dxy = xy1 - xy0;
+	ivec2 xy0_i = (xy0 + 8) & ~0xF; // Endpoint rounded to the nearest multiple of 16 (one pixel in 4-bit fixed point).
+	ivec2 xy1_i = (xy1 + 8) & ~0xF;
+	bool step_x = bool(data.step_x);
+	bool draw0 = bool(data.draw0);
+	bool draw1 = bool(data.draw1);
+
+	// 4-bit fixed point: 16 subpixels per pixel
+	ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // Subtract half-integer pixel center.
+
+	// Determine major/minor axes (step_x selects X as the major axis, otherwise Y)
+	int major0 = step_x ? xy0.x : xy0.y;
+	int major1 = step_x ? xy1.x : xy1.y;
+	int minor0 = step_x ? xy0.y : xy0.x;
+	int minor1 = step_x ? xy1.y : xy1.x;
+	int major_i = step_x ? xy_i.x : xy_i.y;
+	int minor_i = step_x ? xy_i.y : xy_i.x;
+	int d_major = step_x ? dxy.x : dxy.y;
+	int d_major_scaled = 16 * d_major;
+
+	int major0_i = step_x ? xy0_i.x : xy0_i.y;
+	int major1_i = step_x ? xy1_i.x : xy1_i.y;
+
+	// Discard if outside line range
+	if (major_i < min(major0_i, major1_i) ||
+		major_i > max(major0_i, major1_i))
+		discard;
+
+	if ((major_i == major0_i && !draw0) || // Endpoint pixels are kept only when the matching draw0/draw1 flag is set.
+		(major_i == major1_i && !draw1))
+		discard;
+
+	int weight0 = major1 - major_i; // Distance to endpoint 1 along the major axis; weight0 + weight1 == d_major.
+	int weight1 = major_i - major0;
+
+	// Compute minor axis line in fixed-point (the value is the minor coordinate multiplied by d_major)
+	int minor_line = weight1 * minor1 + weight0 * minor0;
+
+#if PS_ACCURATE_PRIMS_AA
+	// Proper fixed-point AA rounding
+	int minor_i_expected_0 = (minor_line / d_major) & ~0xF; // Minor position with the subpixel bits cleared.
+	int minor_i_expected_1 = minor_i_expected_0 + 16; // The next pixel along the minor axis.
+	int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
+	int alpha_i_1 = d_major_scaled - alpha_i_0; // alpha_i_0 + alpha_i_1 == d_major_scaled.
+
+	int alpha_i;
+	if (minor_i == minor_i_expected_0)
+		alpha_i = alpha_i_0;
+	else if (minor_i == minor_i_expected_1)
+		alpha_i = alpha_i_1;
+	else
+		discard;
+	// Make sure that the output alpha is always <= 127 for AA.
+	alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
+#else
+	// Non-AA: fixed-point rounding and 4-bit alignment (adds half a pixel, i.e. 8 subpixels, before clearing the subpixel bits)
+	int minor_i_expected = ((2 * minor_line + d_major_scaled) / (2 * d_major)) & ~0xF;
+	if (minor_i != minor_i_expected)
+		discard;
+	alpha_coverage = 128.0f;
+#endif
+
+	// Interpolate attributes
+	InterpolateAttributesManual(data, weight0, weight1);
+}
+#endif
+
+#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+// Rasterizes one triangle edge manually.
+// Discards the fragment unless it is the pixel selected for this edge at the current major-axis position
+// and lies on the inner side of the two other edges. On success writes the coverage to alpha_coverage
+// and fills vsIn / FragCoord.z through InterpolateAttributesManual().
+void HandleAccurateTrianglesEdge(out float alpha_coverage)
+{
+	// Record for this edge: uniform base offset plus the flat accurate_prims_index input.
+	AccuratePrimsEdgeData data = accurate_prims_data[accurate_prims_base_index + accurate_prims_index];
+
+	ivec2 xy0 = data.xy0;
+	ivec2 xy1 = data.xy1;
+	ivec2 dxy = xy1 - xy0;
+	// Endpoints rounded to the nearest multiple of 16 (one pixel in 4-bit fixed point).
+	ivec2 xy0_i = (xy0 + 8) & ~0xF;
+	ivec2 xy1_i = (xy1 + 8) & ~0xF;
+	bool step_x = bool(data.step_x);
+	bool side = bool(data.side);
+	bool top_left = bool(data.top_left);
+
+	// 4-bit fixed point: 16 subpixels per pixel
+	ivec2 xy_i = 16 * ivec2(floor(FragCoord.xy)); // Subtract half-integer pixel center.
+
+	// Determine major/minor axes (step_x selects X as the major axis, otherwise Y)
+	int major0 = step_x ? xy0.x : xy0.y;
+	int major1 = step_x ? xy1.x : xy1.y;
+	int minor0 = step_x ? xy0.y : xy0.x;
+	int minor1 = step_x ? xy1.y : xy1.x;
+	int major_i = step_x ? xy_i.x : xy_i.y;
+	int minor_i = step_x ? xy_i.y : xy_i.x;
+	int d_major = step_x ? dxy.x : dxy.y;
+	int d_major_scaled = 16 * d_major;
+
+	int major0_i = step_x ? xy0_i.x : xy0_i.y;
+	int major1_i = step_x ? xy1_i.x : xy1_i.y;
+
+	// Discard if outside edge range.
+	// Note: this is not exactly what the SW rasterizer does.
+	// See the note in GSRasterizer::DrawEdgeTriangle() about the asymmetry in X and Y bounds checking.
+	if (major_i < min(major0_i, major1_i) ||
+		major_i > max(major0_i, major1_i))
+		discard;
+
+	// Discard if on wrong side of other edges.
+	// The edge functions are evaluated with explicit integer arithmetic: GLSL dot() is only defined for
+	// floating-point vectors, so calling it with ivec4 operands converts them to float and the sign test
+	// is no longer exact once the products exceed the 24-bit float mantissa.
+	// Equivalent to dot(edge, ivec4(xy_i, 1, 0)) evaluated exactly.
+	int edge0_value = data.edge0.x * xy_i.x + data.edge0.y * xy_i.y + data.edge0.z;
+	int edge1_value = data.edge1.x * xy_i.x + data.edge1.y * xy_i.y + data.edge1.z;
+	if (edge0_value <= 0 || edge1_value <= 0)
+		discard;
+
+	// Weights are the distances to the opposite endpoint along the major axis; they sum to d_major.
+	int weight0 = major1 - major_i;
+	int weight1 = major_i - major0;
+
+	// Compute minor axis line in fixed-point (the value is the minor coordinate multiplied by d_major)
+	int minor_line = weight1 * minor1 + weight0 * minor0;
+	int minor_i_expected = minor_line / d_major;
+	int minor_i_expected_0 = minor_i_expected & ~0xF;
+	int minor_i_expected_1 = minor_i_expected_0 + 16;
+	int alpha_i_0 = d_major_scaled - (minor_line - d_major * minor_i_expected_0);
+	int alpha_i_1 = d_major_scaled - alpha_i_0;
+
+	// Proper fixed-point AA rounding
+	int alpha_i;
+	if ((minor_i_expected & 0xF) == 0)
+	{
+		// On a pixel center
+		alpha_i = top_left ? 0 : d_major_scaled;
+		minor_i_expected += top_left ? (side ? -16 : 16) : 0;
+	}
+	else if (side)
+	{
+		minor_i_expected = minor_i_expected_0;
+		alpha_i = alpha_i_0;
+	}
+	else
+	{
+		minor_i_expected = minor_i_expected_1;
+		alpha_i = alpha_i_1;
+	}
+	if (minor_i != minor_i_expected)
+		discard;
+
+#if PS_ACCURATE_PRIMS_AA
+	// Make sure that the output alpha is always <= 127 for AA.
+	alpha_coverage = floor(clamp(128.0f * float(alpha_i) / float(d_major_scaled), 0.0f, 127.0f));
+#else
+	alpha_coverage = 128.0f;
+#endif
+
+	// Interpolate attributes
+	InterpolateAttributesManual(data, weight0, weight1);
+}
+#endif
+
 void main()
 {
+	FragCoord = gl_FragCoord;
+
+#if PS_ACCURATE_PRIMS
+	float alpha_coverage;
+#if PS_ACCURATE_PRIMS == ACCURATE_LINES
+	HandleAccurateLines(alpha_coverage);
+#elif PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+	if (bool(accurate_triangles_interior))
+	{
+		alpha_coverage = 128.0f;
+		vsIn.t = vsInReal.t;
+		vsIn.ti = vsInReal.ti;
+		vsIn.c = vsInReal.c;
+	}
+	else
+	{
+		HandleAccurateTrianglesEdge(alpha_coverage);
+	}
+#endif
+#endif // PS_ACCURATE_PRIMS
+
+#if PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
+	// Depth currently stored at this pixel, read through the depth feedback binding.
+	float current_depth = sample_from_depth().r;
+
+	// Manual depth test against the sampled value.
+	// Kept under the same guard that declares current_depth (which already implies PS_ZCLAMP),
+	// so a permutation with PS_ZCLAMP and PS_ZTST set but without depth feedback still compiles.
+	#if PS_ZTST == ZTST_GEQUAL
+		if (FragCoord.z < current_depth)
+			discard;
+	#elif PS_ZTST == ZTST_GREATER
+		if (FragCoord.z <= current_depth)
+			discard;
+	#endif
+#endif // PS_FEEDBACK_LOOP_IS_NEEDED_DEPTH
+
 #if PS_SCANMSK & 2
 	// fail depth test on prohibited lines
-	if ((int(gl_FragCoord.y) & 1) == (PS_SCANMSK & 1))
+	if ((int(FragCoord.y) & 1) == (PS_SCANMSK & 1))
 		discard;
 #endif
 #if PS_DATE >= 5
@ -1267,7 +1604,7 @@ void main()
 #endif		// PS_DATE >= 5

 #if PS_DATE == 3
-	int stencil_ceil = int(texelFetch(PrimMinTexture, ivec2(gl_FragCoord.xy), 0).r);
+	int stencil_ceil = int(texelFetch(PrimMinTexture, ivec2(FragCoord.xy), 0).r);
 	// Note gl_PrimitiveID == stencil_ceil will be the primitive that will update
 	// the bad alpha value so we must keep it.

@ -1277,6 +1614,20 @@ void main()
 #endif

 	vec4 C = ps_color();
+
+#if PS_FIXED_ONE_A
+	// AA (Fixed one) will output a coverage of 1.0 as alpha
+	C.a = 128.0f;
+#elif PS_ACCURATE_PRIMS_AA
+	// AA: coverage is computed in alpha_coverage
+	#if PS_ACCURATE_PRIMS_AA_ABE
+		if (floor(C.a) == 128.0f) // According to manual & hardware tests the coverage is only used if the fragment alpha is 128.
+			C.a = alpha_coverage;
+	#else
+		C.a = alpha_coverage;
+	#endif
+#endif
+
 	bool atst_pass = atst(C);

 #if PS_AFAIL == 0 // KEEP or ATST off
@ -1284,13 +1635,6 @@ void main()
 		discard;
 #endif

-	// Must be done before alpha correction
-
-	// AA (Fixed one) will output a coverage of 1.0 as alpha
-#if PS_FIXED_ONE_A
-	C.a = 128.0f;
-#endif
-
 #if SW_AD_TO_HW
 	#if PS_RTA_CORRECTION
 		vec4 RT = trunc(sample_from_rt() * 128.0f + 0.1f);
@ -1401,9 +1745,15 @@ void main()
 	#endif

 	#if PS_ZCLAMP
-		gl_FragDepth = min(gl_FragCoord.z, MaxDepthPS);
+		#if PS_ACCURATE_PRIMS == ACCURATE_TRIANGLES
+			if (bool(accurate_triangles_interior))
+				gl_FragDepth = min(FragCoord.z, MaxDepthPS);
+			else
+				gl_FragDepth = current_depth; // No depth update for triangle edges.
+		#else
+			gl_FragDepth = min(FragCoord.z, MaxDepthPS);
+		#endif
 	#endif
-
 #endif // PS_DATE
 }

--- a/pcsx2/Config.h
+++ b/pcsx2/Config.h
@ -757,6 +757,7 @@ struct Pcsx2Config
 					PreloadFrameWithGSData : 1,
 					Mipmap : 1,
 					HWMipmap : 1,
+					HWAccuratePrims: 1,
 					ManualUserHacks : 1,
 					UserHacks_AlignSpriteX : 1,
 					UserHacks_CPUFBConversion : 1,
--- a/pcsx2/GS/GSState.cpp
+++ b/pcsx2/GS/GSState.cpp
@ -431,6 +431,10 @@ const char* GSState::GetFlushReasonString(GSFlushReason reason)
 			return "VSYNC";
 		case GSFlushReason::GSREOPEN:
 			return "GS REOPEN";
+		case GSFlushReason::VERTEXCOUNT:
+			return "VERTEX COUNT";
+		case GSFlushReason::VERTEXCOUNTEXPANDED:
+			return "VERTEX COUNT EXPANDED";
 		case GSFlushReason::UNKNOWN:
 		default:
 			return "UNKNOWN";
@ -3265,6 +3269,20 @@ void GSState::UpdateVertexKick()

 	m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim];
 	m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = m_fpGIFPackedRegHandlerSTQRGBAXYZ2[prim];
+
+	if (UsingAccuratePrims())
+	{
+		if (GSUtil::GetPrimClass(prim) == GS_LINE_CLASS)
+			m_vertex_expansion_factor = 3;
+		else if (GSUtil::GetPrimClass(prim) == GS_TRIANGLE_CLASS)
+			m_vertex_expansion_factor = 7;
+		else
+			pxFail("Wrong primitive class."); // Impossible.
+	}
+	else
+	{
+		m_vertex_expansion_factor = 1;
+	}
 }

 void GSState::GrowVertexBuffer()
@ -4891,6 +4909,12 @@ __forceinline void GSState::VertexKick(u32 skip)
 	constexpr u32 max_vertices = MaxVerticesForPrim(prim);
 	if (max_vertices != 0 && m_vertex.tail >= max_vertices)
 		Flush(VERTEXCOUNT);
+	
+	if (m_vertex_expansion_factor != 1)
+	{
+		if (max_vertices != 0 && (m_vertex_expansion_factor * m_index.tail) >= max_vertices)
+			Flush(VERTEXCOUNTEXPANDED);
+	}
 }

 /// Checks if region repeat is used (applying it does something to at least one of the values in min...max)
@ -5227,12 +5251,15 @@ void GSState::CalcAlphaMinMax(const int tex_alpha_min, const int tex_alpha_max)
 	// Limit max to 255 as we send 500 when we don't know, makes calculating 24/16bit easier.
 	int min = tex_alpha_min, max = std::min(tex_alpha_max, 255);

-	if (IsCoverageAlpha())
+	if (IsCoverageAlphaFixedOne())
 	{
-		// HW renderer doesn't currently support AA, so its min is 128.
-		// If we add AA support to the HW renderer, this will need to be changed.
-		// (Will probably only be supported with ROV/FBFetch so we would want to check for that.)
-		min = GSIsHardwareRenderer() ? 128 : 0;
+		// HW renderer doesn't support AA1, assume alpha is constant 128.
+		min = 128;
+		max = 128;
+	}
+	else if (IsCoverageAlphaSupported())
+	{
+		min = 0;
 		max = 128;
 	}
 	else
@ -5527,7 +5554,24 @@ bool GSState::IsMipMapActive()

 bool GSState::IsCoverageAlpha()
 {
-	return !PRIM->ABE && PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS);
+	return PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS); // AA1 on a line/triangle primitive class; the ABE condition is now checked in IsCoverageAlphaFixedOne().
+}
+
+bool GSState::IsCoverageAlphaFixedOne()
+{
+	// Only relevant for coverage-alpha draws that do not use alpha blending.
+	if (!IsCoverageAlpha() || PRIM->ABE)
+		return false;
+
+	// True when the renderer reports no coverage-alpha support.
+	return !IsCoverageAlphaSupported();
+}
+
+bool GSState::IsCoverageAlphaSupported()
+{
+	return false; // Base implementation reports no coverage-alpha support; the method is declared virtual in GSState.h.
+}
+
+bool GSState::UsingAccuratePrims()
+{
+	// The device must advertise the feature before the primitive type matters.
+	if (!g_gs_device->Features().accurate_prims)
+		return false;
+
+	const auto prim_class = GSUtil::GetPrimClass(PRIM->PRIM);
+
+	// Lines always qualify; triangles only when AA1 is set.
+	if (prim_class == GS_LINE_CLASS)
+		return true;
+
+	return prim_class == GS_TRIANGLE_CLASS && PRIM->AA1;
 }

 GIFRegTEX0 GSState::GetTex0Layer(u32 lod)
--- a/pcsx2/GS/GSState.h
+++ b/pcsx2/GS/GSState.h
@ -165,6 +165,8 @@ protected:
 		u32 tail;
 	} m_draw_index = {};

+	int m_vertex_expansion_factor = 1;
+
 	void UpdateContext();
 	void UpdateScissor();

@ -207,6 +209,9 @@ protected:
 	bool IsMipMapDraw();
 	bool IsMipMapActive();
 	bool IsCoverageAlpha();
+	bool IsCoverageAlphaFixedOne();
+	virtual bool IsCoverageAlphaSupported();
+	bool UsingAccuratePrims();
 	void CalcAlphaMinMax(const int tex_min, const int tex_max);
 	void CorrectATEAlphaMinMax(const u32 atst, const int aref);

@ -327,6 +332,7 @@ public:
 		VSYNC  = 1 << 13,
 		GSREOPEN = 1 << 14,
 		VERTEXCOUNT = 1 << 15,
+		VERTEXCOUNTEXPANDED = 1 << 16,
 	};

 	GSFlushReason m_state_flush_reason = UNKNOWN;
--- a/pcsx2/GS/GSVector.h
+++ b/pcsx2/GS/GSVector.h
@ -57,6 +57,16 @@ public:
 		return (std::memcmp(this, &v, sizeof(*this)) != 0);
 	}

+	constexpr GSVector2T operator+(const GSVector2T& v) const // Component-wise sum of this vector and v.
+	{
+		return {x + v.x, y + v.y};
+	}
+
+	constexpr GSVector2T operator-(const GSVector2T& v) const // Component-wise difference: this vector minus v.
+	{
+		return {x - v.x, y - v.y};
+	}
+
 	constexpr GSVector2T operator*(const GSVector2T& v) const
 	{
 		return { x * v.x, y * v.y };
@ -81,6 +91,11 @@ public:
 typedef GSVector2T<float> GSVector2;
 typedef GSVector2T<int> GSVector2i;

+constexpr GSVector2i operator&(const GSVector2i& v0, const GSVector2i& v1) // Component-wise bitwise AND of two integer vectors.
+{
+	return {v0.x & v1.x, v0.y & v1.y};
+}
+
 class GSVector4;
 class GSVector4i;

--- a/pcsx2/GS/Renderers/Common/GSDevice.h
+++ b/pcsx2/GS/Renderers/Common/GSDevice.h
@ -289,6 +289,41 @@ struct HWBlend
 	BlendFactor src, dst;
 };

+struct alignas(16) AccuratePrimsEdgeData // Per-line/edge record consumed by the pixel shader through the accurate prims buffer.
+{
+	// Interpolated attributes (suffix 0/1 = value at endpoint 0/1; trailing numbers are byte offsets)
+	GSVector4 t_float0; // 0
+	GSVector4 t_float1; // 16
+	GSVector4 t_int0; // 32
+	GSVector4 t_int1; // 48
+	GSVector4 c0; // 64
+	GSVector4 c1; // 80
+	GSVector4 p0; // 96
+	GSVector4 p1; // 112
+	GSVector4i edge0; // 128
+	GSVector4i edge1; // 144
+	GSVector2i xy0; // 160
+	GSVector2i xy1; // 168
+	u32 step_x; // 176
+	u32 draw0; // 180
+	u32 draw1; // 184
+	u32 top_left; // 188
+	u32 side; // 192
+	u32 _pad0; // 196
+	u32 _pad1; // 200
+	u32 _pad2; // 204
+	// Total 208 bytes (enforced by the static_assert below); field order and offsets match the shader-side struct of the same name.
+};
+
+static_assert(sizeof(AccuratePrimsEdgeData) == 208);
+
+enum
+{
+	ACCURATE_PRIMS_DISABLE = 0,
+	ACCURATE_PRIMS_LINE = 1,
+	ACCURATE_PRIMS_TRIANGLE = 2
+};
+
 struct alignas(16) GSHWDrawConfig
 {
 	enum class Topology: u8
@ -316,7 +351,7 @@ struct alignas(16) GSHWDrawConfig
 				u8 iip : 1;
 				u8 point_size : 1;		///< Set when points need to be expanded without VS expanding.
 				VSExpand expand : 2;
-				u8 _free : 2;
+				u8 accurate_prims : 2; // 0 - disables; 1 - lines; 2 - triangles.
 			};
 			u8 key;
 		};
@ -354,6 +389,7 @@ struct alignas(16) GSHWDrawConfig
 				u32 date : 3;
 				u32 atst : 3;
 				u32 afail : 2;
+				u32 ztst : 2;
 				// Color sampling
 				u32 fst : 1; // Investigate to do it on the VS
 				u32 tfx : 3;
@ -414,6 +450,11 @@ struct alignas(16) GSHWDrawConfig

 				// Scan mask
 				u32 scanmsk : 2;
+
+				// Accurate prims (lines and triangles)
+				u32 accurate_prims : 2; // 0 - disabled; 1 - lines; 2 - triangles
+				u32 accurate_prims_aa : 1;
+				u32 accurate_prims_aa_abe : 1;
 			};

 			struct
@ -435,6 +476,13 @@ struct alignas(16) GSHWDrawConfig
 			return channel_fb || tex_is_fb || fbmask || (date >= 5) || sw_blend_needs_rt;
 		}

+		__fi bool IsFeedbackLoopDepth() const
+		{
+			// Note: Manual depth testing/interpolation for accurate prims is bundled with zclamp to reduce pipeline combinations.
+			// The zclamp is used to indicate that either Z write or Z testing is enabled.
+			return (accurate_prims == ACCURATE_PRIMS_TRIANGLE) && accurate_prims_aa && zclamp;
+		}
+
 		/// Disables color output from the pixel shader, this is done when all channels are masked.
 		__fi void DisableColorOutput()
 		{
@ -579,6 +627,7 @@ struct alignas(16) GSHWDrawConfig
 		GSVector2 texture_offset;
 		GSVector2 point_size;
 		GSVector2i max_depth;
+		GSVector2i base_vertex;
 		__fi VSConstantBuffer()
 		{
 			memset(static_cast<void*>(this), 0, sizeof(*this));
@ -628,6 +677,8 @@ struct alignas(16) GSHWDrawConfig

 		GSVector4 ScaleFactor;

+		GSVector4i accurate_prims_base_index;
+
 		__fi PSConstantBuffer()
 		{
 			memset(static_cast<void*>(this), 0, sizeof(*this));
@ -745,6 +796,9 @@ struct alignas(16) GSHWDrawConfig
 	SetDATM datm : 2;
 	bool line_expand : 1;

+	bool accurate_prims;
+	std::vector<AccuratePrimsEdgeData>* accurate_prims_edge_data;
+
 	struct AlphaPass
 	{
 		alignas(8) PSSelector ps;
@ -843,6 +897,7 @@ public:
 		bool stencil_buffer       : 1; ///< Supports stencil buffer, and can use for DATE.
 		bool cas_sharpening       : 1; ///< Supports sufficient functionality for contrast adaptive sharpening.
 		bool test_and_sample_depth: 1; ///< Supports concurrently binding the depth-stencil buffer for sampling and depth testing.
+		bool accurate_prims       : 1; ///< Supports AA1 triangles/lines and accurate lines shaders.
 		FeatureSupport()
 		{
 			memset(this, 0, sizeof(*this));
--- a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp
+++ b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp
@ -14,6 +14,7 @@
 #include "common/Error.h"
 #include "common/Path.h"
 #include "common/StringUtil.h"
+#include "common/ScopedGuard.h"

 #include "imgui.h"
 #include "IconsFontAwesome6.h"
@ -395,6 +396,39 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
 		}
 	}

+	bd = {};
+
+	if (m_features.accurate_prims)
+	{
+		bd.Usage = D3D11_USAGE_DEFAULT;
+		bd.CPUAccessFlags = 0;
+		bd.ByteWidth = ACCURATE_PRIMS_BUFFER_SIZE;
+		bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+		bd.StructureByteStride = sizeof(AccuratePrimsEdgeData);
+		bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
+
+		if (FAILED(m_dev->CreateBuffer(&bd, nullptr, m_accurate_prims_b.put())))
+		{
+			Console.Error("D3D11: Failed to create accurate prims buffer.");
+			return false;
+		}
+
+		const CD3D11_SHADER_RESOURCE_VIEW_DESC accurate_prims_b_srv_desc(
+			D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0,
+			ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData));
+		
+		if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc,
+		m_accurate_prims_b_srv.put())))
+		{
+			Console.Error("D3D11: Failed to create accurate prims buffer SRV.");
+			return false;
+		}
+
+		// If MAX_TEXTURES changes, please change the register for this buffer in the shader.
+		static_assert(MAX_TEXTURES == 5);
+		m_ctx->PSSetShaderResources(5, 1, m_accurate_prims_b_srv.addressof());
+	}
+
 	// rasterizer

 	memset(&rd, 0, sizeof(rd));
@ -541,6 +575,8 @@ void GSDevice11::Destroy()
 	m_expand_vb_srv.reset();
 	m_expand_vb.reset();
 	m_expand_ib.reset();
+	m_accurate_prims_b.reset();
+	m_accurate_prims_b_srv.reset();

 	m_vs.clear();
 	m_vs_cb.reset();
@ -599,6 +635,8 @@ void GSDevice11::SetFeatures(IDXGIAdapter1* adapter)
 	m_max_texture_size = (m_feature_level >= D3D_FEATURE_LEVEL_11_0) ?
 	                         D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION :
 	                         D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+
+	m_features.accurate_prims = GSConfig.HWAccuratePrims;
 }

 bool GSDevice11::HasSurface() const
@ -1665,6 +1703,7 @@ void GSDevice11::SetupVS(VSSelector sel, const GSHWDrawConfig::VSConstantBuffer*
 		sm.AddMacro("VS_FST", sel.fst);
 		sm.AddMacro("VS_IIP", sel.iip);
 		sm.AddMacro("VS_EXPAND", static_cast<int>(sel.expand));
+		sm.AddMacro("VS_ACCURATE_PRIMS", static_cast<int>(sel.accurate_prims));

 		static constexpr const D3D11_INPUT_ELEMENT_DESC layout[] =
 			{
@ -1766,6 +1805,10 @@ void GSDevice11::SetupPS(const PSSelector& sel, const GSHWDrawConfig::PSConstant
 		sm.AddMacro("PS_TEX_IS_FB", sel.tex_is_fb);
 		sm.AddMacro("PS_NO_COLOR", sel.no_color);
 		sm.AddMacro("PS_NO_COLOR1", sel.no_color1);
+		sm.AddMacro("PS_ACCURATE_PRIMS", sel.accurate_prims);
+		sm.AddMacro("PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa);
+		sm.AddMacro("PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe);
+		sm.AddMacro("PS_ZTST", sel.ztst);

 		wil::com_ptr_nothrow<ID3D11PixelShader> ps = m_shader_cache.GetPixelShader(m_dev.get(), m_tfx_source, sm.GetPtr(), "ps_main");
 		i = m_ps.try_emplace(sel, std::move(ps)).first;
@ -2280,6 +2323,32 @@ bool GSDevice11::IASetExpandVertexBuffer(const void* vertex, u32 stride, u32 cou
 	return true;
 }

+/// Uploads the per-edge data for an accurate prims draw into the structured buffer bound to the pixel shader.
+/// Does nothing and returns true when the draw does not use accurate prims.
+/// Returns false when the edge data does not fit into the buffer.
+bool GSDevice11::SetupAccuratePrims(GSHWDrawConfig& config)
+{
+	if (!config.accurate_prims)
+		return true;
+
+	// Compute the byte size in size_t and validate it before narrowing to 32 bits,
+	// so an oversized upload cannot wrap around and slip past the capacity check.
+	const size_t size = config.accurate_prims_edge_data->size() * sizeof(AccuratePrimsEdgeData);
+	if (size > ACCURATE_PRIMS_BUFFER_SIZE)
+		return false;
+
+	// Nothing to copy for an empty list (an empty destination box is a no-op anyway).
+	if (size != 0)
+	{
+		// Performance note: UpdateSubresource() copies data to a temp staging buffer to avoid stalling the GPU,
+		// so a manual ring buffer is not needed here like VK/DX12.
+		D3D11_BOX dst_region{};
+		dst_region.left = 0;
+		dst_region.right = static_cast<u32>(size);
+		dst_region.top = 0;
+		dst_region.bottom = 1;
+		dst_region.front = 0;
+		dst_region.back = 1;
+		m_ctx->UpdateSubresource(m_accurate_prims_b.get(), 0, &dst_region,
+			config.accurate_prims_edge_data->data(), static_cast<u32>(size), 0);
+	}
+
+	config.cb_ps.accurate_prims_base_index.x = 0; // No offsetting needed like DX12/VK since we don't use a ring buffer.
+	return true;
+}
+
 u16* GSDevice11::IAMapIndexBuffer(u32 count)
 {
 	if (count > (INDEX_BUFFER_SIZE / sizeof(u16)))
@ -2583,6 +2652,18 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
 {
 	const GSVector2i rtsize = (config.rt ? config.rt : config.ds)->GetSize();
 	GSTexture* colclip_rt = g_gs_device->GetColorClipTexture();
+	GSTexture* draw_rt_clone = nullptr;
+	GSTexture* draw_ds_clone = nullptr;
+	GSTexture* primid_texture = nullptr;
+	
+	ScopedGuard recycle_temp_textures([&]() {
+		if (draw_rt_clone)
+			Recycle(draw_rt_clone);
+		if (draw_ds_clone)
+			Recycle(draw_ds_clone);
+		if (primid_texture)
+			Recycle(primid_texture);
+	});

 	if (colclip_rt)
 	{
@ -2627,7 +2708,6 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)

 	// Destination Alpha Setup
 	const bool multidraw_fb_copy = m_features.multidraw_fb_copy && (config.require_one_barrier || config.require_full_barrier);
-	GSTexture* primid_texture = nullptr;
 	if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking)
 	{
 		primid_texture = CreateRenderTarget(rtsize.x, rtsize.y, GSTexture::Format::PrimID, false);
@ -2652,7 +2732,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
 			return;
 		}

-		config.cb_vs.max_depth.y = m_vertex.start;
+		config.cb_vs.base_vertex = m_vertex.start;
 	}
 	else
 	{
@ -2663,6 +2743,12 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
 		}
 	}

+	if (!SetupAccuratePrims(config))
+	{
+		Console.Error("D3D11: Failed to setup accurate prims");
+		return;
+	}
+
 	if (config.vs.UseExpandIndexBuffer())
 	{
 		IASetIndexBuffer(m_expand_ib.get());
@ -2742,8 +2828,6 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
 		draw_ds = m_state.cached_dsv;
 	}

-	GSTexture* draw_rt_clone = nullptr;
-
 	if (draw_rt && (config.require_one_barrier || (config.require_full_barrier && m_features.multidraw_fb_copy) || (config.tex && config.tex == config.rt)))
 	{
 		// Requires a copy of the RT.
@ -2754,6 +2838,15 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
 			Console.Warning("D3D11: Failed to allocate temp texture for RT copy.");
 	}

+	if (draw_ds && config.require_full_barrier && m_features.multidraw_fb_copy && config.ps.IsFeedbackLoopDepth())
+	{
+		// Requires a copy of the DS: SendHWDraw() copies the draw area into the clone and binds it
+		// to the pixel shader so the shader can read depth while draw_ds stays bound as the depth target.
+		draw_ds_clone = CreateTexture(rtsize.x, rtsize.y, 1, draw_ds->GetFormat(), true);
+		// Check the DS clone that was just requested (not the RT clone) so the warning matches the failure.
+		if (!draw_ds_clone)
+			Console.Warning("D3D11: Failed to allocate temp texture for DS copy.");
+	}
+	}
+
 	OMSetRenderTargets(draw_rt, draw_ds, &config.scissor, read_only_dsv);
 	SetupOM(config.depth, OMBlendSelector(config.colormask, config.blend), config.blend.constant);

@ -2761,7 +2854,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
 	if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::StencilOne && multidraw_fb_copy)
 		m_ctx->ClearDepthStencilView(*static_cast<GSTexture11*>(draw_ds), D3D11_CLEAR_STENCIL, 0.0f, 1);

-	SendHWDraw(config, draw_rt_clone, draw_rt, config.require_one_barrier, config.require_full_barrier, false);
+	SendHWDraw(config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds, config.require_one_barrier, config.require_full_barrier, false);

 	if (config.blend_multi_pass.enable)
 	{
@ -2787,15 +2880,10 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
 		}

 		SetupOM(config.alpha_second_pass.depth, OMBlendSelector(config.alpha_second_pass.colormask, config.blend), config.blend.constant);
-		SendHWDraw(config, draw_rt_clone, draw_rt, config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true);
+		SendHWDraw(config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds,
+			config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true);
 	}

-	if (draw_rt_clone)
-		Recycle(draw_rt_clone);
-
-	if (primid_texture)
-		Recycle(primid_texture);
-
 	if (colclip_rt)
 	{
 		config.colclip_update_area = config.colclip_update_area.runion(config.drawarea);
@ -2814,19 +2902,29 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
 	}
 }

-void GSDevice11::SendHWDraw(const GSHWDrawConfig& config, GSTexture* draw_rt_clone, GSTexture* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier)
+void GSDevice11::SendHWDraw(const GSHWDrawConfig& config,
+	GSTexture* draw_rt_clone, GSTexture* draw_rt, GSTexture* draw_ds_clone, GSTexture* draw_ds,
+	const bool one_barrier, const bool full_barrier, const bool skip_first_barrier)
 {
-	if (draw_rt_clone)
+	if (draw_rt_clone || draw_ds_clone)
 	{
 #ifdef PCSX2_DEVBUILD
-		if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]]
+		if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]]
 			Console.Warning("D3D11: Possible unnecessary copy detected.");
 #endif

 		auto CopyAndBind = [&](GSVector4i drawarea) {
+			if (draw_rt_clone)
 				CopyRect(draw_rt, draw_rt_clone, drawarea, drawarea.left, drawarea.top);
+			if (draw_ds_clone)
+				CopyRect(draw_ds, draw_ds_clone, drawarea, drawarea.left, drawarea.top);
 			if (one_barrier || full_barrier)
+			{
+				if (draw_rt_clone)
 					PSSetShaderResource(2, draw_rt_clone);
+				if (draw_ds_clone)
+					PSSetShaderResource(4, draw_ds_clone);
+			}
 			if (config.tex && config.tex == config.rt)
 				PSSetShaderResource(0, draw_rt_clone);
 		};
--- a/pcsx2/GS/Renderers/DX11/GSDevice11.h
+++ b/pcsx2/GS/Renderers/DX11/GSDevice11.h
@ -83,10 +83,14 @@ public:
 private:
 	enum : u32
 	{
-		MAX_TEXTURES = 4,
+		MAX_TEXTURES = 5,
 		MAX_SAMPLERS = 1,
 		VERTEX_BUFFER_SIZE = 32 * 1024 * 1024,
 		INDEX_BUFFER_SIZE = 16 * 1024 * 1024,
+
+		// Structured buffer size must be multiple of element size.
+		ACCURATE_PRIMS_BUFFER_SIZE = (32 * 1024 * 1024 / sizeof(AccuratePrimsEdgeData)) * sizeof(AccuratePrimsEdgeData),
+		
 		NUM_TIMESTAMP_QUERIES = 5,
 	};

@ -126,6 +130,8 @@ private:
 	wil::com_ptr_nothrow<ID3D11Buffer> m_expand_vb;
 	wil::com_ptr_nothrow<ID3D11Buffer> m_expand_ib;
 	wil::com_ptr_nothrow<ID3D11ShaderResourceView> m_expand_vb_srv;
+	wil::com_ptr_nothrow<ID3D11Buffer> m_accurate_prims_b;
+	wil::com_ptr_nothrow<ID3D11ShaderResourceView> m_accurate_prims_b_srv;

 	D3D_FEATURE_LEVEL m_feature_level = D3D_FEATURE_LEVEL_10_0;
 	u32 m_vb_pos = 0; // bytes
@ -317,6 +323,7 @@ public:
 	void IAUnmapVertexBuffer(u32 stride, u32 count);
 	bool IASetVertexBuffer(const void* vertex, u32 stride, u32 count);
 	bool IASetExpandVertexBuffer(const void* vertex, u32 stride, u32 count);
+	bool SetupAccuratePrims(GSHWDrawConfig& config);

 	u16* IAMapIndexBuffer(u32 count);
 	void IAUnmapIndexBuffer(u32 count);
@ -345,7 +352,9 @@ public:
 	void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, u8 afix);

 	void RenderHW(GSHWDrawConfig& config) override;
-	void SendHWDraw(const GSHWDrawConfig& config, GSTexture* draw_rt_clone, GSTexture* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier);
+	void SendHWDraw(const GSHWDrawConfig& config,
+		GSTexture* draw_rt_clone, GSTexture* draw_rt, GSTexture* draw_ds_clone, GSTexture* draw_ds,
+		const bool one_barrier, const bool full_barrier, const bool skip_first_barrier);

 	void ClearSamplerCache() override;

--- a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp
+++ b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp
@ -20,29 +20,33 @@ D3D12StreamBuffer::~D3D12StreamBuffer()
 	Destroy();
 }

-bool D3D12StreamBuffer::Create(u32 size)
+bool D3D12StreamBuffer::Create(u32 size, bool default_heap)
 {
 	const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN,
 		{1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};

 	D3D12MA::ALLOCATION_DESC allocationDesc = {};
 	allocationDesc.Flags = D3D12MA::ALLOCATION_FLAG_COMMITTED;
-	allocationDesc.HeapType = D3D12_HEAP_TYPE_UPLOAD;
+	allocationDesc.HeapType = default_heap ? D3D12_HEAP_TYPE_DEFAULT : D3D12_HEAP_TYPE_UPLOAD;

 	wil::com_ptr_nothrow<ID3D12Resource> buffer;
 	wil::com_ptr_nothrow<D3D12MA::Allocation> allocation;
 	HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocationDesc, &resource_desc,
-		D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(buffer.put()));
+		default_heap ? D3D12_RESOURCE_STATE_COMMON : D3D12_RESOURCE_STATE_GENERIC_READ,
+		nullptr, allocation.put(), IID_PPV_ARGS(buffer.put()));
 	pxAssertMsg(SUCCEEDED(hr), "Allocate buffer");
 	if (FAILED(hr))
 		return false;

 	static const D3D12_RANGE read_range = {};
-	u8* host_pointer;
+	u8* host_pointer = nullptr;
+	if (!default_heap)
+	{
 		hr = buffer->Map(0, &read_range, reinterpret_cast<void**>(&host_pointer));
 		pxAssertMsg(SUCCEEDED(hr), "Map buffer");
 		if (FAILED(hr))
 			return false;
+	}

 	Destroy(true);

@ -51,6 +55,7 @@ bool D3D12StreamBuffer::Create(u32 size)
 	m_host_pointer = host_pointer;
 	m_size = size;
 	m_gpu_pointer = m_buffer->GetGPUVirtualAddress();
+	m_default_heap = default_heap;
 	return true;
 }

@ -148,6 +153,7 @@ void D3D12StreamBuffer::Destroy(bool defer)
 	m_current_offset = 0;
 	m_current_space = 0;
 	m_current_gpu_position = 0;
+	m_default_heap = false;
 	m_tracked_fences.clear();
 }

--- a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h
+++ b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h
@ -22,7 +22,7 @@ public:
 	D3D12StreamBuffer();
 	~D3D12StreamBuffer();

-	bool Create(u32 size);
+	bool Create(u32 size, bool default_heap = false);

 	__fi bool IsValid() const { return static_cast<bool>(m_buffer); }
 	__fi ID3D12Resource* GetBuffer() const { return m_buffer.get(); }
@ -54,7 +54,8 @@ private:
 	wil::com_ptr_nothrow<ID3D12Resource> m_buffer;
 	wil::com_ptr_nothrow<D3D12MA::Allocation> m_allocation;
 	D3D12_GPU_VIRTUAL_ADDRESS m_gpu_pointer = {};
-	u8* m_host_pointer = nullptr;
+	u8* m_host_pointer = nullptr; // Only used for upload heaps.
+	bool m_default_heap = false; // False for upload heap; true for default heap.

 	// List of fences and the corresponding positions in the buffer
 	std::deque<std::pair<u64, u32>> m_tracked_fences;
--- a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp
+++ b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp
@ -624,52 +624,91 @@ bool GSDevice12::SetGPUTimingEnabled(bool enabled)
 bool GSDevice12::AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer,
 	D3D12MA::Allocation** gpu_allocation, const std::function<void(void*)>& fill_callback)
 {
-	// Try to place the fixed index buffer in GPU local memory.
-	// Use the staging buffer to copy into it.
+	// Creates a default-heap buffer of `size` bytes, filled from a temporary upload staging buffer
+	// that fill_callback populates. Returns false if either allocation fails.
+
+	// Allocate and fill staging buffer. AllocateUploadStagingBuffer() returns nullptr when the
+	// allocation or the map fails, so bail out before recording a copy from a null source.
+	ID3D12Resource* cpu_buffer = AllocateUploadStagingBuffer(size, fill_callback);
+	if (!cpu_buffer)
+		return false;
+
+	// Create GPU buffer
 	const D3D12_RESOURCE_DESC rd = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN, {1, 0},
 		D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
-
-	const D3D12MA::ALLOCATION_DESC cpu_ad = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD};
-
-	ComPtr<ID3D12Resource> cpu_buffer;
-	ComPtr<D3D12MA::Allocation> cpu_allocation;
-	HRESULT hr = m_allocator->CreateResource(
-		&cpu_ad, &rd, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, cpu_allocation.put(), IID_PPV_ARGS(cpu_buffer.put()));
-	pxAssertMsg(SUCCEEDED(hr), "Allocate CPU buffer");
-	if (FAILED(hr))
-		return false;
-
-	static constexpr const D3D12_RANGE read_range = {};
-	const D3D12_RANGE write_range = {0, size};
-	void* mapped;
-	hr = cpu_buffer->Map(0, &read_range, &mapped);
-	pxAssertMsg(SUCCEEDED(hr), "Map CPU buffer");
-	if (FAILED(hr))
-		return false;
-	fill_callback(mapped);
-	cpu_buffer->Unmap(0, &write_range);
-
 	const D3D12MA::ALLOCATION_DESC gpu_ad = {D3D12MA::ALLOCATION_FLAG_COMMITTED, D3D12_HEAP_TYPE_DEFAULT};
-
-	hr = m_allocator->CreateResource(
+	HRESULT hr = m_allocator->CreateResource(
 		&gpu_ad, &rd, D3D12_RESOURCE_STATE_COMMON, nullptr, gpu_allocation, IID_PPV_ARGS(gpu_buffer));
 	pxAssertMsg(SUCCEEDED(hr), "Allocate GPU buffer");
 	if (FAILED(hr))
 		return false;

-	GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer.get(), 0, size);
+	// Copy the data
+	GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer, 0, size);

+	// Transition the GPU buffer from COPY_DEST to INDEX_BUFFER now that the copy is recorded.
 	D3D12_RESOURCE_BARRIER rb = {D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, D3D12_RESOURCE_BARRIER_FLAG_NONE};
 	rb.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
 	rb.Transition.pResource = *gpu_buffer;
 	rb.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST; // COMMON -> COPY_DEST at first use.
 	rb.Transition.StateAfter = D3D12_RESOURCE_STATE_INDEX_BUFFER;
 	GetInitCommandList()->ResourceBarrier(1, &rb);
-
-	DeferResourceDestruction(cpu_allocation.get(), cpu_buffer.get());
 	return true;
 }

+// Writes `size` bytes into the shared texture upload stream buffer.
+// Reserves space aligned to D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT; if the reservation fails, the
+// current command list is executed and the reservation is retried once.
+// On success, offset_out receives the offset of the reserved region, write_data is invoked with the
+// host pointer to that region, the region is committed, and the stream buffer resource is returned.
+// Returns nullptr (offset_out untouched, write_data not called) if space cannot be reserved.
+ID3D12Resource* GSDevice12::WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out)
+{
+	if (!m_texture_stream_buffer.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
+	{
+		// Out of space: submit pending work, then try once more.
+		GSDevice12::GetInstance()->ExecuteCommandList(
+			false, "While waiting for %u bytes in texture upload buffer", size);
+		if (!m_texture_stream_buffer.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
+		{
+			Console.Error("Failed to reserve texture upload memory (%u bytes).", size);
+			return nullptr;
+		}
+	}
+
+	// The offset and host pointer must be read before CommitMemory().
+	offset_out = m_texture_stream_buffer.GetCurrentOffset();
+	write_data(m_texture_stream_buffer.GetCurrentHostPointer());
+	m_texture_stream_buffer.CommitMemory(size);
+	return m_texture_stream_buffer.GetBuffer();
+}
+
+// Allocates a one-off upload-heap buffer of `size` bytes, maps it, lets write_data fill it, and unmaps it.
+// Returns the resource on success, or nullptr if the allocation or the map fails (write_data is not
+// called in that case). The returned pointer is non-owning: the local COM pointers release their
+// references on return, and the buffer is queued for deferred destruction before returning.
+// NOTE(review): the buffer's lifetime after return relies on DeferResourceDestruction() holding a
+// reference until the current command list completes - confirm against its implementation.
+ID3D12Resource* GSDevice12::AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data)
+{
+	wil::com_ptr_nothrow<ID3D12Resource> resource;
+	wil::com_ptr_nothrow<D3D12MA::Allocation> allocation;
+
+	// Allocate staging buffer
+	const D3D12MA::ALLOCATION_DESC allocation_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD};
+	const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1,
+		DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
+	HRESULT hr = GetAllocator()->CreateResource(&allocation_desc, &resource_desc,
+		D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(resource.put()));
+	if (FAILED(hr))
+	{
+		Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr);
+		return nullptr;
+	}
+
+	// Map with an empty read range: the CPU only writes to this buffer.
+	static constexpr const D3D12_RANGE read_range = {};
+	void* map_ptr;
+	hr = resource->Map(0, &read_range, &map_ptr);
+	if (FAILED(hr))
+	{
+		Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr);
+		return nullptr;
+	}
+
+	// Write data
+	write_data(map_ptr);
+
+	// Unmap, declaring the whole buffer as written.
+	const D3D12_RANGE write_range = {0, size};
+	resource->Unmap(0, &write_range);
+
+	// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
+	// This adds the reference needed to keep the buffer alive.
+	DeferResourceDestruction(allocation.get(), resource.get());
+	return resource.get();
+}
+
 RenderAPI GSDevice12::GetRenderAPI() const
 {
 	return RenderAPI::D3D12;
@ -1250,6 +1289,8 @@ bool GSDevice12::CheckFeatures(const u32& vendor_id)
 		DXGI_FEATURE_PRESENT_ALLOW_TEARING, &allow_tearing_supported, sizeof(allow_tearing_supported));
 	m_allow_tearing_supported = (SUCCEEDED(hr) && allow_tearing_supported == TRUE);

+	m_features.accurate_prims = GSConfig.HWAccuratePrims;
+
 	return true;
 }

@ -2178,6 +2219,93 @@ void GSDevice12::IASetIndexBuffer(const void* index, size_t count)
 	m_index_stream_buffer.CommitMemory(size);
 }

+// Uploads config.accurate_prims_edge_data into the accurate prims GPU buffer for this draw.
+// Does nothing unless config.accurate_prims is set. Ends any active render pass, because the
+// buffer copy is recorded on the command list between two transition barriers.
+// On success, m_accurate_prims_stream_buffer_offset holds the byte offset of the uploaded data.
+void GSDevice12::SetupAccuratePrimsBuffer(GSHWDrawConfig& config)
+{
+	if (config.accurate_prims)
+	{
+		// Mark the SRV binding dirty so ApplyTFXState() re-binds the descriptor table.
+		m_dirty_flags |= DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING;
+
+		const u32 count = config.accurate_prims_edge_data->size();
+		const u32 size = count * sizeof(AccuratePrimsEdgeData);
+
+		// Reserve the GPU region. The alignment is the element size, so the resulting offset is
+		// always a whole number of elements.
+		if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData)))
+		{
+			// Out of space: submit pending work, then try once more.
+			ExecuteCommandListAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer");
+			if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData)))
+				pxFailRel("Failed to reserve space for accurate prims");
+		}
+
+		const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset();
+
+		if (InRenderPass())
+			EndRenderPass();
+
+		// Copy data to an upload buffer.
+		ID3D12Resource* upload_buffer;
+		u32 upload_buffer_offset;
+
+		const auto upload_data = [&](void* map_ptr) {
+			std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size);
+		};
+
+		// If the data is larger than half the texture streaming buffer size, use a separate buffer.
+		// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
+		if (size > m_texture_stream_buffer.GetSize() / 2)
+		{
+			upload_buffer_offset = 0;
+			upload_buffer = AllocateUploadStagingBuffer(size, upload_data);
+		}
+		else
+		{
+			upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset);
+		}
+		if (!upload_buffer)
+		{
+			// NOTE(review): this returns without CommitMemory() and without updating
+			// m_accurate_prims_stream_buffer_offset, so the draw presumably proceeds with the
+			// previous offset - confirm that is acceptable.
+			Console.Error("Failed to get upload buffer for accurate prims data.");
+			return;
+		}
+		
+		// Copy data from upload to GPU buffer.
+		const D3D12_RESOURCE_BARRIER barrier_sr_to_dst = {
+			D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
+			D3D12_RESOURCE_BARRIER_FLAG_NONE,
+			{{m_accurate_prims_stream_buffer.GetBuffer(), 0,
+				D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE,
+				D3D12_RESOURCE_STATE_COPY_DEST}}};
+		GetCommandList()->ResourceBarrier(1, &barrier_sr_to_dst);
+		GetCommandList()->CopyBufferRegion(
+			m_accurate_prims_stream_buffer.GetBuffer(), offset, upload_buffer, upload_buffer_offset, size);
+
+		// Commit the GPU region.
+		m_accurate_prims_stream_buffer.CommitMemory(size);
+
+		// Issue the barrier since this will be used next draw.
+		const D3D12_RESOURCE_BARRIER barrier_dst_to_sr = {
+			D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
+			D3D12_RESOURCE_BARRIER_FLAG_NONE,
+			{{m_accurate_prims_stream_buffer.GetBuffer(), 0,
+				D3D12_RESOURCE_STATE_COPY_DEST,
+				D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}};
+		GetCommandList()->ResourceBarrier(1, &barrier_dst_to_sr);
+		
+		m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer.
+	}
+}
+
+// Fills in the accurate-prims fields of the VS/PS constant buffers and submits both.
+// Does nothing unless config.accurate_prims is set.
+void GSDevice12::SetupAccuratePrimsConstants(GSHWDrawConfig& config)
+{
+	if (config.accurate_prims)
+	{
+		config.cb_vs.base_vertex = m_vertex.start;
+		// Convert the saved byte offset in the accurate prims buffer into an element index.
+		config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData);
+
+		SetVSConstantBuffer(config.cb_vs);
+		SetPSConstantBuffer(config.cb_ps);
+	}
+}
+
 void GSDevice12::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor)
 {
 	GSTexture12* vkRt = static_cast<GSTexture12*>(rt);
@ -2305,9 +2433,9 @@ bool GSDevice12::GetTextureGroupDescriptors(
 	}

 	D3D12_CPU_DESCRIPTOR_HANDLE dst_handle = *gpu_handle;
-	D3D12_CPU_DESCRIPTOR_HANDLE src_handles[NUM_TFX_TEXTURES];
-	UINT src_sizes[NUM_TFX_TEXTURES];
-	pxAssert(count <= NUM_TFX_TEXTURES);
+	D3D12_CPU_DESCRIPTOR_HANDLE src_handles[NUM_TOTAL_TFX_TEXTURES];
+	UINT src_sizes[NUM_TOTAL_TFX_TEXTURES];
+	pxAssert(count <= NUM_TOTAL_TFX_TEXTURES);
 	for (u32 i = 0; i < count; i++)
 	{
 		src_handles[i] = cpu_handles[i];
@ -2365,6 +2493,39 @@ bool GSDevice12::CreateBuffers()
 		return false;
 	}

+	if (!m_accurate_prims_stream_buffer.Create(
+			m_features.accurate_prims ? ACCURATE_PRIMS_BUFFER_SIZE : sizeof(AccuratePrimsEdgeData), true))
+	{
+		Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer");
+		return false;
+	}
+
+	if (!m_descriptor_heap_manager.Allocate(&m_accurate_prims_srv_descriptor_cpu))
+	{
+		Console.Error("Failed to allocate accurate prims CPU descriptor");
+		return false;
+	}
+
+	if (m_features.accurate_prims)
+	{
+		// Transition the accurate prims buffer to the pixel shader resource state and create the shader resource view.
+		const D3D12_RESOURCE_BARRIER barrier = {
+			D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
+			D3D12_RESOURCE_BARRIER_FLAG_NONE,
+			{{m_accurate_prims_stream_buffer.GetBuffer(), 0,
+				D3D12_RESOURCE_STATE_COMMON,
+				D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}};
+		GetInitCommandList()->ResourceBarrier(1, &barrier);
+
+		D3D12_SHADER_RESOURCE_VIEW_DESC desc = {
+			DXGI_FORMAT_UNKNOWN, D3D12_SRV_DIMENSION_BUFFER, D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING};
+		desc.Buffer.FirstElement = 0;
+		desc.Buffer.NumElements = ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData);
+		desc.Buffer.StructureByteStride = sizeof(AccuratePrimsEdgeData);
+		m_device->CreateShaderResourceView(m_accurate_prims_stream_buffer.GetBuffer(), &desc,
+			m_accurate_prims_srv_descriptor_cpu.cpu_handle);
+	}
+
 	if (!m_vertex_constant_buffer.Create(VERTEX_UNIFORM_BUFFER_SIZE))
 	{
 		Host::ReportErrorAsync("GS", "Failed to allocate vertex uniform buffer");
@ -2415,9 +2576,11 @@ bool GSDevice12::CreateRootSignatures()
 	rsb.AddCBVParameter(0, D3D12_SHADER_VISIBILITY_ALL);
 	rsb.AddCBVParameter(1, D3D12_SHADER_VISIBILITY_PIXEL);
 	rsb.AddSRVParameter(0, D3D12_SHADER_VISIBILITY_VERTEX);
-	rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 0, 2, D3D12_SHADER_VISIBILITY_PIXEL);
+	rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 0, 2, D3D12_SHADER_VISIBILITY_PIXEL); // Source / Palette 
 	rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER, 0, NUM_TFX_SAMPLERS, D3D12_SHADER_VISIBILITY_PIXEL);
-	rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 2, D3D12_SHADER_VISIBILITY_PIXEL);
+	rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 2, D3D12_SHADER_VISIBILITY_PIXEL); // RT / PrimID
+	rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 4, 1, D3D12_SHADER_VISIBILITY_PIXEL); // Depth
+	rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 5, 1, D3D12_SHADER_VISIBILITY_PIXEL); // Accurate Prims
 	if (!(m_tfx_root_signature = rsb.Create()))
 		return false;
 	D3D12::SetObjectName(m_tfx_root_signature.get(), "TFX root signature");
@ -2805,6 +2968,7 @@ void GSDevice12::DestroyResources()
 	m_vertex_constant_buffer.Destroy(false);
 	m_index_stream_buffer.Destroy(false);
 	m_vertex_stream_buffer.Destroy(false);
+	m_accurate_prims_stream_buffer.Destroy(false);

 	m_utility_root_signature.reset();
 	m_tfx_root_signature.reset();
@ -2818,6 +2982,7 @@ void GSDevice12::DestroyResources()
 	m_shader_cache.Close();

 	m_descriptor_heap_manager.Free(&m_null_srv_descriptor);
+	m_descriptor_heap_manager.Free(&m_accurate_prims_srv_descriptor_cpu);
 	m_timestamp_query_buffer.reset();
 	m_timestamp_query_allocation.reset();
 	m_sampler_heap_manager.Destroy();
@ -2851,6 +3016,7 @@ const ID3DBlob* GSDevice12::GetTFXVertexShader(GSHWDrawConfig::VSSelector sel)
 	sm.AddMacro("VS_FST", sel.fst);
 	sm.AddMacro("VS_IIP", sel.iip);
 	sm.AddMacro("VS_EXPAND", static_cast<int>(sel.expand));
+	sm.AddMacro("VS_ACCURATE_PRIMS", static_cast<int>(sel.accurate_prims));

 	const char* entry_point = (sel.expand != GSHWDrawConfig::VSExpand::None) ? "vs_main_expand" : "vs_main";
 	ComPtr<ID3DBlob> vs(m_shader_cache.GetVertexShader(m_tfx_source, sm.GetPtr(), entry_point));
@ -2922,6 +3088,10 @@ const ID3DBlob* GSDevice12::GetTFXPixelShader(const GSHWDrawConfig::PSSelector&
 	sm.AddMacro("PS_TEX_IS_FB", sel.tex_is_fb);
 	sm.AddMacro("PS_NO_COLOR", sel.no_color);
 	sm.AddMacro("PS_NO_COLOR1", sel.no_color1);
+	sm.AddMacro("PS_ACCURATE_PRIMS", sel.accurate_prims);
+	sm.AddMacro("PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa);
+	sm.AddMacro("PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe);
+	sm.AddMacro("PS_ZTST", sel.ztst);

 	ComPtr<ID3DBlob> ps(m_shader_cache.GetPixelShader(m_tfx_source, sm.GetPtr(), "ps_main"));
 	it = m_tfx_pixel_shaders.emplace(sel, std::move(ps)).first;
@ -3155,6 +3325,7 @@ void GSDevice12::InvalidateCachedState()
 	m_tfx_textures_handle_gpu.Clear();
 	m_tfx_samplers_handle_gpu.Clear();
 	m_tfx_rt_textures_handle_gpu.Clear();
+	m_tfx_depth_textures_handle_gpu.Clear();
 }

 void GSDevice12::SetVertexBuffer(D3D12_GPU_VIRTUAL_ADDRESS buffer, size_t size, size_t stride)
@ -3236,7 +3407,11 @@ void GSDevice12::PSSetShaderResource(int i, GSTexture* sr, bool check_state)
 		return;

 	m_tfx_textures[i] = handle;
-	m_dirty_flags |= (i < 2) ? DIRTY_FLAG_TFX_TEXTURES : DIRTY_FLAG_TFX_RT_TEXTURES;
+	m_dirty_flags |=
+		(i < 2) ? DIRTY_FLAG_TFX_TEXTURES :
+		(i < 4) ? DIRTY_FLAG_TFX_RT_TEXTURES :
+		(i < 5) ? DIRTY_FLAG_TFX_DEPTH_TEXTURES :
+		          0; 
 }

 void GSDevice12::PSSetSampler(GSHWDrawConfig::SamplerSelector sel)
@ -3642,6 +3817,17 @@ bool GSDevice12::ApplyTFXState(bool already_execed)
 		flags |= DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2;
 	}

+	if (flags & DIRTY_FLAG_TFX_DEPTH_TEXTURES)
+	{
+		if (!GetTextureGroupDescriptors(&m_tfx_depth_textures_handle_gpu, m_tfx_textures.data() + 4, 1))
+		{
+			ExecuteCommandListAndRestartRenderPass(false, "Ran out of TFX depth descriptor descriptor groups");
+			return ApplyTFXState(true);
+		}
+
+		flags |= DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3;
+	}
+
 	ID3D12GraphicsCommandList* cmdlist = GetCommandList();

 	if (m_current_root_signature != RootSignature::TFX)
@ -3649,7 +3835,8 @@ bool GSDevice12::ApplyTFXState(bool already_execed)
 		m_current_root_signature = RootSignature::TFX;
 		flags |= DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING | DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING |
 		         DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE | DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE |
-		         DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_PIPELINE;
+		         DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 |
+		         DIRTY_FLAG_PIPELINE;
 		cmdlist->SetGraphicsRootSignature(m_tfx_root_signature.get());
 	}

@ -3662,12 +3849,28 @@ bool GSDevice12::ApplyTFXState(bool already_execed)
 		cmdlist->SetGraphicsRootShaderResourceView(TFX_ROOT_SIGNATURE_PARAM_VS_SRV,
 			m_vertex_stream_buffer.GetGPUPointer() + m_vertex.start * sizeof(GSVertex));
 	}
+	if (flags & DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING)
+	{
+		if (!GetDescriptorAllocator().Allocate(1, &m_accurate_prims_srv_descriptor_gpu))
+		{
+			Console.Error("Failed to allocate accurate prims GPU descriptor");
+			return false;
+		}
+
+		m_device.get()->CopyDescriptorsSimple(
+			1, m_accurate_prims_srv_descriptor_gpu, m_accurate_prims_srv_descriptor_cpu, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
+
+		cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_ACCURATE_PRIMS_SRV, m_accurate_prims_srv_descriptor_gpu);
+		
+	}
 	if (flags & DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE)
 		cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_TEXTURES, m_tfx_textures_handle_gpu);
 	if (flags & DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE)
 		cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_SAMPLERS, m_tfx_samplers_handle_gpu);
 	if (flags & DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2)
 		cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_RT_TEXTURES, m_tfx_rt_textures_handle_gpu);
+	if (flags & DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3)
+		cmdlist->SetGraphicsRootDescriptorTable(TFX_ROOT_SIGNATURE_PARAM_PS_DEPTH_TEXTURES, m_tfx_depth_textures_handle_gpu);

 	ApplyBaseState(flags, cmdlist);
 	return true;
@ -3832,12 +4035,26 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
 	GSTexture12* draw_rt = static_cast<GSTexture12*>(config.rt);
 	GSTexture12* draw_ds = static_cast<GSTexture12*>(config.ds);
 	GSTexture12* draw_rt_clone = nullptr;
+	GSTexture12* draw_ds_clone = nullptr;
+	GSTexture12* date_image = nullptr;
+
+	ScopedGuard recycle_temp_textures([&]() {
+		if (draw_rt_clone)
+			Recycle(draw_rt_clone);
+		if (draw_ds_clone)
+			Recycle(draw_ds_clone);
+		if (date_image)
+			Recycle(date_image);
+	});

 	// Align the render area to 128x128, hopefully avoiding render pass restarts for small render area changes (e.g. Ratchet and Clank).
 	const GSVector2i rtsize(config.rt ? config.rt->GetSize() : config.ds->GetSize());

 	PipelineSelector& pipe = m_pipeline_selector;

+	// Copying buffers needs to be done outside a render pass, so do this early.
+	SetupAccuratePrimsBuffer(config);
+
 	// figure out the pipeline
 	UpdateHWPipelineSelector(config);

@ -3906,7 +4123,6 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
 	}

 	// Primitive ID tracking DATE setup.
-	GSTexture12* date_image = nullptr;
 	if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking)
 	{
 		GSTexture* backup_rt = config.rt;
@ -3994,6 +4210,15 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
 			Console.Warning("D3D12: Failed to allocate temp texture for RT copy.");
 	}

+	if (draw_ds && config.require_full_barrier && m_features.multidraw_fb_copy && config.ps.IsFeedbackLoopDepth())
+	{
+		// Requires a copy of the DS.
+		// Used as "bind ds" flag when texture barrier is unsupported for tex is fb.
+		draw_ds_clone = static_cast<GSTexture12*>(CreateTexture(rtsize.x, rtsize.y, 1, draw_ds->GetFormat(), true));
+		// Check the DS clone just created, not the RT clone.
+		if (!draw_ds_clone)
+			Console.Warning("D3D12: Failed to allocate temp texture for DS copy.");
+	}
+
 	OMSetRenderTargets(draw_rt, draw_ds, config.scissor);

 	// Begin render pass if new target or out of the area.
@ -4040,7 +4265,8 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
 		UploadHWDrawVerticesAndIndices(config);

 	// now we can do the actual draw
-	SendHWDraw(pipe, config, draw_rt_clone, draw_rt, config.require_one_barrier, config.require_full_barrier, false);
+	SendHWDraw(pipe, config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds,
+		config.require_one_barrier, config.require_full_barrier, false);

 	// blend second pass
 	if (config.blend_multi_pass.enable)
@ -4070,15 +4296,10 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
 		pipe.cms = config.alpha_second_pass.colormask;
 		pipe.dss = config.alpha_second_pass.depth;
 		pipe.bs = config.blend;
-		SendHWDraw(pipe, config, draw_rt_clone, draw_rt, config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true);
+		SendHWDraw(pipe, config, draw_rt_clone, draw_rt, draw_ds_clone, draw_ds,
+			config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, true);
 	}

-	if (draw_rt_clone)
-		Recycle(draw_rt_clone);
-
-	if (date_image)
-		Recycle(date_image);
-
 	// now blit the colclip texture back to the original target
 	if (colclip_rt)
 	{
@ -4113,23 +4334,40 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
 	}
 }

-void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config, GSTexture12* draw_rt_clone, GSTexture12* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier)
+void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config,
+	GSTexture12* draw_rt_clone, GSTexture12* draw_rt,
+	GSTexture12* draw_ds_clone, GSTexture12* draw_ds,
+	const bool one_barrier, const bool full_barrier, const bool skip_first_barrier)
 {
-	if (draw_rt_clone)
+	if (draw_rt_clone || draw_ds_clone)
 	{

 #ifdef PCSX2_DEVBUILD
-		if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]]
+		if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]]
 			Console.Warning("D3D12: Possible unnecessary copy detected.");
 #endif
 		auto CopyAndBind = [&](GSVector4i drawarea) {
 			EndRenderPass();

+			if (draw_rt_clone)
+			{
 				CopyRect(draw_rt, draw_rt_clone, drawarea, drawarea.left, drawarea.top);
 				draw_rt->TransitionToState(D3D12_RESOURCE_STATE_RENDER_TARGET);
+			}
+
+			if (draw_ds_clone)
+			{
+				CopyRect(draw_ds, draw_ds_clone, drawarea, drawarea.left, drawarea.top);
+				draw_ds->TransitionToState(D3D12_RESOURCE_STATE_DEPTH_WRITE);
+			}

 			if (one_barrier || full_barrier)
+			{
+				if (draw_rt_clone)
 					PSSetShaderResource(2, draw_rt_clone, true);
+				if (draw_ds_clone)
+					PSSetShaderResource(4, draw_ds_clone, true);
+			}
 			if (config.tex && config.tex == config.rt)
 				PSSetShaderResource(0, draw_rt_clone, true);
 		};
@ -4158,7 +4396,6 @@ void GSDevice12::SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig&
 			return;
 		}

-
 		// Optimization: For alpha second pass we can reuse the copy snapshot from the first pass.
 		if (!skip_first_barrier)
 			CopyAndBind(config.drawarea);
@ -4182,7 +4419,7 @@ void GSDevice12::UpdateHWPipelineSelector(GSHWDrawConfig& config)
 	m_pipeline_selector.ds = config.ds != nullptr;
 }

-void GSDevice12::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config)
+void GSDevice12::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config)
 {
 	IASetVertexBuffer(config.verts, sizeof(GSVertex), config.nverts);

@ -4200,4 +4437,7 @@ void GSDevice12::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config)
 	{
 		IASetIndexBuffer(config.indices, config.nindices);
 	}
+
+	// Needs to be done after vertex offset is set.
+	SetupAccuratePrimsConstants(config);
 }
--- a/pcsx2/GS/Renderers/DX12/GSDevice12.h
+++ b/pcsx2/GS/Renderers/DX12/GSDevice12.h
@ -129,6 +129,8 @@ public:
 	// Allocates a temporary CPU staging buffer, fires the callback with it to populate, then copies to a GPU buffer.
 	bool AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer, D3D12MA::Allocation** gpu_allocation,
 		const std::function<void(void*)>& fill_callback);
+	ID3D12Resource* AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data);
+	ID3D12Resource* WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out);

 private:
 	struct CommandListResources
@ -256,7 +258,8 @@ public:
 		NUM_TFX_CONSTANT_BUFFERS = 2,
 		NUM_TFX_TEXTURES = 2,
 		NUM_TFX_RT_TEXTURES = 2,
-		NUM_TOTAL_TFX_TEXTURES = NUM_TFX_TEXTURES + NUM_TFX_RT_TEXTURES,
+		NUM_TFX_DEPTH_TEXTURES = 1,
+		NUM_TOTAL_TFX_TEXTURES = NUM_TFX_TEXTURES + NUM_TFX_RT_TEXTURES + NUM_TFX_DEPTH_TEXTURES,
 		NUM_TFX_SAMPLERS = 1,
 		NUM_UTILITY_TEXTURES = 1,
 		NUM_UTILITY_SAMPLERS = 1,
@ -264,6 +267,10 @@ public:

 		VERTEX_BUFFER_SIZE = 32 * 1024 * 1024,
 		INDEX_BUFFER_SIZE = 16 * 1024 * 1024,
+
+		// Structured buffer size must be multiple of element size.
+		ACCURATE_PRIMS_BUFFER_SIZE = (32 * 1024 * 1024 / sizeof(AccuratePrimsEdgeData)) * sizeof(AccuratePrimsEdgeData),
+
 		VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024,
 		FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024,

@ -273,6 +280,8 @@ public:
 		TFX_ROOT_SIGNATURE_PARAM_PS_TEXTURES = 3,
 		TFX_ROOT_SIGNATURE_PARAM_PS_SAMPLERS = 4,
 		TFX_ROOT_SIGNATURE_PARAM_PS_RT_TEXTURES = 5,
+		TFX_ROOT_SIGNATURE_PARAM_PS_DEPTH_TEXTURES = 6,
+		TFX_ROOT_SIGNATURE_PARAM_PS_ACCURATE_PRIMS_SRV = 7,

 		UTILITY_ROOT_SIGNATURE_PARAM_PUSH_CONSTANTS = 0,
 		UTILITY_ROOT_SIGNATURE_PARAM_PS_TEXTURES = 1,
@ -299,6 +308,10 @@ private:

 	D3D12StreamBuffer m_vertex_stream_buffer;
 	D3D12StreamBuffer m_index_stream_buffer;
+	D3D12StreamBuffer m_accurate_prims_stream_buffer;
+	u32 m_accurate_prims_stream_buffer_offset = 0;  // Ring buffer offset for the current draw.
+	D3D12DescriptorHandle m_accurate_prims_srv_descriptor_cpu;
+	D3D12DescriptorHandle m_accurate_prims_srv_descriptor_gpu;
 	D3D12StreamBuffer m_vertex_constant_buffer;
 	D3D12StreamBuffer m_pixel_constant_buffer;
 	D3D12StreamBuffer m_texture_stream_buffer;
@ -455,6 +468,8 @@ public:

 	void IASetVertexBuffer(const void* vertex, size_t stride, size_t count);
 	void IASetIndexBuffer(const void* index, size_t count);
+	void SetupAccuratePrimsBuffer(GSHWDrawConfig& config);
+	void SetupAccuratePrimsConstants(GSHWDrawConfig& config);

 	void PSSetShaderResource(int i, GSTexture* sr, bool check_state);
 	void PSSetSampler(GSHWDrawConfig::SamplerSelector sel);
@ -466,10 +481,13 @@ public:
 	bool BindDrawPipeline(const PipelineSelector& p);

 	void RenderHW(GSHWDrawConfig& config) override;
-	void SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config, GSTexture12* draw_rt_clone, GSTexture12* draw_rt, const bool one_barrier, const bool full_barrier, const bool skip_first_barrier);
+	void SendHWDraw(const PipelineSelector& pipe, const GSHWDrawConfig& config,
+		GSTexture12* draw_rt_clone, GSTexture12* draw_rt,
+		GSTexture12* draw_ds_clone, GSTexture12* draw_ds,
+		const bool one_barrier, const bool full_barrier, const bool skip_first_barrier);

 	void UpdateHWPipelineSelector(GSHWDrawConfig& config);
-	void UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config);
+	void UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config);

 public:
 	/// Ends any render pass, executes the command buffer, and invalidates cached state.
@ -527,33 +545,37 @@ private:
 		DIRTY_FLAG_TFX_TEXTURES = (1 << 2),
 		DIRTY_FLAG_TFX_SAMPLERS = (1 << 3),
 		DIRTY_FLAG_TFX_RT_TEXTURES = (1 << 4),
+		DIRTY_FLAG_TFX_DEPTH_TEXTURES = (1 << 5),

-		DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING = (1 << 5),
-		DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING = (1 << 6),
-		DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING = (1 << 7),
-		DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE = (1 << 8),
-		DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE = (1 << 9),
-		DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 = (1 << 10),
+		DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING = (1 << 6),
+		DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING = (1 << 7),
+		DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING = (1 << 8),
+		DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING = (1 << 9),
+		DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE = (1 << 10),
+		DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE = (1 << 11),
+		DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 = (1 << 12),
+		DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 = (1 << 13),

-		DIRTY_FLAG_VERTEX_BUFFER = (1 << 11),
-		DIRTY_FLAG_INDEX_BUFFER = (1 << 12),
-		DIRTY_FLAG_PRIMITIVE_TOPOLOGY = (1 << 13),
-		DIRTY_FLAG_VIEWPORT = (1 << 14),
-		DIRTY_FLAG_SCISSOR = (1 << 15),
-		DIRTY_FLAG_RENDER_TARGET = (1 << 16),
-		DIRTY_FLAG_PIPELINE = (1 << 17),
-		DIRTY_FLAG_BLEND_CONSTANTS = (1 << 18),
-		DIRTY_FLAG_STENCIL_REF = (1 << 19),
+		DIRTY_FLAG_VERTEX_BUFFER = (1 << 14),
+		DIRTY_FLAG_INDEX_BUFFER = (1 << 15),
+		DIRTY_FLAG_PRIMITIVE_TOPOLOGY = (1 << 16),
+		DIRTY_FLAG_VIEWPORT = (1 << 17),
+		DIRTY_FLAG_SCISSOR = (1 << 18),
+		DIRTY_FLAG_RENDER_TARGET = (1 << 19),
+		DIRTY_FLAG_PIPELINE = (1 << 20),
+		DIRTY_FLAG_BLEND_CONSTANTS = (1 << 21),
+		DIRTY_FLAG_STENCIL_REF = (1 << 22),

 		DIRTY_BASE_STATE = DIRTY_FLAG_VS_CONSTANT_BUFFER_BINDING | DIRTY_FLAG_PS_CONSTANT_BUFFER_BINDING |
-		                   DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE |
-		                   DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 |
+		                   DIRTY_FLAG_VS_VERTEX_BUFFER_BINDING | DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING |
+		                   DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE | DIRTY_FLAG_SAMPLERS_DESCRIPTOR_TABLE | 
+		                   DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_2 | DIRTY_FLAG_TEXTURES_DESCRIPTOR_TABLE_3 |
 		                   DIRTY_FLAG_VERTEX_BUFFER | DIRTY_FLAG_INDEX_BUFFER | DIRTY_FLAG_PRIMITIVE_TOPOLOGY |
 		                   DIRTY_FLAG_VIEWPORT | DIRTY_FLAG_SCISSOR | DIRTY_FLAG_RENDER_TARGET | DIRTY_FLAG_PIPELINE |
 		                   DIRTY_FLAG_BLEND_CONSTANTS | DIRTY_FLAG_STENCIL_REF,

-		DIRTY_TFX_STATE =
-			DIRTY_BASE_STATE | DIRTY_FLAG_TFX_TEXTURES | DIRTY_FLAG_TFX_SAMPLERS | DIRTY_FLAG_TFX_RT_TEXTURES,
+		DIRTY_TFX_STATE = DIRTY_BASE_STATE | DIRTY_FLAG_TFX_TEXTURES | DIRTY_FLAG_TFX_SAMPLERS |
+		                  DIRTY_FLAG_TFX_RT_TEXTURES | DIRTY_FLAG_TFX_DEPTH_TEXTURES,
 		DIRTY_UTILITY_STATE = DIRTY_BASE_STATE,
 		DIRTY_CONSTANT_BUFFER_STATE = DIRTY_FLAG_VS_CONSTANT_BUFFER | DIRTY_FLAG_PS_CONSTANT_BUFFER,
 	};
@ -594,6 +616,7 @@ private:
 	D3D12DescriptorHandle m_tfx_textures_handle_gpu;
 	D3D12DescriptorHandle m_tfx_samplers_handle_gpu;
 	D3D12DescriptorHandle m_tfx_rt_textures_handle_gpu;
+	D3D12DescriptorHandle m_tfx_depth_textures_handle_gpu;

 	D3D12DescriptorHandle m_utility_texture_cpu;
 	D3D12DescriptorHandle m_utility_texture_gpu;
--- a/pcsx2/GS/Renderers/DX12/GSTexture12.cpp
+++ b/pcsx2/GS/Renderers/DX12/GSTexture12.cpp
@ -350,43 +350,6 @@ ID3D12GraphicsCommandList* GSTexture12::GetCommandBufferForUpdate()
 	return dev->GetInitCommandList();
 }

-ID3D12Resource* GSTexture12::AllocateUploadStagingBuffer(
-	const void* data, u32 pitch, u32 upload_pitch, u32 height) const
-{
-	const u32 buffer_size = CalcUploadSize(height, upload_pitch);
-	wil::com_ptr_nothrow<ID3D12Resource> resource;
-	wil::com_ptr_nothrow<D3D12MA::Allocation> allocation;
-
-	const D3D12MA::ALLOCATION_DESC allocation_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD};
-	const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, buffer_size, 1, 1, 1,
-		DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
-	HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocation_desc, &resource_desc,
-		D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(resource.put()));
-	if (FAILED(hr))
-	{
-		Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr);
-		return nullptr;
-	}
-
-	void* map_ptr;
-	hr = resource->Map(0, nullptr, &map_ptr);
-	if (FAILED(hr))
-	{
-		Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr);
-		return nullptr;
-	}
-
-	CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);
-
-	const D3D12_RANGE write_range = {0, buffer_size};
-	resource->Unmap(0, &write_range);
-
-	// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
-	// This adds the reference needed to keep the buffer alive.
-	GSDevice12::GetInstance()->DeferResourceDestruction(allocation.get(), resource.get());
-	return resource.get();
-}
-
 void GSTexture12::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const
 {
 	const u32 block_size = GetCompressedBlockSize();
@ -406,7 +369,7 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l
 	const u32 width = Common::AlignUpPow2(r.width(), block_size);
 	const u32 height = Common::AlignUpPow2(r.height(), block_size);
 	const u32 upload_pitch = Common::AlignUpPow2<u32>(pitch, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
-	const u32 required_size = CalcUploadSize(r.height(), upload_pitch);
+	const u32 required_size = CalcUploadSize(height, upload_pitch);

 	D3D12_TEXTURE_COPY_LOCATION srcloc;
 	srcloc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
@ -416,35 +379,25 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l
 	srcloc.PlacedFootprint.Footprint.Format = m_dxgi_format;
 	srcloc.PlacedFootprint.Footprint.RowPitch = upload_pitch;

+	const auto upload_data = [&](void* map_ptr) {
+		CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);
+	};
+
 	// If the texture is larger than half our streaming buffer size, use a separate buffer.
 	// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
 	if (required_size > (GSDevice12::GetInstance()->GetTextureStreamBuffer().GetSize() / 2))
 	{
-		srcloc.pResource = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height);
-		if (!srcloc.pResource)
-			return false;
-
+		srcloc.pResource = GSDevice12::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data);
 		srcloc.PlacedFootprint.Offset = 0;
 	}
 	else
 	{
-		D3D12StreamBuffer& sbuffer = GSDevice12::GetInstance()->GetTextureStreamBuffer();
-		if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
-		{
-			GSDevice12::GetInstance()->ExecuteCommandList(
-				false, "While waiting for %u bytes in texture upload buffer", required_size);
-			if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
-			{
-				Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size);
+		u32 offset;
+		srcloc.pResource = GSDevice12::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, offset);
+		srcloc.PlacedFootprint.Offset = offset;
+	}
+	if (!srcloc.pResource)
 		return false;
-			}
-		}
-
-		srcloc.pResource = sbuffer.GetBuffer();
-		srcloc.PlacedFootprint.Offset = sbuffer.GetCurrentOffset();
-		CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height);
-		sbuffer.CommitMemory(required_size);
-	}

 	ID3D12GraphicsCommandList* cmdlist = GetCommandBufferForUpdate();
 	GL_PUSH("GSTexture12::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer);
--- a/pcsx2/GS/Renderers/DX12/GSTexture12.h
+++ b/pcsx2/GS/Renderers/DX12/GSTexture12.h
@ -79,7 +79,6 @@ private:
 	static bool CreateUAVDescriptor(ID3D12Resource* resource, DXGI_FORMAT format, D3D12DescriptorHandle* dh);

 	ID3D12GraphicsCommandList* GetCommandBufferForUpdate();
-	ID3D12Resource* AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const;
 	void CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const;

 	wil::com_ptr_nothrow<ID3D12Resource> m_resource;
--- a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp
+++ b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp
@ -291,6 +291,360 @@ void GSRendererHW::Lines2Sprites()
 	}
 }

+// Builds two triangles (6 vertices written to out[0..5]) forming a quad that covers the
+// segment v0 -> v1.
+//
+// v0 / v1 are integer positions that are divided by 16 to obtain float coordinates. The quad
+// extends 2.0 float units beyond each endpoint along the segment direction and 2.0 float units
+// to each side along the segment normal. Only XYZ.X and XYZ.Y of the output vertices are set
+// (clamped to [0, 0xFFFF]); all other vertex fields are zero.
+//
+// A zero-length segment (v0 == v1) has no direction. Normalizing it would divide by zero and
+// feed NaN into a float -> int conversion, which is undefined behavior, so an arbitrary
+// axis-aligned direction is used instead. The result is then a square centered on the point.
+static __forceinline void GetCoveringQuad(const GSVector2i& v0, const GSVector2i& v1, GSVertex* out)
+{
+	// How far the quad is grown along the direction and the normal, in float units.
+	constexpr float expand = 2.0f;
+
+	const float x0 = static_cast<float>(v0.x) / 16.0f;
+	const float y0 = static_cast<float>(v0.y) / 16.0f;
+	const float x1 = static_cast<float>(v1.x) / 16.0f;
+	const float y1 = static_cast<float>(v1.y) / 16.0f;
+
+	float dx = x1 - x0;
+	float dy = y1 - y0;
+	const float d_len = sqrtf(dx * dx + dy * dy);
+	if (d_len > 0.0f)
+	{
+		// Direction scaled to the expansion length.
+		dx = expand * dx / d_len;
+		dy = expand * dy / d_len;
+	}
+	else
+	{
+		// Degenerate segment: any direction covers the point equally well.
+		dx = expand;
+		dy = 0.0f;
+	}
+
+	// Normal is the direction rotated by 90 degrees.
+	const float nx = -dy;
+	const float ny = dx;
+
+	// Back to the integer (x16) coordinate space of the inputs.
+	const int dxi = static_cast<int>(16.0f * dx);
+	const int dyi = static_cast<int>(16.0f * dy);
+	const int nxi = static_cast<int>(16.0f * nx);
+	const int nyi = static_cast<int>(16.0f * ny);
+
+	GSVertex v[4];
+	std::memset(v, 0, sizeof(v));
+
+	// Corners behind v0 (v[0], v[1]) and past v1 (v[2], v[3]), on either side of the segment.
+	v[0].XYZ.X = static_cast<u32>(std::clamp<int>(v0.x - dxi - nxi, 0, 0xFFFF));
+	v[0].XYZ.Y = static_cast<u32>(std::clamp<int>(v0.y - dyi - nyi, 0, 0xFFFF));
+
+	v[1].XYZ.X = static_cast<u32>(std::clamp<int>(v0.x - dxi + nxi, 0, 0xFFFF));
+	v[1].XYZ.Y = static_cast<u32>(std::clamp<int>(v0.y - dyi + nyi, 0, 0xFFFF));
+
+	v[2].XYZ.X = static_cast<u32>(std::clamp<int>(v1.x + dxi - nxi, 0, 0xFFFF));
+	v[2].XYZ.Y = static_cast<u32>(std::clamp<int>(v1.y + dyi - nyi, 0, 0xFFFF));
+
+	v[3].XYZ.X = static_cast<u32>(std::clamp<int>(v1.x + dxi + nxi, 0, 0xFFFF));
+	v[3].XYZ.Y = static_cast<u32>(std::clamp<int>(v1.y + dyi + nyi, 0, 0xFFFF));
+
+	// First triangle.
+	out[0] = v[0];
+	out[1] = v[1];
+	out[2] = v[2];
+
+	// Second triangle, sharing the v[1]-v[2] diagonal.
+	out[3] = v[1];
+	out[4] = v[2];
+	out[5] = v[3];
+}
+
+// Fills the per-endpoint attribute fields of `data` (t_float0/1, t_int0/1, p0/p1, c0/c1) for the
+// edge vtx0 -> vtx1, using the current vertex-shader constants in m_conf.cb_vs.
+//
+// vtx_provoking: when non-null, its FOG and RGBA are used for BOTH endpoints instead of the
+// per-endpoint values (callers pass it for flat-shaded primitives); when null, each endpoint
+// uses its own values.
+void GSRendererHW::GetAccuratePrimsEdgeVertexAttributes(const GSVertex& vtx0, const GSVertex& vtx1, const GSVertex* vtx_provoking, AccuratePrimsEdgeData& data)
+{
+	GSVector2i v0 = { static_cast<int>(vtx0.XYZ.X), static_cast<int>(vtx0.XYZ.Y) };
+	GSVector2i v1 = { static_cast<int>(vtx1.XYZ.X), static_cast<int>(vtx1.XYZ.Y) };
+
+	// Interpolated attributes - mimics transformations done in vertex shader.
+	GSVector2 uv0 = GSVector2(static_cast<float>(vtx0.U), static_cast<float>(vtx0.V)) - m_conf.cb_vs.texture_offset;
+	GSVector2 uv1 = GSVector2(static_cast<float>(vtx1.U), static_cast<float>(vtx1.V)) - m_conf.cb_vs.texture_offset;
+	GSVector2 uv0_scale = uv0 * m_conf.cb_vs.texture_scale;
+	GSVector2 uv1_scale = uv1 * m_conf.cb_vs.texture_scale;
+	GSVector2 st0 = GSVector2(vtx0.ST.S, vtx0.ST.T) - m_conf.cb_vs.texture_offset;
+	GSVector2 st1 = GSVector2(vtx1.ST.S, vtx1.ST.T) - m_conf.cb_vs.texture_offset;
+	// Scaled ST is only computed when texturing is enabled (PRIM->TME); otherwise zero.
+	GSVector2 st0_scale = PRIM->TME ? st0 / m_conf.cb_vs.texture_scale : GSVector2(0);
+	GSVector2 st1_scale = PRIM->TME ? st1 / m_conf.cb_vs.texture_scale : GSVector2(0);
+
+	// Fog is normalized from the 8-bit vertex value to [0, 1].
+	float fog0;
+	float fog1;
+	if (vtx_provoking)
+	{
+		fog0 = fog1 = static_cast<float>(vtx_provoking->FOG) / 255.0f;
+	}
+	else
+	{
+		fog0 = static_cast<float>(vtx0.FOG) / 255.0f;
+		fog1 = static_cast<float>(vtx1.FOG) / 255.0f;
+	}
+
+	// t_float: (S, T, fog, Q). t_int: xy = scaled UV; zw is filled in below.
+	data.t_float0 = GSVector4(st0.x, st0.y, fog0, vtx0.RGBAQ.Q);
+	data.t_float1 = GSVector4(st1.x, st1.y, fog1, vtx1.RGBAQ.Q);
+	data.t_int0 = GSVector4(uv0_scale.x, uv0_scale.y);
+	data.t_int1 = GSVector4(uv1_scale.x, uv1_scale.y);
+
+	// t_int.zw carries the unscaled UV in FST mode, otherwise the scaled ST.
+	if (m_conf.vs.fst)
+	{
+		data.t_int0.z = uv0.x;
+		data.t_int0.w = uv0.y;
+		data.t_int1.z = uv1.x;
+		data.t_int1.w = uv1.y;
+	}
+	else
+	{
+		data.t_int0.z = st0_scale.x;
+		data.t_int0.w = st0_scale.y;
+		data.t_int1.z = st1_scale.x;
+		data.t_int1.w = st1_scale.y;
+	}
+
+	// Depth is clamped to cb_vs.max_depth.x and later scaled by 2^-32.
+	constexpr float exp_min32 = 0x1p-32f;
+	float z0 = static_cast<float>(std::min(vtx0.XYZ.Z, static_cast<u32>(m_conf.cb_vs.max_depth.x)));
+	float z1 = static_cast<float>(std::min(vtx1.XYZ.Z, static_cast<u32>(m_conf.cb_vs.max_depth.x)));
+
+	// NOTE(review): the 0.05 bias presumably matches the same offset in the vertex shader - confirm.
+	GSVector2 xy0 = GSVector2(v0.x, v0.y) - GSVector2(0.05f);
+	GSVector2 xy1 = GSVector2(v1.x, v1.y) - GSVector2(0.05f);
+
+	xy0 = xy0 * m_conf.cb_vs.vertex_scale - m_conf.cb_vs.vertex_offset;
+	xy1 = xy1 * m_conf.cb_vs.vertex_scale - m_conf.cb_vs.vertex_offset;
+
+	// The Y coordinate is negated for the DX11 and DX12 renderers only.
+	GSRendererType renderer = GSGetCurrentRenderer();
+	float y_sign = (renderer == GSRendererType::DX11 || renderer == GSRendererType::DX12) ? -1.0f : 1.0f;
+	data.p0 = GSVector4(xy0.x, y_sign * xy0.y, z0 * exp_min32, 1.0f);
+	data.p1 = GSVector4(xy1.x, y_sign * xy1.y, z1 * exp_min32, 1.0f);
+
+	// Colors are stored as raw (un-normalized) component values converted to float.
+	if (vtx_provoking)
+	{
+		data.c0 = data.c1 = GSVector4(
+			static_cast<float>(vtx_provoking->RGBAQ.R),
+			static_cast<float>(vtx_provoking->RGBAQ.G),
+			static_cast<float>(vtx_provoking->RGBAQ.B),
+			static_cast<float>(vtx_provoking->RGBAQ.A));
+	}
+	else
+	{
+		data.c0 = GSVector4(
+			static_cast<float>(vtx0.RGBAQ.R),
+			static_cast<float>(vtx0.RGBAQ.G),
+			static_cast<float>(vtx0.RGBAQ.B),
+			static_cast<float>(vtx0.RGBAQ.A));
+		data.c1 = GSVector4(
+			static_cast<float>(vtx1.RGBAQ.R),
+			static_cast<float>(vtx1.RGBAQ.G),
+			static_cast<float>(vtx1.RGBAQ.B),
+			static_cast<float>(vtx1.RGBAQ.A));
+	}
+}
+
+// Emits everything needed to draw one triangle edge (vtx0 -> vtx1) with the accurate path:
+// - `data` receives the scissor-offset-relative endpoints, the two supplied edge equations,
+//   the major-axis flag (step_x), the side flag, and the endpoint attributes;
+// - `vertex_out` receives the 6 vertices of the quad covering the edge.
+// `top_left` is combined with the edge orientation to produce `data.side`.
+void GSRendererHW::ExpandAccurateTrianglesEdge(
+	const GSVertex& vtx0,
+	const GSVertex& vtx1,
+	const GSVertex* vtx_provoking,
+	const GSVector4i& edge0,
+	const GSVector4i& edge1,
+	bool top_left,
+	AccuratePrimsEdgeData& data,
+	GSVertex* vertex_out)
+{
+	const GSVector4i& offset = m_context->scissor.xyof;
+
+	const GSVector2i pos0 = { static_cast<int>(vtx0.XYZ.X), static_cast<int>(vtx0.XYZ.Y) };
+	const GSVector2i pos1 = { static_cast<int>(vtx1.XYZ.X), static_cast<int>(vtx1.XYZ.Y) };
+
+	// Edge equations are passed through untouched.
+	data.edge0 = edge0;
+	data.edge1 = edge1;
+
+	// Endpoints relative to the scissor offset.
+	data.xy0 = GSVector2i(pos0.x - offset.x, pos0.y - offset.y);
+	data.xy1 = GSVector2i(pos1.x - offset.x, pos1.y - offset.y);
+
+	const GSVector2i delta = data.xy1 - data.xy0;
+
+	// X is the stepping axis when the edge is at least as wide as it is tall.
+	const bool x_major = std::abs(delta.x) >= std::abs(delta.y);
+	const bool same_sign = (delta.x >= 0) == (delta.y >= 0);
+
+	// The side is inverted for non-horizontal X-major edges whose X and Y deltas share a sign.
+	const bool invert_side = x_major && (delta.y != 0) && same_sign;
+
+	data.step_x = x_major;
+	data.side = (top_left != invert_side);
+
+	GetAccuratePrimsEdgeVertexAttributes(vtx0, vtx1, vtx_provoking, data);
+
+	GetCoveringQuad(pos0, pos1, vertex_out);
+}
+
+// Lookup table that orders a triangle's three vertices by ascending Y.
+// The row index is the 3-bit mask ExpandAccurateTrianglesVertices() builds from the lane-wise
+// comparison (y0, y0, y1) > (y1, y2, y2); presumably bit 0 = y0 > y1, bit 1 = y0 > y2 and
+// bit 2 = y1 > y2 (verify against GSVector4::mask()). Each row lists the original vertex
+// indices in sorted order; only the first three entries of a row are read by that function.
+// Rows 2 and 5 are all zero: under the bit assignment above they would correspond to
+// contradictory comparison results.
+static const u8 s_ysort[8][4] =
+{
+	{0, 1, 2, 0}, // y0 <= y1 <= y2
+	{1, 0, 2, 0}, // y1 < y0 <= y2
+	{0, 0, 0, 0},
+	{1, 2, 0, 0}, // y1 <= y2 < y0
+	{0, 2, 1, 0}, // y0 <= y2 < y1
+	{0, 0, 0, 0},
+	{2, 0, 1, 0}, // y2 < y0 <= y1
+	{2, 1, 0, 0}, // y2 < y1 < y0
+};
+
+// Rewrites the current indexed triangle list into a non-indexed stream of 21 vertices per
+// triangle: the 3 (Y-sorted) vertices of the triangle interior followed by a 6-vertex covering
+// quad for each of the 3 edges. Per-edge data (3 entries per triangle) is written to
+// m_accurate_prims_edge_data.
+//
+// The new vertices are built in m_vertex.buff_copy, which is then swapped with m_vertex.buff.
+// The index buffer is replaced by the identity sequence 0..N-1.
+//
+// Degenerate triangles keep their 21 output slots so that vertex positions stay aligned with
+// the edge data index, but the slots are zero-filled so they rasterize nothing. Previously
+// they were skipped without being written, which left stale buff_copy contents in the draw.
+void GSRendererHW::ExpandAccurateTrianglesVertices()
+{
+	constexpr int verts_per_prim = 21; // 3 verts for triangle interior; 3 x 6 verts for the edges.
+	const int prims = m_index.tail / 3;
+
+	while (m_vertex.maxcount < static_cast<u32>(prims * verts_per_prim))
+		GrowVertexBuffer();
+
+	m_accurate_prims_edge_data.clear();
+	m_accurate_prims_edge_data.resize(3 * prims);
+
+	const GSVector4i& xyof = m_context->scissor.xyof;
+
+	const bool flat_shade = !PRIM->IIP;
+	const int provoking_offset = g_gs_device->Features().provoking_vertex_last ? 2 : 0;
+
+	// Replaces all output slots of a skipped primitive with zero-area geometry.
+	const auto ClearPrimSlots = [&](int prim) {
+		std::memset(&m_vertex.buff_copy[verts_per_prim * prim], 0, sizeof(GSVertex) * verts_per_prim);
+	};
+
+	for (int i = 0; i < prims; i++)
+	{
+		// Code from GSRasterizer
+		const GSVertex& vtx0_orig = m_vertex.buff[m_index.buff[3 * i + 0]];
+		const GSVertex& vtx1_orig = m_vertex.buff[m_index.buff[3 * i + 1]];
+		const GSVertex& vtx2_orig = m_vertex.buff[m_index.buff[3 * i + 2]];
+
+		const GSVector2i v0_orig = { static_cast<int>(vtx0_orig.XYZ.X) - xyof.x, static_cast<int>(vtx0_orig.XYZ.Y) - xyof.y };
+		const GSVector2i v1_orig = { static_cast<int>(vtx1_orig.XYZ.X) - xyof.x, static_cast<int>(vtx1_orig.XYZ.Y) - xyof.y };
+		const GSVector2i v2_orig = { static_cast<int>(vtx2_orig.XYZ.X) - xyof.x, static_cast<int>(vtx2_orig.XYZ.Y) - xyof.y };
+
+		// Pairwise Y comparisons select the row of s_ysort that sorts the vertices by Y.
+		GSVector4i y0011(v0_orig.y, v0_orig.y, v1_orig.y, v1_orig.y);
+		GSVector4i y1221(v1_orig.y, v2_orig.y, v2_orig.y, v1_orig.y);
+
+		int m1 = GSVector4::cast(y0011 > y1221).mask() & 7;
+
+		const u8* idx = s_ysort[m1];
+
+		const GSVertex* vtx[3] = { &vtx0_orig, &vtx1_orig, &vtx2_orig };
+		const GSVector2i* v[3] = { &v0_orig, &v1_orig, &v2_orig };
+
+		const GSVertex& vtx0 = *vtx[idx[0]];
+		const GSVertex& vtx1 = *vtx[idx[1]];
+		const GSVertex& vtx2 = *vtx[idx[2]];
+		// NOTE(review): the provoking vertex is picked AFTER the Y sort (first/last of the sorted
+		// order), not from the original submission order - confirm this is the intended vertex
+		// for flat shading.
+		const GSVertex* vtx_provoking = flat_shade ? vtx[idx[provoking_offset]] : nullptr;
+
+		const GSVector2i& v0 = *v[idx[0]];
+		const GSVector2i& v1 = *v[idx[1]];
+		const GSVector2i& v2 = *v[idx[2]];
+
+		y0011 = GSVector4i(v0.y, v0.y, v1.y, v1.y);
+		y1221 = GSVector4i(v1.y, v2.y, v2.y, v1.y);
+
+		m1 = GSVector4::cast(y0011 == y1221).mask() & 7;
+
+		if (m1 == 7)
+		{
+			// Degenerate triangle (all three Y values equal).
+			ClearPrimSlots(i);
+			continue;
+		}
+
+		GSVector2i dv0 = v1 - v0;
+		GSVector2i dv1 = v2 - v0;
+		GSVector2i dv2 = v2 - v1;
+
+		int cross = dv0.y * dv1.x - dv0.x * dv1.y;
+
+		if (cross == 0)
+		{
+			// Degenerate triangle (zero area).
+			ClearPrimSlots(i);
+			continue;
+		}
+
+		bool clockwise = cross < 0;
+
+		// Which of the three edges count as top/left edges for the fill rule.
+		const bool tl0 = (v0.y == v1.y) || !clockwise;
+		const bool tl1 = clockwise;
+		const bool tl2 = (v1.y != v2.y) && !clockwise;
+
+		// Edge equations: (x coefficient, y coefficient, constant, unused).
+		GSVector4i edge0 = GSVector4i( dv0.y, -dv0.x, 0, 0);
+		GSVector4i edge1 = GSVector4i(-dv1.y,  dv1.x, 0, 0);
+		GSVector4i edge2 = GSVector4i( dv2.y, -dv2.x, 0, 0);
+
+		edge0.z = v1.x * v0.y - v0.x * v1.y;
+		edge1.z = v0.x * v2.y - v2.x * v0.y;
+		edge2.z = v2.x * v1.y - v1.x * v2.y;
+
+		// Negate the equations for clockwise triangles.
+		if (clockwise)
+		{
+			edge0 = GSVector4i(0) - edge0;
+			edge1 = GSVector4i(0) - edge1;
+			edge2 = GSVector4i(0) - edge2;
+		}
+
+		// Bias for top-left edges.
+		edge0.z += tl0 ? 1 : 0;
+		edge1.z += tl1 ? 1 : 0;
+		edge2.z += tl2 ? 1 : 0;
+
+		// Interior triangle
+		m_vertex.buff_copy[verts_per_prim * i + 0] = vtx0;
+		m_vertex.buff_copy[verts_per_prim * i + 1] = vtx1;
+		m_vertex.buff_copy[verts_per_prim * i + 2] = vtx2;
+
+		// Edges: each one is given the equations of the other two edges.
+		ExpandAccurateTrianglesEdge(vtx0, vtx1, vtx_provoking, edge1, edge2, tl0, m_accurate_prims_edge_data[3 * i + 0],
+			&m_vertex.buff_copy[verts_per_prim * i + 3]);
+		ExpandAccurateTrianglesEdge(vtx0, vtx2, vtx_provoking, edge2, edge0, tl1, m_accurate_prims_edge_data[3 * i + 1],
+			&m_vertex.buff_copy[verts_per_prim * i + 9]);
+		ExpandAccurateTrianglesEdge(vtx1, vtx2, vtx_provoking, edge0, edge1, tl2, m_accurate_prims_edge_data[3 * i + 2],
+			&m_vertex.buff_copy[verts_per_prim * i + 15]);
+	}
+
+	// The expanded stream is drawn sequentially, so the indices become the identity mapping.
+	m_index.tail = prims * verts_per_prim;
+	for (std::size_t i = 0; i < m_index.tail; i++)
+	{
+		m_index.buff[i] = i;
+	}
+	m_vertex.next = m_vertex.tail = m_vertex.head = m_index.tail;
+
+	std::swap(m_vertex.buff, m_vertex.buff_copy);
+}
+
+// Rewrites the current indexed line list into a non-indexed stream of 6 vertices per line
+// (a quad covering the line) and fills one m_accurate_prims_edge_data entry per line.
+// The new vertices are built in m_vertex.buff_copy, which is then swapped with m_vertex.buff.
+// The index buffer is replaced by the identity sequence 0..N-1.
+void GSRendererHW::ExpandAccurateLinesVertices()
+{
+	constexpr int verts_per_prim = 6; // 6 verts to form quad covering each line.
+	const int prims = m_index.tail / 2;
+
+	const bool flat_shade = !PRIM->IIP;
+	const int provoking_offset = g_gs_device->Features().provoking_vertex_last ? 1 : 0;
+
+	// Endpoint test. `d` is the offset of an endpoint from its nearest multiple of 16 (see xy0_i /
+	// xy1_i below). Returns false when the Manhattan distance is below 8; otherwise requires the
+	// offset along the stepping axis to point in the step direction, with the boundary case
+	// (distance exactly 8) decided by the sign of the other axis.
+	// NOTE(review): this looks like a diamond-exit style rule - confirm against the reference rasterizer.
+	const auto ExitRule = [](const GSVector2i& d, bool step_x, bool pos_step) {
+		int dist = std::abs(d.x) + std::abs(d.y);
+		if (dist < 8)
+			return false;
+
+		if (step_x)
+		{
+			bool x_good = pos_step ? (d.x > 0) : (d.x < 0);
+			return x_good && (dist > 8 || d.y >= 0);
+		}
+		else
+		{
+			bool y_good = pos_step ? (d.y > 0) : (d.y < 0);
+			return y_good && (dist > 8 || d.x >= 0);
+		}
+	};
+
+	while (m_vertex.maxcount < static_cast<u32>(verts_per_prim * prims))
+		GrowVertexBuffer();
+
+	m_accurate_prims_edge_data.clear();
+	m_accurate_prims_edge_data.resize(prims);
+
+	const GSVector4i& xyof = m_context->scissor.xyof;
+
+	for (int i = 0; i < prims; i++)
+	{
+		const GSVertex& vtx0 = m_vertex.buff[m_index.buff[2 * i + 0]];
+		const GSVertex& vtx1 = m_vertex.buff[m_index.buff[2 * i + 1]];
+		// For flat shading both endpoints take their color/fog from the provoking vertex.
+		const GSVertex* vtx_provoking = flat_shade ? &m_vertex.buff[m_index.buff[2 * i + provoking_offset]] : nullptr;
+
+		// NOTE(review): a zero-length line (v0 == v1) is passed to GetCoveringQuad unchanged -
+		// confirm that helper copes with a zero direction vector.
+		const GSVector2i v0 = { static_cast<int>(vtx0.XYZ.X), static_cast<int>(vtx0.XYZ.Y) };
+		const GSVector2i v1 = { static_cast<int>(vtx1.XYZ.X), static_cast<int>(vtx1.XYZ.Y) };
+
+		AccuratePrimsEdgeData& data = m_accurate_prims_edge_data[i];
+
+		// Endpoints relative to the scissor offset.
+		data.xy0 = GSVector2i(v0.x - xyof.x, v0.y - xyof.y);
+		data.xy1 = GSVector2i(v1.x - xyof.x, v1.y - xyof.y);
+		const GSVector2i dxy = data.xy1 - data.xy0;
+		// Endpoints rounded to the nearest multiple of 16.
+		const GSVector2i xy0_i = (data.xy0 + 8) & GSVector2i(~0xF);
+		const GSVector2i xy1_i = (data.xy1 + 8) & GSVector2i(~0xF);
+		// X is the stepping axis when the line is at least as wide as it is tall.
+		data.step_x = std::abs(dxy.x) >= std::abs(dxy.y);
+		bool pos_step = data.step_x ? dxy.x >= 0 : dxy.y >= 0;
+		// draw0 is set when the start point does NOT satisfy the rule; draw1 when the end point does.
+		data.draw0 = !ExitRule(data.xy0 - xy0_i, data.step_x, pos_step);
+		data.draw1 = ExitRule(data.xy1 - xy1_i, data.step_x, pos_step);
+
+		GetAccuratePrimsEdgeVertexAttributes(vtx0, vtx1, vtx_provoking, data);
+
+		GetCoveringQuad(v0, v1, &m_vertex.buff_copy[i * verts_per_prim]);
+	}
+
+	// The expanded stream is drawn sequentially, so the indices become the identity mapping.
+	m_index.tail = prims * verts_per_prim;
+	for (std::size_t i = 0; i < m_index.tail; i++)
+	{
+		m_index.buff[i] = i;
+	}
+	m_vertex.next = m_vertex.tail = m_vertex.head = m_index.tail;
+
+	std::swap(m_vertex.buff, m_vertex.buff_copy);
+}
+
 void GSRendererHW::ExpandLineIndices()
 {
 	const u32 process_count = (m_index.tail + 7) / 8 * 8;
@ -2471,7 +2825,7 @@ void GSRendererHW::Draw()

 	// Need to fix the alpha test, since the alpha will be fixed to 1.0 if ABE is disabled and AA1 is enabled
 	// So if it doesn't meet the condition, always fail, if it does, always pass (turn off the test).
-	if (IsCoverageAlpha() && m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > 1)
+	if (IsCoverageAlphaFixedOne() && m_cached_ctx.TEST.ATE && m_cached_ctx.TEST.ATST > 1)
 	{
 		const float aref = static_cast<float>(m_cached_ctx.TEST.AREF);
 		const int old_ATST = m_cached_ctx.TEST.ATST;
@ -5017,6 +5371,21 @@ void GSRendererHW::SetupIA(float target_scale, float sx, float sy, bool req_vert
 			break;

 		case GS_LINE_CLASS:
+			{
+				if (features.accurate_prims)
+				{
+					GL_INS("HW: Using accurate lines");
+					ExpandAccurateLinesVertices();
+					m_conf.accurate_prims = true;
+					m_conf.accurate_prims_edge_data = &m_accurate_prims_edge_data;
+					m_conf.vs.accurate_prims = ACCURATE_PRIMS_LINE;
+					m_conf.ps.accurate_prims = ACCURATE_PRIMS_LINE;
+					m_conf.ps.accurate_prims_aa = (PRIM->AA1 != 0);
+					m_conf.ps.accurate_prims_aa_abe = (PRIM->ABE != 0);
+					m_conf.topology = GSHWDrawConfig::Topology::Triangle;
+					m_conf.indices_per_prim = 6;
+				}
+				else
 				{
 					m_conf.topology = GSHWDrawConfig::Topology::Line;
 					m_conf.indices_per_prim = 2;
@ -5036,6 +5405,7 @@ void GSRendererHW::SetupIA(float target_scale, float sx, float sy, bool req_vert
 						}
 					}
 				}
+			}
 			break;

 		case GS_SPRITE_CLASS:
@ -5076,6 +5446,20 @@ void GSRendererHW::SetupIA(float target_scale, float sx, float sy, bool req_vert
 			break;

 		case GS_TRIANGLE_CLASS:
+			if (features.accurate_prims && PRIM->AA1)
+			{
+				GL_INS("HW: Using accurate triangles");
+				ExpandAccurateTrianglesVertices();
+				m_conf.accurate_prims = true;
+				m_conf.accurate_prims_edge_data = &m_accurate_prims_edge_data;
+				m_conf.vs.accurate_prims = ACCURATE_PRIMS_TRIANGLE;
+				m_conf.ps.accurate_prims = ACCURATE_PRIMS_TRIANGLE;
+				m_conf.ps.accurate_prims_aa = (PRIM->AA1 != 0);
+				m_conf.ps.accurate_prims_aa_abe = (PRIM->ABE != 0);
+				m_conf.topology = GSHWDrawConfig::Topology::Triangle;
+				m_conf.indices_per_prim = 21;
+			}
+			else
 			{
 				m_conf.topology = GSHWDrawConfig::Topology::Triangle;
 				m_conf.indices_per_prim = 3;
@ -5130,6 +5514,10 @@ void GSRendererHW::EmulateZbuffer(const GSTextureCache::Target* ds)
 		m_conf.depth.ztst = ZTST_ALWAYS;
 	}

+	// Accurate prims requires a manual depth interpolation in the pixel shader.
+	// Piggy-back on Z clamp to avoid creating more pipeline combinations.
+	bool accurate_prims_clamp_z = UsingAccuratePrims() && (m_conf.depth.zwe || m_conf.depth.ztst != ZTST_ALWAYS);
+
 	// On the real GS we appear to do clamping on the max z value the format allows.
 	// Clamping is done after rasterization.
 	const u32 max_z = 0xFFFFFFFF >> (GSLocalMemory::m_psm[m_cached_ctx.ZBUF.PSM].fmt * 8);
@ -5139,16 +5527,23 @@ void GSRendererHW::EmulateZbuffer(const GSTextureCache::Target* ds)
 	//ps_cb.MaxDepth = GSVector4(0.0f, 0.0f, 0.0f, 1.0f);
 	m_conf.ps.zclamp = 0;

-	if (clamp_z)
+	if (clamp_z || accurate_prims_clamp_z)
 	{
 		if (m_vt.m_primclass == GS_SPRITE_CLASS || m_vt.m_primclass == GS_POINT_CLASS)
 		{
 			m_conf.cb_vs.max_depth = GSVector2i(max_z);
 		}
-		else if (!m_cached_ctx.ZBUF.ZMSK)
+		else if (!m_cached_ctx.ZBUF.ZMSK || accurate_prims_clamp_z)
 		{
 			m_conf.cb_ps.TA_MaxDepth_Af.z = static_cast<float>(max_z) * 0x1p-32f;
 			m_conf.ps.zclamp = 1;
+			if (accurate_prims_clamp_z && m_vt.m_primclass == GS_TRIANGLE_CLASS && PRIM->AA1 &&
+				m_cached_ctx.TEST.ZTE && (m_conf.depth.ztst == ZTST_GEQUAL || m_conf.depth.ztst == ZTST_GREATER))
+			{
+				// For HW AA1 with triangles we must do Z test in the shader to get proper
+				// updating of the Z buffer (interior triangle points update the Z buffer but edges should not).
+				m_conf.ps.ztst = m_conf.depth.ztst;
+			}
 		}
 	}
 }
@ -5619,15 +6014,13 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
 {
 	const GIFRegALPHA& ALPHA = m_context->ALPHA;
 	{
-		// AA1: Blending needs to be enabled on draw.
-		const bool AA1 = PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS || m_vt.m_primclass == GS_TRIANGLE_CLASS);
 		// PABE: Check condition early as an optimization, no blending when As < 128.
 		// For Cs*As + Cd*(1 - As) if As is 128 then blending can be disabled as well.
 		const bool PABE_skip = m_draw_env->PABE.PABE &&
 			((GetAlphaMinMax().max < 128) || (GetAlphaMinMax().max == 128 && ALPHA.A == 0 && ALPHA.B == 1 && ALPHA.C == 0 && ALPHA.D == 1));

 		// No blending or coverage anti-aliasing so early exit
-		if (PABE_skip || !(NeedsBlending() || AA1))
+		if (PABE_skip || !(NeedsBlending() || IsCoverageAlpha()))
 		{
 			m_conf.blend = {};
 			m_conf.ps.no_color1 = true;
@ -7315,8 +7708,8 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
 		const bool is_overlap_alpha = m_prim_overlap != PRIM_OVERLAP_NO && !(m_cached_ctx.FRAME.FBMSK & 0x80000000);
 		if (m_cached_ctx.TEST.DATM == 0)
 		{
-			// Some pixles are >= 1 so some fail, or some pixels get written but the written alpha matches or exceeds 1 (so overlap doesn't always pass).
-			DATE = rt->m_alpha_max >= 128 || (is_overlap_alpha && rt->m_alpha_min < 128 && (GetAlphaMinMax().max >= 128 || (m_context->FBA.FBA || IsCoverageAlpha())));
+			// Some pixels are >= 1 so some fail, or some pixels get written but the written alpha matches or exceeds 1 (so overlap doesn't always pass).
+			DATE = rt->m_alpha_max >= 128 || (is_overlap_alpha && rt->m_alpha_min < 128 && (GetAlphaMinMax().max >= 128 || (m_context->FBA.FBA || IsCoverageAlphaFixedOne())));

 			// All pixels fail.
 			if (DATE && rt->m_alpha_min >= 128)
@ -7324,8 +7717,8 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
 		}
 		else
 		{
-			// Some pixles are < 1 so some fail, or some pixels get written but the written alpha goes below 1 (so overlap doesn't always pass).
-			DATE = rt->m_alpha_min < 128 || (is_overlap_alpha && rt->m_alpha_max >= 128 && (GetAlphaMinMax().min < 128 && !(m_context->FBA.FBA || IsCoverageAlpha())));
+			// Some pixels are < 1 so some fail, or some pixels get written but the written alpha goes below 1 (so overlap doesn't always pass).
+			DATE = rt->m_alpha_min < 128 || (is_overlap_alpha && rt->m_alpha_max >= 128 && (GetAlphaMinMax().min < 128 && !(m_context->FBA.FBA || IsCoverageAlphaFixedOne())));

 			// All pixels fail.
 			if (DATE && rt->m_alpha_max < 128)
@ -7477,7 +7870,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
 		}
 		// When Blending is disabled and Edge Anti Aliasing is enabled,
 		// the output alpha is Coverage (which we force to 128) so DATE will fail/pass guaranteed on second pass.
-		else if (m_conf.colormask.wa && (m_context->FBA.FBA || IsCoverageAlpha()) && features.stencil_buffer)
+		else if (m_conf.colormask.wa && (m_context->FBA.FBA || IsCoverageAlphaFixedOne()) && features.stencil_buffer)
 		{
 			GL_PERF("DATE: Fast with FBA, all pixels will be >= 128");
 			DATE_one = !m_cached_ctx.TEST.DATM;
@ -7663,7 +8056,7 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
 	}

 	// AA1: Set alpha source to coverage 128 when there is no alpha blending.
-	m_conf.ps.fixed_one_a = IsCoverageAlpha();
+	m_conf.ps.fixed_one_a = IsCoverageAlphaFixedOne();

 	if ((!IsOpaque() || m_context->ALPHA.IsBlack()) && rt && ((m_conf.colormask.wrgba & 0x7) || (m_texture_shuffle && !m_copy_16bit_to_target_shuffle && !m_same_group_texture_shuffle)))
 	{
@ -8030,6 +8423,23 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
 		m_conf.require_full_barrier = false;
 	}

+	if ((features.texture_barrier || features.multidraw_fb_copy) && UsingAccuratePrims() &&
+		(m_vt.m_primclass == GS_TRIANGLE_CLASS) && PRIM->AA1 && m_conf.ps.zclamp)
+	{
+		// Manual depth test in the shader requires full barrier.
+		if (m_prim_overlap == PRIM_OVERLAP_NO)
+			m_conf.require_one_barrier = true;
+		else
+			m_conf.require_full_barrier = true;
+	}
+
+	if (m_conf.require_full_barrier && (g_gs_device->Features().texture_barrier || g_gs_device->Features().multidraw_fb_copy))
+	{
+		ComputeDrawlistGetSize(rt->m_scale);
+		m_conf.drawlist = &m_drawlist;
+		m_conf.drawlist_bbox = &m_drawlist_bbox;
+	}
+
 	// rs
 	const GSVector4i hacked_scissor = m_channel_shuffle ? GSVector4i::cxpr(0, 0, 1024, 1024) : m_context->scissor.in;
 	const GSVector4i scissor(GSVector4i(GSVector4(rtscale) * GSVector4(hacked_scissor)).rintersect(GSVector4i::loadh(rtsize)));
@ -8125,13 +8535,6 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
 		m_conf.alpha_second_pass.enable = false;
 	}
 	
-	if (m_conf.require_full_barrier && (g_gs_device->Features().texture_barrier || g_gs_device->Features().multidraw_fb_copy))
-	{
-		ComputeDrawlistGetSize(rt->m_scale);
-		m_conf.drawlist = &m_drawlist;
-		m_conf.drawlist_bbox = &m_drawlist_bbox;
-	}
-
 	if (!m_channel_shuffle_width)
 		g_gs_device->RenderHW(m_conf);
 	else
@ -9574,3 +9977,10 @@ std::size_t GSRendererHW::ComputeDrawlistGetSize(float scale)
 	}
 	return m_drawlist.size();
 }
+
+// True when the current draw uses coverage alpha, the primitive class is a line or a triangle,
+// and the device reports the accurate_prims feature.
+bool GSRendererHW::IsCoverageAlphaSupported()
+{
+	if (!IsCoverageAlpha())
+		return false;
+
+	const bool line_or_triangle =
+		(m_vt.m_primclass == GS_LINE_CLASS) || (m_vt.m_primclass == GS_TRIANGLE_CLASS);
+
+	// The device feature is only queried for the primitive classes that can use it.
+	return line_or_triangle && g_gs_device->Features().accurate_prims;
+}
--- a/pcsx2/GS/Renderers/HW/GSRendererHW.h
+++ b/pcsx2/GS/Renderers/HW/GSRendererHW.h
@ -137,6 +137,21 @@ private:
 	bool IsUsingCsInBlend();
 	bool IsUsingAsInBlend();

+	void GetAccuratePrimsEdgeVertexAttributes(
+		const GSVertex& vtx0,
+		const GSVertex& vtx1,
+		const GSVertex* vtx_provoking,
+		AccuratePrimsEdgeData& data);
+	void ExpandAccurateTrianglesEdge(
+		const GSVertex& vtx0,
+		const GSVertex& vtx1,
+		const GSVertex* vtx_provoking,
+		const GSVector4i& edge0,
+		const GSVector4i& edge1,
+		bool top_left,
+		AccuratePrimsEdgeData& data,
+		GSVertex* vertex_out);
+
 	// We modify some of the context registers to optimize away unnecessary operations.
 	// Instead of messing with the real context, we copy them and use those instead.
 	struct HWCachedCtx
@ -205,6 +220,8 @@ private:
 	std::unique_ptr<GSTextureCacheSW::Texture> m_sw_texture[7 + 1];
 	std::unique_ptr<GSVirtualAlignedClass<32>> m_sw_rasterizer;

+	std::vector<AccuratePrimsEdgeData> m_accurate_prims_edge_data;
+
 public:
 	GSRendererHW();
 	virtual ~GSRendererHW() override;
@ -221,6 +238,8 @@ public:
 	void Lines2Sprites();
 	bool VerifyIndices();
 	void ExpandLineIndices();
+	void ExpandAccurateLinesVertices();
+	void ExpandAccurateTrianglesVertices();
 	void ConvertSpriteTextureShuffle(u32& process_rg, u32& process_ba, bool& shuffle_across, GSTextureCache::Target* rt, GSTextureCache::Source* tex);
 	GSVector4 RealignTargetTextureCoordinate(const GSTextureCache::Source* tex);
 	GSVector4i ComputeBoundingBox(const GSVector2i& rtsize, float rtscale);
@ -273,4 +292,6 @@ public:

 	/// Compute the drawlist (if not already present) and bounding boxes for the current draw.
 	std::size_t ComputeDrawlistGetSize(float scale);
+
+	bool IsCoverageAlphaSupported() override;
 };
--- a/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h
+++ b/pcsx2/GS/Renderers/Metal/GSMTLSharedHeader.h
@ -94,6 +94,11 @@ struct GSMTLMainVSUniform
 	vector_float2 texture_offset;
 	vector_float2 point_size;
 	uint max_depth;
+	uint _pad0;
+	uint base_vertex;
+	uint _pad1;
+	uint _pad2;
+	uint _pad3;
 };

 struct GSMTLMainPSUniform
@ -134,6 +139,8 @@ struct GSMTLMainPSUniform
 	matrix_float4x4 dither_matrix;

 	vector_float4 scale_factor;
+
+	vector_uint4 accurate_prims_base_index;
 };

 enum GSMTLAttributes
--- a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp
+++ b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp
@ -310,10 +310,10 @@ namespace
 	};
 } // namespace

-std::unique_ptr<GLStreamBuffer> GLStreamBuffer::Create(GLenum target, u32 size)
+std::unique_ptr<GLStreamBuffer> GLStreamBuffer::Create(GLenum target, u32 size, bool nonsyncing)
 {
 	std::unique_ptr<GLStreamBuffer> buf;
-	if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage)
+	if (!nonsyncing && (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage))
 	{
 		buf = BufferStorageStreamBuffer::Create(target, size);
 		if (buf)
--- a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h
+++ b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h
@ -38,7 +38,7 @@ public:
 	/// Returns the minimum granularity of blocks which sync objects will be created around.
 	virtual u32 GetChunkSize() const = 0;

-	static std::unique_ptr<GLStreamBuffer> Create(GLenum target, u32 size);
+	static std::unique_ptr<GLStreamBuffer> Create(GLenum target, u32 size, bool nonsyncing = false);

 protected:
 	GLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);
--- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp
+++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp
@ -26,6 +26,7 @@ static constexpr u32 g_ps_cb_index        = 0;

 static constexpr u32 VERTEX_BUFFER_SIZE = 32 * 1024 * 1024;
 static constexpr u32 INDEX_BUFFER_SIZE = 16 * 1024 * 1024;
+static constexpr u32 ACCURATE_PRIMS_BUFFER_SIZE = 32 * 1024 * 1024;
 static constexpr u32 VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024;
 static constexpr u32 FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024;
 static constexpr u32 TEXTURE_UPLOAD_BUFFER_SIZE = 128 * 1024 * 1024;
@ -258,10 +259,18 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)

 		m_vertex_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE);
 		m_index_stream_buffer = GLStreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE);
+		if (m_features.accurate_prims)
+		{
+			// Performance note: prefer a non-syncing buffer for accurate prims so that it is more likely to be GPU local.
+			// Rationale: we expect this buffer to be updated relatively rarely and it's used as a pixel shader resource.
+			m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE, true);
+		}
 		m_vertex_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, VERTEX_UNIFORM_BUFFER_SIZE);
 		m_fragment_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, FRAGMENT_UNIFORM_BUFFER_SIZE);
 		glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &m_uniform_buffer_alignment);
-		if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer)
+		if (!m_vertex_stream_buffer || !m_index_stream_buffer ||
+			(m_features.accurate_prims && !m_accurate_prims_stream_buffer) ||
+			!m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer)
 		{
 			Host::ReportErrorAsync("GS", "Failed to create vertex/index/uniform streaming buffers");
 			return false;
@ -303,6 +312,11 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
 			glBufferData(GL_ELEMENT_ARRAY_BUFFER, EXPAND_BUFFER_SIZE, expand_data.get(), GL_STATIC_DRAW);
 			glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, m_vertex_stream_buffer->GetGLBufferId(), 0, VERTEX_BUFFER_SIZE);
 		}
+
+		if (m_features.accurate_prims)
+		{
+			glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 3, m_accurate_prims_stream_buffer->GetGLBufferId(), 0, ACCURATE_PRIMS_BUFFER_SIZE);
+		}
 	}

 	// ****************************************************************
@ -770,6 +784,8 @@ bool GSDeviceOGL::CheckFeatures()
 		m_features.line_expand ? "hardware" : (m_features.vs_expand ? "vertex expanding" : "UNSUPPORTED"),
 		m_features.vs_expand ? "vertex expanding" : "CPU");

+	m_features.accurate_prims = GSConfig.HWAccuratePrims;
+
 	return true;
 }

@ -840,6 +856,7 @@ void GSDeviceOGL::DestroyResources()

 	m_fragment_uniform_stream_buffer.reset();
 	m_vertex_uniform_stream_buffer.reset();
+	m_accurate_prims_stream_buffer.reset();

 	glBindVertexArray(0);
 	if (m_expand_ibo != 0)
@ -1330,8 +1347,9 @@ std::string GSDeviceOGL::GetVSSource(VSSelector sel)
 	std::string macro = fmt::format("#define VS_FST {}\n", static_cast<u32>(sel.fst))
 		+ fmt::format("#define VS_IIP {}\n", static_cast<u32>(sel.iip))
 		+ fmt::format("#define VS_POINT_SIZE {}\n", static_cast<u32>(sel.point_size))
-	  + fmt::format("#define VS_EXPAND {}\n", static_cast<int>(sel.expand));
-
+		+ fmt::format("#define VS_EXPAND {}\n", static_cast<int>(sel.expand))
+		+ fmt::format("#define VS_ACCURATE_PRIMS {}\n", static_cast<int>(sel.accurate_prims))
+	;
 	std::string src = GenGlslHeader("vs_main", GL_VERTEX_SHADER, macro);
 	src += m_shader_tfx_vgs;
 	return src;
@ -1396,6 +1414,10 @@ std::string GSDeviceOGL::GetPSSource(const PSSelector& sel)
 		+ fmt::format("#define PS_SCANMSK {}\n", sel.scanmsk)
 		+ fmt::format("#define PS_NO_COLOR {}\n", sel.no_color)
 		+ fmt::format("#define PS_NO_COLOR1 {}\n", sel.no_color1)
+		+ fmt::format("#define PS_ACCURATE_PRIMS {}\n", sel.accurate_prims)
+		+ fmt::format("#define PS_ACCURATE_PRIMS_AA {}\n", sel.accurate_prims_aa)
+		+ fmt::format("#define PS_ACCURATE_PRIMS_AA_ABE {}\n", sel.accurate_prims_aa_abe)
+		+ fmt::format("#define PS_ZTST {}\n", sel.ztst)
 	;

 	std::string src = GenGlslHeader("ps_main", GL_FRAGMENT_SHADER, macro);
@ -2012,6 +2034,21 @@ void GSDeviceOGL::ClearSamplerCache()
 	}
 }

+void GSDeviceOGL::SetupAccuratePrims(GSHWDrawConfig& config) // Streams this draw's edge data and records its base offsets in the VS/PS constants.
+{
+	if (config.accurate_prims) // Nothing to do unless config.accurate_prims is set.
+	{
+		const u32 count = config.accurate_prims_edge_data->size();
+		const u32 size = count * sizeof(AccuratePrimsEdgeData); // Upload size in bytes.
+		auto res = m_accurate_prims_stream_buffer->Map(sizeof(AccuratePrimsEdgeData), size); // First argument is presumably the alignment (one element) - confirm against GLStreamBuffer::Map.
+		std::memcpy(res.pointer, config.accurate_prims_edge_data->data(), size);
+		m_accurate_prims_stream_buffer->Unmap(size);
+		// Record where this draw's data starts: the vertex stream start for the VS, the mapped edge data index for the PS.
+		config.cb_vs.base_vertex.x = m_vertex.start;
+		config.cb_ps.accurate_prims_base_index.x = res.index_aligned; // Presumably counted in AccuratePrimsEdgeData elements rather than bytes - confirm.
+	}
+}
+
 bool GSDeviceOGL::CreateCASPrograms()
 {
 	std::optional<std::string> cas_source = ReadShaderSource("shaders/opengl/cas.glsl");
@ -2525,6 +2562,8 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config)
 	IASetVertexBuffer(config.verts, config.nverts, GetVertexAlignment(config.vs.expand));
 	m_vertex.start *= GetExpansionFactor(config.vs.expand);

+	SetupAccuratePrims(config);
+
 	if (config.vs.UseExpandIndexBuffer())
 	{
 		IASetVAO(m_expand_vao);
@ -2554,6 +2593,8 @@ void GSDeviceOGL::RenderHW(GSHWDrawConfig& config)
 		PSSetShaderResource(2, draw_rt_clone);
 	else if (config.require_one_barrier || config.require_full_barrier)
 		PSSetShaderResource(2, colclip_rt ? colclip_rt : config.rt);
+	if ((config.require_one_barrier || config.require_full_barrier) && config.ps.IsFeedbackLoopDepth())
+		PSSetShaderResource(4, config.ds);

 	SetupSampler(config.sampler);

@ -2761,7 +2802,7 @@ void GSDeviceOGL::SendHWDraw(const GSHWDrawConfig& config, bool one_barrier, boo
 	}

 #ifdef PCSX2_DEVBUILD
-	if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]]
+	if ((one_barrier || full_barrier) && !(config.ps.IsFeedbackLoop() || config.ps.IsFeedbackLoopDepth())) [[unlikely]]
 		Console.Warning("OpenGL: Possible unnecessary barrier detected.");
 #endif

--- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h
+++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h
@ -157,6 +157,7 @@ private:

 	std::unique_ptr<GLStreamBuffer> m_vertex_stream_buffer;
 	std::unique_ptr<GLStreamBuffer> m_index_stream_buffer;
+	std::unique_ptr<GLStreamBuffer> m_accurate_prims_stream_buffer;
 	GLuint m_expand_ibo = 0;
 	GLuint m_vao = 0;
 	GLuint m_expand_vao = 0;
@ -346,6 +347,7 @@ public:
 	void IASetPrimitiveTopology(GLenum topology);
 	void IASetVertexBuffer(const void* vertices, size_t count, size_t align_multiplier = 1);
 	void IASetIndexBuffer(const void* index, size_t count);
+	void SetupAccuratePrims(GSHWDrawConfig& config);

 	void PSSetShaderResource(int i, GSTexture* sr);
 	void PSSetSamplerState(GLuint ss);
--- a/pcsx2/GS/Renderers/SW/GSRendererSW.h
+++ b/pcsx2/GS/Renderers/SW/GSRendererSW.h
@ -82,6 +82,8 @@ protected:
 	template <u32 primclass>
 	void RewriteVerticesIfSTOverflow();

+	bool IsCoverageAlphaSupported() override { return true; }
+
 public:
 	GSRendererSW(int threads);
 	~GSRendererSW() override;
--- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp
+++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp
@ -41,6 +41,7 @@ enum : u32

 	VERTEX_BUFFER_SIZE = 32 * 1024 * 1024,
 	INDEX_BUFFER_SIZE = 16 * 1024 * 1024,
+	ACCURATE_PRIMS_BUFFER_SIZE = 32 * 1024 * 1024,
 	VERTEX_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024,
 	FRAGMENT_UNIFORM_BUFFER_SIZE = 8 * 1024 * 1024,
 	TEXTURE_BUFFER_SIZE = 64 * 1024 * 1024,
@ -932,7 +933,7 @@ bool GSDeviceVK::CreateGlobalDescriptorPool()
 {
 	static constexpr const VkDescriptorPoolSize pool_sizes[] = {
 		{VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 2},
-		{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2},
+		{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3},
 	};

 	VkDescriptorPoolCreateInfo pool_create_info = {VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr,
@ -1501,12 +1502,13 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key)
 	VkAttachmentReference* color_reference_ptr = nullptr;
 	VkAttachmentReference depth_reference;
 	VkAttachmentReference* depth_reference_ptr = nullptr;
-	VkAttachmentReference input_reference;
-	VkAttachmentReference* input_reference_ptr = nullptr;
-	VkSubpassDependency subpass_dependency;
-	VkSubpassDependency* subpass_dependency_ptr = nullptr;
+	std::array<VkAttachmentReference, 2> input_reference;
+	u32 num_subpass_inputs = 0;
+	std::array<VkSubpassDependency, 2> subpass_dependency;
+	u32 num_subpass_dependencies = 0;
 	std::array<VkAttachmentDescription, 2> attachments;
 	u32 num_attachments = 0;
+	bool actual_color_feedback_loop = false;
 	if (key.color_format != VK_FORMAT_UNDEFINED)
 	{
 		const VkImageLayout layout =
@ -1522,28 +1524,32 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key)

 		if (key.color_feedback_loop)
 		{
+			actual_color_feedback_loop = true;
+
 			if (!UseFeedbackLoopLayout())
 			{
-				input_reference.attachment = num_attachments;
-				input_reference.layout = layout;
-				input_reference_ptr = &input_reference;
+				pxAssert(num_subpass_inputs == 0); // Must always have the color input first.
+				input_reference[num_subpass_inputs].attachment = num_attachments;
+				input_reference[num_subpass_inputs].layout = layout;
+				num_subpass_inputs++;
 			}

 			if (!m_features.framebuffer_fetch)
 			{
+				pxAssert(num_subpass_dependencies == 0); // Must always have the color input first.
 				// don't need the framebuffer-local dependency when we have rasterization order attachment access
-				subpass_dependency.srcSubpass = 0;
-				subpass_dependency.dstSubpass = 0;
-				subpass_dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
-				subpass_dependency.dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
-				subpass_dependency.srcAccessMask =
+				subpass_dependency[num_subpass_dependencies].srcSubpass = 0;
+				subpass_dependency[num_subpass_dependencies].dstSubpass = 0;
+				subpass_dependency[num_subpass_dependencies].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
+				subpass_dependency[num_subpass_dependencies].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
+				subpass_dependency[num_subpass_dependencies].srcAccessMask =
 					VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
-				subpass_dependency.dstAccessMask =
+				subpass_dependency[num_subpass_dependencies].dstAccessMask =
 					UseFeedbackLoopLayout() ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
-				subpass_dependency.dependencyFlags =
+				subpass_dependency[num_subpass_dependencies].dependencyFlags =
 					UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) :
 											  VK_DEPENDENCY_BY_REGION_BIT;
-				subpass_dependency_ptr = &subpass_dependency;
+				num_subpass_dependencies++;
 			}
 		}

@ -1562,6 +1568,41 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key)
 		depth_reference.attachment = num_attachments;
 		depth_reference.layout = layout;
 		depth_reference_ptr = &depth_reference;
+
+		if (actual_color_feedback_loop && key.depth_sampling)
+		{
+			// Note: We only allow depth to be bound in a feedback loop if color is already bound as such.
+			// This is partly because it doesn't seem likely that we will ever need a depth feedback loop
+			// without a color feedback loop, and partly to simplify the indices for subpass inputs (0 for color, 1 for depth).
+
+			if (!UseFeedbackLoopLayout())
+			{
+				pxAssert(num_subpass_inputs == 1); // Must always have the color input first.
+				input_reference[num_subpass_inputs].attachment = num_attachments;
+				input_reference[num_subpass_inputs].layout = layout;
+				num_subpass_inputs++;
+			}
+
+			if (!m_features.framebuffer_fetch)
+			{
+				pxAssert(num_subpass_dependencies == 1); // Must always have the color input first.
+				// don't need the framebuffer-local dependency when we have rasterization order attachment access
+				subpass_dependency[num_subpass_dependencies].srcSubpass = 0;
+				subpass_dependency[num_subpass_dependencies].dstSubpass = 0;
+				subpass_dependency[num_subpass_dependencies].srcStageMask =
+					VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
+				subpass_dependency[num_subpass_dependencies].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
+				subpass_dependency[num_subpass_dependencies].srcAccessMask =
+					VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+				subpass_dependency[num_subpass_dependencies].dstAccessMask =
+					UseFeedbackLoopLayout() ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
+				subpass_dependency[num_subpass_dependencies].dependencyFlags =
+					UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) :
+											  VK_DEPENDENCY_BY_REGION_BIT;
+				num_subpass_dependencies++;
+			}
+		}
+
 		num_attachments++;
 	}

@ -1569,11 +1610,11 @@ VkRenderPass GSDeviceVK::CreateCachedRenderPass(RenderPassCacheKey key)
 		(key.color_feedback_loop && m_optional_extensions.vk_ext_rasterization_order_attachment_access) ?
 			VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT :
 			0;
-	const VkSubpassDescription subpass = {subpass_flags, VK_PIPELINE_BIND_POINT_GRAPHICS, input_reference_ptr ? 1u : 0u,
-		input_reference_ptr ? input_reference_ptr : nullptr, color_reference_ptr ? 1u : 0u,
+	const VkSubpassDescription subpass = {subpass_flags, VK_PIPELINE_BIND_POINT_GRAPHICS, num_subpass_inputs,
+		num_subpass_inputs ? input_reference.data() : nullptr, color_reference_ptr ? 1u : 0u,
 		color_reference_ptr ? color_reference_ptr : nullptr, nullptr, depth_reference_ptr, 0, nullptr};
 	const VkRenderPassCreateInfo pass_info = {VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, nullptr, 0u, num_attachments,
-		attachments.data(), 1u, &subpass, subpass_dependency_ptr ? 1u : 0u, subpass_dependency_ptr};
+		attachments.data(), 1u, &subpass, num_subpass_dependencies, num_subpass_dependencies ? subpass_dependency.data() : nullptr};

 	VkRenderPass pass;
 	const VkResult res = vkCreateRenderPass(m_device, &pass_info, nullptr, &pass);
@ -2679,6 +2720,8 @@ bool GSDeviceVK::CheckFeatures()

 	m_max_texture_size = m_device_properties.limits.maxImageDimension2D;

+	m_features.accurate_prims = GSConfig.HWAccuratePrims;
+
 	return true;
 }

@ -3363,6 +3406,135 @@ void GSDeviceVK::IASetIndexBuffer(const void* index, size_t count)
 	SetIndexBuffer(m_index_stream_buffer.GetBuffer());
 }

+void GSDeviceVK::SetupAccuratePrimsBuffer(GSHWDrawConfig& config) // Copies this draw's edge data into the accurate prims buffer and saves its offset.
+{
+	if (config.accurate_prims) // Nothing to do unless config.accurate_prims is set.
+	{
+		const u32 count = config.accurate_prims_edge_data->size();
+		const u32 size = count * sizeof(AccuratePrimsEdgeData); // Upload size in bytes.
+
+		// Reserve the GPU region, calling ExecuteCommandBufferAndRestartRenderPass() and retrying once if the first attempt fails.
+		if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData)))
+		{
+			ExecuteCommandBufferAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer");
+			if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData)))
+				pxFailRel("Failed to reserve space for accurate prims");
+		}
+
+		const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset(); // Destination offset of the reserved region; used for the copy below.
+		
+		if (InRenderPass()) // The buffer copy below is recorded outside of a render pass.
+			EndRenderPass();
+
+		// Copy data to an upload buffer.
+		VkBuffer upload_buffer;
+		u32 upload_buffer_offset;
+
+		const auto upload_data = [&](void* map_ptr) { // Writes the edge data into whichever upload buffer is chosen below.
+			std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size);
+		};
+
+		// If the data is larger than half of the texture streaming buffer size, use a separate staging buffer.
+		// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
+		if (size > m_texture_stream_buffer.GetCurrentSize() / 2)
+		{
+			upload_buffer_offset = 0;
+			upload_buffer = AllocateUploadStagingBuffer(size, upload_data);
+		}
+		else
+		{
+			upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset);
+		}
+		if (upload_buffer == VK_NULL_HANDLE)
+		{
+			Console.Error("Failed to get upload buffer for accurate prims data.");
+			return; // NOTE(review): the reserved region is never committed and the saved offset is not updated on this path - confirm this is intended.
+		}
+
+		// Copy data from upload to GPU buffer.
+		VkBufferCopy copyRegion = {upload_buffer_offset, offset, size}; // {srcOffset, dstOffset, size}
+		vkCmdCopyBuffer(GetCurrentCommandBuffer(), upload_buffer, m_accurate_prims_stream_buffer.GetBuffer(), 1, &copyRegion);
+
+		// Commit the GPU region.
+		m_accurate_prims_stream_buffer.CommitMemory(size);
+
+		// Issue the barrier since this will be used next draw (transfer write -> fragment shader read).
+		VkBufferMemoryBarrier barrier = {
+			VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, nullptr,
+			VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
+			VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED,
+			m_accurate_prims_stream_buffer.GetBuffer(), offset, size};
+		vkCmdPipelineBarrier(GetCurrentCommandBuffer(),
+			VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+			0, 0, nullptr, 1, &barrier, 0, nullptr);
+
+		m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer (read by SetupAccuratePrimsConstants).
+	}
+}
+
+void GSDeviceVK::SetupAccuratePrimsConstants(GSHWDrawConfig& config) // Fills the accurate prims base offsets in the VS/PS constants and sets both constant buffers.
+{
+	if (config.accurate_prims)
+	{
+		// Kept separate from SetupAccuratePrimsBuffer(): that function ends the render pass for its copy, whereas this one reads m_vertex.start.
+		config.cb_vs.base_vertex = m_vertex.start;
+		config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData); // Saved byte offset -> element index.
+
+		SetVSConstantBuffer(config.cb_vs);
+		SetPSConstantBuffer(config.cb_ps);
+	}
+}
+
+VkBuffer GSDeviceVK::WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out) // Stages size bytes in the texture stream buffer; returns VK_NULL_HANDLE on failure.
+{
+	if (!m_texture_stream_buffer.ReserveMemory(size, GetBufferCopyOffsetAlignment()))
+	{ // First reservation failed: execute the current command buffer and retry once.
+		ExecuteCommandBuffer(
+			false, "While waiting for %u bytes in texture upload buffer", size);
+		if (!m_texture_stream_buffer.ReserveMemory(size, GetBufferCopyOffsetAlignment()))
+		{
+			Console.Error("Failed to reserve texture upload memory (%u bytes).", size);
+			return VK_NULL_HANDLE; // offset_out is left untouched on this path.
+		}
+	}
+
+	offset_out = m_texture_stream_buffer.GetCurrentOffset(); // Offset of the staged data within the returned buffer.
+	write_data(m_texture_stream_buffer.GetCurrentHostPointer()); // The callback fills the reserved region.
+	m_texture_stream_buffer.CommitMemory(size);
+	return m_texture_stream_buffer.GetBuffer();
+}
+
+VkBuffer GSDeviceVK::AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data) // Creates a one-off mapped staging buffer filled by write_data; returns VK_NULL_HANDLE on failure.
+{
+	const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast<VkDeviceSize>(size),
+		VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr};
+
+	// Don't worry about setting the coherent bit for this upload, the main reason we had
+	// that set in StreamBuffer was for MoltenVK, which would upload the whole buffer on
+	// smaller uploads, but we're writing to the whole thing anyway.
+	VmaAllocationCreateInfo aci = {};
+	aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; // Persistently mapped, so ai.pMappedData can be written directly below.
+	aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+
+	VmaAllocationInfo ai;
+	VkBuffer buffer;
+	VmaAllocation allocation;
+	VkResult res = vmaCreateBuffer(GSDeviceVK::GetInstance()->GetAllocator(), &bci, &aci, &buffer, &allocation, &ai);
+	if (res != VK_SUCCESS)
+	{
+		LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: ");
+		return VK_NULL_HANDLE; // write_data is never invoked on this path.
+	}
+
+	// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
+	GSDeviceVK::GetInstance()->DeferBufferDestruction(buffer, allocation);
+
+	// And write the data.
+	write_data(ai.pMappedData);
+	vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), allocation, 0, size); // Explicit flush of the written range, since the coherent bit is not requested (see above).
+	return buffer;
+}
+
 void GSDeviceVK::OMSetRenderTargets(
 	GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop)
 {
@ -3379,12 +3551,15 @@ void GSDeviceVK::OMSetRenderTargets(
 		if (vkRt)
 		{
 			m_current_framebuffer =
-				vkRt->GetLinkedFramebuffer(vkDs, (feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) != 0);
+				vkRt->GetLinkedFramebuffer(vkDs,
+					(feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) != 0,
+					(feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth) != 0);
 		}
 		else
 		{
-			pxAssert(!(feedback_loop & FeedbackLoopFlag_ReadAndWriteRT));
-			m_current_framebuffer = vkDs->GetLinkedFramebuffer(nullptr, false);
+			pxAssert(!(feedback_loop & FeedbackLoopFlag_ReadAndWriteRT) &&
+					 !(feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth));
+			m_current_framebuffer = vkDs->GetLinkedFramebuffer(nullptr, false, false);
 		}
 	}
 	else if (InRenderPass())
@ -3494,7 +3669,21 @@ void GSDeviceVK::OMSetRenderTargets(
 		if (vkDs)
 		{
 			// need to update descriptors to reflect the new layout
-			if (feedback_loop & FeedbackLoopFlag_ReadDS)
+			if (feedback_loop & FeedbackLoopFlag_ReadAndWriteDepth)
+			{
+				// NVIDIA drivers appear to return random garbage when sampling the RT via a feedback loop, if the load op for
+				// the render pass is CLEAR. Using vkCmdClearAttachments() doesn't work, so we have to clear the image instead.
+				// Note: DS feedback loop was added later - we will assume that the same issue is relevant.
+				if (vkDs->GetState() == GSTexture::State::Cleared && IsDeviceNVIDIA())
+					vkDs->CommitClear();
+
+				if (vkDs->GetLayout() != GSTextureVK::Layout::FeedbackLoop)
+				{
+					m_dirty_flags |= (DIRTY_FLAG_TFX_TEXTURE_0 << TFX_TEXTURE_DEPTH);
+					vkDs->TransitionToLayout(GSTextureVK::Layout::FeedbackLoop);
+				}
+			}
+			else if (feedback_loop & FeedbackLoopFlag_ReadDepth)
 			{
 				if (vkDs->GetLayout() != GSTextureVK::Layout::FeedbackLoop)
 				{
@ -3675,6 +3864,16 @@ bool GSDeviceVK::CreateBuffers()
 		return false;
 	}

+	if (m_features.accurate_prims)
+	{
+		if (!m_accurate_prims_stream_buffer.Create(
+			VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, ACCURATE_PRIMS_BUFFER_SIZE, true))
+		{
+			Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer");
+			return false;
+		}
+	}
+
 	if (!m_vertex_uniform_stream_buffer.Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VERTEX_UNIFORM_BUFFER_SIZE))
 	{
 		Host::ReportErrorAsync("GS", "Failed to allocate vertex uniform buffer");
@ -3734,6 +3933,8 @@ bool GSDeviceVK::CreatePipelineLayouts()
 	dslb.AddBinding(1, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
 	if (m_features.vs_expand)
 		dslb.AddBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_VERTEX_BIT);
+	if (m_features.accurate_prims)
+		dslb.AddBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
 	if ((m_tfx_ubo_ds_layout = dslb.Create(dev)) == VK_NULL_HANDLE)
 		return false;
 	Vulkan::SetObjectName(dev, m_tfx_ubo_ds_layout, "TFX UBO descriptor layout");
@ -3746,6 +3947,10 @@ bool GSDeviceVK::CreatePipelineLayouts()
 		                                                           VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
 		1, VK_SHADER_STAGE_FRAGMENT_BIT);
 	dslb.AddBinding(TFX_TEXTURE_PRIMID, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
+	dslb.AddBinding(TFX_TEXTURE_DEPTH,
+		(m_features.texture_barrier && !UseFeedbackLoopLayout()) ? VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT :
+		                                                           VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+		1, VK_SHADER_STAGE_FRAGMENT_BIT);
 	if ((m_tfx_texture_ds_layout = dslb.Create(dev)) == VK_NULL_HANDLE)
 		return false;
 	Vulkan::SetObjectName(dev, m_tfx_texture_ds_layout, "TFX texture descriptor layout");
@ -4603,6 +4808,7 @@ void GSDeviceVK::DestroyResources()
 	m_fragment_uniform_stream_buffer.Destroy(false);
 	m_vertex_uniform_stream_buffer.Destroy(false);
 	m_index_stream_buffer.Destroy(false);
+	m_accurate_prims_stream_buffer.Destroy(false);
 	m_vertex_stream_buffer.Destroy(false);
 	if (m_expand_index_buffer != VK_NULL_HANDLE)
 		vmaDestroyBuffer(m_allocator, m_expand_index_buffer, m_expand_index_buffer_allocation);
@ -4670,6 +4876,7 @@ VkShaderModule GSDeviceVK::GetTFXVertexShader(GSHWDrawConfig::VSSelector sel)
 	AddMacro(ss, "VS_POINT_SIZE", sel.point_size);
 	AddMacro(ss, "VS_EXPAND", static_cast<int>(sel.expand));
 	AddMacro(ss, "VS_PROVOKING_VERTEX_LAST", static_cast<int>(m_features.provoking_vertex_last));
+	AddMacro(ss, "VS_ACCURATE_PRIMS", static_cast<int>(sel.accurate_prims));
 	ss << m_tfx_source;

 	VkShaderModule mod = g_vulkan_shader_cache->GetVertexShader(ss.str());
@ -4744,6 +4951,10 @@ VkShaderModule GSDeviceVK::GetTFXFragmentShader(const GSHWDrawConfig::PSSelector
 	AddMacro(ss, "PS_TEX_IS_FB", sel.tex_is_fb);
 	AddMacro(ss, "PS_NO_COLOR", sel.no_color);
 	AddMacro(ss, "PS_NO_COLOR1", sel.no_color1);
+	AddMacro(ss, "PS_ACCURATE_PRIMS", sel.accurate_prims);
+	AddMacro(ss, "PS_ACCURATE_PRIMS_AA", sel.accurate_prims_aa);
+	AddMacro(ss, "PS_ACCURATE_PRIMS_AA_ABE", sel.accurate_prims_aa_abe);
+	AddMacro(ss, "PS_ZTST", sel.ztst);
 	ss << m_tfx_source;

 	VkShaderModule mod = g_vulkan_shader_cache->GetFragmentShader(ss.str());
@ -4945,6 +5156,11 @@ bool GSDeviceVK::CreatePersistentDescriptorSets()
 		dsub.AddBufferDescriptorWrite(m_tfx_ubo_descriptor_set, 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
 			m_vertex_stream_buffer.GetBuffer(), 0, VERTEX_BUFFER_SIZE);
 	}
+	if (m_features.accurate_prims)
+	{
+		dsub.AddBufferDescriptorWrite(m_tfx_ubo_descriptor_set, 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+			m_accurate_prims_stream_buffer.GetBuffer(), 0, ACCURATE_PRIMS_BUFFER_SIZE);
+	}
 	dsub.Update(dev);
 	Vulkan::SetObjectName(dev, m_tfx_ubo_descriptor_set, "Persistent TFX UBO set");
 	return true;
@ -5341,11 +5557,15 @@ bool GSDeviceVK::ApplyTFXState(bool already_execed)
 		m_current_pipeline_layout = PipelineLayout::TFX;
 		flags |= DIRTY_FLAG_TFX_UBO | DIRTY_FLAG_TFX_TEXTURES;

-		// Clear out the RT binding if feedback loop isn't on, because it'll be in the wrong state and make
+		// Clear out the RT/DS binding if feedback loop isn't on, because it'll be in the wrong state and make
 		// the validation layer cranky. Not a big deal since we need to write it anyway.
-		const GSTextureVK::Layout rt_tex_layout = m_tfx_textures[TFX_TEXTURE_RT]->GetLayout();
-		if (rt_tex_layout != GSTextureVK::Layout::FeedbackLoop && rt_tex_layout != GSTextureVK::Layout::ShaderReadOnly)
-			m_tfx_textures[TFX_TEXTURE_RT] = m_null_texture.get();
+		std::array<TFX_TEXTURES, 2> texture_types = { TFX_TEXTURE_RT, TFX_TEXTURE_DEPTH };
+		for (u32 texture_type : texture_types)
+		{
+			const GSTextureVK::Layout tex_layout = m_tfx_textures[texture_type]->GetLayout();
+			if (tex_layout != GSTextureVK::Layout::FeedbackLoop && tex_layout != GSTextureVK::Layout::ShaderReadOnly)
+				m_tfx_textures[texture_type] = m_null_texture.get();
+		}
 	}

 	if (flags & DIRTY_FLAG_TFX_UBO)
@ -5386,6 +5606,19 @@ bool GSDeviceVK::ApplyTFXState(bool already_execed)
 			dsub.AddImageDescriptorWrite(VK_NULL_HANDLE, TFX_TEXTURE_PRIMID,
 				m_tfx_textures[TFX_TEXTURE_PRIMID]->GetView(), m_tfx_textures[TFX_TEXTURE_PRIMID]->GetVkLayout());
 		}
+		if (flags & DIRTY_FLAG_TFX_TEXTURE_DEPTH)
+		{
+			if (m_features.texture_barrier && !UseFeedbackLoopLayout())
+			{
+				dsub.AddInputAttachmentDescriptorWrite(
+					VK_NULL_HANDLE, TFX_TEXTURE_DEPTH, m_tfx_textures[TFX_TEXTURE_DEPTH]->GetView(), VK_IMAGE_LAYOUT_GENERAL);
+			}
+			else
+			{
+				dsub.AddImageDescriptorWrite(VK_NULL_HANDLE, TFX_TEXTURE_DEPTH, m_tfx_textures[TFX_TEXTURE_DEPTH]->GetView(),
+					m_tfx_textures[TFX_TEXTURE_DEPTH]->GetVkLayout());
+			}
+		}

 		dsub.PushUpdate(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, m_tfx_pipeline_layout, TFX_DESCRIPTOR_SET_TEXTURES);
 	}
@ -5545,13 +5778,15 @@ GSTextureVK* GSDeviceVK::SetupPrimitiveTrackingDATE(GSHWDrawConfig& config)

 void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
 {
-
 	const GSVector2i rtsize(config.rt ? config.rt->GetSize() : config.ds->GetSize());
 	GSTextureVK* draw_rt = static_cast<GSTextureVK*>(config.rt);
 	GSTextureVK* draw_ds = static_cast<GSTextureVK*>(config.ds);
 	GSTextureVK* draw_rt_clone = nullptr;
 	GSTextureVK* colclip_rt = static_cast<GSTextureVK*>(g_gs_device->GetColorClipTexture());

+	// Copying buffers needs to be done outside of a render pass, so do this early.
+	SetupAccuratePrimsBuffer(config);
+
 	// stream buffer in first, in case we need to exec
 	SetVSConstantBuffer(config.cb_vs);
 	SetPSConstantBuffer(config.cb_ps);
@ -5597,8 +5832,12 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
 	UpdateHWPipelineSelector(config, pipe);

 	// If we don't have a barrier but the texture was drawn to last draw, end the pass to insert a barrier.
-	if (InRenderPass() && !pipe.IsRTFeedbackLoop() && (config.tex == m_current_render_target || config.tex == m_current_depth_target))
+	if (InRenderPass())
+	{
+		if ((!pipe.IsRTFeedbackLoop() && config.tex == m_current_render_target) ||
+			(!pipe.IsDepthFeedbackLoop() && config.tex == m_current_depth_target))
 			EndRenderPass();
+	}

 	// now blit the colclip texture back to the original target
 	if (colclip_rt)
@ -5781,20 +6020,31 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
 	// Despite the layout changing enforcing the execution dependency between previous draws and the first
 	// input attachment read, it still wants the region/fragment-local barrier...

-	const bool skip_first_barrier =
-		(draw_rt && draw_rt->GetLayout() != GSTextureVK::Layout::FeedbackLoop && !pipe.ps.colclip_hw && !IsDeviceAMD());
+	bool skip_first_barrier = !pipe.ps.colclip_hw && !IsDeviceAMD();
+	if (draw_rt)
+		skip_first_barrier = skip_first_barrier && draw_rt->GetLayout() != GSTextureVK::Layout::FeedbackLoop;
+	if (draw_ds)
+		skip_first_barrier = skip_first_barrier && draw_ds->GetLayout() != GSTextureVK::Layout::FeedbackLoop;

 	OMSetRenderTargets(draw_rt, draw_ds, config.scissor, static_cast<FeedbackLoopFlag>(pipe.feedback_loop_flags));
 	if (pipe.IsRTFeedbackLoop())
 	{
 		pxAssertMsg(m_features.texture_barrier, "Texture barriers enabled");
-		PSSetShaderResource(2, draw_rt, false);
+		PSSetShaderResource(TFX_TEXTURE_RT, draw_rt, false);

 		// If this is the first draw to the target as a feedback loop, make sure we re-generate the texture descriptor.
 		// Otherwise, we might have a previous descriptor left over, that has the RT in a different state.
 		m_dirty_flags |= (skip_first_barrier ? static_cast<u32>(DIRTY_FLAG_TFX_TEXTURE_RT) : 0);
 	}
+	if (pipe.IsDepthFeedbackLoop())
+	{
+		pxAssertMsg(m_features.texture_barrier, "Texture barriers enabled");
+		PSSetShaderResource(TFX_TEXTURE_DEPTH, draw_ds, false);

+		// If this is the first draw to the target as a feedback loop, make sure we re-generate the texture descriptor.
+		// Otherwise, we might have a previous descriptor left over, that has the DS in a different state.
+		m_dirty_flags |= (skip_first_barrier ? static_cast<u32>(DIRTY_FLAG_TFX_TEXTURE_DEPTH) : 0);
+	}
 	// Begin render pass if new target or out of the area.
 	if (!InRenderPass())
 	{
@ -5868,7 +6118,8 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config)

 	// now we can do the actual draw
 	if (BindDrawPipeline(pipe))
-		SendHWDraw(config, draw_rt, config.require_one_barrier, config.require_full_barrier, skip_first_barrier);
+		SendHWDraw(config, draw_rt, pipe.IsDepthFeedbackLoop() ? draw_ds : nullptr,
+			config.require_one_barrier, config.require_full_barrier, skip_first_barrier);

 	// blend second pass
 	if (config.blend_multi_pass.enable)
@ -5903,8 +6154,8 @@ void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
 		pipe.bs = config.blend;
 		if (BindDrawPipeline(pipe))
 		{
-			SendHWDraw(config, draw_rt, config.alpha_second_pass.require_one_barrier,
-				config.alpha_second_pass.require_full_barrier, false);
+			SendHWDraw(config, draw_rt, pipe.IsDepthFeedbackLoop() ? draw_ds : nullptr,
+				config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier, false);
 		}
 	}

@ -5981,19 +6232,24 @@ void GSDeviceVK::UpdateHWPipelineSelector(GSHWDrawConfig& config, PipelineSelect
 	pipe.rt = config.rt != nullptr;
 	pipe.ds = config.ds != nullptr;
 	pipe.line_width = config.line_expand;
-	pipe.feedback_loop_flags =
-		(m_features.texture_barrier &&
-			(config.ps.IsFeedbackLoop() || config.require_one_barrier || config.require_full_barrier)) ?
-			FeedbackLoopFlag_ReadAndWriteRT :
-			FeedbackLoopFlag_None;
-	pipe.feedback_loop_flags |=
-		(config.tex && config.tex == config.ds) ? FeedbackLoopFlag_ReadDS : FeedbackLoopFlag_None;
+	pipe.feedback_loop_flags = FeedbackLoopFlag_None;
+	if (m_features.texture_barrier && (config.ps.IsFeedbackLoop() || config.require_one_barrier || config.require_full_barrier))
+	{
+		pipe.feedback_loop_flags |= FeedbackLoopFlag_ReadAndWriteRT;
+
+		// We only allow DS feedback loop if RT is already in a feedback loop.
+		pipe.feedback_loop_flags |= (pipe.ds && config.ps.IsFeedbackLoopDepth()) ? FeedbackLoopFlag_ReadAndWriteDepth : FeedbackLoopFlag_None;
+	}
+	if (!(pipe.feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteDepth))
+	{
+		pipe.feedback_loop_flags |= (config.tex && config.tex == config.ds) ? FeedbackLoopFlag_ReadDepth : FeedbackLoopFlag_None;
+	}

 	// enable point size in the vertex shader if we're rendering points regardless of upscaling.
 	pipe.vs.point_size |= (config.topology == GSHWDrawConfig::Topology::Point);
 }

-void GSDeviceVK::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config)
+void GSDeviceVK::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config)
 {
 	IASetVertexBuffer(config.verts, sizeof(GSVertex), config.nverts, GetVertexAlignment(config.vs.expand));
 	m_vertex.start *= GetExpansionFactor(config.vs.expand);
@ -6008,6 +6264,9 @@ void GSDeviceVK::UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config)
 	{
 		IASetIndexBuffer(config.indices, config.nindices);
 	}
+
+	// Needs to be done after vertex offset is set.
+	SetupAccuratePrimsConstants(config);
 }

 VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const
@ -6021,13 +6280,31 @@ VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const
 		VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, rt->GetImage(), {VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u}};
 }

+// Describes the image memory barrier for the depth/stencil target `ds`.
+// Source access is depth/stencil attachment read+write; destination access is a shader read when the
+// feedback-loop layout is in use, or an input-attachment read otherwise. Old and new layout are the
+// same, so no layout transition is requested. Both the depth and stencil aspects are covered, for
+// mip level 0 / array layer 0 only.
+VkImageMemoryBarrier GSDeviceVK::GetDepthStencilBufferBarrier(GSTextureVK* ds) const
+{
+	const bool feedback_layout = UseFeedbackLoopLayout();
+
+	VkImageMemoryBarrier result = {};
+	result.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+	result.pNext = nullptr;
+	result.srcAccessMask =
+		VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+	result.dstAccessMask = feedback_layout ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
+	result.oldLayout =
+		feedback_layout ? VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT : VK_IMAGE_LAYOUT_GENERAL;
+	result.newLayout = result.oldLayout;
+	result.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+	result.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+	result.image = ds->GetImage();
+	result.subresourceRange = {VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0u, 1u, 0u, 1u};
+	return result;
+}
+
 VkDependencyFlags GSDeviceVK::GetColorBufferBarrierFlags() const
 {
 	return UseFeedbackLoopLayout() ? (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) :
 	                                 VK_DEPENDENCY_BY_REGION_BIT;
 }

-void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,
+// Dependency flags passed alongside the depth/stencil barrier: always by-region, plus the
+// feedback-loop dependency bit when the feedback-loop layout is in use.
+VkDependencyFlags GSDeviceVK::GetDepthStencilBufferBarrierFlags() const
+{
+	VkDependencyFlags flags = VK_DEPENDENCY_BY_REGION_BIT;
+	if (UseFeedbackLoopLayout())
+		flags |= VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT;
+	return flags;
+}
+
+void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, GSTextureVK* draw_ds,
 	bool one_barrier, bool full_barrier, bool skip_first_barrier)
 {
 	if (!m_features.texture_barrier) [[unlikely]]
@ -6037,21 +6314,48 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,
 	}

 #ifdef PCSX2_DEVBUILD
-	if ((one_barrier || full_barrier) && !m_pipeline_selector.ps.IsFeedbackLoop()) [[unlikely]]
+	if ((one_barrier || full_barrier) && !(m_pipeline_selector.ps.IsFeedbackLoop() || m_pipeline_selector.ps.IsFeedbackLoopDepth())) [[unlikely]]
 		Console.Warning("VK: Possible unnecessary barrier detected.");
 #endif
-	const VkDependencyFlags barrier_flags = GetColorBufferBarrierFlags();
+	std::array<VkDependencyFlags, 2> barrier_flags = {
+		GetColorBufferBarrierFlags(),
+		GetDepthStencilBufferBarrierFlags(),
+	};
+	std::array<VkImageMemoryBarrier, 2> barrier;
+	u32 barriers_per_draw = 0;
+	if (full_barrier || one_barrier)
+	{
+		if (draw_rt)
+			barrier[barriers_per_draw++] = GetColorBufferBarrier(draw_rt);
+		if (draw_ds)
+			barrier[barriers_per_draw++] = GetDepthStencilBufferBarrier(draw_ds);
+	}
+
+	// Records the barriers needed before a draw: color target first, then depth target.
+	// `barrier` is filled in packed order (a slot is only consumed when the matching target is
+	// non-null), so it is walked here with a running slot index instead of fixed indices; with fixed
+	// indices a call that has draw_ds but no draw_rt would read the never-written barrier[1].
+	// `barrier_flags` is not packed: [0] is always the color flags and [1] the depth/stencil flags.
+	const auto IssueBarriers = [&]() {
+		const VkCommandBuffer cmdbuf = GetCurrentCommandBuffer();
+		u32 slot = 0;
+		if (draw_rt)
+		{
+			vkCmdPipelineBarrier(cmdbuf,
+				VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+				VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags[0], 0, nullptr, 0, nullptr, 1, &barrier[slot++]);
+		}
+		if (draw_ds)
+		{
+			vkCmdPipelineBarrier(cmdbuf,
+				VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
+				VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags[1], 0, nullptr, 0, nullptr, 1, &barrier[slot]);
+		}
+	};
+
 	if (full_barrier)
 	{
 		pxAssert(config.drawlist && !config.drawlist->empty());

-		const VkImageMemoryBarrier barrier = GetColorBufferBarrier(draw_rt);
 		const u32 indices_per_prim = config.indices_per_prim;
 		const u32 draw_list_size = static_cast<u32>(config.drawlist->size());

 		GL_PUSH("Split the draw");
-		g_perfmon.Put(
-			GSPerfMon::Barriers, static_cast<u32>(draw_list_size) - static_cast<u32>(skip_first_barrier));
+		g_perfmon.Put(GSPerfMon::Barriers,
+			barriers_per_draw * (static_cast<u32>(draw_list_size) - static_cast<u32>(skip_first_barrier)));

 		u32 p = 0;
 		u32 n = 0;
@ -6066,8 +6370,7 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,

 		for (; n < draw_list_size; n++)
 		{
-			vkCmdPipelineBarrier(GetCurrentCommandBuffer(), VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
-				VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags, 0, nullptr, 0, nullptr, 1, &barrier);
+			IssueBarriers();

 			const u32 count = (*config.drawlist)[n] * indices_per_prim;
 			DrawIndexedPrimitive(p, count);
@ -6079,11 +6382,8 @@ void GSDeviceVK::SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,

 	if (one_barrier && !skip_first_barrier)
 	{
-		g_perfmon.Put(GSPerfMon::Barriers, 1);
-
-		const VkImageMemoryBarrier barrier = GetColorBufferBarrier(draw_rt);
-		vkCmdPipelineBarrier(GetCurrentCommandBuffer(), VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
-			VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, barrier_flags, 0, nullptr, 0, nullptr, 1, &barrier);
+		g_perfmon.Put(GSPerfMon::Barriers, barriers_per_draw);
+		IssueBarriers();
 	}

 	DrawIndexedPrimitive();
--- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h
+++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h
@ -98,6 +98,8 @@ public:
 	__fi VkCommandBuffer GetCurrentCommandBuffer() const { return m_current_command_buffer; }
 	__fi VKStreamBuffer& GetTextureUploadBuffer() { return m_texture_stream_buffer; }
 	VkCommandBuffer GetCurrentInitCommandBuffer();
+	VkBuffer AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data);
+	VkBuffer WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out);

 	/// Allocates a descriptor set from the pool reserved for the current frame.
 	VkDescriptorSet AllocatePersistentDescriptorSet(VkDescriptorSetLayout set_layout);
@ -293,7 +295,8 @@ public:
 	{
 		FeedbackLoopFlag_None = 0,
 		FeedbackLoopFlag_ReadAndWriteRT = 1,
-		FeedbackLoopFlag_ReadDS = 2,
+		FeedbackLoopFlag_ReadDepth = 2,
+		FeedbackLoopFlag_ReadAndWriteDepth = 4,
 	};

 	struct alignas(8) PipelineSelector
@ -308,7 +311,7 @@ public:
 				u32 rt : 1;
 				u32 ds : 1;
 				u32 line_width : 1;
-				u32 feedback_loop_flags : 2;
+				u32 feedback_loop_flags : 3;
 			};

 			u32 key;
@ -326,7 +329,8 @@ public:
 		__fi PipelineSelector() { std::memset(this, 0, sizeof(*this)); }

 		__fi bool IsRTFeedbackLoop() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteRT) != 0); }
-		__fi bool IsTestingAndSamplingDepth() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadDS) != 0); }
+		__fi bool IsDepthFeedbackLoop() const { return ((feedback_loop_flags & FeedbackLoopFlag_ReadAndWriteDepth) != 0); }
+		__fi bool IsTestingAndSamplingDepth() const { return ((feedback_loop_flags & (FeedbackLoopFlag_ReadDepth | FeedbackLoopFlag_ReadAndWriteDepth)) != 0); }
 	};
 	static_assert(sizeof(PipelineSelector) == 24, "Pipeline selector is 24 bytes");

@ -357,10 +361,11 @@ public:
 	};
 	enum TFX_TEXTURES : u32
 	{
-		TFX_TEXTURE_TEXTURE,
+		TFX_TEXTURE_TEXTURE = 0,
 		TFX_TEXTURE_PALETTE,
 		TFX_TEXTURE_RT,
 		TFX_TEXTURE_PRIMID,
+		TFX_TEXTURE_DEPTH,

 		NUM_TFX_TEXTURES
 	};
@ -377,6 +382,8 @@ private:

 	VKStreamBuffer m_vertex_stream_buffer;
 	VKStreamBuffer m_index_stream_buffer;
+	VKStreamBuffer m_accurate_prims_stream_buffer;
+	u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw.
 	VKStreamBuffer m_vertex_uniform_stream_buffer;
 	VKStreamBuffer m_fragment_uniform_stream_buffer;
 	VKStreamBuffer m_texture_stream_buffer;
@ -559,6 +566,9 @@ public:
 	void PSSetShaderResource(int i, GSTexture* sr, bool check_state);
 	void PSSetSampler(GSHWDrawConfig::SamplerSelector sel);

+	void SetupAccuratePrimsBuffer(GSHWDrawConfig& config);
+	void SetupAccuratePrimsConstants(GSHWDrawConfig& config);
+
 	void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor,
 		FeedbackLoopFlag feedback_loop = FeedbackLoopFlag_None);

@ -568,10 +578,12 @@ public:

 	void RenderHW(GSHWDrawConfig& config) override;
 	void UpdateHWPipelineSelector(GSHWDrawConfig& config, PipelineSelector& pipe);
-	void UploadHWDrawVerticesAndIndices(const GSHWDrawConfig& config);
+	void UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config);
 	VkImageMemoryBarrier GetColorBufferBarrier(GSTextureVK* rt) const;
 	VkDependencyFlags GetColorBufferBarrierFlags() const;
-	void SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt,
+	VkImageMemoryBarrier GetDepthStencilBufferBarrier(GSTextureVK* ds) const;
+	VkDependencyFlags GetDepthStencilBufferBarrierFlags() const;
+	void SendHWDraw(const GSHWDrawConfig& config, GSTextureVK* draw_rt, GSTextureVK* draw_ds,
 		bool one_barrier, bool full_barrier, bool skip_first_barrier);

 	//////////////////////////////////////////////////////////////////////////
@ -621,25 +633,27 @@ public:
 private:
 	enum DIRTY_FLAG : u32
 	{
-		DIRTY_FLAG_TFX_TEXTURE_0 = (1 << 0), // 0, 1, 2, 3
-		DIRTY_FLAG_TFX_UBO = (1 << 4),
-		DIRTY_FLAG_UTILITY_TEXTURE = (1 << 5),
-		DIRTY_FLAG_BLEND_CONSTANTS = (1 << 6),
-		DIRTY_FLAG_LINE_WIDTH = (1 << 7),
-		DIRTY_FLAG_INDEX_BUFFER = (1 << 8),
-		DIRTY_FLAG_VIEWPORT = (1 << 9),
-		DIRTY_FLAG_SCISSOR = (1 << 10),
-		DIRTY_FLAG_PIPELINE = (1 << 11),
-		DIRTY_FLAG_VS_CONSTANT_BUFFER = (1 << 12),
-		DIRTY_FLAG_PS_CONSTANT_BUFFER = (1 << 13),
+		DIRTY_FLAG_TFX_TEXTURE_0 = (1 << 0), // 0, 1, 2, 3, 4
+		DIRTY_FLAG_TFX_UBO = (1 << 5),
+		DIRTY_FLAG_UTILITY_TEXTURE = (1 << 6),
+		DIRTY_FLAG_BLEND_CONSTANTS = (1 << 7),
+		DIRTY_FLAG_LINE_WIDTH = (1 << 8),
+		DIRTY_FLAG_INDEX_BUFFER = (1 << 9),
+		DIRTY_FLAG_VIEWPORT = (1 << 10),
+		DIRTY_FLAG_SCISSOR = (1 << 11),
+		DIRTY_FLAG_PIPELINE = (1 << 12),
+		DIRTY_FLAG_VS_CONSTANT_BUFFER = (1 << 13),
+		DIRTY_FLAG_PS_CONSTANT_BUFFER = (1 << 14),

 		DIRTY_FLAG_TFX_TEXTURE_TEX = (DIRTY_FLAG_TFX_TEXTURE_0 << 0),
 		DIRTY_FLAG_TFX_TEXTURE_PALETTE = (DIRTY_FLAG_TFX_TEXTURE_0 << 1),
 		DIRTY_FLAG_TFX_TEXTURE_RT = (DIRTY_FLAG_TFX_TEXTURE_0 << 2),
 		DIRTY_FLAG_TFX_TEXTURE_PRIMID = (DIRTY_FLAG_TFX_TEXTURE_0 << 3),
+		DIRTY_FLAG_TFX_TEXTURE_DEPTH = (DIRTY_FLAG_TFX_TEXTURE_0 << 4),

 		DIRTY_FLAG_TFX_TEXTURES = DIRTY_FLAG_TFX_TEXTURE_TEX | DIRTY_FLAG_TFX_TEXTURE_PALETTE |
-		                          DIRTY_FLAG_TFX_TEXTURE_RT | DIRTY_FLAG_TFX_TEXTURE_PRIMID,
+		                          DIRTY_FLAG_TFX_TEXTURE_RT | DIRTY_FLAG_TFX_TEXTURE_PRIMID |
+		                          DIRTY_FLAG_TFX_TEXTURE_DEPTH,

 		DIRTY_BASE_STATE = DIRTY_FLAG_INDEX_BUFFER | DIRTY_FLAG_PIPELINE | DIRTY_FLAG_VIEWPORT | DIRTY_FLAG_SCISSOR |
 		                   DIRTY_FLAG_BLEND_CONSTANTS | DIRTY_FLAG_LINE_WIDTH,
--- a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp
+++ b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp
@ -114,7 +114,7 @@ std::unique_ptr<GSTextureVK> GSTextureVK::Create(Type type, Format format, int w
 				VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT |
 				VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT |
 				(GSDeviceVK::GetInstance()->UseFeedbackLoopLayout() ? VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT
-				                                                    : 0);
+				                                                    : VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT);
 			vci.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
 		}
 		break;
@ -198,7 +198,7 @@ void GSTextureVK::Destroy(bool defer)

 	if (m_type == Type::RenderTarget || m_type == Type::DepthStencil)
 	{
-		for (const auto& [other_tex, fb, feedback] : m_framebuffers)
+		for (const auto& [other_tex, fb, feedback_color, feedback_depth] : m_framebuffers)
 		{
 			if (other_tex)
 			{
@ -270,38 +270,6 @@ void GSTextureVK::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch
 	StringUtil::StrideMemCpy(dst, upload_pitch, src, pitch, std::min(upload_pitch, pitch), count);
 }

-VkBuffer GSTextureVK::AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const
-{
-	const u32 size = upload_pitch * height;
-	const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast<VkDeviceSize>(size),
-		VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr};
-
-	// Don't worry about setting the coherent bit for this upload, the main reason we had
-	// that set in StreamBuffer was for MoltenVK, which would upload the whole buffer on
-	// smaller uploads, but we're writing to the whole thing anyway.
-	VmaAllocationCreateInfo aci = {};
-	aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
-	aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
-
-	VmaAllocationInfo ai;
-	VkBuffer buffer;
-	VmaAllocation allocation;
-	VkResult res = vmaCreateBuffer(GSDeviceVK::GetInstance()->GetAllocator(), &bci, &aci, &buffer, &allocation, &ai);
-	if (res != VK_SUCCESS)
-	{
-		LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: ");
-		return VK_NULL_HANDLE;
-	}
-
-	// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
-	GSDeviceVK::GetInstance()->DeferBufferDestruction(buffer, allocation);
-
-	// And write the data.
-	CopyTextureDataForUpload(ai.pMappedData, data, pitch, upload_pitch, height);
-	vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), allocation, 0, size);
-	return buffer;
-}
-
 void GSTextureVK::UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height,
 	u32 buffer_height, u32 row_length, VkBuffer buffer, u32 buffer_offset)
 {
@ -333,6 +301,10 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l
 	const u32 upload_pitch = Common::AlignUpPow2(pitch, GSDeviceVK::GetInstance()->GetBufferCopyRowPitchAlignment());
 	const u32 required_size = CalcUploadSize(height, upload_pitch);

+	const auto upload_data = [&](void* map_ptr) {
+		CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);
+	};
+
 	// If the texture is larger than half our streaming buffer size, use a separate buffer.
 	// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
 	VkBuffer buffer;
@ -340,29 +312,14 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l
 	if (required_size > (GSDeviceVK::GetInstance()->GetTextureUploadBuffer().GetCurrentSize() / 2))
 	{
 		buffer_offset = 0;
-		buffer = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height);
-		if (buffer == VK_NULL_HANDLE)
-			return false;
+		buffer = GSDeviceVK::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data);
 	}
 	else
 	{
-		VKStreamBuffer& sbuffer = GSDeviceVK::GetInstance()->GetTextureUploadBuffer();
-		if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment()))
-		{
-			GSDeviceVK::GetInstance()->ExecuteCommandBuffer(
-				false, "While waiting for %u bytes in texture upload buffer", required_size);
-			if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment()))
-			{
-				Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size);
+		buffer = GSDeviceVK::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, buffer_offset);
+	}
+	if (buffer == VK_NULL_HANDLE)
 		return false;
-			}
-		}
-
-		buffer = sbuffer.GetBuffer();
-		buffer_offset = sbuffer.GetCurrentOffset();
-		CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height);
-		sbuffer.CommitMemory(required_size);
-	}

 	const VkCommandBuffer cmdbuf = GetCommandBufferForUpdate();
 	GL_PUSH("GSTextureVK::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer);
@ -738,16 +695,16 @@ void GSTextureVK::TransitionSubresourcesToLayout(

 VkFramebuffer GSTextureVK::GetFramebuffer(bool feedback_loop)
 {
-	return GetLinkedFramebuffer(nullptr, feedback_loop);
+	return GetLinkedFramebuffer(nullptr, feedback_loop, false); // no linked depth texture; feedback_loop is forwarded as the color flag and the depth flag is always false
 }

-VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop)
+VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop_color, bool feedback_loop_depth)
 {
 	pxAssertRel(m_type != Type::Texture, "Texture is a render target");

-	for (const auto& [other_tex, fb, other_feedback_loop] : m_framebuffers)
+	for (const auto& [other_tex, fb, other_feedback_loop_color, other_feedback_loop_depth] : m_framebuffers)
 	{
-		if (other_tex == depth_texture && other_feedback_loop == feedback_loop)
+		if (other_tex == depth_texture && other_feedback_loop_color == feedback_loop_color && other_feedback_loop_depth == feedback_loop_depth)
 			return fb;
 	}

@ -756,7 +713,7 @@ VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool
 		(m_type != GSTexture::Type::DepthStencil) ? (depth_texture ? depth_texture->m_vk_format : VK_FORMAT_UNDEFINED) :
 													m_vk_format,
 		VK_ATTACHMENT_LOAD_OP_LOAD, VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_LOAD,
-		VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_STORE_OP_DONT_CARE, feedback_loop);
+		VK_ATTACHMENT_STORE_OP_STORE, VK_ATTACHMENT_LOAD_OP_DONT_CARE, VK_ATTACHMENT_STORE_OP_DONT_CARE, feedback_loop_color, feedback_loop_depth);
 	if (!rp)
 		return VK_NULL_HANDLE;

@ -771,9 +728,9 @@ VkFramebuffer GSTextureVK::GetLinkedFramebuffer(GSTextureVK* depth_texture, bool
 	if (!fb)
 		return VK_NULL_HANDLE;

-	m_framebuffers.emplace_back(depth_texture, fb, feedback_loop);
+	m_framebuffers.emplace_back(depth_texture, fb, feedback_loop_color, feedback_loop_depth);
 	if (depth_texture)
-		depth_texture->m_framebuffers.emplace_back(this, fb, feedback_loop);
+		depth_texture->m_framebuffers.emplace_back(this, fb, feedback_loop_color, feedback_loop_depth);
 	return fb;
 }

--- a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h
+++ b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h
@ -73,7 +73,7 @@ public:
 	/// Framebuffers are lazily allocated.
 	VkFramebuffer GetFramebuffer(bool feedback_loop);

-	VkFramebuffer GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop);
+	VkFramebuffer GetLinkedFramebuffer(GSTextureVK* depth_texture, bool feedback_loop_color, bool feedback_loop_depth);

 	// Call when the texture is bound to the pipeline, or read from in a copy.
 	__fi void SetUseFenceCounter(u64 counter) { m_use_fence_counter = counter; }
@ -84,7 +84,6 @@ private:

 	VkCommandBuffer GetCommandBufferForUpdate();
 	void CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const;
-	VkBuffer AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const;
 	void UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height, u32 buffer_height,
 		u32 row_length, VkBuffer buffer, u32 buffer_offset);

@ -103,7 +102,7 @@ private:

 	// linked framebuffer is combined with depth texture
 	// list of color textures this depth texture is linked to or vice versa
-	std::vector<std::tuple<GSTextureVK*, VkFramebuffer, bool>> m_framebuffers;
+	std::vector<std::tuple<GSTextureVK*, VkFramebuffer, bool, bool>> m_framebuffers;
 };

 class GSDownloadTextureVK final : public GSDownloadTexture
--- a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp
+++ b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp
@ -19,6 +19,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move)
 	, m_allocation(move.m_allocation)
 	, m_buffer(move.m_buffer)
 	, m_host_pointer(move.m_host_pointer)
+	, m_device_local(move.m_device_local)
 	, m_tracked_fences(std::move(move.m_tracked_fences))
 {
 	move.m_size = 0;
@ -28,6 +29,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move)
 	move.m_allocation = VK_NULL_HANDLE;
 	move.m_buffer = VK_NULL_HANDLE;
 	move.m_host_pointer = nullptr;
+	move.m_device_local = false;
 }

 VKStreamBuffer::~VKStreamBuffer()
@ -48,19 +50,29 @@ VKStreamBuffer& VKStreamBuffer::operator=(VKStreamBuffer&& move)
 	std::swap(m_buffer, move.m_buffer);
 	std::swap(m_host_pointer, move.m_host_pointer);
 	std::swap(m_tracked_fences, move.m_tracked_fences);
+	std::swap(m_device_local, move.m_device_local);

 	return *this;
 }

-bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size)
+bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size, bool device_local)
 {
 	const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast<VkDeviceSize>(size),
 		usage, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr};

 	VmaAllocationCreateInfo aci = {};
+	if (device_local)
+	{
+		// GPU default buffer
+		aci.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+	}
+	else
+	{
+		// CPU upload buffer
 		aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
 		aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
 		aci.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+	}

 	VmaAllocationInfo ai = {};
 	VkBuffer new_buffer = VK_NULL_HANDLE;
@ -83,7 +95,8 @@ bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size)
 	m_tracked_fences.clear();
 	m_allocation = new_allocation;
 	m_buffer = new_buffer;
-	m_host_pointer = static_cast<u8*>(ai.pMappedData);
+	m_host_pointer = device_local ? nullptr : static_cast<u8*>(ai.pMappedData);
+	m_device_local = device_local;
 	return true;
 }

@ -104,6 +117,7 @@ void VKStreamBuffer::Destroy(bool defer)
 	m_buffer = VK_NULL_HANDLE;
 	m_allocation = VK_NULL_HANDLE;
 	m_host_pointer = nullptr;
+	m_device_local = false;
 }

 bool VKStreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment)
@ -180,8 +194,11 @@ void VKStreamBuffer::CommitMemory(u32 final_num_bytes)
 	pxAssert((m_current_offset + final_num_bytes) <= m_size);
 	pxAssert(final_num_bytes <= m_current_space);

+	if (!m_device_local)
+	{
 		// For non-coherent mappings, flush the memory range
 		vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), m_allocation, m_current_offset, final_num_bytes);
+	}

 	m_current_offset += final_num_bytes;
 	m_current_space -= final_num_bytes;
--- a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h
+++ b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h
@ -30,14 +30,13 @@ public:
 	__fi u32 GetCurrentSpace() const { return m_current_space; }
 	__fi u32 GetCurrentOffset() const { return m_current_offset; }

-	bool Create(VkBufferUsageFlags usage, u32 size);
+	bool Create(VkBufferUsageFlags usage, u32 size, bool device_local = false);
 	void Destroy(bool defer);

 	bool ReserveMemory(u32 num_bytes, u32 alignment);
 	void CommitMemory(u32 final_num_bytes);

 private:
-	bool AllocateBuffer(VkBufferUsageFlags usage, u32 size);
 	void UpdateCurrentFencePosition();
 	void UpdateGPUPosition();

@ -51,7 +50,8 @@ private:

 	VmaAllocation m_allocation = VK_NULL_HANDLE;
 	VkBuffer m_buffer = VK_NULL_HANDLE;
-	u8* m_host_pointer = nullptr;
+	u8* m_host_pointer = nullptr; // Only used for upload buffers.
+	bool m_device_local = false; // False for upload buffer; true for default buffer.

 	// List of fences and the corresponding positions in the buffer
 	std::deque<std::pair<u64, u32>> m_tracked_fences;
--- a/pcsx2/Pcsx2Config.cpp
+++ b/pcsx2/Pcsx2Config.cpp
@ -751,6 +751,7 @@ Pcsx2Config::GSOptions::GSOptions()
 	PreloadFrameWithGSData = false;
 	Mipmap = true;
 	HWMipmap = true;
+	HWAccuratePrims = false;

 	ManualUserHacks = false;
 	UserHacks_AlignSpriteX = false;
@ -1021,6 +1022,7 @@ void Pcsx2Config::GSOptions::LoadSave(SettingsWrapper& wrap)
 	SettingsWrapEntryEx(UpscaleMultiplier, "upscale_multiplier");

 	SettingsWrapBitBoolEx(HWMipmap, "hw_mipmap");
+	SettingsWrapBitBoolEx(HWAccuratePrims, "HWAccuratePrims");
 	SettingsWrapIntEnumEx(AccurateBlendingUnit, "accurate_blending_unit");
 	SettingsWrapIntEnumEx(TextureFiltering, "filter");
 	SettingsWrapIntEnumEx(TexturePreloading, "texture_preloading");