GS/DX11: Support multidraw framebuffer copy for sw blending, fbmask.

Instead of using barriers, we can copy the framebuffer before each split of the draw to
replicate barrier behavior; however, this will be much slower.
This commit is contained in:
lightningterror 2025-08-21 16:42:55 +02:00
parent 20411aa8d6
commit 567b0173ec
9 changed files with 98 additions and 37 deletions

View File

@ -1102,7 +1102,7 @@ void GraphicsSettingsWidget::updateRendererDependentOptions()
const bool is_software = (type == GSRendererType::SW);
const bool is_auto = (type == GSRendererType::Auto);
const bool is_vk = (type == GSRendererType::VK);
const bool is_disable_barriers = (type == GSRendererType::DX11 || type == GSRendererType::DX12 || type == GSRendererType::Metal || type == GSRendererType::SW);
const bool is_disable_barriers = (type == GSRendererType::DX12 || type == GSRendererType::Metal || type == GSRendererType::SW);
const bool hw_fixes = (is_hardware && m_hw.enableHWFixes && m_hw.enableHWFixes->checkState() == Qt::Checked);
QWidget* prev_tab;

View File

@ -809,6 +809,7 @@ public:
bool vs_expand : 1; ///< Supports expanding points/lines/sprites in the vertex shader
bool primitive_id : 1; ///< Supports primitive ID for use with prim tracking destination alpha algorithm
bool texture_barrier : 1; ///< Supports sampling rt and hopefully texture barrier
bool multidraw_fb_copy : 1; ///< Replacement for texture barrier.
bool provoking_vertex_last: 1; ///< Supports using the last vertex in a primitive as the value for flat shading.
bool point_expand : 1; ///< Supports point expansion in hardware.
bool line_expand : 1; ///< Supports line expansion in hardware.

View File

@ -46,6 +46,7 @@ GSDevice11::GSDevice11()
m_features.primitive_id = true;
m_features.texture_barrier = false;
m_features.multidraw_fb_copy = GSConfig.OverrideTextureBarriers != 0;
m_features.provoking_vertex_last = false;
m_features.point_expand = false;
m_features.line_expand = false;
@ -2681,26 +2682,6 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
PSSetShaderResource(1, config.pal);
}
GSTexture* draw_rt_clone = nullptr;
if (config.require_one_barrier || (config.tex && config.tex == config.rt))
{
// Requires a copy of the RT.
// Used as "bind rt" flag when texture barrier is unsupported for tex is fb.
draw_rt_clone = CreateTexture(rtsize.x, rtsize.y, 1, colclip_rt ? GSTexture::Format::ColorClip : GSTexture::Format::Color, true);
if (draw_rt_clone)
{
CopyRect(colclip_rt ? colclip_rt : config.rt, draw_rt_clone, config.drawarea, config.drawarea.left, config.drawarea.top);
if (config.require_one_barrier)
PSSetShaderResource(2, draw_rt_clone);
if (config.tex && config.tex == config.rt)
PSSetShaderResource(0, draw_rt_clone);
}
else
Console.Warning("D3D11: Failed to allocate temp texture for RT copy.");
}
SetupVS(config.vs, &config.cb_vs);
SetupPS(config.ps, &config.cb_ps, config.sampler);
@ -2742,13 +2723,25 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
draw_ds = m_state.cached_dsv;
}
GSTexture* draw_rt_clone = nullptr;
if (config.require_one_barrier || (config.require_full_barrier && m_features.multidraw_fb_copy) || (config.tex && config.tex == config.rt))
{
// Requires a copy of the RT.
// Used as "bind rt" flag when texture barrier is unsupported for tex is fb.
draw_rt_clone = CreateTexture(rtsize.x, rtsize.y, 1, draw_rt->GetFormat(), true);
if (!draw_rt_clone)
Console.Warning("D3D11: Failed to allocate temp texture for RT copy.");
}
// Update again as it may have changed.
if (config.tex && config.tex == config.ds)
read_only_dsv = static_cast<GSTexture11*>(draw_ds)->ReadOnlyDepthStencilView();
OMSetRenderTargets(draw_rt, draw_ds, &config.scissor, read_only_dsv);
SetupOM(config.depth, OMBlendSelector(config.colormask, config.blend), config.blend.constant);
DrawIndexedPrimitive();
SendHWDraw(config, draw_rt_clone, draw_rt, config.require_one_barrier, config.require_full_barrier);
if (config.blend_multi_pass.enable)
{
@ -2773,7 +2766,7 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
}
SetupOM(config.alpha_second_pass.depth, OMBlendSelector(config.alpha_second_pass.colormask, config.blend), config.blend.constant);
DrawIndexedPrimitive();
SendHWDraw(config, draw_rt_clone, draw_rt, config.alpha_second_pass.require_one_barrier, config.alpha_second_pass.require_full_barrier);
}
if (draw_rt_clone)
@ -2799,3 +2792,60 @@ void GSDevice11::RenderHW(GSHWDrawConfig& config)
}
}
}
void GSDevice11::SendHWDraw(const GSHWDrawConfig& config, GSTexture* draw_rt_clone, GSTexture* draw_rt, const bool one_barrier, const bool full_barrier)
{
if (draw_rt_clone)
{
#ifdef PCSX2_DEVBUILD
if ((one_barrier || full_barrier) && !config.ps.IsFeedbackLoop()) [[unlikely]]
Console.Warning("D3D11: Possible unnecessary copy detected.");
#endif
const u32 indices_per_prim = config.indices_per_prim;
auto CopyAndBind = [&]() {
CopyRect(draw_rt, draw_rt_clone, config.drawarea, config.drawarea.left, config.drawarea.top);
if (one_barrier || full_barrier)
PSSetShaderResource(2, draw_rt_clone);
if (config.tex && config.tex == config.rt)
PSSetShaderResource(0, draw_rt_clone);
};
// Copy once per batch, primitives don't overlap each other.
if (m_features.multidraw_fb_copy && full_barrier && config.drawlist)
{
const u32 draw_list_size = static_cast<u32>(config.drawlist->size());
for (u32 n = 0, p = 0; n < draw_list_size; n++)
{
const u32 count = (*config.drawlist)[n] * indices_per_prim;
CopyAndBind();
DrawIndexedPrimitive(p, count);
p += count;
}
return;
}
// Copy once per primitive.
// TODO: Optimization try to use prim area for copy instead of draw area,
// might need current prim and previous prim area due to overlap,
// will need to use vertex cords to get the new copy rect.
if (m_features.multidraw_fb_copy && full_barrier)
{
for (u32 p = 0; p < config.nindices; p += indices_per_prim)
{
CopyAndBind();
DrawIndexedPrimitive(p, indices_per_prim);
}
return;
}
CopyAndBind();
}
DrawIndexedPrimitive();
}

View File

@ -343,6 +343,7 @@ public:
void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, u8 afix);
void RenderHW(GSHWDrawConfig& config) override;
void SendHWDraw(const GSHWDrawConfig& config, GSTexture* draw_rt_clone, GSTexture* draw_rt, const bool one_barrier, const bool full_barrier);
void ClearSamplerCache() override;

View File

@ -1225,6 +1225,7 @@ bool GSDevice12::CheckFeatures(const u32& vendor_id)
const bool isAMD = (vendor_id == 0x1002 || vendor_id == 0x1022);
m_features.texture_barrier = false;
m_features.multidraw_fb_copy = false;
m_features.broken_point_sampler = isAMD;
m_features.primitive_id = true;
m_features.prefer_new_textures = true;

View File

@ -5069,7 +5069,7 @@ void GSRendererHW::EmulateTextureShuffleAndFbmask(GSTextureCache::Target* rt, GS
bool enable_fbmask_emulation = false;
const GSDevice::FeatureSupport features = g_gs_device->Features();
if (features.texture_barrier)
if (features.texture_barrier || features.multidraw_fb_copy)
{
enable_fbmask_emulation = GSConfig.AccurateBlendingUnit != AccBlendLevel::Minimum;
}
@ -5241,7 +5241,7 @@ void GSRendererHW::EmulateTextureShuffleAndFbmask(GSTextureCache::Target* rt, GS
have been invalidated before subsequent Draws are executed.
*/
// No blending so hit unsafe path.
if (!PRIM->ABE || !(~ff_fbmask & ~zero_fbmask & 0x7) || !g_gs_device->Features().texture_barrier)
if (!PRIM->ABE || !(~ff_fbmask & ~zero_fbmask & 0x7) || !(features.texture_barrier || features.multidraw_fb_copy))
{
GL_INS("HW: FBMASK Unsafe SW emulated fb_mask:%x on %d bits format", m_cached_ctx.FRAME.FBMSK,
(m_conf.ps.dst_fmt == GSLocalMemory::PSM_FMT_16) ? 16 : 32);
@ -5691,7 +5691,7 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
const bool alpha_mask = (m_cached_ctx.FRAME.FBMSK & 0xFF000000) == 0xFF000000;
bool blend_ad_alpha_masked = blend_ad && alpha_mask;
const bool is_basic_blend = GSConfig.AccurateBlendingUnit >= AccBlendLevel::Basic;
if (blend_ad_alpha_masked && (((is_basic_blend || (COLCLAMP.CLAMP == 0)) && features.texture_barrier)
if (blend_ad_alpha_masked && (((is_basic_blend || (COLCLAMP.CLAMP == 0)) && (features.texture_barrier || features.multidraw_fb_copy))
|| ((GSConfig.AccurateBlendingUnit >= AccBlendLevel::Medium) || m_conf.require_one_barrier)))
{
// Swap Ad with As for hw blend.
@ -5746,7 +5746,7 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
// HW blend can be done in multiple passes when there's no overlap.
// Blend multi pass is only useful when texture barriers aren't supported.
// Speed wise Texture barriers > blend multi pass > texture copies.
const bool blend_multi_pass_support = !features.texture_barrier && no_prim_overlap && is_basic_blend;
const bool blend_multi_pass_support = !features.texture_barrier && no_prim_overlap && is_basic_blend && COLCLAMP.CLAMP;
const bool bmix1_multi_pass1 = blend_multi_pass_support && blend_mix1 && (alpha_c0_high_max_one || alpha_c2_high_one) && m_conf.ps.blend_d == 2;
const bool bmix1_multi_pass2 = blend_multi_pass_support && (blend_flag & BLEND_MIX1) && m_conf.ps.blend_b == m_conf.ps.blend_d && !m_conf.ps.dither && alpha_high_one;
const bool bmix3_multi_pass = blend_multi_pass_support && blend_mix3 && !m_conf.ps.dither && alpha_high_one;
@ -5758,15 +5758,19 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
// Condition 2: One barrier is already enabled, prims don't overlap or is a channel shuffle so let's use sw blend instead.
// Condition 3: A texture shuffle is unlikely to overlap, so we can prefer full sw blend.
// Condition 4: If it's tex in fb draw and there's no overlap prefer sw blend, fb is already being read.
const bool prefer_sw_blend = (features.texture_barrier && m_conf.require_full_barrier) || (m_conf.require_one_barrier && (no_prim_overlap || m_channel_shuffle)) || m_conf.ps.shuffle || (no_prim_overlap && (m_conf.tex == m_conf.rt));
const bool prefer_sw_blend = ((features.texture_barrier || features.multidraw_fb_copy) && m_conf.require_full_barrier) || (m_conf.require_one_barrier && (no_prim_overlap || m_channel_shuffle)) || m_conf.ps.shuffle || (no_prim_overlap && (m_conf.tex == m_conf.rt));
const bool free_blend = blend_non_recursive // Free sw blending, doesn't require barriers or reading fb
|| accumulation_blend; // Mix of hw/sw blending
// Warning no break on purpose
// Note: the [[fallthrough]] attribute tell compilers not to complain about not having breaks.
bool sw_blending = false;
if (features.texture_barrier)
if (features.texture_barrier || features.multidraw_fb_copy)
{
// Try to lower sw blend on dx11, try to use blend multipass if possible on basic blend.
const bool blend_multipass_group = blend_multi_pass_support && !features.texture_barrier && features.multidraw_fb_copy &&
(bmix1_multi_pass1 || bmix1_multi_pass2 || bmix3_multi_pass || (blend_flag & (BLEND_HW3 | BLEND_HW4 | BLEND_HW5 | BLEND_HW6 | BLEND_HW7 | BLEND_HW8 | BLEND_HW9)));
const bool blend_requires_barrier = (blend_flag & BLEND_A_MAX) // Impossible blending
// Sw blend, either full barrier or one barrier with no overlap.
|| prefer_sw_blend
@ -5774,9 +5778,9 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
// On fbfetch, one barrier is like full barrier.
|| (one_barrier && (no_prim_overlap || features.framebuffer_fetch))
// Blending with alpha > 1 will be wrong, except BLEND_HW2.
|| (!(blend_flag & BLEND_HW2) && (alpha_c2_high_one || alpha_c0_high_max_one) && no_prim_overlap)
|| (!(blend_flag & BLEND_HW2) && !blend_multipass_group && (alpha_c2_high_one || alpha_c0_high_max_one) && no_prim_overlap)
// Ad blends are completely wrong without sw blend (Ad is 0.5 not 1 for 128). We can spare a barrier for it.
|| (blend_ad && no_prim_overlap && !new_rt_alpha_scale);
|| (blend_ad && !blend_multipass_group && no_prim_overlap && !new_rt_alpha_scale);
switch (GSConfig.AccurateBlendingUnit)
{
@ -5815,8 +5819,8 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
}
else
{
const bool ad_second_pass = blend_multi_pass_support && alpha_c1_high_no_rta_correct && COLCLAMP.CLAMP &&
(blend_flag & (BLEND_HW3 | BLEND_HW5 | BLEND_HW6 | BLEND_HW7 | BLEND_HW9));
const bool ad_second_pass = blend_multi_pass_support && alpha_c1_high_no_rta_correct &&
(blend_flag & (BLEND_HW3 | BLEND_HW5 | BLEND_HW6 | BLEND_HW7 | BLEND_HW9));
switch (GSConfig.AccurateBlendingUnit)
{
@ -5956,7 +5960,7 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
m_conf.ps.pabe = 1;
}
else if (features.texture_barrier)
else if (features.texture_barrier || features.multidraw_fb_copy)
{
// PABE sw blend:
// Disable hw/sw mix and do pure sw blend with reading the framebuffer.
@ -6130,7 +6134,7 @@ void GSRendererHW::EmulateBlending(int rt_alpha_min, int rt_alpha_max, const boo
const bool blend_non_recursive_one_barrier = blend_non_recursive && blend_ad_alpha_masked;
if (blend_non_recursive_one_barrier)
m_conf.require_one_barrier |= true;
else if (features.texture_barrier)
else if (features.texture_barrier || features.multidraw_fb_copy)
m_conf.require_full_barrier |= !blend_non_recursive;
else
m_conf.require_one_barrier |= !blend_non_recursive;
@ -7936,12 +7940,12 @@ __ri void GSRendererHW::DrawPrims(GSTextureCache::Target* rt, GSTextureCache::Ta
pxAssert(!m_conf.require_full_barrier || !m_conf.ps.colclip_hw);
// Swap full barrier for one barrier when there's no overlap, or a shuffle.
if (features.texture_barrier && m_conf.require_full_barrier && (m_prim_overlap == PRIM_OVERLAP_NO || m_conf.ps.shuffle || m_channel_shuffle))
if ((features.texture_barrier || features.multidraw_fb_copy) && m_conf.require_full_barrier && (m_prim_overlap == PRIM_OVERLAP_NO || m_conf.ps.shuffle || m_channel_shuffle))
{
m_conf.require_full_barrier = false;
m_conf.require_one_barrier = true;
}
else if (!features.texture_barrier)
else if (!(features.texture_barrier || features.multidraw_fb_copy))
{
// These shouldn't be enabled if texture barriers aren't supported, make sure they are off.
m_conf.ps.write_rg = 0;

View File

@ -921,6 +921,7 @@ bool GSDeviceMTL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
m_features.vs_expand = !GSConfig.DisableVertexShaderExpand;
m_features.primitive_id = m_dev.features.primid;
m_features.texture_barrier = true;
m_features.multidraw_fb_copy = false;
m_features.provoking_vertex_last = false;
m_features.point_expand = true;
m_features.line_expand = false;

View File

@ -733,6 +733,7 @@ bool GSDeviceOGL::CheckFeatures(bool& buggy_pbo)
"GL_ARB_texture_barrier is not supported, blending will not be accurate.", Host::OSD_ERROR_DURATION);
}
m_features.multidraw_fb_copy = false;
m_features.provoking_vertex_last = true;
m_features.dxt_textures = GLAD_GL_EXT_texture_compression_s3tc;
m_features.bptc_textures =

View File

@ -2601,6 +2601,7 @@ bool GSDeviceVK::CheckFeatures()
m_features.framebuffer_fetch =
m_optional_extensions.vk_ext_rasterization_order_attachment_access && !GSConfig.DisableFramebufferFetch;
m_features.texture_barrier = GSConfig.OverrideTextureBarriers != 0;
m_features.multidraw_fb_copy = false;
m_features.broken_point_sampler = false;
// geometryShader is needed because gl_PrimitiveID is part of the Geometry SPIR-V Execution Model.
@ -2640,6 +2641,7 @@ bool GSDeviceVK::CheckFeatures()
DevCon.WriteLn("Optional features:%s%s%s%s%s", m_features.primitive_id ? " primitive_id" : "",
m_features.texture_barrier ? " texture_barrier" : "", m_features.framebuffer_fetch ? " framebuffer_fetch" : "",
m_features.provoking_vertex_last ? " provoking_vertex_last" : "", m_features.vs_expand ? " vs_expand" : "");
DevCon.WriteLn("Using %s for point expansion and %s for line expansion.",
m_features.point_expand ? "hardware" : "vertex expanding",
m_features.line_expand ? "hardware" : "vertex expanding");