gl: Implement hardware deswizzle for small texel formats

This commit is contained in:
kd-11 2025-11-20 02:04:23 +03:00 committed by kd-11
parent ff72f944ba
commit cffc13696d
3 changed files with 44 additions and 31 deletions

View File

@ -263,8 +263,6 @@ namespace gl
cs_deswizzle_3d() cs_deswizzle_3d()
{ {
ensure((sizeof(_BlockType) & 3) == 0); // "Unsupported block type"
initialize(); initialize();
m_src = m_src =
@ -294,8 +292,10 @@ namespace gl
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0))}, { "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0))},
{ "%push_block", fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(2)) }, { "%push_block", fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(2)) },
{ "%ws", std::to_string(optimal_group_size) }, { "%ws", std::to_string(optimal_group_size) },
{ "%_wordcount", std::to_string(sizeof(_BlockType) / 4) }, { "%_wordcount", std::to_string(std::max<u32>(sizeof(_BlockType) / 4u, 1u)) },
{ "%f", transform } { "%f", transform },
{ "%_8bit", sizeof(_BlockType) == 1 ? "1" : "0" },
{ "%_16bit", sizeof(_BlockType) == 2 ? "1" : "0" },
}; };
m_src = fmt::replace_all(m_src, syntax_replace); m_src = fmt::replace_all(m_src, syntax_replace);
@ -339,7 +339,8 @@ namespace gl
set_parameters(cmd); set_parameters(cmd);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
compute_task::run(cmd, linear_invocations); compute_task::run(cmd, linear_invocations);
} }
}; };

View File

@ -36,6 +36,16 @@ namespace gl
{ {
switch (block_size) switch (block_size)
{ {
case 1:
gl::get_compute_task<gl::cs_deswizzle_3d<u8, WordType, SwapBytes>>()->run(
cmd, dst, dst_offset, src, src_offset,
data_length, width, height, depth, 1);
break;
case 2:
gl::get_compute_task<gl::cs_deswizzle_3d<u16, WordType, SwapBytes>>()->run(
cmd, dst, dst_offset, src, src_offset,
data_length, width, height, depth, 1);
break;
case 4: case 4:
gl::get_compute_task<gl::cs_deswizzle_3d<u32, WordType, SwapBytes>>()->run( gl::get_compute_task<gl::cs_deswizzle_3d<u32, WordType, SwapBytes>>()->run(
cmd, dst, dst_offset, src, src_offset, cmd, dst, dst_offset, src, src_offset,
@ -748,39 +758,54 @@ namespace gl
g_upload_transfer_buffer.copy_to(&g_deswizzle_scratch_buffer.get(), upload_scratch_mem.second, deswizzle_data_offset, static_cast<u32>(image_linear_size)); g_upload_transfer_buffer.copy_to(&g_deswizzle_scratch_buffer.get(), upload_scratch_mem.second, deswizzle_data_offset, static_cast<u32>(image_linear_size));
// 2.2 Apply compute transform to deswizzle input and dump it in compute_scratch_mem // 2.2 Apply compute transform to deswizzle input and dump it in compute_scratch_mem
ensure(op.element_size == 2 || op.element_size == 4);
const auto block_size = op.element_size * op.block_length; const auto block_size = op.element_size * op.block_length;
if (op.require_swap) if (op.require_swap)
{ {
mem_layout.swap_bytes = false; mem_layout.swap_bytes = false;
if (op.element_size == 4) [[ likely ]] switch (op.element_size)
{ {
do_deswizzle_transformation<u32, true>(cmd, block_size, case 1:
do_deswizzle_transformation<u8, true>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
} break;
else case 2:
{
do_deswizzle_transformation<u16, true>(cmd, block_size, do_deswizzle_transformation<u16, true>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
break;
case 4:
do_deswizzle_transformation<u32, true>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
break;
default:
fmt::throw_exception("Unimplemented element size deswizzle");
} }
} }
else else
{ {
if (op.element_size == 4) [[ likely ]] switch (op.element_size)
{ {
do_deswizzle_transformation<u32, false>(cmd, block_size, case 1:
do_deswizzle_transformation<u8, false>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
} break;
else case 2:
{
do_deswizzle_transformation<u16, false>(cmd, block_size, do_deswizzle_transformation<u16, false>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
break;
case 4:
do_deswizzle_transformation<u32, false>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
break;
default:
fmt::throw_exception("Unimplemented element size deswizzle");
} }
} }

View File

@ -476,21 +476,8 @@ namespace vk
params.logd = rsx::ceil_log2(depth); params.logd = rsx::ceil_log2(depth);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
// Check if we need to do subaddressing and adjust invocation count accordingly
switch (sizeof(_BlockType))
{
case 1:
linear_invocations /= 4;
break;
case 2:
linear_invocations /= 2;
break;
default:
break;
}
compute_task::run(cmd, linear_invocations); compute_task::run(cmd, linear_invocations);
} }
}; };