mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-12-16 04:09:07 +00:00
gl: Implement hardware deswizzle for small texel formats
This commit is contained in:
parent
ff72f944ba
commit
cffc13696d
@ -263,8 +263,6 @@ namespace gl
|
|||||||
|
|
||||||
cs_deswizzle_3d()
|
cs_deswizzle_3d()
|
||||||
{
|
{
|
||||||
ensure((sizeof(_BlockType) & 3) == 0); // "Unsupported block type"
|
|
||||||
|
|
||||||
initialize();
|
initialize();
|
||||||
|
|
||||||
m_src =
|
m_src =
|
||||||
@ -294,8 +292,10 @@ namespace gl
|
|||||||
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0))},
|
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0))},
|
||||||
{ "%push_block", fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(2)) },
|
{ "%push_block", fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(2)) },
|
||||||
{ "%ws", std::to_string(optimal_group_size) },
|
{ "%ws", std::to_string(optimal_group_size) },
|
||||||
{ "%_wordcount", std::to_string(sizeof(_BlockType) / 4) },
|
{ "%_wordcount", std::to_string(std::max<u32>(sizeof(_BlockType) / 4u, 1u)) },
|
||||||
{ "%f", transform }
|
{ "%f", transform },
|
||||||
|
{ "%_8bit", sizeof(_BlockType) == 1 ? "1" : "0" },
|
||||||
|
{ "%_16bit", sizeof(_BlockType) == 2 ? "1" : "0" },
|
||||||
};
|
};
|
||||||
|
|
||||||
m_src = fmt::replace_all(m_src, syntax_replace);
|
m_src = fmt::replace_all(m_src, syntax_replace);
|
||||||
@ -339,7 +339,8 @@ namespace gl
|
|||||||
set_parameters(cmd);
|
set_parameters(cmd);
|
||||||
|
|
||||||
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
|
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
|
||||||
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
|
const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
|
||||||
|
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
|
||||||
compute_task::run(cmd, linear_invocations);
|
compute_task::run(cmd, linear_invocations);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@ -36,6 +36,16 @@ namespace gl
|
|||||||
{
|
{
|
||||||
switch (block_size)
|
switch (block_size)
|
||||||
{
|
{
|
||||||
|
case 1:
|
||||||
|
gl::get_compute_task<gl::cs_deswizzle_3d<u8, WordType, SwapBytes>>()->run(
|
||||||
|
cmd, dst, dst_offset, src, src_offset,
|
||||||
|
data_length, width, height, depth, 1);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
gl::get_compute_task<gl::cs_deswizzle_3d<u16, WordType, SwapBytes>>()->run(
|
||||||
|
cmd, dst, dst_offset, src, src_offset,
|
||||||
|
data_length, width, height, depth, 1);
|
||||||
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
gl::get_compute_task<gl::cs_deswizzle_3d<u32, WordType, SwapBytes>>()->run(
|
gl::get_compute_task<gl::cs_deswizzle_3d<u32, WordType, SwapBytes>>()->run(
|
||||||
cmd, dst, dst_offset, src, src_offset,
|
cmd, dst, dst_offset, src, src_offset,
|
||||||
@ -748,39 +758,54 @@ namespace gl
|
|||||||
g_upload_transfer_buffer.copy_to(&g_deswizzle_scratch_buffer.get(), upload_scratch_mem.second, deswizzle_data_offset, static_cast<u32>(image_linear_size));
|
g_upload_transfer_buffer.copy_to(&g_deswizzle_scratch_buffer.get(), upload_scratch_mem.second, deswizzle_data_offset, static_cast<u32>(image_linear_size));
|
||||||
|
|
||||||
// 2.2 Apply compute transform to deswizzle input and dump it in compute_scratch_mem
|
// 2.2 Apply compute transform to deswizzle input and dump it in compute_scratch_mem
|
||||||
ensure(op.element_size == 2 || op.element_size == 4);
|
|
||||||
const auto block_size = op.element_size * op.block_length;
|
const auto block_size = op.element_size * op.block_length;
|
||||||
|
|
||||||
if (op.require_swap)
|
if (op.require_swap)
|
||||||
{
|
{
|
||||||
mem_layout.swap_bytes = false;
|
mem_layout.swap_bytes = false;
|
||||||
|
|
||||||
if (op.element_size == 4) [[ likely ]]
|
switch (op.element_size)
|
||||||
{
|
{
|
||||||
do_deswizzle_transformation<u32, true>(cmd, block_size,
|
case 1:
|
||||||
|
do_deswizzle_transformation<u8, true>(cmd, block_size,
|
||||||
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
||||||
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
||||||
}
|
break;
|
||||||
else
|
case 2:
|
||||||
{
|
|
||||||
do_deswizzle_transformation<u16, true>(cmd, block_size,
|
do_deswizzle_transformation<u16, true>(cmd, block_size,
|
||||||
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
||||||
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
do_deswizzle_transformation<u32, true>(cmd, block_size,
|
||||||
|
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
||||||
|
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fmt::throw_exception("Unimplemented element size deswizzle");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (op.element_size == 4) [[ likely ]]
|
switch (op.element_size)
|
||||||
{
|
{
|
||||||
do_deswizzle_transformation<u32, false>(cmd, block_size,
|
case 1:
|
||||||
|
do_deswizzle_transformation<u8, false>(cmd, block_size,
|
||||||
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
||||||
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
||||||
}
|
break;
|
||||||
else
|
case 2:
|
||||||
{
|
|
||||||
do_deswizzle_transformation<u16, false>(cmd, block_size,
|
do_deswizzle_transformation<u16, false>(cmd, block_size,
|
||||||
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
||||||
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
do_deswizzle_transformation<u32, false>(cmd, block_size,
|
||||||
|
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
|
||||||
|
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fmt::throw_exception("Unimplemented element size deswizzle");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -476,21 +476,8 @@ namespace vk
|
|||||||
params.logd = rsx::ceil_log2(depth);
|
params.logd = rsx::ceil_log2(depth);
|
||||||
|
|
||||||
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
|
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
|
||||||
u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
|
const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
|
||||||
|
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
|
||||||
// Check if we need to do subaddressing and adjust invocation count accordingly
|
|
||||||
switch (sizeof(_BlockType))
|
|
||||||
{
|
|
||||||
case 1:
|
|
||||||
linear_invocations /= 4;
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
linear_invocations /= 2;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
compute_task::run(cmd, linear_invocations);
|
compute_task::run(cmd, linear_invocations);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user