GS/VK/GL/DX12/DX11: Use default buffer instead of upload buffer for accurate prims data.

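This should hopefully give better performance, since the GPU can read the accurate prims data from a default (GPU-local) buffer rather than an upload buffer.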

Also refactor some upload/staging buffer handling in VK/DX12.
This commit is contained in:
TJnotJT 2025-11-25 18:25:57 -05:00
parent 16a7cfebdd
commit cbd4a9c92f
17 changed files with 370 additions and 208 deletions

View File

@ -396,8 +396,12 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
}
}
bd = {};
if (m_features.accurate_prims)
{
bd.Usage = D3D11_USAGE_DEFAULT;
bd.CPUAccessFlags = 0;
bd.ByteWidth = ACCURATE_PRIMS_BUFFER_SIZE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
bd.StructureByteStride = sizeof(AccuratePrimsEdgeData);
@ -410,8 +414,11 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
}
const CD3D11_SHADER_RESOURCE_VIEW_DESC accurate_prims_b_srv_desc(
D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0, ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData));
if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc, m_accurate_prims_b_srv.put())))
D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0,
ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData));
if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc,
m_accurate_prims_b_srv.put())))
{
Console.Error("D3D11: Failed to create accurate prims buffer SRV.");
return false;
@ -419,7 +426,7 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
// If MAX_TEXTURES changes, please change the register for this buffer in the shader.
static_assert(MAX_TEXTURES == 5);
m_ctx->PSSetShaderResources(MAX_TEXTURES, 1, m_accurate_prims_b_srv.addressof());
m_ctx->PSSetShaderResources(5, 1, m_accurate_prims_b_srv.addressof());
}
// rasterizer
@ -2326,29 +2333,18 @@ bool GSDevice11::SetupAccuratePrims(GSHWDrawConfig& config)
if (size > ACCURATE_PRIMS_BUFFER_SIZE)
return false;
D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE;
// Performance note: UpdateSubresource() copies data to a temp staging buffer to avoid stalling the GPU,
// so a manual ring buffer is not needed here like VK/DX12.
D3D11_BOX dst_region{};
dst_region.left = 0;
dst_region.right = size;
dst_region.top = 0;
dst_region.bottom = 1;
dst_region.front = 0;
dst_region.back = 1;
m_ctx->UpdateSubresource(m_accurate_prims_b.get(), 0, &dst_region, config.accurate_prims_edge_data->data(), size, 0);
pxAssert(m_accurate_prims_b_pos % sizeof(AccuratePrimsEdgeData) == 0);
if (m_accurate_prims_b_pos + size > ACCURATE_PRIMS_BUFFER_SIZE)
{
m_accurate_prims_b_pos = 0;
type = D3D11_MAP_WRITE_DISCARD;
}
D3D11_MAPPED_SUBRESOURCE m;
if (FAILED(m_ctx->Map(m_accurate_prims_b.get(), 0, type, 0, &m)))
return false;
void* map = static_cast<u8*>(m.pData) + m_accurate_prims_b_pos;
GSVector4i::storent(map, config.accurate_prims_edge_data->data(), size);
m_ctx->Unmap(m_accurate_prims_b.get(), 0);
config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_b_pos / sizeof(AccuratePrimsEdgeData);
m_accurate_prims_b_pos += size;
config.cb_ps.accurate_prims_base_index.x = 0; // No offsetting needed like DX12/VK since we don't use a ring buffer.
}
return true;
}

View File

@ -137,7 +137,6 @@ private:
u32 m_vb_pos = 0; // bytes
u32 m_ib_pos = 0; // indices/sizeof(u32)
u32 m_structured_vb_pos = 0; // bytes
u32 m_accurate_prims_b_pos = 0; // bytes/sizeof(AccuratePrimsEdgeData)
bool m_allow_tearing_supported = false;
bool m_using_flip_model_swap_chain = true;

View File

@ -20,29 +20,33 @@ D3D12StreamBuffer::~D3D12StreamBuffer()
Destroy();
}
bool D3D12StreamBuffer::Create(u32 size)
bool D3D12StreamBuffer::Create(u32 size, bool default_heap)
{
const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN,
{1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
D3D12MA::ALLOCATION_DESC allocationDesc = {};
allocationDesc.Flags = D3D12MA::ALLOCATION_FLAG_COMMITTED;
allocationDesc.HeapType = D3D12_HEAP_TYPE_UPLOAD;
allocationDesc.HeapType = default_heap ? D3D12_HEAP_TYPE_DEFAULT : D3D12_HEAP_TYPE_UPLOAD;
wil::com_ptr_nothrow<ID3D12Resource> buffer;
wil::com_ptr_nothrow<D3D12MA::Allocation> allocation;
HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocationDesc, &resource_desc,
D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(buffer.put()));
default_heap ? D3D12_RESOURCE_STATE_COMMON : D3D12_RESOURCE_STATE_GENERIC_READ,
nullptr, allocation.put(), IID_PPV_ARGS(buffer.put()));
pxAssertMsg(SUCCEEDED(hr), "Allocate buffer");
if (FAILED(hr))
return false;
static const D3D12_RANGE read_range = {};
u8* host_pointer;
hr = buffer->Map(0, &read_range, reinterpret_cast<void**>(&host_pointer));
pxAssertMsg(SUCCEEDED(hr), "Map buffer");
if (FAILED(hr))
return false;
u8* host_pointer = nullptr;
if (!default_heap)
{
hr = buffer->Map(0, &read_range, reinterpret_cast<void**>(&host_pointer));
pxAssertMsg(SUCCEEDED(hr), "Map buffer");
if (FAILED(hr))
return false;
}
Destroy(true);
@ -51,6 +55,7 @@ bool D3D12StreamBuffer::Create(u32 size)
m_host_pointer = host_pointer;
m_size = size;
m_gpu_pointer = m_buffer->GetGPUVirtualAddress();
m_default_heap = default_heap;
return true;
}
@ -148,6 +153,7 @@ void D3D12StreamBuffer::Destroy(bool defer)
m_current_offset = 0;
m_current_space = 0;
m_current_gpu_position = 0;
m_default_heap = false;
m_tracked_fences.clear();
}

View File

@ -22,7 +22,7 @@ public:
D3D12StreamBuffer();
~D3D12StreamBuffer();
bool Create(u32 size);
bool Create(u32 size, bool default_heap = false);
__fi bool IsValid() const { return static_cast<bool>(m_buffer); }
__fi ID3D12Resource* GetBuffer() const { return m_buffer.get(); }
@ -54,7 +54,8 @@ private:
wil::com_ptr_nothrow<ID3D12Resource> m_buffer;
wil::com_ptr_nothrow<D3D12MA::Allocation> m_allocation;
D3D12_GPU_VIRTUAL_ADDRESS m_gpu_pointer = {};
u8* m_host_pointer = nullptr;
u8* m_host_pointer = nullptr; // Only used for upload heaps.
bool m_default_heap = false; // False for upload heap; true for default heap.
// List of fences and the corresponding positions in the buffer
std::deque<std::pair<u64, u32>> m_tracked_fences;

View File

@ -624,52 +624,91 @@ bool GSDevice12::SetGPUTimingEnabled(bool enabled)
/// Creates a buffer in a default heap and initializes it with data produced by fill_callback.
///
/// The data is first written to a temporary upload-heap staging buffer (via AllocateUploadStagingBuffer),
/// then copied to the new buffer on the init command list, and finally the new buffer is transitioned to
/// D3D12_RESOURCE_STATE_INDEX_BUFFER.
///
/// @param size            Size of the buffer in bytes.
/// @param gpu_buffer      Receives the created GPU buffer.
/// @param gpu_allocation  Receives the allocation backing gpu_buffer.
/// @param fill_callback   Called once with a pointer to `size` bytes of mapped staging memory to fill.
/// @return true on success; false if either the staging buffer or the GPU buffer could not be created.
bool GSDevice12::AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer,
	D3D12MA::Allocation** gpu_allocation, const std::function<void(void*)>& fill_callback)
{
	// Allocate and fill the staging buffer. AllocateUploadStagingBuffer() already queues it for deferred
	// destruction, so it must not be released or deferred again here.
	ID3D12Resource* const cpu_buffer = AllocateUploadStagingBuffer(size, fill_callback);
	if (!cpu_buffer)
		return false;

	// Create the GPU buffer.
	const D3D12_RESOURCE_DESC rd = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN, {1, 0},
		D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
	const D3D12MA::ALLOCATION_DESC gpu_ad = {D3D12MA::ALLOCATION_FLAG_COMMITTED, D3D12_HEAP_TYPE_DEFAULT};
	const HRESULT hr = m_allocator->CreateResource(
		&gpu_ad, &rd, D3D12_RESOURCE_STATE_COMMON, nullptr, gpu_allocation, IID_PPV_ARGS(gpu_buffer));
	pxAssertMsg(SUCCEEDED(hr), "Allocate GPU buffer");
	if (FAILED(hr))
		return false;

	// Copy the data from the staging buffer into the GPU buffer.
	GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer, 0, size);

	// Transition the GPU buffer from COPY_DEST to INDEX_BUFFER now that the copy has been recorded.
	D3D12_RESOURCE_BARRIER rb = {D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, D3D12_RESOURCE_BARRIER_FLAG_NONE};
	rb.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
	rb.Transition.pResource = *gpu_buffer;
	rb.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST; // COMMON -> COPY_DEST at first use.
	rb.Transition.StateAfter = D3D12_RESOURCE_STATE_INDEX_BUFFER;
	GetInitCommandList()->ResourceBarrier(1, &rb);
	return true;
}
/// Reserves `size` bytes in the texture stream buffer, lets write_data fill them, and commits the region.
///
/// If the first reservation fails, the current command list is executed and the reservation is retried once.
///
/// @param size        Number of bytes to reserve and commit.
/// @param write_data  Called with the host pointer of the reserved region.
/// @param offset_out  Receives the offset of the reserved region within the returned buffer.
/// @return The texture stream buffer resource, or nullptr if space could not be reserved.
ID3D12Resource* GSDevice12::WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out)
{
	auto& stream = m_texture_stream_buffer;
	const auto try_reserve = [&stream, size]() -> bool {
		return stream.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT);
	};

	bool reserved = try_reserve();
	if (!reserved)
	{
		// Out of space: execute the pending work, then try exactly once more.
		GSDevice12::GetInstance()->ExecuteCommandList(
			false, "While waiting for %u bytes in texture upload buffer", size);
		reserved = try_reserve();
	}

	if (!reserved)
	{
		Console.Error("Failed to reserve texture upload memory (%u bytes).", size);
		return nullptr;
	}

	// The offset must be read before committing, since it refers to the start of the reserved region.
	offset_out = stream.GetCurrentOffset();
	write_data(stream.GetCurrentHostPointer());
	stream.CommitMemory(size);
	return stream.GetBuffer();
}
/// Allocates a one-off upload-heap buffer of `size` bytes, fills it through write_data, and queues it for
/// deferred destruction.
///
/// @param size        Size of the staging buffer in bytes.
/// @param write_data  Called with the mapped pointer of the staging buffer.
/// @return The staging buffer, or nullptr if creating or mapping it failed. The caller must not release it;
///         it is kept alive by the deferred-destruction queue.
ID3D12Resource* GSDevice12::AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data)
{
	// Describe a plain buffer of `size` bytes in an upload heap.
	const D3D12_RESOURCE_DESC buffer_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1,
		DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
	const D3D12MA::ALLOCATION_DESC alloc_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD};

	wil::com_ptr_nothrow<ID3D12Resource> staging;
	wil::com_ptr_nothrow<D3D12MA::Allocation> staging_alloc;
	HRESULT hr = GetAllocator()->CreateResource(&alloc_desc, &buffer_desc,
		D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, staging_alloc.put(), IID_PPV_ARGS(staging.put()));
	if (FAILED(hr))
	{
		Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr);
		return nullptr;
	}

	// Map with an empty read range: the buffer is only written from the CPU side.
	const D3D12_RANGE no_read = {};
	void* mapped = nullptr;
	hr = staging->Map(0, &no_read, &mapped);
	if (FAILED(hr))
	{
		Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr);
		return nullptr;
	}

	// Let the caller fill the buffer, then unmap, reporting the whole buffer as written.
	write_data(mapped);
	const D3D12_RANGE written = {0, size};
	staging->Unmap(0, &written);

	// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
	// This adds the reference needed to keep the buffer alive past the end of this function.
	DeferResourceDestruction(staging_alloc.get(), staging.get());
	return staging.get();
}
RenderAPI GSDevice12::GetRenderAPI() const
{
return RenderAPI::D3D12;
@ -2180,15 +2219,17 @@ void GSDevice12::IASetIndexBuffer(const void* index, size_t count)
m_index_stream_buffer.CommitMemory(size);
}
void GSDevice12::SetupAccuratePrims(GSHWDrawConfig& config)
void GSDevice12::SetupAccuratePrimsBuffer(GSHWDrawConfig& config)
{
if (config.accurate_prims)
{
// Unbind the buffer.
m_dirty_flags |= DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING;
const u32 count = config.accurate_prims_edge_data->size();
const u32 size = count * sizeof(AccuratePrimsEdgeData);
// Reserve the GPU region.
if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData)))
{
ExecuteCommandListAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer");
@ -2196,14 +2237,72 @@ void GSDevice12::SetupAccuratePrims(GSHWDrawConfig& config)
pxFailRel("Failed to reserve space for accurate prims");
}
const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset();
if (InRenderPass())
EndRenderPass();
// Copy data to an upload buffer.
ID3D12Resource* upload_buffer;
u32 upload_buffer_offset;
const auto upload_data = [&](void* map_ptr) {
std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size);
};
// If the data is larger than half the texture streaming buffer size, use a separate buffer.
// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
if (size > m_texture_stream_buffer.GetSize() / 2)
{
upload_buffer_offset = 0;
upload_buffer = AllocateUploadStagingBuffer(size, upload_data);
}
else
{
upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset);
}
if (!upload_buffer)
{
Console.Error("Failed to get upload buffer for accurate prims data.");
return;
}
// Copy data from upload to GPU buffer.
const D3D12_RESOURCE_BARRIER barrier_sr_to_dst = {
D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
D3D12_RESOURCE_BARRIER_FLAG_NONE,
{{m_accurate_prims_stream_buffer.GetBuffer(), 0,
D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE,
D3D12_RESOURCE_STATE_COPY_DEST}}};
GetCommandList()->ResourceBarrier(1, &barrier_sr_to_dst);
GetCommandList()->CopyBufferRegion(
m_accurate_prims_stream_buffer.GetBuffer(), offset, upload_buffer, upload_buffer_offset, size);
// Commit the GPU region.
m_accurate_prims_stream_buffer.CommitMemory(size);
// Issue the barrier since this will be used next draw.
const D3D12_RESOURCE_BARRIER barrier_dst_to_sr = {
D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
D3D12_RESOURCE_BARRIER_FLAG_NONE,
{{m_accurate_prims_stream_buffer.GetBuffer(), 0,
D3D12_RESOURCE_STATE_COPY_DEST,
D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}};
GetCommandList()->ResourceBarrier(1, &barrier_dst_to_sr);
m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer.
}
}
/// Writes the accurate-prims related constants for the current draw into the VS/PS constant buffers.
///
/// Does nothing unless config.accurate_prims is set. Must be called after the vertex offset (m_vertex.start)
/// has been set, and after SetupAccuratePrimsBuffer() has stored the ring buffer offset for this draw in
/// m_accurate_prims_stream_buffer_offset.
///
/// @param config  Draw configuration; cb_vs.base_vertex and cb_ps.accurate_prims_base_index.x are updated.
void GSDevice12::SetupAccuratePrimsConstants(GSHWDrawConfig& config)
{
	if (!config.accurate_prims)
		return;

	config.cb_vs.base_vertex = m_vertex.start;

	// Use the offset saved when the data was copied, not the stream buffer's current offset: the region has
	// already been committed, so the current offset no longer points at this draw's data.
	config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData);

	SetVSConstantBuffer(config.cb_vs);
	SetPSConstantBuffer(config.cb_ps);
}
@ -2394,7 +2493,8 @@ bool GSDevice12::CreateBuffers()
return false;
}
if (!m_accurate_prims_stream_buffer.Create(ACCURATE_PRIMS_BUFFER_SIZE))
if (!m_accurate_prims_stream_buffer.Create(
m_features.accurate_prims ? ACCURATE_PRIMS_BUFFER_SIZE : sizeof(AccuratePrimsEdgeData), true))
{
Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer");
return false;
@ -2406,8 +2506,17 @@ bool GSDevice12::CreateBuffers()
return false;
}
// Create the shader resource view for the accurate prims buffer.
if (m_features.accurate_prims)
{
// Transition the accurate prims buffer to the pixel shader resource state and create the shader resource view.
const D3D12_RESOURCE_BARRIER barrier = {
D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
D3D12_RESOURCE_BARRIER_FLAG_NONE,
{{m_accurate_prims_stream_buffer.GetBuffer(), 0,
D3D12_RESOURCE_STATE_COMMON,
D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}};
GetInitCommandList()->ResourceBarrier(1, &barrier);
D3D12_SHADER_RESOURCE_VIEW_DESC desc = {
DXGI_FORMAT_UNKNOWN, D3D12_SRV_DIMENSION_BUFFER, D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING};
desc.Buffer.FirstElement = 0;
@ -3940,6 +4049,9 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config)
PipelineSelector& pipe = m_pipeline_selector;
// Copying buffers needs to be done outside a render pass, so do this early.
SetupAccuratePrimsBuffer(config);
// figure out the pipeline
UpdateHWPipelineSelector(config);
@ -4321,5 +4433,6 @@ void GSDevice12::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config)
IASetIndexBuffer(config.indices, config.nindices);
}
SetupAccuratePrims(config);
// Needs to be done after vertex offset is set.
SetupAccuratePrimsConstants(config);
}

View File

@ -129,6 +129,8 @@ public:
// Allocates a temporary CPU staging buffer, fires the callback with it to populate, then copies to a GPU buffer.
bool AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer, D3D12MA::Allocation** gpu_allocation,
const std::function<void(void*)>& fill_callback);
ID3D12Resource* AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data);
ID3D12Resource* WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out);
private:
struct CommandListResources
@ -307,6 +309,7 @@ private:
D3D12StreamBuffer m_vertex_stream_buffer;
D3D12StreamBuffer m_index_stream_buffer;
D3D12StreamBuffer m_accurate_prims_stream_buffer;
u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw.
D3D12DescriptorHandle m_accurate_prims_srv_descriptor_cpu;
D3D12DescriptorHandle m_accurate_prims_srv_descriptor_gpu;
D3D12StreamBuffer m_vertex_constant_buffer;
@ -465,7 +468,8 @@ public:
void IASetVertexBuffer(const void* vertex, size_t stride, size_t count);
void IASetIndexBuffer(const void* index, size_t count);
void SetupAccuratePrims(GSHWDrawConfig& config);
void SetupAccuratePrimsBuffer(GSHWDrawConfig& config);
void SetupAccuratePrimsConstants(GSHWDrawConfig& config);
void PSSetShaderResource(int i, GSTexture* sr, bool check_state);
void PSSetSampler(GSHWDrawConfig::SamplerSelector sel);

View File

@ -350,43 +350,6 @@ ID3D12GraphicsCommandList* GSTexture12::GetCommandBufferForUpdate()
return dev->GetInitCommandList();
}
/// Allocates a one-off upload-heap buffer sized for `height` rows of `upload_pitch` bytes, copies the texture
/// data into it, and queues it for deferred destruction.
///
/// @param data          Source texture data.
/// @param pitch         Row pitch of the source data in bytes.
/// @param upload_pitch  Row pitch used in the staging buffer in bytes.
/// @param height        Height passed to CalcUploadSize() and CopyTextureDataForUpload().
/// @return The staging buffer, or nullptr if creating or mapping it failed. The caller must not release it;
///         it is kept alive by the deferred-destruction queue.
ID3D12Resource* GSTexture12::AllocateUploadStagingBuffer(
	const void* data, u32 pitch, u32 upload_pitch, u32 height) const
{
	const u32 buffer_size = CalcUploadSize(height, upload_pitch);
	wil::com_ptr_nothrow<ID3D12Resource> resource;
	wil::com_ptr_nothrow<D3D12MA::Allocation> allocation;

	const D3D12MA::ALLOCATION_DESC allocation_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD};
	const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, buffer_size, 1, 1, 1,
		DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE};
	HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocation_desc, &resource_desc,
		D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(resource.put()));
	if (FAILED(hr))
	{
		Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr);
		return nullptr;
	}

	// The CPU only writes to this buffer, so pass an empty read range instead of nullptr
	// (a null range tells the runtime the entire resource might be read by the CPU).
	static constexpr const D3D12_RANGE read_range = {};
	void* map_ptr = nullptr;
	hr = resource->Map(0, &read_range, &map_ptr);
	if (FAILED(hr))
	{
		Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr);
		return nullptr;
	}

	CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);

	// Report the whole buffer as written when unmapping.
	const D3D12_RANGE write_range = {0, buffer_size};
	resource->Unmap(0, &write_range);

	// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
	// This adds the reference needed to keep the buffer alive.
	GSDevice12::GetInstance()->DeferResourceDestruction(allocation.get(), resource.get());
	return resource.get();
}
void GSTexture12::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const
{
const u32 block_size = GetCompressedBlockSize();
@ -406,7 +369,7 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l
const u32 width = Common::AlignUpPow2(r.width(), block_size);
const u32 height = Common::AlignUpPow2(r.height(), block_size);
const u32 upload_pitch = Common::AlignUpPow2<u32>(pitch, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
const u32 required_size = CalcUploadSize(r.height(), upload_pitch);
const u32 required_size = CalcUploadSize(height, upload_pitch);
D3D12_TEXTURE_COPY_LOCATION srcloc;
srcloc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
@ -416,35 +379,25 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l
srcloc.PlacedFootprint.Footprint.Format = m_dxgi_format;
srcloc.PlacedFootprint.Footprint.RowPitch = upload_pitch;
const auto upload_data = [&](void* map_ptr) {
CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);
};
// If the texture is larger than half our streaming buffer size, use a separate buffer.
// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
if (required_size > (GSDevice12::GetInstance()->GetTextureStreamBuffer().GetSize() / 2))
{
srcloc.pResource = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height);
if (!srcloc.pResource)
return false;
srcloc.pResource = GSDevice12::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data);
srcloc.PlacedFootprint.Offset = 0;
}
else
{
D3D12StreamBuffer& sbuffer = GSDevice12::GetInstance()->GetTextureStreamBuffer();
if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
{
GSDevice12::GetInstance()->ExecuteCommandList(
false, "While waiting for %u bytes in texture upload buffer", required_size);
if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT))
{
Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size);
return false;
}
}
srcloc.pResource = sbuffer.GetBuffer();
srcloc.PlacedFootprint.Offset = sbuffer.GetCurrentOffset();
CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height);
sbuffer.CommitMemory(required_size);
u32 offset;
srcloc.pResource = GSDevice12::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, offset);
srcloc.PlacedFootprint.Offset = offset;
}
if (!srcloc.pResource)
return false;
ID3D12GraphicsCommandList* cmdlist = GetCommandBufferForUpdate();
GL_PUSH("GSTexture12::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer);

View File

@ -79,7 +79,6 @@ private:
static bool CreateUAVDescriptor(ID3D12Resource* resource, DXGI_FORMAT format, D3D12DescriptorHandle* dh);
ID3D12GraphicsCommandList* GetCommandBufferForUpdate();
ID3D12Resource* AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const;
void CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const;
wil::com_ptr_nothrow<ID3D12Resource> m_resource;

View File

@ -310,10 +310,10 @@ namespace
};
} // namespace
std::unique_ptr<GLStreamBuffer> GLStreamBuffer::Create(GLenum target, u32 size)
std::unique_ptr<GLStreamBuffer> GLStreamBuffer::Create(GLenum target, u32 size, bool nonsyncing)
{
std::unique_ptr<GLStreamBuffer> buf;
if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage)
if (!nonsyncing && (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage))
{
buf = BufferStorageStreamBuffer::Create(target, size);
if (buf)

View File

@ -38,7 +38,7 @@ public:
/// Returns the minimum granularity of blocks which sync objects will be created around.
virtual u32 GetChunkSize() const = 0;
static std::unique_ptr<GLStreamBuffer> Create(GLenum target, u32 size);
static std::unique_ptr<GLStreamBuffer> Create(GLenum target, u32 size, bool nonsyncing = false);
protected:
GLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);

View File

@ -260,11 +260,17 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle)
m_vertex_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE);
m_index_stream_buffer = GLStreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE);
m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE);
if (m_features.accurate_prims)
{
// Performance note: prefer a non-syncing buffer for accurate prims so that it is more likely to be GPU local.
// Rationale: we expect this buffer to be updated relatively rarely and it's used as a pixel shader resource.
m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE, true);
}
m_vertex_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, VERTEX_UNIFORM_BUFFER_SIZE);
m_fragment_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, FRAGMENT_UNIFORM_BUFFER_SIZE);
glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &m_uniform_buffer_alignment);
if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_accurate_prims_stream_buffer ||
if (!m_vertex_stream_buffer || !m_index_stream_buffer ||
(m_features.accurate_prims && !m_accurate_prims_stream_buffer) ||
!m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer)
{
Host::ReportErrorAsync("GS", "Failed to create vertex/index/uniform streaming buffers");

View File

@ -3406,13 +3406,14 @@ void GSDeviceVK::IASetIndexBuffer(const void* index, size_t count)
SetIndexBuffer(m_index_stream_buffer.GetBuffer());
}
void GSDeviceVK::SetupAccuratePrims(GSHWDrawConfig& config)
void GSDeviceVK::SetupAccuratePrimsBuffer(GSHWDrawConfig& config)
{
if (config.accurate_prims)
{
const u32 count = config.accurate_prims_edge_data->size();
const u32 size = count * sizeof(AccuratePrimsEdgeData);
// Reserve the GPU region.
if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData)))
{
ExecuteCommandBufferAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer");
@ -3420,17 +3421,120 @@ void GSDeviceVK::SetupAccuratePrims(GSHWDrawConfig& config)
pxFailRel("Failed to reserve space for accurate prims");
}
const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset();
if (InRenderPass())
EndRenderPass();
// Copy data to an upload buffer.
VkBuffer upload_buffer;
u32 upload_buffer_offset;
const auto upload_data = [&](void* map_ptr) {
std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size);
};
// If the data is larger than half the texture streaming buffer size, use a separate buffer.
// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
if (size > m_texture_stream_buffer.GetCurrentSize() / 2)
{
upload_buffer_offset = 0;
upload_buffer = AllocateUploadStagingBuffer(size, upload_data);
}
else
{
upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset);
}
if (upload_buffer == VK_NULL_HANDLE)
{
Console.Error("Failed to get upload buffer for accurate prims data.");
return;
}
// Copy data from upload to GPU buffer.
VkBufferCopy copyRegion = {upload_buffer_offset, offset, size};
vkCmdCopyBuffer(GetCurrentCommandBuffer(), upload_buffer, m_accurate_prims_stream_buffer.GetBuffer(), 1, &copyRegion);
// Commit the GPU region.
m_accurate_prims_stream_buffer.CommitMemory(size);
// Issue the barrier since this will be used next draw.
VkBufferMemoryBarrier barrier = {
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, nullptr,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT,
VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED,
m_accurate_prims_stream_buffer.GetBuffer(), offset, size};
vkCmdPipelineBarrier(GetCurrentCommandBuffer(),
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
0, 0, nullptr, 1, &barrier, 0, nullptr);
m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer.
}
}
/// Writes the accurate-prims related constants for the current draw into the VS/PS constant buffers.
///
/// Does nothing unless config.accurate_prims is set. Must be called after the vertex offset (m_vertex.start)
/// has been set, and after SetupAccuratePrimsBuffer() has stored the ring buffer offset for this draw in
/// m_accurate_prims_stream_buffer_offset.
///
/// @param config  Draw configuration; cb_vs.base_vertex and cb_ps.accurate_prims_base_index.x are updated.
void GSDeviceVK::SetupAccuratePrimsConstants(GSHWDrawConfig& config)
{
	if (!config.accurate_prims)
		return;

	// This is kept separate from SetupAccuratePrimsBuffer(): the buffer copy is issued early in RenderHW(),
	// while these constants can only be written once the vertex offset is known.
	config.cb_vs.base_vertex = m_vertex.start;

	// Use the offset saved when the data was copied, not the stream buffer's current offset: the region has
	// already been committed, so the current offset no longer points at this draw's data.
	config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData);

	SetVSConstantBuffer(config.cb_vs);
	SetPSConstantBuffer(config.cb_ps);
}
/// Reserves `size` bytes in the texture stream buffer, lets write_data fill them, and commits the region.
///
/// If the first reservation fails, the current command buffer is executed and the reservation is retried once.
///
/// @param size        Number of bytes to reserve and commit.
/// @param write_data  Called with the host pointer of the reserved region.
/// @param offset_out  Receives the offset of the reserved region within the returned buffer.
/// @return The texture stream buffer handle, or VK_NULL_HANDLE if space could not be reserved.
VkBuffer GSDeviceVK::WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out)
{
	auto& stream = m_texture_stream_buffer;
	const auto try_reserve = [this, &stream, size]() -> bool {
		return stream.ReserveMemory(size, GetBufferCopyOffsetAlignment());
	};

	bool reserved = try_reserve();
	if (!reserved)
	{
		// Out of space: execute the pending work, then try exactly once more.
		ExecuteCommandBuffer(
			false, "While waiting for %u bytes in texture upload buffer", size);
		reserved = try_reserve();
	}

	if (!reserved)
	{
		Console.Error("Failed to reserve texture upload memory (%u bytes).", size);
		return VK_NULL_HANDLE;
	}

	// The offset must be read before committing, since it refers to the start of the reserved region.
	offset_out = stream.GetCurrentOffset();
	write_data(stream.GetCurrentHostPointer());
	stream.CommitMemory(size);
	return stream.GetBuffer();
}
/// Allocates a one-off, persistently mapped transfer-source buffer of `size` bytes, fills it through
/// write_data, and queues it for deferred destruction.
///
/// @param size        Size of the staging buffer in bytes.
/// @param write_data  Called with the mapped pointer of the staging buffer.
/// @return The staging buffer handle, or VK_NULL_HANDLE if vmaCreateBuffer() failed. The caller must not
///         destroy it; it is already queued for deferred destruction.
VkBuffer GSDeviceVK::AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data)
{
	VkBufferCreateInfo buffer_info = {};
	buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
	buffer_info.pNext = nullptr;
	buffer_info.flags = 0;
	buffer_info.size = static_cast<VkDeviceSize>(size);
	buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
	buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
	buffer_info.queueFamilyIndexCount = 0;
	buffer_info.pQueueFamilyIndices = nullptr;

	// Don't worry about setting the coherent bit for this upload, the main reason we had
	// that set in StreamBuffer was for MoltenVK, which would upload the whole buffer on
	// smaller uploads, but we're writing to the whole thing anyway.
	VmaAllocationCreateInfo alloc_create_info = {};
	alloc_create_info.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
	alloc_create_info.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;

	VkBuffer staging;
	VmaAllocation staging_alloc;
	VmaAllocationInfo alloc_info;
	const VkResult res = vmaCreateBuffer(GSDeviceVK::GetInstance()->GetAllocator(), &buffer_info,
		&alloc_create_info, &staging, &staging_alloc, &alloc_info);
	if (res != VK_SUCCESS)
	{
		LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: ");
		return VK_NULL_HANDLE;
	}

	// Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy.
	GSDeviceVK::GetInstance()->DeferBufferDestruction(staging, staging_alloc);

	// Fill the persistently mapped memory, then flush the written range since coherency isn't requested.
	write_data(alloc_info.pMappedData);
	vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), staging_alloc, 0, size);
	return staging;
}
void GSDeviceVK::OMSetRenderTargets(
GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop)
{
@ -3762,7 +3866,8 @@ bool GSDeviceVK::CreateBuffers()
if (m_features.accurate_prims)
{
if (!m_accurate_prims_stream_buffer.Create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ACCURATE_PRIMS_BUFFER_SIZE))
if (!m_accurate_prims_stream_buffer.Create(
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, ACCURATE_PRIMS_BUFFER_SIZE, true))
{
Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer");
return false;
@ -5673,13 +5778,15 @@ GSTextureVK* GSDeviceVK::SetupPrimitiveTrackingDATE(GSHWDrawConfig& config)
void GSDeviceVK::RenderHW(GSHWDrawConfig& config)
{
const GSVector2i rtsize(config.rt ? config.rt->GetSize() : config.ds->GetSize());
GSTextureVK* draw_rt = static_cast<GSTextureVK*>(config.rt);
GSTextureVK* draw_ds = static_cast<GSTextureVK*>(config.ds);
GSTextureVK* draw_rt_clone = nullptr;
GSTextureVK* colclip_rt = static_cast<GSTextureVK*>(g_gs_device->GetColorClipTexture());
// Copying buffers needs to be done outside a render pass, so do this early.
SetupAccuratePrimsBuffer(config);
// stream buffer in first, in case we need to exec
SetVSConstantBuffer(config.cb_vs);
SetPSConstantBuffer(config.cb_ps);
@ -6157,7 +6264,8 @@ void GSDeviceVK::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config)
IASetIndexBuffer(config.indices, config.nindices);
}
SetupAccuratePrims(config);
// Needs to be done after vertex offset is set.
SetupAccuratePrimsConstants(config);
}
VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const

View File

@ -98,6 +98,8 @@ public:
__fi VkCommandBuffer GetCurrentCommandBuffer() const { return m_current_command_buffer; }
__fi VKStreamBuffer& GetTextureUploadBuffer() { return m_texture_stream_buffer; }
VkCommandBuffer GetCurrentInitCommandBuffer();
VkBuffer AllocateUploadStagingBuffer(u32 size, std::function<void(void*)> write_data);
VkBuffer WriteTextureUploadBuffer(u32 size, std::function<void(void*)> write_data, u32& offset_out);
/// Allocates a descriptor set from the pool reserved for the current frame.
VkDescriptorSet AllocatePersistentDescriptorSet(VkDescriptorSetLayout set_layout);
@ -381,6 +383,7 @@ private:
VKStreamBuffer m_vertex_stream_buffer;
VKStreamBuffer m_index_stream_buffer;
VKStreamBuffer m_accurate_prims_stream_buffer;
u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw.
VKStreamBuffer m_vertex_uniform_stream_buffer;
VKStreamBuffer m_fragment_uniform_stream_buffer;
VKStreamBuffer m_texture_stream_buffer;
@ -563,7 +566,8 @@ public:
void PSSetShaderResource(int i, GSTexture* sr, bool check_state);
void PSSetSampler(GSHWDrawConfig::SamplerSelector sel);
void SetupAccuratePrims(GSHWDrawConfig& config);
void SetupAccuratePrimsBuffer(GSHWDrawConfig& config);
void SetupAccuratePrimsConstants(GSHWDrawConfig& config);
void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor,
FeedbackLoopFlag feedback_loop = FeedbackLoopFlag_None);

View File

@ -270,38 +270,6 @@ void GSTextureVK::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch
StringUtil::StrideMemCpy(dst, upload_pitch, src, pitch, std::min(upload_pitch, pitch), count);
}
/// Creates a dedicated, persistently-mapped transfer-source buffer holding `height` rows of `data`
/// repacked from `pitch` to `upload_pitch`, and returns it.
/// The buffer is queued for deferred destruction right away, so the caller does not own it.
/// Returns VK_NULL_HANDLE when the allocation fails.
VkBuffer GSTextureVK::AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const
{
	const u32 size = upload_pitch * height;

	const VkBufferCreateInfo buffer_info = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0,
		static_cast<VkDeviceSize>(size), VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr};

	// The coherent bit is intentionally not requested here. StreamBuffer mainly sets it for MoltenVK,
	// which would upload the whole buffer on smaller uploads; this buffer is written in full anyway.
	VmaAllocationCreateInfo alloc_create_info = {};
	alloc_create_info.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
	alloc_create_info.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;

	VmaAllocationInfo alloc_info;
	VkBuffer staging_buffer;
	VmaAllocation staging_allocation;
	const VkResult res = vmaCreateBuffer(GSDeviceVK::GetInstance()->GetAllocator(), &buffer_info,
		&alloc_create_info, &staging_buffer, &staging_allocation, &alloc_info);
	if (res == VK_SUCCESS)
	{
		// Only the copy needs this buffer, so schedule its release for when the command buffer finishes.
		GSDeviceVK::GetInstance()->DeferBufferDestruction(staging_buffer, staging_allocation);

		// Fill the mapping, then flush it in case the memory is not host-coherent.
		CopyTextureDataForUpload(alloc_info.pMappedData, data, pitch, upload_pitch, height);
		vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), staging_allocation, 0, size);
		return staging_buffer;
	}

	LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: ");
	return VK_NULL_HANDLE;
}
void GSTextureVK::UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height,
u32 buffer_height, u32 row_length, VkBuffer buffer, u32 buffer_offset)
{
@ -333,6 +301,10 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l
const u32 upload_pitch = Common::AlignUpPow2(pitch, GSDeviceVK::GetInstance()->GetBufferCopyRowPitchAlignment());
const u32 required_size = CalcUploadSize(height, upload_pitch);
const auto upload_data = [&](void* map_ptr) {
CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height);
};
// If the texture is larger than half our streaming buffer size, use a separate buffer.
// Otherwise allocation will either fail, or require lots of cmdbuffer submissions.
VkBuffer buffer;
@ -340,29 +312,14 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l
if (required_size > (GSDeviceVK::GetInstance()->GetTextureUploadBuffer().GetCurrentSize() / 2))
{
buffer_offset = 0;
buffer = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height);
if (buffer == VK_NULL_HANDLE)
return false;
buffer = GSDeviceVK::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data);
}
else
{
VKStreamBuffer& sbuffer = GSDeviceVK::GetInstance()->GetTextureUploadBuffer();
if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment()))
{
GSDeviceVK::GetInstance()->ExecuteCommandBuffer(
false, "While waiting for %u bytes in texture upload buffer", required_size);
if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment()))
{
Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size);
return false;
}
}
buffer = sbuffer.GetBuffer();
buffer_offset = sbuffer.GetCurrentOffset();
CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height);
sbuffer.CommitMemory(required_size);
buffer = GSDeviceVK::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, buffer_offset);
}
if (buffer == VK_NULL_HANDLE)
return false;
const VkCommandBuffer cmdbuf = GetCommandBufferForUpdate();
GL_PUSH("GSTextureVK::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer);

View File

@ -84,7 +84,6 @@ private:
VkCommandBuffer GetCommandBufferForUpdate();
void CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const;
VkBuffer AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const;
void UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height, u32 buffer_height,
u32 row_length, VkBuffer buffer, u32 buffer_offset);

View File

@ -19,6 +19,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move)
, m_allocation(move.m_allocation)
, m_buffer(move.m_buffer)
, m_host_pointer(move.m_host_pointer)
, m_device_local(move.m_device_local)
, m_tracked_fences(std::move(move.m_tracked_fences))
{
move.m_size = 0;
@ -28,6 +29,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move)
move.m_allocation = VK_NULL_HANDLE;
move.m_buffer = VK_NULL_HANDLE;
move.m_host_pointer = nullptr;
move.m_device_local = false;
}
VKStreamBuffer::~VKStreamBuffer()
@ -48,19 +50,29 @@ VKStreamBuffer& VKStreamBuffer::operator=(VKStreamBuffer&& move)
std::swap(m_buffer, move.m_buffer);
std::swap(m_host_pointer, move.m_host_pointer);
std::swap(m_tracked_fences, move.m_tracked_fences);
std::swap(m_device_local, move.m_device_local);
return *this;
}
bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size)
bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size, bool device_local)
{
const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast<VkDeviceSize>(size),
usage, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr};
VmaAllocationCreateInfo aci = {};
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
if (device_local)
{
// GPU default buffer
aci.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
}
else
{
// CPU upload buffer
aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
aci.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
}
VmaAllocationInfo ai = {};
VkBuffer new_buffer = VK_NULL_HANDLE;
@ -83,7 +95,8 @@ bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size)
m_tracked_fences.clear();
m_allocation = new_allocation;
m_buffer = new_buffer;
m_host_pointer = static_cast<u8*>(ai.pMappedData);
m_host_pointer = device_local ? nullptr : static_cast<u8*>(ai.pMappedData);
m_device_local = device_local;
return true;
}
@ -104,6 +117,7 @@ void VKStreamBuffer::Destroy(bool defer)
m_buffer = VK_NULL_HANDLE;
m_allocation = VK_NULL_HANDLE;
m_host_pointer = nullptr;
m_device_local = false;
}
bool VKStreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment)
@ -180,8 +194,11 @@ void VKStreamBuffer::CommitMemory(u32 final_num_bytes)
pxAssert((m_current_offset + final_num_bytes) <= m_size);
pxAssert(final_num_bytes <= m_current_space);
// For non-coherent mappings, flush the memory range
vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), m_allocation, m_current_offset, final_num_bytes);
if (!m_device_local)
{
// For non-coherent mappings, flush the memory range
vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), m_allocation, m_current_offset, final_num_bytes);
}
m_current_offset += final_num_bytes;
m_current_space -= final_num_bytes;

View File

@ -30,14 +30,13 @@ public:
__fi u32 GetCurrentSpace() const { return m_current_space; }
__fi u32 GetCurrentOffset() const { return m_current_offset; }
bool Create(VkBufferUsageFlags usage, u32 size);
bool Create(VkBufferUsageFlags usage, u32 size, bool device_local = false);
void Destroy(bool defer);
bool ReserveMemory(u32 num_bytes, u32 alignment);
void CommitMemory(u32 final_num_bytes);
private:
bool AllocateBuffer(VkBufferUsageFlags usage, u32 size);
void UpdateCurrentFencePosition();
void UpdateGPUPosition();
@ -51,7 +50,8 @@ private:
VmaAllocation m_allocation = VK_NULL_HANDLE;
VkBuffer m_buffer = VK_NULL_HANDLE;
u8* m_host_pointer = nullptr;
u8* m_host_pointer = nullptr; // Only used for upload buffers.
bool m_device_local = false; // False for upload buffer; true for default buffer.
// List of fences and the corresponding positions in the buffer
std::deque<std::pair<u64, u32>> m_tracked_fences;