diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp index d95e22a249..56bb0a06c6 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp @@ -396,8 +396,12 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) } } + bd = {}; + if (m_features.accurate_prims) { + bd.Usage = D3D11_USAGE_DEFAULT; + bd.CPUAccessFlags = 0; bd.ByteWidth = ACCURATE_PRIMS_BUFFER_SIZE; bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; bd.StructureByteStride = sizeof(AccuratePrimsEdgeData); @@ -410,8 +414,11 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) } const CD3D11_SHADER_RESOURCE_VIEW_DESC accurate_prims_b_srv_desc( - D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0, ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData)); - if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc, m_accurate_prims_b_srv.put()))) + D3D11_SRV_DIMENSION_BUFFER, DXGI_FORMAT_UNKNOWN, 0, + ACCURATE_PRIMS_BUFFER_SIZE / sizeof(AccuratePrimsEdgeData)); + + if (FAILED(m_dev->CreateShaderResourceView(m_accurate_prims_b.get(), &accurate_prims_b_srv_desc, + m_accurate_prims_b_srv.put()))) { Console.Error("D3D11: Failed to create accurate prims buffer SRV."); return false; @@ -419,7 +426,7 @@ bool GSDevice11::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) // If MAX_TEXTURES changes, please change the register for this buffer in the shader. 
static_assert(MAX_TEXTURES == 5); - m_ctx->PSSetShaderResources(MAX_TEXTURES, 1, m_accurate_prims_b_srv.addressof()); + m_ctx->PSSetShaderResources(5, 1, m_accurate_prims_b_srv.addressof()); } // rasterizer @@ -2326,29 +2333,18 @@ bool GSDevice11::SetupAccuratePrims(GSHWDrawConfig& config) if (size > ACCURATE_PRIMS_BUFFER_SIZE) return false; - D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE; + // Performance note: UpdateSubresource() copies data to a temp staging buffer to avoid stalling the GPU, + // so a manual ring buffer is not needed here like VK/DX12. + D3D11_BOX dst_region{}; + dst_region.left = 0; + dst_region.right = size; + dst_region.top = 0; + dst_region.bottom = 1; + dst_region.front = 0; + dst_region.back = 1; + m_ctx->UpdateSubresource(m_accurate_prims_b.get(), 0, &dst_region, config.accurate_prims_edge_data->data(), size, 0); - pxAssert(m_accurate_prims_b_pos % sizeof(AccuratePrimsEdgeData) == 0); - - if (m_accurate_prims_b_pos + size > ACCURATE_PRIMS_BUFFER_SIZE) - { - m_accurate_prims_b_pos = 0; - type = D3D11_MAP_WRITE_DISCARD; - } - - D3D11_MAPPED_SUBRESOURCE m; - if (FAILED(m_ctx->Map(m_accurate_prims_b.get(), 0, type, 0, &m))) - return false; - - void* map = static_cast(m.pData) + m_accurate_prims_b_pos; - - GSVector4i::storent(map, config.accurate_prims_edge_data->data(), size); - - m_ctx->Unmap(m_accurate_prims_b.get(), 0); - - config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_b_pos / sizeof(AccuratePrimsEdgeData); - - m_accurate_prims_b_pos += size; + config.cb_ps.accurate_prims_base_index.x = 0; // No offsetting needed like DX12/VK since we don't use a ring buffer. 
} return true; } diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.h b/pcsx2/GS/Renderers/DX11/GSDevice11.h index b790e3eed9..a90296da05 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.h +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.h @@ -137,7 +137,6 @@ private: u32 m_vb_pos = 0; // bytes u32 m_ib_pos = 0; // indices/sizeof(u32) u32 m_structured_vb_pos = 0; // bytes - u32 m_accurate_prims_b_pos = 0; // bytes/sizeof(AccuratePrimsEdgeData) bool m_allow_tearing_supported = false; bool m_using_flip_model_swap_chain = true; diff --git a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp index 7219dc48f1..da87429193 100644 --- a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp +++ b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.cpp @@ -20,29 +20,33 @@ D3D12StreamBuffer::~D3D12StreamBuffer() Destroy(); } -bool D3D12StreamBuffer::Create(u32 size) +bool D3D12StreamBuffer::Create(u32 size, bool default_heap) { const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE}; D3D12MA::ALLOCATION_DESC allocationDesc = {}; allocationDesc.Flags = D3D12MA::ALLOCATION_FLAG_COMMITTED; - allocationDesc.HeapType = D3D12_HEAP_TYPE_UPLOAD; + allocationDesc.HeapType = default_heap ? D3D12_HEAP_TYPE_DEFAULT : D3D12_HEAP_TYPE_UPLOAD; wil::com_ptr_nothrow buffer; wil::com_ptr_nothrow allocation; HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocationDesc, &resource_desc, - D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(buffer.put())); + default_heap ? 
D3D12_RESOURCE_STATE_COMMON : D3D12_RESOURCE_STATE_GENERIC_READ, + nullptr, allocation.put(), IID_PPV_ARGS(buffer.put())); pxAssertMsg(SUCCEEDED(hr), "Allocate buffer"); if (FAILED(hr)) return false; static const D3D12_RANGE read_range = {}; - u8* host_pointer; - hr = buffer->Map(0, &read_range, reinterpret_cast(&host_pointer)); - pxAssertMsg(SUCCEEDED(hr), "Map buffer"); - if (FAILED(hr)) - return false; + u8* host_pointer = nullptr; + if (!default_heap) + { + hr = buffer->Map(0, &read_range, reinterpret_cast(&host_pointer)); + pxAssertMsg(SUCCEEDED(hr), "Map buffer"); + if (FAILED(hr)) + return false; + } Destroy(true); @@ -51,6 +55,7 @@ bool D3D12StreamBuffer::Create(u32 size) m_host_pointer = host_pointer; m_size = size; m_gpu_pointer = m_buffer->GetGPUVirtualAddress(); + m_default_heap = default_heap; return true; } @@ -148,6 +153,7 @@ void D3D12StreamBuffer::Destroy(bool defer) m_current_offset = 0; m_current_space = 0; m_current_gpu_position = 0; + m_default_heap = false; m_tracked_fences.clear(); } diff --git a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h index da1655f4bd..f5164db4f3 100644 --- a/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h +++ b/pcsx2/GS/Renderers/DX12/D3D12StreamBuffer.h @@ -22,7 +22,7 @@ public: D3D12StreamBuffer(); ~D3D12StreamBuffer(); - bool Create(u32 size); + bool Create(u32 size, bool default_heap = false); __fi bool IsValid() const { return static_cast(m_buffer); } __fi ID3D12Resource* GetBuffer() const { return m_buffer.get(); } @@ -54,7 +54,8 @@ private: wil::com_ptr_nothrow m_buffer; wil::com_ptr_nothrow m_allocation; D3D12_GPU_VIRTUAL_ADDRESS m_gpu_pointer = {}; - u8* m_host_pointer = nullptr; + u8* m_host_pointer = nullptr; // Only used for upload heaps. + bool m_default_heap = false; // False for upload heap; true for default heap. 
// List of fences and the corresponding positions in the buffer std::deque> m_tracked_fences; diff --git a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp index 4f6c991f91..1416e8369d 100644 --- a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp +++ b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp @@ -624,52 +624,91 @@ bool GSDevice12::SetGPUTimingEnabled(bool enabled) bool GSDevice12::AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer, D3D12MA::Allocation** gpu_allocation, const std::function& fill_callback) { - // Try to place the fixed index buffer in GPU local memory. - // Use the staging buffer to copy into it. + // Allocate and fill staging buffer + ID3D12Resource* cpu_buffer = AllocateUploadStagingBuffer(size, fill_callback); + + // Create GPU buffer const D3D12_RESOURCE_DESC rd = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE}; - - const D3D12MA::ALLOCATION_DESC cpu_ad = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD}; - - ComPtr cpu_buffer; - ComPtr cpu_allocation; - HRESULT hr = m_allocator->CreateResource( - &cpu_ad, &rd, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, cpu_allocation.put(), IID_PPV_ARGS(cpu_buffer.put())); - pxAssertMsg(SUCCEEDED(hr), "Allocate CPU buffer"); - if (FAILED(hr)) - return false; - - static constexpr const D3D12_RANGE read_range = {}; - const D3D12_RANGE write_range = {0, size}; - void* mapped; - hr = cpu_buffer->Map(0, &read_range, &mapped); - pxAssertMsg(SUCCEEDED(hr), "Map CPU buffer"); - if (FAILED(hr)) - return false; - fill_callback(mapped); - cpu_buffer->Unmap(0, &write_range); - const D3D12MA::ALLOCATION_DESC gpu_ad = {D3D12MA::ALLOCATION_FLAG_COMMITTED, D3D12_HEAP_TYPE_DEFAULT}; - - hr = m_allocator->CreateResource( + HRESULT hr = m_allocator->CreateResource( &gpu_ad, &rd, D3D12_RESOURCE_STATE_COMMON, nullptr, gpu_allocation, IID_PPV_ARGS(gpu_buffer)); pxAssertMsg(SUCCEEDED(hr), "Allocate 
GPU buffer"); if (FAILED(hr)) return false; - GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer.get(), 0, size); + // Copy the data + GetInitCommandList()->CopyBufferRegion(*gpu_buffer, 0, cpu_buffer, 0, size); + // Transition GPU buffer from COPY_DEST to INDEX_BUFFER D3D12_RESOURCE_BARRIER rb = {D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, D3D12_RESOURCE_BARRIER_FLAG_NONE}; rb.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; rb.Transition.pResource = *gpu_buffer; rb.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST; // COMMON -> COPY_DEST at first use. rb.Transition.StateAfter = D3D12_RESOURCE_STATE_INDEX_BUFFER; GetInitCommandList()->ResourceBarrier(1, &rb); - - DeferResourceDestruction(cpu_allocation.get(), cpu_buffer.get()); return true; } +ID3D12Resource* GSDevice12::WriteTextureUploadBuffer(u32 size, std::function write_data, u32& offset_out) +{ + if (!m_texture_stream_buffer.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)) + { + GSDevice12::GetInstance()->ExecuteCommandList( + false, "While waiting for %u bytes in texture upload buffer", size); + if (!m_texture_stream_buffer.ReserveMemory(size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)) + { + Console.Error("Failed to reserve texture upload memory (%u bytes).", size); + return nullptr; + } + } + + offset_out = m_texture_stream_buffer.GetCurrentOffset(); + write_data(m_texture_stream_buffer.GetCurrentHostPointer()); + m_texture_stream_buffer.CommitMemory(size); + return m_texture_stream_buffer.GetBuffer(); +} + +ID3D12Resource* GSDevice12::AllocateUploadStagingBuffer(u32 size, std::function write_data) +{ + wil::com_ptr_nothrow resource; + wil::com_ptr_nothrow allocation; + + // Allocate staging buffer + const D3D12MA::ALLOCATION_DESC allocation_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD}; + const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, 0, size, 1, 1, 1, + DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, 
D3D12_RESOURCE_FLAG_NONE}; + HRESULT hr = GetAllocator()->CreateResource(&allocation_desc, &resource_desc, + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(resource.put())); + if (FAILED(hr)) + { + Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr); + return nullptr; + } + + // Map + static constexpr const D3D12_RANGE read_range = {}; + void* map_ptr; + hr = resource->Map(0, &read_range, &map_ptr); + if (FAILED(hr)) + { + Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr); + return nullptr; + } + + // Write data + write_data(map_ptr); + + // Unmap + const D3D12_RANGE write_range = {0, size}; + resource->Unmap(0, &write_range); + + // Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy. + // This adds the reference needed to keep the buffer alive. + DeferResourceDestruction(allocation.get(), resource.get()); + return resource.get(); +} + RenderAPI GSDevice12::GetRenderAPI() const { return RenderAPI::D3D12; @@ -2180,15 +2219,17 @@ void GSDevice12::IASetIndexBuffer(const void* index, size_t count) m_index_stream_buffer.CommitMemory(size); } -void GSDevice12::SetupAccuratePrims(GSHWDrawConfig& config) +void GSDevice12::SetupAccuratePrimsBuffer(GSHWDrawConfig& config) { if (config.accurate_prims) { + // Unbind the buffer. m_dirty_flags |= DIRTY_FLAG_PS_ACCURATE_PRIMS_BUFFER_BINDING; const u32 count = config.accurate_prims_edge_data->size(); const u32 size = count * sizeof(AccuratePrimsEdgeData); + // Reserve the GPU region. 
if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData))) { ExecuteCommandListAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer"); @@ -2196,14 +2237,72 @@ void GSDevice12::SetupAccuratePrims(GSHWDrawConfig& config) pxFailRel("Failed to reserve space for accurate prims"); } + const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset(); + + if (InRenderPass()) + EndRenderPass(); + + // Copy data to an upload buffer. + ID3D12Resource* upload_buffer; + u32 upload_buffer_offset; + + const auto upload_data = [&](void* map_ptr) { + std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size); + }; + + // If the texture is larger than half our streaming buffer size, use a separate buffer. + // Otherwise allocation will either fail, or require lots of cmdbuffer submissions. + if (size > m_texture_stream_buffer.GetSize() / 2) + { + upload_buffer_offset = 0; + upload_buffer = AllocateUploadStagingBuffer(size, upload_data); + } + else + { + upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset); + } + if (!upload_buffer) + { + Console.Error("Failed to get upload buffer for accurate prims data."); + return; + } + + // Copy data from upload to GPU buffer. + const D3D12_RESOURCE_BARRIER barrier_sr_to_dst = { + D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, + D3D12_RESOURCE_BARRIER_FLAG_NONE, + {{m_accurate_prims_stream_buffer.GetBuffer(), 0, + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, + D3D12_RESOURCE_STATE_COPY_DEST}}}; + GetCommandList()->ResourceBarrier(1, &barrier_sr_to_dst); + GetCommandList()->CopyBufferRegion( + m_accurate_prims_stream_buffer.GetBuffer(), offset, upload_buffer, upload_buffer_offset, size); + + // Commit the GPU region. + m_accurate_prims_stream_buffer.CommitMemory(size); + + // Issue the barrier since this will be used next draw. 
+ const D3D12_RESOURCE_BARRIER barrier_dst_to_sr = { + D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, + D3D12_RESOURCE_BARRIER_FLAG_NONE, + {{m_accurate_prims_stream_buffer.GetBuffer(), 0, + D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}}; + GetCommandList()->ResourceBarrier(1, &barrier_dst_to_sr); + + m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer. + } +} + +void GSDevice12::SetupAccuratePrimsConstants(GSHWDrawConfig& config) +{ + if (config.accurate_prims) + { config.cb_vs.base_vertex = m_vertex.start; - config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer.GetCurrentOffset() / sizeof(AccuratePrimsEdgeData); + config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData); SetVSConstantBuffer(config.cb_vs); SetPSConstantBuffer(config.cb_ps); - - std::memcpy(m_accurate_prims_stream_buffer.GetCurrentHostPointer(), config.accurate_prims_edge_data->data(), size); - m_accurate_prims_stream_buffer.CommitMemory(size); } } @@ -2394,7 +2493,8 @@ bool GSDevice12::CreateBuffers() return false; } - if (!m_accurate_prims_stream_buffer.Create(ACCURATE_PRIMS_BUFFER_SIZE)) + if (!m_accurate_prims_stream_buffer.Create( + m_features.accurate_prims ? ACCURATE_PRIMS_BUFFER_SIZE : sizeof(AccuratePrimsEdgeData), true)) { Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer"); return false; @@ -2406,8 +2506,17 @@ bool GSDevice12::CreateBuffers() return false; } - // Create the shader resource view for the accurate prims buffer. + if (m_features.accurate_prims) { + // Transition to accurate prims buffer to pixel shader resource and create the shader resource view. 
+ const D3D12_RESOURCE_BARRIER barrier = { + D3D12_RESOURCE_BARRIER_TYPE_TRANSITION, + D3D12_RESOURCE_BARRIER_FLAG_NONE, + {{m_accurate_prims_stream_buffer.GetBuffer(), 0, + D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE}}}; + GetInitCommandList()->ResourceBarrier(1, &barrier); + D3D12_SHADER_RESOURCE_VIEW_DESC desc = { DXGI_FORMAT_UNKNOWN, D3D12_SRV_DIMENSION_BUFFER, D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING}; desc.Buffer.FirstElement = 0; @@ -3940,6 +4049,9 @@ void GSDevice12::RenderHW(GSHWDrawConfig& config) PipelineSelector& pipe = m_pipeline_selector; + // Copying buffers needs to be done outside render pass so do this early. + SetupAccuratePrimsBuffer(config); + // figure out the pipeline UpdateHWPipelineSelector(config); @@ -4321,5 +4433,6 @@ void GSDevice12::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config) IASetIndexBuffer(config.indices, config.nindices); } - SetupAccuratePrims(config); + // Needs to be done after vertex offset is set. + SetupAccuratePrimsConstants(config); } diff --git a/pcsx2/GS/Renderers/DX12/GSDevice12.h b/pcsx2/GS/Renderers/DX12/GSDevice12.h index 7b452a9f4f..f479ae3b12 100644 --- a/pcsx2/GS/Renderers/DX12/GSDevice12.h +++ b/pcsx2/GS/Renderers/DX12/GSDevice12.h @@ -129,6 +129,8 @@ public: // Allocates a temporary CPU staging buffer, fires the callback with it to populate, then copies to a GPU buffer. 
bool AllocatePreinitializedGPUBuffer(u32 size, ID3D12Resource** gpu_buffer, D3D12MA::Allocation** gpu_allocation, const std::function& fill_callback); + ID3D12Resource* AllocateUploadStagingBuffer(u32 size, std::function write_data); + ID3D12Resource* WriteTextureUploadBuffer(u32 size, std::function write_data, u32& offset_out); private: struct CommandListResources @@ -307,6 +309,7 @@ private: D3D12StreamBuffer m_vertex_stream_buffer; D3D12StreamBuffer m_index_stream_buffer; D3D12StreamBuffer m_accurate_prims_stream_buffer; + u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw. D3D12DescriptorHandle m_accurate_prims_srv_descriptor_cpu; D3D12DescriptorHandle m_accurate_prims_srv_descriptor_gpu; D3D12StreamBuffer m_vertex_constant_buffer; @@ -465,7 +468,8 @@ public: void IASetVertexBuffer(const void* vertex, size_t stride, size_t count); void IASetIndexBuffer(const void* index, size_t count); - void SetupAccuratePrims(GSHWDrawConfig& config); + void SetupAccuratePrimsBuffer(GSHWDrawConfig& config); + void SetupAccuratePrimsConstants(GSHWDrawConfig& config); void PSSetShaderResource(int i, GSTexture* sr, bool check_state); void PSSetSampler(GSHWDrawConfig::SamplerSelector sel); diff --git a/pcsx2/GS/Renderers/DX12/GSTexture12.cpp b/pcsx2/GS/Renderers/DX12/GSTexture12.cpp index e21f749254..348b5a17c4 100644 --- a/pcsx2/GS/Renderers/DX12/GSTexture12.cpp +++ b/pcsx2/GS/Renderers/DX12/GSTexture12.cpp @@ -350,43 +350,6 @@ ID3D12GraphicsCommandList* GSTexture12::GetCommandBufferForUpdate() return dev->GetInitCommandList(); } -ID3D12Resource* GSTexture12::AllocateUploadStagingBuffer( - const void* data, u32 pitch, u32 upload_pitch, u32 height) const -{ - const u32 buffer_size = CalcUploadSize(height, upload_pitch); - wil::com_ptr_nothrow resource; - wil::com_ptr_nothrow allocation; - - const D3D12MA::ALLOCATION_DESC allocation_desc = {D3D12MA::ALLOCATION_FLAG_NONE, D3D12_HEAP_TYPE_UPLOAD}; - const D3D12_RESOURCE_DESC resource_desc = 
{D3D12_RESOURCE_DIMENSION_BUFFER, 0, buffer_size, 1, 1, 1, - DXGI_FORMAT_UNKNOWN, {1, 0}, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, D3D12_RESOURCE_FLAG_NONE}; - HRESULT hr = GSDevice12::GetInstance()->GetAllocator()->CreateResource(&allocation_desc, &resource_desc, - D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, allocation.put(), IID_PPV_ARGS(resource.put())); - if (FAILED(hr)) - { - Console.WriteLn("(AllocateUploadStagingBuffer) CreateCommittedResource() failed with %08X", hr); - return nullptr; - } - - void* map_ptr; - hr = resource->Map(0, nullptr, &map_ptr); - if (FAILED(hr)) - { - Console.WriteLn("(AllocateUploadStagingBuffer) Map() failed with %08X", hr); - return nullptr; - } - - CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height); - - const D3D12_RANGE write_range = {0, buffer_size}; - resource->Unmap(0, &write_range); - - // Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy. - // This adds the reference needed to keep the buffer alive. 
- GSDevice12::GetInstance()->DeferResourceDestruction(allocation.get(), resource.get()); - return resource.get(); -} - void GSTexture12::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const { const u32 block_size = GetCompressedBlockSize(); @@ -406,7 +369,7 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l const u32 width = Common::AlignUpPow2(r.width(), block_size); const u32 height = Common::AlignUpPow2(r.height(), block_size); const u32 upload_pitch = Common::AlignUpPow2(pitch, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT); - const u32 required_size = CalcUploadSize(r.height(), upload_pitch); + const u32 required_size = CalcUploadSize(height, upload_pitch); D3D12_TEXTURE_COPY_LOCATION srcloc; srcloc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; @@ -416,35 +379,25 @@ bool GSTexture12::Update(const GSVector4i& r, const void* data, int pitch, int l srcloc.PlacedFootprint.Footprint.Format = m_dxgi_format; srcloc.PlacedFootprint.Footprint.RowPitch = upload_pitch; + const auto upload_data = [&](void* map_ptr) { + CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height); + }; + // If the texture is larger than half our streaming buffer size, use a separate buffer. // Otherwise allocation will either fail, or require lots of cmdbuffer submissions. 
if (required_size > (GSDevice12::GetInstance()->GetTextureStreamBuffer().GetSize() / 2)) { - srcloc.pResource = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height); - if (!srcloc.pResource) - return false; - + srcloc.pResource = GSDevice12::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data); srcloc.PlacedFootprint.Offset = 0; } else { - D3D12StreamBuffer& sbuffer = GSDevice12::GetInstance()->GetTextureStreamBuffer(); - if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)) - { - GSDevice12::GetInstance()->ExecuteCommandList( - false, "While waiting for %u bytes in texture upload buffer", required_size); - if (!sbuffer.ReserveMemory(required_size, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)) - { - Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size); - return false; - } - } - - srcloc.pResource = sbuffer.GetBuffer(); - srcloc.PlacedFootprint.Offset = sbuffer.GetCurrentOffset(); - CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height); - sbuffer.CommitMemory(required_size); + u32 offset; + srcloc.pResource = GSDevice12::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, offset); + srcloc.PlacedFootprint.Offset = offset; } + if (!srcloc.pResource) + return false; ID3D12GraphicsCommandList* cmdlist = GetCommandBufferForUpdate(); GL_PUSH("GSTexture12::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer); diff --git a/pcsx2/GS/Renderers/DX12/GSTexture12.h b/pcsx2/GS/Renderers/DX12/GSTexture12.h index 49c82d034f..ced6fa545f 100644 --- a/pcsx2/GS/Renderers/DX12/GSTexture12.h +++ b/pcsx2/GS/Renderers/DX12/GSTexture12.h @@ -79,7 +79,6 @@ private: static bool CreateUAVDescriptor(ID3D12Resource* resource, DXGI_FORMAT format, D3D12DescriptorHandle* dh); ID3D12GraphicsCommandList* GetCommandBufferForUpdate(); - ID3D12Resource* AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const; void 
CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const; wil::com_ptr_nothrow m_resource; diff --git a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp index 0271500956..0b7086cb4f 100644 --- a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp +++ b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.cpp @@ -310,10 +310,10 @@ namespace }; } // namespace -std::unique_ptr GLStreamBuffer::Create(GLenum target, u32 size) +std::unique_ptr GLStreamBuffer::Create(GLenum target, u32 size, bool nonsyncing) { std::unique_ptr buf; - if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) + if (!nonsyncing && (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage)) { buf = BufferStorageStreamBuffer::Create(target, size); if (buf) diff --git a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h index 3da57e125e..bd63ec760c 100644 --- a/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h +++ b/pcsx2/GS/Renderers/OpenGL/GLStreamBuffer.h @@ -38,7 +38,7 @@ public: /// Returns the minimum granularity of blocks which sync objects will be created around. 
virtual u32 GetChunkSize() const = 0; - static std::unique_ptr Create(GLenum target, u32 size); + static std::unique_ptr Create(GLenum target, u32 size, bool nonsyncing = false); protected: GLStreamBuffer(GLenum target, GLuint buffer_id, u32 size); diff --git a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp index 174f69174f..019ec607fc 100644 --- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp +++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp @@ -260,11 +260,17 @@ bool GSDeviceOGL::Create(GSVSyncMode vsync_mode, bool allow_present_throttle) m_vertex_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE); m_index_stream_buffer = GLStreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE); - m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE); + if (m_features.accurate_prims) + { + // Performance note: prefer a non-syncing buffer for accurate prims so that it is more likely to be GPU local. + // Rationale: we expect this buffer to be updated relatively rarely and it's used as a pixel shader resource. 
+ m_accurate_prims_stream_buffer = GLStreamBuffer::Create(GL_ARRAY_BUFFER, ACCURATE_PRIMS_BUFFER_SIZE, true); + } m_vertex_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, VERTEX_UNIFORM_BUFFER_SIZE); m_fragment_uniform_stream_buffer = GLStreamBuffer::Create(GL_UNIFORM_BUFFER, FRAGMENT_UNIFORM_BUFFER_SIZE); glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &m_uniform_buffer_alignment); - if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_accurate_prims_stream_buffer || + if (!m_vertex_stream_buffer || !m_index_stream_buffer || + (m_features.accurate_prims && !m_accurate_prims_stream_buffer) || !m_vertex_uniform_stream_buffer || !m_fragment_uniform_stream_buffer) { Host::ReportErrorAsync("GS", "Failed to create vertex/index/uniform streaming buffers"); diff --git a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp index 0881bdb7e0..59a9f3acbb 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp +++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp @@ -3406,13 +3406,14 @@ void GSDeviceVK::IASetIndexBuffer(const void* index, size_t count) SetIndexBuffer(m_index_stream_buffer.GetBuffer()); } -void GSDeviceVK::SetupAccuratePrims(GSHWDrawConfig& config) +void GSDeviceVK::SetupAccuratePrimsBuffer(GSHWDrawConfig& config) { if (config.accurate_prims) { const u32 count = config.accurate_prims_edge_data->size(); const u32 size = count * sizeof(AccuratePrimsEdgeData); + // Reserve the GPU region. if (!m_accurate_prims_stream_buffer.ReserveMemory(size, sizeof(AccuratePrimsEdgeData))) { ExecuteCommandBufferAndRestartRenderPass(false, "Uploading bytes to accurate prims buffer"); @@ -3420,17 +3421,120 @@ void GSDeviceVK::SetupAccuratePrims(GSHWDrawConfig& config) pxFailRel("Failed to reserve space for accurate prims"); } + const u32 offset = m_accurate_prims_stream_buffer.GetCurrentOffset(); + + if (InRenderPass()) + EndRenderPass(); + + // Copy data to an upload buffer. 
+ VkBuffer upload_buffer; + u32 upload_buffer_offset; + + const auto upload_data = [&](void* map_ptr) { + std::memcpy(map_ptr, config.accurate_prims_edge_data->data(), size); + }; + + // If the texture is larger than half our streaming buffer size, use a separate buffer. + // Otherwise allocation will either fail, or require lots of cmdbuffer submissions. + if (size > m_texture_stream_buffer.GetCurrentSize() / 2) + { + upload_buffer_offset = 0; + upload_buffer = AllocateUploadStagingBuffer(size, upload_data); + } + else + { + upload_buffer = WriteTextureUploadBuffer(size, upload_data, upload_buffer_offset); + } + if (upload_buffer == VK_NULL_HANDLE) + { + Console.Error("Failed to get upload buffer for accurate prims data."); + return; + } + + // Copy data from upload to GPU buffer. + VkBufferCopy copyRegion = {upload_buffer_offset, offset, size}; + vkCmdCopyBuffer(GetCurrentCommandBuffer(), upload_buffer, m_accurate_prims_stream_buffer.GetBuffer(), 1, &copyRegion); + + // Commit the GPU region. + m_accurate_prims_stream_buffer.CommitMemory(size); + + // Issue the barrier since this will be used next draw. + VkBufferMemoryBarrier barrier = { + VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, nullptr, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, + VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, + m_accurate_prims_stream_buffer.GetBuffer(), offset, size}; + vkCmdPipelineBarrier(GetCurrentCommandBuffer(), + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + 0, 0, nullptr, 1, &barrier, 0, nullptr); + + m_accurate_prims_stream_buffer_offset = offset; // Save this for the constant buffer. + } +} + +void GSDeviceVK::SetupAccuratePrimsConstants(GSHWDrawConfig& config) +{ + if (config.accurate_prims) + { + // We separate this from setting up the buffer to mirror Vulkan, which requires it. 
config.cb_vs.base_vertex = m_vertex.start; - config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer.GetCurrentOffset() / sizeof(AccuratePrimsEdgeData); + config.cb_ps.accurate_prims_base_index.x = m_accurate_prims_stream_buffer_offset / sizeof(AccuratePrimsEdgeData); SetVSConstantBuffer(config.cb_vs); SetPSConstantBuffer(config.cb_ps); - - std::memcpy(m_accurate_prims_stream_buffer.GetCurrentHostPointer(), config.accurate_prims_edge_data->data(), size); - m_accurate_prims_stream_buffer.CommitMemory(size); } } +VkBuffer GSDeviceVK::WriteTextureUploadBuffer(u32 size, std::function write_data, u32& offset_out) +{ + if (!m_texture_stream_buffer.ReserveMemory(size, GetBufferCopyOffsetAlignment())) + { + ExecuteCommandBuffer( + false, "While waiting for %u bytes in texture upload buffer", size); + if (!m_texture_stream_buffer.ReserveMemory(size, GetBufferCopyOffsetAlignment())) + { + Console.Error("Failed to reserve texture upload memory (%u bytes).", size); + return VK_NULL_HANDLE; + } + } + + offset_out = m_texture_stream_buffer.GetCurrentOffset(); + write_data(m_texture_stream_buffer.GetCurrentHostPointer()); + m_texture_stream_buffer.CommitMemory(size); + return m_texture_stream_buffer.GetBuffer(); +} + +VkBuffer GSDeviceVK::AllocateUploadStagingBuffer(u32 size, std::function write_data) +{ + const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast(size), + VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr}; + + // Don't worry about setting the coherent bit for this upload, the main reason we had + // that set in StreamBuffer was for MoltenVK, which would upload the whole buffer on + // smaller uploads, but we're writing to the whole thing anyway. 
+ VmaAllocationCreateInfo aci = {}; + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + + VmaAllocationInfo ai; + VkBuffer buffer; + VmaAllocation allocation; + VkResult res = vmaCreateBuffer(GSDeviceVK::GetInstance()->GetAllocator(), &bci, &aci, &buffer, &allocation, &ai); + if (res != VK_SUCCESS) + { + LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: "); + return VK_NULL_HANDLE; + } + + // Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy. + GSDeviceVK::GetInstance()->DeferBufferDestruction(buffer, allocation); + + // And write the data. + write_data(ai.pMappedData); + vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), allocation, 0, size); + return buffer; +} + void GSDeviceVK::OMSetRenderTargets( GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop) { @@ -3762,7 +3866,8 @@ bool GSDeviceVK::CreateBuffers() if (m_features.accurate_prims) { - if (!m_accurate_prims_stream_buffer.Create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ACCURATE_PRIMS_BUFFER_SIZE)) + if (!m_accurate_prims_stream_buffer.Create( + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, ACCURATE_PRIMS_BUFFER_SIZE, true)) { Host::ReportErrorAsync("GS", "Failed to allocate accurate prims buffer"); return false; @@ -5673,13 +5778,15 @@ GSTextureVK* GSDeviceVK::SetupPrimitiveTrackingDATE(GSHWDrawConfig& config) void GSDeviceVK::RenderHW(GSHWDrawConfig& config) { - const GSVector2i rtsize(config.rt ? config.rt->GetSize() : config.ds->GetSize()); GSTextureVK* draw_rt = static_cast(config.rt); GSTextureVK* draw_ds = static_cast(config.ds); GSTextureVK* draw_rt_clone = nullptr; GSTextureVK* colclip_rt = static_cast(g_gs_device->GetColorClipTexture()); + // Copying buffers needs to be done outside render pass so do this early. 
+ SetupAccuratePrimsBuffer(config); + // stream buffer in first, in case we need to exec SetVSConstantBuffer(config.cb_vs); SetPSConstantBuffer(config.cb_ps); @@ -6157,7 +6264,8 @@ void GSDeviceVK::UploadHWDrawVerticesAndIndices(GSHWDrawConfig& config) IASetIndexBuffer(config.indices, config.nindices); } - SetupAccuratePrims(config); + // Needs to be done after vertex offset is set. + SetupAccuratePrimsConstants(config); } VkImageMemoryBarrier GSDeviceVK::GetColorBufferBarrier(GSTextureVK* rt) const diff --git a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h index e8eb0f8ff5..17ac9d9f3d 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h +++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h @@ -98,6 +98,8 @@ public: __fi VkCommandBuffer GetCurrentCommandBuffer() const { return m_current_command_buffer; } __fi VKStreamBuffer& GetTextureUploadBuffer() { return m_texture_stream_buffer; } VkCommandBuffer GetCurrentInitCommandBuffer(); + VkBuffer AllocateUploadStagingBuffer(u32 size, std::function write_data); + VkBuffer WriteTextureUploadBuffer(u32 size, std::function write_data, u32& offset_out); /// Allocates a descriptor set from the pool reserved for the current frame. VkDescriptorSet AllocatePersistentDescriptorSet(VkDescriptorSetLayout set_layout); @@ -381,6 +383,7 @@ private: VKStreamBuffer m_vertex_stream_buffer; VKStreamBuffer m_index_stream_buffer; VKStreamBuffer m_accurate_prims_stream_buffer; + u32 m_accurate_prims_stream_buffer_offset = 0; // Ring buffer offset for the current draw. 
VKStreamBuffer m_vertex_uniform_stream_buffer; VKStreamBuffer m_fragment_uniform_stream_buffer; VKStreamBuffer m_texture_stream_buffer; @@ -563,7 +566,8 @@ public: void PSSetShaderResource(int i, GSTexture* sr, bool check_state); void PSSetSampler(GSHWDrawConfig::SamplerSelector sel); - void SetupAccuratePrims(GSHWDrawConfig& config); + void SetupAccuratePrimsBuffer(GSHWDrawConfig& config); + void SetupAccuratePrimsConstants(GSHWDrawConfig& config); void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i& scissor, FeedbackLoopFlag feedback_loop = FeedbackLoopFlag_None); diff --git a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp index d7aec5b8af..ff681e525a 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp +++ b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.cpp @@ -270,38 +270,6 @@ void GSTextureVK::CopyTextureDataForUpload(void* dst, const void* src, u32 pitch StringUtil::StrideMemCpy(dst, upload_pitch, src, pitch, std::min(upload_pitch, pitch), count); } -VkBuffer GSTextureVK::AllocateUploadStagingBuffer(const void* data, u32 pitch, u32 upload_pitch, u32 height) const -{ - const u32 size = upload_pitch * height; - const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast(size), - VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr}; - - // Don't worry about setting the coherent bit for this upload, the main reason we had - // that set in StreamBuffer was for MoltenVK, which would upload the whole buffer on - // smaller uploads, but we're writing to the whole thing anyway. 
- VmaAllocationCreateInfo aci = {}; - aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; - - VmaAllocationInfo ai; - VkBuffer buffer; - VmaAllocation allocation; - VkResult res = vmaCreateBuffer(GSDeviceVK::GetInstance()->GetAllocator(), &bci, &aci, &buffer, &allocation, &ai); - if (res != VK_SUCCESS) - { - LOG_VULKAN_ERROR(res, "(AllocateUploadStagingBuffer) vmaCreateBuffer() failed: "); - return VK_NULL_HANDLE; - } - - // Immediately queue it for freeing after the command buffer finishes, since it's only needed for the copy. - GSDeviceVK::GetInstance()->DeferBufferDestruction(buffer, allocation); - - // And write the data. - CopyTextureDataForUpload(ai.pMappedData, data, pitch, upload_pitch, height); - vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), allocation, 0, size); - return buffer; -} - void GSTextureVK::UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height, u32 buffer_height, u32 row_length, VkBuffer buffer, u32 buffer_offset) { @@ -333,6 +301,10 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l const u32 upload_pitch = Common::AlignUpPow2(pitch, GSDeviceVK::GetInstance()->GetBufferCopyRowPitchAlignment()); const u32 required_size = CalcUploadSize(height, upload_pitch); + const auto upload_data = [&](void* map_ptr) { + CopyTextureDataForUpload(map_ptr, data, pitch, upload_pitch, height); + }; + // If the texture is larger than half our streaming buffer size, use a separate buffer. // Otherwise allocation will either fail, or require lots of cmdbuffer submissions. 
VkBuffer buffer; @@ -340,29 +312,14 @@ bool GSTextureVK::Update(const GSVector4i& r, const void* data, int pitch, int l if (required_size > (GSDeviceVK::GetInstance()->GetTextureUploadBuffer().GetCurrentSize() / 2)) { buffer_offset = 0; - buffer = AllocateUploadStagingBuffer(data, pitch, upload_pitch, height); - if (buffer == VK_NULL_HANDLE) - return false; + buffer = GSDeviceVK::GetInstance()->AllocateUploadStagingBuffer(required_size, upload_data); } else { - VKStreamBuffer& sbuffer = GSDeviceVK::GetInstance()->GetTextureUploadBuffer(); - if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment())) - { - GSDeviceVK::GetInstance()->ExecuteCommandBuffer( - false, "While waiting for %u bytes in texture upload buffer", required_size); - if (!sbuffer.ReserveMemory(required_size, GSDeviceVK::GetInstance()->GetBufferCopyOffsetAlignment())) - { - Console.Error("Failed to reserve texture upload memory (%u bytes).", required_size); - return false; - } - } - - buffer = sbuffer.GetBuffer(); - buffer_offset = sbuffer.GetCurrentOffset(); - CopyTextureDataForUpload(sbuffer.GetCurrentHostPointer(), data, pitch, upload_pitch, height); - sbuffer.CommitMemory(required_size); + buffer = GSDeviceVK::GetInstance()->WriteTextureUploadBuffer(required_size, upload_data, buffer_offset); } + if (buffer == VK_NULL_HANDLE) + return false; const VkCommandBuffer cmdbuf = GetCommandBufferForUpdate(); GL_PUSH("GSTextureVK::Update({%d,%d} %dx%d Lvl:%u", r.x, r.y, r.width(), r.height(), layer); diff --git a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h index 41cd2edbb2..121946eb01 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h +++ b/pcsx2/GS/Renderers/Vulkan/GSTextureVK.h @@ -84,7 +84,6 @@ private: VkCommandBuffer GetCommandBufferForUpdate(); void CopyTextureDataForUpload(void* dst, const void* src, u32 pitch, u32 upload_pitch, u32 height) const; - VkBuffer AllocateUploadStagingBuffer(const void* data, u32 pitch, 
u32 upload_pitch, u32 height) const; void UpdateFromBuffer(VkCommandBuffer cmdbuf, int level, u32 x, u32 y, u32 width, u32 height, u32 buffer_height, u32 row_length, VkBuffer buffer, u32 buffer_offset); diff --git a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp index 8f6323d6fa..ad4e581f85 100644 --- a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp +++ b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.cpp @@ -19,6 +19,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move) , m_allocation(move.m_allocation) , m_buffer(move.m_buffer) , m_host_pointer(move.m_host_pointer) + , m_device_local(move.m_device_local) , m_tracked_fences(std::move(move.m_tracked_fences)) { move.m_size = 0; @@ -28,6 +29,7 @@ VKStreamBuffer::VKStreamBuffer(VKStreamBuffer&& move) move.m_allocation = VK_NULL_HANDLE; move.m_buffer = VK_NULL_HANDLE; move.m_host_pointer = nullptr; + move.m_device_local = false; } VKStreamBuffer::~VKStreamBuffer() @@ -48,19 +50,29 @@ VKStreamBuffer& VKStreamBuffer::operator=(VKStreamBuffer&& move) std::swap(m_buffer, move.m_buffer); std::swap(m_host_pointer, move.m_host_pointer); std::swap(m_tracked_fences, move.m_tracked_fences); + std::swap(m_device_local, move.m_device_local); return *this; } -bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size) +bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size, bool device_local) { const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, 0, static_cast(size), usage, VK_SHARING_MODE_EXCLUSIVE, 0, nullptr}; VmaAllocationCreateInfo aci = {}; - aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; - aci.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + if (device_local) + { + // GPU default buffer + aci.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + } + else + { + // CPU upload buffer + aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + aci.usage = VMA_MEMORY_USAGE_CPU_TO_GPU; + aci.preferredFlags = 
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + } VmaAllocationInfo ai = {}; VkBuffer new_buffer = VK_NULL_HANDLE; @@ -83,7 +95,8 @@ bool VKStreamBuffer::Create(VkBufferUsageFlags usage, u32 size) m_tracked_fences.clear(); m_allocation = new_allocation; m_buffer = new_buffer; - m_host_pointer = static_cast(ai.pMappedData); + m_host_pointer = device_local ? nullptr : static_cast(ai.pMappedData); + m_device_local = device_local; return true; } @@ -104,6 +117,7 @@ void VKStreamBuffer::Destroy(bool defer) m_buffer = VK_NULL_HANDLE; m_allocation = VK_NULL_HANDLE; m_host_pointer = nullptr; + m_device_local = false; } bool VKStreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment) @@ -180,8 +194,11 @@ void VKStreamBuffer::CommitMemory(u32 final_num_bytes) pxAssert((m_current_offset + final_num_bytes) <= m_size); pxAssert(final_num_bytes <= m_current_space); - // For non-coherent mappings, flush the memory range - vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), m_allocation, m_current_offset, final_num_bytes); + if (!m_device_local) + { + // For non-coherent mappings, flush the memory range + vmaFlushAllocation(GSDeviceVK::GetInstance()->GetAllocator(), m_allocation, m_current_offset, final_num_bytes); + } m_current_offset += final_num_bytes; m_current_space -= final_num_bytes; diff --git a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h index e0839a838e..07de25a8cb 100644 --- a/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h +++ b/pcsx2/GS/Renderers/Vulkan/VKStreamBuffer.h @@ -30,14 +30,13 @@ public: __fi u32 GetCurrentSpace() const { return m_current_space; } __fi u32 GetCurrentOffset() const { return m_current_offset; } - bool Create(VkBufferUsageFlags usage, u32 size); + bool Create(VkBufferUsageFlags usage, u32 size, bool device_local = false); void Destroy(bool defer); bool ReserveMemory(u32 num_bytes, u32 alignment); void CommitMemory(u32 final_num_bytes); private: - bool AllocateBuffer(VkBufferUsageFlags usage, u32 size); 
void UpdateCurrentFencePosition(); void UpdateGPUPosition(); @@ -51,7 +50,8 @@ private: VmaAllocation m_allocation = VK_NULL_HANDLE; VkBuffer m_buffer = VK_NULL_HANDLE; - u8* m_host_pointer = nullptr; + u8* m_host_pointer = nullptr; // Only used for upload buffers. + bool m_device_local = false; // False for upload buffer; true for default buffer. // List of fences and the corresponding positions in the buffer std::deque> m_tracked_fences;