shadPS4/src/video_core/buffer_cache/buffer.cpp
Lander Gallastegi f9bbde9c79
video_core: Implement DMA. (#2819)
* Import memory

* 64K pages and fix memory mapping

* Queue coverage

* Buffer syncing, faulted readback adn BDA in Buffer

* Base DMA implementation

* Preparations for implementing SPV DMA access

* Base impl (pending 16K pages and getbuffersize)

* 16K pages and stack overflow fix

* clang-format

* clang-format but for real this time

* Try to fix macOS build

* Correct decltype

* Add testing log

* Fix stride and patch phi node blocks

* No need to check if it is a deleted buffer

* Clang format once more

* Offset in bytes

* Removed host buffers (may do it in another PR)

Also some random barrier fixes

* Add IR dumping from my read-const branch

* clang-format

* Correct size insteed of end

* Fix incorrect assert

* Possible fix for NieR deadlock

* Copy to avoid deadlock

* Use 2 mutexes insteed of copy

* Attempt to range sync error

* Revert "Attempt to range sync error"

This reverts commit dd287b48682b50f215680bb0956e39c2809bf3fe.

* Fix size truncated when syncing range

And memory barrier

* Some fixes (and async testing (doesn't work))

* Use compute to parse fault buffer

* Process faults on submit

* Only sync in the first time we see a readconst

Thsi is partialy wrong. We need to save the state into the submission context itself, not the rasterizer since we can yield and process another sumission (if im not understanding wrong).

* Use spec const and 32 bit atomic

* 32 bit counter

* Fix store_index

* Better sync (WIP, breaks PR now)

* Fixes for better sync

* Better sync

* Remove memory coveragte logic

* Point sirit to upstream

* Less waiting and barriers

* Correctly checkout moltenvk

* Bring back applying pending operations in wait

* Sync the whole buffer insteed of only the range

* Implement recursive shared/scoped locks

* Iterators

* Faster syncing with ranges

* Some alignment fixes

* fixed clang format

* Fix clang-format again

* Port page_manager from readbacks-poc

* clang-format

* Defer memory protect

* Remove RENDERER_TRACE

* Experiment: only sync on first readconst

* Added profiling (will be removed)

* Don't sync entire buffers

* Added logging for testing

* Updated temporary workaround to use 4k pages

* clang.-format

* Cleanup part 1

* Make ReadConst a SPIR-V function

---------

Co-authored-by: georgemoralis <giorgosmrls@gmail.com>
2025-05-22 21:00:15 +03:00

207 lines
7.3 KiB
C++

// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/alignment.h"
#include "common/assert.h"
#include "video_core/buffer_cache/buffer.h"
#include "video_core/renderer_vulkan/liverpool_to_vk.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_platform.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include <vk_mem_alloc.h>
namespace VideoCore {
std::string_view BufferTypeName(MemoryUsage type) {
switch (type) {
case MemoryUsage::Upload:
return "Upload";
case MemoryUsage::Download:
return "Download";
case MemoryUsage::Stream:
return "Stream";
case MemoryUsage::DeviceLocal:
return "DeviceLocal";
default:
return "Invalid";
}
}
[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferredVmaFlags(MemoryUsage usage) {
return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
: VkMemoryPropertyFlagBits{};
}
[[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) {
switch (usage) {
case MemoryUsage::Upload:
case MemoryUsage::Stream:
return VMA_ALLOCATION_CREATE_MAPPED_BIT |
VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
case MemoryUsage::Download:
return VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
case MemoryUsage::DeviceLocal:
return {};
}
return {};
}
[[nodiscard]] VmaMemoryUsage MemoryUsageVma(MemoryUsage usage) {
switch (usage) {
case MemoryUsage::DeviceLocal:
case MemoryUsage::Stream:
return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
case MemoryUsage::Upload:
case MemoryUsage::Download:
return VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
}
return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
}
UniqueBuffer::UniqueBuffer(vk::Device device_, VmaAllocator allocator_)
: device{device_}, allocator{allocator_} {}
UniqueBuffer::~UniqueBuffer() {
if (buffer) {
vmaDestroyBuffer(allocator, buffer, allocation);
}
}
void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
VmaAllocationInfo* out_alloc_info) {
const bool with_bda = bool(buffer_ci.usage & vk::BufferUsageFlagBits::eShaderDeviceAddress);
const VmaAllocationCreateFlags bda_flag =
with_bda ? VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT : 0;
const VmaAllocationCreateInfo alloc_ci = {
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | bda_flag | MemoryUsageVmaFlags(usage),
.usage = MemoryUsageVma(usage),
.requiredFlags = 0,
.preferredFlags = MemoryUsagePreferredVmaFlags(usage),
.pool = VK_NULL_HANDLE,
.pUserData = nullptr,
};
const VkBufferCreateInfo buffer_ci_unsafe = static_cast<VkBufferCreateInfo>(buffer_ci);
VkBuffer unsafe_buffer{};
VkResult result = vmaCreateBuffer(allocator, &buffer_ci_unsafe, &alloc_ci, &unsafe_buffer,
&allocation, out_alloc_info);
ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}",
vk::to_string(vk::Result{result}));
buffer = vk::Buffer{unsafe_buffer};
if (with_bda) {
vk::BufferDeviceAddressInfo bda_info{
.buffer = buffer,
};
auto bda_result = device.getBufferAddress(bda_info);
ASSERT_MSG(bda_result != 0, "Failed to get buffer device address");
bda_addr = bda_result;
}
}
Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, MemoryUsage usage_,
VAddr cpu_addr_, vk::BufferUsageFlags flags, u64 size_bytes_)
: cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, scheduler{&scheduler_},
usage{usage_}, buffer{instance->GetDevice(), instance->GetAllocator()} {
// Create buffer object.
const vk::BufferCreateInfo buffer_ci = {
.size = size_bytes,
.usage = flags,
};
VmaAllocationInfo alloc_info{};
buffer.Create(buffer_ci, usage, &alloc_info);
const auto device = instance->GetDevice();
Vulkan::SetObjectName(device, Handle(), "Buffer {:#x}:{:#x}", cpu_addr, size_bytes);
// Map it if it is host visible.
VkMemoryPropertyFlags property_flags{};
vmaGetAllocationMemoryProperties(instance->GetAllocator(), buffer.allocation, &property_flags);
if (alloc_info.pMappedData) {
mapped_data = std::span<u8>{std::bit_cast<u8*>(alloc_info.pMappedData), size_bytes};
}
is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
}
constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
MemoryUsage usage, u64 size_bytes)
: Buffer{instance, scheduler, usage, 0, AllFlags, size_bytes} {
ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
const auto device = instance.GetDevice();
Vulkan::SetObjectName(device, Handle(), "StreamBuffer({}):{:#x}", BufferTypeName(usage),
size_bytes);
}
std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
if (!is_coherent && usage == MemoryUsage::Stream) {
size = Common::AlignUp(size, instance->NonCoherentAtomSize());
}
ASSERT(size <= this->size_bytes);
mapped_size = size;
if (alignment > 0) {
offset = Common::AlignUp(offset, alignment);
}
if (offset + size > this->size_bytes) {
// The buffer would overflow, save the amount of used watches and reset the state.
invalidation_mark = current_watch_cursor;
current_watch_cursor = 0;
offset = 0;
// Swap watches and reset waiting cursors.
std::swap(previous_watches, current_watches);
wait_cursor = 0;
wait_bound = 0;
}
const u64 mapped_upper_bound = offset + size;
WaitPendingOperations(mapped_upper_bound);
return std::make_pair(mapped_data.data() + offset, offset);
}
void StreamBuffer::Commit() {
if (!is_coherent) {
if (usage == MemoryUsage::Download) {
vmaInvalidateAllocation(instance->GetAllocator(), buffer.allocation, offset,
mapped_size);
} else {
vmaFlushAllocation(instance->GetAllocator(), buffer.allocation, offset, mapped_size);
}
}
offset += mapped_size;
if (current_watch_cursor + 1 >= current_watches.size()) {
// Ensure that there are enough watches.
ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK);
}
auto& watch = current_watches[current_watch_cursor++];
watch.upper_bound = offset;
watch.tick = scheduler->CurrentTick();
}
void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {
watches.resize(watches.size() + grow_size);
}
void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
if (!invalidation_mark) {
return;
}
while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
auto& watch = previous_watches[wait_cursor];
wait_bound = watch.upper_bound;
scheduler->Wait(watch.tick);
++wait_cursor;
}
}
} // namespace VideoCore