Lander Gallastegi 2025-12-15 17:14:48 -08:00 committed by GitHub
commit 330e90cd28
13 changed files with 366 additions and 99 deletions

View File

@ -255,4 +255,31 @@ bool StreamBuffer::WaitPendingOperations(u64 requested_upper_bound, bool allow_w
return true;
}
StreamBufferMapping::StreamBufferMapping(StreamBuffer& stream_buffer, u64 size, u64 alignment,
bool allow_wait) {
const auto [data, offset] = stream_buffer.Map(size, alignment, allow_wait);
if (!data) {
// This happens if the size is too big, or if waiting is required but not allowed
is_temp_buffer = true;
this->buffer = new VideoCore::Buffer(*stream_buffer.instance, *stream_buffer.scheduler,
stream_buffer.usage, 0, AllFlags, size);
this->data = this->buffer->mapped_data.data();
this->offset = 0;
ASSERT_MSG(this->data, "Failed to map temporary buffer");
} else {
is_temp_buffer = false;
buffer = &stream_buffer;
this->data = data;
this->offset = offset;
}
}
StreamBufferMapping::~StreamBufferMapping() {
if (is_temp_buffer) {
ASSERT(buffer);
auto scheduler = buffer->scheduler;
scheduler->DeferOperation([buffer = this->buffer]() mutable { delete buffer; });
}
}
} // namespace VideoCore

View File

@ -4,6 +4,7 @@
#pragma once
#include <cstddef>
#include <memory>
#include <optional>
#include <utility>
#include <vector>
@ -210,4 +211,51 @@ private:
u64 wait_bound{};
};
class StreamBufferMapping {
public:
StreamBufferMapping(StreamBuffer& stream_buffer, u64 size, u64 alignment = 0,
bool allow_wait = true);
~StreamBufferMapping();
StreamBufferMapping(const StreamBufferMapping&) = delete;
StreamBufferMapping& operator=(const StreamBufferMapping&) = delete;
StreamBufferMapping(StreamBufferMapping&& other)
: buffer{std::exchange(other.buffer, nullptr)}, data{std::exchange(other.data, nullptr)},
offset{std::exchange(other.offset, 0)},
is_temp_buffer{std::exchange(other.is_temp_buffer, false)} {}
StreamBufferMapping& operator=(StreamBufferMapping&& other) {
if (this != &other) {
buffer = std::exchange(other.buffer, nullptr);
data = std::exchange(other.data, nullptr);
offset = std::exchange(other.offset, 0);
is_temp_buffer = std::exchange(other.is_temp_buffer, false);
}
return *this;
}
VideoCore::Buffer* Buffer() const {
return buffer;
}
u8* Data() const {
return data;
}
u64 Offset() const {
return offset;
}
bool TemporaryBuffer() const {
return is_temp_buffer;
}
private:
VideoCore::Buffer* buffer;
u8* data{};
u64 offset{};
bool is_temp_buffer{};
};
} // namespace VideoCore
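For reference, a minimal usage sketch of the new RAII mapping type; the caller, `stream_buffer`, `src`, and `size` below are illustrative assumptions, not code from this commit:
// Hypothetical call site: stage `size` bytes through the stream buffer, transparently
// falling back to a temporary buffer when the stream buffer cannot satisfy the request.
StreamBufferMapping mapping(stream_buffer, size, /*alignment=*/16, /*allow_wait=*/true);
std::memcpy(mapping.Data(), src, size);               // write the payload into the mapping
const vk::Buffer handle = mapping.Buffer()->Handle(); // buffer to bind or copy from
const u64 upload_offset = mapping.Offset();           // offset of the mapped region
// If mapping.TemporaryBuffer() is true, the destructor defers deletion of the
// temporary buffer on the scheduler instead of releasing stream buffer space.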

View File

@ -73,12 +73,17 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
BufferCache::~BufferCache() = default;
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool download) {
if (!IsRegionRegistered(device_addr, size)) {
return;
}
memory_tracker->InvalidateRegion(
device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); });
if (download) {
memory_tracker->InvalidateRegion(
device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); });
} else {
memory_tracker->InvalidateRegion(device_addr, size);
gpu_modified_ranges.Subtract(device_addr, size);
}
}
void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) {
@ -122,11 +127,13 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
const auto write_data = [&]() {
cmdbuf.copyBuffer(buffer.Handle(), download_buffer.Handle(), copies);
const auto write_func = [this, buf_addr = buffer.CpuAddr(), copies = std::move(copies),
download, offset, device_addr, size, is_write]() {
auto* memory = Core::Memory::Instance();
for (const auto& copy : copies) {
const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
const VAddr copy_device_addr = buf_addr + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - offset;
memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr), download + dst_offset,
copy.size);
@ -136,12 +143,67 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
memory_tracker->MarkRegionAsCpuModified(device_addr, size);
}
};
if constexpr (async) {
scheduler.DeferOperation(write_data);
scheduler.DeferOperation(write_func);
} else {
scheduler.Finish();
write_data();
write_func();
}
return;
}
void BufferCache::ReadEdgeImagePages(const Image& image) {
// After downloading an image and invalidating its region, GPU-modified ranges in the
// uncovered parts of its edge pages could be lost once the CPU reuploads them.
// This does not change tracker state; DownloadImageMemory is expected to be called afterwards.
const VAddr image_addr = image.info.guest_address;
const u64 image_size = image.info.guest_size;
const VAddr image_end = image_addr + image_size;
const VAddr page_start = PageManager::GetPageAddr(image_addr);
const VAddr page_end = PageManager::GetNextPageAddr(image_end - 1);
boost::container::small_vector<vk::BufferCopy, 2> copies;
u64 total_size_bytes = 0;
const auto [buffer, offset] = ObtainBufferForImage(image_addr, image_size);
const auto add_download = [&](VAddr start, VAddr end) {
const u64 new_offset = start - buffer->CpuAddr();
const u64 new_size = end - start;
copies.push_back(vk::BufferCopy{
.srcOffset = new_offset,
.dstOffset = total_size_bytes,
.size = new_size,
});
// Align up to avoid cache conflicts
constexpr u64 align = 64ULL;
constexpr u64 mask = ~(align - 1ULL);
total_size_bytes += (new_size + align - 1) & mask;
};
gpu_modified_ranges.ForEachInRange(page_start, image_addr - page_start, add_download);
gpu_modified_ranges.ForEachInRange(image_end, page_end - image_end, add_download);
gpu_modified_ranges.Subtract(page_start, page_end - page_start);
if (total_size_bytes == 0) {
return;
}
const auto [download, download_offset] = download_buffer.Map(total_size_bytes);
for (auto& copy : copies) {
// Adjust copies to account for the staging buffer offset
copy.dstOffset += download_offset;
}
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyBuffer(buffer->Handle(), download_buffer.Handle(), copies);
scheduler.DeferOperation([this, buf_addr = buffer->CpuAddr(), copies = std::move(copies),
download, download_offset, image_addr, image_size]() {
auto* memory = Core::Memory::Instance();
for (const auto& copy : copies) {
const VAddr copy_device_addr = buf_addr + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - download_offset;
memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr), download + dst_offset,
copy.size);
}
});
}
void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) {
@ -203,7 +265,7 @@ void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) {
// Map buffers for merged ranges
for (auto& range : ranges_merged) {
const u64 size = memory->ClampRangeSize(range.base_address, range.GetSize());
const auto [buffer, offset] = ObtainBuffer(range.base_address, size, false);
const auto [buffer, offset] = ObtainBuffer(range.base_address, size);
range.vk_buffer = buffer->buffer;
range.offset = offset;
}
@ -256,7 +318,7 @@ void BufferCache::BindIndexBuffer(u32 index_offset) {
// Bind index buffer.
const u32 index_buffer_size = regs.num_indices * index_size;
const auto [vk_buffer, offset] = ObtainBuffer(index_address, index_buffer_size, false);
const auto [vk_buffer, offset] = ObtainBuffer(index_address, index_buffer_size);
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindIndexBuffer(vk_buffer->Handle(), offset, index_type);
}
@ -275,7 +337,8 @@ void BufferCache::FillBuffer(VAddr address, u32 num_bytes, u32 value, bool is_gd
if (is_gds) {
return &gds_buffer;
}
const auto [buffer, offset] = ObtainBuffer(address, num_bytes, true);
const auto [buffer, offset] =
ObtainBuffer(address, num_bytes, ObtainBufferFlags::IsWritten);
return buffer;
}();
buffer->Fill(buffer->Offset(address), num_bytes, value);
@ -297,20 +360,19 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds,
if (src_gds) {
return gds_buffer;
}
const auto buffer_id = FindBuffer(src, num_bytes);
auto& buffer = slot_buffers[buffer_id];
SynchronizeBuffer(buffer, src, num_bytes, false, true);
return buffer;
const auto [buffer, offset] =
ObtainBuffer(src, num_bytes,
ObtainBufferFlags::IgnoreStreamBuffer | ObtainBufferFlags::IsTexelBuffer |
ObtainBufferFlags::InvalidateTextureCache);
return *buffer;
}();
auto& dst_buffer = [&] -> const Buffer& {
if (dst_gds) {
return gds_buffer;
}
const auto buffer_id = FindBuffer(dst, num_bytes);
auto& buffer = slot_buffers[buffer_id];
SynchronizeBuffer(buffer, dst, num_bytes, true, true);
gpu_modified_ranges.Add(dst, num_bytes);
return buffer;
const auto [buffer, offset] = ObtainBuffer(
dst, num_bytes, ObtainBufferFlags::IsWritten | ObtainBufferFlags::IsTexelBuffer);
return *buffer;
}();
const vk::BufferCopy region = {
.srcOffset = src_buffer.Offset(src),
@ -372,10 +434,14 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds,
});
}
std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer, BufferId buffer_id) {
std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size,
ObtainBufferFlags flags, BufferId buffer_id) {
// For read-only buffers use device local stream buffer to reduce renderpass breaks.
if (!is_written && size <= CACHING_PAGESIZE && !IsRegionGpuModified(device_addr, size)) {
const bool is_written = True(flags & ObtainBufferFlags::IsWritten);
const bool is_texel_buffer = True(flags & ObtainBufferFlags::IsTexelBuffer);
const bool skip_stream_buffer = True(flags & ObtainBufferFlags::IgnoreStreamBuffer);
if (!is_written && !skip_stream_buffer && size <= CACHING_PAGESIZE &&
!IsRegionGpuModified(device_addr, size)) {
const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
return {&stream_buffer, offset};
}
@ -383,9 +449,13 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
buffer_id = FindBuffer(device_addr, size);
}
Buffer& buffer = slot_buffers[buffer_id];
SynchronizeBuffer(buffer, device_addr, size, is_written, is_texel_buffer);
const bool img_synced =
SynchronizeBuffer(buffer, device_addr, size, is_written, is_texel_buffer);
if (img_synced && True(flags & ObtainBufferFlags::InvalidateTextureCache)) {
texture_cache.InvalidateMemoryFromGPU(device_addr, size);
}
if (is_written) {
gpu_modified_ranges.Add(device_addr, size);
MarkRegionAsGpuModified(device_addr, size);
}
return {&buffer, buffer.Offset(device_addr)};
}
@ -401,7 +471,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 si
}
// If any buffer within the range was GPU modified, create a full buffer to avoid losing GPU data.
if (IsRegionGpuModified(gpu_addr, size)) {
return ObtainBuffer(gpu_addr, size, false, false);
return ObtainBuffer(gpu_addr, size);
}
// In all other cases, just do a CPU copy to the staging buffer.
const auto [data, offset] = staging_buffer.Map(size, 16);
@ -423,6 +493,12 @@ bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) {
return memory_tracker->IsRegionGpuModified(addr, size);
}
void BufferCache::MarkRegionAsGpuModified(VAddr addr, size_t size) {
gpu_modified_ranges.Add(addr, size);
memory_tracker->MarkRegionAsGpuModified(addr, size);
texture_cache.MarkAsMaybeReused(addr, size);
}
BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
if (device_addr == 0) {
return NULL_BUFFER_ID;
@ -640,6 +716,7 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
size_t total_size_bytes = 0;
VAddr buffer_start = buffer.CpuAddr();
vk::Buffer src_buffer = VK_NULL_HANDLE;
TouchBuffer(buffer);
memory_tracker->ForEachUploadRange(
device_addr, size, is_written,
[&](u64 device_addr_out, u64 range_size) {
@ -682,7 +759,6 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
TouchBuffer(buffer);
}
if (is_texel_buffer && !is_written) {
return SynchronizeBufferFromImage(buffer, device_addr, size);
@ -852,7 +928,6 @@ void BufferCache::RunGarbageCollector() {
}
--max_deletions;
Buffer& buffer = slot_buffers[buffer_id];
// InvalidateMemory(buffer.CpuAddr(), buffer.SizeBytes());
DownloadBufferMemory<true>(buffer, buffer.CpuAddr(), buffer.SizeBytes(), true);
DeleteBuffer(buffer_id);
};
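For orientation, the ObtainBuffer call-site migration looks roughly like this; the address and size variables are placeholders, not code from this commit:
// Previously: positional booleans.
//   ObtainBuffer(addr, size, /*is_written=*/true, /*is_texel_buffer=*/true);
// Now: explicit flags, combinable via DECLARE_ENUM_FLAG_OPERATORS.
const auto [buf, off] = ObtainBuffer(
    addr, size, ObtainBufferFlags::IsWritten | ObtainBufferFlags::IsTexelBuffer);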

View File

@ -4,6 +4,7 @@
#pragma once
#include <boost/container/small_vector.hpp>
#include "common/enum.h"
#include "common/lru_cache.h"
#include "common/slot_vector.h"
#include "common/types.h"
@ -11,6 +12,7 @@
#include "video_core/buffer_cache/fault_manager.h"
#include "video_core/buffer_cache/range_set.h"
#include "video_core/multi_level_page_table.h"
#include "video_core/texture_cache/image.h"
namespace AmdGpu {
struct Liverpool;
@ -34,6 +36,15 @@ class TextureCache;
class MemoryTracker;
class PageManager;
enum class ObtainBufferFlags {
None = 0,
IsWritten = 1 << 0,
IsTexelBuffer = 1 << 1,
IgnoreStreamBuffer = 1 << 2,
InvalidateTextureCache = 1 << 3,
};
DECLARE_ENUM_FLAG_OPERATORS(ObtainBufferFlags)
class BufferCache {
public:
static constexpr u32 CACHING_PAGEBITS = 14;
@ -106,11 +117,14 @@ public:
}
/// Invalidates any buffer in the logical page range.
void InvalidateMemory(VAddr device_addr, u64 size);
void InvalidateMemory(VAddr device_addr, u64 size, bool download);
/// Flushes any GPU modified buffer in the logical page range back to CPU memory.
void ReadMemory(VAddr device_addr, u64 size, bool is_write = false);
/// Flushes GPU modified ranges of the uncovered part of the edge pages of an image.
void ReadEdgeImagePages(const Image& image);
/// Binds host vertex buffers for the current draw.
void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@ -124,9 +138,9 @@ public:
void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
/// Obtains a buffer for the specified region.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
bool is_texel_buffer = false,
BufferId buffer_id = {});
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(
VAddr gpu_addr, u32 size, ObtainBufferFlags flags = ObtainBufferFlags::None,
BufferId buffer_id = {});
/// Attempts to obtain a buffer without modifying the cache contents.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBufferForImage(VAddr gpu_addr, u32 size);
@ -140,6 +154,9 @@ public:
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
/// Mark region as modified from the GPU
void MarkRegionAsGpuModified(VAddr addr, size_t size);
/// Return buffer id for the specified region
BufferId FindBuffer(VAddr device_addr, u32 size);

View File

@ -51,6 +51,15 @@ public:
});
}
void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) {
IteratePages<false>(dirty_cpu_addr, query_size,
[this](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
manager->template ChangeRegionState<Type::GPU, true>(
manager->GetCpuAddr() + offset, size);
});
}
/// Unmark region as modified from the host GPU
void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
IteratePages<false>(dirty_cpu_addr, query_size,
@ -75,6 +84,8 @@ public:
manager->template IsRegionModified<Type::GPU>(offset, size)) {
return true;
}
manager->template ChangeRegionState<Type::GPU, false>(
manager->GetCpuAddr() + offset, size);
manager->template ChangeRegionState<Type::CPU, true>(
manager->GetCpuAddr() + offset, size);
return false;
@ -85,6 +96,20 @@ public:
});
}
/// Removes all protection from a page (loses any GPU modifications that have not been downloaded)
void InvalidateRegion(VAddr cpu_addr, u64 size) noexcept {
IteratePages<false>(cpu_addr, size, [](RegionManager* manager, u64 offset, size_t size) {
// Perform both the GPU and CPU state changes under the lock in case we are
// racing with the GPU thread trying to mark the page as GPU modified.
std::scoped_lock lk{manager->lock};
manager->template ChangeRegionState<Type::GPU, false>(manager->GetCpuAddr() + offset,
size);
manager->template ChangeRegionState<Type::CPU, true>(manager->GetCpuAddr() + offset,
size);
});
}
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func,
auto&& on_upload) {

View File

@ -72,7 +72,7 @@ struct RangeSet {
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_set.empty()) {
if (m_ranges_set.empty() || size == 0) {
return;
}
const VAddr start_address = base_addr;
@ -176,7 +176,7 @@ public:
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
if (m_ranges_map.empty() || size == 0) {
return;
}
const VAddr start_address = base_addr;
@ -280,7 +280,7 @@ public:
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
if (m_ranges_map.empty() || size == 0) {
return;
}
const VAddr start_address = base_addr;

View File

@ -51,6 +51,10 @@ public:
return Common::AlignUp(addr + 1, PAGE_SIZE);
}
static constexpr size_t GetPageSize() {
return PAGE_SIZE;
}
private:
struct Impl;
std::unique_ptr<Impl> impl;

View File

@ -426,7 +426,8 @@ public:
/// Returns the total memory budget available to the device.
[[nodiscard]] u64 GetTotalMemoryBudget() const {
return total_memory_budget;
return 2_GB; // TODO: temporary cap to make garbage collection easier to test
// return total_memory_budget;
}
/// Determines if a format is supported for a set of feature flags.

View File

@ -260,12 +260,12 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3
}
const auto& [buffer, base] =
buffer_cache.ObtainBuffer(arg_address + offset, stride * max_count, false);
buffer_cache.ObtainBuffer(arg_address + offset, stride * max_count);
VideoCore::Buffer* count_buffer{};
u32 count_base{};
if (count_address != 0) {
std::tie(count_buffer, count_base) = buffer_cache.ObtainBuffer(count_address, 4, false);
std::tie(count_buffer, count_base) = buffer_cache.ObtainBuffer(count_address, 4);
}
pipeline->BindResources(set_writes, buffer_barriers, push_data);
@ -346,7 +346,7 @@ void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) {
return;
}
const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size, false);
const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size);
scheduler.EndRendering();
pipeline->BindResources(set_writes, buffer_barriers, push_data);
@ -629,8 +629,15 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE);
}
} else {
const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(
vsharp.base_address, size, desc.is_written, desc.is_formatted, buffer_id);
VideoCore::ObtainBufferFlags flags = {};
if (desc.is_written) {
flags |= VideoCore::ObtainBufferFlags::IsWritten;
}
if (desc.is_formatted) {
flags |= VideoCore::ObtainBufferFlags::IsTexelBuffer;
}
const auto [vk_buffer, offset] =
buffer_cache.ObtainBuffer(vsharp.base_address, size, flags, buffer_id);
const u32 offset_aligned = Common::AlignDown(offset, alignment);
const u32 adjust = offset - offset_aligned;
ASSERT(adjust % 4 == 0);
@ -996,7 +1003,7 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
// Not GPU mapped memory, can skip invalidation logic entirely.
return false;
}
buffer_cache.InvalidateMemory(addr, size);
buffer_cache.InvalidateMemory(addr, size, true);
texture_cache.InvalidateMemory(addr, size);
return true;
}
@ -1030,7 +1037,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) {
}
void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
buffer_cache.InvalidateMemory(addr, size);
buffer_cache.InvalidateMemory(addr, size, true);
texture_cache.UnmapMemory(addr, size);
page_manager.OnGpuUnmap(addr, size);
{

View File

@ -95,9 +95,10 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Compute
// Obtain buffers for the total source and destination ranges.
const auto [src_buf, src_buf_offset] = buffer_cache.ObtainBuffer(
src_buf_sharp.base_address + src_offset_min, src_offset_max - src_offset_min, false);
src_buf_sharp.base_address + src_offset_min, src_offset_max - src_offset_min);
const auto [dst_buf, dst_buf_offset] = buffer_cache.ObtainBuffer(
dst_buf_sharp.base_address + dst_offset_min, dst_offset_max - dst_offset_min, true);
dst_buf_sharp.base_address + dst_offset_min, dst_offset_max - dst_offset_min,
VideoCore::ObtainBufferFlags::IgnoreStreamBuffer);
// Apply found buffer base.
const auto vk_copies = std::span{copies}.subspan(batch_start, batch_end - batch_start);
@ -117,6 +118,14 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Compute
vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands,
vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
// Mark destination regions as GPU modified.
for (u32 i = 0; i < cs_program.dim_x; i++) {
const auto& [dst_idx, src_idx, end] = ctl_buf[i];
const VAddr dst_addr = dst_buf_sharp.base_address + (dst_idx * buf_stride);
const u32 size = (end + 1) * buf_stride;
buffer_cache.MarkRegionAsGpuModified(dst_addr, size);
}
return true;
}

View File

@ -28,9 +28,11 @@ enum ImageFlagBits : u32 {
Empty = 0,
MaybeCpuDirty = 1 << 0, ///< The page this image is in was touched before the image address
CpuDirty = 1 << 1, ///< Contents have been modified from the CPU
GpuDirty = 1 << 2, ///< Contents have been modified from the GPU (valid data in buffer cache)
GpuDirty =
1 << 2, ///< Image contents have been modified from the GPU (valid data in buffer cache)
Dirty = MaybeCpuDirty | CpuDirty | GpuDirty,
GpuModified = 1 << 3, ///< Contents have been modified from the GPU
MaybeReused = 1 << 4, ///< Memory region containing this image was maybe reused by the GPU
Registered = 1 << 6, ///< True when the image is registered
Picked = 1 << 7, ///< Temporary flag to mark the image as picked
};

View File

@ -23,9 +23,9 @@ static constexpr u64 NumFramesBeforeRemoval = 32;
TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_, BufferCache& buffer_cache_,
PageManager& tracker_)
PageManager& page_manager_)
: instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
buffer_cache{buffer_cache_}, tracker{tracker_}, blit_helper{instance, scheduler},
buffer_cache{buffer_cache_}, page_manager{page_manager_}, blit_helper{instance, scheduler},
tile_manager{instance, scheduler, buffer_cache.GetUtilityBuffer(MemoryUsage::Stream)} {
// Create basic null image at fixed image ID.
const auto null_id = GetNullImage(vk::Format::eR8G8B8A8Unorm);
@ -85,48 +85,70 @@ ImageId TextureCache::GetNullImage(const vk::Format format) {
void TextureCache::ProcessDownloadImages() {
for (const ImageId image_id : download_images) {
DownloadImageMemory(image_id);
DownloadImageMemory<true>(image_id);
}
download_images.clear();
}
template <bool priority>
void TextureCache::DownloadImageMemory(ImageId image_id) {
Image& image = slot_images[image_id];
if (False(image.flags & ImageFlagBits::GpuModified)) {
return;
}
auto& download_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::Download);
const u32 download_size = image.info.pitch * image.info.size.height *
image.info.resources.layers * (image.info.num_bits / 8);
ASSERT(download_size <= image.info.guest_size);
const auto [download, offset] = download_buffer.Map(download_size);
download_buffer.Commit();
const vk::BufferImageCopy image_download = {
.bufferOffset = offset,
.bufferRowLength = image.info.pitch,
.bufferImageHeight = image.info.size.height,
.imageSubresource =
{
.aspectMask = image.info.props.is_depth ? vk::ImageAspectFlagBits::eDepth
: vk::ImageAspectFlagBits::eColor,
.mipLevel = 0,
const auto image_addr = image.info.guest_address;
const auto image_size = image.info.guest_size;
const auto image_mips = image.info.resources.levels;
u32 copy_size = 0;
boost::container::small_vector<vk::BufferImageCopy, 8> buffer_copies;
for (u32 mip = 0; mip < image_mips; ++mip) {
const auto& width = std::max(image.info.size.width >> mip, 1u);
const auto& height = std::max(image.info.size.height >> mip, 1u);
const auto& depth =
image.info.props.is_volume ? std::max(image.info.size.depth >> mip, 1u) : 1u;
const auto [mip_size, mip_pitch, mip_height, mip_offset] = image.info.mips_layout[mip];
const u32 extent_width = mip_pitch ? std::min(mip_pitch, width) : width;
const u32 extent_height = mip_height ? std::min(mip_height, height) : height;
buffer_copies.push_back(vk::BufferImageCopy{
.bufferOffset = mip_offset,
.bufferRowLength = mip_pitch,
.bufferImageHeight = mip_height,
.imageSubresource{
.aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil,
.mipLevel = mip,
.baseArrayLayer = 0,
.layerCount = image.info.resources.layers,
},
.imageOffset = {0, 0, 0},
.imageExtent = {image.info.size.width, image.info.size.height, 1},
};
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {});
cmdbuf.copyImageToBuffer(image.GetImage(), vk::ImageLayout::eTransferSrcOptimal,
download_buffer.Handle(), image_download);
scheduler.DeferPriorityOperation(
[this, device_addr = image.info.guest_address, download, download_size] {
Core::Memory::Instance()->TryWriteBacking(std::bit_cast<u8*>(device_addr), download,
download_size);
.imageOffset = {0, 0, 0},
.imageExtent = {extent_width, extent_height, depth},
});
copy_size += mip_size;
}
if (buffer_copies.empty()) {
return;
}
StreamBufferMapping mapping(download_buffer, image_size);
download_buffer.Commit();
scheduler.EndRendering();
image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {});
tile_manager.TileImage(image, buffer_copies, mapping.Buffer()->Handle(), mapping.Offset(),
copy_size);
const auto operation = [this, device_addr = image.info.guest_address, download = mapping.Data(),
image_size] {
Core::Memory::Instance()->TryWriteBacking(std::bit_cast<u8*>(device_addr), download,
image_size);
if constexpr (!priority) {
buffer_cache.InvalidateMemory(device_addr, image_size, false);
}
};
if constexpr (priority) {
scheduler.DeferPriorityOperation(std::move(operation));
} else {
scheduler.DeferOperation(std::move(operation));
}
}
void TextureCache::MarkAsMaybeDirty(ImageId image_id, Image& image) {
@ -183,6 +205,13 @@ void TextureCache::InvalidateMemoryFromGPU(VAddr address, size_t max_size) {
});
}
void TextureCache::MarkAsMaybeReused(VAddr addr, size_t size) {
std::scoped_lock lock{mutex};
ForEachImageInRegion(addr, size, [&](ImageId image_id, Image& image) {
image.flags |= ImageFlagBits::MaybeReused;
});
}
void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) {
std::scoped_lock lk{mutex};
@ -411,6 +440,7 @@ ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) {
TrackImage(new_image_id);
new_image.flags &= ~ImageFlagBits::Dirty;
new_image.flags |= src_image.flags & ImageFlagBits::GpuModified;
return new_image_id;
}
@ -516,12 +546,15 @@ ImageId TextureCache::FindImageFromRange(VAddr address, size_t size, bool ensure
if (image_ids.size() == 1) {
// Sometimes the image size might not exactly match the requested buffer size.
// If we only found one candidate image, use it without further checks.
Image& image = slot_images[image_ids[0]];
TouchImage(image);
return image_ids.back();
}
if (!image_ids.empty()) {
for (s32 i = 0; i < image_ids.size(); ++i) {
Image& image = slot_images[image_ids[i]];
if (image.info.guest_size == size) {
TouchImage(image);
return image_ids[i];
}
}
@ -758,7 +791,7 @@ void TextureCache::TrackImage(ImageId image_id) {
// Re-track the whole image
image.track_addr = image_begin;
image.track_addr_end = image_end;
tracker.UpdatePageWatchers<1>(image_begin, image.info.guest_size);
page_manager.UpdatePageWatchers<1>(image_begin, image.info.guest_size);
} else {
if (image_begin < image.track_addr) {
TrackImageHead(image_id);
@ -781,7 +814,7 @@ void TextureCache::TrackImageHead(ImageId image_id) {
ASSERT(image.track_addr != 0 && image_begin < image.track_addr);
const auto size = image.track_addr - image_begin;
image.track_addr = image_begin;
tracker.UpdatePageWatchers<1>(image_begin, size);
page_manager.UpdatePageWatchers<1>(image_begin, size);
}
void TextureCache::TrackImageTail(ImageId image_id) {
@ -797,7 +830,7 @@ void TextureCache::TrackImageTail(ImageId image_id) {
const auto addr = image.track_addr_end;
const auto size = image_end - image.track_addr_end;
image.track_addr_end = image_end;
tracker.UpdatePageWatchers<1>(addr, size);
page_manager.UpdatePageWatchers<1>(addr, size);
}
void TextureCache::UntrackImage(ImageId image_id) {
@ -810,7 +843,7 @@ void TextureCache::UntrackImage(ImageId image_id) {
image.track_addr = 0;
image.track_addr_end = 0;
if (size != 0) {
tracker.UpdatePageWatchers<false>(addr, size);
page_manager.UpdatePageWatchers<false>(addr, size);
}
}
@ -820,7 +853,7 @@ void TextureCache::UntrackImageHead(ImageId image_id) {
if (!image.IsTracked() || image_begin < image.track_addr) {
return;
}
const auto addr = tracker.GetNextPageAddr(image_begin);
const auto addr = page_manager.GetNextPageAddr(image_begin);
const auto size = addr - image_begin;
image.track_addr = addr;
if (image.track_addr == image.track_addr_end) {
@ -829,7 +862,7 @@ void TextureCache::UntrackImageHead(ImageId image_id) {
// Check its hash later.
MarkAsMaybeDirty(image_id, image);
}
tracker.UpdatePageWatchers<false>(image_begin, size);
page_manager.UpdatePageWatchers<false>(image_begin, size);
}
void TextureCache::UntrackImageTail(ImageId image_id) {
@ -839,7 +872,7 @@ void TextureCache::UntrackImageTail(ImageId image_id) {
return;
}
ASSERT(image.track_addr_end != 0);
const auto addr = tracker.GetPageAddr(image_end);
const auto addr = page_manager.GetPageAddr(image_end);
const auto size = image_end - addr;
image.track_addr_end = addr;
if (image.track_addr == image.track_addr_end) {
@ -848,7 +881,7 @@ void TextureCache::UntrackImageTail(ImageId image_id) {
// Check its hash later.
MarkAsMaybeDirty(image_id, image);
}
tracker.UpdatePageWatchers<false>(addr, size);
page_manager.UpdatePageWatchers<false>(addr, size);
}
void TextureCache::RunGarbageCollector() {
@ -866,6 +899,7 @@ void TextureCache::RunGarbageCollector() {
bool aggresive = false;
u64 ticks_to_destroy = 0;
size_t num_deletions = 0;
boost::container::small_vector<ImageId, 8> download_pending;
const auto configure = [&](bool allow_aggressive) {
pressured = total_used_memory >= pressure_gc_memory;
@ -880,19 +914,19 @@ void TextureCache::RunGarbageCollector() {
}
--num_deletions;
auto& image = slot_images[image_id];
const bool download = image.SafeToDownload();
const bool tiled = image.info.IsTiled();
if (tiled && download) {
// This is a workaround for now. We can't handle non-linear image downloads.
return false;
}
const bool download =
image.SafeToDownload() && False(image.flags & ImageFlagBits::MaybeReused);
if (download && !pressured) {
return false;
}
if (download) {
DownloadImageMemory(image_id);
download_pending.push_back(image_id);
buffer_cache.ReadEdgeImagePages(image);
UntrackImage(image_id);
UnregisterImage(image_id);
} else {
FreeImage(image_id);
}
FreeImage(image_id);
if (total_used_memory < critical_gc_memory) {
if (aggresive) {
num_deletions >>= 2;
@ -916,10 +950,26 @@ void TextureCache::RunGarbageCollector() {
configure(true);
lru_cache.ForEachItemBelow(gc_tick - ticks_to_destroy, clean_up);
}
for (const auto& image_id : download_pending) {
DownloadImageMemory<false>(image_id);
DeleteImage(image_id);
}
if (!download_pending.empty()) {
// We need to make these downloads synchronous. It is possible that the contents
// of the image are requested before they are downloaded, in which case
// outdated buffer cache contents would be used instead.
scheduler.Finish();
scheduler.PopPendingOperations();
}
}
void TextureCache::TouchImage(const Image& image) {
void TextureCache::TouchImage(Image& image) {
lru_cache.Touch(image.lru_id, gc_tick);
// Image is still valid
image.flags &= ~ImageFlagBits::MaybeReused;
}
void TextureCache::DeleteImage(ImageId image_id) {

View File

@ -3,12 +3,9 @@
#pragma once
#include <condition_variable>
#include <mutex>
#include <thread>
#include <unordered_set>
#include <boost/container/small_vector.hpp>
#include <queue>
#include <tsl/robin_map.h>
#include "common/lru_cache.h"
@ -77,7 +74,8 @@ public:
public:
TextureCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
AmdGpu::Liverpool* liverpool, BufferCache& buffer_cache, PageManager& tracker);
AmdGpu::Liverpool* liverpool, BufferCache& buffer_cache,
PageManager& page_manager);
~TextureCache();
TileManager& GetTileManager() noexcept {
@ -90,6 +88,9 @@ public:
/// Marks an image as dirty if it exists at the provided address.
void InvalidateMemoryFromGPU(VAddr address, size_t max_size);
/// Marks an image as maybe reused if it exists within the provided range.
void MarkAsMaybeReused(VAddr addr, size_t size);
/// Evicts any images that overlap the unmapped range.
void UnmapMemory(VAddr cpu_addr, size_t size);
@ -255,6 +256,7 @@ private:
ImageId GetNullImage(vk::Format format);
/// Copies image memory back to CPU.
template <bool priority>
void DownloadImageMemory(ImageId image_id);
/// Thread function for copying downloaded images out to CPU memory.
@ -285,7 +287,7 @@ private:
void DeleteImage(ImageId image_id);
/// Touch the image in the LRU cache.
void TouchImage(const Image& image);
void TouchImage(Image& image);
void FreeImage(ImageId image_id) {
UntrackImage(image_id);
@ -298,7 +300,7 @@ private:
Vulkan::Scheduler& scheduler;
AmdGpu::Liverpool* liverpool;
BufferCache& buffer_cache;
PageManager& tracker;
PageManager& page_manager;
BlitHelper blit_helper;
TileManager tile_manager;
Common::SlotVector<Image> slot_images;