From 159be2c7f436a91ee85f8f58465974ca64a9425c Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Mon, 5 Aug 2024 13:45:28 +0300 Subject: [PATCH 01/11] video_core: Minor fixes (#366) * data_share: Fix DS instruction * vk_graphics_pipeline: Fix unnecessary invalidate * spirv: Remove subgroup id * vector_alu: Simplify mbcnt pattern * shader_recompiler: More instructions * clang format * kernel: Fix cond memory leak and reduce spam * liverpool: Print error on exception * build fix --- .../libraries/kernel/thread_management.cpp | 7 ++- .../backend/spirv/emit_spirv_warp.cpp | 2 +- .../backend/spirv/spirv_emit_context.cpp | 1 - .../backend/spirv/spirv_emit_context.h | 1 - .../frontend/translate/data_share.cpp | 22 ++++++---- .../frontend/translate/translate.h | 2 + .../frontend/translate/vector_alu.cpp | 43 +++++++++++-------- src/video_core/amdgpu/liverpool.h | 8 +++- .../renderer_vulkan/vk_compute_pipeline.cpp | 4 +- 9 files changed, 55 insertions(+), 35 deletions(-) diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index c5237d0a..3393138d 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -727,6 +727,9 @@ int PS4_SYSV_ABI scePthreadCondDestroy(ScePthreadCond* cond) { LOG_INFO(Kernel_Pthread, "scePthreadCondDestroy, result={}", result); + delete *cond; + *cond = nullptr; + switch (result) { case 0: return SCE_OK; @@ -1142,7 +1145,7 @@ int PS4_SYSV_ABI scePthreadCondWait(ScePthreadCond* cond, ScePthreadMutex* mutex } int result = pthread_cond_wait(&(*cond)->cond, &(*mutex)->pth_mutex); - LOG_INFO(Kernel_Pthread, "scePthreadCondWait, result={}", result); + LOG_DEBUG(Kernel_Pthread, "scePthreadCondWait, result={}", result); switch (result) { case 0: @@ -1162,7 +1165,7 @@ int PS4_SYSV_ABI scePthreadCondattrDestroy(ScePthreadCondattr* attr) { } int result = pthread_condattr_destroy(&(*attr)->cond_attr); - LOG_INFO(Kernel_Pthread, "scePthreadCondattrDestroy: result = {} ", result); + LOG_DEBUG(Kernel_Pthread, "scePthreadCondattrDestroy: result = {} ", result); switch (result) { case 0: diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index bd4ac066..38afd90f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -11,7 +11,7 @@ Id SubgroupScope(EmitContext& ctx) { } Id EmitWarpId(EmitContext& ctx) { - return ctx.OpLoad(ctx.U32[1], ctx.subgroup_id); + UNREACHABLE(); } Id EmitLaneId(EmitContext& ctx) { diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index f7b30052..8ca8b7a3 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -225,7 +225,6 @@ void EmitContext::DefineInputs(const Info& info) { break; } case Stage::Fragment: - subgroup_id = DefineVariable(U32[1], spv::BuiltIn::SubgroupId, spv::StorageClass::Input); subgroup_local_invocation_id = DefineVariable( U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); Decorate(subgroup_local_invocation_id, spv::Decoration::Flat); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 34c13d3f..2aa1bf78 100644 --- 
a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -180,7 +180,6 @@ public: Id workgroup_id{}; Id local_invocation_id{}; - Id subgroup_id{}; Id subgroup_local_invocation_id{}; Id image_u32{}; diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 14837166..532e024e 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -48,7 +48,8 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn IR::VectorReg dst_reg{inst.dst[0].code}; if (is_pair) { // Pair loads are either 32 or 64-bit - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); + const u32 adj = bit_size == 32 ? 4 : 8; + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); if (bit_size == 32) { ir.SetVectorReg(dst_reg++, IR::U32{data0}); @@ -56,7 +57,7 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)}); } - const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); + const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); if (bit_size == 32) { ir.SetVectorReg(dst_reg++, IR::U32{data1}); @@ -65,11 +66,13 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)}); } } else if (bit_size == 64) { - const IR::Value data = ir.LoadShared(bit_size, is_signed, addr); + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); + const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)}); ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)}); } else { - const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr)}; + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); + const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)}; ir.SetVectorReg(dst_reg, data); } } @@ -79,7 +82,8 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; if (is_pair) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); + const u32 adj = bit_size == 32 ? 
4 : 8; + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data0), addr0); } else { @@ -87,7 +91,7 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI 64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)), addr0); } - const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); + const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data1), addr1); } else { @@ -96,11 +100,13 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI addr1); } } else if (bit_size == 64) { + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); const IR::Value data = ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); - ir.WriteShared(bit_size, data, addr); + ir.WriteShared(bit_size, data, addr0); } else { - ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr); + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); + ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); } } diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 8d1b7683..fe4457d2 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -125,6 +125,7 @@ public: void V_ADD_F32(const GcnInst& inst); void V_CVT_OFF_F32_I4(const GcnInst& inst); void V_MED3_F32(const GcnInst& inst); + void V_MED3_I32(const GcnInst& inst); void V_FLOOR_F32(const GcnInst& inst); void V_SUB_F32(const GcnInst& inst); void V_RCP_F32(const GcnInst& inst); @@ -159,6 +160,7 @@ public: void V_SUB_I32(const GcnInst& inst); void V_LSHR_B32(const GcnInst& inst); void V_ASHRREV_I32(const GcnInst& inst); + void V_ASHR_I32(const GcnInst& inst); void V_MAD_U32_U24(const GcnInst& inst); void V_RNDNE_F32(const GcnInst& inst); void V_BCNT_U32_B32(const GcnInst& inst); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 669ef7ca..89428c44 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -24,6 +24,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { return V_LSHR_B32(inst); case Opcode::V_ASHRREV_I32: return V_ASHRREV_I32(inst); + case Opcode::V_ASHR_I32: + return V_ASHR_I32(inst); case Opcode::V_LSHRREV_B32: return V_LSHRREV_B32(inst); case Opcode::V_NOT_B32: @@ -183,6 +185,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { return V_ADD_F32(inst); case Opcode::V_MED3_F32: return V_MED3_F32(inst); + case Opcode::V_MED3_I32: + return V_MED3_I32(inst); case Opcode::V_FLOOR_F32: return V_FLOOR_F32(inst); case Opcode::V_SUB_F32: @@ -479,6 +483,14 @@ void Translator::V_MED3_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx)); } +void Translator::V_MED3_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 src2{GetSrc(inst.src[2])}; + const IR::U32 mmx = ir.SMin(ir.SMax(src0, src1), src2); + SetDst(inst.dst[0], ir.SMax(ir.SMin(src0, src1), mmx)); +} + void Translator::V_FLOOR_F32(const GcnInst& inst) { const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::VectorReg dst_reg{inst.dst[0].code}; @@ -760,6 +772,12 @@ void 
Translator::V_ASHRREV_I32(const GcnInst& inst) { SetDst(inst.dst[0], ir.ShiftRightArithmetic(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F)))); } +void Translator::V_ASHR_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.ShiftRightArithmetic(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F)))); +} + void Translator::V_MAD_U32_U24(const GcnInst& inst) { V_MAD_I32_I24(inst, false); } @@ -925,25 +943,12 @@ void Translator::V_FFBL_B32(const GcnInst& inst) { void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; - const IR::U32 lane_id = ir.LaneId(); - - const auto [warp_half, mask_shift] = [&]() -> std::pair { - if (profile.subgroup_size == 32) { - const IR::U32 warp_half = ir.BitwiseAnd(ir.WarpId(), ir.Imm32(1)); - return std::make_pair(warp_half, lane_id); - } - const IR::U32 warp_half = ir.ShiftRightLogical(lane_id, ir.Imm32(5)); - const IR::U32 mask_shift = ir.BitwiseAnd(lane_id, ir.Imm32(0x1F)); - return std::make_pair(warp_half, mask_shift); - }(); - - const IR::U32 thread_mask = ir.ISub(ir.ShiftLeftLogical(ir.Imm32(1), mask_shift), ir.Imm32(1)); - const IR::U1 is_odd_warp = ir.INotEqual(warp_half, ir.Imm32(0)); - const IR::U32 mask = IR::U32{ir.Select(is_odd_warp, is_low ? ir.Imm32(~0U) : thread_mask, - is_low ? thread_mask : ir.Imm32(0))}; - const IR::U32 masked_value = ir.BitwiseAnd(src0, mask); - const IR::U32 result = ir.IAdd(src1, ir.BitCount(masked_value)); - SetDst(inst.dst[0], result); + if (!is_low) { + ASSERT(src0.IsImmediate() && src0.U32() == ~0U && src1.IsImmediate() && src1.U32() == 0U); + return; + } + ASSERT(src0.IsImmediate() && src0.U32() == ~0U); + SetDst(inst.dst[0], ir.LaneId()); } } // namespace Shader::Gcn diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index b0285809..400af031 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -1040,7 +1040,11 @@ private: return {}; } void unhandled_exception() { - UNREACHABLE(); + try { + std::rethrow_exception(std::current_exception()); + } catch (const std::exception& e) { + UNREACHABLE_MSG("Unhandled exception: {}", e.what()); + } } void return_void() {} struct empty {}; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 34f1e9cc..d8e5f7fa 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -94,7 +94,9 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s const auto vsharp = buffer.GetVsharp(info); const u32 size = vsharp.GetSize(); const VAddr address = vsharp.base_address; - texture_cache.OnCpuWrite(address); + if (buffer.is_storage) { + texture_cache.OnCpuWrite(address); + } const u32 offset = staging.Copy(address, size, buffer.is_storage ? 
instance.StorageMinAlignment() : instance.UniformMinAlignment()); From 381ba8c7a591c43f58056f5246ccc7baab1ecb95 Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Thu, 8 Aug 2024 15:02:10 +0300 Subject: [PATCH 02/11] video_core: Implement guest buffer manager (#373) * video_core: Introduce buffer cache * video_core: Use multi level page table for caches * renderer_vulkan: Remove unused stream buffer * fix build * oops forgot optimize off --- CMakeLists.txt | 15 +- .../object_pool.h | 4 +- src/common/unique_function.h | 61 +++ .../libraries/kernel/thread_management.cpp | 17 +- src/core/libraries/pad/pad.cpp | 4 +- src/core/memory.cpp | 96 +--- src/core/memory.h | 30 +- src/core/module.cpp | 1 + .../spirv/emit_spirv_context_get_set.cpp | 30 +- .../backend/spirv/spirv_emit_context.cpp | 80 +-- .../backend/spirv/spirv_emit_context.h | 19 +- .../frontend/control_flow_graph.cpp | 2 +- .../frontend/control_flow_graph.h | 6 +- src/shader_recompiler/frontend/module.h | 10 - .../frontend/structured_control_flow.cpp | 22 +- .../frontend/structured_control_flow.h | 5 +- .../frontend/translate/translate.cpp | 1 + .../frontend/translate/vector_memory.cpp | 5 + src/shader_recompiler/ir/basic_block.cpp | 2 +- src/shader_recompiler/ir/basic_block.h | 6 +- .../ir/passes/resource_tracking_pass.cpp | 101 ++-- src/shader_recompiler/recompiler.cpp | 10 +- src/shader_recompiler/recompiler.h | 6 +- src/shader_recompiler/runtime_info.h | 21 +- src/video_core/amdgpu/liverpool.h | 2 +- src/video_core/amdgpu/resource.h | 4 + src/video_core/buffer_cache/buffer.cpp | 227 ++++++++ src/video_core/buffer_cache/buffer.h | 173 ++++++ src/video_core/buffer_cache/buffer_cache.cpp | 497 ++++++++++++++++++ src/video_core/buffer_cache/buffer_cache.h | 129 +++++ .../buffer_cache/memory_tracker_base.h | 175 ++++++ src/video_core/buffer_cache/range_set.h | 159 ++++++ src/video_core/buffer_cache/word_manager.h | 398 ++++++++++++++ src/video_core/multi_level_page_table.h | 65 +++ src/video_core/page_manager.cpp | 260 +++++++++ src/video_core/page_manager.h | 39 ++ .../renderer_vulkan/renderer_vulkan.cpp | 4 +- .../renderer_vulkan/renderer_vulkan.h | 6 +- .../renderer_vulkan/vk_compute_pipeline.cpp | 63 ++- .../renderer_vulkan/vk_compute_pipeline.h | 10 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 132 ++--- .../renderer_vulkan/vk_graphics_pipeline.h | 15 +- .../renderer_vulkan/vk_instance.cpp | 12 +- .../renderer_vulkan/vk_pipeline_cache.h | 5 +- .../renderer_vulkan/vk_rasterizer.cpp | 87 +-- .../renderer_vulkan/vk_rasterizer.h | 27 +- src/video_core/renderer_vulkan/vk_scheduler.h | 7 +- .../renderer_vulkan/vk_stream_buffer.cpp | 241 --------- .../renderer_vulkan/vk_stream_buffer.h | 89 ---- src/video_core/texture_cache/image_info.cpp | 2 +- src/video_core/texture_cache/image_view.cpp | 15 +- .../texture_cache/texture_cache.cpp | 197 ++----- src/video_core/texture_cache/texture_cache.h | 71 +-- src/video_core/texture_cache/tile_manager.cpp | 66 ++- src/video_core/texture_cache/tile_manager.h | 5 +- 55 files changed, 2697 insertions(+), 1039 deletions(-) rename src/{shader_recompiler => common}/object_pool.h (98%) create mode 100755 src/common/unique_function.h delete mode 100644 src/shader_recompiler/frontend/module.h create mode 100644 src/video_core/buffer_cache/buffer.cpp create mode 100644 src/video_core/buffer_cache/buffer.h create mode 100644 src/video_core/buffer_cache/buffer_cache.cpp create mode 100644 src/video_core/buffer_cache/buffer_cache.h create mode 100644 
src/video_core/buffer_cache/memory_tracker_base.h create mode 100644 src/video_core/buffer_cache/range_set.h create mode 100644 src/video_core/buffer_cache/word_manager.h create mode 100644 src/video_core/multi_level_page_table.h create mode 100644 src/video_core/page_manager.cpp create mode 100644 src/video_core/page_manager.h delete mode 100644 src/video_core/renderer_vulkan/vk_stream_buffer.cpp delete mode 100644 src/video_core/renderer_vulkan/vk_stream_buffer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 90ba4d83..4df3db2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -283,6 +283,7 @@ set(COMMON src/common/logging/backend.cpp src/common/native_clock.h src/common/path_util.cpp src/common/path_util.h + src/common/object_pool.h src/common/polyfill_thread.h src/common/rdtsc.cpp src/common/rdtsc.h @@ -294,6 +295,7 @@ set(COMMON src/common/logging/backend.cpp src/common/thread.h src/common/types.h src/common/uint128.h + src/common/unique_function.h src/common/version.h src/common/ntapi.h src/common/ntapi.cpp @@ -367,7 +369,6 @@ set(CORE src/core/aerolib/stubs.cpp ) set(SHADER_RECOMPILER src/shader_recompiler/exception.h - src/shader_recompiler/object_pool.h src/shader_recompiler/profile.h src/shader_recompiler/recompiler.cpp src/shader_recompiler/recompiler.h @@ -451,6 +452,13 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/amdgpu/pm4_cmds.h src/video_core/amdgpu/pm4_opcodes.h src/video_core/amdgpu/resource.h + src/video_core/buffer_cache/buffer.cpp + src/video_core/buffer_cache/buffer.h + src/video_core/buffer_cache/buffer_cache.cpp + src/video_core/buffer_cache/buffer_cache.h + src/video_core/buffer_cache/memory_tracker_base.h + src/video_core/buffer_cache/range_set.h + src/video_core/buffer_cache/word_manager.h src/video_core/renderer_vulkan/liverpool_to_vk.cpp src/video_core/renderer_vulkan/liverpool_to_vk.h src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -479,8 +487,6 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/renderer_vulkan/vk_scheduler.h src/video_core/renderer_vulkan/vk_shader_util.cpp src/video_core/renderer_vulkan/vk_shader_util.h - src/video_core/renderer_vulkan/vk_stream_buffer.cpp - src/video_core/renderer_vulkan/vk_stream_buffer.h src/video_core/renderer_vulkan/vk_swapchain.cpp src/video_core/renderer_vulkan/vk_swapchain.h src/video_core/texture_cache/image.cpp @@ -496,6 +502,9 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/texture_cache/tile_manager.cpp src/video_core/texture_cache/tile_manager.h src/video_core/texture_cache/types.h + src/video_core/page_manager.cpp + src/video_core/page_manager.h + src/video_core/multi_level_page_table.h src/video_core/renderdoc.cpp src/video_core/renderdoc.h ) diff --git a/src/shader_recompiler/object_pool.h b/src/common/object_pool.h similarity index 98% rename from src/shader_recompiler/object_pool.h rename to src/common/object_pool.h index 1398898a..9e25e0c4 100644 --- a/src/shader_recompiler/object_pool.h +++ b/src/common/object_pool.h @@ -8,7 +8,7 @@ #include #include -namespace Shader { +namespace Common { template requires std::is_destructible_v @@ -104,4 +104,4 @@ private: size_t new_chunk_size{}; }; -} // namespace Shader +} // namespace Common diff --git a/src/common/unique_function.h b/src/common/unique_function.h new file mode 100755 index 00000000..1891ec3c --- /dev/null +++ b/src/common/unique_function.h @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma 
once + +#include +#include + +namespace Common { + +/// General purpose function wrapper similar to std::function. +/// Unlike std::function, the captured values don't have to be copyable. +/// This class can be moved but not copied. +template +class UniqueFunction { + class CallableBase { + public: + virtual ~CallableBase() = default; + virtual ResultType operator()(Args&&...) = 0; + }; + + template + class Callable final : public CallableBase { + public: + Callable(Functor&& functor_) : functor{std::move(functor_)} {} + ~Callable() override = default; + + ResultType operator()(Args&&... args) override { + return functor(std::forward(args)...); + } + + private: + Functor functor; + }; + +public: + UniqueFunction() = default; + + template + UniqueFunction(Functor&& functor) + : callable{std::make_unique>(std::move(functor))} {} + + UniqueFunction& operator=(UniqueFunction&& rhs) noexcept = default; + UniqueFunction(UniqueFunction&& rhs) noexcept = default; + + UniqueFunction& operator=(const UniqueFunction&) = delete; + UniqueFunction(const UniqueFunction&) = delete; + + ResultType operator()(Args&&... args) const { + return (*callable)(std::forward(args)...); + } + + explicit operator bool() const noexcept { + return static_cast(callable); + } + +private: + std::unique_ptr callable; +}; + +} // namespace Common diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 3393138d..48347ea5 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -465,7 +465,7 @@ int PS4_SYSV_ABI scePthreadMutexDestroy(ScePthreadMutex* mutex) { int result = pthread_mutex_destroy(&(*mutex)->pth_mutex); - LOG_INFO(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result); + LOG_DEBUG(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result); delete *mutex; *mutex = nullptr; @@ -725,7 +725,7 @@ int PS4_SYSV_ABI scePthreadCondDestroy(ScePthreadCond* cond) { } int result = pthread_cond_destroy(&(*cond)->cond); - LOG_INFO(Kernel_Pthread, "scePthreadCondDestroy, result={}", result); + LOG_DEBUG(Kernel_Pthread, "scePthreadCondDestroy, result={}", result); delete *cond; *cond = nullptr; @@ -811,8 +811,6 @@ int PS4_SYSV_ABI posix_pthread_cond_timedwait(ScePthreadCond* cond, ScePthreadMu } int PS4_SYSV_ABI posix_pthread_cond_broadcast(ScePthreadCond* cond) { - LOG_INFO(Kernel_Pthread, - "posix posix_pthread_cond_broadcast redirect to scePthreadCondBroadcast"); int result = scePthreadCondBroadcast(cond); if (result != 0) { int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP @@ -824,7 +822,6 @@ int PS4_SYSV_ABI posix_pthread_cond_broadcast(ScePthreadCond* cond) { } int PS4_SYSV_ABI posix_pthread_mutexattr_init(ScePthreadMutexattr* attr) { - // LOG_INFO(Kernel_Pthread, "posix pthread_mutexattr_init redirect to scePthreadMutexattrInit"); int result = scePthreadMutexattrInit(attr); if (result < 0) { int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP @@ -836,7 +833,6 @@ int PS4_SYSV_ABI posix_pthread_mutexattr_init(ScePthreadMutexattr* attr) { } int PS4_SYSV_ABI posix_pthread_mutexattr_settype(ScePthreadMutexattr* attr, int type) { - // LOG_INFO(Kernel_Pthread, "posix pthread_mutex_init redirect to scePthreadMutexInit"); int result = scePthreadMutexattrSettype(attr, type); if (result < 0) { int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP @@ -861,7 +857,6 @@ int PS4_SYSV_ABI posix_pthread_once(pthread_once_t* 
once_control, void (*init_ro int PS4_SYSV_ABI posix_pthread_mutexattr_setprotocol(ScePthreadMutexattr* attr, int protocol) { int result = scePthreadMutexattrSetprotocol(attr, protocol); - LOG_INFO(Kernel_Pthread, "redirect to scePthreadMutexattrSetprotocol: result = {}", result); if (result < 0) { UNREACHABLE(); } @@ -1295,8 +1290,6 @@ int PS4_SYSV_ABI posix_pthread_attr_setdetachstate(ScePthreadAttr* attr, int det int PS4_SYSV_ABI posix_pthread_create_name_np(ScePthread* thread, const ScePthreadAttr* attr, PthreadEntryFunc start_routine, void* arg, const char* name) { - LOG_INFO(Kernel_Pthread, "posix pthread_create redirect to scePthreadCreate: name = {}", name); - int result = scePthreadCreate(thread, attr, start_routine, arg, name); if (result != 0) { int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP @@ -1343,17 +1336,11 @@ int PS4_SYSV_ABI posix_pthread_cond_init(ScePthreadCond* cond, const ScePthreadC int PS4_SYSV_ABI posix_pthread_cond_signal(ScePthreadCond* cond) { int result = scePthreadCondSignal(cond); - LOG_INFO(Kernel_Pthread, - "posix posix_pthread_cond_signal redirect to scePthreadCondSignal, result = {}", - result); return result; } int PS4_SYSV_ABI posix_pthread_cond_destroy(ScePthreadCond* cond) { int result = scePthreadCondDestroy(cond); - LOG_INFO(Kernel_Pthread, - "posix posix_pthread_cond_destroy redirect to scePthreadCondDestroy, result = {}", - result); return result; } diff --git a/src/core/libraries/pad/pad.cpp b/src/core/libraries/pad/pad.cpp index e318e152..064c71b8 100644 --- a/src/core/libraries/pad/pad.cpp +++ b/src/core/libraries/pad/pad.cpp @@ -470,7 +470,7 @@ int PS4_SYSV_ABI scePadSetUserColor() { } int PS4_SYSV_ABI scePadSetVibration(s32 handle, const OrbisPadVibrationParam* pParam) { - LOG_ERROR(Lib_Pad, "(STUBBED) called"); + LOG_DEBUG(Lib_Pad, "(STUBBED) called"); return ORBIS_OK; } @@ -665,4 +665,4 @@ void RegisterlibScePad(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("7xA+hFtvBCA", "libScePad", 1, "libScePad", 1, 1, Func_EF103E845B6F0420); }; -} // namespace Libraries::Pad \ No newline at end of file +} // namespace Libraries::Pad diff --git a/src/core/memory.cpp b/src/core/memory.cpp index aa552d51..dc5ded41 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -7,7 +7,7 @@ #include "core/libraries/error_codes.h" #include "core/libraries/kernel/memory_management.h" #include "core/memory.h" -#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" namespace Core { @@ -172,7 +172,7 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M if (type == VMAType::Direct) { new_vma.phys_base = phys_addr; - MapVulkanMemory(mapped_addr, size); + rasterizer->MapMemory(mapped_addr, size); } if (type == VMAType::Flexible) { flexible_usage += size; @@ -222,7 +222,7 @@ void MemoryManager::UnmapMemory(VAddr virtual_addr, size_t size) { const auto type = it->second.type; const bool has_backing = type == VMAType::Direct || type == VMAType::File; if (type == VMAType::Direct) { - UnmapVulkanMemory(virtual_addr, size); + rasterizer->UnmapMemory(virtual_addr, size); } if (type == VMAType::Flexible) { flexible_usage -= size; @@ -263,7 +263,7 @@ int MemoryManager::QueryProtection(VAddr addr, void** start, void** end, u32* pr } int MemoryManager::VirtualQuery(VAddr addr, int flags, - Libraries::Kernel::OrbisVirtualQueryInfo* info) { + ::Libraries::Kernel::OrbisVirtualQueryInfo* info) { std::scoped_lock lk{mutex}; auto it = FindVMA(addr); @@ -293,7 +293,7 
@@ int MemoryManager::VirtualQuery(VAddr addr, int flags, } int MemoryManager::DirectMemoryQuery(PAddr addr, bool find_next, - Libraries::Kernel::OrbisQueryInfo* out_info) { + ::Libraries::Kernel::OrbisQueryInfo* out_info) { std::scoped_lock lk{mutex}; auto dmem_area = FindDmemArea(addr); @@ -333,13 +333,6 @@ int MemoryManager::DirectQueryAvailable(PAddr search_start, PAddr search_end, si return ORBIS_OK; } -std::pair MemoryManager::GetVulkanBuffer(VAddr addr) { - auto it = mapped_memories.upper_bound(addr); - it = std::prev(it); - ASSERT(it != mapped_memories.end() && it->first <= addr); - return std::make_pair(*it->second.buffer, addr - it->first); -} - void MemoryManager::NameVirtualRange(VAddr virtual_addr, size_t size, std::string_view name) { auto it = FindVMA(virtual_addr); @@ -455,85 +448,6 @@ MemoryManager::DMemHandle MemoryManager::Split(DMemHandle dmem_handle, size_t of return dmem_map.emplace_hint(std::next(dmem_handle), new_area.base, new_area); }; -void MemoryManager::MapVulkanMemory(VAddr addr, size_t size) { - return; - const vk::Device device = instance->GetDevice(); - const auto memory_props = instance->GetPhysicalDevice().getMemoryProperties(); - void* host_pointer = reinterpret_cast(addr); - const auto host_mem_props = device.getMemoryHostPointerPropertiesEXT( - vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, host_pointer); - ASSERT(host_mem_props.memoryTypeBits != 0); - - int mapped_memory_type = -1; - auto find_mem_type_with_flag = [&](const vk::MemoryPropertyFlags flags) { - u32 host_mem_types = host_mem_props.memoryTypeBits; - while (host_mem_types != 0) { - // Try to find a cached memory type - mapped_memory_type = std::countr_zero(host_mem_types); - host_mem_types -= (1 << mapped_memory_type); - - if ((memory_props.memoryTypes[mapped_memory_type].propertyFlags & flags) == flags) { - return; - } - } - - mapped_memory_type = -1; - }; - - // First try to find a memory that is both coherent and cached - find_mem_type_with_flag(vk::MemoryPropertyFlagBits::eHostCoherent | - vk::MemoryPropertyFlagBits::eHostCached); - if (mapped_memory_type == -1) - // Then only coherent (lower performance) - find_mem_type_with_flag(vk::MemoryPropertyFlagBits::eHostCoherent); - - if (mapped_memory_type == -1) { - LOG_CRITICAL(Render_Vulkan, "No coherent memory available for memory mapping"); - mapped_memory_type = std::countr_zero(host_mem_props.memoryTypeBits); - } - - const vk::StructureChain alloc_info = { - vk::MemoryAllocateInfo{ - .allocationSize = size, - .memoryTypeIndex = static_cast(mapped_memory_type), - }, - vk::ImportMemoryHostPointerInfoEXT{ - .handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, - .pHostPointer = host_pointer, - }, - }; - - const auto [it, new_memory] = mapped_memories.try_emplace(addr); - ASSERT_MSG(new_memory, "Attempting to remap already mapped vulkan memory"); - - auto& memory = it->second; - memory.backing = device.allocateMemoryUnique(alloc_info.get()); - - constexpr vk::BufferUsageFlags MapFlags = - vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer | - vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst | - vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer; - - const vk::StructureChain buffer_info = { - vk::BufferCreateInfo{ - .size = size, - .usage = MapFlags, - .sharingMode = vk::SharingMode::eExclusive, - }, - vk::ExternalMemoryBufferCreateInfoKHR{ - .handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, - }}; - 
memory.buffer = device.createBufferUnique(buffer_info.get()); - device.bindBufferMemory(*memory.buffer, *memory.backing, 0); -} - -void MemoryManager::UnmapVulkanMemory(VAddr addr, size_t size) { - return; - const auto it = mapped_memories.find(addr); - ASSERT(it != mapped_memories.end() && it->second.buffer_size == size); - mapped_memories.erase(it); -} - int MemoryManager::GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut, void** directMemoryStartOut, void** directMemoryEndOut) { std::scoped_lock lk{mutex}; diff --git a/src/core/memory.h b/src/core/memory.h index 2b3d07a7..6d0a977f 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -3,20 +3,17 @@ #pragma once -#include +#include #include #include -#include -#include #include "common/enum.h" #include "common/singleton.h" #include "common/types.h" #include "core/address_space.h" #include "core/libraries/kernel/memory_management.h" -#include "video_core/renderer_vulkan/vk_common.h" namespace Vulkan { -class Instance; +class Rasterizer; } namespace Libraries::Kernel { @@ -128,8 +125,8 @@ public: explicit MemoryManager(); ~MemoryManager(); - void SetInstance(const Vulkan::Instance* instance_) { - instance = instance_; + void SetRasterizer(Vulkan::Rasterizer* rasterizer_) { + rasterizer = rasterizer_; } void SetTotalFlexibleSize(u64 size) { @@ -140,9 +137,7 @@ public: return total_flexible_size - flexible_usage; } - /// Returns the offset of the mapped virtual system managed memory base from where it usually - /// would be mapped. - [[nodiscard]] VAddr SystemReservedVirtualBase() noexcept { + VAddr SystemReservedVirtualBase() noexcept { return impl.SystemReservedVirtualBase(); } @@ -172,8 +167,6 @@ public: int DirectQueryAvailable(PAddr search_start, PAddr search_end, size_t alignment, PAddr* phys_addr_out, size_t* size_out); - std::pair GetVulkanBuffer(VAddr addr); - int GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut, void** directMemoryStartOut, void** directMemoryEndOut); @@ -218,10 +211,6 @@ private: DMemHandle Split(DMemHandle dmem_handle, size_t offset_in_area); - void MapVulkanMemory(VAddr addr, size_t size); - - void UnmapVulkanMemory(VAddr addr, size_t size); - private: AddressSpace impl; DMemMap dmem_map; @@ -229,14 +218,7 @@ private: std::recursive_mutex mutex; size_t total_flexible_size = 448_MB; size_t flexible_usage{}; - - struct MappedMemory { - vk::UniqueBuffer buffer; - vk::UniqueDeviceMemory backing; - size_t buffer_size; - }; - std::map mapped_memories; - const Vulkan::Instance* instance{}; + Vulkan::Rasterizer* rasterizer{}; }; using Memory = Common::Singleton; diff --git a/src/core/module.cpp b/src/core/module.cpp index d885b917..775e1ef1 100644 --- a/src/core/module.cpp +++ b/src/core/module.cpp @@ -88,6 +88,7 @@ void Module::LoadModuleToMemory(u32& max_tls_index) { aligned_base_size + TrampolineSize, MemoryProt::CpuReadWrite, MemoryMapFlags::Fixed, VMAType::Code, name, true); LoadOffset += CODE_BASE_INCR * (1 + aligned_base_size / CODE_BASE_INCR); + LOG_INFO(Core_Linker, "Loading module {} to {}", name, fmt::ptr(*out_addr)); // Initialize trampoline generator. 
void* trampoline_addr = std::bit_cast(base_virtual_addr + aligned_base_size); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 02480303..40d6cdb7 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -21,6 +21,7 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) { case VsOutput::ClipDist7: { const u32 index = u32(output) - u32(VsOutput::ClipDist0); const Id clip_num{ctx.ConstU32(index)}; + ASSERT_MSG(Sirit::ValidId(ctx.clip_distances), "Clip distance used but not defined"); return ctx.OpAccessChain(ctx.output_f32, ctx.clip_distances, clip_num); } case VsOutput::CullDist0: @@ -33,6 +34,7 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) { case VsOutput::CullDist7: { const u32 index = u32(output) - u32(VsOutput::CullDist0); const Id cull_num{ctx.ConstU32(index)}; + ASSERT_MSG(Sirit::ValidId(ctx.cull_distances), "Cull distance used but not defined"); return ctx.OpAccessChain(ctx.output_f32, ctx.cull_distances, cull_num); } default: @@ -125,7 +127,12 @@ Id EmitReadConst(EmitContext& ctx) { } Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { - const auto& buffer = ctx.buffers[handle]; + auto& buffer = ctx.buffers[handle]; + if (!Sirit::ValidId(buffer.offset)) { + buffer.offset = ctx.GetBufferOffset(handle); + } + const Id offset_dwords{ctx.OpShiftRightLogical(ctx.U32[1], buffer.offset, ctx.ConstU32(2U))}; + index = ctx.OpIAdd(ctx.U32[1], index, offset_dwords); const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; return ctx.OpLoad(buffer.data_types->Get(1), ptr); } @@ -137,7 +144,7 @@ Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index) { Id EmitReadStepRate(EmitContext& ctx, int rate_idx) { return ctx.OpLoad( ctx.U32[1], ctx.OpAccessChain(ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1]), - ctx.instance_step_rates, + ctx.push_data_block, rate_idx == 0 ? 
ctx.u32_zero_value : ctx.u32_one_value)); } @@ -221,7 +228,11 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { template static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) { - const auto& buffer = ctx.buffers[handle]; + auto& buffer = ctx.buffers[handle]; + if (!Sirit::ValidId(buffer.offset)) { + buffer.offset = ctx.GetBufferOffset(handle); + } + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; @@ -314,7 +325,7 @@ static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offs } static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) { - const auto& buffer = ctx.buffers[handle]; + auto& buffer = ctx.buffers[handle]; const auto format = buffer.buffer.GetDataFmt(); switch (format) { case AmdGpu::DataFormat::FormatInvalid: @@ -399,6 +410,11 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com template static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + auto& buffer = ctx.buffers[handle]; + if (!Sirit::ValidId(buffer.offset)) { + buffer.offset = ctx.GetBufferOffset(handle); + } + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); if constexpr (N == 1) { return GetBufferFormatValue(ctx, handle, address, 0); } else { @@ -428,7 +444,11 @@ Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id ad template static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { - const auto& buffer = ctx.buffers[handle]; + auto& buffer = ctx.buffers[handle]; + if (!Sirit::ValidId(buffer.offset)) { + buffer.offset = ctx.GetBufferOffset(handle); + } + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 8ca8b7a3..cdf417fc 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -46,9 +46,9 @@ EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& bin stage{program.info.stage}, binding{binding_} { AddCapability(spv::Capability::Shader); DefineArithmeticTypes(); - DefineInterfaces(program); - DefineBuffers(info); - DefineImagesAndSamplers(info); + DefineInterfaces(); + DefineBuffers(); + DefineImagesAndSamplers(); DefineSharedMemory(); } @@ -117,9 +117,10 @@ void EmitContext::DefineArithmeticTypes() { full_result_u32x2 = Name(TypeStruct(U32[1], U32[1]), "full_result_u32x2"); } -void EmitContext::DefineInterfaces(const IR::Program& program) { - DefineInputs(program.info); - DefineOutputs(program.info); +void EmitContext::DefineInterfaces() { + DefinePushDataBlock(); + DefineInputs(); + DefineOutputs(); } Id GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) { @@ -164,6 +165,16 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f throw InvalidArgument("Invalid attribute type {}", fmt); } +Id EmitContext::GetBufferOffset(u32 binding) { + const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4); + 
const u32 comp = (binding & 0xf) >> 2; + const u32 offset = (binding & 0x3) << 3; + const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), + push_data_block, ConstU32(half), ConstU32(comp))}; + const Id value{OpLoad(U32[1], ptr)}; + return OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); +} + Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { switch (default_value) { case 0: @@ -179,24 +190,13 @@ Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { } } -void EmitContext::DefineInputs(const Info& info) { +void EmitContext::DefineInputs() { switch (stage) { case Stage::Vertex: { vertex_index = DefineVariable(U32[1], spv::BuiltIn::VertexIndex, spv::StorageClass::Input); base_vertex = DefineVariable(U32[1], spv::BuiltIn::BaseVertex, spv::StorageClass::Input); instance_id = DefineVariable(U32[1], spv::BuiltIn::InstanceIndex, spv::StorageClass::Input); - // Create push constants block for instance steps rates - const Id struct_type{Name(TypeStruct(U32[1], U32[1]), "instance_step_rates")}; - Decorate(struct_type, spv::Decoration::Block); - MemberName(struct_type, 0, "sr0"); - MemberName(struct_type, 1, "sr1"); - MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U); - MemberDecorate(struct_type, 1, spv::Decoration::Offset, 4U); - instance_step_rates = DefineVar(struct_type, spv::StorageClass::PushConstant); - Name(instance_step_rates, "step_rates"); - interfaces.push_back(instance_step_rates); - for (const auto& input : info.vs_inputs) { const Id type{GetAttributeType(*this, input.fmt)}; if (input.instance_step_rate == Info::VsInput::InstanceIdType::OverStepRate0 || @@ -260,19 +260,20 @@ void EmitContext::DefineInputs(const Info& info) { } } -void EmitContext::DefineOutputs(const Info& info) { +void EmitContext::DefineOutputs() { switch (stage) { case Stage::Vertex: { output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); - const std::array zero{f32_zero_value, f32_zero_value, f32_zero_value, - f32_zero_value, f32_zero_value, f32_zero_value, - f32_zero_value, f32_zero_value}; - const Id type{TypeArray(F32[1], ConstU32(8U))}; - const Id initializer{ConstantComposite(type, zero)}; - clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output, - initializer); - cull_distances = DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output, - initializer); + const bool has_extra_pos_stores = info.stores.Get(IR::Attribute::Position1) || + info.stores.Get(IR::Attribute::Position2) || + info.stores.Get(IR::Attribute::Position3); + if (has_extra_pos_stores) { + const Id type{TypeArray(F32[1], ConstU32(8U))}; + clip_distances = + DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output); + cull_distances = + DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output); + } for (u32 i = 0; i < IR::NumParams; i++) { const IR::Attribute param{IR::Attribute::Param0 + i}; if (!info.stores.GetAny(param)) { @@ -304,7 +305,24 @@ void EmitContext::DefineOutputs(const Info& info) { } } -void EmitContext::DefineBuffers(const Info& info) { +void EmitContext::DefinePushDataBlock() { + // Create push constants block for instance steps rates + const Id struct_type{Name(TypeStruct(U32[1], U32[1], U32[4], U32[4]), "AuxData")}; + Decorate(struct_type, spv::Decoration::Block); + MemberName(struct_type, 0, "sr0"); + MemberName(struct_type, 1, "sr1"); + MemberName(struct_type, 2, "buf_offsets0"); + MemberName(struct_type, 3, "buf_offsets1"); + 
MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U); + MemberDecorate(struct_type, 1, spv::Decoration::Offset, 4U); + MemberDecorate(struct_type, 2, spv::Decoration::Offset, 8U); + MemberDecorate(struct_type, 3, spv::Decoration::Offset, 24U); + push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant); + Name(push_data_block, "push_data"); + interfaces.push_back(push_data_block); +} + +void EmitContext::DefineBuffers() { boost::container::small_vector type_ids; for (u32 i = 0; const auto& buffer : info.buffers) { const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32; @@ -322,8 +340,8 @@ void EmitContext::DefineBuffers(const Info& info) { Decorate(struct_type, spv::Decoration::Block); MemberName(struct_type, 0, "data"); MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U); + type_ids.push_back(record_array_type); } - type_ids.push_back(record_array_type); const auto storage_class = buffer.is_storage ? spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform; @@ -430,7 +448,7 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { throw InvalidArgument("Invalid texture type {}", desc.type); } -void EmitContext::DefineImagesAndSamplers(const Info& info) { +void EmitContext::DefineImagesAndSamplers() { for (const auto& image_desc : info.images) { const VectorIds* data_types = [&] { switch (image_desc.nfmt) { diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 2aa1bf78..ff9ec4b7 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -40,6 +40,7 @@ public: ~EmitContext(); Id Def(const IR::Value& value); + Id GetBufferOffset(u32 binding); [[nodiscard]] Id DefineInput(Id type, u32 location) { const Id input_id{DefineVar(type, spv::StorageClass::Input)}; @@ -168,7 +169,7 @@ public: Id output_position{}; Id vertex_index{}; Id instance_id{}; - Id instance_step_rates{}; + Id push_data_block{}; Id base_vertex{}; Id frag_coord{}; Id front_facing{}; @@ -201,14 +202,15 @@ public: struct BufferDefinition { Id id; + Id offset; const VectorIds* data_types; Id pointer_type; AmdGpu::Buffer buffer; }; u32& binding; - boost::container::small_vector buffers; - boost::container::small_vector images; + boost::container::small_vector buffers; + boost::container::small_vector images; boost::container::small_vector samplers; Id sampler_type{}; @@ -227,11 +229,12 @@ public: private: void DefineArithmeticTypes(); - void DefineInterfaces(const IR::Program& program); - void DefineInputs(const Info& info); - void DefineOutputs(const Info& info); - void DefineBuffers(const Info& info); - void DefineImagesAndSamplers(const Info& info); + void DefineInterfaces(); + void DefineInputs(); + void DefineOutputs(); + void DefinePushDataBlock(); + void DefineBuffers(); + void DefineImagesAndSamplers(); void DefineSharedMemory(); SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id); diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp index 5eadae1b..2925c05d 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.cpp +++ b/src/shader_recompiler/frontend/control_flow_graph.cpp @@ -40,7 +40,7 @@ static IR::Condition MakeCondition(Opcode opcode) { } } -CFG::CFG(ObjectPool& block_pool_, std::span inst_list_) +CFG::CFG(Common::ObjectPool& block_pool_, std::span inst_list_) : block_pool{block_pool_}, 
inst_list{inst_list_} { index_to_pc.resize(inst_list.size() + 1); EmitLabels(); diff --git a/src/shader_recompiler/frontend/control_flow_graph.h b/src/shader_recompiler/frontend/control_flow_graph.h index 07190087..ebe614ee 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.h +++ b/src/shader_recompiler/frontend/control_flow_graph.h @@ -8,10 +8,10 @@ #include #include +#include "common/object_pool.h" #include "common/types.h" #include "shader_recompiler/frontend/instruction.h" #include "shader_recompiler/ir/condition.h" -#include "shader_recompiler/object_pool.h" namespace Shader::Gcn { @@ -49,7 +49,7 @@ class CFG { using Label = u32; public: - explicit CFG(ObjectPool& block_pool, std::span inst_list); + explicit CFG(Common::ObjectPool& block_pool, std::span inst_list); [[nodiscard]] std::string Dot() const; @@ -59,7 +59,7 @@ private: void LinkBlocks(); public: - ObjectPool& block_pool; + Common::ObjectPool& block_pool; std::span inst_list; std::vector index_to_pc; boost::container::small_vector labels; diff --git a/src/shader_recompiler/frontend/module.h b/src/shader_recompiler/frontend/module.h deleted file mode 100644 index 3901f021..00000000 --- a/src/shader_recompiler/frontend/module.h +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -namespace Shader::Gcn { - -void Translate(); - -} // namespace Shader::Gcn \ No newline at end of file diff --git a/src/shader_recompiler/frontend/structured_control_flow.cpp b/src/shader_recompiler/frontend/structured_control_flow.cpp index c8d73858..b50205d4 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.cpp +++ b/src/shader_recompiler/frontend/structured_control_flow.cpp @@ -287,7 +287,7 @@ bool NeedsLift(Node goto_stmt, Node label_stmt) noexcept { */ class GotoPass { public: - explicit GotoPass(CFG& cfg, ObjectPool& stmt_pool) : pool{stmt_pool} { + explicit GotoPass(CFG& cfg, Common::ObjectPool& stmt_pool) : pool{stmt_pool} { std::vector gotos{BuildTree(cfg)}; const auto end{gotos.rend()}; for (auto goto_stmt = gotos.rbegin(); goto_stmt != end; ++goto_stmt) { @@ -563,7 +563,7 @@ private: return parent_tree.insert(std::next(loop), *new_goto); } - ObjectPool& pool; + Common::ObjectPool& pool; Statement root_stmt{FunctionTag{}}; }; @@ -597,8 +597,9 @@ private: class TranslatePass { public: - TranslatePass(ObjectPool& inst_pool_, ObjectPool& block_pool_, - ObjectPool& stmt_pool_, Statement& root_stmt, + TranslatePass(Common::ObjectPool& inst_pool_, + Common::ObjectPool& block_pool_, + Common::ObjectPool& stmt_pool_, Statement& root_stmt, IR::AbstractSyntaxList& syntax_list_, std::span inst_list_, Info& info_, const Profile& profile_) : stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_}, @@ -808,9 +809,9 @@ private: return block_pool.Create(inst_pool); } - ObjectPool& stmt_pool; - ObjectPool& inst_pool; - ObjectPool& block_pool; + Common::ObjectPool& stmt_pool; + Common::ObjectPool& inst_pool; + Common::ObjectPool& block_pool; IR::AbstractSyntaxList& syntax_list; const Block dummy_flow_block{.is_dummy = true}; std::span inst_list; @@ -819,9 +820,10 @@ private: }; } // Anonymous namespace -IR::AbstractSyntaxList BuildASL(ObjectPool& inst_pool, ObjectPool& block_pool, - CFG& cfg, Info& info, const Profile& profile) { - ObjectPool stmt_pool{64}; +IR::AbstractSyntaxList BuildASL(Common::ObjectPool& inst_pool, + Common::ObjectPool& block_pool, CFG& cfg, Info& info, + const Profile& profile) { + 
Common::ObjectPool stmt_pool{64}; GotoPass goto_pass{cfg, stmt_pool}; Statement& root{goto_pass.RootStatement()}; IR::AbstractSyntaxList syntax_list; diff --git a/src/shader_recompiler/frontend/structured_control_flow.h b/src/shader_recompiler/frontend/structured_control_flow.h index da4ef1ff..f5a54051 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.h +++ b/src/shader_recompiler/frontend/structured_control_flow.h @@ -7,7 +7,6 @@ #include "shader_recompiler/ir/abstract_syntax_list.h" #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/value.h" -#include "shader_recompiler/object_pool.h" namespace Shader { struct Info; @@ -16,8 +15,8 @@ struct Profile; namespace Shader::Gcn { -[[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool& inst_pool, - ObjectPool& block_pool, CFG& cfg, +[[nodiscard]] IR::AbstractSyntaxList BuildASL(Common::ObjectPool& inst_pool, + Common::ObjectPool& block_pool, CFG& cfg, Info& info, const Profile& profile); } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index e8c2a31c..b295c1be 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -447,6 +447,7 @@ void Translator::EmitFetch(const GcnInst& inst) { .is_instance_data = true, }); instance_buf_handle = s32(info.buffers.size() - 1); + info.uses_step_rates = true; } const u32 num_components = AmdGpu::NumComponents(buffer.GetDataFmt()); diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index c667968a..3c6dfbda 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -338,6 +338,11 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma if (is_typed) { info.dmft.Assign(static_cast(mtbuf.dfmt)); info.nfmt.Assign(static_cast(mtbuf.nfmt)); + ASSERT(info.nfmt == AmdGpu::NumberFormat::Float && + (info.dmft == AmdGpu::DataFormat::Format32_32_32_32 || + info.dmft == AmdGpu::DataFormat::Format32_32_32 || + info.dmft == AmdGpu::DataFormat::Format32_32 || + info.dmft == AmdGpu::DataFormat::Format32)); } const IR::Value handle = diff --git a/src/shader_recompiler/ir/basic_block.cpp b/src/shader_recompiler/ir/basic_block.cpp index 622a6249..60ba0647 100644 --- a/src/shader_recompiler/ir/basic_block.cpp +++ b/src/shader_recompiler/ir/basic_block.cpp @@ -9,7 +9,7 @@ namespace Shader::IR { -Block::Block(ObjectPool& inst_pool_) : inst_pool{&inst_pool_} {} +Block::Block(Common::ObjectPool& inst_pool_) : inst_pool{&inst_pool_} {} Block::~Block() = default; diff --git a/src/shader_recompiler/ir/basic_block.h b/src/shader_recompiler/ir/basic_block.h index 5a7036c6..1eb11469 100644 --- a/src/shader_recompiler/ir/basic_block.h +++ b/src/shader_recompiler/ir/basic_block.h @@ -9,10 +9,10 @@ #include #include +#include "common/object_pool.h" #include "common/types.h" #include "shader_recompiler/ir/reg.h" #include "shader_recompiler/ir/value.h" -#include "shader_recompiler/object_pool.h" namespace Shader::IR { @@ -25,7 +25,7 @@ public: using reverse_iterator = InstructionList::reverse_iterator; using const_reverse_iterator = InstructionList::const_reverse_iterator; - explicit Block(ObjectPool& inst_pool_); + explicit Block(Common::ObjectPool& inst_pool_); ~Block(); Block(const Block&) = delete; @@ -153,7 +153,7 @@ public: private: 
/// Memory pool for instruction list - ObjectPool* inst_pool; + Common::ObjectPool* inst_pool; /// List of instructions in this block InstructionList instructions; diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index eaca8ce8..97438f80 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -173,10 +173,9 @@ bool IsImageStorageInstruction(const IR::Inst& inst) { class Descriptors { public: - explicit Descriptors(BufferResourceList& buffer_resources_, ImageResourceList& image_resources_, - SamplerResourceList& sampler_resources_) - : buffer_resources{buffer_resources_}, image_resources{image_resources_}, - sampler_resources{sampler_resources_} {} + explicit Descriptors(Info& info_) + : info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images}, + sampler_resources{info_.samplers} {} u32 Add(const BufferResource& desc) { const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) { @@ -188,6 +187,7 @@ public: ASSERT(buffer.length == desc.length); buffer.is_storage |= desc.is_storage; buffer.used_types |= desc.used_types; + buffer.is_written |= desc.is_written; return index; } @@ -201,9 +201,16 @@ public: } u32 Add(const SamplerResource& desc) { - const u32 index{Add(sampler_resources, desc, [&desc](const auto& existing) { - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) { + if (desc.sgpr_base == existing.sgpr_base && + desc.dword_offset == existing.dword_offset) { + return true; + } + // Samplers with different bindings might still be the same. + const auto old_sharp = + info.ReadUd(existing.sgpr_base, existing.dword_offset); + const auto new_sharp = info.ReadUd(desc.sgpr_base, desc.dword_offset); + return old_sharp == new_sharp; })}; return index; } @@ -219,6 +226,7 @@ private: return static_cast(descriptors.size()) - 1; } + const Info& info; BufferResourceList& buffer_resources; ImageResourceList& image_resources; SamplerResourceList& sampler_resources; @@ -328,16 +336,6 @@ static bool IsLoadBufferFormat(const IR::Inst& inst) { } } -static bool IsReadConstBuffer(const IR::Inst& inst) { - switch (inst.GetOpcode()) { - case IR::Opcode::ReadConstBuffer: - case IR::Opcode::ReadConstBufferU32: - return true; - default: - return false; - } -} - static u32 BufferLength(const AmdGpu::Buffer& buffer) { const auto stride = buffer.GetStride(); if (stride < sizeof(f32)) { @@ -401,30 +399,37 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, IR::Inst* handle = inst.Arg(0).InstRecursive(); IR::Inst* producer = handle->Arg(0).InstRecursive(); const auto sharp = TrackSharp(producer); + const bool is_store = IsBufferStore(inst); buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); binding = descriptors.Add(BufferResource{ .sgpr_base = sharp.sgpr_base, .dword_offset = sharp.dword_offset, .length = BufferLength(buffer), .used_types = BufferDataType(inst, buffer.GetNumberFmt()), - .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize, + .is_storage = is_store || buffer.GetSize() > MaxUboSize, + .is_written = is_store, }); } + // Update buffer descriptor format. const auto inst_info = inst.Flags(); - IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - // Replace handle with binding index in buffer resource list. 
-    inst.SetArg(0, ir.Imm32(binding));
-    ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
+    auto& buffer_desc = info.buffers[binding];
     if (inst_info.is_typed) {
-        ASSERT(inst_info.nfmt == AmdGpu::NumberFormat::Float &&
-               (inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
-                inst_info.dmft == AmdGpu::DataFormat::Format32_32_32 ||
-                inst_info.dmft == AmdGpu::DataFormat::Format32_32 ||
-                inst_info.dmft == AmdGpu::DataFormat::Format32));
+        buffer_desc.dfmt = inst_info.dmft;
+        buffer_desc.nfmt = inst_info.nfmt;
+    } else {
+        buffer_desc.dfmt = buffer.GetDataFmt();
+        buffer_desc.nfmt = buffer.GetNumberFmt();
     }
 
-    if (IsReadConstBuffer(inst)) {
+    // Replace handle with binding index in buffer resource list.
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    inst.SetArg(0, ir.Imm32(binding));
+    ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
+
+    // Address of constant buffer reads can be calculated at IR emission time.
+    if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer ||
+        inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) {
         return;
     }
 
@@ -434,10 +439,14 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
         }
     } else {
         const u32 stride = buffer.GetStride();
-        ASSERT_MSG(stride >= 4, "non-formatting load_buffer_* is not implemented for stride {}",
-                   stride);
+        if (stride < 4) {
+            LOG_WARNING(Render_Vulkan,
+                        "non-formatting load_buffer_* is not implemented for stride {}", stride);
+        }
     }
 
+    // Compute address of the buffer using the stride.
+    // TODO: What if the buffer is rebound with a different stride?
     IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
     if (inst_info.index_enable) {
         const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
@@ -587,39 +596,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
 }
 
 void ResourceTrackingPass(IR::Program& program) {
-    // When loading data from untyped buffer we don't have if it is float or integer.
-    // Most of the time it is float so that is the default. This pass detects float buffer loads
-    // combined with bitcasts and patches them to be integer loads.
-    for (IR::Block* const block : program.post_order_blocks) {
-        break;
-        for (IR::Inst& inst : block->Instructions()) {
-            if (inst.GetOpcode() != IR::Opcode::BitCastU32F32) {
-                continue;
-            }
-            // Replace the bitcast with a typed buffer read
-            IR::Inst* const arg_inst{inst.Arg(0).TryInstRecursive()};
-            if (!arg_inst) {
-                continue;
-            }
-            const auto replace{[&](IR::Opcode new_opcode) {
-                inst.ReplaceOpcode(new_opcode);
-                inst.SetArg(0, arg_inst->Arg(0));
-                inst.SetArg(1, arg_inst->Arg(1));
-                inst.SetFlags(arg_inst->Flags());
-                arg_inst->Invalidate();
-            }};
-            if (arg_inst->GetOpcode() == IR::Opcode::ReadConstBuffer) {
-                replace(IR::Opcode::ReadConstBufferU32);
-            }
-            if (arg_inst->GetOpcode() == IR::Opcode::LoadBufferF32) {
-                replace(IR::Opcode::LoadBufferU32);
-            }
-        }
-    }
-
     // Iterate resource instructions and patch them after finding the sharp.
     auto& info = program.info;
-    Descriptors descriptors{info.buffers, info.images, info.samplers};
+    Descriptors descriptors{info};
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (IsBufferInstruction(inst)) {
diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index d747c016..69eec50f 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -27,9 +27,9 @@ IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) {
     return blocks;
 }
 
-IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
-                             std::span<const u32> token, const Info&& info,
-                             const Profile& profile) {
+IR::Program TranslateProgram(Common::ObjectPool<IR::Inst>& inst_pool,
+                             Common::ObjectPool<IR::Block>& block_pool, std::span<const u32> token,
+                             const Info&& info, const Profile& profile) {
     // Ensure first instruction is expected.
     constexpr u32 token_mov_vcchi = 0xBEEB03FF;
     ASSERT_MSG(token[0] == token_mov_vcchi, "First instruction is not s_mov_b32 vcc_hi, #imm");
@@ -45,7 +45,7 @@ IR::Program TranslateProgram(Common::ObjectPool<IR::Inst>& inst_pool,
-    ObjectPool<Gcn::Block> gcn_block_pool{64};
+    Common::ObjectPool<Gcn::Block> gcn_block_pool{64};
     Gcn::CFG cfg{gcn_block_pool, program.ins_list};
 
     // Structurize control flow graph and create program.
@@ -61,7 +61,7 @@
-[[nodiscard]] IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool,
-                                           ObjectPool<IR::Block>& block_pool,
+[[nodiscard]] IR::Program TranslateProgram(Common::ObjectPool<IR::Inst>& inst_pool,
+                                           Common::ObjectPool<IR::Block>& block_pool,
                                            std::span<const u32> code, const Info&& info,
                                            const Profile& profile);
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 277c38b7..4ab71c3b 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -77,8 +77,11 @@ struct BufferResource {
     u32 length;
     IR::Type used_types;
     AmdGpu::Buffer inline_cbuf;
-    bool is_storage{false};
-    bool is_instance_data{false};
+    AmdGpu::DataFormat dfmt;
+    AmdGpu::NumberFormat nfmt;
+    bool is_storage{};
+    bool is_instance_data{};
+    bool is_written{};
 
     constexpr AmdGpu::Buffer GetVsharp(const Info& info) const noexcept;
 };
@@ -105,6 +108,19 @@ struct SamplerResource {
 };
 using SamplerResourceList = boost::container::static_vector;
 
+struct PushData {
+    static constexpr size_t BufOffsetIndex = 2;
+
+    u32 step0;
+    u32 step1;
+    std::array<u8, 32> buf_offsets;
+
+    void AddOffset(u32 binding, u32 offset) {
+        ASSERT(offset < 64 && binding < 32);
+        buf_offsets[binding] = offset;
+    }
+};
+
 struct Info {
     struct VsInput {
         enum InstanceIdType : u8 {
@@ -182,6 +198,7 @@ struct Info {
     bool uses_shared_u8{};
     bool uses_shared_u16{};
     bool uses_fp16{};
+    bool uses_step_rates{};
     bool translation_failed{}; // indicates that shader has unsupported instructions
 
     template <typename T>
diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h
index 400af031..3ebd9a97 100644
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -496,7 +496,7 @@ struct Liverpool {
 
         template <typename T>
         T Address() const {
-            return reinterpret_cast<T>((base_addr_lo & ~1U) | u64(base_addr_hi) << 32);
+            return std::bit_cast<T>((base_addr_lo & ~1U) | u64(base_addr_hi) << 32);
         }
     };
 
diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h
index 01271792..ef5bf1b6 100644
--- a/src/video_core/amdgpu/resource.h
+++ b/src/video_core/amdgpu/resource.h
@@ -363,6 +363,10 @@ struct Sampler {
         return raw0 != 0 || raw1 != 0;
     }
 
+    bool operator==(const Sampler& other) const noexcept {
+        return std::memcmp(this, &other, sizeof(Sampler)) == 0;
+    }
+
     float LodBias() const noexcept {
         return static_cast<float>(static_cast<s16>((lod_bias.Value() ^ 0x2000u) - 0x2000u)) / 256.0f;
diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp
new file mode 100644
index 00000000..e9498b35
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer.cpp
@@ -0,0 +1,227 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "video_core/buffer_cache/buffer.h"
+#include "video_core/renderer_vulkan/liverpool_to_vk.h"
+#include "video_core/renderer_vulkan/vk_instance.h"
+#include "video_core/renderer_vulkan/vk_platform.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+
+#include <vk_mem_alloc.h>
+
+namespace VideoCore {
+
+constexpr vk::BufferUsageFlags AllFlags =
+    vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst |
+    vk::BufferUsageFlagBits::eUniformTexelBuffer | vk::BufferUsageFlagBits::eStorageTexelBuffer |
+    vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer |
+    vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer;
+
+std::string_view BufferTypeName(MemoryUsage type) {
+    switch (type) {
+    case MemoryUsage::Upload:
+        return "Upload";
+    case MemoryUsage::Download:
+        return "Download";
+    case MemoryUsage::Stream:
+        return "Stream";
+    case MemoryUsage::DeviceLocal:
+        return "DeviceLocal";
+    default:
+        return "Invalid";
+    }
+}
+
+[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferredVmaFlags(MemoryUsage usage) {
+    return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
+                                             : VkMemoryPropertyFlagBits{};
+}
+
+[[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) {
+    switch (usage) {
+    case MemoryUsage::Upload:
+    case MemoryUsage::Stream:
+        return VMA_ALLOCATION_CREATE_MAPPED_BIT |
+               VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
+    case MemoryUsage::Download:
+        return VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
+    case MemoryUsage::DeviceLocal:
+        return {};
+    }
+    return {};
+}
+
+[[nodiscard]] VmaMemoryUsage MemoryUsageVma(MemoryUsage usage) {
+    switch (usage) {
+    case MemoryUsage::DeviceLocal:
+    case MemoryUsage::Stream:
+        return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
+    case MemoryUsage::Upload:
+    case MemoryUsage::Download:
+        return VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
+    }
+    return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
+}
+
+UniqueBuffer::UniqueBuffer(vk::Device device_, VmaAllocator allocator_)
+    : device{device_}, allocator{allocator_} {}
+
+UniqueBuffer::~UniqueBuffer() {
+    if (buffer) {
+        vmaDestroyBuffer(allocator, buffer, allocation);
+    }
+}
+
+void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
+                          VmaAllocationInfo* out_alloc_info) {
+    const VmaAllocationCreateInfo alloc_ci = {
+        .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
+        .usage = MemoryUsageVma(usage),
+        .requiredFlags = 0,
+        .preferredFlags = MemoryUsagePreferredVmaFlags(usage),
+        .pool = VK_NULL_HANDLE,
+        .pUserData = nullptr,
+    };
+
+    const VkBufferCreateInfo buffer_ci_unsafe = static_cast<VkBufferCreateInfo>(buffer_ci);
+    VkBuffer unsafe_buffer{};
+    VkResult result = vmaCreateBuffer(allocator, &buffer_ci_unsafe, &alloc_ci, &unsafe_buffer,
+                                      &allocation, out_alloc_info);
+    ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}",
+               vk::to_string(vk::Result{result}));
+    buffer = vk::Buffer{unsafe_buffer};
+}
+
+Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_addr_,
+               u64 size_bytes_)
+    : cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, usage{usage_},
+      buffer{instance->GetDevice(), instance->GetAllocator()} {
+    // Create buffer object.
+    const vk::BufferCreateInfo buffer_ci = {
+        .size = size_bytes,
+        .usage = AllFlags,
+    };
+    VmaAllocationInfo alloc_info{};
+    buffer.Create(buffer_ci, usage, &alloc_info);
+
+    if (instance->HasDebuggingToolAttached()) {
+        const auto device = instance->GetDevice();
+        Vulkan::SetObjectName(device, Handle(), "Buffer {:#x} {} KiB", cpu_addr, size_bytes / 1024);
+    }
+
+    // Map it if it is host visible.
+    VkMemoryPropertyFlags property_flags{};
+    vmaGetAllocationMemoryProperties(instance->GetAllocator(), buffer.allocation, &property_flags);
+    if (alloc_info.pMappedData) {
+        mapped_data = std::span{std::bit_cast<u8*>(alloc_info.pMappedData), size_bytes};
+    }
+    is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+}
+
+vk::BufferView Buffer::View(u32 offset, u32 size, AmdGpu::DataFormat dfmt,
+                            AmdGpu::NumberFormat nfmt) {
+    const auto it{std::ranges::find_if(views, [offset, size, dfmt, nfmt](const BufferView& view) {
+        return offset == view.offset && size == view.size && dfmt == view.dfmt && nfmt == view.nfmt;
+    })};
+    if (it != views.end()) {
+        return it->handle;
+    }
+    views.push_back({
+        .offset = offset,
+        .size = size,
+        .dfmt = dfmt,
+        .nfmt = nfmt,
+        .handle = instance->GetDevice().createBufferView({
+            .buffer = buffer.buffer,
+            .format = Vulkan::LiverpoolToVK::SurfaceFormat(dfmt, nfmt),
+            .offset = offset,
+            .range = size,
+        }),
+    });
+    return views.back().handle;
+}
+
+constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
+constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
+
+StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler_,
+                           MemoryUsage usage, u64 size_bytes)
+    : Buffer{instance, usage, 0, size_bytes}, scheduler{scheduler_} {
+    ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
+    ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
+    const auto device = instance.GetDevice();
+    if (instance.HasDebuggingToolAttached()) {
+        Vulkan::SetObjectName(device, Handle(), "StreamBuffer({}): {} KiB", BufferTypeName(usage),
+                              size_bytes / 1024);
+    }
+}
+
+std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
+    if (!is_coherent && usage == MemoryUsage::Stream) {
+        size = Common::AlignUp(size, instance->NonCoherentAtomSize());
+    }
+
+    ASSERT(size <= this->size_bytes);
+    mapped_size = size;
+
+    if (alignment > 0) {
+        offset = Common::AlignUp(offset, alignment);
+    }
+
+    if (offset + size > this->size_bytes) {
+        // The buffer would overflow, save the amount of used watches and reset the state.
+        invalidation_mark = current_watch_cursor;
+        current_watch_cursor = 0;
+        offset = 0;
+
+        // Swap watches and reset waiting cursors.
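+        // Watches recorded in the retired cycle are drained lazily by WaitPendingOperations
+        // below, as later maps start reusing the regions those watches cover.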
+        std::swap(previous_watches, current_watches);
+        wait_cursor = 0;
+        wait_bound = 0;
+    }
+
+    const u64 mapped_upper_bound = offset + size;
+    WaitPendingOperations(mapped_upper_bound);
+    return std::make_pair(mapped_data.data() + offset, offset);
+}
+
+void StreamBuffer::Commit() {
+    if (!is_coherent) {
+        if (usage == MemoryUsage::Download) {
+            vmaInvalidateAllocation(instance->GetAllocator(), buffer.allocation, offset,
+                                    mapped_size);
+        } else {
+            vmaFlushAllocation(instance->GetAllocator(), buffer.allocation, offset, mapped_size);
+        }
+    }
+
+    offset += mapped_size;
+    if (current_watch_cursor + 1 >= current_watches.size()) {
+        // Ensure that there are enough watches.
+        ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK);
+    }
+
+    auto& watch = current_watches[current_watch_cursor++];
+    watch.upper_bound = offset;
+    watch.tick = scheduler.CurrentTick();
+}
+
+void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {
+    watches.resize(watches.size() + grow_size);
+}
+
+void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
+    if (!invalidation_mark) {
+        return;
+    }
+    while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
+        auto& watch = previous_watches[wait_cursor];
+        wait_bound = watch.upper_bound;
+        scheduler.Wait(watch.tick);
+        ++wait_cursor;
+    }
+}
+
+} // namespace VideoCore
diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h
new file mode 100644
index 00000000..e0d9da08
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer.h
@@ -0,0 +1,173 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include
+#include
+#include
+#include "common/types.h"
+#include "video_core/amdgpu/resource.h"
+#include "video_core/renderer_vulkan/vk_common.h"
+
+namespace Vulkan {
+class Instance;
+class Scheduler;
+} // namespace Vulkan
+
+VK_DEFINE_HANDLE(VmaAllocation)
+VK_DEFINE_HANDLE(VmaAllocator)
+
+struct VmaAllocationInfo;
+
+namespace VideoCore {
+
+/// Hints and requirements for the backing memory type of a commit
+enum class MemoryUsage {
+    DeviceLocal, ///< Requests device local buffer.
+    Upload,      ///< Requires a host visible memory type optimized for CPU to GPU uploads
+    Download,    ///< Requires a host visible memory type optimized for GPU to CPU readbacks
+    Stream,      ///< Requests device local host visible buffer, falling back to host memory.
+};
+
+struct UniqueBuffer {
+    explicit UniqueBuffer(vk::Device device, VmaAllocator allocator);
+    ~UniqueBuffer();
+
+    UniqueBuffer(const UniqueBuffer&) = delete;
+    UniqueBuffer& operator=(const UniqueBuffer&) = delete;
+
+    UniqueBuffer(UniqueBuffer&& other)
+        : buffer{std::exchange(other.buffer, VK_NULL_HANDLE)},
+          allocator{std::exchange(other.allocator, VK_NULL_HANDLE)},
+          allocation{std::exchange(other.allocation, VK_NULL_HANDLE)} {}
+    UniqueBuffer& operator=(UniqueBuffer&& other) {
+        buffer = std::exchange(other.buffer, VK_NULL_HANDLE);
+        allocator = std::exchange(other.allocator, VK_NULL_HANDLE);
+        allocation = std::exchange(other.allocation, VK_NULL_HANDLE);
+        return *this;
+    }
+
+    void Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
+                VmaAllocationInfo* out_alloc_info);
+
+    operator vk::Buffer() const {
+        return buffer;
+    }
+
+    vk::Device device;
+    VmaAllocator allocator;
+    VmaAllocation allocation;
+    vk::Buffer buffer{};
+};
+
+class Buffer {
+public:
+    explicit Buffer(const Vulkan::Instance& instance, MemoryUsage usage, VAddr cpu_addr_,
+                    u64 size_bytes_);
+
+    Buffer& operator=(const Buffer&) = delete;
+    Buffer(const Buffer&) = delete;
+
+    Buffer& operator=(Buffer&&) = default;
+    Buffer(Buffer&&) = default;
+
+    vk::BufferView View(u32 offset, u32 size, AmdGpu::DataFormat dfmt, AmdGpu::NumberFormat nfmt);
+
+    /// Increases the likelihood of this being a stream buffer
+    void IncreaseStreamScore(int score) noexcept {
+        stream_score += score;
+    }
+
+    /// Returns the likelihood of this being a stream buffer
+    [[nodiscard]] int StreamScore() const noexcept {
+        return stream_score;
+    }
+
+    /// Returns true when vaddr -> vaddr+size is fully contained in the buffer
+    [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept {
+        return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes();
+    }
+
+    /// Returns the base CPU address of the buffer
+    [[nodiscard]] VAddr CpuAddr() const noexcept {
+        return cpu_addr;
+    }
+
+    /// Returns the offset relative to the given CPU address
+    [[nodiscard]] u32 Offset(VAddr other_cpu_addr) const noexcept {
+        return static_cast<u32>(other_cpu_addr - cpu_addr);
+    }
+
+    size_t SizeBytes() const {
+        return size_bytes;
+    }
+
+    vk::Buffer Handle() const noexcept {
+        return buffer;
+    }
+
+public:
+    VAddr cpu_addr = 0;
+    bool is_picked{};
+    bool is_coherent{};
+    int stream_score = 0;
+    size_t size_bytes = 0;
+    std::span<u8> mapped_data;
+    const Vulkan::Instance* instance{};
+    MemoryUsage usage;
+    UniqueBuffer buffer;
+
+    struct BufferView {
+        u32 offset;
+        u32 size;
+        AmdGpu::DataFormat dfmt;
+        AmdGpu::NumberFormat nfmt;
+        vk::BufferView handle;
+    };
+    std::vector<BufferView> views;
+};
+
+class StreamBuffer : public Buffer {
+public:
+    explicit StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
+                          MemoryUsage usage, u64 size_bytes_);
+
+    /// Reserves a region of memory from the stream buffer.
+    std::pair<u8*, u64> Map(u64 size, u64 alignment = 0);
+
+    /// Ensures that reserved bytes of memory are available to the GPU.
+    void Commit();
+
+    /// Maps and commits a memory region with user provided data
+    u64 Copy(VAddr src, size_t size, size_t alignment = 0) {
+        const auto [data, offset] = Map(size, alignment);
+        std::memcpy(data, reinterpret_cast<const void*>(src), size);
+        Commit();
+        return offset;
+    }
+
+private:
+    struct Watch {
+        u64 tick{};
+        u64 upper_bound{};
+    };
+
+    /// Increases the amount of watches available.
+    void ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size);
+
+    /// Waits on pending watches up to the requested upper bound.
+    void WaitPendingOperations(u64 requested_upper_bound);
+
+private:
+    Vulkan::Scheduler& scheduler;
+    u64 offset{};
+    u64 mapped_size{};
+    std::vector<Watch> current_watches;
+    std::size_t current_watch_cursor{};
+    std::optional<size_t> invalidation_mark;
+    std::vector<Watch> previous_watches;
+    std::size_t wait_cursor{};
+    u64 wait_bound{};
+};
+
+} // namespace VideoCore
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 00000000..7ab0d817
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,497 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <algorithm>
+#include "common/alignment.h"
+#include "common/scope_exit.h"
+#include "shader_recompiler/runtime_info.h"
+#include "video_core/amdgpu/liverpool.h"
+#include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/renderer_vulkan/liverpool_to_vk.h"
+#include "video_core/renderer_vulkan/vk_instance.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+
+namespace VideoCore {
+
+static constexpr size_t StagingBufferSize = 256_MB;
+static constexpr size_t UboStreamBufferSize = 64_MB;
+
+BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
+                         const AmdGpu::Liverpool* liverpool_, PageManager& tracker_)
+    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, tracker{tracker_},
+      staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
+      stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
+      memory_tracker{&tracker} {
+    // Ensure the first slot is used for the null buffer
+    void(slot_buffers.insert(instance, MemoryUsage::DeviceLocal, 0, 1));
+}
+
+BufferCache::~BufferCache() = default;
+
+void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
+    std::scoped_lock lk{mutex};
+    const bool is_tracked = IsRegionRegistered(device_addr, size);
+    if (!is_tracked) {
+        return;
+    }
+    // Mark the page as CPU modified to stop tracking writes.
+    SCOPE_EXIT {
+        memory_tracker.MarkRegionAsCpuModified(device_addr, size);
+    };
+    if (!memory_tracker.IsRegionGpuModified(device_addr, size)) {
+        // Page has not been modified by the GPU, nothing to do.
+        return;
+    }
+}
+
+void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
+    boost::container::small_vector copies;
+    u64 total_size_bytes = 0;
+    u64 largest_copy = 0;
+    memory_tracker.ForEachDownloadRange(
+        device_addr, size, [&](u64 device_addr_out, u64 range_size) {
+            const VAddr buffer_addr = buffer.CpuAddr();
+            const auto add_download = [&](VAddr start, VAddr end, u64) {
+                const u64 new_offset = start - buffer_addr;
+                const u64 new_size = end - start;
+                copies.push_back(vk::BufferCopy{
+                    .srcOffset = new_offset,
+                    .dstOffset = total_size_bytes,
+                    .size = new_size,
+                });
+                // Align up to avoid cache conflicts
+                constexpr u64 align = 64ULL;
+                constexpr u64 mask = ~(align - 1ULL);
+                total_size_bytes += (new_size + align - 1) & mask;
+                largest_copy = std::max(largest_copy, new_size);
+            };
+        });
+    if (total_size_bytes == 0) {
+        return;
+    }
+    const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
+    for (auto& copy : copies) {
+        // Modify copies to have the staging offset in mind
+        copy.dstOffset += offset;
+    }
+    staging_buffer.Commit();
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.copyBuffer(buffer.buffer, staging_buffer.Handle(), copies);
+    scheduler.Finish();
+    for (const auto& copy : copies) {
+        const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
+        const u64 dst_offset = copy.dstOffset - offset;
+        std::memcpy(std::bit_cast<u8*>(copy_device_addr), staging + dst_offset, copy.size);
+    }
+}
+
+bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
+    if (vs_info.vs_inputs.empty()) {
+        return false;
+    }
+
+    std::array<vk::Buffer, NUM_VERTEX_BUFFERS> host_buffers;
+    std::array<vk::DeviceSize, NUM_VERTEX_BUFFERS> host_offsets;
+    boost::container::static_vector<AmdGpu::Buffer, NUM_VERTEX_BUFFERS> guest_buffers;
+
+    struct BufferRange {
+        VAddr base_address;
+        VAddr end_address;
+        vk::Buffer vk_buffer;
+        u64 offset;
+
+        size_t GetSize() const {
+            return end_address - base_address;
+        }
+    };
+
+    // Calculate buffer memory overlaps
+    bool has_step_rate = false;
+    boost::container::static_vector<BufferRange, NUM_VERTEX_BUFFERS> ranges{};
+    for (const auto& input : vs_info.vs_inputs) {
+        if (input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate0 ||
+            input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate1) {
+            has_step_rate = true;
+            continue;
+        }
+
+        const auto& buffer = vs_info.ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
+        if (buffer.GetSize() == 0) {
+            continue;
+        }
+        guest_buffers.emplace_back(buffer);
+        ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize());
+    }
+
+    std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) {
+        return lhv.base_address < rhv.base_address;
+    });
+
+    boost::container::static_vector<BufferRange, NUM_VERTEX_BUFFERS> ranges_merged{ranges[0]};
+    for (auto range : ranges) {
+        auto& prev_range = ranges_merged.back();
+        if (prev_range.end_address < range.base_address) {
+            ranges_merged.emplace_back(range);
+        } else {
+            prev_range.end_address = std::max(prev_range.end_address, range.end_address);
+        }
+    }
+
+    // Map buffers
+    for (auto& range : ranges_merged) {
+        const auto [buffer, offset] = ObtainBuffer(range.base_address, range.GetSize(), false);
+        range.vk_buffer = buffer->buffer;
+        range.offset = offset;
+    }
+
+    // Bind vertex buffers
+    const size_t num_buffers = guest_buffers.size();
+    for (u32 i = 0; i < num_buffers; ++i) {
+        const auto& buffer = guest_buffers[i];
+        const auto host_buffer = std::ranges::find_if(ranges_merged, [&](const BufferRange& range) {
+            return (buffer.base_address >= range.base_address &&
+                    buffer.base_address < range.end_address);
+        });
+        ASSERT(host_buffer != ranges_merged.cend());
+
+        host_buffers[i] = host_buffer->vk_buffer;
+        host_offsets[i] = host_buffer->offset + buffer.base_address - host_buffer->base_address;
+    }
+
+    if (num_buffers > 0) {
+        const auto cmdbuf = scheduler.CommandBuffer();
+        cmdbuf.bindVertexBuffers(0, num_buffers, host_buffers.data(), host_offsets.data());
+    }
+
+    return has_step_rate;
+}
+
+u32 BufferCache::BindIndexBuffer(bool& is_indexed, u32 index_offset) {
+    // Emulate QuadList primitive type with a CPU-made index buffer.
+    const auto& regs = liverpool->regs;
+    if (regs.primitive_type == AmdGpu::Liverpool::PrimitiveType::QuadList) {
+        is_indexed = true;
+
+        // Emit indices.
+        const u32 index_size = 3 * regs.num_indices;
+        const auto [data, offset] = stream_buffer.Map(index_size);
+        Vulkan::LiverpoolToVK::EmitQuadToTriangleListIndices(data, regs.num_indices);
+        stream_buffer.Commit();
+
+        // Bind index buffer.
+        const auto cmdbuf = scheduler.CommandBuffer();
+        cmdbuf.bindIndexBuffer(stream_buffer.Handle(), offset, vk::IndexType::eUint16);
+        return index_size / sizeof(u16);
+    }
+    if (!is_indexed) {
+        return regs.num_indices;
+    }
+
+    // Figure out index type and size.
+    const bool is_index16 =
+        regs.index_buffer_type.index_type == AmdGpu::Liverpool::IndexType::Index16;
+    const vk::IndexType index_type = is_index16 ? vk::IndexType::eUint16 : vk::IndexType::eUint32;
+    const u32 index_size = is_index16 ? sizeof(u16) : sizeof(u32);
+    VAddr index_address = regs.index_base_address.Address<VAddr>();
+    index_address += index_offset * index_size;
+
+    // Bind index buffer.
+    const u32 index_buffer_size = regs.num_indices * index_size;
+    const auto [vk_buffer, offset] = ObtainBuffer(index_address, index_buffer_size, false);
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.bindIndexBuffer(vk_buffer->Handle(), offset, index_type);
+    return regs.num_indices;
+}
+
+std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written) {
+    std::scoped_lock lk{mutex};
+    static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
+    const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
+    if (!is_written && size < StreamThreshold && !is_gpu_dirty) {
+        // For small uniform buffers that have not been modified by the GPU,
+        // use the device-local stream buffer to reduce renderpass breaks.
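+        // The data is copied by value into the stream ring, so the snapshot needs no page
+        // tracking and is never registered in the page table.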
+        const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
+        return {&stream_buffer, offset};
+    }
+
+    const BufferId buffer_id = FindBuffer(device_addr, size);
+    Buffer& buffer = slot_buffers[buffer_id];
+    SynchronizeBuffer(buffer, device_addr, size);
+    if (is_written) {
+        memory_tracker.MarkRegionAsGpuModified(device_addr, size);
+    }
+    return {&buffer, buffer.Offset(device_addr)};
+}
+
+bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
+    const VAddr end_addr = addr + size;
+    const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
+    for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) {
+        const BufferId buffer_id = page_table[page];
+        if (!buffer_id) {
+            ++page;
+            continue;
+        }
+        Buffer& buffer = slot_buffers[buffer_id];
+        const VAddr buf_start_addr = buffer.CpuAddr();
+        const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes();
+        if (buf_start_addr < end_addr && addr < buf_end_addr) {
+            return true;
+        }
+        page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
+    }
+    return false;
+}
+
+bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
+    return memory_tracker.IsRegionCpuModified(addr, size);
+}
+
+BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
+    if (device_addr == 0) {
+        return NULL_BUFFER_ID;
+    }
+    const u64 page = device_addr >> CACHING_PAGEBITS;
+    const BufferId buffer_id = page_table[page];
+    if (!buffer_id) {
+        return CreateBuffer(device_addr, size);
+    }
+    const Buffer& buffer = slot_buffers[buffer_id];
+    if (buffer.IsInBounds(device_addr, size)) {
+        return buffer_id;
+    }
+    return CreateBuffer(device_addr, size);
+}
+
+BufferCache::OverlapResult BufferCache::ResolveOverlaps(VAddr device_addr, u32 wanted_size) {
+    static constexpr int STREAM_LEAP_THRESHOLD = 16;
+    boost::container::small_vector overlap_ids;
+    VAddr begin = device_addr;
+    VAddr end = device_addr + wanted_size;
+    int stream_score = 0;
+    bool has_stream_leap = false;
+    const auto expand_begin = [&](VAddr add_value) {
+        static constexpr VAddr min_page = CACHING_PAGESIZE + DEVICE_PAGESIZE;
+        if (add_value > begin - min_page) {
+            begin = min_page;
+            device_addr = DEVICE_PAGESIZE;
+            return;
+        }
+        begin -= add_value;
+        device_addr = begin - CACHING_PAGESIZE;
+    };
+    const auto expand_end = [&](VAddr add_value) {
+        static constexpr VAddr max_page = 1ULL << MemoryTracker::MAX_CPU_PAGE_BITS;
+        if (add_value > max_page - end) {
+            end = max_page;
+            return;
+        }
+        end += add_value;
+    };
+    if (begin == 0) {
+        return OverlapResult{
+            .ids = std::move(overlap_ids),
+            .begin = begin,
+            .end = end,
+            .has_stream_leap = has_stream_leap,
+        };
+    }
+    for (; device_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE);
+         device_addr += CACHING_PAGESIZE) {
+        const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS];
+        if (!overlap_id) {
+            continue;
+        }
+        Buffer& overlap = slot_buffers[overlap_id];
+        if (overlap.is_picked) {
+            continue;
+        }
+        overlap_ids.push_back(overlap_id);
+        overlap.is_picked = true;
+        const VAddr overlap_device_addr = overlap.CpuAddr();
+        const bool expands_left = overlap_device_addr < begin;
+        if (expands_left) {
+            begin = overlap_device_addr;
+        }
+        const VAddr overlap_end = overlap_device_addr + overlap.SizeBytes();
+        const bool expands_right = overlap_end > end;
+        if (overlap_end > end) {
+            end = overlap_end;
+        }
+        stream_score += overlap.StreamScore();
+        if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) {
+            // When this memory region has been joined a bunch of times, we assume it's being used
+            // as a stream buffer. Increase the size to skip constantly recreating buffers.
+            has_stream_leap = true;
+            if (expands_right) {
+                expand_begin(CACHING_PAGESIZE * 128);
+            }
+            if (expands_left) {
+                expand_end(CACHING_PAGESIZE * 128);
+            }
+        }
+    }
+    return OverlapResult{
+        .ids = std::move(overlap_ids),
+        .begin = begin,
+        .end = end,
+        .has_stream_leap = has_stream_leap,
+    };
+}
+
+void BufferCache::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
+                              bool accumulate_stream_score) {
+    Buffer& new_buffer = slot_buffers[new_buffer_id];
+    Buffer& overlap = slot_buffers[overlap_id];
+    if (accumulate_stream_score) {
+        new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1);
+    }
+    const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
+    const vk::BufferCopy copy = {
+        .srcOffset = 0,
+        .dstOffset = dst_base_offset,
+        .size = overlap.SizeBytes(),
+    };
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    static constexpr vk::MemoryBarrier READ_BARRIER{
+        .srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
+        .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
+    };
+    static constexpr vk::MemoryBarrier WRITE_BARRIER{
+        .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+        .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
+    };
+    cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands,
+                           vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
+                           READ_BARRIER, {}, {});
+    cmdbuf.copyBuffer(overlap.buffer, new_buffer.buffer, copy);
+    cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
+                           vk::PipelineStageFlagBits::eAllCommands,
+                           vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
+    DeleteBuffer(overlap_id, true);
+}
+
+BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
+    const VAddr device_addr_end = Common::AlignUp(device_addr + wanted_size, CACHING_PAGESIZE);
+    device_addr = Common::AlignDown(device_addr, CACHING_PAGESIZE);
+    wanted_size = static_cast<u32>(device_addr_end - device_addr);
+    const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size);
+    const u32 size = static_cast<u32>(overlap.end - overlap.begin);
+    const BufferId new_buffer_id =
+        slot_buffers.insert(instance, MemoryUsage::DeviceLocal, overlap.begin, size);
+    auto& new_buffer = slot_buffers[new_buffer_id];
+    const size_t size_bytes = new_buffer.SizeBytes();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    scheduler.EndRendering();
+    cmdbuf.fillBuffer(new_buffer.buffer, 0, size_bytes, 0);
+    for (const BufferId overlap_id : overlap.ids) {
+        JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
+    }
+    Register(new_buffer_id);
+    return new_buffer_id;
+}
+
+void BufferCache::Register(BufferId buffer_id) {
+    ChangeRegister<true>(buffer_id);
+}
+
+void BufferCache::Unregister(BufferId buffer_id) {
+    ChangeRegister<false>(buffer_id);
+}
+
+template <bool insert>
+void BufferCache::ChangeRegister(BufferId buffer_id) {
+    Buffer& buffer = slot_buffers[buffer_id];
+    const auto size = buffer.SizeBytes();
+    const VAddr device_addr_begin = buffer.CpuAddr();
+    const VAddr device_addr_end = device_addr_begin + size;
+    const u64 page_begin = device_addr_begin / CACHING_PAGESIZE;
+    const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE);
+    for (u64 page = page_begin; page != page_end; ++page) {
+        if constexpr (insert) {
+            page_table[page] = buffer_id;
+        } else {
+            page_table[page] = BufferId{};
+        }
+    }
+}
+
+bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size) {
+    boost::container::small_vector copies;
+    u64 total_size_bytes = 0;
+    u64 largest_copy = 0;
+    VAddr buffer_start = buffer.CpuAddr();
+    const auto add_copy = [&](VAddr device_addr_out, u64 range_size) {
+        copies.push_back(vk::BufferCopy{
+            .srcOffset = total_size_bytes,
+            .dstOffset = device_addr_out - buffer_start,
+            .size = range_size,
+        });
+        total_size_bytes += range_size;
+        largest_copy = std::max(largest_copy, range_size);
+    };
+    memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
+        add_copy(device_addr_out, range_size);
+        // Prevent uploading to gpu modified regions.
+        // gpu_modified_ranges.ForEachNotInRange(device_addr_out, range_size, add_copy);
+    });
+    if (total_size_bytes == 0) {
+        return true;
+    }
+    vk::Buffer src_buffer = staging_buffer.Handle();
+    if (total_size_bytes < StagingBufferSize) {
+        const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
+        for (auto& copy : copies) {
+            u8* const src_pointer = staging + copy.srcOffset;
+            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
+            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
+            // Apply the staging offset
+            copy.srcOffset += offset;
+        }
+        staging_buffer.Commit();
+    } else {
+        // For large one-time transfers use a temporary host buffer.
+        // RenderDoc can lag quite a bit if the stream buffer is too large.
+        Buffer temp_buffer{instance, MemoryUsage::Upload, 0, total_size_bytes};
+        src_buffer = temp_buffer.Handle();
+        u8* const staging = temp_buffer.mapped_data.data();
+        for (auto& copy : copies) {
+            u8* const src_pointer = staging + copy.srcOffset;
+            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
+            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
+        }
+        scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {});
+    }
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    static constexpr vk::MemoryBarrier READ_BARRIER{
+        .srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
+        .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
+    };
+    static constexpr vk::MemoryBarrier WRITE_BARRIER{
+        .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+        .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
+    };
+    cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands,
+                           vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
+                           READ_BARRIER, {}, {});
+    cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
+    cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
+                           vk::PipelineStageFlagBits::eAllCommands,
+                           vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
+    return false;
+}
+
+void BufferCache::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
+    // Mark the whole buffer as CPU written to stop tracking CPU writes
+    if (!do_not_mark) {
+        Buffer& buffer = slot_buffers[buffer_id];
+        memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
+    }
+    Unregister(buffer_id);
+    scheduler.DeferOperation([this, buffer_id] { slot_buffers.erase(buffer_id); });
+}
+
+} // namespace VideoCore
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
new file mode 100644
index 00000000..0dee87cf
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -0,0 +1,129 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include "common/div_ceil.h"
+#include "common/slot_vector.h"
+#include "common/types.h"
+#include "video_core/buffer_cache/buffer.h"
+#include "video_core/buffer_cache/memory_tracker_base.h"
+#include "video_core/multi_level_page_table.h"
+
+namespace AmdGpu {
+struct Liverpool;
+}
+
+namespace Shader {
+struct Info;
+}
+
+namespace VideoCore {
+
+using BufferId = Common::SlotId;
+
+static constexpr BufferId NULL_BUFFER_ID{0};
+
+static constexpr u32 NUM_VERTEX_BUFFERS = 32;
+
+class BufferCache {
+public:
+    static constexpr u32 CACHING_PAGEBITS = 12;
+    static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
+    static constexpr u64 DEVICE_PAGESIZE = 4_KB;
+
+    struct Traits {
+        using Entry = BufferId;
+        static constexpr size_t AddressSpaceBits = 39;
+        static constexpr size_t FirstLevelBits = 14;
+        static constexpr size_t PageBits = CACHING_PAGEBITS;
+    };
+    using PageTable = MultiLevelPageTable<Traits>;
+
+    struct OverlapResult {
+        boost::container::small_vector ids;
+        VAddr begin;
+        VAddr end;
+        bool has_stream_leap = false;
+    };
+
+public:
+    explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
+                         const AmdGpu::Liverpool* liverpool, PageManager& tracker);
+    ~BufferCache();
+
+    /// Invalidates any buffer in the logical page range.
+    void InvalidateMemory(VAddr device_addr, u64 size);
+
+    /// Binds host vertex buffers for the current draw.
+    bool BindVertexBuffers(const Shader::Info& vs_info);
+
+    /// Bind host index buffer for the current draw.
+    u32 BindIndexBuffer(bool& is_indexed, u32 index_offset);
+
+    /// Obtains a buffer for the specified region.
+    [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written);
+
+    /// Return true when a region is registered on the cache
+    [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
+
+    /// Return true when a CPU region is modified from the CPU
+    [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
+
+private:
+    template <typename Func>
+    void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
+        const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
+        for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) {
+            const BufferId buffer_id = page_table[page];
+            if (!buffer_id) {
+                ++page;
+                continue;
+            }
+            Buffer& buffer = slot_buffers[buffer_id];
+            func(buffer_id, buffer);
+
+            const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
+            page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
+        }
+    }
+
+    void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
+
+    [[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
+
+    [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
+
+    void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
+
+    [[nodiscard]] BufferId CreateBuffer(VAddr device_addr, u32 wanted_size);
+
+    void Register(BufferId buffer_id);
+
+    void Unregister(BufferId buffer_id);
+
+    template <bool insert>
+    void ChangeRegister(BufferId buffer_id);
+
+    bool SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size);
+
+    void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false);
+
+    const Vulkan::Instance& instance;
+    Vulkan::Scheduler& scheduler;
+    const AmdGpu::Liverpool* liverpool;
+    PageManager& tracker;
+    StreamBuffer staging_buffer;
+    StreamBuffer stream_buffer;
+    std::recursive_mutex mutex;
+    Common::SlotVector<Buffer> slot_buffers;
+    MemoryTracker memory_tracker;
+    PageTable page_table;
+};
+
+} // namespace VideoCore
diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h
new file mode 100644
index 00000000..375701c4
--- /dev/null
+++ b/src/video_core/buffer_cache/memory_tracker_base.h
@@ -0,0 +1,175 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include "common/types.h"
+#include "video_core/buffer_cache/word_manager.h"
+
+namespace VideoCore {
+
+class MemoryTracker {
+public:
+    static constexpr size_t MAX_CPU_PAGE_BITS = 39;
+    static constexpr size_t HIGHER_PAGE_BITS = 22;
+    static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS;
+    static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL;
+    static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS);
+    static constexpr size_t MANAGER_POOL_SIZE = 32;
+    static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD;
+    using Manager = WordManager<WORDS_STACK_NEEDED>;
+
+public:
+    explicit MemoryTracker(PageManager* tracker_) : tracker{tracker_} {}
+    ~MemoryTracker() = default;
+
+    /// Returns true if a region has been modified from the CPU
+    [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
+        return IteratePages<false>(
+            query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) {
+                return manager->template IsRegionModified<Type::CPU>(offset, size);
+            });
+    }
+
+    /// Returns true if a region has been modified from the GPU
+    [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
+        return IteratePages<false>(
+            query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) {
+                return manager->template IsRegionModified<Type::GPU>(offset, size);
+            });
+    }
+
+    /// Mark region as CPU modified, notifying the device_tracker about this change
+    void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
+        IteratePages<true>(dirty_cpu_addr, query_size,
+                           [](Manager* manager, u64 offset, size_t size) {
+                               manager->template ChangeRegionState<Type::CPU, true>(
+                                   manager->GetCpuAddr() + offset, size);
+                           });
+    }
+
+    /// Unmark region as CPU modified, notifying the device_tracker about this change
+    void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
+        IteratePages<true>(dirty_cpu_addr, query_size,
+                           [](Manager* manager, u64 offset, size_t size) {
+                               manager->template ChangeRegionState<Type::CPU, false>(
+                                   manager->GetCpuAddr() + offset, size);
+                           });
+    }
+
+    /// Mark region as modified from the host GPU
+    void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
+        IteratePages<true>(dirty_cpu_addr, query_size,
+                           [](Manager* manager, u64 offset, size_t size) {
+                               manager->template ChangeRegionState<Type::GPU, true>(
+                                   manager->GetCpuAddr() + offset, size);
+                           });
+    }
+
+    /// Unmark region as modified from the host GPU
+    void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
+        IteratePages<true>(dirty_cpu_addr, query_size,
+                           [](Manager* manager, u64 offset, size_t size) {
+                               manager->template ChangeRegionState<Type::GPU, false>(
+                                   manager->GetCpuAddr() + offset, size);
+                           });
+    }
+
+    /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
+    template <typename Func>
+    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
+        IteratePages<true>(query_cpu_range, query_size,
+                           [&func](Manager* manager, u64 offset, size_t size) {
+                               manager->template ForEachModifiedRange<Type::CPU, true>(
+                                   manager->GetCpuAddr() + offset, size, func);
+                           });
+    }
+
+    /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
+    template <bool clear = true, typename Func>
+    void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
+        IteratePages<true>(query_cpu_range, query_size,
+                           [&func](Manager* manager, u64 offset, size_t size) {
+                               if constexpr (clear) {
+                                   manager->template ForEachModifiedRange<Type::GPU, true>(
+                                       manager->GetCpuAddr() + offset, size, func);
+                               } else {
+                                   manager->template ForEachModifiedRange<Type::GPU, false>(
+                                       manager->GetCpuAddr() + offset, size, func);
+                               }
+                           });
+    }
+
+private:
+    /**
+     * @brief IteratePages Iterates the L2 word manager page table.
+     * @param cpu_address Start byte cpu address
+     * @param size Size in bytes of the region to iterate.
+     * @param func Callback for each word manager.
+     * @return True when the callback returns true and stops the iteration early.
+     */
+    template <bool create_region_on_fail, typename Func>
+    bool IteratePages(VAddr cpu_address, size_t size, Func&& func) {
+        using FuncReturn = typename std::invoke_result<Func, Manager*, u64, size_t>::type;
+        static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
+        std::size_t remaining_size{size};
+        std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS};
+        u64 page_offset{cpu_address & HIGHER_PAGE_MASK};
+        while (remaining_size > 0) {
+            const std::size_t copy_amount{
+                std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)};
+            auto* manager{top_tier[page_index]};
+            if (manager) {
+                if constexpr (BOOL_BREAK) {
+                    if (func(manager, page_offset, copy_amount)) {
+                        return true;
+                    }
+                } else {
+                    func(manager, page_offset, copy_amount);
+                }
+            } else if constexpr (create_region_on_fail) {
+                CreateRegion(page_index);
+                manager = top_tier[page_index];
+                if constexpr (BOOL_BREAK) {
+                    if (func(manager, page_offset, copy_amount)) {
+                        return true;
+                    }
+                } else {
+                    func(manager, page_offset, copy_amount);
+                }
+            }
+            page_index++;
+            page_offset = 0;
+            remaining_size -= copy_amount;
+        }
+        return false;
+    }
+
+    void CreateRegion(std::size_t page_index) {
+        const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS;
+        if (free_managers.empty()) {
+            manager_pool.emplace_back();
+            auto& last_pool = manager_pool.back();
+            for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) {
+                std::construct_at(&last_pool[i], tracker, 0, HIGHER_PAGE_SIZE);
+                free_managers.push_back(&last_pool[i]);
+            }
+        }
+        // Each manager tracks a 4_MB virtual address space.
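+        // Reuse a recycled manager from the free list and point it at the new region.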
+        auto* new_manager = free_managers.back();
+        new_manager->SetCpuAddress(base_cpu_addr);
+        free_managers.pop_back();
+        top_tier[page_index] = new_manager;
+    }
+
+    PageManager* tracker;
+    std::deque<std::array<Manager, MANAGER_POOL_SIZE>> manager_pool;
+    std::vector<Manager*> free_managers;
+    std::array<Manager*, NUM_HIGH_PAGES> top_tier{};
+};
+
+} // namespace VideoCore
diff --git a/src/video_core/buffer_cache/range_set.h b/src/video_core/buffer_cache/range_set.h
new file mode 100644
index 00000000..fe54aff8
--- /dev/null
+++ b/src/video_core/buffer_cache/range_set.h
@@ -0,0 +1,159 @@
+// SPDX-FileCopyrightText: 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include "common/types.h"
+
+namespace VideoCore {
+
+template <typename T>
+using RangeSetsAllocator =
+    boost::fast_pool_allocator<T, boost::default_user_allocator_new_delete,
+                               boost::details::pool::default_mutex, 1024, 2048>;
+
+struct RangeSet {
+    using IntervalSet =
+        boost::icl::interval_set<VAddr, std::less,
+                                 ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
+                                 RangeSetsAllocator>;
+    using IntervalType = typename IntervalSet::interval_type;
+
+    explicit RangeSet() = default;
+    ~RangeSet() = default;
+
+    void Add(VAddr base_address, size_t size) {
+        const VAddr end_address = base_address + size;
+        IntervalType interval{base_address, end_address};
+        m_ranges_set.add(interval);
+    }
+
+    void Subtract(VAddr base_address, size_t size) {
+        const VAddr end_address = base_address + size;
+        IntervalType interval{base_address, end_address};
+        m_ranges_set.subtract(interval);
+    }
+
+    template <typename Func>
+    void ForEach(Func&& func) const {
+        if (m_ranges_set.empty()) {
+            return;
+        }
+        auto it = m_ranges_set.begin();
+        auto end_it = m_ranges_set.end();
+        for (; it != end_it; it++) {
+            const VAddr inter_addr_end = it->upper();
+            const VAddr inter_addr = it->lower();
+            func(inter_addr, inter_addr_end);
+        }
+    }
+
+    template <typename Func>
+    void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
+        if (m_ranges_set.empty()) {
+            return;
+        }
+        const VAddr start_address = base_addr;
+        const VAddr end_address = start_address + size;
+        const IntervalType search_interval{start_address, end_address};
+        auto it = m_ranges_set.lower_bound(search_interval);
+        if (it == m_ranges_set.end()) {
+            return;
+        }
+        auto end_it = m_ranges_set.upper_bound(search_interval);
+        for (; it != end_it; it++) {
+            VAddr inter_addr_end = it->upper();
+            VAddr inter_addr = it->lower();
+            if (inter_addr_end > end_address) {
+                inter_addr_end = end_address;
+            }
+            if (inter_addr < start_address) {
+                inter_addr = start_address;
+            }
+            func(inter_addr, inter_addr_end);
+        }
+    }
+
+    IntervalSet m_ranges_set;
+};
+
+class RangeMap {
+public:
+    using IntervalMap =
+        boost::icl::interval_map;
+    using IntervalType = typename IntervalMap::interval_type;
+
+public:
+    RangeMap() = default;
+    ~RangeMap() = default;
+
+    RangeMap(RangeMap const&) = delete;
+    RangeMap& operator=(RangeMap const&) = delete;
+
+    RangeMap(RangeMap&& other);
+    RangeMap& operator=(RangeMap&& other);
+
+    void Add(VAddr base_address, size_t size, u64 value) {
+        const VAddr end_address = base_address + size;
+        IntervalType interval{base_address, end_address};
+        m_ranges_map.add({interval, value});
+    }
+
+    void Subtract(VAddr base_address, size_t size) {
+        const VAddr end_address = base_address + size;
+        IntervalType interval{base_address, end_address};
+        m_ranges_map -= interval;
+    }
+
+    template <typename Func>
+    void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
+        if (m_ranges_map.empty()) {
+            return;
+        }
+        const VAddr start_address = base_addr;
+        const VAddr end_address = start_address + size;
+        const IntervalType search_interval{start_address, end_address};
+        auto it = m_ranges_map.lower_bound(search_interval);
+        if (it == m_ranges_map.end()) {
+            return;
+        }
+        auto end_it = m_ranges_map.upper_bound(search_interval);
+        for (; it != end_it; it++) {
+            VAddr inter_addr_end = it->first.upper();
+            VAddr inter_addr = it->first.lower();
+            if (inter_addr_end > end_address) {
+                inter_addr_end = end_address;
+            }
+            if (inter_addr < start_address) {
+                inter_addr = start_address;
+            }
+            func(inter_addr, inter_addr_end, it->second);
+        }
+    }
+
+    template <typename Func>
+    void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
+        const VAddr end_addr = base_addr + size;
+        ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, u64) {
+            if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
+                func(base_addr, gap_size);
+            }
+            base_addr = range_end;
+        });
+        if (base_addr != end_addr) {
+            func(base_addr, end_addr - base_addr);
+        }
+    }
+
+private:
+    IntervalMap m_ranges_map;
+};
+
+} // namespace VideoCore
diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h
new file mode 100644
index 00000000..549d2a9e
--- /dev/null
+++ b/src/video_core/buffer_cache/word_manager.h
@@ -0,0 +1,398 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include
+#include
+#include
+#include "common/div_ceil.h"
+#include "common/types.h"
+#include "video_core/page_manager.h"
+
+namespace VideoCore {
+
+constexpr u64 PAGES_PER_WORD = 64;
+constexpr u64 BYTES_PER_PAGE = 4_KB;
+constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
+
+enum class Type {
+    CPU,
+    GPU,
+    Untracked,
+};
+
+/// Vector tracking modified pages tightly packed with small vector optimization
+template <size_t stack_words = 1>
+struct WordsArray {
+    /// Returns the pointer to the words state
+    [[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
+        return is_short ? stack.data() : heap;
+    }
+
+    /// Returns the pointer to the words state
+    [[nodiscard]] u64* Pointer(bool is_short) noexcept {
+        return is_short ? stack.data() : heap;
+    }
+
+    std::array<u64, stack_words> stack{}; ///< Small buffers storage
+    u64* heap;                            ///< Not-small buffers pointer to the storage
+};
+
+template <size_t stack_words = 1>
+struct Words {
+    explicit Words() = default;
+    explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
+        num_words = Common::DivCeil(size_bytes, BYTES_PER_WORD);
+        if (IsShort()) {
+            cpu.stack.fill(~u64{0});
+            gpu.stack.fill(0);
+            untracked.stack.fill(~u64{0});
+        } else {
+            // Share allocation between CPU and GPU pages and set their default values
+            u64* const alloc = new u64[num_words * 3];
+            cpu.heap = alloc;
+            gpu.heap = alloc + num_words;
+            untracked.heap = alloc + num_words * 2;
+            std::fill_n(cpu.heap, num_words, ~u64{0});
+            std::fill_n(gpu.heap, num_words, 0);
+            std::fill_n(untracked.heap, num_words, ~u64{0});
+        }
+        // Clean up trailing bits
+        const u64 last_word_size = size_bytes % BYTES_PER_WORD;
+        const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
+        const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
+        const u64 last_word = (~u64{0} << shift) >> shift;
+        cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
+        untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
+    }
+
+    ~Words() {
+        Release();
+    }
+
+    Words& operator=(Words&& rhs) noexcept {
+        Release();
+        size_bytes = rhs.size_bytes;
+        num_words = rhs.num_words;
+        cpu = rhs.cpu;
+        gpu = rhs.gpu;
+        untracked = rhs.untracked;
+        rhs.cpu.heap = nullptr;
+        return *this;
+    }
+
+    Words(Words&& rhs) noexcept
+        : size_bytes{rhs.size_bytes}, num_words{rhs.num_words}, cpu{rhs.cpu}, gpu{rhs.gpu},
+          untracked{rhs.untracked} {
+        rhs.cpu.heap = nullptr;
+    }
+
+    Words& operator=(const Words&) = delete;
+    Words(const Words&) = delete;
+
+    /// Returns true when the buffer fits in the small vector optimization
+    [[nodiscard]] bool IsShort() const noexcept {
+        return num_words <= stack_words;
+    }
+
+    /// Returns the number of words of the buffer
+    [[nodiscard]] size_t NumWords() const noexcept {
+        return num_words;
+    }
+
+    /// Release buffer resources
+    void Release() {
+        if (!IsShort()) {
+            // CPU written words is the base for the heap allocation
+            delete[] cpu.heap;
+        }
+    }
+
+    template <Type type>
+    std::span<u64> Span() noexcept {
+        if constexpr (type == Type::CPU) {
+            return std::span(cpu.Pointer(IsShort()), num_words);
+        } else if constexpr (type == Type::GPU) {
+            return std::span(gpu.Pointer(IsShort()), num_words);
+        } else if constexpr (type == Type::Untracked) {
+            return std::span(untracked.Pointer(IsShort()), num_words);
+        }
+    }
+
+    template <Type type>
+    std::span<const u64> Span() const noexcept {
+        if constexpr (type == Type::CPU) {
+            return std::span(cpu.Pointer(IsShort()), num_words);
+        } else if constexpr (type == Type::GPU) {
+            return std::span(gpu.Pointer(IsShort()), num_words);
+        } else if constexpr (type == Type::Untracked) {
+            return std::span(untracked.Pointer(IsShort()), num_words);
+        }
+    }
+
+    u64 size_bytes = 0;
+    size_t num_words = 0;
+    WordsArray<stack_words> cpu;
+    WordsArray<stack_words> gpu;
+    WordsArray<stack_words> untracked;
+};
+
+template <size_t stack_words = 1>
+class WordManager {
+public:
+    explicit WordManager(PageManager* tracker_, VAddr cpu_addr_, u64 size_bytes)
+        : tracker{tracker_}, cpu_addr{cpu_addr_}, words{size_bytes} {}
+
+    explicit WordManager() = default;
+
+    void SetCpuAddress(VAddr new_cpu_addr) {
+        cpu_addr = new_cpu_addr;
+    }
+
+    VAddr GetCpuAddr() const {
+        return cpu_addr;
+    }
+
+    static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) {
+        constexpr size_t number_bits = sizeof(u64) * 8;
+        const size_t limit_page_end = number_bits - std::min(page_end, number_bits);
+        u64 bits = (word >> page_start) << page_start;
+        bits = (bits << limit_page_end) >> limit_page_end;
+        return bits;
+    }
+
+    static std::pair<size_t, size_t> GetWordPage(VAddr address) {
+        const size_t converted_address = static_cast<size_t>(address);
+        const size_t word_number = converted_address / BYTES_PER_WORD;
+        const size_t amount_pages = converted_address % BYTES_PER_WORD;
+        return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE);
+    }
+
+    template <typename Func>
+    void IterateWords(size_t offset, size_t size, Func&& func) const {
+        using FuncReturn = std::invoke_result_t<Func, std::size_t, u64>;
+        static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
+        const size_t start = static_cast<size_t>(std::max(static_cast<s64>(offset), 0LL));
+        const size_t end = static_cast<size_t>(std::max(static_cast<s64>(offset + size), 0LL));
+        if (start >= SizeBytes() || end <= start) {
+            return;
+        }
+        auto [start_word, start_page] = GetWordPage(start);
+        auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL);
+        const size_t num_words = NumWords();
+        start_word = std::min(start_word, num_words);
+        end_word = std::min(end_word, num_words);
+        const size_t diff = end_word - start_word;
+        end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD;
+        end_word = std::min(end_word, num_words);
+        end_page += diff * PAGES_PER_WORD;
+        constexpr u64 base_mask{~0ULL};
+        for (size_t word_index = start_word; word_index < end_word; word_index++) {
+            const u64 mask = ExtractBits(base_mask, start_page, end_page);
+            start_page = 0;
+            end_page -= PAGES_PER_WORD;
+            if constexpr (BOOL_BREAK) {
+                if (func(word_index, mask)) {
+                    return;
+                }
+            } else {
+                func(word_index, mask);
+            }
+        }
+    }
+
+    template <typename Func>
+    void IteratePages(u64 mask, Func&& func) const {
+        size_t offset = 0;
+        while (mask != 0) {
+            const size_t empty_bits = std::countr_zero(mask);
+            offset += empty_bits;
+            mask = mask >> empty_bits;
+
+            const size_t continuous_bits = std::countr_one(mask);
+            func(offset, continuous_bits);
+            mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0;
+            offset += continuous_bits;
+        }
+    }
+
+    /**
+     * Change the state of a range of pages
+     *
+     * @param dirty_addr Base address to mark or unmark as modified
+     * @param size       Size in bytes to mark or unmark as modified
+     */
+    template <Type type, bool enable>
+    void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) {
+        std::span<u64> state_words = words.template Span<type>();
+        [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>();
+        IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) {
+            if constexpr (type == Type::CPU) {
+                NotifyPageTracker(index, untracked_words[index], mask);
+            }
+            if constexpr (enable) {
+                state_words[index] |= mask;
+                if constexpr (type == Type::CPU) {
+                    untracked_words[index] |= mask;
+                }
+            } else {
+                state_words[index] &= ~mask;
+                if constexpr (type == Type::CPU) {
+                    untracked_words[index] &= ~mask;
+                }
+            }
+        });
+    }
+
+    /**
+     * Loop over each page in the given range, turn off those bits and notify the tracker if
+     * needed. Call the given function on each turned off range.
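+     * Adjacent turned-off pages are coalesced into maximal contiguous ranges before the
+     * callback is invoked.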
+ * + * @param query_cpu_range Base CPU address to loop over + * @param size Size in bytes of the CPU range to loop over + * @param func Function to call for each turned off region + */ + template + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + static_assert(type != Type::Untracked); + + std::span state_words = words.template Span(); + [[maybe_unused]] std::span untracked_words = words.template Span(); + const size_t offset = query_cpu_range - cpu_addr; + bool pending = false; + size_t pending_offset{}; + size_t pending_pointer{}; + const auto release = [&]() { + func(cpu_addr + pending_offset * BYTES_PER_PAGE, + (pending_pointer - pending_offset) * BYTES_PER_PAGE); + }; + IterateWords(offset, size, [&](size_t index, u64 mask) { + if constexpr (type == Type::GPU) { + mask &= ~untracked_words[index]; + } + const u64 word = state_words[index] & mask; + if constexpr (clear) { + if constexpr (type == Type::CPU) { + NotifyPageTracker(index, untracked_words[index], mask); + } + state_words[index] &= ~mask; + if constexpr (type == Type::CPU) { + untracked_words[index] &= ~mask; + } + } + const size_t base_offset = index * PAGES_PER_WORD; + IteratePages(word, [&](size_t pages_offset, size_t pages_size) { + const auto reset = [&]() { + pending_offset = base_offset + pages_offset; + pending_pointer = base_offset + pages_offset + pages_size; + }; + if (!pending) { + reset(); + pending = true; + return; + } + if (pending_pointer == base_offset + pages_offset) { + pending_pointer += pages_size; + return; + } + release(); + reset(); + }); + }); + if (pending) { + release(); + } + } + + /** + * Returns true when a region has been modified + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + + const std::span state_words = words.template Span(); + [[maybe_unused]] const std::span untracked_words = + words.template Span(); + bool result = false; + IterateWords(offset, size, [&](size_t index, u64 mask) { + if constexpr (type == Type::GPU) { + mask &= ~untracked_words[index]; + } + const u64 word = state_words[index] & mask; + if (word != 0) { + result = true; + return true; + } + return false; + }); + return result; + } + + /// Returns the number of words of the manager + [[nodiscard]] size_t NumWords() const noexcept { + return words.NumWords(); + } + + /// Returns the size in bytes of the manager + [[nodiscard]] u64 SizeBytes() const noexcept { + return words.size_bytes; + } + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return words.IsShort(); + } + +private: + template + u64* Array() noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + /** + * Notify tracker about changes in the CPU tracking state of a word in the buffer + * + * @param word_index Index 
to the word to notify to the tracker + * @param current_bits Current state of the word + * @param new_bits New state of the word + * + * @tparam add_to_tracker True when the tracker should start tracking the new pages + */ + template + void NotifyPageTracker(u64 word_index, u64 current_bits, u64 new_bits) const { + u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits; + VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; + IteratePages(changed_bits, [&](size_t offset, size_t size) { + tracker->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, size * BYTES_PER_PAGE, + add_to_tracker ? 1 : -1); + }); + } + + PageManager* tracker; + VAddr cpu_addr = 0; + Words words; +}; + +} // namespace VideoCore diff --git a/src/video_core/multi_level_page_table.h b/src/video_core/multi_level_page_table.h new file mode 100644 index 00000000..527476f3 --- /dev/null +++ b/src/video_core/multi_level_page_table.h @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include + +#include "common/object_pool.h" +#include "common/types.h" + +namespace VideoCore { + +template +class MultiLevelPageTable final { + using Entry = typename Traits::Entry; + + static constexpr size_t AddressSpaceBits = Traits::AddressSpaceBits; + static constexpr size_t FirstLevelBits = Traits::FirstLevelBits; + static constexpr size_t PageBits = Traits::PageBits; + static constexpr size_t FirstLevelShift = AddressSpaceBits - FirstLevelBits; + static constexpr size_t SecondLevelBits = FirstLevelShift - PageBits; + static constexpr size_t NumEntriesPerL1Page = 1ULL << SecondLevelBits; + + using L1Page = std::array; + +public: + explicit MultiLevelPageTable() : first_level_map{1ULL << FirstLevelBits, nullptr} {} + + ~MultiLevelPageTable() noexcept = default; + + [[nodiscard]] Entry* find(size_t page) { + const size_t l1_page = page >> SecondLevelBits; + const size_t l2_page = page & (NumEntriesPerL1Page - 1); + if (!first_level_map[l1_page]) { + return nullptr; + } + return &(*first_level_map[l1_page])[l2_page]; + } + + [[nodiscard]] const Entry& operator[](size_t page) const { + const size_t l1_page = page >> SecondLevelBits; + const size_t l2_page = page & (NumEntriesPerL1Page - 1); + if (!first_level_map[l1_page]) { + first_level_map[l1_page] = page_alloc.Create(); + } + return (*first_level_map[l1_page])[l2_page]; + } + + [[nodiscard]] Entry& operator[](size_t page) { + const size_t l1_page = page >> SecondLevelBits; + const size_t l2_page = page & (NumEntriesPerL1Page - 1); + if (!first_level_map[l1_page]) { + first_level_map[l1_page] = page_alloc.Create(); + } + return (*first_level_map[l1_page])[l2_page]; + } + +private: + std::vector first_level_map{}; + Common::ObjectPool page_alloc; +}; + +} // namespace VideoCore diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp new file mode 100644 index 00000000..6225f11b --- /dev/null +++ b/src/video_core/page_manager.cpp @@ -0,0 +1,260 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include "common/alignment.h" +#include "common/assert.h" +#include "common/error.h" +#include "video_core/page_manager.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" + +#ifndef _WIN64 +#include +#include +#include +#include +#include +#ifdef ENABLE_USERFAULTFD +#include +#endif +#else +#include +#endif + +namespace VideoCore { + +constexpr size_t PAGESIZE = 
+constexpr size_t PAGESIZE = 4_KB;
+constexpr size_t PAGEBITS = 12;
+
+#ifdef _WIN64
+struct PageManager::Impl {
+    Impl(Vulkan::Rasterizer* rasterizer_) {
+        rasterizer = rasterizer_;
+
+        veh_handle = AddVectoredExceptionHandler(0, GuestFaultSignalHandler);
+        ASSERT_MSG(veh_handle, "Failed to register an exception handler");
+    }
+
+    void OnMap(VAddr address, size_t size) {}
+
+    void OnUnmap(VAddr address, size_t size) {}
+
+    void Protect(VAddr address, size_t size, bool allow_write) {
+        DWORD prot = allow_write ? PAGE_READWRITE : PAGE_READONLY;
+        DWORD old_prot{};
+        BOOL result = VirtualProtect(std::bit_cast<LPVOID>(address), size, prot, &old_prot);
+        ASSERT_MSG(result != 0, "Region protection failed");
+    }
+
+    static LONG WINAPI GuestFaultSignalHandler(EXCEPTION_POINTERS* pExp) noexcept {
+        const u32 ec = pExp->ExceptionRecord->ExceptionCode;
+        if (ec == EXCEPTION_ACCESS_VIOLATION) {
+            const auto info = pExp->ExceptionRecord->ExceptionInformation;
+            if (info[0] == 1) { // Write violation
+                rasterizer->InvalidateMemory(info[1], sizeof(u64));
+                return EXCEPTION_CONTINUE_EXECUTION;
+            } /* else {
+                UNREACHABLE();
+            }*/
+        }
+        return EXCEPTION_CONTINUE_SEARCH; // pass further
+    }
+
+    inline static Vulkan::Rasterizer* rasterizer;
+    void* veh_handle{};
+};
+#elif ENABLE_USERFAULTFD
+struct PageManager::Impl {
+    Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} {
+        uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+        ASSERT_MSG(uffd != -1, "{}", Common::GetLastErrorMsg());
+
+        // Request uffdio features from kernel.
+        uffdio_api api;
+        api.api = UFFD_API;
+        api.features = UFFD_FEATURE_THREAD_ID;
+        const int ret = ioctl(uffd, UFFDIO_API, &api);
+        ASSERT(ret == 0 && api.api == UFFD_API);
+
+        // Create uffd handler thread
+        ufd_thread = std::jthread([&](std::stop_token token) { UffdHandler(token); });
+    }
+
+    void OnMap(VAddr address, size_t size) {
+        uffdio_register reg;
+        reg.range.start = address;
+        reg.range.len = size;
+        reg.mode = UFFDIO_REGISTER_MODE_WP;
+        const int ret = ioctl(uffd, UFFDIO_REGISTER, &reg);
+        ASSERT_MSG(ret != -1, "Uffdio register failed");
+    }
+
+    void OnUnmap(VAddr address, size_t size) {
+        uffdio_range range;
+        range.start = address;
+        range.len = size;
+        const int ret = ioctl(uffd, UFFDIO_UNREGISTER, &range);
+        ASSERT_MSG(ret != -1, "Uffdio unregister failed");
+    }
+
+    void Protect(VAddr address, size_t size, bool allow_write) {
+        uffdio_writeprotect wp;
+        wp.range.start = address;
+        wp.range.len = size;
+        wp.mode = allow_write ? 0 : UFFDIO_WRITEPROTECT_MODE_WP;
+        const int ret = ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
+        ASSERT_MSG(ret != -1, "Uffdio writeprotect failed with error: {}",
+                   Common::GetLastErrorMsg());
+    }
+
+    void UffdHandler(std::stop_token token) {
+        while (!token.stop_requested()) {
+            pollfd pollfd;
+            pollfd.fd = uffd;
+            pollfd.events = POLLIN;
+
+            // Block until the descriptor is ready for data reads.
+            const int pollres = poll(&pollfd, 1, -1);
+            switch (pollres) {
+            case -1:
+                perror("Poll userfaultfd");
+                continue;
+                break;
+            case 0:
+                continue;
+            case 1:
+                break;
+            default:
+                UNREACHABLE_MSG("Unexpected number of descriptors {} out of poll", pollres);
+            }
+
+            // We don't want an error condition to have occurred.
+            ASSERT_MSG(!(pollfd.revents & POLLERR), "POLLERR on userfaultfd");
+
+            // We waited until there is data to read, we don't care about anything else.
+            if (!(pollfd.revents & POLLIN)) {
+                continue;
+            }
+
+            // Read message from kernel.
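+            // The descriptor is opened O_NONBLOCK, so even after POLLIN the read below can
+            // race with another consumer and fail with EAGAIN; that case is treated as a
+            // spurious wakeup rather than an error.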
+            uffd_msg msg;
+            const int readret = read(uffd, &msg, sizeof(msg));
+            ASSERT_MSG(readret != -1 || errno == EAGAIN, "Unexpected result of uffd read");
+            if (errno == EAGAIN) {
+                continue;
+            }
+            ASSERT_MSG(readret == sizeof(msg), "Unexpected short read, exiting");
+            ASSERT(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP);
+
+            // Notify rasterizer about the fault.
+            const VAddr addr = msg.arg.pagefault.address;
+            const VAddr addr_page = Common::AlignDown(addr, PAGESIZE);
+            rasterizer->InvalidateMemory(addr_page, PAGESIZE);
+        }
+    }
+
+    Vulkan::Rasterizer* rasterizer;
+    std::jthread ufd_thread;
+    int uffd;
+};
+#else
+struct PageManager::Impl {
+    Impl(Vulkan::Rasterizer* rasterizer_) {
+        rasterizer = rasterizer_;
+
+#ifdef __APPLE__
+        // Read-only memory write results in SIGBUS on Apple.
+        static constexpr int SignalType = SIGBUS;
+#else
+        static constexpr int SignalType = SIGSEGV;
+#endif
+        sigset_t signal_mask;
+        sigemptyset(&signal_mask);
+        sigaddset(&signal_mask, SignalType);
+
+        using HandlerType = decltype(sigaction::sa_sigaction);
+
+        struct sigaction guest_access_fault {};
+        guest_access_fault.sa_flags = SA_SIGINFO | SA_ONSTACK;
+        guest_access_fault.sa_sigaction = &GuestFaultSignalHandler;
+        guest_access_fault.sa_mask = signal_mask;
+        sigaction(SignalType, &guest_access_fault, nullptr);
+    }
+
+    void OnMap(VAddr address, size_t size) {}
+
+    void OnUnmap(VAddr address, size_t size) {}
+
+    void Protect(VAddr address, size_t size, bool allow_write) {
+        mprotect(reinterpret_cast<void*>(address), size,
+                 PROT_READ | (allow_write ? PROT_WRITE : 0));
+    }
+
+    static void GuestFaultSignalHandler(int sig, siginfo_t* info, void* raw_context) {
+        ucontext_t* ctx = reinterpret_cast<ucontext_t*>(raw_context);
+        const VAddr address = reinterpret_cast<VAddr>(info->si_addr);
+#ifdef __APPLE__
+        const u32 err = ctx->uc_mcontext->__es.__err;
+#else
+        const greg_t err = ctx->uc_mcontext.gregs[REG_ERR];
+#endif
+        if (err & 0x2) {
+            rasterizer->InvalidateMemory(address, sizeof(u64));
+        } else {
+            // Read not supported!
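+            // On x86 the page-fault error code sets bit 1 (0x2) for write faults; pages are
+            // only ever downgraded to read-only here, so a read fault reaching this handler
+            // indicates a bug.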
+ UNREACHABLE(); + } + } + + inline static Vulkan::Rasterizer* rasterizer; +}; +#endif + +PageManager::PageManager(Vulkan::Rasterizer* rasterizer_) + : impl{std::make_unique(rasterizer_)}, rasterizer{rasterizer_} {} + +PageManager::~PageManager() = default; + +void PageManager::OnGpuMap(VAddr address, size_t size) { + impl->OnMap(address, size); +} + +void PageManager::OnGpuUnmap(VAddr address, size_t size) { + impl->OnUnmap(address, size); +} + +void PageManager::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) { + static constexpr u64 PageShift = 12; + + std::scoped_lock lk{mutex}; + const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1; + const u64 page_start = addr >> PageShift; + const u64 page_end = page_start + num_pages; + + const auto pages_interval = + decltype(cached_pages)::interval_type::right_open(page_start, page_end); + if (delta > 0) { + cached_pages.add({pages_interval, delta}); + } + + const auto& range = cached_pages.equal_range(pages_interval); + for (const auto& [range, count] : boost::make_iterator_range(range)) { + const auto interval = range & pages_interval; + const VAddr interval_start_addr = boost::icl::first(interval) << PageShift; + const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift; + const u32 interval_size = interval_end_addr - interval_start_addr; + if (delta > 0 && count == delta) { + impl->Protect(interval_start_addr, interval_size, false); + } else if (delta < 0 && count == -delta) { + impl->Protect(interval_start_addr, interval_size, true); + } else { + ASSERT(count >= 0); + } + } + + if (delta < 0) { + cached_pages.add({pages_interval, delta}); + } +} + +} // namespace VideoCore diff --git a/src/video_core/page_manager.h b/src/video_core/page_manager.h new file mode 100644 index 00000000..0dc022aa --- /dev/null +++ b/src/video_core/page_manager.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include "common/types.h" + +namespace Vulkan { +class Rasterizer; +} + +namespace VideoCore { + +class PageManager { +public: + explicit PageManager(Vulkan::Rasterizer* rasterizer); + ~PageManager(); + + /// Register a range of mapped gpu memory. + void OnGpuMap(VAddr address, size_t size); + + /// Unregister a range of gpu memory that was unmapped. 
+ void OnGpuUnmap(VAddr address, size_t size); + + /// Increase/decrease the number of surface in pages touching the specified region + void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta); + +private: + struct Impl; + std::unique_ptr impl; + Vulkan::Rasterizer* rasterizer; + std::mutex mutex; + boost::icl::interval_map cached_pages; +}; + +} // namespace VideoCore diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 6810bf34..c78d629e 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -67,8 +67,8 @@ RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* : window{window_}, liverpool{liverpool_}, instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, draw_scheduler{instance}, present_scheduler{instance}, flip_scheduler{instance}, swapchain{instance, window}, - texture_cache{instance, draw_scheduler} { - rasterizer = std::make_unique(instance, draw_scheduler, texture_cache, liverpool); + rasterizer{std::make_unique(instance, draw_scheduler, liverpool)}, + texture_cache{rasterizer->GetTextureCache()} { const u32 num_images = swapchain.GetImageCount(); const vk::Device device = instance.GetDevice(); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 3fe9267f..8178c88d 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -47,7 +47,7 @@ public: Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address, bool is_eop) { const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, cpu_address); + const auto image_id = texture_cache.FindImage(info, false); auto& image = texture_cache.GetImage(image_id); return PrepareFrameInternal(image, is_eop); } @@ -61,7 +61,7 @@ public: const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address) { vo_buffers_addr.emplace_back(cpu_address); const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, cpu_address); + const auto image_id = texture_cache.FindImage(info, false); return texture_cache.GetImage(image_id); } @@ -88,7 +88,7 @@ private: Scheduler flip_scheduler; Swapchain swapchain; std::unique_ptr rasterizer; - VideoCore::TextureCache texture_cache; + VideoCore::TextureCache& texture_cache; vk::UniqueCommandPool command_pool; std::vector present_frames; std::queue free_queue; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index d8e5f7fa..8a98e968 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -3,11 +3,10 @@ #include #include "common/alignment.h" -#include "core/memory.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/texture_cache/texture_cache.h" namespace Vulkan { @@ -51,6 +50,12 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler }); } + const vk::PushConstantRange push_constants = { + .stageFlags = 
vk::ShaderStageFlagBits::eCompute, + .offset = 0, + .size = sizeof(Shader::PushData), + }; + const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, .bindingCount = static_cast(bindings.size()), @@ -62,8 +67,8 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler const vk::PipelineLayoutCreateInfo layout_info = { .setLayoutCount = 1U, .pSetLayouts = &set_layout, - .pushConstantRangeCount = 0, - .pPushConstantRanges = nullptr, + .pushConstantRangeCount = 1U, + .pPushConstantRanges = &push_constants, }; pipeline_layout = instance.GetDevice().createPipelineLayoutUnique(layout_info); @@ -82,35 +87,18 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler ComputePipeline::~ComputePipeline() = default; -bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging, +bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const { // Bind resource buffers and textures. boost::container::static_vector buffer_infos; boost::container::static_vector image_infos; boost::container::small_vector set_writes; + Shader::PushData push_data{}; u32 binding{}; - for (const auto& buffer : info.buffers) { + for (u32 i = 0; const auto& buffer : info.buffers) { const auto vsharp = buffer.GetVsharp(info); - const u32 size = vsharp.GetSize(); const VAddr address = vsharp.base_address; - if (buffer.is_storage) { - texture_cache.OnCpuWrite(address); - } - const u32 offset = staging.Copy(address, size, - buffer.is_storage ? instance.StorageMinAlignment() - : instance.UniformMinAlignment()); - buffer_infos.emplace_back(staging.Handle(), offset, size); - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = buffer.is_storage ? vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, - .pBufferInfo = &buffer_infos.back(), - }); - // Most of the time when a metadata is updated with a shader it gets cleared. It means we // can skip the whole dispatch and update the tracked state instead. Also, it is not // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we will @@ -125,6 +113,31 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)"); } } + const u32 size = vsharp.GetSize(); + if (buffer.is_written) { + texture_cache.InvalidateMemory(address, size); + } + const u32 alignment = + buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); + const auto [vk_buffer, offset] = + buffer_cache.ObtainBuffer(address, size, buffer.is_written); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + if (adjust != 0) { + ASSERT(adjust % 4 == 0); + push_data.AddOffset(i, adjust); + } + buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = buffer.is_storage ? 
vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &buffer_infos.back(), + }); + i++; } for (const auto& image_desc : info.images) { @@ -168,6 +181,8 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s } const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pushConstants(*pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(push_data), + &push_data); cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, set_writes); return true; } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 4cdcccfc..16de5635 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -6,19 +6,15 @@ #include "shader_recompiler/runtime_info.h" #include "video_core/renderer_vulkan/vk_common.h" -namespace Core { -class MemoryManager; -} - namespace VideoCore { +class BufferCache; class TextureCache; -} +} // namespace VideoCore namespace Vulkan { class Instance; class Scheduler; -class StreamBuffer; class ComputePipeline { public: @@ -31,7 +27,7 @@ public: return *pipeline; } - bool BindResources(Core::MemoryManager* memory, StreamBuffer& staging, + bool BindResources(VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const; private: diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 7b00a911..91ff999e 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -5,13 +5,13 @@ #include #include +#include "common/alignment.h" #include "common/assert.h" -#include "core/memory.h" #include "video_core/amdgpu/resource.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/texture_cache/texture_cache.h" namespace Vulkan { @@ -32,9 +32,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul BuildDescSetLayout(); const vk::PushConstantRange push_constants = { - .stageFlags = vk::ShaderStageFlagBits::eVertex, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, .offset = 0, - .size = 2 * sizeof(u32), + .size = sizeof(Shader::PushData), }; const vk::DescriptorSetLayout set_layout = *desc_layout; @@ -328,25 +328,43 @@ void GraphicsPipeline::BuildDescSetLayout() { desc_layout = instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci); } -void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging, +void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, + VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const { - BindVertexBuffers(staging); - // Bind resource buffers and textures. 
boost::container::static_vector buffer_infos; boost::container::static_vector image_infos; boost::container::small_vector set_writes; + Shader::PushData push_data{}; u32 binding{}; for (const auto& stage : stages) { - for (const auto& buffer : stage.buffers) { + if (stage.uses_step_rates) { + push_data.step0 = regs.vgt_instance_step_rate_0; + push_data.step1 = regs.vgt_instance_step_rate_1; + } + for (u32 i = 0; const auto& buffer : stage.buffers) { const auto vsharp = buffer.GetVsharp(stage); - const VAddr address = vsharp.base_address; - const u32 size = vsharp.GetSize(); - const u32 offset = staging.Copy(address, size, - buffer.is_storage ? instance.StorageMinAlignment() - : instance.UniformMinAlignment()); - buffer_infos.emplace_back(staging.Handle(), offset, size); + if (vsharp) { + const VAddr address = vsharp.base_address; + if (texture_cache.IsMeta(address)) { + LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a PS shader (buffer)"); + } + const u32 size = vsharp.GetSize(); + const u32 alignment = buffer.is_storage ? instance.StorageMinAlignment() + : instance.UniformMinAlignment(); + const auto [vk_buffer, offset] = + buffer_cache.ObtainBuffer(address, size, buffer.is_written); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + if (adjust != 0) { + ASSERT(adjust % 4 == 0); + push_data.AddOffset(i, adjust); + } + buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust); + } else { + buffer_infos.emplace_back(VK_NULL_HANDLE, 0, VK_WHOLE_SIZE); + } set_writes.push_back({ .dstSet = VK_NULL_HANDLE, .dstBinding = binding++, @@ -356,10 +374,7 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& : vk::DescriptorType::eUniformBuffer, .pBufferInfo = &buffer_infos.back(), }); - - if (texture_cache.IsMeta(address)) { - LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a PS shader (buffer)"); - } + i++; } boost::container::static_vector tsharps; @@ -406,86 +421,15 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& } } + const auto cmdbuf = scheduler.CommandBuffer(); if (!set_writes.empty()) { - const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0, set_writes); } -} - -void GraphicsPipeline::BindVertexBuffers(StreamBuffer& staging) const { - const auto& vs_info = stages[u32(Shader::Stage::Vertex)]; - if (vs_info.vs_inputs.empty()) { - return; - } - - std::array host_buffers; - std::array host_offsets; - boost::container::static_vector guest_buffers; - - struct BufferRange { - VAddr base_address; - VAddr end_address; - u64 offset; // offset in the mapped memory - - size_t GetSize() const { - return end_address - base_address; - } - }; - - // Calculate buffers memory overlaps - boost::container::static_vector ranges{}; - for (const auto& input : vs_info.vs_inputs) { - if (input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate0 || - input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate1) { - continue; - } - - const auto& buffer = vs_info.ReadUd(input.sgpr_base, input.dword_offset); - if (buffer.GetSize() == 0) { - continue; - } - guest_buffers.emplace_back(buffer); - ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize()); - } - std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) { - return lhv.base_address < rhv.base_address; - }); - - 
boost::container::static_vector ranges_merged{ranges[0]}; - for (auto range : ranges) { - auto& prev_range = ranges_merged.back(); - if (prev_range.end_address < range.base_address) { - ranges_merged.emplace_back(range); - } else { - prev_range.end_address = std::max(prev_range.end_address, range.end_address); - } - } - - // Map buffers - for (auto& range : ranges_merged) { - range.offset = staging.Copy(range.base_address, range.GetSize(), 4); - } - - // Bind vertex buffers - const size_t num_buffers = guest_buffers.size(); - for (u32 i = 0; i < num_buffers; ++i) { - const auto& buffer = guest_buffers[i]; - const auto& host_buffer = std::ranges::find_if( - ranges_merged.cbegin(), ranges_merged.cend(), [&](const BufferRange& range) { - return (buffer.base_address >= range.base_address && - buffer.base_address < range.end_address); - }); - assert(host_buffer != ranges_merged.cend()); - - host_buffers[i] = staging.Handle(); - host_offsets[i] = host_buffer->offset + buffer.base_address - host_buffer->base_address; - } - - if (num_buffers > 0) { - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindVertexBuffers(0, num_buffers, host_buffers.data(), host_offsets.data()); - } + cmdbuf.pushConstants(*pipeline_layout, + vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, 0U, + sizeof(push_data), &push_data); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, Handle()); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index e1564f8f..f818d980 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -7,13 +7,10 @@ #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_common.h" -namespace Core { -class MemoryManager; -} - namespace VideoCore { +class BufferCache; class TextureCache; -} +} // namespace VideoCore namespace Vulkan { @@ -22,7 +19,6 @@ static constexpr u32 MaxShaderStages = 5; class Instance; class Scheduler; -class StreamBuffer; using Liverpool = AmdGpu::Liverpool; @@ -64,7 +60,7 @@ public: std::array modules); ~GraphicsPipeline(); - void BindResources(Core::MemoryManager* memory, StreamBuffer& staging, + void BindResources(const Liverpool::Regs& regs, VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const; vk::Pipeline Handle() const noexcept { @@ -75,6 +71,10 @@ public: return *pipeline_layout; } + const Shader::Info& GetStage(Shader::Stage stage) const noexcept { + return stages[u32(stage)]; + } + bool IsEmbeddedVs() const noexcept { static constexpr size_t EmbeddedVsHash = 0x9b2da5cf47f8c29f; return key.stage_hashes[u32(Shader::Stage::Vertex)] == EmbeddedVsHash; @@ -90,7 +90,6 @@ public: private: void BuildDescSetLayout(); - void BindVertexBuffers(StreamBuffer& staging) const; private: const Instance& instance; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 735303a3..2d396daf 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -204,7 +204,8 @@ bool Instance::CreateDevice() { // The next two extensions are required to be available together in order to support write masks color_write_en = add_extension(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME); color_write_en &= add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); - const auto calibrated_timestamps = 
add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME); + const bool calibrated_timestamps = add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME); + const bool robustness = add_extension(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME); // These extensions are promoted by Vulkan 1.3, but for greater compatibility we use Vulkan 1.2 // with extensions. @@ -303,12 +304,19 @@ bool Instance::CreateDevice() { .workgroupMemoryExplicitLayoutScalarBlockLayout = true, .workgroupMemoryExplicitLayout8BitAccess = true, .workgroupMemoryExplicitLayout16BitAccess = true, - }}; + }, + vk::PhysicalDeviceRobustness2FeaturesEXT{ + .nullDescriptor = true, + }, + }; if (!color_write_en) { device_chain.unlink(); device_chain.unlink(); } + if (!robustness) { + device_chain.unlink(); + } try { device = physical_device.createDeviceUnique(device_chain.get()); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index a77b298b..d41723ec 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -5,7 +5,6 @@ #include #include "shader_recompiler/ir/basic_block.h" -#include "shader_recompiler/object_pool.h" #include "shader_recompiler/profile.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" @@ -51,8 +50,8 @@ private: Shader::Profile profile{}; GraphicsPipelineKey graphics_key{}; u64 compute_key{}; - Shader::ObjectPool inst_pool; - Shader::ObjectPool block_pool; + Common::ObjectPool inst_pool; + Common::ObjectPool block_pool; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index ff5e97d5..51de09f7 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -13,22 +13,17 @@ namespace Vulkan { -static constexpr vk::BufferUsageFlags VertexIndexFlags = - vk::BufferUsageFlagBits::eVertexBuffer | vk::BufferUsageFlagBits::eIndexBuffer | - vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eUniformBuffer | - vk::BufferUsageFlagBits::eStorageBuffer; - Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, - VideoCore::TextureCache& texture_cache_, AmdGpu::Liverpool* liverpool_) - : instance{instance_}, scheduler{scheduler_}, texture_cache{texture_cache_}, - liverpool{liverpool_}, memory{Core::Memory::Instance()}, - pipeline_cache{instance, scheduler, liverpool}, - vertex_index_buffer{instance, scheduler, VertexIndexFlags, 2_GB, BufferType::Upload} { + AmdGpu::Liverpool* liverpool_) + : instance{instance_}, scheduler{scheduler_}, page_manager{this}, + buffer_cache{instance, scheduler, liverpool_, page_manager}, + texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_}, + memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} { if (!Config::nullGpu()) { liverpool->BindRasterizer(this); } - - memory->SetInstance(&instance); + memory->SetRasterizer(this); + wfi_event = instance.GetDevice().createEventUnique({}); } Rasterizer::~Rasterizer() = default; @@ -38,29 +33,24 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) { const auto cmdbuf = scheduler.CommandBuffer(); const auto& regs = liverpool->regs; - const u32 num_indices = SetupIndexBuffer(is_indexed, index_offset); const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); if (!pipeline) { return; } try { - 
pipeline->BindResources(memory, vertex_index_buffer, texture_cache); + pipeline->BindResources(regs, buffer_cache, texture_cache); } catch (...) { UNREACHABLE(); } + const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex); + buffer_cache.BindVertexBuffers(vs_info); + const u32 num_indices = buffer_cache.BindIndexBuffer(is_indexed, index_offset); + BeginRendering(); UpdateDynamicState(*pipeline); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle()); - - const u32 step_rates[] = { - regs.vgt_instance_step_rate_0, - regs.vgt_instance_step_rate_1, - }; - cmdbuf.pushConstants(pipeline->GetLayout(), vk::ShaderStageFlagBits::eVertex, 0u, - sizeof(step_rates), &step_rates); if (is_indexed) { cmdbuf.drawIndexed(num_indices, regs.num_instances.NumInstances(), 0, 0, 0); } else { @@ -82,8 +72,7 @@ void Rasterizer::DispatchDirect() { } try { - const auto has_resources = - pipeline->BindResources(memory, vertex_index_buffer, texture_cache); + const auto has_resources = pipeline->BindResources(buffer_cache, texture_cache); if (!has_resources) { return; } @@ -131,7 +120,7 @@ void Rasterizer::BeginRendering() { state.color_images[state.num_color_attachments] = image.image; state.color_attachments[state.num_color_attachments++] = { .imageView = *image_view.image_view, - .imageLayout = vk::ImageLayout::eGeneral, + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, .loadOp = is_clear ? vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad, .storeOp = vk::AttachmentStoreOp::eStore, .clearValue = @@ -168,45 +157,19 @@ void Rasterizer::BeginRendering() { scheduler.BeginRendering(state); } -u32 Rasterizer::SetupIndexBuffer(bool& is_indexed, u32 index_offset) { - // Emulate QuadList primitive type with CPU made index buffer. - const auto& regs = liverpool->regs; - if (liverpool->regs.primitive_type == Liverpool::PrimitiveType::QuadList) { - // ASSERT_MSG(!is_indexed, "Using QuadList primitive with indexed draw"); - is_indexed = true; +void Rasterizer::InvalidateMemory(VAddr addr, u64 size) { + buffer_cache.InvalidateMemory(addr, size); + texture_cache.InvalidateMemory(addr, size); +} - // Emit indices. - const u32 index_size = 3 * regs.num_indices; - const auto [data, offset, _] = vertex_index_buffer.Map(index_size); - LiverpoolToVK::EmitQuadToTriangleListIndices(data, regs.num_indices); - vertex_index_buffer.Commit(index_size); +void Rasterizer::MapMemory(VAddr addr, u64 size) { + page_manager.OnGpuMap(addr, size); +} - // Bind index buffer. - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset, vk::IndexType::eUint16); - return index_size / sizeof(u16); - } - if (!is_indexed) { - return regs.num_indices; - } - - // Figure out index type and size. - const bool is_index16 = regs.index_buffer_type.index_type == Liverpool::IndexType::Index16; - const vk::IndexType index_type = is_index16 ? vk::IndexType::eUint16 : vk::IndexType::eUint32; - const u32 index_size = is_index16 ? sizeof(u16) : sizeof(u32); - - // Upload index data to stream buffer. - const auto index_address = regs.index_base_address.Address(); - const u32 index_buffer_size = (index_offset + regs.num_indices) * index_size; - const auto [data, offset, _] = vertex_index_buffer.Map(index_buffer_size); - std::memcpy(data, index_address, index_buffer_size); - vertex_index_buffer.Commit(index_buffer_size); - - // Bind index buffer. 
- const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset + index_offset * index_size, - index_type); - return regs.num_indices; +void Rasterizer::UnmapMemory(VAddr addr, u64 size) { + buffer_cache.InvalidateMemory(addr, size); + texture_cache.UnmapMemory(addr, size); + page_manager.OnGpuUnmap(addr, size); } void Rasterizer::UpdateDynamicState(const GraphicsPipeline& pipeline) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 64dc87ef..685ba6e0 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -3,8 +3,10 @@ #pragma once +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" +#include "video_core/texture_cache/texture_cache.h" namespace AmdGpu { struct Liverpool; @@ -14,10 +16,6 @@ namespace Core { class MemoryManager; } -namespace VideoCore { -class TextureCache; -} - namespace Vulkan { class Scheduler; @@ -26,9 +24,13 @@ class GraphicsPipeline; class Rasterizer { public: explicit Rasterizer(const Instance& instance, Scheduler& scheduler, - VideoCore::TextureCache& texture_cache, AmdGpu::Liverpool* liverpool); + AmdGpu::Liverpool* liverpool); ~Rasterizer(); + [[nodiscard]] VideoCore::TextureCache& GetTextureCache() noexcept { + return texture_cache; + } + void Draw(bool is_indexed, u32 index_offset = 0); void DispatchDirect(); @@ -36,12 +38,13 @@ public: void ScopeMarkerBegin(const std::string& str); void ScopeMarkerEnd(); + void InvalidateMemory(VAddr addr, u64 size); + void MapMemory(VAddr addr, u64 size); + void UnmapMemory(VAddr addr, u64 size); + u64 Flush(); private: - u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset); - void MapMemory(VAddr addr, size_t size); - void BeginRendering(); void UpdateDynamicState(const GraphicsPipeline& pipeline); @@ -51,11 +54,13 @@ private: private: const Instance& instance; Scheduler& scheduler; - VideoCore::TextureCache& texture_cache; + VideoCore::PageManager page_manager; + VideoCore::BufferCache buffer_cache; + VideoCore::TextureCache texture_cache; AmdGpu::Liverpool* liverpool; Core::MemoryManager* memory; PipelineCache pipeline_cache; - StreamBuffer vertex_index_buffer; + vk::UniqueEvent wfi_event; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 48c3af7a..b82d558c 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -6,6 +6,7 @@ #include #include #include "common/types.h" +#include "common/unique_function.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_resource_pool.h" @@ -97,8 +98,8 @@ public: } /// Defers an operation until the gpu has reached the current cpu tick. 
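The change just below swaps std::function for Common::UniqueFunction because std::function requires copy-constructible callables, while deferred operations may capture move-only state (the rasterizer now owns handles such as vk::UniqueEvent). A minimal illustration of the distinction, using std::unique_ptr as a stand-in for a move-only resource (hypothetical example, not project code):

    #include <functional>
    #include <memory>

    int main() {
        auto resource = std::make_unique<int>(42);  // move-only, like a Vulkan unique handle
        auto task = [r = std::move(resource)] {     // capturing it makes the lambda move-only
            return *r;
        };
        // std::function<int()> f{std::move(task)}; // would not compile: needs a copyable callable
        return task();                              // a move-only wrapper can still store and invoke it
    }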
- void DeferOperation(auto&& func) { - pending_ops.emplace(func, CurrentTick()); + void DeferOperation(Common::UniqueFunction&& func) { + pending_ops.emplace(std::move(func), CurrentTick()); } static std::mutex submit_mutex; @@ -115,7 +116,7 @@ private: vk::CommandBuffer current_cmdbuf; std::condition_variable_any event_cv; struct PendingOp { - std::function callback; + Common::UniqueFunction callback; u64 gpu_tick; }; std::queue pending_ops; diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp deleted file mode 100644 index 116f7896..00000000 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include "common/alignment.h" -#include "common/assert.h" -#include "video_core/renderer_vulkan/vk_instance.h" -#include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" - -namespace Vulkan { - -namespace { - -std::string_view BufferTypeName(BufferType type) { - switch (type) { - case BufferType::Upload: - return "Upload"; - case BufferType::Download: - return "Download"; - case BufferType::Stream: - return "Stream"; - default: - return "Invalid"; - } -} - -vk::MemoryPropertyFlags MakePropertyFlags(BufferType type) { - switch (type) { - case BufferType::Upload: - return vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent; - case BufferType::Download: - return vk::MemoryPropertyFlagBits::eHostVisible | - vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached; - case BufferType::Stream: - return vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | - vk::MemoryPropertyFlagBits::eHostCoherent; - default: - UNREACHABLE_MSG("Unknown buffer type {}", static_cast(type)); - return vk::MemoryPropertyFlagBits::eHostVisible; - } -} - -static std::optional FindMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, - vk::MemoryPropertyFlags wanted) { - for (u32 i = 0; i < properties.memoryTypeCount; ++i) { - const auto flags = properties.memoryTypes[i].propertyFlags; - if ((flags & wanted) == wanted) { - return i; - } - } - return std::nullopt; -} - -/// Get the preferred host visible memory type. 
-u32 GetMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, BufferType type) { - vk::MemoryPropertyFlags flags = MakePropertyFlags(type); - std::optional preferred_type = FindMemoryType(properties, flags); - - constexpr std::array remove_flags = { - vk::MemoryPropertyFlagBits::eHostCached, - vk::MemoryPropertyFlagBits::eHostCoherent, - }; - - for (u32 i = 0; i < remove_flags.size() && !preferred_type; i++) { - flags &= ~remove_flags[i]; - preferred_type = FindMemoryType(properties, flags); - } - ASSERT_MSG(preferred_type, "No suitable memory type found"); - return preferred_type.value(); -} - -constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; -constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; - -} // Anonymous namespace - -StreamBuffer::StreamBuffer(const Instance& instance_, Scheduler& scheduler_, - vk::BufferUsageFlags usage_, u64 size, BufferType type_) - : instance{instance_}, scheduler{scheduler_}, device{instance.GetDevice()}, - stream_buffer_size{size}, usage{usage_}, type{type_} { - CreateBuffers(size); - ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); - ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); -} - -StreamBuffer::~StreamBuffer() { - device.unmapMemory(memory); - device.destroyBuffer(buffer); - device.freeMemory(memory); -} - -std::tuple StreamBuffer::Map(u64 size, u64 alignment) { - if (!is_coherent && type == BufferType::Stream) { - size = Common::AlignUp(size, instance.NonCoherentAtomSize()); - } - - ASSERT(size <= stream_buffer_size); - mapped_size = size; - - if (alignment > 0) { - offset = Common::AlignUp(offset, alignment); - } - - bool invalidate{false}; - if (offset + size > stream_buffer_size) { - // The buffer would overflow, save the amount of used watches and reset the state. - invalidate = true; - invalidation_mark = current_watch_cursor; - current_watch_cursor = 0; - offset = 0; - - // Swap watches and reset waiting cursors. - std::swap(previous_watches, current_watches); - wait_cursor = 0; - wait_bound = 0; - } - - const u64 mapped_upper_bound = offset + size; - WaitPendingOperations(mapped_upper_bound); - - return std::make_tuple(mapped + offset, offset, invalidate); -} - -void StreamBuffer::Commit(u64 size) { - if (!is_coherent && type == BufferType::Stream) { - size = Common::AlignUp(size, instance.NonCoherentAtomSize()); - } - - ASSERT_MSG(size <= mapped_size, "Reserved size {} is too small compared to {}", mapped_size, - size); - - const vk::MappedMemoryRange range = { - .memory = memory, - .offset = offset, - .size = size, - }; - - if (!is_coherent && type == BufferType::Download) { - device.invalidateMappedMemoryRanges(range); - } else if (!is_coherent) { - device.flushMappedMemoryRanges(range); - } - - offset += size; - - if (current_watch_cursor + 1 >= current_watches.size()) { - // Ensure that there are enough watches. 
- ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK); - } - auto& watch = current_watches[current_watch_cursor++]; - watch.upper_bound = offset; - watch.tick = scheduler.CurrentTick(); -} - -void StreamBuffer::CreateBuffers(u64 prefered_size) { - const vk::Device device = instance.GetDevice(); - const auto memory_properties = instance.GetPhysicalDevice().getMemoryProperties(); - const u32 preferred_type = GetMemoryType(memory_properties, type); - const vk::MemoryType mem_type = memory_properties.memoryTypes[preferred_type]; - const u32 preferred_heap = mem_type.heapIndex; - is_coherent = - static_cast(mem_type.propertyFlags & vk::MemoryPropertyFlagBits::eHostCoherent); - - // Substract from the preferred heap size some bytes to avoid getting out of memory. - const vk::DeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size; - // As per DXVK's example, using `heap_size / 2` - const vk::DeviceSize allocable_size = heap_size / 2; - buffer = device.createBuffer({ - .size = std::min(prefered_size, allocable_size), - .usage = usage, - }); - - const auto requirements_chain = - device - .getBufferMemoryRequirements2( - {.buffer = buffer}); - - const auto& requirements = requirements_chain.get(); - const auto& dedicated_requirements = requirements_chain.get(); - - stream_buffer_size = static_cast(requirements.memoryRequirements.size); - - LOG_INFO(Render_Vulkan, "Creating {} buffer with size {} KiB with flags {}", - BufferTypeName(type), stream_buffer_size / 1024, - vk::to_string(mem_type.propertyFlags)); - - if (dedicated_requirements.prefersDedicatedAllocation) { - vk::StructureChain alloc_chain = - {}; - - auto& alloc_info = alloc_chain.get(); - alloc_info.allocationSize = requirements.memoryRequirements.size; - alloc_info.memoryTypeIndex = preferred_type; - - auto& dedicated_alloc_info = alloc_chain.get(); - dedicated_alloc_info.buffer = buffer; - - memory = device.allocateMemory(alloc_chain.get()); - } else { - memory = device.allocateMemory({ - .allocationSize = requirements.memoryRequirements.size, - .memoryTypeIndex = preferred_type, - }); - } - - device.bindBufferMemory(buffer, memory, 0); - mapped = reinterpret_cast(device.mapMemory(memory, 0, VK_WHOLE_SIZE)); - - if (instance.HasDebuggingToolAttached()) { - SetObjectName(device, buffer, "StreamBuffer({}): {} KiB {}", BufferTypeName(type), - stream_buffer_size / 1024, vk::to_string(mem_type.propertyFlags)); - SetObjectName(device, memory, "StreamBufferMemory({}): {} Kib {}", BufferTypeName(type), - stream_buffer_size / 1024, vk::to_string(mem_type.propertyFlags)); - } -} - -void StreamBuffer::ReserveWatches(std::vector& watches, std::size_t grow_size) { - watches.resize(watches.size() + grow_size); -} - -void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) { - if (!invalidation_mark) { - return; - } - while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) { - auto& watch = previous_watches[wait_cursor]; - wait_bound = watch.upper_bound; - scheduler.Wait(watch.tick); - ++wait_cursor; - } -} - -u64 StreamBuffer::Copy(VAddr src, size_t size, size_t alignment /*= 0*/) { - const auto [data, offset, _] = Map(size, alignment); - std::memcpy(data, reinterpret_cast(src), size); - Commit(size); - return offset; -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h deleted file mode 100644 index f7957ac0..00000000 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ /dev/null @@ -1,89 +0,0 @@ 
-// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include -#include -#include "common/types.h" -#include "video_core/renderer_vulkan/vk_common.h" - -namespace Vulkan { - -enum class BufferType : u32 { - Upload = 0, - Download = 1, - Stream = 2, -}; - -class Instance; -class Scheduler; - -class StreamBuffer final { - static constexpr std::size_t MAX_BUFFER_VIEWS = 3; - -public: - explicit StreamBuffer(const Instance& instance, Scheduler& scheduler, - vk::BufferUsageFlags usage, u64 size, - BufferType type = BufferType::Stream); - ~StreamBuffer(); - - /** - * Reserves a region of memory from the stream buffer. - * @param size Size to reserve. - * @returns A pair of a raw memory pointer (with offset added), and the buffer offset - */ - std::tuple Map(u64 size, u64 alignment = 0); - - /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. - void Commit(u64 size); - - /// Maps and commits a memory region with user provided data - u64 Copy(VAddr src, size_t size, size_t alignment = 0); - - vk::Buffer Handle() const noexcept { - return buffer; - } - -private: - struct Watch { - u64 tick{}; - u64 upper_bound{}; - }; - - /// Creates Vulkan buffer handles committing the required the required memory. - void CreateBuffers(u64 prefered_size); - - /// Increases the amount of watches available. - void ReserveWatches(std::vector& watches, std::size_t grow_size); - - void WaitPendingOperations(u64 requested_upper_bound); - -private: - const Instance& instance; ///< Vulkan instance. - Scheduler& scheduler; ///< Command scheduler. - - vk::Device device; - vk::Buffer buffer; ///< Mapped buffer. - vk::DeviceMemory memory; ///< Memory allocation. - u8* mapped{}; ///< Pointer to the mapped memory - u64 stream_buffer_size{}; ///< Stream buffer size. - vk::BufferUsageFlags usage{}; - BufferType type; - - u64 offset{}; ///< Buffer iterator. - u64 mapped_size{}; ///< Size reserved for the current copy. - bool is_coherent{}; ///< True if the buffer is coherent - - std::vector current_watches; ///< Watches recorded in the current iteration. - std::size_t current_watch_cursor{}; ///< Count of watches, reset on invalidation. - std::optional invalidation_mark; ///< Number of watches used in the previous cycle. - - std::vector previous_watches; ///< Watches used in the previous iteration. - std::size_t wait_cursor{}; ///< Last watch being waited for completion. - u64 wait_bound{}; ///< Highest offset being watched for completion. 
-}; - -} // namespace Vulkan diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index e01a61ae..94917be0 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -260,7 +260,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { case AmdGpu::TilingMode::Display_MacroTiled: case AmdGpu::TilingMode::Texture_MacroTiled: case AmdGpu::TilingMode::Depth_MacroTiled: { - ASSERT(!props.is_cube && !props.is_block); + // ASSERT(!props.is_cube && !props.is_block); ASSERT(num_samples == 1); std::tie(mip_info.pitch, mip_info.size) = ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index); diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 04bedaff..ef6163c4 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -61,23 +61,24 @@ vk::Format TrySwizzleFormat(vk::Format format, u32 dst_sel) { return format; } -ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept - : is_storage{is_storage} { +ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage_) noexcept + : is_storage{is_storage_} { type = ConvertImageViewType(image.GetType()); format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); range.base.level = image.base_level; range.base.layer = image.base_array; range.extent.levels = image.last_level + 1; range.extent.layers = image.last_array + 1; - mapping.r = ConvertComponentSwizzle(image.dst_sel_x); - mapping.g = ConvertComponentSwizzle(image.dst_sel_y); - mapping.b = ConvertComponentSwizzle(image.dst_sel_z); - mapping.a = ConvertComponentSwizzle(image.dst_sel_w); + if (!is_storage) { + mapping.r = ConvertComponentSwizzle(image.dst_sel_x); + mapping.g = ConvertComponentSwizzle(image.dst_sel_y); + mapping.b = ConvertComponentSwizzle(image.dst_sel_z); + mapping.a = ConvertComponentSwizzle(image.dst_sel_w); + } // Check for unfortunate case of storage images being swizzled const u32 num_comps = AmdGpu::NumComponents(image.GetDataFmt()); const u32 dst_sel = image.DstSelect(); if (is_storage && !IsIdentityMapping(dst_sel, num_comps)) { - mapping = vk::ComponentMapping{}; if (auto new_format = TrySwizzleFormat(format, dst_sel); new_format != format) { format = new_format; return; diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 7b8a5554..53596f8e 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -3,103 +3,22 @@ #include #include "common/assert.h" -#include "common/config.h" -#include "core/virtual_memory.h" +#include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/tile_manager.h" -#ifndef _WIN64 -#include -#include - -#define PAGE_NOACCESS PROT_NONE -#define PAGE_READWRITE (PROT_READ | PROT_WRITE) -#define PAGE_READONLY PROT_READ -#else -#include - -void mprotect(void* addr, size_t len, int prot) { - DWORD old_prot{}; - BOOL result = VirtualProtect(addr, len, prot, &old_prot); - ASSERT_MSG(result != 0, "Region protection failed"); -} - -#endif - namespace VideoCore { -static TextureCache* g_texture_cache = nullptr; - -#ifndef _WIN64 -void GuestFaultSignalHandler(int sig, siginfo_t* info, void* 
raw_context) { - ucontext_t* ctx = reinterpret_cast(raw_context); - const VAddr address = reinterpret_cast(info->si_addr); - -#ifdef __APPLE__ - const u32 err = ctx->uc_mcontext->__es.__err; -#else - const greg_t err = ctx->uc_mcontext.gregs[REG_ERR]; -#endif - - if (err & 0x2) { - g_texture_cache->OnCpuWrite(address); - } else { - // Read not supported! - UNREACHABLE(); - } -} -#else -LONG WINAPI GuestFaultSignalHandler(EXCEPTION_POINTERS* pExp) noexcept { - const u32 ec = pExp->ExceptionRecord->ExceptionCode; - if (ec == EXCEPTION_ACCESS_VIOLATION) { - const auto info = pExp->ExceptionRecord->ExceptionInformation; - if (info[0] == 1) { // Write violation - g_texture_cache->OnCpuWrite(info[1]); - return EXCEPTION_CONTINUE_EXECUTION; - } /* else { - UNREACHABLE(); - }*/ - } - return EXCEPTION_CONTINUE_SEARCH; // pass further -} -#endif - static constexpr u64 StreamBufferSize = 512_MB; static constexpr u64 PageShift = 12; -TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_) - : instance{instance_}, scheduler{scheduler_}, - staging{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, StreamBufferSize, - Vulkan::BufferType::Upload}, +TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, + BufferCache& buffer_cache_, PageManager& tracker_) + : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_}, + staging{instance, scheduler, MemoryUsage::Upload, StreamBufferSize}, tile_manager{instance, scheduler} { - -#ifndef _WIN64 -#ifdef __APPLE__ - // Read-only memory write results in SIGBUS on Apple. - static constexpr int SignalType = SIGBUS; -#else - static constexpr int SignalType = SIGSEGV; -#endif - - sigset_t signal_mask; - sigemptyset(&signal_mask); - sigaddset(&signal_mask, SignalType); - - using HandlerType = decltype(sigaction::sa_sigaction); - - struct sigaction guest_access_fault {}; - guest_access_fault.sa_flags = SA_SIGINFO | SA_ONSTACK; - guest_access_fault.sa_sigaction = &GuestFaultSignalHandler; - guest_access_fault.sa_mask = signal_mask; - sigaction(SignalType, &guest_access_fault, nullptr); -#else - veh_handle = AddVectoredExceptionHandler(0, GuestFaultSignalHandler); - ASSERT_MSG(veh_handle, "Failed to register an exception handler"); -#endif - g_texture_cache = this; - ImageInfo info; info.pixel_format = vk::Format::eR8G8B8A8Unorm; info.type = vk::ImageType::e2D; @@ -110,15 +29,11 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& void(slot_image_views.insert(instance, view_info, slot_images[null_id], null_id)); } -TextureCache::~TextureCache() { -#if _WIN64 - RemoveVectoredExceptionHandler(veh_handle); -#endif -} +TextureCache::~TextureCache() = default; -void TextureCache::OnCpuWrite(VAddr address) { - std::unique_lock lock{m_page_table}; - ForEachImageInRegion(address, 1 << PageShift, [&](ImageId image_id, Image& image) { +void TextureCache::InvalidateMemory(VAddr address, size_t size) { + std::unique_lock lock{mutex}; + ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) { // Ensure image is reuploaded when accessed again. image.flags |= ImageFlagBits::CpuModified; // Untrack image, so the range is unprotected and the guest can write freely. 
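The texture cache hunks that follow replace the flat hash map with the MultiLevelPageTable introduced earlier in this patch; the page_table.find(page) and page_table[page] calls below resolve through its lazily allocated second level. A hypothetical instantiation sketching how a Traits type drives the split (the bit widths and Entry type here are illustrative assumptions, not the project's actual parameters):

    #include <cstddef>
    #include <boost/container/small_vector.hpp>

    struct ImageId {
        unsigned index; // stand-in for the real slot-vector id type
    };

    struct PageTableTraits {
        // One entry per tracked page, listing the images that overlap it.
        using Entry = boost::container::small_vector<ImageId, 16>;
        static constexpr std::size_t AddressSpaceBits = 40; // assumed guest VA width
        static constexpr std::size_t FirstLevelBits = 10;   // 1024 first-level slots
        static constexpr std::size_t PageBits = 20;         // assumed tracking granularity
    };

    // Derived: SecondLevelBits = (40 - 10) - 20 = 10, so each lazily created
    // L1Page holds 1024 entries:
    // VideoCore::MultiLevelPageTable<PageTableTraits> page_table;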
@@ -126,8 +41,28 @@ void TextureCache::OnCpuWrite(VAddr address) {
     });
 }

+void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) {
+    std::scoped_lock lk{mutex};
+
+    boost::container::small_vector deleted_images;
+    ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); });
+    for (const ImageId id : deleted_images) {
+        Image& image = slot_images[id];
+        if (True(image.flags & ImageFlagBits::Tracked)) {
+            UntrackImage(image, id);
+        }
+        // TODO: Download image data back to host.
+        UnregisterImage(id);
+        DeleteImage(id);
+    }
+}
+
 ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) {
-    std::unique_lock lock{m_page_table};
+    if (info.guest_address == 0) [[unlikely]] {
+        return NULL_IMAGE_VIEW_ID;
+    }
+
+    std::unique_lock lock{mutex};
     boost::container::small_vector image_ids;
     ForEachImageInRegion(
         info.guest_address, info.guest_size_bytes, [&](ImageId image_id, Image& image) {
@@ -183,10 +118,6 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo
 }

 ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) {
-    if (info.guest_address == 0) [[unlikely]] {
-        return slot_image_views[NULL_IMAGE_VIEW_ID];
-    }
-
     const ImageId image_id = FindImage(info);
     Image& image = slot_images[image_id];
     auto& usage = image.info.usage;
@@ -310,10 +241,7 @@ void TextureCache::RefreshImage(Image& image) {
         buffer = *upload_buffer;
     } else {
         // Upload data to the staging buffer.
-        const auto [data, offset_, _] = staging.Map(image.info.guest_size_bytes, 16);
-        std::memcpy(data, (void*)image.info.guest_address, image.info.guest_size_bytes);
-        staging.Commit(image.info.guest_size_bytes);
-        offset = offset_;
+        offset = staging.Copy(image.info.guest_address, image.info.guest_size_bytes, 16);
     }

     const auto& num_layers = image.info.resources.layers;
@@ -344,9 +272,6 @@ void TextureCache::RefreshImage(Image& image) {
     }
     cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy);
-
-    image.Transit(vk::ImageLayout::eGeneral,
-                  vk::AccessFlagBits::eMemoryWrite | vk::AccessFlagBits::eMemoryRead);
 }

 vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) {
@@ -362,8 +287,6 @@ void TextureCache::RegisterImage(ImageId image_id) {
     image.flags |= ImageFlagBits::Registered;
     ForEachPage(image.cpu_addr, image.info.guest_size_bytes,
                 [this, image_id](u64 page) { page_table[page].push_back(image_id); });
-
-    image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eNone);
 }

 void TextureCache::UnregisterImage(ImageId image_id) {
@@ -373,11 +296,11 @@ void TextureCache::UnregisterImage(ImageId image_id) {
     image.flags &= ~ImageFlagBits::Registered;
     ForEachPage(image.cpu_addr, image.info.guest_size_bytes, [this, image_id](u64 page) {
         const auto page_it = page_table.find(page);
-        if (page_it == page_table.end()) {
-            ASSERT_MSG(false, "Unregistering unregistered page=0x{:x}", page << PageShift);
+        if (page_it == nullptr) {
+            UNREACHABLE_MSG("Unregistering unregistered page=0x{:x}", page << PageShift);
             return;
         }
-        auto& image_ids = page_it.value();
+        auto& image_ids = *page_it;
         const auto vector_it = std::ranges::find(image_ids, image_id);
         if (vector_it == image_ids.end()) {
             ASSERT_MSG(false, "Unregistering unregistered image in page=0x{:x}", page << PageShift);
@@ -393,7 +316,7 @@ void TextureCache::TrackImage(Image& image, ImageId image_id) {
         return;
     }
     image.flags |= ImageFlagBits::Tracked;
-    UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, 1);
+    tracker.UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, 1);
 }

 void TextureCache::UntrackImage(Image& image, ImageId image_id) {
@@ -401,40 +324,34 @@ void TextureCache::UntrackImage(Image& image, ImageId image_id) {
         return;
     }
     image.flags &= ~ImageFlagBits::Tracked;
-    UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, -1);
+    tracker.UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, -1);
 }

-void TextureCache::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) {
-    std::scoped_lock lk{mutex};
-    const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1;
-    const u64 page_start = addr >> PageShift;
-    const u64 page_end = page_start + num_pages;
+void TextureCache::DeleteImage(ImageId image_id) {
+    Image& image = slot_images[image_id];
+    ASSERT_MSG(False(image.flags & ImageFlagBits::Tracked), "Image was not untracked");
+    ASSERT_MSG(False(image.flags & ImageFlagBits::Registered), "Image was not unregistered");

-    const auto pages_interval =
-        decltype(cached_pages)::interval_type::right_open(page_start, page_end);
-    if (delta > 0) {
-        cached_pages.add({pages_interval, delta});
+    // Remove any registered meta areas.
+    const auto& meta_info = image.info.meta_info;
+    if (meta_info.cmask_addr) {
+        surface_metas.erase(meta_info.cmask_addr);
+    }
+    if (meta_info.fmask_addr) {
+        surface_metas.erase(meta_info.fmask_addr);
+    }
+    if (meta_info.htile_addr) {
+        surface_metas.erase(meta_info.htile_addr);
     }

-    const auto& range = cached_pages.equal_range(pages_interval);
-    for (const auto& [range, count] : boost::make_iterator_range(range)) {
-        const auto interval = range & pages_interval;
-        const VAddr interval_start_addr = boost::icl::first(interval) << PageShift;
-        const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift;
-        const u32 interval_size = interval_end_addr - interval_start_addr;
-        void* addr = reinterpret_cast<void*>(interval_start_addr);
-        if (delta > 0 && count == delta) {
-            mprotect(addr, interval_size, PAGE_READONLY);
-        } else if (delta < 0 && count == -delta) {
-            mprotect(addr, interval_size, PAGE_READWRITE);
-        } else {
-            ASSERT(count >= 0);
+    // Reclaim image and any image views it references.
+    scheduler.DeferOperation([this, image_id] {
+        Image& image = slot_images[image_id];
+        for (const ImageViewId image_view_id : image.image_view_ids) {
+            slot_image_views.erase(image_view_id);
         }
-    }
-
-    if (delta < 0) {
-        cached_pages.add({pages_interval, delta});
-    }
+        slot_images.erase(image_id);
+    });
 }

 } // namespace VideoCore
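The interval-map logic deleted above is what used to flip host page protections; the `tracker.UpdatePagesCachedCount` calls suggest an equivalent now lives in `PageManager`. A compact sketch of the underlying idea, assuming (not verified against PageManager itself) that it mirrors the removed code — protect a page when its first tracked image appears, unprotect when the last one leaves:

```cpp
#include <sys/mman.h> // POSIX mprotect; the removed code shimmed this on Windows
#include <cstdint>
#include <unordered_map>

using VAddr = std::uint64_t;
constexpr std::uint64_t PageShift = 12;

// Sketch of refcounted page protection: cached_pages maps page index to the
// number of tracked images touching that page.
std::unordered_map<std::uint64_t, int> cached_pages;

void UpdatePagesCachedCount(VAddr addr, std::uint64_t size, int delta) {
    const std::uint64_t page_end = (addr + size - 1) >> PageShift;
    for (std::uint64_t page = addr >> PageShift; page <= page_end; ++page) {
        int& count = cached_pages[page];
        const int prev = count;
        count += delta;
        void* host = reinterpret_cast<void*>(page << PageShift);
        if (prev == 0 && count > 0) {
            mprotect(host, 1ULL << PageShift, PROT_READ); // first user: write-protect
        } else if (prev > 0 && count == 0) {
            mprotect(host, 1ULL << PageShift, PROT_READ | PROT_WRITE); // last user gone
        }
    }
}
```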
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index aef33bcf..17a09898 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -4,12 +4,11 @@
 #pragma once

 #include
-#include
 #include "common/slot_vector.h"
 #include "video_core/amdgpu/resource.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
+#include "video_core/multi_level_page_table.h"
 #include "video_core/texture_cache/image.h"
 #include "video_core/texture_cache/image_view.h"
 #include "video_core/texture_cache/sampler.h"
@@ -21,31 +20,28 @@ struct BufferAttributeGroup;

 namespace VideoCore {

+class BufferCache;
+class PageManager;
+
 class TextureCache {
-    // This is the page shift for adding images into the hash map. It isn't related to
-    // the page size of the guest or the host and is chosen for convenience. A number too
-    // small will increase the number of hash map lookups per image, while too large will
-    // increase the number of images per page.
-    static constexpr u64 PageBits = 20;
-    static constexpr u64 PageMask = (1ULL << PageBits) - 1;
-
-    struct MetaDataInfo {
-        enum class Type {
-            CMask,
-            FMask,
-            HTile,
-        };
-
-        Type type;
-        bool is_cleared;
+    struct Traits {
+        using Entry = boost::container::small_vector;
+        static constexpr size_t AddressSpaceBits = 39;
+        static constexpr size_t FirstLevelBits = 9;
+        static constexpr size_t PageBits = 22;
     };
+    using PageTable = MultiLevelPageTable;

 public:
-    explicit TextureCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler);
+    explicit TextureCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
+                          BufferCache& buffer_cache, PageManager& tracker);
     ~TextureCache();

     /// Invalidates any image in the logical page range.
-    void OnCpuWrite(VAddr address);
+    void InvalidateMemory(VAddr address, size_t size);
+
+    /// Evicts any images that overlap the unmapped range.
+    void UnmapMemory(VAddr cpu_addr, size_t size);

     /// Retrieves the image handle of the image with the provided attributes.
     [[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true);
@@ -101,8 +97,8 @@ private:
     template
     static void ForEachPage(PAddr addr, size_t size, Func&& func) {
         static constexpr bool RETURNS_BOOL = std::is_same_v, bool>;
-        const u64 page_end = (addr + size - 1) >> PageBits;
-        for (u64 page = addr >> PageBits; page <= page_end; ++page) {
+        const u64 page_end = (addr + size - 1) >> Traits::PageBits;
+        for (u64 page = addr >> Traits::PageBits; page <= page_end; ++page) {
             if constexpr (RETURNS_BOOL) {
                 if (func(page)) {
                     break;
@@ -120,14 +116,14 @@ private:
         boost::container::small_vector images;
         ForEachPage(cpu_addr, size, [this, &images, cpu_addr, size, func](u64 page) {
             const auto it = page_table.find(page);
-            if (it == page_table.end()) {
+            if (it == nullptr) {
                 if constexpr (BOOL_BREAK) {
                     return false;
                 } else {
                     return;
                 }
             }
-            for (const ImageId image_id : it->second) {
+            for (const ImageId image_id : *it) {
                 Image& image = slot_images[image_id];
                 if (image.flags & ImageFlagBits::Picked) {
                     continue;
@@ -166,25 +162,32 @@ private:
     /// Stop tracking CPU reads and writes for image
     void UntrackImage(Image& image, ImageId image_id);

-    /// Increase/decrease the number of surface in pages touching the specified region
-    void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta);
+    /// Removes the image and any views/surface metas that reference it.
+    void DeleteImage(ImageId image_id);

 private:
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
-    Vulkan::StreamBuffer staging;
+    BufferCache& buffer_cache;
+    PageManager& tracker;
+    StreamBuffer staging;
     TileManager tile_manager;
     Common::SlotVector slot_images;
     Common::SlotVector slot_image_views;
     tsl::robin_map samplers;
-    tsl::robin_pg_map> page_table;
-    boost::icl::interval_map cached_pages;
-    tsl::robin_map surface_metas;
+    PageTable page_table;
     std::mutex mutex;
-#ifdef _WIN64
-    void* veh_handle{};
-#endif
-    std::mutex m_page_table;
+
+    struct MetaDataInfo {
+        enum class Type {
+            CMask,
+            FMask,
+            HTile,
+        };
+        Type type;
+        bool is_cleared;
+    };
+    tsl::robin_map surface_metas;
 };

 } // namespace VideoCore
diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp
index 4f199f81..d3a7d796 100644
--- a/src/video_core/texture_cache/tile_manager.cpp
+++ b/src/video_core/texture_cache/tile_manager.cpp
@@ -183,10 +183,12 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
     case vk::Format::eB8G8R8A8Srgb:
     case vk::Format::eB8G8R8A8Unorm:
     case vk::Format::eR8G8B8A8Unorm:
+    case vk::Format::eR8G8B8A8Uint:
     case vk::Format::eR32Sfloat:
     case vk::Format::eR32Uint:
     case vk::Format::eR16G16Sfloat:
         return vk::Format::eR32Uint;
+    case vk::Format::eBc1RgbaSrgbBlock:
     case vk::Format::eBc1RgbaUnormBlock:
     case vk::Format::eBc4UnormBlock:
     case vk::Format::eR32G32Sfloat:
@@ -200,11 +202,20 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
     case vk::Format::eBc5UnormBlock:
     case vk::Format::eBc7SrgbBlock:
     case vk::Format::eBc7UnormBlock:
+    case vk::Format::eBc6HUfloatBlock:
+    case vk::Format::eR32G32B32A32Sfloat:
         return vk::Format::eR32G32B32A32Uint;
     default:
         break;
     }
-    LOG_ERROR(Render_Vulkan, "Unexpected format for demotion {}", vk::to_string(format));
+
+    // Log missing formats only once to avoid spamming the log.
+    static constexpr size_t MaxFormatIndex = 256;
+    static std::array logged_formats{};
+    if (const u32 index = u32(format); !logged_formats[index]) {
+        LOG_ERROR(Render_Vulkan, "Unexpected format for demotion {}", vk::to_string(format));
+        logged_formats[index] = true;
+    }
     return format;
 }
@@ -236,8 +247,11 @@ struct DetilerParams {
     u32 sizes[14];
 };

+static constexpr size_t StreamBufferSize = 128_MB;
+
 TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler)
-    : instance{instance}, scheduler{scheduler} {
+    : instance{instance}, scheduler{scheduler},
+      stream_buffer{instance, scheduler, MemoryUsage::Stream, StreamBufferSize} {
     static const std::array detiler_shaders{
         HostShaders::DETILE_M8X1_COMP,  HostShaders::DETILE_M8X2_COMP,
         HostShaders::DETILE_M32X1_COMP, HostShaders::DETILE_M32X2_COMP,
@@ -336,8 +350,7 @@ TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /*
         .flags = !is_storage ? VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT |
                                    VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT
                              : static_cast(0),
-        .usage = is_large_buffer ? VMA_MEMORY_USAGE_AUTO_PREFER_HOST
-                                 : VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
+        .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
         .requiredFlags = !is_storage ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT : static_cast(0),
     };
@@ -373,37 +386,46 @@ std::optional TileManager::TryDetile(Image& image) {
     const auto* detiler = GetDetiler(image);
     if (!detiler) {
-        LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
-                  vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
+        if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled) {
+            LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
+                      vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
+        }
         return std::nullopt;
     }

     // Prepare input buffer
-    auto in_buffer = AllocBuffer(image.info.guest_size_bytes);
-    Upload(in_buffer, reinterpret_cast(image.info.guest_address),
-           image.info.guest_size_bytes);
+    const u32 image_size = image.info.guest_size_bytes;
+    const auto [in_buffer, in_offset] = [&] -> std::pair {
+        // Use stream buffer for smaller textures.
+        if (image_size <= StreamBufferSize) {
+            u32 offset = stream_buffer.Copy(image.info.guest_address, image_size);
+            return {stream_buffer.Handle(), offset};
+        }
+        // Request temporary host buffer for larger sizes.
+        auto in_buffer = AllocBuffer(image_size);
+        const auto addr = reinterpret_cast(image.info.guest_address);
+        Upload(in_buffer, addr, image_size);
+        scheduler.DeferOperation([=, this]() { FreeBuffer(in_buffer); });
+        return {in_buffer.first, 0};
+    }();

     // Prepare output buffer
-    auto out_buffer = AllocBuffer(image.info.guest_size_bytes, true);
-
-    scheduler.DeferOperation([=, this]() {
-        FreeBuffer(in_buffer);
-        FreeBuffer(out_buffer);
-    });
+    auto out_buffer = AllocBuffer(image_size, true);
+    scheduler.DeferOperation([=, this]() { FreeBuffer(out_buffer); });

     auto cmdbuf = scheduler.CommandBuffer();
     cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *detiler->pl);

     const vk::DescriptorBufferInfo input_buffer_info{
-        .buffer = in_buffer.first,
-        .offset = 0,
-        .range = image.info.guest_size_bytes,
+        .buffer = in_buffer,
+        .offset = in_offset,
+        .range = image_size,
     };

     const vk::DescriptorBufferInfo output_buffer_info{
         .buffer = out_buffer.first,
         .offset = 0,
-        .range = image.info.guest_size_bytes,
+        .range = image_size,
     };

     std::vector set_writes{
@@ -442,16 +464,16 @@ std::optional TileManager::TryDetile(Image& image) {
     cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u,
                          sizeof(params), &params);

-    ASSERT((image.info.guest_size_bytes % 64) == 0);
+    ASSERT((image_size % 64) == 0);
     const auto bpp = image.info.num_bits * (image.info.props.is_block ? 16u : 1u);
-    const auto num_tiles = image.info.guest_size_bytes / (64 * (bpp / 8));
+    const auto num_tiles = image_size / (64 * (bpp / 8));
     cmdbuf.dispatch(num_tiles, 1, 1);

     const vk::BufferMemoryBarrier post_barrier{
         .srcAccessMask = vk::AccessFlagBits::eShaderWrite,
         .dstAccessMask = vk::AccessFlagBits::eTransferRead,
         .buffer = out_buffer.first,
-        .size = image.info.guest_size_bytes,
+        .size = image_size,
     };
     cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
                            vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
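The design point in the lambda above is lifetime, not just size: stream-buffer memory is recycled by offset, so the small path needs no cleanup, while the scratch path must defer a `FreeBuffer` until the GPU has consumed it. A condensed restatement of that split, using the member names from the diff (`Copy`/`Handle` semantics assumed from how they are used there):

```cpp
// Sketch: pick the upload path for detiling input by size.
std::pair<vk::Buffer, u32> PrepareInput(u32 image_size, VAddr guest_address) {
    if (image_size <= StreamBufferSize) {
        // Small: suballocate from the persistent stream buffer; nothing to free later.
        const u32 offset = stream_buffer.Copy(guest_address, image_size);
        return {stream_buffer.Handle(), offset};
    }
    // Large: one-off scratch buffer, reclaimed once this frame's work completes.
    auto scratch = AllocBuffer(image_size);
    Upload(scratch, reinterpret_cast<const void*>(guest_address), image_size);
    scheduler.DeferOperation([=, this] { FreeBuffer(scratch); });
    return {scratch.first, 0};
}
```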
diff --git a/src/video_core/texture_cache/tile_manager.h b/src/video_core/texture_cache/tile_manager.h
index 9102da08..00765b1f 100644
--- a/src/video_core/texture_cache/tile_manager.h
+++ b/src/video_core/texture_cache/tile_manager.h
@@ -4,7 +4,7 @@
 #pragma once

 #include "common/types.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
+#include "video_core/buffer_cache/buffer.h"
 #include "video_core/texture_cache/image.h"

 namespace VideoCore {
@@ -34,7 +34,7 @@ struct DetilerContext {

 class TileManager {
 public:
-    using ScratchBuffer = std::pair;
+    using ScratchBuffer = std::pair;

     TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler);
     ~TileManager();
@@ -51,6 +51,7 @@ private:
 private:
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
+    StreamBuffer stream_buffer;
     std::array detilers;
 };

From 341034fc3056005501f0d0bfaa59a78210a8bdbb Mon Sep 17 00:00:00 2001
From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com>
Date: Thu, 8 Aug 2024 15:44:05 +0300
Subject: [PATCH 03/11] filter: Add random library

---
 src/common/logging/filter.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/common/logging/filter.cpp b/src/common/logging/filter.cpp
index 32576abe..a514652d 100644
--- a/src/common/logging/filter.cpp
+++ b/src/common/logging/filter.cpp
@@ -111,6 +111,7 @@ bool ParseFilterRule(Filter& instance, Iterator begin, Iterator end) {
         SUB(Lib, ErrorDialog) \
         SUB(Lib, ImeDialog) \
         SUB(Lib, AvPlayer) \
+        SUB(Lib, Random) \
         CLS(Frontend) \
         CLS(Render) \
         SUB(Render, Vulkan) \

From 3fd2abdd5b44eb3120d6cbbc1e74f55e3e6e99fc Mon Sep 17 00:00:00 2001
From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com>
Date: Thu, 8 Aug 2024 17:00:08 +0300
Subject: [PATCH 04/11] vk_graphics_pipeline: Fix regression

---
 .../backend/spirv/emit_spirv_context_get_set.cpp         | 8 ++++----
 .../backend/spirv/spirv_emit_context.cpp                 | 2 +-
 src/shader_recompiler/backend/spirv/spirv_emit_context.h | 1 +
 src/video_core/renderer_vulkan/vk_compute_pipeline.cpp   | 2 +-
 src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp  | 5 ++---
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 40d6cdb7..e85272e9 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -129,7 +129,7 @@ Id EmitReadConst(EmitContext& ctx) {
 Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
     auto& buffer = ctx.buffers[handle];
     if (!Sirit::ValidId(buffer.offset)) {
-        buffer.offset = ctx.GetBufferOffset(handle);
+        buffer.offset = ctx.GetBufferOffset(buffer.global_binding);
     }
     const Id offset_dwords{ctx.OpShiftRightLogical(ctx.U32[1], buffer.offset, ctx.ConstU32(2U))};
     index = ctx.OpIAdd(ctx.U32[1], index, offset_dwords);
@@ -230,7 +230,7 @@ template
 static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) {
     auto& buffer = ctx.buffers[handle];
     if (!Sirit::ValidId(buffer.offset)) {
-        buffer.offset = ctx.GetBufferOffset(handle);
+        buffer.offset = ctx.GetBufferOffset(buffer.global_binding);
     }
     address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
@@ -412,7 +412,7 @@ template
 static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     auto& buffer = ctx.buffers[handle];
     if (!Sirit::ValidId(buffer.offset)) {
-        buffer.offset = ctx.GetBufferOffset(handle);
+        buffer.offset = ctx.GetBufferOffset(buffer.global_binding);
     }
     address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
     if constexpr (N == 1) {
@@ -446,7 +446,7 @@ template
 static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id value) {
     auto& buffer = ctx.buffers[handle];
     if (!Sirit::ValidId(buffer.offset)) {
-        buffer.offset = ctx.GetBufferOffset(handle);
+        buffer.offset = ctx.GetBufferOffset(buffer.global_binding);
     }
     address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index cdf417fc..61b55437 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -352,9 +352,9 @@ void EmitContext::DefineBuffers() {
         Decorate(id, spv::Decoration::DescriptorSet, 0U);
         Name(id, fmt::format("{}_{}", buffer.is_storage ? "ssbo" : "cbuf", buffer.sgpr_base));

-        binding++;
         buffers.push_back({
             .id = id,
+            .global_binding = binding++,
             .data_types = data_types,
             .pointer_type = pointer_type,
             .buffer = buffer.GetVsharp(info),
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index ff9ec4b7..0d090eb3 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -203,6 +203,7 @@ public:
     struct BufferDefinition {
         Id id;
         Id offset;
+        u32 global_binding;
         const VectorIds* data_types;
         Id pointer_type;
         AmdGpu::Buffer buffer;
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 8a98e968..21710a76 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -125,7 +125,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
             const u32 adjust = offset - offset_aligned;
             if (adjust != 0) {
                 ASSERT(adjust % 4 == 0);
-                push_data.AddOffset(i, adjust);
+                push_data.AddOffset(binding, adjust);
             }
             buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust);
             set_writes.push_back({
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 91ff999e..5d87a1ca 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -343,7 +343,7 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
         push_data.step0 = regs.vgt_instance_step_rate_0;
         push_data.step1 = regs.vgt_instance_step_rate_1;
     }
-    for (u32 i = 0; const auto& buffer : stage.buffers) {
+    for (const auto& buffer : stage.buffers) {
         const auto vsharp = buffer.GetVsharp(stage);
         if (vsharp) {
             const VAddr address = vsharp.base_address;
@@ -359,7 +359,7 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
             const u32 adjust = offset - offset_aligned;
             if (adjust != 0) {
                 ASSERT(adjust % 4 == 0);
-                push_data.AddOffset(i, adjust);
+                push_data.AddOffset(binding, adjust);
             }
             buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust);
         } else {
@@ -374,7 +374,6 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
                              : vk::DescriptorType::eUniformBuffer,
                 .pBufferInfo = &buffer_infos.back(),
             });
-        i++;
     }

     boost::container::static_vector tsharps;
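The regression fixed above comes down to two different index spaces: `handle` is a buffer's position in the shader's buffer list, while push-constant offsets are keyed by the descriptor binding the buffer was assigned, and the two diverge as soon as other resource types share the binding counter. An illustration of the mismatch (names here are illustrative, not the recompiler's API):

```cpp
#include <cstdint>
#include <vector>

struct BufferDef {
    std::uint32_t global_binding; // binding recorded when the buffer was defined
};

int main() {
    std::uint32_t binding = 0;
    std::vector<BufferDef> buffers;
    buffers.push_back({binding++}); // buffer 0 -> binding 0
    binding += 2;                   // suppose two images claim bindings 1 and 2
    buffers.push_back({binding++}); // buffer 1 -> binding 3, not 1
    // Keying offsets by the loop counter (1) would hit the wrong slot;
    // buffers[1].global_binding (3) is the correct key.
}
```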
From 254b9ffb5095f946b94d537bedd07701f9c9e44e Mon Sep 17 00:00:00 2001
From: Xphalnos <164882787+Xphalnos@users.noreply.github.com>
Date: Thu, 8 Aug 2024 17:19:44 +0200
Subject: [PATCH 05/11] Workflows cleanup + misc fixes (#371)

* Workflows cleanup

* clang-format

* SDL3: Disabling unnecessary options

* Revert CMakeLists.txt changes

---
 .github/workflows/linux-qt.yml   |  5 ++---
 .github/workflows/linux.yml      |  2 --
 .github/workflows/macos-qt.yml   |  5 ++---
 .github/workflows/macos.yml      |  2 --
 .github/workflows/windows-qt.yml |  7 -------
 .github/workflows/windows.yml    | 12 +++---------
 externals/CMakeLists.txt         |  7 +++++++
 src/qt_gui/main_window.cpp       | 24 ++++++++++++------------
 src/qt_gui/main_window_ui.h      |  2 +-
 9 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/linux-qt.yml b/.github/workflows/linux-qt.yml
index 31f8df01..5611ae50 100644
--- a/.github/workflows/linux-qt.yml
+++ b/.github/workflows/linux-qt.yml
@@ -10,7 +10,6 @@ on:
     branches: [ "main" ]

 env:
-  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
   BUILD_TYPE: Release

 jobs:
@@ -19,8 +18,8 @@ jobs:

     steps:
     - uses: actions/checkout@v4
-    - name: Fetch submodules
-      run: git submodule update --init --recursive
+      with:
+        submodules: recursive

     - name: Install misc packages
       run: >
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 7ebb3365..ef77a16c 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -8,10 +8,8 @@ on:
     branches: [ "main" ]
   pull_request:
     branches: [ "main" ]
-  workflow_dispatch:

 env:
-  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
   BUILD_TYPE: Release

 jobs:
diff --git a/.github/workflows/macos-qt.yml b/.github/workflows/macos-qt.yml
index e9a9aa4f..4b3672df 100644
--- a/.github/workflows/macos-qt.yml
+++ b/.github/workflows/macos-qt.yml
@@ -8,10 +8,8 @@ on:
     branches: [ "main" ]
   pull_request:
     branches: [ "main" ]
-  workflow_dispatch:

 env:
-  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
   BUILD_TYPE: Release

 jobs:
@@ -36,10 +34,11 @@ jobs:
     - name: Setup Qt
       uses: jurplel/install-qt-action@v4
       with:
+        version: 6.7.2
         host: mac
         target: desktop
         arch: clang_64
-        version: 6.7.2
+        archives: qtbase

     - name: Configure CMake
       run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_OSX_ARCHITECTURES=x86_64 -DENABLE_QT_GUI=ON
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 910f0484..e46401cb 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -8,10 +8,8 @@ on:
     branches: [ "main" ]
   pull_request:
     branches: [ "main" ]
-  workflow_dispatch:

 env:
-  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
   BUILD_TYPE: Release

 jobs:
diff --git a/.github/workflows/windows-qt.yml b/.github/workflows/windows-qt.yml
index 019a8ab2..06a16eb5 100644
--- a/.github/workflows/windows-qt.yml
+++ b/.github/workflows/windows-qt.yml
@@ -10,12 +10,8 @@ on:
     branches: [ "main" ]

 env:
-  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
   BUILD_TYPE: Release

-permissions:
-  contents: read
-
 jobs:
   build:
     runs-on: windows-latest
@@ -35,12 +31,9 @@ jobs:
         archives: qtbase

     - name: Configure CMake
-      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
-      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
       run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -T ClangCL -DENABLE_QT_GUI=ON

     - name: Build
-      # Build your program with the given configuration
       run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} --parallel

     - name: Deploy
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 01ae3f9c..46dc13a8 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -10,12 +10,8 @@ on:
     branches: [ "main" ]

 env:
-  # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
   BUILD_TYPE: Release

-permissions:
-  contents: read
-
 jobs:
   build:
     runs-on: windows-latest
@@ -25,16 +21,14 @@ jobs:
         submodules: recursive

     - name: Configure CMake
-      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
-      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
       run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -T ClangCL
+
     - name: Build
-      # Build your program with the given configuration
       run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} --parallel
-    - name: Upload a Build Artifact
+
+    - name: Upload executable
       uses: actions/upload-artifact@v4
       with:
         name: shadps4-win64
-        # A file, directory or wildcard pattern that describes what to upload
         path: |
           ${{github.workspace}}/build/Release/shadPS4.exe
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 9ebdd878..6426ef16 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -59,7 +59,14 @@ endif()

 # SDL3
 if (NOT TARGET SDL3::SDL3)
+    set(SDL_DIRECTX OFF)
+    set(SDL_MMX OFF)
+    set(SDL_OPENGL OFF)
+    set(SDL_OPENGLES OFF)
     set(SDL_PIPEWIRE OFF)
+    set(SDL_RENDER_D3D OFF)
+    set(SDL_WASAPI OFF)
+    set(SDL_XINPUT OFF)
     add_subdirectory(sdl3)
 endif()
diff --git a/src/qt_gui/main_window.cpp b/src/qt_gui/main_window.cpp
index b3778be0..0b88a84f 100644
--- a/src/qt_gui/main_window.cpp
+++ b/src/qt_gui/main_window.cpp
@@ -51,8 +51,8 @@ bool MainWindow::Init() {
     this->setStatusBar(statusBar.data());
     // Update status bar
     int numGames = m_game_info->m_games.size();
-    QString statusMessage = "Games: " + QString::number(numGames) + " (" +
-                            QString::number(duration.count()) + "ms). Ready.";
+    QString statusMessage =
+        "Games: " + QString::number(numGames) + " (" + QString::number(duration.count()) + "ms)";
     statusBar->showMessage(statusMessage);
     return true;
 }
@@ -72,8 +72,8 @@ void MainWindow::CreateActions() {
     // create action group for themes
     m_theme_act_group = new QActionGroup(this);
-    m_theme_act_group->addAction(ui->setThemeLight);
     m_theme_act_group->addAction(ui->setThemeDark);
+    m_theme_act_group->addAction(ui->setThemeLight);
     m_theme_act_group->addAction(ui->setThemeGreen);
     m_theme_act_group->addAction(ui->setThemeBlue);
     m_theme_act_group->addAction(ui->setThemeViolet);
@@ -344,14 +344,6 @@ void MainWindow::CreateConnects() {
     });

     // Themes
-    connect(ui->setThemeLight, &QAction::triggered, &m_window_themes, [this]() {
-        m_window_themes.SetWindowTheme(Theme::Light, ui->mw_searchbar);
-        Config::setMainWindowTheme(static_cast(Theme::Light));
-        if (!isIconBlack) {
-            SetUiIcons(true);
-            isIconBlack = true;
-        }
-    });
     connect(ui->setThemeDark, &QAction::triggered, &m_window_themes, [this]() {
         m_window_themes.SetWindowTheme(Theme::Dark, ui->mw_searchbar);
         Config::setMainWindowTheme(static_cast(Theme::Dark));
@@ -360,6 +352,14 @@ void MainWindow::CreateConnects() {
             isIconBlack = false;
         }
     });
+    connect(ui->setThemeLight, &QAction::triggered, &m_window_themes, [this]() {
+        m_window_themes.SetWindowTheme(Theme::Light, ui->mw_searchbar);
+        Config::setMainWindowTheme(static_cast(Theme::Light));
+        if (!isIconBlack) {
+            SetUiIcons(true);
+            isIconBlack = true;
+        }
+    });
     connect(ui->setThemeGreen, &QAction::triggered, &m_window_themes, [this]() {
         m_window_themes.SetWindowTheme(Theme::Green, ui->mw_searchbar);
         Config::setMainWindowTheme(static_cast(Theme::Green));
@@ -415,7 +415,7 @@ void MainWindow::RefreshGameTable() {
     m_game_grid_frame->PopulateGameGrid(m_game_info->m_games, false);
     statusBar->clearMessage();
     int numGames = m_game_info->m_games.size();
-    QString statusMessage = "Games: " + QString::number(numGames) + ". Ready.";
+    QString statusMessage = "Games: " + QString::number(numGames);
     statusBar->showMessage(statusMessage);
 }
diff --git a/src/qt_gui/main_window_ui.h b/src/qt_gui/main_window_ui.h
index 69d71847..06e5cf7f 100644
--- a/src/qt_gui/main_window_ui.h
+++ b/src/qt_gui/main_window_ui.h
@@ -297,7 +297,7 @@ public:
         menuRecent->setTitle(QCoreApplication::translate("MainWindow", "Recent Games", nullptr));
         exitAct->setText(QCoreApplication::translate("MainWindow", "Exit", nullptr));
 #if QT_CONFIG(tooltip)
-        exitAct->setToolTip(QCoreApplication::translate("MainWindow", "Exit Shadps4", nullptr));
+        exitAct->setToolTip(QCoreApplication::translate("MainWindow", "Exit shadPS4", nullptr));
 #endif // QT_CONFIG(tooltip)
 #if QT_CONFIG(statustip)
         exitAct->setStatusTip(

From 351f2e10737b26bc2f82e44f770e051a354dbf8b Mon Sep 17 00:00:00 2001
From: ElBread3 <92335081+ElBread3@users.noreply.github.com>
Date: Thu, 8 Aug 2024 13:23:44 -0500
Subject: [PATCH 06/11] double click to open games

---
 src/qt_gui/main_window.cpp | 58 +++++++++++++++++++++-----------------
 src/qt_gui/main_window.h   |  1 +
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/src/qt_gui/main_window.cpp b/src/qt_gui/main_window.cpp
index 0b88a84f..29d7c15f 100644
--- a/src/qt_gui/main_window.cpp
+++ b/src/qt_gui/main_window.cpp
@@ -179,32 +179,11 @@ void MainWindow::CreateConnects() {
         }
     });

-    connect(ui->playButton, &QPushButton::clicked, this, [this]() {
-        QString gamePath = "";
-        int table_mode = Config::getTableMode();
-        if (table_mode == 0) {
-            if (m_game_list_frame->currentItem()) {
-                int itemID = m_game_list_frame->currentItem()->row();
-                gamePath = QString::fromStdString(m_game_info->m_games[itemID].path + "/eboot.bin");
-            }
-        } else if (table_mode == 1) {
-            if (m_game_grid_frame->cellClicked) {
-                int itemID = (m_game_grid_frame->crtRow * m_game_grid_frame->columnCnt) +
-                             m_game_grid_frame->crtColumn;
-                gamePath = QString::fromStdString(m_game_info->m_games[itemID].path + "/eboot.bin");
-            }
-        } else {
-            if (m_elf_viewer->currentItem()) {
-                int itemID = m_elf_viewer->currentItem()->row();
-                gamePath = QString::fromStdString(m_elf_viewer->m_elf_list[itemID].toStdString());
-            }
-        }
-        if (gamePath != "") {
-            AddRecentFiles(gamePath);
-            Core::Emulator emulator;
-            emulator.Run(gamePath.toUtf8().constData());
-        }
-    });
+    connect(ui->playButton, &QPushButton::clicked, this, &MainWindow::StartGame);
+    connect(m_game_grid_frame.get(), &QTableWidget::cellDoubleClicked, this,
+            &MainWindow::StartGame);
+    connect(m_game_list_frame.get(), &QTableWidget::cellDoubleClicked, this,
+            &MainWindow::StartGame);

     connect(ui->setIconSizeTinyAct, &QAction::triggered, this, [this]() {
         if (isTableList) {
@@ -386,6 +365,33 @@ void MainWindow::CreateConnects() {
     });
 }

+void MainWindow::StartGame() {
+    QString gamePath = "";
+    int table_mode = Config::getTableMode();
+    if (table_mode == 0) {
+        if (m_game_list_frame->currentItem()) {
+            int itemID = m_game_list_frame->currentItem()->row();
+            gamePath = QString::fromStdString(m_game_info->m_games[itemID].path + "/eboot.bin");
+        }
+    } else if (table_mode == 1) {
+        if (m_game_grid_frame->cellClicked) {
+            int itemID = (m_game_grid_frame->crtRow * m_game_grid_frame->columnCnt) +
+                         m_game_grid_frame->crtColumn;
+            gamePath = QString::fromStdString(m_game_info->m_games[itemID].path + "/eboot.bin");
+        }
+    } else {
+        if (m_elf_viewer->currentItem()) {
+            int itemID = m_elf_viewer->currentItem()->row();
+            gamePath = QString::fromStdString(m_elf_viewer->m_elf_list[itemID].toStdString());
+        }
+    }
+    if (gamePath != "") {
+        AddRecentFiles(gamePath);
+        Core::Emulator emulator;
+        emulator.Run(gamePath.toUtf8().constData());
+    }
+}
+
 void MainWindow::SearchGameTable(const QString& text) {
     if (isTableList) {
         for (int row = 0; row < m_game_list_frame->rowCount(); row++) {
diff --git a/src/qt_gui/main_window.h b/src/qt_gui/main_window.h
index d1ef48dc..39a5d049 100644
--- a/src/qt_gui/main_window.h
+++ b/src/qt_gui/main_window.h
@@ -39,6 +39,7 @@ public:
     bool Init();
     void InstallDragDropPkg(std::filesystem::path file, int pkgNum, int nPkg);
     void InstallDirectory();
+    void StartGame();

 private Q_SLOTS:
     void ConfigureGuiFromSettings();
From 7e5cc6162cfa5ff951bee48d440d0834fa8a7461 Mon Sep 17 00:00:00 2001
From: SamuelFontes
Date: Thu, 8 Aug 2024 15:57:43 -0300
Subject: [PATCH 07/11] qt_gui: Refreshing game list after install directory change

The game list wasn't being refreshed automatically after a manual
directory change in the Qt GUI; now RefreshGameTable is called after the
GameInstallDialog is executed.

---
 src/qt_gui/main_window.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/qt_gui/main_window.cpp b/src/qt_gui/main_window.cpp
index 29d7c15f..646433ee 100644
--- a/src/qt_gui/main_window.cpp
+++ b/src/qt_gui/main_window.cpp
@@ -583,6 +583,7 @@ void MainWindow::InstallDragDropPkg(std::filesystem::path file, int pkgNum, int
 void MainWindow::InstallDirectory() {
     GameInstallDialog dlg;
     dlg.exec();
+    RefreshGameTable();
 }

 void MainWindow::SetLastUsedTheme() {

From 564b2f63105ecb4cc1ead7a218084c1ee19ff07d Mon Sep 17 00:00:00 2001
From: SamuelFontes
Date: Thu, 8 Aug 2024 16:14:35 -0300
Subject: [PATCH 08/11] 361: Game directory window appears every time

qt_gui: When a command line argument is passed to the GUI version, it
always prompts to change the game directory. This happens because the
"user" folder is created at the elf or eboot.bin location. This change
ignores the game install directory configuration at startup when a
command line argument is passed: if a game was passed, it should start
automatically, as that is the expected behaviour.
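A condensed sketch of the resulting startup flow (a hypothetical simplification of the change that follows, not the file's actual contents):

```cpp
// Only prompt for a game install directory when launched interactively,
// i.e. when no game path was supplied on the command line.
int main(int argc, char* argv[]) {
    const bool has_command_line_argument = argc > 1;
    if (Config::getGameInstallDir().empty() && !has_command_line_argument) {
        GameInstallDialog dlg; // first-run prompt, GUI-only path
        dlg.exec();
    }
    if (has_command_line_argument) {
        Core::Emulator{}.Run(argv[1]); // boot the given elf/eboot.bin directly
    }
    return 0;
}
```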
---
 src/qt_gui/main.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/qt_gui/main.cpp b/src/qt_gui/main.cpp
index 15a06c86..cff01cc2 100644
--- a/src/qt_gui/main.cpp
+++ b/src/qt_gui/main.cpp
@@ -21,8 +21,11 @@ int main(int argc, char* argv[]) {
     Config::load(user_dir / "config.toml");
     std::filesystem::create_directory(user_dir / "game_data");

+    // Check if elf or eboot.bin path was passed as a command line argument
+    bool has_command_line_argument = argc > 1;
+
     // Check if the game install directory is set
-    if (Config::getGameInstallDir() == "") {
+    if (Config::getGameInstallDir() == "" && !has_command_line_argument) {
         GameInstallDialog dlg;
         dlg.exec();
     }
@@ -35,7 +38,7 @@ int main(int argc, char* argv[]) {
     m_main_window->Init();

     // Check for command line arguments
-    if (argc > 1) {
+    if (has_command_line_argument) {
         Core::Emulator emulator;
         emulator.Run(argv[1]);
     }

From e5087877ae527fedc939791289fe843a57e69ee0 Mon Sep 17 00:00:00 2001
From: georgemoralis
Date: Thu, 8 Aug 2024 22:31:14 +0300
Subject: [PATCH 09/11] revert some sdl switches

---
 externals/CMakeLists.txt | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 6426ef16..9ebdd878 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -59,14 +59,7 @@ endif()

 # SDL3
 if (NOT TARGET SDL3::SDL3)
-    set(SDL_DIRECTX OFF)
-    set(SDL_MMX OFF)
-    set(SDL_OPENGL OFF)
-    set(SDL_OPENGLES OFF)
     set(SDL_PIPEWIRE OFF)
-    set(SDL_RENDER_D3D OFF)
-    set(SDL_WASAPI OFF)
-    set(SDL_XINPUT OFF)
     add_subdirectory(sdl3)
 endif()

From 48c58d5ce0ca430518a5c2fc237d66783a83dc3c Mon Sep 17 00:00:00 2001
From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com>
Date: Thu, 8 Aug 2024 15:42:51 -0500
Subject: [PATCH 10/11] Kernel-Related Fixes (#386)

* Fix OrbisKernelBatchMapEntry struct

UE4 games and GTA V cause the BatchMap offset to overflow on Windows.
Changing the type fixes this, and doesn't seem to cause any regressions
on Windows or Linux.

* Implement posix_sem_trywait

Grand Theft Auto V needs this.

* Add missing scePthreadAttrGetdetachstate NID

Noticed this missing NID while testing games.
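For context on the first fix: with MSVC, `off_t` is a 32-bit `long`, so file/mapping offsets past 2 GiB wrap negative, while `size_t` is 64 bits on Win64 (and `off_t` is typically already 64-bit on Linux, which is why only Windows overflowed). A small demo (hypothetical, not emulator code) that makes the difference visible:

```cpp
#include <cstdio>
#include <sys/types.h> // off_t: long on MSVC, usually 64-bit on Linux

int main() {
    std::printf("sizeof(off_t)=%zu sizeof(size_t)=%zu\n", sizeof(off_t), sizeof(size_t));
    const unsigned long long offset = 3ULL * 1024 * 1024 * 1024; // 3 GiB
    const off_t truncated = static_cast<off_t>(offset); // overflows when off_t is 32-bit
    std::printf("3 GiB as off_t: %lld\n", static_cast<long long>(truncated));
}
```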
---
 src/core/libraries/kernel/memory_management.h   | 2 +-
 src/core/libraries/kernel/thread_management.cpp | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/core/libraries/kernel/memory_management.h b/src/core/libraries/kernel/memory_management.h
index 25a4a9f0..6735ead7 100644
--- a/src/core/libraries/kernel/memory_management.h
+++ b/src/core/libraries/kernel/memory_management.h
@@ -63,7 +63,7 @@ struct OrbisVirtualQueryInfo {

 struct OrbisKernelBatchMapEntry {
     void* start;
-    off_t offset;
+    size_t offset;
     size_t length;
     char protection;
     char type;
diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp
index 48347ea5..6be90cce 100644
--- a/src/core/libraries/kernel/thread_management.cpp
+++ b/src/core/libraries/kernel/thread_management.cpp
@@ -1360,6 +1360,10 @@ int PS4_SYSV_ABI posix_sem_wait(sem_t* sem) {
     return sem_wait(sem);
 }

+int PS4_SYSV_ABI posix_sem_trywait(sem_t* sem) {
+    return sem_trywait(sem);
+}
+
 #ifndef HAVE_SEM_TIMEDWAIT
 int sem_timedwait(sem_t* sem, const struct timespec* abstime) {
     int rc;
@@ -1499,6 +1503,7 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) {
     LIB_FUNCTION("WrOLvHU0yQM", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_setspecific);
     LIB_FUNCTION("4+h9EzwKF4I", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetschedpolicy);
     LIB_FUNCTION("-Wreprtu0Qs", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetdetachstate);
+    LIB_FUNCTION("JaRMy+QcpeU", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrGetdetachstate);
     LIB_FUNCTION("eXbUSpEaTsA", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetinheritsched);
     LIB_FUNCTION("DzES9hQF4f4", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetschedparam);
     LIB_FUNCTION("nsYoNRywwNg", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrInit);
@@ -1611,6 +1616,7 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) {
     LIB_FUNCTION("Xs9hdiD7sAA", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_setschedparam);
     LIB_FUNCTION("pDuPEf3m4fI", "libScePosix", 1, "libkernel", 1, 1, posix_sem_init);
     LIB_FUNCTION("YCV5dGGBcCo", "libScePosix", 1, "libkernel", 1, 1, posix_sem_wait);
+    LIB_FUNCTION("WBWzsRifCEA", "libScePosix", 1, "libkernel", 1, 1, posix_sem_trywait);
     LIB_FUNCTION("w5IHyvahg-o", "libScePosix", 1, "libkernel", 1, 1, posix_sem_timedwait);
     LIB_FUNCTION("IKP8typ0QUk", "libScePosix", 1, "libkernel", 1, 1, posix_sem_post);
     LIB_FUNCTION("cDW233RAwWo", "libScePosix", 1, "libkernel", 1, 1, posix_sem_destroy);
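`sem_trywait` is the non-blocking variant of `sem_wait`: it either decrements the semaphore immediately or fails with `errno == EAGAIN`. A minimal POSIX usage example of the call the shim above forwards to:

```cpp
#include <cerrno>
#include <cstdio>
#include <semaphore.h>

int main() {
    sem_t sem;
    sem_init(&sem, 0, 1); // one available slot
    if (sem_trywait(&sem) == 0) {
        std::puts("acquired without blocking");
    }
    if (sem_trywait(&sem) == -1 && errno == EAGAIN) {
        std::puts("busy: would have blocked, returned immediately instead");
    }
    sem_post(&sem);
    sem_destroy(&sem);
}
```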
From ab56665d4b75e13e99e824dc531b8642d0377632 Mon Sep 17 00:00:00 2001
From: SleepingSnakezzz <71992016+SleepingSnakezzz@users.noreply.github.com>
Date: Thu, 8 Aug 2024 22:43:21 +0200
Subject: [PATCH 11/11] Update latest build instructions.md (#385)

Expands on the existing instructions for more clarity.

---
 documents/Quickstart/Quickstart.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/documents/Quickstart/Quickstart.md b/documents/Quickstart/Quickstart.md
index 29c7ba49..4c51b288 100644
--- a/documents/Quickstart/Quickstart.md
+++ b/documents/Quickstart/Quickstart.md
@@ -37,13 +37,18 @@ SPDX-License-Identifier: GPL-2.0-or-later

 - Windows 10 or Ubuntu 22.04

-## Have the latest WIP version
+## How to run the latest Work-in-Progress builds of ShadPS4

-When you go to Github Release, you have the latest major versions (e.g. v0.0.3), but if you want to have the latest Work-In-Progress version, you can go to Actions on Github to download it (Please note a Github account is required to be able to download).
+1. Go to and make sure you are logged into your GitHub account (important!)
+2. On the left side of the page, select your operating system of choice (the "**qt**" versions have a user interface, which is probably the one you want. The others are SDL versions, which can only be run via command line). ![image](https://github.com/user-attachments/assets/43f01bbf-236c-4d6d-98ac-f5a5badd4ce8)
-
+3. In the workflow list, select the latest entry with a green :white_check_mark: icon in front of it (or the latest entry for whatever pull request you wish to test). ![image](https://github.com/user-attachments/assets/6365f407-867c-44ae-bf00-944f8d84a349)

-After downloading the version suitable for you (Windows or Linux), you must unzip the file and then you can run it. Please note, there are two versions for each platform, a Qt version with user interface and one without (SDL Builds).
+4. At the bottom of this page, select the name of the file, and it should start downloading. (If there is no file here, double-check that you are indeed logged into a GitHub account, and that there is a green :white_check_mark: icon.) ![image](https://github.com/user-attachments/assets/97924500-3911-4f90-ab63-ffae7e52700b)
+
+5. Once downloaded, extract it to its own folder, and run ShadPS4's executable from the extracted folder.
+
+6. Upon first launch, ShadPS4 will prompt you to select a folder to store your installed games in. Select "Browse" and then select a folder that ShadPS4 can use to install your PKG files to.

 ## Install PKG files