commit 96d13fa210
Author: Antonio
Date:   2024-08-08 18:07:58 -04:00
72 changed files with 2824 additions and 1147 deletions

View File

@ -10,7 +10,6 @@ on:
branches: [ "main" ]
env:
# Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
BUILD_TYPE: Release
jobs:
@ -19,8 +18,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Fetch submodules
run: git submodule update --init --recursive
with:
submodules: recursive
- name: Install misc packages
run: >

View File

@ -8,10 +8,8 @@ on:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
workflow_dispatch:
env:
# Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
BUILD_TYPE: Release
jobs:

View File

@ -8,10 +8,8 @@ on:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
workflow_dispatch:
env:
# Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
BUILD_TYPE: Release
jobs:
@ -36,10 +34,11 @@ jobs:
- name: Setup Qt
uses: jurplel/install-qt-action@v4
with:
version: 6.7.2
host: mac
target: desktop
arch: clang_64
version: 6.7.2
archives: qtbase
- name: Configure CMake
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_OSX_ARCHITECTURES=x86_64 -DENABLE_QT_GUI=ON

View File

@ -8,10 +8,8 @@ on:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
workflow_dispatch:
env:
# Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
BUILD_TYPE: Release
jobs:

View File

@ -10,12 +10,8 @@ on:
branches: [ "main" ]
env:
# Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
BUILD_TYPE: Release
permissions:
contents: read
jobs:
build:
runs-on: windows-latest
@ -35,12 +31,9 @@ jobs:
archives: qtbase
- name: Configure CMake
# Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -T ClangCL -DENABLE_QT_GUI=ON
- name: Build
# Build your program with the given configuration
run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} --parallel
- name: Deploy

View File

@ -10,12 +10,8 @@ on:
branches: [ "main" ]
env:
# Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
BUILD_TYPE: Release
permissions:
contents: read
jobs:
build:
runs-on: windows-latest
@ -25,16 +21,14 @@ jobs:
submodules: recursive
- name: Configure CMake
# Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -T ClangCL
- name: Build
# Build your program with the given configuration
run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} --parallel
- name: Upload a Build Artifact
- name: Upload executable
uses: actions/upload-artifact@v4
with:
name: shadps4-win64
# A file, directory or wildcard pattern that describes what to upload
path: |
${{github.workspace}}/build/Release/shadPS4.exe

View File

@ -283,6 +283,7 @@ set(COMMON src/common/logging/backend.cpp
src/common/native_clock.h
src/common/path_util.cpp
src/common/path_util.h
src/common/object_pool.h
src/common/polyfill_thread.h
src/common/rdtsc.cpp
src/common/rdtsc.h
@ -294,6 +295,7 @@ set(COMMON src/common/logging/backend.cpp
src/common/thread.h
src/common/types.h
src/common/uint128.h
src/common/unique_function.h
src/common/version.h
src/common/ntapi.h
src/common/ntapi.cpp
@ -367,7 +369,6 @@ set(CORE src/core/aerolib/stubs.cpp
)
set(SHADER_RECOMPILER src/shader_recompiler/exception.h
src/shader_recompiler/object_pool.h
src/shader_recompiler/profile.h
src/shader_recompiler/recompiler.cpp
src/shader_recompiler/recompiler.h
@ -451,6 +452,13 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
src/video_core/amdgpu/pm4_cmds.h
src/video_core/amdgpu/pm4_opcodes.h
src/video_core/amdgpu/resource.h
src/video_core/buffer_cache/buffer.cpp
src/video_core/buffer_cache/buffer.h
src/video_core/buffer_cache/buffer_cache.cpp
src/video_core/buffer_cache/buffer_cache.h
src/video_core/buffer_cache/memory_tracker_base.h
src/video_core/buffer_cache/range_set.h
src/video_core/buffer_cache/word_manager.h
src/video_core/renderer_vulkan/liverpool_to_vk.cpp
src/video_core/renderer_vulkan/liverpool_to_vk.h
src/video_core/renderer_vulkan/renderer_vulkan.cpp
@ -479,8 +487,6 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
src/video_core/renderer_vulkan/vk_scheduler.h
src/video_core/renderer_vulkan/vk_shader_util.cpp
src/video_core/renderer_vulkan/vk_shader_util.h
src/video_core/renderer_vulkan/vk_stream_buffer.cpp
src/video_core/renderer_vulkan/vk_stream_buffer.h
src/video_core/renderer_vulkan/vk_swapchain.cpp
src/video_core/renderer_vulkan/vk_swapchain.h
src/video_core/texture_cache/image.cpp
@ -496,6 +502,9 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
src/video_core/texture_cache/tile_manager.cpp
src/video_core/texture_cache/tile_manager.h
src/video_core/texture_cache/types.h
src/video_core/page_manager.cpp
src/video_core/page_manager.h
src/video_core/multi_level_page_table.h
src/video_core/renderdoc.cpp
src/video_core/renderdoc.h
)

View File

@ -37,13 +37,18 @@ SPDX-License-Identifier: GPL-2.0-or-later
- Windows 10 or Ubuntu 22.04
## Have the latest WIP version
## How to run the latest Work-in-Progress builds of ShadPS4
When you go to Github Release, you have the latest major versions (e.g. v0.0.3), but if you want to have the latest Work-In-Progress version, you can go to Actions on Github to download it (Please note a Github account is required to be able to download).
1. Go to <https://github.com/shadps4-emu/shadPS4/actions> and make sure you are logged into your GitHub account (important!)
2. On the left side of the page, select your operating system of choice (the "**qt**" versions have a user interface, which is probably the one you want. The others are SDL versions, which can only be run via command line). ![image](https://github.com/user-attachments/assets/43f01bbf-236c-4d6d-98ac-f5a5badd4ce8)
<img src="https://github.com/shadps4-emu/shadPS4/blob/main/documents/Quickstart/1.png" width="800"></a>
3. In the workflow list, select the latest entry with a green :white_check_mark: icon in front of it (or the latest entry for whatever pull request you wish to test). ![image](https://github.com/user-attachments/assets/6365f407-867c-44ae-bf00-944f8d84a349)
After downloading the version suitable for you (Windows or Linux), you must unzip the file and then you can run it. Please note, there are two versions for each platform, a Qt version with user interface and one without (SDL Builds).
4. On the bottom of this page, select the name of the file, and it should start downloading. (If there is no file here, double-check that you are indeed logged into a GitHub account, and that there is a green :white_check_mark: icon.) ![image](https://github.com/user-attachments/assets/97924500-3911-4f90-ab63-ffae7e52700b)
5. Once downloaded, extract to its own folder, and run ShadPS4's executable from the extracted folder.
6. Upon first launch, ShadPS4 will prompt you to select a folder to store your installed games in. Select "Browse" and then select a folder that ShadPS4 can use to install your PKG files to.
## Install PKG files

View File

@ -111,6 +111,7 @@ bool ParseFilterRule(Filter& instance, Iterator begin, Iterator end) {
SUB(Lib, ErrorDialog) \
SUB(Lib, ImeDialog) \
SUB(Lib, AvPlayer) \
SUB(Lib, Random) \
CLS(Frontend) \
CLS(Render) \
SUB(Render, Vulkan) \

View File

@ -8,7 +8,7 @@
#include <utility>
#include <vector>
namespace Shader {
namespace Common {
template <typename T>
requires std::is_destructible_v<T>
@ -104,4 +104,4 @@ private:
size_t new_chunk_size{};
};
} // namespace Shader
} // namespace Common

src/common/unique_function.h (new executable file, 61 lines added)
View File

@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <memory>
#include <utility>
namespace Common {
/// General purpose function wrapper similar to std::function.
/// Unlike std::function, the captured values don't have to be copyable.
/// This class can be moved but not copied.
template <typename ResultType, typename... Args>
class UniqueFunction {
class CallableBase {
public:
virtual ~CallableBase() = default;
virtual ResultType operator()(Args&&...) = 0;
};
template <typename Functor>
class Callable final : public CallableBase {
public:
Callable(Functor&& functor_) : functor{std::move(functor_)} {}
~Callable() override = default;
ResultType operator()(Args&&... args) override {
return functor(std::forward<Args>(args)...);
}
private:
Functor functor;
};
public:
UniqueFunction() = default;
template <typename Functor>
UniqueFunction(Functor&& functor)
: callable{std::make_unique<Callable<Functor>>(std::move(functor))} {}
UniqueFunction& operator=(UniqueFunction&& rhs) noexcept = default;
UniqueFunction(UniqueFunction&& rhs) noexcept = default;
UniqueFunction& operator=(const UniqueFunction&) = delete;
UniqueFunction(const UniqueFunction&) = delete;
ResultType operator()(Args&&... args) const {
return (*callable)(std::forward<Args>(args)...);
}
explicit operator bool() const noexcept {
return static_cast<bool>(callable);
}
private:
std::unique_ptr<CallableBase> callable;
};
} // namespace Common
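
For context, a minimal usage sketch (not part of the commit) showing why a move-only wrapper is needed; the names below are illustrative:

// Sketch: a lambda that captures a std::unique_ptr is move-only, so it cannot
// be stored in std::function, but it can be stored in Common::UniqueFunction.
#include <cstdio>
#include <memory>
#include <utility>

void Example() {
    auto resource = std::make_unique<int>(42);
    Common::UniqueFunction<void> task{[res = std::move(resource)] {
        std::printf("resource = %d\n", *res);
    }};
    if (task) {
        task(); // invokes the stored callable
    }
}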

View File

@ -63,7 +63,7 @@ struct OrbisVirtualQueryInfo {
struct OrbisKernelBatchMapEntry {
void* start;
off_t offset;
size_t offset;
size_t length;
char protection;
char type;

View File

@ -465,7 +465,7 @@ int PS4_SYSV_ABI scePthreadMutexDestroy(ScePthreadMutex* mutex) {
int result = pthread_mutex_destroy(&(*mutex)->pth_mutex);
LOG_INFO(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result);
LOG_DEBUG(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result);
delete *mutex;
*mutex = nullptr;
@ -725,7 +725,10 @@ int PS4_SYSV_ABI scePthreadCondDestroy(ScePthreadCond* cond) {
}
int result = pthread_cond_destroy(&(*cond)->cond);
LOG_INFO(Kernel_Pthread, "scePthreadCondDestroy, result={}", result);
LOG_DEBUG(Kernel_Pthread, "scePthreadCondDestroy, result={}", result);
delete *cond;
*cond = nullptr;
switch (result) {
case 0:
@ -808,8 +811,6 @@ int PS4_SYSV_ABI posix_pthread_cond_timedwait(ScePthreadCond* cond, ScePthreadMu
}
int PS4_SYSV_ABI posix_pthread_cond_broadcast(ScePthreadCond* cond) {
LOG_INFO(Kernel_Pthread,
"posix posix_pthread_cond_broadcast redirect to scePthreadCondBroadcast");
int result = scePthreadCondBroadcast(cond);
if (result != 0) {
int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP
@ -821,7 +822,6 @@ int PS4_SYSV_ABI posix_pthread_cond_broadcast(ScePthreadCond* cond) {
}
int PS4_SYSV_ABI posix_pthread_mutexattr_init(ScePthreadMutexattr* attr) {
// LOG_INFO(Kernel_Pthread, "posix pthread_mutexattr_init redirect to scePthreadMutexattrInit");
int result = scePthreadMutexattrInit(attr);
if (result < 0) {
int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP
@ -833,7 +833,6 @@ int PS4_SYSV_ABI posix_pthread_mutexattr_init(ScePthreadMutexattr* attr) {
}
int PS4_SYSV_ABI posix_pthread_mutexattr_settype(ScePthreadMutexattr* attr, int type) {
// LOG_INFO(Kernel_Pthread, "posix pthread_mutex_init redirect to scePthreadMutexInit");
int result = scePthreadMutexattrSettype(attr, type);
if (result < 0) {
int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP
@ -858,7 +857,6 @@ int PS4_SYSV_ABI posix_pthread_once(pthread_once_t* once_control, void (*init_ro
int PS4_SYSV_ABI posix_pthread_mutexattr_setprotocol(ScePthreadMutexattr* attr, int protocol) {
int result = scePthreadMutexattrSetprotocol(attr, protocol);
LOG_INFO(Kernel_Pthread, "redirect to scePthreadMutexattrSetprotocol: result = {}", result);
if (result < 0) {
UNREACHABLE();
}
@ -1142,7 +1140,7 @@ int PS4_SYSV_ABI scePthreadCondWait(ScePthreadCond* cond, ScePthreadMutex* mutex
}
int result = pthread_cond_wait(&(*cond)->cond, &(*mutex)->pth_mutex);
LOG_INFO(Kernel_Pthread, "scePthreadCondWait, result={}", result);
LOG_DEBUG(Kernel_Pthread, "scePthreadCondWait, result={}", result);
switch (result) {
case 0:
@ -1162,7 +1160,7 @@ int PS4_SYSV_ABI scePthreadCondattrDestroy(ScePthreadCondattr* attr) {
}
int result = pthread_condattr_destroy(&(*attr)->cond_attr);
LOG_INFO(Kernel_Pthread, "scePthreadCondattrDestroy: result = {} ", result);
LOG_DEBUG(Kernel_Pthread, "scePthreadCondattrDestroy: result = {} ", result);
switch (result) {
case 0:
@ -1292,8 +1290,6 @@ int PS4_SYSV_ABI posix_pthread_attr_setdetachstate(ScePthreadAttr* attr, int det
int PS4_SYSV_ABI posix_pthread_create_name_np(ScePthread* thread, const ScePthreadAttr* attr,
PthreadEntryFunc start_routine, void* arg,
const char* name) {
LOG_INFO(Kernel_Pthread, "posix pthread_create redirect to scePthreadCreate: name = {}", name);
int result = scePthreadCreate(thread, attr, start_routine, arg, name);
if (result != 0) {
int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP
@ -1340,17 +1336,11 @@ int PS4_SYSV_ABI posix_pthread_cond_init(ScePthreadCond* cond, const ScePthreadC
int PS4_SYSV_ABI posix_pthread_cond_signal(ScePthreadCond* cond) {
int result = scePthreadCondSignal(cond);
LOG_INFO(Kernel_Pthread,
"posix posix_pthread_cond_signal redirect to scePthreadCondSignal, result = {}",
result);
return result;
}
int PS4_SYSV_ABI posix_pthread_cond_destroy(ScePthreadCond* cond) {
int result = scePthreadCondDestroy(cond);
LOG_INFO(Kernel_Pthread,
"posix posix_pthread_cond_destroy redirect to scePthreadCondDestroy, result = {}",
result);
return result;
}
@ -1370,6 +1360,10 @@ int PS4_SYSV_ABI posix_sem_wait(sem_t* sem) {
return sem_wait(sem);
}
int PS4_SYSV_ABI posix_sem_trywait(sem_t* sem) {
return sem_trywait(sem);
}
#ifndef HAVE_SEM_TIMEDWAIT
int sem_timedwait(sem_t* sem, const struct timespec* abstime) {
int rc;
@ -1509,6 +1503,7 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) {
LIB_FUNCTION("WrOLvHU0yQM", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_setspecific);
LIB_FUNCTION("4+h9EzwKF4I", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetschedpolicy);
LIB_FUNCTION("-Wreprtu0Qs", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetdetachstate);
LIB_FUNCTION("JaRMy+QcpeU", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrGetdetachstate);
LIB_FUNCTION("eXbUSpEaTsA", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetinheritsched);
LIB_FUNCTION("DzES9hQF4f4", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrSetschedparam);
LIB_FUNCTION("nsYoNRywwNg", "libkernel", 1, "libkernel", 1, 1, scePthreadAttrInit);
@ -1621,6 +1616,7 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) {
LIB_FUNCTION("Xs9hdiD7sAA", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_setschedparam);
LIB_FUNCTION("pDuPEf3m4fI", "libScePosix", 1, "libkernel", 1, 1, posix_sem_init);
LIB_FUNCTION("YCV5dGGBcCo", "libScePosix", 1, "libkernel", 1, 1, posix_sem_wait);
LIB_FUNCTION("WBWzsRifCEA", "libScePosix", 1, "libkernel", 1, 1, posix_sem_trywait);
LIB_FUNCTION("w5IHyvahg-o", "libScePosix", 1, "libkernel", 1, 1, posix_sem_timedwait);
LIB_FUNCTION("IKP8typ0QUk", "libScePosix", 1, "libkernel", 1, 1, posix_sem_post);
LIB_FUNCTION("cDW233RAwWo", "libScePosix", 1, "libkernel", 1, 1, posix_sem_destroy);

View File

@ -470,7 +470,7 @@ int PS4_SYSV_ABI scePadSetUserColor() {
}
int PS4_SYSV_ABI scePadSetVibration(s32 handle, const OrbisPadVibrationParam* pParam) {
LOG_ERROR(Lib_Pad, "(STUBBED) called");
LOG_DEBUG(Lib_Pad, "(STUBBED) called");
return ORBIS_OK;
}
@ -665,4 +665,4 @@ void RegisterlibScePad(Core::Loader::SymbolsResolver* sym) {
LIB_FUNCTION("7xA+hFtvBCA", "libScePad", 1, "libScePad", 1, 1, Func_EF103E845B6F0420);
};
} // namespace Libraries::Pad
} // namespace Libraries::Pad

View File

@ -8,11 +8,6 @@
#include "core/libraries/kernel/memory_management.h"
#include "core/memory.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#endif
namespace Core {
@ -177,7 +172,7 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M
if (type == VMAType::Direct) {
new_vma.phys_base = phys_addr;
MapVulkanMemory(mapped_addr, size);
rasterizer->MapMemory(mapped_addr, size);
}
if (type == VMAType::Flexible) {
flexible_usage += size;
@ -227,7 +222,7 @@ void MemoryManager::UnmapMemory(VAddr virtual_addr, size_t size) {
const auto type = it->second.type;
const bool has_backing = type == VMAType::Direct || type == VMAType::File;
if (type == VMAType::Direct) {
UnmapVulkanMemory(virtual_addr, size);
rasterizer->UnmapMemory(virtual_addr, size);
}
if (type == VMAType::Flexible) {
flexible_usage -= size;
@ -377,7 +372,7 @@ int MemoryManager::MTypeProtect(VAddr addr, size_t size, VMAType mtype, int prot
int MemoryManager::VirtualQuery(VAddr addr, int flags,
Libraries::Kernel::OrbisVirtualQueryInfo* info) {
::Libraries::Kernel::OrbisVirtualQueryInfo* info) {
std::scoped_lock lk{mutex};
auto it = FindVMA(addr);
@ -407,7 +402,7 @@ int MemoryManager::VirtualQuery(VAddr addr, int flags,
}
int MemoryManager::DirectMemoryQuery(PAddr addr, bool find_next,
Libraries::Kernel::OrbisQueryInfo* out_info) {
::Libraries::Kernel::OrbisQueryInfo* out_info) {
std::scoped_lock lk{mutex};
auto dmem_area = FindDmemArea(addr);
@ -447,13 +442,6 @@ int MemoryManager::DirectQueryAvailable(PAddr search_start, PAddr search_end, si
return ORBIS_OK;
}
std::pair<vk::Buffer, size_t> MemoryManager::GetVulkanBuffer(VAddr addr) {
auto it = mapped_memories.upper_bound(addr);
it = std::prev(it);
ASSERT(it != mapped_memories.end() && it->first <= addr);
return std::make_pair(*it->second.buffer, addr - it->first);
}
void MemoryManager::NameVirtualRange(VAddr virtual_addr, size_t size, std::string_view name) {
auto it = FindVMA(virtual_addr);
@ -569,85 +557,6 @@ MemoryManager::DMemHandle MemoryManager::Split(DMemHandle dmem_handle, size_t of
return dmem_map.emplace_hint(std::next(dmem_handle), new_area.base, new_area);
};
void MemoryManager::MapVulkanMemory(VAddr addr, size_t size) {
return;
const vk::Device device = instance->GetDevice();
const auto memory_props = instance->GetPhysicalDevice().getMemoryProperties();
void* host_pointer = reinterpret_cast<void*>(addr);
const auto host_mem_props = device.getMemoryHostPointerPropertiesEXT(
vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, host_pointer);
ASSERT(host_mem_props.memoryTypeBits != 0);
int mapped_memory_type = -1;
auto find_mem_type_with_flag = [&](const vk::MemoryPropertyFlags flags) {
u32 host_mem_types = host_mem_props.memoryTypeBits;
while (host_mem_types != 0) {
// Try to find a cached memory type
mapped_memory_type = std::countr_zero(host_mem_types);
host_mem_types -= (1 << mapped_memory_type);
if ((memory_props.memoryTypes[mapped_memory_type].propertyFlags & flags) == flags) {
return;
}
}
mapped_memory_type = -1;
};
// First try to find a memory that is both coherent and cached
find_mem_type_with_flag(vk::MemoryPropertyFlagBits::eHostCoherent |
vk::MemoryPropertyFlagBits::eHostCached);
if (mapped_memory_type == -1)
// Then only coherent (lower performance)
find_mem_type_with_flag(vk::MemoryPropertyFlagBits::eHostCoherent);
if (mapped_memory_type == -1) {
LOG_CRITICAL(Render_Vulkan, "No coherent memory available for memory mapping");
mapped_memory_type = std::countr_zero(host_mem_props.memoryTypeBits);
}
const vk::StructureChain alloc_info = {
vk::MemoryAllocateInfo{
.allocationSize = size,
.memoryTypeIndex = static_cast<uint32_t>(mapped_memory_type),
},
vk::ImportMemoryHostPointerInfoEXT{
.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT,
.pHostPointer = host_pointer,
},
};
const auto [it, new_memory] = mapped_memories.try_emplace(addr);
ASSERT_MSG(new_memory, "Attempting to remap already mapped vulkan memory");
auto& memory = it->second;
memory.backing = device.allocateMemoryUnique(alloc_info.get());
constexpr vk::BufferUsageFlags MapFlags =
vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer |
vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst |
vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer;
const vk::StructureChain buffer_info = {
vk::BufferCreateInfo{
.size = size,
.usage = MapFlags,
.sharingMode = vk::SharingMode::eExclusive,
},
vk::ExternalMemoryBufferCreateInfoKHR{
.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT,
}};
memory.buffer = device.createBufferUnique(buffer_info.get());
device.bindBufferMemory(*memory.buffer, *memory.backing, 0);
}
void MemoryManager::UnmapVulkanMemory(VAddr addr, size_t size) {
return;
const auto it = mapped_memories.find(addr);
ASSERT(it != mapped_memories.end() && it->second.buffer_size == size);
mapped_memories.erase(it);
}
int MemoryManager::GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut,
void** directMemoryStartOut, void** directMemoryEndOut) {
std::scoped_lock lk{mutex};

View File

@ -3,20 +3,17 @@
#pragma once
#include <functional>
#include <map>
#include <mutex>
#include <string_view>
#include <vector>
#include <boost/icl/split_interval_map.hpp>
#include "common/enum.h"
#include "common/singleton.h"
#include "common/types.h"
#include "core/address_space.h"
#include "core/libraries/kernel/memory_management.h"
#include "video_core/renderer_vulkan/vk_common.h"
namespace Vulkan {
class Instance;
class Rasterizer;
}
namespace Libraries::Kernel {
@ -128,8 +125,8 @@ public:
explicit MemoryManager();
~MemoryManager();
void SetInstance(const Vulkan::Instance* instance_) {
instance = instance_;
void SetRasterizer(Vulkan::Rasterizer* rasterizer_) {
rasterizer = rasterizer_;
}
void SetTotalFlexibleSize(u64 size) {
@ -140,9 +137,7 @@ public:
return total_flexible_size - flexible_usage;
}
/// Returns the offset of the mapped virtual system managed memory base from where it usually
/// would be mapped.
[[nodiscard]] VAddr SystemReservedVirtualBase() noexcept {
VAddr SystemReservedVirtualBase() noexcept {
return impl.SystemReservedVirtualBase();
}
@ -176,8 +171,6 @@ public:
int DirectQueryAvailable(PAddr search_start, PAddr search_end, size_t alignment,
PAddr* phys_addr_out, size_t* size_out);
std::pair<vk::Buffer, size_t> GetVulkanBuffer(VAddr addr);
int GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut, void** directMemoryStartOut,
void** directMemoryEndOut);
@ -222,10 +215,6 @@ private:
DMemHandle Split(DMemHandle dmem_handle, size_t offset_in_area);
void MapVulkanMemory(VAddr addr, size_t size);
void UnmapVulkanMemory(VAddr addr, size_t size);
private:
AddressSpace impl;
DMemMap dmem_map;
@ -233,14 +222,7 @@ private:
std::recursive_mutex mutex;
size_t total_flexible_size = 448_MB;
size_t flexible_usage{};
struct MappedMemory {
vk::UniqueBuffer buffer;
vk::UniqueDeviceMemory backing;
size_t buffer_size;
};
std::map<VAddr, MappedMemory> mapped_memories;
const Vulkan::Instance* instance{};
Vulkan::Rasterizer* rasterizer{};
};
using Memory = Common::Singleton<MemoryManager>;

View File

@ -88,6 +88,7 @@ void Module::LoadModuleToMemory(u32& max_tls_index) {
aligned_base_size + TrampolineSize, MemoryProt::CpuReadWrite,
MemoryMapFlags::Fixed, VMAType::Code, name, true);
LoadOffset += CODE_BASE_INCR * (1 + aligned_base_size / CODE_BASE_INCR);
LOG_INFO(Core_Linker, "Loading module {} to {}", name, fmt::ptr(*out_addr));
// Initialize trampoline generator.
void* trampoline_addr = std::bit_cast<void*>(base_virtual_addr + aligned_base_size);

View File

@ -21,8 +21,11 @@ int main(int argc, char* argv[]) {
Config::load(user_dir / "config.toml");
std::filesystem::create_directory(user_dir / "game_data");
// Check if elf or eboot.bin path was passed as a command line argument
bool has_command_line_argument = argc > 1;
// Check if the game install directory is set
if (Config::getGameInstallDir() == "") {
if (Config::getGameInstallDir() == "" && !has_command_line_argument) {
GameInstallDialog dlg;
dlg.exec();
}
@ -35,7 +38,7 @@ int main(int argc, char* argv[]) {
m_main_window->Init();
// Check for command line arguments
if (argc > 1) {
if (has_command_line_argument) {
Core::Emulator emulator;
emulator.Run(argv[1]);
}

View File

@ -51,8 +51,8 @@ bool MainWindow::Init() {
this->setStatusBar(statusBar.data());
// Update status bar
int numGames = m_game_info->m_games.size();
QString statusMessage = "Games: " + QString::number(numGames) + " (" +
QString::number(duration.count()) + "ms). Ready.";
QString statusMessage =
"Games: " + QString::number(numGames) + " (" + QString::number(duration.count()) + "ms)";
statusBar->showMessage(statusMessage);
return true;
}
@ -72,8 +72,8 @@ void MainWindow::CreateActions() {
// create action group for themes
m_theme_act_group = new QActionGroup(this);
m_theme_act_group->addAction(ui->setThemeLight);
m_theme_act_group->addAction(ui->setThemeDark);
m_theme_act_group->addAction(ui->setThemeLight);
m_theme_act_group->addAction(ui->setThemeGreen);
m_theme_act_group->addAction(ui->setThemeBlue);
m_theme_act_group->addAction(ui->setThemeViolet);
@ -179,32 +179,11 @@ void MainWindow::CreateConnects() {
}
});
connect(ui->playButton, &QPushButton::clicked, this, [this]() {
QString gamePath = "";
int table_mode = Config::getTableMode();
if (table_mode == 0) {
if (m_game_list_frame->currentItem()) {
int itemID = m_game_list_frame->currentItem()->row();
gamePath = QString::fromStdString(m_game_info->m_games[itemID].path + "/eboot.bin");
}
} else if (table_mode == 1) {
if (m_game_grid_frame->cellClicked) {
int itemID = (m_game_grid_frame->crtRow * m_game_grid_frame->columnCnt) +
m_game_grid_frame->crtColumn;
gamePath = QString::fromStdString(m_game_info->m_games[itemID].path + "/eboot.bin");
}
} else {
if (m_elf_viewer->currentItem()) {
int itemID = m_elf_viewer->currentItem()->row();
gamePath = QString::fromStdString(m_elf_viewer->m_elf_list[itemID].toStdString());
}
}
if (gamePath != "") {
AddRecentFiles(gamePath);
Core::Emulator emulator;
emulator.Run(gamePath.toUtf8().constData());
}
});
connect(ui->playButton, &QPushButton::clicked, this, &MainWindow::StartGame);
connect(m_game_grid_frame.get(), &QTableWidget::cellDoubleClicked, this,
&MainWindow::StartGame);
connect(m_game_list_frame.get(), &QTableWidget::cellDoubleClicked, this,
&MainWindow::StartGame);
connect(ui->setIconSizeTinyAct, &QAction::triggered, this, [this]() {
if (isTableList) {
@ -344,14 +323,6 @@ void MainWindow::CreateConnects() {
});
// Themes
connect(ui->setThemeLight, &QAction::triggered, &m_window_themes, [this]() {
m_window_themes.SetWindowTheme(Theme::Light, ui->mw_searchbar);
Config::setMainWindowTheme(static_cast<int>(Theme::Light));
if (!isIconBlack) {
SetUiIcons(true);
isIconBlack = true;
}
});
connect(ui->setThemeDark, &QAction::triggered, &m_window_themes, [this]() {
m_window_themes.SetWindowTheme(Theme::Dark, ui->mw_searchbar);
Config::setMainWindowTheme(static_cast<int>(Theme::Dark));
@ -360,6 +331,14 @@ void MainWindow::CreateConnects() {
isIconBlack = false;
}
});
connect(ui->setThemeLight, &QAction::triggered, &m_window_themes, [this]() {
m_window_themes.SetWindowTheme(Theme::Light, ui->mw_searchbar);
Config::setMainWindowTheme(static_cast<int>(Theme::Light));
if (!isIconBlack) {
SetUiIcons(true);
isIconBlack = true;
}
});
connect(ui->setThemeGreen, &QAction::triggered, &m_window_themes, [this]() {
m_window_themes.SetWindowTheme(Theme::Green, ui->mw_searchbar);
Config::setMainWindowTheme(static_cast<int>(Theme::Green));
@ -386,6 +365,33 @@ void MainWindow::CreateConnects() {
});
}
void MainWindow::StartGame() {
QString gamePath = "";
int table_mode = Config::getTableMode();
if (table_mode == 0) {
if (m_game_list_frame->currentItem()) {
int itemID = m_game_list_frame->currentItem()->row();
gamePath = QString::fromStdString(m_game_info->m_games[itemID].path + "/eboot.bin");
}
} else if (table_mode == 1) {
if (m_game_grid_frame->cellClicked) {
int itemID = (m_game_grid_frame->crtRow * m_game_grid_frame->columnCnt) +
m_game_grid_frame->crtColumn;
gamePath = QString::fromStdString(m_game_info->m_games[itemID].path + "/eboot.bin");
}
} else {
if (m_elf_viewer->currentItem()) {
int itemID = m_elf_viewer->currentItem()->row();
gamePath = QString::fromStdString(m_elf_viewer->m_elf_list[itemID].toStdString());
}
}
if (gamePath != "") {
AddRecentFiles(gamePath);
Core::Emulator emulator;
emulator.Run(gamePath.toUtf8().constData());
}
}
void MainWindow::SearchGameTable(const QString& text) {
if (isTableList) {
for (int row = 0; row < m_game_list_frame->rowCount(); row++) {
@ -415,7 +421,7 @@ void MainWindow::RefreshGameTable() {
m_game_grid_frame->PopulateGameGrid(m_game_info->m_games, false);
statusBar->clearMessage();
int numGames = m_game_info->m_games.size();
QString statusMessage = "Games: " + QString::number(numGames) + ". Ready.";
QString statusMessage = "Games: " + QString::number(numGames);
statusBar->showMessage(statusMessage);
}
@ -577,6 +583,7 @@ void MainWindow::InstallDragDropPkg(std::filesystem::path file, int pkgNum, int
void MainWindow::InstallDirectory() {
GameInstallDialog dlg;
dlg.exec();
RefreshGameTable();
}
void MainWindow::SetLastUsedTheme() {

View File

@ -39,6 +39,7 @@ public:
bool Init();
void InstallDragDropPkg(std::filesystem::path file, int pkgNum, int nPkg);
void InstallDirectory();
void StartGame();
private Q_SLOTS:
void ConfigureGuiFromSettings();

View File

@ -297,7 +297,7 @@ public:
menuRecent->setTitle(QCoreApplication::translate("MainWindow", "Recent Games", nullptr));
exitAct->setText(QCoreApplication::translate("MainWindow", "Exit", nullptr));
#if QT_CONFIG(tooltip)
exitAct->setToolTip(QCoreApplication::translate("MainWindow", "Exit Shadps4", nullptr));
exitAct->setToolTip(QCoreApplication::translate("MainWindow", "Exit shadPS4", nullptr));
#endif // QT_CONFIG(tooltip)
#if QT_CONFIG(statustip)
exitAct->setStatusTip(

View File

@ -21,6 +21,7 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) {
case VsOutput::ClipDist7: {
const u32 index = u32(output) - u32(VsOutput::ClipDist0);
const Id clip_num{ctx.ConstU32(index)};
ASSERT_MSG(Sirit::ValidId(ctx.clip_distances), "Clip distance used but not defined");
return ctx.OpAccessChain(ctx.output_f32, ctx.clip_distances, clip_num);
}
case VsOutput::CullDist0:
@ -33,6 +34,7 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) {
case VsOutput::CullDist7: {
const u32 index = u32(output) - u32(VsOutput::CullDist0);
const Id cull_num{ctx.ConstU32(index)};
ASSERT_MSG(Sirit::ValidId(ctx.cull_distances), "Cull distance used but not defined");
return ctx.OpAccessChain(ctx.output_f32, ctx.cull_distances, cull_num);
}
default:
@ -125,7 +127,12 @@ Id EmitReadConst(EmitContext& ctx) {
}
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
const auto& buffer = ctx.buffers[handle];
auto& buffer = ctx.buffers[handle];
if (!Sirit::ValidId(buffer.offset)) {
buffer.offset = ctx.GetBufferOffset(buffer.global_binding);
}
const Id offset_dwords{ctx.OpShiftRightLogical(ctx.U32[1], buffer.offset, ctx.ConstU32(2U))};
index = ctx.OpIAdd(ctx.U32[1], index, offset_dwords);
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
return ctx.OpLoad(buffer.data_types->Get(1), ptr);
}
@ -137,7 +144,7 @@ Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index) {
Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
return ctx.OpLoad(
ctx.U32[1], ctx.OpAccessChain(ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1]),
ctx.instance_step_rates,
ctx.push_data_block,
rate_idx == 0 ? ctx.u32_zero_value : ctx.u32_one_value));
}
@ -221,7 +228,11 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
template <u32 N>
static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) {
const auto& buffer = ctx.buffers[handle];
auto& buffer = ctx.buffers[handle];
if (!Sirit::ValidId(buffer.offset)) {
buffer.offset = ctx.GetBufferOffset(buffer.global_binding);
}
address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
if constexpr (N == 1) {
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
@ -314,7 +325,7 @@ static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offs
}
static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) {
const auto& buffer = ctx.buffers[handle];
auto& buffer = ctx.buffers[handle];
const auto format = buffer.buffer.GetDataFmt();
switch (format) {
case AmdGpu::DataFormat::FormatInvalid:
@ -399,6 +410,11 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com
template <u32 N>
static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
auto& buffer = ctx.buffers[handle];
if (!Sirit::ValidId(buffer.offset)) {
buffer.offset = ctx.GetBufferOffset(buffer.global_binding);
}
address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
if constexpr (N == 1) {
return GetBufferFormatValue(ctx, handle, address, 0);
} else {
@ -428,7 +444,11 @@ Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id ad
template <u32 N>
static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id value) {
const auto& buffer = ctx.buffers[handle];
auto& buffer = ctx.buffers[handle];
if (!Sirit::ValidId(buffer.offset)) {
buffer.offset = ctx.GetBufferOffset(buffer.global_binding);
}
address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
if constexpr (N == 1) {
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};

View File

@ -11,7 +11,7 @@ Id SubgroupScope(EmitContext& ctx) {
}
Id EmitWarpId(EmitContext& ctx) {
return ctx.OpLoad(ctx.U32[1], ctx.subgroup_id);
UNREACHABLE();
}
Id EmitLaneId(EmitContext& ctx) {

View File

@ -46,9 +46,9 @@ EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& bin
stage{program.info.stage}, binding{binding_} {
AddCapability(spv::Capability::Shader);
DefineArithmeticTypes();
DefineInterfaces(program);
DefineBuffers(info);
DefineImagesAndSamplers(info);
DefineInterfaces();
DefineBuffers();
DefineImagesAndSamplers();
DefineSharedMemory();
}
@ -117,9 +117,10 @@ void EmitContext::DefineArithmeticTypes() {
full_result_u32x2 = Name(TypeStruct(U32[1], U32[1]), "full_result_u32x2");
}
void EmitContext::DefineInterfaces(const IR::Program& program) {
DefineInputs(program.info);
DefineOutputs(program.info);
void EmitContext::DefineInterfaces() {
DefinePushDataBlock();
DefineInputs();
DefineOutputs();
}
Id GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) {
@ -164,6 +165,16 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f
throw InvalidArgument("Invalid attribute type {}", fmt);
}
Id EmitContext::GetBufferOffset(u32 binding) {
const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4);
const u32 comp = (binding & 0xf) >> 2;
const u32 offset = (binding & 0x3) << 3;
const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]),
push_data_block, ConstU32(half), ConstU32(comp))};
const Id value{OpLoad(U32[1], ptr)};
return OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U));
}
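
A hedged worked example of the index arithmetic above (illustrative only, not from the commit): each u32 in the push-constant offset arrays holds four 8-bit buffer offsets, so a global binding index splits into member, component, and bit position as follows.

// Sketch: host-side equivalent of the SPIR-V extraction in GetBufferOffset().
// BufOffsetIndex = 2 is an assumption matching the push-constant block defined below.
#include <cstdint>

struct BufferOffsetLocation {
    std::uint32_t member;    // which u32[4] push-constant member
    std::uint32_t component; // which u32 inside that member
    std::uint32_t bit;       // bit position of the 8-bit offset inside the u32
};

constexpr BufferOffsetLocation Locate(std::uint32_t binding) {
    constexpr std::uint32_t BufOffsetIndex = 2; // assumption: members 0/1 hold the step rates
    return {
        .member = BufOffsetIndex + (binding >> 4), // 16 bindings per u32[4] member
        .component = (binding & 0xf) >> 2,         // 4 bindings per u32
        .bit = (binding & 0x3) << 3,               // 8 bits per binding
    };
}

// Example: binding 5 -> member 2, component 1, bits 8..15.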
Id MakeDefaultValue(EmitContext& ctx, u32 default_value) {
switch (default_value) {
case 0:
@ -179,24 +190,13 @@ Id MakeDefaultValue(EmitContext& ctx, u32 default_value) {
}
}
void EmitContext::DefineInputs(const Info& info) {
void EmitContext::DefineInputs() {
switch (stage) {
case Stage::Vertex: {
vertex_index = DefineVariable(U32[1], spv::BuiltIn::VertexIndex, spv::StorageClass::Input);
base_vertex = DefineVariable(U32[1], spv::BuiltIn::BaseVertex, spv::StorageClass::Input);
instance_id = DefineVariable(U32[1], spv::BuiltIn::InstanceIndex, spv::StorageClass::Input);
// Create push constants block for instance steps rates
const Id struct_type{Name(TypeStruct(U32[1], U32[1]), "instance_step_rates")};
Decorate(struct_type, spv::Decoration::Block);
MemberName(struct_type, 0, "sr0");
MemberName(struct_type, 1, "sr1");
MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
MemberDecorate(struct_type, 1, spv::Decoration::Offset, 4U);
instance_step_rates = DefineVar(struct_type, spv::StorageClass::PushConstant);
Name(instance_step_rates, "step_rates");
interfaces.push_back(instance_step_rates);
for (const auto& input : info.vs_inputs) {
const Id type{GetAttributeType(*this, input.fmt)};
if (input.instance_step_rate == Info::VsInput::InstanceIdType::OverStepRate0 ||
@ -225,7 +225,6 @@ void EmitContext::DefineInputs(const Info& info) {
break;
}
case Stage::Fragment:
subgroup_id = DefineVariable(U32[1], spv::BuiltIn::SubgroupId, spv::StorageClass::Input);
subgroup_local_invocation_id = DefineVariable(
U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input);
Decorate(subgroup_local_invocation_id, spv::Decoration::Flat);
@ -261,19 +260,20 @@ void EmitContext::DefineInputs(const Info& info) {
}
}
void EmitContext::DefineOutputs(const Info& info) {
void EmitContext::DefineOutputs() {
switch (stage) {
case Stage::Vertex: {
output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output);
const std::array<Id, 8> zero{f32_zero_value, f32_zero_value, f32_zero_value,
f32_zero_value, f32_zero_value, f32_zero_value,
f32_zero_value, f32_zero_value};
const Id type{TypeArray(F32[1], ConstU32(8U))};
const Id initializer{ConstantComposite(type, zero)};
clip_distances = DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output,
initializer);
cull_distances = DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output,
initializer);
const bool has_extra_pos_stores = info.stores.Get(IR::Attribute::Position1) ||
info.stores.Get(IR::Attribute::Position2) ||
info.stores.Get(IR::Attribute::Position3);
if (has_extra_pos_stores) {
const Id type{TypeArray(F32[1], ConstU32(8U))};
clip_distances =
DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output);
cull_distances =
DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output);
}
for (u32 i = 0; i < IR::NumParams; i++) {
const IR::Attribute param{IR::Attribute::Param0 + i};
if (!info.stores.GetAny(param)) {
@ -305,7 +305,24 @@ void EmitContext::DefineOutputs(const Info& info) {
}
}
void EmitContext::DefineBuffers(const Info& info) {
void EmitContext::DefinePushDataBlock() {
// Create push constants block for instance step rates
const Id struct_type{Name(TypeStruct(U32[1], U32[1], U32[4], U32[4]), "AuxData")};
Decorate(struct_type, spv::Decoration::Block);
MemberName(struct_type, 0, "sr0");
MemberName(struct_type, 1, "sr1");
MemberName(struct_type, 2, "buf_offsets0");
MemberName(struct_type, 3, "buf_offsets1");
MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
MemberDecorate(struct_type, 1, spv::Decoration::Offset, 4U);
MemberDecorate(struct_type, 2, spv::Decoration::Offset, 8U);
MemberDecorate(struct_type, 3, spv::Decoration::Offset, 24U);
push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant);
Name(push_data_block, "push_data");
interfaces.push_back(push_data_block);
}
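
As an illustrative sketch (an assumption, not taken from the commit), the member offsets declared above correspond to a host-side layout along these lines:

// Hypothetical C++ mirror of the "AuxData" push-constant block defined above.
// Field names other than sr0/sr1/buf_offsets0/buf_offsets1 are assumptions.
#include <array>
#include <cstdint>

struct AuxDataSketch {
    std::uint32_t sr0;                         // offset 0: instance step rate 0
    std::uint32_t sr1;                         // offset 4: instance step rate 1
    std::array<std::uint32_t, 4> buf_offsets0; // offset 8: 8-bit offsets for bindings 0..15
    std::array<std::uint32_t, 4> buf_offsets1; // offset 24: 8-bit offsets for bindings 16..31
};
static_assert(sizeof(AuxDataSketch) == 40);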
void EmitContext::DefineBuffers() {
boost::container::small_vector<Id, 8> type_ids;
for (u32 i = 0; const auto& buffer : info.buffers) {
const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32;
@ -323,8 +340,8 @@ void EmitContext::DefineBuffers(const Info& info) {
Decorate(struct_type, spv::Decoration::Block);
MemberName(struct_type, 0, "data");
MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
type_ids.push_back(record_array_type);
}
type_ids.push_back(record_array_type);
const auto storage_class =
buffer.is_storage ? spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform;
@ -335,9 +352,9 @@ void EmitContext::DefineBuffers(const Info& info) {
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, fmt::format("{}_{}", buffer.is_storage ? "ssbo" : "cbuf", buffer.sgpr_base));
binding++;
buffers.push_back({
.id = id,
.global_binding = binding++,
.data_types = data_types,
.pointer_type = pointer_type,
.buffer = buffer.GetVsharp(info),
@ -431,7 +448,7 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) {
throw InvalidArgument("Invalid texture type {}", desc.type);
}
void EmitContext::DefineImagesAndSamplers(const Info& info) {
void EmitContext::DefineImagesAndSamplers() {
for (const auto& image_desc : info.images) {
const VectorIds* data_types = [&] {
switch (image_desc.nfmt) {

View File

@ -40,6 +40,7 @@ public:
~EmitContext();
Id Def(const IR::Value& value);
Id GetBufferOffset(u32 binding);
[[nodiscard]] Id DefineInput(Id type, u32 location) {
const Id input_id{DefineVar(type, spv::StorageClass::Input)};
@ -168,7 +169,7 @@ public:
Id output_position{};
Id vertex_index{};
Id instance_id{};
Id instance_step_rates{};
Id push_data_block{};
Id base_vertex{};
Id frag_coord{};
Id front_facing{};
@ -180,7 +181,6 @@ public:
Id workgroup_id{};
Id local_invocation_id{};
Id subgroup_id{};
Id subgroup_local_invocation_id{};
Id image_u32{};
@ -202,14 +202,16 @@ public:
struct BufferDefinition {
Id id;
Id offset;
u32 global_binding;
const VectorIds* data_types;
Id pointer_type;
AmdGpu::Buffer buffer;
};
u32& binding;
boost::container::small_vector<BufferDefinition, 4> buffers;
boost::container::small_vector<TextureDefinition, 4> images;
boost::container::small_vector<BufferDefinition, 16> buffers;
boost::container::small_vector<TextureDefinition, 8> images;
boost::container::small_vector<Id, 4> samplers;
Id sampler_type{};
@ -228,11 +230,12 @@ public:
private:
void DefineArithmeticTypes();
void DefineInterfaces(const IR::Program& program);
void DefineInputs(const Info& info);
void DefineOutputs(const Info& info);
void DefineBuffers(const Info& info);
void DefineImagesAndSamplers(const Info& info);
void DefineInterfaces();
void DefineInputs();
void DefineOutputs();
void DefinePushDataBlock();
void DefineBuffers();
void DefineImagesAndSamplers();
void DefineSharedMemory();
SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id);

View File

@ -40,7 +40,7 @@ static IR::Condition MakeCondition(Opcode opcode) {
}
}
CFG::CFG(ObjectPool<Block>& block_pool_, std::span<const GcnInst> inst_list_)
CFG::CFG(Common::ObjectPool<Block>& block_pool_, std::span<const GcnInst> inst_list_)
: block_pool{block_pool_}, inst_list{inst_list_} {
index_to_pc.resize(inst_list.size() + 1);
EmitLabels();

View File

@ -8,10 +8,10 @@
#include <boost/container/small_vector.hpp>
#include <boost/intrusive/set.hpp>
#include "common/object_pool.h"
#include "common/types.h"
#include "shader_recompiler/frontend/instruction.h"
#include "shader_recompiler/ir/condition.h"
#include "shader_recompiler/object_pool.h"
namespace Shader::Gcn {
@ -49,7 +49,7 @@ class CFG {
using Label = u32;
public:
explicit CFG(ObjectPool<Block>& block_pool, std::span<const GcnInst> inst_list);
explicit CFG(Common::ObjectPool<Block>& block_pool, std::span<const GcnInst> inst_list);
[[nodiscard]] std::string Dot() const;
@ -59,7 +59,7 @@ private:
void LinkBlocks();
public:
ObjectPool<Block>& block_pool;
Common::ObjectPool<Block>& block_pool;
std::span<const GcnInst> inst_list;
std::vector<u32> index_to_pc;
boost::container::small_vector<Label, 16> labels;

View File

@ -1,10 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
namespace Shader::Gcn {
void Translate();
} // namespace Shader::Gcn

View File

@ -287,7 +287,7 @@ bool NeedsLift(Node goto_stmt, Node label_stmt) noexcept {
*/
class GotoPass {
public:
explicit GotoPass(CFG& cfg, ObjectPool<Statement>& stmt_pool) : pool{stmt_pool} {
explicit GotoPass(CFG& cfg, Common::ObjectPool<Statement>& stmt_pool) : pool{stmt_pool} {
std::vector gotos{BuildTree(cfg)};
const auto end{gotos.rend()};
for (auto goto_stmt = gotos.rbegin(); goto_stmt != end; ++goto_stmt) {
@ -563,7 +563,7 @@ private:
return parent_tree.insert(std::next(loop), *new_goto);
}
ObjectPool<Statement>& pool;
Common::ObjectPool<Statement>& pool;
Statement root_stmt{FunctionTag{}};
};
@ -597,8 +597,9 @@ private:
class TranslatePass {
public:
TranslatePass(ObjectPool<IR::Inst>& inst_pool_, ObjectPool<IR::Block>& block_pool_,
ObjectPool<Statement>& stmt_pool_, Statement& root_stmt,
TranslatePass(Common::ObjectPool<IR::Inst>& inst_pool_,
Common::ObjectPool<IR::Block>& block_pool_,
Common::ObjectPool<Statement>& stmt_pool_, Statement& root_stmt,
IR::AbstractSyntaxList& syntax_list_, std::span<const GcnInst> inst_list_,
Info& info_, const Profile& profile_)
: stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_},
@ -808,9 +809,9 @@ private:
return block_pool.Create(inst_pool);
}
ObjectPool<Statement>& stmt_pool;
ObjectPool<IR::Inst>& inst_pool;
ObjectPool<IR::Block>& block_pool;
Common::ObjectPool<Statement>& stmt_pool;
Common::ObjectPool<IR::Inst>& inst_pool;
Common::ObjectPool<IR::Block>& block_pool;
IR::AbstractSyntaxList& syntax_list;
const Block dummy_flow_block{.is_dummy = true};
std::span<const GcnInst> inst_list;
@ -819,9 +820,10 @@ private:
};
} // Anonymous namespace
IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
CFG& cfg, Info& info, const Profile& profile) {
ObjectPool<Statement> stmt_pool{64};
IR::AbstractSyntaxList BuildASL(Common::ObjectPool<IR::Inst>& inst_pool,
Common::ObjectPool<IR::Block>& block_pool, CFG& cfg, Info& info,
const Profile& profile) {
Common::ObjectPool<Statement> stmt_pool{64};
GotoPass goto_pass{cfg, stmt_pool};
Statement& root{goto_pass.RootStatement()};
IR::AbstractSyntaxList syntax_list;

View File

@ -7,7 +7,6 @@
#include "shader_recompiler/ir/abstract_syntax_list.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/value.h"
#include "shader_recompiler/object_pool.h"
namespace Shader {
struct Info;
@ -16,8 +15,8 @@ struct Profile;
namespace Shader::Gcn {
[[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool,
ObjectPool<IR::Block>& block_pool, CFG& cfg,
[[nodiscard]] IR::AbstractSyntaxList BuildASL(Common::ObjectPool<IR::Inst>& inst_pool,
Common::ObjectPool<IR::Block>& block_pool, CFG& cfg,
Info& info, const Profile& profile);
} // namespace Shader::Gcn

View File

@ -48,7 +48,8 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn
IR::VectorReg dst_reg{inst.dst[0].code};
if (is_pair) {
// Pair loads are either 32 or 64-bit
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
const u32 adj = bit_size == 32 ? 4 : 8;
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0);
if (bit_size == 32) {
ir.SetVectorReg(dst_reg++, IR::U32{data0});
@ -56,7 +57,7 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn
ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)});
ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)});
}
const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1)));
const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1);
if (bit_size == 32) {
ir.SetVectorReg(dst_reg++, IR::U32{data1});
@ -65,11 +66,13 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn
ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)});
}
} else if (bit_size == 64) {
const IR::Value data = ir.LoadShared(bit_size, is_signed, addr);
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0);
ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)});
ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)});
} else {
const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr)};
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)};
ir.SetVectorReg(dst_reg, data);
}
}
@ -79,7 +82,8 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI
const IR::VectorReg data0{inst.src[1].code};
const IR::VectorReg data1{inst.src[2].code};
if (is_pair) {
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
const u32 adj = bit_size == 32 ? 4 : 8;
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
if (bit_size == 32) {
ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
} else {
@ -87,7 +91,7 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI
64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)),
addr0);
}
const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1)));
const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
if (bit_size == 32) {
ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
} else {
@ -96,11 +100,13 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI
addr1);
}
} else if (bit_size == 64) {
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
const IR::Value data =
ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
ir.WriteShared(bit_size, data, addr);
ir.WriteShared(bit_size, data, addr0);
} else {
ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr);
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
}
}
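
A brief sketch of the addressing these hunks implement (illustrative, assuming the DS offset fields count elements rather than bytes):

// Sketch: DS_READ2/DS_WRITE2 offsets are element indices, so they are scaled
// by the element size before being added to the base address.
#include <cstdint>

std::uint32_t PairElementAddress(std::uint32_t base, std::uint32_t offset_field, int bit_size) {
    const std::uint32_t adj = bit_size == 32 ? 4 : 8; // bytes per element
    return base + offset_field * adj;
}

// Example: a 64-bit pair access with offset0 = 0 and offset1 = 1 touches
// base + 0 and base + 8, i.e. two consecutive 64-bit elements.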

View File

@ -447,6 +447,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
.is_instance_data = true,
});
instance_buf_handle = s32(info.buffers.size() - 1);
info.uses_step_rates = true;
}
const u32 num_components = AmdGpu::NumComponents(buffer.GetDataFmt());

View File

@ -125,6 +125,7 @@ public:
void V_ADD_F32(const GcnInst& inst);
void V_CVT_OFF_F32_I4(const GcnInst& inst);
void V_MED3_F32(const GcnInst& inst);
void V_MED3_I32(const GcnInst& inst);
void V_FLOOR_F32(const GcnInst& inst);
void V_SUB_F32(const GcnInst& inst);
void V_RCP_F32(const GcnInst& inst);
@ -159,6 +160,7 @@ public:
void V_SUB_I32(const GcnInst& inst);
void V_LSHR_B32(const GcnInst& inst);
void V_ASHRREV_I32(const GcnInst& inst);
void V_ASHR_I32(const GcnInst& inst);
void V_MAD_U32_U24(const GcnInst& inst);
void V_RNDNE_F32(const GcnInst& inst);
void V_BCNT_U32_B32(const GcnInst& inst);

View File

@ -24,6 +24,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
return V_LSHR_B32(inst);
case Opcode::V_ASHRREV_I32:
return V_ASHRREV_I32(inst);
case Opcode::V_ASHR_I32:
return V_ASHR_I32(inst);
case Opcode::V_LSHRREV_B32:
return V_LSHRREV_B32(inst);
case Opcode::V_NOT_B32:
@ -183,6 +185,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
return V_ADD_F32(inst);
case Opcode::V_MED3_F32:
return V_MED3_F32(inst);
case Opcode::V_MED3_I32:
return V_MED3_I32(inst);
case Opcode::V_FLOOR_F32:
return V_FLOOR_F32(inst);
case Opcode::V_SUB_F32:
@ -479,6 +483,14 @@ void Translator::V_MED3_F32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx));
}
void Translator::V_MED3_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 src2{GetSrc(inst.src[2])};
const IR::U32 mmx = ir.SMin(ir.SMax(src0, src1), src2);
SetDst(inst.dst[0], ir.SMax(ir.SMin(src0, src1), mmx));
}
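
For reference, a small sketch (not from the commit) of the min/max identity the translation above relies on:

// med3(a, b, c) = max(min(a, b), min(max(a, b), c))
#include <algorithm>
#include <cstdint>

std::int32_t Med3(std::int32_t a, std::int32_t b, std::int32_t c) {
    const std::int32_t mmx = std::min(std::max(a, b), c); // mirrors ir.SMin(ir.SMax(src0, src1), src2)
    return std::max(std::min(a, b), mmx);                 // mirrors ir.SMax(ir.SMin(src0, src1), mmx)
}

// Example: Med3(7, -2, 4) -> mmx = min(7, 4) = 4, result = max(-2, 4) = 4,
// the middle of {-2, 4, 7}.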
void Translator::V_FLOOR_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::VectorReg dst_reg{inst.dst[0].code};
@ -760,6 +772,12 @@ void Translator::V_ASHRREV_I32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.ShiftRightArithmetic(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F))));
}
void Translator::V_ASHR_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.ShiftRightArithmetic(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F))));
}
void Translator::V_MAD_U32_U24(const GcnInst& inst) {
V_MAD_I32_I24(inst, false);
}
@ -925,25 +943,12 @@ void Translator::V_FFBL_B32(const GcnInst& inst) {
void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 lane_id = ir.LaneId();
const auto [warp_half, mask_shift] = [&]() -> std::pair<IR::U32, IR::U32> {
if (profile.subgroup_size == 32) {
const IR::U32 warp_half = ir.BitwiseAnd(ir.WarpId(), ir.Imm32(1));
return std::make_pair(warp_half, lane_id);
}
const IR::U32 warp_half = ir.ShiftRightLogical(lane_id, ir.Imm32(5));
const IR::U32 mask_shift = ir.BitwiseAnd(lane_id, ir.Imm32(0x1F));
return std::make_pair(warp_half, mask_shift);
}();
const IR::U32 thread_mask = ir.ISub(ir.ShiftLeftLogical(ir.Imm32(1), mask_shift), ir.Imm32(1));
const IR::U1 is_odd_warp = ir.INotEqual(warp_half, ir.Imm32(0));
const IR::U32 mask = IR::U32{ir.Select(is_odd_warp, is_low ? ir.Imm32(~0U) : thread_mask,
is_low ? thread_mask : ir.Imm32(0))};
const IR::U32 masked_value = ir.BitwiseAnd(src0, mask);
const IR::U32 result = ir.IAdd(src1, ir.BitCount(masked_value));
SetDst(inst.dst[0], result);
if (!is_low) {
ASSERT(src0.IsImmediate() && src0.U32() == ~0U && src1.IsImmediate() && src1.U32() == 0U);
return;
}
ASSERT(src0.IsImmediate() && src0.U32() == ~0U);
SetDst(inst.dst[0], ir.LaneId());
}
} // namespace Shader::Gcn

View File

@ -338,6 +338,11 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma
if (is_typed) {
info.dmft.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
info.nfmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
ASSERT(info.nfmt == AmdGpu::NumberFormat::Float &&
(info.dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
info.dmft == AmdGpu::DataFormat::Format32_32_32 ||
info.dmft == AmdGpu::DataFormat::Format32_32 ||
info.dmft == AmdGpu::DataFormat::Format32));
}
const IR::Value handle =

View File

@ -9,7 +9,7 @@
namespace Shader::IR {
Block::Block(ObjectPool<Inst>& inst_pool_) : inst_pool{&inst_pool_} {}
Block::Block(Common::ObjectPool<Inst>& inst_pool_) : inst_pool{&inst_pool_} {}
Block::~Block() = default;

View File

@ -9,10 +9,10 @@
#include <vector>
#include <boost/intrusive/list.hpp>
#include "common/object_pool.h"
#include "common/types.h"
#include "shader_recompiler/ir/reg.h"
#include "shader_recompiler/ir/value.h"
#include "shader_recompiler/object_pool.h"
namespace Shader::IR {
@ -25,7 +25,7 @@ public:
using reverse_iterator = InstructionList::reverse_iterator;
using const_reverse_iterator = InstructionList::const_reverse_iterator;
explicit Block(ObjectPool<Inst>& inst_pool_);
explicit Block(Common::ObjectPool<Inst>& inst_pool_);
~Block();
Block(const Block&) = delete;
@ -153,7 +153,7 @@ public:
private:
/// Memory pool for instruction list
ObjectPool<Inst>* inst_pool;
Common::ObjectPool<Inst>* inst_pool;
/// List of instructions in this block
InstructionList instructions;

View File

@ -173,10 +173,9 @@ bool IsImageStorageInstruction(const IR::Inst& inst) {
class Descriptors {
public:
explicit Descriptors(BufferResourceList& buffer_resources_, ImageResourceList& image_resources_,
SamplerResourceList& sampler_resources_)
: buffer_resources{buffer_resources_}, image_resources{image_resources_},
sampler_resources{sampler_resources_} {}
explicit Descriptors(Info& info_)
: info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images},
sampler_resources{info_.samplers} {}
u32 Add(const BufferResource& desc) {
const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) {
@ -188,6 +187,7 @@ public:
ASSERT(buffer.length == desc.length);
buffer.is_storage |= desc.is_storage;
buffer.used_types |= desc.used_types;
buffer.is_written |= desc.is_written;
return index;
}
@ -201,9 +201,16 @@ public:
}
u32 Add(const SamplerResource& desc) {
const u32 index{Add(sampler_resources, desc, [&desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset;
const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) {
if (desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset) {
return true;
}
// Samplers with different bindings might still be the same.
const auto old_sharp =
info.ReadUd<AmdGpu::Sampler>(existing.sgpr_base, existing.dword_offset);
const auto new_sharp = info.ReadUd<AmdGpu::Sampler>(desc.sgpr_base, desc.dword_offset);
return old_sharp == new_sharp;
})};
return index;
}
@ -219,6 +226,7 @@ private:
return static_cast<u32>(descriptors.size()) - 1;
}
const Info& info;
BufferResourceList& buffer_resources;
ImageResourceList& image_resources;
SamplerResourceList& sampler_resources;
@@ -328,16 +336,6 @@ static bool IsLoadBufferFormat(const IR::Inst& inst) {
}
}
static bool IsReadConstBuffer(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::ReadConstBuffer:
case IR::Opcode::ReadConstBufferU32:
return true;
default:
return false;
}
}
static u32 BufferLength(const AmdGpu::Buffer& buffer) {
const auto stride = buffer.GetStride();
if (stride < sizeof(f32)) {
@@ -401,30 +399,37 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
IR::Inst* handle = inst.Arg(0).InstRecursive();
IR::Inst* producer = handle->Arg(0).InstRecursive();
const auto sharp = TrackSharp(producer);
const bool is_store = IsBufferStore(inst);
buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
binding = descriptors.Add(BufferResource{
.sgpr_base = sharp.sgpr_base,
.dword_offset = sharp.dword_offset,
.length = BufferLength(buffer),
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
.is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize,
.is_storage = is_store || buffer.GetSize() > MaxUboSize,
.is_written = is_store,
});
}
// Update buffer descriptor format.
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
// Replace handle with binding index in buffer resource list.
inst.SetArg(0, ir.Imm32(binding));
ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
auto& buffer_desc = info.buffers[binding];
if (inst_info.is_typed) {
ASSERT(inst_info.nfmt == AmdGpu::NumberFormat::Float &&
(inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
inst_info.dmft == AmdGpu::DataFormat::Format32_32_32 ||
inst_info.dmft == AmdGpu::DataFormat::Format32_32 ||
inst_info.dmft == AmdGpu::DataFormat::Format32));
buffer_desc.dfmt = inst_info.dmft;
buffer_desc.nfmt = inst_info.nfmt;
} else {
buffer_desc.dfmt = buffer.GetDataFmt();
buffer_desc.nfmt = buffer.GetNumberFmt();
}
if (IsReadConstBuffer(inst)) {
// Replace handle with binding index in buffer resource list.
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
inst.SetArg(0, ir.Imm32(binding));
ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
// The address of constant buffer reads can be calculated at IR emission time.
if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer ||
inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) {
return;
}
@@ -434,10 +439,14 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
}
} else {
const u32 stride = buffer.GetStride();
ASSERT_MSG(stride >= 4, "non-formatting load_buffer_* is not implemented for stride {}",
stride);
if (stride < 4) {
LOG_WARNING(Render_Vulkan,
"non-formatting load_buffer_* is not implemented for stride {}", stride);
}
}
// Compute address of the buffer using the stride.
// Todo: What if buffer is rebound with different stride?
IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
if (inst_info.index_enable) {
const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
@@ -587,39 +596,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
}
void ResourceTrackingPass(IR::Program& program) {
// When loading data from an untyped buffer we don't know if it is float or integer.
// Most of the time it is float so that is the default. This pass detects float buffer loads
// combined with bitcasts and patches them to be integer loads.
for (IR::Block* const block : program.post_order_blocks) {
break;
for (IR::Inst& inst : block->Instructions()) {
if (inst.GetOpcode() != IR::Opcode::BitCastU32F32) {
continue;
}
// Replace the bitcast with a typed buffer read
IR::Inst* const arg_inst{inst.Arg(0).TryInstRecursive()};
if (!arg_inst) {
continue;
}
const auto replace{[&](IR::Opcode new_opcode) {
inst.ReplaceOpcode(new_opcode);
inst.SetArg(0, arg_inst->Arg(0));
inst.SetArg(1, arg_inst->Arg(1));
inst.SetFlags(arg_inst->Flags<u32>());
arg_inst->Invalidate();
}};
if (arg_inst->GetOpcode() == IR::Opcode::ReadConstBuffer) {
replace(IR::Opcode::ReadConstBufferU32);
}
if (arg_inst->GetOpcode() == IR::Opcode::LoadBufferF32) {
replace(IR::Opcode::LoadBufferU32);
}
}
}
// Iterate resource instructions and patch them after finding the sharp.
auto& info = program.info;
Descriptors descriptors{info.buffers, info.images, info.samplers};
Descriptors descriptors{info};
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {
if (IsBufferInstruction(inst)) {


@@ -27,9 +27,9 @@ IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) {
return blocks;
}
IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
std::span<const u32> token, const Info&& info,
const Profile& profile) {
IR::Program TranslateProgram(Common::ObjectPool<IR::Inst>& inst_pool,
Common::ObjectPool<IR::Block>& block_pool, std::span<const u32> token,
const Info&& info, const Profile& profile) {
// Ensure first instruction is expected.
constexpr u32 token_mov_vcchi = 0xBEEB03FF;
ASSERT_MSG(token[0] == token_mov_vcchi, "First instruction is not s_mov_b32 vcc_hi, #imm");
@@ -45,7 +45,7 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
}
// Create control flow graph
ObjectPool<Gcn::Block> gcn_block_pool{64};
Common::ObjectPool<Gcn::Block> gcn_block_pool{64};
Gcn::CFG cfg{gcn_block_pool, program.ins_list};
// Structurize control flow graph and create program.
@@ -61,7 +61,7 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
Shader::Optimization::IdentityRemovalPass(program.blocks);
Shader::Optimization::DeadCodeEliminationPass(program);
Shader::Optimization::CollectShaderInfoPass(program);
LOG_INFO(Render_Vulkan, "{}", Shader::IR::DumpProgram(program));
LOG_DEBUG(Render_Vulkan, "{}", Shader::IR::DumpProgram(program));
return program;
}


@@ -3,16 +3,16 @@
#pragma once
#include "common/object_pool.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/object_pool.h"
namespace Shader {
struct Profile;
[[nodiscard]] IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool,
ObjectPool<IR::Block>& block_pool,
[[nodiscard]] IR::Program TranslateProgram(Common::ObjectPool<IR::Inst>& inst_pool,
Common::ObjectPool<IR::Block>& block_pool,
std::span<const u32> code, const Info&& info,
const Profile& profile);


@@ -77,8 +77,11 @@ struct BufferResource {
u32 length;
IR::Type used_types;
AmdGpu::Buffer inline_cbuf;
bool is_storage{false};
bool is_instance_data{false};
AmdGpu::DataFormat dfmt;
AmdGpu::NumberFormat nfmt;
bool is_storage{};
bool is_instance_data{};
bool is_written{};
constexpr AmdGpu::Buffer GetVsharp(const Info& info) const noexcept;
};
@@ -105,6 +108,19 @@ struct SamplerResource {
};
using SamplerResourceList = boost::container::static_vector<SamplerResource, 16>;
struct PushData {
static constexpr size_t BufOffsetIndex = 2;
u32 step0;
u32 step1;
std::array<u8, 32> buf_offsets;
void AddOffset(u32 binding, u32 offset) {
ASSERT(offset < 64 && binding < 32);
buf_offsets[binding] = offset;
}
};
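// Hedged usage sketch (binding and address values hypothetical): the host records the
// low bits of a buffer address so the shader can re-apply them; per the ASSERT above,
// offsets must stay below 64 and bindings below 32.
//   PushData push{};
//   push.AddOffset(/*binding=*/3, /*offset=*/0x1234'5678 % 64); // stores 56 in buf_offsets[3]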
struct Info {
struct VsInput {
enum InstanceIdType : u8 {
@@ -182,6 +198,7 @@ struct Info {
bool uses_shared_u8{};
bool uses_shared_u16{};
bool uses_fp16{};
bool uses_step_rates{};
bool translation_failed{}; // indicates that shader has unsupported instructions
template <typename T>


@@ -6,7 +6,7 @@
#include <array>
#include <condition_variable>
#include <coroutine>
#include <functional>
#include <exception>
#include <mutex>
#include <span>
#include <thread>
@@ -496,7 +496,7 @@ struct Liverpool {
template <typename T = VAddr>
T Address() const {
return reinterpret_cast<T>((base_addr_lo & ~1U) | u64(base_addr_hi) << 32);
return std::bit_cast<T>((base_addr_lo & ~1U) | u64(base_addr_hi) << 32);
}
};
@@ -1040,7 +1040,11 @@ private:
return {};
}
void unhandled_exception() {
UNREACHABLE();
try {
std::rethrow_exception(std::current_exception());
} catch (const std::exception& e) {
UNREACHABLE_MSG("Unhandled exception: {}", e.what());
}
}
void return_void() {}
struct empty {};


@@ -363,6 +363,10 @@ struct Sampler {
return raw0 != 0 || raw1 != 0;
}
bool operator==(const Sampler& other) const noexcept {
return std::memcmp(this, &other, sizeof(Sampler)) == 0;
}
float LodBias() const noexcept {
return static_cast<float>(static_cast<int16_t>((lod_bias.Value() ^ 0x2000u) - 0x2000u)) /
256.0f;


@@ -0,0 +1,227 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/alignment.h"
#include "common/assert.h"
#include "video_core/buffer_cache/buffer.h"
#include "video_core/renderer_vulkan/liverpool_to_vk.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_platform.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include <vk_mem_alloc.h>
namespace VideoCore {
constexpr vk::BufferUsageFlags AllFlags =
vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst |
vk::BufferUsageFlagBits::eUniformTexelBuffer | vk::BufferUsageFlagBits::eStorageTexelBuffer |
vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer |
vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer;
std::string_view BufferTypeName(MemoryUsage type) {
switch (type) {
case MemoryUsage::Upload:
return "Upload";
case MemoryUsage::Download:
return "Download";
case MemoryUsage::Stream:
return "Stream";
case MemoryUsage::DeviceLocal:
return "DeviceLocal";
default:
return "Invalid";
}
}
[[nodiscard]] VkMemoryPropertyFlags MemoryUsagePreferredVmaFlags(MemoryUsage usage) {
return usage != MemoryUsage::DeviceLocal ? VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
: VkMemoryPropertyFlagBits{};
}
[[nodiscard]] VmaAllocationCreateFlags MemoryUsageVmaFlags(MemoryUsage usage) {
switch (usage) {
case MemoryUsage::Upload:
case MemoryUsage::Stream:
return VMA_ALLOCATION_CREATE_MAPPED_BIT |
VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
case MemoryUsage::Download:
return VMA_ALLOCATION_CREATE_MAPPED_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
case MemoryUsage::DeviceLocal:
return {};
}
return {};
}
[[nodiscard]] VmaMemoryUsage MemoryUsageVma(MemoryUsage usage) {
switch (usage) {
case MemoryUsage::DeviceLocal:
case MemoryUsage::Stream:
return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
case MemoryUsage::Upload:
case MemoryUsage::Download:
return VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
}
return VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
}
UniqueBuffer::UniqueBuffer(vk::Device device_, VmaAllocator allocator_)
: device{device_}, allocator{allocator_} {}
UniqueBuffer::~UniqueBuffer() {
if (buffer) {
vmaDestroyBuffer(allocator, buffer, allocation);
}
}
void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
VmaAllocationInfo* out_alloc_info) {
const VmaAllocationCreateInfo alloc_ci = {
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
.usage = MemoryUsageVma(usage),
.requiredFlags = 0,
.preferredFlags = MemoryUsagePreferredVmaFlags(usage),
.pool = VK_NULL_HANDLE,
.pUserData = nullptr,
};
const VkBufferCreateInfo buffer_ci_unsafe = static_cast<VkBufferCreateInfo>(buffer_ci);
VkBuffer unsafe_buffer{};
VkResult result = vmaCreateBuffer(allocator, &buffer_ci_unsafe, &alloc_ci, &unsafe_buffer,
&allocation, out_alloc_info);
ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}",
vk::to_string(vk::Result{result}));
buffer = vk::Buffer{unsafe_buffer};
}
Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_addr_,
u64 size_bytes_)
: cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, usage{usage_},
buffer{instance->GetDevice(), instance->GetAllocator()} {
// Create buffer object.
const vk::BufferCreateInfo buffer_ci = {
.size = size_bytes,
.usage = AllFlags,
};
VmaAllocationInfo alloc_info{};
buffer.Create(buffer_ci, usage, &alloc_info);
if (instance->HasDebuggingToolAttached()) {
const auto device = instance->GetDevice();
Vulkan::SetObjectName(device, Handle(), "Buffer {:#x} {} KiB", cpu_addr, size_bytes / 1024);
}
// Map it if it is host visible.
VkMemoryPropertyFlags property_flags{};
vmaGetAllocationMemoryProperties(instance->GetAllocator(), buffer.allocation, &property_flags);
if (alloc_info.pMappedData) {
mapped_data = std::span<u8>{std::bit_cast<u8*>(alloc_info.pMappedData), size_bytes};
}
is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
}
vk::BufferView Buffer::View(u32 offset, u32 size, AmdGpu::DataFormat dfmt,
AmdGpu::NumberFormat nfmt) {
const auto it{std::ranges::find_if(views, [offset, size, dfmt, nfmt](const BufferView& view) {
return offset == view.offset && size == view.size && dfmt == view.dfmt && nfmt == view.nfmt;
})};
if (it != views.end()) {
return it->handle;
}
views.push_back({
.offset = offset,
.size = size,
.dfmt = dfmt,
.nfmt = nfmt,
.handle = instance->GetDevice().createBufferView({
.buffer = buffer.buffer,
.format = Vulkan::LiverpoolToVK::SurfaceFormat(dfmt, nfmt),
.offset = offset,
.range = size,
}),
});
return views.back().handle;
}
constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler_,
MemoryUsage usage, u64 size_bytes)
: Buffer{instance, usage, 0, size_bytes}, scheduler{scheduler_} {
ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
const auto device = instance.GetDevice();
if (instance.HasDebuggingToolAttached()) {
Vulkan::SetObjectName(device, Handle(), "StreamBuffer({}): {} KiB", BufferTypeName(usage),
size_bytes / 1024);
}
}
std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
if (!is_coherent && usage == MemoryUsage::Stream) {
size = Common::AlignUp(size, instance->NonCoherentAtomSize());
}
ASSERT(size <= this->size_bytes);
mapped_size = size;
if (alignment > 0) {
offset = Common::AlignUp(offset, alignment);
}
if (offset + size > this->size_bytes) {
// The buffer would overflow; save the number of used watches and reset the state.
invalidation_mark = current_watch_cursor;
current_watch_cursor = 0;
offset = 0;
// Swap watches and reset waiting cursors.
std::swap(previous_watches, current_watches);
wait_cursor = 0;
wait_bound = 0;
}
const u64 mapped_upper_bound = offset + size;
WaitPendingOperations(mapped_upper_bound);
return std::make_pair(mapped_data.data() + offset, offset);
}
void StreamBuffer::Commit() {
if (!is_coherent) {
if (usage == MemoryUsage::Download) {
vmaInvalidateAllocation(instance->GetAllocator(), buffer.allocation, offset,
mapped_size);
} else {
vmaFlushAllocation(instance->GetAllocator(), buffer.allocation, offset, mapped_size);
}
}
offset += mapped_size;
if (current_watch_cursor + 1 >= current_watches.size()) {
// Ensure that there are enough watches.
ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK);
}
auto& watch = current_watches[current_watch_cursor++];
watch.upper_bound = offset;
watch.tick = scheduler.CurrentTick();
}
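// Typical Map/Commit sequence (sketch; Copy() in buffer.h wraps the same pattern):
//   auto [ptr, offset] = stream_buffer.Map(size, alignment);
//   std::memcpy(ptr, src, size);
//   stream_buffer.Commit(); // flushes if non-coherent and records a watch at the new offset
// The returned offset is what gets bound on the GPU side.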
void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {
watches.resize(watches.size() + grow_size);
}
void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
if (!invalidation_mark) {
return;
}
while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
auto& watch = previous_watches[wait_cursor];
wait_bound = watch.upper_bound;
scheduler.Wait(watch.tick);
++wait_cursor;
}
}
} // namespace VideoCore


@@ -0,0 +1,173 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <cstddef>
#include <utility>
#include <vector>
#include "common/types.h"
#include "video_core/amdgpu/resource.h"
#include "video_core/renderer_vulkan/vk_common.h"
namespace Vulkan {
class Instance;
class Scheduler;
} // namespace Vulkan
VK_DEFINE_HANDLE(VmaAllocation)
VK_DEFINE_HANDLE(VmaAllocator)
struct VmaAllocationInfo;
namespace VideoCore {
/// Hints and requirements for the backing memory type of a commit
enum class MemoryUsage {
DeviceLocal, ///< Requests device local buffer.
Upload, ///< Requires a host visible memory type optimized for CPU to GPU uploads
Download, ///< Requires a host visible memory type optimized for GPU to CPU readbacks
Stream, ///< Requests device local host visible buffer, falling back to host memory.
};
struct UniqueBuffer {
explicit UniqueBuffer(vk::Device device, VmaAllocator allocator);
~UniqueBuffer();
UniqueBuffer(const UniqueBuffer&) = delete;
UniqueBuffer& operator=(const UniqueBuffer&) = delete;
UniqueBuffer(UniqueBuffer&& other)
: buffer{std::exchange(other.buffer, VK_NULL_HANDLE)},
allocator{std::exchange(other.allocator, VK_NULL_HANDLE)},
allocation{std::exchange(other.allocation, VK_NULL_HANDLE)} {}
UniqueBuffer& operator=(UniqueBuffer&& other) {
buffer = std::exchange(other.buffer, VK_NULL_HANDLE);
allocator = std::exchange(other.allocator, VK_NULL_HANDLE);
allocation = std::exchange(other.allocation, VK_NULL_HANDLE);
return *this;
}
void Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
VmaAllocationInfo* out_alloc_info);
operator vk::Buffer() const {
return buffer;
}
vk::Device device;
VmaAllocator allocator;
VmaAllocation allocation;
vk::Buffer buffer{};
};
class Buffer {
public:
explicit Buffer(const Vulkan::Instance& instance, MemoryUsage usage, VAddr cpu_addr_,
u64 size_bytes_);
Buffer& operator=(const Buffer&) = delete;
Buffer(const Buffer&) = delete;
Buffer& operator=(Buffer&&) = default;
Buffer(Buffer&&) = default;
vk::BufferView View(u32 offset, u32 size, AmdGpu::DataFormat dfmt, AmdGpu::NumberFormat nfmt);
/// Increases the likeliness of this being a stream buffer
void IncreaseStreamScore(int score) noexcept {
stream_score += score;
}
/// Returns the likeliness of this being a stream buffer
[[nodiscard]] int StreamScore() const noexcept {
return stream_score;
}
/// Returns true when vaddr -> vaddr+size is fully contained in the buffer
[[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept {
return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes();
}
/// Returns the base CPU address of the buffer
[[nodiscard]] VAddr CpuAddr() const noexcept {
return cpu_addr;
}
/// Returns the offset relative to the given CPU address
[[nodiscard]] u32 Offset(VAddr other_cpu_addr) const noexcept {
return static_cast<u32>(other_cpu_addr - cpu_addr);
}
size_t SizeBytes() const {
return size_bytes;
}
vk::Buffer Handle() const noexcept {
return buffer;
}
public:
VAddr cpu_addr = 0;
bool is_picked{};
bool is_coherent{};
int stream_score = 0;
size_t size_bytes = 0;
std::span<u8> mapped_data;
const Vulkan::Instance* instance{};
MemoryUsage usage;
UniqueBuffer buffer;
struct BufferView {
u32 offset;
u32 size;
AmdGpu::DataFormat dfmt;
AmdGpu::NumberFormat nfmt;
vk::BufferView handle;
};
std::vector<BufferView> views;
};
class StreamBuffer : public Buffer {
public:
explicit StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
MemoryUsage usage, u64 size_bytes_);
/// Reserves a region of memory from the stream buffer.
std::pair<u8*, u64> Map(u64 size, u64 alignment = 0);
/// Ensures that reserved bytes of memory are available to the GPU.
void Commit();
/// Maps and commits a memory region with user provided data
u64 Copy(VAddr src, size_t size, size_t alignment = 0) {
const auto [data, offset] = Map(size, alignment);
std::memcpy(data, reinterpret_cast<const void*>(src), size);
Commit();
return offset;
}
private:
struct Watch {
u64 tick{};
u64 upper_bound{};
};
/// Increases the number of available watches.
void ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size);
/// Waits on pending watches up to the requested upper bound.
void WaitPendingOperations(u64 requested_upper_bound);
private:
Vulkan::Scheduler& scheduler;
u64 offset{};
u64 mapped_size{};
std::vector<Watch> current_watches;
std::size_t current_watch_cursor{};
std::optional<size_t> invalidation_mark;
std::vector<Watch> previous_watches;
std::size_t wait_cursor{};
u64 wait_bound{};
};
} // namespace VideoCore


@@ -0,0 +1,497 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include "common/alignment.h"
#include "common/scope_exit.h"
#include "shader_recompiler/runtime_info.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/renderer_vulkan/liverpool_to_vk.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
namespace VideoCore {
static constexpr size_t StagingBufferSize = 256_MB;
static constexpr size_t UboStreamBufferSize = 64_MB;
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
const AmdGpu::Liverpool* liverpool_, PageManager& tracker_)
: instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, tracker{tracker_},
staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
memory_tracker{&tracker} {
// Ensure the first slot is used for the null buffer
void(slot_buffers.insert(instance, MemoryUsage::DeviceLocal, 0, 1));
}
BufferCache::~BufferCache() = default;
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
std::scoped_lock lk{mutex};
const bool is_tracked = IsRegionRegistered(device_addr, size);
if (!is_tracked) {
return;
}
// Mark the page as CPU modified to stop tracking writes.
SCOPE_EXIT {
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
};
if (!memory_tracker.IsRegionGpuModified(device_addr, size)) {
// Page has not been modified by the GPU, nothing to do.
return;
}
}
void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
boost::container::small_vector<vk::BufferCopy, 1> copies;
u64 total_size_bytes = 0;
u64 largest_copy = 0;
memory_tracker.ForEachDownloadRange<true>(
device_addr, size, [&](u64 device_addr_out, u64 range_size) {
const VAddr buffer_addr = buffer.CpuAddr();
const auto add_download = [&](VAddr start, VAddr end, u64) {
const u64 new_offset = start - buffer_addr;
const u64 new_size = end - start;
copies.push_back(vk::BufferCopy{
.srcOffset = new_offset,
.dstOffset = total_size_bytes,
.size = new_size,
});
// Align up to avoid cache conflicts
constexpr u64 align = 64ULL;
constexpr u64 mask = ~(align - 1ULL);
total_size_bytes += (new_size + align - 1) & mask;
largest_copy = std::max(largest_copy, new_size);
};
});
if (total_size_bytes == 0) {
return;
}
const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
for (auto& copy : copies) {
// Modify copies to have the staging offset in mind
copy.dstOffset += offset;
}
staging_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyBuffer(buffer.buffer, staging_buffer.Handle(), copies);
scheduler.Finish();
for (const auto& copy : copies) {
const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - offset;
std::memcpy(std::bit_cast<u8*>(copy_device_addr), staging + dst_offset, copy.size);
}
}
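// Worked example of the 64-byte alignment above: a 100-byte download range advances
// total_size_bytes by (100 + 63) & ~63 == 128, so consecutive copies never share a
// cache line in the staging buffer.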
bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
if (vs_info.vs_inputs.empty()) {
return false;
}
std::array<vk::Buffer, NUM_VERTEX_BUFFERS> host_buffers;
std::array<vk::DeviceSize, NUM_VERTEX_BUFFERS> host_offsets;
boost::container::static_vector<AmdGpu::Buffer, NUM_VERTEX_BUFFERS> guest_buffers;
struct BufferRange {
VAddr base_address;
VAddr end_address;
vk::Buffer vk_buffer;
u64 offset;
size_t GetSize() const {
return end_address - base_address;
}
};
// Calculate buffers memory overlaps
bool has_step_rate = false;
boost::container::static_vector<BufferRange, NUM_VERTEX_BUFFERS> ranges{};
for (const auto& input : vs_info.vs_inputs) {
if (input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate0 ||
input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate1) {
has_step_rate = true;
continue;
}
const auto& buffer = vs_info.ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
if (buffer.GetSize() == 0) {
continue;
}
guest_buffers.emplace_back(buffer);
ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize());
}
std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) {
return lhv.base_address < rhv.base_address;
});
boost::container::static_vector<BufferRange, NUM_VERTEX_BUFFERS> ranges_merged{ranges[0]};
for (auto range : ranges) {
auto& prev_range = ranges_merged.back();
if (prev_range.end_address < range.base_address) {
ranges_merged.emplace_back(range);
} else {
prev_range.end_address = std::max(prev_range.end_address, range.end_address);
}
}
// Map buffers
for (auto& range : ranges_merged) {
const auto [buffer, offset] = ObtainBuffer(range.base_address, range.GetSize(), false);
range.vk_buffer = buffer->buffer;
range.offset = offset;
}
// Bind vertex buffers
const size_t num_buffers = guest_buffers.size();
for (u32 i = 0; i < num_buffers; ++i) {
const auto& buffer = guest_buffers[i];
const auto host_buffer = std::ranges::find_if(ranges_merged, [&](const BufferRange& range) {
return (buffer.base_address >= range.base_address &&
buffer.base_address < range.end_address);
});
ASSERT(host_buffer != ranges_merged.cend());
host_buffers[i] = host_buffer->vk_buffer;
host_offsets[i] = host_buffer->offset + buffer.base_address - host_buffer->base_address;
}
if (num_buffers > 0) {
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindVertexBuffers(0, num_buffers, host_buffers.data(), host_offsets.data());
}
return has_step_rate;
}
u32 BufferCache::BindIndexBuffer(bool& is_indexed, u32 index_offset) {
// Emulate QuadList primitive type with CPU made index buffer.
const auto& regs = liverpool->regs;
if (regs.primitive_type == AmdGpu::Liverpool::PrimitiveType::QuadList) {
is_indexed = true;
// Emit indices.
const u32 index_size = 3 * regs.num_indices;
const auto [data, offset] = stream_buffer.Map(index_size);
Vulkan::LiverpoolToVK::EmitQuadToTriangleListIndices(data, regs.num_indices);
stream_buffer.Commit();
// Bind index buffer.
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindIndexBuffer(stream_buffer.Handle(), offset, vk::IndexType::eUint16);
return index_size / sizeof(u16);
}
if (!is_indexed) {
return regs.num_indices;
}
// Figure out index type and size.
const bool is_index16 =
regs.index_buffer_type.index_type == AmdGpu::Liverpool::IndexType::Index16;
const vk::IndexType index_type = is_index16 ? vk::IndexType::eUint16 : vk::IndexType::eUint32;
const u32 index_size = is_index16 ? sizeof(u16) : sizeof(u32);
VAddr index_address = regs.index_base_address.Address<VAddr>();
index_address += index_offset * index_size;
// Bind index buffer.
const u32 index_buffer_size = regs.num_indices * index_size;
const auto [vk_buffer, offset] = ObtainBuffer(index_address, index_buffer_size, false);
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindIndexBuffer(vk_buffer->Handle(), offset, index_type);
return regs.num_indices;
}
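// Worked example for the QuadList path above (the exact winding is whatever
// EmitQuadToTriangleListIndices produces): regs.num_indices == 8 (two quads) yields
// index_size == 24 bytes, i.e. 12 u16 indices (two triangles per quad) written to the
// stream buffer, and 12 is returned to the caller.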
std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written) {
std::scoped_lock lk{mutex};
static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
if (!is_written && size < StreamThreshold && !is_gpu_dirty) {
// For small uniform buffers that have not been modified by the GPU,
// use the device-local stream buffer to reduce render pass breaks.
const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
return {&stream_buffer, offset};
}
const BufferId buffer_id = FindBuffer(device_addr, size);
Buffer& buffer = slot_buffers[buffer_id];
SynchronizeBuffer(buffer, device_addr, size);
if (is_written) {
memory_tracker.MarkRegionAsGpuModified(device_addr, size);
}
return {&buffer, buffer.Offset(device_addr)};
}
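// Hedged usage sketch (caller and sizes hypothetical): a small, read-only constant
// buffer below CACHING_PAGESIZE takes the stream-buffer fast path, while a written or
// GPU-dirty range is synchronized into a cached Buffer slot.
//   const auto [cbuf, off0] = ObtainBuffer(cbuf_addr, 256, /*is_written=*/false); // stream copy
//   const auto [ssbo, off1] = ObtainBuffer(ssbo_addr, 1_MB, /*is_written=*/true); // cached buffer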
bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
const VAddr end_addr = addr + size;
const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
if (!buffer_id) {
++page;
continue;
}
Buffer& buffer = slot_buffers[buffer_id];
const VAddr buf_start_addr = buffer.CpuAddr();
const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes();
if (buf_start_addr < end_addr && addr < buf_end_addr) {
return true;
}
page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
}
return false;
}
bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
return memory_tracker.IsRegionCpuModified(addr, size);
}
BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
if (device_addr == 0) {
return NULL_BUFFER_ID;
}
const u64 page = device_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page];
if (!buffer_id) {
return CreateBuffer(device_addr, size);
}
const Buffer& buffer = slot_buffers[buffer_id];
if (buffer.IsInBounds(device_addr, size)) {
return buffer_id;
}
return CreateBuffer(device_addr, size);
}
BufferCache::OverlapResult BufferCache::ResolveOverlaps(VAddr device_addr, u32 wanted_size) {
static constexpr int STREAM_LEAP_THRESHOLD = 16;
boost::container::small_vector<BufferId, 16> overlap_ids;
VAddr begin = device_addr;
VAddr end = device_addr + wanted_size;
int stream_score = 0;
bool has_stream_leap = false;
const auto expand_begin = [&](VAddr add_value) {
static constexpr VAddr min_page = CACHING_PAGESIZE + DEVICE_PAGESIZE;
if (add_value > begin - min_page) {
begin = min_page;
device_addr = DEVICE_PAGESIZE;
return;
}
begin -= add_value;
device_addr = begin - CACHING_PAGESIZE;
};
const auto expand_end = [&](VAddr add_value) {
static constexpr VAddr max_page = 1ULL << MemoryTracker::MAX_CPU_PAGE_BITS;
if (add_value > max_page - end) {
end = max_page;
return;
}
end += add_value;
};
if (begin == 0) {
return OverlapResult{
.ids = std::move(overlap_ids),
.begin = begin,
.end = end,
.has_stream_leap = has_stream_leap,
};
}
for (; device_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE);
device_addr += CACHING_PAGESIZE) {
const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS];
if (!overlap_id) {
continue;
}
Buffer& overlap = slot_buffers[overlap_id];
if (overlap.is_picked) {
continue;
}
overlap_ids.push_back(overlap_id);
overlap.is_picked = true;
const VAddr overlap_device_addr = overlap.CpuAddr();
const bool expands_left = overlap_device_addr < begin;
if (expands_left) {
begin = overlap_device_addr;
}
const VAddr overlap_end = overlap_device_addr + overlap.SizeBytes();
const bool expands_right = overlap_end > end;
if (overlap_end > end) {
end = overlap_end;
}
stream_score += overlap.StreamScore();
if (stream_score > STREAM_LEAP_THRESHOLD && !has_stream_leap) {
// When this memory region has been joined a bunch of times, we assume it's being used
// as a stream buffer. Increase the size to skip constantly recreating buffers.
has_stream_leap = true;
if (expands_right) {
expand_begin(CACHING_PAGESIZE * 128);
}
if (expands_left) {
expand_end(CACHING_PAGESIZE * 128);
}
}
}
return OverlapResult{
.ids = std::move(overlap_ids),
.begin = begin,
.end = end,
.has_stream_leap = has_stream_leap,
};
}
void BufferCache::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
bool accumulate_stream_score) {
Buffer& new_buffer = slot_buffers[new_buffer_id];
Buffer& overlap = slot_buffers[overlap_id];
if (accumulate_stream_score) {
new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1);
}
const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
const vk::BufferCopy copy = {
.srcOffset = 0,
.dstOffset = dst_base_offset,
.size = overlap.SizeBytes(),
};
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
static constexpr vk::MemoryBarrier READ_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
.dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
};
static constexpr vk::MemoryBarrier WRITE_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eTransferWrite,
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands,
vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
READ_BARRIER, {}, {});
cmdbuf.copyBuffer(overlap.buffer, new_buffer.buffer, copy);
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
vk::PipelineStageFlagBits::eAllCommands,
vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
DeleteBuffer(overlap_id, true);
}
BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
const VAddr device_addr_end = Common::AlignUp(device_addr + wanted_size, CACHING_PAGESIZE);
device_addr = Common::AlignDown(device_addr, CACHING_PAGESIZE);
wanted_size = static_cast<u32>(device_addr_end - device_addr);
const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size);
const u32 size = static_cast<u32>(overlap.end - overlap.begin);
const BufferId new_buffer_id =
slot_buffers.insert(instance, MemoryUsage::DeviceLocal, overlap.begin, size);
auto& new_buffer = slot_buffers[new_buffer_id];
const size_t size_bytes = new_buffer.SizeBytes();
const auto cmdbuf = scheduler.CommandBuffer();
scheduler.EndRendering();
cmdbuf.fillBuffer(new_buffer.buffer, 0, size_bytes, 0);
for (const BufferId overlap_id : overlap.ids) {
JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
}
Register(new_buffer_id);
return new_buffer_id;
}
void BufferCache::Register(BufferId buffer_id) {
ChangeRegister<true>(buffer_id);
}
void BufferCache::Unregister(BufferId buffer_id) {
ChangeRegister<false>(buffer_id);
}
template <bool insert>
void BufferCache::ChangeRegister(BufferId buffer_id) {
Buffer& buffer = slot_buffers[buffer_id];
const auto size = buffer.SizeBytes();
const VAddr device_addr_begin = buffer.CpuAddr();
const VAddr device_addr_end = device_addr_begin + size;
const u64 page_begin = device_addr_begin / CACHING_PAGESIZE;
const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE);
for (u64 page = page_begin; page != page_end; ++page) {
if constexpr (insert) {
page_table[page] = buffer_id;
} else {
page_table[page] = BufferId{};
}
}
}
bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size) {
boost::container::small_vector<vk::BufferCopy, 4> copies;
u64 total_size_bytes = 0;
u64 largest_copy = 0;
VAddr buffer_start = buffer.CpuAddr();
const auto add_copy = [&](VAddr device_addr_out, u64 range_size) {
copies.push_back(vk::BufferCopy{
.srcOffset = total_size_bytes,
.dstOffset = device_addr_out - buffer_start,
.size = range_size,
});
total_size_bytes += range_size;
largest_copy = std::max(largest_copy, range_size);
};
memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
add_copy(device_addr_out, range_size);
// Prevent uploading to gpu modified regions.
// gpu_modified_ranges.ForEachNotInRange(device_addr_out, range_size, add_copy);
});
if (total_size_bytes == 0) {
return true;
}
vk::Buffer src_buffer = staging_buffer.Handle();
if (total_size_bytes < StagingBufferSize) {
const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
for (auto& copy : copies) {
u8* const src_pointer = staging + copy.srcOffset;
const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
// Apply the staging offset
copy.srcOffset += offset;
}
staging_buffer.Commit();
} else {
// For large one time transfers use a temporary host buffer.
// RenderDoc can lag quite a bit if the stream buffer is too large.
Buffer temp_buffer{instance, MemoryUsage::Upload, 0, total_size_bytes};
src_buffer = temp_buffer.Handle();
u8* const staging = temp_buffer.mapped_data.data();
for (auto& copy : copies) {
u8* const src_pointer = staging + copy.srcOffset;
const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
}
scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {});
}
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
static constexpr vk::MemoryBarrier READ_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
.dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
};
static constexpr vk::MemoryBarrier WRITE_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eTransferWrite,
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands,
vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
READ_BARRIER, {}, {});
cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
vk::PipelineStageFlagBits::eAllCommands,
vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
return false;
}
void BufferCache::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
// Mark the whole buffer as CPU written to stop tracking CPU writes
if (!do_not_mark) {
Buffer& buffer = slot_buffers[buffer_id];
memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes());
}
Unregister(buffer_id);
scheduler.DeferOperation([this, buffer_id] { slot_buffers.erase(buffer_id); });
}
} // namespace VideoCore


@@ -0,0 +1,129 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <array>
#include <mutex>
#include <boost/container/small_vector.hpp>
#include <boost/icl/interval_map.hpp>
#include <tsl/robin_map.h>
#include "common/div_ceil.h"
#include "common/slot_vector.h"
#include "common/types.h"
#include "video_core/buffer_cache/buffer.h"
#include "video_core/buffer_cache/memory_tracker_base.h"
#include "video_core/multi_level_page_table.h"
namespace AmdGpu {
struct Liverpool;
}
namespace Shader {
struct Info;
}
namespace VideoCore {
using BufferId = Common::SlotId;
static constexpr BufferId NULL_BUFFER_ID{0};
static constexpr u32 NUM_VERTEX_BUFFERS = 32;
class BufferCache {
public:
static constexpr u32 CACHING_PAGEBITS = 12;
static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
static constexpr u64 DEVICE_PAGESIZE = 4_KB;
struct Traits {
using Entry = BufferId;
static constexpr size_t AddressSpaceBits = 39;
static constexpr size_t FirstLevelBits = 14;
static constexpr size_t PageBits = CACHING_PAGEBITS;
};
using PageTable = MultiLevelPageTable<Traits>;
struct OverlapResult {
boost::container::small_vector<BufferId, 16> ids;
VAddr begin;
VAddr end;
bool has_stream_leap = false;
};
public:
explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
const AmdGpu::Liverpool* liverpool, PageManager& tracker);
~BufferCache();
/// Invalidates any buffer in the logical page range.
void InvalidateMemory(VAddr device_addr, u64 size);
/// Binds host vertex buffers for the current draw.
bool BindVertexBuffers(const Shader::Info& vs_info);
/// Bind host index buffer for the current draw.
u32 BindIndexBuffer(bool& is_indexed, u32 index_offset);
/// Obtains a buffer for the specified region.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written);
/// Return true when a region is registered on the cache
[[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
/// Return true when a CPU region is modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
private:
template <typename Func>
void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
if (!buffer_id) {
++page;
continue;
}
Buffer& buffer = slot_buffers[buffer_id];
func(buffer_id, buffer);
const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
}
}
void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
[[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
[[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
[[nodiscard]] BufferId CreateBuffer(VAddr device_addr, u32 wanted_size);
void Register(BufferId buffer_id);
void Unregister(BufferId buffer_id);
template <bool insert>
void ChangeRegister(BufferId buffer_id);
bool SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size);
void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false);
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
const AmdGpu::Liverpool* liverpool;
PageManager& tracker;
StreamBuffer staging_buffer;
StreamBuffer stream_buffer;
std::recursive_mutex mutex;
Common::SlotVector<Buffer> slot_buffers;
MemoryTracker memory_tracker;
PageTable page_table;
};
} // namespace VideoCore


@@ -0,0 +1,175 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <algorithm>
#include <deque>
#include <type_traits>
#include <vector>
#include "common/types.h"
#include "video_core/buffer_cache/word_manager.h"
namespace VideoCore {
class MemoryTracker {
public:
static constexpr size_t MAX_CPU_PAGE_BITS = 39;
static constexpr size_t HIGHER_PAGE_BITS = 22;
static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS;
static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL;
static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS);
static constexpr size_t MANAGER_POOL_SIZE = 32;
static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD;
using Manager = WordManager<WORDS_STACK_NEEDED>;
public:
explicit MemoryTracker(PageManager* tracker_) : tracker{tracker_} {}
~MemoryTracker() = default;
/// Returns true if a region has been modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<true>(
query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) {
return manager->template IsRegionModified<Type::CPU>(offset, size);
});
}
/// Returns true if a region has been modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<false>(
query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) {
return manager->template IsRegionModified<Type::GPU>(offset, size);
});
}
/// Mark region as CPU modified, notifying the device_tracker about this change
void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
IteratePages<true>(dirty_cpu_addr, query_size,
[](Manager* manager, u64 offset, size_t size) {
manager->template ChangeRegionState<Type::CPU, true>(
manager->GetCpuAddr() + offset, size);
});
}
/// Unmark region as CPU modified, notifying the device_tracker about this change
void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
IteratePages<true>(dirty_cpu_addr, query_size,
[](Manager* manager, u64 offset, size_t size) {
manager->template ChangeRegionState<Type::CPU, false>(
manager->GetCpuAddr() + offset, size);
});
}
/// Mark region as modified from the host GPU
void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
IteratePages<true>(dirty_cpu_addr, query_size,
[](Manager* manager, u64 offset, size_t size) {
manager->template ChangeRegionState<Type::GPU, true>(
manager->GetCpuAddr() + offset, size);
});
}
/// Unmark region as modified from the host GPU
void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
IteratePages<true>(dirty_cpu_addr, query_size,
[](Manager* manager, u64 offset, size_t size) {
manager->template ChangeRegionState<Type::GPU, false>(
manager->GetCpuAddr() + offset, size);
});
}
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified
template <typename Func>
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
IteratePages<true>(query_cpu_range, query_size,
[&func](Manager* manager, u64 offset, size_t size) {
manager->template ForEachModifiedRange<Type::CPU, true>(
manager->GetCpuAddr() + offset, size, func);
});
}
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <bool clear, typename Func>
void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
IteratePages<false>(query_cpu_range, query_size,
[&func](Manager* manager, u64 offset, size_t size) {
if constexpr (clear) {
manager->template ForEachModifiedRange<Type::GPU, true>(
manager->GetCpuAddr() + offset, size, func);
} else {
manager->template ForEachModifiedRange<Type::GPU, false>(
manager->GetCpuAddr() + offset, size, func);
}
});
}
private:
/**
* @brief IteratePages Iterates the L2 word manager page table.
* @param cpu_address Start byte cpu address
* @param size Size in bytes of the region to iterate.
* @param func Callback for each word manager.
* @return True if the callback returned true and iteration stopped early, false otherwise.
*/
template <bool create_region_on_fail, typename Func>
bool IteratePages(VAddr cpu_address, size_t size, Func&& func) {
using FuncReturn = typename std::invoke_result<Func, Manager*, u64, size_t>::type;
static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
std::size_t remaining_size{size};
std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS};
u64 page_offset{cpu_address & HIGHER_PAGE_MASK};
while (remaining_size > 0) {
const std::size_t copy_amount{
std::min<std::size_t>(HIGHER_PAGE_SIZE - page_offset, remaining_size)};
auto* manager{top_tier[page_index]};
if (manager) {
if constexpr (BOOL_BREAK) {
if (func(manager, page_offset, copy_amount)) {
return true;
}
} else {
func(manager, page_offset, copy_amount);
}
} else if constexpr (create_region_on_fail) {
CreateRegion(page_index);
manager = top_tier[page_index];
if constexpr (BOOL_BREAK) {
if (func(manager, page_offset, copy_amount)) {
return true;
}
} else {
func(manager, page_offset, copy_amount);
}
}
page_index++;
page_offset = 0;
remaining_size -= copy_amount;
}
return false;
}
void CreateRegion(std::size_t page_index) {
const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS;
if (free_managers.empty()) {
manager_pool.emplace_back();
auto& last_pool = manager_pool.back();
for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) {
std::construct_at(&last_pool[i], tracker, 0, HIGHER_PAGE_SIZE);
free_managers.push_back(&last_pool[i]);
}
}
// Each manager tracks a 4_MB virtual address space.
auto* new_manager = free_managers.back();
new_manager->SetCpuAddress(base_cpu_addr);
free_managers.pop_back();
top_tier[page_index] = new_manager;
}
PageManager* tracker;
std::deque<std::array<Manager, MANAGER_POOL_SIZE>> manager_pool;
std::vector<Manager*> free_managers;
std::array<Manager*, NUM_HIGH_PAGES> top_tier{};
};
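// Hedged usage sketch (mirrors what SynchronizeBuffer in buffer_cache.cpp does): gather
// only the CPU-dirty sub-ranges of a region and clear their CPU-modified bits in one pass.
// The names `copies` and `staged_offset` are placeholders for the caller's bookkeeping.
//   memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 range_addr, u64 range_size) {
//       copies.push_back({.srcOffset = staged_offset, .dstOffset = range_addr - device_addr,
//                         .size = range_size});
//   });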
} // namespace VideoCore


@@ -0,0 +1,159 @@
// SPDX-FileCopyrightText: 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <boost/icl/interval_map.hpp>
#include <boost/pool/pool.hpp>
#include <boost/pool/pool_alloc.hpp>
#include <boost/pool/poolfwd.hpp>
#include "common/types.h"
namespace VideoCore {
template <class T>
using RangeSetsAllocator =
boost::fast_pool_allocator<T, boost::default_user_allocator_new_delete,
boost::details::pool::default_mutex, 1024, 2048>;
struct RangeSet {
using IntervalSet =
boost::icl::interval_set<VAddr, std::less,
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalSet::interval_type;
explicit RangeSet() = default;
~RangeSet() = default;
void Add(VAddr base_address, size_t size) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_set.add(interval);
}
void Subtract(VAddr base_address, size_t size) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_set.subtract(interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_set.empty()) {
return;
}
auto it = m_ranges_set.begin();
auto end_it = m_ranges_set.end();
for (; it != end_it; it++) {
const VAddr inter_addr_end = it->upper();
const VAddr inter_addr = it->lower();
func(inter_addr, inter_addr_end);
}
}
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_set.empty()) {
return;
}
const VAddr start_address = base_addr;
const VAddr end_address = start_address + size;
const IntervalType search_interval{start_address, end_address};
auto it = m_ranges_set.lower_bound(search_interval);
if (it == m_ranges_set.end()) {
return;
}
auto end_it = m_ranges_set.upper_bound(search_interval);
for (; it != end_it; it++) {
VAddr inter_addr_end = it->upper();
VAddr inter_addr = it->lower();
if (inter_addr_end > end_address) {
inter_addr_end = end_address;
}
if (inter_addr < start_address) {
inter_addr = start_address;
}
func(inter_addr, inter_addr_end);
}
}
IntervalSet m_ranges_set;
};
class RangeMap {
public:
using IntervalMap =
boost::icl::interval_map<VAddr, u64, boost::icl::partial_absorber, std::less,
boost::icl::inplace_plus, boost::icl::inter_section,
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalMap::interval_type;
public:
RangeMap() = default;
~RangeMap() = default;
RangeMap(RangeMap const&) = delete;
RangeMap& operator=(RangeMap const&) = delete;
RangeMap(RangeMap&& other);
RangeMap& operator=(RangeMap&& other);
void Add(VAddr base_address, size_t size, u64 value) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map.add({interval, value});
}
void Subtract(VAddr base_address, size_t size) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map -= interval;
}
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
const VAddr start_address = base_addr;
const VAddr end_address = start_address + size;
const IntervalType search_interval{start_address, end_address};
auto it = m_ranges_map.lower_bound(search_interval);
if (it == m_ranges_map.end()) {
return;
}
auto end_it = m_ranges_map.upper_bound(search_interval);
for (; it != end_it; it++) {
VAddr inter_addr_end = it->first.upper();
VAddr inter_addr = it->first.lower();
if (inter_addr_end > end_address) {
inter_addr_end = end_address;
}
if (inter_addr < start_address) {
inter_addr = start_address;
}
func(inter_addr, inter_addr_end, it->second);
}
}
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, u64) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
base_addr = range_end;
});
if (base_addr != end_addr) {
func(base_addr, end_addr - base_addr);
}
}
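// Worked example: with a single stored range [0x1000, 0x2000) and a query of
// ForEachNotInRange(0x0000, 0x3000, func), the reported gaps are func(0x0000, 0x1000)
// and func(0x2000, 0x1000).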
private:
IntervalMap m_ranges_map;
};
} // namespace VideoCore


@@ -0,0 +1,398 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <algorithm>
#include <span>
#include <utility>
#include "common/div_ceil.h"
#include "common/types.h"
#include "video_core/page_manager.h"
namespace VideoCore {
constexpr u64 PAGES_PER_WORD = 64;
constexpr u64 BYTES_PER_PAGE = 4_KB;
constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
enum class Type {
CPU,
GPU,
Untracked,
};
/// Vector tracking modified pages tightly packed with small vector optimization
template <size_t stack_words = 1>
struct WordsArray {
/// Returns the pointer to the words state
[[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
return is_short ? stack.data() : heap;
}
/// Returns the pointer to the words state
[[nodiscard]] u64* Pointer(bool is_short) noexcept {
return is_short ? stack.data() : heap;
}
std::array<u64, stack_words> stack{}; ///< Small buffers storage
u64* heap; ///< Pointer to the heap storage for non-small buffers
};
template <size_t stack_words = 1>
struct Words {
explicit Words() = default;
explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
num_words = Common::DivCeil(size_bytes, BYTES_PER_WORD);
if (IsShort()) {
cpu.stack.fill(~u64{0});
gpu.stack.fill(0);
untracked.stack.fill(~u64{0});
} else {
// Share allocation between CPU and GPU pages and set their default values
u64* const alloc = new u64[num_words * 3];
cpu.heap = alloc;
gpu.heap = alloc + num_words;
untracked.heap = alloc + num_words * 2;
std::fill_n(cpu.heap, num_words, ~u64{0});
std::fill_n(gpu.heap, num_words, 0);
std::fill_n(untracked.heap, num_words, ~u64{0});
}
// Clean up trailing bits
const u64 last_word_size = size_bytes % BYTES_PER_WORD;
const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
const u64 last_word = (~u64{0} << shift) >> shift;
cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
}
~Words() {
Release();
}
Words& operator=(Words&& rhs) noexcept {
Release();
size_bytes = rhs.size_bytes;
num_words = rhs.num_words;
cpu = rhs.cpu;
gpu = rhs.gpu;
untracked = rhs.untracked;
rhs.cpu.heap = nullptr;
return *this;
}
Words(Words&& rhs) noexcept
: size_bytes{rhs.size_bytes}, num_words{rhs.num_words}, cpu{rhs.cpu}, gpu{rhs.gpu},
untracked{rhs.untracked} {
rhs.cpu.heap = nullptr;
}
Words& operator=(const Words&) = delete;
Words(const Words&) = delete;
/// Returns true when the buffer fits in the small vector optimization
[[nodiscard]] bool IsShort() const noexcept {
return num_words <= stack_words;
}
/// Returns the number of words of the buffer
[[nodiscard]] size_t NumWords() const noexcept {
return num_words;
}
/// Release buffer resources
void Release() {
if (!IsShort()) {
// The CPU written words are the base of the shared heap allocation
delete[] cpu.heap;
}
}
template <Type type>
std::span<u64> Span() noexcept {
if constexpr (type == Type::CPU) {
return std::span<u64>(cpu.Pointer(IsShort()), num_words);
} else if constexpr (type == Type::GPU) {
return std::span<u64>(gpu.Pointer(IsShort()), num_words);
} else if constexpr (type == Type::Untracked) {
return std::span<u64>(untracked.Pointer(IsShort()), num_words);
}
}
template <Type type>
std::span<const u64> Span() const noexcept {
if constexpr (type == Type::CPU) {
return std::span<const u64>(cpu.Pointer(IsShort()), num_words);
} else if constexpr (type == Type::GPU) {
return std::span<const u64>(gpu.Pointer(IsShort()), num_words);
} else if constexpr (type == Type::Untracked) {
return std::span<const u64>(untracked.Pointer(IsShort()), num_words);
}
}
u64 size_bytes = 0;
size_t num_words = 0;
WordsArray<stack_words> cpu;
WordsArray<stack_words> gpu;
WordsArray<stack_words> untracked;
};
template <size_t stack_words = 1>
class WordManager {
public:
explicit WordManager(PageManager* tracker_, VAddr cpu_addr_, u64 size_bytes)
: tracker{tracker_}, cpu_addr{cpu_addr_}, words{size_bytes} {}
explicit WordManager() = default;
void SetCpuAddress(VAddr new_cpu_addr) {
cpu_addr = new_cpu_addr;
}
VAddr GetCpuAddr() const {
return cpu_addr;
}
static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) {
constexpr size_t number_bits = sizeof(u64) * 8;
const size_t limit_page_end = number_bits - std::min(page_end, number_bits);
u64 bits = (word >> page_start) << page_start;
bits = (bits << limit_page_end) >> limit_page_end;
return bits;
}
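// Worked example: ExtractBits(~0ULL, 3, 10) keeps page bits [3, 10) and returns
// 0b11'1111'1000 (0x3F8); bits below page_start and at or above page_end are cleared.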
static std::pair<size_t, size_t> GetWordPage(VAddr address) {
const size_t converted_address = static_cast<size_t>(address);
const size_t word_number = converted_address / BYTES_PER_WORD;
const size_t amount_pages = converted_address % BYTES_PER_WORD;
return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE);
}
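// Worked example: with 4 KiB pages and 64 pages per word (256 KiB per word),
// GetWordPage(0x41234) == {1, 1}: the address lies in word 1, page 1 of that word.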
template <typename Func>
void IterateWords(size_t offset, size_t size, Func&& func) const {
using FuncReturn = std::invoke_result_t<Func, std::size_t, u64>;
static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
const size_t start = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset), 0LL));
const size_t end = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset + size), 0LL));
if (start >= SizeBytes() || end <= start) {
return;
}
auto [start_word, start_page] = GetWordPage(start);
auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL);
const size_t num_words = NumWords();
start_word = std::min(start_word, num_words);
end_word = std::min(end_word, num_words);
const size_t diff = end_word - start_word;
end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD;
end_word = std::min(end_word, num_words);
end_page += diff * PAGES_PER_WORD;
constexpr u64 base_mask{~0ULL};
for (size_t word_index = start_word; word_index < end_word; word_index++) {
const u64 mask = ExtractBits(base_mask, start_page, end_page);
start_page = 0;
end_page -= PAGES_PER_WORD;
if constexpr (BOOL_BREAK) {
if (func(word_index, mask)) {
return;
}
} else {
func(word_index, mask);
}
}
}
template <typename Func>
void IteratePages(u64 mask, Func&& func) const {
size_t offset = 0;
while (mask != 0) {
const size_t empty_bits = std::countr_zero(mask);
offset += empty_bits;
mask = mask >> empty_bits;
const size_t continuous_bits = std::countr_one(mask);
func(offset, continuous_bits);
mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0;
offset += continuous_bits;
}
}
/**
* Change the state of a range of pages
*
* @param dirty_addr Base address to mark or unmark as modified
* @param size Size in bytes to mark or unmark as modified
*/
template <Type type, bool enable>
void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) {
std::span<u64> state_words = words.template Span<type>();
[[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>();
IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) {
if constexpr (type == Type::CPU) {
NotifyPageTracker<!enable>(index, untracked_words[index], mask);
}
if constexpr (enable) {
state_words[index] |= mask;
if constexpr (type == Type::CPU) {
untracked_words[index] |= mask;
}
} else {
state_words[index] &= ~mask;
if constexpr (type == Type::CPU) {
untracked_words[index] &= ~mask;
}
}
});
}
/**
* Loop over each page in the given range, turn off those bits and notify the tracker if
* needed. Call the given function on each turned off range.
*
* @param query_cpu_range Base CPU address to loop over
* @param size Size in bytes of the CPU range to loop over
* @param func Function to call for each turned off region
*/
template <Type type, bool clear, typename Func>
void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
static_assert(type != Type::Untracked);
std::span<u64> state_words = words.template Span<type>();
[[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>();
const size_t offset = query_cpu_range - cpu_addr;
bool pending = false;
size_t pending_offset{};
size_t pending_pointer{};
const auto release = [&]() {
func(cpu_addr + pending_offset * BYTES_PER_PAGE,
(pending_pointer - pending_offset) * BYTES_PER_PAGE);
};
IterateWords(offset, size, [&](size_t index, u64 mask) {
if constexpr (type == Type::GPU) {
mask &= ~untracked_words[index];
}
const u64 word = state_words[index] & mask;
if constexpr (clear) {
if constexpr (type == Type::CPU) {
NotifyPageTracker<true>(index, untracked_words[index], mask);
}
state_words[index] &= ~mask;
if constexpr (type == Type::CPU) {
untracked_words[index] &= ~mask;
}
}
const size_t base_offset = index * PAGES_PER_WORD;
IteratePages(word, [&](size_t pages_offset, size_t pages_size) {
const auto reset = [&]() {
pending_offset = base_offset + pages_offset;
pending_pointer = base_offset + pages_offset + pages_size;
};
if (!pending) {
reset();
pending = true;
return;
}
if (pending_pointer == base_offset + pages_offset) {
pending_pointer += pages_size;
return;
}
release();
reset();
});
});
if (pending) {
release();
}
}
/**
* Returns true when a region has been modified
*
* @param offset Offset in bytes from the start of the buffer
* @param size Size in bytes of the region to query for modifications
*/
template <Type type>
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
static_assert(type != Type::Untracked);
const std::span<const u64> state_words = words.template Span<type>();
[[maybe_unused]] const std::span<const u64> untracked_words =
words.template Span<Type::Untracked>();
bool result = false;
IterateWords(offset, size, [&](size_t index, u64 mask) {
if constexpr (type == Type::GPU) {
mask &= ~untracked_words[index];
}
const u64 word = state_words[index] & mask;
if (word != 0) {
result = true;
return true;
}
return false;
});
return result;
}
/// Returns the number of words of the manager
[[nodiscard]] size_t NumWords() const noexcept {
return words.NumWords();
}
/// Returns the size in bytes of the manager
[[nodiscard]] u64 SizeBytes() const noexcept {
return words.size_bytes;
}
/// Returns true when the buffer fits in the small vector optimization
[[nodiscard]] bool IsShort() const noexcept {
return words.IsShort();
}
private:
template <Type type>
u64* Array() noexcept {
if constexpr (type == Type::CPU) {
return words.cpu.Pointer(IsShort());
} else if constexpr (type == Type::GPU) {
return words.gpu.Pointer(IsShort());
} else if constexpr (type == Type::Untracked) {
return words.untracked.Pointer(IsShort());
}
}
template <Type type>
const u64* Array() const noexcept {
if constexpr (type == Type::CPU) {
return words.cpu.Pointer(IsShort());
} else if constexpr (type == Type::GPU) {
return words.gpu.Pointer(IsShort());
} else if constexpr (type == Type::Untracked) {
return words.untracked.Pointer(IsShort());
}
}
/**
* Notify tracker about changes in the CPU tracking state of a word in the buffer
*
* @param word_index Index to the word to notify to the tracker
* @param current_bits Current state of the word
* @param new_bits New state of the word
*
* @tparam add_to_tracker True when the tracker should start tracking the new pages
*/
template <bool add_to_tracker>
void NotifyPageTracker(u64 word_index, u64 current_bits, u64 new_bits) const {
u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
IteratePages(changed_bits, [&](size_t offset, size_t size) {
tracker->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, size * BYTES_PER_PAGE,
add_to_tracker ? 1 : -1);
});
}
PageManager* tracker;
VAddr cpu_addr = 0;
Words<stack_words> words;
};
} // namespace VideoCore
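
The manager above keeps one dirty bit per page, packing 64 pages into each u64 word; IteratePages then decomposes a word into runs of consecutive set bits so callbacks fire once per contiguous dirty range instead of once per page. Below is a minimal standalone sketch of that bit-run walk, assuming a 4 KiB page size and a print callback in place of the real handlers; it is illustrative only and not part of the commit.

// Illustrative sketch: mirrors the countr_zero/countr_one loop of IteratePages.
#include <bit>
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t kBytesPerPage = 4096; // assumed page size
constexpr std::size_t kPagesPerWord = 64;

template <typename Func>
void IterateRuns(std::uint64_t mask, Func&& func) {
    std::size_t offset = 0;
    while (mask != 0) {
        const std::size_t zeros = std::countr_zero(mask); // skip clean pages
        offset += zeros;
        mask >>= zeros;
        const std::size_t ones = std::countr_one(mask); // length of the dirty run
        func(offset, ones);
        // Guard the shift: shifting a u64 by 64 would be undefined behaviour.
        mask = ones < kPagesPerWord ? (mask >> ones) : 0;
        offset += ones;
    }
}

int main() {
    // Pages 1..3 and page 10 are dirty; expect the runs (1, 3) and (10, 1).
    const std::uint64_t dirty_word = 0b100'0000'1110;
    IterateRuns(dirty_word, [](std::size_t first_page, std::size_t count) {
        std::printf("dirty range: offset=0x%zx size=0x%zx\n", first_page * kBytesPerPage,
                    count * kBytesPerPage);
    });
    return 0;
}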

View File

@@ -0,0 +1,65 @@
// SPDX-FileCopyrightText: 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <type_traits>
#include <utility>
#include <vector>
#include "common/object_pool.h"
#include "common/types.h"
namespace VideoCore {
template <class Traits>
class MultiLevelPageTable final {
using Entry = typename Traits::Entry;
static constexpr size_t AddressSpaceBits = Traits::AddressSpaceBits;
static constexpr size_t FirstLevelBits = Traits::FirstLevelBits;
static constexpr size_t PageBits = Traits::PageBits;
static constexpr size_t FirstLevelShift = AddressSpaceBits - FirstLevelBits;
static constexpr size_t SecondLevelBits = FirstLevelShift - PageBits;
static constexpr size_t NumEntriesPerL1Page = 1ULL << SecondLevelBits;
using L1Page = std::array<Entry, NumEntriesPerL1Page>;
public:
explicit MultiLevelPageTable() : first_level_map{1ULL << FirstLevelBits, nullptr} {}
~MultiLevelPageTable() noexcept = default;
[[nodiscard]] Entry* find(size_t page) {
const size_t l1_page = page >> SecondLevelBits;
const size_t l2_page = page & (NumEntriesPerL1Page - 1);
if (!first_level_map[l1_page]) {
return nullptr;
}
return &(*first_level_map[l1_page])[l2_page];
}
[[nodiscard]] const Entry& operator[](size_t page) const {
const size_t l1_page = page >> SecondLevelBits;
const size_t l2_page = page & (NumEntriesPerL1Page - 1);
if (!first_level_map[l1_page]) {
first_level_map[l1_page] = page_alloc.Create();
}
return (*first_level_map[l1_page])[l2_page];
}
[[nodiscard]] Entry& operator[](size_t page) {
const size_t l1_page = page >> SecondLevelBits;
const size_t l2_page = page & (NumEntriesPerL1Page - 1);
if (!first_level_map[l1_page]) {
first_level_map[l1_page] = page_alloc.Create();
}
return (*first_level_map[l1_page])[l2_page];
}
private:
std::vector<L1Page*> first_level_map{};
Common::ObjectPool<L1Page> page_alloc;
};
} // namespace VideoCore
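
The table above splits a page number into a first-level index into first_level_map and a second-level index inside the selected L1 page, allocating L1 pages lazily from the object pool the first time an entry is indexed (find() instead returns nullptr for untouched regions). A standalone sketch of the index split follows, using illustrative trait values that are not necessarily the ones the texture cache configures.

// Illustrative trait values only.
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t AddressSpaceBits = 40;
constexpr std::size_t FirstLevelBits = 16;
constexpr std::size_t PageBits = 12;
constexpr std::size_t FirstLevelShift = AddressSpaceBits - FirstLevelBits; // 24
constexpr std::size_t SecondLevelBits = FirstLevelShift - PageBits;        // 12
constexpr std::size_t NumEntriesPerL1Page = std::size_t{1} << SecondLevelBits;

int main() {
    const std::uint64_t guest_addr = 0x12'3456'7000ULL;
    const std::size_t page = static_cast<std::size_t>(guest_addr >> PageBits);
    const std::size_t l1_page = page >> SecondLevelBits;          // index into first_level_map
    const std::size_t l2_page = page & (NumEntriesPerL1Page - 1); // index inside the L1 page
    std::printf("page=0x%zx l1=0x%zx l2=0x%zx\n", page, l1_page, l2_page);
    return 0;
}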

View File

@@ -0,0 +1,260 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <thread>
#include "common/alignment.h"
#include "common/assert.h"
#include "common/error.h"
#include "video_core/page_manager.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#ifndef _WIN64
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#ifdef ENABLE_USERFAULTFD
#include <linux/userfaultfd.h>
#endif
#else
#include <windows.h>
#endif
namespace VideoCore {
constexpr size_t PAGESIZE = 4_KB;
constexpr size_t PAGEBITS = 12;
#ifdef _WIN64
struct PageManager::Impl {
Impl(Vulkan::Rasterizer* rasterizer_) {
rasterizer = rasterizer_;
veh_handle = AddVectoredExceptionHandler(0, GuestFaultSignalHandler);
ASSERT_MSG(veh_handle, "Failed to register an exception handler");
}
void OnMap(VAddr address, size_t size) {}
void OnUnmap(VAddr address, size_t size) {}
void Protect(VAddr address, size_t size, bool allow_write) {
DWORD prot = allow_write ? PAGE_READWRITE : PAGE_READONLY;
DWORD old_prot{};
BOOL result = VirtualProtect(std::bit_cast<LPVOID>(address), size, prot, &old_prot);
ASSERT_MSG(result != 0, "Region protection failed");
}
static LONG WINAPI GuestFaultSignalHandler(EXCEPTION_POINTERS* pExp) noexcept {
const u32 ec = pExp->ExceptionRecord->ExceptionCode;
if (ec == EXCEPTION_ACCESS_VIOLATION) {
const auto info = pExp->ExceptionRecord->ExceptionInformation;
if (info[0] == 1) { // Write violation
rasterizer->InvalidateMemory(info[1], sizeof(u64));
return EXCEPTION_CONTINUE_EXECUTION;
} /* else {
UNREACHABLE();
}*/
}
return EXCEPTION_CONTINUE_SEARCH; // pass further
}
inline static Vulkan::Rasterizer* rasterizer;
void* veh_handle{};
};
#elif ENABLE_USERFAULTFD
struct PageManager::Impl {
Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} {
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
ASSERT_MSG(uffd != -1, "{}", Common::GetLastErrorMsg());
// Request uffdio features from kernel.
uffdio_api api;
api.api = UFFD_API;
api.features = UFFD_FEATURE_THREAD_ID;
const int ret = ioctl(uffd, UFFDIO_API, &api);
ASSERT(ret == 0 && api.api == UFFD_API);
// Create uffd handler thread
ufd_thread = std::jthread([&](std::stop_token token) { UffdHandler(token); });
}
void OnMap(VAddr address, size_t size) {
uffdio_register reg;
reg.range.start = address;
reg.range.len = size;
reg.mode = UFFDIO_REGISTER_MODE_WP;
const int ret = ioctl(uffd, UFFDIO_REGISTER, &reg);
ASSERT_MSG(ret != -1, "Uffdio register failed");
}
void OnUnmap(VAddr address, size_t size) {
uffdio_range range;
range.start = address;
range.len = size;
const int ret = ioctl(uffd, UFFDIO_UNREGISTER, &range);
ASSERT_MSG(ret != -1, "Uffdio unregister failed");
}
void Protect(VAddr address, size_t size, bool allow_write) {
uffdio_writeprotect wp;
wp.range.start = address;
wp.range.len = size;
wp.mode = allow_write ? 0 : UFFDIO_WRITEPROTECT_MODE_WP;
const int ret = ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
ASSERT_MSG(ret != -1, "Uffdio writeprotect failed with error: {}",
Common::GetLastErrorMsg());
}
void UffdHandler(std::stop_token token) {
while (!token.stop_requested()) {
pollfd pollfd;
pollfd.fd = uffd;
pollfd.events = POLLIN;
// Block until the descriptor is ready for data reads.
const int pollres = poll(&pollfd, 1, -1);
switch (pollres) {
case -1:
perror("Poll userfaultfd");
continue;
break;
case 0:
continue;
case 1:
break;
default:
UNREACHABLE_MSG("Unexpected number of descriptors {} out of poll", pollres);
}
// We don't want an error condition to have occurred.
ASSERT_MSG(!(pollfd.revents & POLLERR), "POLLERR on userfaultfd");
// We waited until there is data to read; we don't care about anything else.
if (!(pollfd.revents & POLLIN)) {
continue;
}
// Read message from kernel.
uffd_msg msg;
const int readret = read(uffd, &msg, sizeof(msg));
ASSERT_MSG(readret != -1 || errno == EAGAIN, "Unexpected result of uffd read");
if (errno == EAGAIN) {
continue;
}
ASSERT_MSG(readret == sizeof(msg), "Unexpected short read, exiting");
ASSERT(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP);
// Notify rasterizer about the fault.
const VAddr addr = msg.arg.pagefault.address;
const VAddr addr_page = Common::AlignDown(addr, PAGESIZE);
rasterizer->InvalidateMemory(addr_page, PAGESIZE);
}
}
Vulkan::Rasterizer* rasterizer;
std::jthread ufd_thread;
int uffd;
};
#else
struct PageManager::Impl {
Impl(Vulkan::Rasterizer* rasterizer_) {
rasterizer = rasterizer_;
#ifdef __APPLE__
// Read-only memory write results in SIGBUS on Apple.
static constexpr int SignalType = SIGBUS;
#else
static constexpr int SignalType = SIGSEGV;
#endif
sigset_t signal_mask;
sigemptyset(&signal_mask);
sigaddset(&signal_mask, SignalType);
using HandlerType = decltype(sigaction::sa_sigaction);
struct sigaction guest_access_fault {};
guest_access_fault.sa_flags = SA_SIGINFO | SA_ONSTACK;
guest_access_fault.sa_sigaction = &GuestFaultSignalHandler;
guest_access_fault.sa_mask = signal_mask;
sigaction(SignalType, &guest_access_fault, nullptr);
}
void OnMap(VAddr address, size_t size) {}
void OnUnmap(VAddr address, size_t size) {}
void Protect(VAddr address, size_t size, bool allow_write) {
mprotect(reinterpret_cast<void*>(address), size,
PROT_READ | (allow_write ? PROT_WRITE : 0));
}
static void GuestFaultSignalHandler(int sig, siginfo_t* info, void* raw_context) {
ucontext_t* ctx = reinterpret_cast<ucontext_t*>(raw_context);
const VAddr address = reinterpret_cast<VAddr>(info->si_addr);
#ifdef __APPLE__
const u32 err = ctx->uc_mcontext->__es.__err;
#else
const greg_t err = ctx->uc_mcontext.gregs[REG_ERR];
#endif
if (err & 0x2) {
rasterizer->InvalidateMemory(address, sizeof(u64));
} else {
// Read not supported!
UNREACHABLE();
}
}
inline static Vulkan::Rasterizer* rasterizer;
};
#endif
PageManager::PageManager(Vulkan::Rasterizer* rasterizer_)
: impl{std::make_unique<Impl>(rasterizer_)}, rasterizer{rasterizer_} {}
PageManager::~PageManager() = default;
void PageManager::OnGpuMap(VAddr address, size_t size) {
impl->OnMap(address, size);
}
void PageManager::OnGpuUnmap(VAddr address, size_t size) {
impl->OnUnmap(address, size);
}
void PageManager::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) {
static constexpr u64 PageShift = 12;
std::scoped_lock lk{mutex};
const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1;
const u64 page_start = addr >> PageShift;
const u64 page_end = page_start + num_pages;
const auto pages_interval =
decltype(cached_pages)::interval_type::right_open(page_start, page_end);
if (delta > 0) {
cached_pages.add({pages_interval, delta});
}
const auto& range = cached_pages.equal_range(pages_interval);
for (const auto& [range, count] : boost::make_iterator_range(range)) {
const auto interval = range & pages_interval;
const VAddr interval_start_addr = boost::icl::first(interval) << PageShift;
const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift;
const u32 interval_size = interval_end_addr - interval_start_addr;
if (delta > 0 && count == delta) {
impl->Protect(interval_start_addr, interval_size, false);
} else if (delta < 0 && count == -delta) {
impl->Protect(interval_start_addr, interval_size, true);
} else {
ASSERT(count >= 0);
}
}
if (delta < 0) {
cached_pages.add({pages_interval, delta});
}
}
} // namespace VideoCore
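
UpdatePagesCachedCount leans on boost::icl::interval_map to aggregate the +1/-1 deltas per page interval: overlapping GPU objects simply raise the per-page count, a page is write-protected when its count first becomes positive, and it becomes writable again only once the count returns to zero. A small standalone sketch of that counting behaviour (the page numbers are invented; requires Boost.ICL):

// Illustrative only: shows how overlapping intervals accumulate counts.
#include <cstdint>
#include <cstdio>
#include <boost/icl/interval_map.hpp>

int main() {
    using Interval = boost::icl::interval<std::uint64_t>;
    boost::icl::interval_map<std::uint64_t, int> cached_pages;

    // Two cached GPU objects overlap on pages [4, 6).
    cached_pages.add({Interval::right_open(0, 6), 1});
    cached_pages.add({Interval::right_open(4, 8), 1});

    // The map now holds [0,4)->1, [4,6)->2, [6,8)->1.
    for (const auto& [range, count] : cached_pages) {
        std::printf("pages [0x%llx, 0x%llx) count=%d -> %s\n",
                    static_cast<unsigned long long>(boost::icl::first(range)),
                    static_cast<unsigned long long>(boost::icl::last_next(range)), count,
                    count > 0 ? "write-protected" : "writable");
    }

    // Removing one object leaves [4,6) protected and frees only [0,4).
    cached_pages.add({Interval::right_open(0, 6), -1});
    return 0;
}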

View File

@@ -0,0 +1,39 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <memory>
#include <mutex>
#include <boost/icl/interval_map.hpp>
#include "common/types.h"
namespace Vulkan {
class Rasterizer;
}
namespace VideoCore {
class PageManager {
public:
explicit PageManager(Vulkan::Rasterizer* rasterizer);
~PageManager();
/// Register a range of mapped gpu memory.
void OnGpuMap(VAddr address, size_t size);
/// Unregister a range of gpu memory that was unmapped.
void OnGpuUnmap(VAddr address, size_t size);
/// Increase/decrease the number of surfaces in pages touching the specified region
void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta);
private:
struct Impl;
std::unique_ptr<Impl> impl;
Vulkan::Rasterizer* rasterizer;
std::mutex mutex;
boost::icl::interval_map<VAddr, s32> cached_pages;
};
} // namespace VideoCore

View File

@@ -67,8 +67,8 @@ RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool*
: window{window_}, liverpool{liverpool_},
instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, draw_scheduler{instance},
present_scheduler{instance}, flip_scheduler{instance}, swapchain{instance, window},
texture_cache{instance, draw_scheduler} {
rasterizer = std::make_unique<Rasterizer>(instance, draw_scheduler, texture_cache, liverpool);
rasterizer{std::make_unique<Rasterizer>(instance, draw_scheduler, liverpool)},
texture_cache{rasterizer->GetTextureCache()} {
const u32 num_images = swapchain.GetImageCount();
const vk::Device device = instance.GetDevice();

View File

@@ -47,7 +47,7 @@ public:
Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute,
VAddr cpu_address, bool is_eop) {
const auto info = VideoCore::ImageInfo{attribute, cpu_address};
const auto image_id = texture_cache.FindImage(info, cpu_address);
const auto image_id = texture_cache.FindImage(info, false);
auto& image = texture_cache.GetImage(image_id);
return PrepareFrameInternal(image, is_eop);
}
@@ -61,7 +61,7 @@ public:
const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address) {
vo_buffers_addr.emplace_back(cpu_address);
const auto info = VideoCore::ImageInfo{attribute, cpu_address};
const auto image_id = texture_cache.FindImage(info, cpu_address);
const auto image_id = texture_cache.FindImage(info, false);
return texture_cache.GetImage(image_id);
}
@@ -88,7 +88,7 @@ private:
Scheduler flip_scheduler;
Swapchain swapchain;
std::unique_ptr<Rasterizer> rasterizer;
VideoCore::TextureCache texture_cache;
VideoCore::TextureCache& texture_cache;
vk::UniqueCommandPool command_pool;
std::vector<Frame> present_frames;
std::queue<Frame*> free_queue;

View File

@@ -3,11 +3,10 @@
#include <boost/container/small_vector.hpp>
#include "common/alignment.h"
#include "core/memory.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/texture_cache/texture_cache.h"
namespace Vulkan {
@@ -51,6 +50,12 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
});
}
const vk::PushConstantRange push_constants = {
.stageFlags = vk::ShaderStageFlagBits::eCompute,
.offset = 0,
.size = sizeof(Shader::PushData),
};
const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
.flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
.bindingCount = static_cast<u32>(bindings.size()),
@@ -62,8 +67,8 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
const vk::PipelineLayoutCreateInfo layout_info = {
.setLayoutCount = 1U,
.pSetLayouts = &set_layout,
.pushConstantRangeCount = 0,
.pPushConstantRanges = nullptr,
.pushConstantRangeCount = 1U,
.pPushConstantRanges = &push_constants,
};
pipeline_layout = instance.GetDevice().createPipelineLayoutUnique(layout_info);
@@ -82,33 +87,18 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
ComputePipeline::~ComputePipeline() = default;
bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging,
bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
VideoCore::TextureCache& texture_cache) const {
// Bind resource buffers and textures.
boost::container::static_vector<vk::DescriptorBufferInfo, 16> buffer_infos;
boost::container::static_vector<vk::DescriptorImageInfo, 16> image_infos;
boost::container::small_vector<vk::WriteDescriptorSet, 16> set_writes;
Shader::PushData push_data{};
u32 binding{};
for (const auto& buffer : info.buffers) {
for (u32 i = 0; const auto& buffer : info.buffers) {
const auto vsharp = buffer.GetVsharp(info);
const u32 size = vsharp.GetSize();
const VAddr address = vsharp.base_address;
texture_cache.OnCpuWrite(address);
const u32 offset = staging.Copy(address, size,
buffer.is_storage ? instance.StorageMinAlignment()
: instance.UniformMinAlignment());
buffer_infos.emplace_back(staging.Handle(), offset, size);
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
.dstBinding = binding++,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = buffer.is_storage ? vk::DescriptorType::eStorageBuffer
: vk::DescriptorType::eUniformBuffer,
.pBufferInfo = &buffer_infos.back(),
});
// Most of the time when metadata is updated by a shader it is simply cleared. This means we
// can skip the whole dispatch and update the tracked state instead. Also, it is not
// intended to be consumed, and in such rare cases (e.g. HTile introspection, CRAA) we will
@@ -123,6 +113,31 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s
LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)");
}
}
const u32 size = vsharp.GetSize();
if (buffer.is_written) {
texture_cache.InvalidateMemory(address, size);
}
const u32 alignment =
buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment();
const auto [vk_buffer, offset] =
buffer_cache.ObtainBuffer(address, size, buffer.is_written);
const u32 offset_aligned = Common::AlignDown(offset, alignment);
const u32 adjust = offset - offset_aligned;
if (adjust != 0) {
ASSERT(adjust % 4 == 0);
push_data.AddOffset(binding, adjust);
}
buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust);
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
.dstBinding = binding++,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = buffer.is_storage ? vk::DescriptorType::eStorageBuffer
: vk::DescriptorType::eUniformBuffer,
.pBufferInfo = &buffer_infos.back(),
});
i++;
}
for (const auto& image_desc : info.images) {
@@ -166,6 +181,8 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s
}
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.pushConstants(*pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(push_data),
&push_data);
cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, set_writes);
return true;
}
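
ObtainBuffer can return an offset into a cached buffer that does not satisfy the device's minimum uniform/storage buffer offset alignment, so the binding above aligns the offset down for the descriptor and forwards the remainder to the shader via PushData::AddOffset. A standalone sketch of that fix-up arithmetic, with invented values:

// Illustrative only: mirrors the AlignDown/adjust logic used when binding.
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t AlignDown(std::uint32_t value, std::uint32_t alignment) {
    return value & ~(alignment - 1); // alignment must be a power of two
}

int main() {
    const std::uint32_t min_alignment = 0x40; // e.g. minUniformBufferOffsetAlignment
    const std::uint32_t offset = 0x1234;      // offset handed back by the buffer cache
    const std::uint32_t offset_aligned = AlignDown(offset, min_alignment); // 0x1200
    const std::uint32_t adjust = offset - offset_aligned;                  // 0x34
    // The descriptor range is grown by `adjust` and the shader adds it back,
    // which is why the pipeline asserts that adjust is a multiple of 4.
    std::printf("bind at 0x%x, shader adds back 0x%x\n", offset_aligned, adjust);
    return adjust % 4 == 0 ? 0 : 1;
}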

View File

@@ -6,19 +6,15 @@
#include "shader_recompiler/runtime_info.h"
#include "video_core/renderer_vulkan/vk_common.h"
namespace Core {
class MemoryManager;
}
namespace VideoCore {
class BufferCache;
class TextureCache;
}
} // namespace VideoCore
namespace Vulkan {
class Instance;
class Scheduler;
class StreamBuffer;
class ComputePipeline {
public:
@@ -31,7 +27,7 @@ public:
return *pipeline;
}
bool BindResources(Core::MemoryManager* memory, StreamBuffer& staging,
bool BindResources(VideoCore::BufferCache& buffer_cache,
VideoCore::TextureCache& texture_cache) const;
private:

View File

@@ -5,13 +5,13 @@
#include <boost/container/small_vector.hpp>
#include <boost/container/static_vector.hpp>
#include "common/alignment.h"
#include "common/assert.h"
#include "core/memory.h"
#include "video_core/amdgpu/resource.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/texture_cache/texture_cache.h"
namespace Vulkan {
@@ -32,9 +32,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul
BuildDescSetLayout();
const vk::PushConstantRange push_constants = {
.stageFlags = vk::ShaderStageFlagBits::eVertex,
.stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment,
.offset = 0,
.size = 2 * sizeof(u32),
.size = sizeof(Shader::PushData),
};
const vk::DescriptorSetLayout set_layout = *desc_layout;
@@ -328,25 +328,43 @@ void GraphicsPipeline::BuildDescSetLayout() {
desc_layout = instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci);
}
void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging,
void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
VideoCore::BufferCache& buffer_cache,
VideoCore::TextureCache& texture_cache) const {
BindVertexBuffers(staging);
// Bind resource buffers and textures.
boost::container::static_vector<vk::DescriptorBufferInfo, 16> buffer_infos;
boost::container::static_vector<vk::DescriptorImageInfo, 32> image_infos;
boost::container::small_vector<vk::WriteDescriptorSet, 16> set_writes;
Shader::PushData push_data{};
u32 binding{};
for (const auto& stage : stages) {
if (stage.uses_step_rates) {
push_data.step0 = regs.vgt_instance_step_rate_0;
push_data.step1 = regs.vgt_instance_step_rate_1;
}
for (const auto& buffer : stage.buffers) {
const auto vsharp = buffer.GetVsharp(stage);
const VAddr address = vsharp.base_address;
const u32 size = vsharp.GetSize();
const u32 offset = staging.Copy(address, size,
buffer.is_storage ? instance.StorageMinAlignment()
: instance.UniformMinAlignment());
buffer_infos.emplace_back(staging.Handle(), offset, size);
if (vsharp) {
const VAddr address = vsharp.base_address;
if (texture_cache.IsMeta(address)) {
LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a PS shader (buffer)");
}
const u32 size = vsharp.GetSize();
const u32 alignment = buffer.is_storage ? instance.StorageMinAlignment()
: instance.UniformMinAlignment();
const auto [vk_buffer, offset] =
buffer_cache.ObtainBuffer(address, size, buffer.is_written);
const u32 offset_aligned = Common::AlignDown(offset, alignment);
const u32 adjust = offset - offset_aligned;
if (adjust != 0) {
ASSERT(adjust % 4 == 0);
push_data.AddOffset(binding, adjust);
}
buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust);
} else {
buffer_infos.emplace_back(VK_NULL_HANDLE, 0, VK_WHOLE_SIZE);
}
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
.dstBinding = binding++,
@@ -356,10 +374,6 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer&
: vk::DescriptorType::eUniformBuffer,
.pBufferInfo = &buffer_infos.back(),
});
if (texture_cache.IsMeta(address)) {
LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a PS shader (buffer)");
}
}
boost::container::static_vector<AmdGpu::Image, 16> tsharps;
@@ -406,86 +420,15 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer&
}
}
const auto cmdbuf = scheduler.CommandBuffer();
if (!set_writes.empty()) {
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0,
set_writes);
}
}
void GraphicsPipeline::BindVertexBuffers(StreamBuffer& staging) const {
const auto& vs_info = stages[u32(Shader::Stage::Vertex)];
if (vs_info.vs_inputs.empty()) {
return;
}
std::array<vk::Buffer, MaxVertexBufferCount> host_buffers;
std::array<vk::DeviceSize, MaxVertexBufferCount> host_offsets;
boost::container::static_vector<AmdGpu::Buffer, MaxVertexBufferCount> guest_buffers;
struct BufferRange {
VAddr base_address;
VAddr end_address;
u64 offset; // offset in the mapped memory
size_t GetSize() const {
return end_address - base_address;
}
};
// Calculate buffers memory overlaps
boost::container::static_vector<BufferRange, MaxVertexBufferCount> ranges{};
for (const auto& input : vs_info.vs_inputs) {
if (input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate0 ||
input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate1) {
continue;
}
const auto& buffer = vs_info.ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
if (buffer.GetSize() == 0) {
continue;
}
guest_buffers.emplace_back(buffer);
ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize());
}
std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) {
return lhv.base_address < rhv.base_address;
});
boost::container::static_vector<BufferRange, MaxVertexBufferCount> ranges_merged{ranges[0]};
for (auto range : ranges) {
auto& prev_range = ranges_merged.back();
if (prev_range.end_address < range.base_address) {
ranges_merged.emplace_back(range);
} else {
prev_range.end_address = std::max(prev_range.end_address, range.end_address);
}
}
// Map buffers
for (auto& range : ranges_merged) {
range.offset = staging.Copy(range.base_address, range.GetSize(), 4);
}
// Bind vertex buffers
const size_t num_buffers = guest_buffers.size();
for (u32 i = 0; i < num_buffers; ++i) {
const auto& buffer = guest_buffers[i];
const auto& host_buffer = std::ranges::find_if(
ranges_merged.cbegin(), ranges_merged.cend(), [&](const BufferRange& range) {
return (buffer.base_address >= range.base_address &&
buffer.base_address < range.end_address);
});
assert(host_buffer != ranges_merged.cend());
host_buffers[i] = staging.Handle();
host_offsets[i] = host_buffer->offset + buffer.base_address - host_buffer->base_address;
}
if (num_buffers > 0) {
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindVertexBuffers(0, num_buffers, host_buffers.data(), host_offsets.data());
}
cmdbuf.pushConstants(*pipeline_layout,
vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, 0U,
sizeof(push_data), &push_data);
cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, Handle());
}
} // namespace Vulkan

View File

@@ -7,13 +7,10 @@
#include "video_core/renderer_vulkan/liverpool_to_vk.h"
#include "video_core/renderer_vulkan/vk_common.h"
namespace Core {
class MemoryManager;
}
namespace VideoCore {
class BufferCache;
class TextureCache;
}
} // namespace VideoCore
namespace Vulkan {
@@ -22,7 +19,6 @@ static constexpr u32 MaxShaderStages = 5;
class Instance;
class Scheduler;
class StreamBuffer;
using Liverpool = AmdGpu::Liverpool;
@@ -64,7 +60,7 @@ public:
std::array<vk::ShaderModule, MaxShaderStages> modules);
~GraphicsPipeline();
void BindResources(Core::MemoryManager* memory, StreamBuffer& staging,
void BindResources(const Liverpool::Regs& regs, VideoCore::BufferCache& buffer_cache,
VideoCore::TextureCache& texture_cache) const;
vk::Pipeline Handle() const noexcept {
@@ -75,6 +71,10 @@ public:
return *pipeline_layout;
}
const Shader::Info& GetStage(Shader::Stage stage) const noexcept {
return stages[u32(stage)];
}
bool IsEmbeddedVs() const noexcept {
static constexpr size_t EmbeddedVsHash = 0x9b2da5cf47f8c29f;
return key.stage_hashes[u32(Shader::Stage::Vertex)] == EmbeddedVsHash;
@@ -90,7 +90,6 @@ public:
private:
void BuildDescSetLayout();
void BindVertexBuffers(StreamBuffer& staging) const;
private:
const Instance& instance;

View File

@@ -204,7 +204,8 @@ bool Instance::CreateDevice() {
// The next two extensions are required to be available together in order to support write masks
color_write_en = add_extension(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME);
color_write_en &= add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME);
const auto calibrated_timestamps = add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME);
const bool calibrated_timestamps = add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME);
const bool robustness = add_extension(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME);
// These extensions are promoted by Vulkan 1.3, but for greater compatibility we use Vulkan 1.2
// with extensions.
@@ -303,12 +304,19 @@ bool Instance::CreateDevice() {
.workgroupMemoryExplicitLayoutScalarBlockLayout = true,
.workgroupMemoryExplicitLayout8BitAccess = true,
.workgroupMemoryExplicitLayout16BitAccess = true,
}};
},
vk::PhysicalDeviceRobustness2FeaturesEXT{
.nullDescriptor = true,
},
};
if (!color_write_en) {
device_chain.unlink<vk::PhysicalDeviceColorWriteEnableFeaturesEXT>();
device_chain.unlink<vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT>();
}
if (!robustness) {
device_chain.unlink<vk::PhysicalDeviceRobustness2FeaturesEXT>();
}
try {
device = physical_device.createDeviceUnique(device_chain.get());

View File

@@ -5,7 +5,6 @@
#include <tsl/robin_map.h>
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/object_pool.h"
#include "shader_recompiler/profile.h"
#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
@@ -51,8 +50,8 @@ private:
Shader::Profile profile{};
GraphicsPipelineKey graphics_key{};
u64 compute_key{};
Shader::ObjectPool<Shader::IR::Inst> inst_pool;
Shader::ObjectPool<Shader::IR::Block> block_pool;
Common::ObjectPool<Shader::IR::Inst> inst_pool;
Common::ObjectPool<Shader::IR::Block> block_pool;
};
} // namespace Vulkan

View File

@@ -13,22 +13,17 @@
namespace Vulkan {
static constexpr vk::BufferUsageFlags VertexIndexFlags =
vk::BufferUsageFlagBits::eVertexBuffer | vk::BufferUsageFlagBits::eIndexBuffer |
vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eUniformBuffer |
vk::BufferUsageFlagBits::eStorageBuffer;
Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
VideoCore::TextureCache& texture_cache_, AmdGpu::Liverpool* liverpool_)
: instance{instance_}, scheduler{scheduler_}, texture_cache{texture_cache_},
liverpool{liverpool_}, memory{Core::Memory::Instance()},
pipeline_cache{instance, scheduler, liverpool},
vertex_index_buffer{instance, scheduler, VertexIndexFlags, 2_GB, BufferType::Upload} {
AmdGpu::Liverpool* liverpool_)
: instance{instance_}, scheduler{scheduler_}, page_manager{this},
buffer_cache{instance, scheduler, liverpool_, page_manager},
texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
if (!Config::nullGpu()) {
liverpool->BindRasterizer(this);
}
memory->SetInstance(&instance);
memory->SetRasterizer(this);
wfi_event = instance.GetDevice().createEventUnique({});
}
Rasterizer::~Rasterizer() = default;
@@ -38,29 +33,24 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
const auto cmdbuf = scheduler.CommandBuffer();
const auto& regs = liverpool->regs;
const u32 num_indices = SetupIndexBuffer(is_indexed, index_offset);
const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline();
if (!pipeline) {
return;
}
try {
pipeline->BindResources(memory, vertex_index_buffer, texture_cache);
pipeline->BindResources(regs, buffer_cache, texture_cache);
} catch (...) {
UNREACHABLE();
}
const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex);
buffer_cache.BindVertexBuffers(vs_info);
const u32 num_indices = buffer_cache.BindIndexBuffer(is_indexed, index_offset);
BeginRendering();
UpdateDynamicState(*pipeline);
cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle());
const u32 step_rates[] = {
regs.vgt_instance_step_rate_0,
regs.vgt_instance_step_rate_1,
};
cmdbuf.pushConstants(pipeline->GetLayout(), vk::ShaderStageFlagBits::eVertex, 0u,
sizeof(step_rates), &step_rates);
if (is_indexed) {
cmdbuf.drawIndexed(num_indices, regs.num_instances.NumInstances(), 0, 0, 0);
} else {
@@ -82,8 +72,7 @@ void Rasterizer::DispatchDirect() {
}
try {
const auto has_resources =
pipeline->BindResources(memory, vertex_index_buffer, texture_cache);
const auto has_resources = pipeline->BindResources(buffer_cache, texture_cache);
if (!has_resources) {
return;
}
@@ -131,7 +120,7 @@ void Rasterizer::BeginRendering() {
state.color_images[state.num_color_attachments] = image.image;
state.color_attachments[state.num_color_attachments++] = {
.imageView = *image_view.image_view,
.imageLayout = vk::ImageLayout::eGeneral,
.imageLayout = vk::ImageLayout::eColorAttachmentOptimal,
.loadOp = is_clear ? vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad,
.storeOp = vk::AttachmentStoreOp::eStore,
.clearValue =
@@ -168,45 +157,19 @@ void Rasterizer::BeginRendering() {
scheduler.BeginRendering(state);
}
u32 Rasterizer::SetupIndexBuffer(bool& is_indexed, u32 index_offset) {
// Emulate QuadList primitive type with CPU made index buffer.
const auto& regs = liverpool->regs;
if (liverpool->regs.primitive_type == Liverpool::PrimitiveType::QuadList) {
// ASSERT_MSG(!is_indexed, "Using QuadList primitive with indexed draw");
is_indexed = true;
void Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
buffer_cache.InvalidateMemory(addr, size);
texture_cache.InvalidateMemory(addr, size);
}
// Emit indices.
const u32 index_size = 3 * regs.num_indices;
const auto [data, offset, _] = vertex_index_buffer.Map(index_size);
LiverpoolToVK::EmitQuadToTriangleListIndices(data, regs.num_indices);
vertex_index_buffer.Commit(index_size);
void Rasterizer::MapMemory(VAddr addr, u64 size) {
page_manager.OnGpuMap(addr, size);
}
// Bind index buffer.
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset, vk::IndexType::eUint16);
return index_size / sizeof(u16);
}
if (!is_indexed) {
return regs.num_indices;
}
// Figure out index type and size.
const bool is_index16 = regs.index_buffer_type.index_type == Liverpool::IndexType::Index16;
const vk::IndexType index_type = is_index16 ? vk::IndexType::eUint16 : vk::IndexType::eUint32;
const u32 index_size = is_index16 ? sizeof(u16) : sizeof(u32);
// Upload index data to stream buffer.
const auto index_address = regs.index_base_address.Address<const void*>();
const u32 index_buffer_size = (index_offset + regs.num_indices) * index_size;
const auto [data, offset, _] = vertex_index_buffer.Map(index_buffer_size);
std::memcpy(data, index_address, index_buffer_size);
vertex_index_buffer.Commit(index_buffer_size);
// Bind index buffer.
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindIndexBuffer(vertex_index_buffer.Handle(), offset + index_offset * index_size,
index_type);
return regs.num_indices;
void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
buffer_cache.InvalidateMemory(addr, size);
texture_cache.UnmapMemory(addr, size);
page_manager.OnGpuUnmap(addr, size);
}
void Rasterizer::UpdateDynamicState(const GraphicsPipeline& pipeline) {

View File

@@ -3,8 +3,10 @@
#pragma once
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/page_manager.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/texture_cache/texture_cache.h"
namespace AmdGpu {
struct Liverpool;
@@ -14,10 +16,6 @@ namespace Core {
class MemoryManager;
}
namespace VideoCore {
class TextureCache;
}
namespace Vulkan {
class Scheduler;
@@ -26,9 +24,13 @@ class GraphicsPipeline;
class Rasterizer {
public:
explicit Rasterizer(const Instance& instance, Scheduler& scheduler,
VideoCore::TextureCache& texture_cache, AmdGpu::Liverpool* liverpool);
AmdGpu::Liverpool* liverpool);
~Rasterizer();
[[nodiscard]] VideoCore::TextureCache& GetTextureCache() noexcept {
return texture_cache;
}
void Draw(bool is_indexed, u32 index_offset = 0);
void DispatchDirect();
@@ -36,12 +38,13 @@ public:
void ScopeMarkerBegin(const std::string& str);
void ScopeMarkerEnd();
void InvalidateMemory(VAddr addr, u64 size);
void MapMemory(VAddr addr, u64 size);
void UnmapMemory(VAddr addr, u64 size);
u64 Flush();
private:
u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset);
void MapMemory(VAddr addr, size_t size);
void BeginRendering();
void UpdateDynamicState(const GraphicsPipeline& pipeline);
@@ -51,11 +54,13 @@ private:
private:
const Instance& instance;
Scheduler& scheduler;
VideoCore::TextureCache& texture_cache;
VideoCore::PageManager page_manager;
VideoCore::BufferCache buffer_cache;
VideoCore::TextureCache texture_cache;
AmdGpu::Liverpool* liverpool;
Core::MemoryManager* memory;
PipelineCache pipeline_cache;
StreamBuffer vertex_index_buffer;
vk::UniqueEvent wfi_event;
};
} // namespace Vulkan
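
A note on the member layout above: C++ initializes non-static members in declaration order, so declaring page_manager before buffer_cache and buffer_cache before texture_cache is what makes the constructor's initializer list safe, because each cache receives a reference to a dependency that is already fully constructed. A minimal sketch of that guarantee with stand-in types (not the real classes):

// Illustrative stand-ins demonstrating member construction order.
#include <cstdio>

struct PageManager {
    PageManager() { std::puts("page manager"); }
};
struct BufferCache {
    explicit BufferCache(PageManager&) { std::puts("buffer cache"); }
};
struct TextureCache {
    explicit TextureCache(BufferCache&) { std::puts("texture cache"); }
};

struct Rasterizer {
    PageManager page_manager;    // constructed first
    BufferCache buffer_cache;    // may safely reference page_manager
    TextureCache texture_cache;  // may safely reference buffer_cache

    Rasterizer() : page_manager{}, buffer_cache{page_manager}, texture_cache{buffer_cache} {}
};

int main() {
    Rasterizer rasterizer; // prints: page manager, buffer cache, texture cache
    return 0;
}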

View File

@@ -6,6 +6,7 @@
#include <condition_variable>
#include <boost/container/static_vector.hpp>
#include "common/types.h"
#include "common/unique_function.h"
#include "video_core/renderer_vulkan/vk_master_semaphore.h"
#include "video_core/renderer_vulkan/vk_resource_pool.h"
@@ -97,8 +98,8 @@ public:
}
/// Defers an operation until the gpu has reached the current cpu tick.
void DeferOperation(auto&& func) {
pending_ops.emplace(func, CurrentTick());
void DeferOperation(Common::UniqueFunction<void>&& func) {
pending_ops.emplace(std::move(func), CurrentTick());
}
static std::mutex submit_mutex;
@@ -115,7 +116,7 @@ private:
vk::CommandBuffer current_cmdbuf;
std::condition_variable_any event_cv;
struct PendingOp {
std::function<void()> callback;
Common::UniqueFunction<void> callback;
u64 gpu_tick;
};
std::queue<PendingOp> pending_ops;
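
DeferOperation now stores a Common::UniqueFunction rather than std::function, presumably because deferred clean-ups may capture move-only objects (unique Vulkan handles, staging allocations) that a copyable std::function cannot hold. A rough sketch of the idea, using C++23 std::move_only_function as a stand-in for the project's wrapper; the captured resource is an invented example.

// Illustrative stand-in for Common::UniqueFunction.
#include <cstdint>
#include <functional>
#include <memory>
#include <queue>

struct PendingOp {
    std::move_only_function<void()> callback;
    std::uint64_t gpu_tick;
};

int main() {
    std::queue<PendingOp> pending_ops;
    auto resource = std::make_unique<int>(42); // stands in for e.g. a vk::UniqueBuffer

    // A copy-requiring std::function could not store this lambda:
    // std::function<void()> bad = [r = std::move(resource)] {}; // would not compile
    PendingOp op{[r = std::move(resource)]() mutable { r.reset(); }, /*gpu_tick=*/1};
    pending_ops.push(std::move(op));

    // Once the GPU has reached the recorded tick, the callback is run and dropped.
    pending_ops.front().callback();
    pending_ops.pop();
    return 0;
}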

View File

@@ -1,241 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include "common/alignment.h"
#include "common/assert.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
namespace Vulkan {
namespace {
std::string_view BufferTypeName(BufferType type) {
switch (type) {
case BufferType::Upload:
return "Upload";
case BufferType::Download:
return "Download";
case BufferType::Stream:
return "Stream";
default:
return "Invalid";
}
}
vk::MemoryPropertyFlags MakePropertyFlags(BufferType type) {
switch (type) {
case BufferType::Upload:
return vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent;
case BufferType::Download:
return vk::MemoryPropertyFlagBits::eHostVisible |
vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached;
case BufferType::Stream:
return vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible |
vk::MemoryPropertyFlagBits::eHostCoherent;
default:
UNREACHABLE_MSG("Unknown buffer type {}", static_cast<u32>(type));
return vk::MemoryPropertyFlagBits::eHostVisible;
}
}
static std::optional<u32> FindMemoryType(const vk::PhysicalDeviceMemoryProperties& properties,
vk::MemoryPropertyFlags wanted) {
for (u32 i = 0; i < properties.memoryTypeCount; ++i) {
const auto flags = properties.memoryTypes[i].propertyFlags;
if ((flags & wanted) == wanted) {
return i;
}
}
return std::nullopt;
}
/// Get the preferred host visible memory type.
u32 GetMemoryType(const vk::PhysicalDeviceMemoryProperties& properties, BufferType type) {
vk::MemoryPropertyFlags flags = MakePropertyFlags(type);
std::optional preferred_type = FindMemoryType(properties, flags);
constexpr std::array remove_flags = {
vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostCoherent,
};
for (u32 i = 0; i < remove_flags.size() && !preferred_type; i++) {
flags &= ~remove_flags[i];
preferred_type = FindMemoryType(properties, flags);
}
ASSERT_MSG(preferred_type, "No suitable memory type found");
return preferred_type.value();
}
constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
} // Anonymous namespace
StreamBuffer::StreamBuffer(const Instance& instance_, Scheduler& scheduler_,
vk::BufferUsageFlags usage_, u64 size, BufferType type_)
: instance{instance_}, scheduler{scheduler_}, device{instance.GetDevice()},
stream_buffer_size{size}, usage{usage_}, type{type_} {
CreateBuffers(size);
ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
}
StreamBuffer::~StreamBuffer() {
device.unmapMemory(memory);
device.destroyBuffer(buffer);
device.freeMemory(memory);
}
std::tuple<u8*, u64, bool> StreamBuffer::Map(u64 size, u64 alignment) {
if (!is_coherent && type == BufferType::Stream) {
size = Common::AlignUp(size, instance.NonCoherentAtomSize());
}
ASSERT(size <= stream_buffer_size);
mapped_size = size;
if (alignment > 0) {
offset = Common::AlignUp(offset, alignment);
}
bool invalidate{false};
if (offset + size > stream_buffer_size) {
// The buffer would overflow, save the amount of used watches and reset the state.
invalidate = true;
invalidation_mark = current_watch_cursor;
current_watch_cursor = 0;
offset = 0;
// Swap watches and reset waiting cursors.
std::swap(previous_watches, current_watches);
wait_cursor = 0;
wait_bound = 0;
}
const u64 mapped_upper_bound = offset + size;
WaitPendingOperations(mapped_upper_bound);
return std::make_tuple(mapped + offset, offset, invalidate);
}
void StreamBuffer::Commit(u64 size) {
if (!is_coherent && type == BufferType::Stream) {
size = Common::AlignUp(size, instance.NonCoherentAtomSize());
}
ASSERT_MSG(size <= mapped_size, "Reserved size {} is too small compared to {}", mapped_size,
size);
const vk::MappedMemoryRange range = {
.memory = memory,
.offset = offset,
.size = size,
};
if (!is_coherent && type == BufferType::Download) {
device.invalidateMappedMemoryRanges(range);
} else if (!is_coherent) {
device.flushMappedMemoryRanges(range);
}
offset += size;
if (current_watch_cursor + 1 >= current_watches.size()) {
// Ensure that there are enough watches.
ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK);
}
auto& watch = current_watches[current_watch_cursor++];
watch.upper_bound = offset;
watch.tick = scheduler.CurrentTick();
}
void StreamBuffer::CreateBuffers(u64 prefered_size) {
const vk::Device device = instance.GetDevice();
const auto memory_properties = instance.GetPhysicalDevice().getMemoryProperties();
const u32 preferred_type = GetMemoryType(memory_properties, type);
const vk::MemoryType mem_type = memory_properties.memoryTypes[preferred_type];
const u32 preferred_heap = mem_type.heapIndex;
is_coherent =
static_cast<bool>(mem_type.propertyFlags & vk::MemoryPropertyFlagBits::eHostCoherent);
// Subtract some bytes from the preferred heap size to avoid running out of memory.
const vk::DeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size;
// As per DXVK's example, using `heap_size / 2`
const vk::DeviceSize allocable_size = heap_size / 2;
buffer = device.createBuffer({
.size = std::min(prefered_size, allocable_size),
.usage = usage,
});
const auto requirements_chain =
device
.getBufferMemoryRequirements2<vk::MemoryRequirements2, vk::MemoryDedicatedRequirements>(
{.buffer = buffer});
const auto& requirements = requirements_chain.get<vk::MemoryRequirements2>();
const auto& dedicated_requirements = requirements_chain.get<vk::MemoryDedicatedRequirements>();
stream_buffer_size = static_cast<u64>(requirements.memoryRequirements.size);
LOG_INFO(Render_Vulkan, "Creating {} buffer with size {} KiB with flags {}",
BufferTypeName(type), stream_buffer_size / 1024,
vk::to_string(mem_type.propertyFlags));
if (dedicated_requirements.prefersDedicatedAllocation) {
vk::StructureChain<vk::MemoryAllocateInfo, vk::MemoryDedicatedAllocateInfo> alloc_chain =
{};
auto& alloc_info = alloc_chain.get<vk::MemoryAllocateInfo>();
alloc_info.allocationSize = requirements.memoryRequirements.size;
alloc_info.memoryTypeIndex = preferred_type;
auto& dedicated_alloc_info = alloc_chain.get<vk::MemoryDedicatedAllocateInfo>();
dedicated_alloc_info.buffer = buffer;
memory = device.allocateMemory(alloc_chain.get());
} else {
memory = device.allocateMemory({
.allocationSize = requirements.memoryRequirements.size,
.memoryTypeIndex = preferred_type,
});
}
device.bindBufferMemory(buffer, memory, 0);
mapped = reinterpret_cast<u8*>(device.mapMemory(memory, 0, VK_WHOLE_SIZE));
if (instance.HasDebuggingToolAttached()) {
SetObjectName(device, buffer, "StreamBuffer({}): {} KiB {}", BufferTypeName(type),
stream_buffer_size / 1024, vk::to_string(mem_type.propertyFlags));
SetObjectName(device, memory, "StreamBufferMemory({}): {} Kib {}", BufferTypeName(type),
stream_buffer_size / 1024, vk::to_string(mem_type.propertyFlags));
}
}
void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {
watches.resize(watches.size() + grow_size);
}
void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
if (!invalidation_mark) {
return;
}
while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
auto& watch = previous_watches[wait_cursor];
wait_bound = watch.upper_bound;
scheduler.Wait(watch.tick);
++wait_cursor;
}
}
u64 StreamBuffer::Copy(VAddr src, size_t size, size_t alignment /*= 0*/) {
const auto [data, offset, _] = Map(size, alignment);
std::memcpy(data, reinterpret_cast<const void*>(src), size);
Commit(size);
return offset;
}
} // namespace Vulkan

View File

@@ -1,89 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <optional>
#include <span>
#include <tuple>
#include <vector>
#include "common/types.h"
#include "video_core/renderer_vulkan/vk_common.h"
namespace Vulkan {
enum class BufferType : u32 {
Upload = 0,
Download = 1,
Stream = 2,
};
class Instance;
class Scheduler;
class StreamBuffer final {
static constexpr std::size_t MAX_BUFFER_VIEWS = 3;
public:
explicit StreamBuffer(const Instance& instance, Scheduler& scheduler,
vk::BufferUsageFlags usage, u64 size,
BufferType type = BufferType::Stream);
~StreamBuffer();
/**
* Reserves a region of memory from the stream buffer.
* @param size Size to reserve.
* @returns A pair of a raw memory pointer (with offset added), and the buffer offset
*/
std::tuple<u8*, u64, bool> Map(u64 size, u64 alignment = 0);
/// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
void Commit(u64 size);
/// Maps and commits a memory region with user provided data
u64 Copy(VAddr src, size_t size, size_t alignment = 0);
vk::Buffer Handle() const noexcept {
return buffer;
}
private:
struct Watch {
u64 tick{};
u64 upper_bound{};
};
/// Creates Vulkan buffer handles committing the required memory.
void CreateBuffers(u64 prefered_size);
/// Increases the amount of watches available.
void ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size);
void WaitPendingOperations(u64 requested_upper_bound);
private:
const Instance& instance; ///< Vulkan instance.
Scheduler& scheduler; ///< Command scheduler.
vk::Device device;
vk::Buffer buffer; ///< Mapped buffer.
vk::DeviceMemory memory; ///< Memory allocation.
u8* mapped{}; ///< Pointer to the mapped memory
u64 stream_buffer_size{}; ///< Stream buffer size.
vk::BufferUsageFlags usage{};
BufferType type;
u64 offset{}; ///< Buffer iterator.
u64 mapped_size{}; ///< Size reserved for the current copy.
bool is_coherent{}; ///< True if the buffer is coherent
std::vector<Watch> current_watches; ///< Watches recorded in the current iteration.
std::size_t current_watch_cursor{}; ///< Count of watches, reset on invalidation.
std::optional<std::size_t> invalidation_mark; ///< Number of watches used in the previous cycle.
std::vector<Watch> previous_watches; ///< Watches used in the previous iteration.
std::size_t wait_cursor{}; ///< Last watch being waited for completion.
u64 wait_bound{}; ///< Highest offset being watched for completion.
};
} // namespace Vulkan

View File

@@ -260,7 +260,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept {
case AmdGpu::TilingMode::Display_MacroTiled:
case AmdGpu::TilingMode::Texture_MacroTiled:
case AmdGpu::TilingMode::Depth_MacroTiled: {
ASSERT(!props.is_cube && !props.is_block);
// ASSERT(!props.is_cube && !props.is_block);
ASSERT(num_samples == 1);
std::tie(mip_info.pitch, mip_info.size) =
ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index);

View File

@@ -61,23 +61,24 @@ vk::Format TrySwizzleFormat(vk::Format format, u32 dst_sel) {
return format;
}
ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept
: is_storage{is_storage} {
ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage_) noexcept
: is_storage{is_storage_} {
type = ConvertImageViewType(image.GetType());
format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt());
range.base.level = image.base_level;
range.base.layer = image.base_array;
range.extent.levels = image.last_level + 1;
range.extent.layers = image.last_array + 1;
mapping.r = ConvertComponentSwizzle(image.dst_sel_x);
mapping.g = ConvertComponentSwizzle(image.dst_sel_y);
mapping.b = ConvertComponentSwizzle(image.dst_sel_z);
mapping.a = ConvertComponentSwizzle(image.dst_sel_w);
if (!is_storage) {
mapping.r = ConvertComponentSwizzle(image.dst_sel_x);
mapping.g = ConvertComponentSwizzle(image.dst_sel_y);
mapping.b = ConvertComponentSwizzle(image.dst_sel_z);
mapping.a = ConvertComponentSwizzle(image.dst_sel_w);
}
// Check for unfortunate case of storage images being swizzled
const u32 num_comps = AmdGpu::NumComponents(image.GetDataFmt());
const u32 dst_sel = image.DstSelect();
if (is_storage && !IsIdentityMapping(dst_sel, num_comps)) {
mapping = vk::ComponentMapping{};
if (auto new_format = TrySwizzleFormat(format, dst_sel); new_format != format) {
format = new_format;
return;

View File

@@ -3,103 +3,22 @@
#include <xxhash.h>
#include "common/assert.h"
#include "common/config.h"
#include "core/virtual_memory.h"
#include "video_core/page_manager.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/texture_cache/texture_cache.h"
#include "video_core/texture_cache/tile_manager.h"
#ifndef _WIN64
#include <signal.h>
#include <sys/mman.h>
#define PAGE_NOACCESS PROT_NONE
#define PAGE_READWRITE (PROT_READ | PROT_WRITE)
#define PAGE_READONLY PROT_READ
#else
#include <windows.h>
void mprotect(void* addr, size_t len, int prot) {
DWORD old_prot{};
BOOL result = VirtualProtect(addr, len, prot, &old_prot);
ASSERT_MSG(result != 0, "Region protection failed");
}
#endif
namespace VideoCore {
static TextureCache* g_texture_cache = nullptr;
#ifndef _WIN64
void GuestFaultSignalHandler(int sig, siginfo_t* info, void* raw_context) {
ucontext_t* ctx = reinterpret_cast<ucontext_t*>(raw_context);
const VAddr address = reinterpret_cast<VAddr>(info->si_addr);
#ifdef __APPLE__
const u32 err = ctx->uc_mcontext->__es.__err;
#else
const greg_t err = ctx->uc_mcontext.gregs[REG_ERR];
#endif
if (err & 0x2) {
g_texture_cache->OnCpuWrite(address);
} else {
// Read not supported!
UNREACHABLE();
}
}
#else
LONG WINAPI GuestFaultSignalHandler(EXCEPTION_POINTERS* pExp) noexcept {
const u32 ec = pExp->ExceptionRecord->ExceptionCode;
if (ec == EXCEPTION_ACCESS_VIOLATION) {
const auto info = pExp->ExceptionRecord->ExceptionInformation;
if (info[0] == 1) { // Write violation
g_texture_cache->OnCpuWrite(info[1]);
return EXCEPTION_CONTINUE_EXECUTION;
} /* else {
UNREACHABLE();
}*/
}
return EXCEPTION_CONTINUE_SEARCH; // pass further
}
#endif
static constexpr u64 StreamBufferSize = 512_MB;
static constexpr u64 PageShift = 12;
TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_)
: instance{instance_}, scheduler{scheduler_},
staging{instance, scheduler, vk::BufferUsageFlagBits::eTransferSrc, StreamBufferSize,
Vulkan::BufferType::Upload},
TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
BufferCache& buffer_cache_, PageManager& tracker_)
: instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_},
staging{instance, scheduler, MemoryUsage::Upload, StreamBufferSize},
tile_manager{instance, scheduler} {
#ifndef _WIN64
#ifdef __APPLE__
// Read-only memory write results in SIGBUS on Apple.
static constexpr int SignalType = SIGBUS;
#else
static constexpr int SignalType = SIGSEGV;
#endif
sigset_t signal_mask;
sigemptyset(&signal_mask);
sigaddset(&signal_mask, SignalType);
using HandlerType = decltype(sigaction::sa_sigaction);
struct sigaction guest_access_fault {};
guest_access_fault.sa_flags = SA_SIGINFO | SA_ONSTACK;
guest_access_fault.sa_sigaction = &GuestFaultSignalHandler;
guest_access_fault.sa_mask = signal_mask;
sigaction(SignalType, &guest_access_fault, nullptr);
#else
veh_handle = AddVectoredExceptionHandler(0, GuestFaultSignalHandler);
ASSERT_MSG(veh_handle, "Failed to register an exception handler");
#endif
g_texture_cache = this;
ImageInfo info;
info.pixel_format = vk::Format::eR8G8B8A8Unorm;
info.type = vk::ImageType::e2D;
@@ -110,15 +29,11 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler&
void(slot_image_views.insert(instance, view_info, slot_images[null_id], null_id));
}
TextureCache::~TextureCache() {
#if _WIN64
RemoveVectoredExceptionHandler(veh_handle);
#endif
}
TextureCache::~TextureCache() = default;
void TextureCache::OnCpuWrite(VAddr address) {
std::unique_lock lock{m_page_table};
ForEachImageInRegion(address, 1 << PageShift, [&](ImageId image_id, Image& image) {
void TextureCache::InvalidateMemory(VAddr address, size_t size) {
std::unique_lock lock{mutex};
ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) {
// Ensure image is reuploaded when accessed again.
image.flags |= ImageFlagBits::CpuModified;
// Untrack image, so the range is unprotected and the guest can write freely.
@ -126,8 +41,28 @@ void TextureCache::OnCpuWrite(VAddr address) {
});
}
void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) {
std::scoped_lock lk{mutex};
boost::container::small_vector<ImageId, 16> deleted_images;
ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); });
for (const ImageId id : deleted_images) {
Image& image = slot_images[id];
if (True(image.flags & ImageFlagBits::Tracked)) {
UntrackImage(image, id);
}
// TODO: Download image data back to host.
UnregisterImage(id);
DeleteImage(id);
}
}
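// --- Editorial sketch (hypothetical callers, not part of this commit) ----------------
// With the in-cache signal/VEH handlers removed, InvalidateMemory() is assumed to be
// driven by the page tracker on guest write faults, and UnmapMemory() by the memory
// manager before a mapping is released. The function and constant names below are
// illustrative only; the real call sites live outside this file.
void OnGuestWriteFault(TextureCache& cache, VAddr fault_addr) {
    // Invalidate one tracker page around the fault, matching the 4 KiB granularity
    // the removed OnCpuWrite() used (1 << PageShift).
    constexpr size_t TrackerPageSize = 0x1000;
    cache.InvalidateMemory(fault_addr, TrackerPageSize);
}

void OnGuestUnmap(TextureCache& cache, VAddr base, size_t size) {
    // Drop overlapping images before their guest backing memory disappears.
    cache.UnmapMemory(base, size);
}
// --------------------------------------------------------------------------------------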
ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) {
std::unique_lock lock{m_page_table};
if (info.guest_address == 0) [[unlikely]] {
return NULL_IMAGE_VIEW_ID;
}
std::unique_lock lock{mutex};
boost::container::small_vector<ImageId, 2> image_ids;
ForEachImageInRegion(
info.guest_address, info.guest_size_bytes, [&](ImageId image_id, Image& image) {
@ -183,10 +118,6 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo
}
ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) {
if (info.guest_address == 0) [[unlikely]] {
return slot_image_views[NULL_IMAGE_VIEW_ID];
}
const ImageId image_id = FindImage(info);
Image& image = slot_images[image_id];
auto& usage = image.info.usage;
@ -310,10 +241,7 @@ void TextureCache::RefreshImage(Image& image) {
buffer = *upload_buffer;
} else {
// Upload data to the staging buffer.
const auto [data, offset_, _] = staging.Map(image.info.guest_size_bytes, 16);
std::memcpy(data, (void*)image.info.guest_address, image.info.guest_size_bytes);
staging.Commit(image.info.guest_size_bytes);
offset = offset_;
offset = staging.Copy(image.info.guest_address, image.info.guest_size_bytes, 16);
}
const auto& num_layers = image.info.resources.layers;
@ -344,9 +272,6 @@ void TextureCache::RefreshImage(Image& image) {
}
cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy);
image.Transit(vk::ImageLayout::eGeneral,
vk::AccessFlagBits::eMemoryWrite | vk::AccessFlagBits::eMemoryRead);
}
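// --- Editorial sketch (inferred helper, not part of this commit) ---------------------
// staging.Copy(addr, size, alignment) above is assumed to fold the Map/memcpy/Commit
// sequence this hunk removes into a single call. A generic illustration over any
// staging object exposing that same interface (the template and its names are made up):
template <typename Staging>
u64 CopyGuestToStaging(Staging& staging, VAddr guest_addr, size_t size, size_t alignment) {
    // Map() returns {host pointer, buffer offset, a third value unused here}, as in the
    // removed code; the returned offset is what the GPU copy reads from.
    const auto [data, offset, _] = staging.Map(size, alignment);
    std::memcpy(data, reinterpret_cast<const void*>(guest_addr), size);
    staging.Commit(size);
    return offset;
}
// ---------------------------------------------------------------------------------------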
vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) {
@ -362,8 +287,6 @@ void TextureCache::RegisterImage(ImageId image_id) {
image.flags |= ImageFlagBits::Registered;
ForEachPage(image.cpu_addr, image.info.guest_size_bytes,
[this, image_id](u64 page) { page_table[page].push_back(image_id); });
image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eNone);
}
void TextureCache::UnregisterImage(ImageId image_id) {
@ -373,11 +296,11 @@ void TextureCache::UnregisterImage(ImageId image_id) {
image.flags &= ~ImageFlagBits::Registered;
ForEachPage(image.cpu_addr, image.info.guest_size_bytes, [this, image_id](u64 page) {
const auto page_it = page_table.find(page);
if (page_it == page_table.end()) {
ASSERT_MSG(false, "Unregistering unregistered page=0x{:x}", page << PageShift);
if (page_it == nullptr) {
UNREACHABLE_MSG("Unregistering unregistered page=0x{:x}", page << PageShift);
return;
}
auto& image_ids = page_it.value();
auto& image_ids = *page_it;
const auto vector_it = std::ranges::find(image_ids, image_id);
if (vector_it == image_ids.end()) {
ASSERT_MSG(false, "Unregistering unregistered image in page=0x{:x}", page << PageShift);
@ -393,7 +316,7 @@ void TextureCache::TrackImage(Image& image, ImageId image_id) {
return;
}
image.flags |= ImageFlagBits::Tracked;
UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, 1);
tracker.UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, 1);
}
void TextureCache::UntrackImage(Image& image, ImageId image_id) {
@ -401,40 +324,34 @@ void TextureCache::UntrackImage(Image& image, ImageId image_id) {
return;
}
image.flags &= ~ImageFlagBits::Tracked;
UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, -1);
tracker.UpdatePagesCachedCount(image.cpu_addr, image.info.guest_size_bytes, -1);
}
void TextureCache::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) {
std::scoped_lock lk{mutex};
const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1;
const u64 page_start = addr >> PageShift;
const u64 page_end = page_start + num_pages;
void TextureCache::DeleteImage(ImageId image_id) {
Image& image = slot_images[image_id];
ASSERT_MSG(False(image.flags & ImageFlagBits::Tracked), "Image was not untracked");
ASSERT_MSG(False(image.flags & ImageFlagBits::Registered), "Image was not unregistered");
const auto pages_interval =
decltype(cached_pages)::interval_type::right_open(page_start, page_end);
if (delta > 0) {
cached_pages.add({pages_interval, delta});
// Remove any registered meta areas.
const auto& meta_info = image.info.meta_info;
if (meta_info.cmask_addr) {
surface_metas.erase(meta_info.cmask_addr);
}
if (meta_info.fmask_addr) {
surface_metas.erase(meta_info.fmask_addr);
}
if (meta_info.htile_addr) {
surface_metas.erase(meta_info.htile_addr);
}
const auto& range = cached_pages.equal_range(pages_interval);
for (const auto& [range, count] : boost::make_iterator_range(range)) {
const auto interval = range & pages_interval;
const VAddr interval_start_addr = boost::icl::first(interval) << PageShift;
const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift;
const u32 interval_size = interval_end_addr - interval_start_addr;
void* addr = reinterpret_cast<void*>(interval_start_addr);
if (delta > 0 && count == delta) {
mprotect(addr, interval_size, PAGE_READONLY);
} else if (delta < 0 && count == -delta) {
mprotect(addr, interval_size, PAGE_READWRITE);
} else {
ASSERT(count >= 0);
// Reclaim image and any image views it references.
scheduler.DeferOperation([this, image_id] {
Image& image = slot_images[image_id];
for (const ImageViewId image_view_id : image.image_view_ids) {
slot_image_views.erase(image_view_id);
}
}
if (delta < 0) {
cached_pages.add({pages_interval, delta});
}
slot_images.erase(image_id);
});
}
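// --- Editorial sketch (illustrative model, not part of this commit) -------------------
// DeleteImage() defers the slot erasure through Scheduler::DeferOperation so the Vulkan
// image and its views are only destroyed once the GPU has finished with them. A minimal
// stand-alone model of that tick-based deferred-release pattern (hypothetical; assumes
// <functional>, <utility> and <vector> are available):
struct DeferredReleaseQueue {
    std::vector<std::pair<u64, std::function<void()>>> pending; // {submission tick, release}

    void Defer(u64 current_tick, std::function<void()> release) {
        pending.emplace_back(current_tick, std::move(release));
    }

    void Collect(u64 gpu_completed_tick) {
        // Run and drop every release whose submission tick the GPU has already passed.
        std::erase_if(pending, [&](auto& entry) {
            if (entry.first > gpu_completed_tick) {
                return false;
            }
            entry.second();
            return true;
        });
    }
};
// ----------------------------------------------------------------------------------------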
} // namespace VideoCore

View File

@ -4,12 +4,11 @@
#pragma once
#include <boost/container/small_vector.hpp>
#include <boost/icl/interval_map.hpp>
#include <tsl/robin_map.h>
#include "common/slot_vector.h"
#include "video_core/amdgpu/resource.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/multi_level_page_table.h"
#include "video_core/texture_cache/image.h"
#include "video_core/texture_cache/image_view.h"
#include "video_core/texture_cache/sampler.h"
@ -21,31 +20,28 @@ struct BufferAttributeGroup;
namespace VideoCore {
class BufferCache;
class PageManager;
class TextureCache {
// This is the page shift for adding images into the hash map. It isn't related to
// the page size of the guest or the host and is chosen for convenience. A number too
// small will increase the number of hash map lookups per image, while too large will
// increase the number of images per page.
static constexpr u64 PageBits = 20;
static constexpr u64 PageMask = (1ULL << PageBits) - 1;
struct MetaDataInfo {
enum class Type {
CMask,
FMask,
HTile,
};
Type type;
bool is_cleared;
struct Traits {
using Entry = boost::container::small_vector<ImageId, 16>;
static constexpr size_t AddressSpaceBits = 39;
static constexpr size_t FirstLevelBits = 9;
static constexpr size_t PageBits = 22;
};
using PageTable = MultiLevelPageTable<Traits>;
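// Editorial note (illustrative, not part of this commit): the traits above carve the
// 39-bit address space into 4 MiB (2^22-byte) pages, indexed through a 9-bit first
// level and an implied 8-bit second level; the arithmetic can be checked in place:
static_assert(Traits::AddressSpaceBits - Traits::FirstLevelBits - Traits::PageBits == 8,
              "each first-level slot spans 2^8 pages = 256 * 4 MiB = 1 GiB of guest memory");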
public:
explicit TextureCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler);
explicit TextureCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
BufferCache& buffer_cache, PageManager& tracker);
~TextureCache();
/// Invalidates any image in the logical page range.
void OnCpuWrite(VAddr address);
void InvalidateMemory(VAddr address, size_t size);
/// Evicts any images that overlap the unmapped range.
void UnmapMemory(VAddr cpu_addr, size_t size);
/// Retrieves the image handle of the image with the provided attributes.
[[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true);
@ -101,8 +97,8 @@ private:
template <typename Func>
static void ForEachPage(PAddr addr, size_t size, Func&& func) {
static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result_t<Func, u64>, bool>;
const u64 page_end = (addr + size - 1) >> PageBits;
for (u64 page = addr >> PageBits; page <= page_end; ++page) {
const u64 page_end = (addr + size - 1) >> Traits::PageBits;
for (u64 page = addr >> Traits::PageBits; page <= page_end; ++page) {
if constexpr (RETURNS_BOOL) {
if (func(page)) {
break;
@ -120,14 +116,14 @@ private:
boost::container::small_vector<ImageId, 32> images;
ForEachPage(cpu_addr, size, [this, &images, cpu_addr, size, func](u64 page) {
const auto it = page_table.find(page);
if (it == page_table.end()) {
if (it == nullptr) {
if constexpr (BOOL_BREAK) {
return false;
} else {
return;
}
}
for (const ImageId image_id : it->second) {
for (const ImageId image_id : *it) {
Image& image = slot_images[image_id];
if (image.flags & ImageFlagBits::Picked) {
continue;
@ -166,25 +162,32 @@ private:
/// Stop tracking CPU reads and writes for image
void UntrackImage(Image& image, ImageId image_id);
/// Increase/decrease the number of surfaces in pages touching the specified region
void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta);
/// Removes the image and any views/surface metas that reference it.
void DeleteImage(ImageId image_id);
private:
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
Vulkan::StreamBuffer staging;
BufferCache& buffer_cache;
PageManager& tracker;
StreamBuffer staging;
TileManager tile_manager;
Common::SlotVector<Image> slot_images;
Common::SlotVector<ImageView> slot_image_views;
tsl::robin_map<u64, Sampler> samplers;
tsl::robin_pg_map<u64, std::vector<ImageId>> page_table;
boost::icl::interval_map<VAddr, s32> cached_pages;
tsl::robin_map<VAddr, MetaDataInfo> surface_metas;
PageTable page_table;
std::mutex mutex;
#ifdef _WIN64
void* veh_handle{};
#endif
std::mutex m_page_table;
struct MetaDataInfo {
enum class Type {
CMask,
FMask,
HTile,
};
Type type;
bool is_cleared;
};
tsl::robin_map<VAddr, MetaDataInfo> surface_metas;
};
} // namespace VideoCore

View File

@ -183,10 +183,12 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
case vk::Format::eB8G8R8A8Srgb:
case vk::Format::eB8G8R8A8Unorm:
case vk::Format::eR8G8B8A8Unorm:
case vk::Format::eR8G8B8A8Uint:
case vk::Format::eR32Sfloat:
case vk::Format::eR32Uint:
case vk::Format::eR16G16Sfloat:
return vk::Format::eR32Uint;
case vk::Format::eBc1RgbaSrgbBlock:
case vk::Format::eBc1RgbaUnormBlock:
case vk::Format::eBc4UnormBlock:
case vk::Format::eR32G32Sfloat:
@ -200,11 +202,20 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
case vk::Format::eBc5UnormBlock:
case vk::Format::eBc7SrgbBlock:
case vk::Format::eBc7UnormBlock:
case vk::Format::eBc6HUfloatBlock:
case vk::Format::eR32G32B32A32Sfloat:
return vk::Format::eR32G32B32A32Uint;
default:
break;
}
LOG_ERROR(Render_Vulkan, "Unexpected format for demotion {}", vk::to_string(format));
// Log missing formats only once to avoid spamming the log.
static constexpr size_t MaxFormatIndex = 256;
static std::array<bool, MaxFormatIndex> logged_formats{};
if (const u32 index = u32(format); !logged_formats[index]) {
LOG_ERROR(Render_Vulkan, "Unexpected format for demotion {}", vk::to_string(format));
logged_formats[index] = true;
}
return format;
}
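// --- Editorial note (hypothetical variant, not part of this commit) -------------------
// vk::Format values from Vulkan extensions (the 1000xxxxxx range) are far larger than
// MaxFormatIndex, so the log-once guard above only stays in bounds for core formats. A
// bounds-checked variant of the same idea:
void LogUnexpectedDemotionOnce(vk::Format format) {
    static constexpr size_t MaxTrackedFormat = 256;
    static std::array<bool, MaxTrackedFormat> logged{};
    const auto index = static_cast<size_t>(format);
    if (index < MaxTrackedFormat && logged[index]) {
        return; // already reported this core format
    }
    if (index < MaxTrackedFormat) {
        logged[index] = true;
    }
    LOG_ERROR(Render_Vulkan, "Unexpected format for demotion {}", vk::to_string(format));
}
// ----------------------------------------------------------------------------------------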
@ -236,8 +247,11 @@ struct DetilerParams {
u32 sizes[14];
};
static constexpr size_t StreamBufferSize = 128_MB;
TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler)
: instance{instance}, scheduler{scheduler} {
: instance{instance}, scheduler{scheduler},
stream_buffer{instance, scheduler, MemoryUsage::Stream, StreamBufferSize} {
static const std::array detiler_shaders{
HostShaders::DETILE_M8X1_COMP, HostShaders::DETILE_M8X2_COMP,
HostShaders::DETILE_M32X1_COMP, HostShaders::DETILE_M32X2_COMP,
@ -336,8 +350,7 @@ TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /*
.flags = !is_storage ? VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT |
VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT
: static_cast<VmaAllocationCreateFlags>(0),
.usage = is_large_buffer ? VMA_MEMORY_USAGE_AUTO_PREFER_HOST
: VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
.requiredFlags = !is_storage ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
: static_cast<VkMemoryPropertyFlags>(0),
};
@ -373,37 +386,46 @@ std::optional<vk::Buffer> TileManager::TryDetile(Image& image) {
const auto* detiler = GetDetiler(image);
if (!detiler) {
LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled) {
LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
}
return std::nullopt;
}
// Prepare input buffer
auto in_buffer = AllocBuffer(image.info.guest_size_bytes);
Upload(in_buffer, reinterpret_cast<const void*>(image.info.guest_address),
image.info.guest_size_bytes);
const u32 image_size = image.info.guest_size_bytes;
const auto [in_buffer, in_offset] = [&] -> std::pair<vk::Buffer, u32> {
// Use stream buffer for smaller textures.
if (image_size <= StreamBufferSize) {
u32 offset = stream_buffer.Copy(image.info.guest_address, image_size);
return {stream_buffer.Handle(), offset};
}
// Request temporary host buffer for larger sizes.
auto in_buffer = AllocBuffer(image_size);
const auto addr = reinterpret_cast<const void*>(image.info.guest_address);
Upload(in_buffer, addr, image_size);
scheduler.DeferOperation([=, this]() { FreeBuffer(in_buffer); });
return {in_buffer.first, 0};
}();
// Prepare output buffer
auto out_buffer = AllocBuffer(image.info.guest_size_bytes, true);
scheduler.DeferOperation([=, this]() {
FreeBuffer(in_buffer);
FreeBuffer(out_buffer);
});
auto out_buffer = AllocBuffer(image_size, true);
scheduler.DeferOperation([=, this]() { FreeBuffer(out_buffer); });
auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *detiler->pl);
const vk::DescriptorBufferInfo input_buffer_info{
.buffer = in_buffer.first,
.offset = 0,
.range = image.info.guest_size_bytes,
.buffer = in_buffer,
.offset = in_offset,
.range = image_size,
};
const vk::DescriptorBufferInfo output_buffer_info{
.buffer = out_buffer.first,
.offset = 0,
.range = image.info.guest_size_bytes,
.range = image_size,
};
std::vector<vk::WriteDescriptorSet> set_writes{
@ -442,16 +464,16 @@ std::optional<vk::Buffer> TileManager::TryDetile(Image& image) {
cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(params),
&params);
ASSERT((image.info.guest_size_bytes % 64) == 0);
ASSERT((image_size % 64) == 0);
const auto bpp = image.info.num_bits * (image.info.props.is_block ? 16u : 1u);
const auto num_tiles = image.info.guest_size_bytes / (64 * (bpp / 8));
const auto num_tiles = image_size / (64 * (bpp / 8));
cmdbuf.dispatch(num_tiles, 1, 1);
const vk::BufferMemoryBarrier post_barrier{
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
.dstAccessMask = vk::AccessFlagBits::eTransferRead,
.buffer = out_buffer.first,
.size = image.info.guest_size_bytes,
.size = image_size,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,

View File

@ -4,7 +4,7 @@
#pragma once
#include "common/types.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/buffer_cache/buffer.h"
#include "video_core/texture_cache/image.h"
namespace VideoCore {
@ -34,7 +34,7 @@ struct DetilerContext {
class TileManager {
public:
using ScratchBuffer = std::pair<VkBuffer, VmaAllocation>;
using ScratchBuffer = std::pair<vk::Buffer, VmaAllocation>;
TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler);
~TileManager();
@ -51,6 +51,7 @@ private:
private:
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
StreamBuffer stream_buffer;
std::array<DetilerContext, DetilerType::Max> detilers;
};