diff --git a/.github/shadps4.png b/.github/shadps4.png index 86fa592d..7ad301d4 100644 Binary files a/.github/shadps4.png and b/.github/shadps4.png differ diff --git a/.github/workflows/windows-qt.yml b/.github/workflows/windows-qt.yml index 9610280b..019a8ab2 100644 --- a/.github/workflows/windows-qt.yml +++ b/.github/workflows/windows-qt.yml @@ -28,8 +28,11 @@ jobs: - name: Setup Qt uses: jurplel/install-qt-action@v4 with: - arch: win64_msvc2019_64 version: 6.7.2 + host: windows + target: desktop + arch: win64_msvc2019_64 + archives: qtbase - name: Configure CMake # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. diff --git a/.gitignore b/.gitignore index fcf7634f..2a314508 100644 --- a/.gitignore +++ b/.gitignore @@ -408,3 +408,4 @@ FodyWeavers.xsd /emulator/eboot.bin /out/* /third-party/out/* +/src/common/scm_rev.cpp diff --git a/.reuse/dep5 b/.reuse/dep5 index 1dad5014..a80001f8 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -14,8 +14,6 @@ Files: CMakeSettings.json documents/Screenshots/Sonic Mania.png documents/Screenshots/Undertale.png documents/Screenshots/We are DOOMED.png - externals/stb_image.h - externals/tracy/* scripts/ps4_names.txt src/images/controller_icon.png src/images/exit_icon.png @@ -36,9 +34,26 @@ Files: CMakeSettings.json src/images/refresh_icon.png src/images/settings_icon.png src/images/stop_icon.png + src/images/shadPS4.icns src/images/shadps4.ico src/images/themes_icon.png src/shadps4.qrc src/shadps4.rc Copyright: shadPS4 Emulator Project License: GPL-2.0-or-later + +Files: externals/cmake-modules/* +Copyright: 2009-2010 Iowa State University +License: BSL-1.0 + +Files: externals/renderdoc/* +Copyright: 2019-2024 Baldur Karlsson +License: MIT + +Files: externals/stb_image.h +Copyright: 2017 Sean Barrett +License: MIT + +Files: externals/tracy/* +Copyright: 2017-2024 Bartosz Taudul +License: BSD-3-Clause diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 8ec7cd54..90ba4d83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,12 +19,11 @@ project(shadPS4) option(ENABLE_QT_GUI "Enable the Qt GUI. If not selected then the emulator uses a minimal SDL-based UI instead" OFF) -# This function should be passed a list of all files in a target. It will automatically generate -# file groups following the directory hierarchy, so that the layout of the files in IDEs matches the -# one in the filesystem. +# This function should be passed a list of all files in a target. It will automatically generate file groups +# following the directory hierarchy, so that the layout of the files in IDEs matches the one in the filesystem. function(create_target_directory_groups target_name) - # Place any files that aren't in the source list in a separate group so that they don't get in - # the way. + + # Place any files that aren't in the source list in a separate group so that they don't get in the way. source_group("Other Files" REGULAR_EXPRESSION ".") get_target_property(target_sources "${target_name}" SOURCES) @@ -39,14 +38,6 @@ endfunction() # Setup a custom clang-format target (if clang-format can be found) that will run # against all the src files. This should be used before making a pull request. 
-# ======================================================================= - -set(CLANG_FORMAT_POSTFIX "-17") -find_program(CLANG_FORMAT - NAMES clang-format${CLANG_FORMAT_POSTFIX} - clang-format - PATHS ${PROJECT_BINARY_DIR}/externals) - if (CLANG_FORMAT) set(SRCS ${PROJECT_SOURCE_DIR}/src) set(CCOMMENT "Running clang format against all the .h and .cpp files in src/") @@ -65,6 +56,15 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +# generate git revision information +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules/") +include(GetGitRevisionDescription) +get_git_head_revision(GIT_REF_SPEC GIT_REV) +git_describe(GIT_DESC --always --long --dirty) +git_branch_name(GIT_BRANCH) + +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/common/scm_rev.cpp.in" "${CMAKE_CURRENT_SOURCE_DIR}/src/common/scm_rev.cpp" @ONLY) + find_package(Boost 1.84.0 CONFIG) find_package(cryptopp 8.9.0 MODULE) find_package(fmt 10.2.1 CONFIG) @@ -79,17 +79,22 @@ find_package(xbyak 7.07 CONFIG) find_package(xxHash 0.8.2 MODULE) find_package(zlib-ng 2.2.0 MODULE) find_package(Zydis 4.1.0 CONFIG) +find_package(RenderDoc MODULE) if (APPLE) find_package(date 3.0.1 CONFIG) endif() +# Note: Windows always has these functions through winpthreads include(CheckSymbolExists) check_symbol_exists(pthread_mutex_timedlock "pthread.h" HAVE_PTHREAD_MUTEX_TIMEDLOCK) -# Windows always has the function through winpthreads if(HAVE_PTHREAD_MUTEX_TIMEDLOCK OR WIN32) add_compile_options(-DHAVE_PTHREAD_MUTEX_TIMEDLOCK) endif() +check_symbol_exists(sem_timedwait "semaphore.h" HAVE_SEM_TIMEDWAIT) +if(HAVE_SEM_TIMEDWAIT OR WIN32) + add_compile_options(-DHAVE_SEM_TIMEDWAIT) +endif() add_subdirectory(externals) include_directories(src) @@ -226,6 +231,10 @@ set(PLAYGO_LIB src/core/libraries/playgo/playgo.cpp src/core/libraries/playgo/playgo_types.h ) +set(RANDOM_LIB src/core/libraries/random/random.cpp + src/core/libraries/random/random.h +) + set(USBD_LIB 
src/core/libraries/usbd/usbd.cpp src/core/libraries/usbd/usbd.h ) @@ -288,6 +297,8 @@ set(COMMON src/common/logging/backend.cpp src/common/version.h src/common/ntapi.h src/common/ntapi.cpp + src/common/scm_rev.cpp + src/common/scm_rev.h ) set(CORE src/core/aerolib/stubs.cpp @@ -308,6 +319,8 @@ set(CORE src/core/aerolib/stubs.cpp src/core/file_format/pkg_type.h src/core/file_format/psf.cpp src/core/file_format/psf.h + src/core/file_format/playgo_chunk.cpp + src/core/file_format/playgo_chunk.h src/core/file_format/trp.cpp src/core/file_format/trp.h src/core/file_format/splash.h @@ -336,6 +349,7 @@ set(CORE src/core/aerolib/stubs.cpp ${NP_LIBS} ${PNG_LIB} ${PLAYGO_LIB} + ${RANDOM_LIB} ${USBD_LIB} ${MISC_LIBS} ${DIALOGS_LIB} @@ -482,6 +496,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/texture_cache/tile_manager.cpp src/video_core/texture_cache/tile_manager.h src/video_core/texture_cache/types.h + src/video_core/renderdoc.cpp + src/video_core/renderdoc.h ) set(INPUT src/input/controller.cpp @@ -494,7 +510,7 @@ set(EMULATOR src/emulator.cpp src/sdl_window.cpp ) -# the above is shared in sdl and qt version (TODO share them all) +# The above is shared in SDL and Qt version (TODO share them all) if(ENABLE_QT_GUI) qt_add_resources(RESOURCE_FILES src/shadps4.qrc) @@ -537,6 +553,7 @@ if (ENABLE_QT_GUI) ${SHADER_RECOMPILER} ${VIDEO_CORE} ${EMULATOR} + src/images/shadPS4.icns ) else() add_executable(shadps4 @@ -557,7 +574,7 @@ endif() create_target_directory_groups(shadps4) -target_link_libraries(shadps4 PRIVATE magic_enum::magic_enum fmt::fmt toml11::toml11 tsl::robin_map xbyak::xbyak Tracy::TracyClient) +target_link_libraries(shadps4 PRIVATE magic_enum::magic_enum fmt::fmt toml11::toml11 tsl::robin_map xbyak::xbyak Tracy::TracyClient RenderDoc::API) target_link_libraries(shadps4 PRIVATE Boost::headers GPUOpen::VulkanMemoryAllocator sirit Vulkan::Headers xxHash::xxhash Zydis::Zydis glslang::SPIRV glslang::glslang SDL3::SDL3) if (APPLE) @@ -623,6 +640,10 
@@ target_include_directories(shadps4 PRIVATE ${HOST_SHADERS_INCLUDE}) if (ENABLE_QT_GUI) set_target_properties(shadps4 PROPERTIES -# WIN32_EXECUTABLE ON - MACOSX_BUNDLE ON) +# WIN32_EXECUTABLE ON + MACOSX_BUNDLE ON + MACOSX_BUNDLE_ICON_FILE shadPS4.icns) + + set_source_files_properties(src/images/shadPS4.icns PROPERTIES + MACOSX_PACKAGE_LOCATION Resources) endif() diff --git a/LICENSES/BSL-1.0.txt b/LICENSES/BSL-1.0.txt new file mode 100644 index 00000000..2d87ab1a --- /dev/null +++ b/LICENSES/BSL-1.0.txt @@ -0,0 +1,7 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/README.md b/README.md index cf7d9f9b..7089b3c0 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ SPDX-License-Identifier: GPL-2.0-or-later # shadPS4 -shadPS4 is an early PS4 emulator for Windows and Linux written in C++ +shadPS4 is an early **PlayStation 4** emulator for **Windows**, **Linux** and **macOS** written in C++ If you encounter problems or have doubts, do not hesitate to look at the [**Quickstart**](https://github.com/shadps4-emu/shadPS4/blob/main/documents/Quickstart/Quickstart.md). @@ -42,13 +42,13 @@ To verify that a game works, you can look at [**shadPS4 Game Compatibility**](ht To discuss shadPS4 development or suggest ideas, join the [**Discord server**](https://discord.gg/MyZRaBngxA). -Check us on [**X (twitter)**](https://x.com/shadps4) or on our [**website**](https://shadps4.net/). +To get the latest news, go to our [**X (twitter)**](https://x.com/shadps4) or our [**website**](https://shadps4.net/). # Status In development, small games are working like [**Sonic Mania**](https://www.youtube.com/watch?v=AAHoNzhHyCU), [**Undertale**](https://youtu.be/5zIvdy65Ro4), [**Dysmantle**](https://youtu.be/b9xzhLBdESE) and others... -# Why? +# Why The project started as a fun project. Due to limited free time, it will probably take a while before shadPS4 is able to run anything decent, but we're trying to make small, regular commits. @@ -64,20 +64,37 @@ Check the build instructions for [**Linux**](https://github.com/shadps4-emu/shad ## Build status -|Windows|Build status| -|--------|------------| +
+Windows + +| Windows | Build status | +|--------|--------| |Windows SDL Build|[![Windows-sdl](https://github.com/shadps4-emu/shadPS4/actions/workflows/windows.yml/badge.svg)](https://github.com/shadps4-emu/shadPS4/actions/workflows/windows.yml) |Windows Qt Build|[![Windows-qt](https://github.com/shadps4-emu/shadPS4/actions/workflows/windows-qt.yml/badge.svg)](https://github.com/shadps4-emu/shadPS4/actions/workflows/windows-qt.yml) +
-|Linux|Build status| -|--------|------------| +
+Linux + +| Linux | Build status | +|--------|--------| |Linux SDL Build|[![Linux-sdl](https://github.com/shadps4-emu/shadPS4/actions/workflows/linux.yml/badge.svg)](https://github.com/shadps4-emu/shadPS4/actions/workflows/linux.yml) |Linux Qt Build|[![Linux-qt](https://github.com/shadps4-emu/shadPS4/actions/workflows/linux-qt.yml/badge.svg)](https://github.com/shadps4-emu/shadPS4/actions/workflows/linux-qt.yml) +
+ +
+macOS + +| macOS | Build status | +|--------|--------| +|macOS SDL Build|[![macOS-sdl](https://github.com/shadps4-emu/shadPS4/actions/workflows/macos.yml/badge.svg)](https://github.com/shadps4-emu/shadPS4/actions/workflows/macos.yml) +|macOS Qt Build|[![macOS-qt](https://github.com/shadps4-emu/shadPS4/actions/workflows/macos-qt.yml/badge.svg)](https://github.com/shadps4-emu/shadPS4/actions/workflows/macos-qt.yml) +
# Keyboard Mapping | Controller button | Keyboard | -| ------------- | ------------- | +|-------------|-------------| LEFT AXIS UP | W | LEFT AXIS DOWN | S | LEFT AXIS LEFT | A | @@ -123,7 +140,7 @@ Open a PR and we'll check it :) # Contributors - + # Sister Projects diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 7fca7b54..9ebdd878 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -74,6 +74,13 @@ if (NOT TARGET GPUOpen::VulkanMemoryAllocator) add_subdirectory(vma) endif() +# RenderDoc +if (NOT TARGET RenderDoc::API) + add_library(renderdoc INTERFACE) + target_include_directories(renderdoc SYSTEM INTERFACE ./renderdoc) + add_library(RenderDoc::API ALIAS renderdoc) +endif() + # glslang if (NOT TARGET glslang::glslang) set(SKIP_GLSLANG_INSTALL ON CACHE BOOL "") diff --git a/externals/cmake-modules/GetGitRevisionDescription.cmake b/externals/cmake-modules/GetGitRevisionDescription.cmake new file mode 100644 index 00000000..087f5dee --- /dev/null +++ b/externals/cmake-modules/GetGitRevisionDescription.cmake @@ -0,0 +1,158 @@ +# - Returns a version string from Git +# +# These functions force a re-configure on each git commit so that you can +# trust the values of the variables in your build system. +# +# get_git_head_revision(<refspecvar> <hashvar> [<additional arguments to git describe> ...]) +# +# Returns the refspec and sha hash of the current head revision +# +# git_describe(<var> [<additional arguments to git describe> ...]) +# +# Returns the results of git describe on the source tree, and adjusting +# the output so that it tests false if an error occurs. +# +# git_get_exact_tag(<var> [<additional arguments to git describe> ...]) +# +# Returns the results of git describe --exact-match on the source tree, +# and adjusting the output so that it tests false if there was no exact +# matching tag. +# +# Requires CMake 2.6 or newer (uses the 'function' command) +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. 
+# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +if(__get_git_revision_description) + return() +endif() +set(__get_git_revision_description YES) + +# We must run the following at "include" time, not at function call time, +# to find the path to this module rather than the path to a calling list file +get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH) + +function(get_git_head_revision _refspecvar _hashvar) + set(GIT_PARENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}") + set(GIT_DIR "${GIT_PARENT_DIR}/.git") + while(NOT EXISTS "${GIT_DIR}") # .git dir not found, search parent directories + set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}") + get_filename_component(GIT_PARENT_DIR ${GIT_PARENT_DIR} PATH) + if(GIT_PARENT_DIR STREQUAL GIT_PREVIOUS_PARENT) + # We have reached the root directory, we are not in git + set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + return() + endif() + set(GIT_DIR "${GIT_PARENT_DIR}/.git") + endwhile() + # check if this is a submodule + if(NOT IS_DIRECTORY ${GIT_DIR}) + file(READ ${GIT_DIR} submodule) + string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" GIT_DIR_RELATIVE ${submodule}) + get_filename_component(SUBMODULE_DIR ${GIT_DIR} PATH) + get_filename_component(GIT_DIR ${SUBMODULE_DIR}/${GIT_DIR_RELATIVE} ABSOLUTE) + endif() + set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data") + if(NOT EXISTS "${GIT_DATA}") + file(MAKE_DIRECTORY "${GIT_DATA}") + endif() + + if(NOT EXISTS "${GIT_DIR}/HEAD") + return() + endif() + set(HEAD_FILE "${GIT_DATA}/HEAD") + configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY) + + configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in" + "${GIT_DATA}/grabRef.cmake" + @ONLY) + include("${GIT_DATA}/grabRef.cmake") + + set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE) + set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE) +endfunction() + 
+function(git_branch_name _var) + if(NOT GIT_FOUND) + find_package(Git QUIET) + endif() + + if(NOT GIT_FOUND) + set(${_var} "GIT-NOTFOUND" PARENT_SCOPE) + return() + endif() + + execute_process(COMMAND + "${GIT_EXECUTABLE}" + rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY + "${CMAKE_SOURCE_DIR}" + RESULT_VARIABLE + res + OUTPUT_VARIABLE + out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT res EQUAL 0) + set(out "${out}-${res}-NOTFOUND") + endif() + + set(${_var} "${out}" PARENT_SCOPE) +endfunction() + +function(git_describe _var) + if(NOT GIT_FOUND) + find_package(Git QUIET) + endif() + #get_git_head_revision(refspec hash) + if(NOT GIT_FOUND) + set(${_var} "GIT-NOTFOUND" PARENT_SCOPE) + return() + endif() + #if(NOT hash) + # set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE) + # return() + #endif() + + # TODO sanitize + #if((${ARGN}" MATCHES "&&") OR + # (ARGN MATCHES "||") OR + # (ARGN MATCHES "\\;")) + # message("Please report the following error to the project!") + # message(FATAL_ERROR "Looks like someone's doing something nefarious with git_describe! 
Passed arguments ${ARGN}") + #endif() + + #message(STATUS "Arguments to execute_process: ${ARGN}") + + execute_process(COMMAND + "${GIT_EXECUTABLE}" + describe + ${hash} + ${ARGN} + WORKING_DIRECTORY + "${CMAKE_SOURCE_DIR}" + RESULT_VARIABLE + res + OUTPUT_VARIABLE + out + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT res EQUAL 0) + set(out "${out}-${res}-NOTFOUND") + endif() + + set(${_var} "${out}" PARENT_SCOPE) +endfunction() + +function(git_get_exact_tag _var) + git_describe(out --exact-match ${ARGN}) + set(${_var} "${out}" PARENT_SCOPE) +endfunction() diff --git a/externals/cmake-modules/GetGitRevisionDescription.cmake.in b/externals/cmake-modules/GetGitRevisionDescription.cmake.in new file mode 100644 index 00000000..0d7eb3c2 --- /dev/null +++ b/externals/cmake-modules/GetGitRevisionDescription.cmake.in @@ -0,0 +1,42 @@ +# +# Internal file for GetGitRevisionDescription.cmake +# +# Requires CMake 2.6 or newer (uses the 'function' command) +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. 
+# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +set(HEAD_HASH) + +file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024) + +string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS) +if(HEAD_CONTENTS MATCHES "ref") + # named branch + string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}") + if(EXISTS "@GIT_DIR@/${HEAD_REF}") + configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) + elseif(EXISTS "@GIT_DIR@/logs/${HEAD_REF}") + configure_file("@GIT_DIR@/logs/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) + set(HEAD_HASH "${HEAD_REF}") + endif() +else() + # detached HEAD + configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY) +endif() + +if(NOT HEAD_HASH) + if(EXISTS "@GIT_DATA@/head-ref") + file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024) + string(STRIP "${HEAD_HASH}" HEAD_HASH) + else() + set(HEAD_HASH "Unknown") + endif() +endif() diff --git a/externals/renderdoc/renderdoc_app.h b/externals/renderdoc/renderdoc_app.h new file mode 100644 index 00000000..c01e0593 --- /dev/null +++ b/externals/renderdoc/renderdoc_app.h @@ -0,0 +1,741 @@ +/****************************************************************************** + * The MIT License (MIT) + * + * Copyright (c) 2019-2024 Baldur Karlsson + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + ******************************************************************************/ + +#pragma once + +////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Documentation for the API is available at https://renderdoc.org/docs/in_application_api.html +// + +#if !defined(RENDERDOC_NO_STDINT) +#include <stdint.h> +#endif + +#if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER) +#define RENDERDOC_CC __cdecl +#elif defined(__linux__) || defined(__FreeBSD__) +#define RENDERDOC_CC +#elif defined(__APPLE__) +#define RENDERDOC_CC +#else +#error "Unknown platform" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Constants not used directly in below API + +// This is a GUID/magic value used for when applications pass a path where shader debug +// information can be found to match up with a stripped shader. 
+// the define can be used like so: const GUID RENDERDOC_ShaderDebugMagicValue = +// RENDERDOC_ShaderDebugMagicValue_value +#define RENDERDOC_ShaderDebugMagicValue_struct \ + { \ + 0xeab25520, 0x6670, 0x4865, 0x84, 0x29, 0x6c, 0x8, 0x51, 0x54, 0x00, 0xff \ + } + +// as an alternative when you want a byte array (assuming x86 endianness): +#define RENDERDOC_ShaderDebugMagicValue_bytearray \ + { \ + 0x20, 0x55, 0xb2, 0xea, 0x70, 0x66, 0x65, 0x48, 0x84, 0x29, 0x6c, 0x8, 0x51, 0x54, 0x00, 0xff \ + } + +// truncated version when only a uint64_t is available (e.g. Vulkan tags): +#define RENDERDOC_ShaderDebugMagicValue_truncated 0x48656670eab25520ULL + +////////////////////////////////////////////////////////////////////////////////////////////////// +// RenderDoc capture options +// + +typedef enum RENDERDOC_CaptureOption +{ + // Allow the application to enable vsync + // + // Default - enabled + // + // 1 - The application can enable or disable vsync at will + // 0 - vsync is force disabled + eRENDERDOC_Option_AllowVSync = 0, + + // Allow the application to enable fullscreen + // + // Default - enabled + // + // 1 - The application can enable or disable fullscreen at will + // 0 - fullscreen is force disabled + eRENDERDOC_Option_AllowFullscreen = 1, + + // Record API debugging events and messages + // + // Default - disabled + // + // 1 - Enable built-in API debugging features and records the results into + // the capture, which is matched up with events on replay + // 0 - no API debugging is forcibly enabled + eRENDERDOC_Option_APIValidation = 2, + eRENDERDOC_Option_DebugDeviceMode = 2, // deprecated name of this enum + + // Capture CPU callstacks for API events + // + // Default - disabled + // + // 1 - Enables capturing of callstacks + // 0 - no callstacks are captured + eRENDERDOC_Option_CaptureCallstacks = 3, + + // When capturing CPU callstacks, only capture them from actions. 
+ // This option does nothing without the above option being enabled + // + // Default - disabled + // + // 1 - Only captures callstacks for actions. + // Ignored if CaptureCallstacks is disabled + // 0 - Callstacks, if enabled, are captured for every event. + eRENDERDOC_Option_CaptureCallstacksOnlyDraws = 4, + eRENDERDOC_Option_CaptureCallstacksOnlyActions = 4, + + // Specify a delay in seconds to wait for a debugger to attach, after + // creating or injecting into a process, before continuing to allow it to run. + // + // 0 indicates no delay, and the process will run immediately after injection + // + // Default - 0 seconds + // + eRENDERDOC_Option_DelayForDebugger = 5, + + // Verify buffer access. This includes checking the memory returned by a Map() call to + // detect any out-of-bounds modification, as well as initialising buffers with undefined contents + // to a marker value to catch use of uninitialised memory. + // + // NOTE: This option is only valid for OpenGL and D3D11. Explicit APIs such as D3D12 and Vulkan do + // not do the same kind of interception & checking and undefined contents are really undefined. + // + // Default - disabled + // + // 1 - Verify buffer access + // 0 - No verification is performed, and overwriting bounds may cause crashes or corruption in + // RenderDoc. + eRENDERDOC_Option_VerifyBufferAccess = 6, + + // The old name for eRENDERDOC_Option_VerifyBufferAccess was eRENDERDOC_Option_VerifyMapWrites. + // This option now controls the filling of uninitialised buffers with 0xdddddddd which was + // previously always enabled + eRENDERDOC_Option_VerifyMapWrites = eRENDERDOC_Option_VerifyBufferAccess, + + // Hooks any system API calls that create child processes, and injects + // RenderDoc into them recursively with the same options. 
+ // + // Default - disabled + // + // 1 - Hooks into spawned child processes + // 0 - Child processes are not hooked by RenderDoc + eRENDERDOC_Option_HookIntoChildren = 7, + + // By default RenderDoc only includes resources in the final capture necessary + // for that frame, this allows you to override that behaviour. + // + // Default - disabled + // + // 1 - all live resources at the time of capture are included in the capture + // and available for inspection + // 0 - only the resources referenced by the captured frame are included + eRENDERDOC_Option_RefAllResources = 8, + + // **NOTE**: As of RenderDoc v1.1 this option has been deprecated. Setting or + // getting it will be ignored, to allow compatibility with older versions. + // In v1.1 the option acts as if it's always enabled. + // + // By default RenderDoc skips saving initial states for resources where the + // previous contents don't appear to be used, assuming that writes before + // reads indicate previous contents aren't used. + // + // Default - disabled + // + // 1 - initial contents at the start of each captured frame are saved, even if + // they are later overwritten or cleared before being used. + // 0 - unless a read is detected, initial contents will not be saved and will + // appear as black or empty data. + eRENDERDOC_Option_SaveAllInitials = 9, + + // In APIs that allow for the recording of command lists to be replayed later, + // RenderDoc may choose to not capture command lists before a frame capture is + // triggered, to reduce overheads. This means any command lists recorded once + // and replayed many times will not be available and may cause a failure to + // capture. + // + // NOTE: This is only true for APIs where multithreading is difficult or + // discouraged. Newer APIs like Vulkan and D3D12 will ignore this option + // and always capture all command lists since the API is heavily oriented + // around it and the overheads have been reduced by API design. 
+ // + // 1 - All command lists are captured from the start of the application + // 0 - Command lists are only captured if their recording begins during + // the period when a frame capture is in progress. + eRENDERDOC_Option_CaptureAllCmdLists = 10, + + // Mute API debugging output when the API validation mode option is enabled + // + // Default - enabled + // + // 1 - Mute any API debug messages from being displayed or passed through + // 0 - API debugging is displayed as normal + eRENDERDOC_Option_DebugOutputMute = 11, + + // Option to allow vendor extensions to be used even when they may be + // incompatible with RenderDoc and cause corrupted replays or crashes. + // + // Default - inactive + // + // No values are documented, this option should only be used when absolutely + // necessary as directed by a RenderDoc developer. + eRENDERDOC_Option_AllowUnsupportedVendorExtensions = 12, + + // Define a soft memory limit which some APIs may aim to keep overhead under where + // possible. Anything above this limit will where possible be saved directly to disk during + // capture. + // This will cause increased disk space use (which may cause a capture to fail if disk space is + // exhausted) as well as slower capture times. + // + // Not all memory allocations may be deferred like this so it is not a guarantee of a memory + // limit. + // + // Units are in MBs, suggested values would range from 200MB to 1000MB. + // + // Default - 0 Megabytes + eRENDERDOC_Option_SoftMemoryLimit = 13, +} RENDERDOC_CaptureOption; + +// Sets an option that controls how RenderDoc behaves on capture. 
+// +// Returns 1 if the option and value are valid +// Returns 0 if either is invalid and the option is unchanged +typedef int(RENDERDOC_CC *pRENDERDOC_SetCaptureOptionU32)(RENDERDOC_CaptureOption opt, uint32_t val); +typedef int(RENDERDOC_CC *pRENDERDOC_SetCaptureOptionF32)(RENDERDOC_CaptureOption opt, float val); + +// Gets the current value of an option as a uint32_t +// +// If the option is invalid, 0xffffffff is returned +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetCaptureOptionU32)(RENDERDOC_CaptureOption opt); + +// Gets the current value of an option as a float +// +// If the option is invalid, -FLT_MAX is returned +typedef float(RENDERDOC_CC *pRENDERDOC_GetCaptureOptionF32)(RENDERDOC_CaptureOption opt); + +typedef enum RENDERDOC_InputButton +{ + // '0' - '9' matches ASCII values + eRENDERDOC_Key_0 = 0x30, + eRENDERDOC_Key_1 = 0x31, + eRENDERDOC_Key_2 = 0x32, + eRENDERDOC_Key_3 = 0x33, + eRENDERDOC_Key_4 = 0x34, + eRENDERDOC_Key_5 = 0x35, + eRENDERDOC_Key_6 = 0x36, + eRENDERDOC_Key_7 = 0x37, + eRENDERDOC_Key_8 = 0x38, + eRENDERDOC_Key_9 = 0x39, + + // 'A' - 'Z' matches ASCII values + eRENDERDOC_Key_A = 0x41, + eRENDERDOC_Key_B = 0x42, + eRENDERDOC_Key_C = 0x43, + eRENDERDOC_Key_D = 0x44, + eRENDERDOC_Key_E = 0x45, + eRENDERDOC_Key_F = 0x46, + eRENDERDOC_Key_G = 0x47, + eRENDERDOC_Key_H = 0x48, + eRENDERDOC_Key_I = 0x49, + eRENDERDOC_Key_J = 0x4A, + eRENDERDOC_Key_K = 0x4B, + eRENDERDOC_Key_L = 0x4C, + eRENDERDOC_Key_M = 0x4D, + eRENDERDOC_Key_N = 0x4E, + eRENDERDOC_Key_O = 0x4F, + eRENDERDOC_Key_P = 0x50, + eRENDERDOC_Key_Q = 0x51, + eRENDERDOC_Key_R = 0x52, + eRENDERDOC_Key_S = 0x53, + eRENDERDOC_Key_T = 0x54, + eRENDERDOC_Key_U = 0x55, + eRENDERDOC_Key_V = 0x56, + eRENDERDOC_Key_W = 0x57, + eRENDERDOC_Key_X = 0x58, + eRENDERDOC_Key_Y = 0x59, + eRENDERDOC_Key_Z = 0x5A, + + // leave the rest of the ASCII range free + // in case we want to use it later + eRENDERDOC_Key_NonPrintable = 0x100, + + eRENDERDOC_Key_Divide, + eRENDERDOC_Key_Multiply, + 
eRENDERDOC_Key_Subtract, + eRENDERDOC_Key_Plus, + + eRENDERDOC_Key_F1, + eRENDERDOC_Key_F2, + eRENDERDOC_Key_F3, + eRENDERDOC_Key_F4, + eRENDERDOC_Key_F5, + eRENDERDOC_Key_F6, + eRENDERDOC_Key_F7, + eRENDERDOC_Key_F8, + eRENDERDOC_Key_F9, + eRENDERDOC_Key_F10, + eRENDERDOC_Key_F11, + eRENDERDOC_Key_F12, + + eRENDERDOC_Key_Home, + eRENDERDOC_Key_End, + eRENDERDOC_Key_Insert, + eRENDERDOC_Key_Delete, + eRENDERDOC_Key_PageUp, + eRENDERDOC_Key_PageDn, + + eRENDERDOC_Key_Backspace, + eRENDERDOC_Key_Tab, + eRENDERDOC_Key_PrtScrn, + eRENDERDOC_Key_Pause, + + eRENDERDOC_Key_Max, +} RENDERDOC_InputButton; + +// Sets which key or keys can be used to toggle focus between multiple windows +// +// If keys is NULL or num is 0, toggle keys will be disabled +typedef void(RENDERDOC_CC *pRENDERDOC_SetFocusToggleKeys)(RENDERDOC_InputButton *keys, int num); + +// Sets which key or keys can be used to capture the next frame +// +// If keys is NULL or num is 0, captures keys will be disabled +typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureKeys)(RENDERDOC_InputButton *keys, int num); + +typedef enum RENDERDOC_OverlayBits +{ + // This single bit controls whether the overlay is enabled or disabled globally + eRENDERDOC_Overlay_Enabled = 0x1, + + // Show the average framerate over several seconds as well as min/max + eRENDERDOC_Overlay_FrameRate = 0x2, + + // Show the current frame number + eRENDERDOC_Overlay_FrameNumber = 0x4, + + // Show a list of recent captures, and how many captures have been made + eRENDERDOC_Overlay_CaptureList = 0x8, + + // Default values for the overlay mask + eRENDERDOC_Overlay_Default = (eRENDERDOC_Overlay_Enabled | eRENDERDOC_Overlay_FrameRate | + eRENDERDOC_Overlay_FrameNumber | eRENDERDOC_Overlay_CaptureList), + + // Enable all bits + eRENDERDOC_Overlay_All = ~0U, + + // Disable all bits + eRENDERDOC_Overlay_None = 0, +} RENDERDOC_OverlayBits; + +// returns the overlay bits that have been set +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetOverlayBits)(); +// 
sets the overlay bits with an and & or mask +typedef void(RENDERDOC_CC *pRENDERDOC_MaskOverlayBits)(uint32_t And, uint32_t Or); + +// this function will attempt to remove RenderDoc's hooks in the application. +// +// Note: that this can only work correctly if done immediately after +// the module is loaded, before any API work happens. RenderDoc will remove its +// injected hooks and shut down. Behaviour is undefined if this is called +// after any API functions have been called, and there is still no guarantee of +// success. +typedef void(RENDERDOC_CC *pRENDERDOC_RemoveHooks)(); + +// DEPRECATED: compatibility for code compiled against pre-1.4.1 headers. +typedef pRENDERDOC_RemoveHooks pRENDERDOC_Shutdown; + +// This function will unload RenderDoc's crash handler. +// +// If you use your own crash handler and don't want RenderDoc's handler to +// intercede, you can call this function to unload it and any unhandled +// exceptions will pass to the next handler. +typedef void(RENDERDOC_CC *pRENDERDOC_UnloadCrashHandler)(); + +// Sets the capture file path template +// +// pathtemplate is a UTF-8 string that gives a template for how captures will be named +// and where they will be saved. +// +// Any extension is stripped off the path, and captures are saved in the directory +// specified, and named with the filename and the frame number appended. If the +// directory does not exist it will be created, including any parent directories. 
+// +// If pathtemplate is NULL, the template will remain unchanged +// +// Example: +// +// SetCaptureFilePathTemplate("my_captures/example"); +// +// Capture #1 -> my_captures/example_frame123.rdc +// Capture #2 -> my_captures/example_frame456.rdc +typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureFilePathTemplate)(const char *pathtemplate); + +// returns the current capture path template, see SetCaptureFilePathTemplate above, as a UTF-8 string +typedef const char *(RENDERDOC_CC *pRENDERDOC_GetCaptureFilePathTemplate)(); + +// DEPRECATED: compatibility for code compiled against pre-1.1.2 headers. +typedef pRENDERDOC_SetCaptureFilePathTemplate pRENDERDOC_SetLogFilePathTemplate; +typedef pRENDERDOC_GetCaptureFilePathTemplate pRENDERDOC_GetLogFilePathTemplate; + +// returns the number of captures that have been made +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetNumCaptures)(); + +// This function returns the details of a capture, by index. New captures are added +// to the end of the list. +// +// filename will be filled with the absolute path to the capture file, as a UTF-8 string +// pathlength will be written with the length in bytes of the filename string +// timestamp will be written with the time of the capture, in seconds since the Unix epoch +// +// Any of the parameters can be NULL and they'll be skipped. +// +// The function will return 1 if the capture index is valid, or 0 if the index is invalid +// If the index is invalid, the values will be unchanged +// +// Note: when captures are deleted in the UI they will remain in this list, so the +// capture path may not exist anymore. +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_GetCapture)(uint32_t idx, char *filename, + uint32_t *pathlength, uint64_t *timestamp); + +// Sets the comments associated with a capture file. These comments are displayed in the +// UI program when opening. +// +// filePath should be a path to the capture file to add comments to. 
If set to NULL or "" +// the most recently created capture file will be used instead. +// comments should be a NULL-terminated UTF-8 string to add as comments. +// +// Any existing comments will be overwritten. +typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureFileComments)(const char *filePath, + const char *comments); + +// returns 1 if the RenderDoc UI is connected to this application, 0 otherwise +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_IsTargetControlConnected)(); + +// DEPRECATED: compatibility for code compiled against pre-1.1.1 headers. +// This was renamed to IsTargetControlConnected in API 1.1.1, the old typedef is kept here for +// backwards compatibility with old code, it is castable either way since it's ABI compatible +// as the same function pointer type. +typedef pRENDERDOC_IsTargetControlConnected pRENDERDOC_IsRemoteAccessConnected; + +// This function will launch the Replay UI associated with the RenderDoc library injected +// into the running application. +// +// if connectTargetControl is 1, the Replay UI will be launched with a command line parameter +// to connect to this application +// cmdline is the rest of the command line, as a UTF-8 string. E.g. a capture to open +// if cmdline is NULL, the command line will be empty. +// +// returns the PID of the replay UI if successful, 0 if not successful. +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_LaunchReplayUI)(uint32_t connectTargetControl, + const char *cmdline); + +// RenderDoc can return a higher version than requested if it's backwards compatible, +// this function returns the actual version returned. If a parameter is NULL, it will be +// ignored and the others will be filled out. +typedef void(RENDERDOC_CC *pRENDERDOC_GetAPIVersion)(int *major, int *minor, int *patch); + +// Requests that the replay UI show itself (if hidden or not the current top window). 
This can be +// used in conjunction with IsTargetControlConnected and LaunchReplayUI to intelligently handle +// showing the UI after making a capture. +// +// This will return 1 if the request was successfully passed on, though it's not guaranteed that +// the UI will be on top in all cases depending on OS rules. It will return 0 if there is no current +// target control connection to make such a request, or if there was another error +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_ShowReplayUI)(); + +////////////////////////////////////////////////////////////////////////// +// Capturing functions +// + +// A device pointer is a pointer to the API's root handle. +// +// This would be an ID3D11Device, HGLRC/GLXContext, ID3D12Device, etc +typedef void *RENDERDOC_DevicePointer; + +// A window handle is the OS's native window handle +// +// This would be an HWND, GLXDrawable, etc +typedef void *RENDERDOC_WindowHandle; + +// A helper macro for Vulkan, where the device handle cannot be used directly. +// +// Passing the VkInstance to this macro will return the RENDERDOC_DevicePointer to use. +// +// Specifically, the value needed is the dispatch table pointer, which sits as the first +// pointer-sized object in the memory pointed to by the VkInstance. Thus we cast to a void** and +// indirect once. +#define RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(inst) (*((void **)(inst))) + +// This sets the RenderDoc in-app overlay in the API/window pair as 'active' and it will +// respond to keypresses. 
Neither parameter can be NULL +typedef void(RENDERDOC_CC *pRENDERDOC_SetActiveWindow)(RENDERDOC_DevicePointer device, + RENDERDOC_WindowHandle wndHandle); + +// capture the next frame on whichever window and API is currently considered active +typedef void(RENDERDOC_CC *pRENDERDOC_TriggerCapture)(); + +// capture the next N frames on whichever window and API is currently considered active +typedef void(RENDERDOC_CC *pRENDERDOC_TriggerMultiFrameCapture)(uint32_t numFrames); + +// When choosing either a device pointer or a window handle to capture, you can pass NULL. +// Passing NULL specifies a 'wildcard' match against anything. This allows you to specify +// any API rendering to a specific window, or a specific API instance rendering to any window, +// or in the simplest case of one window and one API, you can just pass NULL for both. +// +// In either case, if there are two or more possible matching (device,window) pairs it +// is undefined which one will be captured. +// +// Note: for headless rendering you can pass NULL for the window handle and either specify +// a device pointer or leave it NULL as above. + +// Immediately starts capturing API calls on the specified device pointer and window handle. +// +// If there is no matching thing to capture (e.g. no supported API has been initialised), +// this will do nothing. +// +// The results are undefined (including crashes) if two captures are started overlapping, +// even on separate devices and/or windows. +typedef void(RENDERDOC_CC *pRENDERDOC_StartFrameCapture)(RENDERDOC_DevicePointer device, + RENDERDOC_WindowHandle wndHandle); + +// Returns whether or not a frame capture is currently ongoing anywhere. +// +// This will return 1 if a capture is ongoing, and 0 if there is no capture running +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_IsFrameCapturing)(); + +// Ends capturing immediately. +// +// This will return 1 if the capture succeeded, and 0 if there was an error capturing. 
+typedef uint32_t(RENDERDOC_CC *pRENDERDOC_EndFrameCapture)(RENDERDOC_DevicePointer device, + RENDERDOC_WindowHandle wndHandle); + +// Ends capturing immediately and discard any data stored without saving to disk. +// +// This will return 1 if the capture was discarded, and 0 if there was an error or no capture +// was in progress +typedef uint32_t(RENDERDOC_CC *pRENDERDOC_DiscardFrameCapture)(RENDERDOC_DevicePointer device, + RENDERDOC_WindowHandle wndHandle); + +// Only valid to be called between a call to StartFrameCapture and EndFrameCapture. Gives a custom +// title to the capture produced which will be displayed in the UI. +// +// If multiple captures are ongoing, this title will be applied to the first capture to end after +// this call. The second capture to end will have no title, unless this function is called again. +// +// Calling this function has no effect if no capture is currently running, and if it is called +// multiple times only the last title will be used. +typedef void(RENDERDOC_CC *pRENDERDOC_SetCaptureTitle)(const char *title); + +////////////////////////////////////////////////////////////////////////////////////////////////// +// RenderDoc API versions +// + +// RenderDoc uses semantic versioning (http://semver.org/). +// +// MAJOR version is incremented when incompatible API changes happen. +// MINOR version is incremented when functionality is added in a backwards-compatible manner. +// PATCH version is incremented when backwards-compatible bug fixes happen. +// +// Note that this means the API returned can be higher than the one you might have requested. +// e.g. if you are running against a newer RenderDoc that supports 1.0.1, it will be returned +// instead of 1.0.0. 
You can check this with the GetAPIVersion entry point +typedef enum RENDERDOC_Version +{ + eRENDERDOC_API_Version_1_0_0 = 10000, // RENDERDOC_API_1_0_0 = 1 00 00 + eRENDERDOC_API_Version_1_0_1 = 10001, // RENDERDOC_API_1_0_1 = 1 00 01 + eRENDERDOC_API_Version_1_0_2 = 10002, // RENDERDOC_API_1_0_2 = 1 00 02 + eRENDERDOC_API_Version_1_1_0 = 10100, // RENDERDOC_API_1_1_0 = 1 01 00 + eRENDERDOC_API_Version_1_1_1 = 10101, // RENDERDOC_API_1_1_1 = 1 01 01 + eRENDERDOC_API_Version_1_1_2 = 10102, // RENDERDOC_API_1_1_2 = 1 01 02 + eRENDERDOC_API_Version_1_2_0 = 10200, // RENDERDOC_API_1_2_0 = 1 02 00 + eRENDERDOC_API_Version_1_3_0 = 10300, // RENDERDOC_API_1_3_0 = 1 03 00 + eRENDERDOC_API_Version_1_4_0 = 10400, // RENDERDOC_API_1_4_0 = 1 04 00 + eRENDERDOC_API_Version_1_4_1 = 10401, // RENDERDOC_API_1_4_1 = 1 04 01 + eRENDERDOC_API_Version_1_4_2 = 10402, // RENDERDOC_API_1_4_2 = 1 04 02 + eRENDERDOC_API_Version_1_5_0 = 10500, // RENDERDOC_API_1_5_0 = 1 05 00 + eRENDERDOC_API_Version_1_6_0 = 10600, // RENDERDOC_API_1_6_0 = 1 06 00 +} RENDERDOC_Version; + +// API version changelog: +// +// 1.0.0 - initial release +// 1.0.1 - Bugfix: IsFrameCapturing() was returning false for captures that were triggered +// by keypress or TriggerCapture, instead of Start/EndFrameCapture. +// 1.0.2 - Refactor: Renamed eRENDERDOC_Option_DebugDeviceMode to eRENDERDOC_Option_APIValidation +// 1.1.0 - Add feature: TriggerMultiFrameCapture(). Backwards compatible with 1.0.x since the new +// function pointer is added to the end of the struct, the original layout is identical +// 1.1.1 - Refactor: Renamed remote access to target control (to better disambiguate from remote +// replay/remote server concept in replay UI) +// 1.1.2 - Refactor: Renamed "log file" in function names to just capture, to clarify that these +// are captures and not debug logging files. This is the first API version in the v1.0 +// branch. 
+// 1.2.0 - Added feature: SetCaptureFileComments() to add comments to a capture file that will be +// displayed in the UI program on load. +// 1.3.0 - Added feature: New capture option eRENDERDOC_Option_AllowUnsupportedVendorExtensions +// which allows users to opt-in to allowing unsupported vendor extensions to function. +// Should be used at the user's own risk. +// Refactor: Renamed eRENDERDOC_Option_VerifyMapWrites to +// eRENDERDOC_Option_VerifyBufferAccess, which now also controls initialisation to +// 0xdddddddd of uninitialised buffer contents. +// 1.4.0 - Added feature: DiscardFrameCapture() to discard a frame capture in progress and stop +// capturing without saving anything to disk. +// 1.4.1 - Refactor: Renamed Shutdown to RemoveHooks to better clarify what is happening +// 1.4.2 - Refactor: Renamed 'draws' to 'actions' in callstack capture option. +// 1.5.0 - Added feature: ShowReplayUI() to request that the replay UI show itself if connected +// 1.6.0 - Added feature: SetCaptureTitle() which can be used to set a title for a +// capture made with StartFrameCapture() or EndFrameCapture() + +typedef struct RENDERDOC_API_1_6_0 +{ + pRENDERDOC_GetAPIVersion GetAPIVersion; + + pRENDERDOC_SetCaptureOptionU32 SetCaptureOptionU32; + pRENDERDOC_SetCaptureOptionF32 SetCaptureOptionF32; + + pRENDERDOC_GetCaptureOptionU32 GetCaptureOptionU32; + pRENDERDOC_GetCaptureOptionF32 GetCaptureOptionF32; + + pRENDERDOC_SetFocusToggleKeys SetFocusToggleKeys; + pRENDERDOC_SetCaptureKeys SetCaptureKeys; + + pRENDERDOC_GetOverlayBits GetOverlayBits; + pRENDERDOC_MaskOverlayBits MaskOverlayBits; + + // Shutdown was renamed to RemoveHooks in 1.4.1. + // These unions allow old code to continue compiling without changes + union + { + pRENDERDOC_Shutdown Shutdown; + pRENDERDOC_RemoveHooks RemoveHooks; + }; + pRENDERDOC_UnloadCrashHandler UnloadCrashHandler; + + // Get/SetLogFilePathTemplate was renamed to Get/SetCaptureFilePathTemplate in 1.1.2. 
+ // These unions allow old code to continue compiling without changes + union + { + // deprecated name + pRENDERDOC_SetLogFilePathTemplate SetLogFilePathTemplate; + // current name + pRENDERDOC_SetCaptureFilePathTemplate SetCaptureFilePathTemplate; + }; + union + { + // deprecated name + pRENDERDOC_GetLogFilePathTemplate GetLogFilePathTemplate; + // current name + pRENDERDOC_GetCaptureFilePathTemplate GetCaptureFilePathTemplate; + }; + + pRENDERDOC_GetNumCaptures GetNumCaptures; + pRENDERDOC_GetCapture GetCapture; + + pRENDERDOC_TriggerCapture TriggerCapture; + + // IsRemoteAccessConnected was renamed to IsTargetControlConnected in 1.1.1. + // This union allows old code to continue compiling without changes + union + { + // deprecated name + pRENDERDOC_IsRemoteAccessConnected IsRemoteAccessConnected; + // current name + pRENDERDOC_IsTargetControlConnected IsTargetControlConnected; + }; + pRENDERDOC_LaunchReplayUI LaunchReplayUI; + + pRENDERDOC_SetActiveWindow SetActiveWindow; + + pRENDERDOC_StartFrameCapture StartFrameCapture; + pRENDERDOC_IsFrameCapturing IsFrameCapturing; + pRENDERDOC_EndFrameCapture EndFrameCapture; + + // new function in 1.1.0 + pRENDERDOC_TriggerMultiFrameCapture TriggerMultiFrameCapture; + + // new function in 1.2.0 + pRENDERDOC_SetCaptureFileComments SetCaptureFileComments; + + // new function in 1.4.0 + pRENDERDOC_DiscardFrameCapture DiscardFrameCapture; + + // new function in 1.5.0 + pRENDERDOC_ShowReplayUI ShowReplayUI; + + // new function in 1.6.0 + pRENDERDOC_SetCaptureTitle SetCaptureTitle; +} RENDERDOC_API_1_6_0; + +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_0_0; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_0_1; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_0_2; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_1_0; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_1_1; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_1_2; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_2_0; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_3_0; +typedef 
RENDERDOC_API_1_6_0 RENDERDOC_API_1_4_0; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_4_1; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_4_2; +typedef RENDERDOC_API_1_6_0 RENDERDOC_API_1_5_0; + +////////////////////////////////////////////////////////////////////////////////////////////////// +// RenderDoc API entry point +// +// This entry point can be obtained via GetProcAddress/dlsym if RenderDoc is available. +// +// The name is the same as the typedef - "RENDERDOC_GetAPI" +// +// This function is not thread safe, and should not be called on multiple threads at once. +// Ideally, call this once as early as possible in your application's startup, before doing +// any API work, since some configuration functionality etc has to be done also before +// initialising any APIs. +// +// Parameters: +// version is a single value from the RENDERDOC_Version above. +// +// outAPIPointers will be filled out with a pointer to the corresponding struct of function +// pointers. +// +// Returns: +// 1 - if the outAPIPointers has been filled with a pointer to the API struct requested +// 0 - if the requested version is not supported or the arguments are invalid. +// +typedef int(RENDERDOC_CC *pRENDERDOC_GetAPI)(RENDERDOC_Version version, void **outAPIPointers); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/src/audio_core/sdl_audio.cpp b/src/audio_core/sdl_audio.cpp index 0d494707..141d338e 100644 --- a/src/audio_core/sdl_audio.cpp +++ b/src/audio_core/sdl_audio.cpp @@ -4,8 +4,8 @@ #include #include #include -#include -#include +#include "common/assert.h" +#include "core/libraries/error_codes.h" #include "sdl_audio.h" namespace Audio { diff --git a/src/common/config.cpp b/src/common/config.cpp index a577b143..7e677f84 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -15,15 +15,17 @@ static u32 screenWidth = 1280; static u32 screenHeight = 720; static s32 gpuId = -1; // Vulkan physical device index. 
Set to negative for auto select static std::string logFilter; -static std::string logType = "sync"; +static std::string logType = "async"; static bool isDebugDump = false; static bool isLibc = true; static bool isShowSplash = false; static bool isNullGpu = false; static bool shouldDumpShaders = false; static bool shouldDumpPM4 = false; +static u32 vblankDivider = 1; static bool vkValidation = false; static bool vkValidationSync = false; +static bool rdocEnable = false; // Gui std::string settings_install_dir = ""; u32 main_window_geometry_x = 400; @@ -94,6 +96,14 @@ bool dumpPM4() { return shouldDumpPM4; } +bool isRdocEnabled() { + return rdocEnable; +} + +u32 vblankDiv() { + return vblankDivider; +} + bool vkValidationEnabled() { return vkValidation; } @@ -220,7 +230,7 @@ void load(const std::filesystem::path& path) { auto general = generalResult.unwrap(); isNeo = toml::find_or(general, "isPS4Pro", false); - isFullscreen = toml::find_or(general, "Fullscreen", true); + isFullscreen = toml::find_or(general, "Fullscreen", false); logFilter = toml::find_or(general, "logFilter", ""); logType = toml::find_or(general, "logType", "sync"); isShowSplash = toml::find_or(general, "showSplash", true); @@ -233,10 +243,10 @@ void load(const std::filesystem::path& path) { screenWidth = toml::find_or(gpu, "screenWidth", screenWidth); screenHeight = toml::find_or(gpu, "screenHeight", screenHeight); - gpuId = toml::find_or(gpu, "gpuId", 0); isNullGpu = toml::find_or(gpu, "nullGpu", false); shouldDumpShaders = toml::find_or(gpu, "dumpShaders", false); shouldDumpPM4 = toml::find_or(gpu, "dumpPM4", false); + vblankDivider = toml::find_or(gpu, "vblankDivider", 1); } } if (data.contains("Vulkan")) { @@ -244,8 +254,10 @@ void load(const std::filesystem::path& path) { if (vkResult.is_ok()) { auto vk = vkResult.unwrap(); + gpuId = toml::find_or(vk, "gpuId", 0); vkValidation = toml::find_or(vk, "validation", true); vkValidationSync = toml::find_or(vk, "validation_sync", true); + rdocEnable = 
toml::find_or(vk, "rdocEnable", false); } } if (data.contains("Debug")) { @@ -312,14 +324,16 @@ void save(const std::filesystem::path& path) { data["General"]["logFilter"] = logFilter; data["General"]["logType"] = logType; data["General"]["showSplash"] = isShowSplash; - data["GPU"]["gpuId"] = gpuId; data["GPU"]["screenWidth"] = screenWidth; data["GPU"]["screenHeight"] = screenHeight; data["GPU"]["nullGpu"] = isNullGpu; data["GPU"]["dumpShaders"] = shouldDumpShaders; data["GPU"]["dumpPM4"] = shouldDumpPM4; + data["GPU"]["vblankDivider"] = vblankDivider; + data["Vulkan"]["gpuId"] = gpuId; data["Vulkan"]["validation"] = vkValidation; data["Vulkan"]["validation_sync"] = vkValidationSync; + data["Vulkan"]["rdocEnable"] = rdocEnable; data["Debug"]["DebugDump"] = isDebugDump; data["LLE"]["libc"] = isLibc; data["GUI"]["theme"] = mw_themes; diff --git a/src/common/config.h b/src/common/config.h index 0a3b4905..637ac746 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -26,6 +26,8 @@ bool showSplash(); bool nullGpu(); bool dumpShaders(); bool dumpPM4(); +bool isRdocEnabled(); +u32 vblankDiv(); bool vkValidationEnabled(); bool vkValidationSyncEnabled(); diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp index 0b03c86b..a21af8bb 100644 --- a/src/common/logging/backend.cpp +++ b/src/common/logging/backend.cpp @@ -207,8 +207,8 @@ public: message_queue.EmplaceWait(entry); } else { ForEachBackend([&entry](auto& backend) { backend.Write(entry); }); + std::fflush(stdout); } - std::fflush(stdout); } private: diff --git a/src/common/logging/types.h b/src/common/logging/types.h index 33e52fd4..dccb838a 100644 --- a/src/common/logging/types.h +++ b/src/common/logging/types.h @@ -73,6 +73,7 @@ enum class Class : u8 { Lib_DiscMap, ///< The LibSceDiscMap implementation. Lib_Png, ///< The LibScePng implementation. Lib_PlayGo, ///< The LibScePlayGo implementation. + Lib_Random, ///< The libSceRandom implementation. 
Lib_Usbd, ///< The LibSceUsbd implementation. Lib_Ajm, ///< The LibSceAjm implementation. Lib_ErrorDialog, ///< The LibSceErrorDialog implementation. diff --git a/src/common/path_util.cpp b/src/common/path_util.cpp index 429fe2a5..c1e8a5c0 100644 --- a/src/common/path_util.cpp +++ b/src/common/path_util.cpp @@ -72,6 +72,8 @@ static auto UserPaths = [] { create_path(PathType::GameDataDir, user_dir / GAMEDATA_DIR); create_path(PathType::TempDataDir, user_dir / TEMPDATA_DIR); create_path(PathType::SysModuleDir, user_dir / SYSMODULES_DIR); + create_path(PathType::DownloadDir, user_dir / DOWNLOAD_DIR); + create_path(PathType::CapturesDir, user_dir / CAPTURES_DIR); return paths; }(); diff --git a/src/common/path_util.h b/src/common/path_util.h index 57a9a73f..263edd46 100644 --- a/src/common/path_util.h +++ b/src/common/path_util.h @@ -18,6 +18,8 @@ enum class PathType { TempDataDir, // Where game temp data is stored. GameDataDir, // Where game data is stored. SysModuleDir, // Where system modules are stored. + DownloadDir, // Where downloads/temp files are stored. + CapturesDir, // Where rdoc captures are stored. 
}; constexpr auto PORTABLE_DIR = "user"; @@ -31,6 +33,8 @@ constexpr auto SAVEDATA_DIR = "savedata"; constexpr auto GAMEDATA_DIR = "data"; constexpr auto TEMPDATA_DIR = "temp"; constexpr auto SYSMODULES_DIR = "sys_modules"; +constexpr auto DOWNLOAD_DIR = "download"; +constexpr auto CAPTURES_DIR = "captures"; // Filenames constexpr auto LOG_FILE = "shad_log.txt"; diff --git a/src/common/scm_rev.cpp.in b/src/common/scm_rev.cpp.in new file mode 100644 index 00000000..7f6fba9e --- /dev/null +++ b/src/common/scm_rev.cpp.in @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/scm_rev.h" + +#define GIT_REV "@GIT_REV@" +#define GIT_BRANCH "@GIT_BRANCH@" +#define GIT_DESC "@GIT_DESC@" + +namespace Common { + +const char g_scm_rev[] = GIT_REV; +const char g_scm_branch[] = GIT_BRANCH; +const char g_scm_desc[] = GIT_DESC; + +} // namespace + diff --git a/src/common/scm_rev.h b/src/common/scm_rev.h new file mode 100644 index 00000000..877a0127 --- /dev/null +++ b/src/common/scm_rev.h @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +namespace Common { + +extern const char g_scm_rev[]; +extern const char g_scm_branch[]; +extern const char g_scm_desc[]; + +} // namespace Common diff --git a/src/common/version.h b/src/common/version.h index 8492c7d9..92fd18fb 100644 --- a/src/common/version.h +++ b/src/common/version.h @@ -9,5 +9,6 @@ namespace Common { constexpr char VERSION[] = "0.1.1 WIP"; +constexpr bool isRelease = false; } // namespace Common diff --git a/src/core/address_space.h b/src/core/address_space.h index 1d2c67da..311310f2 100644 --- a/src/core/address_space.h +++ b/src/core/address_space.h @@ -34,10 +34,7 @@ constexpr VAddr USER_MAX = 0xFBFFFFFFFFULL; static constexpr size_t SystemManagedSize = SYSTEM_MANAGED_MAX - SYSTEM_MANAGED_MIN + 1; static constexpr size_t 
SystemReservedSize = SYSTEM_RESERVED_MAX - SYSTEM_RESERVED_MIN + 1; -// User area size is normally larger than this. However games are unlikely to map to high -// regions of that area, so by default we allocate a smaller virtual address space (about 1/4th). -// to save space on page tables. -static constexpr size_t UserSize = 1ULL << 39; +static constexpr size_t UserSize = 1ULL << 40; /** * Represents the user virtual address space backed by a dmem memory block diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 2a9cf5e2..42318822 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -285,20 +285,24 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe const auto slot = GetTcbKey(); #if defined(_WIN32) - // The following logic is based on the wine implementation of TlsGetValue - // https://github.com/wine-mirror/wine/blob/a27b9551/dlls/kernelbase/thread.c#L719 + // The following logic is based on the Kernel32.dll asm of TlsGetValue static constexpr u32 TlsSlotsOffset = 0x1480; static constexpr u32 TlsExpansionSlotsOffset = 0x1780; static constexpr u32 TlsMinimumAvailable = 64; - const u32 teb_offset = slot < TlsMinimumAvailable ? TlsSlotsOffset : TlsExpansionSlotsOffset; - const u32 tls_index = slot < TlsMinimumAvailable ? slot : slot - TlsMinimumAvailable; - // Load the pointer to the table of TLS slots. c.putSeg(gs); - c.mov(dst, ptr[reinterpret_cast(teb_offset)]); - // Load the pointer to our buffer. - c.mov(dst, qword[dst + tls_index * sizeof(LPVOID)]); + if (slot < TlsMinimumAvailable) { + // Load the pointer to TLS slots. + c.mov(dst, ptr[reinterpret_cast(TlsSlotsOffset + slot * sizeof(LPVOID))]); + } else { + const u32 tls_index = slot - TlsMinimumAvailable; + + // Load the pointer to the table of TLS expansion slots. + c.mov(dst, ptr[reinterpret_cast(TlsExpansionSlotsOffset)]); + // Load the pointer to our buffer. 
+ c.mov(dst, qword[dst + tls_index * sizeof(LPVOID)]); + } #elif defined(__APPLE__) // The following logic is based on the Darwin implementation of _os_tsd_get_direct, used by // pthread_getspecific https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L89-L96 diff --git a/src/core/file_format/playgo_chunk.cpp b/src/core/file_format/playgo_chunk.cpp new file mode 100644 index 00000000..43d8a4de --- /dev/null +++ b/src/core/file_format/playgo_chunk.cpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/io_file.h" + +#include "playgo_chunk.h" + +bool PlaygoChunk::Open(const std::filesystem::path& filepath) { + Common::FS::IOFile file(filepath, Common::FS::FileAccessMode::Read); + if (!file.IsOpen()) { + return false; + } + file.Read(playgoHeader); + + return true; +} \ No newline at end of file diff --git a/src/core/file_format/playgo_chunk.h b/src/core/file_format/playgo_chunk.h new file mode 100644 index 00000000..d17d24bf --- /dev/null +++ b/src/core/file_format/playgo_chunk.h @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once +#include +#include "common/types.h" + +struct PlaygoHeader { + u32 magic; + + u16 version_major; + u16 version_minor; + u16 image_count; + u16 chunk_count; + u16 mchunk_count; + u16 scenario_count; + // TODO fill the rest +}; +class PlaygoChunk { +public: + PlaygoChunk() = default; + ~PlaygoChunk() = default; + + bool Open(const std::filesystem::path& filepath); + PlaygoHeader GetPlaygoHeader() { + return playgoHeader; + } + +private: + PlaygoHeader playgoHeader; +}; \ No newline at end of file diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp index 2f57c9f3..2bcff191 100644 --- a/src/core/file_sys/fs.cpp +++ b/src/core/file_sys/fs.cpp @@ -26,23 +26,27 @@ void MntPoints::UnmountAll() { } std::filesystem::path 
MntPoints::GetHostPath(const std::string& guest_directory) { - const MntPair* mount = GetMount(guest_directory); + // Evil games like Turok2 pass double slashes e.g /app0//game.kpf + auto corrected_path = guest_directory; + size_t pos = corrected_path.find("//"); + while (pos != std::string::npos) { + corrected_path.replace(pos, 2, "/"); + pos = corrected_path.find("//", pos + 1); + } + + const MntPair* mount = GetMount(corrected_path); if (!mount) { - return guest_directory; + return ""; } // Nothing to do if getting the mount itself. - if (guest_directory == mount->mount) { + if (corrected_path == mount->mount) { return mount->host_path; } // Remove device (e.g /app0) from path to retrieve relative path. - u32 pos = mount->mount.size() + 1; - // Evil games like Turok2 pass double slashes e.g /app0//game.kpf - if (guest_directory[pos] == '/') { - pos++; - } - const auto rel_path = std::string_view(guest_directory).substr(pos); + pos = mount->mount.size() + 1; + const auto rel_path = std::string_view(corrected_path).substr(pos); const auto host_path = mount->host_path / rel_path; if (!NeedsCaseInsensiveSearch) { return host_path; @@ -66,7 +70,7 @@ std::filesystem::path MntPoints::GetHostPath(const std::string& guest_directory) // exist in filesystem but in different case. 
auto guest_path = current_path; while (!path_parts.empty()) { - const auto& part = path_parts.back(); + const auto part = path_parts.back(); const auto add_match = [&](const auto& host_part) { current_path /= host_part; guest_path /= part; diff --git a/src/core/libraries/app_content/app_content.cpp b/src/core/libraries/app_content/app_content.cpp index 7e9cf7a2..882f99e4 100644 --- a/src/core/libraries/app_content/app_content.cpp +++ b/src/core/libraries/app_content/app_content.cpp @@ -198,13 +198,9 @@ int PS4_SYSV_ABI sceAppContentTemporaryDataMount() { int PS4_SYSV_ABI sceAppContentTemporaryDataMount2(OrbisAppContentTemporaryDataOption option, OrbisAppContentMountPoint* mountPoint) { - if (std::string_view(mountPoint->data).empty()) // causing issues with save_data. + if (mountPoint == nullptr) return ORBIS_APP_CONTENT_ERROR_PARAMETER; - auto* param_sfo = Common::Singleton::Instance(); - std::string id(param_sfo->GetString("CONTENT_ID"), 7, 9); - const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::TempDataDir) / id; - auto* mnt = Common::Singleton::Instance(); - mnt->Mount(mount_dir, mountPoint->data); + strncpy(mountPoint->data, "/temp0", 16); LOG_INFO(Lib_AppContent, "sceAppContentTemporaryDataMount2: option = {}, mountPoint = {}", option, mountPoint->data); return ORBIS_OK; diff --git a/src/core/libraries/error_codes.h b/src/core/libraries/error_codes.h index 63016213..5eabaaf6 100644 --- a/src/core/libraries/error_codes.h +++ b/src/core/libraries/error_codes.h @@ -233,6 +233,9 @@ constexpr int SCE_KERNEL_ERROR_ESDKVERSION = 0x80020063; constexpr int SCE_KERNEL_ERROR_ESTART = 0x80020064; constexpr int SCE_KERNEL_ERROR_ESTOP = 0x80020065; +// libSceRandom error codes +constexpr int SCE_RANDOM_ERROR_INVALID = 0x817C0016; + // videoOut constexpr int SCE_VIDEO_OUT_ERROR_INVALID_VALUE = 0x80290001; // invalid argument constexpr int SCE_VIDEO_OUT_ERROR_INVALID_ADDRESS = 0x80290002; // invalid addresses diff --git 
a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 866a9698..650252f9 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -20,13 +20,12 @@ extern Frontend::WindowSDL* g_window; std::unique_ptr renderer; +std::unique_ptr liverpool; namespace Libraries::GnmDriver { using namespace AmdGpu; -static std::unique_ptr liverpool; - enum GnmEventIdents : u64 { Compute0RelMem = 0x00, Compute1RelMem = 0x01, @@ -958,7 +957,7 @@ int PS4_SYSV_ABI sceGnmGetGpuBlockStatus() { } int PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); + LOG_DEBUG(Lib_GnmDriver, "(STUBBED) called"); return ORBIS_OK; } @@ -2131,6 +2130,7 @@ int PS4_SYSV_ABI sceGnmSubmitDone() { if (!liverpool->IsGpuIdle()) { submission_lock = true; } + liverpool->SubmitDone(); send_init_packet = true; ++frames_submitted; return ORBIS_OK; diff --git a/src/core/libraries/kernel/event_queue.cpp b/src/core/libraries/kernel/event_queue.cpp index 6bd88459..3555fddc 100644 --- a/src/core/libraries/kernel/event_queue.cpp +++ b/src/core/libraries/kernel/event_queue.cpp @@ -78,9 +78,7 @@ bool EqueueInternal::TriggerEvent(u64 ident, s16 filter, void* trigger_data) { std::scoped_lock lock{m_mutex}; for (auto& event : m_events) { - ASSERT_MSG(event.event.filter == filter, - "Event to trigger doesn't match to queue events"); - if (event.event.ident == ident) { + if ((event.event.ident == ident) && (event.event.filter == filter)) { event.Trigger(trigger_data); has_found = true; } diff --git a/src/core/libraries/kernel/file_system.cpp b/src/core/libraries/kernel/file_system.cpp index 8734b964..f8386347 100644 --- a/src/core/libraries/kernel/file_system.cpp +++ b/src/core/libraries/kernel/file_system.cpp @@ -53,6 +53,9 @@ int PS4_SYSV_ABI sceKernelOpen(const char* path, int flags, u16 mode) { if (std::string_view{path} == "/dev/stdout") { return 2002; } + if (std::string_view{path} == 
"/dev/urandom") { + return 2003; + } u32 handle = h->CreateHandle(); auto* file = h->GetFile(handle); if (directory) { @@ -113,6 +116,9 @@ int PS4_SYSV_ABI sceKernelClose(int d) { if (d < 3) { // d probably hold an error code return ORBIS_KERNEL_ERROR_EPERM; } + if (d == 2003) { // dev/urandom case + return SCE_OK; + } auto* h = Common::Singleton::Instance(); auto* file = h->GetFile(d); if (file == nullptr) { @@ -223,6 +229,13 @@ s64 PS4_SYSV_ABI posix_lseek(int d, s64 offset, int whence) { } s64 PS4_SYSV_ABI sceKernelRead(int d, void* buf, size_t nbytes) { + if (d == 2003) // dev urandom case + { + auto rbuf = static_cast(buf); + for (size_t i = 0; i < nbytes; i++) + rbuf[i] = std::rand() & 0xFF; + return nbytes; + } auto* h = Common::Singleton::Instance(); auto* file = h->GetFile(d); if (file == nullptr) { @@ -459,7 +472,30 @@ s64 PS4_SYSV_ABI sceKernelPwrite(int d, void* buf, size_t nbytes, s64 offset) { return file->f.WriteRaw(buf, nbytes); } +s32 PS4_SYSV_ABI sceKernelRename(const char* from, const char* to) { + auto* mnt = Common::Singleton::Instance(); + const auto src_path = mnt->GetHostPath(from); + if (!std::filesystem::exists(src_path)) { + return ORBIS_KERNEL_ERROR_ENOENT; + } + const auto dst_path = mnt->GetHostPath(to); + const bool src_is_dir = std::filesystem::is_directory(src_path); + const bool dst_is_dir = std::filesystem::is_directory(dst_path); + if (src_is_dir && !dst_is_dir) { + return ORBIS_KERNEL_ERROR_ENOTDIR; + } + if (!src_is_dir && dst_is_dir) { + return ORBIS_KERNEL_ERROR_EISDIR; + } + if (dst_is_dir && !std::filesystem::is_empty(dst_path)) { + return ORBIS_KERNEL_ERROR_ENOTEMPTY; + } + std::filesystem::copy(src_path, dst_path, std::filesystem::copy_options::overwrite_existing); + return ORBIS_OK; +} + void fileSystemSymbolsRegister(Core::Loader::SymbolsResolver* sym) { + std::srand(std::time(nullptr)); LIB_FUNCTION("1G3lF1Gg1k8", "libkernel", 1, "libkernel", 1, 1, sceKernelOpen); LIB_FUNCTION("wuCroIGjt2g", "libScePosix", 1, 
"libkernel", 1, 1, posix_open); LIB_FUNCTION("UK2Tl2DWUns", "libkernel", 1, "libkernel", 1, 1, sceKernelClose); @@ -479,6 +515,7 @@ void fileSystemSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("kBwCPsYX-m4", "libkernel", 1, "libkernel", 1, 1, sceKernelFStat); LIB_FUNCTION("mqQMh1zPPT8", "libScePosix", 1, "libkernel", 1, 1, posix_fstat); LIB_FUNCTION("VW3TVZiM4-E", "libkernel", 1, "libkernel", 1, 1, sceKernelFtruncate); + LIB_FUNCTION("52NcYU9+lEo", "libkernel", 1, "libkernel", 1, 1, sceKernelRename); LIB_FUNCTION("E6ao34wPw+U", "libScePosix", 1, "libkernel", 1, 1, posix_stat); LIB_FUNCTION("+r3rMFwItV4", "libkernel", 1, "libkernel", 1, 1, sceKernelPread); diff --git a/src/core/libraries/kernel/libkernel.cpp b/src/core/libraries/kernel/libkernel.cpp index a7f619f1..e2625819 100644 --- a/src/core/libraries/kernel/libkernel.cpp +++ b/src/core/libraries/kernel/libkernel.cpp @@ -7,6 +7,7 @@ #include #include "common/assert.h" +#include "common/debug.h" #include "common/logging/log.h" #include "common/polyfill_thread.h" #include "common/singleton.h" @@ -84,6 +85,9 @@ static PS4_SYSV_ABI void stack_chk_fail() { int PS4_SYSV_ABI sceKernelMunmap(void* addr, size_t len) { LOG_INFO(Kernel_Vmm, "addr = {}, len = {:#x}", fmt::ptr(addr), len); + if (len == 0) { + return ORBIS_OK; + } auto* memory = Core::Memory::Instance(); memory->UnmapMemory(std::bit_cast(addr), len); return SCE_OK; @@ -407,6 +411,7 @@ void LibKernel_Register(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("2SKEx6bSq-4", "libkernel", 1, "libkernel", 1, 1, sceKernelBatchMap); LIB_FUNCTION("kBJzF8x4SyE", "libkernel", 1, "libkernel", 1, 1, sceKernelBatchMap2); + LIB_FUNCTION("DGMG3JshrZU", "libkernel", 1, "libkernel", 1, 1, sceKernelSetVirtualRangeName); // equeue LIB_FUNCTION("D0OdFMjp46I", "libkernel", 1, "libkernel", 1, 1, sceKernelCreateEqueue); diff --git a/src/core/libraries/kernel/memory_management.cpp b/src/core/libraries/kernel/memory_management.cpp index 81874c1f..f0d71c5f 100644 
--- a/src/core/libraries/kernel/memory_management.cpp +++ b/src/core/libraries/kernel/memory_management.cpp @@ -246,7 +246,8 @@ int PS4_SYSV_ABI sceKernelGetDirectMemoryType(u64 addr, int* directMemoryTypeOut s32 PS4_SYSV_ABI sceKernelBatchMap(OrbisKernelBatchMapEntry* entries, int numEntries, int* numEntriesOut) { - return sceKernelBatchMap2(entries, numEntries, numEntriesOut, 0x10); // 0x10 : Fixed / 0x410 + return sceKernelBatchMap2(entries, numEntries, numEntriesOut, + MemoryFlags::SCE_KERNEL_MAP_FIXED); // 0x10, 0x410? } int PS4_SYSV_ABI sceKernelMunmap(void* addr, size_t len); @@ -261,7 +262,7 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn break; // break and assign a value to numEntriesOut. } - if (entries[i].operation == 0) { // MAP_DIRECT + if (entries[i].operation == MemoryOpTypes::ORBIS_KERNEL_MAP_OP_MAP_DIRECT) { result = sceKernelMapNamedDirectMemory(&entries[i].start, entries[i].length, entries[i].protection, flags, static_cast(entries[i].offset), 0, ""); @@ -274,13 +275,18 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn if (result == 0) processed++; +<<<<<<< HEAD } else if (entries[i].operation == 1) { // UNMAP +======= + } else if (entries[i].operation == MemoryOpTypes::ORBIS_KERNEL_MAP_OP_UNMAP) { +>>>>>>> cdff4af38da1d832e35d8c057d698f38c64b2932 result = sceKernelMunmap(entries[i].start, entries[i].length); LOG_INFO(Kernel_Vmm, "BatchMap: entry = {}, operation = {}, len = {:#x}, result = {}", i, entries[i].operation, entries[i].length, result); if (result == 0) processed++; +<<<<<<< HEAD } else if (entries[i].operation == 4) { // MPROTECT result = sceKernelMProtect(entries[i].start, entries[i].length, entries[i].protection); @@ -296,6 +302,19 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn } } else { +======= + } else if (entries[i].operation == MemoryOpTypes::ORBIS_KERNEL_MAP_OP_MAP_FLEXIBLE) { + result = 
sceKernelMapNamedFlexibleMemory(&entries[i].start, entries[i].length, + entries[i].protection, flags, ""); + LOG_INFO(Kernel_Vmm, + "BatchMap: entry = {}, operation = {}, len = {:#x}, type = {}, " + "result = {}", + i, entries[i].operation, entries[i].length, (u8)entries[i].type, result); + + if (result == 0) + processed++; + } else { +>>>>>>> cdff4af38da1d832e35d8c057d698f38c64b2932 UNREACHABLE_MSG("called: Unimplemented Operation = {}", entries[i].operation); } } @@ -305,4 +324,19 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn return result; } +s32 PS4_SYSV_ABI sceKernelSetVirtualRangeName(const void* addr, size_t len, const char* name) { + static constexpr size_t MaxNameSize = 32; + if (std::strlen(name) > MaxNameSize) { + LOG_ERROR(Kernel_Vmm, "name exceeds 32 bytes!"); + return ORBIS_KERNEL_ERROR_ENAMETOOLONG; + } + + if (name == nullptr) { + LOG_ERROR(Kernel_Vmm, "name is invalid!"); + return ORBIS_KERNEL_ERROR_EFAULT; + } + auto* memory = Core::Memory::Instance(); + memory->NameVirtualRange(std::bit_cast(addr), len, name); + return ORBIS_OK; +} } // namespace Libraries::Kernel diff --git a/src/core/libraries/kernel/memory_management.h b/src/core/libraries/kernel/memory_management.h index 25434ecb..25a4a9f0 100644 --- a/src/core/libraries/kernel/memory_management.h +++ b/src/core/libraries/kernel/memory_management.h @@ -31,6 +31,14 @@ enum MemoryProtection : u32 { SCE_KERNEL_PROT_GPU_RW = 0x30 // Permit reads/writes from the GPU }; +enum MemoryOpTypes : u32 { + ORBIS_KERNEL_MAP_OP_MAP_DIRECT = 0, + ORBIS_KERNEL_MAP_OP_UNMAP = 1, + ORBIS_KERNEL_MAP_OP_PROTECT = 2, + ORBIS_KERNEL_MAP_OP_MAP_FLEXIBLE = 3, + ORBIS_KERNEL_MAP_OP_TYPE_PROTECT = 4 +}; + struct OrbisQueryInfo { uintptr_t start; uintptr_t end; @@ -100,4 +108,6 @@ s32 PS4_SYSV_ABI sceKernelBatchMap(OrbisKernelBatchMapEntry* entries, int numEnt s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEntries, int* numEntriesOut, int flags); +s32 
PS4_SYSV_ABI sceKernelSetVirtualRangeName(const void* addr, size_t len, const char* name); + } // namespace Libraries::Kernel diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 7c075e87..c5237d0a 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -394,6 +394,18 @@ int PS4_SYSV_ABI scePthreadSetaffinity(ScePthread thread, const /*SceKernelCpuma return result; } +int PS4_SYSV_ABI scePthreadGetaffinity(ScePthread thread, /*SceKernelCpumask*/ u64* mask) { + LOG_INFO(Kernel_Pthread, "called"); + + if (thread == nullptr) { + return SCE_KERNEL_ERROR_ESRCH; + } + + auto result = scePthreadAttrGetaffinity(&thread->attr, mask); + + return result; +} + ScePthreadMutex* createMutex(ScePthreadMutex* addr) { if (addr == nullptr || *addr != nullptr) { return addr; @@ -427,11 +439,7 @@ int PS4_SYSV_ABI scePthreadMutexInit(ScePthreadMutex* mutex, const ScePthreadMut int result = pthread_mutex_init(&(*mutex)->pth_mutex, &(*attr)->pth_mutex_attr); - static auto mutex_loc = MUTEX_LOCATION("mutex"); - (*mutex)->tracy_lock = std::make_unique(&mutex_loc); - if (name != nullptr) { - (*mutex)->tracy_lock->CustomName(name, std::strlen(name)); LOG_INFO(Kernel_Pthread, "name={}, result={}", name, result); } @@ -543,15 +551,11 @@ int PS4_SYSV_ABI scePthreadMutexLock(ScePthreadMutex* mutex) { return SCE_KERNEL_ERROR_EINVAL; } - (*mutex)->tracy_lock->BeforeLock(); - int result = pthread_mutex_lock(&(*mutex)->pth_mutex); if (result != 0) { LOG_TRACE(Kernel_Pthread, "Locked name={}, result={}", (*mutex)->name, result); } - (*mutex)->tracy_lock->AfterLock(); - switch (result) { case 0: return SCE_OK; @@ -577,8 +581,6 @@ int PS4_SYSV_ABI scePthreadMutexUnlock(ScePthreadMutex* mutex) { LOG_TRACE(Kernel_Pthread, "Unlocking name={}, result={}", (*mutex)->name, result); } - (*mutex)->tracy_lock->AfterUnlock(); - switch (result) { case 0: return SCE_OK; @@ -1183,8 +1185,6 
@@ int PS4_SYSV_ABI scePthreadMutexTrylock(ScePthreadMutex* mutex) { LOG_TRACE(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result); } - (*mutex)->tracy_lock->AfterTryLock(result == 0); - switch (result) { case 0: return ORBIS_OK; @@ -1243,6 +1243,40 @@ int PS4_SYSV_ABI posix_pthread_attr_destroy(ScePthreadAttr* attr) { return result; } +int PS4_SYSV_ABI posix_pthread_attr_setschedparam(ScePthreadAttr* attr, + const SceKernelSchedParam* param) { + int result = scePthreadAttrSetschedparam(attr, param); + if (result < 0) { + int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP + ? result + -SCE_KERNEL_ERROR_UNKNOWN + : POSIX_EOTHER; + return rt; + } + return result; +} + +int PS4_SYSV_ABI posix_pthread_attr_setinheritsched(ScePthreadAttr* attr, int inheritSched) { + int result = scePthreadAttrSetinheritsched(attr, inheritSched); + if (result < 0) { + int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP + ? result + -SCE_KERNEL_ERROR_UNKNOWN + : POSIX_EOTHER; + return rt; + } + return result; +} + +int PS4_SYSV_ABI posix_pthread_setprio(ScePthread thread, int prio) { + int result = scePthreadSetprio(thread, prio); + if (result < 0) { + int rt = result > SCE_KERNEL_ERROR_UNKNOWN && result <= SCE_KERNEL_ERROR_ESTOP + ? 
result + -SCE_KERNEL_ERROR_UNKNOWN + : POSIX_EOTHER; + return rt; + } + return result; +} + int PS4_SYSV_ABI posix_pthread_attr_setdetachstate(ScePthreadAttr* attr, int detachstate) { // LOG_INFO(Kernel_Pthread, "posix pthread_mutexattr_init redirect to scePthreadMutexattrInit"); int result = scePthreadAttrSetdetachstate(attr, detachstate); @@ -1336,14 +1370,56 @@ int PS4_SYSV_ABI posix_sem_wait(sem_t* sem) { return sem_wait(sem); } +#ifndef HAVE_SEM_TIMEDWAIT +int sem_timedwait(sem_t* sem, const struct timespec* abstime) { + int rc; + while ((rc = sem_trywait(sem)) == EAGAIN) { + struct timespec curr_time; + clock_gettime(CLOCK_REALTIME, &curr_time); + + s64 remaining_ns = 0; + remaining_ns += + (static_cast(abstime->tv_sec) - static_cast(curr_time.tv_sec)) * 1000000000L; + remaining_ns += static_cast(abstime->tv_nsec) - static_cast(curr_time.tv_nsec); + + if (remaining_ns <= 0) { + return ETIMEDOUT; + } + + struct timespec sleep_time; + sleep_time.tv_sec = 0; + if (remaining_ns < 5000000L) { + sleep_time.tv_nsec = remaining_ns; + } else { + sleep_time.tv_nsec = 5000000; + } + + nanosleep(&sleep_time, nullptr); + } + return rc; +} +#endif + +int PS4_SYSV_ABI posix_sem_timedwait(sem_t* sem, const timespec* t) { + return sem_timedwait(sem, t); +} + int PS4_SYSV_ABI posix_sem_post(sem_t* sem) { return sem_post(sem); } +int PS4_SYSV_ABI posix_sem_destroy(sem_t* sem) { + return sem_destroy(sem); +} + int PS4_SYSV_ABI posix_sem_getvalue(sem_t* sem, int* sval) { return sem_getvalue(sem, sval); } +int PS4_SYSV_ABI posix_pthread_attr_getstacksize(const pthread_attr_t* attr, size_t* size) { + return pthread_attr_getstacksize(attr, size); +} + int PS4_SYSV_ABI scePthreadGetschedparam(ScePthread thread, int* policy, SceKernelSchedParam* param) { return pthread_getschedparam(thread->pth, policy, param); @@ -1403,6 +1479,26 @@ int PS4_SYSV_ABI posix_pthread_condattr_setclock(ScePthreadCondattr* attr, clock return SCE_OK; } +int PS4_SYSV_ABI 
posix_pthread_getschedparam(ScePthread thread, int* policy, + SceKernelSchedParam* param) { + return scePthreadGetschedparam(thread, policy, param); +} + +int PS4_SYSV_ABI posix_pthread_setschedparam(ScePthread thread, int policy, + const SceKernelSchedParam* param) { + return scePthreadSetschedparam(thread, policy, param); +} + +int PS4_SYSV_ABI posix_pthread_attr_getschedpolicy(const ScePthreadAttr* attr, int* policy) { + return scePthreadAttrGetschedpolicy(attr, policy); +} + +int PS4_SYSV_ABI scePthreadRename(ScePthread thread, const char* name) { + thread->name = name; + LOG_INFO(Kernel_Pthread, "scePthreadRename: name = {}", thread->name); + return SCE_OK; +} + void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("lZzFeSxPl08", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_setcancelstate); LIB_FUNCTION("0TyVk4MSLt0", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_cond_init); @@ -1427,6 +1523,7 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("EI-5-jlq2dE", "libkernel", 1, "libkernel", 1, 1, scePthreadGetthreadid); LIB_FUNCTION("1tKyG7RlMJo", "libkernel", 1, "libkernel", 1, 1, scePthreadGetprio); LIB_FUNCTION("W0Hpm2X0uPE", "libkernel", 1, "libkernel", 1, 1, scePthreadSetprio); + LIB_FUNCTION("GBUY7ywdULE", "libkernel", 1, "libkernel", 1, 1, scePthreadRename); LIB_FUNCTION("aI+OeCz8xrQ", "libkernel", 1, "libkernel", 1, 1, scePthreadSelf); LIB_FUNCTION("EotR8a3ASf4", "libkernel", 1, "libkernel", 1, 1, posix_pthread_self); @@ -1442,6 +1539,7 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("OxhIB8LB-PQ", "libkernel", 1, "libkernel", 1, 1, posix_pthread_create); LIB_FUNCTION("OxhIB8LB-PQ", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_create); LIB_FUNCTION("bt3CTBKmGyI", "libkernel", 1, "libkernel", 1, 1, scePthreadSetaffinity); + LIB_FUNCTION("rcrVFJsQWRY", "libkernel", 1, "libkernel", 1, 1, scePthreadGetaffinity); LIB_FUNCTION("6UgtwV+0zb4", "libkernel", 1, 
"libkernel", 1, 1, scePthreadCreate); LIB_FUNCTION("T72hz6ffq08", "libkernel", 1, "libkernel", 1, 1, scePthreadYield); LIB_FUNCTION("B5GmVDKwpn0", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_yield); @@ -1498,6 +1596,8 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("EjllaAqAPZo", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_condattr_setclock); LIB_FUNCTION("Z4QosVuAsA0", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_once); + LIB_FUNCTION("RtLRV-pBTTY", "libScePosix", 1, "libkernel", 1, 1, + posix_pthread_attr_getschedpolicy); // openorbis weird functions LIB_FUNCTION("7H0iTOciTLo", "libkernel", 1, "libkernel", 1, 1, posix_pthread_mutex_lock); @@ -1507,15 +1607,26 @@ void pthreadSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("E+tyo3lp5Lw", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_attr_setdetachstate); LIB_FUNCTION("zHchY8ft5pk", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_attr_destroy); + LIB_FUNCTION("euKRgm0Vn2M", "libScePosix", 1, "libkernel", 1, 1, + posix_pthread_attr_setschedparam); + LIB_FUNCTION("7ZlAakEf0Qg", "libScePosix", 1, "libkernel", 1, 1, + posix_pthread_attr_setinheritsched); + LIB_FUNCTION("a2P9wYGeZvc", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_setprio); LIB_FUNCTION("Jmi+9w9u0E4", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_create_name_np); LIB_FUNCTION("OxhIB8LB-PQ", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_create); LIB_FUNCTION("+U1R4WtXvoc", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_detach); LIB_FUNCTION("CBNtXOoef-E", "libScePosix", 1, "libkernel", 1, 1, posix_sched_get_priority_max); LIB_FUNCTION("m0iS6jNsXds", "libScePosix", 1, "libkernel", 1, 1, posix_sched_get_priority_min); + LIB_FUNCTION("FIs3-UQT9sg", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_getschedparam); + LIB_FUNCTION("Xs9hdiD7sAA", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_setschedparam); LIB_FUNCTION("pDuPEf3m4fI", "libScePosix", 1, 
"libkernel", 1, 1, posix_sem_init); LIB_FUNCTION("YCV5dGGBcCo", "libScePosix", 1, "libkernel", 1, 1, posix_sem_wait); + LIB_FUNCTION("w5IHyvahg-o", "libScePosix", 1, "libkernel", 1, 1, posix_sem_timedwait); LIB_FUNCTION("IKP8typ0QUk", "libScePosix", 1, "libkernel", 1, 1, posix_sem_post); + LIB_FUNCTION("cDW233RAwWo", "libScePosix", 1, "libkernel", 1, 1, posix_sem_destroy); LIB_FUNCTION("Bq+LRV-N6Hk", "libScePosix", 1, "libkernel", 1, 1, posix_sem_getvalue); + LIB_FUNCTION("0qOtCR-ZHck", "libScePosix", 1, "libkernel", 1, 1, + posix_pthread_attr_getstacksize); // libs RwlockSymbolsRegister(sym); SemaphoreSymbolsRegister(sym); diff --git a/src/core/libraries/kernel/thread_management.h b/src/core/libraries/kernel/thread_management.h index 8303c9ef..c5935275 100644 --- a/src/core/libraries/kernel/thread_management.h +++ b/src/core/libraries/kernel/thread_management.h @@ -9,7 +9,6 @@ #include #include #include -#include "common/debug.h" #include "common/types.h" namespace Core::Loader { @@ -74,7 +73,6 @@ struct PthreadMutexInternal { u8 reserved[256]; std::string name; pthread_mutex_t pth_mutex; - std::unique_ptr tracy_lock; }; struct PthreadMutexattrInternal { @@ -169,9 +167,12 @@ ScePthread PS4_SYSV_ABI scePthreadSelf(); int PS4_SYSV_ABI scePthreadAttrSetaffinity(ScePthreadAttr* pattr, const /*SceKernelCpumask*/ u64 mask); int PS4_SYSV_ABI scePthreadSetaffinity(ScePthread thread, const /*SceKernelCpumask*/ u64 mask); +int PS4_SYSV_ABI scePthreadGetaffinity(ScePthread thread, /*SceKernelCpumask*/ u64* mask); int PS4_SYSV_ABI scePthreadCreate(ScePthread* thread, const ScePthreadAttr* attr, PthreadEntryFunc start_routine, void* arg, const char* name); +int PS4_SYSV_ABI scePthreadSetprio(ScePthread thread, int prio); + /*** * Mutex calls */ diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp index bfa6a68d..370dba44 100644 --- a/src/core/libraries/kernel/threads/semaphore.cpp +++ 
b/src/core/libraries/kernel/threads/semaphore.cpp @@ -41,7 +41,6 @@ public: AddWaiter(waiter); // Perform the wait. - std::exchange(lk, std::unique_lock{waiter.mutex}); return waiter.Wait(lk, timeout); } @@ -59,10 +58,9 @@ public: it++; continue; } - std::scoped_lock lk2{waiter.mutex}; + it = wait_list.erase(it); token_count -= waiter.need_count; waiter.cv.notify_one(); - it = wait_list.erase(it); } return true; @@ -84,7 +82,6 @@ public: public: struct WaitingThread : public ListBaseHook { - std::mutex mutex; std::string name; std::condition_variable cv; u32 priority; diff --git a/src/core/libraries/kernel/time_management.cpp b/src/core/libraries/kernel/time_management.cpp index bc1617d3..c4854937 100644 --- a/src/core/libraries/kernel/time_management.cpp +++ b/src/core/libraries/kernel/time_management.cpp @@ -214,6 +214,22 @@ int PS4_SYSV_ABI posix_clock_getres(u32 clock_id, OrbisKernelTimespec* res) { return SCE_KERNEL_ERROR_EINVAL; } +int PS4_SYSV_ABI sceKernelConvertLocaltimeToUtc(time_t param_1, int64_t param_2, time_t* seconds, + OrbisKernelTimezone* timezone, int* dst_seconds) { + LOG_INFO(Kernel, "called"); + if (timezone) { + sceKernelGettimezone(timezone); + param_1 -= (timezone->tz_minuteswest + timezone->tz_dsttime) * 60; + if (seconds) + *seconds = param_1; + if (dst_seconds) + *dst_seconds = timezone->tz_dsttime * 60; + } else { + return SCE_KERNEL_ERROR_EINVAL; + } + return SCE_OK; +} + void timeSymbolsRegister(Core::Loader::SymbolsResolver* sym) { clock = std::make_unique(); initial_ptc = clock->GetUptime(); @@ -239,6 +255,7 @@ void timeSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("lLMT9vJAck0", "libkernel", 1, "libkernel", 1, 1, posix_clock_gettime); LIB_FUNCTION("lLMT9vJAck0", "libScePosix", 1, "libkernel", 1, 1, posix_clock_gettime); LIB_FUNCTION("smIj7eqzZE8", "libScePosix", 1, "libkernel", 1, 1, posix_clock_getres); + LIB_FUNCTION("0NTHN1NKONI", "libkernel", 1, "libkernel", 1, 1, sceKernelConvertLocaltimeToUtc); } } // 
namespace Libraries::Kernel diff --git a/src/core/libraries/libs.cpp b/src/core/libraries/libs.cpp index f9325297..20efd3c0 100644 --- a/src/core/libraries/libs.cpp +++ b/src/core/libraries/libs.cpp @@ -20,6 +20,7 @@ #include "core/libraries/np_trophy/np_trophy.h" #include "core/libraries/pad/pad.h" #include "core/libraries/playgo/playgo.h" +#include "core/libraries/random/random.h" #include "core/libraries/rtc/rtc.h" #include "core/libraries/save_data/savedata.h" #include "core/libraries/screenshot/screenshot.h" @@ -43,8 +44,8 @@ namespace Libraries { void InitHLELibs(Core::Loader::SymbolsResolver* sym) { LOG_INFO(Lib_Kernel, "Initializing HLE libraries"); Libraries::Kernel::LibKernel_Register(sym); - Libraries::VideoOut::RegisterLib(sym); Libraries::GnmDriver::RegisterlibSceGnmDriver(sym); + Libraries::VideoOut::RegisterLib(sym); if (!Config::isLleLibc()) { Libraries::LibC::libcSymbolsRegister(sym); } @@ -71,6 +72,7 @@ void InitHLELibs(Core::Loader::SymbolsResolver* sym) { Libraries::AppContent::RegisterlibSceAppContent(sym); Libraries::PngDec::RegisterlibScePngDec(sym); Libraries::PlayGo::RegisterlibScePlayGo(sym); + Libraries::Random::RegisterlibSceRandom(sym); Libraries::Usbd::RegisterlibSceUsbd(sym); Libraries::Pad::RegisterlibScePad(sym); Libraries::Ajm::RegisterlibSceAjm(sym); diff --git a/src/core/libraries/network/net.cpp b/src/core/libraries/network/net.cpp index 1569a51c..958f9264 100644 --- a/src/core/libraries/network/net.cpp +++ b/src/core/libraries/network/net.cpp @@ -559,7 +559,7 @@ int PS4_SYSV_ABI sceNetEpollDestroy() { } int PS4_SYSV_ABI sceNetEpollWait() { - LOG_ERROR(Lib_Net, "(STUBBED) called"); + LOG_TRACE(Lib_Net, "(STUBBED) called"); return ORBIS_OK; } diff --git a/src/core/libraries/network/netctl.cpp b/src/core/libraries/network/netctl.cpp index ab1cb8ae..a1c8e81c 100644 --- a/src/core/libraries/network/netctl.cpp +++ b/src/core/libraries/network/netctl.cpp @@ -79,7 +79,7 @@ int PS4_SYSV_ABI sceNetCtlUnregisterCallbackV6() { } int 
PS4_SYSV_ABI sceNetCtlCheckCallback() { - LOG_ERROR(Lib_NetCtl, "(STUBBED) called"); + LOG_TRACE(Lib_NetCtl, "(STUBBED) called"); return ORBIS_OK; } diff --git a/src/core/libraries/np_manager/np_manager.cpp b/src/core/libraries/np_manager/np_manager.cpp index ee4b3d6b..33308abc 100644 --- a/src/core/libraries/np_manager/np_manager.cpp +++ b/src/core/libraries/np_manager/np_manager.cpp @@ -870,7 +870,7 @@ int PS4_SYSV_ABI sceNpAsmTerminate() { } int PS4_SYSV_ABI sceNpCheckCallback() { - LOG_ERROR(Lib_NpManager, "(STUBBED) called"); + LOG_TRACE(Lib_NpManager, "(STUBBED) called"); return ORBIS_OK; } @@ -3510,4 +3510,4 @@ void RegisterlibSceNpManager(Core::Loader::SymbolsResolver* sym) { sceNpUnregisterStateCallbackForToolkit); }; -} // namespace Libraries::NpManager \ No newline at end of file +} // namespace Libraries::NpManager diff --git a/src/core/libraries/playgo/playgo.cpp b/src/core/libraries/playgo/playgo.cpp index e029413e..a3af8b4c 100644 --- a/src/core/libraries/playgo/playgo.cpp +++ b/src/core/libraries/playgo/playgo.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include #include "common/logging/log.h" #include "common/singleton.h" #include "core/libraries/error_codes.h" @@ -50,9 +51,16 @@ s32 PS4_SYSV_ABI scePlayGoGetLocus(OrbisPlayGoHandle handle, const OrbisPlayGoCh uint32_t numberOfEntries, OrbisPlayGoLocus* outLoci) { LOG_ERROR(Lib_PlayGo, "(STUBBED)called handle = {}, chunkIds = {}, numberOfEntries = {}", handle, *chunkIds, numberOfEntries); - // assign all now so that scePlayGoGetLocus is not called again for every single entry - std::fill(outLoci, outLoci + numberOfEntries, - OrbisPlayGoLocusValue::ORBIS_PLAYGO_LOCUS_LOCAL_FAST); + + auto* playgo = Common::Singleton::Instance(); + + for (uint32_t i = 0; i < numberOfEntries; i++) { + if (chunkIds[i] <= playgo->GetPlaygoHeader().mchunk_count) { + outLoci[i] = OrbisPlayGoLocusValue::ORBIS_PLAYGO_LOCUS_LOCAL_FAST; + 
} else { + return ORBIS_PLAYGO_ERROR_BAD_CHUNK_ID; + } + } return ORBIS_OK; } @@ -68,7 +76,7 @@ s32 PS4_SYSV_ABI scePlayGoGetProgress(OrbisPlayGoHandle handle, const OrbisPlayG s32 PS4_SYSV_ABI scePlayGoGetToDoList(OrbisPlayGoHandle handle, OrbisPlayGoToDo* outTodoList, u32 numberOfEntries, u32* outEntries) { LOG_ERROR(Lib_PlayGo, "(STUBBED)called"); - if (handle != shadMagic) + if (handle != 1) return ORBIS_PLAYGO_ERROR_BAD_HANDLE; if (outTodoList == nullptr) return ORBIS_PLAYGO_ERROR_BAD_POINTER; @@ -86,7 +94,7 @@ s32 PS4_SYSV_ABI scePlayGoInitialize(OrbisPlayGoInitParams* param) { } s32 PS4_SYSV_ABI scePlayGoOpen(OrbisPlayGoHandle* outHandle, const void* param) { - *outHandle = shadMagic; + *outHandle = 1; LOG_INFO(Lib_PlayGo, "(STUBBED)called"); return ORBIS_OK; } diff --git a/src/core/libraries/random/random.cpp b/src/core/libraries/random/random.cpp new file mode 100644 index 00000000..8147c518 --- /dev/null +++ b/src/core/libraries/random/random.cpp @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/logging/log.h" +#include "core/libraries/error_codes.h" +#include "core/libraries/libs.h" +#include "random.h" + +namespace Libraries::Random { + +s32 PS4_SYSV_ABI sceRandomGetRandomNumber(u8* buf, size_t size) { + LOG_TRACE(Lib_Random, "called"); + if (size > SCE_RANDOM_MAX_SIZE) { + return SCE_RANDOM_ERROR_INVALID; + } + + for (auto i = 0; i < size; ++i) { + buf[i] = std::rand() & 0xFF; + } + return ORBIS_OK; +} + +void RegisterlibSceRandom(Core::Loader::SymbolsResolver* sym) { + LIB_FUNCTION("PI7jIZj4pcE", "libSceRandom", 1, "libSceRandom", 1, 1, sceRandomGetRandomNumber); +}; + +} // namespace Libraries::Random diff --git a/src/core/libraries/random/random.h b/src/core/libraries/random/random.h new file mode 100644 index 00000000..b5f87f87 --- /dev/null +++ b/src/core/libraries/random/random.h @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 
Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once +#include "common/types.h" + +namespace Core::Loader { +class SymbolsResolver; +} + +namespace Libraries::Random { +constexpr int32_t SCE_RANDOM_MAX_SIZE = 64; + +s32 PS4_SYSV_ABI sceRandomGetRandomNumber(u8* buf, size_t size); + +void RegisterlibSceRandom(Core::Loader::SymbolsResolver* sym); +} // namespace Libraries::Random \ No newline at end of file diff --git a/src/core/libraries/save_data/savedata.cpp b/src/core/libraries/save_data/savedata.cpp index db6d0964..64237994 100644 --- a/src/core/libraries/save_data/savedata.cpp +++ b/src/core/libraries/save_data/savedata.cpp @@ -15,7 +15,7 @@ #include "error_codes.h" namespace Libraries::SaveData { - +bool is_rw_mode = false; static constexpr std::string_view g_mount_point = "/savedata0"; // temp mount point (todo) std::string game_serial; @@ -180,27 +180,31 @@ int PS4_SYSV_ABI sceSaveDataDirNameSearch(const OrbisSaveDataDirNameSearchCond* OrbisSaveDataDirNameSearchResult* result) { if (cond == nullptr) return ORBIS_SAVE_DATA_ERROR_PARAMETER; - LOG_ERROR(Lib_SaveData, "TODO sceSaveDataDirNameSearch: Add params"); + LOG_INFO(Lib_SaveData, "called"); const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir) / std::to_string(cond->userId) / game_serial; if (!mount_dir.empty() && std::filesystem::exists(mount_dir)) { - if (cond->dirName == nullptr) { // look for all dirs if no dir is provided. + if (cond->dirName == nullptr || std::string_view(cond->dirName->data) + .empty()) { // look for all dirs if no dir is provided. for (int i = 0; const auto& entry : std::filesystem::directory_iterator(mount_dir)) { - if (std::filesystem::is_directory(entry.path())) { - i++; - result->dirNamesNum = 0; // why is it 1024? is it max? + if (std::filesystem::is_directory(entry.path()) && + entry.path().filename().string() != "sdmemory") { + // sceSaveDataDirNameSearch does not search for dataMemory1/2 dirs. 
// copy dir name to be used by sceSaveDataMount in read mode. strncpy(result->dirNames[i].data, entry.path().filename().string().c_str(), 32); - result->hitNum = i + 1; - result->dirNamesNum = i + 1; // to confirm - result->setNum = i + 1; // to confirm + i++; + result->hitNum = i; + result->dirNamesNum = i; + result->setNum = i; } } } else { // Need a game to test. + LOG_ERROR(Lib_SaveData, "Check Me. sceSaveDataDirNameSearch: dirName = {}", + cond->dirName->data); strncpy(result->dirNames[0].data, cond->dirName->data, 32); result->hitNum = 1; - result->dirNamesNum = 1; // to confirm - result->setNum = 1; // to confirm + result->dirNamesNum = 1; + result->setNum = 1; } } else { result->hitNum = 0; @@ -303,8 +307,51 @@ int PS4_SYSV_ABI sceSaveDataGetMountInfo(const OrbisSaveDataMountPoint* mountPoi return ORBIS_OK; } -int PS4_SYSV_ABI sceSaveDataGetParam() { - LOG_ERROR(Lib_SaveData, "(STUBBED) called"); +int PS4_SYSV_ABI sceSaveDataGetParam(const OrbisSaveDataMountPoint* mountPoint, + const OrbisSaveDataParamType paramType, void* paramBuf, + const size_t paramBufSize, size_t* gotSize) { + + if (mountPoint == nullptr) + return ORBIS_SAVE_DATA_ERROR_PARAMETER; + + auto* mnt = Common::Singleton::Instance(); + const auto mount_dir = mnt->GetHostPath(mountPoint->data); + Common::FS::IOFile file(mount_dir / "param.txt", Common::FS::FileAccessMode::Read); + OrbisSaveDataParam params; + file.Read(params); + + LOG_INFO(Lib_SaveData, "called"); + + switch (paramType) { + case ORBIS_SAVE_DATA_PARAM_TYPE_ALL: { + memcpy(paramBuf, ¶ms, sizeof(OrbisSaveDataParam)); + *gotSize = sizeof(OrbisSaveDataParam); + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_TITLE: { + std::memcpy(paramBuf, ¶ms.title, ORBIS_SAVE_DATA_TITLE_MAXSIZE); + *gotSize = ORBIS_SAVE_DATA_TITLE_MAXSIZE; + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_SUB_TITLE: { + std::memcpy(paramBuf, ¶ms.subTitle, ORBIS_SAVE_DATA_SUBTITLE_MAXSIZE); + *gotSize = ORBIS_SAVE_DATA_SUBTITLE_MAXSIZE; + } break; + case 
ORBIS_SAVE_DATA_PARAM_TYPE_DETAIL: { + std::memcpy(paramBuf, ¶ms.detail, ORBIS_SAVE_DATA_DETAIL_MAXSIZE); + *gotSize = ORBIS_SAVE_DATA_DETAIL_MAXSIZE; + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_USER_PARAM: { + std::memcpy(paramBuf, ¶ms.userParam, sizeof(u32)); + *gotSize = sizeof(u32); + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_MTIME: { + std::memcpy(paramBuf, ¶ms.mtime, sizeof(time_t)); + *gotSize = sizeof(time_t); + } break; + default: { + UNREACHABLE_MSG("Unknown Param = {}", paramType); + } break; + } + return ORBIS_OK; } @@ -321,7 +368,7 @@ int PS4_SYSV_ABI sceSaveDataGetSaveDataCount() { int PS4_SYSV_ABI sceSaveDataGetSaveDataMemory(const u32 userId, void* buf, const size_t bufSize, const int64_t offset) { const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir) / - std::to_string(userId) / game_serial / "save_mem1.sav"; + std::to_string(userId) / game_serial / "sdmemory/save_mem1.sav"; Common::FS::IOFile file(mount_dir, Common::FS::FileAccessMode::Read); if (!file.IsOpen()) { @@ -336,7 +383,7 @@ int PS4_SYSV_ABI sceSaveDataGetSaveDataMemory(const u32 userId, void* buf, const int PS4_SYSV_ABI sceSaveDataGetSaveDataMemory2(OrbisSaveDataMemoryGet2* getParam) { const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir) / - std::to_string(getParam->userId) / game_serial; + std::to_string(getParam->userId) / game_serial / "sdmemory"; if (getParam == nullptr) return ORBIS_SAVE_DATA_ERROR_PARAMETER; if (getParam->data != nullptr) { @@ -443,6 +490,7 @@ s32 saveDataMount(u32 user_id, char* dir_name, u32 mount_mode, case ORBIS_SAVE_DATA_MOUNT_MODE_RDWR: case ORBIS_SAVE_DATA_MOUNT_MODE_RDWR | ORBIS_SAVE_DATA_MOUNT_MODE_DESTRUCT_OFF: case ORBIS_SAVE_DATA_MOUNT_MODE_RDONLY | ORBIS_SAVE_DATA_MOUNT_MODE_DESTRUCT_OFF: { + is_rw_mode = (mount_mode == ORBIS_SAVE_DATA_MOUNT_MODE_RDWR) ? 
true : false; if (!std::filesystem::exists(mount_dir)) { return ORBIS_SAVE_DATA_ERROR_NOT_FOUND; } @@ -460,10 +508,6 @@ s32 saveDataMount(u32 user_id, char* dir_name, u32 mount_mode, case ORBIS_SAVE_DATA_MOUNT_MODE_CREATE | ORBIS_SAVE_DATA_MOUNT_MODE_DESTRUCT_OFF | ORBIS_SAVE_DATA_MOUNT_MODE_COPY_ICON: { if (std::filesystem::exists(mount_dir)) { - g_mount_point.copy(mount_result->mount_point.data, 16); - mnt->Mount(mount_dir, mount_result->mount_point.data); - mount_result->required_blocks = 0; - mount_result->mount_status = 0; return ORBIS_SAVE_DATA_ERROR_EXISTS; } if (std::filesystem::create_directories(mount_dir)) { @@ -483,7 +527,7 @@ s32 saveDataMount(u32 user_id, char* dir_name, u32 mount_mode, mount_result->mount_status = 1; } break; default: - UNREACHABLE(); + UNREACHABLE_MSG("Unknown mount mode = {}", mount_mode); } mount_result->required_blocks = 0; @@ -583,15 +627,46 @@ int PS4_SYSV_ABI sceSaveDataSetEventInfo() { int PS4_SYSV_ABI sceSaveDataSetParam(const OrbisSaveDataMountPoint* mountPoint, OrbisSaveDataParamType paramType, const void* paramBuf, size_t paramBufSize) { - auto* mnt = Common::Singleton::Instance(); - const auto mount_dir = mnt->GetHostPath(mountPoint->data); - LOG_INFO(Lib_SaveData, "called = {}, mountPoint->data = {}", mount_dir.string(), - mountPoint->data); + if (paramBuf == nullptr) + return ORBIS_SAVE_DATA_ERROR_PARAMETER; - if (paramBuf != nullptr) { - Common::FS::IOFile file(mount_dir / "param.txt", Common::FS::FileAccessMode::Write); - file.WriteRaw(paramBuf, paramBufSize); + auto* mnt = Common::Singleton::Instance(); + const auto mount_dir = mnt->GetHostPath(mountPoint->data) / "param.txt"; + OrbisSaveDataParam params; + if (std::filesystem::exists(mount_dir)) { + Common::FS::IOFile file(mount_dir, Common::FS::FileAccessMode::Read); + file.ReadRaw(¶ms, sizeof(OrbisSaveDataParam)); } + + LOG_INFO(Lib_SaveData, "called"); + + switch (paramType) { + case ORBIS_SAVE_DATA_PARAM_TYPE_ALL: { + memcpy(¶ms, paramBuf, 
sizeof(OrbisSaveDataParam)); + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_TITLE: { + strncpy(params.title, static_cast(paramBuf), paramBufSize); + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_SUB_TITLE: { + strncpy(params.subTitle, static_cast(paramBuf), paramBufSize); + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_DETAIL: { + strncpy(params.detail, static_cast(paramBuf), paramBufSize); + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_USER_PARAM: { + params.userParam = *(static_cast(paramBuf)); + } break; + case ORBIS_SAVE_DATA_PARAM_TYPE_MTIME: { + params.mtime = *(static_cast(paramBuf)); + } break; + default: { + UNREACHABLE_MSG("Unknown Param = {}", paramType); + } + } + + Common::FS::IOFile file(mount_dir, Common::FS::FileAccessMode::Write); + file.WriteRaw(¶ms, sizeof(OrbisSaveDataParam)); + return ORBIS_OK; } @@ -604,11 +679,11 @@ int PS4_SYSV_ABI sceSaveDataSetSaveDataMemory(const u32 userId, const void* buf, const size_t bufSize, const int64_t offset) { LOG_INFO(Lib_SaveData, "called"); const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir) / - std::to_string(userId) / game_serial / "save_mem1.sav"; + std::to_string(userId) / game_serial / "sdmemory/save_mem1.sav"; Common::FS::IOFile file(mount_dir, Common::FS::FileAccessMode::Write); file.Seek(offset); - file.WriteRaw((void*)buf, bufSize); + file.WriteRaw(buf, bufSize); return ORBIS_OK; } @@ -616,13 +691,13 @@ int PS4_SYSV_ABI sceSaveDataSetSaveDataMemory(const u32 userId, const void* buf, int PS4_SYSV_ABI sceSaveDataSetSaveDataMemory2(const OrbisSaveDataMemorySet2* setParam) { LOG_INFO(Lib_SaveData, "called: dataNum = {}, slotId= {}", setParam->dataNum, setParam->slotId); const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir) / - std::to_string(setParam->userId) / game_serial; + std::to_string(setParam->userId) / game_serial / "sdmemory"; if (setParam->data != nullptr) { Common::FS::IOFile file(mount_dir / "save_mem2.sav", Common::FS::FileAccessMode::Write); 
if (!file.IsOpen()) return -1; file.Seek(setParam->data->offset); - file.WriteRaw((void*)setParam->data->buf, setParam->data->bufSize); + file.WriteRaw(setParam->data->buf, setParam->data->bufSize); } if (setParam->param != nullptr) { @@ -632,7 +707,7 @@ int PS4_SYSV_ABI sceSaveDataSetSaveDataMemory2(const OrbisSaveDataMemorySet2* se if (setParam->icon != nullptr) { Common::FS::IOFile file(mount_dir / "save_icon.png", Common::FS::FileAccessMode::Write); - file.WriteRaw((void*)setParam->icon->buf, setParam->icon->bufSize); + file.WriteRaw(setParam->icon->buf, setParam->icon->bufSize); } return ORBIS_OK; @@ -644,7 +719,7 @@ int PS4_SYSV_ABI sceSaveDataSetupSaveDataMemory(u32 userId, size_t memorySize, LOG_INFO(Lib_SaveData, "called:userId = {}, memorySize = {}", userId, memorySize); const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir) / - std::to_string(userId) / game_serial; + std::to_string(userId) / game_serial / "sdmemory"; if (std::filesystem::exists(mount_dir)) { return ORBIS_SAVE_DATA_ERROR_EXISTS; @@ -663,7 +738,7 @@ int PS4_SYSV_ABI sceSaveDataSetupSaveDataMemory2(const OrbisSaveDataMemorySetup2 LOG_INFO(Lib_SaveData, "called"); // if (setupParam->option == 1) { // check this later. 
const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir) / - std::to_string(setupParam->userId) / game_serial; + std::to_string(setupParam->userId) / game_serial / "sdmemory"; if (std::filesystem::exists(mount_dir) && std::filesystem::exists(mount_dir / "save_mem2.sav")) { Common::FS::IOFile file(mount_dir / "save_mem2.sav", Common::FS::FileAccessMode::Read); @@ -717,16 +792,17 @@ int PS4_SYSV_ABI sceSaveDataTransferringMount() { } s32 PS4_SYSV_ABI sceSaveDataUmount(const OrbisSaveDataMountPoint* mountPoint) { + LOG_INFO(Lib_SaveData, "mountPoint = {}", std::string(mountPoint->data)); if (std::string(mountPoint->data).empty()) { return ORBIS_SAVE_DATA_ERROR_NOT_MOUNTED; } const auto& mount_dir = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir) / std::to_string(1) / game_serial / mountPoint->data; auto* mnt = Common::Singleton::Instance(); - + const auto& guest_path = mnt->GetHostPath(mountPoint->data); + if (guest_path.empty()) + return ORBIS_SAVE_DATA_ERROR_NOT_MOUNTED; mnt->Unmount(mount_dir, mountPoint->data); - LOG_INFO(Lib_SaveData, "mountPoint = {}", std::string(mountPoint->data)); - return ORBIS_OK; } @@ -736,23 +812,33 @@ int PS4_SYSV_ABI sceSaveDataUmountSys() { } int PS4_SYSV_ABI sceSaveDataUmountWithBackup(const OrbisSaveDataMountPoint* mountPoint) { - LOG_ERROR(Lib_SaveData, "called = {}", std::string(mountPoint->data)); + LOG_INFO(Lib_SaveData, "called mount = {}, is_rw_mode = {}", std::string(mountPoint->data), + is_rw_mode); auto* mnt = Common::Singleton::Instance(); const auto mount_dir = mnt->GetHostPath(mountPoint->data); if (!std::filesystem::exists(mount_dir)) { return ORBIS_SAVE_DATA_ERROR_NOT_FOUND; } + // leave disabled for now. and just unmount. - std::filesystem::create_directories(mount_dir.parent_path() / "backup"); + /* if (is_rw_mode) { // backup is done only when mount mode is ReadWrite. 
+ auto backup_path = mount_dir; + std::string save_data_dir = (mount_dir.string() + "_backup"); + backup_path.replace_filename(save_data_dir); - for (const auto& entry : std::filesystem::recursive_directory_iterator(mount_dir)) { - const auto& path = entry.path(); - const auto target_path = mount_dir.parent_path() / "backup"; - if (std::filesystem::is_regular_file(path)) { - std::filesystem::copy(path, target_path, - std::filesystem::copy_options::overwrite_existing); + std::filesystem::create_directories(backup_path); + + for (const auto& entry : std::filesystem::recursive_directory_iterator(mount_dir)) { + const auto& path = entry.path(); + if (std::filesystem::is_regular_file(path)) { + std::filesystem::copy(path, save_data_dir, + std::filesystem::copy_options::overwrite_existing); + } } - } + }*/ + const auto& guest_path = mnt->GetHostPath(mountPoint->data); + if (guest_path.empty()) + return ORBIS_SAVE_DATA_ERROR_NOT_MOUNTED; mnt->Unmount(mount_dir, mountPoint->data); return ORBIS_OK; diff --git a/src/core/libraries/save_data/savedata.h b/src/core/libraries/save_data/savedata.h index f342d0dd..9b3cf900 100644 --- a/src/core/libraries/save_data/savedata.h +++ b/src/core/libraries/save_data/savedata.h @@ -242,6 +242,13 @@ struct OrbisSaveDataMemorySync { u8 reserved[28]; }; +constexpr int ORBIS_SAVE_DATA_PARAM_TYPE_ALL = 0; +constexpr int ORBIS_SAVE_DATA_PARAM_TYPE_TITLE = 1; +constexpr int ORBIS_SAVE_DATA_PARAM_TYPE_SUB_TITLE = 2; +constexpr int ORBIS_SAVE_DATA_PARAM_TYPE_DETAIL = 3; +constexpr int ORBIS_SAVE_DATA_PARAM_TYPE_USER_PARAM = 4; +constexpr int ORBIS_SAVE_DATA_PARAM_TYPE_MTIME = 5; + int PS4_SYSV_ABI sceSaveDataAbort(); int PS4_SYSV_ABI sceSaveDataBackup(); int PS4_SYSV_ABI sceSaveDataBindPsnAccount(); @@ -291,7 +298,9 @@ int PS4_SYSV_ABI sceSaveDataGetFormat(); int PS4_SYSV_ABI sceSaveDataGetMountedSaveDataCount(); int PS4_SYSV_ABI sceSaveDataGetMountInfo(const OrbisSaveDataMountPoint* mountPoint, OrbisSaveDataMountInfo* info); -int PS4_SYSV_ABI 
sceSaveDataGetParam(); +int PS4_SYSV_ABI sceSaveDataGetParam(const OrbisSaveDataMountPoint* mountPoint, + const OrbisSaveDataParamType paramType, void* paramBuf, + const size_t paramBufSize, size_t* gotSize); int PS4_SYSV_ABI sceSaveDataGetProgress(); int PS4_SYSV_ABI sceSaveDataGetSaveDataCount(); int PS4_SYSV_ABI sceSaveDataGetSaveDataMemory(const u32 userId, void* buf, const size_t bufSize, diff --git a/src/core/libraries/videoout/driver.cpp b/src/core/libraries/videoout/driver.cpp index e74fb10f..97b1816e 100644 --- a/src/core/libraries/videoout/driver.cpp +++ b/src/core/libraries/videoout/driver.cpp @@ -3,14 +3,16 @@ #include #include "common/assert.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/thread.h" #include "core/libraries/error_codes.h" #include "core/libraries/kernel/time_management.h" #include "core/libraries/videoout/driver.h" -#include "core/platform.h" - #include "video_core/renderer_vulkan/renderer_vulkan.h" extern std::unique_ptr renderer; +extern std::unique_ptr liverpool; namespace Libraries::VideoOut { @@ -41,20 +43,18 @@ VideoOutDriver::VideoOutDriver(u32 width, u32 height) { main_port.resolution.fullHeight = height; main_port.resolution.paneWidth = width; main_port.resolution.paneHeight = height; + present_thread = std::jthread([&](std::stop_token token) { PresentThread(token); }); } VideoOutDriver::~VideoOutDriver() = default; int VideoOutDriver::Open(const ServiceThreadParams* params) { - std::scoped_lock lock{mutex}; - if (main_port.is_open) { return ORBIS_VIDEO_OUT_ERROR_RESOURCE_BUSY; } - - int handle = 1; main_port.is_open = true; - return handle; + liverpool->SetVoPort(&main_port); + return 1; } void VideoOutDriver::Close(s32 handle) { @@ -158,31 +158,22 @@ int VideoOutDriver::UnregisterBuffers(VideoOutPort* port, s32 attributeIndex) { return ORBIS_OK; } -void VideoOutDriver::Flip(std::chrono::microseconds timeout) { - Request req; - { - std::unique_lock lock{mutex}; - submit_cond.wait_for(lock, timeout, 
[&] { return !requests.empty(); }); - if (requests.empty()) { - renderer->ShowSplash(); - return; - } - - // Retrieve the request. - req = requests.front(); - requests.pop(); +std::chrono::microseconds VideoOutDriver::Flip(const Request& req) { + if (!req) { + return std::chrono::microseconds{0}; } + const auto start = std::chrono::high_resolution_clock::now(); + // Whatever the game is rendering show splash if it is active if (!renderer->ShowSplash(req.frame)) { // Present the frame. renderer->Present(req.frame); } - std::scoped_lock lock{mutex}; - // Update flip status. - auto& flip_status = req.port->flip_status; + auto* port = req.port; + auto& flip_status = port->flip_status; flip_status.count++; flip_status.processTime = Libraries::Kernel::sceKernelGetProcessTime(); flip_status.tsc = Libraries::Kernel::sceKernelReadTsc(); @@ -192,7 +183,7 @@ void VideoOutDriver::Flip(std::chrono::microseconds timeout) { flip_status.flipPendingNum = static_cast(requests.size()); // Trigger flip events for the port. 
- for (auto& event : req.port->flip_events) { + for (auto& event : port->flip_events) { if (event != nullptr) { event->TriggerEvent(SCE_VIDEO_OUT_EVENT_FLIP, Kernel::SceKernelEvent::Filter::VideoOut, reinterpret_cast(req.flip_arg)); @@ -201,21 +192,23 @@ void VideoOutDriver::Flip(std::chrono::microseconds timeout) { // Reset flip label if (req.index != -1) { - req.port->buffer_labels[req.index] = 0; + port->buffer_labels[req.index] = 0; + port->SignalVoLabel(); } + + const auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(end - start); } bool VideoOutDriver::SubmitFlip(VideoOutPort* port, s32 index, s64 flip_arg, bool is_eop /*= false*/) { - std::scoped_lock lock{mutex}; - Vulkan::Frame* frame; if (index == -1) { frame = renderer->PrepareBlankFrame(); } else { const auto& buffer = port->buffer_slots[index]; const auto& group = port->groups[buffer.group_index]; - frame = renderer->PrepareFrame(group, buffer.address_left); + frame = renderer->PrepareFrame(group, buffer.address_left, is_eop); } if (index != -1 && requests.size() >= port->NumRegisteredBuffers()) { @@ -223,6 +216,7 @@ bool VideoOutDriver::SubmitFlip(VideoOutPort* port, s32 index, s64 flip_arg, return false; } + std::scoped_lock lock{mutex}; requests.push({ .frame = frame, .port = port, @@ -234,24 +228,53 @@ bool VideoOutDriver::SubmitFlip(VideoOutPort* port, s32 index, s64 flip_arg, port->flip_status.flipPendingNum = static_cast(requests.size()); port->flip_status.gcQueueNum = 0; - submit_cond.notify_one(); return true; } -void VideoOutDriver::Vblank() { - std::scoped_lock lock{mutex}; +void VideoOutDriver::PresentThread(std::stop_token token) { + static constexpr std::chrono::milliseconds VblankPeriod{16}; + Common::SetCurrentThreadName("PresentThread"); - auto& vblank_status = main_port.vblank_status; - vblank_status.count++; - vblank_status.processTime = Libraries::Kernel::sceKernelGetProcessTime(); - vblank_status.tsc = Libraries::Kernel::sceKernelReadTsc(); + 
const auto receive_request = [this] -> Request { + std::scoped_lock lk{mutex}; + if (!requests.empty()) { + const auto request = requests.front(); + requests.pop(); + return request; + } + return {}; + }; - // Trigger flip events for the port. - for (auto& event : main_port.vblank_events) { - if (event != nullptr) { - event->TriggerEvent(SCE_VIDEO_OUT_EVENT_VBLANK, - Kernel::SceKernelEvent::Filter::VideoOut, nullptr); + auto vblank_period = VblankPeriod / Config::vblankDiv(); + auto delay = std::chrono::microseconds{0}; + while (!token.stop_requested()) { + // Sleep for most of the vblank duration. + std::this_thread::sleep_for(vblank_period - delay); + + // Check if it's time to take a request. + auto& vblank_status = main_port.vblank_status; + if (vblank_status.count % (main_port.flip_rate + 1) == 0) { + const auto request = receive_request(); + delay = Flip(request); + FRAME_END; + } + + { + // Needs lock here as can be concurrently read by `sceVideoOutGetVblankStatus` + std::unique_lock lock{main_port.vo_mutex}; + vblank_status.count++; + vblank_status.processTime = Libraries::Kernel::sceKernelGetProcessTime(); + vblank_status.tsc = Libraries::Kernel::sceKernelReadTsc(); + main_port.vblank_cv.notify_all(); + } + + // Trigger flip events for the port. 
+ for (auto& event : main_port.vblank_events) { + if (event != nullptr) { + event->TriggerEvent(SCE_VIDEO_OUT_EVENT_VBLANK, + Kernel::SceKernelEvent::Filter::VideoOut, nullptr); + } } } } diff --git a/src/core/libraries/videoout/driver.h b/src/core/libraries/videoout/driver.h index d98e62ee..104056de 100644 --- a/src/core/libraries/videoout/driver.h +++ b/src/core/libraries/videoout/driver.h @@ -3,10 +3,13 @@ #pragma once +#include "common/debug.h" +#include "common/polyfill_thread.h" +#include "core/libraries/videoout/video_out.h" + #include #include #include -#include "core/libraries/videoout/video_out.h" namespace Vulkan { struct Frame; @@ -25,6 +28,9 @@ struct VideoOutPort { SceVideoOutVblankStatus vblank_status; std::vector flip_events; std::vector vblank_events; + std::mutex vo_mutex; + std::condition_variable vo_cv; + std::condition_variable vblank_cv; int flip_rate = 0; s32 FindFreeGroup() const { @@ -35,6 +41,22 @@ struct VideoOutPort { return index; } + bool IsVoLabel(const u64* address) const { + const u64* start = &buffer_labels[0]; + const u64* end = &buffer_labels[MaxDisplayBuffers - 1]; + return address >= start && address <= end; + } + + void WaitVoLabel(auto&& pred) { + std::unique_lock lk{vo_mutex}; + vo_cv.wait(lk, pred); + } + + void SignalVoLabel() { + std::scoped_lock lk{vo_mutex}; + vo_cv.notify_one(); + } + [[nodiscard]] int NumRegisteredBuffers() const { return std::count_if(buffer_slots.cbegin(), buffer_slots.cend(), [](auto& buffer) { return buffer.group_index != -1; }); @@ -63,11 +85,8 @@ public: const BufferAttribute* attribute); int UnregisterBuffers(VideoOutPort* port, s32 attributeIndex); - void Flip(std::chrono::microseconds timeout); bool SubmitFlip(VideoOutPort* port, s32 index, s64 flip_arg, bool is_eop = false); - void Vblank(); - private: struct Request { Vulkan::Frame* frame; @@ -76,14 +95,19 @@ private: s64 flip_arg; u64 submit_tsc; bool eop; + + operator bool() const noexcept { + return frame != nullptr; + } }; + 
std::chrono::microseconds Flip(const Request& req); + void PresentThread(std::stop_token token); + std::mutex mutex; VideoOutPort main_port{}; - std::condition_variable_any submit_cond; - std::condition_variable done_cond; + std::jthread present_thread; std::queue requests; - bool is_neo{}; }; } // namespace Libraries::VideoOut diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp index 8fbd69c4..15e14662 100644 --- a/src/core/libraries/videoout/video_out.cpp +++ b/src/core/libraries/videoout/video_out.cpp @@ -183,6 +183,7 @@ s32 PS4_SYSV_ABI sceVideoOutGetVblankStatus(int handle, SceVideoOutVblankStatus* return ORBIS_VIDEO_OUT_ERROR_INVALID_HANDLE; } + std::unique_lock lock{port->vo_mutex}; *status = port->vblank_status; return ORBIS_OK; } @@ -229,14 +230,6 @@ s32 PS4_SYSV_ABI sceVideoOutUnregisterBuffers(s32 handle, s32 attributeIndex) { return driver->UnregisterBuffers(port, attributeIndex); } -void Flip(std::chrono::microseconds micros) { - return driver->Flip(micros); -} - -void Vblank() { - return driver->Vblank(); -} - void sceVideoOutGetBufferLabelAddress(s32 handle, uintptr_t* label_addr) { auto* port = driver->GetPort(handle); ASSERT(port); @@ -266,6 +259,18 @@ s32 PS4_SYSV_ABI sceVideoOutGetDeviceCapabilityInfo( return ORBIS_OK; } +s32 PS4_SYSV_ABI sceVideoOutWaitVblank(s32 handle) { + auto* port = driver->GetPort(handle); + if (!port) { + return ORBIS_VIDEO_OUT_ERROR_INVALID_HANDLE; + } + + std::unique_lock lock{port->vo_mutex}; + const auto prev_counter = port->vblank_status.count; + port->vblank_cv.wait(lock, [&]() { return prev_counter != port->vblank_status.count; }); + return ORBIS_OK; +} + void RegisterLib(Core::Loader::SymbolsResolver* sym) { driver = std::make_unique(Config::getScreenWidth(), Config::getScreenHeight()); @@ -294,6 +299,7 @@ void RegisterLib(Core::Loader::SymbolsResolver* sym) { sceVideoOutGetVblankStatus); LIB_FUNCTION("kGVLc3htQE8", "libSceVideoOut", 1, "libSceVideoOut", 0, 0, 
sceVideoOutGetDeviceCapabilityInfo); + LIB_FUNCTION("j6RaAUlaLv0", "libSceVideoOut", 1, "libSceVideoOut", 0, 0, sceVideoOutWaitVblank); // openOrbis appears to have libSceVideoOut_v1 module libSceVideoOut_v1.1 LIB_FUNCTION("Up36PTk687E", "libSceVideoOut", 1, "libSceVideoOut", 1, 1, sceVideoOutOpen); diff --git a/src/core/libraries/videoout/video_out.h b/src/core/libraries/videoout/video_out.h index 52426ecc..b4423efd 100644 --- a/src/core/libraries/videoout/video_out.h +++ b/src/core/libraries/videoout/video_out.h @@ -92,11 +92,12 @@ void PS4_SYSV_ABI sceVideoOutSetBufferAttribute(BufferAttribute* attribute, Pixe u32 tilingMode, u32 aspectRatio, u32 width, u32 height, u32 pitchInPixel); s32 PS4_SYSV_ABI sceVideoOutAddFlipEvent(Kernel::SceKernelEqueue eq, s32 handle, void* udata); -s32 PS4_SYSV_ABI sceVideoOutAddVBlankEvent(Kernel::SceKernelEqueue eq, s32 handle, void* udata); +s32 PS4_SYSV_ABI sceVideoOutAddVblankEvent(Kernel::SceKernelEqueue eq, s32 handle, void* udata); s32 PS4_SYSV_ABI sceVideoOutRegisterBuffers(s32 handle, s32 startIndex, void* const* addresses, s32 bufferNum, const BufferAttribute* attribute); s32 PS4_SYSV_ABI sceVideoOutSetFlipRate(s32 handle, s32 rate); s32 PS4_SYSV_ABI sceVideoOutIsFlipPending(s32 handle); +s32 PS4_SYSV_ABI sceVideoOutWaitVblank(s32 handle); s32 PS4_SYSV_ABI sceVideoOutSubmitFlip(s32 handle, s32 bufferIndex, s32 flipMode, s64 flipArg); s32 PS4_SYSV_ABI sceVideoOutGetFlipStatus(s32 handle, FlipStatus* status); s32 PS4_SYSV_ABI sceVideoOutGetResolutionStatus(s32 handle, SceVideoOutResolutionStatus* status); @@ -104,9 +105,6 @@ s32 PS4_SYSV_ABI sceVideoOutOpen(SceUserServiceUserId userId, s32 busType, s32 i const void* param); s32 PS4_SYSV_ABI sceVideoOutClose(s32 handle); -void Flip(std::chrono::microseconds micros); -void Vblank(); - // Internal system functions void sceVideoOutGetBufferLabelAddress(s32 handle, uintptr_t* label_addr); s32 sceVideoOutSubmitEopFlip(s32 handle, u32 buf_id, u32 mode, u32 arg, void** unk); diff 
--git a/src/core/memory.cpp b/src/core/memory.cpp index ba6c6f6a..28a348d4 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -193,7 +193,7 @@ int MemoryManager::MapFile(void** out_addr, VAddr virtual_addr, size_t size, Mem // Find first free area to map the file. if (False(flags & MemoryMapFlags::Fixed)) { - mapped_addr = SearchFree(mapped_addr, size_aligned); + mapped_addr = SearchFree(mapped_addr, size_aligned, 1); } if (True(flags & MemoryMapFlags::Fixed)) { @@ -240,6 +240,7 @@ void MemoryManager::UnmapMemory(VAddr virtual_addr, size_t size) { vma.prot = MemoryProt::NoAccess; vma.phys_base = 0; vma.disallow_merge = false; + vma.name = ""; MergeAdjacent(vma_map, new_it); // Unmap the memory region. @@ -334,6 +335,7 @@ int MemoryManager::VirtualQuery(VAddr addr, int flags, info->is_flexible.Assign(vma.type == VMAType::Flexible); info->is_direct.Assign(vma.type == VMAType::Direct); info->is_commited.Assign(vma.type != VMAType::Free); + vma.name.copy(info->name.data(), std::min(info->name.size(), vma.name.size())); if (vma.type == VMAType::Direct) { const auto dmem_it = FindDmemArea(vma.phys_base); ASSERT(dmem_it != dmem_map.end()); @@ -392,8 +394,23 @@ std::pair MemoryManager::GetVulkanBuffer(VAddr addr) { return std::make_pair(*it->second.buffer, addr - it->first); } -VAddr MemoryManager::SearchFree(VAddr virtual_addr, size_t size, u32 alignment) { +void MemoryManager::NameVirtualRange(VAddr virtual_addr, size_t size, std::string_view name) { auto it = FindVMA(virtual_addr); + + ASSERT_MSG(it->second.Contains(virtual_addr, size), + "Range provided is not fully containted in vma"); + it->second.name = name; +} +VAddr MemoryManager::SearchFree(VAddr virtual_addr, size_t size, u32 alignment) { + // If the requested address is below the mapped range, start search from the lowest address + auto min_search_address = impl.SystemManagedVirtualBase(); + if (virtual_addr < min_search_address) { + virtual_addr = min_search_address; + } + + auto it = 
FindVMA(virtual_addr); + ASSERT_MSG(it != vma_map.end(), "Specified mapping address was not found!"); + // If the VMA is free and contains the requested mapping we are done. if (it->second.IsFree() && it->second.Contains(virtual_addr, size)) { return virtual_addr; diff --git a/src/core/memory.h b/src/core/memory.h index 0122deed..0a9641d3 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -179,6 +179,8 @@ public: int GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut, void** directMemoryStartOut, void** directMemoryEndOut); + void NameVirtualRange(VAddr virtual_addr, size_t size, std::string_view name); + private: VMAHandle FindVMA(VAddr target) { return std::prev(vma_map.upper_bound(target)); diff --git a/src/emulator.cpp b/src/emulator.cpp index 47ac57ac..a34ee359 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -1,29 +1,33 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include -#include -#include -#include -#include -#include -#include #include + #include "common/config.h" #include "common/debug.h" #include "common/logging/backend.h" +#include "common/logging/log.h" #include "common/ntapi.h" #include "common/path_util.h" #include "common/polyfill_thread.h" #include "common/singleton.h" #include "common/version.h" +#include "core/file_format/playgo_chunk.h" +#include "core/file_format/psf.h" +#include "core/file_format/splash.h" #include "core/file_sys/fs.h" +#include "core/libraries/disc_map/disc_map.h" #include "core/libraries/kernel/thread_management.h" +#include "core/libraries/libc/libc.h" +#include "core/libraries/libc_internal/libc_internal.h" #include "core/libraries/libs.h" +#include "core/libraries/rtc/rtc.h" +#include "core/libraries/videoout/video_out.h" #include "core/linker.h" #include "core/memory.h" #include "emulator.h" +#include "src/common/scm_rev.h" +#include "video_core/renderdoc.h" Frontend::WindowSDL* g_window = nullptr; @@ -46,14 +50,17 @@ 
Emulator::Emulator() { Common::Log::Initialize(); Common::Log::Start(); LOG_INFO(Loader, "Starting shadps4 emulator v{} ", Common::VERSION); + LOG_INFO(Loader, "Revision {}", Common::g_scm_rev); + LOG_INFO(Loader, "Branch {}", Common::g_scm_branch); + LOG_INFO(Loader, "Description {}", Common::g_scm_desc); // Defer until after logging is initialized. memory = Core::Memory::Instance(); controller = Common::Singleton::Instance(); linker = Common::Singleton::Instance(); - window = std::make_unique(WindowWidth, WindowHeight, controller); - g_window = window.get(); + // Load renderdoc module. + VideoCore::LoadRenderDoc(); } Emulator::~Emulator() { @@ -68,6 +75,8 @@ void Emulator::Run(const std::filesystem::path& file) { // Loading param.sfo file if exists std::string id; + std::string title; + std::string app_version; std::filesystem::path sce_sys_folder = file.parent_path() / "sce_sys"; if (std::filesystem::is_directory(sce_sys_folder)) { for (const auto& entry : std::filesystem::directory_iterator(sce_sys_folder)) { @@ -75,11 +84,14 @@ void Emulator::Run(const std::filesystem::path& file) { auto* param_sfo = Common::Singleton::Instance(); param_sfo->open(sce_sys_folder.string() + "/param.sfo", {}); id = std::string(param_sfo->GetString("CONTENT_ID"), 7, 9); - std::string title(param_sfo->GetString("TITLE")); + title = param_sfo->GetString("TITLE"); LOG_INFO(Loader, "Game id: {} Title: {}", id, title); u32 fw_version = param_sfo->GetInteger("SYSTEM_VER"); - std::string app_version = param_sfo->GetString("APP_VER"); + app_version = param_sfo->GetString("APP_VER"); LOG_INFO(Loader, "Fw: {:#x} App Version: {}", fw_version, app_version); + } else if (entry.path().filename() == "playgo-chunk.dat") { + auto* playgo = Common::Singleton::Instance(); + playgo->Open(sce_sys_folder.string() + "/playgo-chunk.dat"); } else if (entry.path().filename() == "pic0.png" || entry.path().filename() == "pic1.png") { auto* splash = Common::Singleton::Instance(); @@ -93,6 +105,19 @@ void 
Emulator::Run(const std::filesystem::path& file) { } } + std::string game_title = fmt::format("{} - {} <{}>", id, title, app_version); + std::string window_title = ""; + if (Common::isRelease) { + window_title = fmt::format("shadPS4 v{} | {}", Common::VERSION, game_title); + } else { + window_title = + fmt::format("shadPS4 v{} {} | {}", Common::VERSION, Common::g_scm_desc, game_title); + } + window = + std::make_unique(WindowWidth, WindowHeight, controller, window_title); + + g_window = window.get(); + const auto& mount_data_dir = Common::FS::GetUserPath(Common::FS::PathType::GameDataDir) / id; if (!std::filesystem::exists(mount_data_dir)) { std::filesystem::create_directory(mount_data_dir); @@ -104,6 +129,19 @@ void Emulator::Run(const std::filesystem::path& file) { } mnt->Mount(mount_temp_dir, "/temp0"); // called in app_content ==> stat/mkdir + const auto& mount_download_dir = + Common::FS::GetUserPath(Common::FS::PathType::DownloadDir) / id; + if (!std::filesystem::exists(mount_download_dir)) { + std::filesystem::create_directory(mount_download_dir); + } + mnt->Mount(mount_download_dir, "/download0"); + + const auto& mount_captures_dir = Common::FS::GetUserPath(Common::FS::PathType::CapturesDir); + if (!std::filesystem::exists(mount_captures_dir)) { + std::filesystem::create_directory(mount_captures_dir); + } + VideoCore::SetOutputDir(mount_captures_dir.generic_string(), id); + // Initialize kernel and library facilities. 
Libraries::Kernel::init_pthreads(); Libraries::InitHLELibs(&linker->GetHLESymbols()); @@ -136,14 +174,8 @@ void Emulator::Run(const std::filesystem::path& file) { std::jthread mainthread = std::jthread([this](std::stop_token stop_token) { linker->Execute(); }); - // Begin main window loop until the application exits - static constexpr std::chrono::milliseconds FlipPeriod{16}; - while (window->isOpen()) { window->waitEvent(); - Libraries::VideoOut::Flip(FlipPeriod); - Libraries::VideoOut::Vblank(); - FRAME_END; } std::exit(0); diff --git a/src/emulator.h b/src/emulator.h index 323170e3..01bce7e7 100644 --- a/src/emulator.h +++ b/src/emulator.h @@ -6,7 +6,7 @@ #include #include -#include +#include "common/singleton.h" #include "core/linker.h" #include "input/controller.h" #include "sdl_window.h" diff --git a/src/images/shadPS4.icns b/src/images/shadPS4.icns new file mode 100644 index 00000000..0e26368d Binary files /dev/null and b/src/images/shadPS4.icns differ diff --git a/src/images/shadps4.ico b/src/images/shadps4.ico index 4f71b234..bb50f999 100644 Binary files a/src/images/shadps4.ico and b/src/images/shadps4.ico differ diff --git a/src/input/controller.cpp b/src/input/controller.cpp index 7bfecadc..247e08ce 100644 --- a/src/input/controller.cpp +++ b/src/input/controller.cpp @@ -4,6 +4,7 @@ #include "core/libraries/kernel/time_management.h" #include "core/libraries/pad/pad.h" #include "input/controller.h" + namespace Input { GameController::GameController() { diff --git a/src/main.cpp b/src/main.cpp index c7210ac5..9df14f13 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include #include +#include "emulator.h" int main(int argc, char* argv[]) { if (argc == 1) { diff --git a/src/qt_gui/elf_viewer.cpp b/src/qt_gui/elf_viewer.cpp index 1674e1ab..72861d15 100644 --- a/src/qt_gui/elf_viewer.cpp +++ b/src/qt_gui/elf_viewer.cpp @@ -2,7 +2,9 @@ 
// SPDX-License-Identifier: GPL-2.0-or-later #include + #include "elf_viewer.h" + ElfViewer::ElfViewer(QWidget* parent) : QTableWidget(parent) { dir_list_std = Config::getElfViewer(); for (const auto& str : dir_list_std) { diff --git a/src/qt_gui/elf_viewer.h b/src/qt_gui/elf_viewer.h index 15aeb55f..a3b85223 100644 --- a/src/qt_gui/elf_viewer.h +++ b/src/qt_gui/elf_viewer.h @@ -14,8 +14,9 @@ #include #include #include + +#include "core/loader/elf.h" #include "game_list_frame.h" -#include "src/core/loader/elf.h" class ElfViewer : public QTableWidget { Q_OBJECT diff --git a/src/qt_gui/game_grid_frame.h b/src/qt_gui/game_grid_frame.h index 19ac531b..ce775315 100644 --- a/src/qt_gui/game_grid_frame.h +++ b/src/qt_gui/game_grid_frame.h @@ -14,6 +14,7 @@ #include #include #include + #include "common/config.h" #include "game_info.h" #include "game_list_utils.h" diff --git a/src/qt_gui/game_info.cpp b/src/qt_gui/game_info.cpp index 39cdeb75..0a472eae 100644 --- a/src/qt_gui/game_info.cpp +++ b/src/qt_gui/game_info.cpp @@ -5,6 +5,7 @@ #include #include #include + #include "game_info.h" GameInfoClass::GameInfoClass() = default; @@ -42,4 +43,4 @@ void GameInfoClass::GetGameInfo(QWidget* parent) { &QProgressDialog::setValue); dialog.exec(); -} \ No newline at end of file +} diff --git a/src/qt_gui/game_info.h b/src/qt_gui/game_info.h index c137a5a6..b2b102e0 100644 --- a/src/qt_gui/game_info.h +++ b/src/qt_gui/game_info.h @@ -7,6 +7,7 @@ #include #include #include + #include "common/config.h" #include "core/file_format/psf.h" #include "game_list_utils.h" diff --git a/src/qt_gui/game_install_dialog.cpp b/src/qt_gui/game_install_dialog.cpp index ab4fc273..4b2b8528 100644 --- a/src/qt_gui/game_install_dialog.cpp +++ b/src/qt_gui/game_install_dialog.cpp @@ -1,8 +1,6 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include "game_install_dialog.h" - #include #include #include @@ -14,6 +12,8 @@ #include #include 
+#include "game_install_dialog.h" + GameInstallDialog::GameInstallDialog() : m_gamesDirectory(nullptr) { auto layout = new QVBoxLayout(this); @@ -21,8 +21,8 @@ GameInstallDialog::GameInstallDialog() : m_gamesDirectory(nullptr) { layout->addStretch(); layout->addWidget(SetupDialogActions()); - setWindowTitle("Shadps4 - Choose directory"); - setWindowIcon(QIcon(":/images/shadps4.ico")); + setWindowTitle("shadPS4 - Choose directory"); + setWindowIcon(QIcon(":images/shadps4.ico")); } GameInstallDialog::~GameInstallDialog() {} @@ -47,7 +47,7 @@ QWidget* GameInstallDialog::SetupGamesDirectory() { layout->addWidget(m_gamesDirectory); // Browse button. - auto browse = new QPushButton("..."); + auto browse = new QPushButton("Browse"); connect(browse, &QPushButton::clicked, this, &GameInstallDialog::Browse); diff --git a/src/qt_gui/game_install_dialog.h b/src/qt_gui/game_install_dialog.h index abd447d9..6f439e81 100644 --- a/src/qt_gui/game_install_dialog.h +++ b/src/qt_gui/game_install_dialog.h @@ -4,6 +4,7 @@ #pragma once #include + #include "common/config.h" #include "common/path_util.h" diff --git a/src/qt_gui/game_list_frame.cpp b/src/qt_gui/game_list_frame.cpp index 4353964f..2699c961 100644 --- a/src/qt_gui/game_list_frame.cpp +++ b/src/qt_gui/game_list_frame.cpp @@ -190,17 +190,17 @@ void GameListFrame::SetRegionFlag(int row, int column, QString itemStr) { QTableWidgetItem* item = new QTableWidgetItem(); QImage scaledPixmap; if (itemStr == "Japan") { - scaledPixmap = QImage(":/images/flag_jp.png"); + scaledPixmap = QImage(":images/flag_jp.png"); } else if (itemStr == "Europe") { - scaledPixmap = QImage(":/images/flag_eu.png"); + scaledPixmap = QImage(":images/flag_eu.png"); } else if (itemStr == "USA") { - scaledPixmap = QImage(":/images/flag_us.png"); + scaledPixmap = QImage(":images/flag_us.png"); } else if (itemStr == "Asia") { - scaledPixmap = QImage(":/images/flag_china.png"); + scaledPixmap = QImage(":images/flag_china.png"); } else if (itemStr == "World") { - 
scaledPixmap = QImage(":/images/flag_world.png"); + scaledPixmap = QImage(":images/flag_world.png"); } else { - scaledPixmap = QImage(":/images/flag_unk.png"); + scaledPixmap = QImage(":images/flag_unk.png"); } QWidget* widget = new QWidget(this); QVBoxLayout* layout = new QVBoxLayout(widget); diff --git a/src/qt_gui/game_list_frame.h b/src/qt_gui/game_list_frame.h index e9f75afd..d8bccf46 100644 --- a/src/qt_gui/game_list_frame.h +++ b/src/qt_gui/game_list_frame.h @@ -16,6 +16,7 @@ #include #include #include + #include "game_info.h" #include "game_list_utils.h" #include "gui_context_menus.h" diff --git a/src/qt_gui/gui_context_menus.h b/src/qt_gui/gui_context_menus.h index 9f895360..146d5c34 100644 --- a/src/qt_gui/gui_context_menus.h +++ b/src/qt_gui/gui_context_menus.h @@ -165,12 +165,12 @@ public: if (createShortcutLinux(linkPath, ebootPath, iconPath)) { #endif QMessageBox::information( - nullptr, "Shortcut Creation", - QString("Shortcut created successfully:\n %1").arg(linkPath)); + nullptr, "Shortcut creation", + QString("Shortcut created successfully!\n %1").arg(linkPath)); } else { QMessageBox::critical( nullptr, "Error", - QString("Error creating shortcut:\n %1").arg(linkPath)); + QString("Error creating shortcut!\n %1").arg(linkPath)); } } else { QMessageBox::critical(nullptr, "Error", "Failed to convert icon."); @@ -183,11 +183,11 @@ public: if (createShortcutLinux(linkPath, ebootPath, iconPath)) { #endif QMessageBox::information( - nullptr, "Shortcut Creation", - QString("Shortcut created successfully:\n %1").arg(linkPath)); + nullptr, "Shortcut creation", + QString("Shortcut created successfully!\n %1").arg(linkPath)); } else { QMessageBox::critical(nullptr, "Error", - QString("Error creating shortcut:\n %1").arg(linkPath)); + QString("Error creating shortcut!\n %1").arg(linkPath)); } } } @@ -308,7 +308,7 @@ private: QFile shortcutFile(linkPath); if (!shortcutFile.open(QIODevice::WriteOnly | QIODevice::Text)) { QMessageBox::critical(nullptr, "Error", - 
QString("Error creating shortcut:\n %1").arg(linkPath)); + QString("Error creating shortcut!\n %1").arg(linkPath)); return false; } diff --git a/src/qt_gui/main.cpp b/src/qt_gui/main.cpp index ea3f27f8..15a06c86 100644 --- a/src/qt_gui/main.cpp +++ b/src/qt_gui/main.cpp @@ -2,15 +2,14 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include "common/config.h" #include "core/file_sys/fs.h" +#include "emulator.h" #include "qt_gui/game_install_dialog.h" #include "qt_gui/main_window.h" -#include -#include - // Custom message handler to ignore Qt logs void customMessageHandler(QtMsgType, const QMessageLogContext&, const QString&) {} diff --git a/src/qt_gui/main_window.cpp b/src/qt_gui/main_window.cpp index 3f325ff4..b3778be0 100644 --- a/src/qt_gui/main_window.cpp +++ b/src/qt_gui/main_window.cpp @@ -8,6 +8,7 @@ #include #include #include + #include "common/io_file.h" #include "common/version.h" #include "core/file_format/pkg.h" diff --git a/src/qt_gui/main_window.h b/src/qt_gui/main_window.h index 27d14b93..d1ef48dc 100644 --- a/src/qt_gui/main_window.h +++ b/src/qt_gui/main_window.h @@ -9,13 +9,14 @@ #include #include #include -#include #include + #include "common/config.h" #include "common/path_util.h" #include "core/file_format/psf.h" #include "core/file_sys/fs.h" #include "elf_viewer.h" +#include "emulator.h" #include "game_grid_frame.h" #include "game_info.h" #include "game_list_frame.h" diff --git a/src/qt_gui/main_window_themes.cpp b/src/qt_gui/main_window_themes.cpp index 858bbb07..c89fa5a0 100644 --- a/src/qt_gui/main_window_themes.cpp +++ b/src/qt_gui/main_window_themes.cpp @@ -7,25 +7,6 @@ void WindowThemes::SetWindowTheme(Theme theme, QLineEdit* mw_searchbar) { QPalette themePalette; switch (theme) { - case Theme::Light: - mw_searchbar->setStyleSheet("background-color: #ffffff; /* Light gray background */" - "color: #000000; /* Black text */" - "padding: 5px;"); - themePalette.setColor(QPalette::Window, QColor(240, 240, 240)); // Light 
gray - themePalette.setColor(QPalette::WindowText, Qt::black); // Black - themePalette.setColor(QPalette::Base, QColor(230, 230, 230, 80)); // Grayish - themePalette.setColor(QPalette::ToolTipBase, Qt::black); // Black - themePalette.setColor(QPalette::ToolTipText, Qt::black); // Black - themePalette.setColor(QPalette::Text, Qt::black); // Black - themePalette.setColor(QPalette::Button, QColor(240, 240, 240)); // Light gray - themePalette.setColor(QPalette::ButtonText, Qt::black); // Black - themePalette.setColor(QPalette::BrightText, Qt::red); // Red - themePalette.setColor(QPalette::Link, QColor(42, 130, 218)); // Blue - themePalette.setColor(QPalette::Highlight, QColor(42, 130, 218)); // Blue - themePalette.setColor(QPalette::HighlightedText, Qt::white); // White - qApp->setPalette(themePalette); - break; - case Theme::Dark: mw_searchbar->setStyleSheet("background-color: #1e1e1e; /* Dark background */" "color: #ffffff; /* White text */" @@ -48,6 +29,25 @@ void WindowThemes::SetWindowTheme(Theme theme, QLineEdit* mw_searchbar) { qApp->setPalette(themePalette); break; + case Theme::Light: + mw_searchbar->setStyleSheet("background-color: #ffffff; /* Light gray background */" + "color: #000000; /* Black text */" + "padding: 5px;"); + themePalette.setColor(QPalette::Window, QColor(240, 240, 240)); // Light gray + themePalette.setColor(QPalette::WindowText, Qt::black); // Black + themePalette.setColor(QPalette::Base, QColor(230, 230, 230, 80)); // Grayish + themePalette.setColor(QPalette::ToolTipBase, Qt::black); // Black + themePalette.setColor(QPalette::ToolTipText, Qt::black); // Black + themePalette.setColor(QPalette::Text, Qt::black); // Black + themePalette.setColor(QPalette::Button, QColor(240, 240, 240)); // Light gray + themePalette.setColor(QPalette::ButtonText, Qt::black); // Black + themePalette.setColor(QPalette::BrightText, Qt::red); // Red + themePalette.setColor(QPalette::Link, QColor(42, 130, 218)); // Blue + themePalette.setColor(QPalette::Highlight, 
QColor(42, 130, 218)); // Blue + themePalette.setColor(QPalette::HighlightedText, Qt::white); // White + qApp->setPalette(themePalette); + break; + case Theme::Green: mw_searchbar->setStyleSheet("background-color: #354535; /* Dark green background */" "color: #ffffff; /* White text */" diff --git a/src/qt_gui/main_window_themes.h b/src/qt_gui/main_window_themes.h index 8b87fbce..6da70e99 100644 --- a/src/qt_gui/main_window_themes.h +++ b/src/qt_gui/main_window_themes.h @@ -2,13 +2,14 @@ // SPDX-License-Identifier: GPL-2.0-or-later #pragma once + #include #include #include enum class Theme : int { - Light, Dark, + Light, Green, Blue, Violet, diff --git a/src/qt_gui/main_window_ui.h b/src/qt_gui/main_window_ui.h index 7b5bf181..69d71847 100644 --- a/src/qt_gui/main_window_ui.h +++ b/src/qt_gui/main_window_ui.h @@ -44,8 +44,8 @@ public: QAction* gameInstallPathAct; QAction* dumpGameListAct; QAction* pkgViewerAct; - QAction* setThemeLight; QAction* setThemeDark; + QAction* setThemeLight; QAction* setThemeGreen; QAction* setThemeBlue; QAction* setThemeViolet; @@ -76,7 +76,7 @@ public: MainWindow->setObjectName("MainWindow"); // MainWindow->resize(1280, 720); QIcon icon; - icon.addFile(QString::fromUtf8(":/images/shadps4.ico"), QSize(), QIcon::Normal, QIcon::Off); + icon.addFile(QString::fromUtf8(":images/shadps4.ico"), QSize(), QIcon::Normal, QIcon::Off); MainWindow->setWindowIcon(icon); QSizePolicy sizePolicy(QSizePolicy::Expanding, QSizePolicy::Expanding); sizePolicy.setHorizontalStretch(0); @@ -136,13 +136,13 @@ public: pkgViewerAct->setObjectName("pkgViewer"); pkgViewerAct->setObjectName("pkgViewer"); pkgViewerAct->setIcon(QIcon(":images/file_icon.png")); - setThemeLight = new QAction(MainWindow); - setThemeLight->setObjectName("setThemeLight"); - setThemeLight->setCheckable(true); - setThemeLight->setChecked(true); setThemeDark = new QAction(MainWindow); setThemeDark->setObjectName("setThemeDark"); setThemeDark->setCheckable(true); + setThemeDark->setChecked(true); 
+ setThemeLight = new QAction(MainWindow); + setThemeLight->setObjectName("setThemeLight"); + setThemeLight->setCheckable(true); setThemeGreen = new QAction(MainWindow); setThemeGreen->setObjectName("setThemeGreen"); setThemeGreen->setCheckable(true); @@ -285,7 +285,7 @@ public: } // setupUi void retranslateUi(QMainWindow* MainWindow) { - MainWindow->setWindowTitle(QCoreApplication::translate("MainWindow", "Shadps4", nullptr)); + MainWindow->setWindowTitle(QCoreApplication::translate("MainWindow", "shadPS4", nullptr)); addElfFolderAct->setText( QCoreApplication::translate("MainWindow", "Open/Add Elf Folder", nullptr)); bootInstallPkgAct->setText( @@ -332,8 +332,8 @@ public: menuSettings->setTitle(QCoreApplication::translate("MainWindow", "Settings", nullptr)); menuUtils->setTitle(QCoreApplication::translate("MainWindow", "Utils", nullptr)); menuThemes->setTitle(QCoreApplication::translate("MainWindow", "Themes", nullptr)); - setThemeLight->setText(QCoreApplication::translate("MainWindow", "Light", nullptr)); setThemeDark->setText(QCoreApplication::translate("MainWindow", "Dark", nullptr)); + setThemeLight->setText(QCoreApplication::translate("MainWindow", "Light", nullptr)); setThemeGreen->setText(QCoreApplication::translate("MainWindow", "Green", nullptr)); setThemeBlue->setText(QCoreApplication::translate("MainWindow", "Blue", nullptr)); setThemeViolet->setText(QCoreApplication::translate("MainWindow", "Violet", nullptr)); diff --git a/src/qt_gui/pkg_viewer.cpp b/src/qt_gui/pkg_viewer.cpp index cf0b2167..cd2ce2b6 100644 --- a/src/qt_gui/pkg_viewer.cpp +++ b/src/qt_gui/pkg_viewer.cpp @@ -3,6 +3,7 @@ #include #include + #include "pkg_viewer.h" PKGViewer::PKGViewer(std::shared_ptr game_info_get, QWidget* parent, diff --git a/src/qt_gui/pkg_viewer.h b/src/qt_gui/pkg_viewer.h index 0e0a8706..e040d595 100644 --- a/src/qt_gui/pkg_viewer.h +++ b/src/qt_gui/pkg_viewer.h @@ -15,6 +15,7 @@ #include #include #include + #include "common/io_file.h" #include 
"core/file_format/pkg.h" #include "core/file_format/pkg_type.h" diff --git a/src/qt_gui/trophy_viewer.h b/src/qt_gui/trophy_viewer.h index ab79ac50..2b794593 100644 --- a/src/qt_gui/trophy_viewer.h +++ b/src/qt_gui/trophy_viewer.h @@ -16,6 +16,7 @@ #include #include #include + #include "common/types.h" #include "core/file_format/trp.h" diff --git a/src/sdl_window.cpp b/src/sdl_window.cpp index c4fcbcfa..0d25cd3f 100644 --- a/src/sdl_window.cpp +++ b/src/sdl_window.cpp @@ -11,6 +11,7 @@ #include "core/libraries/pad/pad.h" #include "input/controller.h" #include "sdl_window.h" +#include "video_core/renderdoc.h" #ifdef __APPLE__ #include @@ -18,16 +19,17 @@ namespace Frontend { -WindowSDL::WindowSDL(s32 width_, s32 height_, Input::GameController* controller_) +WindowSDL::WindowSDL(s32 width_, s32 height_, Input::GameController* controller_, + std::string_view window_title) : width{width_}, height{height_}, controller{controller_} { if (SDL_Init(SDL_INIT_VIDEO) < 0) { UNREACHABLE_MSG("Failed to initialize SDL video subsystem: {}", SDL_GetError()); } SDL_InitSubSystem(SDL_INIT_AUDIO); - const std::string title = "shadPS4 v" + std::string(Common::VERSION); SDL_PropertiesID props = SDL_CreateProperties(); - SDL_SetStringProperty(props, SDL_PROP_WINDOW_CREATE_TITLE_STRING, title.c_str()); + SDL_SetStringProperty(props, SDL_PROP_WINDOW_CREATE_TITLE_STRING, + std::string(window_title).c_str()); SDL_SetNumberProperty(props, SDL_PROP_WINDOW_CREATE_X_NUMBER, SDL_WINDOWPOS_CENTERED); SDL_SetNumberProperty(props, SDL_PROP_WINDOW_CREATE_Y_NUMBER, SDL_WINDOWPOS_CENTERED); SDL_SetNumberProperty(props, SDL_PROP_WINDOW_CREATE_WIDTH_NUMBER, width); @@ -71,7 +73,7 @@ void WindowSDL::waitEvent() { // Called on main thread SDL_Event event; - if (!SDL_PollEvent(&event)) { + if (!SDL_WaitEvent(&event)) { return; } @@ -179,6 +181,11 @@ void WindowSDL::onKeyPress(const SDL_Event* event) { ax = Input::GetAxis(-0x80, 0x80, axisvalue); break; case SDLK_S: + if (event->key.mod == SDL_KMOD_LCTRL) { 
+ // Trigger rdoc capture + VideoCore::TriggerCapture(); + break; + } axis = Input::Axis::LeftY; if (event->type == SDL_EVENT_KEY_DOWN) { axisvalue += 127; diff --git a/src/sdl_window.h b/src/sdl_window.h index 6e14fbd0..02d01128 100644 --- a/src/sdl_window.h +++ b/src/sdl_window.h @@ -3,6 +3,7 @@ #pragma once +#include #include "common/types.h" struct SDL_Window; @@ -40,7 +41,8 @@ struct WindowSystemInfo { class WindowSDL { public: - explicit WindowSDL(s32 width, s32 height, Input::GameController* controller); + explicit WindowSDL(s32 width, s32 height, Input::GameController* controller, + std::string_view window_title); ~WindowSDL(); s32 getWidth() const { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 561014a3..c7042763 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -183,6 +183,7 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { ctx.AddCapability(spv::Capability::Float16); ctx.AddCapability(spv::Capability::Int16); } + ctx.AddCapability(spv::Capability::Int64); if (info.has_storage_images) { ctx.AddCapability(spv::Capability::StorageImageExtendedFormats); } @@ -204,8 +205,8 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { } else { ctx.AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft); } + ctx.AddCapability(spv::Capability::GroupNonUniform); if (info.uses_group_quad) { - ctx.AddCapability(spv::Capability::GroupNonUniform); ctx.AddCapability(spv::Capability::GroupNonUniformQuad); } if (info.has_discard) { @@ -217,9 +218,9 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { if (info.has_image_query) { ctx.AddCapability(spv::Capability::ImageQuery); } - // if (program.info.stores_frag_depth) { - // ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); - // } + if (info.stores.Get(IR::Attribute::Depth)) 
{ + ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); + } break; default: throw NotImplementedException("Stage {}", u32(program.info.stage)); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp index da29f392..03a0a00f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp @@ -6,8 +6,8 @@ namespace Shader::Backend::SPIRV { -void EmitBitCastU16F16(EmitContext&) { - UNREACHABLE_MSG("SPIR-V Instruction"); +Id EmitBitCastU16F16(EmitContext& ctx, Id value) { + return ctx.OpBitcast(ctx.U16, value); } Id EmitBitCastU32F32(EmitContext& ctx, Id value) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 87ffa150..02480303 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -120,6 +120,7 @@ void EmitGetGotoVariable(EmitContext&) { } Id EmitReadConst(EmitContext& ctx) { + return ctx.u32_zero_value; UNREACHABLE_MSG("Unreachable instruction"); } @@ -149,6 +150,9 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) { // Attribute is disabled or varying component is not written return ctx.ConstF32(comp == 3 ? 
1.0f : 0.0f); } + if (param.is_default) { + return ctx.OpCompositeExtract(param.component_type, param.id, comp); + } if (param.num_components > 1) { const Id pointer{ @@ -208,7 +212,7 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 element) { const Id pointer{OutputAttrPointer(ctx, attr, element)}; - ctx.OpStore(pointer, value); + ctx.OpStore(pointer, ctx.OpBitcast(ctx.F32[1], value)); } Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp index ede592e0..945fa687 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp @@ -259,4 +259,8 @@ Id EmitConvertU16U32(EmitContext& ctx, Id value) { return ctx.OpUConvert(ctx.U16, value); } +Id EmitConvertU32U16(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U32[1], value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp index 911983a4..e822eabe 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp @@ -385,4 +385,8 @@ Id EmitFPIsInf64(EmitContext& ctx, Id value) { return ctx.OpIsInf(ctx.U1[1], value); } +void EmitFPCmpClass32(EmitContext&) { + UNREACHABLE(); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 17def57a..030d3948 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -70,7 +70,6 @@ Id EmitImageGather(EmitContext& ctx, 
IR::Inst* inst, u32 handle, Id coords, Id o const u32 comp = inst->Flags().gather_comp.Value(); ImageOperands operands; operands.Add(spv::ImageOperandsMask::Offset, offset); - operands.Add(spv::ImageOperandsMask::Lod, ctx.ConstF32(0.f)); return ctx.OpImageGather(ctx.F32[4], sampled_image, coords, ctx.ConstU32(comp), operands.mask, operands.operands); } @@ -106,8 +105,7 @@ Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, u32 handle, Id lod const auto type = ctx.info.images[handle & 0xFFFF].type; const Id zero = ctx.u32_zero_value; const auto mips{[&] { return skip_mips ? zero : ctx.OpImageQueryLevels(ctx.U32[1], image); }}; - const bool uses_lod{type != AmdGpu::ImageType::Color2DMsaa && - type != AmdGpu::ImageType::Buffer}; + const bool uses_lod{type != AmdGpu::ImageType::Color2DMsaa}; const auto query{[&](Id type) { return uses_lod ? ctx.OpImageQuerySizeLod(type, image, lod) : ctx.OpImageQuerySize(type, image); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index e2b411e4..51899eb4 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -42,6 +42,7 @@ void EmitSetVcc(EmitContext& ctx); void EmitSetSccLo(EmitContext& ctx); void EmitSetVccLo(EmitContext& ctx); void EmitSetVccHi(EmitContext& ctx); +void EmitFPCmpClass32(EmitContext& ctx); void EmitPrologue(EmitContext& ctx); void EmitEpilogue(EmitContext& ctx); void EmitDiscard(EmitContext& ctx); @@ -148,7 +149,7 @@ Id EmitSelectU64(EmitContext& ctx, Id cond, Id true_value, Id false_value); Id EmitSelectF16(EmitContext& ctx, Id cond, Id true_value, Id false_value); Id EmitSelectF32(EmitContext& ctx, Id cond, Id true_value, Id false_value); Id EmitSelectF64(EmitContext& ctx, Id cond, Id true_value, Id false_value); -void EmitBitCastU16F16(EmitContext& ctx); +Id EmitBitCastU16F16(EmitContext& ctx, Id value); Id 
EmitBitCastU32F32(EmitContext& ctx, Id value); void EmitBitCastU64F64(EmitContext& ctx); Id EmitBitCastF16U16(EmitContext& ctx, Id value); @@ -258,6 +259,7 @@ Id EmitISub64(EmitContext& ctx, Id a, Id b); Id EmitSMulExt(EmitContext& ctx, Id a, Id b); Id EmitUMulExt(EmitContext& ctx, Id a, Id b); Id EmitIMul32(EmitContext& ctx, Id a, Id b); +Id EmitIMul64(EmitContext& ctx, Id a, Id b); Id EmitSDiv32(EmitContext& ctx, Id a, Id b); Id EmitUDiv32(EmitContext& ctx, Id a, Id b); Id EmitINeg32(EmitContext& ctx, Id value); @@ -271,6 +273,7 @@ Id EmitShiftRightArithmetic32(EmitContext& ctx, Id base, Id shift); Id EmitShiftRightArithmetic64(EmitContext& ctx, Id base, Id shift); Id EmitBitwiseAnd32(EmitContext& ctx, IR::Inst* inst, Id a, Id b); Id EmitBitwiseOr32(EmitContext& ctx, IR::Inst* inst, Id a, Id b); +Id EmitBitwiseOr64(EmitContext& ctx, IR::Inst* inst, Id a, Id b); Id EmitBitwiseXor32(EmitContext& ctx, IR::Inst* inst, Id a, Id b); Id EmitBitFieldInsert(EmitContext& ctx, Id base, Id insert, Id offset, Id count); Id EmitBitFieldSExtract(EmitContext& ctx, IR::Inst* inst, Id base, Id offset, Id count); @@ -280,14 +283,17 @@ Id EmitBitCount32(EmitContext& ctx, Id value); Id EmitBitwiseNot32(EmitContext& ctx, Id value); Id EmitFindSMsb32(EmitContext& ctx, Id value); Id EmitFindUMsb32(EmitContext& ctx, Id value); +Id EmitFindILsb32(EmitContext& ctx, Id value); Id EmitSMin32(EmitContext& ctx, Id a, Id b); Id EmitUMin32(EmitContext& ctx, Id a, Id b); Id EmitSMax32(EmitContext& ctx, Id a, Id b); Id EmitUMax32(EmitContext& ctx, Id a, Id b); Id EmitSClamp32(EmitContext& ctx, IR::Inst* inst, Id value, Id min, Id max); Id EmitUClamp32(EmitContext& ctx, IR::Inst* inst, Id value, Id min, Id max); -Id EmitSLessThan(EmitContext& ctx, Id lhs, Id rhs); -Id EmitULessThan(EmitContext& ctx, Id lhs, Id rhs); +Id EmitSLessThan32(EmitContext& ctx, Id lhs, Id rhs); +Id EmitSLessThan64(EmitContext& ctx, Id lhs, Id rhs); +Id EmitULessThan32(EmitContext& ctx, Id lhs, Id rhs); +Id 
EmitULessThan64(EmitContext& ctx, Id lhs, Id rhs); Id EmitIEqual(EmitContext& ctx, Id lhs, Id rhs); Id EmitSLessThanEqual(EmitContext& ctx, Id lhs, Id rhs); Id EmitULessThanEqual(EmitContext& ctx, Id lhs, Id rhs); @@ -349,6 +355,7 @@ Id EmitConvertF64U16(EmitContext& ctx, Id value); Id EmitConvertF64U32(EmitContext& ctx, Id value); Id EmitConvertF64U64(EmitContext& ctx, Id value); Id EmitConvertU16U32(EmitContext& ctx, Id value); +Id EmitConvertU32U16(EmitContext& ctx, Id value); Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, Id offset); @@ -383,6 +390,7 @@ Id EmitImageAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); Id EmitLaneId(EmitContext& ctx); +Id EmitWarpId(EmitContext& ctx); Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index d5a0f276..f20c4fac 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -84,6 +84,10 @@ Id EmitIMul32(EmitContext& ctx, Id a, Id b) { return ctx.OpIMul(ctx.U32[1], a, b); } +Id EmitIMul64(EmitContext& ctx, Id a, Id b) { + return ctx.OpIMul(ctx.U64, a, b); +} + Id EmitSDiv32(EmitContext& ctx, Id a, Id b) { return ctx.OpSDiv(ctx.U32[1], a, b); } @@ -142,6 +146,13 @@ Id EmitBitwiseOr32(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { return result; } +Id EmitBitwiseOr64(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { + const Id result{ctx.OpBitwiseOr(ctx.U64, a, b)}; + SetZeroFlag(ctx, inst, result); + SetSignFlag(ctx, inst, result); + return result; +} + Id EmitBitwiseXor32(EmitContext& ctx, IR::Inst* inst, Id a, Id b) { const Id result{ctx.OpBitwiseXor(ctx.U32[1], a, b)}; SetZeroFlag(ctx, inst, result); @@ 
-187,6 +198,10 @@ Id EmitFindUMsb32(EmitContext& ctx, Id value) { return ctx.OpFindUMsb(ctx.U32[1], value); } +Id EmitFindILsb32(EmitContext& ctx, Id value) { + return ctx.OpFindILsb(ctx.U32[1], value); +} + Id EmitSMin32(EmitContext& ctx, Id a, Id b) { return ctx.OpSMin(ctx.U32[1], a, b); } @@ -231,11 +246,19 @@ Id EmitUClamp32(EmitContext& ctx, IR::Inst* inst, Id value, Id min, Id max) { return result; } -Id EmitSLessThan(EmitContext& ctx, Id lhs, Id rhs) { +Id EmitSLessThan32(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpSLessThan(ctx.U1[1], lhs, rhs); } -Id EmitULessThan(EmitContext& ctx, Id lhs, Id rhs) { +Id EmitSLessThan64(EmitContext& ctx, Id lhs, Id rhs) { + return ctx.OpSLessThan(ctx.U1[1], lhs, rhs); +} + +Id EmitULessThan32(EmitContext& ctx, Id lhs, Id rhs) { + return ctx.OpULessThan(ctx.U1[1], lhs, rhs); +} + +Id EmitULessThan64(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpULessThan(ctx.U1[1], lhs, rhs); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index a1751588..bd4ac066 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -10,6 +10,10 @@ Id SubgroupScope(EmitContext& ctx) { return ctx.ConstU32(static_cast(spv::Scope::Subgroup)); } +Id EmitWarpId(EmitContext& ctx) { + return ctx.OpLoad(ctx.U32[1], ctx.subgroup_id); +} + Id EmitLaneId(EmitContext& ctx) { return ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id); } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 9ce87add..f7b30052 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -49,7 +49,7 @@ EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& bin DefineInterfaces(program); DefineBuffers(info); 
DefineImagesAndSamplers(info); - DefineSharedMemory(info); + DefineSharedMemory(); } EmitContext::~EmitContext() = default; @@ -86,6 +86,7 @@ void EmitContext::DefineArithmeticTypes() { F32[1] = Name(TypeFloat(32), "f32_id"); S32[1] = Name(TypeSInt(32), "i32_id"); U32[1] = Name(TypeUInt(32), "u32_id"); + U64 = Name(TypeUInt(64), "u64_id"); for (u32 i = 2; i <= 4; i++) { if (info.uses_fp16) { @@ -126,6 +127,7 @@ Id GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) { case AmdGpu::NumberFormat::Float: case AmdGpu::NumberFormat::Unorm: case AmdGpu::NumberFormat::Snorm: + case AmdGpu::NumberFormat::SnormNz: return ctx.F32[4]; case AmdGpu::NumberFormat::Sint: return ctx.S32[4]; @@ -146,6 +148,7 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f case AmdGpu::NumberFormat::Float: case AmdGpu::NumberFormat::Unorm: case AmdGpu::NumberFormat::Snorm: + case AmdGpu::NumberFormat::SnormNz: return {id, input_f32, F32[1], 4}; case AmdGpu::NumberFormat::Uint: return {id, input_u32, U32[1], 4}; @@ -204,7 +207,9 @@ void EmitContext::DefineInputs(const Info& info) { : 1; // Note that we pass index rather than Id input_params[input.binding] = { - rate_idx, input_u32, U32[1], input.num_components, input.instance_data_buf, + rate_idx, input_u32, + U32[1], input.num_components, + false, input.instance_data_buf, }; } else { Id id{DefineInput(type, input.binding)}; @@ -220,19 +225,18 @@ void EmitContext::DefineInputs(const Info& info) { break; } case Stage::Fragment: - if (info.uses_group_quad) { - subgroup_local_invocation_id = DefineVariable( - U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); - Decorate(subgroup_local_invocation_id, spv::Decoration::Flat); - } + subgroup_id = DefineVariable(U32[1], spv::BuiltIn::SubgroupId, spv::StorageClass::Input); + subgroup_local_invocation_id = DefineVariable( + U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); + Decorate(subgroup_local_invocation_id, 
spv::Decoration::Flat); frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input); frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output); front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input); for (const auto& input : info.ps_inputs) { const u32 semantic = input.param_index; if (input.is_default) { - input_params[semantic] = {MakeDefaultValue(*this, input.default_value), input_f32, - F32[1]}; + input_params[semantic] = {MakeDefaultValue(*this, input.default_value), F32[1], + F32[1], 4, true}; continue; } const IR::Attribute param{IR::Attribute::Param0 + input.param_index}; @@ -392,7 +396,16 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) { image.GetNumberFmt() == AmdGpu::NumberFormat::Uint) { return spv::ImageFormat::Rgba8ui; } - UNREACHABLE(); + if (image.GetDataFmt() == AmdGpu::DataFormat::Format10_11_11 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::R11fG11fB10f; + } + if (image.GetDataFmt() == AmdGpu::DataFormat::Format32_32_32_32 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::Rgba32f; + } + UNREACHABLE_MSG("Unknown storage format data_format={}, num_format={}", image.GetDataFmt(), + image.GetNumberFmt()); } Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { @@ -412,8 +425,6 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, sampled, format); case AmdGpu::ImageType::Cube: return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, false, false, sampled, format); - case AmdGpu::ImageType::Buffer: - throw NotImplementedException("Image buffer"); default: break; } @@ -471,10 +482,14 @@ void EmitContext::DefineImagesAndSamplers(const Info& info) { } } -void EmitContext::DefineSharedMemory(const Info& info) { - if (info.shared_memory_size == 0) { +void 
EmitContext::DefineSharedMemory() { + static constexpr size_t DefaultSharedMemSize = 16_KB; + if (!info.uses_shared) { return; } + if (info.shared_memory_size == 0) { + info.shared_memory_size = DefaultSharedMemSize; + } const auto make{[&](Id element_type, u32 element_size) { const u32 num_elements{Common::DivCeil(info.shared_memory_size, element_size)}; const Id array_type{TypeArray(element_type, ConstU32(num_elements))}; diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index fc678344..34c13d3f 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -180,6 +180,7 @@ public: Id workgroup_id{}; Id local_invocation_id{}; + Id subgroup_id{}; Id subgroup_local_invocation_id{}; Id image_u32{}; @@ -219,6 +220,7 @@ public: Id pointer_type; Id component_type; u32 num_components; + bool is_default{}; s32 buffer_handle{-1}; }; std::array input_params{}; @@ -231,7 +233,7 @@ private: void DefineOutputs(const Info& info); void DefineBuffers(const Info& info); void DefineImagesAndSamplers(const Info& info); - void DefineSharedMemory(const Info& info); + void DefineSharedMemory(); SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id); }; diff --git a/src/shader_recompiler/frontend/format.cpp b/src/shader_recompiler/frontend/format.cpp index 634566fa..8df3ac36 100644 --- a/src/shader_recompiler/frontend/format.cpp +++ b/src/shader_recompiler/frontend/format.cpp @@ -1479,7 +1479,7 @@ constexpr std::array InstructionFormatVOP3 = {{ {InstClass::VectorFpGraph32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, ScalarType::Float32}, // 337 = V_MIN3_F32 - {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, + {InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, ScalarType::Float32}, // 338 = V_MIN3_I32 {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 
1, ScalarType::Sint32, @@ -1488,7 +1488,7 @@ constexpr std::array InstructionFormatVOP3 = {{ {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 340 = V_MAX3_F32 - {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, + {InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, ScalarType::Float32}, // 341 = V_MAX3_I32 {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Sint32, @@ -1497,7 +1497,7 @@ constexpr std::array InstructionFormatVOP3 = {{ {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 343 = V_MED3_F32 - {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, + {InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, ScalarType::Float32}, // 344 = V_MED3_I32 {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Sint32, @@ -2779,11 +2779,9 @@ constexpr std::array InstructionFormatDS = {{ // 60 = DS_READ_U16 {InstClass::DsIdxRd, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 61 = DS_CONSUME - {InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 62 = DS_APPEND - {InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 63 = DS_ORDERED_COUNT {InstClass::GdsOrdCnt, InstCategory::DataShare, 3, 1, ScalarType::Undefined, ScalarType::Undefined}, diff --git a/src/shader_recompiler/frontend/instruction.h b/src/shader_recompiler/frontend/instruction.h index d1d10efb..f83f43db 100644 --- a/src/shader_recompiler/frontend/instruction.h +++ b/src/shader_recompiler/frontend/instruction.h @@ -76,11 +76,11 
@@ struct SMRD { }; struct InstControlSOPK { - BitField<0, 16, u32> simm; + s16 simm; }; struct InstControlSOPP { - BitField<0, 16, u32> simm; + s16 simm; }; struct InstControlVOP3 { diff --git a/src/shader_recompiler/frontend/opcodes.h b/src/shader_recompiler/frontend/opcodes.h index d38140d8..cdc1e474 100644 --- a/src/shader_recompiler/frontend/opcodes.h +++ b/src/shader_recompiler/frontend/opcodes.h @@ -2392,10 +2392,10 @@ enum class OperandField : u32 { ConstFloatPos_4_0, ConstFloatNeg_4_0, VccZ = 251, - ExecZ, - Scc, - LdsDirect, - LiteralConst, + ExecZ = 252, + Scc = 253, + LdsDirect = 254, + LiteralConst = 255, VectorGPR, Undefined = 0xFFFFFFFF, diff --git a/src/shader_recompiler/frontend/structured_control_flow.cpp b/src/shader_recompiler/frontend/structured_control_flow.cpp index 346f00aa..c8d73858 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.cpp +++ b/src/shader_recompiler/frontend/structured_control_flow.cpp @@ -600,13 +600,13 @@ public: TranslatePass(ObjectPool& inst_pool_, ObjectPool& block_pool_, ObjectPool& stmt_pool_, Statement& root_stmt, IR::AbstractSyntaxList& syntax_list_, std::span inst_list_, - Info& info_) + Info& info_, const Profile& profile_) : stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_}, - syntax_list{syntax_list_}, inst_list{inst_list_}, info{info_} { + syntax_list{syntax_list_}, inst_list{inst_list_}, info{info_}, profile{profile_} { Visit(root_stmt, nullptr, nullptr); IR::Block& first_block{*syntax_list.front().data.block}; - Translator{&first_block, info}.EmitPrologue(); + Translator{&first_block, info, profile}.EmitPrologue(); } private: @@ -635,7 +635,7 @@ private: const u32 start = stmt.block->begin_index; const u32 size = stmt.block->end_index - start + 1; Translate(current_block, stmt.block->begin, inst_list.subspan(start, size), - info); + info, profile); } break; } @@ -815,16 +815,18 @@ private: const Block dummy_flow_block{.is_dummy = true}; std::span inst_list; Info& info; + 
const Profile& profile; }; } // Anonymous namespace IR::AbstractSyntaxList BuildASL(ObjectPool& inst_pool, ObjectPool& block_pool, - CFG& cfg, Info& info) { + CFG& cfg, Info& info, const Profile& profile) { ObjectPool stmt_pool{64}; GotoPass goto_pass{cfg, stmt_pool}; Statement& root{goto_pass.RootStatement()}; IR::AbstractSyntaxList syntax_list; - TranslatePass{inst_pool, block_pool, stmt_pool, root, syntax_list, cfg.inst_list, info}; + TranslatePass{inst_pool, block_pool, stmt_pool, root, + syntax_list, cfg.inst_list, info, profile}; ASSERT_MSG(!info.translation_failed, "Shader translation has failed"); return syntax_list; } diff --git a/src/shader_recompiler/frontend/structured_control_flow.h b/src/shader_recompiler/frontend/structured_control_flow.h index 09814349..da4ef1ff 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.h +++ b/src/shader_recompiler/frontend/structured_control_flow.h @@ -11,12 +11,13 @@ namespace Shader { struct Info; -} +struct Profile; +} // namespace Shader namespace Shader::Gcn { [[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool& inst_pool, ObjectPool& block_pool, CFG& cfg, - Info& info); + Info& info, const Profile& profile); } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index c5d9f0ec..14837166 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -5,6 +5,31 @@ namespace Shader::Gcn { +void Translator::EmitDataShare(const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::DS_SWIZZLE_B32: + return DS_SWIZZLE_B32(inst); + case Opcode::DS_READ_B32: + return DS_READ(32, false, false, inst); + case Opcode::DS_READ_B64: + return DS_READ(64, false, false, inst); + case Opcode::DS_READ2_B32: + return DS_READ(32, false, true, inst); + case Opcode::DS_READ2_B64: + return DS_READ(64, false, true, inst); + case Opcode::DS_WRITE_B32: 
+ return DS_WRITE(32, false, false, inst); + case Opcode::DS_WRITE_B64: + return DS_WRITE(64, false, false, inst); + case Opcode::DS_WRITE2_B32: + return DS_WRITE(32, false, true, inst); + case Opcode::DS_WRITE2_B64: + return DS_WRITE(64, false, true, inst); + default: + LogMissingOpcode(inst); + } +} + void Translator::DS_SWIZZLE_B32(const GcnInst& inst) { const u8 offset0 = inst.control.ds.offset0; const u8 offset1 = inst.control.ds.offset1; @@ -20,14 +45,25 @@ void Translator::DS_SWIZZLE_B32(const GcnInst& inst) { void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst) { const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; - const IR::VectorReg dst_reg{inst.dst[0].code}; + IR::VectorReg dst_reg{inst.dst[0].code}; if (is_pair) { - // Pair loads are either 32 or 64-bit. We assume 32-bit for now. - ASSERT(bit_size == 32); + // Pair loads are either 32 or 64-bit const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); - ir.SetVectorReg(dst_reg, IR::U32{ir.LoadShared(32, is_signed, addr0)}); + const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); + if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data0}); + } else { + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)}); + } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.LoadShared(32, is_signed, addr1)}); + const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); + if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data1}); + } else { + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)}); + } } else if (bit_size == 64) { const IR::Value data = ir.LoadShared(bit_size, is_signed, addr); ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)}); @@ -43,11 +79,22 @@ 
void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; if (is_pair) { - ASSERT(bit_size == 32); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); - ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + if (bit_size == 32) { + ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + } else { + ir.WriteShared( + 64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)), + addr0); + } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); - ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + if (bit_size == 32) { + ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + } else { + ir.WriteShared( + 64, ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1)), + addr1); + } } else if (bit_size == 64) { const IR::Value data = ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); @@ -62,7 +109,18 @@ void Translator::S_BARRIER() { } void Translator::V_READFIRSTLANE_B32(const GcnInst& inst) { - UNREACHABLE(); + ASSERT(info.stage != Stage::Compute); + SetDst(inst.dst[0], GetSrc(inst.src[0])); +} + +void Translator::V_READLANE_B32(const GcnInst& inst) { + ASSERT(info.stage != Stage::Compute); + SetDst(inst.dst[0], GetSrc(inst.src[0])); +} + +void Translator::V_WRITELANE_B32(const GcnInst& inst) { + ASSERT(info.stage != Stage::Compute); + SetDst(inst.dst[0], GetSrc(inst.src[0])); } } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp index 51840537..889de21b 100644 --- a/src/shader_recompiler/frontend/translate/export.cpp +++ b/src/shader_recompiler/frontend/translate/export.cpp @@ -6,7 +6,7 @@ namespace Shader::Gcn { -void Translator::EXP(const GcnInst& inst) { +void Translator::EmitExport(const GcnInst& inst) { if (ir.block->has_multiple_predecessors && info.stage == 
Stage::Fragment) { LOG_WARNING(Render_Recompiler, "An ambiguous export appeared in translation"); ir.Discard(ir.LogicalNot(ir.GetExec())); diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index a20e91ca..795b148d 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -5,8 +5,102 @@ namespace Shader::Gcn { +void Translator::EmitScalarAlu(const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::S_MOVK_I32: + return S_MOVK(inst); + case Opcode::S_MOV_B32: + return S_MOV(inst); + case Opcode::S_MUL_I32: + return S_MUL_I32(inst); + case Opcode::S_AND_SAVEEXEC_B64: + return S_AND_SAVEEXEC_B64(inst); + case Opcode::S_MOV_B64: + return S_MOV_B64(inst); + case Opcode::S_CMP_LT_U32: + return S_CMP(ConditionOp::LT, false, inst); + case Opcode::S_CMP_LE_U32: + return S_CMP(ConditionOp::LE, false, inst); + case Opcode::S_CMP_LG_U32: + return S_CMP(ConditionOp::LG, false, inst); + case Opcode::S_CMP_LT_I32: + return S_CMP(ConditionOp::LT, true, inst); + case Opcode::S_CMP_LG_I32: + return S_CMP(ConditionOp::LG, true, inst); + case Opcode::S_CMP_GT_I32: + return S_CMP(ConditionOp::GT, true, inst); + case Opcode::S_CMP_GE_I32: + return S_CMP(ConditionOp::GE, true, inst); + case Opcode::S_CMP_EQ_I32: + return S_CMP(ConditionOp::EQ, true, inst); + case Opcode::S_CMP_EQ_U32: + return S_CMP(ConditionOp::EQ, false, inst); + case Opcode::S_CMP_GE_U32: + return S_CMP(ConditionOp::GE, false, inst); + case Opcode::S_CMP_GT_U32: + return S_CMP(ConditionOp::GT, false, inst); + case Opcode::S_OR_B64: + return S_OR_B64(NegateMode::None, false, inst); + case Opcode::S_NOR_B64: + return S_OR_B64(NegateMode::Result, false, inst); + case Opcode::S_XOR_B64: + return S_OR_B64(NegateMode::None, true, inst); + case Opcode::S_ORN2_B64: + return S_OR_B64(NegateMode::Src1, false, inst); + case Opcode::S_AND_B64: + return 
S_AND_B64(NegateMode::None, inst); + case Opcode::S_NAND_B64: + return S_AND_B64(NegateMode::Result, inst); + case Opcode::S_ANDN2_B64: + return S_AND_B64(NegateMode::Src1, inst); + case Opcode::S_NOT_B64: + return S_NOT_B64(inst); + case Opcode::S_ADD_I32: + return S_ADD_I32(inst); + case Opcode::S_AND_B32: + return S_AND_B32(inst); + case Opcode::S_ASHR_I32: + return S_ASHR_I32(inst); + case Opcode::S_OR_B32: + return S_OR_B32(inst); + case Opcode::S_LSHL_B32: + return S_LSHL_B32(inst); + case Opcode::S_LSHR_B32: + return S_LSHR_B32(inst); + case Opcode::S_CSELECT_B32: + return S_CSELECT_B32(inst); + case Opcode::S_CSELECT_B64: + return S_CSELECT_B64(inst); + case Opcode::S_BFE_U32: + return S_BFE_U32(inst); + case Opcode::S_BFM_B32: + return S_BFM_B32(inst); + case Opcode::S_BREV_B32: + return S_BREV_B32(inst); + case Opcode::S_ADD_U32: + return S_ADD_U32(inst); + case Opcode::S_ADDC_U32: + return S_ADDC_U32(inst); + case Opcode::S_ADDK_I32: + return S_ADDK_I32(inst); + case Opcode::S_MULK_I32: + return S_MULK_I32(inst); + case Opcode::S_SUB_U32: + case Opcode::S_SUB_I32: + return S_SUB_U32(inst); + case Opcode::S_MIN_U32: + return S_MIN_U32(inst); + case Opcode::S_MAX_U32: + return S_MAX_U32(inst); + case Opcode::S_WQM_B64: + break; + default: + LogMissingOpcode(inst); + } +} + void Translator::S_MOVK(const GcnInst& inst) { - const auto simm16 = inst.control.sopk.simm.Value(); + const auto simm16 = inst.control.sopk.simm; if (simm16 & (1 << 15)) { // TODO: need to verify the case of imm sign extension UNREACHABLE(); @@ -14,6 +108,16 @@ void Translator::S_MOVK(const GcnInst& inst) { SetDst(inst.dst[0], ir.Imm32(simm16)); } +void Translator::S_ADDK_I32(const GcnInst& inst) { + const s32 simm16 = inst.control.sopk.simm; + SetDst(inst.dst[0], ir.IAdd(GetSrc(inst.dst[0]), ir.Imm32(simm16))); +} + +void Translator::S_MULK_I32(const GcnInst& inst) { + const s32 simm16 = inst.control.sopk.simm; + SetDst(inst.dst[0], ir.IMul(GetSrc(inst.dst[0]), ir.Imm32(simm16))); +} + 
void Translator::S_MOV(const GcnInst& inst) { SetDst(inst.dst[0], GetSrc(inst.src[0])); } @@ -62,15 +166,10 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { } }(); - // Mark destination SPGR as an EXEC context. This means we will use 1-bit - // IR instruction whenever it's loaded. switch (inst.dst[0].field) { - case OperandField::ScalarGPR: { - const u32 reg = inst.dst[0].code; - exec_contexts[reg] = true; - ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec); + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), exec); break; - } case OperandField::VccLo: ir.SetVcc(exec); break; @@ -79,27 +178,37 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { } // Update EXEC. - ir.SetExec(ir.LogicalAnd(exec, src)); + const IR::U1 result = ir.LogicalAnd(exec, src); + ir.SetExec(result); + ir.SetScc(result); } void Translator::S_MOV_B64(const GcnInst& inst) { - // TODO: Using VCC as EXEC context. - if (inst.src[0].field == OperandField::VccLo || inst.dst[0].field == OperandField::VccLo) { - return; - } - if (inst.dst[0].field == OperandField::ScalarGPR && inst.src[0].field == OperandField::ExecLo) { - // Exec context push - exec_contexts[inst.dst[0].code] = true; - ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), ir.GetExec()); - } else if (inst.dst[0].field == OperandField::ExecLo && - inst.src[0].field == OperandField::ScalarGPR) { - // Exec context pop - exec_contexts[inst.src[0].code] = false; - ir.SetExec(ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code))); - } else if (inst.dst[0].field == OperandField::ExecLo && - inst.src[0].field == OperandField::ConstZero) { - ir.SetExec(ir.Imm1(false)); - } else { + const IR::U1 src = [&] { + switch (inst.src[0].field) { + case OperandField::VccLo: + return ir.GetVcc(); + case OperandField::ExecLo: + return ir.GetExec(); + case OperandField::ScalarGPR: + return ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code)); + case OperandField::ConstZero: + 
return ir.Imm1(false); + default: + UNREACHABLE(); + } + }(); + switch (inst.dst[0].field) { + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), src); + break; + case OperandField::ExecLo: + ir.SetExec(src); + break; + case OperandField::VccLo: + ir.SetVcc(src); + break; + default: UNREACHABLE(); } } @@ -338,4 +447,20 @@ void Translator::S_ADDC_U32(const GcnInst& inst) { SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), ir.GetSccLo())); } +void Translator::S_MAX_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 result = ir.UMax(src0, src1); + SetDst(inst.dst[0], result); + ir.SetScc(ir.IEqual(result, src0)); +} + +void Translator::S_MIN_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 result = ir.UMin(src0, src1); + SetDst(inst.dst[0], result); + ir.SetScc(ir.IEqual(result, src0)); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index 3c80764c..29f2acc2 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -7,6 +7,29 @@ namespace Shader::Gcn { static constexpr u32 SQ_SRC_LITERAL = 0xFF; +void Translator::EmitScalarMemory(const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::S_LOAD_DWORDX4: + return S_LOAD_DWORD(4, inst); + case Opcode::S_LOAD_DWORDX8: + return S_LOAD_DWORD(8, inst); + case Opcode::S_LOAD_DWORDX16: + return S_LOAD_DWORD(16, inst); + case Opcode::S_BUFFER_LOAD_DWORD: + return S_BUFFER_LOAD_DWORD(1, inst); + case Opcode::S_BUFFER_LOAD_DWORDX2: + return S_BUFFER_LOAD_DWORD(2, inst); + case Opcode::S_BUFFER_LOAD_DWORDX4: + return S_BUFFER_LOAD_DWORD(4, inst); + case Opcode::S_BUFFER_LOAD_DWORDX8: + return S_BUFFER_LOAD_DWORD(8, inst); + case 
Opcode::S_BUFFER_LOAD_DWORDX16: + return S_BUFFER_LOAD_DWORD(16, inst); + default: + LogMissingOpcode(inst); + } +} + void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) { const auto& smrd = inst.control.smrd; const u32 dword_offset = [&] -> u32 { diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 9e67e82e..e8c2a31c 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -16,13 +16,10 @@ namespace Shader::Gcn { -std::array Translator::exec_contexts{}; - -Translator::Translator(IR::Block* block_, Info& info_) - : ir{*block_, block_->begin()}, info{info_} {} +Translator::Translator(IR::Block* block_, Info& info_, const Profile& profile_) + : ir{*block_, block_->begin()}, info{info_}, profile{profile_} {} void Translator::EmitPrologue() { - exec_contexts.fill(false); ir.Prologue(); ir.SetExec(ir.Imm1(true)); @@ -76,28 +73,28 @@ void Translator::EmitPrologue() { } } +template <> IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { - // Input modifiers work on float values. 
- force_flt |= operand.input_modifier.abs | operand.input_modifier.neg; - IR::U32F32 value{}; + + const bool is_float = operand.type == ScalarType::Float32 || force_flt; switch (operand.field) { case OperandField::ScalarGPR: - if (operand.type == ScalarType::Float32 || force_flt) { + if (is_float) { value = ir.GetScalarReg(IR::ScalarReg(operand.code)); } else { value = ir.GetScalarReg(IR::ScalarReg(operand.code)); } break; case OperandField::VectorGPR: - if (operand.type == ScalarType::Float32 || force_flt) { + if (is_float) { value = ir.GetVectorReg(IR::VectorReg(operand.code)); } else { value = ir.GetVectorReg(IR::VectorReg(operand.code)); } break; case OperandField::ConstZero: - if (force_flt) { + if (is_float) { value = ir.Imm32(0.f); } else { value = ir.Imm32(0U); @@ -112,14 +109,14 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { value = ir.Imm32(-s32(operand.code) + SignedConstIntNegMin - 1); break; case OperandField::LiteralConst: - if (force_flt) { + if (is_float) { value = ir.Imm32(std::bit_cast(operand.code)); } else { value = ir.Imm32(operand.code); } break; case OperandField::ConstFloatPos_1_0: - if (force_flt) { + if (is_float) { value = ir.Imm32(1.f); } else { value = ir.Imm32(std::bit_cast(1.f)); @@ -138,7 +135,11 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { value = ir.Imm32(-0.5f); break; case OperandField::ConstFloatNeg_1_0: - value = ir.Imm32(-1.0f); + if (is_float) { + value = ir.Imm32(-1.0f); + } else { + value = ir.Imm32(std::bit_cast(-1.0f)); + } break; case OperandField::ConstFloatNeg_2_0: value = ir.Imm32(-2.0f); @@ -160,19 +161,166 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { value = ir.GetVccHi(); } break; + case OperandField::M0: + return m0_value; default: UNREACHABLE(); } - if (operand.input_modifier.abs) { - value = ir.FPAbs(value); - } - if (operand.input_modifier.neg) { - value = ir.FPNeg(value); + if (is_float) { + if 
(operand.input_modifier.abs) { + value = ir.FPAbs(value); + } + if (operand.input_modifier.neg) { + value = ir.FPNeg(value); + } } return value; } +template <> +IR::U32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { + return GetSrc(operand, force_flt); +} + +template <> +IR::F32 Translator::GetSrc(const InstOperand& operand, bool) { + return GetSrc(operand, true); +} + +template <> +IR::U64F64 Translator::GetSrc64(const InstOperand& operand, bool force_flt) { + IR::Value value_hi{}; + IR::Value value_lo{}; + + bool immediate = false; + const bool is_float = operand.type == ScalarType::Float64 || force_flt; + switch (operand.field) { + case OperandField::ScalarGPR: + if (is_float) { + value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); + value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); + } else if (operand.type == ScalarType::Uint64 || operand.type == ScalarType::Sint64) { + value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); + value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); + } else { + UNREACHABLE(); + } + break; + case OperandField::VectorGPR: + if (is_float) { + value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); + value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); + } else if (operand.type == ScalarType::Uint64 || operand.type == ScalarType::Sint64) { + value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); + value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); + } else { + UNREACHABLE(); + } + break; + case OperandField::ConstZero: + immediate = true; + if (force_flt) { + value_lo = ir.Imm64(0.0); + } else { + value_lo = ir.Imm64(u64(0U)); + } + break; + case OperandField::SignedConstIntPos: + ASSERT(!force_flt); + immediate = true; + value_lo = ir.Imm64(s64(operand.code) - SignedConstIntPosMin + 1); + break; + case OperandField::SignedConstIntNeg: + ASSERT(!force_flt); + immediate = true; + value_lo = ir.Imm64(-s64(operand.code) + SignedConstIntNegMin - 1); + break; + case 
OperandField::LiteralConst: + immediate = true; + if (force_flt) { + UNREACHABLE(); // There is a literal double? + } else { + value_lo = ir.Imm64(u64(operand.code)); + } + break; + case OperandField::ConstFloatPos_1_0: + immediate = true; + if (force_flt) { + value_lo = ir.Imm64(1.0); + } else { + value_lo = ir.Imm64(std::bit_cast(f64(1.0))); + } + break; + case OperandField::ConstFloatPos_0_5: + immediate = true; + value_lo = ir.Imm64(0.5); + break; + case OperandField::ConstFloatPos_2_0: + immediate = true; + value_lo = ir.Imm64(2.0); + break; + case OperandField::ConstFloatPos_4_0: + immediate = true; + value_lo = ir.Imm64(4.0); + break; + case OperandField::ConstFloatNeg_0_5: + immediate = true; + value_lo = ir.Imm64(-0.5); + break; + case OperandField::ConstFloatNeg_1_0: + immediate = true; + value_lo = ir.Imm64(-1.0); + break; + case OperandField::ConstFloatNeg_2_0: + immediate = true; + value_lo = ir.Imm64(-2.0); + break; + case OperandField::ConstFloatNeg_4_0: + immediate = true; + value_lo = ir.Imm64(-4.0); + break; + case OperandField::VccLo: { + value_lo = ir.GetVccLo(); + value_hi = ir.GetVccHi(); + } break; + case OperandField::VccHi: + UNREACHABLE(); + default: + UNREACHABLE(); + } + + IR::Value value; + + if (immediate) { + value = value_lo; + } else if (is_float) { + throw NotImplementedException("required OpPackDouble2x32 implementation"); + } else { + IR::Value packed = ir.CompositeConstruct(value_lo, value_hi); + value = ir.PackUint2x32(packed); + } + + if (is_float) { + if (operand.input_modifier.abs) { + value = ir.FPAbs(IR::F32F64(value)); + } + if (operand.input_modifier.neg) { + value = ir.FPNeg(IR::F32F64(value)); + } + } + return IR::U64F64(value); +} + +template <> +IR::U64 Translator::GetSrc64(const InstOperand& operand, bool force_flt) { + return GetSrc64(operand, force_flt); +} +template <> +IR::F64 Translator::GetSrc64(const InstOperand& operand, bool) { + return GetSrc64(operand, true); +} + void Translator::SetDst(const 
InstOperand& operand, const IR::U32F32& value) { IR::U32F32 result = value; if (operand.output_modifier.multiplier != 0.f) { @@ -190,6 +338,44 @@ void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) { return ir.SetVccLo(result); case OperandField::VccHi: return ir.SetVccHi(result); + case OperandField::M0: + m0_value = result; + break; + default: + UNREACHABLE(); + } +} + +void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_raw) { + IR::U64F64 value_untyped = value_raw; + + const bool is_float = value_raw.Type() == IR::Type::F64 || value_raw.Type() == IR::Type::F32; + if (is_float) { + if (operand.output_modifier.multiplier != 0.f) { + value_untyped = + ir.FPMul(value_untyped, ir.Imm64(f64(operand.output_modifier.multiplier))); + } + if (operand.output_modifier.clamp) { + value_untyped = ir.FPSaturate(value_raw); + } + } + const IR::U64 value = + is_float ? ir.BitCast(IR::F64{value_untyped}) : IR::U64{value_untyped}; + + const IR::Value unpacked{ir.UnpackUint2x32(value)}; + const IR::U32 lo{ir.CompositeExtract(unpacked, 0U)}; + const IR::U32 hi{ir.CompositeExtract(unpacked, 1U)}; + switch (operand.field) { + case OperandField::ScalarGPR: + ir.SetScalarReg(IR::ScalarReg(operand.code + 1), hi); + return ir.SetScalarReg(IR::ScalarReg(operand.code), lo); + case OperandField::VectorGPR: + ir.SetVectorReg(IR::VectorReg(operand.code + 1), hi); + return ir.SetVectorReg(IR::VectorReg(operand.code), lo); + case OperandField::VccLo: + UNREACHABLE(); + case OperandField::VccHi: + UNREACHABLE(); case OperandField::M0: break; default: @@ -276,703 +462,84 @@ void Translator::EmitFetch(const GcnInst& inst) { } } -void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info) { +void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::S_BARRIER: + return S_BARRIER(); + case Opcode::S_TTRACEDATA: + LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!"); + return; + 
case Opcode::S_GETPC_B64: + return S_GETPC_B64(pc, inst); + case Opcode::S_WAITCNT: + case Opcode::S_NOP: + case Opcode::S_ENDPGM: + case Opcode::S_CBRANCH_EXECZ: + case Opcode::S_CBRANCH_SCC0: + case Opcode::S_CBRANCH_SCC1: + case Opcode::S_CBRANCH_VCCNZ: + case Opcode::S_CBRANCH_VCCZ: + case Opcode::S_BRANCH: + return; + default: + UNREACHABLE(); + } +} + +void Translator::LogMissingOpcode(const GcnInst& inst) { + const u32 opcode = u32(inst.opcode); + LOG_ERROR(Render_Recompiler, "Unknown opcode {} ({}, category = {})", + magic_enum::enum_name(inst.opcode), u32(inst.opcode), + magic_enum::enum_name(inst.category)); + info.translation_failed = true; +} + +void Translate(IR::Block* block, u32 pc, std::span inst_list, Info& info, + const Profile& profile) { if (inst_list.empty()) { return; } - Translator translator{block, info}; + Translator translator{block, info, profile}; for (const auto& inst : inst_list) { - block_base += inst.length; - switch (inst.opcode) { - case Opcode::S_MOVK_I32: - translator.S_MOVK(inst); - break; - case Opcode::S_MOV_B32: - translator.S_MOV(inst); - break; - case Opcode::S_MUL_I32: - translator.S_MUL_I32(inst); - break; - case Opcode::V_MAD_F32: - translator.V_MAD_F32(inst); - break; - case Opcode::V_MOV_B32: - translator.V_MOV(inst); - break; - case Opcode::V_MAC_F32: - translator.V_MAC_F32(inst); - break; - case Opcode::V_MUL_F32: - translator.V_MUL_F32(inst); - break; - case Opcode::V_AND_B32: - translator.V_AND_B32(inst); - break; - case Opcode::V_OR_B32: - translator.V_OR_B32(false, inst); - break; - case Opcode::V_XOR_B32: - translator.V_OR_B32(true, inst); - break; - case Opcode::V_LSHLREV_B32: - translator.V_LSHLREV_B32(inst); - break; - case Opcode::V_ADD_I32: - translator.V_ADD_I32(inst); - break; - case Opcode::V_CVT_F32_I32: - translator.V_CVT_F32_I32(inst); - break; - case Opcode::V_CVT_F32_U32: - translator.V_CVT_F32_U32(inst); - break; - case Opcode::V_RCP_F32: - translator.V_RCP_F32(inst); - break; - case 
Opcode::S_SWAPPC_B64: + pc += inst.length; + + // Special case for emitting fetch shader. + if (inst.opcode == Opcode::S_SWAPPC_B64) { ASSERT(info.stage == Stage::Vertex); translator.EmitFetch(inst); - break; - case Opcode::S_WAITCNT: - break; - case Opcode::S_LOAD_DWORDX4: - translator.S_LOAD_DWORD(4, inst); - break; - case Opcode::S_LOAD_DWORDX8: - translator.S_LOAD_DWORD(8, inst); - break; - case Opcode::S_LOAD_DWORDX16: - translator.S_LOAD_DWORD(16, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORD: - translator.S_BUFFER_LOAD_DWORD(1, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORDX2: - translator.S_BUFFER_LOAD_DWORD(2, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORDX4: - translator.S_BUFFER_LOAD_DWORD(4, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORDX8: - translator.S_BUFFER_LOAD_DWORD(8, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORDX16: - translator.S_BUFFER_LOAD_DWORD(16, inst); - break; - case Opcode::EXP: - translator.EXP(inst); - break; - case Opcode::V_INTERP_P2_F32: - translator.V_INTERP_P2_F32(inst); - break; - case Opcode::V_CVT_PKRTZ_F16_F32: - translator.V_CVT_PKRTZ_F16_F32(inst); - break; - case Opcode::V_CVT_F32_F16: - translator.V_CVT_F32_F16(inst); - break; - case Opcode::V_CVT_F32_UBYTE0: - translator.V_CVT_F32_UBYTE(0, inst); - break; - case Opcode::V_CVT_F32_UBYTE1: - translator.V_CVT_F32_UBYTE(1, inst); - break; - case Opcode::V_CVT_F32_UBYTE2: - translator.V_CVT_F32_UBYTE(2, inst); - break; - case Opcode::V_CVT_F32_UBYTE3: - translator.V_CVT_F32_UBYTE(3, inst); - break; - case Opcode::V_BFREV_B32: - translator.V_BFREV_B32(inst); - break; - case Opcode::V_LDEXP_F32: - translator.V_LDEXP_F32(inst); - break; - case Opcode::V_FRACT_F32: - translator.V_FRACT_F32(inst); - break; - case Opcode::V_ADD_F32: - translator.V_ADD_F32(inst); - break; - case Opcode::V_CVT_OFF_F32_I4: - translator.V_CVT_OFF_F32_I4(inst); - break; - case Opcode::V_MED3_F32: - translator.V_MED3_F32(inst); - break; - case Opcode::V_FLOOR_F32: - 
translator.V_FLOOR_F32(inst); - break; - case Opcode::V_SUB_F32: - translator.V_SUB_F32(inst); - break; - case Opcode::V_FMA_F32: - case Opcode::V_MADAK_F32: // Yes these can share the opcode - translator.V_FMA_F32(inst); - break; - case Opcode::IMAGE_SAMPLE_LZ_O: - case Opcode::IMAGE_SAMPLE_O: - case Opcode::IMAGE_SAMPLE_C: - case Opcode::IMAGE_SAMPLE_C_LZ: - case Opcode::IMAGE_SAMPLE_LZ: - case Opcode::IMAGE_SAMPLE: - case Opcode::IMAGE_SAMPLE_L: - case Opcode::IMAGE_SAMPLE_C_O: - case Opcode::IMAGE_SAMPLE_B: - case Opcode::IMAGE_SAMPLE_C_LZ_O: - translator.IMAGE_SAMPLE(inst); - break; - case Opcode::IMAGE_ATOMIC_ADD: - translator.IMAGE_ATOMIC(AtomicOp::Add, inst); - break; - case Opcode::IMAGE_ATOMIC_AND: - translator.IMAGE_ATOMIC(AtomicOp::And, inst); - break; - case Opcode::IMAGE_ATOMIC_OR: - translator.IMAGE_ATOMIC(AtomicOp::Or, inst); - break; - case Opcode::IMAGE_ATOMIC_XOR: - translator.IMAGE_ATOMIC(AtomicOp::Xor, inst); - break; - case Opcode::IMAGE_ATOMIC_UMAX: - translator.IMAGE_ATOMIC(AtomicOp::Umax, inst); - break; - case Opcode::IMAGE_ATOMIC_SMAX: - translator.IMAGE_ATOMIC(AtomicOp::Smax, inst); - break; - case Opcode::IMAGE_ATOMIC_UMIN: - translator.IMAGE_ATOMIC(AtomicOp::Umin, inst); - break; - case Opcode::IMAGE_ATOMIC_SMIN: - translator.IMAGE_ATOMIC(AtomicOp::Smin, inst); - break; - case Opcode::IMAGE_ATOMIC_INC: - translator.IMAGE_ATOMIC(AtomicOp::Inc, inst); - break; - case Opcode::IMAGE_ATOMIC_DEC: - translator.IMAGE_ATOMIC(AtomicOp::Dec, inst); - break; - case Opcode::IMAGE_GET_LOD: - translator.IMAGE_GET_LOD(inst); - break; - case Opcode::IMAGE_GATHER4_C: - case Opcode::IMAGE_GATHER4_LZ: - case Opcode::IMAGE_GATHER4_LZ_O: - translator.IMAGE_GATHER(inst); - break; - case Opcode::IMAGE_STORE: - translator.IMAGE_STORE(inst); - break; - case Opcode::IMAGE_LOAD_MIP: - translator.IMAGE_LOAD(true, inst); - break; - case Opcode::IMAGE_LOAD: - translator.IMAGE_LOAD(false, inst); - break; - case Opcode::V_CMP_GE_I32: - 
translator.V_CMP_U32(ConditionOp::GE, true, false, inst); - break; - case Opcode::V_CMP_EQ_I32: - translator.V_CMP_U32(ConditionOp::EQ, true, false, inst); - break; - case Opcode::V_CMP_LE_I32: - translator.V_CMP_U32(ConditionOp::LE, true, false, inst); - break; - case Opcode::V_CMP_NE_I32: - translator.V_CMP_U32(ConditionOp::LG, true, false, inst); - break; - case Opcode::V_CMP_NE_U32: - translator.V_CMP_U32(ConditionOp::LG, false, false, inst); - break; - case Opcode::V_CMP_EQ_U32: - translator.V_CMP_U32(ConditionOp::EQ, false, false, inst); - break; - case Opcode::V_CMP_F_U32: - translator.V_CMP_U32(ConditionOp::F, false, false, inst); - break; - case Opcode::V_CMP_LT_U32: - translator.V_CMP_U32(ConditionOp::LT, false, false, inst); - break; - case Opcode::V_CMP_GT_U32: - translator.V_CMP_U32(ConditionOp::GT, false, false, inst); - break; - case Opcode::V_CMP_GE_U32: - translator.V_CMP_U32(ConditionOp::GE, false, false, inst); - break; - case Opcode::V_CMP_TRU_U32: - translator.V_CMP_U32(ConditionOp::TRU, false, false, inst); - break; - case Opcode::V_CMP_NEQ_F32: - translator.V_CMP_F32(ConditionOp::LG, false, inst); - break; - case Opcode::V_CMP_F_F32: - translator.V_CMP_F32(ConditionOp::F, false, inst); - break; - case Opcode::V_CMP_LT_F32: - translator.V_CMP_F32(ConditionOp::LT, false, inst); - break; - case Opcode::V_CMP_EQ_F32: - translator.V_CMP_F32(ConditionOp::EQ, false, inst); - break; - case Opcode::V_CMP_LE_F32: - translator.V_CMP_F32(ConditionOp::LE, false, inst); - break; - case Opcode::V_CMP_GT_F32: - translator.V_CMP_F32(ConditionOp::GT, false, inst); - break; - case Opcode::V_CMP_LG_F32: - translator.V_CMP_F32(ConditionOp::LG, false, inst); - break; - case Opcode::V_CMP_GE_F32: - translator.V_CMP_F32(ConditionOp::GE, false, inst); - break; - case Opcode::V_CMP_NLE_F32: - translator.V_CMP_F32(ConditionOp::GT, false, inst); - break; - case Opcode::V_CMP_NLT_F32: - translator.V_CMP_F32(ConditionOp::GE, false, inst); - break; - case 
Opcode::V_CMP_NGT_F32: - translator.V_CMP_F32(ConditionOp::LE, false, inst); - break; - case Opcode::V_CMP_NGE_F32: - translator.V_CMP_F32(ConditionOp::LT, false, inst); - break; - case Opcode::S_CMP_LT_U32: - translator.S_CMP(ConditionOp::LT, false, inst); - break; - case Opcode::S_CMP_LE_U32: - translator.S_CMP(ConditionOp::LE, false, inst); - break; - case Opcode::S_CMP_LG_U32: - translator.S_CMP(ConditionOp::LG, false, inst); - break; - case Opcode::S_CMP_LT_I32: - translator.S_CMP(ConditionOp::LT, true, inst); - break; - case Opcode::S_CMP_LG_I32: - translator.S_CMP(ConditionOp::LG, true, inst); - break; - case Opcode::S_CMP_GT_I32: - translator.S_CMP(ConditionOp::GT, true, inst); - break; - case Opcode::S_CMP_GE_I32: - translator.S_CMP(ConditionOp::GE, true, inst); - break; - case Opcode::S_CMP_EQ_I32: - translator.S_CMP(ConditionOp::EQ, true, inst); - break; - case Opcode::S_CMP_EQ_U32: - translator.S_CMP(ConditionOp::EQ, false, inst); - break; - case Opcode::S_LSHL_B32: - translator.S_LSHL_B32(inst); - break; - case Opcode::V_CNDMASK_B32: - translator.V_CNDMASK_B32(inst); - break; - case Opcode::TBUFFER_LOAD_FORMAT_X: - translator.BUFFER_LOAD_FORMAT(1, true, true, inst); - break; - case Opcode::TBUFFER_LOAD_FORMAT_XY: - translator.BUFFER_LOAD_FORMAT(2, true, true, inst); - break; - case Opcode::TBUFFER_LOAD_FORMAT_XYZ: - translator.BUFFER_LOAD_FORMAT(3, true, true, inst); - break; - case Opcode::TBUFFER_LOAD_FORMAT_XYZW: - translator.BUFFER_LOAD_FORMAT(4, true, true, inst); - break; - case Opcode::BUFFER_LOAD_FORMAT_X: - translator.BUFFER_LOAD_FORMAT(1, false, true, inst); - break; - case Opcode::BUFFER_LOAD_FORMAT_XY: - translator.BUFFER_LOAD_FORMAT(2, false, true, inst); - break; - case Opcode::BUFFER_LOAD_FORMAT_XYZ: - translator.BUFFER_LOAD_FORMAT(3, false, true, inst); - break; - case Opcode::BUFFER_LOAD_FORMAT_XYZW: - translator.BUFFER_LOAD_FORMAT(4, false, true, inst); - break; - case Opcode::BUFFER_LOAD_DWORD: - translator.BUFFER_LOAD_FORMAT(1, 
false, false, inst); - break; - case Opcode::BUFFER_LOAD_DWORDX2: - translator.BUFFER_LOAD_FORMAT(2, false, false, inst); - break; - case Opcode::BUFFER_LOAD_DWORDX3: - translator.BUFFER_LOAD_FORMAT(3, false, false, inst); - break; - case Opcode::BUFFER_LOAD_DWORDX4: - translator.BUFFER_LOAD_FORMAT(4, false, false, inst); - break; - case Opcode::BUFFER_STORE_FORMAT_X: - case Opcode::BUFFER_STORE_DWORD: - translator.BUFFER_STORE_FORMAT(1, false, inst); - break; - case Opcode::BUFFER_STORE_DWORDX3: - translator.BUFFER_STORE_FORMAT(3, false, inst); - break; - case Opcode::BUFFER_STORE_FORMAT_XYZW: - case Opcode::BUFFER_STORE_DWORDX4: - translator.BUFFER_STORE_FORMAT(4, false, inst); - break; - case Opcode::V_MAX_F32: - translator.V_MAX_F32(inst); - break; - case Opcode::V_MAX_I32: - translator.V_MAX_U32(true, inst); - break; - case Opcode::V_MAX_U32: - translator.V_MAX_U32(false, inst); - break; - case Opcode::V_NOT_B32: - translator.V_NOT_B32(inst); - break; - case Opcode::V_RSQ_F32: - translator.V_RSQ_F32(inst); - break; - case Opcode::S_ANDN2_B64: - translator.S_AND_B64(NegateMode::Src1, inst); - break; - case Opcode::S_ORN2_B64: - translator.S_OR_B64(NegateMode::Src1, false, inst); - break; - case Opcode::V_SIN_F32: - translator.V_SIN_F32(inst); - break; - case Opcode::V_COS_F32: - translator.V_COS_F32(inst); - break; - case Opcode::V_LOG_F32: - translator.V_LOG_F32(inst); - break; - case Opcode::V_EXP_F32: - translator.V_EXP_F32(inst); - break; - case Opcode::V_SQRT_F32: - translator.V_SQRT_F32(inst); - break; - case Opcode::V_MIN_F32: - translator.V_MIN_F32(inst); - break; - case Opcode::V_MIN_I32: - translator.V_MIN_I32(inst); - break; - case Opcode::V_MIN3_F32: - translator.V_MIN3_F32(inst); - break; - case Opcode::V_MIN_LEGACY_F32: - translator.V_MIN_F32(inst, true); - break; - case Opcode::V_MADMK_F32: - translator.V_MADMK_F32(inst); - break; - case Opcode::V_CUBEMA_F32: - translator.V_CUBEMA_F32(inst); - break; - case Opcode::V_CUBESC_F32: - 
translator.V_CUBESC_F32(inst); - break; - case Opcode::V_CUBETC_F32: - translator.V_CUBETC_F32(inst); - break; - case Opcode::V_CUBEID_F32: - translator.V_CUBEID_F32(inst); - break; - case Opcode::V_CVT_U32_F32: - translator.V_CVT_U32_F32(inst); - break; - case Opcode::V_CVT_I32_F32: - translator.V_CVT_I32_F32(inst); - break; - case Opcode::V_CVT_FLR_I32_F32: - translator.V_CVT_FLR_I32_F32(inst); - break; - case Opcode::V_SUBREV_F32: - translator.V_SUBREV_F32(inst); - break; - case Opcode::S_AND_SAVEEXEC_B64: - translator.S_AND_SAVEEXEC_B64(inst); - break; - case Opcode::S_MOV_B64: - translator.S_MOV_B64(inst); - break; - case Opcode::V_SUBREV_I32: - translator.V_SUBREV_I32(inst); - break; + continue; + } - case Opcode::V_CMPX_F_F32: - translator.V_CMP_F32(ConditionOp::F, true, inst); + // Emit instructions for each category. + switch (inst.category) { + case InstCategory::DataShare: + translator.EmitDataShare(inst); break; - case Opcode::V_CMPX_LT_F32: - translator.V_CMP_F32(ConditionOp::LT, true, inst); + case InstCategory::VectorInterpolation: + translator.EmitVectorInterpolation(inst); break; - case Opcode::V_CMPX_EQ_F32: - translator.V_CMP_F32(ConditionOp::EQ, true, inst); + case InstCategory::ScalarMemory: + translator.EmitScalarMemory(inst); break; - case Opcode::V_CMPX_LE_F32: - translator.V_CMP_F32(ConditionOp::LE, true, inst); + case InstCategory::VectorMemory: + translator.EmitVectorMemory(inst); break; - case Opcode::V_CMPX_GT_F32: - translator.V_CMP_F32(ConditionOp::GT, true, inst); + case InstCategory::Export: + translator.EmitExport(inst); break; - case Opcode::V_CMPX_LG_F32: - translator.V_CMP_F32(ConditionOp::LG, true, inst); + case InstCategory::FlowControl: + translator.EmitFlowControl(pc, inst); break; - case Opcode::V_CMPX_GE_F32: - translator.V_CMP_F32(ConditionOp::GE, true, inst); + case InstCategory::ScalarALU: + translator.EmitScalarAlu(inst); break; - case Opcode::V_CMPX_NGE_F32: - translator.V_CMP_F32(ConditionOp::LT, true, inst); + case 
InstCategory::VectorALU: + translator.EmitVectorAlu(inst); break; - case Opcode::V_CMPX_NLG_F32: - translator.V_CMP_F32(ConditionOp::EQ, true, inst); - break; - case Opcode::V_CMPX_NGT_F32: - translator.V_CMP_F32(ConditionOp::LE, true, inst); - break; - case Opcode::V_CMPX_NLE_F32: - translator.V_CMP_F32(ConditionOp::GT, true, inst); - break; - case Opcode::V_CMPX_NEQ_F32: - translator.V_CMP_F32(ConditionOp::LG, true, inst); - break; - case Opcode::V_CMPX_NLT_F32: - translator.V_CMP_F32(ConditionOp::GE, true, inst); - break; - case Opcode::V_CMPX_TRU_F32: - translator.V_CMP_F32(ConditionOp::TRU, true, inst); - break; - case Opcode::V_CMP_LE_U32: - translator.V_CMP_U32(ConditionOp::LE, false, false, inst); - break; - case Opcode::V_CMP_GT_I32: - translator.V_CMP_U32(ConditionOp::GT, true, false, inst); - break; - case Opcode::V_CMP_LT_I32: - translator.V_CMP_U32(ConditionOp::LT, true, false, inst); - break; - case Opcode::V_CMPX_LT_I32: - translator.V_CMP_U32(ConditionOp::LT, true, true, inst); - break; - case Opcode::V_CMPX_F_U32: - translator.V_CMP_U32(ConditionOp::F, false, true, inst); - break; - case Opcode::V_CMPX_LT_U32: - translator.V_CMP_U32(ConditionOp::LT, false, true, inst); - break; - case Opcode::V_CMPX_EQ_U32: - translator.V_CMP_U32(ConditionOp::EQ, false, true, inst); - break; - case Opcode::V_CMPX_LE_U32: - translator.V_CMP_U32(ConditionOp::LE, false, true, inst); - break; - case Opcode::V_CMPX_GT_U32: - translator.V_CMP_U32(ConditionOp::GT, false, true, inst); - break; - case Opcode::V_CMPX_NE_U32: - translator.V_CMP_U32(ConditionOp::LG, false, true, inst); - break; - case Opcode::V_CMPX_GE_U32: - translator.V_CMP_U32(ConditionOp::GE, false, true, inst); - break; - case Opcode::V_CMPX_TRU_U32: - translator.V_CMP_U32(ConditionOp::TRU, false, true, inst); - break; - case Opcode::S_OR_B64: - translator.S_OR_B64(NegateMode::None, false, inst); - break; - case Opcode::S_NOR_B64: - translator.S_OR_B64(NegateMode::Result, false, inst); - break; - case 
Opcode::S_XOR_B64: - translator.S_OR_B64(NegateMode::None, true, inst); - break; - case Opcode::S_AND_B64: - translator.S_AND_B64(NegateMode::None, inst); - break; - case Opcode::S_NOT_B64: - translator.S_NOT_B64(inst); - break; - case Opcode::S_NAND_B64: - translator.S_AND_B64(NegateMode::Result, inst); - break; - case Opcode::V_LSHRREV_B32: - translator.V_LSHRREV_B32(inst); - break; - case Opcode::S_ADD_I32: - translator.S_ADD_I32(inst); - break; - case Opcode::V_MUL_HI_U32: - translator.V_MUL_HI_U32(false, inst); - break; - case Opcode::V_MUL_LO_I32: - translator.V_MUL_LO_U32(inst); - break; - case Opcode::V_SAD_U32: - translator.V_SAD_U32(inst); - break; - case Opcode::V_BFE_U32: - translator.V_BFE_U32(false, inst); - break; - case Opcode::V_BFE_I32: - translator.V_BFE_U32(true, inst); - break; - case Opcode::V_MAD_I32_I24: - translator.V_MAD_I32_I24(inst); - break; - case Opcode::V_MUL_I32_I24: - case Opcode::V_MUL_U32_U24: - translator.V_MUL_I32_I24(inst); - break; - case Opcode::V_SUB_I32: - translator.V_SUB_I32(inst); - break; - case Opcode::V_LSHR_B32: - translator.V_LSHR_B32(inst); - break; - case Opcode::V_ASHRREV_I32: - translator.V_ASHRREV_I32(inst); - break; - case Opcode::V_MAD_U32_U24: - translator.V_MAD_U32_U24(inst); - break; - case Opcode::S_AND_B32: - translator.S_AND_B32(inst); - break; - case Opcode::S_ASHR_I32: - translator.S_ASHR_I32(inst); - break; - case Opcode::S_OR_B32: - translator.S_OR_B32(inst); - break; - case Opcode::S_LSHR_B32: - translator.S_LSHR_B32(inst); - break; - case Opcode::S_CSELECT_B32: - translator.S_CSELECT_B32(inst); - break; - case Opcode::S_CSELECT_B64: - translator.S_CSELECT_B64(inst); - break; - case Opcode::S_BFE_U32: - translator.S_BFE_U32(inst); - break; - case Opcode::V_RNDNE_F32: - translator.V_RNDNE_F32(inst); - break; - case Opcode::V_BCNT_U32_B32: - translator.V_BCNT_U32_B32(inst); - break; - case Opcode::V_MAX3_F32: - translator.V_MAX3_F32(inst); - break; - case Opcode::DS_SWIZZLE_B32: - 
translator.DS_SWIZZLE_B32(inst); - break; - case Opcode::V_MUL_LO_U32: - translator.V_MUL_LO_U32(inst); - break; - case Opcode::S_BFM_B32: - translator.S_BFM_B32(inst); - break; - case Opcode::V_MIN_U32: - translator.V_MIN_U32(inst); - break; - case Opcode::V_CMP_NE_U64: - translator.V_CMP_NE_U64(inst); - break; - case Opcode::V_CMP_CLASS_F32: - translator.V_CMP_CLASS_F32(inst); - break; - case Opcode::V_TRUNC_F32: - translator.V_TRUNC_F32(inst); - break; - case Opcode::V_CEIL_F32: - translator.V_CEIL_F32(inst); - break; - case Opcode::V_BFI_B32: - translator.V_BFI_B32(inst); - break; - case Opcode::S_BREV_B32: - translator.S_BREV_B32(inst); - break; - case Opcode::S_ADD_U32: - translator.S_ADD_U32(inst); - break; - case Opcode::S_ADDC_U32: - translator.S_ADDC_U32(inst); - break; - case Opcode::S_SUB_U32: - case Opcode::S_SUB_I32: - translator.S_SUB_U32(inst); - break; - // TODO: Separate implementation for legacy variants. - case Opcode::V_MUL_LEGACY_F32: - translator.V_MUL_F32(inst); - break; - case Opcode::V_MAC_LEGACY_F32: - translator.V_MAC_F32(inst); - break; - case Opcode::V_MAD_LEGACY_F32: - translator.V_MAD_F32(inst); - break; - case Opcode::V_MAX_LEGACY_F32: - translator.V_MAX_F32(inst, true); - break; - case Opcode::V_RSQ_LEGACY_F32: - case Opcode::V_RSQ_CLAMP_F32: - translator.V_RSQ_F32(inst); - break; - case Opcode::V_RCP_IFLAG_F32: - translator.V_RCP_F32(inst); - break; - case Opcode::IMAGE_GET_RESINFO: - translator.IMAGE_GET_RESINFO(inst); - break; - case Opcode::S_BARRIER: - translator.S_BARRIER(); - break; - case Opcode::S_TTRACEDATA: - LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!"); - break; - case Opcode::DS_READ_B32: - translator.DS_READ(32, false, false, inst); - break; - case Opcode::DS_READ2_B32: - translator.DS_READ(32, false, true, inst); - break; - case Opcode::DS_WRITE_B32: - translator.DS_WRITE(32, false, false, inst); - break; - case Opcode::DS_WRITE2_B32: - translator.DS_WRITE(32, false, true, inst); - break; - case 
Opcode::V_READFIRSTLANE_B32: - translator.V_READFIRSTLANE_B32(inst); - break; - case Opcode::S_GETPC_B64: - translator.S_GETPC_B64(block_base, inst); - break; - case Opcode::S_NOP: - case Opcode::S_CBRANCH_EXECZ: - case Opcode::S_CBRANCH_SCC0: - case Opcode::S_CBRANCH_SCC1: - case Opcode::S_CBRANCH_VCCNZ: - case Opcode::S_CBRANCH_VCCZ: - case Opcode::S_BRANCH: - case Opcode::S_WQM_B64: - case Opcode::V_INTERP_P1_F32: - case Opcode::S_ENDPGM: + case InstCategory::DebugProfile: break; default: - const u32 opcode = u32(inst.opcode); - LOG_ERROR(Render_Recompiler, "Unknown opcode {} ({})", - magic_enum::enum_name(inst.opcode), opcode); - info.translation_failed = true; + UNREACHABLE(); } } } diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 2aa6f712..8d1b7683 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -11,7 +11,8 @@ namespace Shader { struct Info; -} +struct Profile; +} // namespace Shader namespace Shader::Gcn { @@ -24,6 +25,7 @@ enum class ConditionOp : u32 { LT, LE, TRU, + U, }; enum class AtomicOp : u32 { @@ -53,10 +55,19 @@ enum class NegateMode : u32 { class Translator { public: - explicit Translator(IR::Block* block_, Info& info); + explicit Translator(IR::Block* block_, Info& info, const Profile& profile); + // Instruction categories void EmitPrologue(); void EmitFetch(const GcnInst& inst); + void EmitDataShare(const GcnInst& inst); + void EmitVectorInterpolation(const GcnInst& inst); + void EmitScalarMemory(const GcnInst& inst); + void EmitVectorMemory(const GcnInst& inst); + void EmitExport(const GcnInst& inst); + void EmitFlowControl(u32 pc, const GcnInst& inst); + void EmitScalarAlu(const GcnInst& inst); + void EmitVectorAlu(const GcnInst& inst); // Scalar ALU void S_MOVK(const GcnInst& inst); @@ -83,6 +94,10 @@ public: void S_SUB_U32(const GcnInst& inst); void S_GETPC_B64(u32 pc, const GcnInst& 
inst); void S_ADDC_U32(const GcnInst& inst); + void S_MULK_I32(const GcnInst& inst); + void S_ADDK_I32(const GcnInst& inst); + void S_MAX_U32(const GcnInst& inst); + void S_MIN_U32(const GcnInst& inst); // Scalar Memory void S_LOAD_DWORD(int num_dwords, const GcnInst& inst); @@ -94,12 +109,15 @@ public: void V_MAC_F32(const GcnInst& inst); void V_CVT_PKRTZ_F16_F32(const GcnInst& inst); void V_CVT_F32_F16(const GcnInst& inst); + void V_CVT_F16_F32(const GcnInst& inst); void V_MUL_F32(const GcnInst& inst); void V_CNDMASK_B32(const GcnInst& inst); void V_OR_B32(bool is_xor, const GcnInst& inst); void V_AND_B32(const GcnInst& inst); void V_LSHLREV_B32(const GcnInst& inst); + void V_LSHL_B32(const GcnInst& inst); void V_ADD_I32(const GcnInst& inst); + void V_ADDC_U32(const GcnInst& inst); void V_CVT_F32_I32(const GcnInst& inst); void V_CVT_F32_U32(const GcnInst& inst); void V_MAD_F32(const GcnInst& inst); @@ -121,6 +139,7 @@ public: void V_SQRT_F32(const GcnInst& inst); void V_MIN_F32(const GcnInst& inst, bool is_legacy = false); void V_MIN3_F32(const GcnInst& inst); + void V_MIN3_I32(const GcnInst& inst); void V_MADMK_F32(const GcnInst& inst); void V_CUBEMA_F32(const GcnInst& inst); void V_CUBESC_F32(const GcnInst& inst); @@ -129,6 +148,7 @@ public: void V_CVT_U32_F32(const GcnInst& inst); void V_SUBREV_F32(const GcnInst& inst); void V_SUBREV_I32(const GcnInst& inst); + void V_MAD_U64_U32(const GcnInst& inst); void V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst); void V_LSHRREV_B32(const GcnInst& inst); void V_MUL_HI_U32(bool is_signed, const GcnInst& inst); @@ -144,6 +164,7 @@ public: void V_BCNT_U32_B32(const GcnInst& inst); void V_COS_F32(const GcnInst& inst); void V_MAX3_F32(const GcnInst& inst); + void V_MAX3_U32(const GcnInst& inst); void V_CVT_I32_F32(const GcnInst& inst); void V_MIN_I32(const GcnInst& inst); void V_MUL_LO_U32(const GcnInst& inst); @@ -158,6 +179,8 @@ public: void V_LDEXP_F32(const GcnInst& inst); void 
V_CVT_FLR_I32_F32(const GcnInst& inst); void V_CMP_CLASS_F32(const GcnInst& inst); + void V_FFBL_B32(const GcnInst& inst); + void V_MBCNT_U32_B32(bool is_low, const GcnInst& inst); // Vector Memory void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); @@ -165,12 +188,15 @@ public: // Vector interpolation void V_INTERP_P2_F32(const GcnInst& inst); + void V_INTERP_MOV_F32(const GcnInst& inst); // Data share void DS_SWIZZLE_B32(const GcnInst& inst); void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); void V_READFIRSTLANE_B32(const GcnInst& inst); + void V_READLANE_B32(const GcnInst& inst); + void V_WRITELANE_B32(const GcnInst& inst); void S_BARRIER(); // MIMG @@ -182,19 +208,25 @@ public: void IMAGE_GET_LOD(const GcnInst& inst); void IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst); - // Export - void EXP(const GcnInst& inst); - private: - IR::U32F32 GetSrc(const InstOperand& operand, bool flt_zero = false); + template + [[nodiscard]] T GetSrc(const InstOperand& operand, bool flt_zero = false); + template + [[nodiscard]] T GetSrc64(const InstOperand& operand, bool flt_zero = false); void SetDst(const InstOperand& operand, const IR::U32F32& value); + void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw); + + void LogMissingOpcode(const GcnInst& inst); private: IR::IREmitter ir; Info& info; - static std::array exec_contexts; + const Profile& profile; + IR::U32 m0_value; + bool opcode_missing = false; }; -void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info); +void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info, + const Profile& profile); } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index ca648f88..669ef7ca 100644 --- 
a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -2,9 +2,311 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "shader_recompiler/frontend/translate/translate.h" +#include "shader_recompiler/profile.h" namespace Shader::Gcn { +void Translator::EmitVectorAlu(const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::V_LSHLREV_B32: + return V_LSHLREV_B32(inst); + case Opcode::V_LSHL_B32: + return V_LSHL_B32(inst); + case Opcode::V_BFREV_B32: + return V_BFREV_B32(inst); + case Opcode::V_BFE_U32: + return V_BFE_U32(false, inst); + case Opcode::V_BFE_I32: + return V_BFE_U32(true, inst); + case Opcode::V_BFI_B32: + return V_BFI_B32(inst); + case Opcode::V_LSHR_B32: + return V_LSHR_B32(inst); + case Opcode::V_ASHRREV_I32: + return V_ASHRREV_I32(inst); + case Opcode::V_LSHRREV_B32: + return V_LSHRREV_B32(inst); + case Opcode::V_NOT_B32: + return V_NOT_B32(inst); + case Opcode::V_AND_B32: + return V_AND_B32(inst); + case Opcode::V_OR_B32: + return V_OR_B32(false, inst); + case Opcode::V_XOR_B32: + return V_OR_B32(true, inst); + case Opcode::V_FFBL_B32: + return V_FFBL_B32(inst); + + case Opcode::V_MOV_B32: + return V_MOV(inst); + case Opcode::V_ADD_I32: + return V_ADD_I32(inst); + case Opcode::V_ADDC_U32: + return V_ADDC_U32(inst); + case Opcode::V_CVT_F32_I32: + return V_CVT_F32_I32(inst); + case Opcode::V_CVT_F32_U32: + return V_CVT_F32_U32(inst); + case Opcode::V_CVT_PKRTZ_F16_F32: + return V_CVT_PKRTZ_F16_F32(inst); + case Opcode::V_CVT_F32_F16: + return V_CVT_F32_F16(inst); + case Opcode::V_CVT_F16_F32: + return V_CVT_F16_F32(inst); + case Opcode::V_CVT_F32_UBYTE0: + return V_CVT_F32_UBYTE(0, inst); + case Opcode::V_CVT_F32_UBYTE1: + return V_CVT_F32_UBYTE(1, inst); + case Opcode::V_CVT_F32_UBYTE2: + return V_CVT_F32_UBYTE(2, inst); + case Opcode::V_CVT_F32_UBYTE3: + return V_CVT_F32_UBYTE(3, inst); + case Opcode::V_CVT_OFF_F32_I4: + return V_CVT_OFF_F32_I4(inst); + case 
Opcode::V_MAD_U64_U32: + return V_MAD_U64_U32(inst); + case Opcode::V_CMP_GE_I32: + return V_CMP_U32(ConditionOp::GE, true, false, inst); + case Opcode::V_CMP_EQ_I32: + return V_CMP_U32(ConditionOp::EQ, true, false, inst); + case Opcode::V_CMP_LE_I32: + return V_CMP_U32(ConditionOp::LE, true, false, inst); + case Opcode::V_CMP_NE_I32: + return V_CMP_U32(ConditionOp::LG, true, false, inst); + case Opcode::V_CMP_NE_U32: + return V_CMP_U32(ConditionOp::LG, false, false, inst); + case Opcode::V_CMP_EQ_U32: + return V_CMP_U32(ConditionOp::EQ, false, false, inst); + case Opcode::V_CMP_F_U32: + return V_CMP_U32(ConditionOp::F, false, false, inst); + case Opcode::V_CMP_LT_U32: + return V_CMP_U32(ConditionOp::LT, false, false, inst); + case Opcode::V_CMP_GT_U32: + return V_CMP_U32(ConditionOp::GT, false, false, inst); + case Opcode::V_CMP_GE_U32: + return V_CMP_U32(ConditionOp::GE, false, false, inst); + case Opcode::V_CMP_TRU_U32: + return V_CMP_U32(ConditionOp::TRU, false, false, inst); + case Opcode::V_CMP_NEQ_F32: + return V_CMP_F32(ConditionOp::LG, false, inst); + case Opcode::V_CMP_F_F32: + return V_CMP_F32(ConditionOp::F, false, inst); + case Opcode::V_CMP_LT_F32: + return V_CMP_F32(ConditionOp::LT, false, inst); + case Opcode::V_CMP_EQ_F32: + return V_CMP_F32(ConditionOp::EQ, false, inst); + case Opcode::V_CMP_LE_F32: + return V_CMP_F32(ConditionOp::LE, false, inst); + case Opcode::V_CMP_GT_F32: + return V_CMP_F32(ConditionOp::GT, false, inst); + case Opcode::V_CMP_LG_F32: + return V_CMP_F32(ConditionOp::LG, false, inst); + case Opcode::V_CMP_GE_F32: + return V_CMP_F32(ConditionOp::GE, false, inst); + case Opcode::V_CMP_NLE_F32: + return V_CMP_F32(ConditionOp::GT, false, inst); + case Opcode::V_CMP_NLT_F32: + return V_CMP_F32(ConditionOp::GE, false, inst); + case Opcode::V_CMP_NGT_F32: + return V_CMP_F32(ConditionOp::LE, false, inst); + case Opcode::V_CMP_NGE_F32: + return V_CMP_F32(ConditionOp::LT, false, inst); + case Opcode::V_CMP_U_F32: + return 
V_CMP_F32(ConditionOp::U, false, inst); + case Opcode::V_CNDMASK_B32: + return V_CNDMASK_B32(inst); + case Opcode::V_MAX_I32: + return V_MAX_U32(true, inst); + case Opcode::V_MAX_U32: + return V_MAX_U32(false, inst); + case Opcode::V_MIN_I32: + return V_MIN_I32(inst); + case Opcode::V_CUBEMA_F32: + return V_CUBEMA_F32(inst); + case Opcode::V_CUBESC_F32: + return V_CUBESC_F32(inst); + case Opcode::V_CUBETC_F32: + return V_CUBETC_F32(inst); + case Opcode::V_CUBEID_F32: + return V_CUBEID_F32(inst); + case Opcode::V_CVT_U32_F32: + return V_CVT_U32_F32(inst); + case Opcode::V_CVT_I32_F32: + return V_CVT_I32_F32(inst); + case Opcode::V_CVT_FLR_I32_F32: + return V_CVT_FLR_I32_F32(inst); + case Opcode::V_SUBREV_I32: + return V_SUBREV_I32(inst); + case Opcode::V_MUL_HI_U32: + return V_MUL_HI_U32(false, inst); + case Opcode::V_MUL_LO_I32: + return V_MUL_LO_U32(inst); + case Opcode::V_SAD_U32: + return V_SAD_U32(inst); + case Opcode::V_SUB_I32: + return V_SUB_I32(inst); + case Opcode::V_MAD_I32_I24: + return V_MAD_I32_I24(inst); + case Opcode::V_MUL_I32_I24: + case Opcode::V_MUL_U32_U24: + return V_MUL_I32_I24(inst); + case Opcode::V_MAD_U32_U24: + return V_MAD_U32_U24(inst); + case Opcode::V_BCNT_U32_B32: + return V_BCNT_U32_B32(inst); + case Opcode::V_MUL_LO_U32: + return V_MUL_LO_U32(inst); + case Opcode::V_MIN_U32: + return V_MIN_U32(inst); + case Opcode::V_CMP_NE_U64: + return V_CMP_NE_U64(inst); + case Opcode::V_READFIRSTLANE_B32: + return V_READFIRSTLANE_B32(inst); + case Opcode::V_READLANE_B32: + return V_READLANE_B32(inst); + case Opcode::V_WRITELANE_B32: + return V_WRITELANE_B32(inst); + + case Opcode::V_MAD_F32: + return V_MAD_F32(inst); + case Opcode::V_MAC_F32: + return V_MAC_F32(inst); + case Opcode::V_MUL_F32: + return V_MUL_F32(inst); + case Opcode::V_RCP_F32: + return V_RCP_F32(inst); + case Opcode::V_LDEXP_F32: + return V_LDEXP_F32(inst); + case Opcode::V_FRACT_F32: + return V_FRACT_F32(inst); + case Opcode::V_ADD_F32: + return V_ADD_F32(inst); + case 
Opcode::V_MED3_F32: + return V_MED3_F32(inst); + case Opcode::V_FLOOR_F32: + return V_FLOOR_F32(inst); + case Opcode::V_SUB_F32: + return V_SUB_F32(inst); + case Opcode::V_FMA_F32: + case Opcode::V_MADAK_F32: + return V_FMA_F32(inst); + case Opcode::V_MAX_F32: + return V_MAX_F32(inst); + case Opcode::V_RSQ_F32: + return V_RSQ_F32(inst); + case Opcode::V_SIN_F32: + return V_SIN_F32(inst); + case Opcode::V_COS_F32: + return V_COS_F32(inst); + case Opcode::V_LOG_F32: + return V_LOG_F32(inst); + case Opcode::V_EXP_F32: + return V_EXP_F32(inst); + case Opcode::V_SQRT_F32: + return V_SQRT_F32(inst); + case Opcode::V_MIN_F32: + return V_MIN_F32(inst, false); + case Opcode::V_MIN3_F32: + return V_MIN3_F32(inst); + case Opcode::V_MIN3_I32: + return V_MIN3_I32(inst); + case Opcode::V_MIN_LEGACY_F32: + return V_MIN_F32(inst, true); + case Opcode::V_MADMK_F32: + return V_MADMK_F32(inst); + case Opcode::V_SUBREV_F32: + return V_SUBREV_F32(inst); + case Opcode::V_RNDNE_F32: + return V_RNDNE_F32(inst); + case Opcode::V_MAX3_F32: + return V_MAX3_F32(inst); + case Opcode::V_MAX3_U32: + return V_MAX3_U32(inst); + case Opcode::V_TRUNC_F32: + return V_TRUNC_F32(inst); + case Opcode::V_CEIL_F32: + return V_CEIL_F32(inst); + case Opcode::V_MUL_LEGACY_F32: + return V_MUL_F32(inst); + case Opcode::V_MAC_LEGACY_F32: + return V_MAC_F32(inst); + case Opcode::V_MAD_LEGACY_F32: + return V_MAD_F32(inst); + case Opcode::V_MAX_LEGACY_F32: + return V_MAX_F32(inst, true); + case Opcode::V_RSQ_LEGACY_F32: + case Opcode::V_RSQ_CLAMP_F32: + return V_RSQ_F32(inst); + case Opcode::V_RCP_IFLAG_F32: + return V_RCP_F32(inst); + + case Opcode::V_CMPX_F_F32: + return V_CMP_F32(ConditionOp::F, true, inst); + case Opcode::V_CMPX_LT_F32: + return V_CMP_F32(ConditionOp::LT, true, inst); + case Opcode::V_CMPX_EQ_F32: + return V_CMP_F32(ConditionOp::EQ, true, inst); + case Opcode::V_CMPX_LE_F32: + return V_CMP_F32(ConditionOp::LE, true, inst); + case Opcode::V_CMPX_GT_F32: + return V_CMP_F32(ConditionOp::GT, true, 
inst); + case Opcode::V_CMPX_LG_F32: + return V_CMP_F32(ConditionOp::LG, true, inst); + case Opcode::V_CMPX_GE_F32: + return V_CMP_F32(ConditionOp::GE, true, inst); + case Opcode::V_CMPX_NGE_F32: + return V_CMP_F32(ConditionOp::LT, true, inst); + case Opcode::V_CMPX_NLG_F32: + return V_CMP_F32(ConditionOp::EQ, true, inst); + case Opcode::V_CMPX_NGT_F32: + return V_CMP_F32(ConditionOp::LE, true, inst); + case Opcode::V_CMPX_NLE_F32: + return V_CMP_F32(ConditionOp::GT, true, inst); + case Opcode::V_CMPX_NEQ_F32: + return V_CMP_F32(ConditionOp::LG, true, inst); + case Opcode::V_CMPX_NLT_F32: + return V_CMP_F32(ConditionOp::GE, true, inst); + case Opcode::V_CMPX_TRU_F32: + return V_CMP_F32(ConditionOp::TRU, true, inst); + case Opcode::V_CMP_CLASS_F32: + return V_CMP_CLASS_F32(inst); + + case Opcode::V_CMP_LE_U32: + return V_CMP_U32(ConditionOp::LE, false, false, inst); + case Opcode::V_CMP_GT_I32: + return V_CMP_U32(ConditionOp::GT, true, false, inst); + case Opcode::V_CMP_LT_I32: + return V_CMP_U32(ConditionOp::LT, true, false, inst); + case Opcode::V_CMPX_LT_I32: + return V_CMP_U32(ConditionOp::LT, true, true, inst); + case Opcode::V_CMPX_F_U32: + return V_CMP_U32(ConditionOp::F, false, true, inst); + case Opcode::V_CMPX_LT_U32: + return V_CMP_U32(ConditionOp::LT, false, true, inst); + case Opcode::V_CMPX_EQ_U32: + return V_CMP_U32(ConditionOp::EQ, false, true, inst); + case Opcode::V_CMPX_LE_U32: + return V_CMP_U32(ConditionOp::LE, false, true, inst); + case Opcode::V_CMPX_GT_U32: + return V_CMP_U32(ConditionOp::GT, false, true, inst); + case Opcode::V_CMPX_NE_U32: + return V_CMP_U32(ConditionOp::LG, false, true, inst); + case Opcode::V_CMPX_GE_U32: + return V_CMP_U32(ConditionOp::GE, false, true, inst); + case Opcode::V_CMPX_TRU_U32: + return V_CMP_U32(ConditionOp::TRU, false, true, inst); + case Opcode::V_CMPX_LG_I32: + return V_CMP_U32(ConditionOp::LG, true, true, inst); + + case Opcode::V_MBCNT_LO_U32_B32: + return V_MBCNT_U32_B32(true, inst); + case 
Opcode::V_MBCNT_HI_U32_B32: + return V_MBCNT_U32_B32(false, inst); + default: + LogMissingOpcode(inst); + } +} + void Translator::V_MOV(const GcnInst& inst) { SetDst(inst.dst[0], GetSrc(inst.src[0])); } @@ -32,6 +334,12 @@ void Translator::V_CVT_F32_F16(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPConvert(32, ir.BitCast(src0l))); } +void Translator::V_CVT_F16_F32(const GcnInst& inst) { + const IR::F32 src0 = GetSrc(inst.src[0], true); + const IR::F16 src0fp16 = ir.FPConvert(16, src0); + SetDst(inst.dst[0], ir.UConvert(32, ir.BitCast(src0fp16))); +} + void Translator::V_MUL_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPMul(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true))); } @@ -67,7 +375,8 @@ void Translator::V_OR_B32(bool is_xor, const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{ir.GetVectorReg(IR::VectorReg(inst.src[1].code))}; const IR::VectorReg dst_reg{inst.dst[0].code}; - ir.SetVectorReg(dst_reg, is_xor ? ir.BitwiseXor(src0, src1) : ir.BitwiseOr(src0, src1)); + ir.SetVectorReg(dst_reg, + is_xor ? 
ir.BitwiseXor(src0, src1) : IR::U32(ir.BitwiseOr(src0, src1))); } void Translator::V_AND_B32(const GcnInst& inst) { @@ -84,6 +393,12 @@ void Translator::V_LSHLREV_B32(const GcnInst& inst) { ir.SetVectorReg(dst_reg, ir.ShiftLeftLogical(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F)))); } +void Translator::V_LSHL_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.ShiftLeftLogical(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F)))); +} + void Translator::V_ADD_I32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{ir.GetVectorReg(IR::VectorReg(inst.src[1].code))}; @@ -92,6 +407,30 @@ void Translator::V_ADD_I32(const GcnInst& inst) { // TODO: Carry } +void Translator::V_ADDC_U32(const GcnInst& inst) { + + const auto src0 = GetSrc(inst.src[0]); + const auto src1 = GetSrc(inst.src[1]); + + IR::U32 scarry; + if (inst.src_count == 3) { // VOP3 + IR::U1 thread_bit{ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[2].code))}; + scarry = IR::U32{ir.Select(thread_bit, ir.Imm32(1), ir.Imm32(0))}; + } else { // VOP2 + scarry = ir.GetVccLo(); + } + + const IR::U32 result = ir.IAdd(ir.IAdd(src0, src1), scarry); + + const IR::VectorReg dst_reg{inst.dst[0].code}; + ir.SetVectorReg(dst_reg, result); + + const IR::U1 less_src0 = ir.ILessThan(result, src0, false); + const IR::U1 less_src1 = ir.ILessThan(result, src1, false); + const IR::U1 did_overflow = ir.LogicalOr(less_src0, less_src1); + ir.SetVcc(did_overflow); +} + void Translator::V_CVT_F32_I32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::VectorReg dst_reg{inst.dst[0].code}; @@ -183,6 +522,8 @@ void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) { return ir.FPLessThanEqual(src0, src1); case ConditionOp::GE: return ir.FPGreaterThanEqual(src0, src1); + case ConditionOp::U: + return ir.LogicalNot(ir.LogicalAnd(ir.FPIsNan(src0), ir.FPIsNan(src1))); default: UNREACHABLE(); } @@ 
-253,6 +594,13 @@ void Translator::V_MIN3_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPMin(src0, ir.FPMin(src1, src2))); } +void Translator::V_MIN3_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 src2{GetSrc(inst.src[2])}; + SetDst(inst.dst[0], ir.SMin(src0, ir.SMin(src1, src2))); +} + void Translator::V_MADMK_F32(const GcnInst& inst) { const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::F32 src1{GetSrc(inst.src[1], true)}; @@ -294,6 +642,24 @@ void Translator::V_SUBREV_I32(const GcnInst& inst) { // TODO: Carry-out } +void Translator::V_MAD_U64_U32(const GcnInst& inst) { + const auto src0 = GetSrc(inst.src[0]); + const auto src1 = GetSrc(inst.src[1]); + const auto src2 = GetSrc64(inst.src[2]); + + // const IR::U64 mul_result = ir.UConvert(64, ir.IMul(src0, src1)); + const IR::U64 mul_result = + ir.PackUint2x32(ir.CompositeConstruct(ir.IMul(src0, src1), ir.Imm32(0U))); + const IR::U64 sum_result = ir.IAdd(mul_result, src2); + + SetDst64(inst.dst[0], sum_result); + + const IR::U1 less_src0 = ir.ILessThan(sum_result, mul_result, false); + const IR::U1 less_src1 = ir.ILessThan(sum_result, src2, false); + const IR::U1 did_overflow = ir.LogicalOr(less_src0, less_src1); + ir.SetVcc(did_overflow); +} + void Translator::V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; @@ -421,6 +787,13 @@ void Translator::V_MAX3_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2))); } +void Translator::V_MAX3_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 src2{GetSrc(inst.src[2])}; + SetDst(inst.dst[0], ir.UMax(src0, ir.UMax(src1, src2))); +} + void Translator::V_CVT_I32_F32(const GcnInst& inst) { const IR::F32 src0{GetSrc(inst.src[0], true)}; SetDst(inst.dst[0], 
ir.ConvertFToS(32, src0)); @@ -519,38 +892,58 @@ void Translator::V_CVT_FLR_I32_F32(const GcnInst& inst) { } void Translator::V_CMP_CLASS_F32(const GcnInst& inst) { - constexpr u32 SIGNALING_NAN = 1 << 0; - constexpr u32 QUIET_NAN = 1 << 1; - constexpr u32 NEGATIVE_INFINITY = 1 << 2; - constexpr u32 NEGATIVE_NORMAL = 1 << 3; - constexpr u32 NEGATIVE_DENORM = 1 << 4; - constexpr u32 NEGATIVE_ZERO = 1 << 5; - constexpr u32 POSITIVE_ZERO = 1 << 6; - constexpr u32 POSITIVE_DENORM = 1 << 7; - constexpr u32 POSITIVE_NORMAL = 1 << 8; - constexpr u32 POSITIVE_INFINITY = 1 << 9; - const IR::F32F64 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; + IR::U1 value; if (src1.IsImmediate()) { - const u32 class_mask = src1.U32(); - IR::U1 value; - if ((class_mask & (SIGNALING_NAN | QUIET_NAN)) == (SIGNALING_NAN | QUIET_NAN)) { + const auto class_mask = static_cast(src1.U32()); + if ((class_mask & IR::FloatClassFunc::NaN) == IR::FloatClassFunc::NaN) { value = ir.FPIsNan(src0); - } else if ((class_mask & (POSITIVE_INFINITY | NEGATIVE_INFINITY)) == - (POSITIVE_INFINITY | NEGATIVE_INFINITY)) { + } else if ((class_mask & IR::FloatClassFunc::Infinity) == IR::FloatClassFunc::Infinity) { value = ir.FPIsInf(src0); } else { UNREACHABLE(); } - if (inst.dst[1].field == OperandField::VccLo) { - return ir.SetVcc(value); - } else { - UNREACHABLE(); - } } else { + // We don't know the type yet, delay its resolution. 
+ value = ir.FPCmpClass32(src0, src1); + } + + switch (inst.dst[1].field) { + case OperandField::VccLo: + return ir.SetVcc(value); + default: UNREACHABLE(); } } +void Translator::V_FFBL_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + SetDst(inst.dst[0], ir.FindILsb(src0)); +} + +void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 lane_id = ir.LaneId(); + + const auto [warp_half, mask_shift] = [&]() -> std::pair { + if (profile.subgroup_size == 32) { + const IR::U32 warp_half = ir.BitwiseAnd(ir.WarpId(), ir.Imm32(1)); + return std::make_pair(warp_half, lane_id); + } + const IR::U32 warp_half = ir.ShiftRightLogical(lane_id, ir.Imm32(5)); + const IR::U32 mask_shift = ir.BitwiseAnd(lane_id, ir.Imm32(0x1F)); + return std::make_pair(warp_half, mask_shift); + }(); + + const IR::U32 thread_mask = ir.ISub(ir.ShiftLeftLogical(ir.Imm32(1), mask_shift), ir.Imm32(1)); + const IR::U1 is_odd_warp = ir.INotEqual(warp_half, ir.Imm32(0)); + const IR::U32 mask = IR::U32{ir.Select(is_odd_warp, is_low ? ir.Imm32(~0U) : thread_mask, + is_low ? 
thread_mask : ir.Imm32(0))}; + const IR::U32 masked_value = ir.BitwiseAnd(src0, mask); + const IR::U32 result = ir.IAdd(src1, ir.BitCount(masked_value)); + SetDst(inst.dst[0], result); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp index 55a2d624..4ff846cf 100644 --- a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp +++ b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp @@ -12,4 +12,24 @@ void Translator::V_INTERP_P2_F32(const GcnInst& inst) { ir.SetVectorReg(dst_reg, ir.GetAttribute(attrib, inst.control.vintrp.chan)); } +void Translator::V_INTERP_MOV_F32(const GcnInst& inst) { + const IR::VectorReg dst_reg{inst.dst[0].code}; + auto& attr = info.ps_inputs.at(inst.control.vintrp.attr); + const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index}; + ir.SetVectorReg(dst_reg, ir.GetAttribute(attrib, inst.control.vintrp.chan)); +} + +void Translator::EmitVectorInterpolation(const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::V_INTERP_P1_F32: + return; + case Opcode::V_INTERP_P2_F32: + return V_INTERP_P2_F32(inst); + case Opcode::V_INTERP_MOV_F32: + return V_INTERP_MOV_F32(inst); + default: + LogMissingOpcode(inst); + } +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index f4383c61..c667968a 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -5,9 +5,96 @@ namespace Shader::Gcn { +void Translator::EmitVectorMemory(const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::IMAGE_SAMPLE_LZ_O: + case Opcode::IMAGE_SAMPLE_O: + case Opcode::IMAGE_SAMPLE_C: + case Opcode::IMAGE_SAMPLE_C_LZ: + case Opcode::IMAGE_SAMPLE_LZ: + case Opcode::IMAGE_SAMPLE: + case Opcode::IMAGE_SAMPLE_L: + case 
Opcode::IMAGE_SAMPLE_C_O: + case Opcode::IMAGE_SAMPLE_B: + case Opcode::IMAGE_SAMPLE_C_LZ_O: + return IMAGE_SAMPLE(inst); + case Opcode::IMAGE_GATHER4_C: + case Opcode::IMAGE_GATHER4_LZ: + case Opcode::IMAGE_GATHER4_LZ_O: + return IMAGE_GATHER(inst); + case Opcode::IMAGE_ATOMIC_ADD: + return IMAGE_ATOMIC(AtomicOp::Add, inst); + case Opcode::IMAGE_ATOMIC_AND: + return IMAGE_ATOMIC(AtomicOp::And, inst); + case Opcode::IMAGE_ATOMIC_OR: + return IMAGE_ATOMIC(AtomicOp::Or, inst); + case Opcode::IMAGE_ATOMIC_XOR: + return IMAGE_ATOMIC(AtomicOp::Xor, inst); + case Opcode::IMAGE_ATOMIC_UMAX: + return IMAGE_ATOMIC(AtomicOp::Umax, inst); + case Opcode::IMAGE_ATOMIC_SMAX: + return IMAGE_ATOMIC(AtomicOp::Smax, inst); + case Opcode::IMAGE_ATOMIC_UMIN: + return IMAGE_ATOMIC(AtomicOp::Umin, inst); + case Opcode::IMAGE_ATOMIC_SMIN: + return IMAGE_ATOMIC(AtomicOp::Smin, inst); + case Opcode::IMAGE_ATOMIC_INC: + return IMAGE_ATOMIC(AtomicOp::Inc, inst); + case Opcode::IMAGE_ATOMIC_DEC: + return IMAGE_ATOMIC(AtomicOp::Dec, inst); + case Opcode::IMAGE_GET_LOD: + return IMAGE_GET_LOD(inst); + case Opcode::IMAGE_STORE: + return IMAGE_STORE(inst); + case Opcode::IMAGE_LOAD_MIP: + return IMAGE_LOAD(true, inst); + case Opcode::IMAGE_LOAD: + return IMAGE_LOAD(false, inst); + case Opcode::IMAGE_GET_RESINFO: + return IMAGE_GET_RESINFO(inst); + + case Opcode::TBUFFER_LOAD_FORMAT_X: + return BUFFER_LOAD_FORMAT(1, true, true, inst); + case Opcode::TBUFFER_LOAD_FORMAT_XY: + return BUFFER_LOAD_FORMAT(2, true, true, inst); + case Opcode::TBUFFER_LOAD_FORMAT_XYZ: + return BUFFER_LOAD_FORMAT(3, true, true, inst); + case Opcode::TBUFFER_LOAD_FORMAT_XYZW: + return BUFFER_LOAD_FORMAT(4, true, true, inst); + case Opcode::BUFFER_LOAD_FORMAT_X: + return BUFFER_LOAD_FORMAT(1, false, true, inst); + case Opcode::BUFFER_LOAD_FORMAT_XY: + return BUFFER_LOAD_FORMAT(2, false, true, inst); + case Opcode::BUFFER_LOAD_FORMAT_XYZ: + return BUFFER_LOAD_FORMAT(3, false, true, inst); + case 
Opcode::BUFFER_LOAD_FORMAT_XYZW: + return BUFFER_LOAD_FORMAT(4, false, true, inst); + case Opcode::BUFFER_LOAD_DWORD: + return BUFFER_LOAD_FORMAT(1, false, false, inst); + case Opcode::BUFFER_LOAD_DWORDX2: + return BUFFER_LOAD_FORMAT(2, false, false, inst); + case Opcode::BUFFER_LOAD_DWORDX3: + return BUFFER_LOAD_FORMAT(3, false, false, inst); + case Opcode::BUFFER_LOAD_DWORDX4: + return BUFFER_LOAD_FORMAT(4, false, false, inst); + case Opcode::BUFFER_STORE_FORMAT_X: + case Opcode::BUFFER_STORE_DWORD: + return BUFFER_STORE_FORMAT(1, false, inst); + case Opcode::BUFFER_STORE_DWORDX2: + return BUFFER_STORE_FORMAT(2, false, inst); + case Opcode::BUFFER_STORE_DWORDX3: + return BUFFER_STORE_FORMAT(3, false, inst); + case Opcode::BUFFER_STORE_FORMAT_XYZW: + case Opcode::BUFFER_STORE_DWORDX4: + return BUFFER_STORE_FORMAT(4, false, inst); + default: + LogMissingOpcode(inst); + } +} + void Translator::IMAGE_GET_RESINFO(const GcnInst& inst) { IR::VectorReg dst_reg{inst.dst[0].code}; - const IR::ScalarReg tsharp_reg{inst.src[2].code}; + const IR::ScalarReg tsharp_reg{inst.src[2].code * 4}; const auto flags = ImageResFlags(inst.control.mimg.dmask); const bool has_mips = flags.test(ImageResComponent::MipCount); const IR::U32 lod = ir.GetVectorReg(IR::VectorReg(inst.src[0].code)); @@ -157,7 +244,7 @@ void Translator::IMAGE_GATHER(const GcnInst& inst) { info.has_bias.Assign(flags.test(MimgModifier::LodBias)); info.has_lod_clamp.Assign(flags.test(MimgModifier::LodClamp)); info.force_level0.Assign(flags.test(MimgModifier::Level0)); - info.explicit_lod.Assign(explicit_lod); + // info.explicit_lod.Assign(explicit_lod); info.gather_comp.Assign(std::bit_width(mimg.dmask) - 1); // Issue IR instruction, leaving unknown fields blank to patch later. 
diff --git a/src/shader_recompiler/ir/breadth_first_search.h b/src/shader_recompiler/ir/breadth_first_search.h index 21a34a90..0156303f 100644 --- a/src/shader_recompiler/ir/breadth_first_search.h +++ b/src/shader_recompiler/ir/breadth_first_search.h @@ -12,16 +12,16 @@ namespace Shader::IR { template -auto BreadthFirstSearch(const Value& value, Pred&& pred) - -> std::invoke_result_t { - if (value.IsImmediate()) { - // Nothing to do with immediates - return std::nullopt; +auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t { + // Most often case the instruction is the desired already. + if (const std::optional result = pred(inst)) { + return result; } + // Breadth-first search visiting the right most arguments first boost::container::small_vector visited; std::queue queue; - queue.push(value.InstRecursive()); + queue.push(inst); while (!queue.empty()) { // Pop one instruction from the queue @@ -49,4 +49,14 @@ auto BreadthFirstSearch(const Value& value, Pred&& pred) return std::nullopt; } +template +auto BreadthFirstSearch(const Value& value, Pred&& pred) + -> std::invoke_result_t { + if (value.IsImmediate()) { + // Nothing to do with immediates + return std::nullopt; + } + return BreadthFirstSearch(value.InstRecursive(), pred); +} + } // namespace Shader::IR diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index cd4fdaa2..03404aca 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -2,14 +2,19 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include "shader_recompiler/exception.h" #include "shader_recompiler/ir/ir_emitter.h" #include "shader_recompiler/ir/value.h" namespace Shader::IR { namespace { -[[noreturn]] void ThrowInvalidType(Type type) { - UNREACHABLE_MSG("Invalid type {}", u32(type)); +[[noreturn]] void ThrowInvalidType(Type type, + std::source_location loc = std::source_location::current()) { + const std::string 
functionName = loc.function_name(); + const int lineNumber = loc.line(); + UNREACHABLE_MSG("Invalid type = {}, functionName = {}, line = {}", u32(type), functionName, + lineNumber); } Value MakeLodClampPair(IREmitter& ir, const F32& bias_lod, const F32& lod_clamp) { @@ -273,7 +278,7 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { case 32: return Inst(Opcode::LoadSharedU32, offset); case 64: - return Inst(Opcode::LoadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, offset); case 128: return Inst(Opcode::LoadSharedU128, offset); default: @@ -368,6 +373,10 @@ U32 IREmitter::LaneId() { return Inst(Opcode::LaneId); } +U32 IREmitter::WarpId() { + return Inst(Opcode::WarpId); +} + U32 IREmitter::QuadShuffle(const U32& value, const U32& index) { return Inst(Opcode::QuadShuffle, value, index); } @@ -871,6 +880,10 @@ U1 IREmitter::FPIsInf(const F32F64& value) { } } +U1 IREmitter::FPCmpClass32(const F32& value, const U32& op) { + return Inst(Opcode::FPCmpClass32, value, op); +} + U1 IREmitter::FPOrdered(const F32F64& lhs, const F32F64& rhs) { if (lhs.Type() != rhs.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type()); @@ -964,8 +977,18 @@ IR::Value IREmitter::IMulExt(const U32& a, const U32& b, bool is_signed) { return Inst(is_signed ? 
Opcode::SMulExt : Opcode::UMulExt, a, b); } -U32 IREmitter::IMul(const U32& a, const U32& b) { - return Inst(Opcode::IMul32, a, b); +U32U64 IREmitter::IMul(const U32U64& a, const U32U64& b) { + if (a.Type() != b.Type()) { + UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); + } + switch (a.Type()) { + case Type::U32: + return Inst(Opcode::IMul32, a, b); + case Type::U64: + return Inst(Opcode::IMul64, a, b); + default: + ThrowInvalidType(a.Type()); + } } U32 IREmitter::IDiv(const U32& a, const U32& b, bool is_signed) { @@ -1024,8 +1047,18 @@ U32 IREmitter::BitwiseAnd(const U32& a, const U32& b) { return Inst(Opcode::BitwiseAnd32, a, b); } -U32 IREmitter::BitwiseOr(const U32& a, const U32& b) { - return Inst(Opcode::BitwiseOr32, a, b); +U32U64 IREmitter::BitwiseOr(const U32U64& a, const U32U64& b) { + if (a.Type() != b.Type()) { + UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); + } + switch (a.Type()) { + case Type::U32: + return Inst(Opcode::BitwiseOr32, a, b); + case Type::U64: + return Inst(Opcode::BitwiseOr64, a, b); + default: + ThrowInvalidType(a.Type()); + } } U32 IREmitter::BitwiseXor(const U32& a, const U32& b) { @@ -1063,6 +1096,10 @@ U32 IREmitter::FindUMsb(const U32& value) { return Inst(Opcode::FindUMsb32, value); } +U32 IREmitter::FindILsb(const U32& value) { + return Inst(Opcode::FindILsb32, value); +} + U32 IREmitter::SMin(const U32& a, const U32& b) { return Inst(Opcode::SMin32, a, b); } @@ -1095,8 +1132,18 @@ U32 IREmitter::UClamp(const U32& value, const U32& min, const U32& max) { return Inst(Opcode::UClamp32, value, min, max); } -U1 IREmitter::ILessThan(const U32& lhs, const U32& rhs, bool is_signed) { - return Inst(is_signed ? 
Opcode::SLessThan : Opcode::ULessThan, lhs, rhs); +U1 IREmitter::ILessThan(const U32U64& lhs, const U32U64& rhs, bool is_signed) { + if (lhs.Type() != rhs.Type()) { + UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type()); + } + switch (lhs.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SLessThan32 : Opcode::ULessThan32, lhs, rhs); + case Type::U64: + return Inst(is_signed ? Opcode::SLessThan64 : Opcode::ULessThan64, lhs, rhs); + default: + ThrowInvalidType(lhs.Type()); + } } U1 IREmitter::IEqual(const U32U64& lhs, const U32U64& rhs) { @@ -1155,8 +1202,9 @@ U32U64 IREmitter::ConvertFToS(size_t bitsize, const F32F64& value) { ThrowInvalidType(value.Type()); } default: - UNREACHABLE_MSG("Invalid destination bitsize {}", bitsize); + break; } + throw NotImplementedException("Invalid destination bitsize {}", bitsize); } U32U64 IREmitter::ConvertFToU(size_t bitsize, const F32F64& value) { @@ -1183,13 +1231,17 @@ F32F64 IREmitter::ConvertSToF(size_t dest_bitsize, size_t src_bitsize, const Val switch (src_bitsize) { case 32: return Inst(Opcode::ConvertF32S32, value); + default: + break; } - break; case 64: switch (src_bitsize) { case 32: return Inst(Opcode::ConvertF64S32, value); + default: + break; } + default: break; } UNREACHABLE_MSG("Invalid bit size combination dst={} src={}", dest_bitsize, src_bitsize); @@ -1203,13 +1255,17 @@ F32F64 IREmitter::ConvertUToF(size_t dest_bitsize, size_t src_bitsize, const Val return Inst(Opcode::ConvertF32U16, value); case 32: return Inst(Opcode::ConvertF32U32, value); + default: + break; } - break; case 64: switch (src_bitsize) { case 32: return Inst(Opcode::ConvertF64U32, value); + default: + break; } + default: break; } UNREACHABLE_MSG("Invalid bit size combination dst={} src={}", dest_bitsize, src_bitsize); @@ -1227,7 +1283,16 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) { switch (value.Type()) { case Type::U32: return Inst(Opcode::ConvertU16U32, value); + default: + 
break; } + case 32: + switch (value.Type()) { + case Type::U16: + return Inst(Opcode::ConvertU32U16, value); + } + default: + break; } throw NotImplementedException("Conversion from {} to {} bits", value.Type(), result_bitsize); } @@ -1238,13 +1303,17 @@ F16F32F64 IREmitter::FPConvert(size_t result_bitsize, const F16F32F64& value) { switch (value.Type()) { case Type::F32: return Inst(Opcode::ConvertF16F32, value); + default: + break; } - break; case 32: switch (value.Type()) { case Type::F16: return Inst(Opcode::ConvertF32F16, value); + default: + break; } + default: break; } throw NotImplementedException("Conversion from {} to {} bits", value.Type(), result_bitsize); diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index e7512430..a65e4613 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -95,6 +95,7 @@ public: BufferInstInfo info); [[nodiscard]] U32 LaneId(); + [[nodiscard]] U32 WarpId(); [[nodiscard]] U32 QuadShuffle(const U32& value, const U32& index); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2); @@ -150,6 +151,7 @@ public: [[nodiscard]] U1 FPGreaterThan(const F32F64& lhs, const F32F64& rhs, bool ordered = true); [[nodiscard]] U1 FPIsNan(const F32F64& value); [[nodiscard]] U1 FPIsInf(const F32F64& value); + [[nodiscard]] U1 FPCmpClass32(const F32& value, const U32& op); [[nodiscard]] U1 FPOrdered(const F32F64& lhs, const F32F64& rhs); [[nodiscard]] U1 FPUnordered(const F32F64& lhs, const F32F64& rhs); [[nodiscard]] F32F64 FPMax(const F32F64& lhs, const F32F64& rhs, bool is_legacy = false); @@ -159,7 +161,7 @@ public: [[nodiscard]] Value IAddCary(const U32& a, const U32& b); [[nodiscard]] U32U64 ISub(const U32U64& a, const U32U64& b); [[nodiscard]] Value IMulExt(const U32& a, const U32& b, bool is_signed = false); - [[nodiscard]] U32 IMul(const U32& a, const U32& b); + [[nodiscard]] U32U64 IMul(const U32U64& a, const U32U64& b); [[nodiscard]] 
U32 IDiv(const U32& a, const U32& b, bool is_signed = false); [[nodiscard]] U32U64 INeg(const U32U64& value); [[nodiscard]] U32 IAbs(const U32& value); @@ -167,7 +169,7 @@ public: [[nodiscard]] U32U64 ShiftRightLogical(const U32U64& base, const U32& shift); [[nodiscard]] U32U64 ShiftRightArithmetic(const U32U64& base, const U32& shift); [[nodiscard]] U32 BitwiseAnd(const U32& a, const U32& b); - [[nodiscard]] U32 BitwiseOr(const U32& a, const U32& b); + [[nodiscard]] U32U64 BitwiseOr(const U32U64& a, const U32U64& b); [[nodiscard]] U32 BitwiseXor(const U32& a, const U32& b); [[nodiscard]] U32 BitFieldInsert(const U32& base, const U32& insert, const U32& offset, const U32& count); @@ -179,6 +181,7 @@ public: [[nodiscard]] U32 FindSMsb(const U32& value); [[nodiscard]] U32 FindUMsb(const U32& value); + [[nodiscard]] U32 FindILsb(const U32& value); [[nodiscard]] U32 SMin(const U32& a, const U32& b); [[nodiscard]] U32 UMin(const U32& a, const U32& b); [[nodiscard]] U32 IMin(const U32& a, const U32& b, bool is_signed); @@ -188,7 +191,7 @@ public: [[nodiscard]] U32 SClamp(const U32& value, const U32& min, const U32& max); [[nodiscard]] U32 UClamp(const U32& value, const U32& min, const U32& max); - [[nodiscard]] U1 ILessThan(const U32& lhs, const U32& rhs, bool is_signed); + [[nodiscard]] U1 ILessThan(const U32U64& lhs, const U32U64& rhs, bool is_signed); [[nodiscard]] U1 IEqual(const U32U64& lhs, const U32U64& rhs); [[nodiscard]] U1 ILessThanEqual(const U32& lhs, const U32& rhs, bool is_signed); [[nodiscard]] U1 IGreaterThan(const U32& lhs, const U32& rhs, bool is_signed); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 9aefc8b3..aa2fd3f8 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -219,6 +219,7 @@ OPCODE(FPIsNan32, U1, F32, OPCODE(FPIsNan64, U1, F64, ) OPCODE(FPIsInf32, U1, F32, ) OPCODE(FPIsInf64, U1, F64, ) +OPCODE(FPCmpClass32, U1, F32, U32 ) // Integer operations 
OPCODE(IAdd32, U32, U32, U32, ) @@ -227,6 +228,7 @@ OPCODE(IAddCary32, U32x2, U32, OPCODE(ISub32, U32, U32, U32, ) OPCODE(ISub64, U64, U64, U64, ) OPCODE(IMul32, U32, U32, U32, ) +OPCODE(IMul64, U64, U64, U64, ) OPCODE(SMulExt, U32x2, U32, U32, ) OPCODE(UMulExt, U32x2, U32, U32, ) OPCODE(SDiv32, U32, U32, U32, ) @@ -242,6 +244,7 @@ OPCODE(ShiftRightArithmetic32, U32, U32, OPCODE(ShiftRightArithmetic64, U64, U64, U32, ) OPCODE(BitwiseAnd32, U32, U32, U32, ) OPCODE(BitwiseOr32, U32, U32, U32, ) +OPCODE(BitwiseOr64, U64, U64, U64, ) OPCODE(BitwiseXor32, U32, U32, U32, ) OPCODE(BitFieldInsert, U32, U32, U32, U32, U32, ) OPCODE(BitFieldSExtract, U32, U32, U32, U32, ) @@ -252,14 +255,17 @@ OPCODE(BitwiseNot32, U32, U32, OPCODE(FindSMsb32, U32, U32, ) OPCODE(FindUMsb32, U32, U32, ) +OPCODE(FindILsb32, U32, U32, ) OPCODE(SMin32, U32, U32, U32, ) OPCODE(UMin32, U32, U32, U32, ) OPCODE(SMax32, U32, U32, U32, ) OPCODE(UMax32, U32, U32, U32, ) OPCODE(SClamp32, U32, U32, U32, U32, ) OPCODE(UClamp32, U32, U32, U32, U32, ) -OPCODE(SLessThan, U1, U32, U32, ) -OPCODE(ULessThan, U1, U32, U32, ) +OPCODE(SLessThan32, U1, U32, U32, ) +OPCODE(SLessThan64, U1, U64, U64, ) +OPCODE(ULessThan32, U1, U32, U32, ) +OPCODE(ULessThan64, U1, U64, U64, ) OPCODE(IEqual, U1, U32, U32, ) OPCODE(SLessThanEqual, U1, U32, U32, ) OPCODE(ULessThanEqual, U1, U32, U32, ) @@ -289,6 +295,7 @@ OPCODE(ConvertF64S32, F64, U32, OPCODE(ConvertF64U32, F64, U32, ) OPCODE(ConvertF32U16, F32, U16, ) OPCODE(ConvertU16U32, U16, U32, ) +OPCODE(ConvertU32U16, U32, U16, ) // Image operations OPCODE(ImageSampleImplicitLod, F32x4, Opaque, Opaque, Opaque, Opaque, ) @@ -319,4 +326,5 @@ OPCODE(ImageAtomicExchange32, U32, Opaq // Warp operations OPCODE(LaneId, U32, ) +OPCODE(WarpId, U32, ) OPCODE(QuadShuffle, U32, U32, U32 ) diff --git a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp index 7cd896fb..94218b32 100644 --- 
a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp @@ -21,6 +21,8 @@ template return value.F32(); } else if constexpr (std::is_same_v) { return value.U64(); + } else if constexpr (std::is_same_v) { + return static_cast(value.U64()); } } @@ -236,6 +238,18 @@ void FoldBooleanConvert(IR::Inst& inst) { } } +void FoldCmpClass(IR::Inst& inst) { + ASSERT_MSG(inst.Arg(1).IsImmediate(), "Unable to resolve compare operation"); + const auto class_mask = static_cast(inst.Arg(1).U32()); + if ((class_mask & IR::FloatClassFunc::NaN) == IR::FloatClassFunc::NaN) { + inst.ReplaceOpcode(IR::Opcode::FPIsNan32); + } else if ((class_mask & IR::FloatClassFunc::Infinity) == IR::FloatClassFunc::Infinity) { + inst.ReplaceOpcode(IR::Opcode::FPIsInf32); + } else { + UNREACHABLE(); + } +} + void ConstantPropagation(IR::Block& block, IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::IAdd32: @@ -249,6 +263,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { case IR::Opcode::IMul32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; }); return; + case IR::Opcode::FPCmpClass32: + FoldCmpClass(inst); + return; case IR::Opcode::ShiftRightArithmetic32: FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return static_cast(a >> b); }); return; @@ -281,12 +298,18 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { return FoldLogicalOr(inst); case IR::Opcode::LogicalNot: return FoldLogicalNot(inst); - case IR::Opcode::SLessThan: + case IR::Opcode::SLessThan32: FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a < b; }); return; - case IR::Opcode::ULessThan: + case IR::Opcode::SLessThan64: + FoldWhenAllImmediates(inst, [](s64 a, s64 b) { return a < b; }); + return; + case IR::Opcode::ULessThan32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a < b; }); return; + case IR::Opcode::ULessThan64: + FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a < b; }); + return; 
case IR::Opcode::SLessThanEqual: FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a <= b; }); return; diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 6526ece6..eaca8ce8 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include #include #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/breadth_first_search.h" @@ -273,9 +272,18 @@ std::pair TryDisableAnisoLod0(const IR::Inst* inst) { } SharpLocation TrackSharp(const IR::Inst* inst) { - while (inst->GetOpcode() == IR::Opcode::Phi) { - inst = inst->Arg(0).InstRecursive(); - } + // Search until we find a potential sharp source. + const auto pred0 = [](const IR::Inst* inst) -> std::optional { + if (inst->GetOpcode() == IR::Opcode::GetUserData || + inst->GetOpcode() == IR::Opcode::ReadConst) { + return inst; + } + return std::nullopt; + }; + const auto result = IR::BreadthFirstSearch(inst, pred0); + ASSERT_MSG(result, "Unable to track sharp source"); + inst = result.value(); + // If its from user data not much else to do. 
if (inst->GetOpcode() == IR::Opcode::GetUserData) { return SharpLocation{ .sgpr_base = u32(IR::ScalarReg::Max), @@ -289,14 +297,14 @@ SharpLocation TrackSharp(const IR::Inst* inst) { const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive(); // Retrieve SGPR pair that holds sbase - const auto pred = [](const IR::Inst* inst) -> std::optional { + const auto pred1 = [](const IR::Inst* inst) -> std::optional { if (inst->GetOpcode() == IR::Opcode::GetUserData) { return inst->Arg(0).ScalarReg(); } return std::nullopt; }; - const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred); - const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred); + const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred1); + const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred1); ASSERT_MSG(base0 && base1, "Nested resource loads not supported"); // Return retrieved location. @@ -456,36 +464,26 @@ IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& } void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { - std::deque insts{&inst}; - const auto& pred = [](auto opcode) -> bool { - return (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler) - opcode == IR::Opcode::ReadConst || // IMAGE_LOAD (image only) - opcode == IR::Opcode::GetUserData); + const auto pred = [](const IR::Inst* inst) -> std::optional { + const auto opcode = inst->GetOpcode(); + if (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler) + opcode == IR::Opcode::ReadConst || // IMAGE_LOAD (image only) + opcode == IR::Opcode::GetUserData) { + return inst; + } + return std::nullopt; }; - - IR::Inst* producer{}; - while (!insts.empty() && (producer = insts.front(), !pred(producer->GetOpcode()))) { - for (auto arg_idx = 0u; arg_idx < producer->NumArgs(); ++arg_idx) { - const auto arg = producer->Arg(arg_idx); - if (arg.TryInstRecursive()) { - 
insts.push_back(arg.InstRecursive()); - } - } - insts.pop_front(); - } - ASSERT(pred(producer->GetOpcode())); - auto [tsharp_handle, ssharp_handle] = [&] -> std::pair { - if (producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2) { - return std::make_pair(producer->Arg(0).InstRecursive(), - producer->Arg(1).InstRecursive()); - } - return std::make_pair(producer, nullptr); - }(); + const auto result = IR::BreadthFirstSearch(&inst, pred); + ASSERT_MSG(result, "Unable to find image sharp source"); + const IR::Inst* producer = result.value(); + const bool has_sampler = producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2; + const auto tsharp_handle = has_sampler ? producer->Arg(0).InstRecursive() : producer; // Read image sharp. const auto tsharp = TrackSharp(tsharp_handle); const auto image = info.ReadUd(tsharp.sgpr_base, tsharp.dword_offset); const auto inst_info = inst.Flags(); + ASSERT(image.GetType() != AmdGpu::ImageType::Invalid); u32 image_binding = descriptors.Add(ImageResource{ .sgpr_base = tsharp.sgpr_base, .dword_offset = tsharp.dword_offset, @@ -496,17 +494,32 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip }); // Read sampler sharp. This doesn't exist for IMAGE_LOAD/IMAGE_STORE instructions - if (ssharp_handle) { + const u32 sampler_binding = [&] { + if (!has_sampler) { + return 0U; + } + const IR::Value& handle = producer->Arg(1); + // Inline sampler resource. + if (handle.IsImmediate()) { + LOG_WARNING(Render_Vulkan, "Inline sampler detected"); + return descriptors.Add(SamplerResource{ + .sgpr_base = std::numeric_limits::max(), + .dword_offset = 0, + .inline_sampler = AmdGpu::Sampler{.raw0 = handle.U32()}, + }); + } + // Normal sampler resource. 
+ const auto ssharp_handle = handle.InstRecursive(); const auto& [ssharp_ud, disable_aniso] = TryDisableAnisoLod0(ssharp_handle); const auto ssharp = TrackSharp(ssharp_ud); - const u32 sampler_binding = descriptors.Add(SamplerResource{ + return descriptors.Add(SamplerResource{ .sgpr_base = ssharp.sgpr_base, .dword_offset = ssharp.dword_offset, .associated_image = image_binding, .disable_aniso = disable_aniso, }); - image_binding |= (sampler_binding << 16); - } + }(); + image_binding |= (sampler_binding << 16); // Patch image handle IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; @@ -607,7 +620,7 @@ void ResourceTrackingPass(IR::Program& program) { // Iterate resource instructions and patch them after finding the sharp. auto& info = program.info; Descriptors descriptors{info.buffers, info.images, info.samplers}; - for (IR::Block* const block : program.post_order_blocks) { + for (IR::Block* const block : program.blocks) { for (IR::Inst& inst : block->Instructions()) { if (IsBufferInstruction(inst)) { PatchBufferInstruction(*block, inst, info, descriptors); diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index b51ce94e..7100b384 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -20,11 +20,19 @@ void Visit(Info& info, IR::Inst& inst) { case IR::Opcode::LoadSharedU8: case IR::Opcode::WriteSharedU8: info.uses_shared_u8 = true; + info.uses_shared = true; break; case IR::Opcode::LoadSharedS16: case IR::Opcode::LoadSharedU16: case IR::Opcode::WriteSharedU16: info.uses_shared_u16 = true; + info.uses_shared = true; + break; + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + info.uses_shared = true; break; case IR::Opcode::ConvertF32F16: case IR::Opcode::BitCastF16U16: 
diff --git a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp index 6a43ad6b..80591492 100644 --- a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp +++ b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp @@ -348,13 +348,15 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::GetThreadBitScalarReg: case IR::Opcode::GetScalarRegister: { const IR::ScalarReg reg{inst.Arg(0).ScalarReg()}; - inst.ReplaceUsesWith( - pass.ReadVariable(reg, block, opcode == IR::Opcode::GetThreadBitScalarReg)); + const bool thread_bit = opcode == IR::Opcode::GetThreadBitScalarReg; + const IR::Value value = pass.ReadVariable(reg, block, thread_bit); + inst.ReplaceUsesWith(value); break; } case IR::Opcode::GetVectorRegister: { const IR::VectorReg reg{inst.Arg(0).VectorReg()}; - inst.ReplaceUsesWith(pass.ReadVariable(reg, block)); + const IR::Value value = pass.ReadVariable(reg, block); + inst.ReplaceUsesWith(value); break; } case IR::Opcode::GetGotoVariable: diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h index d9e9b030..e3d04260 100644 --- a/src/shader_recompiler/ir/reg.h +++ b/src/shader_recompiler/ir/reg.h @@ -5,6 +5,7 @@ #include "common/assert.h" #include "common/bit_field.h" +#include "common/enum.h" #include "common/types.h" #include "video_core/amdgpu/pixel_format.h" @@ -24,6 +25,23 @@ enum class FpDenormMode : u32 { InOutAllow = 3, }; +enum class FloatClassFunc : u32 { + SignalingNan = 1 << 0, + QuietNan = 1 << 1, + NegativeInfinity = 1 << 2, + NegativeNormal = 1 << 3, + NegativeDenorm = 1 << 4, + NegativeZero = 1 << 5, + PositiveZero = 1 << 6, + PositiveDenorm = 1 << 7, + PositiveNormal = 1 << 8, + PositiveInfinity = 1 << 9, + + NaN = SignalingNan | QuietNan, + Infinity = PositiveInfinity | NegativeInfinity, +}; +DECLARE_ENUM_FLAG_OPERATORS(FloatClassFunc) + union Mode { BitField<0, 4, FpRoundMode> fp_round; BitField<4, 2, FpDenormMode> fp_denorm_single; diff 
--git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index a43c17f5..db939eaa 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -220,6 +220,7 @@ using F16 = TypedValue; using F32 = TypedValue; using F64 = TypedValue; using U32F32 = TypedValue; +using U64F64 = TypedValue; using U32U64 = TypedValue; using U16U32U64 = TypedValue; using F32F64 = TypedValue; diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index 54b34730..badd5455 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -9,6 +9,7 @@ namespace Shader { struct Profile { u32 supported_spirv{0x00010000}; + u32 subgroup_size{}; bool unified_descriptor_binding{}; bool support_descriptor_aliasing{}; bool support_int8{}; diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index f2834abf..d747c016 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -28,7 +28,8 @@ IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) { } IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool& block_pool, - std::span token, const Info&& info) { + std::span token, const Info&& info, + const Profile& profile) { // Ensure first instruction is expected. 
constexpr u32 token_mov_vcchi = 0xBEEB03FF; ASSERT_MSG(token[0] == token_mov_vcchi, "First instruction is not s_mov_b32 vcc_hi, #imm"); @@ -49,7 +50,7 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool& inst_pool, ObjectPool& inst_pool, ObjectPool& block_pool, - std::span code, const Info&& info); + std::span code, const Info&& info, + const Profile& profile); } // namespace Shader diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 8824e344..277c38b7 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -97,8 +97,11 @@ using ImageResourceList = boost::container::static_vector; struct SamplerResource { u32 sgpr_base; u32 dword_offset; + AmdGpu::Sampler inline_sampler{}; u32 associated_image : 4; u32 disable_aniso : 1; + + constexpr AmdGpu::Sampler GetSsharp(const Info& info) const noexcept; }; using SamplerResourceList = boost::container::static_vector; @@ -175,6 +178,7 @@ struct Info { bool has_image_gather{}; bool has_image_query{}; bool uses_group_quad{}; + bool uses_shared{}; bool uses_shared_u8{}; bool uses_shared_u16{}; bool uses_fp16{}; @@ -196,6 +200,10 @@ constexpr AmdGpu::Buffer BufferResource::GetVsharp(const Info& info) const noexc return inline_cbuf ? inline_cbuf : info.ReadUd(sgpr_base, dword_offset); } +constexpr AmdGpu::Sampler SamplerResource::GetSsharp(const Info& info) const noexcept { + return inline_sampler ? 
inline_sampler : info.ReadUd(sgpr_base, dword_offset); +} + } // namespace Shader template <> diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index ab7ad241..af1963ee 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -5,8 +5,10 @@ #include "common/debug.h" #include "common/polyfill_thread.h" #include "common/thread.h" +#include "core/libraries/videoout/driver.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/amdgpu/pm4_cmds.h" +#include "video_core/renderdoc.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" namespace AmdGpu { @@ -32,12 +34,15 @@ void Liverpool::Process(std::stop_token stoken) { while (!stoken.stop_requested()) { { std::unique_lock lk{submit_mutex}; - Common::CondvarWait(submit_cv, lk, stoken, [this] { return num_submits != 0; }); + Common::CondvarWait(submit_cv, lk, stoken, + [this] { return num_submits != 0 || submit_done; }); } if (stoken.stop_requested()) { break; } + VideoCore::StartCapture(); + int qid = -1; while (num_submits) { @@ -48,11 +53,9 @@ void Liverpool::Process(std::stop_token stoken) { Task::Handle task{}; { std::scoped_lock lock{queue.m_access}; - if (queue.submits.empty()) { continue; } - task = queue.submits.front(); } task.resume(); @@ -64,9 +67,20 @@ void Liverpool::Process(std::stop_token stoken) { queue.submits.pop(); --num_submits; + std::scoped_lock lock2{submit_mutex}; + submit_cv.notify_all(); } } + if (submit_done) { + VideoCore::EndCapture(); + + if (rasterizer) { + rasterizer->Flush(); + } + submit_done = false; + } + Platform::IrqC::Instance()->Signal(Platform::InterruptId::GpuIdle); } } @@ -365,8 +379,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5); const u32 data_size = (header->type3.count.Value() - 2) * 4; + u64* address = write_data->Address(); if (!write_data->wr_one_addr.Value()) { - 
std::memcpy(write_data->Address(), write_data->data, data_size); + std::memcpy(address, write_data->data, data_size); } else { UNREACHABLE(); } @@ -379,10 +394,20 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); + // Optimization: VO label waits are special because the emulator + // will write to the label when presentation is finished. So if + // there are no other submits to yield to we can sleep the thread + // instead and allow other tasks to run. + const u64* wait_addr = wait_reg_mem->Address(); + if (vo_port->IsVoLabel(wait_addr) && num_submits == 1) { + vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); }); + } while (!wait_reg_mem->Test()) { + mapped_queues[GfxQueueId].cs_state = regs.cs_program; TracyFiberLeave; co_yield {}; TracyFiberEnter(dcb_task_name); + regs.cs_program = mapped_queues[GfxQueueId].cs_state; } break; } @@ -483,9 +508,11 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { const auto* wait_reg_mem = reinterpret_cast(header); ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); while (!wait_reg_mem->Test()) { + mapped_queues[vqid].cs_state = regs.cs_program; TracyFiberLeave; co_yield {}; TracyFiberEnter(acb_task_name); + regs.cs_program = mapped_queues[vqid].cs_state; } break; } @@ -506,12 +533,11 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { } void Liverpool::SubmitGfx(std::span dcb, std::span ccb) { - static constexpr u32 GfxQueueId = 0u; auto& queue = mapped_queues[GfxQueueId]; auto task = ProcessGraphics(dcb, ccb); { - std::unique_lock lock{queue.m_access}; + std::scoped_lock lock{queue.m_access}; queue.submits.emplace(task.handle); } @@ -526,7 +552,7 @@ void Liverpool::SubmitAsc(u32 vqid, std::span acb) { const auto& task = ProcessCompute(acb, vqid); { - std::unique_lock lock{queue.m_access}; + std::scoped_lock lock{queue.m_access}; queue.submits.emplace(task.handle); 
} diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index b87c80ed..b0285809 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,10 @@ namespace Vulkan { class Rasterizer; } +namespace Libraries::VideoOut { +struct VideoOutPort; +} + namespace AmdGpu { #define GFX6_3D_REG_INDEX(field_name) (offsetof(AmdGpu::Liverpool::Regs, field_name) / sizeof(u32)) @@ -31,6 +36,7 @@ namespace AmdGpu { [[maybe_unused]] std::array CONCAT2(pad, __LINE__) struct Liverpool { + static constexpr u32 GfxQueueId = 0u; static constexpr u32 NumGfxRings = 1u; // actually 2, but HP is reserved by system software static constexpr u32 NumComputePipes = 7u; // actually 8, but #7 is reserved by system software static constexpr u32 NumQueuesPerPipe = 8u; @@ -372,9 +378,13 @@ struct Liverpool { return 1u << z_info.num_samples; // spec doesn't say it is a log2 } + u32 NumBits() const { + return z_info.format == ZFormat::Z32Float ? 32 : 16; + } + size_t GetDepthSliceSize() const { ASSERT(z_info.format != ZFormat::Invalid); - const auto bpe = z_info.format == ZFormat::Z32Float ? 
4 : 2; + const auto bpe = NumBits() >> 3; // in bytes return (depth_slice.tile_max + 1) * 64 * bpe * NumSamples(); } }; @@ -991,10 +1001,25 @@ public: void SubmitGfx(std::span dcb, std::span ccb); void SubmitAsc(u32 vqid, std::span acb); + void SubmitDone() noexcept { + std::scoped_lock lk{submit_mutex}; + submit_done = true; + submit_cv.notify_one(); + } + + void WaitGpuIdle() noexcept { + std::unique_lock lk{submit_mutex}; + submit_cv.wait(lk, [this] { return num_submits == 0; }); + } + bool IsGpuIdle() const { return num_submits == 0; } + void SetVoPort(Libraries::VideoOut::VideoOutPort* port) { + vo_port = port; + } + void BindRasterizer(Vulkan::Rasterizer* rasterizer_) { rasterizer = rasterizer_; } @@ -1037,6 +1062,7 @@ private: struct GpuQueue { std::mutex m_access{}; std::queue submits{}; + ComputeProgram cs_state{}; }; std::array mapped_queues{}; @@ -1059,8 +1085,10 @@ private: } cblock{}; Vulkan::Rasterizer* rasterizer{}; + Libraries::VideoOut::VideoOutPort* vo_port{}; std::jthread process_thread{}; std::atomic num_submits{}; + std::atomic submit_done{}; std::mutex submit_mutex; std::condition_variable_any submit_cv; }; diff --git a/src/video_core/amdgpu/pixel_format.cpp b/src/video_core/amdgpu/pixel_format.cpp index 6618e72a..6744891a 100644 --- a/src/video_core/amdgpu/pixel_format.cpp +++ b/src/video_core/amdgpu/pixel_format.cpp @@ -7,6 +7,77 @@ namespace AmdGpu { +std::string_view NameOf(DataFormat fmt) { + switch (fmt) { + case DataFormat::FormatInvalid: + return "FormatInvalid"; + case DataFormat::Format8: + return "Format8"; + case DataFormat::Format16: + return "Format16"; + case DataFormat::Format8_8: + return "Format8_8"; + case DataFormat::Format32: + return "Format32"; + case DataFormat::Format16_16: + return "Format16_16"; + case DataFormat::Format10_11_11: + return "Format10_11_11"; + case DataFormat::Format11_11_10: + return "Format11_11_10"; + case DataFormat::Format10_10_10_2: + return "Format10_10_10_2"; + case 
DataFormat::Format2_10_10_10: + return "Format2_10_10_10"; + case DataFormat::Format8_8_8_8: + return "Format8_8_8_8"; + case DataFormat::Format32_32: + return "Format32_32"; + case DataFormat::Format16_16_16_16: + return "Format16_16_16_16"; + case DataFormat::Format32_32_32: + return "Format32_32_32"; + case DataFormat::Format32_32_32_32: + return "Format32_32_32_32"; + case DataFormat::Format5_6_5: + return "Format5_6_5"; + case DataFormat::Format1_5_5_5: + return "Format1_5_5_5"; + case DataFormat::Format5_5_5_1: + return "Format5_5_5_1"; + case DataFormat::Format4_4_4_4: + return "Format4_4_4_4"; + case DataFormat::Format8_24: + return "Format8_24"; + case DataFormat::Format24_8: + return "Format24_8"; + case DataFormat::FormatX24_8_32: + return "FormatX24_8_32"; + case DataFormat::FormatGB_GR: + return "FormatGB_GR"; + case DataFormat::FormatBG_RG: + return "FormatBG_RG"; + case DataFormat::Format5_9_9_9: + return "Format5_9_9_9"; + case DataFormat::FormatBc1: + return "FormatBc1"; + case DataFormat::FormatBc2: + return "FormatBc2"; + case DataFormat::FormatBc3: + return "FormatBc3"; + case DataFormat::FormatBc4: + return "FormatBc4"; + case DataFormat::FormatBc5: + return "FormatBc5"; + case DataFormat::FormatBc6: + return "FormatBc6"; + case DataFormat::FormatBc7: + return "FormatBc7"; + default: + UNREACHABLE(); + } +} + std::string_view NameOf(NumberFormat fmt) { switch (fmt) { case NumberFormat::Unorm: diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index 2a38c5a0..1004ed7d 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -61,6 +61,7 @@ enum class NumberFormat : u32 { Ubscaled = 13, }; +[[nodiscard]] std::string_view NameOf(DataFormat fmt); [[nodiscard]] std::string_view NameOf(NumberFormat fmt); int NumComponents(DataFormat format); @@ -70,6 +71,16 @@ s32 ComponentOffset(DataFormat format, u32 comp); } // namespace AmdGpu +template <> +struct fmt::formatter { + 
constexpr auto parse(format_parse_context& ctx) { + return ctx.begin(); + } + auto format(AmdGpu::DataFormat fmt, format_context& ctx) const { + return fmt::format_to(ctx.out(), "{}", AmdGpu::NameOf(fmt)); + } +}; + template <> struct fmt::formatter { constexpr auto parse(format_parse_context& ctx) { diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index eded2de3..e5f618cc 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -404,8 +404,9 @@ struct PM4CmdWaitRegMem { u32 mask; u32 poll_interval; - u32* Address() const { - return reinterpret_cast((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo); + template + T Address() const { + return reinterpret_cast((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo); } bool Test() const { @@ -464,8 +465,8 @@ struct PM4CmdWriteData { } template - T* Address() const { - return reinterpret_cast(addr64); + T Address() const { + return reinterpret_cast(addr64); } }; @@ -494,8 +495,9 @@ struct PM4CmdEventWriteEos { BitField<16, 16, u32> size; ///< Number of DWs to read from the GDS }; - u32* Address() const { - return reinterpret_cast(address_lo | u64(address_hi) << 32); + template + T Address() const { + return reinterpret_cast(address_lo | u64(address_hi) << 32); } u32 DataDWord() const { @@ -650,6 +652,13 @@ struct PM4CmdReleaseMem { return data_lo | u64(data_hi) << 32; } + uint64_t GetGpuClock64() const { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + auto ticks = std::chrono::duration_cast(duration).count(); + return static_cast(ticks); + } + void SignalFence(Platform::InterruptId irq_id) const { switch (data_sel.Value()) { case DataSelect::Data32Low: { @@ -660,6 +669,10 @@ struct PM4CmdReleaseMem { *Address() = DataQWord(); break; } + case DataSelect::GpuClock64: { + *Address() = GetGpuClock64(); + break; + } case DataSelect::PerfCounter: { *Address() = Common::FencedRDTSC(); break; diff --git 
a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index 6ab3306b..01271792 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -75,7 +75,7 @@ struct Buffer { static_assert(sizeof(Buffer) == 16); // 128bits enum class ImageType : u64 { - Buffer = 0, + Invalid = 0, Color1D = 8, Color2D = 9, Color3D = 10, @@ -88,8 +88,8 @@ enum class ImageType : u64 { constexpr std::string_view NameOf(ImageType type) { switch (type) { - case ImageType::Buffer: - return "Buffer"; + case ImageType::Invalid: + return "Invalid"; case ImageType::Color1D: return "Color1D"; case ImageType::Color2D: @@ -179,6 +179,40 @@ struct Image { return base_address << 8; } + u32 DstSelect() const { + return dst_sel_x | (dst_sel_y << 3) | (dst_sel_z << 6) | (dst_sel_w << 9); + } + + static char SelectComp(u32 sel) { + switch (sel) { + case 0: + return '0'; + case 1: + return '1'; + case 4: + return 'R'; + case 5: + return 'G'; + case 6: + return 'B'; + case 7: + return 'A'; + default: + UNREACHABLE(); + } + } + + std::string DstSelectName() const { + std::string result = "["; + u32 dst_sel = DstSelect(); + for (u32 i = 0; i < 4; i++) { + result += SelectComp(dst_sel & 7); + dst_sel >>= 3; + } + result += ']'; + return result; + } + u32 Pitch() const { return pitch + 1; } @@ -290,6 +324,7 @@ enum class BorderColor : u64 { // Table 8.12 Sampler Resource Definition struct Sampler { union { + u64 raw0; BitField<0, 3, ClampMode> clamp_x; BitField<3, 3, ClampMode> clamp_y; BitField<6, 3, ClampMode> clamp_z; @@ -309,6 +344,7 @@ struct Sampler { BitField<60, 4, u64> perf_z; }; union { + u64 raw1; BitField<0, 14, u64> lod_bias; BitField<14, 6, u64> lod_bias_sec; BitField<20, 2, Filter> xy_mag_filter; @@ -323,6 +359,10 @@ struct Sampler { BitField<62, 2, BorderColor> border_color_type; }; + operator bool() const noexcept { + return raw0 != 0 || raw1 != 0; + } + float LodBias() const noexcept { return static_cast(static_cast((lod_bias.Value() ^ 0x2000u) - 
0x2000u)) / 256.0f; diff --git a/src/video_core/host_shaders/detile_m32x1.comp b/src/video_core/host_shaders/detile_m32x1.comp index f3e84c75..fecea109 100644 --- a/src/video_core/host_shaders/detile_m32x1.comp +++ b/src/video_core/host_shaders/detile_m32x1.comp @@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; layout(std430, binding = 0) buffer input_buf { uint in_data[]; }; -layout(r32ui, binding = 1) uniform writeonly uimage2D output_img; +layout(std430, binding = 1) buffer output_buf { + uint out_data[]; +}; layout(push_constant) uniform image_info { + uint num_levels; uint pitch; + uint sizes[14]; } info; // Inverse morton LUT, small enough to fit into K$ @@ -31,20 +35,22 @@ uint rmort[16] = { #define TEXELS_PER_ELEMENT (1) void main() { + uint tile_base = gl_GlobalInvocationID.x - gl_LocalInvocationID.x; // WG*16 + uint p0 = in_data[gl_GlobalInvocationID.x]; uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs; uint col = bitfieldExtract(packed_pos, 4, 4); uint row = bitfieldExtract(packed_pos, 0, 4); - uint p0 = in_data[gl_GlobalInvocationID.x]; + uint mip = 0; + for (int m = 0; m < info.num_levels; ++m) { + mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 
1 : 0; + } - uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM) + uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1); uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; - - uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col; - uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; - - ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y); - imageStore(output_img, img_pos, uvec4(p0, 0, 0, 0)); -} \ No newline at end of file + uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col; + uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM; + out_data[dw_ofs_x + dw_ofs_y] = p0; +} diff --git a/src/video_core/host_shaders/detile_m32x2.comp b/src/video_core/host_shaders/detile_m32x2.comp index 2853f8b7..c2caa62c 100644 --- a/src/video_core/host_shaders/detile_m32x2.comp +++ b/src/video_core/host_shaders/detile_m32x2.comp @@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; layout(std430, binding = 0) buffer input_buf { uint in_data[]; }; -layout(rg32ui, binding = 1) uniform writeonly uimage2D output_img; +layout(std430, binding = 1) buffer output_buf { + uint out_data[]; +}; layout(push_constant) uniform image_info { + uint num_levels; uint pitch; + uint sizes[14]; } info; // Inverse morton LUT, small enough to fit into K$ @@ -30,19 +34,25 @@ uint rmort[16] = { #define MICRO_TILE_DIM (8) void main() { + uint block_ofs = 2 * gl_GlobalInvocationID.x; + uint p0 = in_data[block_ofs + 0]; + uint p1 = in_data[block_ofs + 1]; + uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs; uint col = bitfieldExtract(packed_pos, 4, 4); uint row = bitfieldExtract(packed_pos, 0, 4); - uint block_ofs = 2 * gl_GlobalInvocationID.x; - uint p0 = in_data[block_ofs + 0]; - uint p1 = in_data[block_ofs + 1]; + uint mip = 0; + for (int m = 0; m < info.num_levels; ++m) 
{ + mip += (gl_GlobalInvocationID.x * 8) >= info.sizes[m] ? 1 : 0; + } - uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4 - ivec2 img_pos = MICRO_TILE_DIM * ivec2( - gl_WorkGroupID.x % tiles_per_pitch, - gl_WorkGroupID.x / tiles_per_pitch - ); - imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, 0, 0)); -} \ No newline at end of file + uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1) * 2; + uint target_tile_x = 2 * gl_WorkGroupID.x % tiles_per_pitch; + uint target_tile_y = 2 * gl_WorkGroupID.x / tiles_per_pitch; + uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col * 2; + uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM; + out_data[dw_ofs_x + dw_ofs_y] = p0; + out_data[dw_ofs_x + dw_ofs_y + 1] = p1; +} diff --git a/src/video_core/host_shaders/detile_m32x4.comp b/src/video_core/host_shaders/detile_m32x4.comp index 64f34e6f..11353870 100644 --- a/src/video_core/host_shaders/detile_m32x4.comp +++ b/src/video_core/host_shaders/detile_m32x4.comp @@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; layout(std430, binding = 0) buffer input_buf { uint in_data[]; }; -layout(rgba32ui, binding = 1) uniform writeonly uimage2D output_img; +layout(std430, binding = 1) buffer output_buf { + uint out_data[]; +}; layout(push_constant) uniform image_info { + uint num_levels; uint pitch; + uint sizes[14]; } info; // Inverse morton LUT, small enough to fit into K$ @@ -30,21 +34,29 @@ uint rmort[16] = { #define MICRO_TILE_DIM (8) void main() { - uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); - uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs; - uint col = bitfieldExtract(packed_pos, 4, 4); - uint row = bitfieldExtract(packed_pos, 0, 4); - uint block_ofs = 4 * gl_GlobalInvocationID.x; uint p0 = in_data[block_ofs + 0]; uint p1 = in_data[block_ofs + 1]; uint p2 = in_data[block_ofs + 2]; uint p3 = in_data[block_ofs + 3]; - uint 
tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4 - ivec2 img_pos = MICRO_TILE_DIM * ivec2( - gl_WorkGroupID.x % tiles_per_pitch, - gl_WorkGroupID.x / tiles_per_pitch - ); - imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, p2, p3)); -} \ No newline at end of file + uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); + uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs; + uint col = bitfieldExtract(packed_pos, 4, 4); + uint row = bitfieldExtract(packed_pos, 0, 4); + + uint mip = 0; + for (int m = 0; m < info.num_levels; ++m) { + mip += (gl_GlobalInvocationID.x * 16) >= info.sizes[m] ? 1 : 0; + } + + uint tiles_per_pitch = max(((info.pitch >> mip) / MICRO_TILE_DIM), 1u) * 4; + uint target_tile_x = 4 * gl_WorkGroupID.x % tiles_per_pitch; + uint target_tile_y = 4 * gl_WorkGroupID.x / tiles_per_pitch; + uint dw_ofs_x = (target_tile_x * MICRO_TILE_DIM) + 4 * col; + uint dw_ofs_y = ((target_tile_y * tiles_per_pitch) * 64u) + ((row * tiles_per_pitch) * MICRO_TILE_DIM); + out_data[dw_ofs_x + dw_ofs_y] = p0; + out_data[dw_ofs_x + dw_ofs_y + 1] = p1; + out_data[dw_ofs_x + dw_ofs_y + 2] = p2; + out_data[dw_ofs_x + dw_ofs_y + 3] = p3; +} diff --git a/src/video_core/host_shaders/detile_m8x1.comp b/src/video_core/host_shaders/detile_m8x1.comp index b4d920e6..5ec48fae 100644 --- a/src/video_core/host_shaders/detile_m8x1.comp +++ b/src/video_core/host_shaders/detile_m8x1.comp @@ -11,10 +11,14 @@ layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in; layout(std430, binding = 0) buffer input_buf { uint in_data[]; }; -layout(r8ui, binding = 1) uniform writeonly uimage2D output_img; +layout(std430, binding = 1) buffer output_buf { + uint out_data[]; +}; layout(push_constant) uniform image_info { + uint num_levels; uint pitch; + uint sizes[14]; } info; #define MICRO_TILE_DIM 8 @@ -32,17 +36,15 @@ void main() { uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT) + TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3); - uint 
tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM) + uint mip = 0; + for (int m = 0; m < info.num_levels; ++m) { + mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0; + } + + uint tiles_per_pitch = max((info.pitch >> mip) / 8, 1); uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; - uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col; - uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; - - ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y); - - #pragma unroll - for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) { - imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff)); - dst_tx >>= 8; - } + uint dw_ofs_x = target_tile_x * 2 + col; // 2 = uints + uint dw_ofs_y = (target_tile_y * MICRO_TILE_DIM + row) * tiles_per_pitch * 2; // 2 = uints + out_data[dw_ofs_x + dw_ofs_y] = dst_tx; } \ No newline at end of file diff --git a/src/video_core/host_shaders/detile_m8x2.comp b/src/video_core/host_shaders/detile_m8x2.comp index 1cebc12b..d27bc6e2 100644 --- a/src/video_core/host_shaders/detile_m8x2.comp +++ b/src/video_core/host_shaders/detile_m8x2.comp @@ -10,10 +10,14 @@ layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in; layout(std430, binding = 0) buffer input_buf { uint in_data[]; }; -layout(rg8ui, binding = 1) uniform writeonly uimage2D output_img; +layout(std430, binding = 1) buffer output_buf { + uint out_data[]; +}; layout(push_constant) uniform image_info { + uint num_levels; uint pitch; + uint sizes[14]; } info; #define MICRO_TILE_DIM 8 @@ -44,18 +48,14 @@ void main() { uint col = bitfieldExtract(packed_pos, 4, 4); uint row = bitfieldExtract(packed_pos, 0, 4); - uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM) + uint mip = 0u; + for (int m = 0; m < info.num_levels; ++m) { + mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 
1 : 0; + } + uint tiles_per_pitch = max(((info.pitch >> mip) / 8u), 1u); uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch; uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch; - uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col; - uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row; - - ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y); - - #pragma unroll - for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) { - uint p0 = (p[ofs] >> 8) & 0xff; - uint p1 = p[ofs] & 0xff; - imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(p1, p0, 0, 0)); - } + uint dw_ofs_x = target_tile_x * 8 + col; + uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * 8; + out_data[(dw_ofs_x + dw_ofs_y) / 2] = src_tx; } diff --git a/src/video_core/renderdoc.cpp b/src/video_core/renderdoc.cpp new file mode 100644 index 00000000..7f88e126 --- /dev/null +++ b/src/video_core/renderdoc.cpp @@ -0,0 +1,120 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/assert.h" +#include "common/config.h" +#include "video_core/renderdoc.h" + +#include + +#ifdef _WIN32 +#include +#else +#include +#endif + +#include + +namespace VideoCore { + +enum class CaptureState { + Idle, + Triggered, + InProgress, +}; +static CaptureState capture_state{CaptureState::Idle}; + +RENDERDOC_API_1_6_0* rdoc_api{}; + +void LoadRenderDoc() { +#ifdef WIN32 + + // Check if we are running by RDoc GUI + HMODULE mod = GetModuleHandleA("renderdoc.dll"); + if (!mod && Config::isRdocEnabled()) { + // If enabled in config, try to load RDoc runtime in offline mode + HKEY h_reg_key; + LONG result = RegOpenKeyExW(HKEY_LOCAL_MACHINE, + L"SOFTWARE\\Classes\\RenderDoc.RDCCapture.1\\DefaultIcon\\", 0, + KEY_READ, &h_reg_key); + if (result != ERROR_SUCCESS) { + return; + } + std::array key_str{}; + DWORD str_sz_out{key_str.size()}; + result = RegQueryValueExW(h_reg_key, L"", 0, NULL, (LPBYTE)key_str.data(), &str_sz_out); + if 
(result != ERROR_SUCCESS) { + return; + } + + std::filesystem::path path{key_str.cbegin(), key_str.cend()}; + path = path.parent_path().append("renderdoc.dll"); + const auto path_to_lib = path.generic_string(); + mod = LoadLibraryA(path_to_lib.c_str()); + } + + if (mod) { + const auto RENDERDOC_GetAPI = + reinterpret_cast(GetProcAddress(mod, "RENDERDOC_GetAPI")); + const s32 ret = RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_6_0, (void**)&rdoc_api); + ASSERT(ret == 1); + } +#else +#ifdef ANDROID + static constexpr const char RENDERDOC_LIB[] = "libVkLayer_GLES_RenderDoc.so"; +#else + static constexpr const char RENDERDOC_LIB[] = "librenderdoc.so"; +#endif + if (void* mod = dlopen(RENDERDOC_LIB, RTLD_NOW | RTLD_NOLOAD)) { + const auto RENDERDOC_GetAPI = + reinterpret_cast(dlsym(mod, "RENDERDOC_GetAPI")); + const s32 ret = RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_6_0, (void**)&rdoc_api); + ASSERT(ret == 1); + } +#endif + if (rdoc_api) { + // Disable default capture keys as they suppose to trigger present-to-present capturing + // and it is not what we want + rdoc_api->SetCaptureKeys(nullptr, 0); + + // Also remove rdoc crash handler + rdoc_api->UnloadCrashHandler(); + } +} + +void StartCapture() { + if (!rdoc_api) { + return; + } + + if (capture_state == CaptureState::Triggered) { + rdoc_api->StartFrameCapture(nullptr, nullptr); + capture_state = CaptureState::InProgress; + } +} + +void EndCapture() { + if (!rdoc_api) { + return; + } + + if (capture_state == CaptureState::InProgress) { + rdoc_api->EndFrameCapture(nullptr, nullptr); + capture_state = CaptureState::Idle; + } +} + +void TriggerCapture() { + if (capture_state == CaptureState::Idle) { + capture_state = CaptureState::Triggered; + } +} + +void SetOutputDir(const std::string& path, const std::string& prefix) { + if (!rdoc_api) { + return; + } + rdoc_api->SetCaptureFilePathTemplate((path + '\\' + prefix).c_str()); +} + +} // namespace VideoCore diff --git a/src/video_core/renderdoc.h b/src/video_core/renderdoc.h 
new file mode 100644 index 00000000..febf6fbc --- /dev/null +++ b/src/video_core/renderdoc.h @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +namespace VideoCore { + +/// Loads renderdoc dynamic library module. +void LoadRenderDoc(); + +/// Begins a capture if a renderdoc instance is attached. +void StartCapture(); + +/// Ends current renderdoc capture. +void EndCapture(); + +/// Triggers capturing process. +void TriggerCapture(); + +/// Sets output directory for captures +void SetOutputDir(const std::string& path, const std::string& prefix); + +} // namespace VideoCore diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index 90d97404..01526265 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -297,6 +297,7 @@ std::span GetAllFormats() { vk::Format::eBc3UnormBlock, vk::Format::eBc4UnormBlock, vk::Format::eBc5UnormBlock, + vk::Format::eBc5SnormBlock, vk::Format::eBc7SrgbBlock, vk::Format::eBc7UnormBlock, vk::Format::eD16Unorm, @@ -308,6 +309,7 @@ std::span GetAllFormats() { vk::Format::eR8G8B8A8Srgb, vk::Format::eR8G8B8A8Uint, vk::Format::eR8G8B8A8Unorm, + vk::Format::eR8G8B8A8Snorm, vk::Format::eR8G8B8A8Uscaled, vk::Format::eR8G8Snorm, vk::Format::eR8G8Uint, @@ -315,6 +317,7 @@ std::span GetAllFormats() { vk::Format::eR8Sint, vk::Format::eR8Uint, vk::Format::eR8Unorm, + vk::Format::eR8Srgb, vk::Format::eR16G16B16A16Sfloat, vk::Format::eR16G16B16A16Sint, vk::Format::eR16G16B16A16Snorm, @@ -335,6 +338,11 @@ std::span GetAllFormats() { vk::Format::eR32Sfloat, vk::Format::eR32Sint, vk::Format::eR32Uint, + vk::Format::eBc6HUfloatBlock, + vk::Format::eR16G16Unorm, + vk::Format::eR16G16B16A16Sscaled, + vk::Format::eR16G16Sscaled, + vk::Format::eE5B9G9R9UfloatPack32, }; return formats; } @@ -384,17 +392,24 @@ vk::Format 
SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu if (data_format == AmdGpu::DataFormat::FormatBc5 && num_format == AmdGpu::NumberFormat::Unorm) { return vk::Format::eBc5UnormBlock; } + if (data_format == AmdGpu::DataFormat::FormatBc5 && num_format == AmdGpu::NumberFormat::Snorm) { + return vk::Format::eBc5SnormBlock; + } if (data_format == AmdGpu::DataFormat::Format16_16_16_16 && num_format == AmdGpu::NumberFormat::Sint) { return vk::Format::eR16G16B16A16Sint; } + if (data_format == AmdGpu::DataFormat::Format16_16_16_16 && + num_format == AmdGpu::NumberFormat::Sscaled) { + return vk::Format::eR16G16B16A16Sscaled; + } if (data_format == AmdGpu::DataFormat::Format16_16 && num_format == AmdGpu::NumberFormat::Float) { return vk::Format::eR16G16Sfloat; } - if (data_format == AmdGpu::DataFormat::Format10_11_11 && - num_format == AmdGpu::NumberFormat::Float) { - return vk::Format::eB10G11R11UfloatPack32; + if (data_format == AmdGpu::DataFormat::Format16_16 && + num_format == AmdGpu::NumberFormat::Unorm) { + return vk::Format::eR16G16Unorm; } if (data_format == AmdGpu::DataFormat::Format2_10_10_10 && num_format == AmdGpu::NumberFormat::Unorm) { @@ -492,6 +507,10 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu num_format == AmdGpu::NumberFormat::Sint) { return vk::Format::eR16G16Sint; } + if (data_format == AmdGpu::DataFormat::Format16_16 && + num_format == AmdGpu::NumberFormat::Sscaled) { + return vk::Format::eR16G16Sscaled; + } if (data_format == AmdGpu::DataFormat::Format8_8_8_8 && num_format == AmdGpu::NumberFormat::Uscaled) { return vk::Format::eR8G8B8A8Uscaled; @@ -514,6 +533,34 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu num_format == AmdGpu::NumberFormat::SnormNz) { return vk::Format::eR16G16B16A16Snorm; } + if (data_format == AmdGpu::DataFormat::Format8_8_8_8 && + num_format == AmdGpu::NumberFormat::Snorm) { + return vk::Format::eR8G8B8A8Snorm; + } + if (data_format == 
AmdGpu::DataFormat::FormatBc6 && num_format == AmdGpu::NumberFormat::Unorm) { + return vk::Format::eBc6HUfloatBlock; + } + if (data_format == AmdGpu::DataFormat::Format8_8_8_8 && + num_format == AmdGpu::NumberFormat::Sint) { + return vk::Format::eR8G8B8A8Sint; + } + if (data_format == AmdGpu::DataFormat::Format8 && num_format == AmdGpu::NumberFormat::Srgb) { + return vk::Format::eR8Srgb; + } + if (data_format == AmdGpu::DataFormat::Format11_11_10 && + num_format == AmdGpu::NumberFormat::Float) { + return vk::Format::eB10G11R11UfloatPack32; + } + if (data_format == AmdGpu::DataFormat::Format16 && num_format == AmdGpu::NumberFormat::Uint) { + return vk::Format::eR16Uint; + } + if (data_format == AmdGpu::DataFormat::Format5_9_9_9 && + num_format == AmdGpu::NumberFormat::Float) { + return vk::Format::eE5B9G9R9UfloatPack32; + } + if (data_format == AmdGpu::DataFormat::Format8 && num_format == AmdGpu::NumberFormat::Snorm) { + return vk::Format::eR8Snorm; + } UNREACHABLE_MSG("Unknown data_format={} and num_format={}", u32(data_format), u32(num_format)); } @@ -645,6 +692,8 @@ vk::SampleCountFlagBits NumSamples(u32 num_samples) { return vk::SampleCountFlagBits::e4; case 8: return vk::SampleCountFlagBits::e8; + case 16: + return vk::SampleCountFlagBits::e16; default: UNREACHABLE(); } diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 098f14d9..6810bf34 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -63,44 +63,30 @@ bool CanBlitToSwapchain(const vk::PhysicalDevice physical_device, vk::Format for }; } -RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* liverpool) - : window{window_}, instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, - scheduler{instance}, swapchain{instance, window}, texture_cache{instance, scheduler} { - rasterizer = std::make_unique(instance, scheduler, 
texture_cache, liverpool); +RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* liverpool_) + : window{window_}, liverpool{liverpool_}, + instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, draw_scheduler{instance}, + present_scheduler{instance}, flip_scheduler{instance}, swapchain{instance, window}, + texture_cache{instance, draw_scheduler} { + rasterizer = std::make_unique(instance, draw_scheduler, texture_cache, liverpool); const u32 num_images = swapchain.GetImageCount(); const vk::Device device = instance.GetDevice(); - const vk::CommandPoolCreateInfo pool_info = { - .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer | - vk::CommandPoolCreateFlagBits::eTransient, - .queueFamilyIndex = instance.GetGraphicsQueueFamilyIndex(), - }; - command_pool = device.createCommandPoolUnique(pool_info); - - const vk::CommandBufferAllocateInfo alloc_info = { - .commandPool = *command_pool, - .level = vk::CommandBufferLevel::ePrimary, - .commandBufferCount = num_images, - }; - - const auto cmdbuffers = device.allocateCommandBuffers(alloc_info); + // Create presentation frames. 
present_frames.resize(num_images); for (u32 i = 0; i < num_images; i++) { Frame& frame = present_frames[i]; - frame.cmdbuf = cmdbuffers[i]; - frame.render_ready = device.createSemaphore({}); frame.present_done = device.createFence({.flags = vk::FenceCreateFlagBits::eSignaled}); free_queue.push(&frame); } } RendererVulkan::~RendererVulkan() { - scheduler.Finish(); + draw_scheduler.Finish(); const vk::Device device = instance.GetDevice(); for (auto& frame : present_frames) { vmaDestroyImage(instance.GetAllocator(), frame.image, frame.allocation); device.destroyImageView(frame.image_view); - device.destroySemaphore(frame.render_ready); device.destroyFence(frame.present_done); } } @@ -184,7 +170,7 @@ bool RendererVulkan::ShowSplash(Frame* frame /*= nullptr*/) { info.pitch = splash->GetImageInfo().width; info.guest_address = VAddr(splash->GetImageData().data()); info.guest_size_bytes = splash->GetImageData().size(); - splash_img.emplace(instance, scheduler, info); + splash_img.emplace(instance, present_scheduler, info); texture_cache.RefreshImage(*splash_img); } frame = PrepareFrameInternal(*splash_img); @@ -193,12 +179,18 @@ bool RendererVulkan::ShowSplash(Frame* frame /*= nullptr*/) { return true; } -Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) { +Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image, bool is_eop) { // Request a free presentation frame. Frame* frame = GetRenderFrame(); - // Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image. - image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead); + // EOP flips are triggered from GPU thread so use the drawing scheduler to record + // commands. Otherwise we are dealing with a CPU flip which could have arrived + // from any guest thread. Use a separate scheduler for that. + auto& scheduler = is_eop ? 
draw_scheduler : flip_scheduler; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + + image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead, cmdbuf); const std::array pre_barrier{ vk::ImageMemoryBarrier{ @@ -218,12 +210,11 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) { }, }, }; - - const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion, {}, {}, pre_barrier); + // Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image. cmdbuf.blitImage( image.image, image.layout, frame->image, vk::ImageLayout::eTransferDstOptimal, MakeImageBlit(image.info.size.width, image.info.size.height, frame->width, frame->height), @@ -245,13 +236,15 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) { .layerCount = VK_REMAINING_ARRAY_LAYERS, }, }; - cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::DependencyFlagBits::eByRegion, {}, {}, post_barrier); - // Flush pending vulkan operations. - scheduler.Flush(frame->render_ready); + // Flush frame creation commands. 
+ frame->ready_semaphore = scheduler.GetMasterSemaphore()->Handle(); + frame->ready_tick = scheduler.CurrentTick(); + SubmitInfo info{}; + scheduler.Flush(info); return frame; } @@ -260,11 +253,8 @@ void RendererVulkan::Present(Frame* frame) { const vk::Image swapchain_image = swapchain.Image(); - const vk::CommandBufferBeginInfo begin_info = { - .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, - }; - const vk::CommandBuffer cmdbuf = frame->cmdbuf; - cmdbuf.begin(begin_info); + auto& scheduler = present_scheduler; + const auto cmdbuf = scheduler.CommandBuffer(); { auto* profiler_ctx = instance.GetProfilerContext(); TracyVkNamedZoneC(profiler_ctx, renderer_gpu_zone, cmdbuf, "Host frame", @@ -339,35 +329,17 @@ void RendererVulkan::Present(Frame* frame) { TracyVkCollect(profiler_ctx, cmdbuf); } } - cmdbuf.end(); - static constexpr std::array wait_stage_masks = { - vk::PipelineStageFlagBits::eColorAttachmentOutput, - vk::PipelineStageFlagBits::eAllGraphics, - }; - - const vk::Semaphore present_ready = swapchain.GetPresentReadySemaphore(); - const vk::Semaphore image_acquired = swapchain.GetImageAcquiredSemaphore(); - const std::array wait_semaphores = {image_acquired, frame->render_ready}; - - vk::SubmitInfo submit_info = { - .waitSemaphoreCount = static_cast(wait_semaphores.size()), - .pWaitSemaphores = wait_semaphores.data(), - .pWaitDstStageMask = wait_stage_masks.data(), - .commandBufferCount = 1u, - .pCommandBuffers = &cmdbuf, - .signalSemaphoreCount = 1, - .pSignalSemaphores = &present_ready, - }; - - std::scoped_lock submit_lock{scheduler.submit_mutex}; - try { - instance.GetGraphicsQueue().submit(submit_info, frame->present_done); - } catch (vk::DeviceLostError& err) { - LOG_CRITICAL(Render_Vulkan, "Device lost during present submit: {}", err.what()); - UNREACHABLE(); - } + // Flush vulkan commands. 
+ SubmitInfo info{}; + info.AddWait(swapchain.GetImageAcquiredSemaphore()); + info.AddWait(frame->ready_semaphore, frame->ready_tick); + info.AddSignal(swapchain.GetPresentReadySemaphore()); + info.AddSignal(frame->present_done); + scheduler.Flush(info); + // Present to swapchain. + std::scoped_lock submit_lock{Scheduler::submit_mutex}; swapchain.Present(); // Free the frame for reuse diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 701d3d14..3fe9267f 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -26,9 +26,15 @@ struct Frame { VmaAllocation allocation; vk::Image image; vk::ImageView image_view; - vk::Semaphore render_ready; vk::Fence present_done; - vk::CommandBuffer cmdbuf; + vk::Semaphore ready_semaphore; + u64 ready_tick; +}; + +enum SchedulerType { + Draw, + Present, + CpuFlip, }; class Rasterizer; @@ -39,16 +45,16 @@ public: ~RendererVulkan(); Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute, - VAddr cpu_address) { + VAddr cpu_address, bool is_eop) { const auto info = VideoCore::ImageInfo{attribute, cpu_address}; const auto image_id = texture_cache.FindImage(info, cpu_address); auto& image = texture_cache.GetImage(image_id); - return PrepareFrameInternal(image); + return PrepareFrameInternal(image, is_eop); } Frame* PrepareBlankFrame() { auto& image = texture_cache.GetImage(VideoCore::NULL_IMAGE_ID); - return PrepareFrameInternal(image); + return PrepareFrameInternal(image, true); } VideoCore::Image& RegisterVideoOutSurface( @@ -60,9 +66,9 @@ public: } bool IsVideoOutSurface(const AmdGpu::Liverpool::ColorBuffer& color_buffer) { - return std::find_if(vo_buffers_addr.cbegin(), vo_buffers_addr.cend(), [&](VAddr vo_buffer) { + return std::ranges::find_if(vo_buffers_addr, [&](VAddr vo_buffer) { return vo_buffer == color_buffer.Address(); - }) != vo_buffers_addr.cend(); + }) != 
vo_buffers_addr.end(); } bool ShowSplash(Frame* frame = nullptr); @@ -70,13 +76,16 @@ public: void RecreateFrame(Frame* frame, u32 width, u32 height); private: - Frame* PrepareFrameInternal(VideoCore::Image& image); + Frame* PrepareFrameInternal(VideoCore::Image& image, bool is_eop = true); Frame* GetRenderFrame(); private: Frontend::WindowSDL& window; + AmdGpu::Liverpool* liverpool; Instance instance; - Scheduler scheduler; + Scheduler draw_scheduler; + Scheduler present_scheduler; + Scheduler flip_scheduler; Swapchain swapchain; std::unique_ptr rasterizer; VideoCore::TextureCache texture_cache; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 954adf44..34f1e9cc 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -128,7 +128,9 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s for (const auto& image_desc : info.images) { const auto tsharp = info.ReadUd(image_desc.sgpr_base, image_desc.dword_offset); - const auto& image_view = texture_cache.FindTexture(tsharp, image_desc.is_storage); + VideoCore::ImageInfo image_info{tsharp}; + VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage}; + const auto& image_view = texture_cache.FindTexture(image_info, view_info); const auto& image = texture_cache.GetImage(image_view.image_id); image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, image.layout); set_writes.push_back({ @@ -146,7 +148,7 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s } } for (const auto& sampler : info.samplers) { - const auto ssharp = info.ReadUd(sampler.sgpr_base, sampler.dword_offset); + const auto ssharp = sampler.GetSsharp(info); const auto vk_sampler = texture_cache.GetSampler(ssharp); image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); set_writes.push_back({ diff --git 
a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index f119bc77..7b00a911 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -366,7 +366,9 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& for (const auto& image_desc : stage.images) { const auto& tsharp = tsharps.emplace_back( stage.ReadUd(image_desc.sgpr_base, image_desc.dword_offset)); - const auto& image_view = texture_cache.FindTexture(tsharp, image_desc.is_storage); + VideoCore::ImageInfo image_info{tsharp}; + VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage}; + const auto& image_view = texture_cache.FindTexture(image_info, view_info); const auto& image = texture_cache.GetImage(image_view.image_id); image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, image.layout); set_writes.push_back({ @@ -384,7 +386,7 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& } } for (const auto& sampler : stage.samplers) { - auto ssharp = stage.ReadUd(sampler.sgpr_base, sampler.dword_offset); + auto ssharp = sampler.GetSsharp(stage); if (sampler.disable_aniso) { const auto& tsharp = tsharps[sampler.associated_image]; if (tsharp.base_level == 0 && tsharp.last_level == 0) { diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 09a9180e..735303a3 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -164,10 +164,11 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceVulkan13Features, vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR, vk::PhysicalDeviceDepthClipControlFeaturesEXT>(); - const vk::StructureChain properties_chain = - physical_device.getProperties2(); + const vk::StructureChain properties_chain = physical_device.getProperties2< + 
vk::PhysicalDeviceProperties2, vk::PhysicalDevicePortabilitySubsetPropertiesKHR, + vk::PhysicalDeviceExternalMemoryHostPropertiesEXT, vk::PhysicalDeviceVulkan11Properties>(); + subgroup_size = properties_chain.get().subgroupSize; + LOG_INFO(Render_Vulkan, "Physical device subgroup size {}", subgroup_size); features = feature_chain.get().features; if (available_extensions.empty()) { @@ -261,6 +262,7 @@ bool Instance::CreateDevice() { .shaderStorageImageExtendedFormats = features.shaderStorageImageExtendedFormats, .shaderStorageImageMultisample = features.shaderStorageImageMultisample, .shaderClipDistance = features.shaderClipDistance, + .shaderInt64 = features.shaderInt64, .shaderInt16 = features.shaderInt16, }, }, diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 32965ddb..a8c0dcf4 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -188,6 +188,11 @@ public: return properties.limits.nonCoherentAtomSize; } + /// Returns the subgroup size of the selected physical device. 
+ u32 SubgroupSize() const { + return subgroup_size; + } + /// Returns the maximum supported elements in a texel buffer u32 MaxTexelBufferElements() const { return properties.limits.maxTexelBufferElements; @@ -249,6 +254,7 @@ private: bool workgroup_memory_explicit_layout{}; bool color_write_en{}; u64 min_imported_host_pointer_alignment{}; + u32 subgroup_size{}; bool tooling_info{}; bool debug_utils_supported{}; bool has_nsight_graphics{}; diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index 037510d4..753f2bbd 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -2,8 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include "common/assert.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" @@ -60,46 +58,4 @@ void MasterSemaphore::Wait(u64 tick) { Refresh(); } -void MasterSemaphore::SubmitWork(vk::CommandBuffer cmdbuf, vk::Semaphore wait, vk::Semaphore signal, - u64 signal_value) { - cmdbuf.end(); - - const u32 num_signal_semaphores = signal ? 2U : 1U; - const std::array signal_values{signal_value, u64(0)}; - const std::array signal_semaphores{Handle(), signal}; - - const u32 num_wait_semaphores = wait ? 
2U : 1U; - const std::array wait_values{signal_value - 1, u64(1)}; - const std::array wait_semaphores{Handle(), wait}; - - static constexpr std::array wait_stage_masks = { - vk::PipelineStageFlagBits::eAllCommands, - vk::PipelineStageFlagBits::eColorAttachmentOutput, - }; - - const vk::TimelineSemaphoreSubmitInfo timeline_si = { - .waitSemaphoreValueCount = num_wait_semaphores, - .pWaitSemaphoreValues = wait_values.data(), - .signalSemaphoreValueCount = num_signal_semaphores, - .pSignalSemaphoreValues = signal_values.data(), - }; - - const vk::SubmitInfo submit_info = { - .pNext = &timeline_si, - .waitSemaphoreCount = num_wait_semaphores, - .pWaitSemaphores = wait_semaphores.data(), - .pWaitDstStageMask = wait_stage_masks.data(), - .commandBufferCount = 1u, - .pCommandBuffers = &cmdbuf, - .signalSemaphoreCount = num_signal_semaphores, - .pSignalSemaphores = signal_semaphores.data(), - }; - - try { - instance.GetGraphicsQueue().submit(submit_info); - } catch (vk::DeviceLostError& err) { - UNREACHABLE_MSG("Device lost during submit: {}", err.what()); - } -} - } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 963676b1..ebc7a60a 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -46,10 +46,6 @@ public: /// Waits for a tick to be hit on the GPU void Wait(u64 tick); - /// Submits the provided command buffer for execution - void SubmitWork(vk::CommandBuffer cmdbuf, vk::Semaphore wait, vk::Semaphore signal, - u64 signal_value); - protected: const Instance& instance; vk::UniqueSemaphore semaphore; ///< Timeline semaphore. 
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 7f0b74ab..8d27d252 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -109,6 +109,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, pipeline_cache = instance.GetDevice().createPipelineCacheUnique({}); profile = Shader::Profile{ .supported_spirv = 0x00010600U, + .subgroup_size = instance.SubgroupSize(), .support_explicit_workgroup_layout = true, }; } @@ -191,7 +192,7 @@ void PipelineCache::RefreshGraphicsKey() { LiverpoolToVK::SurfaceFormat(col_buf.info.format, col_buf.NumFormat()); const auto is_vo_surface = renderer->IsVideoOutSurface(col_buf); key.color_formats[remapped_cb] = LiverpoolToVK::AdjustColorBufferFormat( - base_format, col_buf.info.comp_swap.Value(), is_vo_surface); + base_format, col_buf.info.comp_swap.Value(), false /*is_vo_surface*/); key.blend_controls[remapped_cb] = regs.blend_control[cb]; key.blend_controls[remapped_cb].enable.Assign(key.blend_controls[remapped_cb].enable && !col_buf.info.blend_bypass); @@ -268,7 +269,8 @@ std::unique_ptr PipelineCache::CreateGraphicsPipeline() { Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs); info.pgm_base = pgm->Address(); info.pgm_hash = hash; - programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); + programs[i] = + Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile); // Compile IR to SPIR-V auto spv_code = Shader::Backend::SPIRV::EmitSPIRV(profile, programs[i], binding); @@ -308,7 +310,8 @@ std::unique_ptr PipelineCache::CreateComputePipeline() { Shader::Info info = MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs); info.pgm_base = cs_pgm.Address(); - auto program = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); + auto program = + 
Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile); // Compile IR to SPIR-V u32 binding{}; diff --git a/src/video_core/renderer_vulkan/vk_platform.cpp b/src/video_core/renderer_vulkan/vk_platform.cpp index 1499d877..0915514b 100644 --- a/src/video_core/renderer_vulkan/vk_platform.cpp +++ b/src/video_core/renderer_vulkan/vk_platform.cpp @@ -32,6 +32,7 @@ static VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback( switch (static_cast(callback_data->messageIdNumber)) { case 0x609a13b: // Vertex attribute at location not consumed by shader case 0xc81ad50e: + case 0x92d66fc1: // `pMultisampleState is NULL` for depth only passes (confirmed VL error) return VK_FALSE; default: break; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index fe52d074..ff5e97d5 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -23,7 +23,7 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, : instance{instance_}, scheduler{scheduler_}, texture_cache{texture_cache_}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool}, - vertex_index_buffer{instance, scheduler, VertexIndexFlags, 1_GB, BufferType::Upload} { + vertex_index_buffer{instance, scheduler, VertexIndexFlags, 2_GB, BufferType::Upload} { if (!Config::nullGpu()) { liverpool->BindRasterizer(this); } @@ -96,6 +96,13 @@ void Rasterizer::DispatchDirect() { cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z); } +u64 Rasterizer::Flush() { + const u64 current_tick = scheduler.CurrentTick(); + SubmitInfo info{}; + scheduler.Flush(info); + return current_tick; +} + void Rasterizer::BeginRendering() { const auto& regs = liverpool->regs; RenderState state; @@ -113,12 +120,15 @@ void Rasterizer::BeginRendering() { } const auto& hint = liverpool->last_cb_extent[col_buf_id]; - const auto& image_view = 
texture_cache.FindRenderTarget(col_buf, hint); + VideoCore::ImageInfo image_info{col_buf, hint}; + VideoCore::ImageViewInfo view_info{col_buf, false /*!!image.info.usage.vo_buffer*/}; + const auto& image_view = texture_cache.FindRenderTarget(image_info, view_info); const auto& image = texture_cache.GetImage(image_view.image_id); state.width = std::min(state.width, image.info.size.width); state.height = std::min(state.height, image.info.size.height); const bool is_clear = texture_cache.IsMetaCleared(col_buf.CmaskAddress()); + state.color_images[state.num_color_attachments] = image.image; state.color_attachments[state.num_color_attachments++] = { .imageView = *image_view.image_view, .imageLayout = vk::ImageLayout::eGeneral, @@ -136,12 +146,14 @@ void Rasterizer::BeginRendering() { const bool is_clear = regs.depth_render_control.depth_clear_enable || texture_cache.IsMetaCleared(htile_address); const auto& hint = liverpool->last_db_extent; - const auto& image_view = texture_cache.FindDepthTarget( - regs.depth_buffer, regs.depth_view.NumSlices(), htile_address, hint, - regs.depth_control.depth_write_enable); + VideoCore::ImageInfo image_info{regs.depth_buffer, regs.depth_view.NumSlices(), + htile_address, hint}; + VideoCore::ImageViewInfo view_info{regs.depth_buffer, regs.depth_view, regs.depth_control}; + const auto& image_view = texture_cache.FindDepthTarget(image_info, view_info); const auto& image = texture_cache.GetImage(image_view.image_id); state.width = std::min(state.width, image.info.size.width); state.height = std::min(state.height, image.info.size.height); + state.depth_image = image.image; state.depth_attachment = { .imageView = *image_view.image_view, .imageLayout = image.layout, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index aead5955..64dc87ef 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -36,6 +36,8 @@ public: void 
ScopeMarkerBegin(const std::string& str); void ScopeMarkerEnd(); + u64 Flush(); + private: u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset); void MapMemory(VAddr addr, size_t size); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 39dc2847..fb64285f 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -2,12 +2,15 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include "common/assert.h" #include "common/debug.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" namespace Vulkan { +std::mutex Scheduler::submit_mutex; + Scheduler::Scheduler(const Instance& instance) : instance{instance}, master_semaphore{instance}, command_pool{instance, &master_semaphore} { profiler_scope = reinterpret_cast(std::malloc(sizeof(tracy::VkCtxScope))); @@ -47,25 +50,52 @@ void Scheduler::EndRendering() { return; } is_rendering = false; + boost::container::static_vector barriers; + for (size_t i = 0; i < render_state.num_color_attachments; ++i) { + barriers.push_back(vk::ImageMemoryBarrier{ + .srcAccessMask = vk::AccessFlagBits::eColorAttachmentWrite, + .dstAccessMask = vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = render_state.color_images[i], + .subresourceRange = + { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }); + } current_cmdbuf.endRendering(); + if (!barriers.empty()) { + current_cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eColorAttachmentOutput, + vk::PipelineStageFlagBits::eFragmentShader, + 
vk::DependencyFlagBits::eByRegion, {}, {}, barriers); + } } -void Scheduler::Flush(vk::Semaphore signal, vk::Semaphore wait) { - // When flushing, we only send data to the worker thread; no waiting is necessary. - SubmitExecution(signal, wait); +void Scheduler::Flush(SubmitInfo& info) { + // When flushing, we only send data to the driver; no waiting is necessary. + SubmitExecution(info); } -void Scheduler::Finish(vk::Semaphore signal, vk::Semaphore wait) { +void Scheduler::Finish() { // When finishing, we need to wait for the submission to have executed on the device. const u64 presubmit_tick = CurrentTick(); - SubmitExecution(signal, wait); + SubmitInfo info{}; + SubmitExecution(info); Wait(presubmit_tick); } void Scheduler::Wait(u64 tick) { if (tick >= master_semaphore.CurrentTick()) { // Make sure we are not waiting for the current tick without signalling - Flush(); + SubmitInfo info{}; + Flush(info); } master_semaphore.Wait(tick); } @@ -86,7 +116,7 @@ void Scheduler::AllocateWorkerCommandBuffers() { } } -void Scheduler::SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wait_semaphore) { +void Scheduler::SubmitExecution(SubmitInfo& info) { std::scoped_lock lk{submit_mutex}; const u64 signal_value = master_semaphore.NextTick(); @@ -97,7 +127,40 @@ void Scheduler::SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wa } EndRendering(); - master_semaphore.SubmitWork(current_cmdbuf, wait_semaphore, signal_semaphore, signal_value); + current_cmdbuf.end(); + + const vk::Semaphore timeline = master_semaphore.Handle(); + info.AddSignal(timeline, signal_value); + + static constexpr std::array wait_stage_masks = { + vk::PipelineStageFlagBits::eAllCommands, + vk::PipelineStageFlagBits::eColorAttachmentOutput, + }; + + const vk::TimelineSemaphoreSubmitInfo timeline_si = { + .waitSemaphoreValueCount = static_cast(info.wait_ticks.size()), + .pWaitSemaphoreValues = info.wait_ticks.data(), + .signalSemaphoreValueCount = 
static_cast(info.signal_ticks.size()), + .pSignalSemaphoreValues = info.signal_ticks.data(), + }; + + const vk::SubmitInfo submit_info = { + .pNext = &timeline_si, + .waitSemaphoreCount = static_cast(info.wait_semas.size()), + .pWaitSemaphores = info.wait_semas.data(), + .pWaitDstStageMask = wait_stage_masks.data(), + .commandBufferCount = 1U, + .pCommandBuffers = ¤t_cmdbuf, + .signalSemaphoreCount = static_cast(info.signal_semas.size()), + .pSignalSemaphores = info.signal_semas.data(), + }; + + try { + instance.GetGraphicsQueue().submit(submit_info, info.fence); + } catch (vk::DeviceLostError& err) { + UNREACHABLE_MSG("Device lost during submit: {}", err.what()); + } + master_semaphore.Refresh(); AllocateWorkerCommandBuffers(); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index b4504274..48c3af7a 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -15,7 +15,9 @@ class Instance; struct RenderState { std::array color_attachments{}; + std::array color_images{}; vk::RenderingAttachmentInfo depth_attachment{}; + vk::Image depth_image{}; u32 num_color_attachments{}; u32 num_depth_attachments{}; u32 width = std::numeric_limits::max(); @@ -26,16 +28,39 @@ struct RenderState { } }; +struct SubmitInfo { + boost::container::static_vector wait_semas; + boost::container::static_vector wait_ticks; + boost::container::static_vector signal_semas; + boost::container::static_vector signal_ticks; + vk::Fence fence; + + void AddWait(vk::Semaphore semaphore, u64 tick = 1) { + wait_semas.emplace_back(semaphore); + wait_ticks.emplace_back(tick); + } + + void AddSignal(vk::Semaphore semaphore, u64 tick = 1) { + signal_semas.emplace_back(semaphore); + signal_ticks.emplace_back(tick); + } + + void AddSignal(vk::Fence fence) { + this->fence = fence; + } +}; + class Scheduler { public: explicit Scheduler(const Instance& instance); ~Scheduler(); - /// Sends the current 
execution context to the GPU. - void Flush(vk::Semaphore signal = nullptr, vk::Semaphore wait = nullptr); + /// Sends the current execution context to the GPU + /// and increments the scheduler timeline semaphore. + void Flush(SubmitInfo& info); /// Sends the current execution context to the GPU and waits for it to complete. - void Finish(vk::Semaphore signal = nullptr, vk::Semaphore wait = nullptr); + void Finish(); /// Waits for the given tick to trigger on the GPU. void Wait(u64 tick); @@ -76,12 +101,12 @@ public: pending_ops.emplace(func, CurrentTick()); } - std::mutex submit_mutex; + static std::mutex submit_mutex; private: void AllocateWorkerCommandBuffers(); - void SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wait_semaphore); + void SubmitExecution(SubmitInfo& info); private: const Instance& instance; diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index 7fffdeb2..20c99e30 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -55,7 +55,7 @@ void Swapchain::Create(u32 width_, u32 height_, vk::SurfaceKHR surface_) { .pQueueFamilyIndices = queue_family_indices.data(), .preTransform = transform, .compositeAlpha = composite_alpha, - .presentMode = vk::PresentModeKHR::eFifo, + .presentMode = vk::PresentModeKHR::eMailbox, .clipped = true, .oldSwapchain = nullptr, }; diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index b4b3f48a..f7aef847 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -117,18 +117,15 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, image{instance->GetDevice(), instance->GetAllocator()}, cpu_addr{info.guest_address}, cpu_addr_end{cpu_addr + info.guest_size_bytes} { ASSERT(info.pixel_format != vk::Format::eUndefined); + // Here we force `eExtendedUsage` as don't know all image 
usage cases beforehand. In normal case + // the texture cache should re-create the resource with the usage requested vk::ImageCreateFlags flags{vk::ImageCreateFlagBits::eMutableFormat | vk::ImageCreateFlagBits::eExtendedUsage}; - if (info.type == vk::ImageType::e2D && info.resources.layers >= 6 && - info.size.width == info.size.height) { + if (info.props.is_cube) { flags |= vk::ImageCreateFlagBits::eCubeCompatible; - } - if (info.type == vk::ImageType::e3D) { + } else if (info.props.is_volume) { flags |= vk::ImageCreateFlagBits::e2DArrayCompatible; } - if (info.IsBlockCoded()) { - flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible; - } usage = ImageUsageFlags(info); @@ -157,15 +154,6 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, }; image.Create(image_ci); - - // Create a special view for detiler - if (info.is_tiled) { - ImageViewInfo view_info; - view_info.format = DemoteImageFormatForDetiling(info.pixel_format); - view_for_detiler.emplace(*instance, view_info, *this, ImageId{}); - } - - Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eNone); } void Image::Transit(vk::ImageLayout dst_layout, vk::Flags dst_mask, diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index 97ceaa09..b18f1002 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -105,7 +105,6 @@ struct Image { VAddr cpu_addr_end = 0; std::vector image_view_infos; std::vector image_view_ids; - std::optional view_for_detiler; // Resource state tracking vk::ImageUsageFlags usage; diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 41ad0938..e01a61ae 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -47,33 +47,33 @@ static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept { // clang-format off // The table of macro tiles parameters for given tiling 
index (row) and bpp (column) static constexpr std::array macro_tile_extents{ - std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, // 00 - std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, // 01 - std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 02 - std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 03 - std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 04 - std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 05 - std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, // 06 - std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 07 - std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 08 - std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 09 - std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 0A - std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 0B - std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 0C - std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 0D - std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 0E - std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 0F - std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 10 - std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 11 - std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, // 12 - std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 13 - std::pair{128u, 
64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 14 - std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 15 - std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 16 - std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 17 - std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 18 - std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 19 - std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 1A + std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, std::pair{256u, 128u}, // 00 + std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, // 01 + std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 02 + std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 03 + std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 04 + std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 05 + std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, std::pair{128u, 128u}, // 06 + std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 07 + std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 08 + std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 09 + std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 0A + std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, 
std::pair{128u, 64u}, std::pair{64u, 64u}, // 0B + std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 0C + std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 0D + std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 0E + std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 0F + std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 10 + std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, // 11 + std::pair{256u, 256u}, std::pair{256u, 128u}, std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{128u, 64u}, // 12 + std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, std::pair{0u, 0u}, // 13 + std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 14 + std::pair{128u, 64u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 15 + std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 16 + std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 17 + std::pair{128u, 128u}, std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{128u, 64u}, // 18 + std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 19 + std::pair{128u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, std::pair{64u, 64u}, // 1A }; // clang-format on @@ -82,62 +82,65 @@ static constexpr auto hw_pipe_interleave = 256u; static constexpr std::pair GetMacroTileExtents(u32 tiling_idx, u32 bpp, u32 num_samples) { ASSERT(num_samples == 1); - const 
auto row = tiling_idx * 4; - const auto column = std::bit_width(bpp) - 4; // bpps are 8, 16, 32, 64 + const auto row = tiling_idx * 5; + const auto column = std::bit_width(bpp) - 4; // bpps are 8, 16, 32, 64, 128 return macro_tile_extents[row + column]; } -static constexpr size_t ImageSizeLinearAligned(u32 pitch, u32 height, u32 bpp, u32 num_samples) { +static constexpr std::pair ImageSizeLinearAligned(u32 pitch, u32 height, u32 bpp, + u32 num_samples) { const auto pitch_align = std::max(8u, 64u / ((bpp + 7) / 8)); auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = height; - size_t log_sz = 1; - const auto slice_align = std::max(64u, hw_pipe_interleave / (bpp + 7) / 8); + size_t log_sz = pitch_aligned * height_aligned * num_samples; + const auto slice_align = std::max(64u, 256u / ((bpp + 7) / 8)); while (log_sz % slice_align) { - log_sz = pitch_aligned * height_aligned * num_samples; pitch_aligned += pitch_align; + log_sz = pitch_aligned * height_aligned * num_samples; } - return (log_sz * bpp + 7) / 8; + return {pitch_aligned, (log_sz * bpp + 7) / 8}; } -static constexpr size_t ImageSizeMicroTiled(u32 pitch, u32 height, u32 bpp, u32 num_samples) { +static constexpr std::pair ImageSizeMicroTiled(u32 pitch, u32 height, u32 bpp, + u32 num_samples) { const auto& [pitch_align, height_align] = micro_tile_extent; auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = (height + height_align - 1) & ~(height_align - 1); - size_t log_sz = 1; + size_t log_sz = (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8; while (log_sz % 256) { - log_sz = (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8; pitch_aligned += 8; + log_sz = (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8; } - return log_sz; + return {pitch_aligned, log_sz}; } -static constexpr size_t ImageSizeMacroTiled(u32 pitch, u32 height, u32 bpp, u32 num_samples, - u32 tiling_idx) { +static constexpr 
std::pair ImageSizeMacroTiled(u32 pitch, u32 height, u32 bpp, + u32 num_samples, u32 tiling_idx) { const auto& [pitch_align, height_align] = GetMacroTileExtents(tiling_idx, bpp, num_samples); ASSERT(pitch_align != 0 && height_align != 0); const auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = (height + height_align - 1) & ~(height_align - 1); - return (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8; + const auto log_sz = pitch_aligned * height_aligned * num_samples; + return {pitch_aligned, (log_sz * bpp + 7) / 8}; } ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group, VAddr cpu_address) noexcept { const auto& attrib = group.attrib; - is_tiled = attrib.tiling_mode == TilingMode::Tile; - tiling_mode = - is_tiled ? AmdGpu::TilingMode::Display_MacroTiled : AmdGpu::TilingMode::Display_Linear; + props.is_tiled = attrib.tiling_mode == TilingMode::Tile; + tiling_mode = props.is_tiled ? AmdGpu::TilingMode::Display_MacroTiled + : AmdGpu::TilingMode::Display_Linear; pixel_format = ConvertPixelFormat(attrib.pixel_format); type = vk::ImageType::e2D; size.width = attrib.width; size.height = attrib.height; pitch = attrib.tiling_mode == TilingMode::Linear ? size.width : (size.width + 127) & (~127); usage.vo_buffer = true; - const bool is_32bpp = attrib.pixel_format != VideoOutFormat::A16R16G16B16Float; - ASSERT(is_32bpp); + num_bits = attrib.pixel_format != VideoOutFormat::A16R16G16B16Float ? 
32 : 64; + ASSERT(num_bits == 32); guest_address = cpu_address; - if (!is_tiled) { + if (!props.is_tiled) { guest_size_bytes = pitch * size.height * 4; } else { if (Config::isNeoMode()) { @@ -146,15 +149,16 @@ ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group, guest_size_bytes = pitch * ((size.height + 63) & (~63)) * 4; } } - mips_layout.emplace_back(0, guest_size_bytes); + mips_layout.emplace_back(guest_size_bytes, pitch, 0); } ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, const AmdGpu::Liverpool::CbDbExtent& hint /*= {}*/) noexcept { - is_tiled = buffer.IsTiled(); + props.is_tiled = buffer.IsTiled(); tiling_mode = buffer.GetTilingMode(); pixel_format = LiverpoolToVK::SurfaceFormat(buffer.info.format, buffer.NumFormat()); num_samples = 1 << buffer.attrib.num_fragments_log2; + num_bits = NumBits(buffer.info.format); type = vk::ImageType::e2D; size.width = hint.Valid() ? hint.width : buffer.Pitch(); size.height = hint.Valid() ? hint.height : buffer.Height(); @@ -168,15 +172,16 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, guest_address = buffer.Address(); const auto color_slice_sz = buffer.GetColorSliceSize(); guest_size_bytes = color_slice_sz * buffer.NumSlices(); - mips_layout.emplace_back(0, color_slice_sz); + mips_layout.emplace_back(color_slice_sz, pitch, 0); } ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slices, VAddr htile_address, const AmdGpu::Liverpool::CbDbExtent& hint) noexcept { - is_tiled = false; + props.is_tiled = false; pixel_format = LiverpoolToVK::DepthFormat(buffer.z_info.format, buffer.stencil_info.format); type = vk::ImageType::e2D; num_samples = 1 << buffer.z_info.num_samples; // spec doesn't say it is a log2 + num_bits = buffer.NumBits(); size.width = hint.Valid() ? hint.width : buffer.Pitch(); size.height = hint.Valid() ? 
hint.height : buffer.Height(); size.depth = 1; @@ -188,37 +193,38 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slice guest_address = buffer.Address(); const auto depth_slice_sz = buffer.GetDepthSliceSize(); guest_size_bytes = depth_slice_sz * num_slices; - mips_layout.emplace_back(0, depth_slice_sz); + mips_layout.emplace_back(depth_slice_sz, pitch, 0); } ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { - is_tiled = image.IsTiled(); tiling_mode = image.GetTilingMode(); pixel_format = LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); type = ConvertImageType(image.GetType()); - is_cube = image.GetType() == AmdGpu::ImageType::Cube; - is_volume = image.GetType() == AmdGpu::ImageType::Color3D; + props.is_tiled = image.IsTiled(); + props.is_cube = image.GetType() == AmdGpu::ImageType::Cube; + props.is_volume = image.GetType() == AmdGpu::ImageType::Color3D; + props.is_pow2 = image.pow2pad; + props.is_block = IsBlockCoded(); size.width = image.width + 1; size.height = image.height + 1; - size.depth = is_volume ? image.depth + 1 : 1; + size.depth = props.is_volume ? 
image.depth + 1 : 1; pitch = image.Pitch(); resources.levels = image.NumLevels(); resources.layers = image.NumLayers(); + num_bits = NumBits(image.GetDataFmt()); usage.texture = true; guest_address = image.Address(); mips_layout.reserve(resources.levels); - const auto num_bits = NumBits(image.GetDataFmt()); - const auto is_block = IsBlockCoded(); - const auto is_pow2 = image.pow2pad; + MipInfo mip_info{}; guest_size_bytes = 0; for (auto mip = 0u; mip < resources.levels; ++mip) { auto bpp = num_bits; auto mip_w = pitch >> mip; auto mip_h = size.height >> mip; - if (is_block) { + if (props.is_block) { mip_w = (mip_w + 3) / 4; mip_h = (mip_h + 3) / 4; bpp *= 16; @@ -227,40 +233,48 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { mip_h = std::max(mip_h, 1u); auto mip_d = std::max(size.depth >> mip, 1u); - if (is_pow2) { + if (props.is_pow2) { mip_w = std::bit_ceil(mip_w); mip_h = std::bit_ceil(mip_h); mip_d = std::bit_ceil(mip_d); } - size_t mip_size = 0; switch (tiling_mode) { case AmdGpu::TilingMode::Display_Linear: { - ASSERT(!is_cube); - mip_size = ImageSizeLinearAligned(mip_w, mip_h, bpp, num_samples); + ASSERT(!props.is_cube); + std::tie(mip_info.pitch, mip_info.size) = + ImageSizeLinearAligned(mip_w, mip_h, bpp, num_samples); + mip_info.height = mip_h; break; } case AmdGpu::TilingMode::Texture_MicroTiled: { - mip_size = ImageSizeMicroTiled(mip_w, mip_h, bpp, num_samples); + std::tie(mip_info.pitch, mip_info.size) = + ImageSizeMicroTiled(mip_w, mip_h, bpp, num_samples); + mip_info.height = std::max(mip_h, 8u); + if (props.is_block) { + mip_info.pitch = std::max(mip_info.pitch * 4, 32u); + mip_info.height = std::max(mip_info.height * 4, 32u); + } break; } case AmdGpu::TilingMode::Display_MacroTiled: case AmdGpu::TilingMode::Texture_MacroTiled: case AmdGpu::TilingMode::Depth_MacroTiled: { - ASSERT(!is_cube && !is_block); + ASSERT(!props.is_cube && !props.is_block); ASSERT(num_samples == 1); - ASSERT(num_bits <= 64); - mip_size = 
ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index); + std::tie(mip_info.pitch, mip_info.size) = + ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index); break; } default: { UNREACHABLE(); } } - mip_size *= mip_d; + mip_info.size *= mip_d; - mips_layout.emplace_back(guest_size_bytes, mip_size); - guest_size_bytes += mip_size; + mip_info.offset = guest_size_bytes; + mips_layout.emplace_back(mip_info); + guest_size_bytes += mip_info.size; } guest_size_bytes *= resources.layers; } diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index b98410b9..9dad0dd6 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -9,6 +9,8 @@ #include "video_core/amdgpu/liverpool.h" #include "video_core/texture_cache/types.h" +#include + namespace VideoCore { struct ImageInfo { @@ -42,18 +44,29 @@ struct ImageInfo { u32 vo_buffer : 1; } usage{}; // Usage data tracked during image lifetime - bool is_cube = false; - bool is_volume = false; - bool is_tiled = false; - bool is_read_only = false; + struct { + u32 is_cube : 1; + u32 is_volume : 1; + u32 is_tiled : 1; + u32 is_pow2 : 1; + u32 is_block : 1; + } props{}; // Surface properties with impact on various calculation factors + vk::Format pixel_format = vk::Format::eUndefined; vk::ImageType type = vk::ImageType::e1D; SubresourceExtent resources; Extent3D size{1, 1, 1}; + u32 num_bits{}; u32 num_samples = 1; u32 pitch = 0; AmdGpu::TilingMode tiling_mode{AmdGpu::TilingMode::Display_Linear}; - std::vector> mips_layout; + struct MipInfo { + u32 size; + u32 pitch; + u32 height; + u32 offset; + }; + boost::container::small_vector mips_layout; VAddr guest_address{0}; u32 guest_size_bytes{0}; }; diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 8f972253..04bedaff 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ 
b/src/video_core/texture_cache/image_view.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/logging/log.h" #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/texture_cache/image.h" @@ -46,19 +47,43 @@ vk::ComponentSwizzle ConvertComponentSwizzle(u32 dst_sel) { } } +bool IsIdentityMapping(u32 dst_sel, u32 num_components) { + return (num_components == 1 && dst_sel == 0b100) || + (num_components == 2 && dst_sel == 0b101'100) || + (num_components == 3 && dst_sel == 0b110'101'100) || + (num_components == 4 && dst_sel == 0b111'110'101'100); +} + +vk::Format TrySwizzleFormat(vk::Format format, u32 dst_sel) { + if (format == vk::Format::eR8G8B8A8Unorm && dst_sel == 0b111100101110) { + return vk::Format::eB8G8R8A8Unorm; + } + return format; +} + ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept : is_storage{is_storage} { type = ConvertImageViewType(image.GetType()); format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); - range.base.level = static_cast(image.base_level); - range.base.layer = static_cast(image.base_array); - range.extent.levels = image.NumLevels(); - range.extent.layers = image.NumLayers(); - if (!is_storage) { - mapping.r = ConvertComponentSwizzle(image.dst_sel_x); - mapping.g = ConvertComponentSwizzle(image.dst_sel_y); - mapping.b = ConvertComponentSwizzle(image.dst_sel_z); - mapping.a = ConvertComponentSwizzle(image.dst_sel_w); + range.base.level = image.base_level; + range.base.layer = image.base_array; + range.extent.levels = image.last_level + 1; + range.extent.layers = image.last_array + 1; + mapping.r = ConvertComponentSwizzle(image.dst_sel_x); + mapping.g = ConvertComponentSwizzle(image.dst_sel_y); + mapping.b = ConvertComponentSwizzle(image.dst_sel_z); + mapping.a = ConvertComponentSwizzle(image.dst_sel_w); 
+ // Check for unfortunate case of storage images being swizzled + const u32 num_comps = AmdGpu::NumComponents(image.GetDataFmt()); + const u32 dst_sel = image.DstSelect(); + if (is_storage && !IsIdentityMapping(dst_sel, num_comps)) { + mapping = vk::ComponentMapping{}; + if (auto new_format = TrySwizzleFormat(format, dst_sel); new_format != format) { + format = new_format; + return; + } + LOG_ERROR(Render_Vulkan, "Storage image (num_comps = {}) requires swizzling {}", num_comps, + image.DstSelectName()); } } @@ -70,6 +95,16 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer, base_format, col_buffer.info.comp_swap.Value(), is_vo_surface); } +ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::DepthBuffer& depth_buffer, + AmdGpu::Liverpool::DepthView view, + AmdGpu::Liverpool::DepthControl ctl) { + format = Vulkan::LiverpoolToVK::DepthFormat(depth_buffer.z_info.format, + depth_buffer.stencil_info.format); + is_storage = ctl.depth_write_enable; + range.base.layer = view.slice_start; + range.extent.layers = view.NumSlices(); +} + ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info_, Image& image, ImageId image_id_, std::optional usage_override /*= {}*/) : info{info_}, image_id{image_id_} { @@ -93,10 +128,10 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info .components = instance.GetSupportedComponentSwizzle(format, info.mapping), .subresourceRange{ .aspectMask = aspect, - .baseMipLevel = 0U, - .levelCount = 1, + .baseMipLevel = info.range.base.level, + .levelCount = info.range.extent.levels - info.range.base.level, .baseArrayLayer = info_.range.base.layer, - .layerCount = image.info.IsBlockCoded() ? 
1 : VK_REMAINING_ARRAY_LAYERS, + .layerCount = info.range.extent.layers - info.range.base.layer, }, }; image_view = instance.GetDevice().createImageViewUnique(image_view_ci); diff --git a/src/video_core/texture_cache/image_view.h b/src/video_core/texture_cache/image_view.h index b43f65de..fbc62db3 100644 --- a/src/video_core/texture_cache/image_view.h +++ b/src/video_core/texture_cache/image_view.h @@ -18,10 +18,11 @@ class Scheduler; namespace VideoCore { struct ImageViewInfo { - explicit ImageViewInfo() = default; - explicit ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept; - explicit ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer, - bool is_vo_surface) noexcept; + ImageViewInfo() = default; + ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept; + ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer, bool is_vo_surface) noexcept; + ImageViewInfo(const AmdGpu::Liverpool::DepthBuffer& depth_buffer, + AmdGpu::Liverpool::DepthView view, AmdGpu::Liverpool::DepthControl ctl); vk::ImageViewType type = vk::ImageViewType::e2D; vk::Format format = vk::Format::eR8G8B8A8Unorm; @@ -34,6 +35,8 @@ struct ImageViewInfo { struct Image; +constexpr Common::SlotId NULL_IMAGE_VIEW_ID{0}; + struct ImageView { explicit ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info, Image& image, ImageId image_id, std::optional usage_override = {}); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 55bb99cc..7b8a5554 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -142,18 +142,16 @@ ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) { image_ids.push_back(image_id); }); - ASSERT_MSG(image_ids.size() <= 1, "Overlapping images not allowed!"); + // ASSERT_MSG(image_ids.size() <= 1, "Overlapping images not allowed!"); ImageId image_id{}; if (image_ids.empty()) { 
image_id = slot_images.insert(instance, scheduler, info); RegisterImage(image_id); } else { - image_id = image_ids[0]; + image_id = image_ids[image_ids.size() > 1 ? 1 : 0]; } - RegisterMeta(info, image_id); - Image& image = slot_images[image_id]; if (True(image.flags & ImageFlagBits::CpuModified) && refresh_on_create) { RefreshImage(image); @@ -184,14 +182,18 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo return slot_image_views[view_id]; } -ImageView& TextureCache::FindTexture(const AmdGpu::Image& desc, bool is_storage) { - const ImageInfo info{desc}; +ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) { + if (info.guest_address == 0) [[unlikely]] { + return slot_image_views[NULL_IMAGE_VIEW_ID]; + } + const ImageId image_id = FindImage(info); Image& image = slot_images[image_id]; auto& usage = image.info.usage; - if (is_storage) { - image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderWrite); + if (view_info.is_storage) { + image.Transit(vk::ImageLayout::eGeneral, + vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite); usage.storage = true; } else { const auto new_layout = image.info.IsDepthStencil() @@ -201,14 +203,36 @@ ImageView& TextureCache::FindTexture(const AmdGpu::Image& desc, bool is_storage) usage.texture = true; } - const ImageViewInfo view_info{desc, is_storage}; - return RegisterImageView(image_id, view_info); + // These changes are temporary and should be removed once texture cache will handle subresources + // merging + auto view_info_tmp = view_info; + if (view_info_tmp.range.base.level > image.info.resources.levels - 1 || + view_info_tmp.range.base.layer > image.info.resources.layers - 1 || + view_info_tmp.range.extent.levels > image.info.resources.levels || + view_info_tmp.range.extent.layers > image.info.resources.layers) { + + LOG_DEBUG(Render_Vulkan, + "Subresource range ({}~{},{}~{}) exceeds base image extents ({},{})", + 
view_info_tmp.range.base.level, view_info_tmp.range.extent.levels, + view_info_tmp.range.base.layer, view_info_tmp.range.extent.layers, + image.info.resources.levels, image.info.resources.layers); + + view_info_tmp.range.base.level = + std::min(view_info_tmp.range.base.level, image.info.resources.levels - 1); + view_info_tmp.range.base.layer = + std::min(view_info_tmp.range.base.layer, image.info.resources.layers - 1); + view_info_tmp.range.extent.levels = + std::min(view_info_tmp.range.extent.levels, image.info.resources.levels); + view_info_tmp.range.extent.layers = + std::min(view_info_tmp.range.extent.layers, image.info.resources.layers); + } + + return RegisterImageView(image_id, view_info_tmp); } -ImageView& TextureCache::FindRenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer, - const AmdGpu::Liverpool::CbDbExtent& hint) { - const ImageInfo info{buffer, hint}; - const ImageId image_id = FindImage(info); +ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info, + const ImageViewInfo& view_info) { + const ImageId image_id = FindImage(image_info); Image& image = slot_images[image_id]; image.flags &= ~ImageFlagBits::CpuModified; @@ -216,30 +240,56 @@ ImageView& TextureCache::FindRenderTarget(const AmdGpu::Liverpool::ColorBuffer& vk::AccessFlagBits::eColorAttachmentWrite | vk::AccessFlagBits::eColorAttachmentRead); + // Register meta data for this color buffer + if (!(image.flags & ImageFlagBits::MetaRegistered)) { + if (image_info.meta_info.cmask_addr) { + surface_metas.emplace( + image_info.meta_info.cmask_addr, + MetaDataInfo{.type = MetaDataInfo::Type::CMask, .is_cleared = true}); + image.info.meta_info.cmask_addr = image_info.meta_info.cmask_addr; + image.flags |= ImageFlagBits::MetaRegistered; + } + + if (image_info.meta_info.fmask_addr) { + surface_metas.emplace( + image_info.meta_info.fmask_addr, + MetaDataInfo{.type = MetaDataInfo::Type::FMask, .is_cleared = true}); + image.info.meta_info.fmask_addr = image_info.meta_info.fmask_addr; 
+ image.flags |= ImageFlagBits::MetaRegistered; + } + } + + // Update tracked image usage image.info.usage.render_target = true; - ImageViewInfo view_info{buffer, !!image.info.usage.vo_buffer}; return RegisterImageView(image_id, view_info); } -ImageView& TextureCache::FindDepthTarget(const AmdGpu::Liverpool::DepthBuffer& buffer, - u32 num_slices, VAddr htile_address, - const AmdGpu::Liverpool::CbDbExtent& hint, - bool write_enabled) { - const ImageInfo info{buffer, num_slices, htile_address, hint}; - const ImageId image_id = FindImage(info, false); +ImageView& TextureCache::FindDepthTarget(const ImageInfo& image_info, + const ImageViewInfo& view_info) { + const ImageId image_id = FindImage(image_info, false); Image& image = slot_images[image_id]; image.flags &= ~ImageFlagBits::CpuModified; - const auto new_layout = write_enabled ? vk::ImageLayout::eDepthStencilAttachmentOptimal - : vk::ImageLayout::eDepthStencilReadOnlyOptimal; + const auto new_layout = view_info.is_storage ? vk::ImageLayout::eDepthStencilAttachmentOptimal + : vk::ImageLayout::eDepthStencilReadOnlyOptimal; image.Transit(new_layout, vk::AccessFlagBits::eDepthStencilAttachmentWrite | vk::AccessFlagBits::eDepthStencilAttachmentRead); + // Register meta data for this depth buffer + if (!(image.flags & ImageFlagBits::MetaRegistered)) { + if (image_info.meta_info.htile_addr) { + surface_metas.emplace( + image_info.meta_info.htile_addr, + MetaDataInfo{.type = MetaDataInfo::Type::HTile, .is_cleared = true}); + image.info.meta_info.htile_addr = image_info.meta_info.htile_addr; + image.flags |= ImageFlagBits::MetaRegistered; + } + } + + // Update tracked image usage image.info.usage.depth_target = true; - ImageViewInfo view_info; - view_info.format = info.pixel_format; return RegisterImageView(image_id, view_info); } @@ -247,64 +297,56 @@ void TextureCache::RefreshImage(Image& image) { // Mark image as validated. 
image.flags &= ~ImageFlagBits::CpuModified; - { - if (!tile_manager.TryDetile(image)) { - // Upload data to the staging buffer. - const auto offset = staging.Copy(image.cpu_addr, image.info.guest_size_bytes, 4); - // Copy to the image. - image.Upload(staging.Handle(), offset); - } + scheduler.EndRendering(); - image.Transit(vk::ImageLayout::eGeneral, - vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); - return; + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); + + vk::Buffer buffer{staging.Handle()}; + u32 offset{0}; + + auto upload_buffer = tile_manager.TryDetile(image); + if (upload_buffer) { + buffer = *upload_buffer; + } else { + // Upload data to the staging buffer. + const auto [data, offset_, _] = staging.Map(image.info.guest_size_bytes, 16); + std::memcpy(data, (void*)image.info.guest_address, image.info.guest_size_bytes); + staging.Commit(image.info.guest_size_bytes); + offset = offset_; } - ASSERT(image.info.resources.levels == image.info.mips_layout.size()); - const u8* image_data = reinterpret_cast(image.cpu_addr); - for (u32 m = 0; m < image.info.resources.levels; m++) { + const auto& num_layers = image.info.resources.layers; + const auto& num_mips = image.info.resources.levels; + ASSERT(num_mips == image.info.mips_layout.size()); + + boost::container::small_vector image_copy{}; + for (u32 m = 0; m < num_mips; m++) { const u32 width = std::max(image.info.size.width >> m, 1u); const u32 height = std::max(image.info.size.height >> m, 1u); - const u32 depth = image.info.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u; - const u32 map_size = image.info.mips_layout[m].second * image.info.resources.layers; + const u32 depth = + image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u; + const auto& [_, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; - // Upload data to the staging buffer. 
- const auto [data, offset, _] = staging.Map(map_size, 16); - if (image.info.is_tiled) { - ConvertTileToLinear(data, image_data, width, height, Config::isNeoMode()); - } else { - std::memcpy(data, - image_data + image.info.mips_layout[m].first * image.info.resources.layers, - map_size); - } - staging.Commit(map_size); - - // Copy to the image. - const vk::BufferImageCopy image_copy = { - .bufferOffset = offset, - .bufferRowLength = 0, - .bufferImageHeight = 0, + image_copy.push_back({ + .bufferOffset = offset + mip_ofs * num_layers, + .bufferRowLength = static_cast(mip_pitch), + .bufferImageHeight = static_cast(mip_height), .imageSubresource{ .aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = m, .baseArrayLayer = 0, - .layerCount = u32(image.info.resources.layers), + .layerCount = num_layers, }, .imageOffset = {0, 0, 0}, .imageExtent = {width, height, depth}, - }; - - scheduler.EndRendering(); - - const auto cmdbuf = scheduler.CommandBuffer(); - image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); - - cmdbuf.copyBufferToImage(staging.Handle(), image.image, - vk::ImageLayout::eTransferDstOptimal, image_copy); - - image.Transit(vk::ImageLayout::eGeneral, - vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); + }); } + + cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy); + + image.Transit(vk::ImageLayout::eGeneral, + vk::AccessFlagBits::eMemoryWrite | vk::AccessFlagBits::eMemoryRead); } vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) { @@ -320,47 +362,8 @@ void TextureCache::RegisterImage(ImageId image_id) { image.flags |= ImageFlagBits::Registered; ForEachPage(image.cpu_addr, image.info.guest_size_bytes, [this, image_id](u64 page) { page_table[page].push_back(image_id); }); -} -void TextureCache::RegisterMeta(const ImageInfo& info, ImageId image_id) { - Image& image = slot_images[image_id]; - - if (image.flags & 
ImageFlagBits::MetaRegistered) { - return; - } - - bool registered = true; - // Current resource tracking implementation allows us to detect usage of meta only in the last - // moment, so we likely will miss its first clear. To avoid this and make first frame, where - // the meta is encountered, looks correct we set its state to "cleared" at registrations time. - if (info.usage.render_target) { - if (info.meta_info.cmask_addr) { - surface_metas.emplace( - info.meta_info.cmask_addr, - MetaDataInfo{.type = MetaDataInfo::Type::CMask, .is_cleared = true}); - image.info.meta_info.cmask_addr = info.meta_info.cmask_addr; - } - - if (info.meta_info.fmask_addr) { - surface_metas.emplace( - info.meta_info.fmask_addr, - MetaDataInfo{.type = MetaDataInfo::Type::FMask, .is_cleared = true}); - image.info.meta_info.fmask_addr = info.meta_info.fmask_addr; - } - } else if (info.usage.depth_target) { - if (info.meta_info.htile_addr) { - surface_metas.emplace( - info.meta_info.htile_addr, - MetaDataInfo{.type = MetaDataInfo::Type::HTile, .is_cleared = true}); - image.info.meta_info.htile_addr = info.meta_info.htile_addr; - } - } else { - registered = false; - } - - if (registered) { - image.flags |= ImageFlagBits::MetaRegistered; - } + image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eNone); } void TextureCache::UnregisterImage(ImageId image_id) { diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 8a618983..aef33bcf 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -51,17 +51,16 @@ public: [[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true); /// Retrieves an image view with the properties of the specified image descriptor. 
- [[nodiscard]] ImageView& FindTexture(const AmdGpu::Image& image, bool is_storage); + [[nodiscard]] ImageView& FindTexture(const ImageInfo& image_info, + const ImageViewInfo& view_info); /// Retrieves the render target with specified properties - [[nodiscard]] ImageView& FindRenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer, - const AmdGpu::Liverpool::CbDbExtent& hint); + [[nodiscard]] ImageView& FindRenderTarget(const ImageInfo& image_info, + const ImageViewInfo& view_info); /// Retrieves the depth target with specified properties - [[nodiscard]] ImageView& FindDepthTarget(const AmdGpu::Liverpool::DepthBuffer& buffer, - u32 num_slices, VAddr htile_address, - const AmdGpu::Liverpool::CbDbExtent& hint, - bool write_enabled); + [[nodiscard]] ImageView& FindDepthTarget(const ImageInfo& image_info, + const ImageViewInfo& view_info); /// Reuploads image contents. void RefreshImage(Image& image); @@ -158,9 +157,6 @@ private: /// Register image in the page table void RegisterImage(ImageId image); - /// Register metadata surfaces attached to the image - void RegisterMeta(const ImageInfo& info, ImageId image); - /// Unregister image from the page table void UnregisterImage(ImageId image); diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index e097ba3e..4f199f81 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace VideoCore { @@ -176,6 +177,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) { return vk::Format::eR8Uint; case vk::Format::eR8G8Unorm: case vk::Format::eR16Sfloat: + case vk::Format::eR16Unorm: return vk::Format::eR8G8Uint; case vk::Format::eR8G8B8A8Srgb: case vk::Format::eB8G8R8A8Srgb: @@ -183,10 +185,13 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) { case vk::Format::eR8G8B8A8Unorm: case vk::Format::eR32Sfloat: case vk::Format::eR32Uint: + case 
vk::Format::eR16G16Sfloat: return vk::Format::eR32Uint; case vk::Format::eBc1RgbaUnormBlock: case vk::Format::eBc4UnormBlock: case vk::Format::eR32G32Sfloat: + case vk::Format::eR32G32Uint: + case vk::Format::eR16G16B16A16Unorm: return vk::Format::eR32G32Uint; case vk::Format::eBc2SrgbBlock: case vk::Format::eBc2UnormBlock: @@ -225,14 +230,14 @@ const DetilerContext* TileManager::GetDetiler(const Image& image) const { return nullptr; } -static constexpr vk::BufferUsageFlags StagingFlags = vk::BufferUsageFlagBits::eTransferDst | - vk::BufferUsageFlagBits::eUniformBuffer | - vk::BufferUsageFlagBits::eStorageBuffer; +struct DetilerParams { + u32 num_levels; + u32 pitch0; + u32 sizes[14]; +}; TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler) - : instance{instance}, scheduler{scheduler}, - staging{instance, scheduler, StagingFlags, 128_MB, Vulkan::BufferType::Upload} { - + : instance{instance}, scheduler{scheduler} { static const std::array detiler_shaders{ HostShaders::DETILE_M8X1_COMP, HostShaders::DETILE_M8X2_COMP, HostShaders::DETILE_M32X1_COMP, HostShaders::DETILE_M32X2_COMP, @@ -264,7 +269,7 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc }, { .binding = 1, - .descriptorType = vk::DescriptorType::eStorageImage, + .descriptorType = vk::DescriptorType::eStorageBuffer, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute, }, @@ -281,7 +286,7 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc const vk::PushConstantRange push_constants = { .stageFlags = vk::ShaderStageFlagBits::eCompute, .offset = 0, - .size = sizeof(u32), + .size = sizeof(DetilerParams), }; const vk::DescriptorSetLayout set_layout = *desc_layout; @@ -312,35 +317,93 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc TileManager::~TileManager() = default; -bool TileManager::TryDetile(Image& image) { - if (!image.info.is_tiled) { - return false; 
+TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /*= false*/) { + const auto usage = vk::BufferUsageFlagBits::eStorageBuffer | + (is_storage ? vk::BufferUsageFlagBits::eTransferSrc + : vk::BufferUsageFlagBits::eTransferDst); + const vk::BufferCreateInfo buffer_ci{ + .size = size, + .usage = usage, + }; + +#ifdef __APPLE__ + // Fix for detiler artifacts on macOS + const bool is_large_buffer = true; +#else + const bool is_large_buffer = size > 128_MB; +#endif + VmaAllocationCreateInfo alloc_info{ + .flags = !is_storage ? VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | + VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT + : static_cast(0), + .usage = is_large_buffer ? VMA_MEMORY_USAGE_AUTO_PREFER_HOST + : VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, + .requiredFlags = !is_storage ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT + : static_cast(0), + }; + + VkBuffer buffer; + VmaAllocation allocation; + const auto buffer_ci_unsafe = static_cast(buffer_ci); + const auto result = vmaCreateBuffer(instance.GetAllocator(), &buffer_ci_unsafe, &alloc_info, + &buffer, &allocation, nullptr); + ASSERT(result == VK_SUCCESS); + return {buffer, allocation}; +} + +void TileManager::Upload(ScratchBuffer buffer, const void* data, size_t size) { + VmaAllocationInfo alloc_info{}; + vmaGetAllocationInfo(instance.GetAllocator(), buffer.second, &alloc_info); + ASSERT(size <= alloc_info.size); + void* ptr{}; + const auto result = vmaMapMemory(instance.GetAllocator(), buffer.second, &ptr); + ASSERT(result == VK_SUCCESS); + std::memcpy(ptr, data, size); + vmaUnmapMemory(instance.GetAllocator(), buffer.second); +} + +void TileManager::FreeBuffer(ScratchBuffer buffer) { + vmaDestroyBuffer(instance.GetAllocator(), buffer.first, buffer.second); +} + +std::optional TileManager::TryDetile(Image& image) { + if (!image.info.props.is_tiled) { + return std::nullopt; } const auto* detiler = GetDetiler(image); if (!detiler) { LOG_ERROR(Render_Vulkan, "Unsupported tiled 
image: {} ({})", vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode)); - return false; + return std::nullopt; } - const auto offset = - staging.Copy(image.cpu_addr, image.info.guest_size_bytes, instance.StorageMinAlignment()); - image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderWrite); + // Prepare input buffer + auto in_buffer = AllocBuffer(image.info.guest_size_bytes); + Upload(in_buffer, reinterpret_cast(image.info.guest_address), + image.info.guest_size_bytes); + + // Prepare output buffer + auto out_buffer = AllocBuffer(image.info.guest_size_bytes, true); + + scheduler.DeferOperation([=, this]() { + FreeBuffer(in_buffer); + FreeBuffer(out_buffer); + }); auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *detiler->pl); const vk::DescriptorBufferInfo input_buffer_info{ - .buffer = staging.Handle(), - .offset = offset, + .buffer = in_buffer.first, + .offset = 0, .range = image.info.guest_size_bytes, }; - ASSERT(image.view_for_detiler.has_value()); - const vk::DescriptorImageInfo output_image_info{ - .imageView = *image.view_for_detiler->image_view, - .imageLayout = image.layout, + const vk::DescriptorBufferInfo output_buffer_info{ + .buffer = out_buffer.first, + .offset = 0, + .range = image.info.guest_size_bytes, }; std::vector set_writes{ @@ -357,20 +420,44 @@ bool TileManager::TryDetile(Image& image) { .dstBinding = 1, .dstArrayElement = 0, .descriptorCount = 1, - .descriptorType = vk::DescriptorType::eStorageImage, - .pImageInfo = &output_image_info, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &output_buffer_info, }, }; cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *detiler->pl_layout, 0, set_writes); - cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u, - sizeof(image.info.pitch), &image.info.pitch); + DetilerParams params; + params.pitch0 = image.info.pitch >> (image.info.props.is_block ? 
2u : 0u); + params.num_levels = image.info.resources.levels; - cmdbuf.dispatch((image.info.size.width * image.info.size.height) / 64, 1, - 1); // round to 64 + ASSERT(image.info.resources.levels <= 14); + std::memset(&params.sizes, 0, sizeof(params.sizes)); + for (int m = 0; m < image.info.resources.levels; ++m) { + params.sizes[m] = image.info.mips_layout[m].size * image.info.resources.layers + + (m > 0 ? params.sizes[m - 1] : 0); + } - return true; + auto pitch = image.info.pitch; + cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(params), + &params); + + ASSERT((image.info.guest_size_bytes % 64) == 0); + const auto bpp = image.info.num_bits * (image.info.props.is_block ? 16u : 1u); + const auto num_tiles = image.info.guest_size_bytes / (64 * (bpp / 8)); + cmdbuf.dispatch(num_tiles, 1, 1); + + const vk::BufferMemoryBarrier post_barrier{ + .srcAccessMask = vk::AccessFlagBits::eShaderWrite, + .dstAccessMask = vk::AccessFlagBits::eTransferRead, + .buffer = out_buffer.first, + .size = image.info.guest_size_bytes, + }; + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion, + {}, post_barrier, {}); + + return {out_buffer.first}; } } // namespace VideoCore diff --git a/src/video_core/texture_cache/tile_manager.h b/src/video_core/texture_cache/tile_manager.h index 98a33786..9102da08 100644 --- a/src/video_core/texture_cache/tile_manager.h +++ b/src/video_core/texture_cache/tile_manager.h @@ -34,10 +34,16 @@ struct DetilerContext { class TileManager { public: + using ScratchBuffer = std::pair; + TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler); ~TileManager(); - bool TryDetile(Image& image); + std::optional TryDetile(Image& image); + + ScratchBuffer AllocBuffer(u32 size, bool is_storage = false); + void Upload(ScratchBuffer buffer, const void* data, size_t size); + void FreeBuffer(ScratchBuffer buffer); private: const
DetilerContext* GetDetiler(const Image& image) const; @@ -45,7 +51,6 @@ private: private: const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; - Vulkan::StreamBuffer staging; std::array detilers; };