diff --git a/src/common/config.cpp b/src/common/config.cpp index 57f40b21..7e677f84 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -15,7 +15,7 @@ static u32 screenWidth = 1280; static u32 screenHeight = 720; static s32 gpuId = -1; // Vulkan physical device index. Set to negative for auto select static std::string logFilter; -static std::string logType = "sync"; +static std::string logType = "async"; static bool isDebugDump = false; static bool isLibc = true; static bool isShowSplash = false; diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp index 0b03c86b..a21af8bb 100644 --- a/src/common/logging/backend.cpp +++ b/src/common/logging/backend.cpp @@ -207,8 +207,8 @@ public: message_queue.EmplaceWait(entry); } else { ForEachBackend([&entry](auto& backend) { backend.Write(entry); }); + std::fflush(stdout); } - std::fflush(stdout); } private: diff --git a/src/core/address_space.h b/src/core/address_space.h index e2515902..29f74f56 100644 --- a/src/core/address_space.h +++ b/src/core/address_space.h @@ -34,10 +34,7 @@ constexpr VAddr USER_MAX = 0xFBFFFFFFFFULL; static constexpr size_t SystemManagedSize = SYSTEM_MANAGED_MAX - SYSTEM_MANAGED_MIN + 1; static constexpr size_t SystemReservedSize = SYSTEM_RESERVED_MAX - SYSTEM_RESERVED_MIN + 1; -// User area size is normally larger than this. However games are unlikely to map to high -// regions of that area, so by default we allocate a smaller virtual address space (about 1/4th). -// to save space on page tables. -static constexpr size_t UserSize = 1ULL << 39; +static constexpr size_t UserSize = 1ULL << 40; /** * Represents the user virtual address space backed by a dmem memory block diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp index 3177770b..2bcff191 100644 --- a/src/core/file_sys/fs.cpp +++ b/src/core/file_sys/fs.cpp @@ -70,7 +70,7 @@ std::filesystem::path MntPoints::GetHostPath(const std::string& guest_directory) // exist in filesystem but in different case. 
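Note on the logging changes above: the default log type flips from "sync" to "async", and the per-message std::fflush now runs only on the synchronous path, since the asynchronous backend drains entries on a worker thread. A minimal sketch of that split, with illustrative names rather than the emulator's actual logger API:

```cpp
#include <cstdio>
#include <mutex>
#include <queue>
#include <string>

class MiniLogger {
public:
    explicit MiniLogger(bool async) : async_{async} {}

    void Write(const std::string& msg) {
        if (async_) {
            std::lock_guard lk{mutex_};
            queue_.push(msg); // drained later by a worker thread
        } else {
            std::fputs(msg.c_str(), stdout);
            std::fflush(stdout); // flush only when writing on the caller's thread
        }
    }

private:
    bool async_;
    std::mutex mutex_;
    std::queue<std::string> queue_;
};
```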
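The MntPoints::GetHostPath hunk that continues below resolves guest paths case-insensitively against the host filesystem, so a guest path can match a host entry that differs only in casing. A standalone sketch of the technique, assuming helper names of my own rather than the emulator's:

```cpp
#include <algorithm>
#include <cctype>
#include <filesystem>
#include <string>
#include <vector>

namespace fs = std::filesystem;

// Case-insensitive comparison of two path components.
bool IEquals(const std::string& a, const std::string& b) {
    return a.size() == b.size() &&
           std::equal(a.begin(), a.end(), b.begin(), [](unsigned char x, unsigned char y) {
               return std::tolower(x) == std::tolower(y);
           });
}

// Resolve `parts` against `host_dir`, preferring an existing host entry that
// matches ignoring case, and falling back to the guest's casing otherwise.
fs::path ResolveIgnoreCase(fs::path host_dir, const std::vector<std::string>& parts) {
    for (const auto& part : parts) {
        bool matched = false;
        if (fs::is_directory(host_dir)) {
            for (const auto& entry : fs::directory_iterator(host_dir)) {
                const auto name = entry.path().filename().string();
                if (IEquals(name, part)) {
                    host_dir /= name; // keep the host's own casing
                    matched = true;
                    break;
                }
            }
        }
        if (!matched) {
            host_dir /= part;
        }
    }
    return host_dir;
}
```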
auto guest_path = current_path; while (!path_parts.empty()) { - const auto& part = path_parts.back(); + const auto part = path_parts.back(); const auto add_match = [&](const auto& host_part) { current_path /= host_part; guest_path /= part; diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index dba69d6e..650252f9 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -957,7 +957,7 @@ int PS4_SYSV_ABI sceGnmGetGpuBlockStatus() { } int PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); + LOG_DEBUG(Lib_GnmDriver, "(STUBBED) called"); return ORBIS_OK; } diff --git a/src/core/libraries/kernel/file_system.cpp b/src/core/libraries/kernel/file_system.cpp index 4a42b0d6..f8386347 100644 --- a/src/core/libraries/kernel/file_system.cpp +++ b/src/core/libraries/kernel/file_system.cpp @@ -472,6 +472,28 @@ s64 PS4_SYSV_ABI sceKernelPwrite(int d, void* buf, size_t nbytes, s64 offset) { return file->f.WriteRaw(buf, nbytes); } +s32 PS4_SYSV_ABI sceKernelRename(const char* from, const char* to) { + auto* mnt = Common::Singleton<Core::FileSys::MntPoints>::Instance(); + const auto src_path = mnt->GetHostPath(from); + if (!std::filesystem::exists(src_path)) { + return ORBIS_KERNEL_ERROR_ENOENT; + } + const auto dst_path = mnt->GetHostPath(to); + const bool src_is_dir = std::filesystem::is_directory(src_path); + const bool dst_is_dir = std::filesystem::is_directory(dst_path); + if (src_is_dir && !dst_is_dir) { + return ORBIS_KERNEL_ERROR_ENOTDIR; + } + if (!src_is_dir && dst_is_dir) { + return ORBIS_KERNEL_ERROR_EISDIR; + } + if (dst_is_dir && !std::filesystem::is_empty(dst_path)) { + return ORBIS_KERNEL_ERROR_ENOTEMPTY; + } + std::filesystem::copy(src_path, dst_path, std::filesystem::copy_options::overwrite_existing); + return ORBIS_OK; +} + void fileSystemSymbolsRegister(Core::Loader::SymbolsResolver* sym) { std::srand(std::time(nullptr)); LIB_FUNCTION("1G3lF1Gg1k8", "libkernel", 1, "libkernel", 1, 1, sceKernelOpen); @@ -493,6 +515,7 @@ void fileSystemSymbolsRegister(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("kBwCPsYX-m4", "libkernel", 1, "libkernel", 1, 1, sceKernelFStat); LIB_FUNCTION("mqQMh1zPPT8", "libScePosix", 1, "libkernel", 1, 1, posix_fstat); LIB_FUNCTION("VW3TVZiM4-E", "libkernel", 1, "libkernel", 1, 1, sceKernelFtruncate); + LIB_FUNCTION("52NcYU9+lEo", "libkernel", 1, "libkernel", 1, 1, sceKernelRename); LIB_FUNCTION("E6ao34wPw+U", "libScePosix", 1, "libkernel", 1, 1, posix_stat); LIB_FUNCTION("+r3rMFwItV4", "libkernel", 1, "libkernel", 1, 1, sceKernelPread); diff --git a/src/core/libraries/kernel/libkernel.cpp b/src/core/libraries/kernel/libkernel.cpp index 5f2e5a50..e2625819 100644 --- a/src/core/libraries/kernel/libkernel.cpp +++ b/src/core/libraries/kernel/libkernel.cpp @@ -7,6 +7,7 @@ #include #include "common/assert.h" +#include "common/debug.h" #include "common/logging/log.h" #include "common/polyfill_thread.h" #include "common/singleton.h" @@ -84,6 +85,9 @@ static PS4_SYSV_ABI void stack_chk_fail() { int PS4_SYSV_ABI sceKernelMunmap(void* addr, size_t len) { LOG_INFO(Kernel_Vmm, "addr = {}, len = {:#x}", fmt::ptr(addr), len); + if (len == 0) { + return ORBIS_OK; + } auto* memory = Core::Memory::Instance(); memory->UnmapMemory(std::bit_cast<VAddr>(addr), len); return SCE_OK; diff --git a/src/core/libraries/kernel/memory_management.cpp b/src/core/libraries/kernel/memory_management.cpp index d396e1d7..94762c4a 100644 --- a/src/core/libraries/kernel/memory_management.cpp
+++ b/src/core/libraries/kernel/memory_management.cpp @@ -262,6 +262,16 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn LOG_INFO(Kernel_Vmm, "BatchMap: entry = {}, operation = {}, len = {:#x}, result = {}", i, entries[i].operation, entries[i].length, result); + if (result == 0) + processed++; + } else if (entries[i].operation == MemoryOpTypes::ORBIS_KERNEL_MAP_OP_MAP_FLEXIBLE) { + result = sceKernelMapNamedFlexibleMemory(&entries[i].start, entries[i].length, + entries[i].protection, flags, ""); + LOG_INFO(Kernel_Vmm, + "BatchMap: entry = {}, operation = {}, len = {:#x}, type = {}, " + "result = {}", + i, entries[i].operation, entries[i].length, (u8)entries[i].type, result); + if (result == 0) processed++; } else { diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 4615b246..c5237d0a 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -439,11 +439,7 @@ int PS4_SYSV_ABI scePthreadMutexInit(ScePthreadMutex* mutex, const ScePthreadMut int result = pthread_mutex_init(&(*mutex)->pth_mutex, &(*attr)->pth_mutex_attr); - static auto mutex_loc = MUTEX_LOCATION("mutex"); - (*mutex)->tracy_lock = std::make_unique(&mutex_loc); - if (name != nullptr) { - (*mutex)->tracy_lock->CustomName(name, std::strlen(name)); LOG_INFO(Kernel_Pthread, "name={}, result={}", name, result); } @@ -555,15 +551,11 @@ int PS4_SYSV_ABI scePthreadMutexLock(ScePthreadMutex* mutex) { return SCE_KERNEL_ERROR_EINVAL; } - (*mutex)->tracy_lock->BeforeLock(); - int result = pthread_mutex_lock(&(*mutex)->pth_mutex); if (result != 0) { LOG_TRACE(Kernel_Pthread, "Locked name={}, result={}", (*mutex)->name, result); } - (*mutex)->tracy_lock->AfterLock(); - switch (result) { case 0: return SCE_OK; @@ -589,8 +581,6 @@ int PS4_SYSV_ABI scePthreadMutexUnlock(ScePthreadMutex* mutex) { LOG_TRACE(Kernel_Pthread, "Unlocking name={}, result={}", (*mutex)->name, result); } - (*mutex)->tracy_lock->AfterUnlock(); - switch (result) { case 0: return SCE_OK; @@ -1195,8 +1185,6 @@ int PS4_SYSV_ABI scePthreadMutexTrylock(ScePthreadMutex* mutex) { LOG_TRACE(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result); } - (*mutex)->tracy_lock->AfterTryLock(result == 0); - switch (result) { case 0: return ORBIS_OK; diff --git a/src/core/libraries/kernel/thread_management.h b/src/core/libraries/kernel/thread_management.h index 1b33ac94..c5935275 100644 --- a/src/core/libraries/kernel/thread_management.h +++ b/src/core/libraries/kernel/thread_management.h @@ -9,7 +9,6 @@ #include #include #include -#include "common/debug.h" #include "common/types.h" namespace Core::Loader { @@ -74,7 +73,6 @@ struct PthreadMutexInternal { u8 reserved[256]; std::string name; pthread_mutex_t pth_mutex; - std::unique_ptr tracy_lock; }; struct PthreadMutexattrInternal { diff --git a/src/core/libraries/network/net.cpp b/src/core/libraries/network/net.cpp index 1569a51c..958f9264 100644 --- a/src/core/libraries/network/net.cpp +++ b/src/core/libraries/network/net.cpp @@ -559,7 +559,7 @@ int PS4_SYSV_ABI sceNetEpollDestroy() { } int PS4_SYSV_ABI sceNetEpollWait() { - LOG_ERROR(Lib_Net, "(STUBBED) called"); + LOG_TRACE(Lib_Net, "(STUBBED) called"); return ORBIS_OK; } diff --git a/src/core/libraries/network/netctl.cpp b/src/core/libraries/network/netctl.cpp index ab1cb8ae..a1c8e81c 100644 --- a/src/core/libraries/network/netctl.cpp +++ b/src/core/libraries/network/netctl.cpp @@ -79,7 +79,7 @@ int PS4_SYSV_ABI 
sceNetCtlUnregisterCallbackV6() { } int PS4_SYSV_ABI sceNetCtlCheckCallback() { - LOG_ERROR(Lib_NetCtl, "(STUBBED) called"); + LOG_TRACE(Lib_NetCtl, "(STUBBED) called"); return ORBIS_OK; } diff --git a/src/core/libraries/np_manager/np_manager.cpp b/src/core/libraries/np_manager/np_manager.cpp index ee4b3d6b..33308abc 100644 --- a/src/core/libraries/np_manager/np_manager.cpp +++ b/src/core/libraries/np_manager/np_manager.cpp @@ -870,7 +870,7 @@ int PS4_SYSV_ABI sceNpAsmTerminate() { } int PS4_SYSV_ABI sceNpCheckCallback() { - LOG_ERROR(Lib_NpManager, "(STUBBED) called"); + LOG_TRACE(Lib_NpManager, "(STUBBED) called"); return ORBIS_OK; } @@ -3510,4 +3510,4 @@ void RegisterlibSceNpManager(Core::Loader::SymbolsResolver* sym) { sceNpUnregisterStateCallbackForToolkit); }; -} // namespace Libraries::NpManager \ No newline at end of file +} // namespace Libraries::NpManager diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 561014a3..c7042763 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -183,6 +183,7 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { ctx.AddCapability(spv::Capability::Float16); ctx.AddCapability(spv::Capability::Int16); } + ctx.AddCapability(spv::Capability::Int64); if (info.has_storage_images) { ctx.AddCapability(spv::Capability::StorageImageExtendedFormats); } @@ -204,8 +205,8 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { } else { ctx.AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft); } + ctx.AddCapability(spv::Capability::GroupNonUniform); if (info.uses_group_quad) { - ctx.AddCapability(spv::Capability::GroupNonUniform); ctx.AddCapability(spv::Capability::GroupNonUniformQuad); } if (info.has_discard) { @@ -217,9 +218,9 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { if (info.has_image_query) { ctx.AddCapability(spv::Capability::ImageQuery); } - // if (program.info.stores_frag_depth) { - // ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); - // } + if (info.stores.Get(IR::Attribute::Depth)) { + ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); + } break; default: throw NotImplementedException("Stage {}", u32(program.info.stage)); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp index da29f392..03a0a00f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp @@ -6,8 +6,8 @@ namespace Shader::Backend::SPIRV { -void EmitBitCastU16F16(EmitContext&) { - UNREACHABLE_MSG("SPIR-V Instruction"); +Id EmitBitCastU16F16(EmitContext& ctx, Id value) { + return ctx.OpBitcast(ctx.U16, value); } Id EmitBitCastU32F32(EmitContext& ctx, Id value) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 87ffa150..02480303 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -120,6 +120,7 @@ void EmitGetGotoVariable(EmitContext&) { } Id EmitReadConst(EmitContext& ctx) { + return ctx.u32_zero_value; UNREACHABLE_MSG("Unreachable instruction"); } @@ -149,6 +150,9 @@ Id EmitGetAttribute(EmitContext& ctx, 
IR::Attribute attr, u32 comp) { // Attribute is disabled or varying component is not written return ctx.ConstF32(comp == 3 ? 1.0f : 0.0f); } + if (param.is_default) { + return ctx.OpCompositeExtract(param.component_type, param.id, comp); + } if (param.num_components > 1) { const Id pointer{ @@ -208,7 +212,7 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 element) { const Id pointer{OutputAttrPointer(ctx, attr, element)}; - ctx.OpStore(pointer, value); + ctx.OpStore(pointer, ctx.OpBitcast(ctx.F32[1], value)); } Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp index ede592e0..945fa687 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp @@ -259,4 +259,8 @@ Id EmitConvertU16U32(EmitContext& ctx, Id value) { return ctx.OpUConvert(ctx.U16, value); } +Id EmitConvertU32U16(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U32[1], value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp index 911983a4..e822eabe 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_floating_point.cpp @@ -385,4 +385,8 @@ Id EmitFPIsInf64(EmitContext& ctx, Id value) { return ctx.OpIsInf(ctx.U1[1], value); } +void EmitFPCmpClass32(EmitContext&) { + UNREACHABLE(); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 17def57a..030d3948 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -70,7 +70,6 @@ Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id o const u32 comp = inst->Flags().gather_comp.Value(); ImageOperands operands; operands.Add(spv::ImageOperandsMask::Offset, offset); - operands.Add(spv::ImageOperandsMask::Lod, ctx.ConstF32(0.f)); return ctx.OpImageGather(ctx.F32[4], sampled_image, coords, ctx.ConstU32(comp), operands.mask, operands.operands); } @@ -106,8 +105,7 @@ Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, u32 handle, Id lod const auto type = ctx.info.images[handle & 0xFFFF].type; const Id zero = ctx.u32_zero_value; const auto mips{[&] { return skip_mips ? zero : ctx.OpImageQueryLevels(ctx.U32[1], image); }}; - const bool uses_lod{type != AmdGpu::ImageType::Color2DMsaa && - type != AmdGpu::ImageType::Buffer}; + const bool uses_lod{type != AmdGpu::ImageType::Color2DMsaa}; const auto query{[&](Id type) { return uses_lod ? 
ctx.OpImageQuerySizeLod(type, image, lod) : ctx.OpImageQuerySize(type, image); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 80dd66b1..51899eb4 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -42,6 +42,7 @@ void EmitSetVcc(EmitContext& ctx); void EmitSetSccLo(EmitContext& ctx); void EmitSetVccLo(EmitContext& ctx); void EmitSetVccHi(EmitContext& ctx); +void EmitFPCmpClass32(EmitContext& ctx); void EmitPrologue(EmitContext& ctx); void EmitEpilogue(EmitContext& ctx); void EmitDiscard(EmitContext& ctx); @@ -148,7 +149,7 @@ Id EmitSelectU64(EmitContext& ctx, Id cond, Id true_value, Id false_value); Id EmitSelectF16(EmitContext& ctx, Id cond, Id true_value, Id false_value); Id EmitSelectF32(EmitContext& ctx, Id cond, Id true_value, Id false_value); Id EmitSelectF64(EmitContext& ctx, Id cond, Id true_value, Id false_value); -void EmitBitCastU16F16(EmitContext& ctx); +Id EmitBitCastU16F16(EmitContext& ctx, Id value); Id EmitBitCastU32F32(EmitContext& ctx, Id value); void EmitBitCastU64F64(EmitContext& ctx); Id EmitBitCastF16U16(EmitContext& ctx, Id value); @@ -282,6 +283,7 @@ Id EmitBitCount32(EmitContext& ctx, Id value); Id EmitBitwiseNot32(EmitContext& ctx, Id value); Id EmitFindSMsb32(EmitContext& ctx, Id value); Id EmitFindUMsb32(EmitContext& ctx, Id value); +Id EmitFindILsb32(EmitContext& ctx, Id value); Id EmitSMin32(EmitContext& ctx, Id a, Id b); Id EmitUMin32(EmitContext& ctx, Id a, Id b); Id EmitSMax32(EmitContext& ctx, Id a, Id b); @@ -353,6 +355,7 @@ Id EmitConvertF64U16(EmitContext& ctx, Id value); Id EmitConvertF64U32(EmitContext& ctx, Id value); Id EmitConvertF64U64(EmitContext& ctx, Id value); Id EmitConvertU16U32(EmitContext& ctx, Id value); +Id EmitConvertU32U16(EmitContext& ctx, Id value); Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, Id offset); @@ -387,6 +390,7 @@ Id EmitImageAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value); Id EmitLaneId(EmitContext& ctx); +Id EmitWarpId(EmitContext& ctx); Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index 019ceb01..f20c4fac 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -198,6 +198,10 @@ Id EmitFindUMsb32(EmitContext& ctx, Id value) { return ctx.OpFindUMsb(ctx.U32[1], value); } +Id EmitFindILsb32(EmitContext& ctx, Id value) { + return ctx.OpFindILsb(ctx.U32[1], value); +} + Id EmitSMin32(EmitContext& ctx, Id a, Id b) { return ctx.OpSMin(ctx.U32[1], a, b); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index a1751588..bd4ac066 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -10,6 +10,10 @@ Id SubgroupScope(EmitContext& ctx) { return ctx.ConstU32(static_cast(spv::Scope::Subgroup)); } +Id EmitWarpId(EmitContext& ctx) { + return ctx.OpLoad(ctx.U32[1], ctx.subgroup_id); +} + Id EmitLaneId(EmitContext& ctx) { return ctx.OpLoad(ctx.U32[1], 
ctx.subgroup_local_invocation_id); } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 9ce87add..f7b30052 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -49,7 +49,7 @@ EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& bin DefineInterfaces(program); DefineBuffers(info); DefineImagesAndSamplers(info); - DefineSharedMemory(info); + DefineSharedMemory(); } EmitContext::~EmitContext() = default; @@ -86,6 +86,7 @@ void EmitContext::DefineArithmeticTypes() { F32[1] = Name(TypeFloat(32), "f32_id"); S32[1] = Name(TypeSInt(32), "i32_id"); U32[1] = Name(TypeUInt(32), "u32_id"); + U64 = Name(TypeUInt(64), "u64_id"); for (u32 i = 2; i <= 4; i++) { if (info.uses_fp16) { @@ -126,6 +127,7 @@ Id GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) { case AmdGpu::NumberFormat::Float: case AmdGpu::NumberFormat::Unorm: case AmdGpu::NumberFormat::Snorm: + case AmdGpu::NumberFormat::SnormNz: return ctx.F32[4]; case AmdGpu::NumberFormat::Sint: return ctx.S32[4]; @@ -146,6 +148,7 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f case AmdGpu::NumberFormat::Float: case AmdGpu::NumberFormat::Unorm: case AmdGpu::NumberFormat::Snorm: + case AmdGpu::NumberFormat::SnormNz: return {id, input_f32, F32[1], 4}; case AmdGpu::NumberFormat::Uint: return {id, input_u32, U32[1], 4}; @@ -204,7 +207,9 @@ void EmitContext::DefineInputs(const Info& info) { : 1; // Note that we pass index rather than Id input_params[input.binding] = { - rate_idx, input_u32, U32[1], input.num_components, input.instance_data_buf, + rate_idx, input_u32, + U32[1], input.num_components, + false, input.instance_data_buf, }; } else { Id id{DefineInput(type, input.binding)}; @@ -220,19 +225,18 @@ void EmitContext::DefineInputs(const Info& info) { break; } case Stage::Fragment: - if (info.uses_group_quad) { - subgroup_local_invocation_id = DefineVariable( - U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); - Decorate(subgroup_local_invocation_id, spv::Decoration::Flat); - } + subgroup_id = DefineVariable(U32[1], spv::BuiltIn::SubgroupId, spv::StorageClass::Input); + subgroup_local_invocation_id = DefineVariable( + U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); + Decorate(subgroup_local_invocation_id, spv::Decoration::Flat); frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input); frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output); front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input); for (const auto& input : info.ps_inputs) { const u32 semantic = input.param_index; if (input.is_default) { - input_params[semantic] = {MakeDefaultValue(*this, input.default_value), input_f32, - F32[1]}; + input_params[semantic] = {MakeDefaultValue(*this, input.default_value), F32[1], + F32[1], 4, true}; continue; } const IR::Attribute param{IR::Attribute::Param0 + input.param_index}; @@ -392,7 +396,16 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) { image.GetNumberFmt() == AmdGpu::NumberFormat::Uint) { return spv::ImageFormat::Rgba8ui; } - UNREACHABLE(); + if (image.GetDataFmt() == AmdGpu::DataFormat::Format10_11_11 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::R11fG11fB10f; + } + if (image.GetDataFmt() == 
AmdGpu::DataFormat::Format32_32_32_32 && + image.GetNumberFmt() == AmdGpu::NumberFormat::Float) { + return spv::ImageFormat::Rgba32f; + } + UNREACHABLE_MSG("Unknown storage format data_format={}, num_format={}", image.GetDataFmt(), + image.GetNumberFmt()); } Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { @@ -412,8 +425,6 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, sampled, format); case AmdGpu::ImageType::Cube: return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, false, false, sampled, format); - case AmdGpu::ImageType::Buffer: - throw NotImplementedException("Image buffer"); default: break; } @@ -471,10 +482,14 @@ void EmitContext::DefineImagesAndSamplers(const Info& info) { } } -void EmitContext::DefineSharedMemory(const Info& info) { - if (info.shared_memory_size == 0) { +void EmitContext::DefineSharedMemory() { + static constexpr size_t DefaultSharedMemSize = 16_KB; + if (!info.uses_shared) { return; } + if (info.shared_memory_size == 0) { + info.shared_memory_size = DefaultSharedMemSize; + } const auto make{[&](Id element_type, u32 element_size) { const u32 num_elements{Common::DivCeil(info.shared_memory_size, element_size)}; const Id array_type{TypeArray(element_type, ConstU32(num_elements))}; diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index fc678344..34c13d3f 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -180,6 +180,7 @@ public: Id workgroup_id{}; Id local_invocation_id{}; + Id subgroup_id{}; Id subgroup_local_invocation_id{}; Id image_u32{}; @@ -219,6 +220,7 @@ public: Id pointer_type; Id component_type; u32 num_components; + bool is_default{}; s32 buffer_handle{-1}; }; std::array input_params{}; @@ -231,7 +233,7 @@ private: void DefineOutputs(const Info& info); void DefineBuffers(const Info& info); void DefineImagesAndSamplers(const Info& info); - void DefineSharedMemory(const Info& info); + void DefineSharedMemory(); SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id); }; diff --git a/src/shader_recompiler/frontend/format.cpp b/src/shader_recompiler/frontend/format.cpp index 634566fa..8df3ac36 100644 --- a/src/shader_recompiler/frontend/format.cpp +++ b/src/shader_recompiler/frontend/format.cpp @@ -1479,7 +1479,7 @@ constexpr std::array InstructionFormatVOP3 = {{ {InstClass::VectorFpGraph32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, ScalarType::Float32}, // 337 = V_MIN3_F32 - {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, + {InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, ScalarType::Float32}, // 338 = V_MIN3_I32 {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Sint32, @@ -1488,7 +1488,7 @@ constexpr std::array InstructionFormatVOP3 = {{ {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 340 = V_MAX3_F32 - {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, + {InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, ScalarType::Float32}, // 341 = V_MAX3_I32 {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Sint32, @@ -1497,7 +1497,7 @@ constexpr std::array InstructionFormatVOP3 = {{ {InstClass::VectorIntArith32, 
InstCategory::VectorALU, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 343 = V_MED3_F32 - {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, + {InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32, ScalarType::Float32}, // 344 = V_MED3_I32 {InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Sint32, @@ -2779,11 +2779,9 @@ constexpr std::array InstructionFormatDS = {{ // 60 = DS_READ_U16 {InstClass::DsIdxRd, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 61 = DS_CONSUME - {InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 62 = DS_APPEND - {InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Undefined, - ScalarType::Undefined}, + {InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32}, // 63 = DS_ORDERED_COUNT {InstClass::GdsOrdCnt, InstCategory::DataShare, 3, 1, ScalarType::Undefined, ScalarType::Undefined}, diff --git a/src/shader_recompiler/frontend/instruction.h b/src/shader_recompiler/frontend/instruction.h index d1d10efb..f83f43db 100644 --- a/src/shader_recompiler/frontend/instruction.h +++ b/src/shader_recompiler/frontend/instruction.h @@ -76,11 +76,11 @@ struct SMRD { }; struct InstControlSOPK { - BitField<0, 16, u32> simm; + s16 simm; }; struct InstControlSOPP { - BitField<0, 16, u32> simm; + s16 simm; }; struct InstControlVOP3 { diff --git a/src/shader_recompiler/frontend/structured_control_flow.cpp b/src/shader_recompiler/frontend/structured_control_flow.cpp index 346f00aa..c8d73858 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.cpp +++ b/src/shader_recompiler/frontend/structured_control_flow.cpp @@ -600,13 +600,13 @@ public: TranslatePass(ObjectPool<IR::Inst>& inst_pool_, ObjectPool<IR::Block>& block_pool_, ObjectPool<Statement>& stmt_pool_, Statement& root_stmt, IR::AbstractSyntaxList& syntax_list_, std::span<const GcnInst> inst_list_, - Info& info_) + Info& info_, const Profile& profile_) : stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_}, - syntax_list{syntax_list_}, inst_list{inst_list_}, info{info_} { + syntax_list{syntax_list_}, inst_list{inst_list_}, info{info_}, profile{profile_} { Visit(root_stmt, nullptr, nullptr); IR::Block& first_block{*syntax_list.front().data.block}; - Translator{&first_block, info}.EmitPrologue(); + Translator{&first_block, info, profile}.EmitPrologue(); } private: @@ -635,7 +635,7 @@ private: const u32 start = stmt.block->begin_index; const u32 size = stmt.block->end_index - start + 1; Translate(current_block, stmt.block->begin, inst_list.subspan(start, size), - info); + info, profile); } break; } @@ -815,16 +815,18 @@ private: const Block dummy_flow_block{.is_dummy = true}; std::span<const GcnInst> inst_list; Info& info; + const Profile& profile; }; } // Anonymous namespace IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool, - CFG& cfg, Info& info) { + CFG& cfg, Info& info, const Profile& profile) { ObjectPool<Statement> stmt_pool{64}; GotoPass goto_pass{cfg, stmt_pool}; Statement& root{goto_pass.RootStatement()}; IR::AbstractSyntaxList syntax_list; - TranslatePass{inst_pool, block_pool, stmt_pool, root, syntax_list, cfg.inst_list, info}; + TranslatePass{inst_pool, block_pool, stmt_pool, root, + syntax_list, cfg.inst_list, info, profile}; ASSERT_MSG(!info.translation_failed, "Shader translation has failed"); return syntax_list; } diff
--git a/src/shader_recompiler/frontend/structured_control_flow.h b/src/shader_recompiler/frontend/structured_control_flow.h index 09814349..da4ef1ff 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.h +++ b/src/shader_recompiler/frontend/structured_control_flow.h @@ -11,12 +11,13 @@ namespace Shader { struct Info; -} +struct Profile; +} // namespace Shader namespace Shader::Gcn { [[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool, CFG& cfg, - Info& info); + Info& info, const Profile& profile); } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index c5d9f0ec..14837166 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -5,6 +5,31 @@ namespace Shader::Gcn { +void Translator::EmitDataShare(const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::DS_SWIZZLE_B32: + return DS_SWIZZLE_B32(inst); + case Opcode::DS_READ_B32: + return DS_READ(32, false, false, inst); + case Opcode::DS_READ_B64: + return DS_READ(64, false, false, inst); + case Opcode::DS_READ2_B32: + return DS_READ(32, false, true, inst); + case Opcode::DS_READ2_B64: + return DS_READ(64, false, true, inst); + case Opcode::DS_WRITE_B32: + return DS_WRITE(32, false, false, inst); + case Opcode::DS_WRITE_B64: + return DS_WRITE(64, false, false, inst); + case Opcode::DS_WRITE2_B32: + return DS_WRITE(32, false, true, inst); + case Opcode::DS_WRITE2_B64: + return DS_WRITE(64, false, true, inst); + default: + LogMissingOpcode(inst); + } +} + void Translator::DS_SWIZZLE_B32(const GcnInst& inst) { const u8 offset0 = inst.control.ds.offset0; const u8 offset1 = inst.control.ds.offset1; @@ -20,14 +45,25 @@ void Translator::DS_SWIZZLE_B32(const GcnInst& inst) { void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst) { const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; - const IR::VectorReg dst_reg{inst.dst[0].code}; + IR::VectorReg dst_reg{inst.dst[0].code}; if (is_pair) { - // Pair loads are either 32 or 64-bit. We assume 32-bit for now.
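The DS_READ/DS_WRITE rework in this hunk adds the 64-bit pair forms, spreading each element across consecutive VGPRs (one per 32 bits). A plain-C++ model of the read side against a byte-addressed LDS array; it mirrors the translator in adding offset0/offset1 unscaled, whereas the ISA documents them as scaled by the element size:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Reads one 32- or 64-bit element at `addr + offset` from `lds` and appends
// it to `vgprs` as one or two 32-bit lanes.
void ReadPairElement(const std::vector<uint8_t>& lds, uint32_t addr,
                     uint32_t offset, int bit_size,
                     std::vector<uint32_t>& vgprs) {
    const uint32_t base = addr + offset;
    uint32_t lo = 0;
    std::memcpy(&lo, lds.data() + base, sizeof(lo));
    vgprs.push_back(lo);
    if (bit_size == 64) {
        uint32_t hi = 0;
        std::memcpy(&hi, lds.data() + base + 4, sizeof(hi));
        vgprs.push_back(hi);
    }
}

// DS_READ2: two independent elements, two offsets, 2 or 4 destination VGPRs.
void DsRead2(const std::vector<uint8_t>& lds, uint32_t addr, uint32_t offset0,
             uint32_t offset1, int bit_size, std::vector<uint32_t>& vgprs) {
    ReadPairElement(lds, addr, offset0, bit_size, vgprs);
    ReadPairElement(lds, addr, offset1, bit_size, vgprs);
}
```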
- ASSERT(bit_size == 32); + // Pair loads are either 32 or 64-bit const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); - ir.SetVectorReg(dst_reg, IR::U32{ir.LoadShared(32, is_signed, addr0)}); + const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); + if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data0}); + } else { + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)}); + } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.LoadShared(32, is_signed, addr1)}); + const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); + if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data1}); + } else { + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)}); + } } else if (bit_size == 64) { const IR::Value data = ir.LoadShared(bit_size, is_signed, addr); ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)}); @@ -43,11 +79,22 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; if (is_pair) { - ASSERT(bit_size == 32); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0))); - ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + if (bit_size == 32) { + ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + } else { + ir.WriteShared( + 64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)), + addr0); + } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1))); - ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + if (bit_size == 32) { + ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + } else { + ir.WriteShared( + 64, ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1)), + addr1); + } } else if (bit_size == 64) { const IR::Value data = ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); @@ -62,7 +109,18 @@ void Translator::S_BARRIER() { } void Translator::V_READFIRSTLANE_B32(const GcnInst& inst) { - UNREACHABLE(); + ASSERT(info.stage != Stage::Compute); + SetDst(inst.dst[0], GetSrc(inst.src[0])); +} + +void Translator::V_READLANE_B32(const GcnInst& inst) { + ASSERT(info.stage != Stage::Compute); + SetDst(inst.dst[0], GetSrc(inst.src[0])); +} + +void Translator::V_WRITELANE_B32(const GcnInst& inst) { + ASSERT(info.stage != Stage::Compute); + SetDst(inst.dst[0], GetSrc(inst.src[0])); } } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp index 51840537..889de21b 100644 --- a/src/shader_recompiler/frontend/translate/export.cpp +++ b/src/shader_recompiler/frontend/translate/export.cpp @@ -6,7 +6,7 @@ namespace Shader::Gcn { -void Translator::EXP(const GcnInst& inst) { +void Translator::EmitExport(const GcnInst& inst) { if (ir.block->has_multiple_predecessors && info.stage == Stage::Fragment) { LOG_WARNING(Render_Recompiler, "An ambiguous export appeared in translation"); ir.Discard(ir.LogicalNot(ir.GetExec())); diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index a20e91ca..795b148d 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ 
b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -5,8 +5,102 @@ namespace Shader::Gcn { +void Translator::EmitScalarAlu(const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::S_MOVK_I32: + return S_MOVK(inst); + case Opcode::S_MOV_B32: + return S_MOV(inst); + case Opcode::S_MUL_I32: + return S_MUL_I32(inst); + case Opcode::S_AND_SAVEEXEC_B64: + return S_AND_SAVEEXEC_B64(inst); + case Opcode::S_MOV_B64: + return S_MOV_B64(inst); + case Opcode::S_CMP_LT_U32: + return S_CMP(ConditionOp::LT, false, inst); + case Opcode::S_CMP_LE_U32: + return S_CMP(ConditionOp::LE, false, inst); + case Opcode::S_CMP_LG_U32: + return S_CMP(ConditionOp::LG, false, inst); + case Opcode::S_CMP_LT_I32: + return S_CMP(ConditionOp::LT, true, inst); + case Opcode::S_CMP_LG_I32: + return S_CMP(ConditionOp::LG, true, inst); + case Opcode::S_CMP_GT_I32: + return S_CMP(ConditionOp::GT, true, inst); + case Opcode::S_CMP_GE_I32: + return S_CMP(ConditionOp::GE, true, inst); + case Opcode::S_CMP_EQ_I32: + return S_CMP(ConditionOp::EQ, true, inst); + case Opcode::S_CMP_EQ_U32: + return S_CMP(ConditionOp::EQ, false, inst); + case Opcode::S_CMP_GE_U32: + return S_CMP(ConditionOp::GE, false, inst); + case Opcode::S_CMP_GT_U32: + return S_CMP(ConditionOp::GT, false, inst); + case Opcode::S_OR_B64: + return S_OR_B64(NegateMode::None, false, inst); + case Opcode::S_NOR_B64: + return S_OR_B64(NegateMode::Result, false, inst); + case Opcode::S_XOR_B64: + return S_OR_B64(NegateMode::None, true, inst); + case Opcode::S_ORN2_B64: + return S_OR_B64(NegateMode::Src1, false, inst); + case Opcode::S_AND_B64: + return S_AND_B64(NegateMode::None, inst); + case Opcode::S_NAND_B64: + return S_AND_B64(NegateMode::Result, inst); + case Opcode::S_ANDN2_B64: + return S_AND_B64(NegateMode::Src1, inst); + case Opcode::S_NOT_B64: + return S_NOT_B64(inst); + case Opcode::S_ADD_I32: + return S_ADD_I32(inst); + case Opcode::S_AND_B32: + return S_AND_B32(inst); + case Opcode::S_ASHR_I32: + return S_ASHR_I32(inst); + case Opcode::S_OR_B32: + return S_OR_B32(inst); + case Opcode::S_LSHL_B32: + return S_LSHL_B32(inst); + case Opcode::S_LSHR_B32: + return S_LSHR_B32(inst); + case Opcode::S_CSELECT_B32: + return S_CSELECT_B32(inst); + case Opcode::S_CSELECT_B64: + return S_CSELECT_B64(inst); + case Opcode::S_BFE_U32: + return S_BFE_U32(inst); + case Opcode::S_BFM_B32: + return S_BFM_B32(inst); + case Opcode::S_BREV_B32: + return S_BREV_B32(inst); + case Opcode::S_ADD_U32: + return S_ADD_U32(inst); + case Opcode::S_ADDC_U32: + return S_ADDC_U32(inst); + case Opcode::S_ADDK_I32: + return S_ADDK_I32(inst); + case Opcode::S_MULK_I32: + return S_MULK_I32(inst); + case Opcode::S_SUB_U32: + case Opcode::S_SUB_I32: + return S_SUB_U32(inst); + case Opcode::S_MIN_U32: + return S_MIN_U32(inst); + case Opcode::S_MAX_U32: + return S_MAX_U32(inst); + case Opcode::S_WQM_B64: + break; + default: + LogMissingOpcode(inst); + } +} + void Translator::S_MOVK(const GcnInst& inst) { - const auto simm16 = inst.control.sopk.simm.Value(); + const auto simm16 = inst.control.sopk.simm; if (simm16 & (1 << 15)) { // TODO: need to verify the case of imm sign extension UNREACHABLE(); @@ -14,6 +108,16 @@ void Translator::S_MOVK(const GcnInst& inst) { SetDst(inst.dst[0], ir.Imm32(simm16)); } +void Translator::S_ADDK_I32(const GcnInst& inst) { + const s32 simm16 = inst.control.sopk.simm; + SetDst(inst.dst[0], ir.IAdd(GetSrc(inst.dst[0]), ir.Imm32(simm16))); +} + +void Translator::S_MULK_I32(const GcnInst& inst) { + const s32 simm16 = inst.control.sopk.simm; + 
SetDst(inst.dst[0], ir.IMul(GetSrc(inst.dst[0]), ir.Imm32(simm16))); +} + void Translator::S_MOV(const GcnInst& inst) { SetDst(inst.dst[0], GetSrc(inst.src[0])); } @@ -62,15 +166,10 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { } }(); - // Mark destination SPGR as an EXEC context. This means we will use 1-bit - // IR instruction whenever it's loaded. switch (inst.dst[0].field) { - case OperandField::ScalarGPR: { - const u32 reg = inst.dst[0].code; - exec_contexts[reg] = true; - ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec); + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), exec); break; - } case OperandField::VccLo: ir.SetVcc(exec); break; @@ -79,27 +178,37 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { } // Update EXEC. - ir.SetExec(ir.LogicalAnd(exec, src)); + const IR::U1 result = ir.LogicalAnd(exec, src); + ir.SetExec(result); + ir.SetScc(result); } void Translator::S_MOV_B64(const GcnInst& inst) { - // TODO: Using VCC as EXEC context. - if (inst.src[0].field == OperandField::VccLo || inst.dst[0].field == OperandField::VccLo) { - return; - } - if (inst.dst[0].field == OperandField::ScalarGPR && inst.src[0].field == OperandField::ExecLo) { - // Exec context push - exec_contexts[inst.dst[0].code] = true; - ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), ir.GetExec()); - } else if (inst.dst[0].field == OperandField::ExecLo && - inst.src[0].field == OperandField::ScalarGPR) { - // Exec context pop - exec_contexts[inst.src[0].code] = false; - ir.SetExec(ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code))); - } else if (inst.dst[0].field == OperandField::ExecLo && - inst.src[0].field == OperandField::ConstZero) { - ir.SetExec(ir.Imm1(false)); - } else { + const IR::U1 src = [&] { + switch (inst.src[0].field) { + case OperandField::VccLo: + return ir.GetVcc(); + case OperandField::ExecLo: + return ir.GetExec(); + case OperandField::ScalarGPR: + return ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code)); + case OperandField::ConstZero: + return ir.Imm1(false); + default: + UNREACHABLE(); + } + }(); + switch (inst.dst[0].field) { + case OperandField::ScalarGPR: + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), src); + break; + case OperandField::ExecLo: + ir.SetExec(src); + break; + case OperandField::VccLo: + ir.SetVcc(src); + break; + default: UNREACHABLE(); } } @@ -338,4 +447,20 @@ void Translator::S_ADDC_U32(const GcnInst& inst) { SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), ir.GetSccLo())); } +void Translator::S_MAX_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 result = ir.UMax(src0, src1); + SetDst(inst.dst[0], result); + ir.SetScc(ir.IEqual(result, src0)); +} + +void Translator::S_MIN_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 result = ir.UMin(src0, src1); + SetDst(inst.dst[0], result); + ir.SetScc(ir.IEqual(result, src0)); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index 3c80764c..29f2acc2 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -7,6 +7,29 @@ namespace Shader::Gcn { static constexpr u32 SQ_SRC_LITERAL = 0xFF; +void Translator::EmitScalarMemory(const GcnInst& inst) { + switch (inst.opcode) { 
+ case Opcode::S_LOAD_DWORDX4: + return S_LOAD_DWORD(4, inst); + case Opcode::S_LOAD_DWORDX8: + return S_LOAD_DWORD(8, inst); + case Opcode::S_LOAD_DWORDX16: + return S_LOAD_DWORD(16, inst); + case Opcode::S_BUFFER_LOAD_DWORD: + return S_BUFFER_LOAD_DWORD(1, inst); + case Opcode::S_BUFFER_LOAD_DWORDX2: + return S_BUFFER_LOAD_DWORD(2, inst); + case Opcode::S_BUFFER_LOAD_DWORDX4: + return S_BUFFER_LOAD_DWORD(4, inst); + case Opcode::S_BUFFER_LOAD_DWORDX8: + return S_BUFFER_LOAD_DWORD(8, inst); + case Opcode::S_BUFFER_LOAD_DWORDX16: + return S_BUFFER_LOAD_DWORD(16, inst); + default: + LogMissingOpcode(inst); + } +} + void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) { const auto& smrd = inst.control.smrd; const u32 dword_offset = [&] -> u32 { diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index c4c6e505..e8c2a31c 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -16,13 +16,10 @@ namespace Shader::Gcn { -std::array Translator::exec_contexts{}; - -Translator::Translator(IR::Block* block_, Info& info_) - : ir{*block_, block_->begin()}, info{info_} {} +Translator::Translator(IR::Block* block_, Info& info_, const Profile& profile_) + : ir{*block_, block_->begin()}, info{info_}, profile{profile_} {} void Translator::EmitPrologue() { - exec_contexts.fill(false); ir.Prologue(); ir.SetExec(ir.Imm1(true)); @@ -97,7 +94,7 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { } break; case OperandField::ConstZero: - if (force_flt) { + if (is_float) { value = ir.Imm32(0.f); } else { value = ir.Imm32(0U); @@ -112,14 +109,14 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { value = ir.Imm32(-s32(operand.code) + SignedConstIntNegMin - 1); break; case OperandField::LiteralConst: - if (force_flt) { + if (is_float) { value = ir.Imm32(std::bit_cast(operand.code)); } else { value = ir.Imm32(operand.code); } break; case OperandField::ConstFloatPos_1_0: - if (force_flt) { + if (is_float) { value = ir.Imm32(1.f); } else { value = ir.Imm32(std::bit_cast(1.f)); @@ -138,7 +135,11 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { value = ir.Imm32(-0.5f); break; case OperandField::ConstFloatNeg_1_0: - value = ir.Imm32(-1.0f); + if (is_float) { + value = ir.Imm32(-1.0f); + } else { + value = ir.Imm32(std::bit_cast(-1.0f)); + } break; case OperandField::ConstFloatNeg_2_0: value = ir.Imm32(-2.0f); @@ -160,6 +161,8 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { value = ir.GetVccHi(); } break; + case OperandField::M0: + return m0_value; default: UNREACHABLE(); } @@ -336,6 +339,7 @@ void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) { case OperandField::VccHi: return ir.SetVccHi(result); case OperandField::M0: + m0_value = result; break; default: UNREACHABLE(); @@ -458,712 +462,84 @@ void Translator::EmitFetch(const GcnInst& inst) { } } -void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info) { +void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) { + switch (inst.opcode) { + case Opcode::S_BARRIER: + return S_BARRIER(); + case Opcode::S_TTRACEDATA: + LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!"); + return; + case Opcode::S_GETPC_B64: + return S_GETPC_B64(pc, inst); + case Opcode::S_WAITCNT: + case Opcode::S_NOP: + case Opcode::S_ENDPGM: + case Opcode::S_CBRANCH_EXECZ: + 
case Opcode::S_CBRANCH_SCC0: + case Opcode::S_CBRANCH_SCC1: + case Opcode::S_CBRANCH_VCCNZ: + case Opcode::S_CBRANCH_VCCZ: + case Opcode::S_BRANCH: + return; + default: + UNREACHABLE(); + } +} + +void Translator::LogMissingOpcode(const GcnInst& inst) { + const u32 opcode = u32(inst.opcode); + LOG_ERROR(Render_Recompiler, "Unknown opcode {} ({}, category = {})", + magic_enum::enum_name(inst.opcode), u32(inst.opcode), + magic_enum::enum_name(inst.category)); + info.translation_failed = true; +} + +void Translate(IR::Block* block, u32 pc, std::span inst_list, Info& info, + const Profile& profile) { if (inst_list.empty()) { return; } - Translator translator{block, info}; + Translator translator{block, info, profile}; for (const auto& inst : inst_list) { - block_base += inst.length; - switch (inst.opcode) { - case Opcode::S_MOVK_I32: - translator.S_MOVK(inst); - break; - case Opcode::S_MOV_B32: - translator.S_MOV(inst); - break; - case Opcode::S_MUL_I32: - translator.S_MUL_I32(inst); - break; - case Opcode::V_MAD_F32: - translator.V_MAD_F32(inst); - break; - case Opcode::V_MOV_B32: - translator.V_MOV(inst); - break; - case Opcode::V_MAC_F32: - translator.V_MAC_F32(inst); - break; - case Opcode::V_MUL_F32: - translator.V_MUL_F32(inst); - break; - case Opcode::V_AND_B32: - translator.V_AND_B32(inst); - break; - case Opcode::V_OR_B32: - translator.V_OR_B32(false, inst); - break; - case Opcode::V_XOR_B32: - translator.V_OR_B32(true, inst); - break; - case Opcode::V_LSHLREV_B32: - translator.V_LSHLREV_B32(inst); - break; - case Opcode::V_ADD_I32: - translator.V_ADD_I32(inst); - break; - case Opcode::V_ADDC_U32: - translator.V_ADDC_U32(inst); - break; - case Opcode::V_CVT_F32_I32: - translator.V_CVT_F32_I32(inst); - break; - case Opcode::V_CVT_F32_U32: - translator.V_CVT_F32_U32(inst); - break; - case Opcode::V_RCP_F32: - translator.V_RCP_F32(inst); - break; - case Opcode::S_SWAPPC_B64: + pc += inst.length; + + // Special case for emitting fetch shader. 
+ if (inst.opcode == Opcode::S_SWAPPC_B64) { ASSERT(info.stage == Stage::Vertex); translator.EmitFetch(inst); - break; - case Opcode::S_WAITCNT: - break; - case Opcode::S_LOAD_DWORDX4: - translator.S_LOAD_DWORD(4, inst); - break; - case Opcode::S_LOAD_DWORDX8: - translator.S_LOAD_DWORD(8, inst); - break; - case Opcode::S_LOAD_DWORDX16: - translator.S_LOAD_DWORD(16, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORD: - translator.S_BUFFER_LOAD_DWORD(1, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORDX2: - translator.S_BUFFER_LOAD_DWORD(2, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORDX4: - translator.S_BUFFER_LOAD_DWORD(4, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORDX8: - translator.S_BUFFER_LOAD_DWORD(8, inst); - break; - case Opcode::S_BUFFER_LOAD_DWORDX16: - translator.S_BUFFER_LOAD_DWORD(16, inst); - break; - case Opcode::EXP: - translator.EXP(inst); - break; - case Opcode::V_INTERP_P2_F32: - translator.V_INTERP_P2_F32(inst); - break; - case Opcode::V_CVT_PKRTZ_F16_F32: - translator.V_CVT_PKRTZ_F16_F32(inst); - break; - case Opcode::V_CVT_F32_F16: - translator.V_CVT_F32_F16(inst); - break; - case Opcode::V_CVT_F32_UBYTE0: - translator.V_CVT_F32_UBYTE(0, inst); - break; - case Opcode::V_CVT_F32_UBYTE1: - translator.V_CVT_F32_UBYTE(1, inst); - break; - case Opcode::V_CVT_F32_UBYTE2: - translator.V_CVT_F32_UBYTE(2, inst); - break; - case Opcode::V_CVT_F32_UBYTE3: - translator.V_CVT_F32_UBYTE(3, inst); - break; - case Opcode::V_BFREV_B32: - translator.V_BFREV_B32(inst); - break; - case Opcode::V_LDEXP_F32: - translator.V_LDEXP_F32(inst); - break; - case Opcode::V_FRACT_F32: - translator.V_FRACT_F32(inst); - break; - case Opcode::V_ADD_F32: - translator.V_ADD_F32(inst); - break; - case Opcode::V_CVT_OFF_F32_I4: - translator.V_CVT_OFF_F32_I4(inst); - break; - case Opcode::V_MED3_F32: - translator.V_MED3_F32(inst); - break; - case Opcode::V_FLOOR_F32: - translator.V_FLOOR_F32(inst); - break; - case Opcode::V_SUB_F32: - translator.V_SUB_F32(inst); - break; - case Opcode::V_FMA_F32: - case Opcode::V_MADAK_F32: // Yes these can share the opcode - translator.V_FMA_F32(inst); - break; - case Opcode::IMAGE_SAMPLE_LZ_O: - case Opcode::IMAGE_SAMPLE_O: - case Opcode::IMAGE_SAMPLE_C: - case Opcode::IMAGE_SAMPLE_C_LZ: - case Opcode::IMAGE_SAMPLE_LZ: - case Opcode::IMAGE_SAMPLE: - case Opcode::IMAGE_SAMPLE_L: - case Opcode::IMAGE_SAMPLE_C_O: - case Opcode::IMAGE_SAMPLE_B: - case Opcode::IMAGE_SAMPLE_C_LZ_O: - translator.IMAGE_SAMPLE(inst); - break; - case Opcode::IMAGE_ATOMIC_ADD: - translator.IMAGE_ATOMIC(AtomicOp::Add, inst); - break; - case Opcode::IMAGE_ATOMIC_AND: - translator.IMAGE_ATOMIC(AtomicOp::And, inst); - break; - case Opcode::IMAGE_ATOMIC_OR: - translator.IMAGE_ATOMIC(AtomicOp::Or, inst); - break; - case Opcode::IMAGE_ATOMIC_XOR: - translator.IMAGE_ATOMIC(AtomicOp::Xor, inst); - break; - case Opcode::IMAGE_ATOMIC_UMAX: - translator.IMAGE_ATOMIC(AtomicOp::Umax, inst); - break; - case Opcode::IMAGE_ATOMIC_SMAX: - translator.IMAGE_ATOMIC(AtomicOp::Smax, inst); - break; - case Opcode::IMAGE_ATOMIC_UMIN: - translator.IMAGE_ATOMIC(AtomicOp::Umin, inst); - break; - case Opcode::IMAGE_ATOMIC_SMIN: - translator.IMAGE_ATOMIC(AtomicOp::Smin, inst); - break; - case Opcode::IMAGE_ATOMIC_INC: - translator.IMAGE_ATOMIC(AtomicOp::Inc, inst); - break; - case Opcode::IMAGE_ATOMIC_DEC: - translator.IMAGE_ATOMIC(AtomicOp::Dec, inst); - break; - case Opcode::IMAGE_GET_LOD: - translator.IMAGE_GET_LOD(inst); - break; - case Opcode::IMAGE_GATHER4_C: - case Opcode::IMAGE_GATHER4_LZ: - case 
Opcode::IMAGE_GATHER4_LZ_O: - translator.IMAGE_GATHER(inst); - break; - case Opcode::IMAGE_STORE: - translator.IMAGE_STORE(inst); - break; - case Opcode::IMAGE_LOAD_MIP: - translator.IMAGE_LOAD(true, inst); - break; - case Opcode::IMAGE_LOAD: - translator.IMAGE_LOAD(false, inst); - break; - case Opcode::V_MAD_U64_U32: - translator.V_MAD_U64_U32(inst); - break; - case Opcode::V_CMP_GE_I32: - translator.V_CMP_U32(ConditionOp::GE, true, false, inst); - break; - case Opcode::V_CMP_EQ_I32: - translator.V_CMP_U32(ConditionOp::EQ, true, false, inst); - break; - case Opcode::V_CMP_LE_I32: - translator.V_CMP_U32(ConditionOp::LE, true, false, inst); - break; - case Opcode::V_CMP_NE_I32: - translator.V_CMP_U32(ConditionOp::LG, true, false, inst); - break; - case Opcode::V_CMP_NE_U32: - translator.V_CMP_U32(ConditionOp::LG, false, false, inst); - break; - case Opcode::V_CMP_EQ_U32: - translator.V_CMP_U32(ConditionOp::EQ, false, false, inst); - break; - case Opcode::V_CMP_F_U32: - translator.V_CMP_U32(ConditionOp::F, false, false, inst); - break; - case Opcode::V_CMP_LT_U32: - translator.V_CMP_U32(ConditionOp::LT, false, false, inst); - break; - case Opcode::V_CMP_GT_U32: - translator.V_CMP_U32(ConditionOp::GT, false, false, inst); - break; - case Opcode::V_CMP_GE_U32: - translator.V_CMP_U32(ConditionOp::GE, false, false, inst); - break; - case Opcode::V_CMP_TRU_U32: - translator.V_CMP_U32(ConditionOp::TRU, false, false, inst); - break; - case Opcode::V_CMP_NEQ_F32: - translator.V_CMP_F32(ConditionOp::LG, false, inst); - break; - case Opcode::V_CMP_F_F32: - translator.V_CMP_F32(ConditionOp::F, false, inst); - break; - case Opcode::V_CMP_LT_F32: - translator.V_CMP_F32(ConditionOp::LT, false, inst); - break; - case Opcode::V_CMP_EQ_F32: - translator.V_CMP_F32(ConditionOp::EQ, false, inst); - break; - case Opcode::V_CMP_LE_F32: - translator.V_CMP_F32(ConditionOp::LE, false, inst); - break; - case Opcode::V_CMP_GT_F32: - translator.V_CMP_F32(ConditionOp::GT, false, inst); - break; - case Opcode::V_CMP_LG_F32: - translator.V_CMP_F32(ConditionOp::LG, false, inst); - break; - case Opcode::V_CMP_GE_F32: - translator.V_CMP_F32(ConditionOp::GE, false, inst); - break; - case Opcode::V_CMP_NLE_F32: - translator.V_CMP_F32(ConditionOp::GT, false, inst); - break; - case Opcode::V_CMP_NLT_F32: - translator.V_CMP_F32(ConditionOp::GE, false, inst); - break; - case Opcode::V_CMP_NGT_F32: - translator.V_CMP_F32(ConditionOp::LE, false, inst); - break; - case Opcode::V_CMP_NGE_F32: - translator.V_CMP_F32(ConditionOp::LT, false, inst); - break; - case Opcode::S_CMP_LT_U32: - translator.S_CMP(ConditionOp::LT, false, inst); - break; - case Opcode::S_CMP_LE_U32: - translator.S_CMP(ConditionOp::LE, false, inst); - break; - case Opcode::S_CMP_LG_U32: - translator.S_CMP(ConditionOp::LG, false, inst); - break; - case Opcode::S_CMP_LT_I32: - translator.S_CMP(ConditionOp::LT, true, inst); - break; - case Opcode::S_CMP_LG_I32: - translator.S_CMP(ConditionOp::LG, true, inst); - break; - case Opcode::S_CMP_GT_I32: - translator.S_CMP(ConditionOp::GT, true, inst); - break; - case Opcode::S_CMP_GE_I32: - translator.S_CMP(ConditionOp::GE, true, inst); - break; - case Opcode::S_CMP_EQ_I32: - translator.S_CMP(ConditionOp::EQ, true, inst); - break; - case Opcode::S_CMP_EQ_U32: - translator.S_CMP(ConditionOp::EQ, false, inst); - break; - case Opcode::S_LSHL_B32: - translator.S_LSHL_B32(inst); - break; - case Opcode::V_CNDMASK_B32: - translator.V_CNDMASK_B32(inst); - break; - case Opcode::TBUFFER_LOAD_FORMAT_X: - translator.BUFFER_LOAD_FORMAT(1, 
true, true, inst);
-            break;
-        case Opcode::TBUFFER_LOAD_FORMAT_XY:
-            translator.BUFFER_LOAD_FORMAT(2, true, true, inst);
-            break;
-        case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
-            translator.BUFFER_LOAD_FORMAT(3, true, true, inst);
-            break;
-        case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
-            translator.BUFFER_LOAD_FORMAT(4, true, true, inst);
-            break;
-        case Opcode::BUFFER_LOAD_FORMAT_X:
-            translator.BUFFER_LOAD_FORMAT(1, false, true, inst);
-            break;
-        case Opcode::BUFFER_LOAD_FORMAT_XY:
-            translator.BUFFER_LOAD_FORMAT(2, false, true, inst);
-            break;
-        case Opcode::BUFFER_LOAD_FORMAT_XYZ:
-            translator.BUFFER_LOAD_FORMAT(3, false, true, inst);
-            break;
-        case Opcode::BUFFER_LOAD_FORMAT_XYZW:
-            translator.BUFFER_LOAD_FORMAT(4, false, true, inst);
-            break;
-        case Opcode::BUFFER_LOAD_DWORD:
-            translator.BUFFER_LOAD_FORMAT(1, false, false, inst);
-            break;
-        case Opcode::BUFFER_LOAD_DWORDX2:
-            translator.BUFFER_LOAD_FORMAT(2, false, false, inst);
-            break;
-        case Opcode::BUFFER_LOAD_DWORDX3:
-            translator.BUFFER_LOAD_FORMAT(3, false, false, inst);
-            break;
-        case Opcode::BUFFER_LOAD_DWORDX4:
-            translator.BUFFER_LOAD_FORMAT(4, false, false, inst);
-            break;
-        case Opcode::BUFFER_STORE_FORMAT_X:
-        case Opcode::BUFFER_STORE_DWORD:
-            translator.BUFFER_STORE_FORMAT(1, false, inst);
-            break;
-        case Opcode::BUFFER_STORE_DWORDX2:
-            translator.BUFFER_STORE_FORMAT(2, false, inst);
-            break;
-        case Opcode::BUFFER_STORE_DWORDX3:
-            translator.BUFFER_STORE_FORMAT(3, false, inst);
-            break;
-        case Opcode::BUFFER_STORE_FORMAT_XYZW:
-        case Opcode::BUFFER_STORE_DWORDX4:
-            translator.BUFFER_STORE_FORMAT(4, false, inst);
-            break;
-        case Opcode::V_MAX_F32:
-            translator.V_MAX_F32(inst);
-            break;
-        case Opcode::V_MAX_I32:
-            translator.V_MAX_U32(true, inst);
-            break;
-        case Opcode::V_MAX_U32:
-            translator.V_MAX_U32(false, inst);
-            break;
-        case Opcode::V_NOT_B32:
-            translator.V_NOT_B32(inst);
-            break;
-        case Opcode::V_RSQ_F32:
-            translator.V_RSQ_F32(inst);
-            break;
-        case Opcode::S_ANDN2_B64:
-            translator.S_AND_B64(NegateMode::Src1, inst);
-            break;
-        case Opcode::S_ORN2_B64:
-            translator.S_OR_B64(NegateMode::Src1, false, inst);
-            break;
-        case Opcode::V_SIN_F32:
-            translator.V_SIN_F32(inst);
-            break;
-        case Opcode::V_COS_F32:
-            translator.V_COS_F32(inst);
-            break;
-        case Opcode::V_LOG_F32:
-            translator.V_LOG_F32(inst);
-            break;
-        case Opcode::V_EXP_F32:
-            translator.V_EXP_F32(inst);
-            break;
-        case Opcode::V_SQRT_F32:
-            translator.V_SQRT_F32(inst);
-            break;
-        case Opcode::V_MIN_F32:
-            translator.V_MIN_F32(inst);
-            break;
-        case Opcode::V_MIN_I32:
-            translator.V_MIN_I32(inst);
-            break;
-        case Opcode::V_MIN3_F32:
-            translator.V_MIN3_F32(inst);
-            break;
-        case Opcode::V_MIN_LEGACY_F32:
-            translator.V_MIN_F32(inst, true);
-            break;
-        case Opcode::V_MADMK_F32:
-            translator.V_MADMK_F32(inst);
-            break;
-        case Opcode::V_CUBEMA_F32:
-            translator.V_CUBEMA_F32(inst);
-            break;
-        case Opcode::V_CUBESC_F32:
-            translator.V_CUBESC_F32(inst);
-            break;
-        case Opcode::V_CUBETC_F32:
-            translator.V_CUBETC_F32(inst);
-            break;
-        case Opcode::V_CUBEID_F32:
-            translator.V_CUBEID_F32(inst);
-            break;
-        case Opcode::V_CVT_U32_F32:
-            translator.V_CVT_U32_F32(inst);
-            break;
-        case Opcode::V_CVT_I32_F32:
-            translator.V_CVT_I32_F32(inst);
-            break;
-        case Opcode::V_CVT_FLR_I32_F32:
-            translator.V_CVT_FLR_I32_F32(inst);
-            break;
-        case Opcode::V_SUBREV_F32:
-            translator.V_SUBREV_F32(inst);
-            break;
-        case Opcode::S_AND_SAVEEXEC_B64:
-            translator.S_AND_SAVEEXEC_B64(inst);
-            break;
-        case Opcode::S_MOV_B64:
-            translator.S_MOV_B64(inst);
-            break;
-        case Opcode::V_SUBREV_I32:
-            translator.V_SUBREV_I32(inst);
-            break;
+            continue;
+        }
-        case Opcode::V_CMPX_F_F32:
-            translator.V_CMP_F32(ConditionOp::F, true, inst);
+        // Emit instructions for each category.
+        switch (inst.category) {
+        case InstCategory::DataShare:
+            translator.EmitDataShare(inst);
             break;
-        case Opcode::V_CMPX_LT_F32:
-            translator.V_CMP_F32(ConditionOp::LT, true, inst);
+        case InstCategory::VectorInterpolation:
+            translator.EmitVectorInterpolation(inst);
             break;
-        case Opcode::V_CMPX_EQ_F32:
-            translator.V_CMP_F32(ConditionOp::EQ, true, inst);
+        case InstCategory::ScalarMemory:
+            translator.EmitScalarMemory(inst);
             break;
-        case Opcode::V_CMPX_LE_F32:
-            translator.V_CMP_F32(ConditionOp::LE, true, inst);
+        case InstCategory::VectorMemory:
+            translator.EmitVectorMemory(inst);
             break;
-        case Opcode::V_CMPX_GT_F32:
-            translator.V_CMP_F32(ConditionOp::GT, true, inst);
+        case InstCategory::Export:
+            translator.EmitExport(inst);
             break;
-        case Opcode::V_CMPX_LG_F32:
-            translator.V_CMP_F32(ConditionOp::LG, true, inst);
+        case InstCategory::FlowControl:
+            translator.EmitFlowControl(pc, inst);
             break;
-        case Opcode::V_CMPX_GE_F32:
-            translator.V_CMP_F32(ConditionOp::GE, true, inst);
+        case InstCategory::ScalarALU:
+            translator.EmitScalarAlu(inst);
             break;
-        case Opcode::V_CMPX_NGE_F32:
-            translator.V_CMP_F32(ConditionOp::LT, true, inst);
+        case InstCategory::VectorALU:
+            translator.EmitVectorAlu(inst);
             break;
-        case Opcode::V_CMPX_NLG_F32:
-            translator.V_CMP_F32(ConditionOp::EQ, true, inst);
-            break;
-        case Opcode::V_CMPX_NGT_F32:
-            translator.V_CMP_F32(ConditionOp::LE, true, inst);
-            break;
-        case Opcode::V_CMPX_NLE_F32:
-            translator.V_CMP_F32(ConditionOp::GT, true, inst);
-            break;
-        case Opcode::V_CMPX_NEQ_F32:
-            translator.V_CMP_F32(ConditionOp::LG, true, inst);
-            break;
-        case Opcode::V_CMPX_NLT_F32:
-            translator.V_CMP_F32(ConditionOp::GE, true, inst);
-            break;
-        case Opcode::V_CMPX_TRU_F32:
-            translator.V_CMP_F32(ConditionOp::TRU, true, inst);
-            break;
-        case Opcode::V_CMP_LE_U32:
-            translator.V_CMP_U32(ConditionOp::LE, false, false, inst);
-            break;
-        case Opcode::V_CMP_GT_I32:
-            translator.V_CMP_U32(ConditionOp::GT, true, false, inst);
-            break;
-        case Opcode::V_CMP_LT_I32:
-            translator.V_CMP_U32(ConditionOp::LT, true, false, inst);
-            break;
-        case Opcode::V_CMPX_LT_I32:
-            translator.V_CMP_U32(ConditionOp::LT, true, true, inst);
-            break;
-        case Opcode::V_CMPX_F_U32:
-            translator.V_CMP_U32(ConditionOp::F, false, true, inst);
-            break;
-        case Opcode::V_CMPX_LT_U32:
-            translator.V_CMP_U32(ConditionOp::LT, false, true, inst);
-            break;
-        case Opcode::V_CMPX_EQ_U32:
-            translator.V_CMP_U32(ConditionOp::EQ, false, true, inst);
-            break;
-        case Opcode::V_CMPX_LE_U32:
-            translator.V_CMP_U32(ConditionOp::LE, false, true, inst);
-            break;
-        case Opcode::V_CMPX_GT_U32:
-            translator.V_CMP_U32(ConditionOp::GT, false, true, inst);
-            break;
-        case Opcode::V_CMPX_NE_U32:
-            translator.V_CMP_U32(ConditionOp::LG, false, true, inst);
-            break;
-        case Opcode::V_CMPX_GE_U32:
-            translator.V_CMP_U32(ConditionOp::GE, false, true, inst);
-            break;
-        case Opcode::V_CMPX_TRU_U32:
-            translator.V_CMP_U32(ConditionOp::TRU, false, true, inst);
-            break;
-        case Opcode::S_OR_B64:
-            translator.S_OR_B64(NegateMode::None, false, inst);
-            break;
-        case Opcode::S_NOR_B64:
-            translator.S_OR_B64(NegateMode::Result, false, inst);
-            break;
-        case Opcode::S_XOR_B64:
-            translator.S_OR_B64(NegateMode::None, true, inst);
-            break;
-        case Opcode::S_AND_B64:
-            translator.S_AND_B64(NegateMode::None, inst);
-            break;
-        case Opcode::S_NOT_B64:
-            translator.S_NOT_B64(inst);
-            break;
-        case Opcode::S_NAND_B64:
-            translator.S_AND_B64(NegateMode::Result, inst);
-            break;
-        case Opcode::V_LSHRREV_B32:
-            translator.V_LSHRREV_B32(inst);
-            break;
-        case Opcode::S_ADD_I32:
-            translator.S_ADD_I32(inst);
-            break;
-        case Opcode::V_MUL_HI_U32:
-            translator.V_MUL_HI_U32(false, inst);
-            break;
-        case Opcode::V_MUL_LO_I32:
-            translator.V_MUL_LO_U32(inst);
-            break;
-        case Opcode::V_SAD_U32:
-            translator.V_SAD_U32(inst);
-            break;
-        case Opcode::V_BFE_U32:
-            translator.V_BFE_U32(false, inst);
-            break;
-        case Opcode::V_BFE_I32:
-            translator.V_BFE_U32(true, inst);
-            break;
-        case Opcode::V_MAD_I32_I24:
-            translator.V_MAD_I32_I24(inst);
-            break;
-        case Opcode::V_MUL_I32_I24:
-        case Opcode::V_MUL_U32_U24:
-            translator.V_MUL_I32_I24(inst);
-            break;
-        case Opcode::V_SUB_I32:
-            translator.V_SUB_I32(inst);
-            break;
-        case Opcode::V_LSHR_B32:
-            translator.V_LSHR_B32(inst);
-            break;
-        case Opcode::V_ASHRREV_I32:
-            translator.V_ASHRREV_I32(inst);
-            break;
-        case Opcode::V_MAD_U32_U24:
-            translator.V_MAD_U32_U24(inst);
-            break;
-        case Opcode::S_AND_B32:
-            translator.S_AND_B32(inst);
-            break;
-        case Opcode::S_ASHR_I32:
-            translator.S_ASHR_I32(inst);
-            break;
-        case Opcode::S_OR_B32:
-            translator.S_OR_B32(inst);
-            break;
-        case Opcode::S_LSHR_B32:
-            translator.S_LSHR_B32(inst);
-            break;
-        case Opcode::S_CSELECT_B32:
-            translator.S_CSELECT_B32(inst);
-            break;
-        case Opcode::S_CSELECT_B64:
-            translator.S_CSELECT_B64(inst);
-            break;
-        case Opcode::S_BFE_U32:
-            translator.S_BFE_U32(inst);
-            break;
-        case Opcode::V_RNDNE_F32:
-            translator.V_RNDNE_F32(inst);
-            break;
-        case Opcode::V_BCNT_U32_B32:
-            translator.V_BCNT_U32_B32(inst);
-            break;
-        case Opcode::V_MAX3_F32:
-            translator.V_MAX3_F32(inst);
-            break;
-        case Opcode::DS_SWIZZLE_B32:
-            translator.DS_SWIZZLE_B32(inst);
-            break;
-        case Opcode::V_MUL_LO_U32:
-            translator.V_MUL_LO_U32(inst);
-            break;
-        case Opcode::S_BFM_B32:
-            translator.S_BFM_B32(inst);
-            break;
-        case Opcode::V_MIN_U32:
-            translator.V_MIN_U32(inst);
-            break;
-        case Opcode::V_CMP_NE_U64:
-            translator.V_CMP_NE_U64(inst);
-            break;
-        case Opcode::V_CMP_CLASS_F32:
-            translator.V_CMP_CLASS_F32(inst);
-            break;
-        case Opcode::V_TRUNC_F32:
-            translator.V_TRUNC_F32(inst);
-            break;
-        case Opcode::V_CEIL_F32:
-            translator.V_CEIL_F32(inst);
-            break;
-        case Opcode::V_BFI_B32:
-            translator.V_BFI_B32(inst);
-            break;
-        case Opcode::S_BREV_B32:
-            translator.S_BREV_B32(inst);
-            break;
-        case Opcode::S_ADD_U32:
-            translator.S_ADD_U32(inst);
-            break;
-        case Opcode::S_ADDC_U32:
-            translator.S_ADDC_U32(inst);
-            break;
-        case Opcode::S_SUB_U32:
-        case Opcode::S_SUB_I32:
-            translator.S_SUB_U32(inst);
-            break;
-        // TODO: Separate implementation for legacy variants.
-        case Opcode::V_MUL_LEGACY_F32:
-            translator.V_MUL_F32(inst);
-            break;
-        case Opcode::V_MAC_LEGACY_F32:
-            translator.V_MAC_F32(inst);
-            break;
-        case Opcode::V_MAD_LEGACY_F32:
-            translator.V_MAD_F32(inst);
-            break;
-        case Opcode::V_MAX_LEGACY_F32:
-            translator.V_MAX_F32(inst, true);
-            break;
-        case Opcode::V_RSQ_LEGACY_F32:
-        case Opcode::V_RSQ_CLAMP_F32:
-            translator.V_RSQ_F32(inst);
-            break;
-        case Opcode::V_RCP_IFLAG_F32:
-            translator.V_RCP_F32(inst);
-            break;
-        case Opcode::IMAGE_GET_RESINFO:
-            translator.IMAGE_GET_RESINFO(inst);
-            break;
-        case Opcode::S_BARRIER:
-            translator.S_BARRIER();
-            break;
-        case Opcode::S_TTRACEDATA:
-            LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!");
-            break;
-        case Opcode::DS_READ_B32:
-            translator.DS_READ(32, false, false, inst);
-            break;
-        case Opcode::DS_READ2_B32:
-            translator.DS_READ(32, false, true, inst);
-            break;
-        case Opcode::DS_WRITE_B32:
-            translator.DS_WRITE(32, false, false, inst);
-            break;
-        case Opcode::DS_WRITE2_B32:
-            translator.DS_WRITE(32, false, true, inst);
-            break;
-        case Opcode::V_READFIRSTLANE_B32:
-            translator.V_READFIRSTLANE_B32(inst);
-            break;
-        case Opcode::S_GETPC_B64:
-            translator.S_GETPC_B64(block_base, inst);
-            break;
-        case Opcode::S_NOP:
-        case Opcode::S_CBRANCH_EXECZ:
-        case Opcode::S_CBRANCH_SCC0:
-        case Opcode::S_CBRANCH_SCC1:
-        case Opcode::S_CBRANCH_VCCNZ:
-        case Opcode::S_CBRANCH_VCCZ:
-        case Opcode::S_BRANCH:
-        case Opcode::S_WQM_B64:
-        case Opcode::V_INTERP_P1_F32:
-        case Opcode::S_ENDPGM:
+        case InstCategory::DebugProfile:
             break;
         default:
-            const u32 opcode = u32(inst.opcode);
-            LOG_ERROR(Render_Recompiler, "Unknown opcode {} ({})",
-                      magic_enum::enum_name(inst.opcode), opcode);
-            info.translation_failed = true;
+            UNREACHABLE();
         }
     }
 }
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index 3203ad73..8d1b7683 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -11,7 +11,8 @@
 
 namespace Shader {
 struct Info;
-}
+struct Profile;
+} // namespace Shader
 
 namespace Shader::Gcn {
@@ -24,6 +25,7 @@ enum class ConditionOp : u32 {
     LT,
     LE,
     TRU,
+    U,
 };
 
 enum class AtomicOp : u32 {
@@ -53,10 +55,19 @@ enum class NegateMode : u32 {
 
 class Translator {
 public:
-    explicit Translator(IR::Block* block_, Info& info);
+    explicit Translator(IR::Block* block_, Info& info, const Profile& profile);
 
+    // Instruction categories
     void EmitPrologue();
     void EmitFetch(const GcnInst& inst);
+    void EmitDataShare(const GcnInst& inst);
+    void EmitVectorInterpolation(const GcnInst& inst);
+    void EmitScalarMemory(const GcnInst& inst);
+    void EmitVectorMemory(const GcnInst& inst);
+    void EmitExport(const GcnInst& inst);
+    void EmitFlowControl(u32 pc, const GcnInst& inst);
+    void EmitScalarAlu(const GcnInst& inst);
+    void EmitVectorAlu(const GcnInst& inst);
 
     // Scalar ALU
     void S_MOVK(const GcnInst& inst);
@@ -83,6 +94,10 @@ public:
     void S_SUB_U32(const GcnInst& inst);
     void S_GETPC_B64(u32 pc, const GcnInst& inst);
     void S_ADDC_U32(const GcnInst& inst);
+    void S_MULK_I32(const GcnInst& inst);
+    void S_ADDK_I32(const GcnInst& inst);
+    void S_MAX_U32(const GcnInst& inst);
+    void S_MIN_U32(const GcnInst& inst);
 
     // Scalar Memory
     void S_LOAD_DWORD(int num_dwords, const GcnInst& inst);
@@ -94,11 +109,13 @@ public:
     void V_MAC_F32(const GcnInst& inst);
     void V_CVT_PKRTZ_F16_F32(const GcnInst& inst);
     void V_CVT_F32_F16(const GcnInst& inst);
+    void V_CVT_F16_F32(const GcnInst& inst);
     void V_MUL_F32(const GcnInst& inst);
     void V_CNDMASK_B32(const GcnInst& inst);
     void V_OR_B32(bool is_xor, const GcnInst& inst);
     void V_AND_B32(const GcnInst& inst);
     void V_LSHLREV_B32(const GcnInst& inst);
+    void V_LSHL_B32(const GcnInst& inst);
     void V_ADD_I32(const GcnInst& inst);
     void V_ADDC_U32(const GcnInst& inst);
     void V_CVT_F32_I32(const GcnInst& inst);
@@ -122,6 +139,7 @@ public:
     void V_SQRT_F32(const GcnInst& inst);
     void V_MIN_F32(const GcnInst& inst, bool is_legacy = false);
     void V_MIN3_F32(const GcnInst& inst);
+    void V_MIN3_I32(const GcnInst& inst);
     void V_MADMK_F32(const GcnInst& inst);
     void V_CUBEMA_F32(const GcnInst& inst);
     void V_CUBESC_F32(const GcnInst& inst);
@@ -146,6 +164,7 @@ public:
     void V_BCNT_U32_B32(const GcnInst& inst);
     void V_COS_F32(const GcnInst& inst);
     void V_MAX3_F32(const GcnInst& inst);
+    void V_MAX3_U32(const GcnInst& inst);
     void V_CVT_I32_F32(const GcnInst& inst);
     void V_MIN_I32(const GcnInst& inst);
     void V_MUL_LO_U32(const GcnInst& inst);
@@ -160,6 +179,8 @@ public:
     void V_LDEXP_F32(const GcnInst& inst);
     void V_CVT_FLR_I32_F32(const GcnInst& inst);
     void V_CMP_CLASS_F32(const GcnInst& inst);
+    void V_FFBL_B32(const GcnInst& inst);
+    void V_MBCNT_U32_B32(bool is_low, const GcnInst& inst);
 
     // Vector Memory
     void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst);
@@ -167,12 +188,15 @@ public:
 
     // Vector interpolation
     void V_INTERP_P2_F32(const GcnInst& inst);
+    void V_INTERP_MOV_F32(const GcnInst& inst);
 
     // Data share
     void DS_SWIZZLE_B32(const GcnInst& inst);
     void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst);
     void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst);
     void V_READFIRSTLANE_B32(const GcnInst& inst);
+    void V_READLANE_B32(const GcnInst& inst);
+    void V_WRITELANE_B32(const GcnInst& inst);
     void S_BARRIER();
 
     // MIMG
@@ -184,9 +208,6 @@ public:
     void IMAGE_GET_LOD(const GcnInst& inst);
     void IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst);
 
-    // Export
-    void EXP(const GcnInst& inst);
-
 private:
     template <typename T = IR::U32F32>
    [[nodiscard]] T GetSrc(const InstOperand& operand, bool flt_zero = false);
@@ -195,12 +216,17 @@ private:
     void SetDst(const InstOperand& operand, const IR::U32F32& value);
     void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw);
 
+    void LogMissingOpcode(const GcnInst& inst);
+
 private:
     IR::IREmitter ir;
     Info& info;
-    static std::array<bool, IR::NumScalarRegs> exec_contexts;
+    const Profile& profile;
+    IR::U32 m0_value;
+    bool opcode_missing = false;
 };
 
-void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_list, Info& info);
+void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_list, Info& info,
+               const Profile& profile);
 
 } // namespace Shader::Gcn
diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp
index 1b2024f8..669ef7ca 100644
--- a/src/shader_recompiler/frontend/translate/vector_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp
@@ -2,9 +2,311 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include "shader_recompiler/frontend/translate/translate.h"
+#include "shader_recompiler/profile.h"
 
 namespace Shader::Gcn {
 
+void Translator::EmitVectorAlu(const GcnInst& inst) {
+    switch (inst.opcode) {
+    case Opcode::V_LSHLREV_B32:
+        return V_LSHLREV_B32(inst);
+    case Opcode::V_LSHL_B32:
+        return V_LSHL_B32(inst);
+    case Opcode::V_BFREV_B32:
+        return V_BFREV_B32(inst);
+    case Opcode::V_BFE_U32:
+        return V_BFE_U32(false, inst);
+    case Opcode::V_BFE_I32:
+        return V_BFE_U32(true, inst);
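+    // Note: V_BFE_I32 reuses the V_BFE_U32 helper; the boolean flag selects
+    // sign-extended extraction of the bit-field.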
+    case Opcode::V_BFI_B32:
+        return V_BFI_B32(inst);
+    case Opcode::V_LSHR_B32:
+        return V_LSHR_B32(inst);
+    case Opcode::V_ASHRREV_I32:
+        return V_ASHRREV_I32(inst);
+    case Opcode::V_LSHRREV_B32:
+        return V_LSHRREV_B32(inst);
+    case Opcode::V_NOT_B32:
+        return V_NOT_B32(inst);
+    case Opcode::V_AND_B32:
+        return V_AND_B32(inst);
+    case Opcode::V_OR_B32:
+        return V_OR_B32(false, inst);
+    case Opcode::V_XOR_B32:
+        return V_OR_B32(true, inst);
+    case Opcode::V_FFBL_B32:
+        return V_FFBL_B32(inst);
+
+    case Opcode::V_MOV_B32:
+        return V_MOV(inst);
+    case Opcode::V_ADD_I32:
+        return V_ADD_I32(inst);
+    case Opcode::V_ADDC_U32:
+        return V_ADDC_U32(inst);
+    case Opcode::V_CVT_F32_I32:
+        return V_CVT_F32_I32(inst);
+    case Opcode::V_CVT_F32_U32:
+        return V_CVT_F32_U32(inst);
+    case Opcode::V_CVT_PKRTZ_F16_F32:
+        return V_CVT_PKRTZ_F16_F32(inst);
+    case Opcode::V_CVT_F32_F16:
+        return V_CVT_F32_F16(inst);
+    case Opcode::V_CVT_F16_F32:
+        return V_CVT_F16_F32(inst);
+    case Opcode::V_CVT_F32_UBYTE0:
+        return V_CVT_F32_UBYTE(0, inst);
+    case Opcode::V_CVT_F32_UBYTE1:
+        return V_CVT_F32_UBYTE(1, inst);
+    case Opcode::V_CVT_F32_UBYTE2:
+        return V_CVT_F32_UBYTE(2, inst);
+    case Opcode::V_CVT_F32_UBYTE3:
+        return V_CVT_F32_UBYTE(3, inst);
+    case Opcode::V_CVT_OFF_F32_I4:
+        return V_CVT_OFF_F32_I4(inst);
+    case Opcode::V_MAD_U64_U32:
+        return V_MAD_U64_U32(inst);
+    case Opcode::V_CMP_GE_I32:
+        return V_CMP_U32(ConditionOp::GE, true, false, inst);
+    case Opcode::V_CMP_EQ_I32:
+        return V_CMP_U32(ConditionOp::EQ, true, false, inst);
+    case Opcode::V_CMP_LE_I32:
+        return V_CMP_U32(ConditionOp::LE, true, false, inst);
+    case Opcode::V_CMP_NE_I32:
+        return V_CMP_U32(ConditionOp::LG, true, false, inst);
+    case Opcode::V_CMP_NE_U32:
+        return V_CMP_U32(ConditionOp::LG, false, false, inst);
+    case Opcode::V_CMP_EQ_U32:
+        return V_CMP_U32(ConditionOp::EQ, false, false, inst);
+    case Opcode::V_CMP_F_U32:
+        return V_CMP_U32(ConditionOp::F, false, false, inst);
+    case Opcode::V_CMP_LT_U32:
+        return V_CMP_U32(ConditionOp::LT, false, false, inst);
+    case Opcode::V_CMP_GT_U32:
+        return V_CMP_U32(ConditionOp::GT, false, false, inst);
+    case Opcode::V_CMP_GE_U32:
+        return V_CMP_U32(ConditionOp::GE, false, false, inst);
+    case Opcode::V_CMP_TRU_U32:
+        return V_CMP_U32(ConditionOp::TRU, false, false, inst);
+    case Opcode::V_CMP_NEQ_F32:
+        return V_CMP_F32(ConditionOp::LG, false, inst);
+    case Opcode::V_CMP_F_F32:
+        return V_CMP_F32(ConditionOp::F, false, inst);
+    case Opcode::V_CMP_LT_F32:
+        return V_CMP_F32(ConditionOp::LT, false, inst);
+    case Opcode::V_CMP_EQ_F32:
+        return V_CMP_F32(ConditionOp::EQ, false, inst);
+    case Opcode::V_CMP_LE_F32:
+        return V_CMP_F32(ConditionOp::LE, false, inst);
+    case Opcode::V_CMP_GT_F32:
+        return V_CMP_F32(ConditionOp::GT, false, inst);
+    case Opcode::V_CMP_LG_F32:
+        return V_CMP_F32(ConditionOp::LG, false, inst);
+    case Opcode::V_CMP_GE_F32:
+        return V_CMP_F32(ConditionOp::GE, false, inst);
+    case Opcode::V_CMP_NLE_F32:
+        return V_CMP_F32(ConditionOp::GT, false, inst);
+    case Opcode::V_CMP_NLT_F32:
+        return V_CMP_F32(ConditionOp::GE, false, inst);
+    case Opcode::V_CMP_NGT_F32:
+        return V_CMP_F32(ConditionOp::LE, false, inst);
+    case Opcode::V_CMP_NGE_F32:
+        return V_CMP_F32(ConditionOp::LT, false, inst);
+    case Opcode::V_CMP_U_F32:
+        return V_CMP_F32(ConditionOp::U, false, inst);
+    case Opcode::V_CNDMASK_B32:
+        return V_CNDMASK_B32(inst);
+    case Opcode::V_MAX_I32:
+        return V_MAX_U32(true, inst);
+    case Opcode::V_MAX_U32:
+        return V_MAX_U32(false, inst);
+    case Opcode::V_MIN_I32:
+        return V_MIN_I32(inst);
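+    // The V_CUBE* opcodes below compute the cubemap face id and coordinates; the
+    // IR they emit is later rewritten by PatchCubeCoord in the resource tracking pass.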
+    case Opcode::V_CUBEMA_F32:
+        return V_CUBEMA_F32(inst);
+    case Opcode::V_CUBESC_F32:
+        return V_CUBESC_F32(inst);
+    case Opcode::V_CUBETC_F32:
+        return V_CUBETC_F32(inst);
+    case Opcode::V_CUBEID_F32:
+        return V_CUBEID_F32(inst);
+    case Opcode::V_CVT_U32_F32:
+        return V_CVT_U32_F32(inst);
+    case Opcode::V_CVT_I32_F32:
+        return V_CVT_I32_F32(inst);
+    case Opcode::V_CVT_FLR_I32_F32:
+        return V_CVT_FLR_I32_F32(inst);
+    case Opcode::V_SUBREV_I32:
+        return V_SUBREV_I32(inst);
+    case Opcode::V_MUL_HI_U32:
+        return V_MUL_HI_U32(false, inst);
+    case Opcode::V_MUL_LO_I32:
+        return V_MUL_LO_U32(inst);
+    case Opcode::V_SAD_U32:
+        return V_SAD_U32(inst);
+    case Opcode::V_SUB_I32:
+        return V_SUB_I32(inst);
+    case Opcode::V_MAD_I32_I24:
+        return V_MAD_I32_I24(inst);
+    case Opcode::V_MUL_I32_I24:
+    case Opcode::V_MUL_U32_U24:
+        return V_MUL_I32_I24(inst);
+    case Opcode::V_MAD_U32_U24:
+        return V_MAD_U32_U24(inst);
+    case Opcode::V_BCNT_U32_B32:
+        return V_BCNT_U32_B32(inst);
+    case Opcode::V_MUL_LO_U32:
+        return V_MUL_LO_U32(inst);
+    case Opcode::V_MIN_U32:
+        return V_MIN_U32(inst);
+    case Opcode::V_CMP_NE_U64:
+        return V_CMP_NE_U64(inst);
+    case Opcode::V_READFIRSTLANE_B32:
+        return V_READFIRSTLANE_B32(inst);
+    case Opcode::V_READLANE_B32:
+        return V_READLANE_B32(inst);
+    case Opcode::V_WRITELANE_B32:
+        return V_WRITELANE_B32(inst);
+
+    case Opcode::V_MAD_F32:
+        return V_MAD_F32(inst);
+    case Opcode::V_MAC_F32:
+        return V_MAC_F32(inst);
+    case Opcode::V_MUL_F32:
+        return V_MUL_F32(inst);
+    case Opcode::V_RCP_F32:
+        return V_RCP_F32(inst);
+    case Opcode::V_LDEXP_F32:
+        return V_LDEXP_F32(inst);
+    case Opcode::V_FRACT_F32:
+        return V_FRACT_F32(inst);
+    case Opcode::V_ADD_F32:
+        return V_ADD_F32(inst);
+    case Opcode::V_MED3_F32:
+        return V_MED3_F32(inst);
+    case Opcode::V_FLOOR_F32:
+        return V_FLOOR_F32(inst);
+    case Opcode::V_SUB_F32:
+        return V_SUB_F32(inst);
+    case Opcode::V_FMA_F32:
+    case Opcode::V_MADAK_F32:
+        return V_FMA_F32(inst);
+    case Opcode::V_MAX_F32:
+        return V_MAX_F32(inst);
+    case Opcode::V_RSQ_F32:
+        return V_RSQ_F32(inst);
+    case Opcode::V_SIN_F32:
+        return V_SIN_F32(inst);
+    case Opcode::V_COS_F32:
+        return V_COS_F32(inst);
+    case Opcode::V_LOG_F32:
+        return V_LOG_F32(inst);
+    case Opcode::V_EXP_F32:
+        return V_EXP_F32(inst);
+    case Opcode::V_SQRT_F32:
+        return V_SQRT_F32(inst);
+    case Opcode::V_MIN_F32:
+        return V_MIN_F32(inst, false);
+    case Opcode::V_MIN3_F32:
+        return V_MIN3_F32(inst);
+    case Opcode::V_MIN3_I32:
+        return V_MIN3_I32(inst);
+    case Opcode::V_MIN_LEGACY_F32:
+        return V_MIN_F32(inst, true);
+    case Opcode::V_MADMK_F32:
+        return V_MADMK_F32(inst);
+    case Opcode::V_SUBREV_F32:
+        return V_SUBREV_F32(inst);
+    case Opcode::V_RNDNE_F32:
+        return V_RNDNE_F32(inst);
+    case Opcode::V_MAX3_F32:
+        return V_MAX3_F32(inst);
+    case Opcode::V_MAX3_U32:
+        return V_MAX3_U32(inst);
+    case Opcode::V_TRUNC_F32:
+        return V_TRUNC_F32(inst);
+    case Opcode::V_CEIL_F32:
+        return V_CEIL_F32(inst);
+    case Opcode::V_MUL_LEGACY_F32:
+        return V_MUL_F32(inst);
+    case Opcode::V_MAC_LEGACY_F32:
+        return V_MAC_F32(inst);
+    case Opcode::V_MAD_LEGACY_F32:
+        return V_MAD_F32(inst);
+    case Opcode::V_MAX_LEGACY_F32:
+        return V_MAX_F32(inst, true);
+    case Opcode::V_RSQ_LEGACY_F32:
+    case Opcode::V_RSQ_CLAMP_F32:
+        return V_RSQ_F32(inst);
+    case Opcode::V_RCP_IFLAG_F32:
+        return V_RCP_F32(inst);
+
+    case Opcode::V_CMPX_F_F32:
+        return V_CMP_F32(ConditionOp::F, true, inst);
+    case Opcode::V_CMPX_LT_F32:
+        return V_CMP_F32(ConditionOp::LT, true, inst);
+    case Opcode::V_CMPX_EQ_F32:
+        return V_CMP_F32(ConditionOp::EQ, true, inst);
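+    // The V_CMPX_* variants (set_exec = true) also write the per-lane result to EXEC.
+    // Mapping the N* ("not"-ordered) opcodes onto plain LT/LE/EQ/etc. assumes NaN-free
+    // inputs; with NaNs the unordered forms would differ.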
+    case Opcode::V_CMPX_LE_F32:
+        return V_CMP_F32(ConditionOp::LE, true, inst);
+    case Opcode::V_CMPX_GT_F32:
+        return V_CMP_F32(ConditionOp::GT, true, inst);
+    case Opcode::V_CMPX_LG_F32:
+        return V_CMP_F32(ConditionOp::LG, true, inst);
+    case Opcode::V_CMPX_GE_F32:
+        return V_CMP_F32(ConditionOp::GE, true, inst);
+    case Opcode::V_CMPX_NGE_F32:
+        return V_CMP_F32(ConditionOp::LT, true, inst);
+    case Opcode::V_CMPX_NLG_F32:
+        return V_CMP_F32(ConditionOp::EQ, true, inst);
+    case Opcode::V_CMPX_NGT_F32:
+        return V_CMP_F32(ConditionOp::LE, true, inst);
+    case Opcode::V_CMPX_NLE_F32:
+        return V_CMP_F32(ConditionOp::GT, true, inst);
+    case Opcode::V_CMPX_NEQ_F32:
+        return V_CMP_F32(ConditionOp::LG, true, inst);
+    case Opcode::V_CMPX_NLT_F32:
+        return V_CMP_F32(ConditionOp::GE, true, inst);
+    case Opcode::V_CMPX_TRU_F32:
+        return V_CMP_F32(ConditionOp::TRU, true, inst);
+    case Opcode::V_CMP_CLASS_F32:
+        return V_CMP_CLASS_F32(inst);
+
+    case Opcode::V_CMP_LE_U32:
+        return V_CMP_U32(ConditionOp::LE, false, false, inst);
+    case Opcode::V_CMP_GT_I32:
+        return V_CMP_U32(ConditionOp::GT, true, false, inst);
+    case Opcode::V_CMP_LT_I32:
+        return V_CMP_U32(ConditionOp::LT, true, false, inst);
+    case Opcode::V_CMPX_LT_I32:
+        return V_CMP_U32(ConditionOp::LT, true, true, inst);
+    case Opcode::V_CMPX_F_U32:
+        return V_CMP_U32(ConditionOp::F, false, true, inst);
+    case Opcode::V_CMPX_LT_U32:
+        return V_CMP_U32(ConditionOp::LT, false, true, inst);
+    case Opcode::V_CMPX_EQ_U32:
+        return V_CMP_U32(ConditionOp::EQ, false, true, inst);
+    case Opcode::V_CMPX_LE_U32:
+        return V_CMP_U32(ConditionOp::LE, false, true, inst);
+    case Opcode::V_CMPX_GT_U32:
+        return V_CMP_U32(ConditionOp::GT, false, true, inst);
+    case Opcode::V_CMPX_NE_U32:
+        return V_CMP_U32(ConditionOp::LG, false, true, inst);
+    case Opcode::V_CMPX_GE_U32:
+        return V_CMP_U32(ConditionOp::GE, false, true, inst);
+    case Opcode::V_CMPX_TRU_U32:
+        return V_CMP_U32(ConditionOp::TRU, false, true, inst);
+    case Opcode::V_CMPX_LG_I32:
+        return V_CMP_U32(ConditionOp::LG, true, true, inst);
+
+    case Opcode::V_MBCNT_LO_U32_B32:
+        return V_MBCNT_U32_B32(true, inst);
+    case Opcode::V_MBCNT_HI_U32_B32:
+        return V_MBCNT_U32_B32(false, inst);
+    default:
+        LogMissingOpcode(inst);
+    }
+}
+
 void Translator::V_MOV(const GcnInst& inst) {
     SetDst(inst.dst[0], GetSrc(inst.src[0]));
 }
@@ -32,6 +334,12 @@ void Translator::V_CVT_F32_F16(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.FPConvert(32, ir.BitCast(src0l)));
 }
 
+void Translator::V_CVT_F16_F32(const GcnInst& inst) {
+    const IR::F32 src0 = GetSrc(inst.src[0], true);
+    const IR::F16 src0fp16 = ir.FPConvert(16, src0);
+    SetDst(inst.dst[0], ir.UConvert(32, ir.BitCast(src0fp16)));
+}
+
 void Translator::V_MUL_F32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.FPMul(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true)));
 }
@@ -85,6 +393,12 @@ void Translator::V_LSHLREV_B32(const GcnInst& inst) {
     ir.SetVectorReg(dst_reg, ir.ShiftLeftLogical(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F))));
 }
 
+void Translator::V_LSHL_B32(const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])};
+    const IR::U32 src1{GetSrc(inst.src[1])};
+    SetDst(inst.dst[0], ir.ShiftLeftLogical(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F))));
+}
+
 void Translator::V_ADD_I32(const GcnInst& inst) {
     const IR::U32 src0{GetSrc(inst.src[0])};
     const IR::U32 src1{ir.GetVectorReg(IR::VectorReg(inst.src[1].code))};
@@ -208,6 +522,8 @@ void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) {
            return ir.FPLessThanEqual(src0, src1);
         case ConditionOp::GE:
             return ir.FPGreaterThanEqual(src0, src1);
+        case ConditionOp::U:
+            return ir.LogicalNot(ir.LogicalAnd(ir.FPIsNan(src0), ir.FPIsNan(src1)));
         default:
             UNREACHABLE();
         }
@@ -278,6 +594,13 @@ void Translator::V_MIN3_F32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.FPMin(src0, ir.FPMin(src1, src2)));
 }
 
+void Translator::V_MIN3_I32(const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])};
+    const IR::U32 src1{GetSrc(inst.src[1])};
+    const IR::U32 src2{GetSrc(inst.src[2])};
+    SetDst(inst.dst[0], ir.SMin(src0, ir.SMin(src1, src2)));
+}
+
 void Translator::V_MADMK_F32(const GcnInst& inst) {
     const IR::F32 src0{GetSrc(inst.src[0], true)};
     const IR::F32 src1{GetSrc(inst.src[1], true)};
@@ -320,12 +643,13 @@ void Translator::V_SUBREV_I32(const GcnInst& inst) {
 }
 
 void Translator::V_MAD_U64_U32(const GcnInst& inst) {
-    const auto src0 = GetSrc(inst.src[0]);
+    const auto src0 = GetSrc(inst.src[0]);
     const auto src1 = GetSrc(inst.src[1]);
     const auto src2 = GetSrc64(inst.src[2]);
 
-    const IR::U64 mul_result = ir.UConvert(64, ir.IMul(src0, src1));
+    // const IR::U64 mul_result = ir.UConvert(64, ir.IMul(src0, src1));
+    const IR::U64 mul_result =
+        ir.PackUint2x32(ir.CompositeConstruct(ir.IMul(src0, src1), ir.Imm32(0U)));
     const IR::U64 sum_result = ir.IAdd(mul_result, src2);
 
     SetDst64(inst.dst[0], sum_result);
@@ -463,6 +787,13 @@ void Translator::V_MAX3_F32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2)));
 }
 
+void Translator::V_MAX3_U32(const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])};
+    const IR::U32 src1{GetSrc(inst.src[1])};
+    const IR::U32 src2{GetSrc(inst.src[2])};
+    SetDst(inst.dst[0], ir.UMax(src0, ir.UMax(src1, src2)));
+}
+
 void Translator::V_CVT_I32_F32(const GcnInst& inst) {
     const IR::F32 src0{GetSrc(inst.src[0], true)};
     SetDst(inst.dst[0], ir.ConvertFToS(32, src0));
@@ -561,38 +892,58 @@ void Translator::V_CVT_FLR_I32_F32(const GcnInst& inst) {
 }
 
 void Translator::V_CMP_CLASS_F32(const GcnInst& inst) {
-    constexpr u32 SIGNALING_NAN = 1 << 0;
-    constexpr u32 QUIET_NAN = 1 << 1;
-    constexpr u32 NEGATIVE_INFINITY = 1 << 2;
-    constexpr u32 NEGATIVE_NORMAL = 1 << 3;
-    constexpr u32 NEGATIVE_DENORM = 1 << 4;
-    constexpr u32 NEGATIVE_ZERO = 1 << 5;
-    constexpr u32 POSITIVE_ZERO = 1 << 6;
-    constexpr u32 POSITIVE_DENORM = 1 << 7;
-    constexpr u32 POSITIVE_NORMAL = 1 << 8;
-    constexpr u32 POSITIVE_INFINITY = 1 << 9;
-
     const IR::F32F64 src0{GetSrc(inst.src[0])};
     const IR::U32 src1{GetSrc(inst.src[1])};
+    IR::U1 value;
     if (src1.IsImmediate()) {
-        const u32 class_mask = src1.U32();
-        IR::U1 value;
-        if ((class_mask & (SIGNALING_NAN | QUIET_NAN)) == (SIGNALING_NAN | QUIET_NAN)) {
+        const auto class_mask = static_cast<IR::FloatClassFunc>(src1.U32());
+        if ((class_mask & IR::FloatClassFunc::NaN) == IR::FloatClassFunc::NaN) {
             value = ir.FPIsNan(src0);
-        } else if ((class_mask & (POSITIVE_INFINITY | NEGATIVE_INFINITY)) ==
-                   (POSITIVE_INFINITY | NEGATIVE_INFINITY)) {
+        } else if ((class_mask & IR::FloatClassFunc::Infinity) == IR::FloatClassFunc::Infinity) {
             value = ir.FPIsInf(src0);
         } else {
             UNREACHABLE();
         }
-        if (inst.dst[1].field == OperandField::VccLo) {
-            return ir.SetVcc(value);
-        } else {
-            UNREACHABLE();
-        }
     } else {
+        // We don't know the type yet, delay its resolution.
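+        // Constant propagation folds FPCmpClass32 into FPIsNan32/FPIsInf32 once the
+        // class mask becomes a known immediate (see FoldCmpClass further below).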
+        value = ir.FPCmpClass32(src0, src1);
+    }
+
+    switch (inst.dst[1].field) {
+    case OperandField::VccLo:
+        return ir.SetVcc(value);
+    default:
         UNREACHABLE();
     }
 }
 
+void Translator::V_FFBL_B32(const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])};
+    SetDst(inst.dst[0], ir.FindILsb(src0));
+}
+
+void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])};
+    const IR::U32 src1{GetSrc(inst.src[1])};
+    const IR::U32 lane_id = ir.LaneId();
+
+    const auto [warp_half, mask_shift] = [&]() -> std::pair<IR::U32, IR::U32> {
+        if (profile.subgroup_size == 32) {
+            const IR::U32 warp_half = ir.BitwiseAnd(ir.WarpId(), ir.Imm32(1));
+            return std::make_pair(warp_half, lane_id);
+        }
+        const IR::U32 warp_half = ir.ShiftRightLogical(lane_id, ir.Imm32(5));
+        const IR::U32 mask_shift = ir.BitwiseAnd(lane_id, ir.Imm32(0x1F));
+        return std::make_pair(warp_half, mask_shift);
+    }();
+
+    const IR::U32 thread_mask = ir.ISub(ir.ShiftLeftLogical(ir.Imm32(1), mask_shift), ir.Imm32(1));
+    const IR::U1 is_odd_warp = ir.INotEqual(warp_half, ir.Imm32(0));
+    const IR::U32 mask = IR::U32{ir.Select(is_odd_warp, is_low ? ir.Imm32(~0U) : thread_mask,
+                                           is_low ? thread_mask : ir.Imm32(0))};
+    const IR::U32 masked_value = ir.BitwiseAnd(src0, mask);
+    const IR::U32 result = ir.IAdd(src1, ir.BitCount(masked_value));
+    SetDst(inst.dst[0], result);
+}
+
 } // namespace Shader::Gcn
diff --git a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp
index 55a2d624..4ff846cf 100644
--- a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp
@@ -12,4 +12,24 @@ void Translator::V_INTERP_P2_F32(const GcnInst& inst) {
     ir.SetVectorReg(dst_reg, ir.GetAttribute(attrib, inst.control.vintrp.chan));
 }
 
+void Translator::V_INTERP_MOV_F32(const GcnInst& inst) {
+    const IR::VectorReg dst_reg{inst.dst[0].code};
+    auto& attr = info.ps_inputs.at(inst.control.vintrp.attr);
+    const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index};
+    ir.SetVectorReg(dst_reg, ir.GetAttribute(attrib, inst.control.vintrp.chan));
+}
+
+void Translator::EmitVectorInterpolation(const GcnInst& inst) {
+    switch (inst.opcode) {
+    case Opcode::V_INTERP_P1_F32:
+        return;
+    case Opcode::V_INTERP_P2_F32:
+        return V_INTERP_P2_F32(inst);
+    case Opcode::V_INTERP_MOV_F32:
+        return V_INTERP_MOV_F32(inst);
+    default:
+        LogMissingOpcode(inst);
+    }
+}
+
 } // namespace Shader::Gcn
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index f4383c61..c667968a 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -5,9 +5,96 @@
 
 namespace Shader::Gcn {
 
+void Translator::EmitVectorMemory(const GcnInst& inst) {
+    switch (inst.opcode) {
+    case Opcode::IMAGE_SAMPLE_LZ_O:
+    case Opcode::IMAGE_SAMPLE_O:
+    case Opcode::IMAGE_SAMPLE_C:
+    case Opcode::IMAGE_SAMPLE_C_LZ:
+    case Opcode::IMAGE_SAMPLE_LZ:
+    case Opcode::IMAGE_SAMPLE:
+    case Opcode::IMAGE_SAMPLE_L:
+    case Opcode::IMAGE_SAMPLE_C_O:
+    case Opcode::IMAGE_SAMPLE_B:
+    case Opcode::IMAGE_SAMPLE_C_LZ_O:
+        return IMAGE_SAMPLE(inst);
+    case Opcode::IMAGE_GATHER4_C:
+    case Opcode::IMAGE_GATHER4_LZ:
+    case Opcode::IMAGE_GATHER4_LZ_O:
+        return IMAGE_GATHER(inst);
+    case Opcode::IMAGE_ATOMIC_ADD:
+        return IMAGE_ATOMIC(AtomicOp::Add, inst);
+    case Opcode::IMAGE_ATOMIC_AND:
+        return IMAGE_ATOMIC(AtomicOp::And, inst);
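+    // The remaining IMAGE_ATOMIC_* opcodes map one-to-one onto AtomicOp values;
+    // they all funnel into the shared IMAGE_ATOMIC helper.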
+    case Opcode::IMAGE_ATOMIC_OR:
+        return IMAGE_ATOMIC(AtomicOp::Or, inst);
+    case Opcode::IMAGE_ATOMIC_XOR:
+        return IMAGE_ATOMIC(AtomicOp::Xor, inst);
+    case Opcode::IMAGE_ATOMIC_UMAX:
+        return IMAGE_ATOMIC(AtomicOp::Umax, inst);
+    case Opcode::IMAGE_ATOMIC_SMAX:
+        return IMAGE_ATOMIC(AtomicOp::Smax, inst);
+    case Opcode::IMAGE_ATOMIC_UMIN:
+        return IMAGE_ATOMIC(AtomicOp::Umin, inst);
+    case Opcode::IMAGE_ATOMIC_SMIN:
+        return IMAGE_ATOMIC(AtomicOp::Smin, inst);
+    case Opcode::IMAGE_ATOMIC_INC:
+        return IMAGE_ATOMIC(AtomicOp::Inc, inst);
+    case Opcode::IMAGE_ATOMIC_DEC:
+        return IMAGE_ATOMIC(AtomicOp::Dec, inst);
+    case Opcode::IMAGE_GET_LOD:
+        return IMAGE_GET_LOD(inst);
+    case Opcode::IMAGE_STORE:
+        return IMAGE_STORE(inst);
+    case Opcode::IMAGE_LOAD_MIP:
+        return IMAGE_LOAD(true, inst);
+    case Opcode::IMAGE_LOAD:
+        return IMAGE_LOAD(false, inst);
+    case Opcode::IMAGE_GET_RESINFO:
+        return IMAGE_GET_RESINFO(inst);
+
+    case Opcode::TBUFFER_LOAD_FORMAT_X:
+        return BUFFER_LOAD_FORMAT(1, true, true, inst);
+    case Opcode::TBUFFER_LOAD_FORMAT_XY:
+        return BUFFER_LOAD_FORMAT(2, true, true, inst);
+    case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
+        return BUFFER_LOAD_FORMAT(3, true, true, inst);
+    case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
+        return BUFFER_LOAD_FORMAT(4, true, true, inst);
+    case Opcode::BUFFER_LOAD_FORMAT_X:
+        return BUFFER_LOAD_FORMAT(1, false, true, inst);
+    case Opcode::BUFFER_LOAD_FORMAT_XY:
+        return BUFFER_LOAD_FORMAT(2, false, true, inst);
+    case Opcode::BUFFER_LOAD_FORMAT_XYZ:
+        return BUFFER_LOAD_FORMAT(3, false, true, inst);
+    case Opcode::BUFFER_LOAD_FORMAT_XYZW:
+        return BUFFER_LOAD_FORMAT(4, false, true, inst);
+    case Opcode::BUFFER_LOAD_DWORD:
+        return BUFFER_LOAD_FORMAT(1, false, false, inst);
+    case Opcode::BUFFER_LOAD_DWORDX2:
+        return BUFFER_LOAD_FORMAT(2, false, false, inst);
+    case Opcode::BUFFER_LOAD_DWORDX3:
+        return BUFFER_LOAD_FORMAT(3, false, false, inst);
+    case Opcode::BUFFER_LOAD_DWORDX4:
+        return BUFFER_LOAD_FORMAT(4, false, false, inst);
+    case Opcode::BUFFER_STORE_FORMAT_X:
+    case Opcode::BUFFER_STORE_DWORD:
+        return BUFFER_STORE_FORMAT(1, false, inst);
+    case Opcode::BUFFER_STORE_DWORDX2:
+        return BUFFER_STORE_FORMAT(2, false, inst);
+    case Opcode::BUFFER_STORE_DWORDX3:
+        return BUFFER_STORE_FORMAT(3, false, inst);
+    case Opcode::BUFFER_STORE_FORMAT_XYZW:
+    case Opcode::BUFFER_STORE_DWORDX4:
+        return BUFFER_STORE_FORMAT(4, false, inst);
+    default:
+        LogMissingOpcode(inst);
+    }
+}
+
 void Translator::IMAGE_GET_RESINFO(const GcnInst& inst) {
     IR::VectorReg dst_reg{inst.dst[0].code};
-    const IR::ScalarReg tsharp_reg{inst.src[2].code};
+    const IR::ScalarReg tsharp_reg{inst.src[2].code * 4};
     const auto flags = ImageResFlags(inst.control.mimg.dmask);
     const bool has_mips = flags.test(ImageResComponent::MipCount);
     const IR::U32 lod = ir.GetVectorReg(IR::VectorReg(inst.src[0].code));
@@ -157,7 +244,7 @@ void Translator::IMAGE_GATHER(const GcnInst& inst) {
     info.has_bias.Assign(flags.test(MimgModifier::LodBias));
     info.has_lod_clamp.Assign(flags.test(MimgModifier::LodClamp));
     info.force_level0.Assign(flags.test(MimgModifier::Level0));
-    info.explicit_lod.Assign(explicit_lod);
+    // info.explicit_lod.Assign(explicit_lod);
     info.gather_comp.Assign(std::bit_width(mimg.dmask) - 1);
 
     // Issue IR instruction, leaving unknown fields blank to patch later.
diff --git a/src/shader_recompiler/ir/breadth_first_search.h b/src/shader_recompiler/ir/breadth_first_search.h
index 21a34a90..0156303f 100644
--- a/src/shader_recompiler/ir/breadth_first_search.h
+++ b/src/shader_recompiler/ir/breadth_first_search.h
@@ -12,16 +12,16 @@ namespace Shader::IR {
 
 template <typename Pred>
-auto BreadthFirstSearch(const Value& value, Pred&& pred)
-    -> std::invoke_result_t<Pred, const Inst*> {
-    if (value.IsImmediate()) {
-        // Nothing to do with immediates
-        return std::nullopt;
+auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t<Pred, const Inst*> {
+    // Most often, the starting instruction is already the one we want.
+    if (const std::optional result = pred(inst)) {
+        return result;
     }
+
+    // Breadth-first search visiting the rightmost arguments first
     boost::container::small_vector<const Inst*, 2> visited;
     std::queue<const Inst*> queue;
-    queue.push(value.InstRecursive());
+    queue.push(inst);
 
     while (!queue.empty()) {
         // Pop one instruction from the queue
@@ -49,4 +49,14 @@ auto BreadthFirstSearch(const Value& value, Pred&& pred)
     return std::nullopt;
 }
 
+template <typename Pred>
+auto BreadthFirstSearch(const Value& value, Pred&& pred)
+    -> std::invoke_result_t<Pred, const Inst*> {
+    if (value.IsImmediate()) {
+        // Nothing to do with immediates
+        return std::nullopt;
+    }
+    return BreadthFirstSearch(value.InstRecursive(), pred);
+}
+
 } // namespace Shader::IR
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 8b605df8..03404aca 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -278,7 +278,7 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
     case 32:
         return Inst(Opcode::LoadSharedU32, offset);
     case 64:
-        return Inst(Opcode::LoadSharedU64, offset);
+        return Inst(Opcode::LoadSharedU64, offset);
     case 128:
         return Inst(Opcode::LoadSharedU128, offset);
     default:
@@ -373,6 +373,10 @@ U32 IREmitter::LaneId() {
     return Inst(Opcode::LaneId);
 }
 
+U32 IREmitter::WarpId() {
+    return Inst(Opcode::WarpId);
+}
+
 U32 IREmitter::QuadShuffle(const U32& value, const U32& index) {
     return Inst(Opcode::QuadShuffle, value, index);
 }
@@ -876,6 +880,10 @@ U1 IREmitter::FPIsInf(const F32F64& value) {
     }
 }
 
+U1 IREmitter::FPCmpClass32(const F32& value, const U32& op) {
+    return Inst(Opcode::FPCmpClass32, value, op);
+}
+
 U1 IREmitter::FPOrdered(const F32F64& lhs, const F32F64& rhs) {
     if (lhs.Type() != rhs.Type()) {
         UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type());
@@ -1088,6 +1096,10 @@ U32 IREmitter::FindUMsb(const U32& value) {
     return Inst(Opcode::FindUMsb32, value);
 }
 
+U32 IREmitter::FindILsb(const U32& value) {
+    return Inst(Opcode::FindILsb32, value);
+}
+
 U32 IREmitter::SMin(const U32& a, const U32& b) {
     return Inst(Opcode::SMin32, a, b);
 }
@@ -1274,6 +1286,11 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) {
         default:
             break;
         }
+    case 32:
+        switch (value.Type()) {
+        case Type::U16:
+            return Inst(Opcode::ConvertU32U16, value);
+        }
     default:
         break;
     }
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index 7ee4e824..a65e4613 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -95,6 +95,7 @@ public:
                                 BufferInstInfo info);
 
     [[nodiscard]] U32 LaneId();
+    [[nodiscard]] U32 WarpId();
     [[nodiscard]] U32 QuadShuffle(const U32& value, const U32& index);
 
     [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2);
@@ -150,6 +151,7 @@ public:
     [[nodiscard]] U1 FPGreaterThan(const F32F64& lhs, const F32F64& rhs, bool ordered = true);
     [[nodiscard]] U1 FPIsNan(const F32F64& value);
     [[nodiscard]] U1 FPIsInf(const F32F64& value);
+    [[nodiscard]] U1 FPCmpClass32(const F32& value, const U32& op);
     [[nodiscard]] U1 FPOrdered(const F32F64& lhs, const F32F64& rhs);
     [[nodiscard]] U1 FPUnordered(const F32F64& lhs, const F32F64& rhs);
     [[nodiscard]] F32F64 FPMax(const F32F64& lhs, const F32F64& rhs, bool is_legacy = false);
@@ -179,6 +181,7 @@ public:
 
     [[nodiscard]] U32 FindSMsb(const U32& value);
     [[nodiscard]] U32 FindUMsb(const U32& value);
+    [[nodiscard]] U32 FindILsb(const U32& value);
     [[nodiscard]] U32 SMin(const U32& a, const U32& b);
     [[nodiscard]] U32 UMin(const U32& a, const U32& b);
     [[nodiscard]] U32 IMin(const U32& a, const U32& b, bool is_signed);
diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc
index 628b8d4f..aa2fd3f8 100644
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@@ -219,6 +219,7 @@
 OPCODE(FPIsNan32,            U1,       F32,                )
 OPCODE(FPIsNan64,            U1,       F64,                )
 OPCODE(FPIsInf32,            U1,       F32,                )
 OPCODE(FPIsInf64,            U1,       F64,                )
+OPCODE(FPCmpClass32,         U1,       F32,      U32       )
 
 // Integer operations
 OPCODE(IAdd32,               U32,      U32,      U32,      )
@@ -254,6 +255,7 @@
 OPCODE(BitwiseNot32,         U32,      U32,                )
 OPCODE(FindSMsb32,           U32,      U32,                )
 OPCODE(FindUMsb32,           U32,      U32,                )
+OPCODE(FindILsb32,           U32,      U32,                )
 OPCODE(SMin32,               U32,      U32,      U32,      )
 OPCODE(UMin32,               U32,      U32,      U32,      )
 OPCODE(SMax32,               U32,      U32,      U32,      )
@@ -293,6 +295,7 @@
 OPCODE(ConvertF64S32,        F64,      U32,                )
 OPCODE(ConvertF64U32,        F64,      U32,                )
 OPCODE(ConvertF32U16,        F32,      U16,                )
 OPCODE(ConvertU16U32,        U16,      U32,                )
+OPCODE(ConvertU32U16,        U32,      U16,                )
 
 // Image operations
 OPCODE(ImageSampleImplicitLod, F32x4,  Opaque,   Opaque,   Opaque,   Opaque,   )
@@ -323,4 +326,5 @@
 
 // Warp operations
 OPCODE(LaneId,               U32,                          )
+OPCODE(WarpId,               U32,                          )
 OPCODE(QuadShuffle,          U32,      U32,      U32       )
diff --git a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
index 13c0246e..94218b32 100644
--- a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
+++ b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
@@ -238,6 +238,18 @@ void FoldBooleanConvert(IR::Inst& inst) {
     }
 }
 
+void FoldCmpClass(IR::Inst& inst) {
+    ASSERT_MSG(inst.Arg(1).IsImmediate(), "Unable to resolve compare operation");
+    const auto class_mask = static_cast<IR::FloatClassFunc>(inst.Arg(1).U32());
+    if ((class_mask & IR::FloatClassFunc::NaN) == IR::FloatClassFunc::NaN) {
+        inst.ReplaceOpcode(IR::Opcode::FPIsNan32);
+    } else if ((class_mask & IR::FloatClassFunc::Infinity) == IR::FloatClassFunc::Infinity) {
+        inst.ReplaceOpcode(IR::Opcode::FPIsInf32);
+    } else {
+        UNREACHABLE();
+    }
+}
+
 void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
     switch (inst.GetOpcode()) {
     case IR::Opcode::IAdd32:
@@ -251,6 +263,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
     case IR::Opcode::IMul32:
         FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; });
         return;
+    case IR::Opcode::FPCmpClass32:
+        FoldCmpClass(inst);
+        return;
     case IR::Opcode::ShiftRightArithmetic32:
         FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return static_cast<u32>(a >> b); });
         return;
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index 6526ece6..eaca8ce8 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -2,7 +2,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <algorithm>
-#include <deque>
 #include <boost/container/small_vector.hpp>
 #include "shader_recompiler/ir/basic_block.h"
 #include "shader_recompiler/ir/breadth_first_search.h"
@@ -273,9 +272,18 @@ std::pair<const IR::Inst*, bool> TryDisableAnisoLod0(const IR::Inst* inst) {
 }
 
 SharpLocation TrackSharp(const IR::Inst* inst) {
-    while (inst->GetOpcode() == IR::Opcode::Phi) {
-        inst = inst->Arg(0).InstRecursive();
-    }
+    // Search until we find a potential sharp source.
+    const auto pred0 = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
+        if (inst->GetOpcode() == IR::Opcode::GetUserData ||
+            inst->GetOpcode() == IR::Opcode::ReadConst) {
+            return inst;
+        }
+        return std::nullopt;
+    };
+    const auto result = IR::BreadthFirstSearch(inst, pred0);
+    ASSERT_MSG(result, "Unable to track sharp source");
+    inst = result.value();
+
     // If it's from user data, not much else to do.
     if (inst->GetOpcode() == IR::Opcode::GetUserData) {
         return SharpLocation{
             .sgpr_base = u32(IR::ScalarReg::Max),
@@ -289,14 +297,14 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
     const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive();
 
     // Retrieve SGPR pair that holds sbase
-    const auto pred = [](const IR::Inst* inst) -> std::optional<IR::ScalarReg> {
+    const auto pred1 = [](const IR::Inst* inst) -> std::optional<IR::ScalarReg> {
         if (inst->GetOpcode() == IR::Opcode::GetUserData) {
             return inst->Arg(0).ScalarReg();
         }
         return std::nullopt;
     };
-    const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred);
-    const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred);
+    const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred1);
+    const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred1);
     ASSERT_MSG(base0 && base1, "Nested resource loads not supported");
 
     // Return retrieved location.
@@ -456,36 +464,26 @@ IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& t) {
 }
 
 void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
-    std::deque<IR::Inst*> insts{&inst};
-    const auto& pred = [](auto opcode) -> bool {
-        return (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
-                opcode == IR::Opcode::ReadConst ||               // IMAGE_LOAD (image only)
-                opcode == IR::Opcode::GetUserData);
+    const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
+        const auto opcode = inst->GetOpcode();
+        if (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
+            opcode == IR::Opcode::ReadConst ||               // IMAGE_LOAD (image only)
+            opcode == IR::Opcode::GetUserData) {
+            return inst;
+        }
+        return std::nullopt;
     };
-
-    IR::Inst* producer{};
-    while (!insts.empty() && (producer = insts.front(), !pred(producer->GetOpcode()))) {
-        for (auto arg_idx = 0u; arg_idx < producer->NumArgs(); ++arg_idx) {
-            const auto arg = producer->Arg(arg_idx);
-            if (arg.TryInstRecursive()) {
-                insts.push_back(arg.InstRecursive());
-            }
-        }
-        insts.pop_front();
-    }
-    ASSERT(pred(producer->GetOpcode()));
-    auto [tsharp_handle, ssharp_handle] = [&] -> std::pair<IR::Inst*, IR::Inst*> {
-        if (producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2) {
-            return std::make_pair(producer->Arg(0).InstRecursive(),
-                                  producer->Arg(1).InstRecursive());
-        }
-        return std::make_pair(producer, nullptr);
-    }();
+    const auto result = IR::BreadthFirstSearch(&inst, pred);
+    ASSERT_MSG(result, "Unable to find image sharp source");
+    const IR::Inst* producer = result.value();
+    const bool has_sampler = producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2;
+    const auto tsharp_handle = has_sampler ? producer->Arg(0).InstRecursive() : producer;
 
     // Read image sharp.
     const auto tsharp = TrackSharp(tsharp_handle);
     const auto image = info.ReadUd<AmdGpu::Image>(tsharp.sgpr_base, tsharp.dword_offset);
     const auto inst_info = inst.Flags<IR::TextureInstInfo>();
+    ASSERT(image.GetType() != AmdGpu::ImageType::Invalid);
     u32 image_binding = descriptors.Add(ImageResource{
         .sgpr_base = tsharp.sgpr_base,
         .dword_offset = tsharp.dword_offset,
@@ -496,17 +494,32 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
     });
 
     // Read sampler sharp. This doesn't exist for IMAGE_LOAD/IMAGE_STORE instructions
-    if (ssharp_handle) {
+    const u32 sampler_binding = [&] {
+        if (!has_sampler) {
+            return 0U;
+        }
+        const IR::Value& handle = producer->Arg(1);
+        // Inline sampler resource.
+        if (handle.IsImmediate()) {
+            LOG_WARNING(Render_Vulkan, "Inline sampler detected");
+            return descriptors.Add(SamplerResource{
+                .sgpr_base = std::numeric_limits<u32>::max(),
+                .dword_offset = 0,
+                .inline_sampler = AmdGpu::Sampler{.raw0 = handle.U32()},
+            });
+        }
+        // Normal sampler resource.
+        const auto ssharp_handle = handle.InstRecursive();
         const auto& [ssharp_ud, disable_aniso] = TryDisableAnisoLod0(ssharp_handle);
         const auto ssharp = TrackSharp(ssharp_ud);
-        const u32 sampler_binding = descriptors.Add(SamplerResource{
+        return descriptors.Add(SamplerResource{
             .sgpr_base = ssharp.sgpr_base,
             .dword_offset = ssharp.dword_offset,
             .associated_image = image_binding,
             .disable_aniso = disable_aniso,
         });
-        image_binding |= (sampler_binding << 16);
-    }
+    }();
+    image_binding |= (sampler_binding << 16);
 
     // Patch image handle
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
@@ -607,7 +620,7 @@ void ResourceTrackingPass(IR::Program& program) {
     // Iterate resource instructions and patch them after finding the sharp.
     auto& info = program.info;
     Descriptors descriptors{info.buffers, info.images, info.samplers};
-    for (IR::Block* const block : program.post_order_blocks) {
+    for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (IsBufferInstruction(inst)) {
                 PatchBufferInstruction(*block, inst, info, descriptors);
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index b51ce94e..7100b384 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -20,11 +20,19 @@ void Visit(Info& info, IR::Inst& inst) {
     case IR::Opcode::LoadSharedU8:
     case IR::Opcode::WriteSharedU8:
         info.uses_shared_u8 = true;
+        info.uses_shared = true;
         break;
     case IR::Opcode::LoadSharedS16:
     case IR::Opcode::LoadSharedU16:
     case IR::Opcode::WriteSharedU16:
         info.uses_shared_u16 = true;
+        info.uses_shared = true;
+        break;
+    case IR::Opcode::LoadSharedU32:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::WriteSharedU64:
+        info.uses_shared = true;
         break;
     case IR::Opcode::ConvertF32F16:
     case IR::Opcode::BitCastF16U16:
diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h
index d9e9b030..e3d04260 100644
--- a/src/shader_recompiler/ir/reg.h
+++ b/src/shader_recompiler/ir/reg.h
@@ -5,6 +5,7 @@
 
 #include "common/assert.h"
 #include "common/bit_field.h"
+#include "common/enum.h"
 #include "common/types.h"
 #include "video_core/amdgpu/pixel_format.h"
 
@@ -24,6 +25,23 @@ enum class FpDenormMode : u32 {
     InOutAllow = 3,
 };
 
+enum class FloatClassFunc : u32 {
+    SignalingNan = 1 << 0,
+    QuietNan = 1 << 1,
+    NegativeInfinity = 1 << 2,
+    NegativeNormal = 1 << 3,
+    NegativeDenorm = 1 << 4,
+    NegativeZero = 1 << 5,
+    PositiveZero = 1 << 6,
+    PositiveDenorm = 1 << 7,
+    PositiveNormal = 1 << 8,
+    PositiveInfinity = 1 << 9,
+
+    NaN = SignalingNan | QuietNan,
+    Infinity = PositiveInfinity | NegativeInfinity,
+};
+DECLARE_ENUM_FLAG_OPERATORS(FloatClassFunc)
+
 union Mode {
     BitField<0, 4, FpRoundMode> fp_round;
     BitField<4, 2, FpDenormMode> fp_denorm_single;
diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h
index 54b34730..badd5455 100644
--- a/src/shader_recompiler/profile.h
+++ b/src/shader_recompiler/profile.h
@@ -9,6 +9,7 @@ namespace Shader {
 
 struct Profile {
     u32 supported_spirv{0x00010000};
+    u32 subgroup_size{};
     bool unified_descriptor_binding{};
     bool support_descriptor_aliasing{};
     bool support_int8{};
diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index f2834abf..d747c016 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -28,7 +28,8 @@ IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) {
 }
 
 IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
-                             std::span<const u32> token, const Info&& info) {
+                             std::span<const u32> token, const Info&& info,
+                             const Profile& profile) {
     // Ensure first instruction is expected.
     constexpr u32 token_mov_vcchi = 0xBEEB03FF;
     ASSERT_MSG(token[0] == token_mov_vcchi, "First instruction is not s_mov_b32 vcc_hi, #imm");
@@ -49,7 +50,7 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
diff --git a/src/shader_recompiler/recompiler.h b/src/shader_recompiler/recompiler.h
 IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
-                             std::span<const u32> code, const Info&& info);
+                             std::span<const u32> code, const Info&& info,
+                             const Profile& profile);
 
 } // namespace Shader
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 8824e344..277c38b7 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -97,8 +97,11 @@ using ImageResourceList = boost::container::static_vector;
 
 struct SamplerResource {
     u32 sgpr_base;
     u32 dword_offset;
+    AmdGpu::Sampler inline_sampler{};
     u32 associated_image : 4;
     u32 disable_aniso : 1;
+
+    constexpr AmdGpu::Sampler GetSsharp(const Info& info) const noexcept;
 };
 using SamplerResourceList = boost::container::static_vector;
 
@@ -175,6 +178,7 @@ struct Info {
     bool has_image_gather{};
     bool has_image_query{};
     bool uses_group_quad{};
+    bool uses_shared{};
     bool uses_shared_u8{};
     bool uses_shared_u16{};
     bool uses_fp16{};
@@ -196,6 +200,10 @@
 constexpr AmdGpu::Buffer BufferResource::GetVsharp(const Info& info) const noexcept {
     return inline_cbuf ? inline_cbuf : info.ReadUd<AmdGpu::Buffer>(sgpr_base, dword_offset);
 }
 
+constexpr AmdGpu::Sampler SamplerResource::GetSsharp(const Info& info) const noexcept {
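+    // Prefer the captured inline sampler; an all-zero Sampler (see operator bool in
+    // resource.h below) means none was recorded, so the ssharp is read from user data.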
+    return inline_sampler ? inline_sampler : info.ReadUd<AmdGpu::Sampler>(sgpr_base, dword_offset);
+}
+
 } // namespace Shader
 
 template <>
diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index df7eec82..af1963ee 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -403,9 +403,11 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb) {
                 rasterizer->WaitVoLabel([&] { return wait_reg_mem->Test(); });
             }
             while (!wait_reg_mem->Test()) {
+                mapped_queues[GfxQueueId].cs_state = regs.cs_program;
                 TracyFiberLeave;
                 co_yield {};
                 TracyFiberEnter(dcb_task_name);
+                regs.cs_program = mapped_queues[GfxQueueId].cs_state;
             }
             break;
         }
@@ -506,9 +508,11 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
             const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
             ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
             while (!wait_reg_mem->Test()) {
+                mapped_queues[vqid].cs_state = regs.cs_program;
                 TracyFiberLeave;
                 co_yield {};
                 TracyFiberEnter(acb_task_name);
+                regs.cs_program = mapped_queues[vqid].cs_state;
             }
             break;
         }
@@ -529,7 +533,6 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
 }
 
 void Liverpool::SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
-    static constexpr u32 GfxQueueId = 0u;
     auto& queue = mapped_queues[GfxQueueId];
 
     auto task = ProcessGraphics(dcb, ccb);
diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h
index db2ee91c..b0285809 100644
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -36,6 +36,7 @@ namespace AmdGpu {
     [[maybe_unused]] std::array CONCAT2(pad, __LINE__)
 
 struct Liverpool {
+    static constexpr u32 GfxQueueId = 0u;
     static constexpr u32 NumGfxRings = 1u;     // actually 2, but HP is reserved by system software
     static constexpr u32 NumComputePipes = 7u; // actually 8, but #7 is reserved by system software
     static constexpr u32 NumQueuesPerPipe = 8u;
@@ -1061,6 +1062,7 @@ private:
     struct GpuQueue {
         std::mutex m_access{};
         std::queue<Task::Handle> submits{};
+        ComputeProgram cs_state{};
     };
     std::array<GpuQueue, NumTotalQueues> mapped_queues{};
diff --git a/src/video_core/amdgpu/pixel_format.cpp b/src/video_core/amdgpu/pixel_format.cpp
index 6618e72a..6744891a 100644
--- a/src/video_core/amdgpu/pixel_format.cpp
+++ b/src/video_core/amdgpu/pixel_format.cpp
@@ -7,6 +7,77 @@
 
 namespace AmdGpu {
 
+std::string_view NameOf(DataFormat fmt) {
+    switch (fmt) {
+    case DataFormat::FormatInvalid:
+        return "FormatInvalid";
+    case DataFormat::Format8:
+        return "Format8";
+    case DataFormat::Format16:
+        return "Format16";
+    case DataFormat::Format8_8:
+        return "Format8_8";
+    case DataFormat::Format32:
+        return "Format32";
+    case DataFormat::Format16_16:
+        return "Format16_16";
+    case DataFormat::Format10_11_11:
+        return "Format10_11_11";
+    case DataFormat::Format11_11_10:
+        return "Format11_11_10";
+    case DataFormat::Format10_10_10_2:
+        return "Format10_10_10_2";
+    case DataFormat::Format2_10_10_10:
+        return "Format2_10_10_10";
+    case DataFormat::Format8_8_8_8:
+        return "Format8_8_8_8";
+    case DataFormat::Format32_32:
+        return "Format32_32";
+    case DataFormat::Format16_16_16_16:
+        return "Format16_16_16_16";
+    case DataFormat::Format32_32_32:
+        return "Format32_32_32";
+    case DataFormat::Format32_32_32_32:
+        return "Format32_32_32_32";
+    case DataFormat::Format5_6_5:
+        return "Format5_6_5";
+    case DataFormat::Format1_5_5_5:
+        return "Format1_5_5_5";
+    case DataFormat::Format5_5_5_1:
+        return "Format5_5_5_1";
+    case DataFormat::Format4_4_4_4:
+        return "Format4_4_4_4";
+    case DataFormat::Format8_24:
+        return "Format8_24";
+    case DataFormat::Format24_8:
+        return "Format24_8";
+    case DataFormat::FormatX24_8_32:
+        return "FormatX24_8_32";
+    case DataFormat::FormatGB_GR:
+        return "FormatGB_GR";
+    case DataFormat::FormatBG_RG:
+        return "FormatBG_RG";
+    case DataFormat::Format5_9_9_9:
+        return "Format5_9_9_9";
+    case DataFormat::FormatBc1:
+        return "FormatBc1";
+    case DataFormat::FormatBc2:
+        return "FormatBc2";
+    case DataFormat::FormatBc3:
+        return "FormatBc3";
+    case DataFormat::FormatBc4:
+        return "FormatBc4";
+    case DataFormat::FormatBc5:
+        return "FormatBc5";
+    case DataFormat::FormatBc6:
+        return "FormatBc6";
+    case DataFormat::FormatBc7:
+        return "FormatBc7";
+    default:
+        UNREACHABLE();
+    }
+}
+
 std::string_view NameOf(NumberFormat fmt) {
     switch (fmt) {
     case NumberFormat::Unorm:
diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h
index 2a38c5a0..1004ed7d 100644
--- a/src/video_core/amdgpu/pixel_format.h
+++ b/src/video_core/amdgpu/pixel_format.h
@@ -61,6 +61,7 @@ enum class NumberFormat : u32 {
     Ubscaled = 13,
 };
 
+[[nodiscard]] std::string_view NameOf(DataFormat fmt);
 [[nodiscard]] std::string_view NameOf(NumberFormat fmt);
 
 int NumComponents(DataFormat format);
@@ -70,6 +71,16 @@ s32 ComponentOffset(DataFormat format, u32 comp);
 
 } // namespace AmdGpu
 
+template <>
+struct fmt::formatter<AmdGpu::DataFormat> {
+    constexpr auto parse(format_parse_context& ctx) {
+        return ctx.begin();
+    }
+    auto format(AmdGpu::DataFormat fmt, format_context& ctx) const {
+        return fmt::format_to(ctx.out(), "{}", AmdGpu::NameOf(fmt));
+    }
+};
+
 template <>
 struct fmt::formatter<AmdGpu::NumberFormat> {
     constexpr auto parse(format_parse_context& ctx) {
diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h
index 6ab3306b..01271792 100644
--- a/src/video_core/amdgpu/resource.h
+++ b/src/video_core/amdgpu/resource.h
@@ -75,7 +75,7 @@ struct Buffer {
 static_assert(sizeof(Buffer) == 16); // 128bits
 
 enum class ImageType : u64 {
-    Buffer = 0,
+    Invalid = 0,
     Color1D = 8,
     Color2D = 9,
     Color3D = 10,
@@ -88,8 +88,8 @@ enum class ImageType : u64 {
 constexpr std::string_view NameOf(ImageType type) {
     switch (type) {
-    case ImageType::Buffer:
-        return "Buffer";
+    case ImageType::Invalid:
+        return "Invalid";
     case ImageType::Color1D:
         return "Color1D";
     case ImageType::Color2D:
@@ -179,6 +179,40 @@ struct Image {
         return base_address << 8;
     }
 
+    u32 DstSelect() const {
+        return dst_sel_x | (dst_sel_y << 3) | (dst_sel_z << 6) | (dst_sel_w << 9);
+    }
+
+    static char SelectComp(u32 sel) {
+        switch (sel) {
+        case 0:
+            return '0';
+        case 1:
+            return '1';
+        case 4:
+            return 'R';
+        case 5:
+            return 'G';
+        case 6:
+            return 'B';
+        case 7:
+            return 'A';
+        default:
+            UNREACHABLE();
+        }
+    }
+
+    std::string DstSelectName() const {
+        std::string result = "[";
+        u32 dst_sel = DstSelect();
+        for (u32 i = 0; i < 4; i++) {
+            result += SelectComp(dst_sel & 7);
+            dst_sel >>= 3;
+        }
+        result += ']';
+        return result;
+    }
+
     u32 Pitch() const {
         return pitch + 1;
     }
@@ -290,6 +324,7 @@ enum class BorderColor : u64 {
 
 // Table 8.12 Sampler Resource Definition
 struct Sampler {
     union {
+        u64 raw0;
         BitField<0, 3, ClampMode> clamp_x;
         BitField<3, 3, ClampMode> clamp_y;
         BitField<6, 3, ClampMode> clamp_z;
@@ -309,6 +344,7 @@ struct Sampler {
         BitField<60, 4, u64> perf_z;
     };
     union {
+        u64 raw1;
         BitField<0, 14, u64> lod_bias;
         BitField<14, 6, u64> lod_bias_sec;
         BitField<20, 2, Filter> xy_mag_filter;
@@ -323,6 +359,10 @@ struct Sampler {
         BitField<62, 2, BorderColor> border_color_type;
     };
 
+    operator bool() const noexcept {
+        return raw0 != 0 || raw1 != 0;
+    }
diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp
index fc7943e6..e7c39de0 100644
--- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp
@@ -297,6 +297,7 @@ std::span<const vk::Format> GetAllFormats() {
         vk::Format::eBc3UnormBlock,
         vk::Format::eBc4UnormBlock,
         vk::Format::eBc5UnormBlock,
+        vk::Format::eBc5SnormBlock,
         vk::Format::eBc7SrgbBlock,
         vk::Format::eBc7UnormBlock,
         vk::Format::eD16Unorm,
@@ -308,6 +309,7 @@ std::span<const vk::Format> GetAllFormats() {
         vk::Format::eR8G8B8A8Srgb,
         vk::Format::eR8G8B8A8Uint,
         vk::Format::eR8G8B8A8Unorm,
+        vk::Format::eR8G8B8A8Snorm,
         vk::Format::eR8G8B8A8Uscaled,
         vk::Format::eR8G8Snorm,
         vk::Format::eR8G8Uint,
@@ -335,6 +337,10 @@ std::span<const vk::Format> GetAllFormats() {
         vk::Format::eR32Sfloat,
         vk::Format::eR32Sint,
         vk::Format::eR32Uint,
+        vk::Format::eBc6HUfloatBlock,
+        vk::Format::eR16G16Unorm,
+        vk::Format::eR16G16B16A16Sscaled,
+        vk::Format::eR16G16Sscaled,
     };
     return formats;
 }
@@ -384,10 +390,17 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
     if (data_format == AmdGpu::DataFormat::FormatBc5 && num_format == AmdGpu::NumberFormat::Unorm) {
         return vk::Format::eBc5UnormBlock;
     }
+    if (data_format == AmdGpu::DataFormat::FormatBc5 && num_format == AmdGpu::NumberFormat::Snorm) {
+        return vk::Format::eBc5SnormBlock;
+    }
     if (data_format == AmdGpu::DataFormat::Format16_16_16_16 &&
         num_format == AmdGpu::NumberFormat::Sint) {
         return vk::Format::eR16G16B16A16Sint;
     }
+    if (data_format == AmdGpu::DataFormat::Format16_16_16_16 &&
+        num_format == AmdGpu::NumberFormat::Sscaled) {
+        return vk::Format::eR16G16B16A16Sscaled;
+    }
     if (data_format == AmdGpu::DataFormat::Format16_16 &&
         num_format == AmdGpu::NumberFormat::Float) {
         return vk::Format::eR16G16Sfloat;
@@ -496,6 +509,10 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
         num_format == AmdGpu::NumberFormat::Sint) {
         return vk::Format::eR16G16Sint;
     }
+    if (data_format == AmdGpu::DataFormat::Format16_16 &&
+        num_format == AmdGpu::NumberFormat::Sscaled) {
+        return vk::Format::eR16G16Sscaled;
+    }
     if (data_format == AmdGpu::DataFormat::Format8_8_8_8 &&
         num_format == AmdGpu::NumberFormat::Uscaled) {
         return vk::Format::eR8G8B8A8Uscaled;
@@ -518,6 +535,13 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
         num_format == AmdGpu::NumberFormat::SnormNz) {
         return vk::Format::eR16G16B16A16Snorm;
     }
+    if (data_format == AmdGpu::DataFormat::Format8_8_8_8 &&
+        num_format == AmdGpu::NumberFormat::Snorm) {
+        return vk::Format::eR8G8B8A8Snorm;
+    }
+    if (data_format == AmdGpu::DataFormat::FormatBc6 && num_format == AmdGpu::NumberFormat::Unorm) {
+        return vk::Format::eBc6HUfloatBlock;
+    }
     UNREACHABLE_MSG("Unknown data_format={} and num_format={}", u32(data_format), u32(num_format));
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 51bb7f83..34f1e9cc 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -148,7 +148,7 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging,
         }
     }
     for (const auto& sampler : info.samplers) {
-        const auto ssharp = info.ReadUd<AmdGpu::Sampler>(sampler.sgpr_base, sampler.dword_offset);
+        const auto ssharp = sampler.GetSsharp(info);
         const auto vk_sampler = texture_cache.GetSampler(ssharp);
         image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral);
         set_writes.push_back({
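
A quick standalone check of the new mappings (the `Vulkan::LiverpoolToVK` namespace and the header path are assumptions here, mirroring the file being patched). The BC6 case is the subtle one: guest T-sharps describe BC6H with a `Unorm` number format, while the matching Vulkan format is the unsigned-float block:

    #include "video_core/renderer_vulkan/liverpool_to_vk.h"

    // Returns true when the two pairs added above resolve as expected.
    bool CheckNewMappings() {
        using namespace Vulkan::LiverpoolToVK; // assumed namespace of SurfaceFormat()
        return SurfaceFormat(AmdGpu::DataFormat::FormatBc6, AmdGpu::NumberFormat::Unorm) ==
                   vk::Format::eBc6HUfloatBlock &&
               SurfaceFormat(AmdGpu::DataFormat::FormatBc5, AmdGpu::NumberFormat::Snorm) ==
                   vk::Format::eBc5SnormBlock;
    }
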
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index eb552268..7b00a911 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -386,7 +386,7 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& staging,
         }
     }
     for (const auto& sampler : stage.samplers) {
-        auto ssharp = stage.ReadUd<AmdGpu::Sampler>(sampler.sgpr_base, sampler.dword_offset);
+        auto ssharp = sampler.GetSsharp(stage);
         if (sampler.disable_aniso) {
             const auto& tsharp = tsharps[sampler.associated_image];
             if (tsharp.base_level == 0 && tsharp.last_level == 0) {
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index 09a9180e..735303a3 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -164,10 +164,11 @@ bool Instance::CreateDevice() {
         vk::PhysicalDeviceVulkan13Features,
         vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR,
         vk::PhysicalDeviceDepthClipControlFeaturesEXT>();
-    const vk::StructureChain properties_chain =
-        physical_device.getProperties2<vk::PhysicalDeviceProperties2,
-                                       vk::PhysicalDevicePortabilitySubsetPropertiesKHR,
-                                       vk::PhysicalDeviceExternalMemoryHostPropertiesEXT>();
+    const vk::StructureChain properties_chain = physical_device.getProperties2<
+        vk::PhysicalDeviceProperties2, vk::PhysicalDevicePortabilitySubsetPropertiesKHR,
+        vk::PhysicalDeviceExternalMemoryHostPropertiesEXT, vk::PhysicalDeviceVulkan11Properties>();
+    subgroup_size = properties_chain.get<vk::PhysicalDeviceVulkan11Properties>().subgroupSize;
+    LOG_INFO(Render_Vulkan, "Physical device subgroup size {}", subgroup_size);
 
     features = feature_chain.get().features;
     if (available_extensions.empty()) {
@@ -261,6 +262,7 @@ bool Instance::CreateDevice() {
             .shaderStorageImageExtendedFormats = features.shaderStorageImageExtendedFormats,
             .shaderStorageImageMultisample = features.shaderStorageImageMultisample,
             .shaderClipDistance = features.shaderClipDistance,
+            .shaderInt64 = features.shaderInt64,
             .shaderInt16 = features.shaderInt16,
         },
     },
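
For reference, the same query in isolation: Vulkan-Hpp lets you chain `vk::PhysicalDeviceVulkan11Properties` onto `getProperties2` and read `subgroupSize` directly. A minimal sketch, assuming a valid Vulkan 1.1+ physical device:

    #include <cstdio>
    #include <vulkan/vulkan.hpp>

    void PrintSubgroupSize(vk::PhysicalDevice physical_device) {
        // Query the base properties plus the Vulkan 1.1 property block in one chain.
        const auto chain = physical_device.getProperties2<vk::PhysicalDeviceProperties2,
                                                          vk::PhysicalDeviceVulkan11Properties>();
        const auto& vk11 = chain.get<vk::PhysicalDeviceVulkan11Properties>();
        std::printf("subgroup size: %u\n", vk11.subgroupSize);
    }
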
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index 32965ddb..a8c0dcf4 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -188,6 +188,11 @@ public:
         return properties.limits.nonCoherentAtomSize;
     }
 
+    /// Returns the subgroup size of the selected physical device.
+    u32 SubgroupSize() const {
+        return subgroup_size;
+    }
+
     /// Returns the maximum supported elements in a texel buffer
    u32 MaxTexelBufferElements() const {
         return properties.limits.maxTexelBufferElements;
@@ -249,6 +254,7 @@ private:
     bool workgroup_memory_explicit_layout{};
     bool color_write_en{};
     u64 min_imported_host_pointer_alignment{};
+    u32 subgroup_size{};
     bool tooling_info{};
     bool debug_utils_supported{};
     bool has_nsight_graphics{};
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 67994485..8d27d252 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -109,6 +109,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
     pipeline_cache = instance.GetDevice().createPipelineCacheUnique({});
     profile = Shader::Profile{
         .supported_spirv = 0x00010600U,
+        .subgroup_size = instance.SubgroupSize(),
         .support_explicit_workgroup_layout = true,
     };
 }
@@ -268,7 +269,8 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline() {
         Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs);
         info.pgm_base = pgm->Address();
         info.pgm_hash = hash;
-        programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info));
+        programs[i] =
+            Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile);
 
         // Compile IR to SPIR-V
         auto spv_code = Shader::Backend::SPIRV::EmitSPIRV(profile, programs[i], binding);
@@ -308,7 +310,8 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline() {
     Shader::Info info =
         MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs);
     info.pgm_base = cs_pgm.Address();
-    auto program = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info));
+    auto program =
+        Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile);
 
     // Compile IR to SPIR-V
     u32 binding{};
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index c64f6089..ff5e97d5 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -23,7 +23,7 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
     : instance{instance_}, scheduler{scheduler_}, texture_cache{texture_cache_},
       liverpool{liverpool_}, memory{Core::Memory::Instance()},
       pipeline_cache{instance, scheduler, liverpool},
-      vertex_index_buffer{instance, scheduler, VertexIndexFlags, 1_GB, BufferType::Upload} {
+      vertex_index_buffer{instance, scheduler, VertexIndexFlags, 2_GB, BufferType::Upload} {
     if (!Config::nullGpu()) {
         liverpool->BindRasterizer(this);
     }
@@ -128,6 +128,7 @@ void Rasterizer::BeginRendering() {
         state.height = std::min(state.height, image.info.size.height);
         const bool is_clear = texture_cache.IsMetaCleared(col_buf.CmaskAddress());
+        state.color_images[state.num_color_attachments] = image.image;
         state.color_attachments[state.num_color_attachments++] = {
             .imageView = *image_view.image_view,
             .imageLayout = vk::ImageLayout::eGeneral,
@@ -152,6 +153,7 @@ void Rasterizer::BeginRendering() {
         const auto& image = texture_cache.GetImage(image_view.image_id);
         state.width = std::min(state.width, image.info.size.width);
         state.height = std::min(state.height, image.info.size.height);
+        state.depth_image = image.image;
         state.depth_attachment = {
             .imageView = *image_view.image_view,
             .imageLayout = image.layout,
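
The `profile` parameter threaded into `Shader::TranslateProgram` above means the recompiler knows the host wave width at translate time instead of assuming GCN's fixed 64-wide wave. A hypothetical sketch of the idea; the field names are taken from the diff, but the helper and the emulation check are illustrative, not the project's actual API:

    #include <cstdint>

    // Illustrative stand-in for the translation profile; only the fields visible
    // in the diff above are taken from the real struct.
    struct Profile {
        std::uint32_t supported_spirv{};
        std::uint32_t subgroup_size{};
        bool support_explicit_workgroup_layout{};
    };

    // GCN shaders assume a 64-wide wave; on a 32-wide host subgroup, exec-mask
    // and ballot patterns may need emulation during translation (hypothetical).
    constexpr bool NeedsWave64Emulation(const Profile& profile) {
        return profile.subgroup_size != 0 && profile.subgroup_size < 64;
    }
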
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index e7b12d49..fb64285f 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -50,7 +50,32 @@ void Scheduler::EndRendering() {
         return;
     }
     is_rendering = false;
+    boost::container::static_vector<vk::ImageMemoryBarrier, 8> barriers;
+    for (size_t i = 0; i < render_state.num_color_attachments; ++i) {
+        barriers.push_back(vk::ImageMemoryBarrier{
+            .srcAccessMask = vk::AccessFlagBits::eColorAttachmentWrite,
+            .dstAccessMask = vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite,
+            .oldLayout = vk::ImageLayout::eColorAttachmentOptimal,
+            .newLayout = vk::ImageLayout::eColorAttachmentOptimal,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .image = render_state.color_images[i],
+            .subresourceRange =
+                {
+                    .aspectMask = vk::ImageAspectFlagBits::eColor,
+                    .baseMipLevel = 0,
+                    .levelCount = VK_REMAINING_MIP_LEVELS,
+                    .baseArrayLayer = 0,
+                    .layerCount = VK_REMAINING_ARRAY_LAYERS,
+                },
+        });
+    }
     current_cmdbuf.endRendering();
+    if (!barriers.empty()) {
+        current_cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eColorAttachmentOutput,
+                                       vk::PipelineStageFlagBits::eFragmentShader,
+                                       vk::DependencyFlagBits::eByRegion, {}, {}, barriers);
+    }
 }
 
 void Scheduler::Flush(SubmitInfo& info) {
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 1e640b08..48c3af7a 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -15,7 +15,9 @@ class Instance;
 
 struct RenderState {
     std::array<vk::RenderingAttachmentInfo, 8> color_attachments{};
+    std::array<vk::Image, 8> color_images{};
     vk::RenderingAttachmentInfo depth_attachment{};
+    vk::Image depth_image{};
     u32 num_color_attachments{};
     u32 num_depth_attachments{};
     u32 width = std::numeric_limits<u32>::max();
diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp
index ff85a8aa..04bedaff 100644
--- a/src/video_core/texture_cache/image_view.cpp
+++ b/src/video_core/texture_cache/image_view.cpp
@@ -47,6 +47,20 @@ vk::ComponentSwizzle ConvertComponentSwizzle(u32 dst_sel) {
     }
 }
 
+bool IsIdentityMapping(u32 dst_sel, u32 num_components) {
+    return (num_components == 1 && dst_sel == 0b100) ||
+           (num_components == 2 && dst_sel == 0b101'100) ||
+           (num_components == 3 && dst_sel == 0b110'101'100) ||
+           (num_components == 4 && dst_sel == 0b111'110'101'100);
+}
+
+vk::Format TrySwizzleFormat(vk::Format format, u32 dst_sel) {
+    if (format == vk::Format::eR8G8B8A8Unorm && dst_sel == 0b111100101110) {
+        return vk::Format::eB8G8R8A8Unorm;
+    }
+    return format;
+}
+
 ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept
     : is_storage{is_storage} {
     type = ConvertImageViewType(image.GetType());
@@ -60,9 +74,16 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept
     mapping.b = ConvertComponentSwizzle(image.dst_sel_z);
     mapping.a = ConvertComponentSwizzle(image.dst_sel_w);
     // Check for unfortunate case of storage images being swizzled
-    if (is_storage && (mapping != vk::ComponentMapping{})) {
-        LOG_ERROR(Render_Vulkan, "Storage image requires swizzling");
+    const u32 num_comps = AmdGpu::NumComponents(image.GetDataFmt());
+    const u32 dst_sel = image.DstSelect();
+    if (is_storage && !IsIdentityMapping(dst_sel, num_comps)) {
         mapping = vk::ComponentMapping{};
+        if (auto new_format = TrySwizzleFormat(format, dst_sel); new_format != format) {
+            format = new_format;
+            return;
+        }
+        LOG_ERROR(Render_Vulkan, "Storage image (num_comps = {}) requires swizzling {}",
+                  num_comps, image.DstSelectName());
     }
 }
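
To make the packed constants in `IsIdentityMapping()` easier to audit, here is a standalone decode of the patterns (illustration only, mirroring `Image::SelectComp()` above: selectors 4..7 pick R, G, B, A, with the x selector in the low 3 bits):

    // Four-component identity: x->R, y->G, z->B, w->A.
    constexpr unsigned kIdentity4 = 0b111'110'101'100;
    static_assert((kIdentity4 & 7) == 4);        // x -> R
    static_assert(((kIdentity4 >> 3) & 7) == 5); // y -> G
    static_assert(((kIdentity4 >> 6) & 7) == 6); // z -> B
    static_assert(((kIdentity4 >> 9) & 7) == 7); // w -> A

    // The BGRA pattern handled by TrySwizzleFormat() swaps R and B, which is why
    // an R8G8B8A8 view can be rewritten as B8G8R8A8 instead of swizzled.
    constexpr unsigned kBgra = 0b111'100'101'110;
    static_assert((kBgra & 7) == 6 && ((kBgra >> 6) & 7) == 4); // x -> B, z -> R
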
LOG_ERROR(Render_Vulkan, "Storage image (num_comps = {}) requires swizzling {}", num_comps, + image.DstSelectName()); } } diff --git a/src/video_core/texture_cache/image_view.h b/src/video_core/texture_cache/image_view.h index 590ac9be..fbc62db3 100644 --- a/src/video_core/texture_cache/image_view.h +++ b/src/video_core/texture_cache/image_view.h @@ -35,6 +35,8 @@ struct ImageViewInfo { struct Image; +constexpr Common::SlotId NULL_IMAGE_VIEW_ID{0}; + struct ImageView { explicit ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info, Image& image, ImageId image_id, std::optional usage_override = {}); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 9131e6f1..7b8a5554 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -142,14 +142,14 @@ ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) { image_ids.push_back(image_id); }); - ASSERT_MSG(image_ids.size() <= 1, "Overlapping images not allowed!"); + // ASSERT_MSG(image_ids.size() <= 1, "Overlapping images not allowed!"); ImageId image_id{}; if (image_ids.empty()) { image_id = slot_images.insert(instance, scheduler, info); RegisterImage(image_id); } else { - image_id = image_ids[0]; + image_id = image_ids[image_ids.size() > 1 ? 1 : 0]; } Image& image = slot_images[image_id]; @@ -183,12 +183,17 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo } ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) { + if (info.guest_address == 0) [[unlikely]] { + return slot_image_views[NULL_IMAGE_VIEW_ID]; + } + const ImageId image_id = FindImage(info); Image& image = slot_images[image_id]; auto& usage = image.info.usage; if (view_info.is_storage) { - image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderWrite); + image.Transit(vk::ImageLayout::eGeneral, + vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite); usage.storage = true; } else { const auto new_layout = image.info.IsDepthStencil() @@ -206,7 +211,7 @@ ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info_tmp.range.extent.levels > image.info.resources.levels || view_info_tmp.range.extent.layers > image.info.resources.layers) { - LOG_ERROR(Render_Vulkan, + LOG_DEBUG(Render_Vulkan, "Subresource range ({}~{},{}~{}) exceeds base image extents ({},{})", view_info_tmp.range.base.level, view_info_tmp.range.extent.levels, view_info_tmp.range.base.layer, view_info_tmp.range.extent.layers, @@ -341,7 +346,7 @@ void TextureCache::RefreshImage(Image& image) { cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy); image.Transit(vk::ImageLayout::eGeneral, - vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); + vk::AccessFlagBits::eMemoryWrite | vk::AccessFlagBits::eMemoryRead); } vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) {