shader_recompiler: Small instruction parsing refactor/bugfixes (#340)

* translator: Implemtn f32 to f16 convert

* shader_recompiler: Add bit instructions

* shader_recompiler: More data share instructions

* shader_recompiler: Remove exec contexts, fix S_MOV_B64

* shader_recompiler: Split instruction parsing into categories

* shader_recompiler: Better BFS search

* shader_recompiler: Constant propagation pass for cmp_class_f32

* shader_recompiler: Partial readfirstlane implementation

* shader_recompiler: Stub readlane/writelane only for non-compute

* hack: Fix swizzle on RDR

* Will properly fix this when merging this

* clang format

* address_space: Bump user area size to full

* shader_recompiler: V_INTERP_MOV_F32

* Should work the same as spirv will emit flat decoration on demand

* kernel: Add MAP_OP_MAP_FLEXIBLE

* image_view: Attempt to apply storage swizzle on format

* vk_scheduler: Barrier attachments on renderpass end

* clang format

* liverpool: cs state backup

* shader_recompiler: More instructions and formats

* vector_alu: Proper V_MBCNT_U32_B32

* shader_recompiler: Port some dark souls things

* file_system: Implement sceKernelRename

* more formats

* clang format

* resource_tracking_pass: Back to assert

* translate: Tracedata

* kernel: Remove tracy lock

* Solves random crashes in Dark Souls

* code: Review comments
This commit is contained in:
TheTurtle 2024-07-31 00:32:40 +03:00 committed by GitHub
parent ac6dc20c3b
commit a7c9bfa5c5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
66 changed files with 1349 additions and 904 deletions

View File

@ -15,7 +15,7 @@ static u32 screenWidth = 1280;
static u32 screenHeight = 720;
static s32 gpuId = -1; // Vulkan physical device index. Set to negative for auto select
static std::string logFilter;
static std::string logType = "sync";
static std::string logType = "async";
static bool isDebugDump = false;
static bool isLibc = true;
static bool isShowSplash = false;

View File

@ -207,8 +207,8 @@ public:
message_queue.EmplaceWait(entry);
} else {
ForEachBackend([&entry](auto& backend) { backend.Write(entry); });
std::fflush(stdout);
}
std::fflush(stdout);
}
private:

View File

@ -34,10 +34,7 @@ constexpr VAddr USER_MAX = 0xFBFFFFFFFFULL;
static constexpr size_t SystemManagedSize = SYSTEM_MANAGED_MAX - SYSTEM_MANAGED_MIN + 1;
static constexpr size_t SystemReservedSize = SYSTEM_RESERVED_MAX - SYSTEM_RESERVED_MIN + 1;
// User area size is normally larger than this. However games are unlikely to map to high
// regions of that area, so by default we allocate a smaller virtual address space (about 1/4th).
// to save space on page tables.
static constexpr size_t UserSize = 1ULL << 39;
static constexpr size_t UserSize = 1ULL << 40;
/**
* Represents the user virtual address space backed by a dmem memory block

View File

@ -70,7 +70,7 @@ std::filesystem::path MntPoints::GetHostPath(const std::string& guest_directory)
// exist in filesystem but in different case.
auto guest_path = current_path;
while (!path_parts.empty()) {
const auto& part = path_parts.back();
const auto part = path_parts.back();
const auto add_match = [&](const auto& host_part) {
current_path /= host_part;
guest_path /= part;

View File

@ -957,7 +957,7 @@ int PS4_SYSV_ABI sceGnmGetGpuBlockStatus() {
}
int PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency() {
LOG_ERROR(Lib_GnmDriver, "(STUBBED) called");
LOG_DEBUG(Lib_GnmDriver, "(STUBBED) called");
return ORBIS_OK;
}

View File

@ -472,6 +472,28 @@ s64 PS4_SYSV_ABI sceKernelPwrite(int d, void* buf, size_t nbytes, s64 offset) {
return file->f.WriteRaw<u8>(buf, nbytes);
}
s32 PS4_SYSV_ABI sceKernelRename(const char* from, const char* to) {
auto* mnt = Common::Singleton<Core::FileSys::MntPoints>::Instance();
const auto src_path = mnt->GetHostPath(from);
if (!std::filesystem::exists(src_path)) {
return ORBIS_KERNEL_ERROR_ENOENT;
}
const auto dst_path = mnt->GetHostPath(to);
const bool src_is_dir = std::filesystem::is_directory(src_path);
const bool dst_is_dir = std::filesystem::is_directory(dst_path);
if (src_is_dir && !dst_is_dir) {
return ORBIS_KERNEL_ERROR_ENOTDIR;
}
if (!src_is_dir && dst_is_dir) {
return ORBIS_KERNEL_ERROR_EISDIR;
}
if (dst_is_dir && !std::filesystem::is_empty(dst_path)) {
return ORBIS_KERNEL_ERROR_ENOTEMPTY;
}
std::filesystem::copy(src_path, dst_path, std::filesystem::copy_options::overwrite_existing);
return ORBIS_OK;
}
void fileSystemSymbolsRegister(Core::Loader::SymbolsResolver* sym) {
std::srand(std::time(nullptr));
LIB_FUNCTION("1G3lF1Gg1k8", "libkernel", 1, "libkernel", 1, 1, sceKernelOpen);
@ -493,6 +515,7 @@ void fileSystemSymbolsRegister(Core::Loader::SymbolsResolver* sym) {
LIB_FUNCTION("kBwCPsYX-m4", "libkernel", 1, "libkernel", 1, 1, sceKernelFStat);
LIB_FUNCTION("mqQMh1zPPT8", "libScePosix", 1, "libkernel", 1, 1, posix_fstat);
LIB_FUNCTION("VW3TVZiM4-E", "libkernel", 1, "libkernel", 1, 1, sceKernelFtruncate);
LIB_FUNCTION("52NcYU9+lEo", "libkernel", 1, "libkernel", 1, 1, sceKernelRename);
LIB_FUNCTION("E6ao34wPw+U", "libScePosix", 1, "libkernel", 1, 1, posix_stat);
LIB_FUNCTION("+r3rMFwItV4", "libkernel", 1, "libkernel", 1, 1, sceKernelPread);

View File

@ -7,6 +7,7 @@
#include <boost/asio/io_context.hpp>
#include "common/assert.h"
#include "common/debug.h"
#include "common/logging/log.h"
#include "common/polyfill_thread.h"
#include "common/singleton.h"
@ -84,6 +85,9 @@ static PS4_SYSV_ABI void stack_chk_fail() {
int PS4_SYSV_ABI sceKernelMunmap(void* addr, size_t len) {
LOG_INFO(Kernel_Vmm, "addr = {}, len = {:#x}", fmt::ptr(addr), len);
if (len == 0) {
return ORBIS_OK;
}
auto* memory = Core::Memory::Instance();
memory->UnmapMemory(std::bit_cast<VAddr>(addr), len);
return SCE_OK;

View File

@ -262,6 +262,16 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn
LOG_INFO(Kernel_Vmm, "BatchMap: entry = {}, operation = {}, len = {:#x}, result = {}",
i, entries[i].operation, entries[i].length, result);
if (result == 0)
processed++;
} else if (entries[i].operation == MemoryOpTypes::ORBIS_KERNEL_MAP_OP_MAP_FLEXIBLE) {
result = sceKernelMapNamedFlexibleMemory(&entries[i].start, entries[i].length,
entries[i].protection, flags, "");
LOG_INFO(Kernel_Vmm,
"BatchMap: entry = {}, operation = {}, len = {:#x}, type = {}, "
"result = {}",
i, entries[i].operation, entries[i].length, (u8)entries[i].type, result);
if (result == 0)
processed++;
} else {

View File

@ -439,11 +439,7 @@ int PS4_SYSV_ABI scePthreadMutexInit(ScePthreadMutex* mutex, const ScePthreadMut
int result = pthread_mutex_init(&(*mutex)->pth_mutex, &(*attr)->pth_mutex_attr);
static auto mutex_loc = MUTEX_LOCATION("mutex");
(*mutex)->tracy_lock = std::make_unique<tracy::LockableCtx>(&mutex_loc);
if (name != nullptr) {
(*mutex)->tracy_lock->CustomName(name, std::strlen(name));
LOG_INFO(Kernel_Pthread, "name={}, result={}", name, result);
}
@ -555,15 +551,11 @@ int PS4_SYSV_ABI scePthreadMutexLock(ScePthreadMutex* mutex) {
return SCE_KERNEL_ERROR_EINVAL;
}
(*mutex)->tracy_lock->BeforeLock();
int result = pthread_mutex_lock(&(*mutex)->pth_mutex);
if (result != 0) {
LOG_TRACE(Kernel_Pthread, "Locked name={}, result={}", (*mutex)->name, result);
}
(*mutex)->tracy_lock->AfterLock();
switch (result) {
case 0:
return SCE_OK;
@ -589,8 +581,6 @@ int PS4_SYSV_ABI scePthreadMutexUnlock(ScePthreadMutex* mutex) {
LOG_TRACE(Kernel_Pthread, "Unlocking name={}, result={}", (*mutex)->name, result);
}
(*mutex)->tracy_lock->AfterUnlock();
switch (result) {
case 0:
return SCE_OK;
@ -1195,8 +1185,6 @@ int PS4_SYSV_ABI scePthreadMutexTrylock(ScePthreadMutex* mutex) {
LOG_TRACE(Kernel_Pthread, "name={}, result={}", (*mutex)->name, result);
}
(*mutex)->tracy_lock->AfterTryLock(result == 0);
switch (result) {
case 0:
return ORBIS_OK;

View File

@ -9,7 +9,6 @@
#include <vector>
#include <pthread.h>
#include <sched.h>
#include "common/debug.h"
#include "common/types.h"
namespace Core::Loader {
@ -74,7 +73,6 @@ struct PthreadMutexInternal {
u8 reserved[256];
std::string name;
pthread_mutex_t pth_mutex;
std::unique_ptr<tracy::LockableCtx> tracy_lock;
};
struct PthreadMutexattrInternal {

View File

@ -559,7 +559,7 @@ int PS4_SYSV_ABI sceNetEpollDestroy() {
}
int PS4_SYSV_ABI sceNetEpollWait() {
LOG_ERROR(Lib_Net, "(STUBBED) called");
LOG_TRACE(Lib_Net, "(STUBBED) called");
return ORBIS_OK;
}

View File

@ -79,7 +79,7 @@ int PS4_SYSV_ABI sceNetCtlUnregisterCallbackV6() {
}
int PS4_SYSV_ABI sceNetCtlCheckCallback() {
LOG_ERROR(Lib_NetCtl, "(STUBBED) called");
LOG_TRACE(Lib_NetCtl, "(STUBBED) called");
return ORBIS_OK;
}

View File

@ -870,7 +870,7 @@ int PS4_SYSV_ABI sceNpAsmTerminate() {
}
int PS4_SYSV_ABI sceNpCheckCallback() {
LOG_ERROR(Lib_NpManager, "(STUBBED) called");
LOG_TRACE(Lib_NpManager, "(STUBBED) called");
return ORBIS_OK;
}

View File

@ -183,6 +183,7 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) {
ctx.AddCapability(spv::Capability::Float16);
ctx.AddCapability(spv::Capability::Int16);
}
ctx.AddCapability(spv::Capability::Int64);
if (info.has_storage_images) {
ctx.AddCapability(spv::Capability::StorageImageExtendedFormats);
}
@ -204,8 +205,8 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) {
} else {
ctx.AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft);
}
ctx.AddCapability(spv::Capability::GroupNonUniform);
if (info.uses_group_quad) {
ctx.AddCapability(spv::Capability::GroupNonUniform);
ctx.AddCapability(spv::Capability::GroupNonUniformQuad);
}
if (info.has_discard) {
@ -217,9 +218,9 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) {
if (info.has_image_query) {
ctx.AddCapability(spv::Capability::ImageQuery);
}
// if (program.info.stores_frag_depth) {
// ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing);
// }
if (info.stores.Get(IR::Attribute::Depth)) {
ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing);
}
break;
default:
throw NotImplementedException("Stage {}", u32(program.info.stage));

View File

@ -6,8 +6,8 @@
namespace Shader::Backend::SPIRV {
void EmitBitCastU16F16(EmitContext&) {
UNREACHABLE_MSG("SPIR-V Instruction");
Id EmitBitCastU16F16(EmitContext& ctx, Id value) {
return ctx.OpBitcast(ctx.U16, value);
}
Id EmitBitCastU32F32(EmitContext& ctx, Id value) {

View File

@ -120,6 +120,7 @@ void EmitGetGotoVariable(EmitContext&) {
}
Id EmitReadConst(EmitContext& ctx) {
return ctx.u32_zero_value;
UNREACHABLE_MSG("Unreachable instruction");
}
@ -149,6 +150,9 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) {
// Attribute is disabled or varying component is not written
return ctx.ConstF32(comp == 3 ? 1.0f : 0.0f);
}
if (param.is_default) {
return ctx.OpCompositeExtract(param.component_type, param.id, comp);
}
if (param.num_components > 1) {
const Id pointer{
@ -208,7 +212,7 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 element) {
const Id pointer{OutputAttrPointer(ctx, attr, element)};
ctx.OpStore(pointer, value);
ctx.OpStore(pointer, ctx.OpBitcast(ctx.F32[1], value));
}
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {

View File

@ -259,4 +259,8 @@ Id EmitConvertU16U32(EmitContext& ctx, Id value) {
return ctx.OpUConvert(ctx.U16, value);
}
Id EmitConvertU32U16(EmitContext& ctx, Id value) {
return ctx.OpUConvert(ctx.U32[1], value);
}
} // namespace Shader::Backend::SPIRV

View File

@ -385,4 +385,8 @@ Id EmitFPIsInf64(EmitContext& ctx, Id value) {
return ctx.OpIsInf(ctx.U1[1], value);
}
void EmitFPCmpClass32(EmitContext&) {
UNREACHABLE();
}
} // namespace Shader::Backend::SPIRV

View File

@ -70,7 +70,6 @@ Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id o
const u32 comp = inst->Flags<IR::TextureInstInfo>().gather_comp.Value();
ImageOperands operands;
operands.Add(spv::ImageOperandsMask::Offset, offset);
operands.Add(spv::ImageOperandsMask::Lod, ctx.ConstF32(0.f));
return ctx.OpImageGather(ctx.F32[4], sampled_image, coords, ctx.ConstU32(comp), operands.mask,
operands.operands);
}
@ -106,8 +105,7 @@ Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, u32 handle, Id lod
const auto type = ctx.info.images[handle & 0xFFFF].type;
const Id zero = ctx.u32_zero_value;
const auto mips{[&] { return skip_mips ? zero : ctx.OpImageQueryLevels(ctx.U32[1], image); }};
const bool uses_lod{type != AmdGpu::ImageType::Color2DMsaa &&
type != AmdGpu::ImageType::Buffer};
const bool uses_lod{type != AmdGpu::ImageType::Color2DMsaa};
const auto query{[&](Id type) {
return uses_lod ? ctx.OpImageQuerySizeLod(type, image, lod)
: ctx.OpImageQuerySize(type, image);

View File

@ -42,6 +42,7 @@ void EmitSetVcc(EmitContext& ctx);
void EmitSetSccLo(EmitContext& ctx);
void EmitSetVccLo(EmitContext& ctx);
void EmitSetVccHi(EmitContext& ctx);
void EmitFPCmpClass32(EmitContext& ctx);
void EmitPrologue(EmitContext& ctx);
void EmitEpilogue(EmitContext& ctx);
void EmitDiscard(EmitContext& ctx);
@ -148,7 +149,7 @@ Id EmitSelectU64(EmitContext& ctx, Id cond, Id true_value, Id false_value);
Id EmitSelectF16(EmitContext& ctx, Id cond, Id true_value, Id false_value);
Id EmitSelectF32(EmitContext& ctx, Id cond, Id true_value, Id false_value);
Id EmitSelectF64(EmitContext& ctx, Id cond, Id true_value, Id false_value);
void EmitBitCastU16F16(EmitContext& ctx);
Id EmitBitCastU16F16(EmitContext& ctx, Id value);
Id EmitBitCastU32F32(EmitContext& ctx, Id value);
void EmitBitCastU64F64(EmitContext& ctx);
Id EmitBitCastF16U16(EmitContext& ctx, Id value);
@ -282,6 +283,7 @@ Id EmitBitCount32(EmitContext& ctx, Id value);
Id EmitBitwiseNot32(EmitContext& ctx, Id value);
Id EmitFindSMsb32(EmitContext& ctx, Id value);
Id EmitFindUMsb32(EmitContext& ctx, Id value);
Id EmitFindILsb32(EmitContext& ctx, Id value);
Id EmitSMin32(EmitContext& ctx, Id a, Id b);
Id EmitUMin32(EmitContext& ctx, Id a, Id b);
Id EmitSMax32(EmitContext& ctx, Id a, Id b);
@ -353,6 +355,7 @@ Id EmitConvertF64U16(EmitContext& ctx, Id value);
Id EmitConvertF64U32(EmitContext& ctx, Id value);
Id EmitConvertF64U64(EmitContext& ctx, Id value);
Id EmitConvertU16U32(EmitContext& ctx, Id value);
Id EmitConvertU32U16(EmitContext& ctx, Id value);
Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc,
Id offset);
@ -387,6 +390,7 @@ Id EmitImageAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords,
Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value);
Id EmitLaneId(EmitContext& ctx);
Id EmitWarpId(EmitContext& ctx);
Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index);
} // namespace Shader::Backend::SPIRV

View File

@ -198,6 +198,10 @@ Id EmitFindUMsb32(EmitContext& ctx, Id value) {
return ctx.OpFindUMsb(ctx.U32[1], value);
}
Id EmitFindILsb32(EmitContext& ctx, Id value) {
return ctx.OpFindILsb(ctx.U32[1], value);
}
Id EmitSMin32(EmitContext& ctx, Id a, Id b) {
return ctx.OpSMin(ctx.U32[1], a, b);
}

View File

@ -10,6 +10,10 @@ Id SubgroupScope(EmitContext& ctx) {
return ctx.ConstU32(static_cast<u32>(spv::Scope::Subgroup));
}
Id EmitWarpId(EmitContext& ctx) {
return ctx.OpLoad(ctx.U32[1], ctx.subgroup_id);
}
Id EmitLaneId(EmitContext& ctx) {
return ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id);
}

View File

@ -49,7 +49,7 @@ EmitContext::EmitContext(const Profile& profile_, IR::Program& program, u32& bin
DefineInterfaces(program);
DefineBuffers(info);
DefineImagesAndSamplers(info);
DefineSharedMemory(info);
DefineSharedMemory();
}
EmitContext::~EmitContext() = default;
@ -86,6 +86,7 @@ void EmitContext::DefineArithmeticTypes() {
F32[1] = Name(TypeFloat(32), "f32_id");
S32[1] = Name(TypeSInt(32), "i32_id");
U32[1] = Name(TypeUInt(32), "u32_id");
U64 = Name(TypeUInt(64), "u64_id");
for (u32 i = 2; i <= 4; i++) {
if (info.uses_fp16) {
@ -126,6 +127,7 @@ Id GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) {
case AmdGpu::NumberFormat::Float:
case AmdGpu::NumberFormat::Unorm:
case AmdGpu::NumberFormat::Snorm:
case AmdGpu::NumberFormat::SnormNz:
return ctx.F32[4];
case AmdGpu::NumberFormat::Sint:
return ctx.S32[4];
@ -146,6 +148,7 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f
case AmdGpu::NumberFormat::Float:
case AmdGpu::NumberFormat::Unorm:
case AmdGpu::NumberFormat::Snorm:
case AmdGpu::NumberFormat::SnormNz:
return {id, input_f32, F32[1], 4};
case AmdGpu::NumberFormat::Uint:
return {id, input_u32, U32[1], 4};
@ -204,7 +207,9 @@ void EmitContext::DefineInputs(const Info& info) {
: 1;
// Note that we pass index rather than Id
input_params[input.binding] = {
rate_idx, input_u32, U32[1], input.num_components, input.instance_data_buf,
rate_idx, input_u32,
U32[1], input.num_components,
false, input.instance_data_buf,
};
} else {
Id id{DefineInput(type, input.binding)};
@ -220,19 +225,18 @@ void EmitContext::DefineInputs(const Info& info) {
break;
}
case Stage::Fragment:
if (info.uses_group_quad) {
subgroup_local_invocation_id = DefineVariable(
U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input);
Decorate(subgroup_local_invocation_id, spv::Decoration::Flat);
}
subgroup_id = DefineVariable(U32[1], spv::BuiltIn::SubgroupId, spv::StorageClass::Input);
subgroup_local_invocation_id = DefineVariable(
U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input);
Decorate(subgroup_local_invocation_id, spv::Decoration::Flat);
frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output);
front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input);
for (const auto& input : info.ps_inputs) {
const u32 semantic = input.param_index;
if (input.is_default) {
input_params[semantic] = {MakeDefaultValue(*this, input.default_value), input_f32,
F32[1]};
input_params[semantic] = {MakeDefaultValue(*this, input.default_value), F32[1],
F32[1], 4, true};
continue;
}
const IR::Attribute param{IR::Attribute::Param0 + input.param_index};
@ -392,7 +396,16 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) {
image.GetNumberFmt() == AmdGpu::NumberFormat::Uint) {
return spv::ImageFormat::Rgba8ui;
}
UNREACHABLE();
if (image.GetDataFmt() == AmdGpu::DataFormat::Format10_11_11 &&
image.GetNumberFmt() == AmdGpu::NumberFormat::Float) {
return spv::ImageFormat::R11fG11fB10f;
}
if (image.GetDataFmt() == AmdGpu::DataFormat::Format32_32_32_32 &&
image.GetNumberFmt() == AmdGpu::NumberFormat::Float) {
return spv::ImageFormat::Rgba32f;
}
UNREACHABLE_MSG("Unknown storage format data_format={}, num_format={}", image.GetDataFmt(),
image.GetNumberFmt());
}
Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) {
@ -412,8 +425,6 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) {
return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, sampled, format);
case AmdGpu::ImageType::Cube:
return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, false, false, sampled, format);
case AmdGpu::ImageType::Buffer:
throw NotImplementedException("Image buffer");
default:
break;
}
@ -471,10 +482,14 @@ void EmitContext::DefineImagesAndSamplers(const Info& info) {
}
}
void EmitContext::DefineSharedMemory(const Info& info) {
if (info.shared_memory_size == 0) {
void EmitContext::DefineSharedMemory() {
static constexpr size_t DefaultSharedMemSize = 16_KB;
if (!info.uses_shared) {
return;
}
if (info.shared_memory_size == 0) {
info.shared_memory_size = DefaultSharedMemSize;
}
const auto make{[&](Id element_type, u32 element_size) {
const u32 num_elements{Common::DivCeil(info.shared_memory_size, element_size)};
const Id array_type{TypeArray(element_type, ConstU32(num_elements))};

View File

@ -180,6 +180,7 @@ public:
Id workgroup_id{};
Id local_invocation_id{};
Id subgroup_id{};
Id subgroup_local_invocation_id{};
Id image_u32{};
@ -219,6 +220,7 @@ public:
Id pointer_type;
Id component_type;
u32 num_components;
bool is_default{};
s32 buffer_handle{-1};
};
std::array<SpirvAttribute, 32> input_params{};
@ -231,7 +233,7 @@ private:
void DefineOutputs(const Info& info);
void DefineBuffers(const Info& info);
void DefineImagesAndSamplers(const Info& info);
void DefineSharedMemory(const Info& info);
void DefineSharedMemory();
SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id);
};

View File

@ -1479,7 +1479,7 @@ constexpr std::array<InstFormat, 455> InstructionFormatVOP3 = {{
{InstClass::VectorFpGraph32, InstCategory::VectorALU, 3, 1, ScalarType::Float32,
ScalarType::Float32},
// 337 = V_MIN3_F32
{InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32,
{InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32,
ScalarType::Float32},
// 338 = V_MIN3_I32
{InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Sint32,
@ -1488,7 +1488,7 @@ constexpr std::array<InstFormat, 455> InstructionFormatVOP3 = {{
{InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Uint32,
ScalarType::Uint32},
// 340 = V_MAX3_F32
{InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32,
{InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32,
ScalarType::Float32},
// 341 = V_MAX3_I32
{InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Sint32,
@ -1497,7 +1497,7 @@ constexpr std::array<InstFormat, 455> InstructionFormatVOP3 = {{
{InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Uint32,
ScalarType::Uint32},
// 343 = V_MED3_F32
{InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32,
{InstClass::VectorFpArith32, InstCategory::VectorALU, 3, 1, ScalarType::Float32,
ScalarType::Float32},
// 344 = V_MED3_I32
{InstClass::VectorIntArith32, InstCategory::VectorALU, 3, 1, ScalarType::Sint32,
@ -2779,11 +2779,9 @@ constexpr std::array<InstFormat, 256> InstructionFormatDS = {{
// 60 = DS_READ_U16
{InstClass::DsIdxRd, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32},
// 61 = DS_CONSUME
{InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Undefined,
ScalarType::Undefined},
{InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32},
// 62 = DS_APPEND
{InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Undefined,
ScalarType::Undefined},
{InstClass::DsAppendCon, InstCategory::DataShare, 3, 1, ScalarType::Uint32, ScalarType::Uint32},
// 63 = DS_ORDERED_COUNT
{InstClass::GdsOrdCnt, InstCategory::DataShare, 3, 1, ScalarType::Undefined,
ScalarType::Undefined},

View File

@ -76,11 +76,11 @@ struct SMRD {
};
struct InstControlSOPK {
BitField<0, 16, u32> simm;
s16 simm;
};
struct InstControlSOPP {
BitField<0, 16, u32> simm;
s16 simm;
};
struct InstControlVOP3 {

View File

@ -600,13 +600,13 @@ public:
TranslatePass(ObjectPool<IR::Inst>& inst_pool_, ObjectPool<IR::Block>& block_pool_,
ObjectPool<Statement>& stmt_pool_, Statement& root_stmt,
IR::AbstractSyntaxList& syntax_list_, std::span<const GcnInst> inst_list_,
Info& info_)
Info& info_, const Profile& profile_)
: stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_},
syntax_list{syntax_list_}, inst_list{inst_list_}, info{info_} {
syntax_list{syntax_list_}, inst_list{inst_list_}, info{info_}, profile{profile_} {
Visit(root_stmt, nullptr, nullptr);
IR::Block& first_block{*syntax_list.front().data.block};
Translator{&first_block, info}.EmitPrologue();
Translator{&first_block, info, profile}.EmitPrologue();
}
private:
@ -635,7 +635,7 @@ private:
const u32 start = stmt.block->begin_index;
const u32 size = stmt.block->end_index - start + 1;
Translate(current_block, stmt.block->begin, inst_list.subspan(start, size),
info);
info, profile);
}
break;
}
@ -815,16 +815,18 @@ private:
const Block dummy_flow_block{.is_dummy = true};
std::span<const GcnInst> inst_list;
Info& info;
const Profile& profile;
};
} // Anonymous namespace
IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
CFG& cfg, Info& info) {
CFG& cfg, Info& info, const Profile& profile) {
ObjectPool<Statement> stmt_pool{64};
GotoPass goto_pass{cfg, stmt_pool};
Statement& root{goto_pass.RootStatement()};
IR::AbstractSyntaxList syntax_list;
TranslatePass{inst_pool, block_pool, stmt_pool, root, syntax_list, cfg.inst_list, info};
TranslatePass{inst_pool, block_pool, stmt_pool, root,
syntax_list, cfg.inst_list, info, profile};
ASSERT_MSG(!info.translation_failed, "Shader translation has failed");
return syntax_list;
}

View File

@ -11,12 +11,13 @@
namespace Shader {
struct Info;
}
struct Profile;
} // namespace Shader
namespace Shader::Gcn {
[[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool,
ObjectPool<IR::Block>& block_pool, CFG& cfg,
Info& info);
Info& info, const Profile& profile);
} // namespace Shader::Gcn

View File

@ -5,6 +5,31 @@
namespace Shader::Gcn {
void Translator::EmitDataShare(const GcnInst& inst) {
switch (inst.opcode) {
case Opcode::DS_SWIZZLE_B32:
return DS_SWIZZLE_B32(inst);
case Opcode::DS_READ_B32:
return DS_READ(32, false, false, inst);
case Opcode::DS_READ_B64:
return DS_READ(64, false, false, inst);
case Opcode::DS_READ2_B32:
return DS_READ(32, false, true, inst);
case Opcode::DS_READ2_B64:
return DS_READ(64, false, true, inst);
case Opcode::DS_WRITE_B32:
return DS_WRITE(32, false, false, inst);
case Opcode::DS_WRITE_B64:
return DS_WRITE(64, false, false, inst);
case Opcode::DS_WRITE2_B32:
return DS_WRITE(32, false, true, inst);
case Opcode::DS_WRITE2_B64:
return DS_WRITE(64, false, true, inst);
default:
LogMissingOpcode(inst);
}
}
void Translator::DS_SWIZZLE_B32(const GcnInst& inst) {
const u8 offset0 = inst.control.ds.offset0;
const u8 offset1 = inst.control.ds.offset1;
@ -20,14 +45,25 @@ void Translator::DS_SWIZZLE_B32(const GcnInst& inst) {
void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst) {
const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
const IR::VectorReg dst_reg{inst.dst[0].code};
IR::VectorReg dst_reg{inst.dst[0].code};
if (is_pair) {
// Pair loads are either 32 or 64-bit. We assume 32-bit for now.
ASSERT(bit_size == 32);
// Pair loads are either 32 or 64-bit
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
ir.SetVectorReg(dst_reg, IR::U32{ir.LoadShared(32, is_signed, addr0)});
const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0);
if (bit_size == 32) {
ir.SetVectorReg(dst_reg++, IR::U32{data0});
} else {
ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)});
ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)});
}
const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1)));
ir.SetVectorReg(dst_reg + 1, IR::U32{ir.LoadShared(32, is_signed, addr1)});
const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1);
if (bit_size == 32) {
ir.SetVectorReg(dst_reg++, IR::U32{data1});
} else {
ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 0)});
ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)});
}
} else if (bit_size == 64) {
const IR::Value data = ir.LoadShared(bit_size, is_signed, addr);
ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)});
@ -43,11 +79,22 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI
const IR::VectorReg data0{inst.src[1].code};
const IR::VectorReg data1{inst.src[2].code};
if (is_pair) {
ASSERT(bit_size == 32);
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
if (bit_size == 32) {
ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
} else {
ir.WriteShared(
64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)),
addr0);
}
const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1)));
ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
if (bit_size == 32) {
ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
} else {
ir.WriteShared(
64, ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1)),
addr1);
}
} else if (bit_size == 64) {
const IR::Value data =
ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
@ -62,7 +109,18 @@ void Translator::S_BARRIER() {
}
void Translator::V_READFIRSTLANE_B32(const GcnInst& inst) {
UNREACHABLE();
ASSERT(info.stage != Stage::Compute);
SetDst(inst.dst[0], GetSrc(inst.src[0]));
}
void Translator::V_READLANE_B32(const GcnInst& inst) {
ASSERT(info.stage != Stage::Compute);
SetDst(inst.dst[0], GetSrc(inst.src[0]));
}
void Translator::V_WRITELANE_B32(const GcnInst& inst) {
ASSERT(info.stage != Stage::Compute);
SetDst(inst.dst[0], GetSrc(inst.src[0]));
}
} // namespace Shader::Gcn

View File

@ -6,7 +6,7 @@
namespace Shader::Gcn {
void Translator::EXP(const GcnInst& inst) {
void Translator::EmitExport(const GcnInst& inst) {
if (ir.block->has_multiple_predecessors && info.stage == Stage::Fragment) {
LOG_WARNING(Render_Recompiler, "An ambiguous export appeared in translation");
ir.Discard(ir.LogicalNot(ir.GetExec()));

View File

@ -5,8 +5,102 @@
namespace Shader::Gcn {
void Translator::EmitScalarAlu(const GcnInst& inst) {
switch (inst.opcode) {
case Opcode::S_MOVK_I32:
return S_MOVK(inst);
case Opcode::S_MOV_B32:
return S_MOV(inst);
case Opcode::S_MUL_I32:
return S_MUL_I32(inst);
case Opcode::S_AND_SAVEEXEC_B64:
return S_AND_SAVEEXEC_B64(inst);
case Opcode::S_MOV_B64:
return S_MOV_B64(inst);
case Opcode::S_CMP_LT_U32:
return S_CMP(ConditionOp::LT, false, inst);
case Opcode::S_CMP_LE_U32:
return S_CMP(ConditionOp::LE, false, inst);
case Opcode::S_CMP_LG_U32:
return S_CMP(ConditionOp::LG, false, inst);
case Opcode::S_CMP_LT_I32:
return S_CMP(ConditionOp::LT, true, inst);
case Opcode::S_CMP_LG_I32:
return S_CMP(ConditionOp::LG, true, inst);
case Opcode::S_CMP_GT_I32:
return S_CMP(ConditionOp::GT, true, inst);
case Opcode::S_CMP_GE_I32:
return S_CMP(ConditionOp::GE, true, inst);
case Opcode::S_CMP_EQ_I32:
return S_CMP(ConditionOp::EQ, true, inst);
case Opcode::S_CMP_EQ_U32:
return S_CMP(ConditionOp::EQ, false, inst);
case Opcode::S_CMP_GE_U32:
return S_CMP(ConditionOp::GE, false, inst);
case Opcode::S_CMP_GT_U32:
return S_CMP(ConditionOp::GT, false, inst);
case Opcode::S_OR_B64:
return S_OR_B64(NegateMode::None, false, inst);
case Opcode::S_NOR_B64:
return S_OR_B64(NegateMode::Result, false, inst);
case Opcode::S_XOR_B64:
return S_OR_B64(NegateMode::None, true, inst);
case Opcode::S_ORN2_B64:
return S_OR_B64(NegateMode::Src1, false, inst);
case Opcode::S_AND_B64:
return S_AND_B64(NegateMode::None, inst);
case Opcode::S_NAND_B64:
return S_AND_B64(NegateMode::Result, inst);
case Opcode::S_ANDN2_B64:
return S_AND_B64(NegateMode::Src1, inst);
case Opcode::S_NOT_B64:
return S_NOT_B64(inst);
case Opcode::S_ADD_I32:
return S_ADD_I32(inst);
case Opcode::S_AND_B32:
return S_AND_B32(inst);
case Opcode::S_ASHR_I32:
return S_ASHR_I32(inst);
case Opcode::S_OR_B32:
return S_OR_B32(inst);
case Opcode::S_LSHL_B32:
return S_LSHL_B32(inst);
case Opcode::S_LSHR_B32:
return S_LSHR_B32(inst);
case Opcode::S_CSELECT_B32:
return S_CSELECT_B32(inst);
case Opcode::S_CSELECT_B64:
return S_CSELECT_B64(inst);
case Opcode::S_BFE_U32:
return S_BFE_U32(inst);
case Opcode::S_BFM_B32:
return S_BFM_B32(inst);
case Opcode::S_BREV_B32:
return S_BREV_B32(inst);
case Opcode::S_ADD_U32:
return S_ADD_U32(inst);
case Opcode::S_ADDC_U32:
return S_ADDC_U32(inst);
case Opcode::S_ADDK_I32:
return S_ADDK_I32(inst);
case Opcode::S_MULK_I32:
return S_MULK_I32(inst);
case Opcode::S_SUB_U32:
case Opcode::S_SUB_I32:
return S_SUB_U32(inst);
case Opcode::S_MIN_U32:
return S_MIN_U32(inst);
case Opcode::S_MAX_U32:
return S_MAX_U32(inst);
case Opcode::S_WQM_B64:
break;
default:
LogMissingOpcode(inst);
}
}
void Translator::S_MOVK(const GcnInst& inst) {
const auto simm16 = inst.control.sopk.simm.Value();
const auto simm16 = inst.control.sopk.simm;
if (simm16 & (1 << 15)) {
// TODO: need to verify the case of imm sign extension
UNREACHABLE();
@ -14,6 +108,16 @@ void Translator::S_MOVK(const GcnInst& inst) {
SetDst(inst.dst[0], ir.Imm32(simm16));
}
void Translator::S_ADDK_I32(const GcnInst& inst) {
const s32 simm16 = inst.control.sopk.simm;
SetDst(inst.dst[0], ir.IAdd(GetSrc(inst.dst[0]), ir.Imm32(simm16)));
}
void Translator::S_MULK_I32(const GcnInst& inst) {
const s32 simm16 = inst.control.sopk.simm;
SetDst(inst.dst[0], ir.IMul(GetSrc(inst.dst[0]), ir.Imm32(simm16)));
}
void Translator::S_MOV(const GcnInst& inst) {
SetDst(inst.dst[0], GetSrc(inst.src[0]));
}
@ -62,15 +166,10 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) {
}
}();
// Mark destination SPGR as an EXEC context. This means we will use 1-bit
// IR instruction whenever it's loaded.
switch (inst.dst[0].field) {
case OperandField::ScalarGPR: {
const u32 reg = inst.dst[0].code;
exec_contexts[reg] = true;
ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec);
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), exec);
break;
}
case OperandField::VccLo:
ir.SetVcc(exec);
break;
@ -79,27 +178,37 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) {
}
// Update EXEC.
ir.SetExec(ir.LogicalAnd(exec, src));
const IR::U1 result = ir.LogicalAnd(exec, src);
ir.SetExec(result);
ir.SetScc(result);
}
void Translator::S_MOV_B64(const GcnInst& inst) {
// TODO: Using VCC as EXEC context.
if (inst.src[0].field == OperandField::VccLo || inst.dst[0].field == OperandField::VccLo) {
return;
}
if (inst.dst[0].field == OperandField::ScalarGPR && inst.src[0].field == OperandField::ExecLo) {
// Exec context push
exec_contexts[inst.dst[0].code] = true;
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), ir.GetExec());
} else if (inst.dst[0].field == OperandField::ExecLo &&
inst.src[0].field == OperandField::ScalarGPR) {
// Exec context pop
exec_contexts[inst.src[0].code] = false;
ir.SetExec(ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code)));
} else if (inst.dst[0].field == OperandField::ExecLo &&
inst.src[0].field == OperandField::ConstZero) {
ir.SetExec(ir.Imm1(false));
} else {
const IR::U1 src = [&] {
switch (inst.src[0].field) {
case OperandField::VccLo:
return ir.GetVcc();
case OperandField::ExecLo:
return ir.GetExec();
case OperandField::ScalarGPR:
return ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code));
case OperandField::ConstZero:
return ir.Imm1(false);
default:
UNREACHABLE();
}
}();
switch (inst.dst[0].field) {
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), src);
break;
case OperandField::ExecLo:
ir.SetExec(src);
break;
case OperandField::VccLo:
ir.SetVcc(src);
break;
default:
UNREACHABLE();
}
}
@ -338,4 +447,20 @@ void Translator::S_ADDC_U32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), ir.GetSccLo()));
}
void Translator::S_MAX_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 result = ir.UMax(src0, src1);
SetDst(inst.dst[0], result);
ir.SetScc(ir.IEqual(result, src0));
}
void Translator::S_MIN_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 result = ir.UMin(src0, src1);
SetDst(inst.dst[0], result);
ir.SetScc(ir.IEqual(result, src0));
}
} // namespace Shader::Gcn

View File

@ -7,6 +7,29 @@ namespace Shader::Gcn {
static constexpr u32 SQ_SRC_LITERAL = 0xFF;
void Translator::EmitScalarMemory(const GcnInst& inst) {
switch (inst.opcode) {
case Opcode::S_LOAD_DWORDX4:
return S_LOAD_DWORD(4, inst);
case Opcode::S_LOAD_DWORDX8:
return S_LOAD_DWORD(8, inst);
case Opcode::S_LOAD_DWORDX16:
return S_LOAD_DWORD(16, inst);
case Opcode::S_BUFFER_LOAD_DWORD:
return S_BUFFER_LOAD_DWORD(1, inst);
case Opcode::S_BUFFER_LOAD_DWORDX2:
return S_BUFFER_LOAD_DWORD(2, inst);
case Opcode::S_BUFFER_LOAD_DWORDX4:
return S_BUFFER_LOAD_DWORD(4, inst);
case Opcode::S_BUFFER_LOAD_DWORDX8:
return S_BUFFER_LOAD_DWORD(8, inst);
case Opcode::S_BUFFER_LOAD_DWORDX16:
return S_BUFFER_LOAD_DWORD(16, inst);
default:
LogMissingOpcode(inst);
}
}
void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
const auto& smrd = inst.control.smrd;
const u32 dword_offset = [&] -> u32 {

View File

@ -16,13 +16,10 @@
namespace Shader::Gcn {
std::array<bool, IR::NumScalarRegs> Translator::exec_contexts{};
Translator::Translator(IR::Block* block_, Info& info_)
: ir{*block_, block_->begin()}, info{info_} {}
Translator::Translator(IR::Block* block_, Info& info_, const Profile& profile_)
: ir{*block_, block_->begin()}, info{info_}, profile{profile_} {}
void Translator::EmitPrologue() {
exec_contexts.fill(false);
ir.Prologue();
ir.SetExec(ir.Imm1(true));
@ -97,7 +94,7 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
}
break;
case OperandField::ConstZero:
if (force_flt) {
if (is_float) {
value = ir.Imm32(0.f);
} else {
value = ir.Imm32(0U);
@ -112,14 +109,14 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
value = ir.Imm32(-s32(operand.code) + SignedConstIntNegMin - 1);
break;
case OperandField::LiteralConst:
if (force_flt) {
if (is_float) {
value = ir.Imm32(std::bit_cast<float>(operand.code));
} else {
value = ir.Imm32(operand.code);
}
break;
case OperandField::ConstFloatPos_1_0:
if (force_flt) {
if (is_float) {
value = ir.Imm32(1.f);
} else {
value = ir.Imm32(std::bit_cast<u32>(1.f));
@ -138,7 +135,11 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
value = ir.Imm32(-0.5f);
break;
case OperandField::ConstFloatNeg_1_0:
value = ir.Imm32(-1.0f);
if (is_float) {
value = ir.Imm32(-1.0f);
} else {
value = ir.Imm32(std::bit_cast<u32>(-1.0f));
}
break;
case OperandField::ConstFloatNeg_2_0:
value = ir.Imm32(-2.0f);
@ -160,6 +161,8 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
value = ir.GetVccHi();
}
break;
case OperandField::M0:
return m0_value;
default:
UNREACHABLE();
}
@ -336,6 +339,7 @@ void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) {
case OperandField::VccHi:
return ir.SetVccHi(result);
case OperandField::M0:
m0_value = result;
break;
default:
UNREACHABLE();
@ -458,712 +462,84 @@ void Translator::EmitFetch(const GcnInst& inst) {
}
}
void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_list, Info& info) {
void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) {
switch (inst.opcode) {
case Opcode::S_BARRIER:
return S_BARRIER();
case Opcode::S_TTRACEDATA:
LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!");
return;
case Opcode::S_GETPC_B64:
return S_GETPC_B64(pc, inst);
case Opcode::S_WAITCNT:
case Opcode::S_NOP:
case Opcode::S_ENDPGM:
case Opcode::S_CBRANCH_EXECZ:
case Opcode::S_CBRANCH_SCC0:
case Opcode::S_CBRANCH_SCC1:
case Opcode::S_CBRANCH_VCCNZ:
case Opcode::S_CBRANCH_VCCZ:
case Opcode::S_BRANCH:
return;
default:
UNREACHABLE();
}
}
void Translator::LogMissingOpcode(const GcnInst& inst) {
const u32 opcode = u32(inst.opcode);
LOG_ERROR(Render_Recompiler, "Unknown opcode {} ({}, category = {})",
magic_enum::enum_name(inst.opcode), u32(inst.opcode),
magic_enum::enum_name(inst.category));
info.translation_failed = true;
}
void Translate(IR::Block* block, u32 pc, std::span<const GcnInst> inst_list, Info& info,
const Profile& profile) {
if (inst_list.empty()) {
return;
}
Translator translator{block, info};
Translator translator{block, info, profile};
for (const auto& inst : inst_list) {
block_base += inst.length;
switch (inst.opcode) {
case Opcode::S_MOVK_I32:
translator.S_MOVK(inst);
break;
case Opcode::S_MOV_B32:
translator.S_MOV(inst);
break;
case Opcode::S_MUL_I32:
translator.S_MUL_I32(inst);
break;
case Opcode::V_MAD_F32:
translator.V_MAD_F32(inst);
break;
case Opcode::V_MOV_B32:
translator.V_MOV(inst);
break;
case Opcode::V_MAC_F32:
translator.V_MAC_F32(inst);
break;
case Opcode::V_MUL_F32:
translator.V_MUL_F32(inst);
break;
case Opcode::V_AND_B32:
translator.V_AND_B32(inst);
break;
case Opcode::V_OR_B32:
translator.V_OR_B32(false, inst);
break;
case Opcode::V_XOR_B32:
translator.V_OR_B32(true, inst);
break;
case Opcode::V_LSHLREV_B32:
translator.V_LSHLREV_B32(inst);
break;
case Opcode::V_ADD_I32:
translator.V_ADD_I32(inst);
break;
case Opcode::V_ADDC_U32:
translator.V_ADDC_U32(inst);
break;
case Opcode::V_CVT_F32_I32:
translator.V_CVT_F32_I32(inst);
break;
case Opcode::V_CVT_F32_U32:
translator.V_CVT_F32_U32(inst);
break;
case Opcode::V_RCP_F32:
translator.V_RCP_F32(inst);
break;
case Opcode::S_SWAPPC_B64:
pc += inst.length;
// Special case for emitting fetch shader.
if (inst.opcode == Opcode::S_SWAPPC_B64) {
ASSERT(info.stage == Stage::Vertex);
translator.EmitFetch(inst);
break;
case Opcode::S_WAITCNT:
break;
case Opcode::S_LOAD_DWORDX4:
translator.S_LOAD_DWORD(4, inst);
break;
case Opcode::S_LOAD_DWORDX8:
translator.S_LOAD_DWORD(8, inst);
break;
case Opcode::S_LOAD_DWORDX16:
translator.S_LOAD_DWORD(16, inst);
break;
case Opcode::S_BUFFER_LOAD_DWORD:
translator.S_BUFFER_LOAD_DWORD(1, inst);
break;
case Opcode::S_BUFFER_LOAD_DWORDX2:
translator.S_BUFFER_LOAD_DWORD(2, inst);
break;
case Opcode::S_BUFFER_LOAD_DWORDX4:
translator.S_BUFFER_LOAD_DWORD(4, inst);
break;
case Opcode::S_BUFFER_LOAD_DWORDX8:
translator.S_BUFFER_LOAD_DWORD(8, inst);
break;
case Opcode::S_BUFFER_LOAD_DWORDX16:
translator.S_BUFFER_LOAD_DWORD(16, inst);
break;
case Opcode::EXP:
translator.EXP(inst);
break;
case Opcode::V_INTERP_P2_F32:
translator.V_INTERP_P2_F32(inst);
break;
case Opcode::V_CVT_PKRTZ_F16_F32:
translator.V_CVT_PKRTZ_F16_F32(inst);
break;
case Opcode::V_CVT_F32_F16:
translator.V_CVT_F32_F16(inst);
break;
case Opcode::V_CVT_F32_UBYTE0:
translator.V_CVT_F32_UBYTE(0, inst);
break;
case Opcode::V_CVT_F32_UBYTE1:
translator.V_CVT_F32_UBYTE(1, inst);
break;
case Opcode::V_CVT_F32_UBYTE2:
translator.V_CVT_F32_UBYTE(2, inst);
break;
case Opcode::V_CVT_F32_UBYTE3:
translator.V_CVT_F32_UBYTE(3, inst);
break;
case Opcode::V_BFREV_B32:
translator.V_BFREV_B32(inst);
break;
case Opcode::V_LDEXP_F32:
translator.V_LDEXP_F32(inst);
break;
case Opcode::V_FRACT_F32:
translator.V_FRACT_F32(inst);
break;
case Opcode::V_ADD_F32:
translator.V_ADD_F32(inst);
break;
case Opcode::V_CVT_OFF_F32_I4:
translator.V_CVT_OFF_F32_I4(inst);
break;
case Opcode::V_MED3_F32:
translator.V_MED3_F32(inst);
break;
case Opcode::V_FLOOR_F32:
translator.V_FLOOR_F32(inst);
break;
case Opcode::V_SUB_F32:
translator.V_SUB_F32(inst);
break;
case Opcode::V_FMA_F32:
case Opcode::V_MADAK_F32: // Yes these can share the opcode
translator.V_FMA_F32(inst);
break;
case Opcode::IMAGE_SAMPLE_LZ_O:
case Opcode::IMAGE_SAMPLE_O:
case Opcode::IMAGE_SAMPLE_C:
case Opcode::IMAGE_SAMPLE_C_LZ:
case Opcode::IMAGE_SAMPLE_LZ:
case Opcode::IMAGE_SAMPLE:
case Opcode::IMAGE_SAMPLE_L:
case Opcode::IMAGE_SAMPLE_C_O:
case Opcode::IMAGE_SAMPLE_B:
case Opcode::IMAGE_SAMPLE_C_LZ_O:
translator.IMAGE_SAMPLE(inst);
break;
case Opcode::IMAGE_ATOMIC_ADD:
translator.IMAGE_ATOMIC(AtomicOp::Add, inst);
break;
case Opcode::IMAGE_ATOMIC_AND:
translator.IMAGE_ATOMIC(AtomicOp::And, inst);
break;
case Opcode::IMAGE_ATOMIC_OR:
translator.IMAGE_ATOMIC(AtomicOp::Or, inst);
break;
case Opcode::IMAGE_ATOMIC_XOR:
translator.IMAGE_ATOMIC(AtomicOp::Xor, inst);
break;
case Opcode::IMAGE_ATOMIC_UMAX:
translator.IMAGE_ATOMIC(AtomicOp::Umax, inst);
break;
case Opcode::IMAGE_ATOMIC_SMAX:
translator.IMAGE_ATOMIC(AtomicOp::Smax, inst);
break;
case Opcode::IMAGE_ATOMIC_UMIN:
translator.IMAGE_ATOMIC(AtomicOp::Umin, inst);
break;
case Opcode::IMAGE_ATOMIC_SMIN:
translator.IMAGE_ATOMIC(AtomicOp::Smin, inst);
break;
case Opcode::IMAGE_ATOMIC_INC:
translator.IMAGE_ATOMIC(AtomicOp::Inc, inst);
break;
case Opcode::IMAGE_ATOMIC_DEC:
translator.IMAGE_ATOMIC(AtomicOp::Dec, inst);
break;
case Opcode::IMAGE_GET_LOD:
translator.IMAGE_GET_LOD(inst);
break;
case Opcode::IMAGE_GATHER4_C:
case Opcode::IMAGE_GATHER4_LZ:
case Opcode::IMAGE_GATHER4_LZ_O:
translator.IMAGE_GATHER(inst);
break;
case Opcode::IMAGE_STORE:
translator.IMAGE_STORE(inst);
break;
case Opcode::IMAGE_LOAD_MIP:
translator.IMAGE_LOAD(true, inst);
break;
case Opcode::IMAGE_LOAD:
translator.IMAGE_LOAD(false, inst);
break;
case Opcode::V_MAD_U64_U32:
translator.V_MAD_U64_U32(inst);
break;
case Opcode::V_CMP_GE_I32:
translator.V_CMP_U32(ConditionOp::GE, true, false, inst);
break;
case Opcode::V_CMP_EQ_I32:
translator.V_CMP_U32(ConditionOp::EQ, true, false, inst);
break;
case Opcode::V_CMP_LE_I32:
translator.V_CMP_U32(ConditionOp::LE, true, false, inst);
break;
case Opcode::V_CMP_NE_I32:
translator.V_CMP_U32(ConditionOp::LG, true, false, inst);
break;
case Opcode::V_CMP_NE_U32:
translator.V_CMP_U32(ConditionOp::LG, false, false, inst);
break;
case Opcode::V_CMP_EQ_U32:
translator.V_CMP_U32(ConditionOp::EQ, false, false, inst);
break;
case Opcode::V_CMP_F_U32:
translator.V_CMP_U32(ConditionOp::F, false, false, inst);
break;
case Opcode::V_CMP_LT_U32:
translator.V_CMP_U32(ConditionOp::LT, false, false, inst);
break;
case Opcode::V_CMP_GT_U32:
translator.V_CMP_U32(ConditionOp::GT, false, false, inst);
break;
case Opcode::V_CMP_GE_U32:
translator.V_CMP_U32(ConditionOp::GE, false, false, inst);
break;
case Opcode::V_CMP_TRU_U32:
translator.V_CMP_U32(ConditionOp::TRU, false, false, inst);
break;
case Opcode::V_CMP_NEQ_F32:
translator.V_CMP_F32(ConditionOp::LG, false, inst);
break;
case Opcode::V_CMP_F_F32:
translator.V_CMP_F32(ConditionOp::F, false, inst);
break;
case Opcode::V_CMP_LT_F32:
translator.V_CMP_F32(ConditionOp::LT, false, inst);
break;
case Opcode::V_CMP_EQ_F32:
translator.V_CMP_F32(ConditionOp::EQ, false, inst);
break;
case Opcode::V_CMP_LE_F32:
translator.V_CMP_F32(ConditionOp::LE, false, inst);
break;
case Opcode::V_CMP_GT_F32:
translator.V_CMP_F32(ConditionOp::GT, false, inst);
break;
case Opcode::V_CMP_LG_F32:
translator.V_CMP_F32(ConditionOp::LG, false, inst);
break;
case Opcode::V_CMP_GE_F32:
translator.V_CMP_F32(ConditionOp::GE, false, inst);
break;
case Opcode::V_CMP_NLE_F32:
translator.V_CMP_F32(ConditionOp::GT, false, inst);
break;
case Opcode::V_CMP_NLT_F32:
translator.V_CMP_F32(ConditionOp::GE, false, inst);
break;
case Opcode::V_CMP_NGT_F32:
translator.V_CMP_F32(ConditionOp::LE, false, inst);
break;
case Opcode::V_CMP_NGE_F32:
translator.V_CMP_F32(ConditionOp::LT, false, inst);
break;
case Opcode::S_CMP_LT_U32:
translator.S_CMP(ConditionOp::LT, false, inst);
break;
case Opcode::S_CMP_LE_U32:
translator.S_CMP(ConditionOp::LE, false, inst);
break;
case Opcode::S_CMP_LG_U32:
translator.S_CMP(ConditionOp::LG, false, inst);
break;
case Opcode::S_CMP_LT_I32:
translator.S_CMP(ConditionOp::LT, true, inst);
break;
case Opcode::S_CMP_LG_I32:
translator.S_CMP(ConditionOp::LG, true, inst);
break;
case Opcode::S_CMP_GT_I32:
translator.S_CMP(ConditionOp::GT, true, inst);
break;
case Opcode::S_CMP_GE_I32:
translator.S_CMP(ConditionOp::GE, true, inst);
break;
case Opcode::S_CMP_EQ_I32:
translator.S_CMP(ConditionOp::EQ, true, inst);
break;
case Opcode::S_CMP_EQ_U32:
translator.S_CMP(ConditionOp::EQ, false, inst);
break;
case Opcode::S_LSHL_B32:
translator.S_LSHL_B32(inst);
break;
case Opcode::V_CNDMASK_B32:
translator.V_CNDMASK_B32(inst);
break;
case Opcode::TBUFFER_LOAD_FORMAT_X:
translator.BUFFER_LOAD_FORMAT(1, true, true, inst);
break;
case Opcode::TBUFFER_LOAD_FORMAT_XY:
translator.BUFFER_LOAD_FORMAT(2, true, true, inst);
break;
case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
translator.BUFFER_LOAD_FORMAT(3, true, true, inst);
break;
case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
translator.BUFFER_LOAD_FORMAT(4, true, true, inst);
break;
case Opcode::BUFFER_LOAD_FORMAT_X:
translator.BUFFER_LOAD_FORMAT(1, false, true, inst);
break;
case Opcode::BUFFER_LOAD_FORMAT_XY:
translator.BUFFER_LOAD_FORMAT(2, false, true, inst);
break;
case Opcode::BUFFER_LOAD_FORMAT_XYZ:
translator.BUFFER_LOAD_FORMAT(3, false, true, inst);
break;
case Opcode::BUFFER_LOAD_FORMAT_XYZW:
translator.BUFFER_LOAD_FORMAT(4, false, true, inst);
break;
case Opcode::BUFFER_LOAD_DWORD:
translator.BUFFER_LOAD_FORMAT(1, false, false, inst);
break;
case Opcode::BUFFER_LOAD_DWORDX2:
translator.BUFFER_LOAD_FORMAT(2, false, false, inst);
break;
case Opcode::BUFFER_LOAD_DWORDX3:
translator.BUFFER_LOAD_FORMAT(3, false, false, inst);
break;
case Opcode::BUFFER_LOAD_DWORDX4:
translator.BUFFER_LOAD_FORMAT(4, false, false, inst);
break;
case Opcode::BUFFER_STORE_FORMAT_X:
case Opcode::BUFFER_STORE_DWORD:
translator.BUFFER_STORE_FORMAT(1, false, inst);
break;
case Opcode::BUFFER_STORE_DWORDX2:
translator.BUFFER_STORE_FORMAT(2, false, inst);
break;
case Opcode::BUFFER_STORE_DWORDX3:
translator.BUFFER_STORE_FORMAT(3, false, inst);
break;
case Opcode::BUFFER_STORE_FORMAT_XYZW:
case Opcode::BUFFER_STORE_DWORDX4:
translator.BUFFER_STORE_FORMAT(4, false, inst);
break;
case Opcode::V_MAX_F32:
translator.V_MAX_F32(inst);
break;
case Opcode::V_MAX_I32:
translator.V_MAX_U32(true, inst);
break;
case Opcode::V_MAX_U32:
translator.V_MAX_U32(false, inst);
break;
case Opcode::V_NOT_B32:
translator.V_NOT_B32(inst);
break;
case Opcode::V_RSQ_F32:
translator.V_RSQ_F32(inst);
break;
case Opcode::S_ANDN2_B64:
translator.S_AND_B64(NegateMode::Src1, inst);
break;
case Opcode::S_ORN2_B64:
translator.S_OR_B64(NegateMode::Src1, false, inst);
break;
case Opcode::V_SIN_F32:
translator.V_SIN_F32(inst);
break;
case Opcode::V_COS_F32:
translator.V_COS_F32(inst);
break;
case Opcode::V_LOG_F32:
translator.V_LOG_F32(inst);
break;
case Opcode::V_EXP_F32:
translator.V_EXP_F32(inst);
break;
case Opcode::V_SQRT_F32:
translator.V_SQRT_F32(inst);
break;
case Opcode::V_MIN_F32:
translator.V_MIN_F32(inst);
break;
case Opcode::V_MIN_I32:
translator.V_MIN_I32(inst);
break;
case Opcode::V_MIN3_F32:
translator.V_MIN3_F32(inst);
break;
case Opcode::V_MIN_LEGACY_F32:
translator.V_MIN_F32(inst, true);
break;
case Opcode::V_MADMK_F32:
translator.V_MADMK_F32(inst);
break;
case Opcode::V_CUBEMA_F32:
translator.V_CUBEMA_F32(inst);
break;
case Opcode::V_CUBESC_F32:
translator.V_CUBESC_F32(inst);
break;
case Opcode::V_CUBETC_F32:
translator.V_CUBETC_F32(inst);
break;
case Opcode::V_CUBEID_F32:
translator.V_CUBEID_F32(inst);
break;
case Opcode::V_CVT_U32_F32:
translator.V_CVT_U32_F32(inst);
break;
case Opcode::V_CVT_I32_F32:
translator.V_CVT_I32_F32(inst);
break;
case Opcode::V_CVT_FLR_I32_F32:
translator.V_CVT_FLR_I32_F32(inst);
break;
case Opcode::V_SUBREV_F32:
translator.V_SUBREV_F32(inst);
break;
case Opcode::S_AND_SAVEEXEC_B64:
translator.S_AND_SAVEEXEC_B64(inst);
break;
case Opcode::S_MOV_B64:
translator.S_MOV_B64(inst);
break;
case Opcode::V_SUBREV_I32:
translator.V_SUBREV_I32(inst);
break;
continue;
}
case Opcode::V_CMPX_F_F32:
translator.V_CMP_F32(ConditionOp::F, true, inst);
// Emit instructions for each category.
switch (inst.category) {
case InstCategory::DataShare:
translator.EmitDataShare(inst);
break;
case Opcode::V_CMPX_LT_F32:
translator.V_CMP_F32(ConditionOp::LT, true, inst);
case InstCategory::VectorInterpolation:
translator.EmitVectorInterpolation(inst);
break;
case Opcode::V_CMPX_EQ_F32:
translator.V_CMP_F32(ConditionOp::EQ, true, inst);
case InstCategory::ScalarMemory:
translator.EmitScalarMemory(inst);
break;
case Opcode::V_CMPX_LE_F32:
translator.V_CMP_F32(ConditionOp::LE, true, inst);
case InstCategory::VectorMemory:
translator.EmitVectorMemory(inst);
break;
case Opcode::V_CMPX_GT_F32:
translator.V_CMP_F32(ConditionOp::GT, true, inst);
case InstCategory::Export:
translator.EmitExport(inst);
break;
case Opcode::V_CMPX_LG_F32:
translator.V_CMP_F32(ConditionOp::LG, true, inst);
case InstCategory::FlowControl:
translator.EmitFlowControl(pc, inst);
break;
case Opcode::V_CMPX_GE_F32:
translator.V_CMP_F32(ConditionOp::GE, true, inst);
case InstCategory::ScalarALU:
translator.EmitScalarAlu(inst);
break;
case Opcode::V_CMPX_NGE_F32:
translator.V_CMP_F32(ConditionOp::LT, true, inst);
case InstCategory::VectorALU:
translator.EmitVectorAlu(inst);
break;
case Opcode::V_CMPX_NLG_F32:
translator.V_CMP_F32(ConditionOp::EQ, true, inst);
break;
case Opcode::V_CMPX_NGT_F32:
translator.V_CMP_F32(ConditionOp::LE, true, inst);
break;
case Opcode::V_CMPX_NLE_F32:
translator.V_CMP_F32(ConditionOp::GT, true, inst);
break;
case Opcode::V_CMPX_NEQ_F32:
translator.V_CMP_F32(ConditionOp::LG, true, inst);
break;
case Opcode::V_CMPX_NLT_F32:
translator.V_CMP_F32(ConditionOp::GE, true, inst);
break;
case Opcode::V_CMPX_TRU_F32:
translator.V_CMP_F32(ConditionOp::TRU, true, inst);
break;
case Opcode::V_CMP_LE_U32:
translator.V_CMP_U32(ConditionOp::LE, false, false, inst);
break;
case Opcode::V_CMP_GT_I32:
translator.V_CMP_U32(ConditionOp::GT, true, false, inst);
break;
case Opcode::V_CMP_LT_I32:
translator.V_CMP_U32(ConditionOp::LT, true, false, inst);
break;
case Opcode::V_CMPX_LT_I32:
translator.V_CMP_U32(ConditionOp::LT, true, true, inst);
break;
case Opcode::V_CMPX_F_U32:
translator.V_CMP_U32(ConditionOp::F, false, true, inst);
break;
case Opcode::V_CMPX_LT_U32:
translator.V_CMP_U32(ConditionOp::LT, false, true, inst);
break;
case Opcode::V_CMPX_EQ_U32:
translator.V_CMP_U32(ConditionOp::EQ, false, true, inst);
break;
case Opcode::V_CMPX_LE_U32:
translator.V_CMP_U32(ConditionOp::LE, false, true, inst);
break;
case Opcode::V_CMPX_GT_U32:
translator.V_CMP_U32(ConditionOp::GT, false, true, inst);
break;
case Opcode::V_CMPX_NE_U32:
translator.V_CMP_U32(ConditionOp::LG, false, true, inst);
break;
case Opcode::V_CMPX_GE_U32:
translator.V_CMP_U32(ConditionOp::GE, false, true, inst);
break;
case Opcode::V_CMPX_TRU_U32:
translator.V_CMP_U32(ConditionOp::TRU, false, true, inst);
break;
case Opcode::S_OR_B64:
translator.S_OR_B64(NegateMode::None, false, inst);
break;
case Opcode::S_NOR_B64:
translator.S_OR_B64(NegateMode::Result, false, inst);
break;
case Opcode::S_XOR_B64:
translator.S_OR_B64(NegateMode::None, true, inst);
break;
case Opcode::S_AND_B64:
translator.S_AND_B64(NegateMode::None, inst);
break;
case Opcode::S_NOT_B64:
translator.S_NOT_B64(inst);
break;
case Opcode::S_NAND_B64:
translator.S_AND_B64(NegateMode::Result, inst);
break;
case Opcode::V_LSHRREV_B32:
translator.V_LSHRREV_B32(inst);
break;
case Opcode::S_ADD_I32:
translator.S_ADD_I32(inst);
break;
case Opcode::V_MUL_HI_U32:
translator.V_MUL_HI_U32(false, inst);
break;
case Opcode::V_MUL_LO_I32:
translator.V_MUL_LO_U32(inst);
break;
case Opcode::V_SAD_U32:
translator.V_SAD_U32(inst);
break;
case Opcode::V_BFE_U32:
translator.V_BFE_U32(false, inst);
break;
case Opcode::V_BFE_I32:
translator.V_BFE_U32(true, inst);
break;
case Opcode::V_MAD_I32_I24:
translator.V_MAD_I32_I24(inst);
break;
case Opcode::V_MUL_I32_I24:
case Opcode::V_MUL_U32_U24:
translator.V_MUL_I32_I24(inst);
break;
case Opcode::V_SUB_I32:
translator.V_SUB_I32(inst);
break;
case Opcode::V_LSHR_B32:
translator.V_LSHR_B32(inst);
break;
case Opcode::V_ASHRREV_I32:
translator.V_ASHRREV_I32(inst);
break;
case Opcode::V_MAD_U32_U24:
translator.V_MAD_U32_U24(inst);
break;
case Opcode::S_AND_B32:
translator.S_AND_B32(inst);
break;
case Opcode::S_ASHR_I32:
translator.S_ASHR_I32(inst);
break;
case Opcode::S_OR_B32:
translator.S_OR_B32(inst);
break;
case Opcode::S_LSHR_B32:
translator.S_LSHR_B32(inst);
break;
case Opcode::S_CSELECT_B32:
translator.S_CSELECT_B32(inst);
break;
case Opcode::S_CSELECT_B64:
translator.S_CSELECT_B64(inst);
break;
case Opcode::S_BFE_U32:
translator.S_BFE_U32(inst);
break;
case Opcode::V_RNDNE_F32:
translator.V_RNDNE_F32(inst);
break;
case Opcode::V_BCNT_U32_B32:
translator.V_BCNT_U32_B32(inst);
break;
case Opcode::V_MAX3_F32:
translator.V_MAX3_F32(inst);
break;
case Opcode::DS_SWIZZLE_B32:
translator.DS_SWIZZLE_B32(inst);
break;
case Opcode::V_MUL_LO_U32:
translator.V_MUL_LO_U32(inst);
break;
case Opcode::S_BFM_B32:
translator.S_BFM_B32(inst);
break;
case Opcode::V_MIN_U32:
translator.V_MIN_U32(inst);
break;
case Opcode::V_CMP_NE_U64:
translator.V_CMP_NE_U64(inst);
break;
case Opcode::V_CMP_CLASS_F32:
translator.V_CMP_CLASS_F32(inst);
break;
case Opcode::V_TRUNC_F32:
translator.V_TRUNC_F32(inst);
break;
case Opcode::V_CEIL_F32:
translator.V_CEIL_F32(inst);
break;
case Opcode::V_BFI_B32:
translator.V_BFI_B32(inst);
break;
case Opcode::S_BREV_B32:
translator.S_BREV_B32(inst);
break;
case Opcode::S_ADD_U32:
translator.S_ADD_U32(inst);
break;
case Opcode::S_ADDC_U32:
translator.S_ADDC_U32(inst);
break;
case Opcode::S_SUB_U32:
case Opcode::S_SUB_I32:
translator.S_SUB_U32(inst);
break;
// TODO: Separate implementation for legacy variants.
case Opcode::V_MUL_LEGACY_F32:
translator.V_MUL_F32(inst);
break;
case Opcode::V_MAC_LEGACY_F32:
translator.V_MAC_F32(inst);
break;
case Opcode::V_MAD_LEGACY_F32:
translator.V_MAD_F32(inst);
break;
case Opcode::V_MAX_LEGACY_F32:
translator.V_MAX_F32(inst, true);
break;
case Opcode::V_RSQ_LEGACY_F32:
case Opcode::V_RSQ_CLAMP_F32:
translator.V_RSQ_F32(inst);
break;
case Opcode::V_RCP_IFLAG_F32:
translator.V_RCP_F32(inst);
break;
case Opcode::IMAGE_GET_RESINFO:
translator.IMAGE_GET_RESINFO(inst);
break;
case Opcode::S_BARRIER:
translator.S_BARRIER();
break;
case Opcode::S_TTRACEDATA:
LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!");
break;
case Opcode::DS_READ_B32:
translator.DS_READ(32, false, false, inst);
break;
case Opcode::DS_READ2_B32:
translator.DS_READ(32, false, true, inst);
break;
case Opcode::DS_WRITE_B32:
translator.DS_WRITE(32, false, false, inst);
break;
case Opcode::DS_WRITE2_B32:
translator.DS_WRITE(32, false, true, inst);
break;
case Opcode::V_READFIRSTLANE_B32:
translator.V_READFIRSTLANE_B32(inst);
break;
case Opcode::S_GETPC_B64:
translator.S_GETPC_B64(block_base, inst);
break;
case Opcode::S_NOP:
case Opcode::S_CBRANCH_EXECZ:
case Opcode::S_CBRANCH_SCC0:
case Opcode::S_CBRANCH_SCC1:
case Opcode::S_CBRANCH_VCCNZ:
case Opcode::S_CBRANCH_VCCZ:
case Opcode::S_BRANCH:
case Opcode::S_WQM_B64:
case Opcode::V_INTERP_P1_F32:
case Opcode::S_ENDPGM:
case InstCategory::DebugProfile:
break;
default:
const u32 opcode = u32(inst.opcode);
LOG_ERROR(Render_Recompiler, "Unknown opcode {} ({})",
magic_enum::enum_name(inst.opcode), opcode);
info.translation_failed = true;
UNREACHABLE();
}
}
}

View File

@ -11,7 +11,8 @@
namespace Shader {
struct Info;
}
struct Profile;
} // namespace Shader
namespace Shader::Gcn {
@ -24,6 +25,7 @@ enum class ConditionOp : u32 {
LT,
LE,
TRU,
U,
};
enum class AtomicOp : u32 {
@ -53,10 +55,19 @@ enum class NegateMode : u32 {
class Translator {
public:
explicit Translator(IR::Block* block_, Info& info);
explicit Translator(IR::Block* block_, Info& info, const Profile& profile);
// Instruction categories
void EmitPrologue();
void EmitFetch(const GcnInst& inst);
void EmitDataShare(const GcnInst& inst);
void EmitVectorInterpolation(const GcnInst& inst);
void EmitScalarMemory(const GcnInst& inst);
void EmitVectorMemory(const GcnInst& inst);
void EmitExport(const GcnInst& inst);
void EmitFlowControl(u32 pc, const GcnInst& inst);
void EmitScalarAlu(const GcnInst& inst);
void EmitVectorAlu(const GcnInst& inst);
// Scalar ALU
void S_MOVK(const GcnInst& inst);
@ -83,6 +94,10 @@ public:
void S_SUB_U32(const GcnInst& inst);
void S_GETPC_B64(u32 pc, const GcnInst& inst);
void S_ADDC_U32(const GcnInst& inst);
void S_MULK_I32(const GcnInst& inst);
void S_ADDK_I32(const GcnInst& inst);
void S_MAX_U32(const GcnInst& inst);
void S_MIN_U32(const GcnInst& inst);
// Scalar Memory
void S_LOAD_DWORD(int num_dwords, const GcnInst& inst);
@ -94,11 +109,13 @@ public:
void V_MAC_F32(const GcnInst& inst);
void V_CVT_PKRTZ_F16_F32(const GcnInst& inst);
void V_CVT_F32_F16(const GcnInst& inst);
void V_CVT_F16_F32(const GcnInst& inst);
void V_MUL_F32(const GcnInst& inst);
void V_CNDMASK_B32(const GcnInst& inst);
void V_OR_B32(bool is_xor, const GcnInst& inst);
void V_AND_B32(const GcnInst& inst);
void V_LSHLREV_B32(const GcnInst& inst);
void V_LSHL_B32(const GcnInst& inst);
void V_ADD_I32(const GcnInst& inst);
void V_ADDC_U32(const GcnInst& inst);
void V_CVT_F32_I32(const GcnInst& inst);
@ -122,6 +139,7 @@ public:
void V_SQRT_F32(const GcnInst& inst);
void V_MIN_F32(const GcnInst& inst, bool is_legacy = false);
void V_MIN3_F32(const GcnInst& inst);
void V_MIN3_I32(const GcnInst& inst);
void V_MADMK_F32(const GcnInst& inst);
void V_CUBEMA_F32(const GcnInst& inst);
void V_CUBESC_F32(const GcnInst& inst);
@ -146,6 +164,7 @@ public:
void V_BCNT_U32_B32(const GcnInst& inst);
void V_COS_F32(const GcnInst& inst);
void V_MAX3_F32(const GcnInst& inst);
void V_MAX3_U32(const GcnInst& inst);
void V_CVT_I32_F32(const GcnInst& inst);
void V_MIN_I32(const GcnInst& inst);
void V_MUL_LO_U32(const GcnInst& inst);
@ -160,6 +179,8 @@ public:
void V_LDEXP_F32(const GcnInst& inst);
void V_CVT_FLR_I32_F32(const GcnInst& inst);
void V_CMP_CLASS_F32(const GcnInst& inst);
void V_FFBL_B32(const GcnInst& inst);
void V_MBCNT_U32_B32(bool is_low, const GcnInst& inst);
// Vector Memory
void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst);
@ -167,12 +188,15 @@ public:
// Vector interpolation
void V_INTERP_P2_F32(const GcnInst& inst);
void V_INTERP_MOV_F32(const GcnInst& inst);
// Data share
void DS_SWIZZLE_B32(const GcnInst& inst);
void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst);
void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst);
void V_READFIRSTLANE_B32(const GcnInst& inst);
void V_READLANE_B32(const GcnInst& inst);
void V_WRITELANE_B32(const GcnInst& inst);
void S_BARRIER();
// MIMG
@ -184,9 +208,6 @@ public:
void IMAGE_GET_LOD(const GcnInst& inst);
void IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst);
// Export
void EXP(const GcnInst& inst);
private:
template <typename T = IR::U32F32>
[[nodiscard]] T GetSrc(const InstOperand& operand, bool flt_zero = false);
@ -195,12 +216,17 @@ private:
void SetDst(const InstOperand& operand, const IR::U32F32& value);
void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw);
void LogMissingOpcode(const GcnInst& inst);
private:
IR::IREmitter ir;
Info& info;
static std::array<bool, IR::NumScalarRegs> exec_contexts;
const Profile& profile;
IR::U32 m0_value;
bool opcode_missing = false;
};
void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_list, Info& info);
void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_list, Info& info,
const Profile& profile);
} // namespace Shader::Gcn

View File

@ -2,9 +2,311 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include "shader_recompiler/frontend/translate/translate.h"
#include "shader_recompiler/profile.h"
namespace Shader::Gcn {
void Translator::EmitVectorAlu(const GcnInst& inst) {
switch (inst.opcode) {
case Opcode::V_LSHLREV_B32:
return V_LSHLREV_B32(inst);
case Opcode::V_LSHL_B32:
return V_LSHL_B32(inst);
case Opcode::V_BFREV_B32:
return V_BFREV_B32(inst);
case Opcode::V_BFE_U32:
return V_BFE_U32(false, inst);
case Opcode::V_BFE_I32:
return V_BFE_U32(true, inst);
case Opcode::V_BFI_B32:
return V_BFI_B32(inst);
case Opcode::V_LSHR_B32:
return V_LSHR_B32(inst);
case Opcode::V_ASHRREV_I32:
return V_ASHRREV_I32(inst);
case Opcode::V_LSHRREV_B32:
return V_LSHRREV_B32(inst);
case Opcode::V_NOT_B32:
return V_NOT_B32(inst);
case Opcode::V_AND_B32:
return V_AND_B32(inst);
case Opcode::V_OR_B32:
return V_OR_B32(false, inst);
case Opcode::V_XOR_B32:
return V_OR_B32(true, inst);
case Opcode::V_FFBL_B32:
return V_FFBL_B32(inst);
case Opcode::V_MOV_B32:
return V_MOV(inst);
case Opcode::V_ADD_I32:
return V_ADD_I32(inst);
case Opcode::V_ADDC_U32:
return V_ADDC_U32(inst);
case Opcode::V_CVT_F32_I32:
return V_CVT_F32_I32(inst);
case Opcode::V_CVT_F32_U32:
return V_CVT_F32_U32(inst);
case Opcode::V_CVT_PKRTZ_F16_F32:
return V_CVT_PKRTZ_F16_F32(inst);
case Opcode::V_CVT_F32_F16:
return V_CVT_F32_F16(inst);
case Opcode::V_CVT_F16_F32:
return V_CVT_F16_F32(inst);
case Opcode::V_CVT_F32_UBYTE0:
return V_CVT_F32_UBYTE(0, inst);
case Opcode::V_CVT_F32_UBYTE1:
return V_CVT_F32_UBYTE(1, inst);
case Opcode::V_CVT_F32_UBYTE2:
return V_CVT_F32_UBYTE(2, inst);
case Opcode::V_CVT_F32_UBYTE3:
return V_CVT_F32_UBYTE(3, inst);
case Opcode::V_CVT_OFF_F32_I4:
return V_CVT_OFF_F32_I4(inst);
case Opcode::V_MAD_U64_U32:
return V_MAD_U64_U32(inst);
case Opcode::V_CMP_GE_I32:
return V_CMP_U32(ConditionOp::GE, true, false, inst);
case Opcode::V_CMP_EQ_I32:
return V_CMP_U32(ConditionOp::EQ, true, false, inst);
case Opcode::V_CMP_LE_I32:
return V_CMP_U32(ConditionOp::LE, true, false, inst);
case Opcode::V_CMP_NE_I32:
return V_CMP_U32(ConditionOp::LG, true, false, inst);
case Opcode::V_CMP_NE_U32:
return V_CMP_U32(ConditionOp::LG, false, false, inst);
case Opcode::V_CMP_EQ_U32:
return V_CMP_U32(ConditionOp::EQ, false, false, inst);
case Opcode::V_CMP_F_U32:
return V_CMP_U32(ConditionOp::F, false, false, inst);
case Opcode::V_CMP_LT_U32:
return V_CMP_U32(ConditionOp::LT, false, false, inst);
case Opcode::V_CMP_GT_U32:
return V_CMP_U32(ConditionOp::GT, false, false, inst);
case Opcode::V_CMP_GE_U32:
return V_CMP_U32(ConditionOp::GE, false, false, inst);
case Opcode::V_CMP_TRU_U32:
return V_CMP_U32(ConditionOp::TRU, false, false, inst);
case Opcode::V_CMP_NEQ_F32:
return V_CMP_F32(ConditionOp::LG, false, inst);
case Opcode::V_CMP_F_F32:
return V_CMP_F32(ConditionOp::F, false, inst);
case Opcode::V_CMP_LT_F32:
return V_CMP_F32(ConditionOp::LT, false, inst);
case Opcode::V_CMP_EQ_F32:
return V_CMP_F32(ConditionOp::EQ, false, inst);
case Opcode::V_CMP_LE_F32:
return V_CMP_F32(ConditionOp::LE, false, inst);
case Opcode::V_CMP_GT_F32:
return V_CMP_F32(ConditionOp::GT, false, inst);
case Opcode::V_CMP_LG_F32:
return V_CMP_F32(ConditionOp::LG, false, inst);
case Opcode::V_CMP_GE_F32:
return V_CMP_F32(ConditionOp::GE, false, inst);
case Opcode::V_CMP_NLE_F32:
return V_CMP_F32(ConditionOp::GT, false, inst);
case Opcode::V_CMP_NLT_F32:
return V_CMP_F32(ConditionOp::GE, false, inst);
case Opcode::V_CMP_NGT_F32:
return V_CMP_F32(ConditionOp::LE, false, inst);
case Opcode::V_CMP_NGE_F32:
return V_CMP_F32(ConditionOp::LT, false, inst);
case Opcode::V_CMP_U_F32:
return V_CMP_F32(ConditionOp::U, false, inst);
case Opcode::V_CNDMASK_B32:
return V_CNDMASK_B32(inst);
case Opcode::V_MAX_I32:
return V_MAX_U32(true, inst);
case Opcode::V_MAX_U32:
return V_MAX_U32(false, inst);
case Opcode::V_MIN_I32:
return V_MIN_I32(inst);
case Opcode::V_CUBEMA_F32:
return V_CUBEMA_F32(inst);
case Opcode::V_CUBESC_F32:
return V_CUBESC_F32(inst);
case Opcode::V_CUBETC_F32:
return V_CUBETC_F32(inst);
case Opcode::V_CUBEID_F32:
return V_CUBEID_F32(inst);
case Opcode::V_CVT_U32_F32:
return V_CVT_U32_F32(inst);
case Opcode::V_CVT_I32_F32:
return V_CVT_I32_F32(inst);
case Opcode::V_CVT_FLR_I32_F32:
return V_CVT_FLR_I32_F32(inst);
case Opcode::V_SUBREV_I32:
return V_SUBREV_I32(inst);
case Opcode::V_MUL_HI_U32:
return V_MUL_HI_U32(false, inst);
case Opcode::V_MUL_LO_I32:
return V_MUL_LO_U32(inst);
case Opcode::V_SAD_U32:
return V_SAD_U32(inst);
case Opcode::V_SUB_I32:
return V_SUB_I32(inst);
case Opcode::V_MAD_I32_I24:
return V_MAD_I32_I24(inst);
case Opcode::V_MUL_I32_I24:
case Opcode::V_MUL_U32_U24:
return V_MUL_I32_I24(inst);
case Opcode::V_MAD_U32_U24:
return V_MAD_U32_U24(inst);
case Opcode::V_BCNT_U32_B32:
return V_BCNT_U32_B32(inst);
case Opcode::V_MUL_LO_U32:
return V_MUL_LO_U32(inst);
case Opcode::V_MIN_U32:
return V_MIN_U32(inst);
case Opcode::V_CMP_NE_U64:
return V_CMP_NE_U64(inst);
case Opcode::V_READFIRSTLANE_B32:
return V_READFIRSTLANE_B32(inst);
case Opcode::V_READLANE_B32:
return V_READLANE_B32(inst);
case Opcode::V_WRITELANE_B32:
return V_WRITELANE_B32(inst);
case Opcode::V_MAD_F32:
return V_MAD_F32(inst);
case Opcode::V_MAC_F32:
return V_MAC_F32(inst);
case Opcode::V_MUL_F32:
return V_MUL_F32(inst);
case Opcode::V_RCP_F32:
return V_RCP_F32(inst);
case Opcode::V_LDEXP_F32:
return V_LDEXP_F32(inst);
case Opcode::V_FRACT_F32:
return V_FRACT_F32(inst);
case Opcode::V_ADD_F32:
return V_ADD_F32(inst);
case Opcode::V_MED3_F32:
return V_MED3_F32(inst);
case Opcode::V_FLOOR_F32:
return V_FLOOR_F32(inst);
case Opcode::V_SUB_F32:
return V_SUB_F32(inst);
case Opcode::V_FMA_F32:
case Opcode::V_MADAK_F32:
return V_FMA_F32(inst);
case Opcode::V_MAX_F32:
return V_MAX_F32(inst);
case Opcode::V_RSQ_F32:
return V_RSQ_F32(inst);
case Opcode::V_SIN_F32:
return V_SIN_F32(inst);
case Opcode::V_COS_F32:
return V_COS_F32(inst);
case Opcode::V_LOG_F32:
return V_LOG_F32(inst);
case Opcode::V_EXP_F32:
return V_EXP_F32(inst);
case Opcode::V_SQRT_F32:
return V_SQRT_F32(inst);
case Opcode::V_MIN_F32:
return V_MIN_F32(inst, false);
case Opcode::V_MIN3_F32:
return V_MIN3_F32(inst);
case Opcode::V_MIN3_I32:
return V_MIN3_I32(inst);
case Opcode::V_MIN_LEGACY_F32:
return V_MIN_F32(inst, true);
case Opcode::V_MADMK_F32:
return V_MADMK_F32(inst);
case Opcode::V_SUBREV_F32:
return V_SUBREV_F32(inst);
case Opcode::V_RNDNE_F32:
return V_RNDNE_F32(inst);
case Opcode::V_MAX3_F32:
return V_MAX3_F32(inst);
case Opcode::V_MAX3_U32:
return V_MAX3_U32(inst);
case Opcode::V_TRUNC_F32:
return V_TRUNC_F32(inst);
case Opcode::V_CEIL_F32:
return V_CEIL_F32(inst);
case Opcode::V_MUL_LEGACY_F32:
return V_MUL_F32(inst);
case Opcode::V_MAC_LEGACY_F32:
return V_MAC_F32(inst);
case Opcode::V_MAD_LEGACY_F32:
return V_MAD_F32(inst);
case Opcode::V_MAX_LEGACY_F32:
return V_MAX_F32(inst, true);
case Opcode::V_RSQ_LEGACY_F32:
case Opcode::V_RSQ_CLAMP_F32:
return V_RSQ_F32(inst);
case Opcode::V_RCP_IFLAG_F32:
return V_RCP_F32(inst);
case Opcode::V_CMPX_F_F32:
return V_CMP_F32(ConditionOp::F, true, inst);
case Opcode::V_CMPX_LT_F32:
return V_CMP_F32(ConditionOp::LT, true, inst);
case Opcode::V_CMPX_EQ_F32:
return V_CMP_F32(ConditionOp::EQ, true, inst);
case Opcode::V_CMPX_LE_F32:
return V_CMP_F32(ConditionOp::LE, true, inst);
case Opcode::V_CMPX_GT_F32:
return V_CMP_F32(ConditionOp::GT, true, inst);
case Opcode::V_CMPX_LG_F32:
return V_CMP_F32(ConditionOp::LG, true, inst);
case Opcode::V_CMPX_GE_F32:
return V_CMP_F32(ConditionOp::GE, true, inst);
case Opcode::V_CMPX_NGE_F32:
return V_CMP_F32(ConditionOp::LT, true, inst);
case Opcode::V_CMPX_NLG_F32:
return V_CMP_F32(ConditionOp::EQ, true, inst);
case Opcode::V_CMPX_NGT_F32:
return V_CMP_F32(ConditionOp::LE, true, inst);
case Opcode::V_CMPX_NLE_F32:
return V_CMP_F32(ConditionOp::GT, true, inst);
case Opcode::V_CMPX_NEQ_F32:
return V_CMP_F32(ConditionOp::LG, true, inst);
case Opcode::V_CMPX_NLT_F32:
return V_CMP_F32(ConditionOp::GE, true, inst);
case Opcode::V_CMPX_TRU_F32:
return V_CMP_F32(ConditionOp::TRU, true, inst);
case Opcode::V_CMP_CLASS_F32:
return V_CMP_CLASS_F32(inst);
case Opcode::V_CMP_LE_U32:
return V_CMP_U32(ConditionOp::LE, false, false, inst);
case Opcode::V_CMP_GT_I32:
return V_CMP_U32(ConditionOp::GT, true, false, inst);
case Opcode::V_CMP_LT_I32:
return V_CMP_U32(ConditionOp::LT, true, false, inst);
case Opcode::V_CMPX_LT_I32:
return V_CMP_U32(ConditionOp::LT, true, true, inst);
case Opcode::V_CMPX_F_U32:
return V_CMP_U32(ConditionOp::F, false, true, inst);
case Opcode::V_CMPX_LT_U32:
return V_CMP_U32(ConditionOp::LT, false, true, inst);
case Opcode::V_CMPX_EQ_U32:
return V_CMP_U32(ConditionOp::EQ, false, true, inst);
case Opcode::V_CMPX_LE_U32:
return V_CMP_U32(ConditionOp::LE, false, true, inst);
case Opcode::V_CMPX_GT_U32:
return V_CMP_U32(ConditionOp::GT, false, true, inst);
case Opcode::V_CMPX_NE_U32:
return V_CMP_U32(ConditionOp::LG, false, true, inst);
case Opcode::V_CMPX_GE_U32:
return V_CMP_U32(ConditionOp::GE, false, true, inst);
case Opcode::V_CMPX_TRU_U32:
return V_CMP_U32(ConditionOp::TRU, false, true, inst);
case Opcode::V_CMPX_LG_I32:
return V_CMP_U32(ConditionOp::LG, true, true, inst);
case Opcode::V_MBCNT_LO_U32_B32:
return V_MBCNT_U32_B32(true, inst);
case Opcode::V_MBCNT_HI_U32_B32:
return V_MBCNT_U32_B32(false, inst);
default:
LogMissingOpcode(inst);
}
}
void Translator::V_MOV(const GcnInst& inst) {
SetDst(inst.dst[0], GetSrc(inst.src[0]));
}
@ -32,6 +334,12 @@ void Translator::V_CVT_F32_F16(const GcnInst& inst) {
SetDst(inst.dst[0], ir.FPConvert(32, ir.BitCast<IR::F16>(src0l)));
}
void Translator::V_CVT_F16_F32(const GcnInst& inst) {
const IR::F32 src0 = GetSrc(inst.src[0], true);
const IR::F16 src0fp16 = ir.FPConvert(16, src0);
SetDst(inst.dst[0], ir.UConvert(32, ir.BitCast<IR::U16>(src0fp16)));
}
void Translator::V_MUL_F32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.FPMul(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true)));
}
@ -85,6 +393,12 @@ void Translator::V_LSHLREV_B32(const GcnInst& inst) {
ir.SetVectorReg(dst_reg, ir.ShiftLeftLogical(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F))));
}
void Translator::V_LSHL_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
SetDst(inst.dst[0], ir.ShiftLeftLogical(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F))));
}
void Translator::V_ADD_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{ir.GetVectorReg(IR::VectorReg(inst.src[1].code))};
@ -208,6 +522,8 @@ void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) {
return ir.FPLessThanEqual(src0, src1);
case ConditionOp::GE:
return ir.FPGreaterThanEqual(src0, src1);
case ConditionOp::U:
return ir.LogicalNot(ir.LogicalAnd(ir.FPIsNan(src0), ir.FPIsNan(src1)));
default:
UNREACHABLE();
}
@ -278,6 +594,13 @@ void Translator::V_MIN3_F32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.FPMin(src0, ir.FPMin(src1, src2)));
}
void Translator::V_MIN3_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 src2{GetSrc(inst.src[2])};
SetDst(inst.dst[0], ir.SMin(src0, ir.SMin(src1, src2)));
}
void Translator::V_MADMK_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1], true)};
@ -320,12 +643,13 @@ void Translator::V_SUBREV_I32(const GcnInst& inst) {
}
void Translator::V_MAD_U64_U32(const GcnInst& inst) {
const auto src0 = GetSrc<IR::U32>(inst.src[0]);
const auto src1 = GetSrc<IR::U32>(inst.src[1]);
const auto src2 = GetSrc64<IR::U64>(inst.src[2]);
const IR::U64 mul_result = ir.UConvert(64, ir.IMul(src0, src1));
// const IR::U64 mul_result = ir.UConvert(64, ir.IMul(src0, src1));
const IR::U64 mul_result =
ir.PackUint2x32(ir.CompositeConstruct(ir.IMul(src0, src1), ir.Imm32(0U)));
const IR::U64 sum_result = ir.IAdd(mul_result, src2);
SetDst64(inst.dst[0], sum_result);
@ -463,6 +787,13 @@ void Translator::V_MAX3_F32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2)));
}
void Translator::V_MAX3_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 src2{GetSrc(inst.src[2])};
SetDst(inst.dst[0], ir.UMax(src0, ir.UMax(src1, src2)));
}
void Translator::V_CVT_I32_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
SetDst(inst.dst[0], ir.ConvertFToS(32, src0));
@ -561,38 +892,58 @@ void Translator::V_CVT_FLR_I32_F32(const GcnInst& inst) {
}
void Translator::V_CMP_CLASS_F32(const GcnInst& inst) {
constexpr u32 SIGNALING_NAN = 1 << 0;
constexpr u32 QUIET_NAN = 1 << 1;
constexpr u32 NEGATIVE_INFINITY = 1 << 2;
constexpr u32 NEGATIVE_NORMAL = 1 << 3;
constexpr u32 NEGATIVE_DENORM = 1 << 4;
constexpr u32 NEGATIVE_ZERO = 1 << 5;
constexpr u32 POSITIVE_ZERO = 1 << 6;
constexpr u32 POSITIVE_DENORM = 1 << 7;
constexpr u32 POSITIVE_NORMAL = 1 << 8;
constexpr u32 POSITIVE_INFINITY = 1 << 9;
const IR::F32F64 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
IR::U1 value;
if (src1.IsImmediate()) {
const u32 class_mask = src1.U32();
IR::U1 value;
if ((class_mask & (SIGNALING_NAN | QUIET_NAN)) == (SIGNALING_NAN | QUIET_NAN)) {
const auto class_mask = static_cast<IR::FloatClassFunc>(src1.U32());
if ((class_mask & IR::FloatClassFunc::NaN) == IR::FloatClassFunc::NaN) {
value = ir.FPIsNan(src0);
} else if ((class_mask & (POSITIVE_INFINITY | NEGATIVE_INFINITY)) ==
(POSITIVE_INFINITY | NEGATIVE_INFINITY)) {
} else if ((class_mask & IR::FloatClassFunc::Infinity) == IR::FloatClassFunc::Infinity) {
value = ir.FPIsInf(src0);
} else {
UNREACHABLE();
}
if (inst.dst[1].field == OperandField::VccLo) {
return ir.SetVcc(value);
} else {
UNREACHABLE();
}
} else {
// We don't know the type yet, delay its resolution.
value = ir.FPCmpClass32(src0, src1);
}
switch (inst.dst[1].field) {
case OperandField::VccLo:
return ir.SetVcc(value);
default:
UNREACHABLE();
}
}
void Translator::V_FFBL_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
SetDst(inst.dst[0], ir.FindILsb(src0));
}
void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 lane_id = ir.LaneId();
const auto [warp_half, mask_shift] = [&]() -> std::pair<IR::U32, IR::U32> {
if (profile.subgroup_size == 32) {
const IR::U32 warp_half = ir.BitwiseAnd(ir.WarpId(), ir.Imm32(1));
return std::make_pair(warp_half, lane_id);
}
const IR::U32 warp_half = ir.ShiftRightLogical(lane_id, ir.Imm32(5));
const IR::U32 mask_shift = ir.BitwiseAnd(lane_id, ir.Imm32(0x1F));
return std::make_pair(warp_half, mask_shift);
}();
const IR::U32 thread_mask = ir.ISub(ir.ShiftLeftLogical(ir.Imm32(1), mask_shift), ir.Imm32(1));
const IR::U1 is_odd_warp = ir.INotEqual(warp_half, ir.Imm32(0));
const IR::U32 mask = IR::U32{ir.Select(is_odd_warp, is_low ? ir.Imm32(~0U) : thread_mask,
is_low ? thread_mask : ir.Imm32(0))};
const IR::U32 masked_value = ir.BitwiseAnd(src0, mask);
const IR::U32 result = ir.IAdd(src1, ir.BitCount(masked_value));
SetDst(inst.dst[0], result);
}
} // namespace Shader::Gcn

View File

@ -12,4 +12,24 @@ void Translator::V_INTERP_P2_F32(const GcnInst& inst) {
ir.SetVectorReg(dst_reg, ir.GetAttribute(attrib, inst.control.vintrp.chan));
}
void Translator::V_INTERP_MOV_F32(const GcnInst& inst) {
const IR::VectorReg dst_reg{inst.dst[0].code};
auto& attr = info.ps_inputs.at(inst.control.vintrp.attr);
const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index};
ir.SetVectorReg(dst_reg, ir.GetAttribute(attrib, inst.control.vintrp.chan));
}
void Translator::EmitVectorInterpolation(const GcnInst& inst) {
switch (inst.opcode) {
case Opcode::V_INTERP_P1_F32:
return;
case Opcode::V_INTERP_P2_F32:
return V_INTERP_P2_F32(inst);
case Opcode::V_INTERP_MOV_F32:
return V_INTERP_MOV_F32(inst);
default:
LogMissingOpcode(inst);
}
}
} // namespace Shader::Gcn

View File

@ -5,9 +5,96 @@
namespace Shader::Gcn {
void Translator::EmitVectorMemory(const GcnInst& inst) {
switch (inst.opcode) {
case Opcode::IMAGE_SAMPLE_LZ_O:
case Opcode::IMAGE_SAMPLE_O:
case Opcode::IMAGE_SAMPLE_C:
case Opcode::IMAGE_SAMPLE_C_LZ:
case Opcode::IMAGE_SAMPLE_LZ:
case Opcode::IMAGE_SAMPLE:
case Opcode::IMAGE_SAMPLE_L:
case Opcode::IMAGE_SAMPLE_C_O:
case Opcode::IMAGE_SAMPLE_B:
case Opcode::IMAGE_SAMPLE_C_LZ_O:
return IMAGE_SAMPLE(inst);
case Opcode::IMAGE_GATHER4_C:
case Opcode::IMAGE_GATHER4_LZ:
case Opcode::IMAGE_GATHER4_LZ_O:
return IMAGE_GATHER(inst);
case Opcode::IMAGE_ATOMIC_ADD:
return IMAGE_ATOMIC(AtomicOp::Add, inst);
case Opcode::IMAGE_ATOMIC_AND:
return IMAGE_ATOMIC(AtomicOp::And, inst);
case Opcode::IMAGE_ATOMIC_OR:
return IMAGE_ATOMIC(AtomicOp::Or, inst);
case Opcode::IMAGE_ATOMIC_XOR:
return IMAGE_ATOMIC(AtomicOp::Xor, inst);
case Opcode::IMAGE_ATOMIC_UMAX:
return IMAGE_ATOMIC(AtomicOp::Umax, inst);
case Opcode::IMAGE_ATOMIC_SMAX:
return IMAGE_ATOMIC(AtomicOp::Smax, inst);
case Opcode::IMAGE_ATOMIC_UMIN:
return IMAGE_ATOMIC(AtomicOp::Umin, inst);
case Opcode::IMAGE_ATOMIC_SMIN:
return IMAGE_ATOMIC(AtomicOp::Smin, inst);
case Opcode::IMAGE_ATOMIC_INC:
return IMAGE_ATOMIC(AtomicOp::Inc, inst);
case Opcode::IMAGE_ATOMIC_DEC:
return IMAGE_ATOMIC(AtomicOp::Dec, inst);
case Opcode::IMAGE_GET_LOD:
return IMAGE_GET_LOD(inst);
case Opcode::IMAGE_STORE:
return IMAGE_STORE(inst);
case Opcode::IMAGE_LOAD_MIP:
return IMAGE_LOAD(true, inst);
case Opcode::IMAGE_LOAD:
return IMAGE_LOAD(false, inst);
case Opcode::IMAGE_GET_RESINFO:
return IMAGE_GET_RESINFO(inst);
case Opcode::TBUFFER_LOAD_FORMAT_X:
return BUFFER_LOAD_FORMAT(1, true, true, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XY:
return BUFFER_LOAD_FORMAT(2, true, true, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
return BUFFER_LOAD_FORMAT(3, true, true, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
return BUFFER_LOAD_FORMAT(4, true, true, inst);
case Opcode::BUFFER_LOAD_FORMAT_X:
return BUFFER_LOAD_FORMAT(1, false, true, inst);
case Opcode::BUFFER_LOAD_FORMAT_XY:
return BUFFER_LOAD_FORMAT(2, false, true, inst);
case Opcode::BUFFER_LOAD_FORMAT_XYZ:
return BUFFER_LOAD_FORMAT(3, false, true, inst);
case Opcode::BUFFER_LOAD_FORMAT_XYZW:
return BUFFER_LOAD_FORMAT(4, false, true, inst);
case Opcode::BUFFER_LOAD_DWORD:
return BUFFER_LOAD_FORMAT(1, false, false, inst);
case Opcode::BUFFER_LOAD_DWORDX2:
return BUFFER_LOAD_FORMAT(2, false, false, inst);
case Opcode::BUFFER_LOAD_DWORDX3:
return BUFFER_LOAD_FORMAT(3, false, false, inst);
case Opcode::BUFFER_LOAD_DWORDX4:
return BUFFER_LOAD_FORMAT(4, false, false, inst);
case Opcode::BUFFER_STORE_FORMAT_X:
case Opcode::BUFFER_STORE_DWORD:
return BUFFER_STORE_FORMAT(1, false, inst);
case Opcode::BUFFER_STORE_DWORDX2:
return BUFFER_STORE_FORMAT(2, false, inst);
case Opcode::BUFFER_STORE_DWORDX3:
return BUFFER_STORE_FORMAT(3, false, inst);
case Opcode::BUFFER_STORE_FORMAT_XYZW:
case Opcode::BUFFER_STORE_DWORDX4:
return BUFFER_STORE_FORMAT(4, false, inst);
default:
LogMissingOpcode(inst);
}
}
void Translator::IMAGE_GET_RESINFO(const GcnInst& inst) {
IR::VectorReg dst_reg{inst.dst[0].code};
const IR::ScalarReg tsharp_reg{inst.src[2].code};
const IR::ScalarReg tsharp_reg{inst.src[2].code * 4};
const auto flags = ImageResFlags(inst.control.mimg.dmask);
const bool has_mips = flags.test(ImageResComponent::MipCount);
const IR::U32 lod = ir.GetVectorReg(IR::VectorReg(inst.src[0].code));
@ -157,7 +244,7 @@ void Translator::IMAGE_GATHER(const GcnInst& inst) {
info.has_bias.Assign(flags.test(MimgModifier::LodBias));
info.has_lod_clamp.Assign(flags.test(MimgModifier::LodClamp));
info.force_level0.Assign(flags.test(MimgModifier::Level0));
info.explicit_lod.Assign(explicit_lod);
// info.explicit_lod.Assign(explicit_lod);
info.gather_comp.Assign(std::bit_width(mimg.dmask) - 1);
// Issue IR instruction, leaving unknown fields blank to patch later.

View File

@ -12,16 +12,16 @@
namespace Shader::IR {
template <typename Pred>
auto BreadthFirstSearch(const Value& value, Pred&& pred)
-> std::invoke_result_t<Pred, const Inst*> {
if (value.IsImmediate()) {
// Nothing to do with immediates
return std::nullopt;
auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t<Pred, const Inst*> {
// Most often case the instruction is the desired already.
if (const std::optional result = pred(inst)) {
return result;
}
// Breadth-first search visiting the right most arguments first
boost::container::small_vector<const Inst*, 2> visited;
std::queue<const Inst*> queue;
queue.push(value.InstRecursive());
queue.push(inst);
while (!queue.empty()) {
// Pop one instruction from the queue
@ -49,4 +49,14 @@ auto BreadthFirstSearch(const Value& value, Pred&& pred)
return std::nullopt;
}
template <typename Pred>
auto BreadthFirstSearch(const Value& value, Pred&& pred)
-> std::invoke_result_t<Pred, const Inst*> {
if (value.IsImmediate()) {
// Nothing to do with immediates
return std::nullopt;
}
return BreadthFirstSearch(value.InstRecursive(), pred);
}
} // namespace Shader::IR

View File

@ -278,7 +278,7 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
case 32:
return Inst<U32>(Opcode::LoadSharedU32, offset);
case 64:
return Inst<U64>(Opcode::LoadSharedU64, offset);
return Inst(Opcode::LoadSharedU64, offset);
case 128:
return Inst(Opcode::LoadSharedU128, offset);
default:
@ -373,6 +373,10 @@ U32 IREmitter::LaneId() {
return Inst<U32>(Opcode::LaneId);
}
U32 IREmitter::WarpId() {
return Inst<U32>(Opcode::WarpId);
}
U32 IREmitter::QuadShuffle(const U32& value, const U32& index) {
return Inst<U32>(Opcode::QuadShuffle, value, index);
}
@ -876,6 +880,10 @@ U1 IREmitter::FPIsInf(const F32F64& value) {
}
}
U1 IREmitter::FPCmpClass32(const F32& value, const U32& op) {
return Inst<U1>(Opcode::FPCmpClass32, value, op);
}
U1 IREmitter::FPOrdered(const F32F64& lhs, const F32F64& rhs) {
if (lhs.Type() != rhs.Type()) {
UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type());
@ -1088,6 +1096,10 @@ U32 IREmitter::FindUMsb(const U32& value) {
return Inst<U32>(Opcode::FindUMsb32, value);
}
U32 IREmitter::FindILsb(const U32& value) {
return Inst<U32>(Opcode::FindILsb32, value);
}
U32 IREmitter::SMin(const U32& a, const U32& b) {
return Inst<U32>(Opcode::SMin32, a, b);
}
@ -1274,6 +1286,11 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) {
default:
break;
}
case 32:
switch (value.Type()) {
case Type::U16:
return Inst<U32>(Opcode::ConvertU32U16, value);
}
default:
break;
}

View File

@ -95,6 +95,7 @@ public:
BufferInstInfo info);
[[nodiscard]] U32 LaneId();
[[nodiscard]] U32 WarpId();
[[nodiscard]] U32 QuadShuffle(const U32& value, const U32& index);
[[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2);
@ -150,6 +151,7 @@ public:
[[nodiscard]] U1 FPGreaterThan(const F32F64& lhs, const F32F64& rhs, bool ordered = true);
[[nodiscard]] U1 FPIsNan(const F32F64& value);
[[nodiscard]] U1 FPIsInf(const F32F64& value);
[[nodiscard]] U1 FPCmpClass32(const F32& value, const U32& op);
[[nodiscard]] U1 FPOrdered(const F32F64& lhs, const F32F64& rhs);
[[nodiscard]] U1 FPUnordered(const F32F64& lhs, const F32F64& rhs);
[[nodiscard]] F32F64 FPMax(const F32F64& lhs, const F32F64& rhs, bool is_legacy = false);
@ -179,6 +181,7 @@ public:
[[nodiscard]] U32 FindSMsb(const U32& value);
[[nodiscard]] U32 FindUMsb(const U32& value);
[[nodiscard]] U32 FindILsb(const U32& value);
[[nodiscard]] U32 SMin(const U32& a, const U32& b);
[[nodiscard]] U32 UMin(const U32& a, const U32& b);
[[nodiscard]] U32 IMin(const U32& a, const U32& b, bool is_signed);

View File

@ -219,6 +219,7 @@ OPCODE(FPIsNan32, U1, F32,
OPCODE(FPIsNan64, U1, F64, )
OPCODE(FPIsInf32, U1, F32, )
OPCODE(FPIsInf64, U1, F64, )
OPCODE(FPCmpClass32, U1, F32, U32 )
// Integer operations
OPCODE(IAdd32, U32, U32, U32, )
@ -254,6 +255,7 @@ OPCODE(BitwiseNot32, U32, U32,
OPCODE(FindSMsb32, U32, U32, )
OPCODE(FindUMsb32, U32, U32, )
OPCODE(FindILsb32, U32, U32, )
OPCODE(SMin32, U32, U32, U32, )
OPCODE(UMin32, U32, U32, U32, )
OPCODE(SMax32, U32, U32, U32, )
@ -293,6 +295,7 @@ OPCODE(ConvertF64S32, F64, U32,
OPCODE(ConvertF64U32, F64, U32, )
OPCODE(ConvertF32U16, F32, U16, )
OPCODE(ConvertU16U32, U16, U32, )
OPCODE(ConvertU32U16, U32, U16, )
// Image operations
OPCODE(ImageSampleImplicitLod, F32x4, Opaque, Opaque, Opaque, Opaque, )
@ -323,4 +326,5 @@ OPCODE(ImageAtomicExchange32, U32, Opaq
// Warp operations
OPCODE(LaneId, U32, )
OPCODE(WarpId, U32, )
OPCODE(QuadShuffle, U32, U32, U32 )

View File

@ -238,6 +238,18 @@ void FoldBooleanConvert(IR::Inst& inst) {
}
}
void FoldCmpClass(IR::Inst& inst) {
ASSERT_MSG(inst.Arg(1).IsImmediate(), "Unable to resolve compare operation");
const auto class_mask = static_cast<IR::FloatClassFunc>(inst.Arg(1).U32());
if ((class_mask & IR::FloatClassFunc::NaN) == IR::FloatClassFunc::NaN) {
inst.ReplaceOpcode(IR::Opcode::FPIsNan32);
} else if ((class_mask & IR::FloatClassFunc::Infinity) == IR::FloatClassFunc::Infinity) {
inst.ReplaceOpcode(IR::Opcode::FPIsInf32);
} else {
UNREACHABLE();
}
}
void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::IAdd32:
@ -251,6 +263,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
case IR::Opcode::IMul32:
FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; });
return;
case IR::Opcode::FPCmpClass32:
FoldCmpClass(inst);
return;
case IR::Opcode::ShiftRightArithmetic32:
FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return static_cast<u32>(a >> b); });
return;

View File

@ -2,7 +2,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <deque>
#include <boost/container/small_vector.hpp>
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/breadth_first_search.h"
@ -273,9 +272,18 @@ std::pair<const IR::Inst*, bool> TryDisableAnisoLod0(const IR::Inst* inst) {
}
SharpLocation TrackSharp(const IR::Inst* inst) {
while (inst->GetOpcode() == IR::Opcode::Phi) {
inst = inst->Arg(0).InstRecursive();
}
// Search until we find a potential sharp source.
const auto pred0 = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
if (inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::ReadConst) {
return inst;
}
return std::nullopt;
};
const auto result = IR::BreadthFirstSearch(inst, pred0);
ASSERT_MSG(result, "Unable to track sharp source");
inst = result.value();
// If its from user data not much else to do.
if (inst->GetOpcode() == IR::Opcode::GetUserData) {
return SharpLocation{
.sgpr_base = u32(IR::ScalarReg::Max),
@ -289,14 +297,14 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive();
// Retrieve SGPR pair that holds sbase
const auto pred = [](const IR::Inst* inst) -> std::optional<IR::ScalarReg> {
const auto pred1 = [](const IR::Inst* inst) -> std::optional<IR::ScalarReg> {
if (inst->GetOpcode() == IR::Opcode::GetUserData) {
return inst->Arg(0).ScalarReg();
}
return std::nullopt;
};
const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred);
const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred);
const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred1);
const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred1);
ASSERT_MSG(base0 && base1, "Nested resource loads not supported");
// Return retrieved location.
@ -456,36 +464,26 @@ IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value&
}
void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
std::deque<IR::Inst*> insts{&inst};
const auto& pred = [](auto opcode) -> bool {
return (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
opcode == IR::Opcode::ReadConst || // IMAGE_LOAD (image only)
opcode == IR::Opcode::GetUserData);
const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
const auto opcode = inst->GetOpcode();
if (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
opcode == IR::Opcode::ReadConst || // IMAGE_LOAD (image only)
opcode == IR::Opcode::GetUserData) {
return inst;
}
return std::nullopt;
};
IR::Inst* producer{};
while (!insts.empty() && (producer = insts.front(), !pred(producer->GetOpcode()))) {
for (auto arg_idx = 0u; arg_idx < producer->NumArgs(); ++arg_idx) {
const auto arg = producer->Arg(arg_idx);
if (arg.TryInstRecursive()) {
insts.push_back(arg.InstRecursive());
}
}
insts.pop_front();
}
ASSERT(pred(producer->GetOpcode()));
auto [tsharp_handle, ssharp_handle] = [&] -> std::pair<IR::Inst*, IR::Inst*> {
if (producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2) {
return std::make_pair(producer->Arg(0).InstRecursive(),
producer->Arg(1).InstRecursive());
}
return std::make_pair(producer, nullptr);
}();
const auto result = IR::BreadthFirstSearch(&inst, pred);
ASSERT_MSG(result, "Unable to find image sharp source");
const IR::Inst* producer = result.value();
const bool has_sampler = producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2;
const auto tsharp_handle = has_sampler ? producer->Arg(0).InstRecursive() : producer;
// Read image sharp.
const auto tsharp = TrackSharp(tsharp_handle);
const auto image = info.ReadUd<AmdGpu::Image>(tsharp.sgpr_base, tsharp.dword_offset);
const auto inst_info = inst.Flags<IR::TextureInstInfo>();
ASSERT(image.GetType() != AmdGpu::ImageType::Invalid);
u32 image_binding = descriptors.Add(ImageResource{
.sgpr_base = tsharp.sgpr_base,
.dword_offset = tsharp.dword_offset,
@ -496,17 +494,32 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
});
// Read sampler sharp. This doesn't exist for IMAGE_LOAD/IMAGE_STORE instructions
if (ssharp_handle) {
const u32 sampler_binding = [&] {
if (!has_sampler) {
return 0U;
}
const IR::Value& handle = producer->Arg(1);
// Inline sampler resource.
if (handle.IsImmediate()) {
LOG_WARNING(Render_Vulkan, "Inline sampler detected");
return descriptors.Add(SamplerResource{
.sgpr_base = std::numeric_limits<u32>::max(),
.dword_offset = 0,
.inline_sampler = AmdGpu::Sampler{.raw0 = handle.U32()},
});
}
// Normal sampler resource.
const auto ssharp_handle = handle.InstRecursive();
const auto& [ssharp_ud, disable_aniso] = TryDisableAnisoLod0(ssharp_handle);
const auto ssharp = TrackSharp(ssharp_ud);
const u32 sampler_binding = descriptors.Add(SamplerResource{
return descriptors.Add(SamplerResource{
.sgpr_base = ssharp.sgpr_base,
.dword_offset = ssharp.dword_offset,
.associated_image = image_binding,
.disable_aniso = disable_aniso,
});
image_binding |= (sampler_binding << 16);
}
}();
image_binding |= (sampler_binding << 16);
// Patch image handle
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
@ -607,7 +620,7 @@ void ResourceTrackingPass(IR::Program& program) {
// Iterate resource instructions and patch them after finding the sharp.
auto& info = program.info;
Descriptors descriptors{info.buffers, info.images, info.samplers};
for (IR::Block* const block : program.post_order_blocks) {
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {
if (IsBufferInstruction(inst)) {
PatchBufferInstruction(*block, inst, info, descriptors);

View File

@ -20,11 +20,19 @@ void Visit(Info& info, IR::Inst& inst) {
case IR::Opcode::LoadSharedU8:
case IR::Opcode::WriteSharedU8:
info.uses_shared_u8 = true;
info.uses_shared = true;
break;
case IR::Opcode::LoadSharedS16:
case IR::Opcode::LoadSharedU16:
case IR::Opcode::WriteSharedU16:
info.uses_shared_u16 = true;
info.uses_shared = true;
break;
case IR::Opcode::LoadSharedU32:
case IR::Opcode::LoadSharedU64:
case IR::Opcode::WriteSharedU32:
case IR::Opcode::WriteSharedU64:
info.uses_shared = true;
break;
case IR::Opcode::ConvertF32F16:
case IR::Opcode::BitCastF16U16:

View File

@ -5,6 +5,7 @@
#include "common/assert.h"
#include "common/bit_field.h"
#include "common/enum.h"
#include "common/types.h"
#include "video_core/amdgpu/pixel_format.h"
@ -24,6 +25,23 @@ enum class FpDenormMode : u32 {
InOutAllow = 3,
};
enum class FloatClassFunc : u32 {
SignalingNan = 1 << 0,
QuietNan = 1 << 1,
NegativeInfinity = 1 << 2,
NegativeNormal = 1 << 3,
NegativeDenorm = 1 << 4,
NegativeZero = 1 << 5,
PositiveZero = 1 << 6,
PositiveDenorm = 1 << 7,
PositiveNormal = 1 << 8,
PositiveInfinity = 1 << 9,
NaN = SignalingNan | QuietNan,
Infinity = PositiveInfinity | NegativeInfinity,
};
DECLARE_ENUM_FLAG_OPERATORS(FloatClassFunc)
union Mode {
BitField<0, 4, FpRoundMode> fp_round;
BitField<4, 2, FpDenormMode> fp_denorm_single;

View File

@ -9,6 +9,7 @@ namespace Shader {
struct Profile {
u32 supported_spirv{0x00010000};
u32 subgroup_size{};
bool unified_descriptor_binding{};
bool support_descriptor_aliasing{};
bool support_int8{};

View File

@ -28,7 +28,8 @@ IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) {
}
IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
std::span<const u32> token, const Info&& info) {
std::span<const u32> token, const Info&& info,
const Profile& profile) {
// Ensure first instruction is expected.
constexpr u32 token_mov_vcchi = 0xBEEB03FF;
ASSERT_MSG(token[0] == token_mov_vcchi, "First instruction is not s_mov_b32 vcc_hi, #imm");
@ -49,7 +50,7 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
// Structurize control flow graph and create program.
program.info = std::move(info);
program.syntax_list = Shader::Gcn::BuildASL(inst_pool, block_pool, cfg, program.info);
program.syntax_list = Shader::Gcn::BuildASL(inst_pool, block_pool, cfg, program.info, profile);
program.blocks = GenerateBlocks(program.syntax_list);
program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front());
@ -60,9 +61,7 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
Shader::Optimization::IdentityRemovalPass(program.blocks);
Shader::Optimization::DeadCodeEliminationPass(program);
Shader::Optimization::CollectShaderInfoPass(program);
fmt::print("Post passes\n\n{}\n", Shader::IR::DumpProgram(program));
std::fflush(stdout);
LOG_INFO(Render_Vulkan, "{}", Shader::IR::DumpProgram(program));
return program;
}

View File

@ -9,8 +9,11 @@
namespace Shader {
struct Profile;
[[nodiscard]] IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool,
ObjectPool<IR::Block>& block_pool,
std::span<const u32> code, const Info&& info);
std::span<const u32> code, const Info&& info,
const Profile& profile);
} // namespace Shader

View File

@ -97,8 +97,11 @@ using ImageResourceList = boost::container::static_vector<ImageResource, 16>;
struct SamplerResource {
u32 sgpr_base;
u32 dword_offset;
AmdGpu::Sampler inline_sampler{};
u32 associated_image : 4;
u32 disable_aniso : 1;
constexpr AmdGpu::Sampler GetSsharp(const Info& info) const noexcept;
};
using SamplerResourceList = boost::container::static_vector<SamplerResource, 16>;
@ -175,6 +178,7 @@ struct Info {
bool has_image_gather{};
bool has_image_query{};
bool uses_group_quad{};
bool uses_shared{};
bool uses_shared_u8{};
bool uses_shared_u16{};
bool uses_fp16{};
@ -196,6 +200,10 @@ constexpr AmdGpu::Buffer BufferResource::GetVsharp(const Info& info) const noexc
return inline_cbuf ? inline_cbuf : info.ReadUd<AmdGpu::Buffer>(sgpr_base, dword_offset);
}
constexpr AmdGpu::Sampler SamplerResource::GetSsharp(const Info& info) const noexcept {
return inline_sampler ? inline_sampler : info.ReadUd<AmdGpu::Sampler>(sgpr_base, dword_offset);
}
} // namespace Shader
template <>

View File

@ -403,9 +403,11 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); });
}
while (!wait_reg_mem->Test()) {
mapped_queues[GfxQueueId].cs_state = regs.cs_program;
TracyFiberLeave;
co_yield {};
TracyFiberEnter(dcb_task_name);
regs.cs_program = mapped_queues[GfxQueueId].cs_state;
}
break;
}
@ -506,9 +508,11 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
while (!wait_reg_mem->Test()) {
mapped_queues[vqid].cs_state = regs.cs_program;
TracyFiberLeave;
co_yield {};
TracyFiberEnter(acb_task_name);
regs.cs_program = mapped_queues[vqid].cs_state;
}
break;
}
@ -529,7 +533,6 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
}
void Liverpool::SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
static constexpr u32 GfxQueueId = 0u;
auto& queue = mapped_queues[GfxQueueId];
auto task = ProcessGraphics(dcb, ccb);

View File

@ -36,6 +36,7 @@ namespace AmdGpu {
[[maybe_unused]] std::array<u32, num_words> CONCAT2(pad, __LINE__)
struct Liverpool {
static constexpr u32 GfxQueueId = 0u;
static constexpr u32 NumGfxRings = 1u; // actually 2, but HP is reserved by system software
static constexpr u32 NumComputePipes = 7u; // actually 8, but #7 is reserved by system software
static constexpr u32 NumQueuesPerPipe = 8u;
@ -1061,6 +1062,7 @@ private:
struct GpuQueue {
std::mutex m_access{};
std::queue<Task::Handle> submits{};
ComputeProgram cs_state{};
};
std::array<GpuQueue, NumTotalQueues> mapped_queues{};

View File

@ -7,6 +7,77 @@
namespace AmdGpu {
std::string_view NameOf(DataFormat fmt) {
switch (fmt) {
case DataFormat::FormatInvalid:
return "FormatInvalid";
case DataFormat::Format8:
return "Format8";
case DataFormat::Format16:
return "Format16";
case DataFormat::Format8_8:
return "Format8_8";
case DataFormat::Format32:
return "Format32";
case DataFormat::Format16_16:
return "Format16_16";
case DataFormat::Format10_11_11:
return "Format10_11_11";
case DataFormat::Format11_11_10:
return "Format11_11_10";
case DataFormat::Format10_10_10_2:
return "Format10_10_10_2";
case DataFormat::Format2_10_10_10:
return "Format2_10_10_10";
case DataFormat::Format8_8_8_8:
return "Format8_8_8_8";
case DataFormat::Format32_32:
return "Format32_32";
case DataFormat::Format16_16_16_16:
return "Format16_16_16_16";
case DataFormat::Format32_32_32:
return "Format32_32_32";
case DataFormat::Format32_32_32_32:
return "Format32_32_32_32";
case DataFormat::Format5_6_5:
return "Format5_6_5";
case DataFormat::Format1_5_5_5:
return "Format1_5_5_5";
case DataFormat::Format5_5_5_1:
return "Format5_5_5_1";
case DataFormat::Format4_4_4_4:
return "Format4_4_4_4";
case DataFormat::Format8_24:
return "Format8_24";
case DataFormat::Format24_8:
return "Format24_8";
case DataFormat::FormatX24_8_32:
return "FormatX24_8_32";
case DataFormat::FormatGB_GR:
return "FormatGB_GR";
case DataFormat::FormatBG_RG:
return "FormatBG_RG";
case DataFormat::Format5_9_9_9:
return "Format5_9_9_9";
case DataFormat::FormatBc1:
return "FormatBc1";
case DataFormat::FormatBc2:
return "FormatBc2";
case DataFormat::FormatBc3:
return "FormatBc3";
case DataFormat::FormatBc4:
return "FormatBc4";
case DataFormat::FormatBc5:
return "FormatBc5";
case DataFormat::FormatBc6:
return "FormatBc6";
case DataFormat::FormatBc7:
return "FormatBc7";
default:
UNREACHABLE();
}
}
std::string_view NameOf(NumberFormat fmt) {
switch (fmt) {
case NumberFormat::Unorm:

View File

@ -61,6 +61,7 @@ enum class NumberFormat : u32 {
Ubscaled = 13,
};
[[nodiscard]] std::string_view NameOf(DataFormat fmt);
[[nodiscard]] std::string_view NameOf(NumberFormat fmt);
int NumComponents(DataFormat format);
@ -70,6 +71,16 @@ s32 ComponentOffset(DataFormat format, u32 comp);
} // namespace AmdGpu
template <>
struct fmt::formatter<AmdGpu::DataFormat> {
constexpr auto parse(format_parse_context& ctx) {
return ctx.begin();
}
auto format(AmdGpu::DataFormat fmt, format_context& ctx) const {
return fmt::format_to(ctx.out(), "{}", AmdGpu::NameOf(fmt));
}
};
template <>
struct fmt::formatter<AmdGpu::NumberFormat> {
constexpr auto parse(format_parse_context& ctx) {

View File

@ -75,7 +75,7 @@ struct Buffer {
static_assert(sizeof(Buffer) == 16); // 128bits
enum class ImageType : u64 {
Buffer = 0,
Invalid = 0,
Color1D = 8,
Color2D = 9,
Color3D = 10,
@ -88,8 +88,8 @@ enum class ImageType : u64 {
constexpr std::string_view NameOf(ImageType type) {
switch (type) {
case ImageType::Buffer:
return "Buffer";
case ImageType::Invalid:
return "Invalid";
case ImageType::Color1D:
return "Color1D";
case ImageType::Color2D:
@ -179,6 +179,40 @@ struct Image {
return base_address << 8;
}
u32 DstSelect() const {
return dst_sel_x | (dst_sel_y << 3) | (dst_sel_z << 6) | (dst_sel_w << 9);
}
static char SelectComp(u32 sel) {
switch (sel) {
case 0:
return '0';
case 1:
return '1';
case 4:
return 'R';
case 5:
return 'G';
case 6:
return 'B';
case 7:
return 'A';
default:
UNREACHABLE();
}
}
std::string DstSelectName() const {
std::string result = "[";
u32 dst_sel = DstSelect();
for (u32 i = 0; i < 4; i++) {
result += SelectComp(dst_sel & 7);
dst_sel >>= 3;
}
result += ']';
return result;
}
u32 Pitch() const {
return pitch + 1;
}
@ -290,6 +324,7 @@ enum class BorderColor : u64 {
// Table 8.12 Sampler Resource Definition
struct Sampler {
union {
u64 raw0;
BitField<0, 3, ClampMode> clamp_x;
BitField<3, 3, ClampMode> clamp_y;
BitField<6, 3, ClampMode> clamp_z;
@ -309,6 +344,7 @@ struct Sampler {
BitField<60, 4, u64> perf_z;
};
union {
u64 raw1;
BitField<0, 14, u64> lod_bias;
BitField<14, 6, u64> lod_bias_sec;
BitField<20, 2, Filter> xy_mag_filter;
@ -323,6 +359,10 @@ struct Sampler {
BitField<62, 2, BorderColor> border_color_type;
};
operator bool() const noexcept {
return raw0 != 0 || raw1 != 0;
}
float LodBias() const noexcept {
return static_cast<float>(static_cast<int16_t>((lod_bias.Value() ^ 0x2000u) - 0x2000u)) /
256.0f;

View File

@ -297,6 +297,7 @@ std::span<const vk::Format> GetAllFormats() {
vk::Format::eBc3UnormBlock,
vk::Format::eBc4UnormBlock,
vk::Format::eBc5UnormBlock,
vk::Format::eBc5SnormBlock,
vk::Format::eBc7SrgbBlock,
vk::Format::eBc7UnormBlock,
vk::Format::eD16Unorm,
@ -308,6 +309,7 @@ std::span<const vk::Format> GetAllFormats() {
vk::Format::eR8G8B8A8Srgb,
vk::Format::eR8G8B8A8Uint,
vk::Format::eR8G8B8A8Unorm,
vk::Format::eR8G8B8A8Snorm,
vk::Format::eR8G8B8A8Uscaled,
vk::Format::eR8G8Snorm,
vk::Format::eR8G8Uint,
@ -335,6 +337,10 @@ std::span<const vk::Format> GetAllFormats() {
vk::Format::eR32Sfloat,
vk::Format::eR32Sint,
vk::Format::eR32Uint,
vk::Format::eBc6HUfloatBlock,
vk::Format::eR16G16Unorm,
vk::Format::eR16G16B16A16Sscaled,
vk::Format::eR16G16Sscaled,
};
return formats;
}
@ -384,10 +390,17 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu
if (data_format == AmdGpu::DataFormat::FormatBc5 && num_format == AmdGpu::NumberFormat::Unorm) {
return vk::Format::eBc5UnormBlock;
}
if (data_format == AmdGpu::DataFormat::FormatBc5 && num_format == AmdGpu::NumberFormat::Snorm) {
return vk::Format::eBc5SnormBlock;
}
if (data_format == AmdGpu::DataFormat::Format16_16_16_16 &&
num_format == AmdGpu::NumberFormat::Sint) {
return vk::Format::eR16G16B16A16Sint;
}
if (data_format == AmdGpu::DataFormat::Format16_16_16_16 &&
num_format == AmdGpu::NumberFormat::Sscaled) {
return vk::Format::eR16G16B16A16Sscaled;
}
if (data_format == AmdGpu::DataFormat::Format16_16 &&
num_format == AmdGpu::NumberFormat::Float) {
return vk::Format::eR16G16Sfloat;
@ -496,6 +509,10 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu
num_format == AmdGpu::NumberFormat::Sint) {
return vk::Format::eR16G16Sint;
}
if (data_format == AmdGpu::DataFormat::Format16_16 &&
num_format == AmdGpu::NumberFormat::Sscaled) {
return vk::Format::eR16G16Sscaled;
}
if (data_format == AmdGpu::DataFormat::Format8_8_8_8 &&
num_format == AmdGpu::NumberFormat::Uscaled) {
return vk::Format::eR8G8B8A8Uscaled;
@ -518,6 +535,13 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu
num_format == AmdGpu::NumberFormat::SnormNz) {
return vk::Format::eR16G16B16A16Snorm;
}
if (data_format == AmdGpu::DataFormat::Format8_8_8_8 &&
num_format == AmdGpu::NumberFormat::Snorm) {
return vk::Format::eR8G8B8A8Snorm;
}
if (data_format == AmdGpu::DataFormat::FormatBc6 && num_format == AmdGpu::NumberFormat::Unorm) {
return vk::Format::eBc6HUfloatBlock;
}
UNREACHABLE_MSG("Unknown data_format={} and num_format={}", u32(data_format), u32(num_format));
}

View File

@ -148,7 +148,7 @@ bool ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s
}
}
for (const auto& sampler : info.samplers) {
const auto ssharp = info.ReadUd<AmdGpu::Sampler>(sampler.sgpr_base, sampler.dword_offset);
const auto ssharp = sampler.GetSsharp(info);
const auto vk_sampler = texture_cache.GetSampler(ssharp);
image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral);
set_writes.push_back({

View File

@ -386,7 +386,7 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer&
}
}
for (const auto& sampler : stage.samplers) {
auto ssharp = stage.ReadUd<AmdGpu::Sampler>(sampler.sgpr_base, sampler.dword_offset);
auto ssharp = sampler.GetSsharp(stage);
if (sampler.disable_aniso) {
const auto& tsharp = tsharps[sampler.associated_image];
if (tsharp.base_level == 0 && tsharp.last_level == 0) {

View File

@ -164,10 +164,11 @@ bool Instance::CreateDevice() {
vk::PhysicalDeviceVulkan13Features,
vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR,
vk::PhysicalDeviceDepthClipControlFeaturesEXT>();
const vk::StructureChain properties_chain =
physical_device.getProperties2<vk::PhysicalDeviceProperties2,
vk::PhysicalDevicePortabilitySubsetPropertiesKHR,
vk::PhysicalDeviceExternalMemoryHostPropertiesEXT>();
const vk::StructureChain properties_chain = physical_device.getProperties2<
vk::PhysicalDeviceProperties2, vk::PhysicalDevicePortabilitySubsetPropertiesKHR,
vk::PhysicalDeviceExternalMemoryHostPropertiesEXT, vk::PhysicalDeviceVulkan11Properties>();
subgroup_size = properties_chain.get<vk::PhysicalDeviceVulkan11Properties>().subgroupSize;
LOG_INFO(Render_Vulkan, "Physical device subgroup size {}", subgroup_size);
features = feature_chain.get().features;
if (available_extensions.empty()) {
@ -261,6 +262,7 @@ bool Instance::CreateDevice() {
.shaderStorageImageExtendedFormats = features.shaderStorageImageExtendedFormats,
.shaderStorageImageMultisample = features.shaderStorageImageMultisample,
.shaderClipDistance = features.shaderClipDistance,
.shaderInt64 = features.shaderInt64,
.shaderInt16 = features.shaderInt16,
},
},

View File

@ -188,6 +188,11 @@ public:
return properties.limits.nonCoherentAtomSize;
}
/// Returns the subgroup size of the selected physical device.
u32 SubgroupSize() const {
return subgroup_size;
}
/// Returns the maximum supported elements in a texel buffer
u32 MaxTexelBufferElements() const {
return properties.limits.maxTexelBufferElements;
@ -249,6 +254,7 @@ private:
bool workgroup_memory_explicit_layout{};
bool color_write_en{};
u64 min_imported_host_pointer_alignment{};
u32 subgroup_size{};
bool tooling_info{};
bool debug_utils_supported{};
bool has_nsight_graphics{};

View File

@ -109,6 +109,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
pipeline_cache = instance.GetDevice().createPipelineCacheUnique({});
profile = Shader::Profile{
.supported_spirv = 0x00010600U,
.subgroup_size = instance.SubgroupSize(),
.support_explicit_workgroup_layout = true,
};
}
@ -268,7 +269,8 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline() {
Shader::Info info = MakeShaderInfo(stage, pgm->user_data, regs);
info.pgm_base = pgm->Address<uintptr_t>();
info.pgm_hash = hash;
programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info));
programs[i] =
Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile);
// Compile IR to SPIR-V
auto spv_code = Shader::Backend::SPIRV::EmitSPIRV(profile, programs[i], binding);
@ -308,7 +310,8 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline() {
Shader::Info info =
MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs);
info.pgm_base = cs_pgm.Address<uintptr_t>();
auto program = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info));
auto program =
Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile);
// Compile IR to SPIR-V
u32 binding{};

View File

@ -23,7 +23,7 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
: instance{instance_}, scheduler{scheduler_}, texture_cache{texture_cache_},
liverpool{liverpool_}, memory{Core::Memory::Instance()},
pipeline_cache{instance, scheduler, liverpool},
vertex_index_buffer{instance, scheduler, VertexIndexFlags, 1_GB, BufferType::Upload} {
vertex_index_buffer{instance, scheduler, VertexIndexFlags, 2_GB, BufferType::Upload} {
if (!Config::nullGpu()) {
liverpool->BindRasterizer(this);
}
@ -128,6 +128,7 @@ void Rasterizer::BeginRendering() {
state.height = std::min<u32>(state.height, image.info.size.height);
const bool is_clear = texture_cache.IsMetaCleared(col_buf.CmaskAddress());
state.color_images[state.num_color_attachments] = image.image;
state.color_attachments[state.num_color_attachments++] = {
.imageView = *image_view.image_view,
.imageLayout = vk::ImageLayout::eGeneral,
@ -152,6 +153,7 @@ void Rasterizer::BeginRendering() {
const auto& image = texture_cache.GetImage(image_view.image_id);
state.width = std::min<u32>(state.width, image.info.size.width);
state.height = std::min<u32>(state.height, image.info.size.height);
state.depth_image = image.image;
state.depth_attachment = {
.imageView = *image_view.image_view,
.imageLayout = image.layout,

View File

@ -50,7 +50,32 @@ void Scheduler::EndRendering() {
return;
}
is_rendering = false;
boost::container::static_vector<vk::ImageMemoryBarrier, 9> barriers;
for (size_t i = 0; i < render_state.num_color_attachments; ++i) {
barriers.push_back(vk::ImageMemoryBarrier{
.srcAccessMask = vk::AccessFlagBits::eColorAttachmentWrite,
.dstAccessMask = vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite,
.oldLayout = vk::ImageLayout::eColorAttachmentOptimal,
.newLayout = vk::ImageLayout::eColorAttachmentOptimal,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = render_state.color_images[i],
.subresourceRange =
{
.aspectMask = vk::ImageAspectFlagBits::eColor,
.baseMipLevel = 0,
.levelCount = VK_REMAINING_MIP_LEVELS,
.baseArrayLayer = 0,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
});
}
current_cmdbuf.endRendering();
if (!barriers.empty()) {
current_cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eColorAttachmentOutput,
vk::PipelineStageFlagBits::eFragmentShader,
vk::DependencyFlagBits::eByRegion, {}, {}, barriers);
}
}
void Scheduler::Flush(SubmitInfo& info) {

View File

@ -15,7 +15,9 @@ class Instance;
struct RenderState {
std::array<vk::RenderingAttachmentInfo, 8> color_attachments{};
std::array<vk::Image, 8> color_images{};
vk::RenderingAttachmentInfo depth_attachment{};
vk::Image depth_image{};
u32 num_color_attachments{};
u32 num_depth_attachments{};
u32 width = std::numeric_limits<u32>::max();

View File

@ -47,6 +47,20 @@ vk::ComponentSwizzle ConvertComponentSwizzle(u32 dst_sel) {
}
}
bool IsIdentityMapping(u32 dst_sel, u32 num_components) {
return (num_components == 1 && dst_sel == 0b100) ||
(num_components == 2 && dst_sel == 0b101'100) ||
(num_components == 3 && dst_sel == 0b110'101'100) ||
(num_components == 4 && dst_sel == 0b111'110'101'100);
}
vk::Format TrySwizzleFormat(vk::Format format, u32 dst_sel) {
if (format == vk::Format::eR8G8B8A8Unorm && dst_sel == 0b111100101110) {
return vk::Format::eB8G8R8A8Unorm;
}
return format;
}
ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept
: is_storage{is_storage} {
type = ConvertImageViewType(image.GetType());
@ -60,9 +74,16 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexce
mapping.b = ConvertComponentSwizzle(image.dst_sel_z);
mapping.a = ConvertComponentSwizzle(image.dst_sel_w);
// Check for unfortunate case of storage images being swizzled
if (is_storage && (mapping != vk::ComponentMapping{})) {
LOG_ERROR(Render_Vulkan, "Storage image requires swizzling");
const u32 num_comps = AmdGpu::NumComponents(image.GetDataFmt());
const u32 dst_sel = image.DstSelect();
if (is_storage && !IsIdentityMapping(dst_sel, num_comps)) {
mapping = vk::ComponentMapping{};
if (auto new_format = TrySwizzleFormat(format, dst_sel); new_format != format) {
format = new_format;
return;
}
LOG_ERROR(Render_Vulkan, "Storage image (num_comps = {}) requires swizzling {}", num_comps,
image.DstSelectName());
}
}

View File

@ -35,6 +35,8 @@ struct ImageViewInfo {
struct Image;
constexpr Common::SlotId NULL_IMAGE_VIEW_ID{0};
struct ImageView {
explicit ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info, Image& image,
ImageId image_id, std::optional<vk::ImageUsageFlags> usage_override = {});

View File

@ -142,14 +142,14 @@ ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) {
image_ids.push_back(image_id);
});
ASSERT_MSG(image_ids.size() <= 1, "Overlapping images not allowed!");
// ASSERT_MSG(image_ids.size() <= 1, "Overlapping images not allowed!");
ImageId image_id{};
if (image_ids.empty()) {
image_id = slot_images.insert(instance, scheduler, info);
RegisterImage(image_id);
} else {
image_id = image_ids[0];
image_id = image_ids[image_ids.size() > 1 ? 1 : 0];
}
Image& image = slot_images[image_id];
@ -183,12 +183,17 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo
}
ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) {
if (info.guest_address == 0) [[unlikely]] {
return slot_image_views[NULL_IMAGE_VIEW_ID];
}
const ImageId image_id = FindImage(info);
Image& image = slot_images[image_id];
auto& usage = image.info.usage;
if (view_info.is_storage) {
image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderWrite);
image.Transit(vk::ImageLayout::eGeneral,
vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite);
usage.storage = true;
} else {
const auto new_layout = image.info.IsDepthStencil()
@ -206,7 +211,7 @@ ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo&
view_info_tmp.range.extent.levels > image.info.resources.levels ||
view_info_tmp.range.extent.layers > image.info.resources.layers) {
LOG_ERROR(Render_Vulkan,
LOG_DEBUG(Render_Vulkan,
"Subresource range ({}~{},{}~{}) exceeds base image extents ({},{})",
view_info_tmp.range.base.level, view_info_tmp.range.extent.levels,
view_info_tmp.range.base.layer, view_info_tmp.range.extent.layers,
@ -341,7 +346,7 @@ void TextureCache::RefreshImage(Image& image) {
cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy);
image.Transit(vk::ImageLayout::eGeneral,
vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead);
vk::AccessFlagBits::eMemoryWrite | vk::AccessFlagBits::eMemoryRead);
}
vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) {