From f9e96793ccd0fabd33fc1cda4805f3a388ba27c1 Mon Sep 17 00:00:00 2001
From: Vladislav Mikhalin <mikhalinvlad@gmail.com>
Date: Tue, 16 Jul 2024 15:03:07 +0300
Subject: [PATCH] Implemented load_buffer_format_* conversions (#295)

* Implemented load_buffer_format_* conversions

* clang-format insists on ugly things
---
 .../spirv/emit_spirv_context_get_set.cpp      | 241 +++++++++++++++---
 .../backend/spirv/emit_spirv_instructions.h   |   4 +
 .../backend/spirv/spirv_emit_context.cpp      |   5 +-
 .../backend/spirv/spirv_emit_context.h        |   1 +
 .../frontend/translate/translate.cpp          |  33 ++-
 .../frontend/translate/translate.h            |   2 +-
 .../frontend/translate/vector_memory.cpp      |   6 +-
 src/shader_recompiler/ir/ir_emitter.cpp       |  16 ++
 src/shader_recompiler/ir/ir_emitter.h         |   2 +
 src/shader_recompiler/ir/opcodes.inc          |   4 +
 .../ir/passes/resource_tracking_pass.cpp      | 133 ++++++++--
 src/shader_recompiler/runtime_info.h          |   3 +-
 src/video_core/amdgpu/pixel_format.cpp        | 106 ++++++++
 src/video_core/amdgpu/pixel_format.h          |   2 +
 src/video_core/amdgpu/resource.h              |   8 -
 15 files changed, 475 insertions(+), 91 deletions(-)
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 75ee3ae9..c88a1cbb 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -4,6 +4,8 @@
 #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"
 
+#include <magic_enum.hpp>
+
 namespace Shader::Backend::SPIRV {
 namespace {
 
@@ -209,57 +211,216 @@ void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 elemen
     ctx.OpStore(pointer, value);
 }
 
-Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    const auto info = inst->Flags<IR::BufferInstInfo>();
-    const auto& buffer = ctx.buffers[handle];
-    if (info.index_enable && info.offset_enable) {
-        UNREACHABLE();
-    } else if (info.index_enable) {
-        const Id ptr{
-            ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, address)};
-        return ctx.OpLoad(buffer.data_types->Get(1), ptr);
-    }
-    UNREACHABLE();
-}
-
 Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     return EmitLoadBufferF32(ctx, inst, handle, address);
 }
 
-Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    const auto info = inst->Flags<IR::BufferInstInfo>();
+template <int N>
+static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) {
     const auto& buffer = ctx.buffers[handle];
-    boost::container::static_vector<Id, 2> ids;
-    for (u32 i = 0; i < 2; i++) {
-        const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))};
-        const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
-        ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
+    const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
+    if constexpr (N == 1) {
+        // The array is indexed by address >> 2, exactly like the vector path below.
+        const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
+        return ctx.OpLoad(buffer.data_types->Get(1), ptr);
+    } else {
+        boost::container::static_vector<Id, N> ids;
+        for (u32 i = 0; i < N; i++) {
+            const Id index_i = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); // base + i
+            const Id ptr{
+                ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index_i)};
+            ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
+        }
+        return ctx.OpCompositeConstruct(buffer.data_types->Get(N), ids);
     }
-    return ctx.OpCompositeConstruct(buffer.data_types->Get(2), ids);
 }
 
-Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    const auto info = inst->Flags<IR::BufferInstInfo>();
-    const auto& buffer = ctx.buffers[handle];
-    boost::container::static_vector<Id, 3> ids;
-    for (u32 i = 0; i < 3; i++) {
-        const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))};
-        const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
-        ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
-    }
-    return ctx.OpCompositeConstruct(buffer.data_types->Get(3), ids);
+Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
+    return EmitLoadBufferF32xN<1>(ctx, handle, address);
 }
 
-Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    const auto info = inst->Flags<IR::BufferInstInfo>();
-    const auto& buffer = ctx.buffers[handle];
-    boost::container::static_vector<Id, 4> ids;
-    for (u32 i = 0; i < 4; i++) {
-        const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))};
-        const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
-        ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
+Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
+    return EmitLoadBufferF32xN<2>(ctx, handle, address);
+}
+
+Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
+    return EmitLoadBufferF32xN<3>(ctx, handle, address);
+}
+
+Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
+    return EmitLoadBufferF32xN<4>(ctx, handle, address);
+}
+
+static bool IsSignedInteger(AmdGpu::NumberFormat format) {
+    switch (format) {
+    case AmdGpu::NumberFormat::Unorm:
+    case AmdGpu::NumberFormat::Uscaled:
+    case AmdGpu::NumberFormat::Uint:
+        return false;
+    case AmdGpu::NumberFormat::Snorm:
+    case AmdGpu::NumberFormat::Sscaled:
+    case AmdGpu::NumberFormat::Sint:
+    case AmdGpu::NumberFormat::SnormNz:
+        return true;
+    case AmdGpu::NumberFormat::Float:
+    default:
+        UNREACHABLE();
     }
-    return ctx.OpCompositeConstruct(buffer.data_types->Get(4), ids);
+}
+
+static u32 UXBitsMax(u32 bit_width) {
+    return bit_width >= 32u ? ~0u : (1u << bit_width) - 1u; // 1u << 32 would be UB
+}
+
+static u32 SXBitsMax(u32 bit_width) {
+    return (1u << (bit_width - 1u)) - 1u;
+}
+
+static Id ConvertValue(EmitContext& ctx, Id value, AmdGpu::NumberFormat format, u32 bit_width) {
+    switch (format) {
+    case AmdGpu::NumberFormat::Unorm:
+        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(UXBitsMax(bit_width))));
+    case AmdGpu::NumberFormat::Snorm:
+        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width))));
+    case AmdGpu::NumberFormat::SnormNz:
+        // (x * 2 + 1) / (Format::SMAX * 2)
+        value = ctx.OpFMul(ctx.F32[1], value, ctx.ConstF32(2.f));
+        value = ctx.OpFAdd(ctx.F32[1], value, ctx.ConstF32(1.f));
+        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width) * 2)));
+    case AmdGpu::NumberFormat::Uscaled:
+    case AmdGpu::NumberFormat::Sscaled:
+    case AmdGpu::NumberFormat::Uint:
+    case AmdGpu::NumberFormat::Sint:
+    case AmdGpu::NumberFormat::Float:
+        return value; // passed through without rescaling
+    default:
+        UNREACHABLE_MSG("Unsupported number format for conversion: {}",
+                        magic_enum::enum_name(format));
+    }
+}
+
+static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offset) {
+    const Id base_bits = ctx.ConstU32(bit_offset);
+    if (stride >= 4) {
+        return base_bits;
+    }
+    // stride < 4: result = bit_offset + (address % 4) * 8
+    const Id byte_in_dword = ctx.OpUMod(ctx.U32[1], address, ctx.ConstU32(4u));
+    const Id extra_bits = ctx.OpShiftLeftLogical(ctx.U32[1], byte_in_dword, ctx.ConstU32(3u));
+    return ctx.OpIAdd(ctx.U32[1], base_bits, extra_bits);
+}
+
+static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) {
+    const auto& buffer = ctx.buffers[handle];
+    const auto format = buffer.buffer.GetDataFmt();
+    switch (format) {
+    case AmdGpu::DataFormat::FormatInvalid:
+        return ctx.f32_zero_value; // invalid data format yields the f32 zero constant
+    case AmdGpu::DataFormat::Format8:
+    case AmdGpu::DataFormat::Format16:
+    case AmdGpu::DataFormat::Format32:
+    case AmdGpu::DataFormat::Format8_8:
+    case AmdGpu::DataFormat::Format16_16:
+    case AmdGpu::DataFormat::Format10_11_11:
+    case AmdGpu::DataFormat::Format11_11_10:
+    case AmdGpu::DataFormat::Format10_10_10_2:
+    case AmdGpu::DataFormat::Format2_10_10_10:
+    case AmdGpu::DataFormat::Format8_8_8_8:
+    case AmdGpu::DataFormat::Format32_32:
+    case AmdGpu::DataFormat::Format16_16_16_16:
+    case AmdGpu::DataFormat::Format32_32_32:
+    case AmdGpu::DataFormat::Format32_32_32_32: {
+        const u32 num_components = AmdGpu::NumComponents(format);
+        if (comp >= num_components) {
+            return ctx.f32_zero_value; // component index beyond the format's count yields zero
+        }
+
+        // uint index = address / 4; (address >> 2 selects the array element to load)
+        Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
+        const u32 stride = buffer.buffer.GetStride();
+        if (stride > 4) { // NOTE(review): presumably ComponentOffset is in bits - confirm
+            const u32 index_offset = u32(AmdGpu::ComponentOffset(format, comp) / 32);
+            if (index_offset > 0) {
+                // index += index_offset;
+                index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(index_offset));
+            }
+        }
+        const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index);
+
+        const u32 bit_offset = AmdGpu::ComponentOffset(format, comp) % 32;
+        const u32 bit_width = AmdGpu::ComponentBits(format, comp);
+        const auto num_format = buffer.buffer.GetNumberFmt();
+        if (num_format == AmdGpu::NumberFormat::Float) {
+            if (bit_width == 32) { // whole element: load it directly as f32
+                return ctx.OpLoad(ctx.F32[1], ptr);
+            } else if (bit_width == 16) { // extract 16 bits, bitcast to f16, widen to f32
+                const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset);
+                Id value = ctx.OpLoad(ctx.U32[1], ptr);
+                value =
+                    ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset, ctx.ConstU32(bit_width));
+                value = ctx.OpSConvert(ctx.U16, value);
+                value = ctx.OpBitcast(ctx.F16[1], value);
+                return ctx.OpFConvert(ctx.F32[1], value);
+            } else {
+                UNREACHABLE_MSG("Invalid float bit width {}", bit_width);
+            }
+        } else {
+            Id value = ctx.OpLoad(ctx.U32[1], ptr);
+            const bool is_signed = IsSignedInteger(num_format);
+            if (bit_width < 32) { // narrower than the element: extract the bit field first
+                const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset);
+                if (is_signed) {
+                    value = ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset,
+                                                   ctx.ConstU32(bit_width));
+                    value = ctx.OpConvertSToF(ctx.F32[1], value);
+                } else {
+                    value = ctx.OpBitFieldUExtract(ctx.U32[1], value, comp_offset,
+                                                   ctx.ConstU32(bit_width));
+                    value = ctx.OpConvertUToF(ctx.F32[1], value);
+                }
+            } else {
+                if (is_signed) {
+                    value = ctx.OpConvertSToF(ctx.F32[1], value);
+                } else {
+                    value = ctx.OpConvertUToF(ctx.F32[1], value);
+                }
+            }
+            return ConvertValue(ctx, value, num_format, bit_width); // apply number-format scaling
+        }
+        break;
+    }
+    default:
+        UNREACHABLE_MSG("Invalid format for conversion: {}", magic_enum::enum_name(format));
+    }
+}
+
+template <int N>
+static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    if constexpr (N != 1) {
+        boost::container::static_vector<Id, N> components; // one converted value per component
+        for (u32 comp = 0; comp < N; ++comp) {
+            components.push_back(GetBufferFormatValue(ctx, handle, address, comp));
+        }
+        return ctx.OpCompositeConstruct(ctx.F32[N], components);
+    } else {
+        return GetBufferFormatValue(ctx, handle, address, 0); // scalar: component 0 only
+    }
+}
+
+Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    return EmitLoadBufferFormatF32xN<1>(ctx, inst, handle, address);
+}
+
+Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    return EmitLoadBufferFormatF32xN<2>(ctx, inst, handle, address);
+}
+
+Id EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    return EmitLoadBufferFormatF32xN<3>(ctx, inst, handle, address);
+}
+
+Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    return EmitLoadBufferFormatF32xN<4>(ctx, inst, handle, address);
 }
 
 void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index e0b19f4f..f43ea3b3 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -66,6 +66,10 @@ Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
+Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
+Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
+Id EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
+Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 98f9d1c7..3ea01a1d 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -301,9 +301,7 @@ void EmitContext::DefineBuffers(const Info& info) {
     for (u32 i = 0; const auto& buffer : info.buffers) {
         const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32;
         const Id data_type = (*data_types)[1];
-        const u32 stride = buffer.stride == 0 ? 1 : buffer.stride;
-        const u32 num_elements = stride * buffer.num_records;
-        const Id record_array_type{TypeArray(data_type, ConstU32(num_elements))};
+        const Id record_array_type{TypeArray(data_type, ConstU32(buffer.length))};
         const Id struct_type{TypeStruct(record_array_type)};
         if (std::ranges::find(type_ids, record_array_type.value, &Id::value) == type_ids.end()) {
             Decorate(record_array_type, spv::Decoration::ArrayStride, 4);
@@ -333,6 +331,7 @@ void EmitContext::DefineBuffers(const Info& info) {
             .id = id,
             .data_types = data_types,
             .pointer_type = pointer_type,
+            .buffer = buffer.GetVsharp(info),
         });
         interfaces.push_back(id);
         i++;
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index b51edd63..0f8081fd 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -201,6 +201,7 @@ public:
         Id id;
         const VectorIds* data_types;
         Id pointer_type;
+        AmdGpu::Buffer buffer;
     };
 
     u32& binding;
diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index 96f08519..bc2e0bf2 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -254,8 +254,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
             info.buffers.push_back({
                 .sgpr_base = attrib.sgpr_base,
                 .dword_offset = attrib.dword_offset,
-                .stride = buffer.GetStride(),
-                .num_records = buffer.num_records,
+                .length = buffer.num_records,
                 .used_types = IR::Type::F32,
                 .is_storage = true, // we may not fit into UBO with large meshes
                 .is_instance_data = true,
@@ -571,28 +570,40 @@ void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_l
             translator.V_CNDMASK_B32(inst);
             break;
         case Opcode::TBUFFER_LOAD_FORMAT_X:
-            translator.BUFFER_LOAD_FORMAT(1, true, inst);
+            translator.BUFFER_LOAD_FORMAT(1, true, true, inst);
             break;
         case Opcode::TBUFFER_LOAD_FORMAT_XY:
-            translator.BUFFER_LOAD_FORMAT(2, true, inst);
+            translator.BUFFER_LOAD_FORMAT(2, true, true, inst);
             break;
         case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
-            translator.BUFFER_LOAD_FORMAT(3, true, inst);
+            translator.BUFFER_LOAD_FORMAT(3, true, true, inst);
             break;
         case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
-            translator.BUFFER_LOAD_FORMAT(4, true, inst);
+            translator.BUFFER_LOAD_FORMAT(4, true, true, inst);
             break;
         case Opcode::BUFFER_LOAD_FORMAT_X:
-        case Opcode::BUFFER_LOAD_DWORD:
-            translator.BUFFER_LOAD_FORMAT(1, false, inst);
+            translator.BUFFER_LOAD_FORMAT(1, false, true, inst);
+            break;
+        case Opcode::BUFFER_LOAD_FORMAT_XY:
+            translator.BUFFER_LOAD_FORMAT(2, false, true, inst);
             break;
         case Opcode::BUFFER_LOAD_FORMAT_XYZ:
-        case Opcode::BUFFER_LOAD_DWORDX3:
-            translator.BUFFER_LOAD_FORMAT(3, false, inst);
+            translator.BUFFER_LOAD_FORMAT(3, false, true, inst);
             break;
         case Opcode::BUFFER_LOAD_FORMAT_XYZW:
+            translator.BUFFER_LOAD_FORMAT(4, false, true, inst);
+            break;
+        case Opcode::BUFFER_LOAD_DWORD:
+            translator.BUFFER_LOAD_FORMAT(1, false, false, inst);
+            break;
+        case Opcode::BUFFER_LOAD_DWORDX2:
+            translator.BUFFER_LOAD_FORMAT(2, false, false, inst);
+            break;
+        case Opcode::BUFFER_LOAD_DWORDX3:
+            translator.BUFFER_LOAD_FORMAT(3, false, false, inst);
+            break;
         case Opcode::BUFFER_LOAD_DWORDX4:
-            translator.BUFFER_LOAD_FORMAT(4, false, inst);
+            translator.BUFFER_LOAD_FORMAT(4, false, false, inst);
             break;
         case Opcode::BUFFER_STORE_FORMAT_X:
         case Opcode::BUFFER_STORE_DWORD:
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index 6dd0a481..2aa6f712 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -160,7 +160,7 @@ public:
     void V_CMP_CLASS_F32(const GcnInst& inst);
 
     // Vector Memory
-    void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);
+    void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst);
     void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);
 
     // Vector interpolation
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index f0ef85b3..1ddee523 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -225,7 +225,8 @@ void Translator::IMAGE_STORE(const GcnInst& inst) {
     ir.ImageWrite(handle, body, value, {});
 }
 
-void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst) {
+void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format,
+                                    const GcnInst& inst) {
     const auto& mtbuf = inst.control.mtbuf;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
@@ -254,7 +255,8 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst
     const IR::Value handle =
         ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
                               ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
-    const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info);
+    const IR::Value value = is_format ? ir.LoadBufferFormat(num_dwords, handle, address, info)
+                                      : ir.LoadBuffer(num_dwords, handle, address, info);
     const IR::VectorReg dst_reg{inst.src[1].code};
     if (num_dwords == 1) {
         ir.SetVectorReg(dst_reg, IR::F32{value});
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 5dabbb4c..cd4fdaa2 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -327,6 +327,22 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad
     }
 }
 
+Value IREmitter::LoadBufferFormat(int num_dwords, const Value& handle, const Value& address,
+                                  BufferInstInfo info) {
+    switch (num_dwords) {
+    case 1:
+        return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address);
+    case 2:
+        return Inst(Opcode::LoadBufferFormatF32x2, Flags{info}, handle, address);
+    case 3:
+        return Inst(Opcode::LoadBufferFormatF32x3, Flags{info}, handle, address);
+    case 4:
+        return Inst(Opcode::LoadBufferFormatF32x4, Flags{info}, handle, address);
+    default:
+        UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
+    }
+}
+
 void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address,
                             const Value& data, BufferInstInfo info) {
     switch (num_dwords) {
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index 5d6fd714..e7512430 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -89,6 +89,8 @@ public:
 
     [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address,
                                    BufferInstInfo info);
+    [[nodiscard]] Value LoadBufferFormat(int num_dwords, const Value& handle, const Value& address,
+                                         BufferInstInfo info);
     void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data,
                      BufferInstInfo info);
 
diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc
index 94ef1784..9aefc8b3 100644
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@@ -79,6 +79,10 @@ OPCODE(LoadBufferF32,                                       F32,            Opaq
 OPCODE(LoadBufferF32x2,                                     F32x2,          Opaque,         Opaque,                                                         )
 OPCODE(LoadBufferF32x3,                                     F32x3,          Opaque,         Opaque,                                                         )
 OPCODE(LoadBufferF32x4,                                     F32x4,          Opaque,         Opaque,                                                         )
+OPCODE(LoadBufferFormatF32,                                 F32,            Opaque,         Opaque,                                                         )
+OPCODE(LoadBufferFormatF32x2,                               F32x2,          Opaque,         Opaque,                                                         )
+OPCODE(LoadBufferFormatF32x3,                               F32x3,          Opaque,         Opaque,                                                         )
+OPCODE(LoadBufferFormatF32x4,                               F32x4,          Opaque,         Opaque,                                                         )
 OPCODE(LoadBufferU32,                                       U32,            Opaque,         Opaque,                                                         )
 OPCODE(StoreBufferF32,                                      Void,           Opaque,         Opaque,         F32,                                            )
 OPCODE(StoreBufferF32x2,                                    Void,           Opaque,         Opaque,         F32x2,                                          )
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index b7d6a722..f58b4d96 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -27,6 +27,10 @@ bool IsBufferInstruction(const IR::Inst& inst) {
     case IR::Opcode::LoadBufferF32x2:
     case IR::Opcode::LoadBufferF32x3:
     case IR::Opcode::LoadBufferF32x4:
+    case IR::Opcode::LoadBufferFormatF32:
+    case IR::Opcode::LoadBufferFormatF32x2:
+    case IR::Opcode::LoadBufferFormatF32x3:
+    case IR::Opcode::LoadBufferFormatF32x4:
     case IR::Opcode::LoadBufferU32:
     case IR::Opcode::ReadConstBuffer:
     case IR::Opcode::ReadConstBufferU32:
@@ -41,8 +45,49 @@ bool IsBufferInstruction(const IR::Inst& inst) {
     }
 }
 
-IR::Type BufferDataType(const IR::Inst& inst) {
+static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
+    switch (num_format) {
+    case AmdGpu::NumberFormat::Float:
+        switch (data_format) {
+        case AmdGpu::DataFormat::Format16:
+        case AmdGpu::DataFormat::Format16_16:
+        case AmdGpu::DataFormat::Format16_16_16_16:
+            return true;
+        default:
+            return false;
+        }
+    case AmdGpu::NumberFormat::Unorm:
+    case AmdGpu::NumberFormat::Snorm:
+    case AmdGpu::NumberFormat::Uscaled:
+    case AmdGpu::NumberFormat::Sscaled:
+    case AmdGpu::NumberFormat::Uint:
+    case AmdGpu::NumberFormat::Sint:
+    case AmdGpu::NumberFormat::SnormNz:
+    default:
+        return false;
+    }
+}
+
+IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
     switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadBufferFormatF32:
+    case IR::Opcode::LoadBufferFormatF32x2:
+    case IR::Opcode::LoadBufferFormatF32x3:
+    case IR::Opcode::LoadBufferFormatF32x4:
+        switch (num_format) {
+        case AmdGpu::NumberFormat::Unorm:
+        case AmdGpu::NumberFormat::Snorm:
+        case AmdGpu::NumberFormat::Uscaled:
+        case AmdGpu::NumberFormat::Sscaled:
+        case AmdGpu::NumberFormat::Uint:
+        case AmdGpu::NumberFormat::Sint:
+        case AmdGpu::NumberFormat::SnormNz:
+            return IR::Type::U32;
+        case AmdGpu::NumberFormat::Float:
+            return IR::Type::F32;
+        default:
+            UNREACHABLE();
+        }
     case IR::Opcode::LoadBufferF32:
     case IR::Opcode::LoadBufferF32x2:
     case IR::Opcode::LoadBufferF32x3:
@@ -141,7 +186,7 @@ public:
                    desc.inline_cbuf == existing.inline_cbuf;
         })};
         auto& buffer = buffer_resources[index];
-        ASSERT(buffer.stride == desc.stride && buffer.num_records == desc.num_records);
+        ASSERT(buffer.length == desc.length);
         buffer.is_storage |= desc.is_storage;
         buffer.used_types |= desc.used_types;
         return index;
@@ -263,6 +308,41 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
 
 static constexpr size_t MaxUboSize = 65536;
 
+static bool IsLoadBufferFormat(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadBufferFormatF32:
+    case IR::Opcode::LoadBufferFormatF32x2:
+    case IR::Opcode::LoadBufferFormatF32x3:
+    case IR::Opcode::LoadBufferFormatF32x4:
+        return true;
+    default:
+        return false;
+    }
+}
+
+static bool IsReadConstBuffer(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::ReadConstBuffer:
+    case IR::Opcode::ReadConstBufferU32:
+        return true;
+    default:
+        return false;
+    }
+}
+
+static u32 BufferLength(const AmdGpu::Buffer& buffer) {
+    const auto stride = buffer.GetStride();
+    if (stride < sizeof(f32)) {
+        ASSERT(stride != 0 && sizeof(f32) % stride == 0); // stride 0 would divide by zero
+        return (((buffer.num_records - 1) / sizeof(f32)) + 1) * stride;
+    } else if (stride == sizeof(f32)) {
+        return buffer.num_records; // one array element per record
+    } else {
+        ASSERT(stride % sizeof(f32) == 0);
+        return buffer.num_records * (stride / sizeof(f32));
+    }
+}
+
 s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
                         AmdGpu::Buffer& cbuf) {
 
@@ -298,9 +378,8 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
     return descriptors.Add(BufferResource{
         .sgpr_base = std::numeric_limits<u32>::max(),
         .dword_offset = 0,
-        .stride = cbuf.GetStride(),
-        .num_records = u32(cbuf.num_records),
-        .used_types = BufferDataType(inst),
+        .length = BufferLength(cbuf),
+        .used_types = BufferDataType(inst, cbuf.GetNumberFmt()),
         .inline_cbuf = cbuf,
         .is_storage = IsBufferStore(inst) || cbuf.GetSize() > MaxUboSize,
     });
@@ -318,9 +397,8 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
         binding = descriptors.Add(BufferResource{
             .sgpr_base = sharp.sgpr_base,
             .dword_offset = sharp.dword_offset,
-            .stride = buffer.GetStride(),
-            .num_records = u32(buffer.num_records),
-            .used_types = BufferDataType(inst),
+            .length = BufferLength(buffer),
+            .used_types = BufferDataType(inst, buffer.GetNumberFmt()),
             .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize,
         });
     }
@@ -337,24 +415,31 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
                 inst_info.dmft == AmdGpu::DataFormat::Format32_32 ||
                 inst_info.dmft == AmdGpu::DataFormat::Format32));
     }
-    if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer ||
-        inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) {
+
+    if (IsReadConstBuffer(inst)) {
         return;
     }
-    // Calculate buffer address.
-    const u32 dword_stride = buffer.GetStrideElements(sizeof(u32));
-    const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32);
-    IR::U32 address = ir.Imm32(dword_offset);
-    if (inst_info.index_enable && inst_info.offset_enable) {
-        const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 1)};
-        const IR::U32 index{ir.CompositeExtract(inst.Arg(1), 0)};
-        address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address);
-        address = ir.IAdd(address, ir.ShiftRightLogical(offset, ir.Imm32(2)));
-    } else if (inst_info.index_enable) {
-        const IR::U32 index{inst.Arg(1)};
-        address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address);
-    } else if (inst_info.offset_enable) {
-        const IR::U32 offset{inst.Arg(1)};
+
+    if (IsLoadBufferFormat(inst)) {
+        if (UseFP16(buffer.GetDataFmt(), buffer.GetNumberFmt())) {
+            info.uses_fp16 = true;
+        }
+    } else {
+        const u32 stride = buffer.GetStride();
+        ASSERT_MSG(stride >= 4, "non-formatting load_buffer_* is not implemented for stride {}",
+                   stride);
+    }
+
+    IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
+    if (inst_info.index_enable) {
+        const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
+                                                      : IR::U32{inst.Arg(1)};
+        address = ir.IAdd(address, ir.IMul(index, ir.Imm32(buffer.GetStride())));
+    }
+    if (inst_info.offset_enable) {
+        const IR::U32 offset = inst_info.index_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
+                                                      : IR::U32{inst.Arg(1)};
+        address = ir.IAdd(address, offset);
     }
     inst.SetArg(1, address);
 }
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 2b2103ca..8824e344 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -74,8 +74,7 @@ struct Info;
 struct BufferResource {
     u32 sgpr_base;
     u32 dword_offset;
-    u32 stride;
-    u32 num_records;
+    u32 length;
     IR::Type used_types;
     AmdGpu::Buffer inline_cbuf;
     bool is_storage{false};
diff --git a/src/video_core/amdgpu/pixel_format.cpp b/src/video_core/amdgpu/pixel_format.cpp
index 5f6eb903..6618e72a 100644
--- a/src/video_core/amdgpu/pixel_format.cpp
+++ b/src/video_core/amdgpu/pixel_format.cpp
@@ -66,4 +66,110 @@ int NumBits(DataFormat format) {
     return num_bits_per_element[index];
 }
 
+static constexpr std::array component_bits = { // Per-component bit widths, indexed by DataFormat.
+    std::array{0, 0, 0, 0},     //  0 FormatInvalid
+    std::array{8, 0, 0, 0},     //  1 Format8
+    std::array{16, 0, 0, 0},    //  2 Format16
+    std::array{8, 8, 0, 0},     //  3 Format8_8
+    std::array{32, 0, 0, 0},    //  4 Format32
+    std::array{16, 16, 0, 0},   //  5 Format16_16
+    std::array{10, 11, 11, 0},  //  6 Format10_11_11
+    std::array{11, 11, 10, 0},  //  7 Format11_11_10
+    std::array{10, 10, 10, 2},  //  8 Format10_10_10_2
+    std::array{2, 10, 10, 10},  //  9 Format2_10_10_10
+    std::array{8, 8, 8, 8},     // 10 Format8_8_8_8
+    std::array{32, 32, 0, 0},   // 11 Format32_32
+    std::array{16, 16, 16, 16}, // 12 Format16_16_16_16
+    std::array{32, 32, 32, 0},  // 13 Format32_32_32
+    std::array{32, 32, 32, 32}, // 14 Format32_32_32_32
+    std::array{0, 0, 0, 0},     // 15 (no format named for this value)
+    std::array{5, 6, 5, 0},     // 16 Format5_6_5
+    std::array{1, 5, 5, 5},     // 17 Format1_5_5_5
+    std::array{5, 5, 5, 1},     // 18 Format5_5_5_1
+    std::array{4, 4, 4, 4},     // 19 Format4_4_4_4
+    std::array{8, 24, 0, 0},    // 20 Format8_24
+    std::array{24, 8, 0, 0},    // 21 Format24_8
+    std::array{24, 8, 0, 0},    // 22 FormatX24_8_32 (NOTE(review): no 32-bit part listed; confirm)
+    std::array{0, 0, 0, 0},     // 23 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 24 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 25 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 26 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 27 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 28 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 29 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 30 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 31 (no format named for this value)
+    std::array{0, 0, 0, 0},     // 32 FormatGB_GR (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 33 FormatBG_RG (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 34 Format5_9_9_9 (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 35 FormatBc1 (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 36 FormatBc2 (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 37 FormatBc3 (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 38 FormatBc4 (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 39 FormatBc5 (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 40 FormatBc6 (widths not described; all zero)
+    std::array{0, 0, 0, 0},     // 41 FormatBc7 (widths not described; all zero)
+};
+
+// Returns the bit width of component `comp` (0-3) of `format`.
+// Yields 0 when the format value or the component index lies outside the table.
+u32 ComponentBits(DataFormat format, u32 comp) {
+    const u32 row = static_cast<u32>(format);
+    const bool in_table = row < component_bits.size() && comp < 4;
+    return in_table ? component_bits[row][comp] : 0;
+}
+
+static constexpr std::array component_offset = { // Per-component bit offsets; -1 = no component.
+    std::array{-1, -1, -1, -1}, //  0 FormatInvalid
+    std::array{0, -1, -1, -1},  //  1 Format8
+    std::array{0, -1, -1, -1},  //  2 Format16
+    std::array{0, 8, -1, -1},   //  3 Format8_8
+    std::array{0, -1, -1, -1},  //  4 Format32
+    std::array{0, 16, -1, -1},  //  5 Format16_16
+    std::array{0, 10, 21, -1},  //  6 Format10_11_11
+    std::array{0, 11, 22, -1},  //  7 Format11_11_10
+    std::array{0, 10, 20, 30},  //  8 Format10_10_10_2
+    std::array{0, 2, 12, 22},   //  9 Format2_10_10_10
+    std::array{0, 8, 16, 24},   // 10 Format8_8_8_8
+    std::array{0, 32, -1, -1},  // 11 Format32_32
+    std::array{0, 16, 32, 48},  // 12 Format16_16_16_16
+    std::array{0, 32, 64, -1},  // 13 Format32_32_32
+    std::array{0, 32, 64, 96},  // 14 Format32_32_32_32
+    std::array{-1, -1, -1, -1}, // 15 (no format named for this value)
+    std::array{0, 5, 11, -1},   // 16 Format5_6_5
+    std::array{0, 1, 6, 11},    // 17 Format1_5_5_5
+    std::array{0, 5, 10, 15},   // 18 Format5_5_5_1
+    std::array{0, 4, 8, 12},    // 19 Format4_4_4_4
+    std::array{0, 8, -1, -1},   // 20 Format8_24
+    std::array{0, 24, -1, -1},  // 21 Format24_8
+    std::array{0, 24, -1, -1},  // 22 FormatX24_8_32 (NOTE(review): no 32-bit part listed; confirm)
+    std::array{-1, -1, -1, -1}, // 23 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 24 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 25 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 26 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 27 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 28 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 29 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 30 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 31 (no format named for this value)
+    std::array{-1, -1, -1, -1}, // 32 FormatGB_GR (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 33 FormatBG_RG (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 34 Format5_9_9_9 (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 35 FormatBc1 (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 36 FormatBc2 (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 37 FormatBc3 (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 38 FormatBc4 (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 39 FormatBc5 (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 40 FormatBc6 (offsets not described; all -1)
+    std::array{-1, -1, -1, -1}, // 41 FormatBc7 (offsets not described; all -1)
+};
+
+// Returns the starting bit position of component `comp` (0-3) within `format`.
+// Yields -1 when the format value or the component index lies outside the table.
+s32 ComponentOffset(DataFormat format, u32 comp) {
+    const u32 row = static_cast<u32>(format);
+    const bool in_table = row < component_offset.size() && comp < 4;
+    return in_table ? component_offset[row][comp] : -1;
+}
+
 } // namespace AmdGpu
diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h
index 22d102af..2a38c5a0 100644
--- a/src/video_core/amdgpu/pixel_format.h
+++ b/src/video_core/amdgpu/pixel_format.h
@@ -65,6 +65,8 @@ enum class NumberFormat : u32 {
 
 int NumComponents(DataFormat format);
 int NumBits(DataFormat format);
+u32 ComponentBits(DataFormat format, u32 comp);   // Bit width of component `comp`; 0 if unknown.
+s32 ComponentOffset(DataFormat format, u32 comp); // Bit offset of component `comp`; -1 if unknown.
 
 } // namespace AmdGpu
 
diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h
index ba3de154..1247c025 100644
--- a/src/video_core/amdgpu/resource.h
+++ b/src/video_core/amdgpu/resource.h
@@ -62,14 +62,6 @@ struct Buffer {
         return stride == 0 ? 1U : stride;
     }
 
-    u32 GetStrideElements(u32 element_size) const noexcept {
-        if (stride == 0) {
-            return 1U;
-        }
-        ASSERT(stride % element_size == 0);
-        return stride / element_size;
-    }
-
     u32 GetSize() const noexcept {
         return GetStride() * num_records;
     }