Initial instancing and asynchronous compute queues (#207)

* gnm_driver: added `sceGnmRegisterOwner` and `sceGnmRegisterResource`
* video_out: `sceVideoOutGetDeviceCapabilityInfo` for sdk runtime
* gnm_driver: correct vqid index range
* amdgpu: indirect buffer, release mem and some additional irq modes
* amdgpu: added ASC commands processor
* shader_recompiler: added support for fetch instance id
* amdgpu: classic bitfields for T# representation (debugging experience)
* renderer_vulkan: skip zero sized VBs from binding
* texture_cache: image upload logic moved into `Image` object
* gnm_driver: `sceGnmDingDong` implementation
* texture_cache: `Image` usage flags moved; correct VO buffer pitch
2024-06-22 18:50:20 +02:00 · 2024-06-22 18:50:20 +02:00 · cb6b21de1f
parent a9cbd8287c
commit cb6b21de1f
19 changed files with 361 additions and 100 deletions
--- a/src/core/libraries/error_codes.h
+++ b/src/core/libraries/error_codes.h
@ -252,6 +252,7 @@ constexpr int ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_QUEUE_ID = 0x80D17001;
 constexpr int ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_RING_BASE_ADDR = 0x80D17003;
 constexpr int ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_RING_SIZE = 0x80D17002;
 constexpr int ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_READ_PTR_ADDR = 0x80D17004;
+constexpr int ORBIS_GNM_ERROR_FAILURE = 0x8EEE00FF; // Returned by the sceGnmRegisterOwner/sceGnmRegisterResource stubs

 // Generic
 constexpr int ORBIS_OK = 0x00000000;
--- a/src/core/libraries/gnmdriver/gnmdriver.cpp
+++ b/src/core/libraries/gnmdriver/gnmdriver.cpp
@ -31,6 +31,7 @@ static constexpr bool g_fair_hw_init = false;

 // In case if `submitDone` is issued we need to block submissions until GPU idle
 static u32 submission_lock{};
+static std::mutex m_submission{}; // Held by sceGnmDingDong around its submission_lock check and ASC submit
 static u64 frames_submitted{}; // frame counter

 struct AscQueueInfo {
@ -211,9 +212,32 @@ int PS4_SYSV_ABI sceGnmDestroyWorkloadStream() {
    return ORBIS_OK;
 }

-int PS4_SYSV_ABI sceGnmDingDong() {
-    LOG_ERROR(Lib_GnmDriver, "(STUBBED) called");
-    return ORBIS_OK;
+void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw) { // Submits the pending part of a mapped compute queue's ring
+    LOG_INFO(Lib_GnmDriver, "vqid {}, offset_dw {}", gnm_vqid, next_offs_dw);
+
+    if (gnm_vqid == 0) { // ids handed out by sceGnmMapComputeQueue start at 1, so 0 names no queue
+        return;
+    }
+
+    std::unique_lock lock{m_submission}; // held until return
+    if (submission_lock != 0) { // submissions were blocked earlier: wait for the GPU to go idle first
+        liverpool->WaitGpuIdle();
+
+        // Suspend logic goes here
+
+        submission_lock = 0;
+    }
+
+    auto vqid = gnm_vqid - 1; // back to the zero-based index used by asc_queues
+    auto& asc_queue = asc_queues[{vqid}]; // NOTE(review): no check that this id was actually mapped
+    const auto* acb_ptr = reinterpret_cast<const u32*>(asc_queue.map_addr + *asc_queue.read_addr); // *read_addr is used as a byte offset here
+    const auto acb_size = next_offs_dw ? (next_offs_dw << 2u) - *asc_queue.read_addr // bytes up to next_offs_dw...
+                                       : (asc_queue.ring_size_dw << 2u) - *asc_queue.read_addr; // ...or to the ring end when it is 0. NOTE(review): wraps around if next_offs_dw * 4 < *read_addr - confirm
+
+    liverpool->SubmitAsc(vqid, {acb_ptr, acb_size >> 2u}); // span length is in dwords
+
+    *asc_queue.read_addr += acb_size; // advance the read offset past the submitted range
+    *asc_queue.read_addr %= asc_queue.ring_size_dw * 4; // wrap at the ring size in bytes
 }

 int PS4_SYSV_ABI sceGnmDingDongForWorkload() {
@ -764,10 +788,12 @@ int PS4_SYSV_ABI sceGnmMapComputeQueue(u32 pipe_id, u32 queue_id, VAddr ring_bas
    }

    auto vqid = asc_queues.insert(VAddr(ring_base_addr), read_ptr_addr, ring_size_dw);
+    // We need to offset index as `dingDong` assumes it to be from the range [1..64]
+    const auto gnm_vqid = vqid.index + 1;
    LOG_INFO(Lib_GnmDriver, "ASC pipe {} queue {} mapped to vqueue {}", pipe_id, queue_id,
-             vqid.index);
+             gnm_vqid);

-    return vqid.index;
+    return gnm_vqid;
 }

 int PS4_SYSV_ABI sceGnmMapComputeQueueWithPriority(u32 pipe_id, u32 queue_id, VAddr ring_base_addr,
@ -814,14 +840,16 @@ int PS4_SYSV_ABI sceGnmRegisterGnmLiveCallbackConfig() {
    return ORBIS_OK;
 }

-int PS4_SYSV_ABI sceGnmRegisterOwner() {
-    LOG_ERROR(Lib_GnmDriver, "(STUBBED) called");
-    return ORBIS_OK;
+s32 PS4_SYSV_ABI sceGnmRegisterOwner(void* handle, const char* name) { // Both arguments are ignored
+    LOG_TRACE(Lib_GnmDriver, "called");
+    return ORBIS_GNM_ERROR_FAILURE; // Always fails: PA Debug is always disabled in retail FW
 }

-int PS4_SYSV_ABI sceGnmRegisterResource() {
-    LOG_ERROR(Lib_GnmDriver, "(STUBBED) called");
-    return ORBIS_OK;
+s32 PS4_SYSV_ABI sceGnmRegisterResource(void* res_handle, void* owner_handle, const void* addr,
+                                        size_t size, const char* name, int res_type,
+                                        u64 user_data) { // All arguments are ignored
+    LOG_TRACE(Lib_GnmDriver, "called");
+    return ORBIS_GNM_ERROR_FAILURE; // Always fails: PA Debug is always disabled in retail FW
 }

 int PS4_SYSV_ABI sceGnmRequestFlipAndSubmitDone() {
--- a/src/core/libraries/gnmdriver/gnmdriver.h
+++ b/src/core/libraries/gnmdriver/gnmdriver.h
@ -33,7 +33,7 @@ int PS4_SYSV_ABI sceGnmDebuggerWriteSqIndirectRegister();
 int PS4_SYSV_ABI sceGnmDebugHardwareStatus();
 s32 PS4_SYSV_ABI sceGnmDeleteEqEvent(SceKernelEqueue eq, u64 id);
 int PS4_SYSV_ABI sceGnmDestroyWorkloadStream();
-int PS4_SYSV_ABI sceGnmDingDong();
+void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw);
 int PS4_SYSV_ABI sceGnmDingDongForWorkload();
 int PS4_SYSV_ABI sceGnmDisableMipStatsReport();
 s32 PS4_SYSV_ABI sceGnmDispatchDirect(u32* cmdbuf, u32 size, u32 threads_x, u32 threads_y,
@ -125,8 +125,9 @@ int PS4_SYSV_ABI sceGnmQueryResourceRegistrationUserMemoryRequirements();
 int PS4_SYSV_ABI sceGnmRaiseUserExceptionEvent();
 int PS4_SYSV_ABI sceGnmRegisterGdsResource();
 int PS4_SYSV_ABI sceGnmRegisterGnmLiveCallbackConfig();
-int PS4_SYSV_ABI sceGnmRegisterOwner();
-int PS4_SYSV_ABI sceGnmRegisterResource();
+s32 PS4_SYSV_ABI sceGnmRegisterOwner(void* handle, const char* name);
+s32 PS4_SYSV_ABI sceGnmRegisterResource(void* res_handle, void* owner_handle, const void* addr,
+                                        size_t size, const char* name, int res_type, u64 user_data);
 int PS4_SYSV_ABI sceGnmRequestFlipAndSubmitDone();
 int PS4_SYSV_ABI sceGnmRequestFlipAndSubmitDoneForWorkload();
 int PS4_SYSV_ABI sceGnmRequestMipStatsReportAndReset();
--- a/src/core/libraries/videoout/video_out.cpp
+++ b/src/core/libraries/videoout/video_out.cpp
@ -288,6 +288,8 @@ void RegisterLib(Core::Loader::SymbolsResolver* sym) {
    LIB_FUNCTION("uquVH4-Du78", "libSceVideoOut", 1, "libSceVideoOut", 0, 0, sceVideoOutClose);
    LIB_FUNCTION("1FZBKy8HeNU", "libSceVideoOut", 1, "libSceVideoOut", 0, 0,
                 sceVideoOutGetVblankStatus);
+    LIB_FUNCTION("kGVLc3htQE8", "libSceVideoOut", 1, "libSceVideoOut", 0, 0,
+                 sceVideoOutGetDeviceCapabilityInfo);

    // openOrbis appears to have libSceVideoOut_v1 module libSceVideoOut_v1.1
    LIB_FUNCTION("Up36PTk687E", "libSceVideoOut", 1, "libSceVideoOut", 1, 1, sceVideoOutOpen);
--- a/src/shader_recompiler/frontend/fetch_shader.cpp
+++ b/src/shader_recompiler/frontend/fetch_shader.cpp
@ -72,6 +72,9 @@ std::vector<VertexAttribute> ParseFetchShader(const u32* code) {
            attrib.sgpr_base = it->base_sgpr;
            attrib.dword_offset = it->dword_offset;

+            // Store instance id rate
+            attrib.instance_data = inst.src[0].code;
+
            // Mark load as used.
            it->dst_reg = -1;
        }
--- a/src/shader_recompiler/frontend/fetch_shader.h
+++ b/src/shader_recompiler/frontend/fetch_shader.h
@ -9,11 +9,12 @@
 namespace Shader::Gcn {

 struct VertexAttribute {
-    u8 semantic;     ///< Semantic index of the attribute
-    u8 dest_vgpr;    ///< Destination VGPR to load first component.
-    u8 num_elements; ///< Number of components to load
-    u8 sgpr_base;    ///< SGPR that contains the pointer to the list of vertex V#
-    u8 dword_offset; ///< The dword offset of the V# that describes this attribute.
+    u8 semantic;      ///< Semantic index of the attribute
+    u8 dest_vgpr;     ///< Destination VGPR to load first component.
+    u8 num_elements;  ///< Number of components to load
+    u8 sgpr_base;     ///< SGPR that contains the pointer to the list of vertex V#
+    u8 dword_offset;  ///< The dword offset of the V# that describes this attribute.
+    u8 instance_data; ///< First source operand code of the load; later cast to Info::VsInput::InstanceIdType (0 = fetched per vertex)
 };

 std::vector<VertexAttribute> ParseFetchShader(const u32* code);
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@ -194,6 +194,11 @@ void Translator::EmitFetch(const GcnInst& inst) {
            ir.SetVectorReg(dst_reg++, ir.GetAttribute(attr, i));
        }

+        if (attrib.instance_data == 2 || attrib.instance_data == 3) {
+            LOG_WARNING(Render_Recompiler, "Unsupported instance step rate = {}",
+                        attrib.instance_data);
+        }
+
        // Read the V# of the attribute to figure out component number and type.
        const auto buffer = info.ReadUd<AmdGpu::Buffer>(attrib.sgpr_base, attrib.dword_offset);
        const u32 num_components = AmdGpu::NumComponents(buffer.data_format);
@ -203,6 +208,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
            .num_components = std::min<u16>(attrib.num_elements, num_components),
            .sgpr_base = attrib.sgpr_base,
            .dword_offset = attrib.dword_offset,
+            .instance_step_rate = static_cast<Info::VsInput::InstanceIdType>(attrib.instance_data),
        });
    }
 }
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@ -270,8 +270,8 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
    u32 image_binding = descriptors.Add(ImageResource{
        .sgpr_base = tsharp.sgpr_base,
        .dword_offset = tsharp.dword_offset,
-        .type = image.type,
-        .nfmt = static_cast<AmdGpu::NumberFormat>(image.num_format.Value()),
+        .type = image.GetType(),
+        .nfmt = static_cast<AmdGpu::NumberFormat>(image.GetNumberFmt()),
        .is_storage = IsImageStorageInstruction(inst),
        .is_depth = bool(inst_info.is_depth),
    });
@ -293,7 +293,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
    // Now that we know the image type, adjust texture coordinate vector.
    const IR::Inst* body = inst.Arg(1).InstRecursive();
    const auto [coords, arg] = [&] -> std::pair<IR::Value, IR::Value> {
-        switch (image.type) {
+        switch (image.GetType()) {
        case AmdGpu::ImageType::Color1D:
            return {body->Arg(0), body->Arg(1)};
        case AmdGpu::ImageType::Color1DArray:
@ -305,7 +305,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
        case AmdGpu::ImageType::Cube:
            return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)};
        default:
-            UNREACHABLE_MSG("Unknown image type {}", image.type.Value());
+            UNREACHABLE_MSG("Unknown image type {}", image.GetType());
        }
    }();
    inst.SetArg(1, coords);
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@ -72,11 +72,19 @@ using SamplerResourceList = boost::container::static_vector<SamplerResource, 8>;

 struct Info {
    struct VsInput {
+        enum InstanceIdType : u8 { // Numeric values mirror VertexAttribute::instance_data, which is cast to this type
+            None = 0, // bound with per-vertex input rate by the graphics pipeline
+            OverStepRate0 = 1, // this and the values below are bound with per-instance input rate
+            OverStepRate1 = 2, // logged as an unsupported step rate by Translator::EmitFetch
+            Plain = 3, // logged as an unsupported step rate by Translator::EmitFetch
+        };
+
        AmdGpu::NumberFormat fmt;
        u16 binding;
        u16 num_components;
        u8 sgpr_base;
        u8 dword_offset;
+        InstanceIdType instance_step_rate; // filled from the fetch shader attribute in Translator::EmitFetch
    };
    boost::container::static_vector<VsInput, 32> vs_inputs{};

--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@ -12,7 +12,7 @@ namespace AmdGpu {

 static const char* dcb_task_name{"DCB_TASK"};
 static const char* ccb_task_name{"CCB_TASK"};
-static const char* asc_task_name{"ACB_TASK"};
+static const char* acb_task_name{"ACB_TASK"};

 std::array<u8, 48_KB> Liverpool::ConstantEngine::constants_heap;

@ -381,6 +381,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
 }

 Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb) {
+    TracyFiberEnter(acb_task_name);
+
    while (!acb.empty()) {
        const auto* header = reinterpret_cast<const PM4Header*>(acb.data());
        const u32 type = header->type;
@ -393,6 +395,69 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb) {
        const PM4ItOpcode opcode = header->type3.opcode;
        const auto* it_body = reinterpret_cast<const u32*>(header) + 1;
        switch (opcode) {
+        case PM4ItOpcode::Nop: {
+            const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
+            break;
+        }
+        case PM4ItOpcode::IndirectBuffer: {
+            const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
+            auto task =
+                ProcessCompute({indirect_buffer->Address<const u32>(), indirect_buffer->ib_size});
+            while (!task.handle.done()) {
+                task.handle.resume();
+
+                TracyFiberLeave;
+                co_yield {};
+                TracyFiberEnter(acb_task_name);
+            };
+            break;
+        }
+        case PM4ItOpcode::AcquireMem: {
+            break;
+        }
+        case PM4ItOpcode::SetShReg: {
+            const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
+            std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
+                        (count - 1) * sizeof(u32));
+            break;
+        }
+        case PM4ItOpcode::DispatchDirect: {
+            const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
+            regs.cs_program.dim_x = dispatch_direct->dim_x;
+            regs.cs_program.dim_y = dispatch_direct->dim_y;
+            regs.cs_program.dim_z = dispatch_direct->dim_z;
+            regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
+            if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) {
+                rasterizer->DispatchDirect();
+            }
+            break;
+        }
+        case PM4ItOpcode::WriteData: {
+            const auto* write_data = reinterpret_cast<const PM4CmdWriteData*>(header);
+            ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
+            const u32 data_size = (header->type3.count.Value() - 2) * 4;
+            if (!write_data->wr_one_addr.Value()) {
+                std::memcpy(write_data->Address<void*>(), write_data->data, data_size);
+            } else {
+                UNREACHABLE();
+            }
+            break;
+        }
+        case PM4ItOpcode::WaitRegMem: {
+            const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
+            ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
+            while (!wait_reg_mem->Test()) {
+                TracyFiberLeave;
+                co_yield {};
+                TracyFiberEnter(acb_task_name);
+            }
+            break;
+        }
+        case PM4ItOpcode::ReleaseMem: {
+            const auto* release_mem = reinterpret_cast<const PM4CmdReleaseMem*>(header);
+            release_mem->SignalFence(Platform::InterruptId::Compute0RelMem); // <---
+            break;
+        }
        default:
            UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
                            static_cast<u32>(opcode), count);
@ -401,7 +466,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb) {
        acb = acb.subspan(header->type3.NumWords() + 1);
    }

-    return {}; // Not a coroutine yet
+    TracyFiberLeave;
 }

 void Liverpool::SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
--- a/src/video_core/amdgpu/pm4_cmds.h
+++ b/src/video_core/amdgpu/pm4_cmds.h
@ -265,6 +265,7 @@ enum class InterruptSelect : u32 {
    None = 0,
    IrqOnly = 1,
    IrqWhenWriteConfirm = 2,
+    IrqUndocumented = 3,
 };

 struct PM4CmdEventWriteEop {
@ -299,6 +300,9 @@ struct PM4CmdEventWriteEop {

    void SignalFence() const {
        switch (data_sel.Value()) {
+        case DataSelect::None: {
+            break;
+        }
        case DataSelect::Data32Low: {
            *Address<u32>() = DataDWord();
            break;
@ -321,6 +325,9 @@ struct PM4CmdEventWriteEop {
            // No interrupt
            break;
        }
+        case InterruptSelect::IrqOnly:
+            ASSERT(data_sel == DataSelect::None);
+            [[fallthrough]];
        case InterruptSelect::IrqWhenWriteConfirm: {
            Platform::IrqC::Instance()->Signal(Platform::InterruptId::GfxEop);
            break;
@ -559,4 +566,105 @@ struct PM4CmdDrawIndexBase {
    u32 addr_hi;
 };

+struct PM4CmdIndirectBuffer { // Packet layout read by ProcessCompute for PM4ItOpcode::IndirectBuffer
+    PM4Type3Header header;
+    u32 ibase_lo; ///< Indirect buffer base address, must be 4 byte aligned
+    union {
+        BitField<0, 16, u32> ibase_hi; ///< Indirect buffer base address
+        u32 dw1;
+    };
+    union {
+        BitField<0, 20, u32> ib_size; ///< Indirect buffer size; used as a count of u32 words by ProcessCompute
+        BitField<20, 1, u32> chain;   ///< set to chain to IB allocations
+        BitField<24, 8, u32> vmid;    ///< Virtual memory domain ID for command buffer
+        u32 dw2;
+    };
+
+    template <typename T>
+    T* Address() const { // Joins ibase_hi (upper 32 bits) and ibase_lo into one pointer
+        return reinterpret_cast<T*>((u64(ibase_hi) << 32u) | ibase_lo);
+    }
+};
+
+struct PM4CmdReleaseMem { // Packet layout read by ProcessCompute for PM4ItOpcode::ReleaseMem
+    PM4Type3Header header;
+    union {
+        BitField<0, 6, u32> event_type;  ///< Event type written to VGT_EVENT_INITIATOR
+        BitField<8, 4, u32> event_index; ///< Event index
+        BitField<12, 1, u32> tcl1_vol_action_ena;
+        BitField<13, 1, u32> tc_vol_action_ena;
+        BitField<15, 1, u32> tc_wb_action_ena;
+        BitField<16, 1, u32> tcl1__action_ena; // NOTE(review): double underscore is a reserved identifier form in C++; likely meant tcl1_action_ena
+        BitField<17, 1, u32> tc_action_ena;
+        BitField<25, 2, u32> cache_policy; ///< Cache Policy setting used for writing fences and
+                                           ///< timestamps to the TCL2
+        u32 dw1;
+    };
+    union {
+        BitField<16, 2, u32> dst_sel;             ///< destination select
+        BitField<24, 3, InterruptSelect> int_sel; ///< selects interrupt action for end-of-pipe
+        BitField<29, 3, DataSelect> data_sel;     ///< selects source of data
+        u32 dw2;
+    };
+    u32 address_lo; ///< low bits of address
+    u32 address_hi; ///< high bits of address
+    union {
+        struct {
+            u16 gds_index; ///< Byte offset into GDS to copy from
+            u16 num_dw;    ///< Number of DWORDS of GDS to copy
+        };
+        u32 data_lo; ///< value that will be written to memory when event occurs
+    };
+    u32 data_hi;
+
+    template <typename T>
+    T* Address() const { // address_hi:address_lo as a pointer
+        return reinterpret_cast<T*>(address_lo | u64(address_hi) << 32);
+    }
+
+    u32 DataDWord() const { // low 32 bits of the fence value
+        return data_lo;
+    }
+
+    u64 DataQWord() const { // data_hi:data_lo as one 64-bit value
+        return data_lo | u64(data_hi) << 32;
+    }
+
+    void SignalFence(Platform::InterruptId irq_id) const { // Writes the value chosen by data_sel to Address(), then signals irq_id if int_sel asks for it
+        switch (data_sel.Value()) {
+        case DataSelect::Data32Low: {
+            *Address<u32>() = DataDWord();
+            break;
+        }
+        case DataSelect::Data64: {
+            *Address<u64>() = DataQWord();
+            break;
+        }
+        case DataSelect::PerfCounter: {
+            *Address<u64>() = Common::FencedRDTSC();
+            break;
+        }
+        default: { // every other data_sel, DataSelect::None included, is unhandled for this packet
+            UNREACHABLE();
+        }
+        }
+
+        switch (int_sel.Value()) {
+        case InterruptSelect::None: {
+            // No interrupt
+            break;
+        }
+        case InterruptSelect::IrqUndocumented: // handled exactly like IrqWhenWriteConfirm
+            [[fallthrough]];
+        case InterruptSelect::IrqWhenWriteConfirm: {
+            Platform::IrqC::Instance()->Signal(irq_id);
+            break;
+        }
+        default: { // e.g. InterruptSelect::IrqOnly is unhandled for this packet
+            UNREACHABLE();
+        }
+        }
+    }
+};
+
 } // namespace AmdGpu
--- a/src/video_core/amdgpu/pm4_opcodes.h
+++ b/src/video_core/amdgpu/pm4_opcodes.h
@ -46,6 +46,7 @@ enum class PM4ItOpcode : u32 {
    EventWrite = 0x46,
    EventWriteEop = 0x47,
    EventWriteEos = 0x48,
+    ReleaseMem = 0x49,
    PremableCntl = 0x4A,
    DmaData = 0x50,
    ContextRegRmw = 0x51,
--- a/src/video_core/amdgpu/resource.h
+++ b/src/video_core/amdgpu/resource.h
@ -108,36 +108,39 @@ constexpr std::string_view NameOf(TilingMode type) {
 }

 struct Image {
-    union {
-        BitField<0, 38, u64> base_address;
-        BitField<40, 12, u64> min_lod;
-        BitField<52, 6, u64> data_format;
-        BitField<58, 4, u64> num_format;
-        BitField<62, 2, u64> mtype;
-    };
-    union {
-        BitField<0, 14, u64> width;
-        BitField<14, 14, u64> height;
-        BitField<28, 3, u64> perf_modulation;
-        BitField<31, 1, u64> interlaced;
-        BitField<32, 3, u64> dst_sel_x;
-        BitField<35, 3, u64> dst_sel_y;
-        BitField<38, 3, u64> dst_sel_z;
-        BitField<41, 3, u64> dst_sel_w;
-        BitField<44, 4, u64> base_level;
-        BitField<48, 4, u64> last_level;
-        BitField<52, 5, u64> tiling_index;
-        BitField<57, 1, u64> pow2pad;
-        BitField<58, 1, u64> mtype2;
-        BitField<59, 1, u64> atc;
-        BitField<60, 4, ImageType> type;
-    };
-    union {
-        BitField<0, 13, u64> depth;
-        BitField<13, 14, u64> pitch;
-        BitField<32, 13, u64> base_array;
-        BitField<45, 13, u64> last_array;
-    };
+    u64 base_address : 38;
+    u64 mtype_l2 : 2;
+    u64 min_lod : 12;
+    u64 data_format : 6;
+    u64 num_format : 4;
+    u64 mtype : 2;
+
+    u64 width : 14;
+    u64 height : 14;
+    u64 perf_modulation : 3;
+    u64 interlaced : 1;
+    u64 dst_sel_x : 3;
+    u64 dst_sel_y : 3;
+    u64 dst_sel_z : 3;
+    u64 dst_sel_w : 3;
+    u64 base_level : 4;
+    u64 last_level : 4;
+    u64 tiling_index : 5;
+    u64 pow2pad : 1;
+    u64 mtype2 : 1;
+    u64 atc : 1;
+    u64 type : 4;
+
+    u64 depth : 13;
+    u64 pitch : 14;
+    u64 : 5;
+    u64 base_array : 13;
+    u64 last_array : 13;
+    u64 : 6;
+    u64 min_lod_warn : 12;
+    u64 counter_bank_id : 8;
+    u64 lod_hw_cnt_en : 1;
+    u64 : 43;

    VAddr Address() const {
        return base_address << 8;
@ -148,8 +151,8 @@ struct Image {
    }

    u32 NumLayers() const {
-        u32 slices = type == ImageType::Color3D ? 1 : depth.Value() + 1;
-        if (type == ImageType::Cube) {
+        u32 slices = GetType() == ImageType::Color3D ? 1 : depth + 1;
+        if (GetType() == ImageType::Cube) {
            slices *= 6;
        }
        if (pow2pad) {
@ -159,33 +162,38 @@ struct Image {
    }

    u32 NumLevels() const {
-        if (type == ImageType::Color2DMsaa || type == ImageType::Color2DMsaaArray) {
+        if (GetType() == ImageType::Color2DMsaa || GetType() == ImageType::Color2DMsaaArray) {
            return 1;
        }
        return last_level + 1;
    }

+    ImageType GetType() const noexcept {
+        return static_cast<ImageType>(type);
+    }
+
    DataFormat GetDataFmt() const noexcept {
-        return static_cast<DataFormat>(data_format.Value());
+        return static_cast<DataFormat>(data_format);
    }

    NumberFormat GetNumberFmt() const noexcept {
-        return static_cast<NumberFormat>(num_format.Value());
+        return static_cast<NumberFormat>(num_format);
    }

-    [[nodiscard]] TilingMode GetTilingMode() const {
-        return static_cast<TilingMode>(tiling_index.Value());
+    TilingMode GetTilingMode() const {
+        return static_cast<TilingMode>(tiling_index);
    }

-    [[nodiscard]] bool IsTiled() const {
+    bool IsTiled() const {
        return GetTilingMode() != TilingMode::Display_Linear;
    }

-    [[nodiscard]] size_t GetSizeAligned() const {
+    size_t GetSizeAligned() const {
        // TODO: Derive this properly from tiling params
-        return (width + 1) * (height + 1) * NumComponents(GetDataFmt());
+        return Pitch() * (height + 1) * NumComponents(GetDataFmt());
    }
 };
+static_assert(sizeof(Image) == 32); // 256bits

 // 8.2.7. Image Sampler [RDNA 2 Instruction Set Architecture]
 enum class ClampMode : u64 {
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@ -53,7 +53,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul
        bindings.push_back({
            .binding = input.binding,
            .stride = buffer.GetStride(),
-            .inputRate = vk::VertexInputRate::eVertex,
+            .inputRate = input.instance_step_rate == Shader::Info::VsInput::None
+                             ? vk::VertexInputRate::eVertex
+                             : vk::VertexInputRate::eInstance,
        });
    }

@ -402,8 +404,11 @@ void GraphicsPipeline::BindVertexBuffers(StreamBuffer& staging) const {
    // Calculate buffers memory overlaps
    boost::container::static_vector<BufferRange, MaxVertexBufferCount> ranges{};
    for (const auto& input : vs_info.vs_inputs) {
-        const auto& buffer = guest_buffers.emplace_back(
-            vs_info.ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset));
+        const auto& buffer = vs_info.ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
+        if (buffer.GetSize() == 0) {
+            continue;
+        }
+        guest_buffers.emplace_back(buffer);
        ranges.emplace_back(buffer.base_address.Value(),
                            buffer.base_address.Value() + buffer.GetSize());
    }
--- a/src/video_core/texture_cache/image.cpp
+++ b/src/video_core/texture_cache/image.cpp
@ -116,11 +116,13 @@ static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept {
 ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group) noexcept {
    const auto& attrib = group.attrib;
    is_tiled = attrib.tiling_mode == TilingMode::Tile;
+    tiling_mode =
+        is_tiled ? AmdGpu::TilingMode::Display_MacroTiled : AmdGpu::TilingMode::Display_Linear;
    pixel_format = ConvertPixelFormat(attrib.pixel_format);
    type = vk::ImageType::e2D;
    size.width = attrib.width;
    size.height = attrib.height;
-    pitch = attrib.tiling_mode == TilingMode::Linear ? size.width : (size.width + 127) >> 7;
+    pitch = attrib.tiling_mode == TilingMode::Linear ? size.width : (size.width + 127) & (~127);
    const bool is_32bpp = attrib.pixel_format != VideoOutFormat::A16R16G16B16Float;
    ASSERT(is_32bpp);
    if (!is_tiled) {
@ -128,11 +130,11 @@ ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group) noe
        return;
    }
    if (Config::isNeoMode()) {
-        guest_size_bytes = pitch * 128 * ((size.height + 127) & (~127)) * 4;
+        guest_size_bytes = pitch * ((size.height + 127) & (~127)) * 4;
    } else {
-        guest_size_bytes = pitch * 128 * ((size.height + 63) & (~63)) * 4;
+        guest_size_bytes = pitch * ((size.height + 63) & (~63)) * 4;
    }
-    is_vo_surface = true;
+    usage.vo_buffer = true;
 }

 ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer,
@ -140,12 +142,14 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer,
    is_tiled = buffer.IsTiled();
    tiling_mode = buffer.GetTilingMode();
    pixel_format = LiverpoolToVK::SurfaceFormat(buffer.info.format, buffer.NumFormat());
+    num_samples = 1 << buffer.attrib.num_fragments_log2;
    type = vk::ImageType::e2D;
    size.width = hint.Valid() ? hint.width : buffer.Pitch();
    size.height = hint.Valid() ? hint.height : buffer.Height();
    size.depth = 1;
    pitch = size.width;
    guest_size_bytes = buffer.GetSizeAligned();
+    usage.render_target = true;
 }

 ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer,
@ -153,18 +157,20 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer,
    is_tiled = false;
    pixel_format = LiverpoolToVK::DepthFormat(buffer.z_info.format, buffer.stencil_info.format);
    type = vk::ImageType::e2D;
+    num_samples = 1 << buffer.z_info.num_samples; // spec doesn't say it is a log2
    size.width = hint.Valid() ? hint.width : buffer.Pitch();
    size.height = hint.Valid() ? hint.height : buffer.Height();
    size.depth = 1;
    pitch = size.width;
    guest_size_bytes = buffer.GetSizeAligned();
+    usage.depth_target = true;
 }

 ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept {
    is_tiled = image.IsTiled();
    tiling_mode = image.GetTilingMode();
    pixel_format = LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt());
-    type = ConvertImageType(image.type);
+    type = ConvertImageType(image.GetType());
    size.width = image.width + 1;
    size.height = image.height + 1;
    size.depth = 1;
@ -222,7 +228,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
        }
    }

-    info.usage = ImageUsageFlags(info);
+    usage = ImageUsageFlags(info);

    if (info.pixel_format == vk::Format::eD32Sfloat) {
        aspect_mask = vk::ImageAspectFlagBits::eDepth;
@ -243,7 +249,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
        .mipLevels = static_cast<u32>(info.resources.levels),
        .arrayLayers = static_cast<u32>(info.resources.layers),
        .tiling = vk::ImageTiling::eOptimal,
-        .usage = info.usage,
+        .usage = usage,
        .initialLayout = vk::ImageLayout::eUndefined,
    };

@ -296,6 +302,31 @@ void Image::Transit(vk::ImageLayout dst_layout, vk::Flags<vk::AccessFlagBits> ds
    pl_stage = dst_pl_stage;
 }

+void Image::Upload(vk::Buffer buffer, u64 offset) { // Records a copy of mip 0 / layer 0 from `buffer` at `offset` into this image
+    Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); // layout required by the copy below
+
+    // Copy to the image; the buffer row length comes from info.pitch, not from the image width.
+    const vk::BufferImageCopy image_copy = {
+        .bufferOffset = offset,
+        .bufferRowLength = info.pitch,
+        .bufferImageHeight = info.size.height,
+        .imageSubresource{
+            .aspectMask = vk::ImageAspectFlagBits::eColor, // NOTE(review): colour aspect only - confirm depth images never reach this path
+            .mipLevel = 0,
+            .baseArrayLayer = 0,
+            .layerCount = 1,
+        },
+        .imageOffset = {0, 0, 0},
+        .imageExtent = {info.size.width, info.size.height, 1},
+    };
+
+    const auto cmdbuf = scheduler->CommandBuffer();
+    cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, image_copy);
+
+    Transit(vk::ImageLayout::eGeneral, // image is left in eGeneral for later shader and transfer reads
+            vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead);
+}
+
 Image::~Image() = default;

 } // namespace VideoCore
--- a/src/video_core/texture_cache/image.h
+++ b/src/video_core/texture_cache/image.h
@ -42,18 +42,28 @@ struct ImageInfo {
                       const AmdGpu::Liverpool::CbDbExtent& hint = {}) noexcept;
    explicit ImageInfo(const AmdGpu::Image& image) noexcept;

+    bool IsTiled() const {
+        return tiling_mode != AmdGpu::TilingMode::Display_Linear;
+    }
    bool IsBlockCoded() const;
    bool IsPacked() const;
    bool IsDepthStencil() const;

+    struct { // One bit per way the image is used
+        u32 texture : 1;
+        u32 storage : 1;
+        u32 render_target : 1; // set by the ColorBuffer constructor
+        u32 depth_target : 1; // set by the DepthBuffer constructor
+        u32 vo_buffer : 1; // set by the video-out constructor for tiled buffers; read by TextureCache::RenderTarget
+    } usage{}; // Usage data tracked during image lifetime. Zero-initialised: each constructor sets at most one bit, the rest must read as 0
+
    bool is_tiled = false;
    bool is_storage = false;
-    bool is_vo_surface = false;
    vk::Format pixel_format = vk::Format::eUndefined;
    vk::ImageType type = vk::ImageType::e1D;
-    vk::ImageUsageFlags usage;
    SubresourceExtent resources;
    Extent3D size{1, 1, 1};
+    u32 num_samples = 1;
    u32 pitch = 0;
    u32 guest_size_bytes = 0;
    AmdGpu::TilingMode tiling_mode{AmdGpu::TilingMode::Display_Linear};
@ -117,6 +127,7 @@ struct Image {
    }

    void Transit(vk::ImageLayout dst_layout, vk::Flags<vk::AccessFlagBits> dst_mask);
+    void Upload(vk::Buffer buffer, u64 offset);

    const Vulkan::Instance* instance;
    Vulkan::Scheduler* scheduler;
@ -131,6 +142,7 @@ struct Image {
    std::optional<ImageView> view_for_detiler;

    // Resource state tracking
+    vk::ImageUsageFlags usage;
    vk::Flags<vk::PipelineStageFlagBits> pl_stage = vk::PipelineStageFlagBits::eAllCommands;
    vk::Flags<vk::AccessFlagBits> access_mask = vk::AccessFlagBits::eNone;
    vk::ImageLayout layout = vk::ImageLayout::eUndefined;
--- a/src/video_core/texture_cache/image_view.cpp
+++ b/src/video_core/texture_cache/image_view.cpp
@ -48,7 +48,7 @@ vk::ComponentSwizzle ConvertComponentSwizzle(u32 dst_sel) {

 ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage) noexcept
    : is_storage{is_storage} {
-    type = ConvertImageViewType(image.type);
+    type = ConvertImageViewType(image.GetType());
    format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt());
    range.base.level = 0;
    range.base.layer = 0;
--- a/src/video_core/texture_cache/texture_cache.cpp
+++ b/src/video_core/texture_cache/texture_cache.cpp
@ -151,7 +151,7 @@ ImageView& TextureCache::RegisterImageView(Image& image, const ImageViewInfo& vi
    // temporary remove its storage bit.
    std::optional<vk::ImageUsageFlags> usage_override;
    if (!image.info.is_storage) {
-        usage_override = image.info.usage & ~vk::ImageUsageFlagBits::eStorage;
+        usage_override = image.usage & ~vk::ImageUsageFlagBits::eStorage;
    }

    const ImageViewId view_id = slot_image_views.insert(instance, view_info, image, usage_override);
@ -183,7 +183,7 @@ ImageView& TextureCache::RenderTarget(const AmdGpu::Liverpool::ColorBuffer& buff
                  vk::AccessFlagBits::eColorAttachmentWrite |
                      vk::AccessFlagBits::eColorAttachmentRead);

-    ImageViewInfo view_info{buffer, image.info.is_vo_surface};
+    ImageViewInfo view_info{buffer, !!image.info.usage.vo_buffer};
    return RegisterImageView(image, view_info);
 }

@ -210,26 +210,8 @@ void TextureCache::RefreshImage(Image& image) {
        if (!tile_manager.TryDetile(image)) {
            // Upload data to the staging buffer.
            const auto offset = staging.Copy(image.cpu_addr, image.info.guest_size_bytes, 4);
-            image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite);
-
            // Copy to the image.
-            const vk::BufferImageCopy image_copy = {
-                .bufferOffset = offset,
-                .bufferRowLength = 0,
-                .bufferImageHeight = 0,
-                .imageSubresource{
-                    .aspectMask = vk::ImageAspectFlagBits::eColor,
-                    .mipLevel = 0,
-                    .baseArrayLayer = 0,
-                    .layerCount = 1,
-                },
-                .imageOffset = {0, 0, 0},
-                .imageExtent = {image.info.size.width, image.info.size.height, 1},
-            };
-
-            const auto cmdbuf = scheduler.CommandBuffer();
-            cmdbuf.copyBufferToImage(staging.Handle(), image.image,
-                                     vk::ImageLayout::eTransferDstOptimal, image_copy);
+            image.Upload(staging.Handle(), offset);
        }

        image.Transit(vk::ImageLayout::eGeneral,
--- a/src/video_core/texture_cache/tile_manager.cpp
+++ b/src/video_core/texture_cache/tile_manager.cpp
@ -15,7 +15,6 @@

 #include <boost/container/static_vector.hpp>
 #include <magic_enum.hpp>
-#include <vulkan/vulkan_to_string.hpp>

 namespace VideoCore {