diff --git a/src/core/libraries/network/net.cpp b/src/core/libraries/network/net.cpp
index 958f9264..2c03dde3 100644
--- a/src/core/libraries/network/net.cpp
+++ b/src/core/libraries/network/net.cpp
@@ -10,7 +10,7 @@
 #include <arpa/inet.h>
 #endif
 
-#include <common/assert.h>
+#include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/libraries/error_codes.h"
 #include "core/libraries/libs.h"
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index bbf259fe..f933ed3c 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -323,7 +323,7 @@ static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offs
 
 static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) {
     auto& buffer = ctx.buffers[handle];
-    const auto format = buffer.buffer.GetDataFmt();
+    const auto format = buffer.dfmt;
     switch (format) {
     case AmdGpu::DataFormat::FormatInvalid:
         return ctx.f32_zero_value;
@@ -348,7 +348,7 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com
 
         // uint index = address / 4;
         Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
-        const u32 stride = buffer.buffer.GetStride();
+        const u32 stride = buffer.stride;
         if (stride > 4) {
             const u32 index_offset = u32(AmdGpu::ComponentOffset(format, comp) / 32);
             if (index_offset > 0) {
@@ -360,7 +360,7 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com
 
         const u32 bit_offset = AmdGpu::ComponentOffset(format, comp) % 32;
         const u32 bit_width = AmdGpu::ComponentBits(format, comp);
-        const auto num_format = buffer.buffer.GetNumberFmt();
+        const auto num_format = buffer.nfmt;
         if (num_format == AmdGpu::NumberFormat::Float) {
             if (bit_width == 32) {
                 return ctx.OpLoad(ctx.F32[1], ptr);
@@ -486,8 +486,8 @@ static Id ConvertF32ToFormat(EmitContext& ctx, Id value, AmdGpu::NumberFormat fo
 template <u32 N>
 static void EmitStoreBufferFormatF32xN(EmitContext& ctx, u32 handle, Id address, Id value) {
     auto& buffer = ctx.buffers[handle];
-    const auto format = buffer.buffer.GetDataFmt();
-    const auto num_format = buffer.buffer.GetNumberFmt();
+    const auto format = buffer.dfmt;
+    const auto num_format = buffer.nfmt;
 
     switch (format) {
     case AmdGpu::DataFormat::FormatInvalid:
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 4b732ecd..8b99482f 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -363,7 +363,9 @@ void EmitContext::DefineBuffers() {
             .binding = binding++,
             .data_types = data_types,
             .pointer_type = pointer_type,
-            .buffer = buffer.GetVsharp(info),
+            .dfmt = buffer.dfmt,
+            .nfmt = buffer.nfmt,
+            .stride = buffer.GetVsharp(info).GetStride(),
         });
         interfaces.push_back(id);
         i++;
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 81237a9a..768b591f 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -207,7 +207,9 @@ public:
         u32 binding;
         const VectorIds* data_types;
         Id pointer_type;
-        AmdGpu::Buffer buffer;
+        AmdGpu::DataFormat dfmt;
+        AmdGpu::NumberFormat nfmt;
+        u32 stride;
     };
 
     u32& binding;
diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp
index 2925c05d..4f3ab86e 100644
--- a/src/shader_recompiler/frontend/control_flow_graph.cpp
+++ b/src/shader_recompiler/frontend/control_flow_graph.cpp
@@ -35,15 +35,22 @@ static IR::Condition MakeCondition(Opcode opcode) {
         return IR::Condition::Execz;
     case Opcode::S_CBRANCH_EXECNZ:
         return IR::Condition::Execnz;
+    case Opcode::S_AND_SAVEEXEC_B64:
+    case Opcode::S_ANDN2_B64:
+        return IR::Condition::Execnz;
     default:
         return IR::Condition::True;
     }
 }
 
+static constexpr size_t LabelReserveSize = 32;
+
 CFG::CFG(Common::ObjectPool<Block>& block_pool_, std::span<const GcnInst> inst_list_)
     : block_pool{block_pool_}, inst_list{inst_list_} {
     index_to_pc.resize(inst_list.size() + 1);
+    labels.reserve(LabelReserveSize);
     EmitLabels();
+    EmitDivergenceLabels();
     EmitBlocks();
     LinkBlocks();
 }
@@ -51,14 +58,7 @@ CFG::CFG(Common::ObjectPool<Block>& block_pool_, std::span<const GcnInst> inst_l
 void CFG::EmitLabels() {
     // Always set a label at entry point.
     u32 pc = 0;
-    labels.push_back(pc);
-
-    const auto add_label = [this](u32 address) {
-        const auto it = std::ranges::find(labels, address);
-        if (it == labels.end()) {
-            labels.push_back(address);
-        }
-    };
+    AddLabel(pc);
 
     // Iterate instruction list and add labels to branch targets.
     for (u32 i = 0; i < inst_list.size(); i++) {
@@ -66,15 +66,15 @@ void CFG::EmitLabels() {
         const GcnInst inst = inst_list[i];
         if (inst.IsUnconditionalBranch()) {
             const u32 target = inst.BranchTarget(pc);
-            add_label(target);
+            AddLabel(target);
         } else if (inst.IsConditionalBranch()) {
             const u32 true_label = inst.BranchTarget(pc);
             const u32 false_label = pc + inst.length;
-            add_label(true_label);
-            add_label(false_label);
+            AddLabel(true_label);
+            AddLabel(false_label);
         } else if (inst.opcode == Opcode::S_ENDPGM) {
             const u32 next_label = pc + inst.length;
-            add_label(next_label);
+            AddLabel(next_label);
         }
         pc += inst.length;
     }
@@ -84,16 +84,70 @@ void CFG::EmitLabels() {
     std::ranges::sort(labels);
 }
 
-void CFG::EmitBlocks() {
-    const auto get_index = [this](Label label) -> size_t {
-        if (label == 0) {
-            return 0ULL;
-        }
-        const auto it_index = std::ranges::lower_bound(index_to_pc, label);
-        ASSERT(it_index != index_to_pc.end() || label > index_to_pc.back());
-        return std::distance(index_to_pc.begin(), it_index);
+void CFG::EmitDivergenceLabels() {
+    const auto is_open_scope = [](const GcnInst& inst) {
+        // An open scope instruction is an instruction that modifies EXEC
+        // but also saves the previous value to restore later. This indicates
+        // we are entering a scope.
+        return inst.opcode == Opcode::S_AND_SAVEEXEC_B64 ||
+               // While this instruction does not save EXEC it is often used paired
+               // with SAVEEXEC to mask the threads that didn't pass the condition
+               // of initial branch.
+               inst.opcode == Opcode::S_ANDN2_B64;
+    };
+    const auto is_close_scope = [](const GcnInst& inst) {
+        // Closing an EXEC scope can be either a branch instruction
+        // (typical case when S_AND_SAVEEXEC_B64 is right before a branch)
+        // or by a move instruction to EXEC that restores the backup.
+        return (inst.opcode == Opcode::S_MOV_B64 && inst.dst[0].field == OperandField::ExecLo) ||
+               // Sometimes compiler might insert instructions between the SAVEEXEC and the branch.
+               // Those instructions need to be wrapped in the condition as well so allow branch
+               // as end scope instruction.
+               inst.opcode == Opcode::S_CBRANCH_EXECZ || inst.opcode == Opcode::S_ANDN2_B64;
     };
 
+    // Since we will be adding new labels, avoid iterating those as well.
+    const size_t end_size = labels.size();
+    for (u32 l = 0; l < end_size; l++) {
+        const Label start = labels[l];
+        // Stop if we reached end of existing labels.
+        if (l == end_size - 1) {
+            break;
+        }
+        const Label end = labels[l + 1];
+        const size_t end_index = GetIndex(end);
+
+        s32 curr_begin = -1;
+        for (size_t index = GetIndex(start); index < end_index; index++) {
+            const auto& inst = inst_list[index];
+            if (is_close_scope(inst) && curr_begin != -1) {
+                // If there are no instructions inside scope don't do anything.
+                if (index - curr_begin == 1) {
+                    curr_begin = -1;
+                    continue;
+                }
+                // Add a label to the instruction right after the open scope call.
+                // It is the start of a new basic block.
+                const auto& save_inst = inst_list[curr_begin];
+                const Label label = index_to_pc[curr_begin] + save_inst.length;
+                AddLabel(label);
+                // Add a label to the close scope instruction as well.
+                AddLabel(index_to_pc[index]);
+                // Reset scope begin.
+                curr_begin = -1;
+            }
+            // Mark a potential start of an exec scope.
+            if (is_open_scope(inst)) {
+                curr_begin = index;
+            }
+        }
+    }
+
+    // Sort labels to make sure block insertion is correct.
+    std::ranges::sort(labels);
+}
+
+void CFG::EmitBlocks() {
     for (auto it = labels.begin(); it != labels.end(); it++) {
         const Label start = *it;
         const auto next_it = std::next(it);
@@ -102,8 +156,10 @@ void CFG::EmitBlocks() {
             // Last label is special.
             return;
         }
+        // The end label is the start instruction of next block.
+        // The end instruction of this block is the previous one.
         const Label end = *next_it;
-        const size_t end_index = get_index(end) - 1;
+        const size_t end_index = GetIndex(end) - 1;
         const auto& end_inst = inst_list[end_index];
 
         // Insert block between the labels using the last instruction
@@ -111,7 +167,7 @@ void CFG::EmitBlocks() {
         Block* block = block_pool.Create();
         block->begin = start;
         block->end = end;
-        block->begin_index = get_index(start);
+        block->begin_index = GetIndex(start);
         block->end_index = end_index;
         block->end_inst = end_inst;
         block->cond = MakeCondition(end_inst.opcode);
@@ -126,8 +182,26 @@ void CFG::LinkBlocks() {
         return &*it;
     };
 
-    for (auto& block : blocks) {
+    for (auto it = blocks.begin(); it != blocks.end(); it++) {
+        auto& block = *it;
         const auto end_inst{block.end_inst};
+        // Handle divergence block inserted here.
+        if (end_inst.opcode == Opcode::S_AND_SAVEEXEC_B64 ||
+            end_inst.opcode == Opcode::S_ANDN2_B64) {
+            // Blocks are stored ordered by address in the set
+            auto next_it = std::next(it);
+            auto* target_block = &(*next_it);
+            ++target_block->num_predecessors;
+            block.branch_true = target_block;
+
+            auto merge_it = std::next(next_it);
+            auto* merge_block = &(*merge_it);
+            ++merge_block->num_predecessors;
+            block.branch_false = merge_block;
+            block.end_class = EndClass::Branch;
+            continue;
+        }
+
         // If the block doesn't end with a branch we simply
         // need to link with the next block.
         if (!end_inst.IsTerminateInstruction()) {
diff --git a/src/shader_recompiler/frontend/control_flow_graph.h b/src/shader_recompiler/frontend/control_flow_graph.h
index ebe614ee..d98d4b05 100644
--- a/src/shader_recompiler/frontend/control_flow_graph.h
+++ b/src/shader_recompiler/frontend/control_flow_graph.h
@@ -3,11 +3,13 @@
 
 #pragma once
 
+#include <algorithm>
 #include <span>
 #include <string>
 #include <boost/container/small_vector.hpp>
 #include <boost/intrusive/set.hpp>
 
+#include "common/assert.h"
 #include "common/object_pool.h"
 #include "common/types.h"
 #include "shader_recompiler/frontend/instruction.h"
@@ -55,9 +57,26 @@ public:
 
 private:
     void EmitLabels();
+    void EmitDivergenceLabels();
     void EmitBlocks();
     void LinkBlocks();
 
+    void AddLabel(Label address) {
+        const auto it = std::ranges::find(labels, address);
+        if (it == labels.end()) {
+            labels.push_back(address);
+        }
+    };
+
+    size_t GetIndex(Label label) {
+        if (label == 0) {
+            return 0ULL;
+        }
+        const auto it_index = std::ranges::lower_bound(index_to_pc, label);
+        ASSERT(it_index != index_to_pc.end() || label > index_to_pc.back());
+        return std::distance(index_to_pc.begin(), it_index);
+    };
+
 public:
     Common::ObjectPool<Block>& block_pool;
     std::span<const GcnInst> inst_list;
diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
index 795b148d..812d93ba 100644
--- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
@@ -29,6 +29,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) {
         return S_CMP(ConditionOp::LG, true, inst);
     case Opcode::S_CMP_GT_I32:
         return S_CMP(ConditionOp::GT, true, inst);
+    case Opcode::S_CMP_LE_I32:
+        return S_CMP(ConditionOp::LE, true, inst);
     case Opcode::S_CMP_GE_I32:
         return S_CMP(ConditionOp::GE, true, inst);
     case Opcode::S_CMP_EQ_I32:
diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index d48e4def..4070560a 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -64,9 +64,15 @@ void Translator::EmitPrologue() {
         ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 1));
         ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 2));
 
-        ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 0));
-        ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 1));
-        ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 2));
+        if (info.tgid_enable[0]) {
+            ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 0));
+        }
+        if (info.tgid_enable[1]) {
+            ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 1));
+        }
+        if (info.tgid_enable[2]) {
+            ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 2));
+        }
         break;
     default:
         throw NotImplementedException("Unknown shader stage");
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index 63f6c3b4..01a549f4 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -91,6 +91,11 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
     case Opcode::BUFFER_STORE_FORMAT_XYZW:
         return BUFFER_STORE_FORMAT(4, false, true, inst);
 
+    case Opcode::TBUFFER_STORE_FORMAT_X:
+        return BUFFER_STORE_FORMAT(1, true, true, inst);
+    case Opcode::TBUFFER_STORE_FORMAT_XYZ:
+        return BUFFER_STORE_FORMAT(3, true, true, inst);
+
     case Opcode::BUFFER_STORE_DWORD:
         return BUFFER_STORE_FORMAT(1, false, false, inst);
     case Opcode::BUFFER_STORE_DWORDX2:
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 674d7c5f..b1eb6aea 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -180,6 +180,7 @@ struct Info {
     SamplerResourceList samplers;
 
     std::array<u32, 3> workgroup_size{};
+    std::array<bool, 3> tgid_enable;
 
     u32 num_user_data;
     u32 num_input_vgprs;
diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h
index e888bdcc..92a24795 100644
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -130,6 +130,7 @@ struct Liverpool {
             BitField<0, 6, u64> num_vgprs;
             BitField<6, 4, u64> num_sgprs;
             BitField<33, 5, u64> num_user_regs;
+            BitField<39, 3, u64> tgid_enable;
             BitField<47, 9, u64> lds_dwords;
         } settings;
         INSERT_PADDING_WORDS(1);
@@ -148,6 +149,10 @@ struct Liverpool {
             return settings.lds_dwords.Value() * 128 * 4;
         }
 
+        bool IsTgidEnabled(u32 i) const noexcept {
+            return (settings.tgid_enable.Value() >> i) & 1;
+        }
+
         std::span<const u32> Code() const {
             const u32* code = Address<u32*>();
             BinaryInfo bininfo;
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 2246807a..02d6b2ce 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -13,7 +13,7 @@
 
 namespace VideoCore {
 
-static constexpr size_t StagingBufferSize = 256_MB;
+static constexpr size_t StagingBufferSize = 512_MB;
 static constexpr size_t UboStreamBufferSize = 64_MB;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 33ea3f86..2bcc4f0e 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include <array>
 #include <mutex>
 #include <boost/container/small_vector.hpp>
 #include <boost/icl/interval_map.hpp>
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 6974d850..c11705e7 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -93,6 +93,8 @@ Shader::Info MakeShaderInfo(Shader::Stage stage, std::span<const u32, 16> user_d
         info.num_user_data = cs_pgm.settings.num_user_regs;
         info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full,
                                cs_pgm.num_thread_z.full};
+        info.tgid_enable = {cs_pgm.IsTgidEnabled(0), cs_pgm.IsTgidEnabled(1),
+                            cs_pgm.IsTgidEnabled(2)};
         info.shared_memory_size = cs_pgm.SharedMemSize();
         break;
     }
@@ -324,6 +326,7 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline() {
         Shader::Info info =
             MakeShaderInfo(Shader::Stage::Compute, cs_pgm.user_data, liverpool->regs);
         info.pgm_base = cs_pgm.Address<uintptr_t>();
+        info.pgm_hash = compute_key;
         auto program =
             Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info), profile);