diff --git a/src/core/libraries/avplayer/avplayer.cpp b/src/core/libraries/avplayer/avplayer.cpp
index 406583a8..87a0e49e 100644
--- a/src/core/libraries/avplayer/avplayer.cpp
+++ b/src/core/libraries/avplayer/avplayer.cpp
@@ -120,6 +120,7 @@ bool PS4_SYSV_ABI sceAvPlayerGetVideoDataEx(SceAvPlayerHandle handle,
 }
 
 SceAvPlayerHandle PS4_SYSV_ABI sceAvPlayerInit(SceAvPlayerInitData* data) {
+    return nullptr;
     LOG_TRACE(Lib_AvPlayer, "called");
     if (data == nullptr) {
         return nullptr;
@@ -325,4 +326,4 @@ void RegisterlibSceAvPlayer(Core::Loader::SymbolsResolver* sym) {
     LIB_FUNCTION("yN7Jhuv8g24", "libSceAvPlayer", 1, "libSceAvPlayer", 1, 0, sceAvPlayerVprintf);
 };
 
-} // namespace Libraries::AvPlayer
\ No newline at end of file
+} // namespace Libraries::AvPlayer
diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp
index 605d0d29..455486b0 100644
--- a/src/core/libraries/kernel/thread_management.cpp
+++ b/src/core/libraries/kernel/thread_management.cpp
@@ -1066,7 +1066,16 @@ ScePthread PThreadPool::Create() {
         }
     }
 
+#ifdef _WIN64
     auto* ret = new PthreadInternal{};
+#else
+    // TODO: Linux specific hack
+    static u8* hint_address = reinterpret_cast<u8*>(0x7FFFFC000ULL);
+    auto* ret = reinterpret_cast<PthreadInternal*>(
+        mmap(hint_address, sizeof(PthreadInternal), PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0));
+    hint_address += Common::AlignUp(sizeof(PthreadInternal), 4_KB);
+#endif
    ret->is_free = false;
    ret->is_detached = false;
    ret->is_almost_done = false;
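Note on the Linux branch above: it bump-allocates every PthreadInternal at a fixed hint address with MAP_FIXED, which silently replaces whatever mapping already occupies that range, and the mmap result is never checked. A minimal sketch of the same trick written against MAP_FIXED_NOREPLACE (Linux 4.17+, needs _GNU_SOURCE); the function name and fallback policy here are mine, not part of the patch:

    // Sketch only: fixed low-address bump allocator in the spirit of the hack
    // above, but collision-safe thanks to MAP_FIXED_NOREPLACE.
    #include <sys/mman.h>
    #include <cstddef>
    #include <cstdint>

    void* AllocLowFixed(std::size_t size) {
        static std::uintptr_t hint = 0x7FFFFC000ULL;
        void* ret = mmap(reinterpret_cast<void*>(hint), size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
        if (ret == MAP_FAILED) {
            return nullptr; // range already occupied; caller must handle it
        }
        hint += (size + 0xFFFULL) & ~0xFFFULL; // advance by whole 4 KiB pages
        return ret;
    }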
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index ca49047c..59387faf 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -189,6 +189,9 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) {
         ctx.AddCapability(spv::Capability::StorageImageExtendedFormats);
         ctx.AddCapability(spv::Capability::StorageImageWriteWithoutFormat);
     }
+    if (info.has_texel_buffers) {
+        ctx.AddCapability(spv::Capability::SampledBuffer);
+    }
     switch (program.info.stage) {
     case Stage::Compute: {
         const std::array workgroup_size{program.info.workgroup_size};
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 03fc52ff..527a0551 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -262,171 +262,15 @@ Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
     return EmitLoadBufferF32xN<4>(ctx, handle, address);
 }
 
-static bool IsSignedInteger(AmdGpu::NumberFormat format) {
-    switch (format) {
-    case AmdGpu::NumberFormat::Unorm:
-    case AmdGpu::NumberFormat::Uscaled:
-    case AmdGpu::NumberFormat::Uint:
-        return false;
-    case AmdGpu::NumberFormat::Snorm:
-    case AmdGpu::NumberFormat::Sscaled:
-    case AmdGpu::NumberFormat::Sint:
-    case AmdGpu::NumberFormat::SnormNz:
-        return true;
-    case AmdGpu::NumberFormat::Float:
-    default:
-        UNREACHABLE();
-    }
-}
-
-static u32 UXBitsMax(u32 bit_width) {
-    return (1u << bit_width) - 1u;
-}
-
-static u32 SXBitsMax(u32 bit_width) {
-    return (1u << (bit_width - 1u)) - 1u;
-}
-
-static Id ConvertValue(EmitContext& ctx, Id value, AmdGpu::NumberFormat format, u32 bit_width) {
-    switch (format) {
-    case AmdGpu::NumberFormat::Unorm:
-        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(UXBitsMax(bit_width))));
-    case AmdGpu::NumberFormat::Snorm:
-        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width))));
-    case AmdGpu::NumberFormat::SnormNz:
-        // (x * 2 + 1) / (Format::SMAX * 2)
-        value = ctx.OpFMul(ctx.F32[1], value, ctx.ConstF32(2.f));
-        value = ctx.OpFAdd(ctx.F32[1], value, ctx.ConstF32(1.f));
-        return ctx.OpFDiv(ctx.F32[1], value, ctx.ConstF32(float(SXBitsMax(bit_width) * 2)));
-    case AmdGpu::NumberFormat::Uscaled:
-    case AmdGpu::NumberFormat::Sscaled:
-    case AmdGpu::NumberFormat::Uint:
-    case AmdGpu::NumberFormat::Sint:
-    case AmdGpu::NumberFormat::Float:
-        return value;
-    default:
-        UNREACHABLE_MSG("Unsupported number format for conversion: {}",
-                        magic_enum::enum_name(format));
-    }
-}
-
-static Id ComponentOffset(EmitContext& ctx, Id address, u32 stride, u32 bit_offset) {
-    Id comp_offset = ctx.ConstU32(bit_offset);
-    if (stride < 4) {
-        // comp_offset += (address % 4) * 8;
-        const Id byte_offset = ctx.OpUMod(ctx.U32[1], address, ctx.ConstU32(4u));
-        const Id bit_offset = ctx.OpShiftLeftLogical(ctx.U32[1], byte_offset, ctx.ConstU32(3u));
-        comp_offset = ctx.OpIAdd(ctx.U32[1], comp_offset, bit_offset);
-    }
-    return comp_offset;
-}
-
-static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 comp) {
-    auto& buffer = ctx.buffers[handle];
-    const auto format = buffer.dfmt;
-    switch (format) {
-    case AmdGpu::DataFormat::FormatInvalid:
-        return ctx.f32_zero_value;
-    case AmdGpu::DataFormat::Format8:
-    case AmdGpu::DataFormat::Format16:
-    case AmdGpu::DataFormat::Format32:
-    case AmdGpu::DataFormat::Format8_8:
-    case AmdGpu::DataFormat::Format16_16:
-    case AmdGpu::DataFormat::Format10_11_11:
-    case AmdGpu::DataFormat::Format11_11_10:
-    case AmdGpu::DataFormat::Format10_10_10_2:
-    case AmdGpu::DataFormat::Format2_10_10_10:
-    case AmdGpu::DataFormat::Format8_8_8_8:
-    case AmdGpu::DataFormat::Format32_32:
-    case AmdGpu::DataFormat::Format16_16_16_16:
-    case AmdGpu::DataFormat::Format32_32_32:
-    case AmdGpu::DataFormat::Format32_32_32_32: {
-        const u32 num_components = AmdGpu::NumComponents(format);
-        if (comp >= num_components) {
-            return ctx.f32_zero_value;
-        }
-
-        // uint index = address / 4;
-        Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
-        const u32 stride = buffer.stride;
-        if (stride > 4) {
-            const u32 index_offset = u32(AmdGpu::ComponentOffset(format, comp) / 32);
-            if (index_offset > 0) {
-                // index += index_offset;
-                index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(index_offset));
-            }
-        }
-        const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index);
-
-        const u32 bit_offset = AmdGpu::ComponentOffset(format, comp) % 32;
-        const u32 bit_width = AmdGpu::ComponentBits(format, comp);
-        const auto num_format = buffer.nfmt;
-        if (num_format == AmdGpu::NumberFormat::Float) {
-            if (bit_width == 32) {
-                return ctx.OpLoad(ctx.F32[1], ptr);
-            } else if (bit_width == 16) {
-                const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset);
-                Id value = ctx.OpLoad(ctx.U32[1], ptr);
-                value =
-                    ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset, ctx.ConstU32(bit_width));
-                value = ctx.OpSConvert(ctx.U16, value);
-                value = ctx.OpBitcast(ctx.F16[1], value);
-                return ctx.OpFConvert(ctx.F32[1], value);
-            } else {
-                UNREACHABLE_MSG("Invalid float bit width {}", bit_width);
-            }
-        } else {
-            Id value = ctx.OpLoad(ctx.U32[1], ptr);
-            const bool is_signed = IsSignedInteger(num_format);
-            if (bit_width < 32) {
-                const Id comp_offset = ComponentOffset(ctx, address, stride, bit_offset);
-                if (is_signed) {
-                    value = ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset,
-                                                   ctx.ConstU32(bit_width));
-                } else {
-                    value = ctx.OpBitFieldUExtract(ctx.U32[1], value, comp_offset,
-                                                   ctx.ConstU32(bit_width));
-                }
-            }
-            value = ctx.OpBitcast(ctx.F32[1], value);
-            return ConvertValue(ctx, value, num_format, bit_width);
-        }
-        break;
-    }
-    default:
-        UNREACHABLE_MSG("Invalid format for conversion: {}", magic_enum::enum_name(format));
-    }
-}
-
-template <u32 N>
-static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    auto& buffer = ctx.buffers[handle];
-    address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
-    if constexpr (N == 1) {
-        return GetBufferFormatValue(ctx, handle, address, 0);
-    } else {
-        boost::container::static_vector<Id, N> ids;
-        for (u32 i = 0; i < N; i++) {
-            ids.push_back(GetBufferFormatValue(ctx, handle, address, i));
-        }
-        return ctx.OpCompositeConstruct(ctx.F32[N], ids);
-    }
-}
-
 Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferFormatF32xN<1>(ctx, inst, handle, address);
-}
-
-Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferFormatF32xN<2>(ctx, inst, handle, address);
-}
-
-Id EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferFormatF32xN<3>(ctx, inst, handle, address);
-}
-
-Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferFormatF32xN<4>(ctx, inst, handle, address);
+    const auto& buffer = ctx.texture_buffers[handle];
+    const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
+    const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset);
+    Id texel = ctx.OpImageFetch(buffer.result_type, tex_buffer, coord);
+    if (buffer.is_integer) {
+        texel = ctx.OpBitcast(ctx.F32[4], texel);
+    }
+    return texel;
 }
 
 template <u32 N>
@@ -541,23 +385,14 @@ static void EmitStoreBufferFormatF32xN(EmitContext& ctx, u32 handle, Id address,
     }
 }
 
 void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferFormatF32xN<1>(ctx, handle, address, value);
-}
-
-void EmitStoreBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
-                                Id value) {
-    EmitStoreBufferFormatF32xN<2>(ctx, handle, address, value);
-}
-
-void EmitStoreBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
-                                Id value) {
-    EmitStoreBufferFormatF32xN<3>(ctx, handle, address, value);
-}
-
-void EmitStoreBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
-                                Id value) {
-    EmitStoreBufferFormatF32xN<4>(ctx, handle, address, value);
+    const auto& buffer = ctx.texture_buffers[handle];
+    const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
+    const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset);
+    if (buffer.is_integer) {
+        value = ctx.OpBitcast(ctx.U32[4], value);
+    }
+    ctx.OpImageWrite(tex_buffer, coord, value);
 }
 
 } // namespace Shader::Backend::SPIRV
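The deleted helpers above did GCN number-format conversion in generated SPIR-V; after this change the texel-buffer view's VkFormat makes the texture unit perform it during OpImageFetch/OpImageWrite. For reference, a plain C++ rendering of the two normalization rules the removed ConvertValue implemented (illustrative only, derived directly from the deleted code):

    #include <cstdint>

    // Unorm: x / (2^bits - 1), e.g. an 8-bit 255 becomes 1.0f (UXBitsMax above).
    float UnormToFloat(std::uint32_t bits, std::uint32_t bit_width) {
        return float(bits) / float((1u << bit_width) - 1u);
    }

    // SnormNz ("no zero"): (x * 2 + 1) / (2 * SMAX), the formula quoted in the
    // removed AmdGpu::NumberFormat::SnormNz case.
    float SnormNzToFloat(std::int32_t bits, std::uint32_t bit_width) {
        const float smax = float((1u << (bit_width - 1u)) - 1u);
        return (float(bits) * 2.0f + 1.0f) / (smax * 2.0f);
    }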
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 0eea88e4..b45a5aa3 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -48,6 +48,7 @@ EmitContext::EmitContext(const Profile& profile_, const Shader::Info& info_, u32
     DefineArithmeticTypes();
     DefineInterfaces();
     DefineBuffers();
+    DefineTextureBuffers();
     DefineImagesAndSamplers();
     DefineSharedMemory();
 }
@@ -123,21 +124,19 @@ void EmitContext::DefineInterfaces() {
     DefineOutputs();
 }
 
-Id GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) {
+const VectorIds& GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) {
     switch (fmt) {
     case AmdGpu::NumberFormat::Float:
     case AmdGpu::NumberFormat::Unorm:
     case AmdGpu::NumberFormat::Snorm:
     case AmdGpu::NumberFormat::SnormNz:
-        return ctx.F32[4];
-    case AmdGpu::NumberFormat::Sint:
-        return ctx.S32[4];
-    case AmdGpu::NumberFormat::Uint:
-        return ctx.U32[4];
     case AmdGpu::NumberFormat::Sscaled:
-        return ctx.F32[4];
     case AmdGpu::NumberFormat::Uscaled:
-        return ctx.F32[4];
+        return ctx.F32;
+    case AmdGpu::NumberFormat::Sint:
+        return ctx.S32;
+    case AmdGpu::NumberFormat::Uint:
+        return ctx.U32;
     default:
         break;
     }
@@ -177,6 +176,16 @@ void EmitContext::DefineBufferOffsets() {
         buffer.offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U));
         buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, ConstU32(2U));
     }
+    for (auto& tex_buffer : texture_buffers) {
+        const u32 binding = tex_buffer.binding;
+        const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4);
+        const u32 comp = (binding & 0xf) >> 2;
+        const u32 offset = (binding & 0x3) << 3;
+        const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]),
+                                   push_data_block, ConstU32(half), ConstU32(comp))};
+        const Id value{OpLoad(U32[1], ptr)};
+        tex_buffer.coord_offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U));
+    }
 }
 
 Id MakeDefaultValue(EmitContext& ctx, u32 default_value) {
@@ -202,7 +211,7 @@ void EmitContext::DefineInputs() {
     instance_id = DefineVariable(U32[1], spv::BuiltIn::InstanceIndex, spv::StorageClass::Input);
 
     for (const auto& input : info.vs_inputs) {
-        const Id type{GetAttributeType(*this, input.fmt)};
+        const Id type{GetAttributeType(*this, input.fmt)[4]};
         if (input.instance_step_rate == Info::VsInput::InstanceIdType::OverStepRate0 ||
             input.instance_step_rate == Info::VsInput::InstanceIdType::OverStepRate1) {
@@ -328,27 +337,30 @@ void EmitContext::DefinePushDataBlock() {
 
 void EmitContext::DefineBuffers() {
     boost::container::small_vector<Id, 8> type_ids;
-    for (u32 i = 0; const auto& buffer : info.buffers) {
-        const auto sharp = buffer.GetVsharp(info);
-        const bool is_storage = buffer.IsStorage(sharp);
-        const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32;
+    const auto define_struct = [&](Id record_array_type, bool is_instance_data) {
+        const Id struct_type{TypeStruct(record_array_type)};
+        if (std::ranges::find(type_ids, record_array_type.value, &Id::value) != type_ids.end()) {
+            return struct_type;
+        }
+        Decorate(record_array_type, spv::Decoration::ArrayStride, 4);
+        const auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage)
+                                           : fmt::format("{}_cbuf_block_f32", stage);
+        Name(struct_type, name);
+        Decorate(struct_type, spv::Decoration::Block);
+        MemberName(struct_type, 0, "data");
+        MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
+        type_ids.push_back(record_array_type);
+        return struct_type;
+    };
+
+    for (const auto& desc : info.buffers) {
+        const auto sharp = desc.GetVsharp(info);
+        const bool is_storage = desc.IsStorage(sharp);
+        const auto* data_types = True(desc.used_types & IR::Type::F32) ? &F32 : &U32;
         const Id data_type = (*data_types)[1];
         const Id record_array_type{is_storage ? TypeRuntimeArray(data_type)
-                                              : TypeArray(data_type, ConstU32(buffer.length))};
-        const Id struct_type{TypeStruct(record_array_type)};
-        if (std::ranges::find(type_ids, record_array_type.value, &Id::value) == type_ids.end()) {
-            Decorate(record_array_type, spv::Decoration::ArrayStride, 4);
-            const auto name =
-                buffer.is_instance_data
-                    ? fmt::format("{}_instance_data{}_{}{}", stage, i, 'f',
-                                  sizeof(float) * CHAR_BIT)
-                    : fmt::format("{}_cbuf_block_{}{}", stage, 'f', sizeof(float) * CHAR_BIT);
-            Name(struct_type, name);
-            Decorate(struct_type, spv::Decoration::Block);
-            MemberName(struct_type, 0, "data");
-            MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
-            type_ids.push_back(record_array_type);
-        }
+                                              : TypeArray(data_type, ConstU32(desc.length))};
+        const Id struct_type{define_struct(record_array_type, desc.is_instance_data)};
 
         const auto storage_class =
             is_storage ? spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform;
@@ -357,19 +369,39 @@ void EmitContext::DefineBuffers() {
         const Id id{AddGlobalVariable(struct_pointer_type, storage_class)};
         Decorate(id, spv::Decoration::Binding, binding);
         Decorate(id, spv::Decoration::DescriptorSet, 0U);
-        Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", buffer.sgpr_base));
+        Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sgpr_base));
 
         buffers.push_back({
             .id = id,
             .binding = binding++,
             .data_types = data_types,
             .pointer_type = pointer_type,
-            .dfmt = buffer.dfmt,
-            .nfmt = buffer.nfmt,
-            .stride = buffer.GetVsharp(info).GetStride(),
         });
         interfaces.push_back(id);
-        i++;
+    }
+}
+
+void EmitContext::DefineTextureBuffers() {
+    for (const auto& desc : info.texture_buffers) {
+        const bool is_integer =
+            desc.nfmt == AmdGpu::NumberFormat::Uint || desc.nfmt == AmdGpu::NumberFormat::Sint;
+        const VectorIds& sampled_type{GetAttributeType(*this, desc.nfmt)};
+        const u32 sampled = desc.is_written ? 2 : 1;
+        const Id image_type{TypeImage(sampled_type[1], spv::Dim::Buffer, false, false, false,
+                                      sampled, spv::ImageFormat::Unknown)};
+        const Id pointer_type{TypePointer(spv::StorageClass::UniformConstant, image_type)};
+        const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)};
+        Decorate(id, spv::Decoration::Binding, binding);
+        Decorate(id, spv::Decoration::DescriptorSet, 0U);
+        Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sgpr_base));
+        texture_buffers.push_back({
+            .id = id,
+            .binding = binding++,
+            .image_type = image_type,
+            .result_type = sampled_type[4],
+            .is_integer = is_integer,
+        });
+        interfaces.push_back(id);
+    }
 }
"imgbuf" : "texbuf", desc.sgpr_base)); + texture_buffers.push_back({ + .id = id, + .binding = binding++, + .image_type = image_type, + .result_type = sampled_type[4], + .is_integer = is_integer, + }); + interfaces.push_back(id); } } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index ed39ee57..5391108f 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -207,13 +207,19 @@ public: u32 binding; const VectorIds* data_types; Id pointer_type; - AmdGpu::DataFormat dfmt; - AmdGpu::NumberFormat nfmt; - u32 stride; + }; + struct TextureBufferDefinition { + Id id; + Id coord_offset; + u32 binding; + Id image_type; + Id result_type; + bool is_integer; }; u32& binding; boost::container::small_vector buffers; + boost::container::small_vector texture_buffers; boost::container::small_vector images; boost::container::small_vector samplers; @@ -238,6 +244,7 @@ private: void DefineOutputs(); void DefinePushDataBlock(); void DefineBuffers(); + void DefineTextureBuffers(); void DefineImagesAndSamplers(); void DefineSharedMemory(); diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 9995e446..8d418421 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -191,8 +191,10 @@ public: void V_MBCNT_U32_B32(bool is_low, const GcnInst& inst); // Vector Memory - void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); - void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst); + void BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst); + void BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst); + void BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst); + void BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst); void BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst); // Vector interpolation diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index b88cfc46..73530dad 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -56,57 +56,57 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { // Buffer load operations case Opcode::TBUFFER_LOAD_FORMAT_X: - return BUFFER_LOAD_FORMAT(1, true, true, inst); + return BUFFER_LOAD(1, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XY: - return BUFFER_LOAD_FORMAT(2, true, true, inst); + return BUFFER_LOAD(2, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XYZ: - return BUFFER_LOAD_FORMAT(3, true, true, inst); + return BUFFER_LOAD(3, true, inst); case Opcode::TBUFFER_LOAD_FORMAT_XYZW: - return BUFFER_LOAD_FORMAT(4, true, true, inst); + return BUFFER_LOAD(4, true, inst); case Opcode::BUFFER_LOAD_FORMAT_X: - return BUFFER_LOAD_FORMAT(1, false, true, inst); + return BUFFER_LOAD_FORMAT(1, inst); case Opcode::BUFFER_LOAD_FORMAT_XY: - return BUFFER_LOAD_FORMAT(2, false, true, inst); + return BUFFER_LOAD_FORMAT(2, inst); case Opcode::BUFFER_LOAD_FORMAT_XYZ: - return BUFFER_LOAD_FORMAT(3, false, true, inst); + return BUFFER_LOAD_FORMAT(3, inst); case Opcode::BUFFER_LOAD_FORMAT_XYZW: - return BUFFER_LOAD_FORMAT(4, false, true, inst); + return BUFFER_LOAD_FORMAT(4, inst); case Opcode::BUFFER_LOAD_DWORD: - return 
BUFFER_LOAD_FORMAT(1, false, false, inst); + return BUFFER_LOAD(1, false, inst); case Opcode::BUFFER_LOAD_DWORDX2: - return BUFFER_LOAD_FORMAT(2, false, false, inst); + return BUFFER_LOAD(2, false, inst); case Opcode::BUFFER_LOAD_DWORDX3: - return BUFFER_LOAD_FORMAT(3, false, false, inst); + return BUFFER_LOAD(3, false, inst); case Opcode::BUFFER_LOAD_DWORDX4: - return BUFFER_LOAD_FORMAT(4, false, false, inst); + return BUFFER_LOAD(4, false, inst); // Buffer store operations case Opcode::BUFFER_STORE_FORMAT_X: - return BUFFER_STORE_FORMAT(1, false, true, inst); + return BUFFER_STORE_FORMAT(1, inst); case Opcode::BUFFER_STORE_FORMAT_XY: - return BUFFER_STORE_FORMAT(2, false, true, inst); + return BUFFER_STORE_FORMAT(2, inst); case Opcode::BUFFER_STORE_FORMAT_XYZ: - return BUFFER_STORE_FORMAT(3, false, true, inst); + return BUFFER_STORE_FORMAT(3, inst); case Opcode::BUFFER_STORE_FORMAT_XYZW: - return BUFFER_STORE_FORMAT(4, false, true, inst); + return BUFFER_STORE_FORMAT(4, inst); case Opcode::TBUFFER_STORE_FORMAT_X: - return BUFFER_STORE_FORMAT(1, true, true, inst); + return BUFFER_STORE(1, true, inst); case Opcode::TBUFFER_STORE_FORMAT_XY: - return BUFFER_STORE_FORMAT(2, true, true, inst); + return BUFFER_STORE(2, true, inst); case Opcode::TBUFFER_STORE_FORMAT_XYZ: - return BUFFER_STORE_FORMAT(3, true, true, inst); + return BUFFER_STORE(3, true, inst); case Opcode::BUFFER_STORE_DWORD: - return BUFFER_STORE_FORMAT(1, false, false, inst); + return BUFFER_STORE(1, false, inst); case Opcode::BUFFER_STORE_DWORDX2: - return BUFFER_STORE_FORMAT(2, false, false, inst); + return BUFFER_STORE(2, false, inst); case Opcode::BUFFER_STORE_DWORDX3: - return BUFFER_STORE_FORMAT(3, false, false, inst); + return BUFFER_STORE(3, false, inst); case Opcode::BUFFER_STORE_DWORDX4: - return BUFFER_STORE_FORMAT(4, false, false, inst); + return BUFFER_STORE(4, false, inst); // Buffer atomic operations case Opcode::BUFFER_ATOMIC_ADD: @@ -349,8 +349,7 @@ void Translator::IMAGE_STORE(const GcnInst& inst) { ir.ImageWrite(handle, body, value, {}); } -void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, - const GcnInst& inst) { +void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) { const auto& mtbuf = inst.control.mtbuf; const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; @@ -370,22 +369,19 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma info.index_enable.Assign(mtbuf.idxen); info.offset_enable.Assign(mtbuf.offen); info.inst_offset.Assign(mtbuf.offset); - info.is_typed.Assign(is_typed); if (is_typed) { - info.dmft.Assign(static_cast(mtbuf.dfmt)); - info.nfmt.Assign(static_cast(mtbuf.nfmt)); - ASSERT(info.nfmt == AmdGpu::NumberFormat::Float && - (info.dmft == AmdGpu::DataFormat::Format32_32_32_32 || - info.dmft == AmdGpu::DataFormat::Format32_32_32 || - info.dmft == AmdGpu::DataFormat::Format32_32 || - info.dmft == AmdGpu::DataFormat::Format32)); + const auto dmft = static_cast(mtbuf.dfmt); + const auto nfmt = static_cast(mtbuf.nfmt); + ASSERT(nfmt == AmdGpu::NumberFormat::Float && + (dmft == AmdGpu::DataFormat::Format32_32_32_32 || + dmft == AmdGpu::DataFormat::Format32_32_32 || + dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32)); } const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); - const IR::Value value = is_format ? 
ir.LoadBufferFormat(num_dwords, handle, address, info) - : ir.LoadBuffer(num_dwords, handle, address, info); + const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info); const IR::VectorReg dst_reg{inst.src[1].code}; if (num_dwords == 1) { ir.SetVectorReg(dst_reg, IR::F32{value}); @@ -396,8 +392,34 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_forma } } -void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, - const GcnInst& inst) { +void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) { + const auto& mubuf = inst.control.mubuf; + const IR::VectorReg vaddr{inst.src[0].code}; + const IR::ScalarReg sharp{inst.src[2].code * 4}; + ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported"); + const IR::Value address = [&] -> IR::Value { + if (mubuf.idxen) { + return ir.GetVectorReg(vaddr); + } + return {}; + }(); + const IR::Value soffset{GetSrc(inst.src[3])}; + ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported"); + + IR::BufferInstInfo info{}; + info.index_enable.Assign(mubuf.idxen); + + const IR::Value handle = + ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), + ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); + const IR::Value value = ir.LoadBufferFormat(handle, address, info); + const IR::VectorReg dst_reg{inst.src[1].code}; + for (u32 i = 0; i < num_dwords; i++) { + ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)}); + } +} + +void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) { const auto& mtbuf = inst.control.mtbuf; const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; @@ -417,45 +439,76 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_form info.index_enable.Assign(mtbuf.idxen); info.offset_enable.Assign(mtbuf.offen); info.inst_offset.Assign(mtbuf.offset); - info.is_typed.Assign(is_typed); if (is_typed) { - info.dmft.Assign(static_cast(mtbuf.dfmt)); - info.nfmt.Assign(static_cast(mtbuf.nfmt)); + const auto dmft = static_cast(mtbuf.dfmt); + const auto nfmt = static_cast(mtbuf.nfmt); + ASSERT(nfmt == AmdGpu::NumberFormat::Float && + (dmft == AmdGpu::DataFormat::Format32_32_32_32 || + dmft == AmdGpu::DataFormat::Format32_32_32 || + dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32)); } IR::Value value{}; const IR::VectorReg src_reg{inst.src[1].code}; switch (num_dwords) { case 1: - value = ir.GetVectorReg(src_reg); + value = ir.GetVectorReg(src_reg); break; case 2: - value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), - ir.GetVectorReg(src_reg + 1)); + value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), + ir.GetVectorReg(src_reg + 1)); break; case 3: - value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), - ir.GetVectorReg(src_reg + 1), - ir.GetVectorReg(src_reg + 2)); + value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), + ir.GetVectorReg(src_reg + 1), + ir.GetVectorReg(src_reg + 2)); break; case 4: - value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), - ir.GetVectorReg(src_reg + 1), - ir.GetVectorReg(src_reg + 2), - ir.GetVectorReg(src_reg + 3)); + value = ir.CompositeConstruct( + ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1), + ir.GetVectorReg(src_reg + 2), ir.GetVectorReg(src_reg + 3)); break; } const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), 
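After this refactor the dword count stays in the opcode only for untyped accesses; formatted accesses always move one full texel through the IR, with the translator padding stores to four components and extracting only the requested lanes on loads. A consumer-side usage sketch under the new signatures:

    // Formatted path: LoadBufferFormat always yields an F32x4 texel; the
    // translator extracts only the components the GCN opcode asked for.
    const IR::Value texel = ir.LoadBufferFormat(handle, address, info);
    for (u32 i = 0; i < num_dwords; i++) {
        ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(texel, i)});
    }

    // Untyped path: the width still selects LoadBufferF32..F32x4 internally.
    const IR::Value raw = ir.LoadBuffer(num_dwords, handle, address, info);

On the store side the zero padding is harmless because the texel-buffer view's format decides how many components actually reach memory.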
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 65de98b7..473ae4f6 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -325,20 +325,8 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad
     }
 }
 
-Value IREmitter::LoadBufferFormat(int num_dwords, const Value& handle, const Value& address,
-                                  BufferInstInfo info) {
-    switch (num_dwords) {
-    case 1:
-        return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address);
-    case 2:
-        return Inst(Opcode::LoadBufferFormatF32x2, Flags{info}, handle, address);
-    case 3:
-        return Inst(Opcode::LoadBufferFormatF32x3, Flags{info}, handle, address);
-    case 4:
-        return Inst(Opcode::LoadBufferFormatF32x4, Flags{info}, handle, address);
-    default:
-        UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
-    }
+Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info) {
+    return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address);
 }
 
 void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address,
@@ -409,24 +397,9 @@ Value IREmitter::BufferAtomicSwap(const Value& handle, const Value& address, con
     return Inst(Opcode::BufferAtomicSwap32, Flags{info}, handle, address, value);
 }
 
-void IREmitter::StoreBufferFormat(int num_dwords, const Value& handle, const Value& address,
-                                  const Value& data, BufferInstInfo info) {
-    switch (num_dwords) {
-    case 1:
-        Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data);
-        break;
-    case 2:
-        Inst(Opcode::StoreBufferFormatF32x2, Flags{info}, handle, address, data);
-        break;
-    case 3:
-        Inst(Opcode::StoreBufferFormatF32x3, Flags{info}, handle, address, data);
-        break;
-    case 4:
-        Inst(Opcode::StoreBufferFormatF32x4, Flags{info}, handle, address, data);
-        break;
-    default:
-        UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
-    }
+void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, const Value& data,
+                                  BufferInstInfo info) {
+    Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data);
 }
 
 U32 IREmitter::LaneId() {
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index a60f4c28..de8fe450 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -92,12 +92,12 @@ public:
     [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address,
                                    BufferInstInfo info);
-    [[nodiscard]] Value LoadBufferFormat(int num_dwords, const Value& handle, const Value& address,
+    [[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address,
                                          BufferInstInfo info);
     void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data,
                      BufferInstInfo info);
-    void StoreBufferFormat(int num_dwords, const Value& handle, const Value& address,
-                           const Value& data, BufferInstInfo info);
+    void StoreBufferFormat(const Value& handle, const Value& address, const Value& data,
+                           BufferInstInfo info);
     [[nodiscard]] Value BufferAtomicIAdd(const Value& handle, const Value& address,
                                          const Value& value, BufferInstInfo info);
diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp
index a8c8b073..d6ef49cf 100644
--- a/src/shader_recompiler/ir/microinstruction.cpp
+++ b/src/shader_recompiler/ir/microinstruction.cpp
@@ -56,9 +56,6 @@ bool Inst::MayHaveSideEffects() const noexcept {
     case Opcode::StoreBufferF32x3:
     case Opcode::StoreBufferF32x4:
     case Opcode::StoreBufferFormatF32:
-    case Opcode::StoreBufferFormatF32x2:
-    case Opcode::StoreBufferFormatF32x3:
-    case Opcode::StoreBufferFormatF32x4:
     case Opcode::StoreBufferU32:
     case Opcode::BufferAtomicIAdd32:
     case Opcode::BufferAtomicSMin32:
diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc
index a49ea1c7..1e33d6d4 100644
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@@ -79,19 +79,13 @@ OPCODE(LoadBufferF32,            F32,     Opaque, Opaque,        )
 OPCODE(LoadBufferF32x2,          F32x2,   Opaque, Opaque,        )
 OPCODE(LoadBufferF32x3,          F32x3,   Opaque, Opaque,        )
 OPCODE(LoadBufferF32x4,          F32x4,   Opaque, Opaque,        )
-OPCODE(LoadBufferFormatF32,      F32,     Opaque, Opaque,        )
-OPCODE(LoadBufferFormatF32x2,    F32x2,   Opaque, Opaque,        )
-OPCODE(LoadBufferFormatF32x3,    F32x3,   Opaque, Opaque,        )
-OPCODE(LoadBufferFormatF32x4,    F32x4,   Opaque, Opaque,        )
+OPCODE(LoadBufferFormatF32,      F32x4,   Opaque, Opaque,        )
 OPCODE(LoadBufferU32,            U32,     Opaque, Opaque,        )
 OPCODE(StoreBufferF32,           Void,    Opaque, Opaque, F32,   )
 OPCODE(StoreBufferF32x2,         Void,    Opaque, Opaque, F32x2, )
 OPCODE(StoreBufferF32x3,         Void,    Opaque, Opaque, F32x3, )
 OPCODE(StoreBufferF32x4,         Void,    Opaque, Opaque, F32x4, )
-OPCODE(StoreBufferFormatF32,     Void,    Opaque, Opaque, F32,   )
-OPCODE(StoreBufferFormatF32x2,   Void,    Opaque, Opaque, F32x2, )
-OPCODE(StoreBufferFormatF32x3,   Void,    Opaque, Opaque, F32x3, )
-OPCODE(StoreBufferFormatF32x4,   Void,    Opaque, Opaque, F32x4, )
+OPCODE(StoreBufferFormatF32,     Void,    Opaque, Opaque, F32x4, )
 OPCODE(StoreBufferU32,           Void,    Opaque, Opaque, U32,   )
 
 // Buffer atomic operations
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index 95fe86b4..953273b2 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -45,10 +45,6 @@ bool IsBufferStore(const IR::Inst& inst) {
     case IR::Opcode::StoreBufferF32x2:
     case IR::Opcode::StoreBufferF32x3:
     case IR::Opcode::StoreBufferF32x4:
-    case IR::Opcode::StoreBufferFormatF32:
-    case IR::Opcode::StoreBufferFormatF32x2:
-    case IR::Opcode::StoreBufferFormatF32x3:
-    case IR::Opcode::StoreBufferFormatF32x4:
     case IR::Opcode::StoreBufferU32:
         return true;
     default:
@@ -62,10 +58,6 @@ bool IsBufferInstruction(const IR::Inst& inst) {
     case IR::Opcode::LoadBufferF32x2:
     case IR::Opcode::LoadBufferF32x3:
     case IR::Opcode::LoadBufferF32x4:
-    case IR::Opcode::LoadBufferFormatF32:
-    case IR::Opcode::LoadBufferFormatF32x2:
-    case IR::Opcode::LoadBufferFormatF32x3:
-    case IR::Opcode::LoadBufferFormatF32x4:
     case IR::Opcode::LoadBufferU32:
     case IR::Opcode::ReadConstBuffer:
     case IR::Opcode::ReadConstBufferU32:
@@ -75,6 +67,11 @@ bool IsBufferInstruction(const IR::Inst& inst) {
     }
 }
 
+bool IsTextureBufferInstruction(const IR::Inst& inst) {
+    return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
+           inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32;
+}
+
 static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
     switch (num_format) {
     case AmdGpu::NumberFormat::Float:
@@ -100,28 +97,6 @@ static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_for
 
 IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
     switch (inst.GetOpcode()) {
-    case IR::Opcode::LoadBufferFormatF32:
-    case IR::Opcode::LoadBufferFormatF32x2:
-    case IR::Opcode::LoadBufferFormatF32x3:
-    case IR::Opcode::LoadBufferFormatF32x4:
-    case IR::Opcode::StoreBufferFormatF32:
-    case IR::Opcode::StoreBufferFormatF32x2:
-    case IR::Opcode::StoreBufferFormatF32x3:
-    case IR::Opcode::StoreBufferFormatF32x4:
-        switch (num_format) {
-        case AmdGpu::NumberFormat::Unorm:
-        case AmdGpu::NumberFormat::Snorm:
-        case AmdGpu::NumberFormat::Uscaled:
-        case AmdGpu::NumberFormat::Sscaled:
-        case AmdGpu::NumberFormat::Uint:
-        case AmdGpu::NumberFormat::Sint:
-        case AmdGpu::NumberFormat::SnormNz:
-            return IR::Type::U32;
-        case AmdGpu::NumberFormat::Float:
-            return IR::Type::F32;
-        default:
-            UNREACHABLE();
-        }
     case IR::Opcode::LoadBufferF32:
     case IR::Opcode::LoadBufferF32x2:
     case IR::Opcode::LoadBufferF32x3:
@@ -209,7 +184,8 @@ u32 ImageOffsetArgumentPosition(const IR::Inst& inst) {
 class Descriptors {
 public:
     explicit Descriptors(Info& info_)
-        : info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images},
+        : info{info_}, buffer_resources{info_.buffers},
+          texture_buffer_resources{info_.texture_buffers}, image_resources{info_.images},
           sampler_resources{info_.samplers} {}
 
     u32 Add(const BufferResource& desc) {
@@ -225,6 +201,16 @@ public:
         return index;
     }
 
+    u32 Add(const TextureBufferResource& desc) {
+        const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) {
+            return desc.sgpr_base == existing.sgpr_base &&
+                   desc.dword_offset == existing.dword_offset;
+        })};
+        auto& buffer = texture_buffer_resources[index];
+        buffer.is_written |= desc.is_written;
+        return index;
+    }
+
     u32 Add(const ImageResource& desc) {
         const u32 index{Add(image_resources, desc, [&desc](const auto& existing) {
             return desc.sgpr_base == existing.sgpr_base &&
@@ -259,6 +245,7 @@ private:
 
     const Info& info;
     BufferResourceList& buffer_resources;
+    TextureBufferResourceList& texture_buffer_resources;
     ImageResourceList& image_resources;
     SamplerResourceList& sampler_resources;
 };
@@ -355,20 +342,6 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
     };
 }
 
-static constexpr size_t MaxUboSize = 65536;
-
-static bool IsLoadBufferFormat(const IR::Inst& inst) {
-    switch (inst.GetOpcode()) {
-    case IR::Opcode::LoadBufferFormatF32:
-    case IR::Opcode::LoadBufferFormatF32x2:
-    case IR::Opcode::LoadBufferFormatF32x3:
-    case IR::Opcode::LoadBufferFormatF32x4:
-        return true;
-    default:
-        return false;
-    }
-}
-
 static u32 BufferLength(const AmdGpu::Buffer& buffer) {
     const auto stride = buffer.GetStride();
     if (stride < sizeof(f32)) {
@@ -434,14 +407,6 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
 
     // Update buffer descriptor format.
     const auto inst_info = inst.Flags<IR::BufferInstInfo>();
-    auto& buffer_desc = info.buffers[binding];
-    if (inst_info.is_typed) {
-        buffer_desc.dfmt = inst_info.dmft;
-        buffer_desc.nfmt = inst_info.nfmt;
-    } else {
-        buffer_desc.dfmt = buffer.GetDataFmt();
-        buffer_desc.nfmt = buffer.GetNumberFmt();
-    }
 
     // Replace handle with binding index in buffer resource list.
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
@@ -454,20 +419,7 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
         return;
     }
 
-    if (IsLoadBufferFormat(inst)) {
-        if (UseFP16(buffer.GetDataFmt(), buffer.GetNumberFmt())) {
-            info.uses_fp16 = true;
-        }
-    } else {
-        const u32 stride = buffer.GetStride();
-        if (stride < 4) {
-            LOG_WARNING(Render_Vulkan,
-                        "non-formatting load_buffer_* is not implemented for stride {}", stride);
-        }
-    }
-
-    // Compute address of the buffer using the stride.
-    // Todo: What if buffer is rebound with different stride?
     IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
     if (inst_info.index_enable) {
         const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
@@ -482,6 +434,25 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
     inst.SetArg(1, address);
 }
 
+void PatchTextureBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
+                                   Descriptors& descriptors) {
+    const IR::Inst* handle = inst.Arg(0).InstRecursive();
+    const IR::Inst* producer = handle->Arg(0).InstRecursive();
+    const auto sharp = TrackSharp(producer);
+    const auto buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
+    const s32 binding = descriptors.Add(TextureBufferResource{
+        .sgpr_base = sharp.sgpr_base,
+        .dword_offset = sharp.dword_offset,
+        .nfmt = buffer.GetNumberFmt(),
+        .is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32,
+    });
+
+    // Replace handle with binding index in texture buffer resource list.
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    inst.SetArg(0, ir.Imm32(binding));
+    ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
+}
+
 IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& t,
                          const IR::Value& z, bool is_storage) {
     // When cubemap is written with imageStore it is treated like 2DArray.
@@ -666,6 +637,10 @@ void ResourceTrackingPass(IR::Program& program) {
                 PatchBufferInstruction(*block, inst, info, descriptors);
                 continue;
             }
+            if (IsTextureBufferInstruction(inst)) {
+                PatchTextureBufferInstruction(*block, inst, info, descriptors);
+                continue;
+            }
             if (IsImageInstruction(inst)) {
                 PatchImageInstruction(*block, inst, info, descriptors);
             }
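Both Add() overloads above funnel into a private deduplicating helper that falls outside this diff's context lines; it presumably looks something like the following (hypothetical reconstruction). Because duplicates collapse to one index, a shader that both fetches and stores the same V# ends up with a single texture-buffer binding whose is_written flag is the OR of all uses, which is why the merge happens after the lookup:

    #include <algorithm>
    #include <ranges>

    // Hypothetical shape of the private helper; not shown in the patch.
    template <typename DescriptorList, typename Descriptor, typename Func>
    static u32 Add(DescriptorList& descriptors, const Descriptor& desc, Func&& pred) {
        const auto it = std::ranges::find_if(descriptors, pred);
        if (it != descriptors.end()) {
            return static_cast<u32>(std::distance(descriptors.begin(), it));
        }
        descriptors.push_back(desc);
        return static_cast<u32>(descriptors.size() - 1);
    }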
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index 52087a65..82d99f99 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -29,6 +29,10 @@ void Visit(Info& info, IR::Inst& inst) {
     case IR::Opcode::ImageWrite:
         info.has_storage_images = true;
         break;
+    case IR::Opcode::LoadBufferFormatF32:
+    case IR::Opcode::StoreBufferFormatF32:
+        info.has_texel_buffers = true;
+        break;
     case IR::Opcode::QuadShuffle:
         info.uses_group_quad = true;
         break;
diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h
index 7868a5a3..fba04f33 100644
--- a/src/shader_recompiler/ir/reg.h
+++ b/src/shader_recompiler/ir/reg.h
@@ -66,9 +66,6 @@ union BufferInstInfo {
     BitField<0, 1, u32> index_enable;
     BitField<1, 1, u32> offset_enable;
     BitField<2, 12, u32> inst_offset;
-    BitField<14, 4, AmdGpu::DataFormat> dmft;
-    BitField<18, 3, AmdGpu::NumberFormat> nfmt;
-    BitField<21, 1, u32> is_typed;
 };
 
 enum class ScalarReg : u32 {
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 3e087647..62318a20 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -81,8 +81,6 @@ struct BufferResource {
     u32 length;
     IR::Type used_types;
     AmdGpu::Buffer inline_cbuf;
-    AmdGpu::DataFormat dfmt;
-    AmdGpu::NumberFormat nfmt;
     bool is_instance_data{};
     bool is_written{};
 
@@ -107,6 +105,23 @@ struct BufferResource {
 };
 using BufferResourceList = boost::container::static_vector<BufferResource, 16>;
 
+struct TextureBufferResource {
+    u32 sgpr_base;
+    u32 dword_offset;
+    AmdGpu::NumberFormat nfmt;
+    bool is_written{};
+
+    u64 GetKey(const Info& info) const {
+        const auto sharp = GetVsharp(info);
+        const bool is_integer = sharp.GetNumberFmt() == AmdGpu::NumberFormat::Uint ||
+                                sharp.GetNumberFmt() == AmdGpu::NumberFormat::Sint;
+        return is_integer;
+    }
+
+    constexpr AmdGpu::Buffer GetVsharp(const Info& info) const noexcept;
+};
+using TextureBufferResourceList = boost::container::static_vector<TextureBufferResource, 16>;
+
 struct ImageResource {
     u32 sgpr_base;
     u32 dword_offset;
@@ -207,6 +222,7 @@ struct Info {
     s8 instance_offset_sgpr = -1;
 
     BufferResourceList buffers;
+    TextureBufferResourceList texture_buffers;
     ImageResourceList images;
     SamplerResourceList samplers;
 
@@ -222,12 +238,13 @@ struct Info {
     u64 pgm_hash{};
     u32 shared_memory_size{};
     bool has_storage_images{};
+    bool has_texel_buffers{};
     bool has_discard{};
     bool has_image_gather{};
     bool has_image_query{};
     bool uses_group_quad{};
     bool uses_shared{};
-    bool uses_fp16{};
+    bool uses_fp16{true};
     bool uses_step_rates{};
     bool translation_failed{}; // indicates that shader has unsupported instructions
@@ -243,7 +260,7 @@ struct Info {
     }
 
     size_t NumBindings() const noexcept {
-        return buffers.size() + images.size() + samplers.size();
+        return buffers.size() + texture_buffers.size() + images.size() + samplers.size();
     }
 
     u64 GetStageSpecializedKey(u32 binding = 0) const noexcept {
         u64 key = HashCombine(pgm_hash, binding);
         for (const auto& buffer : buffers) {
             key = HashCombine(key, buffer.GetKey(*this));
         }
+        for (const auto& buffer : texture_buffers) {
+            key = HashCombine(key, buffer.GetKey(*this));
+        }
         for (const auto& image : images) {
             key = HashCombine(key, image.GetKey(*this));
         }
@@ -274,6 +294,10 @@ constexpr AmdGpu::Buffer BufferResource::GetVsharp(const Info& info) const noexc
     return inline_cbuf ? inline_cbuf : info.ReadUd<AmdGpu::Buffer>(sgpr_base, dword_offset);
 }
 
+constexpr AmdGpu::Buffer TextureBufferResource::GetVsharp(const Info& info) const noexcept {
+    return info.ReadUd<AmdGpu::Buffer>(sgpr_base, dword_offset);
+}
+
 constexpr AmdGpu::Image ImageResource::GetTsharp(const Info& info) const noexcept {
     return info.ReadUd<AmdGpu::Image>(sgpr_base, dword_offset);
 }
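A texture buffer contributes a single bit to the specialization key (integer versus float sampled type), which is exactly what the SPIR-V image declaration bakes in; rebinding the same sharp with a different number class therefore reselects or recompiles the module. HashCombine is assumed to be the usual boost-style mixer, along these lines:

    // Assumed definition (boost::hash_combine recipe); the real one lives in
    // src/common and may differ in detail.
    constexpr u64 HashCombine(const u64 seed, const u64 hash) {
        return seed ^ (hash + 0x9e3779b9ULL + (seed << 6) + (seed >> 2));
    }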
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index 001d0eed..a2590670 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -33,6 +33,15 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
             .stageFlags = vk::ShaderStageFlagBits::eCompute,
         });
     }
+    for (const auto& tex_buffer : info->texture_buffers) {
+        bindings.push_back({
+            .binding = binding++,
+            .descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
+                                                    : vk::DescriptorType::eUniformTexelBuffer,
+            .descriptorCount = 1,
+            .stageFlags = vk::ShaderStageFlagBits::eCompute,
+        });
+    }
     for (const auto& image : info->images) {
         bindings.push_back({
             .binding = binding++,
@@ -91,6 +100,7 @@ ComputePipeline::~ComputePipeline() = default;
 bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
                                     VideoCore::TextureCache& texture_cache) const {
     // Bind resource buffers and textures.
+    boost::container::static_vector<vk::BufferView, 8> buffer_views;
     boost::container::static_vector<vk::DescriptorBufferInfo, 16> buffer_infos;
     boost::container::static_vector<vk::DescriptorImageInfo, 16> image_infos;
     boost::container::small_vector<vk::WriteDescriptorSet, 16> set_writes;
@@ -141,6 +151,41 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
         });
     }
 
+    for (const auto& tex_buffer : info->texture_buffers) {
+        const auto vsharp = tex_buffer.GetVsharp(*info);
+        vk::BufferView& buffer_view = buffer_views.emplace_back(VK_NULL_HANDLE);
+        if (vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) {
+            const VAddr address = vsharp.base_address;
+            const u32 size = vsharp.GetSize();
+            if (tex_buffer.is_written) {
+                texture_cache.InvalidateMemory(address, size, true);
+            }
+            const u32 alignment = instance.TexelBufferMinAlignment();
+            const auto [vk_buffer, offset] =
+                buffer_cache.ObtainBuffer(address, size, tex_buffer.is_written);
+            const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3;
+            ASSERT_MSG(fmt_stride == vsharp.GetStride(),
+                       "Texel buffer stride must match format stride");
+            const u32 offset_aligned = Common::AlignDown(offset, alignment);
+            const u32 adjust = offset - offset_aligned;
+            if (adjust != 0) {
+                ASSERT(adjust % fmt_stride == 0);
+                push_data.AddOffset(binding, adjust / fmt_stride);
+            }
+            buffer_view = vk_buffer->View(offset_aligned, size + adjust, vsharp.GetDataFmt(),
+                                          vsharp.GetNumberFmt());
+        }
+        set_writes.push_back({
+            .dstSet = VK_NULL_HANDLE,
+            .dstBinding = binding++,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
+                                                    : vk::DescriptorType::eUniformTexelBuffer,
+            .pTexelBufferView = &buffer_view,
+        });
+    }
+
     for (const auto& image_desc : info->images) {
         const auto tsharp = image_desc.GetTsharp(*info);
         VideoCore::ImageInfo image_info{tsharp};
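The adjust/offset_aligned dance above exists because VkBufferView offsets must be multiples of minTexelBufferOffsetAlignment, while guest V# base addresses only have to be format-aligned. Worked through with assumed numbers:

    // Assumed device limit of 256 and an R32 buffer suballocated at byte 260.
    const u32 alignment = 256;             // minTexelBufferOffsetAlignment
    const u32 offset = 260;                // where the guest data really starts
    const u32 fmt_stride = 4;              // bytes per texel (Format32)
    const u32 offset_aligned = offset & ~(alignment - 1); // 256, a legal view offset
    const u32 adjust = offset - offset_aligned;           // 4 bytes = 1 texel
    // push_data.AddOffset(binding, adjust / fmt_stride) publishes "1"; the shader
    // adds it to every coordinate (coord_offset), so fetches land on byte 260.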
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 5b57d81a..6e7c7782 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -316,6 +316,15 @@ void GraphicsPipeline::BuildDescSetLayout() {
             .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment,
         });
     }
+    for (const auto& tex_buffer : stage->texture_buffers) {
+        bindings.push_back({
+            .binding = binding++,
+            .descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
+                                                    : vk::DescriptorType::eUniformTexelBuffer,
+            .descriptorCount = 1,
+            .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment,
+        });
+    }
     for (const auto& image : stage->images) {
         bindings.push_back({
             .binding = binding++,
@@ -346,6 +355,7 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
                                      VideoCore::BufferCache& buffer_cache,
                                      VideoCore::TextureCache& texture_cache) const {
     // Bind resource buffers and textures.
+    boost::container::static_vector<vk::BufferView, 8> buffer_views;
     boost::container::static_vector<vk::DescriptorBufferInfo, 16> buffer_infos;
     boost::container::static_vector<vk::DescriptorImageInfo, 16> image_infos;
     boost::container::small_vector<vk::WriteDescriptorSet, 16> set_writes;
@@ -394,6 +404,38 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
         });
     }
 
+    for (const auto& tex_buffer : stage->texture_buffers) {
+        const auto vsharp = tex_buffer.GetVsharp(*stage);
+        vk::BufferView& buffer_view = buffer_views.emplace_back(VK_NULL_HANDLE);
+        if (vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) {
+            const VAddr address = vsharp.base_address;
+            const u32 size = vsharp.GetSize();
+            const u32 alignment = instance.TexelBufferMinAlignment();
+            const auto [vk_buffer, offset] =
+                buffer_cache.ObtainBuffer(address, size, tex_buffer.is_written);
+            const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3;
+            ASSERT_MSG(fmt_stride == vsharp.GetStride(),
+                       "Texel buffer stride must match format stride");
+            const u32 offset_aligned = Common::AlignDown(offset, alignment);
+            const u32 adjust = offset - offset_aligned;
+            if (adjust != 0) {
+                ASSERT(adjust % fmt_stride == 0);
+                push_data.AddOffset(binding, adjust / fmt_stride);
+            }
+            buffer_view = vk_buffer->View(offset_aligned, size + adjust, vsharp.GetDataFmt(),
+                                          vsharp.GetNumberFmt());
+        }
+        set_writes.push_back({
+            .dstSet = VK_NULL_HANDLE,
+            .dstBinding = binding++,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
+                                                    : vk::DescriptorType::eUniformTexelBuffer,
+            .pTexelBufferView = &buffer_view,
+        });
+    }
+
     boost::container::static_vector<AmdGpu::Image, 16> tsharps;
     for (const auto& image_desc : stage->images) {
         const auto tsharp = image_desc.GetTsharp(*stage);
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index 4cb4741a..5f985d4a 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -192,6 +192,11 @@ public:
         return properties.limits.minStorageBufferOffsetAlignment;
     }
 
+    /// Returns the minimum required alignment for texel buffers
+    vk::DeviceSize TexelBufferMinAlignment() const {
+        return properties.limits.minTexelBufferOffsetAlignment;
+    }
+
     /// Returns the minimum alignemt required for accessing host-mapped device memory
     vk::DeviceSize NonCoherentAtomSize() const {
         return properties.limits.nonCoherentAtomSize;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 958fe14e..08c56dde 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -273,6 +273,10 @@ vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info, std::span