diff --git a/.copyrightignore b/.copyrightignore index 61b2808df3..510c6d98eb 100644 --- a/.copyrightignore +++ b/.copyrightignore @@ -29,3 +29,4 @@ run-clang-tidy.py package-list.txt .spv .pre-commit-config.yaml +.vgf diff --git a/.gitmodules b/.gitmodules index d3bfb3ed65..9702655303 100644 --- a/.gitmodules +++ b/.gitmodules @@ -50,3 +50,9 @@ [submodule "third_party/tracy"] path = third_party/tracy url = https://github.com/wolfpld/tracy.git +[submodule "third_party/flatbuffers"] + path = third_party/flatbuffers + url = https://github.com/google/flatbuffers +[submodule "third_party/ai-ml-sdk-vgf-library"] + path = third_party/ai-ml-sdk-vgf-library + url = https://github.com/arm/ai-ml-sdk-vgf-library diff --git a/antora/modules/ROOT/nav.adoc b/antora/modules/ROOT/nav.adoc index 9f9b795ecf..03744fdadb 100644 --- a/antora/modules/ROOT/nav.adoc +++ b/antora/modules/ROOT/nav.adoc @@ -97,6 +97,8 @@ *** xref:samples/extensions/tensor_and_data_graph/simple_tensor_and_data_graph/README.adoc[Simple Tensor and Data Graph] *** xref:samples/extensions/tensor_and_data_graph/graph_constants/README.adoc[Graph constants] *** xref:samples/extensions/tensor_and_data_graph/compute_shaders_with_tensors/README.adoc[Compute shaders with tensors] +*** xref:samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc[Tensor image aliasing] +*** xref:samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/README.adoc[Postprocessing with VGF] ** xref:samples/extensions/timeline_semaphore/README.adoc[Timeline semaphore] ** xref:samples/extensions/vertex_dynamic_state/README.adoc[Vertex dynamic state] ** xref:samples/extensions/dynamic_multisample_rasterization/README.adoc[Dynamic multisample rasterization] diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 9f10c15307..684966fe80 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -96,6 +96,8 @@ set(ORDER_LIST "simple_tensor_and_data_graph" "graph_constants" 
"compute_shaders_with_tensors" + "tensor_image_aliasing" + "postprocessing_with_vgf" #Performance Samples "swapchain_images" diff --git a/samples/extensions/README.adoc b/samples/extensions/README.adoc index 66dd399829..1a72157bd3 100644 --- a/samples/extensions/README.adoc +++ b/samples/extensions/README.adoc @@ -325,6 +325,12 @@ Demonstrate how to build data graph pipelines and execute neural networks: * xref:./{extension_samplespath}tensor_and_data_graph/compute_shaders_with_tensors/README.adoc[compute_shaders_with_tensors] - Explains how compute shaders can be used to write input tensors and read output tensors from a simple convolutional neural network. +* xref:./{extension_samplespath}tensor_and_data_graph/tensor_image_aliasing/README.adoc[tensor_image_aliasing] +- Explains how to implement zero-copy data exchange between tensors and images through aliasing. + +* xref:./{extension_samplespath}tensor_and_data_graph/postprocessing_with_vgf/README.adoc[postprocessing_with_vgf] +- Explains how to load and run a VGF file defining a neural network model. A VGF file contains information regarding input, output and constant tensors along with the SPIR-V code which defines the model, which can then be used to run a data graph pipeline. 
+ === xref:./{extension_samplespath}ray_tracing_invocation_reorder/README.adoc[Ray Tracing Invocation Reorder] *Extensions:* https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_EXT_ray_tracing_invocation_reorder.html[`VK_EXT_ray_tracing_invocation_reorder`] diff --git a/samples/extensions/tensor_and_data_graph/README.adoc b/samples/extensions/tensor_and_data_graph/README.adoc index 6c5384389a..fc3178789e 100644 --- a/samples/extensions/tensor_and_data_graph/README.adoc +++ b/samples/extensions/tensor_and_data_graph/README.adoc @@ -33,6 +33,8 @@ The samples in this folder are: * xref:./simple_tensor_and_data_graph/README.adoc[Simple tensor and data graph] * xref:./graph_constants/README.adoc[Graph constants] * xref:./compute_shaders_with_tensors/README.adoc[Compute shaders with tensors] +* xref:./tensor_image_aliasing/README.adoc[Tensor image aliasing] +* xref:./postprocessing_with_vgf/README.adoc[Postprocessing with VGF] == Setup diff --git a/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/CMakeLists.txt b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/CMakeLists.txt new file mode 100644 index 0000000000..c623c16c3a --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/CMakeLists.txt @@ -0,0 +1,44 @@ +# Copyright (c) 2025-2026, Arm Limited and Contributors +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 the "License"; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Define path to the third-party ai-ml-sdk-vgf-library +set(VGF_LIB_ROOT "${CMAKE_SOURCE_DIR}/third_party/ai-ml-sdk-vgf-library") + +# Include directories +set(VGF_LIB_INCLUDE_DIR "${VGF_LIB_ROOT}/include-c") +include_directories(${VGF_LIB_INCLUDE_DIR}) + +if(WIN32 OR ANDROID OR CMAKE_SYSTEM_NAME STREQUAL "Linux") + add_sample_with_tags( + ID "postprocessing_with_vgf" + CATEGORY "extensions/tensor_and_data_graph" + AUTHOR "Arm Ltd." + NAME "Postprocessing with VGF" + DESCRIPTION "Demonstrates creating and running a data graph pipeline from a VGF file" + LIBS + vgf + FILES + ${VGF_LIB_INCLUDE_DIR}/vgf/decoder.h + "postprocessing_with_vgf.h" + "postprocessing_with_vgf.cpp" + SHADER_FILES_GLSL + "base.vert" + "base.frag" + "tensor_and_data_graph/glsl/blit.frag" + "tensor_and_data_graph/glsl/fullscreen.vert" + ) +endif() \ No newline at end of file diff --git a/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/README.adoc b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/README.adoc new file mode 100644 index 0000000000..2826e80e94 --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/README.adoc @@ -0,0 +1,238 @@ +//// +- Copyright (c) 2025-2026, Arm Limited and Contributors +- +- SPDX-License-Identifier: Apache-2.0 +- +- Licensed under the Apache License, Version 2.0 the "License"; +- you may not use this file except in compliance with the License. +- You may obtain a copy of the License at +- +- http://www.apache.org/licenses/LICENSE-2.0 +- +- Unless required by applicable law or agreed to in writing, software +- distributed under the License is distributed on an "AS IS" BASIS, +- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- See the License for the specific language governing permissions and +- limitations under the License. 
+- +//// += Postprocessing with VGF + +ifdef::site-gen-antora[] +TIP: The source for this sample can be found in the https://github.com/KhronosGroup/Vulkan-Samples/tree/main/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf[Khronos Vulkan samples github repository]. +endif::[] + +image::./images/sample.png[Sample] + +== Overview + +This is the fifth sample in a series, which follows on from the previous xref:samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc[tensor image aliasing sample]. To best understand this sample, it would be helpful to have first looked at the xref:samples/extensions/tensor_and_data_graph/README.adoc[previous samples in the series]. +This sample shows how to use the VGF format, which stores information about a neural network model such as SPIR-V code, input and output info and constant data used to run a data graph pipeline. The same tensor aliasing features are used as in the previous sample xref:samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc[tensor image aliasing] to run a simple post-processing effect on a 3D scene. + +== Setup + +If you would like to build and run this sample, please make sure to first follow the setup steps on the xref:samples/extensions/tensor_and_data_graph/README.adoc[tensor and data graph] page. + +== Introduction + +A neural network model is more complicated than a single shader and so benefits from a more structured format to describe it. The VGF format is a binary file that contains all the information needed to run a neural network model, including SPIR-V code which defines the model architecture, constant data for the trained weights and information about the inputs and outputs of the model. + +VGF files are typically produced from offline tools that convert models from native formats such as PyTorch. 
In this sample, the VGF file is very simple and was generated from the same model used in the xref:samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc[tensor image aliasing sample]. + +A VGF file is made up of multiple sections, as described below: + +1. Module Table: This contains the SPIR-V code for each module. Each module is either a compute shader or a data graph. +2. Model Resource Table: This is a list of tensor descriptions (data formats, size etc.) +3. Model Sequence: This defines the order that the modules should be executed in, as well as their inputs and outputs. +4. Constant table: This contains the raw constant data for all constant tensors used in the model. + +== VGF Creation + +In this sample the VGF file has been generated offline, but there are a number of ways to create your own VGF file. + +=== ML SDK for Vulkan + +The link:https://github.com/arm/ai-ml-sdk-model-converter[ai-ml-sdk-model-converter] tool from the link:https://github.com/arm/ai-ml-sdk-for-vulkan[ai-ml-sdk-for-vulkan] can be used to create VGF files. + +This tool currently supports several different link:https://www.mlplatform.org/tosa/[TOSA] encodings as inputs such as TOSA FlatBuffers, TOSA MLIR bytecode and TOSA MLIR textual format. +The link:https://review.mlplatform.org/plugins/gitiles/tosa/tosa_mlir_translator[tosa_mlir_translator] can be used to translate TOSA MLIR dialect into TOSA Flatbuffers. +The link:https://review.mlplatform.org/plugins/gitiles/tosa/serialization_lib[TOSA Serialization Library] also provides an API for creating TOSA FlatBuffers. + +=== ai-ml-sdk-vgf-library Encoder + +The link:https://github.com/arm/ai-ml-sdk-vgf-library[ai-ml-sdk-vgf-library] provides a lower level interface to create VGF files. This repository is already included as a third_party dependency for this sample as it also provides the interface for decoding the VGF file, which is described in the next section. 
The library provides both a C and a C++ interface for decoding VGF files, in this case we are using the C interface. + +=== Neural Graphics Model Gym === + +The link:https://github.com/arm/neural-graphics-model-gym[Neural Graphics Model Gym] is a Python toolkit for developing real-time neural graphics machine learning models and can export models to VGF format. + + +== VGF Loading + +Firstly, we load the binary file into memory: + +[source,cpp,options="nowrap"] +---- +std::vector vgf_buffer = vkb::filesystem::get()->read_file_binary(vgf_file_path); +---- + +We then decode the headers and validate them: + +[source,cpp,options="nowrap"] +---- +std::vector header_decoder_memory(mlsdk_decoder_header_decoder_mem_reqs()); +mlsdk_decoder_header_decoder* header_decoder = + mlsdk_decoder_create_header_decoder(vgf_buffer.data(), header_decoder_memory.data()); +---- + +With this we can begin to read the sections within the VGF file (as described in the <>): + +[source,cpp,options="nowrap"] +---- +mlsdk_decoder_vgf_section_info section_infos[4]; +for (mlsdk_decoder_section section_type = mlsdk_decoder_section_modules; + section_type <= mlsdk_decoder_section_constants; + section_type = mlsdk_decoder_section(section_type + 1)) +{ + + mlsdk_decoder_get_header_section_info(header_decoder, section_type, §ion_infos[section_type]); + + if (section_infos[section_type].offset + section_infos[section_type].size > vgf_buffer.size()) + { + throw std::runtime_error("Corrupt VGF header (section out of bounds)."); + } +} + +std::vector module_table_decoder_memory(mlsdk_decoder_module_table_decoder_mem_reqs()); +std::vector model_resource_table_decoder_memory(mlsdk_decoder_model_resource_table_decoder_mem_reqs()); +std::vector model_sequence_decoder_memory(mlsdk_decoder_model_sequence_decoder_mem_reqs()); +std::vector constant_table_decoder_memory(mlsdk_decoder_constant_table_decoder_mem_reqs()); + +mlsdk_decoder_module_table_decoder* module_table_decoder = + 
mlsdk_decoder_create_module_table_decoder( + vgf_buffer.data() + section_infos[mlsdk_decoder_section_modules].offset, + module_table_decoder_memory.data()); + +mlsdk_decoder_model_resource_table_decoder* model_resource_table_decoder = + mlsdk_decoder_create_model_resource_table_decoder( + vgf_buffer.data() + section_infos[mlsdk_decoder_section_resources].offset, + model_resource_table_decoder_memory.data()); + +mlsdk_decoder_model_sequence_decoder* model_sequence_decoder = + mlsdk_decoder_create_model_sequence_decoder( + vgf_buffer.data() + section_infos[mlsdk_decoder_section_model_sequence].offset, + model_sequence_decoder_memory.data()); + +mlsdk_decoder_constant_table_decoder* constant_table_decoder = + mlsdk_decoder_create_constant_table_decoder( + vgf_buffer.data() + section_infos[mlsdk_decoder_section_constants].offset, + constant_table_decoder_memory.data()); +---- + +We can then start to read the tensor info from the VGF file. We start by reading all the tensor information into the `all_tensor_infos` array: + +[source,cpp,options="nowrap"] +---- +size_t num_resource_entries = mlsdk_decoder_get_model_resource_table_num_entries(model_resource_table_decoder); +all_tensor_infos.reserve(num_resource_entries); + +for (int resource_idx = 0; resource_idx < num_resource_entries; ++resource_idx) +{ + mlsdk_vk_format vk_format = mlsdk_decoder_get_vk_format(model_resource_table_decoder, resource_idx); + + mlsdk_decoder_tensor_dimensions dims_raw; + mlsdk_decoder_model_resource_table_get_tensor_shape(model_resource_table_decoder, resource_idx, &dims_raw); + std::vector tensor_shape(dims_raw.data, dims_raw.data + dims_raw.size); + + TensorInfo tensor_info; + tensor_info.binding = resource_idx; + tensor_info.dimensions = tensor_shape; + tensor_info.format = static_cast(vk_format); + + all_tensor_infos.push_back(tensor_info); +} +---- + +Next we can load the constants: + +[source,cpp,options="nowrap"] +---- +size_t num_model_constants = 
mlsdk_decoder_get_constant_table_num_entries(constant_table_decoder); + +mlsdk_decoder_constant_indexes constant_indexes; +mlsdk_decoder_model_sequence_get_segment_constant_indexes(model_sequence_decoder, 0, &constant_indexes); + +for (uint32_t idx = 0; idx < constant_indexes.size; ++idx) +{ + int model_constant_idx = constant_indexes.data[idx]; + if (model_constant_idx >= num_model_constants) + { + throw std::runtime_error("Corrupt VGF (segment constant idx out of bounds)."); + } + + uint32_t resource_index = mlsdk_decoder_constant_table_get_mrt_index(constant_table_decoder, model_constant_idx); + if (resource_index >= num_resource_entries) + { + throw std::runtime_error("Corrupt VGF (constant resource idx out of bounds)"); + } + + mlsdk_decoder_constant_data constant_data; + mlsdk_decoder_constant_table_get_data(constant_table_decoder, model_constant_idx, &constant_data); + + std::vector vector_data(constant_data.data, constant_data.data + constant_data.size); +} +---- + +We also query the input and output binding slots and save the tensor information for these tensors. 
+ +[source,cpp,options="nowrap"] +---- +// Input +{ + uint32_t resource_index = mlsdk_decoder_binding_slot_mrt_index(model_sequence_decoder, + mlsdk_decoder_model_sequence_get_input_binding_slot(model_sequence_decoder), 0); + uint32_t binding_id = mlsdk_decoder_binding_slot_binding_id(model_sequence_decoder, + mlsdk_decoder_model_sequence_get_input_binding_slot(model_sequence_decoder), 0); + + all_tensor_infos[resource_index].binding = binding_id; + input_tensor_infos.push_back(all_tensor_infos[resource_index]); +} + +// Output +{ + uint32_t resource_index = mlsdk_decoder_binding_slot_mrt_index(model_sequence_decoder, + mlsdk_decoder_model_sequence_get_output_binding_slot(model_sequence_decoder), 0); + uint32_t binding_id = mlsdk_decoder_binding_slot_binding_id(model_sequence_decoder, + mlsdk_decoder_model_sequence_get_output_binding_slot(model_sequence_decoder), 0); + + all_tensor_infos[resource_index].binding = binding_id; + output_tensor_infos.push_back(all_tensor_infos[resource_index]); +} +---- + +Lastly, we get the SPIR-V code and the entry point, which is used for setting up a data graph pipeline. + +[source,cpp,options="nowrap"] +---- +int32_t module_index = mlsdk_decoder_model_sequence_get_segment_module_index(model_sequence_decoder, 0); + +mlsdk_decoder_spirv_code spirv_code; +mlsdk_decoder_get_module_code(module_table_decoder, module_index, &spirv_code); +if (!spirv_code.code || spirv_code.words == 0) +{ + throw std::runtime_error("Missing SPIRV code for module."); +} + +std::vector code(spirv_code.code, spirv_code.code + spirv_code.words); +const char* entry_point = mlsdk_decoder_get_module_entry_point(module_table_decoder, 0); +---- + +We then use this information when creating our input and output tensors and our data graph pipeline. The SPIR-V code is used to create the `VkShaderModule` and the constant data is used when creating the data graph pipeline. Once created, the data graph pipeline can be executed as normal. 
+ +== Conclusion + +In this sample, we've walked through some options for creating a VGF file, loading it, and using the data from it to run a data graph pipeline. + +== Known Issues + +* The model in the VGF file uses a fixed resolution of 1280x720, so the sample will always render to a render target of this size, no matter the window size. This will result in poor quality rendering if the window size does not match 1280x720. 1280x720 is the default resolution when running the sample, so this is only a concern if you resize the window manually or using command-line arguments. diff --git a/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/images/sample.png b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/images/sample.png new file mode 100644 index 0000000000..903ab32609 Binary files /dev/null and b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/images/sample.png differ diff --git a/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/postprocessing_with_vgf.cpp b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/postprocessing_with_vgf.cpp new file mode 100644 index 0000000000..a1235712a3 --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/postprocessing_with_vgf.cpp @@ -0,0 +1,787 @@ +/* Copyright (c) 2025-2026, Arm Limited and Contributors + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "postprocessing_with_vgf.h" + +#include "common/vk_common.h" +#include "filesystem/filesystem.hpp" +#include "gltf_loader.h" +#include "gui.h" +#include "rendering/subpasses/forward_subpass.h" +#include "scene_graph/components/perspective_camera.h" +#include "stats/stats.h" + +#include + +PostprocessingWithVgf::PostprocessingWithVgf() +{ + // Declare that we need the data graph and tensor extensions + add_device_extension("VK_ARM_tensors"); + add_device_extension("VK_ARM_data_graph"); + // These extensions are dependencies of the above, so we need to add them too. + add_device_extension("VK_KHR_maintenance5"); + add_device_extension("VK_KHR_deferred_host_operations"); +} + +uint32_t PostprocessingWithVgf::get_api_version() const +{ + return VK_API_VERSION_1_3; // Required by the emulation layers +} + +PostprocessingWithVgf::~PostprocessingWithVgf() +{ + if (data_graph_pipeline_descriptor_set != VK_NULL_HANDLE) + { + vkFreeDescriptorSets(get_device().get_handle(), descriptor_pool, 1, &data_graph_pipeline_descriptor_set); + } + if (descriptor_pool != VK_NULL_HANDLE) + { + vkDestroyDescriptorPool(get_device().get_handle(), descriptor_pool, nullptr); + } + + // Make sure resources created in the render pipeline are destroyed before the Device gets destroyed. + // TODO: Could move this to the base VulkanSample class and upstream this patch. + set_render_pipeline(nullptr); +} + +/** + * @brief Overridden to declare that we require some physical device features to be enabled. + */ +void PostprocessingWithVgf::request_gpu_features(vkb::core::PhysicalDeviceC &gpu) +{ + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceVulkan12Features, shaderInt8); + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceVulkan13Features, synchronization2); + + // Enable the features for tensors and data graphs which we intend to use. 
+ REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceTensorFeaturesARM, tensors); + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceTensorFeaturesARM, shaderTensorAccess); + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceDataGraphFeaturesARM, dataGraph); + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceDataGraphFeaturesARM, dataGraphShaderModule); + + // Update-after-bind is required for the emulation layer + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceVulkan12Features, descriptorBindingUniformBufferUpdateAfterBind); + + // Enable Int16 and Int64, if available. + if (gpu.get_features().shaderInt16) + { + gpu.get_mutable_requested_features().shaderInt16 = VK_TRUE; + } + else + { + throw std::runtime_error("Required feature VkPhysicalDeviceFeatures::shaderInt16 is not supported."); + } + + if (gpu.get_features().shaderInt64) + { + gpu.get_mutable_requested_features().shaderInt64 = VK_TRUE; + } + else + { + throw std::runtime_error("Required feature VkPhysicalDeviceFeatures::shaderInt64 is not supported."); + } +} + +bool PostprocessingWithVgf::prepare(const vkb::ApplicationOptions &options) +{ + if (!VulkanSample::prepare(options)) + { + return false; + } + + // Workaround for emulation layer issue, remove once fixed. + volkLoadDevice(get_device().get_handle()); + + // Load a 3D to be rendered and set up a camera to view it + load_scene("scenes/sponza/Sponza01.gltf"); + auto &camera_node = vkb::add_free_camera(get_scene(), "main_camera", get_render_context().get_surface_extent()); + vkb::sg::Camera &camera = camera_node.get_component(); + + // Create a forward rendering pipeline to render the scene. 
+ vkb::ShaderSource vert_shader("base.vert.spv"); + vkb::ShaderSource frag_shader("base.frag.spv"); + auto scene_subpass = std::make_unique(get_render_context(), std::move(vert_shader), std::move(frag_shader), get_scene(), camera); + + auto render_pipeline = std::make_unique(); + render_pipeline->add_subpass(std::move(scene_subpass)); + render_pipeline->prepare(); + + set_render_pipeline(std::move(render_pipeline)); + + // Load data from VGF file. + vgf_data = load_vgf("shaders/tensor_and_data_graph/postprocessing_with_vgf/vgf/simple_conv2d_rescale_graph.vgf"); + + // Create Vulkan resources (see individual functions for details) + // All resources are created with a size of 1280x720 which is what the VGF expects. + prepare_scene_render_target(1280, 720); +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + prepare_input_image(); +#endif + prepare_output_image(); + prepare_input_tensor(); + prepare_output_tensor(); + prepare_descriptor_pool(); + prepare_data_graph_pipeline(); + prepare_data_graph_pipeline_descriptor_set(); + + // Create a RenderPipeline to blit `output_image` to the swapchain + blit_pipeline = std::make_unique(); + blit_pipeline->add_subpass(std::make_unique(get_render_context(), output_image_view.get())); + blit_pipeline->prepare(); + + // Create a GUI so that we can toggle the neural network on and off (see draw_gui() function) + create_gui(*window, &get_stats()); + + return true; +} + +VgfData PostprocessingWithVgf::load_vgf(const std::string &vgf_file_path) +{ + std::vector vgf_buffer = vkb::filesystem::get()->read_file_binary(vgf_file_path); + + if (vgf_buffer.size() == 0) + { + throw std::runtime_error("Error loading VGF file: " + vgf_file_path); + } + + // Parse VGF header which contains details of other sections in the file. 
+ std::vector header_decoder_memory(mlsdk_decoder_header_decoder_mem_reqs()); + mlsdk_decoder_header_decoder *header_decoder = + mlsdk_decoder_create_header_decoder(vgf_buffer.data(), header_decoder_memory.data()); + + if (!mlsdk_decoder_is_header_valid(header_decoder)) + { + throw std::runtime_error("VGF header is not valid."); + } + if (!mlsdk_decoder_is_header_compatible(header_decoder)) + { + throw std::runtime_error("VGF header is not compatible."); + } + + // Create decoder objects for each section in the VGF that we care about: + // Module Table: + // Each module is either a compute shader or a data graph. + // The order of these is arbitrary and there is a further information in the VGF + // that describes how to run these. + // Model Resource Table: + // This is a list of tensor descriptions (data formats, size etc.) which is indexed + // into by other fields in the VGF. + // Model Sequence: + // This defines the order that the modules should be executed in as well as their inputs and outputs. + // Constant table: + // Contains the raw constant data for all constant tensors used in the model. 
+ mlsdk_decoder_vgf_section_info section_infos[4]; + for (mlsdk_decoder_section section_type = mlsdk_decoder_section_modules; + section_type <= mlsdk_decoder_section_constants; + section_type = mlsdk_decoder_section(section_type + 1)) + { + mlsdk_decoder_get_header_section_info(header_decoder, section_type, §ion_infos[section_type]); + + if (section_infos[section_type].offset + section_infos[section_type].size > vgf_buffer.size()) + { + throw std::runtime_error("Corrupt VGF header (section out of bounds)."); + } + } + + // Get the decoders + std::vector module_table_decoder_memory(mlsdk_decoder_module_table_decoder_mem_reqs()); + std::vector model_resource_table_decoder_memory(mlsdk_decoder_model_resource_table_decoder_mem_reqs()); + std::vector model_sequence_decoder_memory(mlsdk_decoder_model_sequence_decoder_mem_reqs()); + std::vector constant_table_decoder_memory(mlsdk_decoder_constant_table_decoder_mem_reqs()); + + mlsdk_decoder_module_table_decoder *module_table_decoder = + mlsdk_decoder_create_module_table_decoder( + vgf_buffer.data() + section_infos[mlsdk_decoder_section_modules].offset, + module_table_decoder_memory.data()); + + mlsdk_decoder_model_resource_table_decoder *model_resource_table_decoder = + mlsdk_decoder_create_model_resource_table_decoder( + vgf_buffer.data() + section_infos[mlsdk_decoder_section_resources].offset, + model_resource_table_decoder_memory.data()); + + mlsdk_decoder_model_sequence_decoder *model_sequence_decoder = + mlsdk_decoder_create_model_sequence_decoder( + vgf_buffer.data() + section_infos[mlsdk_decoder_section_model_sequence].offset, + model_sequence_decoder_memory.data()); + + mlsdk_decoder_constant_table_decoder *constant_table_decoder = + mlsdk_decoder_create_constant_table_decoder( + vgf_buffer.data() + section_infos[mlsdk_decoder_section_constants].offset, + constant_table_decoder_memory.data()); + + if (!module_table_decoder) + throw std::runtime_error("Failed to create module table decoder."); + if 
(!model_resource_table_decoder) + throw std::runtime_error("Failed to create module resource table decoder."); + if (!model_sequence_decoder) + throw std::runtime_error("Failed to create module sequence decoder."); + if (!constant_table_decoder) + throw std::runtime_error("Failed to create constant table decoder."); + + size_t num_modules = mlsdk_decoder_get_module_table_num_entries(module_table_decoder); + if (num_modules != 1) + { + throw std::runtime_error("Only a single module VGF is supported."); + } + + std::vector all_tensor_infos; + std::vector input_tensor_infos; + std::vector output_tensor_infos; + + size_t num_resource_entries = mlsdk_decoder_get_model_resource_table_num_entries(model_resource_table_decoder); + all_tensor_infos.reserve(num_resource_entries); + + // Get all resources TensorInfo + for (int resource_idx = 0; resource_idx < num_resource_entries; ++resource_idx) + { + mlsdk_vk_format vk_format = mlsdk_decoder_get_vk_format(model_resource_table_decoder, resource_idx); + + mlsdk_decoder_tensor_dimensions dims_raw; + mlsdk_decoder_model_resource_table_get_tensor_shape(model_resource_table_decoder, resource_idx, &dims_raw); + std::vector tensor_shape(dims_raw.data, dims_raw.data + dims_raw.size); + + TensorInfo tensor_info; + tensor_info.binding = resource_idx; + tensor_info.dimensions = tensor_shape; + tensor_info.format = static_cast(vk_format); + + all_tensor_infos.push_back(tensor_info); + } + + // Get the constants used in model. 
+ size_t num_model_constants = mlsdk_decoder_get_constant_table_num_entries(constant_table_decoder); + + mlsdk_decoder_constant_indexes constant_indexes; + mlsdk_decoder_model_sequence_get_segment_constant_indexes(model_sequence_decoder, 0, &constant_indexes); + + for (uint32_t idx = 0; idx < constant_indexes.size; ++idx) + { + int model_constant_idx = constant_indexes.data[idx]; + if (model_constant_idx >= num_model_constants) + { + throw std::runtime_error("Corrupt VGF (segment constant idx out of bounds)."); + } + + uint32_t resource_index = mlsdk_decoder_constant_table_get_mrt_index(constant_table_decoder, model_constant_idx); + if (resource_index >= num_resource_entries) + { + throw std::runtime_error("Corrupt VGF (constant resource idx out of bounds)"); + } + + mlsdk_decoder_constant_data constant_data; + mlsdk_decoder_constant_table_get_data(constant_table_decoder, model_constant_idx, &constant_data); + + std::vector vector_data(constant_data.data, constant_data.data + constant_data.size); + + // Now that we have the constant data and tensor info, we can populate the PipelineConstantTensor. 
+ constant_tensors.push_back(std::make_unique>()); + + constant_tensors[idx]->dimensions = all_tensor_infos[resource_index].dimensions; + constant_tensors[idx]->constant_data.resize(all_tensor_infos[resource_index].dimensions.size()); + constant_tensors[idx]->constant_data = std::move(vector_data); + + constant_tensors[idx]->tensor_description = { + VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, + nullptr, + VK_TENSOR_TILING_LINEAR_ARM, + all_tensor_infos[resource_index].format, + 4, // dimensions + constant_tensors[idx]->dimensions.data(), + nullptr, // pStrides + VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM, + }; + + constant_tensors[idx]->pipeline_constant = { + VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CONSTANT_ARM, + &constant_tensors[idx]->tensor_description, + idx, // Matches the unique identifier encoded in OpGraphConstantARM in the SPIR-V module + constant_tensors[idx]->constant_data.data() // Host pointer to raw data + }; + } + + // Input + { + uint32_t resource_index = mlsdk_decoder_binding_slot_mrt_index(model_sequence_decoder, + mlsdk_decoder_model_sequence_get_input_binding_slot(model_sequence_decoder), 0); + uint32_t binding_id = mlsdk_decoder_binding_slot_binding_id(model_sequence_decoder, + mlsdk_decoder_model_sequence_get_input_binding_slot(model_sequence_decoder), 0); + + all_tensor_infos[resource_index].binding = binding_id; + input_tensor_infos.push_back(all_tensor_infos[resource_index]); + } + + // Output + { + uint32_t resource_index = mlsdk_decoder_binding_slot_mrt_index(model_sequence_decoder, + mlsdk_decoder_model_sequence_get_output_binding_slot(model_sequence_decoder), 0); + uint32_t binding_id = mlsdk_decoder_binding_slot_binding_id(model_sequence_decoder, + mlsdk_decoder_model_sequence_get_output_binding_slot(model_sequence_decoder), 0); + + all_tensor_infos[resource_index].binding = binding_id; + output_tensor_infos.push_back(all_tensor_infos[resource_index]); + } + + int32_t module_index = 
mlsdk_decoder_model_sequence_get_segment_module_index(model_sequence_decoder, 0); + + mlsdk_decoder_spirv_code spirv_code; + mlsdk_decoder_get_module_code(module_table_decoder, module_index, &spirv_code); + if (!spirv_code.code || spirv_code.words == 0) + { + throw std::runtime_error("Missing SPIRV code for module."); + } + + std::vector code(spirv_code.code, spirv_code.code + spirv_code.words); + const char *entry_point = mlsdk_decoder_get_module_entry_point(module_table_decoder, module_index); + + return {input_tensor_infos, output_tensor_infos, code, entry_point}; +} + +/** + * Creates a RenderTarget with a single colour and depth attachment which we will render the scene into. + * The colour attachment will be aliased as a tensor input to the neural network, so needs some special flags. + */ +void PostprocessingWithVgf::prepare_scene_render_target(uint32_t width, uint32_t height) +{ + vkb::core::Image colour_image = vkb::core::ImageBuilder(width, height) + .with_format(VK_FORMAT_R8G8B8A8_UNORM) +#if TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + // Extra flags are required to allow aliasing of this image as a tensor. 
+ .with_usage(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TENSOR_ALIASING_BIT_ARM) + .with_vma_flags(VMA_ALLOCATION_CREATE_CAN_ALIAS_BIT) +#else + // No aliasing of this image - we will copy it instead + .with_usage(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) +#endif + .with_debug_name("SceneRenderColour") + .build(get_device().get_device()); + + vkb::core::Image depth_image = vkb::core::ImageBuilder(width, height) + .with_format(vkb::get_suitable_depth_format(get_device().get_gpu().get_handle())) + .with_usage(VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) + .with_debug_name("SceneRenderDepth") + .build(get_device().get_device()); + + std::vector images; + images.push_back(std::move(colour_image)); + images.push_back(std::move(depth_image)); + + scene_render_target = std::make_unique(std::move(images)); +} + +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + +/** + * In the case that we are using the workaround where we perform an additional copy then this + * function creates the additional image which we will copy the rendered scene into. + * This image will then be aliased as the tensor input to the neural network (rather than the + * scene render target being aliased directly), and needs some special flags. + */ +void PostprocessingWithVgf::prepare_input_image() +{ + input_image = std::make_unique(vkb::core::ImageBuilder(scene_render_target->get_extent().width, scene_render_target->get_extent().height) + .with_format(VK_FORMAT_R8G8B8A8_UNORM) + // Extra flags are required to allow aliasing of this image as a tensor. + .with_usage(VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TENSOR_ALIASING_BIT_ARM) + .with_vma_flags(VMA_ALLOCATION_CREATE_CAN_ALIAS_BIT) + .with_debug_name("InputImage") + .build(get_device().get_device())); +} + +#endif + +/** + * Creates an image to use as the output of the neural network. + * This will be aliased as the output tensor, so needs some special flags. 
+ */ +void PostprocessingWithVgf::prepare_output_image() +{ + output_image = std::make_unique(vkb::core::ImageBuilder(scene_render_target->get_extent().width, scene_render_target->get_extent().height) + .with_format(VK_FORMAT_R8G8B8A8_UNORM) + // Extra flags are required to allow aliasing of this image as a tensor. + .with_usage(VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TENSOR_ALIASING_BIT_ARM) + .with_vma_usage(VMA_MEMORY_USAGE_GPU_ONLY) + .with_vma_flags(VMA_ALLOCATION_CREATE_CAN_ALIAS_BIT) + .with_debug_name("OutputImage") + .build(get_device().get_device())); + output_image_view = std::make_unique(*output_image, VK_IMAGE_VIEW_TYPE_2D); +} + +/* + * Creates the Tensor used as input to the neural network, aliasing the same memory as the colour + * attachment which the scene is rendered into. + * Also creates a Tensor View (analogous to an Image View). + */ +void PostprocessingWithVgf::prepare_input_tensor() +{ +#if TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + const vkb::core::Image &image_to_alias = scene_render_target->get_views().at(0).get_image(); +#else + const vkb::core::Image &image_to_alias = *input_image; +#endif + + input_tensor = std::make_unique(get_device(), + TensorBuilder({vgf_data.input_tensor_infos[0].dimensions}) + .with_usage(VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM | VK_TENSOR_USAGE_IMAGE_ALIASING_BIT_ARM) + .with_format(vgf_data.input_tensor_infos[0].format) + .with_tiling(VK_TENSOR_TILING_OPTIMAL_ARM), + image_to_alias.get_memory(), image_to_alias.get_memory_offset()); + input_tensor_view = std::make_unique(*input_tensor); +} + +/* + * Creates the Tensor used as output of the neural network, aliasing the same memory as the network_output_image, + * which will be blitted to the screen. + * Also creates a Tensor Views (analogous to an Image View). 
+ */ +void PostprocessingWithVgf::prepare_output_tensor() +{ + const vkb::core::Image &image_to_alias = *output_image; + + output_tensor = std::make_unique(get_device(), + TensorBuilder({vgf_data.output_tensor_infos[0].dimensions}) + .with_usage(VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM | VK_TENSOR_USAGE_IMAGE_ALIASING_BIT_ARM) + .with_format(vgf_data.output_tensor_infos[0].format) + .with_tiling(VK_TENSOR_TILING_OPTIMAL_ARM), + image_to_alias.get_memory(), image_to_alias.get_memory_offset()); + output_tensor_view = std::make_unique(*output_tensor); +} + +/* + * Creates a descriptor pool which can be used to allocate descriptors for tensor bindings. + * Note we can't use vkb::DescriptorPool because it doesn't know about tensors. + */ +void PostprocessingWithVgf::prepare_descriptor_pool() +{ + std::vector descriptor_pool_sizes = { + {VK_DESCRIPTOR_TYPE_TENSOR_ARM, 10}, // Fairly arbitrary count + }; + + VkDescriptorPoolCreateInfo create_info{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO}; + create_info.poolSizeCount = descriptor_pool_sizes.size(); + create_info.pPoolSizes = descriptor_pool_sizes.data(); + create_info.maxSets = 10; // Fairly arbitrary + create_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + + VK_CHECK(vkCreateDescriptorPool(get_device().get_handle(), &create_info, nullptr, &descriptor_pool)); +} + +/* + * Creates the Pipeline Layout, a Data Graph Pipeline and a Data Graph Pipeline Session used to run the neural network. + */ +void PostprocessingWithVgf::prepare_data_graph_pipeline() +{ + // Create the Pipeline Layout. + // The neural network has its input tensor on binding 0 and its output tensor at binding 1. + std::set tensor_bindings = {0, 1}; + data_graph_pipeline_layout = std::make_unique(get_device(), tensor_bindings); + + // Create a Pipeline from the layout. 
+ std::map> tensor_descriptions; + // All bindings are in set 0 + tensor_descriptions[0] = + { + // Binding 0 is the input tensor + {0, &input_tensor->get_description()}, + // Binding 1 is the output tensor + {1, &output_tensor->get_description()}}; + + // Add constant tensors, which was prepared and stored earlier. + std::vector data_graph_pipeline_constants; + for (const auto &tensor : constant_tensors) + { + data_graph_pipeline_constants.push_back(&tensor->pipeline_constant); + } + + VkShaderModule shader_module = vkb::load_shader_from_vector(vgf_data.code, get_device().get_handle()); + + data_graph_pipeline = + std::make_unique(get_device(), + data_graph_pipeline_layout->get_handle(), + shader_module, + vgf_data.entry_point.c_str(), + tensor_descriptions, + data_graph_pipeline_constants); + + // Create a Pipeline Session for the Pipeline + VmaAllocationCreateInfo alloc_create_info = {}; + data_graph_pipeline_session = std::make_unique(get_device(), data_graph_pipeline->get_handle(), alloc_create_info); +} + +/* + * Allocates and fills in a Descriptor Set to provide bindings to the Data Graph Pipeline. 
+ */ +void PostprocessingWithVgf::prepare_data_graph_pipeline_descriptor_set() +{ + // Allocate descriptor set using the layout of the Data Graph Pipeline + VkDescriptorSetAllocateInfo alloc_info = {VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO}; + alloc_info.descriptorPool = descriptor_pool; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &data_graph_pipeline_layout->get_descriptor_set_layout(); + VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &alloc_info, &data_graph_pipeline_descriptor_set)); + + // Write bindings to it, telling it which tensors to use as input and output + std::map tensor_bindings = + { + // Binding 0 is the input tensor + {0, VkWriteDescriptorSetTensorARM{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, nullptr, 1, &input_tensor_view->get_handle()}}, + // Binding 1 is the output tensor + {1, VkWriteDescriptorSetTensorARM{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, nullptr, 1, &output_tensor_view->get_handle()}}}; + write_descriptor_set(get_device().get_handle(), data_graph_pipeline_descriptor_set, {}, tensor_bindings); +} + +void PostprocessingWithVgf::draw_renderpass(vkb::core::CommandBufferC &command_buffer, vkb::rendering::RenderTargetC &render_target) +{ + if (!enable_neural_network) + { + // If the neural network is disabled, use the default behaviour which is to render + // the scene directly to the default render target (the swapchain) + vkb::VulkanSampleC::draw_renderpass(command_buffer, render_target); + return; + } + + // When using the neural network, render the scene into the separate render target + uint32_t render_width = scene_render_target->get_extent().width; + uint32_t render_height = scene_render_target->get_extent().height; + + command_buffer.set_viewport(0, {{0.0f, 0.0f, static_cast(render_width), static_cast(render_height), 0.0f, 1.0f}}); + command_buffer.set_scissor(0, {{0, 0, render_width, render_height}}); + + // Barriers and layout transitions to get the render target's 
attachments ready for rendering + { + const VkImageMemoryBarrier2 imageBarriers[2] = { + // Colour attachment + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, +#if TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + // When rendering to an aliased tensor, the render target image would have previously been used as the input to the data graph pipeline + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // srcStageMask + VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // srcAccessMask +#else + // When rendering to a separate image, the render target image would have previously been used as a transfer source + VK_PIPELINE_STAGE_TRANSFER_BIT, // srcStageMask + VK_ACCESS_2_TRANSFER_READ_BIT, // srcAccessMask +#endif + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // dstStageMask + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(0).get_image().get_handle(), // image + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}, + // Depth attachment + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // The depth attachment would have last been used in the previous frame's rendering + VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT, // srcStageMask + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, // srcAccessMask + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT, // dstStageMask + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(1).get_image().get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_DEPTH_BIT, 0, 1, 0, 1}}}; + + VkDependencyInfo dependencyInfo = 
{VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 2; + dependencyInfo.pImageMemoryBarriers = &imageBarriers[0]; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); + } + + // Render the scene into scene_render_target + get_render_pipeline().draw(command_buffer, *scene_render_target); + command_buffer.end_render_pass(); + +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + // Barriers and layout transitions for copying the rendered scene into input_image + // (We only do this if we are not rendering directly to the aliased tensor) + { + const VkImageMemoryBarrier2 imageBarriers[2] = { + // Source image - the color image from the scene_render_target + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, // srcAccessMask + VK_PIPELINE_STAGE_TRANSFER_BIT, // dstStageMask + VK_ACCESS_2_TRANSFER_READ_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, // oldLayout + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(0).get_image().get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}, + // Destination image - the input_image for the neural network + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // srcStageMask + VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // srcAccessMask + VK_PIPELINE_STAGE_TRANSFER_BIT, // dstStageMask + VK_ACCESS_2_TRANSFER_WRITE_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + input_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}}; + + VkDependencyInfo dependencyInfo = {VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 2; + 
dependencyInfo.pImageMemoryBarriers = &imageBarriers[0]; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); + } + + // Copy the rendered scene into input_image + // (We only do this if we are not rendering directly to the aliased tensor) + { + VkImageCopy image_copy; + image_copy.srcSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + image_copy.dstOffset = {0, 0, 0}; + image_copy.dstSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + image_copy.extent = {render_width, render_height, 1}; + image_copy.srcOffset = {0, 0, 0}; + command_buffer.copy_image(scene_render_target->get_views().at(0).get_image(), *input_image, {image_copy}); + } +#endif + + // Barriers and layout transitions for network inputs and outputs to be used in data graph pipeline execution + { + { + const VkImageMemoryBarrier2 imageBarriers[2] = { +#if TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + // Input tensor (which is aliased as the scene_render_target) + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Previously was rendered to as a color attachment + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, // srcAccessMask + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // dstStageMask + VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // dstAccessMask + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, // oldLayout + + // Transition to the special layout for tensor aliasing + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(0).get_image().get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}, +#else + // Input tensor (which is aliased as input_image) + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Previously was copied to + VK_PIPELINE_STAGE_TRANSFER_BIT, // srcStageMask + VK_ACCESS_2_TRANSFER_WRITE_BIT, // srcAccessMask + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // dstStageMask + 
VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // dstAccessMask + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, // oldLayout + + // Transition to the special layout for tensor aliasing + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + input_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}, +#endif + // Output tensor (which is aliased as output_image) + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Previously was read by the blit shader + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // srcStageMask + VK_ACCESS_2_SHADER_READ_BIT, // srcAccessMask + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // dstStageMask + VK_ACCESS_2_DATA_GRAPH_WRITE_BIT_ARM, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + + // Transition to the special layout for tensor aliasing + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + output_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}} + }; + + VkDependencyInfo dependencyInfo = {VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 2; + dependencyInfo.pImageMemoryBarriers = &imageBarriers[0]; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); + } + } + + // Bind and run data graph pipeline. 
+ vkCmdBindPipeline(command_buffer.get_handle(), VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, data_graph_pipeline->get_handle()); + vkCmdBindDescriptorSets(command_buffer.get_handle(), VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, data_graph_pipeline_layout->get_handle(), + 0, 1, &data_graph_pipeline_descriptor_set, 0, nullptr); + vkCmdDispatchDataGraphARM(command_buffer.get_handle(), data_graph_pipeline_session->get_handle(), VK_NULL_HANDLE); + + // Barrier and layout transition for output_image to be a shader input + { + const VkImageMemoryBarrier2 imageBarrier = { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Was previously written to by the data graph pipeline + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // srcStageMask + VK_ACCESS_2_DATA_GRAPH_WRITE_BIT_ARM, // srcAccessMask + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask + VK_ACCESS_2_SHADER_READ_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // oldLayout + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + output_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}; + + VkDependencyInfo dependencyInfo = {VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 1; + dependencyInfo.pImageMemoryBarriers = &imageBarrier; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); + } + + // Blit output_image to the screen and draw the GUI + uint32_t screen_width = render_target.get_extent().width; + uint32_t screen_height = render_target.get_extent().height; + command_buffer.set_viewport(0, {{0.0f, 0.0f, static_cast(screen_width), static_cast(screen_height), 0.0f, 1.0f}}); + command_buffer.set_scissor(0, {{0, 0, screen_width, screen_height}}); + + blit_pipeline->draw(command_buffer, render_target); + + get_gui().draw(command_buffer); + + command_buffer.end_render_pass(); +} + +void PostprocessingWithVgf::draw_gui() +{ + // Define a checkbox to toggle the neural 
network on and off, so that you can see the effect of the edge enhancement network. + get_gui().show_options_window( + [this]() { + ImGui::Checkbox("Enable Neural Network", &enable_neural_network); + }, + 1); +} + +std::unique_ptr create_postprocessing_with_vgf() +{ + return std::make_unique(); +} diff --git a/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/postprocessing_with_vgf.h b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/postprocessing_with_vgf.h new file mode 100644 index 0000000000..dcafa2549b --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/postprocessing_with_vgf.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2025-2026, Arm Limited and Contributors + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "rendering/render_pipeline.h" + +#include "../tensor_and_data_graph_common.h" +#include "vulkan_sample.h" + +/** + * @struct TensorInfo + * @brief Describes a tensor's binding, shape, and format. + */ +struct TensorInfo +{ + uint32_t binding; ///< Binding index in the descriptor set. + std::vector dimensions; ///< Tensor shape dimensions. + VkFormat format; ///< Format of the tensor data. + std::vector data; ///< Constant data. +}; + +/** + * @struct VgfData + * @brief Represents the deserialized contents of a VGF file. 
+ * + * This structure encapsulates all relevant information extracted from a VGF binary, + * including tensor metadata, shader code and the entry point name. + */ +struct VgfData +{ + std::vector input_tensor_infos; + std::vector output_tensor_infos; + std::vector code; + std::string entry_point; +}; + +/** + * @brief Demonstrates how to use the VGF format, which stores information about the model such as SPIR-V, + input information, output information and constant data used to run a data graph pipeline. + * @details A 3D scene is rendered (using the existing Vulkan Sample framework) to an offscreen Render Target (`scene_render_target`), + * whose colour attachment is aliased to the same memory as a Tensor (`input_tensor`). This Tensor is then used as the input + * to a Data Graph Pipeline (`data_graph_pipeline`), which implements a simple sharpening filter using a convolution layer. + * The output of this Data Graph Pipeline is written to another Tensor (`output_tensor`), which is aliased to the same memory + * as an Image (`output_image`), which is then used to blit the results to the Swapchain. + * + * The VGF file configures the `input_tensor`, `output_tensor` and the `constant_tensors` and contains the SPIR-V required + * to create the VkShaderModule used by the `data_graph_pipeline`. + * + * As a diagram, this looks like: + * + * scene rendering -> scene_render_target output_image -> blit -> swapchain + * || || + * input_tensor -> data_graph_pipeline -> output_tensor + * \ || / + * \ SPIR-V & constants / + * \ || / + * \-------- vgf_data ----------/ + * || + * load VGF + * + * Because the common Vulkan Samples framework code is not aware of the Tensor resource type or Data Graph Pipelines, + * generic functionality for these concepts has been added to a new tensor_and_data_graph_common.h/cpp file, which this sample + * (and other tensor and data graph samples) makes use of. 
+ */ +class PostprocessingWithVgf : public vkb::VulkanSampleC +{ + public: + PostprocessingWithVgf(); + + ~PostprocessingWithVgf() override; + + void request_gpu_features(vkb::core::PhysicalDeviceC &gpu) override; + + bool prepare(const vkb::ApplicationOptions &options) override; + + void draw_renderpass(vkb::core::CommandBufferC &command_buffer, vkb::rendering::RenderTargetC &render_target) override; + + void draw_gui() override; + + private: + // from vkb::VulkanSample + uint32_t get_api_version() const override; + + private: + VgfData load_vgf(const std::string &vgf_file_path); + + void prepare_scene_render_target(uint32_t width, uint32_t height); + + // Determines if this sample will render directly to the (aliased) input tensor, otherwise it will render + // to a separate, dedicated image which is then copied to the input tensor. + // + // Although rendering to an image aliased as a texture is a perfectly valid (and encouraged!) use of + // the extension APIs, the current implementation of the emulation layers do not have good support for this + // kind of use and so the additional copy is a temporary workaround. + // As the emulation layers are currently the only way to run this sample, the default behaviour + // is to use this workaround so that the sample produces the expected output. The more faithful + // and performant path of rendering directly to the aliased tensor is still present and instructive, but + // cannot be executed reliably yet. + // + // Note: it is still possible to run and validate the 'render to aliased tensor' path using the emulation + // layers but it requires some small changes to remove the depth attachment from the scene render target. + // The z-order will clearly be broken, but the aliasing will work as expected. 
+#define TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE 0 + +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + void prepare_input_image(); +#endif + void prepare_output_image(); + + void prepare_input_tensor(); + void prepare_output_tensor(); + + void prepare_descriptor_pool(); + + void prepare_data_graph_pipeline(); + + void prepare_data_graph_pipeline_descriptor_set(); + + bool enable_neural_network = true; + + std::unique_ptr scene_render_target; + +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + std::unique_ptr input_image; +#endif + + std::unique_ptr input_tensor; + std::unique_ptr input_tensor_view; + + std::unique_ptr output_image; + std::unique_ptr output_image_view; + + std::unique_ptr output_tensor; + std::unique_ptr output_tensor_view; + + std::unique_ptr blit_pipeline; + + VkDescriptorPool descriptor_pool = VK_NULL_HANDLE; + + std::vector>> constant_tensors; + + std::unique_ptr data_graph_pipeline_layout; + std::unique_ptr data_graph_pipeline; + std::unique_ptr data_graph_pipeline_session; + + VkDescriptorSet data_graph_pipeline_descriptor_set = VK_NULL_HANDLE; + + VgfData vgf_data; +}; + +std::unique_ptr create_postprocessing_with_vgf(); diff --git a/samples/extensions/tensor_and_data_graph/simple_tensor_and_data_graph/README.adoc b/samples/extensions/tensor_and_data_graph/simple_tensor_and_data_graph/README.adoc index f0697ffcf5..25bc3459d4 100644 --- a/samples/extensions/tensor_and_data_graph/simple_tensor_and_data_graph/README.adoc +++ b/samples/extensions/tensor_and_data_graph/simple_tensor_and_data_graph/README.adoc @@ -1,5 +1,5 @@ //// -- Copyright (c) 2024-2025, Arm Limited and Contributors +- Copyright (c) 2024-2026, Arm Limited and Contributors - - SPDX-License-Identifier: Apache-2.0 - @@ -483,14 +483,14 @@ That's all that's necessary to run a simple neural network using a data graph pi The tensor object which is written to by the data graph pipeline can be used in numerous ways. 
In this sample we are reading from it using a compute shader which produces the simple visualization of the tensor contents which is drawn on the window. There are also several other options for how to use the output tensor: * Read directly from other shaders by binding it as a Tensor resource, as we do in this sample. -* Aliased as a regular `VkImage` and used as a texture to be sampled from in another shader. +* Aliased as a regular `VkImage` and used as a texture to be sampled from in another shader. See the "Tensor image aliasing" sample xref:samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc[README]. * Aliased as a regular `VkBuffer` and read from in another shader * Mapped to host-accessible memory and read back for further processing on the CPU There is also the question of how to produce inputs for the neural network - in this example we simply uploaded some fixed data from the CPU but there are also many options here (equivalent to the above): * Write directly from other shaders by binding it as a Tensor resource. -* Aliased as a regular `VkImage` and written to via. a render target (or writable texture). +* Aliased as a regular `VkImage` and written to via a render target (or writable texture). See the "Tensor image aliasing" sample xref:samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc[README]. * Aliased as a regular `VkBuffer` and written to from another shader. * Mapped to host-accessible memory and written to with data produced on the CPU, as we do in this sample. @@ -502,7 +502,7 @@ In this sample we've introduced the VK_ARM_tensors and VK_ARM_data_graph extensi == Appendix A: SPIR-V programs for neural networks -Neural networks can be described using SPIR-V modules, but unlike SPIR-V modules for graphics and compute shaders, there is no GLSL or HLSL syntax for expressing SPIR-V modules for data graph pipelines. 
There are tools available for generating the SPIR-V code from higher level representations of neural networks such as PyTorch or TensorFlow models (see: link:https://github.com/arm/ai-ml-sdk-model-converter[ML SDK Model Converter]). +Neural networks can be described using SPIR-V modules, but unlike SPIR-V modules for graphics and compute shaders, there is no GLSL or HLSL syntax for expressing SPIR-V modules for data graph pipelines. There are tools available for generating the SPIR-V code from higher level representations of neural networks such as PyTorch or TensorFlow models (see: link:https://github.com/arm/ai-ml-sdk-model-converter[ML SDK Model Converter]), which are covered in the "Postprocessing with VGF" sample xref:samples/extensions/tensor_and_data_graph/postprocessing_with_vgf/README.adoc[README]. However, for this sample we have written the SPIR-V code directly in low-level SPIR-V assembly language to avoid bringing in more dependencies and to give some idea of how the SPIR-V code looks. It is unlikely in practice that you would manually write SPIR-V assembly, however below is a quick overview of SPIR-V for context. The assembly code is in the link:../../../../shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm[pooling.spvasm] file and is assembled into SPIR-V binary code using `spirv-as` as part of the link:../../../../bldsys/cmake/sample_helper.cmake[build system]. Note this is exactly the same process that would be used to compile compute or graphics shaders from SPIR-V assembly, although typically these would be compiled from GLSL or HLSL instead. 
diff --git a/samples/extensions/tensor_and_data_graph/tensor_and_data_graph_common.h b/samples/extensions/tensor_and_data_graph/tensor_and_data_graph_common.h index e339347fa0..39c6ec3d25 100644 --- a/samples/extensions/tensor_and_data_graph/tensor_and_data_graph_common.h +++ b/samples/extensions/tensor_and_data_graph/tensor_and_data_graph_common.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2024-2025, Arm Limited and Contributors +/* Copyright (c) 2024-2026, Arm Limited and Contributors * * SPDX-License-Identifier: Apache-2.0 * @@ -306,7 +306,7 @@ class ComputePipelineWithTensors : public vkb::core::VulkanResourceC }; /* - * @brief Simple subpass for use with vkb::RenderPipeline, which blits an image to the render target (stretching to fit). + * @brief Simple subpass for use with vkb::rendering::RenderPipelineC, which blits an image to the render target (stretching to fit). */ class BlitSubpass : public vkb::rendering::SubpassC { diff --git a/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/CMakeLists.txt b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/CMakeLists.txt new file mode 100644 index 0000000000..adbe0b392b --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/CMakeLists.txt @@ -0,0 +1,36 @@ +# Copyright (c) 2025-2026, Arm Limited and Contributors +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 the "License"; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +if(WIN32 OR ANDROID OR CMAKE_SYSTEM_NAME STREQUAL "Linux") + add_sample_with_tags( + ID "tensor_image_aliasing" + CATEGORY "extensions/tensor_and_data_graph" + AUTHOR "Arm Ltd." + NAME "Tensor image aliasing" + DESCRIPTION "Demonstrates aliasing tensors and images to the same underlying memory to achieve zero-copy interop between rendering and data graphs" + FILES + "tensor_image_aliasing.h" + "tensor_image_aliasing.cpp" + SHADER_FILES_GLSL + "base.vert" + "base.frag" + "tensor_and_data_graph/glsl/blit.frag" + "tensor_and_data_graph/glsl/fullscreen.vert" + SHADER_FILES_SPVASM + "tensor_and_data_graph/tensor_image_aliasing/spirv/conv2d_int8.spvasm" + ) +endif() \ No newline at end of file diff --git a/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc new file mode 100644 index 0000000000..8f954a9cae --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/README.adoc @@ -0,0 +1,176 @@ +//// +- Copyright (c) 2025-2026, Arm Limited and Contributors +- +- SPDX-License-Identifier: Apache-2.0 +- +- Licensed under the Apache License, Version 2.0 the "License"; +- you may not use this file except in compliance with the License. +- You may obtain a copy of the License at +- +- http://www.apache.org/licenses/LICENSE-2.0 +- +- Unless required by applicable law or agreed to in writing, software +- distributed under the License is distributed on an "AS IS" BASIS, +- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- See the License for the specific language governing permissions and +- limitations under the License. +- +//// += Tensor Image Aliasing + +ifdef::site-gen-antora[] +TIP: The source for this sample can be found in the https://github.com/KhronosGroup/Vulkan-Samples/tree/main/samples/extensions/tensor_and_data_graph/tensor_image_aliasing[Khronos Vulkan samples github repository]. 
+endif::[] + +image::./images/sample.png[Sample] + +== Overview + +This is the fourth sample in a series, which follows on from the previous xref:samples/extensions/tensor_and_data_graph/compute_shaders_with_tensors/README.adoc[compute_shaders_with_tensors sample]. To best understand this sample, it would be helpful to have first looked at the xref:samples/extensions/tensor_and_data_graph/README.adoc[previous samples in the series]. +This sample shows how to use link:https://registry.khronos.org/vulkan/specs/latest/html/vkspec.html#resources-memory-aliasing[tensor/image aliasing] so that tensors and images use the same memory. This enables the mixing of traditional compute and graphics operations with ML operations without overhead from copying or format conversions. In this sample we show how to run a simple post-processing effect on a 3D scene using a convolutional neural network. + +== Setup + +If you would like to build and run this sample, please make sure to first follow the setup steps on the xref:samples/extensions/tensor_and_data_graph/README.adoc[tensor and data graph] page. + +== Introduction + +Neural networks are often used to process image data. In a Vulkan application, image data is usually manipulated via a `VkImage` resource, however images cannot be used directly by data graph pipelines - they require tensors as inputs and outputs. There is no API to copy data between tensor and image resources (like there is for copying between images and buffers), however there is support for _aliasing_ the same underlying memory between a tensor and an image. This means that the same memory can be used for both resources, allowing data to be written to an image and read from a tensor, or vice versa, without copying. + +This sample uses these features to run a simple edge-enhancement convolutional neural network (with a single convolution layer) as a post-processing effect on a 3D rendered scene. 
The scene is rendered (using the existing Vulkan Sample framework) to an offscreen render target (`scene_render_target`), whose colour attachment is aliased to the same memory as a tensor (`input_tensor`). This tensor is then used as the input +to a data graph pipeline (`graph_pipeline`), which implements a simple sharpening filter using a convolution layer. The output of this data graph pipeline is written to another tensor (`output_tensor`), which is aliased to the same memory as another image (`output_image`), which is then used to blit the results to the swapchain: + +image::images/flow.svg[Sample] + +== Resource creation + +When creating images that are going to be aliased as tensors, the image must be created with the `VK_IMAGE_USAGE_TENSOR_ALIASING_BIT_ARM` usage flag. In the sample code, we do this when creating the colour attachment for the render target (in `prepare_scene_render_target()`) and when creating the output image (in `prepare_output_image()`): + +[source,cpp,options="nowrap"] +---- + +vkb::core::Image colour_image = vkb::core::ImageBuilder(width, height) + .with_format(VK_FORMAT_R8G8B8A8_UNORM) + // Extra flags are required to allow aliasing of this image as a tensor. + .with_usage( + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_TENSOR_ALIASING_BIT_ARM + ) + .with_vma_flags(VMA_ALLOCATION_CREATE_CAN_ALIAS_BIT) + .with_debug_name("SceneRenderColour") + .build(get_device().get_device()); + +---- + +There is an equivalent flag to use when creating tensors, which is `VK_TENSOR_USAGE_IMAGE_ALIASING_BIT_ARM`. This flag is used when creating the input tensor in `prepare_input_tensor()` and the output tensor in `prepare_output_tensor()`. When creating these tensors, rather than using the helper class `Tensor` which also allocates and binds backing memory for the tensor, we use the `ExternallyAllocatedTensor` helper class which allows us to provide existing memory that the tensor should use. 
We use the backing memory of the image that we already created, so that the tensor and image will be aliasing each other: + +[source,cpp,options="nowrap"] +---- + +const vkb::core::Image &image_to_alias = scene_render_target->get_views().at(0).get_image(); +input_tensor = std::make_unique( + get_device(), + TensorBuilder({1, image_to_alias.get_extent().height, image_to_alias.get_extent().width, 4}) + .with_usage( + VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM | + VK_TENSOR_USAGE_IMAGE_ALIASING_BIT_ARM + ) + .with_format(VK_FORMAT_R8_SINT), + image_to_alias.get_memory(), image_to_alias.get_memory_offset() +); + +---- + +== Running the data graph pipeline + +The scene is rendered as normal into `scene_render_target` and because the colour attachment is aliased to the same memory as `input_tensor`, this tensor will get the rendered image data directly. The data graph pipeline is then executed, which reads from `input_tensor` and writes to `output_tensor`. Because `output_tensor` is aliased to `output_image`, when `output_image` is blitted to the swapchain it will contain +the results of the data graph pipeline. + +== Synchronization and image layout + +There are some important synchronisation and image layout details to keep in mind when working with aliased tensors and images. Since the tensor and image share the same memory, it's crucial to ensure that they are in compatible layouts and that proper synchronization is in place to avoid any data hazards. + +The colour attachement image of the `scene_render_target` needs to be transitioned to the `VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM` layout before its underlying memory can be used as a tensor. It also needs a barrier to ensure that the data written to it as a colour attachment is visible to the data graph pipeline when it reads from the tensor. The `oldLayout` parameter needs to be set here, because we want to preserve the existing contents. 
+ +Similarly, the `output_image` which is aliased to `output_tensor` needs to be transitioned to the `VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM` layout before it can be used as a tensor, however in this case we don't need to preserve the existing contents so set `oldLayout` to `VK_IMAGE_LAYOUT_UNDEFINED`. It also has a barrier to prevent the data graph pipeline from writing to it before the blit shader (on the previous frame) reads from it: + +[source,cpp,options="nowrap"] +---- + const VkImageMemoryBarrier2 imageBarriers[2] = { + // Input tensor (which is aliased as input_image) + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Previously was rendered to as a color attachment + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, // srcAccessMask + // Will be read by the data graph pipeline + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // dstStageMask + VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // dstAccessMask + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, // oldLayout + // Transition to the special layout for tensor aliasing + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(0).get_image().get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1} + }, + // Output tensor (which is aliased as output_image) + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Previously was read by the blit shader + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // srcStageMask + VK_ACCESS_2_SHADER_READ_BIT, // srcAccessMask + // Will be written to by the data graph pipeline + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // dstStageMask + VK_ACCESS_2_DATA_GRAPH_WRITE_BIT_ARM, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + // Transition to the special layout for tensor aliasing + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + 
output_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1} + } + }; + VkDependencyInfo dependencyInfo = {VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 2; + dependencyInfo.pImageMemoryBarriers = &imageBarriers[0]; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); +---- + +After the data graph pipeline has run, the `output_image` which is aliased to `output_tensor` needs to be transitioned out of the `VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM` layout so that it can be used as a regular image again, and it needs a barrier to ensure that the data written by the data graph pipeline is visible to the blit shader when it reads from it: + +[source,cpp,options="nowrap"] +---- +const VkImageMemoryBarrier2 imageBarrier = { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Was previously written to by the data graph pipeline + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // srcStageMask + VK_ACCESS_2_DATA_GRAPH_WRITE_BIT_ARM, // srcAccessMask + // Will be read by the blit shader + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask + VK_ACCESS_2_SHADER_READ_BIT, // dstAccessMask + // Transition out of aliasing layout + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // oldLayout + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + output_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}; + +VkDependencyInfo dependencyInfo = {VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; +dependencyInfo.imageMemoryBarrierCount = 1; +dependencyInfo.pImageMemoryBarriers = &imageBarrier; +vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); +---- + +== Conclusion + +In this sample, we've walked through how we can implement zero-copy data exchange between tensors and images to run a simple post-processing effect using a convolutional neural network. 
+ +== Known Issues + +* The model in the VGF file uses a fixed resolution of 1280x720, so the sample will always render to a render target of this size, no matter the window size. This will result in poor quality rendering if the window size does not match 1280x720. 1280x720 is the default resolution when running the sample, so this is only a concern if you resize the window manually or using command-line arguments. diff --git a/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/flow.dot b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/flow.dot new file mode 100644 index 0000000000..ceabebfe0d --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/flow.dot @@ -0,0 +1,29 @@ +// Copyright (c) 2026, Arm Limited and Contributors +digraph D { + + { + rank=same; + scene_rendering[shape=box] + scene_render_target[shape=ellipse] + output_image[shape=ellipse] + blit[shape=box] + swapchain[shape=ellipse] + } + { + rank=same; + input_tensor[shape=ellipse] + data_graph_pipeline[shape=box] + output_tensor[shape=ellipse] + } + + scene_rendering -> scene_render_target + scene_render_target -> input_tensor [arrowhead=none, style=dashed, penwidth=5] + + input_tensor -> data_graph_pipeline + data_graph_pipeline -> output_tensor + + output_tensor -> output_image[arrowhead=none, style=dashed, penwidth=5] + output_image -> blit + blit -> swapchain + +} \ No newline at end of file diff --git a/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/flow.svg b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/flow.svg new file mode 100644 index 0000000000..41be971041 --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/flow.svg @@ -0,0 +1,89 @@ + + + + + + + +D + + +scene_rendering + +scene_rendering + + +scene_render_target + +scene_render_target + + +scene_rendering->scene_render_target + + + + +input_tensor + +input_tensor + + 
+scene_render_target->input_tensor + + + +output_image + +output_image + + +blit + +blit + + +output_image->blit + + + + +swapchain + +swapchain + + +blit->swapchain + + + + +graph_pipeline + +graph_pipeline + + +input_tensor->graph_pipeline + + + + +output_tensor + +output_tensor + + +graph_pipeline->output_tensor + + + + +output_tensor->output_image + + + + diff --git a/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/sample.png b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/sample.png new file mode 100644 index 0000000000..215c8ace32 Binary files /dev/null and b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/images/sample.png differ diff --git a/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/tensor_image_aliasing.cpp b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/tensor_image_aliasing.cpp new file mode 100644 index 0000000000..6a3435c0dc --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/tensor_image_aliasing.cpp @@ -0,0 +1,633 @@ +/* Copyright (c) 2024-2026, Arm Limited and Contributors + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensor_image_aliasing.h" + +#include "common/vk_common.h" +#include "gltf_loader.h" +#include "gui.h" +#include "rendering/subpasses/forward_subpass.h" +#include "scene_graph/components/perspective_camera.h" +#include "stats/stats.h" + +TensorImageAliasing::TensorImageAliasing() +{ + // Declare that we need the data graph and tensor extensions + add_device_extension("VK_ARM_tensors"); + add_device_extension("VK_ARM_data_graph"); + // These extensions are dependencies of the above, so we need to add them too. + add_device_extension("VK_KHR_maintenance5"); + add_device_extension("VK_KHR_deferred_host_operations"); +} + +uint32_t TensorImageAliasing::get_api_version() const +{ + return VK_API_VERSION_1_3; // Required by the emulation layers +} + +TensorImageAliasing::~TensorImageAliasing() +{ + if (data_graph_pipeline_descriptor_set != VK_NULL_HANDLE) + { + vkFreeDescriptorSets(get_device().get_handle(), descriptor_pool, 1, &data_graph_pipeline_descriptor_set); + } + if (descriptor_pool != VK_NULL_HANDLE) + { + vkDestroyDescriptorPool(get_device().get_handle(), descriptor_pool, nullptr); + } + + // Make sure resources created in the render pipeline are destroyed before the Device gets destroyed. + // TODO: Could move this to the base VulkanSample class and upstream this patch. + set_render_pipeline(nullptr); +} + +/** + * @brief Overridden to declare that we require some physical device features to be enabled. + */ +void TensorImageAliasing::request_gpu_features(vkb::core::PhysicalDeviceC &gpu) +{ + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceVulkan12Features, shaderInt8); + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceVulkan13Features, synchronization2); + + // Enable the features for tensors and data graphs which we intend to use. 
+ REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceTensorFeaturesARM, tensors); + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceTensorFeaturesARM, shaderTensorAccess); + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceDataGraphFeaturesARM, dataGraph); + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceDataGraphFeaturesARM, dataGraphShaderModule); + + // Update-after-bind is required for the emulation layer + REQUEST_REQUIRED_FEATURE(gpu, VkPhysicalDeviceVulkan12Features, descriptorBindingUniformBufferUpdateAfterBind); + + // Enable Int16 and Int64, if available. + if (gpu.get_features().shaderInt16) + { + gpu.get_mutable_requested_features().shaderInt16 = VK_TRUE; + } + else + { + throw std::runtime_error("Required feature VkPhysicalDeviceFeatures::shaderInt16 is not supported."); + } + + if (gpu.get_features().shaderInt64) + { + gpu.get_mutable_requested_features().shaderInt64 = VK_TRUE; + } + else + { + throw std::runtime_error("Required feature VkPhysicalDeviceFeatures::shaderInt64 is not supported."); + } +} + +bool TensorImageAliasing::prepare(const vkb::ApplicationOptions &options) +{ + if (!VulkanSample::prepare(options)) + { + return false; + } + + // Workaround for emulation layer issue, remove once fixed. + volkLoadDevice(get_device().get_handle()); + + // Load a 3D to be rendered and set up a camera to view it + load_scene("scenes/sponza/Sponza01.gltf"); + auto &camera_node = vkb::add_free_camera(get_scene(), "main_camera", get_render_context().get_surface_extent()); + vkb::sg::Camera &camera = camera_node.get_component(); + + // Create a forward rendering pipeline to render the scene. 
+ vkb::ShaderSource vert_shader("base.vert.spv"); + vkb::ShaderSource frag_shader("base.frag.spv"); + auto scene_subpass = std::make_unique(get_render_context(), std::move(vert_shader), std::move(frag_shader), get_scene(), camera); + + auto render_pipeline = std::make_unique(); + render_pipeline->add_subpass(std::move(scene_subpass)); + render_pipeline->prepare(); + + set_render_pipeline(std::move(render_pipeline)); + + // Create Vulkan resources (see individual functions for details) + // All resources are created with a size of 1280x720 which is what the data graph expects. + prepare_scene_render_target(1280, 720); +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + prepare_input_image(); +#endif + prepare_output_image(); + prepare_input_tensor(); + prepare_output_tensor(); + prepare_descriptor_pool(); + prepare_weights_tensor(); + prepare_data_graph_pipeline(); + prepare_data_graph_pipeline_descriptor_set(); + + // Create a RenderPipeline to blit `output_image` to the swapchain + blit_pipeline = std::make_unique(); + blit_pipeline->add_subpass(std::make_unique(get_render_context(), output_image_view.get())); + blit_pipeline->prepare(); + + // Create a GUI so that we can toggle the neural network on and off (see draw_gui() function) + create_gui(*window, &get_stats()); + + return true; +} + +/** + * Creates a RenderTarget with a single colour and depth attachment which we will render the scene into. + * The colour attachment will be aliased as a tensor input to the neural network, so needs some special flags. + */ +void TensorImageAliasing::prepare_scene_render_target(uint32_t width, uint32_t height) +{ + vkb::core::Image colour_image = vkb::core::ImageBuilder(width, height) + .with_format(VK_FORMAT_R8G8B8A8_UNORM) +#if TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + // Extra flags are required to allow aliasing of this image as a tensor. 
+ .with_usage(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TENSOR_ALIASING_BIT_ARM) + .with_vma_flags(VMA_ALLOCATION_CREATE_CAN_ALIAS_BIT) +#else + // No aliasing of this image - we will copy it instead + .with_usage(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT) +#endif + .with_debug_name("SceneRenderColour") + .build(get_device().get_device()); + + vkb::core::Image depth_image = vkb::core::ImageBuilder(width, height) + .with_format(vkb::get_suitable_depth_format(get_device().get_gpu().get_handle())) + .with_usage(VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) + .with_debug_name("SceneRenderDepth") + .build(get_device().get_device()); + + std::vector images; + images.push_back(std::move(colour_image)); + images.push_back(std::move(depth_image)); + + scene_render_target = std::make_unique(std::move(images)); +} + +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + +/** + * In the case that we are using the workaround where we perform an additional copy then this + * function creates the additional image which we will copy the rendered scene into. + * This image will then be aliased as the tensor input to the neural network (rather than the + * scene render target being aliased directly), and needs some special flags. + */ +void TensorImageAliasing::prepare_input_image() +{ + input_image = std::make_unique(vkb::core::ImageBuilder(scene_render_target->get_extent().width, scene_render_target->get_extent().height) + .with_format(VK_FORMAT_R8G8B8A8_SNORM) + // Extra flags are required to allow aliasing of this image as a tensor. + .with_usage(VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TENSOR_ALIASING_BIT_ARM) + .with_vma_flags(VMA_ALLOCATION_CREATE_CAN_ALIAS_BIT) + .with_debug_name("InputImage") + .build(get_device().get_device())); +} + +#endif + +/** + * Creates an image to use as the output of the neural network. + * This will be aliased as the output tensor, so needs some special flags. 
+ */ +void TensorImageAliasing::prepare_output_image() +{ + output_image = std::make_unique(vkb::core::ImageBuilder(scene_render_target->get_extent().width, scene_render_target->get_extent().height) + .with_format(VK_FORMAT_R8G8B8A8_UNORM) + // Extra flags are required to allow aliasing of this image as a tensor. + .with_usage(VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TENSOR_ALIASING_BIT_ARM) + .with_vma_usage(VMA_MEMORY_USAGE_GPU_ONLY) + .with_vma_flags(VMA_ALLOCATION_CREATE_CAN_ALIAS_BIT) + .with_debug_name("OutputImage") + .build(get_device().get_device())); + output_image_view = std::make_unique(*output_image, VK_IMAGE_VIEW_TYPE_2D); +} + +/* + * Creates the Tensor used as input to the neural network, aliasing the same memory as the colour + * attachment which the scene is rendered into. + * Also creates a Tensor View (analogous to an Image View). + */ +void TensorImageAliasing::prepare_input_tensor() +{ +#if TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + const vkb::core::Image &image_to_alias = scene_render_target->get_views().at(0).get_image(); +#else + const vkb::core::Image &image_to_alias = *input_image; +#endif + + input_tensor = std::make_unique(get_device(), + TensorBuilder({1, image_to_alias.get_extent().height, image_to_alias.get_extent().width, 4}) + .with_usage(VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM | VK_TENSOR_USAGE_IMAGE_ALIASING_BIT_ARM) + .with_format(VK_FORMAT_R8_SINT) + .with_tiling(VK_TENSOR_TILING_OPTIMAL_ARM), + image_to_alias.get_memory(), image_to_alias.get_memory_offset()); + input_tensor_view = std::make_unique(*input_tensor); +} + +/* + * Creates the Tensor used as output of the neural network, aliasing the same memory as the network_output_image, + * which will be blitted to the screen. + * Also creates a Tensor Views (analogous to an Image View). 
+ */ +void TensorImageAliasing::prepare_output_tensor() +{ + const vkb::core::Image &image_to_alias = *output_image; + + output_tensor = std::make_unique(get_device(), + TensorBuilder({1, image_to_alias.get_extent().height, image_to_alias.get_extent().width, 4}) + .with_usage(VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM | VK_TENSOR_USAGE_IMAGE_ALIASING_BIT_ARM) + .with_format(VK_FORMAT_R8_SINT) + .with_tiling(VK_TENSOR_TILING_OPTIMAL_ARM), + image_to_alias.get_memory(), image_to_alias.get_memory_offset()); + output_tensor_view = std::make_unique(*output_tensor); +} + +/* + * Creates a descriptor pool which can be used to allocate descriptors for tensor bindings. + * Note we can't use vkb::DescriptorPool because it doesn't know about tensors. + */ +void TensorImageAliasing::prepare_descriptor_pool() +{ + std::vector descriptor_pool_sizes = { + {VK_DESCRIPTOR_TYPE_TENSOR_ARM, 10}, // Fairly arbitrary count + }; + + VkDescriptorPoolCreateInfo create_info{VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO}; + create_info.poolSizeCount = descriptor_pool_sizes.size(); + create_info.pPoolSizes = descriptor_pool_sizes.data(); + create_info.maxSets = 10; // Fairly arbitrary + create_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + + VK_CHECK(vkCreateDescriptorPool(get_device().get_handle(), &create_info, nullptr, &descriptor_pool)); +} + +/* + * Creates the constant weights tensor used in the convolution operator. 
+ */ +void TensorImageAliasing::prepare_weights_tensor() +{ + weights_constant_tensor = std::make_unique>(); + + // Weights are in a [OC,KH,KW,IC] shape: + // OC = Output channels + // KH = Kernel height + // KW = Kernel width + // IC = Input channels + // In this case we are using a simple edge-enhancement filter on each of the three colour channels + weights_constant_tensor->dimensions = {4, 3, 3, 4}; + weights_constant_tensor->constant_data.resize(4 * 3 * 3 * 4); + MultidimensionalArrayView array_view( + weights_constant_tensor->constant_data.data(), weights_constant_tensor->dimensions); + + for (int i = 0; i < 4; ++i) + { + // First row of the 3x3 kernel + array_view[{i, 0, 0, i}] = 0; + array_view[{i, 0, 1, i}] = -1; + array_view[{i, 0, 2, i}] = 0; + + // Middle row of the 3x3 kernel + array_view[{i, 1, 0, i}] = -1; + array_view[{i, 1, 1, i}] = 1 + 4; + array_view[{i, 1, 2, i}] = -1; + + // Last row of the 3x3 kernel + array_view[{i, 2, 0, i}] = 0; + array_view[{i, 2, 1, i}] = -1; + array_view[{i, 2, 2, i}] = 0; + } + + weights_constant_tensor->tensor_description = { + VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, + nullptr, + VK_TENSOR_TILING_LINEAR_ARM, + VK_FORMAT_R8_SINT, + 4, // dimensions + weights_constant_tensor->dimensions.data(), + nullptr, // pStrides + VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM, + }; + + weights_constant_tensor->pipeline_constant = { + VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CONSTANT_ARM, + &weights_constant_tensor->tensor_description, + 0, // Matches the unique identifier encoded in OpGraphConstantARM in the SPIR-V module + weights_constant_tensor->constant_data.data() // Host pointer to raw data + }; +} + +/* + * Creates the Pipeline Layout, a Data Graph Pipeline and a Data Graph Pipeline Session used to run the neural network. + */ +void TensorImageAliasing::prepare_data_graph_pipeline() +{ + // Create the Pipeline Layout. + // The neural network has its input tensor on binding 0 and its output tensor at binding 1. 
+ std::set tensor_bindings = {0, 1}; + data_graph_pipeline_layout = std::make_unique(get_device(), tensor_bindings); + + // Create a Pipeline from the layout. + std::map> tensor_descriptions; + // All bindings are in set 0 + tensor_descriptions[0] = + { + // Binding 0 is the input tensor + {0, &input_tensor->get_description()}, + // Binding 1 is the output tensor + {1, &output_tensor->get_description()}}; + + // Add weights constant tensor, which was prepared and stored earlier. + std::vector data_graph_pipeline_constants; + data_graph_pipeline_constants.push_back(&weights_constant_tensor->pipeline_constant); + + VkShaderModule shader_module = vkb::load_shader("tensor_and_data_graph/tensor_image_aliasing/spirv/conv2d_int8.spvasm.spv", get_device().get_handle(), VK_SHADER_STAGE_ALL); + + data_graph_pipeline = + std::make_unique(get_device(), + data_graph_pipeline_layout->get_handle(), + shader_module, + "main", + tensor_descriptions, + data_graph_pipeline_constants); + + // Create a Pipeline Session for the Pipeline + VmaAllocationCreateInfo alloc_create_info = {}; + data_graph_pipeline_session = std::make_unique(get_device(), data_graph_pipeline->get_handle(), alloc_create_info); +} + +/* + * Allocates and fills in a Descriptor Set to provide bindings to the Data Graph Pipeline. 
+ */ +void TensorImageAliasing::prepare_data_graph_pipeline_descriptor_set() +{ + // Allocate descriptor set using the layout of the Data Graph Pipeline + VkDescriptorSetAllocateInfo alloc_info = {VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO}; + alloc_info.descriptorPool = descriptor_pool; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &data_graph_pipeline_layout->get_descriptor_set_layout(); + VK_CHECK(vkAllocateDescriptorSets(get_device().get_handle(), &alloc_info, &data_graph_pipeline_descriptor_set)); + + // Write bindings to it, telling it which tensors to use as input and output + std::map tensor_bindings = + { + // Binding 0 is the input tensor + {0, VkWriteDescriptorSetTensorARM{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, nullptr, 1, &input_tensor_view->get_handle()}}, + // Binding 1 is the output tensor + {1, VkWriteDescriptorSetTensorARM{VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, nullptr, 1, &output_tensor_view->get_handle()}}}; + write_descriptor_set(get_device().get_handle(), data_graph_pipeline_descriptor_set, {}, tensor_bindings); +} + +void TensorImageAliasing::draw_renderpass(vkb::core::CommandBufferC &command_buffer, vkb::rendering::RenderTargetC &render_target) +{ + if (!enable_neural_network) + { + // If the neural network is disabled, use the default behaviour which is to render + // the scene directly to the default render target (the swapchain) + vkb::VulkanSampleC::draw_renderpass(command_buffer, render_target); + return; + } + + // When using the neural network, render the scene into the separate render target + uint32_t render_width = scene_render_target->get_extent().width; + uint32_t render_height = scene_render_target->get_extent().height; + + command_buffer.set_viewport(0, {{0.0f, 0.0f, static_cast(render_width), static_cast(render_height), 0.0f, 1.0f}}); + command_buffer.set_scissor(0, {{0, 0, render_width, render_height}}); + + // Barriers and layout transitions to get the render target's attachments 
ready for rendering + { + const VkImageMemoryBarrier2 imageBarriers[2] = { + // Colour attachment + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, +#if TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + // When rendering to an aliased tensor, the render target image would have previously been used as the input to the data graph pipeline + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // srcStageMask + VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // srcAccessMask +#else + // When rendering to a separate image, the render target image would have previously been used as a transfer source + VK_PIPELINE_STAGE_TRANSFER_BIT, // srcStageMask + VK_ACCESS_2_TRANSFER_READ_BIT, // srcAccessMask +#endif + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // dstStageMask + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(0).get_image().get_handle(), // image + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}, + // Depth attachment + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // The depth attachment would have last been used in the previous frame's rendering + VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT, // srcStageMask + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, // srcAccessMask + VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT, // dstStageMask + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(1).get_image().get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_DEPTH_BIT, 0, 1, 0, 1}}}; + + VkDependencyInfo dependencyInfo = 
{VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 2; + dependencyInfo.pImageMemoryBarriers = &imageBarriers[0]; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); + } + + // Render the scene into scene_render_target + get_render_pipeline().draw(command_buffer, *scene_render_target); + command_buffer.end_render_pass(); + +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + // Barriers and layout transitions for copying the rendered scene into input_image + // (We only do this if we are not rendering directly to the aliased tensor) + { + const VkImageMemoryBarrier2 imageBarriers[2] = { + // Source image - the color image from the scene_render_target + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, // srcAccessMask + VK_PIPELINE_STAGE_TRANSFER_BIT, // dstStageMask + VK_ACCESS_2_TRANSFER_READ_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, // oldLayout + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(0).get_image().get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}, + // Destination image - the input_image for the neural network + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // srcStageMask + VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // srcAccessMask + VK_PIPELINE_STAGE_TRANSFER_BIT, // dstStageMask + VK_ACCESS_2_TRANSFER_WRITE_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + input_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}}; + + VkDependencyInfo dependencyInfo = {VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 2; + 
dependencyInfo.pImageMemoryBarriers = &imageBarriers[0]; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); + } + + // Copy the rendered scene into input_image + // (We only do this if we are not rendering directly to the aliased tensor) + { + VkImageCopy image_copy; + image_copy.srcSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + image_copy.dstOffset = {0, 0, 0}; + image_copy.dstSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + image_copy.extent = {render_width, render_height, 1}; + image_copy.srcOffset = {0, 0, 0}; + command_buffer.copy_image(scene_render_target->get_views().at(0).get_image(), *input_image, {image_copy}); + } +#endif + + // Barriers and layout transitions for network inputs and outputs to be used in data graph pipeline execution + { + { + const VkImageMemoryBarrier2 imageBarriers[2] = { +#if TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + // Input tensor (which is aliased as the scene_render_target) + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Previously was rendered to as a color attachment + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // srcStageMask + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, // srcAccessMask + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // dstStageMask + VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // dstAccessMask + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, // oldLayout + + // Transition to the special layout for tensor aliasing + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + scene_render_target->get_views().at(0).get_image().get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}, +#else + // Input tensor (which is aliased as input_image) + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Previously was copied to + VK_PIPELINE_STAGE_TRANSFER_BIT, // srcStageMask + VK_ACCESS_2_TRANSFER_WRITE_BIT, // srcAccessMask + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // dstStageMask + 
VK_ACCESS_2_DATA_GRAPH_READ_BIT_ARM, // dstAccessMask + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, // oldLayout + + // Transition to the special layout for tensor aliasing + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + input_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}, +#endif + // Output tensor (which is aliased as output_image) + { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Previously was read by the blit shader + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // srcStageMask + VK_ACCESS_2_SHADER_READ_BIT, // srcAccessMask + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // dstStageMask + VK_ACCESS_2_DATA_GRAPH_WRITE_BIT_ARM, // dstAccessMask + VK_IMAGE_LAYOUT_UNDEFINED, // oldLayout + + // Transition to the special layout for tensor aliasing + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + output_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}} + }; + + VkDependencyInfo dependencyInfo = {VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 2; + dependencyInfo.pImageMemoryBarriers = &imageBarriers[0]; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); + } + } + + // Bind and run data graph pipeline. 
+ vkCmdBindPipeline(command_buffer.get_handle(), VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, data_graph_pipeline->get_handle()); + vkCmdBindDescriptorSets(command_buffer.get_handle(), VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, data_graph_pipeline_layout->get_handle(), + 0, 1, &data_graph_pipeline_descriptor_set, 0, nullptr); + vkCmdDispatchDataGraphARM(command_buffer.get_handle(), data_graph_pipeline_session->get_handle(), VK_NULL_HANDLE); + + // Barrier and layout transition for output_image to be a shader input + { + const VkImageMemoryBarrier2 imageBarrier = { + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + nullptr, + // Was previously written to by the data graph pipeline + VK_PIPELINE_STAGE_2_DATA_GRAPH_BIT_ARM, // srcStageMask + VK_ACCESS_2_DATA_GRAPH_WRITE_BIT_ARM, // srcAccessMask + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // dstStageMask + VK_ACCESS_2_SHADER_READ_BIT, // dstAccessMask + VK_IMAGE_LAYOUT_TENSOR_ALIASING_ARM, // oldLayout + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, // newLayout + 0, // srcQueueFamilyIndex + 0, // dstQueueFamilyIndex + output_image->get_handle(), + VkImageSubresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}}; + + VkDependencyInfo dependencyInfo = {VK_STRUCTURE_TYPE_DEPENDENCY_INFO}; + dependencyInfo.imageMemoryBarrierCount = 1; + dependencyInfo.pImageMemoryBarriers = &imageBarrier; + vkCmdPipelineBarrier2(command_buffer.get_handle(), &dependencyInfo); + } + + // Blit output_image to the screen and draw the GUI + uint32_t screen_width = render_target.get_extent().width; + uint32_t screen_height = render_target.get_extent().height; + command_buffer.set_viewport(0, {{0.0f, 0.0f, static_cast(screen_width), static_cast(screen_height), 0.0f, 1.0f}}); + command_buffer.set_scissor(0, {{0, 0, screen_width, screen_height}}); + + blit_pipeline->draw(command_buffer, render_target); + + get_gui().draw(command_buffer); + + command_buffer.end_render_pass(); +} + +void TensorImageAliasing::draw_gui() +{ + // Define a checkbox to toggle the neural 
network on and off, so that you can see the effect of the edge enhancement network. + get_gui().show_options_window( + [this]() { + ImGui::Checkbox("Enable Neural Network", &enable_neural_network); + }, + 1); +} + +std::unique_ptr create_tensor_image_aliasing() +{ + return std::make_unique(); +} diff --git a/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/tensor_image_aliasing.h b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/tensor_image_aliasing.h new file mode 100644 index 0000000000..5c8ceb2b3c --- /dev/null +++ b/samples/extensions/tensor_and_data_graph/tensor_image_aliasing/tensor_image_aliasing.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2024-2026, Arm Limited and Contributors + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "rendering/render_pipeline.h" + +#include "../tensor_and_data_graph_common.h" +#include "vulkan_sample.h" + +/** + * @brief Demonstrates how to use memory aliasing so that tensors and images use the same memory. + * This enables the mixing of traditional compute and graphics operations with ML operations + * with no overhead from copying or format conversions. + * @details A 3D scene is rendered (using the existing Vulkan Sample framework) to an offscreen Render Target (`scene_render_target`), + * whose colour attachment is aliased to the same memory as a Tensor (`input_tensor`). 
This Tensor is then used as the input + * to a Data Graph Pipeline (`data_graph_pipeline`), which implements a simple sharpening filter using a convolution layer. + * The output of this Data Graph Pipeline is written to another Tensor (`output_tensor`), which is aliased to the same memory + * as an Image (`output_image`), which is then used to blit the results to the Swapchain. + * + * As a diagram, this looks like: + * + * scene rendering -> scene_render_target output_image -> blit -> swapchain + * || || + * input_tensor -> data_graph_pipeline -> output_tensor + * + * Because the common Vulkan Samples framework code is not aware of the Tensor resource type or Data Graph Pipelines, + * generic functionality for these concepts has been added to a new tensor_and_data_graph_common.h/cpp file, which this sample + * (and other tensor and data graph samples) makes use of. + */ +class TensorImageAliasing : public vkb::VulkanSampleC +{ + public: + TensorImageAliasing(); + + ~TensorImageAliasing() override; + + void request_gpu_features(vkb::core::PhysicalDeviceC &gpu) override; + + bool prepare(const vkb::ApplicationOptions &options) override; + + void draw_renderpass(vkb::core::CommandBufferC &command_buffer, vkb::rendering::RenderTargetC &render_target) override; + + void draw_gui() override; + + private: + // from vkb::VulkanSample + uint32_t get_api_version() const override; + + private: + void prepare_scene_render_target(uint32_t width, uint32_t height); + + // Determines if this sample will render directly to the (aliased) input tensor, otherwise it will render + // to a separate, dedicated image which is then copied to the input tensor. + // + // Although rendering to an image aliased as a texture is a perfectly valid (and encouraged!) use of + // the extension APIs, the current implementation of the emulation layers do not have good support for this + // kind of use and so the additional copy is a temporary workaround. 
+ // As the emulation layers are currently the only way to run this sample, the default behaviour + // is to use this workaround so that the sample produces the expected output. The more faithful + // and performant path of rendering directly to the aliased tensor is still present and instructive, but + // cannot be executed reliably yet. + // + // Note: it is still possible to run and validate the 'render to aliased tensor' path using the emulation + // layers but it requires some small changes to remove the depth attachment from the scene render target. + // The z-order will clearly be broken, but the aliasing will work as expected. +#define TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE 0 + +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + void prepare_input_image(); +#endif + void prepare_output_image(); + + void prepare_input_tensor(); + void prepare_output_tensor(); + + void prepare_descriptor_pool(); + + void prepare_weights_tensor(); + + void prepare_data_graph_pipeline(); + + void prepare_data_graph_pipeline_descriptor_set(); + + bool enable_neural_network = true; + + std::unique_ptr scene_render_target; + +#if !TENSOR_IMAGE_ALIASING_RENDER_TO_ALIASED_IMAGE + std::unique_ptr input_image; +#endif + + std::unique_ptr input_tensor; + std::unique_ptr input_tensor_view; + + std::unique_ptr output_image; + std::unique_ptr output_image_view; + + std::unique_ptr output_tensor; + std::unique_ptr output_tensor_view; + + std::unique_ptr blit_pipeline; + + VkDescriptorPool descriptor_pool = VK_NULL_HANDLE; + + std::unique_ptr> weights_constant_tensor; + + std::unique_ptr data_graph_pipeline_layout; + std::unique_ptr data_graph_pipeline; + std::unique_ptr data_graph_pipeline_session; + + VkDescriptorSet data_graph_pipeline_descriptor_set = VK_NULL_HANDLE; +}; + +std::unique_ptr create_tensor_image_aliasing(); diff --git a/shaders/tensor_and_data_graph/postprocessing_with_vgf/vgf/simple_conv2d_rescale_graph.vgf 
b/shaders/tensor_and_data_graph/postprocessing_with_vgf/vgf/simple_conv2d_rescale_graph.vgf new file mode 100644 index 0000000000..262f3df5cf Binary files /dev/null and b/shaders/tensor_and_data_graph/postprocessing_with_vgf/vgf/simple_conv2d_rescale_graph.vgf differ diff --git a/shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm b/shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm index c7c9124543..832817c8b5 100644 --- a/shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm +++ b/shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm @@ -1,4 +1,4 @@ -; Copyright (c) 2024-2025, Arm Limited and Contributors +; Copyright (c) 2024-2026, Arm Limited and Contributors ; ; SPDX-License-Identifier: Apache-2.0 ; @@ -68,7 +68,7 @@ ; Parameters for pooling layer %avg_pool_kernel = OpConstantComposite %uint_tensor_r1_s2 %uint_3 %uint_3 %avg_pool_stride = OpConstantComposite %uint_tensor_r1_s2 %uint_2 %uint_2 - %avg_pool_pad = OpConstantComposite %uint_tensor_r1_s4 %uint_0 %uint_0 %uint_0 %uint_0 + %avg_pool_pad = OpConstantComposite %uint_tensor_r1_s4 %uint_1 %uint_0 %uint_1 %uint_0 %avg_pool_input_zero_point = OpConstantComposite %float_tensor_r1_s1 %float_0 %avg_pool_output_zero_point = OpConstantComposite %float_tensor_r1_s1 %float_0 @@ -81,6 +81,6 @@ OpGraphEntryPointARM %graph "main" %input_tensor_ptr %output_tensor_ptr %graph = OpGraphARM %graph_type %in = OpGraphInputARM %float_tensor_r4_s_1_ih_iw_3 %uint_0 - %out = OpExtInst %float_tensor_r4_s_1_oh_ow_3 %tosa AVG_POOL2D %avg_pool_kernel %avg_pool_stride %avg_pool_pad %uint_2 %in %avg_pool_input_zero_point %avg_pool_output_zero_point + %out = OpExtInst %float_tensor_r4_s_1_oh_ow_3 %tosa AVG_POOL2D %avg_pool_kernel %avg_pool_stride %avg_pool_pad %uint_3 %in %avg_pool_input_zero_point %avg_pool_output_zero_point OpGraphSetOutputARM %out %uint_0 OpGraphEndARM diff --git 
a/shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm.spv b/shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm.spv index 9e01041da8..dd397194e5 100644 Binary files a/shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm.spv and b/shaders/tensor_and_data_graph/simple_tensor_and_data_graph/spirv/pooling.spvasm.spv differ diff --git a/shaders/tensor_and_data_graph/spirv/conv2d.spvasm b/shaders/tensor_and_data_graph/spirv/conv2d.spvasm index f1adb0c53e..754b25d543 100644 --- a/shaders/tensor_and_data_graph/spirv/conv2d.spvasm +++ b/shaders/tensor_and_data_graph/spirv/conv2d.spvasm @@ -1,4 +1,4 @@ -; Copyright (c) 2025, Arm Limited and Contributors +; Copyright (c) 2025-2026, Arm Limited and Contributors ; ; SPDX-License-Identifier: Apache-2.0 ; @@ -84,6 +84,6 @@ OpGraphEntryPointARM %graph "main" %input_tensor_ptr %output_tensor_ptr %graph = OpGraphARM %graph_type %in = OpGraphInputARM %float_tensor_r4_s_1_h_w_3 %uint_0 - %out = OpExtInst %float_tensor_r4_s_1_h_w_3 %tosa CONV2D %conv_pad %conv_stride %conv_dilation %uint_2 %false %in %conv_weights %conv_biases %conv_input_zero_point %conv_weight_zero_point + %out = OpExtInst %float_tensor_r4_s_1_h_w_3 %tosa CONV2D %conv_pad %conv_stride %conv_dilation %uint_3 %false %in %conv_weights %conv_biases %conv_input_zero_point %conv_weight_zero_point OpGraphSetOutputARM %out %uint_0 OpGraphEndARM \ No newline at end of file diff --git a/shaders/tensor_and_data_graph/spirv/conv2d.spvasm.spv b/shaders/tensor_and_data_graph/spirv/conv2d.spvasm.spv index e38cff3434..7f1374ac27 100644 Binary files a/shaders/tensor_and_data_graph/spirv/conv2d.spvasm.spv and b/shaders/tensor_and_data_graph/spirv/conv2d.spvasm.spv differ diff --git a/shaders/tensor_and_data_graph/tensor_image_aliasing/spirv/conv2d_int8.spvasm b/shaders/tensor_and_data_graph/tensor_image_aliasing/spirv/conv2d_int8.spvasm new file mode 100644 index 0000000000..f96bb4dc92 --- /dev/null +++ 
b/shaders/tensor_and_data_graph/tensor_image_aliasing/spirv/conv2d_int8.spvasm @@ -0,0 +1,120 @@ +; Copyright (c) 2024-2026, Arm Limited and Contributors +; +; SPDX-License-Identifier: Apache-2.0 +; +; Licensed under the Apache License, Version 2.0 the "License"; +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. + +; A simple neural network with a conv2d layer, expressed in SPIR-V assembly. + + ; Boilerplate + OpCapability Int8 + OpCapability Int16 + OpCapability Shader + OpCapability TensorsARM + OpCapability GraphARM + OpExtension "SPV_ARM_tensors" + OpExtension "SPV_ARM_graph" + + ; Import TOSA instructions for neural network operations + %tosa = OpExtInstImport "TOSA.001000.1" + + ; More boilerplate + OpMemoryModel Logical GLSL450 + + ; Descriptor set interface + OpDecorate %input_tensor_ptr DescriptorSet 0 + OpDecorate %input_tensor_ptr Binding 0 + OpDecorate %output_tensor_ptr DescriptorSet 0 + OpDecorate %output_tensor_ptr Binding 1 + + ; Types and constants + %bool = OpTypeBool + %false = OpConstantFalse %bool + %true = OpConstantTrue %bool + %uint = OpTypeInt 32 0 + %uint8 = OpTypeInt 8 0 + %uint16 = OpTypeInt 16 0 + %uint_0 = OpConstant %uint 0 + %uint_1 = OpConstant %uint 1 + %uint_2 = OpConstant %uint 2 + %uint_3 = OpConstant %uint 3 + %uint_4 = OpConstant %uint 4 + %height = OpConstant %uint 720 + %width = OpConstant %uint 1280 + %uint_array_l1 = OpTypeArray %uint %uint_1 + %uint_array_l4 = OpTypeArray %uint %uint_4 + %uint_array_l1_1 = OpConstantComposite %uint_array_l1 %uint_1 + %uint_array_l1_2 = OpConstantComposite 
%uint_array_l1 %uint_2 + %uint_array_l1_4 = OpConstantComposite %uint_array_l1 %uint_4 + %uint_array_l4_4_3_3_4 = OpConstantComposite %uint_array_l4 %uint_4 %uint_3 %uint_3 %uint_4 + %uint_array_l4_1_h_w_4 = OpConstantComposite %uint_array_l4 %uint_1 %height %width %uint_4 + %uint_tensor_r1_s1 = OpTypeTensorARM %uint %uint_1 %uint_array_l1_1 + %uint_tensor_r1_s2 = OpTypeTensorARM %uint %uint_1 %uint_array_l1_2 + %uint_tensor_r1_s4 = OpTypeTensorARM %uint %uint_1 %uint_array_l1_4 + %uint8_tensor_r1_s1 = OpTypeTensorARM %uint8 %uint_1 %uint_array_l1_1 + %uint8_tensor_r4_s_4_3_3_4 = OpTypeTensorARM %uint8 %uint_4 %uint_array_l4_4_3_3_4 + %uint8_tensor_r4_s_1_h_w_4 = OpTypeTensorARM %uint8 %uint_4 %uint_array_l4_1_h_w_4 + %uint16_tensor_r4_s_1_h_w_4 = OpTypeTensorARM %uint16 %uint_4 %uint_array_l4_1_h_w_4 + %uint_tensor_r4_s_1_h_w_4 = OpTypeTensorARM %uint %uint_4 %uint_array_l4_1_h_w_4 + %ptr_uint8_tensor_r4_s_1_h_w_4 = OpTypePointer UniformConstant %uint8_tensor_r4_s_1_h_w_4 + + ; Parameters for first rescale layer + %rescale0_input_unsigned = OpConstantTrue %bool + %rescale0_output_unsigned = OpConstantFalse %bool + %rescale0_multiplier_value = OpConstant %uint 4 + %rescale0_multiplier = OpConstantComposite %uint_tensor_r1_s1 %rescale0_multiplier_value + %rescale0_shift_value = OpConstant %uint8 2 + %rescale0_shift = OpConstantComposite %uint8_tensor_r1_s1 %rescale0_shift_value + %rescale0_input_zero_point_value = OpConstant %uint8 0 + %rescale0_input_zero_point = OpConstantComposite %uint8_tensor_r1_s1 %rescale0_input_zero_point_value + %rescale0_output_zero_point_value = OpConstant %uint8 128 + %rescale0_output_zero_point = OpConstantComposite %uint8_tensor_r1_s1 %rescale0_output_zero_point_value + + ; Parameters for conv2d layer + %conv_pad = OpConstantComposite %uint_tensor_r1_s4 %uint_1 %uint_1 %uint_1 %uint_1 + %conv_stride = OpConstantComposite %uint_tensor_r1_s2 %uint_1 %uint_1 + %conv_dilation = OpConstantComposite %uint_tensor_r1_s2 %uint_1 %uint_1 + 
%conv_weights = OpGraphConstantARM %uint8_tensor_r4_s_4_3_3_4 0 + %conv_bias = OpConstantComposite %uint_tensor_r1_s4 %uint_0 %uint_0 %uint_0 %uint_0 + %conv_input_zero_point_value = OpConstant %uint8 128 + %conv_input_zero_point = OpConstantComposite %uint8_tensor_r1_s1 %conv_input_zero_point_value + %conv_weights_zero_point_value = OpConstant %uint8 0 + %conv_weights_zero_point = OpConstantComposite %uint8_tensor_r1_s1 %conv_weights_zero_point_value + + ; Parameters for second rescale layer + %rescale1_input_unsigned = OpConstantFalse %bool + %rescale1_output_unsigned = OpConstantTrue %bool + %rescale1_multiplier_value = OpConstant %uint 4 + %rescale1_multiplier = OpConstantComposite %uint_tensor_r1_s1 %rescale1_multiplier_value + %rescale1_shift_value = OpConstant %uint8 2 + %rescale1_shift = OpConstantComposite %uint8_tensor_r1_s1 %rescale1_shift_value + %rescale1_input_zero_point_value = OpConstant %uint 0 + %rescale1_input_zero_point = OpConstantComposite %uint_tensor_r1_s1 %rescale1_input_zero_point_value + %rescale1_output_zero_point_value = OpConstant %uint8 0 + %rescale1_output_zero_point = OpConstantComposite %uint8_tensor_r1_s1 %rescale1_output_zero_point_value + + ; Graph type and interface variables + %graph_type = OpTypeGraphARM 1 %uint8_tensor_r4_s_1_h_w_4 %uint8_tensor_r4_s_1_h_w_4 + %input_tensor_ptr = OpVariable %ptr_uint8_tensor_r4_s_1_h_w_4 UniformConstant + %output_tensor_ptr = OpVariable %ptr_uint8_tensor_r4_s_1_h_w_4 UniformConstant + + ; Neural network layers defined as a graph + OpGraphEntryPointARM %graph "main" %input_tensor_ptr %output_tensor_ptr + %graph = OpGraphARM %graph_type + %in = OpGraphInputARM %uint8_tensor_r4_s_1_h_w_4 %uint_0 + %in_signed = OpExtInst %uint8_tensor_r4_s_1_h_w_4 %tosa RESCALE %true %uint_0 %false %rescale0_input_unsigned %rescale0_output_unsigned %in %rescale0_multiplier %rescale0_shift %rescale0_input_zero_point %rescale0_output_zero_point + %out_signed = OpExtInst %uint_tensor_r4_s_1_h_w_4 %tosa CONV2D 
%conv_pad %conv_stride %conv_dilation %uint_1 %false %in_signed %conv_weights %conv_bias %conv_input_zero_point %conv_weights_zero_point + %out_signed16 = OpExtInst %uint16_tensor_r4_s_1_h_w_4 %tosa CAST %out_signed + %out_unsigned = OpExtInst %uint8_tensor_r4_s_1_h_w_4 %tosa RESCALE %true %uint_0 %false %rescale1_input_unsigned %rescale1_output_unsigned %out_signed16 %rescale1_multiplier %rescale1_shift %rescale1_input_zero_point %rescale1_output_zero_point + OpGraphSetOutputARM %out_unsigned %uint_0 + OpGraphEndARM \ No newline at end of file diff --git a/shaders/tensor_and_data_graph/tensor_image_aliasing/spirv/conv2d_int8.spvasm.spv b/shaders/tensor_and_data_graph/tensor_image_aliasing/spirv/conv2d_int8.spvasm.spv new file mode 100644 index 0000000000..3495065e3f Binary files /dev/null and b/shaders/tensor_and_data_graph/tensor_image_aliasing/spirv/conv2d_int8.spvasm.spv differ diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 4a769b4fb0..f2098b531c 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2025, Arm Limited and Contributors +# Copyright (c) 2019-2026, Arm Limited and Contributors # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -385,3 +385,17 @@ if (VKB_PROFILING) add_compile_definitions(TRACY_NO_INVARIANT_CHECK=1) set_property(TARGET TracyClient PROPERTY FOLDER "ThirdParty") endif() + +# ai-ml-sdk-vgf-library +if(NOT IOS) + set(FLATBUFFERS_PATH "${CMAKE_SOURCE_DIR}/third_party/flatbuffers") + set(ML_SDK_VGF_LIB_BUILD_TOOLS OFF) + add_subdirectory(ai-ml-sdk-vgf-library) + + # The DEBUG define set in global_options.cmake clashes with an enum value defined in ai-ml-sdk-vgf-library/include/vgf/logging.hpp, + # work around this by disabling the DEBUG define for this target. 
+ target_compile_options(vgf PRIVATE -UDEBUG) + + set_target_properties(vgf PROPERTIES FOLDER "ThirdParty" POSITION_INDEPENDENT_CODE ON) +endif() + diff --git a/third_party/README.adoc b/third_party/README.adoc index 4e84726ffb..ad6a89249c 100644 --- a/third_party/README.adoc +++ b/third_party/README.adoc @@ -1,5 +1,5 @@ //// -- Copyright (c) 2019-2025, Arm Limited and Contributors +- Copyright (c) 2019-2026, Arm Limited and Contributors - - SPDX-License-Identifier: Apache-2.0 - @@ -41,3 +41,5 @@ This project has multiple third-party dependencies, each of which may have indep * https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator[vma]: Vulkan Memory Allocator * https://github.com/zeux/volk[volk]: Meta loader for Vulkan API * https://github.com/KhronosGroup/Vulkan-Docs[Vulkan]: Sources for the formal documentation of the Vulkan API +* https://github.com/google/flatbuffers[flatbuffers]: An efficient cross platform serialization library +* https://github.com/arm/ai-ml-sdk-vgf-library[ai-ml-sdk-vgf-library]: Library for encoding and decoding VGF files diff --git a/third_party/ai-ml-sdk-vgf-library b/third_party/ai-ml-sdk-vgf-library new file mode 160000 index 0000000000..5127d06a4f --- /dev/null +++ b/third_party/ai-ml-sdk-vgf-library @@ -0,0 +1 @@ +Subproject commit 5127d06a4f2cbb6699a748006ce52ee570278b7e diff --git a/third_party/flatbuffers b/third_party/flatbuffers new file mode 160000 index 0000000000..1c514626e8 --- /dev/null +++ b/third_party/flatbuffers @@ -0,0 +1 @@ +Subproject commit 1c514626e83c20fffa8557e75641848e1e15cd5e