Skip to content

Commit 13b7ddc

Browse files
authored
Merge branch 'main' into aot_pre
2 parents be42e3b + e95555a commit 13b7ddc

27 files changed

Lines changed: 1420 additions & 253 deletions

.ci/scripts/wheel/pre_build_script.sh

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,3 @@ fi
4444
# able to see the installed torch package.
4545

4646
"${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" --example
47-
48-
# Download Qualcomm QNN SDK on Linux x86_64 so the wheel build can include the
49-
# QNN backend. The SDK is large, so we download it here (outside CMake) rather
50-
# than during cmake configure.
51-
if [[ "$(uname -s)" == "Linux" && "$(uname -m)" == "x86_64" ]]; then
52-
echo "Downloading Qualcomm QNN SDK..."
53-
QNN_SDK_ROOT=$(python3 \
54-
"${GITHUB_WORKSPACE}/${REPOSITORY}/backends/qualcomm/scripts/download_qnn_sdk.py" \
55-
--print-sdk-path)
56-
export QNN_SDK_ROOT
57-
echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}" >> "${GITHUB_ENV}"
58-
echo "QNN SDK downloaded to ${QNN_SDK_ROOT}"
59-
fi

.ci/scripts/wheel/test_linux.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,10 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8-
import platform
9-
108
import test_base
119
from examples.models import Backend, Model
1210

1311
if __name__ == "__main__":
14-
# On Linux x86_64 the wheel is built with the Qualcomm backend.
15-
# Verify that it was registered correctly.
16-
if platform.system() == "Linux" and platform.machine() in ("x86_64", "amd64"):
17-
from executorch.extension.pybindings.portable_lib import (
18-
_get_registered_backend_names,
19-
)
20-
21-
registered = _get_registered_backend_names()
22-
assert (
23-
"QnnBackend" in registered
24-
), f"QnnBackend not found in registered backends: {registered}"
25-
print("✓ QnnBackend is registered")
26-
2712
test_base.run_tests(
2813
model_tests=[
2914
test_base.ModelTest(

.github/workflows/pull.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ jobs:
1616
test-qnn-wheel-packages-linux:
1717
name: test-qnn-wheel-packages-linux
1818
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
19+
if: false
1920
permissions:
2021
id-token: write
2122
contents: read

CMakePresets.json

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -290,13 +290,8 @@
290290
"name": "arm-ethosu-linux",
291291
"displayName": "Build ExecuTorch for Arm Ethos-U Linux",
292292
"inherits": ["common"],
293-
"description": "musl declares __assert_fail with int for line; avoid NDEBUG forward-decl mismatch in Release builds",
294293
"cacheVariables": {
295-
"EXECUTORCH_BUILD_ARM_ETHOSU_LINUX": "ON",
296-
"EXECUTORCH_BUILD_EXECUTOR_RUNNER": "ON",
297-
"EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",
298-
"CMAKE_C_FLAGS_RELEASE": "-UNDEBUG",
299-
"CMAKE_CXX_FLAGS_RELEASE": "-UNDEBUG",
294+
"EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/arm_ethosu_linux.cmake",
300295
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake"
301296
}
302297
}

backends/arm/README.md

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,27 +76,104 @@ The Arm backend can be built using the following command:
7676
./install_executorch.sh
7777
```
7878

79-
One of the following commands should also be run once to gather the necessary dependencies for your chosen target(s):
79+
**NOTE:** While developing, it can be convenient to use `./install_executorch.sh --editable`, which creates an editable installation of ExecuTorch.
8080

81-
For the Ethos-U target:
81+
### Target-specific setup and build
82+
83+
Pick one of the target flows below. Each flow has a one-time setup step and a build command.
84+
85+
### Baremetal (Ethos-U) workflow
86+
87+
Builds ExecuTorch runtime libraries for Cortex-M with Ethos-U acceleration.
88+
89+
Setup:
8290

8391
```
8492
./examples/arm/setup.sh --i-agree-to-the-contained-eula
8593
```
8694

87-
For the VGF target:
95+
Build:
96+
97+
```
98+
./backends/arm/scripts/build_executorch.sh
99+
```
100+
101+
### VGF (Vulkan ML extensions) workflow
102+
103+
Setup:
88104

89105
```
90106
./examples/arm/setup.sh --disable-ethos-u-deps --enable-mlsdk-deps
91107
```
92108

93-
For both Ethos-U & VGF targets:
109+
The current flow lowers to TOSA and converts to VGF for use in external projects,
110+
so the `executor_runner` is not typically used here.
111+
112+
### Direct Drive (experimental, Ethos-U85 on Linux) workflow
113+
114+
Direct Drive enables execution on Ethos-U85 via the Linux driver stack.
115+
116+
Driver stack (Linux) and API:
117+
118+
```
119+
https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-linux-driver-stack
120+
```
121+
122+
An FVP with Linux is available for Direct Drive, but it must be built and run
123+
manually. See:
94124

95125
```
96-
./examples/arm/setup.sh --i-agree-to-the-contained-eula --enable-mlsdk-deps
126+
https://corstone1000.docs.arm.com/en/corstone1000-2025.12/
97127
```
98128

99-
**NOTE:** While developing, it can be convenient to use `./install_executorch.sh --editable`, which creates an editable installation of ExecuTorch.
129+
Setup:
130+
131+
```
132+
./examples/arm/setup.sh --i-agree-to-the-contained-eula --target-toolchain linux-musl
133+
source ./examples/arm/arm-scratch/setup_path.sh
134+
```
135+
136+
Build:
137+
138+
```
139+
./backends/arm/scripts/build_executorch.sh \
140+
--toolchain=aarch64-linux-musl-gcc \
141+
--build_type=Debug
142+
```
143+
144+
Note: setup selects the linux-musl toolchain; build uses the aarch64-linux-musl GCC toolchain name.
145+
146+
If your Yocto image enables the dropbear SSH server, you can copy the
147+
`executor_runner` binary into the running FVP via scp:
148+
149+
```
150+
scp -P 2222 arm_test/cmake-out/executor_runner root@127.0.0.1:/tmp/
151+
```
152+
153+
#### Direct Drive model (PTE) workflow
154+
155+
Create a PTE file:
156+
157+
```
158+
python3 -m examples.arm.aot_arm_compiler \
159+
--model_name examples/arm/example_modules/add.py \
160+
--delegate \
161+
--quantize \
162+
--target ethos-u85-256 \
163+
--direct_drive
164+
```
165+
166+
Copy the `executor_runner` binary and the generated PTE file to the running FVP:
167+
168+
```
169+
scp -P 2222 arm_test/cmake-out/executor_runner add_arm_delegate_ethos-u85-256.pte root@127.0.0.1:/tmp/
170+
```
171+
172+
Run the model on the FVP:
173+
174+
```
175+
ssh -p 2222 root@127.0.0.1 -t "/tmp/executor_runner -model_path /tmp/add_arm_delegate_ethos-u85-256.pte -num_executions 1"
176+
```
100177

101178
## Testing
102179

backends/arm/runtime/EthosUBackend_Cortex_A.cpp

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -347,19 +347,13 @@ Error platform_execute(
347347
int output_count,
348348
Span<executorch::runtime::EValue*> args,
349349
char* /*ethosu_scratch*/) {
350-
std::vector<size_t> input_copy_sizes;
351-
std::vector<const char*> linux_input_ptrs;
352-
if (input_count > 0) {
353-
input_copy_sizes.resize(input_count, 0);
354-
linux_input_ptrs.resize(input_count, nullptr);
355-
}
350+
std::vector<size_t> input_copy_sizes(input_count, 0);
351+
std::vector<const char*> linux_input_ptrs(input_count, nullptr);
356352

357-
std::vector<size_t> output_io_bytes;
358-
std::vector<char*> linux_output_ptrs;
359-
if (output_count > 0) {
360-
output_io_bytes.resize(output_count, 0);
361-
linux_output_ptrs.resize(output_count, nullptr);
362-
}
353+
std::vector<size_t> output_io_bytes(output_count, 0);
354+
std::vector<char*> linux_output_ptrs(output_count, nullptr);
355+
std::vector<std::vector<char>> output_scratch_buffers(output_count);
356+
std::vector<bool> output_needs_adjustment(output_count, false);
363357

364358
for (int i = 0; i < input_count; ++i) {
365359
auto tensor_in = args[i]->toTensor();
@@ -380,16 +374,12 @@ Error platform_execute(
380374
const size_t tensor_nbytes = tensor_out.nbytes();
381375
if (i < static_cast<int>(output_io_bytes.size()) &&
382376
output_io_bytes[i] != tensor_nbytes) {
383-
ET_LOG(
384-
Error,
385-
"Ethos-U Linux backend output size mismatch for index %d: "
386-
"driver IO bytes = %zu, tensor bytes = %zu",
387-
i,
388-
output_io_bytes[i],
389-
tensor_nbytes);
390-
return Error::InvalidState;
377+
output_scratch_buffers[i].resize(output_io_bytes[i]);
378+
linux_output_ptrs[i] = output_scratch_buffers[i].data();
379+
output_needs_adjustment[i] = true;
380+
} else {
381+
linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
391382
}
392-
linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
393383
}
394384
}
395385

@@ -399,13 +389,37 @@ Error platform_execute(
399389
return Error::InvalidState;
400390
}
401391

402-
return invoke_linux_driver(
392+
Error status = invoke_linux_driver(
403393
handles,
404394
linux_input_ptrs,
405395
linux_output_ptrs,
406396
input_copy_sizes,
407397
output_io_bytes,
408398
state->options);
399+
if (status != Error::Ok) {
400+
return status;
401+
}
402+
403+
if (handles.outputs != nullptr) {
404+
for (int i = 0; i < output_count; ++i) {
405+
if (!output_needs_adjustment[i]) {
406+
continue;
407+
}
408+
auto tensor_out = args[input_count + i]->toTensor();
409+
const size_t tensor_nbytes = tensor_out.nbytes();
410+
Error adjust_status = copy_with_layout_adjustment(
411+
handles.outputs->io[i],
412+
i,
413+
output_scratch_buffers[i].data(),
414+
tensor_out,
415+
tensor_nbytes);
416+
if (adjust_status != Error::Ok) {
417+
return adjust_status;
418+
}
419+
}
420+
}
421+
422+
return Error::Ok;
409423
}
410424

411425
} // namespace arm

backends/qualcomm/CMakeLists.txt

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,9 @@ get_filename_component(
2323
_common_include_directories "${EXECUTORCH_SOURCE_DIR}/.." ABSOLUTE
2424
)
2525

26-
# If QNN_SDK_ROOT was not passed as a CMake variable, fall back to the
27-
# environment variable. Prefer downloading the SDK *outside* of CMake (e.g. via
28-
# backends/qualcomm/scripts/download_qnn_sdk.py) and passing the path in.
29-
if(NOT DEFINED QNN_SDK_ROOT AND DEFINED ENV{QNN_SDK_ROOT})
30-
set(QNN_SDK_ROOT
31-
$ENV{QNN_SDK_ROOT}
32-
CACHE PATH "Qualcomm SDK root directory" FORCE
33-
)
34-
endif()
35-
36-
# Last-resort fallback: download during cmake configure when building wheels and
37-
# QNN_SDK_ROOT was not provided externally.
38-
if(NOT DEFINED QNN_SDK_ROOT AND EXECUTORCH_BUILD_WHEEL_DO_NOT_USE)
26+
# We only download QNN SDK when we build pip wheel for ExecuTorch. Please don't
27+
# change this code unless you know what you are doing.
28+
if(EXECUTORCH_BUILD_WHEEL_DO_NOT_USE)
3929
set(_qnn_default_sdk_dir "${CMAKE_CURRENT_BINARY_DIR}/sdk/qnn")
4030

4131
if(EXISTS "${_qnn_default_sdk_dir}" AND EXISTS "${_qnn_default_sdk_dir}/lib")
@@ -45,7 +35,7 @@ if(NOT DEFINED QNN_SDK_ROOT AND EXECUTORCH_BUILD_WHEEL_DO_NOT_USE)
4535
CACHE PATH "Qualcomm SDK root directory" FORCE
4636
)
4737
else()
48-
message(STATUS "Downloading Qualcomm SDK (fallback)")
38+
message(STATUS "Downloading Qualcomm SDK")
4939
execute_process(
5040
COMMAND
5141
${PYTHON_EXECUTABLE}

0 commit comments

Comments (0)