pytorch
diff --git a/‎CMakePresets.json‎
Lines changed: 1 addition & 6 deletions b/‎CMakePresets.json‎
Lines changed: 1 addition & 6 deletions
diff --git a/‎backends/arm/README.md‎
Lines changed: 83 additions & 6 deletions b/‎backends/arm/README.md‎
Lines changed: 83 additions & 6 deletions
diff --git a/‎backends/arm/runtime/EthosUBackend_Cortex_A.cpp‎
Lines changed: 36 additions & 22 deletions b/‎backends/arm/runtime/EthosUBackend_Cortex_A.cpp‎
Lines changed: 36 additions & 22 deletions
diff --git a/‎examples/models/llama/export_llama_lib.py‎
Lines changed: 26 additions & 8 deletions b/‎examples/models/llama/export_llama_lib.py‎
Lines changed: 26 additions & 8 deletions
diff --git a/‎examples/models/llama/tests/test_export_llama_lib.py‎
Lines changed: 21 additions & 1 deletion b/‎examples/models/llama/tests/test_export_llama_lib.py‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎extension/llm/export/config/llm_config.py‎
Lines changed: 14 additions & 0 deletions b/‎extension/llm/export/config/llm_config.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎extension/llm/export/partitioner_lib.py‎
Lines changed: 9 additions & 0 deletions b/‎extension/llm/export/partitioner_lib.py‎
Lines changed: 9 additions & 0 deletions
@@ -290,13 +290,8 @@
       "name": "arm-ethosu-linux",
       "displayName": "Build ExecuTorch for Arm Ethos-U Linux",
       "inherits": ["common"],
-      "description": "musl declares __assert_fail with int for line; avoid NDEBUG forward-decl mismatch in Release builds",
       "cacheVariables": {
-        "EXECUTORCH_BUILD_ARM_ETHOSU_LINUX": "ON",
-        "EXECUTORCH_BUILD_EXECUTOR_RUNNER": "ON",
-        "EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",
-        "CMAKE_C_FLAGS_RELEASE": "-UNDEBUG",
-        "CMAKE_CXX_FLAGS_RELEASE": "-UNDEBUG",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/arm_ethosu_linux.cmake",
         "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake"
       }
     }
 
@@ -76,27 +76,104 @@ The Arm backend can be built using the following command:
 ./install_executorch.sh
 ```
 
-One of the following commands should also be run once to gather the necessary dependencies for your chosen target(s):
+**NOTE:** While developing, it can be convenient to use `./install_executorch.sh --editable`, which creates an editable installation of ExecuTorch.
 
-For the Ethos-U target:
+### Target-specific setup and build
+
+Pick one of the target flows below. Each flow has a one-time setup step and, where applicable, a build command.
+
+### Baremetal (Ethos-U) workflow
+
+Builds ExecuTorch runtime libraries for Cortex-M with Ethos-U acceleration.
+
+Setup:
 
 ```
 ./examples/arm/setup.sh --i-agree-to-the-contained-eula
 ```
 
-For the VGF target:
+Build:
+
+```
+./backends/arm/scripts/build_executorch.sh
+```
+
+### VGF (Vulkan ML extensions) workflow
+
+Setup:
 
 ```
 ./examples/arm/setup.sh --disable-ethos-u-deps --enable-mlsdk-deps
 ```
 
-For both Ethos-U & VGF targets:
+The current flow lowers to TOSA and converts to VGF for use in external projects,
+so the `executor_runner` is not typically used here.
+
+### Direct Drive (experimental, Ethos-U85 on Linux) workflow
+
+Direct Drive enables execution on Ethos-U85 via the Linux driver stack.
+
+Driver stack (Linux) and API:
+
+```
+https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-linux-driver-stack
+```
+
+An FVP with Linux is available for Direct Drive, but it must be built and run
+manually. See:
 
 ```
-./examples/arm/setup.sh --i-agree-to-the-contained-eula --enable-mlsdk-deps
+https://corstone1000.docs.arm.com/en/corstone1000-2025.12/
 ```
 
-**NOTE:** While developing, it can be convenient to use`./install_executorch.sh --editable`, which creates an editable installation of ExecuTorch.
+Setup:
+
+```
+./examples/arm/setup.sh --i-agree-to-the-contained-eula --target-toolchain linux-musl
+source ./examples/arm/arm-scratch/setup_path.sh
+```
+
+Build:
+
+```
+./backends/arm/scripts/build_executorch.sh \
+  --toolchain=aarch64-linux-musl-gcc \
+  --build_type=Debug
+```
+
+**NOTE:** The two scripts name the toolchain differently: `setup.sh` takes `--target-toolchain linux-musl`, while `build_executorch.sh` takes `--toolchain=aarch64-linux-musl-gcc`.
+
+If your Yocto image enables the dropbear SSH server, you can copy the
+`executor_runner` binary into the running FVP via scp:
+
+```
+scp -P 2222 arm_test/cmake-out/executor_runner root@127.0.0.1:/tmp/
+```
+
+#### Direct Drive model (PTE) workflow
+
+Create a PTE file:
+
+```
+python3 -m examples.arm.aot_arm_compiler \
+  --model_name examples/arm/example_modules/add.py \
+  --delegate \
+  --quantize \
+  --target ethos-u85-256 \
+  --direct_drive
+```
+
+Copy the `executor_runner` binary and the generated PTE file to the running FVP:
+
+```
+scp -P 2222 arm_test/cmake-out/executor_runner add_arm_delegate_ethos-u85-256.pte root@127.0.0.1:/tmp/
+```
+
+Run the model on the FVP:
+
+```
+ssh -p 2222 root@127.0.0.1 -t "/tmp/executor_runner -model_path /tmp/add_arm_delegate_ethos-u85-256.pte -num_executions 1"
+```
 
 ## Testing
 
 
@@ -347,19 +347,13 @@ Error platform_execute(
     int output_count,
     Span<executorch::runtime::EValue*> args,
     char* /*ethosu_scratch*/) {
-  std::vector<size_t> input_copy_sizes;
-  std::vector<const char*> linux_input_ptrs;
-  if (input_count > 0) {
-    input_copy_sizes.resize(input_count, 0);
-    linux_input_ptrs.resize(input_count, nullptr);
-  }
+  std::vector<size_t> input_copy_sizes(input_count, 0);
+  std::vector<const char*> linux_input_ptrs(input_count, nullptr);
 
-  std::vector<size_t> output_io_bytes;
-  std::vector<char*> linux_output_ptrs;
-  if (output_count > 0) {
-    output_io_bytes.resize(output_count, 0);
-    linux_output_ptrs.resize(output_count, nullptr);
-  }
+  std::vector<size_t> output_io_bytes(output_count, 0);
+  std::vector<char*> linux_output_ptrs(output_count, nullptr);
+  std::vector<std::vector<char>> output_scratch_buffers(output_count);
+  std::vector<bool> output_needs_adjustment(output_count, false);
 
   for (int i = 0; i < input_count; ++i) {
     auto tensor_in = args[i]->toTensor();
@@ -380,16 +374,12 @@ Error platform_execute(
       const size_t tensor_nbytes = tensor_out.nbytes();
       if (i < static_cast<int>(output_io_bytes.size()) &&
           output_io_bytes[i] != tensor_nbytes) {
-        ET_LOG(
-            Error,
-            "Ethos-U Linux backend output size mismatch for index %d: "
-            "driver IO bytes = %zu, tensor bytes = %zu",
-            i,
-            output_io_bytes[i],
-            tensor_nbytes);
-        return Error::InvalidState;
+        output_scratch_buffers[i].resize(output_io_bytes[i]);
+        linux_output_ptrs[i] = output_scratch_buffers[i].data();
+        output_needs_adjustment[i] = true;
+      } else {
+        linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
       }
-      linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
     }
   }
 
@@ -399,13 +389,37 @@ Error platform_execute(
     return Error::InvalidState;
   }
 
-  return invoke_linux_driver(
+  Error status = invoke_linux_driver(
       handles,
       linux_input_ptrs,
       linux_output_ptrs,
       input_copy_sizes,
       output_io_bytes,
       state->options);
+  if (status != Error::Ok) {
+    return status;
+  }
+
+  if (handles.outputs != nullptr) {
+    for (int i = 0; i < output_count; ++i) {
+      if (!output_needs_adjustment[i]) {
+        continue;
+      }
+      auto tensor_out = args[input_count + i]->toTensor();
+      const size_t tensor_nbytes = tensor_out.nbytes();
+      Error adjust_status = copy_with_layout_adjustment(
+          handles.outputs->io[i],
+          i,
+          output_scratch_buffers[i].data(),
+          tensor_out,
+          tensor_nbytes);
+      if (adjust_status != Error::Ok) {
+        return adjust_status;
+      }
+    }
+  }
+
+  return Error::Ok;
 }
 
 } // namespace arm
 
@@ -34,6 +34,7 @@
 from executorch.extension.llm.export.config.llm_config import LlmConfig
 from executorch.extension.llm.export.partitioner_lib import (
     get_coreml_partitioner,
+    get_ethosu_partitioner,
     get_mps_partitioner,
     get_openvino_partitioner,
     get_qnn_partitioner,
@@ -43,6 +44,7 @@
 )
 from executorch.extension.llm.export.quantizer_lib import (
     get_coreml_quantizer,
+    get_ethosu_quantizer,
     get_ov_quantizer,
     get_pt2e_quantization_params,
     get_pt2e_quantizers,
@@ -218,6 +220,7 @@ def build_args_parser() -> argparse.ArgumentParser:
             "coreml_baseline_8a_c4w",
             "vulkan_8w",
             "tosa_8a8w",
+            "ethosu_8a8w",
         ],
         help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.",
     )
@@ -813,6 +816,14 @@ def get_quantizer_and_quant_params(llm_config):
             llm_config.backend.tosa.version, llm_config.quantization.pt2e_quantize.value
         )
         quantizers.append(tosa_quantizer)
+    if llm_config.backend.ethosu.enabled and llm_config.quantization.pt2e_quantize:
+        ethosu_quantizer = get_ethosu_quantizer(
+            llm_config.backend.ethosu.target,
+            llm_config.backend.ethosu.system_config,
+            llm_config.backend.ethosu.memory_mode,
+            llm_config.quantization.pt2e_quantize.value,
+        )
+        quantizers.append(ethosu_quantizer)
     if llm_config.backend.vulkan.enabled and llm_config.quantization.pt2e_quantize:
         assert (
             len(quantizers) == 0
@@ -984,20 +995,27 @@ def _to_edge_and_lower_llama_openvino(
     return builder.to_executorch(passes=additional_passes)
 
 
-def _to_edge_and_lower_llama_tosa(
+def _to_edge_and_lower_llama_arm(
     builder_exported,
     modelname,
     quantizers,
     additional_passes,
-    tosa_spec,
+    llm_config: LlmConfig,
     verbose: bool = False,
 ) -> LLMEdgeManager:
     logging.info("Lowering model using TOSA partitioner")
 
     partitioners = []
-    partitioners.append(get_tosa_partitioner(tosa_spec))
-
-    modelname = f"tosa_{modelname}"
+    if llm_config.backend.ethosu.enabled:
+        partitioners.append(
+            get_ethosu_partitioner(
+                llm_config.backend.ethosu.target,
+            )
+        )
+        modelname = f"ethosu_{modelname}"
+    elif llm_config.backend.tosa.enabled:
+        partitioners.append(get_tosa_partitioner(llm_config.backend.tosa.version))
+        modelname = f"tosa_{modelname}"
 
     builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
         partitioners
@@ -1365,13 +1383,13 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
             openvino_device=llm_config.backend.openvino.device,
             verbose=llm_config.debug.verbose,
         )
-    elif llm_config.backend.tosa.enabled:
-        builder = _to_edge_and_lower_llama_tosa(
+    elif llm_config.backend.tosa.enabled or llm_config.backend.ethosu.enabled:
+        builder = _to_edge_and_lower_llama_arm(
             builder_exported,
             modelname,
             quantizers,
             additional_passes,
-            llm_config.backend.tosa.version,
+            llm_config,
             verbose=llm_config.debug.verbose,
         )
     else:
 
@@ -10,12 +10,17 @@
 from executorch.devtools.backend_debug import get_delegation_info
 
 try:
-    from executorch.backends.arm.quantizer.arm_quantizer import TOSAQuantizer
+    from executorch.backends.arm.quantizer.arm_quantizer import (
+        EthosUQuantizer,
+        TOSAQuantizer,
+    )
 
     HAS_ARM_BACKEND = True
 except ImportError:
     HAS_ARM_BACKEND = False
+    EthosUQuantizer = None
     TOSAQuantizer = None
+
 from executorch.examples.models.llama.export_llama_lib import (
     _export_llama,
     build_args_parser,
@@ -73,3 +78,18 @@ def test_get_quantizer_and_quant_params_returns_tosa_quantizer(self):
         self.assertIsNone(quant_dtype)
         self.assertEqual(len(quantizers), 1)
         self.assertIsInstance(quantizers[0], TOSAQuantizer)
+
+    @unittest.skipUnless(HAS_ARM_BACKEND, "ARM backend not available")
+    def test_get_quantizer_and_quant_params_returns_ethosu_quantizer(self):
+        """Enabling Ethos-U with ethosu_8a8w yields exactly one EthosUQuantizer."""
+        # Start from a default config and turn on only the Ethos-U backend
+        # together with its PT2E quantization mode.
+        llm_config = LlmConfig()
+        llm_config.backend.ethosu.enabled = True
+        llm_config.quantization.pt2e_quantize = Pt2eQuantize.ethosu_8a8w
+
+        pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(
+            llm_config
+        )
+
+        # No PT2E quantization params or quant dtype are expected for this mode;
+        # the only quantizer returned must be the Ethos-U one.
+        self.assertIsNone(pt2e_quant_params)
+        self.assertIsNone(quant_dtype)
+        self.assertEqual(len(quantizers), 1)
+        self.assertIsInstance(quantizers[0], EthosUQuantizer)
@@ -348,6 +348,7 @@ class Pt2eQuantize(str, Enum):
     coreml_baseline_8a_c4w = "coreml_baseline_8a_c4w"
     vulkan_8w = "vulkan_8w"
     tosa_8a8w = "tosa_8a8w"
+    ethosu_8a8w = "ethosu_8a8w"
 
 
 class SpinQuant(str, Enum):
@@ -545,6 +546,18 @@ class TosaConfig:
     version: str = "TOSA-1.0+INT"
 
 
+@dataclass
+class EthosUConfig:
+    """
+    Configures the Ethos-U backend.
+
+    Attributes:
+        enabled: Opt-in switch for the Ethos-U backend; off by default.
+        target: Ethos-U target string; defaults to "ethos-u85-128".
+        memory_mode: Memory mode name; "default" unless overridden.
+        system_config: System configuration name; "default" unless overridden.
+
+    NOTE(review): none of the string fields are validated here -- confirm the
+    accepted values against the Arm backend.
+    """
+
+    enabled: bool = False
+    target: str = "ethos-u85-128"  # Default target, can be overridden.
+    memory_mode: str = "default"
+    system_config: str = "default"
+
+
 @dataclass
 class BackendConfig:
     """
@@ -560,6 +573,7 @@ class BackendConfig:
     openvino: OpenvinoConfig = field(default_factory=OpenvinoConfig)
     torchao: TorchAOKernelsConfig = field(default_factory=TorchAOKernelsConfig)
     tosa: TosaConfig = field(default_factory=TosaConfig)
+    ethosu: EthosUConfig = field(default_factory=EthosUConfig)
 
 
 ################################################################################
 
@@ -246,3 +246,12 @@ def get_tosa_partitioner(version: str):
     compile_spec = TosaCompileSpec(version)
 
     return TOSAPartitioner(compile_spec)
+
+
+def get_ethosu_partitioner(target: str):
+    """
+    Build an EthosUPartitioner for the given Ethos-U target.
+
+    Args:
+        target: Target string passed to EthosUCompileSpec.
+
+    Returns:
+        An EthosUPartitioner constructed from the compile spec for `target`.
+    """
+    # Arm backend imports are deferred to call time -- presumably so this module
+    # can be imported without the Arm backend installed; TODO confirm.
+    from executorch.backends.arm.ethosu.compile_spec import EthosUCompileSpec
+    from executorch.backends.arm.ethosu.partitioner import EthosUPartitioner
+
+    # NOTE(review): only `target` is forwarded to the compile spec; confirm that
+    # no other Ethos-U settings need to reach the partitioner.
+    compile_spec = EthosUCompileSpec(target)
+
+    return EthosUPartitioner(compile_spec)