Skip to content

Commit ca4e1c0

Browse files
authored
Merge branch 'main' into bump-torchao-pin
2 parents ad6f166 + f478cb3 commit ca4e1c0

23 files changed

Lines changed: 899 additions & 200 deletions

CMakePresets.json

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -290,13 +290,8 @@
290290
"name": "arm-ethosu-linux",
291291
"displayName": "Build ExecuTorch for Arm Ethos-U Linux",
292292
"inherits": ["common"],
293-
"description": "musl declares __assert_fail with int for line; avoid NDEBUG forward-decl mismatch in Release builds",
294293
"cacheVariables": {
295-
"EXECUTORCH_BUILD_ARM_ETHOSU_LINUX": "ON",
296-
"EXECUTORCH_BUILD_EXECUTOR_RUNNER": "ON",
297-
"EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",
298-
"CMAKE_C_FLAGS_RELEASE": "-UNDEBUG",
299-
"CMAKE_CXX_FLAGS_RELEASE": "-UNDEBUG",
294+
"EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/arm_ethosu_linux.cmake",
300295
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/arm/ethos-u-setup/aarch64-linux-musl-toolchain.cmake"
301296
}
302297
}

backends/arm/README.md

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,27 +76,104 @@ The Arm backend can be built using the following command:
7676
./install_executorch.sh
7777
```
7878

79-
One of the following commands should also be run once to gather the necessary dependencies for your chosen target(s):
79+
**NOTE:** While developing, it can be convenient to use `./install_executorch.sh --editable`, which creates an editable installation of ExecuTorch.
8080

81-
For the Ethos-U target:
81+
### Target-specific setup and build
82+
83+
Pick one of the target flows below. Each flow has a one-time setup step and, where applicable, a build command.
84+
85+
### Baremetal (Ethos-U) workflow
86+
87+
Builds ExecuTorch runtime libraries for Cortex-M with Ethos-U acceleration.
88+
89+
Setup:
8290

8391
```
8492
./examples/arm/setup.sh --i-agree-to-the-contained-eula
8593
```
8694

87-
For the VGF target:
95+
Build:
96+
97+
```
98+
./backends/arm/scripts/build_executorch.sh
99+
```
100+
101+
### VGF (Vulkan ML extensions) workflow
102+
103+
Setup:
88104

89105
```
90106
./examples/arm/setup.sh --disable-ethos-u-deps --enable-mlsdk-deps
91107
```
92108

93-
For both Ethos-U & VGF targets:
109+
The current flow lowers to TOSA and converts to VGF for use in external projects,
110+
so the `executor_runner` is not typically used here.
111+
112+
### Direct Drive (experimental, Ethos-U85 on Linux) workflow
113+
114+
Direct Drive enables execution on Ethos-U85 via the Linux driver stack.
115+
116+
Driver stack (Linux) and API:
117+
118+
```
119+
https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-linux-driver-stack
120+
```
121+
122+
An FVP with Linux is available for Direct Drive, but it must be built and run
123+
manually. See:
94124

95125
```
96-
./examples/arm/setup.sh --i-agree-to-the-contained-eula --enable-mlsdk-deps
126+
https://corstone1000.docs.arm.com/en/corstone1000-2025.12/
97127
```
98128

99-
**NOTE:** While developing, it can be convenient to use`./install_executorch.sh --editable`, which creates an editable installation of ExecuTorch.
129+
Setup:
130+
131+
```
132+
./examples/arm/setup.sh --i-agree-to-the-contained-eula --target-toolchain linux-musl
133+
source ./examples/arm/arm-scratch/setup_path.sh
134+
```
135+
136+
Build:
137+
138+
```
139+
./backends/arm/scripts/build_executorch.sh \
140+
--toolchain=aarch64-linux-musl-gcc \
141+
--build_type=Debug
142+
```
143+
144+
Note: the toolchain is named differently in the two steps: `setup.sh` takes `--target-toolchain linux-musl`, while `build_executorch.sh` takes `--toolchain=aarch64-linux-musl-gcc`.
145+
146+
If your Yocto image enables the dropbear SSH server, you can copy the
147+
`executor_runner` binary into the running FVP via scp:
148+
149+
```
150+
scp -P 2222 arm_test/cmake-out/executor_runner root@127.0.0.1:/tmp/
151+
```
152+
153+
#### Direct Drive model (PTE) workflow
154+
155+
Create a PTE file:
156+
157+
```
158+
python3 -m examples.arm.aot_arm_compiler \
159+
--model_name examples/arm/example_modules/add.py \
160+
--delegate \
161+
--quantize \
162+
--target ethos-u85-256 \
163+
--direct_drive
164+
```
165+
166+
Copy the `executor_runner` binary and the generated PTE file to the running FVP:
167+
168+
```
169+
scp -P 2222 arm_test/cmake-out/executor_runner add_arm_delegate_ethos-u85-256.pte root@127.0.0.1:/tmp/
170+
```
171+
172+
Run the model on the FVP:
173+
174+
```
175+
ssh -p 2222 root@127.0.0.1 -t "/tmp/executor_runner -model_path /tmp/add_arm_delegate_ethos-u85-256.pte -num_executions 1"
176+
```
100177

101178
## Testing
102179

backends/arm/runtime/EthosUBackend_Cortex_A.cpp

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -347,19 +347,13 @@ Error platform_execute(
347347
int output_count,
348348
Span<executorch::runtime::EValue*> args,
349349
char* /*ethosu_scratch*/) {
350-
std::vector<size_t> input_copy_sizes;
351-
std::vector<const char*> linux_input_ptrs;
352-
if (input_count > 0) {
353-
input_copy_sizes.resize(input_count, 0);
354-
linux_input_ptrs.resize(input_count, nullptr);
355-
}
350+
std::vector<size_t> input_copy_sizes(input_count, 0);
351+
std::vector<const char*> linux_input_ptrs(input_count, nullptr);
356352

357-
std::vector<size_t> output_io_bytes;
358-
std::vector<char*> linux_output_ptrs;
359-
if (output_count > 0) {
360-
output_io_bytes.resize(output_count, 0);
361-
linux_output_ptrs.resize(output_count, nullptr);
362-
}
353+
std::vector<size_t> output_io_bytes(output_count, 0);
354+
std::vector<char*> linux_output_ptrs(output_count, nullptr);
355+
std::vector<std::vector<char>> output_scratch_buffers(output_count);
356+
std::vector<bool> output_needs_adjustment(output_count, false);
363357

364358
for (int i = 0; i < input_count; ++i) {
365359
auto tensor_in = args[i]->toTensor();
@@ -380,16 +374,12 @@ Error platform_execute(
380374
const size_t tensor_nbytes = tensor_out.nbytes();
381375
if (i < static_cast<int>(output_io_bytes.size()) &&
382376
output_io_bytes[i] != tensor_nbytes) {
383-
ET_LOG(
384-
Error,
385-
"Ethos-U Linux backend output size mismatch for index %d: "
386-
"driver IO bytes = %zu, tensor bytes = %zu",
387-
i,
388-
output_io_bytes[i],
389-
tensor_nbytes);
390-
return Error::InvalidState;
377+
output_scratch_buffers[i].resize(output_io_bytes[i]);
378+
linux_output_ptrs[i] = output_scratch_buffers[i].data();
379+
output_needs_adjustment[i] = true;
380+
} else {
381+
linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
391382
}
392-
linux_output_ptrs[i] = tensor_out.mutable_data_ptr<char>();
393383
}
394384
}
395385

@@ -399,13 +389,37 @@ Error platform_execute(
399389
return Error::InvalidState;
400390
}
401391

402-
return invoke_linux_driver(
392+
Error status = invoke_linux_driver(
403393
handles,
404394
linux_input_ptrs,
405395
linux_output_ptrs,
406396
input_copy_sizes,
407397
output_io_bytes,
408398
state->options);
399+
if (status != Error::Ok) {
400+
return status;
401+
}
402+
403+
if (handles.outputs != nullptr) {
404+
for (int i = 0; i < output_count; ++i) {
405+
if (!output_needs_adjustment[i]) {
406+
continue;
407+
}
408+
auto tensor_out = args[input_count + i]->toTensor();
409+
const size_t tensor_nbytes = tensor_out.nbytes();
410+
Error adjust_status = copy_with_layout_adjustment(
411+
handles.outputs->io[i],
412+
i,
413+
output_scratch_buffers[i].data(),
414+
tensor_out,
415+
tensor_nbytes);
416+
if (adjust_status != Error::Ok) {
417+
return adjust_status;
418+
}
419+
}
420+
}
421+
422+
return Error::Ok;
409423
}
410424

411425
} // namespace arm

examples/models/llama/export_llama_lib.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from executorch.extension.llm.export.config.llm_config import LlmConfig
3535
from executorch.extension.llm.export.partitioner_lib import (
3636
get_coreml_partitioner,
37+
get_ethosu_partitioner,
3738
get_mps_partitioner,
3839
get_openvino_partitioner,
3940
get_qnn_partitioner,
@@ -43,6 +44,7 @@
4344
)
4445
from executorch.extension.llm.export.quantizer_lib import (
4546
get_coreml_quantizer,
47+
get_ethosu_quantizer,
4648
get_ov_quantizer,
4749
get_pt2e_quantization_params,
4850
get_pt2e_quantizers,
@@ -218,6 +220,7 @@ def build_args_parser() -> argparse.ArgumentParser:
218220
"coreml_baseline_8a_c4w",
219221
"vulkan_8w",
220222
"tosa_8a8w",
223+
"ethosu_8a8w",
221224
],
222225
help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.",
223226
)
@@ -813,6 +816,14 @@ def get_quantizer_and_quant_params(llm_config):
813816
llm_config.backend.tosa.version, llm_config.quantization.pt2e_quantize.value
814817
)
815818
quantizers.append(tosa_quantizer)
819+
if llm_config.backend.ethosu.enabled and llm_config.quantization.pt2e_quantize:
820+
ethosu_quantizer = get_ethosu_quantizer(
821+
llm_config.backend.ethosu.target,
822+
llm_config.backend.ethosu.system_config,
823+
llm_config.backend.ethosu.memory_mode,
824+
llm_config.quantization.pt2e_quantize.value,
825+
)
826+
quantizers.append(ethosu_quantizer)
816827
if llm_config.backend.vulkan.enabled and llm_config.quantization.pt2e_quantize:
817828
assert (
818829
len(quantizers) == 0
@@ -984,20 +995,27 @@ def _to_edge_and_lower_llama_openvino(
984995
return builder.to_executorch(passes=additional_passes)
985996

986997

987-
def _to_edge_and_lower_llama_tosa(
998+
def _to_edge_and_lower_llama_arm(
988999
builder_exported,
9891000
modelname,
9901001
quantizers,
9911002
additional_passes,
992-
tosa_spec,
1003+
llm_config: LlmConfig,
9931004
verbose: bool = False,
9941005
) -> LLMEdgeManager:
9951006
logging.info("Lowering model using TOSA partitioner")
9961007

9971008
partitioners = []
998-
partitioners.append(get_tosa_partitioner(tosa_spec))
999-
1000-
modelname = f"tosa_{modelname}"
1009+
if llm_config.backend.ethosu.enabled:
1010+
partitioners.append(
1011+
get_ethosu_partitioner(
1012+
llm_config.backend.ethosu.target,
1013+
)
1014+
)
1015+
modelname = f"ethosu_{modelname}"
1016+
elif llm_config.backend.tosa.enabled:
1017+
partitioners.append(get_tosa_partitioner(llm_config.backend.tosa.version))
1018+
modelname = f"tosa_{modelname}"
10011019

10021020
builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
10031021
partitioners
@@ -1365,13 +1383,13 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager: # noqa: C901
13651383
openvino_device=llm_config.backend.openvino.device,
13661384
verbose=llm_config.debug.verbose,
13671385
)
1368-
elif llm_config.backend.tosa.enabled:
1369-
builder = _to_edge_and_lower_llama_tosa(
1386+
elif llm_config.backend.tosa.enabled or llm_config.backend.ethosu.enabled:
1387+
builder = _to_edge_and_lower_llama_arm(
13701388
builder_exported,
13711389
modelname,
13721390
quantizers,
13731391
additional_passes,
1374-
llm_config.backend.tosa.version,
1392+
llm_config,
13751393
verbose=llm_config.debug.verbose,
13761394
)
13771395
else:

examples/models/llama/tests/test_export_llama_lib.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,17 @@
1010
from executorch.devtools.backend_debug import get_delegation_info
1111

1212
try:
13-
from executorch.backends.arm.quantizer.arm_quantizer import TOSAQuantizer
13+
from executorch.backends.arm.quantizer.arm_quantizer import (
14+
EthosUQuantizer,
15+
TOSAQuantizer,
16+
)
1417

1518
HAS_ARM_BACKEND = True
1619
except ImportError:
1720
HAS_ARM_BACKEND = False
21+
EthosUQuantizer = None
1822
TOSAQuantizer = None
23+
1924
from executorch.examples.models.llama.export_llama_lib import (
2025
_export_llama,
2126
build_args_parser,
@@ -73,3 +78,18 @@ def test_get_quantizer_and_quant_params_returns_tosa_quantizer(self):
7378
self.assertIsNone(quant_dtype)
7479
self.assertEqual(len(quantizers), 1)
7580
self.assertIsInstance(quantizers[0], TOSAQuantizer)
81+
82+
@unittest.skipUnless(HAS_ARM_BACKEND, "ARM backend not available")
83+
def test_get_quantizer_and_quant_params_returns_ethosu_quantizer(self):
84+
llm_config = LlmConfig()
85+
llm_config.backend.ethosu.enabled = True
86+
llm_config.quantization.pt2e_quantize = Pt2eQuantize.ethosu_8a8w
87+
88+
pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(
89+
llm_config
90+
)
91+
92+
self.assertIsNone(pt2e_quant_params)
93+
self.assertIsNone(quant_dtype)
94+
self.assertEqual(len(quantizers), 1)
95+
self.assertIsInstance(quantizers[0], EthosUQuantizer)

extension/llm/export/config/llm_config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ class Pt2eQuantize(str, Enum):
348348
coreml_baseline_8a_c4w = "coreml_baseline_8a_c4w"
349349
vulkan_8w = "vulkan_8w"
350350
tosa_8a8w = "tosa_8a8w"
351+
ethosu_8a8w = "ethosu_8a8w"
351352

352353

353354
class SpinQuant(str, Enum):
@@ -545,6 +546,18 @@ class TosaConfig:
545546
version: str = "TOSA-1.0+INT"
546547

547548

549+
@dataclass
550+
class EthosUConfig:
551+
"""
552+
Configures the Ethos-U backend.
553+
"""
554+
555+
enabled: bool = False
556+
target: str = "ethos-u85-128" # Default target, can be overridden.
557+
memory_mode: str = "default"
558+
system_config: str = "default"
559+
560+
548561
@dataclass
549562
class BackendConfig:
550563
"""
@@ -560,6 +573,7 @@ class BackendConfig:
560573
openvino: OpenvinoConfig = field(default_factory=OpenvinoConfig)
561574
torchao: TorchAOKernelsConfig = field(default_factory=TorchAOKernelsConfig)
562575
tosa: TosaConfig = field(default_factory=TosaConfig)
576+
ethosu: EthosUConfig = field(default_factory=EthosUConfig)
563577

564578

565579
################################################################################

extension/llm/export/partitioner_lib.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,3 +246,12 @@ def get_tosa_partitioner(version: str):
246246
compile_spec = TosaCompileSpec(version)
247247

248248
return TOSAPartitioner(compile_spec)
249+
250+
251+
def get_ethosu_partitioner(target: str):
252+
from executorch.backends.arm.ethosu.compile_spec import EthosUCompileSpec
253+
from executorch.backends.arm.ethosu.partitioner import EthosUPartitioner
254+
255+
compile_spec = EthosUCompileSpec(target)
256+
257+
return EthosUPartitioner(compile_spec)

0 commit comments

Comments
 (0)