Skip to content

Commit ffc9927

Browse files
3l1 authored and facebook-github-bot committed
route EthosU input/output memcpy through overridable hook
Summary: The EthosU backend's input/output scratch shuffling currently does a plain CPU std::memcpy of every input tensor into the scratch buffer and every output tensor out of it on every inference. On Cortex-M55-based firmware targets that have a DMA engine, this is a significant CPU load: inference time is spent in memcpy that could instead be DMA-offloaded so the M55 sleeps while the transfer runs. This change introduces a thin extern-C indirection — `arm_ethos_io_memcpy` — that the EthosU backend uses everywhere it currently calls memcpy for input/output scratch shuffling. The default (weak) implementation lives in a separate translation unit (EthosUBackend_IoMemcpy.cpp) and just calls std::memcpy, so behavior is unchanged for any consumer that doesn't override it. Firmware targets can supply a strong-symbol override (e.g. routing through a DMA engine) without touching the upstream backend code. Implementation notes: - The weak default lives in its own TU so the compiler in the call-site TUs cannot inline its body and bypass the link-time override. This is the same pattern bolt_arm_memcpy_external uses. - Three call sites updated: input scratch copy in EthosUBackend.cpp, the layout-adjustment chunk loop in EthosUBackend.cpp, and the output scratch copy in EthosUBackend_Cortex_M.cpp. Differential Revision: D103455766
1 parent a7e44bf commit ffc9927

4 files changed

Lines changed: 39 additions & 4 deletions

File tree

backends/arm/runtime/EthosUBackend.cpp

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,12 @@
2626
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
2727
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
2828

29+
// Overridable memcpy used by the EthosU backend for input/output scratch
30+
// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does
31+
// std::memcpy. Firmware targets can supply a strong override (e.g. routing
32+
// through a DMA engine) to reduce CPU memcpy load on the host MCU.
33+
extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size);
34+
2935
using namespace std;
3036

3137
using executorch::aten::ScalarType;
@@ -237,8 +243,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
237243
if (both_char || both_int || both_short || both_bool) {
238244
EXECUTORCH_PROF_SCOPE(
239245
event_tracer, "+EthosUBackend::execute()handles.input.memcpy()");
240-
// Sizes match and elt size matches so memcpy
241-
memcpy(
246+
// Sizes match and elt size matches so memcpy.
247+
// Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
248+
arm_ethos_io_memcpy(
242249
scratch_addr,
243250
tensor_in.mutable_data_ptr<char>(),
244251
tensor_in.nbytes());
@@ -389,7 +396,8 @@ Error copy_with_layout_adjustment(
389396
}
390397
const char* src_bytes = src;
391398
for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
392-
memcpy(dest, src_bytes, chunk_size);
399+
// Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
400+
arm_ethos_io_memcpy(dest, src_bytes, chunk_size);
393401
src_bytes += vela_chunk_size;
394402
dest += chunk_size;
395403
}

backends/arm/runtime/EthosUBackend_Cortex_M.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ extern "C" __attribute__((weak)) struct ethosu_driver* ethosu_reserve_driver_ex(
4242
return ethosu_reserve_driver();
4343
}
4444

45+
// Overridable memcpy used by the EthosU backend for output scratch
46+
// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does
47+
// std::memcpy. Firmware targets can supply a strong override (e.g. routing
48+
// through a DMA engine) to reduce CPU memcpy load on the host MCU.
49+
extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size);
50+
4551
namespace executorch {
4652
namespace backends {
4753
namespace arm {
@@ -133,7 +139,8 @@ Error platform_execute(
133139
}
134140
io_bytes_total += tensor_bytes;
135141
} else {
136-
memcpy(
142+
// Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
143+
arm_ethos_io_memcpy(
137144
tensor_out.mutable_data_ptr<char>(),
138145
static_cast<const char*>(output_addr),
139146
tensor_bytes);
backends/arm/runtime/EthosUBackend_IoMemcpy.cpp

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,19 @@
1+
/*
2+
* Copyright 2026 Arm Limited and/or its affiliates.
3+
*
4+
* This source code is licensed under the BSD-style license found in the
5+
* LICENSE file in the root directory of this source tree.
6+
*/
7+
8+
#include <cstddef>
9+
#include <cstring>
10+
11+
// Weak default for arm_ethos_io_memcpy. Firmware targets can provide a
12+
// strong-symbol override (e.g. routing through DMA on Cortex-M55) without
13+
// touching the upstream EthosUBackend code. Lives in its own translation
14+
// unit so the compiler in the call-site TUs cannot inline this body and
15+
// bypass the link-time override (same trick as bolt_arm_memcpy_external).
16+
extern "C" __attribute__((weak)) void
17+
arm_ethos_io_memcpy(void* dst, const void* src, size_t size) {
18+
std::memcpy(dst, src, size);
19+
}

backends/arm/runtime/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ def define_common_targets():
1515
srcs = [
1616
"EthosUBackend.cpp",
1717
"EthosUBackend_Cortex_M.cpp",
18+
"EthosUBackend_IoMemcpy.cpp",
1819
],
1920
headers = ["EthosUBackend_Internal.h"],
2021
compatible_with = ["ovr_config//cpu:arm32-embedded", "ovr_config//cpu:arm32-embedded-fpu"],

0 commit comments

Comments
 (0)