Skip to content

Commit af90130

Browse files
authored
route EthosU input/output memcpy through overridable hook (#19264)
Differential Revision: D103455766. Pull Request resolved: #19264.
1 parent 851cffb commit af90130

5 files changed

Lines changed: 43 additions & 6 deletions

File tree

backends/arm/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,10 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX)
5454

5555
set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
5656

57-
set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp
58-
backends/arm/runtime/VelaBinStream.cpp
57+
set(_arm_backend_sources
58+
backends/arm/runtime/EthosUBackend.cpp
59+
backends/arm/runtime/EthosUBackend_IoMemcpy.cpp
60+
backends/arm/runtime/VelaBinStream.cpp
5961
)
6062
list(TRANSFORM _arm_backend_sources PREPEND "${EXECUTORCH_ROOT}/")
6163

backends/arm/runtime/EthosUBackend.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@
2626
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
2727
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
2828

29+
// Overridable memcpy used by the EthosU backend for input/output scratch
30+
// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does
31+
// std::memcpy. Firmware targets can supply a strong override (e.g. routing
32+
// through a DMA engine) to reduce CPU memcpy load on the host MCU.
33+
extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size);
34+
2935
using namespace std;
3036

3137
using executorch::aten::ScalarType;
@@ -237,8 +243,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
237243
if (both_char || both_int || both_short || both_bool) {
238244
EXECUTORCH_PROF_SCOPE(
239245
event_tracer, "+EthosUBackend::execute()handles.input.memcpy()");
240-
// Sizes match and elt size matches so memcpy
241-
memcpy(
246+
// Sizes match and elt size matches so memcpy.
247+
// Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
248+
arm_ethos_io_memcpy(
242249
scratch_addr,
243250
tensor_in.mutable_data_ptr<char>(),
244251
tensor_in.nbytes());
@@ -389,7 +396,8 @@ Error copy_with_layout_adjustment(
389396
}
390397
const char* src_bytes = src;
391398
for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
392-
memcpy(dest, src_bytes, chunk_size);
399+
// Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
400+
arm_ethos_io_memcpy(dest, src_bytes, chunk_size);
393401
src_bytes += vela_chunk_size;
394402
dest += chunk_size;
395403
}

backends/arm/runtime/EthosUBackend_Cortex_M.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ extern "C" __attribute__((weak)) struct ethosu_driver* ethosu_reserve_driver_ex(
4242
return ethosu_reserve_driver();
4343
}
4444

45+
// Overridable memcpy used by the EthosU backend for output scratch
46+
// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does
47+
// std::memcpy. Firmware targets can supply a strong override (e.g. routing
48+
// through a DMA engine) to reduce CPU memcpy load on the host MCU.
49+
extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size);
50+
4551
namespace executorch {
4652
namespace backends {
4753
namespace arm {
@@ -136,7 +142,8 @@ Error platform_execute(
136142
}
137143
io_bytes_total += tensor_bytes;
138144
} else {
139-
memcpy(
145+
// Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate.
146+
arm_ethos_io_memcpy(
140147
tensor_out.mutable_data_ptr<char>(),
141148
static_cast<const char*>(output_addr),
142149
tensor_bytes);
backends/arm/runtime/EthosUBackend_IoMemcpy.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/*
2+
* Copyright 2026 Arm Limited and/or its affiliates.
3+
*
4+
* This source code is licensed under the BSD-style license found in the
5+
* LICENSE file in the root directory of this source tree.
6+
*/
7+
8+
#include <cstddef>
9+
#include <cstring>
10+
11+
// Weak default for arm_ethos_io_memcpy. Firmware targets can provide a
12+
// strong-symbol override (e.g. routing through DMA on Cortex-M55) without
13+
// touching the upstream EthosUBackend code. Lives in its own translation
14+
// unit so the compiler in the call-site TUs cannot inline this body and
15+
// bypass the link-time override (same trick as bolt_arm_memcpy_external).
//
// Copies `size` bytes from `src` to `dst` by forwarding to std::memcpy, so
// this default inherits std::memcpy's preconditions: both pointers must be
// valid for `size` bytes and the source and destination ranges must not
// overlap. The function returns nothing and reports no errors.
//
// NOTE(review): the signature is void and carries no completion handle, so
// an override (e.g. a DMA-backed one) presumably has to finish the copy
// before returning -- confirm against the call sites.
16+
extern "C" __attribute__((weak)) void
17+
arm_ethos_io_memcpy(void* dst, const void* src, size_t size) {
18+
std::memcpy(dst, src, size);
19+
}

backends/arm/runtime/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def define_common_targets():
1515
srcs = [
1616
"EthosUBackend.cpp",
1717
"EthosUBackend_Cortex_M.cpp",
18+
"EthosUBackend_IoMemcpy.cpp",
1819
],
1920
headers = ["EthosUBackend_Internal.h"],
2021
compatible_with = ["ovr_config//cpu:arm32-embedded", "ovr_config//cpu:arm32-embedded-fpu"],

0 commit comments

Comments
 (0)