|
26 | 26 | #include <executorch/runtime/core/exec_aten/util/dim_order_util.h> |
27 | 27 | #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h> |
28 | 28 |
|
| 29 | +// Overridable memcpy hook used by the EthosU backend for input/output scratch |
| 30 | +// shuffling. The default (weak) implementation in EthosUBackend_IoMemcpy.cpp |
| 31 | +// calls std::memcpy. Firmware targets can supply a strong override (e.g. one |
| 32 | +// routing through a DMA engine) to reduce CPU memcpy load on the host MCU. NOTE(review): call sites use this as a drop-in for memcpy, so an override presumably must finish the copy before returning -- confirm. |
| 33 | +extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); |
| 34 | + |
29 | 35 | using namespace std; |
30 | 36 |
|
31 | 37 | using executorch::aten::ScalarType; |
@@ -237,8 +243,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { |
237 | 243 | if (both_char || both_int || both_short || both_bool) { |
238 | 244 | EXECUTORCH_PROF_SCOPE( |
239 | 245 | event_tracer, "+EthosUBackend::execute()handles.input.memcpy()"); |
240 | | - // Sizes match and elt size matches so memcpy |
241 | | - memcpy( |
| 246 | + // Sizes match and element size matches, so a plain byte copy suffices. |
| 247 | + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. |
| 248 | + arm_ethos_io_memcpy( |
242 | 249 | scratch_addr, |
243 | 250 | tensor_in.mutable_data_ptr<char>(), |
244 | 251 | tensor_in.nbytes()); |
@@ -389,7 +396,8 @@ Error copy_with_layout_adjustment( |
389 | 396 | } |
390 | 397 | const char* src_bytes = src; |
391 | 398 | for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { |
392 | | - memcpy(dest, src_bytes, chunk_size); |
| 399 | + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. |
| 400 | + arm_ethos_io_memcpy(dest, src_bytes, chunk_size); |
393 | 401 | src_bytes += vela_chunk_size; |
394 | 402 | dest += chunk_size; |
395 | 403 | } |
|
0 commit comments