diff --git a/examples/raspberry_pi/pico2/CMakeLists.txt b/examples/raspberry_pi/pico2/CMakeLists.txt
index 16bb397252f..dba4b60e2dd 100644
--- a/examples/raspberry_pi/pico2/CMakeLists.txt
+++ b/examples/raspberry_pi/pico2/CMakeLists.txt
@@ -125,21 +125,86 @@ target_compile_options(
 
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
 
-set(BAREMETAL_BUILD_DIR ${EXECUTORCH_ROOT}/executorch/cmake-out/)
+set(BAREMETAL_BUILD_DIR
+    ${EXECUTORCH_ROOT}/executorch/cmake-out/
+    CACHE PATH "ExecuTorch baremetal build dir"
+)
 
-# Link ExecuTorch and Pico libraries
-target_link_libraries(
-  executorch_pico
-  PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
-          ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
-          -Wl,--whole-archive
-          ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
-          -Wl,--no-whole-archive
-          ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
-          pico_stdlib
-          pico_stdio_usb
+# CMSIS-NN support: link quantized cortex_m kernels instead of portable ops
+option(USE_CMSIS_NN "Link CMSIS-NN INT8 kernels for Cortex-M33 acceleration"
+       OFF
+)
+option(USE_SELECTIVE_BUILD "Use selective build (only link model-required ops)"
+       OFF
 )
 
+if(USE_CMSIS_NN)
+  message(STATUS "CMSIS-NN enabled: linking cortex_m_ops_lib + cmsis-nn")
+  if(USE_SELECTIVE_BUILD)
+    # CMSIS-NN model uses only cortex_m:: ops, no portable ops needed. Skip
+    # --whole-archive on portable_ops_lib to avoid pulling unused ops.
+    message(STATUS "Selective build: CMSIS-NN only (no portable ops)")
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libcmsis-nn.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  else()
+    # Full build: whole-archive both cortex_m and portable op libraries.
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_ops_lib.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libcmsis-nn.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  endif()
+else()
+  if(USE_SELECTIVE_BUILD)
+    # Portable build restricted to libexecutorch_selected_kernels.a.
+    message(STATUS "Selective build: using executorch_selected_kernels")
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_selected_kernels.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  else()
+    # Default: whole-archive the complete portable op library.
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  endif()
+endif()
+
 # Only add extra outputs if the target builds successfully
 if(TARGET executorch_pico)
   pico_add_extra_outputs(executorch_pico)
diff --git a/examples/raspberry_pi/pico2/build_firmware_pico.sh
b/examples/raspberry_pi/pico2/build_firmware_pico.sh
index e3343b38deb..cf918061106 100755
--- a/examples/raspberry_pi/pico2/build_firmware_pico.sh
+++ b/examples/raspberry_pi/pico2/build_firmware_pico.sh
@@ -5,7 +5,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-#!/bin/bash
 # build_firmware_pico.sh
 # Simple script to cross-compile ExecuTorch and build Pico2 firmware with optional model input
 
@@ -17,12 +16,48 @@ PICO2_DIR="${ROOT_DIR}/examples/raspberry_pi/pico2"
 BUILD_DIR="${PICO2_DIR}/build"
 EXECUTORCH_BUILD_DIR="${ROOT_DIR}/cmake-out"
 
+# Pico SDK 2.0's mbedtls requires this for CMake >= 3.30
+export CMAKE_POLICY_VERSION_MINIMUM=3.5
+
+# Portable nproc: use nproc on Linux, sysctl on macOS
+if command -v nproc &>/dev/null; then
+  NPROC=$(nproc)
+else
+  NPROC=$(sysctl -n hw.ncpu)
+fi
+
+# Source ARM toolchain if available and not already on PATH
+if ! command -v arm-none-eabi-gcc &>/dev/null; then
+  SETUP_PATH="${ROOT_DIR}/examples/arm/arm-scratch/setup_path.sh"
+  if [ -f "${SETUP_PATH}" ]; then
+    source "${SETUP_PATH}"
+  else
+    # Try to find the toolchain directly
+    TOOLCHAIN_BIN=$(find "${ROOT_DIR}/examples/arm/arm-scratch" -name "arm-none-eabi-gcc" -type f 2>/dev/null | head -1)
+    if [ -n "${TOOLCHAIN_BIN:-}" ]; then
+      export PATH="$(dirname "${TOOLCHAIN_BIN}"):${PATH}"
+    else
+      echo "Error: arm-none-eabi-gcc not found. Run: ./examples/arm/setup.sh --i-agree-to-the-contained-eula"
+      exit 1
+    fi
+  fi
+fi
+
+# Re-check after the setup above: setup_path.sh is sourced without verifying
+# that it actually put the compiler on PATH.
+if ! TOOLCHAIN_GCC=$(command -v arm-none-eabi-gcc); then
+  echo "Error: arm-none-eabi-gcc not found. Run: ./examples/arm/setup.sh --i-agree-to-the-contained-eula"
+  exit 1
+fi
+echo "Using ARM toolchain: ${TOOLCHAIN_GCC}"
+
 # Default model
 DEFAULT_MODEL="default_model.pte"
 
 usage() {
-  echo "Usage: $0 [--clean] [--model=path/to/model.pte]"
+  echo "Usage: $0 [--clean] [--cmsis] [--model=path/to/model.pte]"
   echo " --clean Clean build directories"
+  echo " --cmsis Build with CMSIS-NN INT8 kernels (requires cortex_m backend)"
   echo " --model=FILE Specify model file to embed (relative to pico2/)"
   exit 1
 }
@@ -30,6 +65,7 @@ usage() {
 # Parse args
 MODEL_INPUT=""
 CLEAN_BUILD=0
+USE_CMSIS=0
 
 for arg in "$@"; do
   case $arg in
@@ -37,6 +73,10 @@ for arg in "$@"; do
       CLEAN_BUILD=1
       shift
       ;;
+    --cmsis)
+      USE_CMSIS=1
+      shift
+      ;;
     --model=*)
       MODEL_INPUT="${arg#*=}"
       shift
@@ -68,21 +108,28 @@ if [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then
   echo "Using selective build from model: ${MODEL_ABS_PATH}"
 fi
 
+CMSIS_FLAGS=()
+if [ $USE_CMSIS -eq 1 ]; then
+  echo "CMSIS-NN mode: building with Cortex-M backend and CMSIS-NN kernels"
+  CMSIS_FLAGS=(
+    -DEXECUTORCH_BUILD_CORTEX_M=ON
+  )
+fi
+
 cmake -B "${EXECUTORCH_BUILD_DIR}" \
   -DCMAKE_TOOLCHAIN_FILE="${ROOT_DIR}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake" \
-  -DTARGET_CPU=cortex-m0plus \
+  -DTARGET_CPU=cortex-m33+nofp \
   -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
   -DEXECUTORCH_PAL_DEFAULT=minimal \
-  -DEXECUTORCH_DTYPE_SELECTIVE_BUILD=ON \
   -DCMAKE_BUILD_TYPE=MinSizeRel \
   -DEXECUTORCH_ENABLE_LOGGING=OFF \
-  -DEXECUTORCH_SELECT_ALL_OPS=OFF \
   -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
   -DCMAKE_INSTALL_PREFIX="${EXECUTORCH_BUILD_DIR}" \
   ${SELECT_OPS_FLAGS} \
+  ${CMSIS_FLAGS[@]+"${CMSIS_FLAGS[@]}"} \
   "${ROOT_DIR}"
 
-cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j$(nproc)
+cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j${NPROC}
 
 echo "ExecuTorch cross compile complete."
 
@@ -90,6 +137,12 @@ echo "ExecuTorch cross compile complete."
 
 cd "${PICO2_DIR}"
 
+PICO_CMAKE_FLAGS=(-DPICO_BOARD=pico2 -DCMAKE_BUILD_TYPE=Release)
+
+if [ $USE_CMSIS -eq 1 ]; then
+  PICO_CMAKE_FLAGS+=(-DUSE_CMSIS_NN=ON)
+fi
+
 if [ -n "$MODEL_INPUT" ]; then
   # Use specified model
   if [ ! -f "${MODEL_INPUT}" ]; then
@@ -97,13 +150,15 @@ if [ -n "$MODEL_INPUT" ]; then
     exit 1
   fi
   echo "Building firmware with model: ${MODEL_INPUT}"
-  cmake -B "${BUILD_DIR}" -DPICO_BOARD=pico2 -DINPUT_MODEL="./${MODEL_INPUT}" -DCMAKE_BUILD_TYPE=Release
+  PICO_CMAKE_FLAGS+=(-DINPUT_MODEL="./${MODEL_INPUT}")
 else
   # Use default model
   echo "Building firmware with default model: ${DEFAULT_MODEL}"
-  cmake -B "${BUILD_DIR}" -DPICO_BOARD=pico2 -DINPUT_MODEL="./${DEFAULT_MODEL}" -DCMAKE_BUILD_TYPE=Release
+  PICO_CMAKE_FLAGS+=(-DINPUT_MODEL="./${DEFAULT_MODEL}")
 fi
 
-cmake --build "${BUILD_DIR}" -j$(nproc)
+cmake -B "${BUILD_DIR}" "${PICO_CMAKE_FLAGS[@]}"
+
+cmake --build "${BUILD_DIR}" -j${NPROC}
 
 echo "Firmware build complete. Output in ${BUILD_DIR}, Binary: executorch_pico.uf2"
diff --git a/examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py b/examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py
new file mode 100644
index 00000000000..43ff4a41229
--- /dev/null
+++ b/examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Export the TinyMLP MNIST model with INT8 quantization for CMSIS-NN acceleration.
+
+Uses the CortexMQuantizer to produce INT8 quantized ops that map to CMSIS-NN
+kernels on Cortex-M33 (RP2350/Pico2). The model I/O stays float — quantize and
+dequantize nodes are inserted inside the graph.
+
+Usage:
+    python export_mlp_mnist_cmsis.py
+    python export_mlp_mnist_cmsis.py --output my_model.pte
+    python export_mlp_mnist_cmsis.py --num-calibration 200
+"""
+
+import argparse
+import logging
+import os
+
+import torch
+
+from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
+from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
+from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
+from executorch.extension.export_util.utils import save_pte_program
+
+from export_mlp_mnist import create_balanced_model, IMAGE_SIZE, test_comprehensive
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+
+FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
+logging.basicConfig(level=logging.INFO, format=FORMAT)
+logger = logging.getLogger(__name__)
+
+
+def get_calibration_data(num_samples: int = 100):
+    """
+    Build `num_samples` synthetic (1, IMAGE_SIZE, IMAGE_SIZE) calibration tensors:
+    the first half are stroke patterns, the rest thresholded random noise, so
+    the observer sees a representative activation range.
+    """
+    calibration_data = []
+
+    # Structured patterns that look like the digits the model will see
+    for _ in range(num_samples // 2):
+        x = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
+        # One vertical and one horizontal 3-pixel-wide stroke at random offsets
+        col = torch.randint(5, 23, (1,)).item()
+        row = torch.randint(5, 23, (1,)).item()
+        x[0, 2:26, col - 1 : col + 2] = 1.0  # vertical stroke
+        x[0, row - 1 : row + 2, 5:23] = 1.0  # horizontal stroke
+        calibration_data.append(x)
+
+    # Random binary noise: each pixel is 1.0 with probability ~0.3
+    for _ in range(num_samples - num_samples // 2):
+        x = (torch.rand(1, IMAGE_SIZE, IMAGE_SIZE) > 0.7).float()
+        calibration_data.append(x)
+
+    return calibration_data
+
+
+def quantize_model(model, calibration_data):
+    quantizer = CortexMQuantizer()
+    example_input = calibration_data[0]  # IndexError if calibration_data is empty
+    # Export the model and hand its graph module to prepare_pt2e().
+    exported = torch.export.export(model, (example_input,))
+    graph_module = exported.module()
+
+    prepared = prepare_pt2e(graph_module, quantizer)
+    # Run every calibration sample through the prepared module (no gradients).
+    logger.info(f"Calibrating with {len(calibration_data)} samples...")
+    with torch.no_grad():
+        for i, data in enumerate(calibration_data):
+            prepared(data)
+            if (i + 1) % 25 == 0:
+                logger.info(f" Calibrated {i + 1}/{len(calibration_data)} samples")
+
+    quantized = convert_pt2e(prepared)
+    return quantized, example_input
+
+
+def export_to_pte(quantized_model, example_input, output_path: str):
+    exported_program = torch.export.export(quantized_model, (example_input,))
+
+    edge_config = EdgeCompileConfig(
+        _check_ir_validity=False,
+        preserve_ops=[torch.ops.aten.linear.default],
+    )
+    edge_program = to_edge(exported_program, compile_config=edge_config)
+    logger.info("Edge program created")
+
+    logger.info("Applying Cortex-M optimization passes...")
+    pass_manager = CortexMPassManager(edge_program.exported_program())
+    transformed_ep = pass_manager.transform()
+    # Wrap the transformed program again so it can be lowered via to_executorch().
+    edge_program = to_edge(transformed_ep, compile_config=edge_config)
+
+    logger.info("Converting to ExecuTorch format...")
+    exec_program = edge_program.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=False)
+    )
+
+    save_pte_program(exec_program, output_path)
+    file_size = os.path.getsize(output_path)
+    logger.info(f"Model saved to {output_path} ({file_size / 1024:.1f} KB)")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Export TinyMLP MNIST for Cortex-M with CMSIS-NN (INT8)"
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="balanced_tiny_mlp_mnist_cmsis.pte",
+        help="Output .pte file path",
+    )
+    parser.add_argument(
+        "--num-calibration",
+        type=int,
+        default=100,
+        help="Number of calibration samples for quantization",
+    )
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    logger.info("Creating balanced MLP MNIST model...")
+    model = create_balanced_model()
+    model.eval()
+
+    logger.info("Testing FP32 model before quantization:")
+    test_comprehensive(model)
+    # quantize_model() reads calibration_data[0], so always request >= 1 sample.
+    calibration_data = get_calibration_data(max(1, args.num_calibration))
+    quantized_model, example_input = quantize_model(model, calibration_data)
+
+    logger.info("Testing quantized model:")
+    with torch.no_grad():
+        test_input = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
+        test_input[0, 2:26, 13:16] = 1.0  # digit-1-like pattern
+        output = quantized_model(test_input)
+        pred = output.argmax(dim=1).item()
+        logger.info(f" Digit-1 pattern -> predicted: {pred}")
+
+    export_to_pte(quantized_model, example_input, args.output)
+    logger.info("Export complete!")
+    logger.info(f"Input shape: (1, {IMAGE_SIZE}, {IMAGE_SIZE})")
+    logger.info("Input format: Float [0.0, 1.0] (same as FP32 variant)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/raspberry_pi/pico2/main.cpp b/examples/raspberry_pi/pico2/main.cpp
index 8219e3bfc52..08f03709f4f 100644
--- a/examples/raspberry_pi/pico2/main.cpp
+++ b/examples/raspberry_pi/pico2/main.cpp
@@ -11,6 +11,7 @@
 // Pico includes
 #include "pico/stdio_usb.h"
 #include "pico/stdlib.h"
+#include "pico/time.h"
 
 // Executorch includes
 #include
@@ -244,6 +245,8 @@ bool run_inference(Method& method) {
 
   printf("🧪 Testing all supported digits:\n\n");
 
+  uint32_t latencies_us[4] = {0};
+
   for (int test = 0; test < 4; test++) {
     const char** ascii_digit = test_cases[test].pattern;
     const char* digit_name = test_cases[test].name;
@@ -294,12 +297,18 @@ bool run_inference(Method& method) {
       return false;
     }
 
+    uint32_t start_us = time_us_32();
     result = method.execute();
+    uint32_t elapsed_us = time_us_32() - start_us;
+    latencies_us[test] = elapsed_us;
+
     if (result != Error::Ok) {
       printf("❌ Failed to execute: error %d\n", (int)result);
       return false;
     }
 
+    printf("⏱️ Inference time: %lu us\n", (unsigned long)elapsed_us);
+
     auto output_evalue = method.get_output(0);
     if (!output_evalue.isTensor()) {
       printf("❌ Output is not a tensor\n");
@@ -340,6 +349,16 @@ bool run_inference(Method& method) {
     printf("\n==================================================\n\n");
   }
 
+  // Print latency summary
+  uint32_t total_us = 0;
+  printf("📊 Inference latency summary:\n");
+  for (int i = 0; i < 4; i++) {
+    printf(
+        " %s: %lu us\n", test_cases[i].name, (unsigned long)latencies_us[i]);
+    total_us += latencies_us[i];
+  }
+  printf(" Average: %lu us\n\n", (unsigned long)(total_us / 4));
+
   printf(
       "🎉 All tests complete! ExecuTorch inference of neural network works on Pico2!\n");
   return true;
@@ -373,6 +392,24 @@ int executor_runner() {
     printf("Failed to load and prepare model\n");
     return 1;
   }
+
+  // Probe method allocator usage: allocate 1 byte and use its offset from the
+  // start of method_allocator_pool as the number of bytes already consumed
+  // (assumes the allocator hands out memory sequentially from the pool start).
+  // A failed (null) probe is reported as a fully used pool.
+  void* probe = method_allocator.allocate(1, 1);
+  uint32_t method_used = probe
+      ? (uint32_t)((uint8_t*)probe - method_allocator_pool)
+      : sizeof(method_allocator_pool);
+  printf("📊 Memory usage after method load:\n");
+  printf(
+      " Method allocator: %lu / %lu bytes used\n",
+      (unsigned long)method_used,
+      (unsigned long)sizeof(method_allocator_pool));
+  printf(
+      " Activation pool: %lu bytes allocated\n",
+      (unsigned long)sizeof(activation_pool));
+
   if (!run_inference(*method_ptr)) {
     printf("Failed to run inference\n");
     return 1;