-
Notifications
You must be signed in to change notification settings - Fork 989
[Pico2] Add CMSIS-NN INT8 support and latency instrumentation #18612
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -5,7 +5,6 @@ | |||||||||||
| # This source code is licensed under the BSD-style license found in the | ||||||||||||
| # LICENSE file in the root directory of this source tree. | ||||||||||||
|
|
||||||||||||
| #!/bin/bash | ||||||||||||
| # build_firmware_pico.sh | ||||||||||||
| # Simple script to cross-compile ExecuTorch and build Pico2 firmware with optional model input | ||||||||||||
|
|
||||||||||||
|
|
@@ -17,26 +16,61 @@ PICO2_DIR="${ROOT_DIR}/examples/raspberry_pi/pico2" | |||||||||||
| BUILD_DIR="${PICO2_DIR}/build" | ||||||||||||
| EXECUTORCH_BUILD_DIR="${ROOT_DIR}/cmake-out" | ||||||||||||
|
|
||||||||||||
| # Pico SDK 2.0's mbedtls requires this for CMake >= 3.30 | ||||||||||||
| export CMAKE_POLICY_VERSION_MINIMUM=3.5 | ||||||||||||
|
|
||||||||||||
| # Portable nproc: use nproc on Linux, sysctl on macOS | ||||||||||||
| if command -v nproc &>/dev/null; then | ||||||||||||
| NPROC=$(nproc) | ||||||||||||
| else | ||||||||||||
| NPROC=$(sysctl -n hw.ncpu) | ||||||||||||
| fi | ||||||||||||
|
|
||||||||||||
| # Source ARM toolchain if available and not already on PATH | ||||||||||||
| if ! command -v arm-none-eabi-gcc &>/dev/null; then | ||||||||||||
| SETUP_PATH="${ROOT_DIR}/examples/arm/arm-scratch/setup_path.sh" | ||||||||||||
| if [ -f "${SETUP_PATH}" ]; then | ||||||||||||
| source "${SETUP_PATH}" | ||||||||||||
| else | ||||||||||||
| # Try to find the toolchain directly | ||||||||||||
| TOOLCHAIN_BIN=$(find "${ROOT_DIR}/examples/arm/arm-scratch" -name "arm-none-eabi-gcc" -type f 2>/dev/null | head -1) | ||||||||||||
| if [ -n "${TOOLCHAIN_BIN:-}" ]; then | ||||||||||||
| export PATH="$(dirname "${TOOLCHAIN_BIN}"):${PATH}" | ||||||||||||
| else | ||||||||||||
| echo "Error: arm-none-eabi-gcc not found. Run: ./examples/arm/setup.sh --i-agree-to-the-contained-eula" | ||||||||||||
| exit 1 | ||||||||||||
| fi | ||||||||||||
| fi | ||||||||||||
| fi | ||||||||||||
|
|
||||||||||||
| echo "Using ARM toolchain: $(which arm-none-eabi-gcc)" | ||||||||||||
|
|
||||||||||||
| # Default model | ||||||||||||
| DEFAULT_MODEL="default_model.pte" | ||||||||||||
|
|
||||||||||||
| usage() { | ||||||||||||
| echo "Usage: $0 [--clean] [--model=path/to/model.pte]" | ||||||||||||
| echo "Usage: $0 [--clean] [--cmsis] [--model=path/to/model.pte]" | ||||||||||||
| echo " --clean Clean build directories" | ||||||||||||
| echo " --cmsis Build with CMSIS-NN INT8 kernels (requires cortex_m backend)" | ||||||||||||
| echo " --model=FILE Specify model file to embed (relative to pico2/)" | ||||||||||||
| exit 1 | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| # Parse args | ||||||||||||
| MODEL_INPUT="" | ||||||||||||
| CLEAN_BUILD=0 | ||||||||||||
| USE_CMSIS=0 | ||||||||||||
|
|
||||||||||||
| for arg in "$@"; do | ||||||||||||
| case $arg in | ||||||||||||
| --clean) | ||||||||||||
| CLEAN_BUILD=1 | ||||||||||||
| shift | ||||||||||||
| ;; | ||||||||||||
| --cmsis) | ||||||||||||
| USE_CMSIS=1 | ||||||||||||
| shift | ||||||||||||
| ;; | ||||||||||||
| --model=*) | ||||||||||||
| MODEL_INPUT="${arg#*=}" | ||||||||||||
| shift | ||||||||||||
|
|
@@ -68,42 +102,57 @@ if [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then | |||||||||||
| echo "Using selective build from model: ${MODEL_ABS_PATH}" | ||||||||||||
| fi | ||||||||||||
|
|
||||||||||||
| CMSIS_FLAGS=() | ||||||||||||
| if [ $USE_CMSIS -eq 1 ]; then | ||||||||||||
| echo "CMSIS-NN mode: building with Cortex-M backend and CMSIS-NN kernels" | ||||||||||||
| CMSIS_FLAGS=( | ||||||||||||
| -DEXECUTORCH_BUILD_CORTEX_M=ON | ||||||||||||
| ) | ||||||||||||
| fi | ||||||||||||
|
|
||||||||||||
| cmake -B "${EXECUTORCH_BUILD_DIR}" \ | ||||||||||||
| -DCMAKE_TOOLCHAIN_FILE="${ROOT_DIR}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake" \ | ||||||||||||
| -DTARGET_CPU=cortex-m0plus \ | ||||||||||||
| -DTARGET_CPU=cortex-m33+nofp \ | ||||||||||||
| -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \ | ||||||||||||
| -DEXECUTORCH_PAL_DEFAULT=minimal \ | ||||||||||||
| -DEXECUTORCH_DTYPE_SELECTIVE_BUILD=ON \ | ||||||||||||
| -DCMAKE_BUILD_TYPE=MinSizeRel \ | ||||||||||||
| -DEXECUTORCH_ENABLE_LOGGING=OFF \ | ||||||||||||
| -DEXECUTORCH_SELECT_ALL_OPS=OFF \ | ||||||||||||
| -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ | ||||||||||||
| -DCMAKE_INSTALL_PREFIX="${EXECUTORCH_BUILD_DIR}" \ | ||||||||||||
| ${SELECT_OPS_FLAGS} \ | ||||||||||||
| ${CMSIS_FLAGS[@]+"${CMSIS_FLAGS[@]}"} \ | ||||||||||||
| "${ROOT_DIR}" | ||||||||||||
|
|
||||||||||||
| cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j$(nproc) | ||||||||||||
| cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j${NPROC} | ||||||||||||
|
|
||||||||||||
| echo "ExecuTorch cross compile complete." | ||||||||||||
|
|
||||||||||||
| # Step 2: Build firmware for Pico2 with model input | ||||||||||||
|
|
||||||||||||
| cd "${PICO2_DIR}" | ||||||||||||
|
|
||||||||||||
| PICO_CMAKE_FLAGS=(-DPICO_BOARD=pico2 -DCMAKE_BUILD_TYPE=Release) | ||||||||||||
|
|
||||||||||||
| if [ $USE_CMSIS -eq 1 ]; then | ||||||||||||
| PICO_CMAKE_FLAGS+=(-DUSE_CMSIS_NN=ON) | ||||||||||||
| fi | ||||||||||||
|
|
||||||||||||
|
||||||||||||
| if [ -n "${SELECT_OPS_FLAGS:-}" ]; then | |
| PICO_CMAKE_FLAGS+=(-DUSE_SELECTIVE_BUILD=ON) | |
| fi |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,159 @@ | ||
| #!/usr/bin/env python3 | ||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| # All rights reserved. | ||
| # | ||
| # This source code is licensed under the BSD-style license found in the | ||
| # LICENSE file in the root directory of this source tree. | ||
|
|
||
| """ | ||
| Export the TinyMLP MNIST model with INT8 quantization for CMSIS-NN acceleration. | ||
|
|
||
| Uses the CortexMQuantizer to produce INT8 quantized ops that map to CMSIS-NN | ||
| kernels on Cortex-M33 (RP2350/Pico2). The model I/O stays float — quantize and | ||
| dequantize nodes are inserted inside the graph. | ||
|
|
||
| Usage: | ||
| python export_mlp_mnist_cmsis.py | ||
| python export_mlp_mnist_cmsis.py --output my_model.pte | ||
| python export_mlp_mnist_cmsis.py --num-calibration 200 | ||
| """ | ||
|
|
||
| import argparse | ||
| import logging | ||
| import os | ||
|
|
||
| import torch | ||
|
|
||
| from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager | ||
| from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer | ||
| from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge | ||
| from executorch.extension.export_util.utils import save_pte_program | ||
| from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e | ||
|
|
||
| from export_mlp_mnist import create_balanced_model, IMAGE_SIZE, test_comprehensive | ||
|
|
||
| FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" | ||
| logging.basicConfig(level=logging.INFO, format=FORMAT) | ||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def get_calibration_data(num_samples: int = 100):
    """
    Build a calibration set for PT2E quantization observers.

    Half the samples are synthetic stroke patterns (roughly digit-like),
    the rest are sparse random binary images, so the observers see a
    representative activation range. Returns a list of float tensors of
    shape (1, IMAGE_SIZE, IMAGE_SIZE).
    """
    num_structured = num_samples // 2
    samples = []

    # Digit-like samples: one vertical and one horizontal stroke each.
    # NOTE(review): stroke coordinates assume IMAGE_SIZE == 28 — confirm.
    for _ in range(num_structured):
        canvas = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
        stroke_col = torch.randint(5, 23, (1,)).item()
        stroke_row = torch.randint(5, 23, (1,)).item()
        canvas[0, 2:26, stroke_col - 1 : stroke_col + 2] = 1.0  # vertical stroke
        canvas[0, stroke_row - 1 : stroke_row + 2, 5:23] = 1.0  # horizontal stroke
        samples.append(canvas)

    # Remaining samples: ~30%-density random binary noise.
    for _ in range(num_samples - num_structured):
        samples.append((torch.rand(1, IMAGE_SIZE, IMAGE_SIZE) > 0.7).float())

    return samples
|
|
||
|
|
||
def quantize_model(model, calibration_data):
    """
    Quantize `model` to INT8 via PT2E using the CortexMQuantizer.

    Exports the model, inserts observers, calibrates over
    `calibration_data`, then converts to a quantized graph module.
    Returns (quantized_graph_module, example_input).
    """
    example_input = calibration_data[0]

    # Export the model and annotate the graph with the Cortex-M quantizer.
    captured = torch.export.export(model, (example_input,))
    prepared = prepare_pt2e(captured.module(), CortexMQuantizer())

    # Run calibration so the observers record activation ranges.
    total = len(calibration_data)
    logger.info(f"Calibrating with {total} samples...")
    with torch.no_grad():
        for idx, sample in enumerate(calibration_data, start=1):
            prepared(sample)
            if idx % 25 == 0:
                logger.info(f" Calibrated {idx}/{total} samples")

    return convert_pt2e(prepared), example_input
|
|
||
|
|
||
def export_to_pte(quantized_model, example_input, output_path: str):
    """
    Lower the quantized model to an ExecuTorch .pte file.

    Re-exports the quantized graph, converts it to an Edge program
    (preserving aten.linear so the Cortex-M passes can handle it),
    applies the Cortex-M optimization passes, and serializes the final
    program to `output_path`.
    """
    program = torch.export.export(quantized_model, (example_input,))

    compile_config = EdgeCompileConfig(
        _check_ir_validity=False,
        preserve_ops=[torch.ops.aten.linear.default],
    )
    edge = to_edge(program, compile_config=compile_config)
    logger.info("Edge program created")

    logger.info("Applying Cortex-M optimization passes...")
    transformed = CortexMPassManager(edge.exported_program()).transform()
    # NOTE(review): to_edge is invoked a second time on the transformed
    # program — presumably to re-wrap the ExportedProgram after the pass
    # manager ran; confirm this round-trip is intentional.
    edge = to_edge(transformed, compile_config=compile_config)

    logger.info("Converting to ExecuTorch format...")
    runtime_program = edge.to_executorch(
        config=ExecutorchBackendConfig(extract_delegate_segments=False)
    )

    save_pte_program(runtime_program, output_path)
    size_bytes = os.path.getsize(output_path)
    logger.info(f"Model saved to {output_path} ({size_bytes / 1024:.1f} KB)")
|
|
||
|
|
||
def main():
    """CLI entry point: build, quantize, sanity-check, and export the model."""
    parser = argparse.ArgumentParser(
        description="Export TinyMLP MNIST for Cortex-M with CMSIS-NN (INT8)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="balanced_tiny_mlp_mnist_cmsis.pte",
        help="Output .pte file path",
    )
    parser.add_argument(
        "--num-calibration",
        type=int,
        default=100,
        help="Number of calibration samples for quantization",
    )
    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    logger.info("Creating balanced MLP MNIST model...")
    model = create_balanced_model()
    model.eval()

    # Baseline accuracy check before quantization.
    logger.info("Testing FP32 model before quantization:")
    test_comprehensive(model)

    calibration_set = get_calibration_data(args.num_calibration)
    quantized, example = quantize_model(model, calibration_set)

    # Quick smoke test: a vertical stroke that resembles the digit 1.
    logger.info("Testing quantized model:")
    with torch.no_grad():
        probe = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
        probe[0, 2:26, 13:16] = 1.0  # digit-1-like pattern
        prediction = quantized(probe).argmax(dim=1).item()
        logger.info(f" Digit-1 pattern -> predicted: {prediction}")

    export_to_pte(quantized, example, args.output)
    logger.info("Export complete!")
    logger.info(f"Input shape: (1, {IMAGE_SIZE}, {IMAGE_SIZE})")
    logger.info("Input format: Float [0.0, 1.0] (same as FP32 variant)")


if __name__ == "__main__":
    main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the USE_CMSIS_NN + USE_SELECTIVE_BUILD branch, portable_ops_lib / executorch_selected_kernels are not linked. Even with CMSIS-NN accelerated int8 linear, this model still needs non-CMSIS ops like view/reshape (from the TinyMLPMNIST forward) to be registered, which typically come from portable_ops_lib or executorch_selected_kernels. Consider linking executorch_selected_kernels alongside cortex_m_ops_lib (or keep portable_ops_lib) so this configuration doesn’t hit OperatorMissing at runtime.