diff --git a/examples/raspberry_pi/pico2/CMakeLists.txt b/examples/raspberry_pi/pico2/CMakeLists.txt
index 16bb397252f..dba4b60e2dd 100644
--- a/examples/raspberry_pi/pico2/CMakeLists.txt
+++ b/examples/raspberry_pi/pico2/CMakeLists.txt
@@ -125,21 +125,83 @@ target_compile_options(
 
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
 
-set(BAREMETAL_BUILD_DIR ${EXECUTORCH_ROOT}/executorch/cmake-out/)
+# Directory whose lib/ holds the prebuilt ExecuTorch archives; override with -D
+set(BAREMETAL_BUILD_DIR
+    ${EXECUTORCH_ROOT}/executorch/cmake-out/ CACHE PATH
+    "ExecuTorch baremetal build dir")
 
-# Link ExecuTorch and Pico libraries
-target_link_libraries(
-  executorch_pico
-  PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
-          ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
-          -Wl,--whole-archive
-          ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
-          -Wl,--no-whole-archive
-          ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
-          pico_stdlib
-          pico_stdio_usb
+# CMSIS-NN support: link quantized cortex_m kernels (plus portable ops unless selective)
+option(USE_CMSIS_NN "Link CMSIS-NN INT8 kernels for Cortex-M33 acceleration"
+       OFF
+)
+option(USE_SELECTIVE_BUILD "Use selective build (only link model-required ops)"
+       OFF
 )
 
+# Link ExecuTorch and Pico libraries. The USE_CMSIS_NN / USE_SELECTIVE_BUILD
+# combinations differ only in which archives are linked, so the archives are
+# collected into two lists and linked by a single target_link_libraries() call:
+#
+# * PICO_WHOLE_ARCHIVE_LIBS: placed between --whole-archive/--no-whole-archive
+# * PICO_KERNEL_LIBS: placed after the whole-archive group
+#
+# Entries reach the linker command line in the order they are appended, and
+# every archive is referenced by full path under ${BAREMETAL_BUILD_DIR}/lib.
+set(PICO_WHOLE_ARCHIVE_LIBS "")
+set(PICO_KERNEL_LIBS "")
+
+if(USE_CMSIS_NN)
+  message(STATUS "CMSIS-NN enabled: linking cortex_m_ops_lib + cmsis-nn")
+  list(APPEND PICO_WHOLE_ARCHIVE_LIBS
+       ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_ops_lib.a
+  )
+  if(USE_SELECTIVE_BUILD)
+    # CMSIS-NN model uses only cortex_m:: ops, no portable ops needed. Skip
+    # --whole-archive on portable_ops_lib to avoid pulling unused ops.
+    message(STATUS "Selective build: CMSIS-NN only (no portable ops)")
+  else()
+    list(APPEND PICO_WHOLE_ARCHIVE_LIBS
+         ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
+    )
+  endif()
+  list(
+    APPEND
+    PICO_KERNEL_LIBS
+    ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_kernels.a
+    ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+    ${BAREMETAL_BUILD_DIR}/lib/libcmsis-nn.a
+  )
+else()
+  if(USE_SELECTIVE_BUILD)
+    message(STATUS "Selective build: using executorch_selected_kernels")
+    list(APPEND PICO_WHOLE_ARCHIVE_LIBS
+         ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_selected_kernels.a
+    )
+  else()
+    list(APPEND PICO_WHOLE_ARCHIVE_LIBS
+         ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
+    )
+  endif()
+  list(APPEND PICO_KERNEL_LIBS
+       ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+  )
+endif()
+
+# --whole-archive keeps every object of the listed archives even when nothing
+# references it directly (NOTE(review): presumably required so operator
+# registration code is not discarded by the linker - confirm).
+target_link_libraries(
+  executorch_pico
+  PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+          ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+          -Wl,--whole-archive
+          ${PICO_WHOLE_ARCHIVE_LIBS}
+          -Wl,--no-whole-archive
+          ${PICO_KERNEL_LIBS}
+          pico_stdlib
+          pico_stdio_usb
+)
+
 # Only add extra outputs if the target builds successfully
 if(TARGET executorch_pico)
   pico_add_extra_outputs(executorch_pico)
diff --git a/examples/raspberry_pi/pico2/build_firmware_pico.sh b/examples/raspberry_pi/pico2/build_firmware_pico.sh
index e3343b38deb..cf918061106 100755
--- a/examples/raspberry_pi/pico2/build_firmware_pico.sh
+++ b/examples/raspberry_pi/pico2/build_firmware_pico.sh
@@ -5,7 +5,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-#!/bin/bash
 # build_firmware_pico.sh
 # Simple script to cross-compile ExecuTorch and build Pico2 firmware with optional model input
 
@@ -17,12 +16,42 @@ PICO2_DIR="${ROOT_DIR}/examples/raspberry_pi/pico2"
 BUILD_DIR="${PICO2_DIR}/build"
 EXECUTORCH_BUILD_DIR="${ROOT_DIR}/cmake-out"
 
+# Pico SDK 2.0's mbedtls requires this for CMake >= 4.0 (pre-3.5 compatibility was removed)
+export CMAKE_POLICY_VERSION_MINIMUM=3.5
+
+# Portable CPU count: nproc (Linux), sysctl (macOS); fall back to 1 if neither works
+if command -v nproc &>/dev/null; then
+  NPROC=$(nproc)
+else
+  NPROC=$(sysctl -n hw.ncpu 2>/dev/null || echo 1)
+fi
+
+# Ensure arm-none-eabi-gcc is on PATH: try setup_path.sh, then search arm-scratch
+if ! command -v arm-none-eabi-gcc &>/dev/null; then
+  SETUP_PATH="${ROOT_DIR}/examples/arm/arm-scratch/setup_path.sh"
+  if [ -f "${SETUP_PATH}" ]; then
+    source "${SETUP_PATH}"
+  fi
+  if ! command -v arm-none-eabi-gcc &>/dev/null; then
+    # setup_path.sh was missing or did not expose the compiler: search for it
+    TOOLCHAIN_BIN=$(find "${ROOT_DIR}/examples/arm/arm-scratch" -name "arm-none-eabi-gcc" -type f 2>/dev/null | head -1 || true)
+    if [ -z "${TOOLCHAIN_BIN:-}" ]; then
+      echo "Error: arm-none-eabi-gcc not found. Run: ./examples/arm/setup.sh --i-agree-to-the-contained-eula" >&2
+      exit 1
+    fi
+    export PATH="$(dirname "${TOOLCHAIN_BIN}"):${PATH}"
+  fi
+fi
+
+echo "Using ARM toolchain: $(command -v arm-none-eabi-gcc)"
+
 # Default model
 DEFAULT_MODEL="default_model.pte"
 
 usage() {
-  echo "Usage: $0 [--clean] [--model=path/to/model.pte]"
+  echo "Usage: $0 [--clean] [--cmsis] [--model=path/to/model.pte]"
   echo "  --clean           Clean build directories"
+  echo "  --cmsis           Build with CMSIS-NN INT8 kernels (requires cortex_m backend)"
   echo "  --model=FILE      Specify model file to embed (relative to pico2/)"
   exit 1
 }
@@ -30,6 +59,7 @@ usage() {
 # Parse args
 MODEL_INPUT=""
 CLEAN_BUILD=0
+USE_CMSIS=0
 
 for arg in "$@"; do
   case $arg in
@@ -37,6 +67,10 @@ for arg in "$@"; do
       CLEAN_BUILD=1
       shift
       ;;
+    --cmsis)
+      USE_CMSIS=1
+      shift
+      ;;
     --model=*)
       MODEL_INPUT="${arg#*=}"
       shift
@@ -68,21 +102,28 @@ if [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then
   echo "Using selective build from model: ${MODEL_ABS_PATH}"
 fi
 
+CMSIS_FLAGS=()
+if [ $USE_CMSIS -eq 1 ]; then
+  echo "CMSIS-NN mode: building with Cortex-M backend and CMSIS-NN kernels"
+  CMSIS_FLAGS=(
+    -DEXECUTORCH_BUILD_CORTEX_M=ON
+  )
+fi
+
 cmake -B "${EXECUTORCH_BUILD_DIR}" \
   -DCMAKE_TOOLCHAIN_FILE="${ROOT_DIR}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake" \
-  -DTARGET_CPU=cortex-m0plus \
+  -DTARGET_CPU=cortex-m33+nofp \
   -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
   -DEXECUTORCH_PAL_DEFAULT=minimal \
-  -DEXECUTORCH_DTYPE_SELECTIVE_BUILD=ON \
   -DCMAKE_BUILD_TYPE=MinSizeRel \
   -DEXECUTORCH_ENABLE_LOGGING=OFF \
-  -DEXECUTORCH_SELECT_ALL_OPS=OFF \
   -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
   -DCMAKE_INSTALL_PREFIX="${EXECUTORCH_BUILD_DIR}" \
   ${SELECT_OPS_FLAGS} \
+  ${CMSIS_FLAGS[@]+"${CMSIS_FLAGS[@]}"} \
   "${ROOT_DIR}"
 
-cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j$(nproc)
+cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j${NPROC}
 
 echo "ExecuTorch cross compile complete."
 
@@ -90,6 +131,12 @@ echo "ExecuTorch cross compile complete."
 
 cd "${PICO2_DIR}"
 
+PICO_CMAKE_FLAGS=(-DPICO_BOARD=pico2 -DCMAKE_BUILD_TYPE=Release)
+
+if [ $USE_CMSIS -eq 1 ]; then
+  PICO_CMAKE_FLAGS+=(-DUSE_CMSIS_NN=ON)
+fi
+
 if [ -n "$MODEL_INPUT" ]; then
   # Use specified model
   if [ ! -f "${MODEL_INPUT}" ]; then
@@ -97,13 +144,15 @@ if [ -n "$MODEL_INPUT" ]; then
     exit 1
   fi
   echo "Building firmware with model: ${MODEL_INPUT}"
-  cmake -B "${BUILD_DIR}" -DPICO_BOARD=pico2 -DINPUT_MODEL="./${MODEL_INPUT}" -DCMAKE_BUILD_TYPE=Release
+  PICO_CMAKE_FLAGS+=(-DINPUT_MODEL="./${MODEL_INPUT}")
 else
   # Use default model
   echo "Building firmware with default model: ${DEFAULT_MODEL}"
-  cmake -B "${BUILD_DIR}" -DPICO_BOARD=pico2 -DINPUT_MODEL="./${DEFAULT_MODEL}" -DCMAKE_BUILD_TYPE=Release
+  PICO_CMAKE_FLAGS+=(-DINPUT_MODEL="./${DEFAULT_MODEL}")
 fi
 
-cmake --build "${BUILD_DIR}" -j$(nproc)
+cmake -B "${BUILD_DIR}" "${PICO_CMAKE_FLAGS[@]}"
+
+cmake --build "${BUILD_DIR}" -j${NPROC}
 
 echo "Firmware build complete. Output in ${BUILD_DIR}, Binary: executorch_pico.uf2"
diff --git a/examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py b/examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py
new file mode 100644
index 00000000000..43ff4a41229
--- /dev/null
+++ b/examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Export the TinyMLP MNIST model with INT8 quantization for CMSIS-NN acceleration.
+
+Uses the CortexMQuantizer to produce INT8 quantized ops that map to CMSIS-NN
+kernels on Cortex-M33 (RP2350/Pico2). The model I/O stays float — quantize and
+dequantize nodes are inserted inside the graph.
+
+Usage:
+    python export_mlp_mnist_cmsis.py
+    python export_mlp_mnist_cmsis.py --output my_model.pte
+    python export_mlp_mnist_cmsis.py --num-calibration 200
+"""
+
+import argparse
+import logging
+import os
+
+import torch
+
+from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
+from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
+from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
+from executorch.extension.export_util.utils import save_pte_program
+
+from export_mlp_mnist import create_balanced_model, IMAGE_SIZE, test_comprehensive
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+
+FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
+logging.basicConfig(level=logging.INFO, format=FORMAT)
+logger = logging.getLogger(__name__)
+
+
+def get_calibration_data(num_samples: int = 100):
+    """
+    Build the list of calibration inputs fed to the quantization observers.
+
+    The first ``num_samples // 2`` tensors contain one vertical and one
+    horizontal stroke at random positions; the remaining tensors are random
+    binary pixel masks. Every tensor has shape (1, IMAGE_SIZE, IMAGE_SIZE).
+    """
+    num_structured = num_samples // 2
+    samples = []
+
+    # Stroke images: a 3-pixel-wide cross whose centre is drawn from [5, 23)
+    for _ in range(num_structured):
+        image = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
+        col = torch.randint(5, 23, (1,)).item()
+        row = torch.randint(5, 23, (1,)).item()
+        image[0, 2:26, col - 1 : col + 2] = 1.0  # vertical stroke
+        image[0, row - 1 : row + 2, 5:23] = 1.0  # horizontal stroke
+        samples.append(image)
+
+    # Noise images: each pixel is 1.0 with probability 0.3, otherwise 0.0
+    for _ in range(num_samples - num_structured):
+        samples.append((torch.rand(1, IMAGE_SIZE, IMAGE_SIZE) > 0.7).float())
+    return samples
+
+
+def quantize_model(model, calibration_data):
+    """Quantize ``model`` with CortexMQuantizer; return (quantized, example_input).
+
+    Raises ValueError on empty ``calibration_data``: sample 0 is the export input.
+    """
+    if not calibration_data:
+        raise ValueError("calibration_data must contain at least one sample")
+    quantizer = CortexMQuantizer()
+    example_input = calibration_data[0]
+    graph_module = torch.export.export(model, (example_input,)).module()
+    prepared = prepare_pt2e(graph_module, quantizer)
+    logger.info(f"Calibrating with {len(calibration_data)} samples...")
+    with torch.no_grad():
+        for i, data in enumerate(calibration_data, start=1):
+            prepared(data)
+            if i % 25 == 0:
+                logger.info(f"  Calibrated {i}/{len(calibration_data)} samples")
+    return convert_pt2e(prepared), example_input
+
+
+def export_to_pte(quantized_model, example_input, output_path: str):
+    """Export ``quantized_model`` and save it to ``output_path`` via save_pte_program."""
+    exported_program = torch.export.export(quantized_model, (example_input,))
+    # NOTE(review): presumably keeps aten.linear intact for the Cortex-M passes
+    edge_config = EdgeCompileConfig(
+        _check_ir_validity=False,
+        preserve_ops=[torch.ops.aten.linear.default],
+    )
+    edge_program = to_edge(exported_program, compile_config=edge_config)
+    logger.info("Edge program created")
+
+    logger.info("Applying Cortex-M optimization passes...")
+    pass_manager = CortexMPassManager(edge_program.exported_program())
+    transformed_ep = pass_manager.transform()
+    # The transformed program is wrapped by to_edge again before to_executorch()
+    edge_program = to_edge(transformed_ep, compile_config=edge_config)
+    logger.info("Converting to ExecuTorch format...")
+    exec_program = edge_program.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=False)
+    )
+    # Save the program, then log the size of the file at output_path in KB
+    save_pte_program(exec_program, output_path)
+    file_size = os.path.getsize(output_path)
+    logger.info(f"Model saved to {output_path} ({file_size / 1024:.1f} KB)")
+
+
+def main():
+    """CLI entry point: build the model, quantize it, smoke-test it and export it."""
+    parser = argparse.ArgumentParser(
+        description="Export TinyMLP MNIST for Cortex-M with CMSIS-NN (INT8)"
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="balanced_tiny_mlp_mnist_cmsis.pte",
+        help="Output .pte file path",
+    )
+    parser.add_argument(
+        "--num-calibration",
+        type=int,
+        default=100,
+        help="Number of calibration samples for quantization",
+    )
+    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.getLogger().setLevel(logging.DEBUG)
+    # Create the FP32 model, switch it to eval mode and run its test routine
+    logger.info("Creating balanced MLP MNIST model...")
+    model = create_balanced_model()
+    model.eval()
+    logger.info("Testing FP32 model before quantization:")
+    test_comprehensive(model)
+    # Calibrate on the synthetic inputs, then convert to the quantized model
+    calibration_data = get_calibration_data(args.num_calibration)
+    quantized_model, example_input = quantize_model(model, calibration_data)
+    # Smoke test: one vertical bar in columns 13-15; the result is only logged
+    logger.info("Testing quantized model:")
+    with torch.no_grad():
+        test_input = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
+        test_input[0, 2:26, 13:16] = 1.0  # digit-1-like pattern
+        output = quantized_model(test_input)
+        pred = output.argmax(dim=1).item()
+        logger.info(f"  Digit-1 pattern -> predicted: {pred}")
+
+    export_to_pte(quantized_model, example_input, args.output)
+    logger.info("Export complete!")
+    logger.info(f"Input shape: (1, {IMAGE_SIZE}, {IMAGE_SIZE})")
+    logger.info("Input format: Float [0.0, 1.0] (same as FP32 variant)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/raspberry_pi/pico2/main.cpp b/examples/raspberry_pi/pico2/main.cpp
index 8219e3bfc52..08f03709f4f 100644
--- a/examples/raspberry_pi/pico2/main.cpp
+++ b/examples/raspberry_pi/pico2/main.cpp
@@ -11,6 +11,7 @@
 // Pico includes
 #include "pico/stdio_usb.h"
 #include "pico/stdlib.h"
+#include "pico/time.h"
 
 // Executorch includes
 #include <executorch/extension/data_loader/buffer_data_loader.h>
@@ -244,6 +245,8 @@ bool run_inference(Method& method) {
 
   printf("🧪 Testing all supported digits:\n\n");
 
+  uint32_t latencies_us[4] = {0};
+
   for (int test = 0; test < 4; test++) {
     const char** ascii_digit = test_cases[test].pattern;
     const char* digit_name = test_cases[test].name;
@@ -294,12 +297,18 @@ bool run_inference(Method& method) {
       return false;
     }
 
+    uint32_t start_us = time_us_32();
     result = method.execute();
+    uint32_t elapsed_us = time_us_32() - start_us;
+    latencies_us[test] = elapsed_us;
+
     if (result != Error::Ok) {
       printf("❌ Failed to execute: error %d\n", (int)result);
       return false;
     }
 
+    printf("⏱️  Inference time: %lu us\n", (unsigned long)elapsed_us);
+
     auto output_evalue = method.get_output(0);
     if (!output_evalue.isTensor()) {
       printf("❌ Output is not a tensor\n");
@@ -340,6 +349,16 @@ bool run_inference(Method& method) {
     printf("\n==================================================\n\n");
   }
 
+  // Report each test case's execute() latency, then the mean of the four runs
+  printf("📊 Inference latency summary:\n");
+  uint32_t total_us = 0;
+  for (int i = 0; i < 4; i++) {
+    const unsigned long latency = latencies_us[i];
+    printf("  %s: %lu us\n", test_cases[i].name, latency);
+    total_us += latencies_us[i];
+  }
+  printf("  Average: %lu us\n\n", (unsigned long)(total_us / 4));
+
   printf(
       "🎉 All tests complete! ExecuTorch inference of neural network works on Pico2!\n");
   return true;
@@ -373,6 +392,21 @@ int executor_runner() {
     printf("Failed to load and prepare model\n");
     return 1;
   }
+
+  // Probe method-allocator usage: a 1-byte allocation is expected to land at
+  // the allocator's current position (TODO confirm); null is shown as "full".
+  void* probe = method_allocator.allocate(1, 1);
+  const unsigned long method_total = sizeof(method_allocator_pool);
+  const unsigned long method_used = probe
+      ? (uint32_t)((uint8_t*)probe - method_allocator_pool)
+      : method_total;
+  printf("📊 Memory usage after method load:\n");
+  printf(
+      "   Method allocator: %lu / %lu bytes used\n", method_used, method_total);
+  printf(
+      "   Activation pool: %lu bytes allocated\n",
+      (unsigned long)sizeof(activation_pool));
+
   if (!run_inference(*method_ptr)) {
     printf("Failed to run inference\n");
     return 1;