pytorch
diff --git a/‎backends/xnnpack/runtime/XNNCompiler.cpp‎
Lines changed: 73 additions & 16 deletions b/‎backends/xnnpack/runtime/XNNCompiler.cpp‎
Lines changed: 73 additions & 16 deletions
diff --git a/‎backends/xnnpack/runtime/XNNHeader.cpp‎
Lines changed: 43 additions & 0 deletions b/‎backends/xnnpack/runtime/XNNHeader.cpp‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎docs/source/pico2_tutorial.md‎
Lines changed: 69 additions & 3 deletions b/‎docs/source/pico2_tutorial.md‎
Lines changed: 69 additions & 3 deletions
@@ -170,10 +170,12 @@ std::vector<T> flatbufferDimsToVector(
 /**
 Gets the constant data pointer associated with the given tensor value.
 Obtaining the constant data pointer can either be from within the flatbuffer
-payload (deprecated) or via offsets to the constant_data_ptr. If no constant
-data associated with the tensor value, then returns nullptr.
+payload (deprecated) or via offsets to the constant_data_ptr.
+
+Failures are returned as an Error, and the successful value may be nullptr
+when the tensor has no associated constant data.
 */
-const uint8_t* getConstantDataPtr(
+Result<const uint8_t*> getConstantDataPtr(
     uint32_t buffer_idx,
     GraphPtr flatbuffer_graph,
     const uint8_t* constant_data_ptr,
@@ -184,26 +186,56 @@ const uint8_t* getConstantDataPtr(
     if (!constant_data_ptr) {
       // TODO(T172265611): Remove constant_buffer in flatbuffer path after BC
       // window
-      const auto& constant_buffer = *flatbuffer_graph->constant_buffer();
-      return constant_buffer[buffer_idx]->storage()->data();
+      auto* cb = flatbuffer_graph->constant_buffer();
+      ET_CHECK_OR_RETURN_ERROR(
+          cb != nullptr, InvalidProgram, "constant_buffer is null");
+      ET_CHECK_OR_RETURN_ERROR(
+          buffer_idx < cb->size(),
+          InvalidProgram,
+          "buffer_idx %u out of bounds for constant_buffer of size %zu",
+          buffer_idx,
+          cb->size());
+      auto* buffer_entry = (*cb)[buffer_idx];
+      ET_CHECK_OR_RETURN_ERROR(
+          buffer_entry != nullptr && buffer_entry->storage() != nullptr,
+          InvalidProgram,
+          "Null constant_buffer entry at buffer_idx %u",
+          buffer_idx);
+      return buffer_entry->storage()->data();
     } else {
-      ConstantDataOffsetPtr constant_data_offset =
-          flatbuffer_graph->constant_data()->Get(buffer_idx);
+      auto* cd = flatbuffer_graph->constant_data();
+      ET_CHECK_OR_RETURN_ERROR(
+          cd != nullptr, InvalidProgram, "constant_data is null");
+      ET_CHECK_OR_RETURN_ERROR(
+          buffer_idx < cd->size(),
+          InvalidProgram,
+          "buffer_idx %u out of bounds for constant_data of size %zu",
+          buffer_idx,
+          cd->size());
+      ConstantDataOffsetPtr constant_data_offset = cd->Get(buffer_idx);
+      ET_CHECK_OR_RETURN_ERROR(
+          constant_data_offset != nullptr,
+          InvalidProgram,
+          "Null constant_data entry at buffer_idx %u",
+          buffer_idx);
       uint64_t offset = constant_data_offset->offset();
-
       bool has_named_key = flatbuffers::IsFieldPresent(
           constant_data_offset, fb_xnnpack::ConstantDataOffset::VT_NAMED_KEY);
       // If there is no tensor name
       if (!has_named_key) {
         return constant_data_ptr + offset;
       } else {
+        ET_CHECK_OR_RETURN_ERROR(
+            constant_data_offset->named_key() != nullptr,
+            InvalidProgram,
+            "Named key is null");
         const std::string& data_name = constant_data_offset->named_key()->str();
 #ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
         Result<const uint8_t*> data_ptr =
             weights_cache->load_unpacked_data(data_name);
         if (!data_ptr.ok()) {
           ET_LOG(Error, "Failed to load weights from cache");
-          return nullptr;
+          return data_ptr.error();
         }
         return data_ptr.get();
 #else
@@ -215,7 +247,7 @@ const uint8_t* getConstantDataPtr(
               "Failed to get constant data for key %s from named_data_map. Error code: %u",
               data_name.c_str(),
               static_cast<uint32_t>(buffer.error()));
-          return nullptr;
+          return buffer.error();
         }
         const uint8_t* data_ptr =
             static_cast<const uint8_t*>(buffer.get().data());
@@ -229,7 +261,7 @@ const uint8_t* getConstantDataPtr(
   return nullptr;
 }
 
-const uint8_t* getConstantDataPtr(
+Result<const uint8_t*> getConstantDataPtr(
     const fb_xnnpack::XNNTensorValue* tensor_value,
     GraphPtr flatbuffer_graph,
     const uint8_t* constant_data_ptr,
@@ -298,13 +330,17 @@ Error defineTensor(
 
   // Get a pointer to the constant data from the flatbuffer; if the tensor is
   // non-constant, it is a nullptr
-  const uint8_t* buffer_ptr = getConstantDataPtr(
+  auto buffer_result = getConstantDataPtr(
       tensor_value,
       flatbuffer_graph,
       constant_data_ptr,
       named_data_map,
       freeable_buffers,
       weights_cache);
+  if (!buffer_result.ok()) {
+    return buffer_result.error();
+  }
+  const uint8_t* buffer_ptr = buffer_result.get();
 
   xnn_status status;
   // The type we might have to convert to
@@ -449,13 +485,17 @@ Error defineTensor(
         const float* scale = qparams->scale()->data();
 
         if (qparams->scale_buffer_idx() != 0) {
-          scale = reinterpret_cast<const float*>(getConstantDataPtr(
+          auto scale_result = getConstantDataPtr(
               qparams->scale_buffer_idx(),
               flatbuffer_graph,
               constant_data_ptr,
               named_data_map,
               freeable_buffers,
-              weights_cache));
+              weights_cache);
+          if (!scale_result.ok()) {
+            return scale_result.error();
+          }
+          scale = reinterpret_cast<const float*>(scale_result.get());
           ET_CHECK_OR_RETURN_ERROR(
               scale != nullptr, Internal, "Failed to load scale data.");
         }
@@ -491,13 +531,18 @@ Error defineTensor(
         // Block scales are preferably serialized as bf16 but can also be
         // serialized as fp32 for backwards compatibility.
         if (qparams->scale_buffer_idx() != 0) {
-          scale_data = reinterpret_cast<const uint16_t*>(getConstantDataPtr(
+          auto scale_data_result = getConstantDataPtr(
               qparams->scale_buffer_idx(),
               flatbuffer_graph,
               constant_data_ptr,
               named_data_map,
               freeable_buffers,
-              weights_cache));
+              weights_cache);
+          if (!scale_data_result.ok()) {
+            return scale_data_result.error();
+          }
+          scale_data =
+              reinterpret_cast<const uint16_t*>(scale_data_result.get());
           ET_CHECK_OR_RETURN_ERROR(
               scale_data != nullptr, Internal, "Failed to load scale data.");
           scale_numel = qparams->num_scales();
@@ -1816,16 +1861,19 @@ ET_NODISCARD Error XNNCompiler::compileModel(
   Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
   const uint8_t* flatbuffer_data = nullptr;
   const uint8_t* constant_data = nullptr;
+  size_t flatbuffer_size = 0;
   CompileAllocator compile_allocator;
 
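   // Header parsing yields Error::Ok, Error::NotFound (no header: the whole
   // buffer is treated as the flatbuffer), or another error, which is
   // propagated to the caller below.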
   if (header.ok()) {
     flatbuffer_data = reinterpret_cast<const uint8_t*>(buffer_pointer) +
         header->flatbuffer_offset;
+    flatbuffer_size = header->flatbuffer_size;
     constant_data = reinterpret_cast<const uint8_t*>(buffer_pointer) +
         header->constant_data_offset;
   } else if (header.error() == Error::NotFound) {
     flatbuffer_data = reinterpret_cast<const uint8_t*>(buffer_pointer);
+    flatbuffer_size = num_bytes;
   } else {
     ET_LOG(Error, "XNNHeader may be corrupt");
     return header.error();
@@ -1843,6 +1891,15 @@ ET_NODISCARD Error XNNCompiler::compileModel(
       "XNNPACK Delegate Serialization Format version identifier '%.4s' != expected XN00 or XN01'",
       flatbuffers::GetBufferIdentifier(flatbuffer_data));
 
+  // Verify the FlatBuffer data integrity before accessing it. Without this,
+  // malformed data could cause out-of-bounds reads when traversing the
+  // FlatBuffer's internal offset tables.
+  flatbuffers::Verifier verifier(flatbuffer_data, flatbuffer_size);
+  ET_CHECK_OR_RETURN_ERROR(
+      verifier.VerifyBuffer<fb_xnnpack::XNNGraph>(nullptr),
+      DelegateInvalidCompatibility,
+      "FlatBuffer verification failed; data may be truncated or corrupt");
+
   auto flatbuffer_graph = fb_xnnpack::GetXNNGraph(flatbuffer_data);
   ET_CHECK_OR_RETURN_ERROR(
       flatbuffer_graph != nullptr && flatbuffer_graph->xvalues() != nullptr &&
 
@@ -8,6 +8,7 @@
 
 #include <executorch/backends/xnnpack/runtime/XNNHeader.h>
 
+#include <cinttypes>
 #include <cstring>
 
 #include <executorch/runtime/core/error.h>
@@ -64,6 +65,48 @@ Result<XNNHeader> XNNHeader::Parse(const void* data, size_t size) {
   uint64_t constant_data_size =
       GetUInt64LE(header_data + XNNHeader::kConstantDataSizeOffset);
 
+  // Validate min flatbuffer size.
+  constexpr size_t kMinFlatbufferSize =
+      sizeof(uint32_t) + 4; // root offset + identifier
+  ET_CHECK_OR_RETURN_ERROR(
+      flatbuffer_size >= kMinFlatbufferSize,
+      InvalidArgument,
+      "flatbuffer_size %" PRIu32 " is too small (minimum %zu)",
+      flatbuffer_size,
+      kMinFlatbufferSize);
+
+  // Validate that flatbuffer region does not overflow or exceed the buffer.
+  ET_CHECK_OR_RETURN_ERROR(
+      flatbuffer_offset <= size && flatbuffer_size <= size - flatbuffer_offset,
+      InvalidArgument,
+      "flatbuffer_offset: %" PRIu32 " and flatbuffer_size: %" PRIu32
+      " are invalid for buffer of size: %zu",
+      flatbuffer_offset,
+      flatbuffer_size,
+      size);
+  // Validate that constant data region does not overflow or exceed the buffer.
+  ET_CHECK_OR_RETURN_ERROR(
+      constant_data_offset <= size &&
+          constant_data_size <= size - constant_data_offset,
+      InvalidArgument,
+      "constant_data_offset: %" PRIu32 " and constant_data_size: %" PRIu64
+      " are invalid for buffer of size: %zu",
+      constant_data_offset,
+      constant_data_size,
+      size);
+
+  // Validate that constant data region does not overlap with flatbuffer region.
+  // flatbuffer should come before constant data.
+  ET_CHECK_OR_RETURN_ERROR(
+      constant_data_offset >= flatbuffer_offset &&
+          constant_data_offset - flatbuffer_offset >= flatbuffer_size,
+      InvalidArgument,
+      "constant_data_offset: %" PRIu32 " and flatbuffer_offset: %" PRIu32
+      " with flatbuffer_size: %" PRIu32 " are overlapping.",
+      constant_data_offset,
+      flatbuffer_offset,
+      flatbuffer_size);
+
   return XNNHeader{
       flatbuffer_offset,
       flatbuffer_size,
 
@@ -9,6 +9,7 @@ A 28×28 MNIST digit classifier running on memory constrained, low power microco
 - Input: ASCII art digits (0, 1, 4, 7)
 - Output: Real-time predictions via USB serial
 - Memory: <400KB total footprint
+- Two variants: FP32 (portable ops) and INT8 (CMSIS-NN accelerated)
 
 ## Prerequisites
 
@@ -24,29 +25,63 @@ which arm-none-eabi-gcc # --> arm/arm-scratch/arm-gnu-toolchain-13.3.rel1-x86_64
 
 ## Step 1: Generate pte from given example Model
 
+### FP32 model (default)
+
 - Use the [provided example model](https://github.com/pytorch/executorch/blob/main/examples/raspberry_pi/pico2/export_mlp_mnist.py)
 
 ```bash
+cd examples/raspberry_pi/pico2
 python export_mlp_mnist.py # Creates balanced_tiny_mlp_mnist.pte
 ```
 
 - **Note:** This is a hand-crafted MNIST classifier (proof-of-concept), not a production-trained one. This tiny MLP recognizes digits 0, 1, 4, and 7 using manually designed feature detectors.
 
+### INT8 quantized model (CMSIS-NN accelerated)
+
+- Use the [CMSIS-NN export script](https://github.com/pytorch/executorch/blob/main/examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py)
+
+```bash
+cd examples/raspberry_pi/pico2
+python export_mlp_mnist_cmsis.py # Creates balanced_tiny_mlp_mnist_cmsis.pte
+```
+
+This uses the `CortexMQuantizer` to produce INT8 quantized ops that map to CMSIS-NN kernels on Cortex-M33. The model I/O stays float — quantize and dequantize nodes are inserted inside the graph.
+
 ## Step 2: Build Firmware for Pico2
 
+### FP32 build
+
 ```bash
 # Generate model (Creates balanced_tiny_mlp_mnist.pte)
 cd ./examples/raspberry_pi/pico2
 python export_mlp_mnist.py
 cd -
 
 # Build Pico2 firmware (one command!)
+./examples/raspberry_pi/pico2/build_firmware_pico.sh --model=balanced_tiny_mlp_mnist.pte
+```
+
+### INT8 CMSIS-NN build
+
+```bash
+# Generate INT8 quantized model
+cd ./examples/raspberry_pi/pico2
+python export_mlp_mnist_cmsis.py
+cd -
 
-./examples/raspberry_pi/pico2/build_firmware_pico.sh --model=balanced_tiny_mlp_mnist.pte   # This creates executorch_pico.uf2, a firmware image for Pico2
+# Build with CMSIS-NN backend
+./examples/raspberry_pi/pico2/build_firmware_pico.sh --cmsis --model=balanced_tiny_mlp_mnist_cmsis.pte
 ```
 
 Output: **executorch_pico.uf2** firmware file (examples/raspberry_pi/pico2/build/)
 
+**Script options:**
+| Flag | Description |
+|------|-------------|
+| `--model=FILE` | Specify model file to embed (relative to pico2/) |
+| `--cmsis` | Build with CMSIS-NN INT8 kernels for Cortex-M33 acceleration |
+| `--clean` | Clean build directories and exit; run separately before building if needed |
+
 **Note:** The '[build_firmware_pico.sh](https://github.com/pytorch/executorch/blob/main/examples/raspberry_pi/pico2/build_firmware_pico.sh)' script converts the given model pte to a hex array and generates C code for it via this helper [script](https://github.com/pytorch/executorch/blob/main/examples/raspberry_pi/pico2/pte_to_array.py). This C code is then compiled to generate the final .uf2 binary, which is then flashed to the Pico2.
 
 ## Step 3: Flash to Pico2
@@ -72,6 +107,10 @@ screen /dev/tty.usbmodem1101 115200
 
 Something like:
 
+📊 Memory usage after method load:
+   Method allocator: 45632 / 204800 bytes used
+   Activation pool: 204800 bytes allocated
+
 === Digit 7 ===
 ############################
 ############################
@@ -104,6 +143,7 @@ Something like:
 
 Input stats: 159 white pixels out of 784 total
 Running neural network inference...
+⏱️  Inference time: 245 us
 ✅ Neural network results:
   Digit 0: 370.000
   Digit 1: 0.000
@@ -116,7 +156,16 @@ Running neural network inference...
   Digit 8: -3.000
   Digit 9: -3.000
 
-� PREDICTED: 7 (Expected: 7) ✅ CORRECT!
+🎯 PREDICTED: 7 (Expected: 7) ✅ CORRECT!
+
+==================================================
+
+📊 Inference latency summary:
+  Digit 0: 312 us
+  Digit 1: 198 us
+  Digit 4: 267 us
+  Digit 7: 245 us
+  Average: 255 us
 ```
 
 ## Memory Optimization Tips
@@ -184,12 +233,29 @@ arm-none-eabi-objdump -t examples/raspberry_pi/pico2/build/executorch_pico.elf |
 arm-none-eabi-readelf -l examples/raspberry_pi/pico2/build/executorch_pico.elf
 ```
 
+## CMSIS-NN INT8 Acceleration
+
+The Pico2 uses an RP2350 SoC with a Cortex-M33 core. The CMSIS-NN library provides optimized INT8 kernels that leverage the Cortex-M33's DSP instructions for faster inference compared to FP32 portable ops.
+
+### How it works
+
+1. `export_mlp_mnist_cmsis.py` uses `CortexMQuantizer` to quantize the model to INT8
+2. The model I/O remains float — quantize/dequantize nodes are inserted inside the graph
+3. `--cmsis` flag builds ExecuTorch with the Cortex-M backend and links CMSIS-NN kernels
+4. At runtime, quantized linear ops dispatch to CMSIS-NN instead of portable kernels
+
+### When to use CMSIS-NN
+
+- Lower latency on supported ops (linear, conv2d)
+- Smaller model size (INT8 weights vs FP32)
+- Trade-off: slight accuracy loss from quantization
+
 ## Next Steps
 
 ### Scale up your deployment
 
 - Use a real, production-trained model
-- Optimize further → INT8 quantization, pruning
+- Optimize further → INT8 quantization with CMSIS-NN, pruning
 
 ### Happy Inference!