Skip to content

Commit 1af2029

Browse files
authored
Merge branch 'main' into digantdesai/sdpa-bench-and-perf-stats
2 parents f02b19a + 841181e commit 1af2029

5 files changed

Lines changed: 250 additions & 31 deletions

File tree

backends/xnnpack/runtime/XNNCompiler.cpp

Lines changed: 73 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -170,10 +170,12 @@ std::vector<T> flatbufferDimsToVector(
170170
/**
171171
Gets the constant data pointer associated with the given tensor value.
172172
Obtaining the constant data pointer can either be from within the flatbuffer
173-
payload (deprecated) or via offsets to the constant_data_ptr. If no constant
174-
data associated with the tensor value, then returns nullptr.
173+
payload (deprecated) or via offsets to the constant_data_ptr.
174+
175+
Failures are returned as an Error, and the successful value may be nullptr
176+
when the tensor has no associated constant data.
175177
*/
176-
const uint8_t* getConstantDataPtr(
178+
Result<const uint8_t*> getConstantDataPtr(
177179
uint32_t buffer_idx,
178180
GraphPtr flatbuffer_graph,
179181
const uint8_t* constant_data_ptr,
@@ -184,26 +186,56 @@ const uint8_t* getConstantDataPtr(
184186
if (!constant_data_ptr) {
185187
// TODO(T172265611): Remove constant_buffer in flatbuffer path after BC
186188
// window
187-
const auto& constant_buffer = *flatbuffer_graph->constant_buffer();
188-
return constant_buffer[buffer_idx]->storage()->data();
189+
auto* cb = flatbuffer_graph->constant_buffer();
190+
ET_CHECK_OR_RETURN_ERROR(
191+
cb != nullptr, InvalidProgram, "constant_buffer is null");
192+
ET_CHECK_OR_RETURN_ERROR(
193+
buffer_idx < cb->size(),
194+
InvalidProgram,
195+
"buffer_idx %u out of bounds for constant_buffer of size %zu",
196+
buffer_idx,
197+
cb->size());
198+
auto* buffer_entry = (*cb)[buffer_idx];
199+
ET_CHECK_OR_RETURN_ERROR(
200+
buffer_entry != nullptr && buffer_entry->storage() != nullptr,
201+
InvalidProgram,
202+
"Null constant_buffer entry at buffer_idx %u",
203+
buffer_idx);
204+
return buffer_entry->storage()->data();
189205
} else {
190-
ConstantDataOffsetPtr constant_data_offset =
191-
flatbuffer_graph->constant_data()->Get(buffer_idx);
206+
auto* cd = flatbuffer_graph->constant_data();
207+
ET_CHECK_OR_RETURN_ERROR(
208+
cd != nullptr, InvalidProgram, "constant_data is null");
209+
ET_CHECK_OR_RETURN_ERROR(
210+
buffer_idx < cd->size(),
211+
InvalidProgram,
212+
"buffer_idx %u out of bounds for constant_data of size %zu",
213+
buffer_idx,
214+
cd->size());
215+
ConstantDataOffsetPtr constant_data_offset = cd->Get(buffer_idx);
216+
ET_CHECK_OR_RETURN_ERROR(
217+
constant_data_offset != nullptr,
218+
InvalidProgram,
219+
"Null constant_data entry at buffer_idx %u",
220+
buffer_idx);
192221
uint64_t offset = constant_data_offset->offset();
193-
194222
bool has_named_key = flatbuffers::IsFieldPresent(
195223
constant_data_offset, fb_xnnpack::ConstantDataOffset::VT_NAMED_KEY);
196224
// If there is no tensor name
197225
if (!has_named_key) {
198226
return constant_data_ptr + offset;
199227
} else {
228+
ET_CHECK_OR_RETURN_ERROR(
229+
constant_data_offset->named_key() != nullptr,
230+
InvalidProgram,
231+
"Named key is null");
200232
const std::string& data_name = constant_data_offset->named_key()->str();
201233
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
202234
Result<const uint8_t*> data_ptr =
203235
weights_cache->load_unpacked_data(data_name);
204236
if (!data_ptr.ok()) {
205237
ET_LOG(Error, "Failed to load weights from cache");
206-
return nullptr;
238+
return data_ptr.error();
207239
}
208240
return data_ptr.get();
209241
#else
@@ -215,7 +247,7 @@ const uint8_t* getConstantDataPtr(
215247
"Failed to get constant data for key %s from named_data_map. Error code: %u",
216248
data_name.c_str(),
217249
static_cast<uint32_t>(buffer.error()));
218-
return nullptr;
250+
return buffer.error();
219251
}
220252
const uint8_t* data_ptr =
221253
static_cast<const uint8_t*>(buffer.get().data());
@@ -229,7 +261,7 @@ const uint8_t* getConstantDataPtr(
229261
return nullptr;
230262
}
231263

232-
const uint8_t* getConstantDataPtr(
264+
Result<const uint8_t*> getConstantDataPtr(
233265
const fb_xnnpack::XNNTensorValue* tensor_value,
234266
GraphPtr flatbuffer_graph,
235267
const uint8_t* constant_data_ptr,
@@ -298,13 +330,17 @@ Error defineTensor(
298330

299331
// Get pointer to constant data from flatbuffer; if it's non-constant,
300332
// it is a nullptr
301-
const uint8_t* buffer_ptr = getConstantDataPtr(
333+
auto buffer_result = getConstantDataPtr(
302334
tensor_value,
303335
flatbuffer_graph,
304336
constant_data_ptr,
305337
named_data_map,
306338
freeable_buffers,
307339
weights_cache);
340+
if (!buffer_result.ok()) {
341+
return buffer_result.error();
342+
}
343+
const uint8_t* buffer_ptr = buffer_result.get();
308344

309345
xnn_status status;
310346
// The type we might have to convert to
@@ -449,13 +485,17 @@ Error defineTensor(
449485
const float* scale = qparams->scale()->data();
450486

451487
if (qparams->scale_buffer_idx() != 0) {
452-
scale = reinterpret_cast<const float*>(getConstantDataPtr(
488+
auto scale_result = getConstantDataPtr(
453489
qparams->scale_buffer_idx(),
454490
flatbuffer_graph,
455491
constant_data_ptr,
456492
named_data_map,
457493
freeable_buffers,
458-
weights_cache));
494+
weights_cache);
495+
if (!scale_result.ok()) {
496+
return scale_result.error();
497+
}
498+
scale = reinterpret_cast<const float*>(scale_result.get());
459499
ET_CHECK_OR_RETURN_ERROR(
460500
scale != nullptr, Internal, "Failed to load scale data.");
461501
}
@@ -491,13 +531,18 @@ Error defineTensor(
491531
// Block scales are preferably serialized as bf16 but can also be
492532
serialized as fp32 for backwards compatibility.
493533
if (qparams->scale_buffer_idx() != 0) {
494-
scale_data = reinterpret_cast<const uint16_t*>(getConstantDataPtr(
534+
auto scale_data_result = getConstantDataPtr(
495535
qparams->scale_buffer_idx(),
496536
flatbuffer_graph,
497537
constant_data_ptr,
498538
named_data_map,
499539
freeable_buffers,
500-
weights_cache));
540+
weights_cache);
541+
if (!scale_data_result.ok()) {
542+
return scale_data_result.error();
543+
}
544+
scale_data =
545+
reinterpret_cast<const uint16_t*>(scale_data_result.get());
501546
ET_CHECK_OR_RETURN_ERROR(
502547
scale_data != nullptr, Internal, "Failed to load scale data.");
503548
scale_numel = qparams->num_scales();
@@ -1816,16 +1861,19 @@ ET_NODISCARD Error XNNCompiler::compileModel(
18161861
Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
18171862
const uint8_t* flatbuffer_data = nullptr;
18181863
const uint8_t* constant_data = nullptr;
1864+
size_t flatbuffer_size = 0;
18191865
CompileAllocator compile_allocator;
18201866

18211867
// Header status can only either be Error::Ok or Error::NotFound
18221868
if (header.ok()) {
18231869
flatbuffer_data = reinterpret_cast<const uint8_t*>(buffer_pointer) +
18241870
header->flatbuffer_offset;
1871+
flatbuffer_size = header->flatbuffer_size;
18251872
constant_data = reinterpret_cast<const uint8_t*>(buffer_pointer) +
18261873
header->constant_data_offset;
18271874
} else if (header.error() == Error::NotFound) {
18281875
flatbuffer_data = reinterpret_cast<const uint8_t*>(buffer_pointer);
1876+
flatbuffer_size = num_bytes;
18291877
} else {
18301878
ET_LOG(Error, "XNNHeader may be corrupt");
18311879
return header.error();
@@ -1843,6 +1891,15 @@ ET_NODISCARD Error XNNCompiler::compileModel(
18431891
"XNNPACK Delegate Serialization Format version identifier '%.4s' != expected XN00 or XN01'",
18441892
flatbuffers::GetBufferIdentifier(flatbuffer_data));
18451893

1894+
// Verify the FlatBuffer data integrity before accessing it. Without this,
1895+
// malformed data could cause out-of-bounds reads when traversing the
1896+
// FlatBuffer's internal offset tables.
1897+
flatbuffers::Verifier verifier(flatbuffer_data, flatbuffer_size);
1898+
ET_CHECK_OR_RETURN_ERROR(
1899+
verifier.VerifyBuffer<fb_xnnpack::XNNGraph>(nullptr),
1900+
DelegateInvalidCompatibility,
1901+
"FlatBuffer verification failed; data may be truncated or corrupt");
1902+
18461903
auto flatbuffer_graph = fb_xnnpack::GetXNNGraph(flatbuffer_data);
18471904
ET_CHECK_OR_RETURN_ERROR(
18481905
flatbuffer_graph != nullptr && flatbuffer_graph->xvalues() != nullptr &&

backends/xnnpack/runtime/XNNHeader.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include <executorch/backends/xnnpack/runtime/XNNHeader.h>
1010

11+
#include <cinttypes>
1112
#include <cstring>
1213

1314
#include <executorch/runtime/core/error.h>
@@ -64,6 +65,48 @@ Result<XNNHeader> XNNHeader::Parse(const void* data, size_t size) {
6465
uint64_t constant_data_size =
6566
GetUInt64LE(header_data + XNNHeader::kConstantDataSizeOffset);
6667

68+
// Validate min flatbuffer size.
69+
constexpr size_t kMinFlatbufferSize =
70+
sizeof(uint32_t) + 4; // root offset + identifier
71+
ET_CHECK_OR_RETURN_ERROR(
72+
flatbuffer_size >= kMinFlatbufferSize,
73+
InvalidArgument,
74+
"flatbuffer_size %" PRIu32 " is too small (minimum %zu)",
75+
flatbuffer_size,
76+
kMinFlatbufferSize);
77+
78+
// Validate that flatbuffer region does not overflow or exceed the buffer.
79+
ET_CHECK_OR_RETURN_ERROR(
80+
flatbuffer_offset <= size && flatbuffer_size <= size - flatbuffer_offset,
81+
InvalidArgument,
82+
"flatbuffer_offset: %" PRIu32 " and flatbuffer_size: %" PRIu32
83+
" are invalid for buffer of size: %zu",
84+
flatbuffer_offset,
85+
flatbuffer_size,
86+
size);
87+
// Validate that constant data region does not overflow or exceed the buffer.
88+
ET_CHECK_OR_RETURN_ERROR(
89+
constant_data_offset <= size &&
90+
constant_data_size <= size - constant_data_offset,
91+
InvalidArgument,
92+
"constant_data_offset: %" PRIu32 " and constant_data_size: %" PRIu64
93+
" are invalid for buffer of size: %zu",
94+
constant_data_offset,
95+
constant_data_size,
96+
size);
97+
98+
// Validate that constant data region does not overlap with flatbuffer region.
99+
// flatbuffer should come before constant data.
100+
ET_CHECK_OR_RETURN_ERROR(
101+
constant_data_offset >= flatbuffer_offset &&
102+
constant_data_offset - flatbuffer_offset >= flatbuffer_size,
103+
InvalidArgument,
104+
"constant_data_offset: %" PRIu32 " and flatbuffer_offset: %" PRIu32
105+
" with flatbuffer_size: %" PRIu32 " are overlapping.",
106+
constant_data_offset,
107+
flatbuffer_offset,
108+
flatbuffer_size);
109+
67110
return XNNHeader{
68111
flatbuffer_offset,
69112
flatbuffer_size,

docs/source/pico2_tutorial.md

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ A 28×28 MNIST digit classifier running on memory constrained, low power microco
99
- Input: ASCII art digits (0, 1, 4, 7)
1010
- Output: Real-time predictions via USB serial
1111
- Memory: <400KB total footprint
12+
- Two variants: FP32 (portable ops) and INT8 (CMSIS-NN accelerated)
1213

1314
## Prerequisites
1415

@@ -24,29 +25,63 @@ which arm-none-eabi-gcc # --> arm/arm-scratch/arm-gnu-toolchain-13.3.rel1-x86_64
2425

2526
## Step 1: Generate pte from given example Model
2627

28+
### FP32 model (default)
29+
2730
- Use the [provided example model](https://github.com/pytorch/executorch/blob/main/examples/raspberry_pi/pico2/export_mlp_mnist.py)
2831

2932
```bash
33+
cd examples/raspberry_pi/pico2
3034
python export_mlp_mnist.py # Creates balanced_tiny_mlp_mnist.pte
3135
```
3236

3337
- **Note:** This is a hand-crafted MNIST classifier (proof of concept), not a production-trained model. This tiny MLP recognizes digits 0, 1, 4, and 7 using manually designed feature detectors.
3438

39+
### INT8 quantized model (CMSIS-NN accelerated)
40+
41+
- Use the [CMSIS-NN export script](https://github.com/pytorch/executorch/blob/main/examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py)
42+
43+
```bash
44+
cd examples/raspberry_pi/pico2
45+
python export_mlp_mnist_cmsis.py # Creates balanced_tiny_mlp_mnist_cmsis.pte
46+
```
47+
48+
This uses the `CortexMQuantizer` to produce INT8 quantized ops that map to CMSIS-NN kernels on Cortex-M33. The model I/O stays float — quantize and dequantize nodes are inserted inside the graph.
49+
3550
## Step 2: Build Firmware for Pico2
3651

52+
### FP32 build
53+
3754
```bash
3855
# Generate model (Creates balanced_tiny_mlp_mnist.pte)
3956
cd ./examples/raspberry_pi/pico2
4057
python export_mlp_mnist.py
4158
cd -
4259

4360
# Build Pico2 firmware (one command!)
61+
./examples/raspberry_pi/pico2/build_firmware_pico.sh --model=balanced_tiny_mlp_mnist.pte
62+
```
63+
64+
### INT8 CMSIS-NN build
65+
66+
```bash
67+
# Generate INT8 quantized model
68+
cd ./examples/raspberry_pi/pico2
69+
python export_mlp_mnist_cmsis.py
70+
cd -
4471

45-
./examples/raspberry_pi/pico2/build_firmware_pico.sh --model=balanced_tiny_mlp_mnist.pte # This creates executorch_pico.uf2, a firmware image for Pico2
72+
# Build with CMSIS-NN backend
73+
./examples/raspberry_pi/pico2/build_firmware_pico.sh --cmsis --model=balanced_tiny_mlp_mnist_cmsis.pte
4674
```
4775

4876
Output: **executorch_pico.uf2** firmware file (examples/raspberry_pi/pico2/build/)
4977

78+
**Script options:**
79+
| Flag | Description |
80+
|------|-------------|
81+
| `--model=FILE` | Specify model file to embed (relative to pico2/) |
82+
| `--cmsis` | Build with CMSIS-NN INT8 kernels for Cortex-M33 acceleration |
83+
| `--clean` | Clean build directories and exit; run separately before building if needed |
84+
5085
**Note:** The [build_firmware_pico.sh](https://github.com/pytorch/executorch/blob/main/examples/raspberry_pi/pico2/build_firmware_pico.sh) script converts the given model `.pte` file into a hex array and generates the corresponding C code using this helper [script](https://github.com/pytorch/executorch/blob/main/examples/raspberry_pi/pico2/pte_to_array.py). That C code is then compiled into the final `.uf2` binary, which is flashed to the Pico2.
5186

5287
## Step 3: Flash to Pico2
@@ -72,6 +107,10 @@ screen /dev/tty.usbmodem1101 115200
72107

73108
Something like:
74109

110+
📊 Memory usage after method load:
111+
Method allocator: 45632 / 204800 bytes used
112+
Activation pool: 204800 bytes allocated
113+
75114
=== Digit 7 ===
76115
############################
77116
############################
@@ -104,6 +143,7 @@ Something like:
104143

105144
Input stats: 159 white pixels out of 784 total
106145
Running neural network inference...
146+
⏱️ Inference time: 245 us
107147
✅ Neural network results:
108148
Digit 0: 370.000
109149
Digit 1: 0.000
@@ -116,7 +156,16 @@ Running neural network inference...
116156
Digit 8: -3.000
117157
Digit 9: -3.000
118158

119-
� PREDICTED: 7 (Expected: 7) ✅ CORRECT!
159+
🎯 PREDICTED: 7 (Expected: 7) ✅ CORRECT!
160+
161+
==================================================
162+
163+
📊 Inference latency summary:
164+
Digit 0: 312 us
165+
Digit 1: 198 us
166+
Digit 4: 267 us
167+
Digit 7: 245 us
168+
Average: 255 us
120169
```
121170

122171
## Memory Optimization Tips
@@ -184,12 +233,29 @@ arm-none-eabi-objdump -t examples/raspberry_pi/pico2/build/executorch_pico.elf |
184233
arm-none-eabi-readelf -l examples/raspberry_pi/pico2/build/executorch_pico.elf
185234
```
186235

236+
## CMSIS-NN INT8 Acceleration
237+
238+
The Pico2 uses an RP2350 SoC with a Cortex-M33 core. The CMSIS-NN library provides optimized INT8 kernels that leverage the Cortex-M33's DSP instructions for faster inference compared to FP32 portable ops.
239+
240+
### How it works
241+
242+
1. `export_mlp_mnist_cmsis.py` uses `CortexMQuantizer` to quantize the model to INT8
243+
2. The model I/O remains float — quantize/dequantize nodes are inserted inside the graph
244+
3. `--cmsis` flag builds ExecuTorch with the Cortex-M backend and links CMSIS-NN kernels
245+
4. At runtime, quantized linear ops dispatch to CMSIS-NN instead of portable kernels
246+
247+
### When to use CMSIS-NN
248+
249+
- Lower latency on supported ops (linear, conv2d)
250+
- Smaller model size (INT8 weights vs FP32)
251+
- Trade-off: slight accuracy loss from quantization
252+
187253
## Next Steps
188254

189255
### Scale up your deployment
190256

191257
- Use a real, production-trained model
192-
- Optimize further → INT8 quantization, pruning
258+
- Optimize further → INT8 quantization with CMSIS-NN, pruning
193259

194260
### Happy Inference!
195261

0 commit comments

Comments
 (0)