Skip to content

Commit cef386b

Browse files
authored
Merge branch 'main' into cuda-graph
2 parents aa7bb82 + 2f339f0 commit cef386b

6 files changed

Lines changed: 187 additions & 10 deletions

File tree

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
name: Pico2 Build Validation
2+
3+
on:
4+
push:
5+
branches: [main, release/*]
6+
paths:
7+
- examples/raspberry_pi/pico2/**
8+
- backends/cortex_m/**
9+
- .ci/scripts/**
10+
- examples/arm/**
11+
- .github/workflows/test-pico2-build.yml
12+
schedule:
13+
# Run daily at 3 AM UTC to catch upstream breakages
14+
- cron: '0 3 * * *'
15+
16+
concurrency:
17+
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.number || github.sha }}
18+
cancel-in-progress: true
19+
20+
jobs:
21+
test-pico2-fp32-build:
22+
name: test-pico2-fp32-build
23+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
24+
permissions:
25+
id-token: write
26+
contents: read
27+
with:
28+
runner: linux.2xlarge.memory
29+
docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
30+
submodules: 'recursive'
31+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
32+
timeout: 120
33+
script: |
34+
set -euo pipefail
35+
# Activate the conda environment shipped in the docker image (the last entry reported by conda env list)
36+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
37+
conda activate "${CONDA_ENV}"
38+
39+
source .ci/scripts/utils.sh
40+
install_executorch "--use-pt-pinned-commit"
41+
42+
# Install ARM baremetal toolchain
43+
.ci/scripts/setup-arm-baremetal-tools.sh
44+
source examples/arm/arm-scratch/setup_path.sh
45+
46+
echo "=== Step 1: Export FP32 model ==="
47+
cd examples/raspberry_pi/pico2
48+
python export_mlp_mnist.py
49+
test -f balanced_tiny_mlp_mnist.pte
50+
echo "FP32 model exported: $(ls -la balanced_tiny_mlp_mnist.pte)"
51+
cd -
52+
53+
echo "=== Step 2: Build FP32 firmware ==="
54+
./examples/raspberry_pi/pico2/build_firmware_pico.sh --model=balanced_tiny_mlp_mnist.pte
55+
56+
echo "=== Step 3: Validate FP32 artifacts ==="
57+
ELF_FILE=examples/raspberry_pi/pico2/build/executorch_pico.elf
58+
UF2_FILE=examples/raspberry_pi/pico2/build/executorch_pico.uf2
59+
60+
test -f "${ELF_FILE}" || { echo "FAIL: .elf not found"; exit 1; }
61+
test -f "${UF2_FILE}" || { echo "FAIL: .uf2 not found"; exit 1; }
62+
63+
echo "--- Section sizes ---"
64+
arm-none-eabi-size -A "${ELF_FILE}"
65+
66+
echo "--- Section headers ---"
67+
arm-none-eabi-objdump -h "${ELF_FILE}"
68+
69+
echo "--- Key symbols ---"
70+
arm-none-eabi-nm --print-size --size-sort --radix=d "${ELF_FILE}" | tail -20
71+
72+
# Validate that the binary fits in Pico2 memory using the aggregated text/data/bss totals reported by arm-none-eabi-size:
73+
# flash = text + data (4MB = 4194304 bytes)
74+
# SRAM = data + bss (520KB = 532480 bytes)
75+
eval $(arm-none-eabi-size "${ELF_FILE}" | awk 'NR==2 {printf "TEXT_SIZE=%d DATA_SIZE=%d BSS_SIZE=%d", $1, $2, $3}')
76+
77+
FLASH_USED=$((TEXT_SIZE + DATA_SIZE))
78+
SRAM_USED=$((DATA_SIZE + BSS_SIZE))
79+
echo "FP32 binary: text=${TEXT_SIZE} data=${DATA_SIZE} bss=${BSS_SIZE} => flash=${FLASH_USED} sram=${SRAM_USED}"
80+
81+
if [ "${FLASH_USED}" -gt 4194304 ]; then
82+
echo "FAIL: flash usage (${FLASH_USED}) exceeds 4MB"
83+
exit 1
84+
fi
85+
if [ "${SRAM_USED}" -gt 532480 ]; then
86+
echo "FAIL: SRAM usage (${SRAM_USED}) exceeds 520KB"
87+
exit 1
88+
fi
89+
echo "PASS: FP32 firmware fits in Pico2 memory (SRAM: ${SRAM_USED}/532480, Flash: ${FLASH_USED}/4194304)"
90+
91+
test-pico2-cmsis-build:
92+
name: test-pico2-cmsis-nn-build
93+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
94+
permissions:
95+
id-token: write
96+
contents: read
97+
with:
98+
runner: linux.2xlarge.memory
99+
docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
100+
submodules: 'recursive'
101+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
102+
timeout: 120
103+
script: |
104+
set -euo pipefail
105+
# Activate the conda environment shipped in the docker image (the last entry reported by conda env list)
106+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
107+
conda activate "${CONDA_ENV}"
108+
109+
source .ci/scripts/utils.sh
110+
install_executorch "--use-pt-pinned-commit"
111+
112+
# Install ARM baremetal toolchain
113+
.ci/scripts/setup-arm-baremetal-tools.sh
114+
source examples/arm/arm-scratch/setup_path.sh
115+
116+
echo "=== Step 1: Export INT8 CMSIS-NN model ==="
117+
cd examples/raspberry_pi/pico2
118+
python export_mlp_mnist_cmsis.py
119+
test -f balanced_tiny_mlp_mnist_cmsis.pte
120+
echo "INT8 model exported: $(ls -la balanced_tiny_mlp_mnist_cmsis.pte)"
121+
cd -
122+
123+
echo "=== Step 2: Build CMSIS-NN firmware ==="
124+
./examples/raspberry_pi/pico2/build_firmware_pico.sh --cmsis --model=balanced_tiny_mlp_mnist_cmsis.pte
125+
126+
echo "=== Step 3: Validate CMSIS-NN artifacts ==="
127+
ELF_FILE=examples/raspberry_pi/pico2/build/executorch_pico.elf
128+
UF2_FILE=examples/raspberry_pi/pico2/build/executorch_pico.uf2
129+
130+
test -f "${ELF_FILE}" || { echo "FAIL: .elf not found"; exit 1; }
131+
test -f "${UF2_FILE}" || { echo "FAIL: .uf2 not found"; exit 1; }
132+
133+
echo "--- Section sizes ---"
134+
arm-none-eabi-size -A "${ELF_FILE}"
135+
136+
echo "--- Section headers ---"
137+
arm-none-eabi-objdump -h "${ELF_FILE}"
138+
139+
echo "--- Key symbols ---"
140+
arm-none-eabi-nm --print-size --size-sort --radix=d "${ELF_FILE}" | tail -20
141+
142+
# Verify CMSIS-NN symbols are linked
143+
CMSIS_NN_SYMBOLS=$(arm-none-eabi-nm "${ELF_FILE}" | grep -E '(cmsis_nn|arm_nn)' || true)
144+
if [ -n "${CMSIS_NN_SYMBOLS}" ]; then
145+
echo "PASS: CMSIS-NN symbols found in binary"
146+
printf '%s\n' "${CMSIS_NN_SYMBOLS}" | head -20
147+
else
148+
echo "FAIL: No CMSIS-NN symbols detected — cortex_m backend may not be linked correctly"
149+
exit 1
150+
fi
151+
152+
# Validate that the binary fits in Pico2 memory using the aggregated text/data/bss totals reported by arm-none-eabi-size:
153+
# flash = text + data (4MB = 4194304 bytes)
154+
# SRAM = data + bss (520KB = 532480 bytes)
155+
eval $(arm-none-eabi-size "${ELF_FILE}" | awk 'NR==2 {printf "TEXT_SIZE=%d DATA_SIZE=%d BSS_SIZE=%d", $1, $2, $3}')
156+
157+
FLASH_USED=$((TEXT_SIZE + DATA_SIZE))
158+
SRAM_USED=$((DATA_SIZE + BSS_SIZE))
159+
echo "CMSIS-NN binary: text=${TEXT_SIZE} data=${DATA_SIZE} bss=${BSS_SIZE} => flash=${FLASH_USED} sram=${SRAM_USED}"
160+
161+
if [ "${FLASH_USED}" -gt 4194304 ]; then
162+
echo "FAIL: flash usage (${FLASH_USED}) exceeds 4MB"
163+
exit 1
164+
fi
165+
if [ "${SRAM_USED}" -gt 532480 ]; then
166+
echo "FAIL: SRAM usage (${SRAM_USED}) exceeds 520KB"
167+
exit 1
168+
fi
169+
echo "PASS: CMSIS-NN firmware fits in Pico2 memory (SRAM: ${SRAM_USED}/532480, Flash: ${FLASH_USED}/4194304)"

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,8 +1135,6 @@ void ComputeGraph::prepack() {
11351135
int i = 0;
11361136
bool submitted = false;
11371137
const bool reduce_peak_memory = total_constant_nbytes_ > 10 * MB;
1138-
// int count = 0;
1139-
11401138
context_->set_cmd();
11411139
for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
11421140
// Do not trigger on the first or last prepack node.

backends/vulkan/runtime/graph/containers/Constant.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@ struct TensorRef final {
2929
// This will be empty (default constructed) for the raw pointer constructor
3030
executorch::runtime::FreeableBuffer buffer;
3131

32+
// Number of PrepackNodes that still need to read from this TensorRef. When
33+
// this reaches 0, the buffer can be safely freed. This prevents
34+
// use-after-free when multiple PrepackNodes reference the same TensorRef
35+
// (e.g. shared/tied weights).
36+
int32_t prepack_use_count{0};
37+
3238
explicit TensorRef(
3339
const std::vector<int64_t>& t_sizes,
3440
vkapi::ScalarType t_dtype,
@@ -44,8 +50,6 @@ struct TensorRef final {
4450
return utils::multiply_integers(sizes) * vkapi::element_size(dtype);
4551
}
4652

47-
// Manually free the buffer if needed (though it will be freed automatically
48-
// on destruction)
4953
void free_buffer() {
5054
buffer.Free();
5155
}

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ PrepackNode::PrepackNode(
4444
push_constants_(push_constants) {
4545
graph.update_descriptor_counts(shader, /*execute = */ false);
4646
graph.update_descriptor_counts(noop_shader_, /*execute = */ false);
47+
if (!graph.val_is_none(tref)) {
48+
graph.get_tref(tref)->prepack_use_count++;
49+
}
4750
}
4851

4952
api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
@@ -100,9 +103,10 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
100103
}
101104
}
102105

103-
// Once the staging buffer is copied, if the TensorRef owns a FreeableBuffer,
104-
// it can be freed.
105-
tref->free_buffer();
106+
if (--tref->prepack_use_count == 0) {
107+
tref->free_buffer();
108+
}
109+
106110
return staging;
107111
}
108112

backends/xnnpack/runtime/XNNCompiler.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ Result<const uint8_t*> getConstantDataPtr(
192192
ET_CHECK_OR_RETURN_ERROR(
193193
buffer_idx < cb->size(),
194194
InvalidProgram,
195-
"buffer_idx %u out of bounds for constant_buffer of size %zu",
195+
"buffer_idx %u out of bounds for constant_buffer of size %u",
196196
buffer_idx,
197197
cb->size());
198198
auto* buffer_entry = (*cb)[buffer_idx];
@@ -209,7 +209,7 @@ Result<const uint8_t*> getConstantDataPtr(
209209
ET_CHECK_OR_RETURN_ERROR(
210210
buffer_idx < cd->size(),
211211
InvalidProgram,
212-
"buffer_idx %u out of bounds for constant_data of size %zu",
212+
"buffer_idx %u out of bounds for constant_data of size %u",
213213
buffer_idx,
214214
cd->size());
215215
ConstantDataOffsetPtr constant_data_offset = cd->Get(buffer_idx);

examples/raspberry_pi/pico2/build_firmware_pico.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,10 @@ echo "Cross compiling ExecuTorch baremetal ARM..."
9595
# Resolve the model path for selective build. Using EXECUTORCH_SELECT_OPS_MODEL
9696
# auto-detects the exact operators the model needs from the .pte file, avoiding
9797
# "Operator missing" errors at runtime.
98+
# Note: skip selective build for CMSIS-NN models — their cortex_m:: ops are
99+
# registered by the cortex_m backend, not by portable kernel codegen.
98100
SELECT_OPS_FLAGS=""
99-
if [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then
101+
if [ $USE_CMSIS -eq 0 ] && [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then
100102
MODEL_ABS_PATH="$(cd "${PICO2_DIR}" && realpath "${MODEL_INPUT}")"
101103
SELECT_OPS_FLAGS="-DEXECUTORCH_SELECT_OPS_MODEL=${MODEL_ABS_PATH}"
102104
echo "Using selective build from model: ${MODEL_ABS_PATH}"

0 commit comments

Comments
 (0)