Merge branch 'main' into cuda-graph

Gasoonjia · web-flow · commit cef386b1464d · 2026-04-15T16:39:50.000-07:00
diff --git a/.github/workflows/test-pico2-build.yml b/.github/workflows/test-pico2-build.yml
@@ -0,0 +1,169 @@
+name: Pico2 Build Validation
+
+on:
+  push:
+    branches: [main, release/*]
+    paths:  # push runs are limited to changes under these paths
+      - examples/raspberry_pi/pico2/**
+      - backends/cortex_m/**
+      - .ci/scripts/**
+      - examples/arm/**
+      - .github/workflows/test-pico2-build.yml
+  schedule:
+    # Run daily at 3 AM UTC to catch upstream breakages
+    - cron: '0 3 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.pull_request.number || github.sha }}  # no pull_request trigger in this workflow, so the suffix is github.sha
+  cancel-in-progress: true
+
+jobs:
+  test-pico2-fp32-build:
+    name: test-pico2-fp32-build
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge.memory
+      docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}  # only push/schedule trigger this workflow, so this resolves to github.sha
+      timeout: 120
+      script: |
+        set -euo pipefail
+        # Activate conda environment from the docker image (jq picks the last entry of .envs)
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        source .ci/scripts/utils.sh
+        install_executorch "--use-pt-pinned-commit"
+
+        # Install ARM baremetal toolchain
+        .ci/scripts/setup-arm-baremetal-tools.sh
+        source examples/arm/arm-scratch/setup_path.sh
+
+        echo "=== Step 1: Export FP32 model ==="
+        cd examples/raspberry_pi/pico2
+        python export_mlp_mnist.py
+        test -f balanced_tiny_mlp_mnist.pte  # abort (set -e) if the export did not produce the .pte
+        echo "FP32 model exported: $(ls -la balanced_tiny_mlp_mnist.pte)"
+        cd -
+
+        echo "=== Step 2: Build FP32 firmware ==="
+        ./examples/raspberry_pi/pico2/build_firmware_pico.sh --model=balanced_tiny_mlp_mnist.pte
+
+        echo "=== Step 3: Validate FP32 artifacts ==="
+        ELF_FILE=examples/raspberry_pi/pico2/build/executorch_pico.elf
+        UF2_FILE=examples/raspberry_pi/pico2/build/executorch_pico.uf2
+
+        test -f "${ELF_FILE}" || { echo "FAIL: .elf not found"; exit 1; }
+        test -f "${UF2_FILE}" || { echo "FAIL: .uf2 not found"; exit 1; }
+
+        echo "--- Section sizes ---"
+        arm-none-eabi-size -A "${ELF_FILE}"
+
+        echo "--- Section headers ---"
+        arm-none-eabi-objdump -h "${ELF_FILE}"
+
+        echo "--- Key symbols ---"
+        arm-none-eabi-nm --print-size --size-sort --radix=d "${ELF_FILE}" | tail -20  # --size-sort is ascending, so tail shows the 20 largest
+
+        # Validate binary fits in Pico2 memory using the per-file totals on line 2 of arm-none-eabi-size output:
+        #   flash = text + data (4MB = 4194304 bytes)
+        #   SRAM  = data + bss  (520KB = 532480 bytes)
+        read -r TEXT_SIZE DATA_SIZE BSS_SIZE < <(arm-none-eabi-size "${ELF_FILE}" | awk 'NR==2 {printf "%d %d %d\n", $1, $2, $3}') \
+          || { echo "FAIL: could not parse arm-none-eabi-size output"; exit 1; }  # read is non-zero on empty output
+
+        FLASH_USED=$((TEXT_SIZE + DATA_SIZE))
+        SRAM_USED=$((DATA_SIZE + BSS_SIZE))
+        echo "FP32 binary: text=${TEXT_SIZE} data=${DATA_SIZE} bss=${BSS_SIZE} => flash=${FLASH_USED} sram=${SRAM_USED}"
+        if [ "${FLASH_USED}" -gt 4194304 ]; then
+          echo "FAIL: flash usage (${FLASH_USED}) exceeds 4MB"
+          exit 1
+        fi
+        if [ "${SRAM_USED}" -gt 532480 ]; then
+          echo "FAIL: SRAM usage (${SRAM_USED}) exceeds 520KB"
+          exit 1
+        fi
+        echo "PASS: FP32 firmware fits in Pico2 memory (SRAM: ${SRAM_USED}/532480, Flash: ${FLASH_USED}/4194304)"
+
+  test-pico2-cmsis-build:
+    name: test-pico2-cmsis-nn-build
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge.memory
+      docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}  # only push/schedule trigger this workflow, so this resolves to github.sha
+      timeout: 120
+      script: |
+        set -euo pipefail
+        # Activate conda environment from the docker image (jq picks the last entry of .envs)
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        source .ci/scripts/utils.sh
+        install_executorch "--use-pt-pinned-commit"
+
+        # Install ARM baremetal toolchain
+        .ci/scripts/setup-arm-baremetal-tools.sh
+        source examples/arm/arm-scratch/setup_path.sh
+
+        echo "=== Step 1: Export INT8 CMSIS-NN model ==="
+        cd examples/raspberry_pi/pico2
+        python export_mlp_mnist_cmsis.py
+        test -f balanced_tiny_mlp_mnist_cmsis.pte  # abort (set -e) if the export did not produce the .pte
+        echo "INT8 model exported: $(ls -la balanced_tiny_mlp_mnist_cmsis.pte)"
+        cd -
+
+        echo "=== Step 2: Build CMSIS-NN firmware ==="
+        ./examples/raspberry_pi/pico2/build_firmware_pico.sh --cmsis --model=balanced_tiny_mlp_mnist_cmsis.pte
+
+        echo "=== Step 3: Validate CMSIS-NN artifacts ==="
+        ELF_FILE=examples/raspberry_pi/pico2/build/executorch_pico.elf
+        UF2_FILE=examples/raspberry_pi/pico2/build/executorch_pico.uf2
+
+        test -f "${ELF_FILE}" || { echo "FAIL: .elf not found"; exit 1; }
+        test -f "${UF2_FILE}" || { echo "FAIL: .uf2 not found"; exit 1; }
+
+        echo "--- Section sizes ---"
+        arm-none-eabi-size -A "${ELF_FILE}"
+
+        echo "--- Section headers ---"
+        arm-none-eabi-objdump -h "${ELF_FILE}"
+
+        echo "--- Key symbols ---"
+        arm-none-eabi-nm --print-size --size-sort --radix=d "${ELF_FILE}" | tail -20  # --size-sort is ascending, so tail shows the 20 largest
+
+        # Verify CMSIS-NN symbols are linked (|| true keeps a no-match grep from tripping set -e)
+        CMSIS_NN_SYMBOLS=$(arm-none-eabi-nm "${ELF_FILE}" | grep -E '(cmsis_nn|arm_nn)' || true)
+        if [ -n "${CMSIS_NN_SYMBOLS}" ]; then
+          echo "PASS: CMSIS-NN symbols found in binary"
+          printf '%s\n' "${CMSIS_NN_SYMBOLS}" | sed -n '1,20p'  # sed drains stdin, so a long list cannot SIGPIPE printf under pipefail
+        else
+          echo "FAIL: No CMSIS-NN symbols detected — cortex_m backend may not be linked correctly"
+          exit 1
+        fi
+
+        # Validate binary fits in Pico2 memory using the per-file totals on line 2 of arm-none-eabi-size output:
+        #   flash = text + data (4MB = 4194304 bytes)
+        #   SRAM  = data + bss  (520KB = 532480 bytes)
+        read -r TEXT_SIZE DATA_SIZE BSS_SIZE < <(arm-none-eabi-size "${ELF_FILE}" | awk 'NR==2 {printf "%d %d %d\n", $1, $2, $3}') \
+          || { echo "FAIL: could not parse arm-none-eabi-size output"; exit 1; }  # read is non-zero on empty output
+
+        FLASH_USED=$((TEXT_SIZE + DATA_SIZE))
+        SRAM_USED=$((DATA_SIZE + BSS_SIZE))
+        echo "CMSIS-NN binary: text=${TEXT_SIZE} data=${DATA_SIZE} bss=${BSS_SIZE} => flash=${FLASH_USED} sram=${SRAM_USED}"
+        if [ "${FLASH_USED}" -gt 4194304 ]; then
+          echo "FAIL: flash usage (${FLASH_USED}) exceeds 4MB"
+          exit 1
+        fi
+        if [ "${SRAM_USED}" -gt 532480 ]; then
+          echo "FAIL: SRAM usage (${SRAM_USED}) exceeds 520KB"
+          exit 1
+        fi
+        echo "PASS: CMSIS-NN firmware fits in Pico2 memory (SRAM: ${SRAM_USED}/532480, Flash: ${FLASH_USED}/4194304)"
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -1135,8 +1135,6 @@ void ComputeGraph::prepack() {
   int i = 0;
   bool submitted = false;
   const bool reduce_peak_memory = total_constant_nbytes_ > 10 * MB;
-  // int count = 0;
-
   context_->set_cmd();
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     // Do not trigger on the first or last prepack node.
diff --git a/backends/vulkan/runtime/graph/containers/Constant.h b/backends/vulkan/runtime/graph/containers/Constant.h
@@ -29,6 +29,12 @@ struct TensorRef final {
   // This will be empty (default constructed) for the raw pointer constructor
   executorch::runtime::FreeableBuffer buffer;
 
+  // Number of PrepackNodes that still need to read from this TensorRef. When
+  // this reaches 0, the buffer can be safely freed. This prevents
+  // use-after-free when multiple PrepackNodes reference the same TensorRef
+  // (e.g. shared/tied weights).
+  int32_t prepack_use_count{0};
+
   explicit TensorRef(
       const std::vector<int64_t>& t_sizes,
       vkapi::ScalarType t_dtype,
@@ -44,8 +50,6 @@ struct TensorRef final {
     return utils::multiply_integers(sizes) * vkapi::element_size(dtype);
   }
 
-  // Manually free the buffer if needed (though it will be freed automatically
-  // on destruction)
   void free_buffer() {
     buffer.Free();
   }
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -44,6 +44,9 @@ PrepackNode::PrepackNode(
       push_constants_(push_constants) {
   graph.update_descriptor_counts(shader, /*execute = */ false);
   graph.update_descriptor_counts(noop_shader_, /*execute = */ false);
+  if (!graph.val_is_none(tref)) {
+    graph.get_tref(tref)->prepack_use_count++;
+  }
 }
 
 api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
@@ -100,9 +103,10 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
     }
   }
 
-  // Once the staging buffer is copied, if the TensorRef owns a FreeableBuffer,
-  // it can be freed.
-  tref->free_buffer();
+  if (--tref->prepack_use_count == 0) {
+    tref->free_buffer();
+  }
+
   return staging;
 }
 
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -192,7 +192,7 @@ Result<const uint8_t*> getConstantDataPtr(
       ET_CHECK_OR_RETURN_ERROR(
           buffer_idx < cb->size(),
           InvalidProgram,
-          "buffer_idx %u out of bounds for constant_buffer of size %zu",
+          "buffer_idx %u out of bounds for constant_buffer of size %u",
           buffer_idx,
           cb->size());
       auto* buffer_entry = (*cb)[buffer_idx];
@@ -209,7 +209,7 @@ Result<const uint8_t*> getConstantDataPtr(
       ET_CHECK_OR_RETURN_ERROR(
           buffer_idx < cd->size(),
           InvalidProgram,
-          "buffer_idx %u out of bounds for constant_data of size %zu",
+          "buffer_idx %u out of bounds for constant_data of size %u",
           buffer_idx,
           cd->size());
       ConstantDataOffsetPtr constant_data_offset = cd->Get(buffer_idx);
diff --git a/examples/raspberry_pi/pico2/build_firmware_pico.sh b/examples/raspberry_pi/pico2/build_firmware_pico.sh
@@ -95,8 +95,10 @@ echo "Cross compiling ExecuTorch baremetal ARM..."
 # Resolve the model path for selective build. Using EXECUTORCH_SELECT_OPS_MODEL
 # auto-detects the exact operators the model needs from the .pte file, avoiding
 # "Operator missing" errors at runtime.
+# Note: skip selective build for CMSIS-NN models — their cortex_m:: ops are
+# registered by the cortex_m backend, not by portable kernel codegen.
 SELECT_OPS_FLAGS=""
-if [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then
+if [ $USE_CMSIS -eq 0 ] && [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then
   MODEL_ABS_PATH="$(cd "${PICO2_DIR}" && realpath "${MODEL_INPUT}")"
   SELECT_OPS_FLAGS="-DEXECUTORCH_SELECT_OPS_MODEL=${MODEL_ABS_PATH}"
   echo "Using selective build from model: ${MODEL_ABS_PATH}"

Original file line number	Diff line number	Diff line change
`@@ -44,6 +44,9 @@ PrepackNode::PrepackNode(`
`44`	`44`	`push_constants_(push_constants) {`
`45`	`45`	`graph.update_descriptor_counts(shader, /*execute = */ false);`
`46`	`46`	`graph.update_descriptor_counts(noop_shader_, /*execute = */ false);`
	`47`	`+ if (!graph.val_is_none(tref)) {`
	`48`	`+ graph.get_tref(tref)->prepack_use_count++;`
	`49`	`+ }`
`47`	`50`	`}`
`48`	`51`
`49`	`52`	`api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {`
`@@ -100,9 +103,10 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {`
`100`	`103`	`}`
`101`	`104`	`}`
`102`	`105`
`103`		`- // Once the staging buffer is copied, if the TensorRef owns a FreeableBuffer,`
`104`		`- // it can be freed.`
`105`		`- tref->free_buffer();`
	`106`	`+ if (--tref->prepack_use_count == 0) {`
	`107`	`+ tref->free_buffer();`
	`108`	`+ }`
	`109`	`+`
`106`	`110`	`return staging;`
`107`	`111`	`}`
`108`	`112`