Skip to content

Commit e737ba9

Browse files
committed
Update on "[ET Device Support] Module: allocate device memory for planned buffers"
This diff enables the Module API to load programs that are memory-planned on a non-CPU device. It updates Module::load_method() to detect device buffers via MethodMeta and allocate device memory using the registered DeviceAllocator. Device memory is managed via DeviceMemoryBuffer RAII objects stored in PlannedMemory, ensuring proper cleanup when the Method is destroyed. Differential Revision: [D97850705](https://our.internmc.facebook.com/intern/diff/D97850705/) [ghstack-poisoned]
2 parents c003658 + 8f944f9 commit e737ba9

511 files changed

Lines changed: 24933 additions & 6290 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
a9592258daacad7423fd5f39aaa59c6e36471520
1+
585799cf7039d376d2ac4848b5ef0b501f60679e

.ci/docker/common/install_cuda_windows_cross_compile.sh

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,20 +48,23 @@ get_torch_cuda_version() {
4848
}
4949

5050
install_windows_cuda() {
51-
# Get CUDA version from torch
52-
TORCH_CUDA_VERSION=$(get_torch_cuda_version)
51+
# Use CUDA_VERSION env var if set (from Docker build arg), otherwise query PyTorch
52+
if [ -n "${CUDA_VERSION:-}" ]; then
53+
echo "Using CUDA version from environment: ${CUDA_VERSION}"
54+
CUDA_MAJOR_MINOR=$(echo "${CUDA_VERSION}" | cut -d. -f1,2)
55+
else
56+
TORCH_CUDA_VERSION=$(get_torch_cuda_version)
57+
58+
if [ -z "${TORCH_CUDA_VERSION}" ] || [ "${TORCH_CUDA_VERSION}" = "None" ]; then
59+
echo "ERROR: Could not detect CUDA version from PyTorch."
60+
echo "Make sure PyTorch with CUDA support is installed or set CUDA_VERSION."
61+
exit 1
62+
fi
5363

54-
if [ -z "${TORCH_CUDA_VERSION}" ] || [ "${TORCH_CUDA_VERSION}" = "None" ]; then
55-
echo "ERROR: Could not detect CUDA version from PyTorch."
56-
echo "Make sure PyTorch with CUDA support is installed before running this script."
57-
exit 1
64+
echo "Detected PyTorch CUDA version: ${TORCH_CUDA_VERSION}"
65+
CUDA_MAJOR_MINOR=$(echo "${TORCH_CUDA_VERSION}" | cut -d. -f1,2)
5866
fi
5967

60-
echo "Detected PyTorch CUDA version: ${TORCH_CUDA_VERSION}"
61-
62-
# Extract major.minor version (e.g., "12.8" from "12.8.1" or "12.8")
63-
CUDA_MAJOR_MINOR=$(echo "${TORCH_CUDA_VERSION}" | cut -d. -f1,2)
64-
6568
# Look up the full version and driver version
6669
if [ -z "${CUDA_DRIVER_MAP[${CUDA_MAJOR_MINOR}]}" ]; then
6770
echo "ERROR: CUDA version ${CUDA_MAJOR_MINOR} is not in the known version map."

.ci/docker/ubuntu/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ COPY ./common/install_cuda_windows_cross_compile.sh install_cuda_windows_cross_c
105105
COPY ./common/utils.sh utils.sh
106106
RUN if [ -n "${CUDA_WINDOWS_CROSS_COMPILE}" ]; then \
107107
CUDA_VERSION=${CUDA_VERSION} bash ./install_cuda.sh && \
108-
bash ./install_cuda_windows_cross_compile.sh; \
108+
CUDA_VERSION=${CUDA_VERSION} bash ./install_cuda_windows_cross_compile.sh; \
109109
fi
110110
RUN rm -f install_cuda.sh install_cuda_windows_cross_compile.sh utils.sh
111111
# Set up CUDA environment for Linux compilation (nvcc, etc.)

.ci/scripts/export_model_artifact.sh

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,17 @@ case "$HF_MODEL" in
184184
PREPROCESSOR_FEATURE_SIZE=""
185185
PREPROCESSOR_OUTPUT=""
186186
;;
187+
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
188+
MODEL_NAME="qwen3_5_moe"
189+
TASK=""
190+
MAX_SEQ_LEN=""
191+
EXTRA_PIP=""
192+
PREPROCESSOR_FEATURE_SIZE=""
193+
PREPROCESSOR_OUTPUT=""
194+
;;
187195
*)
188196
echo "Error: Unsupported model '$HF_MODEL'"
189-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
197+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
190198
exit 1
191199
;;
192200
esac
@@ -350,7 +358,7 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
350358
STREAMING_ARG=""
351359
PREPROCESSOR_ARGS="--feature_size 128 --output_file ${OUTPUT_DIR}/preprocessor.pte"
352360
if [ "$USE_STREAMING" = "true" ]; then
353-
STREAMING_ARG="--streaming"
361+
STREAMING_ARG="--streaming --sliding-window 2048"
354362
PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --streaming"
355363
else
356364
PREPROCESSOR_ARGS="$PREPROCESSOR_ARGS --stack_output --max_audio_len 300"
@@ -380,6 +388,46 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
380388
exit 0
381389
fi
382390

391+
# Qwen 3.5 MoE uses a prequantized checkpoint and custom export script
392+
if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
393+
pip install safetensors huggingface_hub
394+
pip install -r examples/models/qwen3_5_moe/requirements.txt
395+
396+
# Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
397+
LOCAL_MODEL_DIR=$(mktemp -d)
398+
INDUCTOR_CACHE=$(mktemp -d)
399+
trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
400+
401+
python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"
402+
403+
# Sanity check: run inference on the prequantized model
404+
echo "::group::Inference sanity check"
405+
python -m executorch.examples.models.qwen3_5_moe.inference \
406+
--prequantized "$LOCAL_MODEL_DIR" \
407+
--prompt "What is the capital of France?" \
408+
--max-new-tokens 32 \
409+
--temperature 0 \
410+
--no-compile
411+
echo "::endgroup::"
412+
413+
# Copy tokenizer for the runner
414+
cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
415+
416+
# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
417+
echo "::group::Export"
418+
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419+
python -m executorch.examples.models.qwen3_5_moe.export \
420+
--prequantized "$LOCAL_MODEL_DIR" \
421+
--output-dir "${OUTPUT_DIR}"
422+
echo "::endgroup::"
423+
424+
test -f "${OUTPUT_DIR}/model.pte"
425+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
426+
ls -al "${OUTPUT_DIR}"
427+
428+
exit 0
429+
fi
430+
383431
MAX_SEQ_LEN_ARG=""
384432
if [ -n "$MAX_SEQ_LEN" ]; then
385433
MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"

.ci/scripts/setup-openvino.sh

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,41 @@ set -ex
1010
# shellcheck source=/dev/null
1111
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
1212

13+
# Parse arguments
14+
USE_NIGHTLY=false
15+
for arg in "$@"; do
16+
case $arg in
17+
--nightly) USE_NIGHTLY=true ;;
18+
esac
19+
done
20+
1321
# Download and install OpenVINO from release packages
14-
OPENVINO_VERSION="2025.3"
15-
OPENVINO_BUILD="2025.3.0.19807.44526285f24"
16-
OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz"
22+
OPENVINO_VERSION="2026.0"
23+
OPENVINO_BUILD="2026.0.0.20965.c6d6a13a886"
24+
OPENVINO_STABLE_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz"
25+
26+
OPENVINO_NIGHTLY_BUILD_ID="2026.1.0-21310-c694fbc2b6d"
27+
OPENVINO_NIGHTLY_BUILD="2026.1.0.dev20260312"
28+
OPENVINO_NIGHTLY_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/${OPENVINO_NIGHTLY_BUILD_ID}/openvino_toolkit_ubuntu22_${OPENVINO_NIGHTLY_BUILD}_x86_64.tgz"
29+
30+
if [ "${USE_NIGHTLY}" = true ]; then
31+
OPENVINO_URL="${OPENVINO_NIGHTLY_URL}"
32+
OPENVINO_EXTRACTED_DIR="openvino_toolkit_ubuntu22_${OPENVINO_NIGHTLY_BUILD}_x86_64"
33+
echo "Using OpenVINO nightly build: ${OPENVINO_NIGHTLY_BUILD_ID}"
34+
else
35+
OPENVINO_URL="${OPENVINO_STABLE_URL}"
36+
OPENVINO_EXTRACTED_DIR="openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64"
37+
echo "Using OpenVINO stable release: ${OPENVINO_BUILD}"
38+
fi
1739

1840
curl -Lo /tmp/openvino_toolkit.tgz --retry 3 --fail ${OPENVINO_URL}
1941
tar -xzf /tmp/openvino_toolkit.tgz
20-
mv openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64 openvino
42+
mv "${OPENVINO_EXTRACTED_DIR}" openvino
2143

44+
set +u
2245
source openvino/setupvars.sh
23-
cd backends/openvino
24-
pip install -r requirements.txt
25-
cd scripts
46+
set -u
47+
pip install -r backends/openvino/requirements.txt
48+
pushd backends/openvino/scripts
2649
./openvino_build.sh --enable_python
50+
popd

.ci/scripts/test_backend.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,12 @@ if [[ "$FLOW" == *arm* ]]; then
7878
fi
7979
fi
8080

81+
if [[ "$FLOW" == *openvino* ]]; then
82+
# Setup OpenVINO environment
83+
source .ci/scripts/setup-openvino.sh --nightly
84+
EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_OPENVINO=ON"
85+
fi
86+
8187
if [[ $IS_MACOS -eq 1 ]]; then
8288
SETUP_SCRIPT=.ci/scripts/setup-macos.sh
8389
else

.ci/scripts/test_cortex_m_e2e.sh

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# End-to-end test for Cortex-M backend: export a model via aot_arm_compiler
# with cortex-m55+int8 target, then run the .bpte on Corstone-300 FVP.
#
# Usage: bash .ci/scripts/test_cortex_m_e2e.sh <model_name>
# Example: bash .ci/scripts/test_cortex_m_e2e.sh mv2
#
# Exit status:
#   0  the FVP log contains "Test_result: PASS"
#   1  missing argument, export produced no .bpte, runner ELF not found,
#      the log contains "Test_result: FAIL", the log matches the fatal-error
#      pattern, or no test result line was found at all

set -eux

# Fail with a usage message instead of a bare "unbound variable" error, and
# reject an empty name, which would otherwise yield paths like ".bpte".
if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  echo "Usage: bash .ci/scripts/test_cortex_m_e2e.sh <model_name>" >&2
  exit 1
fi

MODEL=$1
mkdir -p "./cortex_m_e2e/${MODEL}"
WORK_DIR=$(realpath "./cortex_m_e2e/${MODEL}")

echo "=== Exporting ${MODEL} with cortex-m55+int8 ==="
python -m examples.arm.aot_arm_compiler \
  -m "${MODEL}" \
  --target=cortex-m55+int8 \
  --quantize \
  --bundleio \
  --intermediates="${WORK_DIR}/intermediates" \
  --output="${WORK_DIR}/${MODEL}.bpte"

BPTE="${WORK_DIR}/${MODEL}.bpte"
test -f "${BPTE}" || { echo "FAIL: ${BPTE} not produced"; exit 1; }
echo "=== Exported ${BPTE} ($(stat --printf='%s' "${BPTE}") bytes) ==="

ELF="arm_test/arm_semihosting_executor_runner_corstone-300/arm_executor_runner"
test -f "${ELF}" || { echo "FAIL: executor runner not found at ${ELF}"; exit 1; }

# The log is removed on every exit path (including aborts under `set -e`),
# so the individual verdict branches below do not need their own cleanup.
LOG_FILE=$(mktemp)
trap 'rm -f -- "${LOG_FILE}"' EXIT

# Create a tiny dummy input file — the runner requires -i but BundleIO
# ignores it and uses the embedded test inputs instead.
dd if=/dev/zero of="${WORK_DIR}/dummy.bin" bs=4 count=1 2>/dev/null

echo "=== Running ${MODEL} on Corstone-300 FVP ==="
# The pipeline's exit status is deliberately ignored (`|| true`): the verdict
# is taken from the captured log below, not from the simulator's return code.
FVP_Corstone_SSE-300_Ethos-U55 \
  -C ethosu.num_macs=128 \
  -C mps3_board.visualisation.disable-visualisation=1 \
  -C mps3_board.telnetterminal0.start_telnet=0 \
  -C mps3_board.uart0.out_file='-' \
  -C mps3_board.uart0.shutdown_on_eot=1 \
  -C cpu0.semihosting-enable=1 \
  -C cpu0.semihosting-stack_base=0 \
  -C cpu0.semihosting-heap_limit=0 \
  -C "cpu0.semihosting-cwd=${WORK_DIR}" \
  -C "ethosu.extra_args='--fast'" \
  -C "cpu0.semihosting-cmd_line='executor_runner -m ${MODEL}.bpte -i dummy.bin -o out'" \
  -a "${ELF}" \
  --timelimit 300 2>&1 | tee "${LOG_FILE}" || true

echo "=== Checking FVP output ==="

# Checks run in this order: an explicit PASS wins, then an explicit FAIL,
# then the generic fatal-error pattern; anything else is treated as failure.
if grep -q "Test_result: PASS" "${LOG_FILE}"; then
  echo "=== SUCCESS: ${MODEL} e2e BundleIO test PASSED on FVP ==="
  exit 0
fi

if grep -q "Test_result: FAIL" "${LOG_FILE}"; then
  echo "FAIL: ${MODEL} BundleIO output mismatch"
  exit 1
fi

if grep -qE "(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)" "${LOG_FILE}"; then
  echo "FAIL: ${MODEL} FVP run hit a fatal error"
  exit 1
fi

echo "FAIL: ${MODEL} no BundleIO test result found in FVP output"
exit 1

.ci/scripts/test_model_e2e.sh

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,21 @@ case "$HF_MODEL" in
216216
AUDIO_FILE="test_audio.wav"
217217
IMAGE_PATH=""
218218
;;
219+
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
220+
MODEL_NAME="qwen3_5_moe"
221+
RUNNER_TARGET="qwen3_5_moe_runner"
222+
RUNNER_PATH="qwen3_5_moe"
223+
EXPECTED_OUTPUT="Paris"
224+
PREPROCESSOR=""
225+
TOKENIZER_URL=""
226+
TOKENIZER_FILE="tokenizer.json"
227+
AUDIO_URL=""
228+
AUDIO_FILE=""
229+
IMAGE_PATH=""
230+
;;
219231
*)
220232
echo "Error: Unsupported model '$HF_MODEL'"
221-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
233+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
222234
exit 1
223235
;;
224236
esac
@@ -232,7 +244,7 @@ echo "::group::Prepare $MODEL_NAME Artifacts"
232244

233245

234246
# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one)
235-
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ]; then
247+
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ] && [ "$MODEL_NAME" != "qwen3_5_moe" ]; then
236248
if [ "$TOKENIZER_FILE" != "" ]; then
237249
curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
238250
else
@@ -341,6 +353,9 @@ EOF
341353
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
342354
fi
343355
;;
356+
qwen3_5_moe)
357+
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 128 --temperature 0"
358+
;;
344359
voxtral_realtime)
345360
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
346361
# Add CUDA data path if present
@@ -359,7 +374,7 @@ EOF
359374
;;
360375
esac
361376

362-
OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
377+
OUTPUT=$(eval $RUNNER_BIN $RUNNER_ARGS 2>&1)
363378
EXIT_CODE=$?
364379
set -e
365380

.ci/scripts/test_model_e2e_windows.ps1

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,19 @@ try {
135135
Write-Host "::group::Check CUDA toolchain"
136136
$nvccOutput = nvcc --version | Out-String
137137
Write-Host $nvccOutput
138-
nvidia-smi
138+
$nvidiaSmiCmd = Get-Command nvidia-smi -ErrorAction SilentlyContinue
139+
if ($null -eq $nvidiaSmiCmd) {
140+
Write-Host "nvidia-smi not available (command not found; driver may not be installed)"
141+
}
142+
else {
143+
try {
144+
nvidia-smi
145+
}
146+
catch {
147+
Write-Host "nvidia-smi failed (driver or GPU issue). Error details:"
148+
Write-Host $_
149+
}
150+
}
139151
if (-not [string]::IsNullOrWhiteSpace($ExpectedCudaVersion)) {
140152
$versionMatch = [Regex]::Match($nvccOutput, "release\s+(\d+\.\d+)")
141153
if (-not $versionMatch.Success) {

.ci/scripts/wheel/test_linux.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,25 @@
1111
from examples.models import Backend, Model
1212

1313
if __name__ == "__main__":
14-
# On Linux x86_64 the wheel is built with the Qualcomm backend.
15-
# Verify that it was registered correctly.
16-
if platform.system() == "Linux" and platform.machine() in ("x86_64", "amd64"):
14+
if platform.system() == "Linux":
1715
from executorch.extension.pybindings.portable_lib import (
1816
_get_registered_backend_names,
1917
)
2018

2119
registered = _get_registered_backend_names()
20+
21+
# QNN backend is only available on x86_64.
22+
if platform.machine() in ("x86_64", "amd64"):
23+
assert (
24+
"QnnBackend" in registered
25+
), f"QnnBackend not found in registered backends: {registered}"
26+
print("✓ QnnBackend is registered")
27+
28+
# OpenVINO backend is available on all Linux architectures.
2229
assert (
23-
"QnnBackend" in registered
24-
), f"QnnBackend not found in registered backends: {registered}"
25-
print("✓ QnnBackend is registered")
30+
"OpenvinoBackend" in registered
31+
), f"OpenvinoBackend not found in registered backends: {registered}"
32+
print("✓ OpenvinoBackend is registered")
2633

2734
test_base.run_tests(
2835
model_tests=[

0 commit comments

Comments
 (0)