Skip to content

Commit 6a699b1

Browse files
committed
Merge branch 'mlx-delegate-part2' into mlx-delegate-part3
2 parents b8f0fa6 + 6f805bf commit 6a699b1

1,134 files changed

Lines changed: 63795 additions & 26792 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
a9592258daacad7423fd5f39aaa59c6e36471520
1+
585799cf7039d376d2ac4848b5ef0b501f60679e
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
659af3c353e49b35c191cdd2dba3b3c79d0e6822
1+
release/2.11

.ci/docker/common/install_cuda_windows_cross_compile.sh

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,20 +48,23 @@ get_torch_cuda_version() {
4848
}
4949

5050
install_windows_cuda() {
51-
# Get CUDA version from torch
52-
TORCH_CUDA_VERSION=$(get_torch_cuda_version)
51+
# Use CUDA_VERSION env var if set (from Docker build arg), otherwise query PyTorch
52+
if [ -n "${CUDA_VERSION:-}" ]; then
53+
echo "Using CUDA version from environment: ${CUDA_VERSION}"
54+
CUDA_MAJOR_MINOR=$(echo "${CUDA_VERSION}" | cut -d. -f1,2)
55+
else
56+
TORCH_CUDA_VERSION=$(get_torch_cuda_version)
57+
58+
if [ -z "${TORCH_CUDA_VERSION}" ] || [ "${TORCH_CUDA_VERSION}" = "None" ]; then
59+
echo "ERROR: Could not detect CUDA version from PyTorch."
60+
echo "Make sure PyTorch with CUDA support is installed or set CUDA_VERSION."
61+
exit 1
62+
fi
5363

54-
if [ -z "${TORCH_CUDA_VERSION}" ] || [ "${TORCH_CUDA_VERSION}" = "None" ]; then
55-
echo "ERROR: Could not detect CUDA version from PyTorch."
56-
echo "Make sure PyTorch with CUDA support is installed before running this script."
57-
exit 1
64+
echo "Detected PyTorch CUDA version: ${TORCH_CUDA_VERSION}"
65+
CUDA_MAJOR_MINOR=$(echo "${TORCH_CUDA_VERSION}" | cut -d. -f1,2)
5866
fi
5967

60-
echo "Detected PyTorch CUDA version: ${TORCH_CUDA_VERSION}"
61-
62-
# Extract major.minor version (e.g., "12.8" from "12.8.1" or "12.8")
63-
CUDA_MAJOR_MINOR=$(echo "${TORCH_CUDA_VERSION}" | cut -d. -f1,2)
64-
6568
# Look up the full version and driver version
6669
if [ -z "${CUDA_DRIVER_MAP[${CUDA_MAJOR_MINOR}]}" ]; then
6770
echo "ERROR: CUDA version ${CUDA_MAJOR_MINOR} is not in the known version map."

.ci/docker/common/install_pytorch.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ install_pytorch_and_domains() {
3232
pip_install "$(echo dist/*.whl)"
3333

3434
# Grab the pinned audio and vision commits from PyTorch
35-
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
35+
TORCHAUDIO_VERSION=release/2.11
3636
export TORCHAUDIO_VERSION
37-
TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt)
37+
TORCHVISION_VERSION=release/0.26
3838
export TORCHVISION_VERSION
3939

4040
install_domains

.ci/docker/ubuntu/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ COPY ./common/install_cuda_windows_cross_compile.sh install_cuda_windows_cross_c
105105
COPY ./common/utils.sh utils.sh
106106
RUN if [ -n "${CUDA_WINDOWS_CROSS_COMPILE}" ]; then \
107107
CUDA_VERSION=${CUDA_VERSION} bash ./install_cuda.sh && \
108-
bash ./install_cuda_windows_cross_compile.sh; \
108+
CUDA_VERSION=${CUDA_VERSION} bash ./install_cuda_windows_cross_compile.sh; \
109109
fi
110110
RUN rm -f install_cuda.sh install_cuda_windows_cross_compile.sh utils.sh
111111
# Set up CUDA environment for Linux compilation (nvcc, etc.)

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ build_qnn_backend() {
1818
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
1919

2020
parallelism=$(( $(nproc) - 1 ))
21-
bash backends/qualcomm/scripts/build.sh --skip_linux_android --skip_linux_embedded --job_number ${parallelism} --release
21+
bash backends/qualcomm/scripts/build.sh --skip_linux_android --job_number ${parallelism} --release
2222
}
2323

2424
set_up_aot() {

.ci/scripts/export_model_artifact.sh

Lines changed: 122 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ Arguments:
2222
- mistralai/Voxtral-Mini-4B-Realtime-2602
2323
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2424
- google/gemma-3-4b-it
25+
- nvidia/diar_streaming_sortformer_4spk-v2
2526
- nvidia/parakeet-tdt
27+
- facebook/dinov2-small-imagenet1k-1-layer
2628
2729
quant_name Quantization type (optional, default: non-quantized)
2830
Options:
@@ -45,6 +47,7 @@ Examples:
4547
export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal"
4648
export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
4749
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
50+
export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
4851
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
4952
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
5053
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
@@ -157,6 +160,22 @@ case "$HF_MODEL" in
157160
PREPROCESSOR_FEATURE_SIZE=""
158161
PREPROCESSOR_OUTPUT=""
159162
;;
163+
nvidia/diar_streaming_sortformer_4spk-v2)
164+
MODEL_NAME="sortformer"
165+
TASK=""
166+
MAX_SEQ_LEN=""
167+
EXTRA_PIP=""
168+
PREPROCESSOR_FEATURE_SIZE=""
169+
PREPROCESSOR_OUTPUT=""
170+
;;
171+
facebook/dinov2-small-imagenet1k-1-layer)
172+
MODEL_NAME="dinov2"
173+
TASK=""
174+
MAX_SEQ_LEN=""
175+
EXTRA_PIP=""
176+
PREPROCESSOR_FEATURE_SIZE=""
177+
PREPROCESSOR_OUTPUT=""
178+
;;
160179
mistralai/Voxtral-Mini-4B-Realtime-2602)
161180
MODEL_NAME="voxtral_realtime"
162181
TASK=""
@@ -165,9 +184,17 @@ case "$HF_MODEL" in
165184
PREPROCESSOR_FEATURE_SIZE=""
166185
PREPROCESSOR_OUTPUT=""
167186
;;
187+
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
188+
MODEL_NAME="qwen3_5_moe"
189+
TASK=""
190+
MAX_SEQ_LEN=""
191+
EXTRA_PIP=""
192+
PREPROCESSOR_FEATURE_SIZE=""
193+
PREPROCESSOR_OUTPUT=""
194+
;;
168195
*)
169196
echo "Error: Unsupported model '$HF_MODEL'"
170-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
197+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
171198
exit 1
172199
;;
173200
esac
@@ -247,6 +274,59 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
247274
exit 0
248275
fi
249276

277+
# Sortformer uses a custom export script
278+
if [ "$MODEL_NAME" = "sortformer" ]; then
279+
if [ "$QUANT_NAME" != "non-quantized" ]; then
280+
echo "Error: Sortformer currently supports only non-quantized export"
281+
exit 1
282+
fi
283+
284+
pip install -r examples/models/sortformer/install_requirements.txt
285+
286+
SORTFORMER_BACKEND="$DEVICE"
287+
if [ "$DEVICE" = "cuda-windows" ]; then
288+
SORTFORMER_BACKEND="cuda-windows"
289+
elif [ "$DEVICE" = "cuda" ]; then
290+
SORTFORMER_BACKEND="cuda"
291+
elif [ "$DEVICE" = "xnnpack" ]; then
292+
SORTFORMER_BACKEND="xnnpack"
293+
else
294+
SORTFORMER_BACKEND="portable"
295+
fi
296+
297+
python -m executorch.examples.models.sortformer.export_sortformer \
298+
--hf-model "${HF_MODEL}" \
299+
--backend "${SORTFORMER_BACKEND}" \
300+
--output-dir "${OUTPUT_DIR}"
301+
302+
test -f "${OUTPUT_DIR}/sortformer.pte"
303+
mv "${OUTPUT_DIR}/sortformer.pte" "${OUTPUT_DIR}/model.pte"
304+
# CUDA saves named data to separate .ptd file, XNNPACK/portable do not.
305+
if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
306+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
307+
fi
308+
ls -al "${OUTPUT_DIR}"
309+
echo "::endgroup::"
310+
exit 0
311+
fi
312+
313+
# DINOv2 uses a custom export script
314+
if [ "$MODEL_NAME" = "dinov2" ]; then
315+
pip install -r examples/models/dinov2/install_requirements.txt
316+
317+
python -m executorch.examples.models.dinov2.export_dinov2 \
318+
--backend "$DEVICE" \
319+
--output-dir "${OUTPUT_DIR}"
320+
321+
test -f "${OUTPUT_DIR}/model.pte"
322+
if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
323+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
324+
fi
325+
ls -al "${OUTPUT_DIR}"
326+
echo "::endgroup::"
327+
exit 0
328+
fi
329+
250330
# Voxtral Realtime uses a custom export script
251331
if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
252332
pip install safetensors huggingface_hub
@@ -262,6 +342,7 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
262342
VR_QUANT_ARGS="--qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
263343
elif [ "$QUANT_NAME" = "quantized-int4-metal" ]; then
264344
VR_QUANT_ARGS="--qlinear-encoder fpa4w --qlinear fpa4w"
345+
VR_DTYPE_ARGS="--dtype bf16"
265346
elif [ "$QUANT_NAME" = "quantized-int4-tile-packed" ]; then
266347
VR_QUANT_ARGS="--qlinear-encoder 4w --qlinear-encoder-packing-format tile_packed_to_4d --qlinear 4w --qlinear-packing-format tile_packed_to_4d --qembedding 8w"
267348
VR_DTYPE_ARGS="--dtype bf16"
@@ -301,11 +382,51 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
301382
fi
302383
# Copy tokenizer from downloaded model weights
303384
cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
385+
rm -rf "$LOCAL_MODEL_DIR"
304386
ls -al "${OUTPUT_DIR}"
305387
echo "::endgroup::"
306388
exit 0
307389
fi
308390

391+
# Qwen 3.5 MoE uses a prequantized checkpoint and custom export script
392+
if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
393+
pip install safetensors huggingface_hub
394+
pip install -r examples/models/qwen3_5_moe/requirements.txt
395+
396+
# Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
397+
LOCAL_MODEL_DIR=$(mktemp -d)
398+
INDUCTOR_CACHE=$(mktemp -d)
399+
trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
400+
401+
python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"
402+
403+
# Sanity check: run inference on the prequantized model
404+
echo "::group::Inference sanity check"
405+
python -m executorch.examples.models.qwen3_5_moe.inference \
406+
--prequantized "$LOCAL_MODEL_DIR" \
407+
--prompt "What is the capital of France?" \
408+
--max-new-tokens 32 \
409+
--temperature 0 \
410+
--no-compile
411+
echo "::endgroup::"
412+
413+
# Copy tokenizer for the runner
414+
cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
415+
416+
# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
417+
echo "::group::Export"
418+
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419+
python -m executorch.examples.models.qwen3_5_moe.export \
420+
--prequantized "$LOCAL_MODEL_DIR" \
421+
--output-dir "${OUTPUT_DIR}"
422+
echo "::endgroup::"
423+
424+
test -f "${OUTPUT_DIR}/model.pte"
425+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
426+
ls -al "${OUTPUT_DIR}"
427+
exit 0
428+
fi
429+
309430
MAX_SEQ_LEN_ARG=""
310431
if [ -n "$MAX_SEQ_LEN" ]; then
311432
MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"

.ci/scripts/setup-openvino.sh

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,41 @@ set -ex
1010
# shellcheck source=/dev/null
1111
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
1212

13+
# Parse arguments
14+
USE_NIGHTLY=false
15+
for arg in "$@"; do
16+
case $arg in
17+
--nightly) USE_NIGHTLY=true ;;
18+
esac
19+
done
20+
1321
# Download and install OpenVINO from release packages
14-
OPENVINO_VERSION="2025.3"
15-
OPENVINO_BUILD="2025.3.0.19807.44526285f24"
16-
OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz"
22+
OPENVINO_VERSION="2026.0"
23+
OPENVINO_BUILD="2026.0.0.20965.c6d6a13a886"
24+
OPENVINO_STABLE_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz"
25+
26+
OPENVINO_NIGHTLY_BUILD_ID="2026.1.0-21310-c694fbc2b6d"
27+
OPENVINO_NIGHTLY_BUILD="2026.1.0.dev20260312"
28+
OPENVINO_NIGHTLY_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/${OPENVINO_NIGHTLY_BUILD_ID}/openvino_toolkit_ubuntu22_${OPENVINO_NIGHTLY_BUILD}_x86_64.tgz"
29+
30+
if [ "${USE_NIGHTLY}" = true ]; then
31+
OPENVINO_URL="${OPENVINO_NIGHTLY_URL}"
32+
OPENVINO_EXTRACTED_DIR="openvino_toolkit_ubuntu22_${OPENVINO_NIGHTLY_BUILD}_x86_64"
33+
echo "Using OpenVINO nightly build: ${OPENVINO_NIGHTLY_BUILD_ID}"
34+
else
35+
OPENVINO_URL="${OPENVINO_STABLE_URL}"
36+
OPENVINO_EXTRACTED_DIR="openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64"
37+
echo "Using OpenVINO stable release: ${OPENVINO_BUILD}"
38+
fi
1739

1840
curl -Lo /tmp/openvino_toolkit.tgz --retry 3 --fail ${OPENVINO_URL}
1941
tar -xzf /tmp/openvino_toolkit.tgz
20-
mv openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64 openvino
42+
mv "${OPENVINO_EXTRACTED_DIR}" openvino
2143

44+
set +u
2245
source openvino/setupvars.sh
23-
cd backends/openvino
24-
pip install -r requirements.txt
25-
cd scripts
46+
set -u
47+
pip install -r backends/openvino/requirements.txt
48+
pushd backends/openvino/scripts
2649
./openvino_build.sh --enable_python
50+
popd

.ci/scripts/test_backend.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ if [[ "$FLOW" == *qnn* ]]; then
4646
export LD_LIBRARY_PATH"=$QNN_X86_LIB_DIR:$QNN_SDK_ROOT/lib/x86_64-linux-clang/:${LD_LIBRARY_PATH:-}"
4747

4848
# TODO Get SDK root from install scripts
49-
EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=$QNN_SDK_ROOT"
49+
EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_QNN=ON -DQNN_SDK_ROOT=$QNN_SDK_ROOT -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON"
5050
fi
5151

5252
if [[ "$FLOW" == *vulkan* ]]; then
@@ -78,6 +78,12 @@ if [[ "$FLOW" == *arm* ]]; then
7878
fi
7979
fi
8080

81+
if [[ "$FLOW" == *openvino* ]]; then
82+
# Setup OpenVINO environment
83+
source .ci/scripts/setup-openvino.sh --nightly
84+
EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_OPENVINO=ON"
85+
fi
86+
8187
if [[ $IS_MACOS -eq 1 ]]; then
8288
SETUP_SCRIPT=.ci/scripts/setup-macos.sh
8389
else

0 commit comments

Comments (0)