Skip to content

Commit e28f0c5

Browse files
Merge branch 'main' into exir-flatbuffer-serialize-fastpath_v2
2 parents 1dc55fe + 0c2ff55 commit e28f0c5

258 files changed

Lines changed: 6264 additions & 2367 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
5bf1aeb587e9b1f3572b0bd60265c5dafd007b73
1+
a9592258daacad7423fd5f39aaa59c6e36471520

.ci/scripts/export_model_artifact.sh

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ Arguments:
3434
3535
output_dir Output directory for artifacts (optional, default: current directory)
3636
37-
mode Export mode (optional, default: auto-detect based on model and device)
37+
mode Export mode (optional, default: vr-streaming)
3838
Supported modes:
3939
- vr-streaming: Voxtral Realtime streaming mode
4040
- vr-offline: Voxtral Realtime offline mode
@@ -141,6 +141,14 @@ case "$HF_MODEL" in
141141
PREPROCESSOR_FEATURE_SIZE=""
142142
PREPROCESSOR_OUTPUT=""
143143
;;
144+
Qwen/Qwen3-0.6B)
145+
MODEL_NAME="qwen3"
146+
TASK="text-generation"
147+
MAX_SEQ_LEN="64"
148+
EXTRA_PIP=""
149+
PREPROCESSOR_FEATURE_SIZE=""
150+
PREPROCESSOR_OUTPUT=""
151+
;;
144152
nvidia/parakeet-tdt)
145153
MODEL_NAME="parakeet"
146154
TASK=""
@@ -159,7 +167,7 @@ case "$HF_MODEL" in
159167
;;
160168
*)
161169
echo "Error: Unsupported model '$HF_MODEL'"
162-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
170+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
163171
exit 1
164172
;;
165173
esac
@@ -256,16 +264,9 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
256264
fi
257265

258266
# Determine streaming mode based on MODE parameter
259-
USE_STREAMING="false"
260-
if [ "$MODE" = "vr-streaming" ]; then
261-
USE_STREAMING="true"
262-
elif [ "$MODE" = "vr-offline" ]; then
267+
USE_STREAMING="true"
268+
if [ "$MODE" = "vr-offline" ]; then
263269
USE_STREAMING="false"
264-
elif [ -z "$MODE" ]; then
265-
# Auto-detect: XNNPACK uses streaming, others use offline
266-
if [ "$DEVICE" = "xnnpack" ]; then
267-
USE_STREAMING="true"
268-
fi
269270
fi
270271

271272
# Configure export and preprocessor based on streaming mode

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,21 +142,50 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only
142142
"--qembedding",
143143
"8w",
144144
]
145+
elif recipe == "cuda":
146+
command += [
147+
"--dtype",
148+
"bfloat16",
149+
"--device",
150+
"cuda",
151+
]
152+
if quantize:
153+
command += [
154+
"--qlinear",
155+
"4w",
156+
"--qlinear_packing_format",
157+
"tile_packed_to_4d",
158+
"--qembedding",
159+
"8w",
160+
]
145161
else:
146162
assert (
147163
not quantize
148-
), "Quantization is only supported for XnnPack and CoreML recipes at the moment."
164+
), "Quantization is only supported for XnnPack, CoreML, and CUDA recipes at the moment."
149165

150166
if not run_only:
151167
cli_export(command, model_dir)
152168

169+
if recipe == "cuda":
170+
model_path = Path(model_dir) / "model.pte"
171+
cuda_blob_path = Path(model_dir) / "aoti_cuda_blob.ptd"
172+
assert model_path.exists(), f"Main model file not found: {model_path}"
173+
assert cuda_blob_path.exists(), f"CUDA blob not found: {cuda_blob_path}"
174+
153175
tokenizer = AutoTokenizer.from_pretrained(model_id)
154176
saved_files = tokenizer.save_pretrained(model_dir)
155177
tokenizer_path = get_tokenizer_path(model_dir, saved_files)
156178

157179
from executorch.extension.llm.runner import GenerationConfig, TextLLMRunner
158180

159-
runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
181+
if recipe == "cuda":
182+
runner = TextLLMRunner(
183+
f"{model_dir}/model.pte",
184+
tokenizer_path,
185+
f"{model_dir}/aoti_cuda_blob.ptd",
186+
)
187+
else:
188+
runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
160189
tokens = []
161190
runner.generate(
162191
"Simply put, the theory of relativity states that",

.ci/scripts/test_lora.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
1212
cmake_install_executorch_libraries() {
1313
echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
1414
rm -rf cmake-out
15-
cmake --workflow llm-release
15+
cmake --preset llm-release -DEXECUTORCH_ENABLE_LOGGING=ON
16+
cmake --build --preset llm-release-install
1617
}
1718

1819
cmake_build_llama_runner() {

.ci/scripts/test_lora_multimethod.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
1212
cmake_install_executorch_libraries() {
1313
echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
1414
rm -rf cmake-out
15-
cmake --workflow llm-release
15+
cmake --preset llm-release -DEXECUTORCH_ENABLE_LOGGING=ON
16+
cmake --build --preset llm-release-install
1617
}
1718

1819
cmake_build_llama_runner() {

.ci/scripts/test_model_e2e.sh

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Arguments:
2121
- mistralai/Voxtral-Mini-3B-2507
2222
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
2323
- google/gemma-3-4b-it
24+
- Qwen/Qwen3-0.6B
2425
- nvidia/parakeet-tdt
2526
- mistralai/Voxtral-Mini-4B-Realtime-2602
2627
@@ -151,6 +152,18 @@ case "$HF_MODEL" in
151152
AUDIO_FILE=""
152153
IMAGE_PATH="docs/source/_static/img/et-logo.png"
153154
;;
155+
Qwen/Qwen3-0.6B)
156+
MODEL_NAME="qwen3"
157+
RUNNER_TARGET="llama_main"
158+
RUNNER_PATH="llama"
159+
EXPECTED_OUTPUT="Paris"
160+
PREPROCESSOR=""
161+
TOKENIZER_URL="https://huggingface.co/Qwen/Qwen3-0.6B/resolve/main" # @lint-ignore
162+
TOKENIZER_FILE=""
163+
AUDIO_URL=""
164+
AUDIO_FILE=""
165+
IMAGE_PATH=""
166+
;;
154167
nvidia/parakeet-tdt)
155168
MODEL_NAME="parakeet"
156169
RUNNER_TARGET="parakeet_runner"
@@ -177,7 +190,7 @@ case "$HF_MODEL" in
177190
;;
178191
*)
179192
echo "Error: Unsupported model '$HF_MODEL'"
180-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
193+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
181194
exit 1
182195
;;
183196
esac
@@ -246,9 +259,14 @@ if [ "$(uname -s)" = "Darwin" ] && [ -f "$RUNNER_BIN" ]; then
246259
install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER_BIN"
247260
fi
248261
fi
249-
# For CUDA, add data_path argument (Metal embeds data in .pte)
262+
# For CUDA, add named data argument (Metal embeds data in .pte).
263+
# Llama runner uses --data_paths, other runners use --data_path.
250264
if [ "$DEVICE" = "cuda" ]; then
251-
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
265+
if [ "$RUNNER_PATH" = "llama" ]; then
266+
RUNNER_ARGS="$RUNNER_ARGS --data_paths ${MODEL_DIR}/aoti_cuda_blob.ptd"
267+
else
268+
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
269+
fi
252270
fi
253271

254272
# Add model-specific arguments
@@ -262,6 +280,15 @@ case "$MODEL_NAME" in
262280
gemma3)
263281
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
264282
;;
283+
qwen3)
284+
PROMPT_FILE="${MODEL_DIR}/qwen3_prompt.txt"
285+
cat > "${PROMPT_FILE}" << 'EOF'
286+
<|im_start|>user
287+
What is the capital of France?<|im_end|>
288+
<|im_start|>assistant
289+
EOF
290+
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --prompt_file ${PROMPT_FILE}"
291+
;;
265292
parakeet)
266293
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE"
267294
# For CUDA, add data_path argument (Metal embeds data in .pte)
@@ -272,16 +299,9 @@ case "$MODEL_NAME" in
272299
voxtral_realtime)
273300
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
274301
# Determine streaming mode based on MODE parameter
275-
USE_STREAMING="false"
276-
if [ "$MODE" = "vr-streaming" ]; then
277-
USE_STREAMING="true"
278-
elif [ "$MODE" = "vr-offline" ]; then
302+
USE_STREAMING="true"
303+
if [ "$MODE" = "vr-offline" ]; then
279304
USE_STREAMING="false"
280-
elif [ -z "$MODE" ]; then
281-
# Auto-detect: XNNPACK uses streaming, others use offline
282-
if [ "$DEVICE" = "xnnpack" ]; then
283-
USE_STREAMING="true"
284-
fi
285305
fi
286306
# Add streaming flag if needed
287307
if [ "$USE_STREAMING" = "true" ]; then

.ci/scripts/test_wheel_package_qnn.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,14 @@ EOF
8686
# ----------------------------
8787
echo "=== Building Wheel Package ==="
8888
source .ci/scripts/utils.sh
89+
90+
# Ensure QNN SDK is available so setup.py auto-detects it.
91+
source backends/qualcomm/scripts/install_qnn_sdk.sh
92+
install_qnn
93+
94+
# Make QNN SDK libraries available for runtime loading (e.g. libQnnHtp.so)
95+
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang/:${LD_LIBRARY_PATH:-}"
96+
8997
install_executorch
9098
EXECUTORCH_BUILDING_WHEEL=1 python setup.py bdist_wheel
9199
unset EXECUTORCH_BUILDING_WHEEL

.ci/scripts/wheel/pre_build_script.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,16 @@ fi
4444
# able to see the installed torch package.
4545

4646
"${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" --example
47+
48+
# Download Qualcomm QNN SDK on Linux x86_64 so the wheel build can include the
49+
# QNN backend. The SDK is large, so we download it here (outside CMake) rather
50+
# than during cmake configure.
51+
if [[ "$(uname -s)" == "Linux" && "$(uname -m)" == "x86_64" ]]; then
52+
echo "Downloading Qualcomm QNN SDK..."
53+
QNN_SDK_ROOT=$(python3 \
54+
"${GITHUB_WORKSPACE}/${REPOSITORY}/backends/qualcomm/scripts/download_qnn_sdk.py" \
55+
--print-sdk-path)
56+
export QNN_SDK_ROOT
57+
echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}" >> "${GITHUB_ENV}"
58+
echo "QNN SDK downloaded to ${QNN_SDK_ROOT}"
59+
fi

.ci/scripts/wheel/test_linux.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,25 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8+
import platform
9+
810
import test_base
911
from examples.models import Backend, Model
1012

1113
if __name__ == "__main__":
14+
# On Linux x86_64 the wheel is built with the Qualcomm backend.
15+
# Verify that it was registered correctly.
16+
if platform.system() == "Linux" and platform.machine() in ("x86_64", "amd64"):
17+
from executorch.extension.pybindings.portable_lib import (
18+
_get_registered_backend_names,
19+
)
20+
21+
registered = _get_registered_backend_names()
22+
assert (
23+
"QnnBackend" in registered
24+
), f"QnnBackend not found in registered backends: {registered}"
25+
print("✓ QnnBackend is registered")
26+
1227
test_base.run_tests(
1328
model_tests=[
1429
test_base.ModelTest(

0 commit comments

Comments
 (0)