
Commit 2c73c84

Merge branch 'main' into mlx-delegate

2 parents d50e8d6 + 25f2a3f

79 files changed: 1638 additions & 964 deletions

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-5bf1aeb587e9b1f3572b0bd60265c5dafd007b73
+a9592258daacad7423fd5f39aaa59c6e36471520

.ci/scripts/export_model_artifact.sh

Lines changed: 9 additions & 1 deletion
@@ -141,6 +141,14 @@ case "$HF_MODEL" in
         PREPROCESSOR_FEATURE_SIZE=""
         PREPROCESSOR_OUTPUT=""
         ;;
+    Qwen/Qwen3-0.6B)
+        MODEL_NAME="qwen3"
+        TASK="text-generation"
+        MAX_SEQ_LEN="64"
+        EXTRA_PIP=""
+        PREPROCESSOR_FEATURE_SIZE=""
+        PREPROCESSOR_OUTPUT=""
+        ;;
     nvidia/parakeet-tdt)
         MODEL_NAME="parakeet"
         TASK=""

@@ -159,7 +167,7 @@ case "$HF_MODEL" in
         ;;
     *)
         echo "Error: Unsupported model '$HF_MODEL'"
-        echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
+        echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
         exit 1
         ;;
 esac

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 31 additions & 2 deletions
@@ -142,21 +142,50 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only
             "--qembedding",
             "8w",
         ]
+    elif recipe == "cuda":
+        command += [
+            "--dtype",
+            "bfloat16",
+            "--device",
+            "cuda",
+        ]
+        if quantize:
+            command += [
+                "--qlinear",
+                "4w",
+                "--qlinear_packing_format",
+                "tile_packed_to_4d",
+                "--qembedding",
+                "8w",
+            ]
     else:
         assert (
             not quantize
-        ), "Quantization is only supported for XnnPack and CoreML recipes at the moment."
+        ), "Quantization is only supported for XnnPack, CoreML, and CUDA recipes at the moment."

     if not run_only:
         cli_export(command, model_dir)

+    if recipe == "cuda":
+        model_path = Path(model_dir) / "model.pte"
+        cuda_blob_path = Path(model_dir) / "aoti_cuda_blob.ptd"
+        assert model_path.exists(), f"Main model file not found: {model_path}"
+        assert cuda_blob_path.exists(), f"CUDA blob not found: {cuda_blob_path}"
+
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     saved_files = tokenizer.save_pretrained(model_dir)
     tokenizer_path = get_tokenizer_path(model_dir, saved_files)

     from executorch.extension.llm.runner import GenerationConfig, TextLLMRunner

-    runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
+    if recipe == "cuda":
+        runner = TextLLMRunner(
+            f"{model_dir}/model.pte",
+            tokenizer_path,
+            f"{model_dir}/aoti_cuda_blob.ptd",
+        )
+    else:
+        runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
     tokens = []
     runner.generate(
         "Simply put, the theory of relativity states that",

.ci/scripts/test_model_e2e.sh

Lines changed: 30 additions & 3 deletions
@@ -21,6 +21,7 @@ Arguments:
   - mistralai/Voxtral-Mini-3B-2507
   - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
   - google/gemma-3-4b-it
+  - Qwen/Qwen3-0.6B
   - nvidia/parakeet-tdt
   - mistralai/Voxtral-Mini-4B-Realtime-2602

@@ -151,6 +152,18 @@ case "$HF_MODEL" in
         AUDIO_FILE=""
         IMAGE_PATH="docs/source/_static/img/et-logo.png"
         ;;
+    Qwen/Qwen3-0.6B)
+        MODEL_NAME="qwen3"
+        RUNNER_TARGET="llama_main"
+        RUNNER_PATH="llama"
+        EXPECTED_OUTPUT="Paris"
+        PREPROCESSOR=""
+        TOKENIZER_URL="https://huggingface.co/Qwen/Qwen3-0.6B/resolve/main" # @lint-ignore
+        TOKENIZER_FILE=""
+        AUDIO_URL=""
+        AUDIO_FILE=""
+        IMAGE_PATH=""
+        ;;
     nvidia/parakeet-tdt)
         MODEL_NAME="parakeet"
         RUNNER_TARGET="parakeet_runner"

@@ -177,7 +190,7 @@ case "$HF_MODEL" in
         ;;
     *)
         echo "Error: Unsupported model '$HF_MODEL'"
-        echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
+        echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
         exit 1
         ;;
 esac

@@ -246,9 +259,14 @@ if [ "$(uname -s)" = "Darwin" ] && [ -f "$RUNNER_BIN" ]; then
     install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER_BIN"
   fi
 fi
-# For CUDA, add data_path argument (Metal embeds data in .pte)
+# For CUDA, add named data argument (Metal embeds data in .pte).
+# Llama runner uses --data_paths, other runners use --data_path.
 if [ "$DEVICE" = "cuda" ]; then
-  RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
+  if [ "$RUNNER_PATH" = "llama" ]; then
+    RUNNER_ARGS="$RUNNER_ARGS --data_paths ${MODEL_DIR}/aoti_cuda_blob.ptd"
+  else
+    RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
+  fi
 fi

 # Add model-specific arguments

@@ -262,6 +280,15 @@ case "$MODEL_NAME" in
     gemma3)
         RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
         ;;
+    qwen3)
+        PROMPT_FILE="${MODEL_DIR}/qwen3_prompt.txt"
+        cat > "${PROMPT_FILE}" << 'EOF'
+<|im_start|>user
+What is the capital of France?<|im_end|>
+<|im_start|>assistant
+EOF
+        RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --prompt_file ${PROMPT_FILE}"
+        ;;
     parakeet)
         RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE"
         # For CUDA, add data_path argument (Metal embeds data in .pte)
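The heredoc in the qwen3 case writes the ChatML prompt by hand. As a cross-check, the same framing can be produced from the model's own chat template; a sketch, assuming the Qwen/Qwen3-0.6B tokenizer on the Hub ships a ChatML template:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "What is the capital of France?"}],
    tokenize=False,
    add_generation_prompt=True,  # appends the opening "<|im_start|>assistant" turn
)
print(prompt)  # should match the <|im_start|>/<|im_end|> framing in the heredoc above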

.ci/scripts/test_wheel_package_qnn.sh

Lines changed: 8 additions & 0 deletions
@@ -86,6 +86,14 @@ EOF
 # ----------------------------
 echo "=== Building Wheel Package ==="
 source .ci/scripts/utils.sh
+
+# Ensure QNN SDK is available so setup.py auto-detects it.
+source backends/qualcomm/scripts/install_qnn_sdk.sh
+install_qnn
+
+# Make QNN SDK libraries available for runtime loading (e.g. libQnnHtp.so)
+export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang/:${LD_LIBRARY_PATH:-}"
+
 install_executorch
 EXECUTORCH_BUILDING_WHEEL=1 python setup.py bdist_wheel
 unset EXECUTORCH_BUILDING_WHEEL
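The LD_LIBRARY_PATH export matters because the QNN delegate loads the SDK's shared libraries at runtime rather than at link time. A quick standalone check that the path is wired up (a sketch, not part of the script, and loading may still fail if libQnnHtp.so has unresolved dependencies of its own):

import ctypes

# Resolves through LD_LIBRARY_PATH; raises OSError if the QNN SDK
# libraries are not visible to the dynamic loader.
ctypes.CDLL("libQnnHtp.so")
print("libQnnHtp.so is loadable")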

.ci/scripts/wheel/pre_build_script.sh

Lines changed: 13 additions & 0 deletions
@@ -44,3 +44,16 @@ fi
 # able to see the installed torch package.

 "${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" --example
+
+# Download Qualcomm QNN SDK on Linux x86_64 so the wheel build can include the
+# QNN backend. The SDK is large, so we download it here (outside CMake) rather
+# than during cmake configure.
+if [[ "$(uname -s)" == "Linux" && "$(uname -m)" == "x86_64" ]]; then
+  echo "Downloading Qualcomm QNN SDK..."
+  QNN_SDK_ROOT=$(python3 \
+    "${GITHUB_WORKSPACE}/${REPOSITORY}/backends/qualcomm/scripts/download_qnn_sdk.py" \
+    --print-sdk-path)
+  export QNN_SDK_ROOT
+  echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}" >> "${GITHUB_ENV}"
+  echo "QNN SDK downloaded to ${QNN_SDK_ROOT}"
+fi
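The contract here is that download_qnn_sdk.py prints the SDK root on stdout when passed --print-sdk-path. The equivalent call from Python, as a sketch with the path assumed relative to the repository root:

import subprocess

sdk_root = subprocess.run(
    ["python3", "backends/qualcomm/scripts/download_qnn_sdk.py", "--print-sdk-path"],
    check=True,
    capture_output=True,
    text=True,
).stdout.strip()
print(f"QNN_SDK_ROOT={sdk_root}")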

.ci/scripts/wheel/test_linux.py

Lines changed: 15 additions & 0 deletions
@@ -5,10 +5,25 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import platform
+
 import test_base
 from examples.models import Backend, Model

 if __name__ == "__main__":
+    # On Linux x86_64 the wheel is built with the Qualcomm backend.
+    # Verify that it was registered correctly.
+    if platform.system() == "Linux" and platform.machine() in ("x86_64", "amd64"):
+        from executorch.extension.pybindings.portable_lib import (
+            _get_registered_backend_names,
+        )
+
+        registered = _get_registered_backend_names()
+        assert (
+            "QnnBackend" in registered
+        ), f"QnnBackend not found in registered backends: {registered}"
+        print("✓ QnnBackend is registered")
+
     test_base.run_tests(
         model_tests=[
             test_base.ModelTest(

.github/workflows/cuda.yml

Lines changed: 17 additions & 4 deletions
@@ -138,6 +138,8 @@ jobs:
           name: "whisper-large-v3-turbo"
         - repo: "google"
           name: "gemma-3-4b-it"
+        - repo: "Qwen"
+          name: "Qwen3-0.6B"
         - repo: "nvidia"
           name: "parakeet-tdt"
       quant:

@@ -236,12 +238,23 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        model: ["gemma3-4b"]
-        quantize: ["", "--quantize"]
+        include:
+          - model: "gemma3-4b"
+            quantize: ""
+            artifact: "google-gemma-3-4b-it-cuda-non-quantized"
+          - model: "gemma3-4b"
+            quantize: "--quantize"
+            artifact: "google-gemma-3-4b-it-cuda-quantized-int4-tile-packed"
+          - model: "qwen3-0.6b"
+            quantize: ""
+            artifact: "Qwen-Qwen3-0.6B-cuda-non-quantized"
+          - model: "qwen3-0.6b"
+            quantize: "--quantize"
+            artifact: "Qwen-Qwen3-0.6B-cuda-quantized-int4-tile-packed"
     with:
       timeout: 120
       secrets-env: EXECUTORCH_HF_TOKEN
-      download-artifact: google-gemma-3-4b-it-cuda-${{ matrix.quantize && 'quantized-int4-tile-packed' || 'non-quantized' }}
+      download-artifact: ${{ matrix.artifact }}
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6

@@ -280,7 +293,7 @@ jobs:
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         echo "::endgroup::"

-        echo "::group::Test CUDA Multimodal: ${{ matrix.model }} ${{ matrix.quantize }}"
+        echo "::group::Test CUDA Model: ${{ matrix.model }} ${{ matrix.quantize }}"
         python .ci/scripts/test_huggingface_optimum_model.py \
           --model ${{ matrix.model }} \
           --recipe cuda \
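The explicit include entries replace a cross-product matrix whose download-artifact expression hard-coded the gemma prefix and could not name artifacts for a second model. The names still follow a single convention, sketched below with a hypothetical helper for illustration only:

def artifact_name(repo: str, name: str, quantize: bool) -> str:
    # e.g. ("Qwen", "Qwen3-0.6B", False) -> "Qwen-Qwen3-0.6B-cuda-non-quantized"
    suffix = "quantized-int4-tile-packed" if quantize else "non-quantized"
    return f"{repo}-{name}-cuda-{suffix}"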

.github/workflows/pull.yml

Lines changed: 0 additions & 1 deletion
@@ -16,7 +16,6 @@ jobs:
   test-qnn-wheel-packages-linux:
     name: test-qnn-wheel-packages-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    if: false
     permissions:
       id-token: write
       contents: read

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
@@ -222,6 +222,7 @@ exclude_patterns = [
     'extension/llm/tokenizers',
     'extension/llm/tokenizers/**',
     'examples/cuda',
+    'kernels/portable',
     # File contains @generated
     'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
     'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
