Commit 2560270

Add text-only LLM CI jobs for CUDA

1 parent 7a4086b · commit 2560270

4 files changed

Lines changed: 58 additions & 8 deletions

Lines changed: 1 addition & 1 deletion

```diff
@@ -1 +1 @@
-5bf1aeb587e9b1f3572b0bd60265c5dafd007b73
+a9592258daacad7423fd5f39aaa59c6e36471520
```
.ci/scripts/export_model_artifact.sh

Lines changed: 9 additions & 1 deletion

```diff
@@ -141,6 +141,14 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  Qwen/Qwen3-0.6B)
+    MODEL_NAME="qwen3"
+    TASK="text-generation"
+    MAX_SEQ_LEN="64"
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   nvidia/parakeet-tdt)
     MODEL_NAME="parakeet"
     TASK=""
@@ -159,7 +167,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
```
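Taken together, the case arms amount to a small lookup table from Hugging Face model id to export settings. A minimal Python sketch of that mapping with the new Qwen/Qwen3-0.6B entry (the dict layout and the `lookup` helper are illustrative, not part of the shell script):

```python
# Illustration only: the shell case statement expressed as a dict.
# Keys and values mirror the diff above; everything else is hypothetical.
EXPORT_SETTINGS = {
    "Qwen/Qwen3-0.6B": {
        "MODEL_NAME": "qwen3",
        "TASK": "text-generation",
        "MAX_SEQ_LEN": "64",
        "EXTRA_PIP": "",
        "PREPROCESSOR_FEATURE_SIZE": "",
        "PREPROCESSOR_OUTPUT": "",
    },
    # ... other supported models elided ...
}

def lookup(hf_model: str) -> dict:
    """Return export settings, mimicking the script's `*)` fallback."""
    try:
        return EXPORT_SETTINGS[hf_model]
    except KeyError:
        raise SystemExit(f"Error: Unsupported model '{hf_model}'")
```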

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 31 additions & 2 deletions

```diff
@@ -142,21 +142,50 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only
             "--qembedding",
             "8w",
         ]
+    elif recipe == "cuda":
+        command += [
+            "--dtype",
+            "bfloat16",
+            "--device",
+            "cuda",
+        ]
+        if quantize:
+            command += [
+                "--qlinear",
+                "4w",
+                "--qlinear_packing_format",
+                "tile_packed_to_4d",
+                "--qembedding",
+                "8w",
+            ]
     else:
         assert (
             not quantize
-        ), "Quantization is only supported for XnnPack and CoreML recipes at the moment."
+        ), "Quantization is only supported for XnnPack, CoreML, and CUDA recipes at the moment."
 
     if not run_only:
         cli_export(command, model_dir)
 
+    if recipe == "cuda":
+        model_path = Path(model_dir) / "model.pte"
+        cuda_blob_path = Path(model_dir) / "aoti_cuda_blob.ptd"
+        assert model_path.exists(), f"Main model file not found: {model_path}"
+        assert cuda_blob_path.exists(), f"CUDA blob not found: {cuda_blob_path}"
+
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     saved_files = tokenizer.save_pretrained(model_dir)
     tokenizer_path = get_tokenizer_path(model_dir, saved_files)
 
     from executorch.extension.llm.runner import GenerationConfig, TextLLMRunner
 
-    runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
+    if recipe == "cuda":
+        runner = TextLLMRunner(
+            f"{model_dir}/model.pte",
+            tokenizer_path,
+            f"{model_dir}/aoti_cuda_blob.ptd",
+        )
+    else:
+        runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
     tokens = []
     runner.generate(
         "Simply put, the theory of relativity states that",
```
.github/workflows/cuda.yml

Lines changed: 17 additions & 4 deletions

```diff
@@ -138,6 +138,8 @@ jobs:
             name: "whisper-large-v3-turbo"
           - repo: "google"
             name: "gemma-3-4b-it"
+          - repo: "Qwen"
+            name: "Qwen3-0.6B"
           - repo: "nvidia"
             name: "parakeet-tdt"
         quant:
@@ -236,12 +238,23 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        model: ["gemma3-4b"]
-        quantize: ["", "--quantize"]
+        include:
+          - model: "gemma3-4b"
+            quantize: ""
+            artifact: "google-gemma-3-4b-it-cuda-non-quantized"
+          - model: "gemma3-4b"
+            quantize: "--quantize"
+            artifact: "google-gemma-3-4b-it-cuda-quantized-int4-tile-packed"
+          - model: "qwen3-0.6b"
+            quantize: ""
+            artifact: "Qwen-Qwen3-0.6B-cuda-non-quantized"
+          - model: "qwen3-0.6b"
+            quantize: "--quantize"
+            artifact: "Qwen-Qwen3-0.6B-cuda-quantized-int4-tile-packed"
     with:
       timeout: 120
       secrets-env: EXECUTORCH_HF_TOKEN
-      download-artifact: google-gemma-3-4b-it-cuda-${{ matrix.quantize && 'quantized-int4-tile-packed' || 'non-quantized' }}
+      download-artifact: ${{ matrix.artifact }}
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
@@ -280,7 +293,7 @@ jobs:
       pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
       echo "::endgroup::"
 
-      echo "::group::Test CUDA Multimodal: ${{ matrix.model }} ${{ matrix.quantize }}"
+      echo "::group::Test CUDA Model: ${{ matrix.model }} ${{ matrix.quantize }}"
       python .ci/scripts/test_huggingface_optimum_model.py \
         --model ${{ matrix.model }} \
         --recipe cuda \
```
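The move from two matrix axes to an explicit include list is what makes per-model artifact names possible: the old download-artifact expression could only toggle the suffix for a single hard-coded model. A small sketch of the naming convention the hand-written include entries follow (illustration only; the workflow pins these strings literally):

```python
# Illustration: how the hard-coded artifact names in the include list
# line up with a "<repo>-<name>-cuda-<suffix>" convention.
def artifact_name(repo: str, name: str, quantize: bool) -> str:
    suffix = "quantized-int4-tile-packed" if quantize else "non-quantized"
    return f"{repo}-{name}-cuda-{suffix}"

assert artifact_name("Qwen", "Qwen3-0.6B", False) == "Qwen-Qwen3-0.6B-cuda-non-quantized"
assert artifact_name("google", "gemma-3-4b-it", True) == (
    "google-gemma-3-4b-it-cuda-quantized-int4-tile-packed"
)
```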
