
Commit 6bb983b

Add LLM support for cuda backend (#17316)
## Summary

This PR extends CUDA support to text-only LLM workflows and adds CI coverage for Qwen3-0.6B artifacts and pybind execution.

## Why

We already validate CUDA multimodal paths, but text-generation CUDA coverage (especially Qwen3) was incomplete. This change adds export/run support and CI wiring so that CUDA text-generation artifacts are exercised in automated tests.

## What changed

### CUDA LLM runner/build support

- Added `llama-cuda` and `llama-cuda-debug` Makefile targets.
- Added CUDA configure, build, and workflow presets in `examples/models/llama/CMakePresets.json`.
- Updated `examples/models/llama/CMakeLists.txt` to link the CUDA backend when `EXECUTORCH_BUILD_CUDA=ON`.
- Updated `examples/models/llama/main.cpp`:
  - Added a `--data_path` convenience flag (single PTD path).
  - Added `--prompt_file` support for file-based prompts.

### Gemma3 runner usability

- Updated `examples/models/gemma3/e2e_runner.cpp`:
  - Added `--max_new_tokens`.
  - Added `--stop_sequence` early-stop behavior.

### Optimum exporter integration and CI pin

- Bumped the optimum-executorch CI pin to `a9592258daacad7423fd5f39aaa59c6e36471520`.
- Added `Qwen/Qwen3-0.6B` handling to `.ci/scripts/export_model_artifact.sh` for `text-generation`.

### HuggingFace optimum CUDA test path

- Updated `test_text_generation` in `.ci/scripts/test_huggingface_optimum_model.py`:
  - Supports `recipe=cuda` export (`--device cuda --dtype bfloat16`).
  - Supports CUDA quantization on this path: `--qlinear 4w`, `--qlinear_packing_format tile_packed_to_4d`, `--qembedding 8w`.
  - Validates the presence of `aoti_cuda_blob.ptd`.
  - Passes the blob path into `TextLLMRunner`.

### CUDA workflow updates

- Updated `.github/workflows/cuda.yml`:
  - Added `Qwen/Qwen3-0.6B` to the CUDA export matrix.
  - Updated the `test-cuda-pybind` matrix to an explicit artifact mapping.
  - Added Qwen non-quantized and quantized-int4-tile-packed artifact runs to the pybind test.
  - Switched `download-artifact` to the matrix-provided artifact name.

## Validation

Relies on the new CI jobs.
1 parent 9f2f005 commit 6bb983b

9 files changed

Lines changed: 215 additions & 12 deletions

optimum-executorch CI commit pin

Lines changed: 1 addition & 1 deletion

```diff
@@ -1 +1 @@
-5bf1aeb587e9b1f3572b0bd60265c5dafd007b73
+a9592258daacad7423fd5f39aaa59c6e36471520
```

.ci/scripts/export_model_artifact.sh

Lines changed: 9 additions & 1 deletion

```diff
@@ -141,6 +141,14 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  Qwen/Qwen3-0.6B)
+    MODEL_NAME="qwen3"
+    TASK="text-generation"
+    MAX_SEQ_LEN="64"
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   nvidia/parakeet-tdt)
     MODEL_NAME="parakeet"
     TASK=""
@@ -159,7 +167,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
```
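For a text-generation CUDA export such as Qwen3, the script's output is what the pybind test below validates. A sketch of the expected artifact layout; the directory name is illustrative, while the file names come from this PR's test changes:

```bash
# Expected contents of a CUDA text-generation artifact directory
# (directory name is hypothetical; the file names are the ones asserted
# by test_huggingface_optimum_model.py below).
ls qwen3-cuda-artifacts/
# model.pte           - exported ExecuTorch program
# aoti_cuda_blob.ptd  - named-data blob carrying the AOTInductor CUDA payload
```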

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 31 additions & 2 deletions

```diff
@@ -142,21 +142,50 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only
             "--qembedding",
             "8w",
         ]
+    elif recipe == "cuda":
+        command += [
+            "--dtype",
+            "bfloat16",
+            "--device",
+            "cuda",
+        ]
+        if quantize:
+            command += [
+                "--qlinear",
+                "4w",
+                "--qlinear_packing_format",
+                "tile_packed_to_4d",
+                "--qembedding",
+                "8w",
+            ]
     else:
         assert (
             not quantize
-        ), "Quantization is only supported for XnnPack and CoreML recipes at the moment."
+        ), "Quantization is only supported for XnnPack, CoreML, and CUDA recipes at the moment."
 
     if not run_only:
         cli_export(command, model_dir)
 
+    if recipe == "cuda":
+        model_path = Path(model_dir) / "model.pte"
+        cuda_blob_path = Path(model_dir) / "aoti_cuda_blob.ptd"
+        assert model_path.exists(), f"Main model file not found: {model_path}"
+        assert cuda_blob_path.exists(), f"CUDA blob not found: {cuda_blob_path}"
+
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     saved_files = tokenizer.save_pretrained(model_dir)
     tokenizer_path = get_tokenizer_path(model_dir, saved_files)
 
     from executorch.extension.llm.runner import GenerationConfig, TextLLMRunner
 
-    runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
+    if recipe == "cuda":
+        runner = TextLLMRunner(
+            f"{model_dir}/model.pte",
+            tokenizer_path,
+            f"{model_dir}/aoti_cuda_blob.ptd",
+        )
+    else:
+        runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
     tokens = []
     runner.generate(
         "Simply put, the theory of relativity states that",
```

.ci/scripts/test_model_e2e.sh

Lines changed: 30 additions & 3 deletions

```diff
@@ -21,6 +21,7 @@ Arguments:
   - mistralai/Voxtral-Mini-3B-2507
   - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
   - google/gemma-3-4b-it
+  - Qwen/Qwen3-0.6B
   - nvidia/parakeet-tdt
   - mistralai/Voxtral-Mini-4B-Realtime-2602
 
@@ -151,6 +152,18 @@ case "$HF_MODEL" in
     AUDIO_FILE=""
     IMAGE_PATH="docs/source/_static/img/et-logo.png"
     ;;
+  Qwen/Qwen3-0.6B)
+    MODEL_NAME="qwen3"
+    RUNNER_TARGET="llama_main"
+    RUNNER_PATH="llama"
+    EXPECTED_OUTPUT="Paris"
+    PREPROCESSOR=""
+    TOKENIZER_URL="https://huggingface.co/Qwen/Qwen3-0.6B/resolve/main" # @lint-ignore
+    TOKENIZER_FILE=""
+    AUDIO_URL=""
+    AUDIO_FILE=""
+    IMAGE_PATH=""
+    ;;
   nvidia/parakeet-tdt)
     MODEL_NAME="parakeet"
     RUNNER_TARGET="parakeet_runner"
@@ -177,7 +190,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
@@ -246,9 +259,14 @@ if [ "$(uname -s)" = "Darwin" ] && [ -f "$RUNNER_BIN" ]; then
    install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER_BIN"
   fi
 fi
-# For CUDA, add data_path argument (Metal embeds data in .pte)
+# For CUDA, add named data argument (Metal embeds data in .pte).
+# Llama runner uses --data_paths, other runners use --data_path.
 if [ "$DEVICE" = "cuda" ]; then
-  RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
+  if [ "$RUNNER_PATH" = "llama" ]; then
+    RUNNER_ARGS="$RUNNER_ARGS --data_paths ${MODEL_DIR}/aoti_cuda_blob.ptd"
+  else
+    RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
+  fi
 fi
 
 # Add model-specific arguments
@@ -262,6 +280,15 @@ case "$MODEL_NAME" in
   gemma3)
     RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
     ;;
+  qwen3)
+    PROMPT_FILE="${MODEL_DIR}/qwen3_prompt.txt"
+    cat > "${PROMPT_FILE}" << 'EOF'
+<|im_start|>user
+What is the capital of France?<|im_end|>
+<|im_start|>assistant
+EOF
+    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --prompt_file ${PROMPT_FILE}"
+    ;;
   parakeet)
     RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE"
     # For CUDA, add data_path argument (Metal embeds data in .pte)
```
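Putting these pieces together, the Qwen3-on-CUDA invocation the script assembles looks roughly like the sketch below; `--model_path` is the runner's pre-existing model flag and the `MODEL_DIR` value is illustrative, both assumptions rather than lines from this diff:

```bash
# Approximate runner invocation assembled above for Qwen3 on CUDA.
# --model_path is assumed to be the runner's existing model flag;
# MODEL_DIR is illustrative.
MODEL_DIR=qwen3-cuda-artifacts
./cmake-out/examples/models/llama/llama_main \
  --model_path "${MODEL_DIR}/model.pte" \
  --data_paths "${MODEL_DIR}/aoti_cuda_blob.ptd" \
  --tokenizer_path "${MODEL_DIR}/" \
  --prompt_file "${MODEL_DIR}/qwen3_prompt.txt"
# The script then checks the output for EXPECTED_OUTPUT ("Paris").
```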

.github/workflows/cuda.yml

Lines changed: 17 additions & 4 deletions

```diff
@@ -138,6 +138,8 @@ jobs:
           name: "whisper-large-v3-turbo"
         - repo: "google"
           name: "gemma-3-4b-it"
+        - repo: "Qwen"
+          name: "Qwen3-0.6B"
        - repo: "nvidia"
           name: "parakeet-tdt"
       quant:
@@ -236,12 +238,23 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        model: ["gemma3-4b"]
-        quantize: ["", "--quantize"]
+        include:
+          - model: "gemma3-4b"
+            quantize: ""
+            artifact: "google-gemma-3-4b-it-cuda-non-quantized"
+          - model: "gemma3-4b"
+            quantize: "--quantize"
+            artifact: "google-gemma-3-4b-it-cuda-quantized-int4-tile-packed"
+          - model: "qwen3-0.6b"
+            quantize: ""
+            artifact: "Qwen-Qwen3-0.6B-cuda-non-quantized"
+          - model: "qwen3-0.6b"
+            quantize: "--quantize"
+            artifact: "Qwen-Qwen3-0.6B-cuda-quantized-int4-tile-packed"
     with:
       timeout: 120
       secrets-env: EXECUTORCH_HF_TOKEN
-      download-artifact: google-gemma-3-4b-it-cuda-${{ matrix.quantize && 'quantized-int4-tile-packed' || 'non-quantized' }}
+      download-artifact: ${{ matrix.artifact }}
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
@@ -280,7 +293,7 @@ jobs:
        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
        echo "::endgroup::"
 
-       echo "::group::Test CUDA Multimodal: ${{ matrix.model }} ${{ matrix.quantize }}"
+       echo "::group::Test CUDA Model: ${{ matrix.model }} ${{ matrix.quantize }}"
        python .ci/scripts/test_huggingface_optimum_model.py \
          --model ${{ matrix.model }} \
          --recipe cuda \
```
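The matrix rows map one-to-one onto test invocations. To reproduce the new Qwen3 pybind legs locally (assuming the exported artifacts are already in place and that the matrix's `--quantize` string is passed through to the script verbatim):

```bash
# Local equivalent of the new qwen3-0.6b CI legs; the quantized variant
# forwards the matrix's "--quantize" value to the test script.
python .ci/scripts/test_huggingface_optimum_model.py \
  --model qwen3-0.6b --recipe cuda              # non-quantized
python .ci/scripts/test_huggingface_optimum_model.py \
  --model qwen3-0.6b --recipe cuda --quantize   # int4-tile-packed
```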

Makefile

Lines changed: 21 additions & 1 deletion

```diff
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -110,6 +110,8 @@ help:
 	@echo "  parakeet-metal   - Build Parakeet runner with Metal backend (macOS only)"
 	@echo "  sortformer-cpu   - Build Sortformer runner with CPU backend"
 	@echo "  silero-vad-cpu   - Build Silero VAD runner with CPU backend"
+	@echo "  llama-cuda       - Build Llama runner with CUDA backend"
+	@echo "  llama-cuda-debug - Build Llama runner with CUDA backend (debug mode)"
 	@echo "  llama-cpu        - Build Llama runner with CPU backend"
 	@echo "  llava-cpu        - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda      - Build Gemma3 runner with CUDA backend"
@@ -265,6 +267,24 @@ llama-cpu:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/llama/llama_main"
 
+llama-cuda:
+	@echo "==> Building and installing ExecuTorch with CUDA..."
+	cmake --workflow --preset llm-release-cuda
+	@echo "==> Building Llama runner with CUDA..."
+	cd examples/models/llama && cmake --workflow --preset llama-cuda
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/llama/llama_main"
+
+llama-cuda-debug:
+	@echo "==> Building and installing ExecuTorch with CUDA (debug mode)..."
+	cmake --workflow --preset llm-debug-cuda
+	@echo "==> Building Llama runner with CUDA (debug mode)..."
+	cd examples/models/llama && cmake --workflow --preset llama-cuda-debug
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/llama/llama_main"
+
 llava-cpu:
 	@echo "==> Building and installing ExecuTorch..."
 	cmake --workflow --preset llm-release
```
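Both new targets chain a top-level CUDA install preset with the runner's workflow preset and drop the binary in the same place as `llama-cpu`:

```bash
# Using the new targets; each runs the top-level CUDA install preset and
# then the runner preset, producing the same binary path in both modes.
make llama-cuda          # llm-release-cuda + llama-cuda presets
make llama-cuda-debug    # llm-debug-cuda + llama-cuda-debug presets
ls cmake-out/examples/models/llama/llama_main
```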

examples/models/llama/CMakeLists.txt

Lines changed: 9 additions & 0 deletions

```diff
@@ -163,6 +163,15 @@ if(TARGET xnnpack_backend)
   executorch_target_link_options_shared_lib(xnnpack_backend)
 endif()
 
+# CUDA backend
+if(EXECUTORCH_BUILD_CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  list(APPEND link_libraries aoti_cuda_backend)
+  if(NOT MSVC)
+    executorch_target_link_options_shared_lib(aoti_cuda_backend)
+  endif()
+endif()
+
 # Vulkan backend
 if(TARGET vulkan_backend)
   list(APPEND link_libraries vulkan_backend)
```
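For reference, a direct (non-preset) configure that exercises this branch might look like the sketch below; it assumes ExecuTorch has already been built and installed into `cmake-out`, which is what the new presets encode via `CMAKE_FIND_ROOT_PATH`:

```bash
# Hedged sketch of a manual configure hitting the EXECUTORCH_BUILD_CUDA
# branch above; the presets added below wrap the same cache variables.
cmake -S examples/models/llama -B cmake-out/examples/models/llama \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_FIND_ROOT_PATH="$(pwd)/cmake-out" \
  -DEXECUTORCH_BUILD_CUDA=ON
cmake --build cmake-out/examples/models/llama --target llama_main
```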

examples/models/llama/CMakePresets.json

Lines changed: 70 additions & 0 deletions

```diff
@@ -18,6 +18,36 @@
         "CMAKE_BUILD_TYPE": "Debug",
         "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out"
       }
+    },
+    {
+      "name": "llama-cuda-debug",
+      "displayName": "Llama runner in Debug mode with CUDA backend",
+      "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/llama",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Debug",
+        "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
+        "EXECUTORCH_BUILD_CUDA": "ON"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Linux", "Windows"]
+      }
+    },
+    {
+      "name": "llama-cuda",
+      "displayName": "Llama runner with CUDA backend",
+      "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/llama",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
+        "EXECUTORCH_BUILD_CUDA": "ON"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Linux", "Windows"]
+      }
     }
   ],
   "buildPresets": [
@@ -32,6 +62,18 @@
       "displayName": "Build Llama runner in Debug mode",
       "configurePreset": "llama-debug",
       "targets": ["llama_main"]
+    },
+    {
+      "name": "llama-cuda-debug",
+      "displayName": "Build Llama runner in Debug mode with CUDA backend",
+      "configurePreset": "llama-cuda-debug",
+      "targets": ["llama_main"]
+    },
+    {
+      "name": "llama-cuda",
+      "displayName": "Build Llama runner with CUDA backend",
+      "configurePreset": "llama-cuda",
+      "targets": ["llama_main"]
     }
   ],
   "workflowPresets": [
@@ -62,6 +104,34 @@
           "name": "llama-debug"
         }
       ]
+    },
+    {
+      "name": "llama-cuda-debug",
+      "displayName": "Configure and build Llama runner in Debug mode with CUDA backend",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "llama-cuda-debug"
+        },
+        {
+          "type": "build",
+          "name": "llama-cuda-debug"
+        }
+      ]
+    },
+    {
+      "name": "llama-cuda",
+      "displayName": "Configure and build Llama runner with CUDA backend",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "llama-cuda"
+        },
+        {
+          "type": "build",
+          "name": "llama-cuda"
+        }
+      ]
     }
   ]
 }
```
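These workflow presets are what the new Make targets call after the top-level CUDA install; they can also be driven directly:

```bash
# Driving the new workflow presets directly (requires a prior ExecuTorch
# CUDA install into cmake-out, which the Make targets perform first).
cd examples/models/llama
cmake --workflow --preset llama-cuda         # Release configure + build
cmake --workflow --preset llama-cuda-debug   # Debug configure + build
```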

examples/models/llama/main.cpp

Lines changed: 27 additions & 0 deletions

```diff
@@ -9,6 +9,7 @@
 
 #include <executorch/examples/models/llama/runner/runner.h>
 #include <gflags/gflags.h>
+#include <fstream>
 #include <sstream>
 #include <vector>
 
@@ -34,6 +35,10 @@ DEFINE_string(
 DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
 
 DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
+DEFINE_string(
+    prompt_file,
+    "",
+    "Optional path to a file containing the prompt. If set, this overrides --prompt.");
 
 DEFINE_double(
     temperature,
@@ -102,6 +107,17 @@ std::vector<std::string> parseStringList(const std::string& input) {
   return result;
 }
 
+bool readFileToString(const std::string& path, std::string& out) {
+  std::ifstream file(path, std::ios::in | std::ios::binary);
+  if (!file) {
+    return false;
+  }
+  std::ostringstream ss;
+  ss << file.rdbuf();
+  out = ss.str();
+  return true;
+}
+
 int32_t main(int32_t argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
@@ -114,7 +130,18 @@ int32_t main(int32_t argc, char** argv) {
 
   const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
 
+  std::string prompt_storage;
   const char* prompt = FLAGS_prompt.c_str();
+  if (!FLAGS_prompt_file.empty()) {
+    if (!readFileToString(FLAGS_prompt_file, prompt_storage)) {
+      ET_LOG(
+          Error,
+          "Failed to read prompt file at path: %s",
+          FLAGS_prompt_file.c_str());
+      return 1;
+    }
+    prompt = prompt_storage.c_str();
+  }
 
   float temperature = FLAGS_temperature;
 
```
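With `--prompt_file`, multi-line chat-template prompts no longer need shell escaping; the file contents simply replace `--prompt`. A usage sketch (the `--model_path` flag is the runner's pre-existing model argument, assumed here; paths are illustrative):

```bash
# Hedged usage sketch of the new flag: the file contents override --prompt.
printf '<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n' \
  > prompt.txt
./cmake-out/examples/models/llama/llama_main \
  --model_path model.pte \
  --tokenizer_path tokenizer.bin \
  --prompt_file prompt.txt
```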