Commit 2560270

Add text-only LLM CI jobs for CUDA

1 parent 7a4086b · commit 2560270

4 files changed

Lines changed: 58 additions & 8 deletions

Lines changed: 1 addition & 1 deletion

```diff
@@ -1 +1 @@
-5bf1aeb587e9b1f3572b0bd60265c5dafd007b73
+a9592258daacad7423fd5f39aaa59c6e36471520
```
.ci/scripts/export_model_artifact.sh

Lines changed: 9 additions & 1 deletion

```diff
@@ -141,6 +141,14 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  Qwen/Qwen3-0.6B)
+    MODEL_NAME="qwen3"
+    TASK="text-generation"
+    MAX_SEQ_LEN="64"
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   nvidia/parakeet-tdt)
     MODEL_NAME="parakeet"
     TASK=""
@@ -159,7 +167,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
```
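Taken together, the case arms amount to a small lookup table from Hugging Face model id to export settings. A minimal Python sketch of that mapping with the new Qwen/Qwen3-0.6B entry (the dict layout and the `lookup` helper are illustrative, not part of the shell script):

```python
# Illustration only: the shell case statement expressed as a dict.
# Keys and values mirror the diff above; everything else is hypothetical.
EXPORT_SETTINGS = {
    "Qwen/Qwen3-0.6B": {
        "MODEL_NAME": "qwen3",
        "TASK": "text-generation",
        "MAX_SEQ_LEN": "64",
        "EXTRA_PIP": "",
        "PREPROCESSOR_FEATURE_SIZE": "",
        "PREPROCESSOR_OUTPUT": "",
    },
    # ... other supported models elided ...
}

def lookup(hf_model: str) -> dict:
    """Return export settings, mimicking the script's `*)` fallback."""
    try:
        return EXPORT_SETTINGS[hf_model]
    except KeyError:
        raise SystemExit(f"Error: Unsupported model '{hf_model}'")
```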

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 31 additions & 2 deletions

```diff
@@ -142,21 +142,50 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only
             "--qembedding",
             "8w",
         ]
+    elif recipe == "cuda":
+        command += [
+            "--dtype",
+            "bfloat16",
+            "--device",
+            "cuda",
+        ]
+        if quantize:
+            command += [
+                "--qlinear",
+                "4w",
+                "--qlinear_packing_format",
+                "tile_packed_to_4d",
+                "--qembedding",
+                "8w",
+            ]
     else:
         assert (
             not quantize
-        ), "Quantization is only supported for XnnPack and CoreML recipes at the moment."
+        ), "Quantization is only supported for XnnPack, CoreML, and CUDA recipes at the moment."
 
     if not run_only:
         cli_export(command, model_dir)
 
+    if recipe == "cuda":
+        model_path = Path(model_dir) / "model.pte"
+        cuda_blob_path = Path(model_dir) / "aoti_cuda_blob.ptd"
+        assert model_path.exists(), f"Main model file not found: {model_path}"
+        assert cuda_blob_path.exists(), f"CUDA blob not found: {cuda_blob_path}"
+
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     saved_files = tokenizer.save_pretrained(model_dir)
     tokenizer_path = get_tokenizer_path(model_dir, saved_files)
 
     from executorch.extension.llm.runner import GenerationConfig, TextLLMRunner
 
-    runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
+    if recipe == "cuda":
+        runner = TextLLMRunner(
+            f"{model_dir}/model.pte",
+            tokenizer_path,
+            f"{model_dir}/aoti_cuda_blob.ptd",
+        )
+    else:
+        runner = TextLLMRunner(f"{model_dir}/model.pte", tokenizer_path)
     tokens = []
     runner.generate(
         "Simply put, the theory of relativity states that",
```
.github/workflows/cuda.yml

Lines changed: 17 additions & 4 deletions

```diff
@@ -138,6 +138,8 @@ jobs:
             name: "whisper-large-v3-turbo"
           - repo: "google"
             name: "gemma-3-4b-it"
+          - repo: "Qwen"
+            name: "Qwen3-0.6B"
           - repo: "nvidia"
             name: "parakeet-tdt"
         quant:
@@ -236,12 +238,23 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        model: ["gemma3-4b"]
-        quantize: ["", "--quantize"]
+        include:
+          - model: "gemma3-4b"
+            quantize: ""
+            artifact: "google-gemma-3-4b-it-cuda-non-quantized"
+          - model: "gemma3-4b"
+            quantize: "--quantize"
+            artifact: "google-gemma-3-4b-it-cuda-quantized-int4-tile-packed"
+          - model: "qwen3-0.6b"
+            quantize: ""
+            artifact: "Qwen-Qwen3-0.6B-cuda-non-quantized"
+          - model: "qwen3-0.6b"
+            quantize: "--quantize"
+            artifact: "Qwen-Qwen3-0.6B-cuda-quantized-int4-tile-packed"
     with:
       timeout: 120
       secrets-env: EXECUTORCH_HF_TOKEN
-      download-artifact: google-gemma-3-4b-it-cuda-${{ matrix.quantize && 'quantized-int4-tile-packed' || 'non-quantized' }}
+      download-artifact: ${{ matrix.artifact }}
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
@@ -280,7 +293,7 @@ jobs:
       pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
       echo "::endgroup::"
 
-      echo "::group::Test CUDA Multimodal: ${{ matrix.model }} ${{ matrix.quantize }}"
+      echo "::group::Test CUDA Model: ${{ matrix.model }} ${{ matrix.quantize }}"
       python .ci/scripts/test_huggingface_optimum_model.py \
         --model ${{ matrix.model }} \
         --recipe cuda \
```
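The move from two matrix axes to an explicit include list is what makes per-model artifact names possible: the old download-artifact expression could only toggle the suffix for a single hard-coded model. A small sketch of the naming convention the hand-written include entries follow (illustration only; the workflow pins these strings literally):

```python
# Illustration: how the hard-coded artifact names in the include list
# line up with a "<repo>-<name>-cuda-<suffix>" convention.
def artifact_name(repo: str, name: str, quantize: bool) -> str:
    suffix = "quantized-int4-tile-packed" if quantize else "non-quantized"
    return f"{repo}-{name}-cuda-{suffix}"

assert artifact_name("Qwen", "Qwen3-0.6B", False) == "Qwen-Qwen3-0.6B-cuda-non-quantized"
assert artifact_name("google", "gemma-3-4b-it", True) == (
    "google-gemma-3-4b-it-cuda-quantized-int4-tile-packed"
)
```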
