Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/llm_finetune/baichuan/baichuan_2_7b_squad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,9 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
recipe_owner: adil-a
time: "00:45:00"
vllm_deploy: true
recipe_owner: adil-a
checkpoint_robustness:
hf_kl_threshold: 5e-3
distributed.tp_size: 2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,9 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
recipe_owner: adil-a
time: "00:45:00"
vllm_deploy: true
recipe_owner: adil-a
checkpoint_robustness:
hf_kl_threshold: 5e-3
trust_remote_code: true
Expand Down
1 change: 1 addition & 0 deletions examples/llm_finetune/gemma/gemma_3_270m_squad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ optimizer:
# min_lr: 1.0e-5

ci:
vllm_deploy: true
recipe_owner: HuiyingLi
time: "00:20:00"
checkpoint_robustness:
Expand Down
1 change: 1 addition & 0 deletions examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ optimizer:
# min_lr: 1.0e-5

ci:
vllm_deploy: true
recipe_owner: HuiyingLi
time: "00:20:00"
checkpoint_robustness:
Expand Down
2 changes: 2 additions & 0 deletions examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ ci:
recipe_owner: hemildesai
time: "00:15:00"
node_multiplier: true
vllm_deploy: true
vllm_smoke_test: true
checkpoint_robustness:
hf_kl_threshold: 5e-2
tokenizer_name: openai/gpt-oss-20b
Expand Down
2 changes: 2 additions & 0 deletions examples/llm_finetune/gpt_oss/gpt_oss_20b_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ optimizer:
# min_lr: 1.0e-5

ci:
vllm_deploy: true
vllm_smoke_test: true
recipe_owner: akoumpa
time: "00:15:00"
checkpoint_robustness:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,13 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

ci:
vllm_deploy: true
recipe_owner: akoumpa

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: akoumpa
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,13 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

ci:
recipe_owner: akoumpa
vllm_deploy: true

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: akoumpa
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ ci:
recipe_owner: akoumpa
nodes: 2
time: "00:45:00"
vllm_deploy: true
checkpoint_robustness:
hf_kl_threshold: 5e-3
distributed.tp_size: 8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,9 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
recipe_owner: HuiyingLi
time: "00:45:00"
vllm_deploy: true
recipe_owner: HuiyingLi
checkpoint_robustness:
hf_kl_threshold: 5e-3
trust_remote_code: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
vllm_deploy: true
recipe_owner: HuiyingLi
time: "00:15:00"
checkpoint_robustness:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
vllm_deploy: true
recipe_owner: HuiyingLi
time: "00:15:00"
checkpoint_robustness:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
vllm_deploy: true
vllm_smoke_test: true
recipe_owner: adil-a
time: "00:15:00"
checkpoint_robustness:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
vllm_deploy: true
vllm_smoke_test: true
recipe_owner: adil-a
time: "00:15:00"
checkpoint_robustness:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,11 @@ lr_scheduler:
# save_dir: <your_wandb_save_dir>

ci:
nodes: 4
vllm_deploy: true
vllm_smoke_test: true
recipe_owner: adil-a
time: "00:15:00"
nodes: 4
checkpoint_robustness:
hf_kl_threshold: 7e-2
experts_implementation: grouped_mm
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
vllm_deploy: true
vllm_smoke_test: true
recipe_owner: adil-a
time: "00:15:00"
checkpoint_robustness:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
vllm_deploy: true
recipe_owner: akoumpa
time: "00:15:00"
checkpoint_robustness:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ lr_scheduler:
min_lr: 1.0e-6

ci:
vllm_deploy: true
recipe_owner: akoumpa
time: "00:15:00"
checkpoint_robustness:
Expand Down
1 change: 1 addition & 0 deletions examples/llm_finetune/phi/phi_4_squad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ ci:
recipe_owner: hemildesai
time: "00:25:00"
node_multiplier: true
vllm_deploy: true
checkpoint_robustness:
hf_kl_threshold: 1.2e-3
tokenizer_name: microsoft/phi-4
Expand Down
1 change: 1 addition & 0 deletions examples/llm_finetune/phi/phi_4_squad_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ optimizer:
# min_lr: 1.0e-5

ci:
vllm_deploy: true
recipe_owner: HuiyingLi
time: "00:25:00"
checkpoint_robustness:
Expand Down
3 changes: 2 additions & 1 deletion examples/llm_finetune/qwen/qwen2_5_7b_squad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,9 @@ optimizer:
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: HuiyingLi
time: "00:45:00"
vllm_deploy: true
recipe_owner: HuiyingLi
checkpoint_robustness:
hf_kl_threshold: 9e-3
distributed.tp_size: 2
Expand Down
1 change: 1 addition & 0 deletions examples/llm_finetune/qwen/qwen2_5_7b_squad_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ optimizer:
# min_lr: 1.0e-5

ci:
vllm_deploy: true
recipe_owner: HuiyingLi
time: "00:25:00"
checkpoint_robustness:
Expand Down
2 changes: 2 additions & 0 deletions examples/llm_finetune/qwen/qwen3_moe_30b_te_deepep.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,5 @@ ci:
recipe_owner: hemildesai
node_multiplier: true
time: "00:15:00"
vllm_deploy: true
vllm_smoke_test: true
4 changes: 4 additions & 0 deletions tests/ci_tests/requirements_deploy.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
vllm==0.19.0
peft==0.18.1
pytest
pyyaml
54 changes: 54 additions & 0 deletions tests/ci_tests/scripts/vllm_launcher.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/bash
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Unified vLLM deployment test launcher.
# Determines SFT vs PEFT from CI_JOB_STAGE and passes --deploy_mode explicitly.
# Expects: CONFIG_PATH, TEST_NAME, PIPELINE_DIR, CI_JOB_STAGE
# NOTE: the shebang must be the very first line of the file to take effect;
# it previously sat below the license header, where it is just a comment.
set -xeuo pipefail

export PYTHONPATH=${PYTHONPATH:-}:$(pwd)
export CUDA_VISIBLE_DEVICES="0"

# Isolated venv so the deploy-time deps (vllm, peft) don't leak into the
# training environment.
cd /opt/Automodel
uv venv /tmp/vllm_deploy_venv
source /tmp/vllm_deploy_venv/bin/activate
uv pip install -r tests/ci_tests/requirements_deploy.txt

TEST_SCRIPT="tests/functional_tests/checkpoint_robustness/test_checkpoint_vllm_deploy.py"
# The deploy job name is "<finetune_test_name>_vllm_deploy"; strip the suffix
# to locate the upstream finetune job's checkpoint directory.
FINETUNE_TEST_NAME="${TEST_NAME%_vllm_deploy}"
CKPT_DIR="$PIPELINE_DIR/$FINETUNE_TEST_NAME/robustness_checkpoint"
# sort -V (version sort) so epoch_0_step_10 ranks after epoch_0_step_9;
# plain lexicographic sort would pick step_9 as the "latest" checkpoint.
CKPT_BASE=$(ls -d "${CKPT_DIR}"/epoch_*_step_* 2>/dev/null | sort -V | tail -1 || true)

if [[ -z "$CKPT_BASE" ]]; then
echo "ERROR: No checkpoint found under ${CKPT_DIR}"
echo "Contents of $PIPELINE_DIR/$FINETUNE_TEST_NAME/:"
ls -la "$PIPELINE_DIR/$FINETUNE_TEST_NAME/" 2>/dev/null || echo " Directory does not exist"
exit 1
fi
echo "Using checkpoint: ${CKPT_BASE}"

# PEFT checkpoints carry an adapter dir; SFT checkpoints are consolidated
# full-model weights. The stage name encodes which path we are on.
if [[ "$CI_JOB_STAGE" == *"peft"* ]]; then
python -m pytest "$TEST_SCRIPT" \
--deploy_mode peft \
--config_path "$CONFIG_PATH" \
--adapter_path "${CKPT_BASE}/model/" \
--max_new_tokens 50
else
python -m pytest "$TEST_SCRIPT" \
--deploy_mode sft \
--config_path "$CONFIG_PATH" \
--deploy_model_path "${CKPT_BASE}/model/consolidated/" \
--max_new_tokens 50
fi
20 changes: 18 additions & 2 deletions tests/ci_tests/utils/generate_ci_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,20 @@ def generate_job(config: str, config_override: Dict[str, Any], scope: str, test_
slurm_time = job['variables'].get('TIME', '00:10:00')
job['variables']['TIME'] = DQ(slurm_time_multiplier(slurm_time, 2))

return job
# Generate vLLM deploy job if recipe opts in
vllm_job = None
if ci_config.get('vllm_deploy'):
vllm_stage = 'peft_vllm_deploy' if 'peft' in config.stem else 'sft_vllm_deploy'
vllm_job = {
'extends': '.vllm_deploy_test',
'stage': vllm_stage,
'variables': {
'CONFIG_PATH': f'{config}',
'TEST_LEVEL': f'{scope}',
}
}

return job, vllm_job


def generate_pipeline(automodel_dir: str, scope: str, test_folder: str):
Expand Down Expand Up @@ -201,7 +214,10 @@ def generate_pipeline(automodel_dir: str, scope: str, test_folder: str):
if model_name in exempt_models_list or config_name in exempt_configs_list:
continue

pipeline[f'{config_name}'] = generate_job(config, config_override, scope, test_folder, automodel_dir)
job, vllm_job = generate_job(config, config_override, scope, test_folder, automodel_dir)
pipeline[f'{config_name}'] = job
if vllm_job:
pipeline[f'{config_name}_vllm_deploy'] = vllm_job

return pipeline

Expand Down
Loading
Loading