diff --git a/examples/diffusion/finetune/flux_t2i_flow.yaml b/examples/diffusion/finetune/flux_t2i_flow.yaml
index 4b14092fde..f0dcd0526c 100644
--- a/examples/diffusion/finetune/flux_t2i_flow.yaml
+++ b/examples/diffusion/finetune/flux_t2i_flow.yaml
@@ -77,3 +77,7 @@ dist_env:
   init_method: "env://"
 
 seed: 42
+
+ci:
+  recipe_owner: pthombre
+  time: "00:30:00"
diff --git a/examples/diffusion/finetune/hunyuan_t2v_flow.yaml b/examples/diffusion/finetune/hunyuan_t2v_flow.yaml
index 38949fae21..51039f8f2a 100644
--- a/examples/diffusion/finetune/hunyuan_t2v_flow.yaml
+++ b/examples/diffusion/finetune/hunyuan_t2v_flow.yaml
@@ -80,3 +80,7 @@ dist_env:
   init_method: "env://"
 
 seed: 42
+
+ci:
+  recipe_owner: pthombre
+  time: "01:30:00"
diff --git a/examples/diffusion/finetune/qwen_image_t2i_flow.yaml b/examples/diffusion/finetune/qwen_image_t2i_flow.yaml
new file mode 100644
index 0000000000..ae3d4ebb78
--- /dev/null
+++ b/examples/diffusion/finetune/qwen_image_t2i_flow.yaml
@@ -0,0 +1,84 @@
+model:
+  pretrained_model_name_or_path: "Qwen/Qwen-Image"
+  mode: "finetune"
+  cache_dir: null
+  attention_backend: "flash"
+
+optim:
+  learning_rate: 1e-5
+
+  optimizer:
+    weight_decay: 0.01
+    betas: [0.9, 0.999]
+
+#adjust dp_size to the total number of GPUs
+fsdp:
+  dp_size: 8
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  activation_checkpointing: false
+  cpu_offload: false
+
+flow_matching:
+  adapter_type: "qwen_image"
+  adapter_kwargs:
+    guidance_scale: 3.5
+    use_guidance_embeds: false
+  timestep_sampling: "logit_normal"
+  logit_mean: 0.0
+  logit_std: 1.0
+  flow_shift: 2.23
+  mix_uniform_ratio: 0.0
+  sigma_min: 0.02
+  sigma_max: 1.0
+  num_train_timesteps: 1000
+  i2v_prob: 0.0
+  use_loss_weighting: true
+  loss_weighting_scheme: "bsmntw"
+  log_interval: 100
+  summary_log_interval: 10
+
+step_scheduler:
+  num_epochs: 10
+  local_batch_size: 1
+  global_batch_size: 8
+  ckpt_every_steps: 500
+  save_checkpoint_every_epoch: false
+  log_every: 1
+  # max_steps: null # Set to limit training to a specific number of steps
+
+data:
+  dataloader:
+    _target_: nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader
+    cache_dir: PATH_TO_YOUR_DATA
+    train_text_encoder: false
+    num_workers: 2
+    # Supported resolutions include [256x256], [512x512], and [1024x1024].
+    base_resolution: [512, 512]
+    dynamic_batch_size: false
+    shuffle: true
+    drop_last: false
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: PATH_TO_YOUR_CKPT_DIR
+  model_save_format: safetensors
+  save_consolidated: true
+  diffusers_compatible: true
+  restore_from: null
+
+wandb:
+  project: qwen-image-finetuning
+  mode: online
+  name: qwen_image_finetune_run_1
+
+dist_env:
+  backend: "nccl"
+  init_method: "env://"
+
+seed: 42
+
+ci:
+  recipe_owner: pthombre
+  time: "00:30:00"
diff --git a/examples/diffusion/finetune/wan2_1_t2v_flow.yaml b/examples/diffusion/finetune/wan2_1_t2v_flow.yaml
index f8e21bf190..6b86a9eca2 100644
--- a/examples/diffusion/finetune/wan2_1_t2v_flow.yaml
+++ b/examples/diffusion/finetune/wan2_1_t2v_flow.yaml
@@ -73,3 +73,7 @@ checkpoint:
   save_consolidated: true
   diffusers_compatible: true
   restore_from: null
+
+ci:
+  recipe_owner: pthombre
+  time: "00:30:00"
diff --git a/examples/diffusion/generate/generate.py b/examples/diffusion/generate/generate.py
index cfa34db0f9..5b94164ec5 100644
--- a/examples/diffusion/generate/generate.py
+++ b/examples/diffusion/generate/generate.py
@@ -53,6 +53,7 @@
 # Pipeline class name -> output type mapping
 _PIPELINE_OUTPUT_TYPES = {
     "FluxPipeline": "image",
+    "QwenImagePipeline": "image",
     "WanPipeline": "video",
     "HunyuanVideoPipeline": "video",
     "HunyuanVideo15Pipeline": "video",
diff --git a/tests/ci_tests/configs/diffusion_finetune/nightly_recipes.yml b/tests/ci_tests/configs/diffusion_finetune/nightly_recipes.yml
new file mode 100644
index 0000000000..9c89f637e5
--- /dev/null
+++ b/tests/ci_tests/configs/diffusion_finetune/nightly_recipes.yml
@@ -0,0 +1,20 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+examples_dir: diffusion/finetune
+configs:
+  - wan2_1_t2v_flow.yaml
+  - hunyuan_t2v_flow.yaml
+  - flux_t2i_flow.yaml
+  - qwen_image_t2i_flow.yaml
diff --git a/tests/ci_tests/configs/diffusion_finetune/override_recipes.yml b/tests/ci_tests/configs/diffusion_finetune/override_recipes.yml
new file mode 100644
index 0000000000..0d29fff83f
--- /dev/null
+++ b/tests/ci_tests/configs/diffusion_finetune/override_recipes.yml
@@ -0,0 +1,20 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exempt_models:
+
+exempt_configs:
+
+known_issue:
+
diff --git a/tests/ci_tests/scripts/diffusion_finetune_launcher.sh b/tests/ci_tests/scripts/diffusion_finetune_launcher.sh
new file mode 100755
index 0000000000..10a6d43ecf
--- /dev/null
+++ b/tests/ci_tests/scripts/diffusion_finetune_launcher.sh
@@ -0,0 +1,193 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Environment variables expected from CI template:
+#   CONFIG_PATH, TEST_LEVEL, NPROC_PER_NODE, TEST_NODE_COUNT,
+#   MASTER_ADDR, MASTER_PORT, SLURM_JOB_ID, PIPELINE_DIR, TEST_NAME
+
+DATA_DIR="$PIPELINE_DIR/$TEST_NAME/data"
+CKPT_DIR="$PIPELINE_DIR/$TEST_NAME/checkpoint"
+INFER_DIR="$PIPELINE_DIR/$TEST_NAME/inference_output"
+
+cd /opt/Automodel
+
+# ============================================
+# Derive model-specific settings from config
+# ============================================
+RECIPE_NAME=$(basename "$CONFIG_PATH" .yaml)
+case "$RECIPE_NAME" in
+    wan2_1_t2v_flow*)
+        MEDIA_TYPE="video"
+        PROCESSOR="wan"
+        GENERATE_CONFIG="examples/diffusion/generate/configs/generate_wan.yaml"
+        MODEL_NAME="Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
+        INFER_NUM_FRAMES=9
+        PREPROCESS_EXTRA_ARGS=""
+        ;;
+    hunyuan_t2v_flow*)
+        MEDIA_TYPE="video"
+        PROCESSOR="hunyuan"
+        GENERATE_CONFIG="examples/diffusion/generate/configs/generate_hunyuan.yaml"
+        MODEL_NAME="hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v"
+        INFER_NUM_FRAMES=5
+        PREPROCESS_EXTRA_ARGS="--target_frames 13"
+        ;;
+    flux_t2i_flow*)
+        MEDIA_TYPE="image"
+        PROCESSOR="flux"
+        GENERATE_CONFIG="examples/diffusion/generate/configs/generate_flux.yaml"
+        MODEL_NAME="black-forest-labs/FLUX.1-dev"
+        PREPROCESS_EXTRA_ARGS=""
+        ;;
+    qwen_image_t2i_flow*)
+        MEDIA_TYPE="image"
+        PROCESSOR="qwen_image"
+        GENERATE_CONFIG="examples/diffusion/generate/configs/generate_qwen_image.yaml"
+        MODEL_NAME="Qwen/Qwen-Image"
+        PREPROCESS_EXTRA_ARGS=""
+        ;;
+    *)
+        echo "ERROR: Unknown recipe '$RECIPE_NAME'. Add a case to diffusion_finetune_launcher.sh."
+        exit 1
+        ;;
+esac
+echo "[config] Recipe=$RECIPE_NAME MediaType=$MEDIA_TYPE Processor=$PROCESSOR Model=$MODEL_NAME"
+
+# ============================================
+# Stage 1: Download dataset
+# ============================================
+echo "============================================"
+echo "[data] Downloading dataset..."
+echo "============================================"
+if [ "$MEDIA_TYPE" = "image" ]; then
+    uv run --extra diffusion python -c "
+from datasets import load_dataset
+from pathlib import Path
+import json
+
+ds = load_dataset('diffusers/tuxemon', split='train')
+out_dir = Path('$DATA_DIR/raw')
+out_dir.mkdir(parents=True, exist_ok=True)
+
+jsonl_entries = []
+for i, row in enumerate(ds):
+    fname = f'tuxemon_sample_{i:04d}.png'
+    row['image'].save(out_dir / fname)
+    jsonl_entries.append({'file_name': fname, 'internvl': row['gpt4_turbo_caption']})
+
+jsonl_path = out_dir / 'tuxemon_internvl.json'
+with open(jsonl_path, 'w') as jf:
+    for entry in jsonl_entries:
+        jf.write(json.dumps(entry) + '\n')
+
+print(f'Extracted {len(ds)} images to {out_dir}')
+"
+else
+    uv run --extra diffusion python -c "
+from huggingface_hub import snapshot_download
+snapshot_download('modal-labs/dissolve', repo_type='dataset', local_dir='$DATA_DIR/raw')
+print('Dataset downloaded successfully')
+"
+fi
+
+# ============================================
+# Stage 2: Preprocess to latents
+# ============================================
+echo "============================================"
+echo "[preprocess] Converting ${MEDIA_TYPE}s to latents..."
+echo "============================================"
+if [ "$MEDIA_TYPE" = "image" ]; then
+    uv run --extra diffusion python -m tools.diffusion.preprocessing_multiprocess image \
+        --image_dir "$DATA_DIR/raw" \
+        --output_dir "$DATA_DIR/cache" \
+        --processor "$PROCESSOR" \
+        $PREPROCESS_EXTRA_ARGS
+else
+    uv run --extra diffusion python -m tools.diffusion.preprocessing_multiprocess video \
+        --video_dir "$DATA_DIR/raw" \
+        --output_dir "$DATA_DIR/cache" \
+        --processor "$PROCESSOR" \
+        --resolution_preset 512p \
+        --caption_format sidecar \
+        $PREPROCESS_EXTRA_ARGS
+fi
+
+# ============================================
+# Stage 3: Finetune
+# ============================================
+echo "============================================"
+echo "[finetune] Running finetuning..."
+echo "============================================"
+CONFIG="--config /opt/Automodel/${CONFIG_PATH} \
+    --data.dataloader.cache_dir $DATA_DIR/cache \
+    --checkpoint.checkpoint_dir $CKPT_DIR \
+    --step_scheduler.max_steps ${MAX_STEPS:-100} \
+    --step_scheduler.ckpt_every_steps 100 \
+    --step_scheduler.save_checkpoint_every_epoch false \
+    --fsdp.dp_size ${NPROC_PER_NODE} \
+    --wandb.mode disabled"
+
+CMD="uv run --extra diffusion torchrun --nproc-per-node=${NPROC_PER_NODE} \
+    --nnodes=${TEST_NODE_COUNT} \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
+    --rdzv_id=${SLURM_JOB_ID}"
+
+eval $CMD examples/diffusion/finetune/finetune.py $CONFIG
+
+# ============================================
+# Stage 4: Inference smoke test
+# ============================================
+echo "============================================"
+echo "[inference] Running inference smoke test..."
+echo "============================================"
+# Pick the newest epoch_*_step_* dir. Use version sort (-V): a field-based
+# numeric sort (-t_ -k4 -n) breaks when $CKPT_DIR itself contains underscores
+# (e.g. TEST_NAME like wan2_1_t2v_flow shifts the field positions).
+CKPT_STEP_DIR=$(ls -d "$CKPT_DIR"/epoch_*_step_* | sort -V | tail -1)
+
+if [ "$MEDIA_TYPE" = "image" ]; then
+    uv run --extra diffusion python examples/diffusion/generate/generate.py \
+        --config "$GENERATE_CONFIG" \
+        --model.pretrained_model_name_or_path "$MODEL_NAME" \
+        --model.checkpoint "$CKPT_STEP_DIR" \
+        --inference.num_inference_steps 5 \
+        --output.output_dir "$INFER_DIR" \
+        --vae.enable_slicing true \
+        --vae.enable_tiling true
+
+    if ls $INFER_DIR/sample_*.png 1>/dev/null 2>&1; then
+        echo "[inference] SUCCESS: Output image(s) generated"
+    else
+        echo "[inference] FAILURE: No output images found"
+        exit 1
+    fi
+else
+    uv run --extra diffusion python examples/diffusion/generate/generate.py \
+        --config "$GENERATE_CONFIG" \
+        --model.pretrained_model_name_or_path "$MODEL_NAME" \
+        --model.checkpoint "$CKPT_STEP_DIR" \
+        --inference.num_inference_steps 5 \
+        --inference.pipeline_kwargs.num_frames "$INFER_NUM_FRAMES" \
+        --output.output_dir "$INFER_DIR" \
+        --vae.enable_slicing true \
+        --vae.enable_tiling true
+
+    if ls $INFER_DIR/sample_*.mp4 1>/dev/null 2>&1; then
+        echo "[inference] SUCCESS: Output video(s) generated"
+    else
+        echo "[inference] FAILURE: No output videos found"
+        exit 1
+    fi
+fi
diff --git a/tests/ci_tests/utils/generate_ci_tests.py b/tests/ci_tests/utils/generate_ci_tests.py
index 3773495886..6cc0f5d12d 100644
--- a/tests/ci_tests/utils/generate_ci_tests.py
+++ b/tests/ci_tests/utils/generate_ci_tests.py
@@ -91,7 +91,8 @@ def detect_yml_configurations(automodel_dir: str, scope: str, test_folder: str):
     config_path = f"{automodel_dir}/tests/ci_tests/configs/{test_folder}/{scope}_recipes.yml"
     with open(config_path, "r", encoding="utf-8") as f:
         test_configs = yaml.load(f)
-    yml_configs = [Path(f"examples/{test_folder}/{c}") for c in test_configs['configs']]
+    examples_dir = test_configs.get("examples_dir", test_folder)
+    yml_configs = [Path(f"examples/{examples_dir}/{c}") for c in test_configs["configs"]]
 
     return yml_configs
 
@@ -113,41 +114,41 @@ def generate_job(config: str, config_override: Dict[str, Any], scope: str, test_
     """
 
     # Initialize test job
     job = {
-        'variables': {
-            'CONFIG_PATH': f'{config}',
-            'TEST_LEVEL': f'{scope}',
+        "variables": {
+            "CONFIG_PATH": f"{config}",
+            "TEST_LEVEL": f"{scope}",
         }
     }
 
     # Configure test template
-    if 'benchmark' in config.stem:
-        job['extends'] = '.llm_benchmark_test'
+    if "benchmark" in config.stem:
+        job["extends"] = ".llm_benchmark_test"
     else:
-        job['extends'] = f'.{test_folder}_test'
+        job["extends"] = f".{test_folder}_test"
 
     # Apply resource overrides (time, nodes, etc.) from the recipe's top-level ci: section
     recipe_path = f"{automodel_dir}/{config}"
     with open(recipe_path, "r", encoding="utf-8") as rf:
         recipe = yaml.load(rf)
-    ci_config = recipe.get('ci') or {}
+    ci_config = recipe.get("ci") or {}
     ci_key_map = {
-        'time': 'TIME',
-        'nodes': 'TEST_NODE_COUNT',
-        'node_multiplier': 'NODE_MULTIPLIER',
-        'local_batch_size': 'LOCAL_BATCH_SIZE',
-        'recipe_owner': 'RECIPE_OWNER',
-        'nproc_per_node': 'CONFIG_NPROC_PER_NODE',
+        "time": "TIME",
+        "nodes": "TEST_NODE_COUNT",
+        "node_multiplier": "NODE_MULTIPLIER",
+        "local_batch_size": "LOCAL_BATCH_SIZE",
+        "recipe_owner": "RECIPE_OWNER",
+        "nproc_per_node": "CONFIG_NPROC_PER_NODE",
    }
     for ci_key, ci_var in ci_key_map.items():
         if ci_key in ci_config:
             value = ci_config[ci_key]
-            if ci_var == 'TIME':
-                job['variables'][ci_var] = DQ(str(value))
-            elif ci_var == 'NODE_MULTIPLIER':
-                job['variables'][ci_var] = str(value).lower()
+            if ci_var == "TIME":
+                job["variables"][ci_var] = DQ(str(value))
+            elif ci_var == "NODE_MULTIPLIER":
+                job["variables"][ci_var] = str(value).lower()
             else:
-                job['variables'][ci_var] = value
+                job["variables"][ci_var] = value
 
     # Pass through env_vars as CI variables (exported to container via --export=ALL)
     for key, value in ci_config.get('env_vars', {}).items():
@@ -157,36 +158,38 @@ def generate_job(config: str, config_override: Dict[str, Any], scope: str, test_
     job['variables']['HAS_ROBUSTNESS'] = str(has_robustness).lower()
 
     # Configure test stage based on recipe type and robustness config
-    if 'benchmark' in test_folder:
-        job['stage'] = 'performance'
-    elif 'benchmark' in config.stem:
-        job['stage'] = 'benchmark'
-    elif 'peft' in config.stem:
-        job['stage'] = 'peft_ckpt_robustness' if has_robustness else 'peft'
+    if "benchmark" in test_folder:
+        job["stage"] = "performance"
+    elif "benchmark" in config.stem:
+        job["stage"] = "benchmark"
+    elif test_folder.startswith("diffusion"):
+        job["stage"] = "diffusion_sft"
+    elif "peft" in config.stem:
+        job["stage"] = "peft_ckpt_robustness" if has_robustness else "peft"
     else:
-        job['stage'] = 'sft_ckpt_robustness' if has_robustness else 'sft'
+        job["stage"] = "sft_ckpt_robustness" if has_robustness else "sft"
 
     # Check if config has known issue
-    known_issue_config_list = config_override.get('known_issue') or []
+    known_issue_config_list = config_override.get("known_issue") or []
     if config.stem in known_issue_config_list:
-        job['allow_failure'] = True
+        job["allow_failure"] = True
 
     # Double time allocation as tests run for 2 epoch
     if scope == "convergence":
-        slurm_time = job['variables'].get('TIME', '00:10:00')
-        job['variables']['TIME'] = DQ(slurm_time_multiplier(slurm_time, 2))
+        slurm_time = job["variables"].get("TIME", "00:10:00")
+        job["variables"]["TIME"] = DQ(slurm_time_multiplier(slurm_time, 2))
 
     # Generate vLLM deploy job if recipe opts in
     vllm_job = None
-    if ci_config.get('vllm_deploy'):
-        vllm_stage = 'peft_vllm_deploy' if 'peft' in config.stem else 'sft_vllm_deploy'
+    if ci_config.get("vllm_deploy"):
+        vllm_stage = "peft_vllm_deploy" if "peft" in config.stem else "sft_vllm_deploy"
         vllm_job = {
-            'extends': '.vllm_deploy_test',
-            'stage': vllm_stage,
-            'variables': {
-                'CONFIG_PATH': f'{config}',
-                'TEST_LEVEL': f'{scope}',
-            }
+            "extends": ".vllm_deploy_test",
+            "stage": vllm_stage,
+            "variables": {
+                "CONFIG_PATH": f"{config}",
+                "TEST_LEVEL": f"{scope}",
+            },
         }
 
     return job, vllm_job
@@ -215,45 +218,43 @@ def generate_pipeline(automodel_dir: str, scope: str, test_folder: str):
     yml_configs = detect_yml_configurations(automodel_dir, scope, test_folder)
 
     if not yml_configs:
-        raise Exception(f'No yml configurations were found under {automodel_dir}/examples/{test_folder}')
+        raise Exception(f"No yml configurations were found under {automodel_dir}/examples/{test_folder}")
 
-    pipeline = {
-        'include': ['automodel/automodel_ci_template.yml']
-    }
+    pipeline = {"include": ["automodel/automodel_ci_template.yml"]}
 
     for config in yml_configs:
         model_name = config.parent.name
         config_name = config.stem
 
         # Check if model is in exempt model list
-        exempt_models_list = config_override.get('exempt_models') or []
-        exempt_configs_list = config_override.get('exempt_configs') or []
+        exempt_models_list = config_override.get("exempt_models") or []
+        exempt_configs_list = config_override.get("exempt_configs") or []
         if model_name in exempt_models_list or config_name in exempt_configs_list:
             continue
 
         job, vllm_job = generate_job(config, config_override, scope, test_folder, automodel_dir)
-        pipeline[f'{config_name}'] = job
+        pipeline[f"{config_name}"] = job
         if vllm_job:
-            pipeline[f'{config_name}_vllm_deploy'] = vllm_job
+            pipeline[f"{config_name}_vllm_deploy"] = vllm_job
 
     return pipeline
 
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--automodel-dir', type=str, required=True, help='Path to Automodel directory')
-    parser.add_argument('--scope', type=str, required=True, help='Scope of the tests (nightly, release)')
-    parser.add_argument('--test-folder', type=str, required=True, help='Target folder to search')
+    parser.add_argument("--automodel-dir", type=str, required=True, help="Path to Automodel directory")
+    parser.add_argument("--scope", type=str, required=True, help="Scope of the tests (nightly, release)")
+    parser.add_argument("--test-folder", type=str, required=True, help="Target folder to search")
 
     args = parser.parse_args()
     pipeline = generate_pipeline(args.automodel_dir, args.scope, args.test_folder)
 
     if pipeline:
-        with open(f'generated_automodel_{args.test_folder}_tests.yml', 'w') as f:
+        with open(f"generated_automodel_{args.test_folder}_tests.yml", "w") as f:
             yaml.dump(pipeline, f)
         print(f"Generated pipeline with {len([k for k in pipeline.keys() if k != 'stages'])} jobs")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()