4 changes: 4 additions & 0 deletions examples/diffusion/finetune/flux_t2i_flow.yaml
@@ -77,3 +77,7 @@ dist_env:
  init_method: "env://"

seed: 42

ci:
  recipe_owner: pthombre
  time: "00:30:00"
4 changes: 4 additions & 0 deletions examples/diffusion/finetune/hunyuan_t2v_flow.yaml
@@ -80,3 +80,7 @@ dist_env:
  init_method: "env://"

seed: 42

ci:
  recipe_owner: pthombre
  time: "01:30:00"
84 changes: 84 additions & 0 deletions examples/diffusion/finetune/qwen_image_t2i_flow.yaml
@@ -0,0 +1,84 @@
model:
  pretrained_model_name_or_path: "Qwen/Qwen-Image"
  mode: "finetune"
  cache_dir: null
  attention_backend: "flash"

optim:
  learning_rate: 1e-5

optimizer:
  weight_decay: 0.01
  betas: [0.9, 0.999]

# Adjust dp_size to the total number of GPUs.
fsdp:
  dp_size: 8
  tp_size: 1
  cp_size: 1
  pp_size: 1
  activation_checkpointing: false
  cpu_offload: false

flow_matching:
  adapter_type: "qwen_image"
  adapter_kwargs:
    guidance_scale: 3.5
    use_guidance_embeds: false
  timestep_sampling: "logit_normal"
  logit_mean: 0.0
  logit_std: 1.0
  flow_shift: 2.23
  mix_uniform_ratio: 0.0
  sigma_min: 0.02
  sigma_max: 1.0
  num_train_timesteps: 1000
  i2v_prob: 0.0
  use_loss_weighting: true
  loss_weighting_scheme: "bsmntw"
  log_interval: 100
  summary_log_interval: 10

step_scheduler:
  num_epochs: 10
  local_batch_size: 1
  global_batch_size: 8
  ckpt_every_steps: 500
  save_checkpoint_every_epoch: false
  log_every: 1
  # max_steps: null # Set to limit training to a specific number of steps

data:
  dataloader:
    _target_: nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader
    cache_dir: PATH_TO_YOUR_DATA
    train_text_encoder: false
    num_workers: 2
    # Supported resolutions include [256x256], [512x512], and [1024x1024].
    base_resolution: [512, 512]
    dynamic_batch_size: false
    shuffle: true
    drop_last: false

checkpoint:
  enabled: true
  checkpoint_dir: PATH_TO_YOUR_CKPT_DIR
  model_save_format: safetensors
  save_consolidated: true
  diffusers_compatible: true
  restore_from: null

wandb:
  project: qwen-image-finetuning
  mode: online
  name: qwen_image_finetune_run_1

dist_env:
  backend: "nccl"
  init_method: "env://"

seed: 42

ci:
  recipe_owner: pthombre
  time: "00:30:00"
4 changes: 4 additions & 0 deletions examples/diffusion/finetune/wan2_1_t2v_flow.yaml
@@ -73,3 +73,7 @@ checkpoint:
  save_consolidated: true
  diffusers_compatible: true
  restore_from: null

ci:
  recipe_owner: pthombre
  time: "00:30:00"
1 change: 1 addition & 0 deletions examples/diffusion/generate/generate.py
@@ -53,6 +53,7 @@
# Pipeline class name -> output type mapping
_PIPELINE_OUTPUT_TYPES = {
    "FluxPipeline": "image",
    "QwenImagePipeline": "image",
    "WanPipeline": "video",
    "HunyuanVideoPipeline": "video",
    "HunyuanVideo15Pipeline": "video",
20 changes: 20 additions & 0 deletions tests/ci_tests/configs/diffusion_finetune/nightly_recipes.yml
@@ -0,0 +1,20 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

examples_dir: diffusion/finetune
configs:
- wan2_1_t2v_flow.yaml
- hunyuan_t2v_flow.yaml
- flux_t2i_flow.yaml
- qwen_image_t2i_flow.yaml
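The harness that consumes this file is not part of the diff; presumably it joins examples_dir with each entry to build the CONFIG_PATH handed to the launcher. An illustrative sketch of that resolution step (the variable names here are hypothetical, not the harness's own):

# Hypothetical harness step: resolve each nightly recipe to a CONFIG_PATH.
EXAMPLES_DIR="examples/diffusion/finetune"
for cfg in wan2_1_t2v_flow.yaml hunyuan_t2v_flow.yaml flux_t2i_flow.yaml qwen_image_t2i_flow.yaml; do
    CONFIG_PATH="${EXAMPLES_DIR}/${cfg}"
    echo "launching nightly recipe: ${CONFIG_PATH}"
done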
20 changes: 20 additions & 0 deletions tests/ci_tests/configs/diffusion_finetune/override_recipes.yml
@@ -0,0 +1,20 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

exempt_models:

exempt_configs:

known_issue:

192 changes: 192 additions & 0 deletions tests/ci_tests/scripts/diffusion_finetune_launcher.sh
@@ -0,0 +1,192 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -euo pipefail

# Environment variables expected from CI template:
# CONFIG_PATH, TEST_LEVEL, NPROC_PER_NODE, TEST_NODE_COUNT,
# MASTER_ADDR, MASTER_PORT, SLURM_JOB_ID, PIPELINE_DIR, TEST_NAME

DATA_DIR="$PIPELINE_DIR/$TEST_NAME/data"
CKPT_DIR="$PIPELINE_DIR/$TEST_NAME/checkpoint"
INFER_DIR="$PIPELINE_DIR/$TEST_NAME/inference_output"

cd /opt/Automodel

# ============================================
# Derive model-specific settings from config
# ============================================
RECIPE_NAME=$(basename "$CONFIG_PATH" .yaml)
case "$RECIPE_NAME" in
wan2_1_t2v_flow*)
MEDIA_TYPE="video"
PROCESSOR="wan"
GENERATE_CONFIG="examples/diffusion/generate/configs/generate_wan.yaml"
MODEL_NAME="Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
INFER_NUM_FRAMES=9
PREPROCESS_EXTRA_ARGS=""
;;
hunyuan_t2v_flow*)
MEDIA_TYPE="video"
PROCESSOR="hunyuan"
GENERATE_CONFIG="examples/diffusion/generate/configs/generate_hunyuan.yaml"
MODEL_NAME="hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v"
INFER_NUM_FRAMES=5
PREPROCESS_EXTRA_ARGS="--target_frames 13"
;;
flux_t2i_flow*)
MEDIA_TYPE="image"
PROCESSOR="flux"
GENERATE_CONFIG="examples/diffusion/generate/configs/generate_flux.yaml"
MODEL_NAME="black-forest-labs/FLUX.1-dev"
PREPROCESS_EXTRA_ARGS=""
;;
qwen_image_t2i_flow*)
MEDIA_TYPE="image"
PROCESSOR="qwen_image"
GENERATE_CONFIG="examples/diffusion/generate/configs/generate_qwen_image.yaml"
MODEL_NAME="Qwen/Qwen-Image"
PREPROCESS_EXTRA_ARGS=""
;;
*)
echo "ERROR: Unknown recipe '$RECIPE_NAME'. Add a case to diffusion_finetune_launcher.sh."
exit 1
;;
esac
echo "[config] Recipe=$RECIPE_NAME MediaType=$MEDIA_TYPE Processor=$PROCESSOR Model=$MODEL_NAME"

# ============================================
# Stage 1: Download dataset
# ============================================
echo "============================================"
echo "[data] Downloading dataset..."
echo "============================================"
if [ "$MEDIA_TYPE" = "image" ]; then
uv run --extra diffusion python -c "
from datasets import load_dataset
from pathlib import Path
import json

ds = load_dataset('diffusers/tuxemon', split='train')
out_dir = Path('$DATA_DIR/raw')
out_dir.mkdir(parents=True, exist_ok=True)

jsonl_entries = []
for i, row in enumerate(ds):
fname = f'tuxemon_sample_{i:04d}.png'
row['image'].save(out_dir / fname)
jsonl_entries.append({'file_name': fname, 'internvl': row['gpt4_turbo_caption']})

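# The caption file is written as JSONL (one JSON object per line) despite its .json extension.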
jsonl_path = out_dir / 'tuxemon_internvl.json'
with open(jsonl_path, 'w') as jf:
    for entry in jsonl_entries:
        jf.write(json.dumps(entry) + '\n')

print(f'Extracted {len(ds)} images to {out_dir}')
"
else
    uv run --extra diffusion python -c "
from huggingface_hub import snapshot_download
snapshot_download('modal-labs/dissolve', repo_type='dataset', local_dir='$DATA_DIR/raw')
print('Dataset downloaded successfully')
"
fi

# ============================================
# Stage 2: Preprocess to latents
# ============================================
echo "============================================"
echo "[preprocess] Converting ${MEDIA_TYPE}s to latents..."
echo "============================================"
if [ "$MEDIA_TYPE" = "image" ]; then
uv run --extra diffusion python -m tools.diffusion.preprocessing_multiprocess image \
--image_dir "$DATA_DIR/raw" \
--output_dir "$DATA_DIR/cache" \
--processor "$PROCESSOR" \
$PREPROCESS_EXTRA_ARGS
else
uv run --extra diffusion python -m tools.diffusion.preprocessing_multiprocess video \
--video_dir "$DATA_DIR/raw" \
--output_dir "$DATA_DIR/cache" \
--processor "$PROCESSOR" \
--resolution_preset 512p \
--caption_format sidecar \
$PREPROCESS_EXTRA_ARGS
fi

# ============================================
# Stage 3: Finetune
# ============================================
echo "============================================"
echo "[finetune] Running finetuning..."
echo "============================================"
CONFIG="--config /opt/Automodel/${CONFIG_PATH} \
--data.dataloader.cache_dir $DATA_DIR/cache \
--checkpoint.checkpoint_dir $CKPT_DIR \
--step_scheduler.max_steps ${MAX_STEPS:-100} \
--step_scheduler.ckpt_every_steps 100 \
--step_scheduler.save_checkpoint_every_epoch false \
--fsdp.dp_size ${NPROC_PER_NODE} \
--wandb.mode disabled"

CMD="uv run --extra diffusion torchrun --nproc-per-node=${NPROC_PER_NODE} \
--nnodes=${TEST_NODE_COUNT} \
--rdzv_backend=c10d \
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
--rdzv_id=${SLURM_JOB_ID}"

eval $CMD examples/diffusion/finetune/finetune.py $CONFIG

# ============================================
# Stage 4: Inference smoke test
# ============================================
echo "============================================"
echo "[inference] Running inference smoke test..."
echo "============================================"
CKPT_STEP_DIR=$(ls -d "$CKPT_DIR"/epoch_*_step_* | sort -V | tail -1)

if [ "$MEDIA_TYPE" = "image" ]; then
uv run --extra diffusion python examples/diffusion/generate/generate.py \
--config "$GENERATE_CONFIG" \
--model.pretrained_model_name_or_path "$MODEL_NAME" \
--model.checkpoint "$CKPT_STEP_DIR" \
--inference.num_inference_steps 5 \
--output.output_dir "$INFER_DIR" \
--vae.enable_slicing true \
--vae.enable_tiling true

if ls $INFER_DIR/sample_*.png 1>/dev/null 2>&1; then
echo "[inference] SUCCESS: Output image(s) generated"
else
echo "[inference] FAILURE: No output images found"
exit 1
fi
else
uv run --extra diffusion python examples/diffusion/generate/generate.py \
--config "$GENERATE_CONFIG" \
--model.pretrained_model_name_or_path "$MODEL_NAME" \
--model.checkpoint "$CKPT_STEP_DIR" \
--inference.num_inference_steps 5 \
--inference.pipeline_kwargs.num_frames "$INFER_NUM_FRAMES" \
--output.output_dir "$INFER_DIR" \
--vae.enable_slicing true \
--vae.enable_tiling true

if ls $INFER_DIR/sample_*.mp4 1>/dev/null 2>&1; then
echo "[inference] SUCCESS: Output video(s) generated"
else
echo "[inference] FAILURE: No output videos found"
exit 1
fi
fi
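For local debugging, the launcher can be driven by exporting the variables the CI template normally provides. A sketch with illustrative values (none of these are taken from the template itself):

# Hypothetical standalone invocation; in CI these variables come from the template.
export CONFIG_PATH=examples/diffusion/finetune/qwen_image_t2i_flow.yaml
export TEST_LEVEL=nightly TEST_NAME=qwen_image_t2i_flow
export NPROC_PER_NODE=8 TEST_NODE_COUNT=1
export MASTER_ADDR=localhost MASTER_PORT=29500
export SLURM_JOB_ID=0 PIPELINE_DIR=/tmp/diffusion_ci
bash tests/ci_tests/scripts/diffusion_finetune_launcher.sh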