Skip to content

Commit 53b29fd

Browse files
committed
Merge branch 'main' into main
Signed-off-by: hychiang <kenny5312012@gmail.com>
2 parents fc7cb41 + 6d9773b commit 53b29fd

47 files changed

Lines changed: 4709 additions & 290 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/example_tests.yml

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -56,18 +56,21 @@ jobs:
5656
match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
5757
delay: 300s
5858

59-
##### PyTorch Example Tests #####
59+
##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
6060
torch-pr:
6161
needs: [check-file-changes, wait-checks]
6262
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
6363
strategy:
6464
fail-fast: false
6565
matrix:
6666
example: [llm_distill, llm_qat, llm_sparsity]
67+
include:
68+
- example: speculative_decoding
69+
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
6770
uses: ./.github/workflows/_example_tests_runner.yml
6871
secrets: inherit
6972
with:
70-
docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
73+
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
7174
example: ${{ matrix.example }}
7275
pip_install_extras: "[hf,dev-test]"
7376
runner: linux-amd64-gpu-l4-latest-1
@@ -78,36 +81,17 @@ jobs:
7881
fail-fast: false
7982
matrix:
8083
example: [llm_distill, llm_qat, llm_sparsity]
84+
include:
85+
- example: speculative_decoding
86+
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
8187
uses: ./.github/workflows/_example_tests_runner.yml
8288
secrets: inherit
8389
with:
84-
docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
90+
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
8591
example: ${{ matrix.example }}
8692
pip_install_extras: "[hf,dev-test]"
8793
runner: linux-amd64-gpu-h100-latest-2
8894

89-
##### Speculative Decoding Example Tests (requires 26.01 image) #####
90-
speculative-decoding-pr:
91-
needs: [check-file-changes, wait-checks]
92-
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
93-
uses: ./.github/workflows/_example_tests_runner.yml
94-
secrets: inherit
95-
with:
96-
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
97-
example: speculative_decoding
98-
pip_install_extras: "[hf,dev-test]"
99-
runner: linux-amd64-gpu-l4-latest-1
100-
101-
speculative-decoding-non-pr:
102-
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
103-
uses: ./.github/workflows/_example_tests_runner.yml
104-
secrets: inherit
105-
with:
106-
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
107-
example: speculative_decoding
108-
pip_install_extras: "[hf,dev-test]"
109-
runner: linux-amd64-gpu-h100-latest-2
110-
11195
##### TensorRT-LLM Example Tests #####
11296
trtllm-pr:
11397
needs: [check-file-changes, wait-checks]
@@ -172,15 +156,14 @@ jobs:
172156
example-pr-required-check:
173157
# Run even if example tests are skipped
174158
if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
175-
needs: [check-file-changes, torch-pr, speculative-decoding-pr, trtllm-pr, onnx-pr]
159+
needs: [check-file-changes, torch-pr, trtllm-pr, onnx-pr]
176160
runs-on: ubuntu-latest
177161
steps:
178162
- name: Required GPU tests did not succeed
179163
if: |
180164
needs.check-file-changes.result != 'success' ||
181165
(needs.check-file-changes.outputs.any_changed == 'true' && (
182166
needs.torch-pr.result != 'success' ||
183-
needs.speculative-decoding-pr.result != 'success' ||
184167
needs.trtllm-pr.result != 'success' ||
185168
needs.onnx-pr.result != 'success'
186169
))

.github/workflows/gpu_tests.yml

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,16 @@ jobs:
5959
gpu-tests-pr:
6060
needs: [check-file-changes, wait-checks]
6161
if: needs.check-file-changes.outputs.any_changed == 'true'
62+
strategy:
63+
fail-fast: false
64+
matrix:
65+
include:
66+
- example: py312-cuda12-gpu
67+
timeout: 90
68+
- example: py312-cuda12-gpu-megatron
69+
timeout: 120
6270
runs-on: linux-amd64-gpu-l4-latest-1
63-
timeout-minutes: 120
71+
timeout-minutes: ${{ matrix.timeout }}
6472
container: &gpu_container
6573
image: nvcr.io/nvidia/pytorch:25.06-py3
6674
env:
@@ -74,11 +82,19 @@ jobs:
7482
run: |
7583
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
7684
- name: Run gpu tests
77-
run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
85+
run: pip install tox-current-env && tox -e ${{ matrix.example }} --current-env
7886
gpu-tests-non-pr:
7987
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
88+
strategy:
89+
fail-fast: false
90+
matrix:
91+
include:
92+
- example: py312-cuda12-gpu
93+
timeout: 90
94+
- example: py312-cuda12-gpu-megatron
95+
timeout: 120
8096
runs-on: linux-amd64-gpu-h100-latest-2
81-
timeout-minutes: 150
97+
timeout-minutes: ${{ matrix.timeout }}
8298
container: *gpu_container
8399
steps: *gpu_steps
84100
gpu-pr-required-check:

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ NVIDIA Model Optimizer Changelog (Linux)
2222
- Add PTQ support for GLM-4.7, including loading MTP layer weights from a separate ``mtp.safetensors`` file and export as-is.
2323
- Add support for image-text data calibration in PTQ for Nemotron VL models.
2424
- Add Megatron Core export/import mapping for Qwen3-VL (``Qwen3VLForConditionalGeneration``) vision-language models. The mapping handles the ``model.language_model.`` weight prefix used by Qwen3-VL and supports both dense and MoE variants.
25+
- Add PTQ support for Nemotron Parse.
26+
- Add distillation support for LTX-2. See `examples/diffusers/distillation/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/diffusers/distillation>`_ for more details.
2527

2628
0.41 (2026-01-19)
2729
^^^^^^^^^^^^^^^^^
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
# LTX-2 Distillation Training with ModelOpt
2+
3+
Knowledge distillation for LTX-2 DiT models using NVIDIA ModelOpt. A frozen **teacher** guides a trainable **student** through a combined loss:
4+
5+
```text
6+
L_total = α × L_task + (1-α) × L_distill
7+
```
8+
9+
Currently supported:
10+
11+
- **Quantization-Aware Distillation (QAD)** — student uses ModelOpt fake quantization
12+
13+
Planned:
14+
15+
- **Sparsity-Aware Distillation (SAD)** — student uses ModelOpt sparsity
16+
17+
## Installation
18+
19+
```bash
20+
# Change to the distillation example directory (path is relative to the repo root)
21+
cd examples/diffusers/distillation
22+
23+
# Install Model-Optimizer in editable mode (the repo root is three levels up)
24+
pip install -e ../../..
25+
26+
# Install all dependencies (ltx-trainer, ltx-core, ltx-pipelines, omegaconf)
27+
pip install -r requirements.txt
28+
```
29+
30+
## Quick Start
31+
32+
### 1. Prepare Your Dataset
33+
34+
Use the ltx-trainer preprocessing to extract latents and text embeddings:
35+
36+
```bash
37+
python -m ltx_trainer.preprocess \
38+
--input_dir /path/to/videos \
39+
--output_dir /path/to/preprocessed \
40+
--model_path /path/to/ltx2/checkpoint.safetensors
41+
```
42+
43+
### 2. Configure
44+
45+
Copy and edit the example config:
46+
47+
```bash
48+
cp configs/distillation_example.yaml configs/my_experiment.yaml
49+
```
50+
51+
Key settings to update:
52+
53+
```yaml
54+
model:
55+
model_path: "/path/to/ltx2/checkpoint.safetensors"
56+
text_encoder_path: "/path/to/gemma/model"
57+
58+
data:
59+
preprocessed_data_root: "/path/to/preprocessed/data"
60+
61+
distillation:
62+
distillation_alpha: 0.5 # 1.0 = pure task loss, 0.0 = pure distillation
63+
quant_cfg: "FP8_DEFAULT_CFG" # or INT8_DEFAULT_CFG, NVFP4_DEFAULT_CFG, null
64+
65+
# IMPORTANT: disable ltx-trainer's built-in quantization
66+
acceleration:
67+
quantization: null
68+
```
69+
70+
### 3. Run Training
71+
72+
#### Single GPU
73+
74+
```bash
75+
python distillation_trainer.py --config configs/my_experiment.yaml
76+
```
77+
78+
#### Multi-GPU (Single Node) with Accelerate
79+
80+
```bash
81+
accelerate launch \
82+
--config_file configs/accelerate/fsdp.yaml \
83+
--num_processes 8 \
84+
distillation_trainer.py --config configs/my_experiment.yaml
85+
```
86+
87+
#### Multi-node Training with Accelerate
88+
89+
To launch on multiple nodes, make sure to set the following environment variables on each node:
90+
91+
- `NUM_NODES`: Total number of nodes
92+
- `GPUS_PER_NODE`: Number of GPUs per node
93+
- `NODE_RANK`: Unique rank/index of this node (0-based)
94+
- `MASTER_ADDR`: IP address of the master node (rank 0)
95+
- `MASTER_PORT`: Communication port (e.g., 29500)
96+
97+
Then run the following command on every node:
98+
99+
```bash
100+
accelerate launch \
101+
--config_file configs/accelerate/fsdp.yaml \
102+
--num_machines $NUM_NODES \
103+
--num_processes $((NUM_NODES * GPUS_PER_NODE)) \
104+
--machine_rank $NODE_RANK \
105+
--main_process_ip $MASTER_ADDR \
106+
--main_process_port $MASTER_PORT \
107+
distillation_trainer.py --config configs/my_experiment.yaml
108+
```
109+
110+
**Config overrides** can be passed via CLI using dotted notation:
111+
112+
```bash
113+
accelerate launch ... distillation_trainer.py \
114+
--config configs/my_experiment.yaml \
115+
++distillation.distillation_alpha=0.6 \
116+
++distillation.quant_cfg=INT8_DEFAULT_CFG \
117+
++optimization.learning_rate=1e-5
118+
```
119+
120+
## Configuration Reference
121+
122+
### Calibration
123+
124+
Before training begins, calibration runs full denoising inference to collect activation statistics for accurate quantizer scales. The calibration result is cached as a step-0 checkpoint and reused on subsequent runs.
125+
126+
| Parameter | Default | Description |
127+
|-----------|---------|-------------|
128+
| `calibration_prompts_file` | null | Text file with one prompt per line. If null, the HuggingFace dataset `Gustavosta/Stable-Diffusion-Prompts` is used instead. |
129+
| `calibration_size` | 128 | Number of prompts (each runs a full denoising loop) |
130+
| `calibration_n_steps` | 30 | Denoising steps per prompt |
131+
| `calibration_guidance_scale` | 4.0 | CFG scale (should match inference-time) |
132+
133+
### Checkpoint Resume
134+
135+
| Parameter | Default | Description |
136+
|-----------|---------|-------------|
137+
| `resume_from_checkpoint` | null | `"latest"` to auto-detect, or explicit path |
138+
| `must_save_by` | null | Minutes after which to save and exit (for Slurm time limits) |
139+
| `restore_quantized_checkpoint` | null | Restore a pre-quantized model (skips calibration) |
140+
| `save_quantized_checkpoint` | null | Path to save the final quantized model |
141+
142+
### Custom Quantization Configs
143+
144+
To define custom quantization configs, add entries to `CUSTOM_QUANT_CONFIGS` in `distillation_trainer.py`:
145+
146+
```python
147+
CUSTOM_QUANT_CONFIGS["MY_FP8_CFG"] = {
148+
"quant_cfg": mtq.FP8_DEFAULT_CFG["quant_cfg"],
149+
"algorithm": "max",
150+
}
151+
```
152+
153+
Then reference it in your YAML: `quant_cfg: MY_FP8_CFG`.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# FSDP Configuration
2+
#
3+
# FULL_SHARD across all GPUs for maximum memory efficiency.
4+
# For single-node multi-GPU or multi-node training with `accelerate launch`.
5+
#
6+
# Usage:
7+
# accelerate launch \
8+
# --config_file configs/accelerate/fsdp.yaml \
9+
# --num_processes 16 \
10+
# --num_machines 2 \
11+
# --machine_rank $NODE_RANK \
12+
# --main_process_ip $MASTER_ADDR \
13+
# --main_process_port 29500 \
14+
# distillation_trainer.py --config configs/distillation_example.yaml
15+
16+
distributed_type: FSDP
17+
downcast_bf16: 'no'
18+
enable_cpu_affinity: false
19+
20+
fsdp_config:
21+
# FULL_SHARD: Shard optimizer states, gradients, and parameters across ALL GPUs
22+
# This provides maximum memory efficiency for large models like LTX-2 19B
23+
# Parameters are fully sharded across all nodes (not replicated)
24+
fsdp_sharding_strategy: FULL_SHARD
25+
26+
# Enable activation checkpointing to reduce memory during backward pass
27+
# Critical for 19B model training
28+
fsdp_activation_checkpointing: true
29+
30+
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
31+
fsdp_backward_prefetch: BACKWARD_PRE
32+
fsdp_cpu_ram_efficient_loading: true
33+
fsdp_forward_prefetch: false
34+
fsdp_offload_params: false
35+
fsdp_reshard_after_forward: true
36+
fsdp_state_dict_type: SHARDED_STATE_DICT
37+
fsdp_sync_module_states: true
38+
fsdp_transformer_layer_cls_to_wrap: BasicAVTransformerBlock
39+
fsdp_use_orig_params: true
40+
fsdp_version: 1
41+
42+
# Note: num_machines and num_processes are overridden by accelerate launch command-line args
43+
# These are just defaults for local testing
44+
num_machines: 1
45+
num_processes: 8

0 commit comments

Comments
 (0)