NVIDIA
diff --git a/‎.github/workflows/example_tests.yml‎
Lines changed: 10 additions & 27 deletions b/‎.github/workflows/example_tests.yml‎
Lines changed: 10 additions & 27 deletions
diff --git a/‎.github/workflows/gpu_tests.yml‎
Lines changed: 19 additions & 3 deletions b/‎.github/workflows/gpu_tests.yml‎
Lines changed: 19 additions & 3 deletions
diff --git a/‎CHANGELOG.rst‎
Lines changed: 2 additions & 0 deletions b/‎CHANGELOG.rst‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/diffusers/distillation/README.md‎
Lines changed: 153 additions & 0 deletions b/‎examples/diffusers/distillation/README.md‎
Lines changed: 153 additions & 0 deletions
diff --git a/‎examples/diffusers/distillation/configs/accelerate/fsdp.yaml‎
Lines changed: 45 additions & 0 deletions b/‎examples/diffusers/distillation/configs/accelerate/fsdp.yaml‎
Lines changed: 45 additions & 0 deletions
@@ -56,18 +56,21 @@ jobs:
       match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
 
-  ##### PyTorch Example Tests #####
+  ##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
   torch-pr:
     needs: [check-file-changes, wait-checks]
     if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
     strategy:
       fail-fast: false
       matrix:
         example: [llm_distill, llm_qat, llm_sparsity]
+        include:
+          - example: speculative_decoding
+            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
+      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-l4-latest-1
@@ -78,36 +81,17 @@ jobs:
       fail-fast: false
       matrix:
         example: [llm_distill, llm_qat, llm_sparsity]
+        include:
+          - example: speculative_decoding
+            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
+      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-2
 
-  ##### Speculative Decoding Example Tests (requires 26.01 image) #####
-  speculative-decoding-pr:
-    needs: [check-file-changes, wait-checks]
-    if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
-    uses: ./.github/workflows/_example_tests_runner.yml
-    secrets: inherit
-    with:
-      docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
-      example: speculative_decoding
-      pip_install_extras: "[hf,dev-test]"
-      runner: linux-amd64-gpu-l4-latest-1
-
-  speculative-decoding-non-pr:
-    if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    uses: ./.github/workflows/_example_tests_runner.yml
-    secrets: inherit
-    with:
-      docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
-      example: speculative_decoding
-      pip_install_extras: "[hf,dev-test]"
-      runner: linux-amd64-gpu-h100-latest-2
-
   ##### TensorRT-LLM Example Tests #####
   trtllm-pr:
     needs: [check-file-changes, wait-checks]
@@ -172,15 +156,14 @@ jobs:
   example-pr-required-check:
     # Run even if example tests are skipped
     if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
-    needs: [check-file-changes, torch-pr, speculative-decoding-pr, trtllm-pr, onnx-pr]
+    needs: [check-file-changes, torch-pr, trtllm-pr, onnx-pr]
     runs-on: ubuntu-latest
     steps:
       - name: Required GPU tests did not succeed
         if: |
           needs.check-file-changes.result != 'success' ||
           (needs.check-file-changes.outputs.any_changed == 'true' && (
             needs.torch-pr.result != 'success' ||
-            needs.speculative-decoding-pr.result != 'success' ||
             needs.trtllm-pr.result != 'success' ||
             needs.onnx-pr.result != 'success'
           ))
 
@@ -59,8 +59,16 @@ jobs:
   gpu-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - example: py312-cuda12-gpu
+            timeout: 90
+          - example: py312-cuda12-gpu-megatron
+            timeout: 120
     runs-on: linux-amd64-gpu-l4-latest-1
-    timeout-minutes: 120
+    timeout-minutes: ${{ matrix.timeout }}
     container: &gpu_container
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
@@ -74,11 +82,19 @@ jobs:
         run: |
           echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
-        run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
+        run: pip install tox-current-env && tox -e ${{ matrix.example }} --current-env
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - example: py312-cuda12-gpu
+            timeout: 90
+          - example: py312-cuda12-gpu-megatron
+            timeout: 120
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 150
+    timeout-minutes: ${{ matrix.timeout }}
     container: *gpu_container
     steps: *gpu_steps
   gpu-pr-required-check:
 
@@ -22,6 +22,8 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add PTQ support for GLM-4.7, including loading MTP layer weights from a separate ``mtp.safetensors`` file and export as-is.
 - Add support for image-text data calibration in PTQ for Nemotron VL models.
 - Add Megatron Core export/import mapping for Qwen3-VL (``Qwen3VLForConditionalGeneration``) vision-language models. The mapping handles the ``model.language_model.`` weight prefix used by Qwen3-VL and supports both dense and MoE variants.
+- Add PTQ support for Nemotron Parse.
+- Add distillation support for LTX-2. See `examples/diffusers/distillation/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/diffusers/distillation>`_ for more details.
 
 0.41 (2026-01-19)
 ^^^^^^^^^^^^^^^^^
 
@@ -0,0 +1,153 @@
+# LTX-2 Distillation Training with ModelOpt
+
+Knowledge distillation for LTX-2 DiT models using NVIDIA ModelOpt. A frozen **teacher** guides a trainable **student** through a combined loss, where `α` is the `distillation_alpha` setting (1.0 = pure task loss, 0.0 = pure distillation):
+
+```text
+L_total = α × L_task + (1-α) × L_distill
+```
+
+Currently supported:
+
+- **Quantization-Aware Distillation (QAD)** — student uses ModelOpt fake quantization
+
+Planned:
+
+- **Sparsity-Aware Distillation (SAD)** — student uses ModelOpt sparsity
+
+## Installation
+
+```bash
+# Starting from the repo root, change into the distillation example directory
+cd examples/diffusers/distillation
+
+# Install Model-Optimizer in editable mode (../../.. is the repo root)
+pip install -e ../../..
+
+# Install all dependencies (ltx-trainer, ltx-core, ltx-pipelines, omegaconf)
+pip install -r requirements.txt
+```
+
+## Quick Start
+
+### 1. Prepare Your Dataset
+
+Use the ltx-trainer preprocessing to extract latents and text embeddings:
+
+```bash
+python -m ltx_trainer.preprocess \
+    --input_dir /path/to/videos \
+    --output_dir /path/to/preprocessed \
+    --model_path /path/to/ltx2/checkpoint.safetensors
+```
+
+### 2. Configure
+
+Copy and edit the example config:
+
+```bash
+cp configs/distillation_example.yaml configs/my_experiment.yaml
+```
+
+Key settings to update:
+
+```yaml
+model:
+  model_path: "/path/to/ltx2/checkpoint.safetensors"
+  text_encoder_path: "/path/to/gemma/model"
+
+data:
+  preprocessed_data_root: "/path/to/preprocessed/data"
+
+distillation:
+  distillation_alpha: 0.5       # 1.0 = pure task loss, 0.0 = pure distillation
+  quant_cfg: "FP8_DEFAULT_CFG"  # or INT8_DEFAULT_CFG, NVFP4_DEFAULT_CFG, null
+
+# IMPORTANT: disable ltx-trainer's built-in quantization
+acceleration:
+  quantization: null
+```
+
+### 3. Run Training
+
+#### Single GPU
+
+```bash
+python distillation_trainer.py --config configs/my_experiment.yaml
+```
+
+#### Multi-GPU (Single Node) with Accelerate
+
+```bash
+accelerate launch \
+    --config_file configs/accelerate/fsdp.yaml \
+    --num_processes 8 \
+    distillation_trainer.py --config configs/my_experiment.yaml
+```
+
+#### Multi-node Training with Accelerate
+
+To launch on multiple nodes, make sure to set the following environment variables on each node:
+
+- `NUM_NODES`: Total number of nodes
+- `GPUS_PER_NODE`: Number of GPUs per node
+- `NODE_RANK`: Unique rank/index of this node (0-based)
+- `MASTER_ADDR`: IP address of the master node (rank 0)
+- `MASTER_PORT`: Communication port (e.g., 29500)
+
+Then run this (on every node):
+
+```bash
+accelerate launch \
+    --config_file configs/accelerate/fsdp.yaml \
+    --num_machines $NUM_NODES \
+    --num_processes $((NUM_NODES * GPUS_PER_NODE)) \
+    --machine_rank $NODE_RANK \
+    --main_process_ip $MASTER_ADDR \
+    --main_process_port $MASTER_PORT \
+    distillation_trainer.py --config configs/my_experiment.yaml
+```
+
+**Config overrides** can be passed via CLI using dotted notation, with each `key=value` override prefixed by `++`:
+
+```bash
+accelerate launch ... distillation_trainer.py \
+    --config configs/my_experiment.yaml \
+    ++distillation.distillation_alpha=0.6 \
+    ++distillation.quant_cfg=INT8_DEFAULT_CFG \
+    ++optimization.learning_rate=1e-5
+```
+
+## Configuration Reference
+
+### Calibration
+
+Before training begins, calibration runs full denoising inference to collect activation statistics for accurate quantizer scales. The calibration result is cached as a step-0 checkpoint and reused on subsequent runs.
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `calibration_prompts_file` | null | Text file with one prompt per line. If null, prompts are taken from the HuggingFace dataset `Gustavosta/Stable-Diffusion-Prompts`. |
+| `calibration_size` | 128 | Number of prompts (each runs a full denoising loop) |
+| `calibration_n_steps` | 30 | Denoising steps per prompt |
+| `calibration_guidance_scale` | 4.0 | CFG scale (should match inference-time) |
+
+### Checkpoint Resume
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `resume_from_checkpoint` | null | `"latest"` to auto-detect, or explicit path |
+| `must_save_by` | null | Minutes after which to save and exit (for Slurm time limits) |
+| `restore_quantized_checkpoint` | null | Restore a pre-quantized model (skips calibration) |
+| `save_quantized_checkpoint` | null | Path to save the final quantized model |
+
+### Custom Quantization Configs
+
+To define custom quantization configs, add entries to `CUSTOM_QUANT_CONFIGS` in `distillation_trainer.py`:
+
+```python
+CUSTOM_QUANT_CONFIGS["MY_FP8_CFG"] = {
+    "quant_cfg": mtq.FP8_DEFAULT_CFG["quant_cfg"],
+    "algorithm": "max",
+}
+```
+
+Then reference it in your YAML: `quant_cfg: MY_FP8_CFG`.
@@ -0,0 +1,45 @@
+# FSDP Configuration
+#
+# FULL_SHARD across all GPUs for maximum memory efficiency.
+# For single-node multi-GPU or multi-node training with `accelerate launch`.
+#
+# Usage (multi-node example: 2 nodes x 8 GPUs; run on every node):
+#   accelerate launch \
+#       --config_file configs/accelerate/fsdp.yaml \
+#       --num_processes 16 \
+#       --num_machines 2 \
+#       --machine_rank $NODE_RANK \
+#       --main_process_ip $MASTER_ADDR \
+#       --main_process_port 29500 \
+#       distillation_trainer.py --config configs/distillation_example.yaml
+
+distributed_type: FSDP
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+
+fsdp_config:
+  # FULL_SHARD: Shard optimizer states, gradients, and parameters across ALL GPUs
+  # This provides maximum memory efficiency for large models like LTX-2 19B
+  # Parameters are fully sharded across all nodes (not replicated)
+  fsdp_sharding_strategy: FULL_SHARD
+
+  # Enable activation checkpointing to reduce memory during backward pass
+  # Critical for 19B model training
+  fsdp_activation_checkpointing: true
+
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch: BACKWARD_PRE
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_forward_prefetch: false
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_transformer_layer_cls_to_wrap: BasicAVTransformerBlock
+  fsdp_use_orig_params: true
+  fsdp_version: 1
+
+# Note: `--num_machines` and `--num_processes` passed to `accelerate launch` override the values below.
+# These are just defaults for local testing.
+num_machines: 1
+num_processes: 8