
Commit 1f1c250

realAsma and claude committed
Address PR review: causal shift fix, save_dtype default, forward restore safety
- Fix causal shift inconsistency between _standard_kd_loss and _liger_kd_loss
- Change save_dtype default from "bfloat16" to None (preserve original dtype)
- Add try/except in _forward_redirect to restore module.forward on failure
- Skip ARGUMENTS.md pre-commit hook gracefully when modelopt not installed

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: realAsma <akuriparambi@nvidia.com>
1 parent 4c5a889 commit 1f1c250

4 files changed: 18 additions & 11 deletions


.pre-commit-config.yaml

Lines changed: 2 additions & 1 deletion
@@ -139,7 +139,8 @@ repos:
     hooks:
       - id: generate-arguments-md
         name: Regenerate examples/llm_qat/ARGUMENTS.md
-        entry: bash -c 'python examples/llm_qat/train.py --generate_docs examples/llm_qat/ARGUMENTS.md'
+        entry: bash -c 'python -c "import modelopt" 2>/dev/null && python examples/llm_qat/train.py --generate_docs examples/llm_qat/ARGUMENTS.md || echo
+          "Skipping ARGUMENTS.md generation (modelopt not installed)"'
         language: system
         files: >-
           (?x)^(
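
For reference, the guard the bash one-liner implements can be read as the following Python, a sketch only (regenerate_arguments_md is a hypothetical helper, not part of the commit): check whether modelopt is importable, run the generator if so, and otherwise print the skip message and exit 0 so the commit is not blocked.

import importlib.util
import subprocess
import sys

def regenerate_arguments_md() -> int:
    """Run the ARGUMENTS.md generator only when modelopt is importable."""
    if importlib.util.find_spec("modelopt") is None:
        # Mirror the hook: report the skip but succeed, so environments
        # without modelopt can still commit.
        print("Skipping ARGUMENTS.md generation (modelopt not installed)")
        return 0
    return subprocess.call(
        [sys.executable, "examples/llm_qat/train.py",
         "--generate_docs", "examples/llm_qat/ARGUMENTS.md"]
    )

if __name__ == "__main__":
    sys.exit(regenerate_arguments_md())

One behavioral nuance of the bash form: in `a && b || c`, the echo also runs when the generator itself fails, masking that failure with exit 0; the sketch above only swallows the missing-import case.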

examples/llm_qat/ARGUMENTS.md

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ Extends [HuggingFace TrainingArguments](https://huggingface.co/docs/transformers
 | `--trainable_params` | `list[str]` | `None` | Glob patterns (fnmatch) for parameters that should be trainable. All other parameters will be frozen. Mutually exclusive with frozen_params. |
 | `--frozen_params` | `list[str]` | `None` | Glob patterns (fnmatch) for parameters that should be frozen. Mutually exclusive with trainable_params. |
 | `--lr_config` | `str` | `None` | Path to a YAML file mapping fnmatch patterns to optimizer kwargs (e.g. lr, weight_decay). First matching pattern wins per parameter. See examples/llm_qat/configs/train/lr_config_example.yaml. |
-| `--save_dtype` | `str` | `"bfloat16"` | Dtype string to write into the saved model's config.json (e.g. 'bfloat16', 'float16'). Defaults to 'bfloat16'. |
+| `--save_dtype` | `str` | `None` | Dtype string to write into the saved model's config.json (e.g. 'bfloat16', 'float16'). Preserves the original dtype when not set. |
 | `--manual_gc` | `bool` | `False` | Run `gc.collect()` before each training/prediction step to work around GPU memory leaks during QAT/distillation. |
 | `--liger_ce_label_smoothing` | `float` | `0.0` | Label smoothing for Liger fused CE loss. Only used when --use_liger_kernel is enabled. |
 | `--lora` | `bool` | `False` | Whether to add LoRA (Low-Rank Adaptation) adapter before training. When using real quantization, the LoRA adapter must be set, as quantized weights will be frozen during training. |
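
The default change reads as a simple resolution rule: an explicit --save_dtype wins, otherwise the checkpoint's existing dtype is kept. A minimal sketch (resolve_save_dtype is a hypothetical helper for illustration, not code from this repo):

def resolve_save_dtype(original_dtype: str, save_dtype: str | None) -> str:
    """Pick the dtype string written into the saved model's config.json."""
    # None (the new default) preserves whatever dtype the model already used.
    return save_dtype if save_dtype is not None else original_dtype

assert resolve_save_dtype("float16", None) == "float16"         # preserved
assert resolve_save_dtype("float16", "bfloat16") == "bfloat16"  # explicit override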

modelopt/torch/distill/plugins/huggingface.py

Lines changed: 6 additions & 4 deletions
@@ -180,13 +180,15 @@ def compute_kd_loss_func(self, outputs, labels, **kwargs):
         return self._standard_kd_loss(outputs, labels, **kwargs)

     def _standard_kd_loss(self, outputs, labels, **kwargs):
-        """KD loss with ignore-index masking."""
-        student_logits = outputs.logits.float()
-        teacher_logits = self._last_teacher_outputs.logits.float()
+        """KD loss with causal shift and ignore-index masking."""
+        # Causal LM shift (match _liger_kd_loss semantics)
+        student_logits = outputs.logits[..., :-1, :].contiguous().float()
+        teacher_logits = self._last_teacher_outputs.logits[..., :-1, :].contiguous().float()
         per_token_loss = self._kd_criterion(student_logits, teacher_logits)
         if labels is None:
             return per_token_loss.sum()
-        mask = labels != IGNORE_INDEX
+        shift_labels = labels[..., 1:].contiguous()
+        mask = shift_labels != IGNORE_INDEX
         loss = (per_token_loss * mask).sum() / mask.sum().clamp(min=1)
         self._last_teacher_outputs = None
         return loss
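
The substance of the fix in isolation: in a causal LM, the logit at position t predicts token t+1, so both logit tensors drop their last position and the labels drop their first, keeping the ignore-index mask aligned with the same tokens the loss is computed on. A self-contained sketch, assuming a per-token KL criterion (the diff's self._kd_criterion is not shown; the temperature and KL direction here are assumptions):

import torch
import torch.nn.functional as F

IGNORE_INDEX = -100  # HF convention for masked label positions

def shifted_kd_loss(student_logits, teacher_logits, labels, temperature=1.0):
    # Logit at position t predicts token t+1: drop the last logit position...
    s_logits = student_logits[..., :-1, :].float()
    t_logits = teacher_logits[..., :-1, :].float()
    # ...and drop the first label so position t aligns with label t+1.
    shift_labels = labels[..., 1:]
    per_token = F.kl_div(
        F.log_softmax(s_logits / temperature, dim=-1),
        F.log_softmax(t_logits / temperature, dim=-1),
        reduction="none",
        log_target=True,
    ).sum(-1)  # [batch, seq_len - 1], matching the shifted mask
    mask = shift_labels != IGNORE_INDEX
    # clamp(min=1) mirrors the diff: avoid 0/0 when every label is masked.
    return (per_token * mask).sum() / mask.sum().clamp(min=1)

Before the fix, the unshifted mask had length seq_len while the Liger path masked seq_len - 1 shifted positions, so the two losses weighted tokens inconsistently.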

modelopt/torch/opt/plugins/transformers.py

Lines changed: 9 additions & 5 deletions
@@ -220,12 +220,12 @@ class ModelOptTrainerArguments(ModelOptHFArguments):
             ),
         },
     )
-    save_dtype: str = dataclasses.field(
-        default="bfloat16",
+    save_dtype: str | None = dataclasses.field(
+        default=None,
         metadata={
             "help": (
                 "Dtype string to write into the saved model's config.json "
-                "(e.g. 'bfloat16', 'float16'). Defaults to 'bfloat16'."
+                "(e.g. 'bfloat16', 'float16'). Preserves the original dtype when not set."
             ),
         },
     )
@@ -433,8 +433,12 @@ def wrapped_forward(*a, **kw):
         return fn()

     module.forward = wrapped_forward
-    dummy = torch.empty(1, device=next(module.parameters()).device)
-    return module(dummy)
+    try:
+        dummy = torch.empty(1, device=next(module.parameters()).device)
+        return module(dummy)
+    except Exception:
+        module.forward = original_forward
+        raise


 class ModelOptHFTrainer(Trainer):
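
The second hunk is a restore-on-failure pattern: the module's forward is swapped for a wrapper, and if the probe call raises, the original forward is put back before re-raising so the module is left usable. A standalone sketch mirroring the diff's names (probe_with_redirect and the wrapped_forward argument are hypothetical; the enclosing _forward_redirect is not fully shown):

import torch
import torch.nn as nn

def probe_with_redirect(module: nn.Module, wrapped_forward):
    """Temporarily redirect module.forward; undo the redirect if the call fails."""
    original_forward = module.forward
    module.forward = wrapped_forward
    try:
        dummy = torch.empty(1, device=next(module.parameters()).device)
        return module(dummy)
    except Exception:
        # Restore before re-raising so a failed probe doesn't leave the
        # module permanently wrapped.
        module.forward = original_forward
        raise

A try/finally would restore unconditionally; restoring only in the except branch suggests the success path hands the still-wrapped module back to code that restores it later. That reading matches the shape of the diff but is an inference, not something the hunk itself shows.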
