@@ -181,20 +181,17 @@ def modify(self, config):
         self.dflash_config.block_size = self.dflash_block_size
 
         # Target layer IDs
-        num_target_layers = base_config.num_hidden_layers
+        num_target_layers = (
+            base_config.num_orig_hidden_layers
+            if self.dflash_offline
+            else base_config.num_hidden_layers
+        )
         num_draft_layers = self.dflash_config.num_hidden_layers
         self.target_layer_ids = build_target_layer_ids(num_target_layers, num_draft_layers)
         self.dflash_config.target_layer_ids = self.target_layer_ids
 
-        # mask_token_id: set in DFlashConfig (or auto-detected by main.py from tokenizer)
-        mask_id = config.dflash_mask_token_id
-        if mask_id is None:
-            raise ValueError(
-                "dflash_mask_token_id is required. Set it in the config YAML "
-                "(dflash.dflash_mask_token_id=TOKEN_ID) or let main.py auto-detect "
-                "from tokenizer.mask_token_id."
-            )
-        self.mask_token_id = mask_id
+        # mask_token_id: validated by DFlashConfig, auto-detected from tokenizer context
+        self.mask_token_id = config.dflash_mask_token_id
         logger.info("DFlash mask_token_id: %s", self.mask_token_id)
 
         # Freeze base model
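For context, `build_target_layer_ids` maps each draft-model layer to the target-model layer whose hidden states it trains against. The actual implementation lives elsewhere in the repository; the sketch below is only an illustration, assuming the common choice of spacing draft layers evenly across the target stack.

# Hypothetical sketch: not the repository's build_target_layer_ids.
# Assumes draft layers are spread evenly across the target stack.
def build_target_layer_ids(num_target_layers: int, num_draft_layers: int) -> list[int]:
    if num_draft_layers > num_target_layers:
        raise ValueError("draft model cannot be deeper than the target model")
    stride = num_target_layers / num_draft_layers
    # e.g. 32 target layers, 4 draft layers -> [7, 15, 23, 31]
    return [int((i + 1) * stride) - 1 for i in range(num_draft_layers)]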
@@ -207,10 +204,17 @@ def modify(self, config):
         self.dflash_module = DFlashModule(self.dflash_config)
         # Match base model dtype/device. Skip if base is on meta (during from_pretrained
         # restore — the model will be moved to the correct device after weight loading).
-        base_device = next(self._base_model.layers[-1].parameters()).device
+        if self.dflash_offline:
+            base_device = self._base_model_lm_head.weight.device
+        else:
+            base_device = next(self._base_model.layers[-1].parameters()).device
         if base_device.type != "meta":
             self.dflash_module.to(self._base_model.dtype).to(base_device)
 
+        # Delete base model layers for offline training (save memory)
+        if self.dflash_offline:
+            self._base_model._modules.pop("layers")
+
         self.is_quantized = False
         self._num_anchors = self.dflash_num_anchors
 
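The `_modules.pop("layers")` line is the memory optimization that makes the offline path worthwhile: once hidden states come precomputed, the transformer stack is dead weight. A minimal sketch of the effect on a toy module, assuming standard PyTorch semantics (popping a key from `_modules` detaches that submodule from the parent):

# Toy demonstration of the memory-saving pattern above.
import torch.nn as nn

model = nn.ModuleDict({
    "embed": nn.Embedding(1000, 64),
    "layers": nn.Sequential(*[nn.Linear(64, 64) for _ in range(4)]),
    "lm_head": nn.Linear(64, 1000),
})
before = sum(p.numel() for p in model.parameters())
model._modules.pop("layers")  # detach the stack; its params become collectable
after = sum(p.numel() for p in model.parameters())
assert after < before  # only embed + lm_head parameters remain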
@@ -465,9 +469,17 @@ def forward(
         )
 
         # 1. Run base model → extract target hidden states
-        base_outputs = self._dflash_base_model_forward(
-            input_ids, attention_mask, freeze=self.dflash_freeze_base_model
-        )
+        if self.dflash_offline:
+            assert "base_model_outputs" in kwargs
+            base_outputs = DFlashBaseModelOutput.from_offline_dict(kwargs["base_model_outputs"])
+            if base_outputs.logits is None and self.dflash_self_logit_distillation:
+                # Compute logits from last-layer hidden states for KD loss
+                out_hiddens = kwargs["base_model_outputs"].get("base_model_hidden_states")
+                base_outputs.logits = self._base_model_lm_head(out_hiddens)
+        else:
+            base_outputs = self._dflash_base_model_forward(
+                input_ids, attention_mask, freeze=self.dflash_freeze_base_model
+            )
 
         # 2. Build loss mask.
         # When labels are provided (answer_only_loss), they already encode both
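In the offline branch, the trainer passes precomputed base-model activations through `kwargs["base_model_outputs"]`. The real schema of `DFlashBaseModelOutput` is defined elsewhere in the repository; the dataclass below is a hypothetical stand-in showing one way `from_offline_dict` could hydrate it. The `base_model_hidden_states` key is taken from the diff; the `base_model_logits` key and all field names are assumptions.

# Hypothetical stand-in for DFlashBaseModelOutput. Field and key names other
# than "base_model_hidden_states" are assumptions, not the repository's schema.
from dataclasses import dataclass
from typing import Optional
import torch

@dataclass
class OfflineBaseOutput:
    hidden_states: torch.Tensor            # stacked per-target-layer states
    logits: Optional[torch.Tensor] = None  # optional; recomputed via lm_head if absent

    @classmethod
    def from_offline_dict(cls, d: dict) -> "OfflineBaseOutput":
        return cls(
            hidden_states=d["base_model_hidden_states"],
            logits=d.get("base_model_logits"),  # assumed key; may not be stored
        )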