Skip to content

Commit 6f094d7

Browse files
authored
[For RL] Keep attrs after folding weight and fix empty extra state for Megatron (#779)
## What does this PR do? **Type of change:** improvement **Overview:** - For quantization-aware reinforcement learning, after folding the weights of the rollout, we want to keep the quantization attrs for the next step. - Minor fix for empty extra state - Support getting a dataloader from a JSONL file, useful for using training data as calibration data. I can separate this into another PR if necessary. ## Usage `mtq.fold_weight(keep_attrs=True)` will keep quantizer attrs after folding the weights. ## Testing <!-- Mention how have you tested your change if applicable. --> ## Before your PR is "*Ready for review*" <!-- If you haven't finished some of the above items you can still open `Draft` PR. --> - **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed. - **Is this change backward compatible?**: Yes <!--- If No, explain why. --> - **Did you write any new necessary tests?**: NA - **Did you add or update any necessary documentation?**: Yes/No - **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: No <!--- Only for new features, API changes, critical bug fixes or bw breaking changes. --> ## Additional Information <!-- E.g. related issue. --> <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added support for loading dataset samples directly from JSONL/JSONL.GZ files * Added optional parameter to skip logits return in generation prefill operations * Enhanced weight folding operations to optionally preserve quantization attributes during model optimization * **Bug Fixes** * Fixed handling of empty tensor states to prevent deserialization errors in Megatron module <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Meng Xin <mxin@nvidia.com>
1 parent a538f2e commit 6f094d7

8 files changed

Lines changed: 78 additions & 25 deletions

File tree

modelopt/torch/opt/plugins/megatron.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ def _modelopt_set_extra_state(self, state: Any):
9999
return
100100

101101
if isinstance(state, torch.Tensor):
102+
if state.numel() == 0:
103+
return
102104
# Default format: byte tensor with pickled data
103105
#
104106
# TODO: possible deserialization improvement

modelopt/torch/quantization/model_quant.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -530,8 +530,8 @@ def print_quant_summary(model: nn.Module, output_dir: str | None = None):
530530
print("\n".join(lines))
531531

532532

533-
def fold_weight(model: nn.Module):
533+
def fold_weight(model: nn.Module, keep_attrs: bool = False):
534534
"""Fold weight quantizer for fast evaluation."""
535535
for name, module in model.named_modules():
536536
if isinstance(module, QuantModule):
537-
module.fold_weight()
537+
module.fold_weight(keep_attrs)

modelopt/torch/quantization/nn/modules/quant_linear.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -162,9 +162,9 @@ def forward(self, input, *args, **kwargs):
162162
output = super().forward(input, *args, **kwargs)
163163
return output
164164

165-
def fold_weight(self):
165+
def fold_weight(self, keep_attrs: bool = False):
166166
"""Fold the weight for faster eval."""
167-
super().fold_weight()
167+
super().fold_weight(keep_attrs)
168168
if (
169169
hasattr(self, "weight_quantizer")
170170
and hasattr(self, "weight")
@@ -179,13 +179,14 @@ def fold_weight(self):
179179
self.weight
180180
+ self.weight_quantizer.svdquant_lora_b @ self.weight_quantizer.svdquant_lora_a
181181
)
182-
_attrs = [
183-
"_svdquant_lora_a",
184-
"_svdquant_lora_b",
185-
]
186-
for attr in _attrs:
187-
if hasattr(self.weight_quantizer, attr):
188-
delattr(self.weight_quantizer, attr)
182+
if not keep_attrs:
183+
_attrs = [
184+
"_svdquant_lora_a",
185+
"_svdquant_lora_b",
186+
]
187+
for attr in _attrs:
188+
if hasattr(self.weight_quantizer, attr):
189+
delattr(self.weight_quantizer, attr)
189190

190191

191192
class RealQuantLinear(QuantModule):

modelopt/torch/quantization/nn/modules/quant_module.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def modelopt_post_restore(self, prefix: str = ""):
119119
if isinstance(module, TensorQuantizer):
120120
module.to(non_tq_param_or_buffer.device)
121121

122-
def fold_weight(self):
122+
def fold_weight(self, keep_attrs: bool = False):
123123
"""Fold the weight for faster eval."""
124124
# Handle all attributes that end with _weight_quantizer
125125
for name in dir(self):
@@ -138,13 +138,14 @@ def fold_weight(self):
138138
weight = getattr(self, weight_name)
139139
weight.data.copy_(attr(weight.float()).to(weight.dtype))
140140
attr.disable()
141-
_attrs = [
142-
"_pre_quant_scale",
143-
"_amax",
144-
]
145-
for attr_name in _attrs:
146-
if hasattr(attr, attr_name):
147-
delattr(attr, attr_name)
141+
if not keep_attrs:
142+
_attrs = [
143+
"_pre_quant_scale",
144+
"_amax",
145+
]
146+
for attr_name in _attrs:
147+
if hasattr(attr, attr_name):
148+
delattr(attr, attr_name)
148149

149150

150151
QuantModuleRegistry = _DMRegistryCls("Quant", QuantModule)

modelopt/torch/quantization/plugins/huggingface.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,9 +363,9 @@ class HFRowParallelLinear(HFParallelLinear):
363363
class _QuantHFParallelLinear(_ParallelLinear):
364364
_functionals_to_replace = [(torch.nn.functional, "linear")]
365365

366-
def fold_weight(self):
366+
def fold_weight(self, keep_attrs: bool = False):
367367
with self.enable_weight_access_and_writeback():
368-
super().fold_weight()
368+
super().fold_weight(keep_attrs)
369369

370370
@contextmanager
371371
def enable_weight_access_and_writeback(self):

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
228228
)
229229

230230
@torch.no_grad()
231-
def fold_weight(self):
231+
def fold_weight(self, keep_attrs: bool = False):
232232
# the MoE weights can be super large, it consumes too much memory, so we need to fold the weight one by one
233233
for i in range(self.w13_weight.shape[0]):
234234
self.w13_weight[i].copy_(

modelopt/torch/utils/dataset_utils.py

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""Utility functions for getting samples and forward loop function for different datasets."""
1717

1818
import copy
19+
import json
1920
from collections.abc import Callable
2021
from typing import TYPE_CHECKING, Any
2122
from warnings import warn
@@ -110,6 +111,47 @@
110111
]
111112

112113

114+
def _get_jsonl_text_samples(jsonl_path: str, num_samples: int) -> list[str]:
115+
"""Load up to ``num_samples`` entries from a JSONL file using the ``text`` field.
116+
117+
Each non-empty line must be a JSON object containing a ``text`` field.
118+
"""
119+
if num_samples <= 0:
120+
return []
121+
122+
samples: list[str] = []
123+
124+
with open(jsonl_path, encoding="utf-8") as f:
125+
for line_idx, line in enumerate(f, start=1):
126+
if len(samples) >= num_samples:
127+
break
128+
line = line.strip()
129+
if not line:
130+
continue
131+
132+
try:
133+
obj = json.loads(line)
134+
except json.JSONDecodeError as e:
135+
raise ValueError(
136+
f"Invalid JSON in JSONL file {jsonl_path} at line {line_idx}: {e}"
137+
) from e
138+
139+
if not isinstance(obj, dict):
140+
raise ValueError(
141+
f"Expected a JSON object in JSONL file {jsonl_path} at line {line_idx}, "
142+
f"got {type(obj)}."
143+
)
144+
145+
if "text" not in obj:
146+
raise ValueError(
147+
f"Missing required field 'text' in JSONL file {jsonl_path} at line {line_idx}."
148+
)
149+
150+
samples.append(str(obj["text"]))
151+
152+
return samples
153+
154+
113155
def _normalize_splits(split: str | list[str]) -> list[str]:
114156
"""Ensure split is always a list."""
115157
return [split] if isinstance(split, str) else list(split)
@@ -181,7 +223,7 @@ def get_dataset_samples(
181223
``messages``/``conversations`` (chat), ``prompt``, ``text``, or ``input``.
182224
183225
Args:
184-
dataset_name: Name or HuggingFace path of the dataset to load.
226+
dataset_name: Name or HuggingFace path of the dataset to load, or a path to a ``.jsonl``/``.jsonl.gz`` file.
185227
num_samples: Number of samples to load from the dataset.
186228
apply_chat_template: Whether to apply the chat template to the samples
187229
(if supported by the dataset). For unregistered datasets with a
@@ -196,6 +238,10 @@ def get_dataset_samples(
196238
Returns:
197239
Samples: The list of samples.
198240
"""
241+
# Local JSONL file path support (each line is a JSON object with a `text` field).
242+
if dataset_name.endswith(".jsonl"):
243+
return _get_jsonl_text_samples(dataset_name, num_samples)
244+
199245
from datasets import load_dataset
200246

201247
is_registered = dataset_name in SUPPORTED_DATASET_CONFIG
@@ -284,7 +330,8 @@ def get_dataset_dataloader(
284330
"""Get a dataloader with the dataset name and tokenizer of the target model.
285331
286332
Args:
287-
dataset_name: Name of the dataset to load.
333+
dataset_name: Name of the dataset to load, or a path to a ``.jsonl`` file.
334+
If a ``.jsonl`` file is provided, each line must be a JSON object with a ``text`` field.
288335
tokenizer: Instance of HuggingFace tokenizer.
289336
batch_size: Batch size of the returned dataloader.
290337
num_samples: Number of samples from the dataset.

modelopt/torch/utils/plugins/megatron_generate.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ def megatron_prefill(
4646
pixel_values: torch.FloatTensor | None = None,
4747
image_grid_thw: torch.LongTensor | None = None,
4848
image_sizes: torch.LongTensor | None = None,
49+
skip_return_logits: bool = False,
4950
) -> torch.Tensor:
5051
"""A simple prefill function for Megatron Core V(LM) models."""
5152
if not isinstance(model, MegatronModule):
@@ -112,6 +113,8 @@ def _forward_step_func(data, model):
112113
forward_only=True,
113114
collect_non_loss_data=True,
114115
)
116+
if skip_return_logits:
117+
return None
115118

116119
if mpu.is_pipeline_last_stage():
117120
logits = list_of_logits[0][:, :seq_length, :].detach()
@@ -124,7 +127,6 @@ def _forward_step_func(data, model):
124127
logits_dtype = torch.float16
125128
else:
126129
logits_dtype = torch.float32
127-
128130
logits = broadcast_from_last_pipeline_stage(
129131
[max_batch_size, seq_length, model.vocab_size], logits_dtype, logits
130132
)

0 commit comments

Comments (0)