Revert MO_DEBUG_MAX_LAYERS hatches in model_calib + unified_export_hf

Fridah-nv · Fridah-nv · commit 26f20d42878d · 2026-05-15T12:01:36.000-07:00
The env-var-gated early break (model_calib.layerwise_calibrate) and
export skip (unified_export_hf._process_quantized_modules) were only
needed to bound wall-clock time during the cliff-fix smoke test. The
bug fix itself is purely about not bringing over glm5.1-tmp's clamps in
moe_utils.py — something we already avoid. Removing the debug hatches
keeps the branch a clean superset of main's production behavior.
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
@@ -17,7 +17,6 @@
 
 import collections.abc
 import json
-import os
 import re
 import tempfile
 import warnings
@@ -663,16 +662,6 @@ def _process_quantized_modules(
             # _QuantFusedExperts uses plural `gate_up_proj_weight_quantizers` (ModuleList),
             # which get_quantization_format's singular-weight_quantizer check misses. Handle
             # it explicitly before the format gate so fused-experts get split + quantized.
-            # Debug hatch (paired with MO_DEBUG_MAX_LAYERS in model_calib.layerwise_calibrate):
-            # skip _export_fused_experts for layers whose layerwise calibration was never run.
-            # Those layers' per-expert quantizers have no _amax — touching them triggers the
-            # uncalibrated-fallback warnings or, with corrupt storage, a CUDA illegal-memory
-            # error. With the calibrated layers only, every expert has a valid _amax.
-            _debug_max = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0")
-            if _debug_max > 0:
-                _m = re.search(r"\.layers\.(\d+)\.", name or "")
-                if _m and int(_m.group(1)) >= _debug_max:
-                    continue
             with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                 _export_fused_experts(sub_module, dtype)
         elif get_quantization_format(sub_module) != QUANTIZATION_NONE:
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
@@ -16,7 +16,6 @@
 """Calibration utilities."""
 
 import math
-import os
 import time
 import warnings
 from collections.abc import Callable
@@ -1767,15 +1766,7 @@ def layerwise_calibrate(
             start_layer, resumed_inputs, forward_loop
         )
 
-        _debug_max_layers = int(os.environ.get("MO_DEBUG_MAX_LAYERS", "0") or "0")
-
         for layer_idx in range(start_layer, num_layers):
-            if _debug_max_layers > 0 and layer_idx >= _debug_max_layers:
-                print_rank_0(
-                    f"MO_DEBUG_MAX_LAYERS={_debug_max_layers}: stopping layerwise "
-                    f"calibration after layer {layer_idx - 1}/{num_layers}"
-                )
-                break
             layer = transformer_layers[layer_idx]
 
             def _layer_forward_loop(m, _inputs=layer_inputs):