Skip to content

Commit baef63e

Browse files
committed
add Global Hessian with heavy debug code
Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
1 parent a56e251 commit baef63e

4 files changed

Lines changed: 347 additions & 48 deletions

File tree

examples/llm_ptq/hf_ptq.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
import modelopt.torch.quantization as mtq
4949
import modelopt.torch.sparsity as mts
5050
from modelopt.torch.export import (
51-
export_hf_checkpoint,
51+
export_hf_vllm_fq_checkpoint,
5252
export_tensorrt_llm_checkpoint,
5353
get_model_type,
5454
)
@@ -77,6 +77,9 @@
7777
"int4_awq": mtq.INT4_AWQ_CFG,
7878
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
7979
"nvfp4": mtq.NVFP4_DEFAULT_CFG,
80+
"nvfp4_mse": mtq.NVFP4_WEIGHT_MSE_FP8_SWEEP_CFG,
81+
"nvfp4_lo_he": mtq.NVFP4_LOCAL_HESSIAN_CFG,
82+
"nvfp4_gl_he": mtq.NVFP4_GLOBAL_HESSIAN_CFG,
8083
"nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG,
8184
"fp8_pb_wo": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
8285
"fp8_pc_pt": mtq.FP8_PER_CHANNEL_PER_TOKEN_CFG,
@@ -139,10 +142,10 @@ def make_calib_dataloader(
139142
assert tokenizer is not None and isinstance(
140143
tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
141144
), "The PreTrainedTokenizer must be set"
142-
# Labels are only needed for gradient-based auto_quantize
145+
# Labels are needed for gradient-based auto_quantize or global hessian calibration
143146
include_labels = (
144147
args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
145-
)
148+
) or args.qformat == "nvfp4_gl_he" # Global hessian needs labels for backward pass
146149
calib_dataloader = get_dataset_dataloader(
147150
dataset_name=args.dataset,
148151
tokenizer=tokenizer,
@@ -432,8 +435,18 @@ def mono_quantize(
432435

433436
if not use_calibration:
434437
warnings.warn("Dynamic quantization. Calibration skipped.")
438+
439+
# Check if we need backward pass for global hessian calibration
440+
algorithm_cfg = quant_cfg.get("algorithm", {})
441+
use_global_hessian = (
442+
algorithm_cfg.get("method") == "local_hessian"
443+
and algorithm_cfg.get("hessian_type") == "global"
444+
)
445+
435446
calibrate_loop = (
436-
create_forward_loop(dataloader=calib_dataloader) if use_calibration else None
447+
create_forward_loop(dataloader=calib_dataloader, enable_backward=use_global_hessian)
448+
if use_calibration
449+
else None
437450
)
438451

439452
if calibration_only:
@@ -535,7 +548,7 @@ def export_quantized(
535548
"They will be set at deployment time."
536549
)
537550

538-
export_hf_checkpoint(
551+
export_hf_vllm_fq_checkpoint(
539552
full_model,
540553
export_dir=export_path,
541554
)

modelopt/torch/quantization/config.py

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,27 @@
446446
},
447447
"algorithm": {
448448
"method": "local_hessian",
449+
"hessian_type": "local",
450+
"fp8_scale_sweep": True,
451+
},
452+
}
453+
454+
NVFP4_GLOBAL_HESSIAN_CFG = {
455+
"quant_cfg": {
456+
"*weight_quantizer": {
457+
"num_bits": (2, 1),
458+
"block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)},
459+
"axis": None,
460+
"enable": True,
461+
},
462+
"*input_quantizer": {
463+
"enable": False,
464+
},
465+
**_default_disabled_quantizer_cfg,
466+
},
467+
"algorithm": {
468+
"method": "local_hessian",
469+
"hessian_type": "global",
449470
"fp8_scale_sweep": True,
450471
},
451472
}
@@ -1125,23 +1146,42 @@ class MseCalibConfig(QuantizeAlgorithmConfig):
11251146

11261147

11271148
class LocalHessianCalibConfig(QuantizeAlgorithmConfig):
1128-
"""Configuration for local Hessian-weighted MSE calibration.
1149+
"""Configuration for Hessian-weighted MSE calibration.
11291150
11301151
This algorithm uses activation information to optimize per-block scales for weight
11311152
quantization. It minimizes the output reconstruction error by weighting the loss
1132-
with the local Hessian matrix computed from input activations.
1153+
with the Hessian matrix computed from input activations (and optionally output gradients).
11331154
1134-
The local Hessian loss for each block is: ``(dw @ H @ dw.T)`` where:
1155+
The Hessian loss for each block is: ``(dw @ H @ dw.T)`` where:
11351156
- ``dw = weight - quantized_weight`` (weight reconstruction error per block)
1136-
- ``H = X @ X.T`` is the local Hessian computed from input activations X
1157+
- ``H`` is the Hessian matrix (local or global, depending on ``hessian_type``)
1158+
1159+
Two Hessian types are supported:
1160+
1161+
- **local**: ``H = X @ X.T`` - uses only input activations. Faster, no backward pass needed.
1162+
- **global**: ``H = (X * grad²) @ X.T`` - weights by output gradient squared.
1163+
More accurate as it accounts for output importance, but requires backward pass.
11371164
11381165
This method is particularly effective for NVFP4 weight-only quantization where
11391166
activation information helps select better per-block scales.
1140-
11411167
"""
11421168

11431169
method: Literal["local_hessian"] = ModeloptField("local_hessian")
11441170

1171+
hessian_type: Literal["local", "global"] = ModeloptField(
1172+
default="local",
1173+
title="Type of Hessian to compute.",
1174+
description="""Type of Hessian matrix to use for weighting quantization errors:
1175+
1176+
- ``"local"``: H = X @ X.T - Only uses input activations. Fast, forward-pass only.
1177+
- ``"global"``: H = (X * grad²) @ X.T - Weights by output gradient squared.
1178+
More accurate as it captures output importance, but requires backward pass
1179+
during calibration.
1180+
1181+
The global Hessian is closer to the true Fisher Information and typically
1182+
gives better results, but at the cost of running backward passes.""",
1183+
)
1184+
11451185
step_size: float | None = ModeloptField(
11461186
default=0.1,
11471187
gt=0.0,
@@ -1175,8 +1215,8 @@ class LocalHessianCalibConfig(QuantizeAlgorithmConfig):
11751215
block_size: int | None = ModeloptField(
11761216
default=16,
11771217
gt=0,
1178-
title="Block size for local Hessian computation.",
1179-
description="The block size used for computing the local Hessian matrix. "
1218+
title="Block size for Hessian computation.",
1219+
description="The block size used for computing the Hessian matrix. "
11801220
"This should match the block size used in the quantization config. "
11811221
"Default is 16 for NVFP4.",
11821222
)
@@ -1190,7 +1230,7 @@ class LocalHessianCalibConfig(QuantizeAlgorithmConfig):
11901230
debug: bool | None = ModeloptField(
11911231
default=False,
11921232
title="Debug mode.",
1193-
description="If True, module's local Hessian metadata will be kept as a module attribute.",
1233+
description="If True, module's Hessian metadata will be kept as a module attribute.",
11941234
)
11951235

11961236

0 commit comments

Comments (0)