From d96a4113f862d2df1507a82a547bd76f5f423bf7 Mon Sep 17 00:00:00 2001
From: Rohan Joshi <rohjoshi@nvidia.com>
Date: Mon, 20 Apr 2026 23:18:45 +0000
Subject: [PATCH 1/3] First commit

Signed-off-by: Rohan Joshi <rohjoshi@nvidia.com>
---
 examples/alpamayo/quantize.py | 661 ++++++++++++++++++++++++++++++++++
 1 file changed, 661 insertions(+)
 create mode 100644 examples/alpamayo/quantize.py

diff --git a/examples/alpamayo/quantize.py b/examples/alpamayo/quantize.py
new file mode 100644
index 00000000000..bb4b13e24cb
--- /dev/null
+++ b/examples/alpamayo/quantize.py
@@ -0,0 +1,661 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Quantize AlpamayoR1 and export as an HF-style checkpoint.
+
+Usage:
+    python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-r1-fp8 --quantize fp8
+    python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-r1-nvfp4 --quantize nvfp4 --real-quant
+"""
+
+import argparse
+import collections.abc
+import copy
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any
+
+import einops
+import pandas as pd
+import torch
+from alpamayo_r1.load_physical_aiavdataset import load_physical_aiavdataset
+from alpamayo_r1.models.alpamayo_r1 import AlpamayoR1
+from alpamayo_r1.models.token_utils import to_special_token
+from tqdm import tqdm
+from transformers import AutoProcessor, AutoTokenizer
+
+import modelopt.torch.quantization as mtq
+from modelopt.torch.export import export_hf_checkpoint
+from modelopt.torch.export.quant_utils import get_quant_config
+from modelopt.torch.opt.plugins.huggingface import (
+    _LIBRARY_CLASSES_FOR_PATCHING,
+    _PATCHED_CLASSES,
+    patch_pretrained_methods,
+)
+from modelopt.torch.utils.dataset_utils import create_forward_loop, get_dataset_dataloader
+
+logger = logging.getLogger(__name__)
+
+try:  # Probe for the TensorRT quantize op; if the lookup fails we only warn and carry on.
+    assert torch.ops.tensorrt.quantize_op.default
+except Exception:  # NOTE(review): presumably AttributeError/RuntimeError when unregistered
+    logger.warning("Unable to import quantization op. Please install modelopt library")
+
+MIN_PIXELS = 163840  # passed as `min_pixels` to the processor in get_processor
+MAX_PIXELS = 196608  # passed as `max_pixels` to the processor in get_processor
+BASE_PROCESSOR_NAME = "Qwen/Qwen3-VL-2B-Instruct"  # processor repo loaded by get_processor
+
+
+def create_message(frames: torch.Tensor):
+    """Build the system/user/assistant chat messages for one sample from its image frames."""
+    assert frames.ndim == 4, f"{frames.ndim=}, expected (N, C, H, W)"
+
+    # NOTE: we expand the padding tokens to match training, so we can directly apply native processor from VLM.
+    num_traj_token = 48
+    hist_traj_placeholder = (
+        f"<|traj_history_start|>{'<|traj_history|>' * num_traj_token}<|traj_history_end|>"
+    )
+
+    return [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "You are a driving assistant that generates safe and accurate actions.",
+                }
+            ],
+        },
+        {
+            "role": "user",
+            "content": [{"type": "image", "image": frame} for frame in frames]
+            + [
+                {
+                    "type": "text",  # adjacent literals below: a "\" would embed the indent
+                    "text": f"{hist_traj_placeholder}output the chain-of-thought reasoning of the "
+                    "driving process, then output the future trajectory.",
+                }
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "<|cot_start|>",  # prefilled so the model continues the CoT turn
+                }
+            ],
+        },
+    ]
+
+
+def get_processor(tokenizer: AutoTokenizer) -> AutoProcessor:
+    """Load the BASE_PROCESSOR_NAME processor with pixel bounds and attach ``tokenizer``."""
+    processor_kwargs = {
+        "min_pixels": MIN_PIXELS,
+        "max_pixels": MAX_PIXELS,
+    }
+
+    processor = AutoProcessor.from_pretrained(BASE_PROCESSOR_NAME, **processor_kwargs)
+    processor.tokenizer = tokenizer  # replace the stock tokenizer with the caller's one
+    return processor
+
+
+def to_device(
+    data: Any,
+    device: str | torch.device | None = None,
+    dtype: torch.dtype | None = None,
+) -> Any:
+    """Recursively move every tensor nested in ``data`` to ``device``/``dtype``."""
+    if isinstance(data, torch.Tensor):
+        data = data.to(  # dtype, if given, is applied to every tensor, integer ones too
+            device=device,
+            dtype=dtype,
+        )
+        return data
+    elif isinstance(data, collections.abc.Mapping):  # rebuilt as a plain dict
+        return {key: to_device(data[key], device=device, dtype=dtype) for key in data}
+    elif isinstance(data, collections.abc.Sequence) and not isinstance(data, (str, bytes)):
+        return [to_device(elem, device=device, dtype=dtype) for elem in data]  # always a list
+    else:  # any other leaf (str/bytes included) is returned unchanged
+        return data
+
+
+def enable_huggingface_checkpointing_patch() -> None:
+    """Patch PreTrainedModel.from_pretrained / save_pretrained to save/restore ModelOpt state.
+
+    Must be called before AlpamayoR1.from_pretrained() when loading a quantized (FP8/NVFP4)
+    checkpoint so that modelopt_state.pth is restored and _amax scaling factors are applied.
+    """
+    for name, (classes, methods_list) in _LIBRARY_CLASSES_FOR_PATCHING.items():
+        for cls, patch_methods in zip(classes, methods_list):
+            if cls in _PATCHED_CLASSES:  # patched already; do not wrap twice
+                continue
+            patch_methods = [m for m in patch_methods if m[0] != "_from_config"]
+            patch_pretrained_methods(cls, patch_methods)  # "_from_config" entry is left out
+            _PATCHED_CLASSES.add(cls)
+        print(f"ModelOpt save/restore enabled for `{name}` library.")
+
+
+enable_huggingface_checkpointing_patch()  # runs at import time, before any model is loaded
+
+
+def _teacher_forced_flow_loss_forward(
+    self,
+    data: dict[str, Any],
+) -> dict[str, torch.Tensor]:
+    """Differentiable forward that returns the flow-matching training targets.
+
+    Bypasses autoregressive reasoning generation and diffusion sampling.
+    The VLM runs in a single non-sampling forward pass (with ``<traj_future_start>``
+    appended to the prompt) to build the prompt KV cache; the expert then runs once
+    on a linearly-interpolated noisy action and returns the predicted velocity field.
+
+    Args:
+        data: dict with ``tokenized_data`` (input_ids + other processor outputs),
+            ``ego_history_xyz``, ``ego_history_rot``, ``ego_future_xyz``,
+            ``ego_future_rot``.
+
+    Returns:
+        dict with keys ``v_pred`` and ``v_target``, both shape
+        ``(b, n_diffusion_tokens, action_dim)``. Callers compute MSE between them.
+    """
+    ego_history_xyz = data["ego_history_xyz"]
+    ego_history_rot = data["ego_history_rot"]
+    ego_future_xyz = data["ego_future_xyz"]
+    ego_future_rot = data["ego_future_rot"]
+    b, n_traj_group, _, _ = ego_history_xyz.shape
+    assert n_traj_group == 1, "Only one trajectory group is supported."
+
+    tokenized_data = dict(data["tokenized_data"])  # copy; pop() must not touch the caller's dict
+    input_ids = tokenized_data.pop("input_ids")
+    traj_data_vlm = {
+        "ego_history_xyz": ego_history_xyz,
+        "ego_history_rot": ego_history_rot,
+    }
+    input_ids = self.fuse_traj_tokens(input_ids, traj_data_vlm)
+    device = input_ids.device
+
+    # Append <traj_future_start> so the expert attends through the full prompt
+    # that inference would have generated up to the action block.
+    traj_future_start_id = self.tokenizer.convert_tokens_to_ids(
+        to_special_token("traj_future_start")
+    )
+    start_col = torch.full(
+        (input_ids.shape[0], 1),
+        traj_future_start_id,
+        dtype=input_ids.dtype,
+        device=device,
+    )
+    input_ids = torch.cat([input_ids, start_col], dim=1)
+    if "attention_mask" in tokenized_data and tokenized_data["attention_mask"] is not None:
+        am = tokenized_data["attention_mask"]  # grow the mask by one column for the new token
+        tokenized_data["attention_mask"] = torch.cat(
+            [am, torch.ones((am.shape[0], 1), dtype=am.dtype, device=am.device)], dim=1
+        )
+
+    vlm_outputs = self.vlm(
+        input_ids=input_ids,
+        use_cache=True,
+        return_dict=True,
+        **tokenized_data,
+    )
+    prompt_cache = vlm_outputs.past_key_values
+    prefill_seq_len = prompt_cache.get_seq_length()
+    rope_deltas = self.vlm.model.rope_deltas  # presumably set by the forward above - confirm
+
+    n_diffusion_tokens = self.action_space.get_action_space_dims()[0]
+    offset = torch.full((b,), prefill_seq_len, device=device, dtype=torch.long)
+
+    position_ids = torch.arange(n_diffusion_tokens, device=device)
+    position_ids = einops.repeat(position_ids, "l -> 3 b l", b=b).clone()
+    delta = rope_deltas + offset[:, None]
+    position_ids += delta.to(position_ids.device)  # shift action positions past the prompt
+
+    # No padding between prompt cache and action block: full attention mask.
+    attention_mask = torch.zeros(
+        (b, 1, n_diffusion_tokens, prefill_seq_len + n_diffusion_tokens),
+        dtype=torch.float32,
+        device=device,
+    )
+
+    forward_kwargs = {}
+    if self.config.expert_non_causal_attention:
+        forward_kwargs["is_causal"] = False
+
+    # Build flow-matching target: x_1 = GT action, x_0 ~ N(0, I).
+    x_1 = self.action_space.traj_to_action(
+        traj_history_xyz=ego_history_xyz[:, 0],
+        traj_history_rot=ego_history_rot[:, 0],
+        traj_future_xyz=ego_future_xyz[:, 0],
+        traj_future_rot=ego_future_rot[:, 0],
+    )  # (b, n_diffusion_tokens, 2)
+    x_1 = x_1.to(device=device, dtype=torch.float32)
+
+    x_0 = torch.randn_like(x_1)
+    t = torch.rand(b, 1, 1, device=device, dtype=x_1.dtype)
+    x_t = (1.0 - t) * x_0 + t * x_1
+    v_target = x_1 - x_0
+
+    # Cast to action-module dtype to match action_in_proj / expert weights.
+    proj_dtype = next(self.action_in_proj.parameters()).dtype
+    x_t_cast = x_t.to(dtype=proj_dtype)
+    t_cast = t.to(dtype=proj_dtype)
+
+    future_token_embeds = self.action_in_proj(x_t_cast, t_cast)
+    if future_token_embeds.dim() == 2:
+        future_token_embeds = future_token_embeds.view(b, n_diffusion_tokens, -1)
+
+    expert_out = self.expert(
+        inputs_embeds=future_token_embeds,
+        position_ids=position_ids,
+        past_key_values=prompt_cache,
+        attention_mask=attention_mask,
+        use_cache=True,
+        **forward_kwargs,
+    )
+    prompt_cache.crop(prefill_seq_len)  # trim the cache back to the prompt-only length
+    last_hidden = expert_out.last_hidden_state[:, -n_diffusion_tokens:]
+    v_pred = self.action_out_proj(last_hidden).view(b, *self.action_space.get_action_space_dims())
+
+    return {"v_pred": v_pred.to(torch.float32), "v_target": v_target}
+
+
+def patch_teacher_forced_flow_loss_forward() -> None:
+    """Attach teacher_forced_flow_loss_forward to AlpamayoR1 if missing.
+
+    The public OSS AlpamayoR1 (github.com/nvlabs/alpamayo) does not define this
+    method; it exists only on the internal training fork. The body ported above
+    is the calibration path used by auto_quantize_model.
+    """
+    if not hasattr(AlpamayoR1, "teacher_forced_flow_loss_forward"):  # never override upstream
+        AlpamayoR1.teacher_forced_flow_loss_forward = _teacher_forced_flow_loss_forward
+
+
+patch_teacher_forced_flow_loss_forward()  # applied at import time
+
+
+def make_joint_calibration_forward_loop(
+    *,
+    clip_ids: list[str],
+    processor,
+    t0_us: int,
+    top_p: float,
+    temperature: float,
+    max_generation_length: int,
+    calibration_traj_samples: int,
+    device: str,
+):
+    """
+    Build a calibration loop that exercises both VLM generation and diffusion.
+
+    This avoids text-only calibration: for each clip the returned loop tokenizes the chat prompt
+    and runs ``sample_trajectories_from_data_with_vlm_rollout`` (no_grad, fp16 autocast on CUDA).
+    """
+
+    def _calibration_loop(runtime_model):
+        runtime_model.eval()
+        with torch.no_grad():
+            for clip_id in tqdm(clip_ids, desc="Calibration"):
+                data = load_physical_aiavdataset(clip_id, t0_us=t0_us)
+                messages = create_message(data["image_frames"].flatten(0, 1))
+                inputs = processor.apply_chat_template(
+                    messages,
+                    tokenize=True,
+                    add_generation_prompt=False,
+                    continue_final_message=True,
+                    return_dict=True,
+                    return_tensors="pt",
+                )
+                model_inputs = {
+                    "tokenized_data": inputs,
+                    "ego_history_xyz": data["ego_history_xyz"],
+                    "ego_history_rot": data["ego_history_rot"],
+                }
+                model_inputs = to_device(model_inputs, device)
+
+                with torch.autocast("cuda", dtype=torch.float16):  # fp16; auto path uses bf16
+                    runtime_model.sample_trajectories_from_data_with_vlm_rollout(
+                        data=model_inputs,
+                        top_p=top_p,
+                        temperature=temperature,
+                        num_traj_samples=calibration_traj_samples,
+                        max_generation_length=max_generation_length,
+                    )
+
+    return _calibration_loop
+
+
+def read_clip_ids_from_parquet(parquet_path: str) -> list[str]:
+    """Read the unique clip_ids stored in the ``key`` column of a parquet file.
+
+    The column name is matched case-insensitively; ids are returned as strings, de-duplicated
+    while preserving first-occurrence order.
+
+    Raises:
+        KeyError: if the parquet file has no ``key`` column.
+    """
+    parquet_path = str(parquet_path)
+    df = pd.read_parquet(parquet_path)
+    cols_lower = {str(c).lower(): c for c in df.columns}
+    if "key" not in cols_lower:
+        raise KeyError(f"no 'key' column in {parquet_path}; found {list(df.columns)}")
+    clip_ids = df[cols_lower["key"]].astype(str).tolist()
+    # dict.fromkeys keeps insertion order: drops repeats, keeps first occurrences.
+    return list(dict.fromkeys(clip_ids))
+
+
+def quantize_model(model, args, tokenizer=None, calibration_forward_loop=None):
+    """
+    Quantize a PyTorch model using ModelOpt post-training quantization (PTQ).
+
+    Calibration runs through ``calibration_forward_loop`` when one is given; otherwise a
+    text dataloader is built from ``tokenizer`` via ``get_dataset_dataloader``. Quantizers
+    matching ``*vlm.model.visual*`` are disabled, so the vision tower is left unquantized.
+
+    Supported quantization formats:
+        - fp8: 8-bit floating point quantization
+        - nvfp4: 4-bit NVIDIA floating point quantization
+    Args:
+        model: PyTorch model to quantize. Must be in evaluation mode.
+        args: Command line arguments containing quant_format and debug.
+        tokenizer: Hugging Face tokenizer for creating calibration data.
+            Required only when `calibration_forward_loop` is not provided.
+        calibration_forward_loop: Optional callable taking `model` and running
+            calibration forward passes. Use this for non-text modules whose
+            forward signature is not compatible with dataset_utils batches.
+
+    Returns:
+        Quantized model
+    """
+    # Create calibration forward loop. For standard text models we can build
+    # it from tokenizer-based data, but vision modules often need custom args.
+    if calibration_forward_loop is None:
+        if tokenizer is None:
+            raise ValueError("tokenizer must be provided when calibration_forward_loop is None")
+        calib_dataloader = get_dataset_dataloader(
+            tokenizer=tokenizer,
+            batch_size=32,
+            num_samples=512,
+            device="cuda:0",
+        )
+        calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
+    else:
+        calibrate_loop = calibration_forward_loop
+
+    if args.quant_format == "fp8":
+        quant_cfg = copy.deepcopy(mtq.FP8_DEFAULT_CFG)
+    elif args.quant_format == "nvfp4":
+        quant_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG)
+    else:
+        raise RuntimeError("Unsupported quantization format")
+    quant_cfg["quant_cfg"].append({"quantizer_name": "*vlm.model.visual*", "enable": False})
+
+    model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+    if args.debug:
+        print("================== quantize_model summary ==================")
+        mtq.print_quant_summary(model)
+
+    return model
+
+
+def auto_quantize_model(
+    model,
+    args,
+    *,
+    clip_ids,
+    processor,
+    t0_us: int,
+    top_p: float,
+    temperature: float,
+    max_generation_length: int,
+    calibration_traj_samples: int,
+    device: str,
+):
+    """
+    Quantize a PyTorch model using ModelOpt's AutoQuantize API.
+
+    Searches per-layer across [NVFP4_DEFAULT_CFG, FP8_DEFAULT_CFG] under the
+    effective-bits budget in args.auto_quantize_bits. Batches are built per clip like in
+    make_joint_calibration_forward_loop (plus ego_future_*), but each step runs
+    teacher_forced_flow_loss_forward and is scored with an MSE loss, not a sampling rollout.
+
+    Args:
+        model: PyTorch model to quantize. Must be in eval mode.
+        args: Namespace with `auto_quantize_bits` (float) and `debug` (bool).
+        clip_ids: Iterable of clip_ids for calibration.
+        processor: HF processor used for chat-template tokenization.
+        t0_us, device: Forwarded to load_physical_aiavdataset / target of to_device.
+        top_p, temperature, max_generation_length, calibration_traj_samples: Unused here.
+
+    Returns:
+        Quantized model (the search_state from mtq.auto_quantize is discarded).
+    """
+
+    def _one_epoch():
+        for clip_id in clip_ids:
+            data = load_physical_aiavdataset(clip_id, t0_us=t0_us)
+            messages = create_message(data["image_frames"].flatten(0, 1))
+            inputs = processor.apply_chat_template(
+                messages,
+                tokenize=True,
+                add_generation_prompt=False,
+                continue_final_message=True,
+                return_dict=True,
+                return_tensors="pt",
+            )
+            model_inputs = {
+                "tokenized_data": inputs,
+                "ego_history_xyz": data["ego_history_xyz"],
+                "ego_history_rot": data["ego_history_rot"],
+                "ego_future_xyz": data["ego_future_xyz"],
+                "ego_future_rot": data["ego_future_rot"],
+            }
+            yield to_device(model_inputs, device)
+
+    class _ReusableLoader:
+        """Re-iterable wrapper so modelopt can run calibration + scoring passes."""
+
+        def __iter__(self):
+            return _one_epoch()
+
+    data_loader = _ReusableLoader()
+
+    def forward_step(runtime_model, data):
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            out = runtime_model.teacher_forced_flow_loss_forward(data=data)
+        v_pred, v_target = out["v_pred"], out["v_target"]
+        print(
+            f"[autoquant-fwd] v_pred: finite={torch.isfinite(v_pred).all().item()} "
+            f"min={v_pred.min().item():.4g} max={v_pred.max().item():.4g} "
+            f"abs_mean={v_pred.abs().mean().item():.4g} | "
+            f"v_target: finite={torch.isfinite(v_target).all().item()} "
+            f"min={v_target.min().item():.4g} max={v_target.max().item():.4g}"
+        )
+        return out
+
+    def loss_func(output, batch):
+        loss = torch.nn.functional.mse_loss(output["v_pred"], output["v_target"])
+        print(f"[autoquant-loss] loss={loss.item():.6g} finite={torch.isfinite(loss).item()}")
+        return loss
+
+    # lm_head is excluded from the format search via disabled_layers.
+    model, search_state = mtq.auto_quantize(
+        model,
+        constraints={"effective_bits": args.auto_quantize_bits},
+        quantization_formats=["NVFP4_DEFAULT_CFG", "FP8_DEFAULT_CFG"],
+        data_loader=data_loader,
+        forward_step=forward_step,
+        loss_func=loss_func,
+        disabled_layers="*lm_head*",
+        verbose=True,
+    )
+
+    print("================== auto_quantize search_state ==================")
+    print(search_state)
+
+    if args.debug:
+        print("================== auto_quantize_model summary ==================")
+        mtq.print_quant_summary(model)
+
+    return model
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description="Quantize AlpamayoR1 and export as HF checkpoint")
+    ap.add_argument(
+        "--ckpt",
+        type=str,
+        default="nvidia/Alpamayo-R1-10B",
+        help="HF hub id or local path of the input checkpoint",
+    )
+    ap.add_argument(
+        "--output-dir",
+        type=str,
+        required=True,
+        help="Directory to save the quantized HF checkpoint",
+    )
+    ap.add_argument(
+        "--qformat",
+        type=str,
+        required=True,
+        choices=["fp8", "nvfp4", "auto"],
+        help="Quantization format",
+    )
+    ap.add_argument(
+        "--auto_quantize_bits",
+        type=float,
+        default=4.8,
+        help="Effective-bits budget for AutoQuantize (only used when --quantize auto)",
+    )
+    ap.add_argument(
+        "--parquet",
+        type=str,
+        default="1005_7cam_gold_eval_metadb_public.parquet",
+        help="Parquet file with clip_ids for calibration",
+    )
+    ap.add_argument("--t0_us", type=int, default=5_100_000)
+    ap.add_argument("--top_p", type=float, default=0.98)
+    ap.add_argument("--temperature", type=float, default=0.6)
+    ap.add_argument("--max_generation_length", type=int, default=256)
+    ap.add_argument("--num_traj_samples", type=int, default=6)
+    ap.add_argument(
+        "--limit", type=int, default=644, help="How many clip_ids to use for calibration"
+    )
+    ap.add_argument(
+        "--real-quant",
+        action="store_true",
+        help="Export packed real-quantized weights (fp8 / NVFP4) via "
+        "modelopt.torch.export.export_hf_checkpoint instead of "
+        "saving fake-quant fp16 weights with quantizer state.",
+    )
+    args = ap.parse_args()
+
+    script_dir = Path(__file__).resolve().parent
+    parquet_path = (script_dir / args.parquet).resolve()
+
+    clip_ids = read_clip_ids_from_parquet(str(parquet_path))
+    if args.limit is not None and args.limit > 0:
+        clip_ids = clip_ids[: args.limit]
+    print(f"Loaded {len(clip_ids)} clip_ids from: {parquet_path}")
+
+    device = "cuda"
+    print(f"Loading model from {args.ckpt!r} ...")
+    model = AlpamayoR1.from_pretrained(args.ckpt, dtype=torch.float16).to(
+        device=device, dtype=torch.float16
+    )
+    model.eval()
+
+    processor = get_processor(model.tokenizer)
+
+    # Quantize using existing recipe
+    print(f"Quantizing model ({args.qformat}) ...")
+    quantization_args = argparse.Namespace(
+        quant_format=args.qformat,
+        quant_algo="max",
+        weight_only=False,
+        debug=True,
+        auto_quantize_bits=args.auto_quantize_bits,
+    )
+    if args.qformat == "auto":
+        model = auto_quantize_model(
+            model,
+            quantization_args,
+            clip_ids=clip_ids,
+            processor=processor,
+            t0_us=args.t0_us,
+            top_p=args.top_p,
+            temperature=args.temperature,
+            max_generation_length=args.max_generation_length,
+            calibration_traj_samples=args.num_traj_samples,
+            device=device,
+        )
+    else:
+        # fp8 / nvfp4: calibrate with the joint VLM + diffusion rollout loop
+        calibration_forward_loop = make_joint_calibration_forward_loop(
+            clip_ids=clip_ids,
+            processor=processor,
+            t0_us=args.t0_us,
+            top_p=args.top_p,
+            temperature=args.temperature,
+            max_generation_length=args.max_generation_length,
+            calibration_traj_samples=args.num_traj_samples,
+            device=device,
+        )
+        model = quantize_model(
+            model,
+            quantization_args,
+            calibration_forward_loop=calibration_forward_loop,
+        )
+    model.eval()
+
+    # Save as HF-style checkpoint (output directory is created if missing)
+    os.makedirs(args.output_dir, exist_ok=True)
+    print(f"Saving quantized checkpoint to {args.output_dir!r} ...")
+
+    if args.real_quant:
+        # Persist processor + composite config first so export_hf_checkpoint's
+        # injected `quantization_config` in config.json is the one that wins.
+        processor.save_pretrained(args.output_dir)
+        model.config.save_pretrained(args.output_dir)
+        with torch.inference_mode():
+            export_hf_checkpoint(
+                model,
+                dtype=torch.float16,
+                export_dir=args.output_dir,
+            )
+    else:
+        with torch.inference_mode():
+            model.save_pretrained(args.output_dir)
+
+        processor.save_pretrained(args.output_dir)
+        model.config.save_pretrained(args.output_dir)
+
+        quant_cfg = get_quant_config(model)
+        with open(os.path.join(args.output_dir, "hf_quant_config.json"), "w") as f:
+            json.dump(quant_cfg, f)
+
+    print(f"Quantized checkpoint saved to {args.output_dir}")
+
+
+if __name__ == "__main__":
+    with torch.no_grad():  # NOTE(review): also covers the "auto" path - confirm
+        main()

From 579a10dd3e86b0cfe8d25638839c1b66fddf3e21 Mon Sep 17 00:00:00 2001
From: Rohan Joshi <rohjoshi@nvidia.com>
Date: Mon, 1 Jun 2026 06:38:22 +0000
Subject: [PATCH 2/3] Fixed real quant path

Signed-off-by: Rohan Joshi <rohjoshi@nvidia.com>
---
 examples/alpamayo/quantize.py | 59 ++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/examples/alpamayo/quantize.py b/examples/alpamayo/quantize.py
index bb4b13e24cb..5cf0c087bdd 100644
--- a/examples/alpamayo/quantize.py
+++ b/examples/alpamayo/quantize.py
@@ -407,7 +407,29 @@ def quantize_model(model, args, tokenizer=None, calibration_forward_loop=None):
         quant_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG)
     else:
         raise RuntimeError("Unsupported quantization format")
-    quant_cfg["quant_cfg"].append({"quantizer_name": "*vlm.model.visual*", "enable": False})
+    # Keep the entire vision tower in high precision. We must clear the NVFP4 quantizer
+    # *type* here, not merely disable it: the QuantConv3d in the vision patch-embed routes to
+    # a JIT-compiled implicit-GEMM CUDA kernel whenever its quantizers are NVFP4-typed (num_bits
+    # == (2, 1) with a dynamic block config) -- even when `enable=False`. That path requires
+    # CUDA_HOME (kernel compilation) and would also fake-quantize the vision weights we intend to
+    # leave untouched. Passing a non-NVFP4 cfg (num_bits=8) together with enable=False keeps these
+    # modules on the plain, unquantized forward path. Harmless for FP8 (already disabled there).
+    quant_cfg["quant_cfg"].append(
+        {"quantizer_name": "*vlm.model.visual*", "enable": False, "cfg": {"num_bits": 8}}
+    )
+
+    if args.quant_format == "nvfp4":
+        # NVFP4 packs weights in blocks of 16 along the input (K) dimension. A Linear whose
+        # in_features is not a multiple of 16 gets K-padded when its weight is packed, and
+        # ModelOpt's packed-weight dequantize path cannot reshape the padded buffer back to the
+        # logical shape (it raises e.g. "shape '[512, 60]' is invalid for input of size 32768").
+        # Such layers also never satisfy the real-quant GEMM's K % 64 == 0 requirement, so they
+        # would only ever run on the (now-broken) dequantize fallback. Keep them in high precision.
+        # In AlpamayoR1 these are the small action-projection heads (e.g. the Fourier-feature
+        # encoder input), so the size/speed impact of leaving them unquantized is negligible.
+        for _name, _module in model.named_modules():
+            if isinstance(_module, torch.nn.Linear) and _module.in_features % 16 != 0:
+                quant_cfg["quant_cfg"].append({"quantizer_name": f"{_name}.*", "enable": False})
 
     model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
     if args.debug:
@@ -534,7 +556,7 @@ def main():
         help="Directory to save the quantized HF checkpoint",
     )
     ap.add_argument(
-        "--qformat",
+        "--quantize",
         type=str,
         required=True,
         choices=["fp8", "nvfp4", "auto"],
@@ -558,7 +580,7 @@ def main():
     ap.add_argument("--max_generation_length", type=int, default=256)
     ap.add_argument("--num_traj_samples", type=int, default=6)
     ap.add_argument(
-        "--limit", type=int, default=644, help="How many clip_ids to use for calibration"
+        "--limit", type=int, default=16, help="How many clip_ids to use for calibration"
     )
     ap.add_argument(
         "--real-quant",
@@ -587,15 +609,15 @@ def main():
     processor = get_processor(model.tokenizer)
 
     # Quantize using existing recipe
-    print(f"Quantizing model ({args.qformat}) ...")
+    print(f"Quantizing model ({args.quantize}) ...")
     quantization_args = argparse.Namespace(
-        quant_format=args.qformat,
+        quant_format=args.quantize,
         quant_algo="max",
         weight_only=False,
         debug=True,
         auto_quantize_bits=args.auto_quantize_bits,
     )
-    if args.qformat == "auto":
+    if args.quantize == "auto":
         model = auto_quantize_model(
             model,
             quantization_args,
@@ -632,16 +654,25 @@ def main():
     print(f"Saving quantized checkpoint to {args.output_dir!r} ...")
 
     if args.real_quant:
-        # Persist processor + composite config first so export_hf_checkpoint's
-        # injected `quantization_config` in config.json is the one that wins.
+        # Real (packed) quantization. `mtq.compress` replaces the quantized linears with
+        # RealQuantLinear modules whose weights are packed into the low-precision storage
+        # format (NVFP4 = E2M1 nibbles + per-block FP8 scales) and enables ModelOpt's
+        # real-quant GEMM kernels, so inference runs on the hardware NVFP4 path rather than
+        # fake-quant fp16. We then save through the ModelOpt-patched `save_pretrained`, which
+        # writes the packed weights *and* a `modelopt_state.pth` recording the quantize +
+        # real_quantize modes (including the packed-tensor metadata/scales). Reloading via
+        # `AlpamayoR1.from_pretrained` with ModelOpt HF checkpointing enabled replays those
+        # modes and re-wraps the packed weights, so the checkpoint loads and runs real-quantized.
+        #
+        # NOTE: `export_hf_checkpoint` (the unified vLLM/TRT-LLM deployment format) is
+        # intentionally not used here: that format has no `modelopt_state.pth`, so a custom
+        # model class like AlpamayoR1 cannot reload it through `from_pretrained`.
+        mtq.compress(model)
+        model.eval()
+        with torch.inference_mode():
+            model.save_pretrained(args.output_dir)
         processor.save_pretrained(args.output_dir)
         model.config.save_pretrained(args.output_dir)
-        with torch.inference_mode():
-            export_hf_checkpoint(
-                model,
-                dtype=torch.float16,
-                export_dir=args.output_dir,
-            )
     else:
         with torch.inference_mode():
             model.save_pretrained(args.output_dir)

From 6cc978ee00a772a4aaf971d6bf87ffb473dec688 Mon Sep 17 00:00:00 2001
From: Rohan Joshi <rohjoshi@nvidia.com>
Date: Mon, 1 Jun 2026 22:44:40 +0000
Subject: [PATCH 3/3] Added README and calibration clips

Signed-off-by: Rohan Joshi <rohjoshi@nvidia.com>
---
 ...ws_train_set_for_calibration_25.10.parquet | Bin 0 -> 3949 bytes
 examples/alpamayo/README.md                   |  72 ++++++++++++++++++
 examples/alpamayo/quantize.py                 |  66 ++++++++--------
 3 files changed, 103 insertions(+), 35 deletions(-)
 create mode 100644 examples/alpamayo/0417_16rows_train_set_for_calibration_25.10.parquet
 create mode 100644 examples/alpamayo/README.md

diff --git a/examples/alpamayo/0417_16rows_train_set_for_calibration_25.10.parquet b/examples/alpamayo/0417_16rows_train_set_for_calibration_25.10.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..c16c7d2b98ca493a148dc528f4244f412d51462d
GIT binary patch
literal 3949
zcmb_f&2J+~6>n$ltUZ%JW`!*q<zQJe)*z;a>hJE=te~A4JI*HAjAzDQfTF7I>ZFr)
z+xR0+G8$=4Xv7I24l5354@gKW4oG|Bz<&S|oK}bvXAX1Xz=7pewPQKS4B8dja;mG|
zd-Z<5_fchM&uPfnWcKBa?7uSG*<x0DPm*4yUT@r%B+1fbYAC*sJm1xkW*Za{*D#T$
zxGrV7rCEl;?Xat1>?<y^HA4X{^I*mll=%L-rKzs(xmIHDm9G;W8_Yz^(7?)fJ!CNr
zBdYs4lU=OnBz5=or#hBZlMx$PiV9}BVIYdJ3cF;(q{K6P)8wGGZ^~5FHAD@AA&t2f
zvNYl&oA|b(8n$97f}BM)>}%LWvMOsxb3FqQ;=wjqGnqm?6?TarY^utNLXhgI2GXdm
zBgzOtx{Fl>D+ZC7&h1oP_G}aKp_qwJ_DtPEHa2`D6H7I8N-PU|sn@S$SN9l(jCiJu
zGy*hj-_nrE6b+n#jfE3;U)j{Quwtr6ws@>mmyvDzF4D1S66z^Bv+v|z|H@WfLm{?}
zOiS^RX2=#$V+0x4XSU@t##|99^|8$i)kZ4z3D^-C5oQ`l#<J;S*>^1>LNzSY#-^ep
z!*f*xMRpO9JsC2oyHGCE^%ap~izrMp4HfBzVj)elAXd$=5wryLDJH(=-}(9L4>hVN
zHntStqf&^15@b<Kk?HBoU`&&lMS13Qj~ccGS&&r<VVF8b7Vtxc?15K}S%!(T67=Bh
zdoZkJm$S?3hIIG#muMU-e;a>^`rWz(G}az0!B|WrQ)|l8laK%K_x}LsTJ~EUR1%>1
zL;^lJ>8^N-P3iS`(-&Xx!?Kk9G%-IciF7$5r4q@%rc&=FrN6DFlONy6tR=s_lu1i_
znRnAt;$|k3{ERO*SAQV=b~P>i8{U6eT}w(UnY+^OSJxikn?MPkKc(XL)imVO(n_q!
zr}yHC1pMA0eOrjh?0ebVc6Lb|@Jx~8jE5wM00|VA5}p!@eEbsrGQ~LhL{Z8n3qR7O
z{PM^>WId8iOWBn>OZj(>+1VXwdmX%HC0MVzVKCSaM*Alu3_REbvXpx`iH>9Y)MyX}
z<Jpc}v9Mm)E=%j#m4YrUuN8#5>{70n`{_#V<x2V2H_GX+zY_}gpp;T}2^d59m!t=|
zmzlzHp)3_43eH9<+*y_iS7H&gnz?TI)yjiweNuo;!sUbSEILZg!iqB>JpA#U+&`Df
zzxzh{A1mu$skqlzUqZS_V4Pop^s_|n7m4!Awep|8$#ZzMxYzvUK1s~8ko(Q8T(O+a
zeZD3Y?&X&UB=X28|I6j)r6BUy>Au?!C%tG?dcXL?=cQwi!K^eS(E%&nE0!YCV<3no
zVF;_ym<-3T#A4nbz*NULUNOFsU;G$UvCDnopUpg9cH{>v^!6{yVrKi}vjImeO`^c<
zdohVgZ*VqO^yv|E$DCY`jfqFb1k9h8SOn%lbinsMds;<S>BRz>%PdOJ>fFI48W+O<
ziw95Ci+ez47F+4kA-axvO??HO_WPlz!=)5lNEZ@t9hjfgHvnFUuK)$n*wC&c&Bgx<
zN?Zijq2}T%P+zUGA~4=nWIP@QbTSql^&FC1a)*qJ`$JLwFrY&+Jmb|LoRQ(MKjnl@
z*l-l|BQ8~x`}jUC@qIHsi^}`zeYNyL%C9($#z(F9<C`)6`OWX~M?QGuIL>4E<IjhV
zqdAoWr|LKd&c=a|RVtVCQ!a!(GyS9K!*~s&@FMhe$C)~?58lGbF)aBShR0xEG3&i>
z(ycYMwz7LdH=AP`?F1hM70S-GhNPt**N^%K^=Rvm_PnqjOb_Zk{gAevahr+PIvs6q
zIDxA+4&CT6$eihHed2D^jrzl9)18OTsJ^jv?$w%8?rWzld)%g@*dwncPufb1-)+xv
zC!LlYa$6cTM|6W@b_d<o>2X_m19q(n{Hy0}wV}H;&dUkBmOcRg$K5A&jtxBJDQnu8
zwJXAJy=Moc*F5qzgnry@Hx9uE?mPj`o1>uH(vPUp7|@<8{56}6+V<0fz`w%XJp=pd
z7;<>Q%L$0m)Z;u|<-DMaDv(denY_R$YEDRN_oNFk@ccmRV|O$3z0Gh6aTpNCNvAgm
zJL=<D-)iDct8wVn9?fgvKbamt-RAkL_jX65UGYhK*Y|qW5oyJHyS*@^wT4f7)$z>t
z5v_$&s0HM)dkFJm5yP8!_O?!_GJPAzfI?m4_~p>6RgXK6+j%WF@rdVl5tnR2+-IT=
zJ<@89x>b9Te;;Z&d^?wL;yvu(7*CgJyP_}l0rW54J^Bjyigu?E`^?|<`nhTa@)`Sc
zzwW-P0sq>m-G&<0dsXahZVft76VJ|t$!iPsJnOcqn5uPNbLhKMu-SsMcP#o8`UU1&
zCtjRSrQ6;*CoOv-&hSzFZ2QQW>}+h=;*8GucSASY9Z*FRK6V1_eEVo8@Z+1P#;=Nt
zSYgWVC}+fnnz&!YAnudRhj58mpm<ajMsRuY1s~$)8N4dzZtZm7aY5}V*nhXfuRgAD
z4!JPCq=Y}Q<oBQBbd*Dx_5`Z9&o&%qb7sob6+v}(j$ad?U<A(sKIW#^`kWh`?*WNZ
zhl7Ftj#HUBJ6sN%FYe<?rNJejglB;tzjGaa=D8`z7w|s#K*|>fWcZT_8zXlxK;jFd
aIQv2<DjNT)DDa=c^N*y%&-*?2NBh5RktezU

literal 0
HcmV?d00001

diff --git a/examples/alpamayo/README.md b/examples/alpamayo/README.md
new file mode 100644
index 00000000000..c0c2bc2dda3
--- /dev/null
+++ b/examples/alpamayo/README.md
@@ -0,0 +1,72 @@
+# Quantizing Alpamayo 1
+
+[Alpamayo 1](https://github.com/nvlabs/alpamayo) (formerly Alpamayo-R1) is a
+~10B vision-language-action model trained by NVIDIA for autonomous vehicle
+research. It takes multi-camera video and egomotion history as input and
+produces a Chain-of-Causation reasoning trace plus a future driving trajectory.
+See the paper, [*Alpamayo-R1: Bridging Reasoning and Action Prediction for
+Generalizable Autonomous Driving in the Long
+Tail*](https://arxiv.org/abs/2511.00088), and the
+[nvlabs/alpamayo](https://github.com/nvlabs/alpamayo) repository for details.
+
+This example produces FP8, NVFP4, and mixed-precision quantized checkpoints of
+Alpamayo using ModelOpt. Quantization calibration runs on a small dataset of 16
+AV clips (`0417_16rows_train_set_for_calibration_25.10.parquet`).
+
+## Setup
+
+Clone Alpamayo and install it into the current environment so `alpamayo_r1` is
+importable:
+
+```bash
+git clone https://github.com/nvlabs/alpamayo
+pip install ./alpamayo
+```
+
+Follow the Alpamayo README to request access to the gated model weights and the
+Physical AI AV dataset, then authenticate with `hf auth login`.
+
+## Usage
+
+`quantize.py` loads an Alpamayo checkpoint, calibrates it on the 16 clips, and
+exports an HF-style quantized checkpoint.
+
+### FP8 / NVFP4
+
+By default the script saves **fake-quantized** weights (fp16 weights plus
+quantizer state) — useful for accuracy evaluation:
+
+```bash
+python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-fp8 --quantize fp8
+```
+
+Pass `--real-quant` to save **real-quantized** weights packed into the
+low-precision storage format (NVFP4 = E2M1 nibbles + per-block FP8 scales),
+which run on the hardware low-precision GEMM path:
+
+```bash
+python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-nvfp4 --quantize nvfp4 --real-quant
+```
+
+The vision tower is always kept in high precision. For NVFP4, `--real-quant`,
+or AutoQuantize, Linear layers with an in/out size that is not a multiple of 16
+(the small action-projection heads) stay unquantized; they break the real-quant GEMM backends.
+
+### AutoQuantize (mixed precision)
+
+`--quantize auto` runs ModelOpt's AutoQuantize, which searches per layer between
+NVFP4 and FP8 under an effective-bits budget (`--auto_quantize_bits`, default
+6.5):
+
+```bash
+python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-auto --quantize auto --auto_quantize_bits 6.5
+```
+
+AutoQuantize chooses a per-layer format using a **gradient-based sensitivity
+score**: it backpropagates a loss through the model and estimates how much each
+candidate format perturbs that loss, then picks the cheapest assignment that
+stays within the bit budget. Here the loss is the flow-matching objective — an
+MSE between the action expert's predicted velocity field `v_pred` and the
+target `v_target = x_1 - x_0` from a teacher-forced forward pass on the
+calibration clips. Layers to which the loss is most sensitive keep more bits
+(FP8); the rest go to NVFP4.
diff --git a/examples/alpamayo/quantize.py b/examples/alpamayo/quantize.py
index 5cf0c087bdd..a8dcce7fa8c 100644
--- a/examples/alpamayo/quantize.py
+++ b/examples/alpamayo/quantize.py
@@ -194,8 +194,7 @@ def _teacher_forced_flow_loss_forward(
     input_ids = self.fuse_traj_tokens(input_ids, traj_data_vlm)
     device = input_ids.device
 
-    # Append <traj_future_start> so the expert attends through the full prompt
-    # that inference would have generated up to the action block.
+    # Append <traj_future_start> so the expert attends through the full prompt.
     traj_future_start_id = self.tokenizer.convert_tokens_to_ids(
         to_special_token("traj_future_start")
     )
@@ -407,28 +406,21 @@ def quantize_model(model, args, tokenizer=None, calibration_forward_loop=None):
         quant_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG)
     else:
         raise RuntimeError("Unsupported quantization format")
-    # Keep the entire vision tower in high precision. We must clear the NVFP4 quantizer
-    # *type* here, not merely disable it: the QuantConv3d in the vision patch-embed routes to
-    # a JIT-compiled implicit-GEMM CUDA kernel whenever its quantizers are NVFP4-typed (num_bits
-    # == (2, 1) with a dynamic block config) -- even when `enable=False`. That path requires
-    # CUDA_HOME (kernel compilation) and would also fake-quantize the vision weights we intend to
-    # leave untouched. Passing a non-NVFP4 cfg (num_bits=8) together with enable=False keeps these
-    # modules on the plain, unquantized forward path. Harmless for FP8 (already disabled there).
+    # Keep the vision tower in high precision. Pass a non-NVFP4 cfg (num_bits=8) with
+    # enable=False, not just enable=False: an NVFP4-typed QuantConv3d routes to a JIT
+    # implicit-GEMM CUDA kernel (needs CUDA_HOME) even when disabled.
     quant_cfg["quant_cfg"].append(
         {"quantizer_name": "*vlm.model.visual*", "enable": False, "cfg": {"num_bits": 8}}
     )
 
-    if args.quant_format == "nvfp4":
-        # NVFP4 packs weights in blocks of 16 along the input (K) dimension. A Linear whose
-        # in_features is not a multiple of 16 gets K-padded when its weight is packed, and
-        # ModelOpt's packed-weight dequantize path cannot reshape the padded buffer back to the
-        # logical shape (it raises e.g. "shape '[512, 60]' is invalid for input of size 32768").
-        # Such layers also never satisfy the real-quant GEMM's K % 64 == 0 requirement, so they
-        # would only ever run on the (now-broken) dequantize fallback. Keep them in high precision.
-        # In AlpamayoR1 these are the small action-projection heads (e.g. the Fourier-feature
-        # encoder input), so the size/speed impact of leaving them unquantized is negligible.
+    if args.quant_format == "nvfp4" or getattr(args, "real_quant", False):
+        # Keep Linear layers whose in/out features aren't multiples of 16 in high precision:
+        # they break the real-quant GEMM backends (NVFP4 block packing, FP8 torch._scaled_mm).
+        # In AlpamayoR1 these are the small action-projection heads, so the impact is negligible.
         for _name, _module in model.named_modules():
-            if isinstance(_module, torch.nn.Linear) and _module.in_features % 16 != 0:
+            if isinstance(_module, torch.nn.Linear) and (
+                _module.in_features % 16 != 0 or _module.out_features % 16 != 0
+            ):
                 quant_cfg["quant_cfg"].append({"quantizer_name": f"{_name}.*", "enable": False})
 
     model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
@@ -519,7 +511,16 @@ def loss_func(output, batch):
         print(f"[autoquant-loss] loss={loss.item():.6g} finite={torch.isfinite(loss).item()}")
         return loss
 
-    # try:
+    # Mirror the quantize_model exclusions via disabled_layers (fnmatch against module names),
+    # since the AutoQuantize search also includes NVFP4: keep the vision tower unquantized, and
+    # exclude Linear layers whose in/out features aren't multiples of 16.
+    disabled_layers = ["*lm_head*", "*vlm.model.visual*"]
+    for _name, _module in model.named_modules():
+        if isinstance(_module, torch.nn.Linear) and (
+            _module.in_features % 16 != 0 or _module.out_features % 16 != 0
+        ):
+            disabled_layers.append(_name)
+
     model, search_state = mtq.auto_quantize(
         model,
         constraints={"effective_bits": args.auto_quantize_bits},
@@ -527,7 +528,7 @@ def loss_func(output, batch):
         data_loader=data_loader,
         forward_step=forward_step,
         loss_func=loss_func,
-        disabled_layers="*lm_head*",
+        disabled_layers=disabled_layers,
         verbose=True,
     )
 
@@ -565,13 +566,13 @@ def main():
     ap.add_argument(
         "--auto_quantize_bits",
         type=float,
-        default=4.8,
+        default=6.5,
         help="Effective-bits budget for AutoQuantize (only used when --quantize auto)",
     )
     ap.add_argument(
         "--parquet",
         type=str,
-        default="1005_7cam_gold_eval_metadb_public.parquet",
+        default="0417_16rows_train_set_for_calibration_25.10.parquet",
         help="Parquet file with clip_ids for calibration",
     )
     ap.add_argument("--t0_us", type=int, default=5_100_000)
@@ -616,6 +617,7 @@ def main():
         weight_only=False,
         debug=True,
         auto_quantize_bits=args.auto_quantize_bits,
+        real_quant=args.real_quant,
     )
     if args.quantize == "auto":
         model = auto_quantize_model(
@@ -654,19 +656,13 @@ def main():
     print(f"Saving quantized checkpoint to {args.output_dir!r} ...")
 
     if args.real_quant:
-        # Real (packed) quantization. `mtq.compress` replaces the quantized linears with
-        # RealQuantLinear modules whose weights are packed into the low-precision storage
-        # format (NVFP4 = E2M1 nibbles + per-block FP8 scales) and enables ModelOpt's
-        # real-quant GEMM kernels, so inference runs on the hardware NVFP4 path rather than
-        # fake-quant fp16. We then save through the ModelOpt-patched `save_pretrained`, which
-        # writes the packed weights *and* a `modelopt_state.pth` recording the quantize +
-        # real_quantize modes (including the packed-tensor metadata/scales). Reloading via
-        # `AlpamayoR1.from_pretrained` with ModelOpt HF checkpointing enabled replays those
-        # modes and re-wraps the packed weights, so the checkpoint loads and runs real-quantized.
+        # Real (packed) quantization. `mtq.compress` packs weights into the low-precision
+        # storage format and enables ModelOpt's real-quant GEMM kernels. The ModelOpt-patched
+        # `save_pretrained` writes the packed weights plus a `modelopt_state.pth`, which
+        # `AlpamayoR1.from_pretrained` replays to reload and run real-quantized.
         #
-        # NOTE: `export_hf_checkpoint` (the unified vLLM/TRT-LLM deployment format) is
-        # intentionally not used here: that format has no `modelopt_state.pth`, so a custom
-        # model class like AlpamayoR1 cannot reload it through `from_pretrained`.
+        # NOTE: `export_hf_checkpoint` (the vLLM/TRT-LLM deployment format) isn't used here: it
+        # has no `modelopt_state.pth`, so a custom model class can't reload it via from_pretrained.
         mtq.compress(model)
         model.eval()
         with torch.inference_mode():