diff --git a/examples/alpamayo/0417_16rows_train_set_for_calibration_25.10.parquet b/examples/alpamayo/0417_16rows_train_set_for_calibration_25.10.parquet
new file mode 100644
index 0000000000..c16c7d2b98
Binary files /dev/null and b/examples/alpamayo/0417_16rows_train_set_for_calibration_25.10.parquet differ
diff --git a/examples/alpamayo/README.md b/examples/alpamayo/README.md
new file mode 100644
index 0000000000..c0c2bc2dda
--- /dev/null
+++ b/examples/alpamayo/README.md
@@ -0,0 +1,72 @@
+# Quantizing Alpamayo 1
+
+[Alpamayo 1](https://github.com/nvlabs/alpamayo) (formerly Alpamayo-R1) is a
+~10B vision-language-action model trained by NVIDIA for autonomous vehicle
+research. It takes multi-camera video and egomotion history as input and
+produces a Chain-of-Causation reasoning trace plus a future driving trajectory.
+See the paper, [*Alpamayo-R1: Bridging Reasoning and Action Prediction for
+Generalizable Autonomous Driving in the Long
+Tail*](https://arxiv.org/abs/2511.00088), and the
+[nvlabs/alpamayo](https://github.com/nvlabs/alpamayo) repository for details.
+
+This example produces FP8, NVFP4, and mixed-precision quantized checkpoints of
+Alpamayo using ModelOpt. Quantization calibration runs on a small dataset of 16
+AV clips (`0417_16rows_train_set_for_calibration_25.10.parquet`).
+
+## Setup
+
+Clone Alpamayo and install it into the current environment so `alpamayo_r1` is
+importable:
+
+```bash
+git clone https://github.com/nvlabs/alpamayo
+pip install ./alpamayo
+```
+
+Follow the Alpamayo README to request access to the gated model weights and the
+Physical AI AV dataset, then authenticate with `hf auth login`.
+
+## Usage
+
+`quantize.py` loads an Alpamayo checkpoint, calibrates it on the 16 clips, and
+exports an HF-style quantized checkpoint.
+
+### FP8 / NVFP4
+
+By default the script saves **fake-quantized** weights (fp16 weights plus
+quantizer state) — useful for accuracy evaluation:
+
+```bash
+python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-fp8 --quantize fp8
+```
+
+Pass `--real-quant` to save **real-quantized** weights packed into the
+low-precision storage format (NVFP4 = E2M1 nibbles + per-block FP8 scales),
+which run on the hardware low-precision GEMM path:
+
+```bash
+python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-nvfp4 --quantize nvfp4 --real-quant
+```
+
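+The vision tower is always kept in high precision. For NVFP4, `--real-quant`, and
+AutoQuantize runs, `Linear` layers whose input or output dimension is not a
+multiple of 16 (in Alpamayo, the small action-projection heads) are also left
+unquantized, because they break the real-quant GEMM backends. AutoQuantize
+additionally leaves `lm_head` unquantized.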
+
+### AutoQuantize (mixed precision)
+
+`--quantize auto` runs ModelOpt's AutoQuantize, which searches per layer between
+NVFP4 and FP8 under an effective-bits budget (`--auto_quantize_bits`, default
+6.5):
+
+```bash
+python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-auto --quantize auto --auto_quantize_bits 6.5
+```
+
+AutoQuantize chooses a per-layer format using a **gradient-based sensitivity
+score**: it backpropagates a loss through the model and estimates how much each
+candidate format perturbs that loss, then picks the cheapest assignment that
+stays within the bit budget. Here the loss is the flow-matching objective — an
+MSE between the action expert's predicted velocity field `v_pred` and the
+target `v_target = x_1 - x_0` from a teacher-forced forward pass on the
+calibration clips. Layers the loss is sensitive to keep more bits (FP8); the
+rest go to NVFP4.
diff --git a/examples/alpamayo/quantize.py b/examples/alpamayo/quantize.py
new file mode 100644
index 0000000000..a8dcce7fa8
--- /dev/null
+++ b/examples/alpamayo/quantize.py
@@ -0,0 +1,688 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Quantize AlpamayoR1 and export as an HF-style checkpoint.
+
+Usage:
+    python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-r1-fp8 --quantize fp8
+    python quantize.py --ckpt nvidia/Alpamayo-R1-10B --output-dir ./alpamayo-r1-nvfp4 --quantize nvfp4 --real-quant
+"""
+
+import argparse
+import collections.abc
+import copy
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any
+
+import einops
+import pandas as pd
+import torch
+from alpamayo_r1.load_physical_aiavdataset import load_physical_aiavdataset
+from alpamayo_r1.models.alpamayo_r1 import AlpamayoR1
+from alpamayo_r1.models.token_utils import to_special_token
+from tqdm import tqdm
+from transformers import AutoProcessor, AutoTokenizer
+
+import modelopt.torch.quantization as mtq
+from modelopt.torch.export import export_hf_checkpoint
+from modelopt.torch.export.quant_utils import get_quant_config
+from modelopt.torch.opt.plugins.huggingface import (
+    _LIBRARY_CLASSES_FOR_PATCHING,
+    _PATCHED_CLASSES,
+    patch_pretrained_methods,
+)
+from modelopt.torch.utils.dataset_utils import create_forward_loop, get_dataset_dataloader
+
+# Module-level logger for this script.
+logger = logging.getLogger(__name__)
+
+# Probe for the `tensorrt.quantize_op` custom op. Any failure of the lookup is
+# downgraded to a warning so the script can still proceed.
+# NOTE(review): the probe is written as an `assert`, so it is skipped entirely
+# when Python runs with `-O`.
+try:
+    assert torch.ops.tensorrt.quantize_op.default
+except Exception:
+    logger.warning("Unable to import quantization op. Please install modelopt library")
+
+# Pixel bounds handed to the processor as `min_pixels` / `max_pixels` in get_processor().
+MIN_PIXELS = 163840
+MAX_PIXELS = 196608
+# Hub id the base processor is loaded from; get_processor() then replaces its tokenizer.
+BASE_PROCESSOR_NAME = "Qwen/Qwen3-VL-2B-Instruct"
+
+
+def create_message(frames: torch.Tensor):
+    """Build the chat messages (system, user, assistant prefix) for one sample.
+
+    Args:
+        frames: Image frames with ``frames.ndim == 4``, laid out as (N, C, H, W).
+            Each frame becomes one image entry in the user turn.
+
+    Returns:
+        A list of three message dicts: a system prompt, a user turn holding the
+        images followed by the trajectory-history placeholder and the instruction
+        text, and an assistant turn that opens with ``<|cot_start|>``.
+
+    Raises:
+        AssertionError: If ``frames`` is not 4-dimensional.
+    """
+    assert frames.ndim == 4, f"{frames.ndim=}, expected (N, C, H, W)"
+
+    # NOTE: we expand the padding tokens to match training, so we can directly apply native processor from VLM.
+    num_traj_token = 48
+    hist_traj_placeholder = (
+        f"<|traj_history_start|>{'<|traj_history|>' * num_traj_token}<|traj_history_end|>"
+    )
+    # Adjacent literals are joined at compile time. A backslash continuation placed
+    # *inside* a string literal would splice the next line's indentation into the
+    # prompt text, so the instruction is wrapped with implicit concatenation instead.
+    instruction = (
+        f"{hist_traj_placeholder}output the chain-of-thought reasoning of the "
+        "driving process, then output the future trajectory."
+    )
+
+    return [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "You are a driving assistant that generates safe and accurate actions.",
+                }
+            ],
+        },
+        {
+            "role": "user",
+            "content": [{"type": "image", "image": frame} for frame in frames]
+            + [{"type": "text", "text": instruction}],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "<|cot_start|>",
+                }
+            ],
+        },
+    ]
+
+
+def get_processor(tokenizer: AutoTokenizer) -> AutoProcessor:
+    """Load the base Qwen3-VL processor and attach the given tokenizer to it.
+
+    Args:
+        tokenizer: Tokenizer that replaces the one shipped with the base processor.
+
+    Returns:
+        The processor loaded from ``BASE_PROCESSOR_NAME`` with the module-level
+        pixel bounds applied and ``tokenizer`` installed as its tokenizer.
+    """
+    base_processor = AutoProcessor.from_pretrained(
+        BASE_PROCESSOR_NAME,
+        min_pixels=MIN_PIXELS,
+        max_pixels=MAX_PIXELS,
+    )
+    base_processor.tokenizer = tokenizer
+    return base_processor
+
+
+def to_device(
+    data: Any,
+    device: str | torch.device | None = None,
+    dtype: torch.dtype | None = None,
+) -> Any:
+    """Move every tensor nested inside ``data`` to ``device`` / ``dtype``.
+
+    Tensors are converted with ``Tensor.to``. Mappings are rebuilt as plain dicts
+    and sequences (other than ``str`` / ``bytes``) as plain lists, recursing into
+    their elements. Anything else is returned unchanged.
+    """
+    if isinstance(data, torch.Tensor):
+        return data.to(device=device, dtype=dtype)
+
+    if isinstance(data, collections.abc.Mapping):
+        converted = {}
+        for key in data:
+            converted[key] = to_device(data[key], device=device, dtype=dtype)
+        return converted
+
+    # Strings and bytes are sequences too, but must be passed through untouched.
+    is_text = isinstance(data, (str, bytes))
+    if not is_text and isinstance(data, collections.abc.Sequence):
+        return [to_device(item, device=device, dtype=dtype) for item in data]
+
+    return data
+
+
+def enable_huggingface_checkpointing_patch() -> None:
+    """Install ModelOpt's save/restore patches on the registered library classes.
+
+    Walks ``_LIBRARY_CLASSES_FOR_PATCHING`` and patches every class not already in
+    ``_PATCHED_CLASSES``, skipping the ``_from_config`` entry of each method list.
+
+    Must be called before AlpamayoR1.from_pretrained() when loading a quantized
+    (FP8/NVFP4) checkpoint so that modelopt_state.pth is restored and _amax scaling
+    factors are applied.
+    """
+    for library, (lib_classes, lib_method_lists) in _LIBRARY_CLASSES_FOR_PATCHING.items():
+        for target_cls, candidate_methods in zip(lib_classes, lib_method_lists):
+            if target_cls not in _PATCHED_CLASSES:
+                # Entries are indexable with the method name first; drop `_from_config`.
+                kept_methods = [
+                    entry for entry in candidate_methods if entry[0] != "_from_config"
+                ]
+                patch_pretrained_methods(target_cls, kept_methods)
+                _PATCHED_CLASSES.add(target_cls)
+        print(f"ModelOpt save/restore enabled for `{library}` library.")
+
+
+enable_huggingface_checkpointing_patch()
+
+
+def _teacher_forced_flow_loss_forward(
+    self,
+    data: dict[str, Any],
+) -> dict[str, torch.Tensor]:
+    """Differentiable forward that returns the flow-matching training targets.
+
+    Bypasses autoregressive reasoning generation and diffusion sampling.
+    The VLM runs in a single non-sampling forward pass (with ``<traj_future_start>``
+    appended to the prompt) to build the prompt KV cache; the expert then runs once
+    on a linearly-interpolated noisy action and returns the predicted velocity field.
+
+    The noise ``x_0`` and the interpolation time ``t`` are sampled anew on every
+    call, so the outputs are not reproducible unless the torch RNG is seeded.
+
+    Args:
+        data: dict with ``tokenized_data`` (input_ids + other processor outputs),
+            ``ego_history_xyz``, ``ego_history_rot``, ``ego_future_xyz``,
+            ``ego_future_rot``.
+
+    Returns:
+        dict with keys ``v_pred`` and ``v_target``, both shape
+        ``(b, n_diffusion_tokens, action_dim)``. Callers compute MSE between them.
+        ``v_pred`` is cast to float32 before being returned.
+
+    Raises:
+        AssertionError: If ``ego_history_xyz`` carries more than one trajectory group.
+    """
+    ego_history_xyz = data["ego_history_xyz"]
+    ego_history_rot = data["ego_history_rot"]
+    ego_future_xyz = data["ego_future_xyz"]
+    ego_future_rot = data["ego_future_rot"]
+    # ego_history_xyz is unpacked as a 4-D tensor: (batch, trajectory group, ·, ·).
+    b, n_traj_group, _, _ = ego_history_xyz.shape
+    assert n_traj_group == 1, "Only one trajectory group is supported."
+
+    # Shallow-copy so that popping `input_ids` does not mutate the caller's mapping.
+    tokenized_data = dict(data["tokenized_data"])
+    input_ids = tokenized_data.pop("input_ids")
+    traj_data_vlm = {
+        "ego_history_xyz": ego_history_xyz,
+        "ego_history_rot": ego_history_rot,
+    }
+    # NOTE(review): presumably replaces the trajectory-history placeholder tokens with
+    # tokens derived from the ego history — confirm against AlpamayoR1.fuse_traj_tokens.
+    input_ids = self.fuse_traj_tokens(input_ids, traj_data_vlm)
+    device = input_ids.device
+
+    # Append <traj_future_start> so the expert attends through the full prompt.
+    traj_future_start_id = self.tokenizer.convert_tokens_to_ids(
+        to_special_token("traj_future_start")
+    )
+    start_col = torch.full(
+        (input_ids.shape[0], 1),
+        traj_future_start_id,
+        dtype=input_ids.dtype,
+        device=device,
+    )
+    input_ids = torch.cat([input_ids, start_col], dim=1)
+    # Keep the attention mask in step with the extra token appended above.
+    if "attention_mask" in tokenized_data and tokenized_data["attention_mask"] is not None:
+        am = tokenized_data["attention_mask"]
+        tokenized_data["attention_mask"] = torch.cat(
+            [am, torch.ones((am.shape[0], 1), dtype=am.dtype, device=am.device)], dim=1
+        )
+
+    # Single VLM forward over the prompt; only its KV cache is used afterwards.
+    vlm_outputs = self.vlm(
+        input_ids=input_ids,
+        use_cache=True,
+        return_dict=True,
+        **tokenized_data,
+    )
+    prompt_cache = vlm_outputs.past_key_values
+    prefill_seq_len = prompt_cache.get_seq_length()
+    rope_deltas = self.vlm.model.rope_deltas
+
+    n_diffusion_tokens = self.action_space.get_action_space_dims()[0]
+    offset = torch.full((b,), prefill_seq_len, device=device, dtype=torch.long)
+
+    # Action-token positions: 0..n-1 replicated to shape (3, b, n), then shifted by
+    # the prompt length plus the VLM's rope deltas so they continue after the prompt.
+    position_ids = torch.arange(n_diffusion_tokens, device=device)
+    position_ids = einops.repeat(position_ids, "l -> 3 b l", b=b).clone()
+    delta = rope_deltas + offset[:, None]
+    position_ids += delta.to(position_ids.device)
+
+    # No padding between prompt cache and action block: full attention mask.
+    # NOTE(review): the all-zero float mask is presumably additive (0 = attend) — confirm.
+    attention_mask = torch.zeros(
+        (b, 1, n_diffusion_tokens, prefill_seq_len + n_diffusion_tokens),
+        dtype=torch.float32,
+        device=device,
+    )
+
+    forward_kwargs = {}
+    if self.config.expert_non_causal_attention:
+        forward_kwargs["is_causal"] = False
+
+    # Build flow-matching target: x_1 = GT action, x_0 ~ N(0, I).
+    x_1 = self.action_space.traj_to_action(
+        traj_history_xyz=ego_history_xyz[:, 0],
+        traj_history_rot=ego_history_rot[:, 0],
+        traj_future_xyz=ego_future_xyz[:, 0],
+        traj_future_rot=ego_future_rot[:, 0],
+    )  # (b,n_diffusion_tokens, 2)
+    x_1 = x_1.to(device=device, dtype=torch.float32)
+
+    x_0 = torch.randn_like(x_1)
+    # One interpolation time per batch element, uniform in [0, 1).
+    t = torch.rand(b, 1, 1, device=device, dtype=x_1.dtype)
+    # Linear interpolation between noise and ground truth; its time derivative is x_1 - x_0.
+    x_t = (1.0 - t) * x_0 + t * x_1
+    v_target = x_1 - x_0
+
+    # Cast to action-module dtype to match action_in_proj / expert weights.
+    proj_dtype = next(self.action_in_proj.parameters()).dtype
+    x_t_cast = x_t.to(dtype=proj_dtype)
+    t_cast = t.to(dtype=proj_dtype)
+
+    future_token_embeds = self.action_in_proj(x_t_cast, t_cast)
+    # A 2-D projection output is reshaped back to (b, n_diffusion_tokens, hidden).
+    if future_token_embeds.dim() == 2:
+        future_token_embeds = future_token_embeds.view(b, n_diffusion_tokens, -1)
+
+    expert_out = self.expert(
+        inputs_embeds=future_token_embeds,
+        position_ids=position_ids,
+        past_key_values=prompt_cache,
+        attention_mask=attention_mask,
+        use_cache=True,
+        **forward_kwargs,
+    )
+    # Trim the cache back to the prompt length after the expert call.
+    # NOTE(review): presumably this drops entries the expert appended — confirm.
+    prompt_cache.crop(prefill_seq_len)
+    last_hidden = expert_out.last_hidden_state[:, -n_diffusion_tokens:]
+    v_pred = self.action_out_proj(last_hidden).view(b, *self.action_space.get_action_space_dims())
+
+    return {"v_pred": v_pred.to(torch.float32), "v_target": v_target}
+
+
+def patch_teacher_forced_flow_loss_forward() -> None:
+    """Install ``teacher_forced_flow_loss_forward`` on AlpamayoR1 when it is absent.
+
+    The public OSS AlpamayoR1 (github.com/nvlabs/alpamayo) does not define this
+    method; it exists only on the internal training fork. When the class already
+    provides the method it is left untouched; otherwise the module-level port
+    ``_teacher_forced_flow_loss_forward`` is attached. auto_quantize_model relies
+    on this method for its forward step.
+    """
+    method_name = "teacher_forced_flow_loss_forward"
+    if hasattr(AlpamayoR1, method_name):
+        return
+    setattr(AlpamayoR1, method_name, _teacher_forced_flow_loss_forward)
+
+
+patch_teacher_forced_flow_loss_forward()
+
+
+def make_joint_calibration_forward_loop(
+    *,
+    clip_ids: list[str],
+    processor,
+    t0_us: int,
+    top_p: float,
+    temperature: float,
+    max_generation_length: int,
+    calibration_traj_samples: int,
+    device: str,
+):
+    """
+    Return a calibration callable that runs the full VLM rollout on every clip.
+
+    The returned function takes the model, puts it in eval mode and, for each clip
+    id, loads the clip, tokenizes the chat prompt, moves the inputs to ``device``
+    and calls ``sample_trajectories_from_data_with_vlm_rollout`` under fp16
+    autocast with gradients disabled. Running the rollout (rather than a text-only
+    pass) lets the quantizers on the vlm/expert/diffusion path observe
+    representative activations.
+    """
+    template_kwargs = {
+        "tokenize": True,
+        "add_generation_prompt": False,
+        "continue_final_message": True,
+        "return_dict": True,
+        "return_tensors": "pt",
+    }
+    rollout_kwargs = {
+        "top_p": top_p,
+        "temperature": temperature,
+        "num_traj_samples": calibration_traj_samples,
+        "max_generation_length": max_generation_length,
+    }
+
+    def _calibration_loop(runtime_model):
+        runtime_model.eval()
+        with torch.no_grad():
+            for clip_id in tqdm(clip_ids, desc="Calibration"):
+                sample = load_physical_aiavdataset(clip_id, t0_us=t0_us)
+                # Merge the first two frame axes so create_message gets a 4-D tensor.
+                frames = sample["image_frames"].flatten(0, 1)
+                tokenized = processor.apply_chat_template(
+                    create_message(frames), **template_kwargs
+                )
+                batch = to_device(
+                    {
+                        "tokenized_data": tokenized,
+                        "ego_history_xyz": sample["ego_history_xyz"],
+                        "ego_history_rot": sample["ego_history_rot"],
+                    },
+                    device,
+                )
+
+                with torch.autocast("cuda", dtype=torch.float16):
+                    runtime_model.sample_trajectories_from_data_with_vlm_rollout(
+                        data=batch, **rollout_kwargs
+                    )
+
+    return _calibration_loop
+
+
+def read_clip_ids_from_parquet(parquet_path: str) -> list[str]:
+    """
+    Read the calibration clip ids from the ``key`` column of a parquet file.
+
+    The column name is matched case-insensitively (``key``, ``Key``, ``KEY`` ...).
+    No other column names are tried and the index is not used.
+
+    Args:
+        parquet_path: Path of the parquet file to read.
+
+    Returns:
+        The clip ids as strings, de-duplicated while preserving the order of first
+        occurrence.
+
+    Raises:
+        KeyError: If the file has no column named ``key`` (case-insensitive).
+    """
+    df = pd.read_parquet(str(parquet_path))
+
+    # `str(col)` keeps the lookup working when a column label is not a string.
+    columns_by_lower = {str(col).lower(): col for col in df.columns}
+    if "key" not in columns_by_lower:
+        raise KeyError(
+            f"No 'key' column found in {str(parquet_path)!r}; "
+            f"available columns: {list(df.columns)}"
+        )
+
+    clip_ids = df[columns_by_lower["key"]].astype(str).tolist()
+    # Dicts preserve insertion order, so this keeps the first occurrence of each id.
+    return list(dict.fromkeys(clip_ids))
+
+
+def quantize_model(model, args, tokenizer=None, calibration_forward_loop=None):
+    """
+    Apply ModelOpt post-training quantization (PTQ) to ``model``.
+
+    Supported formats (``args.quant_format``):
+        - fp8: starts from ``mtq.FP8_DEFAULT_CFG``
+        - nvfp4: starts from ``mtq.NVFP4_DEFAULT_CFG``
+
+    On top of the default recipe, quantizers under ``vlm.model.visual`` are always
+    disabled. For nvfp4, or whenever ``args.real_quant`` is set, quantizers of
+    Linear layers whose in/out features are not multiples of 16 are disabled too.
+
+    Args:
+        model: PyTorch model to quantize. Must be in evaluation mode.
+        args: Namespace providing ``quant_format`` and ``debug``; ``real_quant``
+            is read when present.
+        tokenizer: Hugging Face tokenizer used to build a text calibration
+            dataloader. Required only when `calibration_forward_loop` is not provided.
+        calibration_forward_loop: Optional callable taking `model` and running
+            calibration forward passes. Use this for modules whose forward
+            signature is not compatible with dataset_utils batches.
+
+    Returns:
+        Quantized model
+
+    Raises:
+        ValueError: If neither a calibration loop nor a tokenizer is supplied.
+        RuntimeError: If ``args.quant_format`` is not ``fp8`` or ``nvfp4``.
+    """
+    # Resolve the calibration loop first: a caller-supplied loop wins, otherwise a
+    # tokenizer-driven text dataloader is wrapped into one.
+    calibrate_loop = calibration_forward_loop
+    if calibrate_loop is None:
+        if tokenizer is None:
+            raise ValueError("tokenizer must be provided when calibration_forward_loop is None")
+        calibrate_loop = create_forward_loop(
+            dataloader=get_dataset_dataloader(
+                tokenizer=tokenizer,
+                batch_size=32,
+                num_samples=512,
+                device="cuda:0",
+            )
+        )
+
+    fmt = args.quant_format
+    if fmt not in ("fp8", "nvfp4"):
+        raise RuntimeError("Unsupported quantization format")
+    base_cfg = mtq.FP8_DEFAULT_CFG if fmt == "fp8" else mtq.NVFP4_DEFAULT_CFG
+    # Deep-copy so the library's default recipe is never mutated.
+    quant_cfg = copy.deepcopy(base_cfg)
+    rules = quant_cfg["quant_cfg"]
+
+    # Keep the vision tower in high precision. Pass a non-NVFP4 cfg (num_bits=8) with
+    # enable=False, not just enable=False: an NVFP4-typed QuantConv3d routes to a JIT
+    # implicit-GEMM CUDA kernel (needs CUDA_HOME) even when disabled.
+    rules.append(
+        {"quantizer_name": "*vlm.model.visual*", "enable": False, "cfg": {"num_bits": 8}}
+    )
+
+    if fmt == "nvfp4" or getattr(args, "real_quant", False):
+        # Keep Linear layers whose in/out features aren't multiples of 16 in high precision:
+        # they break the real-quant GEMM backends (NVFP4 block packing, FP8 torch._scaled_mm).
+        # In AlpamayoR1 these are the small action-projection heads, so the impact is negligible.
+        unaligned_names = [
+            layer_name
+            for layer_name, layer in model.named_modules()
+            if isinstance(layer, torch.nn.Linear)
+            and (layer.in_features % 16 or layer.out_features % 16)
+        ]
+        rules.extend(
+            {"quantizer_name": f"{layer_name}.*", "enable": False}
+            for layer_name in unaligned_names
+        )
+
+    model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+    if args.debug:
+        print("================== quantize_model summary ==================")
+        mtq.print_quant_summary(model)
+
+    return model
+
+
+def auto_quantize_model(
+    model,
+    args,
+    *,
+    clip_ids,
+    processor,
+    t0_us: int,
+    top_p: float,
+    temperature: float,
+    max_generation_length: int,
+    calibration_traj_samples: int,
+    device: str,
+):
+    """
+    Quantize ``model`` with ModelOpt's AutoQuantize (per-layer NVFP4 / FP8 search).
+
+    The search runs over [NVFP4_DEFAULT_CFG, FP8_DEFAULT_CFG] under the
+    effective-bits budget ``args.auto_quantize_bits``. Each batch is one clip: the
+    chat prompt is tokenized and passed, together with the ego history and future
+    trajectories, to ``teacher_forced_flow_loss_forward`` under bf16 autocast. The
+    loss is the MSE between the returned ``v_pred`` and ``v_target``.
+
+    ``lm_head``, the vision tower and Linear layers whose in/out features are not
+    multiples of 16 are excluded from the search.
+
+    Args:
+        model: PyTorch model to quantize. Must be in eval mode.
+        args: Namespace with `auto_quantize_bits` (float) and `debug` (bool).
+        clip_ids: Iterable of clip_ids for calibration; iterated once per pass.
+        processor: HF processor used for chat-template tokenization.
+        t0_us: Timestamp forwarded to the clip loader.
+        top_p, temperature, max_generation_length, calibration_traj_samples:
+            Accepted for signature parity with make_joint_calibration_forward_loop;
+            not used by this function.
+        device: Device the batches are moved to.
+
+    Returns:
+        Quantized model (the search_state from mtq.auto_quantize is printed, then
+        discarded).
+    """
+    trajectory_fields = (
+        "ego_history_xyz",
+        "ego_history_rot",
+        "ego_future_xyz",
+        "ego_future_rot",
+    )
+
+    def _build_batch(clip_id):
+        """Load one clip and assemble the on-device inputs for the forward step."""
+        sample = load_physical_aiavdataset(clip_id, t0_us=t0_us)
+        prompt = create_message(sample["image_frames"].flatten(0, 1))
+        tokenized = processor.apply_chat_template(
+            prompt,
+            tokenize=True,
+            add_generation_prompt=False,
+            continue_final_message=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        batch = {"tokenized_data": tokenized}
+        for field in trajectory_fields:
+            batch[field] = sample[field]
+        return to_device(batch, device)
+
+    class _ClipLoader:
+        """Re-iterable loader: every ``iter()`` starts a fresh pass over the clips."""
+
+        def __iter__(self):
+            for clip_id in clip_ids:
+                yield _build_batch(clip_id)
+
+    data_loader = _ClipLoader()
+
+    def forward_step(runtime_model, data):
+        with torch.autocast("cuda", dtype=torch.bfloat16):
+            out = runtime_model.teacher_forced_flow_loss_forward(data=data)
+        v_pred, v_target = out["v_pred"], out["v_target"]
+        # Per-batch diagnostics so non-finite activations are visible immediately.
+        print(
+            f"[autoquant-fwd] v_pred: finite={torch.isfinite(v_pred).all().item()} "
+            f"min={v_pred.min().item():.4g} max={v_pred.max().item():.4g} "
+            f"abs_mean={v_pred.abs().mean().item():.4g} | "
+            f"v_target: finite={torch.isfinite(v_target).all().item()} "
+            f"min={v_target.min().item():.4g} max={v_target.max().item():.4g}"
+        )
+        return out
+
+    def loss_func(output, batch):
+        # `batch` is unused: both tensors needed for the loss come from `output`.
+        loss = torch.nn.functional.mse_loss(output["v_pred"], output["v_target"])
+        print(f"[autoquant-loss] loss={loss.item():.6g} finite={torch.isfinite(loss).item()}")
+        return loss
+
+    # Mirror the quantize_model exclusions via disabled_layers (fnmatch against module names),
+    # since the AutoQuantize search also includes NVFP4: keep the vision tower unquantized, and
+    # exclude Linear layers whose in/out features aren't multiples of 16.
+    disabled_layers = ["*lm_head*", "*vlm.model.visual*"]
+    disabled_layers.extend(
+        layer_name
+        for layer_name, layer in model.named_modules()
+        if isinstance(layer, torch.nn.Linear)
+        and (layer.in_features % 16 or layer.out_features % 16)
+    )
+
+    model, search_state = mtq.auto_quantize(
+        model,
+        constraints={"effective_bits": args.auto_quantize_bits},
+        quantization_formats=["NVFP4_DEFAULT_CFG", "FP8_DEFAULT_CFG"],
+        data_loader=data_loader,
+        forward_step=forward_step,
+        loss_func=loss_func,
+        disabled_layers=disabled_layers,
+        verbose=True,
+    )
+
+    print("================== auto_quantize search_state ==================")
+    print(search_state)
+
+    if args.debug:
+        print("================== auto_quantize_model summary ==================")
+        mtq.print_quant_summary(model)
+
+    return model
+
+
+def main():
+    """Parse CLI arguments, quantize AlpamayoR1, and save an HF-style checkpoint.
+
+    Steps: read the calibration clip ids from the parquet file (resolved against
+    this script's directory), load the checkpoint in fp16 on CUDA, run either
+    AutoQuantize (``--quantize auto``) or PTQ with the joint rollout calibration
+    loop (``fp8`` / ``nvfp4``), then save model, processor and config to
+    ``--output-dir``. With ``--real-quant`` the weights are packed via
+    ``mtq.compress`` before saving; otherwise ``hf_quant_config.json`` is written
+    next to the fake-quantized weights.
+
+    Raises:
+        ValueError: If the parquet file yields no clip ids.
+    """
+    ap = argparse.ArgumentParser(description="Quantize AlpamayoR1 and export as HF checkpoint")
+    ap.add_argument(
+        "--ckpt",
+        type=str,
+        default="nvidia/Alpamayo-R1-10B",
+        help="HF hub id or local path of the input checkpoint",
+    )
+    ap.add_argument(
+        "--output-dir",
+        type=str,
+        required=True,
+        help="Directory to save the quantized HF checkpoint",
+    )
+    ap.add_argument(
+        "--quantize",
+        type=str,
+        required=True,
+        choices=["fp8", "nvfp4", "auto"],
+        help="Quantization format",
+    )
+    ap.add_argument(
+        "--auto_quantize_bits",
+        type=float,
+        default=6.5,
+        help="Effective-bits budget for AutoQuantize (only used when --quantize auto)",
+    )
+    ap.add_argument(
+        "--parquet",
+        type=str,
+        default="0417_16rows_train_set_for_calibration_25.10.parquet",
+        help="Parquet file with clip_ids for calibration "
+        "(relative paths are resolved against this script's directory)",
+    )
+    ap.add_argument("--t0_us", type=int, default=5_100_000)
+    ap.add_argument("--top_p", type=float, default=0.98)
+    ap.add_argument("--temperature", type=float, default=0.6)
+    ap.add_argument("--max_generation_length", type=int, default=256)
+    ap.add_argument("--num_traj_samples", type=int, default=6)
+    ap.add_argument(
+        "--limit",
+        type=int,
+        default=16,
+        help="How many clip_ids to use for calibration (<= 0 uses all of them)",
+    )
+    ap.add_argument(
+        "--real-quant",
+        action="store_true",
+        help="Pack the weights into the low-precision storage format (fp8 / NVFP4) "
+        "with mtq.compress before saving, instead of saving fake-quant fp16 "
+        "weights with quantizer state.",
+    )
+    args = ap.parse_args()
+
+    # An absolute --parquet path replaces the script directory in the join below.
+    script_dir = Path(__file__).resolve().parent
+    parquet_path = (script_dir / args.parquet).resolve()
+
+    clip_ids = read_clip_ids_from_parquet(str(parquet_path))
+    if args.limit is not None and args.limit > 0:
+        clip_ids = clip_ids[: args.limit]
+    if not clip_ids:
+        # Calibrating on zero clips would leave the quantizers without statistics.
+        raise ValueError(f"No clip_ids found in {parquet_path}; nothing to calibrate on")
+    print(f"Loaded {len(clip_ids)} clip_ids from: {parquet_path}")
+
+    device = "cuda"
+    print(f"Loading model from {args.ckpt!r} ...")
+    model = AlpamayoR1.from_pretrained(args.ckpt, dtype=torch.float16).to(
+        device=device, dtype=torch.float16
+    )
+    model.eval()
+
+    processor = get_processor(model.tokenizer)
+
+    # Quantize using existing recipe
+    print(f"Quantizing model ({args.quantize}) ...")
+    quantization_args = argparse.Namespace(
+        quant_format=args.quantize,
+        quant_algo="max",
+        weight_only=False,
+        debug=True,
+        auto_quantize_bits=args.auto_quantize_bits,
+        real_quant=args.real_quant,
+    )
+    # Both quantization paths take the same clip / sampling settings.
+    calibration_kwargs = {
+        "clip_ids": clip_ids,
+        "processor": processor,
+        "t0_us": args.t0_us,
+        "top_p": args.top_p,
+        "temperature": args.temperature,
+        "max_generation_length": args.max_generation_length,
+        "calibration_traj_samples": args.num_traj_samples,
+        "device": device,
+    }
+    if args.quantize == "auto":
+        model = auto_quantize_model(model, quantization_args, **calibration_kwargs)
+    else:
+        calibration_forward_loop = make_joint_calibration_forward_loop(**calibration_kwargs)
+        model = quantize_model(
+            model,
+            quantization_args,
+            calibration_forward_loop=calibration_forward_loop,
+        )
+    model.eval()
+
+    # Save as HF-style checkpoint
+    os.makedirs(args.output_dir, exist_ok=True)
+    print(f"Saving quantized checkpoint to {args.output_dir!r} ...")
+
+    if args.real_quant:
+        # Real (packed) quantization. `mtq.compress` packs weights into the low-precision
+        # storage format and enables ModelOpt's real-quant GEMM kernels. The ModelOpt-patched
+        # `save_pretrained` writes the packed weights plus a `modelopt_state.pth`, which
+        # `AlpamayoR1.from_pretrained` replays to reload and run real-quantized.
+        #
+        # NOTE: `export_hf_checkpoint` (the vLLM/TRT-LLM deployment format) isn't used here: it
+        # has no `modelopt_state.pth`, so a custom model class can't reload it via from_pretrained.
+        mtq.compress(model)
+        model.eval()
+
+    # The save sequence is the same for packed and fake-quantized weights.
+    with torch.inference_mode():
+        model.save_pretrained(args.output_dir)
+    processor.save_pretrained(args.output_dir)
+    model.config.save_pretrained(args.output_dir)
+
+    if not args.real_quant:
+        # Fake-quant checkpoints additionally record the quantization config.
+        quant_cfg = get_quant_config(model)
+        quant_cfg_path = os.path.join(args.output_dir, "hf_quant_config.json")
+        with open(quant_cfg_path, "w", encoding="utf-8") as f:
+            json.dump(quant_cfg, f)
+
+    print(f"Quantized checkpoint saved to {args.output_dir}")
+
+
+# Script entry point: the whole run executes with gradient tracking disabled.
+# NOTE(review): `--quantize auto` is described as gradient-based, so it needs a
+# backward pass; this relies on mtq.auto_quantize re-enabling gradients internally
+# despite the surrounding no_grad — TODO confirm.
+if __name__ == "__main__":
+    with torch.no_grad():
+        main()