NVIDIA
diff --git a/examples/llm_eval/lm_eval_tensorrt_llm.py b/examples/llm_eval/lm_eval_tensorrt_llm.py
Lines changed: 4 additions & 0 deletions
diff --git a/examples/llm_eval/mmlu.py b/examples/llm_eval/mmlu.py
Lines changed: 19 additions & 2 deletions
diff --git a/examples/llm_eval/run_simple_eval.sh b/examples/llm_eval/run_simple_eval.sh
Lines changed: 13 additions & 1 deletion
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
Lines changed: 55 additions & 1 deletion
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
Lines changed: 28 additions & 18 deletions
diff --git a/examples/llm_ptq/run_tensorrt_llm.py b/examples/llm_ptq/run_tensorrt_llm.py
Lines changed: 8 additions & 1 deletion
diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
Lines changed: 20 additions & 7 deletions
diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh
Lines changed: 4 additions & 1 deletion
@@ -64,6 +64,10 @@ def __init__(
             tokenizer=self.tokenizer,
             max_batch_size=int(batch_size),
             max_seq_len=max_length,
+            # Loglikelihood tasks request context logits. KV cache prefix reuse would return
+            # logits only for the recomputed suffix on shared-prefix requests (e.g. hellaswag),
+            # truncating context_logits and breaking parse_logprobs. Disable it.
+            enable_kv_cache_reuse=False,
         )
         self.max_length = max_length - 1
         logger.info("Loaded TRT-LLM")
 
@@ -183,7 +183,10 @@ def gen_prompt(train_df, subject, k=-1):
 def evaluate(args, subject, model: EvalModel | LLM, dev_df, test_df):
     cors = []
     all_probs = []
-    for i in range(test_df.shape[0]):
+    num_examples = test_df.shape[0]
+    if args.limit is not None:
+        num_examples = min(num_examples, args.limit)
+    for i in range(num_examples):
         # get prompt and make sure it fits
         k = args.ntrain
         prompt_end = format_example(test_df, i, include_answer=False)
@@ -201,6 +204,12 @@ def check_valid_length(model, prompt):
             train_prompt = gen_prompt(dev_df, subject, k)
             prompt = train_prompt + prompt_end
 
+        # Skip examples that do not fit even at zero-shot, otherwise the backend rejects
+        # prompts longer than max_seq_len and aborts the whole evaluation.
+        if not check_valid_length(model, prompt):
+            print(f"Skipping {subject} example {i}: prompt exceeds max_seq_len even at 0-shot.")
+            continue
+
         label = test_df.iloc[i, test_df.shape[1] - 1]
         if isinstance(model, EvalModel):
             pred = model.run(prompt)
@@ -212,7 +221,11 @@ def check_valid_length(model, prompt):
         cors.append(cor)
         all_probs.append(probs)
 
-    acc = np.mean(cors)
+    if not cors:
+        # Every example was skipped (all prompts exceeded max_seq_len). Surface it instead of
+        # silently producing a nan accuracy downstream.
+        print(f"WARNING: all {subject} examples were skipped; reporting accuracy as nan.")
+    acc = np.mean(cors) if cors else float("nan")
     cors = np.array(cors)
 
     all_probs = np.array(all_probs)
@@ -233,8 +246,12 @@ def main(
     auto_quantize_score_size: int = 128,
     auto_quantize_checkpoint: str | None = None,
     sparse_cfg: str | None = None,
+    limit: int | None = None,
     **kwargs,
 ):
+    if limit is not None and limit <= 0:
+        raise ValueError(f"limit must be a positive integer when provided, got {limit}.")
+
     random.seed(RAND_SEED)
     np.random.seed(RAND_SEED)
 
 
@@ -28,11 +28,23 @@ if [ ! -d "human-eval" ]; then
     git clone https://github.com/openai/human-eval.git
 fi
 
+# Pin to a known commit for reproducibility (and so the entry-point patch below matches), forcing
+# it every run so a reused checkout cannot drift to an arbitrary revision. -f discards the patch
+# applied to setup.py on a previous run before re-applying it below.
+git -C human-eval checkout -q -f 6d43fb980f9fee3c892a914eda09951f772ad10d
+
+# human-eval's console_scripts entry point lacks the ":callable" suffix, which newer pip/setuptools
+# reject ("A callable suffix is required"). The target module defines main(), so point at it.
+sed -i 's|human_eval\.evaluate_functional_correctness"|human_eval.evaluate_functional_correctness:main"|' human-eval/setup.py
+
 if [ ! -d "simple-evals" ]; then
     git clone https://github.com/openai/simple-evals.git
 fi
 
-pip install -e human-eval
+# --no-build-isolation: human-eval's legacy setup.py imports pkg_resources at build time,
+# which pip's isolated build env does not provide with newer setuptools. Build against the
+# base environment (which has setuptools/pkg_resources) instead.
+pip install -e human-eval --no-build-isolation
 pip install openai
 
 pushd simple-evals
 
@@ -42,6 +42,9 @@
     ProcessorMixin,
 )
 
+from modelopt.torch.export.model_utils import is_multimodal_model
+from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg
+
 try:
     from huggingface_hub import snapshot_download
 except ImportError:
@@ -51,6 +54,58 @@
 
 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
 
+# TODO: Refactor into the config system.
+# Extra layer patterns appended to the AutoQuantize disabled-layer list when
+# _is_qwen_model() matches (see _get_auto_quantize_disabled_layers).
+_QWEN36_AUTOQ_DISABLED_LAYERS = (
+    "*shared_expert_gate*",
+    "*linear_attn.in_proj_a*",
+    "*linear_attn.in_proj_b*",
+)
+# Layer patterns used when is_multimodal_model() is true: appended to the
+# disabled-layer list and also returned as the cost-accounting exclusions.
+_VLM_AUTOQ_DISABLED_LAYERS = ("*visual*", "*mtp*", "*vision_tower*")
+
+
+def _is_qwen_model(model) -> bool:
+    """Return True when model/config identifiers indicate a Qwen-family model.
+
+    The check is a case-insensitive substring search for ``"qwen"`` over the
+    model's class name and, for the top-level config and its optional
+    ``text_config`` / ``language_config`` sub-configs, the config class name,
+    ``model_type`` and every entry of ``architectures``.
+
+    Args:
+        model: Object to inspect; a missing ``config`` attribute is tolerated.
+
+    Returns:
+        True if any collected identifier contains ``"qwen"``, otherwise False.
+    """
+    candidates = [type(model).__name__]
+    config = getattr(model, "config", None)
+    # When config is None the nested getattr calls also yield None, so every
+    # entry is skipped by the loop below.
+    configs = [
+        config,
+        getattr(config, "text_config", None),
+        getattr(config, "language_config", None),
+    ]
+    for cfg in configs:
+        if cfg is None:
+            continue
+        candidates.append(type(cfg).__name__)
+        model_type = getattr(cfg, "model_type", None)
+        if model_type is not None:
+            candidates.append(str(model_type))
+        # ``or ()`` normalizes a None/empty ``architectures`` value to an empty tuple.
+        architectures = getattr(cfg, "architectures", ()) or ()
+        # Wrap a bare string so it is treated as one name, not iterated per character.
+        if isinstance(architectures, str):
+            architectures = (architectures,)
+        candidates.extend(str(architecture) for architecture in architectures)
+    return any("qwen" in candidate.lower() for candidate in candidates)
+
+
+def _get_auto_quantize_disabled_layers(model) -> list[str]:
+    """Return layer patterns that should be excluded from AutoQuantize search.
+
+    Starts from the ``quantizer_name`` of every ``_default_disabled_quantizer_cfg``
+    entry that has no ``parent_class`` key, leaving out ``*lm_head*``, then appends
+    the Qwen and multimodal pattern sets when the model matches, skipping patterns
+    that are already present.
+
+    Args:
+        model: Model passed to ``_is_qwen_model`` and ``is_multimodal_model``.
+
+    Returns:
+        A new list of pattern strings.
+    """
+    # NOTE(review): "*lm_head*" is dropped from the defaults here, presumably so
+    # lm_head stays visible to the search -- confirm against the caller.
+    disabled_layers = [
+        entry["quantizer_name"]
+        for entry in _default_disabled_quantizer_cfg
+        if "parent_class" not in entry and entry["quantizer_name"] != "*lm_head*"
+    ]
+    if _is_qwen_model(model):
+        disabled_layers.extend(p for p in _QWEN36_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
+    if is_multimodal_model(model):
+        disabled_layers.extend(p for p in _VLM_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
+    return disabled_layers
+
+
+def _get_auto_quantize_cost_excluded_patterns(model) -> list[str]:
+    """Return layer patterns excluded only from AutoQuantize cost accounting.
+
+    Args:
+        model: Model passed to ``is_multimodal_model``.
+
+    Returns:
+        A fresh list copy of ``_VLM_AUTOQ_DISABLED_LAYERS`` for multimodal models,
+        otherwise an empty list.
+    """
+    if is_multimodal_model(model):
+        # Copy so callers cannot mutate the module-level tuple's contents via the result.
+        return list(_VLM_AUTOQ_DISABLED_LAYERS)
+    return []
+
 
 def run_nemotron_vl_preview(
     full_model,
@@ -133,7 +188,6 @@ def is_nemotron_vl(model_or_config):
     # Try to get config from model, or use directly if it's a config
     if hasattr(model_or_config, "config"):
         config = model_or_config.config
-        from modelopt.torch.export.model_utils import is_multimodal_model
 
         if not is_multimodal_model(model_or_config):
             return False
 
@@ -27,6 +27,8 @@
 from cast_mxfp4_to_nvfp4 import apply_to_model as apply_cast_mxfp4_to_nvfp4
 from cast_mxfp4_to_nvfp4 import force_weight_quantizers_static
 from example_utils import (
+    _get_auto_quantize_cost_excluded_patterns,
+    _get_auto_quantize_disabled_layers,
     build_quant_cfg,
     copy_custom_model_files,
     create_vlm_calibration_loop,
@@ -72,7 +74,8 @@
     save_expert_token_count_table,
 )
 from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
-from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
+from modelopt.torch.quantization._auto_quantize_cost import EXCLUDED_MODULE_NAME_PATTERNS_KEY
+from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
 from modelopt.torch.speculative.eagle.utils import (
@@ -132,6 +135,7 @@ def _kv_cfg_uses_constant_amax(kv_quant_cfg: list[dict[str, Any]]) -> bool:
         "nvfp4_awq_lite",
         "nvfp4_w4a4_weight_mse_fp8_sweep",
         "w4a8_awq_beta",
+        "w4a16_nvfp4",
         "fp8_2d_blockwise_weight_only",
         "w4a8_mxfp4_fp8",
         "nvfp4_mlp_only",
@@ -387,10 +391,14 @@ def forward_step(model, batch):
         "effective_bits": args.auto_quantize_bits,
         "cost_model": args.auto_quantize_cost_model,
     }
+    auto_quantize_cost = {}
     if args.auto_quantize_active_moe_expert_ratio is not None:
-        auto_quantize_constraints["cost"] = {
-            "active_moe_expert_ratio": args.auto_quantize_active_moe_expert_ratio
-        }
+        auto_quantize_cost["active_moe_expert_ratio"] = args.auto_quantize_active_moe_expert_ratio
+    cost_excluded_patterns = _get_auto_quantize_cost_excluded_patterns(language_model)
+    if cost_excluded_patterns:
+        auto_quantize_cost[EXCLUDED_MODULE_NAME_PATTERNS_KEY] = cost_excluded_patterns
+    if auto_quantize_cost:
+        auto_quantize_constraints["cost"] = auto_quantize_cost
 
     language_model, _ = mtq.auto_quantize(
         language_model,
@@ -406,12 +414,7 @@ def forward_step(model, batch):
             len(calib_dataloader), max(auto_quantize_score_size // args.batch_size, 1)
         ),
         verbose=True,
-        # Disable all default disabled layers such as lm_head, mlp.gate, router etc.
-        disabled_layers=[
-            entry["quantizer_name"]
-            for entry in _default_disabled_quantizer_cfg
-            if "parent_class" not in entry
-        ],
+        disabled_layers=_get_auto_quantize_disabled_layers(language_model),
         method=auto_quantize_method,
         checkpoint=auto_quantize_checkpoint,
     )
@@ -487,7 +490,7 @@ def load_model(args: argparse.Namespace):
     is_nemotron_vl_model = is_nemotron_vl(full_model)
 
     # Default to image-text calibration for VLM models
-    if is_nemotron_vl_model and not args.calib_with_images:
+    if is_nemotron_vl_model and not args.calib_with_images and args.auto_quantize_bits is None:
         print("Nemotron VL model detected. Enabling image-text calibration by default.")
         args.calib_with_images = True
 
@@ -539,12 +542,10 @@ def load_model(args: argparse.Namespace):
                 : len(args.dataset)
             ]
 
-            # We only quantize the language model for VLMs other than the type supported above.
-            # Recipe mode is the exception: in Qwen3.5/3.6-MoE VLMs, lm_head sits
-            # on the outer CausalLM, not the inner language backbone. A recipe that targets
-            # lm_head must therefore quantize against the full model and explicitly keep visual
-            # and MTP siblings disabled.
-            if args.recipe is None:
+            # Plain PTQ quantizes only the extracted language model. Recipe and
+            # AutoQuantize paths keep the outer CausalLM so recipes/search can see
+            # Qwen3.5/3.6-MoE VLM lm_head.
+            if args.recipe is None and args.auto_quantize_bits is None:
                 extracted_lm, extracted_model_type = extract_and_prepare_language_model_from_vl(
                     full_model
                 )
@@ -1070,9 +1071,16 @@ def _is_layerwise(obj):
             "Auto quantization needs multiple quantization format."
         )
 
+        # For VL models, autoquant must walk submodules of the OUTER CausalLM
+        # (which carries lm_head and the LM-head forward path) — otherwise
+        # lm_head and any sibling-of-language_model modules are silently
+        # invisible to the search. ``forward_step`` also needs the outer model
+        # to produce ``CausalLMOutputWithPast`` (for ``.loss`` / ``.logits``).
+        # Visual tower and MTP siblings are auto-excluded inside
+        # ``auto_quantize()`` via *visual* / *mtp* / *vision_tower* patterns.
         auto_quantize(
             args,
-            language_model,
+            full_model,
             calib_dataloader,
             auto_quantize_method=args.auto_quantize_method,
             auto_quantize_score_size=args.auto_quantize_score_size,
@@ -1437,6 +1445,8 @@ def parse_args() -> argparse.Namespace:
     args = parser.parse_args()
     if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0):
         parser.error("--moe_calib_experts_ratio must be in the range (0.0, 1.0].")
+    if args.auto_quantize_bits is not None and args.calib_with_images:
+        parser.error("--calib_with_images is not supported with --auto_quantize_bits.")
     if args.auto_quantize_active_moe_expert_ratio is not None and not (
         0.0 < args.auto_quantize_active_moe_expert_ratio <= 1.0
     ):
 
@@ -66,7 +66,14 @@ def run(args):
 
     print("TensorRT-LLM example outputs:")
 
-    llm = LLM(args.checkpoint_dir, tokenizer=tokenizer, max_batch_size=len(input_texts))
+    # generate_context_logits() below requires KV cache reuse disabled: with prefix block reuse,
+    # shared-prefix inputs return truncated (silently incorrect) context logits.
+    llm = LLM(
+        args.checkpoint_dir,
+        tokenizer=tokenizer,
+        max_batch_size=len(input_texts),
+        enable_kv_cache_reuse=False,
+    )
     torch.cuda.cudart().cudaProfilerStart()
     outputs = llm.generate_text(input_texts, args.max_output_len)
     torch.cuda.cudart().cudaProfilerStop()
 
@@ -29,6 +29,10 @@ for i in $(env | grep ^SLURM_ | cut -d"=" -f 1); do unset -v $i; done
 for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v $i; done
 for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done
 
+# Fail on errors inside pipelines (e.g. `python eval.py | tee result.txt`), otherwise a crashing
+# eval is masked by tee's exit code and the script passes silently.
+set -o pipefail
+
 if [ -z "$MODEL_PATH" ]; then
     echo "Unsupported model argument: Expected a huggingface model path or model name" >&2
     exit 1
@@ -216,7 +220,11 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         RUN_ARGS+=" --trust_remote_code "
     fi
 
-    python run_tensorrt_llm.py --checkpoint_dir=$SAVE_PATH $RUN_ARGS
+    # Only run the deploy+generate smoke test when "quant" is explicitly requested. Eval tasks
+    # (lm_eval/mmlu/simple_eval) deploy the checkpoint themselves, so it is redundant there.
+    if [[ $TASKS =~ "quant" ]]; then
+        python run_tensorrt_llm.py --checkpoint_dir=$SAVE_PATH $RUN_ARGS
+    fi
 fi
 
 if [[ -d "${MODEL_PATH}" ]]; then
@@ -285,11 +293,16 @@ if [[ $TASKS =~ "mmlu" ]]; then
         tar -xf /tmp/mmlu.tar -C data && mv data/data $MMLU_DATA_PATH
     fi
 
+    mmlu_flags=""
+    if [ -n "$MMLU_LIMIT" ]; then
+        mmlu_flags+=" --limit $MMLU_LIMIT "
+    fi
+
     python mmlu.py \
         --model_name causal \
         --model_path $MODEL_ABS_PATH \
         --checkpoint_dir $SAVE_PATH \
-        --data_dir $MMLU_DATA_PATH | tee $MMLU_RESULT
+        --data_dir $MMLU_DATA_PATH $mmlu_flags | tee $MMLU_RESULT
     popd
 
 fi
@@ -304,16 +317,16 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
     trtllm-serve $SAVE_PATH --host 0.0.0.0 --port $PORT >$SAVE_PATH/serve.txt 2>&1 &
     SERVE_PID=$!
 
-    tail -f $SAVE_PATH/serve.txt | while read line; do
-        if echo "$line" | grep -q "Application startup complete"; then
-            echo "Application startup complete."
-            break
-        fi
+    # Poll the log instead of `tail -f | while ... break`: under `set -o pipefail` (set above),
+    # breaking out of that pipeline leaves tail to die by SIGPIPE, which would abort the script.
+    while ! grep -q "Application startup complete" $SAVE_PATH/serve.txt 2>/dev/null; do
         if ! kill -0 $SERVE_PID 2>/dev/null; then
             echo "trtllm-serve has exited."
             exit 1
         fi
+        sleep 2
     done
+    echo "Application startup complete."
 
     pushd ../llm_eval/
 
 
@@ -28,6 +28,7 @@ parse_options() {
     LM_EVAL_TASKS="mmlu,gsm8k"
     LM_EVAL_LIMIT=
     SIMPLE_EVAL_TASKS="mmlu"
+    MMLU_LIMIT=
 
     TASKS="quant"
 
@@ -38,7 +39,7 @@ parse_options() {
     CAST_MXFP4_TO_NVFP4=false
 
   # Parse command-line options
-  ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,simple_eval_limit:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
+  ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,simple_eval_limit:,mmlu_limit:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
 
   eval set -- "$ARGS"
   while true; do
@@ -61,6 +62,7 @@ parse_options() {
       --lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;;
       --simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;;
       --simple_eval_limit ) SIMPLE_EVAL_LIMIT="$2"; shift 2;;
+      --mmlu_limit ) MMLU_LIMIT="$2"; shift 2;;
       --trust_remote_code ) TRUST_REMOTE_CODE=true; shift;;
       --use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;;
       --gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;;
@@ -161,6 +163,7 @@ parse_options() {
   echo "lm_eval_limit: $LM_EVAL_LIMIT"
   echo "simple_eval_tasks: $SIMPLE_EVAL_TASKS"
   echo "simple_eval_limit: $SIMPLE_EVAL_LIMIT"
+  echo "mmlu_limit: $MMLU_LIMIT"
   echo "num_sample: $NUM_SAMPLES"
   echo "use_seq_device_map: $USE_SEQ_DEVICE_MAP"
   echo "gpu_max_mem_percentage: $GPU_MAX_MEM_PERCENTAGE"