Skip to content

Commit 105c736

Browse files
Merge branch 'main' into gkarch/runtime_opt
2 parents d6e1c6b + 01415c2 commit 105c736

17 files changed

Lines changed: 645 additions & 66 deletions

File tree

examples/llm_eval/lm_eval_tensorrt_llm.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ def __init__(
6464
tokenizer=self.tokenizer,
6565
max_batch_size=int(batch_size),
6666
max_seq_len=max_length,
67+
# Loglikelihood tasks request context logits. KV cache prefix reuse would return
68+
# logits only for the recomputed suffix on shared-prefix requests (e.g. hellaswag),
69+
# truncating context_logits and breaking parse_logprobs. Disable it.
70+
enable_kv_cache_reuse=False,
6771
)
6872
self.max_length = max_length - 1
6973
logger.info("Loaded TRT-LLM")

examples/llm_eval/mmlu.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,10 @@ def gen_prompt(train_df, subject, k=-1):
183183
def evaluate(args, subject, model: EvalModel | LLM, dev_df, test_df):
184184
cors = []
185185
all_probs = []
186-
for i in range(test_df.shape[0]):
186+
num_examples = test_df.shape[0]
187+
if args.limit is not None:
188+
num_examples = min(num_examples, args.limit)
189+
for i in range(num_examples):
187190
# get prompt and make sure it fits
188191
k = args.ntrain
189192
prompt_end = format_example(test_df, i, include_answer=False)
@@ -201,6 +204,12 @@ def check_valid_length(model, prompt):
201204
train_prompt = gen_prompt(dev_df, subject, k)
202205
prompt = train_prompt + prompt_end
203206

207+
# Skip examples that do not fit even at zero-shot, otherwise the backend rejects
208+
# prompts longer than max_seq_len and aborts the whole evaluation.
209+
if not check_valid_length(model, prompt):
210+
print(f"Skipping {subject} example {i}: prompt exceeds max_seq_len even at 0-shot.")
211+
continue
212+
204213
label = test_df.iloc[i, test_df.shape[1] - 1]
205214
if isinstance(model, EvalModel):
206215
pred = model.run(prompt)
@@ -212,7 +221,11 @@ def check_valid_length(model, prompt):
212221
cors.append(cor)
213222
all_probs.append(probs)
214223

215-
acc = np.mean(cors)
224+
if not cors:
225+
# Every example was skipped (all prompts exceeded max_seq_len). Surface it instead of
226+
# silently producing a nan accuracy downstream.
227+
print(f"WARNING: all {subject} examples were skipped; reporting accuracy as nan.")
228+
acc = np.mean(cors) if cors else float("nan")
216229
cors = np.array(cors)
217230

218231
all_probs = np.array(all_probs)
@@ -233,8 +246,12 @@ def main(
233246
auto_quantize_score_size: int = 128,
234247
auto_quantize_checkpoint: str | None = None,
235248
sparse_cfg: str | None = None,
249+
limit: int | None = None,
236250
**kwargs,
237251
):
252+
if limit is not None and limit <= 0:
253+
raise ValueError(f"limit must be a positive integer when provided, got {limit}.")
254+
238255
random.seed(RAND_SEED)
239256
np.random.seed(RAND_SEED)
240257

examples/llm_eval/run_simple_eval.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,23 @@ if [ ! -d "human-eval" ]; then
2828
git clone https://github.com/openai/human-eval.git
2929
fi
3030

31+
# Pin to a known commit for reproducibility (and so the entry-point patch below matches), forcing
32+
# it every run so a reused checkout cannot drift to an arbitrary revision. -f discards the patch
33+
# applied to setup.py on a previous run before re-applying it below.
34+
git -C human-eval checkout -q -f 6d43fb980f9fee3c892a914eda09951f772ad10d
35+
36+
# human-eval's console_scripts entry point lacks the ":callable" suffix, which newer pip/setuptools
37+
# reject ("A callable suffix is required"). The target module defines main(), so point at it.
38+
sed -i 's|human_eval\.evaluate_functional_correctness"|human_eval.evaluate_functional_correctness:main"|' human-eval/setup.py
39+
3140
if [ ! -d "simple-evals" ]; then
3241
git clone https://github.com/openai/simple-evals.git
3342
fi
3443

35-
pip install -e human-eval
44+
# --no-build-isolation: human-eval's legacy setup.py imports pkg_resources at build time,
45+
# which pip's isolated build env does not provide with newer setuptools. Build against the
46+
# base environment (which has setuptools/pkg_resources) instead.
47+
pip install -e human-eval --no-build-isolation
3648
pip install openai
3749

3850
pushd simple-evals

examples/llm_ptq/example_utils.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
ProcessorMixin,
4343
)
4444

45+
from modelopt.torch.export.model_utils import is_multimodal_model
46+
from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg
47+
4548
try:
4649
from huggingface_hub import snapshot_download
4750
except ImportError:
@@ -51,6 +54,58 @@
5154

5255
SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
5356

57+
# TODO: Refactor into the config system.
58+
_QWEN36_AUTOQ_DISABLED_LAYERS = (
59+
"*shared_expert_gate*",
60+
"*linear_attn.in_proj_a*",
61+
"*linear_attn.in_proj_b*",
62+
)
63+
_VLM_AUTOQ_DISABLED_LAYERS = ("*visual*", "*mtp*", "*vision_tower*")
64+
65+
66+
def _is_qwen_model(model) -> bool:
67+
"""Return True when model/config identifiers indicate a Qwen-family model."""
68+
candidates = [type(model).__name__]
69+
config = getattr(model, "config", None)
70+
configs = [
71+
config,
72+
getattr(config, "text_config", None),
73+
getattr(config, "language_config", None),
74+
]
75+
for cfg in configs:
76+
if cfg is None:
77+
continue
78+
candidates.append(type(cfg).__name__)
79+
model_type = getattr(cfg, "model_type", None)
80+
if model_type is not None:
81+
candidates.append(str(model_type))
82+
architectures = getattr(cfg, "architectures", ()) or ()
83+
if isinstance(architectures, str):
84+
architectures = (architectures,)
85+
candidates.extend(str(architecture) for architecture in architectures)
86+
return any("qwen" in candidate.lower() for candidate in candidates)
87+
88+
89+
def _get_auto_quantize_disabled_layers(model) -> list[str]:
    """Return layer patterns that should be excluded from AutoQuantize search.

    The list starts from the ``quantizer_name`` of every entry in
    ``_default_disabled_quantizer_cfg`` that has no ``parent_class`` key, with
    the ``"*lm_head*"`` pattern deliberately left out, so lm_head is not placed
    on the disabled list. Model-specific patterns are then appended, skipping
    any that are already present:

    * ``_QWEN36_AUTOQ_DISABLED_LAYERS`` when ``_is_qwen_model(model)`` is true;
    * ``_VLM_AUTOQ_DISABLED_LAYERS`` when ``is_multimodal_model(model)`` is true.

    Args:
        model: The model that will be passed to AutoQuantize.

    Returns:
        A new list of wildcard name patterns.
    """
    disabled_layers = [
        entry["quantizer_name"]
        for entry in _default_disabled_quantizer_cfg
        if "parent_class" not in entry and entry["quantizer_name"] != "*lm_head*"
    ]
    if _is_qwen_model(model):
        disabled_layers.extend(p for p in _QWEN36_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
    if is_multimodal_model(model):
        disabled_layers.extend(p for p in _VLM_AUTOQ_DISABLED_LAYERS if p not in disabled_layers)
    return disabled_layers
101+
102+
103+
def _get_auto_quantize_cost_excluded_patterns(model) -> list[str]:
    """Return layer patterns excluded only from AutoQuantize cost accounting.

    Args:
        model: The model that will be passed to AutoQuantize.

    Returns:
        A fresh list copy of ``_VLM_AUTOQ_DISABLED_LAYERS`` when
        ``is_multimodal_model(model)`` is true, otherwise an empty list.
    """
    if is_multimodal_model(model):
        # Copy so callers can mutate the result without touching the constant.
        return list(_VLM_AUTOQ_DISABLED_LAYERS)
    return []
108+
54109

55110
def run_nemotron_vl_preview(
56111
full_model,
@@ -133,7 +188,6 @@ def is_nemotron_vl(model_or_config):
133188
# Try to get config from model, or use directly if it's a config
134189
if hasattr(model_or_config, "config"):
135190
config = model_or_config.config
136-
from modelopt.torch.export.model_utils import is_multimodal_model
137191

138192
if not is_multimodal_model(model_or_config):
139193
return False

examples/llm_ptq/hf_ptq.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
from cast_mxfp4_to_nvfp4 import apply_to_model as apply_cast_mxfp4_to_nvfp4
2828
from cast_mxfp4_to_nvfp4 import force_weight_quantizers_static
2929
from example_utils import (
30+
_get_auto_quantize_cost_excluded_patterns,
31+
_get_auto_quantize_disabled_layers,
3032
build_quant_cfg,
3133
copy_custom_model_files,
3234
create_vlm_calibration_loop,
@@ -72,7 +74,8 @@
7274
save_expert_token_count_table,
7375
)
7476
from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
75-
from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
77+
from modelopt.torch.quantization._auto_quantize_cost import EXCLUDED_MODULE_NAME_PATTERNS_KEY
78+
from modelopt.torch.quantization.config import need_calibration
7679
from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
7780
from modelopt.torch.quantization.utils import is_quantized
7881
from modelopt.torch.speculative.eagle.utils import (
@@ -132,6 +135,7 @@ def _kv_cfg_uses_constant_amax(kv_quant_cfg: list[dict[str, Any]]) -> bool:
132135
"nvfp4_awq_lite",
133136
"nvfp4_w4a4_weight_mse_fp8_sweep",
134137
"w4a8_awq_beta",
138+
"w4a16_nvfp4",
135139
"fp8_2d_blockwise_weight_only",
136140
"w4a8_mxfp4_fp8",
137141
"nvfp4_mlp_only",
@@ -387,10 +391,14 @@ def forward_step(model, batch):
387391
"effective_bits": args.auto_quantize_bits,
388392
"cost_model": args.auto_quantize_cost_model,
389393
}
394+
auto_quantize_cost = {}
390395
if args.auto_quantize_active_moe_expert_ratio is not None:
391-
auto_quantize_constraints["cost"] = {
392-
"active_moe_expert_ratio": args.auto_quantize_active_moe_expert_ratio
393-
}
396+
auto_quantize_cost["active_moe_expert_ratio"] = args.auto_quantize_active_moe_expert_ratio
397+
cost_excluded_patterns = _get_auto_quantize_cost_excluded_patterns(language_model)
398+
if cost_excluded_patterns:
399+
auto_quantize_cost[EXCLUDED_MODULE_NAME_PATTERNS_KEY] = cost_excluded_patterns
400+
if auto_quantize_cost:
401+
auto_quantize_constraints["cost"] = auto_quantize_cost
394402

395403
language_model, _ = mtq.auto_quantize(
396404
language_model,
@@ -406,12 +414,7 @@ def forward_step(model, batch):
406414
len(calib_dataloader), max(auto_quantize_score_size // args.batch_size, 1)
407415
),
408416
verbose=True,
409-
# Disable all default disabled layers such as lm_head, mlp.gate, router etc.
410-
disabled_layers=[
411-
entry["quantizer_name"]
412-
for entry in _default_disabled_quantizer_cfg
413-
if "parent_class" not in entry
414-
],
417+
disabled_layers=_get_auto_quantize_disabled_layers(language_model),
415418
method=auto_quantize_method,
416419
checkpoint=auto_quantize_checkpoint,
417420
)
@@ -487,7 +490,7 @@ def load_model(args: argparse.Namespace):
487490
is_nemotron_vl_model = is_nemotron_vl(full_model)
488491

489492
# Default to image-text calibration for VLM models
490-
if is_nemotron_vl_model and not args.calib_with_images:
493+
if is_nemotron_vl_model and not args.calib_with_images and args.auto_quantize_bits is None:
491494
print("Nemotron VL model detected. Enabling image-text calibration by default.")
492495
args.calib_with_images = True
493496

@@ -539,12 +542,10 @@ def load_model(args: argparse.Namespace):
539542
: len(args.dataset)
540543
]
541544

542-
# We only quantize the language model for VLMs other than the type supported above.
543-
# Recipe mode is the exception: in Qwen3.5/3.6-MoE VLMs, lm_head sits
544-
# on the outer CausalLM, not the inner language backbone. A recipe that targets
545-
# lm_head must therefore quantize against the full model and explicitly keep visual
546-
# and MTP siblings disabled.
547-
if args.recipe is None:
545+
# Plain PTQ quantizes only the extracted language model. Recipe and
546+
# AutoQuantize paths keep the outer CausalLM so recipes/search can see
547+
# Qwen3.5/3.6-MoE VLM lm_head.
548+
if args.recipe is None and args.auto_quantize_bits is None:
548549
extracted_lm, extracted_model_type = extract_and_prepare_language_model_from_vl(
549550
full_model
550551
)
@@ -1070,9 +1071,16 @@ def _is_layerwise(obj):
10701071
"Auto quantization needs multiple quantization format."
10711072
)
10721073

1074+
# For VL models, autoquant must walk submodules of the OUTER CausalLM
1075+
# (which carries lm_head and the LM-head forward path) — otherwise
1076+
# lm_head and any sibling-of-language_model modules are silently
1077+
# invisible to the search. ``forward_step`` also needs the outer model
1078+
# to produce ``CausalLMOutputWithPast`` (for ``.loss`` / ``.logits``).
1079+
# Visual tower and MTP siblings are auto-excluded inside
1080+
# ``auto_quantize()`` via *visual* / *mtp* / *vision_tower* patterns.
10731081
auto_quantize(
10741082
args,
1075-
language_model,
1083+
full_model,
10761084
calib_dataloader,
10771085
auto_quantize_method=args.auto_quantize_method,
10781086
auto_quantize_score_size=args.auto_quantize_score_size,
@@ -1437,6 +1445,8 @@ def parse_args() -> argparse.Namespace:
14371445
args = parser.parse_args()
14381446
if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0):
14391447
parser.error("--moe_calib_experts_ratio must be in the range (0.0, 1.0].")
1448+
if args.auto_quantize_bits is not None and args.calib_with_images:
1449+
parser.error("--calib_with_images is not supported with --auto_quantize_bits.")
14401450
if args.auto_quantize_active_moe_expert_ratio is not None and not (
14411451
0.0 < args.auto_quantize_active_moe_expert_ratio <= 1.0
14421452
):

examples/llm_ptq/run_tensorrt_llm.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,14 @@ def run(args):
6666

6767
print("TensorRT-LLM example outputs:")
6868

69-
llm = LLM(args.checkpoint_dir, tokenizer=tokenizer, max_batch_size=len(input_texts))
69+
# generate_context_logits() below requires KV cache reuse disabled: with prefix block reuse,
70+
# shared-prefix inputs return truncated (silently incorrect) context logits.
71+
llm = LLM(
72+
args.checkpoint_dir,
73+
tokenizer=tokenizer,
74+
max_batch_size=len(input_texts),
75+
enable_kv_cache_reuse=False,
76+
)
7077
torch.cuda.cudart().cudaProfilerStart()
7178
outputs = llm.generate_text(input_texts, args.max_output_len)
7279
torch.cuda.cudart().cudaProfilerStop()

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ for i in $(env | grep ^SLURM_ | cut -d"=" -f 1); do unset -v $i; done
2929
for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v $i; done
3030
for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done
3131

32+
# Fail on errors inside pipelines (e.g. `python eval.py | tee result.txt`), otherwise a crashing
33+
# eval is masked by tee's exit code and the script passes silently.
34+
set -o pipefail
35+
3236
if [ -z "$MODEL_PATH" ]; then
3337
echo "Unsupported model argument: Expected a huggingface model path or model name" >&2
3438
exit 1
@@ -216,7 +220,11 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
216220
RUN_ARGS+=" --trust_remote_code "
217221
fi
218222

219-
python run_tensorrt_llm.py --checkpoint_dir=$SAVE_PATH $RUN_ARGS
223+
# Only run the deploy+generate smoke test when "quant" is explicitly requested. Eval tasks
224+
# (lm_eval/mmlu/simple_eval) deploy the checkpoint themselves, so it is redundant there.
225+
if [[ $TASKS =~ "quant" ]]; then
226+
python run_tensorrt_llm.py --checkpoint_dir=$SAVE_PATH $RUN_ARGS
227+
fi
220228
fi
221229

222230
if [[ -d "${MODEL_PATH}" ]]; then
@@ -285,11 +293,16 @@ if [[ $TASKS =~ "mmlu" ]]; then
285293
tar -xf /tmp/mmlu.tar -C data && mv data/data $MMLU_DATA_PATH
286294
fi
287295

296+
mmlu_flags=""
297+
if [ -n "$MMLU_LIMIT" ]; then
298+
mmlu_flags+=" --limit $MMLU_LIMIT "
299+
fi
300+
288301
python mmlu.py \
289302
--model_name causal \
290303
--model_path $MODEL_ABS_PATH \
291304
--checkpoint_dir $SAVE_PATH \
292-
--data_dir $MMLU_DATA_PATH | tee $MMLU_RESULT
305+
--data_dir $MMLU_DATA_PATH $mmlu_flags | tee $MMLU_RESULT
293306
popd
294307

295308
fi
@@ -304,16 +317,16 @@ if [[ $TASKS =~ "livecodebench" || $TASKS =~ "simple_eval" ]]; then
304317
trtllm-serve $SAVE_PATH --host 0.0.0.0 --port $PORT >$SAVE_PATH/serve.txt 2>&1 &
305318
SERVE_PID=$!
306319

307-
tail -f $SAVE_PATH/serve.txt | while read line; do
308-
if echo "$line" | grep -q "Application startup complete"; then
309-
echo "Application startup complete."
310-
break
311-
fi
320+
# Poll the log instead of `tail -f | while ... break`: under `set -o pipefail` (set above),
321+
# breaking out of that pipeline leaves tail to die by SIGPIPE, which would abort the script.
322+
while ! grep -q "Application startup complete" $SAVE_PATH/serve.txt 2>/dev/null; do
312323
if ! kill -0 $SERVE_PID 2>/dev/null; then
313324
echo "trtllm-serve has exited."
314325
exit 1
315326
fi
327+
sleep 2
316328
done
329+
echo "Application startup complete."
317330

318331
pushd ../llm_eval/
319332

examples/llm_ptq/scripts/parser.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ parse_options() {
2828
LM_EVAL_TASKS="mmlu,gsm8k"
2929
LM_EVAL_LIMIT=
3030
SIMPLE_EVAL_TASKS="mmlu"
31+
MMLU_LIMIT=
3132

3233
TASKS="quant"
3334

@@ -38,7 +39,7 @@ parse_options() {
3839
CAST_MXFP4_TO_NVFP4=false
3940

4041
# Parse command-line options
41-
ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,simple_eval_limit:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
42+
ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,simple_eval_limit:,mmlu_limit:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
4243

4344
eval set -- "$ARGS"
4445
while true; do
@@ -61,6 +62,7 @@ parse_options() {
6162
--lm_eval_limit ) LM_EVAL_LIMIT="$2"; shift 2;;
6263
--simple_eval_tasks ) SIMPLE_EVAL_TASKS="$2"; shift 2;;
6364
--simple_eval_limit ) SIMPLE_EVAL_LIMIT="$2"; shift 2;;
65+
--mmlu_limit ) MMLU_LIMIT="$2"; shift 2;;
6466
--trust_remote_code ) TRUST_REMOTE_CODE=true; shift;;
6567
--use_seq_device_map ) USE_SEQ_DEVICE_MAP=true; shift;;
6668
--gpu_max_mem_percentage ) GPU_MAX_MEM_PERCENTAGE="$2"; shift 2;;
@@ -161,6 +163,7 @@ parse_options() {
161163
echo "lm_eval_limit: $LM_EVAL_LIMIT"
162164
echo "simple_eval_tasks: $SIMPLE_EVAL_TASKS"
163165
echo "simple_eval_limit: $SIMPLE_EVAL_LIMIT"
166+
echo "mmlu_limit: $MMLU_LIMIT"
164167
echo "num_sample: $NUM_SAMPLES"
165168
echo "use_seq_device_map: $USE_SEQ_DEVICE_MAP"
166169
echo "gpu_max_mem_percentage: $GPU_MAX_MEM_PERCENTAGE"

0 commit comments

Comments
 (0)