diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 1b00a45044..6b29be4eb0 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -690,7 +690,9 @@ def pre_quantize(
     ][0:1]

     # Generate preview before quantization
-    if model_type == "deepseek":
+    if args.skip_generate:
+        generated_ids_before_ptq = None
+    elif model_type == "deepseek":
         # DeepSeek generation may go OOM, so we skip it
         generated_ids_before_ptq = None
     elif is_nemotron_vl_model and tokenizer is not None:
@@ -703,7 +705,6 @@ def pre_quantize(
             allow_fallback=False,
         )
     else:
-        # Standard generation for non-Nemotron VL models
         generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
     if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
         print("Applying nvfp4 quantization (MoE only) for gpt-oss")
@@ -1084,6 +1085,16 @@ def parse_args() -> argparse.Namespace:
         default=True,
         action=argparse.BooleanOptionalAction,
     )
+    parser.add_argument(
+        "--skip_generate",
+        help=(
+            "Skip pre/post-quantization preview calls that invoke model.generate(). "
+            "Note: this does not skip calibration or batch-size probing. "
+            "For very large models, pair with --batch_size 1 to avoid max-batch probing."
+        ),
+        default=False,
+        action="store_true",
+    )
     parser.add_argument(
         "--low_memory_mode",
         help=(
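For reference, a hypothetical invocation exercising the new flag. Only --skip_generate, --batch_size, and --qformat are attested by this diff; the checkpoint flag name (--pyt_ckpt_path) and all values are placeholders, not taken from the source:

    # Illustrative sketch only: skips the pre/post-quantization generate()
    # previews while still running calibration. Per the flag's help text,
    # --batch_size 1 avoids max-batch probing on very large models.
    # --pyt_ckpt_path and the <...> values are assumed, not from this diff.
    python examples/llm_ptq/hf_ptq.py \
        --pyt_ckpt_path <path/to/hf_checkpoint> \
        --qformat <quantization_format> \
        --batch_size 1 \
        --skip_generate

Since the flag defaults to False via action="store_true", existing invocations keep the preview generation unchanged; the gate is checked before the model-type-specific branches, so it also short-circuits the DeepSeek and Nemotron VL paths.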