@@ -827,11 +827,36 @@ def post_quantize(
827827 first_text_speech_dataset,
828828 calib_batch: dict | None = None,
829829):
830- """
831- Processing after the quantization.
830+ """Processing after the quantization.
831+
832+ Runs one round of generation using the quantized model for a sample prompt and
833+ compares it with the pre-quantize generation from ``pre_quantize()``.
832834
833- Currently we run one round of generation using the quantized model for a sample prompt,
834- and compare it with pre-quantize generation.
835+ Args:
836+ args: Parsed CLI arguments. Used for ``verbose``, ``quant_summary_path``,
837+ ``export_path``, ``pyt_ckpt_path``, and ``skip_generate`` flags.
838+ full_model: The quantized model to run post-quantization generation on.
839+ model_type: Model architecture identifier (e.g. ``"qwen3omni"``, ``"whisper"``,
840+ ``"llama4"``, ``"deepseek"``). Controls model-specific generation and
841+ decoding paths. ``None`` for generic models.
842+ tokenizer: HF tokenizer for decoding generated token ids. May be ``None`` when
843+ a ``processor`` is used instead (e.g. vision-language or speech models).
844+ processor: HF image/audio processor for multimodal models. Used for decoding
845+ outputs from vision-language (Mllama, Qwen3Omni) and speech (Whisper)
846+ models. ``None`` for text-only models.
847+ preview_input_ids: Input token ids (single sample) produced by ``pre_quantize()``
848+ for the preview generation comparison.
849+ generated_ids_before_ptq: Generation output from ``pre_quantize()`` to compare
850+ against post-quantization output. ``None`` if generation was skipped.
851+ is_nemotron_vl_model: Whether the model is a Nemotron VL model, which uses
852+ ``model.chat()`` and returns text strings instead of token tensors.
853+ first_text_speech_dataset: Text transcript of the first speech sample, used as
854+ the display input for Whisper models since their ``input_ids`` are
855+ mel-spectrogram features rather than decodable tokens.
856+ calib_batch: Full calibration batch dict from ``pre_quantize()``. Required for
857+ multimodal models (e.g. Qwen3Omni) whose ``generate()`` needs the complete
858+ input dict (audio features, attention masks, etc.) rather than just
859+ ``input_ids``. For text-only models this is unused and may be ``None``.
835860
836861 """
837862
0 commit comments