vLLM fakequant: add recipe-based quantization support (#1233)

kinjalpatel27 · web-flow · commit 73be81037d84 · 2026-04-14T13:37:02.000-07:00
### What does this PR do?

Type of change: example update

This PR adds recipe-based quantization support to the vLLM fakequant example.

### Testing

```
docker run --gpus all -it --shm-size=160GB --network host --rm --entrypoint bash -v <modelopt>:/home/modelopt vllm/vllm-openai:v0.15.0 -c "cd /home/modelopt && pip install . && pip install datasets && RECIPE_PATH=/home/modelopt/modelopt_recipes/general/ptq/nvfp4_mlp_only-fp8_kv.yml python3 /home/modelopt/examples/vllm_serve/vllm_serve_fakequant.py Qwen/Qwen3-0.6B -tp 1 --served-model-name Qwen3-0.6B --host 0.0.0.0 --port 8001 --trust-remote-code --disable-custom-all-reduce --gpu-memory-utilization 0.8"
```

### Before your PR is "*Ready for review*"

Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`).

Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.).

- Is this change backward compatible?: ✅
- If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: N/A
- Did you write any new necessary tests?: N/A
- Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: N/A

### Additional Information

## Summary by CodeRabbit

* **New Features**
  * Added `RECIPE_PATH` environment variable support enabling users to specify ModelOpt PTQ recipe YAML files for quantization configuration in vLLM serving.
* **Documentation**
  * Updated examples and documentation to support recipe-driven quantization configuration, aligning export workflow with recipe-based setup.

---------

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
diff --git a/examples/vllm_serve/README.md b/examples/vllm_serve/README.md
@@ -28,6 +28,7 @@ You can either edit the `quant_config` dictionary in `vllm_serve_fakequant.py`,
 | QUANT_FILE_PATH | Optional path to exported quantizer state dict `quantizer_state.pth` | None |
 | MODELOPT_STATE_PATH | Optional path to exported `vllm_fq_modelopt_state.pth` (restores quantizer state and parameters) | None |
 | CALIB_BATCH_SIZE | Calibration batch size                           | 1                  |
+| RECIPE_PATH      | Optional path to a ModelOpt PTQ recipe YAML file; when set, the recipe's quantization config is used instead of QUANT_CFG / KV_QUANT_CFG | None |
 
 Set these variables in your shell or Docker environment as needed to customize calibration.
 
@@ -65,7 +66,7 @@ Step 1: export the model with bf16 weights and quantizer state. To export the mo
 ```bash
 python ../llm_ptq/hf_ptq.py \
   --pyt_ckpt_path <MODEL_PATH> \
-  --qformat nvfp4 \
+  --recipe <PATH_TO_RECIPE> \
   --calib_size 512 \
   --export_path <EXPORT_DIR> \
   --vllm_fakequant_export \
diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py
@@ -43,6 +43,7 @@
     "quant_file_path": os.environ.get("QUANT_FILE_PATH", None),
     "modelopt_state_path": os.environ.get("MODELOPT_STATE_PATH", None),
     "calib_batch_size": int(os.environ.get("CALIB_BATCH_SIZE", 1)),
+    "recipe_path": os.environ.get("RECIPE_PATH", None),
 }
 
 
@@ -138,6 +139,7 @@ def compile_or_warm_up_model(self) -> None:
             quant_config["quant_cfg"]
             or quant_config["kv_quant_cfg"]
             or quant_config["modelopt_state_path"]
+            or quant_config["recipe_path"]
         ):
             _fakequant_run_prolog_worker(self)
         super().compile_or_warm_up_model()
diff --git a/examples/vllm_serve/vllm_ptq_utils.py b/examples/vllm_serve/vllm_ptq_utils.py
@@ -24,6 +24,7 @@
 from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
 
 import modelopt.torch.quantization as mtq
+from modelopt.recipe import ModelOptPTQRecipe, load_recipe
 
 
 def _create_new_data_cls(data_cls, **kwargs):
@@ -141,22 +142,31 @@ def update_kv_cfg_for_mla(model: torch.nn.Module, kv_quant_cfg: list) -> list:
 def get_quant_config(quant_config: dict[str, Any], model: Any) -> dict[str, Any]:
     import copy
 
-    quant_cfg = (
-        copy.deepcopy(getattr(mtq, quant_config["quant_cfg"])) if quant_config["quant_cfg"] else {}
-    )
-    quant_kv_cfg = (
-        copy.deepcopy(getattr(mtq, quant_config["kv_quant_cfg"]))
-        if quant_config["kv_quant_cfg"]
-        else {}
-    )
+    if quant_config["recipe_path"]:
+        recipe = load_recipe(quant_config["recipe_path"])
+        assert isinstance(recipe, ModelOptPTQRecipe), (
+            f"Expected PTQ recipe, but got {type(recipe).__name__} from {quant_config['recipe_path']}"
+        )
+        quant_cfg = recipe.quantize
+    else:
+        quant_cfg = (
+            copy.deepcopy(getattr(mtq, quant_config["quant_cfg"]))
+            if quant_config["quant_cfg"]
+            else {}
+        )
+        quant_kv_cfg = (
+            copy.deepcopy(getattr(mtq, quant_config["kv_quant_cfg"]))
+            if quant_config["kv_quant_cfg"]
+            else {}
+        )
 
-    # Check if model has MLA and update KV config accordingly
-    if quant_kv_cfg:
-        quant_kv_cfg["quant_cfg"] = update_kv_cfg_for_mla(model, quant_kv_cfg["quant_cfg"])
+        # Check if model has MLA and update KV config accordingly
+        if quant_kv_cfg:
+            quant_kv_cfg["quant_cfg"] = update_kv_cfg_for_mla(model, quant_kv_cfg["quant_cfg"])
 
-    if quant_kv_cfg:
-        quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
-            quant_cfg, quant_kv_cfg["quant_cfg"]
-        )
+        if quant_kv_cfg:
+            quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
+                quant_cfg, quant_kv_cfg["quant_cfg"]
+            )
 
     return quant_cfg
diff --git a/examples/vllm_serve/vllm_serve_fakequant.py b/examples/vllm_serve/vllm_serve_fakequant.py
@@ -78,6 +78,7 @@
     "KV_QUANT_CFG",
     "MODELOPT_STATE_PATH",
     "CALIB_BATCH_SIZE",
+    "RECIPE_PATH",
     "TRUST_REMOTE_CODE",
 }
 

Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,7 @@`
`78`	`78`	`"KV_QUANT_CFG",`
`79`	`79`	`"MODELOPT_STATE_PATH",`
`80`	`80`	`"CALIB_BATCH_SIZE",`
	`81`	`+ "RECIPE_PATH",`
`81`	`82`	`"TRUST_REMOTE_CODE",`
`82`	`83`	`}`
`83`	`84`