minor

kinjalpatel27 · kinjalpatel27 · commit 687ceeacaa05 · 2026-04-10T23:57:33.000Z
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
diff --git a/examples/vllm_serve/README.md b/examples/vllm_serve/README.md
@@ -4,7 +4,7 @@ This is a simple example to demonstrate calibrating and serving ModelOpt fakequa
 
 Compared with realquant, fakequant is 2-5x slower, but doesn't require dedicated kernel support and facilitates research.
 
-This example is tested with vllm 0.9.0 and 0.11.2
+This example is tested with vllm 0.9.0 and 0.19.1
 
 ## Prepare environment
 
diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py
@@ -133,11 +133,14 @@ def determine_available_memory(self) -> int:
         with disable_compilation(model):
             return super().determine_available_memory()
 
-    def compile_or_warm_up_model(self) -> None:
+    def compile_or_warm_up_model(self) -> float:
         if (
             quant_config["quant_cfg"]
             or quant_config["kv_quant_cfg"]
             or quant_config["modelopt_state_path"]
         ):
             _fakequant_run_prolog_worker(self)
-        super().compile_or_warm_up_model()
+        # Must return the base worker's compilation time (seconds). Returning None
+        # breaks vLLM V1 executor: initialize_from_config does max(compilation_times)
+        # across TP workers.
+        return super().compile_or_warm_up_model()
diff --git a/examples/vllm_serve/vllm_serve_fakequant.py b/examples/vllm_serve/vllm_serve_fakequant.py
@@ -66,7 +66,6 @@
     from vllm.utils import FlexibleArgumentParser
 else:
     from vllm.utils.argparse_utils import FlexibleArgumentParser
-    from vllm.v1.executor.ray_executor import RayDistributedExecutor
 
 
 # Adding the envs you want to pass to the workers
@@ -81,7 +80,14 @@
     "TRUST_REMOTE_CODE",
 }
 
-RayDistributedExecutor.ADDITIONAL_ENV_VARS.update(additional_env_vars)
+if vllm_version <= version.parse("0.11.0"):
+    RayDistributedExecutor.ADDITIONAL_ENV_VARS.update(additional_env_vars)
+else:
+    from vllm.platforms import current_platform
+
+    for _name in additional_env_vars:
+        if _name not in current_platform.additional_env_vars:
+            current_platform.additional_env_vars.append(_name)
 
 
 def main():