[Data][LLM] Fix max_pending_requests default to track vLLM's GPU-dependent max_num_seqs (ray-project#62918)

Aydin-ab · claude · TruongQuangPhat · commit 89f64d0d7338 · 2026-05-27T15:27:43.000+07:00
Signed-off-by: Aydin Abiar &lt;aydin@anyscale.com&gt;
Co-authored-by: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
Signed-off-by: phattruong &lt;23120318@student.hcmus.edu.vn&gt;
diff --git a/python/ray/llm/_internal/batch/stages/vllm_engine_stage.py b/python/ray/llm/_internal/batch/stages/vllm_engine_stage.py
@@ -209,6 +209,11 @@ class vLLMEngineWrapper:
     Args:
         *args: The positional arguments for the engine.
         max_pending_requests: The maximum number of pending requests in the queue.
+            If None, it will be auto-resolved to
+            ``ceil(1.1 * max_num_seqs * pipeline_parallel_size)`` using values
+            from vLLM's resolved engine config (so the default tracks vLLM's
+            GPU-dependent ``max_num_seqs``). Pass a non-positive value (e.g.
+            ``-1``) to disable the semaphore entirely.
         dynamic_lora_loading_path: The S3 path to the dynamic LoRA adapter.
         log_engine_metrics: Whether to export vLLM metrics to Ray's Prometheus endpoint.
         **kwargs: The keyword arguments for the engine.
@@ -217,7 +222,7 @@ class vLLMEngineWrapper:
     def __init__(
         self,
         idx_in_batch_column: str,
-        max_pending_requests: int = -1,
+        max_pending_requests: Optional[int] = None,
         dynamic_lora_loading_path: Optional[str] = None,
         log_engine_metrics: bool = True,
         **kwargs,
@@ -294,8 +299,29 @@ def __init__(
 
         # The performance gets really bad if there are too many requests in the pending queue.
         # We work around it with semaphore to limit the number of concurrent requests in the engine.
+        # When the caller did not specify a limit, derive it from the resolved
+        # vLLM config rather than from raw engine_kwargs. vLLM's default
+        # `max_num_seqs` is GPU-dependent (e.g. 256 on A10G/A100, 1024 on H100),
+        # so reading from `scheduler_config` avoids silently capping the
+        # semaphore below vLLM's actual capacity.
+        scheduler_config = self._vllm_config.scheduler_config
+        parallel_config = self._vllm_config.parallel_config
+        engine_capacity = (
+            scheduler_config.max_num_seqs * parallel_config.pipeline_parallel_size
+        )
+        if max_pending_requests is None:
+            max_pending_requests = math.ceil(engine_capacity * 1.1)
+        elif 0 < max_pending_requests < engine_capacity:
+            logger.warning(
+                "max_pending_requests (%d) < max_num_seqs * pipeline_parallel_size "
+                "(%d); may underutilize vLLM. Consider >=%d, or <=0 to disable.",
+                max_pending_requests,
+                engine_capacity,
+                math.ceil(engine_capacity * 1.1),
+            )
         self.max_pending_requests = max_pending_requests
         if self.max_pending_requests > 0:
+            logger.info("Max pending requests is set to %d", self.max_pending_requests)
             self.semaphore = asyncio.Semaphore(self.max_pending_requests)
         else:
             self.semaphore = asyncio.NullContext()
@@ -612,7 +638,10 @@ def __init__(
             engine_kwargs: The kwargs to pass to the vLLM engine.
             task_type: The task to use for the vLLM engine (e.g., "generate", "embed", etc).
             max_pending_requests: The maximum number of pending requests. If None,
-                it will be set to 1.1 * max_num_seqs * pipeline_parallel_size.
+                it will be set to ``ceil(1.1 * max_num_seqs * pipeline_parallel_size)``,
+                where ``max_num_seqs`` and ``pipeline_parallel_size`` are read from
+                vLLM's resolved engine config (so the default tracks vLLM's
+                GPU-dependent ``max_num_seqs``, not a hardcoded value).
             dynamic_lora_loading_path: The path to the dynamic LoRA adapter. It is expected
                 to hold subfolders each for a different lora checkpoint.
             should_continue_on_error: If True, continue processing when inference fails for
@@ -629,14 +658,6 @@ def __init__(
         self.task_type = task_type
         self.engine_kwargs = self.normalize_engine_kwargs(engine_kwargs)
 
-        # Set up the max pending requests.
-        pp_size = self.engine_kwargs.get("pipeline_parallel_size", 1)
-        self.max_pending_requests = max_pending_requests or math.ceil(
-            self.engine_kwargs.get("max_num_seqs", 128) * pp_size * 1.1
-        )
-        if self.max_pending_requests > 0:
-            logger.info("Max pending requests is set to %d", self.max_pending_requests)
-
         exclude_safetensors = (
             self.engine_kwargs.get("load_format") in STREAMING_LOAD_FORMATS
         )
@@ -662,11 +683,14 @@ def __init__(
             model_source=source,
             idx_in_batch_column=self.IDX_IN_BATCH_COLUMN,
             enable_log_requests=False,
-            max_pending_requests=self.max_pending_requests,
+            max_pending_requests=max_pending_requests,
             dynamic_lora_loading_path=dynamic_lora_loading_path,
             log_engine_metrics=log_engine_metrics,
             **self.engine_kwargs,
         )
+        # The wrapper resolves a None into a concrete value using vLLM's
+        # resolved engine config; surface that back on the UDF.
+        self.max_pending_requests = self.llm.max_pending_requests
 
         max_num_seqs = self.llm.get_scheduler_config().max_num_seqs
         if batch_size * max_concurrent_batches < max_num_seqs:
diff --git a/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py b/python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py
@@ -124,6 +124,12 @@ def test_vllm_engine_stage_post_init(gpu_type, model_llama_3_2_216M):
 
 @pytest.mark.asyncio
 async def test_vllm_engine_udf_basic(mock_vllm_wrapper, model_llama_3_2_216M):
+    # Simulate vLLM's resolved state when the user sets `max_num_seqs=100`:
+    # the wrapper owns the resolution of `max_pending_requests`, and the UDF
+    # reads the resolved value back.
+    expected_max_pending_requests = math.ceil(100 * 1.1)
+    mock_vllm_wrapper.return_value.max_pending_requests = expected_max_pending_requests
+
     # Create UDF instance - it will use the mocked wrapper
     udf = vLLMEngineStageUDF(
         data_column="__data",
@@ -147,7 +153,7 @@ async def test_vllm_engine_udf_basic(mock_vllm_wrapper, model_llama_3_2_216M):
     assert udf.task_type == vLLMTaskType.GENERATE
     assert udf.engine_kwargs["task_type"] == vLLMTaskType.EMBED
     assert udf.engine_kwargs["max_num_seqs"] == 100
-    assert udf.max_pending_requests == math.ceil(100 * 1.1)
+    assert udf.max_pending_requests == expected_max_pending_requests
 
     # Test batch processing
     batch = {
@@ -169,13 +175,16 @@ async def test_vllm_engine_udf_basic(mock_vllm_wrapper, model_llama_3_2_216M):
     assert responses[1]["prompt"] in ["Hello", "World"]
     assert responses[0]["prompt"] != responses[1]["prompt"]
 
-    # Verify the wrapper was constructed with correct arguments
+    # Verify the wrapper was constructed with correct arguments. The UDF
+    # passes `max_pending_requests=None` straight through when the caller
+    # doesn't supply it; the wrapper resolves the default from vLLM's
+    # resolved engine config.
     mock_vllm_wrapper.assert_called_once_with(
         model=model_llama_3_2_216M,
         model_source=model_llama_3_2_216M,
         idx_in_batch_column="__idx_in_batch",
         disable_log_stats=False,
-        max_pending_requests=111,
+        max_pending_requests=None,
         task_type=vLLMTaskType.EMBED,
         max_num_seqs=100,
         dynamic_lora_loading_path=None,