Skip to content

Commit 89f64d0

Browse files
Aydin-ab and claude
authored and committed
[Data][LLM] Fix max_pending_requests default to track vLLM's GPU-dependent max_num_seqs (ray-project#62918)
Signed-off-by: Aydin Abiar <aydin@anyscale.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: phattruong <23120318@student.hcmus.edu.vn>
1 parent 6882ea2 commit 89f64d0

2 files changed

Lines changed: 47 additions & 14 deletions

File tree

python/ray/llm/_internal/batch/stages/vllm_engine_stage.py

Lines changed: 35 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -209,6 +209,11 @@ class vLLMEngineWrapper:
209209
Args:
210210
*args: The positional arguments for the engine.
211211
max_pending_requests: The maximum number of pending requests in the queue.
212+
If None, it will be auto-resolved to
213+
``ceil(1.1 * max_num_seqs * pipeline_parallel_size)`` using values
214+
from vLLM's resolved engine config (so the default tracks vLLM's
215+
GPU-dependent ``max_num_seqs``). Pass a non-positive value (e.g.
216+
``-1``) to disable the semaphore entirely.
212217
dynamic_lora_loading_path: The S3 path to the dynamic LoRA adapter.
213218
log_engine_metrics: Whether to export vLLM metrics to Ray's Prometheus endpoint.
214219
**kwargs: The keyword arguments for the engine.
@@ -217,7 +222,7 @@ class vLLMEngineWrapper:
217222
def __init__(
218223
self,
219224
idx_in_batch_column: str,
220-
max_pending_requests: int = -1,
225+
max_pending_requests: Optional[int] = None,
221226
dynamic_lora_loading_path: Optional[str] = None,
222227
log_engine_metrics: bool = True,
223228
**kwargs,
@@ -294,8 +299,29 @@ def __init__(
294299

295300
# The performance gets really bad if there are too many requests in the pending queue.
296301
# We work around it with semaphore to limit the number of concurrent requests in the engine.
302+
# When the caller did not specify a limit, derive it from the resolved
303+
# vLLM config rather than from raw engine_kwargs. vLLM's default
304+
# `max_num_seqs` is GPU-dependent (e.g. 256 on A10G/A100, 1024 on H100),
305+
# so reading from `scheduler_config` avoids silently capping the
306+
# semaphore below vLLM's actual capacity.
307+
scheduler_config = self._vllm_config.scheduler_config
308+
parallel_config = self._vllm_config.parallel_config
309+
engine_capacity = (
310+
scheduler_config.max_num_seqs * parallel_config.pipeline_parallel_size
311+
)
312+
if max_pending_requests is None:
313+
max_pending_requests = math.ceil(engine_capacity * 1.1)
314+
elif 0 < max_pending_requests < engine_capacity:
315+
logger.warning(
316+
"max_pending_requests (%d) < max_num_seqs * pipeline_parallel_size "
317+
"(%d); may underutilize vLLM. Consider >=%d, or <=0 to disable.",
318+
max_pending_requests,
319+
engine_capacity,
320+
math.ceil(engine_capacity * 1.1),
321+
)
297322
self.max_pending_requests = max_pending_requests
298323
if self.max_pending_requests > 0:
324+
logger.info("Max pending requests is set to %d", self.max_pending_requests)
299325
self.semaphore = asyncio.Semaphore(self.max_pending_requests)
300326
else:
301327
self.semaphore = asyncio.NullContext()
@@ -612,7 +638,10 @@ def __init__(
612638
engine_kwargs: The kwargs to pass to the vLLM engine.
613639
task_type: The task to use for the vLLM engine (e.g., "generate", "embed", etc).
614640
max_pending_requests: The maximum number of pending requests. If None,
615-
it will be set to 1.1 * max_num_seqs * pipeline_parallel_size.
641+
it will be set to ``ceil(1.1 * max_num_seqs * pipeline_parallel_size)``,
642+
where ``max_num_seqs`` and ``pipeline_parallel_size`` are read from
643+
vLLM's resolved engine config (so the default tracks vLLM's
644+
GPU-dependent ``max_num_seqs``, not a hardcoded value).
616645
dynamic_lora_loading_path: The path to the dynamic LoRA adapter. It is expected
617646
to hold subfolders each for a different lora checkpoint.
618647
should_continue_on_error: If True, continue processing when inference fails for
@@ -629,14 +658,6 @@ def __init__(
629658
self.task_type = task_type
630659
self.engine_kwargs = self.normalize_engine_kwargs(engine_kwargs)
631660

632-
# Set up the max pending requests.
633-
pp_size = self.engine_kwargs.get("pipeline_parallel_size", 1)
634-
self.max_pending_requests = max_pending_requests or math.ceil(
635-
self.engine_kwargs.get("max_num_seqs", 128) * pp_size * 1.1
636-
)
637-
if self.max_pending_requests > 0:
638-
logger.info("Max pending requests is set to %d", self.max_pending_requests)
639-
640661
exclude_safetensors = (
641662
self.engine_kwargs.get("load_format") in STREAMING_LOAD_FORMATS
642663
)
@@ -662,11 +683,14 @@ def __init__(
662683
model_source=source,
663684
idx_in_batch_column=self.IDX_IN_BATCH_COLUMN,
664685
enable_log_requests=False,
665-
max_pending_requests=self.max_pending_requests,
686+
max_pending_requests=max_pending_requests,
666687
dynamic_lora_loading_path=dynamic_lora_loading_path,
667688
log_engine_metrics=log_engine_metrics,
668689
**self.engine_kwargs,
669690
)
691+
# The wrapper resolves a None into a concrete value using vLLM's
692+
# resolved engine config; surface that back on the UDF.
693+
self.max_pending_requests = self.llm.max_pending_requests
670694

671695
max_num_seqs = self.llm.get_scheduler_config().max_num_seqs
672696
if batch_size * max_concurrent_batches < max_num_seqs:

python/ray/llm/tests/batch/gpu/stages/test_vllm_engine_stage.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,12 @@ def test_vllm_engine_stage_post_init(gpu_type, model_llama_3_2_216M):
124124

125125
@pytest.mark.asyncio
126126
async def test_vllm_engine_udf_basic(mock_vllm_wrapper, model_llama_3_2_216M):
127+
# Simulate vLLM's resolved state when the user sets `max_num_seqs=100`:
128+
# the wrapper owns the resolution of `max_pending_requests`, and the UDF
129+
# reads the resolved value back.
130+
expected_max_pending_requests = math.ceil(100 * 1.1)
131+
mock_vllm_wrapper.return_value.max_pending_requests = expected_max_pending_requests
132+
127133
# Create UDF instance - it will use the mocked wrapper
128134
udf = vLLMEngineStageUDF(
129135
data_column="__data",
@@ -147,7 +153,7 @@ async def test_vllm_engine_udf_basic(mock_vllm_wrapper, model_llama_3_2_216M):
147153
assert udf.task_type == vLLMTaskType.GENERATE
148154
assert udf.engine_kwargs["task_type"] == vLLMTaskType.EMBED
149155
assert udf.engine_kwargs["max_num_seqs"] == 100
150-
assert udf.max_pending_requests == math.ceil(100 * 1.1)
156+
assert udf.max_pending_requests == expected_max_pending_requests
151157

152158
# Test batch processing
153159
batch = {
@@ -169,13 +175,16 @@ async def test_vllm_engine_udf_basic(mock_vllm_wrapper, model_llama_3_2_216M):
169175
assert responses[1]["prompt"] in ["Hello", "World"]
170176
assert responses[0]["prompt"] != responses[1]["prompt"]
171177

172-
# Verify the wrapper was constructed with correct arguments
178+
# Verify the wrapper was constructed with correct arguments. The UDF
179+
# passes `max_pending_requests=None` straight through when the caller
180+
# doesn't supply it; the wrapper resolves the default from vLLM's
181+
# resolved engine config.
173182
mock_vllm_wrapper.assert_called_once_with(
174183
model=model_llama_3_2_216M,
175184
model_source=model_llama_3_2_216M,
176185
idx_in_batch_column="__idx_in_batch",
177186
disable_log_stats=False,
178-
max_pending_requests=111,
187+
max_pending_requests=None,
179188
task_type=vLLMTaskType.EMBED,
180189
max_num_seqs=100,
181190
dynamic_lora_loading_path=None,

0 commit comments

Comments
 (0)