[data][llm] Promote max_tasks_in_flight_per_actor to a first-class config field and adjust defaults (ray-project#63214)

jeffreywang-anyscale · web-flow · commit afee41376ea2 · 2026-05-08T14:42:09.000-07:00
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
diff --git a/python/ray/data/llm.py b/python/ray/data/llm.py
@@ -125,7 +125,16 @@ class vLLMEngineProcessorConfig(_vLLMEngineProcessorConfig):
             This is to overlap the batch processing to avoid the tail latency of
             each batch. The default value may not be optimal when the batch size
             or the batch processing latency is too small, but it should be good
-            enough for batch size >= 64.
+            enough for batch size >= 64. Sets the engine actor's Ray Core
+            ``max_concurrency``.
+        max_tasks_in_flight_per_actor: Max tasks Ray Data submits concurrently to
+            each engine actor. Passed through to ``ray.data.ActorPoolStrategy``.
+            If unset, Ray Data uses
+            ``ray.data.DataContext.max_tasks_in_flight_per_actor`` if set globally.
+            Otherwise, it defaults to ``2 * max_concurrent_batches``; the factor
+            can be overridden via the
+            ``RAY_DATA_ACTOR_DEFAULT_MAX_TASKS_IN_FLIGHT_TO_MAX_CONCURRENCY_FACTOR``
+            env var.
         should_continue_on_error: If True, continue processing when inference fails for a row
             instead of raising an exception. Failed rows will have a non-empty
             ``__inference_error__`` column containing the error message, and other
@@ -233,7 +242,16 @@ class SGLangEngineProcessorConfig(_SGLangEngineProcessorConfig):
             This is to overlap the batch processing to avoid the tail latency of
             each batch. The default value may not be optimal when the batch size
             or the batch processing latency is too small, but it should be good
-            enough for batch size >= 64.
+            enough for batch size >= 64. Sets the engine actor's Ray Core
+            ``max_concurrency``.
+        max_tasks_in_flight_per_actor: Max tasks Ray Data submits concurrently to
+            each engine actor. Passed through to ``ray.data.ActorPoolStrategy``.
+            If unset, Ray Data uses
+            ``ray.data.DataContext.max_tasks_in_flight_per_actor`` if set globally.
+            Otherwise, it defaults to ``2 * max_concurrent_batches``; the factor
+            can be overridden via the
+            ``RAY_DATA_ACTOR_DEFAULT_MAX_TASKS_IN_FLIGHT_TO_MAX_CONCURRENCY_FACTOR``
+            env var.
         chat_template_stage: Chat templating stage config (bool | dict | ChatTemplateStageConfig).
             Defaults to True. Use nested config for per-stage control over batch_size,
             concurrency, runtime_env, num_cpus, and memory. Legacy ``apply_chat_template``
diff --git a/python/ray/llm/_internal/batch/processor/base.py b/python/ray/llm/_internal/batch/processor/base.py
@@ -17,11 +17,6 @@
 logger = logging.getLogger(__name__)
 
 
-# Higher values here are better for prefetching and locality. It's ok for this to be
-# fairly high since streaming backpressure prevents us from overloading actors.
-DEFAULT_MAX_TASKS_IN_FLIGHT = 16
-
-
 class ProcessorConfig(BaseModelExtended):
     """The processor configuration."""
 
@@ -55,9 +50,12 @@ class ProcessorConfig(BaseModelExtended):
 
     experimental: Dict[str, Any] = Field(
         default_factory=dict,
-        description="[Experimental] Experimental configurations."
+        description="[Experimental] Experimental configurations. "
         "Supported keys:\n"
-        "`max_tasks_in_flight_per_actor`: The maximum number of tasks in flight per actor. Default to 16.",
+        "`max_tasks_in_flight_per_actor`: [DEPRECATED] Prefer the top-level "
+        "`max_tasks_in_flight_per_actor` field on `OfflineProcessorConfig`. "
+        "Setting it here is still respected (and overridden by the top-level "
+        "field if both are set), but logs a deprecation warning.",
     )
 
     @field_validator("concurrency")
@@ -156,7 +154,21 @@ class OfflineProcessorConfig(ProcessorConfig):
         "This is to overlap the batch processing to avoid the tail latency of "
         "each batch. The default value may not be optimal when the batch size "
         "or the batch processing latency is too small, but it should be good "
-        "enough for batch size >= 32.",
+        "enough for batch size >= 32. Sets the engine actor's Ray Core "
+        "`max_concurrency`.",
+    )
+    max_tasks_in_flight_per_actor: Optional[int] = Field(
+        default=None,
+        description="Max tasks Ray Data submits concurrently to each engine "
+        "actor. Passed through to `ray.data.ActorPoolStrategy`. If unset, Ray "
+        "Data uses `ray.data.DataContext.max_tasks_in_flight_per_actor` if set "
+        "globally. Otherwise, it defaults to `2 * max_concurrent_batches`; the "
+        "factor can be overridden via the "
+        "`RAY_DATA_ACTOR_DEFAULT_MAX_TASKS_IN_FLIGHT_TO_MAX_CONCURRENCY_FACTOR` "
+        "env var. "
+        "Setting this lower than `max_concurrent_batches` can underutilize the "
+        "engine actor because Ray Data submits fewer tasks than the actor can "
+        "process concurrently.",
     )
     should_continue_on_error: bool = Field(
         default=False,
@@ -260,6 +272,44 @@ def _coerce_legacy_to_stage_config(cls, values: Dict[str, Any]) -> Dict[str, Any
 
         return values
 
+    @model_validator(mode="before")
+    def _migrate_experimental_max_tasks_in_flight_per_actor(
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """Migrate deprecated `experimental[max_tasks_in_flight_per_actor]` to
+        the top-level field; top-level wins if both are set."""
+        experimental = values.get("experimental") or {}
+        if "max_tasks_in_flight_per_actor" in experimental:
+            logger.warning(
+                "Setting `max_tasks_in_flight_per_actor` via `experimental` is "
+                "deprecated; use the top-level `max_tasks_in_flight_per_actor` "
+                "field on `OfflineProcessorConfig` instead. The value in "
+                "`experimental` is still respected for now (and overridden by "
+                "the top-level field if both are set), but will be removed in "
+                "a future version."
+            )
+            if values.get("max_tasks_in_flight_per_actor") is None:
+                values["max_tasks_in_flight_per_actor"] = experimental[
+                    "max_tasks_in_flight_per_actor"
+                ]
+        return values
+
+    @model_validator(mode="after")
+    def _warn_if_max_tasks_in_flight_underutilizes_actor(self):
+        if (
+            self.max_tasks_in_flight_per_actor is not None
+            and self.max_tasks_in_flight_per_actor < self.max_concurrent_batches
+        ):
+            logger.warning(
+                "Setting `max_tasks_in_flight_per_actor` (%s) lower than "
+                "`max_concurrent_batches` (%s) can underutilize each engine "
+                "actor because Ray Data will submit fewer tasks than the actor "
+                "can process concurrently.",
+                self.max_tasks_in_flight_per_actor,
+                self.max_concurrent_batches,
+            )
+        return self
+
     @model_validator(mode="before")
     def _warn_prepare_image_stage_deprecation(
         cls, values: Dict[str, Any]
diff --git a/python/ray/llm/_internal/batch/processor/sglang_engine_proc.py b/python/ray/llm/_internal/batch/processor/sglang_engine_proc.py
@@ -15,7 +15,6 @@
     get_or_create_telemetry_agent,
 )
 from ray.llm._internal.batch.processor.base import (
-    DEFAULT_MAX_TASKS_IN_FLIGHT,
     OfflineProcessorConfig,
     Processor,
     ProcessorBuilder,
@@ -184,9 +183,7 @@ def build_sglang_engine_processor(
                 # saturate `max_concurrency`.
                 compute=ray.data.ActorPoolStrategy(
                     **config.get_concurrency(autoscaling_enabled=True),
-                    max_tasks_in_flight_per_actor=config.experimental.get(
-                        "max_tasks_in_flight_per_actor", DEFAULT_MAX_TASKS_IN_FLIGHT
-                    ),
+                    max_tasks_in_flight_per_actor=config.max_tasks_in_flight_per_actor,
                 ),
                 # The number of running batches "per actor" in Ray Core level.
                 # This is used to make sure we overlap batches to avoid the tail
diff --git a/python/ray/llm/_internal/batch/processor/vllm_engine_proc.py b/python/ray/llm/_internal/batch/processor/vllm_engine_proc.py
@@ -15,7 +15,6 @@
     get_or_create_telemetry_agent,
 )
 from ray.llm._internal.batch.processor.base import (
-    DEFAULT_MAX_TASKS_IN_FLIGHT,
     OfflineProcessorConfig,
     Processor,
     ProcessorBuilder,
@@ -284,9 +283,7 @@ def build_vllm_engine_processor(
                 # saturate `max_concurrency`.
                 compute=ray.data.ActorPoolStrategy(
                     **config.get_concurrency(autoscaling_enabled=True),
-                    max_tasks_in_flight_per_actor=config.experimental.get(
-                        "max_tasks_in_flight_per_actor", DEFAULT_MAX_TASKS_IN_FLIGHT
-                    ),
+                    max_tasks_in_flight_per_actor=config.max_tasks_in_flight_per_actor,
                 ),
                 # The number of running batches "per actor" in Ray Core level.
                 # This is used to make sure we overlap batches to avoid the tail
diff --git a/python/ray/llm/tests/batch/cpu/processor/test_processor_base.py b/python/ray/llm/tests/batch/cpu/processor/test_processor_base.py
@@ -1,12 +1,16 @@
 import sys
 from typing import Any, AsyncIterator, Dict, List, Type
+from unittest.mock import patch
 
 import pydantic
 import pytest
 
 import ray
 from ray.data.llm import build_processor
-from ray.llm._internal.batch.processor import vLLMEngineProcessorConfig
+from ray.llm._internal.batch.processor import (
+    base as processor_base,
+    vLLMEngineProcessorConfig,
+)
 from ray.llm._internal.batch.processor.base import (
     Processor,
     ProcessorBuilder,
@@ -386,6 +390,93 @@ def test_with_tuple_concurrency(self, pair, expected):
         assert conf.get_concurrency() == expected
 
 
+class TestOfflineProcessorConfig:
+    @pytest.mark.parametrize(
+        "kwargs, expected",
+        [
+            ({"max_tasks_in_flight_per_actor": 10}, 10),
+            ({}, None),
+            # Field stays None; the formula runs in Ray Data, not here.
+            ({"max_concurrent_batches": 4}, None),
+        ],
+    )
+    def test_max_tasks_in_flight_per_actor_passthrough(self, kwargs, expected):
+        """Field passes through to ActorPoolStrategy; None defers resolution."""
+        config = vLLMEngineProcessorConfig(
+            model_source="unsloth/Llama-3.2-1B-Instruct",
+            **kwargs,
+        )
+        assert config.max_tasks_in_flight_per_actor == expected
+        assert config.max_concurrent_batches == kwargs.get("max_concurrent_batches", 8)
+
+    def test_experimental_max_tasks_in_flight_per_actor_deprecated(self):
+        """Setting `experimental['max_tasks_in_flight_per_actor']` migrates to
+        the top-level field with a deprecation log; the explicit top-level
+        field overrides it but the warning still fires."""
+
+        def has_deprecation_log(warning_mock):
+            return any(
+                "max_tasks_in_flight_per_actor" in call.args[0]
+                and "deprecated" in call.args[0]
+                for call in warning_mock.call_args_list
+            )
+
+        # Migration: experimental → top-level field.
+        with patch.object(processor_base.logger, "warning") as warning_mock:
+            cfg = vLLMEngineProcessorConfig(
+                model_source="unsloth/Llama-3.2-1B-Instruct",
+                experimental={"max_tasks_in_flight_per_actor": 10},
+            )
+        assert cfg.max_tasks_in_flight_per_actor == 10
+        assert has_deprecation_log(warning_mock)
+
+        # Explicit top-level beats experimental, but warning still fires.
+        with patch.object(processor_base.logger, "warning") as warning_mock:
+            cfg = vLLMEngineProcessorConfig(
+                model_source="unsloth/Llama-3.2-1B-Instruct",
+                max_tasks_in_flight_per_actor=20,
+                experimental={"max_tasks_in_flight_per_actor": 10},
+            )
+        assert cfg.max_tasks_in_flight_per_actor == 20
+        assert has_deprecation_log(warning_mock)
+
+    def test_max_tasks_in_flight_under_max_concurrent_batches_warns(self):
+        with patch.object(processor_base.logger, "warning") as warning_mock:
+            cfg = vLLMEngineProcessorConfig(
+                model_source="unsloth/Llama-3.2-1B-Instruct",
+                max_tasks_in_flight_per_actor=1,
+                max_concurrent_batches=8,
+            )
+
+        assert cfg.max_tasks_in_flight_per_actor == 1
+        assert cfg.max_concurrent_batches == 8
+        warning_messages = [call.args[0] for call in warning_mock.call_args_list]
+        assert any(
+            "max_tasks_in_flight_per_actor" in message
+            and "max_concurrent_batches" in message
+            and "underutilize" in message
+            for message in warning_messages
+        )
+
+    @pytest.mark.parametrize(
+        "kwargs",
+        [
+            {},
+            {"max_tasks_in_flight_per_actor": 8, "max_concurrent_batches": 8},
+            {"max_tasks_in_flight_per_actor": 16, "max_concurrent_batches": 8},
+        ],
+    )
+    def test_max_tasks_in_flight_does_not_warn_when_not_underutilized(self, kwargs):
+        with patch.object(processor_base.logger, "warning") as warning_mock:
+            vLLMEngineProcessorConfig(
+                model_source="unsloth/Llama-3.2-1B-Instruct",
+                **kwargs,
+            )
+
+        warning_messages = [call.args[0] for call in warning_mock.call_args_list]
+        assert not any("underutilize" in message for message in warning_messages)
+
+
 class TestMapKwargs:
     """Tests for preprocess_map_kwargs and postprocess_map_kwargs."""
 
diff --git a/python/ray/llm/tests/batch/gpu/processor/test_sglang_engine_proc.py b/python/ray/llm/tests/batch/gpu/processor/test_sglang_engine_proc.py
@@ -1,15 +1,14 @@
 """This test suite does not need sglang to be installed."""
 
 import sys
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch
 
 import pytest
 
 import ray
 from ray.data.llm import SGLangEngineProcessorConfig
 from ray.llm._internal.batch.constants import SGLangTaskType
 from ray.llm._internal.batch.processor import ProcessorBuilder
-from ray.llm._internal.batch.processor.base import DEFAULT_MAX_TASKS_IN_FLIGHT
 from ray.llm._internal.batch.processor.sglang_engine_proc import (
     build_sglang_engine_processor,
 )
@@ -76,40 +75,6 @@ def test_sglang_engine_processor(gpu_type, model_llama_3_2_216M):
 
 
 class TestSGLangEngineProcessorConfig:
-    @pytest.mark.parametrize(
-        "experimental_config",
-        [
-            {"max_tasks_in_flight_per_actor": 10},
-            {},
-        ],
-    )
-    def test_experimental_max_tasks_in_flight_per_actor_usage(
-        self, experimental_config
-    ):
-        """Tests that max_tasks_in_flight_per_actor is set properly in the ActorPoolStrategy."""
-
-        with patch("ray.data.ActorPoolStrategy") as mock_actor_pool:
-            mock_actor_pool.return_value = MagicMock()
-
-            config = SGLangEngineProcessorConfig(
-                model_source="unsloth/Llama-3.2-1B-Instruct",
-                experimental=experimental_config,
-            )
-            build_sglang_engine_processor(config)
-
-            mock_actor_pool.assert_called()
-            call_kwargs = mock_actor_pool.call_args[1]
-            if experimental_config:
-                assert (
-                    call_kwargs["max_tasks_in_flight_per_actor"]
-                    == experimental_config["max_tasks_in_flight_per_actor"]
-                )
-            else:
-                assert (
-                    call_kwargs["max_tasks_in_flight_per_actor"]
-                    == DEFAULT_MAX_TASKS_IN_FLIGHT
-                )
-
     def test_build_processor_autoconfig_failure_with_trust_remote_code(self):
         config = SGLangEngineProcessorConfig(
             model_source="nonexistent-org/nonexistent-model",
diff --git a/python/ray/llm/tests/batch/gpu/processor/test_vllm_engine_proc.py b/python/ray/llm/tests/batch/gpu/processor/test_vllm_engine_proc.py
@@ -1,5 +1,4 @@
 import sys
-from unittest.mock import MagicMock, patch
 
 import pydantic
 import pytest
@@ -692,46 +691,6 @@ def test_build_processor_autoconfig_failure(self):
         processor = build_processor(config)
         assert processor is not None
 
-    @pytest.mark.parametrize(
-        "experimental_config",
-        [
-            {"max_tasks_in_flight_per_actor": 10},
-            {},
-        ],
-    )
-    def test_experimental_max_tasks_in_flight_per_actor_usage(
-        self, experimental_config
-    ):
-        """Tests that max_tasks_in_flight_per_actor is set properly in the ActorPoolStrategy."""
-
-        from ray.llm._internal.batch.processor.base import DEFAULT_MAX_TASKS_IN_FLIGHT
-        from ray.llm._internal.batch.processor.vllm_engine_proc import (
-            build_vllm_engine_processor,
-            vLLMEngineProcessorConfig,
-        )
-
-        with patch("ray.data.ActorPoolStrategy") as mock_actor_pool:
-            mock_actor_pool.return_value = MagicMock()
-
-            config = vLLMEngineProcessorConfig(
-                model_source="unsloth/Llama-3.2-1B-Instruct",
-                experimental=experimental_config,
-            )
-            build_vllm_engine_processor(config)
-
-            mock_actor_pool.assert_called()
-            call_kwargs = mock_actor_pool.call_args[1]
-            if experimental_config:
-                assert (
-                    call_kwargs["max_tasks_in_flight_per_actor"]
-                    == experimental_config["max_tasks_in_flight_per_actor"]
-                )
-            else:
-                assert (
-                    call_kwargs["max_tasks_in_flight_per_actor"]
-                    == DEFAULT_MAX_TASKS_IN_FLIGHT
-                )
-
 
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))