[serve] Coalesce HAProxy controller broadcasts into a single apply

harshit-anyscale · claude · harshit-anyscale · commit 3c44db2c852c · 2026-05-13T11:25:27.000Z
Under autoscaling churn the Serve controller fires target-group /
fallback-target broadcasts tens of ms apart; without coalescing, each
one runs its own runtime-API command burst on the HAProxy admin
socket. The CLI mux serializes everything (runtime API, stats, the
`-x` socket transfer used by reload) so command-level timeouts and
`-x` failures cluster together once admin-socket pressure exceeds
HAProxy's processing rate.

Coalesce broadcasts in `HAProxyManager` into a single sleeping flush
task. Updates arriving during the sleep are absorbed implicitly via
in-place writes to `self._target_groups` / fallback fields; updates
during the apply re-arm a `_coalesce_pending` flag so the trailing
broadcast in a flurry isn't dropped.

Window is configurable via `RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S`
(default 0.5s, set 0 to disable). Trade-off: each broadcast incurs up
to one window of additional latency before reaching HAProxy, which is
small relative to typical replica-start time; in exchange, applies are
capped at roughly one per window, well below admin-socket saturation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
@@ -759,6 +759,18 @@
     os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CLIENT_S", "3600")
 )
 
+# Window during which incoming controller broadcasts (target_groups,
+# fallback_targets) are coalesced into a single backend update before being
+# applied to HAProxy. Under autoscaling churn the controller can fire
+# broadcasts tens of ms apart; without coalescing each one issues its own
+# runtime-API command burst on the admin socket, which saturates HAProxy's
+# CLI mux and causes timeouts (and `-x` socket-transfer failures during the
+# fallback reload). 0.5s collapses typical burst clusters into one diff.
+# Set to 0 to disable coalescing entirely (legacy behaviour).
+RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S = float(
+    os.environ.get("RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S", "0.5")
+)
+
 # Number of consecutive failed server health checks that must occur
 # before haproxy marks the server as down.
 RAY_SERVE_HAPROXY_HEALTH_CHECK_FALL = int(
diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
@@ -35,6 +35,7 @@
     RAY_SERVE_EXPERIMENTAL_PIP_HAPROXY,
     RAY_SERVE_HAPROXY_BALANCE_ALGORITHM,
     RAY_SERVE_HAPROXY_BINARY_PATH,
+    RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S,
     RAY_SERVE_HAPROXY_CONFIG_FILE_LOC,
     RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S,
     RAY_SERVE_HAPROXY_HEALTH_CHECK_DOWNINTER,
@@ -1412,6 +1413,20 @@ def __init__(
         # which can cause race conditions with SO_REUSEPORT
         self._reload_lock = asyncio.Lock()
 
+        # Coalesce controller broadcasts: when a broadcast arrives we schedule
+        # a single sleeping flush task. Subsequent broadcasts during the sleep
+        # are absorbed implicitly because they overwrite `self._target_groups`
+        # / fallback fields in place; the flush picks up the latest snapshot
+        # when it wakes. This collapses bursts of replica-add/remove events
+        # (common during autoscaling churn) into one runtime-API command pile,
+        # which keeps the HAProxy admin socket from saturating.
+        # `_coalesce_pending` lets the flush task notice broadcasts that
+        # arrive during a long-running apply and trigger another iteration
+        # so the trailing broadcast in a flurry isn't dropped.
+        self._coalesce_window_s = RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S
+        self._coalesce_task: Optional[asyncio.Task] = None
+        self._coalesce_pending = False
+
         self.long_poll_client = long_poll_client or LongPollClient(
             ray.get_actor(SERVE_CONTROLLER_NAME, namespace=SERVE_NAMESPACE),
             {
@@ -1678,7 +1693,8 @@ async def _apply_backend_update(
             if not applied_incrementally:
                 await self._haproxy.reload()
 
-    def _update_haproxy_backends(self) -> None:
+    def _build_name_to_backend_configs(self) -> Dict[str, BackendConfig]:
+        """Snapshot the current target groups + fallbacks into backend configs."""
         backend_configs = []
         for target_group in self._target_groups:
             fallback_target = None
@@ -1690,16 +1706,56 @@ def _update_haproxy_backends(self) -> None:
             backend_config = self._create_backend_config(target_group, fallback_target)
             backend_configs.append(backend_config)
 
-        logger.info(
-            f"Got updated backend configs: {backend_configs}.",
-            extra={"log_to_stderr": True},
-        )
+        return {bc.name: bc for bc in backend_configs}
 
-        name_to_backend_configs = {
-            backend_config.name: backend_config for backend_config in backend_configs
-        }
+    def _update_haproxy_backends(self) -> None:
+        # With coalescing disabled (window <= 0), snapshot and apply right
+        # away in a fire-and-forget task (legacy behaviour). Otherwise always
+        # mark a broadcast as pending and start the flush task only if none is
+        # running; a running task re-checks the flag in `_coalesce_and_apply`.
+        if self._coalesce_window_s <= 0:
+            name_to_backend_configs = self._build_name_to_backend_configs()
+            logger.info(
+                f"Got updated backend configs: {list(name_to_backend_configs.values())}.",
+                extra={"log_to_stderr": True},
+            )
+            self.event_loop.create_task(
+                self._apply_backend_update(name_to_backend_configs)
+            )
+            return
+
+        self._coalesce_pending = True
+        if self._coalesce_task is None or self._coalesce_task.done():
+            self._coalesce_task = self.event_loop.create_task(
+                self._coalesce_and_apply()
+            )
+
+
+    async def _coalesce_and_apply(self) -> None:
+        """Drain pending broadcasts, applying each batch after a short sleep.
 
-        self.event_loop.create_task(self._apply_backend_update(name_to_backend_configs))
+        Broadcasts arriving during the sleep are absorbed: the long-poll
+        callbacks overwrite `self._target_groups` / fallback fields in place
+        and the flag is cleared only after waking, just before the snapshot.
+        Broadcasts arriving during the apply re-set `_coalesce_pending`, so
+        the outer loop runs another sleep + apply for them.
+        """
+        try:
+            while self._coalesce_pending:
+                await asyncio.sleep(self._coalesce_window_s)
+                # Clear after the sleep, with no await before the snapshot, so
+                # only broadcasts landing during the apply re-arm the loop.
+                self._coalesce_pending = False
+                name_to_backend_configs = self._build_name_to_backend_configs()
+                logger.info(
+                    f"Got updated backend configs: {list(name_to_backend_configs.values())}.",
+                    extra={"log_to_stderr": True},
+                )
+                await self._apply_backend_update(name_to_backend_configs)
+        except Exception as e:
+            # Don't let an apply failure kill the coalescer. The next broadcast
+            # will schedule a fresh flush task; the underlying error is already
+            # logged by `_apply_backend_update` / `try_apply_servers_dynamically`.
+            logger.error(f"Coalesced backend apply failed: {e}")
 
     def update_target_groups(self, target_groups: List[TargetGroup]) -> None:
         self._target_groups = target_groups