ray-project · harshit-anyscale · May 6, 2026 · May 6, 2026 · May 7, 2026 · May 7, 2026
@@ -716,7 +716,16 @@
 
 # HAProxy hard stop after timeout
 RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S = int(
-    os.environ.get("RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S", "120")
+    os.environ.get("RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S", "1800")
+)
+
+# Idle keep-alive timeout for HAProxy's client-side connections. Distinct
+# from `HTTPOptions.keep_alive_timeout_s` (which is the uvicorn keep-alive
+# on the *replica* side). Lower values force idle clients to rotate off
+# old HAProxy procs faster after a reload, reducing the chance that a
+# long-running request lands on a near-hard-stop-deadline proc.
+RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S = int(
+    os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S", "60")
 )
 
 # HAProxy metrics export port
@@ -733,16 +742,12 @@
 )
 
-# HAProxy timeout configurations (in seconds, None = no timeout)
+# HAProxy timeout configurations (in seconds; defaults apply when unset)
-RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S = (
-    int(os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S"))
-    if os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S")
-    else None
+RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S = int(
+    os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S", "3600")
 )
 
-RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S = (
-    int(os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S"))
-    if os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S")
-    else None
+RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S = int(
+    os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S", "5")
 )
 
 # When enabled, adds 'option http-no-delay' to the HAProxy config defaults,
@@ -756,6 +761,90 @@
     os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CLIENT_S", "3600")
 )
 
+# Time to wait for HAProxy to enter the running state during a graceful
+# reload before giving up. The default stays at the old hardcoded 5s; raise
+# it for clusters with many backends/servers under load: when reloads time out
+# the new server list never takes effect, leaving HAProxy with stale
+# routing (new replicas can't be added without a reload, since Ray's
+# implementation regenerates the config rather than using the runtime API).
+RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S = int(
+    os.environ.get("RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S", "5")
+)
+
+# Number of connection-level retries per request. With option redispatch,
+# a retry can be sent to a different healthy server. The HAProxy compiled-in
+# default is 3; raising this is useful in environments with high
+# autoscaling churn where any given primary may be temporarily
+# unreachable and a sibling can serve the request instead.
+RAY_SERVE_HAPROXY_RETRIES = int(os.environ.get("RAY_SERVE_HAPROXY_RETRIES", "3"))
+
+# Window during which incoming controller broadcasts (target_groups,
+# fallback_targets) are coalesced into a single backend update before being
+# applied to HAProxy. Under autoscaling churn the controller can fire
+# broadcasts tens of ms apart; without coalescing each one issues its own
+# runtime-API command burst on the admin socket, which saturates HAProxy's
+# CLI mux and causes timeouts (and `-x` socket-transfer failures during the
+# fallback reload). The 0.5s default collapses typical burst clusters into one diff.
+# Set to 0 to disable coalescing entirely (legacy behaviour).
+RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S = float(
+    os.environ.get("RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S", "0.5")
+)
+
+# Maximum number of HAProxy runtime-API commands (e.g. `add server`,
+# `disable server`, `del server`, `enable server`) batched onto a single
+# admin-socket connection. Each chunk is sent as a `;`-separated command
+# string and shares one socket-level timeout. Smaller chunks reduce the
+# risk of any one chunk exceeding the read timeout when HAProxy's CLI
+# mux is queueing behind HTTP worker dispatch under load: the per-chunk
+# overhead (~single-digit ms of socket setup) is negligible compared to
+# the cost of a fallback full reload triggered by a timeout. 16 leaves
+# substantial headroom even at high traffic load.
+RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE = int(
+    os.environ.get("RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE", "16")
+)
+
+# Connect/read timeout (seconds) for HAProxy admin-socket commands. The
+# CLI mux serializes admin operations behind HTTP worker dispatch, so a
+# batch of `add server` / `del server` / etc. commands can routinely take
+# more than a few seconds while HAProxy is serving heavy traffic. A
+# generous ceiling keeps the runtime-API path alive across slow windows
+# instead of cascading into a fallback reload.
+RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S = float(
+    os.environ.get("RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S", "60")
+)
+
+# Total number of server slots to pre-allocate across all backends via
+# HAProxy's `server-template` directive. Slots are partitioned across
+# backends at config-generation time (see `_compute_slot_split`); each
+# backend's share becomes the size of its `server-template` block and
+# the upper bound on how many replicas it can hold before the runtime-
+# API path returns False and a full reload re-computes the split. 4096
+# total comfortably covers Ray Serve clusters with hundreds of replicas
+# across a few dozen backends; raise it for larger fleets, but note that
+# every slot has a small HAProxy memory cost (~few KB per slot) and
+# slightly inflates `show stat` output.
+RAY_SERVE_HAPROXY_TOTAL_SLOTS = int(
+    os.environ.get("RAY_SERVE_HAPROXY_TOTAL_SLOTS", "4096")
+)
+
+# Floor on per-backend slot allocation. Even backends with zero replicas
+# get this many slots reserved so they can absorb some growth without
+# requiring a reload to re-split. Set high enough to cover short bursts
+# of churn but low enough that idle backends don't crowd out active ones.
+# If N_backends * MIN_SLOTS exceeds TOTAL_SLOTS the split degrades to an
+# equal share of (TOTAL_SLOTS // N_backends) per backend.
+RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND = int(
+    os.environ.get("RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND", "32")
+)
+
+# Headroom multiplier applied when allocating slots. A backend with N
+# current replicas gets ~N * factor slots (subject to total/min limits);
+# the extra slots absorb scale-up without triggering a reload. 2.0 means
+# a backend can double in size between reloads before exhausting its slots.
+RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR = float(
+    os.environ.get("RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR", "2.0")
+)
+
 # Number of consecutive failed server health checks that must occur
 # before haproxy marks the server as down.
 RAY_SERVE_HAPROXY_HEALTH_CHECK_FALL = int(