Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
0ad1299
[serve] Enable HAProxy redispatch and retry-on for backend resilience
harshit-anyscale May 6, 2026
d5e5e99
Merge branch 'master' into serve-haproxy-redispatch
harshit-anyscale May 6, 2026
f7b9a28
[serve] Accept 500 or 503 in test_default_error_handling
harshit-anyscale May 7, 2026
2ce77e1
[serve] Make HAProxy reload timeout and retries env-var configurable
harshit-anyscale May 7, 2026
edda9ef
[serve] Decouple /-/healthz from backend pool state
harshit-anyscale May 7, 2026
4b97a71
[serve] Apply server-only backend updates via HAProxy runtime API
harshit-anyscale May 8, 2026
efbe04e
[serve] Validate HAProxy runtime-API responses; address Codex review
harshit-anyscale May 8, 2026
596ebb4
[serve] Recover from dead HAProxy in graceful reload path
harshit-anyscale May 8, 2026
ce527e4
[serve] Coalesce HAProxy controller broadcasts into a single apply
harshit-anyscale May 8, 2026
4e972da
[serve] Free listener ports before HAProxy fresh-start recovery
harshit-anyscale May 8, 2026
aded8a7
[serve] Probe HTTP listener for HAProxy health check, not admin socket
harshit-anyscale May 8, 2026
55f6fb9
[serve] Stop using admin socket for HAProxy reload readiness check
harshit-anyscale May 8, 2026
6790359
[serve] Accept any HTTP response in HAProxy listener probe
harshit-anyscale May 8, 2026
50ef8cd
[serve] Use TCP-only probe for HAProxy readiness, HTTP for health
harshit-anyscale May 8, 2026
f1f692b
[serve] Bump default HAProxy startup timeout to match reload timeout
harshit-anyscale May 8, 2026
884ccbc
[serve] Add timing instrumentation for HAProxy reload phases
harshit-anyscale May 8, 2026
5e9815f
[serve] Split HAProxy admin sockets and skip startup DNS
harshit-anyscale May 8, 2026
ab4ab5c
[serve] Batch HAProxy runtime-API commands onto single connections
harshit-anyscale May 11, 2026
8944a10
[serve] Tune HAProxy runtime-API chunk size, socket timeout, log per-…
harshit-anyscale May 11, 2026
9dbe27e
[serve] Migrate HAProxy backend management to server-template slot pools
harshit-anyscale May 11, 2026
2ca0d1f
[serve] Fix server-template bugs found in self-review
harshit-anyscale May 11, 2026
b6f2fd8
[serve] Filter HAProxy stats to active slots only
harshit-anyscale May 11, 2026
6029404
[serve] Wait for new HAProxy proc to claim admin socket before runtim…
harshit-anyscale May 11, 2026
7330a99
[serve] Isolate HAProxy admin socket on dedicated thread; bump socket…
harshit-anyscale May 11, 2026
eac17a6
[serve] Revert HAProxy thread isolation; drain stderr + dump admin-so…
harshit-anyscale May 11, 2026
3b77d2b
[serve] Default HAProxy timeouts, remove abortonclose, raise min slot…
harshit-anyscale May 12, 2026
8bb9997
[serve] Bump HAProxy hard-stop to 1800s; decouple keep-alive from HTT…
harshit-anyscale May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 98 additions & 9 deletions python/ray/serve/_private/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,16 @@

# HAProxy hard stop after timeout
RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S = int(
os.environ.get("RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S", "120")
os.environ.get("RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S", "1800")
)

# Idle keep-alive timeout for HAProxy's client-side connections. Distinct
# from `HTTPOptions.keep_alive_timeout_s` (which is the uvicorn keep-alive
# on the *replica* side). Lower values force idle clients to rotate off
# old HAProxy procs faster after a reload, reducing the chance that a
# long-running request lands on a near-hard-stop-deadline proc.
RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S = int(
    os.getenv("RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S", "60")
)

# HAProxy metrics export port
Expand All @@ -733,16 +742,12 @@
)

# HAProxy timeout configurations (in seconds)
RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S = (
int(os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S"))
if os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S")
else None
RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S = int(
os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S", "3600")
)

RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S = (
int(os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S"))
if os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S")
else None
RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S = int(
os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S", "5")
)

# When enabled, adds 'option http-no-delay' to the HAProxy config defaults,
Expand All @@ -756,6 +761,90 @@
os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CLIENT_S", "3600")
)

# Time to wait for HAProxy to enter the running state during a graceful
# reload before giving up. The default matches the previously hardcoded 5s,
# which can be too tight for clusters with many backends/servers under
# load; raise it in such deployments. When a reload times out the new
# server list never takes effect, leaving HAProxy with stale routing
# (backend updates that fall back to a full reload instead of the runtime
# API depend on that reload succeeding).
RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S = int(
    os.getenv("RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S", "5")
)

# Number of connection-level retries per request. With option redispatch,
# each retry picks a different healthy server. The HAProxy compiled-in
# default is 3; raising this is useful in environments with high
# autoscaling churn where any given primary may be temporarily
# unreachable and a sibling can serve the request instead.
RAY_SERVE_HAPROXY_RETRIES = int(
    os.getenv("RAY_SERVE_HAPROXY_RETRIES", "3")
)

# Window during which incoming controller broadcasts (target_groups,
# fallback_targets) are coalesced into a single backend update before being
# applied to HAProxy. Under autoscaling churn the controller can fire
# broadcasts tens of ms apart; without coalescing each one issues its own
# runtime-API command burst on the admin socket, which saturates HAProxy's
# CLI mux and causes timeouts (and `-x` socket-transfer failures during the
# fallback reload). The default of 0.5s collapses typical burst clusters
# into one diff. Set to 0 to disable coalescing entirely (legacy behaviour).
RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S = float(
    os.getenv("RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S", "0.5")
)

# Maximum number of HAProxy runtime-API commands (e.g. `add server`,
# `disable server`, `del server`, `enable server`) batched onto a single
# admin-socket connection. Each chunk is sent as a `;`-separated command
# string and shares one socket-level timeout. Smaller chunks reduce the
# risk of any one chunk exceeding the read timeout when HAProxy's CLI
# mux is queueing behind HTTP worker dispatch under load: the per-chunk
# overhead (~single-digit ms of socket setup) is negligible compared to
# the cost of a fallback full reload triggered by a timeout. 16 leaves
# substantial headroom even at high traffic load.
RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE = int(
    os.getenv("RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE", "16")
)

# Connect/read timeout (seconds) for HAProxy admin-socket commands. The
# CLI mux serializes admin operations behind HTTP worker dispatch, so a
# batch of `add server` / `del server` / etc. commands can routinely take
# more than a few seconds while HAProxy is serving heavy traffic. A
# generous ceiling keeps the runtime-API path alive across slow windows
# instead of cascading into a fallback reload.
RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S = float(
    os.getenv("RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S", "60")
)

# Total number of server slots to pre-allocate across all backends via
# HAProxy's `server-template` directive. Slots are partitioned across
# backends at config-generation time (see `_compute_slot_split`); each
# backend's share becomes the size of its `server-template` block and
# the upper bound on how many replicas it can hold before the runtime-
# API path returns False and a full reload re-computes the split. 4096
# total comfortably covers Ray Serve clusters with hundreds of replicas
# across a few dozen backends; raise it for larger fleets, but note that
# every slot has a small HAProxy memory cost (~few KB per slot) and
# slightly inflates `show stat` output.
RAY_SERVE_HAPROXY_TOTAL_SLOTS = int(
    os.getenv("RAY_SERVE_HAPROXY_TOTAL_SLOTS", "4096")
)

# Floor on per-backend slot allocation. Even backends with zero replicas
# get this many slots reserved so they can absorb some growth without
# requiring a reload to re-split. Set high enough to cover short bursts
# of churn but low enough that idle backends don't crowd out active ones.
# If N_backends * MIN_SLOTS exceeds TOTAL_SLOTS the split degrades to an
# equal share of (TOTAL_SLOTS // N_backends) per backend.
RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND = int(
    os.getenv("RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND", "32")
)

# Headroom multiplier applied when allocating slots. A backend with N
# current replicas gets ~N * factor slots (subject to total/min limits);
# the extra slots absorb scale-up without triggering a reload. 2.0 means
# a backend can double in size between reloads before exhausting.
RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR = float(
    os.getenv("RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR", "2.0")
)

# Number of consecutive failed server health checks that must occur
# before haproxy marks the server as down.
RAY_SERVE_HAPROXY_HEALTH_CHECK_FALL = int(
Expand Down
Loading
Loading