From 0ad12998da8417cd7afcb2898b6bb7881128119b Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Wed, 6 May 2026 16:59:45 +0000
Subject: [PATCH 01/26] [serve] Enable HAProxy redispatch and retry-on for
 backend resilience
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `option redispatch` and `retry-on conn-failure empty-response` to
each generated backend block in the HAProxy template.

Without these directives, when a Serve replica becomes unreachable
between health-check intervals (e.g. during autoscaling churn), HAProxy
exhausts its default `retries 3` against the same dead server, falls
through to the slow `SERVE_PROXY_ACTOR-fallback` backup, and surfaces a
synthesized 502 (remapped to 500 by the existing errorfile config) to
the client. With `option redispatch`, the retries pick a fresh server.
With `retry-on conn-failure empty-response`, retries trigger on
connection-establishment failures (replica process gone) and
connected-but-no-response cases (replica died after accept) — both are
body-stream-safe and don't risk double-execution of side-effecting
requests.

The two `retry-on` conditions intentionally exclude `response-timeout`
(no `timeout server` is set in the default config so it would never
fire) and `5xx` (would mask `BackPressureError` from the application
layer, causing thrash instead of respecting back-pressure).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy_templates.py |  7 +++++++
 python/ray/serve/tests/test_haproxy_api.py     | 14 ++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index 252589019458..11dfcd3a544e 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -91,6 +91,13 @@
 {%- set hc = item.health_config %}
 backend {{ backend.name or 'unknown' }}
     log global
+    # On a server failure, retry on a different server (combined with the
+    # default `retries 3`) instead of giving up to the backup or returning
+    # a synthesized 5xx. retry-on covers connection-establishment failures
+    # and "connected but no response yet" cases — both are body-stream-safe
+    # and don't risk double-execution.
+    option redispatch
+    retry-on conn-failure empty-response
     # Enable HTTP connection reuse for better performance
     http-reuse always
     # Set backend-specific timeouts, overriding defaults if specified
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index ff1b22b1c8e9..55ae80e328ce 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -326,6 +326,13 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     http-request return status 404 content-type text/plain lf-string "Path \'%[path]\' not found. Ping http://.../-/routes for available routes."
 backend api_backend
     log global
+    # On a server failure, retry on a different server (combined with the
+    # default `retries 3`) instead of giving up to the backup or returning
+    # a synthesized 5xx. retry-on covers connection-establishment failures
+    # and "connected but no response yet" cases — both are body-stream-safe
+    # and don't risk double-execution.
+    option redispatch
+    retry-on conn-failure empty-response
     # Enable HTTP connection reuse for better performance
     http-reuse always
     # Set backend-specific timeouts, overriding defaults if specified
@@ -344,6 +351,13 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     server api_fallback_server 127.0.0.1:8500 check backup
 backend web_backend
     log global
+    # On a server failure, retry on a different server (combined with the
+    # default `retries 3`) instead of giving up to the backup or returning
+    # a synthesized 5xx. retry-on covers connection-establishment failures
+    # and "connected but no response yet" cases — both are body-stream-safe
+    # and don't risk double-execution.
+    option redispatch
+    retry-on conn-failure empty-response
     # Enable HTTP connection reuse for better performance
     http-reuse always
     # Set backend-specific timeouts, overriding defaults if specified

From f7b9a282d278ce32bb9c2b72968114544cb222a0 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Thu, 7 May 2026 03:42:48 +0000
Subject: [PATCH 02/26] [serve] Accept 500 or 503 in
 test_default_error_handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With HAProxy redispatch enabled, requests to a deployment whose only
replica died mid-request now reach the backup proxy actor instead of
being given up on by HAProxy. The backup raises DeploymentUnavailableError
(→ 503) when the controller has broadcast unavailability, or the request
fails as ActorDiedError (→ 500) if it hasn't. Both are valid 5xx
responses for a deployment that is genuinely unavailable; loosen the
assertion to accept either.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/tests/test_http_routes.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/ray/serve/tests/test_http_routes.py b/python/ray/serve/tests/test_http_routes.py
index 4fafc1360e73..18f18a9f5896 100644
--- a/python/ray/serve/tests/test_http_routes.py
+++ b/python/ray/serve/tests/test_http_routes.py
@@ -244,9 +244,16 @@ def h():
         time.sleep(100)  # Don't return here to leave time for actor exit.
 
     serve.run(h.bind())
-    # Error is raised before the request reaches the deployed replica as the replica does not exist.
+    # The replica kills itself mid-request, so the deployment ends up with no
+    # live replicas. The exact status code depends on timing:
+    # - 500 if the controller hasn't yet broadcast unavailability to the proxy
+    #   (the request fails as ActorDiedError → mapped to 500 in http_util.py).
+    # - 503 if the broadcast has happened (the proxy raises
+    #   DeploymentUnavailableError → mapped to 503 in http_util.py). With
+    #   HAProxy + redispatch, the backup proxy actor is reached after the
+    #   primary fails, by which point the broadcast has typically landed.
     r = httpx.get("http://localhost:8000/h")
-    assert r.status_code == 500
+    assert r.status_code in (500, 503)
 
 
 if __name__ == "__main__":

From 2ce77e1fdfcc221414093e53de0cc98db48759bf Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Thu, 7 May 2026 09:52:17 +0000
Subject: [PATCH 03/26] [serve] Make HAProxy reload timeout and retries env-var
 configurable

Two related knobs that matter for HAProxy ingress reliability under
autoscaling churn, both previously hardcoded:

1. RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S (default: 5)
   The wait window in _wait_for_hap_availability() used to be 5s and
   hardcoded. With many backends/servers under load, HAProxy's startup
   (config parse + initial health checks against every server + socket
   transfer) regularly exceeds 5s, the new process is killed, and the
   old process keeps running with stale config. Since Ray's
   HAProxyManager regenerates config rather than using HAProxy's runtime
   API, a failed reload silently drops new replica registrations:
   primaries are added to the broadcast but never to HAProxy's server
   list, leaving the pool to drift smaller than reality and concentrating
   load on a shrinking subset of replicas (or on the single fallback
   ProxyActor).

2. RAY_SERVE_HAPROXY_RETRIES (default: 3)
   Number of connection-level retries. Combined with the existing
   `option redispatch` (added in the same PR series), each retry picks
   a different healthy server. Raising this is useful in environments
   with autoscaling churn where any given primary may briefly be
   unreachable before another sibling can serve the request.

Both default to the previous compiled-in values, preserving behavior
for users who don't opt in. Recommended overrides for high-churn
clusters:

    RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S=30
    RAY_SERVE_HAPROXY_RETRIES=100

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/constants.py        | 17 +++++++++++++++
 python/ray/serve/_private/haproxy.py          | 10 ++++++++-
 .../ray/serve/_private/haproxy_templates.py   | 11 +++++-----
 python/ray/serve/tests/test_haproxy_api.py    | 21 ++++++++++---------
 4 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 3bbed7ac849f..f7d3947e09c2 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -756,6 +756,23 @@
     os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CLIENT_S", "3600")
 )
 
+# Time to wait for HAProxy to enter the running state during a graceful
+# reload before giving up. The previous hardcoded 5s was too tight for
+# clusters with many backends/servers under load: when reloads time out
+# the new server list never takes effect, leaving HAProxy with stale
+# routing (new replicas can't be added without a reload, since Ray's
+# implementation regenerates the config rather than using the runtime API).
+RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S = int(
+    os.environ.get("RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S", "5")
+)
+
+# Number of connection-level retries per request. With option redispatch,
+# each retry picks a different healthy server. The HAProxy compiled-in
+# default is 3; raising this is useful in environments with high
+# autoscaling churn where any given primary may be temporarily
+# unreachable and a sibling can serve the request instead.
+RAY_SERVE_HAPROXY_RETRIES = int(os.environ.get("RAY_SERVE_HAPROXY_RETRIES", "3"))
+
 # Number of consecutive failed server health checks that must occur
 # before haproxy marks the server as down.
 RAY_SERVE_HAPROXY_HEALTH_CHECK_FALL = int(
diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 9e4b9bee371a..0dc81611bee1 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -41,6 +41,8 @@
     RAY_SERVE_HAPROXY_MAXCONN,
     RAY_SERVE_HAPROXY_METRICS_PORT,
     RAY_SERVE_HAPROXY_NBTHREAD,
+    RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S,
+    RAY_SERVE_HAPROXY_RETRIES,
     RAY_SERVE_HAPROXY_SERVER_STATE_BASE,
     RAY_SERVE_HAPROXY_SERVER_STATE_FILE,
     RAY_SERVE_HAPROXY_SOCKET_PATH,
@@ -414,6 +416,10 @@ class HAProxyConfig:
     timeout_server_s: Optional[int] = RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S
     timeout_http_request_s: Optional[int] = None
     hard_stop_after_s: Optional[int] = RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S
+    # Number of connection-level retries per request. Used in the `defaults`
+    # block; combined with `option redispatch` (set per-backend) each retry
+    # picks a different healthy server.
+    retries: int = RAY_SERVE_HAPROXY_RETRIES
     custom_global: Dict[str, str] = field(default_factory=dict)
     custom_defaults: Dict[str, str] = field(default_factory=dict)
     inject_process_id_header: bool = False
@@ -670,7 +676,9 @@ async def _graceful_reload(self) -> None:
             raise
 
     async def _wait_for_hap_availability(
-        self, proc: asyncio.subprocess.Process, timeout_s: int = 5
+        self,
+        proc: asyncio.subprocess.Process,
+        timeout_s: int = RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S,
     ) -> None:
         start_time = time.time()
 
diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index 11dfcd3a544e..7c0c85b6be8c 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -36,6 +36,7 @@
 defaults
     mode http
     option log-health-checks
+    retries {{ config.retries }}
     {% if config.timeout_connect_s is not none %}timeout connect {{ config.timeout_connect_s }}s{% endif %}
     {% if config.timeout_client_s is not none %}timeout client {{ config.timeout_client_s }}s{% endif %}
     {% if config.timeout_server_s is not none %}timeout server {{ config.timeout_server_s }}s{% endif %}
@@ -91,11 +92,11 @@
 {%- set hc = item.health_config %}
 backend {{ backend.name or 'unknown' }}
     log global
-    # On a server failure, retry on a different server (combined with the
-    # default `retries 3`) instead of giving up to the backup or returning
-    # a synthesized 5xx. retry-on covers connection-establishment failures
-    # and "connected but no response yet" cases — both are body-stream-safe
-    # and don't risk double-execution.
+    # On a server failure, retry on a different server (combined with
+    # `retries N` from the defaults block) instead of giving up to the
+    # backup or returning a synthesized 5xx. retry-on covers
+    # connection-establishment failures and "connected but no response yet"
+    # cases — both are body-stream-safe and don't risk double-execution.
     option redispatch
     retry-on conn-failure empty-response
     # Enable HTTP connection reuse for better performance
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index 55ae80e328ce..36bb9825123f 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -278,6 +278,7 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
 defaults
     mode http
     option log-health-checks
+    retries 3
     timeout connect 5s
     timeout client 30s
     timeout server 30s
@@ -326,11 +327,11 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     http-request return status 404 content-type text/plain lf-string "Path \'%[path]\' not found. Ping http://.../-/routes for available routes."
 backend api_backend
     log global
-    # On a server failure, retry on a different server (combined with the
-    # default `retries 3`) instead of giving up to the backup or returning
-    # a synthesized 5xx. retry-on covers connection-establishment failures
-    # and "connected but no response yet" cases — both are body-stream-safe
-    # and don't risk double-execution.
+    # On a server failure, retry on a different server (combined with
+    # `retries N` from the defaults block) instead of giving up to the
+    # backup or returning a synthesized 5xx. retry-on covers
+    # connection-establishment failures and "connected but no response yet"
+    # cases — both are body-stream-safe and don't risk double-execution.
     option redispatch
     retry-on conn-failure empty-response
     # Enable HTTP connection reuse for better performance
@@ -351,11 +352,11 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     server api_fallback_server 127.0.0.1:8500 check backup
 backend web_backend
     log global
-    # On a server failure, retry on a different server (combined with the
-    # default `retries 3`) instead of giving up to the backup or returning
-    # a synthesized 5xx. retry-on covers connection-establishment failures
-    # and "connected but no response yet" cases — both are body-stream-safe
-    # and don't risk double-execution.
+    # On a server failure, retry on a different server (combined with
+    # `retries N` from the defaults block) instead of giving up to the
+    # backup or returning a synthesized 5xx. retry-on covers
+    # connection-establishment failures and "connected but no response yet"
+    # cases — both are body-stream-safe and don't risk double-execution.
     option redispatch
     retry-on conn-failure empty-response
     # Enable HTTP connection reuse for better performance

From edda9ef93afa82e5fb56df2b4acb6618a4d0e770 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Thu, 7 May 2026 15:47:30 +0000
Subject: [PATCH 04/26] [serve] Decouple /-/healthz from backend pool state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Always return 200 from HAProxy's /-/healthz when HAProxy itself is up
(unless drain mode is active). Previously, the endpoint returned 503
whenever every backend reported nbsrv() == 0, which happens not only
during real outages but also briefly during reload transitions
(initial-check windows where servers are present in the config but
HAProxy's per-server health checks haven't completed yet).

Tying ALB target health to backend pool state via this endpoint causes
flapping cascades: a 1-3 second 503 window during a reload makes the
ALB mark the target unhealthy for minutes (interval × healthy_threshold
to recover). With reloads happening repeatedly under autoscaling
churn, all targets end up unhealthy simultaneously, the ALB has no
healthy targets, and every client request becomes an ALB-synthesized
502.

After this change, /-/healthz reflects only HAProxy's own liveness
(the question the ALB target health check actually needs answered).
Backend availability is still surfaced to clients per-request: if
HAProxy can't reach any backend, it returns 503 from the actual
request path — which is the correct HTTP semantic and uncorrelated
across requests, not a cluster-wide blackout.

Trade-off: clients may now see HAProxy-returned 503s for the small
fraction of requests that genuinely have no backend, instead of
ALB-returned 502s. Net effect is a substantial volume reduction in
client-visible failures because the cascade pattern (one bad probe
blanking out an entire target for minutes) is eliminated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../ray/serve/_private/haproxy_templates.py   | 21 +++++----
 python/ray/serve/tests/test_haproxy_api.py    | 46 ++++++++++---------
 2 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index 7c0c85b6be8c..3e7db9313a7a 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -5,16 +5,17 @@
 {%- if not health_info.healthy %}
     # Override: force health checks to fail (used by drain/disable)
     http-request return status {{ health_info.status }} content-type text/plain string "{{ health_info.health_message }}" if healthcheck
-{%- elif backends %}
-    # 200 if any backend has at least one server UP
-{%-   for backend in backends %}
-    acl backend_{{ backend.name or 'unknown' }}_server_up nbsrv({{ backend.name or 'unknown' }}) ge 1
-{%-   endfor %}
-    # Any backend with a server UP passes the health check (OR logic)
-{%-   for backend in backends %}
-    http-request return status {{ health_info.status }} content-type text/plain string "{{ health_info.health_message }}" if healthcheck backend_{{ backend.name or 'unknown' }}_server_up
-{%-   endfor %}
-    http-request return status 503 content-type text/plain string "Service Unavailable" if healthcheck
+{%- else %}
+    # Always return success if HAProxy itself is up. Backend pool state is
+    # surfaced to clients per-request (HAProxy returns 503 when backends
+    # are missing) — not via this ingress healthcheck. Tying ALB target
+    # health to nbsrv() causes flapping cascades during autoscaling churn:
+    # initial-check windows after a reload briefly report nbsrv()=0 even
+    # though HAProxy can still serve traffic (via redispatch, fallback,
+    # and replicas that finish their initial probes within milliseconds).
+    # That brief 503 marks the target unhealthy at the ALB for minutes,
+    # making every client request route to a "no healthy targets" state.
+    http-request return status {{ health_info.status }} content-type text/plain string "{{ health_info.health_message }}" if healthcheck
 {%- endif %}
 """
 
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index 36bb9825123f..81c2cd9ef534 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -305,13 +305,16 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     acl healthcheck path -i /-/healthz
     # Suppress logging for health checks
     http-request set-log-level silent if healthcheck
-    # 200 if any backend has at least one server UP
-    acl backend_api_backend_server_up nbsrv(api_backend) ge 1
-    acl backend_web_backend_server_up nbsrv(web_backend) ge 1
-    # Any backend with a server UP passes the health check (OR logic)
-    http-request return status 200 content-type text/plain string "success" if healthcheck backend_api_backend_server_up
-    http-request return status 200 content-type text/plain string "success" if healthcheck backend_web_backend_server_up
-    http-request return status 503 content-type text/plain string "Service Unavailable" if healthcheck
+    # Always return success if HAProxy itself is up. Backend pool state is
+    # surfaced to clients per-request (HAProxy returns 503 when backends
+    # are missing) — not via this ingress healthcheck. Tying ALB target
+    # health to nbsrv() causes flapping cascades during autoscaling churn:
+    # initial-check windows after a reload briefly report nbsrv()=0 even
+    # though HAProxy can still serve traffic (via redispatch, fallback,
+    # and replicas that finish their initial probes within milliseconds).
+    # That brief 503 marks the target unhealthy at the ALB for minutes,
+    # making every client request route to a "no healthy targets" state.
+    http-request return status 200 content-type text/plain string "success" if healthcheck
     # Routes endpoint
     acl routes path -i /-/routes
     http-request return status 200 content-type application/json string "{routes}" if routes
@@ -1093,7 +1096,11 @@ def health_check_condition(status_code: int):
 
 @pytest.mark.asyncio
 async def test_health_endpoint_or_logic_multiple_backends(haproxy_api_cleanup):
-    """Test that the health endpoint returns 200 if ANY backend has at least one server UP (OR logic)."""
+    """Test that the health endpoint returns 200 unconditionally when HAProxy
+    is up — regardless of backend server state. The endpoint is for ALB
+    target-health checks (is this proxy reachable?) and intentionally
+    decouples from backend pool state to avoid flapping cascades during
+    autoscaling churn."""
     with tempfile.TemporaryDirectory() as temp_dir:
         config_file_path = os.path.join(temp_dir, "haproxy.cfg")
         socket_path = os.path.join(temp_dir, "admin.sock")
@@ -1187,25 +1194,22 @@ def health_ok():
             backend2_server.should_exit = True
             backend2_thread.join(timeout=5)
 
-            # Wait for health check to fail (both servers are DOWN)
-            def health_fails():
-                resp = requests.get(
-                    f"http://127.0.0.1:{config.frontend_port}{config.health_check_endpoint}",
-                    timeout=5,
-                )
-                return resp.status_code == 503
-
-            wait_for_condition(health_fails, timeout=10, retry_interval_ms=200)
+            # Give HAProxy time to detect both servers are down
+            await asyncio.sleep(2)
 
-            # Verify health check returns 503 when ALL servers are DOWN
+            # Verify health check STILL returns 200 even when ALL servers are
+            # DOWN. /-/healthz reflects HAProxy's own liveness, not backend
+            # pool state. Backend availability is surfaced to clients
+            # per-request (HAProxy returns 503 from the actual request path
+            # when no backend can serve), not via this ingress probe.
             health_response = requests.get(
                 f"http://127.0.0.1:{config.frontend_port}{config.health_check_endpoint}",
                 timeout=5,
             )
             assert (
-                health_response.status_code == 503
-            ), "Health check should return 503 when all servers are DOWN"
-            assert b"Service Unavailable" in health_response.content
+                health_response.status_code == 200
+            ), "Health check should return 200 (HAProxy is up) regardless of backend state"
+            assert b"success" in health_response.content
 
             await api.stop()
         finally:

From 4b97a71dbe920146c96b66470131611d07a93ef4 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 04:14:11 +0000
Subject: [PATCH 05/26] [serve] Apply server-only backend updates via HAProxy
 runtime API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At Ray Serve scales with many replicas under autoscaling churn, every
controller broadcast triggers a full HAProxy config regeneration plus a
graceful reload. Each reload blocks the proxy actor's event loop long
enough (config-string templating, file write, fork+exec of the new
HAProxy process, initial server-state probes) that the actor can miss
the controller's `check_health` window. Three consecutive missed checks
trip PROXY_HEALTH_CHECK_UNHEALTHY_THRESHOLD; the controller kills and
recreates the actor; the new actor immediately receives a broadcast and
starts another reload; repeat. Result: every proxy in the cluster ends
up cycling continuously, the ALB sees its targets as permanently
unhealthy, and clients receive 502s from "no healthy targets" not from
any HAProxy or replica issue.

This change adds an incremental update path: when the only difference
between the previous and new backend configs is the set of servers
within existing backends (the dominant case during autoscaling churn —
adds/removes of individual replicas), apply the change via HAProxy's
runtime API (`add server` / `del server` over the admin socket)
instead of regenerating the config and reloading. Runtime-API server
operations are sub-millisecond per command and don't block the actor's
event loop, so the proxy stays responsive to the controller even while
processing rapid-fire broadcasts.

Structural changes — backend added or removed, ACL/path-prefix change,
fallback-server change, timeout or health-check parameter change —
still take the existing reload path. The diff is conservative: any
field-level discrepancy outside the servers list forces a reload.

If any single runtime-API command fails (socket transient, HAProxy
version doesn't support `add server`, etc.) the change is abandoned
mid-flight and the caller falls back to a reload, so partial state is
never persisted.

Internal state (HAProxyApi.backend_configs and the on-disk config file)
is updated either way so a subsequent reload — for instance, triggered
by a future structural change — sees the current set of servers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py          | 227 +++++++++++-
 .../ray/serve/tests/unit/test_haproxy_diff.py | 330 ++++++++++++++++++
 2 files changed, 555 insertions(+), 2 deletions(-)
 create mode 100644 python/ray/serve/tests/unit/test_haproxy_diff.py

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 0dc81611bee1..49716050a653 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -553,6 +553,100 @@ async def reload(self) -> None:
         pass
 
 
+@dataclass
+class _BackendDiff:
+    """Diff between old and new BackendConfigs for incremental update path.
+
+    When `is_structural` is True, a full reload is required (backends added/
+    removed, ACLs/timeouts/health-check parameters changed, fallback changed).
+
+    When `is_structural` is False, only the `servers` list within one or more
+    backends differs, and the change can be applied via HAProxy's runtime API
+    (`add server` / `del server` over the admin socket) without a reload.
+    """
+
+    is_structural: bool
+    servers_to_add: Dict[str, List[ServerConfig]] = field(default_factory=dict)
+    servers_to_remove: Dict[str, List[str]] = field(default_factory=dict)
+
+
+def _compute_backend_diff(
+    old_configs: Dict[str, BackendConfig],
+    new_configs: Dict[str, BackendConfig],
+) -> _BackendDiff:
+    """Determine whether a config change can be applied incrementally.
+
+    Anything other than pure server list differences within an existing
+    backend is treated as a structural change requiring a reload. This keeps
+    the incremental path conservative: when in doubt, reload.
+    """
+    # Different set of backends (added/removed) → reload
+    if set(old_configs.keys()) != set(new_configs.keys()):
+        return _BackendDiff(is_structural=True)
+
+    servers_to_add: Dict[str, List[ServerConfig]] = {}
+    servers_to_remove: Dict[str, List[str]] = {}
+
+    for name in new_configs:
+        old_be = old_configs[name]
+        new_be = new_configs[name]
+
+        # Any backend-level config change → reload. Compare the fields that
+        # affect the rendered backend block (excluding the servers list).
+        backend_level_fields = (
+            "path_prefix",
+            "fallback_server",
+            "app_name",
+            "timeout_connect_s",
+            "timeout_server_s",
+            "timeout_client_s",
+            "timeout_http_request_s",
+            "timeout_queue_s",
+            "timeout_http_keep_alive_s",
+            "timeout_tunnel_s",
+            "health_check_fall",
+            "health_check_rise",
+            "health_check_inter",
+            "health_check_fastinter",
+            "health_check_downinter",
+            "health_check_path",
+        )
+        for f in backend_level_fields:
+            if getattr(old_be, f) != getattr(new_be, f):
+                return _BackendDiff(is_structural=True)
+
+        # Compare server lists by name. A name present in both with different
+        # host/port is treated as remove + add (HAProxy's runtime API can't
+        # update a server's address in-place across all versions).
+        old_servers = {s.name: s for s in old_be.servers}
+        new_servers = {s.name: s for s in new_be.servers}
+
+        added: List[ServerConfig] = []
+        removed: List[str] = []
+
+        for sname, server in new_servers.items():
+            if sname not in old_servers:
+                added.append(server)
+            elif old_servers[sname] != server:
+                added.append(server)
+                removed.append(sname)
+
+        for sname in old_servers:
+            if sname not in new_servers:
+                removed.append(sname)
+
+        if added:
+            servers_to_add[name] = added
+        if removed:
+            servers_to_remove[name] = removed
+
+    return _BackendDiff(
+        is_structural=False,
+        servers_to_add=servers_to_add,
+        servers_to_remove=servers_to_remove,
+    )
+
+
 class HAProxyApi(ProxyApi):
     """ProxyApi implementation for HAProxy."""
 
@@ -984,6 +1078,113 @@ def set_backend_configs(
             len(bc.servers) > 0 for bc in backend_configs.values()
         )
 
+    async def try_apply_servers_dynamically(
+        self,
+        new_backend_configs: Dict[str, BackendConfig],
+    ) -> bool:
+        """Try to apply server-list changes via HAProxy's runtime API.
+
+        Compares ``new_backend_configs`` to the current ``self.backend_configs``.
+        If the change is purely server adds/removes within existing backends,
+        applies them over the admin socket (`add server` / `del server`) without
+        regenerating the config or restarting HAProxy. Otherwise falls back
+        and returns False so the caller can perform a full reload.
+
+        At Ray Serve scales with many replicas under autoscaling churn,
+        regenerating and reloading on every replica change blocks the proxy
+        actor's event loop long enough to fail controller health checks
+        (causing the proxy to be killed and recreated, cascading into the
+        ALB seeing all targets as unhealthy). The runtime API is ~1ms per
+        change vs. ~1-2s for a reload, so this incremental path keeps the
+        actor responsive under churn.
+
+        Returns True if applied incrementally (no reload needed). Returns
+        False if a reload is required (caller is responsible for calling
+        ``reload()``).
+        """
+        # First-time setup or no current state to diff against → reload.
+        if not self.backend_configs:
+            self.set_backend_configs(new_backend_configs)
+            return False
+
+        # If HAProxy isn't running yet (initial startup), the runtime API
+        # isn't available. Fall back to reload, which will start HAProxy.
+        if not await self.is_running():
+            self.set_backend_configs(new_backend_configs)
+            return False
+
+        diff = _compute_backend_diff(self.backend_configs, new_backend_configs)
+        if diff.is_structural:
+            self.set_backend_configs(new_backend_configs)
+            return False
+
+        if not diff.servers_to_add and not diff.servers_to_remove:
+            # No-op: server lists are identical. Update internal state
+            # defensively (in case other fields on ServerConfig change in the
+            # future) and skip the reload.
+            self.set_backend_configs(new_backend_configs)
+            return True
+
+        # Apply the diff via the admin socket. If any single command fails,
+        # bail out and let the caller reload — this avoids leaving HAProxy in
+        # a partially-updated state.
+        try:
+            for backend_name, server_names in diff.servers_to_remove.items():
+                for sname in server_names:
+                    # Disable before delete: required when the server has
+                    # in-flight connections; safe even when it doesn't.
+                    await self._send_socket_command(
+                        f"disable server {backend_name}/{sname}"
+                    )
+                    await self._send_socket_command(
+                        f"del server {backend_name}/{sname}"
+                    )
+
+            for backend_name, servers in diff.servers_to_add.items():
+                for server in servers:
+                    # Newly-added runtime servers inherit `default-server`
+                    # directives (inter/fall/rise/etc.) from the backend
+                    # block, so we only need address + the `check` flag.
+                    await self._send_socket_command(
+                        f"add server {backend_name}/{server.name} "
+                        f"{server.host}:{server.port} check"
+                    )
+                    # Runtime-added servers start in MAINT state by default.
+                    await self._send_socket_command(
+                        f"enable server {backend_name}/{server.name}"
+                    )
+        except Exception as e:
+            logger.warning(
+                f"Failed to apply backend update incrementally via runtime "
+                f"API: {e}. Falling back to a full reload.",
+                extra={"log_to_stderr": False},
+            )
+            self.set_backend_configs(new_backend_configs)
+            return False
+
+        # Update internal state and regenerate the config file so any
+        # subsequent reload (e.g. for a structural change) starts from the
+        # current set of servers.
+        self.set_backend_configs(new_backend_configs)
+        try:
+            self._generate_config_file_internal()
+        except Exception as e:
+            # The runtime change is already applied. A failure to write the
+            # config file is non-fatal here: the next reload will rebuild it.
+            logger.warning(
+                f"Applied backend update via runtime API but failed to "
+                f"refresh the config file: {e}",
+                extra={"log_to_stderr": False},
+            )
+
+        logger.info(
+            f"Applied backend update via runtime API: "
+            f"{sum(len(v) for v in diff.servers_to_add.values())} server(s) added, "
+            f"{sum(len(v) for v in diff.servers_to_remove.values())} server(s) removed.",
+            extra={"log_to_stderr": False},
+        )
+        return True
+
     async def is_running(self) -> bool:
         try:
             await self._send_socket_command("show info")
@@ -1268,6 +1469,29 @@ async def _reload_haproxy(self) -> None:
             await self._haproxy_start_task
             await self._haproxy.reload()
 
+    async def _apply_backend_update(
+        self, name_to_backend_configs: Dict[str, BackendConfig]
+    ) -> None:
+        """Apply a backend config update incrementally if possible, else reload.
+
+        At scale, each controller broadcast triggers a config update. Doing a
+        full HAProxy reload for every update blocks the actor's event loop
+        long enough to miss the controller's health-check window, which causes
+        the controller to kill and recreate the proxy actor — leading to a
+        cascading failure where every proxy in the cluster cycles continuously.
+
+        The runtime API path applies pure server adds/removes in milliseconds
+        without a reload. For structural changes (new backend, ACL change,
+        timeout change, etc.) we still fall back to a full reload.
+        """
+        async with self._reload_lock:
+            await self._haproxy_start_task
+            applied_incrementally = await self._haproxy.try_apply_servers_dynamically(
+                name_to_backend_configs
+            )
+            if not applied_incrementally:
+                await self._haproxy.reload()
+
     def _update_haproxy_backends(self) -> None:
         backend_configs = []
         for target_group in self._target_groups:
@@ -1289,8 +1513,7 @@ def _update_haproxy_backends(self) -> None:
             backend_config.name: backend_config for backend_config in backend_configs
         }
 
-        self._haproxy.set_backend_configs(name_to_backend_configs)
-        self.event_loop.create_task(self._reload_haproxy())
+        self.event_loop.create_task(self._apply_backend_update(name_to_backend_configs))
 
     def update_target_groups(self, target_groups: List[TargetGroup]) -> None:
         self._target_groups = target_groups
diff --git a/python/ray/serve/tests/unit/test_haproxy_diff.py b/python/ray/serve/tests/unit/test_haproxy_diff.py
new file mode 100644
index 000000000000..bdb4028c8c2a
--- /dev/null
+++ b/python/ray/serve/tests/unit/test_haproxy_diff.py
@@ -0,0 +1,330 @@
+"""Unit tests for the incremental backend-update diff in HAProxyApi.
+
+The diff helper decides whether a backend-config change can be applied via
+HAProxy's runtime API (`add server` / `del server`) instead of regenerating
+the full config and triggering a graceful reload. At Ray Serve scales with
+many replicas under autoscaling churn, reload-on-every-change blocks the
+proxy actor's event loop long enough to fail the controller's health check,
+which kills and recreates the actor — leading to cluster-wide proxy
+flapping. The runtime-API path keeps the actor responsive for the common
+case (replica added/removed, nothing else).
+
+These tests cover the diff classification in isolation. Whether a given
+diff actually applies cleanly is HAProxy's responsibility at runtime;
+HAProxyApi.try_apply_servers_dynamically falls back to a reload on any
+socket-command error.
+"""
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from ray.serve._private.haproxy import (
+    BackendConfig,
+    HAProxyApi,
+    HAProxyConfig,
+    ServerConfig,
+    _compute_backend_diff,
+)
+from ray.serve.config import HTTPOptions
+
+
+def _backend(
+    name: str,
+    servers: list,
+    *,
+    path_prefix: str = "/x",
+    fallback=None,
+    **overrides,
+) -> BackendConfig:
+    """Make a BackendConfig with sensible defaults; override fields per test."""
+    return BackendConfig(
+        name=name,
+        path_prefix=path_prefix,
+        app_name=name,
+        servers=servers,
+        fallback_server=fallback,
+        **overrides,
+    )
+
+
+def _server(name: str, host: str = "10.0.0.1", port: int = 8000) -> ServerConfig:
+    return ServerConfig(name=name, host=host, port=port)
+
+
+class TestComputeBackendDiff:
+    """The diff is the safety boundary between fast-path and reload."""
+
+    def test_identical_configs_no_changes(self):
+        """No diff → no work, no reload."""
+        configs = {"b1": _backend("b1", [_server("s1"), _server("s2")])}
+        diff = _compute_backend_diff(configs, configs)
+        assert not diff.is_structural
+        assert diff.servers_to_add == {}
+        assert diff.servers_to_remove == {}
+
+    def test_added_server(self):
+        """A new server within an existing backend is the canonical fast-path case."""
+        old = {"b1": _backend("b1", [_server("s1")])}
+        new = {"b1": _backend("b1", [_server("s1"), _server("s2", port=8001)])}
+        diff = _compute_backend_diff(old, new)
+        assert not diff.is_structural
+        assert "b1" in diff.servers_to_add
+        assert [s.name for s in diff.servers_to_add["b1"]] == ["s2"]
+        assert diff.servers_to_remove == {}
+
+    def test_removed_server(self):
+        """A removed server within an existing backend is also fast-path."""
+        old = {"b1": _backend("b1", [_server("s1"), _server("s2")])}
+        new = {"b1": _backend("b1", [_server("s1")])}
+        diff = _compute_backend_diff(old, new)
+        assert not diff.is_structural
+        assert diff.servers_to_remove == {"b1": ["s2"]}
+        assert diff.servers_to_add == {}
+
+    def test_server_address_change_is_remove_plus_add(self):
+        """Same name, different host/port → remove old, add new.
+
+        HAProxy's runtime API doesn't have a clean in-place address-change
+        primitive that works the same way across all 2.x releases, so we
+        treat this as remove + re-add. Conservative and predictable.
+        """
+        old = {"b1": _backend("b1", [_server("s1", host="10.0.0.1", port=8000)])}
+        new = {"b1": _backend("b1", [_server("s1", host="10.0.0.2", port=8000)])}
+        diff = _compute_backend_diff(old, new)
+        assert not diff.is_structural
+        assert diff.servers_to_remove == {"b1": ["s1"]}
+        assert [s.host for s in diff.servers_to_add["b1"]] == ["10.0.0.2"]
+
+    def test_added_backend_is_structural(self):
+        """A new backend requires a new ACL → reload."""
+        old = {"b1": _backend("b1", [_server("s1")])}
+        new = {
+            "b1": _backend("b1", [_server("s1")]),
+            "b2": _backend("b2", [_server("s3")]),
+        }
+        diff = _compute_backend_diff(old, new)
+        assert diff.is_structural
+
+    def test_removed_backend_is_structural(self):
+        """A removed backend → reload (frontend ACLs must be regenerated)."""
+        old = {
+            "b1": _backend("b1", [_server("s1")]),
+            "b2": _backend("b2", [_server("s3")]),
+        }
+        new = {"b1": _backend("b1", [_server("s1")])}
+        diff = _compute_backend_diff(old, new)
+        assert diff.is_structural
+
+    def test_path_prefix_change_is_structural(self):
+        """Path-prefix changes affect the frontend ACLs → reload."""
+        old = {"b1": _backend("b1", [_server("s1")], path_prefix="/x")}
+        new = {"b1": _backend("b1", [_server("s1")], path_prefix="/y")}
+        diff = _compute_backend_diff(old, new)
+        assert diff.is_structural
+
+    def test_fallback_server_change_is_structural(self):
+        """Backup-server changes are rendered into the backend block → reload."""
+        old = {
+            "b1": _backend(
+                "b1",
+                [_server("s1")],
+                fallback=_server("fb", host="127.0.0.1", port=8500),
+            )
+        }
+        new = {
+            "b1": _backend(
+                "b1",
+                [_server("s1")],
+                fallback=_server("fb", host="127.0.0.1", port=8501),
+            )
+        }
+        diff = _compute_backend_diff(old, new)
+        assert diff.is_structural
+
+    @pytest.mark.parametrize(
+        "field,old_val,new_val",
+        [
+            ("timeout_connect_s", None, 5),
+            ("timeout_server_s", 30, 60),
+            ("timeout_http_keep_alive_s", None, 60),
+            ("timeout_tunnel_s", None, 60),
+            ("health_check_fall", None, 3),
+            ("health_check_rise", None, 2),
+            ("health_check_inter", None, "1s"),
+            ("health_check_path", None, "/-/healthz"),
+        ],
+    )
+    def test_backend_level_field_change_is_structural(self, field, old_val, new_val):
+        """Any non-server backend field change → reload.
+
+        These all render into the HAProxy backend block (timeouts, health-
+        check directives, etc.), so they require regenerating the config.
+        """
+        old = {"b1": _backend("b1", [_server("s1")], **{field: old_val})}
+        new = {"b1": _backend("b1", [_server("s1")], **{field: new_val})}
+        diff = _compute_backend_diff(old, new)
+        assert diff.is_structural, f"change to {field} should be structural"
+
+    def test_concurrent_adds_and_removes_in_one_diff(self):
+        """A real-world case: scale-up adds N replicas while scale-down removes M."""
+        old = {"b1": _backend("b1", [_server("s1"), _server("s2"), _server("s3")])}
+        new = {"b1": _backend("b1", [_server("s2"), _server("s4"), _server("s5")])}
+        diff = _compute_backend_diff(old, new)
+        assert not diff.is_structural
+        assert sorted(diff.servers_to_remove["b1"]) == ["s1", "s3"]
+        assert sorted(s.name for s in diff.servers_to_add["b1"]) == ["s4", "s5"]
+
+    def test_changes_across_multiple_backends(self):
+        """Mixed updates across backends still take the fast path if all are server-only."""
+        old = {
+            "b1": _backend("b1", [_server("s1")]),
+            "b2": _backend("b2", [_server("s2")], path_prefix="/y"),
+        }
+        new = {
+            "b1": _backend("b1", [_server("s1"), _server("s1b")]),
+            "b2": _backend("b2", [], path_prefix="/y"),
+        }
+        diff = _compute_backend_diff(old, new)
+        assert not diff.is_structural
+        assert [s.name for s in diff.servers_to_add["b1"]] == ["s1b"]
+        assert diff.servers_to_remove["b2"] == ["s2"]
+
+
+class TestTryApplyServersDynamically:
+    """Behavior of HAProxyApi.try_apply_servers_dynamically.
+
+    The contract: returns True only when the change was applied without a
+    reload. Otherwise the caller (HAProxyManager._apply_backend_update)
+    will perform a full reload.
+    """
+
+    def _make_api(self, backend_configs=None):
+        cfg = HAProxyConfig(
+            socket_path="/tmp/test.sock",
+            http_options=HTTPOptions(host="127.0.0.1", port=8000),
+        )
+        return HAProxyApi(cfg=cfg, backend_configs=backend_configs or {})
+
+    @pytest.mark.asyncio
+    async def test_first_time_setup_falls_back_to_reload(self):
+        """Initial broadcast (no prior state) must reload — there's no diff base."""
+        api = self._make_api(backend_configs={})
+        new_configs = {"b1": _backend("b1", [_server("s1")])}
+
+        applied = await api.try_apply_servers_dynamically(new_configs)
+        assert applied is False
+        # Internal state should have been updated so the subsequent reload
+        # uses the new configs.
+        assert api.backend_configs == new_configs
+
+    @pytest.mark.asyncio
+    async def test_haproxy_not_running_falls_back_to_reload(self):
+        """If HAProxy isn't up yet, the runtime API isn't available."""
+        api = self._make_api(backend_configs={"b1": _backend("b1", [_server("s1")])})
+        new_configs = {"b1": _backend("b1", [_server("s1"), _server("s2")])}
+
+        with patch.object(api, "is_running", AsyncMock(return_value=False)):
+            applied = await api.try_apply_servers_dynamically(new_configs)
+        assert applied is False
+        assert api.backend_configs == new_configs
+
+    @pytest.mark.asyncio
+    async def test_structural_change_falls_back_to_reload(self):
+        """New backend → caller must reload; we still update internal state."""
+        api = self._make_api(backend_configs={"b1": _backend("b1", [_server("s1")])})
+        new_configs = {
+            "b1": _backend("b1", [_server("s1")]),
+            "b2": _backend("b2", [_server("s3")]),
+        }
+
+        with patch.object(api, "is_running", AsyncMock(return_value=True)):
+            applied = await api.try_apply_servers_dynamically(new_configs)
+        assert applied is False
+        assert api.backend_configs == new_configs
+
+    @pytest.mark.asyncio
+    async def test_server_add_uses_runtime_api(self):
+        """Pure add → `add server` + `enable server` over the socket, no reload."""
+        api = self._make_api(backend_configs={"b1": _backend("b1", [_server("s1")])})
+        new_configs = {"b1": _backend("b1", [_server("s1"), _server("s2", port=8001)])}
+
+        socket_calls = []
+
+        async def fake_send(cmd):
+            socket_calls.append(cmd)
+            return ""
+
+        with patch.object(
+            api, "is_running", AsyncMock(return_value=True)
+        ), patch.object(
+            api, "_send_socket_command", side_effect=fake_send
+        ), patch.object(
+            api, "_generate_config_file_internal"
+        ):
+            applied = await api.try_apply_servers_dynamically(new_configs)
+
+        assert applied is True
+        # The exact commands matter — operators read these in HAProxy's logs.
+        assert any("add server b1/s2 10.0.0.1:8001" in c for c in socket_calls)
+        assert any("enable server b1/s2" in c for c in socket_calls)
+        assert not any("del server" in c for c in socket_calls)
+
+    @pytest.mark.asyncio
+    async def test_server_remove_uses_runtime_api(self):
+        """Pure remove → `disable server` + `del server` over the socket, no reload."""
+        api = self._make_api(
+            backend_configs={"b1": _backend("b1", [_server("s1"), _server("s2")])}
+        )
+        new_configs = {"b1": _backend("b1", [_server("s1")])}
+
+        socket_calls = []
+
+        async def fake_send(cmd):
+            socket_calls.append(cmd)
+            return ""
+
+        with patch.object(
+            api, "is_running", AsyncMock(return_value=True)
+        ), patch.object(
+            api, "_send_socket_command", side_effect=fake_send
+        ), patch.object(
+            api, "_generate_config_file_internal"
+        ):
+            applied = await api.try_apply_servers_dynamically(new_configs)
+
+        assert applied is True
+        # `disable` must precede `del` to handle in-flight connections cleanly.
+        disable_idx = next(
+            i for i, c in enumerate(socket_calls) if "disable server b1/s2" in c
+        )
+        del_idx = next(i for i, c in enumerate(socket_calls) if "del server b1/s2" in c)
+        assert disable_idx < del_idx
+
+    @pytest.mark.asyncio
+    async def test_socket_failure_falls_back_to_reload(self):
+        """If any runtime command fails, bail and let the caller reload.
+
+        This avoids leaving HAProxy in a half-applied state where some servers
+        were added but others weren't.
+        """
+        api = self._make_api(backend_configs={"b1": _backend("b1", [_server("s1")])})
+        new_configs = {"b1": _backend("b1", [_server("s1"), _server("s2")])}
+
+        with patch.object(
+            api, "is_running", AsyncMock(return_value=True)
+        ), patch.object(
+            api,
+            "_send_socket_command",
+            AsyncMock(side_effect=RuntimeError("socket closed")),
+        ):
+            applied = await api.try_apply_servers_dynamically(new_configs)
+        assert applied is False
+        # Internal state still updated so the fallback reload reflects truth.
+        assert api.backend_configs == new_configs
+
+
+if __name__ == "__main__":
+    import sys
+
+    sys.exit(pytest.main(["-v", "-s", __file__]))

From efbe04e589000a203cbff1495b25f9a594df2057 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 04:28:25 +0000
Subject: [PATCH 06/26] [serve] Validate HAProxy runtime-API responses; address
 Codex review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex review of the prior commit flagged three issues; this addresses
them.

(High) `_send_socket_command` returns HAProxy's response body verbatim,
including error strings — HAProxy conveys most command-level rejections
through the response (e.g. "[ALERT] (123) : 'add server' : Backend X
not found.") rather than via transport-level exceptions. The previous
apply path only caught transport exceptions, so it would treat a rejection
as success and skip the fallback reload, leaving live HAProxy state
diverged from `self.backend_configs`. Add `_send_runtime_command` which
wraps the send + checks the response for HAProxy's error markers
(log-level prefixes plus a small set of common error tokens) and raises
RuntimeError when any are present. Mutating commands in
`try_apply_servers_dynamically` use the new wrapper so application-level
rejections trigger the existing fallback-to-reload path.

(Low) Add `BackendConfig.name` to the structural-change comparison in
`_compute_backend_diff`. The current callers in this file always set
the dict key equal to the BackendConfig.name, but the helper itself
shouldn't assume that invariant — a name change has to reload because
it affects the frontend ACL labels and `use_backend` directives.

(Doc) Update the comment that claimed the apply loop "avoids leaving
HAProxy in a partially-updated state" — commands are sent sequentially
and partial state is live until the fallback reload converges things
back to the desired state. The fallback reload is what guarantees
consistency, not the abort-on-error.

Tests: add coverage for the rejection-via-response-string case (the
common one) and direct unit tests for `_send_runtime_command`'s
validator behavior.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py          | 70 +++++++++++++++++--
 .../ray/serve/tests/unit/test_haproxy_diff.py | 56 ++++++++++++++-
 2 files changed, 116 insertions(+), 10 deletions(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 49716050a653..06c7f56d3299 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -593,7 +593,12 @@ def _compute_backend_diff(
 
         # Any backend-level config change → reload. Compare the fields that
         # affect the rendered backend block (excluding the servers list).
+        # `name` is included defensively: callers in this file invariably
+        # use the dict key as the BackendConfig.name, but the helper itself
+        # shouldn't assume that invariant — a name change has to reload to
+        # update the frontend ACL labels and `use_backend` directives.
         backend_level_fields = (
+            "name",
             "path_prefix",
             "fallback_server",
             "app_name",
@@ -963,6 +968,54 @@ async def _send_socket_command(self, command: str) -> str:
         except Exception as e:
             raise RuntimeError(f"Failed to send socket command '{command}': {e}")
 
+    async def _send_runtime_command(self, command: str) -> str:
+        """Send an admin-socket command and treat error responses as failures.
+
+        HAProxy's admin socket conveys command failures by returning an
+        error string in the response body — not by closing the connection
+        or raising at the transport level. `_send_socket_command` returns
+        the body verbatim, so callers that only catch transport-level
+        exceptions (e.g. mutating commands like `add server` / `del server`)
+        can mistake a rejection for a success.
+
+        This wrapper inspects the response for the well-known error markers
+        HAProxy emits and raises RuntimeError when present, so the caller's
+        `try/except` reliably catches both transport failures and
+        application-level rejections.
+        """
+        result = await self._send_socket_command(command)
+        if not result:
+            return result
+        # HAProxy prefixes alert/warning/notice levels into the admin-socket
+        # response when a command is rejected (e.g.,
+        #   "[ALERT] (XYZ) : 'add server' : Backend X not found.")
+        # Plain string error messages without a prefix also occur — check
+        # for a small set of common error tokens. Any false-positive here
+        # is fine: the worst case is we trigger a fallback reload.
+        normalized = result.lower()
+        error_markers = (
+            "[alert]",
+            "[warning]",
+            "[notice]",
+            "no such server",
+            "no such backend",
+            "doesn't exist",
+            "does not exist",
+            "not found",
+            "not allowed",
+            "syntax error",
+            "invalid argument",
+            "unknown command",
+            "must be disabled",
+        )
+        for marker in error_markers:
+            if marker in normalized:
+                raise RuntimeError(
+                    f"HAProxy rejected runtime command '{command}': "
+                    f"{result.strip()}"
+                )
+        return result
+
     @staticmethod
     def _parse_haproxy_csv_stats(
         stats_output: str,
@@ -1125,18 +1178,21 @@ async def try_apply_servers_dynamically(
             self.set_backend_configs(new_backend_configs)
             return True
 
-        # Apply the diff via the admin socket. If any single command fails,
-        # bail out and let the caller reload — this avoids leaving HAProxy in
-        # a partially-updated state.
+        # Apply the diff via the admin socket. Commands are sent
+        # sequentially — if one fails partway through, the live HAProxy
+        # state is partially updated. The fallback reload that the caller
+        # performs after we return False rebuilds full state from
+        # self.backend_configs (which we update here) and converges
+        # everything back to the new desired state.
         try:
             for backend_name, server_names in diff.servers_to_remove.items():
                 for sname in server_names:
                     # Disable before delete: required when the server has
                     # in-flight connections; safe even when it doesn't.
-                    await self._send_socket_command(
+                    await self._send_runtime_command(
                         f"disable server {backend_name}/{sname}"
                     )
-                    await self._send_socket_command(
+                    await self._send_runtime_command(
                         f"del server {backend_name}/{sname}"
                     )
 
@@ -1145,12 +1201,12 @@ async def try_apply_servers_dynamically(
                     # Newly-added runtime servers inherit `default-server`
                     # directives (inter/fall/rise/etc.) from the backend
                     # block, so we only need address + the `check` flag.
-                    await self._send_socket_command(
+                    await self._send_runtime_command(
                         f"add server {backend_name}/{server.name} "
                         f"{server.host}:{server.port} check"
                     )
                     # Runtime-added servers start in MAINT state by default.
-                    await self._send_socket_command(
+                    await self._send_runtime_command(
                         f"enable server {backend_name}/{server.name}"
                     )
         except Exception as e:
diff --git a/python/ray/serve/tests/unit/test_haproxy_diff.py b/python/ray/serve/tests/unit/test_haproxy_diff.py
index bdb4028c8c2a..8eccfa147747 100644
--- a/python/ray/serve/tests/unit/test_haproxy_diff.py
+++ b/python/ray/serve/tests/unit/test_haproxy_diff.py
@@ -303,10 +303,11 @@ async def fake_send(cmd):
 
     @pytest.mark.asyncio
     async def test_socket_failure_falls_back_to_reload(self):
-        """If any runtime command fails, bail and let the caller reload.
+        """If any runtime command raises, bail and let the caller reload.
 
-        This avoids leaving HAProxy in a half-applied state where some servers
-        were added but others weren't.
+        HAProxy can fail at the transport level (socket closed, connection
+        refused, timeout). The caller's reload converges live state back
+        to self.backend_configs.
         """
         api = self._make_api(backend_configs={"b1": _backend("b1", [_server("s1")])})
         new_configs = {"b1": _backend("b1", [_server("s1"), _server("s2")])}
@@ -323,6 +324,55 @@ async def test_socket_failure_falls_back_to_reload(self):
         # Internal state still updated so the fallback reload reflects truth.
         assert api.backend_configs == new_configs
 
+    @pytest.mark.asyncio
+    async def test_socket_returns_error_string_falls_back_to_reload(self):
+        """HAProxy can reject commands by returning an error string (not raising).
+
+        The runtime API conveys most command-level failures as response
+        text (e.g. ``[ALERT] (123) : 'add server' : Backend X not found.``)
+        rather than transport-level exceptions. We must treat those
+        responses as failures or we'd silently skip a needed reload while
+        live HAProxy state diverges from our internal config.
+        """
+        api = self._make_api(backend_configs={"b1": _backend("b1", [_server("s1")])})
+        new_configs = {"b1": _backend("b1", [_server("s1"), _server("s2")])}
+
+        async def fake_send(cmd):
+            if cmd.startswith("add server"):
+                return "[ALERT] (123) : 'add server' : Backend b1 not found.\n"
+            return ""
+
+        with patch.object(
+            api, "is_running", AsyncMock(return_value=True)
+        ), patch.object(api, "_send_socket_command", side_effect=fake_send):
+            applied = await api.try_apply_servers_dynamically(new_configs)
+        assert applied is False
+        assert api.backend_configs == new_configs
+
+    @pytest.mark.asyncio
+    async def test_send_runtime_command_raises_on_error_string(self):
+        """Direct test of the response validator that backs the apply path."""
+        api = self._make_api()
+
+        async def fake_send(cmd):
+            return "[ALERT] command rejected\n"
+
+        with patch.object(api, "_send_socket_command", side_effect=fake_send):
+            with pytest.raises(RuntimeError, match="HAProxy rejected"):
+                await api._send_runtime_command("add server foo/bar 1.2.3.4:80")
+
+    @pytest.mark.asyncio
+    async def test_send_runtime_command_passes_empty_response(self):
+        """Successful HAProxy runtime commands return empty/whitespace."""
+        api = self._make_api()
+
+        async def fake_send(cmd):
+            return ""
+
+        with patch.object(api, "_send_socket_command", side_effect=fake_send):
+            result = await api._send_runtime_command("disable server foo/bar")
+        assert result == ""
+
 
 if __name__ == "__main__":
     import sys

From 596ebb47127a1c053e78729e7d129f821e296472 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 08:02:36 +0000
Subject: [PATCH 07/26] [serve] Recover from dead HAProxy in graceful reload
 path

When a prior reload's `-x admin.sock` socket transfer fails (admin
socket overloaded under autoscaling churn), the new HAProxy exits 1
before binding listeners. The actor's `self._proc` ends up pointing at
that dead process, and every subsequent `_apply_backend_update` raises
"HAProxy crashed during startup: exit code 1" from
`_wait_for_hap_availability(old_proc)` -- the actor is stuck with no
listener on 8000 until the controller kills it.

Detect `old_proc` already exited and start a fresh HAProxy without
`-sf`/`-x`. Drops in-flight connections briefly but lets the actor
return to a healthy state instead of staying down.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 06c7f56d3299..4689a1ec928a 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -749,6 +749,26 @@ async def _graceful_reload(self) -> None:
         """Perform a graceful reload of HAProxy by starting a new process with -sf."""
         try:
             old_proc = self._proc
+            # If the previous HAProxy process has already exited (e.g., a prior
+            # reload's `-x admin.sock` socket transfer failed and the new
+            # process exited 1), there's nothing to graceful-reload from: no
+            # listening FDs to inherit and no live PID to signal. Without this
+            # branch every subsequent reload would re-raise "HAProxy crashed
+            # during startup: exit code 1" from `_wait_for_hap_availability`
+            # below and the actor would be stuck with no listener on 8000
+            # until the controller kills it. Start a fresh process instead so
+            # we recover automatically; this drops in-flight connections
+            # briefly but is strictly better than staying down.
+            if old_proc is None or old_proc.returncode is not None:
+                logger.warning(
+                    "Previous HAProxy process is not running "
+                    "(returncode=%s); starting a fresh HAProxy instead of "
+                    "attempting a graceful reload.",
+                    old_proc.returncode if old_proc is not None else None,
+                )
+                self._proc = await self._start_and_wait_for_haproxy()
+                return
+
             await self._wait_for_hap_availability(old_proc)
 
             # Save server state if optimization is enabled

From ce527e4f255237aa32b0ead27cf23df0196096af Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 08:03:06 +0000
Subject: [PATCH 08/26] [serve] Coalesce HAProxy controller broadcasts into a
 single apply

Under autoscaling churn the Serve controller fires target-group /
fallback-target broadcasts tens of ms apart; without coalescing, each
one runs its own runtime-API command burst on the HAProxy admin
socket. The CLI mux serializes everything (runtime API, stats, the
`-x` socket transfer used by reload) so command-level timeouts and
`-x` failures cluster together once admin-socket pressure exceeds
HAProxy's processing rate.

Coalesce broadcasts in `HAProxyManager` into a single sleeping flush
task. Updates arriving during the sleep are absorbed implicitly via
in-place writes to `self._target_groups` / fallback fields; updates
during the apply re-arm a `_coalesce_pending` flag so the trailing
broadcast in a flurry isn't dropped.

Window is configurable via `RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S`
(default 0.5s, set 0 to disable). Trade-off: each broadcast incurs up
to one window of additional latency before reaching HAProxy, which is
small relative to typical replica-start time and well below the rate
at which admin-socket saturation begins.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/constants.py | 12 +++++
 python/ray/serve/_private/haproxy.py   | 74 ++++++++++++++++++++++----
 2 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index f7d3947e09c2..8e8f47802980 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -773,6 +773,18 @@
 # unreachable and a sibling can serve the request instead.
 RAY_SERVE_HAPROXY_RETRIES = int(os.environ.get("RAY_SERVE_HAPROXY_RETRIES", "3"))
 
+# Window during which incoming controller broadcasts (target_groups,
+# fallback_targets) are coalesced into a single backend update before being
+# applied to HAProxy. Under autoscaling churn the controller can fire
+# broadcasts tens of ms apart; without coalescing each one issues its own
+# runtime-API command burst on the admin socket, which saturates HAProxy's
+# CLI mux and causes timeouts (and `-x` socket-transfer failures during the
+# fallback reload). 0.2s collapses typical burst clusters into one diff.
+# Set to 0 to disable coalescing entirely (legacy behaviour).
+RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S = float(
+    os.environ.get("RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S", "0.5")
+)
+
 # Number of consecutive failed server health checks that must occur
 # before haproxy marks the server as down.
 RAY_SERVE_HAPROXY_HEALTH_CHECK_FALL = int(
diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 4689a1ec928a..f688753201fe 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -31,6 +31,7 @@
     RAY_SERVE_EXPERIMENTAL_PIP_HAPROXY,
     RAY_SERVE_HAPROXY_BALANCE_ALGORITHM,
     RAY_SERVE_HAPROXY_BINARY_PATH,
+    RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S,
     RAY_SERVE_HAPROXY_CONFIG_FILE_LOC,
     RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S,
     RAY_SERVE_HAPROXY_HEALTH_CHECK_DOWNINTER,
@@ -1311,6 +1312,20 @@ def __init__(
         # which can cause race conditions with SO_REUSEPORT
         self._reload_lock = asyncio.Lock()
 
+        # Coalesce controller broadcasts: when a broadcast arrives we schedule
+        # a single sleeping flush task. Subsequent broadcasts during the sleep
+        # are absorbed implicitly because they overwrite `self._target_groups`
+        # / fallback fields in place; the flush picks up the latest snapshot
+        # when it wakes. This collapses bursts of replica-add/remove events
+        # (common during autoscaling churn) into one runtime-API command pile,
+        # which keeps the HAProxy admin socket from saturating.
+        # `_coalesce_pending` lets the flush task notice broadcasts that
+        # arrive during a long-running apply and trigger another iteration
+        # so the trailing broadcast in a flurry isn't dropped.
+        self._coalesce_window_s = RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S
+        self._coalesce_task: Optional[asyncio.Task] = None
+        self._coalesce_pending = False
+
         self.long_poll_client = long_poll_client or LongPollClient(
             ray.get_actor(SERVE_CONTROLLER_NAME, namespace=SERVE_NAMESPACE),
             {
@@ -1568,7 +1583,8 @@ async def _apply_backend_update(
             if not applied_incrementally:
                 await self._haproxy.reload()
 
-    def _update_haproxy_backends(self) -> None:
+    def _build_name_to_backend_configs(self) -> Dict[str, BackendConfig]:
+        """Snapshot the current target groups + fallbacks into backend configs."""
         backend_configs = []
         for target_group in self._target_groups:
             fallback_target = None
@@ -1580,16 +1596,56 @@ def _update_haproxy_backends(self) -> None:
             backend_config = self._create_backend_config(target_group, fallback_target)
             backend_configs.append(backend_config)
 
-        logger.info(
-            f"Got updated backend configs: {backend_configs}.",
-            extra={"log_to_stderr": True},
-        )
+        return {bc.name: bc for bc in backend_configs}
 
-        name_to_backend_configs = {
-            backend_config.name: backend_config for backend_config in backend_configs
-        }
+    def _update_haproxy_backends(self) -> None:
+        # Schedule a coalesced flush. If a flush is already pending (sleeping
+        # or applying), set `_coalesce_pending` so the running task picks up
+        # our broadcast on the next loop iteration. With coalescing disabled
+        # (window=0), apply synchronously to preserve legacy behaviour.
+        if self._coalesce_window_s <= 0:
+            name_to_backend_configs = self._build_name_to_backend_configs()
+            logger.info(
+                f"Got updated backend configs: {list(name_to_backend_configs.values())}.",
+                extra={"log_to_stderr": True},
+            )
+            self.event_loop.create_task(
+                self._apply_backend_update(name_to_backend_configs)
+            )
+            return
+
+        self._coalesce_pending = True
+        if self._coalesce_task is None or self._coalesce_task.done():
+            self._coalesce_task = self.event_loop.create_task(
+                self._coalesce_and_apply()
+            )
+
+    async def _coalesce_and_apply(self) -> None:
+        """Drain pending broadcasts, applying each batch after a short sleep.
 
-        self.event_loop.create_task(self._apply_backend_update(name_to_backend_configs))
+        Broadcasts arriving during the sleep are absorbed because the
+        long-poll callbacks overwrite `self._target_groups` / fallback fields
+        in place — by the time we wake up, the snapshot already reflects them.
+        Broadcasts arriving during the apply are handled by the outer loop:
+        they re-set `_coalesce_pending`, and we iterate again.
+        """
+        try:
+            while self._coalesce_pending:
+                # Clear before sleep so any broadcast during sleep+apply
+                # re-arms the flag and triggers another iteration.
+                self._coalesce_pending = False
+                await asyncio.sleep(self._coalesce_window_s)
+                name_to_backend_configs = self._build_name_to_backend_configs()
+                logger.info(
+                    f"Got updated backend configs: {list(name_to_backend_configs.values())}.",
+                    extra={"log_to_stderr": True},
+                )
+                await self._apply_backend_update(name_to_backend_configs)
+        except Exception as e:
+            # Don't let an apply failure kill the coalescer. The next broadcast
+            # will schedule a fresh flush task; the underlying error is already
+            # logged by `_apply_backend_update` / `try_apply_servers_dynamically`.
+            logger.error(f"Coalesced backend apply failed: {e}")
 
     def update_target_groups(self, target_groups: List[TargetGroup]) -> None:
         self._target_groups = target_groups

From 4e972dad02f087b271b812efdcc0adbce2656a01 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 09:35:49 +0000
Subject: [PATCH 09/26] [serve] Free listener ports before HAProxy fresh-start
 recovery

The dead-old-proc fresh-start path didn't account for orphaned HAProxy
processes still holding ports 8000 / 9101 / 8404. When a prior reload's
`-x admin.sock` socket transfer fails, the new process exits without
sending SIGUSR1 to the old (the SIGUSR1 only fires after binding
succeeds), and the old process keeps its listening sockets fully open
indefinitely while the actor logs it as a finished old proc in
`_old_procs`. Subsequent fresh-start attempts then race against those
orphans for ports and crash with `cannot bind socket (Address already
in use)`, only recovering when the orphans naturally drain (tens of
seconds).

SIGKILL tracked old procs before the fresh-start bind so the new
process can take the ports immediately.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 34 ++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index f688753201fe..e917a5ab6036 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -746,6 +746,30 @@ async def _save_server_state(self) -> None:
         with open(self.cfg.server_state_file, "w") as f:
             f.write(server_state)
 
+    async def _terminate_old_procs(self) -> None:
+        """SIGKILL any old HAProxy procs we're still tracking.
+
+        Used during fresh-start recovery (when -sf/-x graceful reload isn't
+        possible) to free listener ports the orphans may still be holding.
+        Idempotent — already-dead procs are a no-op.
+        """
+        for old_proc in self._old_procs:
+            if old_proc.returncode is not None:
+                continue
+            try:
+                old_proc.kill()
+                await old_proc.wait()
+                logger.info(
+                    f"Killed orphaned HAProxy process (PID: {old_proc.pid}) "
+                    "to free listener ports for fresh start."
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to kill orphaned HAProxy process "
+                    f"(PID: {old_proc.pid}): {e}"
+                )
+        self._old_procs.clear()
+
     async def _graceful_reload(self) -> None:
         """Perform a graceful reload of HAProxy by starting a new process with -sf."""
         try:
@@ -767,6 +791,16 @@ async def _graceful_reload(self) -> None:
                     "attempting a graceful reload.",
                     old_proc.returncode if old_proc is not None else None,
                 )
+                # Kill any tracked old procs that may still be holding the
+                # listener ports. Under -x failures the prior reload never
+                # delivered SIGUSR1 to its predecessor, so processes appended
+                # to `_old_procs` can stay fully bound to ports 8000 / 9101 /
+                # 8404 indefinitely. A fresh-start without -sf/-x doesn't
+                # inherit anything, so it would otherwise hit EADDRINUSE on
+                # those ports until the orphans finish their natural drain
+                # (tens of seconds at minimum). SIGKILL them up front so the
+                # fresh process can bind immediately.
+                await self._terminate_old_procs()
                 self._proc = await self._start_and_wait_for_haproxy()
                 return
 

From aded8a7a514fd06b6bb8eafe738fef5d4d2b07ba Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 10:27:03 +0000
Subject: [PATCH 10/26] [serve] Probe HTTP listener for HAProxy health check,
 not admin socket

The controller's `check_health` was calling `is_running()`, which sends
`show info` over the HAProxy admin socket. Under load that socket is
shared with our runtime-API commands and the `-x` reload FD transfer,
and saturates: `show info` exceeds the 5s `_send_socket_command`
timeout, `is_running()` returns False, the controller counts it as a
missed health check, and after 3 consecutive misses (~30-50s) the
controller marks the proxy unhealthy and kills the actor. The killed
actor's HAProxy child gets orphaned, holds listener ports, and the
replacement actor's HAProxy crashes with EADDRINUSE -- compounding
the cascade we were trying to avoid.

Replace the admin-socket check with a direct probe of the HAProxy HTTP
listener at `/-/healthz`. The listener is served by worker threads
independent of the CLI mux, so it stays responsive when admin socket
is saturated. Matches what the upstream ALB target group health check
sees, so proxy self-reported health aligns with load-balancer view.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 48 +++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index e917a5ab6036..ab51d22861f6 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -1520,9 +1520,55 @@ async def check_health(self) -> bool:
         # If haproxy is already shutdown, return False.
         if not self._haproxy or not self._haproxy._proc:
             return False
+        if self._haproxy._proc.returncode is not None:
+            return False
 
         logger.debug("Received health check.", extra={"log_to_stderr": False})
-        return await self._haproxy.is_running()
+        return await self._probe_http_listener()
+
+    async def _probe_http_listener(self) -> bool:
+        """Probe the HAProxy HTTP listener via /-/healthz.
+
+        The controller's `check_health` previously called `is_running()`,
+        which sends `show info` over the HAProxy admin socket. Under load
+        that socket also carries our runtime-API commands and HAProxy's
+        `-x` reload FD transfer, and saturates: a benign `show info` can
+        take >5s to return, which falsely flags the proxy as unhealthy
+        and triggers the controller to kill+recreate it. The kill orphans
+        the HAProxy child (Ray actor death doesn't cascade to subprocesses),
+        whose listener ports then block the new actor's HAProxy from
+        binding -- compounding the original cascade.
+
+        Probing the HTTP listener directly bypasses the admin socket. The
+        listener is served by HAProxy's worker threads, which are
+        independent of the CLI mux that handles socket commands, so it
+        stays responsive even when admin socket is saturated. This matches
+        what the AWS ALB target group health check sees, so the proxy's
+        self-reported health aligns with what the upstream load balancer
+        will route to.
+        """
+        try:
+            reader, writer = await asyncio.wait_for(
+                asyncio.open_connection("127.0.0.1", self._http_options.port),
+                timeout=2.0,
+            )
+        except Exception:
+            return False
+        try:
+            writer.write(
+                b"GET /-/healthz HTTP/1.0\r\nHost: localhost\r\nConnection: close\r\n\r\n"
+            )
+            await writer.drain()
+            response = await asyncio.wait_for(reader.read(256), timeout=2.0)
+            return b"HTTP/1.0 200" in response or b"HTTP/1.1 200" in response
+        except Exception:
+            return False
+        finally:
+            try:
+                writer.close()
+                await writer.wait_closed()
+            except Exception:
+                pass
 
     def pong(self) -> str:
         pass

From 55f6fb9cc5eeb57d895fd7004e3b77d199f45543 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 11:08:24 +0000
Subject: [PATCH 11/26] [serve] Stop using admin socket for HAProxy reload
 readiness check

`_wait_for_hap_availability` polled `is_running()` (which sends
`show info` over the HAProxy admin socket) to decide whether the new
HAProxy process had finished startup. Under load the admin socket is
shared with runtime-API commands and the `-x` FD transfer, so a benign
`show info` can take >5s to return. With the 15s reload timeout, only
a few iterations get a chance to succeed; if all of them hit a slow
admin socket we abort the reload as "did not enter running state
within 15 seconds" -- even though the new HAProxy is actually serving.

Switch the readiness probe to a direct HTTP HEAD/GET to /-/healthz on
the frontend listener. The listener is independent of the CLI mux and
stays responsive when the admin socket is saturated. Crash detection
still uses `proc.returncode` polling, so a genuinely failed startup
is still caught quickly.

Also make `_save_server_state` failures non-fatal in `_graceful_reload`.
The state file is purely an optimization for preserving per-server
connection counts across reload; a `show servers state` admin-socket
timeout used to abort the entire reload, which then cascaded as the
next broadcast retried with the same overloaded socket.

Refactor the HTTP probe into `HAProxyApi.probe_http_listener` so both
`HAProxyManager.check_health` (controller-facing) and
`_wait_for_hap_availability` (internal reload) share one implementation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 115 +++++++++++++++------------
 1 file changed, 66 insertions(+), 49 deletions(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index ab51d22861f6..17741b1b359e 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -806,9 +806,21 @@ async def _graceful_reload(self) -> None:
 
             await self._wait_for_hap_availability(old_proc)
 
-            # Save server state if optimization is enabled
+            # Save server state if optimization is enabled. Treat failures
+            # as non-fatal: the state file is purely an optimization (lets
+            # the new HAProxy preserve per-server connection counts across
+            # reload), and a stale or missing file just causes HAProxy to
+            # start counters from zero. A `show servers state` admin-socket
+            # timeout here under load used to abort the entire reload --
+            # which then cascaded since the next broadcast retried with
+            # the same overloaded socket.
             if self.cfg.enable_hap_optimization:
-                await self._save_server_state()
+                try:
+                    await self._save_server_state()
+                except Exception as e:
+                    logger.warning(
+                        f"Skipping HAProxy server state save before reload: {e}"
+                    )
 
             # Start new HAProxy process with -sf flag to gracefully take over from old process
             # Use -x socket transfer for seamless reloads if optimization is enabled
@@ -829,6 +841,46 @@ async def _graceful_reload(self) -> None:
             logger.error(f"HAProxy graceful reload failed: {e}")
             raise
 
+    async def probe_http_listener(self) -> bool:
+        """Probe the HAProxy HTTP listener via /-/healthz.
+
+        Returns True if HAProxy is serving 200 responses on its frontend
+        port, False otherwise. Used as a readiness signal that does NOT
+        depend on the admin socket -- the admin socket is shared with our
+        runtime-API commands and the `-x` reload FD transfer, and saturates
+        under autoscaling churn. The HTTP listener is served by HAProxy
+        worker threads independent of the CLI mux, so it stays responsive
+        when the admin socket is busy.
+
+        During graceful reload (where the old process is still bound to the
+        same port) this probe may be answered by either the old or the new
+        process; that's acceptable for readiness purposes because either
+        way the port is serving traffic. Crash detection is handled
+        separately by polling `proc.returncode`.
+        """
+        try:
+            reader, writer = await asyncio.wait_for(
+                asyncio.open_connection("127.0.0.1", self.cfg.frontend_port),
+                timeout=2.0,
+            )
+        except Exception:
+            return False
+        try:
+            writer.write(
+                b"GET /-/healthz HTTP/1.0\r\nHost: localhost\r\nConnection: close\r\n\r\n"
+            )
+            await writer.drain()
+            response = await asyncio.wait_for(reader.read(256), timeout=2.0)
+            return b"HTTP/1.0 200" in response or b"HTTP/1.1 200" in response
+        except Exception:
+            return False
+        finally:
+            try:
+                writer.close()
+                await writer.wait_closed()
+            except Exception:
+                pass
+
     async def _wait_for_hap_availability(
         self,
         proc: asyncio.subprocess.Process,
@@ -836,7 +888,6 @@ async def _wait_for_hap_availability(
     ) -> None:
         start_time = time.time()
 
-        # TODO: update this to use health checks
         while time.time() - start_time < timeout_s:
             if proc.returncode is not None:
                 stdout = await proc.stdout.read() if proc.stdout else b""
@@ -850,7 +901,12 @@ async def _wait_for_hap_availability(
                     f"HAProxy crashed during startup: {output or f'exit code {proc.returncode}'}"
                 )
 
-            if await self.is_running():
+            # Probe the HTTP listener instead of the admin socket. The admin
+            # socket gets saturated under runtime-API load and a benign
+            # `show info` can take >5s to return, falsely flagging a healthy
+            # HAProxy as not-yet-running. The HTTP listener is independent
+            # of the CLI mux and stays responsive.
+            if await self.probe_http_listener():
                 return
 
             await asyncio.sleep(0.5)
@@ -1524,51 +1580,12 @@ async def check_health(self) -> bool:
             return False
 
         logger.debug("Received health check.", extra={"log_to_stderr": False})
-        return await self._probe_http_listener()
-
-    async def _probe_http_listener(self) -> bool:
-        """Probe the HAProxy HTTP listener via /-/healthz.
-
-        The controller's `check_health` previously called `is_running()`,
-        which sends `show info` over the HAProxy admin socket. Under load
-        that socket also carries our runtime-API commands and HAProxy's
-        `-x` reload FD transfer, and saturates: a benign `show info` can
-        take >5s to return, which falsely flags the proxy as unhealthy
-        and triggers the controller to kill+recreate it. The kill orphans
-        the HAProxy child (Ray actor death doesn't cascade to subprocesses),
-        whose listener ports then block the new actor's HAProxy from
-        binding -- compounding the original cascade.
-
-        Probing the HTTP listener directly bypasses the admin socket. The
-        listener is served by HAProxy's worker threads, which are
-        independent of the CLI mux that handles socket commands, so it
-        stays responsive even when admin socket is saturated. This matches
-        what the AWS ALB target group health check sees, so the proxy's
-        self-reported health aligns with what the upstream load balancer
-        will route to.
-        """
-        try:
-            reader, writer = await asyncio.wait_for(
-                asyncio.open_connection("127.0.0.1", self._http_options.port),
-                timeout=2.0,
-            )
-        except Exception:
-            return False
-        try:
-            writer.write(
-                b"GET /-/healthz HTTP/1.0\r\nHost: localhost\r\nConnection: close\r\n\r\n"
-            )
-            await writer.drain()
-            response = await asyncio.wait_for(reader.read(256), timeout=2.0)
-            return b"HTTP/1.0 200" in response or b"HTTP/1.1 200" in response
-        except Exception:
-            return False
-        finally:
-            try:
-                writer.close()
-                await writer.wait_closed()
-            except Exception:
-                pass
+        # Probe the HTTP listener instead of the admin socket. See
+        # HAProxyApi.probe_http_listener for the rationale -- briefly: the
+        # admin socket competes with runtime-API load, and a saturated
+        # admin socket would otherwise cause the controller to kill the
+        # actor and trigger a reload-orphan-port cascade.
+        return await self._haproxy.probe_http_listener()
 
     def pong(self) -> str:
         pass

From 6790359920b308a7f6f50d9c4550624b081a4327 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 11:15:42 +0000
Subject: [PATCH 12/26] [serve] Accept any HTTP response in HAProxy listener
 probe

The previous commit only treated `HTTP/1.x 200` as alive, which broke
two legitimate states where HAProxy is fully functioning but /-/healthz
returns 503:

- Drain mode: `disable()` flips `pass_health_checks=False` so
  /-/healthz returns 503 to signal the upstream LB to stop routing.
  With strict-200, `check_health` returned False during drain and the
  controller would mark the proxy UNHEALTHY, killing a draining proxy
  and triggering the orphan-port cascade we just fixed elsewhere.

- Initial startup: when `has_received_routes=False` (before the first
  controller broadcast), /-/healthz returns 503. Strict-200 made
  `_wait_for_hap_availability` time out during initial `start()`, so
  the actor never reached HEALTHY.

The signal we want from the probe is "HAProxy is alive and parsing
HTTP", not "HAProxy says traffic is healthy". Match the old
admin-socket `is_running()` behavior, which returned True in both
above states.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 17741b1b359e..8e363e718320 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -871,7 +871,15 @@ async def probe_http_listener(self) -> bool:
             )
             await writer.drain()
             response = await asyncio.wait_for(reader.read(256), timeout=2.0)
-            return b"HTTP/1.0 200" in response or b"HTTP/1.1 200" in response
+            # Accept any HTTP response, not just 200. The signal we want is
+            # "HAProxy is alive and parsing HTTP", not "HAProxy says traffic
+            # is healthy". /-/healthz legitimately returns 503 in two cases:
+            #   - drain mode (disable() sets pass_health_checks=False to
+            #     signal upstream LB to stop routing).
+            #   - initial startup before has_received_routes is True.
+            # In both states HAProxy is functioning and admin-socket-based
+            # `is_running()` would return True; we match that behavior.
+            return response.startswith(b"HTTP/")
         except Exception:
             return False
         finally:

From 50ef8cdc704c2c45376916cb8712978687bfd479 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 12:12:44 +0000
Subject: [PATCH 13/26] [serve] Use TCP-only probe for HAProxy readiness, HTTP
 for health

The HTTP probe takes up to 4s per attempt (2s connect + 2s read), which
exhausted the 5s startup budget in 1-2 iterations under load and made
initial HAProxy startup fail with "did not enter running state within
5 seconds". Even though HAProxy was bound to its port, its worker
threads were busy replaying the server-state file and dispatching
health-check setup for hundreds of servers -- so the first HTTP
request to /-/healthz could take a few hundred ms to be processed,
and the cumulative wait blew the budget.

Split the probe in two:

- `_is_listener_bound()`: TCP open + close, 0.5s timeout. Used inside
  `_wait_for_hap_availability` for startup/reload readiness. Iterates
  fast (sub-millisecond when port not yet bound, sub-millisecond when
  bound) so we get many attempts per second within the budget. The
  signal "kernel accepted my SYN" is sufficient to know HAProxy
  finished binding listeners.

- `probe_http_listener()`: full HTTP GET, kept for the controller's
  `check_health`. Runs every ~10s on stable proxies where HAProxy is
  long past startup, so the per-attempt cost doesn't matter. Verifies
  HAProxy is actually parsing HTTP, which is a stronger signal than
  TCP-bound for ongoing health.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 67 ++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 19 deletions(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 8e363e718320..91f8081b9b5e 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -841,22 +841,51 @@ async def _graceful_reload(self) -> None:
             logger.error(f"HAProxy graceful reload failed: {e}")
             raise
 
+    async def _is_listener_bound(self) -> bool:
+        """Quick TCP-only check: is HAProxy accepting connections on its port?
+
+        Used for readiness detection inside `_wait_for_hap_availability`.
+        We deliberately don't send an HTTP request here -- under startup
+        load (large config, many backends, server-state file replay)
+        HAProxy can be bound but momentarily slow to dispatch the first
+        HTTP requests to its worker threads, which would burn our entire
+        5s startup budget on a single read timeout. TCP-bound is the
+        right signal for "the listener is up"; HTTP-level responsiveness
+        follows within milliseconds and is verified separately by the
+        controller's health check via `probe_http_listener`.
+        """
+        try:
+            _, writer = await asyncio.wait_for(
+                asyncio.open_connection("127.0.0.1", self.cfg.frontend_port),
+                timeout=0.5,
+            )
+        except Exception:
+            return False
+        try:
+            writer.close()
+            await writer.wait_closed()
+        except Exception:
+            pass
+        return True
+
     async def probe_http_listener(self) -> bool:
         """Probe the HAProxy HTTP listener via /-/healthz.
 
-        Returns True if HAProxy is serving 200 responses on its frontend
-        port, False otherwise. Used as a readiness signal that does NOT
-        depend on the admin socket -- the admin socket is shared with our
-        runtime-API commands and the `-x` reload FD transfer, and saturates
-        under autoscaling churn. The HTTP listener is served by HAProxy
-        worker threads independent of the CLI mux, so it stays responsive
-        when the admin socket is busy.
-
-        During graceful reload (where the old process is still bound to the
-        same port) this probe may be answered by either the old or the new
-        process; that's acceptable for readiness purposes because either
-        way the port is serving traffic. Crash detection is handled
-        separately by polling `proc.returncode`.
+        Returns True if HAProxy is parsing HTTP on its frontend port,
+        False otherwise. Used by the controller's health check, which
+        runs on a stable proxy and needs to differentiate "alive and
+        serving" from "bound but stuck". Does NOT depend on the admin
+        socket -- the admin socket is shared with our runtime-API
+        commands and the `-x` reload FD transfer, and saturates under
+        autoscaling churn. The HTTP listener is served by HAProxy worker
+        threads independent of the CLI mux, so it stays responsive when
+        the admin socket is busy.
+
+        During graceful reload (where the old process is still bound to
+        the same port) this probe may be answered by either the old or
+        the new process; that's acceptable for readiness purposes because
+        either way the port is serving traffic. Crash detection is
+        handled separately by polling `proc.returncode`.
         """
         try:
             reader, writer = await asyncio.wait_for(
@@ -909,12 +938,12 @@ async def _wait_for_hap_availability(
                     f"HAProxy crashed during startup: {output or f'exit code {proc.returncode}'}"
                 )
 
-            # Probe the HTTP listener instead of the admin socket. The admin
-            # socket gets saturated under runtime-API load and a benign
-            # `show info` can take >5s to return, falsely flagging a healthy
-            # HAProxy as not-yet-running. The HTTP listener is independent
-            # of the CLI mux and stays responsive.
-            if await self.probe_http_listener():
+            # TCP-only check: is the listener accepting connections? We
+            # don't send an HTTP request here because the per-iteration
+            # cost would burn our startup budget. TCP-bound is the right
+            # signal for "HAProxy is up"; the controller's check_health
+            # uses the HTTP probe to verify ongoing serving capability.
+            if await self._is_listener_bound():
                 return
 
             await asyncio.sleep(0.5)

From f1f692b10e80bcd664231a74e5c4684d15ab1767 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 12:19:14 +0000
Subject: [PATCH 14/26] [serve] Bump default HAProxy startup timeout to match
 reload timeout

`_start_and_wait_for_haproxy` had a hardcoded 5s timeout, applied to
both initial actor startup (via `start()`) and graceful reload (via
`_graceful_reload`). With large configs (many backends + large
server-state file replay) HAProxy can need more than 5s to reach the
bind step, especially on hosts under CPU pressure from a cascade
already in progress -- which manifests as "HAProxy did not enter
running state within 5 seconds" right when we need recovery to be
robust.

Default to `RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S` (15s) to match the
timeout already used elsewhere in the reload path. Belt-and-suspenders
alongside the TCP-only readiness probe: the probe makes per-attempt
cost cheap, the larger budget tolerates genuinely slow startup.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 91f8081b9b5e..e63880da7023 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -709,7 +709,9 @@ def _is_running(self) -> bool:
         return self._proc is not None and self._proc.returncode is None
 
     async def _start_and_wait_for_haproxy(
-        self, *extra_args: str, timeout_s: int = 5
+        self,
+        *extra_args: str,
+        timeout_s: int = RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S,
     ) -> asyncio.subprocess.Process:
         # Build command args
         haproxy_bin = get_haproxy_binary()

From 884ccbcb6a5d0a6f94ac37907349d87c69b5e484 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 18:46:27 +0000
Subject: [PATCH 15/26] [serve] Add timing instrumentation for HAProxy reload
 phases

Reloads can fail with "did not enter running state within N seconds"
without telling us where the time went. Add structured timing to the
graceful reload path so failures expose the bottleneck:

- `_start_and_wait_for_haproxy`: logs spawn vs wait_for_ready time and
  the args used (visible whether `-sf`/`-x` are involved).
- `_graceful_reload`: logs each phase (old_proc_check, save_state,
  new_proc_start) with cumulative duration on success and failure.
- `_wait_for_hap_availability`: logs probe-attempt count on success
  (debug) and includes attempt count in the timeout error.

After deploy, "HAProxy graceful reload OK in 12.34s (old_proc_check=0.05s
save_state=2.10s new_proc_start=10.19s)" or the corresponding failure
message tells you exactly which phase is slow.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 69 +++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 6 deletions(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index e63880da7023..3fa1973d4064 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -725,21 +725,40 @@ async def _start_and_wait_for_haproxy(
 
         logger.debug(f"Starting HAProxy with args: {args}")
 
+        spawn_start = time.monotonic()
         proc = await asyncio.create_subprocess_exec(
             *args,
             stdout=asyncio.subprocess.PIPE,
             stderr=asyncio.subprocess.PIPE,
         )
+        spawn_elapsed = time.monotonic() - spawn_start
 
+        wait_start = time.monotonic()
         try:
             await self._wait_for_hap_availability(proc, timeout_s=timeout_s)
         except Exception:
+            wait_elapsed = time.monotonic() - wait_start
+            logger.warning(
+                "HAProxy start failed (PID=%s): spawn=%.2fs wait=%.2fs args=%s",
+                proc.pid,
+                spawn_elapsed,
+                wait_elapsed,
+                extra_args,
+            )
             # If startup fails, ensure the process is killed to avoid orphaned processes
             if proc.returncode is None:
                 proc.kill()
                 await proc.wait()
             raise
 
+        wait_elapsed = time.monotonic() - wait_start
+        logger.info(
+            "HAProxy started (PID=%s): spawn=%.2fs wait=%.2fs args=%s",
+            proc.pid,
+            spawn_elapsed,
+            wait_elapsed,
+            extra_args,
+        )
         return proc
 
     async def _save_server_state(self) -> None:
@@ -774,6 +793,8 @@ async def _terminate_old_procs(self) -> None:
 
     async def _graceful_reload(self) -> None:
         """Perform a graceful reload of HAProxy by starting a new process with -sf."""
+        reload_start = time.monotonic()
+        phases: List[str] = []
         try:
             old_proc = self._proc
             # If the previous HAProxy process has already exited (e.g., a prior
@@ -802,11 +823,24 @@ async def _graceful_reload(self) -> None:
                 # those ports until the orphans finish their natural drain
                 # (tens of seconds at minimum). SIGKILL them up front so the
                 # fresh process can bind immediately.
+                t0 = time.monotonic()
                 await self._terminate_old_procs()
+                phases.append(f"terminate_old={time.monotonic() - t0:.2f}s")
+
+                t0 = time.monotonic()
                 self._proc = await self._start_and_wait_for_haproxy()
+                phases.append(f"fresh_start={time.monotonic() - t0:.2f}s")
+
+                logger.info(
+                    "HAProxy fresh-start reload OK in %.2fs (%s)",
+                    time.monotonic() - reload_start,
+                    " ".join(phases),
+                )
                 return
 
+            t0 = time.monotonic()
             await self._wait_for_hap_availability(old_proc)
+            phases.append(f"old_proc_check={time.monotonic() - t0:.2f}s")
 
             # Save server state if optimization is enabled. Treat failures
             # as non-fatal: the state file is purely an optimization (lets
@@ -817,9 +851,12 @@ async def _graceful_reload(self) -> None:
             # which then cascaded since the next broadcast retried with
             # the same overloaded socket.
             if self.cfg.enable_hap_optimization:
+                t0 = time.monotonic()
                 try:
                     await self._save_server_state()
+                    phases.append(f"save_state={time.monotonic() - t0:.2f}s")
                 except Exception as e:
+                    phases.append(f"save_state_skipped={time.monotonic() - t0:.2f}s")
                     logger.warning(
                         f"Skipping HAProxy server state save before reload: {e}"
                     )
@@ -830,17 +867,26 @@ async def _graceful_reload(self) -> None:
             if self.cfg.enable_hap_optimization:
                 reload_args.extend(["-x", self.cfg.socket_path])
 
+            t0 = time.monotonic()
             self._proc = await self._start_and_wait_for_haproxy(*reload_args)
+            phases.append(f"new_proc_start={time.monotonic() - t0:.2f}s")
 
             # Track old process so we can ensure it's cleaned up during shutdown
             if old_proc is not None:
                 self._old_procs.append(old_proc)
 
             logger.info(
-                "Successfully performed graceful HAProxy reload with process restart."
+                "HAProxy graceful reload OK in %.2fs (%s)",
+                time.monotonic() - reload_start,
+                " ".join(phases),
             )
         except Exception as e:
-            logger.error(f"HAProxy graceful reload failed: {e}")
+            logger.error(
+                "HAProxy graceful reload failed after %.2fs (%s): %s",
+                time.monotonic() - reload_start,
+                " ".join(phases) if phases else "no_phases_completed",
+                e,
+            )
             raise
 
     async def _is_listener_bound(self) -> bool:
@@ -925,9 +971,11 @@ async def _wait_for_hap_availability(
         proc: asyncio.subprocess.Process,
         timeout_s: int = RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S,
     ) -> None:
-        start_time = time.time()
+        start_time = time.monotonic()
+        iterations = 0
 
-        while time.time() - start_time < timeout_s:
+        while time.monotonic() - start_time < timeout_s:
+            iterations += 1
             if proc.returncode is not None:
                 stdout = await proc.stdout.read() if proc.stdout else b""
                 stderr = await proc.stderr.read() if proc.stderr else b""
@@ -937,7 +985,10 @@ async def _wait_for_hap_availability(
                 )
 
                 raise RuntimeError(
-                    f"HAProxy crashed during startup: {output or f'exit code {proc.returncode}'}"
+                    f"HAProxy crashed during startup after "
+                    f"{time.monotonic() - start_time:.2f}s, "
+                    f"{iterations} probe attempts: "
+                    f"{output or f'exit code {proc.returncode}'}"
                 )
 
             # TCP-only check: is the listener accepting connections? We
@@ -946,12 +997,18 @@ async def _wait_for_hap_availability(
             # signal for "HAProxy is up"; the controller's check_health
             # uses the HTTP probe to verify ongoing serving capability.
             if await self._is_listener_bound():
+                logger.debug(
+                    "HAProxy listener bound after %.2fs (%d probe attempts)",
+                    time.monotonic() - start_time,
+                    iterations,
+                )
                 return
 
             await asyncio.sleep(0.5)
 
         raise RuntimeError(
-            f"HAProxy did not enter running state within {timeout_s} seconds."
+            f"HAProxy did not enter running state within {timeout_s} seconds "
+            f"({iterations} probe attempts)."
         )
 
     def _generate_config_file_internal(self) -> None:

From 5e9815f4ca038929c4524e3b3df7887097b780e8 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Fri, 8 May 2026 18:56:05 +0000
Subject: [PATCH 16/26] [serve] Split HAProxy admin sockets and skip startup
 DNS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two independent quick wins to reduce HAProxy reload latency, both via
the rendered config:

1. Split the admin socket into two:
   - `<socket>.fd` (`expose-fd listeners`) — dedicated to the `-x`
     FD-transfer during graceful reload.
   - `<socket>` — runtime API (`add server`, `del server`, `show stat`)
     and stats. No `expose-fd listeners`.

   Previously a single socket carried both. With nbthread > 1 HAProxy
   parallelizes CLI connections across threads, but the `_getsocks`
   exchange during reload still has to share the same connection-slot
   pool with our runtime-API command stream. Splitting them keeps the
   FD-transfer socket idle except during reload, so reloads don't
   queue behind in-flight `add server` commands.

   `transfer_socket_path` is derived deterministically from
   `socket_path` (`.fd` suffix), so existing per-node path scoping
   carries over. `-x` now points at the transfer socket;
   `_send_socket_command` continues to use `socket_path`.

2. Add `default-server init-addr last,libc,none` to the defaults
   block. HAProxy's default `init-addr` order consults libc DNS even
   for IP-literal servers, which can stall startup for seconds when a
   config has thousands of server entries. `last,libc,none` reuses
   the previous process's resolved address (free via the server-state
   file), only consults libc if needed, and accepts "no IP yet" rather
   than failing startup -- so HAProxy reaches its bind step
   immediately.

Updates the rendered-config golden in `test_haproxy_api.py` to match.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py          | 17 +++++++++++++-
 .../ray/serve/_private/haproxy_templates.py   | 22 ++++++++++++++++++-
 python/ray/serve/tests/test_haproxy_api.py    | 22 ++++++++++++++++++-
 3 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 3fa1973d4064..e28cce0efda7 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -459,6 +459,18 @@ class HAProxyConfig:
 
     is_head: bool = False
 
+    @property
+    def transfer_socket_path(self) -> str:
+        """Path of the dedicated FD-transfer admin socket.
+
+        Separate from `socket_path` so that the `-x` socket transfer during
+        reload (which goes through HAProxy's CLI mux) doesn't have to
+        compete with our runtime-API command stream (`add server`,
+        `del server`, etc.). Derived deterministically from `socket_path`
+        so existing per-node path scoping carries over automatically.
+        """
+        return self.socket_path + ".fd"
+
     @property
     def frontend_host(self) -> str:
         if self.http_options.host is None or self.http_options.host == "0.0.0.0":
@@ -865,7 +877,10 @@ async def _graceful_reload(self) -> None:
             # Use -x socket transfer for seamless reloads if optimization is enabled
             reload_args = ["-sf", str(old_proc.pid)]
             if self.cfg.enable_hap_optimization:
-                reload_args.extend(["-x", self.cfg.socket_path])
+                # Use the dedicated FD-transfer socket so `_getsocks` doesn't
+                # queue behind in-flight runtime-API commands on the main
+                # admin socket.
+                reload_args.extend(["-x", self.cfg.transfer_socket_path])
 
             t0 = time.monotonic()
             self._proc = await self._start_and_wait_for_haproxy(*reload_args)
diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index 3e7db9313a7a..ce1211b79e87 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -23,7 +23,18 @@
     # Log to the standard system log socket with debug level.
     log /dev/log local0 debug
     log 127.0.0.1:{{ config.syslog_port }} local0 debug
-    stats socket {{ config.socket_path }} mode 666 level admin expose-fd listeners
+    # Dedicated socket for the `-x admin.sock` FD-transfer during graceful
+    # reload. Carries `expose-fd listeners` so the new HAProxy can fetch
+    # listener FDs from it, but is NOT used for runtime-API commands.
+    # Keeping FD transfer on its own socket prevents the `_getsocks`
+    # exchange from queueing behind add/del/disable/enable server commands
+    # during a reload.
+    stats socket {{ config.transfer_socket_path }} mode 666 level admin expose-fd listeners
+    # Runtime-API + stats socket. Used by the proxy actor for
+    # `add server`, `del server`, `show stat`, etc. No `expose-fd
+    # listeners` here — FD transfer is reserved for the dedicated socket
+    # above so the two streams don't contend on the same connection slot.
+    stats socket {{ config.socket_path }} mode 666 level admin
     stats timeout 30s
     maxconn {{ config.maxconn }}
     nbthread {{ config.nbthread }}
@@ -37,6 +48,15 @@
 defaults
     mode http
     option log-health-checks
+    # Avoid blocking on libc DNS resolution at HAProxy startup. With many
+    # backends, the cumulative resolver wait can be seconds even when
+    # every server address is already an IP literal -- HAProxy still
+    # consults libc as part of `init-addr`'s default order. The order
+    # below tries the previous process's resolved address first (instant
+    # via the server-state file), falls back to libc only if needed, and
+    # accepts "no IP yet" rather than failing startup. Servers come up
+    # immediately and DNS resolution happens lazily in the background.
+    default-server init-addr last,libc,none
     retries {{ config.retries }}
     {% if config.timeout_connect_s is not none %}timeout connect {{ config.timeout_connect_s }}s{% endif %}
     {% if config.timeout_client_s is not none %}timeout client {{ config.timeout_client_s }}s{% endif %}
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index 81c2cd9ef534..9aa50f96fb16 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -268,7 +268,18 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     # Log to the standard system log socket with debug level.
     log /dev/log local0 debug
     log 127.0.0.1:514 local0 debug
-    stats socket {socket_path} mode 666 level admin expose-fd listeners
+    # Dedicated socket for the `-x admin.sock` FD-transfer during graceful
+    # reload. Carries `expose-fd listeners` so the new HAProxy can fetch
+    # listener FDs from it, but is NOT used for runtime-API commands.
+    # Keeping FD transfer on its own socket prevents the `_getsocks`
+    # exchange from queueing behind add/del/disable/enable server commands
+    # during a reload.
+    stats socket {socket_path}.fd mode 666 level admin expose-fd listeners
+    # Runtime-API + stats socket. Used by the proxy actor for
+    # `add server`, `del server`, `show stat`, etc. No `expose-fd
+    # listeners` here — FD transfer is reserved for the dedicated socket
+    # above so the two streams don't contend on the same connection slot.
+    stats socket {socket_path} mode 666 level admin
     stats timeout 30s
     maxconn 1000
     nbthread 2
@@ -278,6 +289,15 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
 defaults
     mode http
     option log-health-checks
+    # Avoid blocking on libc DNS resolution at HAProxy startup. With many
+    # backends, the cumulative resolver wait can be seconds even when
+    # every server address is already an IP literal -- HAProxy still
+    # consults libc as part of `init-addr`'s default order. The order
+    # below tries the previous process's resolved address first (instant
+    # via the server-state file), falls back to libc only if needed, and
+    # accepts "no IP yet" rather than failing startup. Servers come up
+    # immediately and DNS resolution happens lazily in the background.
+    default-server init-addr last,libc,none
     retries 3
     timeout connect 5s
     timeout client 30s

From ab4ab5cd4368b51f182da98821756a4d5cb60259 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Mon, 11 May 2026 04:57:48 +0000
Subject: [PATCH 17/26] [serve] Batch HAProxy runtime-API commands onto single
 connections

Each `add server`/`enable server`/`disable server`/`del server` was
issued as its own Unix socket round-trip via `_send_runtime_command`.
A real autoscaling broadcast (say 50 server changes across 24 backends)
produced ~200 sequential per-command connections to the local HAProxy
admin socket. Under traffic load each round-trip queues behind HAProxy
worker threads serving HTTP, and the cumulative wallclock exceeds the
5s `_send_socket_command` timeout. The apply fails, falls back to
reload, and (in the worst case) cascades into a cluster-wide event of
runtime-API timeouts + fresh-start recoveries.

Add `_send_runtime_commands(commands, chunk_size=64)` which joins
batches of commands with `;` (HAProxy CLI's native multi-command
separator) and sends each chunk over a single admin-socket connection.
Same error-marker substring detection as the single-command path; loses
per-command attribution on failure but the caller's response (fall back
to reload) is the same regardless.

Apply loop in `try_apply_servers_dynamically` switched from per-command
awaits to building a command list and sending via the batched API. For
a 200-command apply, this drops the round-trip count from 200 to
ceil(200/64) = 4, amortizing the per-connection accept/dispatch cost
into a single CLI session per chunk where HAProxy processes commands
in-order without re-acquiring the socket.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 99 +++++++++++++++++++++++-----
 1 file changed, 84 insertions(+), 15 deletions(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index e28cce0efda7..5b3d298d0412 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -1238,6 +1238,68 @@ async def _send_runtime_command(self, command: str) -> str:
                 )
         return result
 
+    async def _send_runtime_commands(
+        self, commands: List[str], chunk_size: int = 64
+    ) -> None:
+        """Send multiple admin-socket commands batched onto single connections.
+
+        HAProxy's CLI accepts multiple commands per connection separated by
+        `;`. This lets us collapse what would otherwise be N separate Unix
+        socket round-trips (each contending with HTTP worker threads for
+        accept/dispatch) into a small number of connections, dramatically
+        reducing the wallclock cost of a large apply.
+
+        At Ray Serve scales (autoscaling broadcast = tens to hundreds of
+        server changes across many backends), the previous per-command
+        connection model meant a single broadcast triggered ~200 separate
+        admin-socket round-trips per proxy. Each round-trip competes with
+        live traffic on HAProxy's worker threads; the per-command overhead
+        compounds and pushes individual commands past the 5s timeout under
+        load. With batching, the same 200 commands ride through 4 socket
+        sessions (at the default chunk_size), each session handling its
+        commands in-order without re-acquiring the connection.
+
+        Error detection mirrors `_send_runtime_command`: substring-match the
+        concatenated response for known HAProxy error markers and raise on
+        the first hit. Loses per-command attribution (we don't know which
+        command in the batch failed), but the caller's response to any
+        failure is the same fallback reload regardless.
+
+        chunk_size of 64 stays comfortably under HAProxy's default 16 KB CLI
+        buffer (`tune.bufsize`) for typical command lengths (~80 bytes).
+        """
+        if not commands:
+            return
+        error_markers = (
+            "[alert]",
+            "[warning]",
+            "[notice]",
+            "no such server",
+            "no such backend",
+            "doesn't exist",
+            "does not exist",
+            "not found",
+            "not allowed",
+            "syntax error",
+            "invalid argument",
+            "unknown command",
+            "must be disabled",
+        )
+        for i in range(0, len(commands), chunk_size):
+            chunk = commands[i : i + chunk_size]
+            batched = ";".join(chunk)
+            result = await self._send_socket_command(batched)
+            if not result:
+                continue
+            normalized = result.lower()
+            for marker in error_markers:
+                if marker in normalized:
+                    raise RuntimeError(
+                        f"HAProxy rejected one or more runtime commands in "
+                        f"batch of {len(chunk)} (chunk starts at index {i}): "
+                        f"{result.strip()[:500]}"
+                    )
+
     @staticmethod
     def _parse_haproxy_csv_stats(
         stats_output: str,
@@ -1400,37 +1462,44 @@ async def try_apply_servers_dynamically(
             self.set_backend_configs(new_backend_configs)
             return True
 
-        # Apply the diff via the admin socket. Commands are sent
-        # sequentially — if one fails partway through, the live HAProxy
-        # state is partially updated. The fallback reload that the caller
-        # performs after we return False rebuilds full state from
-        # self.backend_configs (which we update here) and converges
+        # Apply the diff via the admin socket. We accumulate all commands
+        # into a list and ship them through `_send_runtime_commands`, which
+        # batches them onto a small number of Unix socket connections
+        # (separator: `;`). Previously each command opened its own
+        # connection, so a broadcast with N server changes incurred 2N
+        # admin-socket round-trips per proxy. Under load each round-trip
+        # competes with HTTP worker threads for accept/dispatch, and the
+        # cumulative wallclock easily exceeds the 5s `_send_socket_command`
+        # timeout. Batching collapses the 2N round-trips into ceil(2N/64)
+        # connections -- typically 1-4 for a real broadcast -- so the per-
+        # command overhead is amortized away.
+        #
+        # If any command in the batch fails, the whole apply fails and the
+        # caller falls back to a reload, which rebuilds full state from
+        # self.backend_configs (which we update below) and converges
         # everything back to the new desired state.
         try:
+            commands: List[str] = []
             for backend_name, server_names in diff.servers_to_remove.items():
                 for sname in server_names:
                     # Disable before delete: required when the server has
                     # in-flight connections; safe even when it doesn't.
-                    await self._send_runtime_command(
-                        f"disable server {backend_name}/{sname}"
-                    )
-                    await self._send_runtime_command(
-                        f"del server {backend_name}/{sname}"
-                    )
+                    commands.append(f"disable server {backend_name}/{sname}")
+                    commands.append(f"del server {backend_name}/{sname}")
 
             for backend_name, servers in diff.servers_to_add.items():
                 for server in servers:
                     # Newly-added runtime servers inherit `default-server`
                     # directives (inter/fall/rise/etc.) from the backend
                     # block, so we only need address + the `check` flag.
-                    await self._send_runtime_command(
+                    commands.append(
                         f"add server {backend_name}/{server.name} "
                         f"{server.host}:{server.port} check"
                     )
                     # Runtime-added servers start in MAINT state by default.
-                    await self._send_runtime_command(
-                        f"enable server {backend_name}/{server.name}"
-                    )
+                    commands.append(f"enable server {backend_name}/{server.name}")
+
+            await self._send_runtime_commands(commands)
         except Exception as e:
             logger.warning(
                 f"Failed to apply backend update incrementally via runtime "

From 8944a10bb949b66621c46c56c27580c78f869876 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Mon, 11 May 2026 08:48:28 +0000
Subject: [PATCH 18/26] [serve] Tune HAProxy runtime-API chunk size, socket
 timeout, log per-chunk timings

Under autoscaling churn at production scale, broadcasts of dozens to
hundreds of `add/del/disable/enable` commands routinely exceeded the
hardcoded 5s admin-socket timeout, falling back to a full reload that
amplifies load cluster-wide. Two tactical fixes:

* `RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE` (default 16, was 64): per-chunk
  timeout budget instead of per-batch budget. Smaller chunks complete
  the runtime-API path instead of failing into the slow reload path.
* `RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S` (default 15, was hardcoded 5):
  wider per-chunk budget so HAProxy can finish processing even when the
  CLI mux is queueing behind heavy HTTP worker dispatch.

Both knobs are env-configurable so they can be tuned at the next load
test without a redeploy.

`_send_runtime_commands` now logs per-chunk wallclock on failure (which
chunk, how long before it failed, prior chunks' times, total elapsed)
and a summary on success, so we can distinguish "just over the timeout
budget" (smaller chunks help) from "HAProxy genuinely wedged" (a
different problem). Success log is info-level + stderr so chunk-time
distributions are visible in normal proxy stderr alongside the
existing apply summaries.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/constants.py | 25 +++++++
 python/ray/serve/_private/haproxy.py   | 94 +++++++++++++++++++-------
 2 files changed, 95 insertions(+), 24 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 8e8f47802980..95ebb0fa6ab8 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -785,6 +785,31 @@
     os.environ.get("RAY_SERVE_HAPROXY_BROADCAST_COALESCE_S", "0.5")
 )
 
+# Maximum number of HAProxy runtime-API commands (e.g. `add server`,
+# `disable server`, `del server`, `enable server`) batched onto a single
+# admin-socket connection. Each chunk is sent as a `;`-separated command
+# string and shares one socket-level timeout. Smaller chunks reduce the
+# risk of any one chunk exceeding the read timeout when HAProxy's CLI
+# mux is queueing behind HTTP worker dispatch under load: the per-chunk
+# overhead (~single-digit ms of socket setup) is negligible compared to
+# the cost of a fallback full reload triggered by a timeout. 16 leaves
+# substantial headroom even at high traffic load.
+RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE = int(
+    os.environ.get("RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE", "16")
+)
+
+# Connect/read timeout (seconds) for HAProxy admin-socket commands. The
+# CLI mux serializes admin operations behind HTTP worker dispatch, so a
+# batch of `add server` / `del server` / etc. commands can routinely take
+# more than a few seconds while HAProxy is serving heavy traffic. The
+# previous hardcoded 5s was tight enough that broadcasts during
+# autoscaling churn hit it routinely, falling back to expensive full
+# reloads. 15s lets the slower-but-functional runtime-API path complete
+# in those cases instead of cascading into reloads.
+RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S = float(
+    os.environ.get("RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S", "15")
+)
+
 # Number of consecutive failed server health checks that must occur
 # before haproxy marks the server as down.
 RAY_SERVE_HAPROXY_HEALTH_CHECK_FALL = int(
diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 5b3d298d0412..d16af1d953cf 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -44,9 +44,11 @@
     RAY_SERVE_HAPROXY_NBTHREAD,
     RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S,
     RAY_SERVE_HAPROXY_RETRIES,
+    RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE,
     RAY_SERVE_HAPROXY_SERVER_STATE_BASE,
     RAY_SERVE_HAPROXY_SERVER_STATE_FILE,
     RAY_SERVE_HAPROXY_SOCKET_PATH,
+    RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S,
     RAY_SERVE_HAPROXY_STATS_PORT,
     RAY_SERVE_HAPROXY_SYSLOG_PORT,
     RAY_SERVE_HAPROXY_TCP_NODELAY,
@@ -1162,7 +1164,7 @@ async def _send_socket_command(self, command: str) -> str:
             try:
                 reader, writer = await asyncio.wait_for(
                     asyncio.open_unix_connection(self.cfg.socket_path),
-                    timeout=5.0,
+                    timeout=RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S,
                 )
             except asyncio.TimeoutError:
                 raise RuntimeError(
@@ -1175,7 +1177,9 @@ async def _send_socket_command(self, command: str) -> str:
 
                 # Read until EOF (HAProxy closes connection after response)
                 try:
-                    result_bytes = await asyncio.wait_for(reader.read(), timeout=5.0)
+                    result_bytes = await asyncio.wait_for(
+                        reader.read(), timeout=RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S
+                    )
                 except asyncio.TimeoutError:
                     raise RuntimeError(
                         f"Timeout while sending command '{command}' to HAProxy socket"
@@ -1239,33 +1243,41 @@ async def _send_runtime_command(self, command: str) -> str:
         return result
 
     async def _send_runtime_commands(
-        self, commands: List[str], chunk_size: int = 64
+        self,
+        commands: List[str],
+        chunk_size: int = RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE,
     ) -> None:
         """Send multiple admin-socket commands batched onto single connections.
 
         HAProxy's CLI accepts multiple commands per connection separated by
         `;`. This lets us collapse what would otherwise be N separate Unix
         socket round-trips (each contending with HTTP worker threads for
-        accept/dispatch) into a small number of connections, dramatically
-        reducing the wallclock cost of a large apply.
-
-        At Ray Serve scales (autoscaling broadcast = tens to hundreds of
-        server changes across many backends), the previous per-command
-        connection model meant a single broadcast triggered ~200 separate
-        admin-socket round-trips per proxy. Each round-trip competes with
-        live traffic on HAProxy's worker threads; the per-command overhead
-        compounds and pushes individual commands past the 5s timeout under
-        load. With batching, the same 200 commands ride through 4 socket
-        sessions (at the default chunk_size), each session handling its
-        commands in-order without re-acquiring the connection.
-
-        Error detection mirrors `_send_runtime_command`: substring-match the
-        concatenated response for known HAProxy error markers and raise on
-        the first hit. Loses per-command attribution (we don't know which
-        command in the batch failed), but the caller's response to any
-        failure is the same fallback reload regardless.
-
-        chunk_size of 64 stays comfortably under HAProxy's default 16 KB CLI
+        accept/dispatch) into a small number of connections, reducing the
+        wallclock cost of a large apply.
+
+        chunk_size controls how many commands ride through each socket
+        connection. The socket-level timeout (RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S)
+        applies per chunk, so smaller chunks give each chunk its own budget
+        rather than forcing the whole batch to fit in one timeout window.
+        Under load HAProxy's CLI mux serializes admin operations behind HTTP
+        worker dispatch, so the dominant cost is HAProxy's processing time
+        per command, not per-chunk socket setup -- keeping chunks small
+        trades a small amount of extra wallclock for a much higher chance of
+        completing the runtime-API path instead of timing out into a full
+        reload.
+
+        Error detection: substring-match the concatenated response for known
+        HAProxy error markers and raise on the first hit. Loses per-command
+        attribution (we don't know which command in the batch failed), but
+        the caller's response to any failure is the same fallback reload
+        regardless.
+
+        On failure, logs per-chunk wallclock so we can distinguish "just over
+        the timeout budget" from "HAProxy genuinely wedged" -- the right fix
+        for the two is different (smaller chunks vs. investigating HAProxy
+        state).
+
+        chunk_size stays comfortably under HAProxy's default 16 KB CLI
         buffer (`tune.bufsize`) for typical command lengths (~80 bytes).
         """
         if not commands:
@@ -1285,10 +1297,35 @@ async def _send_runtime_commands(
             "unknown command",
             "must be disabled",
         )
+        num_chunks = (len(commands) + chunk_size - 1) // chunk_size
+        chunk_times: List[float] = []
+        batch_start = time.monotonic()
         for i in range(0, len(commands), chunk_size):
             chunk = commands[i : i + chunk_size]
             batched = ";".join(chunk)
-            result = await self._send_socket_command(batched)
+            chunk_start = time.monotonic()
+            try:
+                result = await self._send_socket_command(batched)
+            except Exception:
+                # Per-chunk wallclock tells us whether we're at "just over
+                # budget" (e.g. 15.0s = hit the timeout exactly, smaller chunks
+                # or larger timeout would help) or "HAProxy is wedged" (much
+                # longer than that, something else is wrong).
+                logger.warning(
+                    "Runtime API chunk %d/%d (size=%d) failed after %.2fs; "
+                    "total_commands=%d, prior_chunk_times=%s, "
+                    "batch_elapsed=%.2fs",
+                    i // chunk_size + 1,
+                    num_chunks,
+                    len(chunk),
+                    time.monotonic() - chunk_start,
+                    len(commands),
+                    [f"{t:.2f}s" for t in chunk_times],
+                    time.monotonic() - batch_start,
+                    extra={"log_to_stderr": False},
+                )
+                raise
+            chunk_times.append(time.monotonic() - chunk_start)
             if not result:
                 continue
             normalized = result.lower()
@@ -1299,6 +1336,15 @@ async def _send_runtime_commands(
                         f"batch of {len(chunk)} (chunk starts at index {i}): "
                         f"{result.strip()[:500]}"
                     )
+        logger.info(
+            "Runtime API batch OK: %d commands, %d chunks, chunk_times=%s, "
+            "total=%.2fs",
+            len(commands),
+            num_chunks,
+            [f"{t:.2f}s" for t in chunk_times],
+            time.monotonic() - batch_start,
+            extra={"log_to_stderr": True},
+        )
 
     @staticmethod
     def _parse_haproxy_csv_stats(

From 9dbe27e08547c2fe4f7ef44b66f816ad98545b6d Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Mon, 11 May 2026 09:38:47 +0000
Subject: [PATCH 19/26] [serve] Migrate HAProxy backend management to
 server-template slot pools
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the dynamic `add server` / `del server` / `disable server` /
`enable server` runtime-API model with HAProxy's `server-template`
directive plus per-backend slot pools.

Each backend's rendered config now contains a pre-allocated pool of
`server-template srv 1-N ...` slots that start disabled at a placeholder
address. The proxy actor binds slot ↔ replica mappings in-process
(BackendSlotPool, first-free via min-heap) and uses `set server addr
... port ...` + `set server state ready/maint` to repurpose slots at
runtime. This eliminates server-list mutation under HAProxy's CLI mux
lock (which contends with HTTP worker dispatch under heavy traffic) and
roughly halves command count per broadcast for replica removals.

Architecture:
* `RAY_SERVE_HAPROXY_TOTAL_SLOTS` (4096) caps total slots across all
  backends; `_compute_slot_split` partitions the budget per backend at
  config-generation time using replica-count * headroom proportional
  split with a per-backend minimum floor.
* `BackendSlotPool` per backend tracks replica → slot. Lives on
  `HAProxyApi` so it's adjacent to the runtime-API call sites.
* `_rebind_slot_pools()` runs after every successful (re)load:
  recreates pools sized from BackendConfig.slot_pool_size, then issues
  one batched runtime-API call to populate slots from the current
  backend_configs. This is what makes the post-reload window not lose
  the desired server set.
* `try_apply_servers_dynamically` now translates diffs into
  `set server ... addr/state` commands. Slot-pool-size changes count
  as structural (the rendered template line itself changes), so they
  go through reload, which recomputes the split.
* Pool exhaustion → return False → caller reloads → reload re-splits
  with bigger share for the exhausted backend.

Failure modes:
* Removals processed before adds so 1:1 churn doesn't need 2x slots.
* Slot population after reload is best-effort; on failure HAProxy is
  up but no replicas are routable until next broadcast retries.
* `slot_mapping.json` written to socket_dir after each apply for
  live debugging (translates srvN back to SERVE_REPLICA_ name).

Constants added (all env-overridable):
  RAY_SERVE_HAPROXY_TOTAL_SLOTS=4096
  RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND=16
  RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR=2.0

Behavior change: `show stat` on the admin socket now reports
`srv1`/`srv2`/... instead of `SERVE_REPLICA_<app>-<replica>`. Use
`slot_mapping.json` or the per-apply log lines to translate.

Test golden config updated for the new template line. Pure-Python
unit tests for BackendSlotPool and _compute_slot_split added in
test_haproxy_slots.py.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/constants.py        |  32 ++
 python/ray/serve/_private/haproxy.py          | 467 +++++++++++++++---
 .../ray/serve/_private/haproxy_templates.py   |  11 +-
 python/ray/serve/tests/test_haproxy_api.py    |  23 +-
 python/ray/serve/tests/test_haproxy_slots.py  | 175 +++++++
 5 files changed, 641 insertions(+), 67 deletions(-)
 create mode 100644 python/ray/serve/tests/test_haproxy_slots.py

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 95ebb0fa6ab8..7be17ac96894 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -810,6 +810,38 @@
     os.environ.get("RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S", "15")
 )
 
+# Total number of server slots to pre-allocate across all backends via
+# HAProxy's `server-template` directive. Slots are partitioned across
+# backends at config-generation time (see `_compute_slot_split`); each
+# backend's share becomes the size of its `server-template` block and
+# the upper bound on how many replicas it can hold before the runtime-
+# API path returns False and a full reload re-computes the split. 4096
+# total comfortably covers Ray Serve clusters with hundreds of replicas
+# across a few dozen backends; raise it for larger fleets, but note that
+# every slot has a small HAProxy memory cost (~few KB per slot) and
+# slightly inflates `show stat` output.
+RAY_SERVE_HAPROXY_TOTAL_SLOTS = int(
+    os.environ.get("RAY_SERVE_HAPROXY_TOTAL_SLOTS", "4096")
+)
+
+# Floor on per-backend slot allocation. Even backends with zero replicas
+# get this many slots reserved so they can absorb some growth without
+# requiring a reload to re-split. Set high enough to cover short bursts
+# of churn but low enough that idle backends don't crowd out active ones.
+# If N_backends * MIN_SLOTS exceeds TOTAL_SLOTS the split degrades to an
+# equal share of (TOTAL_SLOTS // N_backends) per backend.
+RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND = int(
+    os.environ.get("RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND", "16")
+)
+
+# Headroom multiplier applied when allocating slots. A backend with N
+# current replicas gets ~N * factor slots (subject to total/min limits);
+# the extra slots absorb scale-up without triggering a reload. 2.0 means
+# a backend can double in size between reloads before exhausting.
+RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR = float(
+    os.environ.get("RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR", "2.0")
+)
+
 # Number of consecutive failed server health checks that must occur
 # before haproxy marks the server as down.
 RAY_SERVE_HAPROXY_HEALTH_CHECK_FALL = int(
diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index d16af1d953cf..209bb7a018be 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -1,8 +1,10 @@
 import asyncio
 import csv
+import heapq
 import io
 import json
 import logging
+import math
 import os
 import re
 import shutil
@@ -41,12 +43,14 @@
     RAY_SERVE_HAPROXY_HEALTH_CHECK_RISE,
     RAY_SERVE_HAPROXY_MAXCONN,
     RAY_SERVE_HAPROXY_METRICS_PORT,
+    RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND,
     RAY_SERVE_HAPROXY_NBTHREAD,
     RAY_SERVE_HAPROXY_RELOAD_TIMEOUT_S,
     RAY_SERVE_HAPROXY_RETRIES,
     RAY_SERVE_HAPROXY_RUNTIME_CHUNK_SIZE,
     RAY_SERVE_HAPROXY_SERVER_STATE_BASE,
     RAY_SERVE_HAPROXY_SERVER_STATE_FILE,
+    RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR,
     RAY_SERVE_HAPROXY_SOCKET_PATH,
     RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S,
     RAY_SERVE_HAPROXY_STATS_PORT,
@@ -55,6 +59,7 @@
     RAY_SERVE_HAPROXY_TIMEOUT_CLIENT_S,
     RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S,
     RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S,
+    RAY_SERVE_HAPROXY_TOTAL_SLOTS,
     SERVE_CONTROLLER_NAME,
     SERVE_LOGGER_NAME,
     SERVE_NAMESPACE,
@@ -323,6 +328,13 @@ class BackendConfig:
     # The app name for this backend.
     app_name: str = field(default_factory=str)
 
+    # Size of this backend's `server-template` slot pool. The rendered
+    # config emits `server-template srv 1-{slot_pool_size} ...` so the
+    # backend can hold up to `slot_pool_size` replicas before the runtime
+    # API has to fall back to a reload to re-split. Set by HAProxyManager
+    # via `_compute_slot_split` on every config (re)generation.
+    slot_pool_size: int = 0
+
     def build_health_check_config(self, global_config: "HAProxyConfig") -> dict:
         """Build health check configuration for HAProxy backend.
 
@@ -617,6 +629,9 @@ def _compute_backend_diff(
             "path_prefix",
             "fallback_server",
             "app_name",
+            # slot_pool_size is part of the rendered `server-template` line,
+            # so any change requires a reload to take effect.
+            "slot_pool_size",
             "timeout_connect_s",
             "timeout_server_s",
             "timeout_client_s",
@@ -667,6 +682,159 @@ def _compute_backend_diff(
     )
 
 
+class BackendSlotPool:
+    """Tracks slot ↔ replica mapping for one backend's `server-template` block.
+
+    HAProxy's `server-template` directive pre-allocates a fixed pool of
+    server slots at config-generation time. The slots are named
+    `<prefix><N>` (e.g. `srv1`, `srv2`, ...) and start disabled with a
+    placeholder address. Once HAProxy is running we repurpose slots
+    dynamically via the runtime API:
+
+        # Assign slot N to a new replica:
+        set server <backend>/srvN addr <ip> port <port>
+        set server <backend>/srvN state ready
+
+        # Release slot N (replica removed):
+        set server <backend>/srvN state maint
+
+    The pool tracks which slots are in use and which are free; allocation
+    is "first-free" via a min-heap so slot numbers stay compact under
+    churn. `assign` is idempotent — re-assigning an already-present
+    replica returns its existing slot, which lets the apply loop handle
+    same-name-different-address transitions without bookkeeping.
+    """
+
+    def __init__(self, backend_name: str, pool_size: int):
+        if pool_size <= 0:
+            raise ValueError(f"BackendSlotPool requires pool_size > 0, got {pool_size}")
+        self.backend_name = backend_name
+        self.pool_size = pool_size
+        self._replica_to_slot: Dict[str, int] = {}
+        # min-heap of free slot numbers (1-indexed to match HAProxy CLI)
+        self._free_slots: List[int] = list(range(1, pool_size + 1))
+        heapq.heapify(self._free_slots)
+
+    def assign(self, replica_name: str) -> Optional[int]:
+        """Allocate a slot for `replica_name`. Returns the slot number,
+        or None if the pool is full. Idempotent: re-assigning an
+        already-present replica returns its existing slot."""
+        existing = self._replica_to_slot.get(replica_name)
+        if existing is not None:
+            return existing
+        if not self._free_slots:
+            return None
+        slot = heapq.heappop(self._free_slots)
+        self._replica_to_slot[replica_name] = slot
+        return slot
+
+    def release(self, replica_name: str) -> Optional[int]:
+        """Free the slot held by `replica_name`. Returns the slot number,
+        or None if the replica wasn't holding a slot."""
+        slot = self._replica_to_slot.pop(replica_name, None)
+        if slot is not None:
+            heapq.heappush(self._free_slots, slot)
+        return slot
+
+    def slot_for(self, replica_name: str) -> Optional[int]:
+        """Return the slot held by `replica_name`, or None if absent."""
+        return self._replica_to_slot.get(replica_name)
+
+    def in_use(self) -> int:
+        return len(self._replica_to_slot)
+
+    def free(self) -> int:
+        return len(self._free_slots)
+
+    def is_exhausted(self) -> bool:
+        return not self._free_slots
+
+    def mapping(self) -> Dict[str, int]:
+        """Return a copy of the replica → slot mapping (for diagnostics)."""
+        return dict(self._replica_to_slot)
+
+
+def _compute_slot_split(
+    backend_to_replica_count: Dict[str, int],
+    total: int = RAY_SERVE_HAPROXY_TOTAL_SLOTS,
+    min_per_backend: int = RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND,
+    headroom_factor: float = RAY_SERVE_HAPROXY_SLOT_HEADROOM_FACTOR,
+) -> Dict[str, int]:
+    """Partition `total` slots across backends.
+
+    Allocation strategy:
+      1. Reserve `min_per_backend` for every backend (floor).
+      2. Compute each backend's "demand" as `replicas * headroom_factor`.
+      3. Distribute the remaining slots proportionally to demand. Backends
+         with zero replicas only get their minimum (no extra share).
+
+    Invariants when N := len(backend_to_replica_count):
+      - sum(result.values()) <= total
+      - For each b: result[b] >= min_per_backend  (when N * min <= total)
+      - For each b: result[b] >= 1                (always)
+
+    Degraded path: if `N * min_per_backend > total`, we can't honor the
+    minimum and fall back to equal split (floor(total / N) each, with the
+    remainder distributed to the first few backends alphabetically). This
+    is a pathological case that should not occur with default settings
+    (4096 / 16 = 256 backends).
+    """
+    if not backend_to_replica_count:
+        return {}
+
+    n = len(backend_to_replica_count)
+    names = sorted(backend_to_replica_count.keys())
+
+    if n * min_per_backend > total:
+        # Pathological: too many backends to give each the minimum. Equal split.
+        base = max(1, total // n)
+        remainder = max(0, total - base * n)
+        return {
+            name: base + (1 if i < remainder else 0) for i, name in enumerate(names)
+        }
+
+    reserved = n * min_per_backend
+    remaining = total - reserved
+
+    demand = {
+        name: max(0, int(math.ceil(backend_to_replica_count[name] * headroom_factor)))
+        for name in names
+    }
+    total_demand = sum(demand.values())
+
+    extra: Dict[str, int] = {name: 0 for name in names}
+    if remaining > 0:
+        if total_demand == 0:
+            # No replicas anywhere — split the remainder equally.
+            per_backend = remaining // n
+            leftover = remaining - per_backend * n
+            for i, name in enumerate(names):
+                extra[name] = per_backend + (1 if i < leftover else 0)
+        else:
+            # Proportional to demand. Float-then-floor, then distribute the
+            # rounding remainder to backends with the largest fractional parts
+            # so we always allocate exactly `remaining` slots.
+            shares_float = {
+                name: remaining * demand[name] / total_demand for name in names
+            }
+            shares = {name: int(shares_float[name]) for name in names}
+            allocated = sum(shares.values())
+            leftover = remaining - allocated
+            # Sort by fractional part descending; ties broken by name asc.
+            order = sorted(
+                names,
+                key=lambda name: (
+                    -(shares_float[name] - shares[name]),
+                    name,
+                ),
+            )
+            for name in order[:leftover]:
+                shares[name] += 1
+            extra = shares
+
+    return {name: min_per_backend + extra[name] for name in names}
+
+
 class HAProxyApi(ProxyApi):
     """ProxyApi implementation for HAProxy."""
 
@@ -684,6 +852,10 @@ def __init__(
         self._proc = None
         # Track old processes from graceful reloads that may still be draining
         self._old_procs: List[asyncio.subprocess.Process] = []
+        # Slot pools, one per backend, sized by the most recent rendered
+        # config's `slot_pool_size`. Populated/reset by `_rebind_slot_pools`
+        # which is called after every successful (re)load.
+        self._slot_pools: Dict[str, BackendSlotPool] = {}
 
         # Ensure required directories exist during initialization
         self._initialize_directories_and_error_files()
@@ -718,6 +890,124 @@ def _initialize_directories_and_error_files(self) -> None:
 
         self.cfg.error_file_path = error_file_path
 
+    async def _rebind_slot_pools(self) -> None:
+        """Rebuild slot pools from backend_configs and populate slots.
+
+        Called after every successful (re)load. The new HAProxy process
+        starts with all template slots disabled at the placeholder
+        address, so we:
+          1. Recreate the in-memory pools (sized per BackendConfig's
+             `slot_pool_size`, set by `_compute_slot_split` at
+             config-generation time).
+          2. Assign current `backend_configs[*].servers` to slots.
+          3. Send a single batched runtime-API call to point those slots
+             at the right addresses and bring them up.
+
+        This handles two cases under one path:
+          - Production: a reload was triggered because the incremental
+            apply gave up (e.g. structural change, pool exhausted). The
+            new desired state was already written to `backend_configs`
+            by `try_apply_servers_dynamically` before it returned False;
+            this method makes HAProxy actually serve from it.
+          - Tests: HAProxyApi is constructed with explicit
+            `backend_configs={...}` and `start()` is called; this
+            method binds those servers to slots without requiring the
+            caller to make an extra runtime-API call.
+
+        Step 3 is best-effort: on failure HAProxy is up but slots are
+        unpopulated; the next broadcast will diff against
+        `backend_configs` (which already reflects desired state) and
+        find no work to do, leaving slots empty. To recover, callers
+        should observe a failed apply and retry. In practice failures
+        here are rare since we just restarted HAProxy.
+        """
+        new_pools: Dict[str, BackendSlotPool] = {}
+        for name, bc in self.backend_configs.items():
+            if bc.slot_pool_size <= 0:
+                continue
+            new_pools[name] = BackendSlotPool(name, bc.slot_pool_size)
+        self._slot_pools = new_pools
+
+        if new_pools:
+            sizes = {n: p.pool_size for n, p in new_pools.items()}
+            logger.info(
+                "Slot pools reset: total=%d, split=%s",
+                sum(sizes.values()),
+                sizes,
+                extra={"log_to_stderr": True},
+            )
+
+        if not self.backend_configs:
+            return
+        commands: List[str] = []
+        for backend_name, bc in self.backend_configs.items():
+            pool = self._slot_pools.get(backend_name)
+            if pool is None or not bc.servers:
+                continue
+            for server in bc.servers:
+                slot = pool.assign(server.name)
+                if slot is None:
+                    logger.warning(
+                        "Slot population: backend %s pool exhausted "
+                        "(pool_size=%d). Some servers will be unreachable "
+                        "until the next reload re-splits.",
+                        backend_name,
+                        pool.pool_size,
+                    )
+                    break
+                commands.append(
+                    f"set server {backend_name}/srv{slot} "
+                    f"addr {server.host} port {server.port}"
+                )
+                commands.append(f"set server {backend_name}/srv{slot} state ready")
+        if not commands:
+            return
+        try:
+            await self._send_runtime_commands(commands)
+            logger.info(
+                "Slot population OK: %d servers placed across %d backends",
+                sum(len(bc.servers) for bc in self.backend_configs.values()),
+                len(self._slot_pools),
+                extra={"log_to_stderr": True},
+            )
+        except Exception as e:
+            logger.warning(
+                f"Slot population failed after reload: {e}. HAProxy is up "
+                f"but no replicas are routable until the next broadcast retry."
+            )
+        # Best-effort snapshot for live debugging.
+        self._write_slot_mapping_for_debug()
+
+    def _write_slot_mapping_for_debug(self) -> None:
+        """Best-effort dump of current slot mapping to a JSON file.
+
+        Writes `<socket_dir>/slot_mapping.json` so operators can `cat` it
+        to translate `srv1`/`srv2`/... back to replica IDs without
+        grepping logs. Best-effort: any failure is swallowed so a write
+        problem can't break an otherwise-successful apply.
+        """
+        try:
+            socket_dir = os.path.dirname(self.cfg.socket_path)
+            mapping_path = os.path.join(socket_dir, "slot_mapping.json")
+            payload = {
+                name: {
+                    "pool_size": pool.pool_size,
+                    "in_use": pool.in_use(),
+                    "free": pool.free(),
+                    "slots": {
+                        f"srv{slot}": replica
+                        for replica, slot in pool.mapping().items()
+                    },
+                }
+                for name, pool in self._slot_pools.items()
+            }
+            tmp = mapping_path + ".tmp"
+            with open(tmp, "w") as f:
+                json.dump(payload, f, indent=2, sort_keys=True)
+            os.replace(tmp, mapping_path)
+        except Exception as e:
+            logger.debug(f"Failed to write slot_mapping.json: {e}")
+
     def _is_running(self) -> bool:
         """Check if the HAProxy process is still running."""
         return self._proc is not None and self._proc.returncode is None
@@ -845,6 +1135,12 @@ async def _graceful_reload(self) -> None:
                 self._proc = await self._start_and_wait_for_haproxy()
                 phases.append(f"fresh_start={time.monotonic() - t0:.2f}s")
 
+                # New process started with empty server-template slots;
+                # rebind pools from backend_configs and repopulate.
+                t0 = time.monotonic()
+                await self._rebind_slot_pools()
+                phases.append(f"rebind_slots={time.monotonic() - t0:.2f}s")
+
                 logger.info(
                     "HAProxy fresh-start reload OK in %.2fs (%s)",
                     time.monotonic() - reload_start,
@@ -892,6 +1188,12 @@ async def _graceful_reload(self) -> None:
             if old_proc is not None:
                 self._old_procs.append(old_proc)
 
+            # New process has empty server-template slots; rebind pools
+            # from backend_configs and repopulate.
+            t0 = time.monotonic()
+            await self._rebind_slot_pools()
+            phases.append(f"rebind_slots={time.monotonic() - t0:.2f}s")
+
             logger.info(
                 "HAProxy graceful reload OK in %.2fs (%s)",
                 time.monotonic() - reload_start,
@@ -1116,6 +1418,10 @@ async def start(self) -> None:
                 logger.info("Successfully generated HAProxy config file.")
 
             self._proc = await self._start_and_wait_for_haproxy()
+            # Rebuild in-memory pools and populate slots from any
+            # already-set backend_configs (test fixtures or production
+            # reloads that wrote backend_configs before calling start).
+            await self._rebind_slot_pools()
             logger.info("HAProxy started successfully.")
         except Exception as e:
             logger.error(f"Failed to initialize and start HAProxy configuration: {e}")
@@ -1468,18 +1774,23 @@ async def try_apply_servers_dynamically(
         """Try to apply server-list changes via HAProxy's runtime API.
 
         Compares ``new_backend_configs`` to the current ``self.backend_configs``.
-        If the change is purely server adds/removes within existing backends,
-        applies them over the admin socket (`add server` / `del server`) without
-        regenerating the config or restarting HAProxy. Otherwise falls back
-        and returns False so the caller can perform a full reload.
-
-        At Ray Serve scales with many replicas under autoscaling churn,
-        regenerating and reloading on every replica change blocks the proxy
-        actor's event loop long enough to fail controller health checks
-        (causing the proxy to be killed and recreated, cascading into the
-        ALB seeing all targets as unhealthy). The runtime API is ~1ms per
-        change vs. ~1-2s for a reload, so this incremental path keeps the
-        actor responsive under churn.
+        If the change is purely server adds/removes within existing backends
+        with unchanged slot-pool sizes, applies the diff over the admin socket
+        using `set server <backend>/srvN addr ... port ...` plus
+        `set server ... state ready/maint` against the pre-allocated
+        `server-template` slots. Otherwise falls back to a full reload.
+
+        The slot pool model trades up-front memory (one templated server per
+        slot, regardless of whether it's in use) for runtime-API efficiency:
+        we never need to `add server`/`del server` -- which mutate HAProxy's
+        per-backend server list under lock -- only repurpose pre-existing
+        slots. The result is fewer admin-socket commands per broadcast and
+        no server-list mutation contention with HTTP workers.
+
+        Slot accounting (which slot holds which replica) lives in
+        ``self._slot_pools`` and is reset to "all free" after every
+        successful (re)load, since the new HAProxy process starts with all
+        template slots disabled at the placeholder address.
 
         Returns True if applied incrementally (no reload needed). Returns
         False if a reload is required (caller is responsible for calling
@@ -1498,53 +1809,82 @@ async def try_apply_servers_dynamically(
 
         diff = _compute_backend_diff(self.backend_configs, new_backend_configs)
         if diff.is_structural:
+            # Structural change (which includes slot_pool_size shifts) → reload.
             self.set_backend_configs(new_backend_configs)
             return False
 
         if not diff.servers_to_add and not diff.servers_to_remove:
-            # No-op: server lists are identical. Update internal state
-            # defensively (in case other fields on ServerConfig change in the
-            # future) and skip the reload.
+            # No-op: server lists are identical.
             self.set_backend_configs(new_backend_configs)
             return True
 
-        # Apply the diff via the admin socket. We accumulate all commands
-        # into a list and ship them through `_send_runtime_commands`, which
-        # batches them onto a small number of Unix socket connections
-        # (separator: `;`). Previously each command opened its own
-        # connection, so a broadcast with N server changes incurred 2N
-        # admin-socket round-trips per proxy. Under load each round-trip
-        # competes with HTTP worker threads for accept/dispatch, and the
-        # cumulative wallclock easily exceeds the 5s `_send_socket_command`
-        # timeout. Batching collapses the 2N round-trips into ceil(2N/64)
-        # connections -- typically 1-4 for a real broadcast -- so the per-
-        # command overhead is amortized away.
-        #
-        # If any command in the batch fails, the whole apply fails and the
-        # caller falls back to a reload, which rebuilds full state from
-        # self.backend_configs (which we update below) and converges
-        # everything back to the new desired state.
-        try:
-            commands: List[str] = []
-            for backend_name, server_names in diff.servers_to_remove.items():
-                for sname in server_names:
-                    # Disable before delete: required when the server has
-                    # in-flight connections; safe even when it doesn't.
-                    commands.append(f"disable server {backend_name}/{sname}")
-                    commands.append(f"del server {backend_name}/{sname}")
-
-            for backend_name, servers in diff.servers_to_add.items():
-                for server in servers:
-                    # Newly-added runtime servers inherit `default-server`
-                    # directives (inter/fall/rise/etc.) from the backend
-                    # block, so we only need address + the `check` flag.
-                    commands.append(
-                        f"add server {backend_name}/{server.name} "
-                        f"{server.host}:{server.port} check"
+        # Build the command list. Process removals before adds so freed
+        # slots are available for new replicas in the same broadcast --
+        # otherwise a 1:1 churn (5 in, 5 out) would temporarily need 2x
+        # the slots.
+        commands: List[str] = []
+        added_count = 0
+        removed_count = 0
+
+        for backend_name, server_names in diff.servers_to_remove.items():
+            pool = self._slot_pools.get(backend_name)
+            if pool is None:
+                # No pool means the manager has produced a config we
+                # haven't loaded into HAProxy yet. Falling back to reload
+                # is the safe choice: it'll regenerate the config and
+                # rebuild pools.
+                logger.warning(
+                    "No slot pool for backend %s; falling back to reload.",
+                    backend_name,
+                )
+                self.set_backend_configs(new_backend_configs)
+                return False
+            for sname in server_names:
+                slot = pool.release(sname)
+                if slot is None:
+                    # Replica wasn't holding a slot. This can happen if a
+                    # prior apply failed mid-way; benign, skip.
+                    continue
+                commands.append(f"set server {backend_name}/srv{slot} state maint")
+                removed_count += 1
+
+        for backend_name, servers in diff.servers_to_add.items():
+            pool = self._slot_pools.get(backend_name)
+            if pool is None:
+                logger.warning(
+                    "No slot pool for backend %s; falling back to reload.",
+                    backend_name,
+                )
+                self.set_backend_configs(new_backend_configs)
+                return False
+            for server in servers:
+                slot = pool.assign(server.name)
+                if slot is None:
+                    logger.warning(
+                        "Backend %s slot pool exhausted "
+                        "(pool_size=%d, in_use=%d). Falling back to reload "
+                        "to re-split slots with bigger share for this "
+                        "backend.",
+                        backend_name,
+                        pool.pool_size,
+                        pool.in_use(),
                     )
-                    # Runtime-added servers start in MAINT state by default.
-                    commands.append(f"enable server {backend_name}/{server.name}")
+                    self.set_backend_configs(new_backend_configs)
+                    return False
+                commands.append(
+                    f"set server {backend_name}/srv{slot} "
+                    f"addr {server.host} port {server.port}"
+                )
+                commands.append(f"set server {backend_name}/srv{slot} state ready")
+                added_count += 1
+
+        if not commands:
+            # All adds/removes were no-ops (e.g. removes for replicas that
+            # never got slot-assigned). Update state and we're done.
+            self.set_backend_configs(new_backend_configs)
+            return True
 
+        try:
             await self._send_runtime_commands(commands)
         except Exception as e:
             logger.warning(
@@ -1556,14 +1896,12 @@ async def try_apply_servers_dynamically(
             return False
 
         # Update internal state and regenerate the config file so any
-        # subsequent reload (e.g. for a structural change) starts from the
-        # current set of servers.
+        # subsequent reload starts from the current set of servers.
         self.set_backend_configs(new_backend_configs)
         try:
             self._generate_config_file_internal()
         except Exception as e:
-            # The runtime change is already applied. A failure to write the
-            # config file is non-fatal here: the next reload will rebuild it.
+            # Non-fatal: runtime change already applied; next reload rebuilds.
             logger.warning(
                 f"Applied backend update via runtime API but failed to "
                 f"refresh the config file: {e}",
@@ -1572,10 +1910,11 @@ async def try_apply_servers_dynamically(
 
         logger.info(
             f"Applied backend update via runtime API: "
-            f"{sum(len(v) for v in diff.servers_to_add.values())} server(s) added, "
-            f"{sum(len(v) for v in diff.servers_to_remove.values())} server(s) removed.",
+            f"{added_count} server(s) added, {removed_count} server(s) removed.",
             extra={"log_to_stderr": False},
         )
+        # Best-effort slot-mapping snapshot for live debugging.
+        self._write_slot_mapping_for_debug()
         return True
 
     async def is_running(self) -> bool:
@@ -1907,7 +2246,15 @@ async def _apply_backend_update(
                 await self._haproxy.reload()
 
     def _build_name_to_backend_configs(self) -> Dict[str, BackendConfig]:
-        """Snapshot the current target groups + fallbacks into backend configs."""
+        """Snapshot the current target groups + fallbacks into backend configs.
+
+        Also computes the per-backend slot pool size via `_compute_slot_split`
+        and stamps each BackendConfig with its share. Slot pool sizes are
+        part of the rendered config (they show up in `server-template
+        srv 1-N`), so a change in the split is treated as a structural
+        change by `_compute_backend_diff` -- which is what we want: when
+        the split shifts, we need a reload to update the template.
+        """
         backend_configs = []
         for target_group in self._target_groups:
             fallback_target = None
@@ -1919,6 +2266,12 @@ def _build_name_to_backend_configs(self) -> Dict[str, BackendConfig]:
             backend_config = self._create_backend_config(target_group, fallback_target)
             backend_configs.append(backend_config)
 
+        # Stamp each backend with its slot-pool size from the split.
+        replica_counts = {bc.name: len(bc.servers) for bc in backend_configs}
+        split = _compute_slot_split(replica_counts)
+        for bc in backend_configs:
+            bc.slot_pool_size = split.get(bc.name, 0)
+
         return {bc.name: bc for bc in backend_configs}
 
     def _update_haproxy_backends(self) -> None:
diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index ce1211b79e87..aed19d58e308 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -152,10 +152,13 @@
     http-check expect status 200
     {%- endif %}
     {{ hc.default_server_directive }}
-    # Servers in this backend
-    {%- for server in backend.servers %}
-    server {{ server.name }} {{ server.host }}:{{ server.port }} check
-    {%- endfor %}
+    # Pre-allocated slot pool. Slots start disabled with a placeholder
+    # address; the proxy actor populates them via the runtime API
+    # (`set server <backend>/srvN addr <ip> port <port>` + `state ready`)
+    # as replicas register, and releases them (`state maint`) when
+    # replicas exit. Slot count is computed by `_compute_slot_split` at
+    # config-generation time from the current replica count + headroom.
+    server-template srv 1-{{ backend.slot_pool_size }} 0.0.0.0:1 check disabled
     {%- if backend.fallback_server %}
     # Fallback to head node's Serve proxy when no ingress replicas are available
     server {{ backend.fallback_server.name }} {{ backend.fallback_server.host }}:{{ backend.fallback_server.port }} check backup
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index 9aa50f96fb16..d19365159b04 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -225,6 +225,7 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
                 fallback_server=ServerConfig(
                     name="api_fallback_server", host="127.0.0.1", port=8500
                 ),
+                slot_pool_size=8,
             ),
             "web_backend": BackendConfig(
                 name="web_backend",
@@ -236,7 +237,8 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
                 timeout_tunnel_s=45,
                 servers=[
                     ServerConfig(name="web_server1", host="127.0.0.1", port=8003),
-                ]
+                ],
+                slot_pool_size=4,
                 # No health check overrides - should use global defaults
             ),
         }
@@ -368,9 +370,13 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     option httpchk GET /api/health
     http-check expect status 200
     default-server fastinter 250ms downinter 250ms fall 2 rise 3 inter 5s check
-    # Servers in this backend
-    server api_server1 127.0.0.1:8001 check
-    server api_server2 127.0.0.1:8002 check
+    # Pre-allocated slot pool. Slots start disabled with a placeholder
+    # address; the proxy actor populates them via the runtime API
+    # (`set server <backend>/srvN addr <ip> port <port>` + `state ready`)
+    # as replicas register, and releases them (`state maint`) when
+    # replicas exit. Slot count is computed by `_compute_slot_split` at
+    # config-generation time from the current replica count + headroom.
+    server-template srv 1-8 0.0.0.0:1 check disabled
     # Fallback to head node's Serve proxy when no ingress replicas are available
     server api_fallback_server 127.0.0.1:8500 check backup
 backend web_backend
@@ -395,8 +401,13 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     option httpchk GET /-/healthz
     http-check expect status 200
     default-server fastinter 250ms downinter 250ms fall 3 rise 2 inter 2s check
-    # Servers in this backend
-    server web_server1 127.0.0.1:8003 check
+    # Pre-allocated slot pool. Slots start disabled with a placeholder
+    # address; the proxy actor populates them via the runtime API
+    # (`set server <backend>/srvN addr <ip> port <port>` + `state ready`)
+    # as replicas register, and releases them (`state maint`) when
+    # replicas exit. Slot count is computed by `_compute_slot_split` at
+    # config-generation time from the current replica count + headroom.
+    server-template srv 1-4 0.0.0.0:1 check disabled
 listen stats
   bind *:8080
   stats enable
diff --git a/python/ray/serve/tests/test_haproxy_slots.py b/python/ray/serve/tests/test_haproxy_slots.py
new file mode 100644
index 000000000000..4a1b8e6db92b
--- /dev/null
+++ b/python/ray/serve/tests/test_haproxy_slots.py
@@ -0,0 +1,175 @@
+"""Unit tests for HAProxy slot pool + split helpers.
+
+These tests are pure-Python and do not require the HAProxy binary or the
+RAY_SERVE_ENABLE_HA_PROXY env flag.
+"""
+
+import pytest
+
+from ray.serve._private.haproxy import (
+    BackendSlotPool,
+    _compute_slot_split,
+)
+
+
+class TestBackendSlotPool:
+    def test_init_rejects_nonpositive_size(self):
+        with pytest.raises(ValueError):
+            BackendSlotPool("be", 0)
+        with pytest.raises(ValueError):
+            BackendSlotPool("be", -1)
+
+    def test_first_free_allocation_is_compact(self):
+        """Slots are handed out lowest-first; releases recycle the freed slot."""
+        pool = BackendSlotPool("be", pool_size=4)
+        assert pool.assign("a") == 1
+        assert pool.assign("b") == 2
+        assert pool.assign("c") == 3
+        # Release b -> slot 2 should be the next allocation.
+        assert pool.release("b") == 2
+        assert pool.assign("d") == 2
+        # Release a -> slot 1 reused next.
+        assert pool.release("a") == 1
+        assert pool.assign("e") == 1
+
+    def test_assign_is_idempotent(self):
+        pool = BackendSlotPool("be", pool_size=4)
+        slot = pool.assign("a")
+        assert pool.assign("a") == slot
+        assert pool.in_use() == 1
+
+    def test_release_of_unknown_is_noop(self):
+        pool = BackendSlotPool("be", pool_size=2)
+        assert pool.release("ghost") is None
+        assert pool.in_use() == 0
+
+    def test_full_pool_returns_none(self):
+        pool = BackendSlotPool("be", pool_size=2)
+        pool.assign("a")
+        pool.assign("b")
+        assert pool.assign("c") is None
+        assert pool.is_exhausted()
+        # After a release, room opens up again.
+        pool.release("a")
+        assert not pool.is_exhausted()
+        assert pool.assign("c") == 1
+
+    def test_mapping_returns_copy(self):
+        pool = BackendSlotPool("be", pool_size=4)
+        pool.assign("a")
+        m1 = pool.mapping()
+        pool.assign("b")
+        m2 = pool.mapping()
+        # m1 should not be mutated by later assignments
+        assert m1 == {"a": 1}
+        assert m2 == {"a": 1, "b": 2}
+
+    def test_in_use_and_free_counts(self):
+        pool = BackendSlotPool("be", pool_size=3)
+        assert pool.in_use() == 0
+        assert pool.free() == 3
+        pool.assign("a")
+        pool.assign("b")
+        assert pool.in_use() == 2
+        assert pool.free() == 1
+        pool.release("a")
+        assert pool.in_use() == 1
+        assert pool.free() == 2
+
+
+class TestComputeSlotSplit:
+    def test_empty_returns_empty(self):
+        assert _compute_slot_split({}) == {}
+
+    def test_single_backend_gets_full_budget(self):
+        result = _compute_slot_split({"only": 10}, total=128, min_per_backend=16)
+        assert result == {"only": 128}
+
+    def test_no_replicas_equal_split_above_minimum(self):
+        result = _compute_slot_split(
+            {"a": 0, "b": 0, "c": 0},
+            total=300,
+            min_per_backend=10,
+            headroom_factor=2.0,
+        )
+        # Reserved: 30. Remaining: 270 split equally → 90 each. Sum = 300.
+        assert result == {"a": 100, "b": 100, "c": 100}
+        assert sum(result.values()) == 300
+
+    def test_proportional_split_by_replica_count(self):
+        result = _compute_slot_split(
+            {"big": 100, "small": 10},
+            total=400,
+            min_per_backend=20,
+            headroom_factor=2.0,
+        )
+        # Reserved: 40. Remaining: 360.
+        # Demand: big=200, small=20 (total 220).
+        # Extra: big = 360*200/220 ≈ 327, small = 360*20/220 ≈ 32.
+        # With rounding: 327+32=359; +1 to largest fractional → 327+33=360.
+        # Final: big=20+327=347 or 348, small=20+32=33 or 32. Sum=400.
+        assert sum(result.values()) == 400
+        assert result["big"] >= result["small"]
+        assert result["big"] >= 200  # has at least replica count's worth
+        assert result["small"] >= 20  # has at least the minimum
+
+    def test_minimum_floor_respected(self):
+        """Even an idle backend gets at least min_per_backend."""
+        result = _compute_slot_split(
+            {"hot": 100, "idle": 0},
+            total=200,
+            min_per_backend=16,
+            headroom_factor=2.0,
+        )
+        assert result["idle"] == 16
+        assert sum(result.values()) <= 200
+
+    def test_pathological_too_many_backends_degrades_to_equal(self):
+        # 10 backends * min=16 = 160 > total=100, so equal split.
+        result = _compute_slot_split(
+            {f"be{i}": 0 for i in range(10)},
+            total=100,
+            min_per_backend=16,
+            headroom_factor=2.0,
+        )
+        assert sum(result.values()) == 100
+        # Each backend has at least 1, all roughly equal (10 each).
+        for size in result.values():
+            assert size >= 1
+        assert max(result.values()) - min(result.values()) <= 1
+
+    def test_sum_invariant_does_not_exceed_total(self):
+        """Property: sum(result) <= total for all reasonable inputs."""
+        import random
+
+        random.seed(42)
+        for _ in range(50):
+            n = random.randint(1, 30)
+            counts = {f"be{i}": random.randint(0, 50) for i in range(n)}
+            total = random.randint(64, 8192)
+            result = _compute_slot_split(
+                counts, total=total, min_per_backend=16, headroom_factor=2.0
+            )
+            assert sum(result.values()) <= total
+            assert set(result.keys()) == set(counts.keys())
+
+    def test_headroom_factor_scales_allocation(self):
+        """Higher headroom_factor gives the busier backend more slots."""
+        low = _compute_slot_split(
+            {"busy": 50, "idle": 0},
+            total=400,
+            min_per_backend=16,
+            headroom_factor=1.0,
+        )
+        high = _compute_slot_split(
+            {"busy": 50, "idle": 0},
+            total=400,
+            min_per_backend=16,
+            headroom_factor=4.0,
+        )
+        # The "idle" backend's allocation can't grow (only min). But the
+        # "busy" backend's share grows with headroom_factor — except
+        # the total is fixed at 400 and the idle min is 16, so busy
+        # caps at 384 regardless. Just check monotonicity:
+        # busy(high_factor) >= busy(low_factor)
+        assert high["busy"] >= low["busy"]

From 2ca0d1f0401cc7994e3354f4076466dbc2125935 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Mon, 11 May 2026 09:52:23 +0000
Subject: [PATCH 20/26] [serve] Fix server-template bugs found in self-review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five distinct bugs found while re-reading the prior commit:

1. CRITICAL — slot split recomputed on every broadcast → reload-storm.
   `_build_name_to_backend_configs` unconditionally called
   `_compute_slot_split`. The proportional split shifts every time any
   backend's replica count changes (since ratios reallocate among all
   backends), so every broadcast saw `slot_pool_size` differ →
   structural diff → reload. The runtime-API path was effectively never
   taken. Fix: cache `_current_split` on HAProxyManager; recompute only
   when the backend set changes or any backend has overflowed its
   current allocation. Steady-state churn now hits the runtime-API path
   as intended; reloads only happen on topology changes or genuine
   exhaustion.

2. HIGH — `_rebind_slot_pools` failure left inconsistent state.
   The method pre-assigns slots in the pool BEFORE sending runtime-API
   commands. On send failure the pool said slots were assigned but
   HAProxy still had them at the placeholder; next broadcast with the
   same servers would see an empty diff and skip work, leaving HAProxy
   serving nothing forever. Fix: on send failure wipe both pools and
   `backend_configs` so the next broadcast goes through the first-time
   setup path → reload → fresh rebind with a clean admin socket.

3. HIGH (tests) — `BackendConfig(servers=[...])` without
   `slot_pool_size` rendered `server-template srv 1-0`, which HAProxy
   rejects. Production sets the size via `_compute_slot_split`, but
   test fixtures construct BackendConfig directly. Fix: `__post_init__`
   defaults `slot_pool_size` to `max(len(servers)*2, MIN_SLOTS)` when
   not set; production assignment after construction still wins.

4. HIGH — `enable_hap_optimization` (server-state file) collided with
   server-template. The state file persists slot ↔ replica mappings
   across reloads and is loaded by the new HAProxy on startup. Our
   `_rebind_slot_pools` then assigns slots first-free from a FRESH pool
   and sends `set server addr/state` commands. Result: the same replica
   ends up bound to two slots (one restored by state-file load, one
   freshly assigned by the actor), doubling traffic to it. Compounds
   over reloads. Fix: stop loading the state file -- the mapping is now
   owned by the actor's in-memory pool. Removed `_save_server_state`
   call from `_graceful_reload` (and the now-unused method) and dropped
   `load-server-state-from-file global` from the template. Kept `-x`
   FD transfer (independent, still useful for socket migration).

5. LOW — "Slot population OK: X servers placed" log used the desired
   total instead of the actual placed count. Showed the wrong number on
   mid-batch pool exhaustion. Fix: track `placed_count` directly.

Test golden config updated for the removed `load-server-state-from-file`
line. No new tests; the existing slot-pool unit tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py          | 142 ++++++++++++------
 .../ray/serve/_private/haproxy_templates.py   |   9 +-
 python/ray/serve/tests/test_haproxy_api.py    |   7 +-
 3 files changed, 109 insertions(+), 49 deletions(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 209bb7a018be..38821654a9ae 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -331,10 +331,26 @@ class BackendConfig:
     # Size of this backend's `server-template` slot pool. The rendered
     # config emits `server-template srv 1-{slot_pool_size} ...` so the
     # backend can hold up to `slot_pool_size` replicas before the runtime
-    # API has to fall back to a reload to re-split. Set by HAProxyManager
-    # via `_compute_slot_split` on every config (re)generation.
+    # API has to fall back to a reload to re-split. In production this
+    # is set by HAProxyManager via `_compute_slot_split` on every config
+    # (re)generation. Defaults to a sensible non-zero value via
+    # `__post_init__` so direct-construction callers (typically tests)
+    # don't need to specify it -- a zero value would render an invalid
+    # `server-template srv 1-0` line.
     slot_pool_size: int = 0
 
+    def __post_init__(self) -> None:
+        # If the caller didn't set a slot pool size, derive a sensible
+        # default from the current server count. Production sets this
+        # explicitly post-init via `_compute_slot_split`; this default is
+        # for direct-construction callers (tests) so an unset value
+        # doesn't produce an invalid `server-template 1-0` line.
+        if self.slot_pool_size <= 0:
+            self.slot_pool_size = max(
+                len(self.servers) * 2,
+                RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND,
+            )
+
     def build_health_check_config(self, global_config: "HAProxyConfig") -> dict:
         """Build health check configuration for HAProxy backend.
 
@@ -914,12 +930,15 @@ async def _rebind_slot_pools(self) -> None:
             method binds those servers to slots without requiring the
             caller to make an extra runtime-API call.
 
-        Step 3 is best-effort: on failure HAProxy is up but slots are
-        unpopulated; the next broadcast will diff against
-        `backend_configs` (which already reflects desired state) and
-        find no work to do, leaving slots empty. To recover, callers
-        should observe a failed apply and retry. In practice failures
-        here are rare since we just restarted HAProxy.
+        On step-3 failure we wipe `_slot_pools` AND `backend_configs`
+        so the next broadcast goes through the first-time-setup path
+        (returns False, caller reloads, this method runs again with a
+        fresh HAProxy admin socket). Without this wipe, the actor's
+        cached state would say "slots are assigned" while HAProxy's
+        templated slots are still at the placeholder, and subsequent
+        diffs would find no work to do -- HAProxy would silently serve
+        nothing. Failures here are rare in practice since we just
+        restarted HAProxy.
         """
         new_pools: Dict[str, BackendSlotPool] = {}
         for name, bc in self.backend_configs.items():
@@ -940,6 +959,7 @@ async def _rebind_slot_pools(self) -> None:
         if not self.backend_configs:
             return
         commands: List[str] = []
+        placed_count = 0
         for backend_name, bc in self.backend_configs.items():
             pool = self._slot_pools.get(backend_name)
             if pool is None or not bc.servers:
@@ -960,21 +980,36 @@ async def _rebind_slot_pools(self) -> None:
                     f"addr {server.host} port {server.port}"
                 )
                 commands.append(f"set server {backend_name}/srv{slot} state ready")
+                placed_count += 1
         if not commands:
             return
         try:
             await self._send_runtime_commands(commands)
             logger.info(
                 "Slot population OK: %d servers placed across %d backends",
-                sum(len(bc.servers) for bc in self.backend_configs.values()),
+                placed_count,
                 len(self._slot_pools),
                 extra={"log_to_stderr": True},
             )
         except Exception as e:
-            logger.warning(
-                f"Slot population failed after reload: {e}. HAProxy is up "
-                f"but no replicas are routable until the next broadcast retry."
+            # On failure, the in-memory pool has assignments that HAProxy
+            # doesn't know about: the templated slots are still disabled
+            # at the placeholder address. The next broadcast with
+            # unchanged servers would see an empty diff and skip the
+            # work, leaving HAProxy serving nothing. Force recovery by
+            # wiping our cached state so the next broadcast goes through
+            # the first-time-setup path (returns False, caller reloads,
+            # this method runs again with a fresh HAProxy admin socket).
+            logger.error(
+                "Slot population failed after reload: %s. Wiping pool and "
+                "backend_configs to force a clean recovery on the next "
+                "broadcast.",
+                e,
+                extra={"log_to_stderr": True},
             )
+            self._slot_pools = {}
+            self.backend_configs = {}
+            return
         # Best-effort snapshot for live debugging.
         self._write_slot_mapping_for_debug()
 
@@ -1065,12 +1100,6 @@ async def _start_and_wait_for_haproxy(
         )
         return proc
 
-    async def _save_server_state(self) -> None:
-        """Save the server state to the file."""
-        server_state = await self._send_socket_command("show servers state")
-        with open(self.cfg.server_state_file, "w") as f:
-            f.write(server_state)
-
     async def _terminate_old_procs(self) -> None:
         """SIGKILL any old HAProxy procs we're still tracking.
 
@@ -1152,24 +1181,14 @@ async def _graceful_reload(self) -> None:
             await self._wait_for_hap_availability(old_proc)
             phases.append(f"old_proc_check={time.monotonic() - t0:.2f}s")
 
-            # Save server state if optimization is enabled. Treat failures
-            # as non-fatal: the state file is purely an optimization (lets
-            # the new HAProxy preserve per-server connection counts across
-            # reload), and a stale or missing file just causes HAProxy to
-            # start counters from zero. A `show servers state` admin-socket
-            # timeout here under load used to abort the entire reload --
-            # which then cascaded since the next broadcast retried with
-            # the same overloaded socket.
-            if self.cfg.enable_hap_optimization:
-                t0 = time.monotonic()
-                try:
-                    await self._save_server_state()
-                    phases.append(f"save_state={time.monotonic() - t0:.2f}s")
-                except Exception as e:
-                    phases.append(f"save_state_skipped={time.monotonic() - t0:.2f}s")
-                    logger.warning(
-                        f"Skipping HAProxy server state save before reload: {e}"
-                    )
+            # Note: server-state file save/load is intentionally skipped.
+            # With `server-template`, every reload runs `_rebind_slot_pools`
+            # which re-assigns slots first-free from a fresh pool. Loading
+            # the state file in the new HAProxy would also restore the
+            # previous mapping; the two would collide and the same replica
+            # would end up bound to two slots (both routable, doubling
+            # traffic to it). We keep `-x` FD transfer for seamless
+            # listener migration since that's independent of slot state.
 
             # Start new HAProxy process with -sf flag to gracefully take over from old process
             # Use -x socket transfer for seamless reloads if optimization is enabled
@@ -1981,6 +2000,16 @@ def __init__(
         self._coalesce_task: Optional[asyncio.Task] = None
         self._coalesce_pending = False
 
+        # Cached slot-pool split, keyed by backend name. Recomputed only
+        # when the set of backends changes or a backend's replica count
+        # exceeds its current allocation -- without this caching, the
+        # proportional split would shift on every broadcast (since
+        # changing any backend's replica count changes the ratios for
+        # all backends), every broadcast would see `slot_pool_size`
+        # differ in the diff, and every broadcast would trigger a
+        # reload, defeating the runtime-API path entirely.
+        self._current_split: Dict[str, int] = {}
+
         self.long_poll_client = long_poll_client or LongPollClient(
             ray.get_actor(SERVE_CONTROLLER_NAME, namespace=SERVE_NAMESPACE),
             {
@@ -2248,12 +2277,20 @@ async def _apply_backend_update(
     def _build_name_to_backend_configs(self) -> Dict[str, BackendConfig]:
         """Snapshot the current target groups + fallbacks into backend configs.
 
-        Also computes the per-backend slot pool size via `_compute_slot_split`
-        and stamps each BackendConfig with its share. Slot pool sizes are
-        part of the rendered config (they show up in `server-template
-        srv 1-N`), so a change in the split is treated as a structural
-        change by `_compute_backend_diff` -- which is what we want: when
-        the split shifts, we need a reload to update the template.
+        Stamps each BackendConfig with its slot-pool size from the
+        cached split. The split is recomputed only when:
+          - The set of backends changes (a backend is added or removed), or
+          - Any backend's current replica count exceeds its allocated
+            slot pool size (i.e. it has overflowed and needs more).
+
+        Without this caching, the proportional split would shift on
+        every replica-count change (since the ratios reallocate among
+        all backends). That would make `slot_pool_size` differ in the
+        diff on every broadcast, marking it as a structural change,
+        and forcing a reload on every broadcast -- defeating the entire
+        runtime-API path. With caching, the split is stable until a
+        backend overflows or the topology changes, both of which
+        already justify a reload.
         """
         backend_configs = []
         for target_group in self._target_groups:
@@ -2266,11 +2303,26 @@ def _build_name_to_backend_configs(self) -> Dict[str, BackendConfig]:
             backend_config = self._create_backend_config(target_group, fallback_target)
             backend_configs.append(backend_config)
 
-        # Stamp each backend with its slot-pool size from the split.
         replica_counts = {bc.name: len(bc.servers) for bc in backend_configs}
-        split = _compute_slot_split(replica_counts)
+        backend_set_changed = set(replica_counts) != set(self._current_split)
+        any_overflow = any(
+            replica_counts[name] > self._current_split.get(name, 0)
+            for name in replica_counts
+        )
+        if backend_set_changed or any_overflow:
+            old_split = self._current_split
+            self._current_split = _compute_slot_split(replica_counts)
+            if self._current_split != old_split:
+                logger.info(
+                    "Slot split recomputed (set_changed=%s, overflow=%s): %s",
+                    backend_set_changed,
+                    any_overflow,
+                    self._current_split,
+                    extra={"log_to_stderr": True},
+                )
+
         for bc in backend_configs:
-            bc.slot_pool_size = split.get(bc.name, 0)
+            bc.slot_pool_size = self._current_split.get(bc.name, 0)
 
         return {bc.name: bc for bc in backend_configs}
 
diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index aed19d58e308..1afc61689055 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -79,9 +79,12 @@
     errorfile 502 {{ config.error_file_path }}
     errorfile 504 {{ config.error_file_path }}
     {%- endif %}
-    {%- if config.enable_hap_optimization %}
-    load-server-state-from-file global
-    {%- endif %}
+    # `load-server-state-from-file` is intentionally not emitted: with
+    # `server-template`, the slot ↔ replica mapping is owned by the
+    # proxy actor's in-memory pool. Loading a saved state file would
+    # restore the previous mapping inside HAProxy while the actor
+    # re-assigns slots first-free from a fresh pool, producing
+    # duplicate slots both pointing at the same replica.
     balance {{ config.balance_algorithm }}
 frontend prometheus
     bind :{{ config.metrics_port }}
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index d19365159b04..3e5aa7034b6d 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -314,7 +314,12 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     # Normalize 502 and 504 errors to 500 per Serve's default behavior
     errorfile 502 {temp_dir}/500.http
     errorfile 504 {temp_dir}/500.http
-    load-server-state-from-file global
+    # `load-server-state-from-file` is intentionally not emitted: with
+    # `server-template`, the slot ↔ replica mapping is owned by the
+    # proxy actor's in-memory pool. Loading a saved state file would
+    # restore the previous mapping inside HAProxy while the actor
+    # re-assigns slots first-free from a fresh pool, producing
+    # duplicate slots both pointing at the same replica.
     balance random(2)
 frontend prometheus
     bind :9101

From b6f2fd8b884ad9ad28113bfd86927f9851c78e71 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Mon, 11 May 2026 09:57:49 +0000
Subject: [PATCH 21/26] [serve] Filter HAProxy stats to active slots only

With server-template, `show stat` reports every templated slot
(srv1..srv4096) including placeholders in MAINT. Callers of
`get_all_stats` (and via it `total_servers`, `active_servers`,
`HAProxyStats.*`) were getting inflated counts that don't reflect
real replicas. `is_drained` / `is_system_idle` happened to be fine
because placeholder slots have 0 sessions and 0 queued, so the
session sums were correct. But anything counting servers --
including the existing integration tests -- broke.

Filter in `get_all_stats`:
* Slots actively assigned in the in-memory pool: kept.
* Slots with current_sessions > 0 or queued > 0: kept (recently
  released slots still draining in-flight work; `is_system_idle`
  needs to see those).
* Fallback servers (named after the actor, not srv-prefixed): kept.
* Everything else (placeholders): dropped.

Added a public `BackendSlotPool.assigned_slots() -> Set[int]`
accessor so the filter doesn't have to reach into private state.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 57 ++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 38821654a9ae..6febd96a7e7c 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -769,6 +769,10 @@ def mapping(self) -> Dict[str, int]:
         """Return a copy of the replica → slot mapping (for diagnostics)."""
         return dict(self._replica_to_slot)
 
+    def assigned_slots(self) -> Set[int]:
+        """Return the set of slot numbers currently assigned to replicas."""
+        return set(self._replica_to_slot.values())
+
 
 def _compute_slot_split(
     backend_to_replica_count: Dict[str, int],
@@ -1452,23 +1456,52 @@ async def get_all_stats(self) -> Dict[str, Dict[str, ServerStats]]:
         Returns only application backends configured in self.backend_configs,
         excluding HAProxy internal components (frontends, default_backend, stats).
         Also excludes BACKEND aggregate entries, returning only individual servers.
+
+        With server-template, `show stat` reports every templated slot
+        (e.g. srv1..srv4096) -- including placeholders at the disabled
+        address. Including those would inflate `total_servers` and skew
+        any caller that counts servers. We filter to:
+          - Slots actively assigned in our in-memory pool (i.e. bound
+            to a real replica).
+          - Slots with non-zero current sessions or queued requests
+            (recently released slots still draining; we want
+            `is_system_idle` to see those).
+          - The fallback server (named after the actor, not srv-prefixed).
         """
         try:
             stats_output = await self._send_socket_command("show stat")
             all_stats = self._parse_haproxy_csv_stats(stats_output)
 
-            # Filter to only return application backends (ones in backend_configs)
-            # Exclude HAProxy internal components like frontends, default_backend, stats
-            # Also exclude BACKEND aggregate entries, keep only individual servers
-            return {
-                backend_name: {
-                    server_name: stats
-                    for server_name, stats in servers.items()
-                    if server_name != "BACKEND"
-                }
-                for backend_name, servers in all_stats.items()
-                if backend_name in self.backend_configs
-            }
+            filtered: Dict[str, Dict[str, ServerStats]] = {}
+            for backend_name, servers in all_stats.items():
+                if backend_name not in self.backend_configs:
+                    continue
+                pool = self._slot_pools.get(backend_name)
+                active_slots: Set[int] = (
+                    pool.assigned_slots() if pool is not None else set()
+                )
+                kept: Dict[str, ServerStats] = {}
+                for server_name, stats in servers.items():
+                    if server_name == "BACKEND":
+                        continue
+                    if not server_name.startswith("srv"):
+                        # Non-template server (e.g. fallback). Keep unconditionally.
+                        kept[server_name] = stats
+                        continue
+                    try:
+                        slot_num = int(server_name[3:])
+                    except ValueError:
+                        # Unexpected naming; keep defensively.
+                        kept[server_name] = stats
+                        continue
+                    if slot_num in active_slots:
+                        kept[server_name] = stats
+                    elif stats.current_sessions > 0 or stats.queued > 0:
+                        # Released slot still draining in-flight work.
+                        kept[server_name] = stats
+                    # else: placeholder slot; skip
+                filtered[backend_name] = kept
+            return filtered
         except Exception as e:
             logger.error(f"Failed to get HAProxy stats: {e}")
             return {}

From 6029404b66abf336144cc6bd7062115f59ef4f3d Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Mon, 11 May 2026 10:59:52 +0000
Subject: [PATCH 22/26] [serve] Wait for new HAProxy proc to claim admin socket
 before runtime commands

After a graceful reload, `_start_and_wait_for_haproxy` returns as soon
as the new proc's frontend listener accepts TCP -- which is fast via
`-x` FD transfer. But the runtime-API admin socket binding lags behind
that by tens to hundreds of milliseconds. During that lag, connections
to the admin-socket path still go to the OLD proc (it still has the
file bound). For the initial reload (when the old proc started with an
empty config), that means our `set server backend/srvN ...` commands
hit a process that doesn't know about any backends -- "No such
backend." -- and `_rebind_slot_pools` wipes pool+backend_configs to
force recovery, which schedules another reload, which races the same
window, and so on.

Visible failure pattern: bursts of 503s on high-traffic endpoints
during cluster bring-up (~3% aggregate failure rate observed on a
recent load test, ~7% on /highscale-stress/ specifically), centered
on the period between the initial empty-config start and the first
broadcast's reload completing.

Fix: probe `show info` for the new proc's PID after start and before
issuing any runtime commands. Polls every 50ms with a 10s deadline.
On the rare timeout, log a warning and proceed -- runtime commands
may still fail, in which case the existing reload-fallback path
kicks in.

Applied in three reload paths:
* HAProxyApi.start() (initial start)
* _graceful_reload (normal graceful reload)
* _graceful_reload fresh-start branch

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/haproxy.py | 72 ++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 6febd96a7e7c..bd90f8f142e5 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -1168,6 +1168,14 @@ async def _graceful_reload(self) -> None:
                 self._proc = await self._start_and_wait_for_haproxy()
                 phases.append(f"fresh_start={time.monotonic() - t0:.2f}s")
 
+                # On fresh-start there is no old process competing for
+                # the admin socket, so this should resolve immediately;
+                # we still wait defensively in case the new proc takes
+                # a moment to bind.
+                t0 = time.monotonic()
+                await self._wait_for_admin_socket_bound_by(self._proc.pid)
+                phases.append(f"admin_wait={time.monotonic() - t0:.2f}s")
+
                 # New process started with empty server-template slots;
                 # rebind pools from backend_configs and repopulate.
                 t0 = time.monotonic()
@@ -1211,6 +1219,18 @@ async def _graceful_reload(self) -> None:
             if old_proc is not None:
                 self._old_procs.append(old_proc)
 
+            # `_start_and_wait_for_haproxy` returns as soon as the new
+            # proc's frontend listener accepts TCP -- which it does very
+            # early via `-x` FD transfer. The runtime-API admin socket
+            # binding lags behind that, and during the lag connections
+            # to the socket path still go to the OLD proc (which has
+            # the old config and rejects new-config commands with
+            # "No such backend"). Wait for the admin socket to be owned
+            # by the new proc before we issue any runtime commands.
+            t0 = time.monotonic()
+            await self._wait_for_admin_socket_bound_by(self._proc.pid)
+            phases.append(f"admin_wait={time.monotonic() - t0:.2f}s")
+
             # New process has empty server-template slots; rebind pools
             # from backend_configs and repopulate.
             t0 = time.monotonic()
@@ -1258,6 +1278,52 @@ async def _is_listener_bound(self) -> bool:
             pass
         return True
 
+    async def _wait_for_admin_socket_bound_by(
+        self, target_pid: int, timeout_s: float = 10.0
+    ) -> bool:
+        """Wait until the admin socket is bound by the proc with `target_pid`.
+
+        After a graceful reload, the new HAProxy process binds the
+        frontend listener very early (often within a few ms via the `-x`
+        FD transfer) but the runtime-API admin socket binding can lag
+        behind by tens to hundreds of milliseconds. During that window
+        the old proc still owns the admin-socket path, so any
+        `_send_socket_command` (e.g. our `set server` / `show servers
+        state` calls) gets routed to the OLD process -- which has the
+        OLD config and returns "No such backend" for templated slots in
+        the new config.
+
+        Polls `show info` and parses the `Pid:` line until it matches
+        the new proc, or until `timeout_s` elapses. Returns True if
+        confirmed, False on timeout. Failures here are logged but not
+        fatal: callers should still attempt their runtime commands, and
+        if those fail the regular reload-fallback path kicks in.
+        """
+        deadline = time.monotonic() + timeout_s
+        pid_re = re.compile(r"^Pid:\s*(\d+)\s*$", re.MULTILINE)
+        last_seen_pid: Optional[int] = None
+        while time.monotonic() < deadline:
+            try:
+                info = await self._send_socket_command("show info")
+                match = pid_re.search(info)
+                if match:
+                    seen = int(match.group(1))
+                    last_seen_pid = seen
+                    if seen == target_pid:
+                        return True
+            except Exception:
+                pass
+            await asyncio.sleep(0.05)
+        logger.warning(
+            "Admin socket did not bind to new HAProxy pid=%d within %.1fs "
+            "(last seen pid=%s); proceeding anyway -- runtime commands may "
+            "fail and trigger another reload.",
+            target_pid,
+            timeout_s,
+            last_seen_pid,
+        )
+        return False
+
     async def probe_http_listener(self) -> bool:
         """Probe the HAProxy HTTP listener via /-/healthz.
 
@@ -1441,6 +1507,12 @@ async def start(self) -> None:
                 logger.info("Successfully generated HAProxy config file.")
 
             self._proc = await self._start_and_wait_for_haproxy()
+            # No old process competes for the admin socket on initial
+            # start, but the new proc still needs a moment to bind it
+            # after the frontend listener comes up. Wait briefly so
+            # `_rebind_slot_pools`'s runtime-API calls don't race the
+            # bind.
+            await self._wait_for_admin_socket_bound_by(self._proc.pid)
             # Rebuild in-memory pools and populate slots from any
             # already-set backend_configs (test fixtures or production
             # reloads that wrote backend_configs before calling start).

From 7330a996b87e02b8ede324f7d3f11697db61dbaf Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Mon, 11 May 2026 12:56:44 +0000
Subject: [PATCH 23/26] [serve] Isolate HAProxy admin socket on dedicated
 thread; bump socket/hard-stop timeouts

Production load tests showed a reload cascade: under traffic spikes the
data-plane threads saturated and starved the admin socket, runtime-API
commands hit their 15s timeout, the proxy actor fell back to a full
reload, and the in-flight streaming and long-running requests on the
old HAProxy process were chopped when hard-stop-after fired.

Three changes that together break the cascade and survive the reloads
it can't prevent:

- haproxy_templates.py: pin both stats sockets to `thread 1` and
  restrict all `bind` lines (prometheus, http_frontend, stats) to
  `thread 2-{nbthread}`. With admin work on a dedicated thread, runtime
  commands stay responsive even when threads 2..N are at 100% CPU.
- constants.py: RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S 15s -> 60s as a
  safety margin for rare slow windows that isolation alone doesn't
  cover.
- constants.py: RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S 120s -> 600s so
  long-running requests (e.g. SSE streams, /long-runner/-style
  endpoints) survive across reloads instead of being TCP-RST'd.

Thread isolation requires nbthread >= 2; HAProxyConfig.__post_init__
enforces this so a misconfigured template can't render `thread 2-1`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/constants.py         | 14 +++++++-------
 python/ray/serve/_private/haproxy.py           | 11 +++++++++++
 python/ray/serve/_private/haproxy_templates.py | 15 ++++++++++-----
 python/ray/serve/tests/test_haproxy_api.py     | 17 +++++++++++------
 4 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 7be17ac96894..d6952dc2f4ca 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -716,7 +716,7 @@
 
 # HAProxy hard stop after timeout
 RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S = int(
-    os.environ.get("RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S", "120")
+    os.environ.get("RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S", "600")
 )
 
 # HAProxy metrics export port
@@ -801,13 +801,13 @@
 # Connect/read timeout (seconds) for HAProxy admin-socket commands. The
 # CLI mux serializes admin operations behind HTTP worker dispatch, so a
 # batch of `add server` / `del server` / etc. commands can routinely take
-# more than a few seconds while HAProxy is serving heavy traffic. The
-# previous hardcoded 5s was tight enough that broadcasts during
-# autoscaling churn hit it routinely, falling back to expensive full
-# reloads. 15s lets the slower-but-functional runtime-API path complete
-# in those cases instead of cascading into reloads.
+# more than a few seconds while HAProxy is serving heavy traffic. With
+# the admin socket pinned to its own thread (see haproxy_templates.py)
+# this should usually stay fast, but a generous ceiling keeps the
+# runtime-API path alive across rare slow windows instead of cascading
+# into a fallback reload.
 RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S = float(
-    os.environ.get("RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S", "15")
+    os.environ.get("RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S", "60")
 )
 
 # Total number of server slots to pre-allocate across all backends via
diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index bd90f8f142e5..8f86a70e75da 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -489,6 +489,17 @@ class HAProxyConfig:
 
     is_head: bool = False
 
+    def __post_init__(self) -> None:
+        # Thread isolation requires at least 2 threads: thread 1 is
+        # reserved for the admin/stats sockets, threads 2..nbthread serve
+        # data-plane traffic (see haproxy_templates.py). With nbthread<2
+        # the template would emit `thread 2-1` which HAProxy rejects.
+        if self.nbthread < 2:
+            raise ValueError(
+                f"nbthread must be >= 2 for admin/data-plane thread "
+                f"isolation (got {self.nbthread})."
+            )
+
     @property
     def transfer_socket_path(self) -> str:
         """Path of the dedicated FD-transfer admin socket.
diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index 1afc61689055..372b12208721 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -29,12 +29,17 @@
     # Keeping FD transfer on its own socket prevents the `_getsocks`
     # exchange from queueing behind add/del/disable/enable server commands
     # during a reload.
-    stats socket {{ config.transfer_socket_path }} mode 666 level admin expose-fd listeners
+    # `thread 1` pins admin work to a dedicated thread (see frontend
+    # binds below, which restrict data-plane traffic to thread 2+). When
+    # the data plane saturates threads 2..N, thread 1 stays free for
+    # runtime-API commands, preventing the cascade where slow admin
+    # responses trigger fallback reloads.
+    stats socket {{ config.transfer_socket_path }} mode 666 level admin expose-fd listeners thread 1
     # Runtime-API + stats socket. Used by the proxy actor for
     # `add server`, `del server`, `show stat`, etc. No `expose-fd
     # listeners` here — FD transfer is reserved for the dedicated socket
     # above so the two streams don't contend on the same connection slot.
-    stats socket {{ config.socket_path }} mode 666 level admin
+    stats socket {{ config.socket_path }} mode 666 level admin thread 1
     stats timeout 30s
     maxconn {{ config.maxconn }}
     nbthread {{ config.nbthread }}
@@ -87,12 +92,12 @@
     # duplicate slots both pointing at the same replica.
     balance {{ config.balance_algorithm }}
 frontend prometheus
-    bind :{{ config.metrics_port }}
+    bind :{{ config.metrics_port }} thread 2-{{ config.nbthread }}
     mode http
     http-request use-service prometheus-exporter if { path {{ config.metrics_uri }} }
     no log
 frontend http_frontend
-    bind {{ config.frontend_host }}:{{ config.frontend_port }}
+    bind {{ config.frontend_host }}:{{ config.frontend_port }} thread 2-{{ config.nbthread }}
 {{ healthz_rules|safe }}
     # Routes endpoint
     acl routes path -i /-/routes
@@ -168,7 +173,7 @@
     {%- endif %}
 {%- endfor %}
 listen stats
-  bind *:{{ config.stats_port }}
+  bind *:{{ config.stats_port }} thread 2-{{ config.nbthread }}
   stats enable
   stats uri {{ config.stats_uri }}
   stats refresh 1s
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index 3e5aa7034b6d..cae348ce6e7b 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -276,18 +276,23 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     # Keeping FD transfer on its own socket prevents the `_getsocks`
     # exchange from queueing behind add/del/disable/enable server commands
     # during a reload.
-    stats socket {socket_path}.fd mode 666 level admin expose-fd listeners
+    # `thread 1` pins admin work to a dedicated thread (see frontend
+    # binds below, which restrict data-plane traffic to thread 2+). When
+    # the data plane saturates threads 2..N, thread 1 stays free for
+    # runtime-API commands, preventing the cascade where slow admin
+    # responses trigger fallback reloads.
+    stats socket {socket_path}.fd mode 666 level admin expose-fd listeners thread 1
     # Runtime-API + stats socket. Used by the proxy actor for
     # `add server`, `del server`, `show stat`, etc. No `expose-fd
     # listeners` here — FD transfer is reserved for the dedicated socket
     # above so the two streams don't contend on the same connection slot.
-    stats socket {socket_path} mode 666 level admin
+    stats socket {socket_path} mode 666 level admin thread 1
     stats timeout 30s
     maxconn 1000
     nbthread 2
     server-state-base /tmp/haproxy-serve
     server-state-file /tmp/haproxy-serve/server-state
-    hard-stop-after 120s
+    hard-stop-after 600s
 defaults
     mode http
     option log-health-checks
@@ -322,12 +327,12 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     # duplicate slots both pointing at the same replica.
     balance random(2)
 frontend prometheus
-    bind :9101
+    bind :9101 thread 2-2
     mode http
     http-request use-service prometheus-exporter if {{ path /metrics }}
     no log
 frontend http_frontend
-    bind *:8000
+    bind *:8000 thread 2-2
     # Health check endpoint
     acl healthcheck path -i /-/healthz
     # Suppress logging for health checks
@@ -414,7 +419,7 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     # config-generation time from the current replica count + headroom.
     server-template srv 1-4 0.0.0.0:1 check disabled
 listen stats
-  bind *:8080
+  bind *:8080 thread 2-2
   stats enable
   stats uri /mystats
   stats refresh 1s

From eac17a67840c6ebccf3617bb706617813d99f799 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Mon, 11 May 2026 17:42:09 +0000
Subject: [PATCH 24/26] [serve] Revert HAProxy thread isolation; drain stderr +
 dump admin-socket diagnostics

Thread isolation didn't fix the admin-socket cascade. Logs from the
post-isolation run show admin_wait consistently hitting the full 60s
ceiling -- meaning the socket is unresponsive, not contended -- and
rebind_slots variance got worse (single-threaded admin processing).

The likely root cause is now: HAProxy in `-db` debug mode emits per
backend-state-change / health-check-failure lines on stderr. The proxy
actor leaves stderr=PIPE without an active reader, so the 64KB pipe
buffer fills under sustained churn and HAProxy blocks on its next
stderr write -- which freezes the admin socket (same proc) and looks
identical to a deadlock from outside.

- haproxy_templates.py: remove `thread 1` from stats sockets and
  `thread 2-N` from frontend/stats binds, restoring HAProxy's default
  thread distribution (all 4 threads handle admin work in parallel).
- haproxy.py: drop nbthread>=2 validation from HAProxyConfig.
- haproxy.py: spawn `_drain_proc_stderr` after each successful start
  to copy HAProxy stderr to a per-pid log file
  (`<socket_path>.stderr.<pid>.log`), keeping the OS pipe buffer
  drained. Bounded to 10MB with truncate-on-roll.
- haproxy.py: on admin-socket-wait timeout, dump a snapshot
  (`_dump_admin_socket_diagnostics`) of the socket path's filesystem
  state, the new proc's `/proc/<pid>/status` (State: line shows
  Sleeping vs Running -- the smoking gun for a stderr write block),
  and a tail of the per-pid stderr log so the next run carries its
  own diagnostic evidence.
- constants.py: trim the SOCKET_TIMEOUT_S comment that referenced the
  now-removed thread isolation. Keep 60s timeout and 600s
  hard-stop-after; both are independently useful.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/constants.py        |   8 +-
 python/ray/serve/_private/haproxy.py          | 120 ++++++++++++++++--
 .../ray/serve/_private/haproxy_templates.py   |  15 +--
 python/ray/serve/tests/test_haproxy_api.py    |  15 +--
 4 files changed, 121 insertions(+), 37 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index d6952dc2f4ca..847ff05d0eb7 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -801,11 +801,9 @@
 # Connect/read timeout (seconds) for HAProxy admin-socket commands. The
 # CLI mux serializes admin operations behind HTTP worker dispatch, so a
 # batch of `add server` / `del server` / etc. commands can routinely take
-# more than a few seconds while HAProxy is serving heavy traffic. With
-# the admin socket pinned to its own thread (see haproxy_templates.py)
-# this should usually stay fast, but a generous ceiling keeps the
-# runtime-API path alive across rare slow windows instead of cascading
-# into a fallback reload.
+# more than a few seconds while HAProxy is serving heavy traffic. A
+# generous ceiling keeps the runtime-API path alive across slow windows
+# instead of cascading into a fallback reload.
 RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S = float(
     os.environ.get("RAY_SERVE_HAPROXY_SOCKET_TIMEOUT_S", "60")
 )
diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index 8f86a70e75da..e6c9a6398192 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -489,17 +489,6 @@ class HAProxyConfig:
 
     is_head: bool = False
 
-    def __post_init__(self) -> None:
-        # Thread isolation requires at least 2 threads: thread 1 is
-        # reserved for the admin/stats sockets, threads 2..nbthread serve
-        # data-plane traffic (see haproxy_templates.py). With nbthread<2
-        # the template would emit `thread 2-1` which HAProxy rejects.
-        if self.nbthread < 2:
-            raise ValueError(
-                f"nbthread must be >= 2 for admin/data-plane thread "
-                f"isolation (got {self.nbthread})."
-            )
-
     @property
     def transfer_socket_path(self) -> str:
         """Path of the dedicated FD-transfer admin socket.
@@ -1106,15 +1095,71 @@ async def _start_and_wait_for_haproxy(
             raise
 
         wait_elapsed = time.monotonic() - wait_start
+
+        # Drain HAProxy stderr in the background. `-db` debug mode
+        # emits lines per backend-state change, per health-check fail,
+        # etc., and with many backends a busy proxy can fill the OS
+        # pipe buffer (64KB default on Linux) within seconds. With no
+        # active reader, HAProxy then blocks on its next stderr write
+        # -- which freezes the admin socket (same proc) and presents
+        # to the actor as a 60s timeout on `show info`. Drain to a
+        # per-pid log file so the buffer never fills.
+        stderr_log_path = self._stderr_log_path_for(proc.pid)
+        asyncio.create_task(self._drain_proc_stderr(proc, stderr_log_path))
+
         logger.info(
-            "HAProxy started (PID=%s): spawn=%.2fs wait=%.2fs args=%s",
+            "HAProxy started (PID=%s): spawn=%.2fs wait=%.2fs args=%s "
+            "(stderr -> %s)",
             proc.pid,
             spawn_elapsed,
             wait_elapsed,
             extra_args,
+            stderr_log_path,
         )
         return proc
 
+    def _stderr_log_path_for(self, pid: int) -> str:
+        """Deterministic per-pid stderr log path.
+
+        Co-located with the admin socket so callers (notably
+        `_wait_for_admin_socket_bound_by`) can reconstruct it from
+        the pid alone.
+        """
+        return f"{self.cfg.socket_path}.stderr.{pid}.log"
+
+    async def _drain_proc_stderr(
+        self,
+        proc: asyncio.subprocess.Process,
+        log_path: str,
+        max_bytes: int = 10_000_000,
+    ) -> None:
+        """Drain HAProxy stderr to a bounded per-pid log file."""
+        try:
+            with open(log_path, "wb") as f:
+                f.write(
+                    f"=== HAProxy stderr pid={proc.pid} "
+                    f"started_at={time.time():.3f} ===\n".encode()
+                )
+                f.flush()
+                while True:
+                    try:
+                        chunk = await proc.stderr.read(8192)
+                    except Exception:
+                        break
+                    if not chunk:
+                        break
+                    if f.tell() + len(chunk) > max_bytes:
+                        f.seek(0)
+                        f.truncate()
+                        f.write(
+                            f"=== truncated at {time.time():.3f}, "
+                            f"continuing pid={proc.pid} ===\n".encode()
+                        )
+                    f.write(chunk)
+                    f.flush()
+        except Exception as e:
+            logger.debug("HAProxy stderr drain for pid=%s ended: %s", proc.pid, e)
+
     async def _terminate_old_procs(self) -> None:
         """SIGKILL any old HAProxy procs we're still tracking.
 
@@ -1333,8 +1378,59 @@ async def _wait_for_admin_socket_bound_by(
             timeout_s,
             last_seen_pid,
         )
+        self._dump_admin_socket_diagnostics(target_pid)
         return False
 
+    def _dump_admin_socket_diagnostics(self, target_pid: int) -> None:
+        """Best-effort snapshot when the admin-socket wait times out.
+
+        Captures filesystem state of the socket path, the new proc's
+        /proc/<pid>/status (Sleeping vs Running gives away a write
+        block from a deadlock), and a tail of recent HAProxy stderr.
+        All best-effort -- any sub-failure is swallowed so we never
+        replace a real warning with a traceback.
+        """
+        lines: List[str] = [f"Admin-socket diagnostics for pid={target_pid}:"]
+
+        sock_path = self.cfg.socket_path
+        try:
+            st = os.stat(sock_path)
+            lines.append(
+                f"  socket path {sock_path}: exists, size={st.st_size}, "
+                f"mtime={st.st_mtime:.3f}"
+            )
+        except FileNotFoundError:
+            lines.append(f"  socket path {sock_path}: MISSING")
+        except Exception as e:
+            lines.append(f"  socket path {sock_path}: stat failed: {e}")
+
+        try:
+            with open(f"/proc/{target_pid}/status") as f:
+                for line in f:
+                    if line.startswith(("State:", "Threads:", "VmRSS:")):
+                        lines.append(f"  /proc/{target_pid}: {line.rstrip()}")
+        except FileNotFoundError:
+            lines.append(f"  /proc/{target_pid}: dead (no proc entry)")
+        except Exception as e:
+            lines.append(f"  /proc/{target_pid}/status: read failed: {e}")
+
+        stderr_log = self._stderr_log_path_for(target_pid)
+        try:
+            with open(stderr_log, "rb") as f:
+                f.seek(0, 2)
+                size = f.tell()
+                f.seek(max(0, size - 2048))
+                tail = f.read().decode("utf-8", errors="replace")
+            lines.append(f"  stderr tail ({stderr_log}, last 2KB of {size}B):")
+            for ln in tail.splitlines()[-20:]:
+                lines.append(f"    {ln}")
+        except FileNotFoundError:
+            lines.append(f"  stderr log {stderr_log}: not found")
+        except Exception as e:
+            lines.append(f"  stderr log {stderr_log}: read failed: {e}")
+
+        logger.warning("\n".join(lines))
+
     async def probe_http_listener(self) -> bool:
         """Probe the HAProxy HTTP listener via /-/healthz.
 
diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index 372b12208721..1afc61689055 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -29,17 +29,12 @@
     # Keeping FD transfer on its own socket prevents the `_getsocks`
     # exchange from queueing behind add/del/disable/enable server commands
     # during a reload.
-    # `thread 1` pins admin work to a dedicated thread (see frontend
-    # binds below, which restrict data-plane traffic to thread 2+). When
-    # the data plane saturates threads 2..N, thread 1 stays free for
-    # runtime-API commands, preventing the cascade where slow admin
-    # responses trigger fallback reloads.
-    stats socket {{ config.transfer_socket_path }} mode 666 level admin expose-fd listeners thread 1
+    stats socket {{ config.transfer_socket_path }} mode 666 level admin expose-fd listeners
     # Runtime-API + stats socket. Used by the proxy actor for
     # `add server`, `del server`, `show stat`, etc. No `expose-fd
     # listeners` here — FD transfer is reserved for the dedicated socket
     # above so the two streams don't contend on the same connection slot.
-    stats socket {{ config.socket_path }} mode 666 level admin thread 1
+    stats socket {{ config.socket_path }} mode 666 level admin
     stats timeout 30s
     maxconn {{ config.maxconn }}
     nbthread {{ config.nbthread }}
@@ -92,12 +87,12 @@
     # duplicate slots both pointing at the same replica.
     balance {{ config.balance_algorithm }}
 frontend prometheus
-    bind :{{ config.metrics_port }} thread 2-{{ config.nbthread }}
+    bind :{{ config.metrics_port }}
     mode http
     http-request use-service prometheus-exporter if { path {{ config.metrics_uri }} }
     no log
 frontend http_frontend
-    bind {{ config.frontend_host }}:{{ config.frontend_port }} thread 2-{{ config.nbthread }}
+    bind {{ config.frontend_host }}:{{ config.frontend_port }}
 {{ healthz_rules|safe }}
     # Routes endpoint
     acl routes path -i /-/routes
@@ -173,7 +168,7 @@
     {%- endif %}
 {%- endfor %}
 listen stats
-  bind *:{{ config.stats_port }} thread 2-{{ config.nbthread }}
+  bind *:{{ config.stats_port }}
   stats enable
   stats uri {{ config.stats_uri }}
   stats refresh 1s
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index cae348ce6e7b..d0547c3dee12 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -276,17 +276,12 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     # Keeping FD transfer on its own socket prevents the `_getsocks`
     # exchange from queueing behind add/del/disable/enable server commands
     # during a reload.
-    # `thread 1` pins admin work to a dedicated thread (see frontend
-    # binds below, which restrict data-plane traffic to thread 2+). When
-    # the data plane saturates threads 2..N, thread 1 stays free for
-    # runtime-API commands, preventing the cascade where slow admin
-    # responses trigger fallback reloads.
-    stats socket {socket_path}.fd mode 666 level admin expose-fd listeners thread 1
+    stats socket {socket_path}.fd mode 666 level admin expose-fd listeners
     # Runtime-API + stats socket. Used by the proxy actor for
     # `add server`, `del server`, `show stat`, etc. No `expose-fd
     # listeners` here — FD transfer is reserved for the dedicated socket
     # above so the two streams don't contend on the same connection slot.
-    stats socket {socket_path} mode 666 level admin thread 1
+    stats socket {socket_path} mode 666 level admin
     stats timeout 30s
     maxconn 1000
     nbthread 2
@@ -327,12 +322,12 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     # duplicate slots both pointing at the same replica.
     balance random(2)
 frontend prometheus
-    bind :9101 thread 2-2
+    bind :9101
     mode http
     http-request use-service prometheus-exporter if {{ path /metrics }}
     no log
 frontend http_frontend
-    bind *:8000 thread 2-2
+    bind *:8000
     # Health check endpoint
     acl healthcheck path -i /-/healthz
     # Suppress logging for health checks
@@ -419,7 +414,7 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     # config-generation time from the current replica count + headroom.
     server-template srv 1-4 0.0.0.0:1 check disabled
 listen stats
-  bind *:8080 thread 2-2
+  bind *:8080
   stats enable
   stats uri /mystats
   stats refresh 1s

From 3b77d2be5e87d936081e0651ec58bbbc9f27ea67 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Tue, 12 May 2026 07:04:56 +0000
Subject: [PATCH 25/26] [serve] Default HAProxy timeouts, remove abortonclose,
 raise min slot pool

Three changes that address the residual /cpu-fanout/, /heavy-payload/,
and /long-runner/ errors once HAProxy itself was stable:

- constants.py: default `RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S` to 3600
  (matches timeout_client_s) and `RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S`
  to 5. Both were `None` before -- the rendered defaults block was
  emitting blank lines where `timeout server` and `timeout connect`
  should have been, leaving HAProxy on its undefined-behavior fallback
  for both. `timeout server` unset is particularly suspect during
  graceful reload edges.
- constants.py: bump `RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND` from
  16 to 32. With `inter 1s fall 2`, a slot can be DOWN for ~2s after a
  replica goes away; backends with only 16 slots (like /heavy-payload/)
  can briefly have zero UP servers under churn and serve 503 to
  whatever requests arrive in that window. 32 gives headroom for
  small-pool backends without significantly affecting total budget.
- haproxy_templates.py: drop `option abortonclose`. It RSTs the
  backend connection any time the client's TCP read-half goes quiet
  -- fine for sub-second GETs but catastrophic for endpoints that
  hold a connection open for tens of seconds (NAT idle reset,
  keep-alive timer, locust pause, all trigger spurious aborts that
  surface as 502 to the caller, normalized to 500 by errorfile).

Tests: golden config in test_haproxy_api.py updated to drop the
`option abortonclose` line; render verified to match exactly. Per-
backend timeout/diff tests in test_haproxy_diff.py and the slot-split
unit tests in test_haproxy_slots.py pass timeouts/min_per_backend
explicitly, so the new global defaults don't affect them.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/constants.py         | 14 +++++---------
 python/ray/serve/_private/haproxy_templates.py |  1 -
 python/ray/serve/tests/test_haproxy_api.py     |  1 -
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index 847ff05d0eb7..f0e8ace86fea 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -733,16 +733,12 @@
 )
 
 # HAProxy timeout configurations (in seconds, None = no timeout)
-RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S = (
-    int(os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S"))
-    if os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S")
-    else None
+RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S = int(
+    os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S", "3600")
 )
 
-RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S = (
-    int(os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S"))
-    if os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S")
-    else None
+RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S = int(
+    os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S", "5")
 )
 
 # When enabled, adds 'option http-no-delay' to the HAProxy config defaults,
@@ -829,7 +825,7 @@
 # If N_backends * MIN_SLOTS exceeds TOTAL_SLOTS the split degrades to an
 # equal share of (TOTAL_SLOTS // N_backends) per backend.
 RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND = int(
-    os.environ.get("RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND", "16")
+    os.environ.get("RAY_SERVE_HAPROXY_MIN_SLOTS_PER_BACKEND", "32")
 )
 
 # Headroom multiplier applied when allocating slots. A backend with N
diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
index 1afc61689055..04c69e022f91 100644
--- a/python/ray/serve/_private/haproxy_templates.py
+++ b/python/ray/serve/_private/haproxy_templates.py
@@ -66,7 +66,6 @@
     {% if config.timeout_queue_s is not none %}timeout queue {{ config.timeout_queue_s }}s{% endif %}
     log global
     option httplog
-    option abortonclose
     {%- if config.tcp_nodelay %}
     # Set TCP_NODELAY on all connections
     option http-no-delay
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index d0547c3dee12..1d6dfe4f2dc1 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -309,7 +309,6 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
     timeout queue 1s
     log global
     option httplog
-    option abortonclose
     option idle-close-on-response
     # Normalize 502 and 504 errors to 500 per Serve's default behavior
     errorfile 502 {temp_dir}/500.http

From 8bb9997bd33aba0442e6e9a4a6fd75cc21365f51 Mon Sep 17 00:00:00 2001
From: harshit <harshit@anyscale.com>
Date: Tue, 12 May 2026 13:52:27 +0000
Subject: [PATCH 26/26] [serve] Bump HAProxy hard-stop to 1800s; decouple
 keep-alive from HTTPOptions

For long-running endpoints (e.g. /long-runner/ with med 73s requests),
two changes that together reduce the rate of mid-request kills during
graceful reload:

- constants.py: bump RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S 600s -> 1800s.
  After `-sf`, the old proc is force-killed when hard-stop fires. A
  request in flight on a near-end-of-life old proc gets RST'd, which
  the replica logs as HTTP 499 (client closed before response). With
  1800s, requests up to ~30 minutes have a wide margin to complete
  even if they ride on a near-deadline proc.

- constants.py + haproxy.py: introduce
  RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S (default 60s), promote
  `timeout_http_keep_alive_s` from a property reading
  `http_options.keep_alive_timeout_s` to a real field on HAProxyConfig
  defaulting to the new env var. The old setup conflated two unrelated
  timeouts:
    * HTTPOptions.keep_alive_timeout_s -- uvicorn keep-alive on the
      *replica* side (per Ray Serve's existing semantics)
    * HAProxy keep-alive -- HAProxy <-> client idle timeout, used for
      forcing locust/client TCP connections to rotate off old procs
      after a reload
  60s in HAProxy doesn't kill active long-running requests (those
  are bounded by `timeout client`/`timeout server`, both 3600s). It
  only bounds idle time between requests on a keep-alive connection,
  which encourages clients to reopen connections and land on the new
  proc.

Test: test_haproxy_api.py now passes timeout_http_keep_alive_s=55 to
HAProxyConfig directly rather than via HTTPOptions; the rendered
template still matches the golden config.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/ray/serve/_private/constants.py     | 11 ++++++++++-
 python/ray/serve/_private/haproxy.py       | 11 +++++++----
 python/ray/serve/tests/test_haproxy_api.py |  2 +-
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index f0e8ace86fea..d2bcac1c91e8 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -716,7 +716,16 @@
 
 # HAProxy hard stop after timeout
 RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S = int(
-    os.environ.get("RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S", "600")
+    os.environ.get("RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S", "1800")
+)
+
+# Idle keep-alive timeout for HAProxy's client-side connections. Distinct
+# from `HTTPOptions.keep_alive_timeout_s` (which is the uvicorn keep-alive
+# on the *replica* side). Lower values force idle clients to rotate off
+# old HAProxy procs faster after a reload, reducing the chance that a
+# long-running request lands on a near-hard-stop-deadline proc.
+RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S = int(
+    os.environ.get("RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S", "60")
 )
 
 # HAProxy metrics export port
diff --git a/python/ray/serve/_private/haproxy.py b/python/ray/serve/_private/haproxy.py
index e6c9a6398192..f7aa5d4be247 100644
--- a/python/ray/serve/_private/haproxy.py
+++ b/python/ray/serve/_private/haproxy.py
@@ -58,6 +58,7 @@
     RAY_SERVE_HAPROXY_TCP_NODELAY,
     RAY_SERVE_HAPROXY_TIMEOUT_CLIENT_S,
     RAY_SERVE_HAPROXY_TIMEOUT_CONNECT_S,
+    RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S,
     RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S,
     RAY_SERVE_HAPROXY_TOTAL_SLOTS,
     SERVE_CONTROLLER_NAME,
@@ -446,6 +447,12 @@ class HAProxyConfig:
     timeout_client_s: Optional[int] = RAY_SERVE_HAPROXY_TIMEOUT_CLIENT_S
     timeout_server_s: Optional[int] = RAY_SERVE_HAPROXY_TIMEOUT_SERVER_S
     timeout_http_request_s: Optional[int] = None
+    # HAProxy's idle keep-alive timeout for client connections. Decoupled
+    # from `http_options.keep_alive_timeout_s`, which controls the
+    # *replica*'s uvicorn keep-alive on the other side of HAProxy.
+    timeout_http_keep_alive_s: Optional[
+        int
+    ] = RAY_SERVE_HAPROXY_TIMEOUT_HTTP_KEEP_ALIVE_S
     hard_stop_after_s: Optional[int] = RAY_SERVE_HAPROXY_HARD_STOP_AFTER_S
     # Number of connection-level retries per request. Used in the `defaults`
     # block; combined with `option redispatch` (set per-backend) each retry
@@ -512,10 +519,6 @@ def frontend_host(self) -> str:
     def frontend_port(self) -> int:
         return self.http_options.port
 
-    @property
-    def timeout_http_keep_alive_s(self) -> int:
-        return self.http_options.keep_alive_timeout_s
-
     def build_health_route_info(self, backends: List[BackendConfig]) -> HealthRouteInfo:
         if not self.has_received_routes:
             router_ready_for_traffic = False
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
index 1d6dfe4f2dc1..dcab187a24b4 100644
--- a/python/ray/serve/tests/test_haproxy_api.py
+++ b/python/ray/serve/tests/test_haproxy_api.py
@@ -192,6 +192,7 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
             timeout_server_s=30,
             timeout_http_request_s=10,
             timeout_queue_s=1,
+            timeout_http_keep_alive_s=55,
             stats_port=8080,
             stats_uri="/mystats",
             health_check_fall=3,
@@ -201,7 +202,6 @@ def test_generate_config_file_internal(haproxy_api_cleanup):
             http_options=HTTPOptions(
                 host="0.0.0.0",
                 port=8000,
-                keep_alive_timeout_s=55,
             ),
             has_received_routes=True,
             has_received_servers=True,