Merge branch 'master' into docs/symmetric-run-docker-init

Future-Outlier · web-flow · commit d4d8bb3f1013 · 2026-05-08T16:30:46.000+08:00
diff --git a/python/ray/serve/_private/haproxy_templates.py b/python/ray/serve/_private/haproxy_templates.py
@@ -99,6 +99,11 @@
     acl has_ingress_request_router_app var(txn.ingress_request_router_app) -m found
     http-request wait-for-body time {{ ingress_request_router_timeout_s }}s if METH_POST has_ingress_request_router_app
     http-request lua.route_via_ingress_request_router if METH_POST has_ingress_request_router_app
+    # Fail loudly when the Lua dispatch did not pick a replica. Must appear
+    # before the use_backend rules below so the request never falls back to
+    # the primary backend (which would be a silent bypass of the configured
+    # router policy).
+    http-request return status 503 content-type text/plain lf-string "Ingress request router failed: %[var(txn.ingress_request_router_failed)]" hdr X-Serve-Reason %[var(txn.ingress_request_router_failed)] if { var(txn.ingress_request_router_failed) -m found }
     {%- endif %}
     # Static routing based on path prefixes in decreasing length then alphabetical order
 {%- for backend in backends %}
diff --git a/python/ray/serve/_private/ingress_request_router.lua.tmpl b/python/ray/serve/_private/ingress_request_router.lua.tmpl
@@ -76,6 +76,16 @@ local function truncated_full_length(txn, body)
     end
 end
 
+-- Failure semantics: every path that reaches a router decision but cannot pin
+-- a replica must arm txn.ingress_request_router_failed so the frontend's 503
+-- rule fires instead of letting the request silently fall through to the
+-- primary backend. The product invariant is: requests to a router-bearing app
+-- are served via the router or fail; there is no quiet alternative path.
+-- Two silent returns are correct: (1) the request didn't target a
+-- router-bearing app (no txn var set, no app entry in our maps), and
+-- (2) the controller hasn't pushed router/replica state for this app yet.
+-- Failure-mode bucketing belongs in observability (response header label,
+-- metric label), not in the data plane.
 core.register_action("route_via_ingress_request_router", {"http-req"}, function(txn)
     local app = txn:get_var("txn.ingress_request_router_app")
     if not app then
@@ -89,6 +99,7 @@ core.register_action("route_via_ingress_request_router", {"http-req"}, function(
 
     local body = txn.sf:req_body()
     if not body or body == "" then
+        txn:set_var("txn.ingress_request_router_failed", "empty_body")
         return
     end
 
@@ -100,17 +111,24 @@ core.register_action("route_via_ingress_request_router", {"http-req"}, function(
     end
 
     local response = call_router(router, body, truncated)
-    if not response or not is_http_200(response) then
+    if not response then
+        txn:set_var("txn.ingress_request_router_failed", "router_unreachable")
+        return
+    end
+    if not is_http_200(response) then
+        txn:set_var("txn.ingress_request_router_failed", "router_non_200")
         return
     end
 
     local replica_id = extract_replica_id(http_response_body(response))
     if not replica_id then
+        txn:set_var("txn.ingress_request_router_failed", "unparseable_replica_id")
         return
     end
 
     local server_name = replica_map[replica_id]
     if not server_name then
+        txn:set_var("txn.ingress_request_router_failed", "unknown_replica_id")
         return
     end
 
diff --git a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py
@@ -2535,8 +2535,12 @@ async def _direct_ingress_asgi(
             return
 
         # If the HTTP path does not match the deployment route prefix,
-        # it is invalid and we should not serve it.
-        if not route.startswith(self._route_prefix):
+        # it is invalid and we should not serve it. Ingress request router
+        # peer deployments (e.g. LLMRouter) have no route prefix; fall
+        # back to "" so any path (including the empty-path ASGI edge case)
+        # matches and downstream user code dispatches.
+        route_prefix = self._route_prefix or ""
+        if not route.startswith(route_prefix):
             for msg in convert_object_to_asgi_messages(
                 f"Path '{route}' not found. "
                 "Ping http://.../-/routes for available routes.",
diff --git a/python/ray/serve/tests/test_haproxy_api.py b/python/ray/serve/tests/test_haproxy_api.py
@@ -589,6 +589,68 @@ def test_ingress_request_router_does_not_leak_into_other_backends(
         assert "set-var(txn.ingress_request_router_app) str(api)" not in cfg
 
 
+def test_router_failure_503_rule_appears_before_use_backend(haproxy_api_cleanup):
+    """The 503-on-router-failure rule must be rendered before any
+    ``use_backend`` directive. If it isn't, a failed Lua dispatch would
+    silently fall through to the primary backend and the router policy
+    the operator opted into would be invisibly bypassed."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        api = _make_api(
+            temp_dir,
+            {
+                "llm": BackendConfig(
+                    name="llm",
+                    path_prefix="/",
+                    app_name="llm",
+                    servers=[
+                        ServerConfig(
+                            name="r1",
+                            host="10.0.0.1",
+                            port=30001,
+                            replica_id="rid_1",
+                        )
+                    ],
+                    ingress_request_router_servers=[
+                        ServerConfig(name="router", host="10.0.0.10", port=9000)
+                    ],
+                ),
+            },
+        )
+        with mock.patch(
+            "ray.serve._private.constants.RAY_SERVE_HAPROXY_CONFIG_FILE_LOC",
+            api.config_file_path,
+        ):
+            api._generate_config_file_internal()
+        with open(api.config_file_path) as f:
+            cfg = f.read()
+
+        # Parse line-by-line so we don't accidentally match the substring
+        # "use_backend" inside an explanatory comment block rendered above.
+        lines = cfg.splitlines()
+        sentinel_line = next(
+            (
+                i
+                for i, ln in enumerate(lines)
+                if "var(txn.ingress_request_router_failed) -m found" in ln
+            ),
+            None,
+        )
+        first_use_backend_line = next(
+            (i for i, ln in enumerate(lines) if ln.strip().startswith("use_backend ")),
+            None,
+        )
+        assert sentinel_line is not None, cfg
+        assert first_use_backend_line is not None, cfg
+        assert sentinel_line < first_use_backend_line, (
+            "503-on-router-failure rule must precede every use_backend so a "
+            "failed dispatch does not silently fall through to the primary "
+            "backend.\n" + cfg
+        )
+        # Spot-check rule shape.
+        assert "status 503" in cfg, cfg
+        assert "X-Serve-Reason" in cfg, cfg
+
+
 def _create_replica_server(port: int, replica_id_header: str):
     """Fake data-plane replica that echoes its identity in a response header."""
     app = FastAPI()
@@ -764,6 +826,159 @@ async def test_ingress_request_router_end_to_end(haproxy_api_cleanup):
                     pass
 
 
+def _create_broken_router_server(port: int, status_code: int = 500):
+    """Fake /internal/route that always returns ``status_code`` (default 500).
+    Used to verify the fail-loud path: a router non-200 must surface to the
+    client as 5xx with X-Serve-Reason, not silently fall back to a primary
+    backend."""
+    app = FastAPI()
+
+    @app.get("/-/healthz")
+    async def health():
+        return {"status": "OK"}
+
+    @app.post("/internal/route")
+    async def route(res: Response):
+        res.status_code = status_code
+        return {"error": "broken"}
+
+    return _serve_fastapi_app(app, port, _healthz_ready(port))
+
+
+def _backend_stot(stats_csv: str, backend_name: str) -> int:
+    """Pull ``stot`` (cumulative sessions) for the BACKEND aggregate row of
+    ``backend_name`` from HAProxy's CSV stats. Returns -1 if not found."""
+    lines = stats_csv.splitlines()
+    if not lines:
+        return -1
+    header = lines[0].lstrip("# ").split(",")
+    pxname_idx = header.index("pxname")
+    svname_idx = header.index("svname")
+    stot_idx = header.index("stot")
+    for row in lines[1:]:
+        parts = row.split(",")
+        if (
+            len(parts) > stot_idx
+            and parts[pxname_idx] == backend_name
+            and parts[svname_idx] == "BACKEND"
+        ):
+            return int(parts[stot_idx] or 0)
+    return -1
+
+
+@pytest.mark.asyncio
+async def test_router_failure_fails_loud_with_reason(haproxy_api_cleanup):
+    """When ``/internal/route`` returns non-200, HAProxy must return 5xx with
+    ``X-Serve-Reason`` (503 here) instead of silently falling back. Neither the
+    primary nor the via-router backend may record a session (``stot`` == 0)."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        haproxy_port = find_free_port()
+        stats_port = find_free_port()
+        replica_port = find_free_port()
+        router_port = find_free_port()
+
+        actor_name = "SERVE_REPLICA::app#dep#aaa"
+
+        replica, replica_thread = _create_replica_server(
+            replica_port, replica_id_header="A"
+        )
+        broken_router, broken_router_thread = _create_broken_router_server(router_port)
+
+        try:
+            config = HAProxyConfig(
+                http_options=HTTPOptions(
+                    host="127.0.0.1",
+                    port=haproxy_port,
+                    keep_alive_timeout_s=58,
+                ),
+                stats_port=stats_port,
+                socket_path=os.path.join(temp_dir, "admin.sock"),
+                has_received_routes=True,
+                has_received_servers=True,
+                health_check_path="/-/healthz",
+                health_check_inter="500ms",
+                health_check_rise=1,
+                health_check_fall=2,
+            )
+
+            backend = BackendConfig(
+                name="llm",
+                path_prefix="/",
+                app_name="llm",
+                health_check_path="/-/healthz",
+                servers=[
+                    ServerConfig(
+                        name="A",
+                        host="127.0.0.1",
+                        port=replica_port,
+                        replica_id=actor_name,
+                    ),
+                ],
+                ingress_request_router_servers=[
+                    ServerConfig(name="router", host="127.0.0.1", port=router_port),
+                ],
+            )
+
+            api = HAProxyApi(
+                cfg=config,
+                backend_configs={"llm": backend},
+                config_file_path=os.path.join(temp_dir, "haproxy.cfg"),
+            )
+            haproxy_api_cleanup(api)
+            await api.start()
+
+            wait_for_condition(lambda: check_haproxy_ready(stats_port), timeout=10)
+            await async_wait_for_condition(
+                lambda: requests.get(
+                    f"http://127.0.0.1:{haproxy_port}/-/healthz", timeout=2
+                ).status_code
+                == 200,
+                timeout=10,
+            )
+
+            # Every dispatch failure mode must surface as a 5xx with a
+            # reason label, never as a silent primary-backend fallback.
+            # ``router_non_200``: router answered with status != 200 (forced
+            # by the broken-router stub).
+            # ``empty_body``: HAProxy's wait-for-body completed but the
+            # request had no body to forward to the router.
+            failure_cases = [
+                (dict(json={"prompt": "hi"}), "router_non_200"),
+                (dict(data=""), "empty_body"),
+            ]
+            for kwargs, expected_reason in failure_cases:
+                for _ in range(3):
+                    resp = requests.post(
+                        f"http://127.0.0.1:{haproxy_port}/predict",
+                        timeout=5,
+                        **kwargs,
+                    )
+                    assert resp.status_code == 503, (expected_reason, resp.text)
+                    assert resp.headers.get("X-Serve-Reason") == expected_reason, (
+                        expected_reason,
+                        resp.headers,
+                    )
+
+            stats_csv = requests.get(
+                f"http://127.0.0.1:{stats_port}/stats;csv", timeout=5
+            ).text
+            assert _backend_stot(stats_csv, "llm") == 0, stats_csv
+            assert (
+                _backend_stot(stats_csv, "llm-via-ingress-request-router") == 0
+            ), stats_csv
+        finally:
+            for srv in (replica, broken_router):
+                try:
+                    srv.should_exit = True
+                except Exception:
+                    pass
+            for thr in (replica_thread, broken_router_thread):
+                try:
+                    thr.join(timeout=5)
+                except Exception:
+                    pass
+
+
 @pytest.mark.asyncio
 async def test_graceful_reload(haproxy_api_cleanup):
     """Test that graceful reload preserves long-running connections."""