@@ -583,6 +583,11 @@ def test_ingress_request_router_does_not_leak_into_other_backends(
583583
584584 assert "backend llm-via-ingress-request-router" in cfg
585585 assert "backend api-via-ingress-request-router" not in cfg
586+ assert "option http-buffer-request" not in cfg
587+ direct_backend = cfg .split ("backend llm-via-ingress-request-router" , 1 )[1 ]
588+ direct_backend = direct_backend .split ("listen stats" , 1 )[0 ]
589+ assert "http-reuse always" in direct_backend
590+ assert "option http-server-close" not in direct_backend
586591 # Only router-bearing backends contribute a set-var directive that
587592 # arms the Lua dispatch; the plain `api` backend must not.
588593 assert "set-var(txn.ingress_request_router_app) str(llm)" in cfg
@@ -651,6 +656,64 @@ def test_router_failure_503_rule_appears_before_use_backend(haproxy_api_cleanup)
651656 assert "X-Serve-Reason" in cfg , cfg
652657
653658
@pytest.mark.parametrize("forward_body", [True, False])
def test_ingress_request_router_forward_body_gate_renders(
    haproxy_api_cleanup, monkeypatch, forward_body
):
    """Check that the FORWARD_BODY flag gates what gets rendered.

    The flag is patched to ``forward_body``, a config is generated for a
    single ``llm`` backend (one replica, one ingress request router), and
    the two rendered artifacts are inspected:

    - the HAProxy config must contain ``tune.bufsize`` and
      ``wait-for-body`` exactly when the flag is on;
    - the Lua script must declare ``local FORWARD_BODY`` with the matching
      boolean literal.
    """
    monkeypatch.setattr(
        "ray.serve._private.haproxy.RAY_SERVE_INGRESS_REQUEST_ROUTER_FORWARD_BODY",
        forward_body,
    )

    # One data-plane replica and one router server are enough to make the
    # backend router-bearing.
    replica = ServerConfig(
        name="r1",
        host="10.0.0.1",
        port=30001,
        replica_id="rid_1",
    )
    router = ServerConfig(name="router", host="10.0.0.10", port=9000)
    llm_backend = BackendConfig(
        name="llm",
        path_prefix="/",
        app_name="llm",
        servers=[replica],
        ingress_request_router_servers=[router],
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        api = _make_api(temp_dir, {"llm": llm_backend})
        lua_path = os.path.join(temp_dir, "ingress_request_router.lua")
        with mock.patch(
            "ray.serve._private.constants.RAY_SERVE_HAPROXY_CONFIG_FILE_LOC",
            api.config_file_path,
        ):
            api._generate_config_file_internal()
            # Read both artifacts before the temporary directory is removed.
            with open(api.config_file_path) as cfg_file:
                cfg = cfg_file.read()
            with open(lua_path) as lua_file:
                lua = lua_file.read()

    # Both HAProxy directives must be present iff the flag is enabled.
    for directive in ("tune.bufsize", "wait-for-body"):
        assert ("directive", directive in cfg)[1] is forward_body, cfg

    if forward_body:
        expected_lua_line = "local FORWARD_BODY = true"
    else:
        expected_lua_line = "local FORWARD_BODY = false"
    assert expected_lua_line in lua, lua
715+
716+
654717def _create_replica_server (port : int , replica_id_header : str ):
655718 """Fake data-plane replica that echoes its identity in a response header."""
656719 app = FastAPI ()
@@ -670,7 +733,7 @@ async def root(path: str, req: Request, res: Response):
670733
671734def _create_router_server (port : int , replica_id_to_return : str ):
672735 """Fake /internal/route. Captures bodies so tests can verify HAProxy
673- actually buffered + forwarded the request ."""
736+ forwards the buffered request body prefix to the router ."""
674737 app = FastAPI ()
675738 captured = {"bodies" : []}
676739
@@ -689,14 +752,20 @@ def ready():
689752 )
690753
691754 server , thread = _serve_fastapi_app (app , port , ready )
755+ # Discard the readiness-probe body so callers see only client traffic.
756+ captured ["bodies" ].clear ()
692757 return server , thread , captured
693758
694759
695760@pytest .mark .asyncio
696- async def test_ingress_request_router_end_to_end (haproxy_api_cleanup ):
761+ async def test_ingress_request_router_end_to_end (haproxy_api_cleanup , monkeypatch ):
697762 """Run actual HAProxy against a fake router + two replicas; verify a POST
698763 is pinned to the replica the router selects, while a GET (which doesn't
699764 trigger the router-routed path) is not."""
765+ monkeypatch .setattr (
766+ "ray.serve._private.haproxy.RAY_SERVE_INGRESS_REQUEST_ROUTER_FORWARD_BODY" ,
767+ True ,
768+ )
700769 with tempfile .TemporaryDirectory () as temp_dir :
701770 haproxy_port = find_free_port ()
702771 stats_port = find_free_port ()
@@ -787,12 +856,9 @@ async def test_ingress_request_router_end_to_end(haproxy_api_cleanup):
787856 assert resp .status_code == 200 , resp .text
788857 assert resp .headers .get ("x-replica-id" ) == "B"
789858
790- # The router actually saw the original body (via wait-for-body +
791- # txn.sf:req_body()). Just check the field made it through.
792- assert any (
793- '"prompt"' in body and "hello" in body
794- for body in router_captured ["bodies" ]
795- )
859+ # Direct streaming keeps a bounded request-body path for
860+ # prefix-cache-aware routing.
861+ assert router_captured ["bodies" ] == ['{"prompt": "hello"}' ]
796862
797863 # Repeat to confirm the pin holds across requests.
798864 for _ in range (3 ):
@@ -802,9 +868,10 @@ async def test_ingress_request_router_end_to_end(haproxy_api_cleanup):
802868 timeout = 5 ,
803869 )
804870 assert resp .headers .get ("x-replica-id" ) == "B"
871+ assert router_captured ["bodies" ] == ['{"prompt": "hello"}' ] * 4
805872
806- # GET is not POST, so wait-for-body / Lua never run ; the router
807- # should have seen exactly the four POSTs above and nothing more.
873+ # GET is not POST, so Lua routing never runs ; the router should
874+ # have seen exactly the four POSTs above and nothing more.
808875 n_router_calls_before_get = len (router_captured ["bodies" ])
809876 requests .get (
810877 f"http://127.0.0.1:{ haproxy_port } /health-passthrough" , timeout = 5
@@ -936,28 +1003,23 @@ async def test_router_failure_fails_loud_with_reason(haproxy_api_cleanup):
9361003 timeout = 10 ,
9371004 )
9381005
939- # Every dispatch failure mode must surface as a 5xx with a
940- # reason label, never as a silent primary-backend fallback.
941- # ``router_non_200``: router answered with status != 200 (forced
942- # by the broken-router stub).
943- # ``empty_body``: HAProxy's wait-for-body completed but the
944- # request had no body to forward to the router.
945- failure_cases = [
946- (dict (json = {"prompt" : "hi" }), "router_non_200" ),
947- (dict (data = "" ), "empty_body" ),
948- ]
949- for kwargs , expected_reason in failure_cases :
1006+ # Every dispatch failure must surface as 5xx with a reason
1007+ # label, never as a silent primary-backend fallback. The broken
1008+ # router returns 500 for both empty and non-empty bodies, so
1009+ # both shapes surface the same ``router_non_200`` reason; the
1010+ # body shape is parametrized to pin that empty-body POSTs are
1011+ # routed through the router and not silently bypassed.
1012+ for body_kwargs in (dict (json = {"prompt" : "hi" }), dict (data = "" )):
9501013 for _ in range (3 ):
9511014 resp = requests .post (
9521015 f"http://127.0.0.1:{ haproxy_port } /predict" ,
9531016 timeout = 5 ,
954- ** kwargs ,
955- )
956- assert resp .status_code == 503 , (expected_reason , resp .text )
957- assert resp .headers .get ("X-Serve-Reason" ) == expected_reason , (
958- expected_reason ,
959- resp .headers ,
1017+ ** body_kwargs ,
9601018 )
1019+ assert resp .status_code == 503 , resp .text
1020+ assert (
1021+ resp .headers .get ("X-Serve-Reason" ) == "router_non_200"
1022+ ), resp .headers
9611023
9621024 stats_csv = requests .get (
9631025 f"http://127.0.0.1:{ stats_port } /stats;csv" , timeout = 5
0 commit comments