[serve][llm] Add direct streaming OpenAI compatibility smoke tests

eicherseiji · eicherseiji · commit fa6dd2b89079 · 2026-05-18T14:03:14.000-07:00
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
diff --git a/python/ray/llm/tests/BUILD.bazel b/python/ray/llm/tests/BUILD.bazel
@@ -120,3 +120,22 @@ py_test_module_list(
         "//:ray_lib",
     ],
 )
+
+py_test_module_list(  # GPU-tagged run of the OpenAI compatibility tests with the env var below set
+    size = "large",
+    data = glob(["serve/**/*.yaml"]),
+    env = {
+        "RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING": "1",  # the test file skips its multi-model case when this is "1"
+    },
+    files = ["serve/gpu/integration/test_openai_compatibility.py"],
+    name_suffix = "_direct_streaming",  # NOTE(review): presumably keeps these targets distinct from other runs of this file; confirm
+    tags = [
+        "exclusive",
+        "gpu",
+        "team:llm",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
diff --git a/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py b/python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py
@@ -1,3 +1,4 @@
+import os
 import sys
 
 import openai
@@ -143,6 +144,10 @@ def test_chat_without_model_parameter(self, testing_model):  # noqa: F811
         assert data["model"] == expected_model
         assert data["choices"][0]["message"]["content"]
 
+    @pytest.mark.skipif(
+        os.environ.get("RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING") == "1",
+        reason="Direct streaming currently supports one LLM config.",
+    )
     def test_chat_without_model_parameter_multiple_models(
         self, testing_multiple_models
     ):  # noqa: F811
diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py
@@ -142,6 +142,75 @@ async def is_paused(self) -> bool:
         """
         return self._is_paused
 
+    async def build_asgi_app(self):
+        """Assemble a small FastAPI app exposing this mock over HTTP.
+
+        Serves ``GET /v1/models``, ``POST /v1/chat/completions`` and
+        ``POST /v1/completions``; the POST routes delegate to ``self.chat``
+        and ``self.completions``. Used by direct-streaming tests.
+        """
+        from fastapi import FastAPI, HTTPException, Request
+        from starlette.responses import JSONResponse, StreamingResponse
+
+        def ensure_served(model: Optional[str]) -> None:
+            # An omitted model is accepted; any other id must match ours.
+            if model is None or model == self.llm_config.model_id:
+                return
+            raise HTTPException(
+                status_code=404, detail=f"Could not find model {model}"
+            )
+
+        async def respond(results):
+            # The first yielded item decides the kind of HTTP response.
+            try:
+                head = await results.__anext__()
+            except StopAsyncIteration:
+                return JSONResponse(content={})
+            if isinstance(head, ErrorResponse):
+                err = head.error
+                raise HTTPException(status_code=err.code, detail=err.message)
+            if not isinstance(head, str):
+                return JSONResponse(content=head.model_dump())
+
+            async def sse_chunks():
+                # Strings pass through; other items become "data:" frames.
+                yield head
+                async for item in results:
+                    if not isinstance(item, str):
+                        item = f"data: {item.model_dump_json()}\n\n"
+                    yield item
+
+            return StreamingResponse(sse_chunks(), media_type="text/event-stream")
+
+        app = FastAPI()
+
+        @app.get("/v1/models")
+        async def models():
+            card = {
+                "id": self.llm_config.model_id,
+                "object": "model",
+                "created": 0,
+                "owned_by": "mock",
+                "metadata": {"input_modality": "text"},
+            }
+            return {"object": "list", "data": [card]}
+
+        @app.post("/v1/chat/completions")
+        async def chat_completions(request: Request):
+            payload = await request.json()
+            chat_request = ChatCompletionRequest.model_validate(payload)
+            ensure_served(chat_request.model)
+            return await respond(self.chat(chat_request))
+
+        @app.post("/v1/completions")
+        async def completions(request: Request):
+            payload = await request.json()
+            completion_request = CompletionRequest.model_validate(payload)
+            ensure_served(completion_request.model)
+            return await respond(self.completions(completion_request))
+
+        return app
+
     async def chat(
         self,
         request: ChatCompletionRequest,