Skip to content

Commit fa6dd2b

Browse files
committed
[serve][llm] Add direct streaming OpenAI compatibility smoke tests
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
1 parent d11c24a commit fa6dd2b

3 files changed

Lines changed: 93 additions & 0 deletions

File tree

python/ray/llm/tests/BUILD.bazel

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,22 @@ py_test_module_list(
120120
"//:ray_lib",
121121
],
122122
)
123+
124+
py_test_module_list(
125+
size = "large",
126+
data = glob(["serve/**/*.yaml"]),
127+
env = {
128+
"RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING": "1",
129+
},
130+
files = ["serve/gpu/integration/test_openai_compatibility.py"],
131+
name_suffix = "_direct_streaming",
132+
tags = [
133+
"exclusive",
134+
"gpu",
135+
"team:llm",
136+
],
137+
deps = [
138+
":conftest",
139+
"//:ray_lib",
140+
],
141+
)

python/ray/llm/tests/serve/gpu/integration/test_openai_compatibility.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import sys
23

34
import openai
@@ -143,6 +144,10 @@ def test_chat_without_model_parameter(self, testing_model): # noqa: F811
143144
assert data["model"] == expected_model
144145
assert data["choices"][0]["message"]["content"]
145146

147+
@pytest.mark.skipif(
148+
os.environ.get("RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING") == "1",
149+
reason="Direct streaming currently supports one LLM config.",
150+
)
146151
def test_chat_without_model_parameter_multiple_models(
147152
self, testing_multiple_models
148153
): # noqa: F811

python/ray/llm/tests/serve/mocks/mock_vllm_engine.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,75 @@ async def is_paused(self) -> bool:
142142
"""
143143
return self._is_paused
144144

145+
async def build_asgi_app(self):
146+
"""Build a minimal ASGI app for direct-streaming tests."""
147+
from fastapi import FastAPI, HTTPException, Request
148+
from starlette.responses import JSONResponse, StreamingResponse
149+
150+
app = FastAPI()
151+
152+
def check_model(model: Optional[str]) -> None:
153+
if model is not None and model != self.llm_config.model_id:
154+
raise HTTPException(
155+
status_code=404,
156+
detail=f"Could not find model {model}",
157+
)
158+
159+
async def to_response(gen):
160+
try:
161+
first = await gen.__anext__()
162+
except StopAsyncIteration:
163+
return JSONResponse(content={})
164+
165+
if isinstance(first, ErrorResponse):
166+
raise HTTPException(
167+
status_code=first.error.code,
168+
detail=first.error.message,
169+
)
170+
171+
if isinstance(first, str):
172+
173+
async def stream():
174+
yield first
175+
async for item in gen:
176+
if isinstance(item, str):
177+
yield item
178+
else:
179+
yield f"data: {item.model_dump_json()}\n\n"
180+
181+
return StreamingResponse(stream(), media_type="text/event-stream")
182+
183+
return JSONResponse(content=first.model_dump())
184+
185+
@app.get("/v1/models")
186+
async def models():
187+
return {
188+
"object": "list",
189+
"data": [
190+
{
191+
"id": self.llm_config.model_id,
192+
"object": "model",
193+
"created": 0,
194+
"owned_by": "mock",
195+
"metadata": {"input_modality": "text"},
196+
}
197+
],
198+
}
199+
200+
@app.post("/v1/chat/completions")
201+
async def chat_completions(request: Request):
202+
body = ChatCompletionRequest.model_validate(await request.json())
203+
check_model(body.model)
204+
return await to_response(self.chat(body))
205+
206+
@app.post("/v1/completions")
207+
async def completions(request: Request):
208+
body = CompletionRequest.model_validate(await request.json())
209+
check_model(body.model)
210+
return await to_response(self.completions(body))
211+
212+
return app
213+
145214
async def chat(
146215
self,
147216
request: ChatCompletionRequest,

0 commit comments

Comments
 (0)