# client_streaming.py
# pytest: skip_always
"""Example client demonstrating responses from m serve.
This example shows how to use the OpenAI Python client with a Mellea server
started with:
m serve docs/examples/m_serve/m_serve_example_streaming.py
Set ``streaming`` below to:
- ``True`` for incremental SSE chunks
- ``False`` for a normal non-streaming response
"""
import openai
PORT = 8080
client = openai.OpenAI(api_key="na", base_url=f"http://localhost:{PORT}/v1")

streaming = True  # Toggle: True for a streaming response, False for a single response
print(f"stream={streaming} response:")
print("-" * 50)
# Request either a streaming or non-streaming response from the dedicated example server
if streaming:
    stream_result = client.chat.completions.create(
        messages=[
            {"role": "user", "content": "Count down from 100 using words not digits."}
        ],
        model="granite4.1:3b",
        stream=True,
    )
    for chunk in stream_result:
        if chunk.choices[0].delta.content:
            # Change ``end`` (e.g. to "\n") to see the chunks more clearly separated.
            print(chunk.choices[0].delta.content, end="", flush=True)
else:
    completion_result = client.chat.completions.create(
        messages=[
            {"role": "user", "content": "Count down from 100 using words not digits."}
        ],
        model="granite4.1:3b",
        stream=False,
    )
    print(completion_result.choices[0].message.content)
print("\n" + "-" * 50)
print("Stream complete!")