Commit d932a96

llama-stack: switches to open model configuration with openai remote (#94)
Signed-off-by: Adrian Cole <adrian@tetrate.io>
Co-authored-by: Anuraag Agrawal <anuraaga@gmail.com>
1 parent d67d36e, commit d932a96

7 files changed: 64 additions & 41 deletions

inference-platforms/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -106,7 +106,7 @@ To start and use Ollama, do the following:
 
 ---
 [aigw]: https://aigateway.envoyproxy.io/docs/cli/aigwrun
-[archgw]: https://docs.archgw.com/guides/observability/tracing.html
+[archgw]: https://docs.planoai.dev/guides/observability/tracing.html
 [litellm]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
 [llama-stack]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#telemetry
 [AgC]: https://github.com/masaic-ai-platform/AgC/blob/main/platform/README.md#setting-up-the-opentelemetry-collector
```

inference-platforms/agent.py

Lines changed: 32 additions & 9 deletions

```diff
@@ -16,23 +16,25 @@
 # This must precede any other imports you want to instrument!
 auto_instrumentation.initialize()
 
+import argparse
 import asyncio
 import os
 from datetime import datetime, timedelta
 
 from agents import (
     Agent,
+    HostedMCPTool,
     OpenAIProvider,
     RunConfig,
     Runner,
     Tool,
 )
 from agents.mcp import MCPServerStreamableHttp, MCPUtil
+from openai.types.responses.tool_param import Mcp
 
 
-async def run_agent(tools: list[Tool]):
-    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
-    model = OpenAIProvider(use_responses=False).get_model(model_name)
+async def run_agent(tools: list[Tool], model_name: str, use_responses: bool):
+    model = OpenAIProvider(use_responses=use_responses).get_model(model_name)
     agent = Agent(
         name="flight-search-agent",
         model=model,
@@ -49,18 +51,39 @@ async def run_agent(tools: list[Tool]):
 
 
 async def main():
+    parser = argparse.ArgumentParser(description="MCP-enabled flight search agent")
+    parser.add_argument("--use-responses-api", action="store_true", help="Use Responses API instead of Agents")
+    args = parser.parse_args()
+
+    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
+    mcp_url = os.getenv("MCP_URL", "https://mcp.kiwi.com")
+    mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)
+
+    if args.use_responses_api:
+        # Server-side MCP via Responses API
+        tools = [
+            HostedMCPTool(
+                tool_config=Mcp(
+                    type="mcp",
+                    server_url=mcp_url,
+                    server_label="kiwi-flights",
+                    headers=mcp_headers,
+                    require_approval="never",
+                )
+            )
+        ]
+        await run_agent(tools, model_name, use_responses=True)
+        return
+
+    # Client-side MCP orchestration
     async with MCPServerStreamableHttp(
-        {
-            "url": os.getenv("MCP_URL", "https://mcp.kiwi.com"),
-            "headers": dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h),
-            "timeout": 30.0,
-        },
+        {"url": mcp_url, "headers": mcp_headers, "timeout": 30.0},
         client_session_timeout_seconds=60.0,
     ) as server:
         tools = await server.list_tools()
         util = MCPUtil()
         tools = [util.to_function_tool(tool, server, False) for tool in tools]
-        await run_agent(tools)
+        await run_agent(tools, model_name, use_responses=False)
 
 
 if __name__ == "__main__":
```
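
For reference, the `MCP_HEADERS` one-liner in `main()` expects comma-separated `name=value` pairs. A minimal sketch of its behavior, using hypothetical header values:

```python
import os

# Hypothetical value; real deployments set whatever headers their MCP server needs.
os.environ["MCP_HEADERS"] = "Authorization=Bearer abc123,X-Debug=true"

# Same expression as in agent.py: split on commas, then on the first "=" only,
# so values that themselves contain "=" survive intact. An unset or empty
# MCP_HEADERS yields an empty dict.
mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)
print(mcp_headers)  # {'Authorization': 'Bearer abc123', 'X-Debug': 'true'}
```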

inference-platforms/archgw/README.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -76,9 +76,9 @@ Just run it again until we find a way to make the results idempotent.
 
 ---
 [docs]: https://github.com/katanemo/archgw?tab=readme-ov-file#use-arch-gateway-as-llm-router
-[config]: https://docs.archgw.com/guides/observability/tracing.html
+[config]: https://docs.planoai.dev/guides/observability/tracing.html
 [envoy-otel]: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/trace/v3/opentelemetry.proto#extension-envoy-tracers-opentelemetry
-[archgw-wasm]: https://github.com/katanemo/archgw/blob/main/arch/README.md
+[archgw-wasm]: https://github.com/katanemo/plano/blob/main/README.md
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
-[openai-responses]: https://github.com/katanemo/archgw/issues/476
+[openai-responses]: https://github.com/katanemo/plano/issues/476
 [otel-tui]: https://github.com/ymtdzzz/otel-tui
```

inference-platforms/chat.py

Lines changed: 2 additions & 4 deletions

```diff
@@ -39,10 +39,8 @@ def main():
 
     # vllm-specific switch to disable thinking, ignored by other inference platforms.
     # See https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes
-    if "qwen3" in model.lower():
-        extra_body = {"chat_template_kwargs": {"enable_thinking": False}}
-    else:
-        extra_body = {}
+    extra_body = {"chat_template_kwargs": {"enable_thinking": False}} if model.startswith("Qwen/Qwen3") else None
+
     if args.use_responses_api:
         response = client.responses.create(
             model=model, input=messages[0]["content"], temperature=0, extra_body=extra_body
```
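
The change also swaps the fall-through value from `{}` to `None`; the OpenAI SDK merges `extra_body` into the request JSON only when it is a dict, so `None` should leave the payload untouched for platforms that reject unknown fields. A quick sketch of the matching rule (model names are illustrative):

```python
# Only vLLM-style Hugging Face ids like "Qwen/Qwen3-0.6B" match; llama-stack's
# provider-prefixed "openai/qwen3:0.6b" and plain OpenAI models do not.
for model in ("Qwen/Qwen3-0.6B", "openai/qwen3:0.6b", "gpt-5-nano"):
    extra_body = {"chat_template_kwargs": {"enable_thinking": False}} if model.startswith("Qwen/Qwen3") else None
    print(f"{model}: {extra_body}")
# Qwen/Qwen3-0.6B: {'chat_template_kwargs': {'enable_thinking': False}}
# openai/qwen3:0.6b: None
# gpt-5-nano: None
```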

inference-platforms/llama-stack/README.md

Lines changed: 14 additions & 12 deletions

````diff
@@ -1,10 +1,7 @@
 # Llama Stack
 
-This shows how to use [Llama Stack][docs] to proxy Ollama, accessible via an
-OpenAI compatible API.
-
-This uses the [`otel` telemetry sink][otel-sink] to export OpenTelemetry traces
-and metrics from signals recorded with Llama Stack's observability SDK.
+This shows how to use [Llama Stack][docs] to proxy Ollama via an OpenAI
+compatible API.
 
 ## Prerequisites
 
@@ -13,7 +10,7 @@ Start Ollama and your OpenTelemetry Collector via this repository's [README](../
 ## Run Llama Stack
 
 ```bash
-docker compose up --pull always --force-recreate --remove-orphans
+docker compose up --force-recreate --remove-orphans
 ```
 
 Clean up when finished, like this:
@@ -36,16 +33,21 @@ Or, for the OpenAI Responses API
 uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 ```
 
+### MCP Agent
+
+```bash
+uv run --exact -q --env-file env.local ../agent.py --use-responses-api
+```
+
 ## Notes
 
-Here are some constraints about the LlamaStack implementation:
-* Only supports llama models (so not Qwen)
-* Bridges its tracing and metrics APIs to `otel_trace` and `otel_metric` sinks.
-* Until [this issue][docker] resolves, running docker on Apple Silicon
-  requires emulation.
+* Llama Stack's Responses API connects to MCP servers server-side (unlike aigw,
+  which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`.
+* Uses the `starter` distribution with its built-in `remote::openai` provider,
+  pointing to Ollama via the `OPENAI_BASE_URL` environment variable.
+* Models require the `provider_id/` prefix (e.g., `openai/qwen3:0.6b`).
 
 ---
 [docs]: https://llama-stack.readthedocs.io/en/latest/index.html
 [otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
-[docker]: https://github.com/llamastack/llama-stack/issues/406
````
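
To see the `provider_id/` prefix in practice, list the models through the stack's OpenAI-compatible endpoint. A sketch, assuming the compose stack above is running on localhost:8321:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="unused")

# Ids should come back provider-prefixed, e.g. "openai/qwen3:0.6b", matching
# CHAT_MODEL and AGENT_MODEL in env.local.
for model in client.models.list():
    print(model.id)
```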

inference-platforms/llama-stack/docker-compose.yml

Lines changed: 7 additions & 4 deletions

```diff
@@ -7,17 +7,16 @@ services:
     env_file:
       - env.local
     entrypoint: sh
-    command: -c 'env | grep _MODEL | cut -d= -f2 | xargs -I{} ollama pull {}'
+    command: -c 'env | grep _MODEL | cut -d= -f2 | sed "s/^[^/]*\///" | xargs -I{} ollama pull {}'
     extra_hosts: # send localhost traffic to the docker host, e.g. your laptop
       - "localhost:host-gateway"
 
   llama-stack:
     depends_on:
       ollama-pull:
         condition: service_completed_successfully
-    image: llamastack/distribution-starter:0.2.20
+    image: llamastack/distribution-starter:0.5.0
     container_name: llama-stack
-    platform: linux/amd64 # Force amd64 with emulation
     tty: true
     env_file:
       - env.local
@@ -26,7 +25,11 @@ services:
     # Ensure the container which specially treats localhost routes back to the
     # host machine, e.g. your laptop.
     environment:
-      - OLLAMA_URL=http://host.docker.internal:11434
+      - OPENAI_BASE_URL=http://host.docker.internal:11434/v1
       - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318
+      # Ensure we only see traces sampled upstream. This reduces noise without
+      # disabling SQL tracing entirely.
+      - OTEL_TRACES_SAMPLER=parentbased_traceidratio
+      - OTEL_TRACES_SAMPLER_ARG=0.0
     extra_hosts:
       - "host.docker.internal:host-gateway"
```

inference-platforms/llama-stack/env.local

Lines changed: 5 additions & 8 deletions

```diff
@@ -1,14 +1,11 @@
-# Override default ENV variables for llama-stack
-OPENAI_BASE_URL=http://localhost:8321/v1/openai/v1
+# OpenAI-compatible endpoint configuration
+OPENAI_BASE_URL=http://localhost:8321/v1
 OPENAI_API_KEY=unused
-CHAT_MODEL=llama3.2:1b
-
-# Variables used by llama-stack
-OLLAMA_URL=http://localhost:11434
-INFERENCE_MODEL=llama3.2:1b
+# Models require `provider_id/` prefix, in this case `openai`
+CHAT_MODEL=openai/qwen3:0.6b
+AGENT_MODEL=openai/qwen3:1.7b
 
 # OpenTelemetry configuration
-TELEMETRY_SINKS=otel_trace,otel_metric
 OTEL_SERVICE_NAME=llama-stack
 OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
 OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
```
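
For context, a sketch of how `../chat.py` and `../agent.py` are presumed to pick these values up (the OpenAI SDK also reads `OPENAI_BASE_URL` and `OPENAI_API_KEY` from the environment on its own):

```python
import os

from openai import OpenAI

# Explicit for illustration; OpenAI() alone would read the same variables.
client = OpenAI(
    base_url=os.environ["OPENAI_BASE_URL"],  # llama-stack's OpenAI-compatible root
    api_key=os.environ["OPENAI_API_KEY"],    # "unused": required by the SDK, ignored upstream
)
chat_model = os.getenv("CHAT_MODEL", "openai/qwen3:0.6b")    # provider-prefixed id
agent_model = os.getenv("AGENT_MODEL", "openai/qwen3:1.7b")  # larger model for tool calls
```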
