Commit d932a96

llama-stack: switches to open model configuration with openai remote (#94)
Signed-off-by: Adrian Cole <adrian@tetrate.io>
Co-authored-by: Anuraag Agrawal <anuraaga@gmail.com>
1 parent d67d36e, commit d932a96

7 files changed: 64 additions & 41 deletions

inference-platforms/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -106,7 +106,7 @@ To start and use Ollama, do the following:
 
 ---
 [aigw]: https://aigateway.envoyproxy.io/docs/cli/aigwrun
-[archgw]: https://docs.archgw.com/guides/observability/tracing.html
+[archgw]: https://docs.planoai.dev/guides/observability/tracing.html
 [litellm]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
 [llama-stack]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#telemetry
 [AgC]: https://github.com/masaic-ai-platform/AgC/blob/main/platform/README.md#setting-up-the-opentelemetry-collector
```

inference-platforms/agent.py

Lines changed: 32 additions & 9 deletions

```diff
@@ -16,23 +16,25 @@
 # This must precede any other imports you want to instrument!
 auto_instrumentation.initialize()
 
+import argparse
 import asyncio
 import os
 from datetime import datetime, timedelta
 
 from agents import (
     Agent,
+    HostedMCPTool,
     OpenAIProvider,
     RunConfig,
     Runner,
     Tool,
 )
 from agents.mcp import MCPServerStreamableHttp, MCPUtil
+from openai.types.responses.tool_param import Mcp
 
 
-async def run_agent(tools: list[Tool]):
-    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
-    model = OpenAIProvider(use_responses=False).get_model(model_name)
+async def run_agent(tools: list[Tool], model_name: str, use_responses: bool):
+    model = OpenAIProvider(use_responses=use_responses).get_model(model_name)
     agent = Agent(
         name="flight-search-agent",
         model=model,
@@ -49,18 +51,39 @@ async def run_agent(tools: list[Tool]):
 
 
 async def main():
+    parser = argparse.ArgumentParser(description="MCP-enabled flight search agent")
+    parser.add_argument("--use-responses-api", action="store_true", help="Use Responses API instead of Agents")
+    args = parser.parse_args()
+
+    model_name = os.getenv("AGENT_MODEL", "gpt-5-nano")
+    mcp_url = os.getenv("MCP_URL", "https://mcp.kiwi.com")
+    mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)
+
+    if args.use_responses_api:
+        # Server-side MCP via Responses API
+        tools = [
+            HostedMCPTool(
+                tool_config=Mcp(
+                    type="mcp",
+                    server_url=mcp_url,
+                    server_label="kiwi-flights",
+                    headers=mcp_headers,
+                    require_approval="never",
+                )
+            )
+        ]
+        await run_agent(tools, model_name, use_responses=True)
+        return
+
+    # Client-side MCP orchestration
     async with MCPServerStreamableHttp(
-        {
-            "url": os.getenv("MCP_URL", "https://mcp.kiwi.com"),
-            "headers": dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h),
-            "timeout": 30.0,
-        },
+        {"url": mcp_url, "headers": mcp_headers, "timeout": 30.0},
         client_session_timeout_seconds=60.0,
     ) as server:
         tools = await server.list_tools()
         util = MCPUtil()
         tools = [util.to_function_tool(tool, server, False) for tool in tools]
-        await run_agent(tools)
+        await run_agent(tools, model_name, use_responses=False)
 
 
 if __name__ == "__main__":
```
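
For reference, the `MCP_HEADERS` one-liner in `main()` expects comma-separated `name=value` pairs. A minimal sketch of its behavior, using hypothetical header values:

```python
import os

# Hypothetical value; real deployments set whatever headers their MCP server needs.
os.environ["MCP_HEADERS"] = "Authorization=Bearer abc123,X-Debug=true"

# Same expression as in agent.py: split on commas, then on the first "=" only,
# so values that themselves contain "=" survive intact. An unset or empty
# MCP_HEADERS yields an empty dict.
mcp_headers = dict(h.split("=", 1) for h in os.getenv("MCP_HEADERS", "").split(",") if h)
print(mcp_headers)  # {'Authorization': 'Bearer abc123', 'X-Debug': 'true'}
```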

inference-platforms/archgw/README.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -76,9 +76,9 @@ Just run it again until we find a way to make the results idempotent.
 
 ---
 [docs]: https://github.com/katanemo/archgw?tab=readme-ov-file#use-arch-gateway-as-llm-router
-[config]: https://docs.archgw.com/guides/observability/tracing.html
+[config]: https://docs.planoai.dev/guides/observability/tracing.html
 [envoy-otel]: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/trace/v3/opentelemetry.proto#extension-envoy-tracers-opentelemetry
-[archgw-wasm]: https://github.com/katanemo/archgw/blob/main/arch/README.md
+[archgw-wasm]: https://github.com/katanemo/plano/blob/main/README.md
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
-[openai-responses]: https://github.com/katanemo/archgw/issues/476
+[openai-responses]: https://github.com/katanemo/plano/issues/476
 [otel-tui]: https://github.com/ymtdzzz/otel-tui
```

inference-platforms/chat.py

Lines changed: 2 additions & 4 deletions

```diff
@@ -39,10 +39,8 @@ def main():
 
     # vllm-specific switch to disable thinking, ignored by other inference platforms.
     # See https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes
-    if "qwen3" in model.lower():
-        extra_body = {"chat_template_kwargs": {"enable_thinking": False}}
-    else:
-        extra_body = {}
+    extra_body = {"chat_template_kwargs": {"enable_thinking": False}} if model.startswith("Qwen/Qwen3") else None
+
     if args.use_responses_api:
         response = client.responses.create(
             model=model, input=messages[0]["content"], temperature=0, extra_body=extra_body
```
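
The change also swaps the fall-through value from `{}` to `None`; the OpenAI SDK merges `extra_body` into the request JSON only when it is a dict, so `None` should leave the payload untouched for platforms that reject unknown fields. A quick sketch of the matching rule (model names are illustrative):

```python
# Only vLLM-style Hugging Face ids like "Qwen/Qwen3-0.6B" match; llama-stack's
# provider-prefixed "openai/qwen3:0.6b" and plain OpenAI models do not.
for model in ("Qwen/Qwen3-0.6B", "openai/qwen3:0.6b", "gpt-5-nano"):
    extra_body = {"chat_template_kwargs": {"enable_thinking": False}} if model.startswith("Qwen/Qwen3") else None
    print(f"{model}: {extra_body}")
# Qwen/Qwen3-0.6B: {'chat_template_kwargs': {'enable_thinking': False}}
# openai/qwen3:0.6b: None
# gpt-5-nano: None
```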

inference-platforms/llama-stack/README.md

Lines changed: 14 additions & 12 deletions

````diff
@@ -1,10 +1,7 @@
 # Llama Stack
 
-This shows how to use [Llama Stack][docs] to proxy Ollama, accessible via an
-OpenAI compatible API.
-
-This uses the [`otel` telemetry sink][otel-sink] to export OpenTelemetry traces
-and metrics from signals recorded with Llama Stack's observability SDK.
+This shows how to use [Llama Stack][docs] to proxy Ollama via an OpenAI
+compatible API.
 
 ## Prerequisites
 
@@ -13,7 +10,7 @@ Start Ollama and your OpenTelemetry Collector via this repository's [README](../
 ## Run Llama Stack
 
 ```bash
-docker compose up --pull always --force-recreate --remove-orphans
+docker compose up --force-recreate --remove-orphans
 ```
 
 Clean up when finished, like this:
@@ -36,16 +33,21 @@ Or, for the OpenAI Responses API
 uv run --exact -q --env-file env.local ../chat.py --use-responses-api
 ```
 
+### MCP Agent
+
+```bash
+uv run --exact -q --env-file env.local ../agent.py --use-responses-api
+```
+
 ## Notes
 
-Here are some constraints about the LlamaStack implementation:
-* Only supports llama models (so not Qwen)
-* Bridges its tracing and metrics APIs to `otel_trace` and `otel_metric` sinks.
-* Until [this issue][docker] resolves, running docker on Apple Silicon
-  requires emulation.
+* Llama Stack's Responses API connects to MCP servers server-side (unlike aigw,
+  which proxies MCP). The agent passes MCP configuration via `HostedMCPTool`.
+* Uses the `starter` distribution with its built-in `remote::openai` provider,
+  pointing to Ollama via the `OPENAI_BASE_URL` environment variable.
+* Models require the `provider_id/` prefix (e.g., `openai/qwen3:0.6b`).
 
 ---
 [docs]: https://llama-stack.readthedocs.io/en/latest/index.html
 [otel-sink]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
 [uv]: https://docs.astral.sh/uv/getting-started/installation/
-[docker]: https://github.com/llamastack/llama-stack/issues/406
````
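
To see the `provider_id/` prefix in practice, list the models through the stack's OpenAI-compatible endpoint. A sketch, assuming the compose stack above is running on localhost:8321:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="unused")

# Ids should come back provider-prefixed, e.g. "openai/qwen3:0.6b", matching
# CHAT_MODEL and AGENT_MODEL in env.local.
for model in client.models.list():
    print(model.id)
```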

inference-platforms/llama-stack/docker-compose.yml

Lines changed: 7 additions & 4 deletions

```diff
@@ -7,17 +7,16 @@ services:
     env_file:
       - env.local
     entrypoint: sh
-    command: -c 'env | grep _MODEL | cut -d= -f2 | xargs -I{} ollama pull {}'
+    command: -c 'env | grep _MODEL | cut -d= -f2 | sed "s/^[^/]*\///" | xargs -I{} ollama pull {}'
     extra_hosts: # send localhost traffic to the docker host, e.g. your laptop
       - "localhost:host-gateway"
 
   llama-stack:
     depends_on:
       ollama-pull:
         condition: service_completed_successfully
-    image: llamastack/distribution-starter:0.2.20
+    image: llamastack/distribution-starter:0.5.0
     container_name: llama-stack
-    platform: linux/amd64 # Force amd64 with emulation
     tty: true
     env_file:
       - env.local
@@ -26,7 +25,11 @@ services:
     # Ensure the container which specially treats localhost routes back to the
     # host machine, e.g. your laptop.
     environment:
-      - OLLAMA_URL=http://host.docker.internal:11434
+      - OPENAI_BASE_URL=http://host.docker.internal:11434/v1
       - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318
+      # Ensure we only see traces sampled upstream. This reduces noise without
+      # disabling SQL tracing entirely.
+      - OTEL_TRACES_SAMPLER=parentbased_traceidratio
+      - OTEL_TRACES_SAMPLER_ARG=0.0
     extra_hosts:
       - "host.docker.internal:host-gateway"
```

inference-platforms/llama-stack/env.local

Lines changed: 5 additions & 8 deletions

```diff
@@ -1,14 +1,11 @@
-# Override default ENV variables for llama-stack
-OPENAI_BASE_URL=http://localhost:8321/v1/openai/v1
+# OpenAI-compatible endpoint configuration
+OPENAI_BASE_URL=http://localhost:8321/v1
 OPENAI_API_KEY=unused
-CHAT_MODEL=llama3.2:1b
-
-# Variables used by llama-stack
-OLLAMA_URL=http://localhost:11434
-INFERENCE_MODEL=llama3.2:1b
+# Models require `provider_id/` prefix, in this case `openai`
+CHAT_MODEL=openai/qwen3:0.6b
+AGENT_MODEL=openai/qwen3:1.7b
 
 # OpenTelemetry configuration
-TELEMETRY_SINKS=otel_trace,otel_metric
 OTEL_SERVICE_NAME=llama-stack
 OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
 OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
```
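
For context, a sketch of how `../chat.py` and `../agent.py` are presumed to pick these values up (the OpenAI SDK also reads `OPENAI_BASE_URL` and `OPENAI_API_KEY` from the environment on its own):

```python
import os

from openai import OpenAI

# Explicit for illustration; OpenAI() alone would read the same variables.
client = OpenAI(
    base_url=os.environ["OPENAI_BASE_URL"],  # llama-stack's OpenAI-compatible root
    api_key=os.environ["OPENAI_API_KEY"],    # "unused": required by the SDK, ignored upstream
)
chat_model = os.getenv("CHAT_MODEL", "openai/qwen3:0.6b")    # provider-prefixed id
agent_model = os.getenv("AGENT_MODEL", "openai/qwen3:1.7b")  # larger model for tool calls
```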
