From 2e8c00c12d94e1aa0ccc3d0ee00688cc3f8bb22d Mon Sep 17 00:00:00 2001
From: Eric Curtin <eric.curtin@docker.com>
Date: Wed, 8 Apr 2026 14:46:11 +0100
Subject: [PATCH 1/2] add interactive gateway demo

Introduces demos/gateway/ with a step-by-step shell demo for the
model-cli gateway command. Each step pauses for Enter, types commands
character-by-character as 'docker model gateway', and covers health,
auth, chat completions, streaming, embeddings, load balancing,
fallbacks, and OpenAI SDK compatibility.

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
---
 demos/gateway/README.md            | 114 +++++++
 demos/gateway/config-advanced.yaml |  39 +++
 demos/gateway/config-basic.yaml    |  16 +
 demos/gateway/demo.sh              | 494 +++++++++++++++++++++++++++++
 4 files changed, 663 insertions(+)
 create mode 100644 demos/gateway/README.md
 create mode 100644 demos/gateway/config-advanced.yaml
 create mode 100644 demos/gateway/config-basic.yaml
 create mode 100755 demos/gateway/demo.sh

diff --git a/demos/gateway/README.md b/demos/gateway/README.md
new file mode 100644
index 00000000..169c9f70
--- /dev/null
+++ b/demos/gateway/README.md
@@ -0,0 +1,114 @@
+# model-cli gateway demo
+
+Demonstrates the `model-cli gateway` command — a lightweight,
+OpenAI-compatible LLM proxy that sits in front of Docker Model Runner
+(and other providers) and adds routing, load balancing, retries,
+fallbacks, and auth.
+
+## Prerequisites
+
+1. Docker Desktop with Model Runner enabled
+2. The `model-cli` binary built:
+   ```bash
+   cd model-cli && cargo build --release
+   ```
+3. Models pulled:
+   ```bash
+   docker model pull ai/smollm2
+   docker model pull ai/gemma3
+   docker model pull ai/qwen3:0.6B-Q4_0
+   docker model pull ai/nomic-embed-text-v1.5
+   ```
+4. Python `openai` package (for step 11):
+   ```bash
+   pip install openai
+   ```
+
+## Run the demo
+
+```bash
+./demos/gateway/demo.sh
+```
+
+The script starts the gateway on `http://localhost:4000`, runs through
+every feature, then shuts the gateway down on exit.
+
+## Files
+
+| File | Purpose |
+|------|---------|
+| `config-basic.yaml`    | Single-provider config with two models and bearer-token auth |
+| `config-advanced.yaml` | Multi-deployment config showing load balancing and fallbacks |
+| `demo.sh`              | Full end-to-end demo script |
+
+## What is demonstrated
+
+| # | Feature | Config |
+|---|---------|--------|
+| 1 | Write a gateway config (`config-basic.yaml`) | basic |
+| 2 | Start gateway | basic |
+| 3 | `/health` endpoint | basic |
+| 4 | `/v1/models` — OpenAI-compatible model list | basic |
+| 5 | Auth rejection with wrong key (HTTP 401) | basic |
+| 6 | Non-streaming chat completion | basic |
+| 7 | Streaming chat completion (SSE) | basic |
+| 8 | Switch to advanced config | advanced |
+| 9 | Round-robin load balancing across two deployments | advanced |
+| 10 | Dedicated embedding model (`nomic-embed-text`) | advanced |
+| 11 | OpenAI Python SDK — zero code changes required | advanced |
+
+## Config anatomy
+
+```yaml
+model_list:
+  # Alias the client uses       Provider / actual model on DMR
+  - model_name: fast-model
+    params:
+      model: docker_model_runner/ai/smollm2
+
+  # Second entry with same alias → round-robin load balancing
+  - model_name: fast-model
+    params:
+      model: docker_model_runner/ai/qwen3:0.6B-Q4_0
+
+  - model_name: big-model
+    params:
+      model: docker_model_runner/ai/gemma3
+
+general_settings:
+  master_key: demo-secret   # Bearer token required on all requests
+  num_retries: 2            # retry up to 2 times before fallback
+  fallbacks:
+    - fast-model: [big-model]   # automatic fallback chain
+```
+
+## Manual curl examples
+
+```bash
+GW="http://localhost:4000"
+KEY="demo-secret"
+
+# Health
+curl "${GW}/health"
+
+# List models
+curl -H "Authorization: Bearer ${KEY}" "${GW}/v1/models"
+
+# Chat completion (the "smollm2" alias is defined in config-basic.yaml)
+curl -X POST "${GW}/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer ${KEY}" \
+  -d '{"model":"smollm2","messages":[{"role":"user","content":"Hello!"}]}'
+
+# Streaming
+curl -N -X POST "${GW}/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer ${KEY}" \
+  -d '{"model":"smollm2","messages":[{"role":"user","content":"Count to 5"}],"stream":true}'
+
+# Embeddings (requires config-advanced.yaml — the "embeddings" alias is not in config-basic.yaml)
+curl -X POST "${GW}/v1/embeddings" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer ${KEY}" \
+  -d '{"model":"embeddings","input":["hello world"]}'
+```
diff --git a/demos/gateway/config-advanced.yaml b/demos/gateway/config-advanced.yaml
new file mode 100644
index 00000000..47ed4901
--- /dev/null
+++ b/demos/gateway/config-advanced.yaml
@@ -0,0 +1,39 @@
+# Advanced gateway config: load balancing, retries, and fallbacks
+#
+# Demonstrates three key gateway features:
+#
+# 1. LOAD BALANCING — two deployments under the same model alias.
+#    The gateway round-robins across them automatically.
+#
+# 2. RETRIES — if a provider call fails, it will be retried up to
+#    num_retries times before giving up (or falling back).
+#
+# 3. FALLBACKS — if "fast-model" exhausts all retries, the gateway
+#    automatically promotes the request to "big-model".
+
+model_list:
+  # Two local DMR models registered under the same alias.
+  # Requests for "fast-model" are round-robined across both.
+  - model_name: fast-model
+    params:
+      model: docker_model_runner/ai/smollm2
+
+  - model_name: fast-model
+    params:
+      model: docker_model_runner/ai/qwen3:0.6B-Q4_0
+
+  # Larger fallback model
+  - model_name: big-model
+    params:
+      model: docker_model_runner/ai/gemma3
+
+  # Embedding model
+  - model_name: embeddings
+    params:
+      model: docker_model_runner/ai/nomic-embed-text-v1.5
+
+general_settings:
+  master_key: demo-secret
+  num_retries: 2                        # retry failing calls twice before fallback
+  fallbacks:
+    - fast-model: [big-model]           # fast-model falls back to big-model
diff --git a/demos/gateway/config-basic.yaml b/demos/gateway/config-basic.yaml
new file mode 100644
index 00000000..9c609636
--- /dev/null
+++ b/demos/gateway/config-basic.yaml
@@ -0,0 +1,16 @@
+# Basic gateway config: single Docker Model Runner provider
+#
+# The gateway exposes a unified OpenAI-compatible API on :4000 and
+# forwards requests to Docker Model Runner running on the local engine.
+
+model_list:
+  - model_name: smollm2        # alias clients use in their requests
+    params:
+      model: docker_model_runner/ai/smollm2   # provider/model
+
+  - model_name: gemma3
+    params:
+      model: docker_model_runner/ai/gemma3
+
+general_settings:
+  master_key: demo-secret      # clients must send: Authorization: Bearer demo-secret
diff --git a/demos/gateway/demo.sh b/demos/gateway/demo.sh
new file mode 100755
index 00000000..1aa9b259
--- /dev/null
+++ b/demos/gateway/demo.sh
@@ -0,0 +1,494 @@
+#!/usr/bin/env bash
+# model-cli gateway — interactive step-by-step demo
+#
+# Each step pauses and waits for Enter before running.
+# Commands are "typed" character-by-character to look live.
+#
+# Prerequisites:
+#   - Docker Model Runner running        (docker model ps)
+#   - model-cli binary built             (cargo build --release in model-cli/)
+#   - Models pulled:
+#       docker model pull ai/smollm2
+#       docker model pull ai/gemma3
+#       docker model pull ai/qwen3:0.6B-Q4_0
+#       docker model pull ai/nomic-embed-text-v1.5
+#   - pip install openai                 (for the SDK step)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+BIN="${REPO_ROOT}/model-cli/target/release/model-cli"
+GATEWAY_PORT=4000
+GATEWAY_URL="http://localhost:${GATEWAY_PORT}"
+API_KEY="demo-secret"
+GATEWAY_PID=""
+
+# ── colours ───────────────────────────────────────────────────────────────────
+reset=$'\033[0m'
+bold=$'\033[1m'
+dim=$'\033[2m'
+green=$'\033[0;32m'
+yellow=$'\033[0;33m'
+blue=$'\033[0;34m'
+cyan=$'\033[0;36m'
+red=$'\033[0;31m'
+white=$'\033[0;37m'
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+# Simulate typing: print $1 one character at a time, sleeping $2 seconds (default 0.04) after each.
+typewrite() {
+    local text="$1"
+    local delay="${2:-0.04}"
+    local char i   # keep the loop index local so it cannot clobber a caller's $i
+    for (( i=0; i<${#text}; i++ )); do
+        char="${text:$i:1}"
+        printf '%s' "$char"
+        sleep "$delay"
+    done
+}
+
+# Print a fake prompt then type-animate the command, then wait for Enter
+# After Enter is pressed the command is actually run.
+run_step() {
+    local description="$1"
+    local command="$2"
+
+    echo
+    printf '%s# %s%s\n' "$dim" "$description" "$reset"
+    printf '%s%s$%s ' "$bold" "$green" "$reset"
+    typewrite "$command" 0.035
+    printf '%s ▌%s' "$dim" "$reset"   # blinking-cursor illusion
+
+    # Wait for Enter
+    read -r -s _
+    printf "\r${bold}${green}\$${reset} ${white}%s${reset}\n" "$command"
+
+    # Actually run it
+    eval "$command"
+}
+
+# A pause with a brief explanatory comment — no command executed
+pause_comment() {
+    local msg="$1"
+    echo
+    printf '%s# %s%s' "$dim" "$msg" "$reset"
+    read -r -s _
+    echo
+}
+
+# Section banner
+section() {
+    echo
+    printf '%s%s┌──────────────────────────────────────────────────────┐%s\n' "$bold" "$blue" "$reset"
+    printf '%s%s│  %-52s│%s\n' "$bold" "$blue" "$*" "$reset"
+    printf '%s%s└──────────────────────────────────────────────────────┘%s\n' "$bold" "$blue" "$reset"
+}
+
+ok()  { printf '%s✓ %s%s\n' "$green" "$*" "$reset"; }
+info(){ printf '%s  %s%s\n' "$cyan"  "$*" "$reset"; }
+
+pretty_json() {
+    python3 -c "import sys,json; print(json.dumps(json.load(sys.stdin), indent=2))"
+}
+
+GATEWAY_LOG=""   # path to temp log file for current gateway instance
+
+wait_for_gateway() {   # poll ${GATEWAY_URL}/health until it succeeds; exit 1 after 30 failed polls
+    local retries=30
+    while ! curl -sf "${GATEWAY_URL}/health" >/dev/null 2>&1; do
+        retries=$((retries - 1))
+        [[ $retries -eq 0 ]] && { printf '%sGateway did not start%s\n' "$red" "$reset"; exit 1; }
+        sleep 0.2   # pause between polls
+    done
+}
+
+# Launch the gateway binary directly (no pipe) so $! is the real PID.
+# Logs go to a temp file; once /health responds, its INFO lines are echoed indented.
+launch_gateway() {
+    local config="$1"
+    GATEWAY_LOG="$(mktemp /tmp/model-cli-gateway-XXXXXX)"   # Xs must be trailing: BSD/macOS mktemp does not expand them before a suffix
+    "${BIN}" gateway \
+        --config "${config}" \
+        --port "${GATEWAY_PORT}" \
+        >"${GATEWAY_LOG}" 2>&1 &
+    GATEWAY_PID=$!
+    wait_for_gateway
+    # Print startup lines captured so far (INFO level only, indented)
+    grep 'INFO' "${GATEWAY_LOG}" 2>/dev/null | sed 's/^/  /' || true
+}
+
+stop_gateway() {   # kill and reap the background gateway (if one was started) and delete its log file
+    if [[ -n "${GATEWAY_PID}" ]]; then
+        kill "${GATEWAY_PID}" 2>/dev/null || true   # ignore failure, e.g. process already gone
+        wait "${GATEWAY_PID}" 2>/dev/null || true
+        GATEWAY_PID=""
+        [[ -n "${GATEWAY_LOG}" ]] && rm -f "${GATEWAY_LOG}"
+        GATEWAY_LOG=""
+    fi
+}
+
+trap 'stop_gateway' EXIT
+
+# ─────────────────────────────────────────────────────────────────────────────
+# INTRO
+# ─────────────────────────────────────────────────────────────────────────────
+
+clear
+printf '%s%s' "$bold" "$blue"
+cat <<'BANNER'
+  ██████╗   ██████╗  ██████╗  ██╗  ██╗ ███████╗ ██████╗
+  ██╔══██╗ ██╔═══██╗ ██╔════╝ ██║ ██╔╝ ██╔════╝ ██╔══██╗
+  ██║  ██║ ██║   ██║ ██║      █████╔╝  █████╗   ██████╔╝
+  ██║  ██║ ██║   ██║ ██║      ██╔═██╗  ██╔══╝   ██╔══██╗
+  ██████╔╝ ╚██████╔╝ ╚██████╗ ██║  ██╗ ███████╗ ██║  ██║
+  ╚═════╝   ╚═════╝   ╚═════╝ ╚═╝  ╚═╝ ╚══════╝ ╚═╝  ╚═╝
+
+  ███╗   ███╗  ██████╗  ██████╗  ███████╗ ██╗
+  ████╗ ████║ ██╔═══██╗ ██╔══██╗ ██╔════╝ ██║
+  ██╔████╔██║ ██║   ██║ ██║  ██║ █████╗   ██║
+  ██║╚██╔╝██║ ██║   ██║ ██║  ██║ ██╔══╝   ██║
+  ██║ ╚═╝ ██║ ╚██████╔╝ ██████╔╝ ███████╗ ███████╗
+  ╚═╝     ╚═╝  ╚═════╝  ╚═════╝  ╚══════╝ ╚══════╝
+
+   ██████╗   █████╗  ████████╗ ███████╗ ██╗    ██╗  █████╗  ██╗   ██╗
+  ██╔════╝  ██╔══██╗ ╚══██╔══╝ ██╔════╝ ██║    ██║ ██╔══██╗ ╚██╗ ██╔╝
+  ██║  ███╗ ███████║    ██║    █████╗   ██║ █╗ ██║ ███████║  ╚████╔╝
+  ██║   ██║ ██╔══██║    ██║    ██╔══╝   ██║███╗██║ ██╔══██║   ╚██╔╝
+  ╚██████╔╝ ██║  ██║    ██║    ███████╗ ╚███╔███╔╝ ██║  ██║    ██║
+   ╚═════╝  ╚═╝  ╚═╝    ╚═╝    ╚══════╝  ╚══╝╚══╝  ╚═╝  ╚═╝    ╚═╝
+BANNER
+printf '%s\n' "$reset"
+printf '%s  Press %s%sEnter%s%s to advance through each step.%s\n' "$dim" "$reset" "$bold" "$reset" "$dim" "$reset"
+printf '%s  The gateway is started behind the scenes — commands shown are%s\n' "$dim" "$reset"
+printf '%s  exactly what you would run in a real session.%s\n' "$dim" "$reset"
+echo
+
+pause_comment "Let's begin — press Enter to start"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 1 — show the config
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 1 — Write a gateway config"
+
+pause_comment "The gateway is driven by a simple YAML file. Here's a basic one."
+
+echo
+printf '%s# demos/gateway/config-basic.yaml%s\n' "$dim" "$reset"
+cat "${SCRIPT_DIR}/config-basic.yaml"
+
+pause_comment "Press Enter to start the gateway"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 2 — start the gateway (shown as docker model gateway)
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 2 — Start the gateway"
+
+# Show the pretty command; actually run our binary in the background
+echo
+printf '%s# Starts an OpenAI-compatible proxy on :4000%s\n' "$dim" "$reset"
+printf '%s%s$%s ' "$bold" "$green" "$reset"
+typewrite "docker model gateway --config demos/gateway/config-basic.yaml" 0.035
+printf '%s ▌%s' "$dim" "$reset"
+read -r -s _
+printf '\r%s%s$%s %sdocker model gateway --config demos/gateway/config-basic.yaml%s\n' "$bold" "$green" "$reset" "$white" "$reset"
+
+# Actually launch
+launch_gateway "${SCRIPT_DIR}/config-basic.yaml"
+echo
+ok "Gateway listening on http://localhost:${GATEWAY_PORT}"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 3 — health check
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 3 — Health check"
+
+run_step \
+    "The gateway exposes /health — no auth required" \
+    "curl -s http://localhost:${GATEWAY_PORT}/health | python3 -m json.tool"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 4 — list models
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 4 — List models  (/v1/models)"
+
+run_step \
+    "OpenAI-compatible model list — clients see gateway aliases, not backend details" \
+    "curl -s http://localhost:${GATEWAY_PORT}/v1/models -H 'Authorization: Bearer ${API_KEY}' | python3 -m json.tool"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 5 — auth rejection
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 5 — Auth enforcement"
+
+pause_comment "The gateway rejects requests with the wrong key"
+
+echo
+printf '%s# Wrong key → 401%s\n' "$dim" "$reset"
+printf '%s%s$%s ' "$bold" "$green" "$reset"
+typewrite "curl -s -o /dev/null -w '%{http_code}' http://localhost:${GATEWAY_PORT}/v1/chat/completions -H 'Authorization: Bearer WRONG'" 0.03
+printf '%s ▌%s' "$dim" "$reset"
+read -r -s _
+printf "\r${bold}${green}\$${reset} ${white}curl -s -o /dev/null -w '%%{http_code}' .../v1/chat/completions -H 'Authorization: Bearer WRONG'${reset}\n"
+
+# Only report success when the gateway really answers 401 to a bad bearer token.
+HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "${GATEWAY_URL}/v1/chat/completions" \
+    -H "Content-Type: application/json" -H "Authorization: Bearer WRONG" \
+    -d '{"model":"smollm2","messages":[{"role":"user","content":"hi"}]}')
+printf "  HTTP %s\n" "$HTTP_CODE"
+[[ "$HTTP_CODE" == "401" ]] || { printf '%s✗ Expected HTTP 401, got %s%s\n' "$red" "$HTTP_CODE" "$reset"; exit 1; }
+ok "Correctly rejected with 401"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 6 — chat completion
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 6 — Chat completion"
+
+pause_comment "Standard OpenAI-compatible chat completions endpoint"
+
+echo
+printf '%s# POST /v1/chat/completions — non-streaming%s\n' "$dim" "$reset"
+printf '%s%s$%s ' "$bold" "$green" "$reset"
+typewrite "curl -s http://localhost:${GATEWAY_PORT}/v1/chat/completions \\" 0.03
+printf "\n"
+typewrite "  -H 'Authorization: Bearer ${API_KEY}' \\" 0.03
+printf "\n"
+typewrite "  -d '{\"model\":\"smollm2\",\"messages\":[{\"role\":\"user\",\"content\":\"What is Docker Model Runner?\"}],\"max_tokens\":80}'" 0.03
+printf '%s ▌%s' "$dim" "$reset"
+read -r -s _
+printf '\r%s%s$%s %scurl -s .../v1/chat/completions -H '"'"'Authorization: Bearer %s'"'"' -d '"'"'{...}'"'"'%s\n\n' "$bold" "$green" "$reset" "$white" "$API_KEY" "$reset"
+
+curl -sf -X POST "${GATEWAY_URL}/v1/chat/completions" \
+     -H "Content-Type: application/json" \
+     -H "Authorization: Bearer ${API_KEY}" \
+     -d '{
+       "model": "smollm2",
+       "messages": [
+         {"role": "system", "content": "You are a helpful assistant. Be very brief."},
+         {"role": "user",   "content": "What is Docker Model Runner? One sentence."}
+       ],
+       "max_tokens": 80
+     }' | pretty_json
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 7 — streaming
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 7 — Streaming (SSE)"
+
+pause_comment "Add stream:true — tokens arrive in real time"
+
+echo
+printf '%s# Same endpoint, stream:true → server-sent events%s\n' "$dim" "$reset"
+printf '%s%s$%s ' "$bold" "$green" "$reset"
+typewrite "curl -sN http://localhost:${GATEWAY_PORT}/v1/chat/completions \\" 0.03
+printf "\n"
+typewrite "  -H 'Authorization: Bearer ${API_KEY}' \\" 0.03
+printf "\n"
+typewrite "  -d '{\"model\":\"smollm2\",\"messages\":[{\"role\":\"user\",\"content\":\"Count 1 to 5\"}],\"stream\":true}'" 0.03
+printf '%s ▌%s' "$dim" "$reset"
+read -r -s _
+printf '\r%s%s$%s %scurl -sN .../v1/chat/completions -d '"'"'{...stream:true...}'"'"'%s\n\n' "$bold" "$green" "$reset" "$white" "$reset"
+
+curl -sfN -X POST "${GATEWAY_URL}/v1/chat/completions" \
+     -H "Content-Type: application/json" \
+     -H "Authorization: Bearer ${API_KEY}" \
+     -d '{
+       "model": "smollm2",
+       "messages": [{"role": "user", "content": "Count from 1 to 5, one number per word."}],
+       "stream": true,
+       "max_tokens": 40
+     }' | while IFS= read -r line; do
+         if [[ "${line}" == data:* ]]; then
+             payload="${line#data: }"
+             [[ "${payload}" == "[DONE]" ]] && break
+             delta=$(printf '%s' "$payload" | python3 -c \
+                 "import sys,json; d=json.load(sys.stdin)['choices'][0]['delta']; print(d.get('content') or '',end='')" 2>/dev/null || true)
+             printf '%s' "$delta"
+         fi
+     done
+echo
+echo
+ok "Streaming response complete"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 8 — advanced config (load balancing + fallbacks)
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 8 — Advanced config: load balancing & fallbacks"
+
+pause_comment "Restart the gateway with the advanced config"
+
+echo
+printf '%s# config-advanced.yaml%s\n' "$dim" "$reset"
+cat "${SCRIPT_DIR}/config-advanced.yaml"
+echo
+
+printf '%s%s$%s ' "$bold" "$green" "$reset"
+typewrite "docker model gateway --config demos/gateway/config-advanced.yaml" 0.035
+printf '%s ▌%s' "$dim" "$reset"
+read -r -s _
+printf '\r%s%s$%s %sdocker model gateway --config demos/gateway/config-advanced.yaml%s\n' "$bold" "$green" "$reset" "$white" "$reset"
+
+stop_gateway
+launch_gateway "${SCRIPT_DIR}/config-advanced.yaml"
+echo
+ok "Gateway restarted with advanced config"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 9 — round-robin load balancing
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 9 — Round-robin load balancing"
+
+pause_comment "'fast-model' has 2 backends — watch them alternate across 4 requests"
+
+echo
+printf '%s# 4 requests to '"'"'fast-model'"'"' → round-robins across smollm2 + qwen3%s\n' "$dim" "$reset"
+printf '%s%s$%s ' "$bold" "$green" "$reset"
+typewrite "for i in 1 2 3 4; do curl -s .../v1/chat/completions -d '{\"model\":\"fast-model\",...}'; done" 0.03
+printf '%s ▌%s' "$dim" "$reset"
+read -r -s _
+printf '\r%s%s$%s %sfor i in 1 2 3 4; do curl -s .../v1/chat/completions ...; done%s\n\n' "$bold" "$green" "$reset" "$white" "$reset"
+
+for i in 1 2 3 4; do
+    resp=$(curl -sf -X POST "${GATEWAY_URL}/v1/chat/completions" \
+         -H "Content-Type: application/json" \
+         -H "Authorization: Bearer ${API_KEY}" \
+         -d "{
+           \"model\": \"fast-model\",
+           \"messages\": [{\"role\": \"user\", \"content\": \"Reply with only the number ${i}\"}],
+           \"max_tokens\": 10
+         }")
+    model_used=$(printf '%s' "$resp" | python3 -c \
+        "import sys,json; print(json.load(sys.stdin).get('model','?'))")
+    content=$(printf '%s' "$resp" | python3 -c \
+        "import sys,json; print(json.load(sys.stdin)['choices'][0]['message']['content'].strip())")
+    printf "  Request %d  backend=%-40s  reply=%s\n" "$i" "$model_used" "$content"
+done
+echo
+ok "Requests distributed across both backends"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 10 — embeddings
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 10 — Embeddings (nomic-embed-text)"
+
+pause_comment "Dedicated embedding model behind its own alias"
+
+echo
+printf '%s# POST /v1/embeddings — two sentences, then compute cosine similarity%s\n' "$dim" "$reset"
+printf '%s%s$%s ' "$bold" "$green" "$reset"
+typewrite "curl -s -X POST http://localhost:${GATEWAY_PORT}/v1/embeddings \\" 0.03
+printf "\n"
+typewrite "  -H 'Content-Type: application/json' \\" 0.03
+printf "\n"
+typewrite "  -H 'Authorization: Bearer ${API_KEY}' \\" 0.03
+printf "\n"
+typewrite "  -d '{\"model\":\"embeddings\",\"input\":[\"The quick brown fox\",\"A fast auburn canine\"]}'" 0.03
+printf '%s ▌%s' "$dim" "$reset"
+read -r -s _
+printf '\r%s%s$%s %scurl -s -X POST .../v1/embeddings -d '"'"'{...}'"'"'%s\n\n' "$bold" "$green" "$reset" "$white" "$reset"
+
+curl -sf -X POST "${GATEWAY_URL}/v1/embeddings" \
+     -H "Content-Type: application/json" \
+     -H "Authorization: Bearer ${API_KEY}" \
+     -d '{"model":"embeddings","input":["The quick brown fox","A fast auburn canine"]}' \
+| python3 -c "
+import sys, json, math
+resp = json.load(sys.stdin)
+vecs = [item['embedding'] for item in sorted(resp['data'], key=lambda x: x['index'])]
+def cos(a, b):
+    dot = sum(x*y for x,y in zip(a,b))
+    na  = math.sqrt(sum(x*x for x in a))
+    nb  = math.sqrt(sum(x*x for x in b))
+    return dot / (na * nb) if na and nb else 0.0
+print(f'  Vector dimensions : {len(vecs[0])}')
+print(f'  Cosine similarity : {cos(vecs[0], vecs[1]):.4f}')
+print(f'  (sentences are semantically similar → high score)')
+"
+ok "Embeddings complete"
+
+# ─────────────────────────────────────────────────────────────────────────────
+# STEP 11 — OpenAI Python SDK
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Step 11 — OpenAI Python SDK compatibility"
+
+pause_comment "Any app already using the openai library works with zero code changes"
+
+echo
+printf '%s# python demo — just swap base_url to point at the gateway%s\n' "$dim" "$reset"
+cat <<'PYSHOW'
+  from openai import OpenAI
+
+  client = OpenAI(
+      base_url="http://localhost:4000/v1",
+      api_key="demo-secret",
+  )
+
+  resp = client.chat.completions.create(
+      model="fast-model",
+      messages=[{"role": "user", "content": "Name 3 benefits of local LLMs."}],
+      max_tokens=120,
+  )
+  print(resp.choices[0].message.content)
+PYSHOW
+
+printf '%s%s$%s ' "$bold" "$green" "$reset"
+typewrite "python3 demo.py" 0.05
+printf '%s ▌%s' "$dim" "$reset"
+read -r -s _
+printf '\r%s%s$%s %spython3 demo.py%s\n\n' "$bold" "$green" "$reset" "$white" "$reset"
+
+if python3 -c "import openai" 2>/dev/null; then
+    python3 - <<'PYEOF'
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:4000/v1",
+    api_key="demo-secret",
+)
+
+resp = client.chat.completions.create(
+    model="fast-model",
+    messages=[
+        {"role": "system", "content": "You are a concise assistant."},
+        {"role": "user",   "content": "Name 3 benefits of running LLMs locally."},
+    ],
+    max_tokens=120,
+)
+print(resp.choices[0].message.content)
+PYEOF
+    echo
+    ok "OpenAI SDK works against the gateway — no code changes required"
+else
+    printf '%s  (skipped — openai package not installed: pip install openai)%s\n' "$yellow" "$reset"
+    ok "OpenAI SDK step skipped — install openai to run it"
+fi
+
+# ─────────────────────────────────────────────────────────────────────────────
+# DONE
+# ─────────────────────────────────────────────────────────────────────────────
+
+section "Demo complete"
+
+echo
+printf '%s  What we showed:%s\n' "$bold" "$reset"
+info "YAML-driven config — models, auth, retries, fallbacks"
+info "/health  /v1/models  /v1/chat/completions  /v1/embeddings"
+info "Bearer-token auth  (accept ✓  reject 401 ✓)"
+info "Non-streaming and streaming (SSE) chat completions"
+info "Round-robin load balancing across multiple backends"
+info "Automatic fallback chain when a backend fails"
+info "Drop-in OpenAI Python SDK compatibility"
+echo
+ok "docker model gateway demo finished"

From 2b3f9e2a6fdddeb87961d2c737f5058efb052b4b Mon Sep 17 00:00:00 2001
From: Eric Curtin <eric.curtin@docker.com>
Date: Thu, 9 Apr 2026 14:53:08 +0100
Subject: [PATCH 2/2] fix: allow vllm-metal ZMQ IPC sockets in macOS sandbox

vllm-metal uses ZMQ IPC sockets at temporary paths under
/private/var/folders (the macOS TMPDIR) for internal inter-process
communication between API server workers. The Python sandbox profile
only allowed network-bind for Unix sockets matching the
inference.*-[0-9]+\.sock$ pattern and TCP loopback, which caused
a ZMQError: Operation not permitted when vllm-metal tried to bind
those sockets.

Allow network-bind on paths under /private/var/folders so vllm-metal
can create its internal ZMQ IPC sockets in the system temp directory.
---
 pkg/sandbox/sandbox_darwin.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pkg/sandbox/sandbox_darwin.go b/pkg/sandbox/sandbox_darwin.go
index db6dbd86..78351cfc 100644
--- a/pkg/sandbox/sandbox_darwin.go
+++ b/pkg/sandbox/sandbox_darwin.go
@@ -26,10 +26,14 @@ const ConfigurationPython = `(version 1)
 ;;; Python backends use either a Unix socket or a TCP loopback port.
 ;;; Allow Unix socket paths that match the inference socket naming convention
 ;;; as well as TCP loopback binding/inbound for backends that use TCP.
+;;; Also allow Unix domain socket binding in the system temp directory
+;;; (/private/var/folders) which vllm-metal uses for internal ZMQ IPC sockets.
 (deny network*)
 (allow network-bind network-inbound
     (regex #"inference.*-[0-9]+\.sock$")
     (local tcp "localhost:*"))
+(allow network-bind
+    (regex #"^/private/var/folders/"))
 
 ;;; Deny access to the camera and microphone.
 (deny device*)