Skip to content

Commit e10eaaf

Browse files
authored
Merge branch 'main' into issues_1514
2 parents 25a211f + b501724 commit e10eaaf

69 files changed

Lines changed: 3496 additions & 410 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/integration-test-memory.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ jobs:
6161
echo "=== Disk space after cleanup ==="
6262
df -h /
6363
64+
- name: Pre-pull Milvus image
65+
run: docker pull milvusdb/milvus:v2.3.3
66+
6467
- name: Run memory integration tests
6568
run: |
6669
make memory-test-integration DOCKER_TAG=ci VLLM_SR_IMAGE=vllm-sr:ci

config/config.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,22 @@ routing:
467467
- name: support_escalated
468468
gte: 0.45
469469

470+
session_states:
471+
- name: session_routing
472+
fields:
473+
- name: turn_number
474+
type: int
475+
- name: current_model
476+
type: string
477+
- name: cumulative_cost_usd
478+
type: float
479+
- name: retry_count_ema
480+
type: float
481+
- name: quality_score_ema
482+
type: float
483+
- name: kv_cache_warm
484+
type: float
485+
470486
decisions:
471487
- name: static_business_route
472488
description: Static fallback for standard business traffic.

e2e/pkg/fixtures/chat.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ type ChatMessage struct {
1616
type ChatCompletionsRequest struct {
1717
Model string `json:"model"`
1818
Messages []ChatMessage `json:"messages"`
19+
// User is optional; forwarded for per-user routing and session correlation in tests.
20+
User string `json:"user,omitempty"`
1921
}
2022

2123
// ChatCompletionsClient talks to the routed chat-completions API.

e2e/pkg/fixtures/session.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,30 @@ func OpenServiceSession(ctx context.Context, client *kubernetes.Clientset, opts
5454
return newSession(localPort, stop), nil
5555
}
5656

57+
// OpenSemanticRouterMetricsSession establishes a port-forward to the router Prometheus /metrics endpoint (port 9190).
58+
func OpenSemanticRouterMetricsSession(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) (*ServiceSession, error) {
59+
localPort, err := getAvailablePort()
60+
if err != nil {
61+
return nil, fmt.Errorf("failed to get available port: %w", err)
62+
}
63+
64+
stop, err := helpers.StartPortForward(
65+
ctx,
66+
client,
67+
opts.RestConfig,
68+
"vllm-semantic-router-system",
69+
"semantic-router",
70+
fmt.Sprintf("%s:9190", localPort),
71+
opts.Verbose,
72+
)
73+
if err != nil {
74+
return nil, fmt.Errorf("metrics port-forward failed: %w", err)
75+
}
76+
77+
time.Sleep(2 * time.Second)
78+
return newSession(localPort, stop), nil
79+
}
80+
5781
// OpenRouterAPISession establishes a port-forward to the semantic-router API service.
5882
func OpenRouterAPISession(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) (*ServiceSession, error) {
5983
localPort, err := getAvailablePort()

e2e/pkg/testmatrix/testcases.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ var BaselineRouterContract = []string{
2121
"decision-fallback-behavior",
2222
"plugin-config-variations",
2323
"chat-completions-progressive-stress",
24+
// Session observability
25+
"session-telemetry-metrics",
26+
"session-pricing-chat-completions",
27+
"session-pricing-response-api",
2428
}
2529

2630
// DashboardContract is the canonical E2E contract for the dashboard API surface.
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
package testcases
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"net/http"
7+
"strings"
8+
"time"
9+
10+
"github.com/vllm-project/semantic-router/e2e/pkg/fixtures"
11+
pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases"
12+
"k8s.io/client-go/kubernetes"
13+
)
14+
15+
func init() {
16+
pkgtestcases.Register("session-pricing-chat-completions", pkgtestcases.TestCase{
17+
Description: "After a routed chat completion, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured",
18+
Tags: []string{"kubernetes", "observability", "metrics", "llm", "pricing"},
19+
Fn: testSessionPricingChatCompletions,
20+
})
21+
pkgtestcases.Register("session-pricing-response-api", pkgtestcases.TestCase{
22+
Description: "After a routed Response API call, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured",
23+
Tags: []string{"kubernetes", "observability", "metrics", "llm", "pricing", "response-api"},
24+
Fn: testSessionPricingResponseAPI,
25+
})
26+
}
27+
28+
// testSessionPricingChatCompletions verifies that after a Chat Completions request the
29+
// llm_session_turn_cost histogram is present in /metrics (pricing must be configured
30+
// for the routed model in router-config.yaml for the observation to appear).
31+
func testSessionPricingChatCompletions(
32+
ctx context.Context,
33+
client *kubernetes.Clientset,
34+
opts pkgtestcases.TestCaseOptions,
35+
) error {
36+
traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
37+
if err != nil {
38+
return err
39+
}
40+
defer traffic.Close()
41+
42+
metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
43+
if err != nil {
44+
return err
45+
}
46+
defer metricsSession.Close()
47+
48+
chat := fixtures.NewChatCompletionsClient(traffic, 60*time.Second)
49+
50+
headers := map[string]string{
51+
"x-authz-user-id": "e2e-pricing-chat-user",
52+
}
53+
resp, err := chat.Create(ctx, fixtures.ChatCompletionsRequest{
54+
Model: "MoM",
55+
Messages: []fixtures.ChatMessage{
56+
{Role: "user", Content: "Say hello in one short sentence for pricing telemetry."},
57+
},
58+
User: "e2e-pricing-chat-user",
59+
}, headers)
60+
if err != nil {
61+
return err
62+
}
63+
if resp.StatusCode != http.StatusOK {
64+
return fmt.Errorf("chat completion: expected 200, got %d: %s", resp.StatusCode, string(resp.Body))
65+
}
66+
67+
body, err := fetchMetrics(ctx, metricsSession)
68+
if err != nil {
69+
return err
70+
}
71+
72+
// Token histograms from PR 1 must still be present.
73+
if !strings.Contains(body, "llm_session_turn_prompt_tokens") {
74+
return fmt.Errorf("metrics body missing llm_session_turn_prompt_tokens")
75+
}
76+
if !strings.Contains(body, "llm_session_turn_completion_tokens") {
77+
return fmt.Errorf("metrics body missing llm_session_turn_completion_tokens")
78+
}
79+
// Cost histogram descriptor must be registered (present even when no observations).
80+
if !strings.Contains(body, "llm_session_turn_cost") {
81+
return fmt.Errorf("metrics body missing llm_session_turn_cost")
82+
}
83+
84+
if opts.SetDetails != nil {
85+
opts.SetDetails(map[string]interface{}{
86+
"chat_status": resp.StatusCode,
87+
})
88+
}
89+
return nil
90+
}
91+
92+
// testSessionPricingResponseAPI verifies that after a Response API request the
93+
// llm_session_turn_cost histogram descriptor is exposed in /metrics.
94+
func testSessionPricingResponseAPI(
95+
ctx context.Context,
96+
client *kubernetes.Clientset,
97+
opts pkgtestcases.TestCaseOptions,
98+
) error {
99+
traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
100+
if err != nil {
101+
return err
102+
}
103+
defer traffic.Close()
104+
105+
metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
106+
if err != nil {
107+
return err
108+
}
109+
defer metricsSession.Close()
110+
111+
respAPI := fixtures.NewResponseAPIClient(traffic, 60*time.Second)
112+
113+
_, raw, err := respAPI.Create(ctx, fixtures.ResponseAPIRequest{
114+
Model: "MoM",
115+
Input: "Say hello in one short sentence for Response API pricing telemetry.",
116+
})
117+
if err != nil {
118+
return fmt.Errorf("response api create: %w", err)
119+
}
120+
if raw.StatusCode != http.StatusOK {
121+
return fmt.Errorf("response api: expected 200, got %d: %s", raw.StatusCode, string(raw.Body))
122+
}
123+
124+
body, err := fetchMetrics(ctx, metricsSession)
125+
if err != nil {
126+
return err
127+
}
128+
129+
if !strings.Contains(body, "llm_session_turn_cost") {
130+
return fmt.Errorf("metrics body missing llm_session_turn_cost after Response API request")
131+
}
132+
if !strings.Contains(body, "llm_session_turn_prompt_tokens") {
133+
return fmt.Errorf("metrics body missing llm_session_turn_prompt_tokens after Response API request")
134+
}
135+
if !strings.Contains(body, "llm_session_turn_completion_tokens") {
136+
return fmt.Errorf("metrics body missing llm_session_turn_completion_tokens after Response API request")
137+
}
138+
139+
if opts.SetDetails != nil {
140+
opts.SetDetails(map[string]interface{}{
141+
"response_api_status": raw.StatusCode,
142+
})
143+
}
144+
return nil
145+
}
146+
147+
// fetchMetrics retrieves the Prometheus /metrics text from the router metrics port.
148+
func fetchMetrics(ctx context.Context, metricsSession *fixtures.ServiceSession) (string, error) {
149+
metricsHTTP := metricsSession.HTTPClient(15 * time.Second)
150+
metricsResp, err := fixtures.DoGETRequest(ctx, metricsHTTP, metricsSession.URL("/metrics"))
151+
if err != nil {
152+
return "", fmt.Errorf("fetch /metrics: %w", err)
153+
}
154+
if metricsResp.StatusCode != http.StatusOK {
155+
return "", fmt.Errorf("/metrics: expected 200, got %d", metricsResp.StatusCode)
156+
}
157+
return string(metricsResp.Body), nil
158+
}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package testcases
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"net/http"
7+
"strings"
8+
"time"
9+
10+
"github.com/vllm-project/semantic-router/e2e/pkg/fixtures"
11+
pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases"
12+
"k8s.io/client-go/kubernetes"
13+
)
14+
15+
func init() {
16+
pkgtestcases.Register("session-telemetry-metrics", pkgtestcases.TestCase{
17+
Description: "After a routed chat completion, Prometheus exposes llm_session_turn_* histograms on the router metrics port",
18+
Tags: []string{"kubernetes", "observability", "metrics", "llm"},
19+
Fn: testSessionTelemetryMetrics,
20+
})
21+
}
22+
23+
func testSessionTelemetryMetrics(
24+
ctx context.Context,
25+
client *kubernetes.Clientset,
26+
opts pkgtestcases.TestCaseOptions,
27+
) error {
28+
traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
29+
if err != nil {
30+
return err
31+
}
32+
defer traffic.Close()
33+
34+
metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
35+
if err != nil {
36+
return err
37+
}
38+
defer metricsSession.Close()
39+
40+
chat := fixtures.NewChatCompletionsClient(traffic, 60*time.Second)
41+
42+
headers := map[string]string{
43+
"x-authz-user-id": "e2e-session-telemetry-user",
44+
}
45+
resp, err := chat.Create(ctx, fixtures.ChatCompletionsRequest{
46+
Model: "MoM",
47+
Messages: []fixtures.ChatMessage{
48+
{Role: "user", Content: "Say hello in one short sentence for session telemetry."},
49+
},
50+
User: "e2e-session-telemetry-user",
51+
}, headers)
52+
if err != nil {
53+
return err
54+
}
55+
if resp.StatusCode != http.StatusOK {
56+
return fmt.Errorf("chat completion: expected 200, got %d: %s", resp.StatusCode, string(resp.Body))
57+
}
58+
59+
metricsHTTP := metricsSession.HTTPClient(15 * time.Second)
60+
metricsResp, err := fixtures.DoGETRequest(ctx, metricsHTTP, metricsSession.URL("/metrics"))
61+
if err != nil {
62+
return fmt.Errorf("fetch /metrics: %w", err)
63+
}
64+
if metricsResp.StatusCode != http.StatusOK {
65+
return fmt.Errorf("/metrics: expected 200, got %d", metricsResp.StatusCode)
66+
}
67+
body := string(metricsResp.Body)
68+
if !strings.Contains(body, "llm_session_turn_prompt_tokens") {
69+
return fmt.Errorf("metrics body missing llm_session_turn_prompt_tokens")
70+
}
71+
if !strings.Contains(body, "llm_session_turn_completion_tokens") {
72+
return fmt.Errorf("metrics body missing llm_session_turn_completion_tokens")
73+
}
74+
75+
if opts.SetDetails != nil {
76+
opts.SetDetails(map[string]interface{}{
77+
"chat_status": resp.StatusCode,
78+
})
79+
}
80+
return nil
81+
}

e2e/testing/run_memory_integration.sh

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,41 @@ HF_HUB_ENABLE_HF_TRANSFER=1 \
9292
python3 -c "from huggingface_hub import snapshot_download; snapshot_download('sentence-transformers/all-MiniLM-L12-v2', local_dir='${TEST_DIR}/models/mom-embedding-light', local_dir_use_symlinks=False)"
9393

9494
make -C "${REPO_ROOT}" start-milvus
95+
96+
# Double-check Milvus readiness with pymilvus probe (gRPC-level, not just HTTP)
97+
echo "Verifying Milvus gRPC readiness via pymilvus..."
98+
for attempt in $(seq 1 30); do
99+
if python3 -c "
100+
from pymilvus import connections
101+
try:
102+
connections.connect('default', host='localhost', port='19530', timeout=5)
103+
connections.disconnect('default')
104+
print('Milvus gRPC connection verified')
105+
except Exception as e:
106+
raise SystemExit(1)
107+
" 2>/dev/null; then
108+
break
109+
fi
110+
if [ "${attempt}" -eq 30 ]; then
111+
echo "ERROR: Milvus gRPC not ready after 30 attempts"
112+
"${CONTAINER_RUNTIME}" logs milvus-semantic-cache 2>&1 | tail -30 || true
113+
exit 1
114+
fi
115+
sleep 2
116+
done
117+
95118
cp "${REPO_ROOT}/e2e/config/config.memory-user.yaml" "${CONFIG_FILE}"
96-
python3 -c 'from pathlib import Path; path = Path("'"${CONFIG_FILE}"'"); path.write_text(path.read_text().replace("host.docker.internal:8000", "llm-katan:8000"))'
119+
python3 -c 'from pathlib import Path; path = Path("'"${CONFIG_FILE}"'"); t = path.read_text(); t = t.replace("host.docker.internal:8000", "llm-katan:8000"); t = t.replace("host.docker.internal:19530", "vllm-sr-milvus:19530"); path.write_text(t)'
97120

98121
if ! "${CONTAINER_RUNTIME}" network inspect "${VLLM_SR_NETWORK}" >/dev/null 2>&1; then
99122
"${CONTAINER_RUNTIME}" network create "${VLLM_SR_NETWORK}" >/dev/null
100123
fi
101124

125+
# Connect the externally-started Milvus to the vllm-sr network so the router
126+
# container can reach it by the name vllm-sr serve expects.
127+
"${CONTAINER_RUNTIME}" network connect --alias vllm-sr-milvus "${VLLM_SR_NETWORK}" milvus-semantic-cache 2>/dev/null || true
128+
echo "Milvus connected to ${VLLM_SR_NETWORK} as vllm-sr-milvus"
129+
102130
"${CONTAINER_RUNTIME}" run -d --name llm-katan \
103131
--network "${VLLM_SR_NETWORK}" \
104132
--network-alias llm-katan \
@@ -141,7 +169,7 @@ fi
141169

142170
VLLM_SR_PID="$(cat "${PID_FILE}")"
143171

144-
for _ in $(seq 1 180); do
172+
for _ in $(seq 1 300); do
145173
http_code="$(curl -s -o /dev/null -w "%{http_code}" "${ROUTER_API_HEALTH_URL}" 2>/dev/null || echo "000")"
146174
if [[ "${http_code}" == "200" ]]; then
147175
echo "vllm-sr router API ready"

0 commit comments

Comments
 (0)