vllm-project
diff --git a/‎.github/workflows/integration-test-memory.yml‎
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/integration-test-memory.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎config/config.yaml‎
Lines changed: 16 additions & 0 deletions b/‎config/config.yaml‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎e2e/pkg/fixtures/chat.go‎
Lines changed: 2 additions & 0 deletions b/‎e2e/pkg/fixtures/chat.go‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎e2e/pkg/fixtures/session.go‎
Lines changed: 24 additions & 0 deletions b/‎e2e/pkg/fixtures/session.go‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎e2e/pkg/testmatrix/testcases.go‎
Lines changed: 4 additions & 0 deletions b/‎e2e/pkg/testmatrix/testcases.go‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎e2e/testcases/session_pricing_e2e.go‎
Lines changed: 158 additions & 0 deletions b/‎e2e/testcases/session_pricing_e2e.go‎
Lines changed: 158 additions & 0 deletions
diff --git a/‎e2e/testcases/session_telemetry_e2e.go‎
Lines changed: 81 additions & 0 deletions b/‎e2e/testcases/session_telemetry_e2e.go‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎e2e/testing/run_memory_integration.sh‎
Lines changed: 30 additions & 2 deletions b/‎e2e/testing/run_memory_integration.sh‎
Lines changed: 30 additions & 2 deletions
@@ -61,6 +61,9 @@ jobs:
           echo "=== Disk space after cleanup ==="
           df -h /
 
+      - name: Pre-pull Milvus image
+        run: docker pull milvusdb/milvus:v2.3.3
+
       - name: Run memory integration tests
         run: |
           make memory-test-integration DOCKER_TAG=ci VLLM_SR_IMAGE=vllm-sr:ci
 
@@ -467,6 +467,22 @@ routing:
           - name: support_escalated
             gte: 0.45
 
+  session_states:  # declares one session state (session_routing) with six typed fields
+    - name: session_routing
+      fields:
+        - name: turn_number
+          type: int
+        - name: current_model
+          type: string
+        - name: cumulative_cost_usd
+          type: float
+        - name: retry_count_ema  # NOTE(review): "ema" presumably means exponential moving average - confirm
+          type: float
+        - name: quality_score_ema
+          type: float
+        - name: kv_cache_warm
+          type: float  # NOTE(review): name reads like a flag but the declared type is float - confirm intended type
+
   decisions:
     - name: static_business_route
       description: Static fallback for standard business traffic.
 
@@ -16,6 +16,8 @@ type ChatMessage struct {
 type ChatCompletionsRequest struct {
 	Model    string        `json:"model"`
 	Messages []ChatMessage `json:"messages"`
+	// User is optional: the "user" key is omitted from the JSON body when empty. Tests set it for per-user routing and session correlation.
+	User string `json:"user,omitempty"`
 }
 
 // ChatCompletionsClient talks to the routed chat-completions API.
 
@@ -54,6 +54,30 @@ func OpenServiceSession(ctx context.Context, client *kubernetes.Clientset, opts
 	return newSession(localPort, stop), nil
 }
 
+// OpenSemanticRouterMetricsSession port-forwards a free local port to port 9190 (the router
+// Prometheus /metrics endpoint) of "semantic-router" in the "vllm-semantic-router-system"
+// namespace and returns the forward wrapped in a ServiceSession.
+//
+// An error is returned when no local port can be obtained or when the port-forward cannot
+// be started; in both cases the returned session is nil.
+func OpenSemanticRouterMetricsSession(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) (*ServiceSession, error) {
+	port, portErr := getAvailablePort()
+	if portErr != nil {
+		return nil, fmt.Errorf("failed to get available port: %w", portErr)
+	}
+
+	// "<local>:<remote>" mapping handed to the port-forward helper.
+	portMapping := fmt.Sprintf("%s:9190", port)
+	stopForward, forwardErr := helpers.StartPortForward(ctx, client, opts.RestConfig, "vllm-semantic-router-system", "semantic-router", portMapping, opts.Verbose)
+	if forwardErr != nil {
+		return nil, fmt.Errorf("metrics port-forward failed: %w", forwardErr)
+	}
+
+	// Fixed pause before handing the session out; presumably lets the forward settle.
+	time.Sleep(2 * time.Second)
+
+	return newSession(port, stopForward), nil
+}
+
 // OpenRouterAPISession establishes a port-forward to the semantic-router API service.
 func OpenRouterAPISession(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) (*ServiceSession, error) {
 	localPort, err := getAvailablePort()
 
@@ -21,6 +21,10 @@ var BaselineRouterContract = []string{
 	"decision-fallback-behavior",
 	"plugin-config-variations",
 	"chat-completions-progressive-stress",
+	// Session observability
+	"session-telemetry-metrics",
+	"session-pricing-chat-completions",
+	"session-pricing-response-api",
 }
 
 // DashboardContract is the canonical E2E contract for the dashboard API surface.
 
@@ -0,0 +1,158 @@
+package testcases
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/vllm-project/semantic-router/e2e/pkg/fixtures"
+	pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases"
+	"k8s.io/client-go/kubernetes"
+)
+
+// init registers the two session-pricing e2e cases through pkgtestcases.Register,
+// chat-completions first and Response API second.
+func init() {
+	pricingCases := []struct {
+		name string
+		spec pkgtestcases.TestCase
+	}{
+		{
+			name: "session-pricing-chat-completions",
+			spec: pkgtestcases.TestCase{
+				Description: "After a routed chat completion, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured",
+				Tags:        []string{"kubernetes", "observability", "metrics", "llm", "pricing"},
+				Fn:          testSessionPricingChatCompletions,
+			},
+		},
+		{
+			name: "session-pricing-response-api",
+			spec: pkgtestcases.TestCase{
+				Description: "After a routed Response API call, Prometheus exposes llm_session_turn_cost histogram when model pricing is configured",
+				Tags:        []string{"kubernetes", "observability", "metrics", "llm", "pricing", "response-api"},
+				Fn:          testSessionPricingResponseAPI,
+			},
+		},
+	}
+
+	for _, pc := range pricingCases {
+		pkgtestcases.Register(pc.name, pc.spec)
+	}
+}
+
+// testSessionPricingChatCompletions verifies that after a Chat Completions request the
+// llm_session_turn_cost histogram is present in /metrics (pricing must be configured
+// for the routed model in router-config.yaml for the observation to appear).
+func testSessionPricingChatCompletions(
+	ctx context.Context,
+	client *kubernetes.Clientset,
+	opts pkgtestcases.TestCaseOptions,
+) error {
+	traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer traffic.Close()
+
+	metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer metricsSession.Close()
+
+	chat := fixtures.NewChatCompletionsClient(traffic, 60*time.Second)
+
+	headers := map[string]string{
+		"x-authz-user-id": "e2e-pricing-chat-user",
+	}
+	resp, err := chat.Create(ctx, fixtures.ChatCompletionsRequest{
+		Model: "MoM",
+		Messages: []fixtures.ChatMessage{
+			{Role: "user", Content: "Say hello in one short sentence for pricing telemetry."},
+		},
+		User: "e2e-pricing-chat-user",
+	}, headers)
+	if err != nil {
+		return err
+	}
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("chat completion: expected 200, got %d: %s", resp.StatusCode, string(resp.Body))
+	}
+
+	body, err := fetchMetrics(ctx, metricsSession)
+	if err != nil {
+		return err
+	}
+
+	// Token histograms from PR 1 must still be present.
+	if !strings.Contains(body, "llm_session_turn_prompt_tokens") {
+		return fmt.Errorf("metrics body missing llm_session_turn_prompt_tokens")
+	}
+	if !strings.Contains(body, "llm_session_turn_completion_tokens") {
+		return fmt.Errorf("metrics body missing llm_session_turn_completion_tokens")
+	}
+	// Cost histogram descriptor must be registered (present even when no observations).
+	if !strings.Contains(body, "llm_session_turn_cost") {
+		return fmt.Errorf("metrics body missing llm_session_turn_cost")
+	}
+
+	if opts.SetDetails != nil {
+		opts.SetDetails(map[string]interface{}{
+			"chat_status": resp.StatusCode,
+		})
+	}
+	return nil
+}
+
+// testSessionPricingResponseAPI issues one Response API request through the router and
+// then fetches /metrics from the metrics port-forward. It fails unless the response status
+// is 200 and the metrics text contains, as plain substrings, llm_session_turn_cost,
+// llm_session_turn_prompt_tokens and llm_session_turn_completion_tokens. On success the
+// Response API status code is reported via opts.SetDetails when that callback is set.
+func testSessionPricingResponseAPI(
+	ctx context.Context,
+	client *kubernetes.Clientset,
+	opts pkgtestcases.TestCaseOptions,
+) error {
+	trafficSession, err := fixtures.OpenServiceSession(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer trafficSession.Close()
+
+	promSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer promSession.Close()
+
+	apiClient := fixtures.NewResponseAPIClient(trafficSession, 60*time.Second)
+	request := fixtures.ResponseAPIRequest{
+		Model: "MoM",
+		Input: "Say hello in one short sentence for Response API pricing telemetry.",
+	}
+	_, rawResp, err := apiClient.Create(ctx, request)
+	if err != nil {
+		return fmt.Errorf("response api create: %w", err)
+	}
+	if rawResp.StatusCode != http.StatusOK {
+		return fmt.Errorf("response api: expected 200, got %d: %s", rawResp.StatusCode, string(rawResp.Body))
+	}
+
+	metricsText, err := fetchMetrics(ctx, promSession)
+	if err != nil {
+		return err
+	}
+
+	// Checked in order: cost histogram first, then the two token histograms.
+	requiredMetrics := []string{
+		"llm_session_turn_cost",
+		"llm_session_turn_prompt_tokens",
+		"llm_session_turn_completion_tokens",
+	}
+	for _, metric := range requiredMetrics {
+		if !strings.Contains(metricsText, metric) {
+			return fmt.Errorf("metrics body missing %s after Response API request", metric)
+		}
+	}
+
+	if report := opts.SetDetails; report != nil {
+		report(map[string]interface{}{
+			"response_api_status": rawResp.StatusCode,
+		})
+	}
+	return nil
+}
+
+// fetchMetrics issues a GET for /metrics through the given session, using an HTTP client
+// with a 15-second timeout, and returns the response body as a string. It returns an
+// error when the request fails or when the status code is not 200.
+func fetchMetrics(ctx context.Context, metricsSession *fixtures.ServiceSession) (string, error) {
+	res, err := fixtures.DoGETRequest(
+		ctx,
+		metricsSession.HTTPClient(15*time.Second),
+		metricsSession.URL("/metrics"),
+	)
+	switch {
+	case err != nil:
+		return "", fmt.Errorf("fetch /metrics: %w", err)
+	case res.StatusCode != http.StatusOK:
+		return "", fmt.Errorf("/metrics: expected 200, got %d", res.StatusCode)
+	}
+	return string(res.Body), nil
+}
@@ -0,0 +1,81 @@
+package testcases
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/vllm-project/semantic-router/e2e/pkg/fixtures"
+	pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases"
+	"k8s.io/client-go/kubernetes"
+)
+
+// init registers the session-telemetry-metrics e2e case through pkgtestcases.Register.
+func init() {
+	telemetryCase := pkgtestcases.TestCase{
+		Description: "After a routed chat completion, Prometheus exposes llm_session_turn_* histograms on the router metrics port",
+		Tags:        []string{"kubernetes", "observability", "metrics", "llm"},
+		Fn:          testSessionTelemetryMetrics,
+	}
+	pkgtestcases.Register("session-telemetry-metrics", telemetryCase)
+}
+
+// testSessionTelemetryMetrics sends one chat completion through the router and then
+// requires the metric names llm_session_turn_prompt_tokens and
+// llm_session_turn_completion_tokens to appear, as plain substrings, in the /metrics text
+// served over the metrics port-forward.
+//
+// The /metrics fetch goes through this package's fetchMetrics helper instead of a private
+// copy of the same GET-and-status-check sequence; the errors it produces are unchanged.
+// On success the chat status code is reported via opts.SetDetails when that callback is set.
+func testSessionTelemetryMetrics(
+	ctx context.Context,
+	client *kubernetes.Clientset,
+	opts pkgtestcases.TestCaseOptions,
+) error {
+	traffic, err := fixtures.OpenServiceSession(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer traffic.Close()
+
+	metricsSession, err := fixtures.OpenSemanticRouterMetricsSession(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer metricsSession.Close()
+
+	chat := fixtures.NewChatCompletionsClient(traffic, 60*time.Second)
+
+	// The same identity is sent both as a header and as the request's user field.
+	headers := map[string]string{
+		"x-authz-user-id": "e2e-session-telemetry-user",
+	}
+	resp, err := chat.Create(ctx, fixtures.ChatCompletionsRequest{
+		Model: "MoM",
+		Messages: []fixtures.ChatMessage{
+			{Role: "user", Content: "Say hello in one short sentence for session telemetry."},
+		},
+		User: "e2e-session-telemetry-user",
+	}, headers)
+	if err != nil {
+		return err
+	}
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("chat completion: expected 200, got %d: %s", resp.StatusCode, string(resp.Body))
+	}
+
+	body, err := fetchMetrics(ctx, metricsSession)
+	if err != nil {
+		return err
+	}
+	if !strings.Contains(body, "llm_session_turn_prompt_tokens") {
+		return fmt.Errorf("metrics body missing llm_session_turn_prompt_tokens")
+	}
+	if !strings.Contains(body, "llm_session_turn_completion_tokens") {
+		return fmt.Errorf("metrics body missing llm_session_turn_completion_tokens")
+	}
+
+	if opts.SetDetails != nil {
+		opts.SetDetails(map[string]interface{}{
+			"chat_status": resp.StatusCode,
+		})
+	}
+	return nil
+}
@@ -92,13 +92,41 @@ HF_HUB_ENABLE_HF_TRANSFER=1 \
 python3 -c "from huggingface_hub import snapshot_download; snapshot_download('sentence-transformers/all-MiniLM-L12-v2', local_dir='${TEST_DIR}/models/mom-embedding-light', local_dir_use_symlinks=False)"
 
 make -C "${REPO_ROOT}" start-milvus
+
+# Double-check Milvus readiness with a pymilvus probe (gRPC-level, not just HTTP).
+# Up to 30 attempts, 2 seconds apart. The probe's combined stdout/stderr is captured so
+# that, when every attempt fails, the last error (for example a missing pymilvus module)
+# is printed instead of being silently discarded.
+echo "Verifying Milvus gRPC readiness via pymilvus..."
+milvus_probe_output=""
+for attempt in $(seq 1 30); do
+    # The import sits inside the try block so an ImportError is reported like any other failure.
+    if milvus_probe_output="$(python3 -c "
+import sys
+try:
+    from pymilvus import connections
+    connections.connect('default', host='localhost', port='19530', timeout=5)
+    connections.disconnect('default')
+    print('Milvus gRPC connection verified')
+except Exception as e:
+    print('Milvus probe failed:', repr(e), file=sys.stderr)
+    raise SystemExit(1)
+" 2>&1)"; then
+        echo "${milvus_probe_output}"
+        break
+    fi
+    if [ "${attempt}" -eq 30 ]; then
+        echo "ERROR: Milvus gRPC not ready after 30 attempts"
+        echo "Last probe output: ${milvus_probe_output}"
+        "${CONTAINER_RUNTIME}" logs milvus-semantic-cache 2>&1 | tail -30 || true
+        exit 1
+    fi
+    sleep 2
+done
+
 cp "${REPO_ROOT}/e2e/config/config.memory-user.yaml" "${CONFIG_FILE}"
-python3 -c 'from pathlib import Path; path = Path("'"${CONFIG_FILE}"'"); path.write_text(path.read_text().replace("host.docker.internal:8000", "llm-katan:8000"))'
+python3 -c 'from pathlib import Path; path = Path("'"${CONFIG_FILE}"'"); t = path.read_text(); t = t.replace("host.docker.internal:8000", "llm-katan:8000"); t = t.replace("host.docker.internal:19530", "vllm-sr-milvus:19530"); path.write_text(t)'
 
 if ! "${CONTAINER_RUNTIME}" network inspect "${VLLM_SR_NETWORK}" >/dev/null 2>&1; then
     "${CONTAINER_RUNTIME}" network create "${VLLM_SR_NETWORK}" >/dev/null
 fi
 
+# Connect the externally-started Milvus to the vllm-sr network so the router
+# container can reach it by the name vllm-sr serve expects.
+"${CONTAINER_RUNTIME}" network connect --alias vllm-sr-milvus "${VLLM_SR_NETWORK}" milvus-semantic-cache 2>/dev/null || true
+echo "Milvus connected to ${VLLM_SR_NETWORK} as vllm-sr-milvus"
+
 "${CONTAINER_RUNTIME}" run -d --name llm-katan \
     --network "${VLLM_SR_NETWORK}" \
     --network-alias llm-katan \
@@ -141,7 +169,7 @@ fi
 
 VLLM_SR_PID="$(cat "${PID_FILE}")"
 
-for _ in $(seq 1 180); do
+for _ in $(seq 1 300); do
     http_code="$(curl -s -o /dev/null -w "%{http_code}" "${ROUTER_API_HEALTH_URL}" 2>/dev/null || echo "000")"
     if [[ "${http_code}" == "200" ]]; then
         echo "vllm-sr router API ready"
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,8 @@ type ChatMessage struct {`
`16`	`16`	`type ChatCompletionsRequest struct {`
`17`	`17`	Model string `json:"model"`
`18`	`18`	Messages []ChatMessage `json:"messages"`
	`19`	`+ // User is optional; forwarded for per-user routing and session correlation in tests.`
	`20`	+ User string `json:"user,omitempty"`
`19`	`21`	`}`
`20`	`22`
`21`	`23`	`// ChatCompletionsClient talks to the routed chat-completions API.`