Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,7 @@ global:
max_models: 50
authz:
fail_open: false
trust_identity_headers: true
identity:
user_id_header: x-user-id
user_groups_header: x-user-groups
Expand Down
189 changes: 189 additions & 0 deletions e2e/config/config.session-affinity-demo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Session Affinity Demo Config
# Demonstrates the route-bouncing problem in multi-turn conversations.
#
# Setup:
# - "expensive" model: Claude via claude-code-proxy on port 11480
# - "cheap" model: qwen2.5:0.5b (0.5B params) on Ollama port 11434
#
# Routing logic:
# - Complex queries (complexity signal "hard") → Claude (expensive)
# - Simple queries (complexity signal "easy") → qwen2.5:0.5b (cheap)
#
# The demo scenario:
# 1. User sends complex coding question → routes to Claude (expensive)
# 2. Claude responds with great code and asks "want me to add tests?"
# 3. User says "yes" → VSR sees only "yes" → classifies as easy
# 4. Routes to qwen2.5:0.5b (cheap) → 0.5B model has NO IDEA what "yes" means

# Disable features we don't need for the demo
semantic_cache:
  enabled: false  # semantic cache is switched off in this demo config

tools:
  enabled: false  # the tools feature is switched off in this demo config

prompt_guard:
  # Unlike the features above, the prompt guard stays enabled.
  enabled: true
  use_cpu: true
  threshold: 0.7
  # Classifier model and the label mapping file from the same model directory.
  model_id: 'models/mom-jailbreak-classifier'
  jailbreak_mapping_path: 'models/mom-jailbreak-classifier/label_mapping.json'

# Named jailbreak signal; the "jailbreak_block" decision refers to it
# through a condition of type "jailbreak" with this name.
jailbreak_rules:
  - name: 'jailbreak_high'
    description: 'High-confidence jailbreak detection'
    threshold: 0.7

hallucination_mitigation:
  enabled: false  # hallucination mitigation is switched off in this demo config

# Domain classifier. It is configured here because VSR requires it at startup.
classifier:
  category_model:
    # Model and the category mapping file from the same model directory.
    model_id: 'models/mom-domain-classifier'
    category_mapping_path: 'models/mom-domain-classifier/category_mapping.json'
    use_cpu: true
    threshold: 0.6

# Embedding model used by the complexity signal.
embedding_models:
  qwen3_model_path: 'models/mom-embedding-pro'
  use_cpu: true
  hnsw_config:
    model_type: 'qwen3'
    target_dimension: 1024
    preload_embeddings: true
    enable_soft_matching: true
    min_score_threshold: 0.5

# The two local backends: Ollama (cheap model) and the Claude proxy
# (expensive model). Both listen on loopback with equal weight.
vllm_endpoints:
  - name: 'ollama'
    address: '127.0.0.1'
    port: 11434
    weight: 1
  - name: 'claude-proxy'
    address: '127.0.0.1'
    port: 11480
    weight: 1

# Maps each model name to the endpoint (by name, as listed under
# vllm_endpoints) that should serve it.
model_config:
  'expensive-model':
    preferred_endpoints:
      - 'claude-proxy'
  'qwen2.5:0.5b':
    preferred_endpoints:
      - 'ollama'

# Complexity signal. The decisions reference its two buckets as
# "prompt_complexity:hard" and "prompt_complexity:easy".
complexity_rules:
  - name: 'prompt_complexity'
    description: 'Classify prompt complexity for routing decisions'
    threshold: 0.15
    # Example prompts that represent hard requests.
    hard:
      candidates:
        - 'Implement a concurrent lock-free data structure with memory ordering guarantees'
        - 'Design a distributed consensus algorithm for fault-tolerant systems'
        - 'Write a compiler optimization pass for loop vectorization'
        - 'Build a real-time stream processing pipeline with exactly-once semantics'
        - 'Implement a B+ tree with concurrent readers and writers'
        - 'Design a garbage collector with generational collection and compaction'
        - 'Write an optimized matrix multiplication kernel with cache tiling'
        - 'Implement a raft consensus protocol with log compaction'
        - 'Explain the mathematical proof of the P vs NP problem'
        - 'Analyze the time complexity of this recursive algorithm with memoization'
    # Example prompts that represent easy requests. The short replies stay
    # quoted so that YAML does not read "yes"/"no" as booleans.
    easy:
      candidates:
        - 'yes'
        - 'no'
        - 'ok'
        - 'sure'
        - 'thanks'
        - 'got it'
        - 'sounds good'
        - 'please do'
        - 'go ahead'
        - 'that works'
        - 'hello'
        - 'hi'
        - 'what is a variable'
        - 'how do I print hello world'
        - 'what does this error mean'

# Minimal category list; the classifier requires one to be present.
categories:
  - name: computer_science
    description: 'Computer science and programming'
    mmlu_categories:
      - 'computer_science'
  - name: other
    description: 'General topics'
    mmlu_categories:
      - 'other'

# Routing strategy used to choose among the decisions defined in this file.
strategy: 'priority'

# Four decisions, highest priority first: jailbreak block, complex → expensive, simple → cheap, then a fallback
decisions:
  # Priority 999: answer detected jailbreak attempts with a canned 403 message.
  - name: 'jailbreak_block'
    description: 'Block jailbreak attempts'
    priority: 999
    rules:
      operator: 'AND'
      conditions:
        - type: 'jailbreak'
          name: 'jailbreak_high'
    modelRefs:
      - model: 'qwen2.5:0.5b'
        use_reasoning: false
    plugins:
      - type: 'fast_response'
        configuration:
          enabled: true
          status_code: 403
          message: 'Request blocked: jailbreak attempt detected.'

  # Priority 200: prompts in the "hard" complexity bucket go to the expensive model.
  - name: 'complex_query'
    description: 'Complex queries that need a powerful model'
    priority: 200
    rules:
      operator: 'AND'
      conditions:
        - type: 'complexity'
          name: 'prompt_complexity:hard'
    modelRefs:
      - model: 'expensive-model'
        use_reasoning: false

  # Priority 100: prompts in the "easy" complexity bucket go to the cheap model.
  - name: 'simple_query'
    description: 'Simple queries that a cheap model can handle'
    priority: 100
    rules:
      operator: 'AND'
      conditions:
        - type: 'complexity'
          name: 'prompt_complexity:easy'
    modelRefs:
      - model: 'qwen2.5:0.5b'
        use_reasoning: false

  # Priority 1: lowest-priority rule, conditioned on the "other" domain.
  - name: 'fallback'
    description: 'Fallback for unmatched queries'
    priority: 1
    rules:
      operator: 'AND'
      conditions:
        - type: 'domain'
          name: 'other'
    modelRefs:
      - model: 'qwen2.5:0.5b'
        use_reasoning: false

# Model used when none of the decisions match a request.
default_model: 'qwen2.5:0.5b'

# Observability: metrics are enabled, tracing is disabled.
observability:
  tracing:
    enabled: false
  metrics:
    enabled: true
109 changes: 109 additions & 0 deletions e2e/config/envoy.session-affinity-demo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Envoy config for session affinity demo
# Stripped down: no ext_authz, just ext_proc + dynamic routing
# Static listener and clusters: one HTTP listener on port 8801 whose requests
# pass through the ext_proc filter and are then routed to "dynamic_backend".
static_resources:
  listeners:
    - name: listener_0
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 8801
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                stat_prefix: ingress_http
                # JSON access log on stdout, including the model and
                # destination headers carried on the request.
                access_log:
                  - name: envoy.access_loggers.stdout
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog
                      log_format:
                        json_format:
                          time: "%START_TIME%"
                          request_method: "%REQ(:METHOD)%"
                          request_path: "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%"
                          response_code: "%RESPONSE_CODE%"
                          upstream_host: "%UPSTREAM_HOST%"
                          selected_model: "%REQ(X-SELECTED-MODEL)%"
                          destination: "%REQ(X-VSR-DESTINATION-ENDPOINT)%"
                # Single catch-all route: everything goes to dynamic_backend.
                route_config:
                  name: local_route
                  virtual_hosts:
                    - name: local_service
                      domains: ["*"]
                      routes:
                        - match:
                            prefix: "/"
                          route:
                            cluster: dynamic_backend
                            timeout: 300s
                http_filters:
                  - name: envoy.filters.http.ext_proc
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
                      grpc_service:
                        envoy_grpc:
                          cluster_name: extproc_service
                      allow_mode_override: true
                      # Headers are sent and bodies are buffered in both
                      # directions; trailers are skipped.
                      processing_mode:
                        request_header_mode: "SEND"
                        response_header_mode: "SEND"
                        request_body_mode: "BUFFERED"
                        response_body_mode: "BUFFERED"
                        request_trailer_mode: "SKIP"
                        response_trailer_mode: "SKIP"
                      # Fail closed: dynamic_backend can only pick an upstream
                      # from the x-vsr-destination-endpoint header that the
                      # external processor sets. Letting requests continue
                      # after an ext_proc error would forward them without a
                      # router-chosen destination, so reject them instead.
                      failure_mode_allow: false
                      message_timeout: 300s
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
                      suppress_envoy_headers: true
                http2_protocol_options:
                  max_concurrent_streams: 100
                stream_idle_timeout: "300s"
                request_timeout: "300s"
                common_http_protocol_options:
                  idle_timeout: "300s"

  clusters:
    # gRPC (HTTP/2) connection to the external processor on loopback.
    - name: extproc_service
      connect_timeout: 300s
      type: STATIC
      lb_policy: ROUND_ROBIN
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http2_protocol_options:
              connection_keepalive:
                interval: 300s
                timeout: 300s
      load_assignment:
        cluster_name: extproc_service
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: 127.0.0.1
                      port_value: 50051

    # Dynamic backend using original destination (VSR sets x-vsr-destination-endpoint)
    - name: dynamic_backend
      connect_timeout: 300s
      type: ORIGINAL_DST
      lb_policy: CLUSTER_PROVIDED
      original_dst_lb_config:
        use_http_header: true
        http_header_name: "x-vsr-destination-endpoint"
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http_protocol_options: {}

# Envoy admin interface, bound to loopback on port 19000.
admin:
  address:
    socket_address:
      port_value: 19000
      address: '127.0.0.1'
6 changes: 6 additions & 0 deletions e2e/config/router-runtime.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"phase": "ready",
"ready": true,
"message": "Router models are ready. Starting router services...",
"updated_at": "2026-03-06T18:09:13Z"
}
Loading
Loading