-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathlarge_scale.yaml
More file actions
87 lines (75 loc) · 2.99 KB
/
Copy pathlarge_scale.yaml
File metadata and controls
87 lines (75 loc) · 2.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Large-scale evaluation configuration (1000+ prompts)
# Tuned for production runs with higher concurrency and longer timeouts.
# See docs/how-to-resume-and-scale.md for guidance.
# Run identity and input data: a run name, the prompt dataset path, an
# optional sample size, and a fixed random seed.
evaluation:
  name: "model-router-large-scale"
  dataset: "datasets/sample_custom.jsonl"  # Replace with your 1000+ prompt dataset
  sample_size: null  # null = use all prompts in the file
  random_seed: 42
# Endpoint definitions: `model_router` and `baseline`. Both are declared as
# type "azure_openai" and share the same max_tokens cap.
# NOTE(review): the ${...} placeholders are presumably substituted from
# environment variables by the config loader — confirm.
endpoints:
  model_router:
    type: "azure_openai"
    endpoint_url: "${AZURE_MODEL_ROUTER_ENDPOINT}"
    api_key: "${AZURE_MODEL_ROUTER_KEY}"
    deployment_name: "${AZURE_MODEL_ROUTER_DEPLOYMENT}"
    parameters:
      max_tokens: 1024

  baseline:
    type: "azure_openai"
    endpoint_url: "${AZURE_OPENAI_ENDPOINT}"
    api_key: "${AZURE_OPENAI_KEY}"
    deployment_name: "${AZURE_BASELINE_DEPLOYMENT}"
    parameters:
      max_tokens: 1024
# Price table keyed by model name. Every entry carries an `input` and an
# `output` rate, both expressed in USD per 1M tokens.
pricing:  # USD per 1M tokens
  # ── Model Router markup (input-only charge) ──
  # NOTE(review): the heading above says the markup is an input-only charge,
  # yet a non-zero `output` rate is set as well — confirm which is intended.
  model_router:
    input: 0.50
    output: 1.50

  # ── Azure OpenAI models ──
  gpt-4.1: { input: 2.00, output: 8.00 }
  gpt-4.1-mini: { input: 0.40, output: 1.60 }
  gpt-4.1-nano: { input: 0.10, output: 0.40 }
  gpt-4o: { input: 2.50, output: 10.00 }
  gpt-4o-mini: { input: 0.15, output: 0.60 }
  o4-mini: { input: 1.10, output: 4.40 }
  gpt-5: { input: 1.25, output: 10.00 }
  gpt-5-mini: { input: 0.25, output: 2.00 }
  gpt-5-nano: { input: 0.05, output: 0.40 }
  gpt-5-chat: { input: 1.25, output: 10.00 }
  gpt-5.2: { input: 1.75, output: 14.00 }
  gpt-5.2-chat: { input: 1.75, output: 14.00 }
  gpt-oss-120b: { input: 0.15, output: 0.60 }

  # ── Anthropic ──
  claude-haiku-4-5: { input: 1.00, output: 5.00 }
  claude-sonnet-4-5: { input: 3.00, output: 15.00 }
  claude-opus-4-1: { input: 15.00, output: 75.00 }
  claude-opus-4-6: { input: 5.00, output: 25.00 }

  # ── xAI ──
  grok-4: { input: 2.00, output: 6.00 }
  grok-4-fast-reasoning: { input: 0.20, output: 0.50 }

  # ── DeepSeek ──
  DeepSeek-V3.1: { input: 0.27, output: 1.10 }
  DeepSeek-V3.2: { input: 0.27, output: 1.10 }

  # ── Meta ──
  Llama-4-Maverick-17B-128E-Instruct-FP8: { input: 0.20, output: 0.20 }
# Request-level limits for the evaluation run: parallelism, per-request
# timeout, and retry budget.
concurrency:
  max_parallel_requests: 10  # Higher than default (5) for throughput
  request_timeout_seconds: 120  # Longer timeout for complex prompts
  max_retries: 5  # More retries to handle transient rate limits
# Result destination: the target directory and the list of report formats.
output:
  directory: "results/large-scale"
  formats: ["markdown", "csv", "json"]
# Judge settings: an on/off switch, the judge's own endpoint, the paths of
# the pairwise and absolute prompt templates, and the judge's request limits.
judge:
  enabled: true
  endpoint:
    type: "azure_openai"
    endpoint_url: "${AZURE_JUDGE_ENDPOINT}"
    api_key: "${AZURE_JUDGE_KEY}"
    deployment_name: "${AZURE_JUDGE_DEPLOYMENT}"
    parameters: {}  # Explicit empty mapping: no parameters are set here
  pairwise_template: "configs/judge_prompts/pairwise.yaml"
  absolute_template: "configs/judge_prompts/absolute.yaml"
  max_parallel: 5  # Higher than default (3) for throughput
  timeout_seconds: 120
  max_retries: 3