---
# Default evaluation configuration
# See .env.example for required environment variables
# Settings for the evaluation run: its name, the dataset file, and sampling.
evaluation:
  name: "model-router-vs-gpt5"
  dataset: "datasets/sample_custom.jsonl"
  sample_size: null  # null = use all, or integer for subset
  # NOTE(review): presumably seeds the subset sampling when sample_size is an
  # integer - confirm against the dataset loader.
  random_seed: 42
# The two endpoints being compared. The "${...}" values are placeholders for
# environment variables (see .env.example); they are not literal URLs or keys.
endpoints:
  model_router:
    type: "azure_openai"
    endpoint_url: "${AZURE_MODEL_ROUTER_ENDPOINT}"
    api_key: "${AZURE_MODEL_ROUTER_KEY}"
    deployment_name: "${AZURE_MODEL_ROUTER_DEPLOYMENT}"
    parameters:
      temperature: 0.7
      max_tokens: 1024

  baseline:
    type: "azure_openai"
    endpoint_url: "${AZURE_OPENAI_ENDPOINT}"
    api_key: "${AZURE_OPENAI_KEY}"
    deployment_name: "${AZURE_BASELINE_DEPLOYMENT}"
    # NOTE(review): no temperature is set here, unlike model_router - confirm
    # this asymmetry is intentional for the baseline deployment.
    parameters:
      max_tokens: 1024
# Price table keyed by model name; each entry has an input and an output rate.
pricing:  # USD per 1M tokens
  # ── Model Router markup (input-only charge) ──
  # NOTE(review): the header above says input-only, yet a non-zero output rate
  # is configured - confirm which is intended.
  model_router:
    input: 0.50
    output: 1.50

  # ── Azure OpenAI models (Global Standard pricing) ──
  # GPT-4.1 series
  gpt-4.1:
    input: 2.00
    output: 8.00
  gpt-4.1-mini:
    input: 0.40
    output: 1.60
  gpt-4.1-nano:
    input: 0.10
    output: 0.40

  # GPT-4o series
  gpt-4o:
    input: 2.50
    output: 10.00
  gpt-4o-mini:
    input: 0.15
    output: 0.60

  # o-series
  o4-mini:
    input: 1.10
    output: 4.40

  # GPT-5 series
  gpt-5:
    input: 1.25
    output: 10.00
  gpt-5-mini:
    input: 0.25
    output: 2.00
  gpt-5-nano:
    input: 0.05
    output: 0.40
  gpt-5-chat:
    input: 1.25
    output: 10.00

  # GPT-5.2 series
  gpt-5.2:
    input: 1.75
    output: 14.00
  gpt-5.2-chat:
    input: 1.75
    output: 14.00

  # GPT-OSS
  gpt-oss-120b:
    input: 0.15
    output: 0.60

  # ── Anthropic models (Azure Marketplace pricing) ──
  # Source: https://platform.claude.com/docs/en/docs/about-claude/models
  claude-haiku-4-5:
    input: 1.00
    output: 5.00
  claude-sonnet-4-5:
    input: 3.00
    output: 15.00
  claude-opus-4-1:
    input: 15.00
    output: 75.00
  claude-opus-4-6:
    input: 5.00
    output: 25.00

  # ── xAI models (Azure Marketplace pricing) ──
  # Source: https://x.ai/api
  grok-4:
    input: 2.00
    output: 6.00
  grok-4-fast-reasoning:
    input: 0.20
    output: 0.50

  # ── DeepSeek models (Azure Marketplace pricing) ──
  # Source: https://api-docs.deepseek.com
  DeepSeek-V3.1:
    input: 0.27
    output: 1.10
  DeepSeek-V3.2:
    input: 0.27
    output: 1.10

  # ── Meta models (Azure Marketplace pricing) ──
  Llama-4-Maverick-17B-128E-Instruct-FP8:
    input: 0.20
    output: 0.20
# Request concurrency, timeout, and retry limits.
# NOTE(review): presumably applies to the endpoint calls under evaluation (the
# judge section carries its own limits) - confirm against the runner.
concurrency:
  max_parallel_requests: 5
  request_timeout_seconds: 60
  max_retries: 3
# Where results are written and in which report formats.
output:
  # base dir only; run_eval.py creates results/run-<unix_timestamp>/ automatically
  directory: "results"
  formats: ["markdown", "csv", "json"]
# LLM-as-a-judge quality evaluation (optional)
# Set enabled: true and configure the judge endpoint to run quality scoring
judge:
  enabled: true
  # Connection settings for the judge model; the "${...}" values are
  # environment-variable placeholders (see .env.example).
  endpoint:
    type: "azure_openai"
    endpoint_url: "${AZURE_JUDGE_ENDPOINT}"
    api_key: "${AZURE_JUDGE_KEY}"
    deployment_name: "${AZURE_JUDGE_DEPLOYMENT}"
    parameters: {}
  # Prompt template files for the two judging modes.
  pairwise_template: "configs/judge_prompts/pairwise.yaml"
  absolute_template: "configs/judge_prompts/absolute.yaml"
  # Judge-specific concurrency, timeout, and retry limits.
  max_parallel: 3
  timeout_seconds: 90
  max_retries: 2