# configs/large_scale.yaml (microsoft-foundry/Model-Router-Auto-Evaluation)
---
# Large-scale evaluation configuration (1000+ prompts)
# Tuned for production runs with higher concurrency and longer timeouts.
# See docs/how-to-resume-and-scale.md for guidance.

# Run identity and prompt-selection settings.
evaluation:
  name: "model-router-large-scale"
  # Replace with your 1000+ prompt dataset.
  dataset: "datasets/sample_custom.jsonl"
  # null = use all prompts in the file.
  sample_size: null
  # NOTE(review): presumably seeds prompt sampling when sample_size is set;
  # confirm against the loader.
  random_seed: 42

# Two endpoints are configured here: the model router and a baseline
# deployment. Both are typed "azure_openai" and both cap max_tokens at 1024.
#
# NOTE(review): the "${...}" values are placeholders, presumably expanded from
# environment variables by the config loader; the expansion step is not
# visible in this file, so confirm before changing their quoting.
endpoints:
  # Model Router deployment.
  model_router:
    type: "azure_openai"
    endpoint_url: "${AZURE_MODEL_ROUTER_ENDPOINT}"
    api_key: "${AZURE_MODEL_ROUTER_KEY}"
    deployment_name: "${AZURE_MODEL_ROUTER_DEPLOYMENT}"
    parameters:
      max_tokens: 1024

  # Baseline deployment. Its URL and key use the AZURE_OPENAI_* placeholders,
  # while the deployment name has its own AZURE_BASELINE_DEPLOYMENT one.
  baseline:
    type: "azure_openai"
    endpoint_url: "${AZURE_OPENAI_ENDPOINT}"
    api_key: "${AZURE_OPENAI_KEY}"
    deployment_name: "${AZURE_BASELINE_DEPLOYMENT}"
    parameters:
      max_tokens: 1024

# Token prices in USD per 1M tokens, one { input, output } entry per model.
pricing:
  # Model Router markup (described as an input-only charge).
  # NOTE(review): an output price of 1.50 is also set on this entry, which
  # does not match "input-only"; confirm which of the two is intended.
  model_router: { input: 0.50, output: 1.50 }

  # Azure OpenAI models
  gpt-4.1: { input: 2.00, output: 8.00 }
  gpt-4.1-mini: { input: 0.40, output: 1.60 }
  gpt-4.1-nano: { input: 0.10, output: 0.40 }
  gpt-4o: { input: 2.50, output: 10.00 }
  gpt-4o-mini: { input: 0.15, output: 0.60 }
  o4-mini: { input: 1.10, output: 4.40 }
  gpt-5: { input: 1.25, output: 10.00 }
  gpt-5-mini: { input: 0.25, output: 2.00 }
  gpt-5-nano: { input: 0.05, output: 0.40 }
  gpt-5-chat: { input: 1.25, output: 10.00 }
  gpt-5.2: { input: 1.75, output: 14.00 }
  gpt-5.2-chat: { input: 1.75, output: 14.00 }
  gpt-oss-120b: { input: 0.15, output: 0.60 }

  # Anthropic
  claude-haiku-4-5: { input: 1.00, output: 5.00 }
  claude-sonnet-4-5: { input: 3.00, output: 15.00 }
  claude-opus-4-1: { input: 15.00, output: 75.00 }
  claude-opus-4-6: { input: 5.00, output: 25.00 }

  # xAI
  grok-4: { input: 2.00, output: 6.00 }
  grok-4-fast-reasoning: { input: 0.20, output: 0.50 }

  # DeepSeek
  DeepSeek-V3.1: { input: 0.27, output: 1.10 }
  DeepSeek-V3.2: { input: 0.27, output: 1.10 }

  # Meta
  Llama-4-Maverick-17B-128E-Instruct-FP8: { input: 0.20, output: 0.20 }

# Parallelism, timeout, and retry limits for this large-scale profile.
concurrency:
  # Higher than default (5) for throughput.
  max_parallel_requests: 10
  # Longer timeout for complex prompts.
  request_timeout_seconds: 120
  # More retries to handle transient rate limits.
  max_retries: 5

# Where results are written and which report formats are requested.
output:
  directory: "results/large-scale"
  formats:
    - "markdown"
    - "csv"
    - "json"

# Judge settings: an on/off switch, the judge's own endpoint, the two prompt
# template paths, and parallelism/timeout/retry limits separate from the
# top-level concurrency section.
judge:
  enabled: true
  # NOTE(review): the "${...}" values are placeholders, presumably expanded
  # from environment variables by the config loader; confirm.
  endpoint:
    type: "azure_openai"
    endpoint_url: "${AZURE_JUDGE_ENDPOINT}"
    api_key: "${AZURE_JUDGE_KEY}"
    deployment_name: "${AZURE_JUDGE_DEPLOYMENT}"
    # Explicit empty mapping: no parameter overrides (e.g. no max_tokens)
    # are set for the judge here.
    parameters: {}
  # Template files for the pairwise and absolute judging prompts.
  pairwise_template: "configs/judge_prompts/pairwise.yaml"
  absolute_template: "configs/judge_prompts/absolute.yaml"
  max_parallel: 5                # Higher than default (3) for throughput
  timeout_seconds: 120
  max_retries: 3