-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathcustom_evaluators.py
More file actions
128 lines (99 loc) · 4.24 KB
/
Copy pathcustom_evaluators.py
File metadata and controls
128 lines (99 loc) · 4.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""Code-based custom evaluators for cost and latency comparison.
These evaluators are registered with Microsoft Foundry and run in a sandboxed
environment. They receive the full JSONL item and return a float score
in the range 0.0–1.0.
Score scale:
0.5 = parity (router same as baseline)
> 0.5 = router advantage (cheaper / faster)
< 0.5 = baseline advantage
1.0 = router is free / instant
0.0 = router costs 2x / takes 2x as long
"""
from __future__ import annotations
import textwrap
from typing import Dict
# ── Cost evaluator source code ───────────────────────────────────────────────
# This code runs inside Foundry's sandboxed evaluator.
# Its `grade(sample, item)` entry point receives the JSONL row as `item` and returns a float.
# Source of the cost grader. Rows that cannot be compared fairly (missing,
# null, non-numeric or NaN metrics, or a non-positive baseline) score 0.5
# so that absent data never looks like a router win or loss.
COST_EVALUATOR_CODE = textwrap.dedent('''\
    def grade(sample, item) -> float:
        """Score router vs baseline on cost.

        Score = 0.5 + (savings_ratio * 0.5)
        Where savings_ratio = (baseline_cost - router_cost) / baseline_cost

        The row scores 0.5 (parity) when no fair comparison is possible:
        either cost is missing, null or non-numeric, the baseline cost is
        not a positive number, or the computed score is NaN.

        Args:
            sample: Unused by this grader.
            item: The JSONL row; read for "router_cost_usd" and
                "baseline_cost_usd".

        Returns:
            float in [0.0, 1.0] where 0.5 = parity, >0.5 = router cheaper
        """
        try:
            router_cost = float(item.get("router_cost_usd"))
            baseline_cost = float(item.get("baseline_cost_usd"))
        except (TypeError, ValueError):
            # Missing key (None), JSON null, or a non-numeric value.
            return 0.5

        # Written as "not > 0" so a NaN baseline is rejected as well.
        if not baseline_cost > 0.0:
            return 0.5  # No usable baseline cost to compare

        savings_ratio = (baseline_cost - router_cost) / baseline_cost
        score = 0.5 + (savings_ratio * 0.5)
        if score != score:
            # NaN (e.g. NaN router cost); min()/max() would turn it into 1.0.
            return 0.5
        return max(0.0, min(1.0, score))
''')
# ── Latency evaluator source code ───────────────────────────────────────────
# Source of the latency grader. Rows that cannot be compared fairly (missing,
# null, non-numeric or NaN metrics, or a non-positive baseline) score 0.5
# so that absent data never looks like a router win or loss.
LATENCY_EVALUATOR_CODE = textwrap.dedent('''\
    def grade(sample, item) -> float:
        """Score router vs baseline on latency.

        Score = 0.5 + (speedup_ratio * 0.5)
        Where speedup_ratio = (baseline_latency - router_latency) / baseline_latency

        The row scores 0.5 (parity) when no fair comparison is possible:
        either latency is missing, null or non-numeric, the baseline latency
        is not a positive number, or the computed score is NaN.

        Args:
            sample: Unused by this grader.
            item: The JSONL row; read for "router_latency_ms" and
                "baseline_latency_ms".

        Returns:
            float in [0.0, 1.0] where 0.5 = parity, >0.5 = router faster
        """
        try:
            router_latency = float(item.get("router_latency_ms"))
            baseline_latency = float(item.get("baseline_latency_ms"))
        except (TypeError, ValueError):
            # Missing key (None), JSON null, or a non-numeric value.
            return 0.5

        # Written as "not > 0" so a NaN baseline is rejected as well.
        if not baseline_latency > 0.0:
            return 0.5  # No usable baseline latency to compare

        speedup_ratio = (baseline_latency - router_latency) / baseline_latency
        score = 0.5 + (speedup_ratio * 0.5)
        if score != score:
            # NaN (e.g. NaN router latency); min()/max() would turn it into 1.0.
            return 0.5
        return max(0.0, min(1.0, score))
''')
def get_evaluator_code(evaluator_type: str) -> str:
    """Return the grader source string for the requested evaluator.

    Args:
        evaluator_type: Either "cost" or "latency".

    Returns:
        The Python source code of the matching evaluator.

    Raises:
        ValueError: If ``evaluator_type`` is not a known evaluator name.
    """
    available: Dict[str, str] = dict(
        cost=COST_EVALUATOR_CODE,
        latency=LATENCY_EVALUATOR_CODE,
    )
    source = available.get(evaluator_type)
    if source is None:
        # Both sources are strings, so None can only mean an unknown name.
        raise ValueError(f"Unknown evaluator type: {evaluator_type}. Choose from: {list(available)}")
    return source
def register_custom_evaluators(
    client,
    model_deployment: str,
    cost_evaluator_name: str = "mr_cost_comparison",
    latency_evaluator_name: str = "mr_latency_comparison",
    cost_pass_threshold: float = 0.5,
    latency_pass_threshold: float = 0.5,
) -> Dict[str, str]:
    """Register the cost and latency graders through ``client``.

    The cost evaluator is registered first, then the latency evaluator;
    each registration is one ``client.register_evaluator`` call.

    Args:
        client: A FoundryEvalClient instance (connected).
        model_deployment: Model deployment name, passed as ``deployment_name``.
        cost_evaluator_name: Name to register the cost evaluator under.
        latency_evaluator_name: Name to register the latency evaluator under.
        cost_pass_threshold: Pass threshold for the cost evaluator.
        latency_pass_threshold: Pass threshold for the latency evaluator.

    Returns:
        Dict with keys "cost" and "latency", each mapped to the value
        returned by ``client.register_evaluator`` (the evaluator ID).
    """
    # (result key, evaluator name, grader source, pass threshold), in call order.
    specs = (
        ("cost", cost_evaluator_name, COST_EVALUATOR_CODE, cost_pass_threshold),
        ("latency", latency_evaluator_name, LATENCY_EVALUATOR_CODE, latency_pass_threshold),
    )

    registered: Dict[str, str] = {}
    for kind, name, source, threshold in specs:
        registered[kind] = client.register_evaluator(
            evaluator_name=name,
            code=source,
            deployment_name=model_deployment,
            pass_threshold=threshold,
        )
    return registered