-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathadapter.py
More file actions
184 lines (140 loc) · 6.03 KB
/
Copy pathadapter.py
File metadata and controls
184 lines (140 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.
Currently provides:
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
an OpenAI Python grader spec ({"type": "python", "source": ...}).
"""
import ast
import inspect
import textwrap
def build_python_grader_from_evaluation_test(test_fn) -> dict:
    """
    Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.

    The source of ``test_fn`` is read with ``inspect.getsource``, stripped of
    decorators and type annotations, renamed to ``_ep_eval`` and concatenated
    with a self-contained helper module that defines ``grade(sample, item)``.

    Assumptions:
      - `test_fn` is the *core* evaluation function (not the @evaluation_test wrapper),
        or an @evaluation_test-decorated function that carries _origin_func.
        It should have a signature like:
            def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow
        The generated grader calls it as ``_ep_eval(row=row)``.
      - The function only relies on attributes that we provide on the duck-typed
        `EvaluationRow` stand-in (you can extend that class as needed).
      - We map OpenAI's (sample, item) to a duck-typed `row`:
          - item["reference_answer"] -> row.ground_truth
          - sample["output_text"]    -> appended as an assistant message
          - raw dicts available as row.item / row.sample
      - The function returns either:
          - a numeric score, or
          - an object/dict with a `score` field, or
          - an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.
      - The function may be ``async def``; the generated ``grade`` then runs the
        returned coroutine to completion before extracting the score.

    Args:
        test_fn: The evaluation function, or a wrapper exposing the original
            function through an ``_origin_func`` attribute.

    Returns:
        A dict of the form ``{"type": "python", "source": <generated source>}``.

    Raises:
        ValueError: If the retrieved source has no top-level function definition.
        Any exception raised by ``inspect.getsource`` (source unavailable) or
        ``ast.parse`` (source not parseable) is propagated unchanged.
    """
    # If the user passed an @evaluation_test wrapper, try to recover the original function
    origin = getattr(test_fn, "_origin_func", test_fn)

    # Get the source of the original function; dedent so nested/method
    # definitions parse as a top-level statement.
    src = textwrap.dedent(inspect.getsource(origin))

    # Parse into AST so we can safely strip decorators and type annotations
    tree = ast.parse(src)

    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
        """Remove decorators and signature annotations from every function."""

        def _strip_signature(self, node):
            # Drop all decorators (e.g., @evaluation_test) and the return
            # annotation: both may reference names that do not exist inside
            # the grader sandbox.
            node.decorator_list = []
            node.returns = None
            # Recurse so parameters (visit_arg) and nested defs are handled too.
            self.generic_visit(node)
            return node

        def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.AST:
            return self._strip_signature(node)

        def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> ast.AST:
            return self._strip_signature(node)

        def visit_arg(self, node: ast.arg) -> ast.AST:
            # Remove all parameter annotations (e.g., row: EvaluationRow)
            node.annotation = None
            return node

    tree = _StripAnnotationsAndDecorators().visit(tree)
    ast.fix_missing_locations(tree)

    # Find the first top-level function definition and rename it to _ep_eval,
    # the fixed name the helper's grade() calls.
    func_node = next(
        (
            node
            for node in tree.body
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        ),
        None,
    )
    if func_node is None:
        raise ValueError("Expected a function definition in test_fn source.")
    func_node.name = "_ep_eval"

    # Turn the modified AST back into source
    src = ast.unparse(tree)

    # Helper code that will live *inside* the grader source. It is appended
    # after the evaluation function; the stand-in classes are resolved when
    # grade() runs, so definition order within the module does not matter.
    helper = """
    from typing import Any, Dict
    from types import SimpleNamespace


    class EvaluationRow(SimpleNamespace):
        \"\"\"Minimal duck-typed stand-in for an evaluation row.

        Extend this with whatever attributes your eval logic uses.
        \"\"\"
        pass


    class EvaluateResult(SimpleNamespace):
        \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.

        This lets evaluation-style functions that construct EvaluateResult(score=...)
        run inside the Python grader sandbox without importing eval_protocol.
        \"\"\"
        def __init__(self, score: float, **kwargs: Any) -> None:
            super().__init__(score=score, **kwargs)


    class Message(SimpleNamespace):
        \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
        pass


    def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
        # Start from any item-provided messages (EP-style), defaulting to [].
        raw_messages = item.get("messages") or []
        normalized_messages = []
        for m in raw_messages:
            if isinstance(m, dict):
                normalized_messages.append(
                    Message(
                        role=m.get("role"),
                        content=m.get("content"),
                    )
                )
            else:
                # Already Message-like; rely on duck typing (must have role/content)
                normalized_messages.append(m)

        reference = item.get("reference_answer")
        prediction = sample.get("output_text")

        # EP-style: ensure the model prediction is present as the last assistant message
        if prediction is not None:
            normalized_messages = list(normalized_messages)  # shallow copy
            normalized_messages.append(Message(role="assistant", content=prediction))

        return EvaluationRow(
            ground_truth=reference,
            messages=normalized_messages,
            item=item,
            sample=sample,
        )


    def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
        row = _build_row(sample, item)
        result = _ep_eval(row=row)

        # Async evaluation functions hand back a coroutine; run it to completion
        # so the actual result (not the coroutine object) gets scored.
        from collections.abc import Coroutine
        if isinstance(result, Coroutine):
            import asyncio
            result = asyncio.run(result)

        # Try to normalize different result shapes into a float score
        try:
            from collections.abc import Mapping

            if isinstance(result, (int, float)):
                return float(result)

            # EvaluateResult-like object with .score
            if hasattr(result, "score"):
                return float(result.score)

            # EvaluationRow-like object with .evaluation_result.score
            eval_res = getattr(result, "evaluation_result", None)
            if eval_res is not None:
                if isinstance(eval_res, Mapping):
                    if "score" in eval_res:
                        return float(eval_res["score"])
                elif hasattr(eval_res, "score"):
                    return float(eval_res.score)

            # Dict-like with score
            if isinstance(result, Mapping) and "score" in result:
                return float(result["score"])
        except Exception:
            pass

        return 0.0
    """

    full_source = src + "\n\n" + textwrap.dedent(helper)
    return {"type": "python", "source": full_source}