# Source: eval_protocol/integrations/openai_rft/adapter.py (eval-protocol/python-sdk @ 2dccccf6894c26c54d2cf0af5ca07478d4cff874)
"""
Integration helpers between Eval Protocol evaluations and OpenAI RFT graders.

Currently provides:
- build_python_grader_from_evaluation_test: turn an evaluation-style function into
  an OpenAI Python grader spec ({"type": "python", "source": ...}).
"""

import ast
import inspect
import textwrap


def build_python_grader_from_evaluation_test(test_fn) -> dict:
    """
    Return an OpenAI Python grader spec from an Eval Protocol-style evaluation function.

    The source of `test_fn` is read with `inspect.getsource`, stripped of decorators
    and annotations, renamed to `_ep_eval`, and concatenated with self-contained
    helper code that defines the `grade(sample, item)` entry point.

    Assumptions:
    - `test_fn` is the *core* evaluation function (not the @evaluation_test wrapper),
      or an @evaluation_test-decorated function that carries _origin_func.
      It should have a signature like:

          def my_eval(row, **kwargs) -> EvaluateResult | float | EvaluationRow

      `async def` evaluation functions are supported as well: the generated
      `grade` runs the returned coroutine to completion with `asyncio.run`.

    - The function only relies on attributes that we provide on the duck-typed
      `EvaluationRow` stand-in defined in the helper code
      (you can extend that class as needed).

    - We map OpenAI's (sample, item) to a duck-typed `row`:
        - item["reference_answer"]      -> row.ground_truth
        - item["messages"]              -> row.messages (dicts become Message objects)
        - sample["output_text"]         -> appended as an assistant message
        - raw dicts available as row.item / row.sample

    - The function returns either:
        - a numeric score, or
        - an object/dict with a `score` field, or
        - an EvaluationRow/EvaluateResult-like object with `.evaluation_result.score`.
      Any other result shape (or a score that cannot be converted to float)
      makes the generated `grade` return 0.0.

    Args:
        test_fn: The evaluation function, or a wrapper exposing it as `_origin_func`.

    Returns:
        A dict of the form {"type": "python", "source": <grader source code>}.

    Raises:
        ValueError: If the retrieved source contains no top-level function definition.
        OSError, TypeError: Propagated from `inspect.getsource` when the source of
            `test_fn` cannot be retrieved.
    """

    # If the user passed an @evaluation_test wrapper, try to recover the original function
    origin = getattr(test_fn, "_origin_func", test_fn)

    # Fetch the original function's source; dedent so nested/method definitions parse
    src = textwrap.dedent(inspect.getsource(origin))

    # Parse into AST so we can safely strip decorators and type annotations
    tree = ast.parse(src)

    class _StripAnnotationsAndDecorators(ast.NodeTransformer):
        """Remove decorators and annotations that reference names absent from the grader."""

        def _strip_function(self, node):
            # Drop all decorators (e.g., @evaluation_test)
            node.decorator_list = []
            # Remove return type annotation
            node.returns = None
            self.generic_visit(node)
            return node

        # Sync and async definitions get exactly the same treatment.
        visit_FunctionDef = _strip_function
        visit_AsyncFunctionDef = _strip_function

        def visit_arg(self, node: ast.arg) -> ast.AST:
            # Remove all parameter annotations (e.g., row: EvaluationRow)
            node.annotation = None
            return node

    tree = _StripAnnotationsAndDecorators().visit(tree)
    ast.fix_missing_locations(tree)

    # Find the first top-level function definition; it becomes the grader's _ep_eval
    func_node = next(
        (
            node
            for node in tree.body
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        ),
        None,
    )
    if func_node is None:
        raise ValueError("Expected a function definition in test_fn source.")

    func_node.name = "_ep_eval"

    # Turn the modified AST back into source
    src = ast.unparse(tree)

    # Helper code that will live *inside* the grader source
    helper = """
from typing import Any, Dict
from types import CoroutineType, SimpleNamespace


class EvaluationRow(SimpleNamespace):
    \"\"\"Minimal duck-typed stand-in for an evaluation row.

    Extend this with whatever attributes your eval logic uses.
    \"\"\"
    pass


class EvaluateResult(SimpleNamespace):
    \"\"\"Simple stand-in for Eval Protocol's EvaluateResult.

    This lets evaluation-style functions that construct EvaluateResult(score=...)
    run inside the Python grader sandbox without importing eval_protocol.
    \"\"\"

    def __init__(self, score: float, **kwargs: Any) -> None:
        super().__init__(score=score, **kwargs)


class Message(SimpleNamespace):
    \"\"\"Duck-typed stand-in for eval_protocol.models.Message (role/content).\"\"\"
    pass


def _build_row(sample: Dict[str, Any], item: Dict[str, Any]) -> EvaluationRow:
    # Start from any item-provided messages (EP-style), defaulting to [].
    raw_messages = item.get("messages") or []
    normalized_messages = []
    for m in raw_messages:
        if isinstance(m, dict):
            normalized_messages.append(
                Message(
                    role=m.get("role"),
                    content=m.get("content"),
                )
            )
        else:
            # Already Message-like; rely on duck typing (must have role/content)
            normalized_messages.append(m)

    reference = item.get("reference_answer")
    prediction = sample.get("output_text")

    # EP-style: ensure the model prediction is present as the last assistant message
    if prediction is not None:
        normalized_messages = list(normalized_messages)  # shallow copy
        normalized_messages.append(Message(role="assistant", content=prediction))

    return EvaluationRow(
        ground_truth=reference,
        messages=normalized_messages,
        item=item,
        sample=sample,
    )


def grade(sample: Dict[str, Any], item: Dict[str, Any]) -> float:
    row = _build_row(sample, item)
    result = _ep_eval(row=row)

    # An async evaluation function hands back a coroutine; run it to completion
    # so the score is taken from its actual return value.
    if isinstance(result, CoroutineType):
        import asyncio

        result = asyncio.run(result)

    # Try to normalize different result shapes into a float score
    try:
        from collections.abc import Mapping

        if isinstance(result, (int, float)):
            return float(result)

        # EvaluateResult-like object with .score
        if hasattr(result, "score"):
            return float(result.score)

        # EvaluationRow-like object with .evaluation_result.score
        eval_res = getattr(result, "evaluation_result", None)
        if eval_res is not None:
            if isinstance(eval_res, Mapping):
                if "score" in eval_res:
                    return float(eval_res["score"])
            elif hasattr(eval_res, "score"):
                return float(eval_res.score)

        # Dict-like with score
        if isinstance(result, Mapping) and "score" in result:
            return float(result["score"])
    except Exception:
        pass

    return 0.0
"""

    # User function first, then the helpers; names are resolved when grade() runs.
    full_source = src + "\n\n" + textwrap.dedent(helper)
    return {"type": "python", "source": full_source}