Skip to content

Commit e2f5b93

Browse files
jsonbailey and claude committed
fix: remove judge evaluations from OpenAI runner
The OpenAI Agents SDK does not expose a node's text output at handoff time, making it impossible to evaluate intermediate nodes against real output. Rather than evaluating against an empty string, remove evaluation support from the OpenAI runner entirely until the SDK provides a suitable API.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 61c089f commit e2f5b93

1 file changed

Lines changed: 5 additions & 37 deletions

File tree

packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_graph_runner.py

Lines changed: 5 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -25,11 +25,9 @@ def _sanitize_agent_name(key: str) -> str:
2525
class _RunState:
2626
"""Mutable state shared across handoff and tool callbacks during a single run."""
2727

28-
def __init__(self, last_handoff_ns: int, last_node_key: str, input_str: str = '') -> None:
28+
def __init__(self, last_handoff_ns: int, last_node_key: str) -> None:
2929
self.last_handoff_ns = last_handoff_ns
3030
self.last_node_key = last_node_key
31-
self.input_str = input_str
32-
self.pending_eval_tasks: List[tuple] = []
3331

3432

3533
class OpenAIAgentGraphRunner(AgentGraphRunner):
@@ -85,19 +83,12 @@ async def run(self, input: Any) -> AgentGraphResult:
8583

8684
input_str = str(input)
8785
start_ns = time.perf_counter_ns()
88-
state = _RunState(last_handoff_ns=start_ns, last_node_key=root_key, input_str=input_str)
86+
state = _RunState(last_handoff_ns=start_ns, last_node_key=root_key)
8987
try:
9088
from agents import Runner
9189
root_agent = self._build_agents(path, state, tracker)
9290
result = await Runner.run(root_agent, input_str)
93-
self._flush_final_segment(state, result, input_str)
94-
all_eval_results = []
95-
for node_tracker, eval_task in state.pending_eval_tasks:
96-
eval_results = await eval_task
97-
all_eval_results.extend(eval_results)
98-
for r in eval_results:
99-
if r.success:
100-
node_tracker.track_judge_result(r)
91+
self._flush_final_segment(state, result)
10192
self._track_tool_calls(result)
10293

10394
duration = (time.perf_counter_ns() - start_ns) // 1_000_000
@@ -113,7 +104,6 @@ async def run(self, input: Any) -> AgentGraphResult:
113104
output=str(result.final_output),
114105
raw=result,
115106
metrics=LDAIMetrics(success=True, usage=token_usage),
116-
evaluations=all_eval_results,
117107
)
118108
except Exception as exc:
119109
if isinstance(exc, ImportError):
@@ -235,10 +225,7 @@ def _make_on_handoff(
235225
state: _RunState,
236226
):
237227
def on_handoff(run_ctx: Any) -> None:
238-
self._handle_handoff(
239-
run_ctx, src, tgt, path, tracker, config_tracker, state,
240-
input_str=state.input_str,
241-
)
228+
self._handle_handoff(run_ctx, src, tgt, path, tracker, config_tracker, state)
242229
return on_handoff
243230

244231
def _handle_handoff(
@@ -250,7 +237,6 @@ def _handle_handoff(
250237
tracker: Any,
251238
config_tracker: Any,
252239
state: _RunState,
253-
input_str: str = '',
254240
) -> None:
255241
path.append(tgt)
256242
state.last_node_key = tgt
@@ -275,19 +261,7 @@ def _handle_handoff(
275261
config_tracker.track_duration(int(duration_ms))
276262
config_tracker.track_success()
277263

278-
src_node = self._graph.get_node(src)
279-
if src_node is not None:
280-
# The OpenAI Agents SDK does not expose the agent's text output at
281-
# handoff time via RunContextWrapper, so output_text is empty here.
282-
eval_task = src_node.get_config().evaluator.evaluate(input_str, '')
283-
state.pending_eval_tasks.append((config_tracker, eval_task))
284-
285-
def _flush_final_segment(
286-
self,
287-
state: _RunState,
288-
result: Any,
289-
input_str: str = '',
290-
) -> None:
264+
def _flush_final_segment(self, state: _RunState, result: Any) -> None:
291265
"""Record duration/tokens for the last active agent (no handoff after it)."""
292266
if not state.last_node_key:
293267
return
@@ -311,12 +285,6 @@ def _flush_final_segment(
311285
config_tracker.track_duration(int(duration_ms))
312286
config_tracker.track_success()
313287

314-
final_node = self._graph.get_node(state.last_node_key)
315-
if final_node is not None:
316-
output_str = str(result.final_output) if result is not None else ''
317-
eval_task = final_node.get_config().evaluator.evaluate(input_str, output_str)
318-
state.pending_eval_tasks.append((config_tracker, eval_task))
319-
320288
def _track_tool_calls(self, result: Any) -> None:
321289
"""Track all tool calls from the run result, attributed to the node that called them."""
322290
for agent_name, tool_fn_name in get_tool_calls_from_run_items(result.new_items):

0 commit comments

Comments
 (0)