Skip to content

Commit 775aaca

Browse files
jsonbailey and claude committed
fix: build judge input as string; strip legacy message_history/response_to_evaluate messages
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 48761c9 commit 775aaca

7 files changed

Lines changed: 210 additions & 87 deletions

File tree

packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_agent_runner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def __init__(self, agent: Any):
3333

3434
async def run(
3535
self,
36-
input: Any,
36+
input: str,
3737
output_type: Optional[Dict[str, Any]] = None,
3838
) -> RunnerResult:
3939
"""
@@ -42,7 +42,7 @@ async def run(
4242
Delegates to the compiled LangChain agent, which handles
4343
the tool-calling loop internally.
4444
45-
:param input: The user prompt or input to the agent
45+
:param input: The user prompt string to the agent
4646
:param output_type: Reserved for future structured output support;
4747
currently ignored.
4848
:return: :class:`RunnerResult` with ``content``, ``raw`` response, and

packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,37 +37,25 @@ def get_llm(self) -> BaseChatModel:
3737

3838
async def run(
3939
self,
40-
input: Any,
40+
input: str,
4141
output_type: Optional[Dict[str, Any]] = None,
4242
) -> RunnerResult:
4343
"""
4444
Run the LangChain model with the given input.
4545
46-
:param input: A string prompt or a list of :class:`LDMessage` objects
46+
:param input: A string prompt
4747
:param output_type: Optional JSON schema dict requesting structured output.
4848
When provided, ``parsed`` on the returned :class:`RunnerResult` is
4949
populated with the parsed JSON document.
5050
:return: :class:`RunnerResult` containing ``content``, ``metrics``,
5151
``raw`` and (when ``output_type`` is set) ``parsed``.
5252
"""
53-
messages = self._coerce_input(input)
53+
messages = [LDMessage(role='user', content=input)]
5454

5555
if output_type is not None:
5656
return await self._run_structured(messages, output_type)
5757
return await self._run_completion(messages)
5858

59-
# convert_messages_to_langchain only accepts List[LDMessage]; _coerce_input
60-
# normalizes a bare string to [LDMessage(role='user', ...)] before that step.
61-
@staticmethod
62-
def _coerce_input(input: Any) -> List[LDMessage]:
63-
if isinstance(input, str):
64-
return [LDMessage(role='user', content=input)]
65-
if isinstance(input, list):
66-
return input
67-
raise TypeError(
68-
f"Unsupported input type for LangChainModelRunner.run: {type(input).__name__}"
69-
)
70-
7159
async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult:
7260
try:
7361
langchain_messages = convert_messages_to_langchain(messages)

packages/ai-providers/server-ai-openai/src/ldai_openai/openai_agent_runner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(
4747

4848
async def run(
4949
self,
50-
input: Any,
50+
input: str,
5151
output_type: Optional[Dict[str, Any]] = None,
5252
) -> RunnerResult:
5353
"""
@@ -56,7 +56,7 @@ async def run(
5656
Delegates to the OpenAI Agents SDK ``Runner.run``, which handles the
5757
tool-calling loop internally.
5858
59-
:param input: The user prompt or input to the agent
59+
:param input: The user prompt string to the agent
6060
:param output_type: Reserved for future structured output support;
6161
currently ignored.
6262
:return: :class:`RunnerResult` with ``content``, ``raw`` response, and

packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -35,39 +35,25 @@ def __init__(
3535

3636
async def run(
3737
self,
38-
input: Any,
38+
input: str,
3939
output_type: Optional[Dict[str, Any]] = None,
4040
) -> RunnerResult:
4141
"""
4242
Run the OpenAI model with the given input.
4343
44-
:param input: A string prompt or a list of :class:`LDMessage` objects
44+
:param input: A string prompt
4545
:param output_type: Optional JSON schema dict requesting structured output.
4646
When provided, ``parsed`` on the returned :class:`RunnerResult` is
4747
populated with the parsed JSON document.
4848
:return: :class:`RunnerResult` containing ``content``, ``metrics``,
4949
``raw`` and (when ``output_type`` is set) ``parsed``.
5050
"""
51-
try:
52-
messages = self._coerce_input(input)
53-
except TypeError as error:
54-
log.warning(f'OpenAI model runner received unsupported input type: {error}')
55-
return RunnerResult(content='', metrics=LDAIMetrics(success=False, usage=None))
51+
messages = [LDMessage(role='user', content=input)]
5652

5753
if output_type is not None:
5854
return await self._run_structured(messages, output_type)
5955
return await self._run_completion(messages)
6056

61-
@staticmethod
62-
def _coerce_input(input: Any) -> List[LDMessage]:
63-
if isinstance(input, str):
64-
return [LDMessage(role='user', content=input)]
65-
if isinstance(input, list):
66-
return input
67-
raise TypeError(
68-
f"Unsupported input type for OpenAIModelRunner.run: {type(input).__name__}"
69-
)
70-
7157
async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult:
7258
try:
7359
response = await self._client.chat.completions.create(

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
"""Judge implementation for AI evaluation."""
22

33
import random
4-
from typing import Any, Dict, Optional, Tuple
5-
6-
import chevron
4+
from typing import Any, Dict, List, Optional, Tuple
75

86
from ldai import log
97
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
@@ -12,6 +10,30 @@
1210
from ldai.providers.types import JudgeResult, RunnerResult
1311

1412

13+
def _strip_legacy_judge_messages(messages: List[LDMessage]) -> List[LDMessage]:
14+
"""
15+
Remove legacy judge template messages from a message list.
16+
17+
Strips any non-system message whose content contains ``{{message_history}}``
18+
or ``{{response_to_evaluate}}``. These were used by older judge configs to
19+
indicate where the SDK should interpolate the evaluated conversation; new
20+
configs omit them entirely and rely on the string input built by
21+
:meth:`Judge._build_evaluation_input`.
22+
23+
:param messages: The raw message list from the judge AI config.
24+
:return: A new list with legacy template messages removed.
25+
"""
26+
result = []
27+
for msg in messages:
28+
if msg.role != 'system' and (
29+
'{{message_history}}' in msg.content
30+
or '{{response_to_evaluate}}' in msg.content
31+
):
32+
continue
33+
result.append(msg)
34+
return result
35+
36+
1537
class Judge:
1638
"""
1739
Judge implementation that handles evaluation functionality and conversation management.
@@ -65,24 +87,19 @@ async def evaluate(
6587
judge_result.error_message = 'Judge configuration is missing required evaluationMetricKey'
6688
return judge_result
6789

68-
if not self._ai_config.messages:
69-
log.warning('Judge configuration must include messages')
70-
judge_result.error_message = 'Judge configuration must include messages'
71-
return judge_result
72-
7390
if random.random() > effective_rate:
7491
log.debug(f'Judge evaluation skipped due to sampling rate: {effective_rate}')
7592
return judge_result
7693

7794
judge_result.sampled = True
7895

7996
tracker = self._ai_config.create_tracker()
80-
messages = self._construct_evaluation_messages(input_text, output_text)
97+
evaluation_input = self._build_evaluation_input(input_text, output_text)
8198
assert self._evaluation_response_structure is not None
8299

83100
response = await tracker.track_metrics_of_async(
84101
lambda result: result.metrics,
85-
lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure),
102+
lambda: self._model_runner.run(evaluation_input, output_type=self._evaluation_response_structure),
86103
)
87104

88105
if response.parsed is None:
@@ -142,38 +159,21 @@ def get_model_runner(self) -> Runner:
142159
"""
143160
return self._model_runner
144161

145-
def _construct_evaluation_messages(self, input_text: str, output_text: str) -> list[LDMessage]:
146-
"""
147-
Constructs evaluation messages by combining judge's config messages with input/output.
148-
149-
:param input_text: The input text
150-
:param output_text: The output text to evaluate
151-
:return: List of messages for evaluation
162+
def _build_evaluation_input(self, input_text: str, output_text: str) -> str:
152163
"""
153-
if not self._ai_config.messages:
154-
return []
164+
Build the string input for the judge runner.
155165
156-
messages: list[LDMessage] = []
157-
for msg in self._ai_config.messages:
158-
# Interpolate message content with reserved variables
159-
content = self._interpolate_message(msg.content, {
160-
'message_history': input_text,
161-
'response_to_evaluate': output_text,
162-
})
163-
messages.append(LDMessage(role=msg.role, content=content))
166+
Legacy messages (assistant/user messages containing ``{{message_history}}``
167+
or ``{{response_to_evaluate}}``) are stripped from the config; the runner
168+
was already created from the judge AI config (which carries the system
169+
message), so only the plain-text evaluation payload is needed here.
164170
165-
return messages
166-
167-
def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
168-
"""
169-
Interpolates message content with variables using Mustache templating.
170-
171-
:param content: The message content template
172-
:param variables: Variables to interpolate
173-
:return: Interpolated message content
171+
:param input_text: The input text (message history)
172+
:param output_text: The output text to evaluate
173+
:return: Formatted evaluation input string
174174
"""
175-
# Use chevron (Mustache) for templating, with no escaping
176-
return chevron.render(content, variables)
175+
_strip_legacy_judge_messages(self._ai_config.messages or [])
176+
return f"MESSAGE HISTORY:\n{input_text}\n\nRESPONSE TO EVALUATE:\n{output_text}"
177177

178178
def _parse_evaluation_response(self, data: Dict[str, Any]) -> Optional[Tuple[float, str]]:
179179
"""

packages/sdk/server-ai/src/ldai/providers/runner.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ class Runner(Protocol):
1616

1717
async def run(
1818
self,
19-
input: Any,
19+
input: str,
2020
output_type: Optional[Dict[str, Any]] = None,
2121
) -> RunnerResult:
2222
"""
23-
Execute the runner with the given input.
23+
Execute the runner with the given input string.
2424
25-
:param input: The input to the runner.
25+
:param input: The string input to the runner.
2626
:param output_type: Optional JSON schema for structured output.
2727
:return: RunnerResult containing content, metrics, raw, and parsed fields.
2828
"""

0 commit comments

Comments
 (0)