Skip to content

Commit 15b397a

Browse files
jsonbailey and claude committed
fix!: build judge input as string and narrow Runner.run signature
Aligns the JS AI SDK with the spec change implemented in launchdarkly/python-server-sdk-ai#165 and launchdarkly/sdk-specs#160. Judges now build a single formatted string ("MESSAGE HISTORY:\n...\n\nRESPONSE TO EVALUATE:\n...") and pass it to the runner instead of an interpolated message list. Non-system messages in legacy judge configs that contain {{message_history}} or {{response_to_evaluate}} placeholders are stripped at config-construction time so old and new flag values both work without behavioral surprises. BREAKING CHANGE: Runner.run is narrowed from `run(input: string | LDMessage[], outputType?)` to `run(input: string, outputType?)`. The OpenAI, LangChain, and Vercel provider runners no longer accept a pre-built message array; they always prepend any config messages and append the prompt as a user turn. The Judge no longer interpolates {{message_history}} or {{response_to_evaluate}} into config messages — the SDK builds the input string directly and the runner receives that string verbatim. The "Judge configuration must include messages" early-return was removed; a judge with no messages now proceeds to invoke the runner with the formatted input. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 70e4eb9 commit 15b397a

11 files changed

Lines changed: 232 additions & 146 deletions

File tree

packages/ai-providers/server-ai-langchain/src/LangChainModelRunner.ts

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,20 @@ export class LangChainModelRunner implements Runner {
2929
}
3030

3131
/**
32-
* Run the LangChain model with the given prompt.
32+
* Run the LangChain model with the given user prompt.
3333
*
34-
* @param input The user prompt string or a pre-built message array to send to the model.
35-
* When a string is provided, config messages are prepended before the user prompt.
36-
* When an {@link LDMessage} array is provided, it is used as-is (config messages are
37-
* not prepended).
34+
* Prepends any messages defined in the AI config (system prompt, etc.) before
35+
* the user prompt.
36+
*
37+
* @param input The user prompt string.
3838
* @param outputType Optional JSON schema for structured output. When provided,
3939
* the parsed result is exposed via {@link RunnerResult.parsed}.
4040
*/
41-
async run(input: string | LDMessage[], outputType?: Record<string, unknown>): Promise<RunnerResult> {
42-
const messages: LDMessage[] = Array.isArray(input)
43-
? input
44-
: [...(this._config.messages ?? []), { role: 'user', content: input }];
41+
async run(input: string, outputType?: Record<string, unknown>): Promise<RunnerResult> {
42+
const messages: LDMessage[] = [
43+
...(this._config.messages ?? []),
44+
{ role: 'user', content: input },
45+
];
4546

4647
if (outputType !== undefined) {
4748
return this._runStructured(messages, outputType);

packages/ai-providers/server-ai-openai/__tests__/OpenAIModelRunner.test.ts

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -75,27 +75,6 @@ describe('OpenAIModelRunner', () => {
7575
});
7676
});
7777

78-
it('passes a LDMessage[] input directly without prepending config messages', async () => {
79-
const mockResponse = {
80-
choices: [{ message: { content: 'Evaluation result' } }],
81-
usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 },
82-
};
83-
(mockOpenAI.chat.completions.create as jest.Mock).mockResolvedValue(mockResponse as any);
84-
85-
const messages = [
86-
{ role: 'system' as const, content: 'You are a judge' },
87-
{ role: 'user' as const, content: 'Rate this: hello' },
88-
];
89-
const result = await runner.run(messages);
90-
91-
expect(mockOpenAI.chat.completions.create).toHaveBeenCalledWith({
92-
model: 'gpt-3.5-turbo',
93-
messages,
94-
});
95-
expect(result.content).toBe('Evaluation result');
96-
expect(result.metrics.success).toBe(true);
97-
});
98-
9978
it('marks the result unsuccessful when response has no content', async () => {
10079
const mockResponse = { choices: [{ message: {} }] };
10180
(mockOpenAI.chat.completions.create as jest.Mock).mockResolvedValue(mockResponse as any);

packages/ai-providers/server-ai-openai/src/OpenAIAgentRunner.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ export class OpenAIAgentRunner implements Runner {
4848

4949
async run(input: string, _outputType?: Record<string, unknown>): Promise<RunnerResult> {
5050
try {
51-
const result = await this._agentRun(this._agent, String(input), { maxTurns: MAX_TURNS });
51+
const result = await this._agentRun(this._agent, input, { maxTurns: MAX_TURNS });
5252

5353
const toolCalls = getToolCallsFromRunItems(result.newItems ?? []).reduce(
5454
(acc: string[], fnName: string) => {

packages/ai-providers/server-ai-openai/src/OpenAIModelRunner.ts

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,20 @@ export class OpenAIModelRunner implements Runner {
3232
}
3333

3434
/**
35-
* Run the OpenAI model with the given prompt or message array.
35+
* Run the OpenAI model with the given user prompt.
3636
*
37-
* When `input` is a string it is wrapped as a user turn and appended to any
38-
* messages defined in the config. When `input` is already a `LDMessage[]`
39-
* (e.g. when called from the Judge evaluation path) it is used as-is.
37+
* Prepends any messages defined in the AI config (system prompt,
38+
* instructions, etc.) before the user prompt.
4039
*
41-
* @param input The user prompt string, or a pre-built message array.
40+
* @param input The user prompt string.
4241
* @param outputType Optional JSON schema for structured output. When provided,
4342
* the response is parsed and exposed via {@link RunnerResult.parsed}.
4443
*/
45-
async run(input: string | LDMessage[], outputType?: Record<string, unknown>): Promise<RunnerResult> {
46-
const messages: LDMessage[] = Array.isArray(input)
47-
? input
48-
: [...(this._config.messages ?? []), { role: 'user', content: input }];
44+
async run(input: string, outputType?: Record<string, unknown>): Promise<RunnerResult> {
45+
const messages: LDMessage[] = [
46+
...(this._config.messages ?? []),
47+
{ role: 'user', content: input },
48+
];
4949

5050
if (outputType !== undefined) {
5151
return this._runStructured(messages, outputType);

packages/ai-providers/server-ai-vercel/src/VercelModelRunner.ts

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,20 @@ export class VercelModelRunner implements Runner {
3636
}
3737

3838
/**
39-
* Run the Vercel AI model with the given prompt.
39+
* Run the Vercel AI model with the given user prompt.
4040
*
41-
* @param input The user prompt string, or a pre-built message array. When a
42-
* string is supplied the config's system messages are prepended automatically.
43-
* When a `LDMessage[]` is supplied it is used as-is (config messages are NOT
44-
* prepended — the caller is responsible for the full message list).
41+
* Prepends any messages defined in the AI config (system prompt, etc.) before
42+
* the user prompt.
43+
*
44+
* @param input The user prompt string.
4545
* @param outputType Optional JSON schema for structured output. When provided,
4646
* the parsed object is exposed via {@link RunnerResult.parsed}.
4747
*/
48-
async run(input: string | LDMessage[], outputType?: Record<string, unknown>): Promise<RunnerResult> {
49-
const messages: LDMessage[] = Array.isArray(input)
50-
? input
51-
: [...(this._config.messages ?? []), { role: 'user', content: input }];
48+
async run(input: string, outputType?: Record<string, unknown>): Promise<RunnerResult> {
49+
const messages: LDMessage[] = [
50+
...(this._config.messages ?? []),
51+
{ role: 'user', content: input },
52+
];
5253

5354
if (outputType !== undefined) {
5455
return this._runStructured(messages, outputType);

packages/sdk/server-ai/__tests__/Judge.test.ts

Lines changed: 110 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,74 @@ import { LDLogger } from '@launchdarkly/js-server-sdk-common';
22

33
import { LDAIConfigTracker } from '../src/api/config/LDAIConfigTracker';
44
import { LDAIJudgeConfig, LDMessage } from '../src/api/config/types';
5-
import { Judge } from '../src/api/judge/Judge';
5+
import { Judge, stripLegacyJudgeMessages } from '../src/api/judge/Judge';
66
import { RunnerResult } from '../src/api/model/types';
77
import { Runner } from '../src/api/providers/Runner';
88

9+
describe('stripLegacyJudgeMessages', () => {
10+
it('strips assistant messages containing {{message_history}}', () => {
11+
const messages: LDMessage[] = [
12+
{ role: 'system', content: 'You are a judge.' },
13+
{ role: 'assistant', content: 'Here is the history: {{message_history}}' },
14+
];
15+
const result = stripLegacyJudgeMessages(messages);
16+
expect(result).toHaveLength(1);
17+
expect(result[0].role).toBe('system');
18+
});
19+
20+
it('strips user messages containing {{response_to_evaluate}}', () => {
21+
const messages: LDMessage[] = [
22+
{ role: 'system', content: 'You are a judge.' },
23+
{ role: 'user', content: 'Evaluate: {{response_to_evaluate}}' },
24+
];
25+
const result = stripLegacyJudgeMessages(messages);
26+
expect(result).toHaveLength(1);
27+
expect(result[0].role).toBe('system');
28+
});
29+
30+
it('strips all legacy template messages from a typical legacy config', () => {
31+
const messages: LDMessage[] = [
32+
{ role: 'system', content: 'You are a judge.' },
33+
{ role: 'assistant', content: '{{message_history}}' },
34+
{ role: 'user', content: '{{response_to_evaluate}}' },
35+
];
36+
const result = stripLegacyJudgeMessages(messages);
37+
expect(result).toHaveLength(1);
38+
expect(result[0].role).toBe('system');
39+
});
40+
41+
it('does not strip system messages even when they contain template variables', () => {
42+
const messages: LDMessage[] = [
43+
{
44+
role: 'system',
45+
content: 'Judge using {{message_history}} and {{response_to_evaluate}}.',
46+
},
47+
];
48+
const result = stripLegacyJudgeMessages(messages);
49+
expect(result).toHaveLength(1);
50+
expect(result[0].role).toBe('system');
51+
});
52+
53+
it('leaves non-system messages without template variables untouched', () => {
54+
const messages: LDMessage[] = [
55+
{ role: 'system', content: 'You are a judge.' },
56+
{ role: 'user', content: 'This is a regular message.' },
57+
];
58+
const result = stripLegacyJudgeMessages(messages);
59+
expect(result).toHaveLength(2);
60+
});
61+
62+
it('returns an empty array for an empty input', () => {
63+
expect(stripLegacyJudgeMessages([])).toEqual([]);
64+
});
65+
66+
it('passes a new-style system-only config through unchanged', () => {
67+
const messages: LDMessage[] = [{ role: 'system', content: 'You are a judge.' }];
68+
const result = stripLegacyJudgeMessages(messages);
69+
expect(result).toEqual(messages);
70+
});
71+
});
72+
973
describe('Judge', () => {
1074
let mockRunner: jest.Mocked<Runner>;
1175
let mockTracker: jest.Mocked<LDAIConfigTracker>;
@@ -37,14 +101,7 @@ describe('Judge', () => {
37101
judgeConfig = {
38102
key: 'test-judge',
39103
enabled: true,
40-
messages: [
41-
{ role: 'system', content: 'You are a helpful judge that evaluates AI responses.' },
42-
{
43-
role: 'user',
44-
content:
45-
'Evaluate and report scores for important metrics: Input: {{message_history}}, Output: {{response_to_evaluate}}',
46-
},
47-
],
104+
messages: [{ role: 'system', content: 'You are a helpful judge that evaluates AI responses.' }],
48105
model: { name: 'gpt-4' },
49106
provider: { name: 'openai' },
50107
createTracker: () => mockTracker,
@@ -161,21 +218,33 @@ describe('Judge', () => {
161218
});
162219

163220
expect(mockRunner.run).toHaveBeenCalledWith(
164-
expect.arrayContaining([
165-
expect.objectContaining({
166-
role: 'system',
167-
content: 'You are a helpful judge that evaluates AI responses.',
168-
}),
169-
expect.objectContaining({
170-
role: 'user',
171-
content:
172-
'Evaluate and report scores for important metrics: Input: What is the capital of France?, Output: Paris is the capital of France.',
173-
}),
174-
]),
221+
'MESSAGE HISTORY:\nWhat is the capital of France?\n\nRESPONSE TO EVALUATE:\nParis is the capital of France.',
175222
expect.any(Object), // evaluation schema
176223
);
177224
});
178225

226+
it('passes a string input to the runner (not a message list)', async () => {
227+
const mockRunnerResult: RunnerResult = {
228+
content: '',
229+
parsed: {
230+
score: 0.85,
231+
reasoning: 'Good answer.',
232+
},
233+
metrics: { success: true },
234+
};
235+
236+
mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func());
237+
mockRunner.run.mockResolvedValue(mockRunnerResult);
238+
239+
await judge.evaluate('What is AI?', 'AI is artificial intelligence.');
240+
241+
expect(mockRunner.run).toHaveBeenCalledTimes(1);
242+
const inputArg = mockRunner.run.mock.calls[0][0];
243+
expect(typeof inputArg).toBe('string');
244+
expect(inputArg).toContain('MESSAGE HISTORY:\nWhat is AI?');
245+
expect(inputArg).toContain('RESPONSE TO EVALUATE:\nAI is artificial intelligence.');
246+
});
247+
179248
it('returns evaluation result with correct evaluationMetricKey for tracker integration', async () => {
180249
const mockRunnerResult: RunnerResult = {
181250
content: '',
@@ -412,25 +481,29 @@ describe('Judge', () => {
412481
});
413482
});
414483

415-
it('returns error result when messages are missing', async () => {
484+
it('proceeds (does not error early) when messages is undefined', async () => {
416485
const configWithoutMessages: LDAIJudgeConfig = {
417486
...judgeConfig,
418487
messages: undefined,
419488
};
420489
const judgeWithoutMessages = new Judge(configWithoutMessages, mockRunner, 1.0, mockLogger);
421490

491+
const mockRunnerResult: RunnerResult = {
492+
content: '',
493+
parsed: {
494+
score: 0.7,
495+
reasoning: 'Acceptable response.',
496+
},
497+
metrics: { success: true },
498+
};
499+
mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func());
500+
mockRunner.run.mockResolvedValue(mockRunnerResult);
501+
422502
const result = await judgeWithoutMessages.evaluate('test input', 'test output');
423503

424-
expect(result).toEqual({
425-
success: false,
426-
sampled: true,
427-
errorMessage: 'Judge configuration must include messages',
428-
judgeConfigKey: 'test-judge',
429-
});
430-
expect(mockLogger.warn).toHaveBeenCalledWith(
431-
'Judge configuration must include messages',
432-
mockTrackData,
433-
);
504+
expect(result.sampled).toBe(true);
505+
expect(result.success).toBe(true);
506+
expect(mockRunner.run).toHaveBeenCalledTimes(1);
434507
});
435508

436509
it('returns result with success false when parsed is undefined or has no score/reasoning', async () => {
@@ -588,17 +661,7 @@ describe('Judge', () => {
588661
});
589662

590663
expect(mockRunner.run).toHaveBeenCalledWith(
591-
expect.arrayContaining([
592-
expect.objectContaining({
593-
role: 'system',
594-
content: 'You are a helpful judge that evaluates AI responses.',
595-
}),
596-
expect.objectContaining({
597-
role: 'user',
598-
content:
599-
'Evaluate and report scores for important metrics: Input: What is the capital of France?\r\nParis is the capital of France., Output: Paris is the capital of France.',
600-
}),
601-
]),
664+
'MESSAGE HISTORY:\nWhat is the capital of France?\r\nParis is the capital of France.\n\nRESPONSE TO EVALUATE:\nParis is the capital of France.',
602665
expect.any(Object), // evaluation schema
603666
);
604667
});
@@ -626,28 +689,19 @@ describe('Judge', () => {
626689
});
627690
});
628691

629-
describe('_constructEvaluationMessages', () => {
692+
describe('_buildEvaluationInput', () => {
630693
let judge: Judge;
631694

632695
beforeEach(() => {
633696
judge = new Judge(judgeConfig, mockRunner, 1.0, mockLogger);
634697
});
635698

636-
it('constructs evaluation messages correctly', () => {
699+
it('builds the evaluation string in the expected format', () => {
637700
// eslint-disable-next-line no-underscore-dangle
638-
const constructMessages = (judge as any)._constructEvaluationMessages.bind(judge);
639-
const messages = constructMessages('test input', 'test output');
701+
const buildInput = (judge as any)._buildEvaluationInput.bind(judge);
702+
const input = buildInput('hello', 'world');
640703

641-
expect(messages).toHaveLength(2);
642-
expect(messages[0]).toEqual({
643-
role: 'system',
644-
content: 'You are a helpful judge that evaluates AI responses.',
645-
});
646-
expect(messages[1]).toEqual({
647-
role: 'user',
648-
content:
649-
'Evaluate and report scores for important metrics: Input: test input, Output: test output',
650-
});
704+
expect(input).toBe('MESSAGE HISTORY:\nhello\n\nRESPONSE TO EVALUATE:\nworld');
651705
});
652706
});
653707

0 commit comments

Comments
 (0)