Skip to content

Commit ca4aacd

Browse files
jsonbailey and claude committed
fix: Make judge runners non-multi-turn
Add a multiTurn parameter (default true) on provider model runners and RunnerFactory.createModel. When false, the runner does not persist the user prompt and assistant reply back into its conversation history, so each run() call starts fresh from the seeded config messages.

Judges now construct their underlying runner with multiTurn=false so successive evaluate() calls on a shared Judge instance do not see each other's prompts and responses. Without this, every evaluation after the first contaminated the judge model's input with prior conversations and concurrent evaluations raced on the mutable history.

Also fix Judge.evaluateMessages to render messages as "<role>: <content>" joined by newlines, preserving speaker identity in the message history section the judge model receives.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 07aa79d commit ca4aacd

16 files changed

Lines changed: 469 additions & 37 deletions

File tree

packages/ai-providers/server-ai-langchain/__tests__/LangChainModelRunner.test.ts

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,4 +180,68 @@ describe('LangChainModelRunner', () => {
180180
expect(secondCallMessages[3].content).toBe('Q2');
181181
});
182182
});
183+
184+
describe('multiTurn=false (stateless)', () => {
185+
const configWithMessages: LDAICompletionConfig = {
186+
...baseConfig,
187+
messages: [{ role: 'system', content: 'You are a judge.' }],
188+
};
189+
190+
it('does not accumulate history across successful calls', async () => {
191+
const statelessRunner = new LangChainModelRunner(
192+
mockLLM,
193+
configWithMessages,
194+
mockLogger,
195+
false,
196+
);
197+
198+
mockLLM.invoke
199+
.mockResolvedValueOnce(new AIMessage('First response'))
200+
.mockResolvedValueOnce(new AIMessage('Second response'));
201+
202+
await statelessRunner.run('First question');
203+
await statelessRunner.run('Second question');
204+
205+
const firstCallMessages = mockLLM.invoke.mock.calls[0][0];
206+
const secondCallMessages = mockLLM.invoke.mock.calls[1][0];
207+
expect(firstCallMessages).toHaveLength(2);
208+
expect(firstCallMessages[0].content).toBe('You are a judge.');
209+
expect(firstCallMessages[1].content).toBe('First question');
210+
expect(secondCallMessages).toHaveLength(2);
211+
expect(secondCallMessages[0].content).toBe('You are a judge.');
212+
expect(secondCallMessages[1].content).toBe('Second question');
213+
});
214+
215+
it('keeps the internal chat history length pinned to the seeded config messages', async () => {
216+
const statelessRunner = new LangChainModelRunner(
217+
mockLLM,
218+
configWithMessages,
219+
mockLogger,
220+
false,
221+
);
222+
223+
mockLLM.invoke.mockResolvedValue(new AIMessage('response'));
224+
225+
await statelessRunner.run('Q1');
226+
await statelessRunner.run('Q2');
227+
228+
// eslint-disable-next-line no-underscore-dangle
229+
const messages = await (statelessRunner as any)._chatHistory.getMessages();
230+
expect(messages).toHaveLength(1);
231+
});
232+
233+
it('defaults to multiTurn=true when the parameter is omitted', async () => {
234+
const defaultRunner = new LangChainModelRunner(mockLLM, configWithMessages, mockLogger);
235+
236+
mockLLM.invoke
237+
.mockResolvedValueOnce(new AIMessage('Answer 1'))
238+
.mockResolvedValueOnce(new AIMessage('Answer 2'));
239+
240+
await defaultRunner.run('Q1');
241+
await defaultRunner.run('Q2');
242+
243+
const secondCallMessages = mockLLM.invoke.mock.calls[1][0];
244+
expect(secondCallMessages).toHaveLength(4);
245+
});
246+
});
183247
});

packages/ai-providers/server-ai-langchain/src/LangChainModelRunner.ts

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,26 +20,37 @@ import { convertMessagesToLangChain, getAIMetricsFromResponse } from './LangChai
2020
export class LangChainModelRunner implements Runner {
2121
private _llm: BaseChatModel;
2222
private _chatHistory: InMemoryChatMessageHistory;
23+
private _multiTurn: boolean;
2324
private _logger?: LDLogger;
2425

25-
constructor(llm: BaseChatModel, config: LDAICompletionConfig, logger?: LDLogger) {
26+
constructor(
27+
llm: BaseChatModel,
28+
config: LDAICompletionConfig,
29+
logger?: LDLogger,
30+
multiTurn: boolean = true,
31+
) {
2632
this._llm = llm;
2733
this._chatHistory = new InMemoryChatMessageHistory(
2834
convertMessagesToLangChain(config.messages ?? []),
2935
);
36+
this._multiTurn = multiTurn;
3037
this._logger = logger;
3138
}
3239

3340
/**
3441
* Run the LangChain model with the given user prompt.
3542
*
3643
* The runner maintains a LangChain `InMemoryChatMessageHistory` that is
37-
* initialized from any messages on the AI config (system prompt, etc.) and
38-
* grows with each successful call. On every invocation the user prompt is
39-
* appended to the existing history before being sent to the model. When the
40-
* call succeeds and produces non-empty content, the user prompt and the
41-
* assistant's reply are persisted to the history; failed calls leave the
42-
* history unchanged so the next call can retry cleanly.
44+
* initialized from any messages on the AI config (system prompt, etc.). On
45+
* every invocation the user prompt is appended to the existing history
46+
* before being sent to the model. When `multiTurn` is `true` (the default)
47+
* and the call succeeds with non-empty content, the user prompt and the
48+
* assistant's reply are persisted to the history so subsequent calls
49+
* continue the conversation. When `multiTurn` is `false`, history is
50+
* treated as read-only — useful for stateless runners (e.g. judges) where
51+
* every call should see only the initial config messages plus the current
52+
* input. Failed calls leave the history unchanged so the next call can
53+
* retry cleanly.
4354
*
4455
* @param input The user prompt string.
4556
* @param outputType Optional JSON schema for structured output. When provided,
@@ -56,7 +67,7 @@ export class LangChainModelRunner implements Runner {
5667
? await this._runStructured(langchainMessages, outputType)
5768
: await this._runCompletion(langchainMessages);
5869

59-
if (result.metrics.success && result.content) {
70+
if (result.metrics.success && result.content && this._multiTurn) {
6071
await this._chatHistory.addUserMessage(input);
6172
await this._chatHistory.addAIMessage(result.content);
6273
}

packages/ai-providers/server-ai-langchain/src/LangChainRunnerFactory.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,18 @@ export class LangChainRunnerFactory extends AIProvider {
2626

2727
/**
2828
* Create a model runner from a completion AI configuration.
29+
*
30+
* @param config The completion (or judge) AI configuration.
31+
* @param multiTurn Whether the runner should accumulate conversation history
32+
* across successive `run()` calls. Defaults to `true` (chat semantics).
33+
* Pass `false` for stateless runners such as judges.
2934
*/
30-
async createModel(config: LDAICompletionConfig): Promise<LangChainModelRunner> {
35+
async createModel(
36+
config: LDAICompletionConfig,
37+
multiTurn: boolean = true,
38+
): Promise<LangChainModelRunner> {
3139
const llm = await createLangChainModel(config);
32-
return new LangChainModelRunner(llm, config, this._logger);
40+
return new LangChainModelRunner(llm, config, this._logger, multiTurn);
3341
}
3442

3543
/**

packages/ai-providers/server-ai-openai/__tests__/OpenAIModelRunner.test.ts

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,4 +231,89 @@ describe('OpenAIModelRunner', () => {
231231
]);
232232
});
233233
});
234+
235+
describe('multiTurn=false (stateless)', () => {
236+
const configWithMessages: LDAICompletionConfig = {
237+
...baseConfig,
238+
messages: [{ role: 'system', content: 'You are a judge.' }],
239+
};
240+
241+
it('does not accumulate history across successful calls', async () => {
242+
const statelessRunner = new OpenAIModelRunner(
243+
mockOpenAI,
244+
configWithMessages,
245+
undefined,
246+
false,
247+
);
248+
249+
(mockOpenAI.chat.completions.create as jest.Mock)
250+
.mockResolvedValueOnce({
251+
choices: [{ message: { content: 'First response' } }],
252+
usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 },
253+
} as any)
254+
.mockResolvedValueOnce({
255+
choices: [{ message: { content: 'Second response' } }],
256+
usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 },
257+
} as any);
258+
259+
await statelessRunner.run('First question');
260+
await statelessRunner.run('Second question');
261+
262+
const firstCallArgs = (mockOpenAI.chat.completions.create as jest.Mock).mock.calls[0][0];
263+
const secondCallArgs = (mockOpenAI.chat.completions.create as jest.Mock).mock.calls[1][0];
264+
expect(firstCallArgs.messages).toEqual([
265+
{ role: 'system', content: 'You are a judge.' },
266+
{ role: 'user', content: 'First question' },
267+
]);
268+
expect(secondCallArgs.messages).toEqual([
269+
{ role: 'system', content: 'You are a judge.' },
270+
{ role: 'user', content: 'Second question' },
271+
]);
272+
});
273+
274+
it('keeps the internal history length pinned to the seeded config messages', async () => {
275+
const statelessRunner = new OpenAIModelRunner(
276+
mockOpenAI,
277+
configWithMessages,
278+
undefined,
279+
false,
280+
);
281+
282+
(mockOpenAI.chat.completions.create as jest.Mock).mockResolvedValue({
283+
choices: [{ message: { content: 'response' } }],
284+
usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 },
285+
} as any);
286+
287+
await statelessRunner.run('Q1');
288+
await statelessRunner.run('Q2');
289+
290+
// eslint-disable-next-line no-underscore-dangle
291+
expect((statelessRunner as any)._history).toHaveLength(1);
292+
});
293+
294+
it('defaults to multiTurn=true when the parameter is omitted', async () => {
295+
const defaultRunner = new OpenAIModelRunner(mockOpenAI, configWithMessages);
296+
297+
(mockOpenAI.chat.completions.create as jest.Mock)
298+
.mockResolvedValueOnce({
299+
choices: [{ message: { content: 'Answer 1' } }],
300+
usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 },
301+
} as any)
302+
.mockResolvedValueOnce({
303+
choices: [{ message: { content: 'Answer 2' } }],
304+
usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 },
305+
} as any);
306+
307+
await defaultRunner.run('Q1');
308+
await defaultRunner.run('Q2');
309+
310+
const secondCallArgs = (mockOpenAI.chat.completions.create as jest.Mock).mock.calls[1][0];
311+
expect(secondCallArgs.messages).toEqual([
312+
{ role: 'system', content: 'You are a judge.' },
313+
{ role: 'user', content: 'Q1' },
314+
{ role: 'assistant', content: 'Answer 1' },
315+
{ role: 'user', content: 'Q2' },
316+
]);
317+
});
318+
});
234319
});

packages/ai-providers/server-ai-openai/src/OpenAIModelRunner.ts

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,26 +21,37 @@ export class OpenAIModelRunner implements Runner {
2121
private _modelName: string;
2222
private _parameters: Record<string, unknown>;
2323
private _history: LDMessage[];
24+
private _multiTurn: boolean;
2425
private _logger?: LDLogger;
2526

26-
constructor(client: OpenAI, config: LDAICompletionConfig, logger?: LDLogger) {
27+
constructor(
28+
client: OpenAI,
29+
config: LDAICompletionConfig,
30+
logger?: LDLogger,
31+
multiTurn: boolean = true,
32+
) {
2733
this._client = client;
2834
this._modelName = config.model?.name ?? '';
2935
this._parameters = { ...(config.model?.parameters ?? {}) };
3036
this._history = [...(config.messages ?? [])];
37+
this._multiTurn = multiTurn;
3138
this._logger = logger;
3239
}
3340

3441
/**
3542
* Run the OpenAI model with the given user prompt.
3643
*
3744
* The runner maintains a conversation history that is initialized from any
38-
* messages on the AI config (system prompt, instructions, etc.) and grows
39-
* with each successful call. On every invocation the user prompt is appended
40-
* to the existing history before being sent to the model. When the call
41-
* succeeds and produces non-empty content, the user prompt and the
42-
* assistant's reply are persisted to the history; failed calls leave the
43-
* history unchanged so the next call can retry cleanly.
45+
* messages on the AI config (system prompt, instructions, etc.). On every
46+
* invocation the user prompt is appended to the existing history before
47+
* being sent to the model. When `multiTurn` is `true` (the default) and the
48+
* call succeeds with non-empty content, the user prompt and the assistant's
49+
* reply are persisted to the history so subsequent calls continue the
50+
* conversation. When `multiTurn` is `false`, history is treated as
51+
* read-only — useful for stateless runners (e.g. judges) where every call
52+
* should see only the initial config messages plus the current input.
53+
* Failed calls leave the history unchanged so the next call can retry
54+
* cleanly.
4455
*
4556
* @param input The user prompt string.
4657
* @param outputType Optional JSON schema for structured output. When provided,
@@ -55,7 +66,7 @@ export class OpenAIModelRunner implements Runner {
5566
? await this._runStructured(messages, outputType)
5667
: await this._runCompletion(messages);
5768

58-
if (result.metrics.success && result.content) {
69+
if (result.metrics.success && result.content && this._multiTurn) {
5970
this._history.push(userMessage);
6071
this._history.push({ role: 'assistant', content: result.content });
6172
}

packages/ai-providers/server-ai-openai/src/OpenAIRunnerFactory.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,17 @@ export class OpenAIRunnerFactory extends AIProvider {
2828

2929
/**
3030
* Create a model runner from a completion AI configuration.
31+
*
32+
* @param config The completion (or judge) AI configuration.
33+
* @param multiTurn Whether the runner should accumulate conversation history
34+
* across successive `run()` calls. Defaults to `true` (chat semantics).
35+
* Pass `false` for stateless runners such as judges.
3136
*/
32-
async createModel(config: LDAICompletionConfig): Promise<OpenAIModelRunner> {
33-
return new OpenAIModelRunner(this._client, config, this._logger);
37+
async createModel(
38+
config: LDAICompletionConfig,
39+
multiTurn: boolean = true,
40+
): Promise<OpenAIModelRunner> {
41+
return new OpenAIModelRunner(this._client, config, this._logger, multiTurn);
3442
}
3543

3644
/**

0 commit comments

Comments
 (0)