diff --git a/packages/ai-providers/server-ai-langchain/__tests__/LangChainModelRunner.test.ts b/packages/ai-providers/server-ai-langchain/__tests__/LangChainModelRunner.test.ts index 63888b084a..e68af0d718 100644 --- a/packages/ai-providers/server-ai-langchain/__tests__/LangChainModelRunner.test.ts +++ b/packages/ai-providers/server-ai-langchain/__tests__/LangChainModelRunner.test.ts @@ -1,6 +1,6 @@ import { AIMessage } from '@langchain/core/messages'; -import type { LDAICompletionConfig, LDMessage } from '@launchdarkly/server-sdk-ai'; +import type { LDAICompletionConfig } from '@launchdarkly/server-sdk-ai'; import { LangChainModelRunner } from '../src/LangChainModelRunner'; @@ -63,25 +63,6 @@ describe('LangChainModelRunner', () => { expect(passed[1].content).toBe('hi'); }); - it('uses a LDMessage[] as-is without prepending config messages', async () => { - const response = new AIMessage('direct reply'); - mockLLM.invoke.mockResolvedValue(response); - - const configWithMessages: LDAICompletionConfig = { - ...baseConfig, - messages: [{ role: 'system', content: 'You are X' }], - }; - const r = new LangChainModelRunner(mockLLM, configWithMessages, mockLogger); - const inputMessages: LDMessage[] = [ - { role: 'user', content: 'direct question' }, - ]; - await r.run(inputMessages); - - const passed = mockLLM.invoke.mock.calls[0][0]; - expect(passed).toHaveLength(1); - expect(passed[0].content).toBe('direct question'); - }); - it('marks success=false and warns when content is non-string (multimodal)', async () => { mockLLM.invoke.mockResolvedValue(new AIMessage([{ type: 'image' }] as any)); diff --git a/packages/ai-providers/server-ai-langchain/src/LangChainModelRunner.ts b/packages/ai-providers/server-ai-langchain/src/LangChainModelRunner.ts index ec4237269f..37cb657387 100644 --- a/packages/ai-providers/server-ai-langchain/src/LangChainModelRunner.ts +++ b/packages/ai-providers/server-ai-langchain/src/LangChainModelRunner.ts @@ -29,19 +29,20 @@ export class LangChainModelRunner implements Runner { } /** - * Run the LangChain model with the given prompt. + * Run the LangChain model with the given user prompt. * - * @param input The user prompt string or a pre-built message array to send to the model. - * When a string is provided, config messages are prepended before the user prompt. - * When an {@link LDMessage} array is provided, it is used as-is (config messages are - * not prepended). + * Prepends any messages defined in the AI config (system prompt, etc.) before + * the user prompt. + * + * @param input The user prompt string. * @param outputType Optional JSON schema for structured output. When provided, * the parsed result is exposed via {@link RunnerResult.parsed}. */ - async run(input: string | LDMessage[], outputType?: Record): Promise { - const messages: LDMessage[] = Array.isArray(input) - ? input - : [...(this._config.messages ?? []), { role: 'user', content: input }]; + async run(input: string, outputType?: Record): Promise { + const messages: LDMessage[] = [ + ...(this._config.messages ?? []), + { role: 'user', content: input }, + ]; if (outputType !== undefined) { return this._runStructured(messages, outputType); diff --git a/packages/ai-providers/server-ai-openai/__tests__/OpenAIModelRunner.test.ts b/packages/ai-providers/server-ai-openai/__tests__/OpenAIModelRunner.test.ts index 5e6cfaba30..613578e8f5 100644 --- a/packages/ai-providers/server-ai-openai/__tests__/OpenAIModelRunner.test.ts +++ b/packages/ai-providers/server-ai-openai/__tests__/OpenAIModelRunner.test.ts @@ -75,27 +75,6 @@ describe('OpenAIModelRunner', () => { }); }); - it('passes a LDMessage[] input directly without prepending config messages', async () => { - const mockResponse = { - choices: [{ message: { content: 'Evaluation result' } }], - usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 }, - }; - (mockOpenAI.chat.completions.create as jest.Mock).mockResolvedValue(mockResponse as any); - - const messages = [ - { role: 'system' as const, content: 'You are a judge' }, - { role: 'user' as const, content: 'Rate this: hello' }, - ]; - const result = await runner.run(messages); - - expect(mockOpenAI.chat.completions.create).toHaveBeenCalledWith({ - model: 'gpt-3.5-turbo', - messages, - }); - expect(result.content).toBe('Evaluation result'); - expect(result.metrics.success).toBe(true); - }); - it('marks the result unsuccessful when response has no content', async () => { const mockResponse = { choices: [{ message: {} }] }; (mockOpenAI.chat.completions.create as jest.Mock).mockResolvedValue(mockResponse as any); diff --git a/packages/ai-providers/server-ai-openai/src/OpenAIAgentRunner.ts b/packages/ai-providers/server-ai-openai/src/OpenAIAgentRunner.ts index dbe13ac29b..2511149ec4 100644 --- a/packages/ai-providers/server-ai-openai/src/OpenAIAgentRunner.ts +++ b/packages/ai-providers/server-ai-openai/src/OpenAIAgentRunner.ts @@ -48,7 +48,7 @@ export class OpenAIAgentRunner implements Runner { async run(input: string, _outputType?: Record): Promise { try { - const result = await this._agentRun(this._agent, String(input), { maxTurns: MAX_TURNS }); + const result = await this._agentRun(this._agent, input, { maxTurns: MAX_TURNS }); const toolCalls = getToolCallsFromRunItems(result.newItems ?? []).reduce( (acc: string[], fnName: string) => { diff --git a/packages/ai-providers/server-ai-openai/src/OpenAIModelRunner.ts b/packages/ai-providers/server-ai-openai/src/OpenAIModelRunner.ts index 196ce257be..7f8b874e3b 100644 --- a/packages/ai-providers/server-ai-openai/src/OpenAIModelRunner.ts +++ b/packages/ai-providers/server-ai-openai/src/OpenAIModelRunner.ts @@ -32,20 +32,20 @@ export class OpenAIModelRunner implements Runner { } /** - * Run the OpenAI model with the given prompt or message array. + * Run the OpenAI model with the given user prompt. * - * When `input` is a string it is wrapped as a user turn and appended to any - * messages defined in the config. When `input` is already a `LDMessage[]` - * (e.g. when called from the Judge evaluation path) it is used as-is. + * Prepends any messages defined in the AI config (system prompt, + * instructions, etc.) before the user prompt. * - * @param input The user prompt string, or a pre-built message array. + * @param input The user prompt string. * @param outputType Optional JSON schema for structured output. When provided, * the response is parsed and exposed via {@link RunnerResult.parsed}. */ - async run(input: string | LDMessage[], outputType?: Record): Promise { - const messages: LDMessage[] = Array.isArray(input) - ? input - : [...(this._config.messages ?? []), { role: 'user', content: input }]; + async run(input: string, outputType?: Record): Promise { + const messages: LDMessage[] = [ + ...(this._config.messages ?? []), + { role: 'user', content: input }, + ]; if (outputType !== undefined) { return this._runStructured(messages, outputType); diff --git a/packages/ai-providers/server-ai-vercel/__tests__/VercelModelRunner.test.ts b/packages/ai-providers/server-ai-vercel/__tests__/VercelModelRunner.test.ts index a340dc5407..49313865f1 100644 --- a/packages/ai-providers/server-ai-vercel/__tests__/VercelModelRunner.test.ts +++ b/packages/ai-providers/server-ai-vercel/__tests__/VercelModelRunner.test.ts @@ -90,30 +90,6 @@ describe('VercelModelRunner', () => { expect(out.metrics.usage).toEqual({ total: 100, input: 40, output: 60 }); }); - it('uses a LDMessage[] directly without prepending config messages', async () => { - (generateText as jest.Mock).mockResolvedValue({ - text: 'direct', - usage: { totalTokens: 5, promptTokens: 2, completionTokens: 3 }, - }); - - const configWithMessages: LDAICompletionConfig = { - ...baseConfig, - messages: [{ role: 'system', content: 'Should not appear' }], - }; - const r = new VercelModelRunner(fakeModel as any, configWithMessages, {}, mockLogger); - const prebuilt = [ - { role: 'system' as const, content: 'Custom system' }, - { role: 'user' as const, content: 'Direct input' }, - ]; - await r.run(prebuilt); - - expect(generateText).toHaveBeenCalledWith({ - model: fakeModel, - messages: prebuilt, - experimental_telemetry: { isEnabled: true }, - }); - }); - it('returns success=false when generateText throws', async () => { const err = new Error('boom'); (generateText as jest.Mock).mockRejectedValue(err); diff --git a/packages/ai-providers/server-ai-vercel/src/VercelModelRunner.ts b/packages/ai-providers/server-ai-vercel/src/VercelModelRunner.ts index 61293cf0c9..54e7db3472 100644 --- a/packages/ai-providers/server-ai-vercel/src/VercelModelRunner.ts +++ b/packages/ai-providers/server-ai-vercel/src/VercelModelRunner.ts @@ -36,19 +36,20 @@ export class VercelModelRunner implements Runner { } /** - * Run the Vercel AI model with the given prompt. + * Run the Vercel AI model with the given user prompt. * - * @param input The user prompt string, or a pre-built message array. When a - * string is supplied the config's system messages are prepended automatically. - * When a `LDMessage[]` is supplied it is used as-is (config messages are NOT - * prepended — the caller is responsible for the full message list). + * Prepends any messages defined in the AI config (system prompt, etc.) before + * the user prompt. + * + * @param input The user prompt string. * @param outputType Optional JSON schema for structured output. When provided, * the parsed object is exposed via {@link RunnerResult.parsed}. */ - async run(input: string | LDMessage[], outputType?: Record): Promise { - const messages: LDMessage[] = Array.isArray(input) - ? input - : [...(this._config.messages ?? []), { role: 'user', content: input }]; + async run(input: string, outputType?: Record): Promise { + const messages: LDMessage[] = [ + ...(this._config.messages ?? []), + { role: 'user', content: input }, + ]; if (outputType !== undefined) { return this._runStructured(messages, outputType); diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index bd49305f4d..a242d371ed 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -2,10 +2,74 @@ import { LDLogger } from '@launchdarkly/js-server-sdk-common'; import { LDAIConfigTracker } from '../src/api/config/LDAIConfigTracker'; import { LDAIJudgeConfig, LDMessage } from '../src/api/config/types'; -import { Judge } from '../src/api/judge/Judge'; +import { Judge, stripLegacyJudgeMessages } from '../src/api/judge/Judge'; import { RunnerResult } from '../src/api/model/types'; import { Runner } from '../src/api/providers/Runner'; +describe('stripLegacyJudgeMessages', () => { + it('strips assistant messages containing {{message_history}}', () => { + const messages: LDMessage[] = [ + { role: 'system', content: 'You are a judge.' }, + { role: 'assistant', content: 'Here is the history: {{message_history}}' }, + ]; + const result = stripLegacyJudgeMessages(messages); + expect(result).toHaveLength(1); + expect(result[0].role).toBe('system'); + }); + + it('strips user messages containing {{response_to_evaluate}}', () => { + const messages: LDMessage[] = [ + { role: 'system', content: 'You are a judge.' }, + { role: 'user', content: 'Evaluate: {{response_to_evaluate}}' }, + ]; + const result = stripLegacyJudgeMessages(messages); + expect(result).toHaveLength(1); + expect(result[0].role).toBe('system'); + }); + + it('strips all legacy template messages from a typical legacy config', () => { + const messages: LDMessage[] = [ + { role: 'system', content: 'You are a judge.' }, + { role: 'assistant', content: '{{message_history}}' }, + { role: 'user', content: '{{response_to_evaluate}}' }, + ]; + const result = stripLegacyJudgeMessages(messages); + expect(result).toHaveLength(1); + expect(result[0].role).toBe('system'); + }); + + it('does not strip system messages even when they contain template variables', () => { + const messages: LDMessage[] = [ + { + role: 'system', + content: 'Judge using {{message_history}} and {{response_to_evaluate}}.', + }, + ]; + const result = stripLegacyJudgeMessages(messages); + expect(result).toHaveLength(1); + expect(result[0].role).toBe('system'); + }); + + it('leaves non-system messages without template variables untouched', () => { + const messages: LDMessage[] = [ + { role: 'system', content: 'You are a judge.' }, + { role: 'user', content: 'This is a regular message.' }, + ]; + const result = stripLegacyJudgeMessages(messages); + expect(result).toHaveLength(2); + }); + + it('returns an empty array for an empty input', () => { + expect(stripLegacyJudgeMessages([])).toEqual([]); + }); + + it('passes a new-style system-only config through unchanged', () => { + const messages: LDMessage[] = [{ role: 'system', content: 'You are a judge.' }]; + const result = stripLegacyJudgeMessages(messages); + expect(result).toEqual(messages); + }); +}); + describe('Judge', () => { let mockRunner: jest.Mocked; let mockTracker: jest.Mocked; @@ -37,14 +101,7 @@ describe('Judge', () => { judgeConfig = { key: 'test-judge', enabled: true, - messages: [ - { role: 'system', content: 'You are a helpful judge that evaluates AI responses.' }, - { - role: 'user', - content: - 'Evaluate and report scores for important metrics: Input: {{message_history}}, Output: {{response_to_evaluate}}', - }, - ], + messages: [{ role: 'system', content: 'You are a helpful judge that evaluates AI responses.' }], model: { name: 'gpt-4' }, provider: { name: 'openai' }, createTracker: () => mockTracker, @@ -161,21 +218,33 @@ describe('Judge', () => { }); expect(mockRunner.run).toHaveBeenCalledWith( - expect.arrayContaining([ - expect.objectContaining({ - role: 'system', - content: 'You are a helpful judge that evaluates AI responses.', - }), - expect.objectContaining({ - role: 'user', - content: - 'Evaluate and report scores for important metrics: Input: What is the capital of France?, Output: Paris is the capital of France.', - }), - ]), + 'MESSAGE HISTORY:\nWhat is the capital of France?\n\nRESPONSE TO EVALUATE:\nParis is the capital of France.', expect.any(Object), // evaluation schema ); }); + it('passes a string input to the runner (not a message list)', async () => { + const mockRunnerResult: RunnerResult = { + content: '', + parsed: { + score: 0.85, + reasoning: 'Good answer.', + }, + metrics: { success: true }, + }; + + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockRunner.run.mockResolvedValue(mockRunnerResult); + + await judge.evaluate('What is AI?', 'AI is artificial intelligence.'); + + expect(mockRunner.run).toHaveBeenCalledTimes(1); + const inputArg = mockRunner.run.mock.calls[0][0]; + expect(typeof inputArg).toBe('string'); + expect(inputArg).toContain('MESSAGE HISTORY:\nWhat is AI?'); + expect(inputArg).toContain('RESPONSE TO EVALUATE:\nAI is artificial intelligence.'); + }); + it('returns evaluation result with correct evaluationMetricKey for tracker integration', async () => { const mockRunnerResult: RunnerResult = { content: '', @@ -412,25 +481,29 @@ describe('Judge', () => { }); }); - it('returns error result when messages are missing', async () => { + it('proceeds (does not error early) when messages is undefined', async () => { const configWithoutMessages: LDAIJudgeConfig = { ...judgeConfig, messages: undefined, }; const judgeWithoutMessages = new Judge(configWithoutMessages, mockRunner, 1.0, mockLogger); + const mockRunnerResult: RunnerResult = { + content: '', + parsed: { + score: 0.7, + reasoning: 'Acceptable response.', + }, + metrics: { success: true }, + }; + mockTracker.trackMetricsOf.mockImplementation(async (extractor, func) => func()); + mockRunner.run.mockResolvedValue(mockRunnerResult); + const result = await judgeWithoutMessages.evaluate('test input', 'test output'); - expect(result).toEqual({ - success: false, - sampled: true, - errorMessage: 'Judge configuration must include messages', - judgeConfigKey: 'test-judge', - }); - expect(mockLogger.warn).toHaveBeenCalledWith( - 'Judge configuration must include messages', - mockTrackData, - ); + expect(result.sampled).toBe(true); + expect(result.success).toBe(true); + expect(mockRunner.run).toHaveBeenCalledTimes(1); }); it('returns result with success false when parsed is undefined or has no score/reasoning', async () => { @@ -588,17 +661,7 @@ describe('Judge', () => { }); expect(mockRunner.run).toHaveBeenCalledWith( - expect.arrayContaining([ - expect.objectContaining({ - role: 'system', - content: 'You are a helpful judge that evaluates AI responses.', - }), - expect.objectContaining({ - role: 'user', - content: - 'Evaluate and report scores for important metrics: Input: What is the capital of France?\r\nParis is the capital of France., Output: Paris is the capital of France.', - }), - ]), + 'MESSAGE HISTORY:\nWhat is the capital of France?\r\nParis is the capital of France.\n\nRESPONSE TO EVALUATE:\nParis is the capital of France.', expect.any(Object), // evaluation schema ); }); @@ -626,28 +689,19 @@ describe('Judge', () => { }); }); - describe('_constructEvaluationMessages', () => { + describe('_buildEvaluationInput', () => { let judge: Judge; beforeEach(() => { judge = new Judge(judgeConfig, mockRunner, 1.0, mockLogger); }); - it('constructs evaluation messages correctly', () => { + it('builds the evaluation string in the expected format', () => { // eslint-disable-next-line no-underscore-dangle - const constructMessages = (judge as any)._constructEvaluationMessages.bind(judge); - const messages = constructMessages('test input', 'test output'); + const buildInput = (judge as any)._buildEvaluationInput.bind(judge); + const input = buildInput('hello', 'world'); - expect(messages).toHaveLength(2); - expect(messages[0]).toEqual({ - role: 'system', - content: 'You are a helpful judge that evaluates AI responses.', - }); - expect(messages[1]).toEqual({ - role: 'user', - content: - 'Evaluate and report scores for important metrics: Input: test input, Output: test output', - }); + expect(input).toBe('MESSAGE HISTORY:\nhello\n\nRESPONSE TO EVALUATE:\nworld'); }); }); diff --git a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts index 0e5ca6de0e..a113514e82 100644 --- a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts @@ -12,8 +12,15 @@ import { LDAIClientImpl } from '../src/LDAIClientImpl'; import { LDClientMin } from '../src/LDClientMin'; import { aiSdkLanguage, aiSdkName, aiSdkVersion } from '../src/sdkInfo'; -// Mock Judge and RunnerFactory -jest.mock('../src/api/judge/Judge'); +// Mock Judge and RunnerFactory. Preserve the real `stripLegacyJudgeMessages` +// helper so the real `_judgeConfig` strip path can be exercised by tests. +jest.mock('../src/api/judge/Judge', () => { + const actual = jest.requireActual('../src/api/judge/Judge'); + return { + ...actual, + Judge: jest.fn(), + }; +}); jest.mock('../src/api/providers/RunnerFactory'); const mockLdClient: jest.Mocked = { @@ -184,7 +191,10 @@ describe('config evaluation', () => { const evaluateSpy = jest.spyOn(client as any, '_evaluate'); const result = await client.judgeConfig(key, testContext, defaultValue); - expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', { + message_history: '{{message_history}}', + response_to_evaluate: '{{response_to_evaluate}}', + }); // Should use first value from evaluationMetricKeys expect(result.evaluationMetricKey).toBe('relevance'); expect(result.createTracker).toBeDefined(); @@ -217,7 +227,10 @@ describe('config evaluation', () => { const evaluateSpy = jest.spyOn(client as any, '_evaluate'); const result = await client.judgeConfig(key, testContext, defaultValue); - expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', { + message_history: '{{message_history}}', + response_to_evaluate: '{{response_to_evaluate}}', + }); expect(result.evaluationMetricKey).toBe('relevance'); expect(result.createTracker).toBeDefined(); expect(result.enabled).toBe(true); @@ -250,7 +263,10 @@ describe('config evaluation', () => { const evaluateSpy = jest.spyOn(client as any, '_evaluate'); const result = await client.judgeConfig(key, testContext, defaultValue); - expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', { + message_history: '{{message_history}}', + response_to_evaluate: '{{response_to_evaluate}}', + }); expect(result.evaluationMetricKey).toBe('helpfulness'); expect(result.createTracker).toBeDefined(); expect(result.enabled).toBe(true); @@ -283,7 +299,10 @@ describe('config evaluation', () => { const evaluateSpy = jest.spyOn(client as any, '_evaluate'); const result = await client.judgeConfig(key, testContext, defaultValue); - expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', { + message_history: '{{message_history}}', + response_to_evaluate: '{{response_to_evaluate}}', + }); // Empty string should be treated as invalid, so should fall back to first value in evaluationMetricKeys expect(result.evaluationMetricKey).toBe('relevance'); expect(result.createTracker).toBeDefined(); @@ -316,7 +335,10 @@ describe('config evaluation', () => { const evaluateSpy = jest.spyOn(client as any, '_evaluate'); const result = await client.judgeConfig(key, testContext, defaultValue); - expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined); + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', { + message_history: '{{message_history}}', + response_to_evaluate: '{{response_to_evaluate}}', + }); // Should skip empty and whitespace strings, use first valid value expect(result.evaluationMetricKey).toBe('relevance'); expect(result.createTracker).toBeDefined(); @@ -622,8 +644,44 @@ describe('judgeConfig method', () => { key, 1, ); - expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', variables); - expect(result).toBe(mockJudgeConfig); + expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', { + ...variables, + message_history: '{{message_history}}', + response_to_evaluate: '{{response_to_evaluate}}', + }); + // System messages without legacy template variables pass through unchanged. + expect(result).toMatchObject(mockJudgeConfig); + expect(result.messages).toEqual(mockJudgeConfig.messages); + evaluateSpy.mockRestore(); + }); + + it('strips legacy judge template messages from the returned config', async () => { + const client = new LDAIClientImpl(mockLdClient); + const key = 'test-judge'; + const defaultValue: LDAIJudgeConfigDefault = { + enabled: false, + }; + + const mockJudgeConfig = { + enabled: true, + model: { name: 'gpt-4' }, + provider: { name: 'openai' }, + evaluationMetricKey: 'relevance', + messages: [ + { role: 'system' as const, content: 'You are a judge.' }, + { role: 'assistant' as const, content: '{{message_history}}' }, + { role: 'user' as const, content: 'Evaluate: {{response_to_evaluate}}' }, + ], + createTracker: () => ({}) as any, + toVercelAISDK: jest.fn(), + }; + + const evaluateSpy = jest.spyOn(client as any, '_evaluate'); + evaluateSpy.mockResolvedValue(mockJudgeConfig); + + const result = await client.judgeConfig(key, testContext, defaultValue); + + expect(result.messages).toEqual([{ role: 'system', content: 'You are a judge.' }]); evaluateSpy.mockRestore(); }); }); @@ -679,10 +737,7 @@ describe('createJudge method', () => { key, 1, ); - expect(judgeConfigSpy).toHaveBeenCalledWith(key, testContext, defaultValue, { - message_history: '{{message_history}}', - response_to_evaluate: '{{response_to_evaluate}}', - }); + expect(judgeConfigSpy).toHaveBeenCalledWith(key, testContext, defaultValue, undefined); expect(RunnerFactory.createModel).toHaveBeenCalledWith(mockJudgeConfig, undefined, undefined); expect(Judge).toHaveBeenCalledWith(mockJudgeConfig, mockProvider, 1.0, undefined); expect(result).toBe(mockJudge); diff --git a/packages/sdk/server-ai/src/LDAIClientImpl.ts b/packages/sdk/server-ai/src/LDAIClientImpl.ts index c655db2551..f94b43bc3d 100644 --- a/packages/sdk/server-ai/src/LDAIClientImpl.ts +++ b/packages/sdk/server-ai/src/LDAIClientImpl.ts @@ -24,7 +24,7 @@ import { import { LDAIConfigFlagValue, LDAIConfigUtils } from './api/config/LDAIConfigUtils'; import { AgentGraphDefinition, LDAgentGraphFlagValue, LDGraphTracker } from './api/graph'; import { Evaluator } from './api/judge/Evaluator'; -import { Judge } from './api/judge/Judge'; +import { Judge, stripLegacyJudgeMessages } from './api/judge/Judge'; import { LDAIClient } from './api/LDAIClient'; import { RunnerFactory, SupportedAIProvider } from './api/providers'; import { LDAIConfigTrackerImpl } from './LDAIConfigTrackerImpl'; @@ -239,8 +239,43 @@ export class LDAIClientImpl implements LDAIClient { defaultValue: LDAIJudgeConfigDefault, variables?: Record, ): Promise { - const config = await this._evaluate(key, context, defaultValue, 'judge', variables); - return config as LDAIJudgeConfig; + if (variables?.message_history !== undefined) { + this._logger?.warn( + "The variable 'message_history' is reserved by the judge and will be ignored.", + ); + } + if (variables?.response_to_evaluate !== undefined) { + this._logger?.warn( + "The variable 'response_to_evaluate' is reserved by the judge and will be ignored.", + ); + } + + // Re-inject the reserved variables as their literal placeholders so they + // survive Mustache interpolation in `_evaluate`. Without this, legacy + // templates like `{{message_history}}` get rendered to empty strings and + // `stripLegacyJudgeMessages` below cannot detect them. + const extendedVariables = { + ...variables, + message_history: '{{message_history}}', + response_to_evaluate: '{{response_to_evaluate}}', + }; + + const config = (await this._evaluate( + key, + context, + defaultValue, + 'judge', + extendedVariables, + )) as LDAIJudgeConfig; + + // Strip legacy judge template messages (containing {{message_history}} or + // {{response_to_evaluate}}) before returning the config. New-style configs + // omit these and rely on Judge._buildEvaluationInput. + if (config.messages) { + return { ...config, messages: stripLegacyJudgeMessages(config.messages) }; + } + + return config; } async judgeConfig( @@ -381,29 +416,11 @@ export class LDAIClientImpl implements LDAIClient { sampleRate: number = 1.0, ): Promise { try { - if (variables?.message_history !== undefined) { - this._logger?.warn( - "The variable 'message_history' is reserved by the judge and will be ignored.", - ); - } - if (variables?.response_to_evaluate !== undefined) { - this._logger?.warn( - "The variable 'response_to_evaluate' is reserved by the judge and will be ignored.", - ); - } - - // Overwrite reserved variables to ensure they remain as placeholders for judge evaluation - const extendedVariables = { - ...variables, - message_history: '{{message_history}}', - response_to_evaluate: '{{response_to_evaluate}}', - }; - const judgeConfig = await this._judgeConfig( key, context, defaultValue ?? disabledAIConfig, - extendedVariables, + variables, ); if (!judgeConfig.enabled) { diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index 35624cad30..86e34c8088 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -1,5 +1,3 @@ -import Mustache from 'mustache'; - import { LDLogger } from '@launchdarkly/js-server-sdk-common'; import { ChatResponse } from '../chat/types'; @@ -26,6 +24,27 @@ const EVALUATION_SCHEMA = { additionalProperties: false, } as const; +/** + * Remove legacy judge template messages from a message list. + * + * Strips any non-system message whose content contains `{{message_history}}` + * or `{{response_to_evaluate}}`. These were used by older judge configs to + * indicate where the SDK should interpolate the evaluated conversation; new + * configs omit them entirely and rely on the string input built by + * `Judge._buildEvaluationInput`. + * + * @param messages The raw message list from the judge AI config. + * @returns A new list with legacy template messages removed. + */ +export function stripLegacyJudgeMessages(messages: LDMessage[]): LDMessage[] { + return messages.filter( + (msg) => + msg.role === 'system' || + (!msg.content.includes('{{message_history}}') && + !msg.content.includes('{{response_to_evaluate}}')), + ); +} + /** * Judge implementation that handles evaluation functionality and conversation management. * @@ -105,13 +124,6 @@ export class Judge { return result; } - if (!this._aiConfig.messages) { - this._logger?.warn('Judge configuration must include messages', tracker.getTrackData()); - result.sampled = true; - result.errorMessage = 'Judge configuration must include messages'; - return result; - } - if (Math.random() > effectiveRate) { this._logger?.debug(`Judge evaluation skipped due to sampling rate: ${effectiveRate}`); return result; @@ -119,11 +131,11 @@ export class Judge { result.sampled = true; - const messages = this._constructEvaluationMessages(input, output); + const evaluationInput = this._buildEvaluationInput(input, output); const response = await tracker.trackMetricsOf( (r: RunnerResult) => r.metrics, - () => this._runner.run(messages, EVALUATION_SCHEMA), + () => this._runner.run(evaluationInput, EVALUATION_SCHEMA), ); const evalResult = this._parseEvaluationResponse(response.parsed); @@ -186,25 +198,13 @@ export class Judge { } /** - * Constructs evaluation messages by combining judge's config messages with input/output. - */ - private _constructEvaluationMessages(input: string, output: string): LDMessage[] { - const messages: LDMessage[] = this._aiConfig.messages!.map((msg) => ({ - ...msg, - content: this._interpolateMessage(msg.content, { - message_history: input, - response_to_evaluate: output, - }), - })); - - return messages; - } - - /** - * Interpolates message content with variables using Mustache templating. + * Builds the evaluation input string passed to the runner. + * + * Combines the original prompt and the response into a single, well-known + * format the judge model is expected to evaluate. */ - private _interpolateMessage(content: string, variables: Record): string { - return Mustache.render(content, variables, undefined, { escape: (item: any) => item }); + private _buildEvaluationInput(input: string, output: string): string { + return `MESSAGE HISTORY:\n${input}\n\nRESPONSE TO EVALUATE:\n${output}`; } /** diff --git a/packages/sdk/server-ai/src/api/providers/Runner.ts b/packages/sdk/server-ai/src/api/providers/Runner.ts index 42b43cd8ce..7a642275e8 100644 --- a/packages/sdk/server-ai/src/api/providers/Runner.ts +++ b/packages/sdk/server-ai/src/api/providers/Runner.ts @@ -1,4 +1,3 @@ -import { LDMessage } from '../config/types'; import { AgentGraphRunnerResult } from '../graph/types'; import { RunnerResult } from '../model/types'; @@ -11,15 +10,14 @@ import { RunnerResult } from '../model/types'; */ export interface Runner { /** - * Invoke the model with the given input. + * Invoke the model with the given input string. * - * @param input The input to the model. For agents this is a string prompt; - * for model completions and judges this is an array of messages. + * @param input The string input to the model. * @param outputType Optional JSON schema for structured output. When provided, * the model should return structured data accessible via `RunnerResult.parsed`. * @returns Promise resolving to a RunnerResult. */ - run(input: string | LDMessage[], outputType?: Record): Promise; + run(input: string, outputType?: Record): Promise; } /**