diff --git a/packages/sdk/server-ai/__tests__/Evaluator.test.ts b/packages/sdk/server-ai/__tests__/Evaluator.test.ts
new file mode 100644
index 0000000000..730b77eb0c
--- /dev/null
+++ b/packages/sdk/server-ai/__tests__/Evaluator.test.ts
@@ -0,0 +1,108 @@
+import { LDAIJudgeConfig } from '../src/api/config/types';
+import { Evaluator } from '../src/api/judge/Evaluator';
+import { Judge } from '../src/api/judge/Judge';
+import { LDJudgeResult } from '../src/api/judge/types';
+import { AIProvider } from '../src/api/providers/AIProvider';
+
+function makeJudgeConfig(key: string): LDAIJudgeConfig {
+  return {
+    key,
+    enabled: true,
+    evaluationMetricKey: '$ld:ai:judge:quality',
+    messages: [{ role: 'system', content: 'You are a judge.' }],
+    createTracker: () => ({}) as any,
+  };
+}
+
+function makeProvider(): jest.Mocked<AIProvider> {
+  return {
+    invokeModel: jest.fn(),
+    invokeStructuredModel: jest.fn(),
+  } as any;
+}
+
+describe('Evaluator', () => {
+  describe('noop()', () => {
+    it('returns an empty result array', async () => {
+      const evaluator = Evaluator.noop();
+      const results = await evaluator.evaluate('input', 'output');
+      expect(results).toEqual([]);
+    });
+  });
+
+  describe('evaluate()', () => {
+    it('calls each configured judge and returns results', async () => {
+      const mockProvider = makeProvider();
+      const judgeConfig = makeJudgeConfig('judge-1');
+
+      const mockResult: LDJudgeResult = {
+        success: true,
+        sampled: true,
+        score: 0.9,
+        reasoning: 'Good response',
+        metricKey: '$ld:ai:judge:quality',
+        judgeConfigKey: 'judge-1',
+      };
+
+      const judge = new Judge(judgeConfig, mockProvider, 1.0);
+      jest.spyOn(judge, 'evaluate').mockResolvedValue(mockResult);
+
+      const evaluator = new Evaluator([judge]);
+
+      const results = await evaluator.evaluate('user input', 'ai output');
+
+      expect(results).toHaveLength(1);
+      expect(results[0]).toEqual(mockResult);
+      // Evaluator does not pass a per-call samplingRate — judge uses its own.
+      expect(judge.evaluate).toHaveBeenCalledWith('user input', 'ai output');
+    });
+
+    it('does NOT call tracker.trackJudgeResult', async () => {
+      const mockProvider = makeProvider();
+      const judgeConfig = makeJudgeConfig('judge-1');
+
+      const mockResult: LDJudgeResult = {
+        success: true,
+        sampled: true,
+        score: 0.8,
+        reasoning: 'ok',
+        metricKey: '$ld:ai:judge:quality',
+      };
+
+      const judge = new Judge(judgeConfig, mockProvider, 1.0);
+      jest.spyOn(judge, 'evaluate').mockResolvedValue(mockResult);
+
+      const evaluator = new Evaluator([judge]);
+
+      // No tracker — if Evaluator tried to call trackJudgeResult this would throw or fail
+      await evaluator.evaluate('input', 'output');
+
+      // Test passes if no error is thrown (no tracker involved)
+      expect(true).toBe(true);
+    });
+
+    it('runs multiple judges in parallel and returns all results', async () => {
+      const makeJudge = (key: string, score: number): Judge => {
+        const mockProvider = makeProvider();
+        const jc = makeJudgeConfig(key);
+        const j = new Judge(jc, mockProvider, 1.0);
+        jest.spyOn(j, 'evaluate').mockResolvedValue({
+          success: true,
+          sampled: true,
+          score,
+          reasoning: 'ok',
+          metricKey: '$ld:ai:judge:quality',
+        });
+        return j;
+      };
+
+      const evaluator = new Evaluator([makeJudge('judge-a', 0.5), makeJudge('judge-b', 0.9)]);
+
+      const results = await evaluator.evaluate('input', 'output');
+
+      expect(results).toHaveLength(2);
+      const scores = results.map((r) => r.score).sort();
+      expect(scores).toEqual([0.5, 0.9]);
+    });
+  });
+});
diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts
index 43ea75e0ab..044ecd1f6d 100644
--- a/packages/sdk/server-ai/__tests__/Judge.test.ts
+++ b/packages/sdk/server-ai/__tests__/Judge.test.ts
@@ -54,17 +54,76 @@ describe('Judge', () => {
   describe('constructor', () => {
     it('initializes with proper configuration', () => {
-      const judge = new Judge(judgeConfig, mockProvider, mockLogger);
+      const judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
       expect(judge).toBeDefined();
     });
+
+    it('defaults sampleRate to 1.0 when omitted', () => {
+      const judge = new Judge(judgeConfig, mockProvider);
+      expect(judge.sampleRate).toBe(1.0);
+    });
+
+    it('exposes the sampleRate provided to the constructor', () => {
+      const judge = new Judge(judgeConfig, mockProvider, 0.25, mockLogger);
+      expect(judge.sampleRate).toBe(0.25);
+    });
+
+    it('honors a sampleRate of 0', () => {
+      const judge = new Judge(judgeConfig, mockProvider, 0, mockLogger);
+      expect(judge.sampleRate).toBe(0);
+    });
   });
+
+  describe('sampling fallback in evaluate()', () => {
+    it('uses the constructor sampleRate when no per-call rate is supplied', async () => {
+      // Force sampling to skip: Math.random() returns 0.6, sampleRate 0.5 → 0.6 > 0.5 → skip.
+      const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.6);
+
+      const judge = new Judge(judgeConfig, mockProvider, 0.5, mockLogger);
+      const result = await judge.evaluate('input', 'output');
+
+      // Skipped due to sampling: sampled stays false (default), no provider call.
+      expect(result.sampled).toBe(false);
+      expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
+
+      randomSpy.mockRestore();
+    });
+
+    it('honors an explicit per-call samplingRate of 0 over the constructor default', async () => {
+      // The skip check is `Math.random() > rate`. With rate=0, a mocked random of 0 would
+      // give `0 > 0`, which is false, so the call would NOT be skipped. Mock random=0.5
+      // so that `0.5 > 0` exercises the skip path.
+      const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.5);
+
+      // Constructor rate is 1.0 (would normally always sample); per-call 0 overrides to skip.
+      const judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
+      const result = await judge.evaluate('input', 'output', 0);
+
+      expect(result.sampled).toBe(false);
+      expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
+
+      randomSpy.mockRestore();
+    });
+
+    it('per-call samplingRate of undefined falls through to the constructor default', async () => {
+      // Constructor 0 (always skip), per-call undefined → effective rate 0.
+      const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.5);
+
+      const judge = new Judge(judgeConfig, mockProvider, 0, mockLogger);
+      const result = await judge.evaluate('input', 'output', undefined);
+
+      expect(result.sampled).toBe(false);
+      expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
+
+      randomSpy.mockRestore();
+    });
+  });
 
   describe('evaluate', () => {
     let judge: Judge;
 
     beforeEach(() => {
-      judge = new Judge(judgeConfig, mockProvider, mockLogger);
+      judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
     });
 
     it('evaluates AI response successfully', async () => {
@@ -205,7 +264,7 @@ describe('Judge', () => {
       evaluationMetricKey: undefined,
       evaluationMetricKeys: [],
     };
-    const judgeWithoutMetrics = new Judge(configWithoutMetrics, mockProvider, mockLogger);
+    const judgeWithoutMetrics = new Judge(configWithoutMetrics, mockProvider, 1.0, mockLogger);
 
     const result = await judgeWithoutMetrics.evaluate('test input', 'test output');
@@ -227,7 +286,7 @@ describe('Judge', () => {
       evaluationMetricKey: 'relevance',
       evaluationMetricKeys: undefined,
     };
-    const judgeWithSingleKey = new Judge(configWithSingleKey, mockProvider, mockLogger);
+    const judgeWithSingleKey = new Judge(configWithSingleKey, mockProvider, 1.0, mockLogger);
 
     const mockStructuredResponse: StructuredResponse = {
       data: {
@@ -265,7 +324,7 @@ describe('Judge', () => {
       evaluationMetricKey: undefined,
       evaluationMetricKeys: ['relevance', 'accuracy'],
     };
-    const judgeWithLegacyKeys = new Judge(configWithLegacyKeys, mockProvider, mockLogger);
+    const judgeWithLegacyKeys = new Judge(configWithLegacyKeys, mockProvider, 1.0, mockLogger);
 
     const mockStructuredResponse: StructuredResponse = {
       data: {
@@ -303,7 +362,7 @@ describe('Judge', () => {
       evaluationMetricKey: undefined,
       evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'],
     };
-    const judgeWithInvalidKeys = new Judge(configWithInvalidKeys, mockProvider, mockLogger);
+    const judgeWithInvalidKeys = new Judge(configWithInvalidKeys, mockProvider, 1.0, mockLogger);
 
     const mockStructuredResponse: StructuredResponse = {
       data: {
@@ -342,7 +401,7 @@ describe('Judge', () => {
       evaluationMetricKey: 'helpfulness',
       evaluationMetricKeys: ['relevance', 'accuracy'],
     };
-    const judgeWithBoth = new Judge(configWithBoth, mockProvider, mockLogger);
+    const judgeWithBoth = new Judge(configWithBoth, mockProvider, 1.0, mockLogger);
 
     const mockStructuredResponse: StructuredResponse = {
       data: {
@@ -379,7 +438,7 @@ describe('Judge', () => {
       ...judgeConfig,
       messages: undefined,
     };
-    const judgeWithoutMessages = new Judge(configWithoutMessages, mockProvider, mockLogger);
+    const judgeWithoutMessages = new Judge(configWithoutMessages, mockProvider, 1.0, mockLogger);
 
     const result = await judgeWithoutMessages.evaluate('test input', 'test output');
 
@@ -488,7 +547,7 @@ describe('Judge', () => {
     let judge: Judge;
 
     beforeEach(() => {
-      judge = new Judge(judgeConfig, mockProvider, mockLogger);
+      judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
     });
 
     it('evaluates messages and response successfully', async () => {
@@ -573,7 +632,7 @@ describe('Judge', () => {
     let judge: Judge;
 
     beforeEach(() => {
-      judge = new Judge(judgeConfig, mockProvider, mockLogger);
+      judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
     });
 
     it('constructs evaluation messages correctly', () => {
@@ -598,7 +657,7 @@ describe('Judge', () => {
     let judge: Judge;
 
     beforeEach(() => {
-      judge = new Judge(judgeConfig, mockProvider, mockLogger);
+      judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
    });
 
     it('parses valid evaluation response correctly', () => {
@@ -669,7 +728,7 @@ describe('Judge', () => {
       evaluationMetricKey: undefined,
       evaluationMetricKeys: [],
     };
-    const judgeWithEmptyKeys = new Judge(configWithEmptyKeys, mockProvider, mockLogger);
+    const judgeWithEmptyKeys = new Judge(configWithEmptyKeys, mockProvider, 1.0, mockLogger);
 
     const result = await judgeWithEmptyKeys.evaluate('test input', 'test output');
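The sampling-fallback tests above all pin down one detail: the effective rate must be resolved with nullish coalescing, not truthiness, so an explicit per-call `0` still overrides a non-zero constructor default. A minimal standalone sketch of the rule they encode (illustration only, not SDK code):

```ts
// Effective-rate resolution as exercised by the tests above (illustrative sketch).
function effectiveRate(perCall: number | undefined, constructorDefault: number): number {
  // `??` only falls through on null/undefined; `||` would also fall through on 0.
  return perCall ?? constructorDefault;
}

// Mirrors the skip check in Judge.evaluate(): skip when a random draw exceeds the rate.
function shouldSkip(perCall: number | undefined, constructorDefault: number): boolean {
  return Math.random() > effectiveRate(perCall, constructorDefault);
}

// effectiveRate(0, 1.0)         === 0    (explicit per-call 0 wins)
// effectiveRate(undefined, 0.5) === 0.5  (falls through to the constructor default)
```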
diff --git a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts
index 78ffa0baf7..15061ba8e5 100644
--- a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts
+++ b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts
@@ -673,7 +673,7 @@ describe('createJudge method', () => {
       response_to_evaluate: '{{response_to_evaluate}}',
     });
     expect(AIProviderFactory.create).toHaveBeenCalledWith(mockJudgeConfig, undefined, undefined);
-    expect(Judge).toHaveBeenCalledWith(mockJudgeConfig, mockProvider, undefined);
+    expect(Judge).toHaveBeenCalledWith(mockJudgeConfig, mockProvider, 1.0, undefined);
     expect(result).toBe(mockJudge);
     judgeConfigSpy.mockRestore();
   });
diff --git a/packages/sdk/server-ai/src/LDAIClientImpl.ts b/packages/sdk/server-ai/src/LDAIClientImpl.ts
index 316a84e56d..16248fe564 100644
--- a/packages/sdk/server-ai/src/LDAIClientImpl.ts
+++ b/packages/sdk/server-ai/src/LDAIClientImpl.ts
@@ -22,6 +22,7 @@ import {
 } from './api/config';
 import { LDAIConfigFlagValue, LDAIConfigUtils } from './api/config/LDAIConfigUtils';
 import { AgentGraphDefinition, LDAgentGraphFlagValue, LDGraphTracker } from './api/graph';
+import { Evaluator } from './api/judge/Evaluator';
 import { Judge } from './api/judge/Judge';
 import { LDAIClient } from './api/LDAIClient';
 import { AIProviderFactory, SupportedAIProvider } from './api/providers';
@@ -141,33 +142,32 @@ export class LDAIClientImpl implements LDAIClient {
     return config;
   }
 
-  private async _initializeJudges(
+  private async _buildEvaluator(
     judgeConfigs: LDJudge[],
     context: LDContext,
     variables?: Record<string, unknown>,
     defaultAiProvider?: SupportedAIProvider,
-  ): Promise<Record<string, Judge>> {
-    const judges: Record<string, Judge> = {};
-
-    const judgePromises = judgeConfigs.map(async (judgeConfig) => {
-      const judge = await this.createJudge(
-        judgeConfig.key,
-        context,
-        undefined,
-        variables,
-        defaultAiProvider,
-      );
-      return judge ? { key: judgeConfig.key, judge } : null;
-    });
-
-    const results = await Promise.all(judgePromises);
-    results.forEach((result) => {
-      if (result) {
-        judges[result.key] = result.judge;
-      }
-    });
+  ): Promise<Evaluator> {
+    if (judgeConfigs.length === 0) {
+      return Evaluator.noop();
+    }
 
-    return judges;
+    const judgeInstances = (
+      await Promise.all(
+        judgeConfigs.map((jc) =>
+          this._createJudgeInstance(
+            jc.key,
+            context,
+            undefined,
+            variables,
+            defaultAiProvider,
+            jc.samplingRate,
+          ),
+        ),
+      )
+    ).filter((j): j is Judge => j !== undefined);
+
+    return new Evaluator(judgeInstances);
   }
 
   private async _completionConfig(
@@ -318,14 +318,17 @@ export class LDAIClientImpl implements LDAIClient {
       return undefined;
     }
 
-    const judges = await this._initializeJudges(
+    const evaluator = await this._buildEvaluator(
       config.judgeConfiguration?.judges ?? [],
       context,
       variables,
       defaultAiProvider,
     );
 
-    return new TrackedChat(config, provider, judges, this._logger);
+    // Attach the evaluator to the config for use by the managed layer
+    const configWithEvaluator: LDAICompletionConfig = { ...config, evaluator };
+
+    return new TrackedChat(configWithEvaluator, provider, {}, this._logger);
   }
 
   async createJudge(
@@ -334,9 +337,27 @@ export class LDAIClientImpl implements LDAIClient {
     defaultValue?: LDAIJudgeConfigDefault,
     variables?: Record<string, unknown>,
     defaultAiProvider?: SupportedAIProvider,
+    sampleRate: number = 1.0,
   ): Promise<Judge | undefined> {
     this._ldClient.track(TRACK_USAGE_CREATE_JUDGE, context, key, 1);
 
+    return this._createJudgeInstance(
+      key,
+      context,
+      defaultValue,
+      variables,
+      defaultAiProvider,
+      sampleRate,
+    );
+  }
+
+  private async _createJudgeInstance(
+    key: string,
+    context: LDContext,
+    defaultValue?: LDAIJudgeConfigDefault,
+    variables?: Record<string, unknown>,
+    defaultAiProvider?: SupportedAIProvider,
+    sampleRate: number = 1.0,
+  ): Promise<Judge | undefined> {
     try {
       if (variables?.message_history !== undefined) {
         this._logger?.warn(
@@ -373,7 +394,7 @@ export class LDAIClientImpl implements LDAIClient {
         return undefined;
       }
 
-      return new Judge(judgeConfig, provider, this._logger);
+      return new Judge(judgeConfig, provider, sampleRate, this._logger);
     } catch (error) {
       this._logger?.error(`Failed to initialize judge ${key}:`, error);
       return undefined;
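With the wiring above in place, the public `createJudge` surface gains the trailing `sampleRate` parameter declared in `LDAIClient.ts` below. A hedged usage sketch, with imports elided; `aiClient`, `context`, and the flag key are assumptions for illustration, not values from this diff:

```ts
// Sketch only: assumes an initialized `aiClient` (LDAIClient) and a `context`
// (LDContext) from the surrounding application.
async function sampledJudgeExample(aiClient: LDAIClient, context: LDContext) {
  const judge = await aiClient.createJudge(
    'my-judge-flag', // illustrative flag key
    context,
    undefined, // defaultValue
    undefined, // variables
    undefined, // defaultAiProvider
    0.1, // sampleRate baked into the Judge
  );

  // Uses the baked-in 0.1 rate (no per-call rate supplied).
  await judge?.evaluate('user input', 'ai output');

  // Explicit per-call override: evaluate this call unconditionally.
  await judge?.evaluate('user input', 'ai output', 1);
}
```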
diff --git a/packages/sdk/server-ai/src/api/LDAIClient.ts b/packages/sdk/server-ai/src/api/LDAIClient.ts
index 5dfec98072..fa8170e0eb 100644
--- a/packages/sdk/server-ai/src/api/LDAIClient.ts
+++ b/packages/sdk/server-ai/src/api/LDAIClient.ts
@@ -297,6 +297,8 @@ export interface LDAIClient {
    * @param variables Dictionary of values for instruction interpolation.
    * The variables `message_history` and `response_to_evaluate` are reserved for the judge and will be ignored.
    * @param defaultAiProvider Optional default AI provider to use.
+   * @param sampleRate Optional default sampling rate (0-1) baked into the Judge.
+   * Used by `Judge.evaluate()` when no per-call rate is supplied. Defaults to 1.0.
    * @returns Promise that resolves to a Judge instance or undefined if disabled/unsupported
    *
    * @example
@@ -326,6 +328,7 @@ export interface LDAIClient {
     defaultValue?: LDAIJudgeConfigDefault,
     variables?: Record<string, unknown>,
     defaultAiProvider?: SupportedAIProvider,
+    sampleRate?: number,
   ): Promise<Judge | undefined>;
 
   /**
diff --git a/packages/sdk/server-ai/src/api/config/types.ts b/packages/sdk/server-ai/src/api/config/types.ts
index d1c2a161e2..159cad8f83 100644
--- a/packages/sdk/server-ai/src/api/config/types.ts
+++ b/packages/sdk/server-ai/src/api/config/types.ts
@@ -1,3 +1,4 @@
+import type { Evaluator } from '../judge/Evaluator';
 import { LDAIConfigTracker } from './LDAIConfigTracker';
 
 // ============================================================================
@@ -220,6 +221,13 @@ export interface LDAIAgentConfig extends LDAIConfig {
    * Root-level tools map keyed by tool name. Distinct from model.parameters.tools[].
    */
   tools?: { [toolName: string]: LDTool };
+  /**
+   * Evaluator for this agent config. Populated by createAgent.
+   * Not part of the flag value shape.
+   *
+   * @internal
+   */
+  evaluator?: Evaluator;
 }
 
 /**
@@ -239,6 +247,13 @@ export interface LDAICompletionConfig extends LDAIConfig {
    * Root-level tools map keyed by tool name. Distinct from model.parameters.tools[].
    */
   tools?: { [toolName: string]: LDTool };
+  /**
+   * Evaluator for this completion config. Populated by createChat/createModel.
+   * Not part of the flag value shape.
+   *
+   * @internal
+   */
+  evaluator?: Evaluator;
 }
 
 /**
diff --git a/packages/sdk/server-ai/src/api/judge/Evaluator.ts b/packages/sdk/server-ai/src/api/judge/Evaluator.ts
new file mode 100644
index 0000000000..8d596364ad
--- /dev/null
+++ b/packages/sdk/server-ai/src/api/judge/Evaluator.ts
@@ -0,0 +1,24 @@
+import { Judge } from './Judge';
+import { LDJudgeResult } from './types';
+
+/**
+ * Wraps a collection of judges, providing a single `evaluate` method that
+ * runs all judges against a given input/output pair.
+ *
+ * @internal
+ */
+export class Evaluator {
+  constructor(private readonly _judges: Judge[]) {}
+
+  static noop(): Evaluator {
+    return new Evaluator([]);
+  }
+
+  async evaluate(input: string, output: string): Promise<LDJudgeResult[]> {
+    if (this._judges.length === 0) {
+      return [];
+    }
+
+    return Promise.all(this._judges.map((judge) => judge.evaluate(input, output)));
+  }
+}
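Since `Evaluator` is the new seam between AI configs and their judges, here is a short sketch of the intended call pattern (judge construction elided; `judgeA` and `judgeB` are assumed `Judge` instances built elsewhere):

```ts
// Sketch only: `judgeA` and `judgeB` are assumed Judge instances built elsewhere.
async function evaluatorExample(judgeA: Judge, judgeB: Judge) {
  // Fan out one input/output pair to every judge; Promise.all preserves order.
  const evaluator = new Evaluator([judgeA, judgeB]);
  const results = await evaluator.evaluate('user input', 'ai output'); // LDJudgeResult[]

  // The noop evaluator keeps call sites free of null checks: same shape, no work.
  const none = await Evaluator.noop().evaluate('user input', 'ai output'); // []
  return { results, none };
}
```

Note that `Evaluator.evaluate` passes no per-call sampling rate, so each judge applies its own baked-in rate, which is exactly what the `Evaluator.test.ts` assertions above check.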
diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts
index ef49e3b723..820014ffaa 100644
--- a/packages/sdk/server-ai/src/api/judge/Judge.ts
+++ b/packages/sdk/server-ai/src/api/judge/Judge.ts
@@ -37,11 +37,20 @@ export class Judge {
   constructor(
     private readonly _aiConfig: LDAIJudgeConfig,
     private readonly _aiProvider: AIProvider,
+    private readonly _sampleRate: number = 1.0,
     logger?: LDLogger,
   ) {
     this._logger = logger;
   }
 
+  /**
+   * The default sampling rate baked in at construction. Used by `evaluate` /
+   * `evaluateMessages` when no per-call rate is supplied.
+   */
+  get sampleRate(): number {
+    return this._sampleRate;
+  }
+
   /**
    * Gets the evaluation metric key, prioritizing evaluationMetricKey over evaluationMetricKeys.
    * Falls back to the first valid (non-empty, non-whitespace) value in evaluationMetricKeys if evaluationMetricKey is not provided.
@@ -69,10 +78,13 @@
    *
    * @param input The input prompt or question that was provided to the AI
    * @param output The AI-generated response to be evaluated
-   * @param samplingRate Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
+   * @param samplingRate Sampling rate (0-1) to determine if evaluation should be processed.
+   * When omitted, the Judge's constructor-default rate is used. An explicit `0` overrides
+   * the default — only `undefined` falls through.
    * @returns Promise that resolves to evaluation results
    */
-  async evaluate(input: string, output: string, samplingRate: number = 1): Promise<LDJudgeResult> {
+  async evaluate(input: string, output: string, samplingRate?: number): Promise<LDJudgeResult> {
+    const effectiveRate = samplingRate ?? this._sampleRate;
     const result: LDJudgeResult = {
       success: false,
       sampled: false,
@@ -99,8 +111,8 @@
       return result;
     }
 
-    if (Math.random() > samplingRate) {
-      this._logger?.debug(`Judge evaluation skipped due to sampling rate: ${samplingRate}`);
+    if (Math.random() > effectiveRate) {
+      this._logger?.debug(`Judge evaluation skipped due to sampling rate: ${effectiveRate}`);
       return result;
     }
 
@@ -143,13 +155,14 @@
    *
    * @param messages Array of messages representing the conversation history
    * @param response The AI response to be evaluated
-   * @param samplingRatio Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
+   * @param samplingRatio Sampling ratio (0-1). When omitted, the Judge's
+   * constructor-default rate is used.
    * @returns Promise that resolves to evaluation results
    */
   async evaluateMessages(
     messages: LDMessage[],
     response: ChatResponse,
-    samplingRatio: number = 1,
+    samplingRatio?: number,
   ): Promise<LDJudgeResult> {
     const input = messages.length === 0 ? '' : messages.map((msg) => msg.content).join('\r\n');
     const output = response.message.content;
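Finally, a sketch of how `evaluateMessages` flattens a conversation before judging, per the `join('\r\n')` above. This assumes a `judge` built as in the earlier sketch and a `ChatResponse` obtained from a previous model call; the message roles and content are illustrative:

```ts
// Sketch only: `judge` and `response` are assumed to come from elsewhere.
async function evaluateMessagesExample(judge: Judge, response: ChatResponse) {
  const messages: LDMessage[] = [
    { role: 'user', content: 'What is feature flagging?' },
    { role: 'assistant', content: 'A way to toggle functionality at runtime.' },
  ];

  // input becomes the message contents joined with '\r\n':
  //   'What is feature flagging?\r\nA way to toggle functionality at runtime.'
  // output is response.message.content. With no samplingRatio argument, the
  // judge's constructor-default rate applies, per the fallback documented above.
  return judge.evaluateMessages(messages, response);
}
```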