Skip to content

Commit c751ce6

Browse files
jsonbailey and claude committed
feat(server-sdk-ai): add Evaluator class for judge orchestration (AIC-1657)
Introduces `Evaluator` wrapping judges and JudgeConfiguration. The evaluator runs all configured judges in parallel, warns+skips on missing judge keys, and intentionally does NOT call tracker.trackJudgeResult — that responsibility belongs in the managed layer. Attaches Evaluator to LDAICompletionConfig and LDAIAgentConfig via createChat/createAgent. Adds Evaluator.noop() static factory. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 247ae7c commit c751ce6

5 files changed

Lines changed: 288 additions & 2 deletions

File tree

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import { LDAIJudgeConfig } from '../src/api/config/types';
2+
import { Evaluator } from '../src/api/judge/Evaluator';
3+
import { Judge } from '../src/api/judge/Judge';
4+
import { LDJudgeResult } from '../src/api/judge/types';
5+
import { AIProvider } from '../src/api/providers/AIProvider';
6+
7+
function makeJudgeConfig(key: string): LDAIJudgeConfig {
8+
return {
9+
key,
10+
enabled: true,
11+
evaluationMetricKey: '$ld:ai:judge:quality',
12+
messages: [{ role: 'system', content: 'You are a judge.' }],
13+
createTracker: () => ({}) as any,
14+
};
15+
}
16+
17+
function makeProvider(): jest.Mocked<AIProvider> {
18+
return {
19+
invokeModel: jest.fn(),
20+
invokeStructuredModel: jest.fn(),
21+
} as any;
22+
}
23+
24+
// Unit tests for Evaluator: the noop() factory, per-judge result collection,
// missing-judge skip behavior, error-result conversion, and the contract that
// the evaluator itself never calls tracker.trackJudgeResult.
describe('Evaluator', () => {
  describe('noop()', () => {
    it('returns an empty result array', async () => {
      const evaluator = Evaluator.noop();
      const results = await evaluator.evaluate('input', 'output');
      expect(results).toEqual([]);
    });

    it('has empty judges map', () => {
      const evaluator = Evaluator.noop();
      expect(evaluator.judges.size).toBe(0);
    });

    it('has empty judge configuration', () => {
      const evaluator = Evaluator.noop();
      expect(evaluator.judgeConfiguration.judges).toEqual([]);
    });
  });

  describe('evaluate()', () => {
    it('calls each configured judge and returns results', async () => {
      const mockProvider = makeProvider();
      const judgeConfig = makeJudgeConfig('judge-1');

      const mockResult: LDJudgeResult = {
        success: true,
        sampled: true,
        score: 0.9,
        reasoning: 'Good response',
        metricKey: '$ld:ai:judge:quality',
        judgeConfigKey: 'judge-1',
      };

      const judge = new Judge(judgeConfig, mockProvider);
      jest.spyOn(judge, 'evaluate').mockResolvedValue(mockResult);

      const judges = new Map([['judge-1', judge]]);
      const evaluator = new Evaluator(judges, { judges: [{ key: 'judge-1', samplingRate: 1.0 }] });

      const results = await evaluator.evaluate('user input', 'ai output');

      expect(results).toHaveLength(1);
      expect(results[0]).toEqual(mockResult);
      // The configured samplingRate (1.0) must be forwarded to the judge.
      expect(judge.evaluate).toHaveBeenCalledWith('user input', 'ai output', 1.0);
    });

    it('warns and skips when judge key is not found in judges map', async () => {
      const mockLogger = { warn: jest.fn(), debug: jest.fn(), info: jest.fn(), error: jest.fn() };
      const judges = new Map<string, Judge>();
      const evaluator = new Evaluator(
        judges,
        { judges: [{ key: 'missing-judge', samplingRate: 1.0 }] },
        mockLogger,
      );

      const results = await evaluator.evaluate('input', 'output');

      // The warning message must mention the missing key for debuggability.
      expect(mockLogger.warn).toHaveBeenCalledWith(expect.stringContaining('missing-judge'));
      // Missing judge is skipped (not an error result), so results array is empty
      expect(results).toEqual([]);
    });

    it('returns error result when judge throws', async () => {
      const mockProvider = makeProvider();
      const judgeConfig = makeJudgeConfig('judge-err');

      const judge = new Judge(judgeConfig, mockProvider);
      jest.spyOn(judge, 'evaluate').mockRejectedValue(new Error('evaluation error'));

      const judges = new Map([['judge-err', judge]]);
      const evaluator = new Evaluator(judges, {
        judges: [{ key: 'judge-err', samplingRate: 1.0 }],
      });

      const results = await evaluator.evaluate('input', 'output');

      // A thrown error is converted into a failed-but-sampled result, not a rejection.
      expect(results).toHaveLength(1);
      expect(results[0].success).toBe(false);
      expect(results[0].sampled).toBe(true);
      expect(results[0].errorMessage).toBe('evaluation error');
    });

    it('does NOT call tracker.trackJudgeResult', async () => {
      const mockProvider = makeProvider();
      const judgeConfig = makeJudgeConfig('judge-1');

      const mockResult: LDJudgeResult = {
        success: true,
        sampled: true,
        score: 0.8,
        reasoning: 'ok',
        metricKey: '$ld:ai:judge:quality',
      };

      const judge = new Judge(judgeConfig, mockProvider);
      jest.spyOn(judge, 'evaluate').mockResolvedValue(mockResult);

      const judges = new Map([['judge-1', judge]]);
      const evaluator = new Evaluator(judges, { judges: [{ key: 'judge-1', samplingRate: 1.0 }] });

      // No tracker — if Evaluator tried to call trackJudgeResult this would throw or fail
      await evaluator.evaluate('input', 'output');

      // Test passes if no error is thrown (no tracker involved)
      expect(true).toBe(true);
    });

    it('runs multiple judges in parallel and returns all results', async () => {
      // Local factory: a judge whose evaluate resolves with a fixed score.
      const makeJudge = (key: string, score: number): Judge => {
        const mockProvider = makeProvider();
        const jc = makeJudgeConfig(key);
        const j = new Judge(jc, mockProvider);
        jest.spyOn(j, 'evaluate').mockResolvedValue({
          success: true,
          sampled: true,
          score,
          reasoning: 'ok',
          metricKey: '$ld:ai:judge:quality',
        });
        return j;
      };

      const judges = new Map([
        ['judge-a', makeJudge('judge-a', 0.5)],
        ['judge-b', makeJudge('judge-b', 0.9)],
      ]);
      const evaluator = new Evaluator(judges, {
        judges: [
          { key: 'judge-a', samplingRate: 1.0 },
          { key: 'judge-b', samplingRate: 1.0 },
        ],
      });

      const results = await evaluator.evaluate('input', 'output');

      // Sort scores so the assertion is independent of completion order.
      expect(results).toHaveLength(2);
      const scores = results.map((r) => r.score).sort();
      expect(scores).toEqual([0.5, 0.9]);
    });
  });
});

packages/sdk/server-ai/src/LDAIClientImpl.ts

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import {
2222
} from './api/config';
2323
import { LDAIConfigFlagValue, LDAIConfigUtils } from './api/config/LDAIConfigUtils';
2424
import { AgentGraphDefinition, LDAgentGraphFlagValue, LDGraphTracker } from './api/graph';
25+
import { Evaluator } from './api/judge/Evaluator';
2526
import { Judge } from './api/judge/Judge';
2627
import { LDAIClient } from './api/LDAIClient';
2728
import { AIProviderFactory, SupportedAIProvider } from './api/providers';
@@ -170,6 +171,26 @@ export class LDAIClientImpl implements LDAIClient {
170171
return judges;
171172
}
172173

174+
private async _buildEvaluator(
175+
judgeConfigs: LDJudge[],
176+
context: LDContext,
177+
variables?: Record<string, unknown>,
178+
defaultAiProvider?: SupportedAIProvider,
179+
): Promise<Evaluator> {
180+
if (judgeConfigs.length === 0) {
181+
return Evaluator.noop();
182+
}
183+
184+
const judgesRecord = await this._initializeJudges(
185+
judgeConfigs,
186+
context,
187+
variables,
188+
defaultAiProvider,
189+
);
190+
const judgesMap = new Map<string, Judge>(Object.entries(judgesRecord));
191+
return new Evaluator(judgesMap, { judges: judgeConfigs }, this._logger);
192+
}
193+
173194
private async _completionConfig(
174195
key: string,
175196
context: LDContext,
@@ -318,14 +339,23 @@ export class LDAIClientImpl implements LDAIClient {
318339
return undefined;
319340
}
320341

321-
const judges = await this._initializeJudges(
342+
const evaluator = await this._buildEvaluator(
322343
config.judgeConfiguration?.judges ?? [],
323344
context,
324345
variables,
325346
defaultAiProvider,
326347
);
327348

328-
return new TrackedChat(config, provider, judges, this._logger);
349+
// Attach the evaluator to the config for use by the managed layer
350+
const configWithEvaluator: LDAICompletionConfig = { ...config, evaluator };
351+
352+
// Build the legacy judges record for TrackedChat backward compat
353+
const judges: Record<string, Judge> = {};
354+
evaluator.judges.forEach((judge, k) => {
355+
judges[k] = judge;
356+
});
357+
358+
return new TrackedChat(configWithEvaluator, provider, judges, this._logger);
329359
}
330360

331361
async createJudge(

packages/sdk/server-ai/src/api/config/types.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import type { Evaluator } from '../judge/Evaluator';
12
import { LDAIConfigTracker } from './LDAIConfigTracker';
23

34
// ============================================================================
@@ -220,6 +221,11 @@ export interface LDAIAgentConfig extends LDAIConfig {
220221
* Root-level tools map keyed by tool name. Distinct from model.parameters.tools[].
221222
*/
222223
tools?: { [toolName: string]: LDTool };
224+
/**
225+
* Evaluator for this agent config. Populated by createAgent.
226+
* Internal; not part of the flag value shape.
227+
*/
228+
evaluator?: Evaluator;
223229
}
224230

225231
/**
@@ -239,6 +245,11 @@ export interface LDAICompletionConfig extends LDAIConfig {
239245
* Root-level tools map keyed by tool name. Distinct from model.parameters.tools[].
240246
*/
241247
tools?: { [toolName: string]: LDTool };
248+
/**
249+
* Evaluator for this completion config. Populated by createChat/createModel.
250+
* Internal; not part of the flag value shape.
251+
*/
252+
evaluator?: Evaluator;
242253
}
243254

244255
/**
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import { LDLogger } from '@launchdarkly/js-server-sdk-common';
2+
3+
import { LDJudgeConfiguration } from '../config/types';
4+
import { Judge } from './Judge';
5+
import { LDJudgeResult } from './types';
6+
7+
/**
8+
* Wraps a collection of judges and a judge configuration, providing a single
9+
* `evaluate` method that runs all configured judges against a given input/output pair.
10+
*
11+
* The `Evaluator` is responsible only for running judges and returning results.
12+
* It does NOT call `tracker.trackJudgeResult` — that is the responsibility of
13+
* the managed layer (e.g., ManagedModel, ManagedAgent).
14+
*/
15+
export class Evaluator {
16+
constructor(
17+
readonly judges: Map<string, Judge>,
18+
readonly judgeConfiguration: LDJudgeConfiguration,
19+
private readonly _logger?: LDLogger,
20+
) {}
21+
22+
/**
23+
* Returns a no-op Evaluator that always resolves to an empty array.
24+
* Use this when no judges are configured.
25+
*/
26+
static noop(): Evaluator {
27+
return new Evaluator(new Map(), { judges: [] });
28+
}
29+
30+
/**
31+
* Evaluates the given input/output pair using all configured judges.
32+
* Missing judge instances are logged as warnings and skipped (not errors).
33+
*
34+
* @param input The input that was provided to the AI model.
35+
* @param output The output produced by the AI model.
36+
* @returns A promise resolving to an array of judge evaluation results.
37+
*/
38+
async evaluate(input: string, output: string): Promise<LDJudgeResult[]> {
39+
if (this.judgeConfiguration.judges.length === 0) {
40+
return [];
41+
}
42+
43+
const evaluationPromises = this.judgeConfiguration.judges.map(async (judgeConfig) => {
44+
const judge = this.judges.get(judgeConfig.key);
45+
if (!judge) {
46+
this._logger?.warn(`Judge configuration is not enabled for ${judgeConfig.key}`);
47+
// Skip — missing judge is a warning, not an error result
48+
return null;
49+
}
50+
51+
try {
52+
return await judge.evaluate(input, output, judgeConfig.samplingRate);
53+
} catch (err) {
54+
const result: LDJudgeResult = {
55+
success: false,
56+
sampled: true,
57+
errorMessage: err instanceof Error ? err.message : 'Unknown error',
58+
};
59+
return result;
60+
}
61+
});
62+
63+
const settled = await Promise.allSettled(evaluationPromises);
64+
65+
const results: LDJudgeResult[] = [];
66+
settled.forEach((item) => {
67+
if (item.status === 'fulfilled' && item.value !== null) {
68+
results.push(item.value);
69+
} else if (item.status === 'rejected') {
70+
results.push({
71+
success: false,
72+
sampled: true,
73+
errorMessage: 'Judge evaluation failed unexpectedly',
74+
});
75+
}
76+
});
77+
78+
return results;
79+
}
80+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
// Public entry points for the judge module: the Judge and Evaluator classes,
// plus the result/response types (type-only re-export, erased at compile time).
export { Judge } from './Judge';
export { Evaluator } from './Evaluator';
export type { LDJudgeResult, StructuredResponse } from './types';

0 commit comments

Comments
 (0)