Skip to content

Commit f26c4cc

Browse files
authored
feat: add Evaluator class for judge orchestration (#1331)
1 parent 8a0de8e commit f26c4cc

8 files changed

Lines changed: 287 additions & 44 deletions

File tree

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import { LDAIJudgeConfig } from '../src/api/config/types';
2+
import { Evaluator } from '../src/api/judge/Evaluator';
3+
import { Judge } from '../src/api/judge/Judge';
4+
import { LDJudgeResult } from '../src/api/judge/types';
5+
import { AIProvider } from '../src/api/providers/AIProvider';
6+
7+
/**
 * Builds a minimal, enabled judge config for use in these tests.
 *
 * @param key Config key assigned to the returned judge config.
 * @returns An LDAIJudgeConfig with a fixed quality metric key, a single system message,
 * and a `createTracker` stub that returns an empty object cast to `any`.
 */
function makeJudgeConfig(key: string): LDAIJudgeConfig {
8+
return {
9+
key,
10+
enabled: true,
11+
evaluationMetricKey: '$ld:ai:judge:quality',
12+
messages: [{ role: 'system', content: 'You are a judge.' }],
13+
createTracker: () => ({}) as any,
14+
};
15+
}
16+
17+
/**
 * Creates a mocked AIProvider for these tests.
 *
 * Only `invokeModel` and `invokeStructuredModel` are supplied, each as a bare `jest.fn()`;
 * the literal is cast through `any` to satisfy the `jest.Mocked<AIProvider>` return type.
 */
function makeProvider(): jest.Mocked<AIProvider> {
18+
return {
19+
invokeModel: jest.fn(),
20+
invokeStructuredModel: jest.fn(),
21+
} as any;
22+
}
23+
24+
// Unit tests for Evaluator. Each Judge's `evaluate` is stubbed with jest.spyOn, so these
// tests only cover how Evaluator invokes its judges and collects their results.
describe('Evaluator', () => {
25+
describe('noop()', () => {
26+
it('returns an empty result array', async () => {
27+
const evaluator = Evaluator.noop();
28+
const results = await evaluator.evaluate('input', 'output');
29+
expect(results).toEqual([]);
30+
});
31+
});
32+
33+
describe('evaluate()', () => {
34+
it('calls each configured judge and returns results', async () => {
35+
const mockProvider = makeProvider();
36+
const judgeConfig = makeJudgeConfig('judge-1');
37+
38+
const mockResult: LDJudgeResult = {
39+
success: true,
40+
sampled: true,
41+
score: 0.9,
42+
reasoning: 'Good response',
43+
metricKey: '$ld:ai:judge:quality',
44+
judgeConfigKey: 'judge-1',
45+
};
46+
47+
const judge = new Judge(judgeConfig, mockProvider, 1.0);
48+
jest.spyOn(judge, 'evaluate').mockResolvedValue(mockResult);
49+
50+
const evaluator = new Evaluator([judge]);
51+
52+
const results = await evaluator.evaluate('user input', 'ai output');
53+
54+
expect(results).toHaveLength(1);
55+
expect(results[0]).toEqual(mockResult);
56+
// Evaluator does not pass a per-call samplingRate — judge uses its own.
57+
expect(judge.evaluate).toHaveBeenCalledWith('user input', 'ai output');
58+
});
59+
60+
it('does NOT call tracker.trackJudgeResult', async () => {
61+
const mockProvider = makeProvider();
62+
const judgeConfig = makeJudgeConfig('judge-1');
63+
64+
const mockResult: LDJudgeResult = {
65+
success: true,
66+
sampled: true,
67+
score: 0.8,
68+
reasoning: 'ok',
69+
metricKey: '$ld:ai:judge:quality',
70+
};
71+
72+
const judge = new Judge(judgeConfig, mockProvider, 1.0);
73+
jest.spyOn(judge, 'evaluate').mockResolvedValue(mockResult);
74+
75+
const evaluator = new Evaluator([judge]);
76+
77+
// No tracker — if Evaluator tried to call trackJudgeResult this would throw or fail
78+
const results = await evaluator.evaluate('input', 'output');
79+
80+
// Evaluation completed without any tracker and returned the judge's result unchanged.
81+
expect(results).toEqual([mockResult]);
82+
});
83+
84+
it('runs multiple judges in parallel and returns all results', async () => {
85+
const makeJudge = (key: string, score: number): Judge => {
86+
const mockProvider = makeProvider();
87+
const jc = makeJudgeConfig(key);
88+
const j = new Judge(jc, mockProvider, 1.0);
89+
jest.spyOn(j, 'evaluate').mockResolvedValue({
90+
success: true,
91+
sampled: true,
92+
score,
93+
reasoning: 'ok',
94+
metricKey: '$ld:ai:judge:quality',
95+
});
96+
return j;
97+
};
98+
99+
const evaluator = new Evaluator([makeJudge('judge-a', 0.5), makeJudge('judge-b', 0.9)]);
100+
101+
const results = await evaluator.evaluate('input', 'output');
102+
103+
expect(results).toHaveLength(2);
104+
const scores = results.map((r) => r.score).sort((a, b) => (a ?? 0) - (b ?? 0));
105+
expect(scores).toEqual([0.5, 0.9]);
106+
});
107+
});
108+
});

packages/sdk/server-ai/__tests__/Judge.test.ts

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,76 @@ describe('Judge', () => {
5454

5555
describe('constructor', () => {
5656
it('initializes with proper configuration', () => {
57-
const judge = new Judge(judgeConfig, mockProvider, mockLogger);
57+
const judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
5858

5959
expect(judge).toBeDefined();
6060
});
61+
62+
it('defaults sampleRate to 1.0 when omitted', () => {
63+
const judge = new Judge(judgeConfig, mockProvider);
64+
expect(judge.sampleRate).toBe(1.0);
65+
});
66+
67+
it('exposes the sampleRate provided to the constructor', () => {
68+
const judge = new Judge(judgeConfig, mockProvider, 0.25, mockLogger);
69+
expect(judge.sampleRate).toBe(0.25);
70+
});
71+
72+
it('honors a sampleRate of 0', () => {
73+
const judge = new Judge(judgeConfig, mockProvider, 0, mockLogger);
74+
expect(judge.sampleRate).toBe(0);
75+
});
76+
});
77+
78+
describe('sampling fallback in evaluate()', () => {
79+
it('uses the constructor sampleRate when no per-call rate is supplied', async () => {
80+
// Force sampling to skip: Math.random() returns 0.6, sampleRate 0.5 → 0.6 > 0.5 → skip.
81+
const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.6);
82+
83+
const judge = new Judge(judgeConfig, mockProvider, 0.5, mockLogger);
84+
const result = await judge.evaluate('input', 'output');
85+
86+
// Skipped due to sampling: sampled stays false (default), no provider call.
87+
expect(result.sampled).toBe(false);
88+
expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
89+
90+
randomSpy.mockRestore();
91+
});
92+
93+
it('honors an explicit per-call samplingRate of 0 over the constructor default', async () => {
94+
// The skip check is `Math.random() > rate`, so with rate=0 a random value of exactly 0
95+
// would NOT skip (0 > 0 is false). Mock random=0.5 so that rate=0 reliably skips.
96+
const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.5);
97+
98+
// Constructor rate is 1.0 (would normally always sample); per-call 0 overrides to skip.
99+
const judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
100+
const result = await judge.evaluate('input', 'output', 0);
101+
102+
expect(result.sampled).toBe(false);
103+
expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
104+
105+
randomSpy.mockRestore();
106+
});
107+
108+
it('per-call samplingRate of undefined falls through to the constructor default', async () => {
109+
// Constructor 0 (always skip), per-call undefined → effective rate 0.
110+
const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.5);
111+
112+
const judge = new Judge(judgeConfig, mockProvider, 0, mockLogger);
113+
const result = await judge.evaluate('input', 'output', undefined);
114+
115+
expect(result.sampled).toBe(false);
116+
expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
117+
118+
randomSpy.mockRestore();
119+
});
61120
});
62121

63122
describe('evaluate', () => {
64123
let judge: Judge;
65124

66125
beforeEach(() => {
67-
judge = new Judge(judgeConfig, mockProvider, mockLogger);
126+
judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
68127
});
69128

70129
it('evaluates AI response successfully', async () => {
@@ -205,7 +264,7 @@ describe('Judge', () => {
205264
evaluationMetricKey: undefined,
206265
evaluationMetricKeys: [],
207266
};
208-
const judgeWithoutMetrics = new Judge(configWithoutMetrics, mockProvider, mockLogger);
267+
const judgeWithoutMetrics = new Judge(configWithoutMetrics, mockProvider, 1.0, mockLogger);
209268

210269
const result = await judgeWithoutMetrics.evaluate('test input', 'test output');
211270

@@ -227,7 +286,7 @@ describe('Judge', () => {
227286
evaluationMetricKey: 'relevance',
228287
evaluationMetricKeys: undefined,
229288
};
230-
const judgeWithSingleKey = new Judge(configWithSingleKey, mockProvider, mockLogger);
289+
const judgeWithSingleKey = new Judge(configWithSingleKey, mockProvider, 1.0, mockLogger);
231290

232291
const mockStructuredResponse: StructuredResponse = {
233292
data: {
@@ -265,7 +324,7 @@ describe('Judge', () => {
265324
evaluationMetricKey: undefined,
266325
evaluationMetricKeys: ['relevance', 'accuracy'],
267326
};
268-
const judgeWithLegacyKeys = new Judge(configWithLegacyKeys, mockProvider, mockLogger);
327+
const judgeWithLegacyKeys = new Judge(configWithLegacyKeys, mockProvider, 1.0, mockLogger);
269328

270329
const mockStructuredResponse: StructuredResponse = {
271330
data: {
@@ -303,7 +362,7 @@ describe('Judge', () => {
303362
evaluationMetricKey: undefined,
304363
evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'],
305364
};
306-
const judgeWithInvalidKeys = new Judge(configWithInvalidKeys, mockProvider, mockLogger);
365+
const judgeWithInvalidKeys = new Judge(configWithInvalidKeys, mockProvider, 1.0, mockLogger);
307366

308367
const mockStructuredResponse: StructuredResponse = {
309368
data: {
@@ -342,7 +401,7 @@ describe('Judge', () => {
342401
evaluationMetricKey: 'helpfulness',
343402
evaluationMetricKeys: ['relevance', 'accuracy'],
344403
};
345-
const judgeWithBoth = new Judge(configWithBoth, mockProvider, mockLogger);
404+
const judgeWithBoth = new Judge(configWithBoth, mockProvider, 1.0, mockLogger);
346405

347406
const mockStructuredResponse: StructuredResponse = {
348407
data: {
@@ -379,7 +438,7 @@ describe('Judge', () => {
379438
...judgeConfig,
380439
messages: undefined,
381440
};
382-
const judgeWithoutMessages = new Judge(configWithoutMessages, mockProvider, mockLogger);
441+
const judgeWithoutMessages = new Judge(configWithoutMessages, mockProvider, 1.0, mockLogger);
383442

384443
const result = await judgeWithoutMessages.evaluate('test input', 'test output');
385444

@@ -488,7 +547,7 @@ describe('Judge', () => {
488547
let judge: Judge;
489548

490549
beforeEach(() => {
491-
judge = new Judge(judgeConfig, mockProvider, mockLogger);
550+
judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
492551
});
493552

494553
it('evaluates messages and response successfully', async () => {
@@ -573,7 +632,7 @@ describe('Judge', () => {
573632
let judge: Judge;
574633

575634
beforeEach(() => {
576-
judge = new Judge(judgeConfig, mockProvider, mockLogger);
635+
judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
577636
});
578637

579638
it('constructs evaluation messages correctly', () => {
@@ -598,7 +657,7 @@ describe('Judge', () => {
598657
let judge: Judge;
599658

600659
beforeEach(() => {
601-
judge = new Judge(judgeConfig, mockProvider, mockLogger);
660+
judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
602661
});
603662

604663
it('parses valid evaluation response correctly', () => {
@@ -669,7 +728,7 @@ describe('Judge', () => {
669728
evaluationMetricKey: undefined,
670729
evaluationMetricKeys: [],
671730
};
672-
const judgeWithEmptyKeys = new Judge(configWithEmptyKeys, mockProvider, mockLogger);
731+
const judgeWithEmptyKeys = new Judge(configWithEmptyKeys, mockProvider, 1.0, mockLogger);
673732

674733
const result = await judgeWithEmptyKeys.evaluate('test input', 'test output');
675734

packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ describe('createJudge method', () => {
673673
response_to_evaluate: '{{response_to_evaluate}}',
674674
});
675675
expect(AIProviderFactory.create).toHaveBeenCalledWith(mockJudgeConfig, undefined, undefined);
676-
expect(Judge).toHaveBeenCalledWith(mockJudgeConfig, mockProvider, undefined);
676+
expect(Judge).toHaveBeenCalledWith(mockJudgeConfig, mockProvider, 1.0, undefined);
677677
expect(result).toBe(mockJudge);
678678
judgeConfigSpy.mockRestore();
679679
});

0 commit comments

Comments
 (0)