Skip to content

Commit aba1221

Browse files
committed
feat!: Flatten JudgeResponse and EvalScore into new LDJudgeResult (#1284)
1 parent ebf93a5 commit aba1221

13 files changed

Lines changed: 277 additions & 205 deletions

File tree

packages/sdk/server-ai/__tests__/Judge.test.ts

Lines changed: 80 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -98,13 +98,11 @@ describe('Judge', () => {
9898
);
9999

100100
expect(result).toEqual({
101-
evals: {
102-
relevance: {
103-
score: 0.8,
104-
reasoning: 'The response is relevant to the question',
105-
},
106-
},
101+
score: 0.8,
102+
reasoning: 'The response is relevant to the question',
103+
metricKey: 'relevance',
107104
success: true,
105+
sampled: true,
108106
judgeConfigKey: 'test-judge',
109107
});
110108

@@ -148,12 +146,11 @@ describe('Judge', () => {
148146
const result = await judge.evaluate('test input', 'test output');
149147

150148
expect(result).toBeDefined();
151-
expect(result?.evals).toHaveProperty('relevance');
152-
expect(result?.evals.relevance.score).toBe(0.85);
153-
expect(result?.judgeConfigKey).toBe('test-judge');
154-
expect(result?.success).toBe(true);
155-
// Verify the evaluationMetricKey from config is used in the result
156-
expect(Object.keys(result?.evals || {})).toContain(judgeConfig.evaluationMetricKey);
149+
expect(result.score).toBe(0.85);
150+
expect(result.metricKey).toBe('relevance');
151+
expect(result.judgeConfigKey).toBe('test-judge');
152+
expect(result.success).toBe(true);
153+
expect(result.sampled).toBe(true);
157154
});
158155

159156
it('handles sampling rate correctly', async () => {
@@ -183,18 +180,23 @@ describe('Judge', () => {
183180
const result = await judge.evaluate('test input', 'test output', 0.5);
184181

185182
expect(result).toBeDefined();
183+
expect(result.sampled).toBe(true);
186184
expect(mockProvider.invokeStructuredModel).toHaveBeenCalled();
187185

188186
Math.random = originalRandom;
189187
});
190188

191-
it('returns undefined when not sampled', async () => {
189+
it('returns unsampled result when skipped by sampling', async () => {
192190
const originalRandom = Math.random;
193191
Math.random = jest.fn().mockReturnValue(0.8);
194192

195193
const result = await judge.evaluate('test input', 'test output', 0.5);
196194

197-
expect(result).toBeUndefined();
195+
expect(result).toEqual({
196+
success: false,
197+
sampled: false,
198+
judgeConfigKey: 'test-judge',
199+
});
198200
expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
199201
expect(mockLogger.debug).toHaveBeenCalledWith(
200202
'Judge evaluation skipped due to sampling rate: 0.5',
@@ -203,7 +205,7 @@ describe('Judge', () => {
203205
Math.random = originalRandom;
204206
});
205207

206-
it('returns undefined when evaluationMetricKey and evaluationMetricKeys are both missing', async () => {
208+
it('returns error result when evaluationMetricKey and evaluationMetricKeys are both missing', async () => {
207209
const configWithoutMetrics: LDAIJudgeConfig = {
208210
...judgeConfig,
209211
evaluationMetricKey: undefined,
@@ -213,7 +215,12 @@ describe('Judge', () => {
213215

214216
const result = await judgeWithoutMetrics.evaluate('test input', 'test output');
215217

216-
expect(result).toBeUndefined();
218+
expect(result).toEqual({
219+
success: false,
220+
sampled: true,
221+
errorMessage: 'Judge configuration is missing required evaluation metric key',
222+
judgeConfigKey: 'test-judge',
223+
});
217224
expect(mockLogger.warn).toHaveBeenCalledWith(
218225
'Judge configuration is missing required evaluation metric key',
219226
mockTrackData,
@@ -251,10 +258,11 @@ describe('Judge', () => {
251258
const result = await judgeWithSingleKey.evaluate('test input', 'test output');
252259

253260
expect(result).toEqual({
254-
evals: {
255-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
256-
},
261+
score: 0.8,
262+
reasoning: 'The response is relevant',
263+
metricKey: 'relevance',
257264
success: true,
265+
sampled: true,
258266
judgeConfigKey: 'test-judge',
259267
});
260268
});
@@ -290,10 +298,11 @@ describe('Judge', () => {
290298
const result = await judgeWithLegacyKeys.evaluate('test input', 'test output');
291299

292300
expect(result).toEqual({
293-
evals: {
294-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
295-
},
301+
score: 0.8,
302+
reasoning: 'The response is relevant',
303+
metricKey: 'relevance',
296304
success: true,
305+
sampled: true,
297306
judgeConfigKey: 'test-judge',
298307
});
299308
});
@@ -330,10 +339,11 @@ describe('Judge', () => {
330339

331340
// Should skip empty and whitespace strings, use first valid value
332341
expect(result).toEqual({
333-
evals: {
334-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
335-
},
342+
score: 0.8,
343+
reasoning: 'The response is relevant',
344+
metricKey: 'relevance',
336345
success: true,
346+
sampled: true,
337347
judgeConfigKey: 'test-judge',
338348
});
339349
});
@@ -369,15 +379,16 @@ describe('Judge', () => {
369379
const result = await judgeWithBoth.evaluate('test input', 'test output');
370380

371381
expect(result).toEqual({
372-
evals: {
373-
helpfulness: { score: 0.7, reasoning: 'The response is helpful' },
374-
},
382+
score: 0.7,
383+
reasoning: 'The response is helpful',
384+
metricKey: 'helpfulness',
375385
success: true,
386+
sampled: true,
376387
judgeConfigKey: 'test-judge',
377388
});
378389
});
379390

380-
it('returns undefined when messages are missing', async () => {
391+
it('returns error result when messages are missing', async () => {
381392
const configWithoutMessages: LDAIJudgeConfig = {
382393
...judgeConfig,
383394
messages: undefined,
@@ -386,14 +397,19 @@ describe('Judge', () => {
386397

387398
const result = await judgeWithoutMessages.evaluate('test input', 'test output');
388399

389-
expect(result).toBeUndefined();
400+
expect(result).toEqual({
401+
success: false,
402+
sampled: true,
403+
errorMessage: 'Judge configuration must include messages',
404+
judgeConfigKey: 'test-judge',
405+
});
390406
expect(mockLogger.warn).toHaveBeenCalledWith(
391407
'Judge configuration must include messages',
392408
mockTrackData,
393409
);
394410
});
395411

396-
it('returns empty evaluations with success false when expected metric is missing', async () => {
412+
it('returns result with success false when expected metric is missing', async () => {
397413
const mockStructuredResponse: StructuredResponse = {
398414
data: {
399415
evaluations: {
@@ -417,13 +433,13 @@ describe('Judge', () => {
417433
const result = await judge.evaluate('test input', 'test output');
418434

419435
expect(result).toEqual({
420-
evals: {},
421436
success: false,
437+
sampled: true,
422438
judgeConfigKey: 'test-judge',
423439
});
424440
});
425441

426-
it('returns empty evaluations when response structure is malformed', async () => {
442+
it('returns result with success false when response structure is malformed', async () => {
427443
const mockStructuredResponse: StructuredResponse = {
428444
data: {
429445
relevance: { score: 0.8, reasoning: 'Good' },
@@ -447,8 +463,8 @@ describe('Judge', () => {
447463
const result = await judge.evaluate('test input', 'test output');
448464

449465
expect(result).toEqual({
450-
evals: {},
451466
success: false,
467+
sampled: true,
452468
judgeConfigKey: 'test-judge',
453469
});
454470
});
@@ -460,9 +476,9 @@ describe('Judge', () => {
460476
const result = await judge.evaluate('test input', 'test output');
461477

462478
expect(result).toEqual({
463-
evals: {},
464479
success: false,
465-
error: 'Provider error',
480+
sampled: true,
481+
errorMessage: 'Provider error',
466482
judgeConfigKey: 'test-judge',
467483
});
468484
expect(mockLogger.error).toHaveBeenCalledWith('Judge evaluation failed:', error);
@@ -474,9 +490,9 @@ describe('Judge', () => {
474490
const result = await judge.evaluate('test input', 'test output');
475491

476492
expect(result).toEqual({
477-
evals: {},
478493
success: false,
479-
error: 'Unknown error',
494+
sampled: true,
495+
errorMessage: 'Unknown error',
480496
judgeConfigKey: 'test-judge',
481497
});
482498
});
@@ -522,13 +538,11 @@ describe('Judge', () => {
522538
const result = await judge.evaluateMessages(messages, response);
523539

524540
expect(result).toEqual({
525-
evals: {
526-
relevance: {
527-
score: 0.8,
528-
reasoning: 'The response is relevant to the question',
529-
},
530-
},
541+
score: 0.8,
542+
reasoning: 'The response is relevant to the question',
543+
metricKey: 'relevance',
531544
success: true,
545+
sampled: true,
532546
judgeConfigKey: 'test-judge',
533547
});
534548

@@ -560,7 +574,11 @@ describe('Judge', () => {
560574

561575
const result = await judge.evaluateMessages(messages, response, 0.5);
562576

563-
expect(result).toBeUndefined();
577+
expect(result).toEqual({
578+
success: false,
579+
sampled: false,
580+
judgeConfigKey: 'test-judge',
581+
});
564582
expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
565583

566584
Math.random = originalRandom;
@@ -611,11 +629,12 @@ describe('Judge', () => {
611629
const result = parseResponse(responseData, 'relevance', mockTracker);
612630

613631
expect(result).toEqual({
614-
relevance: { score: 0.8, reasoning: 'Good' },
632+
score: 0.8,
633+
reasoning: 'Good',
615634
});
616635
});
617636

618-
it('returns empty object for invalid response data', () => {
637+
it('returns undefined for invalid response data', () => {
619638
// eslint-disable-next-line no-underscore-dangle
620639
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
621640
const responseData = {
@@ -624,7 +643,7 @@ describe('Judge', () => {
624643

625644
const result = parseResponse(responseData, 'relevance', mockTracker);
626645

627-
expect(result).toEqual({});
646+
expect(result).toBeUndefined();
628647
});
629648

630649
it('handles missing score or reasoning fields', () => {
@@ -638,7 +657,7 @@ describe('Judge', () => {
638657

639658
const result = parseResponse(responseData, 'relevance', mockTracker);
640659

641-
expect(result).toEqual({});
660+
expect(result).toBeUndefined();
642661
});
643662

644663
it('handles invalid score values out of range', () => {
@@ -652,7 +671,7 @@ describe('Judge', () => {
652671

653672
const result = parseResponse(responseData, 'relevance', mockTracker);
654673

655-
expect(result).toEqual({});
674+
expect(result).toBeUndefined();
656675
expect(mockLogger.warn).toHaveBeenCalledWith(
657676
expect.stringContaining('Invalid score evaluated for relevance: 1.5'),
658677
mockTrackData,
@@ -670,7 +689,7 @@ describe('Judge', () => {
670689

671690
const result = parseResponse(responseData, 'relevance', mockTracker);
672691

673-
expect(result).toEqual({});
692+
expect(result).toBeUndefined();
674693
expect(mockLogger.warn).toHaveBeenCalledWith(
675694
expect.stringContaining('Invalid score evaluated for relevance: -0.1'),
676695
mockTrackData,
@@ -688,7 +707,7 @@ describe('Judge', () => {
688707

689708
const result = parseResponse(responseData, 'relevance', mockTracker);
690709

691-
expect(result).toEqual({});
710+
expect(result).toBeUndefined();
692711
expect(mockLogger.warn).toHaveBeenCalledWith(
693712
expect.stringContaining('Invalid reasoning evaluated for relevance: 123'),
694713
mockTrackData,
@@ -706,7 +725,7 @@ describe('Judge', () => {
706725

707726
const result = parseResponse(responseData, 'relevance', mockTracker);
708727

709-
expect(result).toEqual({});
728+
expect(result).toBeUndefined();
710729
expect(mockLogger.warn).toHaveBeenCalledWith(
711730
'Missing evaluation for metric key: relevance',
712731
mockTrackData,
@@ -723,7 +742,12 @@ describe('Judge', () => {
723742

724743
const result = await judgeWithEmptyKeys.evaluate('test input', 'test output');
725744

726-
expect(result).toBeUndefined();
745+
expect(result).toEqual({
746+
success: false,
747+
sampled: true,
748+
errorMessage: 'Judge configuration is missing required evaluation metric key',
749+
judgeConfigKey: 'test-judge',
750+
});
727751
expect(mockLogger.warn).toHaveBeenCalledWith(
728752
'Judge configuration is missing required evaluation metric key',
729753
mockTrackData,
@@ -741,7 +765,7 @@ describe('Judge', () => {
741765

742766
const result = parseResponse(responseData, 'relevance', mockTracker);
743767

744-
expect(result).toEqual({});
768+
expect(result).toBeUndefined();
745769
expect(mockLogger.warn).toHaveBeenCalledWith(
746770
'Missing evaluation for metric key: relevance',
747771
mockTrackData,
@@ -759,7 +783,7 @@ describe('Judge', () => {
759783

760784
const result = parseResponse(responseData, 'relevance', mockTracker);
761785

762-
expect(result).toEqual({});
786+
expect(result).toBeUndefined();
763787
expect(mockLogger.warn).toHaveBeenCalledWith(
764788
'Missing evaluation for metric key: relevance',
765789
mockTrackData,

0 commit comments

Comments
 (0)