Skip to content

Commit c132e9f

Browse files
committed
feat: simplify evaluation schema to flat score/reasoning shape (#1286)
1 parent aba1221 commit c132e9f

3 files changed

Lines changed: 89 additions & 255 deletions

File tree

packages/sdk/server-ai/__tests__/Judge.test.ts

Lines changed: 62 additions & 167 deletions
Original file line number | Diff line number | Diff line change
@@ -70,14 +70,12 @@ describe('Judge', () => {
7070
it('evaluates AI response successfully', async () => {
7171
const mockStructuredResponse: StructuredResponse = {
7272
data: {
73-
evaluations: {
74-
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
75-
},
73+
score: 0.8,
74+
reasoning: 'The response is relevant to the question',
7675
},
7776
rawResponse: JSON.stringify({
78-
evaluations: {
79-
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
80-
},
77+
score: 0.8,
78+
reasoning: 'The response is relevant to the question',
8179
}),
8280
metrics: {
8381
success: true,
@@ -125,14 +123,12 @@ describe('Judge', () => {
125123
it('returns evaluation result with correct evaluationMetricKey for tracker integration', async () => {
126124
const mockStructuredResponse: StructuredResponse = {
127125
data: {
128-
evaluations: {
129-
relevance: { score: 0.85, reasoning: 'Highly relevant response' },
130-
},
126+
score: 0.85,
127+
reasoning: 'Highly relevant response',
131128
},
132129
rawResponse: JSON.stringify({
133-
evaluations: {
134-
relevance: { score: 0.85, reasoning: 'Highly relevant response' },
135-
},
130+
score: 0.85,
131+
reasoning: 'Highly relevant response',
136132
}),
137133
metrics: {
138134
success: true,
@@ -159,14 +155,12 @@ describe('Judge', () => {
159155

160156
const mockStructuredResponse: StructuredResponse = {
161157
data: {
162-
evaluations: {
163-
relevance: { score: 0.8, reasoning: 'Good' },
164-
},
158+
score: 0.8,
159+
reasoning: 'Good',
165160
},
166161
rawResponse: JSON.stringify({
167-
evaluations: {
168-
relevance: { score: 0.8, reasoning: 'Good' },
169-
},
162+
score: 0.8,
163+
reasoning: 'Good',
170164
}),
171165
metrics: {
172166
success: true,
@@ -237,14 +231,12 @@ describe('Judge', () => {
237231

238232
const mockStructuredResponse: StructuredResponse = {
239233
data: {
240-
evaluations: {
241-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
242-
},
234+
score: 0.8,
235+
reasoning: 'The response is relevant',
243236
},
244237
rawResponse: JSON.stringify({
245-
evaluations: {
246-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
247-
},
238+
score: 0.8,
239+
reasoning: 'The response is relevant',
248240
}),
249241
metrics: {
250242
success: true,
@@ -277,14 +269,12 @@ describe('Judge', () => {
277269

278270
const mockStructuredResponse: StructuredResponse = {
279271
data: {
280-
evaluations: {
281-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
282-
},
272+
score: 0.8,
273+
reasoning: 'The response is relevant',
283274
},
284275
rawResponse: JSON.stringify({
285-
evaluations: {
286-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
287-
},
276+
score: 0.8,
277+
reasoning: 'The response is relevant',
288278
}),
289279
metrics: {
290280
success: true,
@@ -317,14 +307,12 @@ describe('Judge', () => {
317307

318308
const mockStructuredResponse: StructuredResponse = {
319309
data: {
320-
evaluations: {
321-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
322-
},
310+
score: 0.8,
311+
reasoning: 'The response is relevant',
323312
},
324313
rawResponse: JSON.stringify({
325-
evaluations: {
326-
relevance: { score: 0.8, reasoning: 'The response is relevant' },
327-
},
314+
score: 0.8,
315+
reasoning: 'The response is relevant',
328316
}),
329317
metrics: {
330318
success: true,
@@ -358,14 +346,12 @@ describe('Judge', () => {
358346

359347
const mockStructuredResponse: StructuredResponse = {
360348
data: {
361-
evaluations: {
362-
helpfulness: { score: 0.7, reasoning: 'The response is helpful' },
363-
},
349+
score: 0.7,
350+
reasoning: 'The response is helpful',
364351
},
365352
rawResponse: JSON.stringify({
366-
evaluations: {
367-
helpfulness: { score: 0.7, reasoning: 'The response is helpful' },
368-
},
353+
score: 0.7,
354+
reasoning: 'The response is helpful',
369355
}),
370356
metrics: {
371357
success: true,
@@ -409,18 +395,10 @@ describe('Judge', () => {
409395
);
410396
});
411397

412-
it('returns result with success false when expected metric is missing', async () => {
398+
it('returns result with success false when response has no score or reasoning', async () => {
413399
const mockStructuredResponse: StructuredResponse = {
414-
data: {
415-
evaluations: {
416-
accuracy: { score: 0.9, reasoning: 'Accurate' },
417-
},
418-
},
419-
rawResponse: JSON.stringify({
420-
evaluations: {
421-
accuracy: { score: 0.9, reasoning: 'Accurate' },
422-
},
423-
}),
400+
data: {},
401+
rawResponse: '{}',
424402
metrics: {
425403
success: true,
426404
usage: { total: 100, input: 50, output: 50 },
@@ -437,19 +415,23 @@ describe('Judge', () => {
437415
sampled: true,
438416
judgeConfigKey: 'test-judge',
439417
});
418+
expect(mockLogger.warn).toHaveBeenCalledWith(
419+
'Could not parse evaluation response: {}',
420+
mockTrackData,
421+
);
440422
});
441423

442424
it('returns result with success false when response structure is malformed', async () => {
443425
const mockStructuredResponse: StructuredResponse = {
444426
data: {
445-
relevance: { score: 0.8, reasoning: 'Good' },
446-
accuracy: { score: 0.9, reasoning: 'Accurate' },
447-
helpfulness: { score: 0.7, reasoning: 'Helpful' },
427+
evaluations: {
428+
relevance: { score: 0.8, reasoning: 'Good' },
429+
},
448430
},
449431
rawResponse: JSON.stringify({
450-
relevance: { score: 0.8, reasoning: 'Good' },
451-
accuracy: { score: 0.9, reasoning: 'Accurate' },
452-
helpfulness: { score: 0.7, reasoning: 'Helpful' },
432+
evaluations: {
433+
relevance: { score: 0.8, reasoning: 'Good' },
434+
},
453435
}),
454436
metrics: {
455437
success: true,
@@ -467,6 +449,10 @@ describe('Judge', () => {
467449
sampled: true,
468450
judgeConfigKey: 'test-judge',
469451
});
452+
expect(mockLogger.warn).toHaveBeenCalledWith(
453+
expect.stringContaining('Could not parse evaluation response:'),
454+
mockTrackData,
455+
);
470456
});
471457

472458
it('handles provider errors gracefully', async () => {
@@ -517,14 +503,12 @@ describe('Judge', () => {
517503

518504
const mockStructuredResponse: StructuredResponse = {
519505
data: {
520-
evaluations: {
521-
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
522-
},
506+
score: 0.8,
507+
reasoning: 'The response is relevant to the question',
523508
},
524509
rawResponse: JSON.stringify({
525-
evaluations: {
526-
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
527-
},
510+
score: 0.8,
511+
reasoning: 'The response is relevant to the question',
528512
}),
529513
metrics: {
530514
success: true,
@@ -620,116 +604,63 @@ describe('Judge', () => {
620604
it('parses valid evaluation response correctly', () => {
621605
// eslint-disable-next-line no-underscore-dangle
622606
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
623-
const responseData = {
624-
evaluations: {
625-
relevance: { score: 0.8, reasoning: 'Good' },
626-
},
627-
};
607+
const responseData = { score: 0.8, reasoning: 'Good' };
628608

629-
const result = parseResponse(responseData, 'relevance', mockTracker);
609+
const result = parseResponse(responseData);
630610

631611
expect(result).toEqual({
632612
score: 0.8,
633613
reasoning: 'Good',
634614
});
635615
});
636616

637-
it('returns undefined for invalid response data', () => {
617+
it('returns undefined for empty response data', () => {
638618
// eslint-disable-next-line no-underscore-dangle
639619
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
640-
const responseData = {
641-
relevance: { score: 0.8, reasoning: 'Good' },
642-
};
643620

644-
const result = parseResponse(responseData, 'relevance', mockTracker);
621+
const result = parseResponse({});
645622

646623
expect(result).toBeUndefined();
647624
});
648625

649-
it('handles missing score or reasoning fields', () => {
626+
it('handles missing reasoning field', () => {
650627
// eslint-disable-next-line no-underscore-dangle
651628
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
652-
const responseData = {
653-
evaluations: {
654-
relevance: { score: 0.8 },
655-
},
656-
};
629+
const responseData = { score: 0.8 };
657630

658-
const result = parseResponse(responseData, 'relevance', mockTracker);
631+
const result = parseResponse(responseData);
659632

660633
expect(result).toBeUndefined();
661634
});
662635

663636
it('handles invalid score values out of range', () => {
664637
// eslint-disable-next-line no-underscore-dangle
665638
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
666-
const responseData = {
667-
evaluations: {
668-
relevance: { score: 1.5, reasoning: 'Good' },
669-
},
670-
};
639+
const responseData = { score: 1.5, reasoning: 'Good' };
671640

672-
const result = parseResponse(responseData, 'relevance', mockTracker);
641+
const result = parseResponse(responseData);
673642

674643
expect(result).toBeUndefined();
675-
expect(mockLogger.warn).toHaveBeenCalledWith(
676-
expect.stringContaining('Invalid score evaluated for relevance: 1.5'),
677-
mockTrackData,
678-
);
679644
});
680645

681646
it('handles negative score values', () => {
682647
// eslint-disable-next-line no-underscore-dangle
683648
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
684-
const responseData = {
685-
evaluations: {
686-
relevance: { score: -0.1, reasoning: 'Good' },
687-
},
688-
};
649+
const responseData = { score: -0.1, reasoning: 'Good' };
689650

690-
const result = parseResponse(responseData, 'relevance', mockTracker);
651+
const result = parseResponse(responseData);
691652

692653
expect(result).toBeUndefined();
693-
expect(mockLogger.warn).toHaveBeenCalledWith(
694-
expect.stringContaining('Invalid score evaluated for relevance: -0.1'),
695-
mockTrackData,
696-
);
697654
});
698655

699656
it('handles invalid reasoning type', () => {
700657
// eslint-disable-next-line no-underscore-dangle
701658
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
702-
const responseData = {
703-
evaluations: {
704-
relevance: { score: 0.8, reasoning: 123 },
705-
},
706-
};
659+
const responseData = { score: 0.8, reasoning: 123 };
707660

708-
const result = parseResponse(responseData, 'relevance', mockTracker);
661+
const result = parseResponse(responseData);
709662

710663
expect(result).toBeUndefined();
711-
expect(mockLogger.warn).toHaveBeenCalledWith(
712-
expect.stringContaining('Invalid reasoning evaluated for relevance: 123'),
713-
mockTrackData,
714-
);
715-
});
716-
717-
it('handles missing evaluation when key does not exist in response', () => {
718-
// eslint-disable-next-line no-underscore-dangle
719-
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
720-
const responseData = {
721-
evaluations: {
722-
accuracy: { score: 0.9, reasoning: 'Accurate' },
723-
},
724-
};
725-
726-
const result = parseResponse(responseData, 'relevance', mockTracker);
727-
728-
expect(result).toBeUndefined();
729-
expect(mockLogger.warn).toHaveBeenCalledWith(
730-
'Missing evaluation for metric key: relevance',
731-
mockTrackData,
732-
);
733664
});
734665

735666
it('handles empty evaluationMetricKeys array fallback', async () => {
@@ -753,41 +684,5 @@ describe('Judge', () => {
753684
mockTrackData,
754685
);
755686
});
756-
757-
it('handles evaluation value that is not an object', () => {
758-
// eslint-disable-next-line no-underscore-dangle
759-
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
760-
const responseData = {
761-
evaluations: {
762-
relevance: 'not an object',
763-
},
764-
};
765-
766-
const result = parseResponse(responseData, 'relevance', mockTracker);
767-
768-
expect(result).toBeUndefined();
769-
expect(mockLogger.warn).toHaveBeenCalledWith(
770-
'Missing evaluation for metric key: relevance',
771-
mockTrackData,
772-
);
773-
});
774-
775-
it('handles null evaluation value', () => {
776-
// eslint-disable-next-line no-underscore-dangle
777-
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
778-
const responseData = {
779-
evaluations: {
780-
relevance: null,
781-
},
782-
};
783-
784-
const result = parseResponse(responseData, 'relevance', mockTracker);
785-
786-
expect(result).toBeUndefined();
787-
expect(mockLogger.warn).toHaveBeenCalledWith(
788-
'Missing evaluation for metric key: relevance',
789-
mockTrackData,
790-
);
791-
});
792687
});
793688
});

0 commit comments

Comments
 (0)