@@ -70,14 +70,12 @@ describe('Judge', () => {
7070 it ( 'evaluates AI response successfully' , async ( ) => {
7171 const mockStructuredResponse : StructuredResponse = {
7272 data : {
73- evaluations : {
74- relevance : { score : 0.8 , reasoning : 'The response is relevant to the question' } ,
75- } ,
73+ score : 0.8 ,
74+ reasoning : 'The response is relevant to the question' ,
7675 } ,
7776 rawResponse : JSON . stringify ( {
78- evaluations : {
79- relevance : { score : 0.8 , reasoning : 'The response is relevant to the question' } ,
80- } ,
77+ score : 0.8 ,
78+ reasoning : 'The response is relevant to the question' ,
8179 } ) ,
8280 metrics : {
8381 success : true ,
@@ -125,14 +123,12 @@ describe('Judge', () => {
125123 it ( 'returns evaluation result with correct evaluationMetricKey for tracker integration' , async ( ) => {
126124 const mockStructuredResponse : StructuredResponse = {
127125 data : {
128- evaluations : {
129- relevance : { score : 0.85 , reasoning : 'Highly relevant response' } ,
130- } ,
126+ score : 0.85 ,
127+ reasoning : 'Highly relevant response' ,
131128 } ,
132129 rawResponse : JSON . stringify ( {
133- evaluations : {
134- relevance : { score : 0.85 , reasoning : 'Highly relevant response' } ,
135- } ,
130+ score : 0.85 ,
131+ reasoning : 'Highly relevant response' ,
136132 } ) ,
137133 metrics : {
138134 success : true ,
@@ -159,14 +155,12 @@ describe('Judge', () => {
159155
160156 const mockStructuredResponse : StructuredResponse = {
161157 data : {
162- evaluations : {
163- relevance : { score : 0.8 , reasoning : 'Good' } ,
164- } ,
158+ score : 0.8 ,
159+ reasoning : 'Good' ,
165160 } ,
166161 rawResponse : JSON . stringify ( {
167- evaluations : {
168- relevance : { score : 0.8 , reasoning : 'Good' } ,
169- } ,
162+ score : 0.8 ,
163+ reasoning : 'Good' ,
170164 } ) ,
171165 metrics : {
172166 success : true ,
@@ -237,14 +231,12 @@ describe('Judge', () => {
237231
238232 const mockStructuredResponse : StructuredResponse = {
239233 data : {
240- evaluations : {
241- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
242- } ,
234+ score : 0.8 ,
235+ reasoning : 'The response is relevant' ,
243236 } ,
244237 rawResponse : JSON . stringify ( {
245- evaluations : {
246- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
247- } ,
238+ score : 0.8 ,
239+ reasoning : 'The response is relevant' ,
248240 } ) ,
249241 metrics : {
250242 success : true ,
@@ -277,14 +269,12 @@ describe('Judge', () => {
277269
278270 const mockStructuredResponse : StructuredResponse = {
279271 data : {
280- evaluations : {
281- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
282- } ,
272+ score : 0.8 ,
273+ reasoning : 'The response is relevant' ,
283274 } ,
284275 rawResponse : JSON . stringify ( {
285- evaluations : {
286- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
287- } ,
276+ score : 0.8 ,
277+ reasoning : 'The response is relevant' ,
288278 } ) ,
289279 metrics : {
290280 success : true ,
@@ -317,14 +307,12 @@ describe('Judge', () => {
317307
318308 const mockStructuredResponse : StructuredResponse = {
319309 data : {
320- evaluations : {
321- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
322- } ,
310+ score : 0.8 ,
311+ reasoning : 'The response is relevant' ,
323312 } ,
324313 rawResponse : JSON . stringify ( {
325- evaluations : {
326- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
327- } ,
314+ score : 0.8 ,
315+ reasoning : 'The response is relevant' ,
328316 } ) ,
329317 metrics : {
330318 success : true ,
@@ -358,14 +346,12 @@ describe('Judge', () => {
358346
359347 const mockStructuredResponse : StructuredResponse = {
360348 data : {
361- evaluations : {
362- helpfulness : { score : 0.7 , reasoning : 'The response is helpful' } ,
363- } ,
349+ score : 0.7 ,
350+ reasoning : 'The response is helpful' ,
364351 } ,
365352 rawResponse : JSON . stringify ( {
366- evaluations : {
367- helpfulness : { score : 0.7 , reasoning : 'The response is helpful' } ,
368- } ,
353+ score : 0.7 ,
354+ reasoning : 'The response is helpful' ,
369355 } ) ,
370356 metrics : {
371357 success : true ,
@@ -409,18 +395,10 @@ describe('Judge', () => {
409395 ) ;
410396 } ) ;
411397
412- it ( 'returns result with success false when expected metric is missing ' , async ( ) => {
398+ it ( 'returns result with success false when response has no score or reasoning ' , async ( ) => {
413399 const mockStructuredResponse : StructuredResponse = {
414- data : {
415- evaluations : {
416- accuracy : { score : 0.9 , reasoning : 'Accurate' } ,
417- } ,
418- } ,
419- rawResponse : JSON . stringify ( {
420- evaluations : {
421- accuracy : { score : 0.9 , reasoning : 'Accurate' } ,
422- } ,
423- } ) ,
400+ data : { } ,
401+ rawResponse : '{}' ,
424402 metrics : {
425403 success : true ,
426404 usage : { total : 100 , input : 50 , output : 50 } ,
@@ -437,19 +415,23 @@ describe('Judge', () => {
437415 sampled : true ,
438416 judgeConfigKey : 'test-judge' ,
439417 } ) ;
418+ expect ( mockLogger . warn ) . toHaveBeenCalledWith (
419+ 'Could not parse evaluation response: {}' ,
420+ mockTrackData ,
421+ ) ;
440422 } ) ;
441423
442424 it ( 'returns result with success false when response structure is malformed' , async ( ) => {
443425 const mockStructuredResponse : StructuredResponse = {
444426 data : {
445- relevance : { score : 0.8 , reasoning : 'Good' } ,
446- accuracy : { score : 0.9 , reasoning : 'Accurate ' } ,
447- helpfulness : { score : 0.7 , reasoning : 'Helpful' } ,
427+ evaluations : {
428+ relevance : { score : 0.8 , reasoning : 'Good ' } ,
429+ } ,
448430 } ,
449431 rawResponse : JSON . stringify ( {
450- relevance : { score : 0.8 , reasoning : 'Good' } ,
451- accuracy : { score : 0.9 , reasoning : 'Accurate ' } ,
452- helpfulness : { score : 0.7 , reasoning : 'Helpful' } ,
432+ evaluations : {
433+ relevance : { score : 0.8 , reasoning : 'Good ' } ,
434+ } ,
453435 } ) ,
454436 metrics : {
455437 success : true ,
@@ -467,6 +449,10 @@ describe('Judge', () => {
467449 sampled : true ,
468450 judgeConfigKey : 'test-judge' ,
469451 } ) ;
452+ expect ( mockLogger . warn ) . toHaveBeenCalledWith (
453+ expect . stringContaining ( 'Could not parse evaluation response:' ) ,
454+ mockTrackData ,
455+ ) ;
470456 } ) ;
471457
472458 it ( 'handles provider errors gracefully' , async ( ) => {
@@ -517,14 +503,12 @@ describe('Judge', () => {
517503
518504 const mockStructuredResponse : StructuredResponse = {
519505 data : {
520- evaluations : {
521- relevance : { score : 0.8 , reasoning : 'The response is relevant to the question' } ,
522- } ,
506+ score : 0.8 ,
507+ reasoning : 'The response is relevant to the question' ,
523508 } ,
524509 rawResponse : JSON . stringify ( {
525- evaluations : {
526- relevance : { score : 0.8 , reasoning : 'The response is relevant to the question' } ,
527- } ,
510+ score : 0.8 ,
511+ reasoning : 'The response is relevant to the question' ,
528512 } ) ,
529513 metrics : {
530514 success : true ,
@@ -620,116 +604,63 @@ describe('Judge', () => {
620604 it ( 'parses valid evaluation response correctly' , ( ) => {
621605 // eslint-disable-next-line no-underscore-dangle
622606 const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
623- const responseData = {
624- evaluations : {
625- relevance : { score : 0.8 , reasoning : 'Good' } ,
626- } ,
627- } ;
607+ const responseData = { score : 0.8 , reasoning : 'Good' } ;
628608
629- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
609+ const result = parseResponse ( responseData ) ;
630610
631611 expect ( result ) . toEqual ( {
632612 score : 0.8 ,
633613 reasoning : 'Good' ,
634614 } ) ;
635615 } ) ;
636616
637- it ( 'returns undefined for invalid response data' , ( ) => {
617+ it ( 'returns undefined for empty response data' , ( ) => {
638618 // eslint-disable-next-line no-underscore-dangle
639619 const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
640- const responseData = {
641- relevance : { score : 0.8 , reasoning : 'Good' } ,
642- } ;
643620
644- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
621+ const result = parseResponse ( { } ) ;
645622
646623 expect ( result ) . toBeUndefined ( ) ;
647624 } ) ;
648625
649- it ( 'handles missing score or reasoning fields ' , ( ) => {
626+ it ( 'handles missing reasoning field ' , ( ) => {
650627 // eslint-disable-next-line no-underscore-dangle
651628 const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
652- const responseData = {
653- evaluations : {
654- relevance : { score : 0.8 } ,
655- } ,
656- } ;
629+ const responseData = { score : 0.8 } ;
657630
658- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
631+ const result = parseResponse ( responseData ) ;
659632
660633 expect ( result ) . toBeUndefined ( ) ;
661634 } ) ;
662635
663636 it ( 'handles invalid score values out of range' , ( ) => {
664637 // eslint-disable-next-line no-underscore-dangle
665638 const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
666- const responseData = {
667- evaluations : {
668- relevance : { score : 1.5 , reasoning : 'Good' } ,
669- } ,
670- } ;
639+ const responseData = { score : 1.5 , reasoning : 'Good' } ;
671640
672- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
641+ const result = parseResponse ( responseData ) ;
673642
674643 expect ( result ) . toBeUndefined ( ) ;
675- expect ( mockLogger . warn ) . toHaveBeenCalledWith (
676- expect . stringContaining ( 'Invalid score evaluated for relevance: 1.5' ) ,
677- mockTrackData ,
678- ) ;
679644 } ) ;
680645
681646 it ( 'handles negative score values' , ( ) => {
682647 // eslint-disable-next-line no-underscore-dangle
683648 const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
684- const responseData = {
685- evaluations : {
686- relevance : { score : - 0.1 , reasoning : 'Good' } ,
687- } ,
688- } ;
649+ const responseData = { score : - 0.1 , reasoning : 'Good' } ;
689650
690- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
651+ const result = parseResponse ( responseData ) ;
691652
692653 expect ( result ) . toBeUndefined ( ) ;
693- expect ( mockLogger . warn ) . toHaveBeenCalledWith (
694- expect . stringContaining ( 'Invalid score evaluated for relevance: -0.1' ) ,
695- mockTrackData ,
696- ) ;
697654 } ) ;
698655
699656 it ( 'handles invalid reasoning type' , ( ) => {
700657 // eslint-disable-next-line no-underscore-dangle
701658 const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
702- const responseData = {
703- evaluations : {
704- relevance : { score : 0.8 , reasoning : 123 } ,
705- } ,
706- } ;
659+ const responseData = { score : 0.8 , reasoning : 123 } ;
707660
708- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
661+ const result = parseResponse ( responseData ) ;
709662
710663 expect ( result ) . toBeUndefined ( ) ;
711- expect ( mockLogger . warn ) . toHaveBeenCalledWith (
712- expect . stringContaining ( 'Invalid reasoning evaluated for relevance: 123' ) ,
713- mockTrackData ,
714- ) ;
715- } ) ;
716-
717- it ( 'handles missing evaluation when key does not exist in response' , ( ) => {
718- // eslint-disable-next-line no-underscore-dangle
719- const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
720- const responseData = {
721- evaluations : {
722- accuracy : { score : 0.9 , reasoning : 'Accurate' } ,
723- } ,
724- } ;
725-
726- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
727-
728- expect ( result ) . toBeUndefined ( ) ;
729- expect ( mockLogger . warn ) . toHaveBeenCalledWith (
730- 'Missing evaluation for metric key: relevance' ,
731- mockTrackData ,
732- ) ;
733664 } ) ;
734665
735666 it ( 'handles empty evaluationMetricKeys array fallback' , async ( ) => {
@@ -753,41 +684,5 @@ describe('Judge', () => {
753684 mockTrackData ,
754685 ) ;
755686 } ) ;
756-
757- it ( 'handles evaluation value that is not an object' , ( ) => {
758- // eslint-disable-next-line no-underscore-dangle
759- const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
760- const responseData = {
761- evaluations : {
762- relevance : 'not an object' ,
763- } ,
764- } ;
765-
766- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
767-
768- expect ( result ) . toBeUndefined ( ) ;
769- expect ( mockLogger . warn ) . toHaveBeenCalledWith (
770- 'Missing evaluation for metric key: relevance' ,
771- mockTrackData ,
772- ) ;
773- } ) ;
774-
775- it ( 'handles null evaluation value' , ( ) => {
776- // eslint-disable-next-line no-underscore-dangle
777- const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
778- const responseData = {
779- evaluations : {
780- relevance : null ,
781- } ,
782- } ;
783-
784- const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
785-
786- expect ( result ) . toBeUndefined ( ) ;
787- expect ( mockLogger . warn ) . toHaveBeenCalledWith (
788- 'Missing evaluation for metric key: relevance' ,
789- mockTrackData ,
790- ) ;
791- } ) ;
792687 } ) ;
793688} ) ;
0 commit comments