@@ -98,13 +98,11 @@ describe('Judge', () => {
9898 ) ;
9999
100100 expect ( result ) . toEqual ( {
101- evals : {
102- relevance : {
103- score : 0.8 ,
104- reasoning : 'The response is relevant to the question' ,
105- } ,
106- } ,
101+ score : 0.8 ,
102+ reasoning : 'The response is relevant to the question' ,
103+ metricKey : 'relevance' ,
107104 success : true ,
105+ sampled : true ,
108106 judgeConfigKey : 'test-judge' ,
109107 } ) ;
110108
@@ -148,12 +146,11 @@ describe('Judge', () => {
148146 const result = await judge . evaluate ( 'test input' , 'test output' ) ;
149147
150148 expect ( result ) . toBeDefined ( ) ;
151- expect ( result ?. evals ) . toHaveProperty ( 'relevance' ) ;
152- expect ( result ?. evals . relevance . score ) . toBe ( 0.85 ) ;
153- expect ( result ?. judgeConfigKey ) . toBe ( 'test-judge' ) ;
154- expect ( result ?. success ) . toBe ( true ) ;
155- // Verify the evaluationMetricKey from config is used in the result
156- expect ( Object . keys ( result ?. evals || { } ) ) . toContain ( judgeConfig . evaluationMetricKey ) ;
149+ expect ( result . score ) . toBe ( 0.85 ) ;
150+ expect ( result . metricKey ) . toBe ( 'relevance' ) ;
151+ expect ( result . judgeConfigKey ) . toBe ( 'test-judge' ) ;
152+ expect ( result . success ) . toBe ( true ) ;
153+ expect ( result . sampled ) . toBe ( true ) ;
157154 } ) ;
158155
159156 it ( 'handles sampling rate correctly' , async ( ) => {
@@ -183,18 +180,23 @@ describe('Judge', () => {
183180 const result = await judge . evaluate ( 'test input' , 'test output' , 0.5 ) ;
184181
185182 expect ( result ) . toBeDefined ( ) ;
183+ expect ( result . sampled ) . toBe ( true ) ;
186184 expect ( mockProvider . invokeStructuredModel ) . toHaveBeenCalled ( ) ;
187185
188186 Math . random = originalRandom ;
189187 } ) ;
190188
191- it ( 'returns undefined when not sampled ' , async ( ) => {
189+ it ( 'returns unsampled result when skipped by sampling ' , async ( ) => {
192190 const originalRandom = Math . random ;
193191 Math . random = jest . fn ( ) . mockReturnValue ( 0.8 ) ;
194192
195193 const result = await judge . evaluate ( 'test input' , 'test output' , 0.5 ) ;
196194
197- expect ( result ) . toBeUndefined ( ) ;
195+ expect ( result ) . toEqual ( {
196+ success : false ,
197+ sampled : false ,
198+ judgeConfigKey : 'test-judge' ,
199+ } ) ;
198200 expect ( mockProvider . invokeStructuredModel ) . not . toHaveBeenCalled ( ) ;
199201 expect ( mockLogger . debug ) . toHaveBeenCalledWith (
200202 'Judge evaluation skipped due to sampling rate: 0.5' ,
@@ -203,7 +205,7 @@ describe('Judge', () => {
203205 Math . random = originalRandom ;
204206 } ) ;
205207
206- it ( 'returns undefined when evaluationMetricKey and evaluationMetricKeys are both missing' , async ( ) => {
208+ it ( 'returns error result when evaluationMetricKey and evaluationMetricKeys are both missing' , async ( ) => {
207209 const configWithoutMetrics : LDAIJudgeConfig = {
208210 ...judgeConfig ,
209211 evaluationMetricKey : undefined ,
@@ -213,7 +215,12 @@ describe('Judge', () => {
213215
214216 const result = await judgeWithoutMetrics . evaluate ( 'test input' , 'test output' ) ;
215217
216- expect ( result ) . toBeUndefined ( ) ;
218+ expect ( result ) . toEqual ( {
219+ success : false ,
220+ sampled : true ,
221+ errorMessage : 'Judge configuration is missing required evaluation metric key' ,
222+ judgeConfigKey : 'test-judge' ,
223+ } ) ;
217224 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
218225 'Judge configuration is missing required evaluation metric key' ,
219226 mockTrackData ,
@@ -251,10 +258,11 @@ describe('Judge', () => {
251258 const result = await judgeWithSingleKey . evaluate ( 'test input' , 'test output' ) ;
252259
253260 expect ( result ) . toEqual ( {
254- evals : {
255- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
256- } ,
261+ score : 0.8 ,
262+ reasoning : 'The response is relevant' ,
263+ metricKey : 'relevance' ,
257264 success : true ,
265+ sampled : true ,
258266 judgeConfigKey : 'test-judge' ,
259267 } ) ;
260268 } ) ;
@@ -290,10 +298,11 @@ describe('Judge', () => {
290298 const result = await judgeWithLegacyKeys . evaluate ( 'test input' , 'test output' ) ;
291299
292300 expect ( result ) . toEqual ( {
293- evals : {
294- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
295- } ,
301+ score : 0.8 ,
302+ reasoning : 'The response is relevant' ,
303+ metricKey : 'relevance' ,
296304 success : true ,
305+ sampled : true ,
297306 judgeConfigKey : 'test-judge' ,
298307 } ) ;
299308 } ) ;
@@ -330,10 +339,11 @@ describe('Judge', () => {
330339
331340 // Should skip empty and whitespace strings, use first valid value
332341 expect ( result ) . toEqual ( {
333- evals : {
334- relevance : { score : 0.8 , reasoning : 'The response is relevant' } ,
335- } ,
342+ score : 0.8 ,
343+ reasoning : 'The response is relevant' ,
344+ metricKey : 'relevance' ,
336345 success : true ,
346+ sampled : true ,
337347 judgeConfigKey : 'test-judge' ,
338348 } ) ;
339349 } ) ;
@@ -369,15 +379,16 @@ describe('Judge', () => {
369379 const result = await judgeWithBoth . evaluate ( 'test input' , 'test output' ) ;
370380
371381 expect ( result ) . toEqual ( {
372- evals : {
373- helpfulness : { score : 0.7 , reasoning : 'The response is helpful' } ,
374- } ,
382+ score : 0.7 ,
383+ reasoning : 'The response is helpful' ,
384+ metricKey : 'helpfulness' ,
375385 success : true ,
386+ sampled : true ,
376387 judgeConfigKey : 'test-judge' ,
377388 } ) ;
378389 } ) ;
379390
380- it ( 'returns undefined when messages are missing' , async ( ) => {
391+ it ( 'returns error result when messages are missing' , async ( ) => {
381392 const configWithoutMessages : LDAIJudgeConfig = {
382393 ...judgeConfig ,
383394 messages : undefined ,
@@ -386,14 +397,19 @@ describe('Judge', () => {
386397
387398 const result = await judgeWithoutMessages . evaluate ( 'test input' , 'test output' ) ;
388399
389- expect ( result ) . toBeUndefined ( ) ;
400+ expect ( result ) . toEqual ( {
401+ success : false ,
402+ sampled : true ,
403+ errorMessage : 'Judge configuration must include messages' ,
404+ judgeConfigKey : 'test-judge' ,
405+ } ) ;
390406 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
391407 'Judge configuration must include messages' ,
392408 mockTrackData ,
393409 ) ;
394410 } ) ;
395411
396- it ( 'returns empty evaluations with success false when expected metric is missing' , async ( ) => {
412+ it ( 'returns result with success false when expected metric is missing' , async ( ) => {
397413 const mockStructuredResponse : StructuredResponse = {
398414 data : {
399415 evaluations : {
@@ -417,13 +433,13 @@ describe('Judge', () => {
417433 const result = await judge . evaluate ( 'test input' , 'test output' ) ;
418434
419435 expect ( result ) . toEqual ( {
420- evals : { } ,
421436 success : false ,
437+ sampled : true ,
422438 judgeConfigKey : 'test-judge' ,
423439 } ) ;
424440 } ) ;
425441
426- it ( 'returns empty evaluations when response structure is malformed' , async ( ) => {
442+ it ( 'returns result with success false when response structure is malformed' , async ( ) => {
427443 const mockStructuredResponse : StructuredResponse = {
428444 data : {
429445 relevance : { score : 0.8 , reasoning : 'Good' } ,
@@ -447,8 +463,8 @@ describe('Judge', () => {
447463 const result = await judge . evaluate ( 'test input' , 'test output' ) ;
448464
449465 expect ( result ) . toEqual ( {
450- evals : { } ,
451466 success : false ,
467+ sampled : true ,
452468 judgeConfigKey : 'test-judge' ,
453469 } ) ;
454470 } ) ;
@@ -460,9 +476,9 @@ describe('Judge', () => {
460476 const result = await judge . evaluate ( 'test input' , 'test output' ) ;
461477
462478 expect ( result ) . toEqual ( {
463- evals : { } ,
464479 success : false ,
465- error : 'Provider error' ,
480+ sampled : true ,
481+ errorMessage : 'Provider error' ,
466482 judgeConfigKey : 'test-judge' ,
467483 } ) ;
468484 expect ( mockLogger . error ) . toHaveBeenCalledWith ( 'Judge evaluation failed:' , error ) ;
@@ -474,9 +490,9 @@ describe('Judge', () => {
474490 const result = await judge . evaluate ( 'test input' , 'test output' ) ;
475491
476492 expect ( result ) . toEqual ( {
477- evals : { } ,
478493 success : false ,
479- error : 'Unknown error' ,
494+ sampled : true ,
495+ errorMessage : 'Unknown error' ,
480496 judgeConfigKey : 'test-judge' ,
481497 } ) ;
482498 } ) ;
@@ -522,13 +538,11 @@ describe('Judge', () => {
522538 const result = await judge . evaluateMessages ( messages , response ) ;
523539
524540 expect ( result ) . toEqual ( {
525- evals : {
526- relevance : {
527- score : 0.8 ,
528- reasoning : 'The response is relevant to the question' ,
529- } ,
530- } ,
541+ score : 0.8 ,
542+ reasoning : 'The response is relevant to the question' ,
543+ metricKey : 'relevance' ,
531544 success : true ,
545+ sampled : true ,
532546 judgeConfigKey : 'test-judge' ,
533547 } ) ;
534548
@@ -560,7 +574,11 @@ describe('Judge', () => {
560574
561575 const result = await judge . evaluateMessages ( messages , response , 0.5 ) ;
562576
563- expect ( result ) . toBeUndefined ( ) ;
577+ expect ( result ) . toEqual ( {
578+ success : false ,
579+ sampled : false ,
580+ judgeConfigKey : 'test-judge' ,
581+ } ) ;
564582 expect ( mockProvider . invokeStructuredModel ) . not . toHaveBeenCalled ( ) ;
565583
566584 Math . random = originalRandom ;
@@ -611,11 +629,12 @@ describe('Judge', () => {
611629 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
612630
613631 expect ( result ) . toEqual ( {
614- relevance : { score : 0.8 , reasoning : 'Good' } ,
632+ score : 0.8 ,
633+ reasoning : 'Good' ,
615634 } ) ;
616635 } ) ;
617636
618- it ( 'returns empty object for invalid response data' , ( ) => {
637+ it ( 'returns undefined for invalid response data' , ( ) => {
619638 // eslint-disable-next-line no-underscore-dangle
620639 const parseResponse = ( judge as any ) . _parseEvaluationResponse . bind ( judge ) ;
621640 const responseData = {
@@ -624,7 +643,7 @@ describe('Judge', () => {
624643
625644 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
626645
627- expect ( result ) . toEqual ( { } ) ;
646+ expect ( result ) . toBeUndefined ( ) ;
628647 } ) ;
629648
630649 it ( 'handles missing score or reasoning fields' , ( ) => {
@@ -638,7 +657,7 @@ describe('Judge', () => {
638657
639658 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
640659
641- expect ( result ) . toEqual ( { } ) ;
660+ expect ( result ) . toBeUndefined ( ) ;
642661 } ) ;
643662
644663 it ( 'handles invalid score values out of range' , ( ) => {
@@ -652,7 +671,7 @@ describe('Judge', () => {
652671
653672 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
654673
655- expect ( result ) . toEqual ( { } ) ;
674+ expect ( result ) . toBeUndefined ( ) ;
656675 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
657676 expect . stringContaining ( 'Invalid score evaluated for relevance: 1.5' ) ,
658677 mockTrackData ,
@@ -670,7 +689,7 @@ describe('Judge', () => {
670689
671690 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
672691
673- expect ( result ) . toEqual ( { } ) ;
692+ expect ( result ) . toBeUndefined ( ) ;
674693 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
675694 expect . stringContaining ( 'Invalid score evaluated for relevance: -0.1' ) ,
676695 mockTrackData ,
@@ -688,7 +707,7 @@ describe('Judge', () => {
688707
689708 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
690709
691- expect ( result ) . toEqual ( { } ) ;
710+ expect ( result ) . toBeUndefined ( ) ;
692711 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
693712 expect . stringContaining ( 'Invalid reasoning evaluated for relevance: 123' ) ,
694713 mockTrackData ,
@@ -706,7 +725,7 @@ describe('Judge', () => {
706725
707726 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
708727
709- expect ( result ) . toEqual ( { } ) ;
728+ expect ( result ) . toBeUndefined ( ) ;
710729 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
711730 'Missing evaluation for metric key: relevance' ,
712731 mockTrackData ,
@@ -723,7 +742,12 @@ describe('Judge', () => {
723742
724743 const result = await judgeWithEmptyKeys . evaluate ( 'test input' , 'test output' ) ;
725744
726- expect ( result ) . toBeUndefined ( ) ;
745+ expect ( result ) . toEqual ( {
746+ success : false ,
747+ sampled : true ,
748+ errorMessage : 'Judge configuration is missing required evaluation metric key' ,
749+ judgeConfigKey : 'test-judge' ,
750+ } ) ;
727751 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
728752 'Judge configuration is missing required evaluation metric key' ,
729753 mockTrackData ,
@@ -741,7 +765,7 @@ describe('Judge', () => {
741765
742766 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
743767
744- expect ( result ) . toEqual ( { } ) ;
768+ expect ( result ) . toBeUndefined ( ) ;
745769 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
746770 'Missing evaluation for metric key: relevance' ,
747771 mockTrackData ,
@@ -759,7 +783,7 @@ describe('Judge', () => {
759783
760784 const result = parseResponse ( responseData , 'relevance' , mockTracker ) ;
761785
762- expect ( result ) . toEqual ( { } ) ;
786+ expect ( result ) . toBeUndefined ( ) ;
763787 expect ( mockLogger . warn ) . toHaveBeenCalledWith (
764788 'Missing evaluation for metric key: relevance' ,
765789 mockTrackData ,
0 commit comments