Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions packages/sdk/server-ai/__tests__/Evaluator.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import { LDAIJudgeConfig } from '../src/api/config/types';
import { Evaluator } from '../src/api/judge/Evaluator';
import { Judge } from '../src/api/judge/Judge';
import { LDJudgeResult } from '../src/api/judge/types';
import { AIProvider } from '../src/api/providers/AIProvider';

/**
 * Builds a minimal enabled judge config for tests.
 * The tracker is a bare stub — none of these tests exercise tracking.
 */
function makeJudgeConfig(key: string): LDAIJudgeConfig {
  const config: LDAIJudgeConfig = {
    key,
    enabled: true,
    evaluationMetricKey: '$ld:ai:judge:quality',
    messages: [{ role: 'system', content: 'You are a judge.' }],
    createTracker: () => ({}) as any,
  };
  return config;
}

/**
 * Creates a mocked AIProvider exposing only the two invocation methods
 * these tests touch; the rest of the interface is left unimplemented.
 */
function makeProvider(): jest.Mocked<AIProvider> {
  const provider = {
    invokeModel: jest.fn(),
    invokeStructuredModel: jest.fn(),
  };
  return provider as any;
}

describe('Evaluator', () => {
  describe('noop()', () => {
    it('returns an empty result array', async () => {
      const evaluator = Evaluator.noop();
      const results = await evaluator.evaluate('input', 'output');
      expect(results).toEqual([]);
    });
  });

  describe('evaluate()', () => {
    it('calls each configured judge and returns results', async () => {
      const mockProvider = makeProvider();
      const judgeConfig = makeJudgeConfig('judge-1');

      const mockResult: LDJudgeResult = {
        success: true,
        sampled: true,
        score: 0.9,
        reasoning: 'Good response',
        metricKey: '$ld:ai:judge:quality',
        judgeConfigKey: 'judge-1',
      };

      const judge = new Judge(judgeConfig, mockProvider, 1.0);
      jest.spyOn(judge, 'evaluate').mockResolvedValue(mockResult);

      const evaluator = new Evaluator([judge]);

      const results = await evaluator.evaluate('user input', 'ai output');

      expect(results).toHaveLength(1);
      expect(results[0]).toEqual(mockResult);
      // Evaluator does not pass a per-call samplingRate — judge uses its own.
      expect(judge.evaluate).toHaveBeenCalledWith('user input', 'ai output');
    });

    it('returns error result when judge throws', async () => {
      const mockProvider = makeProvider();
      const judgeConfig = makeJudgeConfig('judge-err');

      const judge = new Judge(judgeConfig, mockProvider, 1.0);
      jest.spyOn(judge, 'evaluate').mockRejectedValue(new Error('evaluation error'));

      const evaluator = new Evaluator([judge]);
      const results = await evaluator.evaluate('input', 'output');

      // A throwing judge is converted into a failed-but-sampled result that
      // carries the error message and identifies the originating judge config.
      expect(results).toHaveLength(1);
      expect(results[0].success).toBe(false);
      expect(results[0].sampled).toBe(true);
      expect(results[0].errorMessage).toBe('evaluation error');
      expect(results[0].judgeConfigKey).toBe('judge-err');
    });

    it('does NOT call tracker.trackJudgeResult', async () => {
      const mockProvider = makeProvider();
      const judgeConfig = makeJudgeConfig('judge-1');

      const mockResult: LDJudgeResult = {
        success: true,
        sampled: true,
        score: 0.8,
        reasoning: 'ok',
        metricKey: '$ld:ai:judge:quality',
      };

      const judge = new Judge(judgeConfig, mockProvider, 1.0);
      jest.spyOn(judge, 'evaluate').mockResolvedValue(mockResult);

      const evaluator = new Evaluator([judge]);

      // No tracker — if Evaluator tried to call trackJudgeResult this would throw or fail.
      const results = await evaluator.evaluate('input', 'output');

      // Assert the evaluation actually completed and returned the judge's result
      // (stronger than a tautological expect(true).toBe(true)).
      expect(results).toHaveLength(1);
      expect(results[0]).toEqual(mockResult);
    });

    it('runs multiple judges in parallel and returns all results', async () => {
      const makeJudge = (key: string, score: number): Judge => {
        const mockProvider = makeProvider();
        const jc = makeJudgeConfig(key);
        const j = new Judge(jc, mockProvider, 1.0);
        jest.spyOn(j, 'evaluate').mockResolvedValue({
          success: true,
          sampled: true,
          score,
          reasoning: 'ok',
          metricKey: '$ld:ai:judge:quality',
        });
        return j;
      };

      const evaluator = new Evaluator([makeJudge('judge-a', 0.5), makeJudge('judge-b', 0.9)]);

      const results = await evaluator.evaluate('input', 'output');

      expect(results).toHaveLength(2);
      // Use a numeric comparator: the default Array.prototype.sort() compares
      // elements as strings, which misorders general numeric values.
      const scores = results.map((r) => r.score).sort((a, b) => (a ?? 0) - (b ?? 0));
      expect(scores).toEqual([0.5, 0.9]);
    });
  });
});
83 changes: 71 additions & 12 deletions packages/sdk/server-ai/__tests__/Judge.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,76 @@ describe('Judge', () => {

describe('constructor', () => {
  it('initializes with proper configuration', () => {
    // Construction alone should succeed with a full argument list.
    expect(new Judge(judgeConfig, mockProvider, 1.0, mockLogger)).toBeDefined();
  });

  it('defaults sampleRate to 1.0 when omitted', () => {
    expect(new Judge(judgeConfig, mockProvider).sampleRate).toBe(1.0);
  });

  it('exposes the sampleRate provided to the constructor', () => {
    expect(new Judge(judgeConfig, mockProvider, 0.25, mockLogger).sampleRate).toBe(0.25);
  });

  it('honors a sampleRate of 0', () => {
    // Zero is a valid rate (never sample) and must not fall back to the default.
    expect(new Judge(judgeConfig, mockProvider, 0, mockLogger).sampleRate).toBe(0);
  });
});

describe('sampling fallback in evaluate()', () => {
  // Skip decision appears to be `Math.random() > effectiveRate` — a pinned
  // random value above the rate forces the skip path deterministically.
  // Each spy restore is in a finally block so a failing expectation cannot
  // leak the Math.random mock into subsequent tests.

  it('uses the constructor sampleRate when no per-call rate is supplied', async () => {
    // rate 0.5, random pinned to 0.6 → 0.6 > 0.5 → skip.
    const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.6);
    try {
      const judge = new Judge(judgeConfig, mockProvider, 0.5, mockLogger);
      const result = await judge.evaluate('input', 'output');

      // Skipped due to sampling: sampled stays false, provider never invoked.
      expect(result.sampled).toBe(false);
      expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
    } finally {
      randomSpy.mockRestore();
    }
  });

  it('honors an explicit per-call samplingRate of 0 over the constructor default', async () => {
    // Pin random to 0.5 so a per-call rate of 0 deterministically skips
    // (0.5 > 0), even though the constructor rate of 1.0 would always sample.
    const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.5);
    try {
      const judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
      const result = await judge.evaluate('input', 'output', 0);

      expect(result.sampled).toBe(false);
      expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
    } finally {
      randomSpy.mockRestore();
    }
  });

  it('per-call samplingRate of undefined falls through to the constructor default', async () => {
    // Constructor rate 0, per-call rate explicitly undefined → effective rate 0 → skip.
    const randomSpy = jest.spyOn(Math, 'random').mockReturnValue(0.5);
    try {
      const judge = new Judge(judgeConfig, mockProvider, 0, mockLogger);
      const result = await judge.evaluate('input', 'output', undefined);

      expect(result.sampled).toBe(false);
      expect(mockProvider.invokeStructuredModel).not.toHaveBeenCalled();
    } finally {
      randomSpy.mockRestore();
    }
  });
});

describe('evaluate', () => {
let judge: Judge;

beforeEach(() => {
judge = new Judge(judgeConfig, mockProvider, mockLogger);
judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
});

it('evaluates AI response successfully', async () => {
Expand Down Expand Up @@ -205,7 +264,7 @@ describe('Judge', () => {
evaluationMetricKey: undefined,
evaluationMetricKeys: [],
};
const judgeWithoutMetrics = new Judge(configWithoutMetrics, mockProvider, mockLogger);
const judgeWithoutMetrics = new Judge(configWithoutMetrics, mockProvider, 1.0, mockLogger);

const result = await judgeWithoutMetrics.evaluate('test input', 'test output');

Expand All @@ -227,7 +286,7 @@ describe('Judge', () => {
evaluationMetricKey: 'relevance',
evaluationMetricKeys: undefined,
};
const judgeWithSingleKey = new Judge(configWithSingleKey, mockProvider, mockLogger);
const judgeWithSingleKey = new Judge(configWithSingleKey, mockProvider, 1.0, mockLogger);

const mockStructuredResponse: StructuredResponse = {
data: {
Expand Down Expand Up @@ -265,7 +324,7 @@ describe('Judge', () => {
evaluationMetricKey: undefined,
evaluationMetricKeys: ['relevance', 'accuracy'],
};
const judgeWithLegacyKeys = new Judge(configWithLegacyKeys, mockProvider, mockLogger);
const judgeWithLegacyKeys = new Judge(configWithLegacyKeys, mockProvider, 1.0, mockLogger);

const mockStructuredResponse: StructuredResponse = {
data: {
Expand Down Expand Up @@ -303,7 +362,7 @@ describe('Judge', () => {
evaluationMetricKey: undefined,
evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'],
};
const judgeWithInvalidKeys = new Judge(configWithInvalidKeys, mockProvider, mockLogger);
const judgeWithInvalidKeys = new Judge(configWithInvalidKeys, mockProvider, 1.0, mockLogger);

const mockStructuredResponse: StructuredResponse = {
data: {
Expand Down Expand Up @@ -342,7 +401,7 @@ describe('Judge', () => {
evaluationMetricKey: 'helpfulness',
evaluationMetricKeys: ['relevance', 'accuracy'],
};
const judgeWithBoth = new Judge(configWithBoth, mockProvider, mockLogger);
const judgeWithBoth = new Judge(configWithBoth, mockProvider, 1.0, mockLogger);

const mockStructuredResponse: StructuredResponse = {
data: {
Expand Down Expand Up @@ -379,7 +438,7 @@ describe('Judge', () => {
...judgeConfig,
messages: undefined,
};
const judgeWithoutMessages = new Judge(configWithoutMessages, mockProvider, mockLogger);
const judgeWithoutMessages = new Judge(configWithoutMessages, mockProvider, 1.0, mockLogger);

const result = await judgeWithoutMessages.evaluate('test input', 'test output');

Expand Down Expand Up @@ -488,7 +547,7 @@ describe('Judge', () => {
let judge: Judge;

beforeEach(() => {
judge = new Judge(judgeConfig, mockProvider, mockLogger);
judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
});

it('evaluates messages and response successfully', async () => {
Expand Down Expand Up @@ -573,7 +632,7 @@ describe('Judge', () => {
let judge: Judge;

beforeEach(() => {
judge = new Judge(judgeConfig, mockProvider, mockLogger);
judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
});

it('constructs evaluation messages correctly', () => {
Expand All @@ -598,7 +657,7 @@ describe('Judge', () => {
let judge: Judge;

beforeEach(() => {
judge = new Judge(judgeConfig, mockProvider, mockLogger);
judge = new Judge(judgeConfig, mockProvider, 1.0, mockLogger);
});

it('parses valid evaluation response correctly', () => {
Expand Down Expand Up @@ -669,7 +728,7 @@ describe('Judge', () => {
evaluationMetricKey: undefined,
evaluationMetricKeys: [],
};
const judgeWithEmptyKeys = new Judge(configWithEmptyKeys, mockProvider, mockLogger);
const judgeWithEmptyKeys = new Judge(configWithEmptyKeys, mockProvider, 1.0, mockLogger);

const result = await judgeWithEmptyKeys.evaluate('test input', 'test output');

Expand Down
2 changes: 1 addition & 1 deletion packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ describe('createJudge method', () => {
response_to_evaluate: '{{response_to_evaluate}}',
});
expect(AIProviderFactory.create).toHaveBeenCalledWith(mockJudgeConfig, undefined, undefined);
expect(Judge).toHaveBeenCalledWith(mockJudgeConfig, mockProvider, undefined);
expect(Judge).toHaveBeenCalledWith(mockJudgeConfig, mockProvider, 1.0, undefined);
expect(result).toBe(mockJudge);
judgeConfigSpy.mockRestore();
});
Expand Down
Loading
Loading