Skip to content

Commit 2066e70

Browse files
authored
feat: Added custom judge support for ai configs (#1073)
**Requirements**

- [X] I have added test coverage for new or changed functionality
- [X] I have followed the repository's [pull request submission guidelines](../blob/main/CONTRIBUTING.md#submitting-pull-requests)
- [X] I have validated my changes against all supported platform versions

**Related issues**

Node version of launchdarkly/python-server-sdk-ai#86

**Describe the solution you've provided**

See launchdarkly/python-server-sdk-ai#86

**Describe alternatives you've considered**

Provide a clear and concise description of any alternative solutions or features you've considered.

**Additional context**

Add any other context about the pull request here.

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> Switches judge evaluation to a single metric key while preserving backward compatibility.
>
> - **API/types:** Add optional `evaluationMetricKey` to `LDAIJudgeConfig(Default)` and deprecate array usage; examples updated in `LDAIClient.ts`
> - **Config utils:** Map flag values to prefer `evaluationMetricKey` and fall back to the first valid entry in `evaluationMetricKeys`; include key when converting defaults
> - **Schema:** `EvaluationSchemaBuilder` now builds response schema for one required metric key
> - **Judge behavior:** Determine metric via `_getEvaluationMetricKey`; require messages; parse/validate only that key; mark `success: false` if missing/invalid; updated warnings
> - **Client/tests:** `LDAIClientImpl` and `Judge` tests updated for new key semantics and legacy fallbacks; added tests for invalid/whitespace keys and sampling
> - **Tracking:** Add tests for `trackJudgeResponse` handling single/multiple eval metrics
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 288ee6d. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
1 parent 8f59982 commit 2066e70

8 files changed

Lines changed: 652 additions & 133 deletions

File tree

packages/sdk/server-ai/__tests__/Judge.test.ts

Lines changed: 347 additions & 69 deletions
Large diffs are not rendered by default.

packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts

Lines changed: 135 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ describe('config evaluation', () => {
133133
evaluateSpy.mockRestore();
134134
});
135135

136-
it('evaluates judge config successfully', async () => {
136+
it('evaluates judge config successfully with evaluationMetricKeys (legacy)', async () => {
137137
const client = new LDAIClientImpl(mockLdClient);
138138
const key = 'test-judge';
139139
const defaultValue: LDAIJudgeConfigDefault = {
@@ -159,7 +159,140 @@ describe('config evaluation', () => {
159159
const result = await client.judgeConfig(key, testContext, defaultValue);
160160

161161
expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
162-
expect(result.evaluationMetricKeys).toEqual(['relevance', 'accuracy']);
162+
// Should use first value from evaluationMetricKeys
163+
expect(result.evaluationMetricKey).toBe('relevance');
164+
expect(result.tracker).toBeDefined();
165+
expect(result.enabled).toBe(true);
166+
evaluateSpy.mockRestore();
167+
});
168+
169+
it('evaluates judge config successfully with evaluationMetricKey', async () => {
170+
const client = new LDAIClientImpl(mockLdClient);
171+
const key = 'test-judge';
172+
const defaultValue: LDAIJudgeConfigDefault = {
173+
enabled: false,
174+
};
175+
176+
const mockVariation = {
177+
enabled: true,
178+
model: { name: 'gpt-4' },
179+
provider: { name: 'openai' },
180+
evaluationMetricKey: 'relevance',
181+
messages: [{ role: 'system', content: 'You are a judge.' }],
182+
_ldMeta: {
183+
variationKey: 'v1',
184+
enabled: true,
185+
mode: 'judge',
186+
},
187+
};
188+
189+
mockLdClient.variation.mockResolvedValue(mockVariation);
190+
191+
const evaluateSpy = jest.spyOn(client as any, '_evaluate');
192+
const result = await client.judgeConfig(key, testContext, defaultValue);
193+
194+
expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
195+
expect(result.evaluationMetricKey).toBe('relevance');
196+
expect(result.tracker).toBeDefined();
197+
expect(result.enabled).toBe(true);
198+
evaluateSpy.mockRestore();
199+
});
200+
201+
it('prioritizes evaluationMetricKey over evaluationMetricKeys when both are provided', async () => {
202+
const client = new LDAIClientImpl(mockLdClient);
203+
const key = 'test-judge';
204+
const defaultValue: LDAIJudgeConfigDefault = {
205+
enabled: false,
206+
};
207+
208+
const mockVariation = {
209+
enabled: true,
210+
model: { name: 'gpt-4' },
211+
provider: { name: 'openai' },
212+
evaluationMetricKey: 'helpfulness',
213+
evaluationMetricKeys: ['relevance', 'accuracy'],
214+
messages: [{ role: 'system', content: 'You are a judge.' }],
215+
_ldMeta: {
216+
variationKey: 'v1',
217+
enabled: true,
218+
mode: 'judge',
219+
},
220+
};
221+
222+
mockLdClient.variation.mockResolvedValue(mockVariation);
223+
224+
const evaluateSpy = jest.spyOn(client as any, '_evaluate');
225+
const result = await client.judgeConfig(key, testContext, defaultValue);
226+
227+
expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
228+
expect(result.evaluationMetricKey).toBe('helpfulness');
229+
expect(result.tracker).toBeDefined();
230+
expect(result.enabled).toBe(true);
231+
evaluateSpy.mockRestore();
232+
});
233+
234+
it('treats empty string evaluationMetricKey as invalid and falls back to evaluationMetricKeys', async () => {
235+
const client = new LDAIClientImpl(mockLdClient);
236+
const key = 'test-judge';
237+
const defaultValue: LDAIJudgeConfigDefault = {
238+
enabled: false,
239+
};
240+
241+
const mockVariation = {
242+
enabled: true,
243+
model: { name: 'gpt-4' },
244+
provider: { name: 'openai' },
245+
evaluationMetricKey: '',
246+
evaluationMetricKeys: ['relevance', 'accuracy'],
247+
messages: [{ role: 'system', content: 'You are a judge.' }],
248+
_ldMeta: {
249+
variationKey: 'v1',
250+
enabled: true,
251+
mode: 'judge',
252+
},
253+
};
254+
255+
mockLdClient.variation.mockResolvedValue(mockVariation);
256+
257+
const evaluateSpy = jest.spyOn(client as any, '_evaluate');
258+
const result = await client.judgeConfig(key, testContext, defaultValue);
259+
260+
expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
261+
// An empty string is treated as invalid, so the first value in evaluationMetricKeys is used instead
262+
expect(result.evaluationMetricKey).toBe('relevance');
263+
expect(result.tracker).toBeDefined();
264+
expect(result.enabled).toBe(true);
265+
evaluateSpy.mockRestore();
266+
});
267+
268+
it('skips empty and whitespace-only strings in evaluationMetricKeys array', async () => {
269+
const client = new LDAIClientImpl(mockLdClient);
270+
const key = 'test-judge';
271+
const defaultValue: LDAIJudgeConfigDefault = {
272+
enabled: false,
273+
};
274+
275+
const mockVariation = {
276+
enabled: true,
277+
model: { name: 'gpt-4' },
278+
provider: { name: 'openai' },
279+
evaluationMetricKeys: ['', ' ', 'relevance', 'accuracy'],
280+
messages: [{ role: 'system', content: 'You are a judge.' }],
281+
_ldMeta: {
282+
variationKey: 'v1',
283+
enabled: true,
284+
mode: 'judge',
285+
},
286+
};
287+
288+
mockLdClient.variation.mockResolvedValue(mockVariation);
289+
290+
const evaluateSpy = jest.spyOn(client as any, '_evaluate');
291+
const result = await client.judgeConfig(key, testContext, defaultValue);
292+
293+
expect(evaluateSpy).toHaveBeenCalledWith(key, testContext, defaultValue, 'judge', undefined);
294+
// Should skip empty and whitespace strings, use first valid value
295+
expect(result.evaluationMetricKey).toBe('relevance');
163296
expect(result.tracker).toBeDefined();
164297
expect(result.enabled).toBe(true);
165298
evaluateSpy.mockRestore();

packages/sdk/server-ai/__tests__/LDAIConfigTrackerImpl.test.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -813,3 +813,70 @@ describe('trackMetricsOf', () => {
813813
);
814814
});
815815
});
816+
817+
describe('trackJudgeResponse', () => {
818+
it('tracks evaluation metric key with score', () => {
819+
const tracker = new LDAIConfigTrackerImpl(
820+
mockLdClient,
821+
configKey,
822+
variationKey,
823+
version,
824+
modelName,
825+
providerName,
826+
testContext,
827+
);
828+
829+
const judgeResponse = {
830+
judgeConfigKey: 'test-judge',
831+
evals: {
832+
relevance: { score: 0.8, reasoning: 'The response is relevant' },
833+
},
834+
success: true,
835+
};
836+
837+
tracker.trackJudgeResponse(judgeResponse);
838+
839+
expect(mockTrack).toHaveBeenCalledWith(
840+
'relevance',
841+
testContext,
842+
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
843+
0.8,
844+
);
845+
});
846+
847+
it('tracks multiple evaluation metrics when present', () => {
848+
const tracker = new LDAIConfigTrackerImpl(
849+
mockLdClient,
850+
configKey,
851+
variationKey,
852+
version,
853+
modelName,
854+
providerName,
855+
testContext,
856+
);
857+
858+
const judgeResponse = {
859+
judgeConfigKey: 'test-judge',
860+
evals: {
861+
relevance: { score: 0.8, reasoning: 'Relevant' },
862+
accuracy: { score: 0.9, reasoning: 'Accurate' },
863+
},
864+
success: true,
865+
};
866+
867+
tracker.trackJudgeResponse(judgeResponse);
868+
869+
expect(mockTrack).toHaveBeenCalledWith(
870+
'relevance',
871+
testContext,
872+
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
873+
0.8,
874+
);
875+
expect(mockTrack).toHaveBeenCalledWith(
876+
'accuracy',
877+
testContext,
878+
{ ...getExpectedTrackData(), judgeConfigKey: 'test-judge' },
879+
0.9,
880+
);
881+
});
882+
});

packages/sdk/server-ai/src/api/LDAIClient.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ export interface LDAIClient {
156156
* enabled: true,
157157
* model: { name: 'gpt-4' },
158158
* provider: { name: 'openai' },
159-
* evaluationMetricKeys: ['$ld:ai:judge:relevance'],
159+
* evaluationMetricKey: '$ld:ai:judge:relevance',
160160
* messages: [{ role: 'system', content: 'You are a relevance judge.' }]
161161
* }, variables);
162162
*
@@ -303,7 +303,7 @@ export interface LDAIClient {
303303
* enabled: true,
304304
* model: { name: "gpt-4" },
305305
* provider: { name: "openai" },
306-
* evaluationMetricKeys: ['$ld:ai:judge:relevance'],
306+
* evaluationMetricKey: '$ld:ai:judge:relevance',
307307
* messages: [{ role: 'system', content: 'You are a relevance judge.' }]
308308
* },
309309
* { metric: "relevance" }

packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ export interface LDAIConfigFlagValue {
2929
messages?: LDMessage[];
3030
provider?: LDProviderConfig;
3131
instructions?: string;
32+
evaluationMetricKey?: string;
3233
evaluationMetricKeys?: string[];
3334
judgeConfiguration?: LDJudgeConfiguration;
3435
}
@@ -65,6 +66,9 @@ export class LDAIConfigUtils {
6566
if ('instructions' in config && config.instructions !== undefined) {
6667
flagValue.instructions = config.instructions;
6768
}
69+
if ('evaluationMetricKey' in config && config.evaluationMetricKey !== undefined) {
70+
flagValue.evaluationMetricKey = config.evaluationMetricKey;
71+
}
6872
if ('evaluationMetricKeys' in config && config.evaluationMetricKeys !== undefined) {
6973
flagValue.evaluationMetricKeys = config.evaluationMetricKeys;
7074
}
@@ -121,7 +125,6 @@ export class LDAIConfigUtils {
121125
key,
122126
enabled: false,
123127
tracker: undefined,
124-
evaluationMetricKeys: [],
125128
} as LDAIJudgeConfig;
126129
case 'completion':
127130
default:
@@ -202,11 +205,22 @@ export class LDAIConfigUtils {
202205
flagValue: LDAIConfigFlagValue,
203206
tracker: LDAIConfigTracker,
204207
): LDAIJudgeConfig {
208+
// Prefer evaluationMetricKey; otherwise fall back to the first valid (non-empty, non-whitespace) value in evaluationMetricKeys
209+
let evaluationMetricKey: string | undefined;
210+
if (flagValue.evaluationMetricKey && flagValue.evaluationMetricKey.trim().length > 0) {
211+
evaluationMetricKey = flagValue.evaluationMetricKey.trim();
212+
} else if (flagValue.evaluationMetricKeys && flagValue.evaluationMetricKeys.length > 0) {
213+
const validKey = flagValue.evaluationMetricKeys.find(
214+
(metricKey) => metricKey && metricKey.trim().length > 0,
215+
);
216+
evaluationMetricKey = validKey ? validKey.trim() : undefined;
217+
}
218+
205219
return {
206220
...this._toBaseConfig(key, flagValue),
207221
tracker,
208222
messages: flagValue.messages,
209-
evaluationMetricKeys: flagValue.evaluationMetricKeys || [],
223+
evaluationMetricKey,
210224
};
211225
}
212226
}

packages/sdk/server-ai/src/api/config/types.ts

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,14 @@ export interface LDAIJudgeConfigDefault extends LDAIConfigDefault {
154154
*/
155155
messages?: LDMessage[];
156156
/**
157-
* Evaluation metric keys for judge configurations.
157+
* Evaluation metric key for judge configurations.
158+
* The key of the metric that this judge can evaluate.
159+
*/
160+
evaluationMetricKey?: string;
161+
/**
162+
* Evaluation metric keys for judge configurations (legacy).
158163
* The keys of the metrics that this judge can evaluate.
164+
* @deprecated Use evaluationMetricKey instead. This field is kept for legacy support.
159165
*/
160166
evaluationMetricKeys?: string[];
161167
}
@@ -211,10 +217,16 @@ export interface LDAIJudgeConfig extends LDAIConfig {
211217
*/
212218
messages?: LDMessage[];
213219
/**
214-
* Evaluation metric keys for judge configurations.
220+
* Evaluation metric key for judge configurations.
221+
* The key of the metric that this judge can evaluate.
222+
*/
223+
evaluationMetricKey?: string;
224+
/**
225+
* Evaluation metric keys for judge configurations (legacy).
215226
* The keys of the metrics that this judge can evaluate.
227+
* @deprecated Use evaluationMetricKey instead. This field is kept for legacy support.
216228
*/
217-
evaluationMetricKeys: string[];
229+
evaluationMetricKeys?: string[];
218230
}
219231

220232
// ============================================================================

packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,20 @@
33
* Not exported - only used internally by TrackedJudge.
44
*/
55
class EvaluationSchemaBuilder {
6-
static build(evaluationMetricKeys: string[]): Record<string, unknown> {
6+
static build(evaluationMetricKey?: string): Record<string, unknown> {
7+
if (!evaluationMetricKey) {
8+
return {};
9+
}
710
return {
811
type: 'object',
912
properties: {
1013
evaluations: {
1114
type: 'object',
12-
description: `Object containing evaluation results for ${evaluationMetricKeys.join(', ')} metrics`,
13-
properties: this._buildKeyProperties(evaluationMetricKeys),
14-
required: evaluationMetricKeys,
15+
description: `Object containing evaluation results for ${evaluationMetricKey} metric`,
16+
properties: {
17+
[evaluationMetricKey]: this._buildKeySchema(evaluationMetricKey),
18+
},
19+
required: [evaluationMetricKey],
1520
additionalProperties: false,
1621
},
1722
},
@@ -20,16 +25,6 @@ class EvaluationSchemaBuilder {
2025
} as const;
2126
}
2227

23-
private static _buildKeyProperties(evaluationMetricKeys: string[]) {
24-
return evaluationMetricKeys.reduce(
25-
(acc, key) => {
26-
acc[key] = this._buildKeySchema(key);
27-
return acc;
28-
},
29-
{} as Record<string, unknown>,
30-
);
31-
}
32-
3328
private static _buildKeySchema(key: string) {
3429
return {
3530
type: 'object',

0 commit comments

Comments
 (0)