Skip to content

Commit 4494a66

Browse files
jsonbailey and claude committed
feat: wire evaluations tracking chain in ManagedModel.run() (AIC-1657)
ManagedModel.run() now delegates to aiConfig.evaluator.evaluate() and wraps evaluation + tracker.trackJudgeResult() into a single Promise set on ManagedResult.evaluations. run() returns before evaluations resolves; awaiting evaluations guarantees both evaluation and tracking are complete. Removes evaluations from ChatResponse (moved to ManagedResult). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent fe6948b commit 4494a66

3 files changed

Lines changed: 172 additions & 73 deletions

File tree

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import { Evaluator } from '../src/api/judge/Evaluator';
2+
import { LDJudgeResult } from '../src/api/judge/types';
3+
import { LDAICompletionConfig } from '../src/api/config/types';
4+
import { LDAIConfigTracker } from '../src/api/config/LDAIConfigTracker';
5+
import { AIProvider } from '../src/api/providers/AIProvider';
6+
import { ChatResponse } from '../src/api/chat/types';
7+
import { TrackedChat } from '../src/api/chat/TrackedChat';
8+
9+
describe('TrackedChat.run()', () => {
10+
let mockProvider: jest.Mocked<AIProvider>;
11+
let mockTracker: jest.Mocked<LDAIConfigTracker>;
12+
let aiConfig: LDAICompletionConfig;
13+
14+
const mockResponse: ChatResponse = {
15+
message: { role: 'assistant', content: 'AI response content' },
16+
metrics: { success: true },
17+
};
18+
19+
beforeEach(() => {
20+
mockProvider = {
21+
invokeModel: jest.fn().mockResolvedValue(mockResponse),
22+
} as any;
23+
24+
mockTracker = {
25+
trackMetricsOf: jest.fn().mockImplementation(async (_extractor: any, func: any) => func()),
26+
trackJudgeResult: jest.fn(),
27+
resumptionToken: 'test-resumption-token',
28+
getTrackData: jest.fn().mockReturnValue({}),
29+
trackDuration: jest.fn(),
30+
trackTokens: jest.fn(),
31+
trackSuccess: jest.fn(),
32+
trackError: jest.fn(),
33+
trackFeedback: jest.fn(),
34+
trackTimeToFirstToken: jest.fn(),
35+
trackDurationOf: jest.fn(),
36+
trackOpenAIMetrics: jest.fn(),
37+
trackBedrockConverseMetrics: jest.fn(),
38+
trackVercelAISDKGenerateTextMetrics: jest.fn(),
39+
trackStreamMetricsOf: jest.fn(),
40+
trackToolCall: jest.fn(),
41+
trackToolCalls: jest.fn(),
42+
getSummary: jest.fn(),
43+
} as any;
44+
45+
aiConfig = {
46+
key: 'test-config',
47+
enabled: true,
48+
messages: [{ role: 'system', content: 'You are helpful.' }],
49+
model: { name: 'gpt-4' },
50+
provider: { name: 'openai' },
51+
createTracker: () => mockTracker,
52+
};
53+
});
54+
55+
it('returns before evaluations resolve', async () => {
56+
let resolveEval!: (v: LDJudgeResult[]) => void;
57+
const slowEvaluator = {
58+
judgeConfiguration: { judges: [{ key: 'judge-1', samplingRate: 1.0 }] },
59+
evaluate: jest.fn().mockReturnValue(new Promise<LDJudgeResult[]>((resolve) => {
60+
resolveEval = resolve;
61+
})),
62+
judges: new Map(),
63+
} as unknown as Evaluator;
64+
65+
const configWithEvaluator: LDAICompletionConfig = {
66+
...aiConfig,
67+
evaluator: slowEvaluator,
68+
};
69+
70+
const chat = new TrackedChat(configWithEvaluator, mockProvider);
71+
72+
let evaluationsResolved = false;
73+
const resultPromise = chat.run('Hello');
74+
const result = await resultPromise;
75+
76+
// result is immediately available
77+
expect(result.content).toBe('AI response content');
78+
79+
// evaluations haven't resolved yet
80+
result.evaluations.then(() => {
81+
evaluationsResolved = true;
82+
});
83+
84+
// microtask flush — evaluations should not have resolved yet
85+
await Promise.resolve();
86+
expect(evaluationsResolved).toBe(false);
87+
88+
// Now resolve the evaluation
89+
resolveEval([{ success: true, sampled: true, score: 0.9 }]);
90+
await result.evaluations;
91+
expect(evaluationsResolved).toBe(true);
92+
});
93+
94+
it('awaiting evaluations guarantees tracking is complete', async () => {
95+
const judgeResult: LDJudgeResult = { success: true, sampled: true, score: 0.8, metricKey: 'quality' };
96+
const mockEvaluator = {
97+
judgeConfiguration: { judges: [{ key: 'judge-1', samplingRate: 1.0 }] },
98+
evaluate: jest.fn().mockResolvedValue([judgeResult]),
99+
judges: new Map(),
100+
} as unknown as Evaluator;
101+
102+
const configWithEvaluator: LDAICompletionConfig = {
103+
...aiConfig,
104+
evaluator: mockEvaluator,
105+
};
106+
107+
const chat = new TrackedChat(configWithEvaluator, mockProvider);
108+
const result = await chat.run('Hello');
109+
110+
// After awaiting evaluations, tracking IS complete
111+
await result.evaluations;
112+
expect(mockTracker.trackJudgeResult).toHaveBeenCalledWith(judgeResult);
113+
});
114+
115+
it('builds ManagedResult with correct content and metrics', async () => {
116+
const chat = new TrackedChat(aiConfig, mockProvider);
117+
const result = await chat.run('test prompt');
118+
119+
expect(result.content).toBe('AI response content');
120+
expect(result.metrics.success).toBe(true);
121+
expect(result.metrics.resumptionToken).toBe('test-resumption-token');
122+
expect(result.evaluations).toBeInstanceOf(Promise);
123+
});
124+
125+
it('resolves to empty evaluations when no evaluator configured', async () => {
126+
const chat = new TrackedChat(aiConfig, mockProvider);
127+
const result = await chat.run('Hello');
128+
const evaluations = await result.evaluations;
129+
expect(evaluations).toEqual([]);
130+
});
131+
132+
it('resolves to empty evaluations when evaluator is noop', async () => {
133+
const configWithNoop: LDAICompletionConfig = {
134+
...aiConfig,
135+
evaluator: Evaluator.noop(),
136+
};
137+
const chat = new TrackedChat(configWithNoop, mockProvider);
138+
const result = await chat.run('Hello');
139+
const evaluations = await result.evaluations;
140+
expect(evaluations).toEqual([]);
141+
});
142+
});

packages/sdk/server-ai/src/api/chat/TrackedChat.ts

Lines changed: 28 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import { ChatResponse } from './types';
1212
* by delegating to an AIProvider implementation.
1313
* This class handles conversation management and tracking, while delegating
1414
* the actual model invocation to the provider.
15+
*
16+
* Use `run()` as the primary entry point. `invoke()` is deprecated.
1517
*/
1618
export class TrackedChat {
1719
protected messages: LDMessage[];
@@ -29,6 +31,9 @@ export class TrackedChat {
2931
* Invoke the chat model with a prompt string and return a ManagedResult.
3032
* This is the primary entry point for model invocation. Judge evaluations are
3133
* wired asynchronously and exposed via ManagedResult.evaluations.
34+
*
35+
* run() returns before ManagedResult.evaluations resolves. Awaiting evaluations
36+
* guarantees both evaluation and tracking (tracker.trackJudgeResult) are complete.
3237
*/
3338
async run(prompt: string): Promise<ManagedResult> {
3439
const tracker = this.aiConfig.createTracker!();
@@ -61,11 +66,31 @@ export class TrackedChat {
6166
resumptionToken: tracker.resumptionToken,
6267
};
6368

64-
// Evaluations are wired in the managed layer (PR 3). For now, resolve empty.
65-
const evaluations: Promise<LDJudgeResult[]> = Promise.resolve([]);
69+
const output = response.message.content;
70+
// Build a single string of the input messages for judge evaluation
71+
const inputText = this.messages
72+
.slice(0, -1) // exclude the just-added assistant response
73+
.map((m) => m.content)
74+
.join('\r\n');
75+
76+
// Wire evaluation + tracking into a single Promise.
77+
// run() returns before this resolves — awaiting evaluations guarantees
78+
// both evaluation and tracking are complete.
79+
const evaluator = this.aiConfig.evaluator;
80+
let evaluations: Promise<LDJudgeResult[]>;
81+
if (evaluator && evaluator.judgeConfiguration.judges.length > 0) {
82+
evaluations = evaluator.evaluate(inputText, output).then((results) => {
83+
results.forEach((judgeResult) => {
84+
tracker.trackJudgeResult(judgeResult);
85+
});
86+
return results;
87+
});
88+
} else {
89+
evaluations = Promise.resolve([]);
90+
}
6691

6792
return {
68-
content: response.message.content,
93+
content: output,
6994
metrics,
7095
evaluations,
7196
};
@@ -96,72 +121,10 @@ export class TrackedChat {
96121
() => this.provider.invokeModel(allMessages),
97122
);
98123

99-
if (
100-
this.aiConfig.judgeConfiguration?.judges &&
101-
this.aiConfig.judgeConfiguration.judges.length > 0
102-
) {
103-
response.evaluations = this._evaluateWithJudges(this.messages, response).then(
104-
(evaluations) => {
105-
evaluations.forEach((judgeResult) => {
106-
tracker.trackJudgeResult(judgeResult);
107-
});
108-
return evaluations;
109-
},
110-
);
111-
}
112-
113124
this.messages.push(response.message);
114125
return response;
115126
}
116127

117-
/**
118-
* Evaluates the response with all configured judges.
119-
* Returns a promise that resolves to an array of evaluation results.
120-
*
121-
* @param messages Array of messages representing the conversation history
122-
* @param response The AI response to be evaluated
123-
* @returns Promise resolving to array of judge evaluation results
124-
*/
125-
private async _evaluateWithJudges(
126-
messages: LDMessage[],
127-
response: ChatResponse,
128-
): Promise<LDJudgeResult[]> {
129-
const judgeConfigs = this.aiConfig.judgeConfiguration!.judges;
130-
131-
// Start all judge evaluations in parallel
132-
const evaluationPromises = judgeConfigs.map(async (judgeConfig) => {
133-
const judge = this.judges[judgeConfig.key];
134-
if (!judge) {
135-
this._logger?.warn(
136-
`Judge configuration is not enabled for ${judgeConfig.key} in ${this.aiConfig.key}`,
137-
);
138-
const result: LDJudgeResult = {
139-
success: false,
140-
sampled: true,
141-
errorMessage: `Judge configuration is not enabled for ${judgeConfig.key}`,
142-
};
143-
return result;
144-
}
145-
146-
return judge.evaluateMessages(messages, response, judgeConfig.samplingRate);
147-
});
148-
149-
// ensure all evaluations complete even if some fail
150-
const results = await Promise.allSettled(evaluationPromises);
151-
152-
return results.map((settled) => {
153-
if (settled.status === 'fulfilled') {
154-
return settled.value;
155-
}
156-
const result: LDJudgeResult = {
157-
success: false,
158-
sampled: true,
159-
errorMessage: 'Judge evaluation failed',
160-
};
161-
return result;
162-
});
163-
}
164-
165128
/**
166129
* Get the underlying AI configuration used to initialize this TrackedChat.
167130
*/
Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import { LDMessage } from '../config/types';
2-
import { LDJudgeResult } from '../judge/types';
32
import { LDAIMetrics } from '../metrics/LDAIMetrics';
43

54
/**
6-
* Chat response structure.
5+
* Chat response structure returned by provider implementations.
6+
* This is the runner-level type; evaluations belong in ManagedResult.
77
*/
88
export interface ChatResponse {
99
/**
@@ -15,10 +15,4 @@ export interface ChatResponse {
1515
* Metrics information including success status and token usage.
1616
*/
1717
metrics: LDAIMetrics;
18-
19-
/**
20-
* Promise that resolves to judge evaluation results.
21-
* Only present when judges are configured for evaluation.
22-
*/
23-
evaluations?: Promise<LDJudgeResult[]>;
2418
}

0 commit comments

Comments (0)