Skip to content

Commit 4494a66

Browse files
jsonbailey and claude committed
feat: wire evaluations tracking chain in ManagedModel.run() (AIC-1657)
ManagedModel.run() now delegates to aiConfig.evaluator.evaluate() and wraps evaluation + tracker.trackJudgeResult() into a single Promise set on ManagedResult.evaluations. run() returns before evaluations resolves; awaiting evaluations guarantees both evaluation and tracking are complete. Removes evaluations from ChatResponse (moved to ManagedResult). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent fe6948b commit 4494a66

3 files changed

Lines changed: 172 additions & 73 deletions

File tree

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import { Evaluator } from '../src/api/judge/Evaluator';
2+
import { LDJudgeResult } from '../src/api/judge/types';
3+
import { LDAICompletionConfig } from '../src/api/config/types';
4+
import { LDAIConfigTracker } from '../src/api/config/LDAIConfigTracker';
5+
import { AIProvider } from '../src/api/providers/AIProvider';
6+
import { ChatResponse } from '../src/api/chat/types';
7+
import { TrackedChat } from '../src/api/chat/TrackedChat';
8+
9+
describe('TrackedChat.run()', () => {
10+
let mockProvider: jest.Mocked<AIProvider>;
11+
let mockTracker: jest.Mocked<LDAIConfigTracker>;
12+
let aiConfig: LDAICompletionConfig;
13+
14+
const mockResponse: ChatResponse = {
15+
message: { role: 'assistant', content: 'AI response content' },
16+
metrics: { success: true },
17+
};
18+
19+
beforeEach(() => {
20+
mockProvider = {
21+
invokeModel: jest.fn().mockResolvedValue(mockResponse),
22+
} as any;
23+
24+
mockTracker = {
25+
trackMetricsOf: jest.fn().mockImplementation(async (_extractor: any, func: any) => func()),
26+
trackJudgeResult: jest.fn(),
27+
resumptionToken: 'test-resumption-token',
28+
getTrackData: jest.fn().mockReturnValue({}),
29+
trackDuration: jest.fn(),
30+
trackTokens: jest.fn(),
31+
trackSuccess: jest.fn(),
32+
trackError: jest.fn(),
33+
trackFeedback: jest.fn(),
34+
trackTimeToFirstToken: jest.fn(),
35+
trackDurationOf: jest.fn(),
36+
trackOpenAIMetrics: jest.fn(),
37+
trackBedrockConverseMetrics: jest.fn(),
38+
trackVercelAISDKGenerateTextMetrics: jest.fn(),
39+
trackStreamMetricsOf: jest.fn(),
40+
trackToolCall: jest.fn(),
41+
trackToolCalls: jest.fn(),
42+
getSummary: jest.fn(),
43+
} as any;
44+
45+
aiConfig = {
46+
key: 'test-config',
47+
enabled: true,
48+
messages: [{ role: 'system', content: 'You are helpful.' }],
49+
model: { name: 'gpt-4' },
50+
provider: { name: 'openai' },
51+
createTracker: () => mockTracker,
52+
};
53+
});
54+
55+
it('returns before evaluations resolve', async () => {
56+
let resolveEval!: (v: LDJudgeResult[]) => void;
57+
const slowEvaluator = {
58+
judgeConfiguration: { judges: [{ key: 'judge-1', samplingRate: 1.0 }] },
59+
evaluate: jest.fn().mockReturnValue(new Promise<LDJudgeResult[]>((resolve) => {
60+
resolveEval = resolve;
61+
})),
62+
judges: new Map(),
63+
} as unknown as Evaluator;
64+
65+
const configWithEvaluator: LDAICompletionConfig = {
66+
...aiConfig,
67+
evaluator: slowEvaluator,
68+
};
69+
70+
const chat = new TrackedChat(configWithEvaluator, mockProvider);
71+
72+
let evaluationsResolved = false;
73+
const resultPromise = chat.run('Hello');
74+
const result = await resultPromise;
75+
76+
// result is immediately available
77+
expect(result.content).toBe('AI response content');
78+
79+
// evaluations haven't resolved yet
80+
result.evaluations.then(() => {
81+
evaluationsResolved = true;
82+
});
83+
84+
// microtask flush — evaluations should not have resolved yet
85+
await Promise.resolve();
86+
expect(evaluationsResolved).toBe(false);
87+
88+
// Now resolve the evaluation
89+
resolveEval([{ success: true, sampled: true, score: 0.9 }]);
90+
await result.evaluations;
91+
expect(evaluationsResolved).toBe(true);
92+
});
93+
94+
it('awaiting evaluations guarantees tracking is complete', async () => {
95+
const judgeResult: LDJudgeResult = { success: true, sampled: true, score: 0.8, metricKey: 'quality' };
96+
const mockEvaluator = {
97+
judgeConfiguration: { judges: [{ key: 'judge-1', samplingRate: 1.0 }] },
98+
evaluate: jest.fn().mockResolvedValue([judgeResult]),
99+
judges: new Map(),
100+
} as unknown as Evaluator;
101+
102+
const configWithEvaluator: LDAICompletionConfig = {
103+
...aiConfig,
104+
evaluator: mockEvaluator,
105+
};
106+
107+
const chat = new TrackedChat(configWithEvaluator, mockProvider);
108+
const result = await chat.run('Hello');
109+
110+
// After awaiting evaluations, tracking IS complete
111+
await result.evaluations;
112+
expect(mockTracker.trackJudgeResult).toHaveBeenCalledWith(judgeResult);
113+
});
114+
115+
it('builds ManagedResult with correct content and metrics', async () => {
116+
const chat = new TrackedChat(aiConfig, mockProvider);
117+
const result = await chat.run('test prompt');
118+
119+
expect(result.content).toBe('AI response content');
120+
expect(result.metrics.success).toBe(true);
121+
expect(result.metrics.resumptionToken).toBe('test-resumption-token');
122+
expect(result.evaluations).toBeInstanceOf(Promise);
123+
});
124+
125+
it('resolves to empty evaluations when no evaluator configured', async () => {
126+
const chat = new TrackedChat(aiConfig, mockProvider);
127+
const result = await chat.run('Hello');
128+
const evaluations = await result.evaluations;
129+
expect(evaluations).toEqual([]);
130+
});
131+
132+
it('resolves to empty evaluations when evaluator is noop', async () => {
133+
const configWithNoop: LDAICompletionConfig = {
134+
...aiConfig,
135+
evaluator: Evaluator.noop(),
136+
};
137+
const chat = new TrackedChat(configWithNoop, mockProvider);
138+
const result = await chat.run('Hello');
139+
const evaluations = await result.evaluations;
140+
expect(evaluations).toEqual([]);
141+
});
142+
});

packages/sdk/server-ai/src/api/chat/TrackedChat.ts

Lines changed: 28 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import { ChatResponse } from './types';
1212
* by delegating to an AIProvider implementation.
1313
* This class handles conversation management and tracking, while delegating
1414
* the actual model invocation to the provider.
15+
*
16+
* Use `run()` as the primary entry point. `invoke()` is deprecated.
1517
*/
1618
export class TrackedChat {
1719
protected messages: LDMessage[];
@@ -29,6 +31,9 @@ export class TrackedChat {
2931
* Invoke the chat model with a prompt string and return a ManagedResult.
3032
* This is the primary entry point for model invocation. Judge evaluations are
3133
* wired asynchronously and exposed via ManagedResult.evaluations.
34+
*
35+
* run() returns before ManagedResult.evaluations resolves. Awaiting evaluations
36+
* guarantees both evaluation and tracking (tracker.trackJudgeResult) are complete.
3237
*/
3338
async run(prompt: string): Promise<ManagedResult> {
3439
const tracker = this.aiConfig.createTracker!();
@@ -61,11 +66,31 @@ export class TrackedChat {
6166
resumptionToken: tracker.resumptionToken,
6267
};
6368

64-
// Evaluations are wired in the managed layer (PR 3). For now, resolve empty.
65-
const evaluations: Promise<LDJudgeResult[]> = Promise.resolve([]);
69+
const output = response.message.content;
70+
// Build a single string of the input messages for judge evaluation
71+
const inputText = this.messages
72+
.slice(0, -1) // exclude the just-added assistant response
73+
.map((m) => m.content)
74+
.join('\r\n');
75+
76+
// Wire evaluation + tracking into a single Promise.
77+
// run() returns before this resolves — awaiting evaluations guarantees
78+
// both evaluation and tracking are complete.
79+
const evaluator = this.aiConfig.evaluator;
80+
let evaluations: Promise<LDJudgeResult[]>;
81+
if (evaluator && evaluator.judgeConfiguration.judges.length > 0) {
82+
evaluations = evaluator.evaluate(inputText, output).then((results) => {
83+
results.forEach((judgeResult) => {
84+
tracker.trackJudgeResult(judgeResult);
85+
});
86+
return results;
87+
});
88+
} else {
89+
evaluations = Promise.resolve([]);
90+
}
6691

6792
return {
68-
content: response.message.content,
93+
content: output,
6994
metrics,
7095
evaluations,
7196
};
@@ -96,72 +121,10 @@ export class TrackedChat {
96121
() => this.provider.invokeModel(allMessages),
97122
);
98123

99-
if (
100-
this.aiConfig.judgeConfiguration?.judges &&
101-
this.aiConfig.judgeConfiguration.judges.length > 0
102-
) {
103-
response.evaluations = this._evaluateWithJudges(this.messages, response).then(
104-
(evaluations) => {
105-
evaluations.forEach((judgeResult) => {
106-
tracker.trackJudgeResult(judgeResult);
107-
});
108-
return evaluations;
109-
},
110-
);
111-
}
112-
113124
this.messages.push(response.message);
114125
return response;
115126
}
116127

117-
/**
118-
* Evaluates the response with all configured judges.
119-
* Returns a promise that resolves to an array of evaluation results.
120-
*
121-
* @param messages Array of messages representing the conversation history
122-
* @param response The AI response to be evaluated
123-
* @returns Promise resolving to array of judge evaluation results
124-
*/
125-
private async _evaluateWithJudges(
126-
messages: LDMessage[],
127-
response: ChatResponse,
128-
): Promise<LDJudgeResult[]> {
129-
const judgeConfigs = this.aiConfig.judgeConfiguration!.judges;
130-
131-
// Start all judge evaluations in parallel
132-
const evaluationPromises = judgeConfigs.map(async (judgeConfig) => {
133-
const judge = this.judges[judgeConfig.key];
134-
if (!judge) {
135-
this._logger?.warn(
136-
`Judge configuration is not enabled for ${judgeConfig.key} in ${this.aiConfig.key}`,
137-
);
138-
const result: LDJudgeResult = {
139-
success: false,
140-
sampled: true,
141-
errorMessage: `Judge configuration is not enabled for ${judgeConfig.key}`,
142-
};
143-
return result;
144-
}
145-
146-
return judge.evaluateMessages(messages, response, judgeConfig.samplingRate);
147-
});
148-
149-
// ensure all evaluations complete even if some fail
150-
const results = await Promise.allSettled(evaluationPromises);
151-
152-
return results.map((settled) => {
153-
if (settled.status === 'fulfilled') {
154-
return settled.value;
155-
}
156-
const result: LDJudgeResult = {
157-
success: false,
158-
sampled: true,
159-
errorMessage: 'Judge evaluation failed',
160-
};
161-
return result;
162-
});
163-
}
164-
165128
/**
166129
* Get the underlying AI configuration used to initialize this TrackedChat.
167130
*/
Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import { LDMessage } from '../config/types';
2-
import { LDJudgeResult } from '../judge/types';
32
import { LDAIMetrics } from '../metrics/LDAIMetrics';
43

54
/**
6-
* Chat response structure.
5+
* Chat response structure returned by provider implementations.
6+
* This is the runner-level type; evaluations belong in ManagedResult.
77
*/
88
export interface ChatResponse {
99
/**
@@ -15,10 +15,4 @@ export interface ChatResponse {
1515
* Metrics information including success status and token usage.
1616
*/
1717
metrics: LDAIMetrics;
18-
19-
/**
20-
* Promise that resolves to judge evaluation results.
21-
* Only present when judges are configured for evaluation.
22-
*/
23-
evaluations?: Promise<LDJudgeResult[]>;
2418
}

0 commit comments

Comments (0)