@@ -3,7 +3,7 @@ import {existsSync, readdirSync} from 'fs';
33import { availableParallelism } from 'os' ;
44import PQueue from 'p-queue' ;
55import { basename , join } from 'path' ;
6- import { assertValidModelName } from '../codegen/llm-runner.js' ;
6+ import { assertValidModelName , LlmRunner } from '../codegen/llm-runner.js' ;
77import { getRunnerByName } from '../codegen/runner-creation.js' ;
88import { LLM_OUTPUT_DIR , REPORT_VERSION } from '../configuration/constants.js' ;
99import { getEnvironmentByPath } from '../configuration/environment-resolution.js' ;
@@ -27,6 +27,7 @@ import {startEvaluationTask} from './generate-eval-task.js';
2727import { prepareSummary } from './generate-summary.js' ;
2828import { getRunGroupId } from './grouping.js' ;
2929import { combineAbortSignals } from '../utils/abort-signal.js' ;
30+ import { RatingKind } from '../ratings/rating-types.js' ;
3031
3132/**
3233 * Orchestrates the entire assessment process for each prompt defined in the `prompts` array.
@@ -43,10 +44,14 @@ import {combineAbortSignals} from '../utils/abort-signal.js';
4344 */
4445export async function generateCodeAndAssess ( options : AssessmentConfig ) : Promise < RunInfo > {
4546 const env = await getEnvironmentByPath ( options . environmentConfigPath , options . runner ) ;
47+ const extraCleanupFns : ( ( ) => Promise < void > ) [ ] = [ ] ;
4648 const cleanup = async ( ) => {
4749 // Clean-up should never interrupt a potentially passing completion.
4850 try {
4951 await env . executor . destroy ( ) ;
52+ for ( const cleanupFn of extraCleanupFns ) {
53+ await cleanupFn ( ) ;
54+ }
5055 } catch ( e ) {
5156 console . error ( `Failed to destroy executor: ${ e } ` ) ;
5257 if ( e instanceof Error ) {
@@ -58,16 +63,39 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
5863 // Ensure cleanup logic runs when the evaluation is aborted.
5964 options . abortSignal ?. addEventListener ( 'abort' , cleanup ) ;
6065
61- await assertValidModelName ( options . model , env . executor ) ;
62-
63- const ratingLlm = await getRunnerByName ( 'genkit' ) ;
6466 const allTasksAbortCtrl = new AbortController ( ) ;
6567
6668 try {
69+ await assertValidModelName ( options . model , env . executor ) ;
70+
6771 const promptsToProcess = (
6872 await getCandidateExecutablePrompts ( env , options . localMode , options . promptFilter )
6973 ) . slice ( 0 , options . limit ) ;
7074
75+ const hasLlmBasedRatings = promptsToProcess . some ( p =>
76+ p . kind === 'single'
77+ ? // Check if some ratings are LLM based.
78+ p . ratings . some ( r => r . kind === RatingKind . LLM_BASED )
79+ : // Check if some steps contain LLM based ratings.
80+ p . steps . some ( s => s . ratings . some ( r => r . kind === RatingKind . LLM_BASED ) ) ,
81+ ) ;
82+
83+ // Only construct LLMs when necessary. This is helpful in cases where WCS is invoked
84+ // as an auto-rater that doesn't have access to other LLMs.
85+ const autoraterLlm = hasLlmBasedRatings ? await getRunnerByName ( 'genkit' ) : null ;
86+ const cujGenerationLlm = options . enableUserJourneyTesting
87+ ? ( autoraterLlm ?? ( await getRunnerByName ( 'genkit' ) ) )
88+ : null ;
89+ const generateAiSummaryLlm = ! options . skipAiSummary
90+ ? ( autoraterLlm ?? cujGenerationLlm ?? ( await getRunnerByName ( 'genkit' ) ) )
91+ : null ;
92+
93+ extraCleanupFns . push ( async ( ) => {
94+ await autoraterLlm ?. dispose ( ) ;
95+ await cujGenerationLlm ?. dispose ( ) ;
96+ await generateAiSummaryLlm ?. dispose ( ) ;
97+ } ) ;
98+
7199 const progress =
72100 options . logging === 'dynamic' ? new DynamicProgressLogger ( ) : new TextProgressLogger ( ) ;
73101 const appConcurrency =
@@ -128,7 +156,8 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
128156 options ,
129157 evalID ,
130158 env ,
131- ratingLlm ,
159+ autoraterLlm ,
160+ cujGenerationLlm ,
132161 rootPromptDef ,
133162 combineAbortSignals (
134163 allTasksAbortCtrl . signal ,
@@ -187,7 +216,7 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
187216 const timestamp = new Date ( ) ;
188217 const details = {
189218 summary : await prepareSummary (
190- ratingLlm ,
219+ generateAiSummaryLlm ,
191220 allTasksAbortCtrl . signal ,
192221 options . model ,
193222 env ,
@@ -196,7 +225,6 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
196225 allPromptsCount : promptsToProcess . length ,
197226 failedPrompts,
198227 } ,
199- options ,
200228 ) ,
201229 timestamp : timestamp . toISOString ( ) ,
202230 reportName : options . reportName ,
0 commit comments