Skip to content

Commit 15fa7bd

Browse files
danymarques (Dany Marques, u118970)
authored and committed
feat: support passing multiple models to evaluate the same prompts across models
The --model option now accepts multiple values, allowing users to run the same evaluation against several models in a single command. When multiple models are specified, each model's report name is suffixed with the model name to avoid collisions. Usage: pnpm run wcs eval --model=gemini-2.5-pro --model=claude-sonnet-4-5
1 parent 1982bac commit 15fa7bd

File tree

1 file changed

+57
-42
lines changed

1 file changed

+57
-42
lines changed

runner/eval-cli.ts

Lines changed: 57 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ export const EvalModule = {
2323

2424
interface Options {
2525
environment?: string;
26-
model: string;
26+
model: string[];
2727
runner: RunnerName;
2828
local: boolean;
2929
limit: number;
@@ -57,8 +57,9 @@ function builder(argv: Argv): Argv<Options> {
5757
})
5858
.option('model', {
5959
type: 'string',
60-
default: DEFAULT_MODEL_NAME,
61-
descript: 'Model to use when generating code',
60+
array: true,
61+
default: [DEFAULT_MODEL_NAME],
62+
descript: 'Model(s) to use when generating code',
6263
})
6364
// Option is a noop right now when using a remote environment.
6465
.option('runner', {
@@ -206,46 +207,60 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
206207
process.on('SIGTERM', () => abortCtrl.abort());
207208
process.on('exit', () => abortCtrl.abort());
208209

209-
try {
210-
const runInfo = await generateCodeAndAssess({
211-
runner: cliArgs.runner,
212-
model: cliArgs.model,
213-
environment: {
214-
configPath: BUILT_IN_ENVIRONMENTS.get(cliArgs.environment) || cliArgs.environment,
215-
},
216-
localMode: cliArgs.local,
217-
limit: cliArgs.limit,
218-
concurrency: cliArgs.concurrency as number,
219-
reportName: cliArgs.reportName!,
220-
skipScreenshots: !!cliArgs.skipScreenshots,
221-
startMcp: cliArgs.mcp,
222-
ragEndpoint: cliArgs.ragEndpoint,
223-
outputDirectory: cliArgs.outputDirectory,
224-
promptFilter: cliArgs.promptFilter,
225-
labels: cliArgs.labels || [],
226-
skipAxeTesting: !!cliArgs.skipAxeTesting,
227-
enableUserJourneyTesting: cliArgs.enableUserJourneyTesting,
228-
enableAutoCsp: cliArgs.enableAutoCsp,
229-
logging: cliArgs.logging,
230-
autoraterModel: cliArgs.autoraterModel,
231-
skipAiSummary: cliArgs.skipAiSummary,
232-
skipLighthouse: cliArgs.skipLighthouse,
233-
maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts,
234-
maxTestRepairAttempts: cliArgs.maxTestRepairAttempts,
235-
promptTimeoutRetries: cliArgs.promptTimeoutRetries,
236-
abortSignal: abortCtrl.signal,
237-
});
210+
const models = cliArgs.model;
211+
const baseReportName = cliArgs.reportName!;
212+
213+
for (const model of models) {
214+
const reportName =
215+
models.length > 1
216+
? `${baseReportName}--${model.replace(/[^a-zA-Z0-9-]/g, '-')}`
217+
: baseReportName;
218+
219+
if (models.length > 1) {
220+
console.log(chalk.cyan(`\nStarting evaluation with model: ${model}\n`));
221+
}
222+
223+
try {
224+
const runInfo = await generateCodeAndAssess({
225+
runner: cliArgs.runner,
226+
model,
227+
environment: {
228+
configPath: BUILT_IN_ENVIRONMENTS.get(cliArgs.environment) || cliArgs.environment,
229+
},
230+
localMode: cliArgs.local,
231+
limit: cliArgs.limit,
232+
concurrency: cliArgs.concurrency as number,
233+
reportName,
234+
skipScreenshots: !!cliArgs.skipScreenshots,
235+
startMcp: cliArgs.mcp,
236+
ragEndpoint: cliArgs.ragEndpoint,
237+
outputDirectory: cliArgs.outputDirectory,
238+
promptFilter: cliArgs.promptFilter,
239+
labels: cliArgs.labels || [],
240+
skipAxeTesting: !!cliArgs.skipAxeTesting,
241+
enableUserJourneyTesting: cliArgs.enableUserJourneyTesting,
242+
enableAutoCsp: cliArgs.enableAutoCsp,
243+
logging: cliArgs.logging,
244+
autoraterModel: cliArgs.autoraterModel,
245+
skipAiSummary: cliArgs.skipAiSummary,
246+
skipLighthouse: cliArgs.skipLighthouse,
247+
maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts,
248+
maxTestRepairAttempts: cliArgs.maxTestRepairAttempts,
249+
promptTimeoutRetries: cliArgs.promptTimeoutRetries,
250+
abortSignal: abortCtrl.signal,
251+
});
238252

239-
logReportToConsole(runInfo);
240-
await writeReportToDisk(runInfo, runInfo.details.summary.environmentId, REPORTS_ROOT_DIR);
241-
} catch (error: unknown) {
242-
if (error instanceof UserFacingError) {
243-
console.error(chalk.red(error.message));
244-
} else {
245-
console.error(chalk.red('An error occurred during the assessment process:'));
246-
console.error(chalk.red(error));
247-
if (process.env.DEBUG === '1' && (error as Partial<Error>).stack) {
248-
console.error(chalk.red((error as Error).stack));
253+
logReportToConsole(runInfo);
254+
await writeReportToDisk(runInfo, runInfo.details.summary.environmentId, REPORTS_ROOT_DIR);
255+
} catch (error: unknown) {
256+
if (error instanceof UserFacingError) {
257+
console.error(chalk.red(error.message));
258+
} else {
259+
console.error(chalk.red('An error occurred during the assessment process:'));
260+
console.error(chalk.red(error));
261+
if (process.env.DEBUG === '1' && (error as Partial<Error>).stack) {
262+
console.error(chalk.red((error as Error).stack));
263+
}
249264
}
250265
}
251266
}

0 commit comments

Comments (0)