Skip to content

Commit f0f739e

Browse files
mturco authored and
AndrewKushnir committed
feat: add timings info to eval assessment results
1 parent e104d0e commit f0f739e

File tree

5 files changed

+113
-0
lines changed

5 files changed

+113
-0
lines changed

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,25 @@ <h3 class="chart-title">
156156
}
157157
</div>
158158

159+
@if (overview.stats.timings) {
160+
@let timings = overview.stats.timings;
161+
<h3>Timings</h3>
162+
<ul class="status-badge-group">
163+
<li class="status-badge neutral">
164+
<b>Generate:</b> {{(timings.generate.mean / 1000) | number:'1.1-2'}}s (mean), {{(timings.generate.median / 1000) |
165+
number:'1.1-2'}}s (median)
166+
</li>
167+
<li class="status-badge neutral">
168+
<b>Build:</b> {{(timings.build.mean / 1000) | number:'1.1-2'}}s (mean), {{(timings.build.median / 1000) |
169+
number:'1.1-2'}}s (median)
170+
</li>
171+
<li class="status-badge neutral">
172+
<b>Repair:</b> {{(timings.repair.mean / 1000) | number:'1.1-2'}}s (mean), {{(timings.repair.median / 1000) |
173+
number:'1.1-2'}}s (median)
174+
</li>
175+
</ul>
176+
}
177+
159178
@if (details) {
160179
<h3>Usage Details</h3>
161180
<ul class="status-badge-group">
@@ -466,6 +485,23 @@ <h5>
466485
</div>
467486
</div>
468487

488+
@if (result.timings) {
489+
<div class="app-details-section">
490+
<h4>Timings</h4>
491+
<ul class="status-badge-group">
492+
<li class="status-badge neutral">
493+
<b>Generate:</b> {{(result.timings.generateDurationMs / 1000) | number:'1.1-2'}}s
494+
</li>
495+
<li class="status-badge neutral">
496+
<b>Build:</b> {{(result.timings.buildDurationMs / 1000) | number:'1.1-2'}}s
497+
</li>
498+
<li class="status-badge neutral">
499+
<b>Repair:</b> {{(result.timings.repairDurationMs / 1000) | number:'1.1-2'}}s
500+
</li>
501+
</ul>
502+
</div>
503+
}
504+
469505
@if (result.testResult) {
470506
<div class="app-details-section">
471507
<h4>Test Results</h4>

runner/orchestration/build-serve-test-loop.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ import {BuildResultStatus} from '../workers/builder/builder-types.js';
44
import {Environment} from '../configuration/environment.js';
55
import {
66
AssessmentConfig,
7+
AssessmentTimings,
78
AttemptDetails,
89
LlmContextFile,
910
RootPromptDefinition,
1011
} from '../shared-interfaces.js';
12+
import {performance} from 'node:perf_hooks';
1113
import {ProgressLogger} from '../progress/progress-logger.js';
1214
import {BuildType, runBuild} from './build-worker.js';
1315
import {EvalID} from './executors/executor.js';
@@ -53,7 +55,9 @@ export async function attemptBuildAndTest(
5355
workerConcurrencyQueue: PQueue,
5456
progress: ProgressLogger,
5557
userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
58+
timings: AssessmentTimings,
5659
) {
60+
const initialBuildStart = performance.now();
5761
const initialBuildResult = await runBuild(
5862
evalID,
5963
directory,
@@ -64,6 +68,7 @@ export async function attemptBuildAndTest(
6468
progress,
6569
BuildType.INITIAL_BUILD,
6670
);
71+
timings.buildDurationMs += performance.now() - initialBuildStart;
6772
let repairAttempts = 0;
6873
let maxRepairAttempts: number;
6974
let maxTestRepairAttempts: number;
@@ -100,6 +105,7 @@ export async function attemptBuildAndTest(
100105
`Trying to repair app build (attempt #${repairAttempts + 1})`,
101106
);
102107

108+
const repairStart = performance.now();
103109
const attempt = await repairAndBuild(
104110
evalID,
105111
config.model,
@@ -120,6 +126,7 @@ export async function attemptBuildAndTest(
120126
progress,
121127
'build',
122128
);
129+
timings.repairDurationMs += performance.now() - repairStart;
123130

124131
attemptDetails.push(attempt);
125132
lastAttempt = attempt;
@@ -200,6 +207,7 @@ export async function attemptBuildAndTest(
200207
});
201208
}
202209

210+
const repairStart = performance.now();
203211
const attempt = await repairAndBuild(
204212
evalID,
205213
config.model,
@@ -224,6 +232,7 @@ export async function attemptBuildAndTest(
224232
// further repairs and capture the failed build. This is useful insight
225233
// as LLMs seem to regress when asked to repair violations.
226234
if (hasBuildFailure) {
235+
timings.repairDurationMs += performance.now() - repairStart;
227236
break;
228237
}
229238

@@ -249,6 +258,7 @@ export async function attemptBuildAndTest(
249258
workerConcurrencyQueue,
250259
progress,
251260
)) ?? undefined;
261+
timings.repairDurationMs += performance.now() - repairStart;
252262

253263
if (hasAxeFailure && lastAttempt.serveTestingResult?.axeViolations?.length === 0) {
254264
progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`);

runner/orchestration/generate-eval-task.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import {rateGeneratedCode} from '../ratings/rate-code.js';
1818
import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
1919
import assert from 'node:assert';
2020
import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
21+
import {performance} from 'node:perf_hooks';
2122

2223
/**
2324
* Creates and executes a task to generate or load code for a given prompt,
@@ -60,6 +61,7 @@ export async function startEvaluationTask(
6061
// and for each sub-prompt, because the project will be augmented on each iteration.
6162
const contextFiles = await resolveContextFiles(promptDef.contextFilePatterns, directory);
6263

64+
const generateStart = performance.now();
6365
// Generate the initial set of files through the LLM.
6466
const initialResponse = await generateInitialFiles(
6567
config,
@@ -76,6 +78,7 @@ export async function startEvaluationTask(
7678
abortSignal,
7779
progress,
7880
);
81+
const generateDurationMs = performance.now() - generateStart;
7982

8083
const toolLogs = initialResponse.toolLogs ?? [];
8184

@@ -140,6 +143,7 @@ export async function startEvaluationTask(
140143
}
141144

142145
const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json
146+
const timings = {generateDurationMs, buildDurationMs: 0, repairDurationMs: 0};
143147

144148
// Try to build the files in the root prompt directory.
145149
// This will also attempt to fix issues with the generated code.
@@ -156,6 +160,7 @@ export async function startEvaluationTask(
156160
workerConcurrencyQueue,
157161
progress,
158162
userJourneyAgentTaskInput,
163+
timings,
159164
);
160165

161166
if (!attempt) {
@@ -197,6 +202,7 @@ export async function startEvaluationTask(
197202
toolLogs,
198203
testResult: attempt.testResult ?? null,
199204
testRepairAttempts: attempt.testRepairAttempts,
205+
timings,
200206
} satisfies AssessmentResult);
201207
}
202208

runner/ratings/stats.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import {BuildErrorType, BuildResultStatus} from '../workers/builder/builder-type
22
import {UserFacingError} from '../utils/errors.js';
33
import {
44
AggregatedRunStats,
5+
AggregatedTimings,
56
AssessmentResult,
67
RuntimeStats,
78
ScoreBucket,
@@ -15,13 +16,32 @@ export const BUCKET_CONFIG = [
1516
{name: 'Poor', min: 0, max: 70, id: 'poor'},
1617
];
1718

19+
/**
 * Computes the arithmetic mean of a list of numbers.
 *
 * @param values Numbers to average. The array is only read, never modified.
 * @returns The mean of `values`, or `0` when `values` is empty (avoids a `NaN`
 *     from dividing by zero).
 */
function calculateMean(values: readonly number[]): number {
  if (values.length === 0) return 0;

  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / values.length;
}
23+
24+
/**
 * Computes the median of a list of numbers.
 *
 * @param values Numbers to take the median of. The array is copied before
 *     sorting, so the caller's array is left untouched.
 * @returns The middle value for an odd number of entries, the average of the
 *     two middle values for an even number of entries, or `0` when `values`
 *     is empty.
 */
function calculateMedian(values: readonly number[]): number {
  if (values.length === 0) return 0;

  // Numeric comparator: the default sort compares elements as strings.
  const sorted = [...values].sort((a, b) => a - b);
  const middle = Math.floor(sorted.length / 2);

  // `middle` is always in range for a non-empty array; the `??` fallbacks only
  // exist to keep the lookups type-safe under `noUncheckedIndexedAccess`.
  const upper = sorted[middle] ?? 0;
  if (sorted.length % 2 !== 0) {
    return upper;
  }

  const lower = sorted[middle - 1] ?? upper;
  return (lower + upper) / 2;
}
33+
1834
/**
1935
* Calculates build and check statistics from assessment results.
2036
*
2137
* @param assessments - An array of `AssessmentResult` objects.
2238
* @returns An object containing aggregated build and check statistics.
2339
*/
2440
export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): AggregatedRunStats {
41+
const generateDurations: number[] = [];
42+
const buildDurations: number[] = [];
43+
const repairDurations: number[] = [];
44+
2545
let successfulInitialBuilds = 0;
2646
let successfulBuildsAfterRepair = 0;
2747
let failedBuilds = 0;
@@ -63,6 +83,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
6383
}
6484
}
6585

86+
if (result.timings) {
87+
generateDurations.push(result.timings.generateDurationMs);
88+
buildDurations.push(result.timings.buildDurationMs);
89+
repairDurations.push(result.timings.repairDurationMs);
90+
}
91+
6692
// Calculate test statistics
6793
if (result.testResult) {
6894
if (result.testResult.passed) {
@@ -158,6 +184,22 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
158184
: undefined,
159185
accessibility: accessibilityStats,
160186
security: securityStats,
187+
...(generateDurations.length > 0 && {
188+
timings: {
189+
generate: {
190+
mean: calculateMean(generateDurations),
191+
median: calculateMedian(generateDurations),
192+
},
193+
build: {
194+
mean: calculateMean(buildDurations),
195+
median: calculateMedian(buildDurations),
196+
},
197+
repair: {
198+
mean: calculateMean(repairDurations),
199+
median: calculateMedian(repairDurations),
200+
},
201+
},
202+
}),
161203
};
162204
}
163205

runner/shared-interfaces.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,20 @@ export interface SkippedIndividualAssessment {
235235
groupingLabels?: string[];
236236
}
237237

238+
/**
 * Stores the duration in milliseconds for different phases of the evaluation.
 *
 * The object is created once per evaluation task and the build/repair fields
 * are accumulated in place (`+=`) while the build-and-test loop runs, so the
 * fields are intentionally mutable.
 */
export interface AssessmentTimings {
  /** Time spent generating the initial set of files through the LLM, in milliseconds. */
  generateDurationMs: number;
  /** Accumulated time spent building the generated app, in milliseconds. */
  buildDurationMs: number;
  /** Accumulated time spent across all repair attempts, in milliseconds. */
  repairDurationMs: number;
}
244+
245+
/**
 * Stores aggregated timing statistics.
 *
 * Each entry holds the mean and median of the corresponding per-assessment
 * duration, expressed in milliseconds.
 */
export interface AggregatedTimings {
  /** Mean and median of the per-assessment generate durations, in milliseconds. */
  generate: {mean: number; median: number};
  /** Mean and median of the per-assessment build durations, in milliseconds. */
  build: {mean: number; median: number};
  /** Mean and median of the per-assessment repair durations, in milliseconds. */
  repair: {mean: number; median: number};
}
251+
238252
/**
239253
* Represents the overall score and breakdown of code assessments.
240254
*/
@@ -345,6 +359,9 @@ export interface AggregatedRunStats {
345359
appsWithoutErrors: number;
346360
};
347361
security?: {appsWithErrors: number; appsWithoutErrors: number};
362+
363+
/** Timing statistics for the run. */
364+
timings?: AggregatedTimings;
348365
}
349366

350367
export interface CompletionStats {
@@ -555,6 +572,8 @@ export interface AssessmentResult {
555572
testResult: TestExecutionResult | null;
556573
/** Number of repair attempts for tests. */
557574
testRepairAttempts?: number;
575+
/** Timings captured for the generate, build and repair stages. */
576+
timings?: AssessmentTimings;
558577
}
559578

560579
/**

0 commit comments

Comments
 (0)