Skip to content

Commit 9c197db

Browse files
committed
feat: bring common utilities to WCS
These two utilities are currently used in various places and are useful generally. We should just expose them from WCS directly.
1 parent ead1b96 commit 9c197db

3 files changed

Lines changed: 98 additions & 0 deletions

File tree

runner/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,5 @@ export {NoopProgressLogger} from './progress/noop-progress-logger.js';
4949
export {TextProgressLogger} from './progress/text-progress-logger.js';
5050
export {type ServeTestingResult} from './workers/serve-testing/worker-types.js';
5151
export {replaceAtReferencesInPrompt} from './utils/prompt-at-references.js';
52+
export {extractRubrics, type RubricInfo} from './utils/extract-rubrics.js';
53+
export {combineReports} from './utils/combine-reports.mjs';

runner/utils/combine-reports.mts

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import assert from 'assert';
2+
import {RunGroup, RunInfo} from '../shared-interfaces.js';
3+
import {groupSimilarReports} from '../orchestration/grouping.js';
4+
5+
/**
 * Merges several individual WCS reports into a single WCS group that
 * contains one combined run.
 *
 * Every input run is re-tagged with `groupId` and handed to
 * `groupSimilarReports`, which is required to yield exactly one group.
 * The combined run carries the concatenated results of all inputs; its
 * version, report name, summary fields and timestamp are copied from the
 * first input run, and both system prompt fields are left empty.
 *
 * @param runs Reports to merge. Must contain at least one entry.
 * @param groupId Group ID assigned to every run before grouping.
 * @param runId ID given to the combined run.
 * @returns The single resulting group together with the combined run.
 * @throws AssertionError if `runs` is empty, or if grouping does not
 *   produce exactly one group.
 */
export function combineReports(
  runs: RunInfo[],
  groupId: string,
  runId: string,
): {
  group: RunGroup;
  runInfo: RunInfo;
} {
  assert.notStrictEqual(runs.length, 0, 'Expected more than zero reports.');

  // Force all runs into the same group so that grouping collapses them.
  const groups = groupSimilarReports(
    runs.map(run => ({...run, group: groupId} satisfies RunInfo)),
  );
  assert.strictEqual(
    groups.length,
    1,
    `Expected reports to combine into exactly one group, but got ${groups.length}.`,
  );

  const group = groups[0];
  const firstRun = runs[0];
  // NOTE(review): metadata (including `usage` and `timestamp`) is copied from
  // the first run only and is not aggregated across runs — confirm intended.
  const {displayName, environmentId, framework, model, usage} = firstRun.details.summary;

  const runInfo: RunInfo = {
    id: runId,
    group: group.id,
    results: runs.flatMap(run => run.results),
    version: firstRun.version,
    details: {
      reportName: firstRun.details.reportName,
      summary: {displayName, environmentId, framework, model, usage},
      systemPromptGeneration: '',
      systemPromptRepair: '',
      timestamp: firstRun.details.timestamp,
    },
  };

  return {group, runInfo};
}

runner/utils/extract-rubrics.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import {IndividualAssessmentState, RunInfo} from '../shared-interfaces.js';
2+
3+
/** Aggregated result for a single rubric (grouping label). */
export interface RubricInfo {
  /**
   * Weighted average of `successPercentage` over every non-skipped
   * assessment carrying the label; `0` when the total weight is zero.
   */
  score: number;
}

/**
 * Computes a weighted score for every rubric (grouping label) found in a run.
 *
 * Each non-skipped assessment contributes to every label listed in its
 * `groupingLabels`. The weight of an assessment is
 * `category.maxPoints * parseFloat(check.scoreReduction) / 100`, and the
 * score of a label is the weight-averaged `successPercentage` of its
 * assessments.
 *
 * @param run Run whose results should be analyzed.
 * @returns An object keyed by label. Labels whose total weight is zero get a
 *   score of `0` instead of `NaN`.
 */
export function extractRubrics(run: RunInfo): Record<string, RubricInfo> {
  // A Map keeps labels such as `constructor` or `toString` from colliding
  // with members inherited from `Object.prototype`.
  const totals = new Map<string, {weightedSum: number; weight: number}>();

  for (const app of run.results) {
    for (const category of app.score.categories) {
      for (const check of category.assessments) {
        if (check.state === IndividualAssessmentState.SKIPPED) {
          continue;
        }

        for (const label of check.groupingLabels ?? []) {
          // Weight of the check within its category ("pillar").
          const checkWeightWithPillar =
            category.maxPoints * (parseFloat(check.scoreReduction) / 100);

          let entry = totals.get(label);
          if (entry === undefined) {
            entry = {weightedSum: 0, weight: 0};
            totals.set(label, entry);
          }
          entry.weightedSum += checkWeightWithPillar * check.successPercentage;
          entry.weight += checkWeightWithPillar;
        }
      }
    }
  }

  // `Object.fromEntries` defines own properties, so unusual label names
  // cannot affect the prototype of the returned object.
  return Object.fromEntries(
    Array.from(totals, ([label, {weightedSum, weight}]): [string, RubricInfo] => [
      label,
      // Avoid 0 / 0 = NaN when every contributing check has zero weight.
      {score: weight === 0 ? 0 : weightedSum / weight},
    ]),
  );
}

0 commit comments

Comments
 (0)