Skip to content

Commit d90bf4a

Browse files
committed
feat: Add full evaluation type for regression checks and baseline saving, and update CI workflow to use it.
1 parent 2782458 commit d90bf4a

4 files changed

Lines changed: 97 additions & 8 deletions

File tree

.github/workflows/llm-evals.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,12 +149,12 @@ jobs:
149149

150150
- name: Check for regressions
151151
id: regression_check
152-
run: npm run eval:check-regressions
152+
run: npm run eval:check-regressions -- --type=full
153153
continue-on-error: true
154154

155155
- name: Save as new baseline
156156
if: github.ref == 'refs/heads/main' && steps.regression_check.outcome == 'success'
157-
run: npm run eval:baseline
157+
run: npm run eval:baseline -- --type=full
158158

159159
- name: Upload eval results
160160
uses: actions/upload-artifact@v4

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
[![npm version](https://img.shields.io/npm/v/zon-format.svg)](https://www.npmjs.com/package/zon-format)
44
[![GitHub stars](https://img.shields.io/github/stars/ZON-Format/zon-TS?style=social)](https://github.com/ZON-Format/zon-TS)
55
[![TypeScript](https://img.shields.io/badge/TypeScript-5.x-blue.svg)](https://www.typescriptlang.org/)
6-
[![Tests](https://img.shields.io/badge/tests-288%2F288%20passing-brightgreen.svg)](#quality--testing)
6+
[![Tests](https://img.shields.io/badge/tests-297%2F297%20passing-brightgreen.svg)](#quality--testing)
77
[![npm downloads](https://img.shields.io/npm/dm/zon-format?color=red)](https://www.npmjs.com/package/zon-format)
88
[![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
99

benchmarks/scripts/check-regressions.js

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,78 @@ const fs = require('fs');
99
const path = require('path');
1010
const { ZonEvaluator, registerBuiltinMetrics, FileEvalStorage } = require('../../dist/evals');
1111

12+
const { parseArgs } = require('util');
13+
1214
async function checkRegressions() {
13-
console.log('🔍 Checking for regressions...\n');
15+
const { values } = parseArgs({
16+
args: process.argv.slice(2),
17+
options: {
18+
type: { type: 'string', default: 'smoke' }
19+
}
20+
});
21+
22+
console.log(`🔍 Checking for regressions for ${values.type}...\n`);
1423

24+
if (values.type === 'full') {
25+
const resultsPath = path.join(__dirname, '../results/accuracy-results.json');
26+
const baselinePath = path.join(__dirname, '../results/accuracy-baseline.json');
27+
28+
if (!fs.existsSync(resultsPath)) {
29+
console.log('⚠️ No full evaluation results found');
30+
process.exit(0);
31+
}
32+
33+
if (!fs.existsSync(baselinePath)) {
34+
console.log('ℹ️ No baseline found - this will become the baseline');
35+
process.exit(0);
36+
}
37+
38+
const results = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
39+
const baseline = JSON.parse(fs.readFileSync(baselinePath, 'utf-8'));
40+
41+
// Compare per-model, per-format accuracy against the baseline
42+
let hasCritical = false;
43+
const models = Object.keys(results.models);
44+
45+
models.forEach(model => {
46+
const currentModel = results.models[model];
47+
const baselineModel = baseline.models[model];
48+
49+
if (!baselineModel) return;
50+
51+
Object.keys(currentModel).forEach(format => {
52+
const current = currentModel[format];
53+
const base = baselineModel[format];
54+
55+
if (!base) return;
56+
57+
// Check accuracy regression
58+
const currentAcc = current.total > 0 ? (current.correct / current.total) : 0;
59+
const baseAcc = base.total > 0 ? (base.correct / base.total) : 0;
60+
61+
if (currentAcc < baseAcc - 0.05) { // 5% drop
62+
console.log(`🔴 ${model}/${format} Accuracy: ${baseAcc.toFixed(2)} -> ${currentAcc.toFixed(2)}`);
63+
hasCritical = true;
64+
}
65+
});
66+
});
67+
68+
if (hasCritical) {
69+
console.error('❌ Critical regressions detected');
70+
process.exit(1);
71+
}
72+
73+
console.log('✅ No regressions detected!\n');
74+
process.exit(0);
75+
}
76+
1577
const storage = new FileEvalStorage('./benchmarks/results');
1678

1779
try {
1880
// Load latest results
1981
const latest = await storage.getLatest('smoke-test');
2082
if (!latest) {
21-
console.log('⚠️ No recent eval results found');
83+
console.log('⚠️ No recent smoke test results found');
2284
process.exit(0);
2385
}
2486

benchmarks/scripts/save-baseline.js

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,43 @@ const fs = require('fs');
99
const path = require('path');
1010
const { FileEvalStorage } = require('../../dist/evals');
1111

12+
const { parseArgs } = require('util');
13+
1214
async function saveBaseline() {
13-
console.log('💾 Saving baseline...\n');
15+
const { values } = parseArgs({
16+
args: process.argv.slice(2),
17+
options: {
18+
type: { type: 'string', default: 'smoke' }
19+
}
20+
});
21+
22+
console.log(`💾 Saving baseline for ${values.type}...\n`);
1423

24+
if (values.type === 'full') {
25+
const resultsPath = path.join(__dirname, '../results/accuracy-results.json');
26+
const baselinePath = path.join(__dirname, '../results/accuracy-baseline.json');
27+
28+
if (!fs.existsSync(resultsPath)) {
29+
console.error('❌ No full evaluation results found');
30+
process.exit(1);
31+
}
32+
33+
const results = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
34+
fs.writeFileSync(baselinePath, JSON.stringify(results, null, 2));
35+
36+
console.log('✅ Full baseline saved successfully!');
37+
console.log(` Timestamp: ${results.timestamp}`);
38+
process.exit(0);
39+
}
40+
1541
const storage = new FileEvalStorage('./benchmarks/results');
1642

1743
try {
1844
// Load latest results
1945
const latest = await storage.getLatest('smoke-test');
2046
if (!latest) {
21-
console.error('❌ No recent eval results to save as baseline');
47+
console.error('❌ No recent smoke test results to save as baseline');
48+
console.error(' Run "npm run eval:smoke" first');
2249
process.exit(1);
2350
}
2451

@@ -31,7 +58,7 @@ async function saveBaseline() {
3158

3259
await storage.save(baselineResult);
3360

34-
console.log('✅ Baseline saved successfully!');
61+
console.log('✅ Smoke test baseline saved successfully!');
3562
console.log(` Test ID: ${baselineResult.testId}`);
3663
console.log(` Timestamp: ${new Date(baselineResult.timestamp).toISOString()}`);
3764
process.exit(0);

0 commit comments

Comments
 (0)