-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathreporter.ts
More file actions
154 lines (135 loc) · 4.59 KB
/
reporter.ts
File metadata and controls
154 lines (135 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/**
* Result reporting and formatting for evaluation results
*/
import { FocusedEvaluationResults } from './types.js';
import fs from 'fs';
import path, { dirname } from 'path';
import { fileURLToPath } from 'url';
/**
 * Formats evaluation results for the console and persists them as JSON.
 *
 * Every `print*` method writes straight to `console.log`; none of them return
 * a value. The class holds no instance state.
 */
export class EvaluationReporter {
  /**
   * Print a side-by-side comparison of several model runs.
   *
   * Output consists of a ranking by `overallScore` (highest first), a table
   * with the overall / tool / order / efficiency percentages, and the
   * scenario ids that most often scored exactly 1.0 or below 0.5.
   *
   * @param results - One entry per evaluated model. The array is copied
   *   before sorting, so the caller's ordering is left untouched.
   */
  printComparison(results: FocusedEvaluationResults[]): void {
    console.log('\n' + '='.repeat(80));
    console.log('🏆 MODEL COMPARISON - TOOL USAGE EFFECTIVENESS');
    console.log('='.repeat(80));

    // Sort a copy by overall score, best first.
    const sorted = [...results].sort((a, b) => b.overallScore - a.overallScore);

    console.log('\n📊 OVERALL RANKINGS:');
    sorted.forEach((result, index) => {
      const rank = index + 1;
      // Medals for the podium, a blank placeholder for everyone else.
      const emoji = rank === 1 ? '🥇' : rank === 2 ? '🥈' : rank === 3 ? '🥉' : ' ';
      console.log(
        `${emoji} ${rank}. ${result.model.padEnd(20)} ${(result.overallScore * 100).toFixed(1)}%`
      );
    });

    console.log('\n📋 DETAILED BREAKDOWN:');
    console.log(
      'Model'.padEnd(20) +
        'Overall'.padEnd(10) +
        'Tools'.padEnd(10) +
        'Order'.padEnd(10) +
        'Efficiency'
    );
    console.log('-'.repeat(60));
    sorted.forEach(result => {
      const overall = (result.overallScore * 100).toFixed(1) + '%';
      const tools = (result.correctToolUsage * 100).toFixed(1) + '%';
      const order = (result.correctOrderUsage * 100).toFixed(1) + '%';
      const efficiency = (result.efficiencyScore * 100).toFixed(1) + '%';
      console.log(
        result.model.padEnd(20) +
          overall.padEnd(10) +
          tools.padEnd(10) +
          order.padEnd(10) +
          efficiency
      );
    });

    // Find best and worst scenarios across every model's per-scenario results.
    console.log('\n🎯 KEY INSIGHTS:');
    const allResults = results.flatMap(r => r.results);
    const bestScenarios = allResults.filter(r => r.score === 1.0);
    const worstScenarios = allResults.filter(r => r.score < 0.5);

    if (bestScenarios.length > 0) {
      const commonBest = this.findMostCommon(bestScenarios.map(r => r.scenarioId));
      console.log(`✅ Best performing scenario: ${commonBest} (consistent across models)`);
    }
    if (worstScenarios.length > 0) {
      const commonWorst = this.findMostCommon(worstScenarios.map(r => r.scenarioId));
      console.log(`❌ Most challenging scenario: ${commonWorst} (needs improvement)`);
    }
    console.log('='.repeat(80));
  }

  /**
   * Print a one-line progress update for a single scenario.
   *
   * @param index - Zero-based position of the scenario; shown one-based.
   * @param total - Total number of scenarios in the run.
   * @param scenario - Scenario whose `description` is printed.
   * @param result - Result whose `score` selects the status icon
   *   (above 0.8 ✅, above 0.5 ⚠️, otherwise ❌) and is shown as a whole percent.
   */
  printScenarioProgress(
    index: number,
    total: number,
    scenario: { description: string },
    result: { score: number }
  ): void {
    const status = result.score > 0.8 ? '✅' : result.score > 0.5 ? '⚠️' : '❌';
    console.log(
      ` ${index + 1}/${total} ${status} ${scenario.description} (${(result.score * 100).toFixed(0)}%)`
    );
  }

  /**
   * Print a one-line error report for a scenario that failed to run.
   *
   * @param index - Zero-based position of the scenario; shown one-based.
   * @param total - Total number of scenarios in the run.
   * @param scenario - Scenario whose `description` is printed.
   * @param error - Error or message; interpolated with its default string form.
   */
  printScenarioError(
    index: number,
    total: number,
    scenario: { description: string },
    error: Error | string
  ): void {
    console.log(` ${index + 1}/${total} ❌ ${scenario.description} - ERROR: ${error}`);
  }

  /**
   * Print the four aggregate scores of one model as percentages.
   *
   * @param modelName - Accepted for API compatibility; this method does not
   *   read it, so no model header is printed here.
   * @param result - Aggregated scores to print.
   */
  printModelSummary(modelName: string, result: FocusedEvaluationResults): void {
    console.log(` Overall Score: ${(result.overallScore * 100).toFixed(1)}%`);
    console.log(` Correct Tools: ${(result.correctToolUsage * 100).toFixed(1)}%`);
    console.log(` Correct Order: ${(result.correctOrderUsage * 100).toFixed(1)}%`);
    console.log(` Efficiency: ${(result.efficiencyScore * 100).toFixed(1)}%`);
  }

  /**
   * Serialize results as pretty-printed JSON and write them to disk.
   *
   * Saving is best-effort: any failure is reported through `console.warn`
   * and the returned promise still resolves.
   *
   * @param results - Results to serialize (2-space indented JSON).
   * @param filename - Destination path. Defaults to
   *   `results/<ISO timestamp>.json` next to this module.
   *   NOTE(review): the ISO timestamp contains ':' characters, which Windows
   *   does not accept in file names — confirm whether that platform matters.
   */
  async saveResults(
    results: FocusedEvaluationResults[],
    filename: string = path.join(
      dirname(fileURLToPath(import.meta.url)),
      'results',
      `${new Date(Date.now()).toISOString()}.json`
    )
  ): Promise<void> {
    try {
      // `recursive: true` makes mkdir succeed when the directory already
      // exists (every save after the first one) and creates missing parents.
      // Without it the call rejects with EEXIST/ENOENT and nothing is written.
      await fs.promises.mkdir(path.dirname(filename), { recursive: true });
      await fs.promises.writeFile(filename, JSON.stringify(results, null, 2));
      console.log(`\n📁 Results saved to: ${filename}`);
    } catch (error) {
      console.warn(`⚠️ Could not save results: ${error}`);
    }
  }

  /**
   * Find the most frequent string in a list.
   *
   * @param items - Values to count.
   * @returns The value with the highest count; on a tie the one that
   *   appeared first wins. Returns `''` for an empty list.
   */
  private findMostCommon(items: string[]): string {
    const counts = new Map<string, number>();
    items.forEach(item => counts.set(item, (counts.get(item) ?? 0) + 1));

    let maxCount = 0;
    let mostCommon = '';
    // Map iterates in insertion order and the comparison is strict, so the
    // earliest-seen value is kept when counts are equal.
    counts.forEach((count, item) => {
      if (count > maxCount) {
        maxCount = count;
        mostCommon = item;
      }
    });
    return mostCommon;
  }
}