@@ -29,15 +29,44 @@ import type {
2929} from '../../types/audit-types.js' ;
3030import type { ValidationResult } from '../../types/index.js' ;
3131
/**
 * Claude Sonnet 4.6 pricing configuration.
 *
 * All rates are expressed in USD per one million tokens.
 * `averageCostPerMillion` is the midpoint of the input and output rates,
 * i.e. it assumes an even 50/50 split between input and output tokens.
 *
 * @see https://www.anthropic.com/pricing
 */
const PRICING_CONFIG = {
  model: 'claude-sonnet-4.6',
  inputCostPerMillion: 3, // USD per 1M input tokens
  outputCostPerMillion: 15, // USD per 1M output tokens
  averageCostPerMillion: 9, // (3 + 15) / 2, assuming 50/50 input/output
  effectiveDate: '2026-05-01',
} as const;

/** Divisor used to convert a raw token count into millions of tokens. */
const TOKENS_PER_MILLION = 1_000_000;
45+
/**
 * Minimum score required for each letter grade, following the conventional
 * academic grading scale.
 */
const GRADE_THRESHOLDS = {
  A: 90, // Excellent: ready for production
  B: 80, // Good: minor improvements needed
  C: 70, // Acceptable: significant improvements recommended
  D: 60, // Poor: major issues must be addressed
  F: 0, // Failing: unsuitable for production
} as const;

/**
 * Calculate the letter grade for a numerical score.
 *
 * Thresholds are inclusive lower bounds. Any score below the `D` threshold,
 * including negative scores and `NaN`, is graded `'F'`.
 *
 * @param score - Numerical score to grade
 * @returns The best grade whose threshold the score meets, or `'F'`
 */
function calculateGrade(score: number): 'A' | 'B' | 'C' | 'D' | 'F' {
  // Ordered best to worst so the first threshold met is the grade awarded.
  const passingGrades = ['A', 'B', 'C', 'D'] as const;
  for (const grade of passingGrades) {
    if (score >= GRADE_THRESHOLDS[grade]) return grade;
  }
  // Comparisons against NaN are always false, so NaN also lands here.
  return 'F';
}
3767
3868const DEFAULT_AUDIT_CONFIG : AuditConfig = {
3969 iterations : 1 ,
40- benchmark : false ,
4170 format : 'text' ,
4271 confidenceLevel : 0.95 ,
4372 thresholds : {
@@ -47,6 +76,32 @@ const DEFAULT_AUDIT_CONFIG: AuditConfig = {
4776 } ,
4877} ;
4978
79+ /**
80+ * Run comprehensive harness audit with statistical analysis
81+ *
82+ * Executes the harness validator multiple times and provides detailed
83+ * statistical analysis including mean, median, standard deviation, and
84+ * confidence intervals for accuracy, latency, and token usage.
85+ *
86+ * @param skillPath - Absolute or relative path to skill directory or SKILL.md
87+ * @param options - Audit configuration options
88+ * @returns Exit code: 0 for pass, 1 for fail, 2 for execution error
89+ * @throws {Error } When skill path is invalid or cannot be loaded
90+ *
91+ * @example
92+ * ```typescript
93+ * // Basic audit
94+ * const exitCode = await auditCommand('../skills/my-skill');
95+ *
96+ * // Statistical confidence with 10 iterations
97+ * const exitCode = await auditCommand('../skills/my-skill', {
98+ * iterations: 10,
99+ * format: 'html',
100+ * output: 'reports/audit.html',
101+ * baseline: 'baselines/v1.0.0.json'
102+ * });
103+ * ```
104+ */
50105export async function auditCommand (
51106 skillPath : string ,
52107 options : AuditOptions = { } ,
@@ -94,9 +149,22 @@ export async function auditCommand(
94149 if ( config . baseline ) {
95150 try {
96151 const data = await readFile ( config . baseline , 'utf-8' ) ;
97- baselineData = JSON . parse ( data ) as AuditResult ;
152+ const parsed = JSON . parse ( data ) ;
153+
154+ // Validate baseline structure
155+ if ( ! parsed . statistics ?. accuracy ?. mean ||
156+ ! parsed . statistics ?. latency ?. mean ||
157+ ! parsed . statistics ?. tokenUsage ?. mean ||
158+ ! parsed . timestamp ) {
159+ throw new Error ( 'Invalid baseline format: missing required statistics or timestamp' ) ;
160+ }
161+
162+ baselineData = parsed as AuditResult ;
163+ Logger . info ( `Loaded baseline from ${ config . baseline } ` ) ;
98164 } catch ( error ) {
99- Logger . warning ( `Failed to load baseline: ${ error instanceof Error ? error . message : String ( error ) } ` ) ;
165+ const message = error instanceof Error ? error . message : String ( error ) ;
166+ Logger . warning ( `Failed to load baseline: ${ message } ` ) ;
167+ Logger . info ( 'Continuing without baseline comparison' ) ;
100168 }
101169 }
102170
@@ -130,34 +198,43 @@ export async function auditCommand(
130198 }
131199}
132200
/**
 * Extract harness metadata from a validation result with runtime type checking.
 *
 * Each metric is read from `result.metrics`. A value that is not a number, or
 * that is `NaN`, is replaced with 0 so the returned object always holds
 * usable numbers.
 *
 * @param result - Validation result whose `metrics` are read
 * @returns The harness metrics, or `undefined` when the result has no metrics
 */
function extractHarnessMetadata(result: ValidationResult): HarnessIterationMetadata | undefined {
  const metadata = result.metrics;
  if (!metadata) return undefined;

  // Metrics arrive untyped, so check at runtime instead of asserting a type.
  const getNumber = (value: unknown): number =>
    typeof value === 'number' && !Number.isNaN(value) ? value : 0;

  return {
    totalCases: getNumber(metadata.totalCases),
    passed: getNumber(metadata.passed),
    failed: getNumber(metadata.failed),
    accuracy: getNumber(metadata.accuracy),
    totalTokens: getNumber(metadata.totalTokens),
    averageLatency: getNumber(metadata.averageLatency),
  };
}
147221
/**
 * Extract a specific metric from iterations
 * @param iterations - Array of audit iterations
 * @param key - Metadata key to extract
 * @returns Array of metric values (includes zeros, excludes missing metadata)
 */
function extractMetric(
  iterations: readonly AuditIteration[],
  key: keyof HarnessIterationMetadata
): number[] {
  return iterations.flatMap(it => {
    const metadata = it.harnessMetadata;
    // Iterations without harness metadata contribute no sample at all.
    if (metadata === undefined) return [];

    const value = metadata[key];
    // A present-but-unusable value (non-number or NaN) is kept as a 0 sample.
    return [typeof value === 'number' && !Number.isNaN(value) ? value : 0];
  });
}
162239
163240async function computeAuditResult (
@@ -171,7 +248,7 @@ async function computeAuditResult(
171248 const latencies = extractMetric ( iterations , 'averageLatency' ) ;
172249 const tokens = extractMetric ( iterations , 'totalTokens' ) ;
173250
174- const costs = tokens . map ( t => ( t / TOKENS_PER_MILLION ) * CLAUDE_SONNET_AVERAGE_COST_PER_MILLION ) ;
251+ const costs = tokens . map ( t => ( t / TOKENS_PER_MILLION ) * PRICING_CONFIG . averageCostPerMillion ) ;
175252
176253 const statistics = {
177254 accuracy : summarize ( accuracies , config . confidenceLevel ) ,
@@ -212,6 +289,9 @@ async function computeAuditResult(
212289 return result ;
213290}
214291
292+ /**
293+ * Assess audit quality and assign grade based on performance metrics
294+ */
215295function assessQuality (
216296 statistics : AuditResult [ 'statistics' ] ,
217297 aggregated : AuditResult [ 'aggregated' ] ,
@@ -221,6 +301,18 @@ function assessQuality(
221301 const recommendations : string [ ] = [ ] ;
222302 let score = 100 ;
223303
304+ // Handle edge case: no tests executed
305+ if ( aggregated . totalTests === 0 ) {
306+ issues . push ( 'No test cases were executed' ) ;
307+ return {
308+ grade : 'F' ,
309+ score : 0 ,
310+ passed : false ,
311+ issues,
312+ recommendations : [ 'Ensure harness validator has valid test cases' ] ,
313+ } ;
314+ }
315+
224316 if ( statistics . accuracy . mean < config . thresholds . minAccuracy ) {
225317 issues . push ( `Accuracy ${ statistics . accuracy . mean . toFixed ( 1 ) } % is below ${ config . thresholds . minAccuracy } % threshold` ) ;
226318 recommendations . push ( 'Improve skill description and trigger keywords' ) ;
@@ -236,7 +328,7 @@ function assessQuality(
236328 score -= 20 ;
237329 }
238330
239- const tokensPerTest = aggregated . totalTokens / Math . max ( 1 , aggregated . totalTests ) ;
331+ const tokensPerTest = aggregated . totalTokens / aggregated . totalTests ;
240332 if ( tokensPerTest > config . thresholds . maxTokensPerTest ) {
241333 issues . push ( `Average tokens per test ${ tokensPerTest . toFixed ( 0 ) } exceeds ${ config . thresholds . maxTokensPerTest } threshold` ) ;
242334 recommendations . push ( 'Reduce skill content size or use reference files' ) ;
@@ -249,8 +341,8 @@ function assessQuality(
249341 score -= 10 ;
250342 }
251343
252- const grade = score >= 90 ? 'A' : score >= 80 ? 'B' : score >= 70 ? 'C' : score >= 60 ? 'D' : 'F' ;
253- const passed = score >= 70 && statistics . accuracy . mean >= config . thresholds . minAccuracy ;
344+ const grade = calculateGrade ( score ) ;
345+ const passed = score >= GRADE_THRESHOLDS . C && statistics . accuracy . mean >= config . thresholds . minAccuracy ;
254346
255347 return {
256348 grade,
@@ -261,10 +353,27 @@ function assessQuality(
261353 } ;
262354}
263355
356+ /**
357+ * Compare current audit results with historical baseline
358+ * Warns if baseline is stale or iteration counts differ significantly
359+ */
264360function compareWithBaseline (
265361 current : AuditResult ,
266362 baseline : AuditResult ,
267363) : AuditResult [ 'baseline' ] {
364+ // Warn if baseline is old (> 30 days)
365+ const baselineDate = new Date ( baseline . timestamp ) ;
366+ const daysSinceBaseline = ( Date . now ( ) - baselineDate . getTime ( ) ) / ( 1000 * 60 * 60 * 24 ) ;
367+
368+ if ( daysSinceBaseline > 30 ) {
369+ Logger . warning ( `Baseline is ${ Math . floor ( daysSinceBaseline ) } days old - consider updating` ) ;
370+ }
371+
372+ // Warn if iteration counts differ significantly
373+ if ( Math . abs ( current . iterations . length - baseline . iterations . length ) > 2 ) {
374+ Logger . warning ( 'Baseline used different iteration count - comparison may be less reliable' ) ;
375+ }
376+
268377 const accuracyDelta = current . statistics . accuracy . mean - baseline . statistics . accuracy . mean ;
269378 const latencyDelta = current . statistics . latency . mean - baseline . statistics . latency . mean ;
270379 const tokenDelta = current . statistics . tokenUsage . mean - baseline . statistics . tokenUsage . mean ;
@@ -283,7 +392,6 @@ async function buildAuditConfig(options: AuditOptions): Promise<AuditConfig> {
283392 return {
284393 ...DEFAULT_AUDIT_CONFIG ,
285394 iterations : options . iterations ?? DEFAULT_AUDIT_CONFIG . iterations ,
286- benchmark : options . benchmark ?? DEFAULT_AUDIT_CONFIG . benchmark ,
287395 format : options . format ?? DEFAULT_AUDIT_CONFIG . format ,
288396 output : options . output ,
289397 baseline : options . baseline ,
0 commit comments