|
| 1 | +use anyhow::{Context, Result}; |
| 2 | +use chrono::Utc; |
| 3 | +use std::path::Path; |
| 4 | + |
| 5 | +use crate::commands::eval::EvalReport; |
| 6 | + |
| 7 | +use super::super::types::{FeedbackEvalTrend, FeedbackEvalTrendEntry, FeedbackEvalTrendGap}; |
| 8 | +use super::super::{FeedbackEvalReport, FeedbackEvalRuleCorrelation}; |
| 9 | + |
| 10 | +const MAX_ATTENTION_GAPS: usize = 5; |
| 11 | + |
| 12 | +pub(in super::super) async fn update_feedback_eval_trend( |
| 13 | + report: &FeedbackEvalReport, |
| 14 | + eval_report: Option<&EvalReport>, |
| 15 | + path: &Path, |
| 16 | +) -> Result<()> { |
| 17 | + let mut trend = if path.exists() { |
| 18 | + let content = tokio::fs::read_to_string(path) |
| 19 | + .await |
| 20 | + .with_context(|| format!("failed to read feedback trend file {}", path.display()))?; |
| 21 | + FeedbackEvalTrend::from_json(&content) |
| 22 | + .with_context(|| format!("failed to parse feedback trend file {}", path.display()))? |
| 23 | + } else { |
| 24 | + FeedbackEvalTrend::new() |
| 25 | + }; |
| 26 | + trend |
| 27 | + .entries |
| 28 | + .push(trend_entry_for_report(report, eval_report)); |
| 29 | + |
| 30 | + if let Some(parent) = path.parent() { |
| 31 | + tokio::fs::create_dir_all(parent) |
| 32 | + .await |
| 33 | + .with_context(|| format!("failed to create {}", parent.display()))?; |
| 34 | + } |
| 35 | + tokio::fs::write(path, trend.to_json()?) |
| 36 | + .await |
| 37 | + .with_context(|| format!("failed to write feedback trend file {}", path.display()))?; |
| 38 | + Ok(()) |
| 39 | +} |
| 40 | + |
| 41 | +fn trend_entry_for_report( |
| 42 | + report: &FeedbackEvalReport, |
| 43 | + eval_report: Option<&EvalReport>, |
| 44 | +) -> FeedbackEvalTrendEntry { |
| 45 | + let confidence_metrics = report.confidence_metrics; |
| 46 | + let correlation = report.eval_correlation.as_ref(); |
| 47 | + FeedbackEvalTrendEntry { |
| 48 | + timestamp: Utc::now().to_rfc3339(), |
| 49 | + labeled_comments: report.labeled_comments, |
| 50 | + accepted: report.accepted, |
| 51 | + rejected: report.rejected, |
| 52 | + acceptance_rate: report.acceptance_rate, |
| 53 | + confidence_threshold: report.confidence_threshold, |
| 54 | + confidence_agreement_rate: confidence_metrics.map(|metrics| metrics.agreement_rate), |
| 55 | + confidence_precision: confidence_metrics.map(|metrics| metrics.precision), |
| 56 | + confidence_recall: confidence_metrics.map(|metrics| metrics.recall), |
| 57 | + confidence_f1: confidence_metrics.map(|metrics| metrics.f1), |
| 58 | + eval_label: eval_report.and_then(|report| report.run.label.clone()), |
| 59 | + eval_model: eval_report.map(|report| report.run.model.clone()), |
| 60 | + eval_provider: eval_report.and_then(|report| report.run.provider.clone()), |
| 61 | + attention_by_category: correlation |
| 62 | + .map(|report| { |
| 63 | + report |
| 64 | + .attention_by_category |
| 65 | + .iter() |
| 66 | + .take(MAX_ATTENTION_GAPS) |
| 67 | + .map(|category| FeedbackEvalTrendGap { |
| 68 | + name: category.name.clone(), |
| 69 | + feedback_total: category.feedback_total, |
| 70 | + high_confidence_total: category.high_confidence_total, |
| 71 | + high_confidence_acceptance_rate: category.high_confidence_acceptance_rate, |
| 72 | + eval_score: category.eval_micro_f1, |
| 73 | + gap: category.high_confidence_vs_eval_gap, |
| 74 | + }) |
| 75 | + .collect() |
| 76 | + }) |
| 77 | + .unwrap_or_default(), |
| 78 | + attention_by_rule: correlation |
| 79 | + .map(|report| { |
| 80 | + report |
| 81 | + .attention_by_rule |
| 82 | + .iter() |
| 83 | + .take(MAX_ATTENTION_GAPS) |
| 84 | + .map(rule_gap) |
| 85 | + .collect() |
| 86 | + }) |
| 87 | + .unwrap_or_default(), |
| 88 | + } |
| 89 | +} |
| 90 | + |
| 91 | +fn rule_gap(rule: &FeedbackEvalRuleCorrelation) -> FeedbackEvalTrendGap { |
| 92 | + FeedbackEvalTrendGap { |
| 93 | + name: rule.rule_id.clone(), |
| 94 | + feedback_total: rule.feedback_total, |
| 95 | + high_confidence_total: rule.high_confidence_total, |
| 96 | + high_confidence_acceptance_rate: rule.high_confidence_acceptance_rate, |
| 97 | + eval_score: rule.eval_f1, |
| 98 | + gap: rule.high_confidence_vs_eval_gap, |
| 99 | + } |
| 100 | +} |
| 101 | + |
#[cfg(test)]
mod tests {
    use tempfile::tempdir;

    use crate::commands::eval::EvalRunMetadata;
    use crate::commands::feedback_eval::{
        FeedbackEvalBucket, FeedbackEvalCategoryCorrelation, FeedbackEvalCorrelationReport,
        FeedbackEvalReport, FeedbackEvalRuleCorrelation, FeedbackThresholdMetrics,
    };

    use super::*;

    // Threshold metrics fixture: six scored comments, all rates at 0.67.
    fn sample_confidence_metrics() -> FeedbackThresholdMetrics {
        FeedbackThresholdMetrics {
            total_scored: 6,
            true_positive: 2,
            false_positive: 1,
            true_negative: 2,
            false_negative: 1,
            precision: 0.67,
            recall: 0.67,
            f1: 0.67,
            agreement_rate: 0.67,
        }
    }

    // Correlation fixture with exactly one attention category and one
    // attention rule.
    fn sample_correlation() -> FeedbackEvalCorrelationReport {
        let security_category = FeedbackEvalCategoryCorrelation {
            name: "Security".to_string(),
            feedback_total: 4,
            feedback_acceptance_rate: 0.25,
            high_confidence_total: 3,
            high_confidence_acceptance_rate: 0.0,
            eval_fixture_count: Some(5),
            eval_micro_f1: Some(0.9),
            eval_weighted_score: Some(0.91),
            feedback_vs_eval_gap: Some(0.65),
            high_confidence_vs_eval_gap: Some(0.9),
        };
        let sql_injection_rule = FeedbackEvalRuleCorrelation {
            rule_id: "sec.sql.injection".to_string(),
            feedback_total: 3,
            feedback_acceptance_rate: 0.33,
            high_confidence_total: 2,
            high_confidence_acceptance_rate: 0.0,
            eval_precision: Some(1.0),
            eval_recall: Some(1.0),
            eval_f1: Some(1.0),
            feedback_vs_eval_gap: Some(0.67),
            high_confidence_vs_eval_gap: Some(1.0),
        };

        FeedbackEvalCorrelationReport {
            by_category: vec![],
            by_rule: vec![],
            attention_by_category: vec![security_category],
            attention_by_rule: vec![sql_injection_rule],
        }
    }

    // Feedback report fixture carrying both confidence metrics and an eval
    // correlation.
    fn sample_feedback_report() -> FeedbackEvalReport {
        let vague_comments = FeedbackEvalBucket {
            name: "vague".to_string(),
            total: 1,
            accepted: 0,
            rejected: 1,
            acceptance_rate: 0.0,
        };

        FeedbackEvalReport {
            total_comments_seen: 12,
            total_reviews_seen: 3,
            labeled_comments: 8,
            labeled_reviews: 2,
            accepted: 3,
            rejected: 5,
            acceptance_rate: 0.375,
            confidence_threshold: 0.75,
            vague_comments,
            confidence_metrics: Some(sample_confidence_metrics()),
            by_category: vec![],
            by_rule: vec![],
            high_confidence_by_category: vec![],
            high_confidence_by_rule: vec![],
            by_severity: vec![],
            by_repo: vec![],
            by_file_pattern: vec![],
            eval_correlation: Some(sample_correlation()),
            showcase_candidates: vec![],
            vague_rejections: vec![],
        }
    }

    // Eval report fixture: only the run metadata is populated.
    fn sample_eval_report() -> EvalReport {
        let run = EvalRunMetadata {
            label: Some("frontier-e2e".to_string()),
            model: "anthropic/claude-opus-4.5".to_string(),
            provider: Some("openrouter".to_string()),
            ..Default::default()
        };

        EvalReport {
            run,
            fixtures_total: 0,
            fixtures_passed: 0,
            fixtures_failed: 0,
            rule_metrics: vec![],
            rule_summary: None,
            benchmark_summary: None,
            suite_results: vec![],
            benchmark_by_category: Default::default(),
            benchmark_by_language: Default::default(),
            benchmark_by_difficulty: Default::default(),
            suite_comparisons: vec![],
            category_comparisons: vec![],
            language_comparisons: vec![],
            verification_health: None,
            warnings: vec![],
            threshold_failures: vec![],
            results: vec![],
        }
    }

    // Two consecutive updates against the same path must leave two entries in
    // the file, and the first entry must carry the eval metadata and the
    // attention gaps from the fixtures.
    #[tokio::test]
    async fn update_feedback_eval_trend_appends_attention_entries() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("feedback-trend.json");
        let feedback = sample_feedback_report();
        let eval = sample_eval_report();

        for _ in 0..2 {
            update_feedback_eval_trend(&feedback, Some(&eval), &path)
                .await
                .unwrap();
        }

        let content = tokio::fs::read_to_string(&path).await.unwrap();
        let trend = FeedbackEvalTrend::from_json(&content).unwrap();
        assert_eq!(trend.entries.len(), 2);

        let first = &trend.entries[0];
        assert_eq!(first.eval_label.as_deref(), Some("frontier-e2e"));
        assert_eq!(
            first.eval_model.as_deref(),
            Some("anthropic/claude-opus-4.5")
        );
        assert_eq!(first.attention_by_category.len(), 1);
        assert_eq!(first.attention_by_category[0].name, "Security");
        assert_eq!(first.attention_by_rule[0].name, "sec.sql.injection");
    }
}
0 commit comments