Skip to content

Commit f6547d3

Browse files
Merge remote-tracking branch 'origin/main' into cursor/dag-execution-parallelization-429d
Co-authored-by: EvalOpsBot <EvalOpsBot@users.noreply.github.com>
2 parents 0bfc904 + a4639ff commit f6547d3

File tree

14 files changed

+513
-13
lines changed

14 files changed

+513
-13
lines changed

src/commands/feedback_eval/command.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ use report::emit_feedback_eval_report;
1212
pub async fn feedback_eval_command(
1313
input_path: PathBuf,
1414
output_path: Option<PathBuf>,
15+
trend_path: Option<PathBuf>,
1516
confidence_threshold: f32,
1617
eval_report_path: Option<PathBuf>,
1718
) -> Result<()> {
@@ -23,6 +24,7 @@ pub async fn feedback_eval_command(
2324
emit_feedback_eval_report(
2425
&loaded,
2526
output_path.as_deref(),
27+
trend_path.as_deref(),
2628
confidence_threshold,
2729
eval_report.as_ref(),
2830
)

src/commands/feedback_eval/command/report.rs

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,15 @@ use std::path::Path;
44
use crate::commands::eval::EvalReport;
55

66
use super::super::report::{
7-
build_feedback_eval_report, print_feedback_eval_report, write_feedback_eval_report,
7+
build_feedback_eval_report, print_feedback_eval_report, update_feedback_eval_trend,
8+
write_feedback_eval_report,
89
};
910
use super::super::LoadedFeedbackEvalInput;
1011

1112
pub(super) async fn emit_feedback_eval_report(
1213
loaded: &LoadedFeedbackEvalInput,
1314
output_path: Option<&Path>,
15+
trend_path: Option<&Path>,
1416
confidence_threshold: f32,
1517
eval_report: Option<&EvalReport>,
1618
) -> Result<()> {
@@ -21,6 +23,9 @@ pub(super) async fn emit_feedback_eval_report(
2123
if let Some(path) = output_path {
2224
write_feedback_eval_report(&report, path).await?;
2325
}
26+
if let Some(path) = trend_path {
27+
update_feedback_eval_trend(&report, eval_report, path).await?;
28+
}
2429

2530
Ok(())
2631
}

src/commands/feedback_eval/report.rs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,9 @@ mod build;
44
mod examples;
55
#[path = "report/output.rs"]
66
mod output;
7+
#[path = "report/trend.rs"]
8+
mod trend;
79

810
pub(super) use build::build_feedback_eval_report;
911
pub(super) use output::{print_feedback_eval_report, write_feedback_eval_report};
12+
pub(super) use trend::update_feedback_eval_trend;
src/commands/feedback_eval/report/trend.rs

Lines changed: 245 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,245 @@
1+
use anyhow::{Context, Result};
2+
use chrono::Utc;
3+
use std::path::Path;
4+
5+
use crate::commands::eval::EvalReport;
6+
7+
use super::super::types::{FeedbackEvalTrend, FeedbackEvalTrendEntry, FeedbackEvalTrendGap};
8+
use super::super::{FeedbackEvalReport, FeedbackEvalRuleCorrelation};
9+
10+
const MAX_ATTENTION_GAPS: usize = 5;
11+
12+
pub(in super::super) async fn update_feedback_eval_trend(
13+
report: &FeedbackEvalReport,
14+
eval_report: Option<&EvalReport>,
15+
path: &Path,
16+
) -> Result<()> {
17+
let mut trend = if path.exists() {
18+
let content = tokio::fs::read_to_string(path)
19+
.await
20+
.with_context(|| format!("failed to read feedback trend file {}", path.display()))?;
21+
FeedbackEvalTrend::from_json(&content)
22+
.with_context(|| format!("failed to parse feedback trend file {}", path.display()))?
23+
} else {
24+
FeedbackEvalTrend::new()
25+
};
26+
trend
27+
.entries
28+
.push(trend_entry_for_report(report, eval_report));
29+
30+
if let Some(parent) = path.parent() {
31+
tokio::fs::create_dir_all(parent)
32+
.await
33+
.with_context(|| format!("failed to create {}", parent.display()))?;
34+
}
35+
tokio::fs::write(path, trend.to_json()?)
36+
.await
37+
.with_context(|| format!("failed to write feedback trend file {}", path.display()))?;
38+
Ok(())
39+
}
40+
41+
fn trend_entry_for_report(
42+
report: &FeedbackEvalReport,
43+
eval_report: Option<&EvalReport>,
44+
) -> FeedbackEvalTrendEntry {
45+
let confidence_metrics = report.confidence_metrics;
46+
let correlation = report.eval_correlation.as_ref();
47+
FeedbackEvalTrendEntry {
48+
timestamp: Utc::now().to_rfc3339(),
49+
labeled_comments: report.labeled_comments,
50+
accepted: report.accepted,
51+
rejected: report.rejected,
52+
acceptance_rate: report.acceptance_rate,
53+
confidence_threshold: report.confidence_threshold,
54+
confidence_agreement_rate: confidence_metrics.map(|metrics| metrics.agreement_rate),
55+
confidence_precision: confidence_metrics.map(|metrics| metrics.precision),
56+
confidence_recall: confidence_metrics.map(|metrics| metrics.recall),
57+
confidence_f1: confidence_metrics.map(|metrics| metrics.f1),
58+
eval_label: eval_report.and_then(|report| report.run.label.clone()),
59+
eval_model: eval_report.map(|report| report.run.model.clone()),
60+
eval_provider: eval_report.and_then(|report| report.run.provider.clone()),
61+
attention_by_category: correlation
62+
.map(|report| {
63+
report
64+
.attention_by_category
65+
.iter()
66+
.take(MAX_ATTENTION_GAPS)
67+
.map(|category| FeedbackEvalTrendGap {
68+
name: category.name.clone(),
69+
feedback_total: category.feedback_total,
70+
high_confidence_total: category.high_confidence_total,
71+
high_confidence_acceptance_rate: category.high_confidence_acceptance_rate,
72+
eval_score: category.eval_micro_f1,
73+
gap: category.high_confidence_vs_eval_gap,
74+
})
75+
.collect()
76+
})
77+
.unwrap_or_default(),
78+
attention_by_rule: correlation
79+
.map(|report| {
80+
report
81+
.attention_by_rule
82+
.iter()
83+
.take(MAX_ATTENTION_GAPS)
84+
.map(rule_gap)
85+
.collect()
86+
})
87+
.unwrap_or_default(),
88+
}
89+
}
90+
91+
fn rule_gap(rule: &FeedbackEvalRuleCorrelation) -> FeedbackEvalTrendGap {
92+
FeedbackEvalTrendGap {
93+
name: rule.rule_id.clone(),
94+
feedback_total: rule.feedback_total,
95+
high_confidence_total: rule.high_confidence_total,
96+
high_confidence_acceptance_rate: rule.high_confidence_acceptance_rate,
97+
eval_score: rule.eval_f1,
98+
gap: rule.high_confidence_vs_eval_gap,
99+
}
100+
}
101+
102+
#[cfg(test)]
mod tests {
    use tempfile::tempdir;

    use crate::commands::eval::EvalRunMetadata;
    use crate::commands::feedback_eval::{
        FeedbackEvalBucket, FeedbackEvalCategoryCorrelation, FeedbackEvalCorrelationReport,
        FeedbackEvalReport, FeedbackEvalRuleCorrelation, FeedbackThresholdMetrics,
    };

    use super::*;

    /// Feedback report fixture with confidence metrics and one attention item per
    /// category and per rule.
    fn sample_feedback_report() -> FeedbackEvalReport {
        FeedbackEvalReport {
            total_comments_seen: 12,
            total_reviews_seen: 3,
            labeled_comments: 8,
            labeled_reviews: 2,
            accepted: 3,
            rejected: 5,
            acceptance_rate: 0.375,
            confidence_threshold: 0.75,
            vague_comments: FeedbackEvalBucket {
                name: "vague".to_string(),
                total: 1,
                accepted: 0,
                rejected: 1,
                acceptance_rate: 0.0,
            },
            confidence_metrics: Some(FeedbackThresholdMetrics {
                total_scored: 6,
                true_positive: 2,
                false_positive: 1,
                true_negative: 2,
                false_negative: 1,
                precision: 0.67,
                recall: 0.67,
                f1: 0.67,
                agreement_rate: 0.67,
            }),
            by_category: vec![],
            by_rule: vec![],
            high_confidence_by_category: vec![],
            high_confidence_by_rule: vec![],
            by_severity: vec![],
            by_repo: vec![],
            by_file_pattern: vec![],
            eval_correlation: Some(FeedbackEvalCorrelationReport {
                by_category: vec![],
                by_rule: vec![],
                attention_by_category: vec![FeedbackEvalCategoryCorrelation {
                    name: "Security".to_string(),
                    feedback_total: 4,
                    feedback_acceptance_rate: 0.25,
                    high_confidence_total: 3,
                    high_confidence_acceptance_rate: 0.0,
                    eval_fixture_count: Some(5),
                    eval_micro_f1: Some(0.9),
                    eval_weighted_score: Some(0.91),
                    feedback_vs_eval_gap: Some(0.65),
                    high_confidence_vs_eval_gap: Some(0.9),
                }],
                attention_by_rule: vec![FeedbackEvalRuleCorrelation {
                    rule_id: "sec.sql.injection".to_string(),
                    feedback_total: 3,
                    feedback_acceptance_rate: 0.33,
                    high_confidence_total: 2,
                    high_confidence_acceptance_rate: 0.0,
                    eval_precision: Some(1.0),
                    eval_recall: Some(1.0),
                    eval_f1: Some(1.0),
                    feedback_vs_eval_gap: Some(0.67),
                    high_confidence_vs_eval_gap: Some(1.0),
                }],
            }),
            showcase_candidates: vec![],
            vague_rejections: vec![],
        }
    }

    /// Eval report fixture that only populates the run metadata read by the trend entry.
    fn sample_eval_report() -> EvalReport {
        EvalReport {
            run: EvalRunMetadata {
                label: Some("frontier-e2e".to_string()),
                model: "anthropic/claude-opus-4.5".to_string(),
                provider: Some("openrouter".to_string()),
                ..Default::default()
            },
            fixtures_total: 0,
            fixtures_passed: 0,
            fixtures_failed: 0,
            rule_metrics: vec![],
            rule_summary: None,
            benchmark_summary: None,
            suite_results: vec![],
            benchmark_by_category: Default::default(),
            benchmark_by_language: Default::default(),
            benchmark_by_difficulty: Default::default(),
            suite_comparisons: vec![],
            category_comparisons: vec![],
            language_comparisons: vec![],
            verification_health: None,
            warnings: vec![],
            threshold_failures: vec![],
            results: vec![],
        }
    }

    #[tokio::test]
    async fn update_feedback_eval_trend_appends_attention_entries() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("feedback-trend.json");

        // The first call creates the file; the second must append to it, not replace it.
        update_feedback_eval_trend(
            &sample_feedback_report(),
            Some(&sample_eval_report()),
            &path,
        )
        .await
        .unwrap();
        update_feedback_eval_trend(
            &sample_feedback_report(),
            Some(&sample_eval_report()),
            &path,
        )
        .await
        .unwrap();

        let content = tokio::fs::read_to_string(&path).await.unwrap();
        let trend = FeedbackEvalTrend::from_json(&content).unwrap();
        assert_eq!(trend.entries.len(), 2);
        assert_eq!(trend.entries[0].eval_label.as_deref(), Some("frontier-e2e"));
        assert_eq!(
            trend.entries[0].eval_model.as_deref(),
            Some("anthropic/claude-opus-4.5")
        );
        assert_eq!(trend.entries[0].attention_by_category.len(), 1);
        assert_eq!(trend.entries[0].attention_by_category[0].name, "Security");
        assert_eq!(trend.entries[0].attention_by_category[0].feedback_total, 4);
        assert_eq!(
            trend.entries[0].attention_by_category[0].high_confidence_total,
            3
        );
        assert_eq!(trend.entries[0].attention_by_rule.len(), 1);
        assert_eq!(
            trend.entries[0].attention_by_rule[0].name,
            "sec.sql.injection"
        );

        // The appended entry must survive the read-modify-write round trip as well.
        let appended = &trend.entries[1];
        assert!(!appended.timestamp.is_empty());
        assert_eq!(appended.labeled_comments, 8);
        assert_eq!(appended.accepted, 3);
        assert_eq!(appended.rejected, 5);
        assert_eq!(appended.eval_provider.as_deref(), Some("openrouter"));
        assert_eq!(appended.attention_by_category.len(), 1);
        assert_eq!(appended.attention_by_rule.len(), 1);
    }

    #[test]
    fn trend_entry_for_report_without_eval_data_leaves_eval_fields_empty() {
        let mut report = sample_feedback_report();
        report.eval_correlation = None;
        report.confidence_metrics = None;

        let entry = trend_entry_for_report(&report, None);

        assert_eq!(entry.labeled_comments, 8);
        assert!(entry.eval_label.is_none());
        assert!(entry.eval_model.is_none());
        assert!(entry.eval_provider.is_none());
        assert!(entry.confidence_agreement_rate.is_none());
        assert!(entry.confidence_f1.is_none());
        assert!(entry.attention_by_category.is_empty());
        assert!(entry.attention_by_rule.is_empty());
    }
}

src/commands/feedback_eval/types.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,5 +6,6 @@ mod report;
66
pub(super) use input::{FeedbackEvalComment, LoadedFeedbackEvalInput};
77
pub(super) use report::{
88
FeedbackEvalBucket, FeedbackEvalCategoryCorrelation, FeedbackEvalCorrelationReport,
9-
FeedbackEvalExample, FeedbackEvalReport, FeedbackEvalRuleCorrelation, FeedbackThresholdMetrics,
9+
FeedbackEvalExample, FeedbackEvalReport, FeedbackEvalRuleCorrelation, FeedbackEvalTrend,
10+
FeedbackEvalTrendEntry, FeedbackEvalTrendGap, FeedbackThresholdMetrics,
1011
};

src/commands/feedback_eval/types/report.rs

Lines changed: 70 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,76 @@ pub(in super::super) struct FeedbackEvalCorrelationReport {
7575
pub(in super::super) attention_by_rule: Vec<FeedbackEvalRuleCorrelation>,
7676
}
7777

78+
#[derive(Debug, Clone, Serialize, Deserialize)]
79+
pub(in super::super) struct FeedbackEvalTrendGap {
80+
#[serde(default)]
81+
pub(in super::super) name: String,
82+
#[serde(default)]
83+
pub(in super::super) feedback_total: usize,
84+
#[serde(default)]
85+
pub(in super::super) high_confidence_total: usize,
86+
#[serde(default)]
87+
pub(in super::super) high_confidence_acceptance_rate: f32,
88+
#[serde(default, skip_serializing_if = "Option::is_none")]
89+
pub(in super::super) eval_score: Option<f32>,
90+
#[serde(default, skip_serializing_if = "Option::is_none")]
91+
pub(in super::super) gap: Option<f32>,
92+
}
93+
94+
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
95+
pub(in super::super) struct FeedbackEvalTrendEntry {
96+
#[serde(default)]
97+
pub(in super::super) timestamp: String,
98+
#[serde(default)]
99+
pub(in super::super) labeled_comments: usize,
100+
#[serde(default)]
101+
pub(in super::super) accepted: usize,
102+
#[serde(default)]
103+
pub(in super::super) rejected: usize,
104+
#[serde(default)]
105+
pub(in super::super) acceptance_rate: f32,
106+
#[serde(default)]
107+
pub(in super::super) confidence_threshold: f32,
108+
#[serde(default, skip_serializing_if = "Option::is_none")]
109+
pub(in super::super) confidence_agreement_rate: Option<f32>,
110+
#[serde(default, skip_serializing_if = "Option::is_none")]
111+
pub(in super::super) confidence_precision: Option<f32>,
112+
#[serde(default, skip_serializing_if = "Option::is_none")]
113+
pub(in super::super) confidence_recall: Option<f32>,
114+
#[serde(default, skip_serializing_if = "Option::is_none")]
115+
pub(in super::super) confidence_f1: Option<f32>,
116+
#[serde(default, skip_serializing_if = "Option::is_none")]
117+
pub(in super::super) eval_label: Option<String>,
118+
#[serde(default, skip_serializing_if = "Option::is_none")]
119+
pub(in super::super) eval_model: Option<String>,
120+
#[serde(default, skip_serializing_if = "Option::is_none")]
121+
pub(in super::super) eval_provider: Option<String>,
122+
#[serde(default, skip_serializing_if = "Vec::is_empty")]
123+
pub(in super::super) attention_by_category: Vec<FeedbackEvalTrendGap>,
124+
#[serde(default, skip_serializing_if = "Vec::is_empty")]
125+
pub(in super::super) attention_by_rule: Vec<FeedbackEvalTrendGap>,
126+
}
127+
128+
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
129+
pub(in super::super) struct FeedbackEvalTrend {
130+
#[serde(default)]
131+
pub(in super::super) entries: Vec<FeedbackEvalTrendEntry>,
132+
}
133+
134+
impl FeedbackEvalTrend {
135+
pub(in super::super) fn new() -> Self {
136+
Self::default()
137+
}
138+
139+
pub(in super::super) fn to_json(&self) -> Result<String, serde_json::Error> {
140+
serde_json::to_string_pretty(self)
141+
}
142+
143+
pub(in super::super) fn from_json(json: &str) -> Result<Self, serde_json::Error> {
144+
serde_json::from_str(json)
145+
}
146+
}
147+
78148
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
79149
pub(in super::super) struct FeedbackThresholdMetrics {
80150
#[serde(default)]

0 commit comments

Comments
 (0)