Skip to content

Commit a4639ff

Browse files
committed
feat: expose review controls and feedback trends
Persist feedback-eval attention gaps into a reusable history file so calibration drift can be tracked over time alongside eval output. Expose the newer review pipeline controls in the web settings UI and align the model picker with the frontier defaults so the capabilities we added are actually configurable from the app. Made-with: Cursor
1 parent 92d6698 commit a4639ff

File tree

14 files changed

+513
-13
lines changed

14 files changed

+513
-13
lines changed

src/commands/feedback_eval/command.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use report::emit_feedback_eval_report;
1212
pub async fn feedback_eval_command(
1313
input_path: PathBuf,
1414
output_path: Option<PathBuf>,
15+
trend_path: Option<PathBuf>,
1516
confidence_threshold: f32,
1617
eval_report_path: Option<PathBuf>,
1718
) -> Result<()> {
@@ -23,6 +24,7 @@ pub async fn feedback_eval_command(
2324
emit_feedback_eval_report(
2425
&loaded,
2526
output_path.as_deref(),
27+
trend_path.as_deref(),
2628
confidence_threshold,
2729
eval_report.as_ref(),
2830
)

src/commands/feedback_eval/command/report.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@ use std::path::Path;
44
use crate::commands::eval::EvalReport;
55

66
use super::super::report::{
7-
build_feedback_eval_report, print_feedback_eval_report, write_feedback_eval_report,
7+
build_feedback_eval_report, print_feedback_eval_report, update_feedback_eval_trend,
8+
write_feedback_eval_report,
89
};
910
use super::super::LoadedFeedbackEvalInput;
1011

1112
pub(super) async fn emit_feedback_eval_report(
1213
loaded: &LoadedFeedbackEvalInput,
1314
output_path: Option<&Path>,
15+
trend_path: Option<&Path>,
1416
confidence_threshold: f32,
1517
eval_report: Option<&EvalReport>,
1618
) -> Result<()> {
@@ -21,6 +23,9 @@ pub(super) async fn emit_feedback_eval_report(
2123
if let Some(path) = output_path {
2224
write_feedback_eval_report(&report, path).await?;
2325
}
26+
if let Some(path) = trend_path {
27+
update_feedback_eval_trend(&report, eval_report, path).await?;
28+
}
2429

2530
Ok(())
2631
}

src/commands/feedback_eval/report.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ mod build;
44
mod examples;
55
#[path = "report/output.rs"]
66
mod output;
7+
#[path = "report/trend.rs"]
8+
mod trend;
79

810
pub(super) use build::build_feedback_eval_report;
911
pub(super) use output::{print_feedback_eval_report, write_feedback_eval_report};
12+
pub(super) use trend::update_feedback_eval_trend;
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
use anyhow::{Context, Result};
2+
use chrono::Utc;
3+
use std::path::Path;
4+
5+
use crate::commands::eval::EvalReport;
6+
7+
use super::super::types::{FeedbackEvalTrend, FeedbackEvalTrendEntry, FeedbackEvalTrendGap};
8+
use super::super::{FeedbackEvalReport, FeedbackEvalRuleCorrelation};
9+
10+
/// Upper bound on how many attention gaps are copied into a trend entry, applied
/// separately to the per-category list and the per-rule list.
const MAX_ATTENTION_GAPS: usize = 5;
11+
12+
pub(in super::super) async fn update_feedback_eval_trend(
13+
report: &FeedbackEvalReport,
14+
eval_report: Option<&EvalReport>,
15+
path: &Path,
16+
) -> Result<()> {
17+
let mut trend = if path.exists() {
18+
let content = tokio::fs::read_to_string(path)
19+
.await
20+
.with_context(|| format!("failed to read feedback trend file {}", path.display()))?;
21+
FeedbackEvalTrend::from_json(&content)
22+
.with_context(|| format!("failed to parse feedback trend file {}", path.display()))?
23+
} else {
24+
FeedbackEvalTrend::new()
25+
};
26+
trend
27+
.entries
28+
.push(trend_entry_for_report(report, eval_report));
29+
30+
if let Some(parent) = path.parent() {
31+
tokio::fs::create_dir_all(parent)
32+
.await
33+
.with_context(|| format!("failed to create {}", parent.display()))?;
34+
}
35+
tokio::fs::write(path, trend.to_json()?)
36+
.await
37+
.with_context(|| format!("failed to write feedback trend file {}", path.display()))?;
38+
Ok(())
39+
}
40+
41+
fn trend_entry_for_report(
42+
report: &FeedbackEvalReport,
43+
eval_report: Option<&EvalReport>,
44+
) -> FeedbackEvalTrendEntry {
45+
let confidence_metrics = report.confidence_metrics;
46+
let correlation = report.eval_correlation.as_ref();
47+
FeedbackEvalTrendEntry {
48+
timestamp: Utc::now().to_rfc3339(),
49+
labeled_comments: report.labeled_comments,
50+
accepted: report.accepted,
51+
rejected: report.rejected,
52+
acceptance_rate: report.acceptance_rate,
53+
confidence_threshold: report.confidence_threshold,
54+
confidence_agreement_rate: confidence_metrics.map(|metrics| metrics.agreement_rate),
55+
confidence_precision: confidence_metrics.map(|metrics| metrics.precision),
56+
confidence_recall: confidence_metrics.map(|metrics| metrics.recall),
57+
confidence_f1: confidence_metrics.map(|metrics| metrics.f1),
58+
eval_label: eval_report.and_then(|report| report.run.label.clone()),
59+
eval_model: eval_report.map(|report| report.run.model.clone()),
60+
eval_provider: eval_report.and_then(|report| report.run.provider.clone()),
61+
attention_by_category: correlation
62+
.map(|report| {
63+
report
64+
.attention_by_category
65+
.iter()
66+
.take(MAX_ATTENTION_GAPS)
67+
.map(|category| FeedbackEvalTrendGap {
68+
name: category.name.clone(),
69+
feedback_total: category.feedback_total,
70+
high_confidence_total: category.high_confidence_total,
71+
high_confidence_acceptance_rate: category.high_confidence_acceptance_rate,
72+
eval_score: category.eval_micro_f1,
73+
gap: category.high_confidence_vs_eval_gap,
74+
})
75+
.collect()
76+
})
77+
.unwrap_or_default(),
78+
attention_by_rule: correlation
79+
.map(|report| {
80+
report
81+
.attention_by_rule
82+
.iter()
83+
.take(MAX_ATTENTION_GAPS)
84+
.map(rule_gap)
85+
.collect()
86+
})
87+
.unwrap_or_default(),
88+
}
89+
}
90+
91+
fn rule_gap(rule: &FeedbackEvalRuleCorrelation) -> FeedbackEvalTrendGap {
92+
FeedbackEvalTrendGap {
93+
name: rule.rule_id.clone(),
94+
feedback_total: rule.feedback_total,
95+
high_confidence_total: rule.high_confidence_total,
96+
high_confidence_acceptance_rate: rule.high_confidence_acceptance_rate,
97+
eval_score: rule.eval_f1,
98+
gap: rule.high_confidence_vs_eval_gap,
99+
}
100+
}
101+
102+
#[cfg(test)]
103+
mod tests {
104+
use tempfile::tempdir;
105+
106+
use crate::commands::eval::EvalRunMetadata;
107+
use crate::commands::feedback_eval::{
108+
FeedbackEvalBucket, FeedbackEvalCategoryCorrelation, FeedbackEvalCorrelationReport,
109+
FeedbackEvalReport, FeedbackEvalRuleCorrelation, FeedbackThresholdMetrics,
110+
};
111+
112+
use super::*;
113+
114+
fn sample_feedback_report() -> FeedbackEvalReport {
115+
FeedbackEvalReport {
116+
total_comments_seen: 12,
117+
total_reviews_seen: 3,
118+
labeled_comments: 8,
119+
labeled_reviews: 2,
120+
accepted: 3,
121+
rejected: 5,
122+
acceptance_rate: 0.375,
123+
confidence_threshold: 0.75,
124+
vague_comments: FeedbackEvalBucket {
125+
name: "vague".to_string(),
126+
total: 1,
127+
accepted: 0,
128+
rejected: 1,
129+
acceptance_rate: 0.0,
130+
},
131+
confidence_metrics: Some(FeedbackThresholdMetrics {
132+
total_scored: 6,
133+
true_positive: 2,
134+
false_positive: 1,
135+
true_negative: 2,
136+
false_negative: 1,
137+
precision: 0.67,
138+
recall: 0.67,
139+
f1: 0.67,
140+
agreement_rate: 0.67,
141+
}),
142+
by_category: vec![],
143+
by_rule: vec![],
144+
high_confidence_by_category: vec![],
145+
high_confidence_by_rule: vec![],
146+
by_severity: vec![],
147+
by_repo: vec![],
148+
by_file_pattern: vec![],
149+
eval_correlation: Some(FeedbackEvalCorrelationReport {
150+
by_category: vec![],
151+
by_rule: vec![],
152+
attention_by_category: vec![FeedbackEvalCategoryCorrelation {
153+
name: "Security".to_string(),
154+
feedback_total: 4,
155+
feedback_acceptance_rate: 0.25,
156+
high_confidence_total: 3,
157+
high_confidence_acceptance_rate: 0.0,
158+
eval_fixture_count: Some(5),
159+
eval_micro_f1: Some(0.9),
160+
eval_weighted_score: Some(0.91),
161+
feedback_vs_eval_gap: Some(0.65),
162+
high_confidence_vs_eval_gap: Some(0.9),
163+
}],
164+
attention_by_rule: vec![FeedbackEvalRuleCorrelation {
165+
rule_id: "sec.sql.injection".to_string(),
166+
feedback_total: 3,
167+
feedback_acceptance_rate: 0.33,
168+
high_confidence_total: 2,
169+
high_confidence_acceptance_rate: 0.0,
170+
eval_precision: Some(1.0),
171+
eval_recall: Some(1.0),
172+
eval_f1: Some(1.0),
173+
feedback_vs_eval_gap: Some(0.67),
174+
high_confidence_vs_eval_gap: Some(1.0),
175+
}],
176+
}),
177+
showcase_candidates: vec![],
178+
vague_rejections: vec![],
179+
}
180+
}
181+
182+
fn sample_eval_report() -> EvalReport {
183+
EvalReport {
184+
run: EvalRunMetadata {
185+
label: Some("frontier-e2e".to_string()),
186+
model: "anthropic/claude-opus-4.5".to_string(),
187+
provider: Some("openrouter".to_string()),
188+
..Default::default()
189+
},
190+
fixtures_total: 0,
191+
fixtures_passed: 0,
192+
fixtures_failed: 0,
193+
rule_metrics: vec![],
194+
rule_summary: None,
195+
benchmark_summary: None,
196+
suite_results: vec![],
197+
benchmark_by_category: Default::default(),
198+
benchmark_by_language: Default::default(),
199+
benchmark_by_difficulty: Default::default(),
200+
suite_comparisons: vec![],
201+
category_comparisons: vec![],
202+
language_comparisons: vec![],
203+
verification_health: None,
204+
warnings: vec![],
205+
threshold_failures: vec![],
206+
results: vec![],
207+
}
208+
}
209+
210+
#[tokio::test]
211+
async fn update_feedback_eval_trend_appends_attention_entries() {
212+
let dir = tempdir().unwrap();
213+
let path = dir.path().join("feedback-trend.json");
214+
215+
update_feedback_eval_trend(
216+
&sample_feedback_report(),
217+
Some(&sample_eval_report()),
218+
&path,
219+
)
220+
.await
221+
.unwrap();
222+
update_feedback_eval_trend(
223+
&sample_feedback_report(),
224+
Some(&sample_eval_report()),
225+
&path,
226+
)
227+
.await
228+
.unwrap();
229+
230+
let content = tokio::fs::read_to_string(&path).await.unwrap();
231+
let trend = FeedbackEvalTrend::from_json(&content).unwrap();
232+
assert_eq!(trend.entries.len(), 2);
233+
assert_eq!(trend.entries[0].eval_label.as_deref(), Some("frontier-e2e"));
234+
assert_eq!(
235+
trend.entries[0].eval_model.as_deref(),
236+
Some("anthropic/claude-opus-4.5")
237+
);
238+
assert_eq!(trend.entries[0].attention_by_category.len(), 1);
239+
assert_eq!(trend.entries[0].attention_by_category[0].name, "Security");
240+
assert_eq!(
241+
trend.entries[0].attention_by_rule[0].name,
242+
"sec.sql.injection"
243+
);
244+
}
245+
}

src/commands/feedback_eval/types.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,6 @@ mod report;
66
pub(super) use input::{FeedbackEvalComment, LoadedFeedbackEvalInput};
77
pub(super) use report::{
88
FeedbackEvalBucket, FeedbackEvalCategoryCorrelation, FeedbackEvalCorrelationReport,
9-
FeedbackEvalExample, FeedbackEvalReport, FeedbackEvalRuleCorrelation, FeedbackThresholdMetrics,
9+
FeedbackEvalExample, FeedbackEvalReport, FeedbackEvalRuleCorrelation, FeedbackEvalTrend,
10+
FeedbackEvalTrendEntry, FeedbackEvalTrendGap, FeedbackThresholdMetrics,
1011
};

src/commands/feedback_eval/types/report.rs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,76 @@ pub(in super::super) struct FeedbackEvalCorrelationReport {
7575
pub(in super::super) attention_by_rule: Vec<FeedbackEvalRuleCorrelation>,
7676
}
7777

78+
#[derive(Debug, Clone, Serialize, Deserialize)]
79+
pub(in super::super) struct FeedbackEvalTrendGap {
80+
#[serde(default)]
81+
pub(in super::super) name: String,
82+
#[serde(default)]
83+
pub(in super::super) feedback_total: usize,
84+
#[serde(default)]
85+
pub(in super::super) high_confidence_total: usize,
86+
#[serde(default)]
87+
pub(in super::super) high_confidence_acceptance_rate: f32,
88+
#[serde(default, skip_serializing_if = "Option::is_none")]
89+
pub(in super::super) eval_score: Option<f32>,
90+
#[serde(default, skip_serializing_if = "Option::is_none")]
91+
pub(in super::super) gap: Option<f32>,
92+
}
93+
94+
/// A single record in the feedback-eval trend history.
///
/// Every field is `#[serde(default)]`, so keys missing from the JSON deserialize
/// to their default value. Optional fields are left out of the serialized
/// output when `None`, and the attention lists are left out when empty.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
95+
pub(in super::super) struct FeedbackEvalTrendEntry {
96+
/// Stored as a plain string; no format is enforced by this type.
#[serde(default)]
97+
pub(in super::super) timestamp: String,
98+
#[serde(default)]
99+
pub(in super::super) labeled_comments: usize,
100+
#[serde(default)]
101+
pub(in super::super) accepted: usize,
102+
#[serde(default)]
103+
pub(in super::super) rejected: usize,
104+
#[serde(default)]
105+
pub(in super::super) acceptance_rate: f32,
106+
#[serde(default)]
107+
pub(in super::super) confidence_threshold: f32,
108+
// Confidence metrics: each is optional and skipped in the JSON when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
109+
pub(in super::super) confidence_agreement_rate: Option<f32>,
110+
#[serde(default, skip_serializing_if = "Option::is_none")]
111+
pub(in super::super) confidence_precision: Option<f32>,
112+
#[serde(default, skip_serializing_if = "Option::is_none")]
113+
pub(in super::super) confidence_recall: Option<f32>,
114+
#[serde(default, skip_serializing_if = "Option::is_none")]
115+
pub(in super::super) confidence_f1: Option<f32>,
116+
// Eval run identification: optional and skipped in the JSON when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
117+
pub(in super::super) eval_label: Option<String>,
118+
#[serde(default, skip_serializing_if = "Option::is_none")]
119+
pub(in super::super) eval_model: Option<String>,
120+
#[serde(default, skip_serializing_if = "Option::is_none")]
121+
pub(in super::super) eval_provider: Option<String>,
122+
// Attention gaps: skipped in the JSON when the list is empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
123+
pub(in super::super) attention_by_category: Vec<FeedbackEvalTrendGap>,
124+
#[serde(default, skip_serializing_if = "Vec::is_empty")]
125+
pub(in super::super) attention_by_rule: Vec<FeedbackEvalTrendGap>,
126+
}
127+
128+
/// The feedback-eval trend history: a list of entries serialized as one JSON
/// document. A document without an `entries` key deserializes to an empty history.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
129+
pub(in super::super) struct FeedbackEvalTrend {
130+
#[serde(default)]
131+
pub(in super::super) entries: Vec<FeedbackEvalTrendEntry>,
132+
}
133+
134+
impl FeedbackEvalTrend {
135+
/// Creates an empty history (same as `Default::default()`).
pub(in super::super) fn new() -> Self {
136+
Self::default()
137+
}
138+
139+
/// Serializes the history as pretty-printed JSON.
pub(in super::super) fn to_json(&self) -> Result<String, serde_json::Error> {
140+
serde_json::to_string_pretty(self)
141+
}
142+
143+
/// Parses a history from a JSON string, returning the `serde_json` error on failure.
pub(in super::super) fn from_json(json: &str) -> Result<Self, serde_json::Error> {
144+
serde_json::from_str(json)
145+
}
146+
}
147+
78148
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
79149
pub(in super::super) struct FeedbackThresholdMetrics {
80150
#[serde(default)]

0 commit comments

Comments
 (0)