Skip to content

Commit f247187

Browse files
committed
feat(analytics): measure feedback learning lift
1 parent 1cef489 commit f247187

File tree

6 files changed

+335
-2
lines changed

6 files changed

+335
-2
lines changed

TODO.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
105105
63. [x] Add top accepted categories/rules and top rejected categories/rules to Analytics.
106106
64. [x] Add unresolved blocker counts per repository and per PR.
107107
65. [x] Add review completeness and mean-time-to-resolution charts.
108-
66. [ ] Add feedback-learning effectiveness metrics: did reranked findings get higher acceptance after rollout?
108+
66. [x] Add feedback-learning effectiveness metrics: did reranked findings get higher acceptance after rollout?
109109
67. [ ] Add pattern-repository utilization analytics showing when extra context actually affected findings.
110110
68. [x] Add eval-vs-production dashboards comparing benchmark strength against real-world acceptance.
111111
69. [x] Add drill-downs from trend charts directly into the affected reviews, findings, and rules.

src/review/filters.rs

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -369,6 +369,7 @@ mod tests {
369369
)];
370370
let result = apply_feedback_confidence_adjustment(comments, &feedback, 5);
371371
assert_eq!(result[0].confidence, 0.8);
372+
assert!(result[0].tags.is_empty());
372373
}
373374

374375
#[test]
@@ -409,6 +410,10 @@ mod tests {
409410
"Got: {}",
410411
result[0].confidence
411412
);
413+
assert!(result[0].tags.contains(&"feedback-calibration".to_string()));
414+
assert!(result[0]
415+
.tags
416+
.contains(&"feedback-calibration:demoted".to_string()));
412417
}
413418

414419
#[test]
@@ -431,6 +436,10 @@ mod tests {
431436
"Got: {}",
432437
result[0].confidence
433438
);
439+
assert!(result[0].tags.contains(&"feedback-calibration".to_string()));
440+
assert!(result[0]
441+
.tags
442+
.contains(&"feedback-calibration:boosted".to_string()));
434443
}
435444

436445
#[test]
@@ -573,5 +582,9 @@ mod tests {
573582
"Expected exact accepted comment ids to get a boost, got {}",
574583
result[0].confidence
575584
);
585+
assert!(result[0].tags.contains(&"feedback-calibration".to_string()));
586+
assert!(result[0]
587+
.tags
588+
.contains(&"feedback-calibration:accepted-id".to_string()));
576589
}
577590
}

src/review/filters/confidence.rs

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,12 +43,20 @@ pub fn apply_feedback_confidence_adjustment(
4343
.map(|mut comment| {
4444
if feedback.accept.contains(&comment.id) {
4545
comment.confidence = (comment.confidence * 1.15).clamp(0.0, 1.0);
46+
push_feedback_calibration_tag(&mut comment, "feedback-calibration:accepted-id");
4647
}
4748
if let Some(stats) = lookup_feedback_confidence_stats(&comment, feedback) {
4849
if stats.total() >= min_observations {
4950
let rate = stats.acceptance_rate();
5051
let adjustment = 0.75 + rate * 0.5;
52+
let previous_confidence = comment.confidence;
5153
comment.confidence = (comment.confidence * adjustment).clamp(0.0, 1.0);
54+
55+
if comment.confidence > previous_confidence {
56+
push_feedback_calibration_tag(&mut comment, "feedback-calibration:boosted");
57+
} else if comment.confidence < previous_confidence {
58+
push_feedback_calibration_tag(&mut comment, "feedback-calibration:demoted");
59+
}
5260
}
5361
}
5462

@@ -57,6 +65,17 @@ pub fn apply_feedback_confidence_adjustment(
5765
.collect()
5866
}
5967

68+
/// Marks `comment` as touched by feedback calibration.
///
/// Adds the generic `"feedback-calibration"` tag first and then the more
/// specific `tag` supplied by the caller. Both are added through
/// `push_feedback_tag`, which skips a tag that is already present, so calling
/// this several times for one comment does not duplicate the generic marker.
fn push_feedback_calibration_tag(comment: &mut core::Comment, tag: &str) {
69+
push_feedback_tag(comment, "feedback-calibration");
70+
push_feedback_tag(comment, tag);
71+
}
72+
73+
/// Appends `tag` to `comment.tags` unless an equal tag is already in the list.
///
/// The membership test is a linear scan over the existing tags; the tag is
/// copied into an owned `String` only when it actually has to be stored.
fn push_feedback_tag(comment: &mut core::Comment, tag: &str) {
74+
if !comment.tags.iter().any(|existing| existing == tag) {
75+
comment.tags.push(tag.to_string());
76+
}
77+
}
78+
6079
fn lookup_feedback_confidence_stats<'a>(
6180
comment: &core::Comment,
6281
feedback: &'a FeedbackStore,

web/src/lib/analytics.ts

Lines changed: 155 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,39 @@ import type {
55
Severity,
66
} from '../api/types'
77

8+
type ReviewComment = ReviewSession['comments'][number]
9+
10+
// Exact tag values that hasFeedbackLearningAcceptTag() matches against.
// NOTE(review): the two 'feedback-calibration:*' values match the strings pushed
// by the Rust confidence filter in this commit; the producer of
// 'semantic-feedback:accepted' is not visible here — confirm it exists.
const FEEDBACK_LEARNING_ACCEPT_TAGS = new Set([
11+
'feedback-calibration:accepted-id',
12+
'feedback-calibration:boosted',
13+
'semantic-feedback:accepted',
14+
])
15+
16+
// Exact tag values that hasFeedbackLearningRejectTag() matches against.
// NOTE(review): 'feedback-calibration:demoted' matches the string pushed by the
// Rust confidence filter in this commit; the producer of
// 'semantic-feedback:rejected' is not visible here — confirm it exists.
const FEEDBACK_LEARNING_REJECT_TAGS = new Set([
17+
'feedback-calibration:demoted',
18+
'semantic-feedback:rejected',
19+
])
20+
21+
/**
 * Returns true when the comment carries an explicit verdict, i.e. its
 * `feedback` field is exactly 'accept' or 'reject'. Any other value
 * (including a missing one) yields false.
 */
function isLabeledFeedbackComment(comment: ReviewComment): boolean {
22+
return comment.feedback === 'accept' || comment.feedback === 'reject'
23+
}
24+
25+
/**
 * Returns true when at least one of the comment's tags is the bare
 * 'feedback-calibration' marker or starts with the 'feedback-calibration:' or
 * 'semantic-feedback:' prefix.
 *
 * The prefix match is broader than the accept/reject tag sets: any suffix
 * after the colon counts here.
 * NOTE(review): assumes `comment.tags` is always an array — confirm against
 * the ReviewSession type, since `.some` would throw on undefined.
 */
function isFeedbackLearningComment(comment: ReviewComment): boolean {
26+
return comment.tags.some(tag => (
27+
tag === 'feedback-calibration'
28+
|| tag.startsWith('feedback-calibration:')
29+
|| tag.startsWith('semantic-feedback:')
30+
))
31+
}
32+
33+
/**
 * Returns true when at least one of the comment's tags is an exact member of
 * FEEDBACK_LEARNING_ACCEPT_TAGS.
 */
function hasFeedbackLearningAcceptTag(comment: ReviewComment): boolean {
34+
return comment.tags.some(tag => FEEDBACK_LEARNING_ACCEPT_TAGS.has(tag))
35+
}
36+
37+
/**
 * Returns true when at least one of the comment's tags is an exact member of
 * FEEDBACK_LEARNING_REJECT_TAGS.
 */
function hasFeedbackLearningRejectTag(comment: ReviewComment): boolean {
38+
return comment.tags.some(tag => FEEDBACK_LEARNING_REJECT_TAGS.has(tag))
39+
}
40+
841
export function computeAnalytics(reviews: ReviewSession[]) {
942
const completed = getCompletedReviews(reviews)
1043

@@ -88,6 +121,37 @@ export function computeAnalytics(reviews: ReviewSession[]) {
88121
}
89122
})
90123

124+
const feedbackLearningSeries = completed.map((r, i) => {
125+
const labeledComments = r.comments.filter(isLabeledFeedbackComment)
126+
const tunedComments = labeledComments.filter(isFeedbackLearningComment)
127+
const baselineComments = labeledComments.filter(comment => !isFeedbackLearningComment(comment))
128+
const tunedAccepted = tunedComments.filter(comment => comment.feedback === 'accept').length
129+
const tunedRejected = tunedComments.filter(comment => comment.feedback === 'reject').length
130+
const baselineAccepted = baselineComments.filter(comment => comment.feedback === 'accept').length
131+
132+
return {
133+
reviewId: r.id,
134+
idx: i + 1,
135+
label: `#${i + 1}`,
136+
tunedLabeled: tunedComments.length,
137+
tunedAccepted,
138+
tunedRejected,
139+
baselineLabeled: baselineComments.length,
140+
baselineAccepted,
141+
tunedAcceptanceRate: tunedComments.length > 0 ? tunedAccepted / tunedComments.length : null,
142+
baselineAcceptanceRate: baselineComments.length > 0 ? baselineAccepted / baselineComments.length : null,
143+
acceptanceLift: tunedComments.length > 0 && baselineComments.length > 0
144+
? (tunedAccepted / tunedComments.length) - (baselineAccepted / baselineComments.length)
145+
: null,
146+
boostedAccepted: tunedComments.filter(comment => (
147+
comment.feedback === 'accept' && hasFeedbackLearningAcceptTag(comment)
148+
)).length,
149+
demotedRejected: tunedComments.filter(comment => (
150+
comment.feedback === 'reject' && hasFeedbackLearningRejectTag(comment)
151+
)).length,
152+
}
153+
})
154+
91155
const feedbackCategoryData = Object.entries(feedbackTotalsByCategory)
92156
.map(([name, totals]) => {
93157
const total = totals.accepted + totals.rejected
@@ -226,8 +290,49 @@ export function computeAnalytics(reviews: ReviewSession[]) {
226290
const labeledFeedbackTotal = feedbackCoverageSeries.reduce((sum, point) => sum + point.labeled, 0)
227291
const acceptedFeedbackTotal = feedbackCoverageSeries.reduce((sum, point) => sum + point.accepted, 0)
228292
const rejectedFeedbackTotal = feedbackCoverageSeries.reduce((sum, point) => sum + point.rejected, 0)
293+
const feedbackLearningLabeledTotal = feedbackLearningSeries.reduce(
294+
(sum, point) => sum + point.tunedLabeled,
295+
0,
296+
)
297+
const feedbackLearningAcceptedTotal = feedbackLearningSeries.reduce(
298+
(sum, point) => sum + point.tunedAccepted,
299+
0,
300+
)
301+
const feedbackLearningRejectedTotal = feedbackLearningSeries.reduce(
302+
(sum, point) => sum + point.tunedRejected,
303+
0,
304+
)
305+
const feedbackLearningBaselineLabeledTotal = feedbackLearningSeries.reduce(
306+
(sum, point) => sum + point.baselineLabeled,
307+
0,
308+
)
309+
const feedbackLearningBaselineAcceptedTotal = feedbackLearningSeries.reduce(
310+
(sum, point) => sum + point.baselineAccepted,
311+
0,
312+
)
313+
const feedbackLearningBoostedAcceptedTotal = feedbackLearningSeries.reduce(
314+
(sum, point) => sum + point.boostedAccepted,
315+
0,
316+
)
317+
const feedbackLearningDemotedRejectedTotal = feedbackLearningSeries.reduce(
318+
(sum, point) => sum + point.demotedRejected,
319+
0,
320+
)
229321
const totalCommentCount = completed.reduce((sum, r) => sum + r.comments.length, 0)
230322
const reviewsWithFeedback = feedbackCoverageSeries.filter(point => point.labeled > 0).length
323+
const feedbackLearningReviewCount = feedbackLearningSeries.filter(
324+
point => point.tunedLabeled > 0,
325+
).length
326+
const feedbackLearningAcceptanceRate = feedbackLearningLabeledTotal > 0
327+
? feedbackLearningAcceptedTotal / feedbackLearningLabeledTotal
328+
: 0
329+
const feedbackLearningBaselineAcceptanceRate = feedbackLearningBaselineLabeledTotal > 0
330+
? feedbackLearningBaselineAcceptedTotal / feedbackLearningBaselineLabeledTotal
331+
: null
332+
const feedbackLearningAcceptanceLift = feedbackLearningBaselineAcceptanceRate != null
333+
&& feedbackLearningLabeledTotal > 0
334+
? feedbackLearningAcceptanceRate - feedbackLearningBaselineAcceptanceRate
335+
: null
231336

232337
const sevTotals: Record<Severity, number> = { Error: 0, Warning: 0, Info: 0, Suggestion: 0 }
233338
for (const r of completed) {
@@ -247,6 +352,7 @@ export function computeAnalytics(reviews: ReviewSession[]) {
247352
completenessSeries,
248353
meanTimeToResolutionSeries,
249354
feedbackCoverageSeries,
355+
feedbackLearningSeries,
250356
topAcceptedCategories,
251357
topRejectedCategories,
252358
topAcceptedRules,
@@ -281,6 +387,16 @@ export function computeAnalytics(reviews: ReviewSession[]) {
281387
feedbackCoverageRate: totalCommentCount > 0 ? labeledFeedbackTotal / totalCommentCount : 0,
282388
feedbackAcceptanceRate: labeledFeedbackTotal > 0 ? acceptedFeedbackTotal / labeledFeedbackTotal : 0,
283389
reviewsWithFeedback,
390+
feedbackLearningLabeledTotal,
391+
feedbackLearningAcceptedTotal,
392+
feedbackLearningRejectedTotal,
393+
feedbackLearningReviewCount,
394+
feedbackLearningAcceptanceRate,
395+
feedbackLearningBaselineLabeledTotal,
396+
feedbackLearningBaselineAcceptanceRate,
397+
feedbackLearningAcceptanceLift,
398+
feedbackLearningBoostedAcceptedTotal,
399+
feedbackLearningDemotedRejectedTotal,
284400
},
285401
}
286402
}
@@ -525,12 +641,23 @@ export interface AnalyticsExportReport {
525641
feedbackCoverageRate: number
526642
feedbackAcceptanceRate: number
527643
reviewsWithFeedback: number
644+
feedbackLearningLabeledTotal: number
645+
feedbackLearningAcceptedTotal: number
646+
feedbackLearningRejectedTotal: number
647+
feedbackLearningReviewCount: number
648+
feedbackLearningAcceptanceRate: number
649+
feedbackLearningBaselineLabeledTotal: number
650+
feedbackLearningBaselineAcceptanceRate?: number
651+
feedbackLearningAcceptanceLift?: number
652+
feedbackLearningBoostedAcceptedTotal: number
653+
feedbackLearningDemotedRejectedTotal: number
528654
latestMicroF1?: number
529655
latestWeightedScore?: number
530656
latestAcceptanceRate?: number
531657
latestConfidenceF1?: number
532658
}
533659
coverageByReview: AnalyticsSnapshot['feedbackCoverageSeries']
660+
feedbackLearningByReview: AnalyticsSnapshot['feedbackLearningSeries']
534661
topAcceptedCategories: AnalyticsSnapshot['topAcceptedCategories']
535662
topRejectedCategories: AnalyticsSnapshot['topRejectedCategories']
536663
topAcceptedRules: AnalyticsSnapshot['topAcceptedRules']
@@ -644,12 +771,23 @@ export function buildAnalyticsExportReport(
644771
feedbackCoverageRate: analytics.stats.feedbackCoverageRate,
645772
feedbackAcceptanceRate: analytics.stats.feedbackAcceptanceRate,
646773
reviewsWithFeedback: analytics.stats.reviewsWithFeedback,
774+
feedbackLearningLabeledTotal: analytics.stats.feedbackLearningLabeledTotal,
775+
feedbackLearningAcceptedTotal: analytics.stats.feedbackLearningAcceptedTotal,
776+
feedbackLearningRejectedTotal: analytics.stats.feedbackLearningRejectedTotal,
777+
feedbackLearningReviewCount: analytics.stats.feedbackLearningReviewCount,
778+
feedbackLearningAcceptanceRate: analytics.stats.feedbackLearningAcceptanceRate,
779+
feedbackLearningBaselineLabeledTotal: analytics.stats.feedbackLearningBaselineLabeledTotal,
780+
feedbackLearningBaselineAcceptanceRate: analytics.stats.feedbackLearningBaselineAcceptanceRate ?? undefined,
781+
feedbackLearningAcceptanceLift: analytics.stats.feedbackLearningAcceptanceLift ?? undefined,
782+
feedbackLearningBoostedAcceptedTotal: analytics.stats.feedbackLearningBoostedAcceptedTotal,
783+
feedbackLearningDemotedRejectedTotal: analytics.stats.feedbackLearningDemotedRejectedTotal,
647784
latestMicroF1: trendAnalytics.latestEval?.micro_f1,
648785
latestWeightedScore: trendAnalytics.latestEval?.weighted_score,
649786
latestAcceptanceRate: trendAnalytics.latestFeedback?.acceptance_rate,
650787
latestConfidenceF1: trendAnalytics.latestFeedback?.confidence_f1,
651788
},
652789
coverageByReview: analytics.feedbackCoverageSeries,
790+
feedbackLearningByReview: analytics.feedbackLearningSeries,
653791
topAcceptedCategories: analytics.topAcceptedCategories,
654792
topRejectedCategories: analytics.topRejectedCategories,
655793
topAcceptedRules: analytics.topAcceptedRules,
@@ -718,6 +856,23 @@ export function buildAnalyticsCsv(report: AnalyticsExportReport): string {
718856
rows.push({ report: 'reinforcement', group: 'coverage_by_review', label: point.label, metric: 'rejected', value: point.rejected })
719857
rows.push({ report: 'reinforcement', group: 'coverage_by_review', label: point.label, metric: 'total_comments', value: point.totalComments })
720858
})
859+
report.reinforcement.feedbackLearningByReview.forEach(point => {
860+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'tuned_labeled', value: point.tunedLabeled })
861+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'tuned_accepted', value: point.tunedAccepted })
862+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'tuned_rejected', value: point.tunedRejected })
863+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'baseline_labeled', value: point.baselineLabeled })
864+
if (point.tunedAcceptanceRate != null) {
865+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'tuned_acceptance_rate', value: point.tunedAcceptanceRate })
866+
}
867+
if (point.baselineAcceptanceRate != null) {
868+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'baseline_acceptance_rate', value: point.baselineAcceptanceRate })
869+
}
870+
if (point.acceptanceLift != null) {
871+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'acceptance_lift', value: point.acceptanceLift })
872+
}
873+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'boosted_accepted', value: point.boostedAccepted })
874+
rows.push({ report: 'reinforcement', group: 'feedback_learning_by_review', label: point.label, metric: 'demoted_rejected', value: point.demotedRejected })
875+
})
721876
appendFeedbackBreakdownRows(rows, 'top_accepted_categories', report.reinforcement.topAcceptedCategories)
722877
appendFeedbackBreakdownRows(rows, 'top_rejected_categories', report.reinforcement.topRejectedCategories)
723878
appendFeedbackBreakdownRows(rows, 'top_accepted_rules', report.reinforcement.topAcceptedRules)

0 commit comments

Comments
 (0)