Skip to content

Commit a621c05

Browse files
committed
Add eval harness: regression-test review prompt quality
From promptfoo/promptfoo pattern: - EvalSuite with EvalCase definitions (diff + expectations + denials) - RunEval() executes reviews and checks findings against expectations - EvalExpectation: concern, min severity, message substring, file matching - EvalDenial: false positive detection (findings that should NOT appear) - LoadEvalSuite() from JSON files for CI integration - EvalSummary() for pass/fail counts and success rate - Case-insensitive matching, all fields optional - 25 tests covering all matching modes and edge cases
1 parent 464edff commit a621c05

2 files changed

Lines changed: 817 additions & 0 deletions

File tree

eval.go

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
package sight
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"os"
8+
"strings"
9+
)
10+
11+
// EvalCase defines a single test case for evaluating review quality.
12+
type EvalCase struct {
13+
Name string `json:"name"`
14+
Diff string `json:"diff"`
15+
ExpectFindings []EvalExpectation `json:"expect_findings"`
16+
DenyFindings []EvalDenial `json:"deny_findings"`
17+
}
18+
19+
// EvalExpectation describes a finding the reviewer is expected to produce.
// Every field is optional: an empty field places no constraint on the match.
type EvalExpectation struct {
	// Concern, when set, must equal the finding's concern (case-insensitive).
	Concern string `json:"concern,omitempty"`
	// MinSeverity, when set, is parsed with ParseSeverity; the finding's
	// severity must be at least that level.
	MinSeverity string `json:"min_severity,omitempty"`
	// MessageContains, when set, must occur somewhere in the finding's
	// message (case-insensitive substring).
	MessageContains string `json:"message_contains,omitempty"`
	// File, when set, must equal the finding's file (case-insensitive).
	File string `json:"file,omitempty"`
}
26+
27+
// EvalDenial describes a finding the reviewer must NOT report; it is used to
// catch false positives. Both fields are optional, and an empty field places
// no constraint on the match.
type EvalDenial struct {
	// MessageContains, when set, is looked for in the finding's message
	// (case-insensitive substring).
	MessageContains string `json:"message_contains,omitempty"`
	// Concern, when set, is compared to the finding's concern
	// (case-insensitive).
	Concern string `json:"concern,omitempty"`
}
32+
33+
// EvalResult is the outcome of running one eval case.
34+
type EvalResult struct {
35+
Case string `json:"case"`
36+
Passed bool `json:"passed"`
37+
Failures []string `json:"failures,omitempty"`
38+
Findings []Finding `json:"findings"`
39+
}
40+
41+
// EvalSuite is a collection of eval cases.
42+
type EvalSuite struct {
43+
Cases []EvalCase `json:"cases"`
44+
}
45+
46+
// RunEval executes an evaluation suite against the reviewer with the given options.
47+
// For each case it runs Review() on the diff, then checks expectations and denials.
48+
func RunEval(ctx context.Context, suite *EvalSuite, opts ...Option) ([]EvalResult, error) {
49+
if suite == nil || len(suite.Cases) == 0 {
50+
return nil, nil
51+
}
52+
53+
results := make([]EvalResult, 0, len(suite.Cases))
54+
55+
for _, ec := range suite.Cases {
56+
if ctx.Err() != nil {
57+
return results, ctx.Err()
58+
}
59+
60+
er := EvalResult{Case: ec.Name}
61+
62+
reviewResult, err := Review(ctx, ec.Diff, opts...)
63+
if err != nil {
64+
er.Failures = append(er.Failures, fmt.Sprintf("review error: %v", err))
65+
results = append(results, er)
66+
continue
67+
}
68+
69+
er.Findings = reviewResult.Findings
70+
71+
// Check expectations: each must be matched by at least one finding.
72+
for i, exp := range ec.ExpectFindings {
73+
if !matchExpectation(exp, reviewResult.Findings) {
74+
er.Failures = append(er.Failures,
75+
fmt.Sprintf("expect_findings[%d]: no finding matched %s", i, describeExpectation(exp)))
76+
}
77+
}
78+
79+
// Check denials: none should be matched by any finding.
80+
for i, deny := range ec.DenyFindings {
81+
if matchDenial(deny, reviewResult.Findings) {
82+
er.Failures = append(er.Failures,
83+
fmt.Sprintf("deny_findings[%d]: found unexpected match for %s", i, describeDenial(deny)))
84+
}
85+
}
86+
87+
er.Passed = len(er.Failures) == 0
88+
results = append(results, er)
89+
}
90+
91+
return results, nil
92+
}
93+
94+
// LoadEvalSuite loads eval cases from a JSON file.
95+
func LoadEvalSuite(path string) (*EvalSuite, error) {
96+
data, err := os.ReadFile(path)
97+
if err != nil {
98+
return nil, fmt.Errorf("loading eval suite: %w", err)
99+
}
100+
return ParseEvalSuite(data)
101+
}
102+
103+
// ParseEvalSuite parses eval cases from JSON bytes.
104+
func ParseEvalSuite(data []byte) (*EvalSuite, error) {
105+
var suite EvalSuite
106+
if err := json.Unmarshal(data, &suite); err != nil {
107+
return nil, fmt.Errorf("parsing eval suite: %w", err)
108+
}
109+
return &suite, nil
110+
}
111+
112+
// EvalSummary returns pass/fail counts and overall success rate.
113+
func EvalSummary(results []EvalResult) (passed, failed int, rate float64) {
114+
for _, r := range results {
115+
if r.Passed {
116+
passed++
117+
} else {
118+
failed++
119+
}
120+
}
121+
total := passed + failed
122+
if total > 0 {
123+
rate = float64(passed) / float64(total)
124+
}
125+
return
126+
}
127+
128+
// matchExpectation returns true if at least one finding matches all non-empty
129+
// fields in the expectation.
130+
func matchExpectation(exp EvalExpectation, findings []Finding) bool {
131+
for _, f := range findings {
132+
if matchesSingleExpectation(exp, f) {
133+
return true
134+
}
135+
}
136+
return false
137+
}
138+
139+
func matchesSingleExpectation(exp EvalExpectation, f Finding) bool {
140+
if exp.Concern != "" && !strings.EqualFold(f.Concern, exp.Concern) {
141+
return false
142+
}
143+
if exp.MinSeverity != "" {
144+
minSev := ParseSeverity(exp.MinSeverity)
145+
if !f.Severity.AtLeast(minSev) {
146+
return false
147+
}
148+
}
149+
if exp.MessageContains != "" && !strings.Contains(
150+
strings.ToLower(f.Message), strings.ToLower(exp.MessageContains)) {
151+
return false
152+
}
153+
if exp.File != "" && !strings.EqualFold(f.File, exp.File) {
154+
return false
155+
}
156+
return true
157+
}
158+
159+
// matchDenial returns true if any finding matches all non-empty fields in the denial.
160+
func matchDenial(deny EvalDenial, findings []Finding) bool {
161+
for _, f := range findings {
162+
if matchesSingleDenial(deny, f) {
163+
return true
164+
}
165+
}
166+
return false
167+
}
168+
169+
func matchesSingleDenial(deny EvalDenial, f Finding) bool {
170+
if deny.Concern != "" && !strings.EqualFold(f.Concern, deny.Concern) {
171+
return false
172+
}
173+
if deny.MessageContains != "" && !strings.Contains(
174+
strings.ToLower(f.Message), strings.ToLower(deny.MessageContains)) {
175+
return false
176+
}
177+
return true
178+
}
179+
180+
func describeExpectation(exp EvalExpectation) string {
181+
var parts []string
182+
if exp.Concern != "" {
183+
parts = append(parts, fmt.Sprintf("concern=%q", exp.Concern))
184+
}
185+
if exp.MinSeverity != "" {
186+
parts = append(parts, fmt.Sprintf("min_severity=%q", exp.MinSeverity))
187+
}
188+
if exp.MessageContains != "" {
189+
parts = append(parts, fmt.Sprintf("message_contains=%q", exp.MessageContains))
190+
}
191+
if exp.File != "" {
192+
parts = append(parts, fmt.Sprintf("file=%q", exp.File))
193+
}
194+
if len(parts) == 0 {
195+
return "{}"
196+
}
197+
return "{" + strings.Join(parts, ", ") + "}"
198+
}
199+
200+
func describeDenial(deny EvalDenial) string {
201+
var parts []string
202+
if deny.Concern != "" {
203+
parts = append(parts, fmt.Sprintf("concern=%q", deny.Concern))
204+
}
205+
if deny.MessageContains != "" {
206+
parts = append(parts, fmt.Sprintf("message_contains=%q", deny.MessageContains))
207+
}
208+
if len(parts) == 0 {
209+
return "{}"
210+
}
211+
return "{" + strings.Join(parts, ", ") + "}"
212+
}

0 commit comments

Comments
 (0)