Skip to content

Commit 9282adb

Browse files
committed
Competitor parity: BPE tokens, enhanced hunks, filtering, scoring, CWE, incremental, rules
Features from analyzing PR-Agent, gosec, reviewdog, Shippie: - BPE-approximation token counting (code vs prose detection) - PR-Agent style __new hunk__ / __old hunk__ diff format with line numbers - File filtering: auto-exclude lock files, generated code, minified assets - Suggestion scoring (1-10) with configurable minimum threshold - CWE mapping: 15 common weakness patterns auto-tagged on findings - SARIF output includes CWE taxa references - Incremental review: only review new commits since last review SHA - Project rules awareness: reads .cursor/rules/, CLAUDE.md, CONTRIBUTING.md
1 parent ce79623 commit 9282adb

12 files changed

Lines changed: 616 additions & 12 deletions

File tree

config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ func ApplyFileConfig(fc *FileConfig) []Option {
6464
if fc.Parallel != nil {
6565
opts = append(opts, WithParallel(*fc.Parallel))
6666
}
67+
if len(fc.Exclude) > 0 {
68+
opts = append(opts, WithExclude(fc.Exclude...))
69+
}
6770

6871
return opts
6972
}

incremental.go

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
package sight
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os/exec"
7+
"strings"
8+
"sync"
9+
)
10+
11+
// IncrementalState tracks the last-reviewed commit SHA for incremental reviews.
12+
// It is safe for concurrent use.
13+
type IncrementalState struct {
14+
mu sync.Mutex
15+
lastReviewedSHA string
16+
}
17+
18+
// NewIncrementalState creates a new state tracker, optionally seeded with a
19+
// previously reviewed SHA for resumption.
20+
func NewIncrementalState(lastSHA string) *IncrementalState {
21+
return &IncrementalState{lastReviewedSHA: lastSHA}
22+
}
23+
24+
// LastReviewedSHA returns the SHA of the last reviewed commit.
25+
func (s *IncrementalState) LastReviewedSHA() string {
26+
s.mu.Lock()
27+
defer s.mu.Unlock()
28+
return s.lastReviewedSHA
29+
}
30+
31+
// SetLastReviewedSHA updates the last-reviewed SHA after a successful review.
32+
func (s *IncrementalState) SetLastReviewedSHA(sha string) {
33+
s.mu.Lock()
34+
defer s.mu.Unlock()
35+
s.lastReviewedSHA = sha
36+
}
37+
38+
// ReviewIncremental reviews only the changes between base and head commits.
39+
// It uses `git diff base...head` to obtain the diff, reviews it, and records
40+
// the head SHA in the provided state for future incremental runs.
41+
//
42+
// If state is non-nil and has a LastReviewedSHA, that SHA is used as the base
43+
// instead of the provided base argument (enabling resumption).
44+
//
45+
// Pass nil for state if you don't need resumption tracking.
46+
func ReviewIncremental(ctx context.Context, base, head string, state *IncrementalState, opts ...Option) (*Result, error) {
47+
if state != nil {
48+
if last := state.LastReviewedSHA(); last != "" {
49+
base = last
50+
}
51+
}
52+
53+
diffText, err := gitDiffRange(base, head)
54+
if err != nil {
55+
return nil, fmt.Errorf("sight: incremental diff failed: %w", err)
56+
}
57+
58+
if strings.TrimSpace(diffText) == "" {
59+
result := &Result{Report: "No new changes since last review."}
60+
if state != nil {
61+
state.SetLastReviewedSHA(head)
62+
}
63+
return result, nil
64+
}
65+
66+
r := NewReviewer(opts...)
67+
result, err := r.Review(ctx, diffText)
68+
if err != nil {
69+
return nil, err
70+
}
71+
72+
if state != nil {
73+
state.SetLastReviewedSHA(head)
74+
}
75+
76+
return result, nil
77+
}
78+
79+
// gitDiffRange runs `git diff base...head` and returns the output.
80+
func gitDiffRange(base, head string) (string, error) {
81+
// Try three-dot syntax first (merge-base diff)
82+
out, err := exec.Command("git", "diff", base+"..."+head).Output()
83+
if err == nil {
84+
return string(out), nil
85+
}
86+
87+
// Fall back to two-dot syntax
88+
out, err = exec.Command("git", "diff", base, head).Output()
89+
if err != nil {
90+
return "", fmt.Errorf("git diff %s %s failed: %w", base, head, err)
91+
}
92+
return string(out), nil
93+
}

internal/output/output.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ type Finding struct {
1818
Message string
1919
Fix string
2020
Reasoning string
21+
CWE string
2122
}
2223

2324
// Stats for rendering.

internal/output/sarif.go

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,18 @@ type SARIFMultiformat struct {
4545
}
4646

4747
type SARIFResult struct {
48-
RuleID string `json:"ruleId"`
49-
Level string `json:"level"`
50-
Message SARIFMultiformat `json:"message"`
51-
Locations []SARIFLocation `json:"locations,omitempty"`
52-
Fixes []SARIFFix `json:"fixes,omitempty"`
48+
RuleID string `json:"ruleId"`
49+
Level string `json:"level"`
50+
Message SARIFMultiformat `json:"message"`
51+
Locations []SARIFLocation `json:"locations,omitempty"`
52+
Fixes []SARIFFix `json:"fixes,omitempty"`
53+
Taxa []SARIFTaxaReference `json:"taxa,omitempty"`
54+
}
55+
56+
// SARIFTaxaReference references an external taxonomy entry (e.g., CWE).
57+
type SARIFTaxaReference struct {
58+
ID string `json:"id"`
59+
ToolComponent SARIFMultiformat `json:"toolComponent"`
5360
}
5461

5562
type SARIFLocation struct {
@@ -126,6 +133,13 @@ func FormatSARIF(findings []Finding) (string, error) {
126133
})
127134
}
128135

136+
if f.CWE != "" {
137+
result.Taxa = append(result.Taxa, SARIFTaxaReference{
138+
ID: f.CWE,
139+
ToolComponent: SARIFMultiformat{Text: "CWE"},
140+
})
141+
}
142+
129143
results = append(results, result)
130144
}
131145

internal/review/budget.go

Lines changed: 115 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,125 @@
11
package review
22

33
import (
4+
"unicode"
5+
46
"github.com/GrayCodeAI/sight/internal/diff"
57
)
68

7-
// EstimateTokens provides a rough token count for a string.
8-
// Uses the ~4 characters per token heuristic.
9+
// EstimateTokens provides a BPE-approximation token count for a string.
10+
// It splits on whitespace and punctuation, then applies a multiplier:
11+
// ~1.3 tokens per word for English prose, ~2.0 tokens per word for code.
12+
// This is significantly more accurate than the naive len(s)/4 heuristic.
913
func EstimateTokens(s string) int {
10-
return (len(s) + 3) / 4
14+
if len(s) == 0 {
15+
return 0
16+
}
17+
18+
words, codeIndicators := tokenizeWords(s)
19+
if words == 0 {
20+
// Fallback for strings that are all punctuation/symbols.
21+
return (len(s) + 3) / 4
22+
}
23+
24+
// Determine multiplier based on code density.
25+
// If >30% of "words" contain code indicators, treat as code.
26+
codeRatio := float64(codeIndicators) / float64(words)
27+
var multiplier float64
28+
if codeRatio > 0.3 {
29+
multiplier = 2.0 // code: more subword splits (camelCase, snake_case, operators)
30+
} else {
31+
multiplier = 1.3 // English prose
32+
}
33+
34+
tokens := int(float64(words)*multiplier + 0.5)
35+
if tokens < 1 {
36+
tokens = 1
37+
}
38+
return tokens
39+
}
40+
41+
// tokenizeWords splits s into word-like segments and counts how many
42+
// look like code (contain underscores, mixed case, braces, operators, etc.).
43+
func tokenizeWords(s string) (totalWords int, codeWords int) {
44+
inWord := false
45+
wordStart := 0
46+
runes := []rune(s)
47+
48+
for i, r := range runes {
49+
isWordChar := unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '$'
50+
51+
if isWordChar {
52+
if !inWord {
53+
wordStart = i
54+
inWord = true
55+
}
56+
} else {
57+
if inWord {
58+
totalWords++
59+
if looksLikeCodeWord(runes[wordStart:i]) {
60+
codeWords++
61+
}
62+
inWord = false
63+
}
64+
// Count standalone punctuation clusters as tokens too
65+
// (braces, operators, etc. each become tokens in BPE).
66+
if !unicode.IsSpace(r) {
67+
totalWords++
68+
codeWords++ // punctuation outside words is almost always code
69+
}
70+
}
71+
}
72+
if inWord {
73+
totalWords++
74+
if looksLikeCodeWord(runes[wordStart:]) {
75+
codeWords++
76+
}
77+
}
78+
79+
return totalWords, codeWords
80+
}
81+
82+
// looksLikeCodeWord returns true if a word looks like source code:
83+
// camelCase, snake_case, ALL_CAPS with underscores, or contains digits
84+
// mixed with letters.
85+
func looksLikeCodeWord(word []rune) bool {
86+
hasUpper := false
87+
hasLower := false
88+
hasDigit := false
89+
hasUnderscore := false
90+
91+
for _, r := range word {
92+
switch {
93+
case unicode.IsUpper(r):
94+
hasUpper = true
95+
case unicode.IsLower(r):
96+
hasLower = true
97+
case unicode.IsDigit(r):
98+
hasDigit = true
99+
case r == '_':
100+
hasUnderscore = true
101+
}
102+
}
103+
104+
// camelCase or PascalCase: mixed case in a single word
105+
if hasUpper && hasLower && len(word) > 1 {
106+
// Check if it's truly mixed (not just capitalized first letter)
107+
for _, r := range word[1:] {
108+
if unicode.IsUpper(r) {
109+
return true
110+
}
111+
}
112+
}
113+
// snake_case
114+
if hasUnderscore {
115+
return true
116+
}
117+
// alphanumeric identifiers like "x86" or "v2"
118+
if hasDigit && (hasUpper || hasLower) {
119+
return true
120+
}
121+
122+
return false
11123
}
12124

13125
// ChunkFiles splits files into groups that fit within the token budget.

internal/review/cwe.go

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
package review
2+
3+
import "strings"
4+
5+
// CWEMapping maps a security finding pattern to a CWE identifier.
6+
type CWEMapping struct {
7+
ID string // e.g. "CWE-89"
8+
Name string // e.g. "SQL Injection"
9+
Keywords []string // lowercase keywords to match in finding messages
10+
}
11+
12+
// cweDatabase is the built-in set of common security weakness patterns.
13+
var cweDatabase = []CWEMapping{
14+
{
15+
ID: "CWE-89",
16+
Name: "SQL Injection",
17+
Keywords: []string{"sql injection", "sql concat", "string concatenation into sql", "raw query", "unsanitized sql"},
18+
},
19+
{
20+
ID: "CWE-79",
21+
Name: "Cross-site Scripting (XSS)",
22+
Keywords: []string{"xss", "cross-site scripting", "unescaped output", "unsanitized html", "reflected input"},
23+
},
24+
{
25+
ID: "CWE-78",
26+
Name: "OS Command Injection",
27+
Keywords: []string{"command injection", "os.exec", "exec.command", "shell injection", "unsanitized command"},
28+
},
29+
{
30+
ID: "CWE-22",
31+
Name: "Path Traversal",
32+
Keywords: []string{"path traversal", "directory traversal", "../ ", "dot dot slash", "file path manipulation"},
33+
},
34+
{
35+
ID: "CWE-918",
36+
Name: "Server-Side Request Forgery (SSRF)",
37+
Keywords: []string{"ssrf", "server-side request forgery", "unvalidated url", "open redirect to internal"},
38+
},
39+
{
40+
ID: "CWE-798",
41+
Name: "Hardcoded Credentials",
42+
Keywords: []string{"hardcoded secret", "hardcoded password", "hardcoded credential", "hardcoded api key", "embedded secret", "secret in code"},
43+
},
44+
{
45+
ID: "CWE-327",
46+
Name: "Use of Broken Crypto Algorithm",
47+
Keywords: []string{"weak crypto", "md5", "sha1 ", "des ", "broken crypto", "insecure hash", "weak hash"},
48+
},
49+
{
50+
ID: "CWE-502",
51+
Name: "Deserialization of Untrusted Data",
52+
Keywords: []string{"insecure deserialization", "unsafe deserialization", "untrusted deserialization", "pickle", "yaml.load"},
53+
},
54+
{
55+
ID: "CWE-611",
56+
Name: "XML External Entity (XXE)",
57+
Keywords: []string{"xxe", "xml external entity", "xml injection"},
58+
},
59+
{
60+
ID: "CWE-352",
61+
Name: "Cross-Site Request Forgery (CSRF)",
62+
Keywords: []string{"csrf", "cross-site request forgery", "missing csrf token"},
63+
},
64+
{
65+
ID: "CWE-200",
66+
Name: "Information Exposure",
67+
Keywords: []string{"information disclosure", "sensitive data exposure", "data leak", "credential in log", "logging sensitive"},
68+
},
69+
{
70+
ID: "CWE-362",
71+
Name: "Race Condition",
72+
Keywords: []string{"race condition", "data race", "toctou", "time of check"},
73+
},
74+
{
75+
ID: "CWE-190",
76+
Name: "Integer Overflow",
77+
Keywords: []string{"integer overflow", "integer underflow", "int overflow"},
78+
},
79+
{
80+
ID: "CWE-601",
81+
Name: "Open Redirect",
82+
Keywords: []string{"open redirect", "url redirect", "unvalidated redirect"},
83+
},
84+
{
85+
ID: "CWE-862",
86+
Name: "Missing Authorization",
87+
Keywords: []string{"missing authorization", "missing auth check", "authorization bypass", "broken access control"},
88+
},
89+
}
90+
91+
// MatchCWE checks a finding's message (and fix) against the CWE database and
92+
// returns the CWE ID if a match is found. Returns empty string if no match.
93+
func MatchCWE(message, fix string) string {
94+
lower := strings.ToLower(message + " " + fix)
95+
for _, cwe := range cweDatabase {
96+
for _, keyword := range cwe.Keywords {
97+
if strings.Contains(lower, keyword) {
98+
return cwe.ID
99+
}
100+
}
101+
}
102+
return ""
103+
}
104+
105+
// LookupCWEName returns the human-readable name for a CWE ID.
106+
func LookupCWEName(id string) string {
107+
for _, cwe := range cweDatabase {
108+
if cwe.ID == id {
109+
return cwe.Name
110+
}
111+
}
112+
return ""
113+
}

0 commit comments

Comments
 (0)