GrayCodeAI
diff --git a/‎config.go‎
Lines changed: 3 additions & 0 deletions b/‎config.go‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎incremental.go‎
Lines changed: 93 additions & 0 deletions b/‎incremental.go‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎internal/output/output.go‎
Lines changed: 1 addition & 0 deletions b/‎internal/output/output.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎internal/output/sarif.go‎
Lines changed: 19 additions & 5 deletions b/‎internal/output/sarif.go‎
Lines changed: 19 additions & 5 deletions
diff --git a/‎internal/review/budget.go‎
Lines changed: 115 additions & 3 deletions b/‎internal/review/budget.go‎
Lines changed: 115 additions & 3 deletions
diff --git a/‎internal/review/cwe.go‎
Lines changed: 113 additions & 0 deletions b/‎internal/review/cwe.go‎
Lines changed: 113 additions & 0 deletions
@@ -64,6 +64,9 @@ func ApplyFileConfig(fc *FileConfig) []Option {
 	if fc.Parallel != nil {
 		opts = append(opts, WithParallel(*fc.Parallel))
 	}
+	if len(fc.Exclude) > 0 {
+		opts = append(opts, WithExclude(fc.Exclude...))
+	}
 
 	return opts
 }
 
@@ -0,0 +1,93 @@
+package sight
+
+import (
+	"context"
+	"fmt"
+	"os/exec"
+	"strings"
+	"sync"
+)
+
+// IncrementalState tracks the last-reviewed commit SHA for incremental reviews.
+// Every read and write of the stored SHA happens while holding mu, so a single
+// *IncrementalState may be shared between goroutines. Because the struct
+// contains a sync.Mutex it must not be copied after first use; always pass it
+// around as a pointer.
+type IncrementalState struct {
+	mu              sync.Mutex
+	lastReviewedSHA string // guarded by mu; "" means no review has been recorded
+}
+
+// NewIncrementalState creates a new state tracker, optionally seeded with a
+// previously reviewed SHA for resumption. Pass "" to start without a recorded
+// review.
+func NewIncrementalState(lastSHA string) *IncrementalState {
+	return &IncrementalState{lastReviewedSHA: lastSHA}
+}
+
+// LastReviewedSHA returns the SHA of the last reviewed commit, or "" when no
+// review has been recorded yet.
+func (s *IncrementalState) LastReviewedSHA() string {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.lastReviewedSHA
+}
+
+// SetLastReviewedSHA updates the last-reviewed SHA after a successful review.
+func (s *IncrementalState) SetLastReviewedSHA(sha string) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.lastReviewedSHA = sha
+}
+
+// ReviewIncremental reviews only the changes between the base and head commits.
+// It obtains the diff with `git diff base...head` (falling back to a plain
+// `git diff base head`), runs a Reviewer built from opts over it, and on
+// success records head in state for future incremental runs.
+//
+// If state is non-nil and has a LastReviewedSHA, that SHA is used as the base
+// instead of the provided base argument (enabling resumption).
+//
+// head is resolved to a commit ID before the diff is taken, so a moving name
+// such as "HEAD" or a branch is recorded as the commit that was actually
+// reviewed. Storing the literal name instead would make the next run compare
+// the name with itself and always report that nothing changed.
+//
+// ctx bounds both the git subprocesses and the review itself. An empty diff is
+// not an error: a Result whose Report says there are no new changes is
+// returned and state is still advanced to head.
+//
+// Pass nil for state if you don't need resumption tracking.
+func ReviewIncremental(ctx context.Context, base, head string, state *IncrementalState, opts ...Option) (*Result, error) {
+	if ctx == nil {
+		// exec.CommandContext panics on a nil context; tolerate sloppy callers.
+		ctx = context.Background()
+	}
+
+	if state != nil {
+		if last := state.LastReviewedSHA(); last != "" {
+			base = last
+		}
+	}
+
+	// Best effort: when resolution fails the name is used as given and
+	// git diff below reports any real problem with it.
+	if sha, err := gitResolveCommit(ctx, head); err == nil {
+		head = sha
+	}
+
+	diffText, err := gitDiffRangeContext(ctx, base, head)
+	if err != nil {
+		return nil, fmt.Errorf("sight: incremental diff failed: %w", err)
+	}
+
+	if strings.TrimSpace(diffText) == "" {
+		if state != nil {
+			state.SetLastReviewedSHA(head)
+		}
+		return &Result{Report: "No new changes since last review."}, nil
+	}
+
+	result, err := NewReviewer(opts...).Review(ctx, diffText)
+	if err != nil {
+		return nil, err
+	}
+
+	// Only a successful review advances the checkpoint.
+	if state != nil {
+		state.SetLastReviewedSHA(head)
+	}
+
+	return result, nil
+}
+
+// gitDiffRange runs `git diff base...head` and returns the output.
+// It is the context-free form of gitDiffRangeContext.
+func gitDiffRange(base, head string) (string, error) {
+	return gitDiffRangeContext(context.Background(), base, head)
+}
+
+// gitDiffRangeContext returns the textual diff between base and head. The git
+// processes it starts are killed when ctx is cancelled or times out.
+func gitDiffRangeContext(ctx context.Context, base, head string) (string, error) {
+	for _, rev := range []string{base, head} {
+		if err := checkGitRev(rev); err != nil {
+			return "", err
+		}
+	}
+
+	// Try three-dot syntax first (merge-base diff). The trailing "--" tells
+	// git that everything before it is a revision, never a path.
+	out, err := exec.CommandContext(ctx, "git", "diff", base+"..."+head, "--").Output()
+	if err == nil {
+		return string(out), nil
+	}
+	if ctxErr := ctx.Err(); ctxErr != nil {
+		// Cancelled or timed out: a second git invocation would be pointless.
+		return "", ctxErr
+	}
+
+	// Fall back to two-dot syntax
+	out, err = exec.CommandContext(ctx, "git", "diff", base, head, "--").Output()
+	if err != nil {
+		return "", fmt.Errorf("git diff %s %s failed: %w", base, head, err)
+	}
+	return string(out), nil
+}
+
+// gitResolveCommit turns rev (a branch, tag, "HEAD", abbreviated SHA, ...) into
+// the full ID of the commit it currently points at.
+func gitResolveCommit(ctx context.Context, rev string) (string, error) {
+	if err := checkGitRev(rev); err != nil {
+		return "", err
+	}
+	out, err := exec.CommandContext(ctx, "git", "rev-parse", "--verify", "--quiet", rev+"^{commit}").Output()
+	if err != nil {
+		return "", fmt.Errorf("git rev-parse %s failed: %w", rev, err)
+	}
+	sha := strings.TrimSpace(string(out))
+	if sha == "" {
+		return "", fmt.Errorf("git rev-parse %s returned no commit", rev)
+	}
+	return sha, nil
+}
+
+// checkGitRev rejects revision strings that git would parse as a command-line
+// option instead of a revision.
+func checkGitRev(rev string) error {
+	if strings.HasPrefix(rev, "-") {
+		return fmt.Errorf("invalid revision %q: must not begin with '-'", rev)
+	}
+	return nil
+}
@@ -18,6 +18,7 @@ type Finding struct {
 	Message   string
 	Fix       string
 	Reasoning string
+	CWE       string
 }
 
 // Stats for rendering.
 
@@ -45,11 +45,18 @@ type SARIFMultiformat struct {
 }
 
+// SARIFResult is the JSON shape of one entry that FormatSARIF appends to its
+// results. Locations, Fixes and Taxa are tagged omitempty, so each of them is
+// left out of the encoded output when it has no elements.
 type SARIFResult struct {
-	RuleID    string          `json:"ruleId"`
-	Level     string          `json:"level"`
-	Message   SARIFMultiformat `json:"message"`
-	Locations []SARIFLocation `json:"locations,omitempty"`
-	Fixes     []SARIFFix      `json:"fixes,omitempty"`
+	RuleID    string               `json:"ruleId"`
+	Level     string               `json:"level"`
+	Message   SARIFMultiformat     `json:"message"`
+	Locations []SARIFLocation      `json:"locations,omitempty"`
+	Fixes     []SARIFFix           `json:"fixes,omitempty"`
+	Taxa      []SARIFTaxaReference `json:"taxa,omitempty"`
+}
+
+// SARIFTaxaReference references an external taxonomy entry (e.g., CWE).
+// ID carries the taxon identifier; FormatSARIF fills it from Finding.CWE and
+// sets ToolComponent to SARIFMultiformat{Text: "CWE"}.
+//
+// NOTE(review): SARIF 2.1.0 describes toolComponent as a
+// toolComponentReference (name / index / guid) rather than a message object.
+// Confirm that SARIFMultiformat encodes to the shape SARIF consumers expect.
+type SARIFTaxaReference struct {
+	ID            string           `json:"id"`
+	ToolComponent SARIFMultiformat `json:"toolComponent"`
 }
 
 type SARIFLocation struct {
@@ -126,6 +133,13 @@ func FormatSARIF(findings []Finding) (string, error) {
 			})
 		}
 
+		if f.CWE != "" {
+			result.Taxa = append(result.Taxa, SARIFTaxaReference{
+				ID:            f.CWE,
+				ToolComponent: SARIFMultiformat{Text: "CWE"},
+			})
+		}
+
 		results = append(results, result)
 	}
 
 
@@ -1,13 +1,125 @@
 package review
 
 import (
+	"unicode"
+
 	"github.com/GrayCodeAI/sight/internal/diff"
 )
 
-// EstimateTokens provides a rough token count for a string.
-// Uses the ~4 characters per token heuristic.
+// EstimateTokens returns an approximate BPE token count for s.
+// The input is broken into word and punctuation segments by tokenizeWords and
+// the segment count is scaled by a per-segment factor: ~1.3 tokens per segment
+// for English prose, ~2.0 for code. Input counts as code when more than 30% of
+// its segments look code-like. An empty string yields 0; any other input
+// yields at least 1.
 func EstimateTokens(s string) int {
-	return (len(s) + 3) / 4
+	if s == "" {
+		return 0
+	}
+
+	segments, codeLike := tokenizeWords(s)
+	if segments == 0 {
+		// No segments were counted, which in practice means whitespace-only
+		// input; use the ~4 bytes per token heuristic instead.
+		return (len(s) + 3) / 4
+	}
+
+	// Prose by default; code splits into more subword pieces
+	// (camelCase, snake_case, operators), so it gets the larger factor.
+	perSegment := 1.3
+	if float64(codeLike)/float64(segments) > 0.3 {
+		perSegment = 2.0
+	}
+
+	// Adding 0.5 before truncation rounds to the nearest integer.
+	estimate := int(float64(segments)*perSegment + 0.5)
+	if estimate < 1 {
+		return 1
+	}
+	return estimate
+}
+
+// tokenizeWords scans s rune by rune and returns how many segments it found
+// and how many of them look like code. A segment is either a maximal run of
+// letters, digits, '_' and '$' (judged by looksLikeCodeWord), or one single
+// non-space rune outside such a run. Each of those punctuation runes is
+// counted on its own and always counted as code. Whitespace only separates
+// segments.
+func tokenizeWords(s string) (totalWords int, codeWords int) {
+	runes := []rune(s)
+	start := -1 // index where the open word began, or -1 when between words
+
+	// closeWord tallies the word runes[start:end] if one is currently open.
+	closeWord := func(end int) {
+		if start < 0 {
+			return
+		}
+		totalWords++
+		if looksLikeCodeWord(runes[start:end]) {
+			codeWords++
+		}
+		start = -1
+	}
+
+	for i, r := range runes {
+		switch {
+		case unicode.IsLetter(r), unicode.IsDigit(r), r == '_', r == '$':
+			if start < 0 {
+				start = i
+			}
+		default:
+			closeWord(i)
+			// Braces, operators and the like each become a token in BPE,
+			// and outside of words they are almost always code.
+			if !unicode.IsSpace(r) {
+				totalWords++
+				codeWords++
+			}
+		}
+	}
+	// Account for a word that runs to the end of the input.
+	closeWord(len(runes))
+
+	return totalWords, codeWords
+}
+
+// looksLikeCodeWord reports whether word resembles a source-code identifier
+// rather than ordinary prose. A word qualifies when any of these hold:
+//   - it contains an underscore (snake_case, ALL_CAPS_NAMES);
+//   - it has a lowercase letter and an uppercase letter after the first rune
+//     (camelCase, PascalCase); a merely capitalized word such as "Hello"
+//     does not count;
+//   - it mixes digits with letters ("x86", "v2").
+func looksLikeCodeWord(word []rune) bool {
+	var sawUpper, sawLower, sawDigit, innerUpper bool
+
+	for i, r := range word {
+		switch {
+		case r == '_':
+			// An underscore alone settles it.
+			return true
+		case unicode.IsUpper(r):
+			sawUpper = true
+			innerUpper = innerUpper || i > 0
+		case unicode.IsLower(r):
+			sawLower = true
+		case unicode.IsDigit(r):
+			sawDigit = true
+		}
+	}
+
+	if sawLower && innerUpper {
+		return true
+	}
+	return sawDigit && (sawUpper || sawLower)
 }
 
 // ChunkFiles splits files into groups that fit within the token budget.
 
@@ -0,0 +1,113 @@
+package review
+
+import "strings"
+
+// CWEMapping maps a security finding pattern to a CWE identifier.
+// MatchCWE returns ID when one of Keywords occurs in a finding's text, and
+// LookupCWEName returns Name for a given ID.
+type CWEMapping struct {
+	ID       string   // e.g. "CWE-89"
+	Name     string   // e.g. "SQL Injection"
+	Keywords []string // lowercase keywords to match in finding messages
+}
+
+// cweDatabase is the built-in set of common security weakness patterns.
+// MatchCWE walks it in slice order and returns on the first keyword hit, so
+// an entry listed earlier takes precedence when several could match.
+// All keywords are lowercase because MatchCWE lower-cases the text it scans.
+// A few keywords end in a space (for example "sha1 " and "des "); that space
+// is part of the keyword as written.
+//
+// NOTE(review): very short keywords such as "des " or "md5" can also occur
+// inside unrelated words when compared by plain substring; keep new keywords
+// as specific as possible.
+var cweDatabase = []CWEMapping{
+	{
+		ID:       "CWE-89",
+		Name:     "SQL Injection",
+		Keywords: []string{"sql injection", "sql concat", "string concatenation into sql", "raw query", "unsanitized sql"},
+	},
+	{
+		ID:       "CWE-79",
+		Name:     "Cross-site Scripting (XSS)",
+		Keywords: []string{"xss", "cross-site scripting", "unescaped output", "unsanitized html", "reflected input"},
+	},
+	{
+		ID:       "CWE-78",
+		Name:     "OS Command Injection",
+		Keywords: []string{"command injection", "os.exec", "exec.command", "shell injection", "unsanitized command"},
+	},
+	{
+		ID:       "CWE-22",
+		Name:     "Path Traversal",
+		Keywords: []string{"path traversal", "directory traversal", "../ ", "dot dot slash", "file path manipulation"},
+	},
+	{
+		ID:       "CWE-918",
+		Name:     "Server-Side Request Forgery (SSRF)",
+		Keywords: []string{"ssrf", "server-side request forgery", "unvalidated url", "open redirect to internal"},
+	},
+	{
+		ID:       "CWE-798",
+		Name:     "Hardcoded Credentials",
+		Keywords: []string{"hardcoded secret", "hardcoded password", "hardcoded credential", "hardcoded api key", "embedded secret", "secret in code"},
+	},
+	{
+		ID:       "CWE-327",
+		Name:     "Use of Broken Crypto Algorithm",
+		Keywords: []string{"weak crypto", "md5", "sha1 ", "des ", "broken crypto", "insecure hash", "weak hash"},
+	},
+	{
+		ID:       "CWE-502",
+		Name:     "Deserialization of Untrusted Data",
+		Keywords: []string{"insecure deserialization", "unsafe deserialization", "untrusted deserialization", "pickle", "yaml.load"},
+	},
+	{
+		ID:       "CWE-611",
+		Name:     "XML External Entity (XXE)",
+		Keywords: []string{"xxe", "xml external entity", "xml injection"},
+	},
+	{
+		ID:       "CWE-352",
+		Name:     "Cross-Site Request Forgery (CSRF)",
+		Keywords: []string{"csrf", "cross-site request forgery", "missing csrf token"},
+	},
+	{
+		ID:       "CWE-200",
+		Name:     "Information Exposure",
+		Keywords: []string{"information disclosure", "sensitive data exposure", "data leak", "credential in log", "logging sensitive"},
+	},
+	{
+		ID:       "CWE-362",
+		Name:     "Race Condition",
+		Keywords: []string{"race condition", "data race", "toctou", "time of check"},
+	},
+	{
+		ID:       "CWE-190",
+		Name:     "Integer Overflow",
+		Keywords: []string{"integer overflow", "integer underflow", "int overflow"},
+	},
+	{
+		ID:       "CWE-601",
+		Name:     "Open Redirect",
+		Keywords: []string{"open redirect", "url redirect", "unvalidated redirect"},
+	},
+	{
+		ID:       "CWE-862",
+		Name:     "Missing Authorization",
+		Keywords: []string{"missing authorization", "missing auth check", "authorization bypass", "broken access control"},
+	},
+}
+
+// MatchCWE checks a finding's message (and fix) against the CWE database and
+// returns the CWE ID if a match is found. Returns empty string if no match.
+//
+// The comparison is case-insensitive. Entries are tried in database order and
+// the first keyword that matches decides the result.
+func MatchCWE(message, fix string) string {
+	lower := strings.ToLower(message + " " + fix)
+	for _, cwe := range cweDatabase {
+		for _, keyword := range cwe.Keywords {
+			if containsCWEKeyword(lower, keyword) {
+				return cwe.ID
+			}
+		}
+	}
+	return ""
+}
+
+// containsCWEKeyword reports whether keyword occurs in text.
+//
+// A keyword without a trailing space is matched as a plain substring. A
+// trailing space marks a term that has to stand on its own: the space is not
+// matched literally, the term must instead end at a word boundary and must
+// not be the tail of a longer word. That keeps "des " from firing on
+// "includes " while still recognizing "uses SHA1." and "../etc/passwd".
+// A digit directly before the term is allowed so that "3des" is still caught.
+func containsCWEKeyword(text, keyword string) bool {
+	term := strings.TrimRight(keyword, " ")
+	if term == keyword || term == "" {
+		return strings.Contains(text, keyword)
+	}
+
+	// Boundaries are only meaningful on the sides where the term itself
+	// starts or ends with a word character.
+	guardLeft := isCWEWordByte(term[0])
+	guardRight := isCWEWordByte(term[len(term)-1])
+
+	for from := 0; from <= len(text)-len(term); {
+		i := strings.Index(text[from:], term)
+		if i < 0 {
+			return false
+		}
+		start := from + i
+		end := start + len(term)
+
+		leftOK := !guardLeft || start == 0 || !isCWELetterByte(text[start-1])
+		rightOK := !guardRight || end == len(text) || !isCWEWordByte(text[end])
+		if leftOK && rightOK {
+			return true
+		}
+		from = start + 1
+	}
+	return false
+}
+
+// isCWELetterByte reports whether b is an ASCII letter.
+func isCWELetterByte(b byte) bool {
+	return ('a' <= b && b <= 'z') || ('A' <= b && b <= 'Z')
+}
+
+// isCWEWordByte reports whether b is an ASCII letter or digit.
+func isCWEWordByte(b byte) bool {
+	return isCWELetterByte(b) || ('0' <= b && b <= '9')
+}
+
+// LookupCWEName resolves a CWE ID such as "CWE-89" to its human-readable
+// name from the built-in database. IDs that are not in the database yield "".
+func LookupCWEName(id string) string {
+	for i := range cweDatabase {
+		if entry := &cweDatabase[i]; entry.ID == id {
+			return entry.Name
+		}
+	}
+	return ""
+}
Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,9 @@ func ApplyFileConfig(fc *FileConfig) []Option {`
`64`	`64`	`if fc.Parallel != nil {`
`65`	`65`	`opts = append(opts, WithParallel(*fc.Parallel))`
`66`	`66`	`}`
	`67`	`+ if len(fc.Exclude) > 0 {`
	`68`	`+ opts = append(opts, WithExclude(fc.Exclude...))`
	`69`	`+ }`
`67`	`70`
`68`	`71`	`return opts`
`69`	`72`	`}`
Original file line number	Diff line number	Diff line change
`@@ -18,6 +18,7 @@ type Finding struct {`
`18`	`18`	`Message string`
`19`	`19`	`Fix string`
`20`	`20`	`Reasoning string`
	`21`	`+ CWE string`
`21`	`22`	`}`
`22`	`23`
`23`	`24`	`// Stats for rendering.`