Skip to content

Commit 4e62959

Browse files
committed
Fix core gaps: language context, lenient parsing, chunking, examples, CWE
- Language detection: inject primary language + file counts into prompts - Lenient JSON: strip comments, fix unescaped newlines, trailing commas; regex fallback extracts findings from completely broken JSON - Chunking: Describe/Improve now truncate large diffs within token budget - Few-shot examples: each concern has a concrete JSON finding example - CWE from LLM: added to JSON schema, LLM provides directly, falls back to matcher
1 parent 479f23d commit 4e62959

6 files changed

Lines changed: 329 additions & 17 deletions

File tree

describe.go

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"strings"
77

88
"github.com/GrayCodeAI/sight/internal/diff"
9+
"github.com/GrayCodeAI/sight/internal/review"
910
)
1011

1112
// Description is the generated PR summary.
@@ -33,7 +34,7 @@ func Describe(ctx context.Context, rawDiff string, opts ...Option) (*Description
3334
return &Description{Title: "No changes", Summary: "Empty diff"}, nil
3435
}
3536

36-
prompt := buildDescribePrompt(files)
37+
prompt := buildDescribePrompt(files, cfg.maxTokens)
3738

3839
resp, err := cfg.provider.Chat(ctx, []Message{
3940
{Role: "user", Content: prompt},
@@ -69,7 +70,7 @@ Rules:
6970
- Risk: consider blast radius, backwards compatibility, data migrations
7071
- Test plan: concrete steps, not generic "run tests"`
7172

72-
func buildDescribePrompt(files []diff.File) string {
73+
func buildDescribePrompt(files []diff.File, maxTokens int) string {
7374
var b strings.Builder
7475
b.WriteString("Generate a PR description for these changes:\n\n")
7576
b.WriteString("## Stats\n")
@@ -88,27 +89,50 @@ func buildDescribePrompt(files []diff.File) string {
8889
b.WriteString("- " + f.Path + prefix + "\n")
8990
}
9091

91-
b.WriteString("\n## Diff\n\n```diff\n")
92+
// Build the diff section, truncating if it would exceed the token budget.
93+
// For Describe, keep the file list + stats but truncate the diff to fit.
94+
tokenBudget := maxTokens * 3 // leave room for the response
95+
headerTokens := review.EstimateTokens(b.String())
96+
diffBudget := tokenBudget - headerTokens - 100 // 100 token buffer
97+
98+
var diffBuilder strings.Builder
99+
diffBuilder.WriteString("\n## Diff\n\n```diff\n")
100+
truncated := false
101+
92102
for _, f := range files {
93103
if f.Deleted {
94104
continue
95105
}
96-
b.WriteString("--- " + f.Path + "\n")
106+
var fileSection strings.Builder
107+
fileSection.WriteString("--- " + f.Path + "\n")
97108
for _, h := range f.Hunks {
98109
for _, l := range h.Lines {
99110
switch l.Type {
100111
case diff.LineAdded:
101-
b.WriteString("+" + l.Content + "\n")
112+
fileSection.WriteString("+" + l.Content + "\n")
102113
case diff.LineRemoved:
103-
b.WriteString("-" + l.Content + "\n")
114+
fileSection.WriteString("-" + l.Content + "\n")
104115
case diff.LineContext:
105-
b.WriteString(" " + l.Content + "\n")
116+
fileSection.WriteString(" " + l.Content + "\n")
106117
}
107118
}
108119
}
120+
121+
sectionTokens := review.EstimateTokens(fileSection.String())
122+
currentTokens := review.EstimateTokens(diffBuilder.String())
123+
if diffBudget > 0 && currentTokens+sectionTokens > diffBudget {
124+
truncated = true
125+
break
126+
}
127+
diffBuilder.WriteString(fileSection.String())
128+
}
129+
130+
diffBuilder.WriteString("```\n")
131+
if truncated {
132+
diffBuilder.WriteString("\n(diff truncated to fit token budget — file list and stats above are complete)\n")
109133
}
110-
b.WriteString("```\n")
111134

135+
b.WriteString(diffBuilder.String())
112136
return b.String()
113137
}
114138

improve.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"strings"
77

88
"github.com/GrayCodeAI/sight/internal/diff"
9+
"github.com/GrayCodeAI/sight/internal/review"
910
)
1011

1112
// Improvement represents a suggested code improvement.
@@ -43,6 +44,10 @@ func Improve(ctx context.Context, rawDiff string, opts ...Option) (*ImproveResul
4344
return &ImproveResult{}, nil
4445
}
4546

47+
// For Improve, only send the first N files that fit within the token budget.
48+
tokenBudget := cfg.maxTokens * 3 // leave room for response
49+
files = truncateFilesForBudget(files, tokenBudget)
50+
4651
prompt := buildImprovePrompt(files)
4752

4853
resp, err := cfg.provider.Chat(ctx, []Message{
@@ -144,6 +149,52 @@ func parseImprovements(response string) []Improvement {
144149
return valid
145150
}
146151

152+
// truncateFilesForBudget returns a subset of files whose combined estimated
153+
// token count fits within the given budget. It keeps files in order, including
154+
// only those that fit.
155+
func truncateFilesForBudget(files []diff.File, tokenBudget int) []diff.File {
156+
if tokenBudget <= 0 {
157+
return files
158+
}
159+
160+
overhead := review.EstimateTokens(improveSystemPrompt) + 200
161+
available := tokenBudget - overhead
162+
if available <= 0 {
163+
available = tokenBudget / 2
164+
}
165+
166+
var result []diff.File
167+
currentTokens := 0
168+
for _, f := range files {
169+
if f.Deleted {
170+
continue
171+
}
172+
fileTokens := estimateFileTokensPublic(f)
173+
if currentTokens+fileTokens > available && len(result) > 0 {
174+
break
175+
}
176+
result = append(result, f)
177+
currentTokens += fileTokens
178+
}
179+
if len(result) == 0 && len(files) > 0 {
180+
// Always include at least the first file.
181+
result = append(result, files[0])
182+
}
183+
return result
184+
}
185+
186+
// estimateFileTokensPublic approximates the token count for a single file's diff.
187+
func estimateFileTokensPublic(f diff.File) int {
188+
tokens := review.EstimateTokens(f.Path) + 20
189+
for _, h := range f.Hunks {
190+
tokens += 10
191+
for _, l := range h.Lines {
192+
tokens += review.EstimateTokens(l.Content) + 1
193+
}
194+
}
195+
return tokens
196+
}
197+
147198
func extractJSONArray(s string) string {
148199
s = strings.TrimSpace(s)
149200

internal/review/concerns.go

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ type Finding struct {
2828
Message string
2929
Fix string
3030
Reasoning string
31+
CWE string
3132
}
3233

3334
// AllConcerns returns every available concern definition.
@@ -43,7 +44,10 @@ func AllConcerns() []Concern {
4344
- Missing input validation and sanitization
4445
- SSRF, open redirects, CORS misconfiguration
4546
- Race conditions that could be exploited
46-
- Cryptographic weaknesses`,
47+
- Cryptographic weaknesses
48+
49+
Example of a correct finding:
50+
{"file": "handler.go", "line": 42, "severity": "critical", "message": "SQL injection: user input concatenated directly into query string", "fix": "Use parameterized query: db.Query(\"SELECT * FROM users WHERE id = $1\", userID)", "reasoning": "The userID variable comes from r.URL.Query() and is concatenated into the SQL string without sanitization"}`,
4751
},
4852
{
4953
Name: "bugs",
@@ -56,7 +60,10 @@ func AllConcerns() []Concern {
5660
- Integer overflow/underflow
5761
- Incorrect boolean logic or operator precedence
5862
- Unreachable code and dead branches
59-
- Type assertion failures`,
63+
- Type assertion failures
64+
65+
Example of a correct finding:
66+
{"file": "service.go", "line": 87, "severity": "high", "message": "Nil pointer dereference: result used without checking error return from GetUser", "fix": "Add nil check: if result == nil { return fmt.Errorf(\"user not found: %s\", id) }", "reasoning": "GetUser returns (nil, nil) when user is not found, so result.Name on line 88 will panic"}`,
6067
},
6168
{
6269
Name: "performance",
@@ -68,7 +75,10 @@ func AllConcerns() []Concern {
6875
- Blocking operations that could be concurrent
6976
- N+1 query patterns
7077
- Excessive string concatenation without builders
71-
- Missing connection/resource pooling`,
78+
- Missing connection/resource pooling
79+
80+
Example of a correct finding:
81+
{"file": "process.go", "line": 55, "severity": "medium", "message": "Allocation in hot loop: fmt.Sprintf called on every iteration to build key", "fix": "Pre-allocate a strings.Builder outside the loop or use string concatenation for simple key construction", "reasoning": "fmt.Sprintf allocates a new string each iteration; with 10k+ items this causes significant GC pressure"}`,
7282
},
7383
{
7484
Name: "correctness",
@@ -80,7 +90,10 @@ func AllConcerns() []Concern {
8090
- Are API contracts and interfaces respected?
8191
- Are invariants maintained across the change?
8292
- Could the change break existing callers?
83-
- Are defer/cleanup operations in the correct order?`,
93+
- Are defer/cleanup operations in the correct order?
94+
95+
Example of a correct finding:
96+
{"file": "repo.go", "line": 34, "severity": "high", "message": "Unchecked error: os.Remove return value discarded, file may not be deleted", "fix": "if err := os.Remove(path); err != nil { return fmt.Errorf(\"cleanup failed: %w\", err) }", "reasoning": "os.Remove can fail due to permissions or file-in-use; ignoring the error leaves stale temp files that accumulate over time"}`,
8497
},
8598
{
8699
Name: "style",
@@ -91,7 +104,10 @@ func AllConcerns() []Concern {
91104
- Abstraction level: is the code at a consistent abstraction level?
92105
- Error messages: are they actionable and specific?
93106
- API design: is the public surface minimal and intuitive?
94-
- Testability: is the code structured for easy testing?`,
107+
- Testability: is the code structured for easy testing?
108+
109+
Example of a correct finding:
110+
{"file": "client.go", "line": 12, "severity": "low", "message": "Exported function FetchData lacks doc comment", "fix": "// FetchData retrieves data from the remote API by ID.\nfunc FetchData(id string) (*Data, error) {", "reasoning": "Go convention requires doc comments on all exported symbols; godoc and linters flag this"}`,
95111
},
96112
}
97113
}

internal/review/prompt.go

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package review
22

33
import (
44
"fmt"
5+
"path/filepath"
6+
"sort"
57
"strings"
68

79
"github.com/GrayCodeAI/sight/internal/diff"
@@ -22,7 +24,8 @@ IMPORTANT: Respond ONLY with a JSON array of findings. Each finding must have:
2224
"severity": "critical|high|medium|low|info",
2325
"message": "Clear description of the issue",
2426
"fix": "Suggested code fix or approach",
25-
"reasoning": "Why this is a problem"
27+
"reasoning": "Why this is a problem",
28+
"cwe": "CWE-89 (if applicable, otherwise empty string)"
2629
}
2730
2831
If you find no issues, respond with an empty array: []
@@ -82,6 +85,89 @@ func BuildPrompt(concern Concern, files []diff.File, contextLines int) string {
8285
return b.String()
8386
}
8487

88+
// detectLanguages counts file extensions in the diff and returns a formatted
89+
// language context summary. The primary language is the one with the most files.
90+
func detectLanguages(files []diff.File) string {
91+
extCount := make(map[string]int)
92+
for _, f := range files {
93+
if f.Deleted {
94+
continue
95+
}
96+
ext := filepath.Ext(f.Path)
97+
if ext == "" {
98+
ext = filepath.Base(f.Path)
99+
}
100+
extCount[ext]++
101+
}
102+
if len(extCount) == 0 {
103+
return ""
104+
}
105+
106+
// Determine primary language by highest count.
107+
type extEntry struct {
108+
ext string
109+
count int
110+
}
111+
entries := make([]extEntry, 0, len(extCount))
112+
for ext, count := range extCount {
113+
entries = append(entries, extEntry{ext, count})
114+
}
115+
sort.Slice(entries, func(i, j int) bool {
116+
if entries[i].count != entries[j].count {
117+
return entries[i].count > entries[j].count
118+
}
119+
return entries[i].ext < entries[j].ext
120+
})
121+
122+
// Map extensions to language names for primary language display.
123+
langNames := map[string]string{
124+
".go": "Go",
125+
".py": "Python",
126+
".js": "JavaScript",
127+
".ts": "TypeScript",
128+
".tsx": "TypeScript (React)",
129+
".jsx": "JavaScript (React)",
130+
".java": "Java",
131+
".rs": "Rust",
132+
".rb": "Ruby",
133+
".cpp": "C++",
134+
".c": "C",
135+
".h": "C/C++ Header",
136+
".cs": "C#",
137+
".php": "PHP",
138+
".swift": "Swift",
139+
".kt": "Kotlin",
140+
".scala": "Scala",
141+
".yaml": "YAML",
142+
".yml": "YAML",
143+
".json": "JSON",
144+
".toml": "TOML",
145+
".md": "Markdown",
146+
".sql": "SQL",
147+
".sh": "Shell",
148+
".bash": "Bash",
149+
}
150+
151+
primary := entries[0].ext
152+
primaryLang := langNames[primary]
153+
if primaryLang == "" {
154+
primaryLang = strings.TrimPrefix(primary, ".")
155+
}
156+
157+
var b strings.Builder
158+
b.WriteString("## Context\n")
159+
b.WriteString(fmt.Sprintf("Primary language: %s\n", primaryLang))
160+
b.WriteString("Files: ")
161+
for i, e := range entries {
162+
if i > 0 {
163+
b.WriteString(", ")
164+
}
165+
b.WriteString(fmt.Sprintf("%d %s", e.count, e.ext))
166+
}
167+
b.WriteString("\n")
168+
return b.String()
169+
}
170+
85171
// BuildPromptEnhanced constructs a PR-Agent style prompt that separates new and
86172
// old hunks with clear section markers. This helps the LLM distinguish added
87173
// code from removed code more accurately.
@@ -90,6 +176,12 @@ func BuildPromptEnhanced(concern Concern, files []diff.File, contextLines int) s
90176

91177
b.WriteString("Review the following code changes:\n\n")
92178

179+
// Inject language context before the diff content.
180+
if langCtx := detectLanguages(files); langCtx != "" {
181+
b.WriteString(langCtx)
182+
b.WriteString("\n")
183+
}
184+
93185
for _, file := range files {
94186
if file.Deleted {
95187
continue

0 commit comments

Comments
 (0)