Skip to content

Commit b401fdb

Browse files
committed
MEDIUM: ignore code identifiers in spell check by default
Extract identifiers (function names, variables, etc.) from diff content and ignore them when spell checking file contents and the commit message body. This significantly reduces false positive spelling errors on valid code references. Added a new configuration option `no_ignore_identifiers` to disable this behavior if needed. The commit message subject remains strictly checked without identifier filtering.
1 parent 8e2e8d7 commit b401fdb

File tree

3 files changed

+321
-37
lines changed

3 files changed

+321
-37
lines changed

aspell/aspell.go

Lines changed: 73 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,13 @@ type RemoteFile struct {
2424
}
2525

2626
// Aspell is the spell-check configuration, populated from YAML.
type Aspell struct {
	RemoteFile   RemoteFile `yaml:"remote_file"`  // remote word-list settings (see RemoteFile)
	Mode         mode       `yaml:"mode"`         // checking mode — disabled/subject/commit/all per the mode constants
	HelpText     string     `yaml:"-"`            // excluded from YAML (un)marshalling
	IgnoreFiles  []string   `yaml:"ignore_files"` // file patterns skipped during content checks (matched via match.MatchFilter)
	AllowedWords []string   `yaml:"allowed"`      // extra words accepted as correctly spelled
	MinLength    int        `yaml:"min_length"`   // presumably the minimum word length that is checked — TODO confirm in checkSingle
	// NoIgnoreIdentifiers, when true, disables the default behavior of
	// collecting code identifiers (function/variable names extracted from
	// diff content) and ignoring them during spell checks.
	NoIgnoreIdentifiers bool `yaml:"no_ignore_identifiers"`
}
3435

3536
var (
@@ -152,49 +153,84 @@ func (a Aspell) Check(subjects []string, commitsFull []string, content []map[str
152153
}
153154
}
154155

156+
// Collect identifiers (function names, variable names, etc.) from diff
157+
// content so they can be ignored during spell checking.
158+
var identifierWords []string
159+
if !a.NoIgnoreIdentifiers {
160+
seen := map[string]struct{}{}
161+
for _, file := range content {
162+
for name, v := range file {
163+
for _, word := range match.GetIdentifiersFromContent(name, v) {
164+
if _, ok := seen[word]; !ok {
165+
seen[word] = struct{}{}
166+
identifierWords = append(identifierWords, word)
167+
}
168+
}
169+
}
170+
}
171+
if len(identifierWords) > 0 {
172+
log.Printf("collected %d identifiers from diff content for spell check filtering", len(identifierWords))
173+
}
174+
}
175+
155176
var response strings.Builder
156-
var checks []string
157177
switch a.Mode {
158178
case modeDisabled:
159179
return nil
160180
case modeSubject:
161-
checks = subjects
162-
case modeCommit:
163-
checks = commitsFullData
164-
case modeAll:
165-
for _, file := range content {
166-
for name, v := range file {
167-
nextFile := false
168-
for _, filter := range a.IgnoreFiles {
169-
if match.MatchFilter(name, filter) {
170-
// log.Println("File", name, "in ignore list")
171-
nextFile = true
181+
for _, subject := range subjects {
182+
if err := a.checkSingle(subject, []string{}); err != nil {
183+
junitSuite.AddMessageFailed("commit message", "aspell check failed", err.Error())
184+
log.Println("commit message", err.Error())
185+
response.WriteString(fmt.Sprintf("%s\n", err))
186+
}
187+
}
188+
case modeCommit, modeAll:
189+
if a.Mode == modeAll {
190+
for _, file := range content {
191+
for name, v := range file {
192+
nextFile := false
193+
for _, filter := range a.IgnoreFiles {
194+
if match.MatchFilter(name, filter) {
195+
// log.Println("File", name, "in ignore list")
196+
nextFile = true
197+
continue
198+
}
199+
}
200+
if nextFile {
172201
continue
173202
}
203+
var imports []string
204+
if strings.HasSuffix(name, ".go") {
205+
imports = match.GetImportWordsFromGoFile(name)
206+
}
207+
imports = append(imports, identifierWords...)
208+
if err := a.checkSingle(v, imports); err != nil {
209+
junitSuite.AddMessageFailed(name, "aspell check failed", err.Error())
210+
log.Println(name, err.Error())
211+
response.WriteString(fmt.Sprintf("%s\n", err))
212+
}
174213
}
175-
if nextFile {
176-
continue
177-
}
178-
var imports []string
179-
if strings.HasSuffix(name, ".go") {
180-
imports = match.GetImportWordsFromGoFile(name)
181-
}
182-
if err := a.checkSingle(v, imports); err != nil {
183-
junitSuite.AddMessageFailed(name, "aspell check failed", err.Error())
184-
log.Println(name, err.Error())
214+
}
215+
}
216+
// Check commit messages: subject without identifiers, body with identifiers
217+
for _, msg := range commitsFullData {
218+
parts := strings.SplitN(msg, "\n\n", 2)
219+
// Subject — no identifier filtering (same as hash behavior)
220+
if err := a.checkSingle(parts[0], []string{}); err != nil {
221+
junitSuite.AddMessageFailed("commit message", "aspell check failed", err.Error())
222+
log.Println("commit message", err.Error())
223+
response.WriteString(fmt.Sprintf("%s\n", err))
224+
}
225+
// Body — identifier filtering allowed
226+
if len(parts) > 1 {
227+
if err := a.checkSingle(parts[1], identifierWords); err != nil {
228+
junitSuite.AddMessageFailed("commit message", "aspell check failed", err.Error())
229+
log.Println("commit message body", err.Error())
185230
response.WriteString(fmt.Sprintf("%s\n", err))
186231
}
187232
}
188233
}
189-
checks = commitsFullData
190-
}
191-
192-
for _, subject := range checks {
193-
if err := a.checkSingle(subject, []string{}); err != nil {
194-
junitSuite.AddMessageFailed("commit message", "aspell check failed", err.Error())
195-
log.Println("commit message", err.Error())
196-
response.WriteString(fmt.Sprintf("%s\n", err))
197-
}
198234
}
199235

200236
if len(response.String()) > 0 {

aspell/new.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ Add words to allowed list if its false positive`
6464
content example:
6565
mode: subject
6666
min_length: 3
67+
no_ignore_identifiers: false
6768
ignore_files:
6869
- 'gen/*'
6970
allowed:

match/identifiers.go

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
package match
2+
3+
import (
4+
"go/ast"
5+
"go/parser"
6+
"go/token"
7+
"regexp"
8+
"strings"
9+
)
10+
11+
var (
	// C identifiers: function definitions, typed variable declarations,
	// struct/enum/union/typedef names, #define names, and ALL_CAPS macros.
	cFuncRe   = regexp.MustCompile(`\b(\w+)\s*\([^)]*\)\s*\{`)
	cVarRe    = regexp.MustCompile(`\b(?:int|char|long|short|unsigned|signed|float|double|void|size_t|ssize_t|uint\d*_t|int\d*_t|bool|const)\s+\*?\s*(\w+)`)
	cStructRe = regexp.MustCompile(`\b(?:struct|enum|union|typedef)\s+(\w+)`)
	cDefineRe = regexp.MustCompile(`#define\s+(\w+)`)
	cMacroRe  = regexp.MustCompile(`\b([A-Z][A-Z0-9_]{2,})\b`) // any 3+ char ALL_CAPS token

	// Python identifiers: def/class names and simple assignments.
	pyFuncRe  = regexp.MustCompile(`\bdef\s+(\w+)\s*\(`)
	pyClassRe = regexp.MustCompile(`\bclass\s+(\w+)`)
	pyVarRe   = regexp.MustCompile(`\b(\w+)\s*=\s*`) // NOTE(review): also matches the left side of == comparisons — harmless for an ignore list

	// Generic identifier patterns for other (JS/TS-style) languages.
	genericFuncRe = regexp.MustCompile(`\bfunction\s+(\w+)`)
	genericVarRe  = regexp.MustCompile(`\b(?:let|const|var)\s+(\w+)`)

	// identifierTokenRe matches any identifier-like token (camelCase,
	// PascalCase, snake_case, UPPER_CASE): starts with a letter and is at
	// least 2 characters long.
	identifierTokenRe = regexp.MustCompile(`\b([a-zA-Z][a-zA-Z0-9_]{1,})\b`)
)
32+
33+
// GetIdentifiersFromContent extracts function names, variable names, type names,
34+
// and other identifiers from code content. It auto-detects the language based on
35+
// the file extension. For Go files, it uses the go/ast parser for accurate extraction.
36+
func GetIdentifiersFromContent(filename, content string) []string {
37+
if strings.HasSuffix(filename, ".go") {
38+
return getGoIdentifiers(content)
39+
}
40+
return getRegexIdentifiers(filename, content)
41+
}
42+
43+
// getGoIdentifiers uses go/ast to extract all identifiers from Go source.
44+
// Since the input is often a diff (only added lines), it tries multiple
45+
// strategies to parse the content, then falls back to generic token extraction.
46+
func getGoIdentifiers(content string) []string {
47+
// Try parsing as-is first (complete file)
48+
if ids := parseGoSource(content); len(ids) > 0 {
49+
return ids
50+
}
51+
52+
// Diff content often has "+" prefixes from git — strip them
53+
stripped := stripDiffPrefixes(content)
54+
if ids := parseGoSource(stripped); len(ids) > 0 {
55+
return ids
56+
}
57+
58+
// Wrap in a synthetic file to parse partial code (e.g. function bodies)
59+
wrapped := "package _x\nfunc _() {\n" + stripped + "\n}"
60+
if ids := parseGoSource(wrapped); len(ids) > 0 {
61+
return ids
62+
}
63+
64+
// Wrap as top-level declarations (e.g. type/var/const blocks)
65+
wrapped = "package _x\n" + stripped
66+
if ids := parseGoSource(wrapped); len(ids) > 0 {
67+
return ids
68+
}
69+
70+
// AST failed — extract all identifier-like tokens
71+
return extractAllTokens(stripped)
72+
}
73+
74+
// stripDiffPrefixes removes a single leading "+" (the git marker for added
// diff lines) from every line of content. Lines without the marker are kept
// unchanged, and the returned string always ends with a newline.
func stripDiffPrefixes(content string) string {
	lines := strings.Split(content, "\n")
	cleaned := make([]string, len(lines))
	for i, line := range lines {
		cleaned[i] = strings.TrimPrefix(line, "+")
	}
	return strings.Join(cleaned, "\n") + "\n"
}
86+
87+
// parseGoSource parses Go source and collects every ast.Ident node.
88+
// It accepts partial ASTs (parser may return a usable tree even with errors).
89+
func parseGoSource(src string) []string {
90+
fset := token.NewFileSet()
91+
// SkipObjectResolution is faster and we don't need resolved objects.
92+
// Even with parse errors, the parser may return a partial AST — use it.
93+
f, _ := parser.ParseFile(fset, "", src, parser.SkipObjectResolution)
94+
if f == nil {
95+
return nil
96+
}
97+
98+
seen := map[string]struct{}{}
99+
ast.Inspect(f, func(n ast.Node) bool {
100+
switch node := n.(type) {
101+
case *ast.Ident:
102+
addIdent(seen, node.Name)
103+
case *ast.Field:
104+
// Extract identifiers from struct tags
105+
if node.Tag != nil {
106+
extractTagIdentifiers(seen, node.Tag.Value)
107+
}
108+
case *ast.BasicLit:
109+
// Extract identifier-like tokens from string literals
110+
// (catches words inside regex patterns, format strings, etc.)
111+
if node.Kind == token.STRING {
112+
extractStringLiteralTokens(seen, node.Value)
113+
}
114+
}
115+
return true
116+
})
117+
118+
if len(seen) == 0 {
119+
return nil
120+
}
121+
122+
result := make([]string, 0, len(seen))
123+
for word := range seen {
124+
result = append(result, word)
125+
}
126+
return result
127+
}
128+
129+
func addIdent(seen map[string]struct{}, name string) {
130+
if name == "" || name == "_" || len(name) < 2 {
131+
return
132+
}
133+
if isCommonKeyword(name) {
134+
return
135+
}
136+
lower := strings.ToLower(name)
137+
seen[lower] = struct{}{}
138+
// Also add underscore/hyphen-split parts and camelCase parts,
139+
// since aspell's checkSingle splits words the same way.
140+
for _, part := range strings.FieldsFunc(lower, func(r rune) bool {
141+
return r == '_' || r == '-'
142+
}) {
143+
if len(part) >= 2 && !isCommonKeyword(part) {
144+
seen[part] = struct{}{}
145+
}
146+
}
147+
}
148+
149+
// regexEscapeRe matches common regex escape sequences (\b, \s, \w, \d, \n, \t, etc.)
150+
var regexEscapeRe = regexp.MustCompile(`\\[bBsSwWdDntrfvpP]`)
151+
152+
// extractStringLiteralTokens extracts identifier-like tokens from Go string
153+
// literals. Regex escape sequences are stripped first so that e.g. `\bclass`
154+
// yields `class` instead of `bclass`.
155+
func extractStringLiteralTokens(seen map[string]struct{}, lit string) {
156+
// Remove Go string delimiters
157+
lit = strings.Trim(lit, "\"`")
158+
// Extract from raw content (aspell sees e.g. "bclass" from "\bclass")
159+
for _, m := range identifierTokenRe.FindAllString(lit, -1) {
160+
addIdent(seen, m)
161+
}
162+
// Also extract after stripping regex escapes (gets the real words like "class")
163+
cleaned := regexEscapeRe.ReplaceAllString(lit, " ")
164+
for _, m := range identifierTokenRe.FindAllString(cleaned, -1) {
165+
addIdent(seen, m)
166+
}
167+
}
168+
169+
func extractTagIdentifiers(seen map[string]struct{}, tag string) {
170+
tag = strings.Trim(tag, "`")
171+
tagRe := regexp.MustCompile(`\w+:"([^"]*)"`)
172+
for _, m := range tagRe.FindAllStringSubmatch(tag, -1) {
173+
if len(m) > 1 {
174+
parts := strings.SplitN(m[1], ",", 2)
175+
if parts[0] != "" && parts[0] != "-" {
176+
addIdent(seen, parts[0])
177+
}
178+
}
179+
}
180+
}
181+
182+
// extractAllTokens pulls every identifier-like token from the text.
183+
// Used as a last resort when AST parsing fails completely.
184+
func extractAllTokens(content string) []string {
185+
seen := map[string]struct{}{}
186+
for _, m := range identifierTokenRe.FindAllString(content, -1) {
187+
addIdent(seen, m)
188+
}
189+
result := make([]string, 0, len(seen))
190+
for word := range seen {
191+
result = append(result, word)
192+
}
193+
return result
194+
}
195+
196+
func getRegexIdentifiers(filename, content string) []string {
197+
seen := map[string]struct{}{}
198+
199+
var patterns []*regexp.Regexp
200+
switch {
201+
case strings.HasSuffix(filename, ".c") || strings.HasSuffix(filename, ".h"):
202+
patterns = []*regexp.Regexp{cFuncRe, cVarRe, cStructRe, cDefineRe, cMacroRe}
203+
case strings.HasSuffix(filename, ".py"):
204+
patterns = []*regexp.Regexp{pyFuncRe, pyClassRe, pyVarRe}
205+
case strings.HasSuffix(filename, ".js") || strings.HasSuffix(filename, ".ts") ||
206+
strings.HasSuffix(filename, ".jsx") || strings.HasSuffix(filename, ".tsx"):
207+
patterns = []*regexp.Regexp{genericFuncRe, genericVarRe, pyClassRe}
208+
default:
209+
// For unknown languages, extract all identifier-like tokens
210+
return extractAllTokens(content)
211+
}
212+
213+
for _, re := range patterns {
214+
for _, m := range re.FindAllStringSubmatch(content, -1) {
215+
for i := 1; i < len(m); i++ {
216+
addIdent(seen, m[i])
217+
}
218+
}
219+
}
220+
221+
result := make([]string, 0, len(seen))
222+
for word := range seen {
223+
result = append(result, word)
224+
}
225+
return result
226+
}
227+
228+
// commonKeywords is the set of language keywords and ubiquitous builtins
// (spanning Go, C, Python, JS, and friends) that should never be treated
// as identifiers worth ignoring in spell checks.
var commonKeywords = func() map[string]struct{} {
	words := []string{
		"if", "else", "for", "while", "do",
		"switch", "case", "break", "continue",
		"return", "true", "false", "nil", "null",
		"void", "int", "char", "bool", "string",
		"func", "var", "const", "type", "struct",
		"interface", "map", "range", "import",
		"package", "defer", "go", "select",
		"chan", "default", "class", "def",
		"self", "this", "new", "delete",
		"try", "catch", "throw", "finally",
		"public", "private", "protected", "static",
		"let", "of", "in", "is",
		"error", "byte", "rune",
	}
	set := make(map[string]struct{}, len(words))
	for _, w := range words {
		set[w] = struct{}{}
	}
	return set
}()

// isCommonKeyword reports whether word, compared case-insensitively,
// is one of the common language keywords above.
func isCommonKeyword(word string) bool {
	_, found := commonKeywords[strings.ToLower(word)]
	return found
}

0 commit comments

Comments
 (0)