|
| 1 | +package match |
| 2 | + |
| 3 | +import ( |
| 4 | + "go/ast" |
| 5 | + "go/parser" |
| 6 | + "go/token" |
| 7 | + "regexp" |
| 8 | + "strings" |
| 9 | +) |
| 10 | + |
// Package-level extraction patterns, compiled once at init time.
// Each pattern's first capture group is the identifier to record.
var (
	// C identifiers.
	cFuncRe   = regexp.MustCompile(`\b(\w+)\s*\([^)]*\)\s*\{`)                                                                                            // function definitions: name(args) {
	cVarRe    = regexp.MustCompile(`\b(?:int|char|long|short|unsigned|signed|float|double|void|size_t|ssize_t|uint\d*_t|int\d*_t|bool|const)\s+\*?\s*(\w+)`) // declarations introduced by a C type keyword
	cStructRe = regexp.MustCompile(`\b(?:struct|enum|union|typedef)\s+(\w+)`)                                                                             // aggregate/typedef names
	cDefineRe = regexp.MustCompile(`#define\s+(\w+)`)                                                                                                     // preprocessor macro names
	cMacroRe  = regexp.MustCompile(`\b([A-Z][A-Z0-9_]{2,})\b`)                                                                                            // ALL_CAPS tokens (likely macros/constants)

	// Python identifiers.
	pyFuncRe  = regexp.MustCompile(`\bdef\s+(\w+)\s*\(`) // function definitions
	pyClassRe = regexp.MustCompile(`\bclass\s+(\w+)`)    // class definitions (also reused for JS/TS classes)
	pyVarRe   = regexp.MustCompile(`\b(\w+)\s*=\s*`)     // assignment targets

	// Generic identifier patterns for other languages (JS/TS).
	genericFuncRe = regexp.MustCompile(`\bfunction\s+(\w+)`)
	genericVarRe  = regexp.MustCompile(`\b(?:let|const|var)\s+(\w+)`)

	// Matches any identifier-like token: camelCase, PascalCase, snake_case, UPPER_CASE.
	// Must start with a letter and be at least 2 chars.
	identifierTokenRe = regexp.MustCompile(`\b([a-zA-Z][a-zA-Z0-9_]{1,})\b`)
)
| 32 | + |
| 33 | +// GetIdentifiersFromContent extracts function names, variable names, type names, |
| 34 | +// and other identifiers from code content. It auto-detects the language based on |
| 35 | +// the file extension. For Go files, it uses the go/ast parser for accurate extraction. |
| 36 | +func GetIdentifiersFromContent(filename, content string) []string { |
| 37 | + if strings.HasSuffix(filename, ".go") { |
| 38 | + return getGoIdentifiers(content) |
| 39 | + } |
| 40 | + return getRegexIdentifiers(filename, content) |
| 41 | +} |
| 42 | + |
| 43 | +// getGoIdentifiers uses go/ast to extract all identifiers from Go source. |
| 44 | +// Since the input is often a diff (only added lines), it tries multiple |
| 45 | +// strategies to parse the content, then falls back to generic token extraction. |
| 46 | +func getGoIdentifiers(content string) []string { |
| 47 | + // Try parsing as-is first (complete file) |
| 48 | + if ids := parseGoSource(content); len(ids) > 0 { |
| 49 | + return ids |
| 50 | + } |
| 51 | + |
| 52 | + // Diff content often has "+" prefixes from git — strip them |
| 53 | + stripped := stripDiffPrefixes(content) |
| 54 | + if ids := parseGoSource(stripped); len(ids) > 0 { |
| 55 | + return ids |
| 56 | + } |
| 57 | + |
| 58 | + // Wrap in a synthetic file to parse partial code (e.g. function bodies) |
| 59 | + wrapped := "package _x\nfunc _() {\n" + stripped + "\n}" |
| 60 | + if ids := parseGoSource(wrapped); len(ids) > 0 { |
| 61 | + return ids |
| 62 | + } |
| 63 | + |
| 64 | + // Wrap as top-level declarations (e.g. type/var/const blocks) |
| 65 | + wrapped = "package _x\n" + stripped |
| 66 | + if ids := parseGoSource(wrapped); len(ids) > 0 { |
| 67 | + return ids |
| 68 | + } |
| 69 | + |
| 70 | + // AST failed — extract all identifier-like tokens |
| 71 | + return extractAllTokens(stripped) |
| 72 | +} |
| 73 | + |
// stripDiffPrefixes removes a single leading "+" (git's added-line marker)
// from every line of content. Lines without the marker are kept verbatim;
// the result always ends with a trailing newline per input line.
func stripDiffPrefixes(content string) string {
	var out strings.Builder
	for _, line := range strings.Split(content, "\n") {
		// TrimPrefix drops at most one leading "+", matching CutPrefix semantics.
		out.WriteString(strings.TrimPrefix(line, "+"))
		out.WriteByte('\n')
	}
	return out.String()
}
| 86 | + |
| 87 | +// parseGoSource parses Go source and collects every ast.Ident node. |
| 88 | +// It accepts partial ASTs (parser may return a usable tree even with errors). |
| 89 | +func parseGoSource(src string) []string { |
| 90 | + fset := token.NewFileSet() |
| 91 | + // SkipObjectResolution is faster and we don't need resolved objects. |
| 92 | + // Even with parse errors, the parser may return a partial AST — use it. |
| 93 | + f, _ := parser.ParseFile(fset, "", src, parser.SkipObjectResolution) |
| 94 | + if f == nil { |
| 95 | + return nil |
| 96 | + } |
| 97 | + |
| 98 | + seen := map[string]struct{}{} |
| 99 | + ast.Inspect(f, func(n ast.Node) bool { |
| 100 | + switch node := n.(type) { |
| 101 | + case *ast.Ident: |
| 102 | + addIdent(seen, node.Name) |
| 103 | + case *ast.Field: |
| 104 | + // Extract identifiers from struct tags |
| 105 | + if node.Tag != nil { |
| 106 | + extractTagIdentifiers(seen, node.Tag.Value) |
| 107 | + } |
| 108 | + case *ast.BasicLit: |
| 109 | + // Extract identifier-like tokens from string literals |
| 110 | + // (catches words inside regex patterns, format strings, etc.) |
| 111 | + if node.Kind == token.STRING { |
| 112 | + extractStringLiteralTokens(seen, node.Value) |
| 113 | + } |
| 114 | + } |
| 115 | + return true |
| 116 | + }) |
| 117 | + |
| 118 | + if len(seen) == 0 { |
| 119 | + return nil |
| 120 | + } |
| 121 | + |
| 122 | + result := make([]string, 0, len(seen)) |
| 123 | + for word := range seen { |
| 124 | + result = append(result, word) |
| 125 | + } |
| 126 | + return result |
| 127 | +} |
| 128 | + |
| 129 | +func addIdent(seen map[string]struct{}, name string) { |
| 130 | + if name == "" || name == "_" || len(name) < 2 { |
| 131 | + return |
| 132 | + } |
| 133 | + if isCommonKeyword(name) { |
| 134 | + return |
| 135 | + } |
| 136 | + lower := strings.ToLower(name) |
| 137 | + seen[lower] = struct{}{} |
| 138 | + // Also add underscore/hyphen-split parts and camelCase parts, |
| 139 | + // since aspell's checkSingle splits words the same way. |
| 140 | + for _, part := range strings.FieldsFunc(lower, func(r rune) bool { |
| 141 | + return r == '_' || r == '-' |
| 142 | + }) { |
| 143 | + if len(part) >= 2 && !isCommonKeyword(part) { |
| 144 | + seen[part] = struct{}{} |
| 145 | + } |
| 146 | + } |
| 147 | +} |
| 148 | + |
| 149 | +// regexEscapeRe matches common regex escape sequences (\b, \s, \w, \d, \n, \t, etc.) |
| 150 | +var regexEscapeRe = regexp.MustCompile(`\\[bBsSwWdDntrfvpP]`) |
| 151 | + |
| 152 | +// extractStringLiteralTokens extracts identifier-like tokens from Go string |
| 153 | +// literals. Regex escape sequences are stripped first so that e.g. `\bclass` |
| 154 | +// yields `class` instead of `bclass`. |
| 155 | +func extractStringLiteralTokens(seen map[string]struct{}, lit string) { |
| 156 | + // Remove Go string delimiters |
| 157 | + lit = strings.Trim(lit, "\"`") |
| 158 | + // Extract from raw content (aspell sees e.g. "bclass" from "\bclass") |
| 159 | + for _, m := range identifierTokenRe.FindAllString(lit, -1) { |
| 160 | + addIdent(seen, m) |
| 161 | + } |
| 162 | + // Also extract after stripping regex escapes (gets the real words like "class") |
| 163 | + cleaned := regexEscapeRe.ReplaceAllString(lit, " ") |
| 164 | + for _, m := range identifierTokenRe.FindAllString(cleaned, -1) { |
| 165 | + addIdent(seen, m) |
| 166 | + } |
| 167 | +} |
| 168 | + |
| 169 | +func extractTagIdentifiers(seen map[string]struct{}, tag string) { |
| 170 | + tag = strings.Trim(tag, "`") |
| 171 | + tagRe := regexp.MustCompile(`\w+:"([^"]*)"`) |
| 172 | + for _, m := range tagRe.FindAllStringSubmatch(tag, -1) { |
| 173 | + if len(m) > 1 { |
| 174 | + parts := strings.SplitN(m[1], ",", 2) |
| 175 | + if parts[0] != "" && parts[0] != "-" { |
| 176 | + addIdent(seen, parts[0]) |
| 177 | + } |
| 178 | + } |
| 179 | + } |
| 180 | +} |
| 181 | + |
| 182 | +// extractAllTokens pulls every identifier-like token from the text. |
| 183 | +// Used as a last resort when AST parsing fails completely. |
| 184 | +func extractAllTokens(content string) []string { |
| 185 | + seen := map[string]struct{}{} |
| 186 | + for _, m := range identifierTokenRe.FindAllString(content, -1) { |
| 187 | + addIdent(seen, m) |
| 188 | + } |
| 189 | + result := make([]string, 0, len(seen)) |
| 190 | + for word := range seen { |
| 191 | + result = append(result, word) |
| 192 | + } |
| 193 | + return result |
| 194 | +} |
| 195 | + |
| 196 | +func getRegexIdentifiers(filename, content string) []string { |
| 197 | + seen := map[string]struct{}{} |
| 198 | + |
| 199 | + var patterns []*regexp.Regexp |
| 200 | + switch { |
| 201 | + case strings.HasSuffix(filename, ".c") || strings.HasSuffix(filename, ".h"): |
| 202 | + patterns = []*regexp.Regexp{cFuncRe, cVarRe, cStructRe, cDefineRe, cMacroRe} |
| 203 | + case strings.HasSuffix(filename, ".py"): |
| 204 | + patterns = []*regexp.Regexp{pyFuncRe, pyClassRe, pyVarRe} |
| 205 | + case strings.HasSuffix(filename, ".js") || strings.HasSuffix(filename, ".ts") || |
| 206 | + strings.HasSuffix(filename, ".jsx") || strings.HasSuffix(filename, ".tsx"): |
| 207 | + patterns = []*regexp.Regexp{genericFuncRe, genericVarRe, pyClassRe} |
| 208 | + default: |
| 209 | + // For unknown languages, extract all identifier-like tokens |
| 210 | + return extractAllTokens(content) |
| 211 | + } |
| 212 | + |
| 213 | + for _, re := range patterns { |
| 214 | + for _, m := range re.FindAllStringSubmatch(content, -1) { |
| 215 | + for i := 1; i < len(m); i++ { |
| 216 | + addIdent(seen, m[i]) |
| 217 | + } |
| 218 | + } |
| 219 | + } |
| 220 | + |
| 221 | + result := make([]string, 0, len(seen)) |
| 222 | + for word := range seen { |
| 223 | + result = append(result, word) |
| 224 | + } |
| 225 | + return result |
| 226 | +} |
| 227 | + |
// commonKeywords is the set of lowercase reserved words across the supported
// languages (Go, C, Python, JS/TS) that should never be treated as
// project-specific identifiers.
var commonKeywords = func() map[string]struct{} {
	words := []string{
		"if", "else", "for", "while", "do",
		"switch", "case", "break", "continue",
		"return", "true", "false", "nil", "null",
		"void", "int", "char", "bool", "string",
		"func", "var", "const", "type", "struct",
		"interface", "map", "range", "import",
		"package", "defer", "go", "select",
		"chan", "default", "class", "def",
		"self", "this", "new", "delete",
		"try", "catch", "throw", "finally",
		"public", "private", "protected", "static",
		"let", "of", "in", "is",
		"error", "byte", "rune",
	}
	set := make(map[string]struct{}, len(words))
	for _, w := range words {
		set[w] = struct{}{}
	}
	return set
}()

// isCommonKeyword reports whether word is a reserved word in one of the
// supported languages, compared case-insensitively.
func isCommonKeyword(word string) bool {
	_, ok := commonKeywords[strings.ToLower(word)]
	return ok
}
0 commit comments