Skip to content

Commit 2aa0831

Browse files
committed
feat(refs): tree-sitter identifier-aware symbol references
1 parent 7a6cd7b commit 2aa0831

File tree

3 files changed

+341
-18
lines changed

3 files changed

+341
-18
lines changed

src/core/symbol-references.ts

Lines changed: 112 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@ import path from 'path';
33
import { CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME } from '../constants/codebase-context.js';
44
import { IndexCorruptedError } from '../errors/index.js';
55
import type { UsageLocation } from '../types/index.js';
6+
import { detectLanguage } from '../utils/language-detection.js';
7+
import { findIdentifierOccurrences } from '../utils/tree-sitter.js';
68

79
interface IndexedChunk {
810
content?: unknown;
@@ -59,6 +61,37 @@ function buildPreview(content: string, lineOffset: number): string {
5961
return previewLines.join('\n').trim();
6062
}
6163

64+
/**
 * Builds a short multi-line preview around `line` from a file that has
 * already been split into lines.
 *
 * The visible caller passes a 1-based line number; for such a value the
 * slice covers the line before it, the line itself, and the line after it,
 * clamped to the bounds of `lines`.
 *
 * @param lines - File content split into individual lines.
 * @param line - Line number the preview is centred on.
 * @returns The selected lines joined with '\n', with leading and trailing
 *   whitespace of the joined text trimmed.
 */
function buildPreviewFromFileLines(lines: string[], line: number): string {
65+
// For a 1-based `line`, index `line - 2` is the preceding line; never go below 0.
const start = Math.max(0, line - 2);
66+
// `slice` excludes `end`, so `line + 1` takes in the following line; never run past the array.
const end = Math.min(lines.length, line + 1);
67+
return lines.slice(start, end).join('\n').trim();
68+
}
69+
70+
/**
 * Derives an absolute filesystem path for an indexed chunk.
 *
 * `chunk.filePath` is preferred: it is returned as-is when already absolute,
 * otherwise it is resolved against `rootPath`. When `filePath` is missing,
 * not a string, or blank, `chunk.relativePath` is resolved against `rootPath`
 * instead.
 *
 * @param rootPath - Base directory that relative paths are resolved against.
 * @param chunk - Indexed chunk whose path fields are inspected.
 * @returns The absolute path, or `null` when the chunk carries no usable
 *   non-blank path string.
 */
function resolveAbsoluteChunkPath(rootPath: string, chunk: IndexedChunk): string | null {
71+
// The fields are checked with `typeof` because they may not be strings at runtime.
if (typeof chunk.filePath === 'string' && chunk.filePath.trim()) {
72+
const raw = chunk.filePath.trim();
73+
if (path.isAbsolute(raw)) {
74+
return raw;
75+
}
76+
return path.resolve(rootPath, raw);
77+
}
78+
79+
// Second choice: the chunk's relative path, resolved against the root.
if (typeof chunk.relativePath === 'string' && chunk.relativePath.trim()) {
80+
return path.resolve(rootPath, chunk.relativePath.trim());
81+
}
82+
83+
return null;
84+
}
85+
86+
/**
 * Reports whether `targetPath` refers to an existing file.
 *
 * @param targetPath - Path passed to `fs.stat`.
 * @returns `true` when `fs.stat` succeeds and `isFile()` is true; `false`
 *   when the path is something else (for example a directory) or when
 *   `fs.stat` rejects for any reason.
 */
async function fileExists(targetPath: string): Promise<boolean> {
87+
try {
88+
const stat = await fs.stat(targetPath);
89+
return stat.isFile();
90+
} catch {
91+
// Every stat failure is treated as "no file"; the error itself is discarded.
return false;
92+
}
93+
}
94+
6295
export async function findSymbolReferences(
6396
rootPath: string,
6497
symbol: string,
@@ -110,34 +143,95 @@ export async function findSymbolReferences(
110143
let usageCount = 0;
111144

112145
const escapedSymbol = escapeRegex(normalizedSymbol);
146+
const prefilter = new RegExp(`\\b${escapedSymbol}\\b`);
113147
const matcher = new RegExp(`\\b${escapedSymbol}\\b`, 'g');
114148

149+
// Prefilter candidate files from the keyword index. We do not trust chunk contents for
150+
// exact reference counting when Tree-sitter is available; chunks only guide which files to scan.
151+
const chunksByFile = new Map<
152+
string,
153+
{ relPath: string; absPath: string | null; chunks: IndexedChunk[] }
154+
>();
155+
115156
for (const chunkRaw of chunks) {
116157
const chunk = chunkRaw as IndexedChunk;
117-
if (typeof chunk.content !== 'string') {
118-
continue;
158+
if (typeof chunk.content !== 'string') continue;
159+
if (!prefilter.test(chunk.content)) continue;
160+
161+
const relPath = getUsageFile(rootPath, chunk);
162+
const absPath = resolveAbsoluteChunkPath(rootPath, chunk);
163+
164+
const entry = chunksByFile.get(relPath);
165+
if (entry) {
166+
entry.chunks.push(chunk);
167+
// Prefer a real absolute path when available
168+
if (!entry.absPath && absPath) {
169+
entry.absPath = absPath;
170+
}
171+
} else {
172+
chunksByFile.set(relPath, { relPath, absPath, chunks: [chunk] });
173+
}
174+
}
175+
176+
for (const entry of chunksByFile.values()) {
177+
const relPath = entry.relPath;
178+
const absPath = entry.absPath;
179+
180+
// Preferred: Tree-sitter identifier walk on the real file content.
181+
if (absPath && (await fileExists(absPath))) {
182+
try {
183+
const raw = await fs.readFile(absPath, 'utf-8');
184+
const content = raw.replace(/\r\n/g, '\n');
185+
const language = detectLanguage(absPath);
186+
const occurrences = await findIdentifierOccurrences(content, language, normalizedSymbol);
187+
188+
if (occurrences) {
189+
usageCount += occurrences.length;
190+
191+
if (usages.length < normalizedLimit && occurrences.length > 0) {
192+
const lines = content.split('\n');
193+
for (const occ of occurrences) {
194+
if (usages.length >= normalizedLimit) break;
195+
usages.push({
196+
file: relPath,
197+
line: occ.line,
198+
preview: buildPreviewFromFileLines(lines, occ.line)
199+
});
200+
}
201+
}
202+
203+
continue;
204+
}
205+
} catch {
206+
// Fall through to chunk-regex fallback (missing grammar, parse failure, etc.)
207+
}
119208
}
120209

121-
const chunkContent = chunk.content;
122-
const startLine = typeof chunk.startLine === 'number' ? chunk.startLine : 1;
123-
matcher.lastIndex = 0;
210+
// Fallback: regex scan inside the matched chunks (legacy behavior).
211+
for (const chunk of entry.chunks) {
212+
if (typeof chunk.content !== 'string') continue;
124213

125-
let match: RegExpExecArray | null;
126-
while ((match = matcher.exec(chunkContent)) !== null) {
127-
usageCount += 1;
214+
const chunkContent = chunk.content;
215+
const startLine = typeof chunk.startLine === 'number' ? chunk.startLine : 1;
216+
matcher.lastIndex = 0;
128217

129-
if (usages.length >= normalizedLimit) {
130-
continue;
131-
}
218+
let match: RegExpExecArray | null;
219+
while ((match = matcher.exec(chunkContent)) !== null) {
220+
usageCount += 1;
221+
222+
if (usages.length >= normalizedLimit) {
223+
continue;
224+
}
132225

133-
const prefix = chunkContent.slice(0, match.index);
134-
const lineOffset = prefix.split('\n').length - 1;
226+
const prefix = chunkContent.slice(0, match.index);
227+
const lineOffset = prefix.split('\n').length - 1;
135228

136-
usages.push({
137-
file: getUsageFile(rootPath, chunk),
138-
line: startLine + lineOffset,
139-
preview: buildPreview(chunkContent, lineOffset)
140-
});
229+
usages.push({
230+
file: relPath,
231+
line: startLine + lineOffset,
232+
preview: buildPreview(chunkContent, lineOffset)
233+
});
234+
}
141235
}
142236
}
143237

src/utils/tree-sitter.ts

Lines changed: 135 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -381,3 +381,138 @@ export async function extractTreeSitterSymbols(
381381
return null;
382382
}
383383
}
384+
385+
/**
 * One identifier node whose text matched the searched symbol, as produced by
 * `findIdentifierOccurrences`.
 */
export interface IdentifierOccurrence {
386+
// 1-based line number (the node's start row plus one).
line: number;
387+
// The node's `startIndex` as reported by the parser.
// NOTE(review): unit (bytes vs. UTF-16 code units) depends on the Tree-sitter binding; confirm before slicing with it.
startIndex: number;
388+
// The node's `endIndex` as reported by the parser (same unit as `startIndex`).
endIndex: number;
389+
// The node's Tree-sitter type name, e.g. one of the identifier node types.
nodeType: string;
390+
}
391+
392+
/**
 * Tree-sitter node type names treated as identifier-like.
 * `findIdentifierOccurrences` passes this list to `descendantsOfType` to
 * collect candidate nodes before comparing their text with the symbol.
 */
const IDENTIFIER_NODE_TYPES = [
393+
'identifier',
394+
'type_identifier',
395+
'property_identifier',
396+
'field_identifier',
397+
'shorthand_property_identifier_pattern',
398+
'shorthand_property_identifier',
399+
'jsx_identifier',
400+
'scoped_identifier'
401+
] as const;
402+
403+
/**
 * Substrings of Tree-sitter node type names that mark a non-code region.
 * A node counts as non-code when the type of the node itself, or of any
 * ancestor, contains one of these fragments.
 */
const NON_CODE_ANCESTOR_TYPE_FRAGMENTS = [
  'comment',
  'string',
  'template_string',
  'regex',
  'jsx_text'
] as const;

/**
 * Node types that embed executable code inside a string literal, such as
 * `${expr}` in a JavaScript/TypeScript template literal (`template_substitution`)
 * or `{expr}` / `#{expr}` in Python f-strings, Ruby and C# strings (`interpolation`).
 * Identifiers below such a node are real code references even though a
 * string node sits further up the tree.
 */
const EMBEDDED_CODE_NODE_TYPES: readonly string[] = ['template_substitution', 'interpolation'];

/** Upper bound on how many ancestors are inspected before giving up. */
const MAX_ANCESTOR_DEPTH = 40;

/**
 * Minimal structural view of a syntax node needed for the ancestor walk.
 * Tree-sitter nodes satisfy it, so existing callers are unaffected.
 */
interface AncestorWalkNode {
  readonly type: string;
  readonly parent: AncestorWalkNode | null;
}

/**
 * Decides whether a syntax node lies inside a comment, string, regex or JSX
 * text, i.e. somewhere a textual match is not a code reference.
 *
 * The walk starts at `node` and moves up through its parents. Reaching an
 * embedded-code node (template substitution / string interpolation) first
 * means the identifier is code, so the walk stops with `false` instead of
 * carrying on to the enclosing string and misclassifying it.
 *
 * @param node - Node to classify; only `type` and `parent` are read.
 * @returns `true` when a non-code ancestor is found before any embedded-code
 *   node; `false` otherwise, including when the depth limit is reached.
 */
function isInsideNonCodeContext(node: AncestorWalkNode): boolean {
  let cursor: AncestorWalkNode | null = node;

  for (let depth = 0; cursor && depth < MAX_ANCESTOR_DEPTH; depth += 1) {
    const cursorType = cursor.type;

    // Code embedded in a string is still code: do not look at the enclosing string.
    if (EMBEDDED_CODE_NODE_TYPES.includes(cursorType)) {
      return false;
    }

    if (NON_CODE_ANCESTOR_TYPE_FRAGMENTS.some((fragment) => cursorType.includes(fragment))) {
      return true;
    }

    cursor = cursor.parent;
  }

  return false;
}
426+
427+
/**
428+
* Find identifier occurrences of `symbol` in `content` using Tree-sitter.
429+
* Returns null when Tree-sitter isn't available/supported, so callers can fall back safely.
*
* Only named nodes of the types listed in IDENTIFIER_NODE_TYPES whose text
* equals the trimmed symbol exactly are reported; nodes for which
* isInsideNonCodeContext returns true are skipped.
*
* @param content - Source text to parse.
* @param language - Language key passed to supportsTreeSitter and getParserForLanguage.
* @param symbol - Symbol to look for; surrounding whitespace is ignored.
* @returns An array sorted by line, then start index; an empty array when the
*   symbol is blank; or null when the language is unsupported, the content is
*   blank or larger than MAX_TREE_SITTER_PARSE_BYTES, parsing yields no tree,
*   the tree contains errors, or any step throws.
430+
*/
431+
export async function findIdentifierOccurrences(
432+
content: string,
433+
language: string,
434+
symbol: string
435+
): Promise<IdentifierOccurrence[] | null> {
436+
const normalizedSymbol = symbol.trim();
437+
// A blank symbol can match nothing; this is a definite "no occurrences", not a fallback signal.
if (!normalizedSymbol) {
438+
return [];
439+
}
440+
441+
if (!supportsTreeSitter(language) || !content.trim()) {
442+
return null;
443+
}
444+
445+
// The size limit is applied to the UTF-8 byte length, not the string length.
if (Buffer.byteLength(content, 'utf8') > MAX_TREE_SITTER_PARSE_BYTES) {
446+
return null;
447+
}
448+
449+
try {
450+
const parser = await getParserForLanguage(language);
451+
setParseTimeout(parser);
452+
453+
let tree: ReturnType<Parser['parse']>;
454+
try {
455+
tree = parser.parse(content);
456+
} catch (error) {
457+
// NOTE(review): presumably drops the cached parser so a broken instance is not reused; confirm in evictParser.
evictParser(language, parser);
458+
// Rethrown so the outer catch logs (in debug mode) and returns null.
throw error;
459+
}
460+
461+
if (!tree) {
462+
evictParser(language, parser);
463+
return null;
464+
}
465+
466+
try {
467+
// `hasError` is read as unknown and handled both as a method and as a plain value.
const hasErrorValue = tree.rootNode.hasError as unknown;
468+
const rootHasError =
469+
typeof hasErrorValue === 'function'
470+
? Boolean((hasErrorValue as () => unknown)())
471+
: Boolean(hasErrorValue);
472+
473+
// A tree with syntax errors is not used for counting; report "unavailable" instead.
if (rootHasError) {
474+
return null;
475+
}
476+
477+
// Spread into a fresh array because IDENTIFIER_NODE_TYPES is a readonly tuple.
const nodes = tree.rootNode.descendantsOfType([...IDENTIFIER_NODE_TYPES]);
478+
const occurrences: IdentifierOccurrence[] = [];
479+
const seen = new Set<string>();
480+
481+
for (const node of nodes) {
482+
if (!node || !node.isNamed) continue;
483+
// Exact text comparison: no substring or case-insensitive matching.
if (node.text !== normalizedSymbol) continue;
484+
if (isInsideNonCodeContext(node)) continue;
485+
486+
const occ: IdentifierOccurrence = {
487+
// Rows are zero-based in the tree; reported lines are 1-based.
line: node.startPosition.row + 1,
488+
startIndex: node.startIndex,
489+
endIndex: node.endIndex,
490+
nodeType: node.type
491+
};
492+
// Guards against the same node position and type being reported twice.
const key = `${occ.line}:${occ.startIndex}:${occ.endIndex}:${occ.nodeType}`;
493+
if (seen.has(key)) continue;
494+
seen.add(key);
495+
occurrences.push(occ);
496+
}
497+
498+
occurrences.sort((a, b) => {
499+
if (a.line !== b.line) return a.line - b.line;
500+
return a.startIndex - b.startIndex;
501+
});
502+
503+
return occurrences;
504+
} finally {
505+
// Runs on every exit path above, including the early `return null` for error trees.
tree.delete();
506+
}
507+
} catch (error) {
508+
evictParser(language);
509+
510+
if (isTreeSitterDebugEnabled()) {
511+
console.error(
512+
`[DEBUG] Tree-sitter identifier occurrence scan failed for '${language}':`,
513+
error instanceof Error ? error.message : String(error)
514+
);
515+
}
516+
// Failures are reported as "unavailable" rather than thrown.
return null;
517+
}
518+
}

0 commit comments

Comments
 (0)