Skip to content

Commit c23ffec

Browse files
authored
Merge pull request #49 from PatrickSys/feat/phase2-symbol-refs-treesitter
feat(refs): Tree-sitter identifier-aware symbol references
2 parents 7a6cd7b + 1735e3c commit c23ffec

File tree

3 files changed

+377
-21
lines changed

3 files changed

+377
-21
lines changed

src/core/symbol-references.ts

Lines changed: 121 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ import path from 'path';
33
import { CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME } from '../constants/codebase-context.js';
44
import { IndexCorruptedError } from '../errors/index.js';
55
import type { UsageLocation } from '../types/index.js';
6+
import { detectLanguage } from '../utils/language-detection.js';
7+
import { findIdentifierOccurrences } from '../utils/tree-sitter.js';
68

79
interface IndexedChunk {
810
content?: unknown;
@@ -59,6 +61,46 @@ function buildPreview(content: string, lineOffset: number): string {
5961
return previewLines.join('\n').trim();
6062
}
6163

64+
/**
 * Build a short preview around an occurrence in a file that has already been split into lines.
 *
 * The window is the line before the target, the target itself and the line after it, clamped
 * to the bounds of `lines`. The window is joined with '\n' and trimmed.
 *
 * @param lines - File content split into individual lines.
 * @param line - 1-based line number of the occurrence.
 * @returns The trimmed preview text, or '' when the window falls outside `lines`.
 */
function buildPreviewFromFileLines(lines: string[], line: number): string {
  // `line` is 1-based: index `line - 1` is the target, so `line - 2` is the line before it.
  const start = Math.max(0, line - 2);
  // `slice` excludes `end`, so `line + 1` keeps exactly one line after the target.
  const end = Math.min(lines.length, line + 1);
  return lines.slice(start, end).join('\n').trim();
}
69+
70+
/**
 * Resolve the on-disk location of an indexed chunk, refusing anything outside `rootPath`.
 *
 * `chunk.filePath` is consulted first (absolute, or relative to the root); `chunk.relativePath`
 * is only consulted when `filePath` is missing or blank. A candidate that resolves to the root
 * itself or to a location outside the root yields `null`.
 *
 * @param rootPath - Project root; resolved to an absolute path before any comparison.
 * @param chunk - Indexed chunk whose `filePath` / `relativePath` fields are read.
 * @returns An absolute path strictly inside the root, or `null` when none can be derived.
 */
function resolveAbsoluteChunkPath(rootPath: string, chunk: IndexedChunk): string | null {
  const resolvedRoot = path.resolve(rootPath);

  const isWithinRoot = (candidate: string): boolean => {
    const relative = path.relative(resolvedRoot, path.resolve(candidate));
    // '' means the candidate IS the root; an absolute result means no relative route exists.
    if (!relative || path.isAbsolute(relative)) {
      return false;
    }
    // Only a leading `..` *segment* escapes the root. A plain prefix test would also reject
    // legitimate children whose name merely begins with two dots (e.g. `..cache/a.ts`).
    return relative !== '..' && !relative.startsWith(`..${path.sep}`);
  };

  if (typeof chunk.filePath === 'string' && chunk.filePath.trim()) {
    const raw = chunk.filePath.trim();
    if (path.isAbsolute(raw)) {
      return isWithinRoot(raw) ? raw : null;
    }
    const resolved = path.resolve(resolvedRoot, raw);
    return isWithinRoot(resolved) ? resolved : null;
  }

  if (typeof chunk.relativePath === 'string' && chunk.relativePath.trim()) {
    const resolved = path.resolve(resolvedRoot, chunk.relativePath.trim());
    return isWithinRoot(resolved) ? resolved : null;
  }

  return null;
}
94+
95+
/**
 * Report whether `targetPath` refers to a regular file.
 *
 * @param targetPath - Path handed to `fs.stat`.
 * @returns `true` only when the stat succeeds and `isFile()` is true; `false` for directories
 *   and for any stat failure.
 */
async function fileExists(targetPath: string): Promise<boolean> {
  try {
    const stat = await fs.stat(targetPath);
    return stat.isFile();
  } catch {
    // Best-effort probe: any stat error is reported as "not a file" rather than propagated.
    return false;
  }
}
103+
62104
export async function findSymbolReferences(
63105
rootPath: string,
64106
symbol: string,
@@ -110,34 +152,95 @@ export async function findSymbolReferences(
110152
let usageCount = 0;
111153

112154
const escapedSymbol = escapeRegex(normalizedSymbol);
155+
const prefilter = new RegExp(`\\b${escapedSymbol}\\b`);
113156
const matcher = new RegExp(`\\b${escapedSymbol}\\b`, 'g');
114157

158+
// Prefilter candidate files from the keyword index. We do not trust chunk contents for
159+
// exact reference counting when Tree-sitter is available; chunks only guide which files to scan.
160+
const chunksByFile = new Map<
161+
string,
162+
{ relPath: string; absPath: string | null; chunks: IndexedChunk[] }
163+
>();
164+
115165
for (const chunkRaw of chunks) {
116166
const chunk = chunkRaw as IndexedChunk;
117-
if (typeof chunk.content !== 'string') {
118-
continue;
167+
if (typeof chunk.content !== 'string') continue;
168+
if (!prefilter.test(chunk.content)) continue;
169+
170+
const relPath = getUsageFile(rootPath, chunk);
171+
const absPath = resolveAbsoluteChunkPath(rootPath, chunk);
172+
173+
const entry = chunksByFile.get(relPath);
174+
if (entry) {
175+
entry.chunks.push(chunk);
176+
// Prefer a real absolute path when available
177+
if (!entry.absPath && absPath) {
178+
entry.absPath = absPath;
179+
}
180+
} else {
181+
chunksByFile.set(relPath, { relPath, absPath, chunks: [chunk] });
119182
}
183+
}
120184

121-
const chunkContent = chunk.content;
122-
const startLine = typeof chunk.startLine === 'number' ? chunk.startLine : 1;
123-
matcher.lastIndex = 0;
185+
for (const entry of chunksByFile.values()) {
186+
const relPath = entry.relPath;
187+
const absPath = entry.absPath;
188+
189+
// Preferred: Tree-sitter identifier walk on the real file content.
190+
if (absPath && (await fileExists(absPath))) {
191+
try {
192+
const raw = await fs.readFile(absPath, 'utf-8');
193+
const content = raw.replace(/\r\n/g, '\n');
194+
const language = detectLanguage(absPath);
195+
const occurrences = await findIdentifierOccurrences(content, language, normalizedSymbol);
196+
197+
if (occurrences) {
198+
usageCount += occurrences.length;
199+
200+
if (usages.length < normalizedLimit && occurrences.length > 0) {
201+
const lines = content.split('\n');
202+
for (const occ of occurrences) {
203+
if (usages.length >= normalizedLimit) break;
204+
usages.push({
205+
file: relPath,
206+
line: occ.line,
207+
preview: buildPreviewFromFileLines(lines, occ.line)
208+
});
209+
}
210+
}
211+
212+
continue;
213+
}
214+
} catch {
215+
// Fall through to chunk-regex fallback (missing grammar, parse failure, etc.)
216+
}
217+
}
124218

125-
let match: RegExpExecArray | null;
126-
while ((match = matcher.exec(chunkContent)) !== null) {
127-
usageCount += 1;
219+
// Fallback: regex scan inside the matched chunks (legacy behavior).
220+
for (const chunk of entry.chunks) {
221+
if (typeof chunk.content !== 'string') continue;
128222

129-
if (usages.length >= normalizedLimit) {
130-
continue;
131-
}
223+
const chunkContent = chunk.content;
224+
const startLine = typeof chunk.startLine === 'number' ? chunk.startLine : 1;
225+
matcher.lastIndex = 0;
132226

133-
const prefix = chunkContent.slice(0, match.index);
134-
const lineOffset = prefix.split('\n').length - 1;
227+
let match: RegExpExecArray | null;
228+
while ((match = matcher.exec(chunkContent)) !== null) {
229+
usageCount += 1;
135230

136-
usages.push({
137-
file: getUsageFile(rootPath, chunk),
138-
line: startLine + lineOffset,
139-
preview: buildPreview(chunkContent, lineOffset)
140-
});
231+
if (usages.length >= normalizedLimit) {
232+
continue;
233+
}
234+
235+
const prefix = chunkContent.slice(0, match.index);
236+
const lineOffset = prefix.split('\n').length - 1;
237+
238+
usages.push({
239+
file: relPath,
240+
line: startLine + lineOffset,
241+
preview: buildPreview(chunkContent, lineOffset)
242+
});
243+
}
141244
}
142245
}
143246

src/utils/tree-sitter.ts

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,3 +381,138 @@ export async function extractTreeSitterSymbols(
381381
return null;
382382
}
383383
}
384+
385+
/** A single identifier node whose text matched the requested symbol. */
export interface IdentifierOccurrence {
  /** 1-based line number (the node's start row plus one). */
  line: number;
  /** Start offset of the node, as reported by the syntax node. */
  startIndex: number;
  /** End offset of the node, as reported by the syntax node. */
  endIndex: number;
  /** Syntax node type of the match, e.g. `identifier` or `type_identifier`. */
  nodeType: string;
}
391+
392+
/**
 * Syntax node types that are collected as identifier candidates.
 * A candidate only counts as an occurrence when its text equals the symbol being searched.
 */
const IDENTIFIER_NODE_TYPES = [
  'identifier',
  'type_identifier',
  'property_identifier',
  'field_identifier',
  'shorthand_property_identifier_pattern',
  'shorthand_property_identifier',
  'jsx_identifier',
  'scoped_identifier'
] as const;
402+
403+
/**
 * Substrings of node type names that mark a non-code region. Matching is by substring, so
 * e.g. `line_comment` and `string_fragment` are covered by `comment` and `string`.
 */
const NON_CODE_ANCESTOR_TYPE_FRAGMENTS = [
  'comment',
  'string',
  'template_string',
  'regex',
  'jsx_text'
] as const;

/**
 * Substrings of node type names that mark code embedded inside a literal, such as
 * `template_substitution` (`${expr}` in JS/TS) or `interpolation` (f-strings and similar).
 * Everything below such a node is real code even though a string node encloses it.
 */
const EMBEDDED_CODE_TYPE_FRAGMENTS = ['substitution', 'interpolation'] as const;

/**
 * Decide whether `node` sits inside a comment, string, regex or JSX text region.
 *
 * Walks from the node up through its ancestors, at most 40 levels. The walk stops with `false`
 * as soon as an interpolation boundary is reached, so an identifier inside `${...}` is treated
 * as code rather than as part of the enclosing string.
 *
 * @param node - Syntax node to classify; only its `type` and `parent` chain are read.
 * @returns `true` when a non-code ancestor is found before any interpolation boundary.
 */
function isInsideNonCodeContext(node: Node): boolean {
  let cursor: Node | null = node;
  let depth = 0;
  // The depth cap bounds the walk on very deep trees.
  while (cursor && depth < 40) {
    const cursorType = cursor.type;
    // Check the interpolation boundary first: everything under it is executable code.
    if (EMBEDDED_CODE_TYPE_FRAGMENTS.some((fragment) => cursorType.includes(fragment))) {
      return false;
    }
    if (NON_CODE_ANCESTOR_TYPE_FRAGMENTS.some((fragment) => cursorType.includes(fragment))) {
      return true;
    }
    cursor = cursor.parent;
    depth += 1;
  }
  return false;
}
426+
427+
/**
 * Find identifier occurrences of `symbol` in `content` using Tree-sitter.
 *
 * Returns `null` whenever a trustworthy answer cannot be produced, so callers can fall back
 * safely: unsupported language, blank content, content larger than
 * `MAX_TREE_SITTER_PARSE_BYTES`, a parse that yields no tree, a tree whose root reports a
 * syntax error, or any thrown error.
 *
 * @param content - Full source text to parse.
 * @param language - Language key passed to `supportsTreeSitter` / `getParserForLanguage`.
 * @param symbol - Identifier text to match exactly; trimmed before comparison.
 * @returns Occurrences sorted by line then start index; `[]` when `symbol` is blank;
 *   `null` when Tree-sitter cannot be used for this input.
 */
export async function findIdentifierOccurrences(
  content: string,
  language: string,
  symbol: string
): Promise<IdentifierOccurrence[] | null> {
  const normalizedSymbol = symbol.trim();
  if (!normalizedSymbol) {
    // Nothing to search for: a definitive "no matches", not a fallback signal.
    return [];
  }

  if (!supportsTreeSitter(language) || !content.trim()) {
    return null;
  }

  if (Buffer.byteLength(content, 'utf8') > MAX_TREE_SITTER_PARSE_BYTES) {
    return null;
  }

  try {
    const parser = await getParserForLanguage(language);
    setParseTimeout(parser);

    let tree: ReturnType<Parser['parse']>;
    try {
      tree = parser.parse(content);
    } catch (error) {
      // NOTE(review): presumably drops the cached parser so a later call gets a fresh one.
      evictParser(language, parser);
      throw error;
    }

    if (!tree) {
      evictParser(language, parser);
      return null;
    }

    try {
      // `hasError` is read defensively: it may be exposed as a method or as a plain property.
      const hasErrorValue = tree.rootNode.hasError as unknown;
      const rootHasError =
        typeof hasErrorValue === 'function'
          ? Boolean((hasErrorValue as () => unknown)())
          : Boolean(hasErrorValue);

      if (rootHasError) {
        // A tree with syntax errors is not trusted; let the caller fall back.
        return null;
      }

      const nodes = tree.rootNode.descendantsOfType([...IDENTIFIER_NODE_TYPES]);
      const occurrences: IdentifierOccurrence[] = [];
      // Guards against the same node being reported more than once.
      const seen = new Set<string>();

      for (const node of nodes) {
        if (!node || !node.isNamed) continue;
        if (node.text !== normalizedSymbol) continue;
        if (isInsideNonCodeContext(node)) continue;

        const occ: IdentifierOccurrence = {
          line: node.startPosition.row + 1,
          startIndex: node.startIndex,
          endIndex: node.endIndex,
          nodeType: node.type
        };
        const key = `${occ.line}:${occ.startIndex}:${occ.endIndex}:${occ.nodeType}`;
        if (seen.has(key)) continue;
        seen.add(key);
        occurrences.push(occ);
      }

      occurrences.sort((a, b) => {
        if (a.line !== b.line) return a.line - b.line;
        return a.startIndex - b.startIndex;
      });

      return occurrences;
    } finally {
      // Release the tree on every exit path.
      tree.delete();
    }
  } catch (error) {
    evictParser(language);

    if (isTreeSitterDebugEnabled()) {
      console.error(
        `[DEBUG] Tree-sitter identifier occurrence scan failed for '${language}':`,
        error instanceof Error ? error.message : String(error)
      );
    }
    return null;
  }
}

0 commit comments

Comments
 (0)