Skip to content

Commit 9eafa5c

Browse files
committed
fix: make exclude patterns recursive and share via single constant
The indexer's exclude patterns were non-recursive (e.g. `coverage/**`), only matching at the project root. Nested occurrences in monorepo packages and worktrees passed through, polluting the index with generated artifacts and worktree copies.

- Extract EXCLUDED_DIRECTORY_NAMES and EXCLUDED_GLOB_PATTERNS into src/constants/codebase-context.ts as the single source of truth
- Indexer, file-watcher, and project-discovery all import from there
- Add missing directories: .cache, .claude, .planning, worktrees, target, vendor, .nx, .turbo, .next, build
- Add integration test reproducing the consumer audit failure case (nested coverage/, .claude/worktrees/, worktrees/, dist/)
1 parent edb1350 commit 9eafa5c

File tree

5 files changed

+132
-36
lines changed

5 files changed

+132
-36
lines changed

src/constants/codebase-context.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,36 @@ export const INDEXING_STATS_FILENAME = 'indexing-stats.json' as const;
2525
export const VECTOR_DB_DIRNAME = 'index' as const;
2626
export const MANIFEST_FILENAME = 'manifest.json' as const;
2727
export const RELATIONSHIPS_FILENAME = 'relationships.json' as const;
28+
29+
/**
30+
* Directories excluded from indexing, file-watching, and project discovery.
31+
* Single source of truth — all three consumers import from here.
32+
*/
33+
export const EXCLUDED_DIRECTORY_NAMES = [
34+
'.cache',
35+
'.claude',
36+
'.codebase-context',
37+
'.git',
38+
'.next',
39+
'.nx',
40+
'.planning',
41+
'.turbo',
42+
'build',
43+
'coverage',
44+
'dist',
45+
'node_modules',
46+
'target',
47+
'vendor',
48+
'worktrees'
49+
] as const;
50+
51+
/** Glob patterns that match excluded directories at any nesting depth. */
52+
export const EXCLUDED_GLOB_PATTERNS: string[] = EXCLUDED_DIRECTORY_NAMES.map(
53+
(dir) => `**/${dir}/**`
54+
);
55+
56+
/**
57+
* Additional directories skipped only during project discovery (not generated
58+
* code, just not useful roots to recurse into).
59+
*/
60+
export const DISCOVERY_ONLY_IGNORED = ['.hg', '.nuxt', '.svn', '.venv', '.yarn', 'out', 'tmp'] as const;

src/core/file-watcher.ts

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import chokidar from 'chokidar';
22
import path from 'path';
3+
import { EXCLUDED_GLOB_PATTERNS } from '../constants/codebase-context.js';
34
import { getSupportedExtensions } from '../utils/language-detection.js';
45

56
export interface FileWatcherOptions {
@@ -43,18 +44,7 @@ export function startFileWatcher(opts: FileWatcherOptions): () => void {
4344
};
4445

4546
const watcher = chokidar.watch(rootPath, {
46-
ignored: [
47-
'**/node_modules/**',
48-
'**/.codebase-context/**',
49-
'**/.git/**',
50-
'**/dist/**',
51-
'**/.nx/**',
52-
'**/.planning/**',
53-
'**/coverage/**',
54-
'**/.turbo/**',
55-
'**/.next/**',
56-
'**/.cache/**'
57-
],
47+
ignored: [...EXCLUDED_GLOB_PATTERNS],
5848
persistent: true,
5949
ignoreInitial: true,
6050
awaitWriteFinish: { stabilityThreshold: 200, pollInterval: 100 }

src/core/indexer.ts

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import { mergeSmallChunks } from '../utils/chunking.js';
3939
import { getFileCommitDates } from '../utils/git-dates.js';
4040
import {
4141
CODEBASE_CONTEXT_DIRNAME,
42+
EXCLUDED_GLOB_PATTERNS,
4243
INDEX_FORMAT_VERSION,
4344
INDEXING_STATS_FILENAME,
4445
INDEX_META_FILENAME,
@@ -274,14 +275,7 @@ export class CodebaseIndexer {
274275
'**/*.{sql,graphql,gql}',
275276
'**/*.{json,jsonc,yaml,yml,toml,xml}'
276277
],
277-
exclude: [
278-
'node_modules/**',
279-
'dist/**',
280-
'build/**',
281-
'.git/**',
282-
'coverage/**',
283-
'.codebase-context/**'
284-
],
278+
exclude: [...EXCLUDED_GLOB_PATTERNS],
285279
respectGitignore: true,
286280
parsing: {
287281
maxFileSize: 1048576,

src/utils/project-discovery.ts

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import { promises as fs } from 'fs';
22
import type { Dirent } from 'fs';
33
import path from 'path';
4+
import {
5+
EXCLUDED_DIRECTORY_NAMES,
6+
DISCOVERY_ONLY_IGNORED
7+
} from '../constants/codebase-context.js';
48

59
export type ProjectEvidence =
610
| 'existing_index'
@@ -20,22 +24,8 @@ export interface DiscoverProjectsOptions {
2024
const DEFAULT_MAX_DEPTH = 4;
2125

2226
const IGNORED_DIRECTORY_NAMES = new Set([
23-
'.git',
24-
'.hg',
25-
'.svn',
26-
'.next',
27-
'.nuxt',
28-
'.turbo',
29-
'.venv',
30-
'.yarn',
31-
'build',
32-
'coverage',
33-
'dist',
34-
'node_modules',
35-
'out',
36-
'target',
37-
'tmp',
38-
'vendor'
27+
...EXCLUDED_DIRECTORY_NAMES,
28+
...DISCOVERY_ONLY_IGNORED
3929
]);
4030

4131
const STRONG_DIRECTORY_MARKERS = new Set(['.codebase-context', '.git']);
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
2+
import { promises as fs } from 'fs';
3+
import os from 'os';
4+
import path from 'path';
5+
import { CodebaseIndexer } from '../src/core/indexer.js';
6+
import { analyzerRegistry } from '../src/core/analyzer-registry.js';
7+
import { GenericAnalyzer } from '../src/analyzers/generic/index.js';
8+
import {
9+
CODEBASE_CONTEXT_DIRNAME,
10+
KEYWORD_INDEX_FILENAME
11+
} from '../src/constants/codebase-context.js';
12+
13+
describe('Indexer exclude patterns — nested directories', () => {
14+
let tempDir: string;
15+
16+
beforeEach(async () => {
17+
analyzerRegistry.register(new GenericAnalyzer());
18+
tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'indexer-exclude-patterns-'));
19+
});
20+
21+
afterEach(async () => {
22+
await fs.rm(tempDir, { recursive: true, force: true });
23+
});
24+
25+
it('excludes nested coverage, worktrees, .claude, and dist directories', async () => {
26+
// Legitimate source file
27+
await fs.mkdir(path.join(tempDir, 'src'), { recursive: true });
28+
await fs.writeFile(
29+
path.join(tempDir, 'src', 'app.ts'),
30+
'export function main() { return "hello"; }\n'
31+
);
32+
33+
// Polluters — nested paths that should be excluded
34+
const polluters = [
35+
['packages', 'ui', 'coverage', 'prettify.js'],
36+
['.claude', 'worktrees', 'branch', 'src', 'app.ts'],
37+
['worktrees', 'portal30-pr', 'src', 'real.ts'],
38+
['apps', 'web', 'dist', 'bundle.js']
39+
];
40+
41+
for (const segments of polluters) {
42+
const dir = path.join(tempDir, ...segments.slice(0, -1));
43+
await fs.mkdir(dir, { recursive: true });
44+
await fs.writeFile(
45+
path.join(tempDir, ...segments),
46+
'export const polluter = true;\n'
47+
);
48+
}
49+
50+
const indexer = new CodebaseIndexer({
51+
rootPath: tempDir,
52+
config: {
53+
skipEmbedding: true,
54+
parsing: {
55+
maxFileSize: 1048576,
56+
chunkSize: 50,
57+
chunkOverlap: 0,
58+
parseTests: true,
59+
parseNodeModules: false
60+
}
61+
}
62+
});
63+
64+
await indexer.index();
65+
66+
const indexPath = path.join(tempDir, CODEBASE_CONTEXT_DIRNAME, KEYWORD_INDEX_FILENAME);
67+
const indexRaw = JSON.parse(await fs.readFile(indexPath, 'utf-8')) as Record<string, unknown>;
68+
const chunks = (
69+
Array.isArray(indexRaw)
70+
? indexRaw
71+
: Array.isArray(indexRaw?.chunks)
72+
? indexRaw.chunks
73+
: []
74+
) as Array<{ filePath: string }>;
75+
const indexedPaths = chunks.map((chunk) => chunk.filePath);
76+
77+
// The legitimate file must be indexed
78+
expect(indexedPaths.some((p) => p.includes('src/app.ts') || p.includes('src\\app.ts'))).toBe(
79+
true
80+
);
81+
82+
// None of the polluter paths should appear
83+
const polluterMarkers = ['coverage', '.claude', 'worktrees', 'dist'];
84+
for (const marker of polluterMarkers) {
85+
const leaked = indexedPaths.filter((p) => p.includes(marker));
86+
expect(leaked, `paths containing "${marker}" should not be indexed`).toEqual([]);
87+
}
88+
});
89+
});

0 commit comments

Comments
 (0)