Skip to content

Commit 5eab9c2

Browse files
committed
fix: harden search reliability and indexing hygiene
1 parent 924a8fc commit 5eab9c2

23 files changed

+1395
-145
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ dist/
77
*.log
88
.DS_Store
99
.env
10+
opencode.jsonc
11+
nul
1012
.vscode/
1113
*.swp
1214
*.swo

src/core/analyzer-registry.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ export class AnalyzerRegistry {
7070
const analyzer = this.findAnalyzer(filePath, content);
7171

7272
if (!analyzer) {
73-
console.warn(`No analyzer found for file: ${filePath}`);
73+
if (process.env.CODEBASE_CONTEXT_DEBUG) {
74+
console.error(`[DEBUG] No analyzer found for file: ${filePath}`);
75+
}
7476
return null;
7577
}
7678

src/core/indexer.ts

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ export class CodebaseIndexer {
194194

195195
console.error(
196196
`Incremental diff: ${diff.added.length} added, ${diff.changed.length} changed, ` +
197-
`${diff.deleted.length} deleted, ${diff.unchanged.length} unchanged`
197+
`${diff.deleted.length} deleted, ${diff.unchanged.length} unchanged`
198198
);
199199

200200
stats.incremental = {
@@ -210,6 +210,20 @@ export class CodebaseIndexer {
210210
this.updateProgress('complete', 100);
211211
stats.duration = Date.now() - startTime;
212212
stats.completedAt = new Date();
213+
214+
// Preserve accurate counts from the existing index (nothing changed, index is intact)
215+
try {
216+
const existingIndexPath = path.join(contextDir, KEYWORD_INDEX_FILENAME);
217+
const existingChunks = JSON.parse(await fs.readFile(existingIndexPath, 'utf-8'));
218+
if (Array.isArray(existingChunks)) {
219+
stats.totalChunks = existingChunks.length;
220+
const uniqueFiles = new Set(existingChunks.map((c: { filePath?: string }) => c.filePath));
221+
stats.indexedFiles = uniqueFiles.size;
222+
}
223+
} catch {
224+
// Keyword index doesn't exist yet — keep counts as 0
225+
}
226+
213227
return stats;
214228
}
215229
}
@@ -418,8 +432,8 @@ export class CodebaseIndexer {
418432
this.updateProgress('embedding', 50);
419433
console.error(
420434
`Creating embeddings for ${chunksToEmbed.length} chunks` +
421-
(diff ? ` (${allChunks.length} total, ${chunksToEmbed.length} changed)` : '') +
422-
'...'
435+
(diff ? ` (${allChunks.length} total, ${chunksToEmbed.length} changed)` : '') +
436+
'...'
423437
);
424438

425439
// Initialize embedding provider
@@ -480,11 +494,13 @@ export class CodebaseIndexer {
480494

481495
if (diff) {
482496
// Incremental: delete old chunks for changed + deleted files, then add new
483-
const filesToDelete = [...diff.changed, ...diff.deleted]
484-
.map((rel) => path.join(this.rootPath, rel).replace(/\\/g, '/'));
497+
const filesToDelete = [...diff.changed, ...diff.deleted].map((rel) =>
498+
path.join(this.rootPath, rel).replace(/\\/g, '/')
499+
);
485500
// Also try with OS-native separators for matching
486-
const filePathsForDelete = [...diff.changed, ...diff.deleted]
487-
.map((rel) => path.resolve(this.rootPath, rel));
501+
const filePathsForDelete = [...diff.changed, ...diff.deleted].map((rel) =>
502+
path.resolve(this.rootPath, rel)
503+
);
488504
const allDeletePaths = [...new Set([...filesToDelete, ...filePathsForDelete])];
489505

490506
if (allDeletePaths.length > 0) {
@@ -495,7 +511,7 @@ export class CodebaseIndexer {
495511
}
496512
console.error(
497513
`Incremental store: deleted chunks for ${diff.changed.length + diff.deleted.length} files, ` +
498-
`added ${chunksWithEmbeddings.length} new chunks`
514+
`added ${chunksWithEmbeddings.length} new chunks`
499515
);
500516
} else {
501517
// Full: clear and re-store everything
@@ -508,7 +524,8 @@ export class CodebaseIndexer {
508524
// Keyword index always uses ALL chunks (full regen)
509525
const indexPath = path.join(contextDir, KEYWORD_INDEX_FILENAME);
510526
// Memory safety: cap keyword index too
511-
const keywordChunks = allChunks.length > MAX_CHUNKS ? allChunks.slice(0, MAX_CHUNKS) : allChunks;
527+
const keywordChunks =
528+
allChunks.length > MAX_CHUNKS ? allChunks.slice(0, MAX_CHUNKS) : allChunks;
512529
await fs.writeFile(indexPath, JSON.stringify(keywordChunks));
513530

514531
// Save library usage and pattern stats (always full regen)
@@ -552,7 +569,7 @@ export class CodebaseIndexer {
552569
const manifest: FileManifest = {
553570
version: 1,
554571
generatedAt: new Date().toISOString(),
555-
files: currentHashes ?? await computeFileHashes(files, this.rootPath)
572+
files: currentHashes ?? (await computeFileHashes(files, this.rootPath))
556573
};
557574
await writeManifest(manifestPath, manifest);
558575

@@ -565,8 +582,8 @@ export class CodebaseIndexer {
565582
if (diff) {
566583
console.error(
567584
`Incremental indexing complete in ${stats.duration}ms ` +
568-
`(${diff.added.length} added, ${diff.changed.length} changed, ` +
569-
`${diff.deleted.length} deleted, ${diff.unchanged.length} unchanged)`
585+
`(${diff.added.length} added, ${diff.changed.length} changed, ` +
586+
`${diff.deleted.length} deleted, ${diff.unchanged.length} unchanged)`
570587
);
571588
} else {
572589
console.error(`Indexing complete in ${stats.duration}ms`);
@@ -588,6 +605,7 @@ export class CodebaseIndexer {
588605

589606
private async scanFiles(): Promise<string[]> {
590607
const files: string[] = [];
608+
const seen = new Set<string>();
591609

592610
// Read .gitignore if respecting it
593611
let ig: ReturnType<typeof ignore.default> | null = null;
@@ -614,6 +632,12 @@ export class CodebaseIndexer {
614632
});
615633

616634
for (const file of matches) {
635+
const normalizedFile = file.replace(/\\/g, '/');
636+
if (seen.has(normalizedFile)) {
637+
continue;
638+
}
639+
seen.add(normalizedFile);
640+
617641
const relativePath = path.relative(this.rootPath, file);
618642

619643
// Check gitignore

src/core/search-quality.ts

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import type { SearchResult } from '../types/index.js';
2+
import { isTestingRelatedQuery } from '../preflight/query-scope.js';
3+
4+
export interface SearchQualityAssessment {
5+
status: 'ok' | 'low_confidence';
6+
confidence: number;
7+
signals: string[];
8+
nextSteps?: string[];
9+
}
10+
11+
/**
 * Reports whether a file path looks like a test artifact.
 *
 * A path qualifies when its name contains `.spec.` or `.test.`, or when any
 * of its directory segments is `e2e` or `__tests__`. Matching ignores case
 * and treats backslashes as forward slashes, so Windows and POSIX paths are
 * handled alike.
 *
 * @param filePath - Absolute or relative path to classify.
 * @returns `true` when the path matches one of the test-artifact markers.
 */
export function isTestArtifactPath(filePath: string): boolean {
  const normalized = filePath.toLowerCase().replace(/\\/g, '/');

  if (normalized.includes('.spec.') || normalized.includes('.test.')) {
    return true;
  }

  // Anchor with a leading slash so a relative path that *starts* with the
  // directory (e.g. "e2e/login.ts") matches the same "/dir/" probe as a path
  // that has the directory somewhere in the middle.
  const anchored = `/${normalized}`;
  return anchored.includes('/e2e/') || anchored.includes('/__tests__/');
}
20+
21+
/**
 * Judges how trustworthy a list of search results looks for a given query.
 *
 * The first entry of `results` is treated as the top hit (presumably the
 * caller passes results ranked by descending score — confirm at call sites).
 * Up to the first three results are inspected and the following warning
 * signals are collected:
 *
 * - the top score is below 0.3;
 * - the average score of the inspected results is below 0.32;
 * - the gap between the first and second score is below 0.03;
 * - at least two thirds of the inspected results are test artifacts while the
 *   query itself is not testing-related.
 *
 * Confidence starts at the top score, is reduced by a fixed penalty for each
 * of the last three signals, then rounded to two decimals and clamped to 0..1.
 *
 * @param query - The user's search query; used only to decide whether test
 *   files are an expected kind of result.
 * @param results - Search results to assess.
 * @returns An assessment whose `status` is `'low_confidence'` when there are
 *   no results, when two or more signals fired, or when confidence is below
 *   0.35. `nextSteps` is only included for low-confidence assessments.
 */
export function assessSearchQuality(
  query: string,
  results: SearchResult[]
): SearchQualityAssessment {
  if (results.length === 0) {
    return {
      status: 'low_confidence',
      confidence: 0,
      signals: ['no results returned'],
      nextSteps: [
        'Try a narrower query with one concrete symbol, route, or file hint.',
        'Apply search filters (framework/language/componentType/layer).',
        'Use get_component_usage for dependency or wiring lookups.'
      ]
    };
  }

  const topSlice = results.slice(0, 3);
  const topScore = results[0].score;
  const secondScore = results[1]?.score ?? topScore;
  const topAverage = topSlice.reduce((sum, result) => sum + result.score, 0) / topSlice.length;
  const topSeparation = Math.max(0, topScore - secondScore);
  const testCount = topSlice.filter((result) => isTestArtifactPath(result.filePath)).length;
  const testRatio = testCount / topSlice.length;
  const queryIsTesting = isTestingRelatedQuery(query);

  // Each condition is evaluated once and reused for both the signal text and
  // the confidence penalty so the two can never drift apart.
  const lowTopScore = topScore < 0.3;
  const weakAverage = topAverage < 0.32;
  const tightSpread = topSlice.length > 1 && topSeparation < 0.03;
  // Compare as integers: 2/3 is 0.666…, which a literal 0.67 threshold would
  // reject, so "two of the top three are tests" would never be flagged.
  const testsDominate = !queryIsTesting && testCount * 3 >= topSlice.length * 2;

  const signals: string[] = [];
  if (lowTopScore) {
    signals.push(`low top score (${topScore.toFixed(2)})`);
  }
  if (weakAverage) {
    signals.push(`weak top-${topSlice.length} average (${topAverage.toFixed(2)})`);
  }
  if (tightSpread) {
    signals.push(`tight top spread (${topSeparation.toFixed(2)})`);
  }
  if (testsDominate) {
    signals.push(
      `test artifacts dominate top-${topSlice.length} (${Math.round(testRatio * 100)}%)`
    );
  }

  let confidence = topScore;
  if (weakAverage) confidence -= 0.08;
  if (tightSpread) confidence -= 0.05;
  if (testsDominate) confidence -= 0.15;
  // Round first, then clamp, so the reported value is a two-decimal number in 0..1.
  confidence = Math.max(0, Math.min(1, Number(confidence.toFixed(2))));

  const lowConfidence = signals.length >= 2 || confidence < 0.35;

  const assessment: SearchQualityAssessment = {
    status: lowConfidence ? 'low_confidence' : 'ok',
    confidence,
    signals
  };
  if (lowConfidence) {
    assessment.nextSteps = [
      'Add one or two concrete symbols, routes, or file hints to the query.',
      'Apply filters (framework/language/componentType/layer) to narrow candidates.',
      'Use get_component_usage when the question is about wiring or usages.'
    ];
  }
  return assessment;
}

0 commit comments

Comments
 (0)