Skip to content

Commit bd904af

Browse files
AB#129 AB#97 Fix multi-repo search and add .gitignore handling
Multi-repo search was completely broken: indexMultiRepo stored data in per-repo subdirs (.coderag/<repoName>/) but search/MCP/CLI only looked at root .coderag/. Fixed by merging all per-repo data into a unified root-level index after indexing. Key fixes: - Add Phase 4 merge to indexMultiRepo (fresh indexing path) - Add rebuildMergedIndex for incremental runs with missing root index - Convert LanceDB Arrow vectors to plain number[] via Array.from - Parse metadata JSON before re-upsert to prevent double-serialization - Detect stale empty BM25 index (documentCount === 0) for rebuild Also adds ensureGitignore to coderag init that auto-adds .coderag/ to .gitignore, preventing database files from being committed. 13 new tests (6 merge integration + 7 gitignore unit). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3b55c46 commit bd904af

8 files changed

Lines changed: 539 additions & 10 deletions

File tree

packages/api-server/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@code-rag/api-server",
3-
"version": "0.1.6",
3+
"version": "0.1.7",
44
"description": "REST API server for CodeRAG — Express-based server with authentication, rate limiting, and OpenAPI docs",
55
"type": "module",
66
"license": "MIT",

packages/cli/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@code-rag/cli",
3-
"version": "0.1.6",
3+
"version": "0.1.7",
44
"description": "CLI tool for CodeRAG — init, index, search, serve, and status commands for codebase context engine",
55
"type": "module",
66
"license": "MIT",
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
2+
import { mkdtemp, readFile, mkdir, writeFile, rm } from 'node:fs/promises';
3+
import { join } from 'node:path';
4+
import { tmpdir } from 'node:os';
5+
import { existsSync } from 'node:fs';
6+
import { LanceDBStore, BM25Index, type CodeRAGConfig } from '@code-rag/core';
7+
import { rebuildMergedIndex, IndexLogger } from './index-cmd.js';
8+
9+
// Embedding vector width used throughout this suite. Kept tiny (4) so the
// seeded LanceDB stores and queries stay fast; real models use hundreds of dims.
const DIMENSIONS = 4; // Use tiny vectors for speed
10+
11+
/** Create a minimal CodeRAGConfig for testing. */
12+
function makeConfig(): CodeRAGConfig {
13+
return {
14+
version: '1',
15+
project: { name: 'test', languages: ['typescript'] },
16+
embedding: {
17+
provider: 'ollama',
18+
model: 'nomic-embed-text',
19+
dimensions: DIMENSIONS,
20+
autoStart: false,
21+
autoStop: false,
22+
docker: { image: 'ollama/ollama' as const, gpu: 'auto' as const },
23+
},
24+
llm: { provider: 'ollama', model: 'qwen2.5-coder:7b' },
25+
storage: { path: '.coderag' },
26+
ingestion: { maxTokensPerChunk: 512, exclude: [] },
27+
search: { topK: 10, vectorWeight: 0.7, bm25Weight: 0.3 },
28+
} as CodeRAGConfig;
29+
}
30+
31+
/** Seed a per-repo LanceDB store with test data. */
32+
async function seedRepoStore(
33+
storagePath: string,
34+
chunks: Array<{ id: string; content: string; nlSummary: string; filePath: string; name: string; startLine: number; endLine: number; repoName: string }>,
35+
): Promise<void> {
36+
await mkdir(storagePath, { recursive: true });
37+
38+
const store = new LanceDBStore(storagePath, DIMENSIONS);
39+
await store.connect();
40+
41+
const ids = chunks.map((c) => c.id);
42+
// Simple deterministic vectors
43+
const embeddings = chunks.map((_, i) => {
44+
const vec = new Array(DIMENSIONS).fill(0);
45+
vec[i % DIMENSIONS] = 1.0;
46+
return vec;
47+
});
48+
const metadata = chunks.map((c) => ({
49+
content: c.content,
50+
nl_summary: c.nlSummary,
51+
chunk_type: 'function',
52+
file_path: c.filePath,
53+
language: 'typescript',
54+
start_line: c.startLine,
55+
end_line: c.endLine,
56+
name: c.name,
57+
repo_name: c.repoName,
58+
}));
59+
60+
const result = await store.upsert(ids, embeddings, metadata);
61+
expect(result.isOk()).toBe(true);
62+
store.close();
63+
}
64+
65+
/**
 * Integration tests for rebuildMergedIndex: verifies that per-repo data under
 * .coderag/<repoName>/ is merged into a unified root-level index (LanceDB
 * rows, BM25 index, graph.json) without losing or double-serializing data.
 * Each test seeds one or more per-repo stores, runs the merge, then inspects
 * the root-level artifacts.
 */
describe('rebuildMergedIndex', () => {
  let tempDir: string;
  let rootStorage: string;

  beforeEach(async () => {
    // Fresh temp root per test so stores never bleed state between tests.
    tempDir = await mkdtemp(join(tmpdir(), 'coderag-merge-'));
    rootStorage = join(tempDir, '.coderag');
    await mkdir(rootStorage, { recursive: true });
    // Suppress console output (IndexLogger uses ora + console)
    vi.spyOn(console, 'log').mockImplementation(() => {});
  });

  afterEach(async () => {
    vi.restoreAllMocks();
    await rm(tempDir, { recursive: true, force: true });
  });

  it('should merge chunks from multiple per-repo stores into root', async () => {
    const repoAPath = join(rootStorage, 'repo-a');
    const repoBPath = join(rootStorage, 'repo-b');

    // Two chunks in repo-a, one in repo-b → expect 3 merged rows at root.
    await seedRepoStore(repoAPath, [
      { id: 'chunk-a1', content: 'function foo() {}', nlSummary: 'A function foo', filePath: 'src/foo.ts', name: 'foo', startLine: 1, endLine: 5, repoName: 'repo-a' },
      { id: 'chunk-a2', content: 'function bar() {}', nlSummary: 'A function bar', filePath: 'src/bar.ts', name: 'bar', startLine: 1, endLine: 3, repoName: 'repo-a' },
    ]);

    await seedRepoStore(repoBPath, [
      { id: 'chunk-b1', content: 'class Widget {}', nlSummary: 'A class Widget', filePath: 'lib/widget.ts', name: 'Widget', startLine: 10, endLine: 50, repoName: 'repo-b' },
    ]);

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    // repoPath values are dummies: the merge only reads repoStoragePath here.
    await rebuildMergedIndex(
      rootStorage,
      [
        { repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] },
        { repoName: 'repo-b', repoPath: '/tmp/b', repoStoragePath: repoBPath, parsedFiles: [] },
      ],
      config,
      logger,
    );

    // Verify root LanceDB has all 3 chunks
    const rootStore = new LanceDBStore(rootStorage, DIMENSIONS);
    await rootStore.connect();
    // NOTE(review): reaches into LanceDBStore's private `table` field to read
    // raw rows — brittle if that internal is renamed; confirm no public
    // "scan all rows" API exists before relying on this elsewhere.
    const internal = rootStore as unknown as {
      table: { query: () => { toArray: () => Promise<Array<{ id: string; metadata: string }>> } } | null;
    };
    const rows = await internal.table!.query().toArray();
    expect(rows.length).toBe(3);
    const ids = rows.map((r) => r.id).sort();
    expect(ids).toEqual(['chunk-a1', 'chunk-a2', 'chunk-b1']);
    rootStore.close();
  });

  it('should preserve all metadata fields without double-serialization', async () => {
    const repoAPath = join(rootStorage, 'repo-a');

    await seedRepoStore(repoAPath, [
      { id: 'chunk-1', content: 'const x = 1;', nlSummary: 'A constant', filePath: 'src/x.ts', name: 'x', startLine: 42, endLine: 99, repoName: 'repo-a' },
    ]);

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // Read the merged row and parse metadata
    const rootStore = new LanceDBStore(rootStorage, DIMENSIONS);
    await rootStore.connect();
    // Same private-internals access as the multi-repo test above.
    const internal = rootStore as unknown as {
      table: { query: () => { toArray: () => Promise<Array<{ id: string; metadata: string; content: string; file_path: string }>> } } | null;
    };
    const rows = await internal.table!.query().toArray();
    expect(rows.length).toBe(1);

    const row = rows[0]!;
    expect(row.content).toBe('const x = 1;');
    expect(row.file_path).toBe('src/x.ts');

    // Parse metadata — should NOT be double-serialized.
    // The merge bug being regression-tested: re-upserting a stringified
    // metadata field would make JSON.parse yield a nested JSON string here.
    const meta = JSON.parse(row.metadata) as Record<string, unknown>;
    expect(meta['start_line']).toBe(42);
    expect(meta['end_line']).toBe(99);
    expect(meta['name']).toBe('x');
    expect(meta['repo_name']).toBe('repo-a');
    // Verify metadata.content is a string, not another JSON blob
    expect(typeof meta['content']).toBe('string');
    expect(meta['content']).toBe('const x = 1;');

    rootStore.close();
  });

  it('should create BM25 index at root level', async () => {
    const repoAPath = join(rootStorage, 'repo-a');

    await seedRepoStore(repoAPath, [
      { id: 'chunk-1', content: 'authentication login', nlSummary: 'Auth login', filePath: 'auth.ts', name: 'login', startLine: 1, endLine: 10, repoName: 'repo-a' },
    ]);

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // The merged keyword index must land at the root, not under repo-a/.
    const bm25Path = join(rootStorage, 'bm25-index.json');
    expect(existsSync(bm25Path)).toBe(true);

    // Round-trip the serialized index and confirm it actually matches the
    // seeded content — presence of the file alone would not prove merging.
    const bm25Data = await readFile(bm25Path, 'utf-8');
    const bm25 = BM25Index.deserialize(bm25Data);
    const results = bm25.search('authentication login', 10);
    expect(results.length).toBeGreaterThan(0);
    expect(results[0]!.chunkId).toBe('chunk-1');
  });

  it('should create graph.json at root level', async () => {
    const repoAPath = join(rootStorage, 'repo-a');

    await seedRepoStore(repoAPath, [
      { id: 'chunk-1', content: 'test', nlSummary: 'test', filePath: 'a.ts', name: 'a', startLine: 1, endLine: 1, repoName: 'repo-a' },
    ]);

    // Write a per-repo graph
    const graphData = {
      nodes: [{ id: 'node-1', filePath: 'a.ts', name: 'a', type: 'function', language: 'typescript' }],
      edges: [],
    };
    await writeFile(join(repoAPath, 'graph.json'), JSON.stringify(graphData), 'utf-8');

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // The per-repo graph node should have been copied/merged into root graph.json.
    const rootGraphPath = join(rootStorage, 'graph.json');
    expect(existsSync(rootGraphPath)).toBe(true);
    const rootGraph = JSON.parse(await readFile(rootGraphPath, 'utf-8')) as { nodes: unknown[] };
    expect(rootGraph.nodes.length).toBe(1);
  });

  it('should handle repo with empty LanceDB gracefully', async () => {
    const repoAPath = join(rootStorage, 'repo-a');
    await mkdir(repoAPath, { recursive: true });
    // No LanceDB data seeded — empty repo

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    // Should not throw
    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // Root BM25 should exist but be empty
    // (documentCount === 0 is also what the stale-index detection keys off).
    const bm25Path = join(rootStorage, 'bm25-index.json');
    expect(existsSync(bm25Path)).toBe(true);
    const bm25Data = await readFile(bm25Path, 'utf-8');
    const parsed = JSON.parse(bm25Data) as { documentCount: number };
    expect(parsed.documentCount).toBe(0);
  });

  it('should handle vectors from Arrow types (Array.from conversion)', async () => {
    // This test verifies that even if vector data has exotic types,
    // the conversion via Array.from produces valid number arrays
    const repoAPath = join(rootStorage, 'repo-a');

    await seedRepoStore(repoAPath, [
      { id: 'vec-test', content: 'vector test', nlSummary: 'test', filePath: 'v.ts', name: 'v', startLine: 1, endLine: 1, repoName: 'repo-a' },
    ]);

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // Verify root store can be queried (which means vectors were stored correctly)
    // Query with the same one-hot direction seedRepoStore assigned to chunk 0.
    const rootStore = new LanceDBStore(rootStorage, DIMENSIONS);
    await rootStore.connect();
    const queryVec = new Array(DIMENSIONS).fill(0);
    queryVec[0] = 1.0;
    const queryResult = await rootStore.query(queryVec, 5);
    expect(queryResult.isOk()).toBe(true);
    if (queryResult.isOk()) {
      expect(queryResult.value.length).toBe(1);
      expect(queryResult.value[0]!.id).toBe('vec-test');
    }
    rootStore.close();
  });
});

0 commit comments

Comments
 (0)