|
| 1 | +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; |
| 2 | +import { mkdtemp, readFile, mkdir, writeFile, rm } from 'node:fs/promises'; |
| 3 | +import { join } from 'node:path'; |
| 4 | +import { tmpdir } from 'node:os'; |
| 5 | +import { existsSync } from 'node:fs'; |
| 6 | +import { LanceDBStore, BM25Index, type CodeRAGConfig } from '@code-rag/core'; |
| 7 | +import { rebuildMergedIndex, IndexLogger } from './index-cmd.js'; |
| 8 | + |
// Tiny embedding dimension so fixture vectors are fast to build and compare;
// shared by seedRepoStore's one-hot embeddings and every LanceDBStore opened below.
const DIMENSIONS = 4; // Use tiny vectors for speed
| 10 | + |
| 11 | +/** Create a minimal CodeRAGConfig for testing. */ |
| 12 | +function makeConfig(): CodeRAGConfig { |
| 13 | + return { |
| 14 | + version: '1', |
| 15 | + project: { name: 'test', languages: ['typescript'] }, |
| 16 | + embedding: { |
| 17 | + provider: 'ollama', |
| 18 | + model: 'nomic-embed-text', |
| 19 | + dimensions: DIMENSIONS, |
| 20 | + autoStart: false, |
| 21 | + autoStop: false, |
| 22 | + docker: { image: 'ollama/ollama' as const, gpu: 'auto' as const }, |
| 23 | + }, |
| 24 | + llm: { provider: 'ollama', model: 'qwen2.5-coder:7b' }, |
| 25 | + storage: { path: '.coderag' }, |
| 26 | + ingestion: { maxTokensPerChunk: 512, exclude: [] }, |
| 27 | + search: { topK: 10, vectorWeight: 0.7, bm25Weight: 0.3 }, |
| 28 | + } as CodeRAGConfig; |
| 29 | +} |
| 30 | + |
| 31 | +/** Seed a per-repo LanceDB store with test data. */ |
| 32 | +async function seedRepoStore( |
| 33 | + storagePath: string, |
| 34 | + chunks: Array<{ id: string; content: string; nlSummary: string; filePath: string; name: string; startLine: number; endLine: number; repoName: string }>, |
| 35 | +): Promise<void> { |
| 36 | + await mkdir(storagePath, { recursive: true }); |
| 37 | + |
| 38 | + const store = new LanceDBStore(storagePath, DIMENSIONS); |
| 39 | + await store.connect(); |
| 40 | + |
| 41 | + const ids = chunks.map((c) => c.id); |
| 42 | + // Simple deterministic vectors |
| 43 | + const embeddings = chunks.map((_, i) => { |
| 44 | + const vec = new Array(DIMENSIONS).fill(0); |
| 45 | + vec[i % DIMENSIONS] = 1.0; |
| 46 | + return vec; |
| 47 | + }); |
| 48 | + const metadata = chunks.map((c) => ({ |
| 49 | + content: c.content, |
| 50 | + nl_summary: c.nlSummary, |
| 51 | + chunk_type: 'function', |
| 52 | + file_path: c.filePath, |
| 53 | + language: 'typescript', |
| 54 | + start_line: c.startLine, |
| 55 | + end_line: c.endLine, |
| 56 | + name: c.name, |
| 57 | + repo_name: c.repoName, |
| 58 | + })); |
| 59 | + |
| 60 | + const result = await store.upsert(ids, embeddings, metadata); |
| 61 | + expect(result.isOk()).toBe(true); |
| 62 | + store.close(); |
| 63 | +} |
| 64 | + |
// Integration-style suite for rebuildMergedIndex: seeds one LanceDB store per
// repo under a temp root, runs the merge, then inspects the root-level LanceDB
// table, BM25 index file, and graph.json it produces.
describe('rebuildMergedIndex', () => {
  let tempDir: string;     // fresh OS temp dir per test, removed in afterEach
  let rootStorage: string; // <tempDir>/.coderag — the merge target

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'coderag-merge-'));
    rootStorage = join(tempDir, '.coderag');
    await mkdir(rootStorage, { recursive: true });
    // Suppress console output (IndexLogger uses ora + console)
    vi.spyOn(console, 'log').mockImplementation(() => {});
  });

  afterEach(async () => {
    vi.restoreAllMocks();
    // force: true so cleanup never fails the test if the dir is already gone.
    await rm(tempDir, { recursive: true, force: true });
  });

  it('should merge chunks from multiple per-repo stores into root', async () => {
    const repoAPath = join(rootStorage, 'repo-a');
    const repoBPath = join(rootStorage, 'repo-b');

    // Two chunks in repo-a, one in repo-b → expect 3 rows after the merge.
    await seedRepoStore(repoAPath, [
      { id: 'chunk-a1', content: 'function foo() {}', nlSummary: 'A function foo', filePath: 'src/foo.ts', name: 'foo', startLine: 1, endLine: 5, repoName: 'repo-a' },
      { id: 'chunk-a2', content: 'function bar() {}', nlSummary: 'A function bar', filePath: 'src/bar.ts', name: 'bar', startLine: 1, endLine: 3, repoName: 'repo-a' },
    ]);

    await seedRepoStore(repoBPath, [
      { id: 'chunk-b1', content: 'class Widget {}', nlSummary: 'A class Widget', filePath: 'lib/widget.ts', name: 'Widget', startLine: 10, endLine: 50, repoName: 'repo-b' },
    ]);

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [
        { repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] },
        { repoName: 'repo-b', repoPath: '/tmp/b', repoStoragePath: repoBPath, parsedFiles: [] },
      ],
      config,
      logger,
    );

    // Verify root LanceDB has all 3 chunks
    const rootStore = new LanceDBStore(rootStorage, DIMENSIONS);
    await rootStore.connect();
    // Reach past the store's public API into its private `table` handle.
    // NOTE(review): assumes LanceDBStore keeps an Arrow-queryable `table`
    // field after connect() — revisit this cast if the store's internals change.
    const internal = rootStore as unknown as {
      table: { query: () => { toArray: () => Promise<Array<{ id: string; metadata: string }>> } } | null;
    };
    const rows = await internal.table!.query().toArray();
    expect(rows.length).toBe(3);
    // Sort before comparing: LanceDB row order is not asserted to be stable here.
    const ids = rows.map((r) => r.id).sort();
    expect(ids).toEqual(['chunk-a1', 'chunk-a2', 'chunk-b1']);
    rootStore.close();
  });

  it('should preserve all metadata fields without double-serialization', async () => {
    const repoAPath = join(rootStorage, 'repo-a');

    await seedRepoStore(repoAPath, [
      { id: 'chunk-1', content: 'const x = 1;', nlSummary: 'A constant', filePath: 'src/x.ts', name: 'x', startLine: 42, endLine: 99, repoName: 'repo-a' },
    ]);

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // Read the merged row and parse metadata
    const rootStore = new LanceDBStore(rootStorage, DIMENSIONS);
    await rootStore.connect();
    // Same private-internals cast as above; this test also pulls the
    // denormalized top-level `content` / `file_path` columns.
    const internal = rootStore as unknown as {
      table: { query: () => { toArray: () => Promise<Array<{ id: string; metadata: string; content: string; file_path: string }>> } } | null;
    };
    const rows = await internal.table!.query().toArray();
    expect(rows.length).toBe(1);

    const row = rows[0]!;
    expect(row.content).toBe('const x = 1;');
    expect(row.file_path).toBe('src/x.ts');

    // Parse metadata — should NOT be double-serialized
    // (i.e. exactly one JSON.parse yields the object; a buggy merge that
    // re-stringifies an already-serialized blob would fail these checks).
    const meta = JSON.parse(row.metadata) as Record<string, unknown>;
    expect(meta['start_line']).toBe(42);
    expect(meta['end_line']).toBe(99);
    expect(meta['name']).toBe('x');
    expect(meta['repo_name']).toBe('repo-a');
    // Verify metadata.content is a string, not another JSON blob
    expect(typeof meta['content']).toBe('string');
    expect(meta['content']).toBe('const x = 1;');

    rootStore.close();
  });

  it('should create BM25 index at root level', async () => {
    const repoAPath = join(rootStorage, 'repo-a');

    await seedRepoStore(repoAPath, [
      { id: 'chunk-1', content: 'authentication login', nlSummary: 'Auth login', filePath: 'auth.ts', name: 'login', startLine: 1, endLine: 10, repoName: 'repo-a' },
    ]);

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    const bm25Path = join(rootStorage, 'bm25-index.json');
    expect(existsSync(bm25Path)).toBe(true);

    // Round-trip through deserialize to prove the on-disk format is usable,
    // not merely present.
    const bm25Data = await readFile(bm25Path, 'utf-8');
    const bm25 = BM25Index.deserialize(bm25Data);
    const results = bm25.search('authentication login', 10);
    expect(results.length).toBeGreaterThan(0);
    expect(results[0]!.chunkId).toBe('chunk-1');
  });

  it('should create graph.json at root level', async () => {
    const repoAPath = join(rootStorage, 'repo-a');

    await seedRepoStore(repoAPath, [
      { id: 'chunk-1', content: 'test', nlSummary: 'test', filePath: 'a.ts', name: 'a', startLine: 1, endLine: 1, repoName: 'repo-a' },
    ]);

    // Write a per-repo graph
    // NOTE(review): shape mirrors what the merge step is expected to read —
    // confirm against the graph schema in @code-rag/core if this drifts.
    const graphData = {
      nodes: [{ id: 'node-1', filePath: 'a.ts', name: 'a', type: 'function', language: 'typescript' }],
      edges: [],
    };
    await writeFile(join(repoAPath, 'graph.json'), JSON.stringify(graphData), 'utf-8');

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // The single per-repo node should surface unchanged in the root graph.
    const rootGraphPath = join(rootStorage, 'graph.json');
    expect(existsSync(rootGraphPath)).toBe(true);
    const rootGraph = JSON.parse(await readFile(rootGraphPath, 'utf-8')) as { nodes: unknown[] };
    expect(rootGraph.nodes.length).toBe(1);
  });

  it('should handle repo with empty LanceDB gracefully', async () => {
    const repoAPath = join(rootStorage, 'repo-a');
    await mkdir(repoAPath, { recursive: true });
    // No LanceDB data seeded — empty repo

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    // Should not throw
    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // Root BM25 should exist but be empty
    const bm25Path = join(rootStorage, 'bm25-index.json');
    expect(existsSync(bm25Path)).toBe(true);
    const bm25Data = await readFile(bm25Path, 'utf-8');
    // Parse raw JSON rather than deserializing: only the count matters here.
    const parsed = JSON.parse(bm25Data) as { documentCount: number };
    expect(parsed.documentCount).toBe(0);
  });

  it('should handle vectors from Arrow types (Array.from conversion)', async () => {
    // This test verifies that even if vector data has exotic types,
    // the conversion via Array.from produces valid number arrays
    const repoAPath = join(rootStorage, 'repo-a');

    await seedRepoStore(repoAPath, [
      { id: 'vec-test', content: 'vector test', nlSummary: 'test', filePath: 'v.ts', name: 'v', startLine: 1, endLine: 1, repoName: 'repo-a' },
    ]);

    const config = makeConfig();
    const logger = new IndexLogger(rootStorage, true);

    await rebuildMergedIndex(
      rootStorage,
      [{ repoName: 'repo-a', repoPath: '/tmp/a', repoStoragePath: repoAPath, parsedFiles: [] }],
      config,
      logger,
    );

    // Verify root store can be queried (which means vectors were stored correctly)
    // Query with the same one-hot vector seedRepoStore assigned to chunk 0,
    // so the single seeded row must be the nearest (and only) hit.
    const rootStore = new LanceDBStore(rootStorage, DIMENSIONS);
    await rootStore.connect();
    const queryVec = new Array(DIMENSIONS).fill(0);
    queryVec[0] = 1.0;
    const queryResult = await rootStore.query(queryVec, 5);
    expect(queryResult.isOk()).toBe(true);
    if (queryResult.isOk()) {
      expect(queryResult.value.length).toBe(1);
      expect(queryResult.value[0]!.id).toBe('vec-test');
    }
    rootStore.close();
  });
});