Skip to content

Commit a216c6d

Browse files
committed
feat(06-01): add index format metadata and headers
- Add INDEX_FORMAT_VERSION/INDEX_META_VERSION constants
- Emit index-meta.json + artifact build markers (buildId/formatVersion)
- Wrap keyword index and update tests for new on-disk shape
1 parent 9629447 commit a216c6d

File tree

6 files changed

+336
-8
lines changed

6 files changed

+336
-8
lines changed

src/constants/codebase-context.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,21 @@
33

44
export const CODEBASE_CONTEXT_DIRNAME = '.codebase-context' as const;

/**
 * Index format version for on-disk artifacts under `.codebase-context/`.
 *
 * Bump when:
 * - Chunk boundaries change (AST chunking rules, split/merge behavior)
 * - Embedding input string changes (e.g. scope-prefix content, prepended metadata)
 * - Required persisted fields/artifact headers change
 */
export const INDEX_FORMAT_VERSION = 1 as const;

/** Schema version for `.codebase-context/index-meta.json` itself. */
export const INDEX_META_VERSION = 1 as const;

/** File name of the index metadata document stored in the context directory. */
export const INDEX_META_FILENAME = 'index-meta.json' as const;

export const MEMORY_FILENAME = 'memory.json' as const;
export const INTELLIGENCE_FILENAME = 'intelligence.json' as const;
export const KEYWORD_INDEX_FILENAME = 'index.json' as const;

src/core/index-meta.ts

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
import { promises as fs } from 'fs';
2+
import path from 'path';
3+
import { z } from 'zod';
4+
5+
import {
6+
CODEBASE_CONTEXT_DIRNAME,
7+
INDEX_FORMAT_VERSION,
8+
INDEX_META_FILENAME,
9+
INDEX_META_VERSION,
10+
INTELLIGENCE_FILENAME,
11+
KEYWORD_INDEX_FILENAME,
12+
VECTOR_DB_DIRNAME
13+
} from '../constants/codebase-context.js';
14+
import { IndexCorruptedError } from '../errors/index.js';
15+
16+
/** Build marker that ties an artifact to one indexing run and one on-disk format. */
const ArtifactHeaderSchema = z.object({
  buildId: z.string().min(1),
  formatVersion: z.number().int().nonnegative()
});

/** Keyword index file: a build header plus the chunk list (chunk contents are not validated here). */
const KeywordIndexFileSchema = z.object({
  header: ArtifactHeaderSchema,
  chunks: z.array(z.unknown())
});

/** Vector DB build marker file: the header fields live at the top level. */
const VectorDbBuildSchema = z.object({
  buildId: z.string().min(1),
  formatVersion: z.number().int().nonnegative()
});

/** Intelligence file: only the header is checked; every other key is passed through untouched. */
const IntelligenceFileSchema = z
  .object({
    header: ArtifactHeaderSchema
  })
  .passthrough();

/**
 * Schema of the index metadata document.
 *
 * `artifacts` requires `keywordIndex` and `vectorDb`, treats `intelligence`
 * as optional, and lets additional artifact entries pass through.
 */
export const IndexMetaSchema = z.object({
  metaVersion: z.number().int().positive(),
  formatVersion: z.number().int().nonnegative(),
  buildId: z.string().min(1),
  generatedAt: z.string().datetime(),
  toolVersion: z.string().min(1),
  artifacts: z
    .object({
      keywordIndex: z.object({
        path: z.string().min(1)
      }),
      vectorDb: z.object({
        path: z.string().min(1),
        provider: z.string().min(1)
      }),
      intelligence: z
        .object({
          path: z.string().min(1)
        })
        .optional()
    })
    .passthrough()
});

/** Parsed shape of the index metadata document. */
export type IndexMeta = z.infer<typeof IndexMetaSchema>;
62+
63+
/**
 * Reports whether `targetPath` can be accessed on disk.
 *
 * Any failure from `fs.access` is reported as `false`; this function never
 * rejects.
 *
 * @param targetPath - File or directory path to probe.
 * @returns `true` when `fs.access` succeeds, otherwise `false`.
 */
async function pathExists(targetPath: string): Promise<boolean> {
  return fs.access(targetPath).then(
    () => true,
    () => false
  );
}
71+
72+
/**
 * Ensures a required artifact is present at `targetPath`.
 *
 * Only accessibility is checked (via `pathExists`); the entry is not verified
 * to be a regular file.
 *
 * @param targetPath - Path that must be accessible.
 * @param label - Human-readable artifact name used as the error message prefix.
 * @throws IndexCorruptedError when the path cannot be accessed.
 */
async function requireFile(targetPath: string, label: string): Promise<void> {
  const present = await pathExists(targetPath);
  if (present) return;

  throw new IndexCorruptedError(`${label} missing: ${targetPath}`);
}
77+
78+
/**
 * Ensures `targetPath` exists and is a directory.
 *
 * @param targetPath - Path that must resolve to a directory.
 * @param label - Human-readable artifact name used as the error message prefix.
 * @throws IndexCorruptedError with "missing" when `fs.stat` fails for any
 *   reason, or with "is not a directory" when the entry is something else.
 */
async function requireDirectory(targetPath: string, label: string): Promise<void> {
  // Every stat failure is reported as "missing", whatever the underlying cause.
  const stat = await fs.stat(targetPath).catch(() => null);

  if (stat === null) {
    throw new IndexCorruptedError(`${label} missing: ${targetPath}`);
  }
  if (!stat.isDirectory()) {
    throw new IndexCorruptedError(`${label} is not a directory: ${targetPath}`);
  }
}
89+
90+
/**
 * Wraps an arbitrary thrown value in an `IndexCorruptedError`.
 *
 * @param message - Context prefix describing what was being read.
 * @param error - The caught value; its `message` is used when it is an
 *   `Error`, otherwise its string conversion.
 * @returns A new error whose message is `"<message>: <detail>"`.
 */
function asIndexCorrupted(message: string, error: unknown): IndexCorruptedError {
  const detail = error instanceof Error ? error.message : String(error);
  return new IndexCorruptedError(`${message}: ${detail}`);
}
94+
95+
/**
 * Loads and validates the index metadata document for a project.
 *
 * The document is read from
 * `<rootDir>/<CODEBASE_CONTEXT_DIRNAME>/<INDEX_META_FILENAME>`, parsed as JSON,
 * checked against `IndexMetaSchema`, and then compared with the
 * `INDEX_META_VERSION` and `INDEX_FORMAT_VERSION` constants.
 *
 * @param rootDir - Project root containing the context directory.
 * @returns The validated metadata.
 * @throws IndexCorruptedError when the file is missing, unreadable, not valid
 *   JSON, fails schema validation, or carries a different meta/format version.
 */
export async function readIndexMeta(rootDir: string): Promise<IndexMeta> {
  const metaPath = path.join(rootDir, CODEBASE_CONTEXT_DIRNAME, INDEX_META_FILENAME);

  // Read and JSON failures are reported identically: the index must be rebuilt.
  let parsed: unknown;
  try {
    parsed = JSON.parse(await fs.readFile(metaPath, 'utf-8'));
  } catch (error) {
    throw asIndexCorrupted('Index meta missing or unreadable (rebuild required)', error);
  }

  const result = IndexMetaSchema.safeParse(parsed);
  if (!result.success) {
    throw new IndexCorruptedError(
      `Index meta schema mismatch (rebuild required): ${result.error.message}`
    );
  }

  const meta = result.data;

  if (meta.metaVersion !== INDEX_META_VERSION) {
    throw new IndexCorruptedError(
      `Index meta version mismatch (rebuild required): expected metaVersion=${INDEX_META_VERSION}, found metaVersion=${meta.metaVersion}`
    );
  }

  if (meta.formatVersion !== INDEX_FORMAT_VERSION) {
    throw new IndexCorruptedError(
      `Index format version mismatch (rebuild required): expected formatVersion=${INDEX_FORMAT_VERSION}, found formatVersion=${meta.formatVersion}`
    );
  }

  return meta;
}
129+
130+
/** Build marker fields compared against the index metadata. */
interface ArtifactHeader {
  buildId: string;
  formatVersion: number;
}

/** Minimal structural view of a schema object: only `safeParse` is needed. */
interface SafeParser<T> {
  safeParse(
    input: unknown
  ): { success: true; data: T } | { success: false; error: { message: string } };
}

/** Describes one artifact whose header must agree with the index metadata. */
interface ArtifactHeaderCheck<T> {
  /** Absolute path of the JSON file to read. */
  filePath: string;
  /** Schema the parsed JSON must satisfy. */
  schema: SafeParser<T>;
  /** Extracts the build header from the validated file contents. */
  pickHeader: (data: T) => ArtifactHeader;
  /** Prefix for the "schema mismatch" and "corrupted" messages. */
  artifactLabel: string;
  /** Prefix for the "formatVersion mismatch" and "buildId mismatch" messages. */
  headerLabel: string;
  /** File name shown on the artifact side of mismatch messages. */
  fileName: string;
}

/** Reads one artifact, validates its shape, and checks its header against `meta`. */
async function assertArtifactMatchesMeta<T>(
  meta: IndexMeta,
  check: ArtifactHeaderCheck<T>
): Promise<void> {
  try {
    const raw = await fs.readFile(check.filePath, 'utf-8');
    const result = check.schema.safeParse(JSON.parse(raw));
    if (!result.success) {
      throw new IndexCorruptedError(
        `${check.artifactLabel} schema mismatch (rebuild required): ${result.error.message}`
      );
    }

    const { buildId, formatVersion } = check.pickHeader(result.data);
    if (formatVersion !== meta.formatVersion) {
      throw new IndexCorruptedError(
        `${check.headerLabel} formatVersion mismatch (rebuild required): meta=${meta.formatVersion}, ${check.fileName}=${formatVersion}`
      );
    }
    if (buildId !== meta.buildId) {
      throw new IndexCorruptedError(
        `${check.headerLabel} buildId mismatch (rebuild required): meta=${meta.buildId}, ${check.fileName}=${buildId}`
      );
    }
  } catch (error) {
    // Our own diagnostics pass through; read/JSON failures are wrapped.
    if (error instanceof IndexCorruptedError) throw error;
    throw asIndexCorrupted(`${check.artifactLabel} corrupted (rebuild required)`, error);
  }
}

/**
 * Verifies that the on-disk index artifacts exist and belong to the build
 * described by `meta`.
 *
 * Checks run in this order:
 * 1. The keyword index file, the vector DB directory, and the vector DB build
 *    marker must all exist.
 * 2. The keyword index header must match `meta.formatVersion` and `meta.buildId`.
 * 3. The vector DB build marker must match as well.
 * 4. The intelligence file is validated the same way only if it exists.
 *
 * Artifact locations are derived from the filename constants under the
 * context directory; the `path` entries in `meta.artifacts` are not consulted.
 *
 * @param rootDir - Project root containing the context directory.
 * @param meta - Metadata the artifacts are compared against.
 * @throws IndexCorruptedError on the first missing, malformed, or mismatched artifact.
 */
export async function validateIndexArtifacts(rootDir: string, meta: IndexMeta): Promise<void> {
  const contextDir = path.join(rootDir, CODEBASE_CONTEXT_DIRNAME);

  const keywordPath = path.join(contextDir, KEYWORD_INDEX_FILENAME);
  const vectorDir = path.join(contextDir, VECTOR_DB_DIRNAME);
  const vectorBuildPath = path.join(vectorDir, 'index-build.json');

  await requireFile(keywordPath, 'Keyword index');
  await requireDirectory(vectorDir, 'Vector DB directory');
  await requireFile(vectorBuildPath, 'Vector DB build marker');

  // Keyword index header (required)
  await assertArtifactMatchesMeta(meta, {
    filePath: keywordPath,
    schema: KeywordIndexFileSchema,
    pickHeader: (file) => file.header,
    artifactLabel: 'Keyword index',
    headerLabel: 'Keyword index',
    fileName: 'index.json'
  });

  // Vector DB build marker (required); its header fields sit at the top level.
  await assertArtifactMatchesMeta(meta, {
    filePath: vectorBuildPath,
    schema: VectorDbBuildSchema,
    pickHeader: (marker) => marker,
    artifactLabel: 'Vector DB build marker',
    headerLabel: 'Vector DB',
    fileName: 'index-build.json'
  });

  // Optional intelligence artifact: validate if present, but do not require.
  const intelligencePath = path.join(contextDir, INTELLIGENCE_FILENAME);
  if (await pathExists(intelligencePath)) {
    await assertArtifactMatchesMeta(meta, {
      filePath: intelligencePath,
      schema: IntelligenceFileSchema,
      pickHeader: (file) => file.header,
      artifactLabel: 'Intelligence',
      headerLabel: 'Intelligence',
      fileName: 'intelligence.json'
    });
  }
}

src/core/indexer.ts

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*/
55

66
/* eslint-disable @typescript-eslint/no-explicit-any */
7+
import { randomUUID } from 'crypto';
78
import { promises as fs } from 'fs';
89
import path from 'path';
910
import { glob } from 'glob';
@@ -31,7 +32,10 @@ import { mergeSmallChunks } from '../utils/chunking.js';
3132
import { getFileCommitDates } from '../utils/git-dates.js';
3233
import {
3334
CODEBASE_CONTEXT_DIRNAME,
35+
INDEX_FORMAT_VERSION,
3436
INDEXING_STATS_FILENAME,
37+
INDEX_META_FILENAME,
38+
INDEX_META_VERSION,
3539
INTELLIGENCE_FILENAME,
3640
KEYWORD_INDEX_FILENAME,
3741
MANIFEST_FILENAME,
@@ -46,6 +50,26 @@ import {
4650
type ManifestDiff
4751
} from './manifest.js';
4852

53+
/** Memoized result of `getToolVersion`; `null` until the first call resolves. */
let cachedToolVersion: string | null = null;

/**
 * Resolves the tool's version string, reading it at most once per process.
 *
 * The version comes from the `version` field of the `package.json` located two
 * directories above this module. If that file cannot be read or parsed, or the
 * field is not a non-blank string, the result is `'unknown'`. Either outcome
 * is cached for subsequent calls.
 *
 * @returns The version string, or `'unknown'` as a best-effort fallback.
 */
async function getToolVersion(): Promise<string> {
  if (cachedToolVersion) return cachedToolVersion;

  let resolved = 'unknown';
  try {
    const manifestUrl = new URL('../../package.json', import.meta.url);
    const pkg = JSON.parse(await fs.readFile(manifestUrl, 'utf-8')) as { version?: unknown };
    if (typeof pkg.version === 'string' && pkg.version.trim()) {
      resolved = pkg.version;
    }
  } catch {
    // Best-effort — keep the 'unknown' fallback
  }

  cachedToolVersion = resolved;
  return cachedToolVersion;
}
72+
4973
export interface IndexerOptions {
5074
rootPath: string;
5175
config?: Partial<CodebaseConfig>;
@@ -166,6 +190,10 @@ export class CodebaseIndexer {
166190
};
167191

168192
try {
193+
const buildId = randomUUID();
194+
const generatedAt = new Date().toISOString();
195+
const toolVersion = await getToolVersion();
196+
169197
// Phase 1: Scanning
170198
this.updateProgress('scanning', 0);
171199
let files = await this.scanFiles();
@@ -250,7 +278,12 @@ export class CodebaseIndexer {
250278

251279
try {
252280
const existingIndexPath = path.join(contextDir, KEYWORD_INDEX_FILENAME);
253-
const existingChunks = JSON.parse(await fs.readFile(existingIndexPath, 'utf-8'));
281+
const existing = JSON.parse(await fs.readFile(existingIndexPath, 'utf-8')) as any;
282+
const existingChunks: unknown = Array.isArray(existing)
283+
? existing
284+
: existing && Array.isArray(existing.chunks)
285+
? existing.chunks
286+
: null;
254287
if (Array.isArray(existingChunks)) {
255288
stats.totalChunks = existingChunks.length;
256289
if (stats.indexedFiles === 0) {
@@ -566,12 +599,27 @@ export class CodebaseIndexer {
566599
}
567600
}
568601

602+
// Vector DB build marker (required for version gating)
603+
// Write after semantic store step so marker reflects the latest DB state.
604+
const vectorDir = path.join(contextDir, VECTOR_DB_DIRNAME);
605+
await fs.mkdir(vectorDir, { recursive: true });
606+
await fs.writeFile(
607+
path.join(vectorDir, 'index-build.json'),
608+
JSON.stringify({ buildId, formatVersion: INDEX_FORMAT_VERSION })
609+
);
610+
569611
// Keyword index always uses ALL chunks (full regen)
570612
const indexPath = path.join(contextDir, KEYWORD_INDEX_FILENAME);
571613
// Memory safety: cap keyword index too
572614
const keywordChunks =
573615
allChunks.length > MAX_CHUNKS ? allChunks.slice(0, MAX_CHUNKS) : allChunks;
574-
await fs.writeFile(indexPath, JSON.stringify(keywordChunks));
616+
await fs.writeFile(
617+
indexPath,
618+
JSON.stringify({
619+
header: { buildId, formatVersion: INDEX_FORMAT_VERSION },
620+
chunks: keywordChunks
621+
})
622+
);
575623

576624
// Save library usage and pattern stats (always full regen)
577625
const intelligencePath = path.join(contextDir, INTELLIGENCE_FILENAME);
@@ -594,6 +642,7 @@ export class CodebaseIndexer {
594642
}
595643

596644
const intelligence = {
645+
header: { buildId, formatVersion: INDEX_FORMAT_VERSION },
597646
libraryUsage: libraryStats,
598647
patterns: patternDetector.getAllPatterns(),
599648
goldenFiles: patternDetector.getGoldenFiles(5),
@@ -606,7 +655,7 @@ export class CodebaseIndexer {
606655
},
607656
// Internal file graph for circular dependency and unused export detection
608657
internalFileGraph: internalFileGraph.toJSON(),
609-
generatedAt: new Date().toISOString()
658+
generatedAt
610659
};
611660
await fs.writeFile(intelligencePath, JSON.stringify(intelligence, null, 2));
612661

@@ -622,10 +671,34 @@ export class CodebaseIndexer {
622671
indexedFiles: stats.indexedFiles,
623672
totalChunks: stats.totalChunks,
624673
totalFiles: stats.totalFiles,
625-
generatedAt: new Date().toISOString()
674+
generatedAt
626675
};
627676
await fs.writeFile(indexingStatsPath, JSON.stringify(persistedStats, null, 2));
628677

678+
// Index meta (authoritative) — write last so readers never observe meta pointing to missing artifacts.
679+
const metaPath = path.join(contextDir, INDEX_META_FILENAME);
680+
await fs.writeFile(
681+
metaPath,
682+
JSON.stringify(
683+
{
684+
metaVersion: INDEX_META_VERSION,
685+
formatVersion: INDEX_FORMAT_VERSION,
686+
buildId,
687+
generatedAt,
688+
toolVersion,
689+
artifacts: {
690+
keywordIndex: { path: KEYWORD_INDEX_FILENAME },
691+
vectorDb: { path: VECTOR_DB_DIRNAME, provider: 'lancedb' },
692+
intelligence: { path: INTELLIGENCE_FILENAME },
693+
manifest: { path: MANIFEST_FILENAME },
694+
indexingStats: { path: INDEXING_STATS_FILENAME }
695+
}
696+
},
697+
null,
698+
2
699+
)
700+
);
701+
629702
// Phase 5: Complete
630703
this.updateProgress('complete', 100);
631704

0 commit comments

Comments
 (0)