|
/**
 * Document Chunker — split markdown files into searchable chunks.
 *
 * Strategy:
 * 1. Split on H2 headers first (natural semantic boundaries).
 * 2. If a section exceeds the maximum chunk size, split it on paragraph boundaries.
 * 3. Each chunk carries metadata: file path, title, section title, category,
 *    chunk index, and the hash of the source file.
 *
 * Target chunk size: ~600 tokens (~2400 chars); sections of up to ~900 tokens
 * (~3600 chars) are kept as a single chunk.
 */
| 11 | + |
| 12 | +import * as fs from "fs"; |
| 13 | +import * as path from "path"; |
| 14 | +import * as crypto from "crypto"; |
| 15 | + |
/** Size (in characters, roughly 600 tokens) aimed for when an oversized section is split. */
const TARGET_CHUNK_CHARS = 2400;

/** Upper bound (in characters, roughly 900 tokens) for a section that is kept as one chunk. */
const MAX_CHUNK_CHARS = 3600;

/** Lower bound (in characters) used to avoid creating tiny chunks. */
const MIN_CHUNK_CHARS = 200;
| 19 | + |
/**
 * One searchable slice of a markdown document together with the metadata
 * needed to trace it back to its source file.
 */
export interface DocChunk {
  /** Path of the source file, relative to the scan root. */
  file_path: string;
  /** Position of this chunk within its source file, starting at 0. */
  chunk_index: number;
  /** Document title: the H1 heading, or the file name when there is none. */
  title: string;
  /** H2 header this chunk belongs to, or "" when it has none. */
  section_title: string;
  /** Directory name used as a category (e.g. "research", "architecture"). */
  category: string;
  /** The chunk text itself. */
  content: string;
  /** SHA-256 of the full file content, used for change detection. */
  file_hash: string;
}
| 29 | + |
/**
 * A markdown file loaded from disk, ready to be chunked.
 */
export interface DocFile {
  /** Full path to the file as built from the scan root (absolute only if the root was). */
  absolute_path: string;
  /** Path of the file relative to the scan root. */
  relative_path: string;
  /** Complete text of the file. */
  content: string;
  /** Hash of `content`; copied into every chunk as `file_hash`. */
  hash: string;
}
| 36 | + |
/**
 * Derive a human-readable title for a markdown document.
 *
 * Returns the text of the first non-empty H1 heading (`# Title`) that is not
 * inside a fenced code block. When the document has no such heading, falls
 * back to the file name without its `.md` extension, with `-` and `_`
 * replaced by spaces.
 *
 * @param content - Full markdown text of the document.
 * @param filePath - Path of the document; only its base name is used.
 * @returns The heading text, or the prettified file name.
 */
function extractTitle(content: string, filePath: string): string {
  // Opening marker of the fenced code block we are currently inside, if any.
  let openFence: string | null = null;

  // Work line by line (accepting LF and CRLF) so the whitespace after `#`
  // can never swallow a line break and promote the next paragraph to a title.
  for (const line of content.split(/\r?\n/)) {
    const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
    if (fenceMatch) {
      const marker = fenceMatch[1];
      const rest = fenceMatch[2];
      if (openFence === null) {
        // A backtick fence may not carry backticks in its info string;
        // such a line is inline code, not a fence opener.
        if (!(marker[0] === "`" && rest.includes("`"))) {
          openFence = marker;
          continue;
        }
      } else {
        const closes =
          marker[0] === openFence[0] &&
          marker.length >= openFence.length &&
          rest.trim() === "";
        if (closes) openFence = null;
        continue;
      }
    }

    // `# comment` lines inside code samples are not headings.
    if (openFence !== null) continue;

    const h1Match = line.match(/^#[ \t]+(.+)$/);
    if (h1Match) {
      const heading = h1Match[1].trim();
      if (heading.length > 0) return heading;
    }
  }

  // Fall back to filename without extension
  return path.basename(filePath, ".md").replace(/[-_]/g, " ");
}
| 47 | + |
/**
 * Derive a category from the directory structure of a relative path.
 *
 * The path is normalized first, so a leading `./`, doubled separators and
 * (on Windows) forward slashes do not affect the result.
 *
 * @param relativePath - Path of the document relative to the scan root.
 * @returns The first path segment when the file lives in a subdirectory,
 *          otherwise `"root"`.
 */
function extractCategory(relativePath: string): string {
  // path.normalize rewrites the path with the platform separator, which makes
  // the split on path.sep reliable.
  const segments = path.normalize(relativePath).split(path.sep);
  return segments.length > 1 ? segments[0] : "root";
}
| 56 | + |
/**
 * Split markdown into sections by H2 headers.
 *
 * Text before the first H2 becomes a section with an empty title. The header
 * line itself is not part of the section content; its text is returned as the
 * section title. Sections whose content is empty after trimming are omitted.
 * `## ` lines inside fenced code blocks are treated as ordinary content.
 * Both LF and CRLF line endings are accepted; returned content uses LF.
 *
 * @param content - Full markdown text of the document.
 * @returns Sections in document order.
 */
function splitByH2(content: string): Array<{ title: string; content: string }> {
  const sections: Array<{ title: string; content: string }> = [];
  let currentTitle = "";
  let currentLines: string[] = [];
  // Opening marker of the fenced code block we are currently inside, if any.
  let openFence: string | null = null;

  // Store the section collected so far (if it has any text) and start afresh.
  const flush = (): void => {
    const text = currentLines.join("\n").trim();
    if (text.length > 0) {
      sections.push({ title: currentTitle, content: text });
    }
    currentLines = [];
  };

  // Splitting on /\r?\n/ matters: with a trailing "\r" left on the line the
  // header pattern below could never match, so CRLF files were never split.
  for (const line of content.split(/\r?\n/)) {
    const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
    if (fenceMatch) {
      const marker = fenceMatch[1];
      const rest = fenceMatch[2];
      if (openFence === null) {
        // Backticks in the info string mean inline code, not a fence opener.
        if (!(marker[0] === "`" && rest.includes("`"))) {
          openFence = marker;
        }
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        rest.trim() === ""
      ) {
        openFence = null;
      }
      currentLines.push(line);
      continue;
    }

    const h2Match = openFence === null ? line.match(/^##\s+(.+)$/) : null;
    if (h2Match) {
      flush();
      currentTitle = h2Match[1].trim();
    } else {
      currentLines.push(line);
    }
  }

  // Don't forget the last section
  flush();

  return sections;
}
| 93 | + |
/**
 * Split a text blob into pieces of at most `maxChars` characters, preferring
 * paragraph boundaries.
 *
 * Text that already fits is returned unchanged as a single element. Otherwise
 * paragraphs (separated by one or more blank lines, LF or CRLF, possibly
 * containing only spaces/tabs) are packed greedily and re-joined with a blank
 * line. A paragraph that is itself longer than `maxChars` is broken on line
 * boundaries, and a single line that is still too long is cut by length, so
 * no returned piece exceeds the limit. Every piece is trimmed.
 *
 * @param text - Text to split.
 * @param maxChars - Maximum length of each returned piece; values below 1 are
 *                   treated as 1 when splitting.
 * @returns The pieces in original order.
 */
function splitByParagraphs(text: string, maxChars: number): string[] {
  if (text.length <= maxChars) return [text];

  // Guard against zero/negative/NaN limits, which would otherwise never make progress.
  const limit = maxChars >= 1 ? Math.floor(maxChars) : 1;

  // Cut one over-long line into slices of at most `limit` characters.
  const sliceByLength = (line: string): string[] => {
    const slices: string[] = [];
    let start = 0;
    while (start < line.length) {
      let end = Math.min(start + limit, line.length);
      if (end < line.length && end - start > 1) {
        // Do not cut between the two halves of a surrogate pair.
        const code = line.charCodeAt(end - 1);
        if (code >= 0xd800 && code <= 0xdbff) end -= 1;
      }
      slices.push(line.slice(start, end));
      start = end;
    }
    return slices;
  };

  // Break a paragraph that exceeds the limit on line boundaries.
  const fitParagraph = (para: string): string[] => {
    if (para.length <= limit) return [para];

    const pieces: string[] = [];
    let buffer = "";
    for (const line of para.split(/\r?\n/)) {
      if (line.length > limit) {
        if (buffer) pieces.push(buffer);
        const slices = sliceByLength(line);
        // Keep the last slice open so following lines can still join it.
        buffer = slices.pop() || "";
        for (const slice of slices) pieces.push(slice);
      } else if (buffer && buffer.length + 1 + line.length > limit) {
        pieces.push(buffer);
        buffer = line;
      } else {
        buffer = buffer ? buffer + "\n" + line : line;
      }
    }
    if (buffer) pieces.push(buffer);
    return pieces;
  };

  const chunks: string[] = [];
  let current = "";

  // A paragraph break is a newline followed by one or more blank lines; the
  // pattern leaves the indentation of the next paragraph untouched.
  for (const para of text.split(/\r?\n(?:[ \t]*\r?\n)+/)) {
    for (const piece of fitParagraph(para)) {
      if (piece.trim().length === 0) continue;

      if (current.length > 0 && current.length + 2 + piece.length > limit) {
        chunks.push(current.trim());
        current = piece;
      } else {
        current = current ? current + "\n\n" + piece : piece;
      }
    }
  }

  if (current.trim().length > 0) {
    chunks.push(current.trim());
  }

  return chunks;
}
| 119 | + |
/**
 * Hash a string with SHA-256.
 *
 * @param content - Text to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
function hashContent(content: string): string {
  const hasher = crypto.createHash("sha256");
  hasher.update(content);
  return hasher.digest("hex");
}
| 126 | + |
/**
 * Chunk a single markdown file into searchable segments.
 *
 * The document is split into H2 sections with `splitByH2`. A section of at
 * most MAX_CHUNK_CHARS characters is one piece; a larger section is split on
 * paragraph boundaries with `splitByParagraphs` using TARGET_CHUNK_CHARS.
 *
 * Pieces shorter than MIN_CHUNK_CHARS are not discarded. Each one is held
 * back and merged into the piece that follows it (pieces from a different
 * section keep their `## heading` line inside the merged text). If that would
 * exceed MAX_CHUNK_CHARS it is appended to the preceding chunk instead, and
 * only when neither fits is it emitted as a small chunk of its own.
 *
 * If no chunk could be produced this way but the whole trimmed document is at
 * least MIN_CHUNK_CHARS long, the document text itself is chunked (with an
 * empty section title) rather than truncated. A document shorter than
 * MIN_CHUNK_CHARS yields no chunks.
 *
 * @param doc - The loaded file; `relative_path` drives title fallback and category.
 * @returns Chunks in document order, with `chunk_index` counting from 0.
 */
export function chunkDocument(doc: DocFile): DocChunk[] {
  type Piece = { sectionTitle: string; content: string };

  const title = extractTitle(doc.content, doc.relative_path);
  const category = extractCategory(doc.relative_path);
  const chunks: DocChunk[] = [];

  // Single place where chunk records are built; the index is the insert position.
  const emit = (sectionTitle: string, content: string): void => {
    chunks.push({
      file_path: doc.relative_path,
      chunk_index: chunks.length,
      title,
      section_title: sectionTitle,
      category,
      content,
      file_hash: doc.hash,
    });
  };

  // Text of `piece` when placed inside a chunk labelled `hostTitle`: keep the
  // piece's own heading if the chunk's section title would not convey it.
  const withHeading = (piece: Piece, hostTitle: string): string =>
    piece.sectionTitle !== "" && piece.sectionTitle !== hostTitle
      ? `## ${piece.sectionTitle}\n\n${piece.content}`
      : piece.content;

  // Place an undersized piece that could not be merged forward: append it to
  // the previous chunk when that stays within the limit, else emit it alone.
  const placeLeftover = (piece: Piece): void => {
    const last = chunks[chunks.length - 1];
    if (last) {
      const appended = `${last.content}\n\n${withHeading(piece, last.section_title)}`;
      if (appended.length <= MAX_CHUNK_CHARS) {
        last.content = appended;
        return;
      }
    }
    emit(piece.sectionTitle, piece.content);
  };

  // Undersized piece waiting to be merged with whatever comes next.
  let pending: Piece | undefined;

  for (const section of splitByH2(doc.content)) {
    const texts =
      section.content.length <= MAX_CHUNK_CHARS
        ? [section.content]
        : splitByParagraphs(section.content, TARGET_CHUNK_CHARS);

    for (const text of texts) {
      let piece: Piece = { sectionTitle: section.title, content: text };

      if (pending) {
        // An untitled lead-in (text before the first H2) adopts the title of
        // the section it is merged into.
        const hostTitle = pending.sectionTitle || piece.sectionTitle;
        const merged = `${pending.content}\n\n${withHeading(piece, hostTitle)}`;
        if (merged.length <= MAX_CHUNK_CHARS) {
          piece = { sectionTitle: hostTitle, content: merged };
        } else {
          placeLeftover(pending);
        }
        pending = undefined;
      }

      if (piece.content.length < MIN_CHUNK_CHARS) {
        pending = piece;
      } else {
        emit(piece.sectionTitle, piece.content);
      }
    }
  }

  // Trailing undersized piece: attach it to the chunks we already have.
  if (pending && chunks.length > 0) {
    placeLeftover(pending);
  }

  // Nothing reached the minimum size on its own (for example many short
  // sections): chunk the document text as a whole instead of truncating it.
  if (chunks.length === 0) {
    const whole = doc.content.trim();
    if (whole.length >= MIN_CHUNK_CHARS) {
      const texts =
        whole.length <= MAX_CHUNK_CHARS
          ? [whole]
          : splitByParagraphs(whole, TARGET_CHUNK_CHARS);
      for (const text of texts) {
        emit("", text);
      }
    }
  }

  return chunks;
}
| 187 | + |
/**
 * Recursively collect the markdown (`.md`) files below a directory.
 *
 * Directories whose name appears in `options.exclude` are not entered; when
 * no list is given, `_archive`, `node_modules` and `.git` are skipped.
 * Directories that cannot be listed and files that cannot be read are
 * skipped silently (best effort).
 *
 * @param dirPath - Root directory of the scan; relative paths are computed against it.
 * @param options - Optional list of directory names to exclude.
 * @returns One entry per markdown file, with its content and content hash.
 */
export function scanDirectory(
  dirPath: string,
  options: { exclude?: string[] } = {}
): DocFile[] {
  const skippedDirs = options.exclude || ["_archive", "node_modules", ".git"];
  const found: DocFile[] = [];

  // Load one markdown file and record it; unreadable files are ignored.
  const loadMarkdown = (absolutePath: string): void => {
    try {
      const text = fs.readFileSync(absolutePath, "utf-8");
      const relative = path.relative(dirPath, absolutePath);
      found.push({
        absolute_path: absolutePath,
        relative_path: relative,
        content: text,
        hash: hashContent(text),
      });
    } catch {
      // Skip unreadable files
    }
  };

  // Depth-first traversal in the order the directory listing returns entries.
  const visit = (dir: string): void => {
    let listing: fs.Dirent[];
    try {
      listing = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Permission denied or inaccessible
      return;
    }

    for (const item of listing) {
      const itemPath = path.join(dir, item.name);

      if (item.isDirectory()) {
        if (skippedDirs.includes(item.name)) continue;
        visit(itemPath);
        continue;
      }

      if (item.isFile() && item.name.endsWith(".md")) {
        loadMarkdown(itemPath);
      }
    }
  };

  visit(dirPath);
  return found;
}
| 233 | + |
/**
 * Scan a directory for markdown files and chunk every file found.
 *
 * @param dirPath - Root directory to scan.
 * @param options - Passed through to `scanDirectory` (directory names to exclude).
 * @returns The scanned files and the chunks of all files, in scan order.
 */
export function chunkDirectory(
  dirPath: string,
  options: { exclude?: string[] } = {}
): { files: DocFile[]; chunks: DocChunk[] } {
  const files = scanDirectory(dirPath, options);

  const chunks: DocChunk[] = [];
  files.forEach((file) => {
    for (const chunk of chunkDocument(file)) {
      chunks.push(chunk);
    }
  });

  return { files, chunks };
}
0 commit comments