|
| 1 | +//node |
| 2 | +import fs from 'node:fs'; |
| 3 | +import path from 'node:path'; |
| 4 | +//modules |
| 5 | +import matter from 'gray-matter'; |
| 6 | +//src |
| 7 | +import type { Chunk, RawChunk } from '../src/types'; |
| 8 | +import Store from '../src/store'; |
| 9 | +import { build, workspace } from '../src/config'; |
| 10 | +import { embed, getContextPack } from '../src/helpers'; |
| 11 | + |
/**
 * Finds the first upper-case requirement keyword in a piece of text.
 * "MUST NOT" is listed ahead of "MUST" so the two-word form is captured
 * whole; the word boundaries keep words such as "MUSTARD" from matching.
 */
const RULE_RE = /\b(MUST NOT|MUST|SHOULD)\b/;
| 13 | + |
/**
 * Collects the Markdown (`.md`) files found under the base directory of
 * each pattern, resolved against `root`.
 *
 * This is a deliberately tiny glob: a pattern is only used to locate the
 * directory to search. Whatever precedes the first `**` is the base
 * directory; a pattern without `**` searches from `root` itself. Each base
 * directory is walked recursively and every regular file whose name ends
 * in `.md` is collected.
 *
 * @param root - directory the patterns are relative to
 * @param patterns - glob-like patterns (only the part before `**` is read)
 * @returns the matching file paths in discovery order, each listed once
 *   even when several patterns cover the same directory
 */
function globSync(root: string, patterns: string[]) {
  //a Set keeps discovery order while dropping the repeats that
  //overlapping patterns (ex. "**/*.md" together with "*.md") produce
  const found = new Set<string>();
  //walks the given directory recursively collecting all .md files
  const walk = (dir: string) => {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) walk(full);
      else if (entry.isFile() && entry.name.endsWith('.md')) found.add(full);
    }
  };
  //start walking from each pattern's base path
  for (const pattern of patterns) {
    //ex. "docs/**/*.md" → "docs/"; no "**" → search from the root
    const base = pattern.includes('**') ? pattern.split('**')[0] : '';
    walk(path.join(root, base));
  }
  //return the collected .md files
  return [...found];
}
| 37 | + |
/**
 * Splits a Markdown file into one chunk per heading-delimited section.
 *
 * The YAML front-matter is parsed first; its `version` and `updated`
 * values are copied onto every chunk. The remaining content is cut at each
 * ATX heading (`#` to `######`). Heading lines are not part of the chunk
 * text; they are recorded as the chunk's `headings` trail instead.
 * Sections that contain only whitespace produce no chunk.
 *
 * ex. mdToChunks('lib', 'docs/intro.md')
 *   → [ { id: 'lib:docs/intro.md#0', ... }, { id: 'lib:docs/intro.md#1', ... } ]
 *
 * @param repo - repo name, used as the id prefix and stored on each chunk
 * @param filePath - path of the Markdown file to read; used verbatim in
 *   the chunk id and `file` field
 * @returns the chunks in document order
 */
function mdToChunks(
  repo: string,
  filePath: string
) {
  //read the markdown file
  const raw = fs.readFileSync(filePath, 'utf8');
  //YAML front-matter → data
  const { content, data } = matter(raw);
  //split on LF or CRLF: a trailing "\r" would stop the heading pattern
  //from matching, leaving a CRLF file as one giant chunk
  const lines = content.split(/\r?\n/);

  const chunks: RawChunk[] = [];
  let current: string[] = [];
  //headings[level - 1] holds the title of the open heading at that level;
  //skipped levels (ex. h1 followed by h3) stay unset
  let headings: (string | undefined)[] = [];
  let sectionIdx = 0;
  //the marker that opened the fenced code block we are inside, if any
  let fence: string | null = null;

  //flushes the current chunk
  const flush = () => {
    const text = current.join('\n').trim();
    current = [];
    //nothing but whitespace since the last heading
    if (!text) return;
    //determine the rule level (first keyword found in the text)
    const ruleMatch = text.match(RULE_RE)?.[1] as RawChunk['rule_level'];
    //push the chunk
    chunks.push({
      id: `${repo}:${filePath}#${sectionIdx++}`,
      repo,
      file: filePath,
      //drop the unset slots left by skipped heading levels
      headings: headings.filter((h): h is string => h !== undefined),
      rule_level: ruleMatch,
      version: data?.version,
      updated: data?.updated,
      text
    });
  };

  //process each line
  for (const line of lines) {
    //track fenced code blocks (``` or ~~~) so that "# comment" lines in
    //code samples are not mistaken for headings
    const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
    if (fenceMatch) {
      const marker = fenceMatch[1];
      const rest = fenceMatch[2];
      if (fence === null) {
        //a backtick fence cannot carry backticks in its info string
        if (marker[0] !== '`' || !rest.includes('`')) fence = marker;
      } else if (
        marker[0] === fence[0]
        && marker.length >= fence.length
        && rest.trim() === ''
      ) {
        //same character, at least as long, nothing after it → closes
        fence = null;
      }
    }
    //check for headings (only outside of code fences)
    const h = fence === null ? line.match(/^(#{1,6})\s+(.*)$/) : null;
    if (h) {
      //the heading ends the section collected so far
      flush();
      const level = h[1].length;
      const title = h[2].trim();
      //forget deeper (and same-level) headings, then record this one
      headings = headings.slice(0, level - 1);
      headings[level - 1] = title;
      continue;
    }
    current.push(line);
  }
  //emit whatever follows the last heading
  flush();
  return chunks;
}
| 97 | + |
/**
 * Script entry point: chunks, embeds and stores the Markdown docs of every
 * repo listed in the context pack, in the pack's `order`.
 */
async function main() {
  //the context pack drives everything below. ex.
  // pack: "Stackpress Context Pack"
  // version: 0.1
  // order: [lib, idea, ingest, inquire, reactus, stackpress]
  // include:
  //   - repo: lib
  //     paths: ["docs/**/*.md"]
  //   - repo: idea
  //     paths: ["docs/**/*.md"]
  //   (...one entry like these per repo)
  // budgets:
  //   max_chunk_tokens: 400
  //   overlap_tokens: 32
  const pack = getContextPack();

  //bail out when no workspace is configured
  if (!workspace) {
    console.error('Clients shouldn\'t run this script directly.');
    process.exit(1);
  }

  //where the embedded chunks end up
  const store = new Store(build);
  //number of chunks handed to embed() per call
  const perBatch = 64;

  //repos are handled one after another, in pack order
  for (const repo of pack.order) {
    //repos without an include entry are skipped
    const entry = pack.include.find(candidate => candidate.repo === repo);
    if (!entry) continue;
    //1-based position of the repo in the pack order
    const rank = pack.order.indexOf(repo) + 1;
    const root = path.join(workspace, repo);

    //chunk every markdown file found, leaving out files starting
    //with `.` (these are private)
    const pending: RawChunk[] = [];
    for (const file of globSync(root, entry.paths)) {
      if (path.basename(file).startsWith('.')) continue;
      for (const piece of mdToChunks(repo, file)) pending.push(piece);
    }
    // Optional: simple size control (merge small adjacent chunks, etc.)

    //embed batch by batch and store each result
    const total = pending.length;
    for (let start = 0; start < total; start += perBatch) {
      const batch = pending.slice(start, start + perBatch);
      const texts = batch.map(raw => raw.text);
      const vectors = await embed(texts);
      //progress line, labelled with the first chunk of the batch
      const first = batch[0];
      console.log(
        'Embedded',
        first.repo,
        first.file,
        `${start}-${start + batch.length} / ${total}`
      );
      //vectors are matched to chunks by position within the batch
      for (const [offset, raw] of batch.entries()) {
        const chunk: Chunk = {
          ...raw,
          dependency_rank: rank,
          embedding: vectors[offset]
        };
        store.append(repo, chunk);
      }
    }
    console.log(`Ingested ${repo}: ${total} chunks`);
  }
}
| 170 | + |
//run the ingestion; any failure is printed and ends the process with
//a non-zero exit code
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
0 commit comments