|
| 1 | +/** |
| 2 | + * Rank local markdown corpus for public search (mirrors src/local-search.ts logic). |
| 3 | + * Uses Unicode-aware tokenization so Cyrillic queries match the corpus. |
| 4 | + */ |
| 5 | +import { readdirSync, readFileSync, statSync } from "node:fs"; |
| 6 | +import { extname, join, relative } from "node:path"; |
| 7 | + |
// Upper bound on the number of ranked hits handed back to the caller.
const MAX_RESULTS = 5;

// Markdown files above this size in bytes (2 MiB) are left out of the corpus.
const MAX_KNOWLEDGE_FILE_BYTES = 2 * 1024 * 1024;

// Query tokens shorter than this many UTF-16 code units are discarded.
const MIN_TOKEN_LENGTH = 2;

// Common English filler words that are dropped from queries before scoring.
const STOP_WORDS = new Set([
  "a", "an", "and", "are", "for", "how", "is",
  "into", "that", "the", "this", "what", "with",
]);
| 26 | + |
/**
 * Split a free-text query into lowercase search tokens.
 *
 * The query is NFC-normalized before splitting so that decomposed input
 * (for example Cyrillic "и" followed by a combining breve) is folded into the
 * precomposed form. Combining marks (`\p{M}`) count as part of a token, so
 * scripts that rely on vowel signs or marks that have no precomposed form are
 * not chopped into fragments. Tokens shorter than MIN_TOKEN_LENGTH, tokens
 * without any letter or digit, and STOP_WORDS entries are dropped.
 *
 * @param {string} query Raw user query.
 * @returns {string[]} Tokens in query order; empty when `query` is not a string.
 */
function tokenizeQuery(query) {
  if (typeof query !== "string") {
    return [];
  }
  return query
    .normalize("NFC")
    .toLowerCase()
    .split(/[^\p{L}\p{M}\p{N}]+/u)
    .filter(
      (token) =>
        token.length >= MIN_TOKEN_LENGTH &&
        // A run made only of combining marks carries no searchable text.
        /[\p{L}\p{N}]/u.test(token) &&
        !STOP_WORDS.has(token),
    );
}
| 33 | + |
/**
 * Count non-overlapping occurrences of `token` inside `content`.
 *
 * The search is case-sensitive and resumes after the end of each hit, so
 * "aaaa" contains "aa" twice.
 *
 * @param {string} content Text to scan.
 * @param {string} token Substring to look for.
 * @returns {number} Number of hits; 0 for an empty token.
 */
function countOccurrences(content, token) {
  // indexOf("") always succeeds without advancing, which would loop forever.
  if (token.length === 0) {
    return 0;
  }
  let count = 0;
  let index = content.indexOf(token);
  while (index !== -1) {
    count += 1;
    index = content.indexOf(token, index + token.length);
  }
  return count;
}
| 43 | + |
/**
 * Find the title of the markdown heading that governs a character offset.
 *
 * Lines are walked from the top of the document; the most recent ATX heading
 * (`#` to `######`) seen up to and including the line that contains
 * `matchIndex` wins. Offsets past the end of the content resolve to the last
 * heading in the document.
 *
 * @param {string} content Full markdown text.
 * @param {number} matchIndex Zero-based UTF-16 offset into `content`.
 * @returns {string} Heading text, or "Overview" when no heading precedes the offset.
 */
function resolveSection(content, matchIndex) {
  let section = "Overview";
  let offset = 0;
  // Marker of the fenced code block we are currently inside, if any.
  let openFence = null;
  for (const rawLine of content.split("\n")) {
    // Offsets are measured on the raw line so they stay aligned with `content`.
    const lineEnd = offset + rawLine.length;
    // Strip the "\r" of CRLF files: `.` never matches it, so the heading
    // pattern would otherwise fail on every line of such a file.
    const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine;
    const fence = /^ {0,3}(`{3,}|~{3,})(.*)$/.exec(line);
    if (openFence !== null) {
      // Only a bare fence of the same character and at least the same length closes the block.
      if (
        fence !== null &&
        fence[1][0] === openFence.char &&
        fence[1].length >= openFence.length &&
        fence[2].trim() === ""
      ) {
        openFence = null;
      }
    } else if (fence !== null) {
      openFence = { char: fence[1][0], length: fence[1].length };
    } else {
      // Outside code fences only: "# comment" lines in code samples are not headings.
      const heading = /^#{1,6}\s+(.+)$/.exec(line);
      if (heading !== null) {
        section = heading[1].trim();
      }
    }
    if (matchIndex <= lineEnd) {
      return section;
    }
    offset = lineEnd + 1; // skip the "\n" consumed by split
  }
  return section;
}
| 60 | + |
/**
 * Build a short, single-line excerpt of `content` around `matchIndex`.
 *
 * A window reaching 120 code units before and 220 after the match is taken,
 * runs of whitespace are collapsed to one space, and the result is capped at
 * 220 code units (217 plus "..."). Window edges and the truncation point are
 * nudged so that a UTF-16 surrogate pair is never cut in half, which would
 * otherwise leave an unpaired surrogate in the snippet.
 *
 * @param {string} content Full document text.
 * @param {number} matchIndex Zero-based UTF-16 offset of the match.
 * @returns {string} Excerpt of at most 220 code units.
 */
function extractSnippet(content, matchIndex) {
  const isHighSurrogate = (unit) => unit >= 0xd800 && unit <= 0xdbff;
  const isLowSurrogate = (unit) => unit >= 0xdc00 && unit <= 0xdfff;
  // True when cutting `text` at `index` would separate the two halves of a pair.
  const splitsSurrogatePair = (text, index) =>
    index > 0 &&
    index < text.length &&
    isHighSurrogate(text.charCodeAt(index - 1)) &&
    isLowSurrogate(text.charCodeAt(index));

  let windowStart = Math.max(0, matchIndex - 120);
  let windowEnd = Math.min(content.length, matchIndex + 220);
  // Widen by one unit rather than dropping half a character.
  if (splitsSurrogatePair(content, windowStart)) {
    windowStart -= 1;
  }
  if (splitsSurrogatePair(content, windowEnd)) {
    windowEnd += 1;
  }

  const rawSnippet = content
    .slice(windowStart, windowEnd)
    .replace(/\s+/g, " ")
    .trim();
  if (rawSnippet.length <= 220) {
    return rawSnippet;
  }
  // Shorten by one unit when the cap would land inside a pair.
  const cut = splitsSurrogatePair(rawSnippet, 217) ? 216 : 217;
  return `${rawSnippet.slice(0, cut)}...`;
}
| 73 | + |
/**
 * Score each document against the query and return the hits, best first.
 *
 * Every query token adds 10 points per occurrence in the lowercased file path
 * and 1 point per occurrence in the lowercased content. Documents scoring 0
 * are left out. The section and snippet are resolved at the earliest content
 * hit of any token, or at offset 0 when only the file path matched. Ties on
 * score are broken by `localeCompare` on the file path.
 *
 * @param {string} query Raw user query.
 * @param {ReadonlyArray<{ file: string, content: string }>} documents Corpus to rank.
 * @returns {Array<{ file: string, score: number, section: string, snippet: string }>}
 */
function rankDocuments(query, documents) {
  const tokens = tokenizeQuery(query);
  if (tokens.length === 0) {
    return [];
  }

  const hits = [];
  for (const document of documents) {
    const haystack = document.content.toLowerCase();
    const pathText = document.file.toLowerCase();

    let score = 0;
    let earliest = Number.POSITIVE_INFINITY;
    for (const token of tokens) {
      const inPath = countOccurrences(pathText, token);
      const inBody = countOccurrences(haystack, token);
      score += inPath * 10 + inBody;

      const position = haystack.indexOf(token);
      if (position !== -1) {
        earliest = Math.min(earliest, position);
      }
    }
    if (score === 0) {
      continue;
    }

    // No content hit means the match came from the file path alone.
    const anchor = Number.isFinite(earliest) ? earliest : 0;
    hits.push({
      file: document.file,
      score,
      section: resolveSection(document.content, anchor),
      snippet: extractSnippet(document.content, anchor),
    });
  }

  hits.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
  return hits;
}
| 108 | + |
/**
 * Recursively collect the paths of `.md` files below `dir`.
 *
 * Symbolic links are skipped (never followed). Files whose size cannot be
 * read or exceeds MAX_KNOWLEDGE_FILE_BYTES are skipped. A nested directory
 * that cannot be listed is skipped as well, so one unreadable folder does not
 * discard the rest of the tree; a failure to list `dir` itself still throws.
 *
 * @param {string} dir Directory to scan.
 * @returns {string[]} Paths built by joining `dir` with each entry name.
 * @throws When `dir` itself cannot be read.
 */
function walkKnowledgeTree(dir) {
  const files = [];
  for (const entry of readdirSync(dir, { withFileTypes: true })) {
    if (entry.isSymbolicLink()) {
      continue;
    }
    const fullPath = join(dir, entry.name);

    if (entry.isDirectory()) {
      let nested;
      try {
        nested = walkKnowledgeTree(fullPath);
      } catch {
        // Best effort, matching the per-file handling below.
        continue;
      }
      // Plain loop instead of push(...nested): spreading a very large array
      // can exceed the engine's argument limit.
      for (const nestedPath of nested) {
        files.push(nestedPath);
      }
      continue;
    }

    if (!entry.isFile() || extname(entry.name) !== ".md") {
      continue;
    }
    let size;
    try {
      size = statSync(fullPath).size;
    } catch {
      continue;
    }
    if (size <= MAX_KNOWLEDGE_FILE_BYTES) {
      files.push(fullPath);
    }
  }
  return files;
}
| 136 | + |
/**
 * Read every markdown file found under `dir` into memory.
 *
 * Loading is best effort: if the tree cannot be walked the result is empty,
 * and any single file that cannot be read is left out. A leading UTF-8
 * byte-order mark is removed from the content so that a heading on the first
 * line still starts with "#" and U+FEFF does not leak into snippets.
 *
 * @param {string} dir Root directory of the corpus.
 * @returns {Array<{ file: string, content: string }>} `file` is the path
 *   relative to `dir` with backslashes replaced by "/".
 */
function loadKnowledgeDocuments(dir) {
  let paths;
  try {
    paths = walkKnowledgeTree(dir);
  } catch {
    return [];
  }

  const documents = [];
  for (const filePath of paths) {
    try {
      const text = readFileSync(filePath, "utf8");
      documents.push({
        file: relative(dir, filePath).replaceAll("\\", "/"),
        // The "utf8" decoder keeps the BOM as a U+FEFF character.
        content: text.startsWith("\uFEFF") ? text.slice(1) : text,
      });
    } catch {
      // Unreadable file: skip it and keep the rest of the corpus.
    }
  }
  return documents;
}
| 157 | + |
/**
 * Search the markdown corpus under `rootDir` and return the top matches.
 *
 * Documents are loaded from disk, ranked against `query`, cut to the first
 * MAX_RESULTS entries, and reduced to their public fields (the score is not
 * exposed).
 *
 * @param {string} query
 * @param {string} rootDir
 * @returns {ReadonlyArray<{ file: string, section: string, snippet: string }>}
 */
export function rankKnowledgeForQuery(query, rootDir) {
  const corpus = loadKnowledgeDocuments(rootDir);
  const topHits = rankDocuments(query, corpus).slice(0, MAX_RESULTS);

  const results = [];
  for (const hit of topHits) {
    results.push({
      file: hit.file,
      section: hit.section,
      snippet: hit.snippet,
    });
  }
  return results;
}
0 commit comments