fix(search): use corpus excerpts, honor locale, Unicode tokenization

ilyar · ilyar · commit ca5110ce0d2c · 2026-03-25T22:25:06.000+01:00
Wire /corpus (and the baked-in knowledge/ copy) into the Qwen prompt, as the MCP path does.
Add KNOWLEDGE_ROOT and locale directives so ru/en answers match the request.
Fall back to sources from the ranked docs when the model omits them.
Align local-search tokenization with \p{L}\p{N} so Cyrillic queries are tokenized.

Made-with: Cursor
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
@@ -19,6 +19,7 @@ services:
       SEARCH_SERVER: "1"
       QWEN_HTTP_PORT: "8790"
       QWEN_OAUTH: "true"
+      KNOWLEDGE_ROOT: /corpus
     volumes:
       - ./knowledge:/corpus:ro
     restart: unless-stopped
diff --git a/docker/search/Dockerfile b/docker/search/Dockerfile
@@ -14,7 +14,9 @@ WORKDIR /workspace
 COPY docker/search/entrypoint.sh /usr/local/bin/qwen-entrypoint
 COPY docker/search/qwen-search.sh /usr/local/bin/qwen-search
 COPY docker/search/http-server.mjs /opt/search/http-server.mjs
+COPY docker/search/knowledge-rank.mjs /opt/search/knowledge-rank.mjs
 COPY openapi/knowledge-v1.yaml /opt/search/openapi.yaml
+COPY knowledge /opt/search/knowledge
 
 RUN chmod +x /usr/local/bin/qwen-entrypoint /usr/local/bin/qwen-search /opt/search/http-server.mjs
 
diff --git a/docker/search/http-server.mjs b/docker/search/http-server.mjs
@@ -8,6 +8,7 @@ import { spawn } from "node:child_process";
 import { readFileSync, existsSync } from "node:fs";
 import { fileURLToPath } from "node:url";
 import { dirname, join } from "node:path";
+import { rankKnowledgeForQuery } from "./knowledge-rank.mjs";
 
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const PORT = parseInt(process.env.SEARCH_HTTP_PORT || process.env.QWEN_HTTP_PORT || "8790", 10);
@@ -44,6 +45,61 @@ function openapiPath() {
   return join(__dirname, "..", "..", "openapi", "knowledge-v1.yaml");
 }
 
+/**
+ * Locate the directory holding the markdown corpus.
+ *
+ * An existing path taken from the KNOWLEDGE_ROOT environment variable wins. Otherwise the
+ * first existing path among /corpus (where prod mounts the repo `knowledge` directory),
+ * /opt/search/knowledge (the copy baked into the image) and the `knowledge` directory two
+ * levels above this module is used.
+ *
+ * @returns {string} The selected path, or "" when no candidate exists.
+ */
+function resolveKnowledgeRoot() {
+  const configured = (process.env.KNOWLEDGE_ROOT || "").trim();
+  if (configured !== "" && existsSync(configured)) {
+    return configured;
+  }
+  // The fallback list is only built once the environment override has been ruled out.
+  const fallback = ["/corpus", "/opt/search/knowledge", join(__dirname, "..", "..", "knowledge")].find(
+    (candidate) => existsSync(candidate),
+  );
+  return fallback === undefined ? "" : fallback;
+}
+
+/**
+ * Build the prompt sentence that tells the model which language to answer in.
+ *
+ * The locale is interpolated into the model prompt, so it is validated first: only a value
+ * shaped like a language tag (letters/digits in hyphen-separated subtags) is honored.
+ * Anything else, including non-strings, empty strings and text carrying whitespace or
+ * newlines, yields the neutral "same language as the user query" directive instead of
+ * being echoed into the prompt.
+ *
+ * @param {unknown} locale Requested locale, e.g. "ru", "en-US" or "en_US".
+ * @returns {string} A single-sentence instruction for the prompt.
+ */
+function localeDirective(locale) {
+  const sameAsQuery = "Write the answer in the same language as the user query.";
+  if (typeof locale !== "string") {
+    return sameAsQuery;
+  }
+  // Accept the common underscore spelling (en_US) by normalizing it to hyphens.
+  const tag = locale.trim().replace(/_/g, "-");
+  // 35 characters covers well-formed tags in practice; longer input is treated as invalid.
+  const looksLikeLanguageTag = tag.length <= 35 && /^[A-Za-z]{2,8}(-[A-Za-z0-9]{1,8})*$/.test(tag);
+  if (!looksLikeLanguageTag) {
+    return sameAsQuery;
+  }
+  const lowered = tag.toLowerCase();
+  if (lowered === "ru" || lowered.startsWith("ru-")) {
+    return "You MUST write the entire answer in Russian.";
+  }
+  if (lowered === "en" || lowered.startsWith("en-")) {
+    return "You MUST write the entire answer in English.";
+  }
+  return `You MUST write the entire answer in the primary language for locale ${tag} (BCP 47).`;
+}
+
+/**
+ * Assemble the full text prompt sent to the model for one search request.
+ *
+ * @param {string} query User question, appended verbatim as the last line.
+ * @param {string} locale Requested locale, turned into a language directive.
+ * @param {ReadonlyArray<{ file: string, section: string, snippet: string }>} matches
+ *   Ranked corpus excerpts; may be empty.
+ * @returns {string} Newline-joined prompt: instructions, excerpts, then the query.
+ */
+function buildSearchPrompt(query, locale, matches) {
+  // Each excerpt becomes a three-line block; blocks are separated by a blank line.
+  const renderExcerpt = (hit, position) =>
+    `Source ${position + 1}: ${hit.file}\nSection: ${hit.section}\nSnippet: ${hit.snippet}`;
+
+  let excerpts =
+    "No local documentation excerpts were retrieved for this query (corpus missing or no token overlap).";
+  if (matches.length > 0) {
+    excerpts = matches.map(renderExcerpt).join("\n\n");
+  }
+
+  const promptLines = [
+    "You are a knowledge assistant for Telegram Mini App (TMA) and SpawnDock documentation.",
+    localeDirective(locale),
+    "When excerpts below are relevant, base your answer strictly on them.",
+    'Respond with valid JSON only (no markdown fences): {"answer":"...","sources":[{"file":"path.md","section":"Heading"}]}',
+    'List every excerpt source you used in "sources"; use [] only if excerpts were not used.',
+    "",
+    "Relevant local knowledge excerpts:",
+    excerpts,
+    "",
+    "User query:",
+    query,
+  ];
+  return promptLines.join("\n");
+}
+
 function readOpenapiYaml() {
   return readFileSync(openapiPath(), "utf8");
 }
@@ -222,11 +278,25 @@ function normalizeSearchBody(rawText) {
 }
 
 async function runSearchQuery(query, locale) {
-  const stdout = await runQwenPrompt(query);
+  const knowledgeRoot = resolveKnowledgeRoot();
+  let matches = [];
+  if (knowledgeRoot) {
+    try {
+      matches = rankKnowledgeForQuery(query, knowledgeRoot);
+    } catch (err) {
+      console.error("knowledge rank error:", err instanceof Error ? err.message : err);
+    }
+  }
+  const prompt = buildSearchPrompt(query, locale, matches);
+  const stdout = await runQwenPrompt(prompt);
   const normalized = normalizeSearchBody(extractQwenCliResult(stdout));
+  let sources = normalized.sources;
+  if (sources.length === 0 && matches.length > 0) {
+    sources = matches.map((m) => ({ file: m.file, section: m.section }));
+  }
   const meta = {};
   if (locale) meta.locale_requested = locale;
-  return { answer: normalized.answer, sources: normalized.sources, meta };
+  return { answer: normalized.answer, sources, meta };
 }
 
 function sendJson(res, status, body) {
diff --git a/docker/search/knowledge-rank.mjs b/docker/search/knowledge-rank.mjs
@@ -0,0 +1,171 @@
+/**
+ * Rank local markdown corpus for public search (mirrors src/local-search.ts logic).
+ * Uses Unicode-aware tokenization so Cyrillic queries match the corpus.
+ */
+import { readdirSync, readFileSync, statSync } from "node:fs";
+import { extname, join, relative } from "node:path";
+
+// Upper bound on the number of ranked documents returned per query.
+const MAX_RESULTS = 5;
+// Markdown files larger than this many bytes (2 MiB) are skipped during the corpus walk.
+const MAX_KNOWLEDGE_FILE_BYTES = 2 * 1024 ** 2;
+// Query tokens shorter than this are dropped.
+const MIN_TOKEN_LENGTH = 2;
+// Common English words removed from queries before scoring.
+const STOP_WORDS = new Set([
+  "a", "an", "and", "are", "for", "how", "is",
+  "into", "that", "the", "this", "what", "with",
+]);
+
+/**
+ * Split a free-text query into lowercase search tokens.
+ *
+ * The query is normalized to NFC so that decomposed input (for example "и" followed by a
+ * combining breve) becomes the precomposed letter. Combining marks (\p{M}) are kept inside
+ * tokens alongside letters and digits; without them, scripts that spell vowels with marks
+ * (Devanagari and others) would be cut apart at every mark.
+ *
+ * NOTE(review): the file header says this module mirrors src/local-search.ts; keep the two
+ * tokenizers in sync.
+ *
+ * @param {string} query Raw user query.
+ * @returns {string[]} Tokens of at least MIN_TOKEN_LENGTH characters that are not stop words.
+ */
+function tokenizeQuery(query) {
+  return query
+    .normalize("NFC")
+    .toLowerCase()
+    .split(/[^\p{L}\p{M}\p{N}]+/u)
+    .filter((token) => token.length >= MIN_TOKEN_LENGTH && !STOP_WORDS.has(token));
+}
+
+/**
+ * Count non-overlapping occurrences of `token` in `content`, scanning left to right.
+ *
+ * @param {string} content Text to search (callers pass it already lowercased).
+ * @param {string} token Substring to look for.
+ * @returns {number} Number of matches; 0 for an empty token.
+ */
+function countOccurrences(content, token) {
+  // An empty token matches at every index without advancing, which would never terminate.
+  if (token.length === 0) {
+    return 0;
+  }
+  let count = 0;
+  let index = content.indexOf(token);
+  while (index !== -1) {
+    count += 1;
+    // Resume after the current match so overlapping matches are not counted twice.
+    index = content.indexOf(token, index + token.length);
+  }
+  return count;
+}
+
+/**
+ * Find the markdown heading that governs the character at `matchIndex`.
+ *
+ * Lines are scanned from the top; the most recent ATX heading ("#" to "######") seen up to
+ * and including the line containing `matchIndex` is returned. Two cases are handled
+ * explicitly:
+ *  - CRLF files: the trailing "\r" is removed before the heading test, because "." does not
+ *    match "\r" and the heading pattern would otherwise never match.
+ *  - Fenced code blocks (``` or ~~~): lines inside a fence are not headings, so a "# comment"
+ *    in a shell snippet does not become the section name.
+ *
+ * @param {string} content Full document text.
+ * @param {number} matchIndex Character offset into `content`.
+ * @returns {string} Heading text, or "Overview" when no heading precedes the offset.
+ */
+function resolveSection(content, matchIndex) {
+  let section = "Overview";
+  let offset = 0;
+  // Marker of the currently open fence (e.g. "```"), or null when outside any fence.
+  let openFence = null;
+  for (const rawLine of content.split("\n")) {
+    // Offsets are measured on the raw line so they stay aligned with `content`.
+    const lineEnd = offset + rawLine.length;
+    const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine;
+    const fence = /^ {0,3}(`{3,}|~{3,})(.*)$/.exec(line);
+    if (fence !== null && openFence === null) {
+      openFence = fence[1];
+    } else if (fence !== null) {
+      // A closing fence uses the same character, is at least as long, and has no info string.
+      const closes =
+        fence[1][0] === openFence[0] && fence[1].length >= openFence.length && fence[2].trim() === "";
+      if (closes) {
+        openFence = null;
+      }
+    } else if (openFence === null) {
+      const heading = /^#{1,6}\s+(.+)$/.exec(line);
+      if (heading !== null) {
+        section = heading[1].trim();
+      }
+    }
+    if (matchIndex <= lineEnd) {
+      return section;
+    }
+    // +1 skips the "\n" consumed by split.
+    offset = lineEnd + 1;
+  }
+  return section;
+}
+
+/**
+ * Cut a short, single-line excerpt of `content` around `matchIndex`.
+ *
+ * Takes up to 120 characters before and 220 after the offset, collapses every whitespace
+ * run to one space, trims, and caps the result at 220 characters (217 plus "...").
+ *
+ * @param {string} content Full document text.
+ * @param {number} matchIndex Character offset the excerpt is anchored on.
+ * @returns {string} The excerpt.
+ */
+function extractSnippet(content, matchIndex) {
+  const from = matchIndex > 120 ? matchIndex - 120 : 0;
+  const to = Math.min(matchIndex + 220, content.length);
+  const collapsed = content.slice(from, to).replace(/\s+/g, " ").trim();
+  return collapsed.length > 220 ? `${collapsed.slice(0, 217)}...` : collapsed;
+}
+
+/**
+ * Score documents against a query and return the matching ones, best first.
+ *
+ * For each query token, every occurrence in the lowercased file path adds 10 points and
+ * every occurrence in the lowercased content adds 1. Documents scoring 0 are dropped. The
+ * section and snippet are anchored on the earliest content match of any token, or on
+ * offset 0 when only the file path matched.
+ *
+ * @param {string} query Raw user query.
+ * @param {ReadonlyArray<{ file: string, content: string }>} documents Corpus documents.
+ * @returns {Array<{ file: string, score: number, section: string, snippet: string }>}
+ *   Matches sorted by descending score, ties broken by file path.
+ */
+function rankDocuments(query, documents) {
+  const queryTokens = tokenizeQuery(query);
+  if (queryTokens.length === 0) {
+    return [];
+  }
+
+  const scored = [];
+  for (const doc of documents) {
+    const haystack = doc.content.toLowerCase();
+    const pathHaystack = doc.file.toLowerCase();
+    let total = 0;
+    let earliest = Number.POSITIVE_INFINITY;
+    for (const tok of queryTokens) {
+      total += countOccurrences(pathHaystack, tok) * 10 + countOccurrences(haystack, tok);
+      const at = haystack.indexOf(tok);
+      if (at !== -1 && at < earliest) {
+        earliest = at;
+      }
+    }
+    if (total === 0) {
+      continue;
+    }
+    // Path-only matches leave `earliest` infinite; anchor those on the start of the file.
+    const anchor = Number.isFinite(earliest) ? earliest : 0;
+    scored.push({
+      file: doc.file,
+      score: total,
+      section: resolveSection(doc.content, anchor),
+      snippet: extractSnippet(doc.content, anchor),
+    });
+  }
+
+  scored.sort((left, right) => right.score - left.score || left.file.localeCompare(right.file));
+  return scored;
+}
+
+/**
+ * Recursively collect the paths of markdown files under `dir`.
+ *
+ * Symbolic links are never followed. Only regular files with a ".md" extension and a size
+ * of at most MAX_KNOWLEDGE_FILE_BYTES are returned. The walk is best-effort: a directory
+ * that cannot be listed, or a file that cannot be stat'ed, is skipped rather than aborting
+ * the whole walk, so one unreadable subdirectory does not hide the rest of the corpus.
+ *
+ * @param {string} dir Directory to walk.
+ * @returns {string[]} File paths in directory-listing order, nested directories inline.
+ */
+function walkKnowledgeTree(dir) {
+  let entries;
+  try {
+    entries = readdirSync(dir, { withFileTypes: true });
+  } catch {
+    // Missing or unreadable directory: it contributes nothing, siblings are still visited.
+    return [];
+  }
+  const files = [];
+  for (const entry of entries) {
+    if (entry.isSymbolicLink()) {
+      continue;
+    }
+    const fullPath = join(dir, entry.name);
+    if (entry.isDirectory()) {
+      for (const nested of walkKnowledgeTree(fullPath)) {
+        files.push(nested);
+      }
+      continue;
+    }
+    if (entry.isFile() && extname(entry.name) === ".md") {
+      let size = 0;
+      try {
+        size = statSync(fullPath).size;
+      } catch {
+        // The file vanished or is unreadable between listing and stat; skip it.
+        continue;
+      }
+      if (size > MAX_KNOWLEDGE_FILE_BYTES) {
+        continue;
+      }
+      files.push(fullPath);
+    }
+  }
+  return files;
+}
+
+/**
+ * Read every markdown file found under `dir` into memory.
+ *
+ * Loading is best-effort: a failed directory walk yields an empty list, and a file that
+ * cannot be read is left out.
+ *
+ * @param {string} dir Corpus root directory.
+ * @returns {Array<{ file: string, content: string }>} Documents, where `file` is the path
+ *   relative to `dir` with forward slashes and `content` is the UTF-8 text.
+ */
+function loadKnowledgeDocuments(dir) {
+  let markdownPaths;
+  try {
+    markdownPaths = walkKnowledgeTree(dir);
+  } catch {
+    return [];
+  }
+
+  const documents = [];
+  for (const absolutePath of markdownPaths) {
+    try {
+      const file = relative(dir, absolutePath).replaceAll("\\", "/");
+      const content = readFileSync(absolutePath, "utf8");
+      documents.push({ file, content });
+    } catch {
+      // Unreadable file: leave it out and keep going.
+    }
+  }
+  return documents;
+}
+
+/**
+ * Rank the markdown corpus under `rootDir` against `query` and return the best matches.
+ *
+ * The corpus is read from disk on every call. At most MAX_RESULTS entries are returned and
+ * the internal score is not exposed.
+ *
+ * @param {string} query
+ * @param {string} rootDir
+ * @returns {ReadonlyArray<{ file: string, section: string, snippet: string }>}
+ */
+export function rankKnowledgeForQuery(query, rootDir) {
+  const topHits = rankDocuments(query, loadKnowledgeDocuments(rootDir)).slice(0, MAX_RESULTS);
+  return topHits.map(({ file, section, snippet }) => ({ file, section, snippet }));
+}
diff --git a/src/__tests__/local-search.test.ts b/src/__tests__/local-search.test.ts
@@ -13,6 +13,10 @@ describe("tokenizeQuery", () => {
   it("drops short tokens and stop words", () => {
     expect(tokenizeQuery("What is a Telegram Mini App?")).toEqual(["telegram", "mini", "app"]);
   });
+
+  it("keeps Cyrillic words as tokens", () => {
+    expect(tokenizeQuery("как сделать TMA")).toEqual(["как", "сделать", "tma"]);
+  });
 });
 
 describe("rankDocuments", () => {
@@ -32,6 +36,18 @@ describe("rankDocuments", () => {
     expect(results[0]?.file).toBe("guides/testing-tma.md");
     expect(results[0]?.section).toBe("Testing");
   });
+
+  it("matches Cyrillic query tokens in corpus text", () => {
+    const ruDocs = [
+      {
+        file: "guides/ru-tma.md",
+        content: "# Руководство\nКак сделать TMA в Telegram Mini App.",
+      },
+    ];
+    const results = rankDocuments("как сделать TMA", ruDocs);
+    expect(results.length).toBeGreaterThan(0);
+    expect(results[0]?.file).toBe("guides/ru-tma.md");
+  });
 });
 
 describe("searchLocalKnowledge", () => {
diff --git a/src/local-search.ts b/src/local-search.ts
@@ -133,7 +133,7 @@ export function rankDocuments(
 export function tokenizeQuery(query: string): ReadonlyArray<string> {
   return query
     .toLowerCase()
-    .split(/[^a-z0-9]+/g)
+    .split(/[^\p{L}\p{N}]+/u)
     .filter((token) => token.length >= MIN_TOKEN_LENGTH && !STOP_WORDS.has(token));
 }
 
diff --git a/src/qwen/prompts.ts b/src/qwen/prompts.ts
@@ -3,7 +3,7 @@ You have access to documentation files in /data/knowledge/.
 
 When answering questions:
 1. Search the documentation files for relevant information
-2. Provide a clear, actionable answer in Russian
+2. Provide a clear, actionable answer in the same language as the user query
 3. Include code examples when applicable
 4. Always respond with valid JSON in this format:
 {

Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,7 @@ export function rankDocuments(`
`133`	`133`	`export function tokenizeQuery(query: string): ReadonlyArray<string> {`
`134`	`134`	`return query`
`135`	`135`	`.toLowerCase()`
`136`		`- .split(/[^a-z0-9]+/g)`
	`136`	`+ .split(/[^\p{L}\p{N}]+/u)`
`137`	`137`	`.filter((token) => token.length >= MIN_TOKEN_LENGTH && !STOP_WORDS.has(token));`
`138`	`138`	`}`
`139`	`139`
Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@ You have access to documentation files in /data/knowledge/.`
`3`	`3`
`4`	`4`	`When answering questions:`
`5`	`5`	`1. Search the documentation files for relevant information`
`6`		`-2. Provide a clear, actionable answer in Russian`
	`6`	`+2. Provide a clear, actionable answer in the same language as the user query`
`7`	`7`	`3. Include code examples when applicable`
`8`	`8`	`4. Always respond with valid JSON in this format:`
`9`	`9`	`{`