Skip to content

Commit ca5110c

Browse files
committed
fix(search): use corpus excerpts, honor locale, Unicode tokenization
Wire /corpus (and baked knowledge/) into the Qwen prompt like MCP path. Add KNOWLEDGE_ROOT and locale directives so ru/en answers match request. Fallback sources from ranked docs when the model omits them. Align local-search tokenization with \p{L}\p{N} for Cyrillic queries. Made-with: Cursor
1 parent abf5ff4 commit ca5110c

7 files changed

Lines changed: 264 additions & 4 deletions

File tree

docker-compose.prod.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ services:
1919
SEARCH_SERVER: "1"
2020
QWEN_HTTP_PORT: "8790"
2121
QWEN_OAUTH: "true"
22+
KNOWLEDGE_ROOT: /corpus
2223
volumes:
2324
- ./knowledge:/corpus:ro
2425
restart: unless-stopped

docker/search/Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ WORKDIR /workspace
1414
COPY docker/search/entrypoint.sh /usr/local/bin/qwen-entrypoint
1515
COPY docker/search/qwen-search.sh /usr/local/bin/qwen-search
1616
COPY docker/search/http-server.mjs /opt/search/http-server.mjs
17+
COPY docker/search/knowledge-rank.mjs /opt/search/knowledge-rank.mjs
1718
COPY openapi/knowledge-v1.yaml /opt/search/openapi.yaml
19+
COPY knowledge /opt/search/knowledge
1820

1921
RUN chmod +x /usr/local/bin/qwen-entrypoint /usr/local/bin/qwen-search /opt/search/http-server.mjs
2022

docker/search/http-server.mjs

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { spawn } from "node:child_process";
88
import { readFileSync, existsSync } from "node:fs";
99
import { fileURLToPath } from "node:url";
1010
import { dirname, join } from "node:path";
11+
import { rankKnowledgeForQuery } from "./knowledge-rank.mjs";
1112

1213
const __dirname = dirname(fileURLToPath(import.meta.url));
1314
const PORT = parseInt(process.env.SEARCH_HTTP_PORT || process.env.QWEN_HTTP_PORT || "8790", 10);
@@ -44,6 +45,61 @@ function openapiPath() {
4445
return join(__dirname, "..", "..", "openapi", "knowledge-v1.yaml");
4546
}
4647

48+
/**
 * Locate the markdown corpus directory.
 * Prod mounts the repo `knowledge` folder at /corpus; the image also bakes a copy
 * at /opt/search/knowledge.
 *
 * @returns {string} The first existing path, trying KNOWLEDGE_ROOT before the
 *   built-in fallbacks, or "" when none of them exists.
 */
function resolveKnowledgeRoot() {
  const configured = (process.env.KNOWLEDGE_ROOT || "").trim();
  const searchOrder = [
    configured,
    "/corpus",
    "/opt/search/knowledge",
    join(__dirname, "..", "..", "knowledge"),
  ];
  // An unset/blank KNOWLEDGE_ROOT is skipped explicitly so it is never probed.
  const found = searchOrder.find((candidate) => candidate !== "" && existsSync(candidate));
  return found === undefined ? "" : found;
}
62+
63+
/**
 * Build the prompt sentence that tells the model which language to answer in.
 *
 * `locale` is caller-supplied (presumably straight from the request), so it is
 * validated before being interpolated into the prompt: only a short,
 * BCP 47-shaped tag (letters/digits separated by "-") is echoed back. Anything
 * else — including non-string values or text containing whitespace/newlines —
 * falls back to the neutral "same language as the query" directive instead of
 * letting arbitrary text become part of the model instructions.
 *
 * @param {string | null | undefined} locale Requested locale, e.g. "ru", "en-US", "pt_BR".
 * @returns {string} A single-sentence instruction for the model.
 */
function localeDirective(locale) {
  const sameAsQuery = "Write the answer in the same language as the user query.";
  // Accept POSIX-style "pt_BR" by normalising the separator to the BCP 47 "-".
  const tag = typeof locale === "string" ? locale.trim().replace(/_/g, "-") : "";
  if (!tag) {
    return sameAsQuery;
  }
  const lower = tag.toLowerCase();
  if (lower === "ru" || lower.startsWith("ru-")) {
    return "You MUST write the entire answer in Russian.";
  }
  if (lower === "en" || lower.startsWith("en-")) {
    return "You MUST write the entire answer in English.";
  }
  // 35 characters is the recommended minimum field size for language tags in RFC 5646.
  const looksLikeLanguageTag = tag.length <= 35 && /^[a-z]{2,8}(?:-[a-z0-9]{1,8})*$/i.test(tag);
  if (!looksLikeLanguageTag) {
    return sameAsQuery;
  }
  return `You MUST write the entire answer in the primary language for locale ${tag} (BCP 47).`;
}
76+
77+
/**
 * Assemble the full model prompt for one search request.
 *
 * The prompt contains, in order: the assistant role, the language directive,
 * the grounding and JSON-format rules, the retrieved excerpts (or a notice
 * that none were found) and finally the user query.
 *
 * @param {string} query User query, appended verbatim as the last line.
 * @param {string} locale Requested locale, passed to localeDirective.
 * @param {ReadonlyArray<{ file: string, section: string, snippet: string }>} matches
 *   Ranked corpus excerpts; may be empty.
 * @returns {string} Newline-joined prompt text.
 */
function buildSearchPrompt(query, locale, matches) {
  let context;
  if (matches.length > 0) {
    const excerptBlocks = [];
    matches.forEach((match, position) => {
      // Sources are numbered from 1 for the model's benefit.
      excerptBlocks.push(
        `Source ${position + 1}: ${match.file}\nSection: ${match.section}\nSnippet: ${match.snippet}`,
      );
    });
    context = excerptBlocks.join("\n\n");
  } else {
    context =
      "No local documentation excerpts were retrieved for this query (corpus missing or no token overlap).";
  }

  const promptLines = [
    "You are a knowledge assistant for Telegram Mini App (TMA) and SpawnDock documentation.",
    localeDirective(locale),
    "When excerpts below are relevant, base your answer strictly on them.",
    'Respond with valid JSON only (no markdown fences): {"answer":"...","sources":[{"file":"path.md","section":"Heading"}]}',
    "List every excerpt source you used in \"sources\"; use [] only if excerpts were not used.",
    "",
    "Relevant local knowledge excerpts:",
    context,
    "",
    "User query:",
    query,
  ];
  return promptLines.join("\n");
}
102+
47103
function readOpenapiYaml() {
48104
return readFileSync(openapiPath(), "utf8");
49105
}
@@ -222,11 +278,25 @@ function normalizeSearchBody(rawText) {
222278
}
223279

224280
async function runSearchQuery(query, locale) {
225-
const stdout = await runQwenPrompt(query);
281+
const knowledgeRoot = resolveKnowledgeRoot();
282+
let matches = [];
283+
if (knowledgeRoot) {
284+
try {
285+
matches = rankKnowledgeForQuery(query, knowledgeRoot);
286+
} catch (err) {
287+
console.error("knowledge rank error:", err instanceof Error ? err.message : err);
288+
}
289+
}
290+
const prompt = buildSearchPrompt(query, locale, matches);
291+
const stdout = await runQwenPrompt(prompt);
226292
const normalized = normalizeSearchBody(extractQwenCliResult(stdout));
293+
let sources = normalized.sources;
294+
if (sources.length === 0 && matches.length > 0) {
295+
sources = matches.map((m) => ({ file: m.file, section: m.section }));
296+
}
227297
const meta = {};
228298
if (locale) meta.locale_requested = locale;
229-
return { answer: normalized.answer, sources: normalized.sources, meta };
299+
return { answer: normalized.answer, sources, meta };
230300
}
231301

232302
function sendJson(res, status, body) {

docker/search/knowledge-rank.mjs

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/**
2+
* Rank local markdown corpus for public search (mirrors src/local-search.ts logic).
3+
* Uses Unicode-aware tokenization so Cyrillic queries match the corpus.
4+
*/
5+
import { readdirSync, readFileSync, statSync } from "node:fs";
6+
import { extname, join, relative } from "node:path";
7+
8+
/** Upper bound on the number of ranked documents returned by rankKnowledgeForQuery. */
const MAX_RESULTS = 5;
9+
/** Markdown files larger than this many bytes (2 MiB) are skipped while walking the corpus. */
const MAX_KNOWLEDGE_FILE_BYTES = 2 * 1024 * 1024;
10+
/** Query tokens shorter than this many UTF-16 code units are dropped by tokenizeQuery. */
const MIN_TOKEN_LENGTH = 2;
11+
/** Lowercase English filler words that tokenizeQuery removes from a query. */
const STOP_WORDS = new Set([
12+
"a",
13+
"an",
14+
"and",
15+
"are",
16+
"for",
17+
"how",
18+
"is",
19+
"into",
20+
"that",
21+
"the",
22+
"this",
23+
"what",
24+
"with",
25+
]);
26+
27+
/**
 * Split a query into lowercase search tokens.
 *
 * Any run of characters that are neither Unicode letters nor Unicode numbers
 * acts as a separator, so Cyrillic (and other non-Latin) words survive intact.
 * Tokens shorter than MIN_TOKEN_LENGTH and entries of STOP_WORDS are discarded.
 *
 * @param {string} query
 * @returns {string[]} Tokens in their original order (duplicates are kept).
 */
function tokenizeQuery(query) {
  const kept = [];
  for (const piece of query.toLowerCase().split(/[^\p{L}\p{N}]+/u)) {
    if (piece.length < MIN_TOKEN_LENGTH) {
      continue;
    }
    if (STOP_WORDS.has(piece)) {
      continue;
    }
    kept.push(piece);
  }
  return kept;
}
33+
34+
/**
 * Count non-overlapping occurrences of `token` in `content`.
 *
 * @param {string} content Text to scan.
 * @param {string} token Substring to look for.
 * @returns {number} Number of non-overlapping matches; 0 for an empty token.
 */
function countOccurrences(content, token) {
  // An empty needle matches at every index without advancing the cursor,
  // which would loop forever — treat it as "no occurrences".
  if (!token) {
    return 0;
  }
  let count = 0;
  for (
    let index = content.indexOf(token);
    index !== -1;
    index = content.indexOf(token, index + token.length)
  ) {
    count += 1;
  }
  return count;
}
43+
44+
/**
 * Find the markdown heading that governs the character at `matchIndex`.
 *
 * Lines are scanned top to bottom; the most recent ATX heading ("#" to "######"
 * followed by whitespace and text) seen at or before the line containing
 * `matchIndex` wins. A heading line that itself contains the match counts.
 *
 * Works for both LF and CRLF files: the trailing "\r" of a CRLF line is ignored
 * when testing for a heading (otherwise `.+$` can never reach the end of the
 * line and no heading is ever recognised), while offsets are still computed on
 * the untouched text so `matchIndex` keeps referring to `content`.
 *
 * @param {string} content Full document text.
 * @param {number} matchIndex Offset into `content` (UTF-16 code units).
 * @returns {string} Heading text, or "Overview" when no heading precedes the match.
 */
function resolveSection(content, matchIndex) {
  let section = "Overview";
  let offset = 0;
  for (const rawLine of content.split("\n")) {
    const lineEnd = offset + rawLine.length;
    const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine;
    if (/^#{1,6}\s+.+$/.test(line)) {
      section = line.replace(/^#{1,6}\s+/, "").trim();
    }
    if (matchIndex <= lineEnd) {
      return section;
    }
    // +1 skips the "\n" that split() removed.
    offset = lineEnd + 1;
  }
  return section;
}
60+
61+
/**
 * Cut a short, single-line excerpt of `content` around `matchIndex`.
 *
 * The window spans up to 120 code units before and 220 after the match;
 * whitespace runs are collapsed to one space and the result is capped at
 * 220 characters (217 plus "..." when truncated).
 *
 * Every cut point is nudged so that it never lands between the two halves of
 * a UTF-16 surrogate pair; otherwise text with emoji or other astral
 * characters could yield a lone surrogate, which is not valid Unicode text.
 *
 * @param {string} content Full document text.
 * @param {number} matchIndex Offset of the match inside `content`.
 * @returns {string} Whitespace-normalised excerpt, at most 220 code units long.
 */
function extractSnippet(content, matchIndex) {
  // True when cutting `text` at `index` would separate a high surrogate from its low surrogate.
  const splitsPair = (text, index) => {
    if (index <= 0 || index >= text.length) {
      return false;
    }
    const before = text.charCodeAt(index - 1);
    const after = text.charCodeAt(index);
    return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  };

  let windowStart = Math.max(0, matchIndex - 120);
  let windowEnd = Math.min(content.length, matchIndex + 220);
  if (splitsPair(content, windowStart)) {
    windowStart += 1; // drop the orphaned low surrogate
  }
  if (windowEnd > windowStart && splitsPair(content, windowEnd)) {
    windowEnd -= 1; // drop the orphaned high surrogate
  }

  const rawSnippet = content
    .slice(windowStart, windowEnd)
    .replace(/\s+/g, " ")
    .trim();
  if (rawSnippet.length <= 220) {
    return rawSnippet;
  }
  const cut = splitsPair(rawSnippet, 217) ? 216 : 217;
  return `${rawSnippet.slice(0, cut)}...`;
}
73+
74+
/**
 * Score every document against the query and return the hits, best first.
 *
 * Scoring is case-insensitive substring counting: each token occurrence in the
 * file path is worth 10 points and each occurrence in the body 1 point.
 * Documents scoring 0 are dropped. The section and snippet are taken at the
 * earliest body position where any token occurs (position 0 when only the
 * path matched). Ties are broken by file path, ascending.
 *
 * @param {string} query Raw user query.
 * @param {ReadonlyArray<{ file: string, content: string }>} documents
 * @returns {Array<{ file: string, score: number, section: string, snippet: string }>}
 */
function rankDocuments(query, documents) {
  const tokens = tokenizeQuery(query);
  if (tokens.length === 0) {
    return [];
  }

  const hits = [];
  for (const doc of documents) {
    const body = doc.content.toLowerCase();
    const path = doc.file.toLowerCase();

    let score = 0;
    let earliest = Number.POSITIVE_INFINITY;
    for (const token of tokens) {
      score += countOccurrences(path, token) * 10 + countOccurrences(body, token);
      const at = body.indexOf(token);
      if (at !== -1) {
        earliest = Math.min(earliest, at);
      }
    }
    if (score === 0) {
      continue;
    }

    // Path-only hits have no body position; anchor them at the document start.
    const anchor = Number.isFinite(earliest) ? earliest : 0;
    hits.push({
      file: doc.file,
      score,
      section: resolveSection(doc.content, anchor),
      snippet: extractSnippet(doc.content, anchor),
    });
  }

  hits.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
  return hits;
}
108+
109+
/**
 * Recursively collect the paths of all `.md` files under `dir`.
 *
 * Symbolic links are never followed, and files that cannot be stat'ed or that
 * exceed MAX_KNOWLEDGE_FILE_BYTES are skipped. A subdirectory that cannot be
 * listed is skipped as well, so one unreadable folder does not throw away the
 * files already found elsewhere in the tree.
 *
 * @param {string} dir Directory to walk.
 * @returns {string[]} Paths built by joining `dir` with each entry name.
 * @throws If `dir` itself cannot be listed.
 */
function walkKnowledgeTree(dir) {
  const entries = readdirSync(dir, { withFileTypes: true });
  const files = [];
  for (const entry of entries) {
    const fullPath = join(dir, entry.name);
    if (entry.isSymbolicLink()) {
      continue;
    }
    if (entry.isDirectory()) {
      let nested;
      try {
        nested = walkKnowledgeTree(fullPath);
      } catch {
        // Best effort: an unreadable subdirectory must not abort the whole walk.
        continue;
      }
      // Plain loop instead of push(...nested): spreading a very large array can
      // exceed the engine's argument limit.
      for (const nestedPath of nested) {
        files.push(nestedPath);
      }
      continue;
    }
    if (entry.isFile() && extname(entry.name) === ".md") {
      let size = 0;
      try {
        size = statSync(fullPath).size;
      } catch {
        continue;
      }
      if (size > MAX_KNOWLEDGE_FILE_BYTES) {
        continue;
      }
      files.push(fullPath);
    }
  }
  return files;
}
136+
137+
/**
 * Read every markdown file found under `dir` into memory.
 *
 * Best effort throughout: if the tree cannot be walked the result is empty,
 * and any individual file that cannot be read is left out.
 *
 * @param {string} dir Corpus root.
 * @returns {Array<{ file: string, content: string }>} `file` is the path
 *   relative to `dir` with "/" separators; `content` is the UTF-8 text.
 */
function loadKnowledgeDocuments(dir) {
  let markdownPaths;
  try {
    markdownPaths = walkKnowledgeTree(dir);
  } catch {
    return [];
  }

  const documents = [];
  for (const absolutePath of markdownPaths) {
    try {
      const file = relative(dir, absolutePath).replaceAll("\\", "/");
      const content = readFileSync(absolutePath, "utf8");
      documents.push({ file, content });
    } catch {
      // Unreadable file: skip it and keep the rest of the corpus.
    }
  }
  return documents;
}
157+
158+
/**
 * Load the corpus under `rootDir`, rank it against `query` and return the best
 * matches (at most MAX_RESULTS) without their internal score.
 *
 * @param {string} query
 * @param {string} rootDir
 * @returns {ReadonlyArray<{ file: string, section: string, snippet: string }>}
 */
export function rankKnowledgeForQuery(query, rootDir) {
  const best = rankDocuments(query, loadKnowledgeDocuments(rootDir)).slice(0, MAX_RESULTS);
  return best.map(({ file, section, snippet }) => ({ file, section, snippet }));
}

src/__tests__/local-search.test.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ describe("tokenizeQuery", () => {
1313
it("drops short tokens and stop words", () => {
1414
expect(tokenizeQuery("What is a Telegram Mini App?")).toEqual(["telegram", "mini", "app"]);
1515
});
16+
17+
it("keeps Cyrillic words as tokens", () => {
18+
expect(tokenizeQuery("как сделать TMA")).toEqual(["как", "сделать", "tma"]);
19+
});
1620
});
1721

1822
describe("rankDocuments", () => {
@@ -32,6 +36,18 @@ describe("rankDocuments", () => {
3236
expect(results[0]?.file).toBe("guides/testing-tma.md");
3337
expect(results[0]?.section).toBe("Testing");
3438
});
39+
40+
it("matches Cyrillic query tokens in corpus text", () => {
41+
const ruDocs = [
42+
{
43+
file: "guides/ru-tma.md",
44+
content: "# Руководство\nКак сделать TMA в Telegram Mini App.",
45+
},
46+
];
47+
const results = rankDocuments("как сделать TMA", ruDocs);
48+
expect(results.length).toBeGreaterThan(0);
49+
expect(results[0]?.file).toBe("guides/ru-tma.md");
50+
});
3551
});
3652

3753
describe("searchLocalKnowledge", () => {

src/local-search.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ export function rankDocuments(
133133
export function tokenizeQuery(query: string): ReadonlyArray<string> {
134134
return query
135135
.toLowerCase()
136-
.split(/[^a-z0-9]+/g)
136+
.split(/[^\p{L}\p{N}]+/u)
137137
.filter((token) => token.length >= MIN_TOKEN_LENGTH && !STOP_WORDS.has(token));
138138
}
139139

src/qwen/prompts.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ You have access to documentation files in /data/knowledge/.
33
44
When answering questions:
55
1. Search the documentation files for relevant information
6-
2. Provide a clear, actionable answer in Russian
6+
2. Provide a clear, actionable answer in the same language as the user query
77
3. Include code examples when applicable
88
4. Always respond with valid JSON in this format:
99
{

0 commit comments

Comments
 (0)