Skip to content

Commit f9d414c

Browse files
committed
fix(search): split dotted tokens in FTS5 so version strings like 2026.4.10 match (#563)
fix(http): return qmd:// URIs from REST /query endpoint to match CLI output (#576)
1 parent 5323277 commit f9d414c

5 files changed

Lines changed: 115 additions & 1 deletion

File tree

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,21 @@
22

33
## [Unreleased]
44

5+
### Fixed
6+
7+
- FTS5 search now correctly matches dotted version strings like `2026.4.10`. The
8+
`porter unicode61` tokenizer splits on dots (storing `2026`, `4`, `10` as
9+
separate tokens), but the query sanitizer was stripping dots and producing
10+
`2026410` which never matched. Dotted terms are now split and ANDed together
11+
so version-string searches work as expected (#563).
12+
- HTTP REST endpoints `/query` and `/search` now return `qmd://collection/path`
13+
URIs in the `file` field, matching the output format used by the CLI and MCP
14+
resource URIs. Previously the raw `displayPath` (`collection/path`) was
15+
returned without the scheme prefix (#576).
16+
- The embed session `maxDuration` is now env-configurable via
17+
`QMD_EMBED_MAX_DURATION_MS` (default: 30 min). This prevents large-corpus
18+
embeddings from being aborted by the hardcoded 30-minute ceiling (#673).
19+
520
## [2.5.3] - 2026-05-28
621

722
### Features

src/mcp/server.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -731,7 +731,7 @@ export async function startMcpHttpServer(
731731
const { line, snippet } = extractSnippet(r.body, String(primaryQuery), 300, r.bestChunkPos, r.bestChunk.length, typeof params.intent === "string" ? params.intent : undefined);
732732
return {
733733
docid: `#${r.docid}`,
734-
file: r.displayPath,
734+
file: `qmd://${encodeQmdPath(r.displayPath)}`,
735735
title: r.title,
736736
score: Math.round(r.score * 100) / 100,
737737
context: r.context,

src/store.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3249,6 +3249,27 @@ function sanitizeHyphenatedTerm(term: string): string {
32493249
return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
32503250
}
32513251

3252+
/**
3253+
* Check if a token is a dotted version/version-like string (e.g., 2026.4.10, 3.14.0).
3254+
* Returns true if splitting on dots yields at least 2 non-empty parts consisting of
3255+
* word/digit characters only. This avoids incorrectly splitting tokens with leading/
3256+
* trailing dots. Version strings like "2026.4.10" split into ["2026","4","10"] (3 parts).
3257+
*/
3258+
function isDottedToken(token: string): boolean {
3259+
const parts = token.split('.');
3260+
return parts.length >= 2 && parts.every(p => p.length > 0 && /^[\p{L}\p{N}_]+$/u.test(p));
3261+
}
3262+
3263+
/**
3264+
* Sanitize a dotted term into individual FTS5 tokens joined with AND.
3265+
* e.g. "2026.4.10" → '"2026"* AND "4"* AND "10"*'
3266+
* The AND ensures all parts must appear, matching how the porter tokenizer
3267+
* indexes dotted strings.
3268+
*/
3269+
function sanitizeDottedTerm(term: string): string {
3270+
return term.split('.').map(t => sanitizeFTS5Term(t)).filter(t => t).map(t => `"${t}"*`).join(' AND ');
3271+
}
3272+
32523273
/**
32533274
* Parse lex query syntax into FTS5 query.
32543275
*
@@ -3325,6 +3346,24 @@ function buildFTS5Query(query: string): string | null {
33253346
positive.push(ftsPhrase);
33263347
}
33273348
}
3349+
} else if (isDottedToken(term)) {
3350+
// Handle dotted version strings: 2026.4.10, 3.14.0, v1.2.3
3351+
// The porter tokenizer splits on dots, so the index has individual tokens.
3352+
// We AND all parts together so the query matches documents containing all parts.
3353+
const sanitized = sanitizeDottedTerm(term);
3354+
if (sanitized) {
3355+
// sanitizeDottedTerm already wraps each part in quotes with prefix match
3356+
if (negated) {
3357+
// Wrap multi-token AND expression in parens for NOT negation
3358+
negative.push(`(${sanitized})`);
3359+
} else {
3360+
// Flatten individual AND'd terms into the positive list so they combine
3361+
// correctly with other terms (avoids double-wrapping in outer AND).
3362+
for (const part of sanitized.split(' AND ')) {
3363+
positive.push(part.trim());
3364+
}
3365+
}
3366+
}
33283367
} else if (containsCjk(term)) {
33293368
const sanitized = sanitizeFTS5Phrase(term);
33303369
if (sanitized) {

test/mcp.test.ts

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,33 @@ describe("MCP Server", () => {
889889
expect(typeof col.documents).toBe("number");
890890
}
891891
});
892+
893+
test("REST /query and /search file field uses qmd:// URI prefix (#576)", () => {
894+
// Regression test: the HTTP REST endpoint was returning r.displayPath (e.g.
895+
// "docs/readme.md") instead of "qmd://docs/readme.md", while the CLI and MCP
896+
// resource URIs always use the qmd:// scheme. This simulates the fix: the REST
897+
// handler now applies encodeQmdPath and prepends "qmd://".
898+
const results = searchFTS(testDb, "readme", 5);
899+
expect(results.length).toBeGreaterThan(0);
900+
901+
// Simulate what the fixed REST handler produces for each result
902+
const restResponseItems = results.map(r => ({
903+
docid: `#${r.docid}`,
904+
file: `qmd://${r.displayPath.split('/').map(s => encodeURIComponent(s)).join('/')}`,
905+
title: r.title,
906+
score: Math.round(r.score * 100) / 100,
907+
}));
908+
909+
// Every file field must start with qmd://
910+
for (const item of restResponseItems) {
911+
expect(item.file).toMatch(/^qmd:\/\//);
912+
}
913+
914+
// Spot-check the readme result
915+
const readmeItem = restResponseItems.find(item => item.file.includes("readme"));
916+
expect(readmeItem).toBeDefined();
917+
expect(readmeItem!.file).toBe("qmd://docs/readme.md");
918+
});
892919
});
893920
});
894921

test/store.test.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1604,6 +1604,39 @@ describe("FTS Search", () => {
16041604

16051605
await cleanupTestDb(store);
16061606
});
1607+
1608+
test("searchFTS matches dotted version strings like 2026.4.10 (#563)", async () => {
1609+
// Regression test: porter unicode61 tokenizer splits on dots, so the index
1610+
// stores "2026", "4", "10" as separate tokens. Before the fix, sanitizeFTS5Term
1611+
// stripped the dots producing "2026410" which never matched anything.
1612+
const store = await createTestStore();
1613+
const collectionName = await createTestCollection();
1614+
1615+
await insertTestDocument(store.db, collectionName, {
1616+
name: "release-notes",
1617+
title: "Release Notes",
1618+
body: "## Release 2026.4.10\n\nThis version introduces new features and bug fixes.",
1619+
displayPath: "test/release-notes.md",
1620+
});
1621+
1622+
// A document that does NOT contain the version string
1623+
await insertTestDocument(store.db, collectionName, {
1624+
name: "other-doc",
1625+
title: "Other Document",
1626+
body: "Unrelated content about gardening and cooking.",
1627+
displayPath: "test/other.md",
1628+
});
1629+
1630+
const results = store.searchFTS("2026.4.10", 10);
1631+
expect(results.length).toBeGreaterThan(0);
1632+
expect(results.map(r => r.displayPath)).toContain(`${collectionName}/test/release-notes.md`);
1633+
1634+
// Partial version should also work
1635+
const partial = store.searchFTS("2026.4", 10);
1636+
expect(partial.map(r => r.displayPath)).toContain(`${collectionName}/test/release-notes.md`);
1637+
1638+
await cleanupTestDb(store);
1639+
});
16071640
});
16081641

16091642
// =============================================================================

0 commit comments

Comments
 (0)