Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@

## [Unreleased]

### Changes

- `QMD_FTS_TOKENIZER` env var lets you swap the FTS5 tokenizer used for
`documents_fts`. Defaults to the existing `porter unicode61`. Set to
`trigram` to make BM25 usable on CJK / mixed-language corpora —
`unicode61` only breaks tokens at whitespace and punctuation, so each
unbroken run of CJK text is indexed as a single token and substring
queries return zero hits. Allowed values: `porter unicode61`,
`porter ascii`, `unicode61`, `ascii`, `trigram`. Only affects newly
created indexes; existing databases need a rebuild.

### Fixes

- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,7 @@ llm_cache -- Cached LLM responses (query expansion, rerank scores)
| Variable | Default | Description |
|----------|---------|-------------|
| `XDG_CACHE_HOME` | `~/.cache` | Cache directory location |
| `QMD_FTS_TOKENIZER` | `porter unicode61` | FTS5 tokenizer for `documents_fts`. Set to `trigram` for usable BM25 on CJK / mixed-language corpora — `unicode61` only breaks tokens at whitespace and punctuation, so each unbroken run of CJK text is indexed as a single token and substring queries return zero hits. Allowed values: `porter unicode61`, `porter ascii`, `unicode61`, `ascii`, `trigram`. Only affects newly created indexes — existing databases need a rebuild (delete `~/.cache/qmd/index.sqlite`, then re-index) for the change to take effect. |

## How It Works

Expand Down
46 changes: 43 additions & 3 deletions src/store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,43 @@ export function toVirtualPath(db: Database, absolutePath: string): string | null
// Database initialization
// =============================================================================

/** Tokenizer used when QMD_FTS_TOKENIZER is unset, empty, or whitespace-only. */
const DEFAULT_FTS_TOKENIZER = "porter unicode61";

// Canonical FTS5 tokenizer specs that are safe to interpolate into the
// CREATE VIRTUAL TABLE statement. Only these built-in SQLite FTS5 tokenizer
// combinations are accepted; any other value (including the name of an
// extension-provided tokenizer) is rejected by getFtsTokenizer().
const ALLOWED_FTS_TOKENIZERS: ReadonlySet<string> = new Set([
  "porter unicode61",
  "porter ascii",
  "unicode61",
  "ascii",
  "trigram",
]);

/**
 * Resolve the FTS5 tokenizer to use for documents_fts.
 *
 * Reads the QMD_FTS_TOKENIZER environment variable. The value is trimmed,
 * lower-cased, and runs of internal whitespace are collapsed to a single
 * space before it is checked against the whitelist, so "Trigram" or
 * "porter   unicode61" resolve to their canonical spelling. The returned
 * string is always a member of the whitelist (or the default), which is what
 * makes it safe to interpolate into SQL.
 *
 * Defaults to "porter unicode61" (English-tuned). Set QMD_FTS_TOKENIZER to
 * "trigram" for usable BM25 on CJK / mixed-language corpora: unicode61 only
 * breaks tokens at whitespace and punctuation, so an unbroken run of CJK
 * text is indexed as a single token.
 *
 * NOTE: per the SQLite FTS5 documentation, the trigram tokenizer requires
 * SQLite 3.34.0 or newer, and full-text query terms shorter than three
 * characters do not match any rows.
 *
 * Changing this value only affects newly created documents_fts tables,
 * because the table is created with CREATE VIRTUAL TABLE IF NOT EXISTS.
 * Existing indexes must be rebuilt (e.g. delete the index database file and
 * re-index) for the change to take effect.
 *
 * @returns The canonical tokenizer spec to place in `tokenize='...'`.
 * @throws Error if QMD_FTS_TOKENIZER is set to a value outside the whitelist.
 */
export function getFtsTokenizer(): string {
  const raw = process.env.QMD_FTS_TOKENIZER?.trim();
  if (!raw) return DEFAULT_FTS_TOKENIZER;

  // Compare in canonical form so harmless case / spacing differences are
  // accepted. Only the canonical whitelist entry is ever returned.
  const normalized = raw.toLowerCase().replace(/\s+/g, " ");
  if (!ALLOWED_FTS_TOKENIZERS.has(normalized)) {
    // Report the value as the user wrote it so it is easy to find and fix.
    throw new Error(
      `Invalid QMD_FTS_TOKENIZER value: "${raw}". ` +
        `Allowed: ${[...ALLOWED_FTS_TOKENIZERS].map(t => `"${t}"`).join(", ")}.`
    );
  }
  return normalized;
}

function createSqliteVecUnavailableError(reason: string): Error {
return new Error(
Expand Down Expand Up @@ -831,12 +868,15 @@ function initializeDatabase(db: Database): void {
`);

// FTS - index filepath (collection/path), title, and content
db.exec(`
// tokenize is interpolated from a strict whitelist (see getFtsTokenizer)
const tokenizer = getFtsTokenizer();
const createFtsSql = `
CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
filepath, title, body,
tokenize='porter unicode61'
tokenize='${tokenizer}'
)
`);
`;
db.exec(createFtsSql);

// Triggers to keep FTS in sync
db.exec(`
Expand Down
65 changes: 65 additions & 0 deletions test/store.helpers.unit.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {
handelize,
cleanupOrphanedVectors,
sanitizeFTS5Term,
getFtsTokenizer,
} from "../src/store";

// =============================================================================
Expand Down Expand Up @@ -287,3 +288,67 @@ describe("sanitizeFTS5Term", () => {
expect(sanitizeFTS5Term("日本語")).toBe("日本語");
});
});

// =============================================================================
// FTS Tokenizer Selection
// =============================================================================

describe("getFtsTokenizer", () => {
  const ENV_KEY = "QMD_FTS_TOKENIZER";

  // Write `value` into the environment; `undefined` means "remove the variable".
  function applyEnv(value: string | undefined): void {
    if (value !== undefined) {
      process.env[ENV_KEY] = value;
      return;
    }
    delete process.env[ENV_KEY];
  }

  // Run `fn` with QMD_FTS_TOKENIZER temporarily set to `value`, restoring the
  // previous state afterwards even when `fn` throws.
  function withEnv(value: string | undefined, fn: () => void): void {
    const saved = process.env[ENV_KEY];
    applyEnv(value);
    try {
      fn();
    } finally {
      applyEnv(saved);
    }
  }

  test("returns the porter unicode61 default when the env var is unset", () => {
    withEnv(undefined, () => {
      const resolved = getFtsTokenizer();
      expect(resolved).toBe("porter unicode61");
    });
  });

  test("returns the porter unicode61 default when the env var is empty", () => {
    withEnv("", () => {
      const resolved = getFtsTokenizer();
      expect(resolved).toBe("porter unicode61");
    });
  });

  test("trims surrounding whitespace before validating", () => {
    withEnv(" trigram ", () => {
      const resolved = getFtsTokenizer();
      expect(resolved).toBe("trigram");
    });
  });

  test("accepts each whitelisted tokenizer", () => {
    const whitelisted = ["porter unicode61", "porter ascii", "unicode61", "ascii", "trigram"];
    whitelisted.forEach((name) => {
      withEnv(name, () => {
        expect(getFtsTokenizer()).toBe(name);
      });
    });
  });

  test("rejects unknown values", () => {
    withEnv("icu", () => {
      const resolve = (): string => getFtsTokenizer();
      expect(resolve).toThrow(/Invalid QMD_FTS_TOKENIZER/);
    });
  });

  test("rejects values that try to inject extra SQL", () => {
    withEnv("trigram') --", () => {
      const resolve = (): string => getFtsTokenizer();
      expect(resolve).toThrow(/Invalid QMD_FTS_TOKENIZER/);
    });
  });
});