Skip to content

Commit 2997cdb

Browse files
author
Your Name
committed
fix(search): pure BM25 relevance ranking + camelCase token splitting
Three search quality changes: 1. Remove fan_in popularity boost from BM25 ranking. The fan_in>5 clause added +3.0 to the rank of heavily-called functions, causing popular-but-irrelevant results (e.g. 'update' with fan_in=222) to outrank relevant matches. search_graph with query= is now pure BM25 relevance + label-type differentiation only. in_deg/out_deg are still returned in results for display but do not affect sort order. 2. Add cbm_camel_split() SQLite function for FTS5 indexing. FTS5's unicode61 tokenizer treats 'createSession' as a single token 'createsession'. Searching for 'session' would not match it. cbm_camel_split() expands camelCase names into space-separated tokens: 'createSession' → 'createSession create Session' 'HTMLParser' → 'HTMLParser HTML Parser' The original name is preserved as the first token for exact-match queries. Applied in: FTS5 triggers (INSERT/DELETE/UPDATE on nodes table) and the bulk FTS5 backfill after full indexing. 3. Switch FTS5 from content='nodes' (external content) to content='' (contentless). External content mode re-verifies matches against the source table at query time, which re-tokenizes the original name and fails to match the split tokens. Contentless mode trusts the inverted index directly. Trade-off: highlight()/snippet() unavailable (never used). Requires full reindex to rebuild FTS5 tables with new schema + tokens.
1 parent ea65a25 commit 2997cdb

File tree

2 files changed

+87
-14
lines changed

2 files changed

+87
-14
lines changed

src/pipeline/pipeline.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -820,12 +820,18 @@ int cbm_pipeline_run(cbm_pipeline_t *p) {
820820
cbm_log_info("pass.timing", "pass", "persist_hashes", "files", itoa_buf(file_count));
821821

822822
/* Backfill FTS5 index: the direct B-tree dump bypasses SQLite triggers,
823-
* so the FTS5 table is empty after indexing. Populate it in bulk now. */
823+
* so the FTS5 table is empty after indexing. Populate it in bulk now.
824+
* cbm_camel_split(name) splits camelCase into individual tokens so
825+
* "updateCloudClient" becomes searchable as "update", "Cloud", "Client". */
824826
cbm_store_t *fts_store = cbm_store_open_path(db_path);
825827
if (fts_store) {
828+
/* Contentless FTS5 (content='') — use plain INSERT, not INSERT OR REPLACE.
829+
* Clear first to handle reindex scenarios, then bulk insert. */
826830
cbm_store_exec(fts_store,
827-
"INSERT OR REPLACE INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
828-
"SELECT id, name, qualified_name, label, file_path FROM nodes;");
831+
"DELETE FROM nodes_fts;");
832+
cbm_store_exec(fts_store,
833+
"INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
834+
"SELECT id, cbm_camel_split(name), qualified_name, label, file_path FROM nodes;");
829835
cbm_store_close(fts_store);
830836
}
831837

src/store/store.c

Lines changed: 78 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -245,14 +245,19 @@ static int create_user_indexes(cbm_store_t *s) {
245245
if (rc != SQLITE_OK) return rc;
246246

247247
/* FTS5 full-text search index on node names for BM25 ranking.
248-
* content='nodes' makes it an external-content table — synced via triggers.
248+
* content='' makes it a contentless table — it stores only the inverted index,
249+
* not the original text. This is required for camelCase token splitting:
250+
* we index "createSession create Session" but the source table has "createSession".
251+
* With content='nodes', FTS5 would re-verify against the source and fail to match
252+
* the split tokens. Contentless mode trusts the inverted index directly.
253+
* Trade-off: highlight()/snippet() unavailable, but we never use them.
249254
* Each DDL statement must be executed separately for FTS5 compatibility. */
250255
{
251256
char *fts_err = NULL;
252257
int fts_rc = sqlite3_exec(s->db,
253258
"CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5("
254259
"name, qualified_name, label, file_path,"
255-
"content='nodes', content_rowid='id',"
260+
"content='', content_rowid='id',"
256261
"tokenize='unicode61 remove_diacritics 2'"
257262
");",
258263
NULL, NULL, &fts_err);
@@ -263,22 +268,30 @@ static int create_user_indexes(cbm_store_t *s) {
263268
}
264269
}
265270

266-
/* Sync triggers: keep FTS index up to date when nodes change */
271+
/* Sync triggers: keep FTS index up to date when nodes change.
272+
* cbm_camel_split(name) splits camelCase into individual tokens so
273+
* "updateCloudClient" is searchable as "update", "Cloud", "Client".
274+
* Contentless FTS5 (content='') requires delete operations to provide the
275+
* exact same tokenized content that was originally inserted. */
267276
exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN"
268277
" INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path)"
269-
" VALUES (new.id, new.name, new.qualified_name, new.label, new.file_path);"
278+
" VALUES (new.id, cbm_camel_split(new.name), new.qualified_name,"
279+
" new.label, new.file_path);"
270280
"END;");
271281

272282
exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN"
273283
" INSERT INTO nodes_fts(nodes_fts, rowid, name, qualified_name, label, file_path)"
274-
" VALUES ('delete', old.id, old.name, old.qualified_name, old.label, old.file_path);"
284+
" VALUES ('delete', old.id, cbm_camel_split(old.name), old.qualified_name,"
285+
" old.label, old.file_path);"
275286
"END;");
276287

277288
exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN"
278289
" INSERT INTO nodes_fts(nodes_fts, rowid, name, qualified_name, label, file_path)"
279-
" VALUES ('delete', old.id, old.name, old.qualified_name, old.label, old.file_path);"
290+
" VALUES ('delete', old.id, cbm_camel_split(old.name), old.qualified_name,"
291+
" old.label, old.file_path);"
280292
" INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path)"
281-
" VALUES (new.id, new.name, new.qualified_name, new.label, new.file_path);"
293+
" VALUES (new.id, cbm_camel_split(new.name), new.qualified_name,"
294+
" new.label, new.file_path);"
282295
"END;");
283296

284297
return SQLITE_OK;
@@ -338,6 +351,52 @@ static void sqlite_regexp(sqlite3_context *ctx, int argc, sqlite3_value **argv)
338351
sqlite3_result_int(ctx, rc == 0 ? 1 : 0);
339352
}
340353

354+
/* CamelCase token splitter for FTS5.
355+
* "updateCloudClient" → "updateCloudClient update Cloud Client"
356+
* "HTMLParser" → "HTMLParser HTML Parser"
357+
* "getURL" → "getURL get URL"
358+
* Preserves original name as first token for exact-match queries,
359+
* then appends space-split words for broad keyword matching. */
360+
static void sqlite_camel_split(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
361+
(void)argc;
362+
const char *input = (const char *)sqlite3_value_text(argv[0]);
363+
if (!input || !input[0]) {
364+
sqlite3_result_text(ctx, input ? input : "", -1, SQLITE_TRANSIENT);
365+
return;
366+
}
367+
368+
char buf[2048];
369+
/* Start with the original name (preserves exact-match capability) */
370+
int len = snprintf(buf, sizeof(buf), "%s ", input);
371+
372+
/* Walk input, insert space before each camelCase boundary:
373+
* - lowercase→Uppercase: "updateCloud" → "update Cloud"
374+
* - Uppercase→Uppercase+lowercase: "HTMLParser" → "HTML Parser" */
375+
for (int i = 0; input[i] && len < (int)sizeof(buf) - 2; i++) {
376+
if (i > 0) {
377+
bool split = false;
378+
/* lowercase followed by Uppercase: updateC → update C */
379+
if (input[i] >= 'A' && input[i] <= 'Z' &&
380+
input[i - 1] >= 'a' && input[i - 1] <= 'z') {
381+
split = true;
382+
}
383+
/* Uppercase followed by Uppercase+lowercase: HTMLParser → HTML Parser
384+
* Only split before the LAST uppercase in a run */
385+
if (input[i] >= 'A' && input[i] <= 'Z' &&
386+
input[i - 1] >= 'A' && input[i - 1] <= 'Z' &&
387+
input[i + 1] >= 'a' && input[i + 1] <= 'z') {
388+
split = true;
389+
}
390+
if (split) {
391+
buf[len++] = ' ';
392+
}
393+
}
394+
buf[len++] = input[i];
395+
}
396+
buf[len] = '\0';
397+
sqlite3_result_text(ctx, buf, len, SQLITE_TRANSIENT);
398+
}
399+
341400
/* Case-insensitive REGEXP variant */
342401
static void sqlite_iregexp(sqlite3_context *ctx, int argc, sqlite3_value **argv) {
343402
(void)argc;
@@ -411,6 +470,9 @@ static cbm_store_t *store_open_internal(const char *path, bool in_memory) {
411470
/* Case-insensitive variant for search with case_sensitive=false */
412471
sqlite3_create_function(s->db, "iregexp", 2, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL,
413472
sqlite_iregexp, NULL, NULL);
473+
/* CamelCase splitter for FTS5 indexing — used in triggers and backfill */
474+
sqlite3_create_function(s->db, "cbm_camel_split", 1, SQLITE_UTF8 | SQLITE_DETERMINISTIC,
475+
NULL, sqlite_camel_split, NULL, NULL);
414476

415477
if (configure_pragmas(s, in_memory) != CBM_STORE_OK || init_schema(s) != CBM_STORE_OK ||
416478
create_user_indexes(s) != CBM_STORE_OK) {
@@ -463,6 +525,9 @@ cbm_store_t *cbm_store_open_path_query(const char *db_path) {
463525
sqlite_regexp, NULL, NULL);
464526
sqlite3_create_function(s->db, "iregexp", 2, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL,
465527
sqlite_iregexp, NULL, NULL);
528+
/* CamelCase splitter for FTS5 — must be registered before triggers fire */
529+
sqlite3_create_function(s->db, "cbm_camel_split", 1, SQLITE_UTF8 | SQLITE_DETERMINISTIC,
530+
NULL, sqlite_camel_split, NULL, NULL);
466531

467532
if (configure_pragmas(s, false) != CBM_STORE_OK) {
468533
sqlite3_close(s->db);
@@ -2060,9 +2125,12 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
20602125
}
20612126

20622127
char fts_sql[4096];
2063-
/* Join with FTS5 table, filter by project/label, order by BM25 rank.
2064-
* Exclude noise labels (File, Folder, Module, Section, Variable, Project)
2065-
* and boost Function/Method/Class via a structural score added to BM25. */
2128+
/* Join with FTS5 table, filter by project/label, order by pure BM25 relevance.
2129+
* Exclude noise labels (File, Folder, Module, Section, Variable, Project).
2130+
* Label-type boost: prefer Functions/Methods/Routes/Classes over generic nodes.
2131+
* No fan_in/popularity boost — that corrupts relevance for discovery queries
2132+
* (e.g. "update" with fan_in=222 would outrank the actually relevant match).
2133+
* in_deg/out_deg are still returned for display but do NOT affect ranking. */
20662134
int flen = snprintf(fts_sql, sizeof(fts_sql),
20672135
"SELECT n.id, n.project, n.label, n.name, n.qualified_name, "
20682136
"n.file_path, n.start_line, n.end_line, n.properties, "
@@ -2073,7 +2141,6 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear
20732141
" WHEN n.label IN ('Class','Interface','Type') THEN 5.0 "
20742142
" WHEN n.label = 'Route' THEN 8.0 "
20752143
" ELSE 0.0 END "
2076-
" - CASE WHEN (SELECT COUNT(*) FROM edges e WHERE e.target_id = n.id AND e.type = 'CALLS') > 5 THEN 3.0 ELSE 0.0 END"
20772144
") AS rank "
20782145
"FROM nodes_fts "
20792146
"JOIN nodes n ON n.id = nodes_fts.rowid "

0 commit comments

Comments
 (0)