Skip to content

Commit 647af0b

Browse files
DeusData and Koolerx committed
Add BM25 full-text search + C# Interface extraction fixes
Ports four targeted improvements from #162 (Koolerx): - FTS5 BM25 ranked search via a new `query` parameter on search_graph. Uses a contentless nodes_fts virtual table populated at dump time with a cbm_camel_split() SQL function that splits camelCase identifiers (updateCloudClient -> update Cloud Client) so tokenized word queries match identifier internals. BM25 rank is boosted by node label (Function/Method +10, Route +8, Class/Interface/Type/Enum +5) and noise labels (File/Folder/Module/Variable) are filtered out. The new path is opt-in via the query parameter; regex name_pattern and vector semantic_query paths are untouched. - Enable SQLITE_ENABLE_FTS5 in the vendored sqlite3 build flags. - Rebuild nodes_fts after incremental reindex. The btree dump path bypasses any FTS5 triggers, so the index is wiped and repopulated via delete-all + INSERT .. SELECT after each incremental merge. - Add Interface to the label filter on cbm_registry_add in both the parallel and sequential definition passes, so C#/Java class declarations like Foo : IBar can resolve IBar to an INHERITS edge target during enrichment. - Add an explicit C# base_list handler to extract_base_classes() that iterates named children directly instead of returning the raw node text. Without this the fallback path returned ": IExamService" (with the leading separator) and registry lookup always failed, so C# INHERITS edges to interfaces never resolved. Co-Authored-By: Koolerx <Koolerx@users.noreply.github.com>
1 parent 8a06d78 commit 647af0b

File tree

9 files changed

+357
-12
lines changed

9 files changed

+357
-12
lines changed

Makefile.cbm

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,11 @@ MIMALLOC_CFLAGS_TEST = -std=c11 -g -O1 -w \
233233
-DMI_OVERRIDE=0
234234

235235
# sqlite3 (vendored amalgamation — compiled ourselves for ASan instrumentation)
236+
# SQLITE_ENABLE_FTS5: enables the FTS5 full-text search extension used by the
237+
# BM25 search path in search_graph (see nodes_fts virtual table in store.c).
236238
SQLITE3_SRC = vendored/sqlite3/sqlite3.c
237-
SQLITE3_CFLAGS = -std=c11 -O2 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1
238-
SQLITE3_CFLAGS_TEST = -std=c11 -g -O1 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1
239+
SQLITE3_CFLAGS = -std=c11 -O2 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_FTS5
240+
SQLITE3_CFLAGS_TEST = -std=c11 -g -O1 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_FTS5
239241

240242
# TRE regex (vendored, Windows only — POSIX uses system <regex.h>)
241243
TRE_SRC = vendored/tre/tre_all.c

internal/cbm/extract_defs.c

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,54 @@ static const char **extract_base_classes(CBMArena *a, TSNode node, const char *s
10171017
}
10181018
}
10191019

1020+
// C#: explicit base_list handler. The generic fallback (find_base_from_children)
1021+
// returns the raw text of the whole `base_list` node, which includes the leading
1022+
// `:` separator — producing names like `": IExamService"` that fail registry lookup.
1023+
// Iterate the named children directly and strip generic type args.
1024+
for (uint32_t i = 0; i < count; i++) {
1025+
TSNode child = ts_node_child(node, i);
1026+
if (strcmp(ts_node_type(child), "base_list") != 0) {
1027+
continue;
1028+
}
1029+
const char *bases[MAX_BASES];
1030+
int base_count = 0;
1031+
uint32_t bnc = ts_node_named_child_count(child);
1032+
for (uint32_t bi = 0; bi < bnc && base_count < MAX_BASES_MINUS_1; bi++) {
1033+
TSNode bc = ts_node_named_child(child, bi);
1034+
const char *bk = ts_node_type(bc);
1035+
char *text = NULL;
1036+
if (strcmp(bk, "identifier") == 0 || strcmp(bk, "generic_name") == 0 ||
1037+
strcmp(bk, "qualified_name") == 0) {
1038+
text = cbm_node_text(a, bc, source);
1039+
} else {
1040+
/* Nested wrapper — grab the first named grandchild. */
1041+
TSNode inner = ts_node_named_child(bc, 0);
1042+
if (!ts_node_is_null(inner)) {
1043+
text = cbm_node_text(a, inner, source);
1044+
}
1045+
}
1046+
if (text && text[0]) {
1047+
/* Strip generic type arguments: "List<int>" → "List". */
1048+
char *angle = strchr(text, '<');
1049+
if (angle) {
1050+
*angle = '\0';
1051+
}
1052+
bases[base_count++] = text;
1053+
}
1054+
}
1055+
if (base_count > 0) {
1056+
const char **result = (const char **)cbm_arena_alloc(
1057+
a, (base_count + NULL_TERM) * sizeof(const char *));
1058+
if (result) {
1059+
for (int j = 0; j < base_count; j++) {
1060+
result[j] = bases[j];
1061+
}
1062+
result[base_count] = NULL;
1063+
return result;
1064+
}
1065+
}
1066+
}
1067+
10201068
// Fallback: search for common base class node types as children
10211069
static const char *base_types[] = {"superclass",
10221070
"superinterfaces",

src/mcp/mcp.c

Lines changed: 177 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ enum {
4040
#define SLEN(s) (sizeof(s) - 1)
4141
#include "mcp/mcp.h"
4242
#include "store/store.h"
43+
#include <sqlite3.h>
4344
#include "cypher/cypher.h"
4445
#include "pipeline/pipeline.h"
4546
#include "cli/cli.h"
@@ -263,13 +264,24 @@ static const tool_def_t TOOLS[] = {
263264

264265
{"search_graph",
265266
"Search the code knowledge graph for functions, classes, routes, and variables. Use INSTEAD "
266-
"OF grep/glob when finding code definitions, implementations, or relationships. Returns "
267-
"precise results in one call.",
268-
"{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},\"label\":{\"type\":"
269-
"\"string\"},\"name_pattern\":{\"type\":\"string\"},\"qn_pattern\":{\"type\":\"string\"},"
270-
"\"file_pattern\":{\"type\":\"string\"},\"relationship\":{\"type\":\"string\"},\"min_degree\":"
271-
"{\"type\":\"integer\"},\"max_degree\":{\"type\":\"integer\"},\"exclude_entry_points\":{"
272-
"\"type\":\"boolean\"},\"include_connected\":{\"type\":\"boolean\"},\"semantic_query\":{"
267+
"OF grep/glob when finding code definitions, implementations, or relationships. Three search "
268+
"modes: (1) query='update settings' for BM25 ranked full-text search with camelCase "
269+
"splitting and structural label boosting — recommended for natural-language discovery; "
270+
"(2) name_pattern='.*regex.*' for exact pattern matching; (3) semantic_query=[...] for "
271+
"vector cosine search that bridges vocabulary (finds 'publish' when you search 'send'). "
272+
"The three modes are independent and can be combined in a single call.",
273+
"{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},"
274+
"\"query\":{\"type\":\"string\",\"description\":\"Natural-language or keyword full-text "
275+
"search using BM25 ranking. Tokens are split on whitespace; camelCase identifiers are "
276+
"indexed as individual words (updateCloudClient → update, cloud, client). Results are "
277+
"ranked with structural boosting: Functions/Methods +10, Routes +8, Classes/Interfaces +5. "
278+
"Noise labels (File/Folder/Module/Variable) are filtered out. When provided, name_pattern "
279+
"is ignored.\"},"
280+
"\"label\":{\"type\":\"string\"},\"name_pattern\":{\"type\":\"string\"},\"qn_pattern\":{"
281+
"\"type\":\"string\"},\"file_pattern\":{\"type\":\"string\"},"
282+
"\"relationship\":{\"type\":\"string\"},\"min_degree\":{\"type\":\"integer\"},"
283+
"\"max_degree\":{\"type\":\"integer\"},\"exclude_entry_points\":{\"type\":\"boolean\"},"
284+
"\"include_connected\":{\"type\":\"boolean\"},\"semantic_query\":{"
273285
"\"type\":\"array\",\"items\":{\"type\":\"string\"},\"description\":\"MUST be an ARRAY of "
274286
"keyword strings (e.g. [\\\"send\\\",\\\"pubsub\\\",\\\"publish\\\"]) — NOT a single string. "
275287
"Each keyword is scored independently via per-keyword min-cosine; results reflect functions "
@@ -1025,6 +1037,145 @@ static void enrich_connected(yyjson_mut_doc *doc, yyjson_mut_val *item, cbm_stor
10251037
}
10261038
}
10271039

1040+
/* Return non-zero when `c` may appear inside a search token: ASCII letters,
 * ASCII digits and underscore. Everything else is a token separator. */
static int bm25_is_token_char(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ||
           c == '_';
}

/* Build an FTS5 MATCH expression from a free-form query string.
 *
 * The query is split into maximal runs of [A-Za-z0-9_]; every other byte is
 * treated as a separator and dropped. Each run is emitted as a double-quoted
 * FTS5 string and the strings are joined with " OR ", e.g.
 *
 *     read AND write   ->   "read" OR "AND" OR "write"
 *
 * Quoting matters: a bareword that is exactly AND, OR or NOT is parsed by
 * FTS5 as an operator, so emitting user tokens unquoted would turn ordinary
 * queries into MATCH syntax errors. Tokens can never contain a double quote
 * (it is not a token character), so no escaping is required.
 *
 * Parameters:
 *   query    - NUL-terminated user query; may be NULL.
 *   out      - destination buffer for the NUL-terminated expression.
 *   out_size - capacity of `out` in bytes.
 *
 * Returns the number of tokens written (0 if the query had no usable terms,
 * or if the arguments are unusable, in which case `out` is left untouched).
 * If a token does not fit, emission stops there and the expression built so
 * far is kept. */
static int bm25_build_match(const char *query, char *out, size_t out_size) {
    if (!query || !out || out_size < 2) {
        return 0;
    }

    size_t pos = 0;
    int tokens = 0;
    const char *p = query;

    while (*p) {
        /* Skip separators preceding the next token. */
        while (*p && !bm25_is_token_char(*p)) {
            p++;
        }
        if (!*p) {
            break;
        }

        const char *tok_start = p;
        while (*p && bm25_is_token_char(*p)) {
            p++;
        }
        size_t tok_len = (size_t)(p - tok_start);

        const char *sep = (tokens > 0) ? " OR " : "";
        size_t sep_len = strlen(sep);

        /* Need room for: separator, opening quote, token, closing quote, NUL. */
        if (pos + sep_len + tok_len + 2 >= out_size) {
            break; /* out of room — stop cleanly, keep what we have */
        }

        memcpy(out + pos, sep, sep_len);
        pos += sep_len;
        out[pos++] = '"';
        memcpy(out + pos, tok_start, tok_len);
        pos += tok_len;
        out[pos++] = '"';
        tokens++;
    }

    out[pos] = '\0';
    return tokens;
}
1083+
1084+
/* Run the BM25 full-text search path and return the JSON result string.
1085+
* Returns NULL if FTS5 is unavailable or the query produced no usable tokens,
1086+
* in which case the caller falls back to the regex-based search path. */
1087+
static char *bm25_search(cbm_store_t *store, const char *project, const char *query, int limit,
1088+
int offset) {
1089+
sqlite3 *db = cbm_store_get_db(store);
1090+
if (!db) {
1091+
return NULL;
1092+
}
1093+
char fts_query[1024];
1094+
int tok_count = bm25_build_match(query, fts_query, sizeof(fts_query));
1095+
if (tok_count == 0) {
1096+
return NULL;
1097+
}
1098+
1099+
/* BM25 ranked query with structural label boosting. bm25() returns a
1100+
* NEGATIVE score (lower = more relevant), so we subtract the boost to
1101+
* make high-value labels sort first. File/Folder/Module/Variable are
1102+
* excluded entirely — agents rarely want those as discovery results. */
1103+
const char *sql =
1104+
"SELECT n.id, n.label, n.name, n.qualified_name, n.file_path, n.start_line, n.end_line, "
1105+
" (bm25(nodes_fts) "
1106+
" - CASE WHEN n.label IN ('Function','Method') THEN 10.0 "
1107+
" WHEN n.label = 'Route' THEN 8.0 "
1108+
" WHEN n.label IN ('Class','Interface','Type','Enum') THEN 5.0 "
1109+
" ELSE 0.0 END) AS rank "
1110+
"FROM nodes_fts "
1111+
"JOIN nodes n ON n.id = nodes_fts.rowid "
1112+
"WHERE nodes_fts MATCH ?1 "
1113+
" AND n.project = ?2 "
1114+
" AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project') "
1115+
"ORDER BY rank "
1116+
"LIMIT ?3 OFFSET ?4";
1117+
1118+
sqlite3_stmt *stmt = NULL;
1119+
if (sqlite3_prepare_v2(db, sql, -1, &stmt, NULL) != SQLITE_OK) {
1120+
return NULL;
1121+
}
1122+
sqlite3_bind_text(stmt, 1, fts_query, -1, SQLITE_TRANSIENT);
1123+
sqlite3_bind_text(stmt, 2, project, -1, SQLITE_TRANSIENT);
1124+
sqlite3_bind_int(stmt, 3, limit > 0 ? limit : 100);
1125+
sqlite3_bind_int(stmt, 4, offset > 0 ? offset : 0);
1126+
1127+
/* Count total hits (for pagination) in a separate cheap query. */
1128+
int total = 0;
1129+
{
1130+
const char *count_sql =
1131+
"SELECT COUNT(*) FROM nodes_fts JOIN nodes n ON n.id = nodes_fts.rowid "
1132+
"WHERE nodes_fts MATCH ?1 AND n.project = ?2 "
1133+
" AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')";
1134+
sqlite3_stmt *cs = NULL;
1135+
if (sqlite3_prepare_v2(db, count_sql, -1, &cs, NULL) == SQLITE_OK) {
1136+
sqlite3_bind_text(cs, 1, fts_query, -1, SQLITE_TRANSIENT);
1137+
sqlite3_bind_text(cs, 2, project, -1, SQLITE_TRANSIENT);
1138+
if (sqlite3_step(cs) == SQLITE_ROW) {
1139+
total = sqlite3_column_int(cs, 0);
1140+
}
1141+
sqlite3_finalize(cs);
1142+
}
1143+
}
1144+
1145+
yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL);
1146+
yyjson_mut_val *root = yyjson_mut_obj(doc);
1147+
yyjson_mut_doc_set_root(doc, root);
1148+
yyjson_mut_obj_add_int(doc, root, "total", total);
1149+
yyjson_mut_obj_add_str(doc, root, "search_mode", "bm25");
1150+
1151+
yyjson_mut_val *results = yyjson_mut_arr(doc);
1152+
int emitted = 0;
1153+
while (sqlite3_step(stmt) == SQLITE_ROW) {
1154+
yyjson_mut_val *item = yyjson_mut_obj(doc);
1155+
yyjson_mut_obj_add_strcpy(doc, item, "name",
1156+
(const char *)sqlite3_column_text(stmt, 2));
1157+
yyjson_mut_obj_add_strcpy(doc, item, "qualified_name",
1158+
(const char *)sqlite3_column_text(stmt, 3));
1159+
yyjson_mut_obj_add_strcpy(doc, item, "label",
1160+
(const char *)sqlite3_column_text(stmt, 1));
1161+
yyjson_mut_obj_add_strcpy(doc, item, "file_path",
1162+
(const char *)sqlite3_column_text(stmt, 4));
1163+
yyjson_mut_obj_add_int(doc, item, "start_line", sqlite3_column_int(stmt, 5));
1164+
yyjson_mut_obj_add_int(doc, item, "end_line", sqlite3_column_int(stmt, 6));
1165+
yyjson_mut_obj_add_real(doc, item, "rank", sqlite3_column_double(stmt, 7));
1166+
yyjson_mut_arr_add_val(results, item);
1167+
emitted++;
1168+
}
1169+
sqlite3_finalize(stmt);
1170+
1171+
yyjson_mut_obj_add_val(doc, root, "results", results);
1172+
yyjson_mut_obj_add_bool(doc, root, "has_more", total > offset + emitted);
1173+
1174+
char *json = yy_doc_to_str(doc);
1175+
yyjson_mut_doc_free(doc);
1176+
return json;
1177+
}
1178+
10281179
static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
10291180
char *project = cbm_mcp_get_string_arg(args, "project");
10301181
cbm_store_t *store = resolve_store(srv, project);
@@ -1036,6 +1187,25 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
10361187
return not_indexed;
10371188
}
10381189

1190+
/* BM25 path: if `query` is set, run FTS5 full-text search with ranking
1191+
* and return early. The regex/vector path below is untouched for all
1192+
* other callers. If FTS5 is unavailable or the query is empty after
1193+
* tokenization, fall through to the regex path. */
1194+
char *query = cbm_mcp_get_string_arg(args, "query");
1195+
if (query && query[0]) {
1196+
int q_limit = cbm_mcp_get_int_arg(args, "limit", 100);
1197+
int q_offset = cbm_mcp_get_int_arg(args, "offset", 0);
1198+
char *bm25_json = bm25_search(store, project, query, q_limit, q_offset);
1199+
if (bm25_json) {
1200+
free(query);
1201+
free(project);
1202+
char *result = cbm_mcp_text_result(bm25_json, false);
1203+
free(bm25_json);
1204+
return result;
1205+
}
1206+
}
1207+
free(query);
1208+
10391209
char *label = cbm_mcp_get_string_arg(args, "label");
10401210
char *name_pattern = cbm_mcp_get_string_arg(args, "name_pattern");
10411211
char *qn_pattern = cbm_mcp_get_string_arg(args, "qn_pattern");

src/pipeline/pass_definitions.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,9 +227,12 @@ static void process_def(cbm_pipeline_ctx_t *ctx, const CBMDefinition *def, const
227227
int64_t node_id = cbm_gbuf_upsert_node(
228228
ctx->gbuf, def->label ? def->label : "Function", def->name, def->qualified_name,
229229
def->file_path ? def->file_path : rel, (int)def->start_line, (int)def->end_line, props);
230+
/* Register callable symbols + Interface. Interface must be in the registry
231+
* so C#/Java `class Foo : IBar` / `class Foo implements IBar` can resolve
232+
* `IBar` to an INHERITS edge target during the enrichment phase. */
230233
if (node_id > 0 && def->label &&
231234
(strcmp(def->label, "Function") == 0 || strcmp(def->label, "Method") == 0 ||
232-
strcmp(def->label, "Class") == 0)) {
235+
strcmp(def->label, "Class") == 0 || strcmp(def->label, "Interface") == 0)) {
233236
cbm_registry_add(ctx->registry, def->name, def->qualified_name, def->label);
234237
}
235238
char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__");

src/pipeline/pass_parallel.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -650,8 +650,9 @@ static int register_and_link_def(cbm_pipeline_ctx_t *ctx, const CBMDefinition *d
650650
if (!def->name || !def->qualified_name || !def->label) {
651651
return 0;
652652
}
653+
/* Register callable symbols + Interface — see pass_definitions.c for rationale. */
653654
if (strcmp(def->label, "Function") == 0 || strcmp(def->label, "Method") == 0 ||
654-
strcmp(def->label, "Class") == 0) {
655+
strcmp(def->label, "Class") == 0 || strcmp(def->label, "Interface") == 0) {
655656
cbm_registry_add(ctx->registry, def->name, def->qualified_name, def->label);
656657
(*reg_entries)++;
657658
}

src/pipeline/pipeline.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,23 @@ static int dump_and_persist_hashes(cbm_pipeline_t *p, const cbm_file_info_t *fil
652652
stat_mtime_ns(&fst), fst.st_size);
653653
}
654654
}
655+
656+
/* FTS5 backfill: populate nodes_fts with camelCase-split names.
657+
* Contentless FTS5 requires the special 'delete-all' command instead of
658+
* DELETE FROM to wipe prior rows (there's no underlying content table).
659+
* Falls back to plain names if cbm_camel_split is unavailable (which
660+
* shouldn't happen because we always register it, but we stay defensive). */
661+
cbm_store_exec(hash_store, "INSERT INTO nodes_fts(nodes_fts) VALUES('delete-all');");
662+
if (cbm_store_exec(
663+
hash_store,
664+
"INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
665+
"SELECT id, cbm_camel_split(name), qualified_name, label, file_path "
666+
"FROM nodes;") != CBM_STORE_OK) {
667+
cbm_store_exec(hash_store,
668+
"INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
669+
"SELECT id, name, qualified_name, label, file_path FROM nodes;");
670+
}
671+
655672
cbm_store_close(hash_store);
656673
cbm_log_info("pass.timing", "pass", "persist_hashes", "files", itoa_buf(file_count));
657674
}

src/pipeline/pipeline_incremental.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,22 @@ static void dump_and_persist(cbm_gbuf_t *gbuf, const char *db_path, const char *
277277
cbm_store_t *hash_store = cbm_store_open_path(db_path);
278278
if (hash_store) {
279279
persist_hashes(hash_store, project, files, file_count);
280+
281+
/* FTS5 rebuild after incremental dump. The btree dump path bypasses
282+
* any triggers that could have kept nodes_fts synchronized, so we
283+
* rebuild from the nodes table here. See the full-dump path in
284+
* pipeline.c for the matching logic. */
285+
cbm_store_exec(hash_store, "INSERT INTO nodes_fts(nodes_fts) VALUES('delete-all');");
286+
if (cbm_store_exec(
287+
hash_store,
288+
"INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
289+
"SELECT id, cbm_camel_split(name), qualified_name, label, file_path "
290+
"FROM nodes;") != CBM_STORE_OK) {
291+
cbm_store_exec(hash_store,
292+
"INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) "
293+
"SELECT id, name, qualified_name, label, file_path FROM nodes;");
294+
}
295+
280296
cbm_store_close(hash_store);
281297
}
282298
}

0 commit comments

Comments
 (0)