4040#define SLEN (s ) (sizeof(s) - 1)
4141#include "mcp/mcp.h"
4242#include "store/store.h"
43+ #include <sqlite3.h>
4344#include "cypher/cypher.h"
4445#include "pipeline/pipeline.h"
4546#include "cli/cli.h"
@@ -263,13 +264,24 @@ static const tool_def_t TOOLS[] = {
263264
264265 {"search_graph" ,
265266 "Search the code knowledge graph for functions, classes, routes, and variables. Use INSTEAD "
266- "OF grep/glob when finding code definitions, implementations, or relationships. Returns "
267- "precise results in one call." ,
268- "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},\"label\":{\"type\":"
269- "\"string\"},\"name_pattern\":{\"type\":\"string\"},\"qn_pattern\":{\"type\":\"string\"},"
270- "\"file_pattern\":{\"type\":\"string\"},\"relationship\":{\"type\":\"string\"},\"min_degree\":"
271- "{\"type\":\"integer\"},\"max_degree\":{\"type\":\"integer\"},\"exclude_entry_points\":{"
272- "\"type\":\"boolean\"},\"include_connected\":{\"type\":\"boolean\"},\"semantic_query\":{"
267+ "OF grep/glob when finding code definitions, implementations, or relationships. Three search "
268+ "modes: (1) query='update settings' for BM25 ranked full-text search with camelCase "
269+ "splitting and structural label boosting — recommended for natural-language discovery; "
270+ "(2) name_pattern='.*regex.*' for exact pattern matching; (3) semantic_query=[...] for "
271+ "vector cosine search that bridges vocabulary (finds 'publish' when you search 'send'). "
272+ "The three modes are independent and can be combined in a single call." ,
273+ "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},"
274+ "\"query\":{\"type\":\"string\",\"description\":\"Natural-language or keyword full-text "
275+ "search using BM25 ranking. Tokens are split on whitespace; camelCase identifiers are "
276+ "indexed as individual words (updateCloudClient → update, cloud, client). Results are "
277+ "ranked with structural boosting: Functions/Methods +10, Routes +8, Classes/Interfaces +5. "
278+ "Noise labels (File/Folder/Module/Variable) are filtered out. When provided, name_pattern "
279+ "is ignored.\"},"
280+ "\"label\":{\"type\":\"string\"},\"name_pattern\":{\"type\":\"string\"},\"qn_pattern\":{"
281+ "\"type\":\"string\"},\"file_pattern\":{\"type\":\"string\"},"
282+ "\"relationship\":{\"type\":\"string\"},\"min_degree\":{\"type\":\"integer\"},"
283+ "\"max_degree\":{\"type\":\"integer\"},\"exclude_entry_points\":{\"type\":\"boolean\"},"
284+ "\"include_connected\":{\"type\":\"boolean\"},\"semantic_query\":{"
273285 "\"type\":\"array\",\"items\":{\"type\":\"string\"},\"description\":\"MUST be an ARRAY of "
274286 "keyword strings (e.g. [\\\"send\\\",\\\"pubsub\\\",\\\"publish\\\"]) — NOT a single string. "
275287 "Each keyword is scored independently via per-keyword min-cosine; results reflect functions "
@@ -1025,6 +1037,145 @@ static void enrich_connected(yyjson_mut_doc *doc, yyjson_mut_val *item, cbm_stor
10251037 }
10261038}
10271039
/* True for the only bytes allowed inside a search token: ASCII letters,
 * ASCII digits and underscore. */
static int bm25_is_word_char(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ||
           c == '_';
}

/* Build an FTS5 MATCH expression from a free-form query string.
 *
 * The query is split into maximal runs of [A-Za-z0-9_]; every other byte
 * acts as a separator and is dropped. Each run is emitted as a double-quoted
 * FTS5 string and the strings are joined with " OR ":
 *
 *     update settings   ->   "update" OR "settings"
 *
 * Quoting matters: an unquoted AND / OR / NOT is parsed by FTS5 as an
 * operator, so a user query such as "this OR that" would otherwise expand to
 * the malformed expression `this OR OR OR that`. Because tokens can never
 * contain a double quote, the quoted form cannot be escaped from.
 *
 * query    : NUL-terminated user input; may be NULL.
 * out      : destination buffer; always NUL-terminated when out_size >= 1.
 * out_size : capacity of out in bytes.
 *
 * Returns the number of tokens written (0 if the query is NULL, the buffer
 * is unusable, or the query contained no usable terms). If the buffer fills
 * up, emission stops before the first token that does not fit and the
 * tokens written so far are kept. */
static int bm25_build_match(const char *query, char *out, size_t out_size) {
    static const char or_sep[] = " OR ";

    if (out && out_size > 0) {
        out[0] = '\0';
    }
    if (!query || !out || out_size < 2) {
        return 0;
    }

    size_t pos = 0;
    int tokens = 0;
    const char *p = query;

    while (*p) {
        /* Skip separators. */
        while (*p && !bm25_is_word_char(*p)) {
            p++;
        }
        if (!*p) {
            break;
        }

        /* Consume one token. */
        const char *tok_start = p;
        while (*p && bm25_is_word_char(*p)) {
            p++;
        }
        size_t tok_len = (size_t)(p - tok_start);

        /* Separator (if any) + opening quote + token + closing quote; one
         * more byte must remain for the terminating NUL. pos < out_size
         * always holds here, so the subtraction cannot wrap. */
        size_t sep_len = (tokens > 0) ? sizeof(or_sep) - 1 : 0;
        size_t need = sep_len + tok_len + 2;
        if (need >= out_size - pos) {
            break; /* out of room — stop cleanly, keep what we have */
        }

        memcpy(out + pos, or_sep, sep_len);
        pos += sep_len;
        out[pos++] = '"';
        memcpy(out + pos, tok_start, tok_len);
        pos += tok_len;
        out[pos++] = '"';
        tokens++;
    }

    out[pos] = '\0';
    return tokens;
}
1083+
1084+ /* Run the BM25 full-text search path and return the JSON result string.
1085+ * Returns NULL if FTS5 is unavailable or the query produced no usable tokens,
1086+ * in which case the caller falls back to the regex-based search path. */
1087+ static char * bm25_search (cbm_store_t * store , const char * project , const char * query , int limit ,
1088+ int offset ) {
1089+ sqlite3 * db = cbm_store_get_db (store );
1090+ if (!db ) {
1091+ return NULL ;
1092+ }
1093+ char fts_query [1024 ];
1094+ int tok_count = bm25_build_match (query , fts_query , sizeof (fts_query ));
1095+ if (tok_count == 0 ) {
1096+ return NULL ;
1097+ }
1098+
1099+ /* BM25 ranked query with structural label boosting. bm25() returns a
1100+ * NEGATIVE score (lower = more relevant), so we subtract the boost to
1101+ * make high-value labels sort first. File/Folder/Module/Variable are
1102+ * excluded entirely — agents rarely want those as discovery results. */
1103+ const char * sql =
1104+ "SELECT n.id, n.label, n.name, n.qualified_name, n.file_path, n.start_line, n.end_line, "
1105+ " (bm25(nodes_fts) "
1106+ " - CASE WHEN n.label IN ('Function','Method') THEN 10.0 "
1107+ " WHEN n.label = 'Route' THEN 8.0 "
1108+ " WHEN n.label IN ('Class','Interface','Type','Enum') THEN 5.0 "
1109+ " ELSE 0.0 END) AS rank "
1110+ "FROM nodes_fts "
1111+ "JOIN nodes n ON n.id = nodes_fts.rowid "
1112+ "WHERE nodes_fts MATCH ?1 "
1113+ " AND n.project = ?2 "
1114+ " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project') "
1115+ "ORDER BY rank "
1116+ "LIMIT ?3 OFFSET ?4" ;
1117+
1118+ sqlite3_stmt * stmt = NULL ;
1119+ if (sqlite3_prepare_v2 (db , sql , -1 , & stmt , NULL ) != SQLITE_OK ) {
1120+ return NULL ;
1121+ }
1122+ sqlite3_bind_text (stmt , 1 , fts_query , -1 , SQLITE_TRANSIENT );
1123+ sqlite3_bind_text (stmt , 2 , project , -1 , SQLITE_TRANSIENT );
1124+ sqlite3_bind_int (stmt , 3 , limit > 0 ? limit : 100 );
1125+ sqlite3_bind_int (stmt , 4 , offset > 0 ? offset : 0 );
1126+
1127+ /* Count total hits (for pagination) in a separate cheap query. */
1128+ int total = 0 ;
1129+ {
1130+ const char * count_sql =
1131+ "SELECT COUNT(*) FROM nodes_fts JOIN nodes n ON n.id = nodes_fts.rowid "
1132+ "WHERE nodes_fts MATCH ?1 AND n.project = ?2 "
1133+ " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')" ;
1134+ sqlite3_stmt * cs = NULL ;
1135+ if (sqlite3_prepare_v2 (db , count_sql , -1 , & cs , NULL ) == SQLITE_OK ) {
1136+ sqlite3_bind_text (cs , 1 , fts_query , -1 , SQLITE_TRANSIENT );
1137+ sqlite3_bind_text (cs , 2 , project , -1 , SQLITE_TRANSIENT );
1138+ if (sqlite3_step (cs ) == SQLITE_ROW ) {
1139+ total = sqlite3_column_int (cs , 0 );
1140+ }
1141+ sqlite3_finalize (cs );
1142+ }
1143+ }
1144+
1145+ yyjson_mut_doc * doc = yyjson_mut_doc_new (NULL );
1146+ yyjson_mut_val * root = yyjson_mut_obj (doc );
1147+ yyjson_mut_doc_set_root (doc , root );
1148+ yyjson_mut_obj_add_int (doc , root , "total" , total );
1149+ yyjson_mut_obj_add_str (doc , root , "search_mode" , "bm25" );
1150+
1151+ yyjson_mut_val * results = yyjson_mut_arr (doc );
1152+ int emitted = 0 ;
1153+ while (sqlite3_step (stmt ) == SQLITE_ROW ) {
1154+ yyjson_mut_val * item = yyjson_mut_obj (doc );
1155+ yyjson_mut_obj_add_strcpy (doc , item , "name" ,
1156+ (const char * )sqlite3_column_text (stmt , 2 ));
1157+ yyjson_mut_obj_add_strcpy (doc , item , "qualified_name" ,
1158+ (const char * )sqlite3_column_text (stmt , 3 ));
1159+ yyjson_mut_obj_add_strcpy (doc , item , "label" ,
1160+ (const char * )sqlite3_column_text (stmt , 1 ));
1161+ yyjson_mut_obj_add_strcpy (doc , item , "file_path" ,
1162+ (const char * )sqlite3_column_text (stmt , 4 ));
1163+ yyjson_mut_obj_add_int (doc , item , "start_line" , sqlite3_column_int (stmt , 5 ));
1164+ yyjson_mut_obj_add_int (doc , item , "end_line" , sqlite3_column_int (stmt , 6 ));
1165+ yyjson_mut_obj_add_real (doc , item , "rank" , sqlite3_column_double (stmt , 7 ));
1166+ yyjson_mut_arr_add_val (results , item );
1167+ emitted ++ ;
1168+ }
1169+ sqlite3_finalize (stmt );
1170+
1171+ yyjson_mut_obj_add_val (doc , root , "results" , results );
1172+ yyjson_mut_obj_add_bool (doc , root , "has_more" , total > offset + emitted );
1173+
1174+ char * json = yy_doc_to_str (doc );
1175+ yyjson_mut_doc_free (doc );
1176+ return json ;
1177+ }
1178+
10281179static char * handle_search_graph (cbm_mcp_server_t * srv , const char * args ) {
10291180 char * project = cbm_mcp_get_string_arg (args , "project" );
10301181 cbm_store_t * store = resolve_store (srv , project );
@@ -1036,6 +1187,25 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
10361187 return not_indexed ;
10371188 }
10381189
1190+ /* BM25 path: if `query` is set, run FTS5 full-text search with ranking
1191+ * and return early. The regex/vector path below is untouched for all
1192+ * other callers. If FTS5 is unavailable or the query is empty after
1193+ * tokenization, fall through to the regex path. */
1194+ char * query = cbm_mcp_get_string_arg (args , "query" );
1195+ if (query && query [0 ]) {
1196+ int q_limit = cbm_mcp_get_int_arg (args , "limit" , 100 );
1197+ int q_offset = cbm_mcp_get_int_arg (args , "offset" , 0 );
1198+ char * bm25_json = bm25_search (store , project , query , q_limit , q_offset );
1199+ if (bm25_json ) {
1200+ free (query );
1201+ free (project );
1202+ char * result = cbm_mcp_text_result (bm25_json , false);
1203+ free (bm25_json );
1204+ return result ;
1205+ }
1206+ }
1207+ free (query );
1208+
10391209 char * label = cbm_mcp_get_string_arg (args , "label" );
10401210 char * name_pattern = cbm_mcp_get_string_arg (args , "name_pattern" );
10411211 char * qn_pattern = cbm_mcp_get_string_arg (args , "qn_pattern" );
0 commit comments