|
12 | 12 | #include <sqlite3.h> |
13 | 13 | #include "cypher/cypher.h" |
14 | 14 | #include "pipeline/pipeline.h" |
| 15 | +#include "pipeline/embedding.h" |
15 | 16 | #include "cli/cli.h" |
16 | 17 | #include "watcher/watcher.h" |
17 | 18 | #include "foundation/mem.h" |
@@ -370,6 +371,15 @@ static const tool_def_t TOOLS[] = { |
370 | 371 | "{\"type\":\"object\",\"properties\":{\"traces\":{\"type\":\"array\",\"items\":{\"type\":" |
371 | 372 | "\"object\"}},\"project\":{\"type\":" |
372 | 373 | "\"string\"}},\"required\":[\"traces\",\"project\"]}"}, |
| 374 | + |
| 375 | + {"generate_embeddings", |
| 376 | + "Generate semantic embeddings for code symbols via external embedding server. " |
| 377 | + "Requires CBM_EMBEDDING_URL environment variable (e.g., http://localhost:11434/v1 for Ollama). " |
| 378 | + "Embeddings enable hybrid BM25+vector search in search_graph, bridging the gap between " |
| 379 | + "keyword queries and conceptual code discovery.", |
| 380 | + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," |
| 381 | + "\"force\":{\"type\":\"boolean\",\"default\":false,\"description\":" |
| 382 | + "\"Re-generate all embeddings even if they already exist\"}},\"required\":[\"project\"]}"}, |
373 | 383 | }; |
374 | 384 |
|
375 | 385 | static const int TOOL_COUNT = sizeof(TOOLS) / sizeof(TOOLS[0]); |
@@ -1009,11 +1019,60 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { |
1009 | 1019 | cbm_search_output_t out = {0}; |
1010 | 1020 | cbm_store_search(store, ¶ms, &out); |
1011 | 1021 |
|
| 1022 | + /* ── Hybrid search: if query is provided and embeddings exist, run |
| 1023 | + * vector search and merge with BM25 results using RRF (k=60). |
| 1024 | + * This enables semantic search: "institution name update" finds |
| 1025 | + * updateCloudClient even though the keywords don't overlap. ── */ |
| 1026 | + cbm_rrf_result_t *rrf_results = NULL; |
| 1027 | + int rrf_count = 0; |
| 1028 | + bool used_hybrid = false; |
| 1029 | + |
| 1030 | + if (query && query[0] && cbm_embedding_is_configured()) { |
| 1031 | + int emb_count = cbm_store_count_embeddings(store, project); |
| 1032 | + if (emb_count > 0) { |
| 1033 | + cbm_embedding_config_t cfg = cbm_embedding_get_config(); |
| 1034 | + |
| 1035 | + /* Embed the query text */ |
| 1036 | + float *query_vec = cbm_embedding_embed_text(&cfg, query); |
| 1037 | + if (query_vec) { |
| 1038 | + /* Run vector search */ |
| 1039 | + cbm_vector_result_t *vec_results = NULL; |
| 1040 | + int vec_count = 0; |
| 1041 | + cbm_store_vector_search(store, project, query_vec, cfg.dims, |
| 1042 | + 50, &vec_results, &vec_count); |
| 1043 | + |
| 1044 | + if (vec_count > 0) { |
| 1045 | + /* Collect BM25 node IDs in ranked order */ |
| 1046 | + int64_t *bm25_ids = malloc((size_t)out.count * sizeof(int64_t)); |
| 1047 | + if (bm25_ids) { |
| 1048 | + for (int i = 0; i < out.count; i++) { |
| 1049 | + bm25_ids[i] = out.results[i].node.id; |
| 1050 | + } |
| 1051 | + |
| 1052 | + /* RRF merge */ |
| 1053 | + cbm_embedding_rrf_merge(bm25_ids, out.count, |
| 1054 | + vec_results, vec_count, |
| 1055 | + &rrf_results, &rrf_count); |
| 1056 | + used_hybrid = true; |
| 1057 | + free(bm25_ids); |
| 1058 | + } |
| 1059 | + } |
| 1060 | + |
| 1061 | + cbm_store_free_vector_results(vec_results, vec_count); |
| 1062 | + free(query_vec); |
| 1063 | + } |
| 1064 | + } |
| 1065 | + } |
| 1066 | + |
1012 | 1067 | yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); |
1013 | 1068 | yyjson_mut_val *root = yyjson_mut_obj(doc); |
1014 | 1069 | yyjson_mut_doc_set_root(doc, root); |
1015 | 1070 |
|
1016 | | - yyjson_mut_obj_add_int(doc, root, "total", out.total); |
| 1071 | + yyjson_mut_obj_add_int(doc, root, "total", |
| 1072 | + used_hybrid ? rrf_count : out.total); |
| 1073 | + if (used_hybrid) { |
| 1074 | + yyjson_mut_obj_add_str(doc, root, "search_mode", "hybrid_bm25_vector"); |
| 1075 | + } |
1017 | 1076 |
|
1018 | 1077 | /* For each result, look up which execution flows it participates in. |
1019 | 1078 | * This enables process-grouped search results similar to GitNexus's |
@@ -1062,14 +1121,78 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { |
1062 | 1121 |
|
1063 | 1122 | yyjson_mut_arr_add_val(results, item); |
1064 | 1123 | } |
1065 | | - if (proc_stmt) sqlite3_finalize(proc_stmt); |
1066 | 1124 |
|
1067 | 1125 | yyjson_mut_obj_add_val(doc, root, "results", results); |
1068 | | - yyjson_mut_obj_add_bool(doc, root, "has_more", out.total > offset + out.count); |
| 1126 | + yyjson_mut_obj_add_bool(doc, root, "has_more", |
| 1127 | + used_hybrid ? false : (out.total > offset + out.count)); |
| 1128 | + |
| 1129 | + /* If hybrid search found vector-only results (not in BM25), add them. |
| 1130 | + * These are semantically relevant results that keyword search missed — |
| 1131 | + * the whole point of vector search. */ |
| 1132 | + if (used_hybrid && rrf_results) { |
| 1133 | + yyjson_mut_val *vec_only = yyjson_mut_arr(doc); |
| 1134 | + int vec_only_count = 0; |
| 1135 | + |
| 1136 | + for (int i = 0; i < rrf_count && vec_only_count < 20; i++) { |
| 1137 | + if (rrf_results[i].bm25_rank < 0) { |
| 1138 | + /* This result was found ONLY by vector search */ |
| 1139 | + cbm_node_t vnode = {0}; |
| 1140 | + if (cbm_store_find_node_by_id(store, rrf_results[i].node_id, |
| 1141 | + &vnode) == CBM_STORE_OK) { |
| 1142 | + yyjson_mut_val *vitem = yyjson_mut_obj(doc); |
| 1143 | + yyjson_mut_obj_add_str(doc, vitem, "name", |
| 1144 | + vnode.name ? vnode.name : ""); |
| 1145 | + yyjson_mut_obj_add_str(doc, vitem, "qualified_name", |
| 1146 | + vnode.qualified_name ? vnode.qualified_name : ""); |
| 1147 | + yyjson_mut_obj_add_str(doc, vitem, "label", |
| 1148 | + vnode.label ? vnode.label : ""); |
| 1149 | + yyjson_mut_obj_add_str(doc, vitem, "file_path", |
| 1150 | + vnode.file_path ? vnode.file_path : ""); |
| 1151 | + yyjson_mut_obj_add_real(doc, vitem, "similarity", |
| 1152 | + rrf_results[i].similarity); |
| 1153 | + yyjson_mut_obj_add_real(doc, vitem, "rrf_score", |
| 1154 | + rrf_results[i].rrf_score); |
| 1155 | + |
| 1156 | + /* Process participation for vector-only results too */ |
| 1157 | + if (proc_stmt) { |
| 1158 | + sqlite3_reset(proc_stmt); |
| 1159 | + sqlite3_bind_int64(proc_stmt, 1, rrf_results[i].node_id); |
| 1160 | + yyjson_mut_val *vproc_arr = yyjson_mut_arr(doc); |
| 1161 | + while (sqlite3_step(proc_stmt) == SQLITE_ROW) { |
| 1162 | + yyjson_mut_val *vpobj = yyjson_mut_obj(doc); |
| 1163 | + yyjson_mut_obj_add_int(doc, vpobj, "id", |
| 1164 | + sqlite3_column_int64(proc_stmt, 0)); |
| 1165 | + const char *vplabel = |
| 1166 | + (const char *)sqlite3_column_text(proc_stmt, 1); |
| 1167 | + yyjson_mut_obj_add_strcpy(doc, vpobj, "label", |
| 1168 | + vplabel ? vplabel : ""); |
| 1169 | + yyjson_mut_obj_add_int(doc, vpobj, "step_count", |
| 1170 | + sqlite3_column_int(proc_stmt, 2)); |
| 1171 | + yyjson_mut_arr_add_val(vproc_arr, vpobj); |
| 1172 | + } |
| 1173 | + yyjson_mut_obj_add_val(doc, vitem, "processes", vproc_arr); |
| 1174 | + } |
| 1175 | + |
| 1176 | + yyjson_mut_arr_add_val(vec_only, vitem); |
| 1177 | + vec_only_count++; |
| 1178 | + cbm_node_free_fields(&vnode); |
| 1179 | + } |
| 1180 | + } |
| 1181 | + } |
| 1182 | + |
| 1183 | + if (vec_only_count > 0) { |
| 1184 | + yyjson_mut_obj_add_val(doc, root, "semantic_results", vec_only); |
| 1185 | + yyjson_mut_obj_add_int(doc, root, "semantic_result_count", vec_only_count); |
| 1186 | + } |
| 1187 | + } |
| 1188 | + |
| 1189 | + if (proc_stmt) sqlite3_finalize(proc_stmt); |
| 1190 | + // Note: proc_stmt finalize moved here to be AFTER vector-only result processing |
1069 | 1191 |
|
1070 | 1192 | char *json = yy_doc_to_str(doc); |
1071 | 1193 | yyjson_mut_doc_free(doc); |
1072 | 1194 | cbm_store_search_free(&out); |
| 1195 | + free(rrf_results); |
1073 | 1196 |
|
1074 | 1197 | free(project); |
1075 | 1198 | free(label); |
@@ -3697,6 +3820,56 @@ static char *handle_ingest_traces(cbm_mcp_server_t *srv, const char *args) { |
3697 | 3820 | return result; |
3698 | 3821 | } |
3699 | 3822 |
|
| 3823 | +/* ── generate_embeddings ─────────────────────────────────────── */ |
| 3824 | + |
| 3825 | +static char *handle_generate_embeddings(cbm_mcp_server_t *srv, const char *args) { |
| 3826 | + char *project = cbm_mcp_get_string_arg(args, "project"); |
| 3827 | + cbm_store_t *store = resolve_store(srv, project); |
| 3828 | + REQUIRE_STORE(store, project); |
| 3829 | + |
| 3830 | + if (!cbm_embedding_is_configured()) { |
| 3831 | + free(project); |
| 3832 | + return cbm_mcp_text_result( |
| 3833 | + "{\"error\":\"CBM_EMBEDDING_URL not set. " |
| 3834 | + "Set CBM_EMBEDDING_URL to an OpenAI-compatible /v1/embeddings endpoint " |
| 3835 | + "(e.g., http://localhost:11434/v1 for Ollama).\"}", true); |
| 3836 | + } |
| 3837 | + |
| 3838 | + bool force = cbm_mcp_get_bool_arg(args, "force"); |
| 3839 | + int existing = cbm_store_count_embeddings(store, project); |
| 3840 | + |
| 3841 | + int generated = cbm_embedding_generate_for_project(store, project, force); |
| 3842 | + |
| 3843 | + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); |
| 3844 | + yyjson_mut_val *root = yyjson_mut_obj(doc); |
| 3845 | + yyjson_mut_doc_set_root(doc, root); |
| 3846 | + |
| 3847 | + yyjson_mut_obj_add_str(doc, root, "status", generated >= 0 ? "success" : "error"); |
| 3848 | + yyjson_mut_obj_add_int(doc, root, "generated", generated >= 0 ? generated : 0); |
| 3849 | + yyjson_mut_obj_add_int(doc, root, "existing_before", existing); |
| 3850 | + yyjson_mut_obj_add_int(doc, root, "total_embeddings", |
| 3851 | + cbm_store_count_embeddings(store, project)); |
| 3852 | + |
| 3853 | + cbm_embedding_config_t cfg = cbm_embedding_get_config(); |
| 3854 | + yyjson_mut_obj_add_str(doc, root, "model", cfg.model ? cfg.model : ""); |
| 3855 | + yyjson_mut_obj_add_int(doc, root, "dimensions", cfg.dims); |
| 3856 | + yyjson_mut_obj_add_str(doc, root, "url", cfg.url ? cfg.url : ""); |
| 3857 | + |
| 3858 | + if (generated >= 0) { |
| 3859 | + yyjson_mut_obj_add_str(doc, root, "hint", |
| 3860 | + "Embeddings generated. search_graph with query= " |
| 3861 | + "now uses hybrid BM25+vector search with RRF merge."); |
| 3862 | + } |
| 3863 | + |
| 3864 | + char *json = yy_doc_to_str(doc); |
| 3865 | + yyjson_mut_doc_free(doc); |
| 3866 | + free(project); |
| 3867 | + |
| 3868 | + char *result = cbm_mcp_text_result(json, generated < 0); |
| 3869 | + free(json); |
| 3870 | + return result; |
| 3871 | +} |
| 3872 | + |
3700 | 3873 | /* ── Tool dispatch ────────────────────────────────────────────── */ |
3701 | 3874 |
|
3702 | 3875 | // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) |
@@ -3761,6 +3934,9 @@ char *cbm_mcp_handle_tool(cbm_mcp_server_t *srv, const char *tool_name, const ch |
3761 | 3934 | if (strcmp(tool_name, "ingest_traces") == 0) { |
3762 | 3935 | return handle_ingest_traces(srv, args_json); |
3763 | 3936 | } |
| 3937 | + if (strcmp(tool_name, "generate_embeddings") == 0) { |
| 3938 | + return handle_generate_embeddings(srv, args_json); |
| 3939 | + } |
3764 | 3940 | char msg[256]; |
3765 | 3941 | snprintf(msg, sizeof(msg), "unknown tool: %s", tool_name); |
3766 | 3942 | return cbm_mcp_text_result(msg, true); |
|
0 commit comments