Skip to content

Commit b380ee7

Browse files
author
Your Name
committed
feat(search): hybrid BM25+vector semantic search via external embedding API
Adds semantic vector search to close the fundamental vocabulary mismatch gap: natural language queries like 'institution name update' can now find symbols like updateCloudClient even when the keywords don't overlap. Architecture: - Embeddings generated via HTTP POST to OpenAI-compatible /v1/embeddings endpoint (Ollama, llamafile, OpenAI — configured via CBM_EMBEDDING_URL) - Stored as float32 BLOBs in a new 'embeddings' table in existing SQLite DB - Cosine similarity computed via registered cbm_cosine_sim() SQLite function - Brute-force search: fast enough for <100K vectors at 384-768 dims - RRF (Reciprocal Rank Fusion, k=60) merges BM25 + vector results New files: - src/pipeline/embedding.h — API for config, text gen, HTTP client, RRF merge - src/pipeline/embedding.c — Full implementation using Mongoose HTTP + yyjson Changes: - src/store/store.c: embeddings table schema, cbm_cosine_sim() function, embedding CRUD + vector_search, batch upsert - src/store/store.h: cbm_vector_result_t, embedding function declarations - src/mcp/mcp.c: generate_embeddings tool, hybrid search in search_graph (vector-only results appear in 'semantic_results' field) - Makefile.cbm: added embedding.c to build Configuration (env vars): CBM_EMBEDDING_URL — Base URL (e.g., http://localhost:11434/v1) CBM_EMBEDDING_MODEL — Model name (default: nomic-embed-text) CBM_EMBEDDING_DIMS — Vector dimensions (default: 768) Usage: 1. Start Ollama: ollama pull nomic-embed-text && ollama serve 2. Set env: export CBM_EMBEDDING_URL=http://localhost:11434/v1 3. Generate: generate_embeddings({project: '...', force: false}) 4. Search: search_graph({query: 'institution name update'}) → BM25 results + semantic_results (vector-only matches) When CBM_EMBEDDING_URL is not set, everything works as before (BM25-only). No new dependencies — uses already-vendored Mongoose (HTTP) and yyjson (JSON).
1 parent 476ddaa commit b380ee7

File tree

6 files changed

+1004
-5
lines changed

6 files changed

+1004
-5
lines changed

Makefile.cbm

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ PIPELINE_SRCS = \
180180
src/pipeline/pass_compile_commands.c \
181181
src/pipeline/pass_infrascan.c \
182182
src/pipeline/pass_k8s.c \
183-
src/pipeline/httplink.c
183+
src/pipeline/httplink.c \
184+
src/pipeline/embedding.c
184185

185186
# Traces module (new)
186187
TRACES_SRCS = src/traces/traces.c

src/mcp/mcp.c

Lines changed: 179 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <sqlite3.h>
1313
#include "cypher/cypher.h"
1414
#include "pipeline/pipeline.h"
15+
#include "pipeline/embedding.h"
1516
#include "cli/cli.h"
1617
#include "watcher/watcher.h"
1718
#include "foundation/mem.h"
@@ -370,6 +371,15 @@ static const tool_def_t TOOLS[] = {
370371
"{\"type\":\"object\",\"properties\":{\"traces\":{\"type\":\"array\",\"items\":{\"type\":"
371372
"\"object\"}},\"project\":{\"type\":"
372373
"\"string\"}},\"required\":[\"traces\",\"project\"]}"},
374+
375+
{"generate_embeddings",
376+
"Generate semantic embeddings for code symbols via external embedding server. "
377+
"Requires CBM_EMBEDDING_URL environment variable (e.g., http://localhost:11434/v1 for Ollama). "
378+
"Embeddings enable hybrid BM25+vector search in search_graph, bridging the gap between "
379+
"keyword queries and conceptual code discovery.",
380+
"{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},"
381+
"\"force\":{\"type\":\"boolean\",\"default\":false,\"description\":"
382+
"\"Re-generate all embeddings even if they already exist\"}},\"required\":[\"project\"]}"},
373383
};
374384

375385
static const int TOOL_COUNT = sizeof(TOOLS) / sizeof(TOOLS[0]);
@@ -1009,11 +1019,60 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
10091019
cbm_search_output_t out = {0};
10101020
cbm_store_search(store, &params, &out);
10111021

1022+
/* ── Hybrid search: if query is provided and embeddings exist, run
1023+
* vector search and merge with BM25 results using RRF (k=60).
1024+
* This enables semantic search: "institution name update" finds
1025+
* updateCloudClient even though the keywords don't overlap. ── */
1026+
cbm_rrf_result_t *rrf_results = NULL;
1027+
int rrf_count = 0;
1028+
bool used_hybrid = false;
1029+
1030+
if (query && query[0] && cbm_embedding_is_configured()) {
1031+
int emb_count = cbm_store_count_embeddings(store, project);
1032+
if (emb_count > 0) {
1033+
cbm_embedding_config_t cfg = cbm_embedding_get_config();
1034+
1035+
/* Embed the query text */
1036+
float *query_vec = cbm_embedding_embed_text(&cfg, query);
1037+
if (query_vec) {
1038+
/* Run vector search */
1039+
cbm_vector_result_t *vec_results = NULL;
1040+
int vec_count = 0;
1041+
cbm_store_vector_search(store, project, query_vec, cfg.dims,
1042+
50, &vec_results, &vec_count);
1043+
1044+
if (vec_count > 0) {
1045+
/* Collect BM25 node IDs in ranked order */
1046+
int64_t *bm25_ids = malloc((size_t)out.count * sizeof(int64_t));
1047+
if (bm25_ids) {
1048+
for (int i = 0; i < out.count; i++) {
1049+
bm25_ids[i] = out.results[i].node.id;
1050+
}
1051+
1052+
/* RRF merge */
1053+
cbm_embedding_rrf_merge(bm25_ids, out.count,
1054+
vec_results, vec_count,
1055+
&rrf_results, &rrf_count);
1056+
used_hybrid = true;
1057+
free(bm25_ids);
1058+
}
1059+
}
1060+
1061+
cbm_store_free_vector_results(vec_results, vec_count);
1062+
free(query_vec);
1063+
}
1064+
}
1065+
}
1066+
10121067
yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL);
10131068
yyjson_mut_val *root = yyjson_mut_obj(doc);
10141069
yyjson_mut_doc_set_root(doc, root);
10151070

1016-
yyjson_mut_obj_add_int(doc, root, "total", out.total);
1071+
yyjson_mut_obj_add_int(doc, root, "total",
1072+
used_hybrid ? rrf_count : out.total);
1073+
if (used_hybrid) {
1074+
yyjson_mut_obj_add_str(doc, root, "search_mode", "hybrid_bm25_vector");
1075+
}
10171076

10181077
/* For each result, look up which execution flows it participates in.
10191078
* This enables process-grouped search results similar to GitNexus's
@@ -1062,14 +1121,78 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) {
10621121

10631122
yyjson_mut_arr_add_val(results, item);
10641123
}
1065-
if (proc_stmt) sqlite3_finalize(proc_stmt);
10661124

10671125
yyjson_mut_obj_add_val(doc, root, "results", results);
1068-
yyjson_mut_obj_add_bool(doc, root, "has_more", out.total > offset + out.count);
1126+
yyjson_mut_obj_add_bool(doc, root, "has_more",
1127+
used_hybrid ? false : (out.total > offset + out.count));
1128+
1129+
/* If hybrid search found vector-only results (not in BM25), add them.
1130+
* These are semantically relevant results that keyword search missed —
1131+
* the whole point of vector search. */
1132+
if (used_hybrid && rrf_results) {
1133+
yyjson_mut_val *vec_only = yyjson_mut_arr(doc);
1134+
int vec_only_count = 0;
1135+
1136+
for (int i = 0; i < rrf_count && vec_only_count < 20; i++) {
1137+
if (rrf_results[i].bm25_rank < 0) {
1138+
/* This result was found ONLY by vector search */
1139+
cbm_node_t vnode = {0};
1140+
if (cbm_store_find_node_by_id(store, rrf_results[i].node_id,
1141+
&vnode) == CBM_STORE_OK) {
1142+
yyjson_mut_val *vitem = yyjson_mut_obj(doc);
1143+
yyjson_mut_obj_add_str(doc, vitem, "name",
1144+
vnode.name ? vnode.name : "");
1145+
yyjson_mut_obj_add_str(doc, vitem, "qualified_name",
1146+
vnode.qualified_name ? vnode.qualified_name : "");
1147+
yyjson_mut_obj_add_str(doc, vitem, "label",
1148+
vnode.label ? vnode.label : "");
1149+
yyjson_mut_obj_add_str(doc, vitem, "file_path",
1150+
vnode.file_path ? vnode.file_path : "");
1151+
yyjson_mut_obj_add_real(doc, vitem, "similarity",
1152+
rrf_results[i].similarity);
1153+
yyjson_mut_obj_add_real(doc, vitem, "rrf_score",
1154+
rrf_results[i].rrf_score);
1155+
1156+
/* Process participation for vector-only results too */
1157+
if (proc_stmt) {
1158+
sqlite3_reset(proc_stmt);
1159+
sqlite3_bind_int64(proc_stmt, 1, rrf_results[i].node_id);
1160+
yyjson_mut_val *vproc_arr = yyjson_mut_arr(doc);
1161+
while (sqlite3_step(proc_stmt) == SQLITE_ROW) {
1162+
yyjson_mut_val *vpobj = yyjson_mut_obj(doc);
1163+
yyjson_mut_obj_add_int(doc, vpobj, "id",
1164+
sqlite3_column_int64(proc_stmt, 0));
1165+
const char *vplabel =
1166+
(const char *)sqlite3_column_text(proc_stmt, 1);
1167+
yyjson_mut_obj_add_strcpy(doc, vpobj, "label",
1168+
vplabel ? vplabel : "");
1169+
yyjson_mut_obj_add_int(doc, vpobj, "step_count",
1170+
sqlite3_column_int(proc_stmt, 2));
1171+
yyjson_mut_arr_add_val(vproc_arr, vpobj);
1172+
}
1173+
yyjson_mut_obj_add_val(doc, vitem, "processes", vproc_arr);
1174+
}
1175+
1176+
yyjson_mut_arr_add_val(vec_only, vitem);
1177+
vec_only_count++;
1178+
cbm_node_free_fields(&vnode);
1179+
}
1180+
}
1181+
}
1182+
1183+
if (vec_only_count > 0) {
1184+
yyjson_mut_obj_add_val(doc, root, "semantic_results", vec_only);
1185+
yyjson_mut_obj_add_int(doc, root, "semantic_result_count", vec_only_count);
1186+
}
1187+
}
1188+
1189+
if (proc_stmt) sqlite3_finalize(proc_stmt);
1190+
// Note: proc_stmt finalize moved here to be AFTER vector-only result processing
10691191

10701192
char *json = yy_doc_to_str(doc);
10711193
yyjson_mut_doc_free(doc);
10721194
cbm_store_search_free(&out);
1195+
free(rrf_results);
10731196

10741197
free(project);
10751198
free(label);
@@ -3697,6 +3820,56 @@ static char *handle_ingest_traces(cbm_mcp_server_t *srv, const char *args) {
36973820
return result;
36983821
}
36993822

3823+
/* ── generate_embeddings ─────────────────────────────────────── */
3824+
3825+
/*
 * MCP tool handler: generate_embeddings.
 *
 * Reads the "project" (string) and "force" (boolean) arguments from the JSON
 * argument text, asks the embedding pipeline to generate embeddings for that
 * project, and reports the outcome as a JSON object with these keys:
 *
 *   status            "success" when the generator returned >= 0, else "error"
 *   generated         generator return value, clamped to 0 on failure
 *   existing_before   embedding count for the project before generation
 *   total_embeddings  embedding count for the project after generation
 *   model, dimensions, url
 *                     values taken from the current embedding configuration
 *   hint              present only on success
 *
 * When no embedding endpoint is configured, a fixed JSON error payload is
 * returned instead and no generation is attempted.
 *
 * Returns whatever cbm_mcp_text_result() produces; its second argument is
 * true on the two failure paths (not configured, generator returned < 0).
 */
static char *handle_generate_embeddings(cbm_mcp_server_t *srv, const char *args) {
    char *project = cbm_mcp_get_string_arg(args, "project");
    cbm_store_t *store = resolve_store(srv, project);
    /* NOTE(review): REQUIRE_STORE is a macro defined elsewhere; presumably it
     * returns early (and releases `project`) when no store was resolved. */
    REQUIRE_STORE(store, project);

    /* Guard: nothing can be generated without a configured endpoint. */
    if (!cbm_embedding_is_configured()) {
        free(project);
        return cbm_mcp_text_result(
            "{\"error\":\"CBM_EMBEDDING_URL not set. "
            "Set CBM_EMBEDDING_URL to an OpenAI-compatible /v1/embeddings endpoint "
            "(e.g., http://localhost:11434/v1 for Ollama).\"}",
            true);
    }

    bool force = cbm_mcp_get_bool_arg(args, "force");

    /* Sample the embedding count on both sides of the generation run so the
     * response can report before/after totals. */
    int count_before = cbm_store_count_embeddings(store, project);
    int generated = cbm_embedding_generate_for_project(store, project, force);
    int count_after = cbm_store_count_embeddings(store, project);

    /* A negative return from the generator is treated as failure. */
    bool failed = generated < 0;

    cbm_embedding_config_t cfg = cbm_embedding_get_config();
    const char *model_name = cfg.model ? cfg.model : "";
    const char *endpoint_url = cfg.url ? cfg.url : "";

    yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL);
    yyjson_mut_val *root = yyjson_mut_obj(doc);
    yyjson_mut_doc_set_root(doc, root);

    if (failed) {
        yyjson_mut_obj_add_str(doc, root, "status", "error");
        yyjson_mut_obj_add_int(doc, root, "generated", 0);
    } else {
        yyjson_mut_obj_add_str(doc, root, "status", "success");
        yyjson_mut_obj_add_int(doc, root, "generated", generated);
    }
    yyjson_mut_obj_add_int(doc, root, "existing_before", count_before);
    yyjson_mut_obj_add_int(doc, root, "total_embeddings", count_after);

    yyjson_mut_obj_add_str(doc, root, "model", model_name);
    yyjson_mut_obj_add_int(doc, root, "dimensions", cfg.dims);
    yyjson_mut_obj_add_str(doc, root, "url", endpoint_url);

    if (!failed) {
        yyjson_mut_obj_add_str(doc, root, "hint",
                               "Embeddings generated. search_graph with query= "
                               "now uses hybrid BM25+vector search with RRF merge.");
    }

    /* Serialize first, then release the document and the project string. */
    char *json = yy_doc_to_str(doc);
    yyjson_mut_doc_free(doc);
    free(project);

    char *result = cbm_mcp_text_result(json, failed);
    free(json);
    return result;
}
3872+
37003873
/* ── Tool dispatch ────────────────────────────────────────────── */
37013874

37023875
// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
@@ -3761,6 +3934,9 @@ char *cbm_mcp_handle_tool(cbm_mcp_server_t *srv, const char *tool_name, const ch
37613934
if (strcmp(tool_name, "ingest_traces") == 0) {
37623935
return handle_ingest_traces(srv, args_json);
37633936
}
3937+
if (strcmp(tool_name, "generate_embeddings") == 0) {
3938+
return handle_generate_embeddings(srv, args_json);
3939+
}
37643940
char msg[256];
37653941
snprintf(msg, sizeof(msg), "unknown tool: %s", tool_name);
37663942
return cbm_mcp_text_result(msg, true);

0 commit comments

Comments
 (0)