Skip to content

Commit 452f5a7

Browse files
committed
Add MinHash fingerprinting and SIMILAR_TO edges for near-clone detection
Compute K=64 MinHash signatures from normalized AST node-type trigrams during function extraction, then generate SIMILAR_TO edges via LSH (b=32, r=2) for function pairs with Jaccard >= 0.95.

- src/simhash/minhash.{h,c}: MinHash compute, Jaccard, hex encode/decode, LSH index with band hashing for O(n) candidate generation
- src/pipeline/pass_similarity.c: post-pass reads fingerprints from node properties, builds LSH index, emits SIMILAR_TO edges with jaccard and same_file metadata. Same-language only, max 10 edges per node.
- internal/cbm/cbm.h: fingerprint fields on CBMDefinition
- internal/cbm/extract_defs.c: compute_fingerprint() hook at 3 extraction sites after complexity, skip functions with < 10 AST body nodes
- pass_definitions.c + pass_parallel.c: serialize fingerprint to "fp" hex in properties_json for both sequential and parallel pipeline paths
- pipeline.c + pipeline_incremental.c: register pass_similarity in both full and incremental post-pass lists
- tests/test_simhash.c: 28 tests across 4 suites (core, LSH, edge gen, pipeline integration with generated Go project + incremental)
1 parent e07443b commit 452f5a7

File tree

14 files changed

+1946
-4
lines changed

14 files changed

+1946
-4
lines changed

Makefile.cbm

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,10 @@ PIPELINE_SRCS = \
186186
src/pipeline/pass_compile_commands.c \
187187
src/pipeline/pass_infrascan.c \
188188
src/pipeline/pass_k8s.c \
189+
src/pipeline/pass_similarity.c
190+
191+
# SimHash / MinHash module
192+
SIMHASH_SRCS = src/simhash/minhash.c
189193

190194
# Traces module (new)
191195
TRACES_SRCS = src/traces/traces.c
@@ -233,7 +237,7 @@ TRE_CFLAGS = -std=c11 -g -O1 -w -Ivendored/tre
233237
YYJSON_SRC = vendored/yyjson/yyjson.c
234238

235239
# All production sources
236-
PROD_SRCS = $(FOUNDATION_SRCS) $(STORE_SRCS) $(CYPHER_SRCS) $(MCP_SRCS) $(DISCOVER_SRCS) $(GRAPH_BUFFER_SRCS) $(PIPELINE_SRCS) $(TRACES_SRCS) $(WATCHER_SRCS) $(CLI_SRCS) $(UI_SRCS) $(YYJSON_SRC)
240+
PROD_SRCS = $(FOUNDATION_SRCS) $(STORE_SRCS) $(CYPHER_SRCS) $(MCP_SRCS) $(DISCOVER_SRCS) $(GRAPH_BUFFER_SRCS) $(PIPELINE_SRCS) $(SIMHASH_SRCS) $(TRACES_SRCS) $(WATCHER_SRCS) $(CLI_SRCS) $(UI_SRCS) $(YYJSON_SRC)
237241
EXISTING_C_SRCS = $(EXTRACTION_SRCS) $(LSP_SRCS) $(TS_RUNTIME_SRC) \
238242
$(GRAMMAR_SRCS) $(AC_LZ4_SRCS) $(SQLITE_WRITER_SRC)
239243

@@ -301,7 +305,9 @@ TEST_SECURITY_SRCS = tests/test_security.c
301305

302306
TEST_YAML_SRCS = tests/test_yaml.c
303307

304-
ALL_TEST_SRCS = $(TEST_FOUNDATION_SRCS) $(TEST_EXTRACTION_SRCS) $(TEST_STORE_SRCS) $(TEST_CYPHER_SRCS) $(TEST_MCP_SRCS) $(TEST_DISCOVER_SRCS) $(TEST_GRAPH_BUFFER_SRCS) $(TEST_PIPELINE_SRCS) $(TEST_WATCHER_SRCS) $(TEST_LZ4_SRCS) $(TEST_SQLITE_WRITER_SRCS) $(TEST_GO_LSP_SRCS) $(TEST_C_LSP_SRCS) $(TEST_TRACES_SRCS) $(TEST_CLI_SRCS) $(TEST_MEM_SRCS) $(TEST_UI_SRCS) $(TEST_SECURITY_SRCS) $(TEST_YAML_SRCS) $(TEST_INTEGRATION_SRCS)
308+
TEST_SIMHASH_SRCS = tests/test_simhash.c
309+
310+
ALL_TEST_SRCS = $(TEST_FOUNDATION_SRCS) $(TEST_EXTRACTION_SRCS) $(TEST_STORE_SRCS) $(TEST_CYPHER_SRCS) $(TEST_MCP_SRCS) $(TEST_DISCOVER_SRCS) $(TEST_GRAPH_BUFFER_SRCS) $(TEST_PIPELINE_SRCS) $(TEST_WATCHER_SRCS) $(TEST_LZ4_SRCS) $(TEST_SQLITE_WRITER_SRCS) $(TEST_GO_LSP_SRCS) $(TEST_C_LSP_SRCS) $(TEST_TRACES_SRCS) $(TEST_CLI_SRCS) $(TEST_MEM_SRCS) $(TEST_UI_SRCS) $(TEST_SECURITY_SRCS) $(TEST_YAML_SRCS) $(TEST_SIMHASH_SRCS) $(TEST_INTEGRATION_SRCS)
305311

306312

307313
# ── Build directories ────────────────────────────────────────────
@@ -505,8 +511,8 @@ SYSROOT_FLAG = $(if $(SYSROOT),-isysroot $(SYSROOT),)
505511

506512
# Our source files (excluding vendored, grammars, tree-sitter runtime)
507513
LINT_SRCS = $(FOUNDATION_SRCS) $(STORE_SRCS) $(CYPHER_SRCS) $(MCP_SRCS) \
508-
$(DISCOVER_SRCS) $(GRAPH_BUFFER_SRCS) $(PIPELINE_SRCS) $(TRACES_SRCS) \
509-
$(WATCHER_SRCS) $(CLI_SRCS) $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) \
514+
$(DISCOVER_SRCS) $(GRAPH_BUFFER_SRCS) $(PIPELINE_SRCS) $(SIMHASH_SRCS) \
515+
$(TRACES_SRCS) $(WATCHER_SRCS) $(CLI_SRCS) $(EXTRACTION_SRCS) $(AC_LZ4_SRCS) \
510516
$(SQLITE_WRITER_SRC) $(MAIN_SRC)
511517
LINT_HDRS = $(wildcard src/**/*.h src/*.h $(CBM_DIR)/*.h)
512518
LINT_TEST_SRCS = $(ALL_TEST_SRCS)

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,19 @@ codebase-memory-mcp config set auto_index_limit 50000 # max files for auto-in
339339
codebase-memory-mcp config reset auto_index # reset to default
340340
```
341341

342+
### Environment Variables
343+
344+
| Variable | Default | Description |
345+
|----------|---------|-------------|
346+
| `CBM_CACHE_DIR` | `~/.cache/codebase-memory-mcp` | Override the database storage directory. All project indexes and config are stored here. |
347+
| `CBM_DIAGNOSTICS` | `false` | Set to `1` or `true` to enable periodic diagnostics output to `/tmp/cbm-diagnostics-<pid>.json`. |
348+
| `CBM_DOWNLOAD_URL` | *(GitHub releases)* | Override the download URL for updates. Used for testing or self-hosted deployments. |
349+
350+
```bash
351+
# Store indexes in a custom directory
352+
export CBM_CACHE_DIR=~/my-projects/cbm-data
353+
```
354+
342355
## Custom File Extensions
343356

344357
Map additional file extensions to supported languages via JSON config files. Useful for framework-specific extensions like `.blade.php` (Laravel) or `.mjs` (ES modules).

internal/cbm/cbm.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ typedef struct {
103103
const char *route_method; // HTTP method from decorator (e.g., "POST") or NULL
104104
int complexity; // cyclomatic complexity
105105
int lines; // body line count
106+
uint32_t *fingerprint; // MinHash fingerprint (arena-allocated, K values) or NULL
107+
int fingerprint_k; // number of hash values (CBM_MINHASH_K or 0)
106108
bool is_exported;
107109
bool is_abstract;
108110
bool is_test;

internal/cbm/extract_defs.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "helpers.h"
44
#include "lang_specs.h"
55
#include "foundation/constants.h"
6+
#include "simhash/minhash.h"
67
#include "tree_sitter/api.h" // TSNode, ts_node_*
78
#include <stdint.h> // uint32_t
89
#include <string.h>
@@ -28,6 +29,30 @@ enum {
2829
NESTED_CLASS_STACK_CAP = 128,
2930
};
3031

32+
/* Compute MinHash fingerprint for a function body node and store in def.
33+
* Sets def->fingerprint (arena-allocated) and def->fingerprint_k on success,
34+
* leaves them NULL/0 if the body is too short. */
35+
static void compute_fingerprint(CBMExtractCtx *ctx, CBMDefinition *def, TSNode func_node) {
36+
/* Find the function body child */
37+
TSNode body = ts_node_child_by_field_name(func_node, TS_FIELD("body"));
38+
if (ts_node_is_null(body)) {
39+
/* Some languages use "block" or the function itself as the body */
40+
body = func_node;
41+
}
42+
cbm_minhash_t result;
43+
if (!cbm_minhash_compute(body, ctx->source, (int)ctx->language, &result)) {
44+
return; /* Too short or empty — no fingerprint */
45+
}
46+
/* Arena-allocate the fingerprint array */
47+
uint32_t *fp = cbm_arena_alloc(ctx->arena, CBM_MINHASH_K * sizeof(uint32_t));
48+
if (!fp) {
49+
return;
50+
}
51+
memcpy(fp, result.values, CBM_MINHASH_K * sizeof(uint32_t));
52+
def->fingerprint = fp;
53+
def->fingerprint_k = CBM_MINHASH_K;
54+
}
55+
3156
// Tree-sitter row is 0-based; lines are 1-based.
3257

3358
// Null-terminated array allocation: need count + 1 for terminator.
@@ -1421,6 +1446,9 @@ static void extract_func_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec
14211446
def.complexity = cbm_count_branching(node, spec->branching_node_types);
14221447
}
14231448

1449+
// MinHash fingerprint
1450+
compute_fingerprint(ctx, &def, func_node);
1451+
14241452
// JS/TS export detection
14251453
if (ctx->language == CBM_LANG_JAVASCRIPT || ctx->language == CBM_LANG_TYPESCRIPT ||
14261454
ctx->language == CBM_LANG_TSX) {
@@ -1855,6 +1883,9 @@ static void push_method_def(CBMExtractCtx *ctx, TSNode child, const char *class_
18551883
def.complexity = cbm_count_branching(child, spec->branching_node_types);
18561884
}
18571885

1886+
// MinHash fingerprint
1887+
compute_fingerprint(ctx, &def, child);
1888+
18581889
cbm_defs_push(&ctx->result->defs, a, def);
18591890
}
18601891

@@ -1988,6 +2019,9 @@ static void extract_rust_impl(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec
19882019
def.complexity = cbm_count_branching(child, spec->branching_node_types);
19892020
}
19902021

2022+
// MinHash fingerprint
2023+
compute_fingerprint(ctx, &def, child);
2024+
19912025
cbm_defs_push(&ctx->result->defs, a, def);
19922026
}
19932027
}

src/pipeline/pass_definitions.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ enum { PD_RING = 4, PD_RING_MASK = 3, PD_JSON_MARGIN = 10, PD_ESC_MARGIN = 3, PD
2020
#include "foundation/log.h"
2121
#include "foundation/compat.h"
2222
#include "cbm.h"
23+
#include "simhash/minhash.h"
2324

2425
#include <stdio.h>
2526
#include <stdlib.h>
@@ -191,6 +192,14 @@ static void build_def_props(char *buf, size_t bufsize, const CBMDefinition *def)
191192
append_json_string(buf, bufsize, &pos, "route_path", def->route_path);
192193
append_json_string(buf, bufsize, &pos, "route_method", def->route_method);
193194

195+
/* MinHash fingerprint — append if present and buffer has room. */
196+
if (def->fingerprint && def->fingerprint_k > 0 &&
197+
pos + CBM_MINHASH_HEX_LEN + CBM_MINHASH_JSON_OVERHEAD < bufsize) {
198+
char fp_hex[CBM_MINHASH_HEX_BUF];
199+
cbm_minhash_to_hex((const cbm_minhash_t *)def->fingerprint, fp_hex, sizeof(fp_hex));
200+
append_json_string(buf, bufsize, &pos, "fp", fp_hex);
201+
}
202+
194203
if (pos < bufsize - SKIP_ONE) {
195204
buf[pos] = '}';
196205
buf[pos + SKIP_ONE] = '\0';

src/pipeline/pass_parallel.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ enum {
3939
#include "foundation/mem.h"
4040
#include "foundation/compat_regex.h"
4141
#include "cbm.h"
42+
#include "simhash/minhash.h"
4243

4344
#include <stdatomic.h>
4445
#include <stdint.h>
@@ -212,6 +213,16 @@ static void build_def_props(char *buf, size_t bufsize, const CBMDefinition *def)
212213
append_json_str_array(buf, bufsize, &pos, "param_types", def->param_types);
213214
append_json_string(buf, bufsize, &pos, "route_path", def->route_path);
214215
append_json_string(buf, bufsize, &pos, "route_method", def->route_method);
216+
217+
/* MinHash fingerprint — append if present and buffer has room.
218+
* Hex-encoded K=64 uint32 = 512 chars + key/quotes ≈ 520 chars. */
219+
if (def->fingerprint && def->fingerprint_k > 0 &&
220+
pos + CBM_MINHASH_HEX_LEN + CBM_MINHASH_JSON_OVERHEAD < bufsize) {
221+
char fp_hex[CBM_MINHASH_HEX_BUF];
222+
cbm_minhash_to_hex((const cbm_minhash_t *)def->fingerprint, fp_hex, sizeof(fp_hex));
223+
append_json_string(buf, bufsize, &pos, "fp", fp_hex);
224+
}
225+
215226
if (pos < bufsize - SKIP_ONE) {
216227
buf[pos] = '}';
217228
buf[pos + SKIP_ONE] = '\0';

0 commit comments

Comments
 (0)