Skip to content

Commit 09ce20a

Browse files
committed
Improve MinHash quality: leaf-only tokens, structural weighting, unique trigram gate
Four changes to reduce false positives, cutting SIMILAR_TO edges on the Linux kernel from 382K to 58K while maintaining ~98% precision:
1. MIN_NODES raised to 30 (leaf tokens) — aligns with the BigCloneBench standard minimum clone size of 50 raw source tokens / 6 lines.
2. collect_ast_tokens now records LEAF nodes only, not internal grammar nodes. Leaf-only counting is language-agnostic: leaves correspond to actual source tokens regardless of grammar verbosity (C tree-sitter produces 2x more internal nodes than Python for the same logic).
3. hash_trigrams tracks the unique structural trigram count via a hash set. Functions with < 32 unique structural trigrams are skipped — too few distinct features for K=64 MinHash to produce reliable estimates.
4. Structural-weighted MinHash via repetition: each trigram gets weight 0-3 based on its count of non-normalised tokens (I/S/N/T = normalised). Weight-0 trigrams (all-normalised, e.g. I|I|I) are pure noise and skipped entirely. Weight 1-3 trigrams use repetition-based weighted MinHash — higher weight = more hash attempts per seed = more likely to dominate the signature. This achieves an IDF-like effect without needing corpus-wide statistics.
Also adds O(1) hash-set dedup in the LSH query (replacing an O(n) linear scan) and a bucket size cap of 200 to skip oversized buckets.
Linux kernel: 1:41, similarity 7.7s, 395K fingerprints, 58K edges, ~98% precision.
Tests updated with larger Go functions to meet the new thresholds.
1 parent 452f5a7 commit 09ce20a

File tree

4 files changed

+379
-223
lines changed

4 files changed

+379
-223
lines changed

internal/cbm/extract_defs.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ static void compute_fingerprint(CBMExtractCtx *ctx, CBMDefinition *def, TSNode f
3636
/* Find the function body child */
3737
TSNode body = ts_node_child_by_field_name(func_node, TS_FIELD("body"));
3838
if (ts_node_is_null(body)) {
39-
/* Some languages use "block" or the function itself as the body */
4039
body = func_node;
4140
}
4241
cbm_minhash_t result;

src/simhash/minhash.c

Lines changed: 159 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,31 @@ enum { AST_WALK_CAP = 2048 };
3333
/* Hex encoding constants */
3434
enum { HEX_CHARS_PER_U32 = 8, HEX_BASE = 16 };
3535

36-
/* Trigram window and minimum threshold */
37-
enum { TRIGRAM_WINDOW = 2, MIN_TRIGRAM_COUNT = 3 };
36+
/* Trigram constants */
37+
enum { TRIGRAM_WINDOW = 2 };
38+
39+
/* Minimum unique structural trigrams for a meaningful fingerprint.
40+
* K/2 = 32 ensures most MinHash slots get populated from distinct features. */
41+
enum { MIN_UNIQUE_TRIGRAMS = 32 };
42+
43+
/* Maximum structural weight per trigram (3 tokens × 1 each). */
44+
enum { MAX_STRUCTURAL_WEIGHT = 3 };
3845

3946
/* Hash truncation mask (uint64 → uint32) */
4047
enum { U32_MASK = 0xFFFFFFFF };
4148

4249
/* Dynamic array growth constants */
4350
enum { BUCKET_INIT_CAP = 8, GROW_FACTOR = 2, ENTRY_INIT_CAP = 64, RESULT_INIT_CAP = 64 };
4451

52+
/* Maximum bucket size — skip oversized buckets (noise from trivially similar functions). */
53+
enum { MAX_BUCKET_SIZE = 200 };
54+
55+
/* Seen-set for O(1) dedup during query (simple open-addressing hash table). */
56+
/* Seen-set geometry. The slot count and index mask are derived from
 * SEEN_SET_BITS so the three values cannot drift apart; the table size is
 * always a power of two, which is what makes `& SEEN_SET_MASK` a valid
 * modulo. */
enum {
    SEEN_SET_BITS = 14,                 /* log2 of the slot count */
    SEEN_SET_SIZE = 1 << SEEN_SET_BITS, /* 16384 slots */
    SEEN_SET_MASK = SEEN_SET_SIZE - 1   /* 16383 */
};
57+
58+
/* Knuth multiplicative hash constant for node_id → seen-set slot. */
59+
/* 2654435761 does not fit in an int, and ISO C before C23 requires every
 * enumerator to be representable as int, so an enum cannot portably carry
 * this value. As a 64-bit unsigned constant it also forces
 * `node_id * KNUTH_MULT` into unsigned arithmetic, where the product wraps
 * instead of overflowing a signed type. */
#define KNUTH_MULT UINT64_C(2654435761)
60+
4561
/* Maximum normalised tokens per function body. */
4662
enum { MAX_TOKENS = 4096 };
4763

@@ -97,7 +113,32 @@ static const char *normalise_node_type(const char *kind) {
97113

98114
/* ── MinHash computation ─────────────────────────────────────────── */
99115

100-
/* Phase 1: Walk AST iteratively and collect normalised token types. */
116+
/* Check if a normalised token is one of the generic types (I/S/N/T).
117+
* These carry no structural info — used to compute trigram weight. */
118+
static bool is_normalised_token(const char *tok) {
119+
return tok[0] != '\0' && tok[SKIP_ONE] == '\0' &&
120+
(tok[0] == 'I' || tok[0] == 'S' || tok[0] == 'N' || tok[0] == 'T');
121+
}
122+
123+
/* Compute structural weight of a trigram: count of non-normalised tokens (0-3).
124+
* Weight 0 = pure data manipulation (noise), weight 3 = rich control flow (signal). */
125+
/* Structural weight of the trigram (a, b, c): the number of its tokens that
 * are NOT generic placeholders according to is_normalised_token().
 *
 * Returns a value in 0..3. 0 means all three tokens are placeholders; 3 means
 * none of them is. */
static int trigram_structural_weight(const char *a, const char *b, const char *c) {
    const char *const parts[] = {a, b, c};
    int weight = 0;
    for (size_t i = 0; i < sizeof(parts) / sizeof(parts[0]); i++) {
        weight += is_normalised_token(parts[i]) ? 0 : 1;
    }
    return weight;
}
138+
139+
/* Phase 1: Walk AST iteratively and collect normalised LEAF token types.
140+
* Leaf-only counting is language-agnostic: leaf nodes correspond to actual
141+
* source tokens, not grammar-internal structure that varies across parsers. */
101142
static int collect_ast_tokens(TSNode root, const char **tokens, int max_tokens) {
102143
int token_count = 0;
103144
TSNode stack[AST_WALK_CAP];
@@ -107,16 +148,16 @@ static int collect_ast_tokens(TSNode root, const char **tokens, int max_tokens)
107148
while (top > 0 && token_count < max_tokens) {
108149
TSNode node = stack[--top];
109150
uint32_t child_count = ts_node_child_count(node);
110-
const char *kind = ts_node_type(node);
111151

112152
if (child_count == 0) {
153+
/* Leaf node — actual source token. Normalise and record. */
154+
const char *kind = ts_node_type(node);
113155
if (kind[0] != '\0') {
114156
tokens[token_count++] = normalise_node_type(kind);
115157
}
116158
} else {
117-
if (kind[0] != '\0' && ts_node_is_named(node)) {
118-
tokens[token_count++] = normalise_node_type(kind);
119-
}
159+
/* Internal node — push children only (skip the node itself).
160+
* Structural info comes from leaf token patterns, not grammar nodes. */
120161
for (int i = (int)child_count - SKIP_ONE; i >= 0 && top < AST_WALK_CAP; i--) {
121162
stack[top++] = ts_node_child(node, (uint32_t)i);
122163
}
@@ -125,31 +166,84 @@ static int collect_ast_tokens(TSNode root, const char **tokens, int max_tokens)
125166
return token_count;
126167
}
127168

128-
/* Phase 2: Hash trigrams from token sequence into MinHash signature. */
169+
/* Phase 2: Hash trigrams into MinHash signature with structural weighting.
170+
*
171+
* - Skip weight-0 trigrams (all tokens are I/S/N/T — pure noise)
172+
* - Use repetition-based weighted MinHash: hash w times per seed for weight w
173+
* - Track unique trigrams via hash set; reject if < MIN_UNIQUE_TRIGRAMS
174+
*
175+
* Returns the number of unique structural trigrams processed. */
176+
/* Unique-trigram set: open addressing on 64-bit hashes. */
177+
/* Slot count of the unique-trigram set and the matching index mask. The mask
 * is derived from the size so the two cannot drift apart; the size must stay
 * a power of two for `& UNIQ_SET_MASK` to act as a modulo. */
enum { UNIQ_SET_SIZE = 4096, UNIQ_SET_MASK = UNIQ_SET_SIZE - 1 };
178+
179+
typedef struct {
180+
uint64_t slots[UNIQ_SET_SIZE];
181+
int count;
182+
} uniq_trig_set_t;
183+
184+
/* Reset `s` to the empty set: no keys counted and every slot zeroed
 * (0 is the "unused slot" marker). */
static void uniq_trig_init(uniq_trig_set_t *s) {
    s->count = 0;
    memset(s->slots, 0, sizeof s->slots);
}
188+
189+
/* Insert a trigram hash. Returns true if newly inserted. */
190+
/* Insert a trigram hash into the set.
 *
 * Returns true if the key was newly stored (s->count is incremented), false
 * if it was already present or if every slot is occupied.
 *
 * 0 is reserved for "unused slot", so the stored key is the hash with its
 * low bit forced on. Two hashes that differ only in that bit therefore map to
 * the same key. The probe sequence starts from the remaining bits (low bit
 * dropped) so that both such hashes begin at the same slot; starting from the
 * raw hash would let them start one slot apart and store — and count — the
 * same key twice. */
static bool uniq_trig_insert(uniq_trig_set_t *s, uint64_t trig_hash) {
    uint64_t val = trig_hash | SKIP_ONE; /* ensure non-zero */
    uint32_t slot = (uint32_t)((trig_hash >> 1) & UNIQ_SET_MASK);
    for (int probe = 0; probe < UNIQ_SET_SIZE; probe++) {
        /* Linear probing with wrap-around. */
        uint32_t idx = (slot + (uint32_t)probe) & UNIQ_SET_MASK;
        if (s->slots[idx] == 0) {
            s->slots[idx] = val;
            s->count++;
            return true;
        }
        if (s->slots[idx] == val) {
            return false; /* already present */
        }
    }
    return false; /* table full */
}
206+
207+
/* Apply weighted MinHash for one trigram: hash w times per seed. */
208+
/* Fold one trigram into the MinHash signature `out`, weighted by repetition.
 *
 * For each of the CBM_MINHASH_K signature slots the trigram text
 * (`trigram`, `len` bytes) is hashed `w` times with distinct seeds
 * (slot * MAX_STRUCTURAL_WEIGHT + repetition), and the slot keeps the
 * smallest 32-bit value seen. A larger `w` gives the trigram more chances to
 * supply the slot minimum. With w <= 0 the signature is left untouched.
 *
 * Seeds are only distinct across slots while w <= MAX_STRUCTURAL_WEIGHT. */
static void weighted_minhash_update(cbm_minhash_t *out, const char *trigram, int len, int w) {
    const size_t nbytes = (size_t)len;
    for (int slot = 0; slot < CBM_MINHASH_K; slot++) {
        const uint64_t base_seed = (uint64_t)slot * MAX_STRUCTURAL_WEIGHT;
        for (int r = 0; r < w; r++) {
            uint64_t full = XXH3_64bits_withSeed(trigram, nbytes, base_seed + (uint64_t)r);
            uint32_t cand = (uint32_t)(full & U32_MASK); /* keep the low 32 bits */
            if (cand < out->values[slot]) {
                out->values[slot] = cand;
            }
        }
    }
}
220+
129221
/* Phase 2: hash the token trigrams into a MinHash signature with structural
 * weighting.
 *
 * tokens / token_count: normalised token sequence.
 * out: signature to fill; every slot is first reset to UINT32_MAX.
 *
 * For each window of three consecutive tokens:
 *   - weight-0 trigrams (trigram_structural_weight() == 0) are skipped;
 *   - the trigram is rendered as "a|b|c"; trigrams that fail to format or do
 *     not fit in the buffer are skipped;
 *   - the trigram's hash is recorded in a unique set, and the signature is
 *     updated only the first time a trigram is seen. The update takes a
 *     per-slot minimum over hashes that depend only on the trigram text and
 *     its weight, so repeating it for a duplicate cannot change the
 *     signature — skipping duplicates saves up to
 *     CBM_MINHASH_K * MAX_STRUCTURAL_WEIGHT hash calls each.
 *
 * Returns the number of unique structural trigrams recorded. */
static int hash_trigrams(const char **tokens, int token_count, cbm_minhash_t *out) {
    for (int k = 0; k < CBM_MINHASH_K; k++) {
        out->values[k] = UINT32_MAX;
    }

    uniq_trig_set_t uniq;
    uniq_trig_init(&uniq);
    char trigram_buf[TRIGRAM_BUF_LEN];

    for (int i = 0; i + TRIGRAM_WINDOW < token_count; i++) {
        int w =
            trigram_structural_weight(tokens[i], tokens[i + SKIP_ONE], tokens[i + TRIGRAM_WINDOW]);
        if (w == 0) {
            continue;
        }

        int len = snprintf(trigram_buf, sizeof(trigram_buf), "%s|%s|%s", tokens[i],
                           tokens[i + SKIP_ONE], tokens[i + TRIGRAM_WINDOW]);
        if (len <= 0 || (size_t)len >= sizeof(trigram_buf)) {
            continue;
        }

        bool is_new = uniq_trig_insert(&uniq, XXH3_64bits(trigram_buf, (size_t)len));
        /* A full set also reports "not new" for unseen trigrams; in that case
         * keep hashing everything so no trigram is dropped from the signature. */
        if (is_new || uniq.count >= UNIQ_SET_SIZE) {
            weighted_minhash_update(out, trigram_buf, len, w);
        }
    }
    return uniq.count;
}
154248

155249
bool cbm_minhash_compute(TSNode func_body, const char *source, int language, cbm_minhash_t *out) {
@@ -166,8 +260,8 @@ bool cbm_minhash_compute(TSNode func_body, const char *source, int language, cbm
166260
return false;
167261
}
168262

169-
int trigram_count = hash_trigrams(tokens, token_count, out);
170-
return trigram_count >= MIN_TRIGRAM_COUNT;
263+
int unique_structural = hash_trigrams(tokens, token_count, out);
264+
return unique_structural >= MIN_UNIQUE_TRIGRAMS;
171265
}
172266

173267
/* ── Jaccard similarity ──────────────────────────────────────────── */
@@ -304,17 +398,42 @@ void cbm_lsh_insert(cbm_lsh_index_t *idx, const cbm_lsh_entry_t *entry) {
304398
}
305399
}
306400

307-
/* Check if a node_id is already in the result buffer. */
308-
static bool result_contains(const cbm_lsh_index_t *idx, int64_t node_id) {
309-
for (int j = 0; j < idx->result_count; j++) {
310-
if (idx->result_buf[j]->node_id == node_id) {
311-
return true;
401+
/* O(1) seen-set: open-addressing hash table on node_id for dedup. */
402+
/* Open-addressing hash set of node ids, used to de-duplicate candidates. */
typedef struct {
    int64_t *slots; /* heap-allocated table; a stored 0 marks an empty slot */
    int cap;        /* number of slots in the table */
} seen_set_t;
406+
407+
/* Prepare `s` as an empty set with SEEN_SET_SIZE slots.
 *
 * The table comes from calloc, so every slot starts at 0, the "empty" marker
 * (node ids are expected to be > 0). If the allocation fails, s->slots is
 * left NULL; the result is not checked here. Release with seen_set_free(). */
static void seen_set_init(seen_set_t *s) {
    s->cap = SEEN_SET_SIZE;
    s->slots = calloc(SEEN_SET_SIZE, sizeof *s->slots);
}
412+
413+
/* Record `node_id` in the seen-set.
 *
 * Returns true when the id was not present and has now been stored.
 * Returns false when the id is already present, when no free slot is left,
 * or when the table was never allocated (s->slots == NULL); in the last two
 * cases the caller cannot tell the id apart from a duplicate.
 *
 * A stored 0 marks an empty slot, so `node_id` must be non-zero. */
static bool seen_set_insert(seen_set_t *s, int64_t node_id) {
    if (!s->slots) {
        return false;
    }
    /* Multiply in uint64_t: a signed 64-bit product overflows (undefined
     * behaviour) for large ids, while the unsigned product simply wraps.
     * The low 32 bits — all that is used — are the same whenever the signed
     * product would have been representable. */
    uint32_t idx = (uint32_t)((uint64_t)node_id * KNUTH_MULT) & SEEN_SET_MASK;
    for (int probe = 0; probe < SEEN_SET_SIZE; probe++) {
        /* Linear probing with wrap-around. */
        uint32_t slot = (idx + (uint32_t)probe) & SEEN_SET_MASK;
        if (s->slots[slot] == 0) {
            s->slots[slot] = node_id;
            return true; /* inserted (was not present) */
        }
        if (s->slots[slot] == node_id) {
            return false; /* already present */
        }
    }
    return false; /* table full */
}
430+
431+
/* Release the slot table of `s` and clear the pointer so the set cannot be
 * used or freed again by accident. Safe when the table was never allocated:
 * free(NULL) is a no-op. */
static void seen_set_free(seen_set_t *s) {
    int64_t *table = s->slots;
    s->slots = NULL;
    free(table);
}
316435

317-
/* Append a candidate to the result buffer, growing if needed. Returns false on OOM. */
436+
/* Append a candidate to the result buffer, growing if needed. */
318437
static bool result_push(cbm_lsh_index_t *idx, const cbm_lsh_entry_t *candidate) {
319438
if (idx->result_count >= idx->result_cap) {
320439
int new_cap =
@@ -340,24 +459,32 @@ void cbm_lsh_query(const cbm_lsh_index_t *idx, const cbm_minhash_t *fp,
340459
return;
341460
}
342461

343-
/* Cast away const for result buffer management — query is logically const */
344462
cbm_lsh_index_t *mut_idx = (cbm_lsh_index_t *)idx;
345463
mut_idx->result_count = 0;
346464

465+
/* O(1) dedup via open-addressing hash set */
466+
seen_set_t seen;
467+
seen_set_init(&seen);
468+
347469
for (int b = 0; b < CBM_LSH_BANDS; b++) {
348470
uint32_t h = band_hash(fp, b);
349471
const lsh_bucket_t *bucket = &idx->bands[b][h];
472+
/* Skip oversized buckets — noise from trivially similar utility functions */
473+
if (bucket->count > MAX_BUCKET_SIZE) {
474+
continue;
475+
}
350476
for (int i = 0; i < bucket->count; i++) {
351477
const cbm_lsh_entry_t *candidate = &idx->entries[bucket->items[i]];
352-
if (result_contains(idx, candidate->node_id)) {
353-
continue;
478+
if (!seen_set_insert(&seen, candidate->node_id)) {
479+
continue; /* already seen */
354480
}
355481
if (!result_push(mut_idx, candidate)) {
356482
break;
357483
}
358484
}
359485
}
360486

487+
seen_set_free(&seen);
361488
*out = mut_idx->result_buf;
362489
*count = mut_idx->result_count;
363490
}

src/simhash/minhash.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@
2222
* error on Jaccard — sufficient for a 0.95 threshold. */
2323
#define CBM_MINHASH_K 64
2424

25-
/* Minimum number of normalized AST body nodes required to compute a
26-
* fingerprint. Functions shorter than this are skipped to prevent
27-
* trivial getter/setter fingerprint explosion. */
28-
#define CBM_MINHASH_MIN_NODES 10
25+
/* Minimum number of leaf AST tokens required to compute a fingerprint.
26+
* Leaf-only counting is language-agnostic: leaf nodes correspond to
27+
* actual source tokens (identifiers, literals, keywords, operators),
28+
* not grammar-internal structure that varies across parsers.
29+
* 30 leaf tokens ≈ BigCloneBench standard of 50 raw source tokens. */
30+
#define CBM_MINHASH_MIN_NODES 30
2931

3032
/* Default Jaccard threshold for SIMILAR_TO edge emission. */
3133
#define CBM_MINHASH_JACCARD_THRESHOLD 0.95

0 commit comments

Comments
 (0)