@@ -33,15 +33,31 @@ enum { AST_WALK_CAP = 2048 };
/* Hex encoding constants */
enum { HEX_CHARS_PER_U32 = 8, HEX_BASE = 16 };

/* Trigram constants: TRIGRAM_WINDOW is the index offset from the first to the
 * last token of a trigram. */
enum { TRIGRAM_WINDOW = 2 };

/* Minimum unique structural trigrams for a meaningful fingerprint.
 * K/2 = 32 ensures most MinHash slots get populated from distinct features. */
enum { MIN_UNIQUE_TRIGRAMS = 32 };

/* Maximum structural weight per trigram (3 tokens × 1 each). */
enum { MAX_STRUCTURAL_WEIGHT = 3 };

/* Hash truncation mask (uint64 → uint32).
 * A macro rather than an enumerator: ISO C11 requires enumeration constants to
 * be representable as int, and 0xFFFFFFFF is not. */
#define U32_MASK 0xFFFFFFFFu

/* Dynamic array growth constants */
enum { BUCKET_INIT_CAP = 8, GROW_FACTOR = 2, ENTRY_INIT_CAP = 64, RESULT_INIT_CAP = 64 };

/* Maximum bucket size — skip oversized buckets (noise from trivially similar functions). */
enum { MAX_BUCKET_SIZE = 200 };

/* Seen-set for O(1) dedup during query (simple open-addressing hash table).
 * Size and mask are derived from the bit width so the three cannot drift apart:
 * 1 << 14 = 16384 slots, mask 16383. */
enum {
    SEEN_SET_BITS = 14,
    SEEN_SET_SIZE = 1 << SEEN_SET_BITS,
    SEEN_SET_MASK = SEEN_SET_SIZE - 1
};

/* Knuth multiplicative hash constant for node_id → seen-set slot.
 * A 64-bit macro rather than an enumerator: the value exceeds INT_MAX, so as an
 * enumeration constant it is outside ISO C11 and, where accepted as an
 * extension, does not keep the 64-bit unsigned type the ULL suffix suggests. */
#define KNUTH_MULT UINT64_C(2654435761)

/* Maximum normalised tokens per function body. */
enum { MAX_TOKENS = 4096 };
4763
@@ -97,7 +113,32 @@ static const char *normalise_node_type(const char *kind) {
97113
98114/* ── MinHash computation ─────────────────────────────────────────── */
99115
100- /* Phase 1: Walk AST iteratively and collect normalised token types. */
/* Report whether a normalised token is one of the generic single-letter
 * placeholders "I", "S", "N" or "T".
 *
 * Such tokens carry no structural information; callers use this predicate to
 * compute a trigram's structural weight.
 *
 * tok must be a non-NULL, NUL-terminated string. Returns true only for a
 * string that is exactly one of the four placeholder letters. */
static bool is_normalised_token(const char *tok) {
    switch (*tok++) {
    case 'I':
    case 'S':
    case 'N':
    case 'T':
        /* A placeholder is exactly one character long. */
        return *tok == '\0';
    default:
        /* Covers the empty string as well: nothing past the terminator is read. */
        return false;
    }
}
122+
/* Structural weight of the trigram (a, b, c): how many of its three tokens are
 * NOT generic placeholders according to is_normalised_token().
 *
 * Returns a value in 0..3. Weight 0 means the trigram is made only of
 * placeholders (pure data manipulation, treated as noise); weight 3 means all
 * three tokens are structural (rich control flow, treated as signal). */
static int trigram_structural_weight(const char *a, const char *b, const char *c) {
    const int from_a = is_normalised_token(a) ? 0 : 1;
    const int from_b = is_normalised_token(b) ? 0 : 1;
    const int from_c = is_normalised_token(c) ? 0 : 1;
    return from_a + from_b + from_c;
}
138+
139+ /* Phase 1: Walk AST iteratively and collect normalised LEAF token types.
140+ * Leaf-only counting is language-agnostic: leaf nodes correspond to actual
141+ * source tokens, not grammar-internal structure that varies across parsers. */
101142static int collect_ast_tokens (TSNode root , const char * * tokens , int max_tokens ) {
102143 int token_count = 0 ;
103144 TSNode stack [AST_WALK_CAP ];
@@ -107,16 +148,16 @@ static int collect_ast_tokens(TSNode root, const char **tokens, int max_tokens)
107148 while (top > 0 && token_count < max_tokens ) {
108149 TSNode node = stack [-- top ];
109150 uint32_t child_count = ts_node_child_count (node );
110- const char * kind = ts_node_type (node );
111151
112152 if (child_count == 0 ) {
153+ /* Leaf node — actual source token. Normalise and record. */
154+ const char * kind = ts_node_type (node );
113155 if (kind [0 ] != '\0' ) {
114156 tokens [token_count ++ ] = normalise_node_type (kind );
115157 }
116158 } else {
117- if (kind [0 ] != '\0' && ts_node_is_named (node )) {
118- tokens [token_count ++ ] = normalise_node_type (kind );
119- }
159+ /* Internal node — push children only (skip the node itself).
160+ * Structural info comes from leaf token patterns, not grammar nodes. */
120161 for (int i = (int )child_count - SKIP_ONE ; i >= 0 && top < AST_WALK_CAP ; i -- ) {
121162 stack [top ++ ] = ts_node_child (node , (uint32_t )i );
122163 }
@@ -125,31 +166,84 @@ static int collect_ast_tokens(TSNode root, const char **tokens, int max_tokens)
125166 return token_count ;
126167}
127168
128- /* Phase 2: Hash trigrams from token sequence into MinHash signature. */
/* Phase 2 (hash_trigrams, defined after the helpers that follow): hash trigrams
 * into a MinHash signature with structural weighting.
 *
 * - Skip weight-0 trigrams (all tokens are I/S/N/T — pure noise)
 * - Use repetition-based weighted MinHash: for every signature slot, hash a
 *   trigram of weight w with w distinct seeds and keep the minimum
 * - Track unique trigrams via a hash set; the caller rejects the fingerprint
 *   when fewer than MIN_UNIQUE_TRIGRAMS were found
 *
 * Returns the number of unique structural trigrams processed. */
/* Unique-trigram set: a fixed-size, open-addressing table of 64-bit trigram
 * hashes. The slot count is a power of two so a hash can be reduced to a slot
 * index with UNIQ_SET_MASK. */
enum { UNIQ_SET_SIZE = 4096, UNIQ_SET_MASK = UNIQ_SET_SIZE - 1 };

typedef struct {
    uint64_t slots[UNIQ_SET_SIZE]; /* stored hashes; 0 marks an empty slot */
    int count;                     /* number of occupied slots */
} uniq_trig_set_t;

/* Reset a set to empty: no entries counted and every slot zeroed. */
static void uniq_trig_init(uniq_trig_set_t *s) {
    s->count = 0;
    memset(s->slots, 0, sizeof s->slots);
}
188+
189+ /* Insert a trigram hash. Returns true if newly inserted. */
190+ static bool uniq_trig_insert (uniq_trig_set_t * s , uint64_t trig_hash ) {
191+ uint64_t val = trig_hash | SKIP_ONE ; /* ensure non-zero */
192+ uint32_t slot = (uint32_t )(trig_hash & UNIQ_SET_MASK );
193+ for (int probe = 0 ; probe < UNIQ_SET_SIZE ; probe ++ ) {
194+ uint32_t idx = (slot + (uint32_t )probe ) & UNIQ_SET_MASK ;
195+ if (s -> slots [idx ] == 0 ) {
196+ s -> slots [idx ] = val ;
197+ s -> count ++ ;
198+ return true;
199+ }
200+ if (s -> slots [idx ] == val ) {
201+ return false;
202+ }
203+ }
204+ return false;
205+ }
206+
207+ /* Apply weighted MinHash for one trigram: hash w times per seed. */
208+ static void weighted_minhash_update (cbm_minhash_t * out , const char * trigram , int len , int w ) {
209+ for (int k = 0 ; k < CBM_MINHASH_K ; k ++ ) {
210+ for (int rep = 0 ; rep < w ; rep ++ ) {
211+ uint64_t seed = ((uint64_t )k * MAX_STRUCTURAL_WEIGHT ) + (uint64_t )rep ;
212+ uint64_t h = XXH3_64bits_withSeed (trigram , (size_t )len , seed );
213+ uint32_t h32 = (uint32_t )(h & U32_MASK );
214+ if (h32 < out -> values [k ]) {
215+ out -> values [k ] = h32 ;
216+ }
217+ }
218+ }
219+ }
220+
129221static int hash_trigrams (const char * * tokens , int token_count , cbm_minhash_t * out ) {
130222 for (int k = 0 ; k < CBM_MINHASH_K ; k ++ ) {
131223 out -> values [k ] = UINT32_MAX ;
132224 }
133225
226+ uniq_trig_set_t uniq ;
227+ uniq_trig_init (& uniq );
134228 char trigram_buf [TRIGRAM_BUF_LEN ];
135- int trigram_count = 0 ;
136229
137230 for (int i = 0 ; i + TRIGRAM_WINDOW < token_count ; i ++ ) {
138- int len = snprintf ( trigram_buf , sizeof ( trigram_buf ), "%s|%s|%s" , tokens [ i ], tokens [ i + 1 ],
139- tokens [i + 2 ]);
140- if (len <= 0 || ( size_t ) len >= sizeof ( trigram_buf ) ) {
231+ int w =
232+ trigram_structural_weight ( tokens [ i ], tokens [ i + SKIP_ONE ], tokens [i + TRIGRAM_WINDOW ]);
233+ if (w == 0 ) {
141234 continue ;
142235 }
143- trigram_count ++ ;
144- for (int k = 0 ; k < CBM_MINHASH_K ; k ++ ) {
145- uint64_t h = XXH3_64bits_withSeed (trigram_buf , (size_t )len , (uint64_t )k );
146- uint32_t h32 = (uint32_t )(h & U32_MASK );
147- if (h32 < out -> values [k ]) {
148- out -> values [k ] = h32 ;
149- }
236+
237+ int len = snprintf (trigram_buf , sizeof (trigram_buf ), "%s|%s|%s" , tokens [i ],
238+ tokens [i + SKIP_ONE ], tokens [i + TRIGRAM_WINDOW ]);
239+ if (len <= 0 || (size_t )len >= sizeof (trigram_buf )) {
240+ continue ;
150241 }
242+
243+ uniq_trig_insert (& uniq , XXH3_64bits (trigram_buf , (size_t )len ));
244+ weighted_minhash_update (out , trigram_buf , len , w );
151245 }
152- return trigram_count ;
246+ return uniq . count ;
153247}
154248
155249bool cbm_minhash_compute (TSNode func_body , const char * source , int language , cbm_minhash_t * out ) {
@@ -166,8 +260,8 @@ bool cbm_minhash_compute(TSNode func_body, const char *source, int language, cbm
166260 return false;
167261 }
168262
169- int trigram_count = hash_trigrams (tokens , token_count , out );
170- return trigram_count >= MIN_TRIGRAM_COUNT ;
263+ int unique_structural = hash_trigrams (tokens , token_count , out );
264+ return unique_structural >= MIN_UNIQUE_TRIGRAMS ;
171265}
172266
173267/* ── Jaccard similarity ──────────────────────────────────────────── */
@@ -304,17 +398,42 @@ void cbm_lsh_insert(cbm_lsh_index_t *idx, const cbm_lsh_entry_t *entry) {
304398 }
305399}
306400
307- /* Check if a node_id is already in the result buffer. */
308- static bool result_contains (const cbm_lsh_index_t * idx , int64_t node_id ) {
309- for (int j = 0 ; j < idx -> result_count ; j ++ ) {
310- if (idx -> result_buf [j ]-> node_id == node_id ) {
311- return true;
401+ /* O(1) seen-set: open-addressing hash table on node_id for dedup. */
402+ typedef struct {
403+ int64_t * slots ;
404+ int cap ;
405+ } seen_set_t ;
406+
407+ static void seen_set_init (seen_set_t * s ) {
408+ s -> slots = calloc (SEEN_SET_SIZE , sizeof (int64_t ));
409+ s -> cap = SEEN_SET_SIZE ;
410+ /* 0 means empty — node_ids are always > 0 */
411+ }
412+
413+ static bool seen_set_insert (seen_set_t * s , int64_t node_id ) {
414+ if (!s -> slots ) {
415+ return false;
416+ }
417+ uint32_t idx = (uint32_t )(node_id * KNUTH_MULT ) & SEEN_SET_MASK ;
418+ for (int probe = 0 ; probe < SEEN_SET_SIZE ; probe ++ ) {
419+ uint32_t slot = (idx + (uint32_t )probe ) & SEEN_SET_MASK ;
420+ if (s -> slots [slot ] == 0 ) {
421+ s -> slots [slot ] = node_id ;
422+ return true; /* inserted (was not present) */
423+ }
424+ if (s -> slots [slot ] == node_id ) {
425+ return false; /* already present */
312426 }
313427 }
314- return false;
428+ return false; /* table full */
429+ }
430+
431+ static void seen_set_free (seen_set_t * s ) {
432+ free (s -> slots );
433+ s -> slots = NULL ;
315434}
316435
317- /* Append a candidate to the result buffer, growing if needed. Returns false on OOM. */
436+ /* Append a candidate to the result buffer, growing if needed. */
318437static bool result_push (cbm_lsh_index_t * idx , const cbm_lsh_entry_t * candidate ) {
319438 if (idx -> result_count >= idx -> result_cap ) {
320439 int new_cap =
@@ -340,24 +459,32 @@ void cbm_lsh_query(const cbm_lsh_index_t *idx, const cbm_minhash_t *fp,
340459 return ;
341460 }
342461
343- /* Cast away const for result buffer management — query is logically const */
344462 cbm_lsh_index_t * mut_idx = (cbm_lsh_index_t * )idx ;
345463 mut_idx -> result_count = 0 ;
346464
465+ /* O(1) dedup via open-addressing hash set */
466+ seen_set_t seen ;
467+ seen_set_init (& seen );
468+
347469 for (int b = 0 ; b < CBM_LSH_BANDS ; b ++ ) {
348470 uint32_t h = band_hash (fp , b );
349471 const lsh_bucket_t * bucket = & idx -> bands [b ][h ];
472+ /* Skip oversized buckets — noise from trivially similar utility functions */
473+ if (bucket -> count > MAX_BUCKET_SIZE ) {
474+ continue ;
475+ }
350476 for (int i = 0 ; i < bucket -> count ; i ++ ) {
351477 const cbm_lsh_entry_t * candidate = & idx -> entries [bucket -> items [i ]];
352- if (result_contains ( idx , candidate -> node_id )) {
353- continue ;
478+ if (! seen_set_insert ( & seen , candidate -> node_id )) {
479+ continue ; /* already seen */
354480 }
355481 if (!result_push (mut_idx , candidate )) {
356482 break ;
357483 }
358484 }
359485 }
360486
487+ seen_set_free (& seen );
361488 * out = mut_idx -> result_buf ;
362489 * count = mut_idx -> result_count ;
363490}
0 commit comments