Skip to content

Commit 8a06d78

Browse files
committed
Parallelize post-passes, fix mode filtering + semantic edge quality
- Parallelize pass_similarity and pass_semantic_edges via worker pool with thread-local edge buffers; sequential final merge since gbuf is not thread-safe. Adds cbm_lsh_query_into() as a thread-safe variant with caller-provided candidate buffer.
- Add activatable profiling subsystem (CBM_PROFILE=1 env or --profile flag) for step-level timing of extract, resolve, corpus build, vector phases, and sqlite dump. Zero overhead when disabled.
- Fix cbm_index_mode_t enum mismatch between pipeline.h (FULL=0, MODERATE=1, FAST=2) and discover.h (FULL=0, FAST=1). mode=fast silently no-op'd fast-discovery filtering because discover.c compared against the wrong value. Linux kernel fast mode went 1:40 -> 3:11 as a result; now back to 1:40. Broaden the filter guard to mode != CBM_MODE_FULL so MODERATE and FAST both get aggressive discovery.
- Clamp cbm_sem_combined_score output to [0, 1]. The proximity multiplier returns up to 1.10 as a same-file boost which could push the final cosine score above 1.0.
- Short-circuit semantic scoring when MinHash jaccard >= 0.95. Exact near-clones are already emitted as SIMILAR_TO edges; returning 0 here avoids flooding SEMANTICALLY_RELATED with cross-service copy-paste boilerplate and frees the edge budget for genuine vocabulary-bridged relations.
- Validate search_graph semantic_query as an array of strings and return a clear error for a single-string input. Update the tool description to spell out the requirement explicitly with an example.
- JSON-escape user-controlled strings (callee names, call arguments, URL paths, import local_name) in call/argument properties. Introduces cbm_json_escape() in foundation/str_util.
- Skip SQLite pending_byte_page (file offset 0x40000000) during raw page writes in sqlite_writer to avoid corrupting databases that cross the 1 GiB boundary.
- Migrate pretrained vector blob from UniXcoder (51K tokens) to nomic-embed-code (40856 tokens x 768d int8). Includes the extraction script under scripts/extract_nomic_vectors.py.
1 parent bf70078 commit 8a06d78

30 files changed

+84094
-103255
lines changed

Makefile.cbm

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,8 @@ FOUNDATION_SRCS = \
107107
src/foundation/compat_fs.c \
108108
src/foundation/compat_regex.c \
109109
src/foundation/mem.c \
110-
src/foundation/diagnostics.c
110+
src/foundation/diagnostics.c \
111+
src/foundation/profile.c
111112

112113
# Existing extraction C code (compiled from current location)
113114
EXTRACTION_SRCS = \
@@ -195,8 +196,8 @@ SIMHASH_SRCS = src/simhash/minhash.c
195196
# Semantic embedding module
196197
SEMANTIC_SRCS = src/semantic/semantic.c src/semantic/ast_profile.c
197198

198-
# UniXcoder pretrained vectors (assembler blob)
199-
UNIXCODER_BLOB_SRC = vendored/unixcoder/code_vectors_blob.S
199+
# nomic-embed-code pretrained vectors (assembler blob)
200+
UNIXCODER_BLOB_SRC = vendored/nomic/code_vectors_blob.S
200201

201202
# Traces module (new)
202203
TRACES_SRCS = src/traces/traces.c
@@ -418,9 +419,9 @@ $(BUILD_DIR)/test_lz4.o: $(CBM_DIR)/vendored/lz4/lz4.c | $(BUILD_DIR)
418419
$(BUILD_DIR)/test_lz4hc.o: $(CBM_DIR)/vendored/lz4/lz4hc.c | $(BUILD_DIR)
419420
$(CC) -std=c11 -D_DEFAULT_SOURCE -g -O1 $(SANITIZE) -w -I$(CBM_DIR)/vendored/lz4 -c -o $@ $<
420421

421-
# UniXcoder pretrained vector blob
422+
# nomic-embed-code pretrained vector blob
422423
UNIXCODER_OBJ = $(BUILD_DIR)/unixcoder_blob.o
423-
$(UNIXCODER_OBJ): $(UNIXCODER_BLOB_SRC) vendored/unixcoder/code_vectors.bin | $(BUILD_DIR)
424+
$(UNIXCODER_OBJ): $(UNIXCODER_BLOB_SRC) vendored/nomic/code_vectors.bin | $(BUILD_DIR)
424425
$(AS) -o $@ $<
425426

426427
OBJS_VENDORED_TEST = $(MIMALLOC_OBJ_TEST) $(SQLITE3_OBJ_TEST) $(TRE_OBJ_TEST) $(GRAMMAR_OBJS_TEST) $(TS_RUNTIME_OBJ_TEST) $(LSP_OBJ_TEST) $(PP_OBJ_TEST) $(MONGOOSE_OBJ_TEST) $(LZ4_OBJ_TEST) $(UNIXCODER_OBJ)

internal/cbm/sqlite_writer.c

Lines changed: 44 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
#include "sqlite_writer.h"
1818
#include "foundation/constants.h"
1919
#include "foundation/compat_thread.h"
20+
#include "foundation/profile.h"
2021

2122
#include <stddef.h> // NULL
2223
#include <stdio.h>
@@ -26,6 +27,25 @@
2627
#include <stdbool.h>
2728

2829
#define CBM_PAGE_SIZE 65536
30+
31+
/* SQLite reserves the page containing the 1 GiB file offset (the "pending byte"
32+
* used by the VFS file-locking primitives). This page MUST be skipped during allocation,
33+
* otherwise integrity_check reports "2nd reference to page N" because it marks
34+
* this page as referenced before walking any tree.
35+
*
36+
* PENDING_BYTE = 0x40000000 = 1073741824 (1 GiB)
37+
* PENDING_BYTE_PAGE = (PENDING_BYTE / page_size) + 1
38+
* 64KB pages → page 16385
39+
* 32KB pages → page 32769
40+
* 16KB pages → page 65537
41+
*/
42+
#define CBM_PENDING_BYTE (0x40000000u)
43+
#define CBM_PENDING_BYTE_PAGE ((CBM_PENDING_BYTE / CBM_PAGE_SIZE) + 1)
44+
45+
/* Skip the pending byte page if allocation lands on it. */
46+
static inline uint32_t cbm_skip_pending_byte(uint32_t pgno) { /* pgno: page number the caller is about to allocate */
47+
return pgno == CBM_PENDING_BYTE_PAGE ? pgno + 1 : pgno; /* only the reserved page is bumped to its successor; any other page number is returned unchanged */
48+
}
2949
#define SCHEMA_FORMAT 4
3050
#define FILE_FORMAT 1
3151
#define SQLITE_VERSION 3046000 // 3.46.0
@@ -467,7 +487,8 @@ static void pb_flush_leaf(PageBuilder *pb) {
467487
put_u16(pb->page + hdr + HDR_CONTENT_OFF, (uint16_t)pb->content_offset);
468488
pb->page[hdr + HDR_FRAGBYTES_OFF] = 0; // fragmented free bytes
469489

470-
// Write page to file
490+
// Write page to file. Skip the pending byte page (SQLite reserved).
491+
pb->next_page = cbm_skip_pending_byte(pb->next_page);
471492
uint32_t page_num = pb->next_page;
472493
long offset = (long)(page_num - SKIP_ONE) * CBM_PAGE_SIZE;
473494
(void)fseek(pb->fp, offset, SEEK_SET);
@@ -559,6 +580,7 @@ static int write_interior_page(PageBuilder *pb, uint8_t *page, int cell_count, i
559580
uint32_t right_child_page, const PageRef *children,
560581
int right_child_idx, bool is_index, PageRef **parents,
561582
int parent_count, int *parent_cap) {
583+
pb->next_page = cbm_skip_pending_byte(pb->next_page);
562584
uint32_t pnum = pb->next_page++;
563585
page[0] = is_index ? INTERIOR_INDEX_FLAG : INTERIOR_TABLE_FLAG;
564586
put_u16(page + HDR_FREEBLOCK_OFF, 0);
@@ -1009,6 +1031,7 @@ static uint32_t write_table_btree(FILE *fp, uint32_t *next_page, const uint8_t *
10091031
bool first_is_page1) {
10101032
if (count == 0) {
10111033
// Empty table: write a single empty leaf page
1034+
*next_page = cbm_skip_pending_byte(*next_page);
10121035
uint32_t pnum = (*next_page)++;
10131036
uint8_t page[CBM_PAGE_SIZE];
10141037
memset(page, 0, CBM_PAGE_SIZE);
@@ -1046,7 +1069,9 @@ static bool pb_promote_and_flush(PageBuilder *pb, uint8_t **cells, int *cell_len
10461069
memcpy(pb->leaves[pb->leaf_count].sep_cell, cells[prev_idx], cell_lens[prev_idx]);
10471070
pb->leaves[pb->leaf_count].sep_cell_len = cell_lens[prev_idx];
10481071

1049-
// Un-add the last cell
1072+
// Un-add the last cell — it's promoted to the interior separator.
1073+
// SQLite index B-tree interior cells are counted by integrity_check,
1074+
// so this cell exists in the interior page instead of the leaf.
10501075
pb->cell_count--;
10511076
pb->content_offset += cell_lens[prev_idx];
10521077
pb->ptr_offset -= CELL_PTR_SIZE;
@@ -1057,6 +1082,7 @@ static bool pb_promote_and_flush(PageBuilder *pb, uint8_t **cells, int *cell_len
10571082

10581083
// Write an empty index leaf page.
10591084
static uint32_t write_empty_index_leaf(FILE *fp, uint32_t *next_page) {
1085+
*next_page = cbm_skip_pending_byte(*next_page);
10601086
uint32_t pnum = (*next_page)++;
10611087
uint8_t page[CBM_PAGE_SIZE];
10621088
memset(page, 0, CBM_PAGE_SIZE);
@@ -1660,6 +1686,7 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
16601686
.token_vec_count = token_vec_count};
16611687

16621688
// Phase 1: Data tables (streaming node + edge + vector + token_vector records)
1689+
CBM_PROF_START(t_data);
16631690
uint32_t nodes_root;
16641691
uint32_t edges_root;
16651692
uint32_t vectors_root;
@@ -1669,14 +1696,17 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
16691696
(void)fclose(fp);
16701697
return rc;
16711698
}
1699+
CBM_PROF_END_N("write_db", "1_data_tables", t_data, node_count + edge_count);
16721700

16731701
// Phase 2: Metadata tables (projects, file_hashes, summaries, sqlite_sequence)
1702+
CBM_PROF_START(t_meta);
16741703
uint32_t projects_root;
16751704
uint32_t file_hashes_root;
16761705
uint32_t summaries_root;
16771706
uint32_t sqlite_seq_root;
16781707
write_metadata_tables(&w, &projects_root, &file_hashes_root, &summaries_root, &sqlite_seq_root);
16791708
uint32_t next_page = w.next_page;
1709+
CBM_PROF_END("write_db", "2_metadata_tables", t_meta);
16801710

16811711
// --- Build indexes (all sorted by key columns before writing) ---
16821712

@@ -1703,6 +1733,8 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
17031733
{edge_count, cmp_edge_by_src_tgt_type, NULL},
17041734
};
17051735

1736+
// Phase 3: Parallel sort of 11 index permutations
1737+
CBM_PROF_START(t_sort);
17061738
{
17071739
cbm_thread_t st[TOTAL_SORT_THREADS];
17081740
int nt = 0;
@@ -1720,7 +1752,10 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
17201752
cbm_thread_join(&st[i]);
17211753
}
17221754
}
1755+
CBM_PROF_END_N("write_db", "3_parallel_sort_indexes", t_sort, node_count + edge_count);
17231756

1757+
// Phase 4: Build node index B-trees (SEQUENTIAL)
1758+
CBM_PROF_START(t_node_idx);
17241759
uint32_t idx_nodes_label_root =
17251760
build_node_index_sorted(fp, &next_page, nodes, node_count, nsorts[0].perm, ncol_label);
17261761
uint32_t idx_nodes_name_root = build_node_index_sorted(fp, &next_page, nodes, node_count,
@@ -1735,9 +1770,10 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
17351770
(void)fclose(fp);
17361771
return ERR_SORT_FAILED;
17371772
}
1773+
CBM_PROF_END_N("write_db", "4_node_indexes_seq", t_node_idx, node_count * 4);
17381774

1739-
// --- Edge indexes (all sorted) ---
1740-
1775+
// Phase 5: Build edge index B-trees (SEQUENTIAL)
1776+
CBM_PROF_START(t_edge_idx);
17411777
uint32_t idx_edges_source_root =
17421778
build_edge_index_sorted(fp, &next_page, edges, edge_count, esorts[0].perm, ecell_source);
17431779
uint32_t idx_edges_target_root = build_edge_index_sorted(
@@ -1762,6 +1798,7 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
17621798
(void)fclose(fp);
17631799
return ERR_SORT_FAILED;
17641800
}
1801+
CBM_PROF_END_N("write_db", "5_edge_indexes_seq", t_edge_idx, edge_count * 7);
17651802

17661803
// Autoindex for projects(name TEXT PK) — single text column
17671804
uint32_t autoindex_projects_root;
@@ -1925,8 +1962,9 @@ int cbm_write_db(const char *path, const char *project, const char *root_path,
19251962

19261963
// Write the 100-byte SQLite file header
19271964
memcpy(page1, "SQLite format 3\000", 16);
1928-
/* Page size 65536 is encoded as 1 in the 2-byte header field */
1929-
put_u16(page1 + HDR_OFF_CBM_PAGE_SIZE, (uint16_t)SKIP_ONE);
1965+
/* Page size: 65536 is encoded as 1; all others stored directly */
1966+
put_u16(page1 + HDR_OFF_CBM_PAGE_SIZE,
1967+
CBM_PAGE_SIZE == 65536 ? (uint16_t)1 : (uint16_t)CBM_PAGE_SIZE);
19301968
page1[HDR_OFF_WRITE_VERSION] = FILE_FORMAT; // file format write version
19311969
page1[HDR_OFF_READ_VERSION] = FILE_FORMAT; // file format read version
19321970
page1[HDR_OFF_RESERVED] = 0; // reserved space per page

0 commit comments

Comments
 (0)