Skip to content

Commit 3c616e8

Browse files
committed
Primary key is now TEXT
1 parent 1e011b7 commit 3c616e8

12 files changed

Lines changed: 537 additions & 177 deletions

README.md

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ sqlite-memory bridges these concepts, allowing any SQLite-powered application to
3636
- **Hybrid Search**: Combines vector similarity (cosine distance) with FTS5 full-text search for superior retrieval
3737
- **Smart Chunking**: Markdown-aware parsing preserves semantic boundaries
3838
- **Intelligent Sync**: Content-hash change detection skips unchanged files, atomically replaces modified ones, and cleans up deleted ones
39-
- **Transactional Safety**: Every sync operation runs inside a SAVEPOINT transaction - either fully succeeds or fully rolls back, no partially-indexed content
39+
- **Transactional Safety**: Text/file ingests run inside SAVEPOINT transactions, and directory sync uses transactional cleanup plus per-file transactional updates so failed files do not leave partial rows behind
4040
- **Efficient Storage**: Binary embeddings with configurable dimensions
4141
- **Embedding Cache**: Automatically caches computed embeddings, so re-indexing the same text skips redundant API calls and computation
4242
- **Flexible Embedding**: Use local models (llama.cpp) or [vectors.space](https://vectors.space) remote API
@@ -74,7 +74,7 @@ sqlite-memory bridges these concepts, allowing any SQLite-powered application to
7474
```sql
7575
-- Load extensions (cloudsync is optional)
7676
.load ./vector
77-
.load ./sync
77+
.load ./cloudsync
7878
.load ./memory
7979

8080
-- Configure embedding model (choose one):
@@ -84,8 +84,8 @@ SELECT memory_set_model('local', '/path/to/nomic-embed-text-v1.5.Q8_0.gguf');
8484

8585
-- Option 2: Remote embedding via vectors.space (requires free API key from https://vectors.space)
8686
-- The provider name 'openai' selects the vectors.space OpenAI-compatible endpoint.
87-
-- SELECT memory_set_model('openai', 'text-embedding-3-small');
8887
-- SELECT memory_set_apikey('your-vectorspace-api-key');
88+
-- SELECT memory_set_model('openai', 'text-embedding-3-small');
8989

9090
-- Add some knowledge
9191
SELECT memory_add_text('SQLite is a C-language library that implements a small, fast,
@@ -160,7 +160,7 @@ All `memory_add_*` functions use content-hash change detection to avoid redundan
160160
1. **Cleanup**: Removes database entries for files that no longer exist on disk
161161
2. **Scan**: Recursively processes all matching files - adding new ones, replacing modified ones, and skipping unchanged ones
162162

163-
Every sync operation is wrapped in a SQLite SAVEPOINT transaction. If anything fails mid-sync (embedding error, disk issue, etc.), the entire operation rolls back cleanly. There is no risk of partially-indexed files or orphaned entries.
163+
`memory_add_text()` and `memory_add_file()` each run inside a SQLite SAVEPOINT transaction. `memory_add_directory()` performs its cleanup pass transactionally and then processes each file in its own transaction. If one file fails, that file rolls back cleanly and previously-committed files remain valid; there are no partially-indexed rows or orphaned chunk/FTS entries for the failed file.
164164

165165
This makes all sync functions safe to call repeatedly - for example, on a cron schedule or at agent startup - with minimal overhead.
166166

@@ -258,8 +258,8 @@ FROM dbmem_content;
258258
-- Delete by context
259259
SELECT memory_delete_context('old-project');
260260

261-
-- Delete specific memory
262-
SELECT memory_delete(1234567890);
261+
-- Delete specific memory by hash
262+
SELECT memory_delete('9e3779b97f4a7c15');
263263

264264
-- Clear all memories
265265
SELECT memory_clear();
@@ -279,8 +279,11 @@ cd sqlite-memory
279279
# Build (full build with local + remote engines)
280280
make
281281

282-
# Run tests
282+
# Run parser/core unit tests + extension loading smoke test
283283
make test
284+
285+
# Run the full SQL extension unit suite
286+
make test DEFINES="-DTEST_SQLITE_EXTENSION"
284287
```
285288

286289
### Build Configurations

src/dbmem-lembed.c

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -100,9 +100,15 @@ void dbmem_logger (enum ggml_log_level level, const char *text, void *user_data)
100100

101101
// MARK: -
102102

103+
static void dbmem_local_set_error(dbmem_local_engine_t *engine, const char *message) {
104+
if (!engine || !engine->context) return;
105+
dbmem_context_set_error(engine->context, message);
106+
}
107+
103108
dbmem_local_engine_t *dbmem_local_engine_init (void *ctx, const char *model_path, char err_msg[DBMEM_ERRBUF_SIZE]) {
104109
dbmem_local_engine_t *engine = (dbmem_local_engine_t *)dbmemory_zeroalloc(sizeof(dbmem_local_engine_t));
105110
if (!engine) return NULL;
111+
engine->context = (dbmem_context *)ctx;
106112

107113
// set logger
108114
llama_log_set(dbmem_logger, engine);
@@ -212,7 +218,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
212218
// Tokenize
213219
int n_tokens = llama_tokenize(engine->vocab, text, text_len, engine->tokens, engine->tokens_capacity, true, true);
214220
if (n_tokens < 0) {
215-
dbmem_context_set_error(engine->context, "Tokenization failed (text too long?)");
221+
dbmem_local_set_error(engine, "Tokenization failed (text too long?)");
216222
return -1;
217223
}
218224

@@ -242,7 +248,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
242248
// Encode
243249
int ret = llama_encode(engine->ctx, batch);
244250
if (ret != 0) {
245-
dbmem_context_set_error(engine->context, "Llama_encode failed");
251+
dbmem_local_set_error(engine, "Llama_encode failed");
246252
return -1;
247253
}
248254

@@ -255,7 +261,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex
255261
}
256262

257263
if (!emb_ptr) {
258-
dbmem_context_set_error(engine->context, "Failed to get embeddings");
264+
dbmem_local_set_error(engine, "Failed to get embeddings");
259265
return -1;
260266
}
261267

@@ -301,5 +307,5 @@ void dbmem_local_engine_free (dbmem_local_engine_t *engine) {
301307
}
302308

303309
llama_backend_free();
310+
dbmemory_free(engine);
304311
}
305-

src/dbmem-parser.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
typedef struct {
2929
size_t start; // Byte offset in source buffer
3030
size_t end; // Byte end in source buffer
31+
int is_heading; // True if this section starts with a heading block
3132
char *text; // Stripped plain text (allocated)
3233
size_t text_len; // Length of stripped text
3334
} section_t;
@@ -113,8 +114,6 @@ static size_t find_split (const char *text, size_t len, size_t max_chars) {
113114

114115
// Push a section to dynamic array
115116
static int section_push (parse_ctx_t *ctx, size_t start, size_t end, int is_heading) {
116-
UNUSED_PARAM(is_heading);
117-
118117
if (ctx->sec_count >= ctx->sec_cap) {
119118
size_t new_cap = ctx->sec_cap ? ctx->sec_cap * 2 : 16;
120119
section_t *tmp = (section_t *)dbmemory_realloc(ctx->sections, new_cap * sizeof(section_t));
@@ -126,6 +125,7 @@ static int section_push (parse_ctx_t *ctx, size_t start, size_t end, int is_head
126125
section_t *s = &ctx->sections[ctx->sec_count++];
127126
s->start = start;
128127
s->end = end;
128+
s->is_heading = is_heading;
129129
s->text = NULL;
130130
s->text_len = 0;
131131

@@ -607,7 +607,7 @@ static int parse_sections (const char *buffer, size_t buffer_size, bool skip_sem
607607
for (size_t i = 0; i < ctx->sec_count; i++) {
608608
section_t *s = &ctx->sections[i];
609609
// First section or heading starts new section
610-
if (write_idx == 0) {
610+
if (write_idx == 0 || s->is_heading) {
611611
ctx->sections[write_idx++] = *s;
612612
} else {
613613
// Extend previous section to include this one

src/dbmem-search.c

Lines changed: 58 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -52,36 +52,54 @@ typedef struct {
5252
struct {
5353
int count;
5454
double *rank;
55-
sqlite3_int64 *hash;
55+
uint64_t *hash;
5656
sqlite3_int64 *seq;
5757
} fts;
5858

5959
struct {
6060
int count;
6161
double *rank;
62-
sqlite3_int64 *hash;
62+
uint64_t *hash;
6363
sqlite3_int64 *seq;
6464
} semantic;
6565

6666
struct {
6767
int count;
6868
double *vectorScore;
6969
double *textScore;
70-
sqlite3_int64 *hash;
70+
uint64_t *hash;
7171
sqlite3_int64 *seq;
7272
int *hasVector;
7373
int *hasFts;
7474
} merge;
7575

7676
double *rank;
77-
sqlite3_int64 *hash;
77+
uint64_t *hash;
7878
sqlite3_int64 *seq;
7979

8080
} vMemorySearchCursor;
8181

82+
static int dbmem_search_bind_hash (sqlite3_stmt *vm, int index, uint64_t hash) {
83+
char hash_text[DBMEM_HASH_STR_MAXLEN];
84+
dbmem_hash_to_hex(hash, hash_text);
85+
return sqlite3_bind_text(vm, index, hash_text, -1, SQLITE_TRANSIENT);
86+
}
87+
88+
static bool dbmem_search_column_hash (sqlite3_stmt *vm, int column, uint64_t *hash) {
89+
const char *hash_text = (const char *)sqlite3_column_text(vm, column);
90+
return dbmem_hash_from_hex(hash_text, hash);
91+
}
92+
8293
// MARK: - UTILS -
8394

8495
int vMemorySearchCursorAllocate (vMemorySearchCursor *c, int entries, bool perform_fts) {
96+
if (entries <= 0) {
97+
memset(c, 0, sizeof(*c));
98+
c->max_results = entries;
99+
c->perform_fts = perform_fts;
100+
return SQLITE_OK;
101+
}
102+
85103
// one buffer to rule them all
86104
// fts (if enabled): rank, hash, seq = 3 arrays * entries
87105
// semantic: rank, hash, seq = 3 arrays * entries
@@ -94,26 +112,26 @@ int vMemorySearchCursorAllocate (vMemorySearchCursor *c, int entries, bool perfo
94112
// fts arrays
95113
if (perform_fts) {
96114
size += sizeof(double) * entries; // fts.rank
97-
size += sizeof(sqlite3_int64) * entries; // fts.hash
115+
size += sizeof(uint64_t) * entries; // fts.hash
98116
size += sizeof(sqlite3_int64) * entries; // fts.seq
99117
}
100118

101119
// semantic arrays
102120
size += sizeof(double) * entries; // semantic.rank
103-
size += sizeof(sqlite3_int64) * entries; // semantic.hash
121+
size += sizeof(uint64_t) * entries; // semantic.hash
104122
size += sizeof(sqlite3_int64) * entries; // semantic.seq
105123

106124
// merge arrays (2x entries for union of both sources)
107125
size += sizeof(double) * merge_entries; // merge.vectorScore
108126
size += sizeof(double) * merge_entries; // merge.textScore
109-
size += sizeof(sqlite3_int64) * merge_entries; // merge.hash
127+
size += sizeof(uint64_t) * merge_entries; // merge.hash
110128
size += sizeof(sqlite3_int64) * merge_entries; // merge.seq
111129
size += sizeof(int) * merge_entries; // merge.hasVector
112130
size += sizeof(int) * merge_entries; // merge.hasFts
113131

114132
// final arrays
115133
size += sizeof(double) * entries; // rank
116-
size += sizeof(sqlite3_int64) * entries; // hash
134+
size += sizeof(uint64_t) * entries; // hash
117135
size += sizeof(sqlite3_int64) * entries; // seq
118136

119137
char *buffer = (char *)dbmemory_zeroalloc(size);
@@ -127,17 +145,17 @@ int vMemorySearchCursorAllocate (vMemorySearchCursor *c, int entries, bool perfo
127145
if (perform_fts) {
128146
c->fts.rank = (double *)buffer;
129147
buffer += sizeof(double) * entries;
130-
c->fts.hash = (sqlite3_int64 *)buffer;
131-
buffer += sizeof(sqlite3_int64) * entries;
148+
c->fts.hash = (uint64_t *)buffer;
149+
buffer += sizeof(uint64_t) * entries;
132150
c->fts.seq = (sqlite3_int64 *)buffer;
133151
buffer += sizeof(sqlite3_int64) * entries;
134152
}
135153

136154
// semantic
137155
c->semantic.rank = (double *)buffer;
138156
buffer += sizeof(double) * entries;
139-
c->semantic.hash = (sqlite3_int64 *)buffer;
140-
buffer += sizeof(sqlite3_int64) * entries;
157+
c->semantic.hash = (uint64_t *)buffer;
158+
buffer += sizeof(uint64_t) * entries;
141159
c->semantic.seq = (sqlite3_int64 *)buffer;
142160
buffer += sizeof(sqlite3_int64) * entries;
143161

@@ -146,8 +164,8 @@ int vMemorySearchCursorAllocate (vMemorySearchCursor *c, int entries, bool perfo
146164
buffer += sizeof(double) * merge_entries;
147165
c->merge.textScore = (double *)buffer;
148166
buffer += sizeof(double) * merge_entries;
149-
c->merge.hash = (sqlite3_int64 *)buffer;
150-
buffer += sizeof(sqlite3_int64) * merge_entries;
167+
c->merge.hash = (uint64_t *)buffer;
168+
buffer += sizeof(uint64_t) * merge_entries;
151169
c->merge.seq = (sqlite3_int64 *)buffer;
152170
buffer += sizeof(sqlite3_int64) * merge_entries;
153171
c->merge.hasVector = (int *)buffer;
@@ -158,8 +176,8 @@ int vMemorySearchCursorAllocate (vMemorySearchCursor *c, int entries, bool perfo
158176
// final rowset
159177
c->rank = (double *)buffer;
160178
buffer += sizeof(double) * entries;
161-
c->hash = (sqlite3_int64 *)buffer;
162-
buffer += sizeof(sqlite3_int64) * entries;
179+
c->hash = (uint64_t *)buffer;
180+
buffer += sizeof(uint64_t) * entries;
163181
c->seq = (sqlite3_int64 *)buffer;
164182

165183
return SQLITE_OK;
@@ -183,7 +201,7 @@ int vMemorySearchCursorMerge(vMemorySearchCursor *c, double vectorWeight, double
183201

184202
// add/merge FTS results (already normalized to 0..1)
185203
for (int i = 0; i < c->fts.count; i++) {
186-
sqlite3_int64 hash = c->fts.hash[i];
204+
uint64_t hash = c->fts.hash[i];
187205
sqlite3_int64 seq = c->fts.seq[i];
188206

189207
// check if already in merge list
@@ -230,7 +248,7 @@ int vMemorySearchCursorMerge(vMemorySearchCursor *c, double vectorWeight, double
230248
// swap all parallel arrays together
231249
for (int i = 1; i < c->merge.count; i++) {
232250
double tempScore = c->merge.textScore[i];
233-
sqlite3_int64 tempHash = c->merge.hash[i];
251+
uint64_t tempHash = c->merge.hash[i];
234252
sqlite3_int64 tempSeq = c->merge.seq[i];
235253

236254
int j = i - 1;
@@ -269,7 +287,7 @@ static void vMemorySearchUpdateAccess(sqlite3 *db, vMemorySearchCursor *c) {
269287

270288
for (int i = 0; i < c->count; i++) {
271289
sqlite3_bind_int64(vm, 1, now);
272-
sqlite3_bind_int64(vm, 2, c->hash[i]);
290+
dbmem_search_bind_hash(vm, 2, c->hash[i]);
273291
sqlite3_step(vm);
274292
sqlite3_reset(vm);
275293
}
@@ -308,8 +326,7 @@ static int dbmem_fts_search (sqlite3 *db, vMemorySearchCursor *c, const char *in
308326
"SELECT rank, hash, seq FROM dbmem_vault_fts WHERE content MATCH ?1 ORDER BY rank LIMIT ?2";
309327
static const char *sql_with_context =
310328
"SELECT fts.rank, fts.hash, fts.seq FROM dbmem_vault_fts AS fts "
311-
"JOIN dbmem_vault AS v ON fts.hash = v.hash AND fts.seq = v.seq "
312-
"WHERE fts.content MATCH ?1 AND INSTR(',' || ?3 || ',', ',' || v.context || ',') > 0 "
329+
"WHERE fts.content MATCH ?1 AND INSTR(',' || ?3 || ',', ',' || fts.context || ',') > 0 "
313330
"ORDER BY fts.rank LIMIT ?2";
314331
const char *sql = (context) ? sql_with_context : sql_no_context;
315332

@@ -347,7 +364,10 @@ static int dbmem_fts_search (sqlite3 *db, vMemorySearchCursor *c, const char *in
347364
if (rank > rank_max) rank_max = rank;
348365

349366
c->fts.rank[count] = rank;
350-
c->fts.hash[count] = sqlite3_column_int64(vm, 1);
367+
if (!dbmem_search_column_hash(vm, 1, &c->fts.hash[count])) {
368+
rc = SQLITE_MISMATCH;
369+
break;
370+
}
351371
c->fts.seq[count] = sqlite3_column_int64(vm, 2);
352372
c->fts.count++;
353373

@@ -409,7 +429,10 @@ static int dbmem_semantic_search (sqlite3 *db, vMemorySearchCursor *c, float *em
409429

410430
// SQLITE_ROW
411431
c->semantic.rank[count] = sqlite3_column_double(vm, 0);
412-
c->semantic.hash[count] = sqlite3_column_int64(vm, 1);
432+
if (!dbmem_search_column_hash(vm, 1, &c->semantic.hash[count])) {
433+
rc = SQLITE_MISMATCH;
434+
break;
435+
}
413436
c->semantic.seq[count] = sqlite3_column_int64(vm, 2);
414437
c->semantic.count++;
415438

@@ -530,7 +553,10 @@ static int vMemorySearchCursorColumn (sqlite3_vtab_cursor *cur, sqlite3_context
530553

531554
switch (iCol) {
532555
case SEARCH_COLUMN_HASH:
533-
sqlite3_result_int64(context, c->hash[c->index]);
556+
{
557+
char hash_text[DBMEM_HASH_STR_MAXLEN];
558+
sqlite3_result_text(context, dbmem_hash_to_hex(c->hash[c->index], hash_text), -1, SQLITE_TRANSIENT);
559+
}
534560
break;
535561

536562
case SEARCH_COLUMN_SEQ:
@@ -546,7 +572,7 @@ static int vMemorySearchCursorColumn (sqlite3_vtab_cursor *cur, sqlite3_context
546572
const char *sql = (iCol == SEARCH_COLUMN_PATH) ? path_sql : snippet_sql;
547573
sqlite3_stmt *vm = NULL;
548574
if (sqlite3_prepare_v2(db, sql, -1, &vm, NULL) == SQLITE_OK) {
549-
sqlite3_bind_int64(vm, 1, c->hash[c->index]);
575+
dbmem_search_bind_hash(vm, 1, c->hash[c->index]);
550576
if (iCol == SEARCH_COLUMN_SNIPPET) sqlite3_bind_int64(vm, 2, c->seq[c->index]);
551577
if (sqlite3_step(vm) == SQLITE_ROW) sqlite3_result_value(context, sqlite3_column_value(vm, 0));
552578
}
@@ -612,6 +638,11 @@ static int vMemorySearchCursorFilter (sqlite3_vtab_cursor *cur, int idxNum, cons
612638
fetch_count = max_results;
613639
}
614640

641+
if (fetch_count <= 0) {
642+
c->count = 0;
643+
return SQLITE_OK;
644+
}
645+
615646
// allocate internal cursor buffer
616647
int rc = vMemorySearchCursorAllocate(c, fetch_count, perform_fts);
617648
if (rc != SQLITE_OK) return SQLITE_NOMEM;
@@ -698,9 +729,9 @@ static int vMemorySearchCursorFilter (sqlite3_vtab_cursor *cur, int idxNum, cons
698729
printf("=================================\n");
699730
for (int i = 0; i < c->count; i++) {
700731
double rank = c->rank[i];
701-
sqlite3_int64 hash = c->hash[i];
732+
uint64_t hash = c->hash[i];
702733
sqlite3_int64 seq = c->seq[i];
703-
printf("%3d %.3f %20lld %2lld\n", i, rank, (long long)hash, (long long)seq);
734+
printf("%3d %.3f %016llx %2lld\n", i, rank, (unsigned long long)hash, (long long)seq);
704735
}
705736
printf("=================================\n");
706737
#endif

0 commit comments

Comments
 (0)