feat: add preserve duplicate paths option (#9)

andinux · web-flow · commit dd50ee5e9564 · 2026-06-04T15:26:42.000-06:00
Add the preserve_duplicate_paths option for virtual-file/editor workflows that need distinct logical paths even when content is identical or empty.

When enabled with SELECT memory_set_option('preserve_duplicate_paths', 1), storage hashes are scoped by path so dbmem_content can keep separate rows while the embedding cache still reuses chunk embeddings by text.

Fix empty content handling so memory_add_content() and memory_add_file() can store zero-length entries without producing chunks, and keep default deduplication behavior unchanged when the option is 0.

Document the option, bump the extension version to 1.3.2, and cover default dedupe, duplicate preservation, and empty file/content behavior with unit tests.
diff --git a/API.md b/API.md
@@ -35,7 +35,7 @@ sqlite-memory enables semantic search over text content stored in SQLite. It:
 
 ## Sync Behavior
 
-All `memory_add_*` functions use **content-hash change detection** to avoid redundant embedding computation. Each piece of content is hashed before processing — if the hash already exists in the database, the content is skipped.
+By default, all `memory_add_*` functions use **content-hash change detection** to avoid redundant embedding computation. Each piece of content is hashed before processing — if the hash already exists in the database, the content is skipped. Set `preserve_duplicate_paths=1` to store distinct logical paths even when their content is identical or empty.
 
 ### Change Detection
 
@@ -197,6 +197,9 @@ SELECT memory_set_option('engine_warmup', 1);
 
 -- Set minimum score threshold
 SELECT memory_set_option('min_score', 0.75);
+
+-- Preserve separate logical paths even when content is identical
+SELECT memory_set_option('preserve_duplicate_paths', 1);
 ```
 
 ---
@@ -210,7 +213,7 @@ Retrieves a configuration option value.
 |-----------|------|-------------|
 | `key` | TEXT | Option name |
 
-**Returns:** ANY - Option value, or NULL if not set
+**Returns:** ANY - Option value, or NULL if not set. The exception is `preserve_duplicate_paths`, for which `0` is returned when the option has never been set.
 
 **Example:**
 ```sql
@@ -303,6 +306,7 @@ Indexes caller-provided file content without reading from the filesystem.
 - No row is added to `dbmem_content_source` because content was supplied by the caller rather than read from the local filesystem
 - If the path was previously indexed with different content, the old entry (chunks, embeddings, FTS) is deleted and new content is reindexed
 - If the new content is already indexed under another path, the stale path is removed and the existing content entry is reused
+- Set `preserve_duplicate_paths=1` to preserve separate rows for distinct paths with identical or empty content
 - Available even when compiled with `DBMEM_OMIT_IO`
 
 **Example:**
@@ -828,6 +832,7 @@ sqlite3_memory_register_provider(db, "my-engine", &provider);
 | `embedding_cache` | INTEGER | 1 | Cache embeddings to avoid redundant computation |
 | `cache_max_entries` | INTEGER | 0 | Max cache entries (0 = no limit). When exceeded, oldest entries are evicted |
 | `search_oversample` | INTEGER | 0 | Search oversampling multiplier (0 = no oversampling). When set, retrieves N * multiplier candidates from each index before merging down to N final results |
+| `preserve_duplicate_paths` | INTEGER | 0 | Preserve distinct logical paths for identical or empty content. When enabled, `dbmem_content.hash` is path-scoped and identifies an entry rather than only the raw content |
 
 ---
 
diff --git a/README.md b/README.md
@@ -210,7 +210,7 @@ memories = recall("what's the project timeline")
 
 ## Intelligent Sync
 
-All `memory_add_*` functions use content-hash change detection to avoid redundant work:
+By default, all `memory_add_*` functions use content-hash change detection to avoid redundant work:
 
 - **`memory_add_text`**: Computes a hash of the content. If the same content was already indexed, it is skipped entirely. No duplicate embeddings are ever created.
 - **`memory_add_file`**: Reads the file and hashes its content. If the file was previously indexed with different content, the old entry (chunks, embeddings, FTS) is atomically replaced. Unchanged files are skipped. Absolute file paths are stored as portable logical suffixes, while the original local path is retained only in local metadata.
@@ -219,6 +219,14 @@ All `memory_add_*` functions use content-hash change detection to avoid redundan
   1. **Cleanup**: Removes database entries for files that no longer exist on disk
   2. **Scan**: Recursively processes all matching files - adding new ones, replacing modified ones, and skipping unchanged ones. Stored paths are relative to the scanned directory root, with local provenance retained only in local metadata.
 
+For virtual-file or editor workflows that need separate logical paths even when content is identical or empty, enable path-preserving storage:
+
+```sql
+SELECT memory_set_option('preserve_duplicate_paths', 1);
+```
+
+In this mode, `dbmem_content.hash` identifies the stored entry and is scoped by path.
+
 `memory_add_text()`, `memory_add_file()`, and `memory_add_content()` each run inside a SQLite SAVEPOINT transaction. `memory_add_directory()` performs its cleanup pass transactionally and then processes each file in its own transaction. If one file fails, that file rolls back cleanly and previously-committed files remain valid; there are no partially-indexed rows or orphaned chunk/FTS entries for the failed file.
 
 This makes all sync functions safe to call repeatedly - for example, on a cron schedule or at agent startup - with minimal overhead.
@@ -300,6 +308,7 @@ SELECT memory_set_option('search_oversample', 4); -- Fetch 4x candidates before
 
 -- File processing
 SELECT memory_set_option('extensions', 'md,txt,rst');  -- File types to index
+SELECT memory_set_option('preserve_duplicate_paths', 1); -- Keep separate paths for duplicate/empty content
 
 -- Embedding cache (enabled by default)
 SELECT memory_set_option('embedding_cache', 0);        -- Disable cache
diff --git a/src/sqlite-memory.c b/src/sqlite-memory.c
@@ -64,6 +64,7 @@ SQLITE_EXTENSION_INIT1
 #define DBMEM_SETTINGS_KEY_EMBEDDING_CACHE      "embedding_cache"
 #define DBMEM_SETTINGS_KEY_CACHE_MAX_ENTRIES    "cache_max_entries"
 #define DBMEM_SETTINGS_KEY_SEARCH_OVERSAMPLE    "search_oversample"
+#define DBMEM_SETTINGS_KEY_PRESERVE_DUP_PATHS   "preserve_duplicate_paths"
 #define DBMEM_SETTINGS_KEY_SCHEMA_VERSION       "schema_version"
 
 #define DBMEM_SCHEMA_VERSION                    4
@@ -126,6 +127,7 @@ struct dbmem_context {
     bool        embedding_cache;                // Enable/disable embedding cache (default: true)
     int         cache_max_entries;              // Max cache entries (0 = no limit)
     int         search_oversample;             // Search oversampling multiplier (0 = no oversampling)
+    bool        preserve_duplicate_paths;       // Keep separate rows for distinct paths with identical content
 
     // Cache
     float       *cache_buffer;                  // Reusable buffer for cache hits
@@ -181,6 +183,16 @@ static bool dbmem_value_hash (sqlite3_value *value, uint64_t *hash) {
     }
 }
 
+static uint64_t dbmem_storage_hash_compute (const char *buffer, size_t len, const char *path, bool preserve_duplicate_paths) { // Storage hash for a content buffer: content-only by default, combined with the path hash when preserve_duplicate_paths is true
+    uint64_t content_hash = dbmem_hash_compute(buffer, len); // hash of the len content bytes in buffer
+    if (!preserve_duplicate_paths || !path || !path[0]) return content_hash; // option off, or NULL/empty path: return the plain content hash

+    uint64_t parts[2]; // {content hash, path hash} pair that is hashed again below
+    parts[0] = content_hash;
+    parts[1] = dbmem_hash_compute(path, strlen(path)); // path is hashed without its NUL terminator
+    return dbmem_hash_compute(parts, sizeof(parts)); // hashes the in-memory bytes of both 64-bit values; NOTE(review): result presumably depends on host byte order - confirm if databases are shared across architectures
+}
+
 // MARK: - Settings -
 
 static int dbmem_settings_write (sqlite3 *db, const char *key, const char *text_value, sqlite3_int64 int_value, const sqlite3_value *sql_value, int bind_type) {
@@ -326,6 +338,12 @@ static int dbmem_settings_sync (dbmem_context *ctx, const char *key, sqlite3_val
         return 0;
     }
 
+    if (strcasecmp(key, DBMEM_SETTINGS_KEY_PRESERVE_DUP_PATHS) == 0) {
+        int n = sqlite3_value_int(value);
+        ctx->preserve_duplicate_paths = (n > 0) ? 1 : 0;
+        return 0;
+    }
+
     if (strcasecmp(key, DBMEM_SETTINGS_KEY_PROVIDER) == 0) {
         char *provider = dbmem_strdup((const char *)sqlite3_value_text(value));
         if (provider) {
@@ -668,10 +686,10 @@ static bool dbmem_database_check_if_stored (sqlite3 *db, uint64_t hash, int64_t
     rc = sqlite3_step(vm);
     if (rc == SQLITE_DONE) rc = SQLITE_OK;
     else if (rc != SQLITE_ROW) goto cleanup;
-
-    // SQLITE_ROW case
-    sqlite3_int64 saved_len = sqlite3_column_int64(vm, 0);
-    result = (saved_len == len);
+    else {
+        sqlite3_int64 saved_len = sqlite3_column_int64(vm, 0);
+        result = (saved_len == len);
+    }
 
 cleanup:
     if (vm) sqlite3_finalize(vm);
@@ -2390,7 +2408,11 @@ static void dbmem_get_option (sqlite3_context *context, int argc, sqlite3_value
 
     rc = sqlite3_step(vm);
     if (rc == SQLITE_DONE) {
-        sqlite3_result_null(context);
+        if (strcasecmp(key, DBMEM_SETTINGS_KEY_PRESERVE_DUP_PATHS) == 0) {
+            sqlite3_result_int(context, 0);
+        } else {
+            sqlite3_result_null(context);
+        }
         rc = SQLITE_OK;
     } else if (rc == SQLITE_ROW) {
         sqlite3_result_value(context, sqlite3_column_value(vm, 0));
@@ -2616,7 +2638,7 @@ static int dbmem_process_callback (const char *text, size_t len, size_t offset,
 }
 
 static int dbmem_process_buffer (dbmem_context *ctx, const char *buffer, int64_t len) {
-    uint64_t hash = dbmem_hash_compute(buffer, (size_t)len);
+    uint64_t hash = dbmem_storage_hash_compute(buffer, (size_t)len, ctx->path, ctx->preserve_duplicate_paths);
     const char *saved_path = ctx->path;
     char *unique_path = NULL;
     bool transaction_started = false;
@@ -2625,6 +2647,7 @@ static int dbmem_process_buffer (dbmem_context *ctx, const char *buffer, int64_t
         unique_path = dbmem_path_unique_storage_copy(ctx->db, ctx->path, ctx->source_path);
         if (!unique_path) return SQLITE_NOMEM;
         ctx->path = unique_path;
+        hash = dbmem_storage_hash_compute(buffer, (size_t)len, ctx->path, ctx->preserve_duplicate_paths);
     }
 
     sqlite3 *db = ctx->db;
@@ -2638,7 +2661,7 @@ static int dbmem_process_buffer (dbmem_context *ctx, const char *buffer, int64_t
         }
         dbmem_database_delete_stale_path(db, ctx->path, hash);
 
-        if (dbmem_database_check_if_stored(ctx->db, hash, len)) {
+        if (!ctx->preserve_duplicate_paths && dbmem_database_check_if_stored(ctx->db, hash, len)) {
             if (ctx->source_path) {
                 char *stored_path = dbmem_database_path_for_hash_copy(ctx->db, hash);
                 if (!stored_path) {
@@ -2670,6 +2693,8 @@ static int dbmem_process_buffer (dbmem_context *ctx, const char *buffer, int64_t
         if (rc != SQLITE_OK) goto cleanup;
     }
 
+    if (len == 0) goto cleanup;
+
     rc = dbmem_parse(buffer, (size_t)len, &settings);
 
     if (rc == SQLITE_OK && !ctx->dimension_saved) {
@@ -3529,20 +3554,25 @@ static void dbmem_sql_reindex (sqlite3_context *context, int argc, sqlite3_value
             break;
         }
 
-        uint64_t value_hash = dbmem_hash_compute(value, (size_t)value_len);
-        bool hash_matches = (stored_hash == value_hash);
-        bool value_has_vault = dbmem_database_hash_has_vault(db, value_hash);
-        bool needs_reindex = !hash_matches || !value_has_vault;
+        uint64_t content_hash = dbmem_hash_compute(value, (size_t)value_len);
+        uint64_t scoped_hash = dbmem_storage_hash_compute(value, (size_t)value_len, path, true);
+        bool hash_matches = (stored_hash == content_hash || stored_hash == scoped_hash);
+        uint64_t target_hash = hash_matches
+            ? stored_hash
+            : dbmem_storage_hash_compute(value, (size_t)value_len, path, ctx->preserve_duplicate_paths);
+        bool target_has_vault = (value_len == 0) || dbmem_database_hash_has_vault(db, target_hash);
+        bool needs_hash_update = !hash_matches;
+        bool needs_reindex = (value_len > 0) && (!hash_matches || !target_has_vault);
 
-        if (needs_reindex && !value_has_vault) {
+        if (needs_reindex) {
             ctx->path = path;
             ctx->context = ctx_name;
             rc = dbmem_process_buffer(ctx, value, value_len);
         }
 
-        if (rc == SQLITE_OK && needs_reindex) {
-            rc = dbmem_database_update_content_hash(db, path, value_hash);
-            if (rc == SQLITE_OK && !hash_matches) {
+        if (rc == SQLITE_OK && needs_hash_update) {
+            rc = dbmem_database_update_content_hash(db, path, target_hash);
+            if (rc == SQLITE_OK) {
                 rc = dbmem_database_delete_index_hash(db, stored_hash);
             }
         }
@@ -3566,7 +3596,7 @@ static void dbmem_sql_reindex (sqlite3_context *context, int argc, sqlite3_value
         dbmemory_free(ctx_name);
 
         if (rc != SQLITE_OK) break;
-        if (needs_reindex) processed++;
+        if (needs_reindex || needs_hash_update) processed++;
     }
 
 done:
diff --git a/src/sqlite-memory.h b/src/sqlite-memory.h
@@ -26,7 +26,7 @@
 extern "C" {
 #endif
 
-#define SQLITE_DBMEMORY_VERSION "1.3.1"
+#define SQLITE_DBMEMORY_VERSION "1.3.2"
 
 // public API
 SQLITE_DBMEMORY_API int sqlite3_memory_init (sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi);
diff --git a/test/unittest.c b/test/unittest.c