Skip to content

Commit 093c94f

Browse files
authored
fix(core): clean up delete vectors and cloud sync (#733)
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 7945c1e commit 093c94f

File tree

8 files changed: 173 additions (+173) and 87 deletions (-87)

src/basic_memory/cli/commands/cloud/project_sync.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -124,22 +124,6 @@ def sync_project_command(
124124

125125
if success:
126126
console.print(f"[green]{name} synced successfully[/green]")
127-
128-
# Trigger database sync if not a dry run
129-
if not dry_run:
130-
131-
async def _trigger_db_sync():
132-
async with get_client(project_name=name) as client:
133-
return await ProjectClient(client).sync(
134-
project_data.external_id, force_full=False
135-
)
136-
137-
try:
138-
with force_routing(cloud=True):
139-
result = run_with_cleanup(_trigger_db_sync())
140-
console.print(f"[dim]Database sync initiated: {result.get('message')}[/dim]")
141-
except Exception as e:
142-
console.print(f"[yellow]Warning: Could not trigger database sync: {e}[/yellow]")
143127
else:
144128
console.print(f"[red]{name} sync failed[/red]")
145129
raise typer.Exit(1)
@@ -202,22 +186,6 @@ def bisync_project_command(
202186
sync_entry.last_sync = datetime.now()
203187
sync_entry.bisync_initialized = True
204188
ConfigManager().save_config(config)
205-
206-
# Trigger database sync if not a dry run
207-
if not dry_run:
208-
209-
async def _trigger_db_sync():
210-
async with get_client(project_name=name) as client:
211-
return await ProjectClient(client).sync(
212-
project_data.external_id, force_full=False
213-
)
214-
215-
try:
216-
with force_routing(cloud=True):
217-
result = run_with_cleanup(_trigger_db_sync())
218-
console.print(f"[dim]Database sync initiated: {result.get('message')}[/dim]")
219-
except Exception as e:
220-
console.print(f"[yellow]Warning: Could not trigger database sync: {e}[/yellow]")
221189
else:
222190
console.print(f"[red]{name} bisync failed[/red]")
223191
raise typer.Exit(1)

src/basic_memory/repository/search_repository.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ async def sync_entity_vectors(self, entity_id: int) -> None:
7070
"""Sync semantic vector chunks for an entity."""
7171
...
7272

73+
async def delete_entity_vector_rows(self, entity_id: int) -> None:
    """Remove the semantic vector chunks and embeddings stored for a single entity.

    Declaration only: the body is intentionally empty at this level.

    Args:
        entity_id: Identifier of the entity whose vector rows are removed.
    """
    ...
76+
7377
async def sync_entity_vectors_batch(
7478
self,
7579
entity_ids: list[int],

src/basic_memory/repository/search_repository_base.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,15 @@ async def execute_query(
454454
logger.debug(f"Query executed successfully in {elapsed_time:.2f}s.")
455455
return result
456456

457+
async def delete_entity_vector_rows(self, entity_id: int) -> None:
    """Remove one entity's derived vector rows through the backend's cleanup path.

    The work happens in a single scoped session: the session is prepared for
    vector access, the entity's chunks are deleted, and the change is committed.

    Args:
        entity_id: Identifier of the entity whose vector rows are removed.
    """
    # The vector tables must be in place before anything is deleted from them.
    await self._ensure_vector_tables()

    async with db.scoped_session(self.session_maker) as vector_session:
        # Backend hook that readies this exact session for vector-table access
        # (presumably what loads sqlite-vec on SQLite -- confirm in the subclass).
        await self._prepare_vector_session(vector_session)
        await self._delete_entity_chunks(vector_session, entity_id)
        await vector_session.commit()
465+
457466
# ------------------------------------------------------------------
458467
# Shared semantic search: guard, text processing, chunking
459468
# ------------------------------------------------------------------

src/basic_memory/repository/sqlite_search_repository.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -565,21 +565,6 @@ async def _delete_stale_chunks(
565565
stale_params,
566566
)
567567

568-
async def delete_entity_vector_rows(self, entity_id: int) -> None:
569-
"""Delete one entity's vec rows on a sqlite-vec-enabled connection."""
570-
await self._ensure_vector_tables()
571-
572-
async with db.scoped_session(self.session_maker) as session:
573-
await self._ensure_sqlite_vec_loaded(session)
574-
575-
# Constraint: sqlite-vec virtual tables are only visible after vec0 is
576-
# loaded on this exact connection.
577-
# Why: generic repository sessions can reach search_vector_chunks but still
578-
# fail with "no such module: vec0" when touching embeddings.
579-
# Outcome: service-level cleanup routes vec-table deletes through this helper.
580-
await self._delete_entity_chunks(session, entity_id)
581-
await session.commit()
582-
583568
async def delete_project_vector_rows(self) -> None:
584569
"""Delete all vector rows for this project on a sqlite-vec-enabled connection."""
585570
await self._ensure_vector_tables()

src/basic_memory/services/search_service.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -660,7 +660,6 @@ def _entity_embeddings_enabled(entity: Entity) -> bool:
660660
async def _clear_entity_vectors(self, entity_id: int) -> None:
661661
"""Delete derived vector rows for one entity."""
662662
from basic_memory.repository.search_repository_base import SearchRepositoryBase
663-
from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository
664663

665664
# Trigger: semantic indexing is disabled for this repository instance.
666665
# Why: repositories only create vector tables when semantic search is enabled.
@@ -671,17 +670,7 @@ async def _clear_entity_vectors(self, entity_id: int) -> None:
671670
):
672671
return
673672

674-
params = {"project_id": self.repository.project_id, "entity_id": entity_id}
675-
if isinstance(self.repository, SQLiteSearchRepository):
676-
await self.repository.delete_entity_vector_rows(entity_id)
677-
else:
678-
await self.repository.execute_query(
679-
text(
680-
"DELETE FROM search_vector_chunks "
681-
"WHERE project_id = :project_id AND entity_id = :entity_id"
682-
),
683-
params,
684-
)
673+
await self.repository.delete_entity_vector_rows(entity_id)
685674

686675
async def index_entity_file(
687676
self,
@@ -889,7 +878,7 @@ async def delete_by_entity_id(self, entity_id: int):
889878
await self.repository.delete_by_entity_id(entity_id)
890879

891880
async def handle_delete(self, entity: Entity):
892-
"""Handle complete entity deletion from search index including observations and relations.
881+
"""Handle complete entity deletion from search and semantic index state.
893882
894883
This replicates the logic from sync_service.handle_delete() to properly clean up
895884
all search index entries for an entity and its related data.
@@ -916,3 +905,8 @@ async def handle_delete(self, entity: Entity):
916905
await self.delete_by_permalink(permalink)
917906
else:
918907
await self.delete_by_entity_id(entity.id)
908+
909+
# Trigger: entity deletion removes the source rows for this note.
910+
# Why: semantic chunks/embeddings are stored separately from search_index rows.
911+
# Outcome: deleting an entity clears both full-text and vector-derived search state.
912+
await self._clear_entity_vectors(entity.id)

tests/cli/cloud/test_project_sync_command.py

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Tests for cloud sync and bisync command behavior."""
22

33
import importlib
4-
from contextlib import asynccontextmanager
54
from types import SimpleNamespace
65

76
import pytest
@@ -20,11 +19,10 @@
2019
["cloud", "bisync", "--name", "research"],
2120
],
2221
)
23-
def test_cloud_sync_commands_use_incremental_db_sync(monkeypatch, argv, config_manager):
24-
"""Cloud sync commands should not force a full database re-index after file sync."""
22+
def test_cloud_sync_commands_skip_explicit_cloud_project_sync(monkeypatch, argv, config_manager):
23+
"""Cloud sync commands should not trigger an extra explicit cloud project sync."""
2524
project_sync_command = importlib.import_module("basic_memory.cli.commands.cloud.project_sync")
2625

27-
seen: dict[str, object] = {}
2826
config = config_manager.load_config()
2927
config.set_project_mode("research", ProjectMode.CLOUD)
3028
config_manager.save_config(config)
@@ -50,30 +48,10 @@ def test_cloud_sync_commands_use_incremental_db_sync(monkeypatch, argv, config_m
5048
monkeypatch.setattr(project_sync_command, "project_sync", lambda *args, **kwargs: True)
5149
monkeypatch.setattr(project_sync_command, "project_bisync", lambda *args, **kwargs: True)
5250

53-
@asynccontextmanager
54-
async def fake_get_client(*, project_name=None, workspace=None):
55-
seen["project_name"] = project_name
56-
seen["workspace"] = workspace
57-
yield object()
58-
59-
class FakeProjectClient:
60-
def __init__(self, _client):
61-
pass
62-
63-
async def sync(self, external_id: str, force_full: bool = False):
64-
seen["external_id"] = external_id
65-
seen["force_full"] = force_full
66-
return {"message": "queued"}
67-
68-
monkeypatch.setattr(project_sync_command, "get_client", fake_get_client)
69-
monkeypatch.setattr(project_sync_command, "ProjectClient", FakeProjectClient)
70-
7151
result = runner.invoke(app, argv)
7252

7353
assert result.exit_code == 0, result.output
74-
assert seen["project_name"] == "research"
75-
assert seen["external_id"] == "external-project-id"
76-
assert seen["force_full"] is False
54+
assert "Database sync initiated" not in result.output
7755

7856

7957
def test_cloud_bisync_fails_fast_when_sync_entry_disappears(monkeypatch, config_manager):

tests/services/test_entity_service.py

Lines changed: 150 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66

77
import pytest
88
import yaml
9+
from sqlalchemy import text
910

10-
from basic_memory.config import ProjectConfig, BasicMemoryConfig
11+
from basic_memory import db
12+
from basic_memory.config import ProjectConfig, BasicMemoryConfig, DatabaseBackend
1113
from basic_memory.markdown import EntityParser
1214
from basic_memory.models import Entity as EntityModel
1315
from basic_memory.repository import EntityRepository
@@ -19,6 +21,98 @@
1921
from basic_memory.utils import generate_permalink
2022

2123

24+
class _DeleteTestEmbeddingProvider:
    """Embedding provider stub that maps text to fixed 4-dimensional vectors.

    Used by the entity delete cleanup tests, which need stable vectors rather
    than a real model: the output depends only on which keyword the text contains.
    """

    model_name = "delete-test"
    dimensions = 4

    async def embed_query(self, text: str) -> list[float]:
        """Return the fixed vector for a single query string."""
        return self._vectorize(text)

    async def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Return one fixed vector per document, in input order."""
        vectors: list[list[float]] = []
        for document in texts:
            vectors.append(self._vectorize(document))
        return vectors

    @staticmethod
    def _vectorize(text: str) -> list[float]:
        """Map text to a unit vector chosen by case-insensitive keyword match.

        "semantic" selects axis 0, "cleanup" selects axis 1, and anything else
        selects axis 2. Axis 3 is never set.
        """
        lowered = text.lower()
        # Checked in priority order: "semantic" wins when both keywords appear.
        keyword_axes = (("semantic", 0), ("cleanup", 1))
        axis = next((index for keyword, index in keyword_axes if keyword in lowered), 2)
        return [1.0 if position == axis else 0.0 for position in range(4)]
44+
45+
46+
async def _count_entity_search_state(
    session_maker,
    app_config: BasicMemoryConfig,
    project_id: int,
    entity_id: int,
) -> tuple[int, int, int]:
    """Count the derived search rows that reference one entity in one project.

    Returns:
        A ``(search_index, search_vector_chunks, search_vector_embeddings)``
        tuple of row counts. Embeddings are counted through a join onto their
        chunk rows.
    """
    # Embeddings link to their chunk via ``chunk_id`` on Postgres and via
    # ``rowid`` on every other backend.
    if app_config.database_backend == DatabaseBackend.POSTGRES:
        embedding_join = "e.chunk_id = c.id"
    else:
        embedding_join = "e.rowid = c.id"

    bind_values = {"project_id": project_id, "entity_id": entity_id}
    count_statements = (
        text(
            "SELECT COUNT(*) FROM search_index "
            "WHERE project_id = :project_id AND entity_id = :entity_id"
        ),
        text(
            "SELECT COUNT(*) FROM search_vector_chunks "
            "WHERE project_id = :project_id AND entity_id = :entity_id"
        ),
        text(
            "SELECT COUNT(*) FROM search_vector_embeddings e "
            "JOIN search_vector_chunks c ON "
            f"{embedding_join} "
            "WHERE c.project_id = :project_id AND c.entity_id = :entity_id"
        ),
    )

    outcomes = []
    async with db.scoped_session(session_maker) as session:
        # Executed one after another, in the order the totals are returned.
        for statement in count_statements:
            outcomes.append(await session.execute(statement, bind_values))

    index_total, chunk_total, embedding_total = (
        int(outcome.scalar_one()) for outcome in outcomes
    )
    return index_total, chunk_total, embedding_total
90+
91+
92+
@pytest.fixture
def entity_service_with_search(
    entity_repository: EntityRepository,
    observation_repository,
    relation_repository,
    entity_parser: EntityParser,
    file_service: FileService,
    link_resolver,
    search_service: SearchService,
    app_config: BasicMemoryConfig,
) -> EntityService:
    """Provide an EntityService wired to the real ``search_service`` fixture."""
    # Every collaborator comes straight from the fixture of the same name.
    collaborators = {
        "entity_parser": entity_parser,
        "entity_repository": entity_repository,
        "observation_repository": observation_repository,
        "relation_repository": relation_repository,
        "file_service": file_service,
        "link_resolver": link_resolver,
        "search_service": search_service,
        "app_config": app_config,
    }
    return EntityService(**collaborators)
114+
115+
22116
@pytest.mark.asyncio
23117
async def test_create_entity(
24118
entity_service: EntityService, file_service: FileService, project_config: ProjectConfig
@@ -227,6 +321,61 @@ async def test_delete_entity_by_id(entity_service: EntityService):
227321
await entity_service.get_by_permalink(entity_data.permalink)
228322

229323

324+
@pytest.mark.asyncio
async def test_delete_entity_removes_search_and_vector_state(
    entity_service_with_search: EntityService,
    search_service: SearchService,
    session_maker,
    app_config: BasicMemoryConfig,
):
    """Entity deletion must leave no full-text rows, vector chunks, or embeddings behind."""
    if app_config.database_backend == DatabaseBackend.SQLITE:
        # On SQLite the test is skipped when sqlite_vec cannot be imported.
        pytest.importorskip("sqlite_vec")

    # Switch the repository into semantic mode with a deterministic provider and
    # clear the initialized flag before re-running search index initialization.
    provider = _DeleteTestEmbeddingProvider()
    repo = search_service.repository
    repo._semantic_enabled = True
    repo._embedding_provider = provider
    repo._vector_dimensions = provider.dimensions
    repo._vector_tables_initialized = False
    await search_service.init_search_index()

    note = EntitySchema(
        title="Semantic Delete Target",
        directory="test",
        note_type="note",
        content=dedent("""
            # Semantic Delete Target

            - [note] Semantic cleanup should remove every derived row
            - references [[Cleanup Target]]
            """).strip(),
    )
    entity = await entity_service_with_search.create_entity(note)

    await search_service.index_entity(entity)
    await search_service.sync_entity_vectors(entity.id)

    async def derived_row_counts() -> tuple[int, int, int]:
        # Looks up the project and entity ids at call time, before and after the delete.
        return await _count_entity_search_state(
            session_maker,
            app_config,
            search_service.repository.project_id,
            entity.id,
        )

    # Precondition: indexing and vector sync both produced rows for this entity.
    index_total, chunk_total, embedding_total = await derived_row_counts()
    assert index_total >= 3
    assert chunk_total > 0
    assert embedding_total > 0

    deleted = await entity_service_with_search.delete_entity(entity.id)
    assert deleted is True

    assert await derived_row_counts() == (0, 0, 0)
377+
378+
230379
@pytest.mark.asyncio
231380
async def test_get_entity_by_permalink_not_found(entity_service: EntityService):
232381
"""Test handling of non-existent entity retrieval."""

tests/services/test_initialization.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,4 +196,3 @@ def capture_warning(message: str) -> None:
196196
"ensure_frontmatter_on_sync=True overrides disable_permalinks=True" in message
197197
for message in warnings
198198
)
199-

0 commit comments

Comments
 (0)