Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions src/basic_memory/services/project_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,25 +997,33 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
)

# --- Count queries (tables exist) ---
# Filter by entity existence to exclude stale rows from deleted entities
# that remain in derived search tables (search_index, search_vector_chunks)
entity_exists = "AND entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"

si_result = await self.repository.execute_query(
text(
"SELECT COUNT(DISTINCT entity_id) FROM search_index WHERE project_id = :project_id"
"SELECT COUNT(DISTINCT entity_id) FROM search_index "
f"WHERE project_id = :project_id {entity_exists}"
),
{"project_id": project_id},
)
total_indexed_entities = si_result.scalar() or 0

try:
chunks_result = await self.repository.execute_query(
text("SELECT COUNT(*) FROM search_vector_chunks WHERE project_id = :project_id"),
text(
"SELECT COUNT(*) FROM search_vector_chunks "
f"WHERE project_id = :project_id {entity_exists}"
),
{"project_id": project_id},
)
total_chunks = chunks_result.scalar() or 0

entities_with_chunks_result = await self.repository.execute_query(
text(
"SELECT COUNT(DISTINCT entity_id) FROM search_vector_chunks "
"WHERE project_id = :project_id"
f"WHERE project_id = :project_id {entity_exists}"
),
{"project_id": project_id},
)
Expand Down
51 changes: 51 additions & 0 deletions src/basic_memory/services/search_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,11 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
"""
entities = await self.entity_repository.find_all()
entity_ids = [entity.id for entity in entities]

# Clean up stale rows in search_index and search_vector_chunks
# that reference entity_ids no longer in the entity table
await self._purge_stale_search_rows(set(entity_ids))
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Ensure vector tables exist before purging stale rows

reindex_vectors now calls _purge_stale_search_rows before sync_entity_vectors_batch, but table creation happens inside sync_entity_vectors_batch (_ensure_vector_tables). On a first-time embeddings run (for example bm reindex --embeddings on a project that has never built vectors), the new purge issues DELETE statements against search_vector_chunks/search_vector_embeddings before those tables exist, which raises a DB error and aborts reindexing.

Useful? React with 👍 / 👎.


batch_result = await self.repository.sync_entity_vectors_batch(
entity_ids,
progress_callback=progress_callback,
Expand All @@ -419,6 +424,52 @@ async def reindex_vectors(self, progress_callback=None) -> dict:

return stats

async def _purge_stale_search_rows(self, valid_entity_ids: set[int]) -> None:
    """Remove rows from search_index and search_vector_chunks for deleted entities.

    Trigger: entities are deleted but their derived search rows remain
    Why: stale rows inflate embedding coverage stats in project info
    Outcome: search tables only contain rows for entities that still exist

    The vector tables may not exist yet when this runs (e.g. the first
    embeddings run on a project that has never built vectors). In that case
    the vector purge is skipped with a warning instead of aborting the
    caller: there is nothing stale to remove from tables that do not exist.

    Args:
        valid_entity_ids: IDs of entities that still exist. Currently unused;
            staleness is decided in SQL against the entity table, so no ID
            list has to be bound as parameters. Kept so existing callers
            keep working.
    """
    from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository
    from sqlalchemy import text
    from sqlalchemy.exc import OperationalError, ProgrammingError

    project_id = self.repository.project_id
    stale_entity_filter = (
        "entity_id NOT IN (SELECT id FROM entity WHERE project_id = :project_id)"
    )
    params = {"project_id": project_id}

    # Delete stale search_index rows
    await self.repository.execute_query(
        text(
            f"DELETE FROM search_index WHERE project_id = :project_id AND {stale_entity_filter}"
        ),
        params,
    )

    # Vector tables are handled separately from search_index: a missing
    # vector table must not undo or block the search_index cleanup above.
    # NOTE(review): assumes execute_query runs each statement in its own
    # transaction scope, so a failed DELETE here does not poison later
    # queries — confirm against the repository implementation.
    try:
        # SQLite vec has no CASCADE — must delete embeddings before chunks
        if isinstance(self.repository, SQLiteSearchRepository):
            await self.repository.execute_query(
                text(
                    "DELETE FROM search_vector_embeddings WHERE rowid IN ("
                    "SELECT id FROM search_vector_chunks "
                    f"WHERE project_id = :project_id AND {stale_entity_filter})"
                ),
                params,
            )

        # Postgres CASCADE handles embedding deletion automatically
        await self.repository.execute_query(
            text(
                f"DELETE FROM search_vector_chunks "
                f"WHERE project_id = :project_id AND {stale_entity_filter}"
            ),
            params,
        )
    except (OperationalError, ProgrammingError) as exc:
        # SQLite reports a missing table as OperationalError; Postgres drivers
        # surface it as ProgrammingError. The error text is logged so that an
        # unrelated failure of the same class is still visible.
        logger.warning(
            "Skipped purging stale vector rows; vector tables unavailable",
            project_id=project_id,
            error=str(exc),
        )

    logger.info("Purged stale search rows for deleted entities", project_id=project_id)

async def index_entity_file(
self,
entity: Entity,
Expand Down
41 changes: 41 additions & 0 deletions tests/services/test_project_service_embedding_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,47 @@ async def test_embedding_status_healthy(project_service: ProjectService, test_gr
assert status.reindex_reason is None


@pytest.mark.asyncio
async def test_embedding_status_excludes_stale_entity_ids(
    project_service: ProjectService, test_graph, test_project
):
    """Orphaned search_index rows must not be counted as indexed entities.

    Regression test for #670: after reindex, project info reported missing embeddings
    because stale entity_ids in search_index/search_vector_chunks inflated total_indexed_entities.
    """
    # Seed search_index with a row whose entity_id has no match in the entity table.
    orphan_entity_id = 999999
    insert_orphan_row = text(
        "INSERT INTO search_index "
        "(entity_id, project_id, type, title, permalink, content_stems, "
        "content_snippet, file_path, metadata) "
        "VALUES (:eid, :pid, 'entity', 'Stale Note', 'stale-note', "
        "'stale content', 'stale snippet', 'stale.md', '{}')"
    )
    await project_service.repository.execute_query(
        insert_orphan_row,
        {"eid": orphan_entity_id, "pid": test_project.id},
    )

    def _semantic_config(_self):
        # Property getter standing in for the service's config_manager,
        # with semantic search switched on.
        return _config_manager_with(semantic_search_enabled=True)

    with patch.object(
        type(project_service),
        "config_manager",
        new_callable=lambda: property(_semantic_config),
    ):
        status = await project_service.get_embedding_status(test_project.id)

    # Count the entities that actually exist for this project; the orphan row
    # must not push the reported total above this number.
    count_live_entities = text("SELECT COUNT(*) FROM entity WHERE project_id = :pid")
    live_entity_rows = await project_service.repository.execute_query(
        count_live_entities,
        {"pid": test_project.id},
    )
    live_entity_total = live_entity_rows.scalar() or 0

    assert status.total_indexed_entities <= live_entity_total


@pytest.mark.asyncio
async def test_get_project_info_includes_embedding_status(
project_service: ProjectService, test_graph, test_project
Expand Down
Loading