basicmachines-co · phernandez · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026
diff --git a/src/basic_memory/services/project_service.py b/src/basic_memory/services/project_service.py
@@ -997,25 +997,33 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
             )
 
         # --- Count queries (tables exist) ---
+        # Filter by entity existence to exclude stale rows from deleted entities
+        # that remain in derived search tables (search_index, search_vector_chunks)
+        entity_exists = "AND entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"
+
         si_result = await self.repository.execute_query(
             text(
-                "SELECT COUNT(DISTINCT entity_id) FROM search_index WHERE project_id = :project_id"
+                "SELECT COUNT(DISTINCT entity_id) FROM search_index "
+                f"WHERE project_id = :project_id {entity_exists}"
             ),
             {"project_id": project_id},
         )
         total_indexed_entities = si_result.scalar() or 0
 
         try:
             chunks_result = await self.repository.execute_query(
-                text("SELECT COUNT(*) FROM search_vector_chunks WHERE project_id = :project_id"),
+                text(
+                    "SELECT COUNT(*) FROM search_vector_chunks "
+                    f"WHERE project_id = :project_id {entity_exists}"
+                ),
                 {"project_id": project_id},
             )
             total_chunks = chunks_result.scalar() or 0
 
             entities_with_chunks_result = await self.repository.execute_query(
                 text(
                     "SELECT COUNT(DISTINCT entity_id) FROM search_vector_chunks "
-                    "WHERE project_id = :project_id"
+                    f"WHERE project_id = :project_id {entity_exists}"
                 ),
                 {"project_id": project_id},
             )

diff --git a/src/basic_memory/services/search_service.py b/src/basic_memory/services/search_service.py
@@ -403,6 +403,11 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
         """
         entities = await self.entity_repository.find_all()
         entity_ids = [entity.id for entity in entities]
+
+        # Clean up stale rows in search_index and search_vector_chunks
+        # that reference entity_ids no longer in the entity table
+        await self._purge_stale_search_rows(set(entity_ids))
+
         batch_result = await self.repository.sync_entity_vectors_batch(
             entity_ids,
             progress_callback=progress_callback,
@@ -419,6 +424,52 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
 
         return stats
 
+    async def _purge_stale_search_rows(self, valid_entity_ids: set[int]) -> None:
+        """Remove rows from search_index and search_vector_chunks for deleted entities.
+
+        Trigger: entities are deleted but their derived search rows remain
+        Why: stale rows inflate embedding coverage stats in project info
+        Outcome: search tables only contain rows for entities that still exist
+
+        Args:
+            valid_entity_ids: IDs of entities that still exist. Not consulted here:
+                staleness is decided in SQL against the entity table instead.
+        """
+        from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository
+        from sqlalchemy import text
+
+        project_id = self.repository.project_id
+        params = {"project_id": project_id}
+        # Shared predicate: rows of this project whose entity_id has no entity row
+        stale_rows = (
+            "project_id = :project_id "
+            "AND entity_id NOT IN (SELECT id FROM entity WHERE project_id = :project_id)"
+        )
+
+        # Delete stale search_index rows
+        await self.repository.execute_query(
+            text(f"DELETE FROM search_index WHERE {stale_rows}"),
+            params,
+        )
+
+        # SQLite vec has no CASCADE — must delete embeddings before chunks
+        if isinstance(self.repository, SQLiteSearchRepository):
+            await self.repository.execute_query(
+                text(
+                    "DELETE FROM search_vector_embeddings WHERE rowid IN ("
+                    f"SELECT id FROM search_vector_chunks WHERE {stale_rows})"
+                ),
+                params,
+            )
+
+        # Runs on every backend; on Postgres, CASCADE removes the embeddings as well
+        await self.repository.execute_query(
+            text(f"DELETE FROM search_vector_chunks WHERE {stale_rows}"),
+            params,
+        )
+
+        logger.info("Purged stale search rows for deleted entities", project_id=project_id)
+
     async def index_entity_file(
         self,
         entity: Entity,

diff --git a/tests/services/test_project_service_embedding_status.py b/tests/services/test_project_service_embedding_status.py
@@ -251,6 +251,47 @@ async def test_embedding_status_healthy(project_service: ProjectService, test_gr
     assert status.reindex_reason is None
 
 
+@pytest.mark.asyncio
+async def test_embedding_status_excludes_stale_entity_ids(
+    project_service: ProjectService, test_graph, test_project
+):
+    """Stale rows in search_index for deleted entities should not inflate counts.
+
+    Regression test for #670: after reindex, project info reported missing embeddings
+    because stale entity_ids in search_index/search_vector_chunks inflated total_indexed_entities.
+    """
+    # A search_index row for an entity_id that doesn't exist in the entity table
+    stale_entity_id = 999999
+    insert_stale_row = text(
+        "INSERT INTO search_index "
+        "(entity_id, project_id, type, title, permalink, content_stems, "
+        "content_snippet, file_path, metadata) "
+        "VALUES (:eid, :pid, 'entity', 'Stale Note', 'stale-note', "
+        "'stale content', 'stale snippet', 'stale.md', '{}')"
+    )
+
+    with patch.object(
+        type(project_service),
+        "config_manager",
+        new_callable=lambda: property(
+            lambda self: _config_manager_with(semantic_search_enabled=True)
+        ),
+    ):
+        # Baseline first, so the check holds even if not every entity is indexed
+        baseline = await project_service.get_embedding_status(test_project.id)
+        await project_service.repository.execute_query(
+            insert_stale_row, {"eid": stale_entity_id, "pid": test_project.id}
+        )
+        status = await project_service.get_embedding_status(test_project.id)
+
+    # The stale entity_id should NOT be counted in total_indexed_entities
+    assert status.total_indexed_entities == baseline.total_indexed_entities
+    real_entity_result = await project_service.repository.execute_query(
+        text("SELECT COUNT(*) FROM entity WHERE project_id = :pid"), {"pid": test_project.id}
+    )
+    assert status.total_indexed_entities <= (real_entity_result.scalar() or 0)
+
+
 @pytest.mark.asyncio
 async def test_get_project_info_includes_embedding_status(
     project_service: ProjectService, test_graph, test_project