Skip to content

Commit e846ae8

Browse files
phernandez and claude committed
fix: semantic embeddings not generated on fresh DB or upgrade
The previous backfill trigger relied on Alembic revision tracking, but alembic_version only stores the head revision — intermediate revisions (like the backfill trigger) are invisible after a multi-step upgrade or fresh DB creation.

Three changes fix this:

1. Replace Alembic revision check with a simple "entities exist but embeddings are empty" check that works regardless of migration path
2. Generate embeddings during sync — after FTS indexing, batch-embed all synced entities at the end of the sync operation
3. Add background backfill at MCP startup for the upgrade path (entities already exist, no embeddings) without blocking server readiness

Also adds clear startup logging for semantic embedding status so issues are easy to spot in the logs.

📋 Covers: fresh DB, upgrade from pre-embedding version, db reset, interrupted backfill

Signed-off-by: Pedro Hernandez <pedro@basicmachines.co>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 63e4bcd commit e846ae8

File tree

5 files changed: +201 -79 lines changed

src/basic_memory/db.py

Lines changed: 34 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -43,40 +43,37 @@
4343
_engine: Optional[AsyncEngine] = None
4444
_session_maker: Optional[async_sessionmaker[AsyncSession]] = None
4545

46-
# Alembic revision that enables one-time automatic embedding backfill.
47-
SEMANTIC_EMBEDDING_BACKFILL_REVISION = "i2c3d4e5f6g7"
4846

49-
50-
async def _load_applied_alembic_revisions(
47+
async def _needs_semantic_embedding_backfill(
48+
app_config: BasicMemoryConfig,
5149
session_maker: async_sessionmaker[AsyncSession],
52-
) -> set[str]:
53-
"""Load applied Alembic revisions from alembic_version.
50+
) -> bool:
51+
"""Check if entities exist but vector embeddings are empty.
5452
55-
Returns an empty set when the version table does not exist yet
56-
(fresh database before first migration).
53+
This is the reliable way to detect that embeddings need to be generated,
54+
regardless of how migrations were applied (fresh DB, upgrade, reset, etc.).
5755
"""
56+
if not app_config.semantic_search_enabled:
57+
return False
58+
5859
try:
5960
async with scoped_session(session_maker) as session:
60-
result = await session.execute(text("SELECT version_num FROM alembic_version"))
61-
return {str(row[0]) for row in result.fetchall() if row[0]}
61+
entity_count = (
62+
await session.execute(text("SELECT COUNT(*) FROM entity"))
63+
).scalar() or 0
64+
if entity_count == 0:
65+
return False
66+
67+
# Check if vector chunks table exists and is empty
68+
embedding_count = (
69+
await session.execute(text("SELECT COUNT(*) FROM search_vector_chunks"))
70+
).scalar() or 0
71+
72+
return embedding_count == 0
6273
except Exception as exc:
63-
error_message = str(exc).lower()
64-
if "alembic_version" in error_message and (
65-
"no such table" in error_message or "does not exist" in error_message
66-
):
67-
return set()
68-
raise
69-
70-
71-
def _should_run_semantic_embedding_backfill(
72-
revisions_before_upgrade: set[str],
73-
revisions_after_upgrade: set[str],
74-
) -> bool:
75-
"""Check if this migration run newly applied the backfill-trigger revision."""
76-
return (
77-
SEMANTIC_EMBEDDING_BACKFILL_REVISION in revisions_after_upgrade
78-
and SEMANTIC_EMBEDDING_BACKFILL_REVISION not in revisions_before_upgrade
79-
)
74+
# Table might not exist yet (pre-migration)
75+
logger.debug(f"Could not check embedding status: {exc}")
76+
return False
8077

8178

8279
async def _run_semantic_embedding_backfill(
@@ -480,26 +477,9 @@ async def run_migrations(
480477
Note: Alembic tracks which migrations have been applied via the alembic_version table,
481478
so it's safe to call this multiple times - it will only run pending migrations.
482479
"""
483-
logger.debug("Running database migrations...")
480+
logger.info("Running database migrations...")
484481
temp_engine: AsyncEngine | None = None
485482
try:
486-
revisions_before_upgrade: set[str] = set()
487-
# Trigger: run_migrations() can be invoked before module-level session maker is set.
488-
# Why: we still need reliable before/after revision detection for one-time backfill.
489-
# Outcome: create a short-lived session maker when needed, then dispose it immediately.
490-
if _session_maker is None:
491-
precheck_engine, temp_session_maker = _create_engine_and_session(
492-
app_config.database_path,
493-
database_type,
494-
app_config,
495-
)
496-
try:
497-
revisions_before_upgrade = await _load_applied_alembic_revisions(temp_session_maker)
498-
finally:
499-
await precheck_engine.dispose()
500-
else:
501-
revisions_before_upgrade = await _load_applied_alembic_revisions(_session_maker)
502-
503483
# Get the absolute path to the alembic directory relative to this file
504484
alembic_dir = Path(__file__).parent / "alembic"
505485
config = Config()
@@ -519,7 +499,7 @@ async def run_migrations(
519499
config.set_main_option("sqlalchemy.url", db_url)
520500

521501
command.upgrade(config, "head")
522-
logger.debug("Migrations completed successfully")
502+
logger.info("Migrations completed successfully")
523503

524504
# Get session maker - ensure we don't trigger recursive migration calls
525505
if _session_maker is None:
@@ -541,12 +521,14 @@ async def run_migrations(
541521
else:
542522
await SQLiteSearchRepository(session_maker, 1).init_search_index()
543523

544-
revisions_after_upgrade = await _load_applied_alembic_revisions(session_maker)
545-
if _should_run_semantic_embedding_backfill(
546-
revisions_before_upgrade,
547-
revisions_after_upgrade,
548-
):
549-
await _run_semantic_embedding_backfill(app_config, session_maker)
524+
# Check if backfill is needed — actual backfill runs in background
525+
# from the MCP server lifespan to avoid blocking startup.
526+
if await _needs_semantic_embedding_backfill(app_config, session_maker):
527+
logger.info(
528+
"Semantic embeddings missing — backfill will run in background after startup"
529+
)
530+
else:
531+
logger.info("Semantic embeddings: up to date")
550532
except Exception as e: # pragma: no cover
551533
logger.error(f"Error running migrations: {e}")
552534
raise

src/basic_memory/mcp/server.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,71 @@
22
Basic Memory FastMCP server.
33
"""
44

5+
import asyncio
56
import time
67
from contextlib import asynccontextmanager
78

89
from fastmcp import FastMCP
910
from loguru import logger
11+
from sqlalchemy import text
12+
from sqlalchemy.ext.asyncio import async_sessionmaker, AsyncSession
1013

1114
from basic_memory import db
1215
from basic_memory.cli.auth import CLIAuth
16+
from basic_memory.config import BasicMemoryConfig
17+
from basic_memory.db import (
18+
scoped_session,
19+
_needs_semantic_embedding_backfill,
20+
_run_semantic_embedding_backfill,
21+
)
1322
from basic_memory.mcp.container import McpContainer, set_container
1423
from basic_memory.services.initialization import initialize_app
1524

1625

26+
async def _log_embedding_status(session_maker: async_sessionmaker[AsyncSession]) -> None:
27+
"""Log a clear summary of semantic embedding status at startup."""
28+
try:
29+
async with scoped_session(session_maker) as session:
30+
entity_count = (
31+
await session.execute(text("SELECT COUNT(*) FROM entity"))
32+
).scalar() or 0
33+
chunk_count = (
34+
await session.execute(text("SELECT COUNT(*) FROM search_vector_chunks"))
35+
).scalar() or 0
36+
embedding_count = (
37+
await session.execute(text("SELECT COUNT(*) FROM search_vector_embeddings_rowids"))
38+
).scalar() or 0
39+
40+
if entity_count == 0:
41+
logger.info("Semantic embeddings: no entities yet")
42+
elif embedding_count == 0:
43+
logger.warning(
44+
f"Semantic embeddings: EMPTY — {entity_count} entities have no embeddings. "
45+
"Backfill running in background..."
46+
)
47+
else:
48+
logger.info(
49+
f"Semantic embeddings: {embedding_count} embeddings "
50+
f"across {chunk_count} chunks for {entity_count} entities"
51+
)
52+
except Exception as exc:
53+
logger.debug(f"Could not check embedding status at startup: {exc}")
54+
55+
56+
async def _background_embedding_backfill(
57+
config: BasicMemoryConfig,
58+
session_maker: async_sessionmaker[AsyncSession],
59+
) -> None:
60+
"""Run semantic embedding backfill in the background without blocking startup."""
61+
try:
62+
if await _needs_semantic_embedding_backfill(config, session_maker):
63+
logger.info("Background embedding backfill starting...")
64+
await _run_semantic_embedding_backfill(config, session_maker)
65+
await _log_embedding_status(session_maker)
66+
except Exception as exc:
67+
logger.error(f"Background embedding backfill failed: {exc}")
68+
69+
1770
@asynccontextmanager
1871
async def lifespan(app: FastMCP):
1972
"""Lifecycle manager for the MCP server.
@@ -70,6 +123,16 @@ async def lifespan(app: FastMCP):
70123
# Initialize app (runs migrations, reconciles projects)
71124
await initialize_app(container.config)
72125

126+
# Log embedding status so it's easy to spot in the logs
127+
backfill_task: asyncio.Task | None = None # type: ignore[type-arg]
128+
if config.semantic_search_enabled and db._session_maker is not None:
129+
await _log_embedding_status(db._session_maker)
130+
# Launch backfill in background so MCP server is ready immediately
131+
backfill_task = asyncio.create_task(
132+
_background_embedding_backfill(config, db._session_maker),
133+
name="embedding-backfill",
134+
)
135+
73136
# Create and start sync coordinator (lifecycle centralized in coordinator)
74137
sync_coordinator = container.create_sync_coordinator()
75138
await sync_coordinator.start()
@@ -79,6 +142,15 @@ async def lifespan(app: FastMCP):
79142
finally:
80143
# Shutdown - coordinator handles clean task cancellation
81144
logger.debug("Shutting down Basic Memory MCP server")
145+
146+
# Cancel embedding backfill if still running
147+
if backfill_task is not None and not backfill_task.done():
148+
backfill_task.cancel()
149+
try:
150+
await backfill_task
151+
except asyncio.CancelledError:
152+
logger.info("Background embedding backfill cancelled during shutdown")
153+
82154
await sync_coordinator.stop()
83155

84156
# Only shutdown DB if we created it (not if test fixture provided it)

src/basic_memory/sync/sync_service.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,12 +293,16 @@ async def sync(
293293
for path in report.deleted:
294294
await self.handle_delete(path)
295295

296-
# then new and modified
296+
# then new and modified — collect entity IDs for batch vector embedding
297+
synced_entity_ids: list[int] = []
298+
297299
for path in report.new:
298300
entity, _ = await self.sync_file(path, new=True)
299301

302+
if entity is not None:
303+
synced_entity_ids.append(entity.id)
300304
# Track if file was skipped
301-
if entity is None and await self._should_skip_file(path):
305+
elif await self._should_skip_file(path):
302306
failure_info = self._file_failures[path]
303307
report.skipped_files.append(
304308
SkippedFile(
@@ -312,8 +316,10 @@ async def sync(
312316
for path in report.modified:
313317
entity, _ = await self.sync_file(path, new=False)
314318

319+
if entity is not None:
320+
synced_entity_ids.append(entity.id)
315321
# Track if file was skipped
316-
if entity is None and await self._should_skip_file(path):
322+
elif await self._should_skip_file(path):
317323
failure_info = self._file_failures[path]
318324
report.skipped_files.append(
319325
SkippedFile(
@@ -331,6 +337,26 @@ async def sync(
331337
else:
332338
logger.info("Skipping relation resolution - no file changes detected")
333339

340+
# Batch-generate vector embeddings for all synced entities
341+
if synced_entity_ids and self.app_config.semantic_search_enabled:
342+
try:
343+
logger.info(
344+
f"Generating semantic embeddings for {len(synced_entity_ids)} entities..."
345+
)
346+
batch_result = await self.search_service.sync_entity_vectors_batch(
347+
synced_entity_ids
348+
)
349+
logger.info(
350+
f"Semantic embeddings complete: "
351+
f"synced={batch_result.entities_synced}, "
352+
f"failed={batch_result.entities_failed}"
353+
)
354+
except SemanticDependenciesMissingError:
355+
logger.warning(
356+
"Semantic search dependencies missing — vector embeddings skipped. "
357+
"Run 'bm reindex --embeddings' after resolving the dependency issue."
358+
)
359+
334360
# Update scan watermark after successful sync
335361
# Use the timestamp from sync start (not end) to ensure we catch files
336362
# created during the sync on the next iteration

tests/mcp/test_project_context.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ async def test_returns_none_when_no_default_and_no_project(config_manager, monke
3131
config_manager.save_config(cfg)
3232

3333
monkeypatch.delenv("BASIC_MEMORY_MCP_PROJECT", raising=False)
34+
3435
# Prevent API fallback from returning a project via stale dependency overrides
3536
async def _no_api_fallback():
3637
return None
@@ -117,6 +118,7 @@ async def test_returns_none_when_no_default(config_manager, monkeypatch):
117118
config_manager.save_config(cfg)
118119

119120
monkeypatch.delenv("BASIC_MEMORY_MCP_PROJECT", raising=False)
121+
120122
# Prevent API fallback from returning a project via stale dependency overrides
121123
async def _no_api_fallback():
122124
return None

0 commit comments

Comments
 (0)