|
| 1 | +"""Integration regression test for get_embedding_status against a real vec0 table. |
| 2 | +
|
| 3 | +Regression for #658: after a successful `bm reindex --embeddings`, `bm project info` |
| 4 | +still reported "sqlite-vec is unavailable", "Indexed 0/N", and "Chunks 0", and |
| 5 | +recommended an unnecessary reindex. |
| 6 | +
|
| 7 | +Root cause: get_embedding_status() ran the vec0 JOIN count queries on a bare pooled |
| 8 | +ProjectRepository session that never loaded the sqlite-vec extension, so SQLite raised |
| 9 | +"no such module: vec0", which the except block mis-reported as "unavailable". |
| 10 | +
|
| 11 | +This test exercises the real failure path: it builds a REAL vec0 virtual table, writes a |
| 12 | +real embedding into it via the search repository, then queries get_embedding_status through |
| 13 | +a ProjectRepository session that did NOT pre-load the extension (mirroring the bug). The |
| 14 | +healthy unit test substitutes a plain regular table for vec0 and therefore does not cover |
| 15 | +this path. |
| 16 | +""" |
| 17 | + |
| 18 | +import os |
| 19 | +import sqlite3 |
| 20 | + |
| 21 | +import pytest |
| 22 | +from sqlalchemy import text |
| 23 | + |
| 24 | +from basic_memory import db |
| 25 | +from basic_memory.config import BasicMemoryConfig, DatabaseBackend |
| 26 | +from basic_memory.repository.entity_repository import EntityRepository |
| 27 | +from basic_memory.repository.project_repository import ProjectRepository |
| 28 | +from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository |
| 29 | +from basic_memory.services.project_service import ProjectService |
| 30 | + |
| 31 | + |
def _is_postgres() -> bool:
    """Report whether the suite was asked to run against the Postgres backend.

    Reads the BASIC_MEMORY_TEST_POSTGRES environment variable and treats the
    case-insensitive values "1", "true", and "yes" as enabled; anything else
    (including an unset variable) counts as disabled.
    """
    flag = os.environ.get("BASIC_MEMORY_TEST_POSTGRES", "")
    return flag.lower() in ("1", "true", "yes")
| 34 | + |
| 35 | + |
def _unit_vector(dimensions: int) -> list[float]:
    """Build a deterministic unit-norm vector of length ``dimensions``.

    The first component is 1.0 and every other component is 0.0. The count
    queries under test do not look at the values; a normalized vector simply
    keeps the stored embedding row well-formed.
    """
    components = [0.0 for _ in range(dimensions)]
    # Indexing (rather than a conditional comprehension) keeps the IndexError
    # for dimensions < 1 instead of silently returning an empty vector.
    components[0] = 1.0
    return components
| 43 | + |
| 44 | + |
@pytest.mark.asyncio
async def test_embedding_status_reads_real_vec0_table(engine_factory, test_project, config_manager):
    """Verify get_embedding_status reports a populated, real vec0 table as healthy.

    Regression guard: previously the vec0 JOIN executed on a session that had
    not loaded sqlite-vec, SQLite raised "no such module: vec0", and the except
    block translated that into vector_tables_exist=False +
    reindex_recommended=True.
    """
    # Trigger: the Postgres test matrix runs this same suite.
    # Why: vec0 and per-connection sqlite-vec loading exist only on SQLite.
    # Outcome: run the regression only on the backend that can hit this path.
    if _is_postgres():
        pytest.skip("Real vec0 table handling is SQLite-specific.")

    # Trigger: a Python build without SQLite extension loading (#711 — python.org
    #   macOS / some Windows interpreters lack enable_load_extension).
    # Why: setup creates a REAL vec0 virtual table, which cannot be done without
    #   loading the sqlite-vec extension.
    # Outcome: skip as an environment-capability gap; graceful degradation in
    #   that scenario is covered by the unit test.
    probe_conn = sqlite3.connect(":memory:")
    can_load_extensions = hasattr(probe_conn, "enable_load_extension")
    probe_conn.close()  # always release the probe before deciding to skip
    if not can_load_extensions:
        pytest.skip(
            "Python build does not support SQLite extension loading — "
            "cannot create real vec0 tables"
        )

    engine, session_maker = engine_factory
    project_id = test_project.id

    # --- Create the REAL vec0 table through the search repository ---
    # Semantic search is switched on so _ensure_vector_tables builds the
    # vec0-backed search_vector_embeddings table.
    semantic_config = BasicMemoryConfig(
        env="test",
        database_backend=DatabaseBackend.SQLITE,
        semantic_search_enabled=True,
    )
    vector_repo = SQLiteSearchRepository(
        session_maker,
        project_id=project_id,
        app_config=semantic_config,
    )
    await vector_repo._ensure_vector_tables()
    vector_width = vector_repo._vector_dimensions

    # --- Seed one entity plus its search_index row so counts are non-zero ---
    # Going through the repository applies model-level defaults (external_id).
    note_fields = {
        "title": "Vec Note",
        "note_type": "note",
        "content_type": "text/markdown",
        "project_id": project_id,
        "permalink": "vec-note",
        "file_path": "vec-note.md",
    }
    note = await EntityRepository(session_maker, project_id=project_id).create(note_fields)
    note_id = note.id

    insert_search_row = text(
        "INSERT INTO search_index "
        "(id, entity_id, project_id, type, title, permalink, content_stems, "
        "content_snippet, file_path, metadata) "
        "VALUES (:id, :eid, :pid, 'entity', 'Vec Note', 'vec-note', "
        "'vec content', 'vec snippet', 'vec-note.md', '{}')"
    )
    async with db.scoped_session(session_maker) as session:
        await session.execute(
            insert_search_row,
            {"id": note_id, "eid": note_id, "pid": project_id},
        )
        await session.commit()

    # --- Add one chunk and a real embedding row in the vec0 table ---
    # _write_embeddings stores the embedding in the vec0 virtual table keyed by
    # rowid == chunk id, the same way the reindex path does.
    insert_chunk_row = text(
        "INSERT INTO search_vector_chunks "
        "(entity_id, project_id, chunk_key, chunk_text, source_hash, "
        "entity_fingerprint, embedding_model) "
        "VALUES (:eid, :pid, 'chunk-1', 'vec content', 'hash', "
        "'fp-hash', 'bge-small-en-v1.5') "
        "RETURNING id"
    )
    async with db.scoped_session(session_maker) as session:
        # The write session needs sqlite-vec loaded to touch the vec0 table.
        await vector_repo._ensure_sqlite_vec_loaded(session)
        inserted = await session.execute(
            insert_chunk_row,
            {"eid": note_id, "pid": project_id},
        )
        chunk_id = inserted.scalar_one()

        await vector_repo._write_embeddings(
            session,
            [(chunk_id, "vec content")],
            [_unit_vector(vector_width)],
        )
        await session.commit()

    # Drop the pooled connection that has sqlite-vec loaded. The extension is
    # loaded per connection, so disposing pushes get_embedding_status onto a
    # brand-new connection that never loaded it — the #658 condition (e.g. a
    # fresh `bm project info` process after `bm reindex --embeddings`).
    await engine.dispose()

    # --- Ask for status via a fresh ProjectRepository (no extension preloaded) ---
    status_service = ProjectService(ProjectRepository(session_maker))
    status = await status_service.get_embedding_status(project_id)

    assert status.semantic_search_enabled is True
    # A successful vec0 JOIN means the table is reported present and healthy.
    assert status.vector_tables_exist is True
    assert status.reindex_recommended is False
    assert status.reindex_reason is None
    # Counts must mirror the seeded data, not the false "0" of the unavailable path.
    assert status.total_indexed_entities == 1
    assert status.total_chunks == 1
    assert status.total_entities_with_chunks == 1
    assert status.total_embeddings == 1
    assert status.orphaned_chunks == 0
0 commit comments