test: address PR 593 review feedback

phernandez · phernandez · commit 0c530c1b7e2c · 2026-02-20T19:43:49.000-06:00
Signed-off-by: phernandez <paul@basicmachines.co>
diff --git a/src/basic_memory/repository/sqlite_search_repository.py b/src/basic_memory/repository/sqlite_search_repository.py
@@ -58,6 +58,9 @@ def __init__(
         self._vector_dimensions = 384
 
         if self._semantic_enabled and self._embedding_provider is None:
+            # Constraint: SQLite maps L2 distance to cosine similarity via 1 - L2²/2.
+            # This conversion is correct only for unit-normalized embeddings.
+            # Provider implementations must return normalized vectors.
             self._embedding_provider = create_embedding_provider(self._app_config)
         if self._embedding_provider is not None:
             self._vector_dimensions = self._embedding_provider.dimensions
diff --git a/test-int/semantic/test_search_diagnostics.py b/test-int/semantic/test_search_diagnostics.py
@@ -18,6 +18,7 @@
     SearchCombo,
     create_search_service,
     _create_fastembed_provider,
+    skip_if_needed,
 )
 
 
@@ -134,6 +135,7 @@ async def test_similarity_score_spread(sqlite_engine_factory, tmp_path):
     If the top relevant result and the worst irrelevant result have similar scores,
     the similarity formula is too compressed to be useful.
     """
+    skip_if_needed(DIAG_COMBO)
     provider = _create_fastembed_provider()
     service = await create_search_service(
         sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -189,6 +191,7 @@ async def test_observation_noise_vs_entity(sqlite_engine_factory, tmp_path):
     A common issue: observations like "Dark cocoa powder gives richer flavor"
     can match broadly because they lack parent context.
     """
+    skip_if_needed(DIAG_COMBO)
     provider = _create_fastembed_provider()
     service = await create_search_service(
         sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -244,6 +247,7 @@ async def test_rrf_fusion_preserves_strong_vector_match(sqlite_engine_factory, t
     This is the core claim of issue #577 — that RRF dilutes strong vector scores.
     Let's verify with a controlled corpus.
     """
+    skip_if_needed(DIAG_COMBO)
     provider = _create_fastembed_provider()
     service = await create_search_service(
         sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -307,9 +311,12 @@ async def test_rrf_fusion_preserves_strong_vector_match(sqlite_engine_factory, t
 async def test_similarity_formula_analysis(sqlite_engine_factory, tmp_path):
     """Analyze the raw distance-to-similarity mapping for real queries.
 
-    The current formula: similarity = 1 / (1 + distance)
-    This test dumps raw distances so we can evaluate alternative formulas.
+    Production formulas are backend-specific:
+    - SQLite: similarity = max(0, 1 - L2²/2) for normalized embeddings
+    - Postgres: similarity = max(0, 1 - cosine_distance)
+    This test compares old and new mappings for diagnostics.
     """
+    skip_if_needed(DIAG_COMBO)
     provider = _create_fastembed_provider()
     service = await create_search_service(
         sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -353,6 +360,7 @@ async def test_similarity_formula_analysis(sqlite_engine_factory, tmp_path):
 @pytest.mark.benchmark
 async def test_min_similarity_filters_noise(sqlite_engine_factory, tmp_path):
     """Verify that min_similarity actually removes low-quality matches."""
+    skip_if_needed(DIAG_COMBO)
     provider = _create_fastembed_provider()
     service = await create_search_service(
         sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -419,6 +427,7 @@ async def test_min_similarity_filters_noise(sqlite_engine_factory, tmp_path):
 @pytest.mark.benchmark
 async def test_chunking_produces_reasonable_chunks(sqlite_engine_factory, tmp_path):
     """Verify that the chunking logic produces chunks with enough context."""
+    skip_if_needed(DIAG_COMBO)
     provider = _create_fastembed_provider()
     service = await create_search_service(
         sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
diff --git a/tests/repository/test_distance_to_similarity.py b/tests/repository/test_distance_to_similarity.py
@@ -0,0 +1,24 @@
+"""Unit tests for backend-specific distance-to-similarity conversions."""
+
+import pytest
+
+from basic_memory.repository.postgres_search_repository import PostgresSearchRepository
+from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository
+
+
+def test_sqlite_distance_to_similarity_formula():
+    """SQLite converts L2 distance to cosine similarity for normalized vectors."""
+    repo = SQLiteSearchRepository.__new__(SQLiteSearchRepository)
+
+    assert repo._distance_to_similarity(0.0) == 1.0
+    assert repo._distance_to_similarity(1.0) == pytest.approx(0.5)
+    assert repo._distance_to_similarity(2.0) == 0.0
+
+
+def test_postgres_distance_to_similarity_formula():
+    """Postgres converts pgvector cosine distance to cosine similarity."""
+    repo = PostgresSearchRepository.__new__(PostgresSearchRepository)
+
+    assert repo._distance_to_similarity(0.0) == 1.0
+    assert repo._distance_to_similarity(1.0) == 0.0
+    assert repo._distance_to_similarity(2.0) == 0.0