Skip to content

Commit 0c530c1

Browse files
committed
test: address PR 593 review feedback
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 6a18f1e commit 0c530c1

3 files changed

Lines changed: 38 additions & 2 deletions

File tree

src/basic_memory/repository/sqlite_search_repository.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ def __init__(
5858
self._vector_dimensions = 384
5959

6060
if self._semantic_enabled and self._embedding_provider is None:
61+
# Constraint: SQLite maps L2 distance to cosine similarity via 1 - L2²/2 (for unit vectors, ‖a − b‖² = 2 − 2·cos(a, b)).
62+
# This conversion is correct only for unit-normalized embeddings.
63+
# Provider implementations must return normalized vectors.
6164
self._embedding_provider = create_embedding_provider(self._app_config)
6265
if self._embedding_provider is not None:
6366
self._vector_dimensions = self._embedding_provider.dimensions

test-int/semantic/test_search_diagnostics.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
SearchCombo,
1919
create_search_service,
2020
_create_fastembed_provider,
21+
skip_if_needed,
2122
)
2223

2324

@@ -134,6 +135,7 @@ async def test_similarity_score_spread(sqlite_engine_factory, tmp_path):
134135
If the top relevant result and the worst irrelevant result have similar scores,
135136
the similarity formula is too compressed to be useful.
136137
"""
138+
skip_if_needed(DIAG_COMBO)
137139
provider = _create_fastembed_provider()
138140
service = await create_search_service(
139141
sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -189,6 +191,7 @@ async def test_observation_noise_vs_entity(sqlite_engine_factory, tmp_path):
189191
A common issue: observations like "Dark cocoa powder gives richer flavor"
190192
can match broadly because they lack parent context.
191193
"""
194+
skip_if_needed(DIAG_COMBO)
192195
provider = _create_fastembed_provider()
193196
service = await create_search_service(
194197
sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -244,6 +247,7 @@ async def test_rrf_fusion_preserves_strong_vector_match(sqlite_engine_factory, t
244247
This is the core claim of issue #577 — that RRF dilutes strong vector scores.
245248
Let's verify with a controlled corpus.
246249
"""
250+
skip_if_needed(DIAG_COMBO)
247251
provider = _create_fastembed_provider()
248252
service = await create_search_service(
249253
sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -307,9 +311,12 @@ async def test_rrf_fusion_preserves_strong_vector_match(sqlite_engine_factory, t
307311
async def test_similarity_formula_analysis(sqlite_engine_factory, tmp_path):
308312
"""Analyze the raw distance-to-similarity mapping for real queries.
309313
310-
The current formula: similarity = 1 / (1 + distance)
311-
This test dumps raw distances so we can evaluate alternative formulas.
314+
Production formulas are backend-specific:
315+
- SQLite: similarity = max(0, 1 - L2²/2) for normalized embeddings
316+
- Postgres: similarity = max(0, 1 - cosine_distance)
317+
This test compares old and new mappings for diagnostics.
312318
"""
319+
skip_if_needed(DIAG_COMBO)
313320
provider = _create_fastembed_provider()
314321
service = await create_search_service(
315322
sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -353,6 +360,7 @@ async def test_similarity_formula_analysis(sqlite_engine_factory, tmp_path):
353360
@pytest.mark.benchmark
354361
async def test_min_similarity_filters_noise(sqlite_engine_factory, tmp_path):
355362
"""Verify that min_similarity actually removes low-quality matches."""
363+
skip_if_needed(DIAG_COMBO)
356364
provider = _create_fastembed_provider()
357365
service = await create_search_service(
358366
sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
@@ -419,6 +427,7 @@ async def test_min_similarity_filters_noise(sqlite_engine_factory, tmp_path):
419427
@pytest.mark.benchmark
420428
async def test_chunking_produces_reasonable_chunks(sqlite_engine_factory, tmp_path):
421429
"""Verify that the chunking logic produces chunks with enough context."""
430+
skip_if_needed(DIAG_COMBO)
422431
provider = _create_fastembed_provider()
423432
service = await create_search_service(
424433
sqlite_engine_factory, DIAG_COMBO, tmp_path, embedding_provider=provider
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""Unit tests for backend-specific distance-to-similarity conversions."""
2+
3+
import pytest
4+
5+
from basic_memory.repository.postgres_search_repository import PostgresSearchRepository
6+
from basic_memory.repository.sqlite_search_repository import SQLiteSearchRepository
7+
8+
9+
def test_sqlite_distance_to_similarity_formula():
10+
"""SQLite converts L2 distance to cosine similarity for normalized vectors."""
11+
repo = SQLiteSearchRepository.__new__(SQLiteSearchRepository)
12+
13+
assert repo._distance_to_similarity(0.0) == 1.0
14+
assert repo._distance_to_similarity(1.0) == pytest.approx(0.5)
15+
assert repo._distance_to_similarity(2.0) == 0.0
16+
17+
18+
def test_postgres_distance_to_similarity_formula():
19+
"""Postgres converts pgvector cosine distance to cosine similarity."""
20+
repo = PostgresSearchRepository.__new__(PostgresSearchRepository)
21+
22+
assert repo._distance_to_similarity(0.0) == 1.0
23+
assert repo._distance_to_similarity(1.0) == 0.0
24+
assert repo._distance_to_similarity(2.0) == 0.0

0 commit comments

Comments
 (0)