Skip to content

Commit d6de38c

Browse files
dzmitrys-dev and claude committed
fix(14-A): handle numpy-array sparse-vector attrs in ingest()
fastembed's sparse embedders return numpy.ndarray for `.indices` and `.values`. The previous `getattr(svec, "indices", []) or []` short-circuit raised "ValueError: The truth value of an array with more than one element is ambiguous" because `numpy.ndarray.__bool__` raises for any array holding more than one element. Replace the truthiness fallback with explicit `is None` checks. Convert indices to plain int and values to plain float so the Qdrant SparseVector gets native Python types regardless of the embedder's output dtype.

Discovered while running the first end-to-end longmemeval_s ingest against live Qdrant — the unit suite mocked sparse embedders with plain Python lists, so the numpy path was never exercised.

Add regression test `test_ingest_handles_numpy_sparse_vector_attrs` that fakes numpy-shaped sparse outputs and asserts ingest() succeeds with native int/float lists in the upserted SparseVector.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 4ffef84 commit d6de38c

2 files changed

Lines changed: 71 additions & 2 deletions

File tree

src/supamem/eval/longmemeval_ingest.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,15 @@ def ingest(
192192

193193
for (sid, text, axis), dvec, svec in zip(chunks, dense_iter, sparse_iter):
194194
dense_vec = [float(x) for x in dvec]
195+
# NOTE: fastembed returns numpy arrays for `indices`/`values`. The
196+
# `array or default` truthiness pattern raises "truth value of an
197+
# array with more than one element is ambiguous" — use explicit
198+
# None checks instead.
199+
_idx = getattr(svec, "indices", None)
200+
_val = getattr(svec, "values", None)
195201
sparse_vec = qmodels.SparseVector(
196-
indices=list(getattr(svec, "indices", []) or []),
197-
values=[float(v) for v in getattr(svec, "values", []) or []],
202+
indices=[int(i) for i in (_idx if _idx is not None else [])],
203+
values=[float(v) for v in (_val if _val is not None else [])],
198204
)
199205
point = qmodels.PointStruct(
200206
id=point_id,

tests/test_longmemeval_ingest.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,3 +309,66 @@ def test_ingest_does_not_mutate_caller_cfg(patch_embedders: None) -> None:
309309
original_collection = cfg.collection
310310
ingest_mod.ingest(cfg, [rec], client=client, suite="longmemeval_s")
311311
assert cfg.collection == original_collection
312+
313+
314+
# Test 9 — regression for numpy-array sparse-vector handling -----------------
315+
316+
317+
def test_ingest_handles_numpy_sparse_vector_attrs(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Ingest numpy-typed sparse embeddings and upsert native int/float lists.

    Regression: the previous ``getattr(svec, "indices", []) or []`` short-circuit
    raised ``ValueError: The truth value of an array with more than one element
    is ambiguous`` because ``numpy.ndarray.__bool__`` raises for arrays holding
    more than one element. ``ingest()`` now uses explicit ``is None`` checks.

    The embedders are replaced with fakes whose sparse output exposes
    ``indices``/``values`` as numpy arrays, so the numpy code path is exercised
    without building real embedders. The test is skipped when numpy is not
    installed.
    """
    np = pytest.importorskip("numpy")

    # Known payload, so the final assertions can pin the exact upserted content
    # instead of passing vacuously on an empty sparse vector.
    expected_indices = [0, 1, 2]
    expected_values = [0.5, 0.3, 0.2]

    class _NumpySparse:
        # Mimics fastembed.SparseEmbedding's array-typed attributes.
        def __init__(self) -> None:
            self.indices = np.array(expected_indices, dtype=np.int64)
            self.values = np.array(expected_values, dtype=np.float32)

    def _yield_numpy(batch):
        for _ in batch:
            yield _NumpySparse()

    fake_dense = MagicMock()
    fake_dense.embed.side_effect = lambda batch: ([0.1] * 384 for _ in batch)
    fake_sparse = MagicMock()
    fake_sparse.embed.side_effect = _yield_numpy
    monkeypatch.setattr(ingest_mod, "build_dense_embedder", lambda *a, **k: fake_dense)
    monkeypatch.setattr(ingest_mod, "build_sparse_embedder", lambda *a, **k: fake_sparse)

    client = MagicMock()
    client.get_collections.return_value = MagicMock(collections=[])

    rec = _make_raw_record(
        "q",
        "single_session_user",
        sessions=[[{"role": "user", "content": "hello"}]],
    )
    cfg = _cfg()
    # Must not raise: ingest() guards the numpy arrays with `is None` checks
    # rather than relying on their truthiness.
    count = ingest_mod.ingest(cfg, [rec], client=client, suite="longmemeval_s")
    assert count == 1

    upsert_calls = client.upsert.call_args_list
    assert upsert_calls, "expected at least one upsert call"
    first_call = upsert_calls[0]
    # `points` may have been passed by keyword or as the second positional arg.
    points = first_call.kwargs.get("points") or first_call.args[1]
    vectors = points[0].vector

    # Prefer the conventional key; the sparse-vector key may differ, so fall
    # back to the first value that looks like a sparse vector.
    sparse = vectors["__sparse__"] if "__sparse__" in vectors else None
    if sparse is None:
        sparse = next(
            (
                candidate
                for candidate in vectors.values()
                if hasattr(candidate, "indices") and hasattr(candidate, "values")
            ),
            None,
        )
    assert sparse is not None, "no sparse vector found in the upserted point"

    # Content check: the fake embedder's payload must arrive intact. float32
    # inputs widen to Python floats, hence the approximate comparison.
    assert list(sparse.indices) == expected_indices
    assert list(sparse.values) == pytest.approx(expected_values, rel=1e-6)

    # Exact-type check on purpose: numpy.float64 subclasses float, so
    # isinstance() alone would not prove the values are native Python floats.
    assert all(type(i) is int for i in sparse.indices)
    assert all(type(v) is float for v in sparse.values)

0 commit comments

Comments
 (0)