fix(14-A): write chunk text under payload['document'] to match retrieval contract

dzmitrys-dev · claude · dzmitrys-dev · commit fb8e040aae1e · 2026-05-05T16:25:48.000+03:00
Production retrieval (`tuned_hybrid.py`) reads chunk body via
`payload.get("document")`. Phase 14-A's `ingest()` was writing
`payload["text"]` instead, so every bench query returned chunk objects
with empty `.text` attributes — the scoped/unscoped passes ran end-to-end
but measured question-only token counts (`tpca ≈ input_tokens_p50`)
instead of retrieval-augmented context.

Discovered by direct Qdrant payload scroll comparison after the first
full bench run produced a deceptive-looking PASS (tpca=25.94 vs gate
≤962.2) with `recall_at_5=0.0` across all 470 records and 5 axes —
a classic too-good-to-be-true result.

Add an explicit anti-regression assertion in
`test_ingest_upserts_one_point_per_haystack_turn`: payload MUST contain
"document" and MUST NOT contain "text". Locks the contract so future
edits can't silently re-introduce the field-name drift.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/supamem/eval/longmemeval_ingest.py b/src/supamem/eval/longmemeval_ingest.py
@@ -209,8 +209,13 @@ def ingest(
                 _SPARSE_VECTOR_NAME: sparse_vec,
             },
             payload={
+                # Production retrieval (`tuned_hybrid.py`) reads chunk text
+                # from `payload["document"]`. Using "text" here makes
+                # retrieved chunks have an empty `.text` attribute — the
+                # bench scoped/unscoped passes then measure nothing
+                # meaningful. Match the production contract.
                 "session_id": sid,
-                "text": text,
+                "document": text,
                 "axis": axis,
             },
         )
diff --git a/tests/test_longmemeval_ingest.py b/tests/test_longmemeval_ingest.py
@@ -196,7 +196,7 @@ def test_ingest_payload_index_idempotent(patch_embedders: None) -> None:
 
 
 def test_ingest_upserts_one_point_per_haystack_turn(patch_embedders: None) -> None:
-    """2 sessions × 3 turns → exactly 6 upserted points; payloads carry session_id + text."""
+    """2 sessions × 3 turns → exactly 6 upserted points; payloads carry session_id + document."""
     client = MagicMock()
     client.get_collections.return_value = MagicMock(collections=[])
 
@@ -231,7 +231,16 @@ def test_ingest_upserts_one_point_per_haystack_turn(patch_embedders: None) -> No
         payload = getattr(pt, "payload", None)
         assert payload is not None
         assert "session_id" in payload
-        assert "text" in payload
+        # Production retrieval contract: chunk text MUST live under
+        # `payload["document"]` so `tuned_hybrid.py` reads it via
+        # `payload.get("document")`. Writing to `payload["text"]` would
+        # make retrieved chunks have empty `.text` attributes and the
+        # bench would silently measure question-only token counts.
+        assert "document" in payload
+        assert "text" not in payload, (
+            "ingest must write chunk text under payload['document'], not 'text', "
+            "to match the production retrieval contract (tuned_hybrid)."
+        )
         assert payload["session_id"] in {"sess-A", "sess-B"}