fix(rerank): unswallow exception, shim Qwen2 prepare_for_model, cache reranker on backend

dzmitrys-dev · dzmitrys-dev · commit 18772e537323 · 2026-05-04T19:45:10.000+03:00
Three independent retrieval-path bugs surfaced by Phase 10 --full LongMemEval_S:

1. tuned_hybrid.py was swallowing every reranker exception with a broad
   `except Exception` that discarded the error, hiding the actual
   failure cause and violating the project hard constraint (CLAUDE.md:
   "NEVER suppress errors in indexing/retrieval paths"). The handler
   still falls back to the off-branch, but now logs the exception type
   and message plus the full traceback.

2. mxbai-rerank 0.1.6 calls `tokenizer.prepare_for_model(...)`
   unconditionally, but transformers >=4.50 no longer exposes this
   method on the slow Qwen2Tokenizer (and PreTrainedTokenizerBase no
   longer provides a fallback impl). mxbai-rerank upstream is
   effectively unmaintained (only 1 PR ever merged, no fix released).
   Bound a minimal hand-written prepare_for_model on the tokenizer
   instance covering the exact call signature mxbai uses
   (add_special_tokens=False, padding=False, truncation="only_second").

3. tuned_hybrid.query() called load_reranker() on every retrieval call,
   constructing a fresh wrapper with `_model=None` each time, which
   then triggered a fresh model weight load. Cache the reranker on the
   backend instance keyed by name so a long-lived backend loads weights
   once, and reloads only if the configured reranker name changes.

Verified via uv run pytest (620 passed, 4 skipped, 1 upstream warning)
and end-to-end via supamem eval --suite longmemeval_s --full
(470 questions; rerank now actually runs, weights load once per process).
diff --git a/src/supamem/rerankers/mxbai_v2.py b/src/supamem/rerankers/mxbai_v2.py
@@ -35,6 +35,66 @@ def _ensure(self) -> Any:
             from mxbai_rerank import MxbaiRerankV2  # noqa: PLC0415
 
             self._model = MxbaiRerankV2(self.config.reranker_model_id)
+
+            # Compatibility shim: mxbai-rerank 0.1.6 calls
+            # `tokenizer.prepare_for_model(ids, pair_ids, ...)` unconditionally,
+            # but the slow Qwen2Tokenizer in transformers >=4.50 no longer
+            # exposes this method (and PreTrainedTokenizerBase no longer
+            # provides a fallback impl as of recent releases). Upstream
+            # mxbai-rerank is unmaintained (no fix released as of 2026-05-04).
+            # Implement the minimal call signature mxbai uses:
+            # add_special_tokens=False, padding=False, truncation="only_second".
+            try:
+                tok = getattr(self._model, "tokenizer", None)
+                if tok is not None and not hasattr(tok, "prepare_for_model"):
+                    def _shim_prepare_for_model(
+                        ids,
+                        pair_ids=None,
+                        *,
+                        truncation=None,
+                        max_length=None,
+                        padding=False,  # noqa: ARG001
+                        return_attention_mask=False,
+                        return_token_type_ids=False,
+                        add_special_tokens=False,  # noqa: ARG001
+                        **_kwargs,
+                    ):
+                        a = list(ids)
+                        b = list(pair_ids) if pair_ids is not None else []
+                        if max_length is not None:
+                            if truncation == "only_second":
+                                budget = max_length - len(a)
+                                if budget < 0:
+                                    a = a[:max_length]
+                                    b = []
+                                elif len(b) > budget:
+                                    b = b[:budget]
+                            elif truncation in (
+                                True,
+                                "longest_first",
+                                "only_first",
+                            ):
+                                while len(a) + len(b) > max_length:
+                                    if truncation == "only_first" or len(a) > len(b):
+                                        a.pop()
+                                    else:
+                                        b.pop()
+                        combined = a + b
+                        out: dict[str, Any] = {"input_ids": combined}
+                        if return_attention_mask:
+                            out["attention_mask"] = [1] * len(combined)
+                        if return_token_type_ids:
+                            out["token_type_ids"] = [0] * len(a) + [1] * len(b)
+                        return out
+
+                    tok.prepare_for_model = _shim_prepare_for_model
+            except Exception as _shim_exc:  # noqa: BLE001
+                err_console.print(
+                    "[supamem.warn]prepare_for_model shim failed "
+                    f"({type(_shim_exc).__name__}: {_shim_exc}); "
+                    "rerank may still raise"
+                )
+
             elapsed_ms = (time.perf_counter() - t0) * 1000.0
             try:
                 from supamem.stats.counter import bump  # noqa: PLC0415
diff --git a/src/supamem/retrieval/tuned_hybrid.py b/src/supamem/retrieval/tuned_hybrid.py
@@ -171,6 +171,12 @@ def __init__(self, *, config: ResolvedConfig, minimal_setup: bool = False) -> No
         self._client: Any | None = None
         self._dense: Any | None = None
         self._sparse: Any | None = None
+        # D-POOL: cache the reranker plugin instance on the backend so a
+        # process-long retrieval session reuses one model load. load_reranker
+        # constructs a fresh wrapper per call; without this cache the
+        # wrapper's _ensure() lazy-load reloads the model every query.
+        self._reranker: Any | None = None
+        self._reranker_name: str | None = None
         self._minimal_setup = minimal_setup
 
     def _ensure(self):
@@ -213,13 +219,25 @@ def query(
         from supamem.rerankers import load_reranker  # noqa: PLC0415
 
         reranker_name = getattr(self.config, "reranker_name", "off")
-        try:
-            reranker = load_reranker(reranker_name, self.config)
-        except LookupError:
-            # Fail-soft: treat unknown reranker as off; ResolvedConfig's
-            # validation gate (load_config) is the canonical fail-closed
-            # surface — at backend.query() time we never abort retrieval.
+        # D-POOL: reuse cached reranker if name unchanged. load_reranker
+        # constructs a fresh wrapper each call — caching here keeps the
+        # underlying model loaded for the lifetime of the backend.
+        if reranker_name == "off":
             reranker = None
+        elif (
+            self._reranker is not None and self._reranker_name == reranker_name
+        ):
+            reranker = self._reranker
+        else:
+            try:
+                reranker = load_reranker(reranker_name, self.config)
+            except LookupError:
+                # Fail-soft: treat unknown reranker as off; ResolvedConfig's
+                # validation gate (load_config) is the canonical fail-closed
+                # surface — at backend.query() time we never abort retrieval.
+                reranker = None
+            self._reranker = reranker
+            self._reranker_name = reranker_name
 
         # D-POOL-01: widen prefetch only when reranker is on.
         prefetch_limit = (
@@ -299,14 +317,21 @@ def query(
             t0 = _time.perf_counter()
             try:
                 reranked = reranker.rerank(text, pre_rerank)
-            except Exception:
+            except Exception as _rerank_exc:
                 # Plugin failure → fall through to off-path semantics
                 # (T-RERANK-INVAR mitigation: never silently drop hits).
+                # Surface exception class+message so plugin authors and
+                # bench runs can see WHY rerank failed, not just that it did.
+                import traceback as _tb  # noqa: PLC0415
+
                 from supamem.console import err_console  # noqa: PLC0415
 
                 err_console.print(
-                    "[supamem.warn]reranker raised — falling back to off-branch"
+                    f"[supamem.warn]reranker raised "
+                    f"({type(_rerank_exc).__name__}: {_rerank_exc}) "
+                    "— falling back to off-branch"
                 )
+                err_console.print(_tb.format_exc())
                 reranker = None
                 reranked = []
             elapsed_ms = (_time.perf_counter() - t0) * 1000.0