fix(api): normalize torch default dtype to float32 after concurrent model init (#2167)

nicoloboschi · web-flow · commit 30fb287d1045 · 2026-06-12T11:51:07.000+02:00
transformers' dtype context manager (entered by SentenceTransformer / CrossEncoder / from_pretrained) does a non-thread-safe save/restore of the process-global default dtype. When an fp16 embedding model and an fp32 reranker/query-analyzer load in parallel during MemoryEngine.initialize(), an unlucky interleave can leave the global default stuck at float16. Every later encode() then emits NaN vectors that pgvector rejects ("NaN not allowed in vector") on MPS, or raises "c10::Half != float" on CPU — non-deterministically across restarts.

Keep the model loads fully parallel and, once asyncio.gather() has joined every load thread, normalize the global default dtype back to float32 — the inference state a healthy boot already converges to. The reset is race-free (all load threads have finished) and only touches torch if it is already imported, i.e. if a local provider actually loaded a model.

Fixes #2162
diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
@@ -15,6 +15,7 @@
 import inspect
 import json
 import logging
+import sys
 import time
 import uuid
 from collections.abc import Awaitable, Callable
@@ -2567,6 +2568,28 @@ async def verify_llm():
                 f"first-time model download legitimately needs more time."
             ) from e
 
+        # Normalize torch's process-global default dtype back to float32 after the
+        # concurrent local model loads. transformers' dtype context manager (entered
+        # by SentenceTransformer / CrossEncoder / from_pretrained) does a
+        # NON-thread-safe save/restore of the global default dtype: when an fp16 and
+        # an fp32 model load in parallel above, an unlucky interleave can leave the
+        # default stuck at float16, after which every encode() emits NaN vectors that
+        # pgvector rejects ("NaN not allowed in vector") on MPS, or raises
+        # "c10::Half != float" on CPU — non-deterministically across restarts. By the
+        # time gather() returns, all load threads have joined, so resetting the
+        # default here is race-free, keeps the loads fully parallel, and converges on
+        # the float32 inference state a healthy boot already reaches. torch is only
+        # imported (in sys.modules) if a local provider actually loaded a model.
+        # See https://github.com/vectorize-io/hindsight/issues/2162.
+        torch_mod = sys.modules.get("torch")
+        if torch_mod is not None and torch_mod.get_default_dtype() != torch_mod.float32:
+            logger.warning(
+                "torch default dtype was left at %s after concurrent model init; "
+                "restoring float32 to avoid NaN embedding vectors (issue #2162).",
+                torch_mod.get_default_dtype(),
+            )
+            torch_mod.set_default_dtype(torch_mod.float32)
+
         # Run database migrations if enabled
         if self._run_migrations:
             if not self.db_url:
diff --git a/hindsight-api-slim/tests/test_model_load_default_dtype.py b/hindsight-api-slim/tests/test_model_load_default_dtype.py
@@ -0,0 +1,90 @@
+"""
+Startup must leave torch's global default dtype at float32 regardless of how the
+concurrent local model loads interleave.
+
+Covers issue #2162: transformers' dtype context manager (entered by
+SentenceTransformer / CrossEncoder / from_pretrained) does a NON-thread-safe
+save/restore of the *process-global* default dtype. When an fp16 embedding model
+and an fp32 reranker/query-analyzer load in parallel at startup, an unlucky
+interleave leaves the global default stuck at float16 — every later encode() then
+emits NaN vectors that pgvector rejects ("NaN not allowed in vector") on MPS, or
+raises "c10::Half != float" on CPU, non-deterministically across restarts.
+
+MemoryEngine.initialize() loads the models in parallel (for speed) and then, once
+the gather has joined every load thread, normalizes the global default dtype back
+to float32 — the inference state a healthy boot already converges to. This test
+simulates the poisoning by having a model load flip the default to float16, then
+asserts initialize() leaves it at float32.
+"""
+
+import pytest
+
+from hindsight_api import MemoryEngine
+from hindsight_api.engine.task_backend import SyncTaskBackend
+
+
+class _StopInit(Exception):
+    """Sentinel the stubbed backend raises to abort initialize() after the model-load gather."""
+
+
+class _PoisoningEmbeddings:
+    """Local embedding stub: mimics an fp16 load by leaving torch's global default at float16."""
+
+    provider_name = "local"  # NOTE(review): presumably selects the local-provider path; confirm
+
+    async def initialize(self) -> None:
+        import torch
+
+        # Reproduce the symptom of transformers' racy dtype restore: the
+        # process-global default is left at float16 once this "load" returns.
+        torch.set_default_dtype(torch.float16)
+
+
+class _NoopCrossEncoder:  # cross-encoder stand-in with nothing to load
+    provider_name = "local"
+
+    async def initialize(self) -> None:  # intentionally does nothing
+        return None
+
+
+class _NoopQueryAnalyzer:  # query-analyzer stand-in with nothing to load
+    def load(self) -> None:  # synchronous, unlike the async initialize() of the other stubs
+        return None
+
+
+@pytest.mark.asyncio
+async def test_global_default_dtype_restored_to_float32_after_init():
+    """initialize() must reset a float16 default left by a model load back to float32."""
+    torch = pytest.importorskip("torch")  # skip (rather than error) where torch is not installed
+
+    original = torch.get_default_dtype()  # saved so the process-global can be put back below
+    try:
+        engine = MemoryEngine(
+            # Non-pg0 URL so start_pg0() is a no-op and __init__ never connects.
+            db_url="postgresql://u:p@localhost:5999/db",
+            memory_llm_provider="none",
+            memory_llm_api_key=None,
+            memory_llm_model="none",
+            embeddings=_PoisoningEmbeddings(),
+            cross_encoder=_NoopCrossEncoder(),
+            query_analyzer=_NoopQueryAnalyzer(),
+            run_migrations=False,
+            skip_llm_verification=True,
+            lazy_reranker=False,  # load the cross-encoder eagerly, in the gather
+            task_backend=SyncTaskBackend(),
+        )
+
+        # Abort right after the post-gather dtype restore, before any real DB work.
+        async def _stop(*args, **kwargs):
+            raise _StopInit
+
+        engine._backend.initialize = _stop  # type: ignore[method-assign]
+
+        with pytest.raises(_StopInit):
+            await engine.initialize()
+
+        # The embedding load poisoned the default to float16; initialize() must
+        # have normalized it back so later encode() can't emit NaN vectors.
+        assert torch.get_default_dtype() == torch.float32
+    finally:
+        torch.set_default_dtype(original)  # never leak a changed default into other tests