Final fix for macOS CI failure

jlarson4 · jlarson4 · commit 10ef8221622c · 2026-05-22T15:28:06.000-05:00
diff --git a/tests/unit/model_bridge/test_bridge_generate_no_tokenizer.py b/tests/unit/model_bridge/test_bridge_generate_no_tokenizer.py
@@ -29,45 +29,6 @@ def test_generate_without_tokenizer_stop_at_eos_false_kv_cache(tokenizer_free_br
     assert bridge.tokenizer is None
 
     tokens = _PROMPT_TOKENS.clone()
-
-    # === TEMP DEBUG: localize CI-only NaN; remove after diagnosing ===
-    import sys
-
-    def _diag(label: str, t: torch.Tensor) -> None:
-        print(
-            f"[DIAG] {label}: nan={torch.isnan(t).any().item()} "
-            f"inf={torch.isinf(t).any().item()} "
-            f"shape={tuple(t.shape)} dtype={t.dtype} "
-            f"sample={t.flatten()[:4].tolist()}",
-            file=sys.stderr,
-            flush=True,
-        )
-
-    with torch.no_grad():
-        bl = bridge(tokens, return_type="logits")
-    _diag("bridge_fwd_no_cache", bl)
-
-    with torch.no_grad():
-        ho = bridge.original_model(tokens)
-    _diag("hf_fwd_no_cache", ho.logits)
-
-    with torch.no_grad():
-        ho_cache = bridge.original_model(tokens, use_cache=True)
-    _diag("hf_fwd_step0_use_cache", ho_cache.logits)
-    print(
-        f"[DIAG] cache_type={type(ho_cache.past_key_values).__name__}",
-        file=sys.stderr,
-        flush=True,
-    )
-
-    next_id = ho_cache.logits[:, -1, :].argmax(-1, keepdim=True)
-    with torch.no_grad():
-        ho_step1 = bridge.original_model(
-            next_id, past_key_values=ho_cache.past_key_values, use_cache=True
-        )
-    _diag("hf_fwd_step1_with_cache", ho_step1.logits)
-    # === END TEMP DEBUG ===
-
     output = bridge.generate(
         tokens,
         max_new_tokens=3,
diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py
@@ -2386,6 +2386,14 @@ def _generate_tokens(
                                 forward_kwargs["position_ids"] = forward_kwargs["position_ids"][
                                     :, -1:
                                 ]
+                            # HF v5 on macOS-arm64 produces NaNs when it must infer the mask
+                            # from past_key_values and a 1-token input, so pass one explicitly.
+                            if "attention_mask" not in forward_kwargs:
+                                forward_kwargs["attention_mask"] = torch.ones(
+                                    (current_tokens.shape[0], current_tokens.shape[1]),
+                                    dtype=torch.long,
+                                    device=current_tokens.device,
+                                )
                             logits = self(
                                 current_tokens[:, -1:],
                                 return_type="logits",