fix: vision loss forward pass falls back to exclude on crash (#223)

abrichr · claude · web-flow · commit d348f1b6bca4 · 2026-03-29T09:53:49.000-04:00
Qwen3's vision-language merge changes internal sequence length
unpredictably. Both include and checkpoint modes crash intermittently
with attention mask mismatches (mask too large OR too small depending
on generated sequence length).

Fix: catch IndexError/RuntimeError from the vision forward pass and
retry with exclude mode (text-only, no vision tensors) for that step.
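Training no longer crashes on these errors — some steps get vision-aware
gradients, some get text-only gradients, but all steps contribute to
learning. Errors raised in exclude mode, or by the fallback pass itself,
still propagate.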

This is the pragmatic fix. The proper fix (capturing logits during
generation to avoid re-forward entirely) is future work.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/openadapt_evals/training/standalone/trainer.py b/openadapt_evals/training/standalone/trainer.py
@@ -446,24 +446,38 @@ def _compute_rollout_loss(self, rollout: Rollout, advantage: float, scale: float
 
             full_inputs = {k: v.to(device) for k, v in full_inputs.items()}
 
-            outputs = self._model(**full_inputs)
-
-            # For "exclude" mode, logits shape matches input_ids (no vision merge).
-            # For "include"/"checkpoint", Qwen3's vision merge changes the
-            # sequence length.  Slice action logits from the END of the
-            # output sequence (action tokens are always last).
             n_action = action_ids.shape[1]
-            if vision_loss_mode == "exclude":
-                al = outputs.logits[:, prompt_len - 1: prompt_len - 1 + n_action, :]
-            else:
-                # Post-merge: total output length differs from input_ids length.
-                # Action tokens are the last n_action tokens in the sequence.
-                seq_len = outputs.logits.shape[1]
-                al = outputs.logits[:, seq_len - n_action - 1: seq_len - 1, :]
 
-            lp = torch.nn.functional.log_softmax(al, dim=-1)
+            # Forward pass with fallback: if include/checkpoint mode crashes
+            # due to Qwen3's vision merge changing sequence length (attention
+            # mask mismatch), retry with exclude mode for this step.
+            try:
+                outputs = self._model(**full_inputs)
+                if vision_loss_mode == "exclude":
+                    al = outputs.logits[:, prompt_len - 1: prompt_len - 1 + n_action, :]
+                else:
+                    seq_len = outputs.logits.shape[1]
+                    al = outputs.logits[:, seq_len - n_action - 1: seq_len - 1, :]
+            except (IndexError, RuntimeError) as fwd_err:
+                if vision_loss_mode != "exclude":
+                    logger.warning(
+                        "Vision forward pass failed (%s), retrying with "
+                        "exclude mode for this step: %s",
+                        vision_loss_mode, fwd_err,
+                    )
+                    fallback_inputs = {
+                        k: v for k, v in prompt_inputs.items()
+                        if k not in _VISION_KEYS
+                    }
+                    fallback_inputs["input_ids"] = full_ids
+                    fallback_inputs["attention_mask"] = torch.ones_like(full_ids)
+                    fallback_inputs = {k: v.to(device) for k, v in fallback_inputs.items()}
+                    outputs = self._model(**fallback_inputs)
+                    al = outputs.logits[:, prompt_len - 1: prompt_len - 1 + n_action, :]
+                else:
+                    raise
 
-            # Gather log-probs for the actual action token IDs
+            lp = torch.nn.functional.log_softmax(al, dim=-1)
             action_token_ids = action_ids.to(device)
             tlp = lp.gather(2, action_token_ids.unsqueeze(-1)).squeeze(-1)
             slp = tlp.sum()