1- """VLM model wrapper for TRL compatibility.
1+ """VLM model patching for TRL compatibility.
22
3- TRL's GRPOTrainer was designed for text-only LLMs. During the training
4- step, it calls model.forward(input_ids=...) to recompute logprobs under
5- the current policy. For multimodal VLMs, this forward pass also needs
6- pixel_values and image_grid_thw — but TRL doesn't know about them .
3+ TRL's GRPOTrainer was designed for text-only LLMs. It unwraps models
4+ via Accelerate, which strips any external wrapper class. The fix:
5+ patch the model's forward() method directly on the instance. This
6+ survives unwrapping because it's on the model object, not a wrapper .
77
8- This wrapper solves the problem by caching vision inputs during rollout
9- generation (when we have the images) and injecting them during TRL's
10- forward pass (when TRL only passes input_ids).
8+ Two functions:
9+ - ``patch_model_for_trl(model)``: patches model.forward to inject
10+ cached pixel_values. Returns a ``cache_vision_inputs`` callable.
11+ - ``VLMModelWrapper``: legacy wrapper class (kept for backward compat,
12+ delegates to patch_model_for_trl internally).
1113
1214Usage:
13- from openadapt_evals.training.vlm_wrapper import VLMModelWrapper
15+ from openadapt_evals.training.vlm_wrapper import patch_model_for_trl
1416
15- wrapper = VLMModelWrapper(model)
16- trainer = GRPOTrainer(model=wrapper, ...)
17+ cache_fn = patch_model_for_trl(model)
1718
1819 # During rollout generation:
1920 inputs = processor(text=..., images=[img], return_tensors="pt")
20- wrapper.cache_vision_inputs (inputs)
21- outputs = wrapper .generate(**inputs, ...)
21+ cache_fn (inputs) # cache pixel_values
22+ outputs = model .generate(**inputs, ...) # model sees image ✓
2223
2324 # During TRL's training forward pass:
24- # TRL calls wrapper.forward(input_ids=...) — we inject cached vision inputs
25+ # TRL calls model.forward(input_ids=...) → patched forward injects
26+ # cached pixel_values automatically. Model sees image ✓
2527"""
2628
2729from __future__ import annotations
2830
2931import logging
30- from typing import Any
32+ from typing import Any , Callable
3133
3234logger = logging .getLogger (__name__ )
3335
3436
35- class VLMModelWrapper :
36- """Wraps a VLM so TRL's forward pass gets pixel_values.
def patch_model_for_trl(model: Any) -> Callable[[dict[str, Any]], None]:
    """Patch a VLM's forward() to auto-inject cached vision inputs.

    This patches the model instance directly (not a wrapper class),
    so it survives TRL/Accelerate unwrapping.

    Args:
        model: A HuggingFace VLM (may be a PeftModel).

    Returns:
        A ``cache_vision_inputs(inputs_dict)`` function. Call this during
        rollout generation to cache pixel_values for the training forward.
    """
    # Mutable state shared between cache_vision_inputs and the patched
    # forward. Lists are used for the log flags so the closures can
    # mutate them without ``nonlocal``.
    _cache: dict[str, Any] = {}
    _logged_inject = [False]
    _logged_miss = [False]

    original_forward = model.forward

    def _patched_forward(input_ids: Any = None, **kwargs: Any) -> Any:
        """Forward with automatic vision input injection."""
        if "pixel_values" not in kwargs and _cache:
            for key, val in _cache.items():
                if key not in kwargs:
                    # Move cached tensors onto input_ids' device when both
                    # sides support it; otherwise pass the value through.
                    if hasattr(val, "to") and hasattr(input_ids, "device"):
                        kwargs[key] = val.to(input_ids.device)
                    else:
                        kwargs[key] = val
            if not _logged_inject[0]:
                _logged_inject[0] = True
                logger.info(
                    "VLM forward patch: injecting cached vision inputs "
                    "(keys=%s). TRL called forward() without pixel_values.",
                    list(_cache.keys()),
                )
        elif "pixel_values" not in kwargs and not _cache:
            if not _logged_miss[0]:
                _logged_miss[0] = True
                logger.warning(
                    "VLM forward patch: forward() called without pixel_values "
                    "and no cache. Model is blind. Call cache_fn() first.",
                )
        return original_forward(input_ids=input_ids, **kwargs)

    # Patch the model instance. Patching forward alone is sufficient:
    # Python resolves dunder methods like __call__ on the *class*, so an
    # instance-level __call__ attribute would never be consulted by the
    # model(...) call syntax anyway. HF/torch modules route __call__
    # through self.forward, which picks up this instance-level patch, so
    # both model(...) and model.forward(...) get the injection.
    model.forward = _patched_forward

    logger.info(
        "VLM forward patch installed on %s. Vision inputs will be "
        "auto-injected during TRL's forward passes.",
        type(model).__name__,
    )

    def cache_vision_inputs(inputs: dict[str, Any]) -> None:
        """Cache vision tensors for injection into forward passes.

        Args:
            inputs: Dict from processor(text=..., images=...) or a dict
                with pixel_values and optionally image_grid_thw.
        """
        _cache.clear()
        for key in ("pixel_values", "image_grid_thw"):
            if key in inputs:
                val = inputs[key]
                # Detach/clone tensors so the cache holds no autograd graph
                # references from the rollout pass.
                if hasattr(val, "detach"):
                    _cache[key] = val.detach().clone()
                else:
                    _cache[key] = val
        if _cache:
            logger.debug("Cached vision inputs: keys=%s", list(_cache.keys()))

    return cache_vision_inputs
130+
131+
132+ class VLMModelWrapper :
133+ """Legacy wrapper — delegates to patch_model_for_trl internally.
134+
135+ Kept for backward compatibility with existing code that creates
136+ VLMModelWrapper(model). New code should use patch_model_for_trl()
137+ directly and pass the original model to TRL.
44138 """
45139
46140 def __init__ (self , model : Any ):
47- # Store model WITHOUT going through __setattr__ (which delegates to model)
48141 object .__setattr__ (self , "_vlm_model" , model )
142+ object .__setattr__ (self , "_cache_fn" , patch_model_for_trl (model ))
49143 object .__setattr__ (self , "_vision_cache" , None )
50144 object .__setattr__ (self , "_cache_hits" , 0 )
51145 object .__setattr__ (self , "_cache_misses" , 0 )
52146
53- # --- PEFT / quantization compatibility ---
54- # TRL's validate_quantization_for_training() checks for PEFT via:
55- # 1. isinstance(model, PeftModel) — fails because wrapper isn't PeftModel
56- # 2. hasattr(model, "peft_config") — works via our __getattr__
57- # 3. Checking model.is_quantized / model.quantization_method
58- #
59- # The isinstance check is the blocker. We solve it by making the
60- # wrapper's __class__ inherit from the wrapped model's type, so
61- # isinstance(wrapper, PeftModel) returns True.
147+ # PEFT isinstance compatibility
62148 try :
63149 from peft import PeftModel
64150 if isinstance (model , PeftModel ):
65- # Create a new class that inherits from BOTH our wrapper
66- # and the actual model class. This makes isinstance work
67- # while keeping our forward/generate/cache methods.
68151 combined = type (
69152 "VLMPeftModelWrapper" ,
70153 (VLMModelWrapper , type (model )),
71154 {
72- # Ensure our methods take priority (MRO)
73155 "forward" : VLMModelWrapper .forward ,
74156 "generate" : VLMModelWrapper .generate ,
75157 "__call__" : VLMModelWrapper .__call__ ,
@@ -78,101 +160,28 @@ def __init__(self, model: Any):
78160 },
79161 )
80162 object .__setattr__ (self , "__class__" , combined )
81- logger .info (
82- "VLMModelWrapper: PEFT isinstance compatibility enabled "
83- "(wrapped model is %s)" , type (model ).__name__ ,
84- )
85- except ImportError :
163+ except (ImportError , Exception ):
86164 pass
87- except Exception as exc :
88- # If dynamic class fails, fall back to attribute-level compat
89- logger .warning (
90- "VLMModelWrapper: PEFT isinstance setup failed: %s. "
91- "Falling back to attribute-level compatibility." , exc ,
92- )
93165
    def cache_vision_inputs(self, inputs: dict[str, Any]) -> None:
        """Cache vision tensors (pixel_values / image_grid_thw) for later
        injection into the training forward pass.

        Delegates to the cache function returned by patch_model_for_trl(),
        stored on the wrapper as ``_cache_fn`` in __init__.

        Args:
            inputs: Processor output dict containing pixel_values and
                optionally image_grid_thw.
        """
        # object.__getattribute__ reads the wrapper's own attribute directly,
        # bypassing this class's __getattr__/__setattr__ delegation to the
        # wrapped model.
        cache_fn = object.__getattribute__(self, "_cache_fn")
        cache_fn(inputs)
116169
    def forward(self, input_ids: Any = None, **kwargs: Any) -> Any:
        """Delegate to the wrapped model's forward.

        The underlying model's forward was already patched by
        patch_model_for_trl() in __init__, so cached vision inputs are
        injected there — no injection logic is needed in the wrapper.
        """
        # Bypass __getattr__ delegation to fetch the wrapper's own attribute.
        model = object.__getattribute__(self, "_vlm_model")
        return model.forward(input_ids=input_ids, **kwargs)
155173
    def generate(self, **kwargs: Any) -> Any:
        """Delegate generation to the wrapped model.

        No interception needed: callers are expected to pass pixel_values
        explicitly to generate() during rollout.
        """
        # Bypass __getattr__ delegation to fetch the wrapper's own attribute.
        model = object.__getattribute__(self, "_vlm_model")
        return model.generate(**kwargs)
161177
    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        """Route call syntax to forward() for TRL compatibility."""
        return self.forward(*args, **kwargs)
165180
    def __getattr__(self, name: str) -> Any:
        """Delegate unresolved attribute reads to the wrapped model.

        Only invoked when normal lookup on the wrapper fails, which makes
        the wrapper transparent (e.g. config, parameters()).
        """
        model = object.__getattribute__(self, "_vlm_model")
        return getattr(model, name)
174184
    def __setattr__(self, name: str, value: Any) -> None:
        """Delegate attribute writes to the wrapped model.

        The wrapper's own state is written via object.__setattr__ in
        __init__ precisely to avoid this delegation.
        """
        model = object.__getattribute__(self, "_vlm_model")
        setattr(model, name, value)
0 commit comments