Skip to content

Commit b017b6e

Browse files
abrichr and claude authored
fix: replace AutoModelForVision2Seq with AutoModelForImageTextToText for transformers 5.x (#178)
* fix: skip verify_apps, close_all, activate_window in lightweight mode These setup entry types hang (120s timeout), crash the WAA server, or are unnecessary for task execution. In lightweight mode (the default), they are now skipped entirely — both the verify_apps step injected from related_apps and any close_all / activate_window entries in the task config array. Each skipped entry is recorded with status "skipped" in _last_setup_results for auditability. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: replace AutoModelForVision2Seq with AutoModelForImageTextToText for transformers 5.x AutoModelForVision2Seq was removed in transformers 5.x (shipped on AWS DL AMI). Use AutoModelForImageTextToText as the primary import with a fallback to AutoModelForVision2Seq for older transformers versions. Files updated: - scripts/train_trl_grpo.py - scripts/train_grpo_example.py - openadapt_evals/agents/qwen3vl_agent.py - openadapt_evals/agents/smol_agent.py - examples/http_agent_server.py (comment only) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5899c4c commit b017b6e

5 files changed

Lines changed: 35 additions & 12 deletions

File tree

examples/http_agent_server.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,12 @@ def load_model():
3636
"""Load your model here. Called once at startup."""
3737
log.info("Loading model... (replace this with your model loading code)")
3838
# Example:
39-
# from transformers import AutoModelForVision2Seq, AutoProcessor
40-
# model = AutoModelForVision2Seq.from_pretrained("your-model")
39+
# from transformers import AutoProcessor
40+
# try:
41+
# from transformers import AutoModelForImageTextToText as AutoVLM
42+
# except ImportError:
43+
# from transformers import AutoModelForVision2Seq as AutoVLM
44+
# model = AutoVLM.from_pretrained("your-model")
4145
# processor = AutoProcessor.from_pretrained("your-model")
4246
# return model, processor
4347
return None

openadapt_evals/agents/qwen3vl_agent.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -553,12 +553,16 @@ def _load_model(self) -> None:
553553
logger.info(f"Loading model: {hf_model_id}")
554554

555555
# Qwen3-VL uses the same architecture class as Qwen2.5-VL in
556-
# current transformers versions. Try AutoModelForVision2Seq first
557-
# (more generic), then fall back to the specific class.
556+
# current transformers versions. Try AutoModelForImageTextToText first
557+
# (transformers 5.x), then AutoModelForVision2Seq (older), then
558+
# fall back to the specific class.
558559
try:
559-
from transformers import AutoModelForVision2Seq
560+
try:
561+
from transformers import AutoModelForImageTextToText as AutoVLM
562+
except ImportError:
563+
from transformers import AutoModelForVision2Seq as AutoVLM
560564

561-
self._model = AutoModelForVision2Seq.from_pretrained(
565+
self._model = AutoVLM.from_pretrained(
562566
hf_model_id,
563567
torch_dtype=resolved_dtype,
564568
device_map=self.device,

openadapt_evals/agents/smol_agent.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,12 @@ def _load_model(self) -> None:
329329

330330
try:
331331
import torch
332-
from transformers import AutoModelForVision2Seq, AutoProcessor
332+
from transformers import AutoProcessor
333+
334+
try:
335+
from transformers import AutoModelForImageTextToText as AutoVLM
336+
except ImportError:
337+
from transformers import AutoModelForVision2Seq as AutoVLM
333338
except ImportError as e:
334339
raise RuntimeError(
335340
"SmolOperatorAgent requires transformers and torch. "
@@ -348,7 +353,7 @@ def _load_model(self) -> None:
348353
resolved_dtype = dtype_map.get(self.torch_dtype, "auto")
349354

350355
logger.info(f"Loading model: {self.model_path}")
351-
self._model = AutoModelForVision2Seq.from_pretrained(
356+
self._model = AutoVLM.from_pretrained(
352357
self.model_path,
353358
torch_dtype=resolved_dtype,
354359
device_map=self.device,

scripts/train_grpo_example.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,12 @@
3939
import torch
4040
from peft import LoraConfig, get_peft_model
4141
from PIL import Image
42-
from transformers import AutoModelForVision2Seq, AutoProcessor
42+
from transformers import AutoProcessor
43+
44+
try:
45+
from transformers import AutoModelForImageTextToText as AutoVLM
46+
except ImportError:
47+
from transformers import AutoModelForVision2Seq as AutoVLM
4348

4449
from openadapt_evals.adapters.base import BenchmarkAction, BenchmarkObservation
4550
from openadapt_evals.adapters.rl_env import RLEnvironment
@@ -227,7 +232,7 @@ def main(
227232
# 1. Load model with LoRA
228233
print(f"Loading {model_name} ...")
229234
processor = AutoProcessor.from_pretrained(model_name)
230-
model = AutoModelForVision2Seq.from_pretrained(
235+
model = AutoVLM.from_pretrained(
231236
model_name, torch_dtype=torch.bfloat16, device_map="auto"
232237
)
233238
lora = LoraConfig(

scripts/train_trl_grpo.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,16 @@ def load_model_standard(model_name, lora_r=16):
8888
"""Load model with standard HuggingFace + PEFT."""
8989
import torch
9090
from peft import LoraConfig, get_peft_model
91-
from transformers import AutoModelForVision2Seq, AutoProcessor
91+
from transformers import AutoProcessor
92+
93+
try:
94+
from transformers import AutoModelForImageTextToText as AutoVLM
95+
except ImportError:
96+
from transformers import AutoModelForVision2Seq as AutoVLM
9297

9398
logger.info("Loading model (standard): %s", model_name)
9499
processor = AutoProcessor.from_pretrained(model_name)
95-
model = AutoModelForVision2Seq.from_pretrained(
100+
model = AutoVLM.from_pretrained(
96101
model_name,
97102
torch_dtype=torch.bfloat16,
98103
device_map="auto",

0 commit comments

Comments (0)