Skip to content

Commit b017b6e

Browse files
abrichr and claude authored
fix: replace AutoModelForVision2Seq with AutoModelForImageTextToText for transformers 5.x (#178)
* fix: skip verify_apps, close_all, activate_window in lightweight mode These setup entry types hang (120s timeout), crash the WAA server, or are unnecessary for task execution. In lightweight mode (the default), they are now skipped entirely — both the verify_apps step injected from related_apps and any close_all / activate_window entries in the task config array. Each skipped entry is recorded with status "skipped" in _last_setup_results for auditability. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: replace AutoModelForVision2Seq with AutoModelForImageTextToText for transformers 5.x AutoModelForVision2Seq was removed in transformers 5.x (shipped on AWS DL AMI). Use AutoModelForImageTextToText as the primary import with a fallback to AutoModelForVision2Seq for older transformers versions. Files updated: - scripts/train_trl_grpo.py - scripts/train_grpo_example.py - openadapt_evals/agents/qwen3vl_agent.py - openadapt_evals/agents/smol_agent.py - examples/http_agent_server.py (comment only) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5899c4c commit b017b6e

5 files changed

Lines changed: 35 additions & 12 deletions

File tree

examples/http_agent_server.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,12 @@ def load_model():
3636
"""Load your model here. Called once at startup."""
3737
log.info("Loading model... (replace this with your model loading code)")
3838
# Example:
39-
# from transformers import AutoModelForVision2Seq, AutoProcessor
40-
# model = AutoModelForVision2Seq.from_pretrained("your-model")
39+
# from transformers import AutoProcessor
40+
# try:
41+
# from transformers import AutoModelForImageTextToText as AutoVLM
42+
# except ImportError:
43+
# from transformers import AutoModelForVision2Seq as AutoVLM
44+
# model = AutoVLM.from_pretrained("your-model")
4145
# processor = AutoProcessor.from_pretrained("your-model")
4246
# return model, processor
4347
return None

openadapt_evals/agents/qwen3vl_agent.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -553,12 +553,16 @@ def _load_model(self) -> None:
553553
logger.info(f"Loading model: {hf_model_id}")
554554

555555
# Qwen3-VL uses the same architecture class as Qwen2.5-VL in
556-
# current transformers versions. Try AutoModelForVision2Seq first
557-
# (more generic), then fall back to the specific class.
556+
# current transformers versions. Try AutoModelForImageTextToText first
557+
# (transformers 5.x), then AutoModelForVision2Seq (older), then
558+
# fall back to the specific class.
558559
try:
559-
from transformers import AutoModelForVision2Seq
560+
try:
561+
from transformers import AutoModelForImageTextToText as AutoVLM
562+
except ImportError:
563+
from transformers import AutoModelForVision2Seq as AutoVLM
560564

561-
self._model = AutoModelForVision2Seq.from_pretrained(
565+
self._model = AutoVLM.from_pretrained(
562566
hf_model_id,
563567
torch_dtype=resolved_dtype,
564568
device_map=self.device,

openadapt_evals/agents/smol_agent.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,12 @@ def _load_model(self) -> None:
329329

330330
try:
331331
import torch
332-
from transformers import AutoModelForVision2Seq, AutoProcessor
332+
from transformers import AutoProcessor
333+
334+
try:
335+
from transformers import AutoModelForImageTextToText as AutoVLM
336+
except ImportError:
337+
from transformers import AutoModelForVision2Seq as AutoVLM
333338
except ImportError as e:
334339
raise RuntimeError(
335340
"SmolOperatorAgent requires transformers and torch. "
@@ -348,7 +353,7 @@ def _load_model(self) -> None:
348353
resolved_dtype = dtype_map.get(self.torch_dtype, "auto")
349354

350355
logger.info(f"Loading model: {self.model_path}")
351-
self._model = AutoModelForVision2Seq.from_pretrained(
356+
self._model = AutoVLM.from_pretrained(
352357
self.model_path,
353358
torch_dtype=resolved_dtype,
354359
device_map=self.device,

scripts/train_grpo_example.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,12 @@
3939
import torch
4040
from peft import LoraConfig, get_peft_model
4141
from PIL import Image
42-
from transformers import AutoModelForVision2Seq, AutoProcessor
42+
from transformers import AutoProcessor
43+
44+
try:
45+
from transformers import AutoModelForImageTextToText as AutoVLM
46+
except ImportError:
47+
from transformers import AutoModelForVision2Seq as AutoVLM
4348

4449
from openadapt_evals.adapters.base import BenchmarkAction, BenchmarkObservation
4550
from openadapt_evals.adapters.rl_env import RLEnvironment
@@ -227,7 +232,7 @@ def main(
227232
# 1. Load model with LoRA
228233
print(f"Loading {model_name} ...")
229234
processor = AutoProcessor.from_pretrained(model_name)
230-
model = AutoModelForVision2Seq.from_pretrained(
235+
model = AutoVLM.from_pretrained(
231236
model_name, torch_dtype=torch.bfloat16, device_map="auto"
232237
)
233238
lora = LoraConfig(

scripts/train_trl_grpo.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,16 @@ def load_model_standard(model_name, lora_r=16):
8888
"""Load model with standard HuggingFace + PEFT."""
8989
import torch
9090
from peft import LoraConfig, get_peft_model
91-
from transformers import AutoModelForVision2Seq, AutoProcessor
91+
from transformers import AutoProcessor
92+
93+
try:
94+
from transformers import AutoModelForImageTextToText as AutoVLM
95+
except ImportError:
96+
from transformers import AutoModelForVision2Seq as AutoVLM
9297

9398
logger.info("Loading model (standard): %s", model_name)
9499
processor = AutoProcessor.from_pretrained(model_name)
95-
model = AutoModelForVision2Seq.from_pretrained(
100+
model = AutoVLM.from_pretrained(
96101
model_name,
97102
torch_dtype=torch.bfloat16,
98103
device_map="auto",

0 commit comments

Comments (0)