Skip to content

Commit d196dce

Browse files
abrichr and claude authored
fix: use keyword args for Qwen VL processor call (#58)
* fix: replace AutoModelForVision2Seq with AutoModelForImageTextToText for transformers 5.x AutoModelForVision2Seq was removed in transformers 5.x (shipped on AWS DL AMI). Use AutoModelForImageTextToText as the primary import with a fallback to AutoModelForVision2Seq for older transformers versions. Files updated: - openadapt_ml/training/grpo/trainer.py - openadapt_ml/cloud/modal_cloud.py - docs/grpo_trl_rewrite_draft.py (comment only) Note: openadapt_ml/training/trl_trainer.py already had the correct try/except pattern and was not modified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: use keyword args for Qwen VL processor to avoid positional conflict Qwen2_5_VLProcessor.__call__() expects text= and images= as keyword args. Passing text as positional arg conflicts with images kwarg: TypeError: got multiple values for argument 'images' Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ec45b27 commit d196dce

File tree

3 files changed

+16
-8
lines changed

3 files changed

+16
-8
lines changed

docs/grpo_trl_rewrite_draft.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -952,7 +952,7 @@ def train_grpo(waa_config: WAATrainingConfig | None = None) -> str:
952952
# to verify this works with Unsloth's patched model loading.
953953
#
954954
# If incompatible, we can:
955-
# (a) Use standard HF model loading (AutoModelForVision2Seq)
955+
# (a) Use standard HF model loading (AutoModelForImageTextToText)
956956
# (b) Load with Unsloth, then pass the model to TRL
957957
# (c) Use Unsloth's GRPOTrainer fork (if available)
958958
#

openadapt_ml/cloud/modal_cloud.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,12 @@ def infer(
336336
if not hasattr(infer, "_model"):
337337
print(f"Loading base model: {_base}")
338338
try:
339-
from transformers import AutoModelForVision2Seq
339+
try:
340+
from transformers import AutoModelForImageTextToText as AutoVLM
341+
except ImportError:
342+
from transformers import AutoModelForVision2Seq as AutoVLM
340343

341-
infer._model = AutoModelForVision2Seq.from_pretrained(
344+
infer._model = AutoVLM.from_pretrained(
342345
_base,
343346
torch_dtype=torch.bfloat16,
344347
device_map="auto",

openadapt_ml/training/grpo/trainer.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
- beta=0.0 (no KL penalty) per DAPO/Open-Reasoner-Zero. Simpler, saves
2424
memory (no reference model needed).
2525
- Per-step backward to avoid OOM on long trajectories.
26-
- Standard HF model loading: AutoModelForVision2Seq + AutoProcessor + PEFT.
26+
- Standard HF model loading: AutoModelForImageTextToText + AutoProcessor + PEFT.
2727
- Standard PEFT checkpointing: model.save_pretrained().
2828
"""
2929

@@ -222,7 +222,12 @@ def _load_model_and_processor(config: GRPOConfig) -> tuple[Any, Any]:
222222
(model, processor) tuple. Model has LoRA adapters attached.
223223
"""
224224
from peft import LoraConfig, PeftModel, get_peft_model
225-
from transformers import AutoModelForVision2Seq, AutoProcessor
225+
from transformers import AutoProcessor
226+
227+
try:
228+
from transformers import AutoModelForImageTextToText as AutoVLM
229+
except ImportError:
230+
from transformers import AutoModelForVision2Seq as AutoVLM
226231

227232
processor = AutoProcessor.from_pretrained(config.model_name)
228233

@@ -239,7 +244,7 @@ def _load_model_and_processor(config: GRPOConfig) -> tuple[Any, Any]:
239244
bnb_4bit_quant_type="nf4",
240245
)
241246

242-
model = AutoModelForVision2Seq.from_pretrained(config.model_name, **load_kwargs)
247+
model = AutoVLM.from_pretrained(config.model_name, **load_kwargs)
243248

244249
if config.lora_checkpoint:
245250
logger.info("Loading existing LoRA from %s", config.lora_checkpoint)
@@ -322,7 +327,7 @@ def agent_fn(obs: Any) -> BenchmarkAction:
322327
else:
323328
text_input = messages[-1]["content"]
324329

325-
inputs = processor(text_input, images=[image], return_tensors="pt")
330+
inputs = processor(text=[text_input], images=[image], return_tensors="pt")
326331
inputs = {k: v.to(model.device) for k, v in inputs.items()}
327332

328333
with torch.no_grad():
@@ -519,7 +524,7 @@ def _compute_rollout_loss(
519524
text_input = messages[-1]["content"]
520525

521526
prompt_inputs = self._processor(
522-
text_input, images=[image], return_tensors="pt"
527+
text=[text_input], images=[image], return_tensors="pt"
523528
)
524529
prompt_ids = prompt_inputs["input_ids"]
525530
prompt_len = prompt_ids.shape[1]

0 commit comments

Comments
 (0)