fix: use AutoProcessor.from_pretrained to load Qwen VL processor correctly

Copilot · lstein · Copilot · commit 666fdfa43c31 · 2026-03-25T03:34:49.000Z
Co-authored-by: lstein <111189+lstein@users.noreply.github.com>
Agent-Logs-Url: https://github.com/lstein/InvokeAI/sessions/4d4417be-0f61-4faa-a21c-16e9ce81fec7
diff --git a/invokeai/app/invocations/qwen_image_edit_text_encoder.py b/invokeai/app/invocations/qwen_image_edit_text_encoder.py
@@ -1,6 +1,6 @@
 import torch
 from PIL import Image as PILImage
-from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2VLProcessor
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation
 from invokeai.app.invocations.fields import (
@@ -71,25 +71,20 @@ def _encode(self, context: InvocationContext, images: list[PILImage.Image]) -> t
 
         messages = [{"role": "user", "content": content}]
 
+        # Load the full processor (image_processor + tokenizer) from the tokenizer submodel path.
+        # Using AutoProcessor.from_pretrained ensures all components are loaded correctly
+        # regardless of whether the model uses Qwen2VLProcessor or Qwen2_5_VLProcessor.
+        tokenizer_config = context.models.get_config(self.qwen_vl_encoder.tokenizer)
+        tokenizer_abs_path = context.models.get_absolute_path(tokenizer_config)
+        processor = AutoProcessor.from_pretrained(str(tokenizer_abs_path), local_files_only=True)
+
         text_encoder_info = context.models.load(self.qwen_vl_encoder.text_encoder)
-        with (
-            text_encoder_info.model_on_device() as (_, text_encoder),
-            context.models.load(self.qwen_vl_encoder.tokenizer).model_on_device() as (_, tokenizer),
-        ):
+        with text_encoder_info.model_on_device() as (_, text_encoder):
             device = get_effective_device(text_encoder)
 
             context.util.signal_progress("Running Qwen2.5-VL text/vision encoder")
             assert isinstance(text_encoder, Qwen2_5_VLForConditionalGeneration)
 
-            # The tokenizer for Qwen2.5-VL is actually a processor that handles images+text
-            # In the diffusers pipeline, the processor is used for this
-            # We need to use the processor to format inputs properly
-            if isinstance(tokenizer, Qwen2VLProcessor):
-                processor = tokenizer
-            else:
-                # Fall back to creating processor from tokenizer
-                processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=None)
-
             # Apply chat template and process inputs
             text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             inputs = processor(