|
1 | 1 | import torch |
2 | 2 | from PIL import Image as PILImage |
3 | | -from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2VLProcessor |
| 3 | +from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration |
4 | 4 |
|
5 | 5 | from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation |
6 | 6 | from invokeai.app.invocations.fields import ( |
@@ -71,25 +71,20 @@ def _encode(self, context: InvocationContext, images: list[PILImage.Image]) -> t |
71 | 71 |
|
72 | 72 | messages = [{"role": "user", "content": content}] |
73 | 73 |
|
| 74 | + # Load the full processor (image_processor + tokenizer) from the tokenizer submodel path. |
| 75 | + # Using AutoProcessor.from_pretrained ensures all components are loaded correctly |
| 76 | + # regardless of whether the model uses Qwen2VLProcessor or Qwen2_5_VLProcessor. |
| 77 | + tokenizer_config = context.models.get_config(self.qwen_vl_encoder.tokenizer) |
| 78 | + tokenizer_abs_path = context.models.get_absolute_path(tokenizer_config) |
| 79 | + processor = AutoProcessor.from_pretrained(str(tokenizer_abs_path), local_files_only=True) |
| 80 | + |
74 | 81 | text_encoder_info = context.models.load(self.qwen_vl_encoder.text_encoder) |
75 | | - with ( |
76 | | - text_encoder_info.model_on_device() as (_, text_encoder), |
77 | | - context.models.load(self.qwen_vl_encoder.tokenizer).model_on_device() as (_, tokenizer), |
78 | | - ): |
| 82 | + with text_encoder_info.model_on_device() as (_, text_encoder): |
79 | 83 | device = get_effective_device(text_encoder) |
80 | 84 |
|
81 | 85 | context.util.signal_progress("Running Qwen2.5-VL text/vision encoder") |
82 | 86 | assert isinstance(text_encoder, Qwen2_5_VLForConditionalGeneration) |
83 | 87 |
|
84 | | - # The tokenizer for Qwen2.5-VL is actually a processor that handles images+text |
85 | | - # In the diffusers pipeline, the processor is used for this |
86 | | - # We need to use the processor to format inputs properly |
87 | | - if isinstance(tokenizer, Qwen2VLProcessor): |
88 | | - processor = tokenizer |
89 | | - else: |
90 | | - # Fall back to creating processor from tokenizer |
91 | | - processor = Qwen2VLProcessor(tokenizer=tokenizer, image_processor=None) |
92 | | - |
93 | 88 | # Apply chat template and process inputs |
94 | 89 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) |
95 | 90 | inputs = processor( |
|
0 commit comments