
Commit 51f528c

feat(model): Qwen Image VAE checkpoint (invoke-ai#9108)

* feat(qwen-image): standalone VAE checkpoint and Qwen2.5-VL encoder support

  Add standalone model types so Qwen Image can be run without downloading the
  full ~40 GB Diffusers pipeline. The VAE and Qwen2.5-VL encoder can now each
  come from their own model, with the Component Source (Diffusers) acting as a
  fallback for any submodel not provided separately.

* feat(qwen-image): support ComfyUI single-file Qwen2.5-VL encoder

  Add a checkpoint loader for ComfyUI-style consolidated Qwen2.5-VL encoder
  files (e.g. qwen_2.5_vl_7b_fp8_scaled.safetensors), which bundle the language
  model and visual tower into one safetensors file with FP8 + per-tensor
  weight_scale quantization. This drops the standalone encoder footprint from
  ~16 GB (Diffusers folder, FP16) to ~7 GB.

* feat(qwen-image): register standalone components as starter models

  Add three new starter models so users can install a complete GGUF Qwen Image
  setup in one click without ever touching the full ~40 GB Diffusers pipeline:

  - "Qwen Image VAE" — single-file VAE checkpoint pulled from the Qwen-Image
    repo (~250 MB).
  - "Qwen2.5-VL Encoder (fp8 scaled)" — ComfyUI single-file FP8 encoder (~7 GB).
  - "Qwen2.5-VL Encoder (Diffusers)" — full-precision encoder via multi-folder
    HF download (text_encoder + tokenizer + processor, ~16 GB).

  The 8 GGUF main starters (Q2_K / Q4_K_M / Q6_K / Q8_0 for both Edit and
  txt2img) now declare the VAE + fp8 encoder as dependencies, so installing any
  of them automatically pulls in everything needed to generate. The fp8 encoder
  is preferred as the default dependency since it's smaller and the on-the-fly
  dequantization is essentially free at runtime. The Qwen Image starter bundle
  gets the VAE and fp8 encoder prepended so the bundled Lightning LoRA variants
  also benefit.
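The "essentially free" dequantization mentioned above refers to per-tensor weight-scale quantization: each quantized tensor is paired with one scalar scale, so recovery is a single multiply per element. A minimal, framework-free sketch of the idea (the values and key names are illustrative; real checkpoints store FP8 tensors, which plain Python cannot represent):

```python
# Sketch of per-tensor "weight_scale" dequantization as used by ComfyUI-style
# fp8_scaled checkpoints: one scalar scale per quantized tensor.
def dequantize_per_tensor(quantized: list[float], weight_scale: float) -> list[float]:
    """Recover approximate full-precision weights: one multiply per element."""
    return [q * weight_scale for q in quantized]

# Toy state dict mimicking the `<name>.weight` / `<name>.weight_scale` pairing.
state_dict = {
    "visual.blocks.0.mlp.fc1.weight": [12.0, -7.0, 3.0],
    "visual.blocks.0.mlp.fc1.weight_scale": 0.5,
}

weights = dequantize_per_tensor(
    state_dict["visual.blocks.0.mlp.fc1.weight"],
    state_dict["visual.blocks.0.mlp.fc1.weight_scale"],
)
print(weights)  # [6.0, -3.5, 1.5]
```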
* chore: ruff format

* fix(qwen-image): backfill VAE/encoder fields on persisted state, recall in
  metadata, optimize scan

  - Bump the params slice persisted state to v3 with a v2→v3 migration that
    backfills qwenImageVaeModel and qwenImageQwenVLEncoderModel to null,
    preventing existing users from losing all persisted params on upgrade.
  - Emit qwen_image_vae and qwen_image_qwen_vl_encoder into graph metadata and
    add recall handlers so generations using standalone components are
    reproducible.
  - Clear the two new fields in the modelSelected listener when switching away
    from qwen-image, matching the existing cleanup pattern.
  - Identify single-file Qwen VL encoder checkpoints by reading only the
    safetensors key index via safe_open, instead of loading the full ~7 GB
    state dict into RAM during the model scan.
  - Log a clear info message and raise an actionable RuntimeError when the
    first-time HuggingFace tokenizer/config download is needed but offline,
    pointing users to the diffusers folder layout as an offline alternative.
  - Add unit tests for the migration, metadata recall, and identification.

* fix(qwen-image): auto-select VAE/encoder, clarify GGUF tip, fix fp8
  single-file encoder crash

  - Auto-select the first available standalone VAE and Qwen2.5-VL encoder when
    switching to a Qwen Image model, so GGUF users are ready to go without
    digging into Advanced. Prefers the diffusers-folder encoder over the
    single-file checkpoint.
  - Update the "Required for GGUF models" placeholder to clarify that the
    diffusers source is only required when a standalone VAE & encoder is not
    installed.
  - Fix the QwenVLEncoderCheckpointLoader crash on ComfyUI fp8_scaled
    single-file encoders. Two issues: (1) handle the `.scale_weight` /
    `.scale_input` quantization key scheme alongside `.weight_scale`, and
    (2) apply Qwen2_5_VLForConditionalGeneration's
    _checkpoint_conversion_mapping before load_state_dict so legacy
    `visual.*` / `model.*` keys map onto the new `model.visual.*` /
    `model.language_model.*` layout expected by transformers ≥4.50.

Co-authored-by: Jonathan <34005131+JPPhoto@users.noreply.github.com>
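The prefix remap described in issue (2) can be sketched as a pure key rewrite. This is not the actual transformers implementation, and the mapping shown is an illustrative subset of `_checkpoint_conversion_mapping`, not the full table:

```python
# Hedged sketch: rewrite legacy single-file checkpoint key prefixes
# (`visual.*`, `model.*`) into the layout expected by transformers >= 4.50
# (`model.visual.*`, `model.language_model.*`). Already-remapped keys and
# top-level keys like `lm_head.weight` pass through unchanged.
def remap_legacy_keys(state_dict: dict) -> dict:
    remapped = {}
    for key, value in state_dict.items():
        if key.startswith("visual."):
            new_key = "model.visual." + key[len("visual."):]
        elif key.startswith("model.") and not key.startswith(("model.visual.", "model.language_model.")):
            new_key = "model.language_model." + key[len("model."):]
        else:
            new_key = key
        remapped[new_key] = value
    return remapped

ckpt = {
    "visual.blocks.0.attn.qkv.weight": 1,
    "model.layers.0.self_attn.q_proj.weight": 2,
    "lm_head.weight": 3,
}
print(sorted(remap_legacy_keys(ckpt)))
```

Running the remap before load_state_dict means a single loader path can accept both the legacy and the current key layouts.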
1 parent b9bd8ef commit 51f528c

26 files changed

Lines changed: 1408 additions & 60 deletions

File tree

invokeai/app/invocations/qwen_image_model_loader.py

Lines changed: 62 additions & 22 deletions
@@ -34,19 +34,22 @@ class QwenImageModelLoaderOutput(BaseInvocationOutput):
     title="Main Model - Qwen Image",
     tags=["model", "qwen_image"],
     category="model",
-    version="1.1.0",
+    version="1.2.0",
     classification=Classification.Prototype,
 )
 class QwenImageModelLoaderInvocation(BaseInvocation):
     """Loads a Qwen Image model, outputting its submodels.
 
     The transformer is always loaded from the main model (Diffusers or GGUF).
 
-    For GGUF quantized models, the VAE and Qwen VL encoder must come from a
-    separate Diffusers model specified in the "Component Source" field.
+    Components can be mixed and matched:
+    - VAE: standalone Qwen Image VAE checkpoint, the Component Source (Diffusers),
+      or the main model if it's Diffusers.
+    - Qwen VL Encoder: standalone Qwen2.5-VL encoder, the Component Source
+      (Diffusers), or the main model if it's Diffusers.
 
-    For Diffusers models, all components are extracted from the main model
-    automatically. The "Component Source" field is ignored.
+    Together, the standalone VAE and standalone encoder allow running a GGUF
+    transformer without ever downloading the full ~40 GB Diffusers pipeline.
     """
 
     model: ModelIdentifierField = InputField(
@@ -57,11 +60,31 @@ class QwenImageModelLoaderInvocation(BaseInvocation):
         title="Transformer",
     )
 
+    vae_model: Optional[ModelIdentifierField] = InputField(
+        default=None,
+        description="Standalone Qwen Image VAE model. "
+        "If not provided, VAE will be loaded from the Component Source (or from the main model if it is Diffusers).",
+        input=Input.Direct,
+        ui_model_base=BaseModelType.QwenImage,
+        ui_model_type=ModelType.VAE,
+        title="VAE",
+    )
+
+    qwen_vl_encoder_model: Optional[ModelIdentifierField] = InputField(
+        default=None,
+        description="Standalone Qwen2.5-VL encoder model. "
+        "If not provided, the encoder will be loaded from the Component Source "
+        "(or from the main model if it is Diffusers).",
+        input=Input.Direct,
+        ui_model_type=ModelType.QwenVLEncoder,
+        title="Qwen VL Encoder",
+    )
+
     component_source: Optional[ModelIdentifierField] = InputField(
         default=None,
-        description="Diffusers Qwen Image model to extract the VAE and Qwen VL encoder from. "
-        "Required when using a GGUF quantized transformer. "
-        "Ignored when the main model is already in Diffusers format.",
+        description="Diffusers Qwen Image model to extract VAE and/or Qwen VL encoder from. "
+        "Use this if you don't have separate VAE/encoder models. "
+        "Ignored for any submodel that is provided separately.",
         input=Input.Direct,
         ui_model_base=BaseModelType.QwenImage,
         ui_model_type=ModelType.Main,
@@ -76,32 +99,49 @@ def invoke(self, context: InvocationContext) -> QwenImageModelLoaderOutput:
         # Transformer always comes from the main model
         transformer = self.model.model_copy(update={"submodel_type": SubModelType.Transformer})
 
-        if main_is_diffusers:
-            # Diffusers model: extract all components directly
+        # Resolve VAE: standalone override > main (if Diffusers) > component source
+        if self.vae_model is not None:
+            vae = self.vae_model.model_copy(update={"submodel_type": SubModelType.VAE})
+        elif main_is_diffusers:
             vae = self.model.model_copy(update={"submodel_type": SubModelType.VAE})
+        elif self.component_source is not None:
+            self._validate_component_source_format(context, self.component_source)
+            vae = self.component_source.model_copy(update={"submodel_type": SubModelType.VAE})
+        else:
+            raise ValueError(
+                "No source for VAE. Either set 'VAE' to a standalone Qwen Image VAE, "
+                "or set 'Component Source' to a Diffusers Qwen Image model."
+            )
+
+        # Resolve Qwen VL encoder: standalone override > main (if Diffusers) > component source
+        if self.qwen_vl_encoder_model is not None:
+            tokenizer = self.qwen_vl_encoder_model.model_copy(update={"submodel_type": SubModelType.Tokenizer})
+            text_encoder = self.qwen_vl_encoder_model.model_copy(update={"submodel_type": SubModelType.TextEncoder})
+        elif main_is_diffusers:
             tokenizer = self.model.model_copy(update={"submodel_type": SubModelType.Tokenizer})
             text_encoder = self.model.model_copy(update={"submodel_type": SubModelType.TextEncoder})
         elif self.component_source is not None:
-            # GGUF/checkpoint transformer: get VAE + encoder from the component source
-            source_config = context.models.get_config(self.component_source)
-            if source_config.format != ModelFormat.Diffusers:
-                raise ValueError(
-                    f"The Component Source model must be in Diffusers format. "
-                    f"The selected model '{source_config.name}' is in {source_config.format.value} format."
-                )
-            vae = self.component_source.model_copy(update={"submodel_type": SubModelType.VAE})
+            self._validate_component_source_format(context, self.component_source)
             tokenizer = self.component_source.model_copy(update={"submodel_type": SubModelType.Tokenizer})
             text_encoder = self.component_source.model_copy(update={"submodel_type": SubModelType.TextEncoder})
         else:
             raise ValueError(
-                "No source for VAE and Qwen VL encoder. "
-                "GGUF quantized models only contain the transformer — "
-                "please set 'Component Source' to a Diffusers Qwen Image model "
-                "to provide the VAE and text encoder."
+                "No source for Qwen VL encoder. "
+                "Either set 'Qwen VL Encoder' to a standalone Qwen2.5-VL encoder, "
+                "or set 'Component Source' to a Diffusers Qwen Image model."
             )
 
         return QwenImageModelLoaderOutput(
             transformer=TransformerField(transformer=transformer, loras=[]),
             qwen_vl_encoder=QwenVLEncoderField(tokenizer=tokenizer, text_encoder=text_encoder),
             vae=VAEField(vae=vae),
         )
+
+    @staticmethod
+    def _validate_component_source_format(context: InvocationContext, model: ModelIdentifierField) -> None:
+        source_config = context.models.get_config(model)
+        if source_config.format != ModelFormat.Diffusers:
+            raise ValueError(
+                f"The Component Source model must be in Diffusers format. "
+                f"The selected model '{source_config.name}' is in {source_config.format.value} format."
+            )

invokeai/app/invocations/qwen_image_text_encoder.py

Lines changed: 33 additions & 9 deletions
@@ -161,17 +161,35 @@ def _encode(
         # Build the processor
         tokenizer_config = context.models.get_config(self.qwen_vl_encoder.tokenizer)
         model_root = context.models.get_absolute_path(tokenizer_config)
-        tokenizer_dir = model_root / "tokenizer"
 
-        tokenizer = AutoTokenizer.from_pretrained(str(tokenizer_dir), local_files_only=True)
+        # Single-file checkpoints (e.g. ComfyUI fp8_scaled): model_root is the
+        # safetensors file itself, so there's no tokenizer/processor folder
+        # alongside it. Fall back to the canonical Qwen2.5-VL repo on HF (small
+        # ~10 MB download for tokenizer+processor configs, cached for offline use).
+        if model_root.is_file():
+            HF_REPO = "Qwen/Qwen2.5-VL-7B-Instruct"
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(HF_REPO, local_files_only=True)
+            except OSError:
+                tokenizer = AutoTokenizer.from_pretrained(HF_REPO)
+            try:
+                image_processor = _ImageProcessorCls.from_pretrained(HF_REPO, local_files_only=True)
+            except OSError:
+                try:
+                    image_processor = _ImageProcessorCls.from_pretrained(HF_REPO)
+                except Exception:
+                    image_processor = _ImageProcessorCls()
+        else:
+            tokenizer_dir = model_root / "tokenizer"
+            tokenizer = AutoTokenizer.from_pretrained(str(tokenizer_dir), local_files_only=True)
 
-        image_processor = None
-        for search_dir in [model_root / "processor", tokenizer_dir, model_root, model_root / "image_processor"]:
-            if (search_dir / "preprocessor_config.json").exists():
-                image_processor = _ImageProcessorCls.from_pretrained(str(search_dir), local_files_only=True)
-                break
-        if image_processor is None:
-            image_processor = _ImageProcessorCls()
+            image_processor = None
+            for search_dir in [model_root / "processor", tokenizer_dir, model_root, model_root / "image_processor"]:
+                if (search_dir / "preprocessor_config.json").exists():
+                    image_processor = _ImageProcessorCls.from_pretrained(str(search_dir), local_files_only=True)
+                    break
+            if image_processor is None:
+                image_processor = _ImageProcessorCls()
 
         processor = Qwen2_5_VLProcessor(
             tokenizer=tokenizer,
@@ -264,6 +282,12 @@ def _load_quantized_encoder(self, context: InvocationContext):
 
         encoder_config = context.models.get_config(self.qwen_vl_encoder.text_encoder)
         model_root = context.models.get_absolute_path(encoder_config)
+        if model_root.is_file():
+            # Single-file checkpoint (e.g. ComfyUI fp8_scaled): BnB can't load from
+            # a single file, and the checkpoint is already FP8-compressed anyway.
+            # Fall back to the cached path; the user effectively gets fp8 instead of
+            # int8/nf4, which is comparable in size.
+            return self._load_cached_encoder(context)
         encoder_path = model_root / "text_encoder"
 
         if self.quantization == "nf4":

invokeai/backend/model_manager/configs/factory.py

Lines changed: 9 additions & 0 deletions
@@ -90,6 +90,10 @@
     Qwen3Encoder_GGUF_Config,
     Qwen3Encoder_Qwen3Encoder_Config,
 )
+from invokeai.backend.model_manager.configs.qwen_vl_encoder import (
+    QwenVLEncoder_Checkpoint_Config,
+    QwenVLEncoder_Diffusers_Config,
+)
 from invokeai.backend.model_manager.configs.siglip import SigLIP_Diffusers_Config
 from invokeai.backend.model_manager.configs.spandrel import Spandrel_Checkpoint_Config
 from invokeai.backend.model_manager.configs.t2i_adapter import (
@@ -111,6 +115,7 @@
     VAE_Checkpoint_Anima_Config,
     VAE_Checkpoint_Flux2_Config,
     VAE_Checkpoint_FLUX_Config,
+    VAE_Checkpoint_QwenImage_Config,
     VAE_Checkpoint_SD1_Config,
     VAE_Checkpoint_SD2_Config,
     VAE_Checkpoint_SDXL_Config,
@@ -194,6 +199,7 @@
     Annotated[VAE_Checkpoint_SDXL_Config, VAE_Checkpoint_SDXL_Config.get_tag()],
     Annotated[VAE_Checkpoint_FLUX_Config, VAE_Checkpoint_FLUX_Config.get_tag()],
     Annotated[VAE_Checkpoint_Flux2_Config, VAE_Checkpoint_Flux2_Config.get_tag()],
+    Annotated[VAE_Checkpoint_QwenImage_Config, VAE_Checkpoint_QwenImage_Config.get_tag()],
     Annotated[VAE_Checkpoint_Anima_Config, VAE_Checkpoint_Anima_Config.get_tag()],
     # VAE - diffusers format
     Annotated[VAE_Diffusers_SD1_Config, VAE_Diffusers_SD1_Config.get_tag()],
@@ -242,6 +248,9 @@
     Annotated[Qwen3Encoder_Qwen3Encoder_Config, Qwen3Encoder_Qwen3Encoder_Config.get_tag()],
     Annotated[Qwen3Encoder_Checkpoint_Config, Qwen3Encoder_Checkpoint_Config.get_tag()],
     Annotated[Qwen3Encoder_GGUF_Config, Qwen3Encoder_GGUF_Config.get_tag()],
+    # Qwen VL Encoder (Qwen2.5-VL multimodal encoder for Qwen Image)
+    Annotated[QwenVLEncoder_Diffusers_Config, QwenVLEncoder_Diffusers_Config.get_tag()],
+    Annotated[QwenVLEncoder_Checkpoint_Config, QwenVLEncoder_Checkpoint_Config.get_tag()],
     # TI - file format
     Annotated[TI_File_SD1_Config, TI_File_SD1_Config.get_tag()],
     Annotated[TI_File_SD2_Config, TI_File_SD2_Config.get_tag()],
invokeai/backend/model_manager/configs/qwen_vl_encoder.py

Lines changed: 154 additions & 0 deletions

@@ -0,0 +1,154 @@
import json
from pathlib import Path
from typing import Any, Iterable, Literal, Self

from pydantic import Field
from safetensors import safe_open

from invokeai.backend.model_manager.configs.base import Checkpoint_Config_Base, Config_Base
from invokeai.backend.model_manager.configs.identification_utils import (
    NotAMatchError,
    raise_for_override_fields,
    raise_if_not_dir,
    raise_if_not_file,
)
from invokeai.backend.model_manager.model_on_disk import ModelOnDisk
from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelFormat, ModelType

_RECOGNIZED_TEXT_ENCODER_CLASSES = {
    "Qwen2_5_VLForConditionalGeneration",
    "Qwen2VLForConditionalGeneration",
}


def _has_qwen_vl_keys(keys: Iterable[str]) -> bool:
    """A Qwen2.5-VL/Qwen2-VL checkpoint must have both LM weights and a visual
    tower — that's what distinguishes it from text-only Qwen3/Qwen2 encoders."""
    has_lm = False
    has_vision = False
    for k in keys:
        if not isinstance(k, str):
            continue
        if not has_lm and (k == "model.embed_tokens.weight" or k.startswith("model.layers.")):
            has_lm = True
        if not has_vision and (k.startswith("visual.patch_embed.") or k.startswith("visual.blocks.")):
            has_vision = True
        if has_lm and has_vision:
            return True
    return False


def _read_safetensors_keys(path: Path) -> list[str]:
    """Read only the key index from a safetensors file without loading tensor data.

    Avoids holding multi-GB encoder weights in RAM just to classify the file.
    """
    with safe_open(str(path), framework="pt", device="cpu") as f:
        return list(f.keys())


class QwenVLEncoder_Diffusers_Config(Config_Base):
    """Configuration for standalone Qwen2.5-VL encoder models in diffusers-style folder layout.

    Expected structure:
        <model_root>/
            text_encoder/
                config.json  (with `_class_name` or `architectures` listing
                              `Qwen2_5_VLForConditionalGeneration`)
                model.safetensors
            tokenizer/
                tokenizer_config.json
                ...
            processor/  (optional, for vision preprocessing)
                preprocessor_config.json

    This lets users avoid downloading the full ~40 GB Qwen Image diffusers pipeline
    when they only need the Qwen2.5-VL encoder for use with a GGUF transformer.
    """

    base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
    type: Literal[ModelType.QwenVLEncoder] = Field(default=ModelType.QwenVLEncoder)
    format: Literal[ModelFormat.QwenVLEncoder] = Field(default=ModelFormat.QwenVLEncoder)

    @classmethod
    def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
        raise_if_not_dir(mod)

        raise_for_override_fields(cls, override_fields)

        # Reject anything that looks like a full pipeline (those are matched as Main models).
        if (mod.path / "model_index.json").exists() or (mod.path / "transformer").exists():
            raise NotAMatchError(
                "directory looks like a full diffusers pipeline (has model_index.json or transformer folder), "
                "not a standalone Qwen VL encoder"
            )

        text_encoder_dir = mod.path / "text_encoder"
        tokenizer_dir = mod.path / "tokenizer"

        if not text_encoder_dir.is_dir():
            raise NotAMatchError("missing text_encoder/ subfolder")
        if not tokenizer_dir.is_dir():
            raise NotAMatchError("missing tokenizer/ subfolder")

        config_path = text_encoder_dir / "config.json"
        if not config_path.is_file():
            raise NotAMatchError(f"missing {config_path}")

        try:
            with open(config_path, "r", encoding="utf-8") as f:
                cfg = json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            raise NotAMatchError(f"could not read text_encoder/config.json: {e}") from e

        class_name = cfg.get("_class_name")
        architectures = cfg.get("architectures") or []
        candidates = {class_name, *architectures} - {None}

        if not candidates & _RECOGNIZED_TEXT_ENCODER_CLASSES:
            raise NotAMatchError(
                f"text_encoder class is {sorted(candidates) or 'unknown'}, "
                f"expected one of {sorted(_RECOGNIZED_TEXT_ENCODER_CLASSES)}"
            )

        return cls(**override_fields)


class QwenVLEncoder_Checkpoint_Config(Checkpoint_Config_Base, Config_Base):
    """Configuration for single-file Qwen2.5-VL encoder checkpoints (safetensors).

    This matches ComfyUI-style consolidated single-file encoders such as
    `qwen_2.5_vl_7b_fp8_scaled.safetensors`, which bundle the language model
    and the visual tower into one file (typically with FP8 + per-tensor
    `weight_scale` ComfyUI quantization).

    The matching tokenizer + processor are pulled from HuggingFace
    (`Qwen/Qwen2.5-VL-7B-Instruct`) on first use and cached for offline use.
    """

    base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
    type: Literal[ModelType.QwenVLEncoder] = Field(default=ModelType.QwenVLEncoder)
    format: Literal[ModelFormat.Checkpoint] = Field(default=ModelFormat.Checkpoint)

    @classmethod
    def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
        raise_if_not_file(mod)

        raise_for_override_fields(cls, override_fields)

        # Only safetensors checkpoints are supported as single-file Qwen VL encoders.
        # Reject other extensions cheaply before attempting to read keys.
        if mod.path.suffix != ".safetensors":
            raise NotAMatchError(f"expected a .safetensors file, got {mod.path.suffix or '(no suffix)'}")

        # Read only the key index — a 7GB fp8 encoder weighs ~7GB on disk, but we
        # only need the key names to classify it, not the tensor data.
        try:
            keys = _read_safetensors_keys(mod.path)
        except Exception as e:
            raise NotAMatchError(f"could not read safetensors header: {e}") from e

        if not _has_qwen_vl_keys(keys):
            raise NotAMatchError("state dict does not look like a Qwen2.5-VL/Qwen2-VL checkpoint")

        return cls(**override_fields)
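To see what the `_has_qwen_vl_keys` predicate above accepts and rejects, here is a self-contained copy of its logic exercised against made-up key names (no safetensors file needed, since classification only inspects key strings):

```python
# Self-contained restatement of the classification predicate from the config
# above: a match needs both language-model keys and visual-tower keys.
def has_qwen_vl_keys(keys) -> bool:
    has_lm = any(k == "model.embed_tokens.weight" or k.startswith("model.layers.") for k in keys)
    has_vision = any(k.startswith(("visual.patch_embed.", "visual.blocks.")) for k in keys)
    return has_lm and has_vision

full_vl = ["model.embed_tokens.weight", "model.layers.0.mlp.up_proj.weight", "visual.blocks.0.attn.qkv.weight"]
text_only = ["model.embed_tokens.weight", "model.layers.0.mlp.up_proj.weight"]  # e.g. a text-only encoder
vision_only = ["visual.patch_embed.proj.weight"]

print(has_qwen_vl_keys(full_vl))     # True
print(has_qwen_vl_keys(text_only))   # False
print(has_qwen_vl_keys(vision_only)) # False
```

Requiring both halves is what keeps text-only Qwen3/Qwen2 encoder checkpoints from being misclassified as Qwen2.5-VL.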
