Add Qwen3.5 4b, Disable thinking, fix RGB bug (#2319)

Matvezy · PawelPeczek-Roboflow · web-flow · commit 9e10c7d2a641 · 2026-05-14T11:58:16.000+02:00
* 4b

* alias fix

* style

* style

* update names and drop reasoning button

* take out extra alias handling

* rgb processing

* rgb processing

* fix v1 version

---------

Co-authored-by: Paweł Pęczek <146137186+PawelPeczek-Roboflow@users.noreply.github.com>
diff --git a/docs/foundation/qwen3.5.md b/docs/foundation/qwen3.5.md
@@ -1,38 +1,39 @@
 # Qwen 3.5
 
-<a href="https://github.com/QwenLM/Qwen3.5" target="_blank">Qwen 3.5-VL</a> is a vision-language model developed by Alibaba.
+<a href="https://github.com/QwenLM/Qwen3.5" target="_blank">Qwen 3.5</a> is a vision-language model developed by Alibaba.
 
-You can use Qwen 3.5-VL for a range of multimodal tasks, including image understanding, visual question answering, and document analysis. It also supports a "thinking" mode that lets the model generate reasoning tokens before answering.
+You can use Qwen 3.5 for a range of multimodal tasks, including image understanding, visual question answering, and document analysis. It also supports a "thinking" mode that lets the model generate reasoning tokens before answering.
 
-You can deploy Qwen 3.5-VL with Inference.
+You can deploy Qwen 3.5 with Inference.
 
 ### Model Variants
 
-Qwen 3.5-VL is available in two sizes:
+Qwen 3.5 is available in three sizes:
 
 | Model ID | Parameters |
 |:---------|:-----------|
 | `qwen3_5-0.8b` | 0.8B |
 | `qwen3_5-2b` | 2B |
+| `qwen3_5-4b` | 4B |
 
 ### Execution Modes
 
-Qwen 3.5-VL supports both local and remote execution modes when used in workflows:
+Qwen 3.5 supports both local and remote execution modes when used in workflows:
 
 - **Local execution**: The model runs directly on your inference server (GPU recommended)
 - **Remote execution**: The model can be invoked via HTTP API on a remote inference server
 
 ### Installation
 
-To install inference with the extra dependencies necessary to run Qwen 3.5-VL, run
+To install inference with the extra dependencies necessary to run Qwen 3.5, run
 
 ```pip install "inference[transformers]"```
 
 or
 
 ```pip install "inference-gpu[transformers]"```
 
-### How to Use Qwen 3.5-VL
+### How to Use Qwen 3.5
 
 Create a new Python file called `app.py` and add the following code:
 
@@ -61,7 +62,7 @@ Above, replace:
 1. `prompt` with the prompt for the model.
 2. The image URL with the path to the image that you want to run inference on.
 
-To use Qwen 3.5-VL with Inference, you will need a Roboflow API key. If you don't already have a Roboflow account, <a href="https://app.roboflow.com" target="_blank">sign up for a free Roboflow account</a>.
+To use Qwen 3.5 with Inference, you will need a Roboflow API key. If you don't already have a Roboflow account, <a href="https://app.roboflow.com" target="_blank">sign up for a free Roboflow account</a>.
 
 Then, run the Python script you have created:
 
diff --git a/inference/core/entities/requests/inference.py b/inference/core/entities/requests/inference.py
@@ -330,7 +330,7 @@ class LMMInferenceRequest(CVInferenceRequest):
     )
     enable_thinking: bool = Field(
         default=False,
-        description="If true, enables thinking/reasoning mode for models that support it (e.g. Qwen3.5-VL). The model's reasoning will be included in the response.",
+        description="If true, enables thinking/reasoning mode for models that support it (e.g. Qwen3.5). The model's reasoning will be included in the response.",
     )
     max_new_tokens: Optional[int] = Field(
         default=None,
diff --git a/inference/core/registries/roboflow.py b/inference/core/registries/roboflow.py
@@ -70,6 +70,7 @@
     "perception_encoder": ("embed", "perception_encoder"),
     "qwen3_5-0.8b": ("lmm", "qwen3_5-0.8b"),
     "qwen3_5-2b": ("lmm", "qwen3_5-2b"),
+    "qwen3_5-4b": ("lmm", "qwen3_5-4b"),
 }
 
 STUB_VERSION_ID = "0"
diff --git a/inference/core/workflows/core_steps/loader.py b/inference/core/workflows/core_steps/loader.py
@@ -287,6 +287,9 @@
 from inference.core.workflows.core_steps.models.foundation.qwen3_5vl.v1 import (
     Qwen35VLBlockV1,
 )
+from inference.core.workflows.core_steps.models.foundation.qwen3_5vl.v2 import (
+    Qwen35VLBlockV2,
+)
 from inference.core.workflows.core_steps.models.foundation.qwen3_6_openrouter.v1 import (
     Qwen36OpenRouterBlockV1,
 )
@@ -932,6 +935,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
         Qwen25VLBlockV1,
         Qwen3VLBlockV1,
         Qwen35VLBlockV1,
+        Qwen35VLBlockV2,
         Qwen35OpenRouterBlockV1,
         Qwen36OpenRouterBlockV1,
         OpenAICompatibleBlockV1,
diff --git a/inference/core/workflows/core_steps/models/foundation/qwen3_5vl/v1.py b/inference/core/workflows/core_steps/models/foundation/qwen3_5vl/v1.py
@@ -57,6 +57,7 @@ class BlockManifest(WorkflowBlockManifest):
                 "Alibaba",
             ],
             "is_vlm_block": True,
+            "deprecated": True,
             "ui_manifest": {
                 "section": "model",
                 "icon": "fal fa-atom",
diff --git a/inference/core/workflows/core_steps/models/foundation/qwen3_5vl/v2.py b/inference/core/workflows/core_steps/models/foundation/qwen3_5vl/v2.py
@@ -0,0 +1,243 @@
+from typing import List, Literal, Optional, Type, Union
+
+from pydantic import ConfigDict, Field
+
+from inference.core.entities.requests.inference import LMMInferenceRequest
+from inference.core.env import (
+    HOSTED_CORE_MODEL_URL,
+    LOCAL_INFERENCE_API_URL,
+    WORKFLOWS_REMOTE_API_TARGET,
+)
+from inference.core.managers.base import ModelManager
+from inference.core.workflows.core_steps.common.entities import StepExecutionMode
+from inference.core.workflows.execution_engine.entities.base import (
+    Batch,
+    OutputDefinition,
+    WorkflowImageData,
+)
+from inference.core.workflows.execution_engine.entities.types import (
+    DICTIONARY_KIND,
+    IMAGE_KIND,
+    ROBOFLOW_MODEL_ID_KIND,
+    ImageInputField,
+    Selector,
+)
+from inference.core.workflows.prototypes.block import (
+    BlockResult,
+    WorkflowBlock,
+    WorkflowBlockManifest,
+)
+from inference_sdk import InferenceHTTPClient
+
+
+##########################################################################
+# Qwen3.5 Workflow Block Manifest
+##########################################################################
+class BlockManifest(WorkflowBlockManifest):
+    model_config = ConfigDict(
+        json_schema_extra={
+            "name": "Qwen3.5",
+            "version": "v2",
+            "short_description": "Run Qwen3.5 on an image.",
+            "long_description": (
+                "This workflow block runs Qwen3.5—a vision language model that accepts an image "
+                "and an optional text prompt—and returns a text answer based on a conversation template."
+            ),
+            "license": "Apache-2.0",
+            "block_type": "model",
+            "search_keywords": [
+                "Qwen3.5",
+                "qwen3.5",
+                "vision language model",
+                "VLM",
+                "Alibaba",
+            ],
+            "is_vlm_block": True,
+            "ui_manifest": {
+                "section": "model",
+                "icon": "fal fa-atom",
+                "blockPriority": 5.7,
+            },
+        },
+        protected_namespaces=(),
+    )
+    type: Literal["roboflow_core/qwen3_5vl@v2"]
+
+    images: Selector(kind=[IMAGE_KIND]) = ImageInputField
+    prompt: Optional[str] = Field(
+        default=None,
+        description="Optional text prompt to provide additional context to Qwen3.5. Otherwise it will just be a default one, which may affect the desired model behavior.",
+        examples=["What is in this image?"],
+    )
+    model_version: Union[
+        Literal["qwen3_5-0.8b", "qwen3_5-2b", "qwen3_5-4b"],
+        Selector(kind=[ROBOFLOW_MODEL_ID_KIND]),
+        str,
+    ] = Field(
+        default="qwen3_5-0.8b",
+        description="The Qwen3.5 model to be used for inference.",
+        examples=["qwen3_5-0.8b", "qwen3_5-2b", "qwen3_5-4b"],
+    )
+
+    system_prompt: Optional[str] = Field(
+        default=None,
+        description="Optional system prompt to provide additional context to Qwen3.5.",
+        examples=["You are a helpful assistant."],
+    )
+
+    max_new_tokens: Optional[int] = Field(
+        default=None,
+        description="Maximum number of tokens to generate. If not set, the model's default will be used.",
+    )
+
+    @classmethod
+    def describe_outputs(cls) -> List[OutputDefinition]:
+        return [
+            OutputDefinition(
+                name="parsed_output",
+                kind=[DICTIONARY_KIND],
+                description="A parsed version of the output, provided as a dictionary containing the text.",
+            ),
+        ]
+
+    @classmethod
+    def get_parameters_accepting_batches(cls) -> List[str]:
+        return ["images"]
+
+    @classmethod
+    def get_execution_engine_compatibility(cls) -> Optional[str]:
+        return ">=1.3.0,<2.0.0"
+
+    @classmethod
+    def get_supported_model_variants(cls) -> Optional[List[str]]:
+        return ["qwen3_5-0.8b", "qwen3_5-2b", "qwen3_5-4b"]
+
+
+##########################################################################
+# Qwen3.5 Workflow Block
+##########################################################################
+class Qwen35VLBlockV2(WorkflowBlock):
+    def __init__(
+        self,
+        model_manager: ModelManager,
+        api_key: Optional[str],
+        step_execution_mode: StepExecutionMode,
+    ):
+        self._model_manager = model_manager
+        self._api_key = api_key
+        self._step_execution_mode = step_execution_mode
+
+    @classmethod
+    def get_init_parameters(cls) -> List[str]:
+        return ["model_manager", "api_key", "step_execution_mode"]
+
+    @classmethod
+    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
+        return BlockManifest
+
+    def run(
+        self,
+        images: Batch[WorkflowImageData],
+        model_version: str,
+        prompt: Optional[str],
+        system_prompt: Optional[str],
+        max_new_tokens: Optional[int] = None,
+    ) -> BlockResult:
+        if self._step_execution_mode == StepExecutionMode.LOCAL:
+            return self.run_locally(
+                images=images,
+                model_version=model_version,
+                prompt=prompt,
+                system_prompt=system_prompt,
+                max_new_tokens=max_new_tokens,
+            )
+        elif self._step_execution_mode == StepExecutionMode.REMOTE:
+            return self.run_remotely(
+                images=images,
+                model_version=model_version,
+                prompt=prompt,
+                system_prompt=system_prompt,
+                max_new_tokens=max_new_tokens,
+            )
+        else:
+            raise ValueError(
+                f"Unknown step execution mode: {self._step_execution_mode}"
+            )
+
+    def run_remotely(
+        self,
+        images: Batch[WorkflowImageData],
+        model_version: str,
+        prompt: Optional[str],
+        system_prompt: Optional[str],
+        max_new_tokens: Optional[int] = None,
+    ) -> BlockResult:
+        api_url = (
+            LOCAL_INFERENCE_API_URL
+            if WORKFLOWS_REMOTE_API_TARGET != "hosted"
+            else HOSTED_CORE_MODEL_URL
+        )
+        client = InferenceHTTPClient(
+            api_url=api_url,
+            api_key=self._api_key,
+        )
+        if WORKFLOWS_REMOTE_API_TARGET == "hosted":
+            client.select_api_v0()
+
+        prompt = prompt or "Describe what's in this image."
+        system_prompt = (
+            system_prompt
+            or "You are a Qwen3.5 model that can answer questions about any image."
+        )
+        combined_prompt = prompt + "<system_prompt>" + system_prompt
+
+        predictions = []
+        for image in images:
+            result = client.infer_lmm(
+                inference_input=image.base64_image,
+                model_id=model_version,
+                prompt=combined_prompt,
+                model_id_in_path=True,
+                enable_thinking=False,
+                max_new_tokens=max_new_tokens,
+            )
+            response_text = result.get("response", result)
+            predictions.append({"parsed_output": response_text})
+
+        return predictions
+
+    def run_locally(
+        self,
+        images: Batch[WorkflowImageData],
+        model_version: str,
+        prompt: Optional[str],
+        system_prompt: Optional[str],
+        max_new_tokens: Optional[int] = None,
+    ) -> BlockResult:
+        inference_images = [
+            i.to_inference_format(numpy_preferred=False) for i in images
+        ]
+        prompt = prompt or "Describe what's in this image."
+        system_prompt = system_prompt or "You are a helpful assistant."
+        prompts = [prompt + "<system_prompt>" + system_prompt] * len(inference_images)
+        self._model_manager.add_model(model_id=model_version, api_key=self._api_key)
+
+        predictions = []
+        for image, single_prompt in zip(inference_images, prompts):
+            request_kwargs = dict(
+                api_key=self._api_key,
+                model_id=model_version,
+                image=image,
+                source="workflow-execution",
+                prompt=single_prompt,
+                enable_thinking=False,
+            )
+            if max_new_tokens is not None:
+                request_kwargs["max_new_tokens"] = max_new_tokens
+            request = LMMInferenceRequest(**request_kwargs)
+            prediction = self._model_manager.infer_from_request_sync(
+                model_id=model_version, request=request
+            )
+            response_text = prediction.response
+            predictions.append({"parsed_output": response_text})
+        return predictions
diff --git a/inference/models/utils.py b/inference/models/utils.py
@@ -1001,6 +1001,7 @@ def get_roboflow_model(*args, **kwargs):
         for variant in [
             "qwen3_5-0.8b",
             "qwen3_5-2b",
+            "qwen3_5-4b",
             "qwen3_5-0.8b-peft",
             "qwen3_5-2b-peft",
         ]:
diff --git a/inference_models/docs/models/qwen35.md b/inference_models/docs/models/qwen35.md
@@ -27,6 +27,7 @@ Qwen3.5 pre-trained models are available and do **not** require a Roboflow API k
 |----------|-------------|
 | `qwen3_5-0.8b` | 0.8B parameter model - compact and efficient |
 | `qwen3_5-2b` | 2B parameter model - better accuracy |
+| `qwen3_5-4b` | 4B parameter model - highest accuracy |
 
 You can also use fine-tuned models from Roboflow by specifying `project/version` as the model ID (requires API key).
 
diff --git a/inference_models/inference_models/models/qwen3_5/qwen3_5_hf.py b/inference_models/inference_models/models/qwen3_5/qwen3_5_hf.py
@@ -181,6 +181,8 @@ def pre_process_generation(
         enable_thinking: bool = False,
         **kwargs,
     ) -> dict:
+        if isinstance(images, np.ndarray):
+            images = images[:, :, ::-1].copy()
         # Handle prompt and system prompt parsing logic from original implementation
         if prompt is None:
             prompt = "Describe what's in this image."
diff --git a/inference_models/inference_models/models/qwen3vl/qwen3vl_hf.py b/inference_models/inference_models/models/qwen3vl/qwen3vl_hf.py
@@ -204,6 +204,8 @@ def pre_process_generation(
         image_size: Optional[Tuple[int, int]] = None,
         **kwargs,
     ) -> dict:
+        if isinstance(images, np.ndarray):
+            images = images[:, :, ::-1].copy()
         # Handle prompt and system prompt parsing logic from original implementation
         if prompt is None:
             prompt = "Describe what's in this image."
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -113,7 +113,7 @@ nav:
           - Segment Anything (Segmentation): foundation/sam.md
           - Segment Anything 2 (Segmentation): foundation/sam2.md
           - Segment Anything 3 (Segmentation): foundation/sam3.md
-          - Qwen 3.5: foundation/qwen3.5-vl.md
+          - Qwen 3.5: foundation/qwen3.5.md
           - SmolVLM2: foundation/smolvlm.md
           - YOLO-World (Object Detection): foundation/yolo_world.md
           - OwlV2 (Object Detection): foundation/owlv2.md
diff --git a/tests/workflows/unit_tests/core_steps/models/foundation/test_vlm_remote_execution.py b/tests/workflows/unit_tests/core_steps/models/foundation/test_vlm_remote_execution.py

Original file line number	Diff line number	Diff line change
`@@ -330,7 +330,7 @@ class LMMInferenceRequest(CVInferenceRequest):`
`330`	`330`	`)`
`331`	`331`	`enable_thinking: bool = Field(`
`332`	`332`	`default=False,`
`333`		`- description="If true, enables thinking/reasoning mode for models that support it (e.g. Qwen3.5-VL). The model's reasoning will be included in the response.",`
	`333`	`+ description="If true, enables thinking/reasoning mode for models that support it (e.g. Qwen3.5). The model's reasoning will be included in the response.",`
`334`	`334`	`)`
`335`	`335`	`max_new_tokens: Optional[int] = Field(`
`336`	`336`	`default=None,`
Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,7 @@`
`70`	`70`	`"perception_encoder": ("embed", "perception_encoder"),`
`71`	`71`	`"qwen3_5-0.8b": ("lmm", "qwen3_5-0.8b"),`
`72`	`72`	`"qwen3_5-2b": ("lmm", "qwen3_5-2b"),`
	`73`	`+ "qwen3_5-4b": ("lmm", "qwen3_5-4b"),`
`73`	`74`	`}`
`74`	`75`
`75`	`76`	`STUB_VERSION_ID = "0"`