Fix GLM OCR token forwarding (#2216)

hansent · web-flow · commit 410890af35d1 · 2026-04-10T11:05:33.000Z
diff --git a/inference/core/workflows/core_steps/models/foundation/glm_ocr/v1.py b/inference/core/workflows/core_steps/models/foundation/glm_ocr/v1.py
@@ -106,6 +106,10 @@ class BlockManifest(WorkflowBlockManifest):
             },
         },
     )
+    max_new_tokens: Optional[int] = Field(
+        default=None,
+        description="Maximum number of tokens to generate. If not set, the model default will be used.",
+    )
 
     model_config = ConfigDict(
         json_schema_extra={
@@ -199,19 +203,22 @@ def run(
         model_version: str,
         task_type: str,
         prompt: Optional[str],
+        max_new_tokens: Optional[int] = None,
     ) -> BlockResult:
         resolved_prompt = _resolve_prompt(task_type, prompt)
         if self._step_execution_mode == StepExecutionMode.LOCAL:
             return self.run_locally(
                 images=images,
                 model_version=model_version,
                 prompt=resolved_prompt,
+                max_new_tokens=max_new_tokens,
             )
         elif self._step_execution_mode == StepExecutionMode.REMOTE:
             return self.run_remotely(
                 images=images,
                 model_version=model_version,
                 prompt=resolved_prompt,
+                max_new_tokens=max_new_tokens,
             )
         else:
             raise ValueError(
@@ -223,6 +230,7 @@ def run_remotely(
         images: Batch[WorkflowImageData],
         model_version: str,
         prompt: str,
+        max_new_tokens: Optional[int] = None,
     ) -> BlockResult:
         api_url = (
             LOCAL_INFERENCE_API_URL
@@ -243,6 +251,7 @@ def run_remotely(
                 model_id=model_version,
                 prompt=prompt,
                 model_id_in_path=True,
+                max_new_tokens=max_new_tokens,
             )
             response_text = result.get("response", result)
             predictions.append({"parsed_output": response_text})
@@ -254,6 +263,7 @@ def run_locally(
         images: Batch[WorkflowImageData],
         model_version: str,
         prompt: str,
+        max_new_tokens: Optional[int] = None,
     ) -> BlockResult:
         inference_images = [
             i.to_inference_format(numpy_preferred=False) for i in images
@@ -263,13 +273,16 @@ def run_locally(
 
         predictions = []
         for image in inference_images:
-            request = LMMInferenceRequest(
+            request_kwargs = dict(
                 api_key=self._api_key,
                 model_id=model_version,
                 image=image,
                 source="workflow-execution",
                 prompt=prompt,
             )
+            if max_new_tokens is not None:
+                request_kwargs["max_new_tokens"] = max_new_tokens
+            request = LMMInferenceRequest(**request_kwargs)
             prediction = self._model_manager.infer_from_request_sync(
                 model_id=model_version, request=request
             )
diff --git a/inference/core/workflows/core_steps/models/foundation/qwen3_5vl/v1.py b/inference/core/workflows/core_steps/models/foundation/qwen3_5vl/v1.py
@@ -182,6 +182,8 @@ def run(
                 model_version=model_version,
                 prompt=prompt,
                 system_prompt=system_prompt,
+                enable_thinking=enable_thinking,
+                max_new_tokens=max_new_tokens,
             )
         else:
             raise ValueError(
@@ -194,6 +196,8 @@ def run_remotely(
         model_version: str,
         prompt: Optional[str],
         system_prompt: Optional[str],
+        enable_thinking: bool = False,
+        max_new_tokens: Optional[int] = None,
     ) -> BlockResult:
         api_url = (
             LOCAL_INFERENCE_API_URL
@@ -221,6 +225,8 @@ def run_remotely(
                 model_id=model_version,
                 prompt=combined_prompt,
                 model_id_in_path=True,
+                enable_thinking=enable_thinking,
+                max_new_tokens=max_new_tokens,
             )
             response_text = result.get("response", result)
             predictions.append({"parsed_output": response_text, "thinking": ""})
diff --git a/inference_models/inference_models/models/glm_ocr/glm_ocr_hf.py b/inference_models/inference_models/models/glm_ocr/glm_ocr_hf.py
@@ -3,7 +3,7 @@
 """
 
 from threading import Lock
-from typing import Any, List, Union
+from typing import Any, List, Optional, Union
 
 import numpy as np
 import torch
@@ -99,7 +99,7 @@ def recognize_table(
         self,
         images: Union[torch.Tensor, List[torch.Tensor], np.ndarray, List[np.ndarray]],
         input_color_format: ColorFormat = None,
-        max_new_tokens: int = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
+        max_new_tokens: Optional[int] = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
         do_sample: bool = INFERENCE_MODELS_GLM_OCR_DEFAULT_DO_SAMPLE,
         skip_special_tokens: bool = True,
         **kwargs,
@@ -118,7 +118,7 @@ def recognize_formula(
         self,
         images: Union[torch.Tensor, List[torch.Tensor], np.ndarray, List[np.ndarray]],
         input_color_format: ColorFormat = None,
-        max_new_tokens: int = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
+        max_new_tokens: Optional[int] = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
         do_sample: bool = INFERENCE_MODELS_GLM_OCR_DEFAULT_DO_SAMPLE,
         skip_special_tokens: bool = True,
         **kwargs,
@@ -137,7 +137,7 @@ def recognize_text(
         self,
         images: Union[torch.Tensor, List[torch.Tensor], np.ndarray, List[np.ndarray]],
         input_color_format: ColorFormat = None,
-        max_new_tokens: int = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
+        max_new_tokens: Optional[int] = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
         do_sample: bool = INFERENCE_MODELS_GLM_OCR_DEFAULT_DO_SAMPLE,
         skip_special_tokens: bool = True,
         **kwargs,
@@ -157,7 +157,7 @@ def prompt(
         images: Union[torch.Tensor, List[torch.Tensor], np.ndarray, List[np.ndarray]],
         prompt: str = None,
         input_color_format: ColorFormat = None,
-        max_new_tokens: int = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
+        max_new_tokens: Optional[int] = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
         do_sample: bool = INFERENCE_MODELS_GLM_OCR_DEFAULT_DO_SAMPLE,
         skip_special_tokens: bool = True,
         **kwargs,
@@ -211,10 +211,12 @@ def pre_process_generation(
     def generate(
         self,
         inputs: dict,
-        max_new_tokens: int = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
+        max_new_tokens: Optional[int] = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
         do_sample: bool = INFERENCE_MODELS_GLM_OCR_DEFAULT_DO_SAMPLE,
         **kwargs,
     ) -> torch.Tensor:
+        if max_new_tokens is None:
+            max_new_tokens = INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS
         input_len = inputs["input_ids"].shape[-1]
 
         with self._lock, torch.inference_mode():
diff --git a/inference_models/tests/unit_tests/models/test_glm_ocr_hf.py b/inference_models/tests/unit_tests/models/test_glm_ocr_hf.py
@@ -0,0 +1,24 @@
+from unittest.mock import MagicMock
+
+import numpy as np
+
+from inference_models.configuration import (
+    INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS,
+)
+from inference_models.models.glm_ocr.glm_ocr_hf import GlmOcrHF
+
+
+def test_generate_uses_default_max_new_tokens_when_none_is_given() -> None:
+    model = MagicMock()
+    model.generate.return_value = np.array([[11, 12, 21, 22]])
+    glm_ocr = GlmOcrHF(model=model, processor=MagicMock(), device=MagicMock())
+    # when: the caller passes max_new_tokens=None explicitly
+    result = glm_ocr.generate(
+        inputs={"input_ids": np.array([[11, 12]])},
+        max_new_tokens=None,
+    )
+    # then: the configured default is forwarded and the prompt ids are cut off
+    assert model.generate.call_args.kwargs["max_new_tokens"] == (
+        INFERENCE_MODELS_GLM_OCR_DEFAULT_MAX_NEW_TOKENS
+    )
+    assert result.tolist() == [[21, 22]]
diff --git a/inference_sdk/http/client.py b/inference_sdk/http/client.py
@@ -1601,6 +1601,8 @@ def infer_lmm(
         model_id: str,
         prompt: Optional[str] = None,
         model_id_in_path: bool = False,
+        max_new_tokens: Optional[int] = None,
+        enable_thinking: Optional[bool] = None,
     ) -> Union[dict, List[dict]]:
         """Run inference using a Large Multimodal Model (LMM).
 
@@ -1620,6 +1622,10 @@ def infer_lmm(
             model_id_in_path (bool, optional): If True, includes model_id in the URL path
                 (e.g., /infer/lmm/florence-2-base) which enables path-based routing.
                 If False (default), model_id is only sent in the request body.
+            max_new_tokens (Optional[int], optional): Maximum number of tokens to generate.
+                If not provided, the server-side model default is used.
+            enable_thinking (Optional[bool], optional): Enables reasoning mode for models
+                that support it. If not provided, the server-side model default is used.
 
         Returns:
             Union[dict, List[dict]]: Inference results containing the model response.
@@ -1632,6 +1638,10 @@ def infer_lmm(
         extra_payload = {"model_id": model_id}
         if prompt is not None:
             extra_payload["prompt"] = prompt
+        if max_new_tokens is not None:
+            extra_payload["max_new_tokens"] = max_new_tokens
+        if enable_thinking is not None:
+            extra_payload["enable_thinking"] = enable_thinking
 
         if model_id_in_path:
             endpoint = f"/infer/lmm/{model_id}"
@@ -1652,6 +1662,8 @@ async def infer_lmm_async(
         model_id: str,
         prompt: Optional[str] = None,
         model_id_in_path: bool = False,
+        max_new_tokens: Optional[int] = None,
+        enable_thinking: Optional[bool] = None,
     ) -> Union[dict, List[dict]]:
         """Run inference using a Large Multimodal Model (LMM) asynchronously.
 
@@ -1666,6 +1678,10 @@ async def infer_lmm_async(
             model_id_in_path (bool, optional): If True, includes model_id in the URL path
                 (e.g., /infer/lmm/florence-2-base) which enables path-based routing.
                 If False (default), model_id is only sent in the request body.
+            max_new_tokens (Optional[int], optional): Maximum number of tokens to generate.
+                If not provided, the server-side model default is used.
+            enable_thinking (Optional[bool], optional): Enables reasoning mode for models
+                that support it. If not provided, the server-side model default is used.
 
         Returns:
             Union[dict, List[dict]]: Inference results containing the model response.
@@ -1677,6 +1693,10 @@ async def infer_lmm_async(
         extra_payload = {"model_id": model_id}
         if prompt is not None:
             extra_payload["prompt"] = prompt
+        if max_new_tokens is not None:
+            extra_payload["max_new_tokens"] = max_new_tokens
+        if enable_thinking is not None:
+            extra_payload["enable_thinking"] = enable_thinking
 
         if model_id_in_path:
             endpoint = f"/infer/lmm/{model_id}"
diff --git a/tests/inference_sdk/unit_tests/http/test_client.py b/tests/inference_sdk/unit_tests/http/test_client.py
@@ -3755,6 +3755,39 @@ def test_infer_from_workflow_when_no_parameters_given(
     }, "Request payload must contain api key and inputs"
 
 
+@mock.patch.object(client, "load_static_inference_input")
+def test_infer_lmm_when_generation_parameters_given(
+    load_static_inference_input_mock: MagicMock,
+    requests_mock: Mocker,
+) -> None:
+    api_url = "http://some.com"
+    http_client = InferenceHTTPClient(api_key="my-api-key", api_url=api_url)
+    load_static_inference_input_mock.return_value = [("base64_image", 0.5)]
+    requests_mock.post(
+        f"{api_url}/infer/lmm/glm-ocr",
+        json={"response": "recognized text"},
+    )
+    # when: both optional generation parameters are supplied
+    result = http_client.infer_lmm(
+        inference_input="/some/image.jpg",
+        model_id="glm-ocr",
+        prompt="Text Recognition:",
+        model_id_in_path=True,
+        max_new_tokens=4096,
+        enable_thinking=True,
+    )
+    # then: both values appear unchanged in the JSON request body
+    assert result == {"response": "recognized text"}
+    assert requests_mock.request_history[0].json() == {
+        "api_key": "my-api-key",
+        "image": {"type": "base64", "value": "base64_image"},
+        "model_id": "glm-ocr",
+        "prompt": "Text Recognition:",
+        "max_new_tokens": 4096,
+        "enable_thinking": True,
+    }
+
+
 @mock.patch.object(client, "load_nested_batches_of_inference_input")
 @pytest.mark.parametrize(
     "legacy_endpoints, endpoint_to_use, parameter_name",
diff --git a/tests/workflows/unit_tests/core_steps/models/foundation/test_vlm_remote_execution.py b/tests/workflows/unit_tests/core_steps/models/foundation/test_vlm_remote_execution.py
@@ -1,4 +1,4 @@
-"""Unit tests for VLM blocks remote execution (Florence2, Moondream2, SmolVLM, Qwen)."""
+"""Unit tests for VLM blocks remote execution."""
 
 from unittest.mock import MagicMock, patch
 
@@ -171,6 +171,48 @@ def test_run_remotely_calls_infer_lmm(
         mock_client.infer_lmm.assert_called_once()
 
 
+class TestGLMOCRRemote:
+    """Tests for GLM-OCR remote execution."""
+
+    @patch(
+        "inference.core.workflows.core_steps.models.foundation.glm_ocr.v1.InferenceHTTPClient"
+    )
+    def test_run_remotely_forwards_max_new_tokens(
+        self, mock_client_cls, mock_model_manager, mock_workflow_image_data
+    ):
+        from inference.core.workflows.core_steps.models.foundation.glm_ocr.v1 import (
+            GLMOCRBlockV1,
+        )
+        # given: the SDK client is replaced by a mock returning a fixed response
+        mock_client = MagicMock()
+        mock_client.infer_lmm.return_value = {"response": "recognized text"}
+        mock_client_cls.return_value = mock_client
+
+        block = GLMOCRBlockV1(
+            model_manager=mock_model_manager,
+            api_key="test_api_key",
+            step_execution_mode=StepExecutionMode.REMOTE,
+        )
+        # when: the block runs in REMOTE mode with an explicit token limit
+        result = block.run(
+            images=[mock_workflow_image_data],
+            model_version="glm-ocr",
+            task_type="text-recognition",
+            prompt=None,
+            max_new_tokens=4096,
+        )
+        # then: infer_lmm gets the limit and the prompt resolved from task_type
+        assert len(result) == 1
+        assert result[0]["parsed_output"] == "recognized text"
+        mock_client.infer_lmm.assert_called_once_with(
+            inference_input=mock_workflow_image_data.base64_image,
+            model_id="glm-ocr",
+            prompt="Text Recognition:",
+            model_id_in_path=True,
+            max_new_tokens=4096,
+        )
+
+
 class TestQwen25VLRemote:
     """Tests for Qwen2.5-VL remote execution."""
 
@@ -206,6 +248,50 @@ def test_run_remotely_calls_infer_lmm(
         mock_client.infer_lmm.assert_called_once()
 
 
+class TestQwen35VLRemote:
+    """Tests for Qwen3.5-VL remote execution."""
+
+    @patch(
+        "inference.core.workflows.core_steps.models.foundation.qwen3_5vl.v1.InferenceHTTPClient"
+    )
+    def test_run_remotely_forwards_generation_parameters(
+        self, mock_client_cls, mock_model_manager, mock_workflow_image_data
+    ):
+        from inference.core.workflows.core_steps.models.foundation.qwen3_5vl.v1 import (
+            Qwen35VLBlockV1,
+        )
+        # given: the SDK client is replaced by a mock returning a fixed response
+        mock_client = MagicMock()
+        mock_client.infer_lmm.return_value = {"response": "This is a test response."}
+        mock_client_cls.return_value = mock_client
+
+        block = Qwen35VLBlockV1(
+            model_manager=mock_model_manager,
+            api_key="test_api_key",
+            step_execution_mode=StepExecutionMode.REMOTE,
+        )
+        # when: the block runs in REMOTE mode with both generation parameters set
+        result = block.run(
+            images=[mock_workflow_image_data],
+            model_version="qwen3_5-2b",
+            prompt="Describe this image",
+            system_prompt="You are helpful.",
+            enable_thinking=True,
+            max_new_tokens=1024,
+        )
+        # then: the mocked response is surfaced and both parameters reach infer_lmm
+        assert len(result) == 1
+        assert result[0]["parsed_output"] == "This is a test response."
+        mock_client.infer_lmm.assert_called_once_with(
+            inference_input=mock_workflow_image_data.base64_image,
+            model_id="qwen3_5-2b",
+            prompt="Describe this image<system_prompt>You are helpful.",
+            model_id_in_path=True,
+            enable_thinking=True,
+            max_new_tokens=1024,
+        )
+
+
 class TestQwen3VLRemote:
     """Tests for Qwen3-VL remote execution."""