From e91968d1985cafbe40802bd46c67f3de76769440 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Mon, 30 Mar 2026 18:36:10 +0800
Subject: [PATCH 1/8] Move VL image processors into fastdeploy.input.image_processors

---
 .../image_preprocessor/__init__.py            |  10 +-
 .../get_image_preprocessor.py                 |  23 +-
 .../image_preprocessor_adaptive.py            | 502 +----------------
 fastdeploy/input/image_processors/__init__.py |  14 +
 .../image_processors/adaptive_processor.py    | 527 ++++++++++++++++++
 .../image_processors/paddleocr_processor.py   | 227 ++++++++
 .../input/image_processors/qwen3_processor.py | 333 +++++++++++
 .../input/image_processors/qwen_processor.py  | 332 +++++++++++
 .../paddleocr_vl_processor/image_processor.py | 220 +-------
 .../qwen3_vl_processor/image_processor.py     | 320 +----------
 .../qwen_vl_processor/image_processor.py      | 319 +----------
 11 files changed, 1473 insertions(+), 1354 deletions(-)
 create mode 100644 fastdeploy/input/image_processors/adaptive_processor.py
 create mode 100644 fastdeploy/input/image_processors/paddleocr_processor.py
 create mode 100644 fastdeploy/input/image_processors/qwen3_processor.py
 create mode 100644 fastdeploy/input/image_processors/qwen_processor.py

diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py
index c11444e6758..ec3d3e833c0 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py
@@ -14,7 +14,13 @@
 # limitations under the License.
 """
 
-from .get_image_preprocessor import get_image_preprocessor
-from .image_preprocessor_adaptive import AdaptiveImageProcessor
+# Backward compatibility: this module has been migrated to
+# fastdeploy.input.image_processors.adaptive_processor
+# This file will be removed in a future version.
+
+from fastdeploy.input.image_processors.adaptive_processor import (  # noqa: F401
+    AdaptiveImageProcessor,
+    get_image_preprocessor,
+)
 
 __all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py
index 0ff6f7d1ed5..ead34a0ce0b 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py
@@ -14,21 +14,10 @@
 # limitations under the License.
 """
 
-"""get image preprocessor"""
+# Backward compatibility: this module has been migrated to
+# fastdeploy.input.image_processors.adaptive_processor
+# This file will be removed in a future version.
 
-from fastdeploy.utils import data_processor_logger
-
-from .image_preprocessor_adaptive import AdaptiveImageProcessor
-
-
-def get_image_preprocessor(args):
-    """
-    get_image_preprocessor from args
-    """
-
-    if args.vision_model_name_or_path is None:
-        return None
-
-    data_processor_logger.info("use AdaptiveImageProcessor")
-    image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
-    return image_preprocess
+from fastdeploy.input.image_processors.adaptive_processor import (  # noqa: F401
+    get_image_preprocessor,
+)
diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py
index cd81274654e..deaa5494c12 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py
@@ -14,498 +14,12 @@
 # limitations under the License.
 """
 
-"""image preprocessor adaptive"""
-
-from typing import List, Optional, Union
-
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
-    convert_to_rgb,
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
+# Backward compatibility: this module has been migrated to
+# fastdeploy.input.image_processors.adaptive_processor
+# This file will be removed in a future version.
+
+from fastdeploy.input.image_processors.adaptive_processor import (  # noqa: F401
+    AdaptiveImageProcessor,
+    make_batched_images,
+    make_batched_videos,
 )
-from paddleformers.transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    get_image_size,
-    infer_channel_dimension_format,
-    is_valid_image,
-    make_list_of_images,
-    to_numpy_array,
-    valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.input.image_processors.common import is_scaled_image
-from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
-from fastdeploy.utils import data_processor_logger
-
-OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-IMAGE_FACTOR = 28
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-MAX_RATIO = 200
-
-
-VideoInput = Union[
-    List["PIL.Image.Image"],
-    "np.ndarray",
-    "paddle.Tensor",
-    List["np.ndarray"],
-    List["paddle.Tensor"],
-    List[List["PIL.Image.Image"]],
-    List[List["np.ndarrray"]],
-    List[List["paddle.Tensor"]],
-]
-
-
-__all__ = [
-    "AdaptiveImageProcessor",
-]
-
-
-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-
-    Returns:
-        list: A list of images.
-    """
-    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
-        return [img for img_list in images for img in img_list]
-
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-
-    elif is_valid_image(images):
-        return [images]
-
-    raise ValueError(f"Could not make batched images from {images}")
-
-
-# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
-def make_batched_videos(videos) -> List[VideoInput]:
-    """dummy"""
-    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
-        return videos
-
-    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
-        if isinstance(videos[0], Image.Image):
-            return [videos]
-        elif len(videos[0].shape) == 4:
-            return [list(video) for video in videos]
-
-    elif is_valid_image(videos) and len(videos.shape) == 4:
-        return [list(videos)]
-
-    raise ValueError(f"Could not make batched video from {videos}")
-
-
-class AdaptiveImageProcessor(BaseImageProcessor):
-    r"""
-    Constructs a adaptive image processor that dynamically resizes images based on the original images.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
-            Resampling filter to use when resizing the image.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`.
-        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image.
-        do_normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the image.
-        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
-            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
-        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
-            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
-            in the image.
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
-            Whether to convert the image to RGB.
-        min_pixels (`int`, *optional*, defaults to `56 * 56`):
-            The min pixels of the image to resize the image.
-        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
-            The max pixels of the image to resize the image.
-        patch_size (`int`, *optional*, defaults to 14):
-            The spacial patch size of the vision encoder.
-        temporal_conv_size (`int`, *optional*, defaults to 2):
-            The temporal conv size in resampler.
-        merge_size (`int`, *optional*, defaults to 2):
-            The merge size of the vision encoder to llm encoder.
-    """
-
-    model_input_names = [
-        "pixel_values",
-        "image_grid_thw",
-        "pixel_values_videos",
-        "video_grid_thw",
-    ]
-
-    def __init__(
-        self,
-        do_resize: bool = True,
-        resample: PILImageResampling = PILImageResampling.BICUBIC,
-        do_rescale: bool = True,
-        rescale_factor: float = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = True,
-        min_pixels: int = 56 * 56,
-        max_pixels: int = 28 * 28 * 1280,
-        patch_size: int = 14,
-        temporal_conv_size: int = 2,
-        merge_size: int = 2,
-        **kwargs,
-    ) -> None:
-        """init"""
-        super().__init__(**kwargs)
-        self.do_resize = do_resize
-        self.resample = resample
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
-        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-        self.patch_size = patch_size
-        self.temporal_conv_size = temporal_conv_size
-        self.merge_size = merge_size
-        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
-        self.do_convert_rgb = do_convert_rgb
-
-    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
-        """设定pixels"""
-        if min_pixels is not None:
-            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
-            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
-            self.min_pixels = min_pixels
-            self.size["min_pixels"] = int(min_pixels)
-        if max_pixels is not None:
-            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
-            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
-            self.max_pixels = max_pixels
-            self.size["max_pixels"] = int(max_pixels)
-
-    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
-        """dummy"""
-        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
-        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor=self.patch_size * self.merge_size,
-            min_pixels=actual_min_pixels,
-            max_pixels=actual_max_pixels,
-        )
-        return (resized_height, resized_width), (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-
-    def _preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        do_resize: bool = True,
-        resample: PILImageResampling = None,
-        do_rescale: bool = True,
-        rescale_factor: float = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = False,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        predetermined_grid_thw=None,
-    ):
-        """
-        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
-
-        Args:
-            images (`ImageInput`):
-                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
-                If pixel values range from 0 to 1, set `do_rescale=False`.
-            vision_info (`List[Dict]`, *optional*):
-                Optional list of dictionaries containing additional information about vision inputs.
-            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
-                Whether to resize the image.
-            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
-                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether to rescale the image.
-            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
-                Scale factor to use if rescaling the image.
-            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
-                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
-                Mean to use if normalizing the image.
-                Can be a float or a list of floats corresponding to the number of channels in the image.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
-                Standard deviation to use if normalizing the image.
-                Can be a float or a list of floats corresponding to the number of channels in the image.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
-                The channel dimension format for the output image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - Unset: Use the channel dimension format of the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-        """
-        images = make_list_of_images(images)
-
-        if do_convert_rgb:
-            images = [convert_to_rgb(image) for image in images]
-
-        # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
-
-        if is_scaled_image(images[0]) and do_rescale:
-            data_processor_logger.warning(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
-        resized_height, resized_width = height, width
-        processed_images = []
-
-        if predetermined_grid_thw is not None:
-            assert len(predetermined_grid_thw) == len(
-                images
-            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
-
-        for img_idx, image in enumerate(images):
-            if do_resize:
-                if predetermined_grid_thw is not None:
-                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
-                    resized_height *= self.patch_size
-                    resized_width *= self.patch_size
-                else:
-                    resized_height, resized_width = smart_resize(
-                        height,
-                        width,
-                        factor=self.patch_size * self.merge_size,
-                        min_pixels=self.min_pixels,
-                        max_pixels=self.max_pixels,
-                    )
-                image = image.astype("uint8")  # TODO : 需要手动加上，否则多除255 导致结果会出错
-                # 直接fromarray，不要靠paddleformers里面的
-                image = Image.fromarray(image)
-                image = resize(
-                    image,
-                    size=(resized_height, resized_width),
-                    resample=resample,
-                    data_format=input_data_format,
-                )
-            if do_rescale:
-                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
-            if do_normalize:
-                image = normalize(
-                    image=image,
-                    mean=image_mean,
-                    std=image_std,
-                    data_format=input_data_format,
-                )
-
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
-
-            processed_images.append(image)
-        patches = np.array(processed_images)
-        if data_format == ChannelDimension.LAST:
-            patches = patches.transpose([0, 3, 1, 2])
-
-        channel = patches.shape[1]  # [time, C, H, W]
-        grid_t = patches.shape[0]
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-        patches = patches.reshape(
-            [
-                grid_t,
-                channel,
-                grid_h // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-                grid_w // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-            ]
-        )
-        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
-        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
-
-        flatten_patches = patches.reshape(
-            [
-                grid_t * grid_h * grid_w,
-                channel * self.patch_size * self.patch_size,
-            ]
-        )  # [grid_t * grid_h * grid_w, C * psz * psz]
-
-        return flatten_patches, (grid_t, grid_h, grid_w)
-
-    def preprocess(
-        self,
-        images: ImageInput,
-        videos: VideoInput = None,
-        do_resize: bool = True,
-        size: Optional[Union[int, List[int]]] = None,
-        resample: PILImageResampling = None,
-        do_rescale: bool = True,
-        rescale_factor: float = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = False,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        predetermined_grid_thw=None,
-    ):
-        """
-        Args:
-            images (`ImageInput`):
-                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
-                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
-            videos (`VideoInput`):
-                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
-                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
-            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
-                Whether to resize the image.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
-                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
-                the longest edge resized to keep the input aspect ratio.
-            resample (`int`, *optional*, defaults to `self.resample`):
-                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
-                has an effect if `do_resize` is set to `True`.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether to rescale the image.
-            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
-                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
-            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
-                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
-                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
-                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
-                `True`.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            return_tensors (`str` or `TensorType`, *optional*):
-                The type of tensors to return. Can be one of:
-                - Unset: Return a list of `np.ndarray`.
-                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
-                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
-            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
-                The channel dimension format for the output image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - Unset: Use the channel dimension format of the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. If unset, the channel dimension format is inferred
-                from the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-
-        """
-        do_resize = do_resize if do_resize is not None else self.do_resize
-        size = size if size is not None else self.size
-        resample = resample if resample is not None else self.resample
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-
-        if images is not None:
-            images = make_batched_images(images)
-        if videos is not None:
-            videos = make_batched_videos(videos)
-
-        if images is not None and not valid_images(images):
-            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
-        if images is not None:
-            pixel_values, vision_grid_thws = [], []
-            for img_idx, image in enumerate(images):
-                if predetermined_grid_thw is not None:
-                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
-                else:
-                    predetermined_grid_thw_one = None
-                patches, image_grid_thw = self._preprocess(
-                    image,
-                    do_resize=do_resize,
-                    resample=resample,
-                    do_rescale=do_rescale,
-                    rescale_factor=rescale_factor,
-                    do_normalize=do_normalize,
-                    image_mean=image_mean,
-                    image_std=image_std,
-                    data_format=data_format,
-                    do_convert_rgb=do_convert_rgb,
-                    input_data_format=input_data_format,
-                    predetermined_grid_thw=predetermined_grid_thw_one,
-                )
-                pixel_values.extend(patches)
-                vision_grid_thws.append(image_grid_thw)
-            pixel_values = np.array(pixel_values)
-            vision_grid_thws = np.array(vision_grid_thws)
-            data = {
-                "pixel_values": pixel_values,
-                "image_grid_thw": vision_grid_thws,
-            }
-
-        if videos is not None:
-            pixel_values, vision_grid_thws = [], []
-            for images in videos:
-                patches, video_grid_thw = self._preprocess(
-                    images,
-                    do_resize=do_resize,
-                    resample=resample,
-                    do_rescale=do_rescale,
-                    rescale_factor=rescale_factor,
-                    do_normalize=do_normalize,
-                    image_mean=image_mean,
-                    image_std=image_std,
-                    data_format=data_format,
-                    do_convert_rgb=do_convert_rgb,
-                    input_data_format=input_data_format,
-                    predetermined_grid_thw=predetermined_grid_thw,
-                )
-                pixel_values.extend(patches)
-                vision_grid_thws.append(video_grid_thw)
-            pixel_values = np.array(pixel_values)
-            vision_grid_thws = np.array(vision_grid_thws)
-
-            data = {
-                "pixel_values_videos": pixel_values,
-                "video_grid_thw": vision_grid_thws,
-            }
-
-        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/image_processors/__init__.py b/fastdeploy/input/image_processors/__init__.py
index a9cc79cc9d7..0f5df8e741b 100644
--- a/fastdeploy/input/image_processors/__init__.py
+++ b/fastdeploy/input/image_processors/__init__.py
@@ -11,3 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from fastdeploy.input.image_processors.adaptive_processor import (  # noqa: F401
+    AdaptiveImageProcessor,
+    get_image_preprocessor,
+)
+from fastdeploy.input.image_processors.paddleocr_processor import (  # noqa: F401
+    ImageProcessor as PaddleOCRImageProcessor,
+)
+from fastdeploy.input.image_processors.qwen3_processor import (  # noqa: F401
+    ImageProcessor as Qwen3ImageProcessor,
+)
+from fastdeploy.input.image_processors.qwen_processor import (  # noqa: F401
+    ImageProcessor as QwenImageProcessor,
+)
diff --git a/fastdeploy/input/image_processors/adaptive_processor.py b/fastdeploy/input/image_processors/adaptive_processor.py
new file mode 100644
index 00000000000..47e677e4917
--- /dev/null
+++ b/fastdeploy/input/image_processors/adaptive_processor.py
@@ -0,0 +1,527 @@
+"""
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+"""image preprocessor adaptive"""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import paddle
+import PIL
+from paddleformers.transformers.feature_extraction_utils import BatchFeature
+from paddleformers.transformers.image_processing_utils import BaseImageProcessor
+from paddleformers.transformers.image_transforms import (
+    convert_to_rgb,
+    normalize,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from paddleformers.transformers.image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_valid_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
+from PIL import Image
+
+from fastdeploy.input.image_processors.common import is_scaled_image
+from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
+from fastdeploy.utils import data_processor_logger
+
+OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
+OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+
+# Type alias for the video input forms accepted by `AdaptiveImageProcessor.preprocess`.
+VideoInput = Union[
+    List["PIL.Image.Image"],
+    "np.ndarray",
+    "paddle.Tensor",
+    List["np.ndarray"],
+    List["paddle.Tensor"],
+    List[List["PIL.Image.Image"]],
+    List[List["np.ndarray"]],
+    List[List["paddle.Tensor"]],
+]
+
+
+__all__ = [
+    "AdaptiveImageProcessor",
+    "get_image_preprocessor",
+    "make_batched_images",
+    "make_batched_videos",
+]
+
+
+def make_batched_images(images) -> List[List[ImageInput]]:
+    """
+    Accepts images in list or nested list format, and makes a list of images for preprocessing.
+    Args:
+        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): The input image.
+
+    Returns:
+        list: A list of images.
+    """
+    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
+        return [img for img_list in images for img in img_list]
+
+    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
+        return images
+
+    elif is_valid_image(images):
+        return [images]
+
+    raise ValueError(f"Could not make batched images from {images}")
+
+
+# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
+def make_batched_videos(videos) -> List[VideoInput]:
+    """Normalize `videos` into a list of videos, where each video is a list of frames."""
+    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
+        return videos
+
+    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
+        if isinstance(videos[0], Image.Image):
+            return [videos]
+        elif len(videos[0].shape) == 4:
+            return [list(video) for video in videos]
+
+    elif is_valid_image(videos) and len(videos.shape) == 4:
+        return [list(videos)]
+
+    raise ValueError(f"Could not make batched video from {videos}")
+
+
+class AdaptiveImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an adaptive image processor that dynamically resizes images based on the original images.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use when resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
+            in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        min_pixels (`int`, *optional*, defaults to `56 * 56`):
+            The min pixels of the image to resize the image.
+        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
+            The max pixels of the image to resize the image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The spatial patch size of the vision encoder.
+        temporal_conv_size (`int`, *optional*, defaults to 2):
+            The temporal conv size in resampler.
+        merge_size (`int`, *optional*, defaults to 2):
+            The merge size of the vision encoder to llm encoder.
+    """
+
+    model_input_names = [
+        "pixel_values",
+        "image_grid_thw",
+        "pixel_values_videos",
+        "video_grid_thw",
+    ]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        min_pixels: int = 56 * 56,
+        max_pixels: int = 28 * 28 * 1280,
+        patch_size: int = 14,
+        temporal_conv_size: int = 2,
+        merge_size: int = 2,
+        **kwargs,
+    ) -> None:
+        """init"""
+        super().__init__(**kwargs)
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.patch_size = patch_size
+        self.temporal_conv_size = temporal_conv_size
+        self.merge_size = merge_size
+        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
+        self.do_convert_rgb = do_convert_rgb
+
+    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
+        """Update the min/max pixel budget; `None` leaves a bound unchanged. `msg` prefixes the log line."""
+        if min_pixels is not None:
+            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be non-negative int"
+            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
+            self.min_pixels = min_pixels
+            self.size["min_pixels"] = int(min_pixels)
+        if max_pixels is not None:
+            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
+            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
+            self.max_pixels = max_pixels
+            self.size["max_pixels"] = int(max_pixels)
+
+    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
+        """Return the smart-resized `(height, width)` and the matching `(grid_h, grid_w)` in patches."""
+        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=self.patch_size * self.merge_size,
+            min_pixels=actual_min_pixels,
+            max_pixels=actual_max_pixels,
+        )
+        return (resized_height, resized_width), (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+
+    def _preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        do_resize: bool = True,
+        resample: PILImageResampling = None,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = False,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        predetermined_grid_thw=None,
+    ):
+        """
+        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
+                If pixel values range from 0 to 1, set `do_rescale=False`.
+            predetermined_grid_thw (*optional*):
+                One `(grid_h, grid_w)` pair per image, in patch units; used instead of `smart_resize` when resizing.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Scale factor to use if rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Mean to use if normalizing the image.
+                Can be a float or a list of floats corresponding to the number of channels in the image.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation to use if normalizing the image.
+                Can be a float or a list of floats corresponding to the number of channels in the image.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, it is inferred from the first image.
+                Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        images = make_list_of_images(images)
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            data_processor_logger.warning(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = height, width
+        processed_images = []
+
+        if predetermined_grid_thw is not None:
+            assert len(predetermined_grid_thw) == len(
+                images
+            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
+
+        for img_idx, image in enumerate(images):
+            if do_resize:
+                if predetermined_grid_thw is not None:
+                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
+                    resized_height *= self.patch_size
+                    resized_width *= self.patch_size
+                else:
+                    resized_height, resized_width = smart_resize(
+                        height,
+                        width,
+                        factor=self.patch_size * self.merge_size,
+                        min_pixels=self.min_pixels,
+                        max_pixels=self.max_pixels,
+                    )
+                image = image.astype("uint8")  # TODO: cast needed here, otherwise an extra /255 corrupts the result
+                # Call Image.fromarray directly instead of relying on paddleformers to do it.
+                image = Image.fromarray(image)
+                image = resize(
+                    image,
+                    size=(resized_height, resized_width),
+                    resample=resample,
+                    data_format=input_data_format,
+                )
+            if do_rescale:
+                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
+
+            if do_normalize:
+                image = normalize(
+                    image=image,
+                    mean=image_mean,
+                    std=image_std,
+                    data_format=input_data_format,
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
+
+            processed_images.append(image)
+        patches = np.array(processed_images)
+        if data_format == ChannelDimension.LAST:
+            patches = patches.transpose([0, 3, 1, 2])
+
+        channel = patches.shape[1]  # [time, C, H, W]
+        grid_t = patches.shape[0]
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+        patches = patches.reshape(
+            [
+                grid_t,
+                channel,
+                grid_h // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+                grid_w // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+            ]
+        )
+        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
+        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
+
+        flatten_patches = patches.reshape(
+            [
+                grid_t * grid_h * grid_w,
+                channel * self.patch_size * self.patch_size,
+            ]
+        )  # [grid_t * grid_h * grid_w, C * psz * psz]
+
+        return flatten_patches, (grid_t, grid_h, grid_w)
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        videos: VideoInput = None,
+        do_resize: bool = True,
+        size: Optional[Union[int, List[int]]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = False,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        predetermined_grid_thw=None,
+    ):
+        """
+        Preprocess images and/or videos into flattened patches plus their (t, h, w) patch grids.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            videos (`VideoInput`):
+                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
+                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (*optional*):
+                Accepted for interface compatibility only; this method does not use it. Resizing is bounded by
+                `self.min_pixels` and `self.max_pixels` inside `_preprocess`.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `False`):
+                Whether to convert to RGB. Defaults to `False` here; `None` falls back to `self.do_convert_rgb`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.PADDLE` or `'pd'`: Return a batch of type `paddle.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            predetermined_grid_thw (*optional*):
+                Per-image `(grid_h, grid_w)` patch grids that replace the smart-resize computation when resizing.
+                For videos the whole value is forwarded unchanged for every video.
+
+        Returns:
+            `BatchFeature`: holds `pixel_values`/`image_grid_thw` when `images` is given and
+            `pixel_values_videos`/`video_grid_thw` when `videos` is given (empty when both are `None`).
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        if images is not None:
+            images = make_batched_images(images)
+        if videos is not None:
+            videos = make_batched_videos(videos)
+
+        if images is not None and not valid_images(images):
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, paddle.Tensor.")
+
+        # Collect image and video outputs side by side: assigning a fresh dict per modality dropped the image
+        # entries when both were given and left `data` unbound when neither was.
+        data = {}
+
+        if images is not None:
+            pixel_values, vision_grid_thws = [], []
+            for img_idx, image in enumerate(images):
+                # `_preprocess` sees one image per call, so hand it a single-entry grid list.
+                grid_hint = None if predetermined_grid_thw is None else [predetermined_grid_thw[img_idx]]
+                patches, image_grid_thw = self._preprocess(
+                    image,
+                    do_resize=do_resize,
+                    resample=resample,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    data_format=data_format,
+                    do_convert_rgb=do_convert_rgb,
+                    input_data_format=input_data_format,
+                    predetermined_grid_thw=grid_hint,
+                )
+                pixel_values.extend(patches)
+                vision_grid_thws.append(image_grid_thw)
+            data["pixel_values"] = np.array(pixel_values)
+            data["image_grid_thw"] = np.array(vision_grid_thws)
+
+        if videos is not None:
+            pixel_values, vision_grid_thws = [], []
+            for frames in videos:
+                patches, video_grid_thw = self._preprocess(
+                    frames,
+                    do_resize=do_resize,
+                    resample=resample,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    data_format=data_format,
+                    do_convert_rgb=do_convert_rgb,
+                    input_data_format=input_data_format,
+                    predetermined_grid_thw=predetermined_grid_thw,
+                )
+                pixel_values.extend(patches)
+                vision_grid_thws.append(video_grid_thw)
+            data["pixel_values_videos"] = np.array(pixel_values)
+            data["video_grid_thw"] = np.array(vision_grid_thws)
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+def get_image_preprocessor(args):
+    """
+    get_image_preprocessor from args
+    """
+
+    if args.vision_model_name_or_path is None:
+        return None
+
+    data_processor_logger.info("use AdaptiveImageProcessor")
+    image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
+    return image_preprocess
diff --git a/fastdeploy/input/image_processors/paddleocr_processor.py b/fastdeploy/input/image_processors/paddleocr_processor.py
new file mode 100644
index 00000000000..a28f03075df
--- /dev/null
+++ b/fastdeploy/input/image_processors/paddleocr_processor.py
@@ -0,0 +1,227 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+"""Image processor class for PaddleOCR-VL."""
+
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+from paddleformers.transformers.feature_extraction_utils import BatchFeature
+from paddleformers.transformers.image_processing_utils import BaseImageProcessor
+from paddleformers.transformers.image_utils import (
+    ImageInput,
+    is_valid_image,
+    make_list_of_images,
+    to_numpy_array,
+)
+
+from fastdeploy.input.image_processors.common import (
+    smart_resize_paddleocr as smart_resize,
+)
+
+_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
+_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
+
+
+def make_batched_images(images) -> List[List[ImageInput]]:
+    """
+    Accepts images in list or nested list format, and makes a list of images for preprocessing.
+
+    Args:
+        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
+            The input image.
+
+    Returns:
+        list: A list of images.
+    """
+    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
+        return [img for img_list in images for img in img_list]
+
+    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
+        return images
+
+    elif is_valid_image(images):
+        return [images]
+
+    raise ValueError(f"Could not make batched images from {images}")
+
+
+def adjust_size(size, patch_size):
+    num_patches = size // patch_size
+    if num_patches % 2 != 0:
+        num_patches -= 1
+    return num_patches * patch_size
+
+
+class ImageProcessor(BaseImageProcessor):
+    model_input_names = [
+        "pixel_values",
+        "image_grid_thw",
+        "pixel_values_videos",
+        "video_grid_thw",
+    ]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        resample: int = 3,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        min_pixels: int = 28 * 28 * 130,
+        max_pixels: int = 28 * 28 * 1280,
+        patch_size: int = 14,
+        temporal_patch_size: int = 1,
+        merge_size: int = 2,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.merge_size = merge_size
+        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
+        self.do_convert_rgb = do_convert_rgb
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_dir):
+        pretrained_model_dir = Path(pretrained_model_dir)
+        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
+        with open(image_processor_config_path, "r", encoding="utf-8") as f:
+            image_processor_config = json.load(f)
+        return cls(**image_processor_config)
+
+    def _preprocess(
+        self,
+        images,
+        do_resize: Optional[bool] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+    ):
+        images = make_list_of_images(images)
+
+        if do_convert_rgb:
+            images = [image.convert("RGB") for image in images]
+
+        width, height = images[0].size
+        resized_height, resized_width = height, width
+        processed_images = []
+
+        for image in images:
+            if do_resize:
+                resized_height, resized_width = smart_resize(
+                    height,
+                    width,
+                    factor=self.patch_size * self.merge_size,
+                    min_pixels=self.min_pixels,
+                    max_pixels=self.max_pixels,
+                )
+
+                image = image.resize((resized_width, resized_height), resample=self.resample)
+
+            image = to_numpy_array(image)
+
+            if do_rescale:
+                image = (image * rescale_factor).astype(np.float32)
+
+            if do_normalize:
+                image = image.astype(np.float32)
+                image -= np.array(image_mean, dtype=np.float32)
+                image /= np.array(image_std, dtype=np.float32)
+
+            processed_images.append(image)
+
+        patches = np.array(processed_images)
+        patches = patches.transpose(0, 3, 1, 2)
+        if patches.shape[0] == 1:
+            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+        channel = patches.shape[1]
+        grid_t = patches.shape[0] // self.temporal_patch_size
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+
+        patches = patches.reshape(
+            grid_t,
+            self.temporal_patch_size,
+            channel,
+            grid_h,
+            self.patch_size,
+            grid_w,
+            self.patch_size,
+        )
+        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
+        assert self.temporal_patch_size == 1
+        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
+        return flatten_patches, np.array([grid_t, grid_h, grid_w])
+
+    def preprocess(
+        self,
+        images,
+        videos=None,
+        do_resize: Optional[bool] = None,
+        size: Optional[Dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        return_tensors=None,
+    ):
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        if videos is not None:
+            raise NotImplementedError("Videos are not yet supported")
+
+        patches, image_grid_thw = self._preprocess(
+            images,
+            do_resize=do_resize,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_convert_rgb=do_convert_rgb,
+        )
+        pixel_values = np.array(patches)
+        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
+        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/image_processors/qwen3_processor.py b/fastdeploy/input/image_processors/qwen3_processor.py
new file mode 100644
index 00000000000..5927a0f9699
--- /dev/null
+++ b/fastdeploy/input/image_processors/qwen3_processor.py
@@ -0,0 +1,333 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import paddle
+import PIL
+from paddleformers.transformers.feature_extraction_utils import BatchFeature
+from paddleformers.transformers.image_processing_utils import BaseImageProcessor
+from paddleformers.transformers.image_transforms import (
+    normalize,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from paddleformers.transformers.image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
+from PIL import Image
+
+from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
+from fastdeploy.utils import data_processor_logger
+
+IMAGE_MEAN = [0.5, 0.5, 0.5]  # default per-channel normalization mean for ImageProcessor
+IMAGE_STD = [0.5, 0.5, 0.5]  # default per-channel normalization std for ImageProcessor
+
+MIN_PIXELS = 65536  # 256 * 256; default lower bound on the resized pixel count
+MAX_PIXELS = 16777216  # 4096 * 4096; default upper bound on the resized pixel count
+
+
+VideoInput = Union[  # accepted frame/video container types; used only in annotations in this module
+    List["PIL.Image.Image"],
+    "np.ndarray",
+    "paddle.Tensor",
+    List["np.ndarray"],
+    List["paddle.Tensor"],
+    List[List["PIL.Image.Image"]],
+    List[List["np.ndarray"]],
+    List[List["paddle.Tensor"]],
+]
+
+
+class ImageProcessor(BaseImageProcessor):
+    """
+    Image processor that resizes, rescales/normalizes and flattens images or frames into patches.
+
+    Target dimensions come from `smart_resize`, using `patch_size * merge_size` as the factor and
+    the `min_pixels` / `max_pixels` bounds. `preprocess` returns `pixel_values` and `grid_thw`.
+    """
+
+    def __init__(
+        self,
+        patch_size: int = 16,
+        merge_size: int = 2,
+        temporal_patch_size: int = 2,
+        min_pixels: int = MIN_PIXELS,
+        max_pixels: int = MAX_PIXELS,
+        image_mean: Union[float, List[float]] = IMAGE_MEAN,
+        image_std: Union[float, List[float]] = IMAGE_STD,
+        rescale_factor: float = 1 / 255,
+        do_rescale: bool = True,
+        do_normalize: bool = True,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        **kwargs,
+    ) -> None:
+        """
+        Store the preprocessing configuration used as defaults by `preprocess`.
+
+        Args:
+            patch_size (int): Side length, in pixels, of one square spatial patch
+            merge_size (int): Patches are grouped into merge_size x merge_size windows
+            temporal_patch_size (int): Number of frames grouped into one temporal patch
+            min_pixels (int): Lower pixel bound passed to `smart_resize`
+            max_pixels (int): Upper pixel bound passed to `smart_resize`
+            image_mean (float/list): Mean values for normalization per channel
+            image_std (float/list): Std values for normalization per channel
+            rescale_factor (float): Scaling factor for pixel values (default 1/255)
+            do_rescale (bool): Whether to rescale images
+            do_normalize (bool): Whether to normalize images
+            resample: Resampling method for image resizing
+            **kwargs: Forwarded unchanged to `BaseImageProcessor.__init__`
+        """
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.rescale_factor = rescale_factor
+        self.do_rescale = do_rescale
+        self.do_normalize = do_normalize
+
+        self.resample = resample
+
+    def _preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        min_pixels: int,
+        max_pixels: int,
+        image_mean: Optional[Union[float, List[float]]],
+        image_std: Optional[Union[float, List[float]]],
+        rescale_factor: float,
+        do_rescale: bool,
+        do_normalize: bool,
+        resample: PILImageResampling,
+        data_format: Optional[ChannelDimension],
+        input_data_format: Optional[Union[str, ChannelDimension]],
+    ):
+        """
+        Resize, rescale/normalize and patchify a list of images or video frames.
+
+        Args:
+            images: Input image or list of images; the resize decision is based on images[0]'s size
+            min_pixels: Lower pixel bound passed to `smart_resize`
+            max_pixels: Upper pixel bound passed to `smart_resize`
+            image_mean: Normalization mean values
+            image_std: Normalization std values
+            rescale_factor: Pixel value scaling factor
+            do_rescale: Whether to rescale pixel values (folded into normalize when both are set)
+            do_normalize: Whether to normalize pixel values
+            resample: Resampling method used when resizing
+            data_format: Output channel format
+            input_data_format: Input channel format; inferred from images[0] when None
+
+        Returns:
+            tuple: (flatten_patches, grid_dimensions)
+                - flatten_patches: Shape [grid_t * grid_h * grid_w, C * temporal_patch_size * patch_size**2]
+                - grid_dimensions: np.array([grid_t, grid_h, grid_w])
+        """
+        images = make_list_of_images(images)
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            data_processor_logger.warning(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # The target size is computed once from the first image and reused for every image
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        processed_images = []
+        for image in images:
+            if height != resized_height or width != resized_width:
+                # Cast to uint8 for the PIL round-trip. NOTE(review): float inputs in [0, 1] would be truncated
+                image = image.astype("uint8")
+                # Convert to PIL Image and resize
+                image = Image.fromarray(image)
+                image = resize(
+                    image,
+                    size=(resized_height, resized_width),
+                    resample=resample,
+                    data_format=input_data_format,
+                )
+
+            if do_rescale and do_normalize:
+                # Fold rescale into normalize: (x * f - mean) / std == (x - mean / f) / (std / f)
+                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
+                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
+                do_rescale = False  # Skip separate rescale; this branch therefore runs only once
+
+            # Only reached when do_normalize is False; otherwise rescale was folded into normalize above
+            if do_rescale:
+                image = image.astype(np.float32)
+                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
+
+            if do_normalize:
+                image = image.astype(np.float32)
+                image = normalize(
+                    image=image,
+                    mean=image_mean,
+                    std=image_std,
+                    data_format=input_data_format,
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
+            processed_images.append(image)
+
+        # Stack the processed images along a new leading (frame) axis
+        patches = np.array(processed_images)
+
+        # Pad by repeating the last frame until the frame count is a multiple of temporal_patch_size
+        if patches.shape[0] % self.temporal_patch_size != 0:
+            repeats = np.repeat(
+                patches[-1][np.newaxis],
+                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
+                axis=0,
+            )
+            patches = np.concatenate([patches, repeats], axis=0)
+
+        # The reshape below expects [N, C, H, W]; transpose when the output format is channels-last
+        if data_format == ChannelDimension.LAST:
+            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
+
+        grid_t, channel = patches.shape[:2]
+        grid_t = grid_t // self.temporal_patch_size
+
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+        # Split the frame, height and width axes into (group, within-group) sub-axes
+        patches = patches.reshape(
+            [
+                grid_t,
+                self.temporal_patch_size,
+                channel,
+                grid_h // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+                grid_w // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+            ]
+        )
+        # Reorder so that the patches of each merge_size x merge_size window are adjacent:
+        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
+        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
+
+        flatten_patches = patches.reshape(
+            [
+                grid_t * grid_h * grid_w,
+                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
+            ]
+        )
+
+        return flatten_patches, np.array([grid_t, grid_h, grid_w])
+
+    def preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        rescale_factor: Optional[float] = None,
+        do_rescale: Optional[bool] = None,
+        do_normalize: Optional[bool] = None,
+        resample: Optional[PILImageResampling] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
+    ):
+        """
+        Preprocess images/video frames; options left as None fall back to the instance attributes.
+
+        Args:
+            images: Input image/video data
+            min_pixels: Override for minimum pixels (None -> self.min_pixels)
+            max_pixels: Override for maximum pixels (None -> self.max_pixels)
+            image_mean: Override for normalization mean (None -> self.image_mean)
+            image_std: Override for normalization std (None -> self.image_std)
+            rescale_factor: Override for rescaling factor (None -> self.rescale_factor)
+            do_rescale: Override for rescaling flag (None -> self.do_rescale)
+            do_normalize: Override for normalization flag (None -> self.do_normalize)
+            resample: Override for resampling method (None -> self.resample)
+            return_tensors: Passed to BatchFeature as `tensor_type`
+            data_format: Output channel dimension format (default ChannelDimension.FIRST)
+            input_data_format: Input channel dimension format (default ChannelDimension.LAST)
+
+        Returns:
+            BatchFeature: Processed features containing:
+                - pixel_values: Flattened patches returned by `_preprocess`
+                - grid_thw: Grid dimensions [temporal, height, width]
+
+        Raises:
+            ValueError: If `images` is not None and `valid_images(images)` is False
+        """
+        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        resample = resample if resample is not None else self.resample
+
+        if images is not None and not valid_images(images):
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
+
+        pixel_values, grid_thw = self._preprocess(
+            images,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            image_mean=image_mean,
+            image_std=image_std,
+            rescale_factor=rescale_factor,
+            do_rescale=do_rescale,
+            do_normalize=do_normalize,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
+        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/image_processors/qwen_processor.py b/fastdeploy/input/image_processors/qwen_processor.py
new file mode 100644
index 00000000000..7c3df2b69bf
--- /dev/null
+++ b/fastdeploy/input/image_processors/qwen_processor.py
@@ -0,0 +1,332 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import paddle
+import PIL
+from paddleformers.transformers.feature_extraction_utils import BatchFeature
+from paddleformers.transformers.image_processing_utils import BaseImageProcessor
+from paddleformers.transformers.image_transforms import (
+    normalize,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from paddleformers.transformers.image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
+from PIL import Image
+
+from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
+from fastdeploy.utils import data_processor_logger
+
+OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]  # default per-channel mean for ImageProcessor
+OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]  # default per-channel std for ImageProcessor
+
+MIN_PIXELS = 4 * 28 * 28  # default lower pixel bound; 28 presumably = patch_size 14 * merge_size 2 (confirm)
+MAX_PIXELS = 16384 * 28 * 28  # default upper pixel bound, expressed in the same 28 x 28 units
+
+
+VideoInput = Union[  # accepted frame/video container types; used only in annotations in this module
+    List["PIL.Image.Image"],
+    "np.ndarray",
+    "paddle.Tensor",
+    List["np.ndarray"],
+    List["paddle.Tensor"],
+    List[List["PIL.Image.Image"]],
+    List[List["np.ndarray"]],
+    List[List["paddle.Tensor"]],
+]
+
+
+class ImageProcessor(BaseImageProcessor):
+    """
+    Image processor that resizes, rescales/normalizes and flattens images or frames into patches.
+
+    Target dimensions come from `smart_resize`, using `patch_size * merge_size` as the factor and
+    the `min_pixels` / `max_pixels` bounds. `preprocess` returns `pixel_values` and `grid_thw`.
+    """
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        merge_size: int = 2,
+        temporal_patch_size: int = 2,
+        min_pixels: int = MIN_PIXELS,
+        max_pixels: int = MAX_PIXELS,
+        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
+        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
+        rescale_factor: float = 1 / 255,
+        do_rescale: bool = True,
+        do_normalize: bool = True,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        **kwargs,
+    ) -> None:
+        """
+        Store the preprocessing configuration used as defaults by `preprocess`.
+
+        Args:
+            patch_size (int): Side length, in pixels, of one square spatial patch
+            merge_size (int): Patches are grouped into merge_size x merge_size windows
+            temporal_patch_size (int): Number of frames grouped into one temporal patch
+            min_pixels (int): Lower pixel bound passed to `smart_resize`
+            max_pixels (int): Upper pixel bound passed to `smart_resize`
+            image_mean (float/list): Mean values for normalization per channel
+            image_std (float/list): Std values for normalization per channel
+            rescale_factor (float): Scaling factor for pixel values (default 1/255)
+            do_rescale (bool): Whether to rescale images
+            do_normalize (bool): Whether to normalize images
+            resample: Resampling method for image resizing
+            **kwargs: Forwarded unchanged to `BaseImageProcessor.__init__`
+        """
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.rescale_factor = rescale_factor
+        self.do_rescale = do_rescale
+        self.do_normalize = do_normalize
+
+        self.resample = resample
+
+    def _preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        min_pixels: int,
+        max_pixels: int,
+        image_mean: Optional[Union[float, List[float]]],
+        image_std: Optional[Union[float, List[float]]],
+        rescale_factor: float,
+        do_rescale: bool,
+        do_normalize: bool,
+        resample: PILImageResampling,
+        data_format: Optional[ChannelDimension],
+        input_data_format: Optional[Union[str, ChannelDimension]],
+    ):
+        """
+        Resize, rescale/normalize and patchify a list of images or video frames.
+
+        Args:
+            images: Input image or list of images; the resize decision is based on images[0]'s size
+            min_pixels: Lower pixel bound passed to `smart_resize`
+            max_pixels: Upper pixel bound passed to `smart_resize`
+            image_mean: Normalization mean values
+            image_std: Normalization std values
+            rescale_factor: Pixel value scaling factor
+            do_rescale: Whether to rescale pixel values (folded into normalize when both are set)
+            do_normalize: Whether to normalize pixel values
+            resample: Resampling method used when resizing
+            data_format: Output channel format
+            input_data_format: Input channel format; inferred from images[0] when None
+
+        Returns:
+            tuple: (flatten_patches, grid_dimensions)
+                - flatten_patches: Shape [grid_t * grid_h * grid_w, C * temporal_patch_size * patch_size**2]
+                - grid_dimensions: np.array([grid_t, grid_h, grid_w])
+        """
+        images = make_list_of_images(images)
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            data_processor_logger.warning(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # The target size is computed once from the first image and reused for every image
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        processed_images = []
+        for image in images:
+            if height != resized_height or width != resized_width:
+                # Cast to uint8 for the PIL round-trip. NOTE(review): float inputs in [0, 1] would be truncated
+                image = image.astype("uint8")
+                # Convert to PIL Image and resize
+                image = Image.fromarray(image)
+                image = resize(
+                    image,
+                    size=(resized_height, resized_width),
+                    resample=resample,
+                    data_format=input_data_format,
+                )
+
+            if do_rescale and do_normalize:
+                # Fold rescale into normalize: (x * f - mean) / std == (x - mean / f) / (std / f)
+                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
+                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
+                do_rescale = False  # Skip separate rescale; this branch therefore runs only once
+
+            if do_rescale:  # only reached when do_normalize is False (otherwise folded in above)
+                image = image.astype(np.float32)
+                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
+
+            if do_normalize:
+                image = image.astype(np.float32)
+                image = normalize(
+                    image=image,
+                    mean=image_mean,
+                    std=image_std,
+                    data_format=input_data_format,
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
+            processed_images.append(image)
+
+        # Stack the processed images along a new leading (frame) axis
+        patches = np.array(processed_images)
+
+        # Pad by repeating the last frame until the frame count is a multiple of temporal_patch_size
+        if patches.shape[0] % self.temporal_patch_size != 0:
+            repeats = np.repeat(
+                patches[-1][np.newaxis],
+                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
+                axis=0,
+            )
+            patches = np.concatenate([patches, repeats], axis=0)
+
+        # The reshape below expects [N, C, H, W]; transpose when the output format is channels-last
+        if data_format == ChannelDimension.LAST:
+            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
+
+        grid_t, channel = patches.shape[:2]
+        grid_t = grid_t // self.temporal_patch_size
+
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+        # Split the frame, height and width axes into (group, within-group) sub-axes
+        patches = patches.reshape(
+            [
+                grid_t,
+                self.temporal_patch_size,
+                channel,
+                grid_h // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+                grid_w // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+            ]
+        )
+        # Reorder so that the patches of each merge_size x merge_size window are adjacent:
+        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
+        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
+
+        flatten_patches = patches.reshape(
+            [
+                grid_t * grid_h * grid_w,
+                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
+            ]
+        )
+
+        return flatten_patches, np.array([grid_t, grid_h, grid_w])
+
+    def preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        rescale_factor: Optional[float] = None,
+        do_rescale: Optional[bool] = None,
+        do_normalize: Optional[bool] = None,
+        resample: Optional[PILImageResampling] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
+    ):
+        """
+        Preprocess images/video frames; options left as None fall back to the instance attributes.
+
+        Args:
+            images: Input image/video data
+            min_pixels: Override for minimum pixels (None -> self.min_pixels)
+            max_pixels: Override for maximum pixels (None -> self.max_pixels)
+            image_mean: Override for normalization mean (None -> self.image_mean)
+            image_std: Override for normalization std (None -> self.image_std)
+            rescale_factor: Override for rescaling factor (None -> self.rescale_factor)
+            do_rescale: Override for rescaling flag (None -> self.do_rescale)
+            do_normalize: Override for normalization flag (None -> self.do_normalize)
+            resample: Override for resampling method (None -> self.resample)
+            return_tensors: Passed to BatchFeature as `tensor_type`
+            data_format: Output channel dimension format (default ChannelDimension.FIRST)
+            input_data_format: Input channel dimension format (default ChannelDimension.LAST)
+
+        Returns:
+            BatchFeature: Processed features containing:
+                - pixel_values: Flattened patches returned by `_preprocess`
+                - grid_thw: Grid dimensions [temporal, height, width]
+
+        Raises:
+            ValueError: If `images` is not None and `valid_images(images)` is False
+        """
+        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        resample = resample if resample is not None else self.resample
+
+        if images is not None and not valid_images(images):
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
+
+        pixel_values, grid_thw = self._preprocess(
+            images,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            image_mean=image_mean,
+            image_std=image_std,
+            rescale_factor=rescale_factor,
+            do_rescale=do_rescale,
+            do_normalize=do_normalize,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
+        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/paddleocr_vl_processor/image_processor.py
index a6e318e1ed7..ef86d77b714 100644
--- a/fastdeploy/input/paddleocr_vl_processor/image_processor.py
+++ b/fastdeploy/input/paddleocr_vl_processor/image_processor.py
@@ -14,216 +14,12 @@
 # limitations under the License.
 """
 
-"""Image processor class for Keye."""
-
-# TODO: Support videos
-
-import json
-from pathlib import Path
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_utils import (
-    ImageInput,
-    is_valid_image,
-    make_list_of_images,
-    to_numpy_array,
-)
-
-from fastdeploy.input.image_processors.common import (
-    smart_resize_paddleocr as smart_resize,
+# Backward compatibility: this module has been migrated to
+# fastdeploy.input.image_processors.paddleocr_processor
+# This file will be removed in a future version.
+
+from fastdeploy.input.image_processors.paddleocr_processor import (  # noqa: F401
+    ImageProcessor,
+    make_batched_images,
+    smart_resize,
 )
-
-_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-
-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-
-    Returns:
-        list: A list of images.
-    """
-    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
-        return [img for img_list in images for img in img_list]
-
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-
-    elif is_valid_image(images):
-        return [images]
-
-    raise ValueError(f"Could not make batched images from {images}")
-
-
-def adjust_size(size, patch_size):
-    num_patches = size // patch_size
-    if num_patches % 2 != 0:
-        num_patches -= 1
-    return num_patches * patch_size
-
-
-class ImageProcessor(BaseImageProcessor):
-    model_input_names = [
-        "pixel_values",
-        "image_grid_thw",
-        "pixel_values_videos",
-        "video_grid_thw",
-    ]
-
-    def __init__(
-        self,
-        do_resize: bool = True,
-        resample: int = 3,
-        do_rescale: bool = True,
-        rescale_factor: Union[int, float] = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = True,
-        min_pixels: int = 28 * 28 * 130,
-        max_pixels: int = 28 * 28 * 1280,
-        patch_size: int = 14,
-        temporal_patch_size: int = 1,
-        merge_size: int = 2,
-        **kwargs,
-    ) -> None:
-        super().__init__()
-        self.do_resize = do_resize
-        self.resample = resample
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
-        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-        self.patch_size = patch_size
-        self.temporal_patch_size = temporal_patch_size
-        self.merge_size = merge_size
-        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
-        self.do_convert_rgb = do_convert_rgb
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_dir):
-        pretrained_model_dir = Path(pretrained_model_dir)
-        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
-        with open(image_processor_config_path, "r", encoding="utf-8") as f:
-            image_processor_config = json.load(f)
-        return cls(**image_processor_config)
-
-    def _preprocess(
-        self,
-        images,
-        do_resize: Optional[bool] = None,
-        do_rescale: Optional[bool] = None,
-        rescale_factor: Optional[float] = None,
-        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: Optional[bool] = None,
-    ):
-        images = make_list_of_images(images)
-
-        if do_convert_rgb:
-            images = [image.convert("RGB") for image in images]
-
-        width, height = images[0].size
-        resized_height, resized_width = height, width
-        processed_images = []
-
-        for image in images:
-            if do_resize:
-                resized_height, resized_width = smart_resize(
-                    height,
-                    width,
-                    factor=self.patch_size * self.merge_size,
-                    min_pixels=self.min_pixels,
-                    max_pixels=self.max_pixels,
-                )
-
-                image = image.resize((resized_width, resized_height), resample=self.resample)
-
-            image = to_numpy_array(image)
-
-            if do_rescale:
-                image = (image * rescale_factor).astype(np.float32)
-
-            if do_normalize:
-                image = image.astype(np.float32)
-                image -= np.array(image_mean, dtype=np.float32)
-                image /= np.array(image_std, dtype=np.float32)
-
-            processed_images.append(image)
-
-        patches = np.array(processed_images)
-        patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] == 1:
-            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
-        channel = patches.shape[1]
-        grid_t = patches.shape[0] // self.temporal_patch_size
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-
-        patches = patches.reshape(
-            grid_t,
-            self.temporal_patch_size,
-            channel,
-            grid_h,
-            self.patch_size,
-            grid_w,
-            self.patch_size,
-        )
-        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
-        assert self.temporal_patch_size == 1
-        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
-        return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
-    def preprocess(
-        self,
-        images,
-        videos=None,
-        do_resize: Optional[bool] = None,
-        size: Optional[Dict[str, int]] = None,
-        do_rescale: Optional[bool] = None,
-        rescale_factor: Optional[float] = None,
-        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: Optional[bool] = None,
-        return_tensors=None,
-    ):
-        do_resize = do_resize if do_resize is not None else self.do_resize
-        size = size if size is not None else self.size
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-
-        if videos is not None:
-            raise NotImplementedError("Videos are not yet supported")
-
-        patches, image_grid_thw = self._preprocess(
-            images,
-            do_resize=do_resize,
-            do_rescale=do_rescale,
-            rescale_factor=rescale_factor,
-            do_normalize=do_normalize,
-            image_mean=image_mean,
-            image_std=image_std,
-            do_convert_rgb=do_convert_rgb,
-        )
-        pixel_values = np.array(patches)
-        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
-        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/qwen3_vl_processor/image_processor.py b/fastdeploy/input/qwen3_vl_processor/image_processor.py
index 5927a0f9699..2b0afe4c047 100644
--- a/fastdeploy/input/qwen3_vl_processor/image_processor.py
+++ b/fastdeploy/input/qwen3_vl_processor/image_processor.py
@@ -14,320 +14,10 @@
 # limitations under the License.
 """
 
-from typing import List, Optional, Union
+# Backward compatibility: this module has been migrated to
+# fastdeploy.input.image_processors.qwen3_processor
+# This file will be removed in a future version.
 
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
+from fastdeploy.input.image_processors.qwen3_processor import (  # noqa: F401
+    ImageProcessor,
 )
-from paddleformers.transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    get_image_size,
-    infer_channel_dimension_format,
-    make_list_of_images,
-    to_numpy_array,
-    valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
-from fastdeploy.utils import data_processor_logger
-
-IMAGE_MEAN = [0.5, 0.5, 0.5]
-IMAGE_STD = [0.5, 0.5, 0.5]
-
-MIN_PIXELS = 65536
-MAX_PIXELS = 16777216
-
-
-VideoInput = Union[
-    List["PIL.Image.Image"],
-    "np.ndarray",
-    "paddle.Tensor",
-    List["np.ndarray"],
-    List["paddle.Tensor"],
-    List[List["PIL.Image.Image"]],
-    List[List["np.ndarray"]],
-    List[List["paddle.Tensor"]],
-]
-
-
-class ImageProcessor(BaseImageProcessor):
-    """
-    Adaptive image processor for dynamic image resizing and preprocessing.
-
-    This processor handles image resizing, rescaling, normalization and format conversion.
-    It dynamically adjusts image dimensions based on original size and specified constraints.
-    """
-
-    def __init__(
-        self,
-        patch_size: int = 16,
-        merge_size: int = 2,
-        temporal_patch_size: int = 2,
-        min_pixels: int = MIN_PIXELS,
-        max_pixels: int = MAX_PIXELS,
-        image_mean: Union[float, List[float]] = IMAGE_MEAN,
-        image_std: Union[float, List[float]] = IMAGE_STD,
-        rescale_factor: float = 1 / 255,
-        do_rescale: bool = True,
-        do_normalize: bool = True,
-        resample: PILImageResampling = PILImageResampling.BICUBIC,
-        **kwargs,
-    ) -> None:
-        """
-        Initialize image processor with configuration parameters.
-
-        Args:
-            patch_size (int): Spatial patch size for vision encoder
-            merge_size (int): Merge size between vision and LLM encoders
-            temporal_patch_size (int): Temporal patch size for video processing
-            min_pixels (int): Minimum allowed pixels in resized image
-            max_pixels (int): Maximum allowed pixels in resized image
-            image_mean (float/list): Mean values for normalization per channel
-            image_std (float/list): Std values for normalization per channel
-            rescale_factor (float): Scaling factor for pixel values (default 1/255)
-            do_rescale (bool): Whether to rescale images
-            do_normalize (bool): Whether to normalize images
-            resample: Resampling method for image resizing
-            **kwargs: Additional base class arguments
-        """
-        super().__init__(**kwargs)
-        self.patch_size = patch_size
-        self.merge_size = merge_size
-        self.temporal_patch_size = temporal_patch_size
-
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.rescale_factor = rescale_factor
-        self.do_rescale = do_rescale
-        self.do_normalize = do_normalize
-
-        self.resample = resample
-
-    def _preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        min_pixels: int,
-        max_pixels: int,
-        image_mean: Optional[Union[float, List[float]]],
-        image_std: Optional[Union[float, List[float]]],
-        rescale_factor: float,
-        do_rescale: bool,
-        do_normalize: bool,
-        resample: PILImageResampling,
-        data_format: Optional[ChannelDimension],
-        input_data_format: Optional[Union[str, ChannelDimension]],
-    ):
-        """
-        Internal method for image preprocessing pipeline.
-
-        Args:
-            images: Input image or batch of images
-            min_pixels: Minimum allowed pixels in output
-            max_pixels: Maximum allowed pixels in output
-            image_mean: Normalization mean values
-            image_std: Normalization std values
-            rescale_factor: Pixel value scaling factor
-            do_rescale: Whether to rescale pixel values
-            do_normalize: Whether to normalize pixel values
-            resample: Resampling method
-            data_format: Output channel format
-            input_data_format: Input channel format
-
-        Returns:
-            tuple: (flatten_patches, grid_dimensions)
-                - flatten_patches: Flattened image patches
-                - grid_dimensions: Grid dimensions [t, h, w]
-        """
-        images = make_list_of_images(images)
-
-        # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
-
-        if is_scaled_image(images[0]) and do_rescale:
-            data_processor_logger.warning(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        # Get original dimensions and calculate optimal resize dimensions
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-
-        processed_images = []
-        for image in images:
-            if height != resized_height or width != resized_width:
-                # Convert to uint8 before resizing to avoid double scaling
-                image = image.astype("uint8")
-                # Convert to PIL Image and resize
-                image = Image.fromarray(image)
-                image = resize(
-                    image,
-                    size=(resized_height, resized_width),
-                    resample=resample,
-                    data_format=input_data_format,
-                )
-
-            if do_rescale and do_normalize:
-                # Adjust mean and std for combined rescale+normalize
-                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
-                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
-                do_rescale = False  # Skip separate rescale step
-
-            # mutual exclusion and upper branch
-            if do_rescale:
-                image = image.astype(np.float32)
-                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
-            if do_normalize:
-                image = image.astype(np.float32)
-                image = normalize(
-                    image=image,
-                    mean=image_mean,
-                    std=image_std,
-                    data_format=input_data_format,
-                )
-
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
-            processed_images.append(image)
-
-        # Convert processed images to numpy array
-        patches = np.array(processed_images)
-
-        # Pad temporal dimension if needed
-        if patches.shape[0] % self.temporal_patch_size != 0:
-            repeats = np.repeat(
-                patches[-1][np.newaxis],
-                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
-                axis=0,
-            )
-            patches = np.concatenate([patches, repeats], axis=0)
-
-        # Convert to channels-first format if needed
-        if data_format == ChannelDimension.LAST:
-            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
-
-        grid_t, channel = patches.shape[:2]
-        grid_t = grid_t // self.temporal_patch_size
-
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-        # Reshape into hierarchical patch structure
-        patches = patches.reshape(
-            [
-                grid_t,
-                self.temporal_patch_size,
-                channel,
-                grid_h // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-                grid_w // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-            ]
-        )
-        # Reorder dimensions for better memory access pattern
-        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
-        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
-
-        flatten_patches = patches.reshape(
-            [
-                grid_t * grid_h * grid_w,
-                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
-            ]
-        )
-
-        return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
-    def preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        rescale_factor: Optional[float] = None,
-        do_rescale: Optional[bool] = None,
-        do_normalize: Optional[bool] = None,
-        resample: Optional[PILImageResampling] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
-    ):
-        """
-        Main preprocessing method for images/videos.
-
-        Args:
-            images: Input image/video data
-            min_pixels: Override for minimum pixels
-            max_pixels: Override for maximum pixels
-            image_mean: Override for normalization mean
-            image_std: Override for normalization std
-            rescale_factor: Override for rescaling factor
-            do_rescale: Override for rescaling flag
-            do_normalize: Override for normalization flag
-            resample: Override for resampling method
-            return_tensors: Desired output tensor format
-            data_format: Output channel dimension format
-            input_data_format: Input channel dimension format
-
-        Returns:
-            BatchFeature: Processed features containing:
-                - pixel_values: Preprocessed pixel data
-                - grid_thw: Grid dimensions [temporal, height, width]
-
-        Raises:
-            ValueError: For invalid image types or dimensions
-        """
-        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
-        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        resample = resample if resample is not None else self.resample
-
-        if images is not None and not valid_images(images):
-            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
-        pixel_values, grid_thw = self._preprocess(
-            images,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-            image_mean=image_mean,
-            image_std=image_std,
-            rescale_factor=rescale_factor,
-            do_rescale=do_rescale,
-            do_normalize=do_normalize,
-            resample=resample,
-            data_format=data_format,
-            input_data_format=input_data_format,
-        )
-        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
-        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/qwen_vl_processor/image_processor.py b/fastdeploy/input/qwen_vl_processor/image_processor.py
index 7c3df2b69bf..3a5a77ea6d8 100644
--- a/fastdeploy/input/qwen_vl_processor/image_processor.py
+++ b/fastdeploy/input/qwen_vl_processor/image_processor.py
@@ -14,319 +14,10 @@
 # limitations under the License.
 """
 
-from typing import List, Optional, Union
+# Backward compatibility: this module has been migrated to
+# fastdeploy.input.image_processors.qwen_processor
+# This file will be removed in a future version.
 
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
+from fastdeploy.input.image_processors.qwen_processor import (  # noqa: F401
+    ImageProcessor,
 )
-from paddleformers.transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    get_image_size,
-    infer_channel_dimension_format,
-    make_list_of_images,
-    to_numpy_array,
-    valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
-from fastdeploy.utils import data_processor_logger
-
-OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-
-
-VideoInput = Union[
-    List["PIL.Image.Image"],
-    "np.ndarray",
-    "paddle.Tensor",
-    List["np.ndarray"],
-    List["paddle.Tensor"],
-    List[List["PIL.Image.Image"]],
-    List[List["np.ndarray"]],
-    List[List["paddle.Tensor"]],
-]
-
-
-class ImageProcessor(BaseImageProcessor):
-    """
-    Adaptive image processor for dynamic image resizing and preprocessing.
-
-    This processor handles image resizing, rescaling, normalization and format conversion.
-    It dynamically adjusts image dimensions based on original size and specified constraints.
-    """
-
-    def __init__(
-        self,
-        patch_size: int = 14,
-        merge_size: int = 2,
-        temporal_patch_size: int = 2,
-        min_pixels: int = MIN_PIXELS,
-        max_pixels: int = MAX_PIXELS,
-        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
-        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
-        rescale_factor: float = 1 / 255,
-        do_rescale: bool = True,
-        do_normalize: bool = True,
-        resample: PILImageResampling = PILImageResampling.BICUBIC,
-        **kwargs,
-    ) -> None:
-        """
-        Initialize image processor with configuration parameters.
-
-        Args:
-            patch_size (int): Spatial patch size for vision encoder
-            merge_size (int): Merge size between vision and LLM encoders
-            temporal_patch_size (int): Temporal patch size for video processing
-            min_pixels (int): Minimum allowed pixels in resized image
-            max_pixels (int): Maximum allowed pixels in resized image
-            image_mean (float/list): Mean values for normalization per channel
-            image_std (float/list): Std values for normalization per channel
-            rescale_factor (float): Scaling factor for pixel values (default 1/255)
-            do_rescale (bool): Whether to rescale images
-            do_normalize (bool): Whether to normalize images
-            resample: Resampling method for image resizing
-            **kwargs: Additional base class arguments
-        """
-        super().__init__(**kwargs)
-        self.patch_size = patch_size
-        self.merge_size = merge_size
-        self.temporal_patch_size = temporal_patch_size
-
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.rescale_factor = rescale_factor
-        self.do_rescale = do_rescale
-        self.do_normalize = do_normalize
-
-        self.resample = resample
-
-    def _preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        min_pixels: int,
-        max_pixels: int,
-        image_mean: Optional[Union[float, List[float]]],
-        image_std: Optional[Union[float, List[float]]],
-        rescale_factor: float,
-        do_rescale: bool,
-        do_normalize: bool,
-        resample: PILImageResampling,
-        data_format: Optional[ChannelDimension],
-        input_data_format: Optional[Union[str, ChannelDimension]],
-    ):
-        """
-        Internal method for image preprocessing pipeline.
-
-        Args:
-            images: Input image or batch of images
-            min_pixels: Minimum allowed pixels in output
-            max_pixels: Maximum allowed pixels in output
-            image_mean: Normalization mean values
-            image_std: Normalization std values
-            rescale_factor: Pixel value scaling factor
-            do_rescale: Whether to rescale pixel values
-            do_normalize: Whether to normalize pixel values
-            resample: Resampling method
-            data_format: Output channel format
-            input_data_format: Input channel format
-
-        Returns:
-            tuple: (flatten_patches, grid_dimensions)
-                - flatten_patches: Flattened image patches
-                - grid_dimensions: Grid dimensions [t, h, w]
-        """
-        images = make_list_of_images(images)
-
-        # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
-
-        if is_scaled_image(images[0]) and do_rescale:
-            data_processor_logger.warning(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        # Get original dimensions and calculate optimal resize dimensions
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-
-        processed_images = []
-        for image in images:
-            if height != resized_height or width != resized_width:
-                # Convert to uint8 before resizing to avoid double scaling
-                image = image.astype("uint8")
-                # Convert to PIL Image and resize
-                image = Image.fromarray(image)
-                image = resize(
-                    image,
-                    size=(resized_height, resized_width),
-                    resample=resample,
-                    data_format=input_data_format,
-                )
-
-            if do_rescale and do_normalize:
-                # Adjust mean and std for combined rescale+normalize
-                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
-                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
-                do_rescale = False  # Skip separate rescale step
-
-            if do_rescale:
-                image = image.astype(np.float32)
-                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
-            if do_normalize:
-                image = image.astype(np.float32)
-                image = normalize(
-                    image=image,
-                    mean=image_mean,
-                    std=image_std,
-                    data_format=input_data_format,
-                )
-
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
-            processed_images.append(image)
-
-        # Convert processed images to numpy array
-        patches = np.array(processed_images)
-
-        # Pad temporal dimension if needed
-        if patches.shape[0] % self.temporal_patch_size != 0:
-            repeats = np.repeat(
-                patches[-1][np.newaxis],
-                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
-                axis=0,
-            )
-            patches = np.concatenate([patches, repeats], axis=0)
-
-        # Convert to channels-first format if needed
-        if data_format == ChannelDimension.LAST:
-            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
-
-        grid_t, channel = patches.shape[:2]
-        grid_t = grid_t // self.temporal_patch_size
-
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-        # Reshape into hierarchical patch structure
-        patches = patches.reshape(
-            [
-                grid_t,
-                self.temporal_patch_size,
-                channel,
-                grid_h // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-                grid_w // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-            ]
-        )
-        # Reorder dimensions for better memory access pattern
-        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
-        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
-
-        flatten_patches = patches.reshape(
-            [
-                grid_t * grid_h * grid_w,
-                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
-            ]
-        )
-
-        return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
-    def preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        rescale_factor: Optional[float] = None,
-        do_rescale: Optional[bool] = None,
-        do_normalize: Optional[bool] = None,
-        resample: Optional[PILImageResampling] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
-    ):
-        """
-        Main preprocessing method for images/videos.
-
-        Args:
-            images: Input image/video data
-            min_pixels: Override for minimum pixels
-            max_pixels: Override for maximum pixels
-            image_mean: Override for normalization mean
-            image_std: Override for normalization std
-            rescale_factor: Override for rescaling factor
-            do_rescale: Override for rescaling flag
-            do_normalize: Override for normalization flag
-            resample: Override for resampling method
-            return_tensors: Desired output tensor format
-            data_format: Output channel dimension format
-            input_data_format: Input channel dimension format
-
-        Returns:
-            BatchFeature: Processed features containing:
-                - pixel_values: Preprocessed pixel data
-                - grid_thw: Grid dimensions [temporal, height, width]
-
-        Raises:
-            ValueError: For invalid image types or dimensions
-        """
-        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
-        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        resample = resample if resample is not None else self.resample
-
-        if images is not None and not valid_images(images):
-            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
-        pixel_values, grid_thw = self._preprocess(
-            images,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-            image_mean=image_mean,
-            image_std=image_std,
-            rescale_factor=rescale_factor,
-            do_rescale=do_rescale,
-            do_normalize=do_normalize,
-            resample=resample,
-            data_format=data_format,
-            input_data_format=input_data_format,
-        )
-        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
-        return BatchFeature(data=data, tensor_type=return_tensors)

From a48fc5f6bfa5f06f15ea2c2bd45730979a1149b9 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Mon, 30 Mar 2026 19:32:04 +0800
Subject: [PATCH 2/8] step 9~10

---
 fastdeploy/input/multimodal_processor.py      | 673 ++++++++++++++++++
 fastdeploy/input/preprocess.py                |  86 +--
 .../input/test_image_preprocessor_adaptive.py |   8 +-
 3 files changed, 698 insertions(+), 69 deletions(-)
 create mode 100644 fastdeploy/input/multimodal_processor.py

diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py
new file mode 100644
index 00000000000..3e5d5c896f1
--- /dev/null
+++ b/fastdeploy/input/multimodal_processor.py
@@ -0,0 +1,673 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+"""Unified multimodal processor for all VL model types.
+
+Consolidates the four separate VL processor wrappers (QwenVLProcessor,
+Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
+single class that dispatches per ``model_type``.
+"""
+
+import pickle
+from collections.abc import Mapping
+from typing import Any, Dict, Optional
+
+import numpy as np
+
+from fastdeploy.input.base_processor import BaseTextProcessor
+from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
+from fastdeploy.utils import data_processor_logger
+
+QWEN_VL = "qwen_vl"
+QWEN3_VL = "qwen3_vl"
+PADDLEOCR_VL = "paddleocr_vl"
+ERNIE4_5_VL = "ernie4_5_vl"
+
+_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}
+
+_QWEN_EXPECTED_KWARGS = {
+    "video_max_frames": int,
+    "video_min_frames": int,
+}
+
+_ERNIE_EXPECTED_KWARGS = {
+    "spatial_conv_size": int,
+    "temporal_conv_size": int,
+    "image_min_pixels": int,
+    "image_max_pixels": int,
+    "video_min_pixels": int,
+    "video_max_pixels": int,
+    "video_target_frames": int,
+    "video_frames_sample": str,
+    "video_max_frames": int,
+    "video_min_frames": int,
+    "video_fps": int,
+}
+
+_TYPES_ACCEPT_URL_SUFFIX = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL}
+
+_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
+
+_SAMPLING_EPS = 1e-5
+
+
+class MultiModalProcessor(BaseTextProcessor):
+    """Unified multimodal processor for all supported VL model types.
+
+    Dispatches image-processor creation, config initialisation, and
+    encoding logic based on ``model_type``.
+    """
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        model_type: str,
+        config=None,
+        limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
+        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
+        reasoning_parser_obj=None,
+        tool_parser_obj=None,
+        enable_processor_cache: bool = False,
+    ):
+        if model_type not in _SUPPORTED_MODEL_TYPES:
+            raise ValueError(
+                f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}."
+            )
+        self.model_type = model_type
+        self.config = config
+        self.enable_processor_cache = enable_processor_cache
+        # Only ernie4_5_vl requests the "ernie4_5" tokenizer type; every other type uses "auto".
+        tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
+
+        super().__init__(
+            model_name_or_path,
+            tokenizer_type=tokenizer_type,
+            reasoning_parser_obj=reasoning_parser_obj,
+            tool_parser_obj=tool_parser_obj,
+        )
+
+        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
+        # Order matters: kwargs are validated, the processor is built from them, then its ids are read.
+        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
+        self._init_mm_processor(processor_kwargs)
+        self._init_mm_config()
+        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
+
+    def _load_tokenizer(self):
+        """Build the tokenizer for ``self.model_name_or_path``.
+
+        The "ernie4_5" tokenizer type is loaded with no extra arguments; any
+        other type is loaded with ``padding_side="left"`` and ``use_fast=True``.
+        """
+        from paddleformers.transformers import AutoTokenizer
+
+        if self.tokenizer_type == "ernie4_5":
+            return AutoTokenizer.from_pretrained(self.model_name_or_path)
+        return AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
+
+    def _init_mm_processor(self, processor_kwargs: dict):
+        """Create the model-type-specific internal DataProcessor."""
+        if self.model_type == QWEN_VL:
+            from fastdeploy.input.qwen_vl_processor.process import DataProcessor
+            # tokens_per_second comes from config.vision_config when present, otherwise defaults to 2.
+            tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
+            self.processor = DataProcessor(
+                model_path=self.model_name_or_path,
+                enable_processor_cache=self.enable_processor_cache,
+                tokens_per_second=tokens_per_second,
+                tokenizer=self.tokenizer,
+                **processor_kwargs,
+            )
+        elif self.model_type == QWEN3_VL:
+            from fastdeploy.input.qwen3_vl_processor.process import DataProcessor
+            # This branch passes no tokens_per_second argument.
+            self.processor = DataProcessor(
+                model_path=self.model_name_or_path,
+                enable_processor_cache=self.enable_processor_cache,
+                tokenizer=self.tokenizer,
+                **processor_kwargs,
+            )
+        elif self.model_type == PADDLEOCR_VL:
+            from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
+            # Same constructor arguments as the qwen_vl branch, different DataProcessor class.
+            tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
+            self.processor = DataProcessor(
+                model_path=self.model_name_or_path,
+                enable_processor_cache=self.enable_processor_cache,
+                tokens_per_second=tokens_per_second,
+                tokenizer=self.tokenizer,
+                **processor_kwargs,
+            )
+        elif self.model_type == ERNIE4_5_VL:
+            from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor
+            # This DataProcessor is given names/paths rather than the tokenizer object, then eval() is called.
+            self.processor = DataProcessor(
+                tokenizer_name=self.model_name_or_path,
+                image_preprocessor_name=self.model_name_or_path,
+                enable_processor_cache=self.enable_processor_cache,
+                **processor_kwargs,
+            )
+            self.processor.eval()
+
+    def _init_mm_config(self):
+        """Copy the image token id (and ernie's spatial conv size) off ``self.processor``."""
+        kind = self.model_type
+        if kind in (PADDLEOCR_VL, ERNIE4_5_VL):
+            self.image_patch_id = self.processor.image_patch_id
+            if kind == ERNIE4_5_VL:
+                self.spatial_conv_size = self.processor.spatial_conv_size
+        elif kind in (QWEN_VL, QWEN3_VL):
+            self.image_patch_id = self.processor.image_token_id
+
+    def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
+        """Validate user-supplied ``mm_processor_kwargs``.
+
+        Returns ``kwargs`` unchanged when it is a dict whose known keys carry
+        the expected types. A falsy value yields ``{}``; any validation
+        failure is logged as a warning and also yields ``{}``.
+        """
+        if not kwargs:
+            return {}
+
+        try:
+            if not isinstance(kwargs, dict):
+                raise ValueError("mm-processor-kwargs must be a dictionary")
+
+            data_processor_logger.info(f"Processing kwargs: {kwargs}")
+
+            schema = _ERNIE_EXPECTED_KWARGS if self.model_type == ERNIE4_5_VL else _QWEN_EXPECTED_KWARGS
+            for name, given in kwargs.items():
+                wanted = schema.get(name)
+                if wanted is not None and not isinstance(given, wanted):
+                    raise ValueError(
+                        f"Invalid type for {name}: expected " f"{wanted.__name__}, got {type(given).__name__}"
+                    )
+            return kwargs
+        except Exception as e:
+            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
+            return {}
+
+    def _parse_limits(self, limits: Optional[dict]) -> dict:
+        """Overlay user-supplied per-modality limits on the defaults.
+
+        A falsy or non-dict ``limits`` yields a fresh copy of the defaults.
+        """
+        if not limits:
+            return dict(_DEFAULT_MM_LIMITS)
+        if not isinstance(limits, dict):
+            reason = "limit-mm-per-prompt must be a dictionary"
+            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {reason}, using default limits")
+            return dict(_DEFAULT_MM_LIMITS)
+        data_processor_logger.info(f"_parse_limits:{limits}")
+        return {**_DEFAULT_MM_LIMITS, **limits}
+
+    def _check_mm_limits(self, item):
+        """Raise ``ValueError`` if a prompt exceeds its per-modality limits.
+
+        ``item`` is a modality->inputs dict or a list of chat messages.
+        """
+        if isinstance(item, dict):
+            mm_data = item
+        else:
+            if self.model_type in _TYPES_ACCEPT_URL_SUFFIX:
+                image_types, video_types = ("image_url", "image"), ("video_url", "video")
+            else:
+                image_types, video_types = ("image",), ("video",)
+
+            mm_data = {"image": [], "video": []}
+            for message in item:
+                content = message.get("content")
+                for part in (content if isinstance(content, list) else ()):
+                    part_type = part.get("type")
+                    if part_type in image_types:
+                        mm_data["image"].append(part)
+                    elif part_type in video_types:
+                        mm_data["video"].append(part)
+
+        for modality, data in mm_data.items():
+            if modality in self.limit_mm_per_prompt:
+                limit = self.limit_mm_per_prompt[modality]
+                if len(data) > limit:
+                    raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
+
+    def _get_processor_cache(self, socket, mm_hashes: list) -> list:
+        """Retrieve cached processor results for the given hashes."""
+        req = pickle.dumps(mm_hashes)
+        socket.send_multipart([b"", req])
+        _, resp = socket.recv_multipart()  # the reply must unpack into exactly two frames
+        mm_items = pickle.loads(resp)  # NOTE(review): unpickling can run arbitrary code; peer must be trusted
+        data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
+        return mm_items
+
+    def _update_processor_cache(self, socket, mm_hashes: list, mm_items):
+        """Send the pickled ``(mm_hashes, mm_items)`` pair over ``socket`` and log the hashes."""
+        payload = pickle.dumps((mm_hashes, mm_items))
+        socket.send_multipart([b"", payload])
+        data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
+        """Delegate to the ernie4_5_vl processor; every other model type returns ``None``."""
+        if self.model_type != ERNIE4_5_VL:
+            return None
+        return self.processor.get_mm_max_tokens_per_item(seq_len)
+
+    def process_request_dict(self, request, max_model_len=None):
+        """Dispatch ``request`` to the handler that matches ``self.model_type``."""
+        handlers = {
+            QWEN_VL: self._process_request_qwen_vl,
+            QWEN3_VL: self._process_request_qwen3_vl,
+            PADDLEOCR_VL: self._process_request_paddleocr_vl,
+            ERNIE4_5_VL: self._process_request_ernie4_5_vl,
+        }
+        handler = handlers.get(self.model_type)
+        return handler(request, max_model_len) if handler is not None else None
+
+    def _process_request_qwen_vl(self, request, max_model_len):
+        """Process request for qwen_vl model type."""
+        request = self._apply_default_parameters(request)
+        if not request.get("eos_token_ids"):
+            request["eos_token_ids"] = self.eos_token_ids
+
+        process_stop_token_ids(request, self.update_stop_seq)
+
+        bad_words = request.get("bad_words")
+        bad_words_token_ids = request.get("bad_words_token_ids")
+        if bad_words:
+            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
+            request["bad_words_token_ids"] = bad_words_token_ids
+
+        if request.get("prompt"):
+            multimodal_data = request.get("multimodal_data")
+            if multimodal_data is None:
+                multimodal_data = {}
+            self._check_mm_limits(multimodal_data)
+            images = multimodal_data.get("image", None)
+            videos = multimodal_data.get("video", None)
+            outputs = self.processor.text2ids(request["prompt"], images, videos)
+
+        elif request.get("messages"):
+            messages = request["messages"]
+            self._check_mm_limits(messages)
+            chat_template_kwargs = request.get("chat_template_kwargs")
+            if chat_template_kwargs:
+                if isinstance(chat_template_kwargs, dict):
+                    for k, v in chat_template_kwargs.items():
+                        if k not in request or request[k] is None:
+                            request[k] = v
+                else:
+                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
+            request.setdefault("enable_thinking", False)
+            outputs = self.processor.request2ids(request)
+
+        else:
+            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
+
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
+
+        # qwen25_vl does not support thinking; the reasoning-parser block below may still overwrite this.
+        request["enable_thinking"] = False
+
+        outputs = self.pack_outputs(outputs)
+
+        request["prompt_token_ids"] = outputs["input_ids"].tolist()
+        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
+        request["multimodal_inputs"] = outputs
+        # Only prompt_token_ids is truncated; prompt_token_ids_len and multimodal_inputs stay as set above.
+        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
+            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
+        # NOTE(review): this subtraction raises TypeError when max_model_len is None (the public default).
+        max_tokens = max_model_len - len(request["prompt_token_ids"])
+        if request.get("max_tokens") is None:
+            request["max_tokens"] = max(1, max_tokens)
+        else:
+            request["max_tokens"] = min(max_tokens, request["max_tokens"])
+        # A request_id shaped "<id>_<index>" stores the status under ids index*n .. (index+1)*n - 1.
+        if self.reasoning_parser:
+            model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+            parts = request["request_id"].split("_")
+            if len(parts) > 1:
+                real_req_id = parts[0]
+                index = int(parts[1])
+                n = request.get("n", 1)
+                for idx in range(index * n, (index + 1) * n):
+                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
+            else:
+                self.model_status_dict[request["request_id"]] = model_status
+            request["enable_thinking"] = model_status == "think_start"
+
+        data_processor_logger.info(f"Processed request {request}")
+        return request
+
+    def _process_request_qwen3_vl(self, request, max_model_len):
+        """Process request for qwen3_vl model type."""
+        request = self._apply_default_parameters(request)
+        if not request.get("eos_token_ids"):
+            request["eos_token_ids"] = self.eos_token_ids
+
+        stop_sequences = request.get("stop", [])
+        if stop_sequences:
+            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
+            request["stop_token_ids"] = stop_seqs
+            request["stop_seqs_len"] = stop_seqs_len
+
+        bad_words = request.get("bad_words")
+        bad_words_token_ids = request.get("bad_words_token_ids")
+        if bad_words:
+            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
+            request["bad_words_token_ids"] = bad_words_token_ids
+
+        if request.get("prompt_token_ids"):
+            messages = request.get("messages")
+            if messages:
+                self._check_mm_limits(messages)
+            request.setdefault("enable_thinking", False)
+            outputs = self.processor.prompt_token_ids2outputs(request)
+
+        elif request.get("prompt"):
+            multimodal_data = request.get("multimodal_data")
+            if multimodal_data is None:
+                multimodal_data = {}
+            self._check_mm_limits(multimodal_data)
+            images = multimodal_data.get("image", None)
+            videos = multimodal_data.get("video", None)
+            outputs = self.processor.text2ids(request["prompt"], images, videos)
+
+        elif request.get("messages"):
+            messages = request["messages"]
+            self._check_mm_limits(messages)
+            chat_template_kwargs = request.get("chat_template_kwargs")
+            if chat_template_kwargs:
+                if isinstance(chat_template_kwargs, dict):
+                    for k, v in chat_template_kwargs.items():
+                        if k not in request or request[k] is None:
+                            request[k] = v
+                else:
+                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
+            request.setdefault("enable_thinking", False)
+            outputs = self.processor.request2ids(request)
+
+        else:
+            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
+
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
+
+        # qwen3_vl does not support thinking
+        request["enable_thinking"] = False
+
+        outputs = self.pack_outputs(outputs)
+        # Caller-supplied prompt_token_ids take precedence over the ids produced by the processor.
+        request["prompt_token_ids"] = (
+            outputs["input_ids"].tolist() if not request.get("prompt_token_ids") else request["prompt_token_ids"]
+        )
+        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
+        request["multimodal_inputs"] = outputs
+        # Only prompt_token_ids is truncated; prompt_token_ids_len and multimodal_inputs stay as set above.
+        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
+            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
+        # NOTE(review): this subtraction raises TypeError when max_model_len is None (the public default).
+        max_tokens = max_model_len - len(request["prompt_token_ids"])
+        if request.get("max_tokens") is None:
+            request["max_tokens"] = max(1, max_tokens)
+        else:
+            request["max_tokens"] = min(max_tokens, request["max_tokens"])
+
+        data_processor_logger.info(f"Processed request {request}")
+        return request
+
+    def _process_request_paddleocr_vl(self, request, max_model_len):
+        """Process request for paddleocr_vl model type."""
+        request = self._apply_default_parameters(request)
+        if not request.get("eos_token_ids"):
+            request["eos_token_ids"] = self.eos_token_ids
+
+        process_stop_token_ids(request, self.update_stop_seq)
+
+        if request.get("prompt"):
+            multimodal_data = request.get("multimodal_data")
+            if multimodal_data is None:
+                multimodal_data = {}
+            self._check_mm_limits(multimodal_data)
+            images = multimodal_data.get("image", None)
+            videos = multimodal_data.get("video", None)
+            outputs = self.processor.text2ids(request["prompt"], images, videos)
+
+        elif request.get("messages"):
+            messages = request["messages"]
+            self._check_mm_limits(messages)
+            outputs = self.processor.request2ids(request)
+
+        else:
+            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
+        # Already-generated tokens are read from metadata["generated_token_ids"] for this model type.
+        metadata = request.get("metadata")
+        if metadata and metadata.get("generated_token_ids"):
+            self._append_generated_tokens_qwen(outputs, metadata["generated_token_ids"])
+
+        outputs = self.pack_outputs(outputs)
+
+        request["prompt_token_ids"] = outputs["input_ids"].tolist()
+        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
+        request["multimodal_inputs"] = outputs
+        # Only prompt_token_ids is truncated; prompt_token_ids_len and multimodal_inputs stay as set above.
+        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
+            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
+        # NOTE(review): this subtraction raises TypeError when max_model_len is None (the public default).
+        max_tokens = max_model_len - len(request["prompt_token_ids"])
+        if request.get("max_tokens") is None:
+            request["max_tokens"] = max(1, max_tokens)
+        else:
+            request["max_tokens"] = min(max_tokens, request["max_tokens"])
+        # A top_p below _SAMPLING_EPS is raised to _SAMPLING_EPS and top_k is forced to 1.
+        if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
+            request["top_p"] = _SAMPLING_EPS
+            request["top_k"] = 1
+        # A request_id shaped "<id>_<index>" stores the status under ids index*n .. (index+1)*n - 1.
+        if self.reasoning_parser:
+            model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+            parts = request["request_id"].split("_")
+            if len(parts) > 1:
+                real_req_id = parts[0]
+                index = int(parts[1])
+                n = request.get("n", 1)
+                for idx in range(index * n, (index + 1) * n):
+                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
+            else:
+                self.model_status_dict[request["request_id"]] = model_status
+            request["enable_thinking"] = model_status == "think_start"
+
+        return request
+
+    def _process_request_ernie4_5_vl(self, request, max_model_len):
+        """Process request for ernie4_5_vl model type."""
+        request = self._apply_default_parameters(request)
+        if not request.get("eos_token_ids"):
+            request["eos_token_ids"] = self.eos_token_ids
+
+        process_stop_token_ids(request, self.update_stop_seq)
+
+        bad_words = request.get("bad_words")
+        bad_words_token_ids = request.get("bad_words_token_ids")
+        if bad_words:
+            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
+            request["bad_words_token_ids"] = bad_words_token_ids
+
+        logits_processors_args = self._prepare_think_stop_sentence(
+            request.get("logits_processors_args") or {}, max_model_len
+        )
+        request["logits_processors_args"] = logits_processors_args
+
+        if request.get("prompt_token_ids"):
+            messages = request.get("messages")
+            if messages:
+                self._check_mm_limits(messages)
+            request.setdefault("enable_thinking", True)
+            outputs = self.processor.prompt_token_ids2outputs(request)
+        elif request.get("prompt"):
+            multimodal_data = request.get("multimodal_data")
+            if multimodal_data is None:
+                multimodal_data = {}
+            self._check_mm_limits(multimodal_data)
+            images = multimodal_data.get("image", None)
+            videos = multimodal_data.get("video", None)
+            request["prompt_tokens"] = request.get("prompt")
+            request.setdefault("enable_thinking", True)
+            outputs = self.processor.text2ids(request["prompt"], images, videos)
+        elif request.get("messages"):
+            messages = request["messages"]
+            self._check_mm_limits(messages)
+            chat_template_kwargs = request.get("chat_template_kwargs")
+            if chat_template_kwargs:
+                if isinstance(chat_template_kwargs, dict):
+                    for k, v in chat_template_kwargs.items():
+                        if k not in request or request[k] is None:
+                            request[k] = v
+                else:
+                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
+            request.setdefault("enable_thinking", True)
+            outputs = self.processor.request2ids(request)
+        else:
+            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
+
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
+
+        outputs = self.pack_outputs(outputs)
+        request["prompt_token_ids"] = (
+            outputs["input_ids"].tolist()
+            if ("prompt_token_ids" not in request or not request["prompt_token_ids"])
+            else request["prompt_token_ids"]
+        )
+        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
+        request["multimodal_inputs"] = outputs
+        # Only prompt_token_ids is truncated; prompt_token_ids_len and multimodal_inputs stay as set above.
+        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
+            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
+        logits_processors_args = self._update_thinking_prompt_state(
+            request["prompt_token_ids"], request.get("logits_processors_args") or {}
+        )
+        request["logits_processors_args"] = logits_processors_args
+        # NOTE(review): this subtraction raises TypeError when max_model_len is None (the public default).
+        max_tokens = max_model_len - len(request["prompt_token_ids"])
+        if request.get("max_tokens") is None:
+            request["max_tokens"] = max(1, max_tokens)
+        else:
+            request["max_tokens"] = min(max_tokens, request["max_tokens"])
+        if request.get("reasoning_max_tokens") is None:
+            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
+        # A request_id shaped "<id>_<index>" stores the status under ids index*n .. (index+1)*n - 1.
+        if self.reasoning_parser:
+            model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+            parts = request["request_id"].split("_")
+            if len(parts) > 1:
+                real_req_id = parts[0]
+                index = int(parts[1])
+                n = request.get("n", 1)
+                for idx in range(index * n, (index + 1) * n):
+                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
+            else:
+                self.model_status_dict[request["request_id"]] = model_status
+            request["enable_thinking"] = model_status == "think_start"
+        if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
+            request["top_p"] = _SAMPLING_EPS
+            request["top_k"] = 1
+        if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
+            request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
+
+        data_processor_logger.info(f"Processed request {request}")
+        return request
+
+    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
+        """Extend ``multimodal_inputs`` in place with ``completion_token_ids``."""
+        # qwen_vl, qwen3_vl and paddleocr_vl all go through the same helper.
+        if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
+            self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
+            return
+        if self.model_type == ERNIE4_5_VL:
+            self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
+
+    def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
+        """Append tokens (token type id 0) and their positions to ``multimodal_inputs``."""
+        count = len(completion_token_ids)
+        start = multimodal_inputs["cur_position"]
+
+        multimodal_inputs["input_ids"].extend(completion_token_ids)
+        multimodal_inputs["token_type_ids"].extend([0] * count)
+        multimodal_inputs["position_ids"].append(self.processor._compute_text_positions(start, count))
+        multimodal_inputs["cur_position"] += count
+
+    def _append_generated_tokens_qwen(self, multimodal_inputs, generated_token_ids):
+        """Append generated tokens for paddleocr_vl (uses metadata.generated_token_ids)."""
+        self._append_completion_tokens_qwen(multimodal_inputs, generated_token_ids)  # pure delegate
+
+    def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
+        """Append text tokens to ernie4_5_vl inputs, adding one 3-element position per token."""
+        count = len(completion_token_ids)
+        first = multimodal_inputs["cur_position"]
+        text_flag = IDS_TYPE_FLAG["text"]
+
+        multimodal_inputs["input_ids"].extend(completion_token_ids)
+        multimodal_inputs["token_type_ids"].extend([text_flag] * count)
+        multimodal_inputs["position_ids"].extend([pos] * 3 for pos in range(first, first + count))
+        multimodal_inputs["cur_position"] += count
+
+    def pack_outputs(self, outputs):
+        """Finalize ``outputs`` with the packer that matches ``self.model_type``."""
+        if self.model_type == ERNIE4_5_VL:
+            return self._pack_outputs_ernie(outputs)
+        if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
+            return self._pack_outputs_qwen(outputs)
+
+    def _pack_outputs_qwen(self, outputs):
+        """Convert the list-valued fields of ``outputs`` to arrays, in place.
+
+        Used for qwen_vl / qwen3_vl / paddleocr_vl. The three image fields
+        become ``None`` when ``outputs["images"]`` is empty.
+        """
+        has_images = bool(outputs["images"])
+        outputs["images"] = np.vstack(outputs["images"]) if has_images else None
+        outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) if has_images else None
+        outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) if has_images else None
+
+        for key in ("input_ids", "token_type_ids"):
+            outputs[key] = np.array(outputs[key], dtype=np.int64)
+        # Position pieces are joined along axis 1 and the result is transposed below.
+        stacked = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
+
+        outputs["image_patch_id"] = self.processor.image_token_id
+        outputs["video_patch_id"] = self.processor.video_token_id
+        outputs["position_ids"] = stacked.transpose(1, 0)
+        outputs["mm_num_token_func"] = self.processor.mm_num_tokens
+        return outputs
+
+    def _pack_outputs_ernie(self, outputs):
+        """Convert the list-valued fields of ernie4_5_vl ``outputs`` to arrays, in place."""
+        if outputs["images"]:
+            outputs["images"] = np.vstack(outputs["images"])
+            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])
+            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])
+        else:
+            outputs["images"] = None
+            outputs["grid_thw"] = None
+            outputs["image_type_ids"] = None
+
+        outputs["image_patch_id"] = self.image_patch_id
+        for key in ("input_ids", "token_type_ids", "position_ids"):
+            outputs[key] = np.array(outputs[key], dtype=np.int64)
+        # position_ids is converted directly here, with no concatenate or transpose step.
+        outputs["mm_num_token_func"] = self.processor.mm_num_tokens
+        return outputs
diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py
index 8568d1ff32d..0261177eeaa 100644
--- a/fastdeploy/input/preprocess.py
+++ b/fastdeploy/input/preprocess.py
@@ -106,74 +106,34 @@ def create_processor(self):
                         tool_parser_obj=tool_parser_obj,
                     )
             else:
-                if ErnieArchitectures.contains_ernie_arch(architecture):
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.ernie4_5_vl_processor import (
-                            Ernie4_5_VLProcessor,
-                        )
-                    else:
-                        from fastdeploy.input.v1.ernie4_5_vl_processor import (
-                            Ernie4_5_VLProcessor,
-                        )
+                from fastdeploy.input.multimodal_processor import (
+                    ERNIE4_5_VL,
+                    PADDLEOCR_VL,
+                    QWEN3_VL,
+                    QWEN_VL,
+                    MultiModalProcessor,
+                )
 
-                    self.processor = Ernie4_5_VLProcessor(
-                        model_name_or_path=self.model_name_or_path,
-                        limit_mm_per_prompt=self.limit_mm_per_prompt,
-                        mm_processor_kwargs=self.mm_processor_kwargs,
-                        reasoning_parser_obj=reasoning_parser_obj,
-                        tool_parser_obj=tool_parser_obj,
-                        enable_processor_cache=self.enable_processor_cache,
-                    )
+                if ErnieArchitectures.contains_ernie_arch(architecture):
+                    model_type = ERNIE4_5_VL
                 elif "PaddleOCRVL" in architecture:
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.paddleocr_vl_processor import (
-                            PaddleOCRVLProcessor,
-                        )
-                    else:
-                        from fastdeploy.input.v1.paddleocr_vl_processor import (
-                            PaddleOCRVLProcessor,
-                        )
-
-                    self.processor = PaddleOCRVLProcessor(
-                        config=self.model_config,
-                        model_name_or_path=self.model_name_or_path,
-                        limit_mm_per_prompt=self.limit_mm_per_prompt,
-                        mm_processor_kwargs=self.mm_processor_kwargs,
-                        reasoning_parser_obj=reasoning_parser_obj,
-                    )
+                    model_type = PADDLEOCR_VL
                 elif "Qwen2_5_VL" in architecture:
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
-                    else:
-                        from fastdeploy.input.v1.qwen_vl_processor import (
-                            QwenVLProcessor,
-                        )
-
-                    self.processor = QwenVLProcessor(
-                        config=self.model_config,
-                        model_name_or_path=self.model_name_or_path,
-                        limit_mm_per_prompt=self.limit_mm_per_prompt,
-                        mm_processor_kwargs=self.mm_processor_kwargs,
-                        reasoning_parser_obj=reasoning_parser_obj,
-                        enable_processor_cache=self.enable_processor_cache,
-                    )
+                    model_type = QWEN_VL
                 elif "Qwen3VL" in architecture:
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor
-                    else:
-                        from fastdeploy.input.v1.qwen3_vl_processor import (
-                            Qwen3VLProcessor,
-                        )
-
-                    self.processor = Qwen3VLProcessor(
-                        config=self.model_config,
-                        model_name_or_path=self.model_name_or_path,
-                        limit_mm_per_prompt=self.limit_mm_per_prompt,
-                        mm_processor_kwargs=self.mm_processor_kwargs,
-                        reasoning_parser_obj=reasoning_parser_obj,
-                        enable_processor_cache=self.enable_processor_cache,
-                    )
+                    model_type = QWEN3_VL
                 else:
                     raise ValueError(f"Unsupported model processor architecture: {architecture}. ")
 
+                self.processor = MultiModalProcessor(
+                    model_name_or_path=self.model_name_or_path,
+                    model_type=model_type,
+                    config=self.model_config,
+                    limit_mm_per_prompt=self.limit_mm_per_prompt,
+                    mm_processor_kwargs=self.mm_processor_kwargs,
+                    reasoning_parser_obj=reasoning_parser_obj,
+                    tool_parser_obj=tool_parser_obj,
+                    enable_processor_cache=self.enable_processor_cache,
+                )
+
         return self.processor
diff --git a/tests/input/test_image_preprocessor_adaptive.py b/tests/input/test_image_preprocessor_adaptive.py
index cc9ed857554..d01ce6e179e 100644
--- a/tests/input/test_image_preprocessor_adaptive.py
+++ b/tests/input/test_image_preprocessor_adaptive.py
@@ -340,9 +340,7 @@ def test_preprocess_scaled_image_warning(self):
         # Create a scaled image (values between 0-1)
         img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
         # Use patch to capture warning
-        with patch(
-            "fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger"
-        ) as mock_logger:
+        with patch("fastdeploy.input.image_processors.adaptive_processor.data_processor_logger") as mock_logger:
             # Directly call _preprocess, pass scaled image
             self.processor._preprocess(
                 [img_array],  # Pass scaled numpy array
@@ -356,9 +354,7 @@ def test_preprocess_invalid_images_check(self):
         """Test invalid image check in preprocess (line 464)"""
         # Test invalid image type - need to ensure valid_images returns False
         # Use patch to make valid_images return False, but make_batched_images succeeds
-        with patch(
-            "fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images"
-        ) as mock_valid:
+        with patch("fastdeploy.input.image_processors.adaptive_processor.valid_images") as mock_valid:
             mock_valid.return_value = False
             valid_images_list = [Image.new("RGB", (224, 224))]  # Valid image, but valid_images returns False
             with self.assertRaises(ValueError) as context:

From f5bdd67062dddce30d1ac2436e2cf85fbaa361be Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Tue, 31 Mar 2026 16:11:39 +0800
Subject: [PATCH 3/8] multimodal: unify process_request_dict across VL model types

---
 fastdeploy/input/multimodal_processor.py | 350 ++++++-----------------
 1 file changed, 88 insertions(+), 262 deletions(-)

diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py
index 3e5d5c896f1..afada3d2e3d 100644
--- a/fastdeploy/input/multimodal_processor.py
+++ b/fastdeploy/input/multimodal_processor.py
@@ -265,268 +265,117 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]
         return None
 
     def process_request_dict(self, request, max_model_len=None):
-        """Process a request dictionary into model inputs."""
-        if self.model_type == QWEN_VL:
-            return self._process_request_qwen_vl(request, max_model_len)
-        elif self.model_type == QWEN3_VL:
-            return self._process_request_qwen3_vl(request, max_model_len)
-        elif self.model_type == PADDLEOCR_VL:
-            return self._process_request_paddleocr_vl(request, max_model_len)
-        elif self.model_type == ERNIE4_5_VL:
-            return self._process_request_ernie4_5_vl(request, max_model_len)
+        """Process a request dictionary into model inputs.
 
-    def _process_request_qwen_vl(self, request, max_model_len):
-        """Process request for qwen_vl model type."""
+        Unified template-method flow for all VL model types.  Per-model
+        differences are handled by small conditional branches rather than
+        duplicating the entire pipeline.
+        """
         request = self._apply_default_parameters(request)
+
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids
 
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        bad_words = request.get("bad_words")
-        bad_words_token_ids = request.get("bad_words_token_ids")
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request["bad_words_token_ids"] = bad_words_token_ids
+        self._process_stop_tokens(request)
 
-        if request.get("prompt"):
-            multimodal_data = request.get("multimodal_data")
-            if multimodal_data is None:
-                multimodal_data = {}
-            self._check_mm_limits(multimodal_data)
-            images = multimodal_data.get("image", None)
-            videos = multimodal_data.get("video", None)
-            outputs = self.processor.text2ids(request["prompt"], images, videos)
+        if self.model_type != PADDLEOCR_VL:
+            self._process_bad_words(request)
 
-        elif request.get("messages"):
-            messages = request["messages"]
-            self._check_mm_limits(messages)
-            chat_template_kwargs = request.get("chat_template_kwargs")
-            if chat_template_kwargs:
-                if isinstance(chat_template_kwargs, dict):
-                    for k, v in chat_template_kwargs.items():
-                        if k not in request or request[k] is None:
-                            request[k] = v
-                else:
-                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-            request.setdefault("enable_thinking", False)
-            outputs = self.processor.request2ids(request)
+        if self.model_type == ERNIE4_5_VL:
+            logits_processors_args = self._prepare_think_stop_sentence(
+                request.get("logits_processors_args") or {}, max_model_len
+            )
+            request["logits_processors_args"] = logits_processors_args
 
-        else:
-            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
+        outputs = self._tokenize_request(request)
 
-        if request.get("completion_token_ids"):
-            self.append_completion_tokens(outputs, request["completion_token_ids"])
+        self._process_post_tokens(request, outputs)
 
-        # qwen25_vl not support thinking
-        request["enable_thinking"] = False
+        if self.model_type in (QWEN_VL, QWEN3_VL):
+            request["enable_thinking"] = False
 
         outputs = self.pack_outputs(outputs)
 
-        request["prompt_token_ids"] = outputs["input_ids"].tolist()
+        if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
+            pass  # preserve existing prompt_token_ids
+        else:
+            request["prompt_token_ids"] = outputs["input_ids"].tolist()
         request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
         request["multimodal_inputs"] = outputs
 
         if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
             request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
 
+        if self.model_type == ERNIE4_5_VL:
+            logits_processors_args = self._update_thinking_prompt_state(
+                request["prompt_token_ids"], request.get("logits_processors_args") or {}
+            )
+            request["logits_processors_args"] = logits_processors_args
+
         max_tokens = max_model_len - len(request["prompt_token_ids"])
         if request.get("max_tokens") is None:
             request["max_tokens"] = max(1, max_tokens)
         else:
             request["max_tokens"] = min(max_tokens, request["max_tokens"])
 
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
-            parts = request["request_id"].split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.get("n", 1)
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request["request_id"]] = model_status
-            request["enable_thinking"] = model_status == "think_start"
-
-        data_processor_logger.info(f"Processed request {request}")
-        return request
-
-    def _process_request_qwen3_vl(self, request, max_model_len):
-        """Process request for qwen3_vl model type."""
-        request = self._apply_default_parameters(request)
-        if not request.get("eos_token_ids"):
-            request["eos_token_ids"] = self.eos_token_ids
-
-        stop_sequences = request.get("stop", [])
-        if stop_sequences:
-            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
-            request["stop_token_ids"] = stop_seqs
-            request["stop_seqs_len"] = stop_seqs_len
-
-        bad_words = request.get("bad_words")
-        bad_words_token_ids = request.get("bad_words_token_ids")
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request["bad_words_token_ids"] = bad_words_token_ids
-
-        if request.get("prompt_token_ids"):
-            messages = request.get("messages")
-            if messages:
-                self._check_mm_limits(messages)
-            request.setdefault("enable_thinking", False)
-            outputs = self.processor.prompt_token_ids2outputs(request)
-
-        elif request.get("prompt"):
-            multimodal_data = request.get("multimodal_data")
-            if multimodal_data is None:
-                multimodal_data = {}
-            self._check_mm_limits(multimodal_data)
-            images = multimodal_data.get("image", None)
-            videos = multimodal_data.get("video", None)
-            outputs = self.processor.text2ids(request["prompt"], images, videos)
-
-        elif request.get("messages"):
-            messages = request["messages"]
-            self._check_mm_limits(messages)
-            chat_template_kwargs = request.get("chat_template_kwargs")
-            if chat_template_kwargs:
-                if isinstance(chat_template_kwargs, dict):
-                    for k, v in chat_template_kwargs.items():
-                        if k not in request or request[k] is None:
-                            request[k] = v
-                else:
-                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-            request.setdefault("enable_thinking", False)
-            outputs = self.processor.request2ids(request)
-
-        else:
-            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
-        if request.get("completion_token_ids"):
-            self.append_completion_tokens(outputs, request["completion_token_ids"])
-
-        # qwen3_vl not support thinking
-        request["enable_thinking"] = False
-
-        outputs = self.pack_outputs(outputs)
+        if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None:
+            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
 
-        request["prompt_token_ids"] = (
-            outputs["input_ids"].tolist() if not request.get("prompt_token_ids") else request["prompt_token_ids"]
-        )
-        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
-        request["multimodal_inputs"] = outputs
+        if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
+            if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
+                request["top_p"] = _SAMPLING_EPS
+                request["top_k"] = 1
 
-        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
-            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
+        if self.model_type != QWEN3_VL and self.reasoning_parser:
+            self._apply_reasoning_parser(request)
 
-        max_tokens = max_model_len - len(request["prompt_token_ids"])
-        if request.get("max_tokens") is None:
-            request["max_tokens"] = max(1, max_tokens)
-        else:
-            request["max_tokens"] = min(max_tokens, request["max_tokens"])
+        if self.model_type == ERNIE4_5_VL:
+            if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
+                request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
 
         data_processor_logger.info(f"Processed request {request}")
         return request
 
-    def _process_request_paddleocr_vl(self, request, max_model_len):
-        """Process request for paddleocr_vl model type."""
-        request = self._apply_default_parameters(request)
-        if not request.get("eos_token_ids"):
-            request["eos_token_ids"] = self.eos_token_ids
-
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        if request.get("prompt"):
-            multimodal_data = request.get("multimodal_data")
-            if multimodal_data is None:
-                multimodal_data = {}
-            self._check_mm_limits(multimodal_data)
-            images = multimodal_data.get("image", None)
-            videos = multimodal_data.get("video", None)
-            outputs = self.processor.text2ids(request["prompt"], images, videos)
-
-        elif request.get("messages"):
-            messages = request["messages"]
-            self._check_mm_limits(messages)
-            outputs = self.processor.request2ids(request)
-
-        else:
-            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
-        metadata = request.get("metadata")
-        if metadata and metadata.get("generated_token_ids"):
-            self._append_generated_tokens_qwen(outputs, metadata["generated_token_ids"])
-
-        outputs = self.pack_outputs(outputs)
-
-        request["prompt_token_ids"] = outputs["input_ids"].tolist()
-        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
-        request["multimodal_inputs"] = outputs
-
-        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
-            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
-
-        max_tokens = max_model_len - len(request["prompt_token_ids"])
-        if request.get("max_tokens") is None:
-            request["max_tokens"] = max(1, max_tokens)
+    def _process_stop_tokens(self, request):
+        """Handle stop token processing based on model type."""
+        if self.model_type == QWEN3_VL:
+            stop_sequences = request.get("stop", [])
+            if stop_sequences:
+                stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
+                request["stop_token_ids"] = stop_seqs
+                request["stop_seqs_len"] = stop_seqs_len
         else:
-            request["max_tokens"] = min(max_tokens, request["max_tokens"])
-
-        if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
-            request["top_p"] = _SAMPLING_EPS
-            request["top_k"] = 1
-
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
-            parts = request["request_id"].split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.get("n", 1)
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request["request_id"]] = model_status
-            request["enable_thinking"] = model_status == "think_start"
-
-        return request
-
-    def _process_request_ernie4_5_vl(self, request, max_model_len):
-        """Process request for ernie4_5_vl model type."""
-        request = self._apply_default_parameters(request)
-        if not request.get("eos_token_ids"):
-            request["eos_token_ids"] = self.eos_token_ids
-
-        process_stop_token_ids(request, self.update_stop_seq)
+            process_stop_token_ids(request, self.update_stop_seq)
 
+    def _process_bad_words(self, request):
+        """Process bad_words into token ids."""
         bad_words = request.get("bad_words")
         bad_words_token_ids = request.get("bad_words_token_ids")
         if bad_words:
             bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
             request["bad_words_token_ids"] = bad_words_token_ids
 
-        logits_processors_args = self._prepare_think_stop_sentence(
-            request.get("logits_processors_args") or {}, max_model_len
-        )
-        request["logits_processors_args"] = logits_processors_args
+    def _tokenize_request(self, request):
+        """Core tokenization dispatch: prompt_token_ids > prompt > messages."""
+        default_thinking = True if self.model_type == ERNIE4_5_VL else False
 
-        if request.get("prompt_token_ids"):
+        if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
             messages = request.get("messages")
             if messages:
                 self._check_mm_limits(messages)
-            request.setdefault("enable_thinking", True)
-            outputs = self.processor.prompt_token_ids2outputs(request)
+            request.setdefault("enable_thinking", default_thinking)
+            return self.processor.prompt_token_ids2outputs(request)
+
         elif request.get("prompt"):
-            multimodal_data = request.get("multimodal_data")
-            if multimodal_data is None:
-                multimodal_data = {}
+            multimodal_data = request.get("multimodal_data") or {}
             self._check_mm_limits(multimodal_data)
             images = multimodal_data.get("image", None)
             videos = multimodal_data.get("video", None)
-            request["prompt_tokens"] = request.get("prompt")
-            request.setdefault("enable_thinking", True)
-            outputs = self.processor.text2ids(request["prompt"], images, videos)
+            if self.model_type == ERNIE4_5_VL:
+                request["prompt_tokens"] = request.get("prompt")
+            request.setdefault("enable_thinking", default_thinking)
+            return self.processor.text2ids(request["prompt"], images, videos)
+
         elif request.get("messages"):
             messages = request["messages"]
             self._check_mm_limits(messages)
@@ -538,58 +387,35 @@ def _process_request_ernie4_5_vl(self, request, max_model_len):
                             request[k] = v
                 else:
                     raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-            request.setdefault("enable_thinking", True)
-            outputs = self.processor.request2ids(request)
+            request.setdefault("enable_thinking", default_thinking)
+            return self.processor.request2ids(request)
+
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
 
-        if request.get("completion_token_ids"):
-            self.append_completion_tokens(outputs, request["completion_token_ids"])
-
-        outputs = self.pack_outputs(outputs)
-        request["prompt_token_ids"] = (
-            outputs["input_ids"].tolist()
-            if ("prompt_token_ids" not in request or not request["prompt_token_ids"])
-            else request["prompt_token_ids"]
-        )
-        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
-        request["multimodal_inputs"] = outputs
-
-        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
-            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
-        logits_processors_args = self._update_thinking_prompt_state(
-            request["prompt_token_ids"], request.get("logits_processors_args") or {}
-        )
-        request["logits_processors_args"] = logits_processors_args
-
-        max_tokens = max_model_len - len(request["prompt_token_ids"])
-        if request.get("max_tokens") is None:
-            request["max_tokens"] = max(1, max_tokens)
+    def _process_post_tokens(self, request, outputs):
+        """Handle post-tokenization token appending."""
+        if self.model_type == PADDLEOCR_VL:
+            metadata = request.get("metadata")
+            if metadata and metadata.get("generated_token_ids"):
+                self._append_generated_tokens_qwen(outputs, metadata["generated_token_ids"])
         else:
-            request["max_tokens"] = min(max_tokens, request["max_tokens"])
-        if request.get("reasoning_max_tokens") is None:
-            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
-
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
-            parts = request["request_id"].split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.get("n", 1)
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request["request_id"]] = model_status
-            request["enable_thinking"] = model_status == "think_start"
-        if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
-            request["top_p"] = _SAMPLING_EPS
-            request["top_k"] = 1
-        if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
-            request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
-
-        data_processor_logger.info(f"Processed request {request}")
-        return request
+            if request.get("completion_token_ids"):
+                self.append_completion_tokens(outputs, request["completion_token_ids"])
+
+    def _apply_reasoning_parser(self, request):
+        """Apply reasoning parser and update model status dict."""
+        model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+        parts = request["request_id"].split("_")
+        if len(parts) > 1:
+            real_req_id = parts[0]
+            index = int(parts[1])
+            n = request.get("n", 1)
+            for idx in range(index * n, (index + 1) * n):
+                self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
+        else:
+            self.model_status_dict[request["request_id"]] = model_status
+        request["enable_thinking"] = model_status == "think_start"
 
     def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
         """Append completion tokens to existing multimodal outputs."""

From 7646a49a5a05b058dd0f9c44257096e8c361a6c5 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Tue, 31 Mar 2026 16:24:08 +0800
Subject: [PATCH 4/8] multimodal: merge pack_outputs variants and drop _append_generated_tokens_qwen

---
 fastdeploy/input/multimodal_processor.py | 49 ++++++------------------
 1 file changed, 11 insertions(+), 38 deletions(-)

diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py
index afada3d2e3d..2b9211250b2 100644
--- a/fastdeploy/input/multimodal_processor.py
+++ b/fastdeploy/input/multimodal_processor.py
@@ -398,7 +398,7 @@ def _process_post_tokens(self, request, outputs):
         if self.model_type == PADDLEOCR_VL:
             metadata = request.get("metadata")
             if metadata and metadata.get("generated_token_ids"):
-                self._append_generated_tokens_qwen(outputs, metadata["generated_token_ids"])
+                self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
         else:
             if request.get("completion_token_ids"):
                 self.append_completion_tokens(outputs, request["completion_token_ids"])
@@ -419,12 +419,10 @@ def _apply_reasoning_parser(self, request):
 
     def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
         """Append completion tokens to existing multimodal outputs."""
-        if self.model_type in (QWEN_VL, QWEN3_VL):
-            self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
-        elif self.model_type == PADDLEOCR_VL:
-            self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
-        elif self.model_type == ERNIE4_5_VL:
+        if self.model_type == ERNIE4_5_VL:
             self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
+        else:
+            self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
 
     def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
         """Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl."""
@@ -436,10 +434,6 @@ def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids
         multimodal_inputs["position_ids"].append(pos_ids)
         multimodal_inputs["cur_position"] += num_tokens
 
-    def _append_generated_tokens_qwen(self, multimodal_inputs, generated_token_ids):
-        """Append generated tokens for paddleocr_vl (uses metadata.generated_token_ids)."""
-        self._append_completion_tokens_qwen(multimodal_inputs, generated_token_ids)
-
     def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
         """Append completion tokens for ernie4_5_vl."""
         num_tokens = len(completion_token_ids)
@@ -453,13 +447,6 @@ def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_id
 
     def pack_outputs(self, outputs):
         """Convert intermediate processing outputs to final format."""
-        if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
-            return self._pack_outputs_qwen(outputs)
-        elif self.model_type == ERNIE4_5_VL:
-            return self._pack_outputs_ernie(outputs)
-
-    def _pack_outputs_qwen(self, outputs):
-        """Pack outputs for qwen_vl / qwen3_vl / paddleocr_vl."""
         if not outputs["images"]:
             outputs["images"] = None
             outputs["grid_thw"] = None
@@ -471,29 +458,15 @@ def _pack_outputs_qwen(self, outputs):
 
         outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
         outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
-        outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
-
-        outputs["image_patch_id"] = self.processor.image_token_id
-        outputs["video_patch_id"] = self.processor.video_token_id
-        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
-
         outputs["mm_num_token_func"] = self.processor.mm_num_tokens
-        return outputs
 
-    def _pack_outputs_ernie(self, outputs):
-        """Pack outputs for ernie4_5_vl."""
-        if not outputs["images"]:
-            outputs["images"] = None
-            outputs["grid_thw"] = None
-            outputs["image_type_ids"] = None
+        if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
+            outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
+            outputs["image_patch_id"] = self.processor.image_token_id
+            outputs["video_patch_id"] = self.processor.video_token_id
+            outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
         else:
-            outputs["images"] = np.vstack(outputs["images"])
-            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])
-            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])
+            outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
+            outputs["image_patch_id"] = self.image_patch_id
 
-        outputs["image_patch_id"] = self.image_patch_id
-        outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
-        outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
-        outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
-        outputs["mm_num_token_func"] = self.processor.mm_num_tokens
         return outputs

From d9199abbe5c8cb030ae47dd4a1a16d2343b2a228 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Wed, 1 Apr 2026 15:13:03 +0800
Subject: [PATCH 5/8] fix load tokenizer

---
 .../input/image_processors/adaptive_processor.py      |  2 +-
 fastdeploy/input/multimodal_processor.py              | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/input/image_processors/adaptive_processor.py b/fastdeploy/input/image_processors/adaptive_processor.py
index 47e677e4917..e5ad4960391 100644
--- a/fastdeploy/input/image_processors/adaptive_processor.py
+++ b/fastdeploy/input/image_processors/adaptive_processor.py
@@ -64,7 +64,7 @@
     List["np.ndarray"],
     List["paddle.Tensor"],
     List[List["PIL.Image.Image"]],
-    List[List["np.ndarrray"]],
+    List[List["np.ndarray"]],
     List[List["paddle.Tensor"]],
 ]
 
diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py
index 2b9211250b2..143160b0fce 100644
--- a/fastdeploy/input/multimodal_processor.py
+++ b/fastdeploy/input/multimodal_processor.py
@@ -109,9 +109,16 @@ def __init__(
     def _load_tokenizer(self):
         """Load the appropriate tokenizer based on model_type."""
         if self.tokenizer_type == "ernie4_5":
-            from paddleformers.transformers import AutoTokenizer as PFAutoTokenizer
+            import os
 
-            tokenizer = PFAutoTokenizer.from_pretrained(self.model_name_or_path)
+            from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
+
+            vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
+            for name in vocab_file_names:
+                if os.path.exists(os.path.join(self.model_name_or_path, name)):
+                    Ernie4_5Tokenizer.resource_files_names["vocab_file"] = name
+                    break
+            tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
         else:
             from paddleformers.transformers import AutoTokenizer
 

From 70c5af31a065b2193cc1a37cc4056a2e0269478c Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Wed, 1 Apr 2026 18:01:53 +0800
Subject: [PATCH 6/8] add unit test

---
 tests/input/test_multimodal_processor.py | 1130 ++++++++++++++++++++++
 1 file changed, 1130 insertions(+)
 create mode 100644 tests/input/test_multimodal_processor.py

diff --git a/tests/input/test_multimodal_processor.py b/tests/input/test_multimodal_processor.py
new file mode 100644
index 00000000000..bfce5b302a9
--- /dev/null
+++ b/tests/input/test_multimodal_processor.py
@@ -0,0 +1,1130 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import pickle
+import unittest
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+
+from fastdeploy.input.multimodal_processor import (
+    _DEFAULT_MM_LIMITS,
+    _SAMPLING_EPS,
+    ERNIE4_5_VL,
+    PADDLEOCR_VL,
+    QWEN3_VL,
+    QWEN_VL,
+    MultiModalProcessor,
+)
+from fastdeploy.input.utils import IDS_TYPE_FLAG
+
+
+def _make_processor(model_type, **overrides):
+    """Create a MultiModalProcessor instance with __init__ bypassed.
+
+    Manually sets the minimum attributes required by the methods under test.
+    """
+    with patch.object(MultiModalProcessor, "__init__", return_value=None):
+        proc = MultiModalProcessor.__new__(MultiModalProcessor)
+    proc.model_type = model_type
+    proc.config = MagicMock()
+    proc.enable_processor_cache = False
+    proc.model_name_or_path = "/mock/model"
+    proc.tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
+    proc.limit_mm_per_prompt = dict(_DEFAULT_MM_LIMITS)
+    proc.eos_token_ids = [2]
+    proc.eos_token_id_len = 1
+    proc.pad_token_id = 0
+    proc.reasoning_parser = None
+    proc.tool_parser_obj = None
+    proc.model_status_dict = {}
+    proc.decode_status = {}
+    proc.tool_parser_dict = {}
+    proc.generation_config = MagicMock()
+    proc.generation_config.top_p = 0.7
+    proc.generation_config.temperature = 1.0
+    proc.generation_config.repetition_penalty = 1.0
+    proc.generation_config.frequency_penalty = 0.0
+    proc.generation_config.presence_penalty = 0.0
+
+    # Mock tokenizer
+    tokenizer = MagicMock()
+    tokenizer.eos_token_id = 2
+    tokenizer.eos_token = "</s>"
+    tokenizer.bos_token_id = 1
+    tokenizer.bos_token = "<s>"
+    tokenizer.pad_token_id = 0
+    tokenizer.vocab_size = 32000
+    tokenizer.chat_template = "dummy"
+    tokenizer.tokenize.return_value = ["hello"]
+    tokenizer.convert_tokens_to_ids.return_value = [100]
+    tokenizer.decode.return_value = "hello"
+    proc.tokenizer = tokenizer
+
+    # Mock processor (the internal DataProcessor)
+    processor = MagicMock()
+    processor.image_token_id = 151655
+    processor.video_token_id = 151656
+    processor.image_patch_id = 151655
+    processor.spatial_conv_size = 14
+    processor.mm_num_tokens = MagicMock(return_value=1)
+    processor._compute_text_positions.return_value = np.array([[3, 4], [3, 4], [3, 4]])
+    proc.processor = processor
+
+    # Set attributes normally set by _init_mm_config
+    if model_type in (QWEN_VL, QWEN3_VL):
+        proc.image_patch_id = processor.image_token_id
+    elif model_type == PADDLEOCR_VL:
+        proc.image_patch_id = processor.image_patch_id
+    elif model_type == ERNIE4_5_VL:
+        proc.image_patch_id = processor.image_patch_id
+        proc.spatial_conv_size = processor.spatial_conv_size
+
+    # Apply any overrides
+    for k, v in overrides.items():
+        setattr(proc, k, v)
+    return proc
+
+
+# ===================================================================
+# __init__ validation
+# ===================================================================
+class TestMultiModalProcessorInitValidation(unittest.TestCase):
+
+    def test_unsupported_model_type_raises(self):
+        """Line 86: unsupported model_type should raise ValueError."""
+        with self.assertRaises(Exception):
+            # NOTE: patch.object(..., wraps=...) only spies on __init__; nothing is stubbed, so the
+            # real __init__ runs. assertRaises(Exception) accepts any error, not just ValueError.
+            with patch.object(MultiModalProcessor, "__init__", wraps=MultiModalProcessor.__init__) as _:
+                proc = object.__new__(MultiModalProcessor)
+                # Call the real __init__ which should fail on model_type check
+                MultiModalProcessor.__init__(proc, "/mock", model_type="unsupported_type")
+
+
+# ===================================================================
+# _parse_processor_kwargs
+# ===================================================================
+class TestParseProcessorKwargs(unittest.TestCase):
+
+    def test_empty_kwargs_returns_empty(self):
+        proc = _make_processor(QWEN_VL)
+        self.assertEqual(proc._parse_processor_kwargs(None), {})
+        self.assertEqual(proc._parse_processor_kwargs({}), {})
+
+    def test_valid_qwen_kwargs(self):
+        """Lines 196, 198-204: valid kwargs for qwen model type."""
+        proc = _make_processor(QWEN_VL)
+        kwargs = {"video_max_frames": 10, "video_min_frames": 1}
+        result = proc._parse_processor_kwargs(kwargs)
+        self.assertEqual(result, kwargs)
+
+    def test_valid_ernie_kwargs(self):
+        """Lines 193-194: valid kwargs for ernie model type."""
+        proc = _make_processor(ERNIE4_5_VL)
+        kwargs = {"spatial_conv_size": 2, "temporal_conv_size": 1, "video_max_frames": 32}
+        result = proc._parse_processor_kwargs(kwargs)
+        self.assertEqual(result, kwargs)
+
+    def test_invalid_type_not_dict(self):
+        """Lines 188-189: non-dict kwargs should return empty."""
+        proc = _make_processor(QWEN_VL)
+        result = proc._parse_processor_kwargs("invalid")
+        self.assertEqual(result, {})
+
+    def test_invalid_value_type(self):
+        """Lines 199-200: wrong value type should return empty."""
+        proc = _make_processor(QWEN_VL)
+        result = proc._parse_processor_kwargs({"video_max_frames": "ten"})
+        self.assertEqual(result, {})
+
+    def test_mixed_valid_invalid_value_types(self):
+        proc = _make_processor(ERNIE4_5_VL)
+        result = proc._parse_processor_kwargs({"spatial_conv_size": 2, "image_min_pixels": "bad"})
+        self.assertEqual(result, {})
+
+    def test_unknown_keys_pass_through(self):
+        """Keys not in expected_types are not validated, just passed through."""
+        proc = _make_processor(QWEN_VL)
+        kwargs = {"unknown_key": "any_value"}
+        result = proc._parse_processor_kwargs(kwargs)
+        self.assertEqual(result, kwargs)
+
+
+# ===================================================================
+# _parse_limits
+# ===================================================================
+class TestParseLimits(unittest.TestCase):
+
+    def test_none_returns_defaults(self):
+        proc = _make_processor(QWEN_VL)
+        self.assertEqual(proc._parse_limits(None), dict(_DEFAULT_MM_LIMITS))
+
+    def test_valid_limits_merged(self):
+        """Line 219: valid limits are merged with defaults."""
+        proc = _make_processor(QWEN_VL)
+        result = proc._parse_limits({"image": 5, "video": 3})
+        self.assertEqual(result, {"image": 5, "video": 3, "audio": 1})
+
+    def test_partial_limits(self):
+        proc = _make_processor(QWEN_VL)
+        result = proc._parse_limits({"image": 10})
+        self.assertEqual(result, {"image": 10, "video": 1, "audio": 1})
+
+    def test_invalid_type_returns_defaults(self):
+        """Lines 216-217, 220-222: non-dict returns defaults."""
+        proc = _make_processor(QWEN_VL)
+        result = proc._parse_limits("invalid")
+        self.assertEqual(result, dict(_DEFAULT_MM_LIMITS))
+
+
+# ===================================================================
+# _check_mm_limits
+# ===================================================================
+class TestCheckMMLimits(unittest.TestCase):
+
+    def test_dict_input_within_limits(self):
+        """Lines 226-227: dict input within limits passes."""
+        proc = _make_processor(QWEN_VL)
+        proc.limit_mm_per_prompt = {"image": 2, "video": 1, "audio": 1}
+        mm_data = {"image": ["img1"], "video": ["vid1"]}
+        proc._check_mm_limits(mm_data)  # should not raise
+
+    def test_dict_input_exceeds_limit(self):
+        """Lines 247-251: dict input exceeding limit raises ValueError."""
+        proc = _make_processor(QWEN_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        mm_data = {"image": ["img1", "img2"]}
+        with self.assertRaises(ValueError) as ctx:
+            proc._check_mm_limits(mm_data)
+        self.assertIn("Too many image items", str(ctx.exception))
+
+    def test_messages_input_qwen_vl_accepts_url_suffix(self):
+        """Lines 229-240: messages with image_url/video_url for qwen_vl."""
+        proc = _make_processor(QWEN_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": "file://img.jpg"}},
+                    {"type": "text", "text": "describe"},
+                ],
+            }
+        ]
+        proc._check_mm_limits(messages)  # should not raise
+
+    def test_messages_input_qwen_vl_image_type(self):
+        """Line 237: the 'image' type is also accepted for url_suffix models."""
+        proc = _make_processor(QWEN_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        messages = [
+            {"role": "user", "content": [{"type": "image", "image": "data"}]},
+        ]
+        proc._check_mm_limits(messages)
+
+    def test_messages_input_qwen_vl_video_url_type(self):
+        """Lines 239-240: video_url type for qwen_vl."""
+        proc = _make_processor(QWEN_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        messages = [
+            {"role": "user", "content": [{"type": "video_url", "video_url": {"url": "file://vid.mp4"}}]},
+        ]
+        proc._check_mm_limits(messages)
+
+    def test_messages_input_ernie_only_accepts_plain_types(self):
+        """Lines 241-245: ernie4_5_vl only accepts 'image'/'video' types, not *_url."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        # image_url should NOT be counted for ernie
+        messages = [
+            {"role": "user", "content": [{"type": "image_url", "image_url": {"url": "file://img.jpg"}}]},
+        ]
+        proc._check_mm_limits(messages)  # no exception since image_url not counted
+
+    def test_messages_input_ernie_image_type(self):
+        """Lines 242-243: ernie 'image' type is counted."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": "data1"},
+                    {"type": "image", "image": "data2"},
+                ],
+            }
+        ]
+        with self.assertRaises(ValueError):
+            proc._check_mm_limits(messages)
+
+    def test_messages_input_ernie_video_type(self):
+        """Lines 244-245: ernie 'video' type is counted."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        messages = [
+            {"role": "user", "content": [{"type": "video", "video": "data"}]},
+        ]
+        proc._check_mm_limits(messages)  # within limit
+
+    def test_messages_exceed_video_limit(self):
+        """Lines 247-251: video exceeding limit raises ValueError."""
+        proc = _make_processor(QWEN_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "video_url", "video_url": {"url": "file://v1.mp4"}},
+                    {"type": "video_url", "video_url": {"url": "file://v2.mp4"}},
+                ],
+            }
+        ]
+        with self.assertRaises(ValueError) as ctx:
+            proc._check_mm_limits(messages)
+        self.assertIn("Too many video items", str(ctx.exception))
+
+    def test_messages_with_string_content_skipped(self):
+        """Messages with string content (not list) should be skipped."""
+        proc = _make_processor(QWEN_VL)
+        proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+        messages = [
+            {"role": "user", "content": "just text"},
+        ]
+        proc._check_mm_limits(messages)  # should not raise
+
+
+# ===================================================================
+# _get_processor_cache / _update_processor_cache
+# ===================================================================
+class TestProcessorCache(unittest.TestCase):
+
+    def test_get_processor_cache(self):
+        """Lines 255-260: retrieve cached results via socket."""
+        proc = _make_processor(QWEN_VL)
+        mock_socket = MagicMock()
+        mm_hashes = ["hash1", "hash2"]
+        expected_items = [{"data": "item1"}, {"data": "item2"}]
+        mock_socket.recv_multipart.return_value = [b"", pickle.dumps(expected_items)]
+
+        result = proc._get_processor_cache(mock_socket, mm_hashes)
+
+        mock_socket.send_multipart.assert_called_once()
+        self.assertEqual(result, expected_items)
+
+    def test_update_processor_cache(self):
+        """Lines 264-266: update cache via socket."""
+        proc = _make_processor(QWEN_VL)
+        mock_socket = MagicMock()
+        mm_hashes = ["hash1"]
+        mm_items = [{"data": "item1"}]
+
+        proc._update_processor_cache(mock_socket, mm_hashes, mm_items)
+
+        mock_socket.send_multipart.assert_called_once()
+        sent_data = mock_socket.send_multipart.call_args[0][0]
+        self.assertEqual(sent_data[0], b"")
+        unpacked = pickle.loads(sent_data[1])
+        self.assertEqual(unpacked, (mm_hashes, mm_items))
+
+
+# ===================================================================
+# get_mm_max_tokens_per_item
+# ===================================================================
+class TestGetMmMaxTokensPerItem(unittest.TestCase):
+
+    def test_ernie_returns_processor_result(self):
+        """Line 271: ernie delegates to processor."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.processor.get_mm_max_tokens_per_item.return_value = {"image": 512}
+        result = proc.get_mm_max_tokens_per_item(1024)
+        self.assertEqual(result, {"image": 512})
+
+    def test_non_ernie_returns_none(self):
+        """Line 272: non-ernie returns None."""
+        proc = _make_processor(QWEN_VL)
+        self.assertIsNone(proc.get_mm_max_tokens_per_item(1024))
+
+        proc2 = _make_processor(QWEN3_VL)
+        self.assertIsNone(proc2.get_mm_max_tokens_per_item(1024))
+
+
+# ===================================================================
+# _process_stop_tokens
+# ===================================================================
+class TestProcessStopTokens(unittest.TestCase):
+
+    def test_qwen3_vl_stop_handling(self):
+        """Lines 348-353: qwen3_vl uses update_stop_seq differently."""
+        proc = _make_processor(QWEN3_VL)
+        proc.update_stop_seq = MagicMock(return_value=([[100]], [1]))
+        request = {"stop": ["<stop>"]}
+        proc._process_stop_tokens(request)
+        self.assertEqual(request["stop_token_ids"], [[100]])
+        self.assertEqual(request["stop_seqs_len"], [1])
+
+    def test_qwen3_vl_no_stop(self):
+        """Lines 348-350: qwen3_vl with empty stop list."""
+        proc = _make_processor(QWEN3_VL)
+        proc.update_stop_seq = MagicMock()
+        request = {"stop": []}
+        proc._process_stop_tokens(request)
+        proc.update_stop_seq.assert_not_called()
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_non_qwen3_uses_process_stop_token_ids(self, mock_process):
+        """Lines 354-355: non-qwen3 uses process_stop_token_ids utility."""
+        proc = _make_processor(QWEN_VL)
+        proc.update_stop_seq = MagicMock()
+        request = {}
+        proc._process_stop_tokens(request)
+        mock_process.assert_called_once_with(request, proc.update_stop_seq)
+
+
+# ===================================================================
+# _process_bad_words
+# ===================================================================
+class TestProcessBadWords(unittest.TestCase):
+
+    def test_with_bad_words(self):
+        """Lines 359-363: bad_words are processed."""
+        proc = _make_processor(QWEN_VL)
+        proc.update_bad_words = MagicMock(return_value=[100, 200])
+        request = {"bad_words": ["bad", "word"], "bad_words_token_ids": [50]}
+        proc._process_bad_words(request)
+        proc.update_bad_words.assert_called_once_with(["bad", "word"], [50])
+        self.assertEqual(request["bad_words_token_ids"], [100, 200])
+
+    def test_without_bad_words(self):
+        """Line 361: no bad_words means no processing."""
+        proc = _make_processor(QWEN_VL)
+        proc.update_bad_words = MagicMock()
+        request = {}
+        proc._process_bad_words(request)
+        proc.update_bad_words.assert_not_called()
+
+
+# ===================================================================
+# _tokenize_request
+# ===================================================================
+class TestTokenizeRequest(unittest.TestCase):
+
+    def test_prompt_token_ids_qwen3_vl(self):
+        """Lines 369-374: prompt_token_ids path for qwen3_vl."""
+        proc = _make_processor(QWEN3_VL)
+        expected = {"input_ids": [1, 2, 3]}
+        proc.processor.prompt_token_ids2outputs.return_value = expected
+
+        request = {"prompt_token_ids": [1, 2, 3], "messages": [{"role": "user", "content": "hi"}]}
+        result = proc._tokenize_request(request)
+        self.assertEqual(result, expected)
+        self.assertFalse(request.get("enable_thinking", True))  # default_thinking=False for qwen3_vl
+
+    def test_prompt_token_ids_ernie(self):
+        """Lines 369-374: prompt_token_ids path for ernie."""
+        proc = _make_processor(ERNIE4_5_VL)
+        expected = {"input_ids": [1, 2, 3]}
+        proc.processor.prompt_token_ids2outputs.return_value = expected
+
+        request = {"prompt_token_ids": [1, 2, 3]}
+        result = proc._tokenize_request(request)
+        self.assertEqual(result, expected)
+        self.assertTrue(request.get("enable_thinking"))  # default_thinking=True for ernie
+
+    def test_prompt_path(self):
+        """Lines 376-384: prompt text path."""
+        proc = _make_processor(QWEN_VL)
+        expected = {"input_ids": [10, 20]}
+        proc.processor.text2ids.return_value = expected
+
+        request = {"prompt": "hello", "multimodal_data": {"image": [], "video": []}}
+        result = proc._tokenize_request(request)
+        proc.processor.text2ids.assert_called_once_with("hello", [], [])
+        self.assertEqual(result, expected)
+
+    def test_prompt_path_ernie_sets_prompt_tokens(self):
+        """Lines 381-382: ernie sets prompt_tokens from prompt."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.processor.text2ids.return_value = {"input_ids": [1]}
+
+        request = {"prompt": "test prompt"}
+        proc._tokenize_request(request)
+        self.assertEqual(request["prompt_tokens"], "test prompt")
+
+    def test_messages_path(self):
+        """Lines 386-398: messages path."""
+        proc = _make_processor(QWEN_VL)
+        expected = {"input_ids": [5, 6]}
+        proc.processor.request2ids.return_value = expected
+
+        request = {"messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]}
+        result = proc._tokenize_request(request)
+        proc.processor.request2ids.assert_called_once()
+        self.assertEqual(result, expected)
+
+    def test_messages_path_with_chat_template_kwargs(self):
+        """Lines 389-394: chat_template_kwargs are merged into request."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.request2ids.return_value = {"input_ids": [1]}
+
+        request = {
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
+            "chat_template_kwargs": {"enable_thinking": True},
+        }
+        proc._tokenize_request(request)
+        self.assertTrue(request.get("enable_thinking"))
+
+    def test_messages_path_chat_template_kwargs_no_overwrite(self):
+        """Line 393: existing request keys are not overwritten."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.request2ids.return_value = {"input_ids": [1]}
+
+        request = {
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
+            "chat_template_kwargs": {"enable_thinking": True},
+            "enable_thinking": False,
+        }
+        proc._tokenize_request(request)
+        self.assertFalse(request["enable_thinking"])
+
+    def test_messages_path_invalid_chat_template_kwargs(self):
+        """Lines 395-396: non-dict chat_template_kwargs raises."""
+        proc = _make_processor(QWEN_VL)
+        request = {
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
+            "chat_template_kwargs": "invalid",
+        }
+        with self.assertRaises(ValueError) as ctx:
+            proc._tokenize_request(request)
+        self.assertIn("must be a dict", str(ctx.exception))
+
+    def test_no_input_raises(self):
+        """Lines 400-401: no prompt/messages/prompt_token_ids raises."""
+        proc = _make_processor(QWEN_VL)
+        with self.assertRaises(ValueError) as ctx:
+            proc._tokenize_request({"request_id": "test"})
+        self.assertIn("must contain", str(ctx.exception))
+
+    def test_prompt_path_no_multimodal_data(self):
+        """Line 377: a prompt with no multimodal_data passes None for images/videos."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.text2ids.return_value = {"input_ids": [1]}
+
+        request = {"prompt": "hello"}
+        proc._tokenize_request(request)
+        proc.processor.text2ids.assert_called_once_with("hello", None, None)
+
+
+# ===================================================================
+# _process_post_tokens
+# ===================================================================
+class TestProcessPostTokens(unittest.TestCase):
+
+    def test_paddleocr_with_metadata_generated_tokens(self):
+        """Lines 405-408: paddleocr_vl appends via _append_completion_tokens_qwen."""
+        proc = _make_processor(PADDLEOCR_VL)
+        proc._append_completion_tokens_qwen = MagicMock()
+        outputs = {"input_ids": [1, 2]}
+        request = {"metadata": {"generated_token_ids": [10, 11]}}
+        proc._process_post_tokens(request, outputs)
+        proc._append_completion_tokens_qwen.assert_called_once_with(outputs, [10, 11])
+
+    def test_paddleocr_without_metadata(self):
+        """Lines 405-406: paddleocr_vl with no metadata does nothing."""
+        proc = _make_processor(PADDLEOCR_VL)
+        proc._append_completion_tokens_qwen = MagicMock()
+        outputs = {"input_ids": [1]}
+        proc._process_post_tokens({}, outputs)
+        proc._append_completion_tokens_qwen.assert_not_called()
+
+    def test_non_paddleocr_with_completion_tokens(self):
+        """Lines 410-411: non-paddleocr uses append_completion_tokens."""
+        proc = _make_processor(QWEN_VL)
+        proc.append_completion_tokens = MagicMock()
+        outputs = {"input_ids": [1]}
+        request = {"completion_token_ids": [5, 6]}
+        proc._process_post_tokens(request, outputs)
+        proc.append_completion_tokens.assert_called_once_with(outputs, [5, 6])
+
+    def test_non_paddleocr_without_completion_tokens(self):
+        """Line 410: no completion_token_ids means nothing is appended."""
+        proc = _make_processor(QWEN_VL)
+        proc.append_completion_tokens = MagicMock()
+        outputs = {"input_ids": [1]}
+        proc._process_post_tokens({}, outputs)
+        proc.append_completion_tokens.assert_not_called()
+
+
+# ===================================================================
+# _apply_reasoning_parser
+# ===================================================================
+class TestApplyReasoningParser(unittest.TestCase):
+
+    def test_basic_request_id(self):
+        """Lines 415-425: basic request_id (no underscore split)."""
+        proc = _make_processor(QWEN_VL)
+        proc.reasoning_parser = MagicMock()
+        proc.reasoning_parser.get_model_status.return_value = "think_start"
+        proc.model_status_dict = {}
+
+        request = {"request_id": "req1", "prompt_token_ids": [1, 2, 3]}
+        proc._apply_reasoning_parser(request)
+
+        self.assertEqual(proc.model_status_dict["req1"], "think_start")
+        self.assertTrue(request["enable_thinking"])
+
+    def test_compound_request_id(self):
+        """Lines 416-422: request_id with underscore is split."""
+        proc = _make_processor(QWEN_VL)
+        proc.reasoning_parser = MagicMock()
+        proc.reasoning_parser.get_model_status.return_value = "think_end"
+        proc.model_status_dict = {}
+
+        request = {"request_id": "req1_2", "prompt_token_ids": [1, 2], "n": 3}
+        proc._apply_reasoning_parser(request)
+
+        # index=2, n=3 → range(6, 9)
+        for idx in [6, 7, 8]:
+            self.assertEqual(proc.model_status_dict[f"req1_{idx}"], "think_end")
+        self.assertFalse(request["enable_thinking"])
+
+    def test_compound_request_id_default_n(self):
+        """Line 420: default n=1."""
+        proc = _make_processor(QWEN_VL)
+        proc.reasoning_parser = MagicMock()
+        proc.reasoning_parser.get_model_status.return_value = "think_start"
+        proc.model_status_dict = {}
+
+        request = {"request_id": "req1_0", "prompt_token_ids": [1]}
+        proc._apply_reasoning_parser(request)
+
+        self.assertIn("req1_0", proc.model_status_dict)
+        self.assertTrue(request["enable_thinking"])
+
+
+# ===================================================================
+# append_completion_tokens
+# ===================================================================
+class TestAppendCompletionTokens(unittest.TestCase):
+
+    def test_ernie_dispatches_to_ernie_method(self):
+        """Lines 429-430: ernie dispatches to _append_completion_tokens_ernie."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc._append_completion_tokens_ernie = MagicMock()
+        inputs = {"input_ids": [1]}
+        proc.append_completion_tokens(inputs, [2, 3])
+        proc._append_completion_tokens_ernie.assert_called_once_with(inputs, [2, 3])
+
+    def test_non_ernie_dispatches_to_qwen_method(self):
+        """Lines 431-432: non-ernie dispatches to _append_completion_tokens_qwen."""
+        proc = _make_processor(QWEN_VL)
+        proc._append_completion_tokens_qwen = MagicMock()
+        inputs = {"input_ids": [1]}
+        proc.append_completion_tokens(inputs, [2, 3])
+        proc._append_completion_tokens_qwen.assert_called_once_with(inputs, [2, 3])
+
+
+class TestAppendCompletionTokensQwen(unittest.TestCase):
+
+    def test_qwen_append(self):
+        """Lines 436-442: appends tokens, token_type_ids, position_ids for qwen."""
+        proc = _make_processor(QWEN_VL)
+        multimodal_inputs = {
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
+            "cur_position": 3,
+        }
+        proc._append_completion_tokens_qwen(multimodal_inputs, [4, 5])
+
+        self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 4, 5])
+        self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0])
+        self.assertEqual(multimodal_inputs["cur_position"], 5)
+        self.assertEqual(len(multimodal_inputs["position_ids"]), 2)
+
+
+class TestAppendCompletionTokensErnie(unittest.TestCase):
+
+    def test_ernie_append(self):
+        """Lines 446-453: appends tokens with IDS_TYPE_FLAG for ernie."""
+        proc = _make_processor(ERNIE4_5_VL)
+        multimodal_inputs = {
+            "input_ids": [10, 20],
+            "token_type_ids": [IDS_TYPE_FLAG["text"], IDS_TYPE_FLAG["text"]],
+            "position_ids": [[0, 0, 0], [1, 1, 1]],
+            "cur_position": 2,
+        }
+        proc._append_completion_tokens_ernie(multimodal_inputs, [30, 40, 50])
+
+        self.assertEqual(multimodal_inputs["input_ids"], [10, 20, 30, 40, 50])
+        self.assertEqual(len(multimodal_inputs["token_type_ids"]), 5)
+        self.assertTrue(all(t == IDS_TYPE_FLAG["text"] for t in multimodal_inputs["token_type_ids"]))
+        self.assertEqual(multimodal_inputs["position_ids"], [[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]])
+        self.assertEqual(multimodal_inputs["cur_position"], 5)
+
+
+# ===================================================================
+# pack_outputs
+# ===================================================================
+class TestPackOutputs(unittest.TestCase):
+
+    def test_qwen_with_images(self):
+        """Lines 457-474: qwen pack_outputs with image data."""
+        proc = _make_processor(QWEN_VL)
+        outputs = {
+            "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])],
+            "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])],
+            "image_type_ids": [0, 1],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
+        }
+        result = proc.pack_outputs(outputs)
+
+        self.assertIsNotNone(result["images"])
+        self.assertEqual(result["images"].shape[0], 4)
+        self.assertIsNotNone(result["grid_thw"])
+        self.assertEqual(result["input_ids"].dtype, np.int64)
+        self.assertEqual(result["token_type_ids"].dtype, np.int64)
+        self.assertEqual(result["position_ids"].dtype, np.int64)
+        self.assertEqual(result["image_patch_id"], proc.processor.image_token_id)
+        self.assertEqual(result["video_patch_id"], proc.processor.video_token_id)
+
+    def test_qwen_without_images(self):
+        """Lines 457-460: empty images set to None."""
+        proc = _make_processor(QWEN_VL)
+        outputs = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2],
+            "token_type_ids": [0, 0],
+            "position_ids": [np.array([[0, 1], [0, 1], [0, 1]])],
+        }
+        result = proc.pack_outputs(outputs)
+
+        self.assertIsNone(result["images"])
+        self.assertIsNone(result["grid_thw"])
+        self.assertIsNone(result["image_type_ids"])
+
+    def test_ernie_pack_outputs(self):
+        """Lines 475-477: ernie uses different position_ids handling."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.image_patch_id = 9999
+        outputs = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2],
+            "token_type_ids": [0, 0],
+            "position_ids": [[0, 0, 0], [1, 1, 1]],
+        }
+        result = proc.pack_outputs(outputs)
+
+        self.assertIsNone(result["images"])
+        self.assertEqual(result["position_ids"].dtype, np.int64)
+        self.assertEqual(result["position_ids"].shape, (2, 3))
+        self.assertEqual(result["image_patch_id"], 9999)
+        self.assertNotIn("video_patch_id", result)
+
+    def test_paddleocr_with_images(self):
+        """Lines 470-474: paddleocr uses same path as qwen."""
+        proc = _make_processor(PADDLEOCR_VL)
+        outputs = {
+            "images": [np.array([[1, 2]])],
+            "grid_thw": [np.array([1, 1, 2])],
+            "image_type_ids": [0],
+            "input_ids": [1],
+            "token_type_ids": [0],
+            "position_ids": [np.array([[0], [0], [0]])],
+        }
+        result = proc.pack_outputs(outputs)
+
+        self.assertIsNotNone(result["images"])
+        self.assertEqual(result["image_patch_id"], proc.processor.image_token_id)
+        self.assertEqual(result["video_patch_id"], proc.processor.video_token_id)
+
+
+# ===================================================================
+# process_request_dict (integration-level tests for flow coverage)
+# ===================================================================
+class TestProcessRequestDict(unittest.TestCase):
+
+    def _make_mock_outputs(self):
+        return {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3, 4, 5],
+            "token_type_ids": [0, 0, 0, 0, 0],
+            "position_ids": [np.array([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]])],
+        }
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_qwen_vl_messages_flow(self, mock_stop):
+        """Lines 281-344: full flow for qwen_vl with messages."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.request2ids.return_value = self._make_mock_outputs()
+
+        request = {
+            "request_id": "test1",
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hello"}]}],
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertIn("prompt_token_ids", result)
+        self.assertIn("multimodal_inputs", result)
+        self.assertEqual(result["prompt_token_ids_len"], len(result["prompt_token_ids"]))
+        self.assertFalse(result.get("enable_thinking"))  # qwen_vl sets False
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_qwen3_vl_with_prompt_token_ids(self, mock_stop):
+        """Lines 306-307: qwen3_vl with existing prompt_token_ids preserved."""
+        proc = _make_processor(QWEN3_VL)
+        outputs = self._make_mock_outputs()
+        proc.processor.prompt_token_ids2outputs.return_value = outputs
+
+        request = {
+            "request_id": "test2",
+            "prompt_token_ids": [10, 20, 30],
+            "messages": [{"role": "user", "content": "hi"}],
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        # prompt_token_ids should be preserved (not overwritten)
+        self.assertEqual(result["prompt_token_ids"], [10, 20, 30])
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_ernie_flow(self, mock_stop):
+        """Lines 291-295, 316-320, 328-329, 339-341: ernie-specific branches."""
+        proc = _make_processor(ERNIE4_5_VL)
+        outputs = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
+        }
+        proc.processor.request2ids.return_value = outputs
+
+        request = {
+            "request_id": "test3",
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hello"}]}],
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertIn("prompt_token_ids", result)
+        self.assertIn("logits_processors_args", result)
+        # ernie sets default reasoning_max_tokens when None
+        self.assertIn("reasoning_max_tokens", result)
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_ernie_low_top_p(self, mock_stop):
+        """Lines 331-334: ernie with top_p below _SAMPLING_EPS."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.processor.request2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
+        }
+
+        request = {
+            "request_id": "test4",
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
+            "top_p": 0.0,
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertAlmostEqual(result["top_p"], _SAMPLING_EPS)
+        self.assertEqual(result["top_k"], 1)
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_paddleocr_low_top_p(self, mock_stop):
+        """Lines 331-334: paddleocr with top_p below _SAMPLING_EPS."""
+        proc = _make_processor(PADDLEOCR_VL)
+        proc.processor.request2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
+        }
+
+        request = {
+            "request_id": "test5",
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
+            "top_p": 0.0,
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertAlmostEqual(result["top_p"], _SAMPLING_EPS)
+        self.assertEqual(result["top_k"], 1)
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_qwen_vl_with_reasoning_parser(self, mock_stop):
+        """Lines 336-337: qwen_vl with reasoning parser (not qwen3)."""
+        proc = _make_processor(QWEN_VL)
+        mock_parser = MagicMock()
+        mock_parser.get_model_status.return_value = "think_start"
+        proc.reasoning_parser = mock_parser
+        proc.processor.request2ids.return_value = self._make_mock_outputs()
+
+        request = {
+            "request_id": "test6",
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertTrue(result["enable_thinking"])
+        self.assertIn("test6", proc.model_status_dict)
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_qwen3_skips_reasoning_parser(self, mock_stop):
+        """Line 336: qwen3_vl does NOT apply reasoning parser."""
+        proc = _make_processor(QWEN3_VL)
+        mock_parser = MagicMock()
+        proc.reasoning_parser = mock_parser
+        proc.processor.request2ids.return_value = self._make_mock_outputs()
+
+        request = {
+            "request_id": "test7",
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
+        }
+        proc.process_request_dict(request, max_model_len=100)
+
+        mock_parser.get_model_status.assert_not_called()
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_ernie_response_max_tokens_with_thinking_disabled(self, mock_stop):
+        """Lines 339-341: ernie with response_max_tokens and enable_thinking=False."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.processor.request2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
+        }
+
+        request = {
+            "request_id": "test8",
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}],
+            "response_max_tokens": 10,
+            "enable_thinking": False,
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertLessEqual(result["max_tokens"], 10)
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_prompt_truncation(self, mock_stop):
+        """Lines 313-314: prompt exceeding max_model_len is truncated."""
+        proc = _make_processor(QWEN_VL)
+        long_ids = list(range(200))
+        proc.processor.text2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": long_ids,
+            "token_type_ids": [0] * 200,
+            "position_ids": [np.array([list(range(200))] * 3)],
+        }
+
+        request = {"request_id": "test9", "prompt": "hello " * 100}
+        result = proc.process_request_dict(request, max_model_len=50)
+
+        self.assertLessEqual(len(result["prompt_token_ids"]), 49)
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_max_tokens_default(self, mock_stop):
+        """Lines 322-324: max_tokens defaults to remaining model len."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.text2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
+        }
+
+        request = {"request_id": "test10", "prompt": "hello"}
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        expected_max = 100 - len(result["prompt_token_ids"])
+        self.assertEqual(result["max_tokens"], max(1, expected_max))
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_max_tokens_capped(self, mock_stop):
+        """Lines 325-326: user max_tokens capped by remaining model len."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.text2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
+        }
+
+        request = {"request_id": "test11", "prompt": "hello", "max_tokens": 5000}
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        remaining = 100 - len(result["prompt_token_ids"])
+        self.assertEqual(result["max_tokens"], remaining)
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_paddleocr_skips_bad_words(self, mock_stop):
+        """Lines 288-289: paddleocr skips _process_bad_words."""
+        proc = _make_processor(PADDLEOCR_VL)
+        proc.update_bad_words = MagicMock()
+        proc.processor.text2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2],
+            "token_type_ids": [0, 0],
+            "position_ids": [np.array([[0, 1], [0, 1], [0, 1]])],
+        }
+
+        request = {"request_id": "test12", "prompt": "hi", "bad_words": ["test"]}
+        proc.process_request_dict(request, max_model_len=100)
+
+        proc.update_bad_words.assert_not_called()
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_eos_token_ids_not_overwritten(self, mock_stop):
+        """Lines 283-284: existing eos_token_ids preserved."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.text2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2],
+            "token_type_ids": [0, 0],
+            "position_ids": [np.array([[0, 1], [0, 1], [0, 1]])],
+        }
+
+        request = {"request_id": "test13", "prompt": "hi", "eos_token_ids": [99]}
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertEqual(result["eos_token_ids"], [99])
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_ernie_reasoning_max_tokens_default(self, mock_stop):
+        """Lines 328-329: ernie sets default reasoning_max_tokens."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.processor.request2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
+        }
+
+        request = {
+            "request_id": "test14",
+            "messages": [{"role": "user", "content": [{"type": "text", "text": "hello"}]}],
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertIn("reasoning_max_tokens", result)
+        self.assertEqual(result["reasoning_max_tokens"], max(int(result["max_tokens"] * 0.8), 1))
+
+    @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids")
+    def test_prompt_path_flow(self, mock_stop):
+        """Lines 297-299, 304-310: prompt path flow."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.text2ids.return_value = {
+            "images": [],
+            "grid_thw": [],
+            "image_type_ids": [],
+            "input_ids": [1, 2, 3],
+            "token_type_ids": [0, 0, 0],
+            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
+        }
+
+        request = {
+            "request_id": "test15",
+            "prompt": "hello world",
+        }
+        result = proc.process_request_dict(request, max_model_len=100)
+
+        self.assertEqual(result["prompt_token_ids"], [1, 2, 3])
+        self.assertIn("multimodal_inputs", result)
+
+
+# ===================================================================
+# _init_mm_config (via _make_processor + direct attribute check)
+# ===================================================================
+class TestInitMmConfig(unittest.TestCase):
+
+    def test_qwen_vl_sets_image_patch_id(self):
+        """Lines 174-175: qwen_vl/qwen3_vl sets image_patch_id from image_token_id."""
+        proc = _make_processor(QWEN_VL)
+        proc.processor.image_token_id = 12345
+        proc._init_mm_config()
+        self.assertEqual(proc.image_patch_id, 12345)
+
+    def test_qwen3_vl_sets_image_patch_id(self):
+        proc = _make_processor(QWEN3_VL)
+        proc.processor.image_token_id = 67890
+        proc._init_mm_config()
+        self.assertEqual(proc.image_patch_id, 67890)
+
+    def test_paddleocr_sets_image_patch_id(self):
+        """Lines 176-177: paddleocr sets image_patch_id from processor."""
+        proc = _make_processor(PADDLEOCR_VL)
+        proc.processor.image_patch_id = 11111
+        proc._init_mm_config()
+        self.assertEqual(proc.image_patch_id, 11111)
+
+    def test_ernie_sets_image_patch_id_and_spatial_conv(self):
+        """Lines 178-180: ernie sets image_patch_id and spatial_conv_size."""
+        proc = _make_processor(ERNIE4_5_VL)
+        proc.processor.image_patch_id = 22222
+        proc.processor.spatial_conv_size = 14
+        proc._init_mm_config()
+        self.assertEqual(proc.image_patch_id, 22222)
+        self.assertEqual(proc.spatial_conv_size, 14)
+
+
+# ===================================================================
+# _load_tokenizer (just the branch coverage, actual loading is mocked)
+# ===================================================================
+class TestLoadTokenizer(unittest.TestCase):
+
+    @patch("fastdeploy.input.multimodal_processor.MultiModalProcessor.__init__", return_value=None)
+    def test_auto_tokenizer_path(self, mock_init):
+        """Lines 123-125: non-ernie path loads AutoTokenizer."""
+        proc = MultiModalProcessor.__new__(MultiModalProcessor)
+        proc.model_name_or_path = "/mock/model"
+        proc.tokenizer_type = "auto"
+
+        with patch("fastdeploy.input.multimodal_processor.MultiModalProcessor._load_tokenizer") as mock_load:
+            mock_load.return_value = MagicMock()
+            mock_load.assert_called_once()
+
+
+if __name__ == "__main__":
+    unittest.main()

From d1663a316044b048d5d1be8ab0abfa4d0e20b5f4 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Wed, 1 Apr 2026 19:41:44 +0800
Subject: [PATCH 7/8] Fix _load_tokenizer unit test; return both image and video data from AdaptiveImageProcessor.preprocess

---
 .../image_processors/adaptive_processor.py    | 15 ++++++--------
 tests/input/test_multimodal_processor.py      | 20 ++++++++++---------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/fastdeploy/input/image_processors/adaptive_processor.py b/fastdeploy/input/image_processors/adaptive_processor.py
index e5ad4960391..4a5539bec57 100644
--- a/fastdeploy/input/image_processors/adaptive_processor.py
+++ b/fastdeploy/input/image_processors/adaptive_processor.py
@@ -454,6 +454,8 @@ def preprocess(
         if images is not None and not valid_images(images):
             raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
 
+        data = {}
+
         if images is not None:
             pixel_values, vision_grid_thws = [], []
             for img_idx, image in enumerate(images):
@@ -479,10 +481,8 @@ def preprocess(
                 vision_grid_thws.append(image_grid_thw)
             pixel_values = np.array(pixel_values)
             vision_grid_thws = np.array(vision_grid_thws)
-            data = {
-                "pixel_values": pixel_values,
-                "image_grid_thw": vision_grid_thws,
-            }
+            data["pixel_values"] = pixel_values
+            data["image_grid_thw"] = vision_grid_thws
 
         if videos is not None:
             pixel_values, vision_grid_thws = [], []
@@ -505,11 +505,8 @@ def preprocess(
                 vision_grid_thws.append(video_grid_thw)
             pixel_values = np.array(pixel_values)
             vision_grid_thws = np.array(vision_grid_thws)
-
-            data = {
-                "pixel_values_videos": pixel_values,
-                "video_grid_thw": vision_grid_thws,
-            }
+            data["pixel_values_videos"] = pixel_values
+            data["video_grid_thw"] = vision_grid_thws
 
         return BatchFeature(data=data, tensor_type=return_tensors)
 
diff --git a/tests/input/test_multimodal_processor.py b/tests/input/test_multimodal_processor.py
index bfce5b302a9..8c0aabda1a6 100644
--- a/tests/input/test_multimodal_processor.py
+++ b/tests/input/test_multimodal_processor.py
@@ -1114,16 +1114,18 @@ def test_ernie_sets_image_patch_id_and_spatial_conv(self):
 # ===================================================================
 class TestLoadTokenizer(unittest.TestCase):
 
-    @patch("fastdeploy.input.multimodal_processor.MultiModalProcessor.__init__", return_value=None)
-    def test_auto_tokenizer_path(self, mock_init):
-        """Lines 123-125: non-ernie path loads AutoTokenizer."""
-        proc = MultiModalProcessor.__new__(MultiModalProcessor)
-        proc.model_name_or_path = "/mock/model"
-        proc.tokenizer_type = "auto"
+    def test_auto_tokenizer_path(self):
+        """Lines 123-125: non-ernie path loads AutoTokenizer via paddleformers."""
+        proc = _make_processor(QWEN_VL)
+        mock_tokenizer = MagicMock()
+        mock_auto_tokenizer = MagicMock()
+        mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer
+
+        with patch.dict("sys.modules", {"paddleformers.transformers": MagicMock(AutoTokenizer=mock_auto_tokenizer)}):
+            result = proc._load_tokenizer()
 
-        with patch("fastdeploy.input.multimodal_processor.MultiModalProcessor._load_tokenizer") as mock_load:
-            mock_load.return_value = MagicMock()
-            mock_load.assert_called_once()
+        mock_auto_tokenizer.from_pretrained.assert_called_once_with("/mock/model", padding_side="left", use_fast=True)
+        self.assertEqual(result, mock_tokenizer)
 
 
 if __name__ == "__main__":

From 47658fe3c7bc771d0c4ee5ce80bf99d2665f99e4 Mon Sep 17 00:00:00 2001
From: luukunn <981429396@qq.com>
Date: Wed, 1 Apr 2026 21:17:49 +0800
Subject: [PATCH 8/8] Delete unused cache helpers; count image_url/video_url parts for all model types in _check_mm_limits

---
 .../image_processors/paddleocr_processor.py   |  8 ++--
 fastdeploy/input/multimodal_processor.py      | 34 ++------------
 tests/input/test_multimodal_processor.py      | 45 ++-----------------
 3 files changed, 10 insertions(+), 77 deletions(-)

diff --git a/fastdeploy/input/image_processors/paddleocr_processor.py b/fastdeploy/input/image_processors/paddleocr_processor.py
index a28f03075df..8c304defeb0 100644
--- a/fastdeploy/input/image_processors/paddleocr_processor.py
+++ b/fastdeploy/input/image_processors/paddleocr_processor.py
@@ -38,16 +38,14 @@
 _OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
 
 
-def make_batched_images(images) -> List[List[ImageInput]]:
+def make_batched_images(images) -> List[ImageInput]:
     """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
+    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.
     Args:
         images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
             The input image.
-
     Returns:
-        list: A list of images.
+        List[ImageInput]: A flat list of images.
     """
     if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
         return [img for img_list in images for img in img_list]
diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py
index 143160b0fce..3284d19770a 100644
--- a/fastdeploy/input/multimodal_processor.py
+++ b/fastdeploy/input/multimodal_processor.py
@@ -21,7 +21,6 @@
 single class that dispatches per ``model_type``.
 """
 
-import pickle
 from collections.abc import Mapping
 from typing import Any, Dict, Optional
 
@@ -57,8 +56,6 @@
     "video_fps": int,
 }
 
-_TYPES_ACCEPT_URL_SUFFIX = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL}
-
 _DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
 
 _SAMPLING_EPS = 1e-5
@@ -227,22 +224,14 @@ def _check_mm_limits(self, item):
             mm_data = item
         else:
             mm_data = {"image": [], "video": []}
-            accept_url_suffix = self.model_type in _TYPES_ACCEPT_URL_SUFFIX
-
             for message in item:
                 if isinstance(message.get("content"), list):
                     for part in message["content"]:
                         part_type = part.get("type")
-                        if accept_url_suffix:
-                            if part_type in ("image_url", "image"):
-                                mm_data["image"].append(part)
-                            elif part_type in ("video_url", "video"):
-                                mm_data["video"].append(part)
-                        else:
-                            if part_type == "image":
-                                mm_data["image"].append(part)
-                            elif part_type == "video":
-                                mm_data["video"].append(part)
+                        if part_type in ("image_url", "image"):
+                            mm_data["image"].append(part)
+                        elif part_type in ("video_url", "video"):
+                            mm_data["video"].append(part)
 
         for modality, data in mm_data.items():
             if modality in self.limit_mm_per_prompt:
@@ -250,21 +239,6 @@ def _check_mm_limits(self, item):
                 if len(data) > limit:
                     raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
 
-    def _get_processor_cache(self, socket, mm_hashes: list) -> list:
-        """Retrieve cached processor results for the given hashes."""
-        req = pickle.dumps(mm_hashes)
-        socket.send_multipart([b"", req])
-        _, resp = socket.recv_multipart()
-        mm_items = pickle.loads(resp)
-        data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-        return mm_items
-
-    def _update_processor_cache(self, socket, mm_hashes: list, mm_items):
-        """Update the processor cache with new results."""
-        req = pickle.dumps((mm_hashes, mm_items))
-        socket.send_multipart([b"", req])
-        data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
-
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
         """Return per-modality max token counts, if available."""
         if self.model_type == ERNIE4_5_VL:
diff --git a/tests/input/test_multimodal_processor.py b/tests/input/test_multimodal_processor.py
index 8c0aabda1a6..5f0e781b6c5 100644
--- a/tests/input/test_multimodal_processor.py
+++ b/tests/input/test_multimodal_processor.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """
 
-import pickle
 import unittest
 from unittest.mock import MagicMock, patch
 
@@ -106,13 +105,9 @@ class TestMultiModalProcessorInitValidation(unittest.TestCase):
 
     def test_unsupported_model_type_raises(self):
         """Line 86: unsupported model_type should raise ValueError."""
-        with self.assertRaises(Exception):
-            # We need to let __init__ run the model_type check.
-            # Mock the parts that come after the check to isolate it.
-            with patch.object(MultiModalProcessor, "__init__", wraps=MultiModalProcessor.__init__) as _:
-                proc = object.__new__(MultiModalProcessor)
-                # Call the real __init__ which should fail on model_type check
-                MultiModalProcessor.__init__(proc, "/mock", model_type="unsupported_type")
+        with self.assertRaises(ValueError):
+            # Directly construct with unsupported model_type to trigger validation
+            MultiModalProcessor("/mock", model_type="unsupported_type")
 
 
 # ===================================================================
@@ -307,40 +302,6 @@ def test_messages_with_string_content_skipped(self):
         proc._check_mm_limits(messages)  # should not raise
 
 
-# ===================================================================
-# _get_processor_cache / _update_processor_cache
-# ===================================================================
-class TestProcessorCache(unittest.TestCase):
-
-    def test_get_processor_cache(self):
-        """Lines 255-260: retrieve cached results via socket."""
-        proc = _make_processor(QWEN_VL)
-        mock_socket = MagicMock()
-        mm_hashes = ["hash1", "hash2"]
-        expected_items = [{"data": "item1"}, {"data": "item2"}]
-        mock_socket.recv_multipart.return_value = [b"", pickle.dumps(expected_items)]
-
-        result = proc._get_processor_cache(mock_socket, mm_hashes)
-
-        mock_socket.send_multipart.assert_called_once()
-        self.assertEqual(result, expected_items)
-
-    def test_update_processor_cache(self):
-        """Lines 264-266: update cache via socket."""
-        proc = _make_processor(QWEN_VL)
-        mock_socket = MagicMock()
-        mm_hashes = ["hash1"]
-        mm_items = [{"data": "item1"}]
-
-        proc._update_processor_cache(mock_socket, mm_hashes, mm_items)
-
-        mock_socket.send_multipart.assert_called_once()
-        sent_data = mock_socket.send_multipart.call_args[0][0]
-        self.assertEqual(sent_data[0], b"")
-        unpacked = pickle.loads(sent_data[1])
-        self.assertEqual(unpacked, (mm_hashes, mm_items))
-
-
 # ===================================================================
 # get_mm_max_tokens_per_item
 # ===================================================================