From e91968d1985cafbe40802bd46c67f3de76769440 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 30 Mar 2026 18:36:10 +0800 Subject: [PATCH 1/8] first commit --- .../image_preprocessor/__init__.py | 10 +- .../get_image_preprocessor.py | 23 +- .../image_preprocessor_adaptive.py | 502 +---------------- fastdeploy/input/image_processors/__init__.py | 14 + .../image_processors/adaptive_processor.py | 527 ++++++++++++++++++ .../image_processors/paddleocr_processor.py | 227 ++++++++ .../input/image_processors/qwen3_processor.py | 333 +++++++++++ .../input/image_processors/qwen_processor.py | 332 +++++++++++ .../paddleocr_vl_processor/image_processor.py | 220 +------- .../qwen3_vl_processor/image_processor.py | 320 +---------- .../qwen_vl_processor/image_processor.py | 319 +---------- 11 files changed, 1473 insertions(+), 1354 deletions(-) create mode 100644 fastdeploy/input/image_processors/adaptive_processor.py create mode 100644 fastdeploy/input/image_processors/paddleocr_processor.py create mode 100644 fastdeploy/input/image_processors/qwen3_processor.py create mode 100644 fastdeploy/input/image_processors/qwen_processor.py diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py index c11444e6758..ec3d3e833c0 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py +++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/__init__.py @@ -14,7 +14,13 @@ # limitations under the License. """ -from .get_image_preprocessor import get_image_preprocessor -from .image_preprocessor_adaptive import AdaptiveImageProcessor +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.adaptive_processor +# This file will be removed in a future version. 
+ +from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401 + AdaptiveImageProcessor, + get_image_preprocessor, +) __all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"] diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py index 0ff6f7d1ed5..ead34a0ce0b 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py @@ -14,21 +14,10 @@ # limitations under the License. """ -"""get image preprocessor""" +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.adaptive_processor +# This file will be removed in a future version. -from fastdeploy.utils import data_processor_logger - -from .image_preprocessor_adaptive import AdaptiveImageProcessor - - -def get_image_preprocessor(args): - """ - get_image_preprocessor from args - """ - - if args.vision_model_name_or_path is None: - return None - - data_processor_logger.info("use AdaptiveImageProcessor") - image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path) - return image_preprocess +from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401 + get_image_preprocessor, +) diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py index cd81274654e..deaa5494c12 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py +++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py @@ -14,498 +14,12 @@ # limitations under the License. 
""" -"""image preprocessor adaptive""" - -from typing import List, Optional, Union - -import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - convert_to_rgb, - normalize, - rescale, - resize, - to_channel_dimension_format, +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.adaptive_processor +# This file will be removed in a future version. + +from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401 + AdaptiveImageProcessor, + make_batched_images, + make_batched_videos, ) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_valid_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.input.image_processors.common import is_scaled_image -from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize -from fastdeploy.utils import data_processor_logger - -OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - -IMAGE_FACTOR = 28 -MIN_PIXELS = 4 * 28 * 28 -MAX_PIXELS = 16384 * 28 * 28 -MAX_RATIO = 200 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarrray"]], - List[List["paddle.Tensor"]], -] - - -__all__ = [ - "AdaptiveImageProcessor", -] - - -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. 
- images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched images from {images}") - - -# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos -def make_batched_videos(videos) -> List[VideoInput]: - """dummy""" - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - -class AdaptiveImageProcessor(BaseImageProcessor): - r""" - Constructs a adaptive image processor that dynamically resizes images based on the original images. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. 
- image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): - Mean to use if normalizing the image. This is a float or list of floats for each channel in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): - Standard deviation to use if normalizing the image. This is a float or list of floats for each channel - in the image. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Whether to convert the image to RGB. - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spacial patch size of the vision encoder. - temporal_conv_size (`int`, *optional*, defaults to 2): - The temporal conv size in resampler. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - ] - - def __init__( - self, - do_resize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - min_pixels: int = 56 * 56, - max_pixels: int = 28 * 28 * 1280, - patch_size: int = 14, - temporal_conv_size: int = 2, - merge_size: int = 2, - **kwargs, - ) -> None: - """init""" - super().__init__(**kwargs) - self.do_resize = do_resize - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - self.temporal_conv_size = temporal_conv_size - self.merge_size = merge_size - self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} - self.do_convert_rgb = do_convert_rgb - - def set_pixels(self, min_pixels=None, max_pixels=None, msg=""): - """设定pixels""" - if min_pixels is not None: - assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int" - data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}") - self.min_pixels = min_pixels - self.size["min_pixels"] = int(min_pixels) - if max_pixels is not None: - assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int" - data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}") - self.max_pixels = max_pixels - self.size["max_pixels"] = int(max_pixels) - - def get_smarted_resize(self, height, width, min_pixels=None, 
max_pixels=None): - """dummy""" - actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels - actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=actual_min_pixels, - max_pixels=actual_max_pixels, - ) - return (resized_height, resized_width), ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - do_resize: bool = True, - resample: PILImageResampling = None, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = False, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - predetermined_grid_thw=None, - ): - """ - Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. - If pixel values range from 0 to 1, set `do_rescale=False`. - vision_info (`List[Dict]`, *optional*): - Optional list of dictionaries containing additional information about vision inputs. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - resample (`PILImageResampling`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Scale factor to use if rescaling the image. 
- do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Mean to use if normalizing the image. - Can be a float or a list of floats corresponding to the number of channels in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Standard deviation to use if normalizing the image. - Can be a float or a list of floats corresponding to the number of channels in the image. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - images = make_list_of_images(images) - - if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. 
If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = height, width - processed_images = [] - - if predetermined_grid_thw is not None: - assert len(predetermined_grid_thw) == len( - images - ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}" - - for img_idx, image in enumerate(images): - if do_resize: - if predetermined_grid_thw is not None: - (resized_height, resized_width) = predetermined_grid_thw[img_idx] - resized_height *= self.patch_size - resized_width *= self.patch_size - else: - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, - ) - image = image.astype("uint8") # TODO : 需要手动加上,否则多除255 导致结果会出错 - # 直接fromarray,不要靠paddleformers里面的 - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - if do_rescale: - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - - processed_images.append(image) - patches = np.array(processed_images) - if data_format == ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) - - channel = patches.shape[1] # [time, C, H, W] - grid_t = patches.shape[0] - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - patches = 
patches.reshape( - [ - grid_t, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz] - patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.patch_size * self.patch_size, - ] - ) # [grid_t * grid_h * grid_w, C * psz * psz] - - return flatten_patches, (grid_t, grid_h, grid_w) - - def preprocess( - self, - images: ImageInput, - videos: VideoInput = None, - do_resize: bool = True, - size: Optional[Union[int, List[int]]] = None, - resample: PILImageResampling = None, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = False, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - predetermined_grid_thw=None, - ): - """ - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - videos (`VideoInput`): - Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If - passing in videos with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. 
- resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. 
Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - - """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - resample = resample if resample is not None else self.resample - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - if images is not None: - images = make_batched_images(images) - if videos is not None: - videos = make_batched_videos(videos) - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - if images is not None: - pixel_values, vision_grid_thws = [], [] - for img_idx, image in enumerate(images): - if predetermined_grid_thw is not None: - predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]] - else: - predetermined_grid_thw_one = None - patches, image_grid_thw = self._preprocess( - image, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - predetermined_grid_thw=predetermined_grid_thw_one, - ) - pixel_values.extend(patches) - vision_grid_thws.append(image_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - data = { - "pixel_values": pixel_values, - "image_grid_thw": vision_grid_thws, - } - - if videos is not None: - pixel_values, vision_grid_thws = [], [] - for images in videos: - patches, video_grid_thw = self._preprocess( - images, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - predetermined_grid_thw=predetermined_grid_thw, - ) - pixel_values.extend(patches) - vision_grid_thws.append(video_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - - data = { - "pixel_values_videos": pixel_values, - "video_grid_thw": vision_grid_thws, - } - - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/image_processors/__init__.py b/fastdeploy/input/image_processors/__init__.py index a9cc79cc9d7..0f5df8e741b 100644 --- a/fastdeploy/input/image_processors/__init__.py +++ 
b/fastdeploy/input/image_processors/__init__.py @@ -11,3 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401 + AdaptiveImageProcessor, + get_image_preprocessor, +) +from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401 + ImageProcessor as PaddleOCRImageProcessor, +) +from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401 + ImageProcessor as Qwen3ImageProcessor, +) +from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401 + ImageProcessor as QwenImageProcessor, +) diff --git a/fastdeploy/input/image_processors/adaptive_processor.py b/fastdeploy/input/image_processors/adaptive_processor.py new file mode 100644 index 00000000000..47e677e4917 --- /dev/null +++ b/fastdeploy/input/image_processors/adaptive_processor.py @@ -0,0 +1,527 @@ +""" +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +"""image preprocessor adaptive""" + +from typing import List, Optional, Union + +import numpy as np +import paddle +import PIL +from paddleformers.transformers.feature_extraction_utils import BatchFeature +from paddleformers.transformers.image_processing_utils import BaseImageProcessor +from paddleformers.transformers.image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from paddleformers.transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_valid_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType +from PIL import Image + +from fastdeploy.input.image_processors.common import is_scaled_image +from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize +from fastdeploy.utils import data_processor_logger + +OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] +OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] + +IMAGE_FACTOR = 28 +MIN_PIXELS = 4 * 28 * 28 +MAX_PIXELS = 16384 * 28 * 28 +MAX_RATIO = 200 + + +VideoInput = Union[ + List["PIL.Image.Image"], + "np.ndarray", + "paddle.Tensor", + List["np.ndarray"], + List["paddle.Tensor"], + List[List["PIL.Image.Image"]], + List[List["np.ndarrray"]], + List[List["paddle.Tensor"]], +] + + +__all__ = [ + "AdaptiveImageProcessor", + "get_image_preprocessor", + "make_batched_images", + "make_batched_videos", +] + + +def make_batched_images(images) -> List[List[ImageInput]]: + """ + Accepts images in list or nested list format, and makes a list of images for preprocessing. + images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): + The input image. + + Returns: + list: A list of images. 
+ """ + if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): + return [img for img_list in images for img in img_list] + + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + return images + + elif is_valid_image(images): + return [images] + + raise ValueError(f"Could not make batched images from {images}") + + +# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos +def make_batched_videos(videos) -> List[VideoInput]: + """dummy""" + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], Image.Image): + return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] + + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] + + raise ValueError(f"Could not make batched video from {videos}") + + +class AdaptiveImageProcessor(BaseImageProcessor): + r""" + Constructs a adaptive image processor that dynamically resizes images based on the original images. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. 
This is a float or list of floats for each channel in the image.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
+            in the image.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        min_pixels (`int`, *optional*, defaults to `56 * 56`):
+            The min pixels of the image to resize the image.
+        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
+            The max pixels of the image to resize the image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The spatial patch size of the vision encoder.
+        temporal_conv_size (`int`, *optional*, defaults to 2):
+            The temporal conv size in resampler.
+        merge_size (`int`, *optional*, defaults to 2):
+            The merge size of the vision encoder to llm encoder.
+    """
+
+    model_input_names = [
+        "pixel_values",
+        "image_grid_thw",
+        "pixel_values_videos",
+        "video_grid_thw",
+    ]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        min_pixels: int = 56 * 56,
+        max_pixels: int = 28 * 28 * 1280,
+        patch_size: int = 14,
+        temporal_conv_size: int = 2,
+        merge_size: int = 2,
+        **kwargs,
+    ) -> None:
+        """
+        Store the preprocessing configuration on the instance.
+
+        `image_mean` / `image_std` fall back to `OPENAI_CLIP_MEAN` / `OPENAI_CLIP_STD` when `None`.
+        Any extra keyword arguments are forwarded to the base class constructor.
+        """
+        super().__init__(**kwargs)
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.patch_size = patch_size
+        self.temporal_conv_size = temporal_conv_size
+        self.merge_size = merge_size
+        # Mirror of min_pixels / max_pixels; kept in sync by `set_pixels`.
+        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
+        self.do_convert_rgb = do_convert_rgb
+
+    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
+        """
+        Update the pixel budget used when resizing.
+
+        Args:
+            min_pixels (`int`, *optional*): New lower bound; left unchanged when `None`.
+            max_pixels (`int`, *optional*): New upper bound; left unchanged when `None`.
+            msg (`str`, *optional*): Prefix added to the log line describing the change.
+
+        Raises:
+            AssertionError: If a supplied bound is not an `int` in the accepted range.
+        """
+        if min_pixels is not None:
+            # NOTE(review): the check accepts 0 although the message says "positive" -- confirm which is intended.
+            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
+            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
+            self.min_pixels = min_pixels
+            self.size["min_pixels"] = int(min_pixels)
+        if max_pixels is not None:
+            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
+            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
+            self.max_pixels = max_pixels
+            self.size["max_pixels"] = int(max_pixels)
+
+    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
+        """
+        Compute the resized dimensions and the matching patch grid for an image.
+
+        Args:
+            height: Original image height.
+            width: Original image width.
+            min_pixels (*optional*): Lower pixel bound; defaults to `self.min_pixels`.
+            max_pixels (*optional*): Upper pixel bound; defaults to `self.max_pixels`.
+
+        Returns:
+            `((resized_height, resized_width), (grid_h, grid_w))`, where the grid is the resized size
+            divided by `self.patch_size`.
+        """
+        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=self.patch_size * self.merge_size,
+            min_pixels=actual_min_pixels,
+            max_pixels=actual_max_pixels,
+        )
+        return (resized_height, resized_width), (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+
+    def _preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        do_resize: bool = True,
+        resample: PILImageResampling = None,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = False,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        predetermined_grid_thw=None,
+    ):
+        """
+        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
+
+        The original size is read from the first image only, so all images in one call are resized
+        from that same (height, width).
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
+                If pixel values range from 0 to 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `True`):
+                Whether to resize the image.
+            resample (`PILImageResampling`, *optional*):
+                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+            do_rescale (`bool`, *optional*, defaults to `True`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `1 / 255`):
+                Scale factor to use if rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to `True`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*):
+                Mean to use if normalizing the image.
+                Can be a float or a list of floats corresponding to the number of channels in the image.
+            image_std (`float` or `List[float]`, *optional*):
+                Standard deviation to use if normalizing the image.
+                Can be a float or a list of floats corresponding to the number of channels in the image.
+            do_convert_rgb (`bool`, *optional*, defaults to `False`):
+                Whether to convert the image to RGB.
+            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. Inferred from the first image when `None`.
+                Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            predetermined_grid_thw (*optional*):
+                One `(grid_h, grid_w)` pair per image. When given and `do_resize` is true, each image is
+                resized to `grid * patch_size` instead of the size chosen by `smart_resize`.
+
+        Returns:
+            `(flatten_patches, (grid_t, grid_h, grid_w))`, where `flatten_patches` has shape
+            `[grid_t * grid_h * grid_w, C * patch_size * patch_size]`.
+        """
+        images = make_list_of_images(images)
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            data_processor_logger.warning(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = height, width
+        processed_images = []
+
+        if predetermined_grid_thw is not None:
+            assert len(predetermined_grid_thw) == len(
+                images
+            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
+
+        for img_idx, image in enumerate(images):
+            if do_resize:
+                if predetermined_grid_thw is not None:
+                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
+                    resized_height *= self.patch_size
+                    resized_width *= self.patch_size
+                else:
+                    resized_height, resized_width = smart_resize(
+                        height,
+                        width,
+                        factor=self.patch_size * self.merge_size,
+                        min_pixels=self.min_pixels,
+                        max_pixels=self.max_pixels,
+                    )
+                # TODO: the uint8 cast must be done by hand; otherwise the data is divided by 255
+                # one extra time and the result is wrong.
+                image = image.astype("uint8")
+                # Build the PIL image directly with fromarray instead of relying on the
+                # conversion inside paddleformers.
+                image = Image.fromarray(image)
+                image = resize(
+                    image,
+                    size=(resized_height, resized_width),
+                    resample=resample,
+                    data_format=input_data_format,
+                )
+            if do_rescale:
+                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
+
+            if do_normalize:
+                image = normalize(
+                    image=image,
+                    mean=image_mean,
+                    std=image_std,
+                    data_format=input_data_format,
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
+
+            processed_images.append(image)
+        patches = np.array(processed_images)
+        if data_format == ChannelDimension.LAST:
+            patches = patches.transpose([0, 3, 1, 2])
+
+        channel = patches.shape[1]  # [time, C, H, W]
+        grid_t = patches.shape[0]
+        # The grid is derived from the size of the last image processed in the loop above.
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+        patches = patches.reshape(
+            [
+                grid_t,
+                channel,
+                grid_h // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+                grid_w // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+            ]
+        )
+        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
+        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
+
+        flatten_patches = patches.reshape(
+            [
+                grid_t * grid_h * grid_w,
+                channel * self.patch_size * self.patch_size,
+            ]
+        )  # [grid_t * grid_h * grid_w, C * psz * psz]
+
+        return flatten_patches, (grid_t, grid_h, grid_w)
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        videos: VideoInput = None,
+        do_resize: bool = True,
+        size: Optional[Union[int, List[int]]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: bool = True,
+        rescale_factor: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = False,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        predetermined_grid_thw=None,
+    ):
+        """
+        Preprocess images and/or videos into flattened patches plus their grid sizes.
+
+        A boolean or numeric option only falls back to the instance attribute when the caller passes
+        `None` explicitly, because the signature defaults are concrete values.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            videos (`VideoInput`):
+                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
+                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `True`; `None` selects `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Accepted for interface compatibility; the value is not used by this method.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `True`; `None` selects `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `1 / 255`; `None` selects `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `True`; `None` selects `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `False`; `None` selects `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            predetermined_grid_thw (*optional*):
+                Pre-computed grid sizes. For images, entry `i` is applied to image `i`; for videos the
+                whole value is passed through for every video.
+
+        Returns:
+            `BatchFeature` holding `pixel_values` / `image_grid_thw` when `images` is given and
+            `pixel_values_videos` / `video_grid_thw` when `videos` is given. Both groups are present
+            when both inputs are supplied.
+
+        Raises:
+            ValueError: If `images` is given but contains an unsupported image type.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        if images is not None:
+            images = make_batched_images(images)
+        if videos is not None:
+            videos = make_batched_videos(videos)
+
+        if images is not None and not valid_images(images):
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
+
+        # Accumulate into one dict so image features survive when videos are also supplied,
+        # and so the return statement never sees an unbound name.
+        data = {}
+
+        if images is not None:
+            pixel_values, vision_grid_thws = [], []
+            for img_idx, image in enumerate(images):
+                if predetermined_grid_thw is not None:
+                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
+                else:
+                    predetermined_grid_thw_one = None
+                patches, image_grid_thw = self._preprocess(
+                    image,
+                    do_resize=do_resize,
+                    resample=resample,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    data_format=data_format,
+                    do_convert_rgb=do_convert_rgb,
+                    input_data_format=input_data_format,
+                    predetermined_grid_thw=predetermined_grid_thw_one,
+                )
+                pixel_values.extend(patches)
+                vision_grid_thws.append(image_grid_thw)
+            data["pixel_values"] = np.array(pixel_values)
+            data["image_grid_thw"] = np.array(vision_grid_thws)
+
+        if videos is not None:
+            pixel_values, vision_grid_thws = [], []
+            # `frames` rather than `images`, so the argument is not shadowed.
+            for frames in videos:
+                patches, video_grid_thw = self._preprocess(
+                    frames,
+                    do_resize=do_resize,
+                    resample=resample,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    data_format=data_format,
+                    do_convert_rgb=do_convert_rgb,
+                    input_data_format=input_data_format,
+                    predetermined_grid_thw=predetermined_grid_thw,
+                )
+                pixel_values.extend(patches)
+                vision_grid_thws.append(video_grid_thw)
+            data["pixel_values_videos"] = np.array(pixel_values)
+            data["video_grid_thw"] = np.array(vision_grid_thws)
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+def get_image_preprocessor(args):
+    """
+    Build an `AdaptiveImageProcessor` from `args.vision_model_name_or_path`.
+
+    Returns `None` when `args.vision_model_name_or_path` is `None`.
+    """
+
+    if args.vision_model_name_or_path is None:
+        return None
+
+    data_processor_logger.info("use AdaptiveImageProcessor")
+    image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
+    return image_preprocess
diff --git a/fastdeploy/input/image_processors/paddleocr_processor.py b/fastdeploy/input/image_processors/paddleocr_processor.py
new file mode 100644
index 00000000000..a28f03075df
--- /dev/null
+++ b/fastdeploy/input/image_processors/paddleocr_processor.py
@@ -0,0 +1,227 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+"""Image processor class for PaddleOCR-VL."""
+
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+from paddleformers.transformers.feature_extraction_utils import BatchFeature
+from paddleformers.transformers.image_processing_utils import BaseImageProcessor
+from paddleformers.transformers.image_utils import (
+    ImageInput,
+    is_valid_image,
+    make_list_of_images,
+    to_numpy_array,
+)
+
+from fastdeploy.input.image_processors.common import (
+    smart_resize_paddleocr as smart_resize,
+)
+
+_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
+_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
+
+
+def make_batched_images(images) -> List[ImageInput]:
+    """
+    Accepts images in list or nested list format, and makes a list of images for preprocessing.
+
+    Args:
+        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
+            The input image.
+
+    Returns:
+        list: A flat list of images. A nested list is flattened one level.
+
+    Raises:
+        ValueError: If `images` is empty or is not a recognised image container.
+    """
+    is_sequence = isinstance(images, (list, tuple))
+    # The emptiness guards turn `[]` and `[[]]` into the documented ValueError
+    # instead of an IndexError from `images[0]` / `images[0][0]`.
+    if (
+        is_sequence
+        and images
+        and isinstance(images[0], (list, tuple))
+        and images[0]
+        and is_valid_image(images[0][0])
+    ):
+        return [img for img_list in images for img in img_list]
+
+    if is_sequence and images and is_valid_image(images[0]):
+        return images
+
+    if is_valid_image(images):
+        return [images]
+
+    raise ValueError(f"Could not make batched images from {images}")
+
+
+def adjust_size(size, patch_size):
+    """Round `size` down to a multiple of `patch_size` that contains an even number of patches."""
+    num_patches = size // patch_size
+    if num_patches % 2 != 0:
+        num_patches -= 1
+    return num_patches * patch_size
+
+
+class ImageProcessor(BaseImageProcessor):
+    """
+    Image processor for PaddleOCR-VL.
+
+    Resizes PIL images with `smart_resize`, optionally rescales and normalizes them, and cuts the
+    result into `patch_size` x `patch_size` patches.
+    """
+
+    model_input_names = [
+        "pixel_values",
+        "image_grid_thw",
+        "pixel_values_videos",
+        "video_grid_thw",
+    ]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        resample: int = 3,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        min_pixels: int = 28 * 28 * 130,
+        max_pixels: int = 28 * 28 * 1280,
+        patch_size: int = 14,
+        temporal_patch_size: int = 1,
+        merge_size: int = 2,
+        **kwargs,
+    ) -> None:
+        """
+        Store the preprocessing configuration on the instance.
+
+        `image_mean` / `image_std` fall back to the module-level CLIP constants when `None`.
+        Extra keyword arguments are accepted (so a whole config file can be splatted in) but are
+        not forwarded to the base class.
+        """
+        super().__init__()
+        self.do_resize = do_resize
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.merge_size = merge_size
+        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
+        self.do_convert_rgb = do_convert_rgb
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_dir):
+        """Build a processor from `preprocessor_config.json` inside `pretrained_model_dir`."""
+        pretrained_model_dir = Path(pretrained_model_dir)
+        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
+        with open(image_processor_config_path, "r", encoding="utf-8") as f:
+            image_processor_config = json.load(f)
+        return cls(**image_processor_config)
+
+    def _preprocess(
+        self,
+        images,
+        do_resize: Optional[bool] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+    ):
+        """
+        Resize, rescale, normalize and patchify one image or a list of images.
+
+        The inputs are used through the PIL interface (`.size`, `.convert`, `.resize`). The target
+        size is derived from the first image and applied to every image in the call.
+
+        Returns:
+            `(flatten_patches, grid_thw)`: `flatten_patches` has shape
+            `[grid_t * grid_h * grid_w, C, patch_size, patch_size]` and `grid_thw` is
+            `np.array([grid_t, grid_h, grid_w])`.
+        """
+        images = make_list_of_images(images)
+
+        if do_convert_rgb:
+            images = [image.convert("RGB") for image in images]
+
+        width, height = images[0].size
+        resized_height, resized_width = height, width
+
+        # The target size depends only on the first image's size and the instance
+        # configuration, so compute it once rather than once per image.
+        if do_resize:
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=self.patch_size * self.merge_size,
+                min_pixels=self.min_pixels,
+                max_pixels=self.max_pixels,
+            )
+
+        # Likewise build the normalization arrays once.
+        if do_normalize:
+            mean_array = np.array(image_mean, dtype=np.float32)
+            std_array = np.array(image_std, dtype=np.float32)
+
+        processed_images = []
+        for image in images:
+            if do_resize:
+                image = image.resize((resized_width, resized_height), resample=self.resample)
+
+            image = to_numpy_array(image)
+
+            if do_rescale:
+                image = (image * rescale_factor).astype(np.float32)
+
+            if do_normalize:
+                image = image.astype(np.float32)
+                image -= mean_array
+                image /= std_array
+
+            processed_images.append(image)
+
+        patches = np.array(processed_images)
+        # Move the trailing channel axis in front of the spatial axes.
+        patches = patches.transpose(0, 3, 1, 2)
+        if patches.shape[0] == 1:
+            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+        channel = patches.shape[1]
+        grid_t = patches.shape[0] // self.temporal_patch_size
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+
+        patches = patches.reshape(
+            grid_t,
+            self.temporal_patch_size,
+            channel,
+            grid_h,
+            self.patch_size,
+            grid_w,
+            self.patch_size,
+        )
+        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
+        # The final reshape drops the temporal axis, which is only valid for a temporal patch size of 1.
+        assert self.temporal_patch_size == 1
+        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
+        return flatten_patches, np.array([grid_t, grid_h, grid_w])
+
+    def preprocess(
+        self,
+        images,
+        videos=None,
+        do_resize: Optional[bool] = None,
+        size: Optional[Dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        return_tensors=None,
+    ):
+        """
+        Preprocess images into patches.
+
+        Every option left as `None` falls back to the value stored on the instance. `size` is
+        resolved the same way but is not used afterwards.
+
+        Returns:
+            `BatchFeature` with `pixel_values` and `grid_thw`.
+
+        Raises:
+            NotImplementedError: If `videos` is not `None`.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        if videos is not None:
+            raise NotImplementedError("Videos are not yet supported")
+
+        patches, image_grid_thw = self._preprocess(
+            images,
+            do_resize=do_resize,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_convert_rgb=do_convert_rgb,
+        )
+        pixel_values = np.array(patches)
+        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
+        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/image_processors/qwen3_processor.py b/fastdeploy/input/image_processors/qwen3_processor.py
new file mode 100644
index 00000000000..5927a0f9699
--- /dev/null
+++ b/fastdeploy/input/image_processors/qwen3_processor.py
@@ -0,0 +1,333 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import paddle
+import PIL
+from paddleformers.transformers.feature_extraction_utils import BatchFeature
+from paddleformers.transformers.image_processing_utils import BaseImageProcessor
+from paddleformers.transformers.image_transforms import (
+    normalize,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from paddleformers.transformers.image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
+from PIL import Image
+
+from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
+from fastdeploy.utils import data_processor_logger
+
+IMAGE_MEAN = [0.5, 0.5, 0.5]
+IMAGE_STD = [0.5, 0.5, 0.5]
+
+MIN_PIXELS = 65536
+MAX_PIXELS = 16777216
+
+
+VideoInput = Union[
+    List["PIL.Image.Image"],
+    "np.ndarray",
+    "paddle.Tensor",
+    List["np.ndarray"],
+    List["paddle.Tensor"],
+    List[List["PIL.Image.Image"]],
+    List[List["np.ndarray"]],
+    List[List["paddle.Tensor"]],
+]
+
+
+class ImageProcessor(BaseImageProcessor):
+    """
+    Adaptive image processor for dynamic image resizing and preprocessing.
+
+    This processor handles image resizing, rescaling, normalization and format conversion.
+    It dynamically adjusts image dimensions based on original size and specified constraints.
+    """
+
+    def __init__(
+        self,
+        patch_size: int = 16,
+        merge_size: int = 2,
+        temporal_patch_size: int = 2,
+        min_pixels: int = MIN_PIXELS,
+        max_pixels: int = MAX_PIXELS,
+        image_mean: Union[float, List[float]] = IMAGE_MEAN,
+        image_std: Union[float, List[float]] = IMAGE_STD,
+        rescale_factor: float = 1 / 255,
+        do_rescale: bool = True,
+        do_normalize: bool = True,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        **kwargs,
+    ) -> None:
+        """
+        Initialize image processor with configuration parameters.
+
+        Args:
+            patch_size (int): Spatial patch size for vision encoder
+            merge_size (int): Merge size between vision and LLM encoders
+            temporal_patch_size (int): Temporal patch size for video processing
+            min_pixels (int): Minimum allowed pixels in resized image
+            max_pixels (int): Maximum allowed pixels in resized image
+            image_mean (float/list): Mean values for normalization per channel
+            image_std (float/list): Std values for normalization per channel
+            rescale_factor (float): Scaling factor for pixel values (default 1/255)
+            do_rescale (bool): Whether to rescale images
+            do_normalize (bool): Whether to normalize images
+            resample: Resampling method for image resizing
+            **kwargs: Additional base class arguments
+        """
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        self.merge_size = merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+
+        # The defaults are shared module-level lists; they are only read here, never mutated.
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.rescale_factor = rescale_factor
+        self.do_rescale = do_rescale
+        self.do_normalize = do_normalize
+
+        self.resample = resample
+
+    def _preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        min_pixels: int,
+        max_pixels: int,
+        image_mean: Optional[Union[float, List[float]]],
+        image_std: Optional[Union[float, List[float]]],
+        rescale_factor: float,
+        do_rescale: bool,
+        do_normalize: bool,
+        resample: PILImageResampling,
+        data_format: Optional[ChannelDimension],
+        input_data_format: Optional[Union[str, ChannelDimension]],
+    ):
+        """
+        Internal method for image preprocessing pipeline.
+
+        The target size is computed from the first image and applied to every image in the call.
+
+        Args:
+            images: Input image or batch of images
+            min_pixels: Minimum allowed pixels in output
+            max_pixels: Maximum allowed pixels in output
+            image_mean: Normalization mean values
+            image_std: Normalization std values
+            rescale_factor: Pixel value scaling factor
+            do_rescale: Whether to rescale pixel values
+            do_normalize: Whether to normalize pixel values
+            resample: Resampling method
+            data_format: Output channel format
+            input_data_format: Input channel format; inferred from the first image when `None`
+
+        Returns:
+            tuple: (flatten_patches, grid_dimensions)
+                - flatten_patches: Flattened image patches of shape
+                  [grid_t * grid_h * grid_w, C * temporal_patch_size * patch_size * patch_size]
+                - grid_dimensions: `np.array([grid_t, grid_h, grid_w])`
+        """
+        images = make_list_of_images(images)
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            data_processor_logger.warning(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # Get original dimensions and calculate optimal resize dimensions
+        height, width = get_image_size(images[0], channel_dim=input_data_format)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+        needs_resize = height != resized_height or width != resized_width
+
+        # Fuse rescale into normalize once, before the loop: scaling mean and std by
+        # 1 / rescale_factor lets normalize run on the un-rescaled pixel values, so the
+        # separate rescale step is skipped for every image.
+        if do_rescale and do_normalize:
+            image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
+            image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
+            do_rescale = False
+
+        processed_images = []
+        for image in images:
+            if needs_resize:
+                # Convert to uint8 before resizing to avoid double scaling
+                image = image.astype("uint8")
+                # Convert to PIL Image and resize
+                image = Image.fromarray(image)
+                image = resize(
+                    image,
+                    size=(resized_height, resized_width),
+                    resample=resample,
+                    data_format=input_data_format,
+                )
+
+            # Only reached when normalization is disabled (otherwise rescale was fused above).
+            if do_rescale:
+                image = image.astype(np.float32)
+                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
+
+            if do_normalize:
+                image = image.astype(np.float32)
+                image = normalize(
+                    image=image,
+                    mean=image_mean,
+                    std=image_std,
+                    data_format=input_data_format,
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
+            processed_images.append(image)
+
+        # Convert processed images to numpy array
+        patches = np.array(processed_images)
+
+        # Pad the temporal dimension by repeating the last frame until the frame
+        # count is a multiple of temporal_patch_size.
+        if patches.shape[0] % self.temporal_patch_size != 0:
+            repeats = np.repeat(
+                patches[-1][np.newaxis],
+                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
+                axis=0,
+            )
+            patches = np.concatenate([patches, repeats], axis=0)
+
+        # Convert to channels-first format if needed
+        if data_format == ChannelDimension.LAST:
+            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
+
+        grid_t, channel = patches.shape[:2]
+        grid_t = grid_t // self.temporal_patch_size
+
+        grid_h, grid_w = (
+            resized_height // self.patch_size,
+            resized_width // self.patch_size,
+        )
+        # Reshape into hierarchical patch structure
+        patches = patches.reshape(
+            [
+                grid_t,
+                self.temporal_patch_size,
+                channel,
+                grid_h // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+                grid_w // self.merge_size,
+                self.merge_size,
+                self.patch_size,
+            ]
+        )
+        # Reorder so the patches of one merge group are adjacent:
+        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
+        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
+
+        flatten_patches = patches.reshape(
+            [
+                grid_t * grid_h * grid_w,
+                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
+            ]
+        )
+
+        return flatten_patches, np.array([grid_t, grid_h, grid_w])
+
+    def preprocess(
+        self,
+        images: Union[ImageInput, VideoInput],
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        rescale_factor: Optional[float] = None,
+        do_rescale: Optional[bool] = None,
+        do_normalize: Optional[bool] = None,
+        resample: Optional[PILImageResampling] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
+    ):
+        """
+        Main preprocessing method for images/videos.
+
+        Every override left as `None` falls back to the value stored on the instance.
+
+        Args:
+            images: Input image/video data
+            min_pixels: Override for minimum pixels
+            max_pixels: Override for maximum pixels
+            image_mean: Override for normalization mean
+            image_std: Override for normalization std
+            rescale_factor: Override for rescaling factor
+            do_rescale: Override for rescaling flag
+            do_normalize: Override for normalization flag
+            resample: Override for resampling method
+            return_tensors: Desired output tensor format
+            data_format: Output channel dimension format
+            input_data_format: Input channel dimension format
+
+        Returns:
+            BatchFeature: Processed features containing:
+                - pixel_values: Preprocessed pixel data
+                - grid_thw: Grid dimensions [temporal, height, width]
+
+        Raises:
+            ValueError: For invalid image types
+        """
+        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        resample = resample if resample is not None else self.resample
+
+        if images is not None and not valid_images(images):
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
+
+        pixel_values, grid_thw = self._preprocess(
+            images,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            image_mean=image_mean,
+            image_std=image_std,
+            rescale_factor=rescale_factor,
+            do_rescale=do_rescale,
+            do_normalize=do_normalize,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
+        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/image_processors/qwen_processor.py b/fastdeploy/input/image_processors/qwen_processor.py
new file mode 100644
index 00000000000..7c3df2b69bf
--- /dev/null
+++ b/fastdeploy/input/image_processors/qwen_processor.py
@@ -0,0 +1,332 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" + +from typing import List, Optional, Union + +import numpy as np +import paddle +import PIL +from paddleformers.transformers.feature_extraction_utils import BatchFeature +from paddleformers.transformers.image_processing_utils import BaseImageProcessor +from paddleformers.transformers.image_transforms import ( + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from paddleformers.transformers.image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_list_of_images, + to_numpy_array, + valid_images, +) +from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType +from PIL import Image + +from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize +from fastdeploy.utils import data_processor_logger + +OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] +OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] + +MIN_PIXELS = 4 * 28 * 28 +MAX_PIXELS = 16384 * 28 * 28 + + +VideoInput = Union[ + List["PIL.Image.Image"], + "np.ndarray", + "paddle.Tensor", + List["np.ndarray"], + List["paddle.Tensor"], + List[List["PIL.Image.Image"]], + List[List["np.ndarray"]], + List[List["paddle.Tensor"]], +] + + +class ImageProcessor(BaseImageProcessor): + """ + Adaptive image processor for dynamic image resizing and preprocessing. + + This processor handles image resizing, rescaling, normalization and format conversion. + It dynamically adjusts image dimensions based on original size and specified constraints. 
+ """ + + def __init__( + self, + patch_size: int = 14, + merge_size: int = 2, + temporal_patch_size: int = 2, + min_pixels: int = MIN_PIXELS, + max_pixels: int = MAX_PIXELS, + image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN, + image_std: Union[float, List[float]] = OPENAI_CLIP_STD, + rescale_factor: float = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> None: + """ + Initialize image processor with configuration parameters. + + Args: + patch_size (int): Spatial patch size for vision encoder + merge_size (int): Merge size between vision and LLM encoders + temporal_patch_size (int): Temporal patch size for video processing + min_pixels (int): Minimum allowed pixels in resized image + max_pixels (int): Maximum allowed pixels in resized image + image_mean (float/list): Mean values for normalization per channel + image_std (float/list): Std values for normalization per channel + rescale_factor (float): Scaling factor for pixel values (default 1/255) + do_rescale (bool): Whether to rescale images + do_normalize (bool): Whether to normalize images + resample: Resampling method for image resizing + **kwargs: Additional base class arguments + """ + super().__init__(**kwargs) + self.patch_size = patch_size + self.merge_size = merge_size + self.temporal_patch_size = temporal_patch_size + + self.min_pixels = min_pixels + self.max_pixels = max_pixels + + self.image_mean = image_mean + self.image_std = image_std + self.rescale_factor = rescale_factor + self.do_rescale = do_rescale + self.do_normalize = do_normalize + + self.resample = resample + + def _preprocess( + self, + images: Union[ImageInput, VideoInput], + min_pixels: int, + max_pixels: int, + image_mean: Optional[Union[float, List[float]]], + image_std: Optional[Union[float, List[float]]], + rescale_factor: float, + do_rescale: bool, + do_normalize: bool, + resample: PILImageResampling, + data_format: 
Optional[ChannelDimension], + input_data_format: Optional[Union[str, ChannelDimension]], + ): + """ + Internal method for image preprocessing pipeline. + + Args: + images: Input image or batch of images + min_pixels: Minimum allowed pixels in output + max_pixels: Maximum allowed pixels in output + image_mean: Normalization mean values + image_std: Normalization std values + rescale_factor: Pixel value scaling factor + do_rescale: Whether to rescale pixel values + do_normalize: Whether to normalize pixel values + resample: Resampling method + data_format: Output channel format + input_data_format: Input channel format + + Returns: + tuple: (flatten_patches, grid_dimensions) + - flatten_patches: Flattened image patches + - grid_dimensions: Grid dimensions [t, h, w] + """ + images = make_list_of_images(images) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + data_processor_logger.warning( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # Get original dimensions and calculate optimal resize dimensions + height, width = get_image_size(images[0], channel_dim=input_data_format) + resized_height, resized_width = smart_resize( + height, + width, + factor=self.patch_size * self.merge_size, # Combine patch and merge factors + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + + processed_images = [] + for image in images: + if height != resized_height or width != resized_width: + # Convert to uint8 before resizing to avoid double scaling + image = image.astype("uint8") + # Convert to PIL Image and resize + image = Image.fromarray(image) + image = resize( + image, + size=(resized_height, resized_width), + resample=resample, + data_format=input_data_format, + ) + + if do_rescale and do_normalize: + # Adjust mean and std for combined rescale+normalize + image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) + image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) + do_rescale = False # Skip separate rescale step + + if do_rescale: + image = image.astype(np.float32) + image = rescale(image, scale=rescale_factor, data_format=input_data_format) + + if do_normalize: + image = image.astype(np.float32) + image = normalize( + image=image, + mean=image_mean, + std=image_std, + data_format=input_data_format, + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] + processed_images.append(image) + + # Convert processed images to numpy array + patches = np.array(processed_images) + + # Pad temporal dimension if needed + if patches.shape[0] % self.temporal_patch_size != 0: + repeats = np.repeat( + patches[-1][np.newaxis], + self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), + axis=0, + ) + patches = np.concatenate([patches, repeats], axis=0) + + # Convert to channels-first format if needed + if data_format == 
ChannelDimension.LAST: + patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] + + grid_t, channel = patches.shape[:2] + grid_t = grid_t // self.temporal_patch_size + + grid_h, grid_w = ( + resized_height // self.patch_size, + resized_width // self.patch_size, + ) + # Reshape into hierarchical patch structure + patches = patches.reshape( + [ + grid_t, + self.temporal_patch_size, + channel, + grid_h // self.merge_size, + self.merge_size, + self.patch_size, + grid_w // self.merge_size, + self.merge_size, + self.patch_size, + ] + ) + # Reorder dimensions for better memory access pattern + # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] + patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) + + flatten_patches = patches.reshape( + [ + grid_t * grid_h * grid_w, + channel * self.temporal_patch_size * self.patch_size * self.patch_size, + ] + ) + + return flatten_patches, np.array([grid_t, grid_h, grid_w]) + + def preprocess( + self, + images: Union[ImageInput, VideoInput], + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + rescale_factor: Optional[float] = None, + do_rescale: Optional[bool] = None, + do_normalize: Optional[bool] = None, + resample: Optional[PILImageResampling] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, + ): + """ + Main preprocessing method for images/videos. 
+ + Args: + images: Input image/video data + min_pixels: Override for minimum pixels + max_pixels: Override for maximum pixels + image_mean: Override for normalization mean + image_std: Override for normalization std + rescale_factor: Override for rescaling factor + do_rescale: Override for rescaling flag + do_normalize: Override for normalization flag + resample: Override for resampling method + return_tensors: Desired output tensor format + data_format: Output channel dimension format + input_data_format: Input channel dimension format + + Returns: + BatchFeature: Processed features containing: + - pixel_values: Preprocessed pixel data + - grid_thw: Grid dimensions [temporal, height, width] + + Raises: + ValueError: For invalid image types or dimensions + """ + min_pixels = min_pixels if min_pixels is not None else self.min_pixels + max_pixels = max_pixels if max_pixels is not None else self.max_pixels + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if images is not None and not valid_images(images): + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + pixel_values, grid_thw = self._preprocess( + images, + min_pixels=min_pixels, + max_pixels=max_pixels, + image_mean=image_mean, + image_std=image_std, + rescale_factor=rescale_factor, + do_rescale=do_rescale, + do_normalize=do_normalize, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + ) + data = {"pixel_values": pixel_values, "grid_thw": grid_thw} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/paddleocr_vl_processor/image_processor.py index a6e318e1ed7..ef86d77b714 100644 --- a/fastdeploy/input/paddleocr_vl_processor/image_processor.py +++ b/fastdeploy/input/paddleocr_vl_processor/image_processor.py @@ -14,216 +14,12 @@ # limitations under the License. """ -"""Image processor class for Keye.""" - -# TODO: Support videos - -import json -from pathlib import Path -from typing import Dict, List, Optional, Union - -import numpy as np -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_utils import ( - ImageInput, - is_valid_image, - make_list_of_images, - to_numpy_array, -) - -from fastdeploy.input.image_processors.common import ( - smart_resize_paddleocr as smart_resize, +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.paddleocr_processor +# This file will be removed in a future version. 
+ +from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401 + ImageProcessor, + make_batched_images, + smart_resize, ) - -_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - - -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched images from {images}") - - -def adjust_size(size, patch_size): - num_patches = size // patch_size - if num_patches % 2 != 0: - num_patches -= 1 - return num_patches * patch_size - - -class ImageProcessor(BaseImageProcessor): - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - ] - - def __init__( - self, - do_resize: bool = True, - resample: int = 3, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - min_pixels: int = 28 * 28 * 130, - max_pixels: int = 28 * 28 * 1280, - patch_size: int = 14, - temporal_patch_size: int = 1, - merge_size: int = 2, - **kwargs, - ) -> None: - super().__init__() - self.do_resize = do_resize - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not 
None else _OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - self.temporal_patch_size = temporal_patch_size - self.merge_size = merge_size - self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used - self.do_convert_rgb = do_convert_rgb - - @classmethod - def from_pretrained(cls, pretrained_model_dir): - pretrained_model_dir = Path(pretrained_model_dir) - image_processor_config_path = pretrained_model_dir / "preprocessor_config.json" - with open(image_processor_config_path, "r", encoding="utf-8") as f: - image_processor_config = json.load(f) - return cls(**image_processor_config) - - def _preprocess( - self, - images, - do_resize: Optional[bool] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: Optional[bool] = None, - ): - images = make_list_of_images(images) - - if do_convert_rgb: - images = [image.convert("RGB") for image in images] - - width, height = images[0].size - resized_height, resized_width = height, width - processed_images = [] - - for image in images: - if do_resize: - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, - ) - - image = image.resize((resized_width, resized_height), resample=self.resample) - - image = to_numpy_array(image) - - if do_rescale: - image = (image * rescale_factor).astype(np.float32) - - if do_normalize: - image = image.astype(np.float32) - image -= np.array(image_mean, dtype=np.float32) - image /= np.array(image_std, dtype=np.float32) - - processed_images.append(image) - - patches = np.array(processed_images) - patches = patches.transpose(0, 3, 1, 2) 
- if patches.shape[0] == 1: - patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1)) - channel = patches.shape[1] - grid_t = patches.shape[0] // self.temporal_patch_size - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - - patches = patches.reshape( - grid_t, - self.temporal_patch_size, - channel, - grid_h, - self.patch_size, - grid_w, - self.patch_size, - ) - patches = patches.transpose(0, 3, 5, 2, 1, 4, 6) - assert self.temporal_patch_size == 1 - flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size) - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images, - videos=None, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: Optional[bool] = None, - return_tensors=None, - ): - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - if videos is not None: - raise NotImplementedError("Videos are not yet supported") - - patches, image_grid_thw = self._preprocess( - images, - do_resize=do_resize, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - 
image_std=image_std, - do_convert_rgb=do_convert_rgb, - ) - pixel_values = np.array(patches) - data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/qwen3_vl_processor/image_processor.py b/fastdeploy/input/qwen3_vl_processor/image_processor.py index 5927a0f9699..2b0afe4c047 100644 --- a/fastdeploy/input/qwen3_vl_processor/image_processor.py +++ b/fastdeploy/input/qwen3_vl_processor/image_processor.py @@ -14,320 +14,10 @@ # limitations under the License. """ -from typing import List, Optional, Union +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.qwen3_processor +# This file will be removed in a future version. -import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - normalize, - rescale, - resize, - to_channel_dimension_format, +from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401 + ImageProcessor, ) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize -from fastdeploy.utils import data_processor_logger - -IMAGE_MEAN = [0.5, 0.5, 0.5] -IMAGE_STD = [0.5, 0.5, 0.5] - -MIN_PIXELS = 65536 -MAX_PIXELS = 16777216 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarray"]], - List[List["paddle.Tensor"]], -] - - 
-class ImageProcessor(BaseImageProcessor): - """ - Adaptive image processor for dynamic image resizing and preprocessing. - - This processor handles image resizing, rescaling, normalization and format conversion. - It dynamically adjusts image dimensions based on original size and specified constraints. - """ - - def __init__( - self, - patch_size: int = 16, - merge_size: int = 2, - temporal_patch_size: int = 2, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - image_mean: Union[float, List[float]] = IMAGE_MEAN, - image_std: Union[float, List[float]] = IMAGE_STD, - rescale_factor: float = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - **kwargs, - ) -> None: - """ - Initialize image processor with configuration parameters. - - Args: - patch_size (int): Spatial patch size for vision encoder - merge_size (int): Merge size between vision and LLM encoders - temporal_patch_size (int): Temporal patch size for video processing - min_pixels (int): Minimum allowed pixels in resized image - max_pixels (int): Maximum allowed pixels in resized image - image_mean (float/list): Mean values for normalization per channel - image_std (float/list): Std values for normalization per channel - rescale_factor (float): Scaling factor for pixel values (default 1/255) - do_rescale (bool): Whether to rescale images - do_normalize (bool): Whether to normalize images - resample: Resampling method for image resizing - **kwargs: Additional base class arguments - """ - super().__init__(**kwargs) - self.patch_size = patch_size - self.merge_size = merge_size - self.temporal_patch_size = temporal_patch_size - - self.min_pixels = min_pixels - self.max_pixels = max_pixels - - self.image_mean = image_mean - self.image_std = image_std - self.rescale_factor = rescale_factor - self.do_rescale = do_rescale - self.do_normalize = do_normalize - - self.resample = resample - - def _preprocess( - self, - images: 
Union[ImageInput, VideoInput], - min_pixels: int, - max_pixels: int, - image_mean: Optional[Union[float, List[float]]], - image_std: Optional[Union[float, List[float]]], - rescale_factor: float, - do_rescale: bool, - do_normalize: bool, - resample: PILImageResampling, - data_format: Optional[ChannelDimension], - input_data_format: Optional[Union[str, ChannelDimension]], - ): - """ - Internal method for image preprocessing pipeline. - - Args: - images: Input image or batch of images - min_pixels: Minimum allowed pixels in output - max_pixels: Maximum allowed pixels in output - image_mean: Normalization mean values - image_std: Normalization std values - rescale_factor: Pixel value scaling factor - do_rescale: Whether to rescale pixel values - do_normalize: Whether to normalize pixel values - resample: Resampling method - data_format: Output channel format - input_data_format: Input channel format - - Returns: - tuple: (flatten_patches, grid_dimensions) - - flatten_patches: Flattened image patches - - grid_dimensions: Grid dimensions [t, h, w] - """ - images = make_list_of_images(images) - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. 
- input_data_format = infer_channel_dimension_format(images[0]) - - # Get original dimensions and calculate optimal resize dimensions - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, # Combine patch and merge factors - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - - processed_images = [] - for image in images: - if height != resized_height or width != resized_width: - # Convert to uint8 before resizing to avoid double scaling - image = image.astype("uint8") - # Convert to PIL Image and resize - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - - if do_rescale and do_normalize: - # Adjust mean and std for combined rescale+normalize - image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) - image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) - do_rescale = False # Skip separate rescale step - - # mutual exclusion and upper branch - if do_rescale: - image = image.astype(np.float32) - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = image.astype(np.float32) - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - processed_images.append(image) - - # Convert processed images to numpy array - patches = np.array(processed_images) - - # Pad temporal dimension if needed - if patches.shape[0] % self.temporal_patch_size != 0: - repeats = np.repeat( - patches[-1][np.newaxis], - self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), - axis=0, - ) - patches = np.concatenate([patches, repeats], axis=0) - - # Convert to channels-first format if needed - 
if data_format == ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] - - grid_t, channel = patches.shape[:2] - grid_t = grid_t // self.temporal_patch_size - - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - # Reshape into hierarchical patch structure - patches = patches.reshape( - [ - grid_t, - self.temporal_patch_size, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # Reorder dimensions for better memory access pattern - # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] - patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.temporal_patch_size * self.patch_size * self.patch_size, - ] - ) - - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - rescale_factor: Optional[float] = None, - do_rescale: Optional[bool] = None, - do_normalize: Optional[bool] = None, - resample: Optional[PILImageResampling] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, - ): - """ - Main preprocessing method for images/videos. 
- - Args: - images: Input image/video data - min_pixels: Override for minimum pixels - max_pixels: Override for maximum pixels - image_mean: Override for normalization mean - image_std: Override for normalization std - rescale_factor: Override for rescaling factor - do_rescale: Override for rescaling flag - do_normalize: Override for normalization flag - resample: Override for resampling method - return_tensors: Desired output tensor format - data_format: Output channel dimension format - input_data_format: Input channel dimension format - - Returns: - BatchFeature: Processed features containing: - - pixel_values: Preprocessed pixel data - - grid_thw: Grid dimensions [temporal, height, width] - - Raises: - ValueError: For invalid image types or dimensions - """ - min_pixels = min_pixels if min_pixels is not None else self.min_pixels - max_pixels = max_pixels if max_pixels is not None else self.max_pixels - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - resample = resample if resample is not None else self.resample - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - pixel_values, grid_thw = self._preprocess( - images, - min_pixels=min_pixels, - max_pixels=max_pixels, - image_mean=image_mean, - image_std=image_std, - rescale_factor=rescale_factor, - do_rescale=do_rescale, - do_normalize=do_normalize, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - ) - data = {"pixel_values": pixel_values, "grid_thw": grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/qwen_vl_processor/image_processor.py b/fastdeploy/input/qwen_vl_processor/image_processor.py index 7c3df2b69bf..3a5a77ea6d8 100644 --- a/fastdeploy/input/qwen_vl_processor/image_processor.py +++ b/fastdeploy/input/qwen_vl_processor/image_processor.py @@ -14,319 +14,10 @@ # limitations under the License. """ -from typing import List, Optional, Union +# Backward compatibility: this module has been migrated to +# fastdeploy.input.image_processors.qwen_processor +# This file will be removed in a future version. 
-import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - normalize, - rescale, - resize, - to_channel_dimension_format, +from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401 + ImageProcessor, ) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize -from fastdeploy.utils import data_processor_logger - -OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - -MIN_PIXELS = 4 * 28 * 28 -MAX_PIXELS = 16384 * 28 * 28 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarray"]], - List[List["paddle.Tensor"]], -] - - -class ImageProcessor(BaseImageProcessor): - """ - Adaptive image processor for dynamic image resizing and preprocessing. - - This processor handles image resizing, rescaling, normalization and format conversion. - It dynamically adjusts image dimensions based on original size and specified constraints. 
- """ - - def __init__( - self, - patch_size: int = 14, - merge_size: int = 2, - temporal_patch_size: int = 2, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN, - image_std: Union[float, List[float]] = OPENAI_CLIP_STD, - rescale_factor: float = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - **kwargs, - ) -> None: - """ - Initialize image processor with configuration parameters. - - Args: - patch_size (int): Spatial patch size for vision encoder - merge_size (int): Merge size between vision and LLM encoders - temporal_patch_size (int): Temporal patch size for video processing - min_pixels (int): Minimum allowed pixels in resized image - max_pixels (int): Maximum allowed pixels in resized image - image_mean (float/list): Mean values for normalization per channel - image_std (float/list): Std values for normalization per channel - rescale_factor (float): Scaling factor for pixel values (default 1/255) - do_rescale (bool): Whether to rescale images - do_normalize (bool): Whether to normalize images - resample: Resampling method for image resizing - **kwargs: Additional base class arguments - """ - super().__init__(**kwargs) - self.patch_size = patch_size - self.merge_size = merge_size - self.temporal_patch_size = temporal_patch_size - - self.min_pixels = min_pixels - self.max_pixels = max_pixels - - self.image_mean = image_mean - self.image_std = image_std - self.rescale_factor = rescale_factor - self.do_rescale = do_rescale - self.do_normalize = do_normalize - - self.resample = resample - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: int, - max_pixels: int, - image_mean: Optional[Union[float, List[float]]], - image_std: Optional[Union[float, List[float]]], - rescale_factor: float, - do_rescale: bool, - do_normalize: bool, - resample: PILImageResampling, - data_format: 
Optional[ChannelDimension], - input_data_format: Optional[Union[str, ChannelDimension]], - ): - """ - Internal method for image preprocessing pipeline. - - Args: - images: Input image or batch of images - min_pixels: Minimum allowed pixels in output - max_pixels: Maximum allowed pixels in output - image_mean: Normalization mean values - image_std: Normalization std values - rescale_factor: Pixel value scaling factor - do_rescale: Whether to rescale pixel values - do_normalize: Whether to normalize pixel values - resample: Resampling method - data_format: Output channel format - input_data_format: Input channel format - - Returns: - tuple: (flatten_patches, grid_dimensions) - - flatten_patches: Flattened image patches - - grid_dimensions: Grid dimensions [t, h, w] - """ - images = make_list_of_images(images) - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. 
- input_data_format = infer_channel_dimension_format(images[0]) - - # Get original dimensions and calculate optimal resize dimensions - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, # Combine patch and merge factors - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - - processed_images = [] - for image in images: - if height != resized_height or width != resized_width: - # Convert to uint8 before resizing to avoid double scaling - image = image.astype("uint8") - # Convert to PIL Image and resize - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - - if do_rescale and do_normalize: - # Adjust mean and std for combined rescale+normalize - image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) - image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) - do_rescale = False # Skip separate rescale step - - if do_rescale: - image = image.astype(np.float32) - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = image.astype(np.float32) - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - processed_images.append(image) - - # Convert processed images to numpy array - patches = np.array(processed_images) - - # Pad temporal dimension if needed - if patches.shape[0] % self.temporal_patch_size != 0: - repeats = np.repeat( - patches[-1][np.newaxis], - self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), - axis=0, - ) - patches = np.concatenate([patches, repeats], axis=0) - - # Convert to channels-first format if needed - if data_format == 
ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] - - grid_t, channel = patches.shape[:2] - grid_t = grid_t // self.temporal_patch_size - - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - # Reshape into hierarchical patch structure - patches = patches.reshape( - [ - grid_t, - self.temporal_patch_size, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # Reorder dimensions for better memory access pattern - # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] - patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.temporal_patch_size * self.patch_size * self.patch_size, - ] - ) - - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - rescale_factor: Optional[float] = None, - do_rescale: Optional[bool] = None, - do_normalize: Optional[bool] = None, - resample: Optional[PILImageResampling] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, - ): - """ - Main preprocessing method for images/videos. 
- - Args: - images: Input image/video data - min_pixels: Override for minimum pixels - max_pixels: Override for maximum pixels - image_mean: Override for normalization mean - image_std: Override for normalization std - rescale_factor: Override for rescaling factor - do_rescale: Override for rescaling flag - do_normalize: Override for normalization flag - resample: Override for resampling method - return_tensors: Desired output tensor format - data_format: Output channel dimension format - input_data_format: Input channel dimension format - - Returns: - BatchFeature: Processed features containing: - - pixel_values: Preprocessed pixel data - - grid_thw: Grid dimensions [temporal, height, width] - - Raises: - ValueError: For invalid image types or dimensions - """ - min_pixels = min_pixels if min_pixels is not None else self.min_pixels - max_pixels = max_pixels if max_pixels is not None else self.max_pixels - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - resample = resample if resample is not None else self.resample - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - pixel_values, grid_thw = self._preprocess( - images, - min_pixels=min_pixels, - max_pixels=max_pixels, - image_mean=image_mean, - image_std=image_std, - rescale_factor=rescale_factor, - do_rescale=do_rescale, - do_normalize=do_normalize, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - ) - data = {"pixel_values": pixel_values, "grid_thw": grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) From a48fc5f6bfa5f06f15ea2c2bd45730979a1149b9 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Mon, 30 Mar 2026 19:32:04 +0800 Subject: [PATCH 2/8] step 9~10 --- fastdeploy/input/multimodal_processor.py | 673 ++++++++++++++++++ fastdeploy/input/preprocess.py | 86 +-- .../input/test_image_preprocessor_adaptive.py | 8 +- 3 files changed, 698 insertions(+), 69 deletions(-) create mode 100644 fastdeploy/input/multimodal_processor.py diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py new file mode 100644 index 00000000000..3e5d5c896f1 --- /dev/null +++ b/fastdeploy/input/multimodal_processor.py @@ -0,0 +1,673 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +"""Unified multimodal processor for all VL model types. 
+ +Consolidates the four separate VL processor wrappers (QwenVLProcessor, +Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a +single class that dispatches per ``model_type``. +""" + +import pickle +from collections.abc import Mapping +from typing import Any, Dict, Optional + +import numpy as np + +from fastdeploy.input.base_processor import BaseTextProcessor +from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids +from fastdeploy.utils import data_processor_logger + +QWEN_VL = "qwen_vl" +QWEN3_VL = "qwen3_vl" +PADDLEOCR_VL = "paddleocr_vl" +ERNIE4_5_VL = "ernie4_5_vl" + +_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL} + +_QWEN_EXPECTED_KWARGS = { + "video_max_frames": int, + "video_min_frames": int, +} + +_ERNIE_EXPECTED_KWARGS = { + "spatial_conv_size": int, + "temporal_conv_size": int, + "image_min_pixels": int, + "image_max_pixels": int, + "video_min_pixels": int, + "video_max_pixels": int, + "video_target_frames": int, + "video_frames_sample": str, + "video_max_frames": int, + "video_min_frames": int, + "video_fps": int, +} + +_TYPES_ACCEPT_URL_SUFFIX = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL} + +_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1} + +_SAMPLING_EPS = 1e-5 + + +class MultiModalProcessor(BaseTextProcessor): + """Unified multimodal processor for all supported VL model types. + + Dispatches image-processor creation, config initialisation, and + encoding logic based on ``model_type``. + """ + + def __init__( + self, + model_name_or_path: str, + model_type: str, + config=None, + limit_mm_per_prompt: Optional[Dict[str, Any]] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + reasoning_parser_obj=None, + tool_parser_obj=None, + enable_processor_cache: bool = False, + ): + if model_type not in _SUPPORTED_MODEL_TYPES: + raise ValueError( + f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}." 
+ ) + self.model_type = model_type + self.config = config + self.enable_processor_cache = enable_processor_cache + + tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto" + + super().__init__( + model_name_or_path, + tokenizer_type=tokenizer_type, + reasoning_parser_obj=reasoning_parser_obj, + tool_parser_obj=tool_parser_obj, + ) + + data_processor_logger.info(f"model_name_or_path: {model_name_or_path}") + + processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs) + self._init_mm_processor(processor_kwargs) + self._init_mm_config() + self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt) + + def _load_tokenizer(self): + """Load the appropriate tokenizer based on model_type.""" + if self.tokenizer_type == "ernie4_5": + from paddleformers.transformers import AutoTokenizer as PFAutoTokenizer + + tokenizer = PFAutoTokenizer.from_pretrained(self.model_name_or_path) + else: + from paddleformers.transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True) + return tokenizer + + def _init_mm_processor(self, processor_kwargs: dict): + """Create the model-type-specific internal DataProcessor.""" + if self.model_type == QWEN_VL: + from fastdeploy.input.qwen_vl_processor.process import DataProcessor + + tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2) + self.processor = DataProcessor( + model_path=self.model_name_or_path, + enable_processor_cache=self.enable_processor_cache, + tokens_per_second=tokens_per_second, + tokenizer=self.tokenizer, + **processor_kwargs, + ) + elif self.model_type == QWEN3_VL: + from fastdeploy.input.qwen3_vl_processor.process import DataProcessor + + self.processor = DataProcessor( + model_path=self.model_name_or_path, + enable_processor_cache=self.enable_processor_cache, + tokenizer=self.tokenizer, + **processor_kwargs, + ) + elif self.model_type == PADDLEOCR_VL: + from 
fastdeploy.input.paddleocr_vl_processor.process import DataProcessor + + tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2) + self.processor = DataProcessor( + model_path=self.model_name_or_path, + enable_processor_cache=self.enable_processor_cache, + tokens_per_second=tokens_per_second, + tokenizer=self.tokenizer, + **processor_kwargs, + ) + elif self.model_type == ERNIE4_5_VL: + from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor + + self.processor = DataProcessor( + tokenizer_name=self.model_name_or_path, + image_preprocessor_name=self.model_name_or_path, + enable_processor_cache=self.enable_processor_cache, + **processor_kwargs, + ) + self.processor.eval() + + def _init_mm_config(self): + """Set model-type-specific multimodal configuration attributes.""" + if self.model_type in (QWEN_VL, QWEN3_VL): + self.image_patch_id = self.processor.image_token_id + elif self.model_type == PADDLEOCR_VL: + self.image_patch_id = self.processor.image_patch_id + elif self.model_type == ERNIE4_5_VL: + self.image_patch_id = self.processor.image_patch_id + self.spatial_conv_size = self.processor.spatial_conv_size + + def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict: + """Parse and validate multimodal processor kwargs.""" + if not kwargs: + return {} + + try: + if not isinstance(kwargs, dict): + raise ValueError("mm-processor-kwargs must be a dictionary") + + data_processor_logger.info(f"Processing kwargs: {kwargs}") + + if self.model_type == ERNIE4_5_VL: + expected_types = _ERNIE_EXPECTED_KWARGS + else: + expected_types = _QWEN_EXPECTED_KWARGS + + for key, value in kwargs.items(): + if key in expected_types and not isinstance(value, expected_types[key]): + raise ValueError( + f"Invalid type for {key}: expected " + f"{expected_types[key].__name__}, got {type(value).__name__}" + ) + return kwargs + + except Exception as e: + data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}") + 
return {} + + def _parse_limits(self, limits: Optional[dict]) -> dict: + """Parse multimodal input limits, merging with defaults.""" + if not limits: + return dict(_DEFAULT_MM_LIMITS) + + try: + if not isinstance(limits, dict): + raise ValueError("limit-mm-per-prompt must be a dictionary") + data_processor_logger.info(f"_parse_limits:{limits}") + return {**_DEFAULT_MM_LIMITS, **limits} + except Exception as e: + data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits") + return dict(_DEFAULT_MM_LIMITS) + + def _check_mm_limits(self, item): + """Validate multimodal inputs against configured limits.""" + if isinstance(item, dict): + mm_data = item + else: + mm_data = {"image": [], "video": []} + accept_url_suffix = self.model_type in _TYPES_ACCEPT_URL_SUFFIX + + for message in item: + if isinstance(message.get("content"), list): + for part in message["content"]: + part_type = part.get("type") + if accept_url_suffix: + if part_type in ("image_url", "image"): + mm_data["image"].append(part) + elif part_type in ("video_url", "video"): + mm_data["video"].append(part) + else: + if part_type == "image": + mm_data["image"].append(part) + elif part_type == "video": + mm_data["video"].append(part) + + for modality, data in mm_data.items(): + if modality in self.limit_mm_per_prompt: + limit = self.limit_mm_per_prompt[modality] + if len(data) > limit: + raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}") + + def _get_processor_cache(self, socket, mm_hashes: list) -> list: + """Retrieve cached processor results for the given hashes.""" + req = pickle.dumps(mm_hashes) + socket.send_multipart([b"", req]) + _, resp = socket.recv_multipart() + mm_items = pickle.loads(resp) + data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}") + return mm_items + + def _update_processor_cache(self, socket, mm_hashes: list, mm_items): + """Update the processor cache with new results.""" + req = 
pickle.dumps((mm_hashes, mm_items)) + socket.send_multipart([b"", req]) + data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}") + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]: + """Return per-modality max token counts, if available.""" + if self.model_type == ERNIE4_5_VL: + return self.processor.get_mm_max_tokens_per_item(seq_len) + return None + + def process_request_dict(self, request, max_model_len=None): + """Process a request dictionary into model inputs.""" + if self.model_type == QWEN_VL: + return self._process_request_qwen_vl(request, max_model_len) + elif self.model_type == QWEN3_VL: + return self._process_request_qwen3_vl(request, max_model_len) + elif self.model_type == PADDLEOCR_VL: + return self._process_request_paddleocr_vl(request, max_model_len) + elif self.model_type == ERNIE4_5_VL: + return self._process_request_ernie4_5_vl(request, max_model_len) + + def _process_request_qwen_vl(self, request, max_model_len): + """Process request for qwen_vl model type.""" + request = self._apply_default_parameters(request) + if not request.get("eos_token_ids"): + request["eos_token_ids"] = self.eos_token_ids + + process_stop_token_ids(request, self.update_stop_seq) + + bad_words = request.get("bad_words") + bad_words_token_ids = request.get("bad_words_token_ids") + if bad_words: + bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) + request["bad_words_token_ids"] = bad_words_token_ids + + if request.get("prompt"): + multimodal_data = request.get("multimodal_data") + if multimodal_data is None: + multimodal_data = {} + self._check_mm_limits(multimodal_data) + images = multimodal_data.get("image", None) + videos = multimodal_data.get("video", None) + outputs = self.processor.text2ids(request["prompt"], images, videos) + + elif request.get("messages"): + messages = request["messages"] + self._check_mm_limits(messages) + chat_template_kwargs = request.get("chat_template_kwargs") + if 
chat_template_kwargs: + if isinstance(chat_template_kwargs, dict): + for k, v in chat_template_kwargs.items(): + if k not in request or request[k] is None: + request[k] = v + else: + raise ValueError("Invalid input: chat_template_kwargs must be a dict") + request.setdefault("enable_thinking", False) + outputs = self.processor.request2ids(request) + + else: + raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") + + if request.get("completion_token_ids"): + self.append_completion_tokens(outputs, request["completion_token_ids"]) + + # qwen25_vl not support thinking + request["enable_thinking"] = False + + outputs = self.pack_outputs(outputs) + + request["prompt_token_ids"] = outputs["input_ids"].tolist() + request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) + request["multimodal_inputs"] = outputs + + if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: + request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] + + max_tokens = max_model_len - len(request["prompt_token_ids"]) + if request.get("max_tokens") is None: + request["max_tokens"] = max(1, max_tokens) + else: + request["max_tokens"] = min(max_tokens, request["max_tokens"]) + + if self.reasoning_parser: + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" + + data_processor_logger.info(f"Processed request {request}") + return request + + def _process_request_qwen3_vl(self, request, max_model_len): + """Process request for qwen3_vl model type.""" + request = self._apply_default_parameters(request) + if not 
request.get("eos_token_ids"): + request["eos_token_ids"] = self.eos_token_ids + + stop_sequences = request.get("stop", []) + if stop_sequences: + stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences) + request["stop_token_ids"] = stop_seqs + request["stop_seqs_len"] = stop_seqs_len + + bad_words = request.get("bad_words") + bad_words_token_ids = request.get("bad_words_token_ids") + if bad_words: + bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) + request["bad_words_token_ids"] = bad_words_token_ids + + if request.get("prompt_token_ids"): + messages = request.get("messages") + if messages: + self._check_mm_limits(messages) + request.setdefault("enable_thinking", False) + outputs = self.processor.prompt_token_ids2outputs(request) + + elif request.get("prompt"): + multimodal_data = request.get("multimodal_data") + if multimodal_data is None: + multimodal_data = {} + self._check_mm_limits(multimodal_data) + images = multimodal_data.get("image", None) + videos = multimodal_data.get("video", None) + outputs = self.processor.text2ids(request["prompt"], images, videos) + + elif request.get("messages"): + messages = request["messages"] + self._check_mm_limits(messages) + chat_template_kwargs = request.get("chat_template_kwargs") + if chat_template_kwargs: + if isinstance(chat_template_kwargs, dict): + for k, v in chat_template_kwargs.items(): + if k not in request or request[k] is None: + request[k] = v + else: + raise ValueError("Invalid input: chat_template_kwargs must be a dict") + request.setdefault("enable_thinking", False) + outputs = self.processor.request2ids(request) + + else: + raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") + + if request.get("completion_token_ids"): + self.append_completion_tokens(outputs, request["completion_token_ids"]) + + # qwen3_vl not support thinking + request["enable_thinking"] = False + + outputs = self.pack_outputs(outputs) + + request["prompt_token_ids"] = ( + 
outputs["input_ids"].tolist() if not request.get("prompt_token_ids") else request["prompt_token_ids"] + ) + request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) + request["multimodal_inputs"] = outputs + + if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: + request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] + + max_tokens = max_model_len - len(request["prompt_token_ids"]) + if request.get("max_tokens") is None: + request["max_tokens"] = max(1, max_tokens) + else: + request["max_tokens"] = min(max_tokens, request["max_tokens"]) + + data_processor_logger.info(f"Processed request {request}") + return request + + def _process_request_paddleocr_vl(self, request, max_model_len): + """Process request for paddleocr_vl model type.""" + request = self._apply_default_parameters(request) + if not request.get("eos_token_ids"): + request["eos_token_ids"] = self.eos_token_ids + + process_stop_token_ids(request, self.update_stop_seq) + + if request.get("prompt"): + multimodal_data = request.get("multimodal_data") + if multimodal_data is None: + multimodal_data = {} + self._check_mm_limits(multimodal_data) + images = multimodal_data.get("image", None) + videos = multimodal_data.get("video", None) + outputs = self.processor.text2ids(request["prompt"], images, videos) + + elif request.get("messages"): + messages = request["messages"] + self._check_mm_limits(messages) + outputs = self.processor.request2ids(request) + + else: + raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") + + metadata = request.get("metadata") + if metadata and metadata.get("generated_token_ids"): + self._append_generated_tokens_qwen(outputs, metadata["generated_token_ids"]) + + outputs = self.pack_outputs(outputs) + + request["prompt_token_ids"] = outputs["input_ids"].tolist() + request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) + request["multimodal_inputs"] = outputs + + if max_model_len is not 
None and len(request["prompt_token_ids"]) > max_model_len: + request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] + + max_tokens = max_model_len - len(request["prompt_token_ids"]) + if request.get("max_tokens") is None: + request["max_tokens"] = max(1, max_tokens) + else: + request["max_tokens"] = min(max_tokens, request["max_tokens"]) + + if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: + request["top_p"] = _SAMPLING_EPS + request["top_k"] = 1 + + if self.reasoning_parser: + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" + + return request + + def _process_request_ernie4_5_vl(self, request, max_model_len): + """Process request for ernie4_5_vl model type.""" + request = self._apply_default_parameters(request) + if not request.get("eos_token_ids"): + request["eos_token_ids"] = self.eos_token_ids + + process_stop_token_ids(request, self.update_stop_seq) + + bad_words = request.get("bad_words") + bad_words_token_ids = request.get("bad_words_token_ids") + if bad_words: + bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) + request["bad_words_token_ids"] = bad_words_token_ids + + logits_processors_args = self._prepare_think_stop_sentence( + request.get("logits_processors_args") or {}, max_model_len + ) + request["logits_processors_args"] = logits_processors_args + + if request.get("prompt_token_ids"): + messages = request.get("messages") + if messages: + self._check_mm_limits(messages) + request.setdefault("enable_thinking", True) + outputs = 
self.processor.prompt_token_ids2outputs(request) + elif request.get("prompt"): + multimodal_data = request.get("multimodal_data") + if multimodal_data is None: + multimodal_data = {} + self._check_mm_limits(multimodal_data) + images = multimodal_data.get("image", None) + videos = multimodal_data.get("video", None) + request["prompt_tokens"] = request.get("prompt") + request.setdefault("enable_thinking", True) + outputs = self.processor.text2ids(request["prompt"], images, videos) + elif request.get("messages"): + messages = request["messages"] + self._check_mm_limits(messages) + chat_template_kwargs = request.get("chat_template_kwargs") + if chat_template_kwargs: + if isinstance(chat_template_kwargs, dict): + for k, v in chat_template_kwargs.items(): + if k not in request or request[k] is None: + request[k] = v + else: + raise ValueError("Invalid input: chat_template_kwargs must be a dict") + request.setdefault("enable_thinking", True) + outputs = self.processor.request2ids(request) + else: + raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") + + if request.get("completion_token_ids"): + self.append_completion_tokens(outputs, request["completion_token_ids"]) + + outputs = self.pack_outputs(outputs) + request["prompt_token_ids"] = ( + outputs["input_ids"].tolist() + if ("prompt_token_ids" not in request or not request["prompt_token_ids"]) + else request["prompt_token_ids"] + ) + request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) + request["multimodal_inputs"] = outputs + + if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: + request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] + logits_processors_args = self._update_thinking_prompt_state( + request["prompt_token_ids"], request.get("logits_processors_args") or {} + ) + request["logits_processors_args"] = logits_processors_args + + max_tokens = max_model_len - len(request["prompt_token_ids"]) + if 
request.get("max_tokens") is None: + request["max_tokens"] = max(1, max_tokens) + else: + request["max_tokens"] = min(max_tokens, request["max_tokens"]) + if request.get("reasoning_max_tokens") is None: + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) + + if self.reasoning_parser: + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" + if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: + request["top_p"] = _SAMPLING_EPS + request["top_k"] = 1 + if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False: + request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"]) + + data_processor_logger.info(f"Processed request {request}") + return request + + def append_completion_tokens(self, multimodal_inputs, completion_token_ids): + """Append completion tokens to existing multimodal outputs.""" + if self.model_type in (QWEN_VL, QWEN3_VL): + self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids) + elif self.model_type == PADDLEOCR_VL: + self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids) + elif self.model_type == ERNIE4_5_VL: + self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids) + + def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids): + """Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl.""" + num_tokens = len(completion_token_ids) + multimodal_inputs["input_ids"].extend(completion_token_ids) + multimodal_inputs["token_type_ids"].extend([0] * 
num_tokens) + + pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens) + multimodal_inputs["position_ids"].append(pos_ids) + multimodal_inputs["cur_position"] += num_tokens + + def _append_generated_tokens_qwen(self, multimodal_inputs, generated_token_ids): + """Append generated tokens for paddleocr_vl (uses metadata.generated_token_ids).""" + self._append_completion_tokens_qwen(multimodal_inputs, generated_token_ids) + + def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids): + """Append completion tokens for ernie4_5_vl.""" + num_tokens = len(completion_token_ids) + multimodal_inputs["input_ids"].extend(completion_token_ids) + multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens) + + start = multimodal_inputs["cur_position"] + for i in range(num_tokens): + multimodal_inputs["position_ids"].append([start + i] * 3) + multimodal_inputs["cur_position"] += num_tokens + + def pack_outputs(self, outputs): + """Convert intermediate processing outputs to final format.""" + if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL): + return self._pack_outputs_qwen(outputs) + elif self.model_type == ERNIE4_5_VL: + return self._pack_outputs_ernie(outputs) + + def _pack_outputs_qwen(self, outputs): + """Pack outputs for qwen_vl / qwen3_vl / paddleocr_vl.""" + if not outputs["images"]: + outputs["images"] = None + outputs["grid_thw"] = None + outputs["image_type_ids"] = None + else: + outputs["images"] = np.vstack(outputs["images"]) + outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) + outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) + + outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) + outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) + outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64) + + outputs["image_patch_id"] = self.processor.image_token_id + outputs["video_patch_id"] = 
self.processor.video_token_id + outputs["position_ids"] = outputs["position_ids"].transpose(1, 0) + + outputs["mm_num_token_func"] = self.processor.mm_num_tokens + return outputs + + def _pack_outputs_ernie(self, outputs): + """Pack outputs for ernie4_5_vl.""" + if not outputs["images"]: + outputs["images"] = None + outputs["grid_thw"] = None + outputs["image_type_ids"] = None + else: + outputs["images"] = np.vstack(outputs["images"]) + outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) + outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) + + outputs["image_patch_id"] = self.image_patch_id + outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) + outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) + outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64) + outputs["mm_num_token_func"] = self.processor.mm_num_tokens + return outputs diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py index 8568d1ff32d..0261177eeaa 100644 --- a/fastdeploy/input/preprocess.py +++ b/fastdeploy/input/preprocess.py @@ -106,74 +106,34 @@ def create_processor(self): tool_parser_obj=tool_parser_obj, ) else: - if ErnieArchitectures.contains_ernie_arch(architecture): - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.ernie4_5_vl_processor import ( - Ernie4_5_VLProcessor, - ) - else: - from fastdeploy.input.v1.ernie4_5_vl_processor import ( - Ernie4_5_VLProcessor, - ) + from fastdeploy.input.multimodal_processor import ( + ERNIE4_5_VL, + PADDLEOCR_VL, + QWEN3_VL, + QWEN_VL, + MultiModalProcessor, + ) - self.processor = Ernie4_5_VLProcessor( - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=self.limit_mm_per_prompt, - mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj, - tool_parser_obj=tool_parser_obj, - enable_processor_cache=self.enable_processor_cache, - ) + if ErnieArchitectures.contains_ernie_arch(architecture): + 
model_type = ERNIE4_5_VL elif "PaddleOCRVL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.paddleocr_vl_processor import ( - PaddleOCRVLProcessor, - ) - else: - from fastdeploy.input.v1.paddleocr_vl_processor import ( - PaddleOCRVLProcessor, - ) - - self.processor = PaddleOCRVLProcessor( - config=self.model_config, - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=self.limit_mm_per_prompt, - mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj, - ) + model_type = PADDLEOCR_VL elif "Qwen2_5_VL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.qwen_vl_processor import QwenVLProcessor - else: - from fastdeploy.input.v1.qwen_vl_processor import ( - QwenVLProcessor, - ) - - self.processor = QwenVLProcessor( - config=self.model_config, - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=self.limit_mm_per_prompt, - mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj, - enable_processor_cache=self.enable_processor_cache, - ) + model_type = QWEN_VL elif "Qwen3VL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor - else: - from fastdeploy.input.v1.qwen3_vl_processor import ( - Qwen3VLProcessor, - ) - - self.processor = Qwen3VLProcessor( - config=self.model_config, - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=self.limit_mm_per_prompt, - mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj, - enable_processor_cache=self.enable_processor_cache, - ) + model_type = QWEN3_VL else: raise ValueError(f"Unsupported model processor architecture: {architecture}. 
") + self.processor = MultiModalProcessor( + model_name_or_path=self.model_name_or_path, + model_type=model_type, + config=self.model_config, + limit_mm_per_prompt=self.limit_mm_per_prompt, + mm_processor_kwargs=self.mm_processor_kwargs, + reasoning_parser_obj=reasoning_parser_obj, + tool_parser_obj=tool_parser_obj, + enable_processor_cache=self.enable_processor_cache, + ) + return self.processor diff --git a/tests/input/test_image_preprocessor_adaptive.py b/tests/input/test_image_preprocessor_adaptive.py index cc9ed857554..d01ce6e179e 100644 --- a/tests/input/test_image_preprocessor_adaptive.py +++ b/tests/input/test_image_preprocessor_adaptive.py @@ -340,9 +340,7 @@ def test_preprocess_scaled_image_warning(self): # Create a scaled image (values between 0-1) img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5 # Use patch to capture warning - with patch( - "fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger" - ) as mock_logger: + with patch("fastdeploy.input.image_processors.adaptive_processor.data_processor_logger") as mock_logger: # Directly call _preprocess, pass scaled image self.processor._preprocess( [img_array], # Pass scaled numpy array @@ -356,9 +354,7 @@ def test_preprocess_invalid_images_check(self): """Test invalid image check in preprocess (line 464)""" # Test invalid image type - need to ensure valid_images returns False # Use patch to make valid_images return False, but make_batched_images succeeds - with patch( - "fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images" - ) as mock_valid: + with patch("fastdeploy.input.image_processors.adaptive_processor.valid_images") as mock_valid: mock_valid.return_value = False valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False with self.assertRaises(ValueError) as context: From f5bdd67062dddce30d1ac2436e2cf85fbaa361be Mon Sep 17 00:00:00 2001 From: 
luukunn <981429396@qq.com> Date: Tue, 31 Mar 2026 16:11:39 +0800 Subject: [PATCH 3/8] update multimodal --- fastdeploy/input/multimodal_processor.py | 350 ++++++----------------- 1 file changed, 88 insertions(+), 262 deletions(-) diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py index 3e5d5c896f1..afada3d2e3d 100644 --- a/fastdeploy/input/multimodal_processor.py +++ b/fastdeploy/input/multimodal_processor.py @@ -265,268 +265,117 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int] return None def process_request_dict(self, request, max_model_len=None): - """Process a request dictionary into model inputs.""" - if self.model_type == QWEN_VL: - return self._process_request_qwen_vl(request, max_model_len) - elif self.model_type == QWEN3_VL: - return self._process_request_qwen3_vl(request, max_model_len) - elif self.model_type == PADDLEOCR_VL: - return self._process_request_paddleocr_vl(request, max_model_len) - elif self.model_type == ERNIE4_5_VL: - return self._process_request_ernie4_5_vl(request, max_model_len) + """Process a request dictionary into model inputs. - def _process_request_qwen_vl(self, request, max_model_len): - """Process request for qwen_vl model type.""" + Unified template-method flow for all VL model types. Per-model + differences are handled by small conditional branches rather than + duplicating the entire pipeline. 
+ """ request = self._apply_default_parameters(request) + if not request.get("eos_token_ids"): request["eos_token_ids"] = self.eos_token_ids - process_stop_token_ids(request, self.update_stop_seq) - - bad_words = request.get("bad_words") - bad_words_token_ids = request.get("bad_words_token_ids") - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request["bad_words_token_ids"] = bad_words_token_ids + self._process_stop_tokens(request) - if request.get("prompt"): - multimodal_data = request.get("multimodal_data") - if multimodal_data is None: - multimodal_data = {} - self._check_mm_limits(multimodal_data) - images = multimodal_data.get("image", None) - videos = multimodal_data.get("video", None) - outputs = self.processor.text2ids(request["prompt"], images, videos) + if self.model_type != PADDLEOCR_VL: + self._process_bad_words(request) - elif request.get("messages"): - messages = request["messages"] - self._check_mm_limits(messages) - chat_template_kwargs = request.get("chat_template_kwargs") - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if k not in request or request[k] is None: - request[k] = v - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - request.setdefault("enable_thinking", False) - outputs = self.processor.request2ids(request) + if self.model_type == ERNIE4_5_VL: + logits_processors_args = self._prepare_think_stop_sentence( + request.get("logits_processors_args") or {}, max_model_len + ) + request["logits_processors_args"] = logits_processors_args - else: - raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") + outputs = self._tokenize_request(request) - if request.get("completion_token_ids"): - self.append_completion_tokens(outputs, request["completion_token_ids"]) + self._process_post_tokens(request, outputs) - # qwen25_vl not support thinking - request["enable_thinking"] = False + 
if self.model_type in (QWEN_VL, QWEN3_VL): + request["enable_thinking"] = False outputs = self.pack_outputs(outputs) - request["prompt_token_ids"] = outputs["input_ids"].tolist() + if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"): + pass # preserve existing prompt_token_ids + else: + request["prompt_token_ids"] = outputs["input_ids"].tolist() request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) request["multimodal_inputs"] = outputs if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] + if self.model_type == ERNIE4_5_VL: + logits_processors_args = self._update_thinking_prompt_state( + request["prompt_token_ids"], request.get("logits_processors_args") or {} + ) + request["logits_processors_args"] = logits_processors_args + max_tokens = max_model_len - len(request["prompt_token_ids"]) if request.get("max_tokens") is None: request["max_tokens"] = max(1, max_tokens) else: request["max_tokens"] = min(max_tokens, request["max_tokens"]) - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - parts = request["request_id"].split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.get("n", 1) - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request["request_id"]] = model_status - request["enable_thinking"] = model_status == "think_start" - - data_processor_logger.info(f"Processed request {request}") - return request - - def _process_request_qwen3_vl(self, request, max_model_len): - """Process request for qwen3_vl model type.""" - request = self._apply_default_parameters(request) - if not request.get("eos_token_ids"): - request["eos_token_ids"] = self.eos_token_ids - - stop_sequences = request.get("stop", []) - if stop_sequences: - 
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences) - request["stop_token_ids"] = stop_seqs - request["stop_seqs_len"] = stop_seqs_len - - bad_words = request.get("bad_words") - bad_words_token_ids = request.get("bad_words_token_ids") - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request["bad_words_token_ids"] = bad_words_token_ids - - if request.get("prompt_token_ids"): - messages = request.get("messages") - if messages: - self._check_mm_limits(messages) - request.setdefault("enable_thinking", False) - outputs = self.processor.prompt_token_ids2outputs(request) - - elif request.get("prompt"): - multimodal_data = request.get("multimodal_data") - if multimodal_data is None: - multimodal_data = {} - self._check_mm_limits(multimodal_data) - images = multimodal_data.get("image", None) - videos = multimodal_data.get("video", None) - outputs = self.processor.text2ids(request["prompt"], images, videos) - - elif request.get("messages"): - messages = request["messages"] - self._check_mm_limits(messages) - chat_template_kwargs = request.get("chat_template_kwargs") - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if k not in request or request[k] is None: - request[k] = v - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - request.setdefault("enable_thinking", False) - outputs = self.processor.request2ids(request) - - else: - raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") - - if request.get("completion_token_ids"): - self.append_completion_tokens(outputs, request["completion_token_ids"]) - - # qwen3_vl not support thinking - request["enable_thinking"] = False - - outputs = self.pack_outputs(outputs) + if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None: + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) - request["prompt_token_ids"] 
= ( - outputs["input_ids"].tolist() if not request.get("prompt_token_ids") else request["prompt_token_ids"] - ) - request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) - request["multimodal_inputs"] = outputs + if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL): + if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: + request["top_p"] = _SAMPLING_EPS + request["top_k"] = 1 - if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: - request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] + if self.model_type != QWEN3_VL and self.reasoning_parser: + self._apply_reasoning_parser(request) - max_tokens = max_model_len - len(request["prompt_token_ids"]) - if request.get("max_tokens") is None: - request["max_tokens"] = max(1, max_tokens) - else: - request["max_tokens"] = min(max_tokens, request["max_tokens"]) + if self.model_type == ERNIE4_5_VL: + if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False: + request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"]) data_processor_logger.info(f"Processed request {request}") return request - def _process_request_paddleocr_vl(self, request, max_model_len): - """Process request for paddleocr_vl model type.""" - request = self._apply_default_parameters(request) - if not request.get("eos_token_ids"): - request["eos_token_ids"] = self.eos_token_ids - - process_stop_token_ids(request, self.update_stop_seq) - - if request.get("prompt"): - multimodal_data = request.get("multimodal_data") - if multimodal_data is None: - multimodal_data = {} - self._check_mm_limits(multimodal_data) - images = multimodal_data.get("image", None) - videos = multimodal_data.get("video", None) - outputs = self.processor.text2ids(request["prompt"], images, videos) - - elif request.get("messages"): - messages = request["messages"] - self._check_mm_limits(messages) - outputs = self.processor.request2ids(request) - 
- else: - raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") - - metadata = request.get("metadata") - if metadata and metadata.get("generated_token_ids"): - self._append_generated_tokens_qwen(outputs, metadata["generated_token_ids"]) - - outputs = self.pack_outputs(outputs) - - request["prompt_token_ids"] = outputs["input_ids"].tolist() - request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) - request["multimodal_inputs"] = outputs - - if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: - request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] - - max_tokens = max_model_len - len(request["prompt_token_ids"]) - if request.get("max_tokens") is None: - request["max_tokens"] = max(1, max_tokens) + def _process_stop_tokens(self, request): + """Handle stop token processing based on model type.""" + if self.model_type == QWEN3_VL: + stop_sequences = request.get("stop", []) + if stop_sequences: + stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences) + request["stop_token_ids"] = stop_seqs + request["stop_seqs_len"] = stop_seqs_len else: - request["max_tokens"] = min(max_tokens, request["max_tokens"]) - - if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: - request["top_p"] = _SAMPLING_EPS - request["top_k"] = 1 - - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - parts = request["request_id"].split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.get("n", 1) - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request["request_id"]] = model_status - request["enable_thinking"] = model_status == "think_start" - - return request - - def _process_request_ernie4_5_vl(self, request, max_model_len): - """Process request for ernie4_5_vl model type.""" - request 
= self._apply_default_parameters(request) - if not request.get("eos_token_ids"): - request["eos_token_ids"] = self.eos_token_ids - - process_stop_token_ids(request, self.update_stop_seq) + process_stop_token_ids(request, self.update_stop_seq) + def _process_bad_words(self, request): + """Process bad_words into token ids.""" bad_words = request.get("bad_words") bad_words_token_ids = request.get("bad_words_token_ids") if bad_words: bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) request["bad_words_token_ids"] = bad_words_token_ids - logits_processors_args = self._prepare_think_stop_sentence( - request.get("logits_processors_args") or {}, max_model_len - ) - request["logits_processors_args"] = logits_processors_args + def _tokenize_request(self, request): + """Core tokenization dispatch: prompt_token_ids > prompt > messages.""" + default_thinking = True if self.model_type == ERNIE4_5_VL else False - if request.get("prompt_token_ids"): + if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL): messages = request.get("messages") if messages: self._check_mm_limits(messages) - request.setdefault("enable_thinking", True) - outputs = self.processor.prompt_token_ids2outputs(request) + request.setdefault("enable_thinking", default_thinking) + return self.processor.prompt_token_ids2outputs(request) + elif request.get("prompt"): - multimodal_data = request.get("multimodal_data") - if multimodal_data is None: - multimodal_data = {} + multimodal_data = request.get("multimodal_data") or {} self._check_mm_limits(multimodal_data) images = multimodal_data.get("image", None) videos = multimodal_data.get("video", None) - request["prompt_tokens"] = request.get("prompt") - request.setdefault("enable_thinking", True) - outputs = self.processor.text2ids(request["prompt"], images, videos) + if self.model_type == ERNIE4_5_VL: + request["prompt_tokens"] = request.get("prompt") + request.setdefault("enable_thinking", default_thinking) + 
return self.processor.text2ids(request["prompt"], images, videos) + elif request.get("messages"): messages = request["messages"] self._check_mm_limits(messages) @@ -538,58 +387,35 @@ def _process_request_ernie4_5_vl(self, request, max_model_len): request[k] = v else: raise ValueError("Invalid input: chat_template_kwargs must be a dict") - request.setdefault("enable_thinking", True) - outputs = self.processor.request2ids(request) + request.setdefault("enable_thinking", default_thinking) + return self.processor.request2ids(request) + else: raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") - if request.get("completion_token_ids"): - self.append_completion_tokens(outputs, request["completion_token_ids"]) - - outputs = self.pack_outputs(outputs) - request["prompt_token_ids"] = ( - outputs["input_ids"].tolist() - if ("prompt_token_ids" not in request or not request["prompt_token_ids"]) - else request["prompt_token_ids"] - ) - request["prompt_token_ids_len"] = len(request["prompt_token_ids"]) - request["multimodal_inputs"] = outputs - - if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: - request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] - logits_processors_args = self._update_thinking_prompt_state( - request["prompt_token_ids"], request.get("logits_processors_args") or {} - ) - request["logits_processors_args"] = logits_processors_args - - max_tokens = max_model_len - len(request["prompt_token_ids"]) - if request.get("max_tokens") is None: - request["max_tokens"] = max(1, max_tokens) + def _process_post_tokens(self, request, outputs): + """Handle post-tokenization token appending.""" + if self.model_type == PADDLEOCR_VL: + metadata = request.get("metadata") + if metadata and metadata.get("generated_token_ids"): + self._append_generated_tokens_qwen(outputs, metadata["generated_token_ids"]) else: - request["max_tokens"] = min(max_tokens, request["max_tokens"]) - if 
request.get("reasoning_max_tokens") is None: - request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) - - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) - parts = request["request_id"].split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.get("n", 1) - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request["request_id"]] = model_status - request["enable_thinking"] = model_status == "think_start" - if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS: - request["top_p"] = _SAMPLING_EPS - request["top_k"] = 1 - if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False: - request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"]) - - data_processor_logger.info(f"Processed request {request}") - return request + if request.get("completion_token_ids"): + self.append_completion_tokens(outputs, request["completion_token_ids"]) + + def _apply_reasoning_parser(self, request): + """Apply reasoning parser and update model status dict.""" + model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"]) + parts = request["request_id"].split("_") + if len(parts) > 1: + real_req_id = parts[0] + index = int(parts[1]) + n = request.get("n", 1) + for idx in range(index * n, (index + 1) * n): + self.model_status_dict[f"{real_req_id}_{idx}"] = model_status + else: + self.model_status_dict[request["request_id"]] = model_status + request["enable_thinking"] = model_status == "think_start" def append_completion_tokens(self, multimodal_inputs, completion_token_ids): """Append completion tokens to existing multimodal outputs.""" From 7646a49a5a05b058dd0f9c44257096e8c361a6c5 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Tue, 31 Mar 2026 16:24:08 +0800 
Subject: [PATCH 4/8] update multimodal --- fastdeploy/input/multimodal_processor.py | 49 ++++++------------------ 1 file changed, 11 insertions(+), 38 deletions(-) diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py index afada3d2e3d..2b9211250b2 100644 --- a/fastdeploy/input/multimodal_processor.py +++ b/fastdeploy/input/multimodal_processor.py @@ -398,7 +398,7 @@ def _process_post_tokens(self, request, outputs): if self.model_type == PADDLEOCR_VL: metadata = request.get("metadata") if metadata and metadata.get("generated_token_ids"): - self._append_generated_tokens_qwen(outputs, metadata["generated_token_ids"]) + self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"]) else: if request.get("completion_token_ids"): self.append_completion_tokens(outputs, request["completion_token_ids"]) @@ -419,12 +419,10 @@ def _apply_reasoning_parser(self, request): def append_completion_tokens(self, multimodal_inputs, completion_token_ids): """Append completion tokens to existing multimodal outputs.""" - if self.model_type in (QWEN_VL, QWEN3_VL): - self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids) - elif self.model_type == PADDLEOCR_VL: - self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids) - elif self.model_type == ERNIE4_5_VL: + if self.model_type == ERNIE4_5_VL: self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids) + else: + self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids) def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids): """Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl.""" @@ -436,10 +434,6 @@ def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids multimodal_inputs["position_ids"].append(pos_ids) multimodal_inputs["cur_position"] += num_tokens - def _append_generated_tokens_qwen(self, multimodal_inputs, generated_token_ids): - 
"""Append generated tokens for paddleocr_vl (uses metadata.generated_token_ids).""" - self._append_completion_tokens_qwen(multimodal_inputs, generated_token_ids) - def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids): """Append completion tokens for ernie4_5_vl.""" num_tokens = len(completion_token_ids) @@ -453,13 +447,6 @@ def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_id def pack_outputs(self, outputs): """Convert intermediate processing outputs to final format.""" - if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL): - return self._pack_outputs_qwen(outputs) - elif self.model_type == ERNIE4_5_VL: - return self._pack_outputs_ernie(outputs) - - def _pack_outputs_qwen(self, outputs): - """Pack outputs for qwen_vl / qwen3_vl / paddleocr_vl.""" if not outputs["images"]: outputs["images"] = None outputs["grid_thw"] = None @@ -471,29 +458,15 @@ def _pack_outputs_qwen(self, outputs): outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) - outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64) - - outputs["image_patch_id"] = self.processor.image_token_id - outputs["video_patch_id"] = self.processor.video_token_id - outputs["position_ids"] = outputs["position_ids"].transpose(1, 0) - outputs["mm_num_token_func"] = self.processor.mm_num_tokens - return outputs - def _pack_outputs_ernie(self, outputs): - """Pack outputs for ernie4_5_vl.""" - if not outputs["images"]: - outputs["images"] = None - outputs["grid_thw"] = None - outputs["image_type_ids"] = None + if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL): + outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64) + outputs["image_patch_id"] = self.processor.image_token_id + outputs["video_patch_id"] = self.processor.video_token_id + outputs["position_ids"] = 
outputs["position_ids"].transpose(1, 0) else: - outputs["images"] = np.vstack(outputs["images"]) - outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) - outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) + outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64) + outputs["image_patch_id"] = self.image_patch_id - outputs["image_patch_id"] = self.image_patch_id - outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) - outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) - outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64) - outputs["mm_num_token_func"] = self.processor.mm_num_tokens return outputs From d9199abbe5c8cb030ae47dd4a1a16d2343b2a228 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 1 Apr 2026 15:13:03 +0800 Subject: [PATCH 5/8] fix load tokenizer --- .../input/image_processors/adaptive_processor.py | 2 +- fastdeploy/input/multimodal_processor.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fastdeploy/input/image_processors/adaptive_processor.py b/fastdeploy/input/image_processors/adaptive_processor.py index 47e677e4917..e5ad4960391 100644 --- a/fastdeploy/input/image_processors/adaptive_processor.py +++ b/fastdeploy/input/image_processors/adaptive_processor.py @@ -64,7 +64,7 @@ List["np.ndarray"], List["paddle.Tensor"], List[List["PIL.Image.Image"]], - List[List["np.ndarrray"]], + List[List["np.ndarray"]], List[List["paddle.Tensor"]], ] diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py index 2b9211250b2..143160b0fce 100644 --- a/fastdeploy/input/multimodal_processor.py +++ b/fastdeploy/input/multimodal_processor.py @@ -109,9 +109,16 @@ def __init__( def _load_tokenizer(self): """Load the appropriate tokenizer based on model_type.""" if self.tokenizer_type == "ernie4_5": - from paddleformers.transformers import AutoTokenizer as PFAutoTokenizer + import os - 
tokenizer = PFAutoTokenizer.from_pretrained(self.model_name_or_path) + from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer + + vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"] + for name in vocab_file_names: + if os.path.exists(os.path.join(self.model_name_or_path, name)): + Ernie4_5Tokenizer.resource_files_names["vocab_file"] = name + break + tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path) else: from paddleformers.transformers import AutoTokenizer From 70c5af31a065b2193cc1a37cc4056a2e0269478c Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 1 Apr 2026 18:01:53 +0800 Subject: [PATCH 6/8] add unit test --- tests/input/test_multimodal_processor.py | 1130 ++++++++++++++++++++++ 1 file changed, 1130 insertions(+) create mode 100644 tests/input/test_multimodal_processor.py diff --git a/tests/input/test_multimodal_processor.py b/tests/input/test_multimodal_processor.py new file mode 100644 index 00000000000..bfce5b302a9 --- /dev/null +++ b/tests/input/test_multimodal_processor.py @@ -0,0 +1,1130 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import pickle +import unittest +from unittest.mock import MagicMock, patch + +import numpy as np + +from fastdeploy.input.multimodal_processor import ( + _DEFAULT_MM_LIMITS, + _SAMPLING_EPS, + ERNIE4_5_VL, + PADDLEOCR_VL, + QWEN3_VL, + QWEN_VL, + MultiModalProcessor, +) +from fastdeploy.input.utils import IDS_TYPE_FLAG + + +def _make_processor(model_type, **overrides): + """Create a MultiModalProcessor instance with __init__ bypassed. + + Manually sets the minimum attributes required by the methods under test. + """ + with patch.object(MultiModalProcessor, "__init__", return_value=None): + proc = MultiModalProcessor.__new__(MultiModalProcessor) + proc.model_type = model_type + proc.config = MagicMock() + proc.enable_processor_cache = False + proc.model_name_or_path = "/mock/model" + proc.tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto" + proc.limit_mm_per_prompt = dict(_DEFAULT_MM_LIMITS) + proc.eos_token_ids = [2] + proc.eos_token_id_len = 1 + proc.pad_token_id = 0 + proc.reasoning_parser = None + proc.tool_parser_obj = None + proc.model_status_dict = {} + proc.decode_status = {} + proc.tool_parser_dict = {} + proc.generation_config = MagicMock() + proc.generation_config.top_p = 0.7 + proc.generation_config.temperature = 1.0 + proc.generation_config.repetition_penalty = 1.0 + proc.generation_config.frequency_penalty = 0.0 + proc.generation_config.presence_penalty = 0.0 + + # Mock tokenizer + tokenizer = MagicMock() + tokenizer.eos_token_id = 2 + tokenizer.eos_token = "" + tokenizer.bos_token_id = 1 + tokenizer.bos_token = "" + tokenizer.pad_token_id = 0 + tokenizer.vocab_size = 32000 + tokenizer.chat_template = "dummy" + tokenizer.tokenize.return_value = ["hello"] + tokenizer.convert_tokens_to_ids.return_value = [100] + tokenizer.decode.return_value = "hello" + proc.tokenizer = tokenizer + + # Mock processor (the internal DataProcessor) + processor = MagicMock() + processor.image_token_id = 151655 + processor.video_token_id = 151656 + 
processor.image_patch_id = 151655 + processor.spatial_conv_size = 14 + processor.mm_num_tokens = MagicMock(return_value=1) + processor._compute_text_positions.return_value = np.array([[3, 4], [3, 4], [3, 4]]) + proc.processor = processor + + # Set attributes normally set by _init_mm_config + if model_type in (QWEN_VL, QWEN3_VL): + proc.image_patch_id = processor.image_token_id + elif model_type == PADDLEOCR_VL: + proc.image_patch_id = processor.image_patch_id + elif model_type == ERNIE4_5_VL: + proc.image_patch_id = processor.image_patch_id + proc.spatial_conv_size = processor.spatial_conv_size + + # Apply any overrides + for k, v in overrides.items(): + setattr(proc, k, v) + return proc + + +# =================================================================== +# __init__ validation +# =================================================================== +class TestMultiModalProcessorInitValidation(unittest.TestCase): + + def test_unsupported_model_type_raises(self): + """Line 86: unsupported model_type should raise ValueError.""" + with self.assertRaises(Exception): + # We need to let __init__ run the model_type check. + # Mock the parts that come after the check to isolate it. 
+ with patch.object(MultiModalProcessor, "__init__", wraps=MultiModalProcessor.__init__) as _: + proc = object.__new__(MultiModalProcessor) + # Call the real __init__ which should fail on model_type check + MultiModalProcessor.__init__(proc, "/mock", model_type="unsupported_type") + + +# =================================================================== +# _parse_processor_kwargs +# =================================================================== +class TestParseProcessorKwargs(unittest.TestCase): + + def test_empty_kwargs_returns_empty(self): + proc = _make_processor(QWEN_VL) + self.assertEqual(proc._parse_processor_kwargs(None), {}) + self.assertEqual(proc._parse_processor_kwargs({}), {}) + + def test_valid_qwen_kwargs(self): + """Lines 196, 198-204: valid kwargs for qwen model type.""" + proc = _make_processor(QWEN_VL) + kwargs = {"video_max_frames": 10, "video_min_frames": 1} + result = proc._parse_processor_kwargs(kwargs) + self.assertEqual(result, kwargs) + + def test_valid_ernie_kwargs(self): + """Lines 193-194: valid kwargs for ernie model type.""" + proc = _make_processor(ERNIE4_5_VL) + kwargs = {"spatial_conv_size": 2, "temporal_conv_size": 1, "video_max_frames": 32} + result = proc._parse_processor_kwargs(kwargs) + self.assertEqual(result, kwargs) + + def test_invalid_type_not_dict(self): + """Lines 188-189: non-dict kwargs should return empty.""" + proc = _make_processor(QWEN_VL) + result = proc._parse_processor_kwargs("invalid") + self.assertEqual(result, {}) + + def test_invalid_value_type(self): + """Lines 199-200: wrong value type should return empty.""" + proc = _make_processor(QWEN_VL) + result = proc._parse_processor_kwargs({"video_max_frames": "ten"}) + self.assertEqual(result, {}) + + def test_mixed_valid_invalid_value_types(self): + proc = _make_processor(ERNIE4_5_VL) + result = proc._parse_processor_kwargs({"spatial_conv_size": 2, "image_min_pixels": "bad"}) + self.assertEqual(result, {}) + + def test_unknown_keys_pass_through(self): + 
"""Keys not in expected_types are not validated, just passed through.""" + proc = _make_processor(QWEN_VL) + kwargs = {"unknown_key": "any_value"} + result = proc._parse_processor_kwargs(kwargs) + self.assertEqual(result, kwargs) + + +# =================================================================== +# _parse_limits +# =================================================================== +class TestParseLimits(unittest.TestCase): + + def test_none_returns_defaults(self): + proc = _make_processor(QWEN_VL) + self.assertEqual(proc._parse_limits(None), dict(_DEFAULT_MM_LIMITS)) + + def test_valid_limits_merged(self): + """Lines 219: valid limits merged with defaults.""" + proc = _make_processor(QWEN_VL) + result = proc._parse_limits({"image": 5, "video": 3}) + self.assertEqual(result, {"image": 5, "video": 3, "audio": 1}) + + def test_partial_limits(self): + proc = _make_processor(QWEN_VL) + result = proc._parse_limits({"image": 10}) + self.assertEqual(result, {"image": 10, "video": 1, "audio": 1}) + + def test_invalid_type_returns_defaults(self): + """Lines 216-217, 220-222: non-dict returns defaults.""" + proc = _make_processor(QWEN_VL) + result = proc._parse_limits("invalid") + self.assertEqual(result, dict(_DEFAULT_MM_LIMITS)) + + +# =================================================================== +# _check_mm_limits +# =================================================================== +class TestCheckMMLimits(unittest.TestCase): + + def test_dict_input_within_limits(self): + """Lines 226-227: dict input within limits passes.""" + proc = _make_processor(QWEN_VL) + proc.limit_mm_per_prompt = {"image": 2, "video": 1, "audio": 1} + mm_data = {"image": ["img1"], "video": ["vid1"]} + proc._check_mm_limits(mm_data) # should not raise + + def test_dict_input_exceeds_limit(self): + """Lines 247-251: dict input exceeding limit raises ValueError.""" + proc = _make_processor(QWEN_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} + mm_data = {"image": 
["img1", "img2"]} + with self.assertRaises(ValueError) as ctx: + proc._check_mm_limits(mm_data) + self.assertIn("Too many image items", str(ctx.exception)) + + def test_messages_input_qwen_vl_accepts_url_suffix(self): + """Lines 229-240: messages with image_url/video_url for qwen_vl.""" + proc = _make_processor(QWEN_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "file://img.jpg"}}, + {"type": "text", "text": "describe"}, + ], + } + ] + proc._check_mm_limits(messages) # should not raise + + def test_messages_input_qwen_vl_image_type(self): + """Lines 237: 'image' type also accepted for url_suffix models.""" + proc = _make_processor(QWEN_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} + messages = [ + {"role": "user", "content": [{"type": "image", "image": "data"}]}, + ] + proc._check_mm_limits(messages) + + def test_messages_input_qwen_vl_video_url_type(self): + """Lines 239-240: video_url type for qwen_vl.""" + proc = _make_processor(QWEN_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} + messages = [ + {"role": "user", "content": [{"type": "video_url", "video_url": {"url": "file://vid.mp4"}}]}, + ] + proc._check_mm_limits(messages) + + def test_messages_input_ernie_only_accepts_plain_types(self): + """Lines 241-245: ernie4_5_vl only accepts 'image'/'video' types, not *_url.""" + proc = _make_processor(ERNIE4_5_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} + # image_url should NOT be counted for ernie + messages = [ + {"role": "user", "content": [{"type": "image_url", "image_url": {"url": "file://img.jpg"}}]}, + ] + proc._check_mm_limits(messages) # no exception since image_url not counted + + def test_messages_input_ernie_image_type(self): + """Lines 242-243: ernie 'image' type is counted.""" + proc = _make_processor(ERNIE4_5_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 
1, "audio": 1} + messages = [ + { + "role": "user", + "content": [ + {"type": "image", "image": "data1"}, + {"type": "image", "image": "data2"}, + ], + } + ] + with self.assertRaises(ValueError): + proc._check_mm_limits(messages) + + def test_messages_input_ernie_video_type(self): + """Lines 244-245: ernie 'video' type is counted.""" + proc = _make_processor(ERNIE4_5_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} + messages = [ + {"role": "user", "content": [{"type": "video", "video": "data"}]}, + ] + proc._check_mm_limits(messages) # within limit + + def test_messages_exceed_video_limit(self): + """Lines 247-251: video exceeding limit raises ValueError.""" + proc = _make_processor(QWEN_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} + messages = [ + { + "role": "user", + "content": [ + {"type": "video_url", "video_url": {"url": "file://v1.mp4"}}, + {"type": "video_url", "video_url": {"url": "file://v2.mp4"}}, + ], + } + ] + with self.assertRaises(ValueError) as ctx: + proc._check_mm_limits(messages) + self.assertIn("Too many video items", str(ctx.exception)) + + def test_messages_with_string_content_skipped(self): + """Messages with string content (not list) should be skipped.""" + proc = _make_processor(QWEN_VL) + proc.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} + messages = [ + {"role": "user", "content": "just text"}, + ] + proc._check_mm_limits(messages) # should not raise + + +# =================================================================== +# _get_processor_cache / _update_processor_cache +# =================================================================== +class TestProcessorCache(unittest.TestCase): + + def test_get_processor_cache(self): + """Lines 255-260: retrieve cached results via socket.""" + proc = _make_processor(QWEN_VL) + mock_socket = MagicMock() + mm_hashes = ["hash1", "hash2"] + expected_items = [{"data": "item1"}, {"data": "item2"}] + mock_socket.recv_multipart.return_value = 
[b"", pickle.dumps(expected_items)] + + result = proc._get_processor_cache(mock_socket, mm_hashes) + + mock_socket.send_multipart.assert_called_once() + self.assertEqual(result, expected_items) + + def test_update_processor_cache(self): + """Lines 264-266: update cache via socket.""" + proc = _make_processor(QWEN_VL) + mock_socket = MagicMock() + mm_hashes = ["hash1"] + mm_items = [{"data": "item1"}] + + proc._update_processor_cache(mock_socket, mm_hashes, mm_items) + + mock_socket.send_multipart.assert_called_once() + sent_data = mock_socket.send_multipart.call_args[0][0] + self.assertEqual(sent_data[0], b"") + unpacked = pickle.loads(sent_data[1]) + self.assertEqual(unpacked, (mm_hashes, mm_items)) + + +# =================================================================== +# get_mm_max_tokens_per_item +# =================================================================== +class TestGetMmMaxTokensPerItem(unittest.TestCase): + + def test_ernie_returns_processor_result(self): + """Line 271: ernie delegates to processor.""" + proc = _make_processor(ERNIE4_5_VL) + proc.processor.get_mm_max_tokens_per_item.return_value = {"image": 512} + result = proc.get_mm_max_tokens_per_item(1024) + self.assertEqual(result, {"image": 512}) + + def test_non_ernie_returns_none(self): + """Line 272: non-ernie returns None.""" + proc = _make_processor(QWEN_VL) + self.assertIsNone(proc.get_mm_max_tokens_per_item(1024)) + + proc2 = _make_processor(QWEN3_VL) + self.assertIsNone(proc2.get_mm_max_tokens_per_item(1024)) + + +# =================================================================== +# _process_stop_tokens +# =================================================================== +class TestProcessStopTokens(unittest.TestCase): + + def test_qwen3_vl_stop_handling(self): + """Lines 348-353: qwen3_vl uses update_stop_seq differently.""" + proc = _make_processor(QWEN3_VL) + proc.update_stop_seq = MagicMock(return_value=([[100]], [1])) + request = {"stop": [""]} + 
proc._process_stop_tokens(request) + self.assertEqual(request["stop_token_ids"], [[100]]) + self.assertEqual(request["stop_seqs_len"], [1]) + + def test_qwen3_vl_no_stop(self): + """Lines 348-350: qwen3_vl with empty stop list.""" + proc = _make_processor(QWEN3_VL) + proc.update_stop_seq = MagicMock() + request = {"stop": []} + proc._process_stop_tokens(request) + proc.update_stop_seq.assert_not_called() + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_non_qwen3_uses_process_stop_token_ids(self, mock_process): + """Lines 354-355: non-qwen3 uses process_stop_token_ids utility.""" + proc = _make_processor(QWEN_VL) + proc.update_stop_seq = MagicMock() + request = {} + proc._process_stop_tokens(request) + mock_process.assert_called_once_with(request, proc.update_stop_seq) + + +# =================================================================== +# _process_bad_words +# =================================================================== +class TestProcessBadWords(unittest.TestCase): + + def test_with_bad_words(self): + """Lines 359-363: bad_words are processed.""" + proc = _make_processor(QWEN_VL) + proc.update_bad_words = MagicMock(return_value=[100, 200]) + request = {"bad_words": ["bad", "word"], "bad_words_token_ids": [50]} + proc._process_bad_words(request) + proc.update_bad_words.assert_called_once_with(["bad", "word"], [50]) + self.assertEqual(request["bad_words_token_ids"], [100, 200]) + + def test_without_bad_words(self): + """Lines 361: no bad_words means no processing.""" + proc = _make_processor(QWEN_VL) + proc.update_bad_words = MagicMock() + request = {} + proc._process_bad_words(request) + proc.update_bad_words.assert_not_called() + + +# =================================================================== +# _tokenize_request +# =================================================================== +class TestTokenizeRequest(unittest.TestCase): + + def test_prompt_token_ids_qwen3_vl(self): + """Lines 369-374: 
prompt_token_ids path for qwen3_vl.""" + proc = _make_processor(QWEN3_VL) + expected = {"input_ids": [1, 2, 3]} + proc.processor.prompt_token_ids2outputs.return_value = expected + + request = {"prompt_token_ids": [1, 2, 3], "messages": [{"role": "user", "content": "hi"}]} + result = proc._tokenize_request(request) + self.assertEqual(result, expected) + self.assertFalse(request.get("enable_thinking", True)) # default_thinking=False for qwen3_vl + + def test_prompt_token_ids_ernie(self): + """Lines 369-374: prompt_token_ids path for ernie.""" + proc = _make_processor(ERNIE4_5_VL) + expected = {"input_ids": [1, 2, 3]} + proc.processor.prompt_token_ids2outputs.return_value = expected + + request = {"prompt_token_ids": [1, 2, 3]} + result = proc._tokenize_request(request) + self.assertEqual(result, expected) + self.assertTrue(request.get("enable_thinking")) # default_thinking=True for ernie + + def test_prompt_path(self): + """Lines 376-384: prompt text path.""" + proc = _make_processor(QWEN_VL) + expected = {"input_ids": [10, 20]} + proc.processor.text2ids.return_value = expected + + request = {"prompt": "hello", "multimodal_data": {"image": [], "video": []}} + result = proc._tokenize_request(request) + proc.processor.text2ids.assert_called_once_with("hello", [], []) + self.assertEqual(result, expected) + + def test_prompt_path_ernie_sets_prompt_tokens(self): + """Lines 381-382: ernie sets prompt_tokens from prompt.""" + proc = _make_processor(ERNIE4_5_VL) + proc.processor.text2ids.return_value = {"input_ids": [1]} + + request = {"prompt": "test prompt"} + proc._tokenize_request(request) + self.assertEqual(request["prompt_tokens"], "test prompt") + + def test_messages_path(self): + """Lines 386-398: messages path.""" + proc = _make_processor(QWEN_VL) + expected = {"input_ids": [5, 6]} + proc.processor.request2ids.return_value = expected + + request = {"messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]} + result = proc._tokenize_request(request) 
+ proc.processor.request2ids.assert_called_once() + self.assertEqual(result, expected) + + def test_messages_path_with_chat_template_kwargs(self): + """Lines 389-394: chat_template_kwargs are merged into request.""" + proc = _make_processor(QWEN_VL) + proc.processor.request2ids.return_value = {"input_ids": [1]} + + request = { + "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}], + "chat_template_kwargs": {"enable_thinking": True}, + } + proc._tokenize_request(request) + self.assertTrue(request.get("enable_thinking")) + + def test_messages_path_chat_template_kwargs_no_overwrite(self): + """Lines 393: existing request keys are not overwritten.""" + proc = _make_processor(QWEN_VL) + proc.processor.request2ids.return_value = {"input_ids": [1]} + + request = { + "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}], + "chat_template_kwargs": {"enable_thinking": True}, + "enable_thinking": False, + } + proc._tokenize_request(request) + self.assertFalse(request["enable_thinking"]) + + def test_messages_path_invalid_chat_template_kwargs(self): + """Lines 395-396: non-dict chat_template_kwargs raises.""" + proc = _make_processor(QWEN_VL) + request = { + "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}], + "chat_template_kwargs": "invalid", + } + with self.assertRaises(ValueError) as ctx: + proc._tokenize_request(request) + self.assertIn("must be a dict", str(ctx.exception)) + + def test_no_input_raises(self): + """Lines 400-401: no prompt/messages/prompt_token_ids raises.""" + proc = _make_processor(QWEN_VL) + with self.assertRaises(ValueError) as ctx: + proc._tokenize_request({"request_id": "test"}) + self.assertIn("must contain", str(ctx.exception)) + + def test_prompt_path_no_multimodal_data(self): + """Lines 377: prompt with no multimodal_data passes None for images/videos.""" + proc = _make_processor(QWEN_VL) + proc.processor.text2ids.return_value = {"input_ids": [1]} + + request = {"prompt": 
"hello"} + proc._tokenize_request(request) + proc.processor.text2ids.assert_called_once_with("hello", None, None) + + +# =================================================================== +# _process_post_tokens +# =================================================================== +class TestProcessPostTokens(unittest.TestCase): + + def test_paddleocr_with_metadata_generated_tokens(self): + """Lines 405-408: paddleocr_vl appends via _append_completion_tokens_qwen.""" + proc = _make_processor(PADDLEOCR_VL) + proc._append_completion_tokens_qwen = MagicMock() + outputs = {"input_ids": [1, 2]} + request = {"metadata": {"generated_token_ids": [10, 11]}} + proc._process_post_tokens(request, outputs) + proc._append_completion_tokens_qwen.assert_called_once_with(outputs, [10, 11]) + + def test_paddleocr_without_metadata(self): + """Lines 405-406: paddleocr_vl with no metadata does nothing.""" + proc = _make_processor(PADDLEOCR_VL) + proc._append_completion_tokens_qwen = MagicMock() + outputs = {"input_ids": [1]} + proc._process_post_tokens({}, outputs) + proc._append_completion_tokens_qwen.assert_not_called() + + def test_non_paddleocr_with_completion_tokens(self): + """Lines 410-411: non-paddleocr uses append_completion_tokens.""" + proc = _make_processor(QWEN_VL) + proc.append_completion_tokens = MagicMock() + outputs = {"input_ids": [1]} + request = {"completion_token_ids": [5, 6]} + proc._process_post_tokens(request, outputs) + proc.append_completion_tokens.assert_called_once_with(outputs, [5, 6]) + + def test_non_paddleocr_without_completion_tokens(self): + """Lines 410: no completion_token_ids does nothing.""" + proc = _make_processor(QWEN_VL) + proc.append_completion_tokens = MagicMock() + outputs = {"input_ids": [1]} + proc._process_post_tokens({}, outputs) + proc.append_completion_tokens.assert_not_called() + + +# =================================================================== +# _apply_reasoning_parser +# 
=================================================================== +class TestApplyReasoningParser(unittest.TestCase): + + def test_basic_request_id(self): + """Lines 415-425: basic request_id (no underscore split).""" + proc = _make_processor(QWEN_VL) + proc.reasoning_parser = MagicMock() + proc.reasoning_parser.get_model_status.return_value = "think_start" + proc.model_status_dict = {} + + request = {"request_id": "req1", "prompt_token_ids": [1, 2, 3]} + proc._apply_reasoning_parser(request) + + self.assertEqual(proc.model_status_dict["req1"], "think_start") + self.assertTrue(request["enable_thinking"]) + + def test_compound_request_id(self): + """Lines 416-422: request_id with underscore is split.""" + proc = _make_processor(QWEN_VL) + proc.reasoning_parser = MagicMock() + proc.reasoning_parser.get_model_status.return_value = "think_end" + proc.model_status_dict = {} + + request = {"request_id": "req1_2", "prompt_token_ids": [1, 2], "n": 3} + proc._apply_reasoning_parser(request) + + # index=2, n=3 → range(6, 9) + for idx in [6, 7, 8]: + self.assertEqual(proc.model_status_dict[f"req1_{idx}"], "think_end") + self.assertFalse(request["enable_thinking"]) + + def test_compound_request_id_default_n(self): + """Lines 420: default n=1.""" + proc = _make_processor(QWEN_VL) + proc.reasoning_parser = MagicMock() + proc.reasoning_parser.get_model_status.return_value = "think_start" + proc.model_status_dict = {} + + request = {"request_id": "req1_0", "prompt_token_ids": [1]} + proc._apply_reasoning_parser(request) + + self.assertIn("req1_0", proc.model_status_dict) + self.assertTrue(request["enable_thinking"]) + + +# =================================================================== +# append_completion_tokens +# =================================================================== +class TestAppendCompletionTokens(unittest.TestCase): + + def test_ernie_dispatches_to_ernie_method(self): + """Lines 429-430: ernie dispatches to _append_completion_tokens_ernie.""" + proc = 
_make_processor(ERNIE4_5_VL) + proc._append_completion_tokens_ernie = MagicMock() + inputs = {"input_ids": [1]} + proc.append_completion_tokens(inputs, [2, 3]) + proc._append_completion_tokens_ernie.assert_called_once_with(inputs, [2, 3]) + + def test_non_ernie_dispatches_to_qwen_method(self): + """Lines 431-432: non-ernie dispatches to _append_completion_tokens_qwen.""" + proc = _make_processor(QWEN_VL) + proc._append_completion_tokens_qwen = MagicMock() + inputs = {"input_ids": [1]} + proc.append_completion_tokens(inputs, [2, 3]) + proc._append_completion_tokens_qwen.assert_called_once_with(inputs, [2, 3]) + + +class TestAppendCompletionTokensQwen(unittest.TestCase): + + def test_qwen_append(self): + """Lines 436-442: appends tokens, token_type_ids, position_ids for qwen.""" + proc = _make_processor(QWEN_VL) + multimodal_inputs = { + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], + "cur_position": 3, + } + proc._append_completion_tokens_qwen(multimodal_inputs, [4, 5]) + + self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 4, 5]) + self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0]) + self.assertEqual(multimodal_inputs["cur_position"], 5) + self.assertEqual(len(multimodal_inputs["position_ids"]), 2) + + +class TestAppendCompletionTokensErnie(unittest.TestCase): + + def test_ernie_append(self): + """Lines 446-453: appends tokens with IDS_TYPE_FLAG for ernie.""" + proc = _make_processor(ERNIE4_5_VL) + multimodal_inputs = { + "input_ids": [10, 20], + "token_type_ids": [IDS_TYPE_FLAG["text"], IDS_TYPE_FLAG["text"]], + "position_ids": [[0, 0, 0], [1, 1, 1]], + "cur_position": 2, + } + proc._append_completion_tokens_ernie(multimodal_inputs, [30, 40, 50]) + + self.assertEqual(multimodal_inputs["input_ids"], [10, 20, 30, 40, 50]) + self.assertEqual(len(multimodal_inputs["token_type_ids"]), 5) + self.assertTrue(all(t == IDS_TYPE_FLAG["text"] for t in 
multimodal_inputs["token_type_ids"])) + self.assertEqual(multimodal_inputs["position_ids"], [[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]]) + self.assertEqual(multimodal_inputs["cur_position"], 5) + + +# =================================================================== +# pack_outputs +# =================================================================== +class TestPackOutputs(unittest.TestCase): + + def test_qwen_with_images(self): + """Lines 457-474: qwen pack_outputs with image data.""" + proc = _make_processor(QWEN_VL) + outputs = { + "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])], + "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])], + "image_type_ids": [0, 1], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], + } + result = proc.pack_outputs(outputs) + + self.assertIsNotNone(result["images"]) + self.assertEqual(result["images"].shape[0], 4) + self.assertIsNotNone(result["grid_thw"]) + self.assertEqual(result["input_ids"].dtype, np.int64) + self.assertEqual(result["token_type_ids"].dtype, np.int64) + self.assertEqual(result["position_ids"].dtype, np.int64) + self.assertEqual(result["image_patch_id"], proc.processor.image_token_id) + self.assertEqual(result["video_patch_id"], proc.processor.video_token_id) + + def test_qwen_without_images(self): + """Lines 457-460: empty images set to None.""" + proc = _make_processor(QWEN_VL) + outputs = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2], + "token_type_ids": [0, 0], + "position_ids": [np.array([[0, 1], [0, 1], [0, 1]])], + } + result = proc.pack_outputs(outputs) + + self.assertIsNone(result["images"]) + self.assertIsNone(result["grid_thw"]) + self.assertIsNone(result["image_type_ids"]) + + def test_ernie_pack_outputs(self): + """Lines 475-477: ernie uses different position_ids handling.""" + proc = _make_processor(ERNIE4_5_VL) + proc.image_patch_id = 9999 + outputs = { + 
"images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2], + "token_type_ids": [0, 0], + "position_ids": [[0, 0, 0], [1, 1, 1]], + } + result = proc.pack_outputs(outputs) + + self.assertIsNone(result["images"]) + self.assertEqual(result["position_ids"].dtype, np.int64) + self.assertEqual(result["position_ids"].shape, (2, 3)) + self.assertEqual(result["image_patch_id"], 9999) + self.assertNotIn("video_patch_id", result) + + def test_paddleocr_with_images(self): + """Lines 470-474: paddleocr uses same path as qwen.""" + proc = _make_processor(PADDLEOCR_VL) + outputs = { + "images": [np.array([[1, 2]])], + "grid_thw": [np.array([1, 1, 2])], + "image_type_ids": [0], + "input_ids": [1], + "token_type_ids": [0], + "position_ids": [np.array([[0], [0], [0]])], + } + result = proc.pack_outputs(outputs) + + self.assertIsNotNone(result["images"]) + self.assertEqual(result["image_patch_id"], proc.processor.image_token_id) + self.assertEqual(result["video_patch_id"], proc.processor.video_token_id) + + +# =================================================================== +# process_request_dict (integration-level tests for flow coverage) +# =================================================================== +class TestProcessRequestDict(unittest.TestCase): + + def _make_mock_outputs(self): + return { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3, 4, 5], + "token_type_ids": [0, 0, 0, 0, 0], + "position_ids": [np.array([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]])], + } + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_qwen_vl_messages_flow(self, mock_stop): + """Lines 281-344: full flow for qwen_vl with messages.""" + proc = _make_processor(QWEN_VL) + proc.processor.request2ids.return_value = self._make_mock_outputs() + + request = { + "request_id": "test1", + "messages": [{"role": "user", "content": [{"type": "text", "text": "hello"}]}], + } + result = 
proc.process_request_dict(request, max_model_len=100) + + self.assertIn("prompt_token_ids", result) + self.assertIn("multimodal_inputs", result) + self.assertEqual(result["prompt_token_ids_len"], len(result["prompt_token_ids"])) + self.assertFalse(result.get("enable_thinking")) # qwen_vl sets False + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_qwen3_vl_with_prompt_token_ids(self, mock_stop): + """Lines 306-307: qwen3_vl with existing prompt_token_ids preserved.""" + proc = _make_processor(QWEN3_VL) + outputs = self._make_mock_outputs() + proc.processor.prompt_token_ids2outputs.return_value = outputs + + request = { + "request_id": "test2", + "prompt_token_ids": [10, 20, 30], + "messages": [{"role": "user", "content": "hi"}], + } + result = proc.process_request_dict(request, max_model_len=100) + + # prompt_token_ids should be preserved (not overwritten) + self.assertEqual(result["prompt_token_ids"], [10, 20, 30]) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_ernie_flow(self, mock_stop): + """Lines 291-295, 316-320, 328-329, 339-341: ernie-specific branches.""" + proc = _make_processor(ERNIE4_5_VL) + outputs = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + } + proc.processor.request2ids.return_value = outputs + + request = { + "request_id": "test3", + "messages": [{"role": "user", "content": [{"type": "text", "text": "hello"}]}], + } + result = proc.process_request_dict(request, max_model_len=100) + + self.assertIn("prompt_token_ids", result) + self.assertIn("logits_processors_args", result) + # ernie sets default reasoning_max_tokens when None + self.assertIn("reasoning_max_tokens", result) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_ernie_low_top_p(self, mock_stop): + """Lines 331-334: ernie with top_p below 
_SAMPLING_EPS.""" + proc = _make_processor(ERNIE4_5_VL) + proc.processor.request2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + } + + request = { + "request_id": "test4", + "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}], + "top_p": 0.0, + } + result = proc.process_request_dict(request, max_model_len=100) + + self.assertAlmostEqual(result["top_p"], _SAMPLING_EPS) + self.assertEqual(result["top_k"], 1) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_paddleocr_low_top_p(self, mock_stop): + """Lines 331-334: paddleocr with top_p below _SAMPLING_EPS.""" + proc = _make_processor(PADDLEOCR_VL) + proc.processor.request2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], + } + + request = { + "request_id": "test5", + "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}], + "top_p": 0.0, + } + result = proc.process_request_dict(request, max_model_len=100) + + self.assertAlmostEqual(result["top_p"], _SAMPLING_EPS) + self.assertEqual(result["top_k"], 1) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_qwen_vl_with_reasoning_parser(self, mock_stop): + """Lines 336-337: qwen_vl with reasoning parser (not qwen3).""" + proc = _make_processor(QWEN_VL) + mock_parser = MagicMock() + mock_parser.get_model_status.return_value = "think_start" + proc.reasoning_parser = mock_parser + proc.processor.request2ids.return_value = self._make_mock_outputs() + + request = { + "request_id": "test6", + "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}], + } + result = proc.process_request_dict(request, max_model_len=100) + + 
self.assertTrue(result["enable_thinking"]) + self.assertIn("test6", proc.model_status_dict) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_qwen3_skips_reasoning_parser(self, mock_stop): + """Lines 336: qwen3_vl does NOT apply reasoning parser.""" + proc = _make_processor(QWEN3_VL) + mock_parser = MagicMock() + proc.reasoning_parser = mock_parser + proc.processor.request2ids.return_value = self._make_mock_outputs() + + request = { + "request_id": "test7", + "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}], + } + proc.process_request_dict(request, max_model_len=100) + + mock_parser.get_model_status.assert_not_called() + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_ernie_response_max_tokens_with_thinking_disabled(self, mock_stop): + """Lines 339-341: ernie with response_max_tokens and enable_thinking=False.""" + proc = _make_processor(ERNIE4_5_VL) + proc.processor.request2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + } + + request = { + "request_id": "test8", + "messages": [{"role": "user", "content": [{"type": "text", "text": "hi"}]}], + "response_max_tokens": 10, + "enable_thinking": False, + } + result = proc.process_request_dict(request, max_model_len=100) + + self.assertLessEqual(result["max_tokens"], 10) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_prompt_truncation(self, mock_stop): + """Lines 313-314: prompt exceeding max_model_len is truncated.""" + proc = _make_processor(QWEN_VL) + long_ids = list(range(200)) + proc.processor.text2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": long_ids, + "token_type_ids": [0] * 200, + "position_ids": [np.array([list(range(200))] * 3)], + } + + request = {"request_id": "test9", "prompt": 
"hello " * 100} + result = proc.process_request_dict(request, max_model_len=50) + + self.assertLessEqual(len(result["prompt_token_ids"]), 49) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_max_tokens_default(self, mock_stop): + """Lines 322-324: max_tokens defaults to remaining model len.""" + proc = _make_processor(QWEN_VL) + proc.processor.text2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], + } + + request = {"request_id": "test10", "prompt": "hello"} + result = proc.process_request_dict(request, max_model_len=100) + + expected_max = 100 - len(result["prompt_token_ids"]) + self.assertEqual(result["max_tokens"], max(1, expected_max)) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_max_tokens_capped(self, mock_stop): + """Lines 325-326: user max_tokens capped by remaining model len.""" + proc = _make_processor(QWEN_VL) + proc.processor.text2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], + } + + request = {"request_id": "test11", "prompt": "hello", "max_tokens": 5000} + result = proc.process_request_dict(request, max_model_len=100) + + remaining = 100 - len(result["prompt_token_ids"]) + self.assertEqual(result["max_tokens"], remaining) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_paddleocr_skips_bad_words(self, mock_stop): + """Lines 288-289: paddleocr skips _process_bad_words.""" + proc = _make_processor(PADDLEOCR_VL) + proc.update_bad_words = MagicMock() + proc.processor.text2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2], + "token_type_ids": [0, 0], + "position_ids": [np.array([[0, 1], [0, 
1], [0, 1]])], + } + + request = {"request_id": "test12", "prompt": "hi", "bad_words": ["test"]} + proc.process_request_dict(request, max_model_len=100) + + proc.update_bad_words.assert_not_called() + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_eos_token_ids_not_overwritten(self, mock_stop): + """Lines 283-284: existing eos_token_ids preserved.""" + proc = _make_processor(QWEN_VL) + proc.processor.text2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2], + "token_type_ids": [0, 0], + "position_ids": [np.array([[0, 1], [0, 1], [0, 1]])], + } + + request = {"request_id": "test13", "prompt": "hi", "eos_token_ids": [99]} + result = proc.process_request_dict(request, max_model_len=100) + + self.assertEqual(result["eos_token_ids"], [99]) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_ernie_reasoning_max_tokens_default(self, mock_stop): + """Lines 328-329: ernie sets default reasoning_max_tokens.""" + proc = _make_processor(ERNIE4_5_VL) + proc.processor.request2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + } + + request = { + "request_id": "test14", + "messages": [{"role": "user", "content": [{"type": "text", "text": "hello"}]}], + } + result = proc.process_request_dict(request, max_model_len=100) + + self.assertIn("reasoning_max_tokens", result) + self.assertEqual(result["reasoning_max_tokens"], max(int(result["max_tokens"] * 0.8), 1)) + + @patch("fastdeploy.input.multimodal_processor.process_stop_token_ids") + def test_prompt_path_flow(self, mock_stop): + """Lines 297-299, 304-310: prompt path flow.""" + proc = _make_processor(QWEN_VL) + proc.processor.text2ids.return_value = { + "images": [], + "grid_thw": [], + "image_type_ids": [], + "input_ids": [1, 2, 3], + "token_type_ids": [0, 0, 0], + 
"position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], + } + + request = { + "request_id": "test15", + "prompt": "hello world", + } + result = proc.process_request_dict(request, max_model_len=100) + + self.assertEqual(result["prompt_token_ids"], [1, 2, 3]) + self.assertIn("multimodal_inputs", result) + + +# =================================================================== +# _init_mm_config (via _make_processor + direct attribute check) +# =================================================================== +class TestInitMmConfig(unittest.TestCase): + + def test_qwen_vl_sets_image_patch_id(self): + """Lines 174-175: qwen_vl/qwen3_vl sets image_patch_id from image_token_id.""" + proc = _make_processor(QWEN_VL) + proc.processor.image_token_id = 12345 + proc._init_mm_config() + self.assertEqual(proc.image_patch_id, 12345) + + def test_qwen3_vl_sets_image_patch_id(self): + proc = _make_processor(QWEN3_VL) + proc.processor.image_token_id = 67890 + proc._init_mm_config() + self.assertEqual(proc.image_patch_id, 67890) + + def test_paddleocr_sets_image_patch_id(self): + """Lines 176-177: paddleocr sets image_patch_id from processor.""" + proc = _make_processor(PADDLEOCR_VL) + proc.processor.image_patch_id = 11111 + proc._init_mm_config() + self.assertEqual(proc.image_patch_id, 11111) + + def test_ernie_sets_image_patch_id_and_spatial_conv(self): + """Lines 178-180: ernie sets image_patch_id and spatial_conv_size.""" + proc = _make_processor(ERNIE4_5_VL) + proc.processor.image_patch_id = 22222 + proc.processor.spatial_conv_size = 14 + proc._init_mm_config() + self.assertEqual(proc.image_patch_id, 22222) + self.assertEqual(proc.spatial_conv_size, 14) + + +# =================================================================== +# _load_tokenizer (just the branch coverage, actual loading is mocked) +# =================================================================== +class TestLoadTokenizer(unittest.TestCase): + + 
@patch("fastdeploy.input.multimodal_processor.MultiModalProcessor.__init__", return_value=None) + def test_auto_tokenizer_path(self, mock_init): + """Lines 123-125: non-ernie path loads AutoTokenizer.""" + proc = MultiModalProcessor.__new__(MultiModalProcessor) + proc.model_name_or_path = "/mock/model" + proc.tokenizer_type = "auto" + + with patch("fastdeploy.input.multimodal_processor.MultiModalProcessor._load_tokenizer") as mock_load: + mock_load.return_value = MagicMock() + mock_load.assert_called_once() + + +if __name__ == "__main__": + unittest.main() From d1663a316044b048d5d1be8ab0abfa4d0e20b5f4 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 1 Apr 2026 19:41:44 +0800 Subject: [PATCH 7/8] fix unit test & AdaptiveImageProcessor --- .../image_processors/adaptive_processor.py | 15 ++++++-------- tests/input/test_multimodal_processor.py | 20 ++++++++++--------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fastdeploy/input/image_processors/adaptive_processor.py b/fastdeploy/input/image_processors/adaptive_processor.py index e5ad4960391..4a5539bec57 100644 --- a/fastdeploy/input/image_processors/adaptive_processor.py +++ b/fastdeploy/input/image_processors/adaptive_processor.py @@ -454,6 +454,8 @@ def preprocess( if images is not None and not valid_images(images): raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + data = {} + if images is not None: pixel_values, vision_grid_thws = [], [] for img_idx, image in enumerate(images): @@ -479,10 +481,8 @@ def preprocess( vision_grid_thws.append(image_grid_thw) pixel_values = np.array(pixel_values) vision_grid_thws = np.array(vision_grid_thws) - data = { - "pixel_values": pixel_values, - "image_grid_thw": vision_grid_thws, - } + data["pixel_values"] = pixel_values + data["image_grid_thw"] = vision_grid_thws if videos is not None: pixel_values, vision_grid_thws = [], [] @@ -505,11 +505,8 @@ def preprocess( vision_grid_thws.append(video_grid_thw) pixel_values = np.array(pixel_values) vision_grid_thws = np.array(vision_grid_thws) - - data = { - "pixel_values_videos": pixel_values, - "video_grid_thw": vision_grid_thws, - } + data["pixel_values_videos"] = pixel_values + data["video_grid_thw"] = vision_grid_thws return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/tests/input/test_multimodal_processor.py b/tests/input/test_multimodal_processor.py index bfce5b302a9..8c0aabda1a6 100644 --- a/tests/input/test_multimodal_processor.py +++ b/tests/input/test_multimodal_processor.py @@ -1114,16 +1114,18 @@ def test_ernie_sets_image_patch_id_and_spatial_conv(self): # =================================================================== class TestLoadTokenizer(unittest.TestCase): - @patch("fastdeploy.input.multimodal_processor.MultiModalProcessor.__init__", return_value=None) - def test_auto_tokenizer_path(self, mock_init): - """Lines 123-125: non-ernie path loads AutoTokenizer.""" - proc = MultiModalProcessor.__new__(MultiModalProcessor) - proc.model_name_or_path = "/mock/model" - proc.tokenizer_type = "auto" + def test_auto_tokenizer_path(self): + """Lines 123-125: non-ernie path loads AutoTokenizer via paddleformers.""" + proc = _make_processor(QWEN_VL) + mock_tokenizer = MagicMock() + mock_auto_tokenizer = MagicMock() + 
mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer + + with patch.dict("sys.modules", {"paddleformers.transformers": MagicMock(AutoTokenizer=mock_auto_tokenizer)}): + result = proc._load_tokenizer() - with patch("fastdeploy.input.multimodal_processor.MultiModalProcessor._load_tokenizer") as mock_load: - mock_load.return_value = MagicMock() - mock_load.assert_called_once() + mock_auto_tokenizer.from_pretrained.assert_called_once_with("/mock/model", padding_side="left", use_fast=True) + self.assertEqual(result, mock_tokenizer) if __name__ == "__main__": From 47658fe3c7bc771d0c4ee5ce80bf99d2665f99e4 Mon Sep 17 00:00:00 2001 From: luukunn <981429396@qq.com> Date: Wed, 1 Apr 2026 21:17:49 +0800 Subject: [PATCH 8/8] Delete unused code --- .../image_processors/paddleocr_processor.py | 8 ++-- fastdeploy/input/multimodal_processor.py | 34 ++------------ tests/input/test_multimodal_processor.py | 45 ++----------------- 3 files changed, 10 insertions(+), 77 deletions(-) diff --git a/fastdeploy/input/image_processors/paddleocr_processor.py b/fastdeploy/input/image_processors/paddleocr_processor.py index a28f03075df..8c304defeb0 100644 --- a/fastdeploy/input/image_processors/paddleocr_processor.py +++ b/fastdeploy/input/image_processors/paddleocr_processor.py @@ -38,16 +38,14 @@ _OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] -def make_batched_images(images) -> List[List[ImageInput]]: +def make_batched_images(images) -> List[ImageInput]: """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - + Accepts images in list or nested list format, and makes a flat list of images for preprocessing. Args: images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): The input image. - Returns: - list: A list of images. + List[ImageInput]: A flat list of images. 
""" if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): return [img for img_list in images for img in img_list] diff --git a/fastdeploy/input/multimodal_processor.py b/fastdeploy/input/multimodal_processor.py index 143160b0fce..3284d19770a 100644 --- a/fastdeploy/input/multimodal_processor.py +++ b/fastdeploy/input/multimodal_processor.py @@ -21,7 +21,6 @@ single class that dispatches per ``model_type``. """ -import pickle from collections.abc import Mapping from typing import Any, Dict, Optional @@ -57,8 +56,6 @@ "video_fps": int, } -_TYPES_ACCEPT_URL_SUFFIX = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL} - _DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1} _SAMPLING_EPS = 1e-5 @@ -227,22 +224,14 @@ def _check_mm_limits(self, item): mm_data = item else: mm_data = {"image": [], "video": []} - accept_url_suffix = self.model_type in _TYPES_ACCEPT_URL_SUFFIX - for message in item: if isinstance(message.get("content"), list): for part in message["content"]: part_type = part.get("type") - if accept_url_suffix: - if part_type in ("image_url", "image"): - mm_data["image"].append(part) - elif part_type in ("video_url", "video"): - mm_data["video"].append(part) - else: - if part_type == "image": - mm_data["image"].append(part) - elif part_type == "video": - mm_data["video"].append(part) + if part_type in ("image_url", "image"): + mm_data["image"].append(part) + elif part_type in ("video_url", "video"): + mm_data["video"].append(part) for modality, data in mm_data.items(): if modality in self.limit_mm_per_prompt: @@ -250,21 +239,6 @@ def _check_mm_limits(self, item): if len(data) > limit: raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}") - def _get_processor_cache(self, socket, mm_hashes: list) -> list: - """Retrieve cached processor results for the given hashes.""" - req = pickle.dumps(mm_hashes) - socket.send_multipart([b"", req]) - _, resp = socket.recv_multipart() - 
mm_items = pickle.loads(resp) - data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}") - return mm_items - - def _update_processor_cache(self, socket, mm_hashes: list, mm_items): - """Update the processor cache with new results.""" - req = pickle.dumps((mm_hashes, mm_items)) - socket.send_multipart([b"", req]) - data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}") - def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]: """Return per-modality max token counts, if available.""" if self.model_type == ERNIE4_5_VL: diff --git a/tests/input/test_multimodal_processor.py b/tests/input/test_multimodal_processor.py index 8c0aabda1a6..5f0e781b6c5 100644 --- a/tests/input/test_multimodal_processor.py +++ b/tests/input/test_multimodal_processor.py @@ -14,7 +14,6 @@ # limitations under the License. """ -import pickle import unittest from unittest.mock import MagicMock, patch @@ -106,13 +105,9 @@ class TestMultiModalProcessorInitValidation(unittest.TestCase): def test_unsupported_model_type_raises(self): """Line 86: unsupported model_type should raise ValueError.""" - with self.assertRaises(Exception): - # We need to let __init__ run the model_type check. - # Mock the parts that come after the check to isolate it. 
- with patch.object(MultiModalProcessor, "__init__", wraps=MultiModalProcessor.__init__) as _: - proc = object.__new__(MultiModalProcessor) - # Call the real __init__ which should fail on model_type check - MultiModalProcessor.__init__(proc, "/mock", model_type="unsupported_type") + with self.assertRaises(ValueError): + # Directly construct with unsupported model_type to trigger validation + MultiModalProcessor("/mock", model_type="unsupported_type") # =================================================================== @@ -307,40 +302,6 @@ def test_messages_with_string_content_skipped(self): proc._check_mm_limits(messages) # should not raise -# =================================================================== -# _get_processor_cache / _update_processor_cache -# =================================================================== -class TestProcessorCache(unittest.TestCase): - - def test_get_processor_cache(self): - """Lines 255-260: retrieve cached results via socket.""" - proc = _make_processor(QWEN_VL) - mock_socket = MagicMock() - mm_hashes = ["hash1", "hash2"] - expected_items = [{"data": "item1"}, {"data": "item2"}] - mock_socket.recv_multipart.return_value = [b"", pickle.dumps(expected_items)] - - result = proc._get_processor_cache(mock_socket, mm_hashes) - - mock_socket.send_multipart.assert_called_once() - self.assertEqual(result, expected_items) - - def test_update_processor_cache(self): - """Lines 264-266: update cache via socket.""" - proc = _make_processor(QWEN_VL) - mock_socket = MagicMock() - mm_hashes = ["hash1"] - mm_items = [{"data": "item1"}] - - proc._update_processor_cache(mock_socket, mm_hashes, mm_items) - - mock_socket.send_multipart.assert_called_once() - sent_data = mock_socket.send_multipart.call_args[0][0] - self.assertEqual(sent_data[0], b"") - unpacked = pickle.loads(sent_data[1]) - self.assertEqual(unpacked, (mm_hashes, mm_items)) - - # =================================================================== # get_mm_max_tokens_per_item # 
===================================================================