Merge branch 'main' of https://github.com/ModelTC/LightLLM into support_gemma4

WANDY666 · WANDY666 · commit 819497ce0b23 · 2026-05-21T04:25:03.000Z
diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst
@@ -272,6 +272,18 @@ PD 分离模式参数
 
     多模态资源的缓存服务器容量，默认为 ``200``
 
+.. option:: --max_image_token_count
+
+    单张图片在转换为 token 后允许的最大 token 数量，默认为 ``8192``
+
+    当任意图片超过该阈值时，请求会被拒绝。
+
+.. option:: --max_image_pixels
+
+    单张图片在预处理缩放前允许的最大像素数量，默认为 ``8294400``（约等于 4K 图片像素总量）。
+
+    当输入图片超过该阈值时，LightLLM 会先自动将其缩放到该像素预算内，再继续后续流程。
+
 .. option:: --visual_infer_batch_size
 
     每次推理批次中处理的图像数量，默认为 ``1``
diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst
@@ -270,6 +270,18 @@ Multimodal Parameters
 
     Cache server capacity for multimodal resources, default is ``200``
 
+.. option:: --max_image_token_count
+
+    Maximum allowed token count for a single image after tokenization, default is ``8192``
+
+    Requests are rejected when any image exceeds this limit.
+
+.. option:: --max_image_pixels
+
+    Maximum allowed pixel count for a single image before preprocessing resize, default is ``8294400`` (about 4K image pixels).
+
+    If an input image exceeds this threshold, LightLLM automatically resizes it down to this pixel budget before continuing.
+
 .. option:: --visual_infer_batch_size
 
     Number of images processed in each inference batch, default is ``1``
diff --git a/lightllm/models/gpt_oss/layer_infer/transformer_layer_infer.py b/lightllm/models/gpt_oss/layer_infer/transformer_layer_infer.py
@@ -65,7 +65,7 @@ def _context_attention_kernel(
         out=None,
     ):
         if self.network_config_["layer_types"][self.layer_num_] == "sliding_attention":
-            window_size = (self.sliding_window - 1, self.sliding_window - 1)
+            window_size = (self.sliding_window - 1, 0)
             use_sliding_window = True
         else:
             window_size = (-1, -1)
@@ -92,7 +92,7 @@ def _token_attention_kernel(
         self, q: torch.Tensor, infer_state: LlamaInferStateInfo, layer_weight: GptOssTransformerLayerWeight, out=None
     ):
         if self.network_config_["layer_types"][self.layer_num_] == "sliding_attention":
-            window_size = (self.sliding_window - 1, self.sliding_window - 1)
+            window_size = (self.sliding_window - 1, 0)
             use_sliding_window = True
         else:
             window_size = (-1, -1)
diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
@@ -443,6 +443,18 @@ def make_argument_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--cache_capacity", type=int, default=200, help="cache server capacity for multimodal resources"
     )
+    parser.add_argument(
+        "--max_image_token_count",
+        type=int,
+        default=8192,
+        help="maximum allowed token count for one image after tokenization",
+    )
+    parser.add_argument(
+        "--max_image_pixels",
+        type=int,
+        default=8294400,
+        help="maximum allowed pixel count for one image before resize preprocessing",
+    )
     parser.add_argument(
         "--embed_cache_storage_size",
         type=float,
diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
@@ -95,6 +95,8 @@ class StartArgs:
     enable_decode_microbatch_overlap: bool = field(default=False)
     enable_prefill_microbatch_overlap: bool = field(default=False)
     cache_capacity: int = field(default=200)
+    max_image_token_count: int = field(default=8192)
+    max_image_pixels: int = field(default=8294400)
     embed_cache_storage_size: float = field(default=4)
     data_type: Optional[str] = field(
         default=None, metadata={"choices": ["fp16", "float16", "bf16", "bfloat16", "fp32", "float32"]}
diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
@@ -181,6 +181,17 @@ async def _alloc_resource(self, items, md5sums, token_nums, datas):
             self.cache_client.root.set_items_data(update_data_ids)
         return
 
+    def _assert_image_token_count(self, token_num: int):
+        """Raise ValueError (after logging a warning) if token_num exceeds args.max_image_token_count."""
+        if token_num > self.args.max_image_token_count:
+            err_msg = (
+                f"single image token count {token_num} exceeds max_image_token_count {self.args.max_image_token_count}."
+                f" You can increase this limit by setting --max_image_token_count to a larger value when starting "
+                f"LightLLM. Warning: increasing this limit raises runtime OOM risk."
+            )
+            logger.warning(err_msg)
+            raise ValueError(err_msg)
+
     async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams, sampling_params: SamplingParams):
         # 只有 P 和 NORMAL 节点需要真的管理多模态资源
         if self.pd_mode.is_P_or_NORMAL():
@@ -190,6 +201,7 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
                 data = img.read()
                 # must after init_imageitem_extral_params
                 token_num = self.tokenizer.get_image_token_length(img)
+                self._assert_image_token_count(token_num)
                 md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
                 md5sums.append(md5sum)
                 img.md5 = md5sum
@@ -245,7 +257,9 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar
         for img in multimodal_params.images:
             img_count += 1
             self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params)
-            image_tokens += self.tokenizer.get_image_token_length(img)
+            token_num = self.tokenizer.get_image_token_length(img)
+            self._assert_image_token_count(token_num)
+            image_tokens += token_num
         for audio in multimodal_params.audios:
             audio_count += 1
             self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params)
diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py
@@ -81,7 +81,16 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar
         for img in multimodal_params.images:
             img_count += 1
             self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params)
-            image_tokens += self.tokenizer.get_image_token_length(img)
+            token_num = self.tokenizer.get_image_token_length(img)
+            if token_num > self.args.max_image_token_count:
+                err_msg = (
+                    f"the image token count {token_num} > max_image_token_count {self.args.max_image_token_count}. "
+                    f"You can increase this limit by setting --max_image_token_count to a larger value when starting "
+                    f"LightLLM. Warning: increasing this limit raises runtime OOM risk."
+                )
+                logger.warning(err_msg)
+                raise ValueError(err_msg)
+            image_tokens += token_num
         for audio in multimodal_params.audios:
             audio_count += 1
             self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params)
diff --git a/lightllm/server/multimodal_params.py b/lightllm/server/multimodal_params.py
@@ -4,7 +4,7 @@
 import librosa
 import base64
 import numpy as np
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 from io import BytesIO
 from concurrent.futures import ThreadPoolExecutor
 from PIL import Image, ImageFile
@@ -13,6 +13,7 @@
 from lightllm.utils.error_utils import ClientDisconnected
 from lightllm.utils.multimodal_utils import fetch_resource
 from lightllm.utils.log_utils import init_logger
+from lightllm.utils.envs_utils import get_env_start_args
 
 
 logger = init_logger(__name__)
@@ -131,6 +132,9 @@ def __init__(self, **kwargs):
         self.extra_params = {}
 
     async def preload(self, request: Request):
+
+        max_image_pixels = get_env_start_args().max_image_pixels
+
         try:
             if self._type == "url":
                 timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
@@ -141,8 +145,14 @@ async def preload(self, request: Request):
             elif self._type == "image_size":
                 # image_size 代表直接传入图片的 width，height，主要是用于一些场景
                 # 的 token 计数判断, 所以只需要图片长宽信息，不需要具体图片的内容信息
-                self.image_w = self._data[0]
-                self.image_h = self._data[1]
+                src_w = self._data[0]
+                src_h = self._data[1]
+                self.image_w, self.image_h = _resize_image_dimensions_if_needed(src_w, src_h, max_image_pixels)
+                if (self.image_w, self.image_h) != (src_w, src_h):
+                    logger.warning(
+                        f"image_size pixels {src_w * src_h} exceed max_image_pixels={max_image_pixels}, "
+                        f"resized to {self.image_w}x{self.image_h}"
+                    )
                 return
             else:
                 raise ValueError(f"cannot read image which type is {self._type}!")
@@ -151,7 +161,24 @@ async def preload(self, request: Request):
             # Decoding is mainly done in the C libraries (libjpeg/libpng/libwebp), which releases the GIL,
             # and multiple threads can achieve true parallelism.
             loop = asyncio.get_running_loop()
-            self.image_w, self.image_h = await loop.run_in_executor(_IMAGE_VERIFY_POOL, _verify_image_bytes, img_data)
+            # 1) Verify original input bytes first.
+            src_w, src_h = await loop.run_in_executor(_IMAGE_VERIFY_POOL, _verify_image_bytes, img_data)
+            # 2) Resize (or no-op) after verification.
+            img_data, resized_w, resized_h = await loop.run_in_executor(
+                _IMAGE_VERIFY_POOL,
+                _resize_image_bytes_if_needed,
+                img_data,
+                src_w,
+                src_h,
+                max_image_pixels,
+            )
+            self.image_w, self.image_h = resized_w, resized_h
+
+            if (resized_w, resized_h) != (src_w, src_h):
+                logger.warning(
+                    f"image pixels {src_w * src_h} exceed max_image_pixels={max_image_pixels},"
+                    f" resized to {self.image_w}x{self.image_h}"
+                )
 
             self._preload_data = img_data
             return
@@ -245,3 +272,45 @@ def _verify_image_bytes(img_data: bytes) -> Tuple[int, int]:
         w, h = image.size
         image.load()
     return w, h
+
+
+def _resize_image_bytes_if_needed(
+    img_data: bytes, src_w: int, src_h: int, max_image_pixels: int
+) -> Tuple[bytes, int, int]:
+    """
+    Fit image bytes into max_image_pixels and return (bytes, width, height).
+    In-budget input is returned untouched; otherwise it is resized and re-encoded as RGB JPEG.
+    """
+    new_w, new_h = _resize_image_dimensions_if_needed(src_w, src_h, max_image_pixels)
+    if (new_w, new_h) == (src_w, src_h):
+        return img_data, src_w, src_h
+    with Image.open(BytesIO(img_data)) as image:
+        resampling = Image.Resampling.LANCZOS if hasattr(Image, "Resampling") else Image.LANCZOS
+        # Convert first: Pillow forces NEAREST when resizing mode "1"/"P" images.
+        resized_image = image.convert("RGB").resize((new_w, new_h), resampling)
+        buffer = BytesIO()
+        resized_image.save(buffer, format="JPEG", quality=96, optimize=True)
+        return buffer.getvalue(), new_w, new_h
+
+
+def _resize_image_dimensions_if_needed(src_w: int, src_h: int, max_image_pixels: int) -> Tuple[int, int]:
+    """
+    Compute resized (w, h) under a max pixel budget, keeping the aspect ratio where possible.
+    Sizes within the budget are returned unchanged; raises ValueError if the budget is below 1.
+    """
+    old_pixels = src_w * src_h
+    if old_pixels <= max_image_pixels:
+        return src_w, src_h
+    if max_image_pixels < 1:
+        raise ValueError(f"max_image_pixels must be >= 1, got {max_image_pixels}")
+
+    scale = (max_image_pixels / old_pixels) ** 0.5
+    new_w = max(1, int(src_w * scale))
+    new_h = max(1, int(src_h * scale))
+    # Clamping a side up to 1 can exceed the budget; cap the longer side in O(1), not by looping.
+    if new_w * new_h > max_image_pixels:
+        if new_w >= new_h:
+            new_w = max(1, max_image_pixels // new_h)
+        else:
+            new_h = max(1, max_image_pixels // new_w)
+    return new_w, new_h
diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
@@ -436,9 +436,11 @@ def _generate_new_batch(self):
         new_batch = self.req_queue.generate_new_batch(
             Batch.merge_two_batch(self.running_batch, self.schedule_new_batch)
         )
+
+        if new_batch is not None and len(new_batch.reqs) > 0:
+            logger.info(f"generate new batch, {new_batch.simple_log()}")
+
         self.schedule_new_batch = Batch.merge_two_batch(self.schedule_new_batch, new_batch)
-        if self.schedule_new_batch is not None:
-            logger.info(f"gen new batch, {self.schedule_new_batch.simple_log()}")
         return
 
     def _multinode_tp_generate_new_batch(self):
diff --git a/test/advanced_config/mixed_quantization/qwen3_5-122b-moe-only-fp8.yaml b/test/advanced_config/mixed_quantization/qwen3_5-122b-moe-only-fp8.yaml
@@ -0,0 +1,4 @@
+quant_type: none
+mix_bits:
+  - name: "fused_moe"
+    quant_type: "deepgemm-fp8w8a8-b128"
diff --git a/test/benchmark/service/benchmark_multiturn.py b/test/benchmark/service/benchmark_multiturn.py
diff --git a/test/start_scripts/qwen35/122b.sh b/test/start_scripts/qwen35/122b.sh