Merge branch 'support_gemma4' of https://github.com/ModelTC/LightLLM into support_gemma4

WANDY666 · WANDY666 · commit 91051f0852f0 · 2026-05-20T07:10:02.000Z
diff --git a/lightllm/models/gemma4/layer_infer/pre_layer_infer.py b/lightllm/models/gemma4/layer_infer/pre_layer_infer.py
@@ -5,6 +5,7 @@
 from lightllm.models.llama.layer_infer.pre_layer_infer import LlamaPreLayerInfer
 from lightllm.models.qwen_vl.layer_infer.pre_layer_infer import LlamaMultimodalPreLayerInfer
 from lightllm.utils.envs_utils import get_env_start_args
+from lightllm.common.basemodel.triton_kernel.multimodal_emb import multimodal_emb
 
 
 class Gemma4PreLayerInfer(LlamaMultimodalPreLayerInfer):
@@ -67,3 +68,31 @@ def _tpsp_sp_split(self, input: torch.Tensor, infer_state):
             assert not self.has_ple, "gemma4 PLE + enable_tpsp_mix_mode not implemented"
             return super()._tpsp_sp_split(input=input, infer_state=infer_state)
         return input
+
+    def _multimodal_emb(
+        self,
+        out: torch.Tensor,
+        input_ids: torch.Tensor,
+        layer_weight,
+        embed_cache: torch.Tensor,
+        img_token_lens: torch.Tensor,
+        img_start_token_ids: torch.Tensor,
+        img_start_locs_in_cache: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Gemma4 override of the embed step: also passes self.multimodal_text_embed_scale_ as text_embed_scale.
+        """
+        multimodal_emb(
+            out=out,
+            prompt_ids=input_ids,
+            text_weight_embs=layer_weight.wte_weight_.weight,
+            embed_cache=embed_cache,
+            img_token_lens=img_token_lens,
+            img_start_token_ids=img_start_token_ids,
+            img_start_locs_in_cache=img_start_locs_in_cache,
+            tp_text_start_token_id=layer_weight.wte_weight_.tp_vocab_start_id,
+            tp_text_end_token_id=layer_weight.wte_weight_.tp_vocab_end_id,
+            tp_world_size=self.tp_world_size_,
+            text_embed_scale=self.multimodal_text_embed_scale_,
+        )
+        return  # NOTE(review): returns None although annotated -> torch.Tensor
diff --git a/lightllm/models/gemma4/model.py b/lightllm/models/gemma4/model.py
@@ -1,9 +1,7 @@
-import math
 import os
 import json
 import torch
 from lightllm.models.registry import ModelRegistry
-from lightllm.common.basemodel.multimodal_tokenizer import BaseMultiModalTokenizer
 from lightllm.common.basemodel.attention.triton.fp import TritonAttBackend
 from lightllm.common.kv_cache_mem_manager.mem_utils import select_mem_manager_class
 from lightllm.common.build_utils import repair_config
@@ -21,90 +19,6 @@
 logger = init_logger(__name__)
 
 
-class Gemma4Tokenizer(BaseMultiModalTokenizer):
-    def __init__(self, tokenizer, model_cfg, image_processor=None):
-        super().__init__(tokenizer)
-        self.image_token_index = model_cfg.get("image_token_id", 258880)
-        self.boi_token_index = model_cfg.get("boi_token_id", 255999)
-        self.eoi_token_index = model_cfg.get("eoi_token_id", 258882)
-        self.image_processor = image_processor
-        self.image_length = model_cfg.get("vision_soft_tokens_per_image", 280)
-        self.patch_size = getattr(self.image_processor, "patch_size", 16)
-        self.pooling_kernel_size = getattr(self.image_processor, "pooling_kernel_size", 3)
-        self.max_soft_tokens = getattr(self.image_processor, "max_soft_tokens", self.image_length)
-        # HF Gemma-4 tokenizer does not prepend BOS even with add_special_tokens=True.
-        self.bos_token_id = tokenizer.bos_token_id
-
-    def init_imageitem_extral_params(self, img, multi_params, sampling_params):
-        return
-
-    def init_audioitem_extral_params(self, audio, multi_params, sampling_params):
-        raise NotImplementedError
-
-    def get_image_token_length(self, img):
-        if self.image_processor is None or img.image_w <= 0 or img.image_h <= 0:
-            return self.image_length
-
-        patch, kernel = self.patch_size, self.pooling_kernel_size
-        unit = patch * kernel
-        num_patches_orig = (img.image_h / patch) * (img.image_w / patch)
-        scale = math.sqrt(self.max_soft_tokens * kernel ** 2 / num_patches_orig)
-        target_h = max(unit, int(math.floor(img.image_h * scale / unit)) * unit)
-        target_w = max(unit, int(math.floor(img.image_w * scale / unit)) * unit)
-        num_patches = (target_h // patch) * (target_w // patch)
-        return min(num_patches // kernel ** 2, self.max_soft_tokens)
-
-    def get_audio_token_length(self, audio):
-        raise NotImplementedError
-
-    def encode(self, prompt, multimodal_params=None, add_special_tokens=False):
-        origin_ids = self.tokenizer(prompt, add_special_tokens=False).input_ids
-        if (
-            add_special_tokens
-            and self.bos_token_id is not None
-            and (len(origin_ids) == 0 or origin_ids[0] != self.bos_token_id)
-        ):
-            origin_ids = [self.bos_token_id] + origin_ids
-
-        images = [] if multimodal_params is None else getattr(multimodal_params, "images", [])
-        if not images:
-            return origin_ids
-
-        input_ids = []
-        image_id = 0
-        start = 0
-        while True:
-            try:
-                image_start = origin_ids.index(self.image_token_index, start)
-            except ValueError:
-                break
-
-            input_ids.extend(origin_ids[start:image_start])
-            image_end = image_start + 1
-            while image_end < len(origin_ids) and origin_ids[image_end] == self.image_token_index:
-                image_end += 1
-            if image_id >= len(images):
-                raise ValueError("image token error")
-
-            img = images[image_id]
-            if not input_ids or input_ids[-1] != self.boi_token_index:
-                input_ids.append(self.boi_token_index)
-            img.start_idx = len(input_ids)
-            input_ids.extend(range(img.token_id, img.token_id + img.token_num))
-            input_ids.append(self.eoi_token_index)
-
-            if image_end < len(origin_ids) and origin_ids[image_end] == self.eoi_token_index:
-                image_end += 1
-            start = image_end
-            image_id += 1
-
-        input_ids.extend(origin_ids[start:])
-        image_cnt = len(images)
-        if image_cnt != image_id:
-            raise ValueError(f"invalid image tag num: {image_cnt} vs {image_id}!")
-        return input_ids
-
-
 @ModelRegistry("gemma4", is_multimodal=True)
 class Gemma4TpPartModel(LlamaTpPartModel):
     pre_and_post_weight_class = Gemma4PreAndPostLayerWeight
diff --git a/lightllm/models/gemma4/tokenizer.py b/lightllm/models/gemma4/tokenizer.py
@@ -0,0 +1,93 @@
+import math
+
+from lightllm.common.basemodel.multimodal_tokenizer import BaseMultiModalTokenizer
+from lightllm.server.core.objs.sampling_params import SamplingParams
+from lightllm.server.multimodal_params import AudioItem, ImageItem, MultimodalParams
+
+
+class Gemma4Tokenizer(BaseMultiModalTokenizer):  # rewrites image tokens in a prompt into per-image id ranges
+    def __init__(self, tokenizer, model_cfg, image_processor=None):
+        super().__init__(tokenizer)
+        self.image_token_index = model_cfg.get("image_token_id", 258880)
+        self.boi_token_index = model_cfg.get("boi_token_id", 255999)
+        self.eoi_token_index = model_cfg.get("eoi_token_id", 258882)
+        self.image_processor = image_processor
+        self.image_length = model_cfg.get("vision_soft_tokens_per_image", 280)  # fallback tokens per image
+        self.patch_size = getattr(self.image_processor, "patch_size", 16)
+        self.pooling_kernel_size = getattr(self.image_processor, "pooling_kernel_size", 3)
+        self.max_soft_tokens = getattr(self.image_processor, "max_soft_tokens", self.image_length)
+        # HF Gemma-4 tokenizer does not prepend BOS even with add_special_tokens=True.
+        self.bos_token_id = tokenizer.bos_token_id
+
+    def init_imageitem_extral_params(
+        self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
+    ):
+        return  # nothing extra is set on image items here
+
+    def init_audioitem_extral_params(
+        self, audio: AudioItem, multi_params: MultimodalParams, sampling_params: SamplingParams
+    ):
+        raise NotImplementedError  # audio items are not handled by this tokenizer
+
+    def get_image_token_length(self, img: ImageItem):
+        if self.image_processor is None or img.image_w <= 0 or img.image_h <= 0:
+            return self.image_length  # no processor or no valid size: use the configured fixed length
+
+        patch, kernel = self.patch_size, self.pooling_kernel_size
+        unit = patch * kernel  # resized sides are rounded down to multiples of this, never below one unit
+        num_patches_orig = (img.image_h / patch) * (img.image_w / patch)
+        scale = math.sqrt(self.max_soft_tokens * kernel ** 2 / num_patches_orig)  # per-side resize factor
+        target_h = max(unit, int(math.floor(img.image_h * scale / unit)) * unit)
+        target_w = max(unit, int(math.floor(img.image_w * scale / unit)) * unit)
+        num_patches = (target_h // patch) * (target_w // patch)
+        return min(num_patches // kernel ** 2, self.max_soft_tokens)  # one token per kernel x kernel patches
+
+    def get_audio_token_length(self, audio: AudioItem):
+        raise NotImplementedError
+
+    def encode(self, prompt, multimodal_params: MultimodalParams = None, add_special_tokens=False):
+        origin_ids = self.tokenizer(prompt, add_special_tokens=False).input_ids
+        if (
+            add_special_tokens
+            and self.bos_token_id is not None
+            and (len(origin_ids) == 0 or origin_ids[0] != self.bos_token_id)
+        ):
+            origin_ids = [self.bos_token_id] + origin_ids  # BOS is added here, not by the tokenizer call
+
+        images = [] if multimodal_params is None else getattr(multimodal_params, "images", [])
+        if not images:
+            return origin_ids  # no images: nothing to expand
+
+        input_ids = []
+        image_id = 0
+        start = 0
+        while True:
+            try:
+                image_start = origin_ids.index(self.image_token_index, start)
+            except ValueError:
+                break
+
+            input_ids.extend(origin_ids[start:image_start])
+            image_end = image_start + 1
+            while image_end < len(origin_ids) and origin_ids[image_end] == self.image_token_index:
+                image_end += 1  # skip the rest of a run of consecutive image tokens
+            if image_id >= len(images):
+                raise ValueError("image token error")  # more image-token runs in the prompt than images
+
+            img = images[image_id]
+            if not input_ids or input_ids[-1] != self.boi_token_index:
+                input_ids.append(self.boi_token_index)  # only when the prompt did not already supply BOI
+            img.start_idx = len(input_ids)  # position of the image's first token in the expanded prompt
+            input_ids.extend(range(img.token_id, img.token_id + img.token_num))
+            input_ids.append(self.eoi_token_index)
+
+            if image_end < len(origin_ids) and origin_ids[image_end] == self.eoi_token_index:
+                image_end += 1  # drop the prompt's own EOI; one was appended above
+            start = image_end
+            image_id += 1
+
+        input_ids.extend(origin_ids[start:])
+        image_cnt = len(images)
+        if image_cnt != image_id:
+            raise ValueError(f"invalid image tag num: {image_cnt} vs {image_id}!")  # fewer runs than images
+        return input_ids
diff --git a/lightllm/models/qwen_vl/layer_infer/pre_layer_infer.py b/lightllm/models/qwen_vl/layer_infer/pre_layer_infer.py
@@ -70,19 +70,42 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
             img_start_locs_in_cache, dtype=torch.long, device="cpu", pin_memory=True
         ).cuda(non_blocking=True)
 
+        self._multimodal_emb(
+            out=out,
+            input_ids=input_ids,
+            layer_weight=layer_weight,
+            embed_cache=cpu_embed_cache_tensor,
+            img_token_lens=img_token_lens,
+            img_start_token_ids=img_start_token_ids,
+            img_start_locs_in_cache=img_start_locs_in_cache,
+        )
+        if self.tp_world_size_ > 1:
+            all_reduce(out, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
+        return out
+
+    def _multimodal_emb(
+        self,
+        out: torch.Tensor,
+        input_ids: torch.Tensor,
+        layer_weight: LlamaPreAndPostLayerWeight,
+        embed_cache: torch.Tensor,
+        img_token_lens: torch.Tensor,
+        img_start_token_ids: torch.Tensor,
+        img_start_locs_in_cache: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Hook that subclasses can override to change how the multimodal embedding is computed.
+        """
         multimodal_emb(
             out=out,
             prompt_ids=input_ids,
             text_weight_embs=layer_weight.wte_weight_.weight,
-            embed_cache=cpu_embed_cache_tensor,
+            embed_cache=embed_cache,
             img_token_lens=img_token_lens,
             img_start_token_ids=img_start_token_ids,
             img_start_locs_in_cache=img_start_locs_in_cache,
             tp_text_start_token_id=layer_weight.wte_weight_.tp_vocab_start_id,
             tp_text_end_token_id=layer_weight.wte_weight_.tp_vocab_end_id,
             tp_world_size=self.tp_world_size_,
-            text_embed_scale=getattr(self, "multimodal_text_embed_scale_", 1.0),
         )
-        if self.tp_world_size_ > 1:
-            all_reduce(out, group=infer_state.dist_group, op=dist.ReduceOp.SUM, async_op=False)
-        return out
+        return  # NOTE(review): returns None although annotated -> torch.Tensor
diff --git a/lightllm/server/tokenizer.py b/lightllm/server/tokenizer.py
@@ -31,7 +31,7 @@
 from ..models.qwen3_vl.model import QWen3VLTokenizer
 from ..models.internvl.model import InternvlTokenizer
 from ..models.gemma3.model import Gemma3Tokenizer
-from ..models.gemma4.model import Gemma4Tokenizer
+from ..models.gemma4.tokenizer import Gemma4Tokenizer
 from ..models.qwen3_omni_moe_thinker.model import QWen3OmniTokenizer
 
 # A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.