diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py index cf2452935e..a5c7266c3e 100644 --- a/lmdeploy/serve/processors/multimodal.py +++ b/lmdeploy/serve/processors/multimodal.py @@ -399,13 +399,14 @@ async def _get_multimodal_prompt_input(self, mm_processor_kwargs=mm_processor_kwargs) else: results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs=mm_processor_kwargs) - results = await self.vl_encoder.async_infer(results) + if not self.vl_encoder.model._turbomind_native_vision: + results = await self.vl_encoder.async_infer(results) results = await self.vl_encoder.wrap_for_turbomind(messages=results, - chat_template=chat_template, - tokenizer=self.tokenizer, - sequence_start=sequence_start, - tools=tools, - chat_template_kwargs=chat_template_kwargs) + chat_template=chat_template, + tokenizer=self.tokenizer, + sequence_start=sequence_start, + tools=tools, + chat_template_kwargs=chat_template_kwargs) elif self.backend == 'pytorch': if self.vl_encoder._uses_new_preprocess: input_prompt = self.vl_encoder.model.get_input_prompt(messages=messages, diff --git a/lmdeploy/turbomind/models/__init__.py b/lmdeploy/turbomind/models/__init__.py index 8402a3edaf..1d5f18f4ce 100644 --- a/lmdeploy/turbomind/models/__init__.py +++ b/lmdeploy/turbomind/models/__init__.py @@ -6,5 +6,6 @@ from .llama import LlamaModel # noqa: F401 from .mixtral import MixtralModel # noqa: F401 from .qwen2 import Qwen2Model # noqa: F401 +from .qwen2_vl import Qwen2VLModel # noqa: F401 from .qwen3 import Qwen3TextModel # noqa: F401 from .qwen3_5 import Qwen3_5Model, Qwen3_5TextModel, Qwen3_5VisionModel # noqa: F401 diff --git a/lmdeploy/turbomind/models/internvl.py b/lmdeploy/turbomind/models/internvl.py index ee3f344850..2603df4f1c 100644 --- a/lmdeploy/turbomind/models/internvl.py +++ b/lmdeploy/turbomind/models/internvl.py @@ -3,9 +3,32 @@ HF-style InternVL/InternS1).""" from __future__ import annotations +from types import SimpleNamespace +from typing import Any + +import _turbomind as _tm +import torch from transformers import PretrainedConfig +from lmdeploy.vl.constants import Modality + +from ..builders import ( + AttentionBuilder, + Builder, + LayerNormBuilder, + ModuleListBuilder, + ModuleListConfig, + NormBuilder, + SplitSide, + VisionModelBuilder, + make_layer_norm_config, + make_norm_config, +) +from ..builders._base import ParallelGroup +from ..linear import Linear, transform_output_dim from ..supported_models import SUPPORTED_ARCHS +from ..text_model import TextModel +from ..weight_format import TrivialFormat from .base import INPUT_MODELS @@ -15,6 +38,14 @@ def _cfg_get(cfg, name: str, default=None): return getattr(cfg, name, default) +def _to_tm_norm_type(norm_type: str): + if norm_type == 'layer_norm': + return _tm.NormType.LAYER_NORM + if norm_type == 'rms_norm': + return _tm.NormType.RMS_NORM + raise ValueError(f'Unsupported InternVit vision norm_type: {norm_type!r}') + + def map_interns1_hf_keys(name: str) -> str: """Map Intern-S1 HF VLM checkpoint keys to the Qwen3 text loader layout.""" language_model_prefix = 'model.language_model.' @@ -26,12 +57,339 @@ def map_interns1_hf_keys(name: str) -> str: return name +def map_internvl_hf_keys(name: str) -> str: + """Map InternVL HF vision keys to the InternVit loader layout.""" + if name.startswith('vision_tower.') or name.startswith('multi_modal_projector.'): + return f'model.{name}' + return name + + +def map_legacy_internvl_keys(name: str) -> str: + """Map legacy InternVLChatModel ViT keys to the InternVit loader layout.""" + if name == 'vision_model.embeddings.class_embedding': + return 'model.vision_tower.embeddings.cls_token' + if name == 'vision_model.embeddings.position_embedding': + return 'model.vision_tower.embeddings.position_embeddings' + + patch_embed = 'vision_model.embeddings.patch_embedding.' + if name.startswith(patch_embed): + suffix = name[len(patch_embed):] + return f'model.vision_tower.embeddings.patch_embeddings.projection.{suffix}' + + block_prefix = 'vision_model.encoder.layers.' + if name.startswith(block_prefix): + rest = name[len(block_prefix):] + if '.' not in rest: + return name + layer_id, rest = rest.split('.', 1) + prefix = f'model.vision_tower.encoder.layer.{layer_id}.' + + if rest == 'ls1': + return prefix + 'lambda_1' + if rest == 'ls2': + return prefix + 'lambda_2' + if rest.startswith('norm1.'): + return prefix + 'layernorm_before.' + rest[len('norm1.'):] + if rest.startswith('norm2.'): + return prefix + 'layernorm_after.' + rest[len('norm2.'):] + if rest.startswith('attn.qkv.'): + return prefix + 'attention.qkv.' + rest[len('attn.qkv.'):] + if rest.startswith('attn.proj.'): + return prefix + 'attention.projection_layer.' + rest[len('attn.proj.'):] + if rest.startswith('attn.q_norm.'): + return prefix + 'attention.q_norm.' + rest[len('attn.q_norm.'):] + if rest.startswith('attn.k_norm.'): + return prefix + 'attention.k_norm.' + rest[len('attn.k_norm.'):] + if rest.startswith('mlp.'): + return prefix + rest + + if name.startswith('mlp1.0.'): + return 'model.multi_modal_projector.layer_norm.' + name[len('mlp1.0.'):] + if name.startswith('mlp1.1.'): + return 'model.multi_modal_projector.linear_1.' + name[len('mlp1.1.'):] + if name.startswith('mlp1.3.'): + return 'model.multi_modal_projector.linear_2.' + name[len('mlp1.3.'):] + + return name + + +def _validate_legacy_internvl_chat(cfg): + cfg = _legacy_namespace(cfg) + if getattr(cfg, 'ps_version', None) != 'v2': + raise ValueError( + f"InternVLChatModel TurboMind native ViT requires ps_version='v2', " + f"got {getattr(cfg, 'ps_version', None)!r}.") + + +def _legacy_namespace(cfg): + return SimpleNamespace(**cfg) if isinstance(cfg, dict) else cfg + + +def _legacy_square_size(value) -> int: + if isinstance(value, (list, tuple)): + if len(value) != 2 or value[0] != value[1]: + raise ValueError(f'legacy InternVit expects a square size, got {value!r}') + value = value[0] + return int(value) + + +@transform_output_dim +def _split_packed_vision_qkv(tensor: torch.Tensor): + """Split packed vision QKV layout [Q | K | V] along output dim.""" + if tensor.shape[-1] % 3 != 0: + raise ValueError(f'packed vision qkv output dim is not divisible by 3: {tuple(tensor.shape)}') + return tuple(x.contiguous() for x in tensor.chunk(3, dim=-1)) + + +class InternVitVisionModel(TextModel): + """InternVit weight model rooted at ``ModelRoot.vision_model``.""" + + def __init__(self, cfg: PretrainedConfig, *, resolver, parent_cfg: PretrainedConfig): + super().__init__(cfg, resolver=resolver) + + self._hidden = int(cfg.hidden_size) + self._heads = int(cfg.num_attention_heads) + self._depth = int(cfg.num_hidden_layers) + self._inter = int(cfg.intermediate_size) + self._channels = int(cfg.num_channels) + image_h, image_w = cfg.image_size + patch_h, patch_w = cfg.patch_size + self._image_h, self._image_w = int(image_h), int(image_w) + self._patch_h, self._patch_w = int(patch_h), int(patch_w) + self._norm_eps = float(cfg.layer_norm_eps) + self._norm_type = _to_tm_norm_type(cfg.norm_type) + self._use_qk_norm = bool(cfg.use_qk_norm) + self._head_dim = self._hidden // self._heads + self._patch_in_dim = self._channels * self._patch_h * self._patch_w + self._num_patches = (self._image_h // self._patch_h) * (self._image_w // self._patch_w) + self._downsample_ratio = float(parent_cfg.downsample_ratio) + self._image_seq_length = int(parent_cfg.image_seq_length) + self._out_hidden = int(parent_cfg.text_config.hidden_size) + self._projector_scale = int(round(1.0 / self._downsample_ratio)) + self._projector_in_dim = self._hidden * self._projector_scale * self._projector_scale + + def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]): + items = [] + for input_mm in multimodal: + modality = input_mm.get('modality', Modality.IMAGE) + if modality not in (Modality.IMAGE, Modality.IMAGE.value, 'image'): + raise ValueError(f'InternVit TurboMind does not support modality {modality!r}') + + pixel_values = self._tm_tensor(input_mm['pixel_values']) + token_begin = int(input_mm['offset']) + token_end = token_begin + int(input_mm['image_tokens']) + items.append( + _tm.multimodal.InternVitItem( + modality=_tm.multimodal.Modality.IMAGE, + data=pixel_values, + token_begin=token_begin, + token_end=token_end, + )) + + return _tm.multimodal.InternVitInput(items) + + def model(self, pfx): + self._build_vision_model(pfx + 'model.vision_tower', pfx + 'model.multi_modal_projector') + + def _build_vision_model(self, vision_pfx, projector_pfx): + cfg = self._make_root_cfg() + root = self._restore_dtype(VisionModelBuilder( + cfg, self._ctx, root_handles=self._root_handles, tp=self._model_tp)) + + emb_pfx = vision_pfx + 'embeddings' + root._add_tensor('cls_token', (emb_pfx + 'cls_token').pop()) + root._add_tensor('position_embeddings', (emb_pfx + 'position_embeddings').pop()) + root._add_linear('patch_embed', self._patch_embed(emb_pfx + 'patch_embeddings.projection')) + root.blocks = self.vit_blocks(vision_pfx + 'encoder.layer') + + root.projector_norm = self._layer_norm(projector_pfx + 'layer_norm', + dim=self._projector_in_dim, + norm_eps=1e-5) + root._add_linear('projector_fc1', self._linear(projector_pfx + 'linear_1'), SplitSide.OUTPUT) + root._add_linear('projector_fc2', self._linear(projector_pfx + 'linear_2'), SplitSide.INPUT) + root.build() + + def _make_root_cfg(self): + cfg = _tm.InternVitConfig() + cfg.data_type = self._resolver.data_type + cfg.hidden_dim = self._hidden + cfg.depth = self._depth + cfg.patch_in_dim = self._patch_in_dim + cfg.in_channels = self._channels + cfg.image_height = self._image_h + cfg.image_width = self._image_w + cfg.patch_height = self._patch_h + cfg.patch_width = self._patch_w + cfg.num_patches = self._num_patches + cfg.image_seq_length = self._image_seq_length + cfg.norm_type = self._norm_type + return cfg + + def _make_block_cfg(self): + cfg = _tm.InternVitBlockConfig() + cfg.data_type = self._resolver.data_type + cfg.hidden_dim = self._hidden + cfg.head_num = self._heads + cfg.intermediate_size = self._inter + cfg.norm_eps = self._norm_eps + return cfg + + def _make_attn_cfg(self): + cfg = _tm.AttentionConfig() + cfg.data_type = self._resolver.data_type + cfg.hidden_dim = self._hidden + cfg.head_dim = self._head_dim + cfg.head_num = self._heads + cfg.kv_head_num = self._heads + cfg.window_size = 0 + cfg.causal = False + return cfg + + def vit_blocks(self, pfx): + blocks = ModuleListBuilder(ModuleListConfig(), self._ctx) + for i, p in pfx.slices(0, self._depth): + blocks[i] = self.vit_block(p) + return blocks.build() + + def vit_block(self, pfx): + b = self._restore_dtype(Builder(self._make_block_cfg(), self._ctx)) + b.tp = self._model_tp + + b.norm1 = self._vision_norm(pfx + 'layernorm_before') + b.norm2 = self._vision_norm(pfx + 'layernorm_after') + b.attention = self.vit_attn(pfx + 'attention') + b._add_linear('mlp_fc1', self._linear(pfx + 'mlp.fc1'), SplitSide.OUTPUT) + b._add_linear('mlp_fc2', self._linear(pfx + 'mlp.fc2'), SplitSide.INPUT) + b._add_tensor('lambda_1', (pfx + 'lambda_1').pop()) + b._add_tensor('lambda_2', (pfx + 'lambda_2').pop()) + return b.build() + + def vit_attn(self, pfx): + q = self._linear(pfx + 'q_proj') + k = self._linear(pfx + 'k_proj') + v = self._linear(pfx + 'v_proj') + o = self._linear(pfx + 'projection_layer') + + cfg = self._make_attn_cfg() + attn_tp = self._model_tp if self._heads % self._model_tp.size == 0 else ParallelGroup(1, None) + m = self._restore_dtype(AttentionBuilder(cfg, self._ctx, tp=attn_tp)) + m.add_qkv_proj(q, k, v) + m.add_o_proj(o) + if self._use_qk_norm and (pfx + 'q_norm').has('weight') and (pfx + 'k_norm').has('weight'): + m.q_norm = self._rms_norm(pfx + 'q_norm', tp=attn_tp) + m.k_norm = self._rms_norm(pfx + 'k_norm', tp=attn_tp) + return m.build() + + def _tm_tensor(self, tensor: torch.Tensor): + if not isinstance(tensor, torch.Tensor): + raise TypeError(f'InternVit multimodal data should be a torch.Tensor, got {type(tensor).__name__}') + return _tm.from_dlpack(tensor.contiguous()) + + def _restore_dtype(self, builder): + builder.config.data_type = self._resolver.data_type + return builder + + def _patch_embed(self, pfx): + weight = pfx.pop('weight') + weight = weight.reshape(weight.shape[0], -1).t().contiguous() + tensors = {'weight': weight} + if pfx.has('bias'): + tensors['bias'] = pfx.pop('bias') + return Linear(tensors=tensors, weight_format=TrivialFormat()) + + def _vision_norm(self, pfx): + if self._norm_type == _tm.NormType.LAYER_NORM: + return self._layer_norm(pfx, dim=self._hidden, norm_eps=self._norm_eps) + elif self._norm_type == _tm.NormType.RMS_NORM: + return self._rms_norm(pfx, tp=ParallelGroup(1, None)) + else: + raise ValueError(f'Unsupported InternVit vision norm_type: {self._norm_type!r}') + + def _rms_norm(self, pfx, tp: ParallelGroup): + weight = pfx.pop('weight') + tp_size = tp.size + dim = weight.shape[-1] + if tp_size > 1: + assert dim % tp_size == 0, ( + f'{pfx}.weight dim={dim} is not divisible by tp={tp_size}') + dim //= tp_size + cfg = make_norm_config(dim=dim, norm_eps=self._norm_eps) + cfg.data_type = self._resolver.data_type + m = self._restore_dtype(NormBuilder(cfg, self._ctx)) + if tp_size > 1: + m.tp = tp + m._add_tensor('weight', weight, SplitSide.OUTPUT) + else: + m.set_weight(weight) + return m.build() + + def _layer_norm(self, pfx, *, dim: int, norm_eps: float): + weight = pfx.pop('weight') + bias = pfx.pop('bias') if pfx.has('bias') else None + cfg = make_layer_norm_config(dim=dim, data_type=self._resolver.data_type, norm_eps=norm_eps) + m = self._restore_dtype(LayerNormBuilder(cfg, self._ctx)) + m.set_weight(weight, bias=bias) + return m.build() + + +class LegacyInternVitVisionModel(InternVitVisionModel): + """Legacy InternVLChatModel ViT adapter for the canonical InternVit layout. + + Legacy InternVL stores attention as a single ``attn.qkv`` linear. Other + weights are normalized through ``map_legacy_internvl_keys``. + """ + + def __init__(self, cfg: PretrainedConfig, *, resolver, parent_cfg: PretrainedConfig): + cfg = _legacy_namespace(cfg) + parent_cfg = _legacy_namespace(parent_cfg) + llm_cfg = _legacy_namespace(parent_cfg.llm_config) + image_size = _legacy_square_size(cfg.image_size) + patch_size = _legacy_square_size(cfg.patch_size) + downsample_ratio = float(parent_cfg.downsample_ratio) + image_seq_length = int((image_size // patch_size)**2 * (downsample_ratio**2)) + + normalized_cfg = SimpleNamespace( + hidden_size=cfg.hidden_size, + num_attention_heads=cfg.num_attention_heads, + num_hidden_layers=cfg.num_hidden_layers, + intermediate_size=cfg.intermediate_size, + num_channels=cfg.num_channels, + image_size=(image_size, image_size), + patch_size=(patch_size, patch_size), + layer_norm_eps=cfg.layer_norm_eps, + norm_type=cfg.norm_type, + use_qk_norm=cfg.qk_normalization, + ) + normalized_parent_cfg = SimpleNamespace( + downsample_ratio=downsample_ratio, + image_seq_length=image_seq_length, + text_config=llm_cfg, + ) + super().__init__(normalized_cfg, resolver=resolver, parent_cfg=normalized_parent_cfg) + + def vit_attn(self, pfx): + q, k, v = _split_packed_vision_qkv(self._linear(pfx + 'qkv')) + o = self._linear(pfx + 'projection_layer') + + cfg = self._make_attn_cfg() + attn_tp = self._model_tp if self._heads % self._model_tp.size == 0 else ParallelGroup(1, None) + m = self._restore_dtype(AttentionBuilder(cfg, self._ctx, tp=attn_tp)) + m.add_qkv_proj(q, k, v) + m.add_o_proj(o) + if self._use_qk_norm and (pfx + 'q_norm').has('weight') and (pfx + 'k_norm').has('weight'): + m.q_norm = self._rms_norm(pfx + 'q_norm', tp=attn_tp) + m.k_norm = self._rms_norm(pfx + 'k_norm', tp=attn_tp) + return m.build() + + @INPUT_MODELS.register_module(name='internvl') class InternVLModel: """Aggregate source model for InternVL checkpoints with any registered text model.""" - def __init__(self, cfg: PretrainedConfig, *, resolver): + _vision = True + + def __init__(self, cfg: PretrainedConfig, *, resolver, vision_resolver=None, disable_vision_encoder: bool = False): llm_cfg = _cfg_get(cfg, 'llm_config') if llm_cfg is None: llm_cfg = _cfg_get(cfg, 'text_config') @@ -55,9 +413,28 @@ def __init__(self, cfg: PretrainedConfig, *, resolver): self.text_model = text_model_cls(llm_cfg, resolver=resolver) archs = _cfg_get(cfg, 'architectures') or [] self._checkpoint_mappings = [] - if archs and archs[0] == 'InternS1ForConditionalGeneration': + arch = archs[0] if archs else None + if arch == 'InternS1ForConditionalGeneration': self._checkpoint_mappings.append(map_interns1_hf_keys) - self.vision_model = None + elif arch == 'InternVLForConditionalGeneration': + self._checkpoint_mappings.append(map_internvl_hf_keys) + elif arch == 'InternVLChatModel': + self._checkpoint_mappings.append(map_legacy_internvl_keys) + vision_cfg = cfg.vision_config if hasattr(cfg, 'vision_config') else None + if not disable_vision_encoder and vision_cfg is not None: + if arch == 'InternVLChatModel': + _validate_legacy_internvl_chat(cfg) + self.vision_model = LegacyInternVitVisionModel(vision_cfg, + resolver=vision_resolver or resolver, + parent_cfg=cfg) + elif arch in ('InternS1ForConditionalGeneration', 'InternVLForConditionalGeneration'): + self.vision_model = InternVitVisionModel(vision_cfg, + resolver=vision_resolver or resolver, + parent_cfg=cfg) + else: + raise ValueError(f'InternVL TurboMind vision architecture {arch!r} is not supported.') + else: + self.vision_model = None def bind_runtime(self, *, ctx, root_handles, attn_tp, mlp_tp, model_tp): @@ -68,6 +445,14 @@ def bind_runtime(self, *, ctx, root_handles, mlp_tp=mlp_tp, model_tp=model_tp, ) + if self.vision_model is not None: + self.vision_model.bind_runtime( + ctx=ctx, + root_handles=root_handles, + attn_tp=attn_tp, + mlp_tp=mlp_tp, + model_tp=model_tp, + ) @property def _vocab_size(self): @@ -77,5 +462,12 @@ def _vocab_size(self): def _loader_mappings(self): return self._checkpoint_mappings + list(getattr(type(self.text_model), '_loader_mappings', [])) + def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]): + if self.vision_model is None: + raise ValueError('InternVL TurboMind vision encoder is not available.') + return self.vision_model.to_turbomind_multimodal(multimodal) + def model(self, pfx): self.text_model.model(pfx + 'language_model') + if self.vision_model is not None: + self.vision_model.model(pfx) diff --git a/lmdeploy/turbomind/models/qwen2_vl.py b/lmdeploy/turbomind/models/qwen2_vl.py new file mode 100644 index 0000000000..1c58d4735e --- /dev/null +++ b/lmdeploy/turbomind/models/qwen2_vl.py @@ -0,0 +1,449 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Qwen2-VL / Qwen2.5-VL aggregate source model for TurboMind.""" +from __future__ import annotations + +import math +from typing import Any + +import _turbomind as _tm +import torch + +from lmdeploy.vl.constants import Modality + +from ..builders import ( + AttentionBuilder, + Builder, + LayerNormBuilder, + ModuleListBuilder, + ModuleListConfig, + NormBuilder, + SplitSide, + VisionModelBuilder, + make_layer_norm_config, + make_norm_config, +) +from ..builders._base import ParallelGroup +from ..linear import Linear, transform_input_dim, transform_output_dim +from ..text_model import TextModel +from ..weight_format import TrivialFormat +from .base import INPUT_MODELS +from .qwen2 import Qwen2Model +from .qwen3_5 import ( + _assert_trivial, + _pad_head_dim_in, + _pad_head_dim_out, + _split_packed_vision_qkv, +) +from .utils import reorder_rotary_emb + +_VIT_HEAD_DIM_PADDED = { + 64: 64, + 80: 128, +} + + +@transform_output_dim +def _pad_output_dim(t: torch.Tensor, *, dst_dim: int) -> torch.Tensor: + pad = dst_dim - t.shape[-1] + if pad <= 0: + return t + return torch.cat([t, t.new_zeros(t.shape[:-1] + (pad, ))], dim=-1).contiguous() + + +@transform_input_dim +def _pad_input_dim(t: torch.Tensor, *, dst_dim: int) -> torch.Tensor: + pad = dst_dim - t.shape[0] + if pad <= 0: + return t + return torch.cat([t, t.new_zeros((pad, ) + t.shape[1:])], dim=0).contiguous() + + +def _padded_vit_head_dim(real_hd: int) -> int: + if real_hd not in _VIT_HEAD_DIM_PADDED: + raise NotImplementedError( + f'Qwen2 ViT head_dim={real_hd} is not supported; ' + f'known: {sorted(_VIT_HEAD_DIM_PADDED)}') + return _VIT_HEAD_DIM_PADDED[real_hd] + + +def _to_tm_norm_type(norm_type: str): + if norm_type == 'layer_norm': + return _tm.NormType.LAYER_NORM + if norm_type == 'rms_norm': + return _tm.NormType.RMS_NORM + raise ValueError(f'Unsupported Qwen2 ViT norm_type: {norm_type!r}') + + +class _BaseQwen2VisionModel(TextModel): + """Common Qwen2-VL vision sub-tree rooted at ``ModelRoot.vision_model``.""" + + _gated_mlp = False + _norm_type = '' + _use_window_attention = False + + def __init__(self, cfg, *, resolver): + super().__init__(cfg, resolver=resolver) + + self._vis_hidden = self._vision_hidden_size(cfg) + self._vis_out_hidden = int(getattr(cfg, 'out_hidden_size', getattr(cfg, 'hidden_size', self._vis_hidden))) + self._vis_inter = self._vision_intermediate_size(cfg) + self._vis_depth = int(cfg.depth) + self._vis_heads = int(cfg.num_heads) + self._vis_in_chans = int(getattr(cfg, 'in_channels', getattr(cfg, 'in_chans', 3))) + self._vis_patch = int(cfg.patch_size) + self._vis_temporal = int(cfg.temporal_patch_size) + self._vis_spatial_merge = int(cfg.spatial_merge_size) + self._vis_norm_eps = 1e-6 + self._window_size = self._vision_window_size(cfg) + self._fullatt_block_indexes = self._vision_fullatt_block_indexes(cfg) + + self._patch_in_dim = (self._vis_in_chans + * self._vis_temporal + * self._vis_patch + * self._vis_patch) + + def _vision_hidden_size(self, cfg): + return int(getattr(cfg, 'hidden_size', getattr(cfg, 'embed_dim', 0))) + + def _vision_intermediate_size(self, cfg): + raise NotImplementedError + + def _vision_window_size(self, cfg): + return 0 + + def _vision_fullatt_block_indexes(self, cfg): + return [] + + def _torch_dtype(self): + if self._resolver.data_type == _tm.DataType.TYPE_FP16: + return torch.float16 + if self._resolver.data_type == _tm.DataType.TYPE_BF16: + return torch.bfloat16 + if self._resolver.data_type == _tm.DataType.TYPE_FP32: + return torch.float32 + return None + + def _tm_tensor(self, tensor: torch.Tensor): + if not isinstance(tensor, torch.Tensor): + raise TypeError(f'Qwen2 ViT multimodal data should be a torch.Tensor, got {type(tensor).__name__}') + target_dtype = self._torch_dtype() + if target_dtype is not None and tensor.is_floating_point() and tensor.dtype != target_dtype: + tensor = tensor.to(target_dtype) + return _tm.from_dlpack(tensor.contiguous()) + + @staticmethod + def _grid_thw(grid_thw) -> tuple[int, int, int]: + if isinstance(grid_thw, torch.Tensor): + values = grid_thw.flatten().tolist() + else: + values = list(grid_thw) + if len(values) != 3: + raise ValueError(f'Qwen2 ViT grid_thw should contain 3 values, got {values!r}') + return int(values[0]), int(values[1]), int(values[2]) + + @staticmethod + def _token_range(input_mm: dict[str, Any]) -> tuple[int, int]: + offset = input_mm['offset'] + if isinstance(offset, torch.Tensor): + values = offset.flatten().tolist() + elif isinstance(offset, (list, tuple)): + values = list(offset) + else: + values = [offset] + if len(values) == 2: + return int(values[0]), int(values[1]) + if len(values) != 1: + raise ValueError(f'Qwen2 ViT offset should contain 1 or 2 values, got {values!r}') + tokens = input_mm['image_tokens'] + if isinstance(tokens, torch.Tensor): + tokens = tokens.flatten()[0].item() + return int(values[0]), int(values[0]) + int(tokens) + + def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]): + items = [] + for input_mm in multimodal: + modality = input_mm.get('modality', Modality.IMAGE) + if modality not in (Modality.IMAGE, Modality.IMAGE.value, 'image'): + raise ValueError(f'Qwen2 TurboMind native vision only supports image inputs, got {modality!r}') + + data = self._tm_tensor(input_mm['pixel_values']) + grid_thw = self._grid_thw(input_mm['image_grid_thw']) + token_begin, token_end = self._token_range(input_mm) + items.append( + _tm.multimodal.QwenVitItem( + modality=_tm.multimodal.Modality.IMAGE, + data=data, + token_begin=token_begin, + token_end=token_end, + grid_thw=grid_thw, + )) + + return _tm.multimodal.QwenVitInput(items) + + def model(self, pfx): + self._build_vision_model(pfx + 'visual') + + def _restore_dtype(self, builder): + builder.config.data_type = self._resolver.data_type + return builder + + def _make_vision_root_cfg(self): + cfg = _tm.QwenVitConfig() + cfg.data_type = self._resolver.data_type + cfg.hidden_dim = self._vis_hidden + cfg.out_hidden_dim = self._vis_out_hidden + cfg.depth = self._vis_depth + cfg.head_num = self._vis_heads + cfg.intermediate_size = self._padded_inter_size() + cfg.patch_in_dim = self._patch_in_dim + cfg.in_channels = self._vis_in_chans + cfg.patch_size = self._vis_patch + cfg.temporal_patch_size = self._vis_temporal + cfg.spatial_merge_size = self._vis_spatial_merge + cfg.window_size = self._window_size + cfg.gated_mlp = self._gated_mlp + cfg.use_window_attention = self._use_window_attention + cfg.norm_type = _to_tm_norm_type(self._norm_type) + cfg.fullatt_block_indexes = self._fullatt_block_indexes + cfg.norm_eps = self._vis_norm_eps + return cfg + + def _build_vision_model(self, pfx): + cfg = self._make_vision_root_cfg() + root = self._restore_dtype(VisionModelBuilder( + cfg, self._ctx, + root_handles=self._root_handles, + tp=self._model_tp)) + + root._add_linear('patch_embed', self._patch_embed(pfx + 'patch_embed.proj')) + root.blocks = self.vit_blocks(pfx + 'blocks') + root.merger_norm = self._vision_norm(pfx + 'merger.ln_q', dim=self._vis_hidden) + root._add_linear('merger_fc1', self._linear(pfx + 'merger.mlp.0'), SplitSide.OUTPUT) + root._add_linear('merger_fc2', self._linear(pfx + 'merger.mlp.2'), SplitSide.INPUT) + root.build() + + def _patch_embed(self, pfx): + weight = pfx.pop('weight') + weight = weight.reshape(weight.shape[0], -1).t().contiguous() + return Linear(tensors={'weight': weight}, weight_format=TrivialFormat()) + + def vit_blocks(self, pfx): + blocks = ModuleListBuilder(ModuleListConfig(), self._ctx) + for i, p in pfx.slices(0, self._vis_depth): + blocks[i] = self.vit_block(p) + return blocks.build() + + def vit_block(self, pfx): + inter_size = self._padded_inter_size() + cfg = _tm.QwenVitBlockConfig() + cfg.data_type = self._resolver.data_type + cfg.hidden_dim = self._vis_hidden + cfg.head_num = self._vis_heads + cfg.intermediate_size = inter_size + cfg.norm_eps = self._vis_norm_eps + + b = self._restore_dtype(Builder(cfg, self._ctx)) + b.tp = self._model_tp + b.norm1 = self._vision_norm(pfx + 'norm1', dim=self._vis_hidden) + b.norm2 = self._vision_norm(pfx + 'norm2', dim=self._vis_hidden) + b.attention = self.vit_attn(pfx + 'attn') + self._add_mlp(b, pfx, inter_size=inter_size) + return b.build() + + def _add_mlp(self, builder, pfx, *, inter_size: int): + raise NotImplementedError + + def _padded_inter_size(self): + # Bias/activation kernels vectorize half/bf16 in 8-element chunks. + # Pad the global intermediate so each TP shard has aligned width. + align = max(1, self._model_tp.size) * 8 + return ((self._vis_inter + align - 1) // align) * align + + def _pad_plain_mlp(self, fc1: Linear, fc2: Linear, *, inter_size: int): + if inter_size == self._vis_inter: + return fc1, fc2 + return _pad_output_dim(fc1, dst_dim=inter_size), _pad_input_dim(fc2, dst_dim=inter_size) + + def _pad_mlp(self, gate: Linear, up: Linear, down: Linear, *, inter_size: int): + if inter_size == self._vis_inter: + return gate, up, down + return (_pad_output_dim(gate, dst_dim=inter_size), + _pad_output_dim(up, dst_dim=inter_size), + _pad_input_dim(down, dst_dim=inter_size)) + + def _make_vision_attn_cfg(self): + real_hd = self._vis_hidden // self._vis_heads + padded_hd = _padded_vit_head_dim(real_hd) + cfg = _tm.AttentionConfig() + cfg.data_type = self._resolver.data_type + cfg.hidden_dim = self._vis_hidden + cfg.head_dim = padded_hd + cfg.head_num = self._vis_heads + cfg.kv_head_num = self._vis_heads + cfg.window_size = 0 + cfg.causal = False + cfg.softmax_scale = (1.0 / math.sqrt(real_hd) if padded_hd != real_hd else 0.0) + return cfg + + def vit_attn(self, pfx): + cfg = self._make_vision_attn_cfg() + real_hd = self._vis_hidden // self._vis_heads + padded_hd = cfg.head_dim + H = cfg.head_num + + q, k, v = _split_packed_vision_qkv(self._linear(pfx + 'qkv')) + q = reorder_rotary_emb(q, real_hd, real_hd, resolver=self._resolver) + k = reorder_rotary_emb(k, real_hd, real_hd, resolver=self._resolver) + proj = self._linear(pfx + 'proj') + + if padded_hd != real_hd: + for ln, name in [(q, 'q'), (k, 'k'), (v, 'v'), (proj, 'proj')]: + _assert_trivial(ln, name) + pad_kwargs = dict(num_heads=H, src_hd=real_hd, dst_hd=padded_hd) + q = _pad_head_dim_out(q, **pad_kwargs) + k = _pad_head_dim_out(k, **pad_kwargs) + v = _pad_head_dim_out(v, **pad_kwargs) + proj = _pad_head_dim_in(proj, **pad_kwargs) + + attn_tp = self._model_tp if self._vis_heads % self._model_tp.size == 0 else ParallelGroup(1, None) + m = self._restore_dtype(AttentionBuilder(cfg, self._ctx, tp=attn_tp)) + m.add_qkv_proj(q, k, v) + m.add_o_proj(proj) + return m.build() + + def _vision_norm(self, pfx, *, dim: int): + raise NotImplementedError + + def _layer_norm(self, pfx, *, dim: int): + weight = pfx.pop('weight') + bias = pfx.pop('bias') if pfx.has('bias') else None + cfg = make_layer_norm_config(dim=dim, + data_type=self._resolver.data_type, + norm_eps=self._vis_norm_eps) + m = self._restore_dtype(LayerNormBuilder(cfg, self._ctx)) + m.set_weight(weight, bias=bias) + return m.build() + + def _rms_norm(self, pfx, *, dim: int): + weight = pfx.pop('weight') + cfg = make_norm_config(dim=dim, norm_eps=self._vis_norm_eps) + cfg.data_type = self._resolver.data_type + m = self._restore_dtype(NormBuilder(cfg, self._ctx)) + m.set_weight(weight) + return m.build() + + +class Qwen2VisionModel(_BaseQwen2VisionModel): + """Qwen2-VL vision tower.""" + + _norm_type = 'layer_norm' + + def _vision_hidden_size(self, cfg): + return int(cfg.embed_dim) + + def _vision_intermediate_size(self, cfg): + return int(self._vis_hidden * cfg.mlp_ratio) + + def _vision_norm(self, pfx, *, dim: int): + return self._layer_norm(pfx, dim=dim) + + def _add_mlp(self, builder, pfx, *, inter_size: int): + fc1, fc2 = self._pad_plain_mlp( + self._linear(pfx + 'mlp.fc1'), + self._linear(pfx + 'mlp.fc2'), + inter_size=inter_size, + ) + builder._add_linear('mlp_fc1', fc1, SplitSide.OUTPUT) + builder._add_linear('mlp_fc2', fc2, SplitSide.INPUT) + + +class Qwen2_5VisionModel(_BaseQwen2VisionModel): + """Qwen2.5-VL vision tower.""" + + _gated_mlp = True + _norm_type = 'rms_norm' + _use_window_attention = True + + def _vision_intermediate_size(self, cfg): + return int(cfg.intermediate_size) + + def _vision_window_size(self, cfg): + return int(cfg.window_size) + + def _vision_fullatt_block_indexes(self, cfg): + return [int(x) for x in cfg.fullatt_block_indexes] + + def _vision_norm(self, pfx, *, dim: int): + return self._rms_norm(pfx, dim=dim) + + def _add_mlp(self, builder, pfx, *, inter_size: int): + gate, up, down = self._pad_mlp( + self._linear(pfx + 'mlp.gate_proj'), + self._linear(pfx + 'mlp.up_proj'), + self._linear(pfx + 'mlp.down_proj'), + inter_size=inter_size, + ) + builder._add_linear('mlp_gate', gate, SplitSide.OUTPUT) + builder._add_linear('mlp_fc1', up, SplitSide.OUTPUT) + builder._add_linear('mlp_fc2', down, SplitSide.INPUT) + + +_VISION_MODEL_CLS = { + 'Qwen2VLForConditionalGeneration': Qwen2VisionModel, + 'Qwen2_5_VLForConditionalGeneration': Qwen2_5VisionModel, +} + + +@INPUT_MODELS.register_module(name='qwen2_vl') +class Qwen2VLModel: + """Aggregate source model for Qwen2-VL and Qwen2.5-VL checkpoints.""" + + _vision = True + + def __init__(self, cfg, *, resolver, vision_resolver=None, disable_vision_encoder: bool = False): + text_cfg = getattr(cfg, 'text_config', cfg) + if text_cfg is None: + raise ValueError('Qwen2VLModel requires a checkpoint with text_config.') + if not hasattr(text_cfg, 'tie_word_embeddings'): + text_cfg.tie_word_embeddings = getattr(cfg, 'tie_word_embeddings', False) + self.text_model = Qwen2Model(text_cfg, resolver=resolver) + + archs = getattr(cfg, 'architectures', None) or [] + self._arch = archs[0] if archs else '' + vision_cfg = getattr(cfg, 'vision_config', None) + if disable_vision_encoder or vision_cfg is None: + self.vision_model = None + else: + vision_cls = _VISION_MODEL_CLS.get(self._arch) + if vision_cls is None: + raise ValueError(f'Unsupported Qwen2-VL architecture: {self._arch!r}') + self.vision_model = vision_cls(vision_cfg, resolver=vision_resolver or resolver) + + def bind_runtime(self, *, ctx, root_handles, attn_tp, mlp_tp, model_tp): + for m in (self.text_model, self.vision_model): + if m is not None: + m.bind_runtime( + ctx=ctx, + root_handles=root_handles, + attn_tp=attn_tp, + mlp_tp=mlp_tp, + model_tp=model_tp, + ) + + @property + def _vocab_size(self): + return self.text_model.cfg.vocab_size + + @property + def _loader_mappings(self): + return list(getattr(type(self.text_model), '_loader_mappings', [])) + + def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]): + if self.vision_model is None: + raise ValueError('Qwen2 TurboMind vision encoder is not available.') + return self.vision_model.to_turbomind_multimodal(multimodal) + + def model(self, pfx): + self.text_model.model(pfx) + if self.vision_model is not None: + self.vision_model.model(pfx) diff --git a/lmdeploy/turbomind/models/qwen3_5.py b/lmdeploy/turbomind/models/qwen3_5.py index 0d13f372ae..45ccd9daa3 100644 --- a/lmdeploy/turbomind/models/qwen3_5.py +++ b/lmdeploy/turbomind/models/qwen3_5.py @@ -45,6 +45,7 @@ _act_type_id, make_layer_norm_config, ) +from ..builders._base import ParallelGroup from ..builders.attention import split_output_gate from ..linear import Linear, transform_input_dim, transform_output_dim from ..text_model import TextModel @@ -402,7 +403,7 @@ def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]): token_begin, token_end = self._offset_pair(input_mm['offset']) items.append( - _tm.multimodal.Qwen3_5VitItem( + _tm.multimodal.QwenVitItem( modality=tm_modality, data=data, token_begin=token_begin, @@ -410,7 +411,7 @@ def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]): grid_thw=grid_thw, )) - return _tm.multimodal.Qwen3_5VitInput(items) + return _tm.multimodal.QwenVitInput(items) # ------------------------------------------------------------------ # model() — build the vision sub-tree @@ -448,7 +449,7 @@ def _build_vision_model(self, pfx): root.build() def _make_vision_root_cfg(self): - cfg = _tm.Qwen3_5VitConfig() + cfg = _tm.QwenVitConfig() cfg.data_type = self._resolver.data_type cfg.hidden_dim = self._vis_hidden cfg.out_hidden_dim = self._vis_out_hidden @@ -461,6 +462,8 @@ def _make_vision_root_cfg(self): cfg.temporal_patch_size = self._vis_temporal cfg.num_position_embeddings = self._vis_pos_n cfg.spatial_merge_size = self._vis_spatial_merge + # Qwen3.5 ViT MLP uses the tanh-approximation GELU. + cfg.gelu_tanh = True cfg.norm_eps = self._vis_norm_eps return cfg @@ -481,7 +484,7 @@ def vit_blocks(self, pfx): return blocks.build() def vit_block(self, pfx): - cfg = _tm.Qwen3_5VitBlockConfig() + cfg = _tm.QwenVitBlockConfig() cfg.data_type = self._resolver.data_type cfg.hidden_dim = self._vis_hidden cfg.head_num = self._vis_heads @@ -546,8 +549,9 @@ def vit_attn(self, pfx): v = _pad_head_dim_out(v, **pad_kwargs) proj = _pad_head_dim_in(proj, **pad_kwargs) + attn_tp = self._model_tp if self._vis_heads % self._model_tp.size == 0 else ParallelGroup(1, None) m = self._restore_dtype( - AttentionBuilder(cfg, self._ctx, tp=self._model_tp)) + AttentionBuilder(cfg, self._ctx, tp=attn_tp)) m.add_qkv_proj(q, k, v) m.add_o_proj(proj) return m.build() diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py index 95f6f24d32..43c31ac465 100644 --- a/lmdeploy/turbomind/supported_models.py +++ b/lmdeploy/turbomind/supported_models.py @@ -8,6 +8,9 @@ # Qwen2 / Qwen2-MoE Qwen2ForCausalLM='qwen2', Qwen2MoeForCausalLM='qwen2-moe', + # Qwen2-vl / Qwen2.5-vl + Qwen2VLForConditionalGeneration='qwen2_vl', + Qwen2_5_VLForConditionalGeneration='qwen2_vl', # Qwen3 Qwen3ForCausalLM='qwen3', Qwen3MoeForCausalLM='qwen3-moe', diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py index f2a5f62ccf..e8d3aaf6ac 100644 --- a/lmdeploy/vl/engine.py +++ b/lmdeploy/vl/engine.py @@ -203,6 +203,14 @@ async def wrap_for_turbomind( ... } """ + if self.model._turbomind_native_vision: + return await self.wrap_for_pytorch(messages, + chat_template, + tokenizer, + sequence_start, + tools=tools, + chat_template_kwargs=chat_template_kwargs) + result = self.model.to_turbomind(messages, chat_template, tokenizer, diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index 1534ad3388..69dead5450 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -76,6 +76,7 @@ class InternVLVisionModel(VisionModel): """InternVL vision model.""" _arch = 'InternVLChatModel' + _turbomind_native_vision = True def __init__(self, model_path: str, @@ -210,7 +211,7 @@ def preprocess(self, messages: list[dict]) -> list[dict]: pixel_values = self.processor(image, params) image_tokens = (pixel_values.shape[0] * self.image_tokens_per_patch) outputs.append( - dict(pixel_values=pixel_values, + dict(pixel_values=pixel_values.to(self.mm_feature_dtype), image_tokens=image_tokens, image_token_id=self.image_token_id, image_size=image.size)) diff --git a/lmdeploy/vl/model/internvl3_hf.py b/lmdeploy/vl/model/internvl3_hf.py index 3816cfe491..2c6e836b97 100644 --- a/lmdeploy/vl/model/internvl3_hf.py +++ b/lmdeploy/vl/model/internvl3_hf.py @@ -35,6 +35,7 @@ class InternVL3VisionModel(InternVLVisionModel): """Internvl3 vision model.""" _arch = ['InternVLForConditionalGeneration', 'InternS1ForConditionalGeneration'] + _turbomind_native_vision = True def __init__(self, model_path: str, @@ -107,7 +108,7 @@ def preprocess(self, messages: list[dict]) -> list[dict]: cur_num_patches = image_num_patches[idx] pixel_values = image_pixel_values[cum_num_patches:cum_num_patches + cur_num_patches, ...] cum_num_patches += cur_num_patches - data = dict(pixel_values=pixel_values, + data = dict(pixel_values=pixel_values.to(self.mm_feature_dtype), image_tokens=self.image_tokens_per_patch * cur_num_patches, image_token_id=self.image_token_id) outputs.append(data) diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py index 98e3f8cd09..d87c65fafa 100644 --- a/lmdeploy/vl/model/qwen2.py +++ b/lmdeploy/vl/model/qwen2.py @@ -5,28 +5,14 @@ from lmdeploy.vl.model.utils import disable_logging -def check_qwen_vl_deps_install(): - """Check qwen_vl_utils.""" - try: - import qwen_vl_utils # noqa: F401 - except ImportError: - raise ImportError('please install qwen_vl_utils by `pip install qwen_vl_utils`' # noqa: E501 - ) - try: - from transformers import Qwen2VLForConditionalGeneration # noqa: F401 - except ImportError: - raise ImportError('please install latest transformers by ' - 'pip install git+https://github.com/huggingface/transformers.git') - - @VISION_MODELS.register_module() class Qwen2VLModel(VisionModel): """Qwen2VL model.""" _arch = ['Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration'] + _turbomind_native_vision = True def build_preprocessor(self, trust_remote_code: bool = False): - check_qwen_vl_deps_install() from transformers import AutoProcessor self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=trust_remote_code) tokenizer = self.processor.tokenizer @@ -35,16 +21,12 @@ def build_preprocessor(self, trust_remote_code: bool = False): def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess()` for spec.""" - from qwen_vl_utils import process_vision_info - images = self.collect_multimodal_items(messages) - optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'} + optional_keys = {'min_pixels', 'max_pixels'} outputs = [] for modality, image, params in images: - item = dict(type='image', image=image) - item.update({key: params[key] for key in params.keys() if key in optional_keys}) - image_inputs, _ = process_vision_info([dict(content=[item])]) - result = self.processor.image_processor(images=image_inputs, return_tensors='pt') + image_kwargs = {key: params[key] for key in params.keys() if key in optional_keys} + result = self.processor.image_processor(images=[image], return_tensors='pt', **image_kwargs) merge_length = self.processor.image_processor.merge_size**2 image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) @@ -53,7 +35,6 @@ def preprocess(self, messages: list[dict]) -> list[dict]: return messages def build_model(self, trust_remote_code: bool = False): - check_qwen_vl_deps_install() arch = self.hf_config.architectures[0] if arch == 'Qwen2VLForConditionalGeneration': from transformers import Qwen2VLForConditionalGeneration as AutoModelCls diff --git a/src/turbomind/kernels/activation.cu b/src/turbomind/kernels/activation.cu index cab79c6ee7..6d9f7a5ae7 100644 --- a/src/turbomind/kernels/activation.cu +++ b/src/turbomind/kernels/activation.cu @@ -2,10 +2,13 @@ #include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/activation.h" +#include "src/turbomind/kernels/activation_ops.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/utils/cuda_utils.h" +#include + namespace turbomind { template @@ -193,4 +196,95 @@ void Activation(Tensor& gate_up, // TM_CUDA_CHECK(cudaGetLastError()); } +template class Activation> +__global__ void AddBiasActivationKernel(T* data, const T* __restrict__ bias, int64_t stride, int num, int dim) +{ + const int ti = blockIdx.x; + const int di = (threadIdx.x + blockIdx.y * blockDim.x) * vec_size; + + if (ti >= num || di >= dim) { + return; + } + + Array x_vec; + Load(x_vec, data + ti * stride + di); + + auto x = cast(x_vec); + + if (bias) { + Array bias_vec; + Ldg(bias_vec, bias + di); + using namespace ops; + x = x + cast(bias_vec); + } + + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + x[i] = Activation::apply(x[i]); + } + + Store(data + ti * stride + di, cast(x)); +} + +void invokeAddBiasActivation(Tensor& x, const Tensor& bias, ActivationType type, cudaStream_t stream) +{ + if (x.size() == 0) { + return; + } + + TM_CHECK_EQ(x.ndim(), 2); + if (bias) { + TM_CHECK_EQ(bias.shape(-1), x.shape(-1)); + TM_CHECK_EQ(bias.dtype(), x.dtype()); + } + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int max_vec = sizeof(uint4) / sizeof(T); + constexpr int threads = 512; + + const int num = x.shape(0); + const int dim = x.shape(1); + const int64_t stride = x.stride(0); + + int best_vec_size = 1; + for (int v = max_vec; v >= 1; v >>= 1) { + if (dim % v == 0 && stride % v == 0) { + best_vec_size = v; + break; + } + } + + auto launch = [&](auto vec_size_) { + constexpr int vec_size = decltype(vec_size_)::value; + const dim3 grid(num, cdiv(dim, threads * vec_size)); + if (type == ActivationType::kGeluPytorchTanh) { + AddBiasActivationKernel + <<>>(x.data(), bias.data_or((T*)nullptr), stride, num, dim); + } + else if (type == ActivationType::kGelu) { + AddBiasActivationKernel + <<>>(x.data(), bias.data_or((T*)nullptr), stride, num, dim); + } + else { + TM_LOG_FATAL("unsupported add-bias activation type: {}", (int)type); + } + }; + + switch (best_vec_size) { + case 8: + return launch(std::integral_constant{}); + case 4: + return launch(std::integral_constant{}); + case 2: + return launch(std::integral_constant{}); + default: + return launch(std::integral_constant{}); + } + }; + + TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + } // namespace turbomind diff --git a/src/turbomind/kernels/activation.h b/src/turbomind/kernels/activation.h index 2eceeb9d89..1d13c63bdc 100644 --- a/src/turbomind/kernels/activation.h +++ b/src/turbomind/kernels/activation.h @@ -20,4 +20,9 @@ void Activation(Tensor& gate_up, // ActivationType type, cudaStream_t stream); +// In-place add-bias + unary activation: x <- activation(x + bias). +// `x` is a 2D tensor; `bias` (optional) broadcasts over the last dim. +// Supports kGelu (erf) and kGeluPytorchTanh (tanh approximation). +void invokeAddBiasActivation(Tensor& x, const Tensor& bias, ActivationType type, cudaStream_t stream); + } // namespace turbomind diff --git a/src/turbomind/kernels/attention/attention_universal.h b/src/turbomind/kernels/attention/attention_universal.h index c7960de01a..e6b192c2d2 100644 --- a/src/turbomind/kernels/attention/attention_universal.h +++ b/src/turbomind/kernels/attention/attention_universal.h @@ -234,11 +234,12 @@ struct AttentionUniversal { rope.init(di); PRAGMA_UNROLL for (int s = 0; s < ITER_S; ++s) { - const int ti = (offset.y + s * Map::kDeltaS) / CTA_H + query_idx + history_len; - rope.apply(vec_Q[s][c], ti); + const int qi = (offset.y + s * Map::kDeltaS) / CTA_H + query_idx; + const int ti = qi + history_len; + rope.apply(vec_Q[s][c], ti, qi); if constexpr (kProcessKV) { if (s == 0) { - rope.apply(vec_K[0][c], ti); + rope.apply(vec_K[0][c], ti, qi); } } } diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index 14d64caed2..1715394074 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -132,8 +132,9 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks, rope.init(di); PRAGMA_UNROLL for (int s = 0; s < ITER_S; ++s) { - const int ti = history_len + offset.y + s * Map::kDeltaS + token_idx; // sequence local - rope.apply(vec_K[s][c], ti); + const int qi = offset.y + s * Map::kDeltaS + token_idx; + const int ti = history_len + qi; // sequence local + rope.apply(vec_K[s][c], ti, qi); } } } @@ -430,7 +431,7 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k, PRAGMA_UNROLL for (int s = 0; s < ITER_S; ++s) { const int ti = offset.y + s * Map::kDeltaS + token_idx; // sequence local - rope.apply(out_K[s][c], ti); + rope.apply(out_K[s][c], ti, ti); } } } diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h index 91a5316d62..d4281333a0 100644 --- a/src/turbomind/kernels/attention/rotary_embedding.h +++ b/src/turbomind/kernels/attention/rotary_embedding.h @@ -82,8 +82,8 @@ struct FastRoPE { } // mrope is an operation applied on top of any base rope type if (param_.mrope_mode != MropeMode::kNone) { - param_.mrope.position_ids += batch_idx * param_.mrope.stride; param_.mrope.position_delta += batch_idx; + param_.mrope.position_offsets += batch_idx; param_.mrope.length += batch_idx; } } @@ -110,7 +110,7 @@ struct FastRoPE { } template - __device__ void apply(Array& x, float timestep) + __device__ void apply(Array& x, float timestep, int token_idx) { if (param_.mrope_mode == MropeMode::kNone) { // Most models apply rotary embedding in half precision @@ -120,17 +120,18 @@ struct FastRoPE { } } else if (param_.mrope_mode == MropeMode::kChunked) { - apply_mrope_impl(x, timestep); + apply_mrope_impl(x, timestep, token_idx); } else if (param_.mrope_mode == MropeMode::kInterleaved) { - apply_mrope_impl(x, timestep); + apply_mrope_impl(x, timestep, token_idx); } } - __device__ __forceinline__ MropeCoord get_mrope_coord(float timestep) const + __device__ __forceinline__ MropeCoord get_mrope_coord(float timestep, int token_idx) const { - if (timestep < *param_.mrope.length) { - const int* t = param_.mrope.position_ids + 3 * (int)timestep; + if (token_idx < *param_.mrope.length) { + const int row = *param_.mrope.position_offsets + token_idx; + const int* t = param_.mrope.position_ids + 3 * row; return {t[0], t[1], t[2]}; } const int pos = (int)timestep + (*param_.mrope.position_delta); @@ -180,9 +181,9 @@ struct FastRoPE { } template - __device__ __forceinline__ void apply_mrope_impl(Array& x, float timestep) const + __device__ __forceinline__ void apply_mrope_impl(Array& x, float timestep, int token_idx) const { - const MropeCoord coord = get_mrope_coord(timestep); + const MropeCoord coord = get_mrope_coord(timestep, token_idx); PRAGMA_UNROLL for (int i = 0; i < N; i += 2) { const int pair_idx = (i + idx_) >> 1; diff --git a/src/turbomind/kernels/norm/norm.h b/src/turbomind/kernels/norm/norm.h new file mode 100644 index 0000000000..80b9894f9c --- /dev/null +++ b/src/turbomind/kernels/norm/norm.h @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +namespace turbomind { + +enum class NormType +{ + kNone, + kLayerNorm, + kRMSNorm, +}; + +} // namespace turbomind diff --git a/src/turbomind/models/CMakeLists.txt b/src/turbomind/models/CMakeLists.txt index 376b035664..18bf9caad7 100644 --- a/src/turbomind/models/CMakeLists.txt +++ b/src/turbomind/models/CMakeLists.txt @@ -15,15 +15,14 @@ add_library(models STATIC model_weight.cc model_root.cc vision_model.cc - qwen3_5vit/fast_pos_embed.cu - qwen3_5vit/fast_rotary_pos_emb.cu - qwen3_5vit/fused_embed_merge.cu - qwen3_5vit/qkv_preprocess.cu - qwen3_5vit/mrope_position_ids.cu - qwen3_5vit/bias_gelu.cu - qwen3_5vit/qwen3_5vit_block_weight.cc - qwen3_5vit/qwen3_5vit_weight.cc - qwen3_5vit/qwen3_5vit.cc + qwenvit/qwenvit_kernels.cu + qwenvit/qwenvit_block_weight.cc + qwenvit/qwenvit_weight.cc + qwenvit/qwenvit.cc + internvit/internvit_kernels.cu + internvit/internvit_block_weight.cc + internvit/internvit_weight.cc + internvit/internvit.cc llama/LlamaLinear.cu llama/BlockManager.cc llama/BlockTrie.cc @@ -75,7 +74,7 @@ if(BUILD_TEST) CUDA::cudart) add_executable(test_mrope_position_ids - qwen3_5vit/test_mrope_position_ids.cu) + qwenvit/test_mrope_position_ids.cu) set_property(TARGET test_mrope_position_ids PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(test_mrope_position_ids PRIVATE models diff --git a/src/turbomind/models/internvit/internvit.cc b/src/turbomind/models/internvit/internvit.cc new file mode 100644 index 0000000000..c18ce1f2c0 --- /dev/null +++ b/src/turbomind/models/internvit/internvit.cc @@ -0,0 +1,561 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/internvit/internvit.h" + +#include "src/turbomind/comm/device_comm.h" +#include "src/turbomind/engine/request.h" +#include "src/turbomind/kernels/activation.h" +#include "src/turbomind/kernels/attention/attention.h" +#include "src/turbomind/kernels/norm/layer_norm.h" +#include "src/turbomind/kernels/norm/rms_norm.h" +#include "src/turbomind/models/internvit/internvit_block_weight.h" +#include "src/turbomind/models/internvit/internvit_input.h" +#include "src/turbomind/models/internvit/internvit_kernels.h" +#include "src/turbomind/models/internvit/internvit_weight.h" +#include "src/turbomind/models/layer_norm_weight.h" +#include "src/turbomind/models/linear_weight.h" +#include "src/turbomind/models/llama/LlamaLinear.h" +#include "src/turbomind/models/llama/SequenceManager.h" +#include "src/turbomind/models/norm_weight.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" + +#include +#include +#include +#include +#include + +namespace turbomind { + +struct InternVit::Impl { + const InternVitWeight& weights_; + const core::InternVitConfig& config_; + const comm::HostComm& h_tp_group; + LlamaLinear& linear_; + comm::DeviceCommImpl* const d_comm_; + const int tp_group_; + const int tp_size_; + const DataType engine_data_type_; + + Buffer_ attn_cu_seqlens_buf_; + + struct Data { + Tensor batch_input; + int batch_size{}; + std::vector> image_embeds_coords; + std::vector> input_embeds_coords; + Tensor_ attn_cu_seqlens; + Tensor_ attn_finished; + int token_num{}; + int seq_len{}; + + void Clear() + { + batch_size = 0; + token_num = 0; + seq_len = 0; + image_embeds_coords.clear(); + input_embeds_coords.clear(); + } + }; + + std::vector data_; + + Impl(const EngineParam& engine, const Context& ctx, const InternVitWeight& weights, int phases): + weights_{weights}, + config_{weights.config()}, + h_tp_group{ctx.comm.h_comm}, + linear_{*ctx.linear}, + d_comm_{ctx.comm.d_comm}, + tp_group_{ctx.comm.d_tp_group}, + tp_size_{ctx.comm.h_tp_group ? ctx.comm.h_tp_group->n_ranks() : 1}, + engine_data_type_{engine.data_type} + { + const auto& cfg = weights.config(); + for (int i = 0; i < phases; ++i) { + auto& d = data_.emplace_back(); + d.batch_input = {{engine.max_forward_token_num, cfg.in_channels, cfg.image_height, cfg.image_width}, + cfg.data_type, + kCPUpinned}; + d.attn_cu_seqlens = Tensor_{{engine.max_forward_token_num + 1}, kDEVICE}; + d.attn_finished = Tensor_{{engine.max_forward_token_num}, kDEVICE}; + } + attn_cu_seqlens_buf_ = {engine.max_forward_token_num + 1, kCPUpinned}; + } + + void AllReduceSum(Tensor& tensor, cudaStream_t stream) const + { + if (d_comm_ && tp_size_ > 1) { + d_comm_->AllReduceSum( + tensor.raw_data(), tensor.raw_data(), tensor.size(), tensor.dtype(), tp_group_, stream); + TM_CUDA_CHECK(cudaGetLastError()); + } + } + + void ApplyNorm(Tensor& out, const Tensor& input, const core::Module& norm, NormType norm_type) const + { + auto stream = core::Context::stream().handle(); + switch (norm_type) { + case NormType::kLayerNorm: { + const auto& ln = static_cast(norm); + invokeLayerNorm(out, input, ln.weight, ln.bias, ln.norm_eps_, stream); + break; + } + case NormType::kRMSNorm: { + const auto& rms = static_cast(norm); + invokeRMSNorm(out, input, rms.weight, rms.norm_eps_, stream); + break; + } + default: + TM_LOG_FATAL("unsupported InternVit norm type: {}", (int)norm_type); + } + TM_CUDA_CHECK(cudaGetLastError()); + } + + void ResidualScaleNorm(Tensor& hidden_states, + Tensor& residual, + const Tensor& branch_output, + const Tensor& branch_scale, + const Tensor& branch_bias, + const core::Module* norm, + NormType norm_type) const + { + auto stream = core::Context::stream().handle(); + switch (norm_type) { + case NormType::kLayerNorm: { + const auto& ln = static_cast(*norm); + invokeInternVitResidualScaleNorm(hidden_states, + residual, + branch_output, + branch_scale, + branch_bias, + ln.weight, + ln.bias, + ln.norm_eps_, + norm_type, + stream); + break; + } + case NormType::kRMSNorm: { + const auto& rms = static_cast(*norm); + invokeInternVitResidualScaleNorm(hidden_states, + residual, + branch_output, + branch_scale, + branch_bias, + rms.weight, + {}, + rms.norm_eps_, + norm_type, + stream); + break; + } + case NormType::kNone: { + invokeInternVitResidualScaleNorm( + hidden_states, residual, branch_output, branch_scale, branch_bias, {}, {}, 0.f, norm_type, stream); + break; + } + default: + TM_LOG_FATAL("unsupported InternVit norm type: {}", (int)norm_type); + } + TM_CUDA_CHECK(cudaGetLastError()); + } + + int Add(RequestCache& c) + { + const auto& [r, s] = std::tie(*c.req, *c.seq); + if (!r.mm_inputs) { + return Request::kOk; + } + + if ((not r.session.start_flag) or (not r.session.end_flag)) { + return Request::kInvalid; + } + + const auto mm_inputs = std::dynamic_pointer_cast(r.mm_inputs); + if (!mm_inputs) { + return Request::kInvalid; + } + + for (const auto& item : mm_inputs->items) { + if (item.modality != multimodal::Modality::kImage) { + return Request::kInvalid; + } + + const int tokens = item.token_end - item.token_begin; + if (tokens <= 0) { + return Request::kInvalid; + } + + auto mm_item = std::make_shared( + MultiModalData{item.data, Interval{item.token_begin, Interval::Size{tokens}}, std::array{}}); + s.multimodal_inputs.push_back(mm_item); + } + + return Request::kOk; + } + + void Add(int /*phase*/, TensorMap& env) + { + const Buffer_ rc = env.at("requests").buffer(); + for (int i = 0; i < rc.size(); ++i) { + auto& c = *TM_CHECK_NOTNULL(rc[i]); + if (c.status == 0) { + c.status = Add(c); + } + } + } + + void Setup(int phase, TensorMap& env) + { + auto& d = data_.at(phase); + auto& b = *env.at("batch").data()[0]; + auto& copy = *env.at("copy").data()[0]; + const auto& cfg = config_; + + int input_ids_offsets = 0; + int image_embeds_offsets = 0; + d.Clear(); + std::vector pixel_values; + + const auto& rc = b.rc; + for (int i = 0; i < rc.size(); ++i) { + const auto& c = *rc[i]; + const auto& s = *c.seq; + + if ((not c.autoregres) && (not s.multimodal_inputs.empty())) { + Interval text{c.history_len + c.alpha, Interval::Size{c.input_len}}; + for (const auto& mm : s.multimodal_inputs) { + auto o = mm->interval & text; + if (auto size = (int)o.size()) { + pixel_values.push_back(mm->data); + d.batch_size += mm->data.shape(0); + + const int text_offset = input_ids_offsets + o.begin() - text.begin(); + const int image_offset = image_embeds_offsets + o.begin() - mm->interval.begin(); + d.input_embeds_coords.emplace_back(size, text_offset); + d.image_embeds_coords.emplace_back(size, image_offset); + + image_embeds_offsets += (int)mm->interval.size(); + } + } + } + + input_ids_offsets += c.autoregres ? 1 : c.input_len; + } + + if (d.batch_size > 0) { + // batch input + if (d.batch_size > d.batch_input.shape(0)) { + core::ContextGuard ctx{Allocator{kCPUpinned}}; + Layout layout{d.batch_size, cfg.in_channels, cfg.image_height, cfg.image_width}; + d.batch_input = {layout, cfg.data_type, kCPUpinned}; + } + + ssize_t batch_offset = 0; + for (const auto& pixel_value : pixel_values) { + TM_CHECK_EQ(pixel_value.ndim(), 4); + TM_CHECK_EQ(pixel_value.shape(1), cfg.in_channels); + TM_CHECK_EQ(pixel_value.shape(2), cfg.image_height); + TM_CHECK_EQ(pixel_value.shape(3), cfg.image_width); + TM_CHECK_EQ(pixel_value.dtype(), d.batch_input.dtype()); + Copy(pixel_value, d.batch_input.slice(batch_offset, pixel_value.shape(0))); + batch_offset += pixel_value.shape(0); + } + TM_CHECK_EQ(batch_offset, d.batch_size); + + // attention meta + d.seq_len = cfg.num_patches + 1; + d.token_num = d.batch_size * d.seq_len; + + if (d.attn_cu_seqlens.size() < d.batch_size + 1) { + d.attn_cu_seqlens = Tensor_{{d.batch_size + 1}, kDEVICE}; + } + if (d.attn_finished.size() < d.batch_size) { + d.attn_finished = Tensor_{{d.batch_size}, kDEVICE}; + } + if (attn_cu_seqlens_buf_.size() < d.batch_size + 1) { + core::ContextGuard ctx{Allocator{kCPUpinned}}; + attn_cu_seqlens_buf_ = {d.batch_size + 1, kCPUpinned}; + } + + for (int i = 0; i <= d.batch_size; ++i) { + attn_cu_seqlens_buf_[i] = i * d.seq_len; + } + copy(attn_cu_seqlens_buf_.data(), d.batch_size + 1, d.attn_cu_seqlens.data()); + Clear(d.attn_finished.slice(0, d.batch_size)); + } + h_tp_group->Sync(); + } + + Tensor PatchEmbedding(Data& d) + { + const auto& cfg = config_; + auto stream = core::Context::stream().handle(); + + Tensor host_input = d.batch_input.slice(0, d.batch_size); + Tensor input = empty_like(host_input, kDEVICE); + Copy(host_input, input); + TM_CUDA_CHECK(cudaGetLastError()); + + Tensor patches{{d.batch_size * cfg.num_patches, cfg.patch_in_dim}, cfg.data_type, kDEVICE}; + invokeInternVitPatchify(patches, + input, + d.batch_size, + cfg.in_channels, + cfg.image_height, + cfg.image_width, + cfg.patch_height, + cfg.patch_width, + stream); + + Tensor patch_embeds; + TM_SCOPE_CALL(linear_.Forward(patches, *weights_.patch_embed, patch_embeds)); + TM_CUDA_CHECK(cudaGetLastError()); + + Tensor hidden{{d.token_num, cfg.hidden_dim}, cfg.data_type, kDEVICE}; + invokeInternVitAddEmbeddings(hidden, + patch_embeds, + weights_.patch_embed->bias, + weights_.cls_token, + weights_.position_embeddings, + d.batch_size, + cfg.num_patches, + cfg.hidden_dim, + stream); + return hidden; + } + + template + AttentionParams CreateVitAttentionParams( + Tensor& attn_output, Tensor& qkv, Tensor& kv, const Data& d, const AttentionWeight& attn, int layer_id) + { + const int local_head_num = attn.head_num / attn.tp_size; + const int head_dim = attn.head_dim; + const int token_num = d.token_num; + + AttentionParams params{}; + params.out = (T*)attn_output.raw_data(); + params.q = (T*)qkv.raw_data(); + + params.stride = (int64_t)local_head_num * 3 * head_dim; + + params.cu_q_len = d.attn_cu_seqlens.data(); + params.cu_k_len = d.attn_cu_seqlens.data(); + params.finished = d.attn_finished.data(); + + params.linear_iter_params = LinearIteratorParams{ + kv.raw_data(), + 2 * token_num * head_dim, + token_num * head_dim, + }; + + params.token_num = token_num; + params.batch_size = d.batch_size; + params.max_q_len = d.seq_len; + params.max_k_len = d.seq_len; + + params.num_heads = local_head_num; + params.num_kv_heads = local_head_num; + params.size_per_head = head_dim; + params.causal = false; + params.layer_id = layer_id; + + double scaling = 1.; + if (attn.softmax_scale) { + scaling *= attn.softmax_scale; + } + else { + scaling /= std::sqrt((float)head_dim); + } + params.inv_sqrt_dh = scaling * std::log2(std::exp(1.)); + + params.window_size = 0; + params.rope_param.type = RopeType::kNull; + params.max_split_k = 1; + params.cp_size = 1; + params.stream = core::Context::stream().handle(); + return params; + } + + template + void Attn(Tensor& input, Tensor& output, Data& d, int layer_id) + { + auto* attn = weights_.block(layer_id)->attention.get(); + auto stream = core::Context::stream().handle(); + + Tensor qkv; + TM_SCOPE_CALL(linear_.Forward(input, *attn->w_qkv, qkv)); + TM_CUDA_CHECK(cudaGetLastError()); + + const int local_head_num = attn->head_num / attn->tp_size; + const int head_dim = attn->head_dim; + const int local_dim = local_head_num * head_dim; + + ApplyBias(qkv, attn->w_qkv->bias, stream); + + if (attn->q_norm && attn->k_norm) { + Tensor sums{{2, d.token_num}, kFloat, kDEVICE}; + invokeInternVitPreRMSNorm(sums, qkv, local_dim, stream); + if (attn->tp_size > 1) { + AllReduceSum(sums, stream); + } + invokeInternVitPostRMSNorm(qkv, + sums, + attn->q_norm->weight, + attn->k_norm->weight, + local_dim, + config_.hidden_dim, + attn->q_norm->norm_eps_, + stream); + } + + Tensor kv{{local_head_num, 2, d.token_num, head_dim}, qkv.dtype(), qkv.device()}; + invokeInternVitPrepareQKV(kv, qkv, local_head_num, head_dim, stream); + + Tensor attn_output{{d.token_num, local_dim}, qkv.dtype(), qkv.device()}; + auto params = CreateVitAttentionParams(attn_output, qkv, kv, d, *attn, layer_id); + dispatchAttention(params); + TM_CUDA_CHECK(cudaGetLastError()); + + TM_SCOPE_CALL(linear_.Forward(attn_output, *attn->wo, output)); + TM_CUDA_CHECK(cudaGetLastError()); + if (attn->tp_size > 1) { + AllReduceSum(output, stream); + } + } + + void Mlp(Tensor& input, Tensor& output, int layer_id) + { + auto* block = weights_.block(layer_id); + auto stream = core::Context::stream().handle(); + + Tensor inter; + TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_fc1, inter)); + TM_CUDA_CHECK(cudaGetLastError()); + + invokeAddBiasActivation(inter, block->mlp_fc1->bias, ActivationType::kGelu, stream); + + TM_SCOPE_CALL(linear_.Forward(inter, *block->mlp_fc2, output)); + TM_CUDA_CHECK(cudaGetLastError()); + AllReduceSum(output, stream); + } + + Tensor Projector(Tensor& hidden, Data& d) + { + const auto& cfg = config_; + auto stream = core::Context::stream().handle(); + + const int grid_size = (int)std::sqrt(cfg.num_patches); + TM_CHECK_EQ(grid_size * grid_size, cfg.num_patches); + TM_CHECK_EQ(cfg.image_seq_length, (grid_size / 2) * (grid_size / 2)); + + Tensor shuffled{{d.batch_size * cfg.image_seq_length, cfg.hidden_dim * 4}, cfg.data_type, kDEVICE}; + invokeInternVitPixelShuffle(shuffled, hidden, grid_size, stream); + + Tensor projector_normed{{d.batch_size * cfg.image_seq_length, cfg.hidden_dim * 4}, cfg.data_type, kDEVICE}; + invokeLayerNorm(projector_normed, + shuffled, + weights_.projector_norm->weight, + weights_.projector_norm->bias, + weights_.projector_norm->norm_eps_, + stream); + TM_CUDA_CHECK(cudaGetLastError()); + + Tensor inter; + TM_SCOPE_CALL(linear_.Forward(projector_normed, *weights_.projector_fc1, inter)); + TM_CUDA_CHECK(cudaGetLastError()); + + invokeAddBiasActivation(inter, weights_.projector_fc1->bias, ActivationType::kGelu, stream); + + Tensor output; + TM_SCOPE_CALL(linear_.Forward(inter, *weights_.projector_fc2, output)); + TM_CUDA_CHECK(cudaGetLastError()); + AllReduceSum(output, stream); + ApplyBias(output, weights_.projector_fc2->bias, stream); + TM_CUDA_CHECK(cudaGetLastError()); + return output; + } + + void Forward(int phase, TensorMap& args) + { + const auto& cfg = config_; + auto& d = data_.at(phase); + if (d.batch_size == 0) { + return; + } + + auto stream = core::Context::stream().handle(); + auto residual = PatchEmbedding(d); + + Tensor hidden_states = [&]() { + Buffer symm_buf = args.contains("symm_buf") ? args.at("symm_buf").buffer() : Buffer{}; + if (symm_buf && d.token_num * cfg.hidden_dim <= symm_buf.size() / turbomind::byte_size(cfg.data_type)) { + return Tensor{symm_buf.view(cfg.data_type), {d.token_num, cfg.hidden_dim}}; + } + else { + return Tensor{{d.token_num, cfg.hidden_dim}, cfg.data_type, kDEVICE}; + } + }(); + + ApplyNorm(hidden_states, residual, *weights_.block(0)->norm1, config_.norm_type); + + for (int layer_id = 0; layer_id < cfg.depth; ++layer_id) { + auto* block = weights_.block(layer_id); + + auto invoke = [&](auto t) { + using T = decltype(t); + Attn(hidden_states, hidden_states, d, layer_id); + }; + TM_DISPATCH_PRIMARY_DTYPES(hidden_states.dtype(), invoke); + ResidualScaleNorm(hidden_states, + residual, + hidden_states, + block->lambda_1, + block->attention->wo->bias, + block->norm2.get(), + config_.norm_type); + + Mlp(hidden_states, hidden_states, layer_id); + + const bool is_last_layer = layer_id + 1 == cfg.depth; + ResidualScaleNorm(hidden_states, + residual, + hidden_states, + block->lambda_2, + block->mlp_fc2->bias, + is_last_layer ? nullptr : weights_.block(layer_id + 1)->norm1.get(), + is_last_layer ? NormType::kNone : config_.norm_type); + } + + Tensor image_embeds = Projector(residual, d); + EnsureFloatDtype(image_embeds, engine_data_type_); + + args.produce("multimodal", + MultiModalEmbeddingData{image_embeds, d.image_embeds_coords, d.input_embeds_coords}.buf()); + } +}; + +InternVit::InternVit(const EngineParam& engine, const Context& ctx, const InternVitWeight& weights, int phases): + impl_{std::make_unique(engine, ctx, weights, phases)} +{ +} + +InternVit::~InternVit() = default; + +void InternVit::Run(BatchOp op, int phase, TensorMap& env) +{ + switch (op) { + case BatchOp::kAdd: + return impl_->Add(phase, env); + case BatchOp::kSetup: + return impl_->Setup(phase, env); + case BatchOp::kForward: + return impl_->Forward(phase, env); + default: + return; + } +} + +} // namespace turbomind diff --git a/src/turbomind/models/internvit/internvit.h b/src/turbomind/models/internvit/internvit.h new file mode 100644 index 0000000000..5d5ba46b9e --- /dev/null +++ b/src/turbomind/models/internvit/internvit.h @@ -0,0 +1,26 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/models/vision_model.h" + +#include + +namespace turbomind { + +class InternVitWeight; + +class InternVit: public VisionModel { +public: + InternVit(const EngineParam& engine, const Context& ctx, const InternVitWeight& weights, int phases); + + ~InternVit() override; + + void Run(BatchOp op, int phase, TensorMap& env) override; + +private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/internvit/internvit_block_weight.cc b/src/turbomind/models/internvit/internvit_block_weight.cc new file mode 100644 index 0000000000..ec92ed1cbc --- /dev/null +++ b/src/turbomind/models/internvit/internvit_block_weight.cc @@ -0,0 +1,31 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/internvit/internvit_block_weight.h" + +#include "src/turbomind/core/registry.h" +#include "src/turbomind/models/linear_weight.h" +#include "src/turbomind/utils/memory_utils.h" + +namespace turbomind { + +void InternVitBlockWeight::prepare() +{ + for_each_child([](const char* /*name*/, core::Module* child) { + if (child) { + child->prepare(); + } + }); + + if (lambda_1) { + EnsureFloatDtype(lambda_1, data_type); + } + if (lambda_2) { + EnsureFloatDtype(lambda_2, data_type); + } +} + +TM_MODULE_REGISTER(InternVitBlockWeight, core::InternVitBlockConfig); + +TM_MODULE_METHODS(InternVitBlockWeight, INTERNVIT_BLOCK_CHILDREN, INTERNVIT_BLOCK_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/internvit/internvit_block_weight.h b/src/turbomind/models/internvit/internvit_block_weight.h new file mode 100644 index 0000000000..dff96502df --- /dev/null +++ b/src/turbomind/models/internvit/internvit_block_weight.h @@ -0,0 +1,72 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/models/attention_weight.h" + +namespace turbomind::core { + +struct InternVitBlockConfig: ModuleConfig { + InternVitBlockConfig(): ModuleConfig{"InternVitBlockWeight"} {} + +#define INTERNVIT_BLOCK_FIELDS(X) \ + X(DataType, data_type) \ + X(int, hidden_dim) \ + X(int, head_num) \ + X(int, intermediate_size) \ + X(float, norm_eps, 1e-6f) + + INTERNVIT_BLOCK_FIELDS(TM_MEMBER) + TM_FOR_EACH(InternVitBlockConfig, INTERNVIT_BLOCK_FIELDS) + +#undef INTERNVIT_BLOCK_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +class LinearWeight; + +class InternVitBlockWeight: public core::Module { +public: + const char* type() const override + { + return "InternVitBlockWeight"; + } + + InternVitBlockWeight() = default; + explicit InternVitBlockWeight(const core::InternVitBlockConfig& cfg): + data_type{cfg.data_type}, + hidden_dim{cfg.hidden_dim}, + head_num{cfg.head_num}, + intermediate_size{cfg.intermediate_size}, + norm_eps{cfg.norm_eps} + { + } + + void prepare() override; + +#define INTERNVIT_BLOCK_CHILDREN(X) \ + X(core::Module, norm1) \ + X(core::Module, norm2) \ + X(AttentionWeight, attention) \ + X(LinearWeight, mlp_fc1) \ + X(LinearWeight, mlp_fc2) + +#define INTERNVIT_BLOCK_PARAMS(X) \ + X(lambda_1) \ + X(lambda_2) + + TM_MODULE_DECLARE(InternVitBlockWeight, INTERNVIT_BLOCK_CHILDREN, INTERNVIT_BLOCK_PARAMS) + + DataType data_type{}; + int hidden_dim{}; + int head_num{}; + int intermediate_size{}; + float norm_eps{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/internvit/internvit_input.h b/src/turbomind/models/internvit/internvit_input.h new file mode 100644 index 0000000000..34f50a72b0 --- /dev/null +++ b/src/turbomind/models/internvit/internvit_input.h @@ -0,0 +1,37 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/engine/multimodal_input.h" + +#include +#include + +namespace turbomind { +namespace multimodal { + +struct InternVitItem { + Modality modality; + Tensor data; + int token_begin; + int token_end; + + InternVitItem() = default; + + InternVitItem(Modality modality, Tensor data, int token_begin, int token_end): + modality{modality}, data{std::move(data)}, token_begin{token_begin}, token_end{token_end} + { + } +}; + +struct InternVitInput final: Input { + std::vector items; + + InternVitInput() = default; + + explicit InternVitInput(std::vector items): items{std::move(items)} {} +}; + +} // namespace multimodal +} // namespace turbomind diff --git a/src/turbomind/models/internvit/internvit_kernels.cu b/src/turbomind/models/internvit/internvit_kernels.cu new file mode 100644 index 0000000000..4b85ad0cfb --- /dev/null +++ b/src/turbomind/models/internvit/internvit_kernels.cu @@ -0,0 +1,700 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/internvit/internvit_kernels.h" + +#include "src/turbomind/core/check.h" +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/utils/cuda_utils.h" + +#include "cub/block/block_reduce.cuh" + +#include +#include + +#include + +namespace turbomind { + +namespace { + +struct SumPair { + float s{}; + float sq{}; + + __device__ SumPair operator+(const SumPair& other) const + { + return {s + other.s, sq + other.sq}; + } +}; + +template +__global__ void patchIm2ColKernel(T* out, + const T* input, + int channels, + int image_h, + int image_w, + int patch_h, + int patch_w, + int grid_w, + int patch_area, + int patch_in_dim, + int num_patches) +{ + const int batch = blockIdx.x; + const int patch = blockIdx.y; + const int row = batch * num_patches + patch; + const int ph = patch / grid_w; + const int pw = patch - ph * grid_w; + + for (int k = threadIdx.x; k < patch_in_dim; k += blockDim.x) { + const int c = k / patch_area; + const int rem = k - c * patch_area; + const int ih = rem / patch_w; + const int iw = rem - ih * patch_w; + + const int64_t src = + ((int64_t)batch * channels + c) * image_h * image_w + (ph * patch_h + ih) * image_w + pw * patch_w + iw; + out[(int64_t)row * patch_in_dim + k] = input[src]; + } +} + +template +__global__ void addEmbeddingsVecKernel(T* out, + const T* patch, + const T* patch_bias, + const T* cls_token, + const T* pos, + int seq_len, + int num_patches, + int tiles) +{ + const int vec_id = blockIdx.x * blockDim.x + threadIdx.x; + const int token = blockIdx.y; + const int batch = blockIdx.z; + if (vec_id >= tiles) { + return; + } + + Array pos_vec; + Array out_vec; + Array bias_vec{}; + Load(pos_vec, pos + (token * tiles + vec_id) * vec_size); + + if (token == 0) { + Load(out_vec, cls_token + vec_id * vec_size); + } + else { + Load(out_vec, patch + (((int64_t)batch * num_patches + token - 1) * tiles + vec_id) * vec_size); + if (patch_bias) { + Load(bias_vec, patch_bias + vec_id * vec_size); + } + } + + using namespace ops; + Store(out + (((int64_t)batch * seq_len + token) * tiles + vec_id) * vec_size, + cast(cast(out_vec) + cast(bias_vec) + cast(pos_vec))); +} + +template +__global__ void preRMSNormKernel(float* sums, const T* qkv, int token_num, int local_dim, int qkv_dim) +{ + const int token = blockIdx.x; + const int part = blockIdx.y; // 0: q, 1: k + const int base = token * qkv_dim + part * local_dim; + + using namespace ops; + float sum = 0.f; + for (int d = threadIdx.x * vec_size; d < local_dim; d += block_dim * vec_size) { + Array qkv_vec; + Load(qkv_vec, qkv + base + d); + + const auto qkv_float = cast(qkv_vec); + const auto sq = qkv_float * qkv_float; + + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += sq[i]; + } + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + sum = BlockReduce{temp_storage}.Sum(sum); + + if (threadIdx.x == 0) { + sums[part * token_num + token] = sum; + } +} + +template +__global__ void postRMSNormKernel(T* qkv, + const float* sums, + const T* q_weight, + const T* k_weight, + int token_num, + int local_dim, + int qkv_dim, + int hidden_dim, + int tiles, + float eps) +{ + const int token = blockIdx.x; + const int tile_block = blockIdx.y; + const int part = blockIdx.z; + const int vec_id = tile_block * blockDim.x + threadIdx.x; + if (vec_id >= tiles) { + return; + } + + const int d = vec_id * vec_size; + const int base = token * qkv_dim + part * local_dim + d; + + Array qkv_vec; + Array weight_vec; + Load(qkv_vec, qkv + base); + Ldg(weight_vec, (part == 0 ? q_weight : k_weight) + d); + + const float inv = rsqrtf(sums[part * token_num + token] / hidden_dim + eps); + + using namespace ops; + Store(qkv + base, cast((cast(qkv_vec) * cast(weight_vec)) * inv)); +} + +template +__global__ void prepareQKVKernel(T* kv, const T* qkv, int token_num, int local_head_num) +{ + static_assert(head_dim % vec_size == 0); + constexpr int kVecPerHead = head_dim / vec_size; + constexpr int kHeadsPerWarp = WARP_SIZE / kVecPerHead; + static_assert(kVecPerHead * kHeadsPerWarp == WARP_SIZE); + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x - warp_id * WARP_SIZE; + const int head_slot = lane_id / kVecPerHead; + const int vec_id = lane_id - head_slot * kVecPerHead; + + const int token = blockIdx.x * warps_per_block + warp_id; + if (token >= token_num) { + return; + } + + const int head_group = blockIdx.y; + const int part = blockIdx.z; + const int head = head_group * kHeadsPerWarp + head_slot; + if (head >= local_head_num) { + return; + } + + const int local_dim = local_head_num * head_dim; + const int qkv_dim = 3 * local_dim; + const int offset = vec_id * vec_size; + const int src_offset = (part == 0 ? local_dim : 2 * local_dim) + head * head_dim + offset; + const int64_t dst = (((int64_t)head * 2 + part) * token_num + token) * head_dim + offset; + + Array qkv_vec; + Load(qkv_vec, qkv + token * qkv_dim + src_offset); + Store(kv + dst, qkv_vec); +} + +// residual <- residual + (branch_output + optional branch_bias) * branch_scale +template +__global__ void residualScaleKernel(T* __restrict__ residual, + const T* __restrict__ branch_output, + const T* __restrict__ branch_scale, + const T* __restrict__ branch_bias, + int hidden_dim, + int tiles) +{ + const int token = blockIdx.x; + const int vec_id = blockIdx.y * blockDim.x + threadIdx.x; + if (vec_id >= tiles) { + return; + } + + const int d = vec_id * vec_size; + residual += (int64_t)token * hidden_dim + d; + branch_output += (int64_t)token * hidden_dim + d; + branch_scale += d; + if (branch_bias) { + branch_bias += d; + } + + Array residual_vec; + Array branch_vec; + Array branch_scale_vec; + Array branch_bias_vec{}; + + Load(residual_vec, residual); + Load(branch_vec, branch_output); + Ldg(branch_scale_vec, branch_scale); + if (branch_bias) { + Ldg(branch_bias_vec, branch_bias); + } + + using namespace ops; + Store( + residual, + cast(cast(residual_vec) + cast(branch_vec + branch_bias_vec) * cast(branch_scale_vec))); +} + +// residual <- residual + (branch_output + optional branch_bias) * branch_scale +// hidden_states <- LayerNorm(residual) * norm_weight + optional norm_bias +template +__global__ void residualScaleLayerNormKernel(T* hidden_states, + T* residual, + const T* branch_output, + const T* branch_scale, + const T* branch_bias, + const T* norm_weight, + const T* norm_bias, + int hidden_dim, + float eps) +{ + const int token = blockIdx.x; + const int di = threadIdx.x * vec_size; + + residual += (int64_t)token * hidden_dim; + branch_output += (int64_t)token * hidden_dim; + hidden_states += (int64_t)token * hidden_dim; + + Array sum_v{}; + Array sq_v{}; + Array residual_vec; + Array branch_vec; + Array branch_scale_vec; + Array branch_bias_vec{}; + + using namespace ops; + for (int i = di; i < hidden_dim; i += block_dim * vec_size) { + Load(residual_vec, residual + i); + Load(branch_vec, branch_output + i); + Ldg(branch_scale_vec, branch_scale + i); + if (branch_bias) { + Ldg(branch_bias_vec, branch_bias + i); + } + + residual_vec = cast(cast(residual_vec) + + cast(branch_vec + branch_bias_vec) * cast(branch_scale_vec)); + Store(residual + i, residual_vec); + + const auto residual_float = cast(residual_vec); + sum_v = sum_v + residual_float; + sq_v = sq_v + residual_float * residual_float; + } + + SumPair pair{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + pair.s += sum_v[i]; + pair.sq += sq_v[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + pair = BlockReduce{temp_storage}.Sum(pair); + + __shared__ float shared_mean; + __shared__ float shared_inv_std; + + if (threadIdx.x == 0) { + const float inv_dim = 1.f / hidden_dim; + const float mean = pair.s * inv_dim; + const float var = fmaxf(pair.sq * inv_dim - mean * mean, 0.f); + shared_mean = mean; + shared_inv_std = rsqrtf(var + eps); + } + + __syncthreads(); + + const float mean = shared_mean; + const float inv_std = shared_inv_std; + + Array weight_vec; + Array bias_vec{}; + for (int i = di; i < hidden_dim; i += block_dim * vec_size) { + Load(residual_vec, residual + i); + Ldg(weight_vec, norm_weight + i); + if (norm_bias) { + Ldg(bias_vec, norm_bias + i); + } + + Array out_vec; + PRAGMA_UNROLL + for (int j = 0; j < vec_size; ++j) { + out_vec[j] = (static_cast(residual_vec[j]) - mean) * inv_std * static_cast(weight_vec[j]) + + static_cast(bias_vec[j]); + } + Store(hidden_states + i, cast(out_vec)); + } +} + +// residual <- residual + (branch_output + optional branch_bias) * branch_scale +// hidden_states <- RMSNorm(residual) * norm_weight +template +__global__ void residualScaleRMSNormKernel(T* hidden_states, + T* residual, + const T* branch_output, + const T* branch_scale, + const T* branch_bias, + const T* norm_weight, + int hidden_dim, + float eps) +{ + const int token = blockIdx.x; + const int di = threadIdx.x * vec_size; + + residual += (int64_t)token * hidden_dim; + branch_output += (int64_t)token * hidden_dim; + hidden_states += (int64_t)token * hidden_dim; + + Array sq_v{}; + Array residual_vec; + Array branch_vec; + Array branch_scale_vec; + Array branch_bias_vec{}; + + using namespace ops; + for (int i = di; i < hidden_dim; i += block_dim * vec_size) { + Load(residual_vec, residual + i); + Load(branch_vec, branch_output + i); + Ldg(branch_scale_vec, branch_scale + i); + if (branch_bias) { + Ldg(branch_bias_vec, branch_bias + i); + } + + residual_vec = cast(cast(residual_vec) + + cast(branch_vec + branch_bias_vec) * cast(branch_scale_vec)); + Store(residual + i, residual_vec); + + const auto residual_float = cast(residual_vec); + sq_v = sq_v + residual_float * residual_float; + } + + float sum{}; + PRAGMA_UNROLL + for (int i = 0; i < vec_size; ++i) { + sum += sq_v[i]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + sum = BlockReduce{temp_storage}.Sum(sum); + + __shared__ float shared_inv_rms; + if (threadIdx.x == 0) { + shared_inv_rms = rsqrtf(sum / hidden_dim + eps); + } + + __syncthreads(); + + const float inv_rms = shared_inv_rms; + + using namespace ops; + Array weight_vec; + for (int i = di; i < hidden_dim; i += block_dim * vec_size) { + Load(residual_vec, residual + i); + Ldg(weight_vec, norm_weight + i); + Store(hidden_states + i, cast(cast(residual_vec) * inv_rms * cast(weight_vec))); + } +} + +template +__global__ void pixelShuffleKernel(T* __restrict__ out, + const T* __restrict__ hidden, + int grid_size, + int out_grid, + int hidden_dim, + int seq_len, + int tiles) +{ + const int vec_id = blockIdx.x * blockDim.x + threadIdx.x; + if (vec_id >= tiles) { + return; + } + + const int token = blockIdx.y; + const int batch = blockIdx.z; + const int ow = token / out_grid; + const int oh = token - ow * out_grid; + + const int c = vec_id * vec_size; + const int in_token = 1 + (ow * 2) * grid_size + oh * 2; + const int64_t in0 = ((int64_t)batch * seq_len + in_token) * hidden_dim + c; + const int64_t out0 = ((int64_t)batch * out_grid * out_grid + token) * (hidden_dim * 4) + c; + + Array v; + Load(v, hidden + in0); + Store(out + out0, v); + Load(v, hidden + in0 + hidden_dim); + Store(out + out0 + hidden_dim, v); + Load(v, hidden + in0 + (int64_t)grid_size * hidden_dim); + Store(out + out0 + 2 * hidden_dim, v); + Load(v, hidden + in0 + ((int64_t)grid_size + 1) * hidden_dim); + Store(out + out0 + 3 * hidden_dim, v); +} + +} // namespace + +void invokeInternVitPatchify(Tensor& patches, + const Tensor& pixel_values, + int batch_size, + int channels, + int image_h, + int image_w, + int patch_h, + int patch_w, + cudaStream_t stream) +{ + TM_CHECK_EQ(patches.ndim(), 2); + TM_CHECK_EQ(pixel_values.ndim(), 4); + TM_CHECK_EQ(patches.dtype(), pixel_values.dtype()); + + const int grid_h = image_h / patch_h; + const int grid_w = image_w / patch_w; + const int num_patches = grid_h * grid_w; + const int patch_area = patch_h * patch_w; + const int patch_in_dim = channels * patch_area; + + auto invoke = [&](auto t) { + using T = decltype(t); + const dim3 grid(batch_size, num_patches); + const dim3 block(256); + patchIm2ColKernel<<>>((T*)patches.raw_data(), + (const T*)pixel_values.raw_data(), + channels, + image_h, + image_w, + patch_h, + patch_w, + grid_w, + patch_area, + patch_in_dim, + num_patches); + }; + TM_DISPATCH_PRIMARY_DTYPES(patches.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeInternVitAddEmbeddings(Tensor& hidden, + const Tensor& patch_embeds, + const Tensor& patch_bias, + const Tensor& cls_token, + const Tensor& position_embeddings, + int batch_size, + int num_patches, + int hidden_dim, + cudaStream_t stream) +{ + constexpr int Vec = 4; + constexpr int kThreads = 256; + const int seq_len = num_patches + 1; + const int tiles = hidden_dim / Vec; + const dim3 grid((tiles + kThreads - 1) / kThreads, seq_len, batch_size); + + TM_CHECK_EQ(hidden_dim % Vec, 0); + + auto invoke = [&](auto t) { + using T = decltype(t); + addEmbeddingsVecKernel + <<>>((T*)hidden.raw_data(), + (const T*)patch_embeds.raw_data(), + patch_bias ? (const T*)patch_bias.raw_data() : nullptr, + (const T*)cls_token.raw_data(), + (const T*)position_embeddings.raw_data(), + seq_len, + num_patches, + tiles); + }; + TM_DISPATCH_PRIMARY_DTYPES(hidden.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeInternVitPreRMSNorm(Tensor& sums, const Tensor& qkv, int local_dim, cudaStream_t stream) +{ + TM_CHECK_EQ(sums.dtype(), kFloat); + constexpr int kThreads = 64; + const int token_num = qkv.shape(0); + const int qkv_dim = qkv.shape(1); + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int kVecSize = sizeof(uint4) / sizeof(T); + TM_CHECK_EQ(local_dim % kVecSize, 0); + preRMSNormKernel<<>>( + (float*)sums.raw_data(), (const T*)qkv.raw_data(), token_num, local_dim, qkv_dim); + }; + TM_DISPATCH_PRIMARY_DTYPES(qkv.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeInternVitPostRMSNorm(Tensor& qkv, + const Tensor& sums, + const Tensor& q_weight, + const Tensor& k_weight, + int local_dim, + int hidden_dim, + float eps, + cudaStream_t stream) +{ + constexpr int kThreads = 256; + const int token_num = qkv.shape(0); + const int qkv_dim = qkv.shape(1); + + TM_CHECK_EQ(q_weight.size(), local_dim); + TM_CHECK_EQ(k_weight.size(), local_dim); + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int kVecSize = sizeof(uint4) / sizeof(T); + TM_CHECK_EQ(local_dim % kVecSize, 0); + const int tiles = local_dim / kVecSize; + const int tile_blocks = (tiles + kThreads - 1) / kThreads; + const dim3 grid(token_num, tile_blocks, 2); + postRMSNormKernel<<>>((T*)qkv.raw_data(), + (const float*)sums.raw_data(), + (const T*)q_weight.raw_data(), + (const T*)k_weight.raw_data(), + token_num, + local_dim, + qkv_dim, + hidden_dim, + tiles, + eps); + }; + TM_DISPATCH_PRIMARY_DTYPES(qkv.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeInternVitPrepareQKV(Tensor& kv, const Tensor& qkv, int local_head_num, int head_dim, cudaStream_t stream) +{ + auto invoke = [&](auto t) { + using T = decltype(t); + auto dispatch = [&](auto head_dim_c) { + constexpr int kHeadDim = decltype(head_dim_c)::value; + constexpr int kVecSize = sizeof(uint4) / sizeof(T); + constexpr int kVecPerHead = kHeadDim / kVecSize; + constexpr int kHeadsPerWarp = WARP_SIZE / kVecPerHead; + constexpr int kWarpsPerBlock = 4; + static_assert(kVecPerHead * kHeadsPerWarp == WARP_SIZE); + + const int token_num = qkv.shape(0); + const int head_group_num = (local_head_num + kHeadsPerWarp - 1) / kHeadsPerWarp; + const dim3 grid((token_num + kWarpsPerBlock - 1) / kWarpsPerBlock, head_group_num, 2); + const dim3 block(kWarpsPerBlock * WARP_SIZE); + prepareQKVKernel + <<>>((T*)kv.raw_data(), (const T*)qkv.raw_data(), token_num, local_head_num); + }; + + if (head_dim == 64) { + dispatch(std::integral_constant{}); + } + else if (head_dim == 128) { + dispatch(std::integral_constant{}); + } + else { + TM_LOG_FATAL("unsupported InternVit PrepareQKV head_dim: {}", head_dim); + } + }; + TM_DISPATCH_PRIMARY_DTYPES(qkv.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeInternVitResidualScaleNorm(Tensor& hidden_states, + Tensor& residual, + const Tensor& branch_output, + const Tensor& branch_scale, + const Tensor& branch_bias, + const Tensor& norm_weight, + const Tensor& norm_bias, + float eps, + NormType norm_type, + cudaStream_t stream) +{ + TM_CHECK_EQ(residual.ndim(), 2); + TM_CHECK_EQ(branch_output.size(), residual.size()); + TM_CHECK_EQ(branch_output.dtype(), residual.dtype()); + + const int hidden_dim = residual.shape(1); + const int token_num = residual.shape(0); + + TM_CHECK_EQ(branch_scale.size(), hidden_dim); + + if (norm_type != NormType::kNone) { + TM_CHECK(norm_weight); + TM_CHECK_EQ(norm_weight.size(), hidden_dim); + } + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int kNormThreads = 512; + constexpr int kVecSize = sizeof(uint4) / sizeof(T); + TM_CHECK_EQ(hidden_dim % kVecSize, 0); + + if (norm_type == NormType::kNone) { + const int kThreads = hidden_dim <= 1024 ? 128 : 256; + const int tiles = hidden_dim / kVecSize; + const dim3 grid(token_num, (tiles + kThreads - 1) / kThreads); + residualScaleKernel + <<>>((T*)residual.raw_data(), + (const T*)branch_output.raw_data(), + (const T*)branch_scale.raw_data(), + branch_bias ? (const T*)branch_bias.raw_data() : nullptr, + hidden_dim, + tiles); + } + else if (norm_type == NormType::kLayerNorm) { + residualScaleLayerNormKernel + <<>>((T*)hidden_states.raw_data(), + (T*)residual.raw_data(), + (const T*)branch_output.raw_data(), + (const T*)branch_scale.raw_data(), + branch_bias ? (const T*)branch_bias.raw_data() : nullptr, + (const T*)norm_weight.raw_data(), + norm_bias ? (const T*)norm_bias.raw_data() : nullptr, + hidden_dim, + eps); + } + else if (norm_type == NormType::kRMSNorm) { + residualScaleRMSNormKernel + <<>>((T*)hidden_states.raw_data(), + (T*)residual.raw_data(), + (const T*)branch_output.raw_data(), + (const T*)branch_scale.raw_data(), + branch_bias ? (const T*)branch_bias.raw_data() : nullptr, + (const T*)norm_weight.raw_data(), + hidden_dim, + eps); + } + else { + TM_LOG_FATAL("unsupported InternVit residual norm type: {}", (int)norm_type); + } + }; + TM_DISPATCH_PRIMARY_DTYPES(residual.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeInternVitPixelShuffle(Tensor& output, const Tensor& hidden, int grid_size, cudaStream_t stream) +{ + // kVecSize=4 was faster than 16-byte vectors in pixel-shuffle benchmarks. + constexpr int kVecSize = 4; + constexpr int kThreads = 128; + + const int out_grid = grid_size / 2; + const int seq_len = grid_size * grid_size + 1; + const int hidden_dim = hidden.shape(1); + const int batch_size = output.shape(0) / (out_grid * out_grid); + const int tiles = hidden_dim / kVecSize; + + auto invoke = [&](auto t) { + using T = decltype(t); + const dim3 grid((tiles + kThreads - 1) / kThreads, out_grid * out_grid, batch_size); + pixelShuffleKernel<<>>( + (T*)output.raw_data(), (const T*)hidden.raw_data(), grid_size, out_grid, hidden_dim, seq_len, tiles); + }; + TM_DISPATCH_PRIMARY_DTYPES(output.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace turbomind diff --git a/src/turbomind/models/internvit/internvit_kernels.h b/src/turbomind/models/internvit/internvit_kernels.h new file mode 100644 index 0000000000..d90c0da02b --- /dev/null +++ b/src/turbomind/models/internvit/internvit_kernels.h @@ -0,0 +1,58 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/kernels/norm/norm.h" + +#include + +namespace turbomind { + +void invokeInternVitPatchify(Tensor& patches, + const Tensor& pixel_values, + int batch_size, + int channels, + int image_h, + int image_w, + int patch_h, + int patch_w, + cudaStream_t stream); + +void invokeInternVitAddEmbeddings(Tensor& hidden, + const Tensor& patch_embeds, + const Tensor& patch_bias, + const Tensor& cls_token, + const Tensor& position_embeddings, + int batch_size, + int num_patches, + int hidden_dim, + cudaStream_t stream); + +void invokeInternVitPreRMSNorm(Tensor& sums, const Tensor& qkv, int local_dim, cudaStream_t stream); + +void invokeInternVitPostRMSNorm(Tensor& qkv, + const Tensor& sums, + const Tensor& q_weight, + const Tensor& k_weight, + int local_dim, + int hidden_dim, + float eps, + cudaStream_t stream); + +void invokeInternVitPrepareQKV(Tensor& kv, const Tensor& qkv, int local_head_num, int head_dim, cudaStream_t stream); + +void invokeInternVitResidualScaleNorm(Tensor& hidden_states, + Tensor& residual, + const Tensor& branch_output, + const Tensor& branch_scale, + const Tensor& branch_bias, + const Tensor& norm_weight, + const Tensor& norm_bias, + float eps, + NormType norm_type, + cudaStream_t stream); + +void invokeInternVitPixelShuffle(Tensor& output, const Tensor& hidden, int grid_size, cudaStream_t stream); + +} // namespace turbomind diff --git a/src/turbomind/models/internvit/internvit_weight.cc b/src/turbomind/models/internvit/internvit_weight.cc new file mode 100644 index 0000000000..94a29db2ab --- /dev/null +++ b/src/turbomind/models/internvit/internvit_weight.cc @@ -0,0 +1,64 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/internvit/internvit_weight.h" + +#include "src/turbomind/core/registry.h" +#include "src/turbomind/models/internvit/internvit_block_weight.h" +#include "src/turbomind/models/layer_norm_weight.h" +#include "src/turbomind/models/linear_weight.h" +#include "src/turbomind/utils/memory_utils.h" + +namespace turbomind { + +InternVitWeight::InternVitWeight(const core::InternVitConfig& cfg): config_{cfg} {} + +void InternVitWeight::prepare() +{ + for_each_child([](const char* /*name*/, core::Module* child) { + if (child) { + child->prepare(); + } + }); + + if (cls_token) { + EnsureFloatDtype(cls_token, config_.data_type); + } + if (position_embeddings) { + EnsureFloatDtype(position_embeddings, config_.data_type); + } +} + +bool InternVitWeight::verify(std::vector& missing) +{ + core::Module::verify(missing); + if (!patch_embed) { + missing.push_back(full_path() + ": missing patch_embed"); + } + if (!cls_token) { + missing.push_back(full_path() + ": missing cls_token"); + } + if (!position_embeddings) { + missing.push_back(full_path() + ": missing position_embeddings"); + } + if (!blocks || blocks->size() != config_.depth) { + missing.push_back(full_path() + ": blocks count mismatch (expected " + std::to_string(config_.depth) + ")"); + } + if (!projector_norm || !projector_fc1 || !projector_fc2) { + missing.push_back(full_path() + ": missing projector"); + } + return missing.empty(); +} + +InternVitBlockWeight* InternVitWeight::block(int i) const +{ + if (!blocks) { + return nullptr; + } + return static_cast(blocks->child(std::to_string(i))); +} + +TM_MODULE_REGISTER(InternVitWeight, core::InternVitConfig); + +TM_MODULE_METHODS(InternVitWeight, INTERNVIT_WEIGHT_CHILDREN, INTERNVIT_WEIGHT_PARAMS) + +} // namespace turbomind diff --git a/src/turbomind/models/internvit/internvit_weight.h b/src/turbomind/models/internvit/internvit_weight.h new file mode 100644 index 0000000000..98d064a149 --- /dev/null +++ b/src/turbomind/models/internvit/internvit_weight.h @@ -0,0 +1,94 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/kernels/norm/norm.h" +#include "src/turbomind/models/vision_model_weight.h" + +#include + +namespace turbomind::core { + +struct InternVitConfig: ModuleConfig { + InternVitConfig(): ModuleConfig{"InternVitWeight"} {} + + DataType data_type{}; + int hidden_dim{0}; + int depth{0}; + int patch_in_dim{0}; + int in_channels{0}; + int image_height{0}; + int image_width{0}; + int patch_height{0}; + int patch_width{0}; + int num_patches{0}; + int image_seq_length{0}; + NormType norm_type{NormType::kRMSNorm}; + +#define INTERNVIT_FIELDS(X) \ + X(DataType, data_type) \ + X(int, hidden_dim) \ + X(int, depth) \ + X(int, patch_in_dim) \ + X(int, in_channels) \ + X(int, image_height) \ + X(int, image_width) \ + X(int, patch_height) \ + X(int, patch_width) \ + X(int, num_patches) \ + X(int, image_seq_length) \ + X(NormType, norm_type, NormType::kRMSNorm) + + TM_FOR_EACH(InternVitConfig, INTERNVIT_FIELDS) + +#undef INTERNVIT_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +class InternVitBlockWeight; +class LayerNormWeight; +class LinearWeight; + +class InternVitWeight: public VisionModelWeight { +public: + const char* type() const override + { + return "InternVitWeight"; + } + + InternVitWeight() = default; + explicit InternVitWeight(const core::InternVitConfig& cfg); + + void prepare() override; + bool verify(std::vector& missing) override; + +#define INTERNVIT_WEIGHT_CHILDREN(X) \ + X(LinearWeight, patch_embed) \ + X(core::ModuleList, blocks) \ + X(LayerNormWeight, projector_norm) \ + X(LinearWeight, projector_fc1) \ + X(LinearWeight, projector_fc2) + +#define INTERNVIT_WEIGHT_PARAMS(X) \ + X(cls_token) \ + X(position_embeddings) + + TM_MODULE_DECLARE(InternVitWeight, INTERNVIT_WEIGHT_CHILDREN, INTERNVIT_WEIGHT_PARAMS) + + const core::InternVitConfig& config() const noexcept + { + return config_; + } + + InternVitBlockWeight* block(int i) const; + +private: + core::InternVitConfig config_{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/llama/llama_rope.h b/src/turbomind/models/llama/llama_rope.h index 32a697204e..ec45bc103e 100644 --- a/src/turbomind/models/llama/llama_rope.h +++ b/src/turbomind/models/llama/llama_rope.h @@ -39,9 +39,9 @@ struct Llama3RopeKernelParam { struct MropeRopeKernelParam { int3 section; - int stride{}; int* position_ids{}; int* position_delta{}; + int* position_offsets{}; int* length{}; }; diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index cc0e529eb4..2f07791bce 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -68,8 +68,9 @@ struct AttentionData { Buffer_ rope_base; - Tensor_ mrope_position_ids; + Buffer_ mrope_position_ids; Buffer_ mrope_position_delta; + Buffer_ mrope_position_offsets; Buffer_ mrope_length; // borrowed from env @@ -136,10 +137,8 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(int quant rope_base_buf_ = {bsz + 1, kCPUpinned}; } if (rope_param_.mrope_mode != MropeMode::kNone) { - // mrope device buffers are allocated lazily — borrowed from env when the vision encoder - // produced them, or owned (allocated in legacy_mrope_setup) when only r.inputs supplies them. - mrope_position_delta_buf_ = {bsz, kCPUpinned}; - mrope_length_buf_ = {bsz, kCPUpinned}; + mrope_default_buf_ = Buffer_{std::max(bsz, 3), kDEVICE}; + Clear(mrope_default_buf_); } const int max_blocks = bsz * cdiv(engine.session_len, engine_param_.cache_block_seq_len); for (int i = 0; i < phases; ++i) { @@ -277,45 +276,26 @@ void UnifiedAttentionLayer::Setup(int phase, TensorMap& env) copy(rope_base_buf_, bsz, d.rope_base); } if (rope_param_.mrope_mode != MropeMode::kNone) { - // mrope tensors can come from two sources: - // 1. env: the C++ vision encoder produced device tensors in the exact layout - // FastRoPE expects — borrow them with no copy. - // 2. r.inputs: legacy Python-preprocessor path, per-request shaped (length, 3) + - // scalar delta. Falls back here when env did not produce mrope. - if (env.try_("mrope_length")) { - d.mrope_length = env.at("mrope_length").buffer().borrow(); - d.mrope_position_delta = env.at("mrope_position_delta").buffer().borrow(); - d.mrope_position_ids = env.at("mrope_position_ids").borrow(); + auto* mrope_length = env.try_("mrope_length"); + auto* mrope_position_delta = env.try_("mrope_position_delta"); + auto* mrope_position_offsets = env.try_("mrope_position_offsets"); + auto* mrope_position_ids = env.try_("mrope_position_ids"); + if (mrope_length || mrope_position_delta || mrope_position_offsets || mrope_position_ids) { + TM_CHECK(mrope_length) << "MRoPE requires native vision-produced mrope_length"; + TM_CHECK(mrope_position_delta) << "MRoPE requires native vision-produced mrope_position_delta"; + TM_CHECK(mrope_position_offsets) << "MRoPE requires native vision-produced mrope_position_offsets"; + TM_CHECK(mrope_position_ids) << "MRoPE requires native vision-produced mrope_position_ids"; + + d.mrope_length = mrope_length->buffer().borrow(); + d.mrope_position_delta = mrope_position_delta->buffer().borrow(); + d.mrope_position_offsets = mrope_position_offsets->buffer().borrow(); + d.mrope_position_ids = mrope_position_ids->buffer().borrow(); } else { - // Legacy r.inputs path. Lazily allocate owned device buffers on first hit. - if (!d.mrope_position_ids) { - /// TODO: total space for `mrope_position_ids` can be reduced to (max_fwd_tokens, 3) - d.mrope_position_ids = - Tensor_{{engine_param_.max_batch_size, engine_param_.session_len, 3}, kDEVICE}; - d.mrope_position_delta = Buffer_{engine_param_.max_batch_size, kDEVICE}; - d.mrope_length = Buffer_{engine_param_.max_batch_size, kDEVICE}; - } - const auto stride = d.mrope_position_ids.stride(0); - for (int i = 0; i < rc.size(); ++i) { - auto& c = *rc[i]; - auto& r = *c.req; - if (auto pos_ids = r.inputs.try_("mrope_position_ids")) { - int length = pos_ids->shape(0); - mrope_length_buf_[i] = length; - mrope_position_delta_buf_[i] = *r.inputs.at("mrope_position_delta").data(); - if (auto o = Interval{0, length} & Interval{c.history_len + c.alpha, Interval::Size{c.input_len}}) { - copy(pos_ids->data() + o.begin() * 3, - (int)o.size() * 3, - d.mrope_position_ids.data() + i * stride + o.begin() * 3); - } - } - else { - mrope_length_buf_[i] = mrope_position_delta_buf_[i] = 0; - } - } - copy(mrope_length_buf_, rc.size(), d.mrope_length); - copy(mrope_position_delta_buf_, rc.size(), d.mrope_position_delta); + d.mrope_length = mrope_default_buf_.borrow(); + d.mrope_position_delta = mrope_default_buf_.borrow(); + d.mrope_position_offsets = mrope_default_buf_.borrow(); + d.mrope_position_ids = mrope_default_buf_.borrow(); } } } @@ -522,11 +502,10 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, params.rope_param.base = d.rope_base.data() + offset; } if (rope_param_.mrope_mode != MropeMode::kNone) { - params.rope_param.mrope.position_delta = d.mrope_position_delta.data() + offset; - params.rope_param.mrope.length = d.mrope_length.data() + offset; - params.rope_param.mrope.stride = d.mrope_position_ids.stride(0); - params.rope_param.mrope.position_ids = - d.mrope_position_ids.data() + offset * params.rope_param.mrope.stride; + params.rope_param.mrope.position_delta = d.mrope_position_delta.data() + offset; + params.rope_param.mrope.position_offsets = d.mrope_position_offsets.data() + offset; + params.rope_param.mrope.length = d.mrope_length.data() + offset; + params.rope_param.mrope.position_ids = d.mrope_position_ids.data(); } // logn attn diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index 79c20d3115..ce2926245d 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -110,8 +110,7 @@ class UnifiedAttentionLayer { Tensor tmp_attn_; Buffer_ rope_base_buf_; - Buffer_ mrope_position_delta_buf_; - Buffer_ mrope_length_buf_; + Buffer_ mrope_default_buf_; CpPostContext cp_fn_ctx_; // context parallel }; diff --git a/src/turbomind/models/qwen3_5vit/bias_gelu.cu b/src/turbomind/models/qwen3_5vit/bias_gelu.cu deleted file mode 100644 index 3d6f9813a7..0000000000 --- a/src/turbomind/models/qwen3_5vit/bias_gelu.cu +++ /dev/null @@ -1,106 +0,0 @@ -#include "src/turbomind/models/qwen3_5vit/bias_gelu.h" - -#include "src/turbomind/core/logger.h" -#include "src/turbomind/kernels/activation_ops.h" -#include "src/turbomind/kernels/core/array_ops.h" -#include "src/turbomind/kernels/core/common.h" - -#include - -namespace turbomind { - -namespace { - -template class Activation> -__global__ void biasActivationKernel(T* data, const T* __restrict__ bias, int64_t stride, int num, int dim) -{ - const int ti = blockIdx.x; - const int di = (threadIdx.x + blockIdx.y * blockDim.x) * vec_size; - - if (ti >= num || di >= dim) { - return; - } - - Array x_vec; - Load(x_vec, data + ti * stride + di); - - auto x = cast(x_vec); - - if (bias) { - Array bias_vec; - Ldg(bias_vec, bias + di); - using namespace ops; - x = x + cast(bias_vec); - } - - PRAGMA_UNROLL - for (int i = 0; i < vec_size; ++i) { - x[i] = Activation::apply(x[i]); - } - - Store(data + ti * stride + di, cast(x)); -} - -} // namespace - -void invokeQwen3_5VitBiasActivation(Tensor& x, const Tensor& bias, ActivationType type, cudaStream_t stream) -{ - if (x.size() == 0) { - return; - } - - TM_CHECK_EQ(x.ndim(), 2); - if (bias) { - TM_CHECK_EQ(bias.shape(-1), x.shape(-1)); - TM_CHECK_EQ(bias.dtype(), x.dtype()); - } - - auto invoke = [&](auto t) { - using T = decltype(t); - constexpr int max_vec = sizeof(uint4) / sizeof(T); - constexpr int threads = 512; - - const int num = x.shape(0); - const int dim = x.shape(1); - const int64_t stride = x.stride(0); - - int best_vec_size = 1; - for (int v = max_vec; v >= 1; v >>= 1) { - if (dim % v == 0 && stride % v == 0) { - best_vec_size = v; - break; - } - } - - auto launch = [&](auto vec_size_) { - constexpr int vec_size = decltype(vec_size_)::value; - const dim3 grid(num, cdiv(dim, threads * vec_size)); - if (type == ActivationType::kGeluPytorchTanh) { - biasActivationKernel - <<>>(x.data(), bias.data_or((T*)nullptr), stride, num, dim); - } - else if (type == ActivationType::kGelu) { - biasActivationKernel - <<>>(x.data(), bias.data_or((T*)nullptr), stride, num, dim); - } - else { - TM_LOG_FATAL("unsupported Qwen3.5 ViT bias activation type: {}", (int)type); - } - }; - - switch (best_vec_size) { - case 8: - return launch(std::integral_constant{}); - case 4: - return launch(std::integral_constant{}); - case 2: - return launch(std::integral_constant{}); - default: - return launch(std::integral_constant{}); - } - }; - - TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke); -} - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/bias_gelu.h b/src/turbomind/models/qwen3_5vit/bias_gelu.h deleted file mode 100644 index 9563c6c2ad..0000000000 --- a/src/turbomind/models/qwen3_5vit/bias_gelu.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include "src/turbomind/core/core.h" -#include "src/turbomind/kernels/activation.h" - -#include - -namespace turbomind { - -// In-place Qwen3.5 ViT bias + unary activation: -// x <- activation(x + bias) -void invokeQwen3_5VitBiasActivation(Tensor& x, const Tensor& bias, ActivationType type, cudaStream_t stream); - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/fast_pos_embed.cu b/src/turbomind/models/qwen3_5vit/fast_pos_embed.cu deleted file mode 100644 index 020d825a84..0000000000 --- a/src/turbomind/models/qwen3_5vit/fast_pos_embed.cu +++ /dev/null @@ -1,137 +0,0 @@ -#include "src/turbomind/models/qwen3_5vit/fast_pos_embed.h" - -#include "src/turbomind/core/data_type.h" -#include "src/turbomind/core/logger.h" -#include "src/turbomind/kernels/core/array.h" -#include "src/turbomind/kernels/core/array_ops.h" - -#include -#include - -namespace turbomind { - -namespace { - -template -__device__ inline T from_float(float x); - -template<> -__device__ inline half from_float(float x) -{ - return __float2half(x); -} - -#ifdef ENABLE_BF16 -template<> -__device__ inline __nv_bfloat16 from_float<__nv_bfloat16>(float x) -{ - return __float2bfloat16(x); -} -#endif - -// `num_grids` is tiny (usually 1..a few) so a linear scan is fine. -__device__ inline int find_grid(const int* offsets, int num_grids, int pos) -{ - int g = 0; - for (int i = 1; i < num_grids; ++i) { - if (offsets[i * 2 + 1] <= pos) { - g = i; - } - else { - break; - } - } - return g; -} - -template -__global__ void fastPosEmbedIdxWeightKernel( - int* idx_out, T* weight_out, const int* grid_thws, const int* grid_offsets, int num_grids, int total_n, int G) -{ - const int pos = blockIdx.x * blockDim.x + threadIdx.x; - if (pos >= total_n) { - return; - } - - const int g = find_grid(grid_offsets, num_grids, pos); - const int grid_h = grid_thws[g * 3 + 1]; - const int grid_w = grid_thws[g * 3 + 2]; - const int local = pos - grid_offsets[g * 2 + 1]; - const int i = local / grid_w; - const int j = local % grid_w; - - // torch.linspace(0, G-1, n) uses the halfway-symmetric formulation so - // that both endpoints are exact: - // step = (end - start) / (n - 1) - // halfway = n / 2 - // out[i=hw]= end - step * (n - 1 - i) - // For n == 1 the single element is `start` (== 0 here); the formula - // below collapses to 0 since hw_h == 0 is bypassed via grid_h > 1. - const float end = (float)(G - 1); - const float step_h = (grid_h > 1) ? end / (float)(grid_h - 1) : 0.f; - const float step_w = (grid_w > 1) ? end / (float)(grid_w - 1) : 0.f; - - const int hw_h = grid_h / 2; - const int hw_w = grid_w / 2; - - const float h_val = (grid_h == 1) ? 0.f : ((i < hw_h) ? step_h * (float)i : end - step_h * (float)(grid_h - 1 - i)); - const float w_val = (grid_w == 1) ? 0.f : ((j < hw_w) ? step_w * (float)j : end - step_w * (float)(grid_w - 1 - j)); - - // torch.Tensor.int() truncates toward zero; h_val, w_val are non-negative - // and bounded above by G-1, so (int) cast is in [0, G-1]. - const int h_floor = (int)h_val; - const int w_floor = (int)w_val; - const int h_ceil = min(h_floor + 1, G - 1); - const int w_ceil = min(w_floor + 1, G - 1); - - const float dh = h_val - (float)h_floor; - const float dw = w_val - (float)w_floor; - - const int base_h = h_floor * G; - const int base_h_ceil = h_ceil * G; - - Array idx; - idx[0] = base_h + w_floor; - idx[1] = base_h + w_ceil; - idx[2] = base_h_ceil + w_floor; - idx[3] = base_h_ceil + w_ceil; - - Array weight; - weight[0] = from_float((1.f - dh) * (1.f - dw)); - weight[1] = from_float((1.f - dh) * dw); - weight[2] = from_float(dh * (1.f - dw)); - weight[3] = from_float(dh * dw); - - const int out_base = pos * 4; - Store(idx_out + out_base, idx); - Store(weight_out + out_base, weight); -} - -} // namespace - -void invokeFastPosEmbedIdxWeight(int* idx_out, - void* weight_out, - DataType dtype, - const int* grid_thws, - const int* grid_offsets, - int num_grids, - int total_n, - int num_grid_per_side, - cudaStream_t stream) -{ - if (total_n <= 0 || num_grids <= 0) { - return; - } - const int block = 256; - const int grid = (total_n + block - 1) / block; - - auto invoke = [&](auto t) { - using T = decltype(t); - fastPosEmbedIdxWeightKernel<<>>( - idx_out, (T*)weight_out, grid_thws, grid_offsets, num_grids, total_n, num_grid_per_side); - }; - TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); -} - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/fast_pos_embed.h b/src/turbomind/models/qwen3_5vit/fast_pos_embed.h deleted file mode 100644 index 7949b01323..0000000000 --- a/src/turbomind/models/qwen3_5vit/fast_pos_embed.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include "src/turbomind/core/data_type.h" - -#include - -namespace turbomind { - -// Precomputes the 4 bilinear-interpolation gather indices and weights -// used by the subsequent pos-embed merge step in Qwen3-VL. -void invokeFastPosEmbedIdxWeight(int* idx_out, // [total_n * 4] - void* weight_out, // [total_n * 4] - DataType dtype, - const int* grid_thws, // [num_grids * 3], (t, h, w) - const int* grid_offsets, // [num_grids * 2], (t*h*w, h*w) - int num_grids, - int total_n, - int num_grid_per_side, - cudaStream_t stream); - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.cu b/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.cu deleted file mode 100644 index ee19483cdd..0000000000 --- a/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.cu +++ /dev/null @@ -1,92 +0,0 @@ -#include "src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h" - -#include "src/turbomind/core/logger.h" -#include "src/turbomind/kernels/core/array_ops.h" - -namespace turbomind { - -namespace { - -// `num_grids` is tiny (usually 1..a few) so a linear scan is fine. -__device__ inline int find_grid(const int* offsets, int num_grids, int pos) -{ - int g = 0; - for (int i = 1; i < num_grids; ++i) { - if (offsets[i * 2 + 1] <= pos) { - g = i; - } - else { - break; - } - } - return g; -} - -template -__global__ void fastRotaryPosEmbKernel(T* cos_sin_out, - const int* grid_thws, - const int* grid_offsets, - int num_grids, - int total_hw, - int head_dim, - float scale) // -log2(theta) / (head_dim/4) -{ - const int pair_count = head_dim / 2; // e.g. 36 - const int freq_half = head_dim / 4; // e.g. 18 - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int pos = tid / pair_count; - const int pair_k = tid % pair_count; - if (pos >= total_hw) { - return; - } - - const int g = find_grid(grid_offsets, num_grids, pos); - const int grid_w = grid_thws[g * 3 + 2]; - const int local = pos - grid_offsets[g * 2 + 1]; - const int i = local / grid_w; // h_coord - const int j = local % grid_w; // w_coord - - // Pairs [0, freq_half) rotate in h; pairs [freq_half, 2*freq_half) rotate in w. - const int freq_idx = pair_k % freq_half; - const int coord = (pair_k < freq_half) ? i : j; - const float inv_freq = exp2f((float)freq_idx * scale); - - float c, s; - sincosf((float)coord * inv_freq, &s, &c); - - Array cs{(T)c, (T)s}; - Store(cos_sin_out + (size_t)pos * head_dim + pair_k * 2, cs); -} - -} // namespace - -void invokeQwen3VitRotaryPosEmb(void* cos_sin, - DataType dtype, - const int* grid_thws, - const int* grid_offsets, - int num_grids, - int total_hw, - int head_dim, - float theta, - cudaStream_t stream) -{ - if (total_hw <= 0 || num_grids <= 0 || head_dim <= 0) { - return; - } - TM_CHECK(head_dim % 4 == 0) << "head_dim must be divisible by 4, got " << head_dim; - - const int total = total_hw * (head_dim / 2); - const int block = 256; - const int grid = (total + block - 1) / block; - const float scale = -log2f(theta) / (float)(head_dim / 4); - - auto invoke = [&](auto t) { - using T = decltype(t); - fastRotaryPosEmbKernel - <<>>((T*)cos_sin, grid_thws, grid_offsets, num_grids, total_hw, head_dim, scale); - }; - TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); -} - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h b/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h deleted file mode 100644 index 920e8b6409..0000000000 --- a/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include "src/turbomind/core/data_type.h" - -#include - -namespace turbomind { - -// Precomputes the (cos, sin) rotary-embedding table for Qwen3-VL vision tokens. -// Layout per natural flat position (keyed by the same index `mapped_idx` carries): -// [c_0, s_0, c_1, s_1, ..., c_{head_dim/2-1}, s_{head_dim/2-1}] -// The pair index `k` uses `h_coord` for k < head_dim/4 and `w_coord` otherwise, -// with inv_freq = theta^(-2*(k % (head_dim/4)) / (head_dim/2)). -void invokeQwen3VitRotaryPosEmb(void* cos_sin, // [total_hw, head_dim] - DataType dtype, - const int* grid_thws, // [num_grids * 3], (t, h, w) - const int* grid_offsets, // [num_grids * 2], (t*h*w, h*w) - int num_grids, - int total_hw, - int head_dim, - float theta, - cudaStream_t stream); - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/fused_embed_merge.cu b/src/turbomind/models/qwen3_5vit/fused_embed_merge.cu deleted file mode 100644 index dc62548e9f..0000000000 --- a/src/turbomind/models/qwen3_5vit/fused_embed_merge.cu +++ /dev/null @@ -1,89 +0,0 @@ -#include "src/turbomind/models/qwen3_5vit/fused_embed_merge.h" - -#include "src/turbomind/core/logger.h" -#include "src/turbomind/kernels/core/array_ops.h" - -namespace turbomind { - -namespace { - -template -__device__ Array roundToStorageDtype(Array x) -{ - return cast(cast(x)); -} - -template -__global__ void fusedPosEmbedMergeKernel(T* hidden_states, - const T* pos_embeds, - const T* pos_embed_weights, - const int* mapped_idx, - const T* bias, - int hidden, - int vdim) -{ - const int index = blockIdx.x; - const int mapped = mapped_idx[index]; // same address for all threads in block -> L1 broadcast - - Array w4; - Ldg(w4, pos_embed_weights + mapped * 4); - - const int row_off = index * hidden; - const int pe_row0 = mapped * 4 * hidden; - - using namespace ops; - for (int d = threadIdx.x; d < vdim; d += blockDim.x) { - Array pos{}; - Array tmp; - Load(tmp, hidden_states + row_off + d * vec_size); - auto hidden_acc = cast(tmp); - - if (bias) { - Ldg(tmp, bias + d * vec_size); - hidden_acc = roundToStorageDtype(hidden_acc + cast(tmp)); - } - PRAGMA_UNROLL - for (int k = 0; k < 4; ++k) { - Ldg(tmp, pos_embeds + pe_row0 + k * hidden + d * vec_size); - pos = pos + cast(tmp * w4[k]); - } - const auto out = hidden_acc + roundToStorageDtype(pos); - Store(hidden_states + row_off + d * vec_size, cast(out)); - } -} - -} // namespace - -void invokeFusedPosEmbedMerge(void* hidden_states, - const void* pos_embeds, - const void* pos_embed_weights, - const int* mapped_idx, - const void* bias, - int batch, - int hidden, - DataType dtype, - cudaStream_t stream) -{ - if (batch <= 0) { - return; - } - - const dim3 grid(batch); - const dim3 block(128); - - auto invoke = [&](auto t) { - using T = decltype(t); - constexpr int vec_size = sizeof(uint4) / sizeof(T); - TM_CHECK(hidden % vec_size == 0); - fusedPosEmbedMergeKernel<<>>((T*)hidden_states, - (const T*)pos_embeds, - (const T*)pos_embed_weights, - mapped_idx, - (const T*)bias, - hidden, - hidden / vec_size); - }; - TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); -} - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/fused_embed_merge.h b/src/turbomind/models/qwen3_5vit/fused_embed_merge.h deleted file mode 100644 index dc077fe58f..0000000000 --- a/src/turbomind/models/qwen3_5vit/fused_embed_merge.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "src/turbomind/core/data_type.h" - -#include - -namespace turbomind { - -// Fuses the spatial-merge permutation, the bilinear-weighted sum, and the -// t-expansion of Qwen3-VL ViT pos_embed interpolation into a single pass on -// top of the patch_embed linear output. -void invokeFusedPosEmbedMerge(void* hidden_states, // [batch, hidden] - const void* pos_embeds, // [total_hw * 4, hidden] - const void* pos_embed_weights, // [total_hw * 4] - const int* mapped_idx, // [batch] - const void* bias, // [hidden] or nullptr - int batch, - int hidden, - DataType dtype, - cudaStream_t stream); - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/mrope_position_ids.cu b/src/turbomind/models/qwen3_5vit/mrope_position_ids.cu deleted file mode 100644 index 2c16224c9a..0000000000 --- a/src/turbomind/models/qwen3_5vit/mrope_position_ids.cu +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include "src/turbomind/models/qwen3_5vit/mrope_position_ids.h" - -namespace turbomind { - -namespace { - -constexpr int kBlock = 128; - -__global__ void mropeScatterKernel(int* pos_ids, int row_stride, const MropeSegment* __restrict__ segs) -{ - const MropeSegment s = segs[blockIdx.x]; - const int local_k = blockIdx.y * blockDim.x + threadIdx.x; - if (local_k >= s.n_tok) { - return; - } - int* dst = pos_ids + s.dst_row * row_stride + 3 * (s.dst_offset + local_k); - if (s.h2 == 0) { // text run - const int p = s.base_pos + local_k; - dst[0] = p; - dst[1] = p; - dst[2] = p; - } - else { // image run — grid math uses the original (un-clipped) k - const int k = s.k_offset + local_k; - const int hw = s.h2 * s.w2; - dst[0] = s.base_pos + k / hw; - dst[1] = s.base_pos + (k / s.w2) % s.h2; - dst[2] = s.base_pos + k % s.w2; - } -} - -} // namespace - -void invokeMropePositionIds( - int* pos_ids, int row_stride, const MropeSegment* segments, int num_segments, int max_seg_len, cudaStream_t stream) -{ - if (num_segments <= 0 || max_seg_len <= 0) { - return; - } - const int tiles = (max_seg_len + kBlock - 1) / kBlock; - const dim3 grid((unsigned)num_segments, (unsigned)tiles); - mropeScatterKernel<<>>(pos_ids, row_stride, segments); -} - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/mrope_position_ids.h b/src/turbomind/models/qwen3_5vit/mrope_position_ids.h deleted file mode 100644 index 59352bdff0..0000000000 --- a/src/turbomind/models/qwen3_5vit/mrope_position_ids.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#pragma once - -#include - -namespace turbomind { - -// One descriptor per text / image run, clipped to a prefill chunk's active window. -// `h2 == 0` flags a text run (real image grids always have h2 > 0). -struct MropeSegment { - int dst_row; // batch slot in the (bsz, max_active_end, 3) output table - int dst_offset; // absolute seq index of the first token written by this segment - int n_tok; // tokens to write (already clipped to the active range) - int base_pos; // text: position id at local_k = 0; image: image's mm_start - int h2; // image grid h after spatial merge (0 ⇒ text) - int w2; // image grid w after spatial merge (ignored when h2 == 0) - int k_offset; // starting "k" for image grid math (clip-offset within the run); unused for text -}; - -// Scatter `num_segments` segments into `pos_ids` of shape (bsz, row_stride/3, 3). -// `pos_ids` may be null when num_segments == 0. -void invokeMropePositionIds(int* pos_ids, - int row_stride, // = max_active_end * 3 - const MropeSegment* segments, // device - int num_segments, - int max_seg_len, - cudaStream_t stream); - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/qkv_preprocess.cu b/src/turbomind/models/qwen3_5vit/qkv_preprocess.cu deleted file mode 100644 index a2dbfbedfd..0000000000 --- a/src/turbomind/models/qwen3_5vit/qkv_preprocess.cu +++ /dev/null @@ -1,234 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include "src/turbomind/models/qwen3_5vit/qkv_preprocess.h" - -#include "src/turbomind/core/logger.h" -#include "src/turbomind/kernels/core/array_ops.h" -#include "src/turbomind/kernels/core/common.h" - -#include - -namespace turbomind { - -namespace { - -constexpr int kWarpsPerBlock = 4; - -// Per-head_dim launch traits. Adding a new head_dim is a single specialization here. -template -struct HeadConfig; - -template<> -struct HeadConfig<64> { - static constexpr int kVecSize = 8; - static constexpr int kHeadsPerWarp = 4; -}; - -template<> -struct HeadConfig<72> { - static constexpr int kVecSize = 8; - static constexpr int kHeadsPerWarp = 3; -}; - -template<> -struct HeadConfig<128> { - static constexpr int kVecSize = 8; // 128 / 8 = 16 vec/head - static constexpr int kHeadsPerWarp = 2; // 16 * 2 = 32 == WARP_SIZE -}; - -template -__device__ __forceinline__ void add_bias(Array& x, const T* bias) -{ - Array b; - Ldg(b, bias); - using namespace ops; - x = x + cast(b); -} - -// Rotate adjacent (x[2k], x[2k+1]) pairs using cos/sin packed as -// rope[2k]=cos, rope[2k+1]=sin (see fast_rotary_pos_emb.cu:56-58). -template -__device__ __forceinline__ void apply_rope_pair(Array& x, const Array& rope) -{ - auto cs = cast(rope); - PRAGMA_UNROLL - for (int i = 0; i < VecSize; i += 2) { - const float x0 = x[i]; - const float x1 = x[i + 1]; - const float c = cs[i]; - const float s = cs[i + 1]; - x[i] = c * x0 - s * x1; - x[i + 1] = c * x1 + s * x0; - } -} - -// Load `src`, add `bias`, apply RoPE, store to `dst` (may equal `src` for in-place). -template -__device__ __forceinline__ void fuse_bias_rope_store(T* dst, const T* src, const T* bias, const Array& rope) -{ - Array x_vec; - Load(x_vec, src); - auto x = cast(x_vec); - add_bias(x, bias); - apply_rope_pair(x, rope); - Store(dst, cast(x)); -} - -// V-path: bias only, no RoPE. -template -__device__ __forceinline__ void fuse_bias_store(T* dst, const T* src, const T* bias) -{ - Array x_vec; - Load(x_vec, src); - auto x = cast(x_vec); - add_bias(x, bias); - Store(dst, cast(x)); -} - -template -__global__ __launch_bounds__(kWarpsPerBlock* WARP_SIZE) void prepareQKVKernel(T* __restrict__ qkv, - T* __restrict__ kv, - const T* __restrict__ bias, - const T* __restrict__ rotary_pos_emb, - const int* __restrict__ mapped_idx, - int token_num, - int local_head_num, - int head_group_num, - int rope_head_dim) -{ - using Cfg = HeadConfig; - constexpr int kVecSize = Cfg::kVecSize; - constexpr int kHeadsPerWarp = Cfg::kHeadsPerWarp; - constexpr int kVecPerHead = HD / kVecSize; - static_assert(HD % kVecSize == 0); - static_assert(kVecPerHead * kHeadsPerWarp <= WARP_SIZE); - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane = threadIdx.x - warp_id * WARP_SIZE; - const int head_slot = lane / kVecPerHead; - if (head_slot >= kHeadsPerWarp) { - return; - } - - const int global_warp = blockIdx.x * kWarpsPerBlock + warp_id; - const int total_warps = token_num * head_group_num; - if (global_warp >= total_warps) { - return; - } - - const int token_idx = global_warp / head_group_num; - const int head_group = global_warp - token_idx * head_group_num; - const int head_idx = head_group * kHeadsPerWarp + head_slot; - if (head_idx >= local_head_num) { - return; - } - - const int vec_idx = lane - head_slot * kVecPerHead; - const int di = vec_idx * kVecSize; - - // QKV per-token layout: [Q_heads | K_heads | V_heads], head_num == kv_head_num for ViT. - const int64_t qkv_stride = (int64_t)local_head_num * 3 * HD; - T* const q_ptr = qkv + (int64_t)token_idx * qkv_stride + head_idx * HD + di; - const T* k_ptr = q_ptr + (int64_t)local_head_num * HD; - const T* v_ptr = k_ptr + (int64_t)local_head_num * HD; - - const T* q_bias = bias + head_idx * HD + di; - const T* k_bias = q_bias + local_head_num * HD; - const T* v_bias = k_bias + local_head_num * HD; - - // K/V destination in transposed [kv_head, 2, token, head_dim] layout. - T* const k_dst = kv + ((int64_t)head_idx * 2 * token_num + token_idx) * HD + di; - T* const v_dst = k_dst + (int64_t)token_num * HD; - - // rope[token, di] is shared between Q and K — load once, reuse twice. - // When HD > rope_head_dim, padded di-slices have zero Q/K, so loading a - // zero rope_vec there is correct (and avoids OOB on the [N, rope_head_dim] - // buffer). kVecSize is aligned to rope_head_dim so each vec is fully in or - // fully out of the rope range. - Array rope_vec{}; - if (di < rope_head_dim) { - Ldg(rope_vec, rotary_pos_emb + (int64_t)mapped_idx[token_idx] * rope_head_dim + di); - } - - fuse_bias_rope_store(q_ptr, q_ptr, q_bias, rope_vec); // Q: in-place - fuse_bias_rope_store(k_dst, k_ptr, k_bias, rope_vec); // K: transposed - fuse_bias_store(v_dst, v_ptr, v_bias); // V: transposed, no RoPE -} - -template -void dispatchPrepareQKV(T* qkv, - T* kv, - const T* qkv_bias, - const T* rotary_pos_emb, - const int* mapped_idx, - int token_num, - int local_head_num, - int head_dim, - int rope_head_dim, - cudaStream_t stream) -{ - auto invoke = [&](auto hd_c) { - constexpr int HD = decltype(hd_c)::value; - using Cfg = HeadConfig; - - // Each vec_size-wide load must lie entirely in or out of the rope range. - TM_CHECK(rope_head_dim % Cfg::kVecSize == 0) - << "rope_head_dim (" << rope_head_dim << ") must be a multiple of kVecSize (" << Cfg::kVecSize << ")"; - TM_CHECK(rope_head_dim <= HD) << "rope_head_dim (" << rope_head_dim << ") cannot exceed head_dim (" << HD - << ")"; - - const int head_group_num = (local_head_num + Cfg::kHeadsPerWarp - 1) / Cfg::kHeadsPerWarp; - const int total_warps = token_num * head_group_num; - dim3 grid((total_warps + kWarpsPerBlock - 1) / kWarpsPerBlock); - prepareQKVKernel<<>>( - qkv, kv, qkv_bias, rotary_pos_emb, mapped_idx, token_num, local_head_num, head_group_num, rope_head_dim); - }; - - switch (head_dim) { - case 64: - return invoke(std::integral_constant{}); - case 72: - return invoke(std::integral_constant{}); - case 128: - return invoke(std::integral_constant{}); - default: - TM_LOG_FATAL("unsupported Qwen3.5 ViT head_dim for qkv preprocess: {}", head_dim); - } -} - -} // namespace - -void invokeQwen3_5VitPrepareQKV(void* qkv, - void* kv, - const void* qkv_bias, - const void* rotary_pos_emb, - const int* mapped_idx, - DataType dtype, - int token_num, - int local_head_num, - int head_dim, - int rope_head_dim, - cudaStream_t stream) -{ - if (token_num == 0) { - return; - } - - auto invoke = [&](auto t) { - using T = decltype(t); - dispatchPrepareQKV((T*)qkv, - (T*)kv, - (const T*)qkv_bias, - (const T*)rotary_pos_emb, - mapped_idx, - token_num, - local_head_num, - head_dim, - rope_head_dim, - stream); - }; - - TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); -} - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/qkv_preprocess.h b/src/turbomind/models/qwen3_5vit/qkv_preprocess.h deleted file mode 100644 index da1c2a84cb..0000000000 --- a/src/turbomind/models/qwen3_5vit/qkv_preprocess.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#pragma once - -#include "src/turbomind/core/data_type.h" - -#include - -namespace turbomind { - -// Prepare the Qwen3.5 ViT attention inputs after the fused QKV projection. -// -// qkv layout: -// [token, local_q_heads + 2 * local_kv_heads, head_dim] -// Q is updated in place with bias + RoPE. K/V are written to `kv` as: -// [local_kv_heads, 2, token, head_dim] -// -// `rope_head_dim` is the per-head dim of the rotary_pos_emb buffer and is -// also the cutoff below which RoPE is applied. When the model's real head_dim -// is not natively supported by the attention kernel, Q/K/V are zero-padded -// per-head to a kernel-supported `head_dim` >= `rope_head_dim`; the padded -// `[rope_head_dim, head_dim)` slice has zero Q/K so RoPE is skipped there. -void invokeQwen3_5VitPrepareQKV(void* qkv, - void* kv, - const void* qkv_bias, - const void* rotary_pos_emb, - const int* mapped_idx, - DataType dtype, - int token_num, - int local_head_num, - int head_dim, - int rope_head_dim, - cudaStream_t stream); - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit.cc b/src/turbomind/models/qwen3_5vit/qwen3_5vit.cc deleted file mode 100644 index 547206c31e..0000000000 --- a/src/turbomind/models/qwen3_5vit/qwen3_5vit.cc +++ /dev/null @@ -1,800 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit.h" - -#include "src/turbomind/core/logger.h" -#include "src/turbomind/core/scope.h" -#include "src/turbomind/kernels/attention/attention.h" -#include "src/turbomind/kernels/gpt_kernels.h" -#include "src/turbomind/kernels/norm/layer_norm.h" -#include "src/turbomind/kernels/norm/rms_norm.h" -#include "src/turbomind/models/layer_norm_weight.h" -#include "src/turbomind/models/llama/SequenceManager.h" -#include "src/turbomind/models/qwen3_5vit/bias_gelu.h" -#include "src/turbomind/models/qwen3_5vit/fast_pos_embed.h" -#include "src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h" -#include "src/turbomind/models/qwen3_5vit/fused_embed_merge.h" -#include "src/turbomind/models/qwen3_5vit/mrope_position_ids.h" -#include "src/turbomind/models/qwen3_5vit/qkv_preprocess.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/memory_utils.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -struct Qwen3_5Vit::Impl { - const Qwen3_5VitWeight& weights_; - const core::Qwen3_5VitConfig& config_; - int phases_; - LlamaLinear& linear_; - const comm::HostComm& h_tp_group; - comm::DeviceCommImpl* const d_comm_; - const int tp_group_; - const DataType engine_data_type_; - - Buffer_ grid_thws_buf_; // (t, h, w) - Buffer_ grid_offsets_buf_; // (t*h*w, h*w) - Buffer_ mapped_idx_buf_; // [batch] - Buffer_ attn_cu_seqlens_buf_; - - struct Data { - Tensor batch_input; - int batch_size; - std::vector> grid_thws_host; - std::vector> image_embeds_coords; // (size, pos) for image embeddings - std::vector> input_embeds_coords; // (size, pos) for input embeddings - - // for fast_pos_embed - Tensor_ grid_thws; - Tensor_ grid_offsets; - Tensor_ mapped_idx; - int total_hw; - - // for full attention, one sequence per temporal frame - Tensor_ attn_cu_seqlens; - Tensor_ attn_finished; - int attn_batch_size; - int max_attn_len; - - // mrope position-id scratch (per-phase pinned host buffers for fast H2D) - Buffer_ mrope_segs_host; // reinterpreted as MropeSegment[], 7 ints per segment - Buffer_ mrope_length_host; - Buffer_ mrope_delta_host; - - // mrope outputs — owned here so UnifiedAttentionLayer can safely borrow() across env clears. - Buffer_ mrope_segs_dev; // device-side segment scratch, grown alongside host - Tensor_ mrope_position_ids; // (bsz, max_active_end, 3), empty when no slot needs table - Tensor_ mrope_length; // (bsz,) - Tensor_ mrope_position_delta; // (bsz,) - - void Clear() - { - batch_size = 0; - total_hw = 0; - attn_batch_size = 0; - max_attn_len = 0; - grid_thws_host.clear(); - image_embeds_coords.clear(); - input_embeds_coords.clear(); - } - }; - - static constexpr int kMropeSegInts = sizeof(MropeSegment) / sizeof(int); - static_assert(sizeof(MropeSegment) % sizeof(int) == 0); - - std::vector data_; - - Impl(const EngineParam& engine, const Context& ctx, const Qwen3_5VitWeight& weights, int phases): - weights_{weights}, - config_{weights.config()}, - phases_{phases}, - linear_{*ctx.linear}, - h_tp_group{ctx.comm.h_comm}, - d_comm_{ctx.comm.d_comm}, - tp_group_{ctx.comm.d_tp_group}, - engine_data_type_{engine.data_type} - { - auto& cfg = weights.config(); - for (int i = 0; i < phases; ++i) { - auto& d = data_.emplace_back(); - d.batch_input = {{engine.max_forward_token_num, cfg.patch_in_dim}, cfg.data_type, kCPUpinned}; - d.mrope_length_host = {engine.max_batch_size, kCPUpinned}; - d.mrope_delta_host = {engine.max_batch_size, kCPUpinned}; - // Generous initial capacity: typical batches emit << bsz * 8 segments. Lazily grown below. - d.mrope_segs_host = {engine.max_batch_size * 8 * (ssize_t)kMropeSegInts, kCPUpinned}; - - // mrope outputs at worst-case shape so Setup() never reallocates them. Rows beyond - // the current bsz (and rows whose length[i] == 0) are stale but unreachable from - // FastRoPE — it only reads position_ids[3 * timestep] when `timestep < length[i]`. - d.mrope_length = Tensor_{{engine.max_batch_size}, kDEVICE}; - d.mrope_position_delta = Tensor_{{engine.max_batch_size}, kDEVICE}; - d.mrope_position_ids = Tensor_{{engine.max_batch_size, engine.session_len, 3}, kDEVICE}; - } - - // should be large enough to hold all patches - grid_thws_buf_ = {engine.max_forward_token_num * 3, kCPUpinned}; - grid_offsets_buf_ = {engine.max_forward_token_num * 2, kCPUpinned}; - mapped_idx_buf_ = {engine.max_forward_token_num, kCPUpinned}; - attn_cu_seqlens_buf_ = {engine.max_forward_token_num + 1, kCPUpinned}; - } - - void AllReduceSum(Tensor& tensor, cudaStream_t stream) const - { - if (d_comm_) { - d_comm_->AllReduceSum( - tensor.raw_data(), tensor.raw_data(), tensor.size(), tensor.dtype(), tp_group_, stream); - TM_CUDA_CHECK(cudaGetLastError()); - } - } - - void FastPosEmbedInterpolate(Data& d, TensorMap& env) - { - auto& cfg = weights_.config(); - auto stream = core::Context::stream().handle(); - - const int num_grid_per_side = (int)std::sqrt(cfg.num_position_embeddings); - TM_CHECK_EQ(num_grid_per_side * num_grid_per_side, cfg.num_position_embeddings); - TM_CHECK_EQ(weights_.pos_embed.shape(0), cfg.num_position_embeddings); - TM_CHECK_EQ(weights_.pos_embed.shape(1), cfg.hidden_dim); - - Buffer_ pos_embed_idx = {d.total_hw * 4, kDEVICE}; - Tensor pos_embed_weights = {{d.total_hw, 4}, cfg.data_type, kDEVICE}; - invokeFastPosEmbedIdxWeight(pos_embed_idx.data(), - pos_embed_weights.raw_data(), - cfg.data_type, - d.grid_thws.data(), - d.grid_offsets.data(), - d.grid_thws.shape(0), - d.total_hw, - num_grid_per_side, - stream); - TM_CUDA_CHECK(cudaGetLastError()); - - Tensor pos_embeds = {{d.total_hw * 4, cfg.hidden_dim}, cfg.data_type, kDEVICE}; - invokeEmbeddingLookup(pos_embeds, pos_embed_idx, weights_.pos_embed, stream); - TM_CUDA_CHECK(cudaGetLastError()); - - env.produce("pos_embeds", pos_embeds); - env.produce("pos_embed_weights", pos_embed_weights); - } - - void RotPosEmb(Data& d, TensorMap& env) - { - auto& cfg = weights_.config(); - - const int head_dim = cfg.hidden_dim / cfg.head_num; - // produce rotary_pos_emb: [total_hw, head_dim] with interleaved (c,s,c,s,...) pairs, - // keyed by the same natural flat index that `mapped_idx` already carries. Vision q/k - // are reordered into this adjacent-pair layout at export time. - Tensor rotary_pos_emb = {{d.total_hw, head_dim}, cfg.data_type, kDEVICE}; - invokeQwen3VitRotaryPosEmb(rotary_pos_emb.raw_data(), - cfg.data_type, - d.grid_thws.data(), - d.grid_offsets.data(), - d.grid_thws.shape(0), - d.total_hw, - head_dim, - /*theta=*/10000.0f, - core::Context::stream().handle()); - TM_CUDA_CHECK(cudaGetLastError()); - env.produce("rotary_pos_emb", rotary_pos_emb); - } - - Tensor PatchEmbedding(Data& d) - { - Tensor host_input = d.batch_input.slice(0, d.batch_size); - Tensor input = empty_like(host_input, kDEVICE); - - Copy(host_input, input); - TM_CUDA_CHECK(cudaGetLastError()); - - Tensor output; - TM_SCOPE_CALL(linear_.Forward(input, *weights_.patch_embed, output)); - return output; - } - - int Add(RequestCache& c) - { - const auto& [r, s] = std::tie(*c.req, *c.seq); - if (r.mm_inputs) { - if ((not r.session.start_flag) or (not r.session.end_flag)) { - // only support non-interactive inference - return Request::kInvalid; - } - - const auto mm_inputs = std::dynamic_pointer_cast(r.mm_inputs); - if (!mm_inputs) { - return Request::kInvalid; - } - - for (const auto& item : mm_inputs->items) { - if (item.modality != multimodal::Modality::kImage && item.modality != multimodal::Modality::kVideo) { - return Request::kInvalid; - } - - const int tokens = item.token_end - item.token_begin; - if (tokens <= 0) { - return Request::kInvalid; - } - - auto mm_item = std::make_shared( - MultiModalData{item.data, Interval{item.token_begin, Interval::Size{tokens}}, item.grid_thw}); - s.multimodal_inputs.push_back(mm_item); - } - } - - return Request::kOk; - } - - void Add(int phase, TensorMap& env) - { - // convert model-specific multimodal inputs to internal MultiModalData - const Buffer_ rc = env.at("requests").buffer(); - for (int i = 0; i < rc.size(); ++i) { - auto& c = *TM_CHECK_NOTNULL(rc[i]); - if (c.status == 0) { - c.status = Add(c); - } - } - } - - // Build the mrope tensors consumed by `UnifiedAttentionLayer` and publish them to env. - // - // Per-request layout: one row in `(max_batch_size, session_len, 3)` for each slot. Prefill - // slots with multimodal_inputs get their active range written by `invokeMropePositionIds` - // from a clipped list of MropeSegment descriptors (one per text/image run). All other slots - // (decode + text-only prefill) get `length[i] = 0` so FastRoPE falls through to the closed- - // form `timestep + delta` path and never reads the stale row. - // - // The three output tensors live on `Data` (allocated worst-case in the ctor). env shares - // ownership via shared_ptr; UAL borrows safely across env clears. - void SetupMrope(int phase, TensorMap& env) - { - auto& d = data_.at(phase); - auto& b = *env.at("batch").data()[0]; - auto& rc = b.rc; - - const int bsz = (int)rc.size(); - if (bsz <= 0) { - return; - } - - const int S = weights_.config().spatial_merge_size; - - // 1) One pass to upper-bound segment count, then size host + device scratch in one shot. - // Worst case per prefill slot with mrope: 2*num_images + 1 segments. - int upper_segs = 0; - for (int i = 0; i < bsz; ++i) { - const auto& c = *rc[i]; - if (!c.autoregres && !c.seq->multimodal_inputs.empty()) { - upper_segs += 2 * (int)c.seq->multimodal_inputs.size() + 1; - } - } - const ssize_t upper_ints = (ssize_t)upper_segs * kMropeSegInts; - if (upper_ints > d.mrope_segs_host.size()) { - core::ContextGuard ctx{Allocator{kCPUpinned}}; - d.mrope_segs_host = Buffer_{upper_ints, kCPUpinned}; - } - if (upper_ints > d.mrope_segs_dev.size()) { - d.mrope_segs_dev = Buffer_{upper_ints, kDEVICE}; - } - - // 2) Unified per-request walk — always advance mm_off; emit segments only for needs_table. - auto* segs = reinterpret_cast(d.mrope_segs_host.data()); - int n_segs = 0; - int max_seg_len = 0; - - for (int i = 0; i < bsz; ++i) { - const auto& c = *rc[i]; - const auto& s = *c.seq; - const int seq_len = (int)c.req->inputs.at("input_ids").shape(0); - const bool needs_table = !c.autoregres && !s.multimodal_inputs.empty(); - const int active_start = c.history_len + c.alpha; - const int active_end = active_start + c.input_len; - - auto emit = [&](int run_start, int run_n, int run_base, int h2, int w2) { - const int a = std::max(run_start, active_start); - const int b = std::min(run_start + run_n, active_end); - if (a >= b) { - return; - } - const int local_off = a - run_start; - segs[n_segs++] = MropeSegment{ - i, - a, - b - a, - /*base_pos=*/(h2 == 0) ? run_base + local_off : run_base, - h2, - w2, - /*k_offset=*/(h2 == 0) ? 0 : local_off, - }; - max_seg_len = std::max(max_seg_len, b - a); - }; - - int row = 0, pos = 0, mm_off = 0; - for (const auto& mm : s.multimodal_inputs) { - const auto& [t, h, w] = mm->grid_thw; - const int h2 = h / S, w2 = w / S, n_tok = t * h2 * w2; - TM_CHECK_EQ(n_tok, (int)mm->interval.size()) << "image token count mismatches grid_thw"; - const int img_start = mm->interval.begin(); - const int img_base = img_start + mm_off; - if (needs_table) { - if (img_start > row) { - emit(row, img_start - row, pos, /*h2=*/0, /*w2=*/0); - } - emit(img_start, n_tok, img_base, h2, w2); - } - row = img_start + n_tok; - const int new_pos = std::max({t, h2, w2}); - pos = img_base + new_pos; - mm_off += new_pos - n_tok; - } - if (needs_table && row < seq_len) { - emit(row, seq_len - row, pos, /*h2=*/0, /*w2=*/0); - } - - d.mrope_length_host.data()[i] = needs_table ? seq_len : 0; - d.mrope_delta_host.data()[i] = mm_off; - } - - // 3) Copy the bsz prefix of length / delta into the pre-allocated device tensors. - // Rows beyond bsz are untouched (UAL never reads them). - Copy(d.mrope_length_host.slice(0, bsz), d.mrope_length.buffer().slice(0, bsz)); - Copy(d.mrope_delta_host.slice(0, bsz), d.mrope_position_delta.buffer().slice(0, bsz)); - - // 4) Populate position_ids only when a slot actually needs the table. Rows for slots - // with length[i] == 0 are unreachable from FastRoPE, so leaving them stale is safe. - if (n_segs > 0) { - const ssize_t segs_ints = (ssize_t)n_segs * kMropeSegInts; - Copy(d.mrope_segs_host.slice(0, segs_ints), d.mrope_segs_dev.slice(0, segs_ints)); - invokeMropePositionIds(d.mrope_position_ids.data(), - (int)d.mrope_position_ids.stride(0), - reinterpret_cast(d.mrope_segs_dev.data()), - n_segs, - max_seg_len, - core::Context::stream().handle()); - TM_CUDA_CHECK(cudaGetLastError()); - } - - // 5) Publish all three — the consumer relies on this contract unconditionally. - env.produce("mrope_length", d.mrope_length); - env.produce("mrope_position_delta", d.mrope_position_delta); - env.produce("mrope_position_ids", d.mrope_position_ids); - } - - void Setup(int phase, TensorMap& env) - { - // create batch data according to scheduled sequences - auto& d = data_.at(phase); - auto& b = *env.at("batch").data()[0]; - auto& copy = *env.at("copy").data()[0]; - auto& cfg = weights_.config(); - - int input_ids_offsets = 0; - int image_embeds_offsets = 0; - d.Clear(); - std::vector pixel_values; - - // collect image/video pixel values, grid_thws and embeds_coords - const auto& rc = b.rc; - for (int i = 0; i < rc.size(); ++i) { - const auto& c = *rc[i]; - const auto& s = *c.seq; - - if ((not c.autoregres) && (not s.multimodal_inputs.empty())) { - Interval text{c.history_len + c.alpha, Interval::Size{c.input_len}}; - for (const auto& mm : s.multimodal_inputs) { - auto o = mm->interval & text; - if (auto size = (int)o.size()) { - pixel_values.push_back(mm->data); - d.batch_size += mm->data.shape(0); - - const int text_offset = input_ids_offsets + o.begin() - text.begin(); - const int image_offset = image_embeds_offsets + o.begin() - mm->interval.begin(); - d.input_embeds_coords.emplace_back(size, text_offset); - d.image_embeds_coords.emplace_back(size, image_offset); - - auto& grid_thw = mm->grid_thw; - d.grid_thws_host.emplace_back(grid_thw); - auto prod = std::accumulate(grid_thw.begin(), grid_thw.end(), 1, std::multiplies()); - image_embeds_offsets += (prod / cfg.spatial_merge_size / cfg.spatial_merge_size); - } - } - } - - input_ids_offsets += c.autoregres ? 1 : c.input_len; - } - - // copy pixel values to batch input - if (d.batch_size > 0) { - if (d.batch_size > d.batch_input.shape(0)) { - core::ContextGuard ctx{Allocator{kCPUpinned}}; - d.batch_input = {{d.batch_size, cfg.patch_in_dim}, cfg.data_type, kCPUpinned}; - } - ssize_t batch_offset = 0; - for (const auto& pixel_value : pixel_values) { - TM_CHECK_EQ(pixel_value.size(), pixel_value.shape(0) * cfg.patch_in_dim); - TM_CHECK_EQ(pixel_value.dtype(), d.batch_input.dtype()); - Copy(pixel_value, d.batch_input.slice(batch_offset, pixel_value.shape(0))); - batch_offset += pixel_value.shape(0); - } - TM_CHECK_EQ(batch_offset, d.batch_size); - } - - // setup fast_pos_embed - if (const int num_grids = (int)d.grid_thws_host.size(); num_grids > 0) { - for (const auto& [t, h, w] : d.grid_thws_host) { - d.attn_batch_size += t; - d.max_attn_len = std::max(d.max_attn_len, h * w); - } - if (grid_thws_buf_.size() < (ssize_t)num_grids * 3) { - core::ContextGuard ctx{Allocator{kCPUpinned}}; - grid_thws_buf_ = Buffer_{num_grids * 3, kCPUpinned}; - grid_offsets_buf_ = Buffer_{num_grids * 2, kCPUpinned}; - } - if (mapped_idx_buf_.size() < d.batch_size) { - core::ContextGuard ctx{Allocator{kCPUpinned}}; - mapped_idx_buf_ = Buffer_{d.batch_size, kCPUpinned}; - } - if (attn_cu_seqlens_buf_.size() < (ssize_t)d.attn_batch_size + 1) { - core::ContextGuard ctx{Allocator{kCPUpinned}}; - attn_cu_seqlens_buf_ = Buffer_{d.attn_batch_size + 1, kCPUpinned}; - } - d.grid_thws = {{num_grids, 3}, kDEVICE}; - d.grid_offsets = {{num_grids, 2}, kDEVICE}; - d.mapped_idx = {{d.batch_size}, kDEVICE}; - d.attn_cu_seqlens = {{d.attn_batch_size + 1}, kDEVICE}; - d.attn_finished = {{d.attn_batch_size}, kDEVICE}; - - std::pair offset{}; - int attn_seq_idx = 0; - int attn_offset = 0; - attn_cu_seqlens_buf_.data()[attn_seq_idx++] = 0; - for (int i = 0; i < num_grids; ++i) { - const auto& [t, h, w] = d.grid_thws_host[i]; - TM_CHECK(h % cfg.spatial_merge_size == 0); - TM_CHECK(w % cfg.spatial_merge_size == 0); - const int hw = h * w; - for (int tt = 0; tt < t; ++tt) { - attn_offset += hw; - attn_cu_seqlens_buf_.data()[attn_seq_idx++] = attn_offset; - } - - grid_thws_buf_.data()[i * 3] = t; - grid_thws_buf_.data()[i * 3 + 1] = h; - grid_thws_buf_.data()[i * 3 + 2] = w; - grid_offsets_buf_.data()[i * 2] = offset.first; - grid_offsets_buf_.data()[i * 2 + 1] = offset.second; - - // compute mapped_idx - TM_CHECK(offset.first + t * h * w <= d.batch_size); - const int S = cfg.spatial_merge_size; - int* buf = mapped_idx_buf_.data(); - int pos = offset.first; - for (int h_outer = 0; h_outer < h / S; ++h_outer) { - for (int w_outer = 0; w_outer < w / S; ++w_outer) { - for (int h_inner = 0; h_inner < S; ++h_inner) { - for (int w_inner = 0; w_inner < S; ++w_inner) { - const int ii = h_outer * S + h_inner; - const int jj = w_outer * S + w_inner; - buf[pos++] = offset.second + ii * w + jj; - } - } - } - } - for (int tt = 1; tt < t; ++tt) { - std::memcpy(buf + offset.first + tt * hw, buf + offset.first, hw * sizeof(int)); - } - pos = offset.first + t * hw; - TM_CHECK_EQ(pos, offset.first + t * h * w); - offset.first += t * h * w; - offset.second += h * w; - } - TM_CHECK_EQ(offset.first, d.batch_size); - TM_CHECK_EQ(attn_offset, d.batch_size); - TM_CHECK_EQ(attn_seq_idx, d.attn_batch_size + 1); - d.total_hw = offset.second; - copy(grid_thws_buf_.data(), num_grids * 3, d.grid_thws.data()); - copy(grid_offsets_buf_.data(), num_grids * 2, d.grid_offsets.data()); - copy(mapped_idx_buf_.data(), d.batch_size, d.mapped_idx.data()); - copy(attn_cu_seqlens_buf_.data(), d.attn_batch_size + 1, d.attn_cu_seqlens.data()); - Clear(d.attn_finished); - } - - SetupMrope(phase, env); - h_tp_group->Sync(); - } - - void Prepare(int phase, TensorMap& env) - { - auto& d = data_.at(phase); - if (d.batch_size == 0) { - return; - } - - // produce non-merged pos-embeds and weights - FastPosEmbedInterpolate(d, env); - - // produce rotary_pos_emb - RotPosEmb(d, env); - } - - void Forward(int phase, TensorMap& args) - { - auto& d = data_.at(phase); - if (d.batch_size == 0) { - return; - } - - auto& cfg = weights_.config(); - - // 1) patch_embed (Linear without bias, the bias will be folded into the fused kernel) - auto residual = PatchEmbedding(d); - auto pos_embeds = args.consume("pos_embeds"); - auto pos_embed_weights = args.consume("pos_embed_weights"); - auto rotary_pos_emb = args.consume("rotary_pos_emb"); - auto stream = core::Context::stream().handle(); - - // 2) fused pos-embed gather/merge: - // residual[pos, d] += Σ_k w[k] * pos_embeds[mapped*4+k, d] + bias[d] - invokeFusedPosEmbedMerge(residual.raw_data(), - pos_embeds.raw_data(), - pos_embed_weights.raw_data(), - d.mapped_idx.data(), - weights_.patch_embed->bias ? weights_.patch_embed->bias.raw_data() : nullptr, - d.batch_size, - cfg.hidden_dim, - cfg.data_type, - stream); - TM_CUDA_CHECK(cudaGetLastError()); - - // 3) decoder - Tensor hidden_states = [&]() { - Buffer symm_buf = args.contains("symm_buf") ? args.at("symm_buf").buffer() : Buffer{}; - if (symm_buf && d.batch_size * cfg.hidden_dim <= symm_buf.size() / turbomind::byte_size(cfg.data_type)) { - return Tensor{symm_buf.view(cfg.data_type), {d.batch_size, cfg.hidden_dim}}; - } - else { - return Tensor{{d.batch_size, cfg.hidden_dim}, cfg.data_type, kDEVICE}; - } - }(); - - invokeLayerNorm(hidden_states, - residual, - weights_.block(0)->norm1->weight, - weights_.block(0)->norm1->bias, - weights_.block(0)->norm1->norm_eps_, - stream); - TM_CUDA_CHECK(cudaGetLastError()); - - for (int layer_id = 0; layer_id < cfg.depth; ++layer_id) { - - // attn - auto invoke = [&](auto t) { - using T = decltype(t); - Attn(hidden_states, hidden_states, d, layer_id, rotary_pos_emb); - }; - TM_DISPATCH_PRIMARY_DTYPES(hidden_states.dtype(), invoke); - - AllReduceSum(hidden_states, stream); - - auto* block = weights_.block(layer_id); - invokeResidualBiasLayerNorm(hidden_states.raw_data(), - residual.raw_data(), - block->norm2->weight.raw_data(), - block->norm2->bias.data_or((void*)nullptr), - block->attention->wo->bias.data_or((void*)nullptr), - hidden_states.dtype(), - cfg.hidden_dim, - d.batch_size, - block->norm2->norm_eps_, - stream); - TM_CUDA_CHECK(cudaGetLastError()); - - // mlp - Mlp(hidden_states, hidden_states, d, layer_id); - AllReduceSum(hidden_states, stream); - - const auto* next_norm = - layer_id + 1 < cfg.depth ? weights_.block(layer_id + 1)->norm1.get() : weights_.merger_norm.get(); - TM_CHECK_NOTNULL(next_norm); - invokeResidualBiasLayerNorm(hidden_states.raw_data(), - residual.raw_data(), - next_norm->weight.raw_data(), - next_norm->bias.data_or((void*)nullptr), - block->mlp_fc2->bias.data_or((void*)nullptr), - hidden_states.dtype(), - cfg.hidden_dim, - d.batch_size, - next_norm->norm_eps_, - stream); - TM_CUDA_CHECK(cudaGetLastError()); - } - - Tensor image_embeds = Merger(hidden_states); - - // ViT may run in its own dtype (e.g. bf16) while the text engine runs - // in fp16 (AWQ-forced). PatchMultimodalEmbedding merges this buffer - // into the text embedding stream via a byte-level copy, so the dtypes - // must match before publishing. - EnsureFloatDtype(image_embeds, engine_data_type_); - - args.produce("multimodal", - MultiModalEmbeddingData{image_embeds, d.image_embeds_coords, d.input_embeds_coords}.buf()); - } - - template - AttentionParams CreateVitAttentionParams( - Tensor& attn_context, Tensor& qkv, Tensor& kv, Data& d, const AttentionWeight& attn, int layer_id) - { - const int local_head_num = attn.head_num / attn.tp_size; - const int head_dim = attn.head_dim; - const int token_num = d.batch_size; - - AttentionParams params{}; - params.out = (T*)attn_context.raw_data(); - params.q = (T*)qkv.raw_data(); - - params.stride = (int64_t)local_head_num * 3 * head_dim; - - params.cu_q_len = d.attn_cu_seqlens.data(); - params.cu_k_len = d.attn_cu_seqlens.data(); - params.finished = d.attn_finished.data(); - - params.linear_iter_params = LinearIteratorParams{ - kv.raw_data(), - 2 * token_num * head_dim, - token_num * head_dim, - }; - - params.token_num = token_num; - params.batch_size = d.attn_batch_size; - params.max_q_len = d.max_attn_len; - params.max_k_len = d.max_attn_len; - - params.num_heads = local_head_num; - params.num_kv_heads = local_head_num; - params.size_per_head = head_dim; - params.causal = false; - params.layer_id = layer_id; - - double scaling = 1.; - if (attn.softmax_scale) { - scaling *= attn.softmax_scale; - } - else { - scaling /= std::sqrt((float)head_dim); - } - params.inv_sqrt_dh = scaling * std::log2(std::exp(1.)); - - params.window_size = 0; - params.rope_param.type = RopeType::kNull; - params.max_split_k = 1; - params.cp_size = 1; - params.stream = core::Context::stream().handle(); - - return params; - } - - template - void Attn(Tensor& input, Tensor& output, Data& d, int layer_id, const Tensor& rotary_pos_emb) - { - auto& vit_cfg = weights_.config(); - auto* attn = weights_.block(layer_id)->attention.get(); - - Tensor qkv; - TM_SCOPE_CALL(linear_.Forward(input, *attn->w_qkv, qkv)); - TM_CUDA_CHECK(cudaGetLastError()); - - const int local_head_num = attn->head_num / attn->tp_size; - const int head_dim = attn->head_dim; // may be padded - const int rope_head_dim = vit_cfg.hidden_dim / vit_cfg.head_num; // model's real per-head dim - const int token_num = d.batch_size; - - Tensor tmp_kv{{local_head_num, 2, d.batch_size, head_dim}, qkv.dtype(), qkv.device()}; - invokeQwen3_5VitPrepareQKV(qkv.raw_data(), - tmp_kv.raw_data(), - attn->w_qkv->bias.raw_data(), - rotary_pos_emb.raw_data(), - d.mapped_idx.data(), - qkv.dtype(), - token_num, - local_head_num, - head_dim, - rope_head_dim, - core::Context::stream().handle()); - TM_CUDA_CHECK(cudaGetLastError()); - - Tensor attn_output{{token_num, local_head_num * head_dim}, qkv.dtype(), qkv.device()}; - auto params = CreateVitAttentionParams(attn_output, qkv, tmp_kv, d, *attn, layer_id); - dispatchAttention(params); - TM_CUDA_CHECK(cudaGetLastError()); - - TM_SCOPE_CALL(linear_.Forward(attn_output, *attn->wo, output)); - TM_CUDA_CHECK(cudaGetLastError()); - } - - void Mlp(Tensor& input, Tensor& output, Data& d, int layer_id) - { - auto* block = weights_.block(layer_id); - auto stream = core::Context::stream().handle(); - - TM_CHECK(block); - TM_CHECK_EQ(input.shape(0), d.batch_size); - TM_CHECK_EQ(input.shape(1), config_.hidden_dim); - - Tensor inter; - TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_fc1, inter)); - TM_CUDA_CHECK(cudaGetLastError()); - - invokeQwen3_5VitBiasActivation(inter, block->mlp_fc1->bias, ActivationType::kGeluPytorchTanh, stream); - TM_CUDA_CHECK(cudaGetLastError()); - - TM_SCOPE_CALL(linear_.Forward(inter, *block->mlp_fc2, output)); - TM_CUDA_CHECK(cudaGetLastError()); - } - - Tensor Merger(Tensor& input) - { - auto& cfg = config_; - auto stream = core::Context::stream().handle(); - - const int merge_area = cfg.spatial_merge_size * cfg.spatial_merge_size; - Tensor merged_input = input.view({-1, cfg.hidden_dim * merge_area}); - - Tensor inter; - TM_SCOPE_CALL(linear_.Forward(merged_input, *weights_.merger_fc1, inter)); - TM_CUDA_CHECK(cudaGetLastError()); - - invokeQwen3_5VitBiasActivation(inter, weights_.merger_fc1->bias, ActivationType::kGelu, stream); - TM_CUDA_CHECK(cudaGetLastError()); - - Tensor output; - TM_SCOPE_CALL(linear_.Forward(inter, *weights_.merger_fc2, output)); - TM_CUDA_CHECK(cudaGetLastError()); - - AllReduceSum(output, stream); - - ApplyBias(output, weights_.merger_fc2->bias, stream); - TM_CUDA_CHECK(cudaGetLastError()); - - return output; - } -}; - -Qwen3_5Vit::Qwen3_5Vit(const EngineParam& engine, const Context& ctx, const Qwen3_5VitWeight& weights, int phases): - impl_{std::make_unique(engine, ctx, weights, phases)} -{ -} - -Qwen3_5Vit::~Qwen3_5Vit() = default; - -void Qwen3_5Vit::Run(BatchOp op, int phase, TensorMap& env) -{ - TM_FUNCTION_SCOPE(); - switch (op) { - case BatchOp::kAdd: - return impl_->Add(phase, env); - case BatchOp::kSetup: - return impl_->Setup(phase, env); - case BatchOp::kPrepare: - return impl_->Prepare(phase, env); - case BatchOp::kForward: - return impl_->Forward(phase, env); - default: - return; - } -} - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit.h b/src/turbomind/models/qwen3_5vit/qwen3_5vit.h deleted file mode 100644 index 8c2e94d612..0000000000 --- a/src/turbomind/models/qwen3_5vit/qwen3_5vit.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. -#pragma once - -#include - -#include "src/turbomind/core/core.h" -#include "src/turbomind/engine/batch.h" -#include "src/turbomind/models/llama/context.h" -#include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/models/vision_model.h" - -namespace turbomind { - -class Qwen3_5VitWeight; - -/// Concrete ``VisionModel`` for the Qwen3.5 ViT encoder. -/// -/// This task only stubs the runtime: the phase methods log a debug -/// breadcrumb and return. Follow-up work fills in the actual ViT -/// kernels (patcher → blocks → merger → caching of image embeddings). -class Qwen3_5Vit: public VisionModel { -public: - Qwen3_5Vit(const EngineParam& engine, const Context& ctx, const Qwen3_5VitWeight& weights, int phases); - - ~Qwen3_5Vit() override; - - void Run(BatchOp op, int phase, TensorMap& env) override; - -private: - struct Impl; - std::unique_ptr impl_; -}; - -} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h b/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h deleted file mode 100644 index 7c2ee35652..0000000000 --- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. -#pragma once - -#include "src/turbomind/core/core.h" -#include "src/turbomind/core/module.h" -#include "src/turbomind/models/vision_model_weight.h" - -#include - -namespace turbomind::core { - -/// Root config for Qwen3.5 ViT. Carries every structural scalar the -/// C++ runtime needs to allocate kernels later (depth, head_num, -/// patcher dims, …). Each field is visited by ``for_each`` via the -/// X-macro below — pybind11 then exposes every field as a read/write -/// attribute on the Python ``Qwen3_5VitConfig``. -struct Qwen3_5VitConfig: ModuleConfig { - Qwen3_5VitConfig(): ModuleConfig{"Qwen3_5VitWeight"} {} - - DataType data_type{}; - int hidden_dim{0}; - int out_hidden_dim{0}; - int depth{0}; - int head_num{0}; - int intermediate_size{0}; - int patch_in_dim{0}; - int in_channels{0}; - int patch_size{0}; - int temporal_patch_size{0}; - int num_position_embeddings{0}; - int spatial_merge_size{0}; - float norm_eps{1e-6f}; - -#define QWEN3_5VIT_FIELDS(X) \ - X(DataType, data_type) \ - X(int, hidden_dim) \ - X(int, out_hidden_dim) \ - X(int, depth) \ - X(int, head_num) \ - X(int, intermediate_size) \ - X(int, patch_in_dim) \ - X(int, in_channels) \ - X(int, patch_size) \ - X(int, temporal_patch_size) \ - X(int, num_position_embeddings) \ - X(int, spatial_merge_size) \ - X(float, norm_eps, 1e-6f) - - TM_FOR_EACH(Qwen3_5VitConfig, QWEN3_5VIT_FIELDS) - -#undef QWEN3_5VIT_FIELDS -}; - -} // namespace turbomind::core - -namespace turbomind { - -// Forward decls -class LayerNormWeight; -class LinearWeight; -class Qwen3_5VitBlockWeight; - -/// Concrete Qwen3.5 ViT weight tree. -/// -/// Tree: -/// patch_embed LinearWeight (Conv3d-as-Linear; in_dim = C·T·patch²) -/// pos_embed raw tensor (num_position_embeddings × hidden_dim) -/// blocks ModuleList of Qwen3_5VitBlockWeight × depth -/// merger_fc1 LinearWeight (in: hidden·spatial_merge², out: 4·hidden) -/// merger_fc2 LinearWeight (in: 4·hidden, out: out_hidden) -/// merger_norm LayerNormWeight (over hidden_dim) -/// -/// We expose ``merger_*`` as direct children rather than a sub-module to -/// keep the weight tree shallow — the merger has only three pieces. -class Qwen3_5VitWeight: public VisionModelWeight { -public: - const char* type() const override - { - return "Qwen3_5VitWeight"; - } - - Qwen3_5VitWeight() = default; - explicit Qwen3_5VitWeight(const core::Qwen3_5VitConfig& cfg); - - void prepare() override; - bool verify(std::vector& missing) override; - - // --- X-macro field lists --- -#define QWEN3_5VIT_WEIGHT_CHILDREN(X) \ - X(LinearWeight, patch_embed) \ - X(core::ModuleList, blocks) \ - X(LinearWeight, merger_fc1) \ - X(LinearWeight, merger_fc2) \ - X(LayerNormWeight, merger_norm) - -#define QWEN3_5VIT_WEIGHT_PARAMS(X) X(pos_embed) - - TM_MODULE_DECLARE(Qwen3_5VitWeight, QWEN3_5VIT_WEIGHT_CHILDREN, QWEN3_5VIT_WEIGHT_PARAMS) - - // --- Accessors --- - const core::Qwen3_5VitConfig& config() const noexcept - { - return config_; - } - - Qwen3_5VitBlockWeight* block(int i) const; - -private: - core::Qwen3_5VitConfig config_{}; -}; - -} // namespace turbomind diff --git a/src/turbomind/models/qwenvit/qwenvit.cc b/src/turbomind/models/qwenvit/qwenvit.cc new file mode 100644 index 0000000000..1c58cef8ab --- /dev/null +++ b/src/turbomind/models/qwenvit/qwenvit.cc @@ -0,0 +1,1024 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#include "src/turbomind/models/qwenvit/qwenvit.h" + +#include "src/turbomind/core/logger.h" +#include "src/turbomind/core/scope.h" +#include "src/turbomind/kernels/activation.h" +#include "src/turbomind/kernels/attention/attention.h" +#include "src/turbomind/kernels/gpt_kernels.h" +#include "src/turbomind/kernels/norm/layer_norm.h" +#include "src/turbomind/kernels/norm/rms_norm.h" +#include "src/turbomind/models/layer_norm_weight.h" +#include "src/turbomind/models/llama/SequenceManager.h" +#include "src/turbomind/models/norm_weight.h" +#include "src/turbomind/models/qwenvit/qwenvit_block_weight.h" +#include "src/turbomind/models/qwenvit/qwenvit_input.h" +#include "src/turbomind/models/qwenvit/qwenvit_kernels.h" +#include "src/turbomind/models/qwenvit/qwenvit_weight.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/memory_utils.h" + +#include +#include + +namespace turbomind { + +struct QwenVit::Impl { + const QwenVitWeight& weights_; + const core::QwenVitConfig& config_; + LlamaLinear& linear_; + const comm::HostComm& h_tp_group; + comm::DeviceCommImpl* const d_comm_; + const int tp_group_; + const DataType engine_data_type_; + + Buffer_ grid_thws_buf_; // (t, h, w) + Buffer_ grid_offsets_buf_; // (token offset, natural offset) + Buffer_ window_idx_buf_; + Buffer_ cu_window_seqlens_buf_; + Buffer_ attn_cu_seqlens_buf_; + + struct Data { + Tensor batch_input; + int batch_size; + std::vector> grid_thws_host; + std::vector> image_embeds_coords; // (size, pos) for image embeddings + std::vector> input_embeds_coords; // (size, pos) for input embeddings + + // for RoPE / pos-embed interpolation + Tensor_ grid_thws; + Tensor_ grid_offsets; + Tensor_ mapped_idx; + Tensor_ window_idx; + Tensor_ window_mapped_idx; + int total_hw; + int merge_unit_count; + + // for full attention, one sequence per temporal frame + Tensor_ attn_cu_seqlens; + Tensor_ cu_window_seqlens; + Tensor_ attn_finished; + Tensor_ window_attn_finished; + int attn_batch_size; + int max_attn_len; + int window_attn_batch_size; + int max_window_attn_len; + + // mrope position-id scratch (per-phase pinned host buffers for fast H2D) + Buffer_ mrope_segs_host; // reinterpreted as MropeSegment[], kMropeSegInts ints per segment + Buffer_ mrope_length_host; + Buffer_ mrope_delta_host; + Buffer_ mrope_offsets_host; + + // mrope outputs - owned here so UnifiedAttentionLayer can safely borrow() across env clears. + Buffer_ mrope_segs_dev; // device-side segment scratch, grown alongside host + Tensor_ mrope_position_ids; // (max_forward_token_num, 3), flat current-forward table + Tensor_ mrope_length; // (bsz,) + Tensor_ mrope_position_delta; // (bsz,) + Tensor_ mrope_position_offsets; // (bsz,), flat row offset for each request slot + + void Clear() + { + batch_size = 0; + total_hw = 0; + merge_unit_count = 0; + attn_batch_size = 0; + max_attn_len = 0; + window_attn_batch_size = 0; + max_window_attn_len = 0; + grid_thws_host.clear(); + image_embeds_coords.clear(); + input_embeds_coords.clear(); + } + }; + + static constexpr int kMropeSegInts = sizeof(MropeSegment) / sizeof(int); + static_assert(sizeof(MropeSegment) % sizeof(int) == 0); + + std::vector data_; + + Impl(const EngineParam& engine, const Context& ctx, const QwenVitWeight& weights, int phases): + weights_{weights}, + config_{weights.config()}, + linear_{*ctx.linear}, + h_tp_group{ctx.comm.h_comm}, + d_comm_{ctx.comm.d_comm}, + tp_group_{ctx.comm.d_tp_group}, + engine_data_type_{engine.data_type} + { + for (int i = 0; i < phases; ++i) { + auto& d = data_.emplace_back(); + d.mrope_length_host = {engine.max_batch_size, kCPUpinned}; + d.mrope_delta_host = {engine.max_batch_size, kCPUpinned}; + d.mrope_offsets_host = {engine.max_batch_size, kCPUpinned}; + + // mrope outputs at worst-case current-forward shape so Setup() never reallocates them. + d.mrope_length = Tensor_{{engine.max_batch_size}, kDEVICE}; + d.mrope_position_delta = Tensor_{{engine.max_batch_size}, kDEVICE}; + d.mrope_position_offsets = Tensor_{{engine.max_batch_size}, kDEVICE}; + d.mrope_position_ids = Tensor_{{engine.max_forward_token_num, 3}, kDEVICE}; + } + } + + struct WindowShape { + int vit_window{}; + int llm_h{}; + int llm_w{}; + int win_h{}; + int win_w{}; + }; + + static void EnsureTensor(Tensor& tensor, Layout layout, DataType dtype, DeviceType device) + { + if (tensor.size() < layout.cosize()) { + tensor = Tensor{std::move(layout), dtype, device}; + } + } + + template + static void EnsureTensor(Tensor_& tensor, Layout layout, DeviceType device) + { + if (tensor.size() < layout.cosize()) { + tensor = Tensor_{std::move(layout), device}; + } + } + + template + static void EnsureBuffer(Buffer_& buffer, ssize_t size, DeviceType device) + { + if (buffer.size() < size) { + buffer = Buffer_{size, device}; + } + } + + WindowShape GetWindowShape(int h, int w) const + { + const int S = config_.spatial_merge_size; + const int vit_window = config_.window_size / S / config_.patch_size; + TM_CHECK_GT(vit_window, 0); + + const int llm_h = h / S; + const int llm_w = w / S; + return WindowShape{ + vit_window, llm_h, llm_w, (llm_h + vit_window - 1) / vit_window, (llm_w + vit_window - 1) / vit_window}; + } + + void CollectPrefillInputs(Data& d, const BatchData& b, std::vector& pixel_values) const + { + const auto& cfg = config_; + + int input_ids_offsets = 0; + int image_embeds_offsets = 0; + for (int i = 0; i < b.rc.size(); ++i) { + const auto& c = *b.rc[i]; + const auto& s = *c.seq; + + if ((not c.autoregres) && (not s.multimodal_inputs.empty())) { + Interval text{c.history_len + c.alpha, Interval::Size{c.input_len}}; + for (const auto& mm : s.multimodal_inputs) { + auto o = mm->interval & text; + if (auto size = (int)o.size()) { + pixel_values.push_back(mm->data); + d.batch_size += mm->data.shape(0); + + const int text_offset = input_ids_offsets + o.begin() - text.begin(); + const int image_offset = image_embeds_offsets + o.begin() - mm->interval.begin(); + d.input_embeds_coords.emplace_back(size, text_offset); + d.image_embeds_coords.emplace_back(size, image_offset); + + const auto& grid_thw = mm->grid_thw; + d.grid_thws_host.emplace_back(grid_thw); + const auto& [t, h, w] = grid_thw; + const int prod = t * h * w; + image_embeds_offsets += prod / cfg.spatial_merge_size / cfg.spatial_merge_size; + } + } + } + + input_ids_offsets += c.autoregres ? 1 : c.input_len; + } + } + + void ComputeSetupStats(Data& d) const + { + const auto& cfg = config_; + const int S = cfg.spatial_merge_size; + const int merge_unit = S * S; + + d.attn_batch_size = 0; + d.max_attn_len = 0; + d.merge_unit_count = 0; + d.window_attn_batch_size = 0; + d.max_window_attn_len = 0; + + for (const auto& [t, h, w] : d.grid_thws_host) { + TM_CHECK(h % S == 0); + TM_CHECK(w % S == 0); + + const int hw = h * w; + d.attn_batch_size += t; + d.max_attn_len = std::max(d.max_attn_len, hw); + d.merge_unit_count += t * (h / S) * (w / S); + + if (cfg.use_window_attention) { + const auto win = GetWindowShape(h, w); + d.window_attn_batch_size += t * win.win_h * win.win_w; + d.max_window_attn_len = std::max(d.max_window_attn_len, win.vit_window * win.vit_window * merge_unit); + } + } + } + + void EnsureSetupStorage(Data& d) + { + const auto& cfg = config_; + const int num_grids = (int)d.grid_thws_host.size(); + + core::ContextGuard ctx{Allocator{kCPUpinned}}; + EnsureTensor(d.batch_input, {d.batch_size, cfg.patch_in_dim}, cfg.data_type, kCPUpinned); + EnsureBuffer(grid_thws_buf_, (ssize_t)num_grids * 3, kCPUpinned); + EnsureBuffer(grid_offsets_buf_, (ssize_t)num_grids * 2, kCPUpinned); + EnsureBuffer(attn_cu_seqlens_buf_, (ssize_t)d.attn_batch_size + 1, kCPUpinned); + + EnsureTensor(d.grid_thws, {num_grids, 3}, kDEVICE); + EnsureTensor(d.grid_offsets, {num_grids, 2}, kDEVICE); + EnsureTensor(d.mapped_idx, {d.batch_size}, kDEVICE); + EnsureTensor(d.attn_cu_seqlens, {d.attn_batch_size + 1}, kDEVICE); + EnsureTensor(d.attn_finished, {d.attn_batch_size}, kDEVICE); + + if (cfg.use_window_attention) { + EnsureBuffer(window_idx_buf_, (ssize_t)d.merge_unit_count, kCPUpinned); + EnsureBuffer(cu_window_seqlens_buf_, (ssize_t)d.window_attn_batch_size + 1, kCPUpinned); + EnsureTensor(d.window_idx, {d.merge_unit_count}, kDEVICE); + EnsureTensor(d.window_mapped_idx, {d.batch_size}, kDEVICE); + EnsureTensor(d.cu_window_seqlens, {d.window_attn_batch_size + 1}, kDEVICE); + EnsureTensor(d.window_attn_finished, {d.window_attn_batch_size}, kDEVICE); + } + } + + void StagePixelValues(Data& d, const std::vector& pixel_values) const + { + if (d.batch_size == 0) { + return; + } + + ssize_t batch_offset = 0; + for (const auto& pixel_value : pixel_values) { + TM_CHECK_EQ(pixel_value.size(), pixel_value.shape(0) * config_.patch_in_dim); + TM_CHECK_EQ(pixel_value.dtype(), d.batch_input.dtype()); + Copy(pixel_value, d.batch_input.slice(batch_offset, pixel_value.shape(0))); + batch_offset += pixel_value.shape(0); + } + TM_CHECK_EQ(batch_offset, d.batch_size); + } + + void BuildHostAttentionMeta(Data& d) + { + const auto& cfg = config_; + const int S = cfg.spatial_merge_size; + const int merge_unit = S * S; + const int num_grids = (int)d.grid_thws_host.size(); + + int token_offset = 0; + int natural_offset = 0; + int attn_seq_idx = 0; + int attn_offset = 0; + attn_cu_seqlens_buf_.data()[attn_seq_idx++] = 0; + int window_group_pos = 0; + int window_id_base = 0; + int window_seq_idx = 0; + int window_offset = 0; + if (cfg.use_window_attention) { + cu_window_seqlens_buf_.data()[window_seq_idx++] = 0; + } + + for (int i = 0; i < num_grids; ++i) { + const auto& [t, h, w] = d.grid_thws_host[i]; + + const int hw = h * w; + for (int tt = 0; tt < t; ++tt) { + attn_offset += hw; + attn_cu_seqlens_buf_.data()[attn_seq_idx++] = attn_offset; + } + + grid_thws_buf_.data()[i * 3] = t; + grid_thws_buf_.data()[i * 3 + 1] = h; + grid_thws_buf_.data()[i * 3 + 2] = w; + grid_offsets_buf_.data()[i * 2] = token_offset; + grid_offsets_buf_.data()[i * 2 + 1] = natural_offset; + + TM_CHECK_LE(token_offset + t * h * w, d.batch_size); + + if (cfg.use_window_attention) { + const auto win = GetWindowShape(h, w); + for (int tt = 0; tt < t; ++tt) { + for (int wh = 0; wh < win.win_h; ++wh) { + for (int ww = 0; ww < win.win_w; ++ww) { + int valid_cells = 0; + for (int ih = 0; ih < win.vit_window; ++ih) { + const int llm_i = wh * win.vit_window + ih; + for (int iw = 0; iw < win.vit_window; ++iw) { + const int llm_j = ww * win.vit_window + iw; + if (llm_i >= win.llm_h || llm_j >= win.llm_w) { + continue; + } + const int local_group = (tt * win.llm_h + llm_i) * win.llm_w + llm_j; + const int orig_group = window_id_base + local_group; + window_idx_buf_.data()[window_group_pos++] = orig_group; + ++valid_cells; + } + } + window_offset += valid_cells * merge_unit; + if (cu_window_seqlens_buf_.data()[window_seq_idx - 1] != window_offset) { + cu_window_seqlens_buf_.data()[window_seq_idx++] = window_offset; + } + } + } + } + window_id_base += t * win.llm_h * win.llm_w; + } + + token_offset += t * h * w; + natural_offset += h * w; + } + + TM_CHECK_EQ(token_offset, d.batch_size); + TM_CHECK_EQ(attn_offset, d.batch_size); + TM_CHECK_EQ(attn_seq_idx, d.attn_batch_size + 1); + if (cfg.use_window_attention) { + TM_CHECK_EQ(window_group_pos, d.merge_unit_count); + TM_CHECK_EQ(window_offset, d.batch_size); + TM_CHECK_EQ(window_seq_idx, d.window_attn_batch_size + 1); + } + + d.total_hw = natural_offset; + } + + void PublishHostMetadataCopies(BatchCopy& copy, Data& d) + { + const int num_grids = (int)d.grid_thws_host.size(); + + copy(grid_thws_buf_.data(), num_grids * 3, d.grid_thws.data()); + copy(grid_offsets_buf_.data(), num_grids * 2, d.grid_offsets.data()); + copy(attn_cu_seqlens_buf_.data(), d.attn_batch_size + 1, d.attn_cu_seqlens.data()); + Clear(d.attn_finished.slice(0, d.attn_batch_size)); + + if (config_.use_window_attention) { + copy(window_idx_buf_.data(), d.merge_unit_count, d.window_idx.data()); + copy(cu_window_seqlens_buf_.data(), d.window_attn_batch_size + 1, d.cu_window_seqlens.data()); + Clear(d.window_attn_finished.slice(0, d.window_attn_batch_size)); + } + } + + void AllReduceSum(Tensor& tensor, cudaStream_t stream) const + { + if (d_comm_) { + d_comm_->AllReduceSum( + tensor.raw_data(), tensor.raw_data(), tensor.size(), tensor.dtype(), tp_group_, stream); + TM_CUDA_CHECK(cudaGetLastError()); + } + } + + void ApplyNorm(Tensor& out, const Tensor& input, const core::Module& norm, NormType norm_type) const + { + auto stream = core::Context::stream().handle(); + switch (norm_type) { + case NormType::kLayerNorm: { + const auto& ln = static_cast(norm); + invokeLayerNorm(out, input, ln.weight, ln.bias, ln.norm_eps_, stream); + break; + } + case NormType::kRMSNorm: { + const auto& rms = static_cast(norm); + invokeRMSNorm(out, input, rms.weight, rms.norm_eps_, stream); + break; + } + default: + TM_LOG_FATAL("unsupported QwenVit norm type: {}", (int)norm_type); + } + TM_CUDA_CHECK(cudaGetLastError()); + } + + void ResidualBiasNorm(Tensor& hidden_states, + Tensor& residual, + const Tensor& residual_bias, + const core::Module& norm, + NormType norm_type) const + { + auto stream = core::Context::stream().handle(); + switch (norm_type) { + case NormType::kLayerNorm: { + const auto& ln = static_cast(norm); + invokeResidualBiasLayerNorm(hidden_states.raw_data(), + residual.raw_data(), + ln.weight.raw_data(), + ln.bias.data_or((void*)nullptr), + residual_bias.data_or((void*)nullptr), + hidden_states.dtype(), + config_.hidden_dim, + hidden_states.shape(0), + ln.norm_eps_, + stream); + break; + } + case NormType::kRMSNorm: { + const auto& rms = static_cast(norm); + invokeResidualBiasRMSNorm(hidden_states.raw_data(), + residual.raw_data(), + rms.weight.raw_data(), + residual_bias.data_or((void*)nullptr), + hidden_states.dtype(), + config_.hidden_dim, + hidden_states.shape(0), + rms.norm_eps_, + stream); + break; + } + default: + TM_LOG_FATAL("unsupported QwenVit norm type: {}", (int)norm_type); + } + TM_CUDA_CHECK(cudaGetLastError()); + } + + // Qwen3.5: precompute the bilinear-interpolation gather indices/weights for the + // learned position-embedding table, then gather the 4 neighbour rows. Consumed by + // `invokeFusedPosEmbedMerge` in Forward(). No-op for models without pos_embed. + void FastPosEmbedInterpolate(Data& d, TensorMap& env) + { + auto& cfg = weights_.config(); + auto stream = core::Context::stream().handle(); + + const int num_grid_per_side = (int)std::sqrt(cfg.num_position_embeddings); + TM_CHECK_EQ(num_grid_per_side * num_grid_per_side, cfg.num_position_embeddings); + TM_CHECK_EQ(weights_.pos_embed.shape(0), cfg.num_position_embeddings); + TM_CHECK_EQ(weights_.pos_embed.shape(1), cfg.hidden_dim); + + Buffer_ pos_embed_idx = {d.total_hw * 4, kDEVICE}; + Tensor pos_embed_weights = {{d.total_hw, 4}, cfg.data_type, kDEVICE}; + invokeFastPosEmbedIdxWeight(pos_embed_idx.data(), + pos_embed_weights.raw_data(), + cfg.data_type, + d.grid_thws.data(), + d.grid_offsets.data(), + (int)d.grid_thws_host.size(), + d.total_hw, + num_grid_per_side, + stream); + TM_CUDA_CHECK(cudaGetLastError()); + + Tensor pos_embeds = {{d.total_hw * 4, cfg.hidden_dim}, cfg.data_type, kDEVICE}; + invokeEmbeddingLookup(pos_embeds, pos_embed_idx, weights_.pos_embed, stream); + TM_CUDA_CHECK(cudaGetLastError()); + + env.produce("pos_embeds", pos_embeds); + env.produce("pos_embed_weights", pos_embed_weights); + } + + void RotPosEmb(Data& d, TensorMap& env) + { + auto& cfg = weights_.config(); + + const int head_dim = cfg.hidden_dim / cfg.head_num; + // produce rotary_pos_emb: [total_hw, head_dim] with interleaved (c,s,c,s,...) pairs, + // keyed by the same natural flat index that `mapped_idx` already carries. Vision q/k + // are reordered into this adjacent-pair layout at export time. + Tensor rotary_pos_emb = {{d.total_hw, head_dim}, cfg.data_type, kDEVICE}; + invokeQwenVitRotaryPosEmb(rotary_pos_emb.raw_data(), + cfg.data_type, + d.grid_thws.data(), + d.grid_offsets.data(), + (int)d.grid_thws_host.size(), + d.total_hw, + head_dim, + /*theta=*/10000.0f, + core::Context::stream().handle()); + TM_CUDA_CHECK(cudaGetLastError()); + env.produce("rotary_pos_emb", rotary_pos_emb); + } + + Tensor PatchEmbedding(Data& d) + { + Tensor host_input = d.batch_input.slice(0, d.batch_size); + Tensor input = empty_like(host_input, kDEVICE); + + Copy(host_input, input); + TM_CUDA_CHECK(cudaGetLastError()); + + Tensor output; + TM_SCOPE_CALL(linear_.Forward(input, *weights_.patch_embed, output)); + return output; + } + + int Add(RequestCache& c) + { + const auto& [r, s] = std::tie(*c.req, *c.seq); + if (r.mm_inputs) { + if ((not r.session.start_flag) or (not r.session.end_flag)) { + // only support non-interactive inference + return Request::kInvalid; + } + + const auto mm_inputs = std::dynamic_pointer_cast(r.mm_inputs); + if (!mm_inputs) { + return Request::kInvalid; + } + + for (const auto& item : mm_inputs->items) { + if (item.modality != multimodal::Modality::kImage && item.modality != multimodal::Modality::kVideo) { + return Request::kInvalid; + } + + const int tokens = item.token_end - item.token_begin; + if (tokens <= 0) { + return Request::kInvalid; + } + + auto mm_item = std::make_shared( + MultiModalData{item.data, Interval{item.token_begin, Interval::Size{tokens}}, item.grid_thw}); + s.multimodal_inputs.push_back(mm_item); + } + } + + return Request::kOk; + } + + void Add(int phase, TensorMap& env) + { + // convert model-specific multimodal inputs to internal MultiModalData + const Buffer_ rc = env.at("requests").buffer(); + for (int i = 0; i < rc.size(); ++i) { + auto& c = *TM_CHECK_NOTNULL(rc[i]); + if (c.status == 0) { + c.status = Add(c); + } + } + } + + // Build the mrope tensors consumed by `UnifiedAttentionLayer` and publish them to env. + // + // Per-forward layout: one flat row in `(max_forward_token_num, 3)` for each current token. + // Prefill slots with multimodal_inputs get their active range written by + // `invokeMropePositionIds` from a clipped list of MropeSegment descriptors. All other slots + // (decode + text-only prefill) get `length[i] = 0` so FastRoPE falls through to the closed- + // form `timestep + delta` path and never reads the stale rows. + // + // The output tensors live on `Data` (allocated worst-case in the ctor). env shares + // ownership via shared_ptr; UAL borrows safely across env clears. + void SetupMrope(int phase, TensorMap& env, BatchCopy& copy) + { + auto& d = data_.at(phase); + auto& b = *env.at("batch").data()[0]; + auto& rc = b.rc; + + const int bsz = (int)rc.size(); + if (bsz <= 0) { + return; + } + + const int S = weights_.config().spatial_merge_size; + + // 1) One pass to upper-bound segment count, build flat forward offsets, then size scratch. + // Worst case per prefill slot with mrope: 2*num_images + 1 segments. + int upper_segs = 0; + int total_q_tokens = 0; + for (int i = 0; i < bsz; ++i) { + const auto& c = *rc[i]; + d.mrope_offsets_host.data()[i] = total_q_tokens; + total_q_tokens += c.autoregres ? 1 : c.input_len; + if (!c.autoregres && !c.seq->multimodal_inputs.empty()) { + upper_segs += 2 * (int)c.seq->multimodal_inputs.size() + 1; + } + } + TM_CHECK_LE(total_q_tokens, d.mrope_position_ids.shape(0)); + + const ssize_t upper_ints = (ssize_t)upper_segs * kMropeSegInts; + if (upper_ints > d.mrope_segs_host.size()) { + core::ContextGuard ctx{Allocator{kCPUpinned}}; + d.mrope_segs_host = Buffer_{upper_ints, kCPUpinned}; + d.mrope_segs_dev = Buffer_{upper_ints, kDEVICE}; + } + + // 2) Unified per-request walk - always advance mm_off; emit segments only for needs_table. + auto* segs = reinterpret_cast(d.mrope_segs_host.data_or(nullptr)); + int n_segs = 0; + int max_seg_len = 0; + + for (int i = 0; i < bsz; ++i) { + const auto& c = *rc[i]; + const auto& s = *c.seq; + const bool needs_table = !c.autoregres && !s.multimodal_inputs.empty(); + const int active_start = c.history_len + c.alpha; + const int active_end = active_start + c.input_len; + const int q_offset = d.mrope_offsets_host.data()[i]; + + auto emit = [&](int run_start, int run_n, int run_base, int h2, int w2) { + const int a = std::max(run_start, active_start); + const int b = std::min(run_start + run_n, active_end); + if (a >= b) { + return; + } + const int local_off = a - run_start; + segs[n_segs++] = MropeSegment{ + q_offset + (a - active_start), + b - a, + /*base_pos=*/(h2 == 0) ? run_base + local_off : run_base, + h2, + w2, + /*k_offset=*/(h2 == 0) ? 0 : local_off, + }; + max_seg_len = std::max(max_seg_len, b - a); + }; + + int row = 0, pos = 0, mm_off = 0; + for (const auto& mm : s.multimodal_inputs) { + const auto& [t, h, w] = mm->grid_thw; + const int h2 = h / S, w2 = w / S, n_tok = t * h2 * w2; + TM_CHECK_EQ(n_tok, (int)mm->interval.size()) << "image token count mismatches grid_thw"; + const int img_start = mm->interval.begin(); + const int img_base = img_start + mm_off; + if (needs_table) { + if (img_start > row) { + emit(row, img_start - row, pos, /*h2=*/0, /*w2=*/0); + } + emit(img_start, n_tok, img_base, h2, w2); + } + row = img_start + n_tok; + const int new_pos = std::max(t, std::max(h2, w2)); + pos = img_base + new_pos; + mm_off += new_pos - n_tok; + } + if (needs_table && row < active_end) { + emit(row, active_end - row, pos, /*h2=*/0, /*w2=*/0); + } + + d.mrope_length_host.data()[i] = needs_table ? c.input_len : 0; + d.mrope_delta_host.data()[i] = mm_off; + } + + // 3) Copy the bsz prefix of length / delta / flat offsets into the pre-allocated tensors. + // Rows beyond bsz are untouched (UAL never reads them). + copy(d.mrope_length_host, bsz, d.mrope_length.buffer()); + copy(d.mrope_delta_host, bsz, d.mrope_position_delta.buffer()); + copy(d.mrope_offsets_host, bsz, d.mrope_position_offsets.buffer()); + + // 4) Populate position_ids only when a slot actually needs the table. Rows for slots + // with length[i] == 0 are unreachable from FastRoPE, so leaving them stale is safe. + if (n_segs > 0) { + const ssize_t segs_ints = (ssize_t)n_segs * kMropeSegInts; + Copy(d.mrope_segs_host.slice(0, segs_ints), d.mrope_segs_dev.slice(0, segs_ints)); + invokeMropePositionIds(d.mrope_position_ids.data(), + reinterpret_cast(d.mrope_segs_dev.data()), + n_segs, + max_seg_len, + core::Context::stream().handle()); + TM_CUDA_CHECK(cudaGetLastError()); + } + + // 5) Publish all tensors - the consumer relies on this contract unconditionally. + env.produce("mrope_length", d.mrope_length); + env.produce("mrope_position_delta", d.mrope_position_delta); + env.produce("mrope_position_offsets", d.mrope_position_offsets); + env.produce("mrope_position_ids", d.mrope_position_ids); + } + + void Setup(int phase, TensorMap& env) + { + auto& d = data_.at(phase); + auto& b = *env.at("batch").data()[0]; + auto& copy = *env.at("copy").data()[0]; + + d.Clear(); + std::vector pixel_values; + CollectPrefillInputs(d, b, pixel_values); + + if (d.batch_size > 0) { + ComputeSetupStats(d); + EnsureSetupStorage(d); + StagePixelValues(d, pixel_values); + + BuildHostAttentionMeta(d); + PublishHostMetadataCopies(copy, d); + } + + SetupMrope(phase, env, copy); + h_tp_group->Sync(); + } + + void Prepare(int phase, TensorMap& env) + { + auto& d = data_.at(phase); + if (d.batch_size == 0) { + return; + } + + auto stream = core::Context::stream().handle(); + invokeQwenVitBuildMappedIdx(d.mapped_idx.data(), + d.grid_thws.data(), + d.grid_offsets.data(), + (int)d.grid_thws_host.size(), + config_.spatial_merge_size, + stream); + if (config_.use_window_attention) { + invokeQwenVitBuildWindowMappedIdx(d.window_mapped_idx.data(), + d.mapped_idx.data(), + d.window_idx.data(), + config_.spatial_merge_size * config_.spatial_merge_size, + d.merge_unit_count, + stream); + } + + // Qwen3.5 learned positional embedding (bilinear interpolation of a fixed grid). + if (config_.num_position_embeddings > 0) { + FastPosEmbedInterpolate(d, env); + } + + RotPosEmb(d, env); + } + + void Forward(int phase, TensorMap& args) + { + auto& d = data_.at(phase); + if (d.batch_size == 0) { + return; + } + + auto& cfg = weights_.config(); + + auto residual = PatchEmbedding(d); + auto rotary_pos_emb = args.consume("rotary_pos_emb"); + auto stream = core::Context::stream().handle(); + + // Qwen3.5: fused pos-embed gather/merge into the patch_embed output (with bias folded in): + // residual[pos, d] += Σ_k w[k] * pos_embeds[mapped*4+k, d] + bias[d] + if (cfg.num_position_embeddings > 0) { + auto pos_embeds = args.consume("pos_embeds"); + auto pos_embed_weights = args.consume("pos_embed_weights"); + invokeFusedPosEmbedMerge(residual.raw_data(), + pos_embeds.raw_data(), + pos_embed_weights.raw_data(), + d.mapped_idx.data(), + weights_.patch_embed->bias ? weights_.patch_embed->bias.raw_data() : nullptr, + d.batch_size, + cfg.hidden_dim, + cfg.data_type, + stream); + TM_CUDA_CHECK(cudaGetLastError()); + } + + if (cfg.use_window_attention) { + Tensor reordered{{d.batch_size, cfg.hidden_dim}, cfg.data_type, kDEVICE}; + invokeQwenVitWindowReorder(reordered, + residual, + d.window_idx.data(), + cfg.spatial_merge_size * cfg.spatial_merge_size, + d.merge_unit_count, + stream); + residual = std::move(reordered); + } + + Tensor hidden_states = [&]() { + Buffer symm_buf = args.contains("symm_buf") ? args.at("symm_buf").buffer() : Buffer{}; + if (symm_buf && d.batch_size * cfg.hidden_dim <= symm_buf.size() / turbomind::byte_size(cfg.data_type)) { + return Tensor{symm_buf.view(cfg.data_type), {d.batch_size, cfg.hidden_dim}}; + } + else { + return Tensor{{d.batch_size, cfg.hidden_dim}, cfg.data_type, kDEVICE}; + } + }(); + + ApplyNorm(hidden_states, residual, *weights_.block(0)->norm1, cfg.norm_type); + + for (int layer_id = 0; layer_id < cfg.depth; ++layer_id) { + auto* block = weights_.block(layer_id); + + // attn + auto invoke = [&](auto t) { + using T = decltype(t); + Attn(hidden_states, hidden_states, d, layer_id, rotary_pos_emb); + }; + TM_DISPATCH_PRIMARY_DTYPES(hidden_states.dtype(), invoke); + + if (block->attention->tp_size > 1) { + AllReduceSum(hidden_states, stream); + } + + ResidualBiasNorm(hidden_states, residual, block->attention->wo->bias, *block->norm2, cfg.norm_type); + + // mlp + Mlp(hidden_states, hidden_states, d, layer_id); + AllReduceSum(hidden_states, stream); + + const auto* next_norm = + layer_id + 1 < cfg.depth ? weights_.block(layer_id + 1)->norm1.get() : weights_.merger_norm.get(); + TM_CHECK_NOTNULL(next_norm); + ResidualBiasNorm(hidden_states, residual, block->mlp_fc2->bias, *next_norm, cfg.norm_type); + } + + Tensor image_embeds = Merger(hidden_states); + if (cfg.use_window_attention) { + Tensor reordered{{d.merge_unit_count, cfg.out_hidden_dim}, image_embeds.dtype(), kDEVICE}; + invokeQwenVitReverseWindow(reordered, image_embeds, d.window_idx.data(), d.merge_unit_count, stream); + image_embeds = std::move(reordered); + } + + // ViT may run in its own dtype (e.g. bf16) while the text engine runs + // in fp16 (AWQ-forced). PatchMultimodalEmbedding merges this buffer + // into the text embedding stream via a byte-level copy, so the dtypes + // must match before publishing. + EnsureFloatDtype(image_embeds, engine_data_type_); + + args.produce("multimodal", + MultiModalEmbeddingData{image_embeds, d.image_embeds_coords, d.input_embeds_coords}.buf()); + } + + template + AttentionParams CreateVitAttentionParams( + Tensor& attn_context, Tensor& qkv, Tensor& kv, Data& d, const AttentionWeight& attn, int layer_id) + { + const bool use_full_attn = IsFullAttentionLayer(layer_id); + const int local_head_num = attn.head_num / attn.tp_size; + const int head_dim = attn.head_dim; + const int token_num = d.batch_size; + + AttentionParams params{}; + params.out = (T*)attn_context.raw_data(); + params.q = (T*)qkv.raw_data(); + + params.stride = (int64_t)local_head_num * 3 * head_dim; + + params.cu_q_len = use_full_attn ? d.attn_cu_seqlens.data() : d.cu_window_seqlens.data(); + params.cu_k_len = params.cu_q_len; + params.finished = use_full_attn ? d.attn_finished.data() : d.window_attn_finished.data(); + + params.linear_iter_params = LinearIteratorParams{ + kv.raw_data(), + 2 * token_num * head_dim, + token_num * head_dim, + }; + + params.token_num = token_num; + params.batch_size = use_full_attn ? d.attn_batch_size : d.window_attn_batch_size; + params.max_q_len = use_full_attn ? d.max_attn_len : d.max_window_attn_len; + params.max_k_len = params.max_q_len; + + params.num_heads = local_head_num; + params.num_kv_heads = local_head_num; + params.size_per_head = head_dim; + params.causal = false; + params.layer_id = layer_id; + + double scaling = 1.; + if (attn.softmax_scale) { + scaling *= attn.softmax_scale; + } + else { + scaling /= std::sqrt((float)head_dim); + } + params.inv_sqrt_dh = scaling * std::log2(std::exp(1.)); + + params.window_size = 0; + params.rope_param.type = RopeType::kNull; + params.max_split_k = 1; + params.cp_size = 1; + params.stream = core::Context::stream().handle(); + + return params; + } + + bool IsFullAttentionLayer(int layer_id) const + { + if (!config_.use_window_attention) { + return true; + } + return std::find(config_.fullatt_block_indexes.begin(), config_.fullatt_block_indexes.end(), layer_id) + != config_.fullatt_block_indexes.end(); + } + + template + void Attn(Tensor& input, Tensor& output, Data& d, int layer_id, const Tensor& rotary_pos_emb) + { + auto& vit_cfg = weights_.config(); + auto* attn = weights_.block(layer_id)->attention.get(); + + Tensor qkv; + TM_SCOPE_CALL(linear_.Forward(input, *attn->w_qkv, qkv)); + TM_CUDA_CHECK(cudaGetLastError()); + + const int local_head_num = attn->head_num / attn->tp_size; + const int head_dim = attn->head_dim; // may be padded + const int rope_head_dim = vit_cfg.hidden_dim / vit_cfg.head_num; // model's real per-head dim + const int token_num = d.batch_size; + + const int* mapped_idx = (config_.use_window_attention ? d.window_mapped_idx.data() : d.mapped_idx.data()); + + Tensor tmp_kv{{local_head_num, 2, d.batch_size, head_dim}, qkv.dtype(), qkv.device()}; + invokeQwenVitPrepareQKV(qkv.raw_data(), + tmp_kv.raw_data(), + attn->w_qkv->bias.raw_data(), + rotary_pos_emb.raw_data(), + mapped_idx, + qkv.dtype(), + token_num, + local_head_num, + head_dim, + rope_head_dim, + core::Context::stream().handle()); + TM_CUDA_CHECK(cudaGetLastError()); + + Tensor attn_output{{token_num, local_head_num * head_dim}, qkv.dtype(), qkv.device()}; + auto params = CreateVitAttentionParams(attn_output, qkv, tmp_kv, d, *attn, layer_id); + dispatchAttention(params); + TM_CUDA_CHECK(cudaGetLastError()); + + TM_SCOPE_CALL(linear_.Forward(attn_output, *attn->wo, output)); + TM_CUDA_CHECK(cudaGetLastError()); + } + + void Mlp(Tensor& input, Tensor& output, Data& d, int layer_id) + { + auto* block = weights_.block(layer_id); + auto stream = core::Context::stream().handle(); + + if (config_.gated_mlp) { + TM_CHECK(block->mlp_gate); + Tensor gate; + Tensor up; + TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_gate, gate)); + TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_fc1, up)); + TM_CUDA_CHECK(cudaGetLastError()); + + ApplyBias(gate, block->mlp_gate->bias, stream); + ApplyBias(up, block->mlp_fc1->bias, stream); + Activation(gate, up, ActivationType::kSilu, stream); + TM_CUDA_CHECK(cudaGetLastError()); + + TM_SCOPE_CALL(linear_.Forward(gate, *block->mlp_fc2, output)); + } + else { + Tensor inter; + TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_fc1, inter)); + TM_CUDA_CHECK(cudaGetLastError()); + + // Qwen2-VL/2.5 use the erf GELU; Qwen3.5 uses the tanh approximation. + const ActivationType act = config_.gelu_tanh ? ActivationType::kGeluPytorchTanh : ActivationType::kGelu; + invokeAddBiasActivation(inter, block->mlp_fc1->bias, act, stream); + TM_CUDA_CHECK(cudaGetLastError()); + + TM_SCOPE_CALL(linear_.Forward(inter, *block->mlp_fc2, output)); + } + TM_CUDA_CHECK(cudaGetLastError()); + } + + Tensor Merger(Tensor& input) + { + auto& cfg = config_; + auto stream = core::Context::stream().handle(); + + const int merge_area = cfg.spatial_merge_size * cfg.spatial_merge_size; + Tensor merged_input = input.view({-1, cfg.hidden_dim * merge_area}); + + Tensor inter; + TM_SCOPE_CALL(linear_.Forward(merged_input, *weights_.merger_fc1, inter)); + TM_CUDA_CHECK(cudaGetLastError()); + + invokeAddBiasActivation(inter, weights_.merger_fc1->bias, ActivationType::kGelu, stream); + TM_CUDA_CHECK(cudaGetLastError()); + + Tensor output; + TM_SCOPE_CALL(linear_.Forward(inter, *weights_.merger_fc2, output)); + TM_CUDA_CHECK(cudaGetLastError()); + + AllReduceSum(output, stream); + + ApplyBias(output, weights_.merger_fc2->bias, stream); + TM_CUDA_CHECK(cudaGetLastError()); + + return output; + } +}; + +QwenVit::QwenVit(const EngineParam& engine, const Context& ctx, const QwenVitWeight& weights, int phases): + impl_{std::make_unique(engine, ctx, weights, phases)} +{ +} + +QwenVit::~QwenVit() = default; + +void QwenVit::Run(BatchOp op, int phase, TensorMap& env) +{ + TM_FUNCTION_SCOPE(); + switch (op) { + case BatchOp::kAdd: + return impl_->Add(phase, env); + case BatchOp::kSetup: + return impl_->Setup(phase, env); + case BatchOp::kPrepare: + return impl_->Prepare(phase, env); + case BatchOp::kForward: + return impl_->Forward(phase, env); + default: + return; + } +} + +} // namespace turbomind diff --git a/src/turbomind/models/qwenvit/qwenvit.h b/src/turbomind/models/qwenvit/qwenvit.h new file mode 100644 index 0000000000..b69bf2dae9 --- /dev/null +++ b/src/turbomind/models/qwenvit/qwenvit.h @@ -0,0 +1,38 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include + +#include "src/turbomind/core/core.h" +#include "src/turbomind/engine/batch.h" +#include "src/turbomind/models/llama/context.h" +#include "src/turbomind/models/llama/llama_params.h" +#include "src/turbomind/models/vision_model.h" + +namespace turbomind { + +class QwenVitWeight; + +/// Unified ``VisionModel`` for the Qwen ViT family: Qwen2-VL / Qwen2.5-VL / Qwen3.5. +/// +/// A single config-driven implementation. The orthogonal feature toggles that +/// distinguish the families are selected from ``QwenVitConfig``: +/// - window attention (Qwen2.5): use_window_attention +/// - learned pos embedding (Qwen3.5): num_position_embeddings > 0 +/// - gated SiLU MLP (Qwen2.5): gated_mlp +/// - tanh-approx GELU MLP (Qwen3.5): gelu_tanh +/// - RMSNorm vs LayerNorm (Qwen2): norm_type +class QwenVit: public VisionModel { +public: + QwenVit(const EngineParam& engine, const Context& ctx, const QwenVitWeight& weights, int phases); + + ~QwenVit() override; + + void Run(BatchOp op, int phase, TensorMap& env) override; + +private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.cc b/src/turbomind/models/qwenvit/qwenvit_block_weight.cc similarity index 55% rename from src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.cc rename to src/turbomind/models/qwenvit/qwenvit_block_weight.cc index 96b99404cd..594693e292 100644 --- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.cc +++ b/src/turbomind/models/qwenvit/qwenvit_block_weight.cc @@ -1,6 +1,6 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h" +#include "src/turbomind/models/qwenvit/qwenvit_block_weight.h" #include "src/turbomind/core/registry.h" #include "src/turbomind/models/attention_weight.h" @@ -9,8 +9,8 @@ namespace turbomind { -TM_MODULE_REGISTER(Qwen3_5VitBlockWeight, core::Qwen3_5VitBlockConfig); +TM_MODULE_REGISTER(QwenVitBlockWeight, core::QwenVitBlockConfig); -TM_MODULE_METHODS(Qwen3_5VitBlockWeight, QWEN3_5VIT_BLOCK_CHILDREN, QWEN3_5VIT_BLOCK_PARAMS) +TM_MODULE_METHODS(QwenVitBlockWeight, QWENVIT_BLOCK_CHILDREN, QWENVIT_BLOCK_PARAMS) } // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h b/src/turbomind/models/qwenvit/qwenvit_block_weight.h similarity index 61% rename from src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h rename to src/turbomind/models/qwenvit/qwenvit_block_weight.h index 6d9c88f7e2..97e7d10084 100644 --- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h +++ b/src/turbomind/models/qwenvit/qwenvit_block_weight.h @@ -7,49 +7,45 @@ namespace turbomind::core { -/// Per-block config for the Qwen3.5 vision transformer. -/// -/// Carries the dimensions the runtime needs (hidden_dim, head_num, -/// intermediate_size). Loading itself is structural — children are -/// committed by the Python builder via ``add_child_raw``. -struct Qwen3_5VitBlockConfig: ModuleConfig { - Qwen3_5VitBlockConfig(): ModuleConfig{"Qwen3_5VitBlockWeight"} {} +/// Per-block config for the Qwen2 / Qwen2.5 / Qwen3.5 vision transformer. +struct QwenVitBlockConfig: ModuleConfig { + QwenVitBlockConfig(): ModuleConfig{"QwenVitBlockWeight"} {} -#define QWEN3_5VIT_BLOCK_FIELDS(X) \ +#define QWENVIT_BLOCK_FIELDS(X) \ X(DataType, data_type) \ X(int, hidden_dim) \ X(int, head_num) \ X(int, intermediate_size) \ X(float, norm_eps, 1e-6f) - QWEN3_5VIT_BLOCK_FIELDS(TM_MEMBER) - TM_FOR_EACH(Qwen3_5VitBlockConfig, QWEN3_5VIT_BLOCK_FIELDS) + QWENVIT_BLOCK_FIELDS(TM_MEMBER) + TM_FOR_EACH(QwenVitBlockConfig, QWENVIT_BLOCK_FIELDS) -#undef QWEN3_5VIT_BLOCK_FIELDS +#undef QWENVIT_BLOCK_FIELDS }; } // namespace turbomind::core namespace turbomind { -class LayerNormWeight; class LinearWeight; -/// One transformer block of the Qwen3.5 ViT. +/// One transformer block of the Qwen ViT (covers Qwen2 / Qwen2.5 / Qwen3.5). /// /// Children: -/// - norm1, norm2 LayerNorm (weight + bias) +/// - norm1, norm2 LayerNorm or RMSNorm (held as core::Module) /// - attention AttentionWeight (packed Q/K/V + output projection) +/// - mlp_gate Optional gated SiLU gate projection (Qwen2.5 only) /// - mlp_fc1, mlp_fc2 Linear (in: hidden ↔ intermediate) -class Qwen3_5VitBlockWeight: public core::Module { +class QwenVitBlockWeight: public core::Module { public: const char* type() const override { - return "Qwen3_5VitBlockWeight"; + return "QwenVitBlockWeight"; } - Qwen3_5VitBlockWeight() = default; - explicit Qwen3_5VitBlockWeight(const core::Qwen3_5VitBlockConfig& cfg): + QwenVitBlockWeight() = default; + explicit QwenVitBlockWeight(const core::QwenVitBlockConfig& cfg): data_type{cfg.data_type}, hidden_dim{cfg.hidden_dim}, head_num{cfg.head_num}, @@ -58,16 +54,17 @@ class Qwen3_5VitBlockWeight: public core::Module { { } -#define QWEN3_5VIT_BLOCK_CHILDREN(X) \ - X(LayerNormWeight, norm1) \ - X(LayerNormWeight, norm2) \ +#define QWENVIT_BLOCK_CHILDREN(X) \ + X(core::Module, norm1) \ + X(core::Module, norm2) \ X(AttentionWeight, attention) \ + X(LinearWeight, mlp_gate) \ X(LinearWeight, mlp_fc1) \ X(LinearWeight, mlp_fc2) -#define QWEN3_5VIT_BLOCK_PARAMS(X) +#define QWENVIT_BLOCK_PARAMS(X) - TM_MODULE_DECLARE(Qwen3_5VitBlockWeight, QWEN3_5VIT_BLOCK_CHILDREN, QWEN3_5VIT_BLOCK_PARAMS) + TM_MODULE_DECLARE(QwenVitBlockWeight, QWENVIT_BLOCK_CHILDREN, QWENVIT_BLOCK_PARAMS) // --- Public scalars --- DataType data_type{}; diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h b/src/turbomind/models/qwenvit/qwenvit_input.h similarity index 58% rename from src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h rename to src/turbomind/models/qwenvit/qwenvit_input.h index 122bd49524..735ae74aff 100644 --- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h +++ b/src/turbomind/models/qwenvit/qwenvit_input.h @@ -12,27 +12,28 @@ namespace turbomind { namespace multimodal { -struct Qwen3_5VitItem { +// Unified multimodal input for the Qwen2-VL / Qwen2.5-VL / Qwen3.5 ViT encoders. +struct QwenVitItem { Modality modality; Tensor data; int token_begin; int token_end; std::array grid_thw; - Qwen3_5VitItem() = default; + QwenVitItem() = default; - Qwen3_5VitItem(Modality modality, Tensor data, int token_begin, int token_end, std::array grid_thw): + QwenVitItem(Modality modality, Tensor data, int token_begin, int token_end, std::array grid_thw): modality{modality}, data{std::move(data)}, token_begin{token_begin}, token_end{token_end}, grid_thw{grid_thw} { } }; -struct Qwen3_5VitInput final: Input { - std::vector items; +struct QwenVitInput final: Input { + std::vector items; - Qwen3_5VitInput() = default; + QwenVitInput() = default; - explicit Qwen3_5VitInput(std::vector items): items{std::move(items)} {} + explicit QwenVitInput(std::vector items): items{std::move(items)} {} }; } // namespace multimodal diff --git a/src/turbomind/models/qwenvit/qwenvit_kernels.cu b/src/turbomind/models/qwenvit/qwenvit_kernels.cu new file mode 100644 index 0000000000..2309dd428d --- /dev/null +++ b/src/turbomind/models/qwenvit/qwenvit_kernels.cu @@ -0,0 +1,827 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// +// Merged CUDA kernels for the unified Qwen ViT (Qwen2-VL / Qwen2.5-VL / Qwen3.5). +// Sections, in order: +// 1. QKV preprocessing (bias + RoPE fuse) — all variants +// 2. Spatial-merge index mapping — all variants +// 3. Learned pos-embed bilinear interpolation — Qwen3.5 +// 4. 2D rotary position-embedding table — all variants +// 5. mrope position ids — all variants +// 6. Window attention reordering — Qwen2.5 + +#include "src/turbomind/models/qwenvit/qwenvit_kernels.h" + +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/logger.h" +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/kernels/core/common.h" +#include "src/turbomind/utils/cuda_utils.h" + +#include +#include + +#include + +namespace turbomind { + +namespace { + +// `num_grids` is tiny (usually 1..a few) so a linear scan is fine. +// Shared by the pos-embed interpolation and rotary-embedding kernels. +__device__ inline int find_grid(const int* offsets, int num_grids, int pos) +{ + int g = 0; + for (int i = 1; i < num_grids; ++i) { + if (offsets[i * 2 + 1] <= pos) { + g = i; + } + else { + break; + } + } + return g; +} + +// ------------------------------------------------------------------------------------ +// 1. QKV preprocessing +// ------------------------------------------------------------------------------------ + +constexpr int kWarpsPerBlock = 4; + +// Per-head_dim launch traits. Adding a new head_dim is a single specialization here. +template +struct HeadConfig; + +template<> +struct HeadConfig<64> { + static constexpr int kVecSize = 8; + static constexpr int kHeadsPerWarp = 4; +}; + +template<> +struct HeadConfig<72> { + static constexpr int kVecSize = 8; + static constexpr int kHeadsPerWarp = 3; +}; + +template<> +struct HeadConfig<128> { + static constexpr int kVecSize = 8; // 128 / 8 = 16 vec/head + static constexpr int kHeadsPerWarp = 2; // 16 * 2 = 32 == WARP_SIZE +}; + +template +__device__ __forceinline__ void add_bias(Array& x, const T* bias) +{ + Array b; + Ldg(b, bias); + using namespace ops; + x = x + cast(b); +} + +// Rotate adjacent (x[2k], x[2k+1]) pairs using cos/sin packed as +// rope[2k]=cos, rope[2k+1]=sin (see fastRotaryPosEmbKernel below). +template +__device__ __forceinline__ void apply_rope_pair(Array& x, const Array& rope) +{ + auto cs = cast(rope); + PRAGMA_UNROLL + for (int i = 0; i < VecSize; i += 2) { + const float x0 = x[i]; + const float x1 = x[i + 1]; + const float c = cs[i]; + const float s = cs[i + 1]; + x[i] = c * x0 - s * x1; + x[i + 1] = c * x1 + s * x0; + } +} + +// Load `src`, add `bias`, apply RoPE, store to `dst` (may equal `src` for in-place). +template +__device__ __forceinline__ void fuse_bias_rope_store(T* dst, const T* src, const T* bias, const Array& rope) +{ + Array x_vec; + Load(x_vec, src); + auto x = cast(x_vec); + add_bias(x, bias); + apply_rope_pair(x, rope); + Store(dst, cast(x)); +} + +// V-path: bias only, no RoPE. +template +__device__ __forceinline__ void fuse_bias_store(T* dst, const T* src, const T* bias) +{ + Array x_vec; + Load(x_vec, src); + auto x = cast(x_vec); + add_bias(x, bias); + Store(dst, cast(x)); +} + +template +__global__ __launch_bounds__(kWarpsPerBlock* WARP_SIZE) void prepareQKVKernel(T* __restrict__ qkv, + T* __restrict__ kv, + const T* __restrict__ bias, + const T* __restrict__ rotary_pos_emb, + const int* __restrict__ mapped_idx, + int token_num, + int local_head_num, + int head_group_num, + int rope_head_dim) +{ + using Cfg = HeadConfig; + constexpr int kVecSize = Cfg::kVecSize; + constexpr int kHeadsPerWarp = Cfg::kHeadsPerWarp; + constexpr int kVecPerHead = HD / kVecSize; + static_assert(HD % kVecSize == 0); + static_assert(kVecPerHead * kHeadsPerWarp <= WARP_SIZE); + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x - warp_id * WARP_SIZE; + const int head_slot = lane / kVecPerHead; + if (head_slot >= kHeadsPerWarp) { + return; + } + + const int global_warp = blockIdx.x * kWarpsPerBlock + warp_id; + const int total_warps = token_num * head_group_num; + if (global_warp >= total_warps) { + return; + } + + const int token_idx = global_warp / head_group_num; + const int head_group = global_warp - token_idx * head_group_num; + const int head_idx = head_group * kHeadsPerWarp + head_slot; + if (head_idx >= local_head_num) { + return; + } + + const int vec_idx = lane - head_slot * kVecPerHead; + const int di = vec_idx * kVecSize; + + // QKV per-token layout: [Q_heads | K_heads | V_heads], head_num == kv_head_num for ViT. + const int64_t qkv_stride = (int64_t)local_head_num * 3 * HD; + T* const q_ptr = qkv + (int64_t)token_idx * qkv_stride + head_idx * HD + di; + const T* k_ptr = q_ptr + (int64_t)local_head_num * HD; + const T* v_ptr = k_ptr + (int64_t)local_head_num * HD; + + const T* q_bias = bias + head_idx * HD + di; + const T* k_bias = q_bias + local_head_num * HD; + const T* v_bias = k_bias + local_head_num * HD; + + // K/V destination in transposed [kv_head, 2, token, head_dim] layout. + T* const k_dst = kv + ((int64_t)head_idx * 2 * token_num + token_idx) * HD + di; + T* const v_dst = k_dst + (int64_t)token_num * HD; + + // rope[token, di] is shared between Q and K — load once, reuse twice. + // When HD > rope_head_dim, padded di-slices have zero Q/K, so loading a + // zero rope_vec there is correct (and avoids OOB on the [N, rope_head_dim] + // buffer). kVecSize is aligned to rope_head_dim so each vec is fully in or + // fully out of the rope range. + Array rope_vec{}; + if (di < rope_head_dim) { + Ldg(rope_vec, rotary_pos_emb + (int64_t)mapped_idx[token_idx] * rope_head_dim + di); + } + + fuse_bias_rope_store(q_ptr, q_ptr, q_bias, rope_vec); // Q: in-place + fuse_bias_rope_store(k_dst, k_ptr, k_bias, rope_vec); // K: transposed + fuse_bias_store(v_dst, v_ptr, v_bias); // V: transposed, no RoPE +} + +template +void dispatchPrepareQKV(T* qkv, + T* kv, + const T* qkv_bias, + const T* rotary_pos_emb, + const int* mapped_idx, + int token_num, + int local_head_num, + int head_dim, + int rope_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto hd_c) { + constexpr int HD = decltype(hd_c)::value; + using Cfg = HeadConfig; + + // Each vec_size-wide load must lie entirely in or out of the rope range. + TM_CHECK(rope_head_dim % Cfg::kVecSize == 0) + << "rope_head_dim (" << rope_head_dim << ") must be a multiple of kVecSize (" << Cfg::kVecSize << ")"; + TM_CHECK(rope_head_dim <= HD) << "rope_head_dim (" << rope_head_dim << ") cannot exceed head_dim (" << HD + << ")"; + + const int head_group_num = (local_head_num + Cfg::kHeadsPerWarp - 1) / Cfg::kHeadsPerWarp; + const int total_warps = token_num * head_group_num; + dim3 grid((total_warps + kWarpsPerBlock - 1) / kWarpsPerBlock); + prepareQKVKernel<<>>( + qkv, kv, qkv_bias, rotary_pos_emb, mapped_idx, token_num, local_head_num, head_group_num, rope_head_dim); + }; + + switch (head_dim) { + case 64: + return invoke(std::integral_constant{}); + case 72: + return invoke(std::integral_constant{}); + case 128: + return invoke(std::integral_constant{}); + default: + TM_LOG_FATAL("unsupported Qwen ViT head_dim for qkv preprocess: {}", head_dim); + } +} + +// ------------------------------------------------------------------------------------ +// 2. Spatial-merge index mapping +// ------------------------------------------------------------------------------------ + +__global__ void buildMappedIdxKernel(int* mapped_idx, int token_offset, int natural_offset, int t, int h, int w, int S) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int total = t * h * w; + if (idx >= total) { + return; + } + + const int hw = h * w; + const int merge_unit = S * S; + const int local = idx % hw; + const int group = local / merge_unit; + const int inner = local - group * merge_unit; + const int group_cols = w / S; + const int h_outer = group / group_cols; + const int w_outer = group - h_outer * group_cols; + const int h_inner = inner / S; + const int w_inner = inner - h_inner * S; + const int natural_idx = (h_outer * S + h_inner) * w + (w_outer * S + w_inner); + mapped_idx[token_offset + idx] = natural_offset + natural_idx; +} + +__global__ void +buildMappedIdxBatchedKernel(int* mapped_idx, const int* grid_thws, const int* grid_offsets, int num_grids, int S) +{ + const int grid_id = blockIdx.x; + if (grid_id >= num_grids) { + return; + } + + const int t = grid_thws[grid_id * 3]; + const int h = grid_thws[grid_id * 3 + 1]; + const int w = grid_thws[grid_id * 3 + 2]; + const int token_offset = grid_offsets[grid_id * 2]; + const int natural_offset = grid_offsets[grid_id * 2 + 1]; + const int total = t * h * w; + const int hw = h * w; + const int merge_unit = S * S; + const int group_cols = w / S; + + for (int idx = threadIdx.x; idx < total; idx += blockDim.x) { + const int local = idx % hw; + const int group = local / merge_unit; + const int inner = local - group * merge_unit; + const int h_outer = group / group_cols; + const int w_outer = group - h_outer * group_cols; + const int h_inner = inner / S; + const int w_inner = inner - h_inner * S; + const int natural_idx = (h_outer * S + h_inner) * w + (w_outer * S + w_inner); + mapped_idx[token_offset + idx] = natural_offset + natural_idx; + } +} + +// ------------------------------------------------------------------------------------ +// 3. Learned pos-embed bilinear interpolation (Qwen3.5) +// ------------------------------------------------------------------------------------ + +template +__device__ inline T from_float(float x); + +template<> +__device__ inline half from_float(float x) +{ + return __float2half(x); +} + +#ifdef ENABLE_BF16 +template<> +__device__ inline __nv_bfloat16 from_float<__nv_bfloat16>(float x) +{ + return __float2bfloat16(x); +} +#endif + +template +__global__ void fastPosEmbedIdxWeightKernel( + int* idx_out, T* weight_out, const int* grid_thws, const int* grid_offsets, int num_grids, int total_n, int G) +{ + const int pos = blockIdx.x * blockDim.x + threadIdx.x; + if (pos >= total_n) { + return; + } + + const int g = find_grid(grid_offsets, num_grids, pos); + const int grid_h = grid_thws[g * 3 + 1]; + const int grid_w = grid_thws[g * 3 + 2]; + const int local = pos - grid_offsets[g * 2 + 1]; + const int i = local / grid_w; + const int j = local % grid_w; + + // torch.linspace(0, G-1, n) uses the halfway-symmetric formulation so + // that both endpoints are exact: + // step = (end - start) / (n - 1) + // halfway = n / 2 + // out[i=hw]= end - step * (n - 1 - i) + // For n == 1 the single element is `start` (== 0 here); the formula + // below collapses to 0 since hw_h == 0 is bypassed via grid_h > 1. + const float end = (float)(G - 1); + const float step_h = (grid_h > 1) ? end / (float)(grid_h - 1) : 0.f; + const float step_w = (grid_w > 1) ? end / (float)(grid_w - 1) : 0.f; + + const int hw_h = grid_h / 2; + const int hw_w = grid_w / 2; + + const float h_val = (grid_h == 1) ? 0.f : ((i < hw_h) ? step_h * (float)i : end - step_h * (float)(grid_h - 1 - i)); + const float w_val = (grid_w == 1) ? 0.f : ((j < hw_w) ? step_w * (float)j : end - step_w * (float)(grid_w - 1 - j)); + + // torch.Tensor.int() truncates toward zero; h_val, w_val are non-negative + // and bounded above by G-1, so (int) cast is in [0, G-1]. + const int h_floor = (int)h_val; + const int w_floor = (int)w_val; + const int h_ceil = min(h_floor + 1, G - 1); + const int w_ceil = min(w_floor + 1, G - 1); + + const float dh = h_val - (float)h_floor; + const float dw = w_val - (float)w_floor; + + const int base_h = h_floor * G; + const int base_h_ceil = h_ceil * G; + + Array idx; + idx[0] = base_h + w_floor; + idx[1] = base_h + w_ceil; + idx[2] = base_h_ceil + w_floor; + idx[3] = base_h_ceil + w_ceil; + + Array weight; + weight[0] = from_float((1.f - dh) * (1.f - dw)); + weight[1] = from_float((1.f - dh) * dw); + weight[2] = from_float(dh * (1.f - dw)); + weight[3] = from_float(dh * dw); + + const int out_base = pos * 4; + Store(idx_out + out_base, idx); + Store(weight_out + out_base, weight); +} + +template +__device__ Array roundToStorageDtype(Array x) +{ + return cast(cast(x)); +} + +template +__global__ void fusedPosEmbedMergeKernel(T* hidden_states, + const T* pos_embeds, + const T* pos_embed_weights, + const int* mapped_idx, + const T* bias, + int hidden, + int vdim) +{ + const int index = blockIdx.x; + const int mapped = mapped_idx[index]; // same address for all threads in block -> L1 broadcast + + Array w4; + Ldg(w4, pos_embed_weights + mapped * 4); + + const int row_off = index * hidden; + const int pe_row0 = mapped * 4 * hidden; + + using namespace ops; + for (int d = threadIdx.x; d < vdim; d += blockDim.x) { + Array pos{}; + Array tmp; + Load(tmp, hidden_states + row_off + d * vec_size); + auto hidden_acc = cast(tmp); + + if (bias) { + Ldg(tmp, bias + d * vec_size); + hidden_acc = roundToStorageDtype(hidden_acc + cast(tmp)); + } + PRAGMA_UNROLL + for (int k = 0; k < 4; ++k) { + Ldg(tmp, pos_embeds + pe_row0 + k * hidden + d * vec_size); + pos = pos + cast(tmp * w4[k]); + } + const auto out = hidden_acc + roundToStorageDtype(pos); + Store(hidden_states + row_off + d * vec_size, cast(out)); + } +} + +// ------------------------------------------------------------------------------------ +// 4. 2D rotary position-embedding table +// ------------------------------------------------------------------------------------ + +template +__global__ void fastRotaryPosEmbKernel(T* cos_sin_out, + const int* grid_thws, + const int* grid_offsets, + int num_grids, + int total_hw, + int head_dim, + float scale) // -log2(theta) / (head_dim/4) +{ + const int pair_count = head_dim / 2; // e.g. 36 + const int freq_half = head_dim / 4; // e.g. 18 + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int pos = tid / pair_count; + const int pair_k = tid % pair_count; + if (pos >= total_hw) { + return; + } + + const int g = find_grid(grid_offsets, num_grids, pos); + const int grid_w = grid_thws[g * 3 + 2]; + const int local = pos - grid_offsets[g * 2 + 1]; + const int i = local / grid_w; // h_coord + const int j = local % grid_w; // w_coord + + // Pairs [0, freq_half) rotate in h; pairs [freq_half, 2*freq_half) rotate in w. + const int freq_idx = pair_k % freq_half; + const int coord = (pair_k < freq_half) ? i : j; + const float inv_freq = exp2f((float)freq_idx * scale); + + float c, s; + sincosf((float)coord * inv_freq, &s, &c); + + Array cs{(T)c, (T)s}; + Store(cos_sin_out + (size_t)pos * head_dim + pair_k * 2, cs); +} + +// ------------------------------------------------------------------------------------ +// 5. mrope position ids +// ------------------------------------------------------------------------------------ + +constexpr int kMropeBlock = 128; + +__global__ void mropeScatterKernel(int* pos_ids, const MropeSegment* __restrict__ segs) +{ + const MropeSegment s = segs[blockIdx.x]; + const int local_k = blockIdx.y * blockDim.x + threadIdx.x; + if (local_k >= s.n_tok) { + return; + } + int* dst = pos_ids + 3 * (s.dst_offset + local_k); + if (s.h2 == 0) { // text run + const int p = s.base_pos + local_k; + dst[0] = p; + dst[1] = p; + dst[2] = p; + } + else { // image run - grid math uses the original (un-clipped) k + const int k = s.k_offset + local_k; + const int hw = s.h2 * s.w2; + dst[0] = s.base_pos + k / hw; + dst[1] = s.base_pos + (k / s.w2) % s.h2; + dst[2] = s.base_pos + k % s.w2; + } +} + +// ------------------------------------------------------------------------------------ +// 6. Window attention reordering (Qwen2.5) +// ------------------------------------------------------------------------------------ + +template +__global__ void windowReorderKernel(T* out, + const T* in, + const int* window_idx, + int64_t out_stride, + int64_t in_stride, + int merge_unit, + int group_count, + int dim) +{ + const int dst_group = blockIdx.x; + const int inner = blockIdx.y; + const int di = (threadIdx.x + blockIdx.z * blockDim.x) * vec_size; + if (di >= dim) { + return; + } + + const int src_group = window_idx[dst_group]; + using Vec = Array; + Vec x; + Load(x, in + ((int64_t)src_group * merge_unit + inner) * in_stride + di); + Store(out + ((int64_t)dst_group * merge_unit + inner) * out_stride + di, x); +} + +template +__global__ void reverseWindowKernel( + T* out, const T* in, const int* window_idx, int64_t out_stride, int64_t in_stride, int group_count, int dim) +{ + const int src_group = blockIdx.x; + const int di = (threadIdx.x + blockIdx.y * blockDim.x) * vec_size; + if (di >= dim) { + return; + } + + const int dst_group = window_idx[src_group]; + using Vec = Array; + Vec x; + Load(x, in + (int64_t)src_group * in_stride + di); + Store(out + (int64_t)dst_group * out_stride + di, x); +} + +__global__ void buildWindowMappedIdxKernel( + int* window_mapped_idx, const int* mapped_idx, const int* window_idx, int merge_unit, int total) +{ + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total) { + return; + } + + const int dst_group = idx / merge_unit; + const int inner = idx - dst_group * merge_unit; + const int src_group = window_idx[dst_group]; + window_mapped_idx[idx] = mapped_idx[src_group * merge_unit + inner]; +} + +} // namespace + +// ===================================================================================== +// Public entry points +// ===================================================================================== + +void invokeQwenVitPrepareQKV(void* qkv, + void* kv, + const void* qkv_bias, + const void* rotary_pos_emb, + const int* mapped_idx, + DataType dtype, + int token_num, + int local_head_num, + int head_dim, + int rope_head_dim, + cudaStream_t stream) +{ + if (token_num == 0) { + return; + } + + auto invoke = [&](auto t) { + using T = decltype(t); + dispatchPrepareQKV((T*)qkv, + (T*)kv, + (const T*)qkv_bias, + (const T*)rotary_pos_emb, + mapped_idx, + token_num, + local_head_num, + head_dim, + rope_head_dim, + stream); + }; + + TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); +} + +void invokeQwenVitBuildMappedIdx(int* mapped_idx, + int token_offset, + int natural_offset, + int t, + int h, + int w, + int spatial_merge_size, + cudaStream_t stream) +{ + if (t * h * w == 0) { + return; + } + + const int total = t * h * w; + const int threads = 256; + buildMappedIdxKernel<<<(total + threads - 1) / threads, threads, 0, stream>>>( + mapped_idx, token_offset, natural_offset, t, h, w, spatial_merge_size); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeQwenVitBuildMappedIdx(int* mapped_idx, + const int* grid_thws, + const int* grid_offsets, + int num_grids, + int spatial_merge_size, + cudaStream_t stream) +{ + if (num_grids == 0) { + return; + } + + const int threads = 256; + buildMappedIdxBatchedKernel<<>>( + mapped_idx, grid_thws, grid_offsets, num_grids, spatial_merge_size); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeFastPosEmbedIdxWeight(int* idx_out, + void* weight_out, + DataType dtype, + const int* grid_thws, + const int* grid_offsets, + int num_grids, + int total_n, + int num_grid_per_side, + cudaStream_t stream) +{ + if (total_n <= 0 || num_grids <= 0) { + return; + } + const int block = 256; + const int grid = (total_n + block - 1) / block; + + auto invoke = [&](auto t) { + using T = decltype(t); + fastPosEmbedIdxWeightKernel<<>>( + idx_out, (T*)weight_out, grid_thws, grid_offsets, num_grids, total_n, num_grid_per_side); + }; + TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); +} + +void invokeFusedPosEmbedMerge(void* hidden_states, + const void* pos_embeds, + const void* pos_embed_weights, + const int* mapped_idx, + const void* bias, + int batch, + int hidden, + DataType dtype, + cudaStream_t stream) +{ + if (batch <= 0) { + return; + } + + const dim3 grid(batch); + const dim3 block(128); + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int vec_size = sizeof(uint4) / sizeof(T); + TM_CHECK(hidden % vec_size == 0); + fusedPosEmbedMergeKernel<<>>((T*)hidden_states, + (const T*)pos_embeds, + (const T*)pos_embed_weights, + mapped_idx, + (const T*)bias, + hidden, + hidden / vec_size); + }; + TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); +} + +void invokeQwenVitRotaryPosEmb(void* cos_sin, + DataType dtype, + const int* grid_thws, + const int* grid_offsets, + int num_grids, + int total_hw, + int head_dim, + float theta, + cudaStream_t stream) +{ + if (total_hw <= 0 || num_grids <= 0 || head_dim <= 0) { + return; + } + TM_CHECK(head_dim % 4 == 0) << "head_dim must be divisible by 4, got " << head_dim; + + const int total = total_hw * (head_dim / 2); + const int block = 256; + const int grid = (total + block - 1) / block; + const float scale = -log2f(theta) / (float)(head_dim / 4); + + auto invoke = [&](auto t) { + using T = decltype(t); + fastRotaryPosEmbKernel + <<>>((T*)cos_sin, grid_thws, grid_offsets, num_grids, total_hw, head_dim, scale); + }; + TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); +} + +void invokeMropePositionIds( + int* pos_ids, const MropeSegment* segments, int num_segments, int max_seg_len, cudaStream_t stream) +{ + if (num_segments <= 0 || max_seg_len <= 0) { + return; + } + const int tiles = (max_seg_len + kMropeBlock - 1) / kMropeBlock; + const dim3 grid((unsigned)num_segments, (unsigned)tiles); + mropeScatterKernel<<>>(pos_ids, segments); +} + +void invokeQwenVitWindowReorder( + Tensor& out, const Tensor& in, const int* window_idx, int merge_unit, int group_count, cudaStream_t stream) +{ + if (group_count == 0) { + return; + } + + const int dim = in.shape(1); + const int threads = 256; + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int max_vec = sizeof(uint4) / sizeof(T); + + int best_vec_size = 1; + for (int v = max_vec; v >= 1; v >>= 1) { + if (dim % v == 0 && in.stride(0) % v == 0 && out.stride(0) % v == 0) { + best_vec_size = v; + break; + } + } + + auto launch = [&](auto vec_size_c) { + constexpr int vec_size = decltype(vec_size_c)::value; + const dim3 grid(group_count, merge_unit, cdiv(dim, threads * vec_size)); + windowReorderKernel<<>>( + out.data(), in.data(), window_idx, out.stride(0), in.stride(0), merge_unit, group_count, dim); + }; + + switch (best_vec_size) { + case 8: + return launch(std::integral_constant{}); + case 4: + return launch(std::integral_constant{}); + case 2: + return launch(std::integral_constant{}); + default: + return launch(std::integral_constant{}); + } + }; + TM_DISPATCH_PRIMARY_DTYPES(in.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeQwenVitReverseWindow( + Tensor& out, const Tensor& in, const int* window_idx, int group_count, cudaStream_t stream) +{ + if (group_count == 0) { + return; + } + + const int dim = in.shape(1); + const int threads = 256; + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int max_vec = sizeof(uint4) / sizeof(T); + + int best_vec_size = 1; + for (int v = max_vec; v >= 1; v >>= 1) { + if (dim % v == 0 && in.stride(0) % v == 0 && out.stride(0) % v == 0) { + best_vec_size = v; + break; + } + } + + auto launch = [&](auto vec_size_c) { + constexpr int vec_size = decltype(vec_size_c)::value; + const dim3 grid(group_count, cdiv(dim, threads * vec_size)); + reverseWindowKernel<<>>( + out.data(), in.data(), window_idx, out.stride(0), in.stride(0), group_count, dim); + }; + + switch (best_vec_size) { + case 8: + return launch(std::integral_constant{}); + case 4: + return launch(std::integral_constant{}); + case 2: + return launch(std::integral_constant{}); + default: + return launch(std::integral_constant{}); + } + }; + TM_DISPATCH_PRIMARY_DTYPES(in.dtype(), invoke); + TM_CUDA_CHECK(cudaGetLastError()); +} + +void invokeQwenVitBuildWindowMappedIdx(int* window_mapped_idx, + const int* mapped_idx, + const int* window_idx, + int merge_unit, + int group_count, + cudaStream_t stream) +{ + if (group_count == 0) { + return; + } + + const int total = group_count * merge_unit; + const int threads = 256; + buildWindowMappedIdxKernel<<>>( + window_mapped_idx, mapped_idx, window_idx, merge_unit, total); + TM_CUDA_CHECK(cudaGetLastError()); +} + +} // namespace turbomind diff --git a/src/turbomind/models/qwenvit/qwenvit_kernels.h b/src/turbomind/models/qwenvit/qwenvit_kernels.h new file mode 100644 index 0000000000..29cf4d78da --- /dev/null +++ b/src/turbomind/models/qwenvit/qwenvit_kernels.h @@ -0,0 +1,148 @@ +// Copyright (c) OpenMMLab. All rights reserved. + +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/data_type.h" + +#include + +namespace turbomind { + +// ===================================================================================== +// QKV preprocessing (qkv_preprocess) +// ===================================================================================== + +// Prepare the Qwen ViT attention inputs after the fused QKV projection. +// +// qkv layout: +// [token, local_q_heads + 2 * local_kv_heads, head_dim] +// Q is updated in place with bias + RoPE. K/V are written to `kv` as: +// [local_kv_heads, 2, token, head_dim] +// +// `rope_head_dim` is the per-head dim of the rotary_pos_emb buffer and is +// also the cutoff below which RoPE is applied. When the model's real head_dim +// is not natively supported by the attention kernel, Q/K/V are zero-padded +// per-head to a kernel-supported `head_dim` >= `rope_head_dim`; the padded +// `[rope_head_dim, head_dim)` slice has zero Q/K so RoPE is skipped there. +void invokeQwenVitPrepareQKV(void* qkv, + void* kv, + const void* qkv_bias, + const void* rotary_pos_emb, + const int* mapped_idx, + DataType dtype, + int token_num, + int local_head_num, + int head_dim, + int rope_head_dim, + cudaStream_t stream); + +// ===================================================================================== +// Spatial-merge index mapping (grid_mapping) +// ===================================================================================== + +void invokeQwenVitBuildMappedIdx(int* mapped_idx, + int token_offset, + int natural_offset, + int t, + int h, + int w, + int spatial_merge_size, + cudaStream_t stream); + +void invokeQwenVitBuildMappedIdx(int* mapped_idx, + const int* grid_thws, + const int* grid_offsets, + int num_grids, + int spatial_merge_size, + cudaStream_t stream); + +// ===================================================================================== +// Learned positional-embedding bilinear interpolation (fast_pos_embed) — Qwen3.5 +// ===================================================================================== + +// Precomputes the 4 bilinear-interpolation gather indices and weights +// used by the subsequent pos-embed merge step in Qwen3-VL. +void invokeFastPosEmbedIdxWeight(int* idx_out, // [total_n * 4] + void* weight_out, // [total_n * 4] + DataType dtype, + const int* grid_thws, // [num_grids * 3], (t, h, w) + const int* grid_offsets, // [num_grids * 2], (t*h*w, h*w) + int num_grids, + int total_n, + int num_grid_per_side, + cudaStream_t stream); + +// Fuses the spatial-merge permutation, the bilinear-weighted sum, and the +// t-expansion of Qwen3-VL ViT pos_embed interpolation into a single pass on +// top of the patch_embed linear output. (fused_embed_merge) +void invokeFusedPosEmbedMerge(void* hidden_states, // [batch, hidden] + const void* pos_embeds, // [total_hw * 4, hidden] + const void* pos_embed_weights, // [total_hw * 4] + const int* mapped_idx, // [batch] + const void* bias, // [hidden] or nullptr + int batch, + int hidden, + DataType dtype, + cudaStream_t stream); + +// ===================================================================================== +// 2D rotary position embedding table (fast_rotary_pos_emb) +// ===================================================================================== + +// Precomputes the (cos, sin) rotary-embedding table for Qwen-VL vision tokens. +// Layout per natural flat position (keyed by the same index `mapped_idx` carries): +// [c_0, s_0, c_1, s_1, ..., c_{head_dim/2-1}, s_{head_dim/2-1}] +// The pair index `k` uses `h_coord` for k < head_dim/4 and `w_coord` otherwise, +// with inv_freq = theta^(-2*(k % (head_dim/4)) / (head_dim/2)). +void invokeQwenVitRotaryPosEmb(void* cos_sin, // [total_hw, head_dim] + DataType dtype, + const int* grid_thws, // [num_grids * 3], (t, h, w) + const int* grid_offsets, // [num_grids * 2], (t*h*w, h*w) + int num_grids, + int total_hw, + int head_dim, + float theta, + cudaStream_t stream); + +// ===================================================================================== +// mrope position ids (mrope_position_ids) +// ===================================================================================== + +// One descriptor per text / image run, clipped to a prefill chunk's active window. +// `h2 == 0` flags a text run (real image grids always have h2 > 0). +struct MropeSegment { + int dst_offset; // flat forward-token index of the first token written by this segment + int n_tok; // tokens to write (already clipped to the active range) + int base_pos; // text: position id at local_k = 0; image: image's mm_start + int h2; // image grid h after spatial merge (0 => text) + int w2; // image grid w after spatial merge (ignored when h2 == 0) + int k_offset; // starting "k" for image grid math (clip-offset within the run); unused for text +}; + +// Scatter `num_segments` segments into `pos_ids` of shape (max_fwd_tokens, 3). +// `pos_ids` may be null when num_segments == 0. +void invokeMropePositionIds(int* pos_ids, + const MropeSegment* segments, // device + int num_segments, + int max_seg_len, + cudaStream_t stream); + +// ===================================================================================== +// Window attention reordering (window kernels) — Qwen2.5 +// ===================================================================================== + +void invokeQwenVitWindowReorder( + Tensor& out, const Tensor& in, const int* window_idx, int merge_unit, int group_count, cudaStream_t stream); + +void invokeQwenVitReverseWindow( + Tensor& out, const Tensor& in, const int* window_idx, int group_count, cudaStream_t stream); + +void invokeQwenVitBuildWindowMappedIdx(int* window_mapped_idx, + const int* mapped_idx, + const int* window_idx, + int merge_unit, + int group_count, + cudaStream_t stream); + +} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.cc b/src/turbomind/models/qwenvit/qwenvit_weight.cc similarity index 57% rename from src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.cc rename to src/turbomind/models/qwenvit/qwenvit_weight.cc index ff55324f1d..14caf479cb 100644 --- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.cc +++ b/src/turbomind/models/qwenvit/qwenvit_weight.cc @@ -1,36 +1,38 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h" +#include "src/turbomind/models/qwenvit/qwenvit_weight.h" #include "src/turbomind/core/registry.h" #include "src/turbomind/models/layer_norm_weight.h" #include "src/turbomind/models/linear_weight.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h" +#include "src/turbomind/models/qwenvit/qwenvit_block_weight.h" #include "src/turbomind/utils/memory_utils.h" namespace turbomind { -Qwen3_5VitWeight::Qwen3_5VitWeight(const core::Qwen3_5VitConfig& cfg): config_{cfg} {} +QwenVitWeight::QwenVitWeight(const core::QwenVitConfig& cfg): config_{cfg} {} -void Qwen3_5VitWeight::prepare() +void QwenVitWeight::prepare() { for_each_child([](const char* /*name*/, Module* child) { if (child) child->prepare(); }); + // Qwen3.5 carries a learned positional-embedding table; keep it in the ViT dtype. if (pos_embed) { EnsureFloatDtype(pos_embed, config_.data_type); } } -bool Qwen3_5VitWeight::verify(std::vector& missing) +bool QwenVitWeight::verify(std::vector& missing) { Module::verify(missing); if (!patch_embed) { missing.push_back(full_path() + ": missing patch_embed"); } - if (!pos_embed) { + // pos_embed only exists for models with a learned position-embedding table (Qwen3.5). + if (config_.num_position_embeddings > 0 && !pos_embed) { missing.push_back(full_path() + ": missing pos_embed"); } if (!blocks || blocks->size() != config_.depth) { @@ -42,16 +44,16 @@ bool Qwen3_5VitWeight::verify(std::vector& missing) return missing.empty(); } -Qwen3_5VitBlockWeight* Qwen3_5VitWeight::block(int i) const +QwenVitBlockWeight* QwenVitWeight::block(int i) const { if (!blocks) { return nullptr; } - return static_cast(blocks->child(std::to_string(i))); + return static_cast(blocks->child(std::to_string(i))); } -TM_MODULE_REGISTER(Qwen3_5VitWeight, core::Qwen3_5VitConfig); +TM_MODULE_REGISTER(QwenVitWeight, core::QwenVitConfig); -TM_MODULE_METHODS(Qwen3_5VitWeight, QWEN3_5VIT_WEIGHT_CHILDREN, QWEN3_5VIT_WEIGHT_PARAMS) +TM_MODULE_METHODS(QwenVitWeight, QWENVIT_WEIGHT_CHILDREN, QWENVIT_WEIGHT_PARAMS) } // namespace turbomind diff --git a/src/turbomind/models/qwenvit/qwenvit_weight.h b/src/turbomind/models/qwenvit/qwenvit_weight.h new file mode 100644 index 0000000000..5fc0ca9ffa --- /dev/null +++ b/src/turbomind/models/qwenvit/qwenvit_weight.h @@ -0,0 +1,132 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#pragma once + +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" +#include "src/turbomind/kernels/norm/norm.h" +#include "src/turbomind/models/vision_model_weight.h" + +#include + +namespace turbomind::core { + +/// Root config for the Qwen ViT family (Qwen2-VL / Qwen2.5-VL / Qwen3.5). +/// +/// Carries every structural scalar the C++ runtime needs to allocate kernels. +/// The feature set of the two model families is the union of orthogonal toggles: +/// - window attention (Qwen2.5): use_window_attention / window_size / fullatt_block_indexes +/// - learned pos embedding (Qwen3.5): num_position_embeddings > 0 (+ pos_embed weight) +/// - gated SiLU MLP (Qwen2.5): gated_mlp +/// - tanh-approx GELU MLP (Qwen3.5): gelu_tanh +/// - RMSNorm vs LayerNorm (Qwen2): norm_type +/// +/// Each field is visited by the X-macro below so pybind11 exposes it as a +/// read/write attribute on the Python ``QwenVitConfig``. +struct QwenVitConfig: ModuleConfig { + QwenVitConfig(): ModuleConfig{"QwenVitWeight"} {} + + DataType data_type{}; + int hidden_dim{0}; + int out_hidden_dim{0}; + int depth{0}; + int head_num{0}; + int intermediate_size{0}; + int patch_in_dim{0}; + int in_channels{0}; + int patch_size{0}; + int temporal_patch_size{0}; + int spatial_merge_size{0}; + int num_position_embeddings{0}; + int window_size{0}; + bool gated_mlp{false}; + bool use_window_attention{false}; + bool gelu_tanh{false}; + NormType norm_type{NormType::kLayerNorm}; + float norm_eps{1e-6f}; + std::vector fullatt_block_indexes; + +#define QWENVIT_FIELDS(X) \ + X(DataType, data_type) \ + X(int, hidden_dim) \ + X(int, out_hidden_dim) \ + X(int, depth) \ + X(int, head_num) \ + X(int, intermediate_size) \ + X(int, patch_in_dim) \ + X(int, in_channels) \ + X(int, patch_size) \ + X(int, temporal_patch_size) \ + X(int, spatial_merge_size) \ + X(int, num_position_embeddings, 0) \ + X(int, window_size, 0) \ + X(bool, gated_mlp, false) \ + X(bool, use_window_attention, false) \ + X(bool, gelu_tanh, false) \ + X(NormType, norm_type, NormType::kLayerNorm) \ + X(std::vector, fullatt_block_indexes) \ + X(float, norm_eps, 1e-6f) + + TM_FOR_EACH(QwenVitConfig, QWENVIT_FIELDS) + +#undef QWENVIT_FIELDS +}; + +} // namespace turbomind::core + +namespace turbomind { + +// Forward decls +class LayerNormWeight; +class LinearWeight; +class QwenVitBlockWeight; + +/// Unified Qwen ViT weight tree (Qwen2-VL / Qwen2.5-VL / Qwen3.5). +/// +/// Tree: +/// patch_embed LinearWeight (Conv3d-as-Linear; in_dim = C·T·patch²) +/// pos_embed raw tensor (num_position_embeddings × hidden_dim) — Qwen3.5 only, optional +/// blocks ModuleList of QwenVitBlockWeight × depth +/// merger_fc1 LinearWeight (in: hidden·spatial_merge², out: 4·hidden) +/// merger_fc2 LinearWeight (in: 4·hidden, out: out_hidden) +/// merger_norm LayerNormWeight or NormWeight (over hidden_dim) +/// +/// We expose ``merger_*`` as direct children rather than a sub-module to +/// keep the weight tree shallow — the merger has only three pieces. +class QwenVitWeight: public VisionModelWeight { +public: + const char* type() const override + { + return "QwenVitWeight"; + } + + QwenVitWeight() = default; + explicit QwenVitWeight(const core::QwenVitConfig& cfg); + + void prepare() override; + bool verify(std::vector& missing) override; + + // --- X-macro field lists --- +#define QWENVIT_WEIGHT_CHILDREN(X) \ + X(LinearWeight, patch_embed) \ + X(core::ModuleList, blocks) \ + X(LinearWeight, merger_fc1) \ + X(LinearWeight, merger_fc2) \ + X(core::Module, merger_norm) + +#define QWENVIT_WEIGHT_PARAMS(X) X(pos_embed) + + TM_MODULE_DECLARE(QwenVitWeight, QWENVIT_WEIGHT_CHILDREN, QWENVIT_WEIGHT_PARAMS) + + // --- Accessors --- + const core::QwenVitConfig& config() const noexcept + { + return config_; + } + + QwenVitBlockWeight* block(int i) const; + +private: + core::QwenVitConfig config_{}; +}; + +} // namespace turbomind diff --git a/src/turbomind/models/qwen3_5vit/test_mrope_position_ids.cu b/src/turbomind/models/qwenvit/test_mrope_position_ids.cu similarity index 71% rename from src/turbomind/models/qwen3_5vit/test_mrope_position_ids.cu rename to src/turbomind/models/qwenvit/test_mrope_position_ids.cu index b622eda4c8..eb6854d081 100644 --- a/src/turbomind/models/qwen3_5vit/test_mrope_position_ids.cu +++ b/src/turbomind/models/qwenvit/test_mrope_position_ids.cu @@ -1,7 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. // // Standalone test for invokeMropePositionIds. Builds segment descriptors with the same -// logic the production Qwen3_5Vit::Impl::Setup() uses, runs the device kernel, and +// logic the production QwenVit::Impl::Setup() uses, runs the device kernel, and // compares against a CPU reference that replicates the pre-refactor scalar loop. #include @@ -13,12 +13,21 @@ #include -#include "src/turbomind/models/qwen3_5vit/mrope_position_ids.h" +#include "src/turbomind/models/qwenvit/qwenvit_kernels.h" using namespace turbomind; namespace { +#define CHECK_CUDA(call) \ + do { \ + const cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::printf("[CUDA] %s failed: %s\n", #call, cudaGetErrorString(err)); \ + return 1; \ + } \ + } while (0) + struct ImageSpec { int seq_start; int t; @@ -36,7 +45,6 @@ struct RequestSpec { // Pre-refactor scalar reference: builds the full (seq_len, 3) row for each request, // then we read out the [active_start, active_end) slice to compare against kernel output. -// Mirrors qwen3_5vit.cc:440-516 (pre-refactor) exactly. std::vector cpu_reference_full_row(const RequestSpec& r, int S, int& out_delta) { std::vector row(r.seq_len * 3, 0); @@ -87,8 +95,8 @@ std::vector cpu_reference_full_row(const RequestSpec& r, int S, int& out_de return row; } -// Mirrors the host walk in qwen3_5vit.cc Setup() — same logic, returns the segment list. -void emit_segments(const RequestSpec& r, int request_idx, int S, std::vector& out) +// Mirrors the host walk in qwenvit.cc Setup() - same logic, returns the segment list. +void emit_segments(const RequestSpec& r, int q_offset, int S, std::vector& out) { if (r.autoregres || r.images.empty()) { return; @@ -102,8 +110,7 @@ void emit_segments(const RequestSpec& r, int request_idx, int S, std::vector& batch, int S) { - const int bsz = (int)batch.size(); - int max_active_end = 0; - int max_seg_len = 0; - bool any_table = false; + const int bsz = (int)batch.size(); + std::vector q_offsets(bsz + 1, 0); + int max_seg_len = 0; + bool any_table = false; std::vector segs; for (int i = 0; i < bsz; ++i) { + q_offsets[i + 1] = q_offsets[i] + (batch[i].autoregres ? 1 : batch[i].active_end - batch[i].active_start); const size_t before = segs.size(); - emit_segments(batch[i], i, S, segs); + emit_segments(batch[i], q_offsets[i], S, segs); for (size_t j = before; j < segs.size(); ++j) { max_seg_len = std::max(max_seg_len, segs[j].n_tok); } if (!batch[i].autoregres && !batch[i].images.empty()) { - max_active_end = std::max(max_active_end, batch[i].active_end); - any_table = true; + any_table = true; } } // Run kernel std::vector kernel_out; if (any_table) { - const ssize_t pos_ids_count = (ssize_t)bsz * max_active_end * 3; + const ssize_t pos_ids_count = (ssize_t)q_offsets.back() * 3; int* d_pos_ids = nullptr; - cudaMalloc(&d_pos_ids, pos_ids_count * sizeof(int)); - cudaMemset(d_pos_ids, 0xCC, pos_ids_count * sizeof(int)); // poison + CHECK_CUDA(cudaMalloc(&d_pos_ids, pos_ids_count * sizeof(int))); + CHECK_CUDA(cudaMemset(d_pos_ids, 0xCC, pos_ids_count * sizeof(int))); // poison MropeSegment* d_segs = nullptr; - cudaMalloc(&d_segs, segs.size() * sizeof(MropeSegment)); - cudaMemcpy(d_segs, segs.data(), segs.size() * sizeof(MropeSegment), cudaMemcpyHostToDevice); + CHECK_CUDA(cudaMalloc(&d_segs, segs.size() * sizeof(MropeSegment))); + CHECK_CUDA(cudaMemcpy(d_segs, segs.data(), segs.size() * sizeof(MropeSegment), cudaMemcpyHostToDevice)); invokeMropePositionIds(d_pos_ids, - max_active_end * 3, d_segs, (int)segs.size(), max_seg_len, /*stream=*/0); - cudaDeviceSynchronize(); + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); kernel_out.resize(pos_ids_count); - cudaMemcpy(kernel_out.data(), d_pos_ids, pos_ids_count * sizeof(int), cudaMemcpyDeviceToHost); + CHECK_CUDA(cudaMemcpy(kernel_out.data(), d_pos_ids, pos_ids_count * sizeof(int), cudaMemcpyDeviceToHost)); - cudaFree(d_pos_ids); - cudaFree(d_segs); + CHECK_CUDA(cudaFree(d_pos_ids)); + CHECK_CUDA(cudaFree(d_segs)); } // Compare against CPU reference within each request's active range @@ -188,7 +195,7 @@ int run_case(const std::string& name, const std::vector& batch, int } for (int k = r.active_start; k < r.active_end; ++k) { for (int c = 0; c < 3; ++c) { - const int got = kernel_out[(size_t)i * max_active_end * 3 + (size_t)k * 3 + c]; + const int got = kernel_out[((size_t)q_offsets[i] + k - r.active_start) * 3 + c]; const int ref = full_row[k * 3 + c]; if (got != ref) { if (errors < 16) { @@ -202,10 +209,10 @@ int run_case(const std::string& name, const std::vector& batch, int } if (errors == 0) { - std::printf("[PASS] %s — bsz=%d segs=%zu max_active_end=%d\n", name.c_str(), bsz, segs.size(), max_active_end); + std::printf("[PASS] %s - bsz=%d segs=%zu q_tokens=%d\n", name.c_str(), bsz, segs.size(), q_offsets.back()); } else { - std::printf("[FAIL] %s — %d mismatches\n", name.c_str(), errors); + std::printf("[FAIL] %s - %d mismatches\n", name.c_str(), errors); } return errors; } @@ -217,7 +224,7 @@ int main() const int S = 2; // spatial_merge_size int errors = 0; - // (a) Decode-only batch — no table writes expected. + // (a) Decode-only batch - no table writes expected. errors += run_case("decode_only", {RequestSpec{/*seq_len=*/64, /*active_start=*/64, @@ -226,7 +233,7 @@ int main() /*images=*/{}}}, S); - // (b) Pure-text prefill — empty images, identity positions. + // (b) Pure-text prefill - empty images, identity positions. errors += run_case("pure_text", {RequestSpec{32, 0, 32, false, {}}}, S); // (c) Single-image prefill (image in the middle). @@ -249,7 +256,7 @@ int main() RequestSpec{30, 0, 30, false, {{6, 1, 6, 4}}}}, // image prefill S); - // (g) Chunked prefill — second chunk, history_len > 0, active range mid-prompt. + // (g) Chunked prefill - second chunk, history_len > 0, active range mid-prompt. // Image overlaps both chunks. errors += run_case("chunked_prefill", {RequestSpec{60, 16, 32, false, {{8, 1, 8, 8}}}}, // image spans 8..24 @@ -258,10 +265,15 @@ int main() // (h) Multi-image with clipping. errors += run_case("multi_image_clip", {RequestSpec{80, 10, 50, false, {{6, 1, 4, 4}, {30, 2, 4, 4}}}}, S); + // (i) Chunked prefill fully after image tokens - validates post-image delta/table rows. + errors += run_case("chunked_after_image", + {RequestSpec{80, 48, 72, false, {{8, 1, 8, 8}}}}, // image spans 8..24 + S); + if (errors == 0) { std::printf("All cases passed.\n"); return 0; } - std::printf("FAILED — %d total mismatches.\n", errors); + std::printf("FAILED - %d total mismatches.\n", errors); return 1; } diff --git a/src/turbomind/models/vision_model.cc b/src/turbomind/models/vision_model.cc index 6cfc8cbf60..4f2e10dcf4 100644 --- a/src/turbomind/models/vision_model.cc +++ b/src/turbomind/models/vision_model.cc @@ -3,8 +3,10 @@ #include "src/turbomind/models/vision_model.h" #include "src/turbomind/core/logger.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h" +#include "src/turbomind/models/internvit/internvit.h" +#include "src/turbomind/models/internvit/internvit_weight.h" +#include "src/turbomind/models/qwenvit/qwenvit.h" +#include "src/turbomind/models/qwenvit/qwenvit_weight.h" #include @@ -15,8 +17,11 @@ std::unique_ptr CreateVisionModel(const VisionModelWeight& weights, const Context& ctx, int phases) { - if (std::string_view{weights.type()} == "Qwen3_5VitWeight") { - return std::make_unique(engine, ctx, static_cast(weights), phases); + if (std::string_view{weights.type()} == "QwenVitWeight") { + return std::make_unique(engine, ctx, static_cast(weights), phases); + } + if (std::string_view{weights.type()} == "InternVitWeight") { + return std::make_unique(engine, ctx, static_cast(weights), phases); } TM_LOG_FATAL("Unsupported vision model weight type: {}", weights.type()); diff --git a/src/turbomind/models/vision_model.h b/src/turbomind/models/vision_model.h index 281ec6cc6b..a1c2edc9aa 100644 --- a/src/turbomind/models/vision_model.h +++ b/src/turbomind/models/vision_model.h @@ -16,7 +16,7 @@ namespace turbomind { /// Polymorphic peer of ``LanguageModel`` for the vision sub-graph. /// -/// Concrete subclasses (one per VLM family — ``Qwen3_5Vit``, +/// Concrete subclasses (one per VLM family — ``QwenVit``, /// ``InternVit``, …) wire up the per-family C++ runtime. The /// engine talks to this base via ``Run(BatchOp, phase, env)``, /// mirroring ``LanguageModel::Run``. diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 1a7a107a7f..21917807d9 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -22,18 +22,22 @@ #include "src/turbomind/engine/engine_config.h" #include "src/turbomind/engine/model_request.h" #include "src/turbomind/engine/multimodal_input.h" +#include "src/turbomind/kernels/norm/norm.h" #include "src/turbomind/models/attention_weight.h" #include "src/turbomind/models/decoder_layer_weight.h" #include "src/turbomind/models/delta_net_weight.h" #include "src/turbomind/models/ffn_weight.h" +#include "src/turbomind/models/internvit/internvit_block_weight.h" +#include "src/turbomind/models/internvit/internvit_input.h" +#include "src/turbomind/models/internvit/internvit_weight.h" #include "src/turbomind/models/layer_norm_weight.h" #include "src/turbomind/models/linear_weight.h" #include "src/turbomind/models/model_weight.h" #include "src/turbomind/models/moe_weight.h" #include "src/turbomind/models/norm_weight.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h" -#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h" +#include "src/turbomind/models/qwenvit/qwenvit_block_weight.h" +#include "src/turbomind/models/qwenvit/qwenvit_input.h" +#include "src/turbomind/models/qwenvit/qwenvit_weight.h" #include "src/turbomind/models/vision_model_weight.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/turbomind.h" @@ -333,10 +337,12 @@ PYBIND11_MODULE(_turbomind, m) { py::module_ multimodal = m.def_submodule("multimodal"); - using MMInput = ft::multimodal::Input; - using MMModality = ft::multimodal::Modality; - using QwenVitItem = ft::multimodal::Qwen3_5VitItem; - using QwenVitInput = ft::multimodal::Qwen3_5VitInput; + using MMInput = ft::multimodal::Input; + using MMModality = ft::multimodal::Modality; + using InternVitItem = ft::multimodal::InternVitItem; + using InternVitInput = ft::multimodal::InternVitInput; + using QwenVitItem = ft::multimodal::QwenVitItem; + using QwenVitInput = ft::multimodal::QwenVitInput; py::class_>(multimodal, "Input"); py::enum_(multimodal, "Modality") .value("IMAGE", MMModality::kImage) @@ -344,7 +350,7 @@ PYBIND11_MODULE(_turbomind, m) .value("AUDIO", MMModality::kAudio) .value("TIME_SERIES", MMModality::kTimeSeries) .export_values(); - py::class_(multimodal, "Qwen3_5VitItem") + py::class_(multimodal, "QwenVitItem") .def(py::init<>()) .def(py::init([](MMModality modality, std::shared_ptr data, @@ -366,10 +372,30 @@ PYBIND11_MODULE(_turbomind, m) .def_readwrite("token_begin", &QwenVitItem::token_begin) .def_readwrite("token_end", &QwenVitItem::token_end) .def_readwrite("grid_thw", &QwenVitItem::grid_thw); - py::class_>(multimodal, "Qwen3_5VitInput") + py::class_>(multimodal, "QwenVitInput") .def(py::init<>()) .def(py::init>(), "items"_a) .def_readwrite("items", &QwenVitInput::items); + py::class_(multimodal, "InternVitItem") + .def(py::init<>()) + .def(py::init([](MMModality modality, std::shared_ptr data, int token_begin, int token_end) { + return InternVitItem{modality, *data, token_begin, token_end}; + }), + "modality"_a, + "data"_a, + "token_begin"_a, + "token_end"_a) + .def_readwrite("modality", &InternVitItem::modality) + .def_property( + "data", + [](const InternVitItem& self) { return std::make_shared(self.data); }, + [](InternVitItem& self, std::shared_ptr data) { self.data = *data; }) + .def_readwrite("token_begin", &InternVitItem::token_begin) + .def_readwrite("token_end", &InternVitItem::token_end); + py::class_>(multimodal, "InternVitInput") + .def(py::init<>()) + .def(py::init>(), "items"_a) + .def_readwrite("items", &InternVitInput::items); py::class_>(m, "RequestMetrics") .def(py::init()) @@ -440,9 +466,9 @@ PYBIND11_MODULE(_turbomind, m) py::class_>(m, "AtomicRequestState") .def("consume", [](ft::AtomicRequestState& s) { return s.exchange(nullptr); }); - // data type { using namespace turbomind; + // data type py::enum_(m, "DataType") .value("TYPE_INVALID", kNull) .value("TYPE_BOOL", kBool) @@ -467,6 +493,12 @@ PYBIND11_MODULE(_turbomind, m) .value("MEMORY_CPU", ft::DeviceType::kCPU) .value("MEMORY_CPU_PINNED", ft::DeviceType::kCPUpinned) .value("MEMORY_GPU", ft::DeviceType::kDEVICE); + + // norm type + py::enum_(m, "NormType") + .value("NONE", ft::NormType::kNone) + .value("LAYER_NORM", ft::NormType::kLayerNorm) + .value("RMS_NORM", ft::NormType::kRMSNorm); } // DataFormat descriptors @@ -508,8 +540,10 @@ PYBIND11_MODULE(_turbomind, m) bind_config(m, "DecoderLayerConfig"); bind_config(m, "ModelWeightConfig"); bind_config(m, "LayerNormConfig"); - bind_config(m, "Qwen3_5VitConfig"); - bind_config(m, "Qwen3_5VitBlockConfig"); + bind_config(m, "QwenVitConfig"); + bind_config(m, "QwenVitBlockConfig"); + bind_config(m, "InternVitConfig"); + bind_config(m, "InternVitBlockConfig"); // tensor py::class_>(m, "Tensor")