diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index cf2452935e..a5c7266c3e 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -399,13 +399,14 @@ async def _get_multimodal_prompt_input(self,
                                                            mm_processor_kwargs=mm_processor_kwargs)
             else:
                 results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs=mm_processor_kwargs)
-                results = await self.vl_encoder.async_infer(results)
+                if not self.vl_encoder.model._turbomind_native_vision:
+                    results = await self.vl_encoder.async_infer(results)
                 results = await self.vl_encoder.wrap_for_turbomind(messages=results,
-                                                                chat_template=chat_template,
-                                                                tokenizer=self.tokenizer,
-                                                                sequence_start=sequence_start,
-                                                                tools=tools,
-                                                                chat_template_kwargs=chat_template_kwargs)
+                                                                   chat_template=chat_template,
+                                                                   tokenizer=self.tokenizer,
+                                                                   sequence_start=sequence_start,
+                                                                   tools=tools,
+                                                                   chat_template_kwargs=chat_template_kwargs)
         elif self.backend == 'pytorch':
             if self.vl_encoder._uses_new_preprocess:
                 input_prompt = self.vl_encoder.model.get_input_prompt(messages=messages,
diff --git a/lmdeploy/turbomind/models/__init__.py b/lmdeploy/turbomind/models/__init__.py
index 8402a3edaf..1d5f18f4ce 100644
--- a/lmdeploy/turbomind/models/__init__.py
+++ b/lmdeploy/turbomind/models/__init__.py
@@ -6,5 +6,6 @@
 from .llama import LlamaModel  # noqa: F401
 from .mixtral import MixtralModel  # noqa: F401
 from .qwen2 import Qwen2Model  # noqa: F401
+from .qwen2_vl import Qwen2VLModel  # noqa: F401
 from .qwen3 import Qwen3TextModel  # noqa: F401
 from .qwen3_5 import Qwen3_5Model, Qwen3_5TextModel, Qwen3_5VisionModel  # noqa: F401
diff --git a/lmdeploy/turbomind/models/internvl.py b/lmdeploy/turbomind/models/internvl.py
index ee3f344850..2603df4f1c 100644
--- a/lmdeploy/turbomind/models/internvl.py
+++ b/lmdeploy/turbomind/models/internvl.py
@@ -3,9 +3,32 @@
 HF-style InternVL/InternS1)."""
 from __future__ import annotations
 
+from types import SimpleNamespace
+from typing import Any
+
+import _turbomind as _tm
+import torch
 from transformers import PretrainedConfig
 
+from lmdeploy.vl.constants import Modality
+
+from ..builders import (
+    AttentionBuilder,
+    Builder,
+    LayerNormBuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    NormBuilder,
+    SplitSide,
+    VisionModelBuilder,
+    make_layer_norm_config,
+    make_norm_config,
+)
+from ..builders._base import ParallelGroup
+from ..linear import Linear, transform_output_dim
 from ..supported_models import SUPPORTED_ARCHS
+from ..text_model import TextModel
+from ..weight_format import TrivialFormat
 from .base import INPUT_MODELS
 
 
@@ -15,6 +38,14 @@ def _cfg_get(cfg, name: str, default=None):
     return getattr(cfg, name, default)
 
 
+def _to_tm_norm_type(norm_type: str):
+    if norm_type == 'layer_norm':
+        return _tm.NormType.LAYER_NORM
+    if norm_type == 'rms_norm':
+        return _tm.NormType.RMS_NORM
+    raise ValueError(f'Unsupported InternVit vision norm_type: {norm_type!r}')
+
+
 def map_interns1_hf_keys(name: str) -> str:
     """Map Intern-S1 HF VLM checkpoint keys to the Qwen3 text loader layout."""
     language_model_prefix = 'model.language_model.'
@@ -26,12 +57,339 @@ def map_interns1_hf_keys(name: str) -> str:
     return name
 
 
+def map_internvl_hf_keys(name: str) -> str:
+    """Map InternVL HF vision keys to the InternVit loader layout."""
+    if name.startswith('vision_tower.') or name.startswith('multi_modal_projector.'):
+        return f'model.{name}'
+    return name
+
+
+def map_legacy_internvl_keys(name: str) -> str:
+    """Map legacy InternVLChatModel ViT keys to the InternVit loader layout."""
+    if name == 'vision_model.embeddings.class_embedding':
+        return 'model.vision_tower.embeddings.cls_token'
+    if name == 'vision_model.embeddings.position_embedding':
+        return 'model.vision_tower.embeddings.position_embeddings'
+
+    patch_embed = 'vision_model.embeddings.patch_embedding.'
+    if name.startswith(patch_embed):
+        suffix = name[len(patch_embed):]
+        return f'model.vision_tower.embeddings.patch_embeddings.projection.{suffix}'
+
+    block_prefix = 'vision_model.encoder.layers.'
+    if name.startswith(block_prefix):
+        rest = name[len(block_prefix):]
+        if '.' not in rest:
+            return name
+        layer_id, rest = rest.split('.', 1)
+        prefix = f'model.vision_tower.encoder.layer.{layer_id}.'
+
+        if rest == 'ls1':
+            return prefix + 'lambda_1'
+        if rest == 'ls2':
+            return prefix + 'lambda_2'
+        if rest.startswith('norm1.'):
+            return prefix + 'layernorm_before.' + rest[len('norm1.'):]
+        if rest.startswith('norm2.'):
+            return prefix + 'layernorm_after.' + rest[len('norm2.'):]
+        if rest.startswith('attn.qkv.'):
+            return prefix + 'attention.qkv.' + rest[len('attn.qkv.'):]
+        if rest.startswith('attn.proj.'):
+            return prefix + 'attention.projection_layer.' + rest[len('attn.proj.'):]
+        if rest.startswith('attn.q_norm.'):
+            return prefix + 'attention.q_norm.' + rest[len('attn.q_norm.'):]
+        if rest.startswith('attn.k_norm.'):
+            return prefix + 'attention.k_norm.' + rest[len('attn.k_norm.'):]
+        if rest.startswith('mlp.'):
+            return prefix + rest
+
+    if name.startswith('mlp1.0.'):
+        return 'model.multi_modal_projector.layer_norm.' + name[len('mlp1.0.'):]
+    if name.startswith('mlp1.1.'):
+        return 'model.multi_modal_projector.linear_1.' + name[len('mlp1.1.'):]
+    if name.startswith('mlp1.3.'):
+        return 'model.multi_modal_projector.linear_2.' + name[len('mlp1.3.'):]
+
+    return name
+
+
+def _validate_legacy_internvl_chat(cfg):
+    cfg = _legacy_namespace(cfg)
+    if getattr(cfg, 'ps_version', None) != 'v2':
+        raise ValueError(
+            f"InternVLChatModel TurboMind native ViT requires ps_version='v2', "
+            f"got {getattr(cfg, 'ps_version', None)!r}.")
+
+
+def _legacy_namespace(cfg):
+    return SimpleNamespace(**cfg) if isinstance(cfg, dict) else cfg
+
+
+def _legacy_square_size(value) -> int:
+    if isinstance(value, (list, tuple)):
+        if len(value) != 2 or value[0] != value[1]:
+            raise ValueError(f'legacy InternVit expects a square size, got {value!r}')
+        value = value[0]
+    return int(value)
+
+
+@transform_output_dim
+def _split_packed_vision_qkv(tensor: torch.Tensor):
+    """Split packed vision QKV layout [Q | K | V] along output dim."""
+    if tensor.shape[-1] % 3 != 0:
+        raise ValueError(f'packed vision qkv output dim is not divisible by 3: {tuple(tensor.shape)}')
+    return tuple(x.contiguous() for x in tensor.chunk(3, dim=-1))
+
+
+class InternVitVisionModel(TextModel):
+    """InternVit weight model rooted at ``ModelRoot.vision_model``."""
+
+    def __init__(self, cfg: PretrainedConfig, *, resolver, parent_cfg: PretrainedConfig):
+        super().__init__(cfg, resolver=resolver)
+
+        self._hidden = int(cfg.hidden_size)
+        self._heads = int(cfg.num_attention_heads)
+        self._depth = int(cfg.num_hidden_layers)
+        self._inter = int(cfg.intermediate_size)
+        self._channels = int(cfg.num_channels)
+        image_h, image_w = cfg.image_size
+        patch_h, patch_w = cfg.patch_size
+        self._image_h, self._image_w = int(image_h), int(image_w)
+        self._patch_h, self._patch_w = int(patch_h), int(patch_w)
+        self._norm_eps = float(cfg.layer_norm_eps)
+        self._norm_type = _to_tm_norm_type(cfg.norm_type)
+        self._use_qk_norm = bool(cfg.use_qk_norm)
+        self._head_dim = self._hidden // self._heads
+        self._patch_in_dim = self._channels * self._patch_h * self._patch_w
+        self._num_patches = (self._image_h // self._patch_h) * (self._image_w // self._patch_w)
+        self._downsample_ratio = float(parent_cfg.downsample_ratio)
+        self._image_seq_length = int(parent_cfg.image_seq_length)
+        self._out_hidden = int(parent_cfg.text_config.hidden_size)
+        self._projector_scale = int(round(1.0 / self._downsample_ratio))
+        self._projector_in_dim = self._hidden * self._projector_scale * self._projector_scale
+
+    def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]):
+        items = []
+        for input_mm in multimodal:
+            modality = input_mm.get('modality', Modality.IMAGE)
+            if modality not in (Modality.IMAGE, Modality.IMAGE.value, 'image'):
+                raise ValueError(f'InternVit TurboMind does not support modality {modality!r}')
+
+            pixel_values = self._tm_tensor(input_mm['pixel_values'])
+            token_begin = int(input_mm['offset'])
+            token_end = token_begin + int(input_mm['image_tokens'])
+            items.append(
+                _tm.multimodal.InternVitItem(
+                    modality=_tm.multimodal.Modality.IMAGE,
+                    data=pixel_values,
+                    token_begin=token_begin,
+                    token_end=token_end,
+                ))
+
+        return _tm.multimodal.InternVitInput(items)
+
+    def model(self, pfx):
+        self._build_vision_model(pfx + 'model.vision_tower', pfx + 'model.multi_modal_projector')
+
+    def _build_vision_model(self, vision_pfx, projector_pfx):
+        cfg = self._make_root_cfg()
+        root = self._restore_dtype(VisionModelBuilder(
+            cfg, self._ctx, root_handles=self._root_handles, tp=self._model_tp))
+
+        emb_pfx = vision_pfx + 'embeddings'
+        root._add_tensor('cls_token', (emb_pfx + 'cls_token').pop())
+        root._add_tensor('position_embeddings', (emb_pfx + 'position_embeddings').pop())
+        root._add_linear('patch_embed', self._patch_embed(emb_pfx + 'patch_embeddings.projection'))
+        root.blocks = self.vit_blocks(vision_pfx + 'encoder.layer')
+
+        root.projector_norm = self._layer_norm(projector_pfx + 'layer_norm',
+                                               dim=self._projector_in_dim,
+                                               norm_eps=1e-5)
+        root._add_linear('projector_fc1', self._linear(projector_pfx + 'linear_1'), SplitSide.OUTPUT)
+        root._add_linear('projector_fc2', self._linear(projector_pfx + 'linear_2'), SplitSide.INPUT)
+        root.build()
+
+    def _make_root_cfg(self):
+        cfg = _tm.InternVitConfig()
+        cfg.data_type = self._resolver.data_type
+        cfg.hidden_dim = self._hidden
+        cfg.depth = self._depth
+        cfg.patch_in_dim = self._patch_in_dim
+        cfg.in_channels = self._channels
+        cfg.image_height = self._image_h
+        cfg.image_width = self._image_w
+        cfg.patch_height = self._patch_h
+        cfg.patch_width = self._patch_w
+        cfg.num_patches = self._num_patches
+        cfg.image_seq_length = self._image_seq_length
+        cfg.norm_type = self._norm_type
+        return cfg
+
+    def _make_block_cfg(self):
+        cfg = _tm.InternVitBlockConfig()
+        cfg.data_type = self._resolver.data_type
+        cfg.hidden_dim = self._hidden
+        cfg.head_num = self._heads
+        cfg.intermediate_size = self._inter
+        cfg.norm_eps = self._norm_eps
+        return cfg
+
+    def _make_attn_cfg(self):
+        cfg = _tm.AttentionConfig()
+        cfg.data_type = self._resolver.data_type
+        cfg.hidden_dim = self._hidden
+        cfg.head_dim = self._head_dim
+        cfg.head_num = self._heads
+        cfg.kv_head_num = self._heads
+        cfg.window_size = 0
+        cfg.causal = False
+        return cfg
+
+    def vit_blocks(self, pfx):
+        blocks = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for i, p in pfx.slices(0, self._depth):
+            blocks[i] = self.vit_block(p)
+        return blocks.build()
+
+    def vit_block(self, pfx):
+        b = self._restore_dtype(Builder(self._make_block_cfg(), self._ctx))
+        b.tp = self._model_tp
+
+        b.norm1 = self._vision_norm(pfx + 'layernorm_before')
+        b.norm2 = self._vision_norm(pfx + 'layernorm_after')
+        b.attention = self.vit_attn(pfx + 'attention')
+        b._add_linear('mlp_fc1', self._linear(pfx + 'mlp.fc1'), SplitSide.OUTPUT)
+        b._add_linear('mlp_fc2', self._linear(pfx + 'mlp.fc2'), SplitSide.INPUT)
+        b._add_tensor('lambda_1', (pfx + 'lambda_1').pop())
+        b._add_tensor('lambda_2', (pfx + 'lambda_2').pop())
+        return b.build()
+
+    def vit_attn(self, pfx):
+        q = self._linear(pfx + 'q_proj')
+        k = self._linear(pfx + 'k_proj')
+        v = self._linear(pfx + 'v_proj')
+        o = self._linear(pfx + 'projection_layer')
+
+        cfg = self._make_attn_cfg()
+        attn_tp = self._model_tp if self._heads % self._model_tp.size == 0 else ParallelGroup(1, None)
+        m = self._restore_dtype(AttentionBuilder(cfg, self._ctx, tp=attn_tp))
+        m.add_qkv_proj(q, k, v)
+        m.add_o_proj(o)
+        if self._use_qk_norm and (pfx + 'q_norm').has('weight') and (pfx + 'k_norm').has('weight'):
+            m.q_norm = self._rms_norm(pfx + 'q_norm', tp=attn_tp)
+            m.k_norm = self._rms_norm(pfx + 'k_norm', tp=attn_tp)
+        return m.build()
+
+    def _tm_tensor(self, tensor: torch.Tensor):
+        if not isinstance(tensor, torch.Tensor):
+            raise TypeError(f'InternVit multimodal data should be a torch.Tensor, got {type(tensor).__name__}')
+        return _tm.from_dlpack(tensor.contiguous())
+
+    def _restore_dtype(self, builder):
+        builder.config.data_type = self._resolver.data_type
+        return builder
+
+    def _patch_embed(self, pfx):
+        weight = pfx.pop('weight')
+        weight = weight.reshape(weight.shape[0], -1).t().contiguous()
+        tensors = {'weight': weight}
+        if pfx.has('bias'):
+            tensors['bias'] = pfx.pop('bias')
+        return Linear(tensors=tensors, weight_format=TrivialFormat())
+
+    def _vision_norm(self, pfx):
+        if self._norm_type == _tm.NormType.LAYER_NORM:
+            return self._layer_norm(pfx, dim=self._hidden, norm_eps=self._norm_eps)
+        elif self._norm_type == _tm.NormType.RMS_NORM:
+            return self._rms_norm(pfx, tp=ParallelGroup(1, None))
+        else:
+            raise ValueError(f'Unsupported InternVit vision norm_type: {self._norm_type!r}')
+
+    def _rms_norm(self, pfx, tp: ParallelGroup):
+        weight = pfx.pop('weight')
+        tp_size = tp.size
+        dim = weight.shape[-1]
+        if tp_size > 1:
+            assert dim % tp_size == 0, (
+                f'{pfx}.weight dim={dim} is not divisible by tp={tp_size}')
+            dim //= tp_size
+        cfg = make_norm_config(dim=dim, norm_eps=self._norm_eps)
+        cfg.data_type = self._resolver.data_type
+        m = self._restore_dtype(NormBuilder(cfg, self._ctx))
+        if tp_size > 1:
+            m.tp = tp
+            m._add_tensor('weight', weight, SplitSide.OUTPUT)
+        else:
+            m.set_weight(weight)
+        return m.build()
+
+    def _layer_norm(self, pfx, *, dim: int, norm_eps: float):
+        weight = pfx.pop('weight')
+        bias = pfx.pop('bias') if pfx.has('bias') else None
+        cfg = make_layer_norm_config(dim=dim, data_type=self._resolver.data_type, norm_eps=norm_eps)
+        m = self._restore_dtype(LayerNormBuilder(cfg, self._ctx))
+        m.set_weight(weight, bias=bias)
+        return m.build()
+
+
+class LegacyInternVitVisionModel(InternVitVisionModel):
+    """Legacy InternVLChatModel ViT adapter for the canonical InternVit layout.
+
+    Legacy InternVL stores attention as a single ``attn.qkv`` linear. Other
+    weights are normalized through ``map_legacy_internvl_keys``.
+    """
+
+    def __init__(self, cfg: PretrainedConfig, *, resolver, parent_cfg: PretrainedConfig):
+        cfg = _legacy_namespace(cfg)
+        parent_cfg = _legacy_namespace(parent_cfg)
+        llm_cfg = _legacy_namespace(parent_cfg.llm_config)
+        image_size = _legacy_square_size(cfg.image_size)
+        patch_size = _legacy_square_size(cfg.patch_size)
+        downsample_ratio = float(parent_cfg.downsample_ratio)
+        image_seq_length = int((image_size // patch_size)**2 * (downsample_ratio**2))
+
+        normalized_cfg = SimpleNamespace(
+            hidden_size=cfg.hidden_size,
+            num_attention_heads=cfg.num_attention_heads,
+            num_hidden_layers=cfg.num_hidden_layers,
+            intermediate_size=cfg.intermediate_size,
+            num_channels=cfg.num_channels,
+            image_size=(image_size, image_size),
+            patch_size=(patch_size, patch_size),
+            layer_norm_eps=cfg.layer_norm_eps,
+            norm_type=cfg.norm_type,
+            use_qk_norm=cfg.qk_normalization,
+        )
+        normalized_parent_cfg = SimpleNamespace(
+            downsample_ratio=downsample_ratio,
+            image_seq_length=image_seq_length,
+            text_config=llm_cfg,
+        )
+        super().__init__(normalized_cfg, resolver=resolver, parent_cfg=normalized_parent_cfg)
+
+    def vit_attn(self, pfx):
+        q, k, v = _split_packed_vision_qkv(self._linear(pfx + 'qkv'))
+        o = self._linear(pfx + 'projection_layer')
+
+        cfg = self._make_attn_cfg()
+        attn_tp = self._model_tp if self._heads % self._model_tp.size == 0 else ParallelGroup(1, None)
+        m = self._restore_dtype(AttentionBuilder(cfg, self._ctx, tp=attn_tp))
+        m.add_qkv_proj(q, k, v)
+        m.add_o_proj(o)
+        if self._use_qk_norm and (pfx + 'q_norm').has('weight') and (pfx + 'k_norm').has('weight'):
+            m.q_norm = self._rms_norm(pfx + 'q_norm', tp=attn_tp)
+            m.k_norm = self._rms_norm(pfx + 'k_norm', tp=attn_tp)
+        return m.build()
+
+
 @INPUT_MODELS.register_module(name='internvl')
 class InternVLModel:
     """Aggregate source model for InternVL checkpoints with any registered text
     model."""
 
-    def __init__(self, cfg: PretrainedConfig, *, resolver):
+    _vision = True
+
+    def __init__(self, cfg: PretrainedConfig, *, resolver, vision_resolver=None, disable_vision_encoder: bool = False):
         llm_cfg = _cfg_get(cfg, 'llm_config')
         if llm_cfg is None:
             llm_cfg = _cfg_get(cfg, 'text_config')
@@ -55,9 +413,28 @@ def __init__(self, cfg: PretrainedConfig, *, resolver):
         self.text_model = text_model_cls(llm_cfg, resolver=resolver)
         archs = _cfg_get(cfg, 'architectures') or []
         self._checkpoint_mappings = []
-        if archs and archs[0] == 'InternS1ForConditionalGeneration':
+        arch = archs[0] if archs else None
+        if arch == 'InternS1ForConditionalGeneration':
             self._checkpoint_mappings.append(map_interns1_hf_keys)
-        self.vision_model = None
+        elif arch == 'InternVLForConditionalGeneration':
+            self._checkpoint_mappings.append(map_internvl_hf_keys)
+        elif arch == 'InternVLChatModel':
+            self._checkpoint_mappings.append(map_legacy_internvl_keys)
+        vision_cfg = cfg.vision_config if hasattr(cfg, 'vision_config') else None
+        if not disable_vision_encoder and vision_cfg is not None:
+            if arch == 'InternVLChatModel':
+                _validate_legacy_internvl_chat(cfg)
+                self.vision_model = LegacyInternVitVisionModel(vision_cfg,
+                                                               resolver=vision_resolver or resolver,
+                                                               parent_cfg=cfg)
+            elif arch in ('InternS1ForConditionalGeneration', 'InternVLForConditionalGeneration'):
+                self.vision_model = InternVitVisionModel(vision_cfg,
+                                                         resolver=vision_resolver or resolver,
+                                                         parent_cfg=cfg)
+            else:
+                raise ValueError(f'InternVL TurboMind vision architecture {arch!r} is not supported.')
+        else:
+            self.vision_model = None
 
     def bind_runtime(self, *, ctx, root_handles,
                      attn_tp, mlp_tp, model_tp):
@@ -68,6 +445,14 @@ def bind_runtime(self, *, ctx, root_handles,
             mlp_tp=mlp_tp,
             model_tp=model_tp,
         )
+        if self.vision_model is not None:
+            self.vision_model.bind_runtime(
+                ctx=ctx,
+                root_handles=root_handles,
+                attn_tp=attn_tp,
+                mlp_tp=mlp_tp,
+                model_tp=model_tp,
+            )
 
     @property
     def _vocab_size(self):
@@ -77,5 +462,12 @@ def _vocab_size(self):
     def _loader_mappings(self):
         return self._checkpoint_mappings + list(getattr(type(self.text_model), '_loader_mappings', []))
 
+    def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]):
+        if self.vision_model is None:
+            raise ValueError('InternVL TurboMind vision encoder is not available.')
+        return self.vision_model.to_turbomind_multimodal(multimodal)
+
     def model(self, pfx):
         self.text_model.model(pfx + 'language_model')
+        if self.vision_model is not None:
+            self.vision_model.model(pfx)
diff --git a/lmdeploy/turbomind/models/qwen2_vl.py b/lmdeploy/turbomind/models/qwen2_vl.py
new file mode 100644
index 0000000000..1c58d4735e
--- /dev/null
+++ b/lmdeploy/turbomind/models/qwen2_vl.py
@@ -0,0 +1,449 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Qwen2-VL / Qwen2.5-VL aggregate source model for TurboMind."""
+from __future__ import annotations
+
+import math
+from typing import Any
+
+import _turbomind as _tm
+import torch
+
+from lmdeploy.vl.constants import Modality
+
+from ..builders import (
+    AttentionBuilder,
+    Builder,
+    LayerNormBuilder,
+    ModuleListBuilder,
+    ModuleListConfig,
+    NormBuilder,
+    SplitSide,
+    VisionModelBuilder,
+    make_layer_norm_config,
+    make_norm_config,
+)
+from ..builders._base import ParallelGroup
+from ..linear import Linear, transform_input_dim, transform_output_dim
+from ..text_model import TextModel
+from ..weight_format import TrivialFormat
+from .base import INPUT_MODELS
+from .qwen2 import Qwen2Model
+from .qwen3_5 import (
+    _assert_trivial,
+    _pad_head_dim_in,
+    _pad_head_dim_out,
+    _split_packed_vision_qkv,
+)
+from .utils import reorder_rotary_emb
+
+_VIT_HEAD_DIM_PADDED = {
+    64: 64,
+    80: 128,
+}
+
+
+@transform_output_dim
+def _pad_output_dim(t: torch.Tensor, *, dst_dim: int) -> torch.Tensor:
+    pad = dst_dim - t.shape[-1]
+    if pad <= 0:
+        return t
+    return torch.cat([t, t.new_zeros(t.shape[:-1] + (pad, ))], dim=-1).contiguous()
+
+
+@transform_input_dim
+def _pad_input_dim(t: torch.Tensor, *, dst_dim: int) -> torch.Tensor:
+    pad = dst_dim - t.shape[0]
+    if pad <= 0:
+        return t
+    return torch.cat([t, t.new_zeros((pad, ) + t.shape[1:])], dim=0).contiguous()
+
+
+def _padded_vit_head_dim(real_hd: int) -> int:
+    if real_hd not in _VIT_HEAD_DIM_PADDED:
+        raise NotImplementedError(
+            f'Qwen2 ViT head_dim={real_hd} is not supported; '
+            f'known: {sorted(_VIT_HEAD_DIM_PADDED)}')
+    return _VIT_HEAD_DIM_PADDED[real_hd]
+
+
+def _to_tm_norm_type(norm_type: str):
+    if norm_type == 'layer_norm':
+        return _tm.NormType.LAYER_NORM
+    if norm_type == 'rms_norm':
+        return _tm.NormType.RMS_NORM
+    raise ValueError(f'Unsupported Qwen2 ViT norm_type: {norm_type!r}')
+
+
+class _BaseQwen2VisionModel(TextModel):
+    """Common Qwen2-VL vision sub-tree rooted at ``ModelRoot.vision_model``."""
+
+    _gated_mlp = False
+    _norm_type = ''
+    _use_window_attention = False
+
+    def __init__(self, cfg, *, resolver):
+        super().__init__(cfg, resolver=resolver)
+
+        self._vis_hidden = self._vision_hidden_size(cfg)
+        self._vis_out_hidden = int(getattr(cfg, 'out_hidden_size', getattr(cfg, 'hidden_size', self._vis_hidden)))
+        self._vis_inter = self._vision_intermediate_size(cfg)
+        self._vis_depth = int(cfg.depth)
+        self._vis_heads = int(cfg.num_heads)
+        self._vis_in_chans = int(getattr(cfg, 'in_channels', getattr(cfg, 'in_chans', 3)))
+        self._vis_patch = int(cfg.patch_size)
+        self._vis_temporal = int(cfg.temporal_patch_size)
+        self._vis_spatial_merge = int(cfg.spatial_merge_size)
+        self._vis_norm_eps = 1e-6
+        self._window_size = self._vision_window_size(cfg)
+        self._fullatt_block_indexes = self._vision_fullatt_block_indexes(cfg)
+
+        self._patch_in_dim = (self._vis_in_chans
+                              * self._vis_temporal
+                              * self._vis_patch
+                              * self._vis_patch)
+
+    def _vision_hidden_size(self, cfg):
+        return int(getattr(cfg, 'hidden_size', getattr(cfg, 'embed_dim', 0)))
+
+    def _vision_intermediate_size(self, cfg):
+        raise NotImplementedError
+
+    def _vision_window_size(self, cfg):
+        return 0
+
+    def _vision_fullatt_block_indexes(self, cfg):
+        return []
+
+    def _torch_dtype(self):
+        if self._resolver.data_type == _tm.DataType.TYPE_FP16:
+            return torch.float16
+        if self._resolver.data_type == _tm.DataType.TYPE_BF16:
+            return torch.bfloat16
+        if self._resolver.data_type == _tm.DataType.TYPE_FP32:
+            return torch.float32
+        return None
+
+    def _tm_tensor(self, tensor: torch.Tensor):
+        if not isinstance(tensor, torch.Tensor):
+            raise TypeError(f'Qwen2 ViT multimodal data should be a torch.Tensor, got {type(tensor).__name__}')
+        target_dtype = self._torch_dtype()
+        if target_dtype is not None and tensor.is_floating_point() and tensor.dtype != target_dtype:
+            tensor = tensor.to(target_dtype)
+        return _tm.from_dlpack(tensor.contiguous())
+
+    @staticmethod
+    def _grid_thw(grid_thw) -> tuple[int, int, int]:
+        if isinstance(grid_thw, torch.Tensor):
+            values = grid_thw.flatten().tolist()
+        else:
+            values = list(grid_thw)
+        if len(values) != 3:
+            raise ValueError(f'Qwen2 ViT grid_thw should contain 3 values, got {values!r}')
+        return int(values[0]), int(values[1]), int(values[2])
+
+    @staticmethod
+    def _token_range(input_mm: dict[str, Any]) -> tuple[int, int]:
+        offset = input_mm['offset']
+        if isinstance(offset, torch.Tensor):
+            values = offset.flatten().tolist()
+        elif isinstance(offset, (list, tuple)):
+            values = list(offset)
+        else:
+            values = [offset]
+        if len(values) == 2:
+            return int(values[0]), int(values[1])
+        if len(values) != 1:
+            raise ValueError(f'Qwen2 ViT offset should contain 1 or 2 values, got {values!r}')
+        tokens = input_mm['image_tokens']
+        if isinstance(tokens, torch.Tensor):
+            tokens = tokens.flatten()[0].item()
+        return int(values[0]), int(values[0]) + int(tokens)
+
+    def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]):
+        items = []
+        for input_mm in multimodal:
+            modality = input_mm.get('modality', Modality.IMAGE)
+            if modality not in (Modality.IMAGE, Modality.IMAGE.value, 'image'):
+                raise ValueError(f'Qwen2 TurboMind native vision only supports image inputs, got {modality!r}')
+
+            data = self._tm_tensor(input_mm['pixel_values'])
+            grid_thw = self._grid_thw(input_mm['image_grid_thw'])
+            token_begin, token_end = self._token_range(input_mm)
+            items.append(
+                _tm.multimodal.QwenVitItem(
+                    modality=_tm.multimodal.Modality.IMAGE,
+                    data=data,
+                    token_begin=token_begin,
+                    token_end=token_end,
+                    grid_thw=grid_thw,
+                ))
+
+        return _tm.multimodal.QwenVitInput(items)
+
+    def model(self, pfx):
+        self._build_vision_model(pfx + 'visual')
+
+    def _restore_dtype(self, builder):
+        builder.config.data_type = self._resolver.data_type
+        return builder
+
+    def _make_vision_root_cfg(self):
+        cfg = _tm.QwenVitConfig()
+        cfg.data_type = self._resolver.data_type
+        cfg.hidden_dim = self._vis_hidden
+        cfg.out_hidden_dim = self._vis_out_hidden
+        cfg.depth = self._vis_depth
+        cfg.head_num = self._vis_heads
+        cfg.intermediate_size = self._padded_inter_size()
+        cfg.patch_in_dim = self._patch_in_dim
+        cfg.in_channels = self._vis_in_chans
+        cfg.patch_size = self._vis_patch
+        cfg.temporal_patch_size = self._vis_temporal
+        cfg.spatial_merge_size = self._vis_spatial_merge
+        cfg.window_size = self._window_size
+        cfg.gated_mlp = self._gated_mlp
+        cfg.use_window_attention = self._use_window_attention
+        cfg.norm_type = _to_tm_norm_type(self._norm_type)
+        cfg.fullatt_block_indexes = self._fullatt_block_indexes
+        cfg.norm_eps = self._vis_norm_eps
+        return cfg
+
+    def _build_vision_model(self, pfx):
+        cfg = self._make_vision_root_cfg()
+        root = self._restore_dtype(VisionModelBuilder(
+            cfg, self._ctx,
+            root_handles=self._root_handles,
+            tp=self._model_tp))
+
+        root._add_linear('patch_embed', self._patch_embed(pfx + 'patch_embed.proj'))
+        root.blocks = self.vit_blocks(pfx + 'blocks')
+        root.merger_norm = self._vision_norm(pfx + 'merger.ln_q', dim=self._vis_hidden)
+        root._add_linear('merger_fc1', self._linear(pfx + 'merger.mlp.0'), SplitSide.OUTPUT)
+        root._add_linear('merger_fc2', self._linear(pfx + 'merger.mlp.2'), SplitSide.INPUT)
+        root.build()
+
+    def _patch_embed(self, pfx):
+        weight = pfx.pop('weight')
+        weight = weight.reshape(weight.shape[0], -1).t().contiguous()
+        return Linear(tensors={'weight': weight}, weight_format=TrivialFormat())
+
+    def vit_blocks(self, pfx):
+        blocks = ModuleListBuilder(ModuleListConfig(), self._ctx)
+        for i, p in pfx.slices(0, self._vis_depth):
+            blocks[i] = self.vit_block(p)
+        return blocks.build()
+
+    def vit_block(self, pfx):
+        inter_size = self._padded_inter_size()
+        cfg = _tm.QwenVitBlockConfig()
+        cfg.data_type = self._resolver.data_type
+        cfg.hidden_dim = self._vis_hidden
+        cfg.head_num = self._vis_heads
+        cfg.intermediate_size = inter_size
+        cfg.norm_eps = self._vis_norm_eps
+
+        b = self._restore_dtype(Builder(cfg, self._ctx))
+        b.tp = self._model_tp
+        b.norm1 = self._vision_norm(pfx + 'norm1', dim=self._vis_hidden)
+        b.norm2 = self._vision_norm(pfx + 'norm2', dim=self._vis_hidden)
+        b.attention = self.vit_attn(pfx + 'attn')
+        self._add_mlp(b, pfx, inter_size=inter_size)
+        return b.build()
+
+    def _add_mlp(self, builder, pfx, *, inter_size: int):
+        raise NotImplementedError
+
+    def _padded_inter_size(self):
+        # Bias/activation kernels vectorize half/bf16 in 8-element chunks.
+        # Pad the global intermediate so each TP shard has aligned width.
+        align = max(1, self._model_tp.size) * 8
+        return ((self._vis_inter + align - 1) // align) * align
+
+    def _pad_plain_mlp(self, fc1: Linear, fc2: Linear, *, inter_size: int):
+        if inter_size == self._vis_inter:
+            return fc1, fc2
+        return _pad_output_dim(fc1, dst_dim=inter_size), _pad_input_dim(fc2, dst_dim=inter_size)
+
+    def _pad_mlp(self, gate: Linear, up: Linear, down: Linear, *, inter_size: int):
+        if inter_size == self._vis_inter:
+            return gate, up, down
+        return (_pad_output_dim(gate, dst_dim=inter_size),
+                _pad_output_dim(up, dst_dim=inter_size),
+                _pad_input_dim(down, dst_dim=inter_size))
+
+    def _make_vision_attn_cfg(self):
+        real_hd = self._vis_hidden // self._vis_heads
+        padded_hd = _padded_vit_head_dim(real_hd)
+        cfg = _tm.AttentionConfig()
+        cfg.data_type = self._resolver.data_type
+        cfg.hidden_dim = self._vis_hidden
+        cfg.head_dim = padded_hd
+        cfg.head_num = self._vis_heads
+        cfg.kv_head_num = self._vis_heads
+        cfg.window_size = 0
+        cfg.causal = False
+        cfg.softmax_scale = (1.0 / math.sqrt(real_hd) if padded_hd != real_hd else 0.0)
+        return cfg
+
+    def vit_attn(self, pfx):
+        cfg = self._make_vision_attn_cfg()
+        real_hd = self._vis_hidden // self._vis_heads
+        padded_hd = cfg.head_dim
+        H = cfg.head_num
+
+        q, k, v = _split_packed_vision_qkv(self._linear(pfx + 'qkv'))
+        q = reorder_rotary_emb(q, real_hd, real_hd, resolver=self._resolver)
+        k = reorder_rotary_emb(k, real_hd, real_hd, resolver=self._resolver)
+        proj = self._linear(pfx + 'proj')
+
+        if padded_hd != real_hd:
+            for ln, name in [(q, 'q'), (k, 'k'), (v, 'v'), (proj, 'proj')]:
+                _assert_trivial(ln, name)
+            pad_kwargs = dict(num_heads=H, src_hd=real_hd, dst_hd=padded_hd)
+            q = _pad_head_dim_out(q, **pad_kwargs)
+            k = _pad_head_dim_out(k, **pad_kwargs)
+            v = _pad_head_dim_out(v, **pad_kwargs)
+            proj = _pad_head_dim_in(proj, **pad_kwargs)
+
+        attn_tp = self._model_tp if self._vis_heads % self._model_tp.size == 0 else ParallelGroup(1, None)
+        m = self._restore_dtype(AttentionBuilder(cfg, self._ctx, tp=attn_tp))
+        m.add_qkv_proj(q, k, v)
+        m.add_o_proj(proj)
+        return m.build()
+
+    def _vision_norm(self, pfx, *, dim: int):
+        raise NotImplementedError
+
+    def _layer_norm(self, pfx, *, dim: int):
+        weight = pfx.pop('weight')
+        bias = pfx.pop('bias') if pfx.has('bias') else None
+        cfg = make_layer_norm_config(dim=dim,
+                                     data_type=self._resolver.data_type,
+                                     norm_eps=self._vis_norm_eps)
+        m = self._restore_dtype(LayerNormBuilder(cfg, self._ctx))
+        m.set_weight(weight, bias=bias)
+        return m.build()
+
+    def _rms_norm(self, pfx, *, dim: int):
+        weight = pfx.pop('weight')
+        cfg = make_norm_config(dim=dim, norm_eps=self._vis_norm_eps)
+        cfg.data_type = self._resolver.data_type
+        m = self._restore_dtype(NormBuilder(cfg, self._ctx))
+        m.set_weight(weight)
+        return m.build()
+
+
+class Qwen2VisionModel(_BaseQwen2VisionModel):
+    """Qwen2-VL vision tower."""
+
+    _norm_type = 'layer_norm'
+
+    def _vision_hidden_size(self, cfg):
+        return int(cfg.embed_dim)
+
+    def _vision_intermediate_size(self, cfg):
+        return int(self._vis_hidden * cfg.mlp_ratio)
+
+    def _vision_norm(self, pfx, *, dim: int):
+        return self._layer_norm(pfx, dim=dim)
+
+    def _add_mlp(self, builder, pfx, *, inter_size: int):
+        fc1, fc2 = self._pad_plain_mlp(
+            self._linear(pfx + 'mlp.fc1'),
+            self._linear(pfx + 'mlp.fc2'),
+            inter_size=inter_size,
+        )
+        builder._add_linear('mlp_fc1', fc1, SplitSide.OUTPUT)
+        builder._add_linear('mlp_fc2', fc2, SplitSide.INPUT)
+
+
+class Qwen2_5VisionModel(_BaseQwen2VisionModel):
+    """Qwen2.5-VL vision tower."""
+
+    _gated_mlp = True
+    _norm_type = 'rms_norm'
+    _use_window_attention = True
+
+    def _vision_intermediate_size(self, cfg):
+        return int(cfg.intermediate_size)
+
+    def _vision_window_size(self, cfg):
+        return int(cfg.window_size)
+
+    def _vision_fullatt_block_indexes(self, cfg):
+        return [int(x) for x in cfg.fullatt_block_indexes]
+
+    def _vision_norm(self, pfx, *, dim: int):
+        return self._rms_norm(pfx, dim=dim)
+
+    def _add_mlp(self, builder, pfx, *, inter_size: int):
+        gate, up, down = self._pad_mlp(
+            self._linear(pfx + 'mlp.gate_proj'),
+            self._linear(pfx + 'mlp.up_proj'),
+            self._linear(pfx + 'mlp.down_proj'),
+            inter_size=inter_size,
+        )
+        builder._add_linear('mlp_gate', gate, SplitSide.OUTPUT)
+        builder._add_linear('mlp_fc1', up, SplitSide.OUTPUT)
+        builder._add_linear('mlp_fc2', down, SplitSide.INPUT)
+
+
+_VISION_MODEL_CLS = {
+    'Qwen2VLForConditionalGeneration': Qwen2VisionModel,
+    'Qwen2_5_VLForConditionalGeneration': Qwen2_5VisionModel,
+}
+
+
+@INPUT_MODELS.register_module(name='qwen2_vl')
+class Qwen2VLModel:
+    """Aggregate source model for Qwen2-VL and Qwen2.5-VL checkpoints."""
+
+    _vision = True
+
+    def __init__(self, cfg, *, resolver, vision_resolver=None, disable_vision_encoder: bool = False):
+        text_cfg = getattr(cfg, 'text_config', cfg)
+        if text_cfg is None:
+            raise ValueError('Qwen2VLModel requires a checkpoint with text_config.')
+        if not hasattr(text_cfg, 'tie_word_embeddings'):
+            text_cfg.tie_word_embeddings = getattr(cfg, 'tie_word_embeddings', False)
+        self.text_model = Qwen2Model(text_cfg, resolver=resolver)
+
+        archs = getattr(cfg, 'architectures', None) or []
+        self._arch = archs[0] if archs else ''
+        vision_cfg = getattr(cfg, 'vision_config', None)
+        if disable_vision_encoder or vision_cfg is None:
+            self.vision_model = None
+        else:
+            vision_cls = _VISION_MODEL_CLS.get(self._arch)
+            if vision_cls is None:
+                raise ValueError(f'Unsupported Qwen2-VL architecture: {self._arch!r}')
+            self.vision_model = vision_cls(vision_cfg, resolver=vision_resolver or resolver)
+
+    def bind_runtime(self, *, ctx, root_handles, attn_tp, mlp_tp, model_tp):
+        for m in (self.text_model, self.vision_model):
+            if m is not None:
+                m.bind_runtime(
+                    ctx=ctx,
+                    root_handles=root_handles,
+                    attn_tp=attn_tp,
+                    mlp_tp=mlp_tp,
+                    model_tp=model_tp,
+                )
+
+    @property
+    def _vocab_size(self):
+        return self.text_model.cfg.vocab_size
+
+    @property
+    def _loader_mappings(self):
+        return list(getattr(type(self.text_model), '_loader_mappings', []))
+
+    def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]):
+        if self.vision_model is None:
+            raise ValueError('Qwen2 TurboMind vision encoder is not available.')
+        return self.vision_model.to_turbomind_multimodal(multimodal)
+
+    def model(self, pfx):
+        self.text_model.model(pfx)
+        if self.vision_model is not None:
+            self.vision_model.model(pfx)
diff --git a/lmdeploy/turbomind/models/qwen3_5.py b/lmdeploy/turbomind/models/qwen3_5.py
index 0d13f372ae..45ccd9daa3 100644
--- a/lmdeploy/turbomind/models/qwen3_5.py
+++ b/lmdeploy/turbomind/models/qwen3_5.py
@@ -45,6 +45,7 @@
     _act_type_id,
     make_layer_norm_config,
 )
+from ..builders._base import ParallelGroup
 from ..builders.attention import split_output_gate
 from ..linear import Linear, transform_input_dim, transform_output_dim
 from ..text_model import TextModel
@@ -402,7 +403,7 @@ def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]):
 
             token_begin, token_end = self._offset_pair(input_mm['offset'])
             items.append(
-                _tm.multimodal.Qwen3_5VitItem(
+                _tm.multimodal.QwenVitItem(
                     modality=tm_modality,
                     data=data,
                     token_begin=token_begin,
@@ -410,7 +411,7 @@ def to_turbomind_multimodal(self, multimodal: list[dict[str, Any]]):
                     grid_thw=grid_thw,
                 ))
 
-        return _tm.multimodal.Qwen3_5VitInput(items)
+        return _tm.multimodal.QwenVitInput(items)
 
     # ------------------------------------------------------------------
     # model() — build the vision sub-tree
@@ -448,7 +449,7 @@ def _build_vision_model(self, pfx):
         root.build()
 
     def _make_vision_root_cfg(self):
-        cfg = _tm.Qwen3_5VitConfig()
+        cfg = _tm.QwenVitConfig()
         cfg.data_type = self._resolver.data_type
         cfg.hidden_dim = self._vis_hidden
         cfg.out_hidden_dim = self._vis_out_hidden
@@ -461,6 +462,8 @@ def _make_vision_root_cfg(self):
         cfg.temporal_patch_size = self._vis_temporal
         cfg.num_position_embeddings = self._vis_pos_n
         cfg.spatial_merge_size = self._vis_spatial_merge
+        # Qwen3.5 ViT MLP uses the tanh-approximation GELU.
+        cfg.gelu_tanh = True
         cfg.norm_eps = self._vis_norm_eps
         return cfg
 
@@ -481,7 +484,7 @@ def vit_blocks(self, pfx):
         return blocks.build()
 
     def vit_block(self, pfx):
-        cfg = _tm.Qwen3_5VitBlockConfig()
+        cfg = _tm.QwenVitBlockConfig()
         cfg.data_type = self._resolver.data_type
         cfg.hidden_dim = self._vis_hidden
         cfg.head_num = self._vis_heads
@@ -546,8 +549,9 @@ def vit_attn(self, pfx):
             v = _pad_head_dim_out(v, **pad_kwargs)
             proj = _pad_head_dim_in(proj, **pad_kwargs)
 
+        attn_tp = self._model_tp if self._vis_heads % self._model_tp.size == 0 else ParallelGroup(1, None)
         m = self._restore_dtype(
-            AttentionBuilder(cfg, self._ctx, tp=self._model_tp))
+            AttentionBuilder(cfg, self._ctx, tp=attn_tp))
         m.add_qkv_proj(q, k, v)
         m.add_o_proj(proj)
         return m.build()
diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py
index 95f6f24d32..43c31ac465 100644
--- a/lmdeploy/turbomind/supported_models.py
+++ b/lmdeploy/turbomind/supported_models.py
@@ -8,6 +8,9 @@
     # Qwen2 / Qwen2-MoE
     Qwen2ForCausalLM='qwen2',
     Qwen2MoeForCausalLM='qwen2-moe',
+    # Qwen2-vl / Qwen2.5-vl
+    Qwen2VLForConditionalGeneration='qwen2_vl',
+    Qwen2_5_VLForConditionalGeneration='qwen2_vl',
     # Qwen3
     Qwen3ForCausalLM='qwen3',
     Qwen3MoeForCausalLM='qwen3-moe',
diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index f2a5f62ccf..e8d3aaf6ac 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -203,6 +203,14 @@ async def wrap_for_turbomind(
                         ...
                     }
         """
+        if self.model._turbomind_native_vision:
+            return await self.wrap_for_pytorch(messages,
+                                               chat_template,
+                                               tokenizer,
+                                               sequence_start,
+                                               tools=tools,
+                                               chat_template_kwargs=chat_template_kwargs)
+
         result = self.model.to_turbomind(messages,
                                          chat_template,
                                          tokenizer,
diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py
index 1534ad3388..69dead5450 100644
--- a/lmdeploy/vl/model/internvl.py
+++ b/lmdeploy/vl/model/internvl.py
@@ -76,6 +76,7 @@ class InternVLVisionModel(VisionModel):
     """InternVL vision model."""
 
     _arch = 'InternVLChatModel'
+    _turbomind_native_vision = True
 
     def __init__(self,
                  model_path: str,
@@ -210,7 +211,7 @@ def preprocess(self, messages: list[dict]) -> list[dict]:
             pixel_values = self.processor(image, params)
             image_tokens = (pixel_values.shape[0] * self.image_tokens_per_patch)
             outputs.append(
-                dict(pixel_values=pixel_values,
+                dict(pixel_values=pixel_values.to(self.mm_feature_dtype),
                      image_tokens=image_tokens,
                      image_token_id=self.image_token_id,
                      image_size=image.size))
diff --git a/lmdeploy/vl/model/internvl3_hf.py b/lmdeploy/vl/model/internvl3_hf.py
index 3816cfe491..2c6e836b97 100644
--- a/lmdeploy/vl/model/internvl3_hf.py
+++ b/lmdeploy/vl/model/internvl3_hf.py
@@ -35,6 +35,7 @@ class InternVL3VisionModel(InternVLVisionModel):
     """Internvl3 vision model."""
 
     _arch = ['InternVLForConditionalGeneration', 'InternS1ForConditionalGeneration']
+    _turbomind_native_vision = True
 
     def __init__(self,
                  model_path: str,
@@ -107,7 +108,7 @@ def preprocess(self, messages: list[dict]) -> list[dict]:
             cur_num_patches = image_num_patches[idx]
             pixel_values = image_pixel_values[cum_num_patches:cum_num_patches + cur_num_patches, ...]
             cum_num_patches += cur_num_patches
-            data = dict(pixel_values=pixel_values,
+            data = dict(pixel_values=pixel_values.to(self.mm_feature_dtype),
                         image_tokens=self.image_tokens_per_patch * cur_num_patches,
                         image_token_id=self.image_token_id)
             outputs.append(data)
diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py
index 98e3f8cd09..d87c65fafa 100644
--- a/lmdeploy/vl/model/qwen2.py
+++ b/lmdeploy/vl/model/qwen2.py
@@ -5,28 +5,14 @@
 from lmdeploy.vl.model.utils import disable_logging
 
 
-def check_qwen_vl_deps_install():
-    """Check qwen_vl_utils."""
-    try:
-        import qwen_vl_utils  # noqa: F401
-    except ImportError:
-        raise ImportError('please install qwen_vl_utils by `pip install qwen_vl_utils`'  # noqa: E501
-                          )
-    try:
-        from transformers import Qwen2VLForConditionalGeneration  # noqa: F401
-    except ImportError:
-        raise ImportError('please install latest transformers by '
-                          'pip install git+https://github.com/huggingface/transformers.git')
-
-
 @VISION_MODELS.register_module()
 class Qwen2VLModel(VisionModel):
     """Qwen2VL model."""
 
     _arch = ['Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration']
+    _turbomind_native_vision = True
 
     def build_preprocessor(self, trust_remote_code: bool = False):
-        check_qwen_vl_deps_install()
         from transformers import AutoProcessor
         self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=trust_remote_code)
         tokenizer = self.processor.tokenizer
@@ -35,16 +21,12 @@ def build_preprocessor(self, trust_remote_code: bool = False):
 
     def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to `super().preprocess()` for spec."""
-        from qwen_vl_utils import process_vision_info
-
         images = self.collect_multimodal_items(messages)
-        optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'}
+        optional_keys = {'min_pixels', 'max_pixels'}
         outputs = []
         for modality, image, params in images:
-            item = dict(type='image', image=image)
-            item.update({key: params[key] for key in params.keys() if key in optional_keys})
-            image_inputs, _ = process_vision_info([dict(content=[item])])
-            result = self.processor.image_processor(images=image_inputs, return_tensors='pt')
+            image_kwargs = {key: params[key] for key in params.keys() if key in optional_keys}
+            result = self.processor.image_processor(images=[image], return_tensors='pt', **image_kwargs)
             merge_length = self.processor.image_processor.merge_size**2
             image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
             result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
@@ -53,7 +35,6 @@ def preprocess(self, messages: list[dict]) -> list[dict]:
         return messages
 
     def build_model(self, trust_remote_code: bool = False):
-        check_qwen_vl_deps_install()
         arch = self.hf_config.architectures[0]
         if arch == 'Qwen2VLForConditionalGeneration':
             from transformers import Qwen2VLForConditionalGeneration as AutoModelCls
diff --git a/src/turbomind/kernels/activation.cu b/src/turbomind/kernels/activation.cu
index cab79c6ee7..6d9f7a5ae7 100644
--- a/src/turbomind/kernels/activation.cu
+++ b/src/turbomind/kernels/activation.cu
@@ -2,10 +2,13 @@
 #include "src/turbomind/core/data_type.h"
 
 #include "src/turbomind/kernels/activation.h"
+#include "src/turbomind/kernels/activation_ops.h"
 #include "src/turbomind/kernels/core/array_ops.h"
 #include "src/turbomind/kernels/core/common.h"
 #include "src/turbomind/utils/cuda_utils.h"
 
+#include <type_traits>
+
 namespace turbomind {
 
 template<class T>
@@ -193,4 +196,95 @@ void Activation(Tensor&             gate_up,  //
     TM_CUDA_CHECK(cudaGetLastError());
 }
 
+template<int vec_size, typename T, template<typename> class Activation>
+__global__ void AddBiasActivationKernel(T* data, const T* __restrict__ bias, int64_t stride, int num, int dim)
+{
+    const int ti = blockIdx.x;
+    const int di = (threadIdx.x + blockIdx.y * blockDim.x) * vec_size;
+
+    if (ti >= num || di >= dim) {
+        return;
+    }
+
+    Array<T, vec_size> x_vec;
+    Load(x_vec, data + ti * stride + di);
+
+    auto x = cast<float>(x_vec);
+
+    if (bias) {
+        Array<T, vec_size> bias_vec;
+        Ldg(bias_vec, bias + di);
+        using namespace ops;
+        x = x + cast<float>(bias_vec);
+    }
+
+    PRAGMA_UNROLL
+    for (int i = 0; i < vec_size; ++i) {
+        x[i] = Activation<float>::apply(x[i]);
+    }
+
+    Store(data + ti * stride + di, cast<T>(x));
+}
+
+void invokeAddBiasActivation(Tensor& x, const Tensor& bias, ActivationType type, cudaStream_t stream)
+{
+    if (x.size() == 0) {
+        return;
+    }
+
+    TM_CHECK_EQ(x.ndim(), 2);
+    if (bias) {
+        TM_CHECK_EQ(bias.shape(-1), x.shape(-1));
+        TM_CHECK_EQ(bias.dtype(), x.dtype());
+    }
+
+    auto invoke = [&](auto t) {
+        using T               = decltype(t);
+        constexpr int max_vec = sizeof(uint4) / sizeof(T);
+        constexpr int threads = 512;
+
+        const int     num    = x.shape(0);
+        const int     dim    = x.shape(1);
+        const int64_t stride = x.stride(0);
+
+        int best_vec_size = 1;
+        for (int v = max_vec; v >= 1; v >>= 1) {
+            if (dim % v == 0 && stride % v == 0) {
+                best_vec_size = v;
+                break;
+            }
+        }
+
+        auto launch = [&](auto vec_size_) {
+            constexpr int vec_size = decltype(vec_size_)::value;
+            const dim3    grid(num, cdiv(dim, threads * vec_size));
+            if (type == ActivationType::kGeluPytorchTanh) {
+                AddBiasActivationKernel<vec_size, T, GeluActivation>
+                    <<<grid, threads, 0, stream>>>(x.data<T>(), bias.data_or((T*)nullptr), stride, num, dim);
+            }
+            else if (type == ActivationType::kGelu) {
+                AddBiasActivationKernel<vec_size, T, GeluErfActivation>
+                    <<<grid, threads, 0, stream>>>(x.data<T>(), bias.data_or((T*)nullptr), stride, num, dim);
+            }
+            else {
+                TM_LOG_FATAL("unsupported add-bias activation type: {}", (int)type);
+            }
+        };
+
+        switch (best_vec_size) {
+            case 8:
+                return launch(std::integral_constant<int, 8>{});
+            case 4:
+                return launch(std::integral_constant<int, 4>{});
+            case 2:
+                return launch(std::integral_constant<int, 2>{});
+            default:
+                return launch(std::integral_constant<int, 1>{});
+        }
+    };
+
+    TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
 }  // namespace turbomind
diff --git a/src/turbomind/kernels/activation.h b/src/turbomind/kernels/activation.h
index 2eceeb9d89..1d13c63bdc 100644
--- a/src/turbomind/kernels/activation.h
+++ b/src/turbomind/kernels/activation.h
@@ -20,4 +20,9 @@ void Activation(Tensor&             gate_up,  //
                 ActivationType      type,
                 cudaStream_t        stream);
 
+// In-place add-bias + unary activation: x <- activation(x + bias).
+// `x` is a 2D tensor; `bias` (optional) broadcasts over the last dim.
+// Supports kGelu (erf) and kGeluPytorchTanh (tanh approximation).
+void invokeAddBiasActivation(Tensor& x, const Tensor& bias, ActivationType type, cudaStream_t stream);
+
 }  // namespace turbomind
diff --git a/src/turbomind/kernels/attention/attention_universal.h b/src/turbomind/kernels/attention/attention_universal.h
index c7960de01a..e6b192c2d2 100644
--- a/src/turbomind/kernels/attention/attention_universal.h
+++ b/src/turbomind/kernels/attention/attention_universal.h
@@ -234,11 +234,12 @@ struct AttentionUniversal {
             rope.init(di);
             PRAGMA_UNROLL
             for (int s = 0; s < ITER_S; ++s) {
-                const int ti = (offset.y + s * Map::kDeltaS) / CTA_H + query_idx + history_len;
-                rope.apply(vec_Q[s][c], ti);
+                const int qi = (offset.y + s * Map::kDeltaS) / CTA_H + query_idx;
+                const int ti = qi + history_len;
+                rope.apply(vec_Q[s][c], ti, qi);
                 if constexpr (kProcessKV) {
                     if (s == 0) {
-                        rope.apply(vec_K[0][c], ti);
+                        rope.apply(vec_K[0][c], ti, qi);
                     }
                 }
             }
diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu
index 14d64caed2..1715394074 100644
--- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu
+++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu
@@ -132,8 +132,9 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char**          blocks,
             rope.init(di);
             PRAGMA_UNROLL
             for (int s = 0; s < ITER_S; ++s) {
-                const int ti = history_len + offset.y + s * Map::kDeltaS + token_idx;  // sequence local
-                rope.apply(vec_K[s][c], ti);
+                const int qi = offset.y + s * Map::kDeltaS + token_idx;
+                const int ti = history_len + qi;  // sequence local
+                rope.apply(vec_K[s][c], ti, qi);
             }
         }
     }
@@ -430,7 +431,7 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T*              k,
             PRAGMA_UNROLL
             for (int s = 0; s < ITER_S; ++s) {
                 const int ti = offset.y + s * Map::kDeltaS + token_idx;  // sequence local
-                rope.apply(out_K[s][c], ti);
+                rope.apply(out_K[s][c], ti, ti);
             }
         }
     }
diff --git a/src/turbomind/kernels/attention/rotary_embedding.h b/src/turbomind/kernels/attention/rotary_embedding.h
index 91a5316d62..d4281333a0 100644
--- a/src/turbomind/kernels/attention/rotary_embedding.h
+++ b/src/turbomind/kernels/attention/rotary_embedding.h
@@ -82,8 +82,8 @@ struct FastRoPE {
         }
         // mrope is an operation applied on top of any base rope type
         if (param_.mrope_mode != MropeMode::kNone) {
-            param_.mrope.position_ids += batch_idx * param_.mrope.stride;
             param_.mrope.position_delta += batch_idx;
+            param_.mrope.position_offsets += batch_idx;
             param_.mrope.length += batch_idx;
         }
     }
@@ -110,7 +110,7 @@ struct FastRoPE {
     }
 
     template<typename T>
-    __device__ void apply(Array<T, N>& x, float timestep)
+    __device__ void apply(Array<T, N>& x, float timestep, int token_idx)
     {
         if (param_.mrope_mode == MropeMode::kNone) {
             // Most models apply rotary embedding in half precision
@@ -120,17 +120,18 @@ struct FastRoPE {
             }
         }
         else if (param_.mrope_mode == MropeMode::kChunked) {
-            apply_mrope_impl<MropeMode::kChunked>(x, timestep);
+            apply_mrope_impl<MropeMode::kChunked>(x, timestep, token_idx);
         }
         else if (param_.mrope_mode == MropeMode::kInterleaved) {
-            apply_mrope_impl<MropeMode::kInterleaved>(x, timestep);
+            apply_mrope_impl<MropeMode::kInterleaved>(x, timestep, token_idx);
         }
     }
 
-    __device__ __forceinline__ MropeCoord get_mrope_coord(float timestep) const
+    __device__ __forceinline__ MropeCoord get_mrope_coord(float timestep, int token_idx) const
     {
-        if (timestep < *param_.mrope.length) {
-            const int* t = param_.mrope.position_ids + 3 * (int)timestep;
+        if (token_idx < *param_.mrope.length) {
+            const int  row = *param_.mrope.position_offsets + token_idx;
+            const int* t   = param_.mrope.position_ids + 3 * row;
             return {t[0], t[1], t[2]};
         }
         const int pos = (int)timestep + (*param_.mrope.position_delta);
@@ -180,9 +181,9 @@ struct FastRoPE {
     }
 
     template<MropeMode mode, typename T>
-    __device__ __forceinline__ void apply_mrope_impl(Array<T, N>& x, float timestep) const
+    __device__ __forceinline__ void apply_mrope_impl(Array<T, N>& x, float timestep, int token_idx) const
     {
-        const MropeCoord coord = get_mrope_coord(timestep);
+        const MropeCoord coord = get_mrope_coord(timestep, token_idx);
         PRAGMA_UNROLL
         for (int i = 0; i < N; i += 2) {
             const int pair_idx = (i + idx_) >> 1;
diff --git a/src/turbomind/kernels/norm/norm.h b/src/turbomind/kernels/norm/norm.h
new file mode 100644
index 0000000000..80b9894f9c
--- /dev/null
+++ b/src/turbomind/kernels/norm/norm.h
@@ -0,0 +1,14 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#pragma once
+
+namespace turbomind {
+
+enum class NormType
+{
+    kNone,
+    kLayerNorm,
+    kRMSNorm,
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/CMakeLists.txt b/src/turbomind/models/CMakeLists.txt
index 376b035664..18bf9caad7 100644
--- a/src/turbomind/models/CMakeLists.txt
+++ b/src/turbomind/models/CMakeLists.txt
@@ -15,15 +15,14 @@ add_library(models STATIC
         model_weight.cc
         model_root.cc
         vision_model.cc
-        qwen3_5vit/fast_pos_embed.cu
-        qwen3_5vit/fast_rotary_pos_emb.cu
-        qwen3_5vit/fused_embed_merge.cu
-        qwen3_5vit/qkv_preprocess.cu
-        qwen3_5vit/mrope_position_ids.cu
-        qwen3_5vit/bias_gelu.cu
-        qwen3_5vit/qwen3_5vit_block_weight.cc
-        qwen3_5vit/qwen3_5vit_weight.cc
-        qwen3_5vit/qwen3_5vit.cc
+        qwenvit/qwenvit_kernels.cu
+        qwenvit/qwenvit_block_weight.cc
+        qwenvit/qwenvit_weight.cc
+        qwenvit/qwenvit.cc
+        internvit/internvit_kernels.cu
+        internvit/internvit_block_weight.cc
+        internvit/internvit_weight.cc
+        internvit/internvit.cc
         llama/LlamaLinear.cu
         llama/BlockManager.cc
         llama/BlockTrie.cc
@@ -75,7 +74,7 @@ if(BUILD_TEST)
             CUDA::cudart)
 
     add_executable(test_mrope_position_ids
-            qwen3_5vit/test_mrope_position_ids.cu)
+            qwenvit/test_mrope_position_ids.cu)
     set_property(TARGET test_mrope_position_ids PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
     target_link_libraries(test_mrope_position_ids PRIVATE
             models
diff --git a/src/turbomind/models/internvit/internvit.cc b/src/turbomind/models/internvit/internvit.cc
new file mode 100644
index 0000000000..c18ce1f2c0
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit.cc
@@ -0,0 +1,561 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/internvit/internvit.h"
+
+#include "src/turbomind/comm/device_comm.h"
+#include "src/turbomind/engine/request.h"
+#include "src/turbomind/kernels/activation.h"
+#include "src/turbomind/kernels/attention/attention.h"
+#include "src/turbomind/kernels/norm/layer_norm.h"
+#include "src/turbomind/kernels/norm/rms_norm.h"
+#include "src/turbomind/models/internvit/internvit_block_weight.h"
+#include "src/turbomind/models/internvit/internvit_input.h"
+#include "src/turbomind/models/internvit/internvit_kernels.h"
+#include "src/turbomind/models/internvit/internvit_weight.h"
+#include "src/turbomind/models/layer_norm_weight.h"
+#include "src/turbomind/models/linear_weight.h"
+#include "src/turbomind/models/llama/LlamaLinear.h"
+#include "src/turbomind/models/llama/SequenceManager.h"
+#include "src/turbomind/models/norm_weight.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+
+#include <array>
+#include <cmath>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+namespace turbomind {
+
+struct InternVit::Impl {
+    const InternVitWeight&       weights_;
+    const core::InternVitConfig& config_;
+    const comm::HostComm&        h_tp_group;
+    LlamaLinear&                 linear_;
+    comm::DeviceCommImpl* const  d_comm_;
+    const int                    tp_group_;
+    const int                    tp_size_;
+    const DataType               engine_data_type_;
+
+    Buffer_<int> attn_cu_seqlens_buf_;
+
+    struct Data {
+        Tensor                           batch_input;
+        int                              batch_size{};
+        std::vector<std::pair<int, int>> image_embeds_coords;
+        std::vector<std::pair<int, int>> input_embeds_coords;
+        Tensor_<int>                     attn_cu_seqlens;
+        Tensor_<bool>                    attn_finished;
+        int                              token_num{};
+        int                              seq_len{};
+
+        void Clear()
+        {
+            batch_size = 0;
+            token_num  = 0;
+            seq_len    = 0;
+            image_embeds_coords.clear();
+            input_embeds_coords.clear();
+        }
+    };
+
+    std::vector<Data> data_;
+
+    Impl(const EngineParam& engine, const Context& ctx, const InternVitWeight& weights, int phases):
+        weights_{weights},
+        config_{weights.config()},
+        h_tp_group{ctx.comm.h_comm},
+        linear_{*ctx.linear},
+        d_comm_{ctx.comm.d_comm},
+        tp_group_{ctx.comm.d_tp_group},
+        tp_size_{ctx.comm.h_tp_group ? ctx.comm.h_tp_group->n_ranks() : 1},
+        engine_data_type_{engine.data_type}
+    {
+        const auto& cfg = weights.config();
+        for (int i = 0; i < phases; ++i) {
+            auto& d           = data_.emplace_back();
+            d.batch_input     = {{engine.max_forward_token_num, cfg.in_channels, cfg.image_height, cfg.image_width},
+                             cfg.data_type,
+                             kCPUpinned};
+            d.attn_cu_seqlens = Tensor_<int>{{engine.max_forward_token_num + 1}, kDEVICE};
+            d.attn_finished   = Tensor_<bool>{{engine.max_forward_token_num}, kDEVICE};
+        }
+        attn_cu_seqlens_buf_ = {engine.max_forward_token_num + 1, kCPUpinned};
+    }
+
+    void AllReduceSum(Tensor& tensor, cudaStream_t stream) const
+    {
+        if (d_comm_ && tp_size_ > 1) {
+            d_comm_->AllReduceSum(
+                tensor.raw_data(), tensor.raw_data(), tensor.size(), tensor.dtype(), tp_group_, stream);
+            TM_CUDA_CHECK(cudaGetLastError());
+        }
+    }
+
+    void ApplyNorm(Tensor& out, const Tensor& input, const core::Module& norm, NormType norm_type) const
+    {
+        auto stream = core::Context::stream().handle();
+        switch (norm_type) {
+            case NormType::kLayerNorm: {
+                const auto& ln = static_cast<const LayerNormWeight&>(norm);
+                invokeLayerNorm(out, input, ln.weight, ln.bias, ln.norm_eps_, stream);
+                break;
+            }
+            case NormType::kRMSNorm: {
+                const auto& rms = static_cast<const NormWeight&>(norm);
+                invokeRMSNorm(out, input, rms.weight, rms.norm_eps_, stream);
+                break;
+            }
+            default:
+                TM_LOG_FATAL("unsupported InternVit norm type: {}", (int)norm_type);
+        }
+        TM_CUDA_CHECK(cudaGetLastError());
+    }
+
+    void ResidualScaleNorm(Tensor&             hidden_states,
+                           Tensor&             residual,
+                           const Tensor&       branch_output,
+                           const Tensor&       branch_scale,
+                           const Tensor&       branch_bias,
+                           const core::Module* norm,
+                           NormType            norm_type) const
+    {
+        auto stream = core::Context::stream().handle();
+        switch (norm_type) {
+            case NormType::kLayerNorm: {
+                const auto& ln = static_cast<const LayerNormWeight&>(*norm);
+                invokeInternVitResidualScaleNorm(hidden_states,
+                                                 residual,
+                                                 branch_output,
+                                                 branch_scale,
+                                                 branch_bias,
+                                                 ln.weight,
+                                                 ln.bias,
+                                                 ln.norm_eps_,
+                                                 norm_type,
+                                                 stream);
+                break;
+            }
+            case NormType::kRMSNorm: {
+                const auto& rms = static_cast<const NormWeight&>(*norm);
+                invokeInternVitResidualScaleNorm(hidden_states,
+                                                 residual,
+                                                 branch_output,
+                                                 branch_scale,
+                                                 branch_bias,
+                                                 rms.weight,
+                                                 {},
+                                                 rms.norm_eps_,
+                                                 norm_type,
+                                                 stream);
+                break;
+            }
+            case NormType::kNone: {
+                invokeInternVitResidualScaleNorm(
+                    hidden_states, residual, branch_output, branch_scale, branch_bias, {}, {}, 0.f, norm_type, stream);
+                break;
+            }
+            default:
+                TM_LOG_FATAL("unsupported InternVit norm type: {}", (int)norm_type);
+        }
+        TM_CUDA_CHECK(cudaGetLastError());
+    }
+
+    int Add(RequestCache& c)
+    {
+        const auto& [r, s] = std::tie(*c.req, *c.seq);
+        if (!r.mm_inputs) {
+            return Request::kOk;
+        }
+
+        if ((not r.session.start_flag) or (not r.session.end_flag)) {
+            return Request::kInvalid;
+        }
+
+        const auto mm_inputs = std::dynamic_pointer_cast<multimodal::InternVitInput>(r.mm_inputs);
+        if (!mm_inputs) {
+            return Request::kInvalid;
+        }
+
+        for (const auto& item : mm_inputs->items) {
+            if (item.modality != multimodal::Modality::kImage) {
+                return Request::kInvalid;
+            }
+
+            const int tokens = item.token_end - item.token_begin;
+            if (tokens <= 0) {
+                return Request::kInvalid;
+            }
+
+            auto mm_item = std::make_shared<MultiModalData>(
+                MultiModalData{item.data, Interval{item.token_begin, Interval::Size{tokens}}, std::array<int, 3>{}});
+            s.multimodal_inputs.push_back(mm_item);
+        }
+
+        return Request::kOk;
+    }
+
+    void Add(int /*phase*/, TensorMap& env)
+    {
+        const Buffer_<RequestCache*> rc = env.at("requests").buffer();
+        for (int i = 0; i < rc.size(); ++i) {
+            auto& c = *TM_CHECK_NOTNULL(rc[i]);
+            if (c.status == 0) {
+                c.status = Add(c);
+            }
+        }
+    }
+
+    void Setup(int phase, TensorMap& env)
+    {
+        auto&       d    = data_.at(phase);
+        auto&       b    = *env.at("batch").data<BatchData*>()[0];
+        auto&       copy = *env.at("copy").data<BatchCopy*>()[0];
+        const auto& cfg  = config_;
+
+        int input_ids_offsets    = 0;
+        int image_embeds_offsets = 0;
+        d.Clear();
+        std::vector<Tensor> pixel_values;
+
+        const auto& rc = b.rc;
+        for (int i = 0; i < rc.size(); ++i) {
+            const auto& c = *rc[i];
+            const auto& s = *c.seq;
+
+            if ((not c.autoregres) && (not s.multimodal_inputs.empty())) {
+                Interval text{c.history_len + c.alpha, Interval::Size{c.input_len}};
+                for (const auto& mm : s.multimodal_inputs) {
+                    auto o = mm->interval & text;
+                    if (auto size = (int)o.size()) {
+                        pixel_values.push_back(mm->data);
+                        d.batch_size += mm->data.shape(0);
+
+                        const int text_offset  = input_ids_offsets + o.begin() - text.begin();
+                        const int image_offset = image_embeds_offsets + o.begin() - mm->interval.begin();
+                        d.input_embeds_coords.emplace_back(size, text_offset);
+                        d.image_embeds_coords.emplace_back(size, image_offset);
+
+                        image_embeds_offsets += (int)mm->interval.size();
+                    }
+                }
+            }
+
+            input_ids_offsets += c.autoregres ? 1 : c.input_len;
+        }
+
+        if (d.batch_size > 0) {
+            // batch input
+            if (d.batch_size > d.batch_input.shape(0)) {
+                core::ContextGuard ctx{Allocator{kCPUpinned}};
+                Layout             layout{d.batch_size, cfg.in_channels, cfg.image_height, cfg.image_width};
+                d.batch_input = {layout, cfg.data_type, kCPUpinned};
+            }
+
+            ssize_t batch_offset = 0;
+            for (const auto& pixel_value : pixel_values) {
+                TM_CHECK_EQ(pixel_value.ndim(), 4);
+                TM_CHECK_EQ(pixel_value.shape(1), cfg.in_channels);
+                TM_CHECK_EQ(pixel_value.shape(2), cfg.image_height);
+                TM_CHECK_EQ(pixel_value.shape(3), cfg.image_width);
+                TM_CHECK_EQ(pixel_value.dtype(), d.batch_input.dtype());
+                Copy(pixel_value, d.batch_input.slice(batch_offset, pixel_value.shape(0)));
+                batch_offset += pixel_value.shape(0);
+            }
+            TM_CHECK_EQ(batch_offset, d.batch_size);
+
+            // attention meta
+            d.seq_len   = cfg.num_patches + 1;
+            d.token_num = d.batch_size * d.seq_len;
+
+            if (d.attn_cu_seqlens.size() < d.batch_size + 1) {
+                d.attn_cu_seqlens = Tensor_<int>{{d.batch_size + 1}, kDEVICE};
+            }
+            if (d.attn_finished.size() < d.batch_size) {
+                d.attn_finished = Tensor_<bool>{{d.batch_size}, kDEVICE};
+            }
+            if (attn_cu_seqlens_buf_.size() < d.batch_size + 1) {
+                core::ContextGuard ctx{Allocator{kCPUpinned}};
+                attn_cu_seqlens_buf_ = {d.batch_size + 1, kCPUpinned};
+            }
+
+            for (int i = 0; i <= d.batch_size; ++i) {
+                attn_cu_seqlens_buf_[i] = i * d.seq_len;
+            }
+            copy(attn_cu_seqlens_buf_.data(), d.batch_size + 1, d.attn_cu_seqlens.data());
+            Clear(d.attn_finished.slice(0, d.batch_size));
+        }
+        h_tp_group->Sync();
+    }
+
+    Tensor PatchEmbedding(Data& d)
+    {
+        const auto& cfg    = config_;
+        auto        stream = core::Context::stream().handle();
+
+        Tensor host_input = d.batch_input.slice(0, d.batch_size);
+        Tensor input      = empty_like(host_input, kDEVICE);
+        Copy(host_input, input);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        Tensor patches{{d.batch_size * cfg.num_patches, cfg.patch_in_dim}, cfg.data_type, kDEVICE};
+        invokeInternVitPatchify(patches,
+                                input,
+                                d.batch_size,
+                                cfg.in_channels,
+                                cfg.image_height,
+                                cfg.image_width,
+                                cfg.patch_height,
+                                cfg.patch_width,
+                                stream);
+
+        Tensor patch_embeds;
+        TM_SCOPE_CALL(linear_.Forward(patches, *weights_.patch_embed, patch_embeds));
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        Tensor hidden{{d.token_num, cfg.hidden_dim}, cfg.data_type, kDEVICE};
+        invokeInternVitAddEmbeddings(hidden,
+                                     patch_embeds,
+                                     weights_.patch_embed->bias,
+                                     weights_.cls_token,
+                                     weights_.position_embeddings,
+                                     d.batch_size,
+                                     cfg.num_patches,
+                                     cfg.hidden_dim,
+                                     stream);
+        return hidden;
+    }
+
+    template<typename T>
+    AttentionParams<T> CreateVitAttentionParams(
+        Tensor& attn_output, Tensor& qkv, Tensor& kv, const Data& d, const AttentionWeight& attn, int layer_id)
+    {
+        const int local_head_num = attn.head_num / attn.tp_size;
+        const int head_dim       = attn.head_dim;
+        const int token_num      = d.token_num;
+
+        AttentionParams<T> params{};
+        params.out = (T*)attn_output.raw_data();
+        params.q   = (T*)qkv.raw_data();
+
+        params.stride = (int64_t)local_head_num * 3 * head_dim;
+
+        params.cu_q_len = d.attn_cu_seqlens.data();
+        params.cu_k_len = d.attn_cu_seqlens.data();
+        params.finished = d.attn_finished.data();
+
+        params.linear_iter_params = LinearIteratorParams{
+            kv.raw_data(),
+            2 * token_num * head_dim,
+            token_num * head_dim,
+        };
+
+        params.token_num  = token_num;
+        params.batch_size = d.batch_size;
+        params.max_q_len  = d.seq_len;
+        params.max_k_len  = d.seq_len;
+
+        params.num_heads     = local_head_num;
+        params.num_kv_heads  = local_head_num;
+        params.size_per_head = head_dim;
+        params.causal        = false;
+        params.layer_id      = layer_id;
+
+        double scaling = 1.;
+        if (attn.softmax_scale) {
+            scaling *= attn.softmax_scale;
+        }
+        else {
+            scaling /= std::sqrt((float)head_dim);
+        }
+        params.inv_sqrt_dh = scaling * std::log2(std::exp(1.));
+
+        params.window_size     = 0;
+        params.rope_param.type = RopeType::kNull;
+        params.max_split_k     = 1;
+        params.cp_size         = 1;
+        params.stream          = core::Context::stream().handle();
+        return params;
+    }
+
+    template<typename T>
+    void Attn(Tensor& input, Tensor& output, Data& d, int layer_id)
+    {
+        auto* attn   = weights_.block(layer_id)->attention.get();
+        auto  stream = core::Context::stream().handle();
+
+        Tensor qkv;
+        TM_SCOPE_CALL(linear_.Forward(input, *attn->w_qkv, qkv));
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        const int local_head_num = attn->head_num / attn->tp_size;
+        const int head_dim       = attn->head_dim;
+        const int local_dim      = local_head_num * head_dim;
+
+        ApplyBias(qkv, attn->w_qkv->bias, stream);
+
+        if (attn->q_norm && attn->k_norm) {
+            Tensor sums{{2, d.token_num}, kFloat, kDEVICE};
+            invokeInternVitPreRMSNorm(sums, qkv, local_dim, stream);
+            if (attn->tp_size > 1) {
+                AllReduceSum(sums, stream);
+            }
+            invokeInternVitPostRMSNorm(qkv,
+                                       sums,
+                                       attn->q_norm->weight,
+                                       attn->k_norm->weight,
+                                       local_dim,
+                                       config_.hidden_dim,
+                                       attn->q_norm->norm_eps_,
+                                       stream);
+        }
+
+        Tensor kv{{local_head_num, 2, d.token_num, head_dim}, qkv.dtype(), qkv.device()};
+        invokeInternVitPrepareQKV(kv, qkv, local_head_num, head_dim, stream);
+
+        Tensor attn_output{{d.token_num, local_dim}, qkv.dtype(), qkv.device()};
+        auto   params = CreateVitAttentionParams<T>(attn_output, qkv, kv, d, *attn, layer_id);
+        dispatchAttention<T>(params);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        TM_SCOPE_CALL(linear_.Forward(attn_output, *attn->wo, output));
+        TM_CUDA_CHECK(cudaGetLastError());
+        if (attn->tp_size > 1) {
+            AllReduceSum(output, stream);
+        }
+    }
+
+    void Mlp(Tensor& input, Tensor& output, int layer_id)
+    {
+        auto* block  = weights_.block(layer_id);
+        auto  stream = core::Context::stream().handle();
+
+        Tensor inter;
+        TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_fc1, inter));
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        invokeAddBiasActivation(inter, block->mlp_fc1->bias, ActivationType::kGelu, stream);
+
+        TM_SCOPE_CALL(linear_.Forward(inter, *block->mlp_fc2, output));
+        TM_CUDA_CHECK(cudaGetLastError());
+        AllReduceSum(output, stream);
+    }
+
+    Tensor Projector(Tensor& hidden, Data& d)
+    {
+        const auto& cfg    = config_;
+        auto        stream = core::Context::stream().handle();
+
+        const int grid_size = (int)std::sqrt(cfg.num_patches);
+        TM_CHECK_EQ(grid_size * grid_size, cfg.num_patches);
+        TM_CHECK_EQ(cfg.image_seq_length, (grid_size / 2) * (grid_size / 2));
+
+        Tensor shuffled{{d.batch_size * cfg.image_seq_length, cfg.hidden_dim * 4}, cfg.data_type, kDEVICE};
+        invokeInternVitPixelShuffle(shuffled, hidden, grid_size, stream);
+
+        Tensor projector_normed{{d.batch_size * cfg.image_seq_length, cfg.hidden_dim * 4}, cfg.data_type, kDEVICE};
+        invokeLayerNorm(projector_normed,
+                        shuffled,
+                        weights_.projector_norm->weight,
+                        weights_.projector_norm->bias,
+                        weights_.projector_norm->norm_eps_,
+                        stream);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        Tensor inter;
+        TM_SCOPE_CALL(linear_.Forward(projector_normed, *weights_.projector_fc1, inter));
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        invokeAddBiasActivation(inter, weights_.projector_fc1->bias, ActivationType::kGelu, stream);
+
+        Tensor output;
+        TM_SCOPE_CALL(linear_.Forward(inter, *weights_.projector_fc2, output));
+        TM_CUDA_CHECK(cudaGetLastError());
+        AllReduceSum(output, stream);
+        ApplyBias(output, weights_.projector_fc2->bias, stream);
+        TM_CUDA_CHECK(cudaGetLastError());
+        return output;
+    }
+
+    void Forward(int phase, TensorMap& args)
+    {
+        const auto& cfg = config_;
+        auto&       d   = data_.at(phase);
+        if (d.batch_size == 0) {
+            return;
+        }
+
+        auto stream   = core::Context::stream().handle();
+        auto residual = PatchEmbedding(d);
+
+        Tensor hidden_states = [&]() {
+            Buffer symm_buf = args.contains("symm_buf") ? args.at("symm_buf").buffer() : Buffer{};
+            if (symm_buf && d.token_num * cfg.hidden_dim <= symm_buf.size() / turbomind::byte_size(cfg.data_type)) {
+                return Tensor{symm_buf.view(cfg.data_type), {d.token_num, cfg.hidden_dim}};
+            }
+            else {
+                return Tensor{{d.token_num, cfg.hidden_dim}, cfg.data_type, kDEVICE};
+            }
+        }();
+
+        ApplyNorm(hidden_states, residual, *weights_.block(0)->norm1, config_.norm_type);
+
+        for (int layer_id = 0; layer_id < cfg.depth; ++layer_id) {
+            auto* block = weights_.block(layer_id);
+
+            auto invoke = [&](auto t) {
+                using T = decltype(t);
+                Attn<T>(hidden_states, hidden_states, d, layer_id);
+            };
+            TM_DISPATCH_PRIMARY_DTYPES(hidden_states.dtype(), invoke);
+            ResidualScaleNorm(hidden_states,
+                              residual,
+                              hidden_states,
+                              block->lambda_1,
+                              block->attention->wo->bias,
+                              block->norm2.get(),
+                              config_.norm_type);
+
+            Mlp(hidden_states, hidden_states, layer_id);
+
+            const bool is_last_layer = layer_id + 1 == cfg.depth;
+            ResidualScaleNorm(hidden_states,
+                              residual,
+                              hidden_states,
+                              block->lambda_2,
+                              block->mlp_fc2->bias,
+                              is_last_layer ? nullptr : weights_.block(layer_id + 1)->norm1.get(),
+                              is_last_layer ? NormType::kNone : config_.norm_type);
+        }
+
+        Tensor image_embeds = Projector(residual, d);
+        EnsureFloatDtype(image_embeds, engine_data_type_);
+
+        args.produce("multimodal",
+                     MultiModalEmbeddingData{image_embeds, d.image_embeds_coords, d.input_embeds_coords}.buf());
+    }
+};
+
+InternVit::InternVit(const EngineParam& engine, const Context& ctx, const InternVitWeight& weights, int phases):
+    impl_{std::make_unique<Impl>(engine, ctx, weights, phases)}
+{
+}
+
+InternVit::~InternVit() = default;
+
+void InternVit::Run(BatchOp op, int phase, TensorMap& env)
+{
+    switch (op) {
+        case BatchOp::kAdd:
+            return impl_->Add(phase, env);
+        case BatchOp::kSetup:
+            return impl_->Setup(phase, env);
+        case BatchOp::kForward:
+            return impl_->Forward(phase, env);
+        default:
+            return;
+    }
+}
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/internvit/internvit.h b/src/turbomind/models/internvit/internvit.h
new file mode 100644
index 0000000000..5d5ba46b9e
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit.h
@@ -0,0 +1,26 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#pragma once
+
+#include "src/turbomind/models/vision_model.h"
+
+#include <memory>
+
+namespace turbomind {
+
+class InternVitWeight;
+
+class InternVit: public VisionModel {
+public:
+    InternVit(const EngineParam& engine, const Context& ctx, const InternVitWeight& weights, int phases);
+
+    ~InternVit() override;
+
+    void Run(BatchOp op, int phase, TensorMap& env) override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/internvit/internvit_block_weight.cc b/src/turbomind/models/internvit/internvit_block_weight.cc
new file mode 100644
index 0000000000..ec92ed1cbc
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit_block_weight.cc
@@ -0,0 +1,31 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/internvit/internvit_block_weight.h"
+
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/models/linear_weight.h"
+#include "src/turbomind/utils/memory_utils.h"
+
+namespace turbomind {
+
+void InternVitBlockWeight::prepare()
+{
+    for_each_child([](const char* /*name*/, core::Module* child) {
+        if (child) {
+            child->prepare();
+        }
+    });
+
+    if (lambda_1) {
+        EnsureFloatDtype(lambda_1, data_type);
+    }
+    if (lambda_2) {
+        EnsureFloatDtype(lambda_2, data_type);
+    }
+}
+
+TM_MODULE_REGISTER(InternVitBlockWeight, core::InternVitBlockConfig);
+
+TM_MODULE_METHODS(InternVitBlockWeight, INTERNVIT_BLOCK_CHILDREN, INTERNVIT_BLOCK_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/internvit/internvit_block_weight.h b/src/turbomind/models/internvit/internvit_block_weight.h
new file mode 100644
index 0000000000..dff96502df
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit_block_weight.h
@@ -0,0 +1,72 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/models/attention_weight.h"
+
+namespace turbomind::core {
+
+struct InternVitBlockConfig: ModuleConfig {
+    InternVitBlockConfig(): ModuleConfig{"InternVitBlockWeight"} {}
+
+#define INTERNVIT_BLOCK_FIELDS(X)                                                                                      \
+    X(DataType, data_type)                                                                                             \
+    X(int, hidden_dim)                                                                                                 \
+    X(int, head_num)                                                                                                   \
+    X(int, intermediate_size)                                                                                          \
+    X(float, norm_eps, 1e-6f)
+
+    INTERNVIT_BLOCK_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(InternVitBlockConfig, INTERNVIT_BLOCK_FIELDS)
+
+#undef INTERNVIT_BLOCK_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+class LinearWeight;
+
+class InternVitBlockWeight: public core::Module {
+public:
+    const char* type() const override
+    {
+        return "InternVitBlockWeight";
+    }
+
+    InternVitBlockWeight() = default;
+    explicit InternVitBlockWeight(const core::InternVitBlockConfig& cfg):
+        data_type{cfg.data_type},
+        hidden_dim{cfg.hidden_dim},
+        head_num{cfg.head_num},
+        intermediate_size{cfg.intermediate_size},
+        norm_eps{cfg.norm_eps}
+    {
+    }
+
+    void prepare() override;
+
+#define INTERNVIT_BLOCK_CHILDREN(X)                                                                                    \
+    X(core::Module, norm1)                                                                                             \
+    X(core::Module, norm2)                                                                                             \
+    X(AttentionWeight, attention)                                                                                      \
+    X(LinearWeight, mlp_fc1)                                                                                           \
+    X(LinearWeight, mlp_fc2)
+
+#define INTERNVIT_BLOCK_PARAMS(X)                                                                                      \
+    X(lambda_1)                                                                                                        \
+    X(lambda_2)
+
+    TM_MODULE_DECLARE(InternVitBlockWeight, INTERNVIT_BLOCK_CHILDREN, INTERNVIT_BLOCK_PARAMS)
+
+    DataType data_type{};
+    int      hidden_dim{};
+    int      head_num{};
+    int      intermediate_size{};
+    float    norm_eps{};
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/internvit/internvit_input.h b/src/turbomind/models/internvit/internvit_input.h
new file mode 100644
index 0000000000..34f50a72b0
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit_input.h
@@ -0,0 +1,37 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/engine/multimodal_input.h"
+
+#include <utility>
+#include <vector>
+
+namespace turbomind {
+namespace multimodal {
+
+struct InternVitItem {
+    Modality modality;
+    Tensor   data;
+    int      token_begin;
+    int      token_end;
+
+    InternVitItem() = default;
+
+    InternVitItem(Modality modality, Tensor data, int token_begin, int token_end):
+        modality{modality}, data{std::move(data)}, token_begin{token_begin}, token_end{token_end}
+    {
+    }
+};
+
+struct InternVitInput final: Input {
+    std::vector<InternVitItem> items;
+
+    InternVitInput() = default;
+
+    explicit InternVitInput(std::vector<InternVitItem> items): items{std::move(items)} {}
+};
+
+}  // namespace multimodal
+}  // namespace turbomind
diff --git a/src/turbomind/models/internvit/internvit_kernels.cu b/src/turbomind/models/internvit/internvit_kernels.cu
new file mode 100644
index 0000000000..4b85ad0cfb
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit_kernels.cu
@@ -0,0 +1,700 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/internvit/internvit_kernels.h"
+
+#include "src/turbomind/core/check.h"
+#include "src/turbomind/kernels/core/array.h"
+#include "src/turbomind/kernels/core/array_ops.h"
+#include "src/turbomind/kernels/core/common.h"
+#include "src/turbomind/utils/cuda_utils.h"
+
+#include "cub/block/block_reduce.cuh"
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <type_traits>
+
+namespace turbomind {
+
+namespace {
+
+struct SumPair {
+    float s{};
+    float sq{};
+
+    __device__ SumPair operator+(const SumPair& other) const
+    {
+        return {s + other.s, sq + other.sq};
+    }
+};
+
+template<typename T>
+__global__ void patchIm2ColKernel(T*       out,
+                                  const T* input,
+                                  int      channels,
+                                  int      image_h,
+                                  int      image_w,
+                                  int      patch_h,
+                                  int      patch_w,
+                                  int      grid_w,
+                                  int      patch_area,
+                                  int      patch_in_dim,
+                                  int      num_patches)
+{
+    const int batch = blockIdx.x;
+    const int patch = blockIdx.y;
+    const int row   = batch * num_patches + patch;
+    const int ph    = patch / grid_w;
+    const int pw    = patch - ph * grid_w;
+
+    for (int k = threadIdx.x; k < patch_in_dim; k += blockDim.x) {
+        const int c   = k / patch_area;
+        const int rem = k - c * patch_area;
+        const int ih  = rem / patch_w;
+        const int iw  = rem - ih * patch_w;
+
+        const int64_t src =
+            ((int64_t)batch * channels + c) * image_h * image_w + (ph * patch_h + ih) * image_w + pw * patch_w + iw;
+        out[(int64_t)row * patch_in_dim + k] = input[src];
+    }
+}
+
+template<typename T, int vec_size>
+__global__ void addEmbeddingsVecKernel(T*       out,
+                                       const T* patch,
+                                       const T* patch_bias,
+                                       const T* cls_token,
+                                       const T* pos,
+                                       int      seq_len,
+                                       int      num_patches,
+                                       int      tiles)
+{
+    const int vec_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int token  = blockIdx.y;
+    const int batch  = blockIdx.z;
+    if (vec_id >= tiles) {
+        return;
+    }
+
+    Array<T, vec_size> pos_vec;
+    Array<T, vec_size> out_vec;
+    Array<T, vec_size> bias_vec{};
+    Load(pos_vec, pos + (token * tiles + vec_id) * vec_size);
+
+    if (token == 0) {
+        Load(out_vec, cls_token + vec_id * vec_size);
+    }
+    else {
+        Load(out_vec, patch + (((int64_t)batch * num_patches + token - 1) * tiles + vec_id) * vec_size);
+        if (patch_bias) {
+            Load(bias_vec, patch_bias + vec_id * vec_size);
+        }
+    }
+
+    using namespace ops;
+    Store(out + (((int64_t)batch * seq_len + token) * tiles + vec_id) * vec_size,
+          cast<T>(cast<float>(out_vec) + cast<float>(bias_vec) + cast<float>(pos_vec)));
+}
+
+template<typename T, int vec_size, int block_dim>
+__global__ void preRMSNormKernel(float* sums, const T* qkv, int token_num, int local_dim, int qkv_dim)
+{
+    const int token = blockIdx.x;
+    const int part  = blockIdx.y;  // 0: q, 1: k
+    const int base  = token * qkv_dim + part * local_dim;
+
+    using namespace ops;
+    float sum = 0.f;
+    for (int d = threadIdx.x * vec_size; d < local_dim; d += block_dim * vec_size) {
+        Array<T, vec_size> qkv_vec;
+        Load(qkv_vec, qkv + base + d);
+
+        const auto qkv_float = cast<float>(qkv_vec);
+        const auto sq        = qkv_float * qkv_float;
+
+        PRAGMA_UNROLL
+        for (int i = 0; i < vec_size; ++i) {
+            sum += sq[i];
+        }
+    }
+
+    using BlockReduce = cub::BlockReduce<float, block_dim>;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    sum = BlockReduce{temp_storage}.Sum(sum);
+
+    if (threadIdx.x == 0) {
+        sums[part * token_num + token] = sum;
+    }
+}
+
+template<typename T, int vec_size>
+__global__ void postRMSNormKernel(T*           qkv,
+                                  const float* sums,
+                                  const T*     q_weight,
+                                  const T*     k_weight,
+                                  int          token_num,
+                                  int          local_dim,
+                                  int          qkv_dim,
+                                  int          hidden_dim,
+                                  int          tiles,
+                                  float        eps)
+{
+    const int token      = blockIdx.x;
+    const int tile_block = blockIdx.y;
+    const int part       = blockIdx.z;
+    const int vec_id     = tile_block * blockDim.x + threadIdx.x;
+    if (vec_id >= tiles) {
+        return;
+    }
+
+    const int d    = vec_id * vec_size;
+    const int base = token * qkv_dim + part * local_dim + d;
+
+    Array<T, vec_size> qkv_vec;
+    Array<T, vec_size> weight_vec;
+    Load(qkv_vec, qkv + base);
+    Ldg(weight_vec, (part == 0 ? q_weight : k_weight) + d);
+
+    const float inv = rsqrtf(sums[part * token_num + token] / hidden_dim + eps);
+
+    using namespace ops;
+    Store(qkv + base, cast<T>((cast<float>(qkv_vec) * cast<float>(weight_vec)) * inv));
+}
+
+template<typename T, int head_dim, int vec_size, int warps_per_block>
+__global__ void prepareQKVKernel(T* kv, const T* qkv, int token_num, int local_head_num)
+{
+    static_assert(head_dim % vec_size == 0);
+    constexpr int kVecPerHead   = head_dim / vec_size;
+    constexpr int kHeadsPerWarp = WARP_SIZE / kVecPerHead;
+    static_assert(kVecPerHead * kHeadsPerWarp == WARP_SIZE);
+
+    const int warp_id   = threadIdx.x / WARP_SIZE;
+    const int lane_id   = threadIdx.x - warp_id * WARP_SIZE;
+    const int head_slot = lane_id / kVecPerHead;
+    const int vec_id    = lane_id - head_slot * kVecPerHead;
+
+    const int token = blockIdx.x * warps_per_block + warp_id;
+    if (token >= token_num) {
+        return;
+    }
+
+    const int head_group = blockIdx.y;
+    const int part       = blockIdx.z;
+    const int head       = head_group * kHeadsPerWarp + head_slot;
+    if (head >= local_head_num) {
+        return;
+    }
+
+    const int     local_dim  = local_head_num * head_dim;
+    const int     qkv_dim    = 3 * local_dim;
+    const int     offset     = vec_id * vec_size;
+    const int     src_offset = (part == 0 ? local_dim : 2 * local_dim) + head * head_dim + offset;
+    const int64_t dst        = (((int64_t)head * 2 + part) * token_num + token) * head_dim + offset;
+
+    Array<T, vec_size> qkv_vec;
+    Load(qkv_vec, qkv + token * qkv_dim + src_offset);
+    Store(kv + dst, qkv_vec);
+}
+
+// residual <- residual + (branch_output + optional branch_bias) * branch_scale
+template<typename T, int vec_size>
+__global__ void residualScaleKernel(T* __restrict__ residual,
+                                    const T* __restrict__ branch_output,
+                                    const T* __restrict__ branch_scale,
+                                    const T* __restrict__ branch_bias,
+                                    int hidden_dim,
+                                    int tiles)
+{
+    const int token  = blockIdx.x;
+    const int vec_id = blockIdx.y * blockDim.x + threadIdx.x;
+    if (vec_id >= tiles) {
+        return;
+    }
+
+    const int d = vec_id * vec_size;
+    residual += (int64_t)token * hidden_dim + d;
+    branch_output += (int64_t)token * hidden_dim + d;
+    branch_scale += d;
+    if (branch_bias) {
+        branch_bias += d;
+    }
+
+    Array<T, vec_size> residual_vec;
+    Array<T, vec_size> branch_vec;
+    Array<T, vec_size> branch_scale_vec;
+    Array<T, vec_size> branch_bias_vec{};
+
+    Load(residual_vec, residual);
+    Load(branch_vec, branch_output);
+    Ldg(branch_scale_vec, branch_scale);
+    if (branch_bias) {
+        Ldg(branch_bias_vec, branch_bias);
+    }
+
+    using namespace ops;
+    Store(
+        residual,
+        cast<T>(cast<float>(residual_vec) + cast<float>(branch_vec + branch_bias_vec) * cast<float>(branch_scale_vec)));
+}
+
+// residual <- residual + (branch_output + optional branch_bias) * branch_scale
+// hidden_states <- LayerNorm(residual) * norm_weight + optional norm_bias
+template<typename T, int vec_size, int block_dim>
+__global__ void residualScaleLayerNormKernel(T*       hidden_states,
+                                             T*       residual,
+                                             const T* branch_output,
+                                             const T* branch_scale,
+                                             const T* branch_bias,
+                                             const T* norm_weight,
+                                             const T* norm_bias,
+                                             int      hidden_dim,
+                                             float    eps)
+{
+    const int token = blockIdx.x;
+    const int di    = threadIdx.x * vec_size;
+
+    residual += (int64_t)token * hidden_dim;
+    branch_output += (int64_t)token * hidden_dim;
+    hidden_states += (int64_t)token * hidden_dim;
+
+    Array<float, vec_size> sum_v{};
+    Array<float, vec_size> sq_v{};
+    Array<T, vec_size>     residual_vec;
+    Array<T, vec_size>     branch_vec;
+    Array<T, vec_size>     branch_scale_vec;
+    Array<T, vec_size>     branch_bias_vec{};
+
+    using namespace ops;
+    for (int i = di; i < hidden_dim; i += block_dim * vec_size) {
+        Load(residual_vec, residual + i);
+        Load(branch_vec, branch_output + i);
+        Ldg(branch_scale_vec, branch_scale + i);
+        if (branch_bias) {
+            Ldg(branch_bias_vec, branch_bias + i);
+        }
+
+        residual_vec = cast<T>(cast<float>(residual_vec)
+                               + cast<float>(branch_vec + branch_bias_vec) * cast<float>(branch_scale_vec));
+        Store(residual + i, residual_vec);
+
+        const auto residual_float = cast<float>(residual_vec);
+        sum_v                     = sum_v + residual_float;
+        sq_v                      = sq_v + residual_float * residual_float;
+    }
+
+    SumPair pair{};
+    PRAGMA_UNROLL
+    for (int i = 0; i < vec_size; ++i) {
+        pair.s += sum_v[i];
+        pair.sq += sq_v[i];
+    }
+
+    using BlockReduce = cub::BlockReduce<SumPair, block_dim>;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    pair = BlockReduce{temp_storage}.Sum(pair);
+
+    __shared__ float shared_mean;
+    __shared__ float shared_inv_std;
+
+    if (threadIdx.x == 0) {
+        const float inv_dim = 1.f / hidden_dim;
+        const float mean    = pair.s * inv_dim;
+        const float var     = fmaxf(pair.sq * inv_dim - mean * mean, 0.f);
+        shared_mean         = mean;
+        shared_inv_std      = rsqrtf(var + eps);
+    }
+
+    __syncthreads();
+
+    const float mean    = shared_mean;
+    const float inv_std = shared_inv_std;
+
+    Array<T, vec_size> weight_vec;
+    Array<T, vec_size> bias_vec{};
+    for (int i = di; i < hidden_dim; i += block_dim * vec_size) {
+        Load(residual_vec, residual + i);
+        Ldg(weight_vec, norm_weight + i);
+        if (norm_bias) {
+            Ldg(bias_vec, norm_bias + i);
+        }
+
+        Array<float, vec_size> out_vec;
+        PRAGMA_UNROLL
+        for (int j = 0; j < vec_size; ++j) {
+            out_vec[j] = (static_cast<float>(residual_vec[j]) - mean) * inv_std * static_cast<float>(weight_vec[j])
+                         + static_cast<float>(bias_vec[j]);
+        }
+        Store(hidden_states + i, cast<T>(out_vec));
+    }
+}
+
+// residual <- residual + (branch_output + optional branch_bias) * branch_scale
+// hidden_states <- RMSNorm(residual) * norm_weight
+template<typename T, int vec_size, int block_dim>
+__global__ void residualScaleRMSNormKernel(T*       hidden_states,
+                                           T*       residual,
+                                           const T* branch_output,
+                                           const T* branch_scale,
+                                           const T* branch_bias,
+                                           const T* norm_weight,
+                                           int      hidden_dim,
+                                           float    eps)
+{
+    const int token = blockIdx.x;
+    const int di    = threadIdx.x * vec_size;
+
+    residual += (int64_t)token * hidden_dim;
+    branch_output += (int64_t)token * hidden_dim;
+    hidden_states += (int64_t)token * hidden_dim;
+
+    Array<float, vec_size> sq_v{};
+    Array<T, vec_size>     residual_vec;
+    Array<T, vec_size>     branch_vec;
+    Array<T, vec_size>     branch_scale_vec;
+    Array<T, vec_size>     branch_bias_vec{};
+
+    using namespace ops;
+    for (int i = di; i < hidden_dim; i += block_dim * vec_size) {
+        Load(residual_vec, residual + i);
+        Load(branch_vec, branch_output + i);
+        Ldg(branch_scale_vec, branch_scale + i);
+        if (branch_bias) {
+            Ldg(branch_bias_vec, branch_bias + i);
+        }
+
+        residual_vec = cast<T>(cast<float>(residual_vec)
+                               + cast<float>(branch_vec + branch_bias_vec) * cast<float>(branch_scale_vec));
+        Store(residual + i, residual_vec);
+
+        const auto residual_float = cast<float>(residual_vec);
+        sq_v                      = sq_v + residual_float * residual_float;
+    }
+
+    float sum{};
+    PRAGMA_UNROLL
+    for (int i = 0; i < vec_size; ++i) {
+        sum += sq_v[i];
+    }
+
+    using BlockReduce = cub::BlockReduce<float, block_dim>;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    sum = BlockReduce{temp_storage}.Sum(sum);
+
+    __shared__ float shared_inv_rms;
+    if (threadIdx.x == 0) {
+        shared_inv_rms = rsqrtf(sum / hidden_dim + eps);
+    }
+
+    __syncthreads();
+
+    const float inv_rms = shared_inv_rms;
+
+    using namespace ops;
+    Array<T, vec_size> weight_vec;
+    for (int i = di; i < hidden_dim; i += block_dim * vec_size) {
+        Load(residual_vec, residual + i);
+        Ldg(weight_vec, norm_weight + i);
+        Store(hidden_states + i, cast<T>(cast<float>(residual_vec) * inv_rms * cast<float>(weight_vec)));
+    }
+}
+
+template<typename T, int vec_size>
+__global__ void pixelShuffleKernel(T* __restrict__ out,
+                                   const T* __restrict__ hidden,
+                                   int grid_size,
+                                   int out_grid,
+                                   int hidden_dim,
+                                   int seq_len,
+                                   int tiles)
+{
+    const int vec_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (vec_id >= tiles) {
+        return;
+    }
+
+    const int token = blockIdx.y;
+    const int batch = blockIdx.z;
+    const int ow    = token / out_grid;
+    const int oh    = token - ow * out_grid;
+
+    const int     c        = vec_id * vec_size;
+    const int     in_token = 1 + (ow * 2) * grid_size + oh * 2;
+    const int64_t in0      = ((int64_t)batch * seq_len + in_token) * hidden_dim + c;
+    const int64_t out0     = ((int64_t)batch * out_grid * out_grid + token) * (hidden_dim * 4) + c;
+
+    Array<T, vec_size> v;
+    Load(v, hidden + in0);
+    Store(out + out0, v);
+    Load(v, hidden + in0 + hidden_dim);
+    Store(out + out0 + hidden_dim, v);
+    Load(v, hidden + in0 + (int64_t)grid_size * hidden_dim);
+    Store(out + out0 + 2 * hidden_dim, v);
+    Load(v, hidden + in0 + ((int64_t)grid_size + 1) * hidden_dim);
+    Store(out + out0 + 3 * hidden_dim, v);
+}
+
+}  // namespace
+
+void invokeInternVitPatchify(Tensor&       patches,
+                             const Tensor& pixel_values,
+                             int           batch_size,
+                             int           channels,
+                             int           image_h,
+                             int           image_w,
+                             int           patch_h,
+                             int           patch_w,
+                             cudaStream_t  stream)
+{
+    TM_CHECK_EQ(patches.ndim(), 2);
+    TM_CHECK_EQ(pixel_values.ndim(), 4);
+    TM_CHECK_EQ(patches.dtype(), pixel_values.dtype());
+
+    const int grid_h       = image_h / patch_h;
+    const int grid_w       = image_w / patch_w;
+    const int num_patches  = grid_h * grid_w;
+    const int patch_area   = patch_h * patch_w;
+    const int patch_in_dim = channels * patch_area;
+
+    auto invoke = [&](auto t) {
+        using T = decltype(t);
+        const dim3 grid(batch_size, num_patches);
+        const dim3 block(256);
+        patchIm2ColKernel<T><<<grid, block, 0, stream>>>((T*)patches.raw_data(),
+                                                         (const T*)pixel_values.raw_data(),
+                                                         channels,
+                                                         image_h,
+                                                         image_w,
+                                                         patch_h,
+                                                         patch_w,
+                                                         grid_w,
+                                                         patch_area,
+                                                         patch_in_dim,
+                                                         num_patches);
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(patches.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeInternVitAddEmbeddings(Tensor&       hidden,
+                                  const Tensor& patch_embeds,
+                                  const Tensor& patch_bias,
+                                  const Tensor& cls_token,
+                                  const Tensor& position_embeddings,
+                                  int           batch_size,
+                                  int           num_patches,
+                                  int           hidden_dim,
+                                  cudaStream_t  stream)
+{
+    constexpr int Vec      = 4;
+    constexpr int kThreads = 256;
+    const int     seq_len  = num_patches + 1;
+    const int     tiles    = hidden_dim / Vec;
+    const dim3    grid((tiles + kThreads - 1) / kThreads, seq_len, batch_size);
+
+    TM_CHECK_EQ(hidden_dim % Vec, 0);
+
+    auto invoke = [&](auto t) {
+        using T = decltype(t);
+        addEmbeddingsVecKernel<T, Vec>
+            <<<grid, kThreads, 0, stream>>>((T*)hidden.raw_data(),
+                                            (const T*)patch_embeds.raw_data(),
+                                            patch_bias ? (const T*)patch_bias.raw_data() : nullptr,
+                                            (const T*)cls_token.raw_data(),
+                                            (const T*)position_embeddings.raw_data(),
+                                            seq_len,
+                                            num_patches,
+                                            tiles);
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(hidden.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeInternVitPreRMSNorm(Tensor& sums, const Tensor& qkv, int local_dim, cudaStream_t stream)
+{
+    TM_CHECK_EQ(sums.dtype(), kFloat);
+    constexpr int kThreads  = 64;
+    const int     token_num = qkv.shape(0);
+    const int     qkv_dim   = qkv.shape(1);
+
+    auto invoke = [&](auto t) {
+        using T                = decltype(t);
+        constexpr int kVecSize = sizeof(uint4) / sizeof(T);
+        TM_CHECK_EQ(local_dim % kVecSize, 0);
+        preRMSNormKernel<T, kVecSize, kThreads><<<dim3(token_num, 2), kThreads, 0, stream>>>(
+            (float*)sums.raw_data(), (const T*)qkv.raw_data(), token_num, local_dim, qkv_dim);
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(qkv.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeInternVitPostRMSNorm(Tensor&       qkv,
+                                const Tensor& sums,
+                                const Tensor& q_weight,
+                                const Tensor& k_weight,
+                                int           local_dim,
+                                int           hidden_dim,
+                                float         eps,
+                                cudaStream_t  stream)
+{
+    constexpr int kThreads  = 256;
+    const int     token_num = qkv.shape(0);
+    const int     qkv_dim   = qkv.shape(1);
+
+    TM_CHECK_EQ(q_weight.size(), local_dim);
+    TM_CHECK_EQ(k_weight.size(), local_dim);
+
+    auto invoke = [&](auto t) {
+        using T                = decltype(t);
+        constexpr int kVecSize = sizeof(uint4) / sizeof(T);
+        TM_CHECK_EQ(local_dim % kVecSize, 0);
+        const int  tiles       = local_dim / kVecSize;
+        const int  tile_blocks = (tiles + kThreads - 1) / kThreads;
+        const dim3 grid(token_num, tile_blocks, 2);
+        postRMSNormKernel<T, kVecSize><<<grid, kThreads, 0, stream>>>((T*)qkv.raw_data(),
+                                                                      (const float*)sums.raw_data(),
+                                                                      (const T*)q_weight.raw_data(),
+                                                                      (const T*)k_weight.raw_data(),
+                                                                      token_num,
+                                                                      local_dim,
+                                                                      qkv_dim,
+                                                                      hidden_dim,
+                                                                      tiles,
+                                                                      eps);
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(qkv.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeInternVitPrepareQKV(Tensor& kv, const Tensor& qkv, int local_head_num, int head_dim, cudaStream_t stream)
+{
+    auto invoke = [&](auto t) {
+        using T       = decltype(t);
+        auto dispatch = [&](auto head_dim_c) {
+            constexpr int kHeadDim       = decltype(head_dim_c)::value;
+            constexpr int kVecSize       = sizeof(uint4) / sizeof(T);
+            constexpr int kVecPerHead    = kHeadDim / kVecSize;
+            constexpr int kHeadsPerWarp  = WARP_SIZE / kVecPerHead;
+            constexpr int kWarpsPerBlock = 4;
+            static_assert(kVecPerHead * kHeadsPerWarp == WARP_SIZE);
+
+            const int  token_num      = qkv.shape(0);
+            const int  head_group_num = (local_head_num + kHeadsPerWarp - 1) / kHeadsPerWarp;
+            const dim3 grid((token_num + kWarpsPerBlock - 1) / kWarpsPerBlock, head_group_num, 2);
+            const dim3 block(kWarpsPerBlock * WARP_SIZE);
+            prepareQKVKernel<T, kHeadDim, kVecSize, kWarpsPerBlock>
+                <<<grid, block, 0, stream>>>((T*)kv.raw_data(), (const T*)qkv.raw_data(), token_num, local_head_num);
+        };
+
+        if (head_dim == 64) {
+            dispatch(std::integral_constant<int, 64>{});
+        }
+        else if (head_dim == 128) {
+            dispatch(std::integral_constant<int, 128>{});
+        }
+        else {
+            TM_LOG_FATAL("unsupported InternVit PrepareQKV head_dim: {}", head_dim);
+        }
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(qkv.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeInternVitResidualScaleNorm(Tensor&       hidden_states,
+                                      Tensor&       residual,
+                                      const Tensor& branch_output,
+                                      const Tensor& branch_scale,
+                                      const Tensor& branch_bias,
+                                      const Tensor& norm_weight,
+                                      const Tensor& norm_bias,
+                                      float         eps,
+                                      NormType      norm_type,
+                                      cudaStream_t  stream)
+{
+    TM_CHECK_EQ(residual.ndim(), 2);
+    TM_CHECK_EQ(branch_output.size(), residual.size());
+    TM_CHECK_EQ(branch_output.dtype(), residual.dtype());
+
+    const int hidden_dim = residual.shape(1);
+    const int token_num  = residual.shape(0);
+
+    TM_CHECK_EQ(branch_scale.size(), hidden_dim);
+
+    if (norm_type != NormType::kNone) {
+        TM_CHECK(norm_weight);
+        TM_CHECK_EQ(norm_weight.size(), hidden_dim);
+    }
+
+    auto invoke = [&](auto t) {
+        using T                    = decltype(t);
+        constexpr int kNormThreads = 512;
+        constexpr int kVecSize     = sizeof(uint4) / sizeof(T);
+        TM_CHECK_EQ(hidden_dim % kVecSize, 0);
+
+        if (norm_type == NormType::kNone) {
+            const int  kThreads = hidden_dim <= 1024 ? 128 : 256;
+            const int  tiles    = hidden_dim / kVecSize;
+            const dim3 grid(token_num, (tiles + kThreads - 1) / kThreads);
+            residualScaleKernel<T, kVecSize>
+                <<<grid, kThreads, 0, stream>>>((T*)residual.raw_data(),
+                                                (const T*)branch_output.raw_data(),
+                                                (const T*)branch_scale.raw_data(),
+                                                branch_bias ? (const T*)branch_bias.raw_data() : nullptr,
+                                                hidden_dim,
+                                                tiles);
+        }
+        else if (norm_type == NormType::kLayerNorm) {
+            residualScaleLayerNormKernel<T, kVecSize, kNormThreads>
+                <<<token_num, kNormThreads, 0, stream>>>((T*)hidden_states.raw_data(),
+                                                         (T*)residual.raw_data(),
+                                                         (const T*)branch_output.raw_data(),
+                                                         (const T*)branch_scale.raw_data(),
+                                                         branch_bias ? (const T*)branch_bias.raw_data() : nullptr,
+                                                         (const T*)norm_weight.raw_data(),
+                                                         norm_bias ? (const T*)norm_bias.raw_data() : nullptr,
+                                                         hidden_dim,
+                                                         eps);
+        }
+        else if (norm_type == NormType::kRMSNorm) {
+            residualScaleRMSNormKernel<T, kVecSize, kNormThreads>
+                <<<token_num, kNormThreads, 0, stream>>>((T*)hidden_states.raw_data(),
+                                                         (T*)residual.raw_data(),
+                                                         (const T*)branch_output.raw_data(),
+                                                         (const T*)branch_scale.raw_data(),
+                                                         branch_bias ? (const T*)branch_bias.raw_data() : nullptr,
+                                                         (const T*)norm_weight.raw_data(),
+                                                         hidden_dim,
+                                                         eps);
+        }
+        else {
+            TM_LOG_FATAL("unsupported InternVit residual norm type: {}", (int)norm_type);
+        }
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(residual.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeInternVitPixelShuffle(Tensor& output, const Tensor& hidden, int grid_size, cudaStream_t stream)
+{
+    // kVecSize=4 was faster than 16-byte vectors in pixel-shuffle benchmarks.
+    constexpr int kVecSize = 4;
+    constexpr int kThreads = 128;
+
+    const int out_grid   = grid_size / 2;
+    const int seq_len    = grid_size * grid_size + 1;
+    const int hidden_dim = hidden.shape(1);
+    const int batch_size = output.shape(0) / (out_grid * out_grid);
+    const int tiles      = hidden_dim / kVecSize;
+
+    auto invoke = [&](auto t) {
+        using T = decltype(t);
+        const dim3 grid((tiles + kThreads - 1) / kThreads, out_grid * out_grid, batch_size);
+        pixelShuffleKernel<T, kVecSize><<<grid, kThreads, 0, stream>>>(
+            (T*)output.raw_data(), (const T*)hidden.raw_data(), grid_size, out_grid, hidden_dim, seq_len, tiles);
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(output.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/internvit/internvit_kernels.h b/src/turbomind/models/internvit/internvit_kernels.h
new file mode 100644
index 0000000000..d90c0da02b
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit_kernels.h
@@ -0,0 +1,58 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/kernels/norm/norm.h"
+
+#include <cuda_runtime.h>
+
+namespace turbomind {
+
+void invokeInternVitPatchify(Tensor&       patches,
+                             const Tensor& pixel_values,
+                             int           batch_size,
+                             int           channels,
+                             int           image_h,
+                             int           image_w,
+                             int           patch_h,
+                             int           patch_w,
+                             cudaStream_t  stream);
+
+void invokeInternVitAddEmbeddings(Tensor&       hidden,
+                                  const Tensor& patch_embeds,
+                                  const Tensor& patch_bias,
+                                  const Tensor& cls_token,
+                                  const Tensor& position_embeddings,
+                                  int           batch_size,
+                                  int           num_patches,
+                                  int           hidden_dim,
+                                  cudaStream_t  stream);
+
+void invokeInternVitPreRMSNorm(Tensor& sums, const Tensor& qkv, int local_dim, cudaStream_t stream);
+
+void invokeInternVitPostRMSNorm(Tensor&       qkv,
+                                const Tensor& sums,
+                                const Tensor& q_weight,
+                                const Tensor& k_weight,
+                                int           local_dim,
+                                int           hidden_dim,
+                                float         eps,
+                                cudaStream_t  stream);
+
+void invokeInternVitPrepareQKV(Tensor& kv, const Tensor& qkv, int local_head_num, int head_dim, cudaStream_t stream);
+
+void invokeInternVitResidualScaleNorm(Tensor&       hidden_states,
+                                      Tensor&       residual,
+                                      const Tensor& branch_output,
+                                      const Tensor& branch_scale,
+                                      const Tensor& branch_bias,
+                                      const Tensor& norm_weight,
+                                      const Tensor& norm_bias,
+                                      float         eps,
+                                      NormType      norm_type,
+                                      cudaStream_t  stream);
+
+void invokeInternVitPixelShuffle(Tensor& output, const Tensor& hidden, int grid_size, cudaStream_t stream);
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/internvit/internvit_weight.cc b/src/turbomind/models/internvit/internvit_weight.cc
new file mode 100644
index 0000000000..94a29db2ab
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit_weight.cc
@@ -0,0 +1,64 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/internvit/internvit_weight.h"
+
+#include "src/turbomind/core/registry.h"
+#include "src/turbomind/models/internvit/internvit_block_weight.h"
+#include "src/turbomind/models/layer_norm_weight.h"
+#include "src/turbomind/models/linear_weight.h"
+#include "src/turbomind/utils/memory_utils.h"
+
+namespace turbomind {
+
+InternVitWeight::InternVitWeight(const core::InternVitConfig& cfg): config_{cfg} {}
+
+void InternVitWeight::prepare()
+{
+    for_each_child([](const char* /*name*/, core::Module* child) {
+        if (child) {
+            child->prepare();
+        }
+    });
+
+    if (cls_token) {
+        EnsureFloatDtype(cls_token, config_.data_type);
+    }
+    if (position_embeddings) {
+        EnsureFloatDtype(position_embeddings, config_.data_type);
+    }
+}
+
+bool InternVitWeight::verify(std::vector<std::string>& missing)
+{
+    core::Module::verify(missing);
+    if (!patch_embed) {
+        missing.push_back(full_path() + ": missing patch_embed");
+    }
+    if (!cls_token) {
+        missing.push_back(full_path() + ": missing cls_token");
+    }
+    if (!position_embeddings) {
+        missing.push_back(full_path() + ": missing position_embeddings");
+    }
+    if (!blocks || blocks->size() != config_.depth) {
+        missing.push_back(full_path() + ": blocks count mismatch (expected " + std::to_string(config_.depth) + ")");
+    }
+    if (!projector_norm || !projector_fc1 || !projector_fc2) {
+        missing.push_back(full_path() + ": missing projector");
+    }
+    return missing.empty();
+}
+
+InternVitBlockWeight* InternVitWeight::block(int i) const
+{
+    if (!blocks) {
+        return nullptr;
+    }
+    return static_cast<InternVitBlockWeight*>(blocks->child(std::to_string(i)));
+}
+
+TM_MODULE_REGISTER(InternVitWeight, core::InternVitConfig);
+
+TM_MODULE_METHODS(InternVitWeight, INTERNVIT_WEIGHT_CHILDREN, INTERNVIT_WEIGHT_PARAMS)
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/internvit/internvit_weight.h b/src/turbomind/models/internvit/internvit_weight.h
new file mode 100644
index 0000000000..98d064a149
--- /dev/null
+++ b/src/turbomind/models/internvit/internvit_weight.h
@@ -0,0 +1,94 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/kernels/norm/norm.h"
+#include "src/turbomind/models/vision_model_weight.h"
+
+#include <vector>
+
+namespace turbomind::core {
+
+struct InternVitConfig: ModuleConfig {
+    InternVitConfig(): ModuleConfig{"InternVitWeight"} {}
+
+    DataType data_type{};
+    int      hidden_dim{0};
+    int      depth{0};
+    int      patch_in_dim{0};
+    int      in_channels{0};
+    int      image_height{0};
+    int      image_width{0};
+    int      patch_height{0};
+    int      patch_width{0};
+    int      num_patches{0};
+    int      image_seq_length{0};
+    NormType norm_type{NormType::kRMSNorm};
+
+#define INTERNVIT_FIELDS(X)                                                                                            \
+    X(DataType, data_type)                                                                                             \
+    X(int, hidden_dim)                                                                                                 \
+    X(int, depth)                                                                                                      \
+    X(int, patch_in_dim)                                                                                               \
+    X(int, in_channels)                                                                                                \
+    X(int, image_height)                                                                                               \
+    X(int, image_width)                                                                                                \
+    X(int, patch_height)                                                                                               \
+    X(int, patch_width)                                                                                                \
+    X(int, num_patches)                                                                                                \
+    X(int, image_seq_length)                                                                                           \
+    X(NormType, norm_type, NormType::kRMSNorm)
+
+    TM_FOR_EACH(InternVitConfig, INTERNVIT_FIELDS)
+
+#undef INTERNVIT_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+class InternVitBlockWeight;
+class LayerNormWeight;
+class LinearWeight;
+
+class InternVitWeight: public VisionModelWeight {
+public:
+    const char* type() const override
+    {
+        return "InternVitWeight";
+    }
+
+    InternVitWeight() = default;
+    explicit InternVitWeight(const core::InternVitConfig& cfg);
+
+    void prepare() override;
+    bool verify(std::vector<std::string>& missing) override;
+
+#define INTERNVIT_WEIGHT_CHILDREN(X)                                                                                   \
+    X(LinearWeight, patch_embed)                                                                                       \
+    X(core::ModuleList, blocks)                                                                                        \
+    X(LayerNormWeight, projector_norm)                                                                                 \
+    X(LinearWeight, projector_fc1)                                                                                     \
+    X(LinearWeight, projector_fc2)
+
+#define INTERNVIT_WEIGHT_PARAMS(X)                                                                                     \
+    X(cls_token)                                                                                                       \
+    X(position_embeddings)
+
+    TM_MODULE_DECLARE(InternVitWeight, INTERNVIT_WEIGHT_CHILDREN, INTERNVIT_WEIGHT_PARAMS)
+
+    const core::InternVitConfig& config() const noexcept
+    {
+        return config_;
+    }
+
+    InternVitBlockWeight* block(int i) const;
+
+private:
+    core::InternVitConfig config_{};
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/llama/llama_rope.h b/src/turbomind/models/llama/llama_rope.h
index 32a697204e..ec45bc103e 100644
--- a/src/turbomind/models/llama/llama_rope.h
+++ b/src/turbomind/models/llama/llama_rope.h
@@ -39,9 +39,9 @@ struct Llama3RopeKernelParam {
 struct MropeRopeKernelParam {
     int3 section;
 
-    int  stride{};
     int* position_ids{};
     int* position_delta{};
+    int* position_offsets{};
     int* length{};
 };
 
diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc
index cc0e529eb4..2f07791bce 100644
--- a/src/turbomind/models/llama/unified_attention_layer.cc
+++ b/src/turbomind/models/llama/unified_attention_layer.cc
@@ -68,8 +68,9 @@ struct AttentionData {
 
     Buffer_<float> rope_base;
 
-    Tensor_<int> mrope_position_ids;
+    Buffer_<int> mrope_position_ids;
     Buffer_<int> mrope_position_delta;
+    Buffer_<int> mrope_position_offsets;
     Buffer_<int> mrope_length;
 
     // borrowed from env
@@ -136,10 +137,8 @@ UnifiedAttentionLayer::UnifiedAttentionLayer(int                           quant
         rope_base_buf_ = {bsz + 1, kCPUpinned};
     }
     if (rope_param_.mrope_mode != MropeMode::kNone) {
-        // mrope device buffers are allocated lazily — borrowed from env when the vision encoder
-        // produced them, or owned (allocated in legacy_mrope_setup) when only r.inputs supplies them.
-        mrope_position_delta_buf_ = {bsz, kCPUpinned};
-        mrope_length_buf_         = {bsz, kCPUpinned};
+        mrope_default_buf_ = Buffer_<int>{std::max(bsz, 3), kDEVICE};
+        Clear(mrope_default_buf_);
     }
     const int max_blocks = bsz * cdiv(engine.session_len, engine_param_.cache_block_seq_len);
     for (int i = 0; i < phases; ++i) {
@@ -277,45 +276,26 @@ void UnifiedAttentionLayer::Setup(int phase, TensorMap& env)
         copy(rope_base_buf_, bsz, d.rope_base);
     }
     if (rope_param_.mrope_mode != MropeMode::kNone) {
-        // mrope tensors can come from two sources:
-        //   1. env: the C++ vision encoder produced device tensors in the exact layout
-        //      FastRoPE expects — borrow them with no copy.
-        //   2. r.inputs: legacy Python-preprocessor path, per-request shaped (length, 3) +
-        //      scalar delta. Falls back here when env did not produce mrope.
-        if (env.try_("mrope_length")) {
-            d.mrope_length         = env.at("mrope_length").buffer().borrow();
-            d.mrope_position_delta = env.at("mrope_position_delta").buffer().borrow();
-            d.mrope_position_ids   = env.at("mrope_position_ids").borrow();
+        auto* mrope_length           = env.try_("mrope_length");
+        auto* mrope_position_delta   = env.try_("mrope_position_delta");
+        auto* mrope_position_offsets = env.try_("mrope_position_offsets");
+        auto* mrope_position_ids     = env.try_("mrope_position_ids");
+        if (mrope_length || mrope_position_delta || mrope_position_offsets || mrope_position_ids) {
+            TM_CHECK(mrope_length) << "MRoPE requires native vision-produced mrope_length";
+            TM_CHECK(mrope_position_delta) << "MRoPE requires native vision-produced mrope_position_delta";
+            TM_CHECK(mrope_position_offsets) << "MRoPE requires native vision-produced mrope_position_offsets";
+            TM_CHECK(mrope_position_ids) << "MRoPE requires native vision-produced mrope_position_ids";
+
+            d.mrope_length           = mrope_length->buffer().borrow();
+            d.mrope_position_delta   = mrope_position_delta->buffer().borrow();
+            d.mrope_position_offsets = mrope_position_offsets->buffer().borrow();
+            d.mrope_position_ids     = mrope_position_ids->buffer().borrow();
         }
         else {
-            // Legacy r.inputs path. Lazily allocate owned device buffers on first hit.
-            if (!d.mrope_position_ids) {
-                /// TODO: total space for `mrope_position_ids` can be reduced to (max_fwd_tokens, 3)
-                d.mrope_position_ids =
-                    Tensor_<int>{{engine_param_.max_batch_size, engine_param_.session_len, 3}, kDEVICE};
-                d.mrope_position_delta = Buffer_<int>{engine_param_.max_batch_size, kDEVICE};
-                d.mrope_length         = Buffer_<int>{engine_param_.max_batch_size, kDEVICE};
-            }
-            const auto stride = d.mrope_position_ids.stride(0);
-            for (int i = 0; i < rc.size(); ++i) {
-                auto& c = *rc[i];
-                auto& r = *c.req;
-                if (auto pos_ids = r.inputs.try_("mrope_position_ids")) {
-                    int length                   = pos_ids->shape(0);
-                    mrope_length_buf_[i]         = length;
-                    mrope_position_delta_buf_[i] = *r.inputs.at("mrope_position_delta").data<int>();
-                    if (auto o = Interval{0, length} & Interval{c.history_len + c.alpha, Interval::Size{c.input_len}}) {
-                        copy(pos_ids->data<int>() + o.begin() * 3,
-                             (int)o.size() * 3,
-                             d.mrope_position_ids.data() + i * stride + o.begin() * 3);
-                    }
-                }
-                else {
-                    mrope_length_buf_[i] = mrope_position_delta_buf_[i] = 0;
-                }
-            }
-            copy(mrope_length_buf_, rc.size(), d.mrope_length);
-            copy(mrope_position_delta_buf_, rc.size(), d.mrope_position_delta);
+            d.mrope_length           = mrope_default_buf_.borrow();
+            d.mrope_position_delta   = mrope_default_buf_.borrow();
+            d.mrope_position_offsets = mrope_default_buf_.borrow();
+            d.mrope_position_ids     = mrope_default_buf_.borrow();
         }
     }
 }
@@ -522,11 +502,10 @@ Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p,
             params.rope_param.base = d.rope_base.data() + offset;
         }
         if (rope_param_.mrope_mode != MropeMode::kNone) {
-            params.rope_param.mrope.position_delta = d.mrope_position_delta.data() + offset;
-            params.rope_param.mrope.length         = d.mrope_length.data() + offset;
-            params.rope_param.mrope.stride         = d.mrope_position_ids.stride(0);
-            params.rope_param.mrope.position_ids =
-                d.mrope_position_ids.data() + offset * params.rope_param.mrope.stride;
+            params.rope_param.mrope.position_delta   = d.mrope_position_delta.data() + offset;
+            params.rope_param.mrope.position_offsets = d.mrope_position_offsets.data() + offset;
+            params.rope_param.mrope.length           = d.mrope_length.data() + offset;
+            params.rope_param.mrope.position_ids     = d.mrope_position_ids.data();
         }
 
         // logn attn
diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h
index 79c20d3115..ce2926245d 100644
--- a/src/turbomind/models/llama/unified_attention_layer.h
+++ b/src/turbomind/models/llama/unified_attention_layer.h
@@ -110,8 +110,7 @@ class UnifiedAttentionLayer {
     Tensor         tmp_attn_;
 
     Buffer_<float> rope_base_buf_;
-    Buffer_<int>   mrope_position_delta_buf_;
-    Buffer_<int>   mrope_length_buf_;
+    Buffer_<int>   mrope_default_buf_;
 
     CpPostContext cp_fn_ctx_;  // context parallel
 };
diff --git a/src/turbomind/models/qwen3_5vit/bias_gelu.cu b/src/turbomind/models/qwen3_5vit/bias_gelu.cu
deleted file mode 100644
index 3d6f9813a7..0000000000
--- a/src/turbomind/models/qwen3_5vit/bias_gelu.cu
+++ /dev/null
@@ -1,106 +0,0 @@
-#include "src/turbomind/models/qwen3_5vit/bias_gelu.h"
-
-#include "src/turbomind/core/logger.h"
-#include "src/turbomind/kernels/activation_ops.h"
-#include "src/turbomind/kernels/core/array_ops.h"
-#include "src/turbomind/kernels/core/common.h"
-
-#include <type_traits>
-
-namespace turbomind {
-
-namespace {
-
-template<int vec_size, typename T, template<typename> class Activation>
-__global__ void biasActivationKernel(T* data, const T* __restrict__ bias, int64_t stride, int num, int dim)
-{
-    const int ti = blockIdx.x;
-    const int di = (threadIdx.x + blockIdx.y * blockDim.x) * vec_size;
-
-    if (ti >= num || di >= dim) {
-        return;
-    }
-
-    Array<T, vec_size> x_vec;
-    Load(x_vec, data + ti * stride + di);
-
-    auto x = cast<float>(x_vec);
-
-    if (bias) {
-        Array<T, vec_size> bias_vec;
-        Ldg(bias_vec, bias + di);
-        using namespace ops;
-        x = x + cast<float>(bias_vec);
-    }
-
-    PRAGMA_UNROLL
-    for (int i = 0; i < vec_size; ++i) {
-        x[i] = Activation<float>::apply(x[i]);
-    }
-
-    Store(data + ti * stride + di, cast<T>(x));
-}
-
-}  // namespace
-
-void invokeQwen3_5VitBiasActivation(Tensor& x, const Tensor& bias, ActivationType type, cudaStream_t stream)
-{
-    if (x.size() == 0) {
-        return;
-    }
-
-    TM_CHECK_EQ(x.ndim(), 2);
-    if (bias) {
-        TM_CHECK_EQ(bias.shape(-1), x.shape(-1));
-        TM_CHECK_EQ(bias.dtype(), x.dtype());
-    }
-
-    auto invoke = [&](auto t) {
-        using T               = decltype(t);
-        constexpr int max_vec = sizeof(uint4) / sizeof(T);
-        constexpr int threads = 512;
-
-        const int     num    = x.shape(0);
-        const int     dim    = x.shape(1);
-        const int64_t stride = x.stride(0);
-
-        int best_vec_size = 1;
-        for (int v = max_vec; v >= 1; v >>= 1) {
-            if (dim % v == 0 && stride % v == 0) {
-                best_vec_size = v;
-                break;
-            }
-        }
-
-        auto launch = [&](auto vec_size_) {
-            constexpr int vec_size = decltype(vec_size_)::value;
-            const dim3    grid(num, cdiv(dim, threads * vec_size));
-            if (type == ActivationType::kGeluPytorchTanh) {
-                biasActivationKernel<vec_size, T, GeluActivation>
-                    <<<grid, threads, 0, stream>>>(x.data<T>(), bias.data_or((T*)nullptr), stride, num, dim);
-            }
-            else if (type == ActivationType::kGelu) {
-                biasActivationKernel<vec_size, T, GeluErfActivation>
-                    <<<grid, threads, 0, stream>>>(x.data<T>(), bias.data_or((T*)nullptr), stride, num, dim);
-            }
-            else {
-                TM_LOG_FATAL("unsupported Qwen3.5 ViT bias activation type: {}", (int)type);
-            }
-        };
-
-        switch (best_vec_size) {
-            case 8:
-                return launch(std::integral_constant<int, 8>{});
-            case 4:
-                return launch(std::integral_constant<int, 4>{});
-            case 2:
-                return launch(std::integral_constant<int, 2>{});
-            default:
-                return launch(std::integral_constant<int, 1>{});
-        }
-    };
-
-    TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke);
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/bias_gelu.h b/src/turbomind/models/qwen3_5vit/bias_gelu.h
deleted file mode 100644
index 9563c6c2ad..0000000000
--- a/src/turbomind/models/qwen3_5vit/bias_gelu.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include "src/turbomind/core/core.h"
-#include "src/turbomind/kernels/activation.h"
-
-#include <cuda_runtime.h>
-
-namespace turbomind {
-
-// In-place Qwen3.5 ViT bias + unary activation:
-// x <- activation(x + bias)
-void invokeQwen3_5VitBiasActivation(Tensor& x, const Tensor& bias, ActivationType type, cudaStream_t stream);
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/fast_pos_embed.cu b/src/turbomind/models/qwen3_5vit/fast_pos_embed.cu
deleted file mode 100644
index 020d825a84..0000000000
--- a/src/turbomind/models/qwen3_5vit/fast_pos_embed.cu
+++ /dev/null
@@ -1,137 +0,0 @@
-#include "src/turbomind/models/qwen3_5vit/fast_pos_embed.h"
-
-#include "src/turbomind/core/data_type.h"
-#include "src/turbomind/core/logger.h"
-#include "src/turbomind/kernels/core/array.h"
-#include "src/turbomind/kernels/core/array_ops.h"
-
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-
-namespace turbomind {
-
-namespace {
-
-template<typename T>
-__device__ inline T from_float(float x);
-
-template<>
-__device__ inline half from_float<half>(float x)
-{
-    return __float2half(x);
-}
-
-#ifdef ENABLE_BF16
-template<>
-__device__ inline __nv_bfloat16 from_float<__nv_bfloat16>(float x)
-{
-    return __float2bfloat16(x);
-}
-#endif
-
-// `num_grids` is tiny (usually 1..a few) so a linear scan is fine.
-__device__ inline int find_grid(const int* offsets, int num_grids, int pos)
-{
-    int g = 0;
-    for (int i = 1; i < num_grids; ++i) {
-        if (offsets[i * 2 + 1] <= pos) {
-            g = i;
-        }
-        else {
-            break;
-        }
-    }
-    return g;
-}
-
-template<typename T>
-__global__ void fastPosEmbedIdxWeightKernel(
-    int* idx_out, T* weight_out, const int* grid_thws, const int* grid_offsets, int num_grids, int total_n, int G)
-{
-    const int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    if (pos >= total_n) {
-        return;
-    }
-
-    const int g      = find_grid(grid_offsets, num_grids, pos);
-    const int grid_h = grid_thws[g * 3 + 1];
-    const int grid_w = grid_thws[g * 3 + 2];
-    const int local  = pos - grid_offsets[g * 2 + 1];
-    const int i      = local / grid_w;
-    const int j      = local % grid_w;
-
-    // torch.linspace(0, G-1, n) uses the halfway-symmetric formulation so
-    // that both endpoints are exact:
-    //   step      = (end - start) / (n - 1)
-    //   halfway   = n / 2
-    //   out[i<hw] = start + step * i
-    //   out[i>=hw]= end   - step * (n - 1 - i)
-    // For n == 1 the single element is `start` (== 0 here); the formula
-    // below collapses to 0 since hw_h == 0 is bypassed via grid_h > 1.
-    const float end    = (float)(G - 1);
-    const float step_h = (grid_h > 1) ? end / (float)(grid_h - 1) : 0.f;
-    const float step_w = (grid_w > 1) ? end / (float)(grid_w - 1) : 0.f;
-
-    const int hw_h = grid_h / 2;
-    const int hw_w = grid_w / 2;
-
-    const float h_val = (grid_h == 1) ? 0.f : ((i < hw_h) ? step_h * (float)i : end - step_h * (float)(grid_h - 1 - i));
-    const float w_val = (grid_w == 1) ? 0.f : ((j < hw_w) ? step_w * (float)j : end - step_w * (float)(grid_w - 1 - j));
-
-    // torch.Tensor.int() truncates toward zero; h_val, w_val are non-negative
-    // and bounded above by G-1, so (int) cast is in [0, G-1].
-    const int h_floor = (int)h_val;
-    const int w_floor = (int)w_val;
-    const int h_ceil  = min(h_floor + 1, G - 1);
-    const int w_ceil  = min(w_floor + 1, G - 1);
-
-    const float dh = h_val - (float)h_floor;
-    const float dw = w_val - (float)w_floor;
-
-    const int base_h      = h_floor * G;
-    const int base_h_ceil = h_ceil * G;
-
-    Array<int, 4> idx;
-    idx[0] = base_h + w_floor;
-    idx[1] = base_h + w_ceil;
-    idx[2] = base_h_ceil + w_floor;
-    idx[3] = base_h_ceil + w_ceil;
-
-    Array<T, 4> weight;
-    weight[0] = from_float<T>((1.f - dh) * (1.f - dw));
-    weight[1] = from_float<T>((1.f - dh) * dw);
-    weight[2] = from_float<T>(dh * (1.f - dw));
-    weight[3] = from_float<T>(dh * dw);
-
-    const int out_base = pos * 4;
-    Store(idx_out + out_base, idx);
-    Store(weight_out + out_base, weight);
-}
-
-}  // namespace
-
-void invokeFastPosEmbedIdxWeight(int*         idx_out,
-                                 void*        weight_out,
-                                 DataType     dtype,
-                                 const int*   grid_thws,
-                                 const int*   grid_offsets,
-                                 int          num_grids,
-                                 int          total_n,
-                                 int          num_grid_per_side,
-                                 cudaStream_t stream)
-{
-    if (total_n <= 0 || num_grids <= 0) {
-        return;
-    }
-    const int block = 256;
-    const int grid  = (total_n + block - 1) / block;
-
-    auto invoke = [&](auto t) {
-        using T = decltype(t);
-        fastPosEmbedIdxWeightKernel<T><<<grid, block, 0, stream>>>(
-            idx_out, (T*)weight_out, grid_thws, grid_offsets, num_grids, total_n, num_grid_per_side);
-    };
-    TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke);
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/fast_pos_embed.h b/src/turbomind/models/qwen3_5vit/fast_pos_embed.h
deleted file mode 100644
index 7949b01323..0000000000
--- a/src/turbomind/models/qwen3_5vit/fast_pos_embed.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#pragma once
-
-#include "src/turbomind/core/data_type.h"
-
-#include <cuda_runtime.h>
-
-namespace turbomind {
-
-// Precomputes the 4 bilinear-interpolation gather indices and weights
-// used by the subsequent pos-embed merge step in Qwen3-VL.
-void invokeFastPosEmbedIdxWeight(int*         idx_out,     // [total_n * 4]
-                                 void*        weight_out,  // [total_n * 4]
-                                 DataType     dtype,
-                                 const int*   grid_thws,     // [num_grids * 3], (t, h, w)
-                                 const int*   grid_offsets,  // [num_grids * 2], (t*h*w, h*w)
-                                 int          num_grids,
-                                 int          total_n,
-                                 int          num_grid_per_side,
-                                 cudaStream_t stream);
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.cu b/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.cu
deleted file mode 100644
index ee19483cdd..0000000000
--- a/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.cu
+++ /dev/null
@@ -1,92 +0,0 @@
-#include "src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h"
-
-#include "src/turbomind/core/logger.h"
-#include "src/turbomind/kernels/core/array_ops.h"
-
-namespace turbomind {
-
-namespace {
-
-// `num_grids` is tiny (usually 1..a few) so a linear scan is fine.
-__device__ inline int find_grid(const int* offsets, int num_grids, int pos)
-{
-    int g = 0;
-    for (int i = 1; i < num_grids; ++i) {
-        if (offsets[i * 2 + 1] <= pos) {
-            g = i;
-        }
-        else {
-            break;
-        }
-    }
-    return g;
-}
-
-template<typename T>
-__global__ void fastRotaryPosEmbKernel(T*         cos_sin_out,
-                                       const int* grid_thws,
-                                       const int* grid_offsets,
-                                       int        num_grids,
-                                       int        total_hw,
-                                       int        head_dim,
-                                       float      scale)  // -log2(theta) / (head_dim/4)
-{
-    const int pair_count = head_dim / 2;  // e.g. 36
-    const int freq_half  = head_dim / 4;  // e.g. 18
-
-    const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
-    const int pos    = tid / pair_count;
-    const int pair_k = tid % pair_count;
-    if (pos >= total_hw) {
-        return;
-    }
-
-    const int g      = find_grid(grid_offsets, num_grids, pos);
-    const int grid_w = grid_thws[g * 3 + 2];
-    const int local  = pos - grid_offsets[g * 2 + 1];
-    const int i      = local / grid_w;  // h_coord
-    const int j      = local % grid_w;  // w_coord
-
-    // Pairs [0, freq_half) rotate in h; pairs [freq_half, 2*freq_half) rotate in w.
-    const int   freq_idx = pair_k % freq_half;
-    const int   coord    = (pair_k < freq_half) ? i : j;
-    const float inv_freq = exp2f((float)freq_idx * scale);
-
-    float c, s;
-    sincosf((float)coord * inv_freq, &s, &c);
-
-    Array<T, 2> cs{(T)c, (T)s};
-    Store(cos_sin_out + (size_t)pos * head_dim + pair_k * 2, cs);
-}
-
-}  // namespace
-
-void invokeQwen3VitRotaryPosEmb(void*        cos_sin,
-                                DataType     dtype,
-                                const int*   grid_thws,
-                                const int*   grid_offsets,
-                                int          num_grids,
-                                int          total_hw,
-                                int          head_dim,
-                                float        theta,
-                                cudaStream_t stream)
-{
-    if (total_hw <= 0 || num_grids <= 0 || head_dim <= 0) {
-        return;
-    }
-    TM_CHECK(head_dim % 4 == 0) << "head_dim must be divisible by 4, got " << head_dim;
-
-    const int   total = total_hw * (head_dim / 2);
-    const int   block = 256;
-    const int   grid  = (total + block - 1) / block;
-    const float scale = -log2f(theta) / (float)(head_dim / 4);
-
-    auto invoke = [&](auto t) {
-        using T = decltype(t);
-        fastRotaryPosEmbKernel<T>
-            <<<grid, block, 0, stream>>>((T*)cos_sin, grid_thws, grid_offsets, num_grids, total_hw, head_dim, scale);
-    };
-    TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke);
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h b/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h
deleted file mode 100644
index 920e8b6409..0000000000
--- a/src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#pragma once
-
-#include "src/turbomind/core/data_type.h"
-
-#include <cuda_runtime.h>
-
-namespace turbomind {
-
-// Precomputes the (cos, sin) rotary-embedding table for Qwen3-VL vision tokens.
-// Layout per natural flat position (keyed by the same index `mapped_idx` carries):
-//   [c_0, s_0, c_1, s_1, ..., c_{head_dim/2-1}, s_{head_dim/2-1}]
-// The pair index `k` uses `h_coord` for k < head_dim/4 and `w_coord` otherwise,
-// with inv_freq = theta^(-2*(k % (head_dim/4)) / (head_dim/2)).
-void invokeQwen3VitRotaryPosEmb(void*        cos_sin,  // [total_hw, head_dim]
-                                DataType     dtype,
-                                const int*   grid_thws,     // [num_grids * 3], (t, h, w)
-                                const int*   grid_offsets,  // [num_grids * 2], (t*h*w, h*w)
-                                int          num_grids,
-                                int          total_hw,
-                                int          head_dim,
-                                float        theta,
-                                cudaStream_t stream);
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/fused_embed_merge.cu b/src/turbomind/models/qwen3_5vit/fused_embed_merge.cu
deleted file mode 100644
index dc62548e9f..0000000000
--- a/src/turbomind/models/qwen3_5vit/fused_embed_merge.cu
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "src/turbomind/models/qwen3_5vit/fused_embed_merge.h"
-
-#include "src/turbomind/core/logger.h"
-#include "src/turbomind/kernels/core/array_ops.h"
-
-namespace turbomind {
-
-namespace {
-
-template<typename T, int N>
-__device__ Array<float, N> roundToStorageDtype(Array<float, N> x)
-{
-    return cast<float>(cast<T>(x));
-}
-
-template<int vec_size, typename T>
-__global__ void fusedPosEmbedMergeKernel(T*         hidden_states,
-                                         const T*   pos_embeds,
-                                         const T*   pos_embed_weights,
-                                         const int* mapped_idx,
-                                         const T*   bias,
-                                         int        hidden,
-                                         int        vdim)
-{
-    const int index  = blockIdx.x;
-    const int mapped = mapped_idx[index];  // same address for all threads in block -> L1 broadcast
-
-    Array<T, 4> w4;
-    Ldg(w4, pos_embed_weights + mapped * 4);
-
-    const int row_off = index * hidden;
-    const int pe_row0 = mapped * 4 * hidden;
-
-    using namespace ops;
-    for (int d = threadIdx.x; d < vdim; d += blockDim.x) {
-        Array<float, vec_size> pos{};
-        Array<T, vec_size>     tmp;
-        Load(tmp, hidden_states + row_off + d * vec_size);
-        auto hidden_acc = cast<float>(tmp);
-
-        if (bias) {
-            Ldg(tmp, bias + d * vec_size);
-            hidden_acc = roundToStorageDtype<T>(hidden_acc + cast<float>(tmp));
-        }
-        PRAGMA_UNROLL
-        for (int k = 0; k < 4; ++k) {
-            Ldg(tmp, pos_embeds + pe_row0 + k * hidden + d * vec_size);
-            pos = pos + cast<float>(tmp * w4[k]);
-        }
-        const auto out = hidden_acc + roundToStorageDtype<T>(pos);
-        Store(hidden_states + row_off + d * vec_size, cast<T>(out));
-    }
-}
-
-}  // namespace
-
-void invokeFusedPosEmbedMerge(void*        hidden_states,
-                              const void*  pos_embeds,
-                              const void*  pos_embed_weights,
-                              const int*   mapped_idx,
-                              const void*  bias,
-                              int          batch,
-                              int          hidden,
-                              DataType     dtype,
-                              cudaStream_t stream)
-{
-    if (batch <= 0) {
-        return;
-    }
-
-    const dim3 grid(batch);
-    const dim3 block(128);
-
-    auto invoke = [&](auto t) {
-        using T                = decltype(t);
-        constexpr int vec_size = sizeof(uint4) / sizeof(T);
-        TM_CHECK(hidden % vec_size == 0);
-        fusedPosEmbedMergeKernel<vec_size, T><<<grid, block, 0, stream>>>((T*)hidden_states,
-                                                                          (const T*)pos_embeds,
-                                                                          (const T*)pos_embed_weights,
-                                                                          mapped_idx,
-                                                                          (const T*)bias,
-                                                                          hidden,
-                                                                          hidden / vec_size);
-    };
-    TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke);
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/fused_embed_merge.h b/src/turbomind/models/qwen3_5vit/fused_embed_merge.h
deleted file mode 100644
index dc077fe58f..0000000000
--- a/src/turbomind/models/qwen3_5vit/fused_embed_merge.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "src/turbomind/core/data_type.h"
-
-#include <cuda_runtime.h>
-
-namespace turbomind {
-
-// Fuses the spatial-merge permutation, the bilinear-weighted sum, and the
-// t-expansion of Qwen3-VL ViT pos_embed interpolation into a single pass on
-// top of the patch_embed linear output.
-void invokeFusedPosEmbedMerge(void*        hidden_states,      // [batch, hidden]
-                              const void*  pos_embeds,         // [total_hw * 4, hidden]
-                              const void*  pos_embed_weights,  // [total_hw * 4]
-                              const int*   mapped_idx,         // [batch]
-                              const void*  bias,               // [hidden] or nullptr
-                              int          batch,
-                              int          hidden,
-                              DataType     dtype,
-                              cudaStream_t stream);
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/mrope_position_ids.cu b/src/turbomind/models/qwen3_5vit/mrope_position_ids.cu
deleted file mode 100644
index 2c16224c9a..0000000000
--- a/src/turbomind/models/qwen3_5vit/mrope_position_ids.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "src/turbomind/models/qwen3_5vit/mrope_position_ids.h"
-
-namespace turbomind {
-
-namespace {
-
-constexpr int kBlock = 128;
-
-__global__ void mropeScatterKernel(int* pos_ids, int row_stride, const MropeSegment* __restrict__ segs)
-{
-    const MropeSegment s       = segs[blockIdx.x];
-    const int          local_k = blockIdx.y * blockDim.x + threadIdx.x;
-    if (local_k >= s.n_tok) {
-        return;
-    }
-    int* dst = pos_ids + s.dst_row * row_stride + 3 * (s.dst_offset + local_k);
-    if (s.h2 == 0) {  // text run
-        const int p = s.base_pos + local_k;
-        dst[0]      = p;
-        dst[1]      = p;
-        dst[2]      = p;
-    }
-    else {  // image run — grid math uses the original (un-clipped) k
-        const int k  = s.k_offset + local_k;
-        const int hw = s.h2 * s.w2;
-        dst[0]       = s.base_pos + k / hw;
-        dst[1]       = s.base_pos + (k / s.w2) % s.h2;
-        dst[2]       = s.base_pos + k % s.w2;
-    }
-}
-
-}  // namespace
-
-void invokeMropePositionIds(
-    int* pos_ids, int row_stride, const MropeSegment* segments, int num_segments, int max_seg_len, cudaStream_t stream)
-{
-    if (num_segments <= 0 || max_seg_len <= 0) {
-        return;
-    }
-    const int  tiles = (max_seg_len + kBlock - 1) / kBlock;
-    const dim3 grid((unsigned)num_segments, (unsigned)tiles);
-    mropeScatterKernel<<<grid, kBlock, 0, stream>>>(pos_ids, row_stride, segments);
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/mrope_position_ids.h b/src/turbomind/models/qwen3_5vit/mrope_position_ids.h
deleted file mode 100644
index 59352bdff0..0000000000
--- a/src/turbomind/models/qwen3_5vit/mrope_position_ids.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#pragma once
-
-#include <cuda_runtime.h>
-
-namespace turbomind {
-
-// One descriptor per text / image run, clipped to a prefill chunk's active window.
-// `h2 == 0` flags a text run (real image grids always have h2 > 0).
-struct MropeSegment {
-    int dst_row;     // batch slot in the (bsz, max_active_end, 3) output table
-    int dst_offset;  // absolute seq index of the first token written by this segment
-    int n_tok;       // tokens to write (already clipped to the active range)
-    int base_pos;    // text: position id at local_k = 0; image: image's mm_start
-    int h2;          // image grid h after spatial merge (0 ⇒ text)
-    int w2;          // image grid w after spatial merge (ignored when h2 == 0)
-    int k_offset;    // starting "k" for image grid math (clip-offset within the run); unused for text
-};
-
-// Scatter `num_segments` segments into `pos_ids` of shape (bsz, row_stride/3, 3).
-// `pos_ids` may be null when num_segments == 0.
-void invokeMropePositionIds(int*                pos_ids,
-                            int                 row_stride,  // = max_active_end * 3
-                            const MropeSegment* segments,    // device
-                            int                 num_segments,
-                            int                 max_seg_len,
-                            cudaStream_t        stream);
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/qkv_preprocess.cu b/src/turbomind/models/qwen3_5vit/qkv_preprocess.cu
deleted file mode 100644
index a2dbfbedfd..0000000000
--- a/src/turbomind/models/qwen3_5vit/qkv_preprocess.cu
+++ /dev/null
@@ -1,234 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "src/turbomind/models/qwen3_5vit/qkv_preprocess.h"
-
-#include "src/turbomind/core/logger.h"
-#include "src/turbomind/kernels/core/array_ops.h"
-#include "src/turbomind/kernels/core/common.h"
-
-#include <type_traits>
-
-namespace turbomind {
-
-namespace {
-
-constexpr int kWarpsPerBlock = 4;
-
-// Per-head_dim launch traits. Adding a new head_dim is a single specialization here.
-template<int HD>
-struct HeadConfig;
-
-template<>
-struct HeadConfig<64> {
-    static constexpr int kVecSize      = 8;
-    static constexpr int kHeadsPerWarp = 4;
-};
-
-template<>
-struct HeadConfig<72> {
-    static constexpr int kVecSize      = 8;
-    static constexpr int kHeadsPerWarp = 3;
-};
-
-template<>
-struct HeadConfig<128> {
-    static constexpr int kVecSize      = 8;  // 128 / 8 = 16 vec/head
-    static constexpr int kHeadsPerWarp = 2;  // 16 * 2 = 32 == WARP_SIZE
-};
-
-template<int VecSize, typename T>
-__device__ __forceinline__ void add_bias(Array<float, VecSize>& x, const T* bias)
-{
-    Array<T, VecSize> b;
-    Ldg(b, bias);
-    using namespace ops;
-    x = x + cast<float>(b);
-}
-
-// Rotate adjacent (x[2k], x[2k+1]) pairs using cos/sin packed as
-// rope[2k]=cos, rope[2k+1]=sin (see fast_rotary_pos_emb.cu:56-58).
-template<int VecSize, typename T>
-__device__ __forceinline__ void apply_rope_pair(Array<float, VecSize>& x, const Array<T, VecSize>& rope)
-{
-    auto cs = cast<float>(rope);
-    PRAGMA_UNROLL
-    for (int i = 0; i < VecSize; i += 2) {
-        const float x0 = x[i];
-        const float x1 = x[i + 1];
-        const float c  = cs[i];
-        const float s  = cs[i + 1];
-        x[i]           = c * x0 - s * x1;
-        x[i + 1]       = c * x1 + s * x0;
-    }
-}
-
-// Load `src`, add `bias`, apply RoPE, store to `dst` (may equal `src` for in-place).
-template<int VecSize, typename T>
-__device__ __forceinline__ void fuse_bias_rope_store(T* dst, const T* src, const T* bias, const Array<T, VecSize>& rope)
-{
-    Array<T, VecSize> x_vec;
-    Load(x_vec, src);
-    auto x = cast<float>(x_vec);
-    add_bias(x, bias);
-    apply_rope_pair(x, rope);
-    Store(dst, cast<T>(x));
-}
-
-// V-path: bias only, no RoPE.
-template<int VecSize, typename T>
-__device__ __forceinline__ void fuse_bias_store(T* dst, const T* src, const T* bias)
-{
-    Array<T, VecSize> x_vec;
-    Load(x_vec, src);
-    auto x = cast<float>(x_vec);
-    add_bias(x, bias);
-    Store(dst, cast<T>(x));
-}
-
-template<typename T, int HD>
-__global__ __launch_bounds__(kWarpsPerBlock* WARP_SIZE) void prepareQKVKernel(T* __restrict__ qkv,
-                                                                              T* __restrict__ kv,
-                                                                              const T* __restrict__ bias,
-                                                                              const T* __restrict__ rotary_pos_emb,
-                                                                              const int* __restrict__ mapped_idx,
-                                                                              int token_num,
-                                                                              int local_head_num,
-                                                                              int head_group_num,
-                                                                              int rope_head_dim)
-{
-    using Cfg                   = HeadConfig<HD>;
-    constexpr int kVecSize      = Cfg::kVecSize;
-    constexpr int kHeadsPerWarp = Cfg::kHeadsPerWarp;
-    constexpr int kVecPerHead   = HD / kVecSize;
-    static_assert(HD % kVecSize == 0);
-    static_assert(kVecPerHead * kHeadsPerWarp <= WARP_SIZE);
-
-    const int warp_id   = threadIdx.x / WARP_SIZE;
-    const int lane      = threadIdx.x - warp_id * WARP_SIZE;
-    const int head_slot = lane / kVecPerHead;
-    if (head_slot >= kHeadsPerWarp) {
-        return;
-    }
-
-    const int global_warp = blockIdx.x * kWarpsPerBlock + warp_id;
-    const int total_warps = token_num * head_group_num;
-    if (global_warp >= total_warps) {
-        return;
-    }
-
-    const int token_idx  = global_warp / head_group_num;
-    const int head_group = global_warp - token_idx * head_group_num;
-    const int head_idx   = head_group * kHeadsPerWarp + head_slot;
-    if (head_idx >= local_head_num) {
-        return;
-    }
-
-    const int vec_idx = lane - head_slot * kVecPerHead;
-    const int di      = vec_idx * kVecSize;
-
-    // QKV per-token layout: [Q_heads | K_heads | V_heads], head_num == kv_head_num for ViT.
-    const int64_t qkv_stride = (int64_t)local_head_num * 3 * HD;
-    T* const      q_ptr      = qkv + (int64_t)token_idx * qkv_stride + head_idx * HD + di;
-    const T*      k_ptr      = q_ptr + (int64_t)local_head_num * HD;
-    const T*      v_ptr      = k_ptr + (int64_t)local_head_num * HD;
-
-    const T* q_bias = bias + head_idx * HD + di;
-    const T* k_bias = q_bias + local_head_num * HD;
-    const T* v_bias = k_bias + local_head_num * HD;
-
-    // K/V destination in transposed [kv_head, 2, token, head_dim] layout.
-    T* const k_dst = kv + ((int64_t)head_idx * 2 * token_num + token_idx) * HD + di;
-    T* const v_dst = k_dst + (int64_t)token_num * HD;
-
-    // rope[token, di] is shared between Q and K — load once, reuse twice.
-    // When HD > rope_head_dim, padded di-slices have zero Q/K, so loading a
-    // zero rope_vec there is correct (and avoids OOB on the [N, rope_head_dim]
-    // buffer). kVecSize is aligned to rope_head_dim so each vec is fully in or
-    // fully out of the rope range.
-    Array<T, kVecSize> rope_vec{};
-    if (di < rope_head_dim) {
-        Ldg(rope_vec, rotary_pos_emb + (int64_t)mapped_idx[token_idx] * rope_head_dim + di);
-    }
-
-    fuse_bias_rope_store<kVecSize>(q_ptr, q_ptr, q_bias, rope_vec);  // Q: in-place
-    fuse_bias_rope_store<kVecSize>(k_dst, k_ptr, k_bias, rope_vec);  // K: transposed
-    fuse_bias_store<kVecSize>(v_dst, v_ptr, v_bias);                 // V: transposed, no RoPE
-}
-
-template<typename T>
-void dispatchPrepareQKV(T*           qkv,
-                        T*           kv,
-                        const T*     qkv_bias,
-                        const T*     rotary_pos_emb,
-                        const int*   mapped_idx,
-                        int          token_num,
-                        int          local_head_num,
-                        int          head_dim,
-                        int          rope_head_dim,
-                        cudaStream_t stream)
-{
-    auto invoke = [&](auto hd_c) {
-        constexpr int HD = decltype(hd_c)::value;
-        using Cfg        = HeadConfig<HD>;
-
-        // Each vec_size-wide load must lie entirely in or out of the rope range.
-        TM_CHECK(rope_head_dim % Cfg::kVecSize == 0)
-            << "rope_head_dim (" << rope_head_dim << ") must be a multiple of kVecSize (" << Cfg::kVecSize << ")";
-        TM_CHECK(rope_head_dim <= HD) << "rope_head_dim (" << rope_head_dim << ") cannot exceed head_dim (" << HD
-                                      << ")";
-
-        const int head_group_num = (local_head_num + Cfg::kHeadsPerWarp - 1) / Cfg::kHeadsPerWarp;
-        const int total_warps    = token_num * head_group_num;
-        dim3      grid((total_warps + kWarpsPerBlock - 1) / kWarpsPerBlock);
-        prepareQKVKernel<T, HD><<<grid, kWarpsPerBlock * WARP_SIZE, 0, stream>>>(
-            qkv, kv, qkv_bias, rotary_pos_emb, mapped_idx, token_num, local_head_num, head_group_num, rope_head_dim);
-    };
-
-    switch (head_dim) {
-        case 64:
-            return invoke(std::integral_constant<int, 64>{});
-        case 72:
-            return invoke(std::integral_constant<int, 72>{});
-        case 128:
-            return invoke(std::integral_constant<int, 128>{});
-        default:
-            TM_LOG_FATAL("unsupported Qwen3.5 ViT head_dim for qkv preprocess: {}", head_dim);
-    }
-}
-
-}  // namespace
-
-void invokeQwen3_5VitPrepareQKV(void*        qkv,
-                                void*        kv,
-                                const void*  qkv_bias,
-                                const void*  rotary_pos_emb,
-                                const int*   mapped_idx,
-                                DataType     dtype,
-                                int          token_num,
-                                int          local_head_num,
-                                int          head_dim,
-                                int          rope_head_dim,
-                                cudaStream_t stream)
-{
-    if (token_num == 0) {
-        return;
-    }
-
-    auto invoke = [&](auto t) {
-        using T = decltype(t);
-        dispatchPrepareQKV((T*)qkv,
-                           (T*)kv,
-                           (const T*)qkv_bias,
-                           (const T*)rotary_pos_emb,
-                           mapped_idx,
-                           token_num,
-                           local_head_num,
-                           head_dim,
-                           rope_head_dim,
-                           stream);
-    };
-
-    TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke);
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/qkv_preprocess.h b/src/turbomind/models/qwen3_5vit/qkv_preprocess.h
deleted file mode 100644
index da1c2a84cb..0000000000
--- a/src/turbomind/models/qwen3_5vit/qkv_preprocess.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#pragma once
-
-#include "src/turbomind/core/data_type.h"
-
-#include <cuda_runtime.h>
-
-namespace turbomind {
-
-// Prepare the Qwen3.5 ViT attention inputs after the fused QKV projection.
-//
-// qkv layout:
-//   [token, local_q_heads + 2 * local_kv_heads, head_dim]
-// Q is updated in place with bias + RoPE. K/V are written to `kv` as:
-//   [local_kv_heads, 2, token, head_dim]
-//
-// `rope_head_dim` is the per-head dim of the rotary_pos_emb buffer and is
-// also the cutoff below which RoPE is applied. When the model's real head_dim
-// is not natively supported by the attention kernel, Q/K/V are zero-padded
-// per-head to a kernel-supported `head_dim` >= `rope_head_dim`; the padded
-// `[rope_head_dim, head_dim)` slice has zero Q/K so RoPE is skipped there.
-void invokeQwen3_5VitPrepareQKV(void*        qkv,
-                                void*        kv,
-                                const void*  qkv_bias,
-                                const void*  rotary_pos_emb,
-                                const int*   mapped_idx,
-                                DataType     dtype,
-                                int          token_num,
-                                int          local_head_num,
-                                int          head_dim,
-                                int          rope_head_dim,
-                                cudaStream_t stream);
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit.cc b/src/turbomind/models/qwen3_5vit/qwen3_5vit.cc
deleted file mode 100644
index 547206c31e..0000000000
--- a/src/turbomind/models/qwen3_5vit/qwen3_5vit.cc
+++ /dev/null
@@ -1,800 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit.h"
-
-#include "src/turbomind/core/logger.h"
-#include "src/turbomind/core/scope.h"
-#include "src/turbomind/kernels/attention/attention.h"
-#include "src/turbomind/kernels/gpt_kernels.h"
-#include "src/turbomind/kernels/norm/layer_norm.h"
-#include "src/turbomind/kernels/norm/rms_norm.h"
-#include "src/turbomind/models/layer_norm_weight.h"
-#include "src/turbomind/models/llama/SequenceManager.h"
-#include "src/turbomind/models/qwen3_5vit/bias_gelu.h"
-#include "src/turbomind/models/qwen3_5vit/fast_pos_embed.h"
-#include "src/turbomind/models/qwen3_5vit/fast_rotary_pos_emb.h"
-#include "src/turbomind/models/qwen3_5vit/fused_embed_merge.h"
-#include "src/turbomind/models/qwen3_5vit/mrope_position_ids.h"
-#include "src/turbomind/models/qwen3_5vit/qkv_preprocess.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h"
-#include "src/turbomind/utils/cuda_utils.h"
-#include "src/turbomind/utils/memory_utils.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstddef>
-#include <cstring>
-#include <fstream>
-#include <functional>
-#include <numeric>
-#include <string>
-
-namespace turbomind {
-
-struct Qwen3_5Vit::Impl {
-    const Qwen3_5VitWeight&       weights_;
-    const core::Qwen3_5VitConfig& config_;
-    int                           phases_;
-    LlamaLinear&                  linear_;
-    const comm::HostComm&         h_tp_group;
-    comm::DeviceCommImpl* const   d_comm_;
-    const int                     tp_group_;
-    const DataType                engine_data_type_;
-
-    Buffer_<int> grid_thws_buf_;     // (t, h, w)
-    Buffer_<int> grid_offsets_buf_;  // (t*h*w, h*w)
-    Buffer_<int> mapped_idx_buf_;    // [batch]
-    Buffer_<int> attn_cu_seqlens_buf_;
-
-    struct Data {
-        Tensor                           batch_input;
-        int                              batch_size;
-        std::vector<std::array<int, 3>>  grid_thws_host;
-        std::vector<std::pair<int, int>> image_embeds_coords;  // (size, pos) for image embeddings
-        std::vector<std::pair<int, int>> input_embeds_coords;  // (size, pos) for input embeddings
-
-        // for fast_pos_embed
-        Tensor_<int> grid_thws;
-        Tensor_<int> grid_offsets;
-        Tensor_<int> mapped_idx;
-        int          total_hw;
-
-        // for full attention, one sequence per temporal frame
-        Tensor_<int>  attn_cu_seqlens;
-        Tensor_<bool> attn_finished;
-        int           attn_batch_size;
-        int           max_attn_len;
-
-        // mrope position-id scratch (per-phase pinned host buffers for fast H2D)
-        Buffer_<int> mrope_segs_host;  // reinterpreted as MropeSegment[], 7 ints per segment
-        Buffer_<int> mrope_length_host;
-        Buffer_<int> mrope_delta_host;
-
-        // mrope outputs — owned here so UnifiedAttentionLayer can safely borrow() across env clears.
-        Buffer_<int> mrope_segs_dev;        // device-side segment scratch, grown alongside host
-        Tensor_<int> mrope_position_ids;    // (bsz, max_active_end, 3), empty when no slot needs table
-        Tensor_<int> mrope_length;          // (bsz,)
-        Tensor_<int> mrope_position_delta;  // (bsz,)
-
-        void Clear()
-        {
-            batch_size      = 0;
-            total_hw        = 0;
-            attn_batch_size = 0;
-            max_attn_len    = 0;
-            grid_thws_host.clear();
-            image_embeds_coords.clear();
-            input_embeds_coords.clear();
-        }
-    };
-
-    static constexpr int kMropeSegInts = sizeof(MropeSegment) / sizeof(int);
-    static_assert(sizeof(MropeSegment) % sizeof(int) == 0);
-
-    std::vector<Data> data_;
-
-    Impl(const EngineParam& engine, const Context& ctx, const Qwen3_5VitWeight& weights, int phases):
-        weights_{weights},
-        config_{weights.config()},
-        phases_{phases},
-        linear_{*ctx.linear},
-        h_tp_group{ctx.comm.h_comm},
-        d_comm_{ctx.comm.d_comm},
-        tp_group_{ctx.comm.d_tp_group},
-        engine_data_type_{engine.data_type}
-    {
-        auto& cfg = weights.config();
-        for (int i = 0; i < phases; ++i) {
-            auto& d             = data_.emplace_back();
-            d.batch_input       = {{engine.max_forward_token_num, cfg.patch_in_dim}, cfg.data_type, kCPUpinned};
-            d.mrope_length_host = {engine.max_batch_size, kCPUpinned};
-            d.mrope_delta_host  = {engine.max_batch_size, kCPUpinned};
-            // Generous initial capacity: typical batches emit << bsz * 8 segments. Lazily grown below.
-            d.mrope_segs_host = {engine.max_batch_size * 8 * (ssize_t)kMropeSegInts, kCPUpinned};
-
-            // mrope outputs at worst-case shape so Setup() never reallocates them. Rows beyond
-            // the current bsz (and rows whose length[i] == 0) are stale but unreachable from
-            // FastRoPE — it only reads position_ids[3 * timestep] when `timestep < length[i]`.
-            d.mrope_length         = Tensor_<int>{{engine.max_batch_size}, kDEVICE};
-            d.mrope_position_delta = Tensor_<int>{{engine.max_batch_size}, kDEVICE};
-            d.mrope_position_ids   = Tensor_<int>{{engine.max_batch_size, engine.session_len, 3}, kDEVICE};
-        }
-
-        // should be large enough to hold all patches
-        grid_thws_buf_       = {engine.max_forward_token_num * 3, kCPUpinned};
-        grid_offsets_buf_    = {engine.max_forward_token_num * 2, kCPUpinned};
-        mapped_idx_buf_      = {engine.max_forward_token_num, kCPUpinned};
-        attn_cu_seqlens_buf_ = {engine.max_forward_token_num + 1, kCPUpinned};
-    }
-
-    void AllReduceSum(Tensor& tensor, cudaStream_t stream) const
-    {
-        if (d_comm_) {
-            d_comm_->AllReduceSum(
-                tensor.raw_data(), tensor.raw_data(), tensor.size(), tensor.dtype(), tp_group_, stream);
-            TM_CUDA_CHECK(cudaGetLastError());
-        }
-    }
-
-    void FastPosEmbedInterpolate(Data& d, TensorMap& env)
-    {
-        auto& cfg    = weights_.config();
-        auto  stream = core::Context::stream().handle();
-
-        const int num_grid_per_side = (int)std::sqrt(cfg.num_position_embeddings);
-        TM_CHECK_EQ(num_grid_per_side * num_grid_per_side, cfg.num_position_embeddings);
-        TM_CHECK_EQ(weights_.pos_embed.shape(0), cfg.num_position_embeddings);
-        TM_CHECK_EQ(weights_.pos_embed.shape(1), cfg.hidden_dim);
-
-        Buffer_<int> pos_embed_idx     = {d.total_hw * 4, kDEVICE};
-        Tensor       pos_embed_weights = {{d.total_hw, 4}, cfg.data_type, kDEVICE};
-        invokeFastPosEmbedIdxWeight(pos_embed_idx.data(),
-                                    pos_embed_weights.raw_data(),
-                                    cfg.data_type,
-                                    d.grid_thws.data(),
-                                    d.grid_offsets.data(),
-                                    d.grid_thws.shape(0),
-                                    d.total_hw,
-                                    num_grid_per_side,
-                                    stream);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        Tensor pos_embeds = {{d.total_hw * 4, cfg.hidden_dim}, cfg.data_type, kDEVICE};
-        invokeEmbeddingLookup(pos_embeds, pos_embed_idx, weights_.pos_embed, stream);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        env.produce("pos_embeds", pos_embeds);
-        env.produce("pos_embed_weights", pos_embed_weights);
-    }
-
-    void RotPosEmb(Data& d, TensorMap& env)
-    {
-        auto& cfg = weights_.config();
-
-        const int head_dim = cfg.hidden_dim / cfg.head_num;
-        // produce rotary_pos_emb: [total_hw, head_dim] with interleaved (c,s,c,s,...) pairs,
-        // keyed by the same natural flat index that `mapped_idx` already carries. Vision q/k
-        // are reordered into this adjacent-pair layout at export time.
-        Tensor rotary_pos_emb = {{d.total_hw, head_dim}, cfg.data_type, kDEVICE};
-        invokeQwen3VitRotaryPosEmb(rotary_pos_emb.raw_data(),
-                                   cfg.data_type,
-                                   d.grid_thws.data(),
-                                   d.grid_offsets.data(),
-                                   d.grid_thws.shape(0),
-                                   d.total_hw,
-                                   head_dim,
-                                   /*theta=*/10000.0f,
-                                   core::Context::stream().handle());
-        TM_CUDA_CHECK(cudaGetLastError());
-        env.produce("rotary_pos_emb", rotary_pos_emb);
-    }
-
-    Tensor PatchEmbedding(Data& d)
-    {
-        Tensor host_input = d.batch_input.slice(0, d.batch_size);
-        Tensor input      = empty_like(host_input, kDEVICE);
-
-        Copy(host_input, input);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        Tensor output;
-        TM_SCOPE_CALL(linear_.Forward(input, *weights_.patch_embed, output));
-        return output;
-    }
-
-    int Add(RequestCache& c)
-    {
-        const auto& [r, s] = std::tie(*c.req, *c.seq);
-        if (r.mm_inputs) {
-            if ((not r.session.start_flag) or (not r.session.end_flag)) {
-                // only support non-interactive inference
-                return Request::kInvalid;
-            }
-
-            const auto mm_inputs = std::dynamic_pointer_cast<multimodal::Qwen3_5VitInput>(r.mm_inputs);
-            if (!mm_inputs) {
-                return Request::kInvalid;
-            }
-
-            for (const auto& item : mm_inputs->items) {
-                if (item.modality != multimodal::Modality::kImage && item.modality != multimodal::Modality::kVideo) {
-                    return Request::kInvalid;
-                }
-
-                const int tokens = item.token_end - item.token_begin;
-                if (tokens <= 0) {
-                    return Request::kInvalid;
-                }
-
-                auto mm_item = std::make_shared<MultiModalData>(
-                    MultiModalData{item.data, Interval{item.token_begin, Interval::Size{tokens}}, item.grid_thw});
-                s.multimodal_inputs.push_back(mm_item);
-            }
-        }
-
-        return Request::kOk;
-    }
-
-    void Add(int phase, TensorMap& env)
-    {
-        // convert model-specific multimodal inputs to internal MultiModalData
-        const Buffer_<RequestCache*> rc = env.at("requests").buffer();
-        for (int i = 0; i < rc.size(); ++i) {
-            auto& c = *TM_CHECK_NOTNULL(rc[i]);
-            if (c.status == 0) {
-                c.status = Add(c);
-            }
-        }
-    }
-
-    // Build the mrope tensors consumed by `UnifiedAttentionLayer` and publish them to env.
-    //
-    // Per-request layout: one row in `(max_batch_size, session_len, 3)` for each slot. Prefill
-    // slots with multimodal_inputs get their active range written by `invokeMropePositionIds`
-    // from a clipped list of MropeSegment descriptors (one per text/image run). All other slots
-    // (decode + text-only prefill) get `length[i] = 0` so FastRoPE falls through to the closed-
-    // form `timestep + delta` path and never reads the stale row.
-    //
-    // The three output tensors live on `Data` (allocated worst-case in the ctor). env shares
-    // ownership via shared_ptr; UAL borrows safely across env clears.
-    void SetupMrope(int phase, TensorMap& env)
-    {
-        auto& d  = data_.at(phase);
-        auto& b  = *env.at("batch").data<BatchData*>()[0];
-        auto& rc = b.rc;
-
-        const int bsz = (int)rc.size();
-        if (bsz <= 0) {
-            return;
-        }
-
-        const int S = weights_.config().spatial_merge_size;
-
-        // 1) One pass to upper-bound segment count, then size host + device scratch in one shot.
-        //    Worst case per prefill slot with mrope: 2*num_images + 1 segments.
-        int upper_segs = 0;
-        for (int i = 0; i < bsz; ++i) {
-            const auto& c = *rc[i];
-            if (!c.autoregres && !c.seq->multimodal_inputs.empty()) {
-                upper_segs += 2 * (int)c.seq->multimodal_inputs.size() + 1;
-            }
-        }
-        const ssize_t upper_ints = (ssize_t)upper_segs * kMropeSegInts;
-        if (upper_ints > d.mrope_segs_host.size()) {
-            core::ContextGuard ctx{Allocator{kCPUpinned}};
-            d.mrope_segs_host = Buffer_<int>{upper_ints, kCPUpinned};
-        }
-        if (upper_ints > d.mrope_segs_dev.size()) {
-            d.mrope_segs_dev = Buffer_<int>{upper_ints, kDEVICE};
-        }
-
-        // 2) Unified per-request walk — always advance mm_off; emit segments only for needs_table.
-        auto* segs        = reinterpret_cast<MropeSegment*>(d.mrope_segs_host.data());
-        int   n_segs      = 0;
-        int   max_seg_len = 0;
-
-        for (int i = 0; i < bsz; ++i) {
-            const auto& c            = *rc[i];
-            const auto& s            = *c.seq;
-            const int   seq_len      = (int)c.req->inputs.at("input_ids").shape(0);
-            const bool  needs_table  = !c.autoregres && !s.multimodal_inputs.empty();
-            const int   active_start = c.history_len + c.alpha;
-            const int   active_end   = active_start + c.input_len;
-
-            auto emit = [&](int run_start, int run_n, int run_base, int h2, int w2) {
-                const int a = std::max(run_start, active_start);
-                const int b = std::min(run_start + run_n, active_end);
-                if (a >= b) {
-                    return;
-                }
-                const int local_off = a - run_start;
-                segs[n_segs++]      = MropeSegment{
-                    i,
-                    a,
-                    b - a,
-                    /*base_pos=*/(h2 == 0) ? run_base + local_off : run_base,
-                    h2,
-                    w2,
-                    /*k_offset=*/(h2 == 0) ? 0 : local_off,
-                };
-                max_seg_len = std::max(max_seg_len, b - a);
-            };
-
-            int row = 0, pos = 0, mm_off = 0;
-            for (const auto& mm : s.multimodal_inputs) {
-                const auto& [t, h, w] = mm->grid_thw;
-                const int h2 = h / S, w2 = w / S, n_tok = t * h2 * w2;
-                TM_CHECK_EQ(n_tok, (int)mm->interval.size()) << "image token count mismatches grid_thw";
-                const int img_start = mm->interval.begin();
-                const int img_base  = img_start + mm_off;
-                if (needs_table) {
-                    if (img_start > row) {
-                        emit(row, img_start - row, pos, /*h2=*/0, /*w2=*/0);
-                    }
-                    emit(img_start, n_tok, img_base, h2, w2);
-                }
-                row               = img_start + n_tok;
-                const int new_pos = std::max({t, h2, w2});
-                pos               = img_base + new_pos;
-                mm_off += new_pos - n_tok;
-            }
-            if (needs_table && row < seq_len) {
-                emit(row, seq_len - row, pos, /*h2=*/0, /*w2=*/0);
-            }
-
-            d.mrope_length_host.data()[i] = needs_table ? seq_len : 0;
-            d.mrope_delta_host.data()[i]  = mm_off;
-        }
-
-        // 3) Copy the bsz prefix of length / delta into the pre-allocated device tensors.
-        //    Rows beyond bsz are untouched (UAL never reads them).
-        Copy(d.mrope_length_host.slice(0, bsz), d.mrope_length.buffer().slice(0, bsz));
-        Copy(d.mrope_delta_host.slice(0, bsz), d.mrope_position_delta.buffer().slice(0, bsz));
-
-        // 4) Populate position_ids only when a slot actually needs the table. Rows for slots
-        //    with length[i] == 0 are unreachable from FastRoPE, so leaving them stale is safe.
-        if (n_segs > 0) {
-            const ssize_t segs_ints = (ssize_t)n_segs * kMropeSegInts;
-            Copy(d.mrope_segs_host.slice(0, segs_ints), d.mrope_segs_dev.slice(0, segs_ints));
-            invokeMropePositionIds(d.mrope_position_ids.data(),
-                                   (int)d.mrope_position_ids.stride(0),
-                                   reinterpret_cast<const MropeSegment*>(d.mrope_segs_dev.data()),
-                                   n_segs,
-                                   max_seg_len,
-                                   core::Context::stream().handle());
-            TM_CUDA_CHECK(cudaGetLastError());
-        }
-
-        // 5) Publish all three — the consumer relies on this contract unconditionally.
-        env.produce("mrope_length", d.mrope_length);
-        env.produce("mrope_position_delta", d.mrope_position_delta);
-        env.produce("mrope_position_ids", d.mrope_position_ids);
-    }
-
-    void Setup(int phase, TensorMap& env)
-    {
-        // create batch data according to scheduled sequences
-        auto& d    = data_.at(phase);
-        auto& b    = *env.at("batch").data<BatchData*>()[0];
-        auto& copy = *env.at("copy").data<BatchCopy*>()[0];
-        auto& cfg  = weights_.config();
-
-        int input_ids_offsets    = 0;
-        int image_embeds_offsets = 0;
-        d.Clear();
-        std::vector<Tensor> pixel_values;
-
-        // collect image/video pixel values, grid_thws and embeds_coords
-        const auto& rc = b.rc;
-        for (int i = 0; i < rc.size(); ++i) {
-            const auto& c = *rc[i];
-            const auto& s = *c.seq;
-
-            if ((not c.autoregres) && (not s.multimodal_inputs.empty())) {
-                Interval text{c.history_len + c.alpha, Interval::Size{c.input_len}};
-                for (const auto& mm : s.multimodal_inputs) {
-                    auto o = mm->interval & text;
-                    if (auto size = (int)o.size()) {
-                        pixel_values.push_back(mm->data);
-                        d.batch_size += mm->data.shape(0);
-
-                        const int text_offset  = input_ids_offsets + o.begin() - text.begin();
-                        const int image_offset = image_embeds_offsets + o.begin() - mm->interval.begin();
-                        d.input_embeds_coords.emplace_back(size, text_offset);
-                        d.image_embeds_coords.emplace_back(size, image_offset);
-
-                        auto& grid_thw = mm->grid_thw;
-                        d.grid_thws_host.emplace_back(grid_thw);
-                        auto prod = std::accumulate(grid_thw.begin(), grid_thw.end(), 1, std::multiplies<int>());
-                        image_embeds_offsets += (prod / cfg.spatial_merge_size / cfg.spatial_merge_size);
-                    }
-                }
-            }
-
-            input_ids_offsets += c.autoregres ? 1 : c.input_len;
-        }
-
-        // copy pixel values to batch input
-        if (d.batch_size > 0) {
-            if (d.batch_size > d.batch_input.shape(0)) {
-                core::ContextGuard ctx{Allocator{kCPUpinned}};
-                d.batch_input = {{d.batch_size, cfg.patch_in_dim}, cfg.data_type, kCPUpinned};
-            }
-            ssize_t batch_offset = 0;
-            for (const auto& pixel_value : pixel_values) {
-                TM_CHECK_EQ(pixel_value.size(), pixel_value.shape(0) * cfg.patch_in_dim);
-                TM_CHECK_EQ(pixel_value.dtype(), d.batch_input.dtype());
-                Copy(pixel_value, d.batch_input.slice(batch_offset, pixel_value.shape(0)));
-                batch_offset += pixel_value.shape(0);
-            }
-            TM_CHECK_EQ(batch_offset, d.batch_size);
-        }
-
-        // setup fast_pos_embed
-        if (const int num_grids = (int)d.grid_thws_host.size(); num_grids > 0) {
-            for (const auto& [t, h, w] : d.grid_thws_host) {
-                d.attn_batch_size += t;
-                d.max_attn_len = std::max(d.max_attn_len, h * w);
-            }
-            if (grid_thws_buf_.size() < (ssize_t)num_grids * 3) {
-                core::ContextGuard ctx{Allocator{kCPUpinned}};
-                grid_thws_buf_    = Buffer_<int>{num_grids * 3, kCPUpinned};
-                grid_offsets_buf_ = Buffer_<int>{num_grids * 2, kCPUpinned};
-            }
-            if (mapped_idx_buf_.size() < d.batch_size) {
-                core::ContextGuard ctx{Allocator{kCPUpinned}};
-                mapped_idx_buf_ = Buffer_<int>{d.batch_size, kCPUpinned};
-            }
-            if (attn_cu_seqlens_buf_.size() < (ssize_t)d.attn_batch_size + 1) {
-                core::ContextGuard ctx{Allocator{kCPUpinned}};
-                attn_cu_seqlens_buf_ = Buffer_<int>{d.attn_batch_size + 1, kCPUpinned};
-            }
-            d.grid_thws       = {{num_grids, 3}, kDEVICE};
-            d.grid_offsets    = {{num_grids, 2}, kDEVICE};
-            d.mapped_idx      = {{d.batch_size}, kDEVICE};
-            d.attn_cu_seqlens = {{d.attn_batch_size + 1}, kDEVICE};
-            d.attn_finished   = {{d.attn_batch_size}, kDEVICE};
-
-            std::pair<int, int> offset{};
-            int                 attn_seq_idx            = 0;
-            int                 attn_offset             = 0;
-            attn_cu_seqlens_buf_.data()[attn_seq_idx++] = 0;
-            for (int i = 0; i < num_grids; ++i) {
-                const auto& [t, h, w] = d.grid_thws_host[i];
-                TM_CHECK(h % cfg.spatial_merge_size == 0);
-                TM_CHECK(w % cfg.spatial_merge_size == 0);
-                const int hw = h * w;
-                for (int tt = 0; tt < t; ++tt) {
-                    attn_offset += hw;
-                    attn_cu_seqlens_buf_.data()[attn_seq_idx++] = attn_offset;
-                }
-
-                grid_thws_buf_.data()[i * 3]        = t;
-                grid_thws_buf_.data()[i * 3 + 1]    = h;
-                grid_thws_buf_.data()[i * 3 + 2]    = w;
-                grid_offsets_buf_.data()[i * 2]     = offset.first;
-                grid_offsets_buf_.data()[i * 2 + 1] = offset.second;
-
-                // compute mapped_idx
-                TM_CHECK(offset.first + t * h * w <= d.batch_size);
-                const int S   = cfg.spatial_merge_size;
-                int*      buf = mapped_idx_buf_.data();
-                int       pos = offset.first;
-                for (int h_outer = 0; h_outer < h / S; ++h_outer) {
-                    for (int w_outer = 0; w_outer < w / S; ++w_outer) {
-                        for (int h_inner = 0; h_inner < S; ++h_inner) {
-                            for (int w_inner = 0; w_inner < S; ++w_inner) {
-                                const int ii = h_outer * S + h_inner;
-                                const int jj = w_outer * S + w_inner;
-                                buf[pos++]   = offset.second + ii * w + jj;
-                            }
-                        }
-                    }
-                }
-                for (int tt = 1; tt < t; ++tt) {
-                    std::memcpy(buf + offset.first + tt * hw, buf + offset.first, hw * sizeof(int));
-                }
-                pos = offset.first + t * hw;
-                TM_CHECK_EQ(pos, offset.first + t * h * w);
-                offset.first += t * h * w;
-                offset.second += h * w;
-            }
-            TM_CHECK_EQ(offset.first, d.batch_size);
-            TM_CHECK_EQ(attn_offset, d.batch_size);
-            TM_CHECK_EQ(attn_seq_idx, d.attn_batch_size + 1);
-            d.total_hw = offset.second;
-            copy(grid_thws_buf_.data(), num_grids * 3, d.grid_thws.data());
-            copy(grid_offsets_buf_.data(), num_grids * 2, d.grid_offsets.data());
-            copy(mapped_idx_buf_.data(), d.batch_size, d.mapped_idx.data());
-            copy(attn_cu_seqlens_buf_.data(), d.attn_batch_size + 1, d.attn_cu_seqlens.data());
-            Clear(d.attn_finished);
-        }
-
-        SetupMrope(phase, env);
-        h_tp_group->Sync();
-    }
-
-    void Prepare(int phase, TensorMap& env)
-    {
-        auto& d = data_.at(phase);
-        if (d.batch_size == 0) {
-            return;
-        }
-
-        // produce non-merged pos-embeds and weights
-        FastPosEmbedInterpolate(d, env);
-
-        // produce rotary_pos_emb
-        RotPosEmb(d, env);
-    }
-
-    void Forward(int phase, TensorMap& args)
-    {
-        auto& d = data_.at(phase);
-        if (d.batch_size == 0) {
-            return;
-        }
-
-        auto& cfg = weights_.config();
-
-        // 1) patch_embed (Linear without bias, the bias will be folded into the fused kernel)
-        auto residual          = PatchEmbedding(d);
-        auto pos_embeds        = args.consume("pos_embeds");
-        auto pos_embed_weights = args.consume("pos_embed_weights");
-        auto rotary_pos_emb    = args.consume("rotary_pos_emb");
-        auto stream            = core::Context::stream().handle();
-
-        // 2) fused pos-embed gather/merge:
-        // residual[pos, d] += Σ_k w[k] * pos_embeds[mapped*4+k, d] + bias[d]
-        invokeFusedPosEmbedMerge(residual.raw_data(),
-                                 pos_embeds.raw_data(),
-                                 pos_embed_weights.raw_data(),
-                                 d.mapped_idx.data(),
-                                 weights_.patch_embed->bias ? weights_.patch_embed->bias.raw_data() : nullptr,
-                                 d.batch_size,
-                                 cfg.hidden_dim,
-                                 cfg.data_type,
-                                 stream);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        // 3) decoder
-        Tensor hidden_states = [&]() {
-            Buffer symm_buf = args.contains("symm_buf") ? args.at("symm_buf").buffer() : Buffer{};
-            if (symm_buf && d.batch_size * cfg.hidden_dim <= symm_buf.size() / turbomind::byte_size(cfg.data_type)) {
-                return Tensor{symm_buf.view(cfg.data_type), {d.batch_size, cfg.hidden_dim}};
-            }
-            else {
-                return Tensor{{d.batch_size, cfg.hidden_dim}, cfg.data_type, kDEVICE};
-            }
-        }();
-
-        invokeLayerNorm(hidden_states,
-                        residual,
-                        weights_.block(0)->norm1->weight,
-                        weights_.block(0)->norm1->bias,
-                        weights_.block(0)->norm1->norm_eps_,
-                        stream);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        for (int layer_id = 0; layer_id < cfg.depth; ++layer_id) {
-
-            // attn
-            auto invoke = [&](auto t) {
-                using T = decltype(t);
-                Attn<T>(hidden_states, hidden_states, d, layer_id, rotary_pos_emb);
-            };
-            TM_DISPATCH_PRIMARY_DTYPES(hidden_states.dtype(), invoke);
-
-            AllReduceSum(hidden_states, stream);
-
-            auto* block = weights_.block(layer_id);
-            invokeResidualBiasLayerNorm(hidden_states.raw_data(),
-                                        residual.raw_data(),
-                                        block->norm2->weight.raw_data(),
-                                        block->norm2->bias.data_or((void*)nullptr),
-                                        block->attention->wo->bias.data_or((void*)nullptr),
-                                        hidden_states.dtype(),
-                                        cfg.hidden_dim,
-                                        d.batch_size,
-                                        block->norm2->norm_eps_,
-                                        stream);
-            TM_CUDA_CHECK(cudaGetLastError());
-
-            // mlp
-            Mlp(hidden_states, hidden_states, d, layer_id);
-            AllReduceSum(hidden_states, stream);
-
-            const auto* next_norm =
-                layer_id + 1 < cfg.depth ? weights_.block(layer_id + 1)->norm1.get() : weights_.merger_norm.get();
-            TM_CHECK_NOTNULL(next_norm);
-            invokeResidualBiasLayerNorm(hidden_states.raw_data(),
-                                        residual.raw_data(),
-                                        next_norm->weight.raw_data(),
-                                        next_norm->bias.data_or((void*)nullptr),
-                                        block->mlp_fc2->bias.data_or((void*)nullptr),
-                                        hidden_states.dtype(),
-                                        cfg.hidden_dim,
-                                        d.batch_size,
-                                        next_norm->norm_eps_,
-                                        stream);
-            TM_CUDA_CHECK(cudaGetLastError());
-        }
-
-        Tensor image_embeds = Merger(hidden_states);
-
-        // ViT may run in its own dtype (e.g. bf16) while the text engine runs
-        // in fp16 (AWQ-forced). PatchMultimodalEmbedding merges this buffer
-        // into the text embedding stream via a byte-level copy, so the dtypes
-        // must match before publishing.
-        EnsureFloatDtype(image_embeds, engine_data_type_);
-
-        args.produce("multimodal",
-                     MultiModalEmbeddingData{image_embeds, d.image_embeds_coords, d.input_embeds_coords}.buf());
-    }
-
-    template<typename T>
-    AttentionParams<T> CreateVitAttentionParams(
-        Tensor& attn_context, Tensor& qkv, Tensor& kv, Data& d, const AttentionWeight& attn, int layer_id)
-    {
-        const int local_head_num = attn.head_num / attn.tp_size;
-        const int head_dim       = attn.head_dim;
-        const int token_num      = d.batch_size;
-
-        AttentionParams<T> params{};
-        params.out = (T*)attn_context.raw_data();
-        params.q   = (T*)qkv.raw_data();
-
-        params.stride = (int64_t)local_head_num * 3 * head_dim;
-
-        params.cu_q_len = d.attn_cu_seqlens.data();
-        params.cu_k_len = d.attn_cu_seqlens.data();
-        params.finished = d.attn_finished.data();
-
-        params.linear_iter_params = LinearIteratorParams{
-            kv.raw_data(),
-            2 * token_num * head_dim,
-            token_num * head_dim,
-        };
-
-        params.token_num  = token_num;
-        params.batch_size = d.attn_batch_size;
-        params.max_q_len  = d.max_attn_len;
-        params.max_k_len  = d.max_attn_len;
-
-        params.num_heads     = local_head_num;
-        params.num_kv_heads  = local_head_num;
-        params.size_per_head = head_dim;
-        params.causal        = false;
-        params.layer_id      = layer_id;
-
-        double scaling = 1.;
-        if (attn.softmax_scale) {
-            scaling *= attn.softmax_scale;
-        }
-        else {
-            scaling /= std::sqrt((float)head_dim);
-        }
-        params.inv_sqrt_dh = scaling * std::log2(std::exp(1.));
-
-        params.window_size     = 0;
-        params.rope_param.type = RopeType::kNull;
-        params.max_split_k     = 1;
-        params.cp_size         = 1;
-        params.stream          = core::Context::stream().handle();
-
-        return params;
-    }
-
-    template<typename T>
-    void Attn(Tensor& input, Tensor& output, Data& d, int layer_id, const Tensor& rotary_pos_emb)
-    {
-        auto& vit_cfg = weights_.config();
-        auto* attn    = weights_.block(layer_id)->attention.get();
-
-        Tensor qkv;
-        TM_SCOPE_CALL(linear_.Forward(input, *attn->w_qkv, qkv));
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        const int local_head_num = attn->head_num / attn->tp_size;
-        const int head_dim       = attn->head_dim;                         // may be padded
-        const int rope_head_dim  = vit_cfg.hidden_dim / vit_cfg.head_num;  // model's real per-head dim
-        const int token_num      = d.batch_size;
-
-        Tensor tmp_kv{{local_head_num, 2, d.batch_size, head_dim}, qkv.dtype(), qkv.device()};
-        invokeQwen3_5VitPrepareQKV(qkv.raw_data(),
-                                   tmp_kv.raw_data(),
-                                   attn->w_qkv->bias.raw_data(),
-                                   rotary_pos_emb.raw_data(),
-                                   d.mapped_idx.data(),
-                                   qkv.dtype(),
-                                   token_num,
-                                   local_head_num,
-                                   head_dim,
-                                   rope_head_dim,
-                                   core::Context::stream().handle());
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        Tensor attn_output{{token_num, local_head_num * head_dim}, qkv.dtype(), qkv.device()};
-        auto   params = CreateVitAttentionParams<T>(attn_output, qkv, tmp_kv, d, *attn, layer_id);
-        dispatchAttention<T>(params);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        TM_SCOPE_CALL(linear_.Forward(attn_output, *attn->wo, output));
-        TM_CUDA_CHECK(cudaGetLastError());
-    }
-
-    void Mlp(Tensor& input, Tensor& output, Data& d, int layer_id)
-    {
-        auto* block  = weights_.block(layer_id);
-        auto  stream = core::Context::stream().handle();
-
-        TM_CHECK(block);
-        TM_CHECK_EQ(input.shape(0), d.batch_size);
-        TM_CHECK_EQ(input.shape(1), config_.hidden_dim);
-
-        Tensor inter;
-        TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_fc1, inter));
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        invokeQwen3_5VitBiasActivation(inter, block->mlp_fc1->bias, ActivationType::kGeluPytorchTanh, stream);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        TM_SCOPE_CALL(linear_.Forward(inter, *block->mlp_fc2, output));
-        TM_CUDA_CHECK(cudaGetLastError());
-    }
-
-    Tensor Merger(Tensor& input)
-    {
-        auto& cfg    = config_;
-        auto  stream = core::Context::stream().handle();
-
-        const int merge_area   = cfg.spatial_merge_size * cfg.spatial_merge_size;
-        Tensor    merged_input = input.view({-1, cfg.hidden_dim * merge_area});
-
-        Tensor inter;
-        TM_SCOPE_CALL(linear_.Forward(merged_input, *weights_.merger_fc1, inter));
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        invokeQwen3_5VitBiasActivation(inter, weights_.merger_fc1->bias, ActivationType::kGelu, stream);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        Tensor output;
-        TM_SCOPE_CALL(linear_.Forward(inter, *weights_.merger_fc2, output));
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        AllReduceSum(output, stream);
-
-        ApplyBias(output, weights_.merger_fc2->bias, stream);
-        TM_CUDA_CHECK(cudaGetLastError());
-
-        return output;
-    }
-};
-
-Qwen3_5Vit::Qwen3_5Vit(const EngineParam& engine, const Context& ctx, const Qwen3_5VitWeight& weights, int phases):
-    impl_{std::make_unique<Impl>(engine, ctx, weights, phases)}
-{
-}
-
-Qwen3_5Vit::~Qwen3_5Vit() = default;
-
-void Qwen3_5Vit::Run(BatchOp op, int phase, TensorMap& env)
-{
-    TM_FUNCTION_SCOPE();
-    switch (op) {
-        case BatchOp::kAdd:
-            return impl_->Add(phase, env);
-        case BatchOp::kSetup:
-            return impl_->Setup(phase, env);
-        case BatchOp::kPrepare:
-            return impl_->Prepare(phase, env);
-        case BatchOp::kForward:
-            return impl_->Forward(phase, env);
-        default:
-            return;
-    }
-}
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit.h b/src/turbomind/models/qwen3_5vit/qwen3_5vit.h
deleted file mode 100644
index 8c2e94d612..0000000000
--- a/src/turbomind/models/qwen3_5vit/qwen3_5vit.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-#pragma once
-
-#include <memory>
-
-#include "src/turbomind/core/core.h"
-#include "src/turbomind/engine/batch.h"
-#include "src/turbomind/models/llama/context.h"
-#include "src/turbomind/models/llama/llama_params.h"
-#include "src/turbomind/models/vision_model.h"
-
-namespace turbomind {
-
-class Qwen3_5VitWeight;
-
-/// Concrete ``VisionModel`` for the Qwen3.5 ViT encoder.
-///
-/// This task only stubs the runtime: the phase methods log a debug
-/// breadcrumb and return. Follow-up work fills in the actual ViT
-/// kernels (patcher → blocks → merger → caching of image embeddings).
-class Qwen3_5Vit: public VisionModel {
-public:
-    Qwen3_5Vit(const EngineParam& engine, const Context& ctx, const Qwen3_5VitWeight& weights, int phases);
-
-    ~Qwen3_5Vit() override;
-
-    void Run(BatchOp op, int phase, TensorMap& env) override;
-
-private:
-    struct Impl;
-    std::unique_ptr<Impl> impl_;
-};
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h b/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h
deleted file mode 100644
index 7c2ee35652..0000000000
--- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright (c) OpenMMLab. All rights reserved.
-#pragma once
-
-#include "src/turbomind/core/core.h"
-#include "src/turbomind/core/module.h"
-#include "src/turbomind/models/vision_model_weight.h"
-
-#include <vector>
-
-namespace turbomind::core {
-
-/// Root config for Qwen3.5 ViT. Carries every structural scalar the
-/// C++ runtime needs to allocate kernels later (depth, head_num,
-/// patcher dims, …). Each field is visited by ``for_each`` via the
-/// X-macro below — pybind11 then exposes every field as a read/write
-/// attribute on the Python ``Qwen3_5VitConfig``.
-struct Qwen3_5VitConfig: ModuleConfig {
-    Qwen3_5VitConfig(): ModuleConfig{"Qwen3_5VitWeight"} {}
-
-    DataType data_type{};
-    int      hidden_dim{0};
-    int      out_hidden_dim{0};
-    int      depth{0};
-    int      head_num{0};
-    int      intermediate_size{0};
-    int      patch_in_dim{0};
-    int      in_channels{0};
-    int      patch_size{0};
-    int      temporal_patch_size{0};
-    int      num_position_embeddings{0};
-    int      spatial_merge_size{0};
-    float    norm_eps{1e-6f};
-
-#define QWEN3_5VIT_FIELDS(X)                                                                                           \
-    X(DataType, data_type)                                                                                             \
-    X(int, hidden_dim)                                                                                                 \
-    X(int, out_hidden_dim)                                                                                             \
-    X(int, depth)                                                                                                      \
-    X(int, head_num)                                                                                                   \
-    X(int, intermediate_size)                                                                                          \
-    X(int, patch_in_dim)                                                                                               \
-    X(int, in_channels)                                                                                                \
-    X(int, patch_size)                                                                                                 \
-    X(int, temporal_patch_size)                                                                                        \
-    X(int, num_position_embeddings)                                                                                    \
-    X(int, spatial_merge_size)                                                                                         \
-    X(float, norm_eps, 1e-6f)
-
-    TM_FOR_EACH(Qwen3_5VitConfig, QWEN3_5VIT_FIELDS)
-
-#undef QWEN3_5VIT_FIELDS
-};
-
-}  // namespace turbomind::core
-
-namespace turbomind {
-
-// Forward decls
-class LayerNormWeight;
-class LinearWeight;
-class Qwen3_5VitBlockWeight;
-
-/// Concrete Qwen3.5 ViT weight tree.
-///
-/// Tree:
-///   patch_embed   LinearWeight (Conv3d-as-Linear; in_dim = C·T·patch²)
-///   pos_embed     raw tensor   (num_position_embeddings × hidden_dim)
-///   blocks        ModuleList   of Qwen3_5VitBlockWeight × depth
-///   merger_fc1    LinearWeight (in: hidden·spatial_merge², out: 4·hidden)
-///   merger_fc2    LinearWeight (in: 4·hidden,              out: out_hidden)
-///   merger_norm   LayerNormWeight (over hidden_dim)
-///
-/// We expose ``merger_*`` as direct children rather than a sub-module to
-/// keep the weight tree shallow — the merger has only three pieces.
-class Qwen3_5VitWeight: public VisionModelWeight {
-public:
-    const char* type() const override
-    {
-        return "Qwen3_5VitWeight";
-    }
-
-    Qwen3_5VitWeight() = default;
-    explicit Qwen3_5VitWeight(const core::Qwen3_5VitConfig& cfg);
-
-    void prepare() override;
-    bool verify(std::vector<std::string>& missing) override;
-
-    // --- X-macro field lists ---
-#define QWEN3_5VIT_WEIGHT_CHILDREN(X)                                                                                  \
-    X(LinearWeight, patch_embed)                                                                                       \
-    X(core::ModuleList, blocks)                                                                                        \
-    X(LinearWeight, merger_fc1)                                                                                        \
-    X(LinearWeight, merger_fc2)                                                                                        \
-    X(LayerNormWeight, merger_norm)
-
-#define QWEN3_5VIT_WEIGHT_PARAMS(X) X(pos_embed)
-
-    TM_MODULE_DECLARE(Qwen3_5VitWeight, QWEN3_5VIT_WEIGHT_CHILDREN, QWEN3_5VIT_WEIGHT_PARAMS)
-
-    // --- Accessors ---
-    const core::Qwen3_5VitConfig& config() const noexcept
-    {
-        return config_;
-    }
-
-    Qwen3_5VitBlockWeight* block(int i) const;
-
-private:
-    core::Qwen3_5VitConfig config_{};
-};
-
-}  // namespace turbomind
diff --git a/src/turbomind/models/qwenvit/qwenvit.cc b/src/turbomind/models/qwenvit/qwenvit.cc
new file mode 100644
index 0000000000..1c58cef8ab
--- /dev/null
+++ b/src/turbomind/models/qwenvit/qwenvit.cc
@@ -0,0 +1,1024 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#include "src/turbomind/models/qwenvit/qwenvit.h"
+
+#include "src/turbomind/core/logger.h"
+#include "src/turbomind/core/scope.h"
+#include "src/turbomind/kernels/activation.h"
+#include "src/turbomind/kernels/attention/attention.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/kernels/norm/layer_norm.h"
+#include "src/turbomind/kernels/norm/rms_norm.h"
+#include "src/turbomind/models/layer_norm_weight.h"
+#include "src/turbomind/models/llama/SequenceManager.h"
+#include "src/turbomind/models/norm_weight.h"
+#include "src/turbomind/models/qwenvit/qwenvit_block_weight.h"
+#include "src/turbomind/models/qwenvit/qwenvit_input.h"
+#include "src/turbomind/models/qwenvit/qwenvit_kernels.h"
+#include "src/turbomind/models/qwenvit/qwenvit_weight.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace turbomind {
+
+struct QwenVit::Impl {
+    const QwenVitWeight&        weights_;
+    const core::QwenVitConfig&  config_;
+    LlamaLinear&                linear_;
+    const comm::HostComm&       h_tp_group;
+    comm::DeviceCommImpl* const d_comm_;
+    const int                   tp_group_;
+    const DataType              engine_data_type_;
+
+    Buffer_<int> grid_thws_buf_;     // (t, h, w)
+    Buffer_<int> grid_offsets_buf_;  // (token offset, natural offset)
+    Buffer_<int> window_idx_buf_;
+    Buffer_<int> cu_window_seqlens_buf_;
+    Buffer_<int> attn_cu_seqlens_buf_;
+
+    struct Data {
+        Tensor                           batch_input;
+        int                              batch_size;
+        std::vector<std::array<int, 3>>  grid_thws_host;
+        std::vector<std::pair<int, int>> image_embeds_coords;  // (size, pos) for image embeddings
+        std::vector<std::pair<int, int>> input_embeds_coords;  // (size, pos) for input embeddings
+
+        // for RoPE / pos-embed interpolation
+        Tensor_<int> grid_thws;
+        Tensor_<int> grid_offsets;
+        Tensor_<int> mapped_idx;
+        Tensor_<int> window_idx;
+        Tensor_<int> window_mapped_idx;
+        int          total_hw;
+        int          merge_unit_count;
+
+        // for full attention, one sequence per temporal frame
+        Tensor_<int>  attn_cu_seqlens;
+        Tensor_<int>  cu_window_seqlens;
+        Tensor_<bool> attn_finished;
+        Tensor_<bool> window_attn_finished;
+        int           attn_batch_size;
+        int           max_attn_len;
+        int           window_attn_batch_size;
+        int           max_window_attn_len;
+
+        // mrope position-id scratch (per-phase pinned host buffers for fast H2D)
+        Buffer_<int> mrope_segs_host;  // reinterpreted as MropeSegment[], kMropeSegInts ints per segment
+        Buffer_<int> mrope_length_host;
+        Buffer_<int> mrope_delta_host;
+        Buffer_<int> mrope_offsets_host;
+
+        // mrope outputs - owned here so UnifiedAttentionLayer can safely borrow() across env clears.
+        Buffer_<int> mrope_segs_dev;          // device-side segment scratch, grown alongside host
+        Tensor_<int> mrope_position_ids;      // (max_forward_token_num, 3), flat current-forward table
+        Tensor_<int> mrope_length;            // (bsz,)
+        Tensor_<int> mrope_position_delta;    // (bsz,)
+        Tensor_<int> mrope_position_offsets;  // (bsz,), flat row offset for each request slot
+
+        void Clear()
+        {
+            batch_size             = 0;
+            total_hw               = 0;
+            merge_unit_count       = 0;
+            attn_batch_size        = 0;
+            max_attn_len           = 0;
+            window_attn_batch_size = 0;
+            max_window_attn_len    = 0;
+            grid_thws_host.clear();
+            image_embeds_coords.clear();
+            input_embeds_coords.clear();
+        }
+    };
+
+    static constexpr int kMropeSegInts = sizeof(MropeSegment) / sizeof(int);
+    static_assert(sizeof(MropeSegment) % sizeof(int) == 0);
+
+    std::vector<Data> data_;
+
+    Impl(const EngineParam& engine, const Context& ctx, const QwenVitWeight& weights, int phases):
+        weights_{weights},
+        config_{weights.config()},
+        linear_{*ctx.linear},
+        h_tp_group{ctx.comm.h_comm},
+        d_comm_{ctx.comm.d_comm},
+        tp_group_{ctx.comm.d_tp_group},
+        engine_data_type_{engine.data_type}
+    {
+        for (int i = 0; i < phases; ++i) {
+            auto& d              = data_.emplace_back();
+            d.mrope_length_host  = {engine.max_batch_size, kCPUpinned};
+            d.mrope_delta_host   = {engine.max_batch_size, kCPUpinned};
+            d.mrope_offsets_host = {engine.max_batch_size, kCPUpinned};
+
+            // mrope outputs at worst-case current-forward shape so Setup() never reallocates them.
+            d.mrope_length           = Tensor_<int>{{engine.max_batch_size}, kDEVICE};
+            d.mrope_position_delta   = Tensor_<int>{{engine.max_batch_size}, kDEVICE};
+            d.mrope_position_offsets = Tensor_<int>{{engine.max_batch_size}, kDEVICE};
+            d.mrope_position_ids     = Tensor_<int>{{engine.max_forward_token_num, 3}, kDEVICE};
+        }
+    }
+
+    struct WindowShape {
+        int vit_window{};
+        int llm_h{};
+        int llm_w{};
+        int win_h{};
+        int win_w{};
+    };
+
+    static void EnsureTensor(Tensor& tensor, Layout layout, DataType dtype, DeviceType device)
+    {
+        if (tensor.size() < layout.cosize()) {
+            tensor = Tensor{std::move(layout), dtype, device};
+        }
+    }
+
+    template<class T>
+    static void EnsureTensor(Tensor_<T>& tensor, Layout layout, DeviceType device)
+    {
+        if (tensor.size() < layout.cosize()) {
+            tensor = Tensor_<T>{std::move(layout), device};
+        }
+    }
+
+    template<class T>
+    static void EnsureBuffer(Buffer_<T>& buffer, ssize_t size, DeviceType device)
+    {
+        if (buffer.size() < size) {
+            buffer = Buffer_<T>{size, device};
+        }
+    }
+
+    WindowShape GetWindowShape(int h, int w) const
+    {
+        const int S          = config_.spatial_merge_size;
+        const int vit_window = config_.window_size / S / config_.patch_size;
+        TM_CHECK_GT(vit_window, 0);
+
+        const int llm_h = h / S;
+        const int llm_w = w / S;
+        return WindowShape{
+            vit_window, llm_h, llm_w, (llm_h + vit_window - 1) / vit_window, (llm_w + vit_window - 1) / vit_window};
+    }
+
+    void CollectPrefillInputs(Data& d, const BatchData& b, std::vector<Tensor>& pixel_values) const
+    {
+        const auto& cfg = config_;
+
+        int input_ids_offsets    = 0;
+        int image_embeds_offsets = 0;
+        for (int i = 0; i < b.rc.size(); ++i) {
+            const auto& c = *b.rc[i];
+            const auto& s = *c.seq;
+
+            if ((not c.autoregres) && (not s.multimodal_inputs.empty())) {
+                Interval text{c.history_len + c.alpha, Interval::Size{c.input_len}};
+                for (const auto& mm : s.multimodal_inputs) {
+                    auto o = mm->interval & text;
+                    if (auto size = (int)o.size()) {
+                        pixel_values.push_back(mm->data);
+                        d.batch_size += mm->data.shape(0);
+
+                        const int text_offset  = input_ids_offsets + o.begin() - text.begin();
+                        const int image_offset = image_embeds_offsets + o.begin() - mm->interval.begin();
+                        d.input_embeds_coords.emplace_back(size, text_offset);
+                        d.image_embeds_coords.emplace_back(size, image_offset);
+
+                        const auto& grid_thw = mm->grid_thw;
+                        d.grid_thws_host.emplace_back(grid_thw);
+                        const auto& [t, h, w] = grid_thw;
+                        const int prod        = t * h * w;
+                        image_embeds_offsets += prod / cfg.spatial_merge_size / cfg.spatial_merge_size;
+                    }
+                }
+            }
+
+            input_ids_offsets += c.autoregres ? 1 : c.input_len;
+        }
+    }
+
+    void ComputeSetupStats(Data& d) const
+    {
+        const auto& cfg        = config_;
+        const int   S          = cfg.spatial_merge_size;
+        const int   merge_unit = S * S;
+
+        d.attn_batch_size        = 0;
+        d.max_attn_len           = 0;
+        d.merge_unit_count       = 0;
+        d.window_attn_batch_size = 0;
+        d.max_window_attn_len    = 0;
+
+        for (const auto& [t, h, w] : d.grid_thws_host) {
+            TM_CHECK(h % S == 0);
+            TM_CHECK(w % S == 0);
+
+            const int hw = h * w;
+            d.attn_batch_size += t;
+            d.max_attn_len = std::max(d.max_attn_len, hw);
+            d.merge_unit_count += t * (h / S) * (w / S);
+
+            if (cfg.use_window_attention) {
+                const auto win = GetWindowShape(h, w);
+                d.window_attn_batch_size += t * win.win_h * win.win_w;
+                d.max_window_attn_len = std::max(d.max_window_attn_len, win.vit_window * win.vit_window * merge_unit);
+            }
+        }
+    }
+
+    void EnsureSetupStorage(Data& d)
+    {
+        const auto& cfg       = config_;
+        const int   num_grids = (int)d.grid_thws_host.size();
+
+        core::ContextGuard ctx{Allocator{kCPUpinned}};
+        EnsureTensor(d.batch_input, {d.batch_size, cfg.patch_in_dim}, cfg.data_type, kCPUpinned);
+        EnsureBuffer(grid_thws_buf_, (ssize_t)num_grids * 3, kCPUpinned);
+        EnsureBuffer(grid_offsets_buf_, (ssize_t)num_grids * 2, kCPUpinned);
+        EnsureBuffer(attn_cu_seqlens_buf_, (ssize_t)d.attn_batch_size + 1, kCPUpinned);
+
+        EnsureTensor(d.grid_thws, {num_grids, 3}, kDEVICE);
+        EnsureTensor(d.grid_offsets, {num_grids, 2}, kDEVICE);
+        EnsureTensor(d.mapped_idx, {d.batch_size}, kDEVICE);
+        EnsureTensor(d.attn_cu_seqlens, {d.attn_batch_size + 1}, kDEVICE);
+        EnsureTensor(d.attn_finished, {d.attn_batch_size}, kDEVICE);
+
+        if (cfg.use_window_attention) {
+            EnsureBuffer(window_idx_buf_, (ssize_t)d.merge_unit_count, kCPUpinned);
+            EnsureBuffer(cu_window_seqlens_buf_, (ssize_t)d.window_attn_batch_size + 1, kCPUpinned);
+            EnsureTensor(d.window_idx, {d.merge_unit_count}, kDEVICE);
+            EnsureTensor(d.window_mapped_idx, {d.batch_size}, kDEVICE);
+            EnsureTensor(d.cu_window_seqlens, {d.window_attn_batch_size + 1}, kDEVICE);
+            EnsureTensor(d.window_attn_finished, {d.window_attn_batch_size}, kDEVICE);
+        }
+    }
+
+    void StagePixelValues(Data& d, const std::vector<Tensor>& pixel_values) const
+    {
+        if (d.batch_size == 0) {
+            return;
+        }
+
+        ssize_t batch_offset = 0;
+        for (const auto& pixel_value : pixel_values) {
+            TM_CHECK_EQ(pixel_value.size(), pixel_value.shape(0) * config_.patch_in_dim);
+            TM_CHECK_EQ(pixel_value.dtype(), d.batch_input.dtype());
+            Copy(pixel_value, d.batch_input.slice(batch_offset, pixel_value.shape(0)));
+            batch_offset += pixel_value.shape(0);
+        }
+        TM_CHECK_EQ(batch_offset, d.batch_size);
+    }
+
+    void BuildHostAttentionMeta(Data& d)
+    {
+        const auto& cfg        = config_;
+        const int   S          = cfg.spatial_merge_size;
+        const int   merge_unit = S * S;
+        const int   num_grids  = (int)d.grid_thws_host.size();
+
+        int token_offset                            = 0;
+        int natural_offset                          = 0;
+        int attn_seq_idx                            = 0;
+        int attn_offset                             = 0;
+        attn_cu_seqlens_buf_.data()[attn_seq_idx++] = 0;
+        int window_group_pos                        = 0;
+        int window_id_base                          = 0;
+        int window_seq_idx                          = 0;
+        int window_offset                           = 0;
+        if (cfg.use_window_attention) {
+            cu_window_seqlens_buf_.data()[window_seq_idx++] = 0;
+        }
+
+        for (int i = 0; i < num_grids; ++i) {
+            const auto& [t, h, w] = d.grid_thws_host[i];
+
+            const int hw = h * w;
+            for (int tt = 0; tt < t; ++tt) {
+                attn_offset += hw;
+                attn_cu_seqlens_buf_.data()[attn_seq_idx++] = attn_offset;
+            }
+
+            grid_thws_buf_.data()[i * 3]        = t;
+            grid_thws_buf_.data()[i * 3 + 1]    = h;
+            grid_thws_buf_.data()[i * 3 + 2]    = w;
+            grid_offsets_buf_.data()[i * 2]     = token_offset;
+            grid_offsets_buf_.data()[i * 2 + 1] = natural_offset;
+
+            TM_CHECK_LE(token_offset + t * h * w, d.batch_size);
+
+            if (cfg.use_window_attention) {
+                const auto win = GetWindowShape(h, w);
+                for (int tt = 0; tt < t; ++tt) {
+                    for (int wh = 0; wh < win.win_h; ++wh) {
+                        for (int ww = 0; ww < win.win_w; ++ww) {
+                            int valid_cells = 0;
+                            for (int ih = 0; ih < win.vit_window; ++ih) {
+                                const int llm_i = wh * win.vit_window + ih;
+                                for (int iw = 0; iw < win.vit_window; ++iw) {
+                                    const int llm_j = ww * win.vit_window + iw;
+                                    if (llm_i >= win.llm_h || llm_j >= win.llm_w) {
+                                        continue;
+                                    }
+                                    const int local_group = (tt * win.llm_h + llm_i) * win.llm_w + llm_j;
+                                    const int orig_group  = window_id_base + local_group;
+                                    window_idx_buf_.data()[window_group_pos++] = orig_group;
+                                    ++valid_cells;
+                                }
+                            }
+                            window_offset += valid_cells * merge_unit;
+                            if (cu_window_seqlens_buf_.data()[window_seq_idx - 1] != window_offset) {
+                                cu_window_seqlens_buf_.data()[window_seq_idx++] = window_offset;
+                            }
+                        }
+                    }
+                }
+                window_id_base += t * win.llm_h * win.llm_w;
+            }
+
+            token_offset += t * h * w;
+            natural_offset += h * w;
+        }
+
+        TM_CHECK_EQ(token_offset, d.batch_size);
+        TM_CHECK_EQ(attn_offset, d.batch_size);
+        TM_CHECK_EQ(attn_seq_idx, d.attn_batch_size + 1);
+        if (cfg.use_window_attention) {
+            TM_CHECK_EQ(window_group_pos, d.merge_unit_count);
+            TM_CHECK_EQ(window_offset, d.batch_size);
+            TM_CHECK_EQ(window_seq_idx, d.window_attn_batch_size + 1);
+        }
+
+        d.total_hw = natural_offset;
+    }
+
+    void PublishHostMetadataCopies(BatchCopy& copy, Data& d)
+    {
+        const int num_grids = (int)d.grid_thws_host.size();
+
+        copy(grid_thws_buf_.data(), num_grids * 3, d.grid_thws.data());
+        copy(grid_offsets_buf_.data(), num_grids * 2, d.grid_offsets.data());
+        copy(attn_cu_seqlens_buf_.data(), d.attn_batch_size + 1, d.attn_cu_seqlens.data());
+        Clear(d.attn_finished.slice(0, d.attn_batch_size));
+
+        if (config_.use_window_attention) {
+            copy(window_idx_buf_.data(), d.merge_unit_count, d.window_idx.data());
+            copy(cu_window_seqlens_buf_.data(), d.window_attn_batch_size + 1, d.cu_window_seqlens.data());
+            Clear(d.window_attn_finished.slice(0, d.window_attn_batch_size));
+        }
+    }
+
+    void AllReduceSum(Tensor& tensor, cudaStream_t stream) const
+    {
+        if (d_comm_) {
+            d_comm_->AllReduceSum(
+                tensor.raw_data(), tensor.raw_data(), tensor.size(), tensor.dtype(), tp_group_, stream);
+            TM_CUDA_CHECK(cudaGetLastError());
+        }
+    }
+
+    void ApplyNorm(Tensor& out, const Tensor& input, const core::Module& norm, NormType norm_type) const
+    {
+        auto stream = core::Context::stream().handle();
+        switch (norm_type) {
+            case NormType::kLayerNorm: {
+                const auto& ln = static_cast<const LayerNormWeight&>(norm);
+                invokeLayerNorm(out, input, ln.weight, ln.bias, ln.norm_eps_, stream);
+                break;
+            }
+            case NormType::kRMSNorm: {
+                const auto& rms = static_cast<const NormWeight&>(norm);
+                invokeRMSNorm(out, input, rms.weight, rms.norm_eps_, stream);
+                break;
+            }
+            default:
+                TM_LOG_FATAL("unsupported QwenVit norm type: {}", (int)norm_type);
+        }
+        TM_CUDA_CHECK(cudaGetLastError());
+    }
+
+    void ResidualBiasNorm(Tensor&             hidden_states,
+                          Tensor&             residual,
+                          const Tensor&       residual_bias,
+                          const core::Module& norm,
+                          NormType            norm_type) const
+    {
+        auto stream = core::Context::stream().handle();
+        switch (norm_type) {
+            case NormType::kLayerNorm: {
+                const auto& ln = static_cast<const LayerNormWeight&>(norm);
+                invokeResidualBiasLayerNorm(hidden_states.raw_data(),
+                                            residual.raw_data(),
+                                            ln.weight.raw_data(),
+                                            ln.bias.data_or((void*)nullptr),
+                                            residual_bias.data_or((void*)nullptr),
+                                            hidden_states.dtype(),
+                                            config_.hidden_dim,
+                                            hidden_states.shape(0),
+                                            ln.norm_eps_,
+                                            stream);
+                break;
+            }
+            case NormType::kRMSNorm: {
+                const auto& rms = static_cast<const NormWeight&>(norm);
+                invokeResidualBiasRMSNorm(hidden_states.raw_data(),
+                                          residual.raw_data(),
+                                          rms.weight.raw_data(),
+                                          residual_bias.data_or((void*)nullptr),
+                                          hidden_states.dtype(),
+                                          config_.hidden_dim,
+                                          hidden_states.shape(0),
+                                          rms.norm_eps_,
+                                          stream);
+                break;
+            }
+            default:
+                TM_LOG_FATAL("unsupported QwenVit norm type: {}", (int)norm_type);
+        }
+        TM_CUDA_CHECK(cudaGetLastError());
+    }
+
+    // Qwen3.5: precompute the bilinear-interpolation gather indices/weights for the
+    // learned position-embedding table, then gather the 4 neighbour rows. Consumed by
+    // `invokeFusedPosEmbedMerge` in Forward(). No-op for models without pos_embed.
+    void FastPosEmbedInterpolate(Data& d, TensorMap& env)
+    {
+        auto& cfg    = weights_.config();
+        auto  stream = core::Context::stream().handle();
+
+        const int num_grid_per_side = (int)std::sqrt(cfg.num_position_embeddings);
+        TM_CHECK_EQ(num_grid_per_side * num_grid_per_side, cfg.num_position_embeddings);
+        TM_CHECK_EQ(weights_.pos_embed.shape(0), cfg.num_position_embeddings);
+        TM_CHECK_EQ(weights_.pos_embed.shape(1), cfg.hidden_dim);
+
+        Buffer_<int> pos_embed_idx     = {d.total_hw * 4, kDEVICE};
+        Tensor       pos_embed_weights = {{d.total_hw, 4}, cfg.data_type, kDEVICE};
+        invokeFastPosEmbedIdxWeight(pos_embed_idx.data(),
+                                    pos_embed_weights.raw_data(),
+                                    cfg.data_type,
+                                    d.grid_thws.data(),
+                                    d.grid_offsets.data(),
+                                    (int)d.grid_thws_host.size(),
+                                    d.total_hw,
+                                    num_grid_per_side,
+                                    stream);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        Tensor pos_embeds = {{d.total_hw * 4, cfg.hidden_dim}, cfg.data_type, kDEVICE};
+        invokeEmbeddingLookup(pos_embeds, pos_embed_idx, weights_.pos_embed, stream);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        env.produce("pos_embeds", pos_embeds);
+        env.produce("pos_embed_weights", pos_embed_weights);
+    }
+
+    void RotPosEmb(Data& d, TensorMap& env)
+    {
+        auto& cfg = weights_.config();
+
+        const int head_dim = cfg.hidden_dim / cfg.head_num;
+        // produce rotary_pos_emb: [total_hw, head_dim] with interleaved (c,s,c,s,...) pairs,
+        // keyed by the same natural flat index that `mapped_idx` already carries. Vision q/k
+        // are reordered into this adjacent-pair layout at export time.
+        Tensor rotary_pos_emb = {{d.total_hw, head_dim}, cfg.data_type, kDEVICE};
+        invokeQwenVitRotaryPosEmb(rotary_pos_emb.raw_data(),
+                                  cfg.data_type,
+                                  d.grid_thws.data(),
+                                  d.grid_offsets.data(),
+                                  (int)d.grid_thws_host.size(),
+                                  d.total_hw,
+                                  head_dim,
+                                  /*theta=*/10000.0f,
+                                  core::Context::stream().handle());
+        TM_CUDA_CHECK(cudaGetLastError());
+        env.produce("rotary_pos_emb", rotary_pos_emb);
+    }
+
+    Tensor PatchEmbedding(Data& d)
+    {
+        Tensor host_input = d.batch_input.slice(0, d.batch_size);
+        Tensor input      = empty_like(host_input, kDEVICE);
+
+        Copy(host_input, input);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        Tensor output;
+        TM_SCOPE_CALL(linear_.Forward(input, *weights_.patch_embed, output));
+        return output;
+    }
+
+    int Add(RequestCache& c)
+    {
+        const auto& [r, s] = std::tie(*c.req, *c.seq);
+        if (r.mm_inputs) {
+            if ((not r.session.start_flag) or (not r.session.end_flag)) {
+                // only support non-interactive inference
+                return Request::kInvalid;
+            }
+
+            const auto mm_inputs = std::dynamic_pointer_cast<multimodal::QwenVitInput>(r.mm_inputs);
+            if (!mm_inputs) {
+                return Request::kInvalid;
+            }
+
+            for (const auto& item : mm_inputs->items) {
+                if (item.modality != multimodal::Modality::kImage && item.modality != multimodal::Modality::kVideo) {
+                    return Request::kInvalid;
+                }
+
+                const int tokens = item.token_end - item.token_begin;
+                if (tokens <= 0) {
+                    return Request::kInvalid;
+                }
+
+                auto mm_item = std::make_shared<MultiModalData>(
+                    MultiModalData{item.data, Interval{item.token_begin, Interval::Size{tokens}}, item.grid_thw});
+                s.multimodal_inputs.push_back(mm_item);
+            }
+        }
+
+        return Request::kOk;
+    }
+
+    void Add(int phase, TensorMap& env)
+    {
+        // convert model-specific multimodal inputs to internal MultiModalData
+        const Buffer_<RequestCache*> rc = env.at("requests").buffer();
+        for (int i = 0; i < rc.size(); ++i) {
+            auto& c = *TM_CHECK_NOTNULL(rc[i]);
+            if (c.status == 0) {
+                c.status = Add(c);
+            }
+        }
+    }
+
+    // Build the mrope tensors consumed by `UnifiedAttentionLayer` and publish them to env.
+    //
+    // Per-forward layout: one flat row in `(max_forward_token_num, 3)` for each current token.
+    // Prefill slots with multimodal_inputs get their active range written by
+    // `invokeMropePositionIds` from a clipped list of MropeSegment descriptors. All other slots
+    // (decode + text-only prefill) get `length[i] = 0` so FastRoPE falls through to the closed-
+    // form `timestep + delta` path and never reads the stale rows.
+    //
+    // The output tensors live on `Data` (allocated worst-case in the ctor). env shares
+    // ownership via shared_ptr; UAL borrows safely across env clears.
+    void SetupMrope(int phase, TensorMap& env, BatchCopy& copy)
+    {
+        auto& d  = data_.at(phase);
+        auto& b  = *env.at("batch").data<BatchData*>()[0];
+        auto& rc = b.rc;
+
+        const int bsz = (int)rc.size();
+        if (bsz <= 0) {
+            return;
+        }
+
+        const int S = weights_.config().spatial_merge_size;
+
+        // 1) One pass to upper-bound segment count, build flat forward offsets, then size scratch.
+        //    Worst case per prefill slot with mrope: 2*num_images + 1 segments.
+        int upper_segs     = 0;
+        int total_q_tokens = 0;
+        for (int i = 0; i < bsz; ++i) {
+            const auto& c                  = *rc[i];
+            d.mrope_offsets_host.data()[i] = total_q_tokens;
+            total_q_tokens += c.autoregres ? 1 : c.input_len;
+            if (!c.autoregres && !c.seq->multimodal_inputs.empty()) {
+                upper_segs += 2 * (int)c.seq->multimodal_inputs.size() + 1;
+            }
+        }
+        TM_CHECK_LE(total_q_tokens, d.mrope_position_ids.shape(0));
+
+        const ssize_t upper_ints = (ssize_t)upper_segs * kMropeSegInts;
+        if (upper_ints > d.mrope_segs_host.size()) {
+            core::ContextGuard ctx{Allocator{kCPUpinned}};
+            d.mrope_segs_host = Buffer_<int>{upper_ints, kCPUpinned};
+            d.mrope_segs_dev  = Buffer_<int>{upper_ints, kDEVICE};
+        }
+
+        // 2) Unified per-request walk - always advance mm_off; emit segments only for needs_table.
+        auto* segs        = reinterpret_cast<MropeSegment*>(d.mrope_segs_host.data_or(nullptr));
+        int   n_segs      = 0;
+        int   max_seg_len = 0;
+
+        for (int i = 0; i < bsz; ++i) {
+            const auto& c            = *rc[i];
+            const auto& s            = *c.seq;
+            const bool  needs_table  = !c.autoregres && !s.multimodal_inputs.empty();
+            const int   active_start = c.history_len + c.alpha;
+            const int   active_end   = active_start + c.input_len;
+            const int   q_offset     = d.mrope_offsets_host.data()[i];
+
+            auto emit = [&](int run_start, int run_n, int run_base, int h2, int w2) {
+                const int a = std::max(run_start, active_start);
+                const int b = std::min(run_start + run_n, active_end);
+                if (a >= b) {
+                    return;
+                }
+                const int local_off = a - run_start;
+                segs[n_segs++]      = MropeSegment{
+                    q_offset + (a - active_start),
+                    b - a,
+                    /*base_pos=*/(h2 == 0) ? run_base + local_off : run_base,
+                    h2,
+                    w2,
+                    /*k_offset=*/(h2 == 0) ? 0 : local_off,
+                };
+                max_seg_len = std::max(max_seg_len, b - a);
+            };
+
+            int row = 0, pos = 0, mm_off = 0;
+            for (const auto& mm : s.multimodal_inputs) {
+                const auto& [t, h, w] = mm->grid_thw;
+                const int h2 = h / S, w2 = w / S, n_tok = t * h2 * w2;
+                TM_CHECK_EQ(n_tok, (int)mm->interval.size()) << "image token count mismatches grid_thw";
+                const int img_start = mm->interval.begin();
+                const int img_base  = img_start + mm_off;
+                if (needs_table) {
+                    if (img_start > row) {
+                        emit(row, img_start - row, pos, /*h2=*/0, /*w2=*/0);
+                    }
+                    emit(img_start, n_tok, img_base, h2, w2);
+                }
+                row               = img_start + n_tok;
+                const int new_pos = std::max(t, std::max(h2, w2));
+                pos               = img_base + new_pos;
+                mm_off += new_pos - n_tok;
+            }
+            if (needs_table && row < active_end) {
+                emit(row, active_end - row, pos, /*h2=*/0, /*w2=*/0);
+            }
+
+            d.mrope_length_host.data()[i] = needs_table ? c.input_len : 0;
+            d.mrope_delta_host.data()[i]  = mm_off;
+        }
+
+        // 3) Copy the bsz prefix of length / delta / flat offsets into the pre-allocated tensors.
+        //    Rows beyond bsz are untouched (UAL never reads them).
+        copy(d.mrope_length_host, bsz, d.mrope_length.buffer());
+        copy(d.mrope_delta_host, bsz, d.mrope_position_delta.buffer());
+        copy(d.mrope_offsets_host, bsz, d.mrope_position_offsets.buffer());
+
+        // 4) Populate position_ids only when a slot actually needs the table. Rows for slots
+        //    with length[i] == 0 are unreachable from FastRoPE, so leaving them stale is safe.
+        if (n_segs > 0) {
+            const ssize_t segs_ints = (ssize_t)n_segs * kMropeSegInts;
+            Copy(d.mrope_segs_host.slice(0, segs_ints), d.mrope_segs_dev.slice(0, segs_ints));
+            invokeMropePositionIds(d.mrope_position_ids.data(),
+                                   reinterpret_cast<const MropeSegment*>(d.mrope_segs_dev.data()),
+                                   n_segs,
+                                   max_seg_len,
+                                   core::Context::stream().handle());
+            TM_CUDA_CHECK(cudaGetLastError());
+        }
+
+        // 5) Publish all tensors - the consumer relies on this contract unconditionally.
+        env.produce("mrope_length", d.mrope_length);
+        env.produce("mrope_position_delta", d.mrope_position_delta);
+        env.produce("mrope_position_offsets", d.mrope_position_offsets);
+        env.produce("mrope_position_ids", d.mrope_position_ids);
+    }
+
+    void Setup(int phase, TensorMap& env)
+    {
+        auto& d    = data_.at(phase);
+        auto& b    = *env.at("batch").data<BatchData*>()[0];
+        auto& copy = *env.at("copy").data<BatchCopy*>()[0];
+
+        d.Clear();
+        std::vector<Tensor> pixel_values;
+        CollectPrefillInputs(d, b, pixel_values);
+
+        if (d.batch_size > 0) {
+            ComputeSetupStats(d);
+            EnsureSetupStorage(d);
+            StagePixelValues(d, pixel_values);
+
+            BuildHostAttentionMeta(d);
+            PublishHostMetadataCopies(copy, d);
+        }
+
+        SetupMrope(phase, env, copy);
+        h_tp_group->Sync();
+    }
+
+    void Prepare(int phase, TensorMap& env)
+    {
+        auto& d = data_.at(phase);
+        if (d.batch_size == 0) {
+            return;
+        }
+
+        auto stream = core::Context::stream().handle();
+        invokeQwenVitBuildMappedIdx(d.mapped_idx.data(),
+                                    d.grid_thws.data(),
+                                    d.grid_offsets.data(),
+                                    (int)d.grid_thws_host.size(),
+                                    config_.spatial_merge_size,
+                                    stream);
+        if (config_.use_window_attention) {
+            invokeQwenVitBuildWindowMappedIdx(d.window_mapped_idx.data(),
+                                              d.mapped_idx.data(),
+                                              d.window_idx.data(),
+                                              config_.spatial_merge_size * config_.spatial_merge_size,
+                                              d.merge_unit_count,
+                                              stream);
+        }
+
+        // Qwen3.5 learned positional embedding (bilinear interpolation of a fixed grid).
+        if (config_.num_position_embeddings > 0) {
+            FastPosEmbedInterpolate(d, env);
+        }
+
+        RotPosEmb(d, env);
+    }
+
+    void Forward(int phase, TensorMap& args)
+    {
+        auto& d = data_.at(phase);
+        if (d.batch_size == 0) {
+            return;
+        }
+
+        auto& cfg = weights_.config();
+
+        auto residual       = PatchEmbedding(d);
+        auto rotary_pos_emb = args.consume("rotary_pos_emb");
+        auto stream         = core::Context::stream().handle();
+
+        // Qwen3.5: fused pos-embed gather/merge into the patch_embed output (with bias folded in):
+        //   residual[pos, d] += Σ_k w[k] * pos_embeds[mapped*4+k, d] + bias[d]
+        if (cfg.num_position_embeddings > 0) {
+            auto pos_embeds        = args.consume("pos_embeds");
+            auto pos_embed_weights = args.consume("pos_embed_weights");
+            invokeFusedPosEmbedMerge(residual.raw_data(),
+                                     pos_embeds.raw_data(),
+                                     pos_embed_weights.raw_data(),
+                                     d.mapped_idx.data(),
+                                     weights_.patch_embed->bias ? weights_.patch_embed->bias.raw_data() : nullptr,
+                                     d.batch_size,
+                                     cfg.hidden_dim,
+                                     cfg.data_type,
+                                     stream);
+            TM_CUDA_CHECK(cudaGetLastError());
+        }
+
+        if (cfg.use_window_attention) {
+            Tensor reordered{{d.batch_size, cfg.hidden_dim}, cfg.data_type, kDEVICE};
+            invokeQwenVitWindowReorder(reordered,
+                                       residual,
+                                       d.window_idx.data(),
+                                       cfg.spatial_merge_size * cfg.spatial_merge_size,
+                                       d.merge_unit_count,
+                                       stream);
+            residual = std::move(reordered);
+        }
+
+        Tensor hidden_states = [&]() {
+            Buffer symm_buf = args.contains("symm_buf") ? args.at("symm_buf").buffer() : Buffer{};
+            if (symm_buf && d.batch_size * cfg.hidden_dim <= symm_buf.size() / turbomind::byte_size(cfg.data_type)) {
+                return Tensor{symm_buf.view(cfg.data_type), {d.batch_size, cfg.hidden_dim}};
+            }
+            else {
+                return Tensor{{d.batch_size, cfg.hidden_dim}, cfg.data_type, kDEVICE};
+            }
+        }();
+
+        ApplyNorm(hidden_states, residual, *weights_.block(0)->norm1, cfg.norm_type);
+
+        for (int layer_id = 0; layer_id < cfg.depth; ++layer_id) {
+            auto* block = weights_.block(layer_id);
+
+            // attn
+            auto invoke = [&](auto t) {
+                using T = decltype(t);
+                Attn<T>(hidden_states, hidden_states, d, layer_id, rotary_pos_emb);
+            };
+            TM_DISPATCH_PRIMARY_DTYPES(hidden_states.dtype(), invoke);
+
+            if (block->attention->tp_size > 1) {
+                AllReduceSum(hidden_states, stream);
+            }
+
+            ResidualBiasNorm(hidden_states, residual, block->attention->wo->bias, *block->norm2, cfg.norm_type);
+
+            // mlp
+            Mlp(hidden_states, hidden_states, d, layer_id);
+            AllReduceSum(hidden_states, stream);
+
+            const auto* next_norm =
+                layer_id + 1 < cfg.depth ? weights_.block(layer_id + 1)->norm1.get() : weights_.merger_norm.get();
+            TM_CHECK_NOTNULL(next_norm);
+            ResidualBiasNorm(hidden_states, residual, block->mlp_fc2->bias, *next_norm, cfg.norm_type);
+        }
+
+        Tensor image_embeds = Merger(hidden_states);
+        if (cfg.use_window_attention) {
+            Tensor reordered{{d.merge_unit_count, cfg.out_hidden_dim}, image_embeds.dtype(), kDEVICE};
+            invokeQwenVitReverseWindow(reordered, image_embeds, d.window_idx.data(), d.merge_unit_count, stream);
+            image_embeds = std::move(reordered);
+        }
+
+        // ViT may run in its own dtype (e.g. bf16) while the text engine runs
+        // in fp16 (AWQ-forced). PatchMultimodalEmbedding merges this buffer
+        // into the text embedding stream via a byte-level copy, so the dtypes
+        // must match before publishing.
+        EnsureFloatDtype(image_embeds, engine_data_type_);
+
+        args.produce("multimodal",
+                     MultiModalEmbeddingData{image_embeds, d.image_embeds_coords, d.input_embeds_coords}.buf());
+    }
+
+    template<typename T>
+    AttentionParams<T> CreateVitAttentionParams(
+        Tensor& attn_context, Tensor& qkv, Tensor& kv, Data& d, const AttentionWeight& attn, int layer_id)
+    {
+        const bool use_full_attn  = IsFullAttentionLayer(layer_id);
+        const int  local_head_num = attn.head_num / attn.tp_size;
+        const int  head_dim       = attn.head_dim;
+        const int  token_num      = d.batch_size;
+
+        AttentionParams<T> params{};
+        params.out = (T*)attn_context.raw_data();
+        params.q   = (T*)qkv.raw_data();
+
+        params.stride = (int64_t)local_head_num * 3 * head_dim;
+
+        params.cu_q_len = use_full_attn ? d.attn_cu_seqlens.data() : d.cu_window_seqlens.data();
+        params.cu_k_len = params.cu_q_len;
+        params.finished = use_full_attn ? d.attn_finished.data() : d.window_attn_finished.data();
+
+        params.linear_iter_params = LinearIteratorParams{
+            kv.raw_data(),
+            2 * token_num * head_dim,
+            token_num * head_dim,
+        };
+
+        params.token_num  = token_num;
+        params.batch_size = use_full_attn ? d.attn_batch_size : d.window_attn_batch_size;
+        params.max_q_len  = use_full_attn ? d.max_attn_len : d.max_window_attn_len;
+        params.max_k_len  = params.max_q_len;
+
+        params.num_heads     = local_head_num;
+        params.num_kv_heads  = local_head_num;
+        params.size_per_head = head_dim;
+        params.causal        = false;
+        params.layer_id      = layer_id;
+
+        double scaling = 1.;
+        if (attn.softmax_scale) {
+            scaling *= attn.softmax_scale;
+        }
+        else {
+            scaling /= std::sqrt((float)head_dim);
+        }
+        params.inv_sqrt_dh = scaling * std::log2(std::exp(1.));
+
+        params.window_size     = 0;
+        params.rope_param.type = RopeType::kNull;
+        params.max_split_k     = 1;
+        params.cp_size         = 1;
+        params.stream          = core::Context::stream().handle();
+
+        return params;
+    }
+
+    bool IsFullAttentionLayer(int layer_id) const
+    {
+        if (!config_.use_window_attention) {
+            return true;
+        }
+        return std::find(config_.fullatt_block_indexes.begin(), config_.fullatt_block_indexes.end(), layer_id)
+               != config_.fullatt_block_indexes.end();
+    }
+
+    template<typename T>
+    void Attn(Tensor& input, Tensor& output, Data& d, int layer_id, const Tensor& rotary_pos_emb)
+    {
+        auto& vit_cfg = weights_.config();
+        auto* attn    = weights_.block(layer_id)->attention.get();
+
+        Tensor qkv;
+        TM_SCOPE_CALL(linear_.Forward(input, *attn->w_qkv, qkv));
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        const int local_head_num = attn->head_num / attn->tp_size;
+        const int head_dim       = attn->head_dim;                         // may be padded
+        const int rope_head_dim  = vit_cfg.hidden_dim / vit_cfg.head_num;  // model's real per-head dim
+        const int token_num      = d.batch_size;
+
+        const int* mapped_idx = (config_.use_window_attention ? d.window_mapped_idx.data() : d.mapped_idx.data());
+
+        Tensor tmp_kv{{local_head_num, 2, d.batch_size, head_dim}, qkv.dtype(), qkv.device()};
+        invokeQwenVitPrepareQKV(qkv.raw_data(),
+                                tmp_kv.raw_data(),
+                                attn->w_qkv->bias.raw_data(),
+                                rotary_pos_emb.raw_data(),
+                                mapped_idx,
+                                qkv.dtype(),
+                                token_num,
+                                local_head_num,
+                                head_dim,
+                                rope_head_dim,
+                                core::Context::stream().handle());
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        Tensor attn_output{{token_num, local_head_num * head_dim}, qkv.dtype(), qkv.device()};
+        auto   params = CreateVitAttentionParams<T>(attn_output, qkv, tmp_kv, d, *attn, layer_id);
+        dispatchAttention<T>(params);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        TM_SCOPE_CALL(linear_.Forward(attn_output, *attn->wo, output));
+        TM_CUDA_CHECK(cudaGetLastError());
+    }
+
+    void Mlp(Tensor& input, Tensor& output, Data& d, int layer_id)
+    {
+        auto* block  = weights_.block(layer_id);
+        auto  stream = core::Context::stream().handle();
+
+        if (config_.gated_mlp) {
+            TM_CHECK(block->mlp_gate);
+            Tensor gate;
+            Tensor up;
+            TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_gate, gate));
+            TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_fc1, up));
+            TM_CUDA_CHECK(cudaGetLastError());
+
+            ApplyBias(gate, block->mlp_gate->bias, stream);
+            ApplyBias(up, block->mlp_fc1->bias, stream);
+            Activation(gate, up, ActivationType::kSilu, stream);
+            TM_CUDA_CHECK(cudaGetLastError());
+
+            TM_SCOPE_CALL(linear_.Forward(gate, *block->mlp_fc2, output));
+        }
+        else {
+            Tensor inter;
+            TM_SCOPE_CALL(linear_.Forward(input, *block->mlp_fc1, inter));
+            TM_CUDA_CHECK(cudaGetLastError());
+
+            // Qwen2-VL/2.5 use the erf GELU; Qwen3.5 uses the tanh approximation.
+            const ActivationType act = config_.gelu_tanh ? ActivationType::kGeluPytorchTanh : ActivationType::kGelu;
+            invokeAddBiasActivation(inter, block->mlp_fc1->bias, act, stream);
+            TM_CUDA_CHECK(cudaGetLastError());
+
+            TM_SCOPE_CALL(linear_.Forward(inter, *block->mlp_fc2, output));
+        }
+        TM_CUDA_CHECK(cudaGetLastError());
+    }
+
+    Tensor Merger(Tensor& input)
+    {
+        auto& cfg    = config_;
+        auto  stream = core::Context::stream().handle();
+
+        const int merge_area   = cfg.spatial_merge_size * cfg.spatial_merge_size;
+        Tensor    merged_input = input.view({-1, cfg.hidden_dim * merge_area});
+
+        Tensor inter;
+        TM_SCOPE_CALL(linear_.Forward(merged_input, *weights_.merger_fc1, inter));
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        invokeAddBiasActivation(inter, weights_.merger_fc1->bias, ActivationType::kGelu, stream);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        Tensor output;
+        TM_SCOPE_CALL(linear_.Forward(inter, *weights_.merger_fc2, output));
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        AllReduceSum(output, stream);
+
+        ApplyBias(output, weights_.merger_fc2->bias, stream);
+        TM_CUDA_CHECK(cudaGetLastError());
+
+        return output;
+    }
+};
+
+QwenVit::QwenVit(const EngineParam& engine, const Context& ctx, const QwenVitWeight& weights, int phases):
+    impl_{std::make_unique<Impl>(engine, ctx, weights, phases)}
+{
+}
+
+QwenVit::~QwenVit() = default;
+
+void QwenVit::Run(BatchOp op, int phase, TensorMap& env)
+{
+    TM_FUNCTION_SCOPE();
+    switch (op) {
+        case BatchOp::kAdd:
+            return impl_->Add(phase, env);
+        case BatchOp::kSetup:
+            return impl_->Setup(phase, env);
+        case BatchOp::kPrepare:
+            return impl_->Prepare(phase, env);
+        case BatchOp::kForward:
+            return impl_->Forward(phase, env);
+        default:
+            return;
+    }
+}
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/qwenvit/qwenvit.h b/src/turbomind/models/qwenvit/qwenvit.h
new file mode 100644
index 0000000000..b69bf2dae9
--- /dev/null
+++ b/src/turbomind/models/qwenvit/qwenvit.h
@@ -0,0 +1,38 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include <memory>
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/engine/batch.h"
+#include "src/turbomind/models/llama/context.h"
+#include "src/turbomind/models/llama/llama_params.h"
+#include "src/turbomind/models/vision_model.h"
+
+namespace turbomind {
+
+class QwenVitWeight;
+
+/// Unified ``VisionModel`` for the Qwen ViT family: Qwen2-VL / Qwen2.5-VL / Qwen3.5.
+///
+/// A single config-driven implementation. The orthogonal feature toggles that
+/// distinguish the families are selected from ``QwenVitConfig``:
+///   - window attention      (Qwen2.5):  use_window_attention
+///   - learned pos embedding  (Qwen3.5):  num_position_embeddings > 0
+///   - gated SiLU MLP         (Qwen2.5):  gated_mlp
+///   - tanh-approx GELU MLP   (Qwen3.5):  gelu_tanh
+///   - RMSNorm vs LayerNorm   (Qwen2):    norm_type
+class QwenVit: public VisionModel {
+public:
+    QwenVit(const EngineParam& engine, const Context& ctx, const QwenVitWeight& weights, int phases);
+
+    ~QwenVit() override;
+
+    void Run(BatchOp op, int phase, TensorMap& env) override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.cc b/src/turbomind/models/qwenvit/qwenvit_block_weight.cc
similarity index 55%
rename from src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.cc
rename to src/turbomind/models/qwenvit/qwenvit_block_weight.cc
index 96b99404cd..594693e292 100644
--- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.cc
+++ b/src/turbomind/models/qwenvit/qwenvit_block_weight.cc
@@ -1,6 +1,6 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h"
+#include "src/turbomind/models/qwenvit/qwenvit_block_weight.h"
 
 #include "src/turbomind/core/registry.h"
 #include "src/turbomind/models/attention_weight.h"
@@ -9,8 +9,8 @@
 
 namespace turbomind {
 
-TM_MODULE_REGISTER(Qwen3_5VitBlockWeight, core::Qwen3_5VitBlockConfig);
+TM_MODULE_REGISTER(QwenVitBlockWeight, core::QwenVitBlockConfig);
 
-TM_MODULE_METHODS(Qwen3_5VitBlockWeight, QWEN3_5VIT_BLOCK_CHILDREN, QWEN3_5VIT_BLOCK_PARAMS)
+TM_MODULE_METHODS(QwenVitBlockWeight, QWENVIT_BLOCK_CHILDREN, QWENVIT_BLOCK_PARAMS)
 
 }  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h b/src/turbomind/models/qwenvit/qwenvit_block_weight.h
similarity index 61%
rename from src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h
rename to src/turbomind/models/qwenvit/qwenvit_block_weight.h
index 6d9c88f7e2..97e7d10084 100644
--- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h
+++ b/src/turbomind/models/qwenvit/qwenvit_block_weight.h
@@ -7,49 +7,45 @@
 
 namespace turbomind::core {
 
-/// Per-block config for the Qwen3.5 vision transformer.
-///
-/// Carries the dimensions the runtime needs (hidden_dim, head_num,
-/// intermediate_size). Loading itself is structural — children are
-/// committed by the Python builder via ``add_child_raw``.
-struct Qwen3_5VitBlockConfig: ModuleConfig {
-    Qwen3_5VitBlockConfig(): ModuleConfig{"Qwen3_5VitBlockWeight"} {}
+/// Per-block config for the Qwen2 / Qwen2.5 / Qwen3.5 vision transformer.
+struct QwenVitBlockConfig: ModuleConfig {
+    QwenVitBlockConfig(): ModuleConfig{"QwenVitBlockWeight"} {}
 
-#define QWEN3_5VIT_BLOCK_FIELDS(X)                                                                                     \
+#define QWENVIT_BLOCK_FIELDS(X)                                                                                        \
     X(DataType, data_type)                                                                                             \
     X(int, hidden_dim)                                                                                                 \
     X(int, head_num)                                                                                                   \
     X(int, intermediate_size)                                                                                          \
     X(float, norm_eps, 1e-6f)
 
-    QWEN3_5VIT_BLOCK_FIELDS(TM_MEMBER)
-    TM_FOR_EACH(Qwen3_5VitBlockConfig, QWEN3_5VIT_BLOCK_FIELDS)
+    QWENVIT_BLOCK_FIELDS(TM_MEMBER)
+    TM_FOR_EACH(QwenVitBlockConfig, QWENVIT_BLOCK_FIELDS)
 
-#undef QWEN3_5VIT_BLOCK_FIELDS
+#undef QWENVIT_BLOCK_FIELDS
 };
 
 }  // namespace turbomind::core
 
 namespace turbomind {
 
-class LayerNormWeight;
 class LinearWeight;
 
-/// One transformer block of the Qwen3.5 ViT.
+/// One transformer block of the Qwen ViT (covers Qwen2 / Qwen2.5 / Qwen3.5).
 ///
 /// Children:
-///   - norm1, norm2      LayerNorm (weight + bias)
+///   - norm1, norm2      LayerNorm or RMSNorm (held as core::Module)
 ///   - attention         AttentionWeight (packed Q/K/V + output projection)
+///   - mlp_gate          Optional gated SiLU gate projection (Qwen2.5 only)
 ///   - mlp_fc1, mlp_fc2  Linear (in: hidden ↔ intermediate)
-class Qwen3_5VitBlockWeight: public core::Module {
+class QwenVitBlockWeight: public core::Module {
 public:
     const char* type() const override
     {
-        return "Qwen3_5VitBlockWeight";
+        return "QwenVitBlockWeight";
     }
 
-    Qwen3_5VitBlockWeight() = default;
-    explicit Qwen3_5VitBlockWeight(const core::Qwen3_5VitBlockConfig& cfg):
+    QwenVitBlockWeight() = default;
+    explicit QwenVitBlockWeight(const core::QwenVitBlockConfig& cfg):
         data_type{cfg.data_type},
         hidden_dim{cfg.hidden_dim},
         head_num{cfg.head_num},
@@ -58,16 +54,17 @@ class Qwen3_5VitBlockWeight: public core::Module {
     {
     }
 
-#define QWEN3_5VIT_BLOCK_CHILDREN(X)                                                                                   \
-    X(LayerNormWeight, norm1)                                                                                          \
-    X(LayerNormWeight, norm2)                                                                                          \
+#define QWENVIT_BLOCK_CHILDREN(X)                                                                                      \
+    X(core::Module, norm1)                                                                                             \
+    X(core::Module, norm2)                                                                                             \
     X(AttentionWeight, attention)                                                                                      \
+    X(LinearWeight, mlp_gate)                                                                                          \
     X(LinearWeight, mlp_fc1)                                                                                           \
     X(LinearWeight, mlp_fc2)
 
-#define QWEN3_5VIT_BLOCK_PARAMS(X)
+#define QWENVIT_BLOCK_PARAMS(X)
 
-    TM_MODULE_DECLARE(Qwen3_5VitBlockWeight, QWEN3_5VIT_BLOCK_CHILDREN, QWEN3_5VIT_BLOCK_PARAMS)
+    TM_MODULE_DECLARE(QwenVitBlockWeight, QWENVIT_BLOCK_CHILDREN, QWENVIT_BLOCK_PARAMS)
 
     // --- Public scalars ---
     DataType data_type{};
diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h b/src/turbomind/models/qwenvit/qwenvit_input.h
similarity index 58%
rename from src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h
rename to src/turbomind/models/qwenvit/qwenvit_input.h
index 122bd49524..735ae74aff 100644
--- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h
+++ b/src/turbomind/models/qwenvit/qwenvit_input.h
@@ -12,27 +12,28 @@
 namespace turbomind {
 namespace multimodal {
 
-struct Qwen3_5VitItem {
+// Unified multimodal input for the Qwen2-VL / Qwen2.5-VL / Qwen3.5 ViT encoders.
+struct QwenVitItem {
     Modality           modality;
     Tensor             data;
     int                token_begin;
     int                token_end;
     std::array<int, 3> grid_thw;
 
-    Qwen3_5VitItem() = default;
+    QwenVitItem() = default;
 
-    Qwen3_5VitItem(Modality modality, Tensor data, int token_begin, int token_end, std::array<int, 3> grid_thw):
+    QwenVitItem(Modality modality, Tensor data, int token_begin, int token_end, std::array<int, 3> grid_thw):
         modality{modality}, data{std::move(data)}, token_begin{token_begin}, token_end{token_end}, grid_thw{grid_thw}
     {
     }
 };
 
-struct Qwen3_5VitInput final: Input {
-    std::vector<Qwen3_5VitItem> items;
+struct QwenVitInput final: Input {
+    std::vector<QwenVitItem> items;
 
-    Qwen3_5VitInput() = default;
+    QwenVitInput() = default;
 
-    explicit Qwen3_5VitInput(std::vector<Qwen3_5VitItem> items): items{std::move(items)} {}
+    explicit QwenVitInput(std::vector<QwenVitItem> items): items{std::move(items)} {}
 };
 
 }  // namespace multimodal
diff --git a/src/turbomind/models/qwenvit/qwenvit_kernels.cu b/src/turbomind/models/qwenvit/qwenvit_kernels.cu
new file mode 100644
index 0000000000..2309dd428d
--- /dev/null
+++ b/src/turbomind/models/qwenvit/qwenvit_kernels.cu
@@ -0,0 +1,827 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+//
+// Merged CUDA kernels for the unified Qwen ViT (Qwen2-VL / Qwen2.5-VL / Qwen3.5).
+// Sections, in order:
+//   1. QKV preprocessing (bias + RoPE fuse)        — all variants
+//   2. Spatial-merge index mapping                  — all variants
+//   3. Learned pos-embed bilinear interpolation     — Qwen3.5
+//   4. 2D rotary position-embedding table           — all variants
+//   5. mrope position ids                           — all variants
+//   6. Window attention reordering                  — Qwen2.5
+
+#include "src/turbomind/models/qwenvit/qwenvit_kernels.h"
+
+#include "src/turbomind/core/data_type.h"
+#include "src/turbomind/core/logger.h"
+#include "src/turbomind/kernels/core/array.h"
+#include "src/turbomind/kernels/core/array_ops.h"
+#include "src/turbomind/kernels/core/common.h"
+#include "src/turbomind/utils/cuda_utils.h"
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <type_traits>
+
+namespace turbomind {
+
+namespace {
+
+// `num_grids` is tiny (usually 1..a few) so a linear scan is fine.
+// Shared by the pos-embed interpolation and rotary-embedding kernels.
+__device__ inline int find_grid(const int* offsets, int num_grids, int pos)
+{
+    int g = 0;
+    for (int i = 1; i < num_grids; ++i) {
+        if (offsets[i * 2 + 1] <= pos) {
+            g = i;
+        }
+        else {
+            break;
+        }
+    }
+    return g;
+}
+
+// ------------------------------------------------------------------------------------
+// 1. QKV preprocessing
+// ------------------------------------------------------------------------------------
+
+constexpr int kWarpsPerBlock = 4;
+
+// Per-head_dim launch traits. Adding a new head_dim is a single specialization here.
+template<int HD>
+struct HeadConfig;
+
+template<>
+struct HeadConfig<64> {
+    static constexpr int kVecSize      = 8;
+    static constexpr int kHeadsPerWarp = 4;
+};
+
+template<>
+struct HeadConfig<72> {
+    static constexpr int kVecSize      = 8;
+    static constexpr int kHeadsPerWarp = 3;
+};
+
+template<>
+struct HeadConfig<128> {
+    static constexpr int kVecSize      = 8;  // 128 / 8 = 16 vec/head
+    static constexpr int kHeadsPerWarp = 2;  // 16 * 2 = 32 == WARP_SIZE
+};
+
+template<int VecSize, typename T>
+__device__ __forceinline__ void add_bias(Array<float, VecSize>& x, const T* bias)
+{
+    Array<T, VecSize> b;
+    Ldg(b, bias);
+    using namespace ops;
+    x = x + cast<float>(b);
+}
+
+// Rotate adjacent (x[2k], x[2k+1]) pairs using cos/sin packed as
+// rope[2k]=cos, rope[2k+1]=sin (see fastRotaryPosEmbKernel below).
+template<int VecSize, typename T>
+__device__ __forceinline__ void apply_rope_pair(Array<float, VecSize>& x, const Array<T, VecSize>& rope)
+{
+    auto cs = cast<float>(rope);
+    PRAGMA_UNROLL
+    for (int i = 0; i < VecSize; i += 2) {
+        const float x0 = x[i];
+        const float x1 = x[i + 1];
+        const float c  = cs[i];
+        const float s  = cs[i + 1];
+        x[i]           = c * x0 - s * x1;
+        x[i + 1]       = c * x1 + s * x0;
+    }
+}
+
+// Load `src`, add `bias`, apply RoPE, store to `dst` (may equal `src` for in-place).
+template<int VecSize, typename T>
+__device__ __forceinline__ void fuse_bias_rope_store(T* dst, const T* src, const T* bias, const Array<T, VecSize>& rope)
+{
+    Array<T, VecSize> x_vec;
+    Load(x_vec, src);
+    auto x = cast<float>(x_vec);
+    add_bias(x, bias);
+    apply_rope_pair(x, rope);
+    Store(dst, cast<T>(x));
+}
+
+// V-path: bias only, no RoPE.
+template<int VecSize, typename T>
+__device__ __forceinline__ void fuse_bias_store(T* dst, const T* src, const T* bias)
+{
+    Array<T, VecSize> x_vec;
+    Load(x_vec, src);
+    auto x = cast<float>(x_vec);
+    add_bias(x, bias);
+    Store(dst, cast<T>(x));
+}
+
+template<typename T, int HD>
+__global__ __launch_bounds__(kWarpsPerBlock* WARP_SIZE) void prepareQKVKernel(T* __restrict__ qkv,
+                                                                              T* __restrict__ kv,
+                                                                              const T* __restrict__ bias,
+                                                                              const T* __restrict__ rotary_pos_emb,
+                                                                              const int* __restrict__ mapped_idx,
+                                                                              int token_num,
+                                                                              int local_head_num,
+                                                                              int head_group_num,
+                                                                              int rope_head_dim)
+{
+    using Cfg                   = HeadConfig<HD>;
+    constexpr int kVecSize      = Cfg::kVecSize;
+    constexpr int kHeadsPerWarp = Cfg::kHeadsPerWarp;
+    constexpr int kVecPerHead   = HD / kVecSize;
+    static_assert(HD % kVecSize == 0);
+    static_assert(kVecPerHead * kHeadsPerWarp <= WARP_SIZE);
+
+    const int warp_id   = threadIdx.x / WARP_SIZE;
+    const int lane      = threadIdx.x - warp_id * WARP_SIZE;
+    const int head_slot = lane / kVecPerHead;
+    if (head_slot >= kHeadsPerWarp) {
+        return;
+    }
+
+    const int global_warp = blockIdx.x * kWarpsPerBlock + warp_id;
+    const int total_warps = token_num * head_group_num;
+    if (global_warp >= total_warps) {
+        return;
+    }
+
+    const int token_idx  = global_warp / head_group_num;
+    const int head_group = global_warp - token_idx * head_group_num;
+    const int head_idx   = head_group * kHeadsPerWarp + head_slot;
+    if (head_idx >= local_head_num) {
+        return;
+    }
+
+    const int vec_idx = lane - head_slot * kVecPerHead;
+    const int di      = vec_idx * kVecSize;
+
+    // QKV per-token layout: [Q_heads | K_heads | V_heads], head_num == kv_head_num for ViT.
+    const int64_t qkv_stride = (int64_t)local_head_num * 3 * HD;
+    T* const      q_ptr      = qkv + (int64_t)token_idx * qkv_stride + head_idx * HD + di;
+    const T*      k_ptr      = q_ptr + (int64_t)local_head_num * HD;
+    const T*      v_ptr      = k_ptr + (int64_t)local_head_num * HD;
+
+    const T* q_bias = bias + head_idx * HD + di;
+    const T* k_bias = q_bias + local_head_num * HD;
+    const T* v_bias = k_bias + local_head_num * HD;
+
+    // K/V destination in transposed [kv_head, 2, token, head_dim] layout.
+    T* const k_dst = kv + ((int64_t)head_idx * 2 * token_num + token_idx) * HD + di;
+    T* const v_dst = k_dst + (int64_t)token_num * HD;
+
+    // rope[token, di] is shared between Q and K — load once, reuse twice.
+    // When HD > rope_head_dim, padded di-slices have zero Q/K, so loading a
+    // zero rope_vec there is correct (and avoids OOB on the [N, rope_head_dim]
+    // buffer). kVecSize is aligned to rope_head_dim so each vec is fully in or
+    // fully out of the rope range.
+    Array<T, kVecSize> rope_vec{};
+    if (di < rope_head_dim) {
+        Ldg(rope_vec, rotary_pos_emb + (int64_t)mapped_idx[token_idx] * rope_head_dim + di);
+    }
+
+    fuse_bias_rope_store<kVecSize>(q_ptr, q_ptr, q_bias, rope_vec);  // Q: in-place
+    fuse_bias_rope_store<kVecSize>(k_dst, k_ptr, k_bias, rope_vec);  // K: transposed
+    fuse_bias_store<kVecSize>(v_dst, v_ptr, v_bias);                 // V: transposed, no RoPE
+}
+
+template<typename T>
+void dispatchPrepareQKV(T*           qkv,
+                        T*           kv,
+                        const T*     qkv_bias,
+                        const T*     rotary_pos_emb,
+                        const int*   mapped_idx,
+                        int          token_num,
+                        int          local_head_num,
+                        int          head_dim,
+                        int          rope_head_dim,
+                        cudaStream_t stream)
+{
+    auto invoke = [&](auto hd_c) {
+        constexpr int HD = decltype(hd_c)::value;
+        using Cfg        = HeadConfig<HD>;
+
+        // Each vec_size-wide load must lie entirely in or out of the rope range.
+        TM_CHECK(rope_head_dim % Cfg::kVecSize == 0)
+            << "rope_head_dim (" << rope_head_dim << ") must be a multiple of kVecSize (" << Cfg::kVecSize << ")";
+        TM_CHECK(rope_head_dim <= HD) << "rope_head_dim (" << rope_head_dim << ") cannot exceed head_dim (" << HD
+                                      << ")";
+
+        const int head_group_num = (local_head_num + Cfg::kHeadsPerWarp - 1) / Cfg::kHeadsPerWarp;
+        const int total_warps    = token_num * head_group_num;
+        dim3      grid((total_warps + kWarpsPerBlock - 1) / kWarpsPerBlock);
+        prepareQKVKernel<T, HD><<<grid, kWarpsPerBlock * WARP_SIZE, 0, stream>>>(
+            qkv, kv, qkv_bias, rotary_pos_emb, mapped_idx, token_num, local_head_num, head_group_num, rope_head_dim);
+    };
+
+    switch (head_dim) {
+        case 64:
+            return invoke(std::integral_constant<int, 64>{});
+        case 72:
+            return invoke(std::integral_constant<int, 72>{});
+        case 128:
+            return invoke(std::integral_constant<int, 128>{});
+        default:
+            TM_LOG_FATAL("unsupported Qwen ViT head_dim for qkv preprocess: {}", head_dim);
+    }
+}
+
+// ------------------------------------------------------------------------------------
+// 2. Spatial-merge index mapping
+// ------------------------------------------------------------------------------------
+
+__global__ void buildMappedIdxKernel(int* mapped_idx, int token_offset, int natural_offset, int t, int h, int w, int S)
+{
+    const int idx   = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total = t * h * w;
+    if (idx >= total) {
+        return;
+    }
+
+    const int hw                   = h * w;
+    const int merge_unit           = S * S;
+    const int local                = idx % hw;
+    const int group                = local / merge_unit;
+    const int inner                = local - group * merge_unit;
+    const int group_cols           = w / S;
+    const int h_outer              = group / group_cols;
+    const int w_outer              = group - h_outer * group_cols;
+    const int h_inner              = inner / S;
+    const int w_inner              = inner - h_inner * S;
+    const int natural_idx          = (h_outer * S + h_inner) * w + (w_outer * S + w_inner);
+    mapped_idx[token_offset + idx] = natural_offset + natural_idx;
+}
+
+__global__ void
+buildMappedIdxBatchedKernel(int* mapped_idx, const int* grid_thws, const int* grid_offsets, int num_grids, int S)
+{
+    const int grid_id = blockIdx.x;
+    if (grid_id >= num_grids) {
+        return;
+    }
+
+    const int t              = grid_thws[grid_id * 3];
+    const int h              = grid_thws[grid_id * 3 + 1];
+    const int w              = grid_thws[grid_id * 3 + 2];
+    const int token_offset   = grid_offsets[grid_id * 2];
+    const int natural_offset = grid_offsets[grid_id * 2 + 1];
+    const int total          = t * h * w;
+    const int hw             = h * w;
+    const int merge_unit     = S * S;
+    const int group_cols     = w / S;
+
+    for (int idx = threadIdx.x; idx < total; idx += blockDim.x) {
+        const int local                = idx % hw;
+        const int group                = local / merge_unit;
+        const int inner                = local - group * merge_unit;
+        const int h_outer              = group / group_cols;
+        const int w_outer              = group - h_outer * group_cols;
+        const int h_inner              = inner / S;
+        const int w_inner              = inner - h_inner * S;
+        const int natural_idx          = (h_outer * S + h_inner) * w + (w_outer * S + w_inner);
+        mapped_idx[token_offset + idx] = natural_offset + natural_idx;
+    }
+}
+
+// ------------------------------------------------------------------------------------
+// 3. Learned pos-embed bilinear interpolation (Qwen3.5)
+// ------------------------------------------------------------------------------------
+
+template<typename T>
+__device__ inline T from_float(float x);
+
+template<>
+__device__ inline half from_float<half>(float x)
+{
+    return __float2half(x);
+}
+
+#ifdef ENABLE_BF16
+template<>
+__device__ inline __nv_bfloat16 from_float<__nv_bfloat16>(float x)
+{
+    return __float2bfloat16(x);
+}
+#endif
+
+template<typename T>
+__global__ void fastPosEmbedIdxWeightKernel(
+    int* idx_out, T* weight_out, const int* grid_thws, const int* grid_offsets, int num_grids, int total_n, int G)
+{
+    const int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pos >= total_n) {
+        return;
+    }
+
+    const int g      = find_grid(grid_offsets, num_grids, pos);
+    const int grid_h = grid_thws[g * 3 + 1];
+    const int grid_w = grid_thws[g * 3 + 2];
+    const int local  = pos - grid_offsets[g * 2 + 1];
+    const int i      = local / grid_w;
+    const int j      = local % grid_w;
+
+    // torch.linspace(0, G-1, n) uses the halfway-symmetric formulation so
+    // that both endpoints are exact:
+    //   step      = (end - start) / (n - 1)
+    //   halfway   = n / 2
+    //   out[i<hw] = start + step * i
+    //   out[i>=hw]= end   - step * (n - 1 - i)
+    // For n == 1 the single element is `start` (== 0 here); the formula
+    // below collapses to 0 since hw_h == 0 is bypassed via grid_h > 1.
+    const float end    = (float)(G - 1);
+    const float step_h = (grid_h > 1) ? end / (float)(grid_h - 1) : 0.f;
+    const float step_w = (grid_w > 1) ? end / (float)(grid_w - 1) : 0.f;
+
+    const int hw_h = grid_h / 2;
+    const int hw_w = grid_w / 2;
+
+    const float h_val = (grid_h == 1) ? 0.f : ((i < hw_h) ? step_h * (float)i : end - step_h * (float)(grid_h - 1 - i));
+    const float w_val = (grid_w == 1) ? 0.f : ((j < hw_w) ? step_w * (float)j : end - step_w * (float)(grid_w - 1 - j));
+
+    // torch.Tensor.int() truncates toward zero; h_val, w_val are non-negative
+    // and bounded above by G-1, so (int) cast is in [0, G-1].
+    const int h_floor = (int)h_val;
+    const int w_floor = (int)w_val;
+    const int h_ceil  = min(h_floor + 1, G - 1);
+    const int w_ceil  = min(w_floor + 1, G - 1);
+
+    const float dh = h_val - (float)h_floor;
+    const float dw = w_val - (float)w_floor;
+
+    const int base_h      = h_floor * G;
+    const int base_h_ceil = h_ceil * G;
+
+    Array<int, 4> idx;
+    idx[0] = base_h + w_floor;
+    idx[1] = base_h + w_ceil;
+    idx[2] = base_h_ceil + w_floor;
+    idx[3] = base_h_ceil + w_ceil;
+
+    Array<T, 4> weight;
+    weight[0] = from_float<T>((1.f - dh) * (1.f - dw));
+    weight[1] = from_float<T>((1.f - dh) * dw);
+    weight[2] = from_float<T>(dh * (1.f - dw));
+    weight[3] = from_float<T>(dh * dw);
+
+    const int out_base = pos * 4;
+    Store(idx_out + out_base, idx);
+    Store(weight_out + out_base, weight);
+}
+
+template<typename T, int N>
+__device__ Array<float, N> roundToStorageDtype(Array<float, N> x)
+{
+    return cast<float>(cast<T>(x));
+}
+
+template<int vec_size, typename T>
+__global__ void fusedPosEmbedMergeKernel(T*         hidden_states,
+                                         const T*   pos_embeds,
+                                         const T*   pos_embed_weights,
+                                         const int* mapped_idx,
+                                         const T*   bias,
+                                         int        hidden,
+                                         int        vdim)
+{
+    const int index  = blockIdx.x;
+    const int mapped = mapped_idx[index];  // same address for all threads in block -> L1 broadcast
+
+    Array<T, 4> w4;
+    Ldg(w4, pos_embed_weights + mapped * 4);
+
+    const int row_off = index * hidden;
+    const int pe_row0 = mapped * 4 * hidden;
+
+    using namespace ops;
+    for (int d = threadIdx.x; d < vdim; d += blockDim.x) {
+        Array<float, vec_size> pos{};
+        Array<T, vec_size>     tmp;
+        Load(tmp, hidden_states + row_off + d * vec_size);
+        auto hidden_acc = cast<float>(tmp);
+
+        if (bias) {
+            Ldg(tmp, bias + d * vec_size);
+            hidden_acc = roundToStorageDtype<T>(hidden_acc + cast<float>(tmp));
+        }
+        PRAGMA_UNROLL
+        for (int k = 0; k < 4; ++k) {
+            Ldg(tmp, pos_embeds + pe_row0 + k * hidden + d * vec_size);
+            pos = pos + cast<float>(tmp * w4[k]);
+        }
+        const auto out = hidden_acc + roundToStorageDtype<T>(pos);
+        Store(hidden_states + row_off + d * vec_size, cast<T>(out));
+    }
+}
+
+// ------------------------------------------------------------------------------------
+// 4. 2D rotary position-embedding table
+// ------------------------------------------------------------------------------------
+
+template<typename T>
+__global__ void fastRotaryPosEmbKernel(T*         cos_sin_out,
+                                       const int* grid_thws,
+                                       const int* grid_offsets,
+                                       int        num_grids,
+                                       int        total_hw,
+                                       int        head_dim,
+                                       float      scale)  // -log2(theta) / (head_dim/4)
+{
+    const int pair_count = head_dim / 2;  // e.g. 36
+    const int freq_half  = head_dim / 4;  // e.g. 18
+
+    const int tid    = blockIdx.x * blockDim.x + threadIdx.x;
+    const int pos    = tid / pair_count;
+    const int pair_k = tid % pair_count;
+    if (pos >= total_hw) {
+        return;
+    }
+
+    const int g      = find_grid(grid_offsets, num_grids, pos);
+    const int grid_w = grid_thws[g * 3 + 2];
+    const int local  = pos - grid_offsets[g * 2 + 1];
+    const int i      = local / grid_w;  // h_coord
+    const int j      = local % grid_w;  // w_coord
+
+    // Pairs [0, freq_half) rotate in h; pairs [freq_half, 2*freq_half) rotate in w.
+    const int   freq_idx = pair_k % freq_half;
+    const int   coord    = (pair_k < freq_half) ? i : j;
+    const float inv_freq = exp2f((float)freq_idx * scale);
+
+    float c, s;
+    sincosf((float)coord * inv_freq, &s, &c);
+
+    Array<T, 2> cs{(T)c, (T)s};
+    Store(cos_sin_out + (size_t)pos * head_dim + pair_k * 2, cs);
+}
+
+// ------------------------------------------------------------------------------------
+// 5. mrope position ids
+// ------------------------------------------------------------------------------------
+
+constexpr int kMropeBlock = 128;
+
+__global__ void mropeScatterKernel(int* pos_ids, const MropeSegment* __restrict__ segs)
+{
+    const MropeSegment s       = segs[blockIdx.x];
+    const int          local_k = blockIdx.y * blockDim.x + threadIdx.x;
+    if (local_k >= s.n_tok) {
+        return;
+    }
+    int* dst = pos_ids + 3 * (s.dst_offset + local_k);
+    if (s.h2 == 0) {  // text run
+        const int p = s.base_pos + local_k;
+        dst[0]      = p;
+        dst[1]      = p;
+        dst[2]      = p;
+    }
+    else {  // image run - grid math uses the original (un-clipped) k
+        const int k  = s.k_offset + local_k;
+        const int hw = s.h2 * s.w2;
+        dst[0]       = s.base_pos + k / hw;
+        dst[1]       = s.base_pos + (k / s.w2) % s.h2;
+        dst[2]       = s.base_pos + k % s.w2;
+    }
+}
+
+// ------------------------------------------------------------------------------------
+// 6. Window attention reordering (Qwen2.5)
+// ------------------------------------------------------------------------------------
+
+template<int vec_size, typename T>
+__global__ void windowReorderKernel(T*         out,
+                                    const T*   in,
+                                    const int* window_idx,
+                                    int64_t    out_stride,
+                                    int64_t    in_stride,
+                                    int        merge_unit,
+                                    int        group_count,
+                                    int        dim)
+{
+    const int dst_group = blockIdx.x;
+    const int inner     = blockIdx.y;
+    const int di        = (threadIdx.x + blockIdx.z * blockDim.x) * vec_size;
+    if (di >= dim) {
+        return;
+    }
+
+    const int src_group = window_idx[dst_group];
+    using Vec           = Array<T, vec_size>;
+    Vec x;
+    Load(x, in + ((int64_t)src_group * merge_unit + inner) * in_stride + di);
+    Store(out + ((int64_t)dst_group * merge_unit + inner) * out_stride + di, x);
+}
+
+template<int vec_size, typename T>
+__global__ void reverseWindowKernel(
+    T* out, const T* in, const int* window_idx, int64_t out_stride, int64_t in_stride, int group_count, int dim)
+{
+    const int src_group = blockIdx.x;
+    const int di        = (threadIdx.x + blockIdx.y * blockDim.x) * vec_size;
+    if (di >= dim) {
+        return;
+    }
+
+    const int dst_group = window_idx[src_group];
+    using Vec           = Array<T, vec_size>;
+    Vec x;
+    Load(x, in + (int64_t)src_group * in_stride + di);
+    Store(out + (int64_t)dst_group * out_stride + di, x);
+}
+
+__global__ void buildWindowMappedIdxKernel(
+    int* window_mapped_idx, const int* mapped_idx, const int* window_idx, int merge_unit, int total)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= total) {
+        return;
+    }
+
+    const int dst_group    = idx / merge_unit;
+    const int inner        = idx - dst_group * merge_unit;
+    const int src_group    = window_idx[dst_group];
+    window_mapped_idx[idx] = mapped_idx[src_group * merge_unit + inner];
+}
+
+}  // namespace
+
+// =====================================================================================
+// Public entry points
+// =====================================================================================
+
+void invokeQwenVitPrepareQKV(void*        qkv,
+                             void*        kv,
+                             const void*  qkv_bias,
+                             const void*  rotary_pos_emb,
+                             const int*   mapped_idx,
+                             DataType     dtype,
+                             int          token_num,
+                             int          local_head_num,
+                             int          head_dim,
+                             int          rope_head_dim,
+                             cudaStream_t stream)
+{
+    if (token_num == 0) {
+        return;
+    }
+
+    auto invoke = [&](auto t) {
+        using T = decltype(t);
+        dispatchPrepareQKV((T*)qkv,
+                           (T*)kv,
+                           (const T*)qkv_bias,
+                           (const T*)rotary_pos_emb,
+                           mapped_idx,
+                           token_num,
+                           local_head_num,
+                           head_dim,
+                           rope_head_dim,
+                           stream);
+    };
+
+    TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke);
+}
+
+void invokeQwenVitBuildMappedIdx(int*         mapped_idx,
+                                 int          token_offset,
+                                 int          natural_offset,
+                                 int          t,
+                                 int          h,
+                                 int          w,
+                                 int          spatial_merge_size,
+                                 cudaStream_t stream)
+{
+    if (t * h * w == 0) {
+        return;
+    }
+
+    const int total   = t * h * w;
+    const int threads = 256;
+    buildMappedIdxKernel<<<(total + threads - 1) / threads, threads, 0, stream>>>(
+        mapped_idx, token_offset, natural_offset, t, h, w, spatial_merge_size);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeQwenVitBuildMappedIdx(int*         mapped_idx,
+                                 const int*   grid_thws,
+                                 const int*   grid_offsets,
+                                 int          num_grids,
+                                 int          spatial_merge_size,
+                                 cudaStream_t stream)
+{
+    if (num_grids == 0) {
+        return;
+    }
+
+    const int threads = 256;
+    buildMappedIdxBatchedKernel<<<num_grids, threads, 0, stream>>>(
+        mapped_idx, grid_thws, grid_offsets, num_grids, spatial_merge_size);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeFastPosEmbedIdxWeight(int*         idx_out,
+                                 void*        weight_out,
+                                 DataType     dtype,
+                                 const int*   grid_thws,
+                                 const int*   grid_offsets,
+                                 int          num_grids,
+                                 int          total_n,
+                                 int          num_grid_per_side,
+                                 cudaStream_t stream)
+{
+    if (total_n <= 0 || num_grids <= 0) {
+        return;
+    }
+    const int block = 256;
+    const int grid  = (total_n + block - 1) / block;
+
+    auto invoke = [&](auto t) {
+        using T = decltype(t);
+        fastPosEmbedIdxWeightKernel<T><<<grid, block, 0, stream>>>(
+            idx_out, (T*)weight_out, grid_thws, grid_offsets, num_grids, total_n, num_grid_per_side);
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke);
+}
+
+void invokeFusedPosEmbedMerge(void*        hidden_states,
+                              const void*  pos_embeds,
+                              const void*  pos_embed_weights,
+                              const int*   mapped_idx,
+                              const void*  bias,
+                              int          batch,
+                              int          hidden,
+                              DataType     dtype,
+                              cudaStream_t stream)
+{
+    if (batch <= 0) {
+        return;
+    }
+
+    const dim3 grid(batch);
+    const dim3 block(128);
+
+    auto invoke = [&](auto t) {
+        using T                = decltype(t);
+        constexpr int vec_size = sizeof(uint4) / sizeof(T);
+        TM_CHECK(hidden % vec_size == 0);
+        fusedPosEmbedMergeKernel<vec_size, T><<<grid, block, 0, stream>>>((T*)hidden_states,
+                                                                          (const T*)pos_embeds,
+                                                                          (const T*)pos_embed_weights,
+                                                                          mapped_idx,
+                                                                          (const T*)bias,
+                                                                          hidden,
+                                                                          hidden / vec_size);
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke);
+}
+
+void invokeQwenVitRotaryPosEmb(void*        cos_sin,
+                               DataType     dtype,
+                               const int*   grid_thws,
+                               const int*   grid_offsets,
+                               int          num_grids,
+                               int          total_hw,
+                               int          head_dim,
+                               float        theta,
+                               cudaStream_t stream)
+{
+    if (total_hw <= 0 || num_grids <= 0 || head_dim <= 0) {
+        return;
+    }
+    TM_CHECK(head_dim % 4 == 0) << "head_dim must be divisible by 4, got " << head_dim;
+
+    const int   total = total_hw * (head_dim / 2);
+    const int   block = 256;
+    const int   grid  = (total + block - 1) / block;
+    const float scale = -log2f(theta) / (float)(head_dim / 4);
+
+    auto invoke = [&](auto t) {
+        using T = decltype(t);
+        fastRotaryPosEmbKernel<T>
+            <<<grid, block, 0, stream>>>((T*)cos_sin, grid_thws, grid_offsets, num_grids, total_hw, head_dim, scale);
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke);
+}
+
+void invokeMropePositionIds(
+    int* pos_ids, const MropeSegment* segments, int num_segments, int max_seg_len, cudaStream_t stream)
+{
+    if (num_segments <= 0 || max_seg_len <= 0) {
+        return;
+    }
+    const int  tiles = (max_seg_len + kMropeBlock - 1) / kMropeBlock;
+    const dim3 grid((unsigned)num_segments, (unsigned)tiles);
+    mropeScatterKernel<<<grid, kMropeBlock, 0, stream>>>(pos_ids, segments);
+}
+
+void invokeQwenVitWindowReorder(
+    Tensor& out, const Tensor& in, const int* window_idx, int merge_unit, int group_count, cudaStream_t stream)
+{
+    if (group_count == 0) {
+        return;
+    }
+
+    const int dim     = in.shape(1);
+    const int threads = 256;
+
+    auto invoke = [&](auto t) {
+        using T               = decltype(t);
+        constexpr int max_vec = sizeof(uint4) / sizeof(T);
+
+        int best_vec_size = 1;
+        for (int v = max_vec; v >= 1; v >>= 1) {
+            if (dim % v == 0 && in.stride(0) % v == 0 && out.stride(0) % v == 0) {
+                best_vec_size = v;
+                break;
+            }
+        }
+
+        auto launch = [&](auto vec_size_c) {
+            constexpr int vec_size = decltype(vec_size_c)::value;
+            const dim3    grid(group_count, merge_unit, cdiv(dim, threads * vec_size));
+            windowReorderKernel<vec_size, T><<<grid, threads, 0, stream>>>(
+                out.data<T>(), in.data<T>(), window_idx, out.stride(0), in.stride(0), merge_unit, group_count, dim);
+        };
+
+        switch (best_vec_size) {
+            case 8:
+                return launch(std::integral_constant<int, 8>{});
+            case 4:
+                return launch(std::integral_constant<int, 4>{});
+            case 2:
+                return launch(std::integral_constant<int, 2>{});
+            default:
+                return launch(std::integral_constant<int, 1>{});
+        }
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(in.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeQwenVitReverseWindow(
+    Tensor& out, const Tensor& in, const int* window_idx, int group_count, cudaStream_t stream)
+{
+    if (group_count == 0) {
+        return;
+    }
+
+    const int dim     = in.shape(1);
+    const int threads = 256;
+
+    auto invoke = [&](auto t) {
+        using T               = decltype(t);
+        constexpr int max_vec = sizeof(uint4) / sizeof(T);
+
+        int best_vec_size = 1;
+        for (int v = max_vec; v >= 1; v >>= 1) {
+            if (dim % v == 0 && in.stride(0) % v == 0 && out.stride(0) % v == 0) {
+                best_vec_size = v;
+                break;
+            }
+        }
+
+        auto launch = [&](auto vec_size_c) {
+            constexpr int vec_size = decltype(vec_size_c)::value;
+            const dim3    grid(group_count, cdiv(dim, threads * vec_size));
+            reverseWindowKernel<vec_size, T><<<grid, threads, 0, stream>>>(
+                out.data<T>(), in.data<T>(), window_idx, out.stride(0), in.stride(0), group_count, dim);
+        };
+
+        switch (best_vec_size) {
+            case 8:
+                return launch(std::integral_constant<int, 8>{});
+            case 4:
+                return launch(std::integral_constant<int, 4>{});
+            case 2:
+                return launch(std::integral_constant<int, 2>{});
+            default:
+                return launch(std::integral_constant<int, 1>{});
+        }
+    };
+    TM_DISPATCH_PRIMARY_DTYPES(in.dtype(), invoke);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+void invokeQwenVitBuildWindowMappedIdx(int*         window_mapped_idx,
+                                       const int*   mapped_idx,
+                                       const int*   window_idx,
+                                       int          merge_unit,
+                                       int          group_count,
+                                       cudaStream_t stream)
+{
+    if (group_count == 0) {
+        return;
+    }
+
+    const int total   = group_count * merge_unit;
+    const int threads = 256;
+    buildWindowMappedIdxKernel<<<cdiv(total, threads), threads, 0, stream>>>(
+        window_mapped_idx, mapped_idx, window_idx, merge_unit, total);
+    TM_CUDA_CHECK(cudaGetLastError());
+}
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/qwenvit/qwenvit_kernels.h b/src/turbomind/models/qwenvit/qwenvit_kernels.h
new file mode 100644
index 0000000000..29cf4d78da
--- /dev/null
+++ b/src/turbomind/models/qwenvit/qwenvit_kernels.h
@@ -0,0 +1,148 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/data_type.h"
+
+#include <cuda_runtime.h>
+
+namespace turbomind {
+
+// =====================================================================================
+// QKV preprocessing (qkv_preprocess)
+// =====================================================================================
+
+// Prepare the Qwen ViT attention inputs after the fused QKV projection.
+//
+// qkv layout:
+//   [token, local_q_heads + 2 * local_kv_heads, head_dim]
+// Q is updated in place with bias + RoPE. K/V are written to `kv` as:
+//   [local_kv_heads, 2, token, head_dim]
+//
+// `rope_head_dim` is the per-head dim of the rotary_pos_emb buffer and is
+// also the cutoff below which RoPE is applied. When the model's real head_dim
+// is not natively supported by the attention kernel, Q/K/V are zero-padded
+// per-head to a kernel-supported `head_dim` >= `rope_head_dim`; the padded
+// `[rope_head_dim, head_dim)` slice has zero Q/K so RoPE is skipped there.
+void invokeQwenVitPrepareQKV(void*        qkv,
+                             void*        kv,
+                             const void*  qkv_bias,
+                             const void*  rotary_pos_emb,
+                             const int*   mapped_idx,
+                             DataType     dtype,
+                             int          token_num,
+                             int          local_head_num,
+                             int          head_dim,
+                             int          rope_head_dim,
+                             cudaStream_t stream);
+
+// =====================================================================================
+// Spatial-merge index mapping (grid_mapping)
+// =====================================================================================
+
+void invokeQwenVitBuildMappedIdx(int*         mapped_idx,
+                                 int          token_offset,
+                                 int          natural_offset,
+                                 int          t,
+                                 int          h,
+                                 int          w,
+                                 int          spatial_merge_size,
+                                 cudaStream_t stream);
+
+void invokeQwenVitBuildMappedIdx(int*         mapped_idx,
+                                 const int*   grid_thws,
+                                 const int*   grid_offsets,
+                                 int          num_grids,
+                                 int          spatial_merge_size,
+                                 cudaStream_t stream);
+
+// =====================================================================================
+// Learned positional-embedding bilinear interpolation (fast_pos_embed) — Qwen3.5
+// =====================================================================================
+
+// Precomputes the 4 bilinear-interpolation gather indices and weights
+// used by the subsequent pos-embed merge step in Qwen3-VL.
+void invokeFastPosEmbedIdxWeight(int*         idx_out,     // [total_n * 4]
+                                 void*        weight_out,  // [total_n * 4]
+                                 DataType     dtype,
+                                 const int*   grid_thws,     // [num_grids * 3], (t, h, w)
+                                 const int*   grid_offsets,  // [num_grids * 2], (t*h*w, h*w)
+                                 int          num_grids,
+                                 int          total_n,
+                                 int          num_grid_per_side,
+                                 cudaStream_t stream);
+
+// Fuses the spatial-merge permutation, the bilinear-weighted sum, and the
+// t-expansion of Qwen3-VL ViT pos_embed interpolation into a single pass on
+// top of the patch_embed linear output. (fused_embed_merge)
+void invokeFusedPosEmbedMerge(void*        hidden_states,      // [batch, hidden]
+                              const void*  pos_embeds,         // [total_hw * 4, hidden]
+                              const void*  pos_embed_weights,  // [total_hw * 4]
+                              const int*   mapped_idx,         // [batch]
+                              const void*  bias,               // [hidden] or nullptr
+                              int          batch,
+                              int          hidden,
+                              DataType     dtype,
+                              cudaStream_t stream);
+
+// =====================================================================================
+// 2D rotary position embedding table (fast_rotary_pos_emb)
+// =====================================================================================
+
+// Precomputes the (cos, sin) rotary-embedding table for Qwen-VL vision tokens.
+// Layout per natural flat position (keyed by the same index `mapped_idx` carries):
+//   [c_0, s_0, c_1, s_1, ..., c_{head_dim/2-1}, s_{head_dim/2-1}]
+// The pair index `k` uses `h_coord` for k < head_dim/4 and `w_coord` otherwise,
+// with inv_freq = theta^(-2*(k % (head_dim/4)) / (head_dim/2)).
+void invokeQwenVitRotaryPosEmb(void*        cos_sin,  // [total_hw, head_dim]
+                               DataType     dtype,
+                               const int*   grid_thws,     // [num_grids * 3], (t, h, w)
+                               const int*   grid_offsets,  // [num_grids * 2], (t*h*w, h*w)
+                               int          num_grids,
+                               int          total_hw,
+                               int          head_dim,
+                               float        theta,
+                               cudaStream_t stream);
+
+// =====================================================================================
+// mrope position ids (mrope_position_ids)
+// =====================================================================================
+
+// One descriptor per text / image run, clipped to a prefill chunk's active window.
+// `h2 == 0` flags a text run (real image grids always have h2 > 0).
+struct MropeSegment {
+    int dst_offset;  // flat forward-token index of the first token written by this segment
+    int n_tok;       // tokens to write (already clipped to the active range)
+    int base_pos;    // text: position id at local_k = 0; image: image's mm_start
+    int h2;          // image grid h after spatial merge (0 => text)
+    int w2;          // image grid w after spatial merge (ignored when h2 == 0)
+    int k_offset;    // starting "k" for image grid math (clip-offset within the run); unused for text
+};
+
+// Scatter `num_segments` segments into `pos_ids` of shape (max_fwd_tokens, 3).
+// `pos_ids` may be null when num_segments == 0.
+void invokeMropePositionIds(int*                pos_ids,
+                            const MropeSegment* segments,  // device
+                            int                 num_segments,
+                            int                 max_seg_len,
+                            cudaStream_t        stream);
+
+// =====================================================================================
+// Window attention reordering (window kernels) — Qwen2.5
+// =====================================================================================
+
+void invokeQwenVitWindowReorder(
+    Tensor& out, const Tensor& in, const int* window_idx, int merge_unit, int group_count, cudaStream_t stream);
+
+void invokeQwenVitReverseWindow(
+    Tensor& out, const Tensor& in, const int* window_idx, int group_count, cudaStream_t stream);
+
+void invokeQwenVitBuildWindowMappedIdx(int*         window_mapped_idx,
+                                       const int*   mapped_idx,
+                                       const int*   window_idx,
+                                       int          merge_unit,
+                                       int          group_count,
+                                       cudaStream_t stream);
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.cc b/src/turbomind/models/qwenvit/qwenvit_weight.cc
similarity index 57%
rename from src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.cc
rename to src/turbomind/models/qwenvit/qwenvit_weight.cc
index ff55324f1d..14caf479cb 100644
--- a/src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.cc
+++ b/src/turbomind/models/qwenvit/qwenvit_weight.cc
@@ -1,36 +1,38 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h"
+#include "src/turbomind/models/qwenvit/qwenvit_weight.h"
 
 #include "src/turbomind/core/registry.h"
 #include "src/turbomind/models/layer_norm_weight.h"
 #include "src/turbomind/models/linear_weight.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h"
+#include "src/turbomind/models/qwenvit/qwenvit_block_weight.h"
 #include "src/turbomind/utils/memory_utils.h"
 
 namespace turbomind {
 
-Qwen3_5VitWeight::Qwen3_5VitWeight(const core::Qwen3_5VitConfig& cfg): config_{cfg} {}
+QwenVitWeight::QwenVitWeight(const core::QwenVitConfig& cfg): config_{cfg} {}
 
-void Qwen3_5VitWeight::prepare()
+void QwenVitWeight::prepare()
 {
     for_each_child([](const char* /*name*/, Module* child) {
         if (child)
             child->prepare();
     });
 
+    // Qwen3.5 carries a learned positional-embedding table; keep it in the ViT dtype.
     if (pos_embed) {
         EnsureFloatDtype(pos_embed, config_.data_type);
     }
 }
 
-bool Qwen3_5VitWeight::verify(std::vector<std::string>& missing)
+bool QwenVitWeight::verify(std::vector<std::string>& missing)
 {
     Module::verify(missing);
     if (!patch_embed) {
         missing.push_back(full_path() + ": missing patch_embed");
     }
-    if (!pos_embed) {
+    // pos_embed only exists for models with a learned position-embedding table (Qwen3.5).
+    if (config_.num_position_embeddings > 0 && !pos_embed) {
         missing.push_back(full_path() + ": missing pos_embed");
     }
     if (!blocks || blocks->size() != config_.depth) {
@@ -42,16 +44,16 @@ bool Qwen3_5VitWeight::verify(std::vector<std::string>& missing)
     return missing.empty();
 }
 
-Qwen3_5VitBlockWeight* Qwen3_5VitWeight::block(int i) const
+QwenVitBlockWeight* QwenVitWeight::block(int i) const
 {
     if (!blocks) {
         return nullptr;
     }
-    return static_cast<Qwen3_5VitBlockWeight*>(blocks->child(std::to_string(i)));
+    return static_cast<QwenVitBlockWeight*>(blocks->child(std::to_string(i)));
 }
 
-TM_MODULE_REGISTER(Qwen3_5VitWeight, core::Qwen3_5VitConfig);
+TM_MODULE_REGISTER(QwenVitWeight, core::QwenVitConfig);
 
-TM_MODULE_METHODS(Qwen3_5VitWeight, QWEN3_5VIT_WEIGHT_CHILDREN, QWEN3_5VIT_WEIGHT_PARAMS)
+TM_MODULE_METHODS(QwenVitWeight, QWENVIT_WEIGHT_CHILDREN, QWENVIT_WEIGHT_PARAMS)
 
 }  // namespace turbomind
diff --git a/src/turbomind/models/qwenvit/qwenvit_weight.h b/src/turbomind/models/qwenvit/qwenvit_weight.h
new file mode 100644
index 0000000000..5fc0ca9ffa
--- /dev/null
+++ b/src/turbomind/models/qwenvit/qwenvit_weight.h
@@ -0,0 +1,132 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#pragma once
+
+#include "src/turbomind/core/core.h"
+#include "src/turbomind/core/module.h"
+#include "src/turbomind/kernels/norm/norm.h"
+#include "src/turbomind/models/vision_model_weight.h"
+
+#include <vector>
+
+namespace turbomind::core {
+
+/// Root config for the Qwen ViT family (Qwen2-VL / Qwen2.5-VL / Qwen3.5).
+///
+/// Carries every structural scalar the C++ runtime needs to allocate kernels.
+/// The feature set of the two model families is the union of orthogonal toggles:
+///   - window attention       (Qwen2.5):  use_window_attention / window_size / fullatt_block_indexes
+///   - learned pos embedding   (Qwen3.5):  num_position_embeddings > 0 (+ pos_embed weight)
+///   - gated SiLU MLP          (Qwen2.5):  gated_mlp
+///   - tanh-approx GELU MLP    (Qwen3.5):  gelu_tanh
+///   - RMSNorm vs LayerNorm    (Qwen2):    norm_type
+///
+/// Each field is visited by the X-macro below so pybind11 exposes it as a
+/// read/write attribute on the Python ``QwenVitConfig``.
+struct QwenVitConfig: ModuleConfig {
+    QwenVitConfig(): ModuleConfig{"QwenVitWeight"} {}
+
+    DataType         data_type{};
+    int              hidden_dim{0};
+    int              out_hidden_dim{0};
+    int              depth{0};
+    int              head_num{0};
+    int              intermediate_size{0};
+    int              patch_in_dim{0};
+    int              in_channels{0};
+    int              patch_size{0};
+    int              temporal_patch_size{0};
+    int              spatial_merge_size{0};
+    int              num_position_embeddings{0};
+    int              window_size{0};
+    bool             gated_mlp{false};
+    bool             use_window_attention{false};
+    bool             gelu_tanh{false};
+    NormType         norm_type{NormType::kLayerNorm};
+    float            norm_eps{1e-6f};
+    std::vector<int> fullatt_block_indexes;
+
+#define QWENVIT_FIELDS(X)                                                                                              \
+    X(DataType, data_type)                                                                                             \
+    X(int, hidden_dim)                                                                                                 \
+    X(int, out_hidden_dim)                                                                                             \
+    X(int, depth)                                                                                                      \
+    X(int, head_num)                                                                                                   \
+    X(int, intermediate_size)                                                                                          \
+    X(int, patch_in_dim)                                                                                               \
+    X(int, in_channels)                                                                                                \
+    X(int, patch_size)                                                                                                 \
+    X(int, temporal_patch_size)                                                                                        \
+    X(int, spatial_merge_size)                                                                                         \
+    X(int, num_position_embeddings, 0)                                                                                 \
+    X(int, window_size, 0)                                                                                             \
+    X(bool, gated_mlp, false)                                                                                          \
+    X(bool, use_window_attention, false)                                                                               \
+    X(bool, gelu_tanh, false)                                                                                          \
+    X(NormType, norm_type, NormType::kLayerNorm)                                                                       \
+    X(std::vector<int>, fullatt_block_indexes)                                                                         \
+    X(float, norm_eps, 1e-6f)
+
+    TM_FOR_EACH(QwenVitConfig, QWENVIT_FIELDS)
+
+#undef QWENVIT_FIELDS
+};
+
+}  // namespace turbomind::core
+
+namespace turbomind {
+
+// Forward decls
+class LayerNormWeight;
+class LinearWeight;
+class QwenVitBlockWeight;
+
+/// Unified Qwen ViT weight tree (Qwen2-VL / Qwen2.5-VL / Qwen3.5).
+///
+/// Tree:
+///   patch_embed   LinearWeight (Conv3d-as-Linear; in_dim = C·T·patch²)
+///   pos_embed     raw tensor   (num_position_embeddings × hidden_dim) — Qwen3.5 only, optional
+///   blocks        ModuleList   of QwenVitBlockWeight × depth
+///   merger_fc1    LinearWeight (in: hidden·spatial_merge², out: 4·hidden)
+///   merger_fc2    LinearWeight (in: 4·hidden,              out: out_hidden)
+///   merger_norm   LayerNormWeight or NormWeight (over hidden_dim)
+///
+/// We expose ``merger_*`` as direct children rather than a sub-module to
+/// keep the weight tree shallow — the merger has only three pieces.
+class QwenVitWeight: public VisionModelWeight {
+public:
+    const char* type() const override
+    {
+        return "QwenVitWeight";
+    }
+
+    QwenVitWeight() = default;
+    explicit QwenVitWeight(const core::QwenVitConfig& cfg);
+
+    void prepare() override;
+    bool verify(std::vector<std::string>& missing) override;
+
+    // --- X-macro field lists ---
+#define QWENVIT_WEIGHT_CHILDREN(X)                                                                                     \
+    X(LinearWeight, patch_embed)                                                                                       \
+    X(core::ModuleList, blocks)                                                                                        \
+    X(LinearWeight, merger_fc1)                                                                                        \
+    X(LinearWeight, merger_fc2)                                                                                        \
+    X(core::Module, merger_norm)
+
+#define QWENVIT_WEIGHT_PARAMS(X) X(pos_embed)
+
+    TM_MODULE_DECLARE(QwenVitWeight, QWENVIT_WEIGHT_CHILDREN, QWENVIT_WEIGHT_PARAMS)
+
+    // --- Accessors ---
+    const core::QwenVitConfig& config() const noexcept
+    {
+        return config_;
+    }
+
+    QwenVitBlockWeight* block(int i) const;
+
+private:
+    core::QwenVitConfig config_{};
+};
+
+}  // namespace turbomind
diff --git a/src/turbomind/models/qwen3_5vit/test_mrope_position_ids.cu b/src/turbomind/models/qwenvit/test_mrope_position_ids.cu
similarity index 71%
rename from src/turbomind/models/qwen3_5vit/test_mrope_position_ids.cu
rename to src/turbomind/models/qwenvit/test_mrope_position_ids.cu
index b622eda4c8..eb6854d081 100644
--- a/src/turbomind/models/qwen3_5vit/test_mrope_position_ids.cu
+++ b/src/turbomind/models/qwenvit/test_mrope_position_ids.cu
@@ -1,7 +1,7 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 //
 // Standalone test for invokeMropePositionIds. Builds segment descriptors with the same
-// logic the production Qwen3_5Vit::Impl::Setup() uses, runs the device kernel, and
+// logic the production QwenVit::Impl::Setup() uses, runs the device kernel, and
 // compares against a CPU reference that replicates the pre-refactor scalar loop.
 
 #include <algorithm>
@@ -13,12 +13,21 @@
 
 #include <cuda_runtime.h>
 
-#include "src/turbomind/models/qwen3_5vit/mrope_position_ids.h"
+#include "src/turbomind/models/qwenvit/qwenvit_kernels.h"
 
 using namespace turbomind;
 
 namespace {
 
+#define CHECK_CUDA(call)                                                                                               \
+    do {                                                                                                               \
+        const cudaError_t err = (call);                                                                                \
+        if (err != cudaSuccess) {                                                                                      \
+            std::printf("[CUDA] %s failed: %s\n", #call, cudaGetErrorString(err));                                     \
+            return 1;                                                                                                  \
+        }                                                                                                              \
+    } while (0)
+
 struct ImageSpec {
     int seq_start;
     int t;
@@ -36,7 +45,6 @@ struct RequestSpec {
 
 // Pre-refactor scalar reference: builds the full (seq_len, 3) row for each request,
 // then we read out the [active_start, active_end) slice to compare against kernel output.
-// Mirrors qwen3_5vit.cc:440-516 (pre-refactor) exactly.
 std::vector<int> cpu_reference_full_row(const RequestSpec& r, int S, int& out_delta)
 {
     std::vector<int> row(r.seq_len * 3, 0);
@@ -87,8 +95,8 @@ std::vector<int> cpu_reference_full_row(const RequestSpec& r, int S, int& out_de
     return row;
 }
 
-// Mirrors the host walk in qwen3_5vit.cc Setup() — same logic, returns the segment list.
-void emit_segments(const RequestSpec& r, int request_idx, int S, std::vector<MropeSegment>& out)
+// Mirrors the host walk in qwenvit.cc Setup() - same logic, returns the segment list.
+void emit_segments(const RequestSpec& r, int q_offset, int S, std::vector<MropeSegment>& out)
 {
     if (r.autoregres || r.images.empty()) {
         return;
@@ -102,8 +110,7 @@ void emit_segments(const RequestSpec& r, int request_idx, int S, std::vector<Mro
         }
         const int    local_off = a - run_start;
         MropeSegment seg{};
-        seg.dst_row    = request_idx;
-        seg.dst_offset = a;
+        seg.dst_offset = q_offset + (a - r.active_start);
         seg.n_tok      = b - a;
         seg.base_pos   = (h2 == 0) ? run_base + local_off : run_base;
         seg.h2         = h2;
@@ -132,49 +139,49 @@ void emit_segments(const RequestSpec& r, int request_idx, int S, std::vector<Mro
 
 int run_case(const std::string& name, const std::vector<RequestSpec>& batch, int S)
 {
-    const int bsz            = (int)batch.size();
-    int       max_active_end = 0;
-    int       max_seg_len    = 0;
-    bool      any_table      = false;
+    const int        bsz = (int)batch.size();
+    std::vector<int> q_offsets(bsz + 1, 0);
+    int              max_seg_len = 0;
+    bool             any_table   = false;
 
     std::vector<MropeSegment> segs;
     for (int i = 0; i < bsz; ++i) {
+        q_offsets[i + 1]    = q_offsets[i] + (batch[i].autoregres ? 1 : batch[i].active_end - batch[i].active_start);
         const size_t before = segs.size();
-        emit_segments(batch[i], i, S, segs);
+        emit_segments(batch[i], q_offsets[i], S, segs);
         for (size_t j = before; j < segs.size(); ++j) {
             max_seg_len = std::max(max_seg_len, segs[j].n_tok);
         }
         if (!batch[i].autoregres && !batch[i].images.empty()) {
-            max_active_end = std::max(max_active_end, batch[i].active_end);
-            any_table      = true;
+            any_table = true;
         }
     }
 
     // Run kernel
     std::vector<int> kernel_out;
     if (any_table) {
-        const ssize_t pos_ids_count = (ssize_t)bsz * max_active_end * 3;
+        const ssize_t pos_ids_count = (ssize_t)q_offsets.back() * 3;
         int*          d_pos_ids     = nullptr;
-        cudaMalloc(&d_pos_ids, pos_ids_count * sizeof(int));
-        cudaMemset(d_pos_ids, 0xCC, pos_ids_count * sizeof(int));  // poison
+        CHECK_CUDA(cudaMalloc(&d_pos_ids, pos_ids_count * sizeof(int)));
+        CHECK_CUDA(cudaMemset(d_pos_ids, 0xCC, pos_ids_count * sizeof(int)));  // poison
 
         MropeSegment* d_segs = nullptr;
-        cudaMalloc(&d_segs, segs.size() * sizeof(MropeSegment));
-        cudaMemcpy(d_segs, segs.data(), segs.size() * sizeof(MropeSegment), cudaMemcpyHostToDevice);
+        CHECK_CUDA(cudaMalloc(&d_segs, segs.size() * sizeof(MropeSegment)));
+        CHECK_CUDA(cudaMemcpy(d_segs, segs.data(), segs.size() * sizeof(MropeSegment), cudaMemcpyHostToDevice));
 
         invokeMropePositionIds(d_pos_ids,
-                               max_active_end * 3,
                                d_segs,
                                (int)segs.size(),
                                max_seg_len,
                                /*stream=*/0);
-        cudaDeviceSynchronize();
+        CHECK_CUDA(cudaGetLastError());
+        CHECK_CUDA(cudaDeviceSynchronize());
 
         kernel_out.resize(pos_ids_count);
-        cudaMemcpy(kernel_out.data(), d_pos_ids, pos_ids_count * sizeof(int), cudaMemcpyDeviceToHost);
+        CHECK_CUDA(cudaMemcpy(kernel_out.data(), d_pos_ids, pos_ids_count * sizeof(int), cudaMemcpyDeviceToHost));
 
-        cudaFree(d_pos_ids);
-        cudaFree(d_segs);
+        CHECK_CUDA(cudaFree(d_pos_ids));
+        CHECK_CUDA(cudaFree(d_segs));
     }
 
     // Compare against CPU reference within each request's active range
@@ -188,7 +195,7 @@ int run_case(const std::string& name, const std::vector<RequestSpec>& batch, int
         }
         for (int k = r.active_start; k < r.active_end; ++k) {
             for (int c = 0; c < 3; ++c) {
-                const int got = kernel_out[(size_t)i * max_active_end * 3 + (size_t)k * 3 + c];
+                const int got = kernel_out[((size_t)q_offsets[i] + k - r.active_start) * 3 + c];
                 const int ref = full_row[k * 3 + c];
                 if (got != ref) {
                     if (errors < 16) {
@@ -202,10 +209,10 @@ int run_case(const std::string& name, const std::vector<RequestSpec>& batch, int
     }
 
     if (errors == 0) {
-        std::printf("[PASS] %s — bsz=%d segs=%zu max_active_end=%d\n", name.c_str(), bsz, segs.size(), max_active_end);
+        std::printf("[PASS] %s - bsz=%d segs=%zu q_tokens=%d\n", name.c_str(), bsz, segs.size(), q_offsets.back());
     }
     else {
-        std::printf("[FAIL] %s — %d mismatches\n", name.c_str(), errors);
+        std::printf("[FAIL] %s - %d mismatches\n", name.c_str(), errors);
     }
     return errors;
 }
@@ -217,7 +224,7 @@ int main()
     const int S      = 2;  // spatial_merge_size
     int       errors = 0;
 
-    // (a) Decode-only batch — no table writes expected.
+    // (a) Decode-only batch - no table writes expected.
     errors += run_case("decode_only",
                        {RequestSpec{/*seq_len=*/64,
                                     /*active_start=*/64,
@@ -226,7 +233,7 @@ int main()
                                     /*images=*/{}}},
                        S);
 
-    // (b) Pure-text prefill — empty images, identity positions.
+    // (b) Pure-text prefill - empty images, identity positions.
     errors += run_case("pure_text", {RequestSpec{32, 0, 32, false, {}}}, S);
 
     // (c) Single-image prefill (image in the middle).
@@ -249,7 +256,7 @@ int main()
                         RequestSpec{30, 0, 30, false, {{6, 1, 6, 4}}}},  // image prefill
                        S);
 
-    // (g) Chunked prefill — second chunk, history_len > 0, active range mid-prompt.
+    // (g) Chunked prefill - second chunk, history_len > 0, active range mid-prompt.
     //     Image overlaps both chunks.
     errors += run_case("chunked_prefill",
                        {RequestSpec{60, 16, 32, false, {{8, 1, 8, 8}}}},  // image spans 8..24
@@ -258,10 +265,15 @@ int main()
     // (h) Multi-image with clipping.
     errors += run_case("multi_image_clip", {RequestSpec{80, 10, 50, false, {{6, 1, 4, 4}, {30, 2, 4, 4}}}}, S);
 
+    // (i) Chunked prefill fully after image tokens - validates post-image delta/table rows.
+    errors += run_case("chunked_after_image",
+                       {RequestSpec{80, 48, 72, false, {{8, 1, 8, 8}}}},  // image spans 8..24
+                       S);
+
     if (errors == 0) {
         std::printf("All cases passed.\n");
         return 0;
     }
-    std::printf("FAILED — %d total mismatches.\n", errors);
+    std::printf("FAILED - %d total mismatches.\n", errors);
     return 1;
 }
diff --git a/src/turbomind/models/vision_model.cc b/src/turbomind/models/vision_model.cc
index 6cfc8cbf60..4f2e10dcf4 100644
--- a/src/turbomind/models/vision_model.cc
+++ b/src/turbomind/models/vision_model.cc
@@ -3,8 +3,10 @@
 #include "src/turbomind/models/vision_model.h"
 
 #include "src/turbomind/core/logger.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h"
+#include "src/turbomind/models/internvit/internvit.h"
+#include "src/turbomind/models/internvit/internvit_weight.h"
+#include "src/turbomind/models/qwenvit/qwenvit.h"
+#include "src/turbomind/models/qwenvit/qwenvit_weight.h"
 
 #include <string_view>
 
@@ -15,8 +17,11 @@ std::unique_ptr<VisionModel> CreateVisionModel(const VisionModelWeight& weights,
                                                const Context&           ctx,
                                                int                      phases)
 {
-    if (std::string_view{weights.type()} == "Qwen3_5VitWeight") {
-        return std::make_unique<Qwen3_5Vit>(engine, ctx, static_cast<const Qwen3_5VitWeight&>(weights), phases);
+    if (std::string_view{weights.type()} == "QwenVitWeight") {
+        return std::make_unique<QwenVit>(engine, ctx, static_cast<const QwenVitWeight&>(weights), phases);
+    }
+    if (std::string_view{weights.type()} == "InternVitWeight") {
+        return std::make_unique<InternVit>(engine, ctx, static_cast<const InternVitWeight&>(weights), phases);
     }
 
     TM_LOG_FATAL("Unsupported vision model weight type: {}", weights.type());
diff --git a/src/turbomind/models/vision_model.h b/src/turbomind/models/vision_model.h
index 281ec6cc6b..a1c2edc9aa 100644
--- a/src/turbomind/models/vision_model.h
+++ b/src/turbomind/models/vision_model.h
@@ -16,7 +16,7 @@ namespace turbomind {
 
 /// Polymorphic peer of ``LanguageModel`` for the vision sub-graph.
 ///
-/// Concrete subclasses (one per VLM family — ``Qwen3_5Vit``,
+/// Concrete subclasses (one per VLM family — ``QwenVit``,
 /// ``InternVit``, …) wire up the per-family C++ runtime. The
 /// engine talks to this base via ``Run(BatchOp, phase, env)``,
 /// mirroring ``LanguageModel::Run``.
diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp
index 1a7a107a7f..21917807d9 100644
--- a/src/turbomind/python/bind.cpp
+++ b/src/turbomind/python/bind.cpp
@@ -22,18 +22,22 @@
 #include "src/turbomind/engine/engine_config.h"
 #include "src/turbomind/engine/model_request.h"
 #include "src/turbomind/engine/multimodal_input.h"
+#include "src/turbomind/kernels/norm/norm.h"
 #include "src/turbomind/models/attention_weight.h"
 #include "src/turbomind/models/decoder_layer_weight.h"
 #include "src/turbomind/models/delta_net_weight.h"
 #include "src/turbomind/models/ffn_weight.h"
+#include "src/turbomind/models/internvit/internvit_block_weight.h"
+#include "src/turbomind/models/internvit/internvit_input.h"
+#include "src/turbomind/models/internvit/internvit_weight.h"
 #include "src/turbomind/models/layer_norm_weight.h"
 #include "src/turbomind/models/linear_weight.h"
 #include "src/turbomind/models/model_weight.h"
 #include "src/turbomind/models/moe_weight.h"
 #include "src/turbomind/models/norm_weight.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_block_weight.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_input.h"
-#include "src/turbomind/models/qwen3_5vit/qwen3_5vit_weight.h"
+#include "src/turbomind/models/qwenvit/qwenvit_block_weight.h"
+#include "src/turbomind/models/qwenvit/qwenvit_input.h"
+#include "src/turbomind/models/qwenvit/qwenvit_weight.h"
 #include "src/turbomind/models/vision_model_weight.h"
 #include "src/turbomind/python/dlpack.h"
 #include "src/turbomind/turbomind.h"
@@ -333,10 +337,12 @@ PYBIND11_MODULE(_turbomind, m)
 {
     py::module_ multimodal = m.def_submodule("multimodal");
 
-    using MMInput      = ft::multimodal::Input;
-    using MMModality   = ft::multimodal::Modality;
-    using QwenVitItem  = ft::multimodal::Qwen3_5VitItem;
-    using QwenVitInput = ft::multimodal::Qwen3_5VitInput;
+    using MMInput        = ft::multimodal::Input;
+    using MMModality     = ft::multimodal::Modality;
+    using InternVitItem  = ft::multimodal::InternVitItem;
+    using InternVitInput = ft::multimodal::InternVitInput;
+    using QwenVitItem    = ft::multimodal::QwenVitItem;
+    using QwenVitInput   = ft::multimodal::QwenVitInput;
     py::class_<MMInput, std::shared_ptr<MMInput>>(multimodal, "Input");
     py::enum_<MMModality>(multimodal, "Modality")
         .value("IMAGE", MMModality::kImage)
@@ -344,7 +350,7 @@ PYBIND11_MODULE(_turbomind, m)
         .value("AUDIO", MMModality::kAudio)
         .value("TIME_SERIES", MMModality::kTimeSeries)
         .export_values();
-    py::class_<QwenVitItem>(multimodal, "Qwen3_5VitItem")
+    py::class_<QwenVitItem>(multimodal, "QwenVitItem")
         .def(py::init<>())
         .def(py::init([](MMModality              modality,
                          std::shared_ptr<Tensor> data,
@@ -366,10 +372,30 @@ PYBIND11_MODULE(_turbomind, m)
         .def_readwrite("token_begin", &QwenVitItem::token_begin)
         .def_readwrite("token_end", &QwenVitItem::token_end)
         .def_readwrite("grid_thw", &QwenVitItem::grid_thw);
-    py::class_<QwenVitInput, MMInput, std::shared_ptr<QwenVitInput>>(multimodal, "Qwen3_5VitInput")
+    py::class_<QwenVitInput, MMInput, std::shared_ptr<QwenVitInput>>(multimodal, "QwenVitInput")
         .def(py::init<>())
         .def(py::init<std::vector<QwenVitItem>>(), "items"_a)
         .def_readwrite("items", &QwenVitInput::items);
+    py::class_<InternVitItem>(multimodal, "InternVitItem")
+        .def(py::init<>())
+        .def(py::init([](MMModality modality, std::shared_ptr<Tensor> data, int token_begin, int token_end) {
+                 return InternVitItem{modality, *data, token_begin, token_end};
+             }),
+             "modality"_a,
+             "data"_a,
+             "token_begin"_a,
+             "token_end"_a)
+        .def_readwrite("modality", &InternVitItem::modality)
+        .def_property(
+            "data",
+            [](const InternVitItem& self) { return std::make_shared<Tensor>(self.data); },
+            [](InternVitItem& self, std::shared_ptr<Tensor> data) { self.data = *data; })
+        .def_readwrite("token_begin", &InternVitItem::token_begin)
+        .def_readwrite("token_end", &InternVitItem::token_end);
+    py::class_<InternVitInput, MMInput, std::shared_ptr<InternVitInput>>(multimodal, "InternVitInput")
+        .def(py::init<>())
+        .def(py::init<std::vector<InternVitItem>>(), "items"_a)
+        .def_readwrite("items", &InternVitInput::items);
 
     py::class_<ft::RequestMetrics, std::shared_ptr<ft::RequestMetrics>>(m, "RequestMetrics")
         .def(py::init())
@@ -440,9 +466,9 @@ PYBIND11_MODULE(_turbomind, m)
     py::class_<ft::AtomicRequestState, std::shared_ptr<ft::AtomicRequestState>>(m, "AtomicRequestState")
         .def("consume", [](ft::AtomicRequestState& s) { return s.exchange(nullptr); });
 
-    // data type
     {
         using namespace turbomind;
+        // data type
         py::enum_<ft::DataType>(m, "DataType")
             .value("TYPE_INVALID", kNull)
             .value("TYPE_BOOL", kBool)
@@ -467,6 +493,12 @@ PYBIND11_MODULE(_turbomind, m)
             .value("MEMORY_CPU", ft::DeviceType::kCPU)
             .value("MEMORY_CPU_PINNED", ft::DeviceType::kCPUpinned)
             .value("MEMORY_GPU", ft::DeviceType::kDEVICE);
+
+        // norm type
+        py::enum_<ft::NormType>(m, "NormType")
+            .value("NONE", ft::NormType::kNone)
+            .value("LAYER_NORM", ft::NormType::kLayerNorm)
+            .value("RMS_NORM", ft::NormType::kRMSNorm);
     }
 
     // DataFormat descriptors
@@ -508,8 +540,10 @@ PYBIND11_MODULE(_turbomind, m)
     bind_config<turbomind::core::DecoderLayerConfig>(m, "DecoderLayerConfig");
     bind_config<turbomind::core::ModelWeightConfig>(m, "ModelWeightConfig");
     bind_config<turbomind::core::LayerNormConfig>(m, "LayerNormConfig");
-    bind_config<turbomind::core::Qwen3_5VitConfig>(m, "Qwen3_5VitConfig");
-    bind_config<turbomind::core::Qwen3_5VitBlockConfig>(m, "Qwen3_5VitBlockConfig");
+    bind_config<turbomind::core::QwenVitConfig>(m, "QwenVitConfig");
+    bind_config<turbomind::core::QwenVitBlockConfig>(m, "QwenVitBlockConfig");
+    bind_config<turbomind::core::InternVitConfig>(m, "InternVitConfig");
+    bind_config<turbomind::core::InternVitBlockConfig>(m, "InternVitBlockConfig");
 
     // tensor
     py::class_<Tensor, std::shared_ptr<Tensor>>(m, "Tensor")