NVIDIA
diff --git a/‎tensorrt_llm/_torch/models/modeling_qwen3vl.py‎
Lines changed: 147 additions & 62 deletions b/‎tensorrt_llm/_torch/models/modeling_qwen3vl.py‎
Lines changed: 147 additions & 62 deletions
@@ -4,6 +4,8 @@
 
 import torch
 import torch.nn as nn
+import triton
+import triton.language as tl
 from PIL import Image
 from transformers import AutoProcessor, AutoTokenizer, PretrainedConfig, PreTrainedModel
 from transformers.activations import ACT2FN as HF_ACT2FN
@@ -609,6 +611,140 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
+# ---------------------------------------------------------------------------
+# Fused bilinear position-embedding interpolation for the Qwen3-VL vision
+# tower. Ported from vLLM
+# (vllm/model_executor/models/qwen3_vl.py @ 21eb2c33).
+#
+# ``fast_pos_embed_interpolate`` resamples the learned grid of positional
+# embeddings onto each (t, h, w) image grid using bilinear interpolation and
+# then reorders the spatial axis to match the spatial-merge layout used by
+# the rest of the vision tower. The Triton path fuses the
+# bilinear-interp + spatial-merge reorder into a single kernel so the
+# embedding gather, the 4 corner reads, and the permute are all one fused
+# pass instead of separate ops.
+# ---------------------------------------------------------------------------
+
+
+@triton.jit
+def _bilinear_pos_embed_kernel(
+    embed_ptr,
+    output_ptr,
+    H,
+    W,
+    h_scale,
+    w_scale,
+    NUM_GRID: tl.constexpr,
+    M_SIZE: tl.constexpr,
+    HIDDEN_DIM: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    """Fused bilinear pos-embed interpolation with spatial-merge reorder."""
+    pid = tl.program_id(0)
+    total_spatial = H * W
+    spatial_idx = pid % total_spatial
+
+    num_blocks_w = W // M_SIZE
+    block_idx = spatial_idx // (M_SIZE * M_SIZE)
+    local_idx = spatial_idx % (M_SIZE * M_SIZE)
+    br = block_idx // num_blocks_w
+    bc = block_idx % num_blocks_w
+    lr = local_idx // M_SIZE
+    lc = local_idx % M_SIZE
+    row = br * M_SIZE + lr
+    col = bc * M_SIZE + lc
+
+    h_frac = row.to(tl.float32) * h_scale
+    w_frac = col.to(tl.float32) * w_scale
+
+    hf = tl.math.floor(h_frac).to(tl.int32)
+    wf = tl.math.floor(w_frac).to(tl.int32)
+    hc = tl.minimum(hf + 1, NUM_GRID - 1)
+    wc = tl.minimum(wf + 1, NUM_GRID - 1)
+
+    dh = h_frac - hf.to(tl.float32)
+    dw = w_frac - wf.to(tl.float32)
+    w11 = dh * dw
+    w10 = dh - w11
+    w01 = dw - w11
+    w00 = 1.0 - dh - w01
+
+    off00 = (hf * NUM_GRID + wf) * HIDDEN_DIM
+    off01 = (hf * NUM_GRID + wc) * HIDDEN_DIM
+    off10 = (hc * NUM_GRID + wf) * HIDDEN_DIM
+    off11 = (hc * NUM_GRID + wc) * HIDDEN_DIM
+    out_off = pid * HIDDEN_DIM
+
+    # Cast weights to output dtype so the multiply-accumulate stays in
+    # the same precision as the reference PyTorch implementation used in
+    # the unit test.
+    out_dtype = output_ptr.dtype.element_ty
+    w00_c = w00.to(out_dtype)
+    w01_c = w01.to(out_dtype)
+    w10_c = w10.to(out_dtype)
+    w11_c = w11.to(out_dtype)
+
+    for d in tl.range(0, HIDDEN_DIM, BLOCK_D):
+        cols = d + tl.arange(0, BLOCK_D)
+        mask = cols < HIDDEN_DIM
+
+        e00 = tl.load(embed_ptr + off00 + cols, mask=mask)
+        e01 = tl.load(embed_ptr + off01 + cols, mask=mask)
+        e10 = tl.load(embed_ptr + off10 + cols, mask=mask)
+        e11 = tl.load(embed_ptr + off11 + cols, mask=mask)
+
+        val = w00_c * e00 + w01_c * e01 + w10_c * e10 + w11_c * e11
+
+        tl.store(output_ptr + out_off + cols, val, mask=mask)
+
+
+def _triton_pos_embed_interpolate(
+    embed_weight: torch.Tensor,
+    t: int,
+    h: int,
+    w: int,
+    num_grid_per_side: int,
+    m_size: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Launch the fused Triton kernel for one (t, h, w) grid.
+
+    Returns a tensor of shape ``(t * h * w, hidden_dim)`` with the
+    bilinearly-interpolated position embeddings already in spatial-merge
+    order.
+    """
+    assert h % m_size == 0 and w % m_size == 0, (
+        f"h={h} and w={w} must be divisible by m_size={m_size}"
+    )
+    hidden_dim = embed_weight.shape[1]
+    total_out = t * h * w
+    output = torch.empty(
+        total_out,
+        hidden_dim,
+        device=embed_weight.device,
+        dtype=dtype,
+    )
+
+    h_scale = float(num_grid_per_side - 1) / float(h - 1) if h > 1 else 0.0
+    w_scale = float(num_grid_per_side - 1) / float(w - 1) if w > 1 else 0.0
+
+    BLOCK_D = triton.next_power_of_2(hidden_dim)
+
+    _bilinear_pos_embed_kernel[(total_out,)](
+        embed_weight,
+        output,
+        h,
+        w,
+        h_scale,
+        w_scale,
+        num_grid_per_side,
+        m_size,
+        hidden_dim,
+        BLOCK_D,
+    )
+    return output
+
+
 class Qwen3VisionModel(torch.nn.Module):
     def __init__(self, model_config: ModelConfig[PretrainedConfig]):
         super().__init__()
@@ -698,70 +834,19 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
 
     # Adopted from https://github.com/vllm-project/vllm/blob/21eb2c3372fb6447ef36bee44ff7af79a330ffec/vllm/model_executor/models/qwen3_vl.py#L470)
     def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
+        embed_weight = self.pos_embed.weight
         num_grid_per_side = self.num_grid_per_side
         m_size = self.spatial_merge_size
-        hidden_dim = self.pos_embed.embedding_dim
-
-        outputs = []
-        for t, h, w in grid_thw:
-            h_idxs = torch.linspace(
-                0,
-                num_grid_per_side - 1,
-                h,
-                dtype=torch.float32,
-                device=self.pos_embed.weight.device,
-            )
-            w_idxs = torch.linspace(
-                0,
-                num_grid_per_side - 1,
-                w,
-                dtype=torch.float32,
-                device=self.pos_embed.weight.device,
-            )
-
-            h_floor = h_idxs.to(torch.long)
-            w_floor = w_idxs.to(torch.long)
-            h_ceil = torch.clamp(h_floor + 1, max=num_grid_per_side - 1)
-            w_ceil = torch.clamp(w_floor + 1, max=num_grid_per_side - 1)
-
-            dh = h_idxs - h_floor
-            dw = w_idxs - w_floor
-
-            # Create meshgrid view for all h, w vars
-            dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing="ij")
-            h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, w_floor, indexing="ij")
-            h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, w_ceil, indexing="ij")
-
-            # original computation of weights
-            # w00 = (1 - dh_grid) * (1 - dw_grid)
-            # w01 = (1 - dh_grid) * dw_grid
-            # w10 = dh_grid * (1 - dw_grid)
-            # w11 = dh_grid * dw_grid
-            # we reuse w11 here to avoid duplicate
-            # dh_grid * dw_grid computation
-            w11 = dh_grid * dw_grid
-            w10 = dh_grid - w11
-            w01 = dw_grid - w11
-            w00 = 1 - dh_grid - w01
-
-            h_grid = torch.stack([h_floor_grid, h_floor_grid, h_ceil_grid, h_ceil_grid])
-            w_grid = torch.stack([w_floor_grid, w_ceil_grid, w_floor_grid, w_ceil_grid])
-            h_grid_idx = h_grid * num_grid_per_side
-
-            indices = (h_grid_idx + w_grid).reshape(4, -1)
-            weights = torch.stack([w00, w01, w10, w11], dim=0).reshape(4, -1, 1)
-            weights = weights.to(dtype=self.pos_embed.weight.dtype)
-
-            embeds = self.pos_embed(indices)
-            embeds *= weights
-            combined = embeds.sum(dim=0)
-
-            combined = combined.reshape(h // m_size, m_size, w // m_size, m_size, hidden_dim)
-            combined = combined.permute(0, 2, 1, 3, 4).reshape(1, -1, hidden_dim)
-            repeated = combined.expand(t, -1, -1).reshape(-1, hidden_dim)
-            outputs.append(repeated)
-
-        return torch.cat(outputs, dim=0)
+        dtype = embed_weight.dtype
+        return torch.cat(
+            [
+                _triton_pos_embed_interpolate(
+                    embed_weight, t, h, w, num_grid_per_side, m_size, dtype
+                )
+                for t, h, w in grid_thw
+            ],
+            dim=0,
+        )
 
     def prepare_attn_metadata(self, seq_lens, attn_metadata: AttentionMetadata):
         # NOTE: The single prompt is divided into multiple seq_lens, so pretending have many batch_sizes.