Skip to content

Commit cac2edf

Browse files
committed
neo moe inference speedup
1 parent 8f8ed44 commit cac2edf

5 files changed

Lines changed: 44 additions & 43 deletions

File tree

lightllm/models/neo_chat_moe/infer_struct.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ def __init__(self):
2020
def init_some_extra_state(self, model: LlamaTpPartModel):
2121
LlamaInferStateInfo.init_some_extra_state(self, model)
2222
if self.is_prefill:
23+
self.b_image_token_tag = torch.zeros([self.position_ids.size(0)], dtype=torch.bool, device="cpu").cuda(
24+
non_blocking=True
25+
)
2326
self.position_ids = self.get_neo_position(self.multimodal_params)
2427
else:
2528
b_position_delta = [0 for _ in range(self.b_seq_len.shape[0])]
@@ -95,5 +98,6 @@ def get_neo_position(self, multimodal_params: List[dict]) -> torch.Tensor:
9598
b_ready_cache_len=self.b_ready_cache_len,
9699
b_q_seq_len=self.b_q_seq_len,
97100
b_start_loc=self.b_q_start_loc,
101+
b_image_token_tag=self.b_image_token_tag,
98102
)
99103
return position_ids

lightllm/models/neo_chat_moe/layer_infer/transformer_layer_infer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def _context_attention_kernel(
182182
infer_state.b_ready_cache_len,
183183
infer_state.max_q_seq_len,
184184
infer_state.req_manager.req_to_token_indexs,
185+
infer_state.b_image_token_tag,
185186
)
186187
o3 = o_tensor.view(-1, self.tp_q_head_num_, self.head_dim_ * 2)
187188
o3 = o3[:, :, : self.head_dim_].contiguous()

lightllm/models/neo_chat_moe/triton_kernel/context_attention_fwd_neo.py

Lines changed: 21 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@ def _fwd_kernel(
3434
stride_req_to_tokens_s,
3535
kv_group_num,
3636
b_prompt_cache_len,
37+
b_image_token_tag,
3738
H: tl.constexpr,
38-
BLOCK_DMODEL: tl.constexpr,
39+
QK_HEAD_DIM: tl.constexpr,
40+
V_HEAD_DIM: tl.constexpr,
3941
BLOCK_M: tl.constexpr,
4042
BLOCK_N: tl.constexpr,
4143
):
@@ -53,16 +55,19 @@ def _fwd_kernel(
5355
cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
5456

5557
block_start_loc = BLOCK_M * start_m
58+
if block_start_loc >= cur_batch_seq_len:
59+
return
5660

5761
offs_n = tl.arange(0, BLOCK_N)
58-
offs_d = tl.arange(0, BLOCK_DMODEL)
62+
offs_d_qk = tl.arange(0, QK_HEAD_DIM)
63+
offs_d_v = tl.arange(0, V_HEAD_DIM)
5964
offs_m = block_start_loc + tl.arange(0, BLOCK_M)
6065

6166
# Q pointers
6267
off_q = (
6368
(cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
6469
+ cur_head * stride_qh
65-
+ offs_d[None, :] * stride_qd
70+
+ offs_d_qk[None, :] * stride_qd
6671
)
6772

6873
q_valid = offs_m < cur_batch_seq_len
@@ -71,24 +76,14 @@ def _fwd_kernel(
7176
# online softmax state
7277
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
7378
l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
74-
acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
75-
76-
block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)
79+
acc = tl.zeros([BLOCK_M, V_HEAD_DIM], dtype=tl.float32)
7780
block_end_loc = total_len
7881

7982
# absolute q positions in the request
8083
q_pos = prompt_cache_len + offs_m # [M]
84+
q_image_token_tag = tl.load(b_image_token_tag + cur_batch_in_all_start_index + offs_m, mask=q_valid, other=False)
8185

82-
# q_gid from packed position_ids (aligned with Q rows)
83-
q_gid = tl.load(
84-
position_ids + cur_batch_in_all_start_index + offs_m,
85-
mask=q_valid,
86-
other=-2147483648,
87-
).to(tl.int32)
88-
89-
BIG = tl.full([BLOCK_N], 1000000000, tl.int32) # ensure != any normal gid
90-
91-
for start_n in range(0, block_mask * block_end_loc, BLOCK_N):
86+
for start_n in range(0, block_end_loc, BLOCK_N):
9287
start_n = tl.multiple_of(start_n, BLOCK_N)
9388

9489
k_pos = start_n + offs_n # [N]
@@ -102,32 +97,13 @@ def _fwd_kernel(
10297
).to(tl.int64)
10398

10499
# load K
105-
off_k = kv_loc[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd
100+
off_k = kv_loc[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d_qk[:, None] * stride_kd
106101
k = tl.load(K + off_k, mask=k_valid[None, :], other=0.0)
107-
108-
qk = tl.dot(q, k)
109-
110-
# k_gid:
111-
# - for cached keys (k_pos < prompt_cache_len): set to BIG + k_pos so equality is always false
112-
# - for new keys (k_pos >= prompt_cache_len): read from packed position_ids by (k_pos - prompt_cache_len)
113-
k_in_new = k_pos >= prompt_cache_len
114-
k_new_idx = (k_pos - prompt_cache_len).to(tl.int32) # [N] valid only when k_in_new
115-
k_gid_new = tl.load(
116-
position_ids + cur_batch_in_all_start_index + k_new_idx,
117-
mask=k_valid & k_in_new,
118-
other=-2147483647,
119-
).to(tl.int32)
120-
121-
k_gid = tl.where(
122-
k_in_new,
123-
k_gid_new,
124-
(k_pos.to(tl.int32) + BIG),
125-
)
102+
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
103+
qk += tl.dot(q, k)
126104

127105
# mask: causal OR same gid (only possible inside NEW part)
128-
mask = (q_pos[:, None] >= k_pos[None, :]) | (q_gid[:, None] == k_gid[None, :])
129-
mask = mask & q_valid[:, None] & k_valid[None, :]
130-
106+
mask = (q_pos[:, None] >= k_pos[None, :]) | q_image_token_tag[:, None]
131107
qk = tl.where(mask, qk * sm_scale, -1.0e8)
132108

133109
# online softmax
@@ -141,7 +117,7 @@ def _fwd_kernel(
141117
acc = acc * alpha[:, None]
142118

143119
# load V
144-
off_v = kv_loc[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd
120+
off_v = kv_loc[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d_v[None, :] * stride_vd
145121
v = tl.load(V + off_v, mask=k_valid[:, None], other=0.0)
146122

147123
p = p.to(v.dtype)
@@ -154,7 +130,7 @@ def _fwd_kernel(
154130
off_o = (
155131
(cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
156132
+ cur_head * stride_oh
157-
+ offs_d[None, :] * stride_od
133+
+ offs_d_v[None, :] * stride_od
158134
)
159135
tl.store(Out + off_o, acc, mask=q_valid[:, None])
160136

@@ -172,6 +148,7 @@ def context_attention_fwd_neo(
172148
b_prompt_cache_len,
173149
max_input_len,
174150
req_to_token_indexs,
151+
b_image_token_tag,
175152
):
176153
# minimal safety: position_ids must cover packed q rows
177154
assert position_ids.numel() >= q.shape[0], (position_ids.numel(), q.shape[0])
@@ -220,8 +197,10 @@ def context_attention_fwd_neo(
220197
req_to_token_indexs.stride(1),
221198
kv_group_num=kv_group_num,
222199
b_prompt_cache_len=b_prompt_cache_len,
200+
b_image_token_tag=b_image_token_tag,
223201
H=head,
224-
BLOCK_DMODEL=Lk,
202+
QK_HEAD_DIM=Lk,
203+
V_HEAD_DIM=Lk // 2,
225204
BLOCK_M=BLOCK_M,
226205
BLOCK_N=BLOCK_N,
227206
num_warps=num_warps,

lightllm/models/neo_chat_moe/triton_kernel/get_neo_position.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def _get_neo_position_triton(
1616
b_ready_cache_len: torch.Tensor,
1717
b_q_seq_len: torch.Tensor,
1818
b_start_loc: torch.Tensor,
19+
b_image_token_tag: torch.Tensor,
1920
BLOCK_SIZE: tl.constexpr,
2021
) -> torch.Tensor:
2122
cur_batch = tl.program_id(0)
@@ -36,6 +37,13 @@ def _get_neo_position_triton(
3637
t_pos = local_image_start_idx + off * 0
3738
h_pos = off // image_w
3839
w_pos = off % image_w
40+
tl.store(
41+
b_image_token_tag + off + image_start_idx,
42+
True,
43+
mask=(off < image_len)
44+
& (off + local_image_start_idx - cache_len < q_seq_len)
45+
& (local_image_start_idx - cache_len + off >= 0),
46+
)
3947
tl.store(
4048
position_ids + off + image_start_idx,
4149
t_pos,
@@ -87,6 +95,7 @@ def get_neo_position_triton(
8795
b_ready_cache_len: torch.Tensor,
8896
b_q_seq_len: torch.Tensor,
8997
b_start_loc: torch.Tensor,
98+
b_image_token_tag: torch.Tensor,
9099
) -> torch.Tensor:
91100

92101
batch_size = b_q_seq_len.shape[0]
@@ -105,6 +114,7 @@ def get_neo_position_triton(
105114
b_ready_cache_len=b_ready_cache_len,
106115
b_q_seq_len=b_q_seq_len,
107116
b_start_loc=b_start_loc,
117+
b_image_token_tag=b_image_token_tag,
108118
BLOCK_SIZE=BLOCK_SIZE,
109119
)
110120

@@ -121,6 +131,7 @@ def test():
121131
.expand(3, -1)
122132
.contiguous()
123133
)
134+
b_image_token_tag = torch.zeros([position_ids.size(1)], dtype=torch.bool, device="cuda")
124135
position_ids[1:].zero_()
125136
b_ready_cache_len = torch.tensor([0, 0], dtype=torch.int32, device="cuda")
126137
b_q_seq_len = torch.tensor([7, 13], dtype=torch.int32, device="cuda")
@@ -135,8 +146,10 @@ def test():
135146
b_ready_cache_len,
136147
b_q_seq_len,
137148
b_start_loc,
149+
b_image_token_tag,
138150
)
139151

152+
print(b_image_token_tag)
140153
print(position_ids)
141154
# old_value = torch.cat([position_ids[:, 2:7], position_ids[:, 7 + 2 :]], dim=1)
142155

@@ -172,3 +185,7 @@ def test():
172185
[0, 1, 0, 1, 2, 3, 4, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 6, 7, 8]],
173186
device='cuda:0', dtype=torch.int32)
174187
"""
188+
189+
190+
if __name__ == "__main__":
191+
test()

lightllm/models/neo_chat_moe/vision_process.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,6 @@ def load_image_native(image, patch_size=16, downsample_ratio=0.5, min_pixels=655
136136
)
137137
pixel_values, grid_hw = preprocess_pixel_values(transform(new_image).to(torch.float32), patch_size=patch_size)
138138

139-
print(f"Transfer image_size from ({image.height, image.width}) to ({new_image.height, new_image.width})")
139+
# print(f"Transfer image_size from ({image.height, image.width}) to ({new_image.height, new_image.width})")
140140

141141
return pixel_values, grid_hw

0 commit comments

Comments (0)