Commit 50eefcf

houqiKnowingNothing authored and committed
refactor: allreduce refactor to support more
* fix ci
* fix ci
* fix format
* fix ci amd
* fix ci
* fix ci
* fix e2e
* fix ci
* fix format
* fix ci
* fix test_e2e part
* use sleep async
* use dummy dtype for all-reduce workspace
* launch with cooperative grids if use barrier_on_this_grid
* fuse barrier all into triton kernel
* fix allreduce_one_shot_push_intra_node
* allreduce straggler with <10ms
* remove allreduce with signal_stages
* remove some not used layer buffer
* refactor: some rename

See merge request: !191
1 parent 316eabd, commit 50eefcf

12 files changed: 127 additions & 198 deletions
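The center of the refactor is the allreduce context API: the workspace is now sized in raw bytes instead of element count plus dtype ("use dummy dtype for all-reduce workspace"), the reduction method moves from context creation to the all_reduce call, signal_stages is removed, and the persistent ar_output buffer gives way to a per-call output tensor. A minimal before/after sketch reconstructed from the call sites in the diffs below; the module that exports create_allreduce_ctx and all_reduce is not shown in this commit, so that import is an assumption:

import torch
from triton_dist.kernels.allreduce import AllReduceMethod
# import path below is assumed; this commit only shows the call sites
from triton_dist.kernels.nvidia import create_allreduce_ctx, all_reduce

max_M, N = 128, 4096            # illustrative decode-batch rows and hidden size
rank, world_size = 0, 8         # per-process values in a real torchrun launch
x = torch.randn(max_M, N, dtype=torch.bfloat16, device="cuda")

# Before: workspace sized by element count + dtype, method fixed at creation.
#   ctx = create_allreduce_ctx(numel=max_M * N, dtype=torch.bfloat16, rank=rank,
#                              world_size=world_size, local_world_size=world_size,
#                              method=method, signal_stages=1)
# After: dtype-agnostic byte-sized workspace, method chosen per call,
# output allocated by the caller on every invocation.
ctx = create_allreduce_ctx(workspace_nbytes=max_M * N * torch.bfloat16.itemsize,
                           rank=rank, world_size=world_size, local_world_size=world_size)
out = all_reduce(x=x.contiguous(), output=torch.empty_like(x),
                 method=AllReduceMethod.TwoShot_Multimem, ctx=ctx)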

python/triton_dist/layers/amd/tp_attn.py

Lines changed: 4 additions & 4 deletions
@@ -207,16 +207,16 @@ def _init_parameters(self, self_attn: nn.Module, verbose=False):
         if verbose:
             print(f"[RANK {self.rank}] Attn initialized with parameters: qkv ({self.wqkv.shape}, o ({self.wo.shape}))")

-    def _init_ctx(self, max_M, gemm_stream, ag_intranode_stream, BLOCK_M, BLOCK_N, BLOCK_K, stages, serial=False,
+    def _init_ctx(self, max_M, ag_intranode_stream, BLOCK_M, BLOCK_N, BLOCK_K, stages, serial=False,
                   ag_internode_stream=None):
         if serial:
             print(f"[RANK {self.rank}] Using serial mode for AG-GEMM.")
         self.ag_ctx = create_ag_gemm_intra_node_context(max_M=max_M, N=self.ag_N_per_rank, K=self.K, rank=self.rank,
                                                         num_ranks=self.world_size, input_dtype=self.dtype,
                                                         output_dtype=self.dtype, tp_group=self.group,
-                                                        ag_streams=ag_intranode_stream, gemm_stream=gemm_stream,
-                                                        serial=serial, autotune=True, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
-                                                        BLOCK_K=BLOCK_K, stages=stages, M_PER_CHUNK=256)
+                                                        ag_streams=ag_intranode_stream, serial=serial, autotune=True,
+                                                        BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K,
+                                                        stages=stages, M_PER_CHUNK=256)
         self.rs_ctx = create_gemm_rs_intra_node_context(
             max_M=max_M,
             N=self.K,

python/triton_dist/layers/amd/tp_mlp.py

Lines changed: 4 additions & 4 deletions
@@ -86,17 +86,17 @@ def _init_parameters(self, mlp: nn.Module, verbose=False):
             f"[RANK {self.rank}] MLP initialized with parameters: gate_up_proj shape: {self.gate_up_proj.shape}, down_proj shape: {self.down_proj.shape}"
         )

-    def _init_ctx(self, max_M, gemm_stream, ag_intranode_stream, BLOCK_M, BLOCK_N, BLOCK_K, stages, serial=False,
+    def _init_ctx(self, max_M, ag_intranode_stream, BLOCK_M, BLOCK_N, BLOCK_K, stages, serial=False,
                   ag_internode_stream=None):
         """Initializes contexts for triton_dist AllGather-GEMM and GEMM-ReduceScatter operations."""
         if serial:
             print(f"[RANK {self.rank}] Using serial mode for AG-GEMM.")
         self.ag_ctx = create_ag_gemm_intra_node_context(max_M=max_M, N=self.ag_N_per_rank, K=self.K, rank=self.rank,
                                                         num_ranks=self.world_size, input_dtype=self.dtype,
                                                         output_dtype=self.dtype, tp_group=self.group,
-                                                        ag_streams=ag_intranode_stream, gemm_stream=gemm_stream,
-                                                        serial=serial, autotune=True, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
-                                                        BLOCK_K=BLOCK_K, stages=stages, M_PER_CHUNK=256)
+                                                        ag_streams=ag_intranode_stream, serial=serial, autotune=True,
+                                                        BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K,
+                                                        stages=stages, M_PER_CHUNK=256)
         self.rs_ctx = create_gemm_rs_intra_node_context(
             max_M=max_M,
             N=self.K,
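The only change in both AMD layers is dropping gemm_stream from _init_ctx and from the create_ag_gemm_intra_node_context call; the GEMM presumably now runs on the caller's current CUDA stream. A hypothetical call site after the change (stream setup and block sizes are illustrative, not from this commit):

ag_stream = torch.cuda.Stream(priority=-1)   # intra-node all-gather stream
mlp._init_ctx(max_M=4096, ag_intranode_stream=ag_stream,
              BLOCK_M=128, BLOCK_N=128, BLOCK_K=64, stages=3, serial=False)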

python/triton_dist/layers/nvidia/tp_attn.py

Lines changed: 12 additions & 24 deletions
@@ -28,6 +28,7 @@
 import torch.distributed
 import flashinfer

+from triton_dist.kernels.allreduce import AllReduceMethod
 from triton_dist.kernels.nvidia.allgather_gemm import AllGatherGEMMTensorParallelContext, get_auto_all_gather_method, ag_gemm
 from triton_dist.kernels.nvidia import create_gemm_rs_context, gemm_rs
 from triton_dist.utils import nvshmem_barrier_all_on_stream
@@ -91,7 +92,7 @@ def __init__(self, rank=0, world_size=8, group=None):
         self.wo = None
         self.ag_ctx = None
         self.rs_ctx = None
-        self.ctx = None
+        self.ar_ctx = None

     def _init_parameters(self, self_attn: nn.Module, verbose=False):
         self.q_size = self_attn.q_proj.weight.shape[0] // self.world_size
@@ -117,13 +118,11 @@ def _init_parameters(self, self_attn: nn.Module, verbose=False):
         if verbose:
             print(f"[RANK {self.rank}] Attn initialized with parameters: qkv ({self.wqkv.shape}, o ({self.wo.shape}))")

-    def _init_ctx(self, max_M, gemm_stream, ag_intranode_stream, ag_internode_stream, BLOCK_M, BLOCK_N, BLOCK_K,
-                  stages):
+    def _init_ctx(self, max_M, ag_intranode_stream, ag_internode_stream, BLOCK_M, BLOCK_N, BLOCK_K, stages):
         self.ag_ctx = AllGatherGEMMTensorParallelContext(
             N_per_rank=self.ag_N_per_rank, K=self.K, tensor_dtype=self.dtype, rank=self.rank, num_ranks=self.world_size,
-            num_local_ranks=self.world_size, max_M=max_M, gemm_stream=gemm_stream,
-            ag_intranode_stream=ag_intranode_stream, ag_internode_stream=ag_internode_stream, BLOCK_M=BLOCK_M,
-            BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, stages=stages,
+            num_local_ranks=self.world_size, max_M=max_M, ag_intranode_stream=ag_intranode_stream,
+            ag_internode_stream=ag_internode_stream, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, stages=stages,
             all_gather_method=get_auto_all_gather_method(self.world_size, self.world_size))
         self.rs_ctx = create_gemm_rs_context(
             max_M=max_M,
@@ -141,27 +140,20 @@ def _init_ctx(self, max_M, gemm_stream, ag_intranode_stream, ag_internode_stream
         nvshmem_barrier_all_on_stream(torch.cuda.current_stream())
         torch.cuda.synchronize()

-    def _init_AR_ctx(self, max_M, method, dtype=torch.bfloat16, signal_stages=1):
+    def _init_AR_ctx(self, max_M, method: AllReduceMethod, dtype=torch.bfloat16):
         self.ar_method = method
-        input_tensor = torch.empty([max_M, self.K], dtype=dtype, device="meta")
-        self.ctx = create_allreduce_ctx(
-            numel=max_M * self.K,
-            dtype=dtype,
-            rank=self.rank,
-            world_size=self.world_size,
+        self.ar_ctx = create_allreduce_ctx(
+            workspace_nbytes=max_M * self.K * dtype.itemsize, rank=self.rank, world_size=self.world_size,
             local_world_size=self.world_size,  # TODO(houqi.1993) does not support multiple nodes now.
-            method=method,
-            signal_stages=signal_stages,
         )
-        self.ar_output = torch.empty_like(input_tensor, device="cuda", dtype=dtype).contiguous()

     def finalize(self):
         if self.ag_ctx:
             self.ag_ctx.finailize()
         if self.rs_ctx:
             self.rs_ctx.finalize()
-        if self.ctx:
-            self.ctx.finalize()
+        if self.ar_ctx:
+            self.ar_ctx.finalize()

     @torch.inference_mode()
     def apply_rotary_pos_emb(self, q: torch.Tensor, k: torch.Tensor, position_ids: torch.Tensor,
@@ -274,12 +266,8 @@ def dist_triton_AR_fwd(self, x, position_ids, cos_sin_cache, kv_cache, layer_idx

         out = torch.nn.functional.linear(out.view(bsz, q_len, -1), self.wo).view(bsz * q_len, -1)
         if self.world_size > 1:
-            out = all_reduce(
-                input=out.contiguous(),
-                output=self.ar_output,
-                method=self.ar_method,
-                ctx=self.ctx,
-            )
+            out_allreduce = torch.empty_like(out)
+            out = all_reduce(x=out.contiguous(), output=out_allreduce, method=self.ar_method, ctx=self.ar_ctx)
         return out.view(bsz, q_len, -1)

     def fwd(self, x: torch.Tensor, position_ids: torch.Tensor, cos_sin_cache: torch.Tensor, kv_cache, layer_idx: int):
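A quick way to sanity-check the refactored attention path is to compare the triton_dist call against the stock collective. A sketch, assuming torch.distributed is initialized and ar_ctx was built by _init_AR_ctx above; the tolerances are a loose bf16 guess:

ref = out.clone()
torch.distributed.all_reduce(ref, group=group)        # NCCL reference sum
got = all_reduce(x=out.contiguous(), output=torch.empty_like(out),
                 method=AllReduceMethod.TwoShot_Multimem, ctx=ar_ctx)
torch.testing.assert_close(got, ref, rtol=1e-2, atol=1e-2)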

python/triton_dist/layers/nvidia/tp_mlp.py

Lines changed: 27 additions & 43 deletions
@@ -27,6 +27,7 @@
 from torch import nn
 import torch.distributed

+from triton_dist.kernels.allreduce import AllReduceMethod
 from triton_dist.kernels.nvidia.allgather_gemm import AllGatherGEMMTensorParallelContext, get_auto_all_gather_method, ag_gemm
 from triton_dist.kernels.nvidia import create_gemm_rs_context, gemm_rs
 from triton_dist.utils import nvshmem_barrier_all_on_stream
@@ -64,7 +65,7 @@ def __init__(self, rank=0, world_size=8, group=None):
         self.down_proj = None
         self.ag_ctx = None
         self.rs_ctx = None
-        self.ctx = None
+        self.ar_ctx = None

     def _init_parameters(self, mlp: nn.Module, verbose=False):
         """
@@ -91,14 +92,13 @@ def _init_parameters(self, mlp: nn.Module, verbose=False):
             f"[RANK {self.rank}] MLP initialized with parameters: gate_up_proj shape: {self.gate_up_proj.shape}, down_proj shape: {self.down_proj.shape}"
         )

-    def _init_ctx(self, max_M, gemm_stream, ag_intranode_stream, ag_internode_stream, BLOCK_M, BLOCK_N, BLOCK_K,
-                  stages):
+    def _init_ctx(self, max_M, ag_intranode_stream, ag_internode_stream, BLOCK_M, BLOCK_N, BLOCK_K, stages):
+        # TODO(houqi.1993) BLOCK_SIZE should not be part of arguments, but be determined on forward.
         """Initializes contexts for triton_dist AllGather-GEMM and GEMM-ReduceScatter operations."""
         self.ag_ctx = AllGatherGEMMTensorParallelContext(
             N_per_rank=self.ag_N_per_rank, K=self.K, tensor_dtype=self.dtype, rank=self.rank, num_ranks=self.world_size,
-            num_local_ranks=self.world_size, max_M=max_M, gemm_stream=gemm_stream,
-            ag_intranode_stream=ag_intranode_stream, ag_internode_stream=ag_internode_stream, BLOCK_M=BLOCK_M,
-            BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, stages=stages,
+            num_local_ranks=self.world_size, max_M=max_M, ag_intranode_stream=ag_intranode_stream,
+            ag_internode_stream=ag_internode_stream, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, stages=stages,
             all_gather_method=get_auto_all_gather_method(self.world_size, self.world_size))
         self.rs_ctx = create_gemm_rs_context(
             max_M=max_M,
@@ -121,8 +121,8 @@ def finalize(self):
             self.ag_ctx.finailize()
         if self.rs_ctx:
             self.rs_ctx.finalize()
-        if self.ctx:
-            self.ctx.finalize()
+        if self.ar_ctx:
+            self.ar_ctx.finalize()

     @torch.inference_mode()
     def torch_fwd(self, x):
@@ -140,7 +140,7 @@ def torch_fwd(self, x):
         return out

     @torch.inference_mode()
-    def dist_triton_fwd(self, x, ag_gemm_persistent=False, gemm_rs_persistent=False, autotune=True):
+    def dist_triton_fwd(self, x: torch.Tensor, ag_gemm_persistent=False, gemm_rs_persistent=False, autotune=True):
         """
         triton_dist forward pass for TP.
         This version uses ag_gemm and gemm_rs.
@@ -165,22 +165,16 @@ def dist_triton_fwd(self, x, ag_gemm_persistent=False, gemm_rs_persistent=False,
         out = out.view(bsz, seq, -1)
         return out

-    def _init_AR_ctx(self, M, method, dtype=torch.bfloat16, signal_stages=1):
+    def _init_AR_ctx(self, max_M, method: AllReduceMethod, dtype=torch.bfloat16):
         self.ar_method = method
         N = self.down_proj.shape[0]
-        self.ctx = create_allreduce_ctx(
-            numel=M * N,
-            dtype=dtype,
-            rank=self.rank,
-            world_size=self.world_size,
+        self.ar_ctx = create_allreduce_ctx(
+            workspace_nbytes=max_M * N * dtype.itemsize, rank=self.rank, world_size=self.world_size,
             local_world_size=self.world_size,  # TODO(houqi.1993) does not support multiple nodes now.
-            method=method,
-            signal_stages=signal_stages,
         )
-        self.ar_output = torch.empty((M, N), device="cuda", dtype=dtype).contiguous()

     @torch.inference_mode()
-    def dist_triton_AR_fwd(self, x):
+    def dist_triton_AR_fwd(self, x: torch.Tensor):
         """
         triton_dist AR forward pass for TP.
         This version uses gemm + gemm + AllReduce
@@ -189,9 +183,11 @@ def dist_triton_AR_fwd(self, x):
         out_fused = torch.nn.functional.linear(x, self.gate_up_proj)
         wg, w1 = torch.chunk(out_fused, 2, dim=-1)
         out = self.act_fn(wg) * w1
-        out = torch.nn.functional.linear(out, self.down_proj).view_as(self.ar_output)
+        out = torch.nn.functional.linear(out, self.down_proj).view_as(x)
         if self.world_size > 1:
-            out = all_reduce(out.contiguous(), self.ar_output, method=self.ar_method, ctx=self.ctx)
+            out_ar = torch.empty_like(out)
+            assert self.ar_ctx is not None, "AllReduce context is not initialized."
+            out = all_reduce(out.contiguous(), out_ar, method=self.ar_method, ctx=self.ar_ctx)
         return out.view_as(x)

     @torch.inference_mode()
@@ -203,20 +199,13 @@ def torch_ag_gemm(self, x: torch.Tensor):
         """
         Reference PyTorch forward pass using AllGather-GEMM.
         """
-
         M_per_rank, K = x.shape
         M = M_per_rank * self.world_size
-
-        if not hasattr(self, 'ag_buffer'):
-            self.ag_buffer = torch.empty([M, K], dtype=x.dtype, device="cuda")
-
+        ag_buffer = torch.empty([M, K], dtype=x.dtype, device="cuda")
         # ag
-        torch.distributed.all_gather_into_tensor(self.ag_buffer, x, group=self.group)
-
+        torch.distributed.all_gather_into_tensor(ag_buffer, x, group=self.group)
         # gemm
-        golden = torch.matmul(self.ag_buffer, self.gate_up_proj.T)
-
-        return golden
+        return torch.matmul(ag_buffer, self.gate_up_proj.T)

     @torch.inference_mode()
     def dist_triton_ag_gemm(self, x: torch.Tensor, persistent=True, autotune=False):
@@ -225,8 +214,8 @@ def dist_triton_ag_gemm(self, x: torch.Tensor, persistent=True, autotune=False):
         This version uses ag_gemm.
         x: input tensor, shape [batch_size * seq_len, hidden_size]
         """
-        out = ag_gemm(x, self.gate_up_proj, ctx=self.ag_ctx, persistent=persistent, autotune=autotune)
-        return out
+        assert self.ag_ctx is not None
+        return ag_gemm(x, self.gate_up_proj, ctx=self.ag_ctx, persistent=persistent, autotune=autotune)

     @torch.inference_mode()
     def torch_gemm_rs(self, x: torch.Tensor):
@@ -235,16 +224,11 @@ def torch_gemm_rs(self, x: torch.Tensor):
         """
         # x: [M, K]
         M, K = x.shape
-        if not hasattr(self, 'rs_buffer'):
-            self.rs_buffer = torch.empty([M // self.world_size, self.down_proj.shape[0]], dtype=x.dtype, device="cuda")
-
+        rs_buffer = torch.empty([M // self.world_size, self.down_proj.shape[0]], dtype=x.dtype, device="cuda")
         # gemm
         gemm_out = torch.matmul(x, self.down_proj.T)
-
-        # rs
-        torch.distributed.reduce_scatter_tensor(self.rs_buffer, gemm_out, group=self.group)
-
-        return self.rs_buffer
+        torch.distributed.reduce_scatter_tensor(rs_buffer, gemm_out, group=self.group)
+        return rs_buffer

     @torch.inference_mode()
     def dist_triton_gemm_rs(self, x: torch.Tensor, persistent=False):
@@ -253,5 +237,5 @@ def dist_triton_gemm_rs(self, x: torch.Tensor, persistent=False):
         This version uses gemm_rs.
         x: input tensor, shape [batch_size * seq_len, hidden_size]
         """
-        out = gemm_rs(x, self.down_proj, self.rs_ctx, persistent=persistent, fuse_scatter=True)
-        return out
+        assert self.rs_ctx is not None
+        return gemm_rs(x, self.down_proj, self.rs_ctx, persistent=persistent, fuse_scatter=True)
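For orientation, the semantics dist_triton_AR_fwd implements can be written with stock PyTorch collectives. A reference sketch; the weight shards, the process group, and the SiLU activation are assumptions mirroring torch_fwd above:

# Each rank holds a column shard of gate_up_proj and a row shard of down_proj.
h = torch.nn.functional.linear(x, gate_up_proj_shard)        # [M, 2 * N_per_rank]
wg, w1 = torch.chunk(h, 2, dim=-1)
act = torch.nn.functional.silu(wg) * w1                      # act_fn assumed to be SiLU
partial = torch.nn.functional.linear(act, down_proj_shard)   # partial sums, [M, K]
torch.distributed.all_reduce(partial, group=group)           # sum partials across ranks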

python/triton_dist/models/engine.py

Lines changed: 2 additions & 2 deletions
@@ -28,6 +28,7 @@
 from tqdm import tqdm
 from datetime import datetime

+from triton_dist.kernels.allreduce import AllReduceMethod
 from triton_dist.models.kv_cache import KV_Cache
 from triton_dist.models import AutoLLM, AutoTokenizer, ModelConfig
 from triton_dist.models.utils import logger, sample_token
@@ -128,7 +129,7 @@ def serve(self, input_ids: torch.Tensor, gen_len: int):
             self.model.init_triton_dist_ctx(max_M=bsz)
         elif self.backend == 'triton_dist_AR':
             self.model.set_fwd(mode='triton_dist_AR')
-            self.model.init_triton_dist_AR_ctx(max_M=bsz, ar_method='two_shot_ld_reduce')
+            self.model.init_triton_dist_AR_ctx(max_M=bsz, ar_method=AllReduceMethod.TwoShot_Multimem)

         if self.no_graph:
@@ -184,4 +185,3 @@ def run(input_ids, position_ids):
         print(self.tokenizer.batch_decode(output_ids, skip_special_tokens=True))

         del self.model_launch
-        torch.distributed.destroy_process_group()
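The backend used to select the method by string ('two_shot_ld_reduce'); with the enum, a thin mapping keeps any string-facing CLI or config working. A hypothetical helper, using only the two enum members visible in this commit:

_AR_METHODS = {
    "two_shot_multimem": AllReduceMethod.TwoShot_Multimem,
    "double_tree": AllReduceMethod.DoubleTree,
}

def parse_ar_method(name: str) -> AllReduceMethod:
    """Map a CLI/config string onto an AllReduceMethod member."""
    try:
        return _AR_METHODS[name.lower()]
    except KeyError:
        raise ValueError(f"unknown allreduce method {name!r}; expected one of {sorted(_AR_METHODS)}")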

python/triton_dist/models/qwen.py

Lines changed: 8 additions & 11 deletions
@@ -30,6 +30,8 @@
 from transformers import Qwen3ForCausalLM, Qwen3Config
 from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer

+from triton_dist.kernels.allreduce import AllReduceMethod
+
 if not torch.cuda.is_available():
     raise RuntimeError("CUDA is not available. Please ensure you have a compatible GPU and CUDA installed.")
 try:
@@ -173,14 +175,11 @@ def init_triton_dist_ctx(self, max_M: int = 4096):
             self.ag_intranode_stream = [torch.cuda.Stream(priority=-1) for i in range(self.world_size)]
         else:
             raise RuntimeError(f"Unsupported platform: {PLATFORM}. Supported platforms are 'nvidia' and 'amd'.")
-        self.gemm_stream = torch.cuda.Stream()
         self.ag_internode_stream = torch.cuda.Stream()
-        self.layers[0].attn._init_ctx(max_M=max_M, gemm_stream=self.gemm_stream,
-                                      ag_intranode_stream=self.ag_intranode_stream,
+        self.layers[0].attn._init_ctx(max_M=max_M, ag_intranode_stream=self.ag_intranode_stream,
                                       ag_internode_stream=self.ag_internode_stream, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
                                       BLOCK_K=BLOCK_K, stages=stages)
-        self.layers[0].mlp._init_ctx(max_M=max_M, gemm_stream=self.gemm_stream,
-                                     ag_intranode_stream=self.ag_intranode_stream,
+        self.layers[0].mlp._init_ctx(max_M=max_M, ag_intranode_stream=self.ag_intranode_stream,
                                      ag_internode_stream=self.ag_internode_stream, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
                                      BLOCK_K=BLOCK_K, stages=stages)
         for layer in self.layers[1:]:
@@ -191,17 +190,15 @@ def init_triton_dist_ctx(self, max_M: int = 4096):

         self.use_ar = False

-    def init_triton_dist_AR_ctx(self, max_M: int = 128, ar_method: str = 'two_shot_ld_reduce'):
+    def init_triton_dist_AR_ctx(self, max_M: int = 128, ar_method: AllReduceMethod = AllReduceMethod.DoubleTree):
         self.layers[0].attn._init_AR_ctx(max_M=max_M, method=ar_method, dtype=self.dtype)
-        self.layers[0].mlp._init_AR_ctx(M=max_M, method=ar_method, dtype=self.dtype)
+        self.layers[0].mlp._init_AR_ctx(max_M=max_M, method=ar_method, dtype=self.dtype)

         for layer in self.layers[1:]:
-            layer.attn.ctx = self.layers[0].attn.ctx
+            layer.attn.ar_ctx = self.layers[0].attn.ar_ctx
             layer.attn.ar_method = self.layers[0].attn.ar_method
-            layer.attn.ar_output = self.layers[0].attn.ar_output
-            layer.mlp.ctx = self.layers[0].mlp.ctx
+            layer.mlp.ar_ctx = self.layers[0].mlp.ar_ctx
             layer.mlp.ar_method = self.layers[0].mlp.ar_method
-            layer.mlp.ar_output = self.layers[0].mlp.ar_output
         self.use_ar = True

     def finalize(self):
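Sharing layer 0's ar_ctx with every later layer works because the workspace is sized for the worst case (max_M rows of the largest allreduced activation); any tensor whose byte payload fits in workspace_nbytes can reuse it. A hypothetical check that spells out the invariant (not part of this commit):

def fits_workspace(t: torch.Tensor, workspace_nbytes: int) -> bool:
    # A tensor can go through a shared allreduce workspace iff its payload fits.
    return t.numel() * t.element_size() <= workspace_nbytes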
