@@ -156,28 +156,10 @@ def new_deepep_group(
156156 self .ll_decode_num_tokens = decode_num_max_dispatch_tokens_per_rank
157157 self .ll_hidden = hidden_size
158158 self .ll_num_experts = n_routed_experts + get_redundancy_expert_num () * global_world_size
159- self .ep_buffer = deep_ep .ElasticBuffer (
160- deepep_group ,
161- num_max_tokens_per_rank = self .ll_num_tokens ,
162- hidden = self .ll_hidden ,
163- num_topk = num_experts_per_tok ,
164- use_fp8_dispatch = True ,
165- allow_multiple_reduction = False ,
166- )
167- self .ep_mega_moe_buffer = None
168159 self .ep_low_latency_buffer = None
169- if not is_sm100_gpu ():
170- num_rdma_bytes = deep_ep .Buffer .get_low_latency_rdma_size_hint (
171- self .ll_decode_num_tokens , self .ll_hidden , global_world_size , self .ll_num_experts
172- )
173- self .ep_low_latency_buffer = deep_ep .Buffer (
174- deepep_group ,
175- int (1e9 ),
176- num_rdma_bytes ,
177- low_latency_mode = True ,
178- num_qps_per_rank = (self .ll_num_experts // global_world_size ),
179- )
180- else :
160+ self .ep_mega_moe_buffer = None
161+ if is_sm100_gpu ():
162+ self .ep_buffer = None
181163 if moe_intermediate_size is None :
182164 raise ValueError ("SM100 Mega MoE requires moe_intermediate_size or intermediate_size in model config" )
183165
@@ -191,6 +173,28 @@ def new_deepep_group(
191173 self .ll_hidden ,
192174 moe_intermediate_size ,
193175 )
176+ self ._set_num_sms_for_deep_gemm (0 )
177+ logger .info ("SM100 detected: skip DeepEP ElasticBuffer init and use Mega MoE buffer only." )
178+ return
179+
180+ self .ep_buffer = deep_ep .ElasticBuffer (
181+ deepep_group ,
182+ num_max_tokens_per_rank = self .ll_num_tokens ,
183+ hidden = self .ll_hidden ,
184+ num_topk = num_experts_per_tok ,
185+ use_fp8_dispatch = True ,
186+ allow_multiple_reduction = False ,
187+ )
188+ num_rdma_bytes = deep_ep .Buffer .get_low_latency_rdma_size_hint (
189+ self .ll_decode_num_tokens , self .ll_hidden , global_world_size , self .ll_num_experts
190+ )
191+ self .ep_low_latency_buffer = deep_ep .Buffer (
192+ deepep_group ,
193+ int (1e9 ),
194+ num_rdma_bytes ,
195+ low_latency_mode = True ,
196+ num_qps_per_rank = (self .ll_num_experts // global_world_size ),
197+ )
194198 theoretical_sms = self .ep_buffer .get_theoretical_num_sms (self .ll_num_experts , num_experts_per_tok )
195199 self ._set_num_sms_for_deep_gemm (theoretical_sms )
196200
0 commit comments