[JAX] MoE: cherry-pick 3 independent fixes from jberchtold/te_ep_integration

tdophung · tdophung · commit fecb0ed837f2 · 2026-06-10T13:56:18.000-07:00
Pull three small, orthogonal correctness improvements from jberchtold's parallel work on teddy/te_ep_integration that don't touch the FFN shard_map or our dispatch zero-init workaround:

1. ``effective_align = max(align_size, 128)`` floor on the per-rank receive slots in ``moe.py``. NCCL EP requires each expert-major output block to be at least 128-token aligned; the previous ``if align_size > 0`` / ``else`` logic could round to a smaller alignment, or emit the unaligned natural block, on tiny configs and trip the dispatch buffer check. (df61642)

2. Size-1-axis guard in ``_ep_outer_axis()`` in both ``cpp_extensions/ep.py`` and ``ep.py``. A dp/fsdp axis that is sized 1 in the active mesh is now treated as absent so we don't pin EP-output specs to a degenerate axis that JAX may silently collapse. Mirrored the helper into ``ep.py`` so both files share the same predicate. (2210702)

3. ``_with_sharding_constraint_cast_bwd`` custom-VJP wrapper in ``moe.py``, applied to the inbound activation re-pin. Keeps the bwd cotangent in the primal dtype and re-asserts the same sharding on the bwd path, instead of letting a wider gradient land back at the caller. (2210702)

Deliberately deferred: jberchtold's shard_map removal and the new global-view FFN call sites in ``2210702a``'s ``moe.py`` rewrite. Those depend on the grouped-GEMM custom partitioning landing on main and are a later-phase integration sweep.
diff --git a/transformer_engine/jax/cpp_extensions/ep.py b/transformer_engine/jax/cpp_extensions/ep.py
@@ -24,7 +24,7 @@
 
 import transformer_engine_jax
 from .base import BasePrimitive, register_primitive
-from ..sharding import global_mesh_resource
+from ..sharding import global_mesh_resource, get_mesh_axis_size
 
 __all__ = [
     "EpConfig",
@@ -187,8 +187,15 @@ def _ep_outer_axis():
 
     When set, EP-output globals carry an extra leading ``dp_size`` dim so SPMD
     sees each DP color's slab as distinct (rather than replicated across DP).
+
+    A dp/fsdp axis that is sized 1 in the active mesh is treated as absent so
+    we don't pin EP-output specs to a degenerate axis that JAX may collapse.
     """
     gsr = global_mesh_resource()
+    if gsr.dp_resource is not None and get_mesh_axis_size(gsr.dp_resource) > 1:
+        return gsr.dp_resource
+    if gsr.fsdp_resource is not None and get_mesh_axis_size(gsr.fsdp_resource) > 1:
+        return gsr.fsdp_resource
-    return gsr.dp_resource or gsr.fsdp_resource
+    return None
 
 
@@ -536,7 +543,7 @@ def _resolve_out_partition_spec(out_partition_spec, num_leading):
             "ep_combine: ep_resource is not set on the active MeshResource;"
             " pass out_sharding=... explicitly."
         )
-    outer = gsr.dp_resource or gsr.fsdp_resource
+    outer = _ep_outer_axis()
     leading = (outer, gsr.ep_resource) if outer is not None else gsr.ep_resource
     return (leading,) + (None,) * num_leading
 
diff --git a/transformer_engine/jax/ep.py b/transformer_engine/jax/ep.py
@@ -229,14 +229,24 @@ def _dispatch_fwd(handle, topk_idx, tokens, topk_weights, recv_capacity_per_rank
     return primal, (handle_mem, out_leading, top_k)
 
 
+def _ep_outer_axis():
+    """Return the dp (else fsdp) mesh axis if its size is > 1, otherwise ``None``."""
+    gsr = global_mesh_resource()
+    # Size-1 axes count as absent, so there is deliberately no ``dp or fsdp`` fallback.
+    for axis in (gsr.dp_resource, gsr.fsdp_resource):
+        if axis is not None and get_mesh_axis_size(axis) > 1:
+            return axis
+    return None
+
+
 def _dispatch_bwd(handle, recv_capacity_per_rank, res, g_outputs):
     del recv_capacity_per_rank
     handle_mem, out_leading, top_k = res
     # Re-pin cotangent sharding: XLA transpose can drop the EP axis on a
     # single-fwd-output cotangent, landing a global tensor in the FFI.
     gsr = global_mesh_resource()
     ep_axis = gsr.ep_resource
-    outer = gsr.dp_resource or gsr.fsdp_resource
+    outer = _ep_outer_axis()
     leading = (outer, ep_axis) if outer is not None else ep_axis
     g_recv_tokens = jax.lax.with_sharding_constraint(
         g_outputs[0], jax.sharding.PartitionSpec(leading, None, None)
@@ -315,7 +325,7 @@ def _combine_bwd(handle, _num_local_tokens, _out_sharding, res, g_result):
         spec = jax.sharding.PartitionSpec(*_out_sharding)
     else:
         ep_axis = gsr.ep_resource
-        outer = gsr.dp_resource or gsr.fsdp_resource
+        outer = _ep_outer_axis()
         leading = (outer, ep_axis) if outer is not None and ep_axis is not None else ep_axis
         spec = (
             jax.sharding.PartitionSpec(leading, *([None] * (g_result.ndim - 1)))
diff --git a/transformer_engine/jax/moe.py b/transformer_engine/jax/moe.py
@@ -83,6 +83,31 @@ def _inspect(x: jnp.ndarray, name: str) -> jnp.ndarray:
 __all__ = ["moe"]
 
 
+def _with_sharding_constraint_cast_bwd(x: jnp.ndarray, sharding) -> jnp.ndarray:
+    """Pin ``x`` to ``sharding`` on both the forward and the backward pass.
+
+    Forward: ``jax.lax.with_sharding_constraint(x, sharding)``.
+    Backward: the incoming cotangent is cast to the dtype ``x`` had in the
+    forward pass and constrained to the same ``sharding``, so the gradient
+    handed back to the caller matches the primal in dtype and layout.
+    """
+
+    def _pin(value):
+        return jax.lax.with_sharding_constraint(value, sharding)
+
+    def _fwd(value):
+        # Residual: a 0-d array whose only job is to carry the primal dtype.
+        return _pin(value), jnp.zeros((), dtype=value.dtype)
+
+    def _bwd(dtype_carrier, cotangent):
+        # custom_vjp expects one cotangent per primal input, hence the 1-tuple.
+        return (_pin(cotangent.astype(dtype_carrier.dtype)),)
+
+    pinned = jax.custom_vjp(_pin)
+    pinned.defvjp(_fwd, _bwd)
+    return pinned(x)
+
+
 # =============================================================================
 # Process-level NCCL EP bootstrap (must run eagerly, outside jax.jit)
 # =============================================================================
@@ -631,10 +656,11 @@ def _moe_fwd_rule(
     # local expert. We must size to that worst case or NCCL EP's HT kernel
     # rejects the dispatch buffer with ``invalid argument``.
     natural_spe = num_ep * max_tokens_per_rank  # = (B // dp_size) * S
-    if align_size > 0:
-        slots_per_expert = ((natural_spe + align_size - 1) // align_size) * align_size
-    else:
-        slots_per_expert = natural_spe
+    # NCCL EP requires each expert-major output block to be at least
+    # 128-token aligned. Keep larger caller-requested alignments, but
+    # do not emit a smaller natural block size for tiny tests.
+    effective_align = max(int(align_size), 128)
+    slots_per_expert = ((natural_spe + effective_align - 1) // effective_align) * effective_align
     recv_pr = num_local_experts * slots_per_expert
 
     _te_ep_assert_compatible_bootstrap(
@@ -1406,7 +1432,7 @@ def moe(
             UserWarning,
             stacklevel=2,
         )
-    x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, expected_spec))
+    x = _with_sharding_constraint_cast_bwd(x, NamedSharding(mesh, expected_spec))
 
     # custom_vjp can't trace through None args; lower expert_bias to an
     # empty shape-(0,) tensor that fused_topk_with_score_function treats