[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit 0ff3bffd7d89 · 2026-06-10T21:03:48.000Z
For more information, see https://pre-commit.ci.
diff --git a/tests/jax/test_multi_process_ep.py b/tests/jax/test_multi_process_ep.py
@@ -245,11 +245,13 @@ def test_two_layer_dispatch_no_handle_aliasing(self):
             w = jax.lax.with_sharding_constraint(topk_w, NamedSharding(self.mesh, dp_spec))
 
             def one_layer(hk, idx, toks, w_):
-                recv_t, recv_w, hm, tc = ep_dispatch(
-                    hk, idx, toks, w_, self.recv_capacity_per_rank
+                recv_t, recv_w, hm, tc = ep_dispatch(hk, idx, toks, w_, self.recv_capacity_per_rank)
+                recv_t = jax.lax.with_sharding_constraint(
+                    recv_t, NamedSharding(self.mesh, ep_spec_3d)
+                )
+                recv_w = jax.lax.with_sharding_constraint(
+                    recv_w, NamedSharding(self.mesh, ep_spec_2d)
                 )
-                recv_t = jax.lax.with_sharding_constraint(recv_t, NamedSharding(self.mesh, ep_spec_3d))
-                recv_w = jax.lax.with_sharding_constraint(recv_w, NamedSharding(self.mesh, ep_spec_2d))
                 return ep_combine(
                     hk, hm, tc, recv_t, recv_w, T_global, out_sharding=(("dp", "ep"), None)
                 )
@@ -269,12 +271,14 @@ def run(idx, ta_, tb_, w_):
             np.testing.assert_allclose(
                 np.asarray(out_a_g.astype(jnp.float32)),
                 np.asarray(tokens.astype(jnp.float32)),
-                atol=5e-2, rtol=5e-2,
+                atol=5e-2,
+                rtol=5e-2,
             )
             np.testing.assert_allclose(
                 np.asarray(out_b_g.astype(jnp.float32)),
                 np.asarray(tokens_b.astype(jnp.float32)),
-                atol=5e-2, rtol=5e-2,
+                atol=5e-2,
+                rtol=5e-2,
             )
 
     def test_primitive_prepare(self):
@@ -328,7 +332,10 @@ def run(idx, toks, w):
                     weighted, NamedSharding(self.mesh, ep_spec_3d)
                 )
                 out = ep_combine_fwd(
-                    self.hk, hm, weighted, T_global,
+                    self.hk,
+                    hm,
+                    weighted,
+                    T_global,
                     out_partition_spec=(("dp", "ep"), None),
                 )
                 return jax.lax.with_sharding_constraint(out, NamedSharding(self.mesh, dp_spec))
@@ -372,7 +379,9 @@ def loss_fn(toks):
                 toks = jax.lax.with_sharding_constraint(toks, NamedSharding(self.mesh, dp_spec))
                 idx = jax.lax.with_sharding_constraint(topk_idx, NamedSharding(self.mesh, dp_spec))
                 w = jax.lax.with_sharding_constraint(topk_w, NamedSharding(self.mesh, dp_spec))
-                recv_t, recv_w, hm, tc = ep_dispatch(self.hk, idx, toks, w, self.recv_capacity_per_rank)
+                recv_t, recv_w, hm, tc = ep_dispatch(
+                    self.hk, idx, toks, w, self.recv_capacity_per_rank
+                )
                 recv_t = jax.lax.with_sharding_constraint(
                     recv_t, NamedSharding(self.mesh, ep_spec_3d)
                 )
@@ -420,7 +429,9 @@ def test_dispatch_combine_3d_input_output(self):
 
             @jax.jit
             def run(idx, toks, w):
-                recv_t, recv_w, hm, _tc = ep_dispatch(self.hk, idx, toks, w, self.recv_capacity_per_rank)
+                recv_t, recv_w, hm, _tc = ep_dispatch(
+                    self.hk, idx, toks, w, self.recv_capacity_per_rank
+                )
                 recv_t = jax.lax.with_sharding_constraint(recv_t, NamedSharding(self.mesh, ep_t))
                 recv_w = jax.lax.with_sharding_constraint(recv_w, NamedSharding(self.mesh, ep_w))
                 out = ep_combine(
@@ -463,7 +474,9 @@ def test_dispatch_combine_dp_only_first_dim(self):
 
             @jax.jit
             def run(idx, toks, w):
-                recv_t, recv_w, hm, _tc = ep_dispatch(self.hk, idx, toks, w, self.recv_capacity_per_rank)
+                recv_t, recv_w, hm, _tc = ep_dispatch(
+                    self.hk, idx, toks, w, self.recv_capacity_per_rank
+                )
                 recv_t = jax.lax.with_sharding_constraint(recv_t, NamedSharding(self.mesh, ep_t))
                 recv_w = jax.lax.with_sharding_constraint(recv_w, NamedSharding(self.mesh, ep_w))
                 out = ep_combine(
@@ -641,7 +654,9 @@ def run(idx, toks, w):
                 idx = jax.lax.with_sharding_constraint(idx, NamedSharding(self.mesh, dp_spec))
                 toks = jax.lax.with_sharding_constraint(toks, NamedSharding(self.mesh, dp_spec))
                 w = jax.lax.with_sharding_constraint(w, NamedSharding(self.mesh, dp_spec))
-                recv_t, recv_w, hm, tc = ep_dispatch(self.hk, idx, toks, w, self.recv_capacity_per_rank)
+                recv_t, recv_w, hm, tc = ep_dispatch(
+                    self.hk, idx, toks, w, self.recv_capacity_per_rank
+                )
                 recv_t = jax.lax.with_sharding_constraint(
                     recv_t, NamedSharding(self.mesh, ep_spec_3d)
                 )
@@ -688,7 +703,9 @@ def fwd(eo, toks, idx, w):
                 w = jax.lax.with_sharding_constraint(w, NamedSharding(self.mesh, dp_spec))
                 _rt, rw, hm, tc = ep_dispatch(self.hk, idx, toks, w, self.recv_capacity_per_rank)
                 rw = jax.lax.with_sharding_constraint(rw, NamedSharding(self.mesh, ep_spec_2d))
-                combined = ep_combine(self.hk, hm, tc, eo, rw, T_dp, out_sharding=(("dp", "ep"), None))
+                combined = ep_combine(
+                    self.hk, hm, tc, eo, rw, T_dp, out_sharding=(("dp", "ep"), None)
+                )
                 return jax.lax.with_sharding_constraint(combined, NamedSharding(self.mesh, dp_spec))
 
             # jax.vjp + pinned cotangent feeds ep_combine_bwd/ep_dispatch_bwd
diff --git a/tests/jax/test_te_ep_moe.py b/tests/jax/test_te_ep_moe.py
@@ -112,8 +112,7 @@ def _read_mp_options():
 
 if not _MP_ACTIVE:
     pytest.skip(
-        "test_te_ep_moe.py requires the multiprocess launcher "
-        "(run_te_ep_moe.sh). Skipping.",
+        "test_te_ep_moe.py requires the multiprocess launcher (run_te_ep_moe.sh). Skipping.",
         allow_module_level=True,
     )
 
@@ -231,9 +230,7 @@ def mesh():
     # Eager bootstrap: ep_bootstrap does a host-side NCCL UID allgather
     # and cannot run from inside jax.jit. Sized to the worst-case recv_pr
     # across _CONFIGS so every parametrized config is bootstrap-compatible.
-    with mesh_obj, global_shard_guard(
-        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
-    ):
+    with mesh_obj, global_shard_guard(MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)):
         ep_bootstrap(
             world_size=num_procs,
             rank=jax.process_index(),
@@ -325,9 +322,7 @@ def _pure_jax_moe_reference(
         raise ValueError(f"Unsupported score_function={score_function!r}")
 
     routing_weights_full = jnp.zeros((T, num_experts), dtype=jnp.float32)
-    routing_weights_full = routing_weights_full.at[
-        jnp.arange(T)[:, None], top_indices
-    ].set(weights)
+    routing_weights_full = routing_weights_full.at[jnp.arange(T)[:, None], top_indices].set(weights)
 
     # FFN. ``apply_topk_weights_early`` is a fusion knob that doesn't
     # change the math (wo is linear), so the reference is identical for
@@ -337,9 +332,7 @@ def _pure_jax_moe_reference(
     intermediate = jax.nn.silu(layer_w0.astype(jnp.float32)) * layer_w1.astype(jnp.float32)
     intermediate = intermediate.astype(x.dtype)
     expert_out = jnp.einsum("tem,emh->teh", intermediate, wo)  # [T, E, H]
-    output_2d = jnp.einsum(
-        "te,teh->th", routing_weights_full.astype(x.dtype), expert_out
-    )
+    output_2d = jnp.einsum("te,teh->th", routing_weights_full.astype(x.dtype), expert_out)
     output = output_2d.reshape(B, S, H).astype(x.dtype)
 
     if aux_loss_coeff > 0.0:
@@ -354,9 +347,7 @@ def _pure_jax_moe_reference(
         else:  # sigmoid
             aux_scores = jax.nn.sigmoid(logits)
             if K > 1:
-                aux_scores = aux_scores / (
-                    aux_scores.sum(axis=-1, keepdims=True) + 1e-20
-                )
+                aux_scores = aux_scores / (aux_scores.sum(axis=-1, keepdims=True) + 1e-20)
         routing_map = (routing_weights_full > 0).astype(jnp.int32)
         tokens_per_expert = jnp.sum(routing_map, axis=0)  # [E]
         sum_probs_per_expert = jnp.sum(aux_scores, axis=0)  # [E]
@@ -567,9 +558,7 @@ def _reference_kwargs_from_config(config, params_np):
     return dict(
         score_function=config.get("score_function", "softmax"),
         expert_bias=(
-            jnp.asarray(params_np["expert_bias"])
-            if config.get("use_expert_bias", False)
-            else None
+            jnp.asarray(params_np["expert_bias"]) if config.get("use_expert_bias", False) else None
         ),
     )
 
@@ -720,9 +709,7 @@ def test_aux_loss(self, mesh):
         # wired.
         aux_grads = _grad_aux_only(block, variables, mesh, x)
         g_gate = np.asarray(
-            jax.device_get(
-                _unwrap(aux_grads["params"]["gate_kernel"]).addressable_data(0)
-            )
+            jax.device_get(_unwrap(aux_grads["params"]["gate_kernel"]).addressable_data(0))
         )
         assert np.all(np.isfinite(g_gate)), "gate grad NaN/Inf under aux-only loss"
         assert np.any(g_gate != 0.0), "aux bwd should propagate to gate_kernel"
@@ -735,9 +722,7 @@ def test_combined_loss_grads(self, mesh):
         variables, _, _ = _init_apply(block, mesh, x, jax.random.PRNGKey(23))
         grads = _grad_step(block, variables, mesh, x, include_aux=True)
         for name in ("gate_kernel", "wi_0", "wi_1", "wo"):
-            g_local = np.asarray(
-                jax.device_get(_unwrap(grads["params"][name]).addressable_data(0))
-            )
+            g_local = np.asarray(jax.device_get(_unwrap(grads["params"][name]).addressable_data(0)))
             assert np.all(np.isfinite(g_local)), f"{name} grad NaN/Inf under main+aux"
             assert np.any(g_local != 0.0), f"{name} grad zero under main+aux"
 
@@ -779,9 +764,7 @@ def test_init_apply_parity(self, mesh):
 
         grads = _grad_step(block, variables, mesh, x)
         for name in ("gate_kernel", "wi_0", "wi_1", "wo"):
-            g_local = np.asarray(
-                jax.device_get(_unwrap(grads["params"][name]).addressable_data(0))
-            )
+            g_local = np.asarray(jax.device_get(_unwrap(grads["params"][name]).addressable_data(0)))
             assert np.all(np.isfinite(g_local)), f"{name} grad NaN/Inf"
             assert np.any(g_local != 0.0), f"{name} grad zero"
 
@@ -801,9 +784,7 @@ def test_bootstrap_signature_mismatch_raises(self, mesh):
 
         # Different hidden dim → different bootstrap signature.
         bigger_hidden = HIDDEN * 2
-        x_b = jax.random.normal(
-            jax.random.PRNGKey(16), (BATCH, SEQ, bigger_hidden), dtype=DTYPE
-        )
+        x_b = jax.random.normal(jax.random.PRNGKey(16), (BATCH, SEQ, bigger_hidden), dtype=DTYPE)
         block_b = MoEBlock(
             num_experts=NUM_EXPERTS,
             num_experts_per_tok=TOPK,
diff --git a/transformer_engine/jax/cpp_extensions/ep.py b/transformer_engine/jax/cpp_extensions/ep.py
@@ -931,7 +931,12 @@ def ep_combine_fwd(handle, handle_mem, expert_out, num_local_tokens, out_partiti
 
 @compute_on("gpu_stream:collective")
 def ep_dispatch_bwd(
-    handle, handle_mem, grad, g_recv_topk_weights, top_k, num_local_tokens,
+    handle,
+    handle_mem,
+    grad,
+    g_recv_topk_weights,
+    top_k,
+    num_local_tokens,
     out_partition_spec=None,
 ):
     """Backward of dispatch; returns (grad_tokens, grad_topk_weights)."""
diff --git a/transformer_engine/jax/csrc/extensions/inspect.cpp b/transformer_engine/jax/csrc/extensions/inspect.cpp
@@ -121,14 +121,14 @@ Error_Type InspectFFI(cudaStream_t stream, Buffer_Type input_buf, Buffer_Type mi
 
 XLA_FFI_DEFINE_HANDLER_SYMBOL(InspectHandler, InspectFFI,
                               FFI::Bind()
-                                  .Ctx<FFI_Stream_Type>()           // stream
-                                  .Arg<Buffer_Type>()               // input
-                                  .Arg<Buffer_Type>()               // min
-                                  .Arg<Buffer_Type>()               // max
-                                  .Arg<Buffer_Type>()               // mean
-                                  .Arg<Buffer_Type>()               // std
-                                  .Ret<Buffer_Type>()               // output
-                                  .Attr<std::string_view>("name")   // probe name
+                                  .Ctx<FFI_Stream_Type>()          // stream
+                                  .Arg<Buffer_Type>()              // input
+                                  .Arg<Buffer_Type>()              // min
+                                  .Arg<Buffer_Type>()              // max
+                                  .Arg<Buffer_Type>()              // mean
+                                  .Arg<Buffer_Type>()              // std
+                                  .Ret<Buffer_Type>()              // output
+                                  .Attr<std::string_view>("name")  // probe name
 );
 
 }  // namespace jax
diff --git a/transformer_engine/jax/ep.py b/transformer_engine/jax/ep.py
@@ -49,8 +49,7 @@ def _allgather_uid(uid_arr, world_size, uid_size):
     devices = np.asarray(jax.devices())
     if devices.size != world_size:
         raise RuntimeError(
-            f"_allgather_uid fallback expected {world_size} global devices,"
-            f" got {devices.size}."
+            f"_allgather_uid fallback expected {world_size} global devices, got {devices.size}."
         )
     mesh = jax.sharding.Mesh(devices, ("_uid_all",))
     sharded = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("_uid_all", None))
@@ -268,8 +267,13 @@ def _dispatch_bwd(handle, recv_capacity_per_rank, res, g_outputs):
 
 @partial(jax.custom_vjp, nondiff_argnums=(0, 5, 6))
 def ep_combine(
-    handle, handle_mem, token_counts, expert_out, recv_topk_weights,
-    num_local_tokens, out_sharding=None,
+    handle,
+    handle_mem,
+    token_counts,
+    expert_out,
+    recv_topk_weights,
+    num_local_tokens,
+    out_sharding=None,
 ):
     """Reduce weighted expert outputs back to source ranks.
 
@@ -291,8 +295,13 @@ def ep_combine(
         ``[..., H]`` combined output shaped per ``num_local_tokens``.
     """
     return _combine_fwd(
-        handle, handle_mem, token_counts, expert_out, recv_topk_weights,
-        num_local_tokens, out_sharding,
+        handle,
+        handle_mem,
+        token_counts,
+        expert_out,
+        recv_topk_weights,
+        num_local_tokens,
+        out_sharding,
     )[0]
 
 
@@ -302,8 +311,13 @@ def _make_valid_mask(recv_topk_weights, dtype):
 
 
 def _combine_fwd(
-    handle, handle_mem, token_counts, expert_out, recv_topk_weights,
-    num_local_tokens, out_sharding,
+    handle,
+    handle_mem,
+    token_counts,
+    expert_out,
+    recv_topk_weights,
+    num_local_tokens,
+    out_sharding,
 ):
     del token_counts
     w = recv_topk_weights[..., None]
diff --git a/transformer_engine/jax/moe.py b/transformer_engine/jax/moe.py