Re-enable JAX 0.10.0 with MoE layout-constraint fix (#2683)

QiliangCui2023 · QiliangCui · web-flow · commit acd00f95b91c · 2026-05-21T01:52:05.000Z
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
Co-authored-by: Qiliang Cui <derrhein@gmail.com>
diff --git a/requirements.txt b/requirements.txt
@@ -5,9 +5,9 @@ pytest-mock
 absl-py
 numpy
 google-cloud-storage
-jax==0.9.2
-jaxlib==0.9.2
-libtpu==0.0.39
+jax==0.10.0
+jaxlib==0.10.0
+libtpu==0.0.40
 jaxtyping
 flax==0.12.4
 torchax==0.0.11
diff --git a/tests/kernels/mla_v2_test.py b/tests/kernels/mla_v2_test.py
@@ -14,8 +14,6 @@
 
 import os
 
-import pytest
-
 os.environ["LIBTPU_INIT_ARGS"] = (os.environ.get("LIBTPU_INIT_ARGS", "") +
                                   " --xla_tpu_scoped_vmem_limit_kib=65536")
 
@@ -408,9 +406,6 @@ def _test_mla_ragged_paged_attention(
         self.assertAllClose(expected_out, kernel_out, atol=0.1, rtol=0.2)
 
 
-# The test is slow on v6e, causing timeouts in presubmit. See b/513860288.
-@pytest.mark.skipif(not jtu.is_device_tpu_at_least(version=7),
-                    reason="Expect TPUv7+")
 @jtu.with_config(jax_numpy_dtype_promotion="standard")
 class MlaRaggedPagedAttentionKernelV2Test(MlaRaggedPagedAttentionTestBase):
 
diff --git a/tests/kernels/ragged_gather_reduce_test.py b/tests/kernels/ragged_gather_reduce_test.py
@@ -83,9 +83,6 @@ class ScatterTest(jtu.JaxTestCase):
     @parameterized.parameters(*_test_cases)
     def test_sc_ragged_gather_reduce(self, out_size, hidden_size, start_end,
                                      dtype, reduce_group_size):
-        # The test is slow on v6e, causing timeouts in presubmit. See b/513860288.
-        if not jtu.is_device_tpu_at_least(version=7):
-            self.skipTest("Expect TPUv7+")
         start, end = start_end
         start = min(start, out_size)
         end = min(end, out_size)
diff --git a/tests/layers/vllm/test_fp8.py b/tests/layers/vllm/test_fp8.py
@@ -431,9 +431,9 @@ def test_fused_moe(use_ep, num_devices, num_tokens, intermediate_size,
 
     a = torch.randn((num_tokens, hidden_size), dtype=dtype) / 10
     w1 = torch.randn(
-        (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 10
+        (num_experts, 2 * intermediate_size, hidden_size), dtype=dtype) / 100
     w2 = torch.randn(
-        (num_experts, hidden_size, intermediate_size), dtype=dtype) / 10
+        (num_experts, hidden_size, intermediate_size), dtype=dtype) / 100
     score = torch.randn((num_tokens, num_experts), dtype=dtype)
 
     engine_args = EngineArgs(model=MODELS[0],
diff --git a/tpu_inference/kernels/sparse_core/gather_reduce.py b/tpu_inference/kernels/sparse_core/gather_reduce.py
@@ -63,6 +63,8 @@ def __getitem__(self, shape):
 
 def is_supported_by_sc_gather_reduce(x_shape: int,
                                      sc_kernel_threshold: int) -> bool:
+    # TODO: Disabled (always returns False) until the numeric issue is fixed.
+    return False
     if x_shape > sc_kernel_threshold and pltpu.get_tpu_info().generation == 7:
         return True
     return False
diff --git a/tpu_inference/kernels/sparse_core/ragged_gather_reduce.py b/tpu_inference/kernels/sparse_core/ragged_gather_reduce.py
@@ -99,9 +99,10 @@ def main_kernel(
     # Inputs.
     num_rows_per_row_partition_ref: jax.Ref,
     in_hbm_ref: jax.Ref,
-    src_indices_hbm_ref: jax.Ref,
+    indices_hbm_ref: jax.Ref,
     dst_indices_hbm_ref: jax.Ref,
     topk_weights_hbm_ref: jax.Ref,
+    sorted_by_validity_hbm_ref: jax.Ref,
     # Outputs.
     out_hbm_ref: jax.Ref,
     # Scratch.
@@ -111,6 +112,7 @@ def main_kernel(
     src_indices_vmem_ref: jax.Ref,
     dst_indices_vmem_ref: jax.Ref,
     topk_weights_vmem_ref: jax.Ref,
+    sorted_by_validity_vmem_ref: jax.Ref,
     sem_ref: jax.Ref,
     *,
     core_axis_name: str,
@@ -176,9 +178,9 @@ def row_loop(row_block_id):
             dma_list = []
             dma_list.append(
                 pltpu.make_async_copy(
-                    src_indices_hbm_ref.at[pl.ds(row_tile_start,
-                                                 num_simd_lanes)],
-                    src_indices_vmem_ref,
+                    sorted_by_validity_hbm_ref.at[pl.ds(
+                        row_tile_start, num_simd_lanes)],
+                    sorted_by_validity_vmem_ref,
                     recv_sem,
                 ))
             dma_list.append(
@@ -188,13 +190,22 @@ def row_loop(row_block_id):
                     dst_indices_vmem_ref,
                     recv_sem,
                 ))
+            jax.tree.map(lambda x: x.start(), dma_list)
+            jax.tree.map(lambda x: x.wait(), dma_list)
+
+            dma_list = []
             dma_list.append(
                 pltpu.make_async_copy(
-                    topk_weights_hbm_ref.at[pl.ds(row_tile_start,
-                                                  num_simd_lanes)],
+                    topk_weights_hbm_ref.at[sorted_by_validity_vmem_ref],
                     topk_weights_vmem_ref,
                     recv_sem,
                 ))
+            dma_list.append(
+                pltpu.make_async_copy(
+                    indices_hbm_ref.at[sorted_by_validity_vmem_ref],
+                    src_indices_vmem_ref,
+                    recv_sem,
+                ))
             jax.tree.map(lambda x: x.start(), dma_list)
             jax.tree.map(lambda x: x.wait(), dma_list)
 
@@ -227,9 +238,12 @@ def row_loop(row_block_id):
 
             # VMEM to HBM transfer.
             # Use dynamic loop to minimize register spills.
+            @pl.loop(0,
+                     col_size,
+                     step=num_lanes,
+                     init_carry=(prev_dst_row_hbm, ))
             @jax.named_scope("dma_write_loop")
-            def dma_write_loop(i, carry):
-                col_vmem_start = i * num_lanes
+            def dma_write_loop(col_vmem_start, carry):
                 col_hbm_start = col_start + col_vmem_start
 
                 for _ in range(num_simd_lanes):
@@ -359,12 +373,6 @@ def dma_write_loop(i, carry):
 
                 return carry
 
-            jax.lax.fori_loop(
-                0,
-                pl.cdiv(col_size, num_lanes),
-                dma_write_loop,
-                init_val=(prev_dst_row_hbm, ),
-            )
             # Wait for dma write to finish.
             for _ in range(0, col_size, num_lanes):
                 for _ in range(num_simd_lanes):
@@ -380,12 +388,11 @@ def dma_write_loop(i, carry):
 # TODO(gxd): investigate if we can make the preprocessing more efficient.
 def _preprocess(
     indices: jax.Array,
-    topk_weights: jax.Array,
     valid_rows_mask: jax.Array,
     reduce_group_size: int,
     num_row_partitions: int,
     num_simd_lanes: int,
-) -> tuple[jax.Array, jax.Array, jax.Array, jax.Array, jax.Array]:
+) -> tuple[jax.Array, jax.Array, jax.Array, jax.Array]:
     """Preprocesses indices for ragged gather reduce."""
     assert indices.ndim == 1, "Ragged scatter only supports 1d indices."
 
@@ -403,12 +410,10 @@ def _preprocess(
     ) * row_partition_size)
     sorted_by_validity = sorted_by_validity.reshape(-1)
 
-    src_indices = indices[sorted_by_validity]
     # `reduce_group_size` source rows are mapped (and reduced) to the same output
     # row.
     dst_indices = sorted_by_validity // reduce_group_size
-    topk_weights = topk_weights[sorted_by_validity]
-    topk_weights = topk_weights.astype(jnp.float32)
+    sorted_by_validity = sorted_by_validity.astype(jnp.int32)
 
     num_src_rows_per_row_partition = jnp.sum(valid_rows_mask, axis=-1)
     assert num_row_partitions <= num_simd_lanes
@@ -421,9 +426,8 @@ def _preprocess(
     mask = jnp.any(valid_rows_mask.reshape(-1, reduce_group_size), axis=-1)
 
     return (
-        src_indices,
         dst_indices,
-        topk_weights,
+        sorted_by_validity,
         num_src_rows_per_row_partition,
         mask,
     )
@@ -521,14 +525,12 @@ def ragged_gather_reduce(
     col_size = x.shape[-1] // num_column_partitions
 
     (
-        src_indices,
         dst_indices,
-        topk_weights,
+        sorted_by_validity,
         num_src_rows_per_row_partition,
         mask,
     ) = _preprocess(
         indices,
-        topk_weights,
         valid_rows_mask,
         reduce_group_size,
         num_row_partitions,
@@ -566,12 +568,19 @@ def ragged_gather_reduce(
             pltpu.VMEM((num_simd_lanes, ), jnp.int32),
             pltpu.VMEM((num_simd_lanes, ), jnp.int32),
             pltpu.VMEM((num_simd_lanes, ), jnp.float32),
+            pltpu.VMEM((num_simd_lanes, ), jnp.int32),
             pltpu.SemaphoreType.DMA((2, )),
         ],
         mesh=vector_mesh,
         name="sc_ragged_gather_reduce",
-    )(num_src_rows_per_row_partition, x, src_indices, dst_indices,
-      topk_weights)
+    )(
+        num_src_rows_per_row_partition,
+        x,
+        indices,
+        dst_indices,
+        topk_weights.astype(jnp.float32),
+        sorted_by_validity,
+    )
 
     # If there is no valid source row in a reduce group, set that group's output
     # to zero.
diff --git a/tpu_inference/layers/common/process_weights/moe_weights.py b/tpu_inference/layers/common/process_weights/moe_weights.py
@@ -16,7 +16,7 @@
 
 import jax
 import jax.numpy as jnp
-from jax.experimental.layout import Layout, with_layout_constraint
+from jax.experimental.layout import Layout
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
 from torchax.tensor import Tensor
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
@@ -323,10 +323,6 @@ def process_moe_weights(
     w13_weight = jnp.swapaxes(w13_weight, 1, 2)
     w2_weight = jnp.swapaxes(w2_weight, 1, 2)
 
-    # Workaround for JAX error "must have valid byte strides"
-    w13_weight = with_layout_constraint(w13_weight, Layout((0, 1, 2)))
-    w2_weight = with_layout_constraint(w2_weight, Layout((0, 1, 2)))
-
     if w13_weight_scale is not None:
         # For block scales (experts, out_blocks, in_blocks), we need to maintain
         # the block dims
@@ -374,8 +370,6 @@ def process_moe_weights(
                 intermediate_size,
             )
             w13_weight = jnp.swapaxes(w13_weight, 1, 2)
-            w13_weight = with_layout_constraint(w13_weight, Layout(
-                (0, 1, 2, 3)))
 
             # Fused moe kernel expects dims to be multiple of 256.
             pad_width_intermediate_size = (align_to(intermediate_size, 256) -