Add Ulysses ring split override

csgoogle · csgoogle · commit d6638b6668d2 · 2026-05-15T09:06:03.000Z
diff --git a/README.md b/README.md
@@ -603,13 +603,17 @@ To generate images, run the following command:
   Public configs still shard sequence only over the `context` mesh axis. The attention kernel privately reshapes
   that context axis into hidden ring and Ulysses axes, runs the Ulysses all-to-all over the hidden Ulysses axis,
   and reuses Tokamax ring attention over the hidden ring axis.
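+  By default (`ulysses_ring_ulysses_parallelism=-1`) the split is selected automatically. For tuning, set
+  `ulysses_ring_ulysses_parallelism=<ulysses_shards>`, which must divide both the context shard count and the
+  number of attention heads; ring shards are derived as `ici_context_parallelism / ulysses_ring_ulysses_parallelism`.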
 
   ```bash
   python src/maxdiffusion/generate_wan.py \
   src/maxdiffusion/configs/base_wan_i2v_27b.yml \
   attention="ulysses_ring" \
   dcn_context_parallelism=<num_slices> \
   ici_context_parallelism=<context_shards_per_slice> \
+  ulysses_ring_ulysses_parallelism=<optional_ulysses_shards> \
   ...
   ```
 
diff --git a/docs/tpu_multihost_wan_bench.md b/docs/tpu_multihost_wan_bench.md
@@ -188,7 +188,7 @@ run_case ulysses_ring_dp2_cp8    ulysses_ring 2 8 1
 
 ## Topology Note
 
-TPU v7x exposes dual chiplets as two JAX devices. For `ulysses_ring`, expose only the total sequence sharding through `context`; the attention kernel derives a private ring and Ulysses split from that axis.
+TPU v7x exposes dual chiplets as two JAX devices. For `ulysses_ring`, expose only the total sequence sharding through `context`; the attention kernel derives a private ring and Ulysses split from that axis. To tune that split explicitly, set `ulysses_ring_ulysses_parallelism` to the desired number of Ulysses shards (it must divide both the context shard count and the number of attention heads); ring shards are derived as `ici_context_parallelism / ulysses_ring_ulysses_parallelism`.
 - `4x4` uses tensor `4`, so the dual-chip pairing is still inside the Ulysses side.
 
 The plain `ring` baseline has no Ulysses group, so it cannot preserve that property by construction.
diff --git a/docs/tpu_wan_bench_guide.md b/docs/tpu_wan_bench_guide.md
@@ -143,7 +143,7 @@ Set in `src/maxdiffusion/configs/base_wan_27b.yml` or overridden on the command
 **Parallelism rule**: product of all ICI axes must equal 8 (chips per host):
 - `ici_dp × ici_fsdp × ici_cp × ici_tp = 8`
 
-For `ulysses_ring`, set the desired total sequence shards with `ici_context_parallelism`; the internal ring and Ulysses split is selected by the attention kernel.
+For `ulysses_ring`, set the desired total sequence shards with `ici_context_parallelism`; the internal ring and Ulysses split is selected by the attention kernel. To tune it manually, set `ulysses_ring_ulysses_parallelism=<ulysses_shards>`; the value must divide both the context shard count and the number of attention heads, and the ring shard count is then derived as `ici_context_parallelism / ulysses_ring_ulysses_parallelism`.
 
 ---
 
diff --git a/src/maxdiffusion/configs/base_wan_14b.yml b/src/maxdiffusion/configs/base_wan_14b.yml
@@ -67,6 +67,9 @@ split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Optional for attention=ulysses_ring. -1 auto-selects the hidden split; otherwise this many context shards
+# are used for Ulysses (must divide both context shards and heads) and ring shards = context shards / this.
+ulysses_ring_ulysses_parallelism: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
diff --git a/src/maxdiffusion/configs/base_wan_1_3b.yml b/src/maxdiffusion/configs/base_wan_1_3b.yml
@@ -63,6 +63,9 @@ split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Optional for attention=ulysses_ring. -1 auto-selects the hidden split; otherwise this many context shards
+# are used for Ulysses (must divide both context shards and heads) and ring shards = context shards / this.
+ulysses_ring_ulysses_parallelism: -1
 flash_min_seq_length: 0
 
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
diff --git a/src/maxdiffusion/configs/base_wan_27b.yml b/src/maxdiffusion/configs/base_wan_27b.yml
@@ -67,6 +67,9 @@ split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Optional for attention=ulysses_ring. -1 auto-selects the hidden split; otherwise this many context shards
+# are used for Ulysses (must divide both context shards and heads) and ring shards = context shards / this.
+ulysses_ring_ulysses_parallelism: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
diff --git a/src/maxdiffusion/configs/base_wan_animate.yml b/src/maxdiffusion/configs/base_wan_animate.yml
@@ -65,6 +65,9 @@ split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Optional for attention=ulysses_ring. -1 auto-selects the hidden split; otherwise this many context shards
+# are used for Ulysses (must divide both context shards and heads) and ring shards = context shards / this.
+ulysses_ring_ulysses_parallelism: -1
 flash_min_seq_length: 4096
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
 # Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
diff --git a/src/maxdiffusion/configs/base_wan_i2v_14b.yml b/src/maxdiffusion/configs/base_wan_i2v_14b.yml
@@ -67,6 +67,9 @@ split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Optional for attention=ulysses_ring. -1 auto-selects the hidden split; otherwise this many context shards
+# are used for Ulysses (must divide both context shards and heads) and ring shards = context shards / this.
+ulysses_ring_ulysses_parallelism: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
diff --git a/src/maxdiffusion/configs/base_wan_i2v_27b.yml b/src/maxdiffusion/configs/base_wan_i2v_27b.yml
@@ -67,6 +67,9 @@ split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Optional for attention=ulysses_ring. -1 auto-selects the hidden split; otherwise this many context shards
+# are used for Ulysses (must divide both context shards and heads) and ring shards = context shards / this.
+ulysses_ring_ulysses_parallelism: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py
@@ -206,11 +206,28 @@ def _replace_mesh_axis_names(axis_names, old_axis: str, new_axes: tuple[str, ...
   )
 
 
-def _choose_internal_ulysses_shards(context_shards: int, heads: int) -> int:
+def _choose_internal_ulysses_shards(
+    context_shards: int,
+    heads: int,
+    requested_ulysses_shards: int = -1,
+) -> int:
   """Choose a hidden Ulysses split inside the public context axis."""
   if context_shards <= 1:
     raise ValueError(f"Ulysses ring attention requires context_shards > 1, got {context_shards}.")
 
+  if requested_ulysses_shards and requested_ulysses_shards > 0:
+    if context_shards % requested_ulysses_shards != 0:
+      raise ValueError(
+          "Ulysses ring attention requires the requested Ulysses shard count to divide the context shard count, "
+          f"got context_shards={context_shards} and ulysses_shards={requested_ulysses_shards}."
+      )
+    if heads % requested_ulysses_shards != 0:
+      raise ValueError(
+          "Ulysses ring attention requires the number of heads to be divisible by the requested Ulysses shard "
+          f"count, got heads={heads} and ulysses_shards={requested_ulysses_shards}."
+      )
+    return requested_ulysses_shards
+
   balanced_limit = int(math.sqrt(context_shards))
   balanced_candidates = [
       factor
@@ -844,6 +861,7 @@ def _ulysses_ring_attention(
     ring_axis: str = INTERNAL_RING_AXIS,
     use_base2_exp: bool = False,
     use_experimental_scheduler: bool = False,
+    ulysses_ring_ulysses_parallelism: int = -1,
 ) -> jax.Array:
   """2D context-parallel attention using a private Ulysses x ring mesh.
 
@@ -857,7 +875,11 @@ def _ulysses_ring_attention(
     raise ValueError(f"Ulysses ring attention requires mesh axis {context_axis!r}, got mesh axes {mesh.shape}.")
 
   num_context_shards = mesh.shape[context_axis]
-  num_ulysses_shards = _choose_internal_ulysses_shards(num_context_shards, heads)
+  num_ulysses_shards = _choose_internal_ulysses_shards(
+      num_context_shards,
+      heads,
+      requested_ulysses_shards=ulysses_ring_ulysses_parallelism,
+  )
   num_ring_shards = num_context_shards // num_ulysses_shards
   internal_mesh = _create_internal_ulysses_ring_mesh(
       mesh,
@@ -1166,6 +1188,7 @@ def ulysses_ring_kernel(q, k, v, context):
       attention_mask=context["attention_mask"],
       use_base2_exp=context["use_base2_exp"],
       use_experimental_scheduler=context["use_experimental_scheduler"],
+      ulysses_ring_ulysses_parallelism=context["ulysses_ring_ulysses_parallelism"],
   )
 
 
@@ -1279,6 +1302,7 @@ def _apply_attention(
     attention_mask: Array = None,
     use_base2_exp: bool = False,
     use_experimental_scheduler: bool = False,
+    ulysses_ring_ulysses_parallelism: int = -1,
 ):
   """Routes to different attention kernels using a module-level registry."""
 
@@ -1316,6 +1340,7 @@ def _apply_attention(
       "scale": scale,
       "use_base2_exp": use_base2_exp,
       "use_experimental_scheduler": use_experimental_scheduler,
+      "ulysses_ring_ulysses_parallelism": ulysses_ring_ulysses_parallelism,
       "dim_head": dim_head,
       "split_head_dim": split_head_dim,
       "float32_qk_product": float32_qk_product,
@@ -1521,10 +1546,12 @@ def __init__(
       residual_checkpoint_name: str | None = None,
       use_base2_exp: bool = False,
       use_experimental_scheduler: bool = False,
+      ulysses_ring_ulysses_parallelism: int = -1,
   ):
     self.dpa_layer = None
     self.use_base2_exp = use_base2_exp
     self.use_experimental_scheduler = use_experimental_scheduler
+    self.ulysses_ring_ulysses_parallelism = ulysses_ring_ulysses_parallelism
     if attention_kernel == "cudnn_flash_te":
       from transformer_engine.jax.flax.transformer import DotProductAttention  # pytype: disable=import-error
 
@@ -1587,6 +1614,9 @@ def apply_attention(self, query: Array, key: Array, value: Array, attention_mask
         attention_mask=attention_mask,
         use_base2_exp=self.use_base2_exp if hasattr(self, "use_base2_exp") else False,
         use_experimental_scheduler=self.use_experimental_scheduler if hasattr(self, "use_experimental_scheduler") else False,
+        ulysses_ring_ulysses_parallelism=(
+            self.ulysses_ring_ulysses_parallelism if hasattr(self, "ulysses_ring_ulysses_parallelism") else -1
+        ),
     )
 
 
@@ -1607,6 +1637,7 @@ class AttentionOp(nn.Module):
   quant: Quant = None
   use_base2_exp: bool = False
   use_experimental_scheduler: bool = False
+  ulysses_ring_ulysses_parallelism: int = -1
 
   def setup(self):
     self.dpa_layer = None
@@ -1654,6 +1685,7 @@ def apply_attention(self, query: Array, key: Array, value: Array, attention_mask
         attention_mask=attention_mask,
         use_base2_exp=self.use_base2_exp,
         use_experimental_scheduler=self.use_experimental_scheduler,
+        ulysses_ring_ulysses_parallelism=self.ulysses_ring_ulysses_parallelism,
     )
 
 
@@ -1692,6 +1724,7 @@ def __init__(
       image_seq_len: Optional[int] = None,  # New for I2V
       use_base2_exp: bool = False,
       use_experimental_scheduler: bool = False,
+      ulysses_ring_ulysses_parallelism: int = -1,
   ):
     if attention_kernel in {"flash", "cudnn_flash_te"} and mesh is None:
       raise ValueError(f"The flash attention kernel requires a value for mesh, but mesh is {self.mesh}")
@@ -1740,6 +1773,7 @@ def __init__(
         residual_checkpoint_name=residual_checkpoint_name,
         use_base2_exp=use_base2_exp,
         use_experimental_scheduler=use_experimental_scheduler,
+        ulysses_ring_ulysses_parallelism=ulysses_ring_ulysses_parallelism,
     )
     # None axes corresponds to the stacked weights across all blocks
     # because of the use of nnx.vmap and nnx.scan.
diff --git a/src/maxdiffusion/models/wan/transformers/transformer_wan.py b/src/maxdiffusion/models/wan/transformers/transformer_wan.py
@@ -355,6 +355,7 @@ def __init__(
       enable_jax_named_scopes: bool = False,
       use_base2_exp: bool = False,
       use_experimental_scheduler: bool = False,
+      ulysses_ring_ulysses_parallelism: int = -1,
   ):
     self.enable_jax_named_scopes = enable_jax_named_scopes
 
@@ -381,6 +382,7 @@ def __init__(
         enable_jax_named_scopes=enable_jax_named_scopes,
         use_base2_exp=use_base2_exp,
         use_experimental_scheduler=use_experimental_scheduler,
+        ulysses_ring_ulysses_parallelism=ulysses_ring_ulysses_parallelism,
     )
 
     # 1. Cross-attention
@@ -407,6 +409,7 @@ def __init__(
         enable_jax_named_scopes=enable_jax_named_scopes,
         use_base2_exp=use_base2_exp,
         use_experimental_scheduler=use_experimental_scheduler,
+        ulysses_ring_ulysses_parallelism=ulysses_ring_ulysses_parallelism,
     )
     assert cross_attn_norm is True
     self.norm2 = FP32LayerNorm(rngs=rngs, dim=dim, eps=eps, elementwise_affine=True)
@@ -572,6 +575,7 @@ def __init__(
       enable_jax_named_scopes: bool = False,
       use_base2_exp: bool = False,
       use_experimental_scheduler: bool = False,
+      ulysses_ring_ulysses_parallelism: int = -1,
   ):
     inner_dim = num_attention_heads * attention_head_dim
     out_channels = out_channels or in_channels
@@ -639,6 +643,7 @@ def init_block(rngs):
           image_seq_len=image_seq_len,
           use_base2_exp=use_base2_exp,
           use_experimental_scheduler=use_experimental_scheduler,
+          ulysses_ring_ulysses_parallelism=ulysses_ring_ulysses_parallelism,
       )
 
     self.gradient_checkpoint = GradientCheckpointType.from_str(remat_policy)
@@ -667,6 +672,7 @@ def init_block(rngs):
             precision=precision,
             attention=attention,
             enable_jax_named_scopes=enable_jax_named_scopes,
+            ulysses_ring_ulysses_parallelism=ulysses_ring_ulysses_parallelism,
         )
         blocks.append(block)
       self.blocks = nnx.data(blocks)
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline.py b/src/maxdiffusion/pipelines/wan/wan_pipeline.py
@@ -140,6 +140,7 @@ def create_model(rngs: nnx.Rngs, wan_config: dict):
   wan_config["enable_jax_named_scopes"] = config.enable_jax_named_scopes
   wan_config["use_base2_exp"] = config.use_base2_exp
   wan_config["use_experimental_scheduler"] = config.use_experimental_scheduler
+  wan_config["ulysses_ring_ulysses_parallelism"] = getattr(config, "ulysses_ring_ulysses_parallelism", -1)
 
   # 2. eval_shape - will not use flops or create weights on device
   # thus not using HBM memory.
diff --git a/src/maxdiffusion/tests/attention_test.py b/src/maxdiffusion/tests/attention_test.py
@@ -87,6 +87,16 @@ def _ulysses_block_sizes(self, block_size=4):
         use_fused_bwd_kernel=False,
     )
 
+  def test_choose_internal_ulysses_shards_honors_requested_split(self):
+    self.assertEqual(
+        attention_flax._choose_internal_ulysses_shards(4, 40, requested_ulysses_shards=2),
+        2,
+    )
+    with self.assertRaisesRegex(ValueError, r"context_shards=4 and ulysses_shards=3"):
+      attention_flax._choose_internal_ulysses_shards(4, 40, requested_ulysses_shards=3)
+    with self.assertRaisesRegex(ValueError, r"heads=40 and ulysses_shards=3"):
+      attention_flax._choose_internal_ulysses_shards(6, 40, requested_ulysses_shards=3)
+
   def test_splash_attention(self):
     """Test numerics of splash attention are equivalent to dot_product"""
 
@@ -512,6 +522,7 @@ def fake_kernel(q, k, v, segment_ids):
           ),
           flash_block_sizes=self._ulysses_block_sizes(),
           dtype=jnp.float32,
+          ulysses_ring_ulysses_parallelism=4,
       )
 
     self.assertEqual(output.shape, query.shape)