
Commit eb52c0b
Add packed (THD) ring attention with hardware-aware reorder dispatch
Enable CP + packing for context_parallel_strategy="ring" with load balancing. On GPU, this uses Transformer Engine's striped reorder for THD-packed sequences. On TPU/CPU, it falls back to the pure-JAX reorder_sequence and never imports TE.

Changes:

- common_types: Add ReorderStrategy enum (AUTO, DUAL_CHUNK_SWAP, STRIPED).
- configs: Add context_parallel_reorder_strategy (default "auto"). Reject explicit STRIPED on non-GPU hardware at config validation time.
- attention_op: Thread segment_positions through apply_attention, cudnn_flash_attention, and __call__. Use segment_positions in TE's SequenceDescriptor for packing. Restrict packing + CP to load-balanced ring only. Note the TE version constraint.
- attentions.py, attention_mla.py, gpt3.py: Pass inputs_positions into attention_op calls (None for gpt3).
- max_utils: Hardware-dispatched reorder_causal_load_balanced. GPU uses TE's reorder_causal_load_balancing; TPU/CPU uses reorder_sequence. The TE import is lazy and GPU-only.
- maxtext_utils: Thread reorder_strategy and hardware through shard_reorder_causal_load_balanced and get_reorder_callable. The default hardware="tpu" never triggers a TE import.
- train_utils: Allow ring + packing; forbid all_gather + packing and synthetic + packing. Resolve AUTO to STRIPED when packing, else DUAL_CHUNK_SWAP (see the sketch below). Pass config.hardware to the reorder callable. Build data_loader after the reorder wrapper is applied.
- attention_test_util: Pass cfg_cp.hardware so TPU tests use the pure-JAX reorder. The helper is TPU-oriented and does not model GPU packed behavior.
- tests: Add test_gpu_ring_attention_with_packing (sm90+). Requires TE with reorder_causal_load_balancing; works with TE <=2.11 or >=2.14 (incompatible with 2.12 and 2.13 due to a known bug).
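A minimal sketch of the AUTO resolution rule from the train_utils bullet above. The helper name resolve_reorder_strategy is hypothetical; the actual resolution lives inline in train_utils.

from maxtext.common.common_types import ReorderStrategy

def resolve_reorder_strategy(configured: ReorderStrategy, packing: bool) -> ReorderStrategy:
  # Hypothetical helper restating the rule: an explicit choice wins
  # (config validation already rejects STRIPED on non-GPU hardware).
  if configured != ReorderStrategy.AUTO:
    return configured
  # Packed (THD) sequences use TE's striped reorder; unpacked causal
  # batches keep the dual-chunk mirror swap.
  return ReorderStrategy.STRIPED if packing else ReorderStrategy.DUAL_CHUNK_SWAP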
1 parent 3de696b commit eb52c0b

13 files changed: 223 additions & 51 deletions


src/maxtext/common/common_types.py

Lines changed: 10 additions & 0 deletions
@@ -122,6 +122,16 @@ class ShardMode(enum.Enum):
   EXPLICIT = "explicit"


+class ReorderStrategy(enum.Enum):
+  """Reorder strategies for load-balanced context parallelism.
+  Maps to transformer_engine.jax.attention.ReorderStrategy at runtime.
+  """
+
+  AUTO = "auto"
+  DUAL_CHUNK_SWAP = "dual_chunk_swap"
+  STRIPED = "striped"
+
+
 class HyperConnectionType(enum.Enum):
   ATTENTION = "attention"
   MLP_MOE = "mlp_moe"

src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
@@ -1038,6 +1038,7 @@ use_splash_scheduler: False # to use tokamax splash attention scheduler.
 ### Determine if we want to use load balance for context parallelism
 context_parallel_load_balance: True
 context_parallel_strategy: "all_gather" # "all_gather" or "ring"
+context_parallel_reorder_strategy: "auto" # "auto", "dual_chunk_swap", or "striped"

 ### Paged Attention ###
 # These settings take effect only when `attention=paged`.

src/maxtext/configs/pyconfig_deprecated.py

Lines changed: 2 additions & 1 deletion
@@ -30,7 +30,7 @@

 from maxtext.utils import accelerator_to_spec_map
 from maxtext.utils.globals import MAXTEXT_ASSETS_ROOT, MAXTEXT_REPO_ROOT, MAXTEXT_PKG_DIR
-from maxtext.common.common_types import AttentionType, DecoderBlockType, ShardMode
+from maxtext.common.common_types import AttentionType, DecoderBlockType, ReorderStrategy, ShardMode
 from maxtext.utils import gcs_utils
 from maxtext.utils import max_logging
 from maxtext.utils import max_utils

@@ -856,6 +856,7 @@ def user_init(raw_keys):

     raw_keys["decoder_block"] = DecoderBlockType(raw_keys["decoder_block"])
     raw_keys["shard_mode"] = ShardMode(raw_keys["shard_mode"])
+    raw_keys["context_parallel_reorder_strategy"] = ReorderStrategy(raw_keys["context_parallel_reorder_strategy"])

   @staticmethod
   def configure_gpt3_task(raw_keys):
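For reference, the raw-string-to-enum conversion above behaves like any standard Python enum lookup, so an unknown value fails fast at config-parse time (illustrative snippet, not part of the commit):

from maxtext.common.common_types import ReorderStrategy

assert ReorderStrategy("striped") is ReorderStrategy.STRIPED
ReorderStrategy("stripes")  # raises ValueError: 'stripes' is not a valid ReorderStrategy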

src/maxtext/configs/types.py

Lines changed: 19 additions & 1 deletion
@@ -29,7 +29,7 @@
 from typing import Any, Literal, NewType, Optional

 import jax
-from maxtext.common.common_types import AttentionType, DecoderBlockType, ShardMode
+from maxtext.common.common_types import AttentionType, DecoderBlockType, ReorderStrategy, ShardMode
 from maxtext.utils import gcs_utils
 from maxtext.utils import max_utils
 from maxtext.utils.globals import MAXTEXT_ASSETS_ROOT

@@ -821,6 +821,10 @@ class HardwareAndMesh(BaseModel):
       "all_gather",
       description="Strategy for context parallelism ('all_gather' or 'ring').",
   )
+  context_parallel_reorder_strategy: ReorderStrategy = Field(
+      "auto",
+      description="Reorder strategy for load-balanced context parallelism.",
+  )
   custom_mesh: str = Field("", description="Available options: ['hybrid_ring_64x4', 'hybrid_ring_32x8']")
   custom_mesh_and_rule: str = Field("", description="Customized mesh and logical rules for granularity.")
   allow_split_physical_axes: bool = Field(False, description="Allow splitting physical axes for device mesh creation.")

@@ -2672,6 +2676,20 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
       raise ValueError(
           "Ring context parallelism strategy (context_parallel_strategy='ring') is only supported on GPUs."
       )
+    # STRIPED reorder strategy is a Transformer Engine feature and is GPU-only.
+    # The AUTO + packing case (which training resolves to STRIPED) is not validated here
+    # because test code paths may load the same config but use a different reorder path.
+    # Training's runtime path in max_utils.reorder_causal_load_balanced enforces this.
+    if (
+        self.context_parallel_size > 1
+        and "gpu" not in self.hardware
+        and self.context_parallel_load_balance
+        and self.context_parallel_reorder_strategy == ReorderStrategy.STRIPED
+    ):
+      raise ValueError(
+          "STRIPED reorder strategy requires Transformer Engine and is only supported on GPUs. "
+          f"Got hardware={self.hardware!r}."
+      )
     if self.hardware == "gpu" and self.packing and self.attention == "cudnn_flash_te" and self.max_segments_per_seq <= 0:
       raise ValueError("max_segments_per_seq must be set when using TransformerEngine attention and packing")
     dcn_product = (
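A standalone restatement of the predicate this validator enforces (hypothetical function, shown only to make the rejected combination explicit):

def rejects_striped(hardware: str, cp_size: int, load_balance: bool, strategy: str) -> bool:
  # Mirrors the validator: explicit STRIPED is rejected whenever load-balanced
  # context parallelism is active on non-GPU hardware.
  return cp_size > 1 and "gpu" not in hardware and load_balance and strategy == "striped"

assert rejects_striped("tpu", cp_size=4, load_balance=True, strategy="striped")
assert not rejects_striped("gpu", cp_size=4, load_balance=True, strategy="striped")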

src/maxtext/layers/attention_mla.py

Lines changed: 1 addition & 0 deletions
@@ -1217,6 +1217,7 @@ def __call__(
         key,
         value,
         decoder_segment_ids,
+        inputs_positions,
         model_mode,
         cached_values,
         indexer_mask=indexer_mask,

src/maxtext/layers/attention_op.py

Lines changed: 29 additions & 8 deletions
@@ -871,6 +871,7 @@ def apply_attention(
       key: Array | KVTensor,
       value: Array | KVTensor,
       decoder_segment_ids: Array | None,
+      segment_positions: Array | None,
       lengths: Array | None,
       model_mode: str,
       use_ragged_attention: bool = False,

@@ -1003,7 +1004,7 @@
           Use `dot_product` instead."""
       )
       return (
-          self.cudnn_flash_attention(query, key, value, decoder_segment_ids, model_mode),
+          self.cudnn_flash_attention(query, key, value, decoder_segment_ids, segment_positions, model_mode),
           None,
           None,
       )

@@ -1513,12 +1514,15 @@ def cudnn_flash_attention(
       key: Array,
       value: Array,
       decoder_segment_ids: Array | None,
+      segment_positions: Array | None,
       model_mode: str = MODEL_MODE_TRAIN,
   ) -> Array:
     """CUDNN Flash Attention with Transformer Engine.
-
-    1. Stable API, supports MHA, GQA, SWA, Packing and Context Parallelism 2.
-    Context Parallelism currently only supports causal masking and no packing
+    1. Stable API, supports MHA, GQA, SWA, Packing and Context Parallelism
+    2. Context Parallelism currently only supports causal masking
+    3. Only Ring attention has packing support with striped load balancing
+       (context_parallel_strategy="ring" and context_parallel_load_balance=true)
+    4. Breaks with TE 2.12 and 2.13 (known bug); works with TE stable release <=2.11 or >=2.14.
     """
     # These imports are only meant to work in a GPU build.
     # pylint: disable=import-outside-toplevel

@@ -1528,6 +1532,11 @@ def cudnn_flash_attention(
     _, _, _, head_dim = query.shape  # pylint: disable=unused-variable

     using_context_parallelism = self.mesh.shape[self.config.context_sharding] > 1
+    using_load_balanced_ring_cp = (
+        using_context_parallelism
+        and self.config.context_parallel_strategy == "ring"
+        and self.config.context_parallel_load_balance
+    )

     # Initialize default attention configuration
     sliding_window_size = None

@@ -1541,18 +1550,27 @@ def cudnn_flash_attention(

     # Handle packing configurations
     if self.config.packing and self.config.dataset_type != "synthetic":
+      if using_context_parallelism and not using_load_balanced_ring_cp:
+        raise ValueError("Packing is only supported for load balanced ring attention with context parallelism.")
       qkv_layout = "THD_THD_THD"  # Packed format: 'T3HD', 'THD_T2HD' or 'THD_THD_THD'
       if decoder_segment_ids is None:
         decoder_segment_ids = jnp.ones(shape=query.shape[:2], dtype=jnp.int32)
-      attn_mask = SequenceDescriptor.from_segment_ids_and_pos(segment_ids=decoder_segment_ids, segment_pos=None)
+      attn_mask = SequenceDescriptor.from_segment_ids_and_pos(
+          segment_ids=decoder_segment_ids, segment_pos=segment_positions
+      )
       # Create dummy SequenceDescriptor for lazy_init
       dummy_segment_ids = jnp.ones(shape=query.shape[:2], dtype=jnp.int32)
-      dummy_attn_mask = SequenceDescriptor.from_segment_ids_and_pos(segment_ids=dummy_segment_ids, segment_pos=None)
+      dummy_attn_mask = SequenceDescriptor.from_segment_ids_and_pos(
+          segment_ids=dummy_segment_ids, segment_pos=segment_positions
+      )
       max_segments_per_seq = self.config.max_segments_per_seq
     elif using_context_parallelism:
       if self.attention_type == AttentionType.LOCAL_SLIDING:
-        raise AssertionError("Sliding window attention is not supported for context parallelism")
-      # Context parallelism without packing: only supports causal masking
+        raise AssertionError(
+            "Sliding window attention requires context parallelism with load-balanced ring strategy "
+            "and packing enabled."
+        )
+      # Context parallelism without packing: only supports causal masking, but not sliding window attention
       attn_mask = None
       dummy_attn_mask = None
       mask_type = "causal"

@@ -2003,6 +2021,7 @@ def __call__(
       key,
       value,
       decoder_segment_ids,
+      inputs_positions,
       model_mode,
       cached_values=None,
       previous_chunk=None,

@@ -2034,6 +2053,7 @@
         key=key,
         value=value,
         decoder_segment_ids=decoder_segment_ids,
+        segment_positions=inputs_positions,
         lengths=None,
         model_mode=model_mode,
         use_ragged_attention=self.use_ragged_attention,

@@ -2059,6 +2079,7 @@
         key=key,
         value=value,
         decoder_segment_ids=decoder_segment_ids,
+        segment_positions=inputs_positions,
         lengths=lengths,
         model_mode=model_mode,
         use_ragged_attention=self.use_ragged_attention,
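For intuition about what segment_positions carries in the packed (THD) path: each row concatenates multiple sequences, segment ids mark which tokens belong together, and positions restart at 0 per sequence. A minimal pure-JAX illustration (values are made up; only the SequenceDescriptor call is from this commit):

import jax.numpy as jnp

# Two sequences of lengths 3 and 5 packed into one row of length 8.
segment_ids = jnp.array([[1, 1, 1, 2, 2, 2, 2, 2]], dtype=jnp.int32)
segment_positions = jnp.array([[0, 1, 2, 0, 1, 2, 3, 4]], dtype=jnp.int32)

# With packing enabled, cudnn_flash_attention now forwards both tensors to TE
# via SequenceDescriptor.from_segment_ids_and_pos(segment_ids=segment_ids,
# segment_pos=segment_positions) so the fused kernel confines attention to
# each packed segment.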

src/maxtext/layers/attentions.py

Lines changed: 1 addition & 0 deletions
@@ -1184,6 +1184,7 @@ def __call__(
         key,
         value,
         decoder_segment_ids,
+        inputs_positions,
         model_mode,
         cached_values,
         previous_chunk,

src/maxtext/models/gpt3.py

Lines changed: 1 addition & 1 deletion
@@ -328,7 +328,7 @@ def __call__(
     value = nn.with_logical_constraint(value, self.value_axis_names)
     value = checkpoint_name(value, "value_proj")

-    out = self.attention_op(query, key, value, decoder_segment_ids, model_mode)
+    out = self.attention_op(query, key, value, decoder_segment_ids, None, model_mode)

     out = nn.with_logical_constraint(out, self.out_axis_names)

src/maxtext/utils/max_utils.py

Lines changed: 78 additions & 19 deletions
@@ -887,27 +887,86 @@ def reorder_sequence(tensor, cp_size: int, seq_dim: int = 1, to_contiguous: bool
   return reordered.reshape(ori_tensor_shape)


-@partial(jax.jit, static_argnums=1)
-def reorder_causal_load_balanced(batch, cp_size):
-  """Reorders the example batch sequences"""
-  return {
-      key: reorder_sequence(
-          value,  # Pass each key's value inside batch separately
-          cp_size=cp_size,
-      )
-      if key
-      in [
-          "inputs",
-          "targets",
-          "inputs_position",
-          "targets_position",
-          "inputs_segmentation",
-          "targets_segmentation",
-      ]
-      else value
-      for key, value in batch.items()
+@partial(jax.jit, static_argnums=(1, 2, 3))
+def reorder_causal_load_balanced(batch, cp_size, reorder_strategy, hardware="tpu"):
+  """Reorders the example batch sequences using a hardware-appropriate backend.
+
+  On GPU (hardware="gpu" or "gpu_multiprocess"), uses Transformer Engine's
+  reorder_causal_load_balancing which supports both DUAL_CHUNK_SWAP and STRIPED strategies.
+  On TPU/CPU, falls back to the pure-JAX reorder_sequence (DUAL_CHUNK_SWAP only).
+
+  Args:
+    batch: The batch to reorder.
+    cp_size: The size of the compute parallelism.
+    reorder_strategy: The ReorderStrategy enum value (DUAL_CHUNK_SWAP or STRIPED).
+    hardware: The hardware type string ("tpu", "gpu", "gpu_multiprocess", "cpu").
+
+  Returns:
+    The reordered batch.
+
+  Reorder Strategy:
+    - DUAL_CHUNK_SWAP: This strategy splits each query into two chunks and do the mirror swap between
+      GPUs. This is currently used for non-THD load balance. It requires the max_seqlens be the
+      multiple of 2 * cp_size.
+      Examples:
+        - Before reorder: GPU0: [0, 1, 2, 3]; GPU1: [4, 5, 6, 7]; GPU2: [8, 9, 10, 11]; GPU3: [12, 13, 14, 15];
+        - After reorder: GPU0: [0, 1, 14, 15]; GPU1: [4, 5, 10, 11]; GPU2: [8, 9, 6, 7]; GPU3: [12, 13, 2, 3]
+
+    - STRIPED: This strategy distributes the tokens in a striped (interleaved) manner across
+      the sequence. This is currently used for THD load balance.
+      Example: Consider 4 GPUs with seqlens=16.
+        - Before reorder: GPU0: [0, 1, 2, 3]; GPU1: [4, 5, 6, 7]; ...; GPU3: [12, 13, 14, 15]
+        - After reorder: GPU0: [0, 4, 8, 12]; GPU1: [1, 5, 9, 13]; ...; GPU3: [3, 7, 11, 15]
+
+  See: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/jax/attention.py
+  """
+  # pylint: disable=import-outside-toplevel
+  from maxtext.common.common_types import ReorderStrategy
+
+  _reorder_keys = {
+      "inputs",
+      "targets",
+      "inputs_position",
+      "targets_position",
+      "inputs_segmentation",
+      "targets_segmentation",
   }

+  if hardware in ("gpu", "gpu_multiprocess"):
+    from transformer_engine.jax.attention import ReorderStrategy as TE_ReorderStrategy
+    from transformer_engine.jax.attention import reorder_causal_load_balancing
+
+    reorder_strategy_map = {
+        ReorderStrategy.DUAL_CHUNK_SWAP: TE_ReorderStrategy.DualChunkSwap,
+        ReorderStrategy.STRIPED: TE_ReorderStrategy.Striped,
+    }
+
+    return {
+        key: reorder_causal_load_balancing(
+            value,
+            reorder_strategy_map[reorder_strategy],
+            cp_size=cp_size,
+            seq_dim=1,
+        )
+        if key in _reorder_keys
+        else value
+        for key, value in batch.items()
+    }
+  else:
+    if reorder_strategy == ReorderStrategy.STRIPED:
+      raise ValueError(
+          f"STRIPED reorder strategy requires Transformer Engine and is only supported on GPU, got hardware={hardware!r}."
+      )
+    return {
+        key: reorder_sequence(
+            value,
+            cp_size=cp_size,
+        )
+        if key in _reorder_keys
+        else value
+        for key, value in batch.items()
+    }


 @staticmethod
 def reorder_mask_load_balancing(tensor, cp_size: int, seq_dim: int):
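The two layouts in the docstring can be reproduced in a few lines of NumPy. This is a hedged restatement of the examples above, not the implementation used by reorder_sequence or TE:

import numpy as np

def dual_chunk_swap(tokens: np.ndarray, cp_size: int) -> np.ndarray:
  # Split into 2 * cp_size chunks; rank r keeps chunk 2r plus the mirror
  # chunk (2 * cp_size - 1 - 2r), balancing early and late tokens.
  chunks = np.split(tokens, 2 * cp_size)
  return np.concatenate([np.concatenate([chunks[2 * r], chunks[2 * cp_size - 1 - 2 * r]]) for r in range(cp_size)])

def striped(tokens: np.ndarray, cp_size: int) -> np.ndarray:
  # Rank r takes every cp_size-th token starting at offset r.
  return np.concatenate([tokens[r::cp_size] for r in range(cp_size)])

print(dual_chunk_swap(np.arange(16), 4))  # [ 0  1 14 15  4  5 10 11  8  9  6  7 12 13  2  3]
print(striped(np.arange(16), 4))          # [ 0  4  8 12  1  5  9 13  2  6 10 14  3  7 11 15]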

src/maxtext/utils/maxtext_utils.py

Lines changed: 20 additions & 6 deletions
@@ -13,7 +13,7 @@
 # limitations under the License.

 # pylint: disable=line-too-long, disable=bare-except, consider-using-generator
-""" Utils that are only interesting to MaxText. """
+"""Utils that are only interesting to MaxText."""

 import functools
 import pickle

@@ -39,7 +39,13 @@
 import orbax.checkpoint.experimental.emergency.replicator_checkpoint_manager as emergency_replicator_checkpoint_manager

 from maxtext.configs import pyconfig
-from maxtext.common.common_types import DecoderBlockType, MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE, ShardMode
+from maxtext.common.common_types import (
+    DecoderBlockType,
+    MODEL_MODE_PREFILL,
+    MODEL_MODE_AUTOREGRESSIVE,
+    ReorderStrategy,
+    ShardMode,
+)
 from maxtext.configs import types
 from maxtext.inference.page_manager import PageState
 from maxtext.common import checkpointing

@@ -113,19 +119,27 @@ def get_functional_eval_with_signature(eval_step, data_sharding, state_mesh_shar
   return functional_eval, in_shardings, out_shardings, static_argnums, donate_argnums


-def shard_reorder_causal_load_balanced(batch, cp_size, shard_mode):
+def shard_reorder_causal_load_balanced(
+    batch, cp_size, shard_mode, reorder_strategy=ReorderStrategy.DUAL_CHUNK_SWAP, hardware="tpu"
+):
   """Shard the output of the reordered sequence."""
-  reordered = max_utils.reorder_causal_load_balanced(batch, cp_size)
+  reordered = max_utils.reorder_causal_load_balanced(batch, cp_size, reorder_strategy, hardware)
   for _, v in batch.items():
     if isinstance(v, jax.Array):
       reordered = sharding.maybe_shard_with_name(reordered, v.sharding, shard_mode)
       break
   return reordered


-def get_reorder_callable(cp_size, shard_mode):
+def get_reorder_callable(cp_size, shard_mode, reorder_strategy=ReorderStrategy.DUAL_CHUNK_SWAP, hardware="tpu"):
   """Creates a callable that can be used with map() to reorder batches."""
-  return functools.partial(shard_reorder_causal_load_balanced, cp_size=cp_size, shard_mode=shard_mode)
+  return functools.partial(
+      shard_reorder_causal_load_balanced,
+      cp_size=cp_size,
+      shard_mode=shard_mode,
+      reorder_strategy=reorder_strategy,
+      hardware=hardware,
+  )


 def get_shaped_batch(config):