Skip to content

Commit 656e150

Browse files
committed
Move block_sizes computation to top of _ulysses_attention after reshape
1 parent 71cba8d commit 656e150

1 file changed

Lines changed: 1 addition & 2 deletions

File tree

src/maxdiffusion/models/attention_flax.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -487,12 +487,11 @@ def _ulysses_attention(
487 487           "Ulysses attention requires the number of heads to be divisible by the context shard count, "
488 488           f"got heads={num_heads} and context_shards={num_shards}."
489 489       )
    490 +     block_sizes = _select_flash_block_sizes(query, key, flash_block_sizes, dtype, "flash")
490 491
491 492       q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
492 493       kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
493 494
494     -     block_sizes = _select_flash_block_sizes(query, key, flash_block_sizes, dtype, "flash")
495     -
496 495       @functools.partial(
497 496           jax.shard_map,
498 497           mesh=mesh,

0 commit comments

Comments (0)