AI-Hypercomputer
diff --git a/src/maxdiffusion/common_types.py b/src/maxdiffusion/common_types.py
Lines changed: 12 additions & 0 deletions
diff --git a/src/maxdiffusion/configs/base_wan_14b.yml b/src/maxdiffusion/configs/base_wan_14b.yml
Lines changed: 28 additions & 26 deletions
diff --git a/src/maxdiffusion/configs/base_wan_1_3b.yml b/src/maxdiffusion/configs/base_wan_1_3b.yml
Lines changed: 11 additions & 9 deletions
diff --git a/src/maxdiffusion/configs/base_wan_27b.yml b/src/maxdiffusion/configs/base_wan_27b.yml
Lines changed: 20 additions & 18 deletions
diff --git a/src/maxdiffusion/configs/base_wan_animate.yml b/src/maxdiffusion/configs/base_wan_animate.yml
Lines changed: 22 additions & 20 deletions
diff --git a/src/maxdiffusion/configs/base_wan_i2v_14b.yml b/src/maxdiffusion/configs/base_wan_i2v_14b.yml
Lines changed: 19 additions & 17 deletions
@@ -95,3 +95,15 @@
     [CROSS_ATTN_Q_LENGTH, CONTEXT],
     [CROSS_ATTN_KV_LENGTH, CONTEXT],
 ]
+
+### Common axis rules for 2D Ulysses + ring attention ###
+# Public configs shard sequence on `context`; attention code privately reshapes
+# that axis into hidden ring and Ulysses axes for the hybrid kernel.
+ULYSSES_RING_ATTENTION_AXIS_RULES = [
+    [SELF_ATTN_HEAD, None],
+    [SELF_ATTN_Q_LENGTH, CONTEXT],
+    [SELF_ATTN_KV_LENGTH, CONTEXT],
+    [CROSS_ATTN_HEAD, None],
+    [CROSS_ATTN_Q_LENGTH, CONTEXT],
+    [CROSS_ATTN_KV_LENGTH, None],
+]
@@ -20,7 +20,7 @@ metrics_file: "" # for testing, local file that stores scalar metrics. If empty,
 write_metrics: True
 
 timing_metrics_file: "" # for testing, local file that stores function timing metrics such as state creation, compilation. If empty, no metrics are written.
-write_timing_metrics: True 
+write_timing_metrics: True
 
 gcs_metrics: False
 # If true save config to GCS in {base_output_directory}/{run_name}/
@@ -64,9 +64,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Used only when attention=ulysses_ring: number of shards on the hidden Ulysses axis; the hidden ring axis gets (context shards / ulysses_shards).
+ulysses_shards: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
@@ -81,38 +83,38 @@ mask_padding_tokens: True
 attention_sharding_uniform: True
 
 flash_block_sizes: {
-  "block_q" : 512,
-  "block_kv_compute" : 512,
-  "block_kv" : 512,
-  "block_q_dkv" : 512,
-  "block_kv_dkv" : 512,
-  "block_kv_dkv_compute" : 512,
-  "block_q_dq" : 512,
-  "block_kv_dq" : 512,
+  "block_q": 512,
+  "block_kv_compute": 512,
+  "block_kv": 512,
+  "block_q_dkv": 512,
+  "block_kv_dkv": 512,
+  "block_kv_dkv_compute": 512,
+  "block_q_dq": 512,
+  "block_kv_dq": 512,
   "use_fused_bwd_kernel": False,
 }
 # Use on v6e
 # flash_block_sizes: {
-#   "block_q" : 3024,
-#   "block_kv_compute" : 1024,
-#   "block_kv" : 2048,
-#   "block_q_dkv" : 3024,
-#   "block_kv_dkv" : 2048,
-#   "block_kv_dkv_compute" : 1024,
-#   "block_q_dq" : 3024,
-#   "block_kv_dq" : 2048,
+#   "block_q": 3024,
+#   "block_kv_compute": 1024,
+#   "block_kv": 2048,
+#   "block_q_dkv": 3024,
+#   "block_kv_dkv": 2048,
+#   "block_kv_dkv_compute": 1024,
+#   "block_q_dq": 3024,
+#   "block_kv_dq": 2048,
 #   "use_fused_bwd_kernel": False,
 # }
 # Use on v5p
 # flash_block_sizes: {
-#   "block_q" : 3024,
-#   "block_kv_compute" : 1024,
-#   "block_kv" : 2048,
-#   "block_q_dkv" : 1024,
-#   "block_kv_dkv" : 3072,
-#   "block_kv_dkv_compute" : 256,
-#   "block_q_dq" : 1024,
-#   "block_kv_dq" : 3072
+#   "block_q": 3024,
+#   "block_kv_compute": 1024,
+#   "block_kv": 2048,
+#   "block_q_dkv": 1024,
+#   "block_kv_dkv": 3072,
+#   "block_kv_dkv_compute": 256,
+#   "block_q_dq": 1024,
+#   "block_kv_dq": 3072
 # }
 # GroupNorm groups
 norm_num_groups: 32
 
@@ -60,9 +60,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring, ulysses
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Used only when attention=ulysses_ring: number of shards on the hidden Ulysses axis; the hidden ring axis gets (context shards / ulysses_shards).
+ulysses_shards: -1
 flash_min_seq_length: 0
 
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
@@ -77,14 +79,14 @@ attention_sharding_uniform: True
 dropout: 0.0
 
 flash_block_sizes: {
-  "block_q" : 512,
-  "block_kv_compute" : 512,
-  "block_kv" : 512,
-  "block_q_dkv" : 512,
-  "block_kv_dkv" : 512,
-  "block_kv_dkv_compute" : 512,
-  "block_q_dq" : 512,
-  "block_kv_dq" : 512,
+  "block_q": 512,
+  "block_kv_compute": 512,
+  "block_kv": 512,
+  "block_q_dkv": 512,
+  "block_kv_dkv": 512,
+  "block_kv_dkv_compute": 512,
+  "block_q_dq": 512,
+  "block_kv_dq": 512,
   "use_fused_bwd_kernel": False,
 }
 # GroupNorm groups
 
@@ -20,7 +20,7 @@ metrics_file: "" # for testing, local file that stores scalar metrics. If empty,
 write_metrics: True
 
 timing_metrics_file: "" # for testing, local file that stores function timing metrics such as state creation, compilation. If empty, no metrics are written.
-write_timing_metrics: True 
+write_timing_metrics: True
 
 gcs_metrics: False
 # If true save config to GCS in {base_output_directory}/{run_name}/
@@ -64,9 +64,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Used only when attention=ulysses_ring: number of shards on the hidden Ulysses axis; the hidden ring axis gets (context shards / ulysses_shards).
+ulysses_shards: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
@@ -81,26 +83,26 @@ mask_padding_tokens: True
 attention_sharding_uniform: True
 
 flash_block_sizes: {
-  "block_q" : 512,
-  "block_kv_compute" : 512,
-  "block_kv" : 512,
-  "block_q_dkv" : 512,
-  "block_kv_dkv" : 512,
-  "block_kv_dkv_compute" : 512,
-  "block_q_dq" : 512,
-  "block_kv_dq" : 512,
+  "block_q": 2048,
+  "block_kv_compute": 1024,
+  "block_kv": 2048,
+  "block_q_dkv": 2048,
+  "block_kv_dkv": 2048,
+  "block_kv_dkv_compute": 1024,
+  "block_q_dq": 2048,
+  "block_kv_dq": 2048,
   "use_fused_bwd_kernel": False,
 }
 # Use on v6e
 # flash_block_sizes: {
-#   "block_q" : 3024,
-#   "block_kv_compute" : 1024,
-#   "block_kv" : 2048,
-#   "block_q_dkv" : 3024,
-#   "block_kv_dkv" : 2048,
-#   "block_kv_dkv_compute" : 2048,
-#   "block_q_dq" : 3024,
-#   "block_kv_dq" : 2048
+#   "block_q": 3024,
+#   "block_kv_compute": 1024,
+#   "block_kv": 2048,
+#   "block_q_dkv": 3024,
+#   "block_kv_dkv": 2048,
+#   "block_kv_dkv_compute": 2048,
+#   "block_q_dq": 3024,
+#   "block_kv_dq": 2048,
 #   "use_fused_bwd_kernel": False,
 # }
 # GroupNorm groups
 
@@ -20,7 +20,7 @@ metrics_file: "" # for testing, local file that stores scalar metrics. If empty,
 write_metrics: True
 
 timing_metrics_file: "" # for testing, local file that stores function timing metrics such as state creation, compilation. If empty, no metrics are written.
-write_timing_metrics: True 
+write_timing_metrics: True
 
 gcs_metrics: False
 # If true save config to GCS in {base_output_directory}/{run_name}/
@@ -62,45 +62,47 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Used only when attention=ulysses_ring: number of shards on the hidden Ulysses axis; the hidden ring axis gets (context shards / ulysses_shards).
+ulysses_shards: -1
 flash_min_seq_length: 4096
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
 # Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
 # However, when padding tokens are significant, this will lead to worse quality and should be set to True.
-mask_padding_tokens: True 
+mask_padding_tokens: True
 # Maxdiffusion has 2 types of attention sharding strategies:
 # 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
 # 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
 #    in cross attention q.
-attention_sharding_uniform: True 
+attention_sharding_uniform: True
 dropout: 0.0
 
 # Tuned for 720p (720x1280), 81 frames, CP=8 on Trillium (32MB VMEM):
 #   block_q=2048, block_kv=4096, block_kv_compute=1024
 #   ~31% faster than default (512,512,512): 389s vs 508s at 40 steps
 flash_block_sizes: {
-  "block_q" : 2048,
-  "block_kv_compute" : 1024,
-  "block_kv" : 4096,
-  "block_q_dkv" : 512,
-  "block_kv_dkv" : 512,
-  "block_kv_dkv_compute" : 512,
-  "block_q_dq" : 512,
-  "block_kv_dq" : 512,
+  "block_q": 2048,
+  "block_kv_compute": 1024,
+  "block_kv": 4096,
+  "block_q_dkv": 512,
+  "block_kv_dkv": 512,
+  "block_kv_dkv_compute": 512,
+  "block_q_dq": 512,
+  "block_kv_dq": 512,
   "use_fused_bwd_kernel": False,
 }
 # Default smaller-shape block sizes:
 # flash_block_sizes: {
-#   "block_q" : 512,
-#   "block_kv_compute" : 512,
-#   "block_kv" : 512,
-#   "block_q_dkv" : 512,
-#   "block_kv_dkv" : 512,
-#   "block_kv_dkv_compute" : 512,
-#   "block_q_dq" : 512,
-#   "block_kv_dq" : 512,
+#   "block_q": 512,
+#   "block_kv_compute": 512,
+#   "block_kv": 512,
+#   "block_q_dkv": 512,
+#   "block_kv_dkv": 512,
+#   "block_kv_dkv_compute": 512,
+#   "block_q_dq": 512,
+#   "block_kv_dq": 512,
 #   "use_fused_bwd_kernel": False,
 # }
 # GroupNorm groups
 
@@ -20,7 +20,7 @@ metrics_file: "" # for testing, local file that stores scalar metrics. If empty,
 write_metrics: True
 
 timing_metrics_file: "" # for testing, local file that stores function timing metrics such as state creation, compilation. If empty, no metrics are written.
-write_timing_metrics: True 
+write_timing_metrics: True
 
 gcs_metrics: False
 # If true save config to GCS in {base_output_directory}/{run_name}/
@@ -64,9 +64,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# Used only when attention=ulysses_ring: number of shards on the hidden Ulysses axis; the hidden ring axis gets (context shards / ulysses_shards).
+ulysses_shards: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
@@ -81,24 +83,24 @@ mask_padding_tokens: True
 attention_sharding_uniform: True
 
 flash_block_sizes: {
-  "block_q" : 2048,
-  "block_kv_compute" : 512,
-  "block_kv" : 2048,
-  "block_q_dkv" : 2048,
-  "block_kv_dkv" : 2048,
-  "block_kv_dkv_compute" : 512,
-  "use_fused_bwd_kernel" : True
+  "block_q": 2048,
+  "block_kv_compute": 512,
+  "block_kv": 2048,
+  "block_q_dkv": 2048,
+  "block_kv_dkv": 2048,
+  "block_kv_dkv_compute": 512,
+  "use_fused_bwd_kernel": True
 }
 # Use on v6e
 # flash_block_sizes: {
-#   "block_q" : 3024,
-#   "block_kv_compute" : 1024,
-#   "block_kv" : 2048,
-#   "block_q_dkv" : 3024,
-#   "block_kv_dkv" : 2048,
-#   "block_kv_dkv_compute" : 2048,
-#   "block_q_dq" : 3024,
-#   "block_kv_dq" : 2048,
+#   "block_q": 3024,
+#   "block_kv_compute": 1024,
+#   "block_kv": 2048,
+#   "block_q_dkv": 3024,
+#   "block_kv_dkv": 2048,
+#   "block_kv_dkv_compute": 2048,
+#   "block_q_dq": 3024,
+#   "block_kv_dq": 2048,
 #   "use_fused_bwd_kernel": False,
 # }
 # GroupNorm groups