AI-Hypercomputer
diff --git a/dependencies/requirements/base_requirements/requirements.txt b/dependencies/requirements/base_requirements/requirements.txt
Lines changed: 0 additions & 1 deletion
diff --git a/dependencies/requirements/generated_requirements/requirements.txt b/dependencies/requirements/generated_requirements/requirements.txt
Lines changed: 0 additions & 1 deletion
diff --git a/maxdiffusion_dependencies.Dockerfile b/maxdiffusion_dependencies.Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/src/maxdiffusion/common_types.py b/src/maxdiffusion/common_types.py
Lines changed: 12 additions & 0 deletions
diff --git a/src/maxdiffusion/configs/base_wan_14b.yml b/src/maxdiffusion/configs/base_wan_14b.yml
Lines changed: 3 additions & 6 deletions
diff --git a/src/maxdiffusion/configs/base_wan_1_3b.yml b/src/maxdiffusion/configs/base_wan_1_3b.yml
Lines changed: 3 additions & 6 deletions
diff --git a/src/maxdiffusion/configs/base_wan_27b.yml b/src/maxdiffusion/configs/base_wan_27b.yml
Lines changed: 3 additions & 6 deletions
diff --git a/src/maxdiffusion/configs/base_wan_animate.yml b/src/maxdiffusion/configs/base_wan_animate.yml
Lines changed: 3 additions & 6 deletions
diff --git a/src/maxdiffusion/configs/base_wan_i2v_14b.yml b/src/maxdiffusion/configs/base_wan_i2v_14b.yml
Lines changed: 3 additions & 6 deletions
diff --git a/src/maxdiffusion/configs/base_wan_i2v_27b.yml b/src/maxdiffusion/configs/base_wan_i2v_27b.yml
Lines changed: 3 additions & 6 deletions
@@ -2,7 +2,6 @@
 absl-py
 accelerate
 aqtp
-av
 chex
 datasets
 einops
 
@@ -15,7 +15,6 @@ astroid>=4.0.4
 astunparse>=1.6.3
 attrs>=25.4.0
 auditwheel>=6.6.0
-av>=17.0.1
 black>=25.12.0
 build>=1.4.0
 certifi>=2026.1.4
 
@@ -17,7 +17,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN python -m pip install --upgrade pip uv --no-warn-script-location
 
 # Install system dependencies
-RUN apt-get update && apt-get install -y apt-utils git curl gnupg procps iproute2 ethtool g++ && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y apt-utils git curl gnupg procps iproute2 ethtool && rm -rf /var/lib/apt/lists/*
 
 # Add the Google Cloud SDK package repository
 RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \
 
@@ -95,3 +95,15 @@
     [CROSS_ATTN_Q_LENGTH, CONTEXT],
     [CROSS_ATTN_KV_LENGTH, CONTEXT],
 ]
+
+### Common axis rules for 2D Ulysses + ring attention ###
+# Public configs shard sequence on `context`; attention code privately reshapes
+# that axis into hidden ring and Ulysses axes for the hybrid kernel.
+ULYSSES_RING_ATTENTION_AXIS_RULES = [
+    [SELF_ATTN_HEAD, None],
+    [SELF_ATTN_Q_LENGTH, CONTEXT],
+    [SELF_ATTN_KV_LENGTH, CONTEXT],
+    [CROSS_ATTN_HEAD, None],
+    [CROSS_ATTN_Q_LENGTH, CONTEXT],
+    [CROSS_ATTN_KV_LENGTH, None],
+]
@@ -41,11 +41,6 @@ revision: ''
 weights_dtype: 'bfloat16'
 # This sets the layer's dtype in the model. Ex: nn.Dense(dtype=activations_dtype)
 activations_dtype: 'bfloat16'
-# The dtype for text_encoder model during load/compile
-text_encoder_dtype: 'float32'
-
-# Whether to compile the text_encoder with torch.compile
-compile_text_encoder: False
 
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
@@ -69,9 +64,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# For attention=ulysses_ring: shard count of the hidden Ulysses axis; the ring shard count is the context shard count divided by this value.
+ulysses_shards: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
 
@@ -41,11 +41,6 @@ revision: ''
 weights_dtype: 'bfloat16'
 # This sets the layer's dtype in the model. Ex: nn.Dense(dtype=activations_dtype)
 activations_dtype: 'bfloat16'
-# The dtype for text_encoder model during load/compile
-text_encoder_dtype: 'float32'
-
-# Whether to compile the text_encoder with torch.compile
-compile_text_encoder: False
 
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
@@ -65,9 +60,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te, ring, ulysses
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# For attention=ulysses_ring: shard count of the hidden Ulysses axis; the ring shard count is the context shard count divided by this value.
+ulysses_shards: -1
 flash_min_seq_length: 0
 
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
 
@@ -41,11 +41,6 @@ revision: ''
 weights_dtype: 'bfloat16'
 # This sets the layer's dtype in the model. Ex: nn.Dense(dtype=activations_dtype)
 activations_dtype: 'bfloat16'
-# The dtype for text_encoder model during load/compile
-text_encoder_dtype: 'float32'
-
-# Whether to compile the text_encoder with torch.compile
-compile_text_encoder: False
 
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
@@ -69,9 +64,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# For attention=ulysses_ring: shard count of the hidden Ulysses axis; the ring shard count is the context shard count divided by this value.
+ulysses_shards: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
 
@@ -41,11 +41,6 @@ revision: ''
 weights_dtype: 'bfloat16'
 # This sets the layer's dtype in the model. Ex: nn.Dense(dtype=activations_dtype)
 activations_dtype: 'bfloat16'
-# The dtype for text_encoder model during load/compile
-text_encoder_dtype: 'float32'
-
-# Whether to compile the text_encoder with torch.compile
-compile_text_encoder: False
 
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
@@ -67,9 +62,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# For attention=ulysses_ring: shard count of the hidden Ulysses axis; the ring shard count is the context shard count divided by this value.
+ulysses_shards: -1
 flash_min_seq_length: 4096
 # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
 # Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
 
@@ -41,11 +41,6 @@ revision: ''
 weights_dtype: 'bfloat16'
 # This sets the layer's dtype in the model. Ex: nn.Dense(dtype=activations_dtype)
 activations_dtype: 'bfloat16'
-# The dtype for text_encoder model during load/compile
-text_encoder_dtype: 'float32'
-
-# Whether to compile the text_encoder with torch.compile
-compile_text_encoder: False
 
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
@@ -69,9 +64,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# For attention=ulysses_ring: shard count of the hidden Ulysses axis; the ring shard count is the context shard count divided by this value.
+ulysses_shards: -1
 flash_min_seq_length: 4096
 dropout: 0.0
 
 
@@ -41,11 +41,6 @@ revision: ''
 weights_dtype: 'bfloat16'
 # This sets the layer's dtype in the model. Ex: nn.Dense(dtype=activations_dtype)
 activations_dtype: 'bfloat16'
-# The dtype for text_encoder model during load/compile
-text_encoder_dtype: 'float32'
-
-# Whether to compile the text_encoder with torch.compile
-compile_text_encoder: False
 
 # Replicates vae across devices instead of using the model's sharding annotations for sharding.
 replicate_vae: False
@@ -69,9 +64,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# For attention=ulysses_ring: shard count of the hidden Ulysses axis; the ring shard count is the context shard count divided by this value.
+ulysses_shards: -1
 flash_min_seq_length: 4096
 dropout: 0.0