AI-Hypercomputer
diff --git a/‎src/maxtext/common/common_types.py‎
Lines changed: 1 addition & 0 deletions b/‎src/maxtext/common/common_types.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/maxtext/configs/base.yml‎
Lines changed: 6 additions & 0 deletions b/‎src/maxtext/configs/base.yml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/maxtext/configs/types.py‎
Lines changed: 12 additions & 0 deletions b/‎src/maxtext/configs/types.py‎
Lines changed: 12 additions & 0 deletions
@@ -120,6 +120,7 @@ class AttentionType(enum.Enum):
   LOCAL_SLIDING = "local_sliding"
   CHUNK = "chunk"
   MLA = "mla"
+  COMPRESSED = "compressed"
   FULL = "full"
 
 
 
@@ -411,6 +411,12 @@ qk_nope_head_dim: 128
 qk_rope_head_dim: 64
 v_head_dim: 128
 
+# Compressed Attention parameters
+o_lora_rank: 0  # Output LoRA rank for Compressed Attention
+o_groups: 0  # Output groups for Compressed Attention
+compress_ratios: []  # Per-layer compression ratios (e.g. 0, 4, 128)
+compressed_rope_max_timescale: 160_000 # RoPE max timescale for Compressed Sparse/Heavy Attention; used only if positive
+
 # QK-Clip (Muon Clip) Configuration
 use_qk_clip: false  # Enable QK-Clip (supported in MLA with DotProduct or Tokamax Splash)
 qk_clip_threshold: 100.0  # Threshold for clipping (tau in the paper)
 
@@ -636,6 +636,17 @@ class MlaAttention(BaseModel):
   v_head_dim: NonNegativeInt = Field(128, description="Dimension of V heads in MLA.")
 
 
+class CompressedAttention(BaseModel):
+  """Configuration for Compressed Attention."""
+
+  o_lora_rank: NonNegativeInt = Field(0, description="Output LoRA rank for Compressed Attention.")
+  o_groups: NonNegativeInt = Field(0, description="Output groups for Compressed Attention.")
+  compress_ratios: list[int] = Field(default_factory=list, description="Per-layer compression ratios (0, 4, 128, etc).")
+  compressed_rope_max_timescale: int = Field(
+      160000, description="If positive, used for Compressed Sparse/Heavy Attention."
+  )
+
+
 class AttentionIndexer(BaseModel):
   """Configuration for DeepSeek Sparse Attention (DSA): DeepSeek3.2-style MLA with indexer."""
 
@@ -2254,6 +2265,7 @@ class MaxTextConfig(
     # Attention Mechanisms
     Attention,
     MlaAttention,
+    CompressedAttention,
     MoBa,
     AttentionIndexer,
     Llama4Attention,