AI-Hypercomputer
diff --git a/.github/workflows/run_jupyter_notebooks.yml b/.github/workflows/run_jupyter_notebooks.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml
Lines changed: 2 additions & 2 deletions
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/maxtext/inference/kvcache.py b/src/maxtext/inference/kvcache.py
Lines changed: 105 additions & 74 deletions
diff --git a/src/maxtext/inference/paged_attention.py b/src/maxtext/inference/paged_attention.py
Lines changed: 21 additions & 21 deletions
diff --git a/src/maxtext/layers/attention_mla.py b/src/maxtext/layers/attention_mla.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/maxtext/layers/attention_op.py b/src/maxtext/layers/attention_op.py
Lines changed: 3 additions & 3 deletions
diff --git a/src/maxtext/layers/embeddings.py b/src/maxtext/layers/embeddings.py
Lines changed: 2 additions & 2 deletions
diff --git a/src/maxtext/layers/engram.py b/src/maxtext/layers/engram.py
Lines changed: 11 additions & 11 deletions
diff --git a/src/maxtext/layers/initializers.py b/src/maxtext/layers/initializers.py
Lines changed: 9 additions & 8 deletions
@@ -94,7 +94,7 @@ jobs:
             PAPERMILL_EXE=".venv/bin/papermill"
             source .venv/bin/activate
           fi
-          export PYTHONPATH="${pwd}/src${PYTHONPATH:+:${PYTHONPATH}}"
+          export PYTHONPATH="${PWD}/src${PYTHONPATH:+:${PYTHONPATH}}"
 
           export MAXTEXT_REPO_ROOT=$(pwd)
           export MAXTEXT_PKG_DIR=$(pwd)/src/maxtext
 
@@ -138,7 +138,7 @@ jobs:
             PYTHON_EXE=".venv/bin/python3"
             # Ensure pytest-cov is available and enable coverage flags
             uv pip install pytest-cov
-            PYTEST_COV_ARGS="--cov=MaxText --cov=maxtext --cov-report=xml --cov-report=term"
+            PYTEST_COV_ARGS="--cov=maxtext --cov-report=xml --cov-report=term"
           fi
           export PYTHONPATH="${PWD}/src${PYTHONPATH:+:${PYTHONPATH}}"
 
@@ -208,7 +208,7 @@ jobs:
         continue-on-error: true
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          file: ./coverage.xml
+          files: ./coverage.xml
           # If scheduled, upload to scheduled flag only. If PR, upload to regular flag only.
           flags: ${{ inputs.is_scheduled_run == 'true' && 'scheduled' || 'regular' }}
           verbose: true
@@ -848,7 +848,7 @@ class HardwareAndMesh(BaseModel):
       description="Strategy for context parallelism ('all_gather' or 'ring').",
   )
   context_parallel_reorder_strategy: ReorderStrategy = Field(
-      "auto",
+      ReorderStrategy.AUTO,
       description="Reorder strategy for load-balanced context parallelism.",
   )
   custom_mesh: str = Field("", description="Available options: ['hybrid_ring_64x4', 'hybrid_ring_32x8']")
 
@@ -170,22 +170,22 @@ def __init__(
 
     self.key_pages = nnx.Cache(
         jnp.zeros(self.kv_pages_shape, dtype=self.dtype),
-        sharding=self.kv_pages_axis_names,
+        out_sharding=self.kv_pages_axis_names,
     )
     self.value_pages = nnx.Cache(
         jnp.zeros(self.kv_pages_shape, dtype=self.dtype),
-        sharding=self.kv_pages_axis_names,
+        out_sharding=self.kv_pages_axis_names,
     )
 
   def _maybe_materialize_cache(self, cache: nnx.Cache) -> nnx.Cache:
     """Materializes the cache if it's currently a ShapeDtypeStruct."""
-    if isinstance(cache.value, jax.ShapeDtypeStruct):
+    if isinstance(cache.get_value(), jax.ShapeDtypeStruct):
       # This is needed because the Linen bridge lazily creates this state. We
       # need to ensure the cache state is accessible at runtime.
       # TODO: Delete this function when the to_linen bridge is no longer needed.
       return nnx.Cache(
           jnp.zeros(self.kv_pages_shape, dtype=self.dtype),
-          sharding=cache.sharding,
+          out_sharding=cache.get_metadata("out_sharding"),
       )
     return cache
 
@@ -204,8 +204,8 @@ def get_kv_pages(self):
     self.key_pages = self._maybe_materialize_cache(self.key_pages)
     self.value_pages = self._maybe_materialize_cache(self.value_pages)
 
-    self.key_pages.value = nn.with_logical_constraint(self.key_pages.value, self.kv_pages_axis_names)
-    self.value_pages.value = nn.with_logical_constraint(self.value_pages.value, self.kv_pages_axis_names)
+    self.key_pages.set_value(nn.with_logical_constraint(self.key_pages.get_value(), self.kv_pages_axis_names))
+    self.value_pages.set_value(nn.with_logical_constraint(self.value_pages.get_value(), self.kv_pages_axis_names))
     return self.key_pages, self.value_pages
 
   def pad_qkv(self, *qkv):
@@ -264,9 +264,9 @@ def paged_attention_v2_prefill(
     is the batch_size is only 1
     """
     assert query.shape[0] == 1  # ensure the batch size is 1
-    # shape of key_pages_cache.value is [num_kv_heads, num_pages, tokens_per_page, head_dim]
-    k_p = jnp.permute_dims(key_pages_cache.value, (1, 2, 0, 3))
-    v_p = jnp.permute_dims(value_pages_cache.value, (1, 2, 0, 3))
+    # shape of key_pages_cache.get_value() is [num_kv_heads, num_pages, tokens_per_page, head_dim]
+    k_p = jnp.permute_dims(key_pages_cache.get_value(), (1, 2, 0, 3))
+    v_p = jnp.permute_dims(value_pages_cache.get_value(), (1, 2, 0, 3))
     c_q_l = jnp.array([0, page_state.sequence_lengths[0]])  # [0, prefill_true_length]
     num_seqs = jnp.array([1])
     query = query[0]  # [batch_size, max_num_tokens, num_kv_heads, head_dim] to [max_num_tokens, num_kv_heads, head_dim]
@@ -294,8 +294,8 @@ def paged_attention_v2_decode(
     """Apply ragged input Paged Attention in decode only."""
     batch_size = query.shape[0]
     query = jnp.squeeze(query, axis=1)  # [batch_size, seq_len, n_kv_head, head_dim] to [batch_size, n_kv_head, head_dim]
-    k_p = jnp.permute_dims(key_pages_cache.value, (1, 2, 0, 3))
-    v_p = jnp.permute_dims(value_pages_cache.value, (1, 2, 0, 3))
+    k_p = jnp.permute_dims(key_pages_cache.get_value(), (1, 2, 0, 3))
+    v_p = jnp.permute_dims(value_pages_cache.get_value(), (1, 2, 0, 3))
     c_q_l = jnp.arange(batch_size + 1)  # one token per sequence
     num_seqs = jnp.array([batch_size])  # real number of requests, set it to batch_size
     result = paged_attention_kernel_v2.ragged_paged_attention(
@@ -352,8 +352,8 @@ def wrap_paged_attention(q, k_pages, v_pages, lengths, page_indices, pages_per_c
 
     return wrap_paged_attention(
         query,
-        key_pages_cache.value,
-        value_pages_cache.value,
+        key_pages_cache.get_value(),
+        value_pages_cache.get_value(),
         page_state.sequence_lengths,
         page_state.page_map,
         self.pages_per_compute_block,
@@ -441,12 +441,12 @@ def update_prefill_step_pages(
     ), f"prefill_step key/value should have the same shape, but getting {key.shape=} and {value.shape=} instead"
     batch_size, seq_len, n_kv_head, head_dim = key.shape
     assert seq_len % self.tokens_per_page == 0, f"seq_length {seq_len} and  tokens_per_page {self.tokens_per_page}"
-    assert key_pages_cache.value.shape == value_pages_cache.value.shape, (
+    assert key_pages_cache.get_value().shape == value_pages_cache.get_value().shape, (
         f"prefill_step key/value_pages_cache should have the same shape, but "
         f"getting {key_pages_cache.shape=} and {value_pages_cache.shape=} instead"
     )
 
-    v_n_kv, _, v_p, v_d = key_pages_cache.value.shape
+    v_n_kv, _, v_p, v_d = key_pages_cache.get_value().shape
     assert v_n_kv == n_kv_head, f"{v_n_kv=} {n_kv_head=}"
     assert v_p == self.tokens_per_page, f"{v_p=} {self.tokens_per_page=}"
     assert v_d == head_dim, f"{v_d=} {head_dim=}"
@@ -485,13 +485,13 @@ def update_prefill_step_pages(
         ),
     )
 
-    key_pages_cache.value = nn.with_logical_constraint(key, self.kv_pages_axis_names)
-    value_pages_cache.value = nn.with_logical_constraint(value, self.kv_pages_axis_names)
+    key_pages_cache.set_value(nn.with_logical_constraint(key, self.kv_pages_axis_names))
+    value_pages_cache.set_value(nn.with_logical_constraint(value, self.kv_pages_axis_names))
 
   def update_decode_step_pages(self, key_pages_cache, value_pages_cache, key, value, page_state):
     """Update decode-step pages"""
-    key_pages = key_pages_cache.value
-    value_pages = value_pages_cache.value
+    key_pages = key_pages_cache.get_value()
+    value_pages = value_pages_cache.get_value()
 
     batch_size, _, kv_heads, head_dim = key.shape
     kv_heads, _, _, head_dim = key_pages.shape
@@ -511,6 +511,6 @@ def update_decode_step_pages(self, key_pages_cache, value_pages_cache, key, valu
     key_pages_updated = key_pages.at[kv_indices, broadcast_pages, broadcast_pos].set(new_key)
     value_pages_updated = value_pages.at[kv_indices, broadcast_pages, broadcast_pos].set(new_value)
 
-    key_pages_cache.value = key_pages_updated
-    value_pages_cache.value = value_pages_updated
+    key_pages_cache.set_value(key_pages_updated)
+    value_pages_cache.set_value(value_pages_updated)
     return key_pages_cache, value_pages_cache
@@ -1200,7 +1200,7 @@ def __call__(
             sparse_loss=self.config.indexer_sparse_training,
             scaling_factor=self.config.indexer_loss_scaling_factor,
         )
-        self.sow(nnx.Intermediate, "indexer_loss", indexer_loss)
+        self.indexer_loss = nnx.Intermediate(indexer_loss)
 
     # Check if we need QK Clip stats
     use_qk_clip = self.model_mode == MODEL_MODE_TRAIN and self.config.use_qk_clip
 
@@ -902,7 +902,7 @@ def apply_attention(
 
       local_out, local_max, local_sum = impl(query, key, value, lengths, self.ragged_block_size)
       if record_max_logits:
-        self.sow("intermediates", "max_logits", local_max)
+        self.max_logits = nnx.Intermediate(local_max)
       return local_out, local_max, local_sum
 
     # 'vllm_rpa' uses the same dot-attention wrapper but routes to the vLLM
@@ -951,7 +951,7 @@ def apply_attention(
             record_max_logits=record_max_logits,
         )
         if max_logits is not None:
-          self.sow("intermediates", "max_logits", max_logits)
+          self.max_logits = nnx.Intermediate(max_logits)
         return out, None, None
 
       else:
@@ -1861,7 +1861,7 @@ def apply_attention_dot(
       max_logits_per_group = jnp.max(attn_weights, axis=(-2, -1))
       b, n_kv, g = max_logits_per_group.shape
       max_logits = max_logits_per_group.reshape(b, n_kv * g)
-      self.sow("intermediates", "max_logits", max_logits)
+      self.max_logits = nnx.Intermediate(max_logits)
 
     return self.compute_local_attention(attn_weights, value, q_seq_len, model_mode, wv_product_einsum, sinks)
 
 
@@ -152,7 +152,7 @@ def __call__(self, inputs: Array, model_mode: str = MODEL_MODE_TRAIN) -> Array:
       raise ValueError("Input type must be an integer or unsigned integer.")
 
     embedding = jnp.asarray(
-        _maybe_move_embedding_to_device(self.embedding.value, self.config),
+        _maybe_move_embedding_to_device(self.embedding.get_value(), self.config),
         self.dtype,
     )
 
@@ -196,7 +196,7 @@ def attend(self, query: Array, out_sharding: NamedSharding | None = None) -> Arr
       Commonly used for weight-sharing between embeddings and logit transform
       in NLP models.
     """
-    embedding = self.embedding.value
+    embedding = self.embedding.get_value()
     attend_dtype = self.attend_dtype if self.attend_dtype is not None else self.dtype
     return attend_on_embedding(query, embedding, attend_dtype, self.config, out_sharding)
 
 
@@ -15,7 +15,7 @@
 """
 DeepSeek-AI, `Conditional Memory via Scalable Lookup: A New Axis of Sparsity for Large Language Models
   <https://arxiv.org/pdf/2601.07372>`_, 2026
-  
+
 Reference implementation: https://github.com/deepseek-ai/Engram/blob/main/engram_demo_v1.py
 """
 
@@ -53,7 +53,7 @@ class CompressedTokenizer:
   def __init__(self, tokenizer: HFTokenizer):
     normalizer = self._build_normalizer()
     self.lookup_table_np, self.num_new_token = self._build_lookup_table(tokenizer, normalizer)
-    self.lookup_table = jnp.array(self.lookup_table_np, dtype=jnp.int64)
+    self.lookup_table = jnp.array(self.lookup_table_np, dtype=jnp.int32)
 
   def __len__(self) -> int:
     return self.num_new_token
@@ -125,7 +125,7 @@ def __call__(self, input_ids) -> Array:
     """
     Maps original token IDs to compressed IDs.
     """
-    input_ids = jnp.asarray(input_ids, dtype=jnp.int64)
+    input_ids = jnp.asarray(input_ids, dtype=jnp.int32)
 
     # Map negative IDs to 0 for lookup, then mask output back.
     safe_ids = jnp.where(input_ids < 0, 0, input_ids)
@@ -187,7 +187,7 @@ def __init__(
     # Pre-calculate odd multipliers for hashing: {layer_id: multipliers}
     # Store as JAX arrays
     self.layer_multipliers = {
-        k: jnp.array(v, dtype=jnp.int64) for k, v in self._calculate_multipliers_across_layers(seed).items()
+        k: jnp.array(v, dtype=jnp.int32) for k, v in self._calculate_multipliers_across_layers(seed).items()
     }
 
     # Pre-calculate unique prime vocab sizes for every head
@@ -201,9 +201,9 @@ def _calculate_multipliers_across_layers(self, seed: int) -> dict[int, np.ndarra
     Returns:
       A dictionary mapping layer_id to a list of `max_ngram_size` multipliers.
     """
-    # Pre-calculate bounds for random generation
-    max_long = np.iinfo(np.int64).max
-    m_max = int(max_long // self.tokenizer_vocab_size)
+    # Pre-calculate bounds for random generation using int32 to avoid overflow
+    max_int = np.iinfo(np.int32).max
+    m_max = int(max_int // self.tokenizer_vocab_size)
     half_bound = max(1, m_max // 2)
     # Hard-code prime number to align with reference
     LAYER_PRIME_OFFSET = 10007
@@ -214,7 +214,7 @@ def _calculate_multipliers_across_layers(self, seed: int) -> dict[int, np.ndarra
       layer_seed = int(seed + LAYER_PRIME_OFFSET * int(layer_id))
       np_rng = np.random.default_rng(layer_seed)
       # Generate random odd integers
-      random_value = np_rng.integers(low=0, high=half_bound, size=(self.max_ngram_size,), dtype=np.int64)
+      random_value = np_rng.integers(low=0, high=half_bound, size=(self.max_ngram_size,), dtype=np.int32)
       multipliers = random_value * 2 + 1
       layer_multipliers[layer_id] = multipliers
     return layer_multipliers
@@ -272,7 +272,7 @@ def _get_ngram_hashes(self, compressed_ids: Array, layer_id: int) -> Array:
     Returns:
       hash_ids: [B, S, H_total] where H_total = H * num_ngram_orders
     """
-    x = jnp.asarray(compressed_ids, dtype=jnp.int64)
+    x = jnp.asarray(compressed_ids, dtype=jnp.int32)
     B, _ = x.shape
 
     # 1. Create Sliding Windows via Shifting
@@ -282,7 +282,7 @@ def _get_ngram_hashes(self, compressed_ids: Array, layer_id: int) -> Array:
         shifted_inputs.append(x)
       else:
         # Pre-allocate full array with PAD_ID
-        padding = jnp.full((B, k), self.pad_id, dtype=jnp.int64)
+        padding = jnp.full((B, k), self.pad_id, dtype=jnp.int32)
         # Fast memory copy, slicing and assignment
         # e.g., k=1, [PAD, The, cat]
         #       k=2, [PAD, PAD, The]
@@ -309,7 +309,7 @@ def _get_ngram_hashes(self, compressed_ids: Array, layer_id: int) -> Array:
 
       # Retrieve prime vocab sizes for all heads of this n-gram order
       vocab_sizes_for_this_gram = vocab_sizes[n - 2]
-      mods = jnp.array(vocab_sizes_for_this_gram, dtype=jnp.int64)
+      mods = jnp.array(vocab_sizes_for_this_gram, dtype=jnp.int32)
 
       # Broadcast Modulo: Map hash to valid table indices
       # [B, S, 1] % [H] -> [B, S, H]
 
@@ -60,10 +60,10 @@ def init_fn(key, shape, dtype, in_axis, out_axis):
   return init_fn
 
 
-def variable_to_logically_partitioned(variable: nnx.VariableState):
+def variable_to_logically_partitioned(variable: nnx.Variable):
   """Wraps an NNX variable's value in `nn.LogicallyPartitioned`.
 
-  This function inspects the metadata of an `nnx.VariableState` object. If
+  This function inspects the metadata of an `nnx.Variable` object. If
   sharding information ('out_sharding', 'sharding' or 'sharding_names') is
   present, it wraps the variable's value in `nn.LogicallyPartitioned` to apply
   the specified sharding constraints.
@@ -73,16 +73,17 @@ def variable_to_logically_partitioned(variable: nnx.VariableState):
   wrapping.
 
   Args:
-    variable: The `nnx.VariableState` object to process.
+    variable: The `nnx.Variable` object to process.
 
   Returns:
     The variable's value, potentially wrapped in `nn.LogicallyPartitioned`.
   """
-  if isinstance(variable.value, aqt_tensor.QTensor):
-    return variable.value
+  val = variable.get_value()
+  if isinstance(val, aqt_tensor.QTensor):
+    return val
 
   if variable.type.__name__ == "_overwrite_with_gradient":
-    return variable.value
+    return val
 
   metadata = variable.get_metadata()
   out_sharding = None
@@ -95,10 +96,10 @@ def variable_to_logically_partitioned(variable: nnx.VariableState):
 
   if out_sharding is not None:
     return nn.LogicallyPartitioned(  # type: ignore[wrong-keyword-args]
-        variable.value,
+        val,
         out_sharding,  # type: ignore[arg-type]
         mesh=metadata.get("mesh"),
         rules=metadata.get("rules"),
     )
   else:
-    return variable.value
+    return val
Original file line number	Diff line number	Diff line change
`@@ -848,7 +848,7 @@ class HardwareAndMesh(BaseModel):`
`848`	`848`	`description="Strategy for context parallelism ('all_gather' or 'ring').",`
`849`	`849`	`)`
`850`	`850`	`context_parallel_reorder_strategy: ReorderStrategy = Field(`
`851`		`- "auto",`
	`851`	`+ ReorderStrategy.AUTO,`
`852`	`852`	`description="Reorder strategy for load-balanced context parallelism.",`
`853`	`853`	`)`
`854`	`854`	`custom_mesh: str = Field("", description="Available options: ['hybrid_ring_64x4', 'hybrid_ring_32x8']")`
Original file line number	Diff line number	Diff line change
`@@ -1200,7 +1200,7 @@ def __call__(`
`1200`	`1200`	`sparse_loss=self.config.indexer_sparse_training,`
`1201`	`1201`	`scaling_factor=self.config.indexer_loss_scaling_factor,`
`1202`	`1202`	`)`
`1203`		`- self.sow(nnx.Intermediate, "indexer_loss", indexer_loss)`
	`1203`	`+ self.indexer_loss = nnx.Intermediate(indexer_loss)`
`1204`	`1204`
`1205`	`1205`	`# Check if we need QK Clip stats`
`1206`	`1206`	`use_qk_clip = self.model_mode == MODEL_MODE_TRAIN and self.config.use_qk_clip`