AI-Hypercomputer
diff --git a/src/maxtext/layers/attention_mla.py b/src/maxtext/layers/attention_mla.py
Lines changed: 1 addition & 137 deletions
diff --git a/src/maxtext/layers/attention_op.py b/src/maxtext/layers/attention_op.py
Lines changed: 0 additions & 95 deletions
diff --git a/src/maxtext/layers/attentions.py b/src/maxtext/layers/attentions.py
Lines changed: 0 additions & 10 deletions
@@ -63,9 +63,8 @@
     DEFAULT_MASK_VALUE,
 )
 
-from maxtext.layers import nnx_wrappers
 from maxtext.layers.attentions import Attention
-from maxtext.layers.initializers import nd_dense_init, NdInitializer, variable_to_logically_partitioned
+from maxtext.layers.initializers import nd_dense_init, NdInitializer
 from maxtext.layers.linears import DenseGeneral
 from maxtext.layers.normalizations import RMSNorm
 from maxtext.layers.quantizations import AqtQuantization as Quant
@@ -381,141 +380,6 @@ def __call__(
     return indexer_mask, topk_indices, indexer_score
 
 
-def mla_as_linen(
-    *,
-    config: Config,
-    num_query_heads: int,
-    num_kv_heads: int,
-    head_dim: int,
-    max_target_length: int,
-    mesh: Mesh,
-    attention_kernel: str,
-    inputs_q_shape: Tuple,
-    inputs_kv_shape: Tuple,
-    dtype: DType = jnp.float32,
-    weight_dtype: DType = jnp.float32,
-    max_prefill_predict_length: int = -1,
-    dropout_rate: float = 0.0,
-    kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal"),
-    float32_qk_product: bool = False,  # computes logits in float32 for stability.
-    float32_logits: bool = False,  # cast logits in float32 for stability.
-    quant: Optional[Quant] = None,
-    kv_quant: Optional[KVQuant] = None,
-    attention_type: AttentionType = AttentionType.MLA,  # Default to MLA attention
-    attn_logits_soft_cap: float | None = None,
-    sliding_window_size: int | None = None,
-    use_ragged_attention: bool = False,
-    ragged_block_size: int = 256,
-    use_qk_norm: bool = False,
-    query_pre_attn_scalar: float | None = None,
-    use_bias_in_projections: bool = False,  # Set to True will enable bias in q, k, v, o projections
-    # Temperature tuning parameters used for Llama4
-    temperature_tuning: bool = False,
-    temperature_tuning_scale: float = 0.1,
-    temperature_tuning_floor_scale: float = 8192.0,
-    # Shard the query activation as the same as the key and value.
-    # TODO: Find a better sharding axis name.
-    # TODO: Further break down the Training and Inference axes for the q, k, v.
-    prefill_query_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
-    prefill_key_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
-    prefill_value_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, KV_HEAD, KV_HEAD_DIM),
-    query_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
-    key_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
-    value_axis_names: AxisNames = (KV_BATCH, LENGTH, KV_HEAD, KV_HEAD_DIM),
-    input_axis_names: AxisNames = (BATCH_ATTN, LENGTH, EMBED),
-    out_axis_names: AxisNames = (BATCH_ATTN, LENGTH, HEAD, D_KV),
-    prefill_input_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, EMBED),
-    decode_input_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, EMBED),
-    prefill_out_axis_names: AxisNames = (PREFILL_KV_BATCH, PREFILL_LENGTH, HEAD, D_KV),
-    decode_out_axis_names: AxisNames = (DECODE_BATCH, DECODE_LENGTH, HEAD, D_KV),
-    prefill_cache_axis_order: AxisIdxes = (1, 2, 0, 3),
-    ar_cache_axis_order: AxisIdxes = (1, 2, 0, 3),
-    compute_axis_order: AxisIdxes = (0, 1, 2, 3),
-    reshape_q: bool = False,
-    is_nope_layer: bool = False,
-    is_vision: bool = False,
-    model_mode: str = MODEL_MODE_TRAIN,
-    q_lora_rank: int = 0,
-    kv_lora_rank: int = 512,
-    qk_nope_head_dim: int = 128,
-    qk_rope_head_dim: int = 64,
-    v_head_dim: int = 128,
-    max_position_embeddings: int = 4096 * 4,
-    original_max_position_embeddings: int = 4096,
-    mscale: float = 1.0,  # scaling factor for softmax
-    rope_factor: float = 40.0,  # rotary embedding factor
-    name: str | None = None,
-):
-  """A factory function to create an MLA as a Linen module.
-
-  This function serves as a bridge to use the NNX-based `MLA` within a
-  Linen model.
-  """
-  return nnx_wrappers.to_linen(
-      MLA,
-      config=config,
-      num_query_heads=num_query_heads,
-      num_kv_heads=num_kv_heads,
-      head_dim=head_dim,
-      max_target_length=max_target_length,
-      mesh=mesh,
-      attention_kernel=attention_kernel,
-      inputs_q_shape=inputs_q_shape,
-      inputs_kv_shape=inputs_kv_shape,
-      dtype=dtype,
-      weight_dtype=weight_dtype,
-      max_prefill_predict_length=max_prefill_predict_length,
-      dropout_rate=dropout_rate,
-      kernel_init=kernel_init,
-      float32_qk_product=float32_qk_product,
-      float32_logits=float32_logits,
-      quant=quant,
-      kv_quant=kv_quant,
-      attention_type=attention_type,
-      attn_logits_soft_cap=attn_logits_soft_cap,
-      sliding_window_size=sliding_window_size,
-      use_ragged_attention=use_ragged_attention,
-      ragged_block_size=ragged_block_size,
-      use_qk_norm=use_qk_norm,
-      query_pre_attn_scalar=query_pre_attn_scalar,
-      use_bias_in_projections=use_bias_in_projections,
-      temperature_tuning=temperature_tuning,
-      temperature_tuning_scale=temperature_tuning_scale,
-      temperature_tuning_floor_scale=temperature_tuning_floor_scale,
-      prefill_query_axis_names=prefill_query_axis_names,
-      prefill_key_axis_names=prefill_key_axis_names,
-      prefill_value_axis_names=prefill_value_axis_names,
-      query_axis_names=query_axis_names,
-      key_axis_names=key_axis_names,
-      value_axis_names=value_axis_names,
-      input_axis_names=input_axis_names,
-      out_axis_names=out_axis_names,
-      prefill_input_axis_names=prefill_input_axis_names,
-      decode_input_axis_names=decode_input_axis_names,
-      prefill_out_axis_names=prefill_out_axis_names,
-      decode_out_axis_names=decode_out_axis_names,
-      prefill_cache_axis_order=prefill_cache_axis_order,
-      ar_cache_axis_order=ar_cache_axis_order,
-      compute_axis_order=compute_axis_order,
-      reshape_q=reshape_q,
-      is_nope_layer=is_nope_layer,
-      is_vision=is_vision,
-      model_mode=model_mode,
-      q_lora_rank=q_lora_rank,
-      kv_lora_rank=kv_lora_rank,
-      qk_nope_head_dim=qk_nope_head_dim,
-      qk_rope_head_dim=qk_rope_head_dim,
-      v_head_dim=v_head_dim,
-      max_position_embeddings=max_position_embeddings,
-      original_max_position_embeddings=original_max_position_embeddings,
-      mscale=mscale,
-      rope_factor=rope_factor,
-      name=name,
-      metadata_fn=variable_to_logically_partitioned,
-      abstract_init=False,
-  )
-
-
 class MLA(Attention):
   """Multi-Head Latent Attention (MLA) layer."""
 
 
@@ -69,7 +69,6 @@
 from maxtext.kernels.attention.ragged_attention import ragged_gqa
 from maxtext.kernels.attention.ragged_attention import ragged_mha
 from maxtext.layers import nnx_wrappers
-from maxtext.layers.initializers import variable_to_logically_partitioned
 from maxtext.layers.quantizations import AqtQuantization as Quant
 from maxtext.utils import max_utils
 from maxtext.utils.sharding import logical_to_mesh_axes, maybe_shard_with_pspec
@@ -285,100 +284,6 @@ def _make_bidirectional_block_mask(bidirectional_mask):
   return bidirectional_block_mask
 
 
-def attention_op_as_linen(
-    *,
-    config: Config,
-    mesh: Mesh,
-    attention_kernel: str,
-    max_target_length: int,
-    num_query_heads: int,
-    num_kv_heads: int,
-    float32_qk_product: bool = False,
-    max_prefill_predict_length: int = -1,
-    float32_logits: bool = False,
-    flash_axis_names_q: AxisNames = (BATCH_ATTN, HEAD, LENGTH, D_KV),
-    flash_axis_names_kv: AxisNames = (BATCH_ATTN, HEAD, KV_LENGTH, D_KV),
-    flash_axis_names_splash_kernel: AxisNames = (HEAD, LENGTH),
-    prefill_cache_logical_axis_names: AxisNames = (
-        CACHE_BATCH_PREFILL,
-        CACHE_SEQUENCE,
-        CACHE_HEADS,
-        CACHE_KV,
-    ),
-    cache_logical_axis_names: AxisNames = (
-        CACHE_BATCH,
-        CACHE_SEQUENCE,
-        CACHE_HEADS,
-        CACHE_KV,
-    ),
-    cache_scale_logical_axis_names: AxisNames = (
-        CACHE_SCALE_BATCH,
-        CACHE_SCALE_SEQUENCE,
-        CACHE_SCALE_HEADS,
-        CACHE_SCALE_KV,
-    ),
-    ragged_qkv_axis_names: AxisNames = (
-        CACHE_BATCH,
-        CACHE_HEADS,
-        CACHE_SEQUENCE,
-        CACHE_KV,
-    ),
-    ragged_lengths_names: AxisNames = (CACHE_BATCH,),
-    compute_axis_order: AxisIdxes = (0, 1, 2, 3),
-    key_axis_order: AxisIdxes = (2, 0, 1, 3),
-    reshape_q: bool = False,
-    dropout_rate: float = 0.0,
-    dtype: DType = jnp.float32,
-    quant: Optional[Quant] = None,
-    kv_quant: Optional[KVQuant] = None,
-    attention_type: AttentionType = AttentionType.GLOBAL,  # Default to global attention
-    attn_logits_soft_cap: float | None = None,
-    sliding_window_size: int | None = None,
-    chunk_attn_window_size: int | None = None,
-    use_ragged_attention: bool = False,
-    ragged_block_size: int = 256,
-):
-  """A factory function to create an AttentionOp as a Linen module.
-
-  This function serves as a bridge to use the NNX-based `AttentionOp` within a
-  Linen model.
-  """
-  return nnx_wrappers.to_linen(
-      AttentionOp,
-      config=config,
-      mesh=mesh,
-      attention_kernel=attention_kernel,
-      max_target_length=max_target_length,
-      num_query_heads=num_query_heads,
-      num_kv_heads=num_kv_heads,
-      float32_qk_product=float32_qk_product,
-      max_prefill_predict_length=max_prefill_predict_length,
-      float32_logits=float32_logits,
-      flash_axis_names_q=flash_axis_names_q,
-      flash_axis_names_kv=flash_axis_names_kv,
-      flash_axis_names_splash_kernel=flash_axis_names_splash_kernel,
-      prefill_cache_logical_axis_names=prefill_cache_logical_axis_names,
-      cache_logical_axis_names=cache_logical_axis_names,
-      cache_scale_logical_axis_names=cache_scale_logical_axis_names,
-      ragged_qkv_axis_names=ragged_qkv_axis_names,
-      ragged_lengths_names=ragged_lengths_names,
-      compute_axis_order=compute_axis_order,
-      key_axis_order=key_axis_order,
-      reshape_q=reshape_q,
-      dropout_rate=dropout_rate,
-      dtype=dtype,
-      quant=quant,
-      kv_quant=kv_quant,
-      attention_type=attention_type,
-      attn_logits_soft_cap=attn_logits_soft_cap,
-      sliding_window_size=sliding_window_size,
-      chunk_attn_window_size=chunk_attn_window_size,
-      use_ragged_attention=use_ragged_attention,
-      ragged_block_size=ragged_block_size,
-      metadata_fn=variable_to_logically_partitioned,
-  )
-
-
 class AttentionOp(nnx.Module):
   """Attention operation"""
 
 
@@ -89,16 +89,6 @@ def __call__(self, x):
     return x * jax.lax.rsqrt(jnp.mean(x**2, axis=-1, keepdims=True) + self.eps)
 
 
-def l2_norm_as_linen(self, eps: float = 1e-6):
-  """
-  Initializes the L2Norm module and returns it as a Linen module.
-
-  Args:
-    eps: float, epsilon used for numerical stability (default value should be ok for most cases).
-  """
-  return nnx_wrappers.to_linen(L2Norm, eps=eps, metadata_fn=variable_to_logically_partitioned)
-
-
 def attention_as_linen(
     *,
     config: Config,