Fix rebase errors: resolve leftover merge-conflict markers

Rohan-Bierneni · Rohan-Bierneni · commit e25d9bcdc11f · 2026-03-18T17:09:43.000Z
diff --git a/src/maxtext/models/qwen3.py b/src/maxtext/models/qwen3.py
@@ -29,7 +29,6 @@
 from flax import linen as nn
 from flax import nnx
 
-<<<<<<< HEAD:src/maxtext/models/qwen3.py
 from maxtext.common.common_types import AttentionType, Config, DType, Array, BATCH, LENGTH_NO_EXP, EMBED, MODEL_MODE_TRAIN
 from maxtext.layers import attentions
 from maxtext.layers import initializers as max_initializers
@@ -44,25 +43,8 @@
 from maxtext.layers.moe import RoutedMoE
 from maxtext.layers.initializers import nd_dense_init, variable_to_logically_partitioned
 
-=======
 from jax.sharding import PartitionSpec as P
 from jax.experimental.shard_map import shard_map
-
-from MaxText.common_types import AttentionType, Config, DType, Array, BATCH, LENGTH_NO_EXP, EMBED, MODEL_MODE_TRAIN
-from MaxText.layers import attentions
-from MaxText.layers import initializers as max_initializers
-from MaxText.layers import moe
-from MaxText.layers import nnx_wrappers
-from MaxText.layers import quantizations
-from MaxText.layers.embeddings import Qwen3OmniMoeVisionPosEmbedInterpolate, PositionalEmbedding
-from MaxText.layers.normalizations import RMSNorm, l2norm, Qwen3NextRMSNorm, Qwen3NextRMSNormGated
-from MaxText.layers.quantizations import AqtQuantization as Quant
-from MaxText.layers.attentions import Attention
-from MaxText.layers.linears import DenseGeneral, MlpBlock
-from MaxText.layers.moe import RoutedMoE
-from MaxText.layers.initializers import nd_dense_init, variable_to_logically_partitioned
-from maxtext.inference import page_manager
->>>>>>> 7461955dc (add shardmap to kernel):src/MaxText/layers/qwen3.py
 from maxtext.utils import max_utils
 from maxtext.inference import page_manager, kvcache
 
@@ -218,7 +200,7 @@ def pallas_chunk_gated_delta_rule(
   # =========================================================================
   initial_dtype = query.dtype
   if use_qk_norm_in_gdn:
-    from MaxText.layers.normalizations import l2norm 
+    from maxtext.layers.normalizations import l2norm 
     query = l2norm(query, dim=-1, eps=1e-6)
     key = l2norm(key, dim=-1, eps=1e-6)
 
@@ -546,11 +528,7 @@ class Qwen3NextGatedDeltaNet(nnx.Module):
   2. output = Linear_out(y)
   """
 
-<<<<<<< HEAD:src/maxtext/models/qwen3.py
-  def __init__(self, config: Config, dtype: DType = jnp.float32, model_mode: str = MODEL_MODE_TRAIN, *, rngs: nnx.Rngs):
-=======
-  def __init__(self, config: Config, *, rngs: nnx.Rngs, mesh: Mesh=None):
->>>>>>> 7461955dc (add shardmap to kernel):src/MaxText/layers/qwen3.py
+  def __init__(self, config: Config, dtype: DType = jnp.float32, model_mode: str = MODEL_MODE_TRAIN, *, rngs: nnx.Rngs, mesh: Mesh=None):
     """
     Args:
       config: MaxText configuration object.
@@ -1148,11 +1126,7 @@ def __init__(
           rngs=rngs,
       )
     else:
-<<<<<<< HEAD:src/maxtext/models/qwen3.py
       self.attention = Qwen3NextGatedDeltaNet(config=cfg, dtype=cfg.dtype, model_mode=model_mode, rngs=rngs)
-=======
-      self.attention = Qwen3NextGatedDeltaNet(config=cfg, rngs=rngs, mesh=self.mesh)
->>>>>>> 7461955dc (add shardmap to kernel):src/MaxText/layers/qwen3.py
 
     # Second LayerNorm, applied before the MoE block.
     self.post_attention_layernorm = Qwen3NextRMSNorm(
diff --git a/src/maxtext/utils/maxtext_utils.py b/src/maxtext/utils/maxtext_utils.py
@@ -536,37 +536,13 @@ def calculate_gated_delta_net_flops_per_device(config):
   # We multiply by 2 for FMA
   flops_conv = 2 * B * S * K_conv * (2 * K_dim + V_dim)
 
-<<<<<<< HEAD
   # 3. Core Gated Delta Net
   # This counts 4 distinct O(D^2) operations in the recurrent update:
   #   KK^T, VK^T, S(a(I-bKK^T)), and SQ.
   # We multiply by 2 for FMA.
   # Total Core FLOPs = 2 (FMA) * 4 (Ops) * H * D^2 = 8 * H * D^2 per token.
   # We use D_k * D_v to generalize D^2 for potentially differing head dimensions.
   flops_core_per_token = H_v * (D_k * D_v) * 8
-=======
-  # 3. Core Gated Delta Net (Optimized WY Representation)
-  # The implementation broadcasts K heads to V heads if H_v > H_k
-  H_eff = max(H_k, H_v) 
-
-  # Per-token costs derived from jax_chunk_gated_delta_rule:
-  # Intra-chunk Pre-computation:
-  #   S = K @ K.T: 2 * C * D_k
-  #   A = (I+S)^-1: ~ C^2 (Triangular solve approximation)
-  #   U = A @ V: 2 * C * D_v
-  #   W = A @ K: 2 * C * D_k
-  # Scan / Output:
-  #   Out_Inter (Q @ h): 2 * D_k * D_v
-  #   Out_Intra_QK (Q @ K.T): 2 * C * D_k
-  #   Out_Intra_AV (Attn @ V): 2 * C * D_v
-  #   State_Update (W.T @ U): 2 * D_k * D_v
-  
-  # Summing per-token factors: 
-  # (2*C*D_k) + C^2 + (2*C*D_v) + (2*C*D_k) + (2*D_k*D_v) + (2*C*D_k) + (2*C*D_v) + (2*D_k*D_v)
-  # = 6*C*D_k + 4*C*D_v + 4*D_k*D_v + C^2
-  
-  flops_core_per_token = H_eff * (6 * C * D_k + 4 * C * D_v + 4 * D_k * D_v + C**2)
->>>>>>> 09f85a04f (Update tflops calc to align with WY-optimized GDN)
   flops_core = B * S * flops_core_per_token
 
   # Weights part: Projections + Conv