AI-Hypercomputer
diff --git a/‎MaxText/configs/base.yml‎
Lines changed: 3 additions & 0 deletions b/‎MaxText/configs/base.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎MaxText/convert_gemma3_chkpt.py‎
Lines changed: 45 additions & 0 deletions b/‎MaxText/convert_gemma3_chkpt.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎MaxText/layers/gemma3.py‎
Lines changed: 276 additions & 7 deletions b/‎MaxText/layers/gemma3.py‎
Lines changed: 276 additions & 7 deletions
diff --git a/‎MaxText/layers/models.py‎
Lines changed: 3 additions & 3 deletions b/‎MaxText/layers/models.py‎
Lines changed: 3 additions & 3 deletions
@@ -698,6 +698,9 @@ temperature_tuning: False
 
 # Multimodal flags
 use_multimodal: False
+freeze_vision_encoder_params: True
+dtype_mm: "float32"  # Data type for multimodal model's vision encoder
+remat_policy_for_vit: "minimal"  # Remat policy for multimodal model's vision encoder. Check `remat_policy` for options.
 image_size_for_vit: 896 # Default for Gemma3, and should be overwritten by model's config
 image_path: "" # Local image path used for decoding
 
@@ -45,6 +45,28 @@ def nest_params(params: Params) -> Params:
   return nested_params
 
 
+def rename_nested_keys(data, old_key, new_key):
+  """
+  Recursively renames keys in a nested dictionary.
+
+  The input is not mutated: every dictionary and list is rebuilt on the way
+  down, while leaf values are reused as-is (they are not copied).
+
+  Args:
+      data (dict): The nested dictionary to process.
+      old_key (str): The key to find and rename.
+      new_key (str): The new name for the key.
+  Returns:
+      dict: A new dictionary with the specified keys renamed at every depth,
+      including inside dictionaries held in arbitrarily nested lists.
+  """
+
+  def _rename_in_value(value):
+    # Dictionaries go back through the public entry point so their keys are renamed.
+    if isinstance(value, dict):
+      return rename_nested_keys(value, old_key, new_key)
+    # Lists are walked element by element; recursing here also reaches dictionaries
+    # stored in lists of lists, which a single-level scan would leave untouched.
+    if isinstance(value, list):
+      return [_rename_in_value(item) for item in value]
+    return value
+
+  return {(new_key if key == old_key else key): _rename_in_value(value) for key, value in data.items()}
+
+
 def main(raw_args=None) -> None:
   parser = argparse.ArgumentParser()
   parser.add_argument("--base_model_path", type=str, required=True)
@@ -76,7 +98,29 @@ def main(raw_args=None) -> None:
           "decoder_norm": {"scale": params["transformer"]["final_norm"]["scale"] + 1},
       },
       "token_embedder": {"embedding": params["transformer"]["embedder"]["input_embedding"] * jnp.sqrt(embed_dim)},
+      "vision_encoder": {
+          "Gemma3VisionEncoderLayer_0": {
+              "embedding": {
+                  "bias": params["SigLiPFromPatches_0"]["siglip_encoder"]["embedding"]["bias"],
+                  "kernel": params["SigLiPFromPatches_0"]["siglip_encoder"]["embedding"]["kernel"],
+              },
+              "pos_embedding": params["SigLiPFromPatches_0"]["siglip_encoder"]["pos_embedding"],
+              "Transformer": params["SigLiPFromPatches_0"]["siglip_encoder"]["Transformer"],
+              "VisionEmbedder_0": {
+                  "mm_input_projection": params["transformer"]["embedder"]["mm_input_projection"],
+                  "mm_soft_embedding_norm": {
+                      "scale": params["transformer"]["embedder"]["mm_soft_embedding_norm"]["scale"] + 1
+                  },
+              },
+          }
+      },
   }
+  # Rename MlpBlock_0 to MlpBlockViT_0 in the vision encoder.
+  # The Gemma3 vision encoder contains a module named MlpBlock, which shares its name
+  # with the MlpBlock in the MaxText decoder but has a different structure.
+  # Renaming it keeps the two from being confused.
+  vision_encoder_weights = rename_nested_keys(jax_weights["vision_encoder"], "MlpBlock_0", "MlpBlockViT_0")
+  jax_weights["vision_encoder"] = vision_encoder_weights
   self_attention = dict(
       {
           "query": {"kernel": []},
@@ -191,6 +235,7 @@ def astype_fn(x):
   if checkpoint_manager is not None:
     if save_checkpoint(checkpoint_manager, 0, state_new):
       max_logging.log("saved a checkpoint at step 0")
+      max_logging.log(f"Checkpoint saved to: {args.maxtext_model_path}")
     # Upon preemption, exit when and only when all ongoing saves are complete.
     if checkpoint_manager.reached_preemption(0):
       checkpoint_manager.wait_until_finished()
 
@@ -82,26 +82,295 @@ def get_query_pre_attn_scalar(config) -> float:
     raise ValueError(f"Unsupported model name: {config.model_name}")
 
 
+def _posemb_sincos_2d(
+    h: int,
+    w: int,
+    *,
+    width: int,
+    temperature: float = 10_000.0,
+    dtype: jnp.dtype = jnp.float32,
+):
+  """Builds a fixed 2D sine/cosine position embedding. Follows the MoCo v3 logic.
+
+  Args:
+    h: Number of rows in the position grid.
+    w: Number of columns in the position grid.
+    width: Embedding size per position. Must be a multiple of 4, because the
+      output concatenates four equally sized parts: sin(x), cos(x), sin(y), cos(y).
+    temperature: Base of the frequency progression; frequencies run from 1 down
+      to 1 / temperature.
+    dtype: dtype of the returned array.
+
+  Returns:
+    Array of shape [1, h * w, width].
+  """
+  y, x = jnp.mgrid[:h, :w]
+
+  assert width % 4 == 0, "Width must be mult of 4 for sincos posemb"
+  num_freqs = width // 4
+  # With a single frequency (width == 4) the original denominator is 0 and the
+  # 0 / 0 division turns the whole embedding into NaN; clamp it to 1 so that the
+  # lone frequency is simply 1.0. For num_freqs >= 2 the value is unchanged.
+  omega = jnp.arange(num_freqs) / max(num_freqs - 1, 1)
+  omega = 1.0 / (temperature**omega)
+  # Outer product of every flattened grid coordinate with every frequency.
+  y = jnp.einsum("m,d->md", y.flatten(), omega)
+  x = jnp.einsum("m,d->md", x.flatten(), omega)
+  pe = jnp.concatenate([jnp.sin(x), jnp.cos(x), jnp.sin(y), jnp.cos(y)], axis=1)
+  # Leading axis of size 1 lets the result broadcast over the batch.
+  return jnp.asarray(pe, dtype)[None, :, :]
+
+
+class MlpBlockViT(nn.Module):
+  """Two-layer feed-forward (MLP) block used by the vision transformer encoder.
+
+  Attributes:
+    block_id: Index of the enclosing encoder block (not read inside this module).
+    dtype_mm: dtype passed to both Dense layers.
+    mlp_dim: Hidden size; when falsy, 4x the input's last dimension is used.
+    dropout: Dropout rate applied after the activation.
+  """
+
+  block_id: int
+  dtype_mm: str
+  mlp_dim: int | None = None  # Defaults to 4x input dim
+  dropout: float = 0.0
+
+  @nn.compact
+  def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
+    """Applies Dense -> GELU -> Dropout -> Dense, restoring the input feature size."""
+    in_features = x.shape[-1]
+    hidden_features = self.mlp_dim or 4 * in_features
+    # Both projections share the same dtype and initializers.
+    dense_kwargs = {
+        "dtype": self.dtype_mm,
+        "kernel_init": nn.initializers.xavier_uniform(),
+        "bias_init": nn.initializers.normal(stddev=1e-6),
+    }
+
+    # Submodules are created in the same order as before so their auto-generated names do not change.
+    hidden = nn.Dense(features=hidden_features, **dense_kwargs)(x)
+    hidden = nn.gelu(hidden)
+    hidden = nn.Dropout(rate=self.dropout)(hidden, deterministic)
+    return nn.Dense(features=in_features, **dense_kwargs)(hidden)
+
+
+class Encoder1DBlock(nn.Module):
+  """Single transformer encoder block (MHSA + MLP).
+
+  Each of the two sub-layers normalizes its input with LayerNorm, applies dropout
+  to its output and adds the result back onto the residual stream.
+
+  Attributes:
+    block_id: Index of this block; forwarded to `MlpBlockViT`.
+    dtype_mm: dtype passed to the attention and MLP sub-modules.
+    mlp_dim: Hidden size of the MLP; forwarded to `MlpBlockViT`.
+    num_heads: Number of attention heads.
+    dropout: Dropout rate used after attention, inside the MLP and after the MLP.
+  """
+
+  block_id: int
+  dtype_mm: str
+  mlp_dim: int | None = None  # Defaults to 4x input dim
+  num_heads: int = 12
+  dropout: float = 0.0
+
+  @nn.compact
+  def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
+    """Runs the block on `x`.
+
+    Args:
+      x: Input activations; constrained to the logical axes
+        (activation_batch, activation_length, activation_embed).
+      deterministic: When True, dropout is disabled.
+
+    Returns:
+      A single array (not a tuple) produced by the two residual updates `x + y`.
+    """
+    activation_axes = ("activation_batch", "activation_length", "activation_embed")
+    x = nn.with_logical_constraint(x, activation_axes)
+
+    # Self-attention sub-layer: queries and keys/values both come from the normalized input.
+    y = nn.LayerNorm()(x)
+    y = nn.MultiHeadDotProductAttention(
+        num_heads=self.num_heads,
+        kernel_init=nn.initializers.xavier_uniform(),
+        deterministic=deterministic,
+        dtype=self.dtype_mm,
+    )(y, y)
+    y = nn.with_logical_constraint(y, activation_axes)
+    y = nn.Dropout(rate=self.dropout)(y, deterministic)
+    x = x + y
+
+    # Feed-forward sub-layer.
+    y = nn.LayerNorm()(x)
+    y = MlpBlockViT(
+        block_id=self.block_id,
+        mlp_dim=self.mlp_dim,
+        dropout=self.dropout,
+        dtype_mm=self.dtype_mm,
+    )(y, deterministic)
+    y = nn.with_logical_constraint(y, activation_axes)
+    y = nn.Dropout(rate=self.dropout)(y, deterministic)
+    x = x + y
+    x = nn.with_logical_constraint(x, activation_axes)
+    return x
+
+
+class Encoder(nn.Module):
+  """Stack of `depth` `Encoder1DBlock` layers followed by a final LayerNorm.
+
+  Attributes:
+    depth: Number of encoder blocks.
+    dtype_mm: dtype forwarded to every block.
+    remat_policy: Name looked up on `jax.checkpoint_policies`; only read on the scan path.
+    mlp_dim: MLP hidden size forwarded to every block.
+    num_heads: Number of attention heads forwarded to every block.
+    dropout: Dropout rate forwarded to every block.
+    scan: When True, the blocks are applied through `nn.scan` over a rematerialized
+      block; otherwise they are created one by one in a Python loop.
+  """
+
+  depth: int
+  dtype_mm: str
+  remat_policy: str
+  mlp_dim: int | None = None  # Defaults to 4x input dim
+  num_heads: int = 12
+  dropout: float = 0.0
+  scan: bool = False
+
+  @nn.compact
+  def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
+    """Applies all encoder blocks and then the `encoder_norm` LayerNorm to `x`."""
+    if self.scan:
+      # Wrap the block class in remat; an unknown policy name falls back to None.
+      block = nn.remat(
+          Encoder1DBlock,
+          prevent_cse=False,
+          static_argnums=(2,),  # 0=self, 2=deterministic
+          policy=getattr(jax.checkpoint_policies, self.remat_policy, None),
+      )
+      # NOTE(review): nn.scan documents the scanned callable as returning a
+      # (carry, out) pair, while Encoder1DBlock returns a single array and the
+      # result below is bound to `x` directly — confirm this path works as intended.
+      # NOTE(review): here the module is named "encoderblock" (one name for all
+      # layers), unlike the "encoderblock_{i}" names of the unscanned path, so
+      # parameter trees presumably differ between the two paths — verify.
+      x = nn.scan(
+          block,
+          variable_axes={"params": 0},
+          split_rngs={"params": True, "dropout": True},
+          in_axes=nn.broadcast,
+          length=self.depth,
+      )(
+          block_id=0,
+          name="encoderblock",
+          dtype_mm=self.dtype_mm,
+          mlp_dim=self.mlp_dim,
+          num_heads=self.num_heads,
+          dropout=self.dropout,
+      )(
+          x, deterministic
+      )
+    else:
+      # Input Encoder
+      # One explicitly named block per layer: encoderblock_0 ... encoderblock_{depth - 1}.
+      for lyr in range(self.depth):
+        block_cur = Encoder1DBlock(
+            block_id=lyr,
+            name=f"encoderblock_{lyr}",
+            dtype_mm=self.dtype_mm,
+            mlp_dim=self.mlp_dim,
+            num_heads=self.num_heads,
+            dropout=self.dropout,
+        )
+        x = block_cur(x, deterministic)
+    # Final normalization shared by both paths.
+    x: jax.Array = nn.LayerNorm(name="encoder_norm")(x)
+    return x
+
+
+class Einsum(nn.Module):
+  """Holds one weight tensor and contracts it with the input through `jnp.einsum`.
+
+  Attributes:
+    shape: Shape of the weight tensor.
+    weight_name: Name under which the weight is registered as a parameter.
+    initializer: Initializer used to create the weight.
+    dtype: dtype handed to the initializer; None is forwarded unchanged.
+  """
+
+  shape: tuple[int, ...]
+  weight_name: str = "w"
+  initializer: nn.initializers.Initializer = nn.initializers.normal()
+  dtype: jnp.dtype | None = None
+
+  @nn.compact
+  def __call__(self, eqn: str, x: jax.Array) -> jax.Array:
+    """Evaluates `eqn` with `x` as the first operand and the weight as the second."""
+    weight = self.param(self.weight_name, self.initializer, self.shape, self.dtype)
+    return jnp.einsum(eqn, x, weight)
+
+
+class VisionEmbedder(nn.Module):
+  """Projects image embeddings to the embedding space of the text encoder.
+
+  Attributes:
+    embed_dim: Output size of the projection.
+    vision_proj_dim: Input size of the projection. When falsy, `setup` creates no
+      sub-modules, so `encode_vision` has nothing to apply.
+  """
+
+  embed_dim: int
+  vision_proj_dim: int | None = None
+
+  def setup(self):
+    # Nothing to build without a projection input size.
+    if not self.vision_proj_dim:
+      return
+    self.mm_soft_embedding_norm = RMSNorm()
+    self.mm_input_projection = Einsum((self.vision_proj_dim, self.embed_dim))
+
+  def encode_vision(self, x: jax.Array) -> jax.Array:
+    """Normalizes `x`, then maps its last axis through the (vision_proj_dim, embed_dim) weight."""
+    normed = self.mm_soft_embedding_norm(x)
+    return self.mm_input_projection("...tm,md->...td", normed)
+
+  def __call__(self, x: jax.Array) -> jax.Array:
+    """Alias for `encode_vision`."""
+    return self.encode_vision(x)
+
+
+class VisionExit(nn.Module):
+  """The vision exit layer.
+
+  Possibly downsample the soft tokens to a required output length.
+
+  Attributes:
+    output_length: The embed will be spatially avg-pooled to this output length.
+  """
+
+  output_length: int = 256
+
+  def __call__(self, x):
+    """Average-pools the token axis of `x` down to `output_length` tokens.
+
+    Args:
+      x: Array read as [batch, tokens, embed]; the token axis is treated as a
+        flattened square grid.
+
+    Returns:
+      `x` itself when it already has `output_length` tokens, otherwise an array of
+      shape [batch, pooled_tokens, embed] obtained by non-overlapping average pooling.
+
+    Raises:
+      AssertionError: if the token count or `output_length` is not a perfect square,
+        or the input grid width is not a multiple of the output grid width.
+    """
+    cur_length = x.shape[1]
+    if cur_length == self.output_length:
+      return x
+    cur_width = int(cur_length**0.5)
+    assert cur_width**2 == cur_length, f"Token count must be a perfect square, got {x.shape=}"
+    output_width = int(self.output_length**0.5)
+    assert output_width**2 == self.output_length, f"Cannot pool {x.shape=} to {self.output_length=}!"
+    batch_size = x.shape[0]
+    embed_dim = x.shape[-1]
+    # Restore the square grid so pooling acts on the two spatial axes.
+    x = jnp.reshape(x, (batch_size, cur_width, cur_width, embed_dim))
+    assert not cur_width % output_width, f"{cur_width=} {output_width=}"
+    window = cur_width // output_width
+    window_shape = (window, window)
+    # Stride equals the window, so the pooling windows do not overlap.
+    x = nn.avg_pool(x, window_shape=window_shape, strides=window_shape)
+    batch_size, height, width, embed_dim = x.shape
+    return jnp.reshape(x, (batch_size, height * width, embed_dim))
+
+
 class Gemma3VisionEncoderLayer(nn.Module):
+  """Vision encoder: patch embedding, position embedding, transformer, pooling and projection.
+
+  Attributes:
+    config: Model config; `scan_layers`, `remat_policy_for_vit`, `dtype_mm`, `emb_dim`
+      and `freeze_vision_encoder_params` are read from it.
+    patch_size: Kernel size and stride of the patch-extracting convolution.
+    width: Feature size of the patch embedding and input size of the final projection.
+    mlp_dim: MLP hidden size of every transformer block.
+    depth: Number of transformer blocks.
+    num_heads: Number of attention heads per block.
+    posemb: Position embedding type, "learn" or "sincos2d".
+    dropout: Dropout rate used after the position embedding and inside the transformer.
+  """
+
   config: Config
+  patch_size: tuple[int, int] = (14, 14)
+  width: int = 1152
+  mlp_dim: int | None = 4304  # Defaults to 4x input dim
+  depth: int = 27
+  num_heads: int = 16
+  posemb: str = "learn"  # Can also be "sincos2d"
+  dropout: float = 0.0
+  # NOTE(review): the remat policy is read from `config.remat_policy_for_vit`; the hint
+  # below looks like a leftover from a removed `remat_policy` field — confirm.
+  # or "dots_with_no_batch_dims_saveable" for more speed (memory costly)
+
+  def _get_posemb(
+      self,
+      typ: str,
+      *,
+      seqshape: tuple[int, int],
+      width: int,
+      name: str,
+      dtype: jnp.dtype = jnp.float32,
+  ):
+    """Returns the position embedding.
+
+    "learn" registers a parameter called `name` of shape
+    (1, seqshape[0] * seqshape[1], width); "sincos2d" computes a fixed embedding
+    with `_posemb_sincos_2d`. Any other `typ` raises ValueError.
+    """
+    if typ == "learn":
+      shape_product = seqshape[0] * seqshape[1]
+      return self.param(
+          name,
+          nn.initializers.normal(stddev=1 / (width**0.5)),
+          (1, shape_product, width),
+          dtype,
+      )
+    elif typ == "sincos2d":
+      return _posemb_sincos_2d(*seqshape, width=width, dtype=dtype)
+    else:
+      raise ValueError(f"Unknown posemb type: {typ}")
 
   @nn.compact
-  def __call__(self, inputs, train=False):
+  def __call__(self, inputs, deterministic, train=False):
     """ViT model that transforms image inputs to image embeddings.
     Args:
       inputs: jnp.array shaped [B, N, H, W, C], e.g. [4, 1, 896, 896, 3]
+      deterministic: forwarded to the transformer encoder, where True disables dropout.
+      train: only controls the dropout applied right after the position embedding
+        (that dropout is disabled when `train` is False).
     Returns:
       jnp.array for image embeddings, shaped [B, N, P, D], e.g. [4, 1, 256, 2560]
     """
+    cfg = self.config
     b, n, h, w, c = inputs.shape
     x = jnp.reshape(inputs, [b * n, h, w, c])
+    # Patches are extracted with a conv whose kernel size and stride both equal
+    # `patch_size` (14x14 by default); `width` (1152 by default) sets the feature size
+    # so it always matches the `vision_proj_dim` used by the final projection.
-    x = nn.Conv(features=1152, kernel_size=(14, 14), strides=14, padding="VALID", name="embedding")(x)
+    x = nn.Conv(
+        features=self.width,
+        kernel_size=self.patch_size,
+        strides=self.patch_size,
+        padding="VALID",
+        name="embedding",
+    )(x)
-    jax.debug.print("x after: {}", x.mean())
-    n, h, w, c = x.shape
-    x = jnp.reshape(x, [n, h * w, c])
-    # TODO(hengtaoguo): finish the ViT with posemb, dropout and transformation layers.
-    # Currently it is only a placeholder with one Conv layer.
-    # Placeholder x shape (B, 4096, 1152).
+    bn, h, w, c = x.shape
+    # Flatten the patch grid into a sequence of h * w tokens.
+    x = jnp.reshape(x, [bn, h * w, c])
+
+    # Add posemb before adding extra token.
+    x = x + self._get_posemb(
+        self.posemb,
+        seqshape=(h, w),
+        width=c,
+        name="pos_embedding",
+        dtype=x.dtype,
+    )
+
+    # NOTE(review): this dropout follows `train`, while the encoder below follows
+    # `deterministic`; the two flags can disagree — confirm which one should win.
+    x = nn.Dropout(rate=self.dropout)(x, not train)
+
+    # Transformer encoder to extract image features.
+    x = Encoder(
+        depth=self.depth,
+        mlp_dim=self.mlp_dim,
+        num_heads=self.num_heads,
+        dropout=self.dropout,
+        scan=cfg.scan_layers,
+        remat_policy=cfg.remat_policy_for_vit,
+        dtype_mm=cfg.dtype_mm,
+        name="Transformer",
+    )(x, deterministic=deterministic)
+
+    # Gemma3 uses a vision exit layer to downsample the soft tokens to a required output length.
+    x = VisionExit(output_length=256)(x)
+    bn, num_tokens, c = x.shape
+    # Split the merged leading axis back into [B, N].
+    x = jnp.reshape(x, [b, n, num_tokens, c])
+
+    # VisionEmbedder is a projection layer that projects the image embeddings to align with text embeddings emb_dim.
+    x = VisionEmbedder(embed_dim=cfg.emb_dim, vision_proj_dim=self.width)(x)
+    if cfg.freeze_vision_encoder_params:
+      # Stopping the gradient at the output blocks it for everything computed above,
+      # the projection included.
+      x = jax.lax.stop_gradient(x)
     return x
 
 
 
@@ -623,8 +623,8 @@ def get_vision_encoder_layers(self):
       raise ValueError(f"No VisionEncoder implemented for {self.config.model_name} yet")
 
   @nn.compact
-  def __call__(self, input_images):
-    embeddings = self.vision_encoder_layer[0](config=self.config)(input_images)
+  def __call__(self, input_images, deterministic=False):
+    """Builds the first entry of `self.vision_encoder_layer` with the model config and applies it.
+
+    Args:
+      input_images: images passed straight through to the vision encoder layer.
+      deterministic: forwarded unchanged to the vision encoder layer.
+    Returns:
+      Whatever the vision encoder layer returns for `input_images`.
+    """
+    embeddings = self.vision_encoder_layer[0](config=self.config)(input_images, deterministic=deterministic)
     return embeddings
 
 
@@ -685,7 +685,7 @@ def __call__(
 
     bidirectional_mask = None
     if self.config.use_multimodal and encoder_images is not None:
-      image_embeddings = self.vision_encoder(input_images=encoder_images)
+      image_embeddings = self.vision_encoder(input_images=encoder_images, deterministic=not enable_dropout)
       # TODO(hengtaoguo, aireen): merge image_embeddings with decoder_input_tokens.
 
       if self.config.decoder_block == DecoderBlockType.GEMMA3: