@@ -273,25 +273,35 @@ def __call__(self, inputs: jax.Array, _initializing: bool = False) -> Tuple[jax.
273273 kernel_shape = self .in_features_shape + self .out_features_shape
274274 kernel = jnp .zeros (kernel_shape , dtype = self .dtype )
275275 else :
276- kernel = self .kernel [...]
277- kernel = jnp .asarray (kernel , self .dtype )
276+ kernel_val = self .kernel .value
277+ if kernel_val is not None :
278+ kernel = kernel_val
279+ kernel = jnp .asarray (kernel , self .dtype )
280+ else :
281+ kernel = None
282+
283+ if kernel is not None :
284+ contract_ind = tuple (range (0 , len (norm_axis )))
285+ output_sharding = (
286+ create_sharding (self .mesh , ("activation_batch_no_exp_moe" , "activation_length_no_exp_moe" , None ))
287+ if self .shard_mode == ShardMode .EXPLICIT
288+ else None
289+ )
290+ output = linears ._compute_dot_general_nnx (
291+ inputs ,
292+ kernel ,
293+ norm_axis ,
294+ contract_ind ,
295+ self .matmul_precision ,
296+ self .quant_dot_general ,
297+ _initializing ,
298+ out_sharding = output_sharding ,
299+ )
300+ else :
301+ # If kernel is missing (e.g. masked in pipeline), return zeros.
302+ out_shape = inputs .shape [:- 1 ] + self .out_features_shape
303+ output = jnp .zeros (out_shape , dtype = self .dtype )
278304
279- contract_ind = tuple (range (0 , len (norm_axis )))
280- output_sharding = (
281- create_sharding (self .mesh , ("activation_batch_no_exp_moe" , "activation_length_moe" , None ))
282- if self .shard_mode == ShardMode .EXPLICIT
283- else None
284- )
285- output = linears ._compute_dot_general_nnx (
286- inputs ,
287- kernel ,
288- norm_axis ,
289- contract_ind ,
290- self .matmul_precision ,
291- self .quant_dot_general ,
292- _initializing ,
293- out_sharding = output_sharding ,
294- )
295305 pre_bias_logits = None
296306
297307 if self .score_func :
@@ -300,8 +310,10 @@ def __call__(self, inputs: jax.Array, _initializing: bool = False) -> Tuple[jax.
300310 pre_bias_logits = output
301311
302312 if self .use_bias :
303- bias = jnp .asarray (self .bias [...], self .dtype )
304- output += bias
313+ bias_val = self .bias .value
314+ if bias_val is not None :
315+ bias = jnp .asarray (bias_val , self .dtype )
316+ output += bias
305317 return output , pre_bias_logits
306318
307319
@@ -2024,9 +2036,10 @@ def __call__(
20242036 routing_inputs = inputs if gate_inputs is None else gate_inputs .astype (gate_dtype )
20252037 gate_logits , pre_bias_logits = self .gate (routing_inputs )
20262038
2027- w0_kernel = jnp .asarray (self .wi_0 [...], self .dtype )
2028- w1_kernel = jnp .asarray (self .wi_1 [...], self .dtype )
2029- wo_kernel = jnp .asarray (self .wo [...], self .dtype )
2039+ # NOTE(review): only wi_0.value is checked — assumes wi_0/wi_1/wo are always
2039+ # masked together. Also confirm the per_expert_scale / bias lines below (old
2039+ # 2031-2037, unchanged context) are moved under this guard too; as written
2039+ # they reference wo_kernel, which is unbound when wi_0.value is None.
2039+ if self .wi_0 .value is not None :
2040+ w0_kernel = jnp .asarray (self .wi_0 [...], self .dtype )
2041+ w1_kernel = jnp .asarray (self .wi_1 [...], self .dtype )
2042+ wo_kernel = jnp .asarray (self .wo [...], self .dtype )
20302043
20312044 if self .per_expert_scale is not None :
20322045 wo_kernel = wo_kernel * jnp .asarray (self .per_expert_scale [...], self .dtype )[:, None , None ]
@@ -2038,26 +2051,32 @@ def __call__(
20382051 else :
20392052 w0_bias , w1_bias , wo_bias = None , None , None
20402053
2041- if cfg .sparse_matmul :
2042- if quantizations .in_serve_mode (self .quant ):
2043- w0_kernel , w1_kernel , wo_kernel = self .retrieve_quantized_weight (
2044- inputs ,
2045- gate_logits ,
2046- pre_bias_logits ,
2047- w0_kernel ,
2048- w1_kernel ,
2049- wo_kernel ,
2050- w0_bias ,
2051- w1_bias ,
2052- wo_bias ,
2054+ if cfg .sparse_matmul :
2055+ if quantizations .in_serve_mode (self .quant ):
2056+ w0_kernel , w1_kernel , wo_kernel = self .retrieve_quantized_weight (
2057+ inputs ,
2058+ gate_logits ,
2059+ pre_bias_logits ,
2060+ w0_kernel ,
2061+ w1_kernel ,
2062+ wo_kernel ,
2063+ w0_bias ,
2064+ w1_bias ,
2065+ wo_bias ,
2066+ )
2067+ output , lb_loss , bias_updates = self .sparse_matmul (
2068+ inputs , gate_logits , pre_bias_logits , w0_kernel , w1_kernel , wo_kernel , w0_bias , w1_bias , wo_bias
2069+ )
2070+ else :
2071+ output , lb_loss , bias_updates = self .dense_matmul (
2072+ inputs , gate_logits , pre_bias_logits , w0_kernel , w1_kernel , wo_kernel , w0_bias , w1_bias , wo_bias
20532073 )
2054- output , lb_loss , bias_updates = self .sparse_matmul (
2055- inputs , gate_logits , pre_bias_logits , w0_kernel , w1_kernel , wo_kernel , w0_bias , w1_bias , wo_bias
2056- )
20572074 else :
2058- output , lb_loss , bias_updates = self .dense_matmul (
2059- inputs , gate_logits , pre_bias_logits , w0_kernel , w1_kernel , wo_kernel , w0_bias , w1_bias , wo_bias
2060- )
2075+ # If kernels are missing (e.g. masked in pipeline), return zeros.
2076+ output = jnp .zeros_like (inputs )
2077+ lb_loss = None
2078+ bias_updates = None
2079+
20612080 return output , lb_loss , bias_updates
20622081
20632082
0 commit comments