Fix linting: remove two import lines, re-wrap long calls, and strip trailing whitespace

hsuan-lun-chiang · hsuan-lun-chiang · commit 53f3052ba9de · 2026-04-09T09:10:36.000Z
diff --git a/src/maxtext/layers/nnx_decoders.py b/src/maxtext/layers/nnx_decoders.py
@@ -41,7 +41,6 @@
 from maxtext.layers import initializers, linears, mhc, normalizations, quantizations
 from maxtext.layers.attentions import Attention
 from maxtext.layers.embeddings import Embed, PositionalEmbedding, attend_on_embedding
-from maxtext.layers.engram import Engram, NgramHashMapping
 from maxtext.layers.normalizations import RMSNorm
 from maxtext.layers.quantizations import AqtQuantization as Quant
 from maxtext.models import (
@@ -328,11 +327,15 @@ def __init__(
             else:
               next_boundary = self._find_next_boundary(current_idx, config.first_num_dense_layers, config.engram_layers)
               chunk_name = f"dense_layers_{current_idx}_{next_boundary - 1}"
-              setattr(self, chunk_name, self._create_scanned_layers(
-                  dense_cls, length=(next_boundary - current_idx), metadata_axis_name=chunk_name, rngs=rngs
-              ))
+              setattr(
+                  self,
+                  chunk_name,
+                  self._create_scanned_layers(
+                      dense_cls, length=(next_boundary - current_idx), metadata_axis_name=chunk_name, rngs=rngs
+                  ),
+              )
               current_idx = next_boundary
-              
+
           # 2. Create MoE Chunks (Direct setattr, NO nnx.Dict)
           current_idx = config.first_num_dense_layers
           while current_idx < config.num_decoder_layers:
@@ -343,9 +346,13 @@ def __init__(
             else:
               next_boundary = self._find_next_boundary(current_idx, config.num_decoder_layers, config.engram_layers)
               chunk_name = f"moe_layers_{current_idx}_{next_boundary - 1}"
-              setattr(self, chunk_name, self._create_scanned_layers(
-                  moe_cls, length=(next_boundary - current_idx), metadata_axis_name=chunk_name, rngs=rngs
-              ))
+              setattr(
+                  self,
+                  chunk_name,
+                  self._create_scanned_layers(
+                      moe_cls, length=(next_boundary - current_idx), metadata_axis_name=chunk_name, rngs=rngs
+                  ),
+              )
               current_idx = next_boundary
         else:
           # Standard DeepSeek logic when Engrams are disabled
@@ -374,7 +381,7 @@ def __init__(
         self.layers_remainder = RemattedGemma3Block(
             config=self.config, mesh=mesh, quant=self.quant, model_mode=self.model_mode, **rem_layer_kwargs, rngs=rngs
         )  # pytype: disable=wrong-keyword-args
-      elif self.is_gemma4:                                                # <-- ADDED BLOCK
+      elif self.is_gemma4:  # <-- ADDED BLOCK
         attention_pattern_length = len(gemma4.GEMMA4_ATTENTION_PATTERN)
         scan_length = config.num_decoder_layers // attention_pattern_length
         num_remaining_layers = config.num_decoder_layers % attention_pattern_length
@@ -424,7 +431,7 @@ def __init__(
           layer_kwargs = {}
           if config.decoder_block == DecoderBlockType.GEMMA3:
             layer_kwargs = {"attention_type": gemma3.get_attention_type(layer_id=lyr)}
-          elif config.decoder_block == DecoderBlockType.GEMMA4:           # <-- ADDED
+          elif config.decoder_block == DecoderBlockType.GEMMA4:  # <-- ADDED
             layer_kwargs = {"attention_type": gemma4.get_attention_type(layer_id=lyr)}
           elif config.decoder_block == DecoderBlockType.LLAMA4:
             layer_kwargs = {
@@ -932,16 +939,11 @@ def _find_next_boundary(self, current_idx, end_idx, engram_indices):
   def _apply_single_engram_layer(self, y, layer_name, *args, **kwargs):
     """Applies a single, unscanned Engram layer."""
     layer = getattr(self, layer_name)
-    
+
     decoder_input_tokens = kwargs.get("decoder_input_tokens")
     layer_kwargs = kwargs.get("layer_kwargs", {})
 
-    out = layer(
-        y,
-        *args,
-        decoder_input_tokens=decoder_input_tokens,
-        **layer_kwargs
-    )
+    out = layer(y, *args, decoder_input_tokens=decoder_input_tokens, **layer_kwargs)
     if isinstance(out, tuple):
       y = out[0]
     else:
@@ -997,7 +999,7 @@ def _apply_interleaved_scanned_layers(self, y, layer_prefix, start_idx, end_idx,
         chunk_name = f"{layer_prefix}_{current_idx}_{next_boundary - 1}"
         chunk_stack = getattr(self, chunk_name)
         scan_length = next_boundary - current_idx
-        
+
         y, chunk_stack = self._apply_layers_sequentially(
             chunk_stack, y, *args, length=scan_length, **kwargs.get("layer_kwargs", {})
         )
@@ -1046,7 +1048,7 @@ def __call__(
     # Extract the bidirectional mask locally for layer configurations
     bidirectional_mask = multimodal_input.bidirectional_mask if multimodal_input is not None else None
 
-    if cfg.decoder_block in (DecoderBlockType.GEMMA3, DecoderBlockType.GEMMA4): # <-- UPDATED
+    if cfg.decoder_block in (DecoderBlockType.GEMMA3, DecoderBlockType.GEMMA4):  # <-- UPDATED
       layer_kwargs["bidirectional_mask"] = bidirectional_mask
 
     if attention_metadata is not None:
@@ -1071,7 +1073,13 @@ def __call__(
           )
 
           y = self._apply_interleaved_scanned_layers(
-              y, "moe_layers", cfg.first_num_dense_layers, cfg.num_decoder_layers, cfg.engram_layers, *layer_args, **common_kwargs
+              y,
+              "moe_layers",
+              cfg.first_num_dense_layers,
+              cfg.num_decoder_layers,
+              cfg.engram_layers,
+              *layer_args,
+              **common_kwargs,
           )
         else:
           y, self.dense_layers = self._apply_layers_sequentially(
@@ -1123,7 +1131,7 @@ def __call__(
             previous_chunk,
             page_state,
             slot,
-        )      
+        )
       else:
         scan_length = int(cfg.num_decoder_layers / cfg.inhomogeneous_layer_cycle_interval)
         if scan_length > 0:
@@ -1303,6 +1311,7 @@ def pure_gemma_fn(graphdef, state_in, y_in):
 
     return y
 
+
 def decoder_as_linen(
     config: Config,
     mesh: Mesh,
diff --git a/src/maxtext/layers/quantizations.py b/src/maxtext/layers/quantizations.py
@@ -26,7 +26,6 @@
 from aqt.jax.v2 import tiled_dot_general
 from aqt.jax.v2 import calibration
 
-from maxtext.layers import nnx_wrappers
 import qwix
 from qwix._src.core import dot_general_qt
 
diff --git a/tests/unit/nnx_decoder_test.py b/tests/unit/nnx_decoder_test.py
@@ -533,4 +533,4 @@ def test_different_random_seeds_produce_different_logits(self):
 
 
 if __name__ == "__main__":
-  unittest.main()
+  unittest.main()

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -533,4 +533,4 @@ def test_different_random_seeds_produce_different_logits(self):` |
| `533` | `533` |  |
| `534` | `534` |  |
| `535` | `535` | `if __name__ == "__main__":` |
| `536` |  | `- unittest.main()` |
|  | `536` | `+ unittest.main()` |