[NNX] Delete Linen (4/4): remove the pure_nnx/enable_nnx/pure_nnx_decoder config flags

ecnal-cienet · ecnal-cienet · commit 6c8c56fe2b35 · 2026-06-09T14:08:50.000Z
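Remove the three flags from types.py, base.yml, inference/vllm.yml, pyconfig_deprecated.py, and the
post-train distillation configs. NNX is the only path; the flags no longer exist.
As a consequence, validate_vocab_tiling drops its unused enable_nnx parameter, and the
distill_beta > 0.0 validation no longer checks enable_nnx.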
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -1171,11 +1171,6 @@ position_id_per_seconds: 25
 # Example: "8,8" to use a 8x8 subgrid (64 chips) of a full pod (16x16) of trillium.
 subslice_shape: ""
 
-# NNX
-enable_nnx: true
-pure_nnx_decoder: true
-pure_nnx: true
-
 ################################## Qwen3-Next Specific Configs ##################################
 # Kernel size for the 1D convolution in the Gated Delta Net
 gdn_conv_kernel_dim: 4
diff --git a/src/maxtext/configs/inference/vllm.yml b/src/maxtext/configs/inference/vllm.yml
@@ -16,8 +16,6 @@ base_config: "base.yml"
 attention: "vllm_rpa"
 model_call_mode: "inference"
 
-# NNX required for vLLM integration
-enable_nnx: true
 # Avoid re-initializing JAX distributed system when using vLLM
 skip_jax_distributed_system: true
 # Scanned layers are not supported with vLLM integration
diff --git a/src/maxtext/configs/post_train/distillation_gpt_oss_20b.yml b/src/maxtext/configs/post_train/distillation_gpt_oss_20b.yml
@@ -19,7 +19,6 @@ distill_alpha: 0.5
 distill_temperature: 1.0
 distill_beta: 0
 distill_layer_indices: []
-enable_nnx: True
 load_balance_loss_weight: 0.001
 
 ici_fsdp_parallelism: 32
diff --git a/src/maxtext/configs/post_train/distillation_qwen3_30b_base.yml b/src/maxtext/configs/post_train/distillation_qwen3_30b_base.yml
@@ -21,7 +21,6 @@ distill_alpha: 0.6
 distill_temperature: 1.0
 distill_beta: 1.0
 distill_layer_indices: [0,1,2,3,4,5,6,7]
-enable_nnx: True
 load_balance_loss_weight: 0.001
 
 ici_fsdp_parallelism: -1
diff --git a/src/maxtext/configs/post_train/distillation_qwen3_30b_base_pdbs8.yml b/src/maxtext/configs/post_train/distillation_qwen3_30b_base_pdbs8.yml
@@ -21,7 +21,6 @@ distill_alpha: 0.6
 distill_temperature: 1.0
 distill_beta: 1.0
 distill_layer_indices: [0,1,2,3,4,5,6,7]
-enable_nnx: True
 load_balance_loss_weight: 0.001
 
 ici_fsdp_parallelism: -1
diff --git a/src/maxtext/configs/pyconfig_deprecated.py b/src/maxtext/configs/pyconfig_deprecated.py
@@ -193,8 +193,7 @@ def validate_expert_shard_attention_option(expert_shard_attention_option: str) -
     )
 
 
-def validate_vocab_tiling(num_vocab_tiling: int, per_device_batch_size: int, max_target_length: int, enable_nnx: bool):
-  del enable_nnx  # NNX vocab tiling supported via vocab_tiling_nnx_loss in vocabulary_tiling.py
+def validate_vocab_tiling(num_vocab_tiling: int, per_device_batch_size: int, max_target_length: int):
   if (per_device_batch_size * max_target_length) % num_vocab_tiling != 0:
     raise ValueError("Per device batch size times sequence length should be divisible by the number of vocab tiles.")
 
@@ -238,9 +237,7 @@ def validate_keys(keys):
   validate_model_call_mode(keys["model_call_mode"])
   validate_prefill_and_target_lengths(keys["max_prefill_predict_length"], keys["max_target_length"])
   validate_rope_type(keys["rope_type"])
-  validate_vocab_tiling(
-      keys["num_vocab_tiling"], keys["per_device_batch_size"], keys["max_target_length"], keys["enable_nnx"]
-  )
+  validate_vocab_tiling(keys["num_vocab_tiling"], keys["per_device_batch_size"], keys["max_target_length"])
   if keys["enable_rampup_batch_size"]:
     validate_rampup_batch_size(
         keys["per_device_batch_size_start"],
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -895,11 +895,8 @@ class HardwareAndMesh(BaseModel):
       CustomRule.DEFAULT, description="Customized mesh and logical rules for granularity."
   )
   allow_split_physical_axes: bool = Field(False, description="Allow splitting physical axes for device mesh creation.")
-  enable_nnx: bool = Field(False, description="Whether to use NNX for model definition.")
   optimize_mesh_for_tpu_v6e: bool = Field(False, description="Apply transformations to the mesh for TPU v6e.")
   shardy: bool = Field(True, description="Whether to use shardy XLA backend.")
-  pure_nnx_decoder: bool = Field(False, description="Whether to enable pure NNX decoder.")
-  pure_nnx: bool = Field(False, description="Whether to enable pure NNX mode.")
   remove_size_one_mesh_axis_from_type: bool = Field(
       True, description="Whether to remove size one mesh axis from type through jax.config."
   )
@@ -2555,8 +2552,6 @@ def validate_and_set_hlo_dump_defaults():
     if self.distill_beta > 0.0:
       if not self.scan_layers:
         raise ValueError("a value of self.distill_beta > 0.0 requires self.scan_layers = True")
-      if not self.enable_nnx:
-        raise ValueError("a value of self.distill_beta > 0.0 requires self.enable_nnx = True")
 
     # Validate distillation schedule parameters
     if self.distill_alpha_end is not None and not 0.0 <= self.distill_alpha_end <= 1.0: