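Refactor sparsity handling in pre-training (always wrap params in a "params" collection; detect collections from the state structure instead of config flags; drop leftover debug print), use the sparsity-aware FP8 qwix rule in DeepSeek batch-split GMM, require qwix>=0.1.6, and refresh generated requirements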

BirdsOfAFthr · BirdsOfAFthr · commit 93b53b30b34c · 2026-04-24T04:13:33.000Z
diff --git a/src/dependencies/requirements/base_requirements/requirements.txt b/src/dependencies/requirements/base_requirements/requirements.txt
@@ -46,4 +46,4 @@ tiktoken
 tokamax!=0.1.0
 transformers
 uvloop
-qwix
+qwix>=0.1.6
diff --git a/src/dependencies/requirements/generated_requirements/cuda12-requirements.txt b/src/dependencies/requirements/generated_requirements/cuda12-requirements.txt
@@ -23,7 +23,7 @@ cffi>=2.0.0 ; platform_python_implementation != 'PyPy'
 cfgv>=3.5.0
 charset-normalizer>=3.4.6
 chex>=0.1.91
-click>=8.3.2
+click>=8.3.3
 cloud-accelerator-diagnostics>=0.1.1
 cloud-tpu-diagnostics>=0.1.5
 cloudpickle>=3.1.2
@@ -40,14 +40,14 @@ dill>=0.4.1
 distlib>=0.4.0
 distro>=1.9.0
 dm-tree>=0.1.10
-docstring-parser>=0.17.0
+docstring-parser>=0.18.0
 drjax>=0.1.4
 editdistance>=0.8.1
 einops>=0.8.2
 einshape>=1.0
 etils>=1.14.0
 execnet>=2.1.2
-fastapi>=0.135.3
+fastapi>=0.136.1
 filelock>=3.20.3
 flatbuffers>=25.12.19
 flax>=0.12.6
@@ -61,7 +61,7 @@ google-api-python-client>=2.194.0
 google-auth-httplib2>=0.3.1
 google-auth-oauthlib>=1.3.1
 google-auth>=2.49.2
-google-cloud-aiplatform>=1.147.0
+google-cloud-aiplatform>=1.148.1
 google-cloud-appengine-logging>=1.9.0
 google-cloud-audit-log>=0.5.0
 google-cloud-bigquery>=3.41.0
@@ -73,7 +73,7 @@ google-cloud-resource-manager>=1.17.0
 google-cloud-storage-control>=1.11.0
 google-cloud-storage>=3.10.1
 google-crc32c>=1.8.0
-google-genai>=1.72.0
+google-genai>=1.73.1
 google-pasta>=0.2.0
 google-resumable-media>=2.8.2
 googleapis-common-protos>=1.74.0
@@ -88,10 +88,10 @@ hf-xet>=1.4.3 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or
 httpcore>=1.0.9
 httplib2>=0.31.2
 httpx>=0.28.1
-huggingface-hub>=1.10.1
+huggingface-hub>=1.11.0
 humanize>=4.15.0
 hypothesis>=6.142.1
-identify>=2.6.18
+identify>=2.6.19
 idna>=3.11
 immutabledict>=4.3.1
 importlab>=0.8.1
@@ -155,30 +155,30 @@ opt-einsum>=3.4.0
 optax>=0.2.8
 optree>=0.19.0
 optype>=0.17.0
-orbax-checkpoint>=0.11.34
+orbax-checkpoint>=0.11.36
 orbax-export>=0.0.8
 packaging>=26.0
 pandas>=3.0.2
 parameterized>=0.9.0
-pathspec>=1.0.4
+pathspec>=1.1.0
 pathwaysutils>=0.1.7
 pillow>=12.1.1
 platformdirs>=4.9.6
 pluggy>=1.6.0
 portpicker>=1.6.0
-pre-commit>=4.5.1
+pre-commit>=4.6.0
 promise>=2.3
 propcache>=0.4.1
 proto-plus>=1.27.2
 protobuf>=6.33.6
 psutil>=7.2.2
-pyarrow>=23.0.1
+pyarrow>=24.0.0
 pyasn1-modules>=0.4.2
 pyasn1>=0.6.3
 pycnite>=2024.7.31
 pycparser>=3.0 ; implementation_name != 'PyPy' and platform_python_implementation != 'PyPy'
-pydantic-core>=2.46.0
-pydantic>=2.13.0
+pydantic-core>=2.46.3
+pydantic>=2.13.3
 pydot>=4.0.1
 pyelftools>=0.32
 pyglove>=0.4.5
@@ -193,7 +193,7 @@ python-dateutil>=2.9.0.post0
 pytokens>=0.4.1
 pytype>=2024.10.11
 pyyaml>=6.0.3
-qwix>=0.1.5
+qwix>=0.1.6
 regex>=2026.4.4
 requests-oauthlib>=2.0.0
 requests>=2.32.5
@@ -206,7 +206,7 @@ seqio>=0.0.20
 setuptools>=82.0.1
 shellingham>=1.5.4
 simple-parsing>=0.1.8
-simplejson>=3.20.2
+simplejson>=4.1.0
 six>=1.17.0
 sniffio>=1.3.1
 sortedcontainers>=2.4.0
@@ -234,17 +234,17 @@ tqdm>=4.67.3
 transformer-engine-cu12>=2.13.0
 transformer-engine-jax>=2.13.0
 transformer-engine>=2.13.0
-transformers>=5.5.4
+transformers>=5.6.1
 treescope>=0.1.10
 typeguard>=2.13.3
-typer>=0.24.1
+typer>=0.24.2
 typing-extensions>=4.15.0
 typing-inspect>=0.9.0
 typing-inspection>=0.4.2
 tzdata>=2026.1 ; sys_platform == 'emscripten' or sys_platform == 'win32'
 uritemplate>=4.2.0
 urllib3>=2.6.3
-uvicorn>=0.44.0
+uvicorn>=0.46.0
 uvloop>=0.22.1
 virtualenv>=20.36.1
 wadler-lindig>=0.1.7
diff --git a/src/dependencies/requirements/generated_requirements/tpu-post-train-requirements.txt b/src/dependencies/requirements/generated_requirements/tpu-post-train-requirements.txt
@@ -303,7 +303,7 @@ pytype>=2024.10.11
 pytz>=2025.2
 PyYAML>=6.0.3
 pyzmq>=27.1.0
-qwix>=0.1.4
+qwix>=0.1.6
 ray>=2.54.0
 referencing>=0.37.0
 regex>=2025.11.3
diff --git a/src/dependencies/requirements/generated_requirements/tpu-requirements.txt b/src/dependencies/requirements/generated_requirements/tpu-requirements.txt
@@ -23,7 +23,7 @@ cffi>=2.0.0 ; platform_python_implementation != 'PyPy'
 cfgv>=3.5.0
 charset-normalizer>=3.4.6
 chex>=0.1.91
-click>=8.3.2
+click>=8.3.3
 cloud-accelerator-diagnostics>=0.1.1
 cloud-tpu-diagnostics>=0.1.5
 cloudpickle>=3.1.2
@@ -39,14 +39,14 @@ dill>=0.4.1
 distlib>=0.4.0
 distro>=1.9.0
 dm-tree>=0.1.10
-docstring-parser>=0.17.0
+docstring-parser>=0.18.0
 drjax>=0.1.4
 editdistance>=0.8.1
 einops>=0.8.2
 einshape>=1.0
 etils>=1.14.0
 execnet>=2.1.2
-fastapi>=0.135.3
+fastapi>=0.136.0
 filelock>=3.20.3
 flatbuffers>=25.12.19
 flax>=0.12.6
@@ -60,7 +60,7 @@ google-api-python-client>=2.194.0
 google-auth-httplib2>=0.3.1
 google-auth-oauthlib>=1.3.1
 google-auth>=2.49.2
-google-cloud-aiplatform>=1.147.0
+google-cloud-aiplatform>=1.148.1
 google-cloud-appengine-logging>=1.9.0
 google-cloud-audit-log>=0.5.0
 google-cloud-bigquery>=3.41.0
@@ -72,7 +72,7 @@ google-cloud-resource-manager>=1.17.0
 google-cloud-storage-control>=1.11.0
 google-cloud-storage>=3.10.1
 google-crc32c>=1.8.0
-google-genai>=1.72.0
+google-genai>=1.73.1
 google-pasta>=0.2.0
 google-resumable-media>=2.8.2
 googleapis-common-protos>=1.74.0
@@ -87,10 +87,10 @@ hf-xet>=1.4.3 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or
 httpcore>=1.0.9
 httplib2>=0.31.2
 httpx>=0.28.1
-huggingface-hub>=1.10.1
+huggingface-hub>=1.11.0
 humanize>=4.15.0
 hypothesis>=6.142.1
-identify>=2.6.18
+identify>=2.6.19
 idna>=3.11
 immutabledict>=4.3.1
 importlab>=0.8.1
@@ -140,30 +140,30 @@ opt-einsum>=3.4.0
 optax>=0.2.8
 optree>=0.19.0
 optype>=0.17.0
-orbax-checkpoint>=0.11.34
+orbax-checkpoint>=0.11.36
 orbax-export>=0.0.8
 packaging>=26.0
 pandas>=3.0.2
 parameterized>=0.9.0
-pathspec>=1.0.4
+pathspec>=1.1.0
 pathwaysutils>=0.1.7
 pillow>=12.1.1
 platformdirs>=4.9.6
 pluggy>=1.6.0
 portpicker>=1.6.0
-pre-commit>=4.5.1
+pre-commit>=4.6.0
 promise>=2.3
 propcache>=0.4.1
 proto-plus>=1.27.2
 protobuf>=6.33.6
 psutil>=7.2.2
-pyarrow>=23.0.1
+pyarrow>=24.0.0
 pyasn1-modules>=0.4.2
 pyasn1>=0.6.3
 pycnite>=2024.7.31
 pycparser>=3.0 ; implementation_name != 'PyPy' and platform_python_implementation != 'PyPy'
-pydantic-core>=2.46.0
-pydantic>=2.13.0
+pydantic-core>=2.46.3
+pydantic>=2.13.3
 pydot>=4.0.1
 pyelftools>=0.32
 pyglove>=0.4.5
@@ -191,7 +191,7 @@ seqio>=0.0.20
 setuptools>=82.0.1
 shellingham>=1.5.4
 simple-parsing>=0.1.8
-simplejson>=3.20.2
+simplejson>=4.1.0
 six>=1.17.0
 sniffio>=1.3.1
 sortedcontainers>=2.4.0
@@ -216,17 +216,17 @@ toml>=0.10.2
 tomlkit>=0.14.0
 toolz>=1.1.0
 tqdm>=4.67.3
-transformers>=5.5.4
+transformers>=5.6.1
 treescope>=0.1.10
 typeguard>=2.13.3
-typer>=0.24.1
+typer>=0.24.2
 typing-extensions>=4.15.0
 typing-inspect>=0.9.0
 typing-inspection>=0.4.2
 tzdata>=2026.1 ; sys_platform == 'emscripten' or sys_platform == 'win32'
 uritemplate>=4.2.0
 urllib3>=2.6.3
-uvicorn>=0.44.0
+uvicorn>=0.46.0
 uvloop>=0.22.1
 virtualenv>=20.36.1
 wadler-lindig>=0.1.7
diff --git a/src/dependencies/requirements/requirements.txt b/src/dependencies/requirements/requirements.txt
@@ -32,7 +32,7 @@ pyink
 pylint
 pytest
 pytype
-qwix
+qwix>=0.1.6
 sentencepiece
 tensorboard-plugin-profile
 tensorboardx
diff --git a/src/maxtext/models/deepseek_batchsplit_fp8.py b/src/maxtext/models/deepseek_batchsplit_fp8.py
@@ -959,7 +959,7 @@ def gmm(
           use_qwix_quantization=config.use_qwix_quantization,
           use_tokamax_backend=config.use_tokamax_gmm,
           weight_gather_axes=weight_gather_axes,
-          qwix_rule=quantizations.get_fp8_full_qwix_rule(config),
+          qwix_rule=quantizations.get_fp8_full_qwix_rule_w_sparsity(config),
       )
     else:
       output = tokamax.ragged_dot(
diff --git a/src/maxtext/trainers/pre_train/train.py b/src/maxtext/trainers/pre_train/train.py
@@ -127,14 +127,9 @@ def loss_fn(model, config, data, dropout_rng, params, sparsity_state=None, is_tr
     rng1, aqt_rng = jax.random.split(dropout_rng)
 
     # Flax Linen model
-    if sparsity_enabled:
-      model_vars = {"params": params}
-    else:
-      model_vars = params
-
-    if sparsity_state and sparsity_enabled:
+    model_vars = {"params": params}
+    if sparsity_state:
       model_vars["batch_stats"] = sparsity_state
-
     logits, intermediate_outputs = model.apply(
         model_vars,
         data["inputs"],
@@ -341,16 +336,20 @@ def train_step(model, config, state_mesh_shardings, params_shardings, state, dat
           params,
           params_shardings,
       )
-    sparsity_enabled = config.weight_sparsity_n and config.weight_sparsity_m
-    pure_params = params["params"] if sparsity_enabled else params
+    pure_params = params["params"] if "params" in params else params
     batch_stats = params.get("batch_stats", {})
 
     grad_func = jax.value_and_grad(_loss_fn, argnums=4, has_aux=True)
-
-    kwargs = {"is_train": True}
-    if sparsity_enabled:
-      kwargs["sparsity_state"] = batch_stats
-    (loss, aux), raw_grads = grad_func(model, config, data, dropout_rng, pure_params, *extra_dpo_args, kwargs)
+    (loss, aux), raw_grads = grad_func(
+        model,
+        config,
+        data,
+        dropout_rng,
+        pure_params,
+        *extra_dpo_args,
+        sparsity_state=batch_stats,
+        is_train=True,
+    )
 
   raw_grads = jax.tree_util.tree_map(
       lambda x: x.astype(config.grad_dtype) if x.dtype == jnp.float32 else x,
@@ -425,10 +424,9 @@ def move(path, value):
         )
     )
   # Re-wrap grads to match state.params structure if it's a dict of collections
-  sparsity_enabled = config.weight_sparsity_n and config.weight_sparsity_m
-  if sparsity_enabled:
+  if isinstance(state.params, dict) and "params" in state.params:
     full_grads = {"params": grads}
-    if sparsity_enabled and "batch_stats" in state.params:
+    if "batch_stats" in state.params:
       batch_stats_grads = jax.tree_util.tree_map(jnp.zeros_like, state.params.get("batch_stats", {}))
       full_grads["batch_stats"] = batch_stats_grads
     full_grads = max_utils.unbox_logicallypartioned(full_grads)
@@ -461,7 +459,6 @@ def move(path, value):
       and "batch_stats" in state.params
   )
 
-  jax.debug.print("amanda has_batch_stats: {s}", s=has_batch_stats)
   if has_batch_stats:
     new_params = dict(new_state.params)
     new_params["batch_stats"] = max_utils.unbox_logicallypartioned(aux["batch_stats"])
@@ -524,15 +521,11 @@ def eval_step(model, config, state, data, dropout_rng):
     extra_dpo_args = [reference_params]
     _loss_fn = dpo_loss_fn
 
-  sparsity_enabled = config.weight_sparsity_n and config.weight_sparsity_m
-  pure_params = state.params["params"] if sparsity_enabled else state.params
+  pure_params = state.params["params"] if "params" in state.params else state.params
   batch_stats = state.params.get("batch_stats", {})
 
   eval_loss_fn = functools.partial(_loss_fn, model, config, data, dropout_rng, is_train=False)
-  kwargs = {}
-  if sparsity_enabled:
-    kwargs["sparsity_state"] = batch_stats
-  loss, aux = eval_loss_fn(pure_params, *extra_dpo_args, **kwargs)
+  loss, aux = eval_loss_fn(pure_params, *extra_dpo_args, sparsity_state=batch_stats)
 
   mtp_acceptance_rate = 0.0
   if config.mtp_eval_target_module > 0:

Original file line number	Diff line number	Diff line change
`@@ -959,7 +959,7 @@ def gmm(`
`959`	`959`	`use_qwix_quantization=config.use_qwix_quantization,`
`960`	`960`	`use_tokamax_backend=config.use_tokamax_gmm,`
`961`	`961`	`weight_gather_axes=weight_gather_axes,`
`962`		`- qwix_rule=quantizations.get_fp8_full_qwix_rule(config),`
	`962`	`+ qwix_rule=quantizations.get_fp8_full_qwix_rule_w_sparsity(config),`
`963`	`963`	`)`
`964`	`964`	`else:`
`965`	`965`	`output = tokamax.ragged_dot(`