Commit 2b06b9c

Merge pull request #3144 from AI-Hypercomputer:hengtaoguo-reckpt
PiperOrigin-RevId: 871568310
2 parents fa4e1e7 + 9930f1e commit 2b06b9c

92 files changed

Lines changed: 172 additions & 187 deletions
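The file diffs below are dominated by one package rename, from `MaxText.utils.ckpt_conversion` to `maxtext.checkpoint_conversion`. As a quick orientation, here is a minimal before/after import sketch in Python; the import shown appears verbatim in the `src/MaxText/integration/tunix/utils.py` diff further down.

```python
# Old package path (removed in this commit):
from MaxText.utils.ckpt_conversion.utils.param_mapping import PARAM_MAPPING

# New package path (added in this commit):
from maxtext.checkpoint_conversion.utils.param_mapping import PARAM_MAPPING
```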


.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 # Model bring-up
 src/MaxText/assets @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @gagika @shralex @richjames0 @NicoGrande
 src/MaxText/configs/models @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @gagika @shralex @richjames0 @NicoGrande @suexu1025 @jesselu-google
-src/MaxText/utils/ckpt_conversion @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @hengtaoguo @gagika @shralex @richjames0 @NicoGrande
+src/maxtext/checkpoint_conversion @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @hengtaoguo @gagika @shralex @richjames0 @NicoGrande
 src/MaxText/layers @parambole @shuningjin @RissyRan @suexu1025 @jiangjy1982 @gobbleturk @bvandermoon @gagika @shralex @richjames0 @NicoGrande @suexu1025 @jesselu-google

 # Features

docs/guides/checkpointing_solutions/convert_checkpoint.md

Lines changed: 9 additions & 9 deletions

@@ -1,6 +1,6 @@
 # Checkpoint conversion utilities

-This guide provides instructions for using the [scripts](https://github.com/AI-Hypercomputer/maxtext/tree/main/src/MaxText/utils/ckpt_conversion) that convert model checkpoints bidirectionally between Hugging Face and MaxText formats.
+This guide provides instructions for using the [scripts](https://github.com/AI-Hypercomputer/maxtext/tree/main/src/MaxText/checkpoint_conversion) that convert model checkpoints bidirectionally between Hugging Face and MaxText formats.

 ## Supported models

@@ -66,7 +66,7 @@ export LAZY_LOAD_TENSORS=<Flag to lazy load> # True to use lazy load, False to u
 Finally, run below command to complete the conversion

 ```bash
-python3 -m MaxText.utils.ckpt_conversion.to_maxtext maxtext/configs/base.yml \
+python3 -m maxtext.checkpoint_conversion.to_maxtext maxtext/configs/base.yml \
     model_name=${HF_MODEL} \
     hf_access_token=${HF_TOKEN} \
     base_output_directory=${MODEL_CHECKPOINT_DIRECTORY} \
@@ -90,7 +90,7 @@ python3 -m MaxText.utils.ckpt_conversion.to_maxtext maxtext/configs/base.yml \
 - `checkpoint_storage_use_zarr3`: # Set to True to use zarr3 format (recommended for McJAX); set to False for Pathways.
 - `checkpoint_storage_use_ocdbt`: # Set to True to use OCDBT format (recommended for McJAX); set to False for Pathways.
 - `--lazy_load_tensors` (optional): If `true`, loads Hugging Face weights on-demand to minimize RAM usage. For large models, it is recommended to use the `--lazy_load_tensors=true` flag to reduce memory usage during conversion. For example, converting a Llama3.1-70B model with `--lazy_load_tensors=true` uses around 200GB of RAM and completes in ~10 minutes.
-- `--hf_model_path` (optional): Specifies a local or remote directory containing the model weights. If unspecified, we use the [default Hugging Face repository ID](https://github.com/AI-Hypercomputer/maxtext/blob/0d909c44391539db4e8cc2a33de9d77a891beb31/src/MaxText/utils/ckpt_conversion/utils/utils.py#L58-L85) (e.g., openai/gpt-oss-20b). This is necessary for locally dequantized models like GPT-OSS or DeepSeek.
+- `--hf_model_path` (optional): Specifies a local or remote directory containing the model weights. If unspecified, we use the [default Hugging Face repository ID](https://github.com/AI-Hypercomputer/maxtext/blob/0d909c44391539db4e8cc2a33de9d77a891beb31/src/MaxText/checkpoint_conversion/utils/utils.py#L58-L85) (e.g., openai/gpt-oss-20b). This is necessary for locally dequantized models like GPT-OSS or DeepSeek.

 Above command will download the Hugging Face model to local machine, convert it to the MaxText format and save it to `${MODEL_CHECKPOINT_DIRECTORY}/0/items`.

@@ -104,7 +104,7 @@ Use the `to_huggingface.py` script to convert a MaxText checkpoint into the Hugg
 The following command converts a MaxText checkpoint and saves it locally, to GCS, or uploads it directly to the Hugging Face Hub.

 ```bash
-python3 -m MaxText.utils.ckpt_conversion.to_huggingface src/maxtext/configs/base.yml \
+python3 -m maxtext.checkpoint_conversion.to_huggingface src/maxtext/configs/base.yml \
     model_name=<MODEL_NAME> \
     load_parameters_path=<path-to-maxtext-checkpoint> \
     base_output_directory=<path-to-save-converted-checkpoint> \
@@ -212,12 +212,12 @@ To extend conversion support to a new model architecture, you must define its sp

 1. **Add parameter mappings**:

-   - In [`utils/param_mapping.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/utils/param_mapping.py), add the parameter name mappings(`def {MODEL}_MAXTEXT_TO_HF_PARAM_MAPPING`). This is the 1-to-1 mappings of parameters names per layer.
-   - In [`utils/param_mapping.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/utils/param_mapping.py), add the `hook_fn` logic (`def {MODEL}_MAXTEXT_TO_HF_PARAM_HOOK_FN`). This is the transformation needed per layer.
+   - In [`utils/param_mapping.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/checkpoint_conversion/utils/param_mapping.py), add the parameter name mappings(`def {MODEL}_MAXTEXT_TO_HF_PARAM_MAPPING`). This is the 1-to-1 mappings of parameters names per layer.
+   - In [`utils/param_mapping.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/checkpoint_conversion/utils/param_mapping.py), add the `hook_fn` logic (`def {MODEL}_MAXTEXT_TO_HF_PARAM_HOOK_FN`). This is the transformation needed per layer.

-2. **Add Hugging Face weights Shape**: In [`utils/hf_shape.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/utils/hf_shape.py), define the tensor shape of Hugging Face format (`def {MODEL}_HF_WEIGHTS_TO_SHAPE`). This is used to ensure the tensor shape is matched after to_huggingface conversion.
-3. **Register model key**: In [`utils/utils.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/utils/utils.py), add the new model key in `HF_IDS`.
-4. **Add transformer config**: In [`utils/hf_model_configs.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/utils/hf_model_configs.py), add the `transformers.Config` object, describing the Hugging Face model configuration (defined in ['src/maxtext/configs/models'](https://github.com/AI-Hypercomputer/maxtext/tree/main/src/maxtext/configs/models)). **Note**: This configuration must precisely match the MaxText model's architecture.
+2. **Add Hugging Face weights Shape**: In [`utils/hf_shape.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/checkpoint_conversion/utils/hf_shape.py), define the tensor shape of Hugging Face format (`def {MODEL}_HF_WEIGHTS_TO_SHAPE`). This is used to ensure the tensor shape is matched after to_huggingface conversion.
+3. **Register model key**: In [`utils/utils.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/checkpoint_conversion/utils/utils.py), add the new model key in `HF_IDS`.
+4. **Add transformer config**: In [`utils/hf_model_configs.py`](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/checkpoint_conversion/utils/hf_model_configs.py), add the `transformers.Config` object, describing the Hugging Face model configuration (defined in ['src/maxtext/configs/models'](https://github.com/AI-Hypercomputer/maxtext/tree/main/src/maxtext/configs/models)). **Note**: This configuration must precisely match the MaxText model's architecture.

 Here is an example [PR to add support for gemma3 multi-modal model](https://github.com/AI-Hypercomputer/maxtext/pull/1983)
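For reference, the command fragments shown in this diff can be assembled into a single invocation of the renamed module. This is an illustrative sketch only: the model name, token, and output bucket are placeholder values, and the flag settings follow the McJAX recommendations quoted above.

```bash
# Illustrative only: values below are placeholders, not defaults.
export HF_MODEL=llama3.1-8b                                   # example model name
export HF_TOKEN=hf_...                                        # your Hugging Face access token
export MODEL_CHECKPOINT_DIRECTORY=gs://your-bucket/llama3.1-8b-ckpt   # example output path

python3 -m maxtext.checkpoint_conversion.to_maxtext maxtext/configs/base.yml \
    model_name=${HF_MODEL} \
    hf_access_token=${HF_TOKEN} \
    base_output_directory=${MODEL_CHECKPOINT_DIRECTORY} \
    checkpoint_storage_use_zarr3=True \
    checkpoint_storage_use_ocdbt=True \
    --lazy_load_tensors=true   # optional flag described in the guide above
```

Per the guide text above, the converted checkpoint would then land under `${MODEL_CHECKPOINT_DIRECTORY}/0/items`.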

docs/tutorials/first_run.md

Lines changed: 1 addition & 1 deletion

@@ -75,7 +75,7 @@ In the same TPU VM where you just installed all the dependencies of MaxText, You

 #### Decoding in MaxText via notebook

-You can use [demo_decoding.ipynb](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/maxtext/examples/demo_decoding.ipynb) to try out decoding on MaxText's `Llama3.1-8b` model implementation. In this notebook, we give `"I love to"` as the prompt, and the greedily sampled first output token is `" cook"`. Please remember to provide the path to your `Llama3.1-8b` checkpoint for the `load_parameters_path` argument in the config inside the notebook. You can use [to_maxtext.py](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/to_maxtext.py) to create a MaxText/Orbax checkpoint from a Huggingface checkpoint.
+You can use [demo_decoding.ipynb](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/maxtext/examples/demo_decoding.ipynb) to try out decoding on MaxText's `Llama3.1-8b` model implementation. In this notebook, we give `"I love to"` as the prompt, and the greedily sampled first output token is `" cook"`. Please remember to provide the path to your `Llama3.1-8b` checkpoint for the `load_parameters_path` argument in the config inside the notebook. You can use [to_maxtext.py](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/checkpoint_conversion/to_maxtext.py) to create a MaxText/Orbax checkpoint from a Huggingface checkpoint.

 ### Run MaxText on NVIDIA GPUs

docs/tutorials/posttraining/knowledge_distillation.md

Lines changed: 1 addition & 1 deletion

@@ -132,7 +132,7 @@ python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
 export PRE_TRAINED_MODEL_CKPT_DIRECTORY=${BASE_DIRECTORY}/llama3.1-8b-ckpt

 # Convert to MaxText format
-python3 -m MaxText.utils.ckpt_conversion.to_maxtext src/maxtext/configs/base.yml \
+python3 -m maxtext.checkpoint_conversion.to_maxtext src/maxtext/configs/base.yml \
     model_name=llama3.1-8b \
     hf_access_token=${HF_TOKEN} \
     base_output_directory=${PRE_TRAINED_MODEL_CKPT_DIRECTORY} \

docs/tutorials/posttraining/multimodal.md

Lines changed: 3 additions & 3 deletions

@@ -25,7 +25,7 @@ Multimodal Large Language Models (LLMs) extend traditional text-only models by i

 ## Checkpoint Conversion

-Recently we have onboarded a new centralized tool for bidirectional checkpoint conversion between MaxText and HuggingFace ([README](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/README.md)).
+Recently we have onboarded a new centralized tool for bidirectional checkpoint conversion between MaxText and HuggingFace ([README](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/checkpoint_conversion/README.md)).

 Install pytorch:

@@ -38,7 +38,7 @@ Then use this command to convert an unscanned checkpoint from HuggingFace to Max
 ```shell
 export HF_ACCESS_TOKEN=hf_...
 export MAXTEXT_CKPT_GCS_PATH=gs://...
-python -m MaxText.utils.ckpt_conversion.to_maxtext maxtext/configs/base.yml \
+python -m maxtext.checkpoint_conversion.to_maxtext maxtext/configs/base.yml \
     model_name=gemma3-4b \
     hf_access_token=$HF_ACCESS_TOKEN \
     base_output_directory=$MAXTEXT_CKPT_GCS_PATH \
@@ -51,7 +51,7 @@ For the Llama4 model family, we are using a separate checkpoint conversion scrip
 ```shell
 export LOCAL_HF_MODEL_PATH=...  # Need to pre-download the safetensors from HuggingFace
 export MAXTEXT_CKPT_GCS_PATH=gs://...
-python -m MaxText.utils.ckpt_scripts.llama4_ckpt_unscanned \
+python -m maxtext.checkpoint_conversion.standalone_scripts.llama4_ckpt_unscanned \
     --model-size=llama4-17b-16e \
     --huggingface-checkpoint=True \
     --base-model-path=$LOCAL_HF_MODEL_PATH \

src/MaxText/integration/tunix/tunix_adapter.py

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@
 from flax import nnx
 from MaxText.layers.models import Transformer
 from MaxText.integration.tunix.utils import VllmWeightMapping
-from MaxText.utils.ckpt_conversion.utils.hf_model_configs import HF_MODEL_CONFIGS  # pylint: disable=ungrouped-imports
+from maxtext.checkpoint_conversion.utils.hf_model_configs import HF_MODEL_CONFIGS  # pylint: disable=ungrouped-imports


 class TunixMaxTextAdapter(nnx.Module):

src/MaxText/integration/tunix/utils.py

Lines changed: 2 additions & 2 deletions

@@ -17,8 +17,8 @@
 import re

 import MaxText.integration.tunix.weight_mapping as weight_mapping  # pylint: disable=consider-using-from-import
-from MaxText.utils.ckpt_conversion.utils.param_mapping import PARAM_MAPPING
-from MaxText.utils.ckpt_conversion.utils.param_mapping import VLLM_HOOK_FNS
+from maxtext.checkpoint_conversion.utils.param_mapping import PARAM_MAPPING
+from maxtext.checkpoint_conversion.utils.param_mapping import VLLM_HOOK_FNS

 STANDALONE_VLLM_WEIGHT_MAPPING = weight_mapping.StandaloneVllmWeightMapping()

src/MaxText/rl/train_rl.py

Lines changed: 4 additions & 10 deletions

@@ -84,9 +84,9 @@ def get_maxtext_model(config, devices=None):
 """
 Load MaxText model with Tunix adapter.
 # Note: pass the path to your scanned checkpoint for 'load_parameters_path'.
-# To create a scanned checkpoint, you can use /maxtext/src/MaxText/utils/ckpt_conversion/to_maxtext.py and if
+# To create a scanned checkpoint, you can use /maxtext/src/MaxText/checkpoint_conversion/to_maxtext.py and if
 # using Pathways, please set `checkpoint_storage_use_ocdbt=False checkpoint_storage_use_zarr3=False`
-# python src/MaxText/utils/ckpt_conversion/to_maxtext.py \
+# python src/MaxText/checkpoint_conversion/to_maxtext.py \
 # --model_name="gemma2-2b" \
 # --base_output_directory="/path/to/your/output/directory" \
 # --scan_layers=True \
@@ -321,10 +321,7 @@ def _filter_long_prompts(x):
 train_dataset = train_dataset[:dataset_size]
 train_dataset = train_dataset.repeat(trainer_config.num_epoch)

-train_dataset = (
-    train_dataset.to_iter_dataset()
-    .batch(trainer_config.batch_size)
-)
+train_dataset = train_dataset.to_iter_dataset().batch(trainer_config.batch_size)

 eval_dataset_name = getattr(trainer_config, "eval_dataset_name", None)
 if not eval_dataset_name:
@@ -342,10 +339,7 @@ def _filter_long_prompts(x):
 test_dataset = test_dataset.filter(_filter_long_prompts)
 test_dataset = test_dataset[: trainer_config.num_test_batches * trainer_config.batch_size]

-test_dataset = (
-    test_dataset.to_iter_dataset()
-    .batch(trainer_config.batch_size)
-)
+test_dataset = test_dataset.to_iter_dataset().batch(trainer_config.batch_size)

 # Load reference model
 max_logging.log("Creating reference model and also meshes for reference and rollout")
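The `to_iter_dataset().batch()` chain reformatted above comes from the dataset pipeline in `train_rl.py`. As a standalone illustration of what that chain does, here is a hedged sketch; it assumes Google's grain library (whose top-level import path may vary by version), and the source data and batch size are made up for the example.

```python
# Sketch of the MapDataset -> IterDataset batching pattern used above.
# Assumes a recent grain release exposing MapDataset at the top level.
import grain

source = list(range(8))                         # stand-in for tokenized prompts
ds = grain.MapDataset.source(source)            # random-access (map-style) dataset
ds = ds.to_iter_dataset().batch(batch_size=4)   # stream it and emit fixed-size batches

for batch in ds:
    print(batch)                                # two batches of four elements each
```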

src/MaxText/utils/ckpt_conversion/utils/__init__.py

Lines changed: 0 additions & 13 deletions
This file was deleted.
File renamed without changes.
