
Commit 5182e3b

Merge pull request #3450 from AI-Hypercomputer:anisha-from-pretrained3
PiperOrigin-RevId: 902844587
2 parents: 9608068 + ef03866

16 files changed: 399 additions & 204 deletions


New file (Bazel BUILD)

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(
+    default_applicable_licenses = ["//third_party/py/maxtext:license"],
+    default_visibility = ["//third_party/py/maxtext:__subpackages__"],
+)
+
+filegroup(
+    name = "param_mapping_file",
+    srcs = ["param_mapping.py"],
+    visibility = ["//third_party/py/maxtext:__pkg__"],
+)
+
+filegroup(
+    name = "hf_model_configs_file",
+    srcs = ["hf_model_configs.py"],
+    visibility = ["//third_party/py/maxtext:__pkg__"],
+)

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 3 additions & 3 deletions

@@ -20,7 +20,7 @@
 import transformers

 if transformers.__version__ >= "5.0.0":
-  from transformers.configuration_utils import PreTrainedConfig as PTConfig
+  from transformers.configuration_utils import PreTrainedConfig as PTConfig  # pytype: disable=import-error
 else:
   from transformers.configuration_utils import PretrainedConfig as PTConfig

@@ -151,8 +151,8 @@
   gemma4_31b_config = transformers.Gemma4Config(**gemma4_31b_dict)
 except AttributeError:
   # Graceful fallback to raw dict-based PTConfig if Gemma 4 is natively missing
-  gemma4_26b_config = PTConfig(**gemma4_26b_dict)
-  gemma4_31b_config = PTConfig(**gemma4_31b_dict)
+  gemma4_26b_config = PTConfig(**gemma4_26b_dict)  # pytype: disable=wrong-arg-types
+  gemma4_31b_config = PTConfig(**gemma4_31b_dict)  # pytype: disable=wrong-arg-types


 gemma3_4b_config = transformers.Gemma3Config(
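
The guarded import above exists because transformers 5.x renamed PretrainedConfig to PreTrainedConfig. For context, here is a minimal sketch of an equivalent shim using a try/except import instead of the commit's version-string comparison (string comparison is lexicographic, so e.g. "10.0.0" < "5.0.0" as strings); only the alias name PTConfig is taken from the diff, the rest is illustrative:

# Pick whichever config base class the installed transformers provides.
try:
  from transformers.configuration_utils import PreTrainedConfig as PTConfig  # 5.x name
except ImportError:
  from transformers.configuration_utils import PretrainedConfig as PTConfig  # 4.x name

# Either class accepts arbitrary kwargs, so it can wrap a raw config dict
# when a model-specific config class (e.g. Gemma4Config) is unavailable.
cfg = PTConfig(**{"hidden_size": 1024, "num_attention_heads": 8})
print(type(cfg).__name__, cfg.hidden_size)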

src/maxtext/configs/post_train/rl.yml

Lines changed: 1 addition & 0 deletions

@@ -96,6 +96,7 @@ checkpoint_storage_use_ocdbt: False # For Pathways
 checkpoint_storage_use_zarr3: False # For Pathways
 use_pathways: True
 log_period: 20
+convert_checkpoint_if_possible: True

 # ====== Debugging ======
 debug:
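
The new flag participates in the usual override chain (YAML defaults, then CLI, then kwargs), so it can be flipped per run without editing rl.yml. A hypothetical invocation through the initialize entry point shown in the pyconfig diff below; the import path is inferred from this tree's layout and the config path is illustrative:

from maxtext.configs import pyconfig

# kwargs are merged over the YAML values, so this run disables
# on-the-fly checkpoint conversion even though rl.yml sets it to True.
config = pyconfig.initialize(
    ["", "src/maxtext/configs/post_train/rl.yml"],
    convert_checkpoint_if_possible=False,
)
print(config.convert_checkpoint_if_possible)  # False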

src/maxtext/configs/pyconfig.py

Lines changed: 59 additions & 10 deletions

@@ -77,16 +77,48 @@ def _module_from_path(path: str) -> str | None:
   return None


-def _resolve_or_infer_config(argv: list[str]) -> tuple[str, list[str]]:
+def _resolve_or_infer_config(argv: list[str] | None = None, **kwargs) -> tuple[str, list[str]]:
   """Resolves or infers the config file path from the module."""
+  if argv is None:
+    argv = [""]
+
+  if kwargs.get("base_config"):
+    logger.info("Using config: %s", kwargs["base_config"])
+    return resolve_config_path(kwargs["base_config"]), argv[1:]
+
+  # When passing at least two arguments via the list (no kwargs), the first must be
+  # either "" or a Python script such as train_rl.py or train.py;
+  # the second argument is the YAML file.
   if len(argv) >= 2 and argv[1].endswith(".yml"):
     return resolve_config_path(argv[1]), argv[2:]
-  module = _module_from_path(argv[0])
+  module = _module_from_path(argv[0]) if len(argv) > 0 else None
   if module not in _CONFIG_FILE_MAPPING:
-    raise ValueError(f"No config file provided and no default config found for module '{module}'")
-  config_path = os.path.join(MAXTEXT_CONFIGS_DIR, _CONFIG_FILE_MAPPING[module])
-  logger.warning("No config file provided, using default config mapping: %s", config_path)
-  return config_path, argv[1:]
+    config_path = os.path.join(MAXTEXT_CONFIGS_DIR, "base.yml")
+    logger.warning("No config file provided and no default config found for module '%s', using base.yml", module)
+  else:
+    config_path = os.path.join(MAXTEXT_CONFIGS_DIR, _CONFIG_FILE_MAPPING[module])
+    logger.warning("No config file provided, using default config mapping: %s", config_path)
+  remaining_argv = argv[1:]
+
+  return config_path, remaining_argv
+
+
+def _resolve_or_infer_addl_config(**kwargs):
+  """Resolves or infers additional configs not supplied by the caller."""
+  inferred_kwargs = {}
+  # If the base_output_directory key is not set, fall back to a local directory.
+  if not kwargs.get("base_output_directory"):
+    max_logging.warning("base_output_directory is not provided; using local directory maxtext_output")
+    base_output_directory = os.path.abspath("maxtext_output")
+    inferred_kwargs["base_output_directory"] = base_output_directory
+
+  # If the hf_access_token key is not set, fall back to the HF_TOKEN env var.
+  if not kwargs.get("hf_access_token"):
+    hf_access_token = os.environ.get("HF_TOKEN")
+    if hf_access_token:
+      inferred_kwargs["hf_access_token"] = hf_access_token
+
+  return inferred_kwargs


 def yaml_key_to_env_key(s: str) -> str:

@@ -289,28 +321,35 @@ def get_keys(self) -> dict[str, Any]:
     return self._flat_config


-def initialize(argv: list[str], **kwargs) -> HyperParameters:
+def initialize(argv: list[str] | None = None, **kwargs) -> HyperParameters:
   """Initializes the configuration by loading YAML files, and applying CLI, env, and kwarg overrides."""
   pydantic_config = initialize_pydantic(argv, **kwargs)
   config = HyperParameters(pydantic_config)
   return config


-def initialize_pydantic(argv: list[str], **kwargs) -> MaxTextConfig:
+def initialize_pydantic(argv: list[str] | None = None, **kwargs) -> MaxTextConfig:
   """Initializes the configuration by loading YAML files, and applying CLI, env, and kwarg overrides.

   Returns the pydantic MaxTextConfig class, whereas `initialize` returns the original `HyperParameters`.
   """
   # 1. Load base and inherited configs from file(s)
-  config_path, cli_args = _resolve_or_infer_config(argv)
+  config_path, cli_args = _resolve_or_infer_config(argv, **kwargs)
   base_yml_config = _load_config(config_path)

   # 2. Get overrides from CLI and kwargs
   cli_cfg = omegaconf.OmegaConf.from_cli(cli_args)
   kwargs_cfg = omegaconf.OmegaConf.create(kwargs)
   overrides_cfg = omegaconf.OmegaConf.merge(cli_cfg, kwargs_cfg)

-  # 3. Handle model-specific config
+  temp_cfg1 = omegaconf.OmegaConf.merge(base_yml_config, overrides_cfg)
+  # 3.1. Infer more configs if possible
+  temp_cfg1 = _resolve_or_infer_addl_config(**temp_cfg1)
+  # Update overrides_cfg with the inferred values
+  overrides_cfg = omegaconf.OmegaConf.merge(overrides_cfg, temp_cfg1)
   temp_cfg = omegaconf.OmegaConf.merge(base_yml_config, overrides_cfg)
+
+  # 3.2. Handle model-specific config
+
   model_name = temp_cfg.get("model_name", "default")
   # The architecture for -Instruct v/s base models is the same, so to identify the
   # architecture we strip "-Instruct" from the model_name to get the base model name

@@ -437,3 +476,13 @@ def initialize_pydantic(argv: list[str], **kwargs) -> MaxTextConfig:
 # Shim for backward compatibility with pyconfig_deprecated_test.py
 validate_and_update_keys = pyconfig_deprecated.validate_and_update_keys
 __all__ = ["initialize", "initialize_pydantic"]
+
+
+class _CallablePyconfigModule(sys.modules[__name__].__class__):
+  """Allows calling the module directly as mt.pyconfig()."""
+
+  def __call__(self, argv: list[str] | None = None, **kwargs) -> HyperParameters:
+    return initialize(argv, **kwargs)
+
+
+sys.modules[__name__].__class__ = _CallablePyconfigModule
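
The _CallablePyconfigModule shim at the end makes the pyconfig module itself callable, so mt.pyconfig(...) becomes shorthand for pyconfig.initialize(...). A self-contained sketch of the same trick, with an illustrative module name and payload rather than the PR's actual config logic:

# callable_mod.py
import sys


def initialize(argv=None, **kwargs):
  """Stand-in for pyconfig.initialize."""
  return {"argv": argv, **kwargs}


class _CallableModule(sys.modules[__name__].__class__):
  """Routes calls on the module object itself to initialize()."""

  def __call__(self, argv=None, **kwargs):
    return initialize(argv, **kwargs)


# Reassigning a module's __class__ to a ModuleType subclass is the documented
# CPython hook for giving a module special-method behavior.
sys.modules[__name__].__class__ = _CallableModule

After this runs, `import callable_mod; callable_mod(run_name="test")` behaves exactly like `callable_mod.initialize(run_name="test")`.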

src/maxtext/configs/types.py

Lines changed: 5 additions & 0 deletions

@@ -1950,6 +1950,11 @@ class DerivedValues(BaseModel):
       None,
       description="The full path to the checkpoint directory, derived from `run_name`.",
   )
+  convert_checkpoint_if_possible: bool = Field(
+      False,
+      description="Whether to convert checkpoint on the fly if not provided via\
+      load_parameters_path or base_output_directory",
+  )
   metrics_dir: None | str = Field(
       None,
       description="The full path to the metrics directory, derived from `run_name`.",

src/maxtext/examples/rl_llama3_demo.ipynb

Lines changed: 2 additions & 22 deletions

@@ -135,27 +135,7 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-  "source": [
-   "import datetime\n",
-   "import os\n",
-   "import sys\n",
-   "import subprocess\n",
-   "from pathlib import Path\n",
-   "from huggingface_hub import login\n",
-   "from etils import epath\n",
-   "import jax\n",
-   "\n",
-   "from maxtext.trainers.post_train.rl.train_rl import rl_train, setup_configs_and_devices\n",
-   "from maxtext.utils.globals import MAXTEXT_REPO_ROOT, MAXTEXT_PKG_DIR\n",
-   "\n",
-   "os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"0\"\n",
-   "os.environ[\"SKIP_JAX_PRECOMPILE\"] = \"1\" # Faster startup for vLLM\n",
-   "# Suppress vLLM logging with a severity level below ERROR\n",
-   "os.environ[\"VLLM_LOGGING_LEVEL\"] = \"ERROR\"\n",
-   "\n",
-   "\n",
-   "print(f\"MaxText installation path: {MAXTEXT_PKG_DIR}\")"
-  ]
+  "source": "import datetime\nimport os\nimport sys\nimport subprocess\nfrom pathlib import Path\nfrom huggingface_hub import login\nfrom etils import epath\nimport jax\n\nfrom maxtext.trainers.post_train.rl.train_rl import rl_train\nfrom maxtext.utils.model_creation_utils import setup_configs_and_devices\nfrom maxtext.utils.globals import MAXTEXT_REPO_ROOT, MAXTEXT_PKG_DIR\n\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"0\"\nos.environ[\"SKIP_JAX_PRECOMPILE\"] = \"1\" # Faster startup for vLLM\n# Suppress vLLM logging with a severity level below ERROR\nos.environ[\"VLLM_LOGGING_LEVEL\"] = \"ERROR\"\n\n\nprint(f\"MaxText installation path: {MAXTEXT_PKG_DIR}\")"
  },
  {
   "cell_type": "code",

@@ -386,4 +366,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}

src/maxtext/inference/vllm_decode.py

Lines changed: 1 addition & 1 deletion

@@ -241,7 +241,7 @@ def main(argv: Sequence[str]) -> None:
   config = pyconfig.initialize(argv)

   if FLAGS.use_tunix:
-    maxtext_model, mesh = model_creation_utils.create_nnx_model(config)
+    maxtext_model, mesh = model_creation_utils.from_pretrained(config)
     decode_with_tunix(config, model=maxtext_model, mesh=mesh)
   else:
     decode_with_vllm(config)
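
This PR renames model_creation_utils.create_nnx_model to from_pretrained, echoing the Hugging Face idiom. Judging only from the call sites in this diff, the function returns a (model, mesh) tuple when no mesh is passed and just the model when an existing mesh is supplied; a hypothetical sketch under that assumption, with an illustrative config path (the model_creation_utils import path matches the notebook diff above):

from maxtext.configs import pyconfig
from maxtext.utils import model_creation_utils

config = pyconfig.initialize(["", "src/maxtext/configs/base.yml"])

# No mesh given: from_pretrained builds one and returns it with the model.
model, mesh = model_creation_utils.from_pretrained(config)

# Mesh given (as in adapter.py and train_distill.py below): model only.
model = model_creation_utils.from_pretrained(config, mesh=mesh)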
New file (Bazel BUILD)

Lines changed: 60 additions & 0 deletions

@@ -0,0 +1,60 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load(
+    "//devtools/python/blaze:pytype.bzl",
+    "pytype_strict_library",
+)
+
+package(
+    default_applicable_licenses = ["//third_party/py/maxtext:license"],
+    default_visibility = ["//third_party/py/maxtext:__subpackages__"],
+)
+
+pytype_strict_library(
+    name = "weight_mapping",
+    srcs = [
+        "weight_mapping/__init__.py",
+        "weight_mapping/deepseek3.py",
+        "weight_mapping/gpt_oss.py",
+        "weight_mapping/llama3.py",
+        "weight_mapping/qwen2.py",
+        "weight_mapping/qwen3.py",
+    ],
+    deps = [
+        "//third_party/py/jax",
+        "//third_party/py/numpy",
+    ],
+)
+
+pytype_strict_library(
+    name = "utils",
+    srcs = ["utils.py"],
+    deps = [
+        ":weight_mapping",
+        "//third_party/py/maxtext:checkpoint_conversion_utils_param_mapping",
+    ],
+)
+
+pytype_strict_library(
+    name = "tunix_adapter",
+    srcs = ["tunix_adapter.py"],
+    deps = [
+        ":utils",
+        "//third_party/py/flax/nnx",
+        "//third_party/py/jax",
+        "//third_party/py/maxtext:checkpoint_conversion_utils_hf_model_configs",
+        "//third_party/py/maxtext:layers",
+    ],
+)

src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py

Lines changed: 1 addition & 1 deletion

@@ -251,7 +251,7 @@ def load_weights(self, rng_key: jax.Array) -> None:
       return

     with self.mesh, nn.logical_axis_rules(""):
-      model, _ = model_creation_utils.create_nnx_model(
+      model = model_creation_utils.from_pretrained(
           self.maxtext_config, mesh=self.mesh, model_mode=self.model_mode, rng_key=rng_key
       )
     self.model = nnx.data(model)

src/maxtext/trainers/post_train/distillation/train_distill.py

Lines changed: 1 addition & 1 deletion

@@ -463,7 +463,7 @@ def get_maxtext_model(config: pyconfig.HyperParameters, mesh: jax.sharding.Mesh)
     The loaded MaxText model.
   """
   max_logging.log(f"Initializing model: {config.model_name}...")
-  model, _ = model_creation_utils.create_nnx_model(config, mesh=mesh)
+  model = model_creation_utils.from_pretrained(config, mesh=mesh)
   return model