Skip to content

Commit 75e87c8

Browse files
committed
Merge branch 'main' of github.com:AI-Hypercomputer/maxtext into shuningjin-fix
2 parents 73adf2a + 49fd452 commit 75e87c8

90 files changed

Lines changed: 3479 additions & 2811 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.coveragerc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# .coveragerc
22

33
[run]
4-
source = MaxText
4+
source = maxtext
55
branch = True
66
omit =
77
tests/*

.github/workflows/build_and_test_maxtext.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ jobs:
199199
base_image: maxtext-unit-test-tpu:py312
200200
cloud_runner: linux-x86-ct6e-180-4tpu
201201
pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
202+
pytest_addopts: '--ignore=tests/post_training'
202203
xla_python_client_mem_fraction: 0.75
203204
tf_force_gpu_allow_growth: false
204205
container_resource_option: "--privileged"
@@ -217,6 +218,7 @@ jobs:
217218
base_image: maxtext-unit-test-tpu:py312
218219
cloud_runner: linux-x86-ct6e-180-4tpu
219220
pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
221+
pytest_addopts: '--ignore=tests/post_training'
220222
xla_python_client_mem_fraction: 0.75
221223
tf_force_gpu_allow_growth: false
222224
container_resource_option: "--privileged"

.github/workflows/run_jupyter_notebooks.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,13 @@ jobs:
103103
# Register maxtext_venv as a selectable kernel in Jupyter
104104
$PYTHON_EXE -m ipykernel install --user --name maxtext_venv
105105
106+
# Run Hugging Face authentication
107+
hf auth login --token "$HF_TOKEN"
108+
106109
for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
107110
filename=$(basename "$notebook")
108-
if [[ "$filename" == "sft_llama3_demo_gpu.ipynb" || "$filename" == "maxtext_with_gepa.ipynb" ]]; then
111+
# TODO: Update runner to v6e-8 as RL with Llama3.1-8b doesn't fit on v6e-4
112+
if [[ "$filename" == "sft_llama3_demo_gpu.ipynb" || "$filename" == "maxtext_with_gepa.ipynb" || "$filename" == "rl_llama3_demo.ipynb" ]]; then
109113
echo "Skipping $filename"
110114
continue
111115
fi

.github/workflows/run_tests_coordinator.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,24 +102,24 @@ jobs:
102102
${{ fromJSON('{
103103
"tpu-unit": "not cpu_only and not gpu_only and not integration_test and not post_training",
104104
"tpu-integration": "not cpu_only and not gpu_only and integration_test and not post_training",
105-
"tpu-post-training-unit": "not cpu_only and not gpu_only and not integration_test",
105+
"tpu-post-training-unit": "not cpu_only and not gpu_only and not integration_test and post_training",
106106
"tpu-post-training-integration": "not cpu_only and not gpu_only and integration_test",
107107
"gpu-unit": "not cpu_only and not tpu_only and not integration_test and not post_training",
108108
"gpu-integration": "not cpu_only and not tpu_only and integration_test and not post_training",
109109
"cpu-unit": "cpu_only and not post_training",
110-
"cpu-post-training-unit": "cpu_only"
110+
"cpu-post-training-unit": "cpu_only and post_training"
111111
}')[inputs.flavor] }}
112112
113113
pytest_addopts: >-
114114
${{ fromJSON('{
115115
"tpu-unit": "",
116116
"tpu-integration": "",
117-
"tpu-post-training-unit": "tests/post_training/unit",
117+
"tpu-post-training-unit": "tests/post_training/unit tests/unit",
118118
"tpu-post-training-integration": "tests/post_training/integration",
119119
"gpu-unit": "",
120120
"gpu-integration": "",
121121
"cpu-unit": "",
122-
"cpu-post-training-unit": "tests/post_training/unit"
122+
"cpu-post-training-unit": "tests/post_training/unit tests/unit"
123123
}')[inputs.flavor] }}
124124
125125
pytest_extra_args: >-

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,4 @@ repos:
6060
args: ['--number']
6161
additional_dependencies: [mdformat-myst, mdformat-ruff]
6262
files: (docs/.)
63-
exclude: docs/guides/checkpointing_solutions.md
63+
exclude: docs/guides/checkpointing_solutions.md|docs/guides.md

benchmarks/maxtext_xpk_runner.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ class WorkloadConfig:
115115
disruption_configs: DisruptionConfig = None
116116
xpk_storage: None | list[str] = None
117117
hlo_dump: None | bool = None
118+
skip_validation: bool = False
118119

119120
def __post_init__(self):
120121
"""Initializes num_devices_per_slice and topology for recording the run into BigQuery"""
@@ -644,6 +645,9 @@ def generate_xpk_workload_cmd(
644645
else:
645646
docker_image_flag = f'--base-docker-image="{wl_config.base_docker_image}"'
646647

648+
if wl_config.skip_validation:
649+
workload_create_command += " --skip-validation"
650+
647651
upload_metrics_to_bq_cmd = ""
648652
if wl_config.generate_metrics_and_upload_to_big_query and not is_pathways_headless_enabled:
649653
# TODO (optionally) make it so that this upload step is done on local device instead of within the workload.

benchmarks/recipes/parser_utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,13 @@ def add_arguments(parser: argparse.ArgumentParser):
151151
help="BigQuery dataset name where metrics will be written.",
152152
)
153153

154+
parser.add_argument(
155+
"--skip-validation",
156+
action="store_true",
157+
default=False,
158+
help="Skip xpk health checks and system dependency validation during workload execution",
159+
)
160+
154161
# Other configurations
155162
parser.add_argument("--xpk_path", type=str, default="~/xpk", help="Path to xpk.")
156163
parser.add_argument("--delete", action="store_true", help="Delete the cluster workload")

benchmarks/recipes/runner_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def _create_workload_config(
4848
"generate_metrics_and_upload_to_big_query": user_config.bq_enable,
4949
"db_project": user_config.bq_db_project,
5050
"db_dataset": user_config.bq_db_dataset,
51+
"skip_validation": user_config.skip_validation,
5152
}
5253
# Add any extra arguments, like disruption_configs, if they exist
5354
config_args.update(kwargs)

benchmarks/recipes/user_configs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class UserConfig:
8282
max_restarts: int = 0
8383
temp_key: str = None
8484
workload_id: str = None
85+
skip_validation: bool = False
8586

8687
def __post_init__(self):
8788
"""Automatically generate derived attributes after the object is created."""

docs/guides.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,13 @@ Interactive development guides for running MaxText on Google Colab or local Jupy
6262

6363
A step-by-step guide for the community to help expand MaxText's model library.
6464
:::
65+
66+
:::{grid-item-card} 🎓 Distillation
67+
:link: guides/distillation
68+
:link-type: doc
69+
70+
How online distillation works in MaxText: loss anatomy, α / β / temperature schedule tuning, layer indices, monitoring metrics, and troubleshooting.
71+
:::
6572
::::
6673

6774
```{toctree}
@@ -75,4 +82,5 @@ guides/checkpointing_solutions.md
7582
guides/monitoring_and_debugging.md
7683
guides/run_python_notebook.md
7784
guides/model_bringup.md
85+
guides/distillation.md
7886
```

0 commit comments

Comments
 (0)