Skip to content

Commit 4f4c45c

Browse files
authored
Merge branch 'main' into ajkv/sft-grain-implementation
2 parents 59c10dc + a86e2fb commit 4f4c45c

175 files changed

Lines changed: 4033 additions & 1878 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ tests/inference/ @vipannalla @mitalisi @gpolovets1 @mailvijayasingh @jrplatin @p
2222
src/maxtext/inference @vipannalla @mitalisi @gpolovets1 @mailvijayasingh @jrplatin @patemotter @lumosis @richjames0
2323

2424
# Dockerfiles and dependencies
25-
src/dependencies/ @bvandermoon @parambole @richjames0 @shralex
25+
src/dependencies/ @bvandermoon @SurbhiJainUSC @parambole @richjames0 @shralex
2626

2727
# Docs
2828
docs/ @jacoguzo @bvandermoon @richjames0 @shralex @gobbleturk @RissyRan @gagika @A9isha @jiangjy1982 @vipannalla

.github/workflows/build_and_push_docker_image.yml

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ jobs:
5454
runs-on: linux-x86-n2-16-buildkit
5555
container: google/cloud-sdk:524.0.0
5656
if: >
57+
github.event_name == 'release' ||
5758
github.event_name == 'schedule' ||
5859
github.event_name == 'pull_request' ||
5960
github.event_name == 'workflow_dispatch' && (
@@ -86,15 +87,8 @@ jobs:
8687
# This ensures that every job clones the exact same commit as "setup" job
8788
ref: ${{ inputs.maxtext_sha }}
8889

89-
- name: Checkout post-training dependencies
90-
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
91-
run: |
92-
git clone https://github.com/google/tunix.git ./tunix
93-
git clone https://github.com/vllm-project/vllm.git ./vllm
94-
git clone https://github.com/vllm-project/tpu-inference.git ./tpu-inference
95-
9690
- name: Mark git repositories as safe
97-
run: git config --global --add safe.directory '*'
91+
run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
9892
if: steps.check.outputs.should_run == 'true'
9993

10094
- name: Configure Docker
@@ -122,6 +116,7 @@ jobs:
122116
DEVICE=${{ inputs.device }}
123117
MODE=${{ inputs.build_mode }}
124118
WORKFLOW=${{ inputs.workflow }}
119+
PACKAGE_DIR=./src
125120
JAX_VERSION=NONE
126121
LIBTPU_VERSION=NONE
127122
INCLUDE_TEST_ASSETS=true
@@ -147,16 +142,6 @@ jobs:
147142
# Add MaxText tag
148143
maxtext_hash=$(git rev-parse --short HEAD)
149144
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
150-
151-
# Add post-training dependencies tags
152-
if [ "${{ inputs.workflow }}" == "post-training" ]; then
153-
for dir in tunix vllm tpu-inference; do
154-
if [ -d "./$dir" ]; then
155-
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
156-
gcloud container images add-tag "$SOURCE_IMAGE:${{ github.run_id }}" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
157-
fi
158-
done
159-
fi
160145
fi
161146
env:
162147
INPUTS_IMAGE_NAME: ${{ inputs.image_name }}

.github/workflows/build_and_test_maxtext.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ jobs:
262262
tf_force_gpu_allow_growth: false
263263
container_resource_option: "--privileged"
264264
is_scheduled_run: ${{ github.event_name == 'schedule' }}
265-
extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
265+
extra_pip_deps_file: 'src/dependencies/github_deps/post_train_base_deps.txt'
266266
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
267267

268268
maxtext_post_training_tpu_unit_tests:
@@ -284,7 +284,7 @@ jobs:
284284
tf_force_gpu_allow_growth: false
285285
container_resource_option: "--privileged"
286286
is_scheduled_run: ${{ github.event_name == 'schedule' }}
287-
extra_pip_deps_file: 'src/install_maxtext_extra_deps/extra_post_train_base_deps_from_github.txt'
287+
extra_pip_deps_file: 'src/dependencies/github_deps/post_train_base_deps.txt'
288288
maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }}
289289

290290
maxtext_gpu_integration_tests:

.github/workflows/run_pathways_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ jobs:
8585
source .venv/bin/activate
8686
maxtext_wheel=$(ls maxtext-*-py3-none-any.whl 2>/dev/null)
8787
uv pip install ${maxtext_wheel}[tpu] --resolution=lowest
88-
uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt
88+
uv pip install -r src/dependencies/github_deps/pre_train_deps.txt
8989
python3 --version
9090
python3 -m pip freeze
9191
- name: Copy test assets files

.github/workflows/run_tests_against_package.yml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ jobs:
9696
source .venv/bin/activate
9797
maxtext_wheel=$(ls maxtext-*-py3-none-any.whl 2>/dev/null)
9898
uv pip install ${maxtext_wheel}[${MAXTEXT_PACKAGE_EXTRA}] --resolution=lowest
99-
uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt
99+
uv pip install -r src/dependencies/github_deps/pre_train_deps.txt
100100
python3 --version
101101
python3 -m pip freeze
102102
uv pip install pytest-cov
@@ -131,12 +131,10 @@ jobs:
131131
else
132132
SPLIT_ARGS=""
133133
fi
134-
# TODO: Fix the skipped tests and remove the deselect flags
135134
.venv/bin/python3 -m pytest ${INPUTS_PYTEST_ADDOPTS} \
136135
-v \
137136
-m "${FINAL_PYTEST_MARKER}" \
138137
--durations=0 \
139-
--deselect "tests/unit/tokenizer_test.py::TokenizerTest::test_detokenize" \
140138
--cov=MaxText \
141139
--cov=maxtext \
142140
--cov-report=xml \

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ repos:
5252
args:
5353
- '--pyink-indentation=2'
5454
- '--line-length=122'
55-
- '--check'
5655

5756
- repo: https://github.com/executablebooks/mdformat
5857
rev: 0.7.22

LICENSE_HEADER

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
Copyright 2023–2026 Google LLC
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.

PREFLIGHT.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,35 @@
11
# Optimization 1: Multihost recommended network settings
2-
We included all the recommended network settings in [rto_setup.sh](https://github.com/google/maxtext/blob/main/rto_setup.sh).
2+
We included all the recommended network settings in [rto_setup.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/rto_setup.sh).
33

4-
[preflight.sh](https://github.com/google/maxtext/blob/main/preflight.sh) will help you apply them based on GCE or GKE platform.
4+
[preflight.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/preflight.sh) will help you apply them based on GCE or GKE platform.
55

66
Before you run an ML workload on Multihost with GCE or GKE, simply run `bash src/dependencies/scripts/preflight.sh PLATFORM=[GCE or GKE]` to leverage the best DCN network performance.
77

88
Here is an example for GCE:
99
```
10-
bash preflight.sh PLATFORM=GCE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
10+
bash src/dependencies/scripts/preflight.sh PLATFORM=GCE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
1111
```
1212

1313
Here is an example for GKE:
1414
```
15-
bash preflight.sh PLATFORM=GKE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
15+
bash src/dependencies/scripts/preflight.sh PLATFORM=GKE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
1616
```
1717

1818
# Optimization 2: Numa binding (You can only apply this to v4 and v5p)
1919
NUMA binding is recommended for enhanced performance, as it reduces memory latency and maximizes data throughput, ensuring that your high-performance applications operate more efficiently and effectively.
2020

2121
For GCE,
22-
[preflight.sh](https://github.com/google/maxtext/blob/main/preflight.sh) will help you install `numactl` dependency, so you can use it directly, here is an example:
22+
[preflight.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/preflight.sh) will help you install the `numactl` dependency, so you can use it directly. Here is an example:
2323

2424
```
25-
bash preflight.sh PLATFORM=GCE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
25+
bash src/dependencies/scripts/preflight.sh PLATFORM=GCE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
2626
```
2727

2828
For GKE,
2929
`numactl` should be built into your docker image from [maxtext_tpu_dependencies.Dockerfile](https://github.com/google/maxtext/blob/main/src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile), so you can use it directly if you built the maxtext docker image. Here is an example:
3030

3131
```
32-
bash preflight.sh PLATFORM=GKE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
32+
bash src/dependencies/scripts/preflight.sh PLATFORM=GKE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
3333
```
3434

3535
1. `numactl`: This is the command-line tool used for controlling NUMA policy for processes or shared memory. It's particularly useful on multi-socket systems where memory locality can impact performance.

benchmarks/maxtext_xpk_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,7 @@ def build_user_command(
428428
if wl_config.hlo_dump:
429429
hlo_dump = "XLA_FLAGS='--xla_dump_large_constants --xla_dump_to=/tmp/xla_dump'"
430430
upload_hlo_dump = (
431-
f" && gsutil -m cp -r /tmp/xla_dump {wl_config.base_output_directory}/{wl_config.run_name}/hlo_dump"
431+
f" && gcloud storage cp -r /tmp/xla_dump {wl_config.base_output_directory}/{wl_config.run_name}/hlo_dump"
432432
)
433433
# Construct the command string with proper formatting and line continuations
434434
command = " ".join(

benchmarks/upload_metrics_to_bq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def add_parser_arguments(parser: argparse.ArgumentParser):
187187

188188

189189
def download_metrics_file_locally(metrics_gcs_file: str, local_file: str) -> int:
190-
command = f"gsutil cp -r {metrics_gcs_file} {local_file}"
190+
command = f"gcloud storage cp --recursive {metrics_gcs_file} {local_file}"
191191
return run_command_with_updates(command, f"Download {metrics_gcs_file} in {local_file}")
192192

193193

0 commit comments

Comments
 (0)