Merge pull request #3458 from AI-Hypercomputer:docker_build_fix

Google-ML-Automation · Google-ML-Automation · commit f52d9c09ef50 · 2026-03-19T14:26:06.000-07:00
PiperOrigin-RevId: 886375125
diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
@@ -123,6 +123,7 @@ jobs:
             MODE=${{ inputs.build_mode }}
             WORKFLOW=${{ inputs.workflow }}
             PACKAGE_DIR=./src
+            TESTS_DIR=./tests
             JAX_VERSION=NONE
             LIBTPU_VERSION=NONE
             INCLUDE_TEST_ASSETS=true
diff --git a/PREFLIGHT.md b/PREFLIGHT.md
@@ -1,35 +1,35 @@
 # Optimization 1: Multihost recommended network settings
-We included all the recommended network settings in [rto_setup.sh](https://github.com/google/maxtext/blob/main/rto_setup.sh). 
+All the recommended network settings are included in [rto_setup.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/rto_setup.sh).
 
-[preflight.sh](https://github.com/google/maxtext/blob/main/preflight.sh) will help you apply them based on GCE or GKE platform.
+[preflight.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/preflight.sh) applies them for you based on your platform (GCE or GKE).
 
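-Before you run ML workload on Multihost with GCE or GKE, simply apply `bash preflight.sh PLATFORM=[GCE or GKE]` to leverage the best DCN network performance.
+Before you run an ML workload on multihost with GCE or GKE, run `bash src/dependencies/scripts/preflight.sh PLATFORM=[GCE or GKE]` from the repository root to get the best DCN network performance.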
 
 Here is an example for GCE:
 ```
-bash preflight.sh PLATFORM=GCE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
+bash src/dependencies/scripts/preflight.sh PLATFORM=GCE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
 ```
 
 Here is an example for GKE:
 ```
-bash preflight.sh PLATFORM=GKE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
+bash src/dependencies/scripts/preflight.sh PLATFORM=GKE && python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
 ```
 
 # Optimization 2: Numa binding (You can only apply this to v4 and v5p)
 NUMA binding is recommended for enhanced performance, as it reduces memory latency and maximizes data throughput, ensuring that your high-performance applications operate more efficiently and effectively.
 
 For GCE, 
-[preflight.sh](https://github.com/google/maxtext/blob/main/preflight.sh) will help you install `numactl` dependency, so you can use it directly, here is an example:
+[preflight.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/preflight.sh) installs the `numactl` dependency for you, so you can use it directly. Here is an example:
 
 ```
-bash preflight.sh PLATFORM=GCE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
+bash src/dependencies/scripts/preflight.sh PLATFORM=GCE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
 ```
 
 For GKE,
 `numactl` should be built into your docker image from [maxtext_tpu_dependencies.Dockerfile](https://github.com/google/maxtext/blob/main/src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile), so you can use it directly if you built the maxtext docker image. Here is an example
 
 ```
-bash preflight.sh PLATFORM=GKE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
+bash src/dependencies/scripts/preflight.sh PLATFORM=GKE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train run_name=${YOUR_JOB_NAME?}
 ```
 
 1. `numactl`: This is the command-line tool used for controlling NUMA policy for processes or shared memory. It's particularly useful on multi-socket systems where memory locality can impact performance.
diff --git a/src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile b/src/dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
@@ -41,6 +41,9 @@ ENV ENV_DEVICE=$DEVICE
 ARG PACKAGE_DIR
 ENV PACKAGE_DIR=$PACKAGE_DIR
 
+ARG TESTS_DIR
+ENV TESTS_DIR=$TESTS_DIR
+
 ENV MAXTEXT_ASSETS_ROOT=/deps/src/maxtext/assets
 ENV MAXTEXT_TEST_ASSETS_ROOT=/deps/tests/assets
 ENV MAXTEXT_PKG_DIR=/deps/src/MaxText
@@ -63,6 +66,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 # Now copy the remaining code (source files that may change frequently)
 COPY ${PACKAGE_DIR}/maxtext/ src/MaxText/
+COPY ${TESTS_DIR}*/ tests/
 
 # Download test assets from GCS if building image with test assets
 ARG INCLUDE_TEST_ASSETS=false
diff --git a/src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile b/src/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
@@ -38,6 +38,9 @@ ENV ENV_DEVICE=$DEVICE
 ARG PACKAGE_DIR
 ENV PACKAGE_DIR=$PACKAGE_DIR
 
+ARG TESTS_DIR
+ENV TESTS_DIR=$TESTS_DIR
+
 ENV MAXTEXT_ASSETS_ROOT=/deps/src/maxtext/assets
 ENV MAXTEXT_TEST_ASSETS_ROOT=/deps/tests/assets
 ENV MAXTEXT_PKG_DIR=/deps/src/maxtext
@@ -63,6 +66,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 # Now copy the remaining code (source files that may change frequently)
 COPY ${PACKAGE_DIR}/maxtext/ src/maxtext/
+COPY ${TESTS_DIR}*/ tests/
 
 # Download test assets from GCS if building image with test assets
 ARG INCLUDE_TEST_ASSETS=false
diff --git a/src/dependencies/scripts/docker_build_dependency_image.sh b/src/dependencies/scripts/docker_build_dependency_image.sh
@@ -22,6 +22,8 @@
 
 PACKAGE_DIR="${PACKAGE_DIR:-src}"
 echo "PACKAGE_DIR: $PACKAGE_DIR"
+TESTS_DIR="${TESTS_DIR:-tests}"
+echo "TESTS_DIR: $TESTS_DIR"
 
 # Enable "exit immediately if any command fails" option
 set -e
@@ -71,6 +73,7 @@ docker_build_args=(
   "MODE=${MODE}"
   "JAX_VERSION=${JAX_VERSION}"
   "PACKAGE_DIR=${PACKAGE_DIR}"
+  "TESTS_DIR=${TESTS_DIR}"
 )
 
 run_docker_build() {
diff --git a/src/dependencies/scripts/preflight.sh b/src/dependencies/scripts/preflight.sh
@@ -3,7 +3,7 @@ echo "Running preflight.sh"
 # Command Flags:
 #
 # Example to invoke this script:
-# bash preflight.sh
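+# bash src/dependencies/scripts/preflight.sh  (run from the repository root: rto_setup.sh is invoked via a path relative to the current directory)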
 
 # Warning:
 # For any dependencies, please add them into `setup.sh` or `maxtext_tpu_dependencies.Dockerfile`. 
@@ -24,11 +24,11 @@ if command -v sudo >/dev/null 2>&1; then
     echo "running rto_setup.sh with sudo"
 
     # apply network settings.
-    sudo bash rto_setup.sh
+    sudo bash src/dependencies/scripts/rto_setup.sh
 else
     # sudo is not available, run the script without sudo
     echo "running rto_setup.sh without sudo"
 
     # apply network settings.
-    bash rto_setup.sh
+    bash src/dependencies/scripts/rto_setup.sh
 fi
diff --git a/src/dependencies/scripts/rto_setup.sh b/src/dependencies/scripts/rto_setup.sh
diff --git a/src/maxtext/configs/README.md b/src/maxtext/configs/README.md
@@ -19,7 +19,7 @@ This directory contains high performance model configurations for different gene
 
 These configurations do 3 things:
 * Sets various XLA compiler flags (see [below](/src/maxtext/configs#xla-flags-used-by-maxtext)) as `LIBTPU_INIT_ARGS` to optimize runtime performance.
-* Runs [rto_setup.sh](https://github.com/google/maxtext/blob/main/rto_setup.sh) to optimize communication protocols for network performance.
+* Runs [rto_setup.sh](https://github.com/google/maxtext/blob/main/src/dependencies/scripts/rto_setup.sh) to optimize communication protocols for network performance.
 (This only needs to be run once on each worker)
 * Runs [train.py](https://github.com/google/maxtext/blob/main/src/maxtext/trainers/pre_train/train.py) with specific hyper-parameters (batch size, etc.)
 
diff --git a/src/maxtext/configs/experimental/1024b.sh b/src/maxtext/configs/experimental/1024b.sh
@@ -15,7 +15,7 @@ for ARGUMENT in "$@"; do
 done
 
 # Use preflight.sh to set up env based on platform
-bash preflight.sh PLATFORM=$PLATFORM
+bash src/dependencies/scripts/preflight.sh PLATFORM=$PLATFORM
 
 # Train
 export LIBTPU_INIT_ARGS="--xla_tpu_megacore_fusion_allow_ags=false --xla_enable_async_collective_permute=true --xla_tpu_enable_ag_backward_pipelining=true --xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
diff --git a/src/maxtext/configs/experimental/128b.sh b/src/maxtext/configs/experimental/128b.sh
@@ -15,7 +15,7 @@ for ARGUMENT in "$@"; do
 done
 
 # Use preflight.sh to set up env based on platform
-bash preflight.sh PLATFORM=$PLATFORM
+bash src/dependencies/scripts/preflight.sh PLATFORM=$PLATFORM
 
 # Train
 export LIBTPU_INIT_ARGS="--xla_tpu_megacore_fusion_allow_ags=false --xla_enable_async_collective_permute=true --xla_tpu_enable_ag_backward_pipelining=true --xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
diff --git a/src/maxtext/configs/experimental/256b.sh b/src/maxtext/configs/experimental/256b.sh
@@ -15,7 +15,7 @@ for ARGUMENT in "$@"; do
 done
 
 # Use preflight.sh to set up env based on platform
-bash preflight.sh PLATFORM=$PLATFORM
+bash src/dependencies/scripts/preflight.sh PLATFORM=$PLATFORM
 
 # Train
 export LIBTPU_INIT_ARGS="--xla_tpu_megacore_fusion_allow_ags=false --xla_enable_async_collective_permute=true --xla_tpu_enable_ag_backward_pipelining=true --xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
diff --git a/src/maxtext/configs/experimental/32b.sh b/src/maxtext/configs/experimental/32b.sh
@@ -15,7 +15,7 @@ for ARGUMENT in "$@"; do
 done
 
 # Use preflight.sh to set up env based on platform
-bash preflight.sh PLATFORM=$PLATFORM
+bash src/dependencies/scripts/preflight.sh PLATFORM=$PLATFORM
 
 # Train
 export LIBTPU_INIT_ARGS="--xla_tpu_megacore_fusion_allow_ags=false --xla_enable_async_collective_permute=true --xla_tpu_enable_ag_backward_pipelining=true --xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
diff --git a/src/maxtext/configs/experimental/512b.sh b/src/maxtext/configs/experimental/512b.sh
@@ -15,7 +15,7 @@ for ARGUMENT in "$@"; do
 done
 
 # Use preflight.sh to set up env based on platform
-bash preflight.sh PLATFORM=$PLATFORM
+bash src/dependencies/scripts/preflight.sh PLATFORM=$PLATFORM
 
 # Train
 export LIBTPU_INIT_ARGS="--xla_tpu_megacore_fusion_allow_ags=false --xla_enable_async_collective_permute=true --xla_tpu_enable_ag_backward_pipelining=true --xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
diff --git a/src/maxtext/configs/experimental/64b.sh b/src/maxtext/configs/experimental/64b.sh
@@ -15,7 +15,7 @@ for ARGUMENT in "$@"; do
 done
 
 # Use preflight.sh to set up env based on platform
-bash preflight.sh PLATFORM=$PLATFORM
+bash src/dependencies/scripts/preflight.sh PLATFORM=$PLATFORM
 
 # Train
 export LIBTPU_INIT_ARGS="--xla_tpu_megacore_fusion_allow_ags=false --xla_enable_async_collective_permute=true --xla_tpu_enable_ag_backward_pipelining=true --xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
diff --git a/src/maxtext/configs/tpu/v4/22b.sh b/src/maxtext/configs/tpu/v4/22b.sh
@@ -50,7 +50,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v4/52b.sh b/src/maxtext/configs/tpu/v4/52b.sh
@@ -50,7 +50,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5e/128b.sh b/src/maxtext/configs/tpu/v5e/128b.sh
@@ -36,7 +36,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5e/16b.sh b/src/maxtext/configs/tpu/v5e/16b.sh
@@ -36,7 +36,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5e/32b.sh b/src/maxtext/configs/tpu/v5e/32b.sh
@@ -36,7 +36,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5e/64b.sh b/src/maxtext/configs/tpu/v5e/64b.sh
@@ -36,7 +36,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5e/gpt3_175b.sh b/src/maxtext/configs/tpu/v5e/gpt3_175b.sh
@@ -35,7 +35,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5e/llama2_13b.sh b/src/maxtext/configs/tpu/v5e/llama2_13b.sh
@@ -35,7 +35,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5e/llama2_70b.sh b/src/maxtext/configs/tpu/v5e/llama2_70b.sh
@@ -35,7 +35,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5e/llama2_7b.sh b/src/maxtext/configs/tpu/v5e/llama2_7b.sh
@@ -35,7 +35,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5p/1024b.sh b/src/maxtext/configs/tpu/v5p/1024b.sh
@@ -36,7 +36,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5p/128b.sh b/src/maxtext/configs/tpu/v5p/128b.sh
@@ -36,7 +36,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5p/256b.sh b/src/maxtext/configs/tpu/v5p/256b.sh
@@ -37,7 +37,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5p/32b.sh b/src/maxtext/configs/tpu/v5p/32b.sh
@@ -36,7 +36,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5p/512b.sh b/src/maxtext/configs/tpu/v5p/512b.sh
@@ -37,7 +37,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5p/64b.sh b/src/maxtext/configs/tpu/v5p/64b.sh
@@ -36,7 +36,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5p/gpt3_175b/gpt3_175b_base.sh b/src/maxtext/configs/tpu/v5p/gpt3_175b/gpt3_175b_base.sh
@@ -12,7 +12,7 @@
 
 set -euox pipefail
 
-bash preflight.sh PLATFORM=gke
+bash src/dependencies/scripts/preflight.sh PLATFORM=gke
 
 # flags set as default
 
diff --git a/src/maxtext/configs/tpu/v5p/llama2_70b.sh b/src/maxtext/configs/tpu/v5p/llama2_70b.sh
@@ -38,7 +38,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v5p/llama2_7b.sh b/src/maxtext/configs/tpu/v5p/llama2_7b.sh
@@ -39,7 +39,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v6e/gemma2_27b.sh b/src/maxtext/configs/tpu/v6e/gemma2_27b.sh
@@ -33,7 +33,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v6e/gemma2_9b.sh b/src/maxtext/configs/tpu/v6e/gemma2_9b.sh
@@ -33,7 +33,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v6e/gemma3_27b.sh b/src/maxtext/configs/tpu/v6e/gemma3_27b.sh
@@ -33,7 +33,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v6e/gpt3_175b.sh b/src/maxtext/configs/tpu/v6e/gpt3_175b.sh
@@ -33,7 +33,7 @@ fi
 
 # Set up network optimizations
 if [ "$RUN_PREFLIGHT" = "true" ]; then
-    bash preflight.sh
+    bash src/dependencies/scripts/preflight.sh
 fi
 
 # Train
diff --git a/src/maxtext/configs/tpu/v6e/llama2_7b_4096.sh b/src/maxtext/configs/tpu/v6e/llama2_7b_4096.sh
diff --git a/src/maxtext/configs/tpu/v6e/mixtral_8x7b.sh b/src/maxtext/configs/tpu/v6e/mixtral_8x7b.sh