AI-Hypercomputer
diff --git a/‎.coveragerc‎
Lines changed: 1 addition & 1 deletion b/‎.coveragerc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/build_and_test_maxtext.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/build_and_test_maxtext.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/run_jupyter_notebooks.yml‎
Lines changed: 5 additions & 1 deletion b/‎.github/workflows/run_jupyter_notebooks.yml‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎.github/workflows/run_tests_coordinator.yml‎
Lines changed: 4 additions & 4 deletions b/‎.github/workflows/run_tests_coordinator.yml‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/maxtext_xpk_runner.py‎
Lines changed: 4 additions & 0 deletions b/‎benchmarks/maxtext_xpk_runner.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎benchmarks/recipes/parser_utils.py‎
Lines changed: 7 additions & 0 deletions b/‎benchmarks/recipes/parser_utils.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎benchmarks/recipes/runner_utils.py‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/recipes/runner_utils.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/recipes/user_configs.py‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/recipes/user_configs.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/guides.md‎
Lines changed: 8 additions & 0 deletions b/‎docs/guides.md‎
Lines changed: 8 additions & 0 deletions
@@ -1,7 +1,7 @@
 # .coveragerc
 
 [run]
-source = MaxText
+source = maxtext
 branch = True
 omit =
     tests/*
 
@@ -199,6 +199,7 @@ jobs:
       base_image: maxtext-unit-test-tpu:py312
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
+      pytest_addopts: '--ignore=tests/post_training'
       xla_python_client_mem_fraction: 0.75
       tf_force_gpu_allow_growth: false
       container_resource_option: "--privileged"
@@ -217,6 +218,7 @@ jobs:
       base_image: maxtext-unit-test-tpu:py312
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
+      pytest_addopts: '--ignore=tests/post_training'
       xla_python_client_mem_fraction: 0.75
       tf_force_gpu_allow_growth: false
       container_resource_option: "--privileged"
 
@@ -103,9 +103,13 @@ jobs:
           # Register maxtext_venv as a selectable kernel in Jupyter
           $PYTHON_EXE -m ipykernel install --user --name maxtext_venv
 
+          # Run Hugging Face authentication
+          hf auth login --token "$HF_TOKEN"
+
           for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do
             filename=$(basename "$notebook")
-            if [[ "$filename" == "sft_llama3_demo_gpu.ipynb" || "$filename" == "maxtext_with_gepa.ipynb" ]]; then
+            # TODO: Update runner to v6e-8 as RL with Llama3.1-8b doesn't fit on v6e-4
+            if [[ "$filename" == "sft_llama3_demo_gpu.ipynb" || "$filename" == "maxtext_with_gepa.ipynb" || "$filename" == "rl_llama3_demo.ipynb" ]]; then
               echo "Skipping $filename"
               continue
             fi
 
@@ -102,24 +102,24 @@ jobs:
         ${{ fromJSON('{
             "tpu-unit": "not cpu_only and not gpu_only and not integration_test and not post_training",
             "tpu-integration": "not cpu_only and not gpu_only and integration_test and not post_training",
-            "tpu-post-training-unit": "not cpu_only and not gpu_only and not integration_test",
+            "tpu-post-training-unit": "not cpu_only and not gpu_only and not integration_test and post_training",
             "tpu-post-training-integration": "not cpu_only and not gpu_only and integration_test",
             "gpu-unit": "not cpu_only and not tpu_only and not integration_test and not post_training",
             "gpu-integration": "not cpu_only and not tpu_only and integration_test and not post_training",
             "cpu-unit": "cpu_only and not post_training",
-            "cpu-post-training-unit": "cpu_only"
+            "cpu-post-training-unit": "cpu_only and post_training"
           }')[inputs.flavor] }}
 
       pytest_addopts: >-
         ${{ fromJSON('{
             "tpu-unit": "",
             "tpu-integration": "",
-            "tpu-post-training-unit": "tests/post_training/unit",
+            "tpu-post-training-unit": "tests/post_training/unit tests/unit",
             "tpu-post-training-integration": "tests/post_training/integration",
             "gpu-unit": "",
             "gpu-integration": "",
             "cpu-unit": "",
-            "cpu-post-training-unit": "tests/post_training/unit"
+            "cpu-post-training-unit": "tests/post_training/unit tests/unit"
           }')[inputs.flavor] }}
 
       pytest_extra_args: >-
 
@@ -60,4 +60,4 @@ repos:
         args: ['--number']
         additional_dependencies: [mdformat-myst, mdformat-ruff]
         files: (docs/.)
-        exclude: docs/guides/checkpointing_solutions.md
+        exclude: docs/guides/checkpointing_solutions.md|docs/guides.md
@@ -115,6 +115,7 @@ class WorkloadConfig:
   disruption_configs: DisruptionConfig = None
   xpk_storage: None | list[str] = None
   hlo_dump: None | bool = None
+  skip_validation: bool = False
 
   def __post_init__(self):
     """Initializes num_devices_per_slice and topology for recording the run into BigQuery"""
@@ -644,6 +645,9 @@ def generate_xpk_workload_cmd(
   else:
     docker_image_flag = f'--base-docker-image="{wl_config.base_docker_image}"'
 
+  if wl_config.skip_validation:
+    workload_create_command += " --skip-validation"
+
   upload_metrics_to_bq_cmd = ""
   if wl_config.generate_metrics_and_upload_to_big_query and not is_pathways_headless_enabled:
     # TODO (optionally) make it so that this upload step is done on local device instead of within the workload.
 
@@ -151,6 +151,13 @@ def add_arguments(parser: argparse.ArgumentParser):
       help="BigQuery dataset name where metrics will be written.",
   )
 
+  parser.add_argument(
+      "--skip-validation",
+      action="store_true",
+      default=False,
+      help="Skip xpk health checks and system dependency validation during workload execution",
+)
+
   # Other configurations
   parser.add_argument("--xpk_path", type=str, default="~/xpk", help="Path to xpk.")
   parser.add_argument("--delete", action="store_true", help="Delete the cluster workload")
 
@@ -48,6 +48,7 @@ def _create_workload_config(
       "generate_metrics_and_upload_to_big_query": user_config.bq_enable,
       "db_project": user_config.bq_db_project,
       "db_dataset": user_config.bq_db_dataset,
+      "skip_validation": user_config.skip_validation,
   }
   # Add any extra arguments, like disruption_configs, if they exist
   config_args.update(kwargs)
 
@@ -82,6 +82,7 @@ class UserConfig:
   max_restarts: int = 0
   temp_key: str = None
   workload_id: str = None
+  skip_validation: bool = False
 
   def __post_init__(self):
     """Automatically generate derived attributes after the object is created."""
 
@@ -62,6 +62,13 @@ Interactive development guides for running MaxText on Google Colab or local Jupy
 
 A step-by-step guide for the community to help expand MaxText's model library.
 :::
+
+:::{grid-item-card} 🎓 Distillation
+:link: guides/distillation
+:link-type: doc
+
+How online distillation works in MaxText: loss anatomy, α / β / temperature schedule tuning, layer indices, monitoring metrics, and troubleshooting.
+:::
 ::::
 
 ```{toctree}
@@ -75,4 +82,5 @@ guides/checkpointing_solutions.md
 guides/monitoring_and_debugging.md
 guides/run_python_notebook.md
 guides/model_bringup.md
+guides/distillation.md
 ```
Original file line number	Diff line number	Diff line change
`@@ -48,6 +48,7 @@ def _create_workload_config(`
`48`	`48`	`"generate_metrics_and_upload_to_big_query": user_config.bq_enable,`
`49`	`49`	`"db_project": user_config.bq_db_project,`
`50`	`50`	`"db_dataset": user_config.bq_db_dataset,`
	`51`	`+ "skip_validation": user_config.skip_validation,`
`51`	`52`	`}`
`52`	`53`	`# Add any extra arguments, like disruption_configs, if they exist`
`53`	`54`	`config_args.update(kwargs)`