AI-Hypercomputer
diff --git a/.github/workflows/AddLabel.yml b/.github/workflows/AddLabel.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/gemini-investigate.yml b/.github/workflows/gemini-investigate.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/run_tests_coordinator.yml b/.github/workflows/run_tests_coordinator.yml
Lines changed: 2 additions & 2 deletions
diff --git a/src/maxtext/checkpoint_conversion/standalone_scripts/llama4_ckpt_unscanned.py b/src/maxtext/checkpoint_conversion/standalone_scripts/llama4_ckpt_unscanned.py
Lines changed: 1 addition & 2 deletions
diff --git a/src/maxtext/checkpoint_conversion/standalone_scripts/llama_ckpt_conversion_inference_only.py b/src/maxtext/checkpoint_conversion/standalone_scripts/llama_ckpt_conversion_inference_only.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/maxtext/checkpoint_conversion/standalone_scripts/llama_or_mistral_ckpt.py b/src/maxtext/checkpoint_conversion/standalone_scripts/llama_or_mistral_ckpt.py
Lines changed: 2 additions & 3 deletions
diff --git a/src/maxtext/configs/pyconfig.py b/src/maxtext/configs/pyconfig.py
Lines changed: 5 additions & 0 deletions
diff --git a/src/maxtext/eval/README.md b/src/maxtext/eval/README.md
Lines changed: 169 additions & 0 deletions
diff --git a/src/maxtext/eval/configs/base_eval.yml b/src/maxtext/eval/configs/base_eval.yml
Lines changed: 8 additions & 0 deletions
diff --git a/src/maxtext/eval/configs/mlperf.yml b/src/maxtext/eval/configs/mlperf.yml
Lines changed: 5 additions & 0 deletions
@@ -112,7 +112,7 @@ jobs:
               // Ignore the current running workflow
               if (checkRun.name.endsWith(context.job)) continue
 
-              if (checkRun.status !== 'completed' || checkRun.conclusion !== 'success') {
+              if (checkRun.status !== 'completed' || !['success', 'skipped'].includes(checkRun.conclusion)) {
                 core.info(`Waiting for check: ${checkRun.name} (Status: ${checkRun.status}, Conclusion: ${checkRun.conclusion})`);
                 return; // Exit without failing
               }
 
@@ -85,7 +85,7 @@ jobs:
           settings: |-
             {
               "model": {
-                "maxSessionTurns": 15
+                "maxSessionTurns": 50
               },
               "mcpServers": {
                 "github": {
 
@@ -66,7 +66,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        worker_group: ${{ fromJSON(contains(inputs.flavor, 'cpu-unit') && '[1, 2]' || '[1]') }}
+        worker_group: ${{ fromJSON(contains(inputs.flavor, 'cpu-unit') && '[1, 2, 3, 4]' || '[1]') }}
 
     uses: ./.github/workflows/run_tests_against_package.yml
     with:
@@ -158,6 +158,6 @@ jobs:
       is_scheduled_run: ${{ inputs.is_scheduled_run }}
       maxtext_installed: ${{ inputs.maxtext_installed }}
       worker_group: ${{ matrix.worker_group }}
-      total_workers: ${{ contains(inputs.flavor, 'cpu-unit') && 2 || 1 }}
+      total_workers: ${{ contains(inputs.flavor, 'cpu-unit') && 4 || 1 }}
       maxtext_sha: ${{ inputs.maxtext_sha }}
       is_update_hlo: ${{ inputs.is_update_hlo }}
@@ -600,9 +600,8 @@ def _convert_pytorch_to_jax_weights(base_model_path: str, model_size: str, model
   for i, ckpt_path in enumerate(ckpt_paths):
     max_logging.log(f"Loading checkpoint {i+1} of {len(ckpt_paths)} ...")
     # NOTE: starting in PT2.6, `weights_only` was switched from the default of `False` to `True`
-    # thus we need to specify this or else loading will fail
     chkpt_vars[int(ckpt_path.name.split(".", maxsplit=2)[1])] = torch.load(
-        ckpt_path, map_location="cpu", weights_only=False
+        ckpt_path, map_location="cpu", weights_only=True
     )
   chkpt_vars = [chkpt_vars[i] for i in sorted(list(chkpt_vars.keys()))]
   # map weight names if they use HuggingFace instead of PyTorch convention
 
@@ -157,7 +157,7 @@ def convert(base_model_path, maxtext_model_path, model_size):
   for i, ckpt_path in enumerate(ckpt_paths):
     print(f"Loading checkpoint {i+1} of {len(ckpt_paths)} ...")
 
-    checkpoint = torch.load(ckpt_path, map_location="cpu")
+    checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
     pytorch_vars[int(ckpt_path.name.split(".", maxsplit=2)[1])] = checkpoint
     print("memory usage in GB: ", psutil.Process().memory_info().rss / (1024 * 1024))
 
 
@@ -428,7 +428,7 @@ def convert_lora_weights_to_jax_weights(lora_config: dict, model_size: str):
 
   max_logging.log(f"Loading the lora  model from {lora_config['lora_model_path']}")
   # Load LoRA model weights
-  lora_chkpt_vars = torch.load(lora_config["lora_model_path"])
+  lora_chkpt_vars = torch.load(lora_config["lora_model_path"], weights_only=True)
   lora_chkpt_vars = _NamespaceMapper(lora_chkpt_vars)
 
   jax_weights_lora = {
@@ -1112,9 +1112,8 @@ def _convert_pytorch_to_jax_weights(base_model_path: str, model_size: str, model
   for i, ckpt_path in enumerate(ckpt_paths):
     max_logging.log(f"Loading checkpoint {i+1} of {len(ckpt_paths)} ...")
     # NOTE: starting in PT2.6, `weights_only` was switched from the default of `False` to `True`
-    # thus we need to specify this or else loading will fail
     chkpt_vars[int(ckpt_path.name.split(".", maxsplit=2)[1])] = torch.load(
-        ckpt_path, map_location="cpu", weights_only=False
+        ckpt_path, map_location="cpu", weights_only=True
     )
   chkpt_vars = [chkpt_vars[i] for i in sorted(list(chkpt_vars.keys()))]
   # map weight names if they use HuggingFace instead of PyTorch convention
 
@@ -324,6 +324,11 @@ def initialize_pydantic(argv: list[str] | None = None, **kwargs) -> MaxTextConfi
 
   # 2. Get overrides from CLI and kwargs
   cli_cfg = omegaconf.OmegaConf.from_cli(cli_args)
+  if "hf_access_token" in cli_cfg:
+    logger.warning(
+        "WARNING: Passing 'hf_access_token' via command-line arguments is deprecated and insecure because it makes "
+        "your token visible in 'ps' and shell history. Please set the 'HF_TOKEN' environment variable instead."
+    )
   kwargs_cfg = omegaconf.OmegaConf.create(kwargs)
   overrides_cfg = omegaconf.OmegaConf.merge(cli_cfg, kwargs_cfg)
 
 
@@ -0,0 +1,169 @@
+# MaxText vLLM Eval Framework
+
+A vLLM-native evaluation framework for MaxText models. It supports harness-based evaluation (lm-eval, evalchemy) as well as custom datasets.
+
+## Quick Start
+
+All runners share a single entry point:
+
+```bash
+python -m maxtext.eval.runner.run --runner <eval|lm_eval|evalchemy> [flags]
+```
+
+### Custom datasets (MLPerf OpenOrca, ROUGE scoring, and others)
+
+```bash
+python -m maxtext.eval.runner.run \
+  --runner eval \
+  --config src/maxtext/eval/configs/mlperf.yml \
+  --checkpoint_path gs://<bucket>/checkpoints/0/items \
+  --model_name llama3.1-8b \
+  --hf_path meta-llama/Llama-3.1-8B-Instruct \
+  --base_output_directory gs://<bucket>/ \
+  --run_name eval_run \
+  --max_model_len 8192 \
+  --hf_token $HF_TOKEN
+```
+
+HF safetensors mode (no MaxText checkpoint):
+
+```bash
+python -m maxtext.eval.runner.run \
+  --runner eval \
+  --config src/maxtext/eval/configs/mlperf.yml \
+  --hf_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+  --model_name tinyllama \
+  --base_output_directory gs://<bucket>/ \
+  --run_name eval_test \
+  --hf_mode \
+  --num_samples 20 \
+  --max_model_len 2048 \
+  --tensor_parallel_size 1
+```
+
+### LM Eval
+
+Requires: `pip install "lm_eval[api]"`
+
+```bash
+python -m maxtext.eval.runner.run \
+  --runner lm_eval \
+  --checkpoint_path gs://<bucket>/checkpoints/0/items \
+  --model_name qwen3-30b-a3b \
+  --hf_path Qwen/Qwen3-30B-A3B \
+  --tasks gsm8k \
+  --base_output_directory gs://<bucket>/ \
+  --run_name my_run \
+  --max_model_len 8192 \
+  --tensor_parallel_size 8 \
+  --expert_parallel_size 8 \
+  --hf_token $HF_TOKEN
+```
+
+### Evalchemy
+
+Requires: `pip install git+https://github.com/mlfoundations/evalchemy.git`
+
+```bash
+python -m maxtext.eval.runner.run \
+  --runner evalchemy \
+  --checkpoint_path gs://<bucket>/checkpoints/0/items \
+  --model_name llama3.1-8b \
+  --hf_path meta-llama/Llama-3.1-8B-Instruct \
+  --tasks ifeval math500 gpqa_diamond \
+  --base_output_directory gs://<bucket>/ \
+  --run_name eval_run \
+  --max_model_len 8192 \
+  --tensor_parallel_size 4 \
+  --hf_token $HF_TOKEN
+```
+
+## Common Flags
+
+| Flag | Description |
+|---|---|
+| `--checkpoint_path` | MaxText Orbax checkpoint path. Enables `MaxTextForCausalLM` mode. |
+| `--model_name` | MaxText model name (e.g. `llama3.1-8b`) |
+| `--hf_path` | HF model ID or local path |
+| `--max_model_len` | vLLM max context length. |
+| `--tensor_parallel_size` | Chips per model replica |
+| `--expert_parallel_size` | Chips for the expert mesh axis |
+| `--data_parallel_size` | Number of model replicas |
+| `--hbm_memory_utilization` | Fraction of HBM reserved for KV cache |
+| `--hf_token` | HF token. Prefer setting the `HF_TOKEN` env var instead: command-line arguments are visible in `ps` and shell history |
+| `--hf_mode` | HF safetensors mode, no MaxText checkpoint loading |
+| `--server_host` / `--server_port` | vLLM server address (default: localhost:8000) |
+| `--max_num_batched_tokens` | vLLM tokens per scheduler step |
+| `--max_num_seqs` | vLLM max concurrent sequences |
+| `--gcs_results_path` | GCS path to upload results JSON |
+| `--log_level` | Logging verbosity (default: INFO) |
+
+Custom `eval` specific:
+
+| Flag | Description |
+|---|---|
+| `--config` | Benchmark YAML config (required) |
+| `--num_samples` | Limit eval samples |
+| `--max_tokens` | Max tokens per generation |
+| `--temperature` | Sampling temperature (default: 0.0) |
+| `--concurrency` | HTTP request concurrency (default: 64) |
+
+Harness `lm_eval` / `evalchemy` specific:
+
+| Flag | Description |
+|---|---|
+| `--tasks` | Space-separated task names |
+| `--num_fewshot` | Few-shot examples per task (default: 0) |
+| `--num_samples` | Limit samples per task (default: full dataset) |
+
+## Eval on RL Checkpoints
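+
+To evaluate a checkpoint produced by an RL run, point `--checkpoint_path` at the actor's `model_params` directory for the training step you want to evaluate, as in the example below.
+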
+Example (Qwen3-30B-A3B, v6e-8):
+
+```bash
+STEP=244
+MODEL=qwen3-30b-a3b
+HF_PATH=Qwen/Qwen3-30B-A3B
+CHECKPOINT=gs://<bucket>/run/checkpoints/actor/${STEP}/model_params
+OUTPUT=gs://<bucket>/eval/
+
+python -m maxtext.eval.runner.run \
+  --runner lm_eval \
+  --checkpoint_path ${CHECKPOINT} \
+  --model_name ${MODEL} \
+  --hf_path ${HF_PATH} \
+  --tasks gsm8k \
+  --base_output_directory ${OUTPUT} \
+  --run_name rl_${MODEL}_step${STEP} \
+  --max_model_len 4096 \
+  --tensor_parallel_size 8 \
+  --expert_parallel_size 8 \
+  --num_samples 20 \
+  --hf_token $HF_TOKEN
+```
+
+
+## Adding a Custom Benchmark
+
+1. Implement `BenchmarkDataset` in `src/maxtext/eval/datasets/`:
+
+```python
+from maxtext.eval.datasets.base import BenchmarkDataset, SampleRequest
+
+class MyDataset(BenchmarkDataset):
+    name = "my_benchmark"
+
+    def sample_requests(self, num_samples, tokenizer) -> list[SampleRequest]:
+        ...  # load the dataset, build prompts, and return a list of SampleRequest
+```
+
+2. Register in `src/maxtext/eval/datasets/registry.py`:
+
+```python
+from maxtext.eval.datasets.my_dataset import MyDataset
+DATASET_REGISTRY["my_benchmark"] = MyDataset
+```
+
+3. Add a scorer in `src/maxtext/eval/scoring/` and register it in `src/maxtext/eval/scoring/registry.py`.
@@ -0,0 +1,8 @@
+# Base evaluation configuration.
+
+temperature: 0.0
+concurrency: 64
+server_host: "localhost"
+server_port: 8000
+tensor_parallel_size: 4
+num_samples: null
@@ -0,0 +1,5 @@
+# MLPerf OpenOrca evaluation config.
+
+benchmark: "mlperf_openorca"
+max_tokens: 1024
+num_samples: 5000
Original file line number	Diff line number	Diff line change
`@@ -112,7 +112,7 @@ jobs:`
`112`	`112`	`// Ignore the current running workflow`
`113`	`113`	`if (checkRun.name.endsWith(context.job)) continue`
`114`	`114`
`115`		`- if (checkRun.status !== 'completed' \|\| checkRun.conclusion !== 'success') {`
	`115`	`+ if (checkRun.status !== 'completed' \|\| !['success', 'skipped'].includes(checkRun.conclusion)) {`
`116`	`116`	`` core.info(`Waiting for check: ${checkRun.name} (Status: ${checkRun.status}, Conclusion: ${checkRun.conclusion})`); ``
`117`	`117`	`return; // Exit without failing`
`118`	`118`	`}`
Original file line number	Diff line number	Diff line change
`@@ -85,7 +85,7 @@ jobs:`
`85`	`85`	`settings: \|-`
`86`	`86`	`{`
`87`	`87`	`"model": {`
`88`		`- "maxSessionTurns": 15`
	`88`	`+ "maxSessionTurns": 50`
`89`	`89`	`},`
`90`	`90`	`"mcpServers": {`
`91`	`91`	`"github": {`