Add HBM utilization option and additional vLLM arguments

dipannita08 · dipannita08 · commit 96e193733141 · 2026-04-10T12:48:43.000-07:00
diff --git a/src/maxtext/eval/runner/common.py b/src/maxtext/eval/runner/common.py
@@ -65,6 +65,7 @@ def build_server_manager(cfg: dict, token: str | None) -> "VllmServerManager":
     max_num_seqs = int(max_num_seqs)
 
   expert_parallel_size = int(cfg.get("expert_parallel_size") or 1)
+  hbm_memory_utilization = float(cfg.get("hbm_memory_utilization") or 0.3)
 
   server_env = {"HF_TOKEN": token} if token else None
 
@@ -79,6 +80,7 @@ def build_server_manager(cfg: dict, token: str | None) -> "VllmServerManager":
       max_model_len=max_model_len,
       max_num_batched_tokens=max_num_batched_tokens,
       max_num_seqs=max_num_seqs,
+      hbm_memory_utilization=hbm_memory_utilization,
       env=server_env,
   )
 
@@ -120,6 +122,14 @@ def add_server_args(parser: argparse.ArgumentParser) -> None:
           "Chips allocated to the expert mesh axis (EP). "
       ),
   )
+  parser.add_argument(
+      "--hbm_memory_utilization",
+      type=float,
+      default=0.3,
+      help=(
+          "Fraction of HBM reserved for KV cache."
+      ),
+  )
   parser.add_argument("--hf_token", help="HuggingFace token for gated models.")
   parser.add_argument(
       "--gcs_results_path", help="Optional secondary GCS path to upload the results JSON."
diff --git a/src/maxtext/eval/runner/harness_runner.py b/src/maxtext/eval/runner/harness_runner.py
@@ -76,22 +76,37 @@ def _map_results(raw_results: dict, tasks: list[str], task_map: dict[str, str])
     lm_task = task_map.get(task, task)
     task_r = results_section.get(lm_task, {})
 
-    acc = task_r.get("acc,none")
-    if acc is None:
-      acc = task_r.get("exact_match,none")
-    if acc is None:
-      acc = task_r.get("acc")
-    if acc is None:
-      acc = task_r.get("score")
-
-    acc_norm = task_r.get("acc_norm,none")
-    if acc_norm is None:
-      acc_norm = task_r.get("acc_norm")
+    acc = None
+    for key in (
+        "acc,none",
+        "exact_match,strict-match",
+        "exact_match,flexible-extract",
+        "exact_match,none",
+        "acc",
+        "score",
+    ):
+      if task_r.get(key) is not None:
+        acc = task_r[key]
+        break
+
+    acc_norm = None
+    for key in ("acc_norm,none", "acc_norm"):
+      if task_r.get(key) is not None:
+        acc_norm = task_r[key]
+        break
 
     if acc is not None:
       scores[f"{task}_accuracy"] = round(float(acc) * 100, 2)
     if acc_norm is not None:
       scores[f"{task}_accuracy_norm"] = round(float(acc_norm) * 100, 2)
+
+    if acc is None and task_r:
+      logger.warning(
+          "No known accuracy keys found for task '%s'. Available: %s",
+          task,
+          list(task_r.keys()),
+      )
+
   return scores
 
 
@@ -206,7 +221,10 @@ def run_harness(cfg: dict, hf_token: str | None = None) -> dict:
       benchmark="+".join(tasks),
       model_name=model_name,
       scores=scores,
-      generation_stats={f"{backend}_config": raw_results.get("config", {})},
+      generation_stats={
+          f"{backend}_config": raw_results.get("config", {}),
+          f"{backend}_results": raw_results.get("results", {}),
+      },
       config=cfg,
       results_path=results_path,
   )
diff --git a/src/maxtext/eval/runner/server_manager.py b/src/maxtext/eval/runner/server_manager.py
@@ -33,6 +33,7 @@ def _build_app(llm: Any) -> Any:
   """Return a FastAPI app that wraps an in-process vLLM LLM instance."""
   import fastapi  # pylint: disable=import-outside-toplevel
   from vllm.sampling_params import SamplingParams  # pylint: disable=import-outside-toplevel
+  globals()["fastapi"] = fastapi
 
   app = fastapi.FastAPI()
 
@@ -195,6 +196,7 @@ class VllmServerManager:
     max_num_batched_tokens: Tokens per scheduler step (None = vLLM default).
     max_num_seqs: Max concurrent sequences (None = vLLM default).
     startup_timeout: Seconds to wait for /health to return healthy.
+    hbm_memory_utilization: HBM fraction passed to vLLM as gpu_memory_utilization.
     env: Optional environment-variable overrides.
     additional_vllm_kwargs: Extra kwargs merged into the vLLM LLM() constructor.
   """
@@ -213,6 +215,7 @@ def __init__(
       max_num_batched_tokens: int | None = None,
       max_num_seqs: int | None = None,
       startup_timeout: int = 600,
+      hbm_memory_utilization: float = 0.3,
       env: dict[str, str] | None = None,
       additional_vllm_kwargs: dict | None = None,
   ):
@@ -235,6 +238,7 @@ def __init__(
     self.max_num_batched_tokens = max_num_batched_tokens
     self.max_num_seqs = max_num_seqs
     self.startup_timeout = startup_timeout
+    self.hbm_memory_utilization = hbm_memory_utilization
     self.env = env
     self.additional_vllm_kwargs = additional_vllm_kwargs or {}
 
@@ -255,6 +259,8 @@ def start(self) -> None:
     # V1 engine architecture is otherwise preserved (tpu-inference plugin works),
     # and JAX/TPU is initialised exactly once inside LLM() in this process.
     os.environ.setdefault("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    os.environ.setdefault("NEW_MODEL_DESIGN", "1")
+    os.environ.setdefault("SKIP_JAX_PRECOMPILE", "1")
 
     if self.env:
       os.environ.update(self.env)
@@ -268,6 +274,7 @@ def start(self) -> None:
         "tensor_parallel_size": ici_tp,
         "max_model_len": self.max_model_len,
         "dtype": self.dtype,
+        "gpu_memory_utilization": self.hbm_memory_utilization,
     }
     if self.max_num_batched_tokens is not None:
       vllm_kwargs["max_num_batched_tokens"] = self.max_num_batched_tokens
diff --git a/src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py b/src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py
@@ -20,12 +20,11 @@
 from flax import nnx
 import flax.linen as nn
 from jax import numpy as jnp
-from jax.sharding import AxisType, Mesh
+from jax.sharding import Mesh
 from maxtext.configs import pyconfig
 from maxtext.utils.globals import MAXTEXT_CONFIGS_DIR
 from maxtext.common.common_types import MODEL_MODE_AUTOREGRESSIVE
 from maxtext.utils import max_logging
-from maxtext.utils import maxtext_utils
 from maxtext.utils import model_creation_utils
 
 
@@ -99,9 +98,8 @@ def __init__(self, vllm_config: VllmConfig, rng_key: jax.Array, mesh: Mesh):
     self.cfg = vllm_config.model_config
     self.maxtext_config = generate_maxtext_config(vllm_config)
 
-    devices_array = maxtext_utils.create_device_mesh(self.maxtext_config)
-    axis_types = tuple([AxisType.Auto] * len(self.maxtext_config.mesh_axes))
-    self.mesh = Mesh(devices_array, self.maxtext_config.mesh_axes, axis_types=axis_types)
+    # Use the mesh passed in by the caller rather than building one here.
+    self.mesh = mesh
     self.model_mode = MODEL_MODE_AUTOREGRESSIVE
     self.is_text_generation_model = True
 
@@ -238,4 +236,4 @@ def load_weights(self, rng_key: jax.Array) -> None:
       model, _ = model_creation_utils.create_nnx_model(
           self.maxtext_config, mesh=self.mesh, model_mode=self.model_mode, rng_key=rng_key
       )
-      self.model = nnx.data(model)
+      self.model = nnx.data(model)