Add wandb support

Zephyr271828 · Zephyr271828 · commit 22de737a5427 · 2026-03-23T12:16:06.000+08:00
diff --git a/src/maxtext/common/metric_logger.py b/src/maxtext/common/metric_logger.py
@@ -20,6 +20,7 @@
 import os
 import queue
 import enum
+import wandb
 
 import numpy as np
 
@@ -99,8 +100,16 @@ def __init__(self, config, learning_rate_schedule):
     self.learning_rate_schedule = learning_rate_schedule
     self.cumulative_eval_metrics = {"scalar": defaultdict(float)}
     self.buffered_train_metrics = None
+    
     if self.config.managed_mldiagnostics:
       ManagedMLDiagnostics(config)  # Initialize the MLRun instance.
+      
+    if self.config.enable_wandb and jax.process_index() == 0: 
+      wandb.init(
+        project=config.wandb_project_name,
+        name=config.wandb_run_name,
+        resume="allow",
+      ) # Initialize wandb logger.
 
   def reset_eval_metrics(self):
     """Resets the cumulative metrics dictionary for a new evaluation run."""
@@ -122,6 +131,9 @@ def write_metrics(self, metrics, step, is_training=True):
 
       if self.config.managed_mldiagnostics:
         self.write_metrics_to_managed_mldiagnostics(metrics, step)
+        
+      if self.config.enable_wandb and jax.process_index() == 0:
+        self.write_metrics_to_wandb(metrics, step)
 
   def log_metrics(self, metrics, step, is_training):
     """Logs metrics via max_logging."""
@@ -267,6 +279,16 @@ def write_metrics_to_managed_mldiagnostics(self, metrics, step):
         mapped_metric_name = _METRICS_TO_MANAGED.get(metric_name, metric_name)
         mldiag.metrics.record(mapped_metric_name, value, step=int(step))
 
+  def write_metrics_to_wandb(self, metrics, step):
+    """Flattens the "scalar" and nested "scalars" metrics into one dict and logs it to wandb at `step`."""
+    # Top-level scalars keep their own name; every value is coerced to a plain Python float.
+    flat_metrics = {key: float(val) for key, val in metrics.get("scalar", {}).items()}
+    # Each nested group is flattened into "<group>/<name>" keys.
+    for key, val in metrics.get("scalars", {}).items():
+      flat_metrics.update({f"{key}/{subkey}": float(subval) for subkey, subval in val.items()})
+    # Cast the step the same way write_metrics_to_managed_mldiagnostics does, so wandb gets a plain int.
+    wandb.log(flat_metrics, step=int(step))
+
   def write_setup_info_to_tensorboard(self, params):
     """Writes setup information like train config params, num model params, and XLA flags to TensorBoard."""
     num_model_parameters = max_utils.calculate_num_params_from_pytree(params)
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -93,6 +93,10 @@ metrics_file: "" # for testing, local file that stores scalar metrics. if empty,
 # if true save metrics such as loss and tflops to gcs in {base_output_directory}/{run_name}/metrics/
 gcs_metrics: false
 
+enable_wandb: False # if true, process 0 initializes a Weights & Biases (wandb) run and logs scalar metrics to it
+wandb_project_name: "" # passed to wandb.init as the project
+wandb_run_name: "" # passed to wandb.init as the run name
+
 # if true save config to gcs in {base_output_directory}/{run_name}/
 save_config_to_gcs: false
 
diff --git a/src/maxtext/configs/pyconfig.py b/src/maxtext/configs/pyconfig.py
@@ -170,7 +170,7 @@ def _prepare_for_pydantic(raw_keys: dict[str, Any]) -> dict[str, Any]:
   for key, value in raw_keys.items():
     if key not in valid_fields:
       logger.warning("Ignoring invalid/unsupported field from YAML/CLI: %s", repr(key))
-      raise ValueError(f"{key!r} not in {', '.join(map(repr, valid_fields))}.")
+      raise ValueError(f"{key!r} not in {", ".join(map(repr, valid_fields))}.")
 
     new_value = value
     if isinstance(new_value, str) and new_value.lower() == "none":
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -1471,6 +1471,9 @@ class Metrics(BaseModel):
       False,
       description="Whether to enable Tunix-managed metrics measurement. The metrics will be uploaded to tensorboard.",
   )
+  enable_wandb: bool = Field(False, description="Enable Weights & Biases logging.")
+  wandb_project_name: str = Field("maxtext", description="Weights & Biases project name.")
+  wandb_run_name: str = Field("", description="Weights & Biases run name. If empty, a default name is generated.")
 
 
 class ManagedMLDiagnostics(BaseModel):

Original file line number	Diff line number	Diff line change
`@@ -1471,6 +1471,9 @@ class Metrics(BaseModel):`
`1471`	`1471`	`False,`
`1472`	`1472`	`description="Whether to enable Tunix-managed metrics measurement. The metrics will be uploaded to tensorboard.",`
`1473`	`1473`	`)`
	`1474`	`+ enable_wandb: bool = Field(False, description="Enable Weights & Biases logging.")`
	`1475`	`+ wandb_project_name: str = Field("maxtext", description="Weights & Biases project name.")`
	`1476`	`+ wandb_run_name: str = Field("", description="Weights & Biases run name. If empty, a default name is generated.")`
`1474`	`1477`
`1475`	`1478`
`1476`	`1479`	`class ManagedMLDiagnostics(BaseModel):`