Merge pull request #3780 from luarss/topic/tb-sweep

vvbandeira · web-flow · commit 169b1966d57b · 2026-01-22T14:13:37.000-03:00
Add TensorBoard logging for AutoTuner sweep mode
diff --git a/tools/AutoTuner/src/autotuner/distributed.py b/tools/AutoTuner/src/autotuner/distributed.py
@@ -92,14 +92,15 @@
     read_config,
     read_metrics,
     prepare_ray_server,
+    calculate_score,
+    ERROR_METRIC,
     CONSTRAINTS_SDC,
     FASTROUTE_TCL,
 )
+from autotuner.tensorboard_logger import TensorBoardLogger
 
 # Name of the final metric
 METRIC = "metric"
-# The worst of optimized metric
-ERROR_METRIC = 9e99
 # Path to the FLOW_HOME directory
 ORFS_FLOW_DIR = os.path.abspath(
     os.path.join(os.path.dirname(__file__), "../../../../flow")
@@ -172,16 +173,7 @@ def evaluate(self, metrics):
         It can change in any form to minimize the score (return value).
         Default evaluation function optimizes effective clock period.
         """
-        error = "ERR" in metrics.values()
-        not_found = "N/A" in metrics.values()
-        if error or not_found:
-            return (ERROR_METRIC, "-", "-", "-")
-        effective_clk_period = metrics["clk_period"] - metrics["worst_slack"]
-        num_drc = metrics["num_drc"]
-        gamma = effective_clk_period / 10
-        score = effective_clk_period
-        score = score * (100 / self.step_) + gamma * num_drc
-        return (score, effective_clk_period, num_drc, metrics["die_area"])
+        return calculate_score(metrics, step=self.step_)
 
     def _is_valid_config(self, config):
         """
@@ -566,6 +558,14 @@ def sweep():
     else:
         repo_dir = os.path.abspath(os.path.join(ORFS_FLOW_DIR, ".."))
     print(f"[INFO TUN-0012] Log folder {LOCAL_DIR}.")
+
+    tb_log_dir = os.path.join(LOCAL_DIR, args.experiment)
+    print(
+        f"[INFO TUN-0034] TensorBoard logging enabled. Run: tensorboard --logdir={tb_log_dir}"
+    )
+
+    tb_logger = TensorBoardLogger.remote(log_dir=tb_log_dir)
+
     queue = Queue()
     parameter_list = list()
     for name, content in config_dict.items():
@@ -581,10 +581,22 @@ def sweep():
         temp = dict()
         for value in parameter:
             temp.update(value)
-        queue.put([args, repo_dir, temp, SDC_ORIGINAL, FR_ORIGINAL, INSTALL_PATH])
+        queue.put(
+            [
+                args,
+                repo_dir,
+                temp,
+                SDC_ORIGINAL,
+                FR_ORIGINAL,
+                INSTALL_PATH,
+                tb_logger,
+            ]
+        )
     workers = [consumer.remote(queue) for _ in range(args.jobs)]
     print("[INFO TUN-0009] Waiting for results.")
     ray.get(workers)
+    ray.get(tb_logger.close.remote())
+    print(f"[INFO TUN-0035] TensorBoard events written to {tb_log_dir}")
     print("[INFO TUN-0010] Sweep complete.")
 
 
diff --git a/tools/AutoTuner/src/autotuner/tensorboard_logger.py b/tools/AutoTuner/src/autotuner/tensorboard_logger.py
@@ -0,0 +1,67 @@
+import logging
+import os
+from typing import Any, Union
+
+import ray
+from tensorboardX import SummaryWriter
+
+from autotuner.utils import ERROR_METRIC
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class TensorBoardLogger:
+    """Ray actor that writes AutoTuner sweep results as TensorBoard events."""
+
+    def __init__(self, log_dir: str):
+        """Create ``log_dir`` if needed and open a SummaryWriter on it."""
+        os.makedirs(log_dir, exist_ok=True)
+        self.log_dir = log_dir
+        self.step = 0  # index of the next run to log; also the runs logged so far
+        self.writer = SummaryWriter(log_dir=log_dir)
+        logger.info(f"TensorBoard logs will be written to {log_dir}")
+
+    def log_sweep_metrics(
+        self,
+        params: dict[str, Any],
+        metrics: dict[str, Any],
+        score: float,
+        effective_clk_period: Union[float, str],
+        num_drc: Union[int, str],
+        die_area: Union[float, str],
+    ) -> None:
+        """Record one sweep run at the current step, then advance the step.
+
+        ``score`` is always written. The three derived values and every entry
+        of ``metrics`` are written only when they are int/float, so string
+        placeholders are skipped. ``params`` are written as hparams, with
+        values that are not int/float/str/bool converted via ``str``.
+        """
+        run = self.step
+        self.writer.add_scalar("sweep/score", score, run)
+
+        candidates = [
+            ("sweep/effective_clk_period", effective_clk_period),
+            ("sweep/num_drc", num_drc),
+            ("sweep/die_area", die_area),
+        ]
+        candidates.extend((f"metrics/{key}", value) for key, value in metrics.items())
+        for tag, value in candidates:
+            if isinstance(value, (int, float)):
+                self.writer.add_scalar(tag, value, run)
+
+        hparams = {}
+        for name, setting in params.items():
+            primitive = isinstance(setting, (int, float, str, bool))
+            hparams[name] = setting if primitive else str(setting)
+        self.writer.add_hparams(hparams, {"hparam/metric": score})
+        self.step = run + 1
+
+    def close(self) -> None:
+        """Close the writer, then log the viewing command and the run count."""
+        self.writer.close()
+        logger.info(
+            f"Sweep complete. View results with: tensorboard --logdir={self.log_dir}"
+        )
+        logger.info(f"Total runs logged: {self.step}")
diff --git a/tools/AutoTuner/src/autotuner/utils.py b/tools/AutoTuner/src/autotuner/utils.py
@@ -69,6 +69,24 @@
 # Name of the TCL script run before routing
 FASTROUTE_TCL = "fastroute.tcl"
 DATE = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+# The worst of optimized metric
+ERROR_METRIC = 9e99
+
+
+def calculate_score(metrics, step=1):
+    """Return (score, effective clock period, DRC count, die area) for a run.
+
+    Any "ERR" or "N/A" metric value yields (ERROR_METRIC, "-", "-", "-").
+    The clock-period term of the score is scaled by 100 / step.
+    """
+    observed = metrics.values()
+    if "ERR" in observed or "N/A" in observed:
+        return (ERROR_METRIC, "-", "-", "-")
+    period = metrics["clk_period"] - metrics["worst_slack"]
+    drc_count = metrics["num_drc"]
+    # Each DRC violation adds a tenth of the effective period as a penalty.
+    total = period * (100 / step) + (period / 10) * drc_count
+    return (total, period, drc_count, metrics["die_area"])
 
 
 def write_sdc(variables, path, sdc_original, constraints_sdc):
@@ -287,6 +305,21 @@ def run_command(
         raise RuntimeError
 
 
+def calculate_trial_path(args, base_dir, flow_variant):
+    """
+    Build the per-trial log directory and the experiment-scoped flow variant.
+
+    Returns a tuple (absolute log path, "<experiment>/<flow_variant>"); the
+    log path is <base_dir>/flow/logs/<platform>/<design>/<that variant>.
+    """
+    scoped_variant = f"{args.experiment}/{flow_variant}"
+    design_logs = f"flow/logs/{args.platform}/{args.design}"
+    # Only the path string is computed here; no directory is created.
+    joined = os.path.join(base_dir, design_logs, scoped_variant)
+    trial_logs = os.path.abspath(joined)
+    return trial_logs, scoped_variant
+
+
 def openroad(
     args,
     base_dir,
@@ -297,10 +330,8 @@ def openroad(
     """
     Run OpenROAD-flow-scripts with a given set of parameters.
     """
-    # Make sure path ends in a slash, i.e., is a folder
-    flow_variant = f"{args.experiment}/{flow_variant}"
-    log_path = os.path.abspath(
-        os.path.join(base_dir, f"flow/logs/{args.platform}/{args.design}", flow_variant)
+    log_path, flow_variant = calculate_trial_path(
+        args=args, base_dir=base_dir, flow_variant=flow_variant
     )
     report_path = os.path.abspath(
         os.path.join(
@@ -643,6 +674,20 @@ def openroad_distributed(
     variant=None,
 ):
     """Simple wrapper to run openroad distributed with Ray."""
+    if variant is None:
+        variant_parts = []
+        for key, value in config.items():
+            if key not in ["_SDC_FILE_PATH", "_FR_FILE_PATH"]:
+                variant_parts.append(f"{key}_{value}")
+        variant = "_".join(variant_parts) if variant_parts else ""
+    flow_variant = f"{uuid.uuid4()}-{variant}" if variant else f"{uuid.uuid4()}"
+
+    trial_path, _ = calculate_trial_path(
+        args=args, base_dir=repo_dir, flow_variant=flow_variant
+    )
+
+    os.makedirs(trial_path, exist_ok=True)
+
     config = parse_config(
         config=config,
         base_dir=repo_dir,
@@ -651,15 +696,15 @@ def openroad_distributed(
         constraints_sdc=CONSTRAINTS_SDC,
         fr_original=fr_original,
         fastroute_tcl=FASTROUTE_TCL,
+        path=trial_path,
     )
-    if variant is None:
-        variant = config.replace(" ", "_").replace("=", "_")
+
     t = time.time()
     metric_file = openroad(
         args=args,
         base_dir=repo_dir,
         parameters=config,
-        flow_variant=f"{uuid.uuid4()}-{variant}" if variant else f"{uuid.uuid4()}",
+        flow_variant=flow_variant,
         install_path=install_path,
     )
     duration = time.time() - t
@@ -669,9 +714,29 @@ def openroad_distributed(
 @ray.remote
 def consumer(queue):
-    """consumer"""
-    while not queue.empty():
-        next_item = queue.get()
-        name = next_item[1]
-        print(f"[INFO TUN-0007] Scheduling run for parameter {name}.")
-        ray.get(openroad_distributed.remote(*next_item))
-        print(f"[INFO TUN-0008] Finished run for parameter {name}.")
+    """Drain the sweep queue: run each config and log its metrics to TensorBoard."""
+    from ray.util.queue import Empty
+    while True:
+        # Non-blocking get: a blocking get() would hang this worker forever when
+        # peers drain the queue first (e.g. more jobs than configs).
+        try:
+            item = queue.get(block=False)
+        except Empty:
+            return
+        args, repo_dir, config, sdc, fr, install, tb_logger = item
+        print(f"[INFO TUN-0007] Scheduling run for parameter {config}.")
+        metric_file, _ = ray.get(
+            openroad_distributed.remote(args, repo_dir, config, sdc, fr, install)
+        )
+        print(f"[INFO TUN-0008] Finished run for parameter {config}.")
+        metrics = read_metrics(metric_file, args.stop_stage)
+        score, effective_clk_period, num_drc, die_area = calculate_score(metrics)
+        ray.get(
+            tb_logger.log_sweep_metrics.remote(
+                params=config,
+                metrics=metrics,
+                score=score,
+                effective_clk_period=effective_clk_period,
+                num_drc=num_drc,
+                die_area=die_area,
+            )
+        )