Skip to content

Commit 169b196

Browse files
authored
Merge pull request #3780 from luarss/topic/tb-sweep
Add TensorBoard logging for AutoTuner sweep mode
2 parents fcbb70d + 7170733 commit 169b196

3 files changed

Lines changed: 170 additions & 26 deletions

File tree

tools/AutoTuner/src/autotuner/distributed.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -92,14 +92,15 @@
9292
read_config,
9393
read_metrics,
9494
prepare_ray_server,
95+
calculate_score,
96+
ERROR_METRIC,
9597
CONSTRAINTS_SDC,
9698
FASTROUTE_TCL,
9799
)
100+
from autotuner.tensorboard_logger import TensorBoardLogger
98101

99102
# Name of the final metric
100103
METRIC = "metric"
101-
# The worst of optimized metric
102-
ERROR_METRIC = 9e99
103104
# Path to the FLOW_HOME directory
104105
ORFS_FLOW_DIR = os.path.abspath(
105106
os.path.join(os.path.dirname(__file__), "../../../../flow")
@@ -172,16 +173,7 @@ def evaluate(self, metrics):
172173
It can change in any form to minimize the score (return value).
173174
Default evaluation function optimizes effective clock period.
174175
"""
175-
error = "ERR" in metrics.values()
176-
not_found = "N/A" in metrics.values()
177-
if error or not_found:
178-
return (ERROR_METRIC, "-", "-", "-")
179-
effective_clk_period = metrics["clk_period"] - metrics["worst_slack"]
180-
num_drc = metrics["num_drc"]
181-
gamma = effective_clk_period / 10
182-
score = effective_clk_period
183-
score = score * (100 / self.step_) + gamma * num_drc
184-
return (score, effective_clk_period, num_drc, metrics["die_area"])
176+
return calculate_score(metrics, step=self.step_)
185177

186178
def _is_valid_config(self, config):
187179
"""
@@ -566,6 +558,14 @@ def sweep():
566558
else:
567559
repo_dir = os.path.abspath(os.path.join(ORFS_FLOW_DIR, ".."))
568560
print(f"[INFO TUN-0012] Log folder {LOCAL_DIR}.")
561+
562+
tb_log_dir = os.path.join(LOCAL_DIR, args.experiment)
563+
print(
564+
f"[INFO TUN-0034] TensorBoard logging enabled. Run: tensorboard --logdir={tb_log_dir}"
565+
)
566+
567+
tb_logger = TensorBoardLogger.remote(log_dir=tb_log_dir)
568+
569569
queue = Queue()
570570
parameter_list = list()
571571
for name, content in config_dict.items():
@@ -581,10 +581,22 @@ def sweep():
581581
temp = dict()
582582
for value in parameter:
583583
temp.update(value)
584-
queue.put([args, repo_dir, temp, SDC_ORIGINAL, FR_ORIGINAL, INSTALL_PATH])
584+
queue.put(
585+
[
586+
args,
587+
repo_dir,
588+
temp,
589+
SDC_ORIGINAL,
590+
FR_ORIGINAL,
591+
INSTALL_PATH,
592+
tb_logger,
593+
]
594+
)
585595
workers = [consumer.remote(queue) for _ in range(args.jobs)]
586596
print("[INFO TUN-0009] Waiting for results.")
587597
ray.get(workers)
598+
ray.get(tb_logger.close.remote())
599+
print(f"[INFO TUN-0035] TensorBoard events written to {tb_log_dir}")
588600
print("[INFO TUN-0010] Sweep complete.")
589601

590602

tools/AutoTuner/src/autotuner/tensorboard_logger.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import logging
2+
import os
3+
from typing import Any, Union
4+
5+
import ray
6+
from tensorboardX import SummaryWriter
7+
8+
from autotuner.utils import ERROR_METRIC
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
@ray.remote
class TensorBoardLogger:
    """Ray actor that records AutoTuner sweep results as TensorBoard events.

    One ``SummaryWriter`` is owned by the actor; every call to
    ``log_sweep_metrics`` is written at the current ``step`` and then
    advances ``step`` by one.
    """

    def __init__(self, log_dir: str):
        """Create ``log_dir`` if needed and open a writer on it."""
        os.makedirs(log_dir, exist_ok=True)
        self.writer = SummaryWriter(log_dir=log_dir)
        self.log_dir = log_dir
        # Number of sweep runs logged so far; doubles as the scalar x-axis.
        self.step = 0
        logger.info(f"TensorBoard logs will be written to {log_dir}")

    def log_sweep_metrics(
        self,
        params: dict[str, Any],
        metrics: dict[str, Any],
        score: float,
        effective_clk_period: Union[float, str],
        num_drc: Union[int, str],
        die_area: Union[float, str],
    ) -> None:
        """Record the outcome of one sweep run.

        ``score`` is always written. ``effective_clk_period``, ``num_drc``,
        ``die_area`` and the entries of ``metrics`` are written only when
        they are numeric, so placeholder strings are skipped. ``params`` is
        recorded as hyperparameters with ``score`` as the associated metric;
        values that are not int/float/str/bool are stringified first.
        """
        current_step = self.step
        add_scalar = self.writer.add_scalar

        add_scalar("sweep/score", score, current_step)

        # Derived values may be non-numeric placeholders; log numbers only.
        derived = (
            ("sweep/effective_clk_period", effective_clk_period),
            ("sweep/num_drc", num_drc),
            ("sweep/die_area", die_area),
        )
        for tag, reading in derived:
            if isinstance(reading, (int, float)):
                add_scalar(tag, reading, current_step)

        for metric_name, reading in metrics.items():
            if isinstance(reading, (int, float)):
                add_scalar(f"metrics/{metric_name}", reading, current_step)

        hparams = {}
        for param_name, setting in params.items():
            if isinstance(setting, (int, float, str, bool)):
                hparams[param_name] = setting
            else:
                hparams[param_name] = str(setting)
        self.writer.add_hparams(hparams, {"hparam/metric": score})

        self.step = current_step + 1

    def close(self) -> None:
        """Close the writer, then log how to view the results and the run count."""
        self.writer.close()
        logger.info(
            f"Sweep complete. View results with: tensorboard --logdir={self.log_dir}"
        )
        logger.info(f"Total runs logged: {self.step}")

tools/AutoTuner/src/autotuner/utils.py

Lines changed: 78 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,24 @@
6969
# Name of the TCL script run before routing
7070
FASTROUTE_TCL = "fastroute.tcl"
7171
DATE = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
72+
# The worst of optimized metric
73+
ERROR_METRIC = 9e99
74+
75+
76+
def calculate_score(metrics, step=1):
    """Calculate optimization score from metrics.

    Returns a 4-tuple ``(score, effective_clk_period, num_drc, die_area)``.
    If any metric value is the string "ERR" or "N/A", the tuple
    ``(ERROR_METRIC, "-", "-", "-")`` is returned instead.

    The score is the effective clock period (``clk_period - worst_slack``)
    scaled by ``100 / step``, plus one tenth of the effective clock period
    for every DRC violation.
    """
    reported = metrics.values()
    if "ERR" in reported or "N/A" in reported:
        return (ERROR_METRIC, "-", "-", "-")

    effective_clk_period = metrics["clk_period"] - metrics["worst_slack"]
    num_drc = metrics["num_drc"]

    # Penalty weight per DRC violation.
    per_drc_penalty = effective_clk_period / 10
    scaled_period = effective_clk_period * (100 / step)
    score = scaled_period + per_drc_penalty * num_drc

    return (score, effective_clk_period, num_drc, metrics["die_area"])
7290

7391

7492
def write_sdc(variables, path, sdc_original, constraints_sdc):
@@ -287,6 +305,21 @@ def run_command(
287305
raise RuntimeError
288306

289307

308+
def calculate_trial_path(args, base_dir, flow_variant):
    """
    Build the log directory and experiment-qualified variant for a trial.

    Returns ``(log_path, flow_variant_with_experiment)`` where the variant is
    ``"<args.experiment>/<flow_variant>"`` and ``log_path`` is the absolute
    form of ``<base_dir>/flow/logs/<args.platform>/<args.design>/<variant>``.
    """
    qualified_variant = f"{args.experiment}/{flow_variant}"
    design_log_dir = f"flow/logs/{args.platform}/{args.design}"
    joined = os.path.join(base_dir, design_log_dir, qualified_variant)
    return os.path.abspath(joined), qualified_variant
321+
322+
290323
def openroad(
291324
args,
292325
base_dir,
@@ -297,10 +330,8 @@ def openroad(
297330
"""
298331
Run OpenROAD-flow-scripts with a given set of parameters.
299332
"""
300-
# Make sure path ends in a slash, i.e., is a folder
301-
flow_variant = f"{args.experiment}/{flow_variant}"
302-
log_path = os.path.abspath(
303-
os.path.join(base_dir, f"flow/logs/{args.platform}/{args.design}", flow_variant)
333+
log_path, flow_variant = calculate_trial_path(
334+
args=args, base_dir=base_dir, flow_variant=flow_variant
304335
)
305336
report_path = os.path.abspath(
306337
os.path.join(
@@ -643,6 +674,20 @@ def openroad_distributed(
643674
variant=None,
644675
):
645676
"""Simple wrapper to run openroad distributed with Ray."""
677+
if variant is None:
678+
variant_parts = []
679+
for key, value in config.items():
680+
if key not in ["_SDC_FILE_PATH", "_FR_FILE_PATH"]:
681+
variant_parts.append(f"{key}_{value}")
682+
variant = "_".join(variant_parts) if variant_parts else ""
683+
flow_variant = f"{uuid.uuid4()}-{variant}" if variant else f"{uuid.uuid4()}"
684+
685+
trial_path, _ = calculate_trial_path(
686+
args=args, base_dir=repo_dir, flow_variant=flow_variant
687+
)
688+
689+
os.makedirs(trial_path, exist_ok=True)
690+
646691
config = parse_config(
647692
config=config,
648693
base_dir=repo_dir,
@@ -651,15 +696,15 @@ def openroad_distributed(
651696
constraints_sdc=CONSTRAINTS_SDC,
652697
fr_original=fr_original,
653698
fastroute_tcl=FASTROUTE_TCL,
699+
path=trial_path,
654700
)
655-
if variant is None:
656-
variant = config.replace(" ", "_").replace("=", "_")
701+
657702
t = time.time()
658703
metric_file = openroad(
659704
args=args,
660705
base_dir=repo_dir,
661706
parameters=config,
662-
flow_variant=f"{uuid.uuid4()}-{variant}" if variant else f"{uuid.uuid4()}",
707+
flow_variant=flow_variant,
663708
install_path=install_path,
664709
)
665710
duration = time.time() - t
@@ -669,9 +714,29 @@ def openroad_distributed(
669714
@ray.remote
def consumer(queue):
    """
    Drain the sweep queue, running one trial per item and logging its result.

    Each queue item is unpacked as
    ``(args, repo_dir, config, sdc, fr, install, tb_logger)``. For every item
    the trial is run through ``openroad_distributed``, the resulting metrics
    file is read with ``read_metrics``, scored with ``calculate_score``
    (default ``step``), and the result is sent to
    ``tb_logger.log_sweep_metrics``.

    The function returns once the queue is empty. Items are fetched without
    blocking so that a consumer which finds the queue empty exits instead of
    waiting forever. That covers both a consumer started with nothing to do
    and one that loses the race for the last item to another consumer.
    """
    # Function-scope import so the module's import block stays untouched.
    from ray.util.queue import Empty

    while True:
        try:
            # A single non-blocking get replaces the racy empty()-then-get().
            item = queue.get(block=False)
        except Empty:
            return

        args, repo_dir, config, sdc, fr, install, tb_logger = item
        print(f"[INFO TUN-0007] Scheduling run for parameter {config}.")
        # Only the first element of the result (the metrics file) is used.
        metric_file, _ = ray.get(
            openroad_distributed.remote(args, repo_dir, config, sdc, fr, install)
        )
        print(f"[INFO TUN-0008] Finished run for parameter {config}.")

        metrics = read_metrics(metric_file, args.stop_stage)
        score, effective_clk_period, num_drc, die_area = calculate_score(metrics)

        # Wait for the logger so write errors surface in this worker.
        ray.get(
            tb_logger.log_sweep_metrics.remote(
                params=config,
                metrics=metrics,
                score=score,
                effective_clk_period=effective_clk_period,
                num_drc=num_drc,
                die_area=die_area,
            )
        )

0 commit comments

Comments
 (0)