Skip to content

Commit ce13400

Browse files
[UX] Minor improvements of dstack metrics
* Show run/job status
* Better-format GPU metrics
* Allow finished runs
* Show GB (instead of MB)
1 parent aa25964 commit ce13400

1 file changed

Lines changed: 8 additions & 8 deletions

File tree

src/dstack/_internal/cli/commands/metrics.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,6 @@ def _command(self, args: argparse.Namespace):
3939
run = self.api.runs.get(run_name=args.run_name)
4040
if run is None:
4141
raise CLIError(f"Run {args.run_name} not found")
42-
if run.status.is_finished():
43-
raise CLIError(f"Run {args.run_name} is finished")
4442
metrics = _get_run_jobs_metrics(api=self.api, run=run)
4543

4644
if not args.watch:
@@ -78,11 +76,12 @@ def _get_run_jobs_metrics(api: Client, run: Run) -> List[JobMetrics]:
7876
def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
7977
table = Table(box=None)
8078
table.add_column("NAME", style="bold", no_wrap=True)
79+
table.add_column("STATUS")
8180
table.add_column("CPU")
8281
table.add_column("MEMORY")
8382
table.add_column("GPU")
8483

85-
run_row: Dict[Union[str, int], Any] = {"NAME": run.name}
84+
run_row: Dict[Union[str, int], Any] = {"NAME": run.name, "STATUS": run.status.value}
8685
if len(run._run.jobs) != 1:
8786
add_row_from_dict(table, run_row)
8887

@@ -101,9 +100,9 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
101100
cpu_usage = f"{cpu_usage:.0f}%"
102101
memory_usage = _get_metric_value(job_metrics, "memory_working_set_bytes")
103102
if memory_usage is not None:
104-
memory_usage = f"{round(memory_usage / 1024 / 1024)}MB"
103+
memory_usage = f"{round(memory_usage / 1024 / 1024 / 1024)}GB"
105104
if resources is not None:
106-
memory_usage += f"/{resources.memory_mib}MB"
105+
memory_usage += f"/{round(resources.memory_mib / 1024)}GB"
107106
gpu_metrics = ""
108107
gpus_detected_num = _get_metric_value(job_metrics, "gpus_detected_num")
109108
if gpus_detected_num is not None:
@@ -113,13 +112,14 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
113112
if gpu_memory_usage is not None:
114113
if i != 0:
115114
gpu_metrics += "\n"
116-
gpu_metrics += f"#{i} {round(gpu_memory_usage / 1024 / 1024)}MB"
115+
gpu_metrics += f"gpu={i} mem={round(gpu_memory_usage / 1024 / 1024 / 1024)}GB"
117116
if resources is not None:
118-
gpu_metrics += f"/{resources.gpus[i].memory_mib}MB"
119-
gpu_metrics += f" {gpu_util_percent}% Util"
117+
gpu_metrics += f"/{round(resources.gpus[i].memory_mib / 1024)}GB"
118+
gpu_metrics += f" util={gpu_util_percent}%"
120119

121120
job_row: Dict[Union[str, int], Any] = {
122121
"NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
122+
"STATUS": job.job_submissions[-1].status.value,
123123
"CPU": cpu_usage or "-",
124124
"MEMORY": memory_usage or "-",
125125
"GPU": gpu_metrics or "-",

0 commit comments

Comments (0)