Skip to content

Commit 7f9632d

Browse files
committed
cli fixes
1 parent 069ebca commit 7f9632d

7 files changed

Lines changed: 255 additions & 68 deletions

File tree

README.md

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,34 @@
1010

1111
## Gallery
1212

13-
```markdown
14-
<!-- Web dashboard demo GIF -->
15-
![Web Dashboard Demo](assets/gallery/web-dashboard-demo.gif)
16-
17-
<!-- CLI mode screenshot -->
18-
![CLI Mode](assets/gallery/cli-mode.png)
19-
```
20-
13+
<details>
14+
<summary>Web Dashboard</summary>
15+
<div style="display:flex; overflow-x:auto; gap:10px; padding:12px 0; scroll-snap-type:x mandatory; -webkit-overflow-scrolling:touch;">
16+
<!-- Use first image aspect ratio 1624x675 for slide frame; images fit inside using object-fit:contain -->
17+
<div style="flex:0 0 100%; scroll-snap-align:center; aspect-ratio:1624/675; display:flex; align-items:center; justify-content:center;">
18+
<img src="monitor/api/static/web1.png" style="width:100%; height:100%; object-fit:contain;" />
19+
</div>
20+
<div style="flex:0 0 100%; scroll-snap-align:center; aspect-ratio:1624/675; display:flex; align-items:center; justify-content:center;">
21+
<img src="monitor/api/static/web2.png" style="width:100%; height:100%; object-fit:contain;" />
22+
</div>
23+
<div style="flex:0 0 100%; scroll-snap-align:center; aspect-ratio:1624/675; display:flex; align-items:center; justify-content:center;">
24+
<img src="monitor/api/static/web3.png" style="width:100%; height:100%; object-fit:contain;" />
25+
</div>
26+
<div style="flex:0 0 100%; scroll-snap-align:center; aspect-ratio:1624/675; display:flex; align-items:center; justify-content:center;">
27+
<img src="monitor/api/static/web4.png" style="width:100%; height:100%; object-fit:contain;" />
28+
</div>
29+
</div>
30+
31+
</details>
32+
<details>
33+
<summary>CLI</summary>
34+
<div style="display:flex; overflow-x:auto; gap:10px; padding:12px 0; scroll-snap-type:x mandatory; -webkit-overflow-scrolling:touch;">
35+
36+
<div style="flex:0 0 100%; scroll-snap-align:center; aspect-ratio:1624/675; display:flex; align-items:center; justify-content:center;">
37+
<img src="monitor/api/static/web1.png" style="width:100%; height:100%; object-fit:contain;" />
38+
</div>
39+
</div>
40+
</details>
2141
---
2242

2343
## Overview

health_monitor.py

Lines changed: 196 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from datetime import datetime
1515
from pathlib import Path
1616
from typing import Optional, List
17+
import shutil
1718

1819
import yaml
1920
import click
@@ -22,6 +23,7 @@
2223
from rich.table import Table
2324
from rich.panel import Panel
2425
from rich.layout import Layout
26+
import socket
2527

2628
from monitor.collectors.gpu import GPUCollector
2729
from monitor.collectors.system import SystemCollector
@@ -37,7 +39,7 @@
3739
BANNER = f"""
3840
╔══════════════════════════════════════════════════╗
3941
║ LOCAL GPU MONITOR v{_pkg_version}
40-
Local GPU monitoring and diagnostics
42+
Monitor and Analysis
4143
╚══════════════════════════════════════════════════╝
4244
"""
4345

@@ -104,29 +106,35 @@ def load_config(config_path: Optional[str]) -> dict:
104106

105107
def collect_metrics() -> dict:
106108
"""Collect metrics from local system."""
107-
metrics = {
108-
'timestamp': datetime.now().isoformat(),
109-
'hostname': None,
110-
'gpus': [],
111-
'system': {},
112-
'status': 'healthy'
113-
}
114-
115-
# System metrics
116-
try:
117-
sys_collector = SystemCollector()
118-
metrics['system'] = sys_collector.collect()
119-
metrics['hostname'] = metrics['system'].get('hostname', 'unknown')
120-
except Exception as e:
121-
metrics['system'] = {'error': str(e)}
122-
109+
metrics = {}
110+
123111
# GPU metrics
124112
try:
125113
gpu_collector = GPUCollector()
126114
metrics['gpus'] = gpu_collector.collect()
127115
except Exception as e:
128116
metrics['gpus'] = [{'error': str(e)}]
129-
117+
118+
# System metrics
119+
try:
120+
sys_collector = SystemCollector()
121+
sys_metrics = sys_collector.collect()
122+
metrics['system'] = sys_metrics
123+
# Ensure a hostname is present at top-level for header
124+
if 'hostname' in sys_metrics and sys_metrics['hostname']:
125+
metrics['hostname'] = sys_metrics['hostname']
126+
except Exception as e:
127+
metrics['system'] = {'error': str(e)}
128+
129+
# Optionally collect network metrics if collector exists
130+
try:
131+
from monitor.collectors.network import NetworkCollector
132+
net_collector = NetworkCollector()
133+
metrics['network'] = net_collector.collect()
134+
except Exception:
135+
# Network collector is optional; ignore failures
136+
pass
137+
130138
return metrics
131139

132140

@@ -232,32 +240,175 @@ async def run_web_server(config: dict):
232240
async def run_cli_monitor(config: dict):
233241
storage = MetricsStorage(config['storage']['path'])
234242
await storage.initialize()
235-
243+
236244
alert_engine = AlertEngine(config.get('alerts', {}))
237-
238-
with Live(console=console, refresh_per_second=1) as live:
239-
while True:
240-
try:
241-
# Collect metrics
242-
metrics = collect_metrics()
243-
244-
# Check alerts
245-
alerts = alert_engine.check(metrics)
246-
247-
# Store metrics
248-
await storage.store(metrics)
249-
250-
# Update dashboard
251-
dashboard = create_dashboard(metrics, alerts)
252-
live.update(dashboard)
253-
254-
await asyncio.sleep(config['monitoring']['interval_seconds'])
255-
256-
except KeyboardInterrupt:
257-
break
258-
except Exception as e:
259-
console.print(f"[red]Error: {e}[/red]")
260-
await asyncio.sleep(5)
245+
246+
# Use a fixed-size console for the CLI dashboard so it does not expand
247+
# with the user's terminal window. Width/height can be configured via
248+
# config['cli'] with keys 'width' and 'height'. Defaults: current terminal width (120 if it cannot be determined) x 18 rows.
249+
cli_cfg = config.get('cli', {}) if isinstance(config, dict) else {}
250+
# Default to the current terminal width when a width is not configured
251+
try:
252+
term_width = shutil.get_terminal_size().columns
253+
except Exception:
254+
term_width = 120
255+
fixed_width = int(cli_cfg.get('width', term_width))
256+
# Use a smaller default height so the terminal dashboard is compact
257+
fixed_height = int(cli_cfg.get('height', 18))
258+
fixed_console = Console(width=fixed_width, height=fixed_height)
259+
260+
# Do NOT mutate module-level `console`; use `fixed_console` explicitly.
261+
try:
262+
# Build an initial dashboard layout once and then update only the
263+
# inner renderables (Text and Table). We create mutable Text
264+
# objects for header/system/footer so updating their contents does
265+
# not recreate the top-level panels, minimizing redraw flicker.
266+
initial_metrics = collect_metrics()
267+
initial_alerts = alert_engine.check(initial_metrics)
268+
269+
# Create layout
270+
dashboard = Layout()
271+
dashboard.split_column(
272+
Layout(name="header", size=3),
273+
Layout(name="main"),
274+
Layout(name="footer", size=3)
275+
)
276+
277+
# Mutable text objects for in-place updates (use markup-aware Text)
278+
from rich.text import Text
279+
280+
# Helper: format GPU list as a fixed-width monospaced text grid
281+
# Use stable fixed column widths to ensure all columns are visible
282+
def _format_gpu_grid(gpus, total_width: int = None):
283+
label_w = 6
284+
util_w = 8
285+
mem_w = 18
286+
temp_w = 8
287+
power_w = 8
288+
289+
header = f"{ 'GPU':<{label_w}}{ 'Util':>{util_w}}{ 'Memory':>{mem_w}}{ 'Temp':>{temp_w}}{ 'Power':>{power_w}}"
290+
sep = "-" * (label_w + util_w + mem_w + temp_w + power_w)
291+
lines = [header, sep]
292+
293+
for gpu in gpus:
294+
if 'error' in gpu:
295+
lines.append(f"ERR {str(gpu['error'])}")
296+
continue
297+
idx = f"GPU{gpu.get('index', '?')}"
298+
util = f"{gpu.get('utilization', 0)}%"
299+
mem_used = gpu.get('memory_used', 0)
300+
mem_total = gpu.get('memory_total', 1)
301+
mem = f"{mem_used/1024:.1f}/{mem_total/1024:.1f}GB"
302+
temp = f"{gpu.get('temperature', 0)}C"
303+
power = f"{gpu.get('power', 0):.0f}W"
304+
lines.append(f"{idx:<{label_w}}{util:>{util_w}}{mem:>{mem_w}}{temp:>{temp_w}}{power:>{power_w}}")
305+
306+
return Text("\n".join(lines), no_wrap=True)
307+
308+
# Initial GPU text grid (use dashboard width)
309+
gpu_text = _format_gpu_grid(initial_metrics.get('gpus', []), fixed_width - 6)
310+
311+
# Initialize header, system and footer text using markup-aware Text
312+
node_name = initial_metrics.get('hostname') or socket.gethostname()
313+
header_text = Text.from_markup(
314+
f"[bold cyan]LOCAL GPU MONITOR[/bold cyan] | Node: [green]{node_name}[/green] | "
315+
f"Last Update: {datetime.now().strftime('%H:%M:%S')} | Alerts: [{'red' if initial_alerts else 'green'}]{len(initial_alerts)}[/]"
316+
)
317+
318+
sys_info = initial_metrics.get('system', {})
319+
system_text = Text.from_markup(
320+
f"[bold]CPU:[/bold] {sys_info.get('cpu_percent', 0):.1f}%\n"
321+
f"[bold]RAM:[/bold] {sys_info.get('memory_used_gb', 0):.1f}/{sys_info.get('memory_total_gb', 0):.1f} GB\n"
322+
f"[bold]Disk:[/bold] {sys_info.get('disk_used_gb', 0):.1f}/{sys_info.get('disk_total_gb', 0):.1f} GB"
323+
)
324+
325+
if initial_alerts:
326+
footer_text = Text.from_markup(" | ".join([f"[red]{a['message']}[/red]" for a in initial_alerts[:3]]))
327+
else:
328+
footer_text = Text.from_markup("[green]All systems healthy[/green]")
329+
330+
dashboard["header"].update(Panel(header_text, style="bold"))
331+
# Main split: left=GPUs, right=(system over help)
332+
dashboard["main"].split_row(Layout(name="gpus", ratio=2), Layout(name="right", ratio=1))
333+
# Allocate explicit sizes so Help panel has visible vertical space
334+
# Give Help a larger area so the full benchmark flags list is visible
335+
dashboard["right"].split_column(Layout(name="system", size=6), Layout(name="help", size=12))
336+
dashboard["gpus"].update(Panel(gpu_text, title="GPU Metrics"))
337+
dashboard["right"]["system"].update(Panel(system_text, title="System"))
338+
# Help panel with available CLI commands and flags (populated from repo)
339+
# Replace Help panel with a focused Benchmark panel (key options only)
340+
benchmark_text = Text.from_markup(
341+
"[bold]Benchmark (quick reference)[/bold]\n"
342+
"Run with: [cyan]python health_monitor.py benchmark -v[/cyan]\n"
343+
"[bold]Options:[/bold]\n"
344+
" [cyan]-t, --type[/cyan] : gemm | particle \n"
345+
" [cyan]-v, --visualize[/cyan] : Show simulation"
346+
)
347+
dashboard["right"]["help"].update(Panel(benchmark_text, title="Benchmark"))
348+
dashboard["footer"].update(Panel(footer_text, title="Status"))
349+
350+
with Live(console=fixed_console, refresh_per_second=1) as live:
351+
# Render initial frame
352+
live.update(dashboard)
353+
354+
while True:
355+
try:
356+
# Collect metrics
357+
metrics = collect_metrics()
358+
359+
# Check alerts
360+
alerts = alert_engine.check(metrics)
361+
362+
# Store metrics
363+
await storage.store(metrics)
364+
365+
# Rebuild only the inner renderables (text grid and strings)
366+
gpu_text = _format_gpu_grid(metrics.get('gpus', []), fixed_width - 6)
367+
368+
# System Info (omit Load)
369+
sys_info = metrics.get('system', {})
370+
system_content = (
371+
f"[bold]CPU:[/bold] {sys_info.get('cpu_percent', 0):.1f}%\n"
372+
f"[bold]RAM:[/bold] {sys_info.get('memory_used_gb', 0):.1f}/{sys_info.get('memory_total_gb', 0):.1f} GB\n"
373+
f"[bold]Disk:[/bold] {sys_info.get('disk_used_gb', 0):.1f}/{sys_info.get('disk_total_gb', 0):.1f} GB"
374+
)
375+
376+
# Replace header/system/footer panels with updated Text
377+
node_name = metrics.get('hostname') or socket.gethostname()
378+
new_header = Text.from_markup(
379+
f"[bold cyan]LOCAL GPU MONITOR[/bold cyan] | Node: [green]{node_name}[/green] | "
380+
f"Last Update: {datetime.now().strftime('%H:%M:%S')} | Alerts: [{'red' if alerts else 'green'}]{len(alerts)}[/]"
381+
)
382+
dashboard["header"].update(Panel(new_header, style="bold"))
383+
384+
new_system = Text.from_markup(system_content)
385+
dashboard["right"]["system"].update(Panel(new_system, title="System"))
386+
387+
if alerts:
388+
new_footer = Text.from_markup(" | ".join([f"[red]{a['message']}[/red]" for a in alerts[:3]]))
389+
else:
390+
new_footer = Text.from_markup("[green]All systems healthy[/green]")
391+
# benchmark_text itself is static and reused; only its Panel wrapper is re-created each iteration
392+
dashboard["right"]["help"].update(Panel(benchmark_text, title="Benchmark"))
393+
dashboard["footer"].update(Panel(new_footer, title="Status"))
394+
395+
# Update GPU panel (text grid)
396+
dashboard["gpus"].update(Panel(gpu_text, title="GPU Metrics"))
397+
398+
# Push diffs to the live display
399+
live.update(dashboard)
400+
401+
await asyncio.sleep(config['monitoring']['interval_seconds'])
402+
403+
except KeyboardInterrupt:
404+
break
405+
except Exception as e:
406+
# Print exceptions to the fixed console in-place
407+
fixed_console.print(f"[red]Error: {e}[/red]")
408+
await asyncio.sleep(5)
409+
finally:
410+
# Clean exit from CLI loop (no global console mutation to restore)
411+
pass
261412

262413

263414
def _run_app(config_path, port, nodes, once, web_mode=False, cli_mode=False):
@@ -318,10 +469,9 @@ def _ps_quote(s):
318469
except Exception:
319470
pass
320471

321-
console.print(BANNER, style="bold cyan")
322-
323-
# Load configuration
472+
# Load configuration before showing the banner (load_config prints its own output first)
324473
cfg = load_config(config_path)
474+
console.print(BANNER, style="bold cyan")
325475

326476
# CLI port overrides config only if explicitly specified
327477
if port is not None:

monitor/api/static/web1.png

57.9 KB
Loading

monitor/api/static/web2.png

162 KB
Loading

monitor/api/static/web3.png

42.7 KB
Loading

monitor/api/static/web4.png

78.9 KB
Loading

monitor/cli/benchmark_cli.py

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -103,22 +103,39 @@ def benchmark_cli(bench_type, mode, duration, matrix_size, particles, temp_limit
103103
) as progress:
104104
task = progress.add_task(f"[cyan]{bench_type.upper()} Benchmark", total=100)
105105

106-
bench_thread = threading.Thread(target=lambda: bench.start(config, visualize=visualize))
106+
# Run benchmark in a daemon thread so interpreter shutdown won't hang
107+
bench_thread = threading.Thread(target=lambda: bench.start(config, visualize=visualize), daemon=True)
107108
bench_thread.start()
108109

109-
while bench.running:
110-
status = bench.get_status()
111-
fps = status.get('fps', 0.0)
112-
gpu = status.get('gpu_util', 0)
113-
workload = status.get('workload_type', bench_type)
114-
iters = status['iterations']
115-
116-
desc = f"[cyan]FPS:{fps:5.1f} GPU:{gpu:3.0f}% {workload} - {iters} iterations"
117-
progress.update(task, completed=status['progress'], description=desc)
118-
time.sleep(0.5)
119-
120-
bench_thread.join()
121-
progress.update(task, completed=100)
110+
try:
111+
while bench.running:
112+
status = bench.get_status()
113+
fps = status.get('fps', 0.0)
114+
gpu = status.get('gpu_util', 0)
115+
workload = status.get('workload_type', bench_type)
116+
iters = status.get('iterations', 0)
117+
118+
desc = f"[cyan]FPS:{fps:5.1f} GPU:{gpu:3.0f}% {workload} - {iters} iterations"
119+
progress.update(task, completed=status.get('progress', 0), description=desc)
120+
time.sleep(0.5)
121+
except KeyboardInterrupt:
122+
console.print('\n[yellow]Keyboard interrupt received, stopping benchmark...[/yellow]')
123+
try:
124+
# Attempt graceful stop
125+
if hasattr(bench, 'stop') and callable(bench.stop):
126+
bench.stop()
127+
else:
128+
# Fallback: flip running flag if present
129+
setattr(bench, 'running', False)
130+
except Exception:
131+
pass
132+
finally:
133+
# Ensure thread is not left blocking; join briefly
134+
try:
135+
bench_thread.join(timeout=2)
136+
except Exception:
137+
pass
138+
progress.update(task, completed=100)
122139

123140
results = bench.get_results()
124141

0 commit comments

Comments
 (0)