|
14 | 14 | from datetime import datetime |
15 | 15 | from pathlib import Path |
16 | 16 | from typing import Optional, List |
| 17 | +import shutil |
17 | 18 |
|
18 | 19 | import yaml |
19 | 20 | import click |
|
22 | 23 | from rich.table import Table |
23 | 24 | from rich.panel import Panel |
24 | 25 | from rich.layout import Layout |
| 26 | +import socket |
25 | 27 |
|
26 | 28 | from monitor.collectors.gpu import GPUCollector |
27 | 29 | from monitor.collectors.system import SystemCollector |
|
37 | 39 | BANNER = f""" |
38 | 40 | ╔══════════════════════════════════════════════════╗ |
39 | 41 | ║ LOCAL GPU MONITOR v{_pkg_version} ║ |
40 | | -║ Local GPU monitoring and diagnostics ║ |
| 42 | +║ Monitor and Analysis ║ |
41 | 43 | ╚══════════════════════════════════════════════════╝ |
42 | 44 | """ |
43 | 45 |
|
@@ -104,29 +106,35 @@ def load_config(config_path: Optional[str]) -> dict: |
104 | 106 |
|
105 | 107 | def collect_metrics() -> dict: |
106 | 108 | """Collect metrics from local system.""" |
107 | | - metrics = { |
108 | | - 'timestamp': datetime.now().isoformat(), |
109 | | - 'hostname': None, |
110 | | - 'gpus': [], |
111 | | - 'system': {}, |
112 | | - 'status': 'healthy' |
113 | | - } |
114 | | - |
115 | | - # System metrics |
116 | | - try: |
117 | | - sys_collector = SystemCollector() |
118 | | - metrics['system'] = sys_collector.collect() |
119 | | - metrics['hostname'] = metrics['system'].get('hostname', 'unknown') |
120 | | - except Exception as e: |
121 | | - metrics['system'] = {'error': str(e)} |
122 | | - |
| 109 | + metrics = {} |
| 110 | + |
123 | 111 | # GPU metrics |
124 | 112 | try: |
125 | 113 | gpu_collector = GPUCollector() |
126 | 114 | metrics['gpus'] = gpu_collector.collect() |
127 | 115 | except Exception as e: |
128 | 116 | metrics['gpus'] = [{'error': str(e)}] |
129 | | - |
| 117 | + |
| 118 | + # System metrics |
| 119 | + try: |
| 120 | + sys_collector = SystemCollector() |
| 121 | + sys_metrics = sys_collector.collect() |
| 122 | + metrics['system'] = sys_metrics |
| 123 | + # Ensure a hostname is present at top-level for header |
| 124 | + if 'hostname' in sys_metrics and sys_metrics['hostname']: |
| 125 | + metrics['hostname'] = sys_metrics['hostname'] |
| 126 | + except Exception as e: |
| 127 | + metrics['system'] = {'error': str(e)} |
| 128 | + |
| 129 | + # Optionally collect network metrics if collector exists |
| 130 | + try: |
| 131 | + from monitor.collectors.network import NetworkCollector |
| 132 | + net_collector = NetworkCollector() |
| 133 | + metrics['network'] = net_collector.collect() |
| 134 | + except Exception: |
| 135 | + # Network collector is optional; ignore failures |
| 136 | + pass |
| 137 | + |
130 | 138 | return metrics |
131 | 139 |
|
132 | 140 |
|
@@ -232,32 +240,175 @@ async def run_web_server(config: dict): |
232 | 240 | async def run_cli_monitor(config: dict): |
233 | 241 | storage = MetricsStorage(config['storage']['path']) |
234 | 242 | await storage.initialize() |
235 | | - |
| 243 | + |
236 | 244 | alert_engine = AlertEngine(config.get('alerts', {})) |
237 | | - |
238 | | - with Live(console=console, refresh_per_second=1) as live: |
239 | | - while True: |
240 | | - try: |
241 | | - # Collect metrics |
242 | | - metrics = collect_metrics() |
243 | | - |
244 | | - # Check alerts |
245 | | - alerts = alert_engine.check(metrics) |
246 | | - |
247 | | - # Store metrics |
248 | | - await storage.store(metrics) |
249 | | - |
250 | | - # Update dashboard |
251 | | - dashboard = create_dashboard(metrics, alerts) |
252 | | - live.update(dashboard) |
253 | | - |
254 | | - await asyncio.sleep(config['monitoring']['interval_seconds']) |
255 | | - |
256 | | - except KeyboardInterrupt: |
257 | | - break |
258 | | - except Exception as e: |
259 | | - console.print(f"[red]Error: {e}[/red]") |
260 | | - await asyncio.sleep(5) |
| 245 | + |
| 246 | + # Use a fixed-size console for the CLI dashboard so it does not expand |
| 247 | + # with the user's terminal window. Width/height can be configured via |
| 248 | + # config['cli'] with keys 'width' and 'height'. Defaults: terminal width (120 if unknown) x 18. |
| 249 | + cli_cfg = config.get('cli', {}) if isinstance(config, dict) else {} |
| 250 | + # Default to the current terminal width when a width is not configured |
| 251 | + try: |
| 252 | + term_width = shutil.get_terminal_size().columns |
| 253 | + except Exception: |
| 254 | + term_width = 120 |
| 255 | + fixed_width = int(cli_cfg.get('width', term_width)) |
| 256 | + # Use a smaller default height so the terminal dashboard is compact |
| 257 | + fixed_height = int(cli_cfg.get('height', 18)) |
| 258 | + fixed_console = Console(width=fixed_width, height=fixed_height) |
| 259 | + |
| 260 | + # Do NOT mutate module-level `console`; use `fixed_console` explicitly. |
| 261 | + try: |
| 262 | + # Build the dashboard layout once. Each loop iteration below then |
| 263 | + # replaces the renderables inside its regions with newly built |
| 264 | + # Text and Panel objects; only the Layout itself is reused between |
| 265 | + # refreshes. |
| 266 | + initial_metrics = collect_metrics() |
| 267 | + initial_alerts = alert_engine.check(initial_metrics) |
| 268 | + |
| 269 | + # Create layout |
| 270 | + dashboard = Layout() |
| 271 | + dashboard.split_column( |
| 272 | + Layout(name="header", size=3), |
| 273 | + Layout(name="main"), |
| 274 | + Layout(name="footer", size=3) |
| 275 | + ) |
| 276 | + |
| 277 | + # Text builds the markup-aware contents placed inside each panel |
| 278 | + from rich.text import Text |
| 279 | + |
| 280 | + # Helper: format GPU list as a fixed-width monospaced text grid |
| 281 | + # Use stable fixed column widths to ensure all columns are visible |
| 282 | + def _format_gpu_grid(gpus, total_width: int = None): |
| 283 | + label_w = 6 |
| 284 | + util_w = 8 |
| 285 | + mem_w = 18 |
| 286 | + temp_w = 8 |
| 287 | + power_w = 8 |
| 288 | + |
| 289 | + header = f"{ 'GPU':<{label_w}}{ 'Util':>{util_w}}{ 'Memory':>{mem_w}}{ 'Temp':>{temp_w}}{ 'Power':>{power_w}}" |
| 290 | + sep = "-" * (label_w + util_w + mem_w + temp_w + power_w) |
| 291 | + lines = [header, sep] |
| 292 | + |
| 293 | + for gpu in gpus: |
| 294 | + if 'error' in gpu: |
| 295 | + lines.append(f"ERR {str(gpu['error'])}") |
| 296 | + continue |
| 297 | + idx = f"GPU{gpu.get('index', '?')}" |
| 298 | + util = f"{gpu.get('utilization', 0)}%" |
| 299 | + mem_used = gpu.get('memory_used', 0) |
| 300 | + mem_total = gpu.get('memory_total', 1) |
| 301 | + mem = f"{mem_used/1024:.1f}/{mem_total/1024:.1f}GB" |
| 302 | + temp = f"{gpu.get('temperature', 0)}C" |
| 303 | + power = f"{gpu.get('power', 0):.0f}W" |
| 304 | + lines.append(f"{idx:<{label_w}}{util:>{util_w}}{mem:>{mem_w}}{temp:>{temp_w}}{power:>{power_w}}") |
| 305 | + |
| 306 | + return Text("\n".join(lines), no_wrap=True) |
| 307 | + |
| 308 | + # Initial GPU text grid (the width argument is passed but _format_gpu_grid does not use it) |
| 309 | + gpu_text = _format_gpu_grid(initial_metrics.get('gpus', []), fixed_width - 6) |
| 310 | + |
| 311 | + # Initialize header, system and footer text using markup-aware Text |
| 312 | + node_name = initial_metrics.get('hostname') or socket.gethostname() |
| 313 | + header_text = Text.from_markup( |
| 314 | + f"[bold cyan]LOCAL GPU MONITOR[/bold cyan] | Node: [green]{node_name}[/green] | " |
| 315 | + f"Last Update: {datetime.now().strftime('%H:%M:%S')} | Alerts: [{'red' if initial_alerts else 'green'}]{len(initial_alerts)}[/]" |
| 316 | + ) |
| 317 | + |
| 318 | + sys_info = initial_metrics.get('system', {}) |
| 319 | + system_text = Text.from_markup( |
| 320 | + f"[bold]CPU:[/bold] {sys_info.get('cpu_percent', 0):.1f}%\n" |
| 321 | + f"[bold]RAM:[/bold] {sys_info.get('memory_used_gb', 0):.1f}/{sys_info.get('memory_total_gb', 0):.1f} GB\n" |
| 322 | + f"[bold]Disk:[/bold] {sys_info.get('disk_used_gb', 0):.1f}/{sys_info.get('disk_total_gb', 0):.1f} GB" |
| 323 | + ) |
| 324 | + |
| 325 | + if initial_alerts: |
| 326 | + footer_text = Text.from_markup(" | ".join([f"[red]{a['message']}[/red]" for a in initial_alerts[:3]])) |
| 327 | + else: |
| 328 | + footer_text = Text.from_markup("[green]All systems healthy[/green]") |
| 329 | + |
| 330 | + dashboard["header"].update(Panel(header_text, style="bold")) |
| 331 | + # Main split: left=GPUs, right=(system over help) |
| 332 | + dashboard["main"].split_row(Layout(name="gpus", ratio=2), Layout(name="right", ratio=1)) |
| 333 | + # Allocate explicit sizes so Help panel has visible vertical space |
| 334 | + # Give Help a larger area so the full benchmark flags list is visible |
| 335 | + dashboard["right"].split_column(Layout(name="system", size=6), Layout(name="help", size=12)) |
| 336 | + dashboard["gpus"].update(Panel(gpu_text, title="GPU Metrics")) |
| 337 | + dashboard["right"]["system"].update(Panel(system_text, title="System")) |
| 338 | + # The "help" region holds a static Benchmark quick-reference panel |
| 339 | + # (hard-coded key options only, not generated from the CLI definitions) |
| 340 | + benchmark_text = Text.from_markup( |
| 341 | + "[bold]Benchmark (quick reference)[/bold]\n" |
| 342 | + "Run with: [cyan]python health_monitor.py benchmark -v[/cyan]\n" |
| 343 | + "[bold]Options:[/bold]\n" |
| 344 | + " [cyan]-t, --type[/cyan] : gemm | particle \n" |
| 345 | + " [cyan]-v, --visualize[/cyan] : Show simulation" |
| 346 | + ) |
| 347 | + dashboard["right"]["help"].update(Panel(benchmark_text, title="Benchmark")) |
| 348 | + dashboard["footer"].update(Panel(footer_text, title="Status")) |
| 349 | + |
| 350 | + with Live(console=fixed_console, refresh_per_second=1) as live: |
| 351 | + # Render initial frame |
| 352 | + live.update(dashboard) |
| 353 | + |
| 354 | + while True: |
| 355 | + try: |
| 356 | + # Collect metrics |
| 357 | + metrics = collect_metrics() |
| 358 | + |
| 359 | + # Check alerts |
| 360 | + alerts = alert_engine.check(metrics) |
| 361 | + |
| 362 | + # Store metrics |
| 363 | + await storage.store(metrics) |
| 364 | + |
| 365 | + # Rebuild only the inner renderables (text grid and strings) |
| 366 | + gpu_text = _format_gpu_grid(metrics.get('gpus', []), fixed_width - 6) |
| 367 | + |
| 368 | + # System Info (omit Load) |
| 369 | + sys_info = metrics.get('system', {}) |
| 370 | + system_content = ( |
| 371 | + f"[bold]CPU:[/bold] {sys_info.get('cpu_percent', 0):.1f}%\n" |
| 372 | + f"[bold]RAM:[/bold] {sys_info.get('memory_used_gb', 0):.1f}/{sys_info.get('memory_total_gb', 0):.1f} GB\n" |
| 373 | + f"[bold]Disk:[/bold] {sys_info.get('disk_used_gb', 0):.1f}/{sys_info.get('disk_total_gb', 0):.1f} GB" |
| 374 | + ) |
| 375 | + |
| 376 | + # Replace header/system/footer panels with updated Text |
| 377 | + node_name = metrics.get('hostname') or socket.gethostname() |
| 378 | + new_header = Text.from_markup( |
| 379 | + f"[bold cyan]LOCAL GPU MONITOR[/bold cyan] | Node: [green]{node_name}[/green] | " |
| 380 | + f"Last Update: {datetime.now().strftime('%H:%M:%S')} | Alerts: [{'red' if alerts else 'green'}]{len(alerts)}[/]" |
| 381 | + ) |
| 382 | + dashboard["header"].update(Panel(new_header, style="bold")) |
| 383 | + |
| 384 | + new_system = Text.from_markup(system_content) |
| 385 | + dashboard["right"]["system"].update(Panel(new_system, title="System")) |
| 386 | + |
| 387 | + if alerts: |
| 388 | + new_footer = Text.from_markup(" | ".join([f"[red]{a['message']}[/red]" for a in alerts[:3]])) |
| 389 | + else: |
| 390 | + new_footer = Text.from_markup("[green]All systems healthy[/green]") |
| 391 | + # benchmark_text is static; only its Panel wrapper is re-created each iteration |
| 392 | + dashboard["right"]["help"].update(Panel(benchmark_text, title="Benchmark")) |
| 393 | + dashboard["footer"].update(Panel(new_footer, title="Status")) |
| 394 | + |
| 395 | + # Update GPU panel (text grid) |
| 396 | + dashboard["gpus"].update(Panel(gpu_text, title="GPU Metrics")) |
| 397 | + |
| 398 | + # Hand the updated layout to the live display |
| 399 | + live.update(dashboard) |
| 400 | + |
| 401 | + await asyncio.sleep(config['monitoring']['interval_seconds']) |
| 402 | + |
| 403 | + except KeyboardInterrupt: |
| 404 | + break |
| 405 | + except Exception as e: |
| 406 | + # Print exceptions to the fixed console in-place |
| 407 | + fixed_console.print(f"[red]Error: {e}[/red]") |
| 408 | + await asyncio.sleep(5) |
| 409 | + finally: |
| 410 | + # Clean exit from CLI loop (no global console mutation to restore) |
| 411 | + pass |
261 | 412 |
|
262 | 413 |
|
263 | 414 | def _run_app(config_path, port, nodes, once, web_mode=False, cli_mode=False): |
@@ -318,10 +469,9 @@ def _ps_quote(s): |
318 | 469 | except Exception: |
319 | 470 | pass |
320 | 471 |
|
321 | | - console.print(BANNER, style="bold cyan") |
322 | | - |
323 | | - # Load configuration |
| 472 | + # Load configuration first; the banner is printed after load_config returns |
324 | 473 | cfg = load_config(config_path) |
| 474 | + console.print(BANNER, style="bold cyan") |
325 | 475 |
|
326 | 476 | # CLI port overrides config only if explicitly specified |
327 | 477 | if port is not None: |
|
0 commit comments