Skip to content

Commit 421b545

Browse files
Gregg Cochran and Copilot committed
Fix metaswarm metric truthfulness
Distinguish running, seen, completed, failed, stale, and recent-launch sub-agent metrics so Agent Pulse does not show stale zeroes during Agent Orchestra runs. Compute orchestrator commentary from live commander telemetry before rendering dashboard insights. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 4fbc47f commit 421b545

1 file changed

Lines changed: 102 additions & 29 deletions

File tree

agent_pulse.py

Lines changed: 102 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -411,6 +411,19 @@ class MetaswarmRun:
411411
commander_target: int = 0
412412

413413

414+
def metaswarm_commander_is_active(
    commander: MetaswarmCommander,
    *,
    heartbeat_timeout_s: float = 90,
) -> bool:
    """Return whether a metaswarm commander should be counted as active.

    The checks are applied in this order:

    1. A status (compared case-insensitively) that is in
       ``TERMINAL_COMMANDER_STATUSES`` is never active.
    2. ``pid_status == "run"`` is active; ``pid_status == "dead"`` is not.
    3. For any other ``pid_status`` the commander is active only when its
       status is ``"running"`` or ``"starting"`` and its heartbeat age is
       either missing (``None``) or below ``heartbeat_timeout_s``.

    Args:
        commander: Commander record; its ``status``, ``pid_status`` and
            ``heartbeat_age_s`` attributes are read.
        heartbeat_timeout_s: Heartbeat age (same unit as
            ``heartbeat_age_s``, presumably seconds) at or above which a
            commander with an inconclusive ``pid_status`` is no longer
            treated as active. Defaults to 90, the previously hard-coded
            value, so existing callers behave exactly as before.

    Returns:
        ``True`` when the commander is considered active, else ``False``.
    """
    status = (commander.status or "").lower()
    if status in TERMINAL_COMMANDER_STATUSES:
        return False
    if commander.pid_status == "run":
        return True
    if commander.pid_status == "dead":
        return False
    # pid_status is inconclusive here: fall back to the reported status and
    # require a fresh (or not-yet-reported) heartbeat.
    return status in {"running", "starting"} and (
        commander.heartbeat_age_s is None
        or commander.heartbeat_age_s < heartbeat_timeout_s
    )
425+
426+
414427
@dataclass
415428
class LiveAgent:
416429
source: str
@@ -1509,18 +1522,60 @@ def metric_int(*values: object, default: int = 0) -> int:
15091522

15101523
if commanders:
15111524
self.store.insert_agent_events(commander_launch_events)
1525+
commander_target = metric_int(
1526+
state.get("commander_count"),
1527+
state.get("total_tasks"),
1528+
default=len(commanders),
1529+
)
1530+
active_count = sum(
1531+
1 for commander in commanders if metaswarm_commander_is_active(commander)
1532+
)
1533+
child_running = sum(commander.child_agents_running for commander in commanders)
1534+
child_completed = sum(commander.child_agents_completed for commander in commanders)
1535+
child_failed = sum(commander.child_agents_failed for commander in commanders)
1536+
child_stale = sum(commander.child_agents_stale for commander in commanders)
1537+
child_seen = sum(commander.child_agents_seen for commander in commanders)
1538+
queue_count = len(list((run_dir / "queue").glob("*.json"))) if (run_dir / "queue").exists() else 0
1539+
claimed_count = len(list((run_dir / "claimed").glob("*.json"))) if (run_dir / "claimed").exists() else 0
1540+
result_count = len(list((run_dir / "results").glob("commander-*.json"))) if (run_dir / "results").exists() else 0
1541+
collab_counts = {}
1542+
for ledger_name in ("proposals", "reviews", "improvements", "consensus", "broadcasts"):
1543+
ledger_path = run_dir / "collab" / f"{ledger_name}.jsonl"
1544+
if ledger_path.exists():
1545+
try:
1546+
collab_counts[ledger_name] = sum(
1547+
1 for line in ledger_path.read_text(errors="replace").splitlines() if line.strip()
1548+
)
1549+
except OSError:
1550+
collab_counts[ledger_name] = 0
1551+
else:
1552+
collab_counts[ledger_name] = 0
1553+
computed_commentary = [
1554+
(
1555+
f"cmd {active_count}/{commander_target} active · "
1556+
f"sub-agents {child_running} running / {child_completed} done / "
1557+
f"{child_failed} failed / {child_stale} stale / {child_seen} seen · "
1558+
f"q {queue_count} · claimed {claimed_count} · results {result_count}/{commander_target}"
1559+
),
1560+
(
1561+
"collab "
1562+
f"p{collab_counts['proposals']} r{collab_counts['reviews']} "
1563+
f"i{collab_counts['improvements']} c{collab_counts['consensus']} "
1564+
f"b{collab_counts['broadcasts']}"
1565+
),
1566+
]
1567+
raw_commentary = [
1568+
line for line in commentary
1569+
if not line.startswith("cmd ") and not line.startswith("collab ")
1570+
]
15121571
runs.append(
15131572
MetaswarmRun(
15141573
run_id=run_id,
15151574
repo_path=repo_path,
15161575
profile=profile,
15171576
commanders=commanders,
1518-
commentary=commentary,
1519-
commander_target=metric_int(
1520-
state.get("commander_count"),
1521-
state.get("total_tasks"),
1522-
default=len(commanders),
1523-
),
1577+
commentary=[*computed_commentary, *raw_commentary][:5],
1578+
commander_target=commander_target,
15241579
)
15251580
)
15261581

@@ -1591,17 +1646,7 @@ def live_agents(self, metaswarm_runs: Optional[List[MetaswarmRun]] = None) -> Li
15911646
for commander in run.commanders:
15921647
commander_agent_id = f"{run.run_id}/{commander.commander_id}"
15931648
commander_status_text = (commander.status or "").lower()
1594-
commander_active = (
1595-
commander_status_text not in TERMINAL_COMMANDER_STATUSES
1596-
and commander.pid_status == "run"
1597-
or (
1598-
commander_status_text not in TERMINAL_COMMANDER_STATUSES
1599-
and
1600-
commander.pid_status == "unknown"
1601-
and commander_status_text in {"running", "starting"}
1602-
and (commander.heartbeat_age_s is None or commander.heartbeat_age_s < 90)
1603-
)
1604-
)
1649+
commander_active = metaswarm_commander_is_active(commander)
16051650
if commander_agent_id not in seen_stampede_agents:
16061651
if commander_status_text in TERMINAL_COMMANDER_STATUSES:
16071652
commander_status = terminal_commander_status_label(commander_status_text)
@@ -2098,6 +2143,8 @@ class PulseMetrics:
20982143
metaswarm_children_running: int = 0
20992144
metaswarm_children_last5m: int = 0
21002145
metaswarm_children_seen: int = 0
2146+
metaswarm_children_completed: int = 0
2147+
metaswarm_children_failed: int = 0
21012148
metaswarm_children_stale: int = 0
21022149
live_agents: List[LiveAgent] = None
21032150
commander_alerts: List[Dict[str, str]] = None
@@ -2346,14 +2393,7 @@ def poll(self) -> PulseMetrics:
23462393
# remain the preferred sources for long-lived runs.
23472394
running_subagents_from_events = self.store.running_subagents_since(ts - LIVE_EVENT_WINDOW_S)
23482395
def commander_is_active(c: MetaswarmCommander) -> bool:
2349-
status = (c.status or "").lower()
2350-
if status in TERMINAL_COMMANDER_STATUSES:
2351-
return False
2352-
if c.pid_status == "run":
2353-
return True
2354-
if c.pid_status == "dead":
2355-
return False
2356-
return c.status in {"running", "starting"} and (c.heartbeat_age_s is None or c.heartbeat_age_s < 90)
2396+
return metaswarm_commander_is_active(c)
23572397

23582398
active_metaswarm_runs = [
23592399
run
@@ -2383,6 +2423,16 @@ def commander_running_squad_leads(c: MetaswarmCommander) -> int:
23832423
for run in display_metaswarm_runs
23842424
for c in run.commanders
23852425
)
2426+
metaswarm_children_completed = sum(
2427+
c.child_agents_completed
2428+
for run in display_metaswarm_runs
2429+
for c in run.commanders
2430+
)
2431+
metaswarm_children_failed = sum(
2432+
c.child_agents_failed
2433+
for run in display_metaswarm_runs
2434+
for c in run.commanders
2435+
)
23862436
metaswarm_children_stale = sum(
23872437
c.child_agents_stale
23882438
for run in display_metaswarm_runs
@@ -2494,7 +2544,11 @@ def live_rank(a: LiveAgent) -> Tuple[int, int, str, str]:
24942544
if display_metaswarm_runs
24952545
else live_level_counts.get("workers", 0)
24962546
)
2497-
subagents_last5m = agent_events_last5m
2547+
subagents_last5m = (
2548+
metaswarm_children_last5m
2549+
if display_metaswarm_runs
2550+
else agent_events_last5m
2551+
)
24982552
launch_level_counts_5m = self.store.agent_events_by_level_since(ts - 5 * 60)
24992553

25002554
# Feature 1: Success rate
@@ -2573,6 +2627,8 @@ def live_rank(a: LiveAgent) -> Tuple[int, int, str, str]:
25732627
metaswarm_children_running=metaswarm_children_running,
25742628
metaswarm_children_last5m=metaswarm_children_last5m,
25752629
metaswarm_children_seen=metaswarm_children_seen,
2630+
metaswarm_children_completed=metaswarm_children_completed,
2631+
metaswarm_children_failed=metaswarm_children_failed,
25762632
metaswarm_children_stale=metaswarm_children_stale,
25772633
live_agents=live_agents,
25782634
commander_alerts=commander_alerts,
@@ -2634,11 +2690,13 @@ def render(self) -> Panel:
26342690
(str(levels.get("other", 0)), "bold #8D99AE"),
26352691
(" · seen ", "#8D99AE"),
26362692
(str(m.metaswarm_children_seen), "bold #00F5D4"),
2693+
(" · done ", "#8D99AE"),
2694+
(str(m.metaswarm_children_completed), "bold #7CFF6B"),
26372695
(" · stale ", "#8D99AE"),
26382696
(str(m.metaswarm_children_stale), "bold #FFD166"),
26392697
)
26402698
metrics_line_3 = Text.assemble(
2641-
("launch events 5m ", "#8D99AE"),
2699+
("sub-agent launches 5m ", "#8D99AE"),
26422700
(str(m.subagents_last5m), "bold #00F5D4"),
26432701
(" · velocity ", "#8D99AE"),
26442702
(f"{m.velocity}/hr", "bold #FFD166"),
@@ -2732,10 +2790,15 @@ def bar(val: int, max_val: int = 20, width: int = 16) -> Text:
27322790
bar(levels.get("squad_leads", 0)),
27332791
)
27342792
t.add_row(
2735-
Text("Sub-agents :", style="bold white"),
2793+
Text("Sub-agents running:", style="bold white"),
27362794
Text(str(levels.get("workers", 0)), style="bold #7CFF6B"),
27372795
bar(levels.get("workers", 0)),
27382796
)
2797+
t.add_row(
2798+
Text("Sub-agents seen :", style="bold white"),
2799+
Text(str(m.metaswarm_children_seen), style="bold #00F5D4"),
2800+
Text(f"done {m.metaswarm_children_completed} · stale {m.metaswarm_children_stale}", style="#8D99AE"),
2801+
)
27392802
t.add_row(
27402803
Text("Reviewers :", style="bold white"),
27412804
Text(str(levels.get("reviewers", 0)), style="bold #FF4D6D"),
@@ -2747,7 +2810,7 @@ def bar(val: int, max_val: int = 20, width: int = 16) -> Text:
27472810
bar(levels.get("other", 0)),
27482811
)
27492812
t.add_row(
2750-
Text("Launch events 5m:", style="bold white"),
2813+
Text("Sub-agent launches 5m:", style="bold white"),
27512814
Text(str(m.subagents_last5m), style="bold #00F5D4"),
27522815
bar(m.subagents_last5m),
27532816
)
@@ -3109,6 +3172,8 @@ def render(self) -> Panel:
31093172
f"Stampede ledgers: {m.metaswarm_active_commanders}/{commander_total} commanders · "
31103173
f"{m.metaswarm_children_seen} sub-agents seen · "
31113174
f"{m.metaswarm_children_running} running · "
3175+
f"{m.metaswarm_children_completed} done · "
3176+
f"{m.metaswarm_children_failed} failed · "
31123177
f"{m.metaswarm_children_stale} stale · "
31133178
f"{m.metaswarm_children_last5m} sub-agent launches in 5m",
31143179
style="#8D99AE",
@@ -3471,10 +3536,14 @@ def action_export_snapshot(self) -> None:
34713536
"active_commanders": m.metaswarm_active_commanders,
34723537
"sub_agents_seen": m.metaswarm_children_seen,
34733538
"sub_agents_running": m.metaswarm_children_running,
3539+
"sub_agents_completed": m.metaswarm_children_completed,
3540+
"sub_agents_failed": m.metaswarm_children_failed,
34743541
"sub_agents_stale": m.metaswarm_children_stale,
34753542
"sub_agents_last5m": m.metaswarm_children_last5m,
34763543
"children_seen": m.metaswarm_children_seen,
34773544
"children_running": m.metaswarm_children_running,
3545+
"children_completed": m.metaswarm_children_completed,
3546+
"children_failed": m.metaswarm_children_failed,
34783547
"children_stale": m.metaswarm_children_stale,
34793548
"children_last5m": m.metaswarm_children_last5m,
34803549
"runs": [dataclasses.asdict(r) for r in m.metaswarm_runs],
@@ -3629,10 +3698,14 @@ def _mode_export() -> None:
36293698
"total_commanders": m.metaswarm_total_commanders,
36303699
"sub_agents_seen": m.metaswarm_children_seen,
36313700
"sub_agents_running": m.metaswarm_children_running,
3701+
"sub_agents_completed": m.metaswarm_children_completed,
3702+
"sub_agents_failed": m.metaswarm_children_failed,
36323703
"sub_agents_stale": m.metaswarm_children_stale,
36333704
"sub_agents_last5m": m.metaswarm_children_last5m,
36343705
"children_seen": m.metaswarm_children_seen,
36353706
"children_running": m.metaswarm_children_running,
3707+
"children_completed": m.metaswarm_children_completed,
3708+
"children_failed": m.metaswarm_children_failed,
36363709
"children_stale": m.metaswarm_children_stale,
36373710
"children_last5m": m.metaswarm_children_last5m,
36383711
"runs": [dataclasses.asdict(r) for r in m.metaswarm_runs],

0 commit comments

Comments
 (0)