Skip to content

Commit 0e9c532

Browse files
tbitcsoz-agent
andcommitted
feat: P2-P5 — HF sync, eval framework, spawner/teams, instinct tests
P2: agent/hf_sync.py — HuggingFace leaderboard sync with model score persistence, staleness detection, and benchmark extraction from HF API. P3: eval/ package — Eval-Driven Development framework with EvalCase, EvalResult, EvalSuite, EvalReport types; built-in core suite (5 eval cases: code_gen, architecture_review, test_gen, patent_claim, intent_classify); stub runner and markdown report generator. P4: agent/spawner.py — SubAgentSpawner for isolated agent workers with role-based tool subsets. agent/teams.py — 4 predefined team compositions (pair-review, full-stack, ip-analysis, spec-draft). P5: Instinct system already existed — added 5 tests for InstinctStore (add, remove, persistence, record_accepted, export_markdown). 26 new tests in test_new_modules.py (all passing). Total: 570 tests. Co-Authored-By: Oz <oz-agent@warp.dev>
1 parent c797a27 commit 0e9c532

8 files changed

Lines changed: 943 additions & 3 deletions

File tree

src/specsmith/agent/hf_sync.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
3+
"""HuggingFace Open LLM Leaderboard sync for model intelligence (REQ-223).
4+
5+
Fetches benchmark scores from the HuggingFace API and populates
6+
`.specsmith/model_scores.json` so that `rank_models_for_role()` uses
7+
real data instead of hardcoded baselines.
8+
9+
Usage:
10+
from specsmith.agent.hf_sync import sync_scores
11+
results = sync_scores() # returns dict of model_id -> {benchmark: score}
12+
"""
13+
14+
from __future__ import annotations
15+
16+
import json
17+
import time
18+
import urllib.request
19+
from pathlib import Path
20+
from typing import Any
21+
22+
# Base URL of the Hugging Face Hub REST API. Model metadata is requested
# from "<HF_API_BASE>/models/<model_id>".
HF_API_BASE = "https://huggingface.co/api"

# Tracked model ids without an "org/" prefix (hosted/proprietary models).
_PROPRIETARY_MODELS: list[str] = [
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4o",
    "gpt-4o-mini",
    "claude-sonnet-4-20250514",
    "claude-3.5-sonnet",
    "gemini-2.5-pro",
    "gemini-2.5-flash",
]

# Tracked model ids in Hugging Face "org/name" form.
_HF_HOSTED_MODELS: list[str] = [
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    "Qwen/Qwen2.5-Coder-7B-Instruct",
    "mistralai/Mistral-Large-Latest",
    "meta-llama/Llama-3.3-70B-Instruct",
    "deepseek-ai/DeepSeek-V3",
]

# Models synced by default (subset of popular models with known benchmark data).
TRACKED_MODELS: list[str] = [*_PROPRIETARY_MODELS, *_HF_HOSTED_MODELS]

# File name of the scores cache; it is stored inside the project's
# ".specsmith" directory.
SCORES_FILENAME = "model_scores.json"
44+
45+
46+
def _scores_path(project_dir: str | Path = ".") -> Path:
    """Return the absolute path of the scores file for *project_dir*.

    The file lives at ``<project_dir>/.specsmith/<SCORES_FILENAME>``; the
    project directory is resolved to an absolute path first.
    """
    project_root = Path(project_dir).resolve()
    return project_root.joinpath(".specsmith", SCORES_FILENAME)
48+
49+
50+
def load_cached_scores(project_dir: str | Path = ".") -> dict[str, Any]:
    """Load the cached score document from disk.

    Args:
        project_dir: Project root whose ``.specsmith`` directory holds the
            scores file.

    Returns:
        The decoded top-level JSON object. An empty dict is returned when the
        file is missing, cannot be read, is not valid JSON, or decodes to
        something other than a JSON object.
    """
    path = _scores_path(project_dir)
    try:
        # A missing file or a directory at this path raises an OSError
        # subclass, so no separate existence check is needed.
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return {}
    # Valid JSON is not necessarily an object (e.g. a list or a bare string);
    # callers rely on dict methods, so treat any other shape as "no cache".
    return payload if isinstance(payload, dict) else {}
59+
60+
61+
def save_scores(scores: dict[str, Any], project_dir: str | Path = ".") -> None:
    """Write *scores* to the project's scores file.

    The file contains a JSON object with two keys: ``synced_at`` (the current
    UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``) and ``models`` (the given
    *scores* mapping). The parent directory is created when it is missing.
    """
    target = _scores_path(project_dir)
    target.parent.mkdir(parents=True, exist_ok=True)

    timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    document = {"synced_at": timestamp, "models": scores}
    serialized = json.dumps(document, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")
70+
71+
72+
def fetch_hf_model_info(model_id: str, timeout: int = 10) -> dict[str, Any]:
    """Fetch model metadata from the HuggingFace API.

    Args:
        model_id: Model identifier, appended to ``<HF_API_BASE>/models/``.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        The decoded JSON object from the response. An empty dict is returned
        on any failure (request error, undecodable body) and when the body is
        valid JSON but not an object.
    """
    from urllib.parse import quote

    try:
        # Keep "/" literal so "org/name" ids stay two path segments, but
        # escape everything else so an odd id cannot alter the URL structure.
        url = f"{HF_API_BASE}/models/{quote(model_id, safe='/')}"
        req = urllib.request.Request(url, headers={"Accept": "application/json"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:  # noqa: S310
            payload = json.loads(resp.read())
    except Exception:  # noqa: BLE001
        # Deliberately best-effort: any failure means "no data available".
        return {}
    # Callers use dict methods on the result; reject any other JSON shape.
    return payload if isinstance(payload, dict) else {}
84+
85+
86+
def _extract_benchmark_scores(model_info: dict[str, Any]) -> dict[str, float]:
87+
"""Extract benchmark scores from HF model card metadata.
88+
89+
Looks for eval_results in the model card data. Returns a dict of
90+
benchmark_name -> score.
91+
"""
92+
scores: dict[str, float] = {}
93+
# HF model cards store eval results in cardData.eval_results
94+
card_data = model_info.get("cardData", {}) or {}
95+
eval_results = card_data.get("eval_results", []) or []
96+
for result in eval_results:
97+
if not isinstance(result, dict):
98+
continue
99+
dataset = result.get("dataset", {})
100+
name = dataset.get("name", "") if isinstance(dataset, dict) else str(dataset)
101+
metrics = result.get("metrics", []) or []
102+
for metric in metrics:
103+
if isinstance(metric, dict):
104+
metric_name = metric.get("name", "")
105+
value = metric.get("value")
106+
if metric_name and value is not None:
107+
try:
108+
key = f"{name}/{metric_name}" if name else metric_name
109+
scores[key] = float(value)
110+
except (TypeError, ValueError):
111+
continue
112+
return scores
113+
114+
115+
def sync_scores(
    project_dir: str | Path = ".",
    models: list[str] | None = None,
    timeout: int = 10,
) -> dict[str, dict[str, float]]:
    """Sync model scores from HuggingFace and persist them.

    Model ids containing ``"/"`` are looked up through the HF API and their
    extracted benchmark scores are used when any are found. Every other model
    (and any HF model that yields no benchmark data) falls back to its entry
    in ``BASELINE_SCORES``, stored under the ``"baseline_composite"`` key.
    Models with neither source are left out of the result.

    Args:
        project_dir: Project root passed on to ``save_scores``.
        models: Model ids to sync; ``TRACKED_MODELS`` is used when this is
            ``None`` or empty.
        timeout: Per-request timeout in seconds for the HF API.

    Returns:
        Mapping of model id to ``{benchmark: score}``. The same mapping is
        written to disk via ``save_scores`` before returning.
    """
    from specsmith.agent.model_intelligence import BASELINE_SCORES

    target_models = models or TRACKED_MODELS
    all_scores: dict[str, dict[str, float]] = {}

    for model_id in target_models:
        benchmarks: dict[str, float] = {}

        # Only "org/name" ids are looked up through the HF API.
        if "/" in model_id:
            info = fetch_hf_model_info(model_id, timeout=timeout)
            if info:
                benchmarks = _extract_benchmark_scores(info)

        if benchmarks:
            all_scores[model_id] = benchmarks
            continue

        # No fetched data: fall back to the baseline, if one exists.
        baseline = BASELINE_SCORES.get(model_id)
        if baseline:
            all_scores[model_id] = {"baseline_composite": baseline}

    save_scores(all_scores, project_dir)
    return all_scores
155+
156+
157+
def is_stale(project_dir: str | Path = ".", max_age_hours: int = 24) -> bool:
    """Check whether the cached scores are older than *max_age_hours*.

    Args:
        project_dir: Project root whose cached scores are inspected.
        max_age_hours: Maximum acceptable age of the cache, in hours.

    Returns:
        ``True`` when the cache has no usable ``synced_at`` timestamp (missing,
        not a string, unparsable, or lacking a UTC offset) or when the
        timestamp is more than *max_age_hours* old; ``False`` otherwise.
    """
    from datetime import datetime, timezone

    cached = load_cached_scores(project_dir)
    synced_at = cached.get("synced_at") if isinstance(cached, dict) else None
    # A hand-edited or corrupted cache may hold a non-string here; treat it
    # as "never synced" instead of failing on the string method below.
    if not isinstance(synced_at, str) or not synced_at:
        return True
    try:
        # Spell the trailing "Z" as an explicit offset so fromisoformat()
        # accepts it on Python versions that do not parse "Z" themselves.
        synced = datetime.fromisoformat(synced_at.replace("Z", "+00:00"))
        # Subtracting a naive timestamp from an aware "now" raises TypeError,
        # which is handled below as stale.
        age = datetime.now(timezone.utc) - synced
        return age.total_seconds() > max_age_hours * 3600
    except (ValueError, TypeError):
        return True
171+
172+
173+
__all__ = [
174+
"fetch_hf_model_info",
175+
"is_stale",
176+
"load_cached_scores",
177+
"save_scores",
178+
"sync_scores",
179+
]

src/specsmith/agent/spawner.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
3+
"""Sub-agent spawner — spawn isolated agent workers with tool subsets.
4+
5+
ARCHITECTURE.md §13 Phase 2: Multi-Agent Layer.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from dataclasses import dataclass, field
11+
from typing import Any
12+
13+
14+
@dataclass
class SpawnedAgent:
    """Record describing one spawned sub-agent.

    Attributes:
        id: Identifier of the agent.
        role: Role name the agent was spawned with.
        tools: Names of the tools assigned to the agent.
        status: Lifecycle state; starts as ``"idle"``.
        result: Result payload; starts as an empty dict.
    """

    id: str
    role: str
    tools: list[str]
    status: str = "idle"  # idle, running, completed, failed
    result: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the agent's fields as a plain dict.

        The values are the attribute objects themselves, not copies.
        """
        exported = ("id", "role", "tools", "status", "result")
        return {attr: getattr(self, attr) for attr in exported}
32+
33+
34+
# Default tool names per agent role. A role that is not listed here has no
# default tools.
ROLE_TOOLS: dict[str, list[str]] = dict(
    coder=["read_file", "write_file", "run_shell", "apply_diff"],
    reviewer=["read_file", "run_shell", "git_diff"],
    tester=["read_file", "run_shell", "run_tests"],
    architect=["read_file", "write_file"],
    researcher=["read_file", "search_web", "search_repo"],
)
42+
43+
44+
class SubAgentSpawner:
    """Spawn and track isolated agent workers.

    Each spawned agent is recorded with a tool list: either the explicit list
    given to :meth:`spawn` or the role's default subset from ``ROLE_TOOLS``.
    This class only records that list; enforcing it is presumably done by
    whatever executes the agent (not visible here).
    """

    def __init__(self) -> None:
        # Registry of every agent spawned by this instance, keyed by agent id.
        self._agents: dict[str, SpawnedAgent] = {}
        # Monotonic counter used to build unique agent ids.
        self._counter = 0

    def spawn(self, role: str, tools: list[str] | None = None) -> SpawnedAgent:
        """Spawn a new sub-agent with the given role and tool set.

        Args:
            role: Role name; also used to look up the default tools.
            tools: Explicit tool names. ``None`` selects the role's defaults
                from ``ROLE_TOOLS`` (no tools for an unknown role). An empty
                list is honoured as "no tools".

        Returns:
            The newly registered agent, with an id of the form
            ``agent-<role>-<NNN>``.
        """
        self._counter += 1
        agent_id = f"agent-{role}-{self._counter:03d}"
        if tools is None:
            tools = ROLE_TOOLS.get(role, [])
        # Copy the list so that mutating one agent's tools can never alter the
        # shared ROLE_TOOLS defaults or the caller's own list.
        agent = SpawnedAgent(id=agent_id, role=role, tools=list(tools))
        self._agents[agent_id] = agent
        return agent

    def get(self, agent_id: str) -> SpawnedAgent | None:
        """Return the agent with *agent_id*, or ``None`` if it is unknown."""
        return self._agents.get(agent_id)

    def list_active(self) -> list[SpawnedAgent]:
        """List all agents whose status is ``"idle"`` or ``"running"``."""
        return [a for a in self._agents.values() if a.status in ("idle", "running")]

    def list_all(self) -> list[SpawnedAgent]:
        """List every agent spawned by this instance."""
        return list(self._agents.values())

    def complete(self, agent_id: str, result: dict[str, Any]) -> None:
        """Mark an agent as completed and store its result.

        Unknown agent ids are ignored.
        """
        agent = self._agents.get(agent_id)
        if agent:
            agent.status = "completed"
            agent.result = result

    def fail(self, agent_id: str, error: str) -> None:
        """Mark an agent as failed, storing ``{"error": error}`` as its result.

        Unknown agent ids are ignored.
        """
        agent = self._agents.get(agent_id)
        if agent:
            agent.status = "failed"
            agent.result = {"error": error}
90+
91+
92+
__all__ = ["ROLE_TOOLS", "SpawnedAgent", "SubAgentSpawner"]

src/specsmith/agent/teams.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2026 BitConcepts, LLC. All rights reserved.
3+
"""Team definitions for multi-agent coordination.
4+
5+
ARCHITECTURE.md §13 Phase 2: predefined agent team compositions.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from dataclasses import dataclass, field
11+
from typing import Any
12+
13+
14+
@dataclass
class TeamMember:
    """One role slot inside a team definition.

    Attributes:
        role: Name of the agent role filling this slot.
        required: Flag marking the slot as required; defaults to ``True``.
        tools_override: Optional explicit tool list for this slot; ``None``
            (the default) means no override is given.
    """

    role: str
    required: bool = True
    tools_override: list[str] | None = None
21+
22+
23+
@dataclass
class TeamDefinition:
    """A named team of agent roles that work together.

    Attributes:
        id: Stable identifier of the team.
        name: Human-readable team name.
        description: Short description of the team.
        members: Role slots that make up the team, in order.
    """

    id: str
    name: str
    description: str
    members: list[TeamMember] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the team as a plain dict.

        Each member is rendered as ``{"role": ..., "required": ...}``. A
        ``"tools_override"`` key (a copy of the list) is added only for
        members that define an override, so the output for members without
        one is unchanged.
        """
        members: list[dict[str, Any]] = []
        for member in self.members:
            entry: dict[str, Any] = {"role": member.role, "required": member.required}
            # Without this the override is silently lost on serialization.
            if member.tools_override is not None:
                entry["tools_override"] = list(member.tools_override)
            members.append(entry)
        return {
            "id": self.id,
            "name": self.name,
            "description": self.description,
            "members": members,
        }
39+
40+
41+
# Pre-defined teams. Every member uses the TeamMember defaults
# (required, no tool override).
# NOTE(review): the roles "ip-analyst", "strategist" and "drafter" have no entry
# in agent/spawner.py's ROLE_TOOLS as shown in this change — confirm how their
# tool sets are meant to be supplied.
PAIR_REVIEW = TeamDefinition(
    id="pair-review",
    name="Pair Review",
    description="Coder + Reviewer pair for code changes with built-in review",
    members=[TeamMember(role=role) for role in ("coder", "reviewer")],
)

FULL_STACK = TeamDefinition(
    id="full-stack",
    name="Full Stack",
    description="Architect + Coder + Tester trio for complete feature development",
    members=[TeamMember(role=role) for role in ("architect", "coder", "tester")],
)

IP_ANALYSIS = TeamDefinition(
    id="ip-analysis",
    name="IP Analysis",
    description="IP Analyst + Researcher + Strategist for patent work",
    members=[
        TeamMember(role=role) for role in ("ip-analyst", "researcher", "strategist")
    ],
)

SPEC_DRAFT = TeamDefinition(
    id="spec-draft",
    name="Specification Drafting",
    description="Architect + Drafter + Reviewer for specification writing",
    members=[TeamMember(role=role) for role in ("architect", "drafter", "reviewer")],
)
84+
85+
# Registry of the built-in teams, keyed by each team's own id
# ("pair-review", "full-stack", "ip-analysis", "spec-draft").
BUILTIN_TEAMS: dict[str, TeamDefinition] = {
    team.id: team for team in (PAIR_REVIEW, FULL_STACK, IP_ANALYSIS, SPEC_DRAFT)
}
91+
92+
93+
def get_team(team_id: str) -> TeamDefinition | None:
    """Look up a built-in team by its id.

    Returns the matching team, or ``None`` when no built-in team has that id.
    """
    if team_id in BUILTIN_TEAMS:
        return BUILTIN_TEAMS[team_id]
    return None
96+
97+
98+
def list_teams() -> list[TeamDefinition]:
    """Return a new list holding every built-in team, in registry order."""
    return [*BUILTIN_TEAMS.values()]
101+
102+
103+
__all__ = ["BUILTIN_TEAMS", "TeamDefinition", "TeamMember", "get_team", "list_teams"]

0 commit comments

Comments
 (0)