Skip to content

Commit 279eec7

Browse files
tbitcs and oz-agent committed
feat: topic-aware classification (#47), type-specific thresholds (#48)
#47 — Body content classification:
- Unmatched sections now get a secondary scan of body text (first 2000 chars)
- Register maps, address offsets, block diagrams, milestones, and roadmaps route to verification.md (technical reference)
- Windows path setup, per-machine config, and NTFS workarounds route to drift-metrics.md (environment)
- Reduces misclassification where the heading is generic but the content is specific

#48 — Type-specific governance thresholds:
- fpga-rtl: rules=1000, workflow=500, verification=600
- yocto-bsp: rules=1000, workflow=500, verification=500
- embedded-hardware: rules=1000, verification=500
- pcb-hardware: rules=900, verification=500
- Reads project type from scaffold.yml, falls back to defaults
- Software projects keep lower defaults (rules=800, etc.)

Closes #47, closes #48

Co-Authored-By: Oz <oz-agent@warp.dev>
1 parent b2269ff commit 279eec7

2 files changed

Lines changed: 76 additions & 13 deletions

File tree

src/specsmith/auditor.py

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -283,20 +283,62 @@ def check_ledger_health(root: Path) -> list[AuditResult]:
283283
# ---------------------------------------------------------------------------
284284

285285

286+
# Default thresholds (used when no project type is detected).
# Keys are paths relative to the project root; values are maximum line counts.
# AGENTS.md is strict (loaded every session). Modular governance files are
# lazily loaded per task type so can be larger.
_DEFAULT_THRESHOLDS: dict[str, int] = {
    "AGENTS.md": 200,
    "docs/governance/rules.md": 800,
    "docs/governance/workflow.md": 400,
    "docs/governance/roles.md": 300,
    "docs/governance/context-budget.md": 300,
    "docs/governance/verification.md": 400,
    "docs/governance/drift-metrics.md": 300,
}

# Type-specific overrides — hardware/embedded projects have denser rules.
# Only the files listed for a type are overridden; every other file keeps
# its value from _DEFAULT_THRESHOLDS.
_TYPE_THRESHOLD_OVERRIDES: dict[str, dict[str, int]] = {
    "fpga-rtl": {
        "docs/governance/rules.md": 1000,
        "docs/governance/workflow.md": 500,
        "docs/governance/verification.md": 600,
    },
    "yocto-bsp": {
        "docs/governance/rules.md": 1000,
        "docs/governance/workflow.md": 500,
        "docs/governance/verification.md": 500,
    },
    "embedded-hardware": {
        "docs/governance/rules.md": 1000,
        "docs/governance/verification.md": 500,
    },
    "pcb-hardware": {
        "docs/governance/rules.md": 900,
        "docs/governance/verification.md": 500,
    },
}


def _get_thresholds(root: Path) -> dict[str, int]:
    """Get governance size thresholds, scaled by project type if available.

    Reads the top-level ``type`` key from ``<root>/scaffold.yml`` and layers
    the matching entry of ``_TYPE_THRESHOLD_OVERRIDES`` on top of a copy of
    ``_DEFAULT_THRESHOLDS``.

    This is deliberately best-effort: a missing or unreadable file, a missing
    ``yaml`` module, malformed YAML, a non-mapping document, or an unknown or
    non-string ``type`` all yield the unmodified defaults instead of raising.

    Args:
        root: Project root directory expected to contain ``scaffold.yml``.

    Returns:
        A new dict mapping root-relative file paths to their maximum line
        count. The module-level tables are never mutated.
    """
    thresholds = dict(_DEFAULT_THRESHOLDS)
    scaffold_path = root / "scaffold.yml"
    if not scaffold_path.exists():
        return thresholds

    try:
        import yaml

        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # would otherwise decide how the file is decoded.
        with open(scaffold_path, encoding="utf-8") as f:
            raw = yaml.safe_load(f)
    except Exception:  # noqa: BLE001 - best-effort: any failure means defaults
        import logging

        # Keep the fallback, but leave a trace so a broken scaffold.yml can
        # be diagnosed instead of being silently ignored.
        logging.getLogger(__name__).debug(
            "Could not read project type from %s; using default thresholds",
            scaffold_path,
            exc_info=True,
        )
        return thresholds

    # An empty file parses to None and a top-level list/scalar is legal YAML;
    # neither carries a project type.
    if not isinstance(raw, dict):
        return thresholds

    ptype = raw.get("type")
    # Guard against non-string (possibly unhashable) values such as a list.
    if isinstance(ptype, str):
        thresholds.update(_TYPE_THRESHOLD_OVERRIDES.get(ptype, {}))
    return thresholds
336+
337+
286338
def check_context_size(root: Path) -> list[AuditResult]:
287-
"""Check governance file sizes against thresholds."""
339+
"""Check governance file sizes against type-aware thresholds."""
288340
results: list[AuditResult] = []
289-
# AGENTS.md is strict (loaded every session). Modular governance files
290-
# are lazily loaded per task type so can be larger.
291-
thresholds = {
292-
"AGENTS.md": 200,
293-
"docs/governance/rules.md": 800,
294-
"docs/governance/workflow.md": 400,
295-
"docs/governance/roles.md": 300,
296-
"docs/governance/context-budget.md": 300,
297-
"docs/governance/verification.md": 400,
298-
"docs/governance/drift-metrics.md": 300,
299-
}
341+
thresholds = _get_thresholds(root)
300342

301343
for rel_path, max_lines in thresholds.items():
302344
path = root / rel_path

src/specsmith/importer.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,20 @@ def _extract_governance_sections(root: Path) -> dict[str, str]:
786786
}
787787
unmatched: list[tuple[str, str]] = []
788788

789+
# Body-level content keywords for secondary classification.
790+
# Used when heading doesn't match — scan body text for strong signals.
791+
_BODY_ARCHITECTURE_KW = [
792+
"register map", "address offset", "0x0", "register name",
793+
"block diagram", "data flow", "interface spec",
794+
"directory layout", "src/", "repository structure",
795+
"milestone", "roadmap", "completion", "phase 2 target",
796+
]
797+
_BODY_DRIFT_KW = [
798+
"subst v:", "path-length", "one-time setup", "per-machine",
799+
"environment variable", "install once", "bootstrap",
800+
"windows path", "ntfs",
801+
]
802+
789803
for heading, body in sections.items():
790804
key_lower = heading.lower()
791805
matched = False
@@ -795,7 +809,14 @@ def _extract_governance_sections(root: Path) -> dict[str, str]:
795809
matched = True
796810
break # First match wins
797811
if not matched:
798-
unmatched.append((heading, body))
812+
# Secondary pass: scan body content for strong topic signals
813+
body_lower = body[:2000].lower() # Cap scan for performance
814+
if any(kw in body_lower for kw in _BODY_ARCHITECTURE_KW):
815+
buckets["verification"].append((heading, body)) # technical reference
816+
elif any(kw in body_lower for kw in _BODY_DRIFT_KW):
817+
buckets["drift-metrics"].append((heading, body))
818+
else:
819+
unmatched.append((heading, body))
799820

800821
# Unmatched sections go to rules.md as project-specific rules
801822
if unmatched:

0 commit comments

Comments
 (0)