Skip to content

Commit 46ce19a

Browse files
committed
feat(wiki redesign): Phase 7.3 — seed wiki from existing repo docs
Finishes the cold-start trilogy. wiki_seed_codebase scans the current repo for README, CHANGELOG, CONTRIBUTING, ARCHITECTURE, HISTORY, SECURITY, CLAUDE, AGENTS, docs/**/*.md, adr/**/*.md, ADR-*.md, and .claude/**/*.md files, imports each as a memory (kind auto-detected from filename/path), and kicks off the full wiki pipeline. Caps per-file reads at 8 kB (avoids flooding the extractor on a 50-page monolithic README; `[...truncated]` marker added). Skips vendored / generated paths (node_modules, .venv, .build, dist, .generated, etc.). Caps total files at 50 by default. A verified dry run against the Cortex repo surfaces 30 legitimate docs (README, CLAUDE.md, 18 ADRs in docs/adr/, …) with correct kind inference. Intended call path on fresh install: /cortex-setup-project → wiki_seed_codebase → pipeline → 20-50 pages in the first session, before any session-derived memories accumulate. run_pipeline defaults to True; dry_run is available for previewing what would get ingested.
1 parent 56c86fc commit 46ce19a

1 file changed

Lines changed: 203 additions & 0 deletions

File tree

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
"""Seed the wiki from a project's existing markdown docs (Phase 7.3).
2+
3+
Purpose: first-run users get concrete pages in their wiki within
4+
minutes of install, without waiting for session-derived memories to
5+
accumulate.
6+
7+
Scanned by default:
8+
README.md, CHANGELOG.md, CONTRIBUTING.md, ARCHITECTURE.md,
9+
HISTORY.md, SECURITY.md, AGENTS.md, CLAUDE.md, docs/**/*.md, adr/**/*.md, ADR-*.md, .claude/**/*.md
10+
11+
Each file becomes ONE memory (via remember), tagged `seed:codebase`
12+
and the detected kind. The wiki pipeline is run afterward so the
13+
imports produce claim events → concepts → drafts → pages in one call.
14+
15+
Per-file size capped at 8 kB (head-only — prevents a 50-page README
16+
from flooding the extractor). Empty files and files over 2 MB are skipped.
17+
18+
Never raises per-file; collects errors in the summary.
19+
"""
20+
21+
from __future__ import annotations
22+
23+
from pathlib import Path
24+
from typing import Any
25+
26+
27+
# Tool descriptor: a human-readable description plus the argument spec
# (names, types, defaults) for the options that ``handler`` reads.
schema = {
    "description": (
        "Seed the wiki from markdown documents already in the repo (README, "
        "ADRs, docs/, …). Imports each as a memory, runs the full wiki "
        "pipeline, returns per-stage counts."
    ),
    "inputSchema": {
        "type": "object",
        "properties": dict(
            repo_root={
                "type": "string",
                "description": "Path to the repo. Defaults to cwd.",
            },
            max_files={"type": "integer", "default": 50},
            max_bytes_per_file={"type": "integer", "default": 8192},
            dry_run={"type": "boolean", "default": False},
            run_pipeline={"type": "boolean", "default": True},
        ),
    },
}
47+
48+
49+
# Priority-ordered; scanner walks the repo once and keeps files whose
50+
# relative path matches any of these patterns.
51+
_SEED_PATTERNS: list[str] = [
52+
"README.md",
53+
"CHANGELOG.md",
54+
"CONTRIBUTING.md",
55+
"ARCHITECTURE.md",
56+
"HISTORY.md",
57+
"SECURITY.md",
58+
"AGENTS.md",
59+
"CLAUDE.md",
60+
"docs/**/*.md",
61+
"adr/**/*.md",
62+
"ADR-*.md",
63+
".claude/**/*.md",
64+
]
65+
66+
# Skip these paths even if they match a pattern (vendored / generated).
67+
_SKIP_PATH_FRAGMENTS = (
68+
"/node_modules/",
69+
"/.venv/",
70+
"/.build/",
71+
"/.git/",
72+
"/dist/",
73+
"/build/",
74+
"/__pycache__/",
75+
"/.cache/",
76+
"/.generated/",
77+
)
78+
79+
80+
def _kind_for(rel_path: str) -> str:
81+
low = rel_path.lower()
82+
if "adr" in low or "decision" in low:
83+
return "adr"
84+
if "architecture" in low:
85+
return "spec"
86+
if "convention" in low or "style" in low:
87+
return "convention"
88+
if "lesson" in low or "postmortem" in low:
89+
return "lesson"
90+
if low.startswith("readme") or low.endswith("/readme.md"):
91+
return "note"
92+
return "note"
93+
94+
95+
def _collect_files(
96+
root: Path, max_files: int, max_bytes: int
97+
) -> list[tuple[Path, str]]:
98+
"""Return [(abs_path, rel_path)] for seed-worthy markdown."""
99+
root = root.resolve()
100+
seen: set[Path] = set()
101+
results: list[tuple[Path, str]] = []
102+
for pattern in _SEED_PATTERNS:
103+
for p in sorted(root.glob(pattern)):
104+
try:
105+
pr = p.resolve()
106+
except OSError:
107+
continue
108+
if pr in seen:
109+
continue
110+
seen.add(pr)
111+
rel = str(pr.relative_to(root)).replace("\\", "/")
112+
if any(frag in f"/{rel}" for frag in _SKIP_PATH_FRAGMENTS):
113+
continue
114+
if not pr.is_file():
115+
continue
116+
try:
117+
size = pr.stat().st_size
118+
except OSError:
119+
continue
120+
if size == 0 or size > 2_000_000:
121+
continue
122+
_ = max_bytes # capped at read time, not filter time
123+
results.append((pr, rel))
124+
if len(results) >= max_files:
125+
return results
126+
return results
127+
128+
129+
async def handler(args: dict[str, Any] | None = None) -> dict[str, Any]:
    """Import the repo's existing markdown docs as memories and run the wiki pipeline.

    Recognised keys in *args* (all optional):

    * ``repo_root`` - path to the repo; defaults to the current directory.
    * ``max_files`` - maximum number of files to import (default 50).
    * ``max_bytes_per_file`` - per-file cap, measured in UTF-8 bytes
      (default 8192). Longer files are cut and a ``[...truncated]``
      marker is appended.
    * ``dry_run`` - when true, only list what would be imported.
    * ``run_pipeline`` - when true (default) and at least one file was
      imported, run the wiki pipeline afterwards.

    Returns:
        A summary dict. It always has ``files_found`` and ``dry_run``. A dry
        run adds ``preview`` (path, kind and size per file). A real run adds
        ``imported``, ``errors`` (first 10 messages) and ``error_count``,
        plus ``pipeline`` when the pipeline was attempted: either per-stage
        counts or ``{"error": ...}``.

    Failures while reading or storing a single file, and a pipeline failure,
    are recorded in the summary instead of being raised.
    """
    args = args or {}

    def _int_arg(key: str, default: int) -> int:
        # An explicit None means "use the default" rather than crashing int().
        value = args.get(key)
        return default if value is None else int(value)

    repo_root = Path(args.get("repo_root") or Path.cwd()).resolve()
    max_files = _int_arg("max_files", 50)
    # A negative cap would make the slice below count from the end.
    max_bytes = max(0, _int_arg("max_bytes_per_file", 8192))
    dry_run = bool(args.get("dry_run", False))
    run_pipeline = bool(args.get("run_pipeline", True))

    files = _collect_files(repo_root, max_files, max_bytes)
    if not files:
        return {
            "files_found": 0,
            "imported": 0,
            "note": "no seed-eligible markdown found in this repo",
            "dry_run": dry_run,
        }

    if dry_run:
        return {
            "files_found": len(files),
            "preview": [
                {"path": rel, "kind": _kind_for(rel), "size": p.stat().st_size}
                for p, rel in files
            ],
            "dry_run": True,
        }

    # Imported only once something will actually be stored, so a dry run
    # does not depend on the remember handler being importable.
    from mcp_server.handlers.remember import handler as h_remember

    domain = repo_root.name or "seed"
    imported = 0
    errors: list[str] = []
    for p, rel in files:
        try:
            content = p.read_text(encoding="utf-8", errors="replace")
            # Enforce the cap in bytes, as the option name promises. Slicing
            # the str would let multi-byte text exceed it several times over.
            encoded = content.encode("utf-8")
            if len(encoded) > max_bytes:
                # errors="ignore" drops a multi-byte character cut in half
                # at the boundary.
                head = encoded[:max_bytes].decode("utf-8", errors="ignore")
                content = head + "\n\n[...truncated]"
            kind = _kind_for(rel)
            result = await h_remember(
                {
                    "content": content,
                    "tags": ["seed:codebase", f"kind:{kind}", f"file:{rel}"],
                    "domain": domain,
                    "source": f"seed:{rel}",
                    "force": True,
                }
            )
            if result.get("stored") or result.get("memory_id"):
                imported += 1
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the seed.
            errors.append(f"{rel}: {e}")

    summary: dict[str, Any] = {
        "files_found": len(files),
        "imported": imported,
        "errors": errors[:10],
        "error_count": len(errors),
        "dry_run": False,
    }

    if run_pipeline and imported > 0:
        try:
            from mcp_server.handlers.wiki_pipeline import handler as h_pipeline

            pipe = await h_pipeline({"limit_per_stage": 1000})
            summary["pipeline"] = {
                "claims_inserted": pipe.get("claims_inserted", 0),
                "concepts_inserted": pipe.get("concepts_inserted", 0),
                "drafts_approved": pipe.get("drafts_approved", 0),
                "pages_published": pipe.get("pages_published", 0),
            }
        except Exception as e:
            # The imports already succeeded; report the pipeline failure
            # without losing the import summary.
            summary["pipeline"] = {"error": str(e)}

    return summary

0 commit comments

Comments
 (0)