|
| 1 | +"""Seed the wiki from a project's existing markdown docs (Phase 7.3). |
| 2 | +
|
| 3 | +Purpose: first-run users get concrete pages in their wiki within |
| 4 | +minutes of install, without waiting for session-derived memories to |
| 5 | +accumulate. |
| 6 | +
|
| 7 | +Scanned by default (see _SEED_PATTERNS): |
| 8 | + README.md, CHANGELOG.md, CONTRIBUTING.md, ARCHITECTURE.md, HISTORY.md, |
| 9 | + SECURITY.md, AGENTS.md, CLAUDE.md, docs/**/*.md, adr/**/*.md, ADR-*.md, .claude/**/*.md |
| 10 | +
|
| 11 | +Each file becomes ONE memory (via remember), tagged `seed:codebase` |
| 12 | +and the detected kind. The wiki pipeline is run afterward so the |
| 13 | +imports produce claim events → concepts → drafts → pages in one call. |
| 14 | +
|
| 15 | +Per-file size capped at 8 kB (head-only — prevents a 50-page README |
| 16 | +from flooding the extractor). Binary and huge files skipped. |
| 17 | +
|
| 18 | +Never raises per-file; collects errors in the summary. |
| 19 | +""" |
| 20 | + |
| 21 | +from __future__ import annotations |
| 22 | + |
| 23 | +from pathlib import Path |
| 24 | +from typing import Any |
| 25 | + |
| 26 | + |
# Tool descriptor: a human-readable summary plus the JSON Schema of the
# arguments ``handler`` accepts. No property is required; ``handler`` applies
# the same defaults that are advertised here.
schema = {
    "description": (
        "Seed the wiki from markdown documents already in the repo (README, "
        "ADRs, docs/, …). Imports each as a memory, runs the full wiki "
        "pipeline, returns per-stage counts."
    ),
    "inputSchema": {
        "type": "object",
        "properties": {
            # Directory to scan.
            "repo_root": {
                "type": "string",
                "description": "Path to the repo. Defaults to cwd.",
            },
            # Upper bound on the number of files collected.
            "max_files": {
                "type": "integer",
                "default": 50,
            },
            # Per-file cap applied when the content is read.
            "max_bytes_per_file": {
                "type": "integer",
                "default": 8192,
            },
            # When true, only list what would be imported.
            "dry_run": {
                "type": "boolean",
                "default": False,
            },
            # When true, run the wiki pipeline after a successful import.
            "run_pipeline": {
                "type": "boolean",
                "default": True,
            },
        },
    },
}
| 47 | + |
| 48 | + |
| 49 | +# Priority-ordered; scanner walks the repo once and keeps files whose |
| 50 | +# relative path matches any of these patterns. |
| 51 | +_SEED_PATTERNS: list[str] = [ |
| 52 | + "README.md", |
| 53 | + "CHANGELOG.md", |
| 54 | + "CONTRIBUTING.md", |
| 55 | + "ARCHITECTURE.md", |
| 56 | + "HISTORY.md", |
| 57 | + "SECURITY.md", |
| 58 | + "AGENTS.md", |
| 59 | + "CLAUDE.md", |
| 60 | + "docs/**/*.md", |
| 61 | + "adr/**/*.md", |
| 62 | + "ADR-*.md", |
| 63 | + ".claude/**/*.md", |
| 64 | +] |
| 65 | + |
| 66 | +# Skip these paths even if they match a pattern (vendored / generated). |
| 67 | +_SKIP_PATH_FRAGMENTS = ( |
| 68 | + "/node_modules/", |
| 69 | + "/.venv/", |
| 70 | + "/.build/", |
| 71 | + "/.git/", |
| 72 | + "/dist/", |
| 73 | + "/build/", |
| 74 | + "/__pycache__/", |
| 75 | + "/.cache/", |
| 76 | + "/.generated/", |
| 77 | +) |
| 78 | + |
| 79 | + |
| 80 | +def _kind_for(rel_path: str) -> str: |
| 81 | + low = rel_path.lower() |
| 82 | + if "adr" in low or "decision" in low: |
| 83 | + return "adr" |
| 84 | + if "architecture" in low: |
| 85 | + return "spec" |
| 86 | + if "convention" in low or "style" in low: |
| 87 | + return "convention" |
| 88 | + if "lesson" in low or "postmortem" in low: |
| 89 | + return "lesson" |
| 90 | + if low.startswith("readme") or low.endswith("/readme.md"): |
| 91 | + return "note" |
| 92 | + return "note" |
| 93 | + |
| 94 | + |
def _collect_files(
    root: Path, max_files: int, max_bytes: int
) -> list[tuple[Path, str]]:
    """Return ``[(abs_path, rel_path)]`` for seed-worthy markdown under *root*.

    Patterns from ``_SEED_PATTERNS`` are globbed in order and each pattern's
    matches are visited in sorted order, so the result is deterministic and
    earlier patterns win when the cap is hit. A file is kept only if it

    * has not already been seen (compared by resolved path),
    * resolves to a location inside *root*,
    * does not sit under any ``_SKIP_PATH_FRAGMENTS`` directory,
    * is a regular, non-empty file of at most 2,000,000 bytes.

    Args:
        root: Repository root to scan.
        max_files: Maximum number of entries to return; values ``<= 0``
            yield an empty list.
        max_bytes: Accepted for signature compatibility only. The per-file
            cap is applied when the content is read, not while filtering.

    Returns:
        Up to *max_files* tuples of (resolved absolute path, ``/``-separated
        path relative to the resolved root).
    """
    if max_files <= 0:
        return []

    size_ceiling = 2_000_000  # bytes; anything larger is skipped outright
    root = root.resolve()
    seen: set[Path] = set()
    results: list[tuple[Path, str]] = []

    for pattern in _SEED_PATTERNS:
        for candidate in sorted(root.glob(pattern)):
            try:
                resolved = candidate.resolve()
            except (OSError, RuntimeError):
                # RuntimeError: symlink loop on Python < 3.13.
                continue
            if resolved in seen:
                continue
            seen.add(resolved)

            try:
                rel = resolved.relative_to(root).as_posix()
            except ValueError:
                # A symlink whose target lies outside the repo; skip it
                # instead of letting one stray link abort the whole scan.
                continue
            # Prefix "/" so a fragment can also match the first component.
            if any(frag in f"/{rel}" for frag in _SKIP_PATH_FRAGMENTS):
                continue
            if not resolved.is_file():
                continue
            try:
                size = resolved.stat().st_size
            except OSError:
                continue
            if size == 0 or size > size_ceiling:
                continue

            results.append((resolved, rel))
            if len(results) >= max_files:
                return results
    return results
| 127 | + |
| 128 | + |
async def handler(args: dict[str, Any] | None = None) -> dict[str, Any]:
    """Import the repo's markdown docs as memories and optionally run the wiki pipeline.

    Args:
        args: Optional mapping with any of:
            ``repo_root`` (str): directory to scan; defaults to the cwd.
            ``max_files`` (int): cap on files collected; default 50.
            ``max_bytes_per_file`` (int): cap on content kept per file;
            default 8192, negative values are treated as 0.
            ``dry_run`` (bool): only report what would be imported.
            ``run_pipeline`` (bool): run the wiki pipeline after importing;
            default True.

    Returns:
        A summary dict. With no eligible files: ``files_found``, ``imported``,
        ``note``, ``dry_run``. In dry-run mode: ``files_found``, ``preview``
        (path, kind, size per file), ``dry_run``. Otherwise: ``files_found``,
        ``imported``, ``errors`` (first 10 messages), ``error_count``,
        ``dry_run`` and, when the pipeline was attempted, ``pipeline`` with
        per-stage counts or an ``error`` string.

    Per-file failures and pipeline failures are recorded in the summary
    rather than raised.
    """
    args = args or {}
    repo_root = Path(args.get("repo_root") or Path.cwd()).resolve()
    max_files = int(args.get("max_files", 50))
    # Clamp: a negative cap would otherwise slice content from the end.
    max_bytes = max(0, int(args.get("max_bytes_per_file", 8192)))
    dry_run = bool(args.get("dry_run", False))
    run_pipeline = bool(args.get("run_pipeline", True))

    files = _collect_files(repo_root, max_files, max_bytes)
    if not files:
        return {
            "files_found": 0,
            "imported": 0,
            "note": "no seed-eligible markdown found in this repo",
            "dry_run": dry_run,
        }

    if dry_run:
        return {
            "files_found": len(files),
            "preview": [
                {"path": rel, "kind": _kind_for(rel), "size": p.stat().st_size}
                for p, rel in files
            ],
            "dry_run": True,
        }

    # Imported only once a real import is about to happen, so the empty-repo
    # and dry-run paths above do not depend on the memory backend.
    from mcp_server.handlers.remember import handler as h_remember

    domain = repo_root.name or "seed"  # same for every file; computed once
    imported = 0
    errors: list[str] = []
    for p, rel in files:
        # Broad catch is deliberate: one bad file must not stop the rest, and
        # the failure is reported through the summary.
        try:
            # Read just enough to know whether the file exceeds the cap,
            # instead of loading the whole file and discarding most of it.
            # NOTE: the cap counts decoded characters, not raw bytes.
            with p.open(encoding="utf-8", errors="replace") as fh:
                content = fh.read(max_bytes + 1)
            if len(content) > max_bytes:
                content = content[:max_bytes] + "\n\n[...truncated]"
            kind = _kind_for(rel)
            result = await h_remember(
                {
                    "content": content,
                    "tags": ["seed:codebase", f"kind:{kind}", f"file:{rel}"],
                    "domain": domain,
                    "source": f"seed:{rel}",
                    "force": True,
                }
            )
            if result.get("stored") or result.get("memory_id"):
                imported += 1
        except Exception as e:
            errors.append(f"{rel}: {e}")

    summary: dict[str, Any] = {
        "files_found": len(files),
        "imported": imported,
        "errors": errors[:10],  # keep the response small; full count is below
        "error_count": len(errors),
        "dry_run": False,
    }

    # Nothing new was stored -> nothing for the pipeline to process.
    if run_pipeline and imported > 0:
        try:
            from mcp_server.handlers.wiki_pipeline import handler as h_pipeline

            pipe = await h_pipeline({"limit_per_stage": 1000})
            summary["pipeline"] = {
                "claims_inserted": pipe.get("claims_inserted", 0),
                "concepts_inserted": pipe.get("concepts_inserted", 0),
                "drafts_approved": pipe.get("drafts_approved", 0),
                "pages_published": pipe.get("pages_published", 0),
            }
        except Exception as e:
            # The import itself succeeded; report the pipeline failure
            # without discarding the import summary.
            summary["pipeline"] = {"error": str(e)}

    return summary
0 commit comments