|
| 1 | +"""Materialize a sparse, API-sourced repo skeleton for clone-free scoring. |
| 2 | +
|
| 3 | +The audit engine's analyzers read a repo from the local filesystem. To score an |
| 4 | +arbitrary public GitHub user *without* cloning every repo (the hosted, multi-tenant |
| 5 | +path), this module reconstructs a sparse on-disk skeleton from the GitHub API: |
| 6 | +
|
| 7 | +* one Git Trees API call yields every path → directories are created and files are |
| 8 | + ``touch``-ed so presence-based analyzers (structure, testing, CI, docs, build) |
| 9 | + see the real shape of the repo; |
| 10 | +* a bounded set of high-signal files (README, dependency manifests) are fetched via |
| 11 | + the Contents API and written with real content, so content-based analyzers |
| 12 | + (README quality, dependency counts, test-framework detection) still work. |
| 13 | +
|
| 14 | +The existing analyzers run against this skeleton unmodified. ``materialize_api_workspace`` |
| 15 | +mirrors ``cloner.clone_workspace`` exactly (context manager yielding ``{name: Path}``), |
| 16 | +so it is a drop-in replacement for the clone step. |
| 17 | +
|
| 18 | +Materialization is sequential on purpose: it keeps API access well under GitHub's |
| 19 | +secondary rate limits (concurrent-request and points-per-minute caps) that a |
| 20 | +parallel burst across many repos would trip. |
| 21 | +""" |
| 22 | + |
| 23 | +from __future__ import annotations |
| 24 | + |
| 25 | +import logging |
| 26 | +import tempfile |
| 27 | +from contextlib import contextmanager |
| 28 | +from pathlib import Path |
| 29 | +from typing import TYPE_CHECKING, Callable, Generator |
| 30 | + |
| 31 | +from src.models import RepoMetadata |
| 32 | + |
| 33 | +if TYPE_CHECKING: |
| 34 | + from src.github_client import GitHubClient |
| 35 | + |
logger = logging.getLogger(__name__)

# Default cap on how many tree file entries are materialized for a single repo.
DEFAULT_MAX_FILES = 5000
# Default cap on how many files per repo are fetched with their real content.
DEFAULT_MAX_CONTENT_FILES = 20

# Lower-cased basenames of dependency/build manifests whose *content* (not merely
# their presence) feeds the scoring. README-like files are recognised separately,
# by prefix, in ``_is_content_file``.
CONTENT_FILE_NAMES = {
    "build.gradle",
    "cargo.toml",
    "composer.json",
    "gemfile",
    "go.mod",
    "package.json",
    "pipfile",
    "pom.xml",
    "pyproject.toml",
    "requirements.txt",
    "setup.cfg",
    "setup.py",
}
| 57 | + |
| 58 | + |
def _is_content_file(path: str) -> bool:
    """Report whether ``path`` names a file whose content should be fetched.

    Only the last ``/``-separated component is inspected, case-insensitively:
    it qualifies when it starts with ``readme`` or is listed in
    ``CONTENT_FILE_NAMES``.
    """
    _, _, tail = path.rpartition("/")
    name = tail.lower()
    return name.startswith("readme") or name in CONTENT_FILE_NAMES
| 62 | + |
| 63 | + |
def _safe_target(dest: Path, rel: str) -> Path | None:
    """Map a remote tree path onto a location strictly inside ``dest``.

    Tree paths are supplied by arbitrary remote repos, so entries such as
    ``../../etc/passwd`` or ``/abs/evil`` must never land outside ``dest``.

    Returns the fully resolved target path, or ``None`` when ``rel`` is blank,
    is ``.``/``..``, contains a NUL byte, resolves to ``dest`` itself, or
    resolves to anything that is not a descendant of ``dest``.
    """
    cleaned = rel.strip()
    if cleaned in ("", ".", "..") or "\x00" in cleaned:
        return None

    resolved = (dest / cleaned).resolve()
    root = dest.resolve()
    # ``root`` never appears among its own parents, so the ancestry test alone
    # would also reject ``resolved == root``; the equality is spelled out only
    # to keep that case obvious to the reader.
    if resolved == root or root not in resolved.parents:
        return None
    return resolved
| 80 | + |
| 81 | + |
def _select_content_paths(paths: list[str], budget: int) -> set[str]:
    """Pick at most ``budget`` content-bearing paths, shallowest first."""
    if budget <= 0:
        return set()
    candidates = [rel for rel in paths if _is_content_file(rel)]
    # Stable sort: entries at the same depth keep their tree order, so the only
    # thing that changes is that root-level files win over deeply nested ones.
    candidates.sort(key=lambda rel: rel.count("/"))
    return set(candidates[:budget])


def materialize_api_checkout(
    metadata: RepoMetadata,
    client: "GitHubClient",
    dest: Path,
    *,
    max_files: int = DEFAULT_MAX_FILES,
    max_content_files: int = DEFAULT_MAX_CONTENT_FILES,
) -> Path:
    """Build a sparse skeleton of one repo under ``dest`` from the GitHub API.

    Directories listed by the tree are created; files are written empty, except
    for a bounded set of content-bearing files (see ``_is_content_file``) whose
    text is fetched and written verbatim.

    Args:
        metadata: Repo descriptor; ``full_name`` (``owner/repo``) and
            ``default_branch`` are read.
        client: API client; ``get_repo_tree`` and ``get_file_content`` are
            called. The tree is read as a mapping with the keys ``available``,
            ``truncated``, ``dirs`` and ``files``.
        dest: Directory to populate; created if missing.
        max_files: Maximum number of tree file entries to consider. Values
            below zero are treated as zero.
        max_content_files: Maximum number of Contents API fetches. The
            shallowest qualifying paths are chosen first so that root-level
            READMEs and manifests are not starved by nested ones, and every
            attempt counts against the limit, successful or not.

    Returns:
        ``dest``. If ``full_name`` is malformed or the tree is reported as not
        available, ``dest`` is returned empty so downstream analyzers score a
        near-empty repo rather than crashing.

    Paths that cannot be represented on the local filesystem (``OSError`` from
    ``mkdir``/``write_text``) are skipped and summarised in one warning instead
    of discarding the whole skeleton. Exceptions raised by ``client`` calls are
    never caught here and propagate to the caller.
    """
    dest = Path(dest)
    dest.mkdir(parents=True, exist_ok=True)

    owner, _, repo = metadata.full_name.partition("/")
    if not owner or not repo:
        logger.warning(
            "Cannot materialize %r: full_name is not 'owner/repo'",
            metadata.full_name,
        )
        return dest

    tree = client.get_repo_tree(owner, repo, metadata.default_branch)
    if not tree.get("available"):
        return dest
    if tree.get("truncated"):
        logger.warning(
            "Tree truncated for %s — skeleton is incomplete", metadata.full_name
        )

    skipped = 0

    for rel in tree.get("dirs") or []:
        target = _safe_target(dest, rel)
        if target is None:
            continue
        try:
            target.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            skipped += 1
            logger.debug(
                "Skipping dir %s in %s: %s", rel, metadata.full_name, exc
            )

    # Resolve every candidate once, dropping entries that would escape ``dest``.
    # Clamping keeps a negative limit from slicing from the end of the list.
    entries: list[tuple[str, Path]] = []
    for rel in (tree.get("files") or [])[: max(max_files, 0)]:
        target = _safe_target(dest, rel)
        if target is not None:
            entries.append((rel, target))

    fetch_paths = _select_content_paths(
        [rel for rel, _ in entries], max_content_files
    )

    for rel, target in entries:
        try:
            target.parent.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            skipped += 1
            logger.debug(
                "Skipping file %s in %s: %s", rel, metadata.full_name, exc
            )
            continue

        text = ""
        if rel in fetch_paths:
            # Deliberately outside any ``except OSError``: HTTP client errors
            # are commonly OSError subclasses and must keep propagating.
            fetched = client.get_file_content(
                owner, repo, rel, ref=metadata.default_branch
            )
            # ``None`` is treated as "no content"; the file is still touched.
            if fetched is not None:
                text = fetched

        try:
            target.write_text(text, encoding="utf-8")
        except OSError as exc:
            skipped += 1
            logger.debug(
                "Skipping file %s in %s: %s", rel, metadata.full_name, exc
            )

    if skipped:
        logger.warning(
            "Skipped %d path(s) that could not be written while materializing %s",
            skipped,
            metadata.full_name,
        )

    return dest
| 138 | + |
| 139 | + |
@contextmanager
def materialize_api_workspace(
    repos: list[RepoMetadata],
    client: "GitHubClient",
    *,
    on_progress: Callable[[int, int, str], None] | None = None,
    on_error: Callable[[str, str], None] | None = None,
    max_files: int = DEFAULT_MAX_FILES,
    max_content_files: int = DEFAULT_MAX_CONTENT_FILES,
) -> Generator[dict[str, Path], None, None]:
    """Materialize API skeletons for many repos into a session-unique temp dir.

    Drop-in replacement for ``cloner.clone_workspace``: yields a dict mapping
    repo name → skeleton path. The temp dir, and every skeleton in it, is
    removed when the ``with`` block exits.

    Args:
        repos: Repos to materialize, processed sequentially in order.
        client: API client forwarded to ``materialize_api_checkout``.
        on_progress: Optional callback invoked as ``(index, total, name)``
            before each repo, with ``index`` starting at 1.
        on_error: Optional callback invoked as ``(name, message)`` for each
            repo that could not be materialized.
        max_files: Forwarded to ``materialize_api_checkout``.
        max_content_files: Forwarded to ``materialize_api_checkout``.

    A repo that fails to materialize, or whose name would not resolve to a
    location inside the temp dir, is logged, reported through ``on_error`` and
    left out of the mapping, so one bad repo never aborts a portfolio scan.
    """
    with tempfile.TemporaryDirectory(prefix="audit-api-") as tmpdir:
        root = Path(tmpdir)
        workspace: dict[str, Path] = {}
        total = len(repos)
        for index, repo in enumerate(repos, 1):
            if on_progress:
                on_progress(index, total, repo.name)
            try:
                # Repo names are remote-derived; a name such as ``..`` would
                # otherwise make the checkout write outside the session dir.
                # Raising here reuses the single failure path below.
                if _safe_target(root, repo.name) is None:
                    raise ValueError(f"unsafe repo name {repo.name!r}")
                dest = materialize_api_checkout(
                    repo,
                    client,
                    root / repo.name,
                    max_files=max_files,
                    max_content_files=max_content_files,
                )
            except Exception as exc:  # noqa: BLE001 — one bad repo must not abort the scan
                logger.warning("API checkout failed for %s: %s", repo.name, exc)
                if on_error:
                    on_error(repo.name, str(exc))
            else:
                workspace[repo.name] = dest
        yield workspace
0 commit comments