#!/usr/bin/env python3
"""
sparse-bootstrap.py

Given a seed module directory, walks its parent-chain and BOM-import graph,
adds the discovered dependency directories to the sparse checkout, and
installs them into ~/.m2 in dependency order.

Prerequisites:
    git clone --sparse git@github.com:googleapis/google-cloud-java.git
    cd google-cloud-java
    git sparse-checkout set <seed-module>

Usage (from repo root):
    python java-bigtable/scripts/sparse-bootstrap.py <seed-dir> [--dry-run]

Example:
    python java-bigtable/scripts/sparse-bootstrap.py java-bigtable
"""
| 20 | + |
| 21 | +import io |
| 22 | +import os |
| 23 | +import subprocess |
| 24 | +import sys |
| 25 | +import xml.etree.ElementTree as ET |
| 26 | +from collections import defaultdict, deque |
| 27 | +from dataclasses import dataclass |
| 28 | +from pathlib import Path |
| 29 | +from typing import Optional |
| 30 | + |
# Maven POM 4.0.0 namespace in ElementTree's '{uri}' prefix form; it is
# prepended to every tag name used in find()/findall() lookups.
NS = '{http://maven.apache.org/POM/4.0.0}'
| 32 | + |
| 33 | + |
@dataclass(frozen=True)
class Coord:
    """A groupId/artifactId pair; frozen so it can be used as a dict key.

    Either field is None when the corresponding value was not available.
    """

    group_id: Optional[str]
    artifact_id: Optional[str]
| 38 | + |
| 39 | + |
@dataclass(frozen=True)
class GitBlob:
    """A git blob: its object id together with the path it was listed under."""

    sha: str
    path: Path
| 44 | + |
| 45 | + |
@dataclass(frozen=True)
class ParentRef:
    """The <parent> declaration of a pom: parent coordinates plus relativePath."""

    coord: Coord
    # '' means <relativePath/> was explicit → fetch from ~/.m2
    relative_path: str
| 50 | + |
| 51 | + |
@dataclass
class InstallCommand:
    """An argv list paired with the working directory it should run in."""

    cwd: str
    cmd: list[str]
| 56 | + |
| 57 | + |
| 58 | +# ── git helpers ──────────────────────────────────────────────────────────────── |
| 59 | + |
def ls_tree_poms() -> list[GitBlob]:
    """Return a GitBlob for every file named exactly ``pom.xml`` in HEAD.

    Runs ``git ls-tree -r -z HEAD`` in the current working directory.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    # With -z each record is NUL-terminated and paths are emitted verbatim
    # (no C-style quoting of unusual characters). Each record is:
    #   <mode> <type> <sha>\t<path>
    # e.g.: 100644 blob abc123def\tjava-bigtable/pom.xml
    out = subprocess.run(
        ['git', 'ls-tree', '-r', '-z', 'HEAD'],
        capture_output=True, text=True, check=True,
    ).stdout
    results: list[GitBlob] = []
    for record in out.split('\0'):
        if not record:
            continue  # trailing empty piece after the final NUL
        meta, path = record.split('\t', 1)
        # Compare the basename so files such as 'my-pom.xml' are not picked up.
        if Path(path).name != 'pom.xml':
            continue
        _mode, _type, sha = meta.split()
        results.append(GitBlob(sha=sha, path=Path(path)))
    return results
| 76 | + |
| 77 | + |
def batch_cat(pom_refs: list[GitBlob]) -> dict[Path, str]:
    """Fetch the text of many git blobs with a single ``git cat-file`` call.

    Returns a mapping from each ref's path to its decoded content. Blobs whose
    text does not start with '<' (after leading whitespace) are reported and
    left out of the result.

    Raises:
        RuntimeError: if a response header is not '<sha> blob <size>'.
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    if not pom_refs:
        return {}

    # Request: one sha per line on stdin.
    request = '\n'.join(ref.sha for ref in pom_refs).encode()
    proc = subprocess.run(
        ['git', 'cat-file', '--batch', '--buffer'],
        input=request,
        capture_output=True,
        check=True,
    )

    # Response, per sha and in request order, is either
    #   "<sha> blob <size>\n<content bytes>\n"   or   "<sha> missing\n"
    stream = io.BytesIO(proc.stdout)
    texts: dict[Path, str] = {}
    for ref in pom_refs:
        prefix = f'{ref.sha} blob '
        header = stream.readline().decode()
        if not header.startswith(prefix):
            raise RuntimeError(f'unexpected git cat-file response for {ref.path}: {header.strip()!r}')
        payload = stream.read(int(header[len(prefix):]))
        stream.read(1)  # consume the newline that follows every blob
        text = payload.decode('utf-8', errors='replace')
        if not text.lstrip().startswith('<'):
            print(f' skipping non-XML pom: {ref.path}')
            continue
        texts[ref.path] = text
    return texts
| 107 | + |
| 108 | + |
| 109 | +# ── pom parsing ──────────────────────────────────────────────────────────────── |
| 110 | + |
def t(name: str) -> str:
    """Return *name* qualified with the POM namespace prefix ``NS``."""
    return NS + name
| 113 | + |
| 114 | + |
def child_text(elem: ET.Element, path: str) -> Optional[str]:
    """Return the stripped text of the descendant at a '/'-separated tag path.

    Each segment of *path* is namespace-qualified before the lookup. Returns
    None when the element is missing or has no text.
    """
    qualified = '/'.join(t(tag) for tag in path.split('/'))
    node = elem.find(qualified)
    if node is None or not node.text:
        return None
    return node.text.strip()
| 118 | + |
| 119 | + |
def get_coordinates(root: ET.Element) -> Coord:
    """Return the Coord of a parsed pom.

    The groupId is taken from the project itself and, when that is absent or
    empty, from its <parent> element.
    """
    group = child_text(root, 'groupId')
    if not group:
        group = child_text(root, 'parent/groupId')
    artifact = child_text(root, 'artifactId')
    return Coord(group_id=group, artifact_id=artifact)
| 124 | + |
| 125 | + |
def get_parent(root: ET.Element) -> Optional[ParentRef]:
    """Return the pom's <parent> declaration, or None when it has none.

    ``relative_path`` is '../pom.xml' when <relativePath> is absent (the Maven
    default) and '' when the element is present but empty.
    """
    parent_elem = root.find(t('parent'))
    if parent_elem is None:
        return None

    rel_elem = parent_elem.find(t('relativePath'))
    if rel_elem is not None:
        relative_path = (rel_elem.text or '').strip()  # empty string → remote
    else:
        relative_path = '../pom.xml'  # Maven default when element is absent

    parent_coord = Coord(
        group_id=child_text(parent_elem, 'groupId'),
        artifact_id=child_text(parent_elem, 'artifactId'),
    )
    return ParentRef(coord=parent_coord, relative_path=relative_path)
| 142 | + |
| 143 | + |
def get_bom_imports(root: ET.Element) -> list[Coord]:
    """Return a Coord for each managed dependency declared with scope 'import'.

    Searches every dependencyManagement/dependencies/dependency element below
    *root*; entries lacking a groupId or artifactId are ignored.
    """
    managed_dep = '/'.join(
        t(tag) for tag in ('dependencyManagement', 'dependencies', 'dependency')
    )
    imports: list[Coord] = []
    for dep in root.findall(f'.//{managed_dep}'):
        if child_text(dep, 'scope') != 'import':
            continue
        group = child_text(dep, 'groupId')
        artifact = child_text(dep, 'artifactId')
        if not (group and artifact):
            continue
        imports.append(Coord(group_id=group, artifact_id=artifact))
    return imports
| 156 | + |
| 157 | + |
| 158 | +# ── dependency discovery ─────────────────────────────────────────────────────── |
| 159 | + |
def find_needed_modules(
    seed_dir: Path,
    pom_contents: dict[Path, str],
    coord_to_pom: dict[Coord, Path],
) -> tuple[set[Path], dict[Path, set[Path]]]:
    """BFS from all poms under seed_dir following parent chain and BOM imports.

    Args:
        seed_dir: directory whose poms are the starting points.
        pom_contents: {pom path: pom XML text} for every known pom.
        coord_to_pom: {coordinate: pom path} index over the same poms.

    Returns:
        needed_poms: set of pom paths outside seed_dir that must be installed
        dep_edges: {pom_path: {pom_paths it depends on}} for topological sort
    """
    needed: set[Path] = set()
    dep_edges: dict[Path, set[Path]] = defaultdict(set)
    visited: set[Path] = set()
    queue: deque[Path] = deque()

    def enqueue(pom_path: Path, required_by: Optional[Path] = None) -> None:
        """Record the edge and schedule pom_path unless it was already seen."""
        if required_by is not None:
            dep_edges[required_by].add(pom_path)
        if pom_path in visited:
            return
        visited.add(pom_path)
        needed.add(pom_path)
        queue.append(pom_path)

    def resolve_parent(pom_path: Path, parent: ParentRef) -> Optional[Path]:
        """Locate the parent pom in the repo, or return None if it is not here."""
        if parent.relative_path:
            # normpath collapses '..' without making the path absolute
            candidate = Path(os.path.normpath(pom_path.parent / parent.relative_path))
            # <relativePath> may point to a directory; Maven appends pom.xml in that case
            if candidate.name != 'pom.xml':
                candidate = candidate / 'pom.xml'
            if candidate in pom_contents:
                # Maven only uses the pom at relativePath when its coordinates
                # match the declared parent; otherwise it resolves the parent
                # by coordinates. Without this check the implicit '../pom.xml'
                # default can select an unrelated pom (e.g. an aggregator).
                found = get_coordinates(ET.fromstring(pom_contents[candidate]))
                if found == parent.coord:
                    return candidate
        # fall back to coordinate lookup if relativePath is empty, not found
        # locally, or points at a pom with different coordinates
        return coord_to_pom.get(parent.coord)

    # Seed: every pom under seed_dir — pre-visited so they won't enter `needed`
    for path in pom_contents:
        if path.is_relative_to(seed_dir):
            visited.add(path)
            queue.append(path)

    while queue:
        pom_path = queue.popleft()
        root = ET.fromstring(pom_contents[pom_path])

        # Follow <parent> chain
        parent = get_parent(root)
        if parent:
            resolved = resolve_parent(pom_path, parent)
            if resolved is not None:
                enqueue(resolved, required_by=pom_path)

        # Follow BOM imports
        for coord in get_bom_imports(root):
            if coord in coord_to_pom:
                enqueue(coord_to_pom[coord], required_by=pom_path)

    return needed, dep_edges
| 219 | + |
| 220 | + |
| 221 | +# ── topological sort ─────────────────────────────────────────────────────────── |
| 222 | + |
def topo_sort(
    nodes: set[Path],
    dep_edges: dict[Path, set[Path]],
) -> list[Path]:
    """Return the members of ``nodes`` ordered so each follows its dependencies.

    Args:
        nodes: the paths to order.
        dep_edges: {path: {paths it depends on}}; may mention paths that are
            not in ``nodes``.

    Returns:
        A list containing exactly the elements of ``nodes``. Paths outside
        ``nodes`` are still traversed, so an ordering constraint that passes
        through them is honoured, but they are never emitted. Ties are broken
        by sorted path order, so the result is deterministic.
    """
    visited: set[Path] = set()
    order: list[Path] = []

    def visit(n: Path) -> None:
        if n in visited:
            return
        # Marking before recursing also stops a dependency cycle from looping.
        visited.add(n)
        for dep in sorted(dep_edges.get(n, ())):
            visit(dep)
        # dep_edges can reference paths the caller did not ask for; keep them
        # out of the result.
        if n in nodes:
            order.append(n)

    for n in sorted(nodes):
        visit(n)
    return order
| 242 | + |
| 243 | + |
| 244 | +# ── install command generation ───────────────────────────────────────────────── |
| 245 | + |
def make_install_commands(sorted_poms: list[Path]) -> list[InstallCommand]:
    """One mvn install command per top-level project, in dependency order.

    Args:
        sorted_poms: pom paths relative to the repo root, already ordered so
            that dependencies come first.

    Returns:
        One InstallCommand per top-level directory, in order of first
        appearance in ``sorted_poms``. A pom directly at the repo root is
        grouped under the project '.'.
    """
    # dicts keep insertion order, so this also records project order.
    by_project: dict[str, list[str]] = {}

    for pom_path in sorted_poms:
        pom_dir = pom_path.parent
        if pom_dir.parts:
            project = pom_dir.parts[0]
            rel = str(pom_dir.relative_to(project))  # '.' for the project's own pom
        else:
            # pom.xml at the repo root: there is no top-level directory, and
            # treating the file name as one would make relative_to() raise.
            project = '.'
            rel = '.'
        modules = by_project.setdefault(project, [])
        if rel not in modules:
            modules.append(rel)

    commands: list[InstallCommand] = []
    for project, modules in by_project.items():
        sub_modules = [m for m in modules if m != '.']
        cmd = ['mvn', 'install', '-T', '1C', '-DskipTests', '-P', 'quick-build']
        if sub_modules:
            for m in sub_modules:
                cmd += ['-pl', m]
            cmd.append('-am')
        else:
            cmd.append('-N')  # root pom only — skip recursive submodule build
        commands.append(InstallCommand(cwd=project, cmd=cmd))
    return commands
| 275 | + |
| 276 | + |
| 277 | +# ── main ─────────────────────────────────────────────────────────────────────── |
| 278 | + |
def main() -> None:
    """Command-line entry point.

    Reads the seed directory and the optional ``--dry-run`` flag from
    ``sys.argv``, indexes every pom.xml in HEAD, computes the poms the seed
    depends on, prints the plan, and — unless ``--dry-run`` was given — adds
    the needed top-level directories to the sparse checkout and runs the
    install commands in order.

    Exits with an error message on bad arguments, a missing seed directory or
    pom, a seed with no poms in HEAD, or a failed install command.
    """
    usage = f'usage: {sys.argv[0]} <seed-module-dir> [--dry-run]'
    flags = [a for a in sys.argv[1:] if a.startswith('--')]
    raw_args = [a for a in sys.argv[1:] if not a.startswith('--')]

    # A mistyped flag (e.g. --dryrun) must not fall through to a real run.
    unknown = sorted(set(flags) - {'--dry-run'})
    if unknown:
        sys.exit(f'error: unknown option(s): {" ".join(unknown)}\n{usage}')
    dry_run = '--dry-run' in flags

    if not raw_args:
        sys.exit(usage)

    seed_dir = Path(raw_args[0])

    if not seed_dir.is_dir():
        sys.exit(f'error: seed directory not found: {seed_dir!r}')
    if not (seed_dir / 'pom.xml').is_file():
        sys.exit(f'error: no pom.xml in {seed_dir!r}')

    print(f'Seed module: {seed_dir}')

    print('Listing pom.xml files in repo...')
    pom_refs = ls_tree_poms()
    print(f' {len(pom_refs)} pom files')

    print('Reading all pom files from git object store...')
    pom_contents = batch_cat(pom_refs)
    print(f' {len(pom_contents)} pom files read')

    print('Building coordinate → pom path index...')
    coord_to_pom: dict[Coord, Path] = {}
    # Iterate over a snapshot: unparsable poms are removed from pom_contents
    # so that later stages, which parse the same text again, never see them.
    for path, content in list(pom_contents.items()):
        try:
            root = ET.fromstring(content)
        except ET.ParseError as exc:
            print(f' skipping unparsable pom: {path} ({exc})')
            del pom_contents[path]
            continue
        coord = get_coordinates(root)
        if coord.group_id and coord.artifact_id:
            coord_to_pom[coord] = path
    print(f' {len(coord_to_pom)} artifacts indexed')

    # Paths from git are relative to the current directory; a seed given in
    # any other form matches nothing and would silently produce an empty plan.
    if not any(p.is_relative_to(seed_dir) for p in pom_contents):
        sys.exit(
            f'error: no pom.xml under {seed_dir} found in HEAD '
            '(pass the seed directory relative to the repo root)'
        )

    print(f'Walking dependency graph from {seed_dir}...')
    needed_poms, dep_edges = find_needed_modules(seed_dir, pom_contents, coord_to_pom)
    sorted_poms = topo_sort(needed_poms, dep_edges)

    # root-level files are always present in cone mode — only add subdirectories
    top_dirs = sorted({p.parts[0] for p in needed_poms if len(p.parts) > 1})
    print('\nTop-level directories needed:')
    for d in top_dirs:
        print(f' {d}')

    print(f'\nPom files to install ({len(sorted_poms)}):')
    for p in sorted_poms:
        print(f' {p}')

    install_cmds = make_install_commands(sorted_poms)
    print('\nInstall commands (in order):')
    for ic in install_cmds:
        print(f' (cd {ic.cwd} && {" ".join(ic.cmd)})')

    if dry_run:
        print('\n--dry-run: stopping before making changes')
        return

    if top_dirs:
        print('\nUpdating sparse checkout...')
        subprocess.run(['git', 'sparse-checkout', 'add'] + top_dirs, check=True)

    print('\nInstalling...')
    for ic in install_cmds:
        print(f'\n>>> cd {ic.cwd} && {" ".join(ic.cmd)}')
        result = subprocess.run(ic.cmd, cwd=ic.cwd)
        if result.returncode != 0:
            sys.exit(f'error: install failed in {ic.cwd}')

    print(f'\nBootstrap complete. You can now develop in {seed_dir}.')


if __name__ == '__main__':
    main()
0 commit comments