Skip to content

Commit bc90e19

Browse files
chore: add a script to do a sparse checkout of the mono repo to make it manageable (#13408)
This repo is 5GB, which breaks a lot of tooling. To make it manageable, this script will do a sparse clone of it, then walk the dependency tree and do a cone checkout of just the relevant modules. The script should work with any module, but currently lives under bigtable until there is interest in moving it to the monorepo root.

Example usage:

```sh
# download the git repo, but don't populate the worktree
git clone --sparse git@github.com:googleapis/google-cloud-java.git

# now populate the worktree with your team's module
cd google-cloud-java
git sparse-checkout set java-bigtable

# then use the script to add all of the relevant modules to the worktree and install them in the local maven repo
java-bigtable/scripts/sparse-bootstrap.py java-bigtable

# now you can work in your module dir
cd java-bigtable
mvn install

# after a release, git pull & re-run the script to install updated dependencies into your local maven repo
git pull
java-bigtable/scripts/sparse-bootstrap.py java-bigtable
```
1 parent dde97d4 commit bc90e19

1 file changed

Lines changed: 350 additions & 0 deletions

File tree

Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
#!/usr/bin/env python3
2+
"""
3+
sparse-bootstrap.py
4+
5+
Given a seed module directory, walks its parent-chain and BOM-import graph,
6+
adds the discovered dependency directories to the sparse checkout, and
7+
installs them into ~/.m2 in dependency order.
8+
9+
Prerequisites:
10+
git clone --sparse git@github.com:googleapis/google-cloud-java.git
11+
cd google-cloud-java
12+
git sparse-checkout set <seed-module>
13+
14+
Usage (from repo root):
15+
python java-bigtable/scripts/sparse-bootstrap.py <seed-dir> [--dry-run]
16+
17+
Example:
18+
python java-bigtable/scripts/sparse-bootstrap.py java-bigtable
19+
"""
20+
21+
import io
22+
import os
23+
import subprocess
24+
import sys
25+
import xml.etree.ElementTree as ET
26+
from collections import defaultdict, deque
27+
from dataclasses import dataclass
28+
from pathlib import Path
29+
from typing import Optional
30+
31+
# XML namespace of Maven POM documents, written in the '{uri}' form that
# ElementTree expects as a prefix on namespace-qualified tag names.
NS = '{http://maven.apache.org/POM/4.0.0}'
32+
33+
34+
@dataclass(frozen=True)
class Coord:
    """A Maven artifact identity reduced to groupId + artifactId.

    The version is deliberately not part of the key.  Instances are frozen
    (and therefore hashable) so they can be used as dictionary keys.
    """

    # Either part is None when the corresponding POM element is absent/empty.
    group_id: Optional[str]
    artifact_id: Optional[str]
38+
39+
40+
@dataclass(frozen=True)
class GitBlob:
    """A file in the git tree: its object id plus its path in the repo."""

    # Object name (hash) of the blob, as reported by `git ls-tree`.
    sha: str
    # Path of the file as listed by git.
    path: Path
44+
45+
46+
@dataclass(frozen=True)
class ParentRef:
    """The <parent> declaration of a POM: who the parent is and where to look."""

    # groupId/artifactId of the parent POM.
    coord: Coord
    # Value of <relativePath>.
    # '' means <relativePath/> was explicit → fetch from ~/.m2
    relative_path: str
50+
51+
52+
@dataclass
class InstallCommand:
    """One command invocation: the argv to run and the directory to run it in."""

    # Working directory for the command.
    cwd: str
    # Full argument vector, program name first.
    cmd: list[str]
56+
57+
58+
# ── git helpers ────────────────────────────────────────────────────────────────
59+
60+
def ls_tree_poms() -> list[GitBlob]:
    """Return a GitBlob for every file named exactly ``pom.xml`` in HEAD.

    Runs ``git ls-tree -r -z HEAD`` in the current working directory and
    parses its output.

    Returns:
        One GitBlob (object id + path) per pom.xml, in git's listing order.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    # -z makes git emit NUL-terminated records with the path verbatim.
    # Without it, paths containing unusual characters are C-quoted
    # ("dir/\303\251/pom.xml") and would silently fail the name check.
    # Each record is: <mode> <type> <sha>\t<path>
    # e.g.:           100644 blob abc123def\tjava-bigtable/pom.xml
    raw = subprocess.run(
        ['git', 'ls-tree', '-r', '-z', 'HEAD'],
        capture_output=True, check=True,
    ).stdout
    # fsdecode round-trips arbitrary path bytes instead of failing on them.
    listing = os.fsdecode(raw)

    results: list[GitBlob] = []
    for record in listing.split('\0'):
        if not record:
            continue  # empty piece after the final NUL terminator
        meta, path = record.split('\t', 1)
        # Compare the basename: a suffix test would also accept files such
        # as 'my-pom.xml' that are not Maven project descriptors.
        if path.rpartition('/')[2] != 'pom.xml':
            continue
        _, _, sha = meta.split()
        results.append(GitBlob(sha=sha, path=Path(path)))
    return results
76+
77+
78+
def batch_cat(pom_refs: list[GitBlob]) -> dict[Path, str]:
    """Read many git blobs in one subprocess call.

    Args:
        pom_refs: blobs to read; responses are matched to requests by order.

    Returns:
        {path: text} for every blob whose content looks like XML (first
        non-whitespace character is '<').  Other blobs are reported on
        stdout and left out.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
        RuntimeError: if git's reply for a blob is not the expected
            "<sha> blob <size>" header, or the output ends prematurely.
    """
    if not pom_refs:
        return {}
    # stdin:  one sha per line
    # stdout: for each sha, either:
    #   "<sha> blob <size>\n<content bytes>\n"
    #   "<sha> missing\n"
    request = ''.join(f'{ref.sha}\n' for ref in pom_refs).encode()
    raw = subprocess.run(
        ['git', 'cat-file', '--batch', '--buffer'],
        input=request,
        capture_output=True,
        check=True,
    ).stdout

    contents: dict[Path, str] = {}
    stream = io.BytesIO(raw)
    for ref in pom_refs:
        header = stream.readline().decode()
        expected = f'{ref.sha} blob '
        if not header.startswith(expected):
            raise RuntimeError(f'unexpected git cat-file response for {ref.path}: {header.strip()!r}')
        size = int(header[len(expected):])
        payload = stream.read(size)
        if len(payload) != size:
            raise RuntimeError(
                f'truncated git cat-file output for {ref.path}: '
                f'expected {size} bytes, got {len(payload)}'
            )
        stream.read(1)  # trailing newline after each blob
        # 'utf-8-sig' drops a leading byte-order mark; with plain 'utf-8'
        # the BOM survives as U+FEFF, which lstrip() does not remove, so a
        # valid pom would be misreported as non-XML below.
        text = payload.decode('utf-8-sig', errors='replace')
        if text.lstrip().startswith('<'):
            contents[ref.path] = text
        else:
            print(f'  skipping non-XML pom: {ref.path}')
    return contents
107+
108+
109+
# ── pom parsing ────────────────────────────────────────────────────────────────
110+
111+
def t(name: str) -> str:
    """Return *name* prefixed with the POM namespace held in ``NS``."""
    return '{}{}'.format(NS, name)
113+
114+
115+
def child_text(elem: ET.Element, path: str) -> Optional[str]:
    """Return the stripped text of the descendant at *path*, or None.

    *path* is a '/'-separated list of un-namespaced tag names; each segment
    is namespace-qualified with ``t`` before the lookup.  None is returned
    when the element does not exist or has no text.
    """
    qualified_path = '/'.join(map(t, path.split('/')))
    node = elem.find(qualified_path)
    if node is None or not node.text:
        return None
    return node.text.strip()
118+
119+
120+
def get_coordinates(root: ET.Element) -> Coord:
    """Return the Coord declared by a POM's root element.

    The groupId falls back to the one declared under <parent> when the
    project itself does not declare one.
    """
    group = child_text(root, 'groupId')
    if not group:
        group = child_text(root, 'parent/groupId')
    artifact = child_text(root, 'artifactId')
    return Coord(group_id=group, artifact_id=artifact)
124+
125+
126+
def get_parent(root: ET.Element) -> Optional[ParentRef]:
    """Return the POM's <parent> declaration, or None if it has none."""
    parent_elem = root.find(t('parent'))
    if parent_elem is None:
        return None

    parent_coord = Coord(
        group_id=child_text(parent_elem, 'groupId'),
        artifact_id=child_text(parent_elem, 'artifactId'),
    )

    rel_elem = parent_elem.find(t('relativePath'))
    if rel_elem is not None:
        # An empty element yields '' — the marker for "resolve remotely".
        relative_path = (rel_elem.text or '').strip()
    else:
        # Maven default when element is absent
        relative_path = '../pom.xml'

    return ParentRef(coord=parent_coord, relative_path=relative_path)
142+
143+
144+
def get_bom_imports(root: ET.Element) -> list[Coord]:
    """Return a Coord for every managed dependency with scope ``import``.

    Looks at every dependencyManagement/dependencies/dependency element at
    any depth below *root*; entries lacking a groupId or artifactId are
    ignored.
    """
    dependency_xpath = './/' + '/'.join(
        t(tag) for tag in ('dependencyManagement', 'dependencies', 'dependency')
    )
    imports: list[Coord] = []
    for dependency in root.findall(dependency_xpath):
        if child_text(dependency, 'scope') != 'import':
            continue
        group = child_text(dependency, 'groupId')
        artifact = child_text(dependency, 'artifactId')
        if not (group and artifact):
            continue
        imports.append(Coord(group_id=group, artifact_id=artifact))
    return imports
156+
157+
158+
# ── dependency discovery ───────────────────────────────────────────────────────
159+
160+
def find_needed_modules(
    seed_dir: Path,
    pom_contents: dict[Path, str],
    coord_to_pom: dict[Coord, Path],
) -> tuple[set[Path], dict[Path, set[Path]]]:
    """Breadth-first walk of parent links and BOM imports starting at seed_dir.

    Args:
        seed_dir: directory whose poms are the starting points.
        pom_contents: {pom path: XML text} for every known pom.
        coord_to_pom: {coordinate: pom path} index over the same poms.

    Returns:
        needed_poms: pom paths reached by the walk that are not under
            seed_dir (the seed poms start out visited, so they never land
            in this set).
        dep_edges:   {pom_path: {pom_paths it depends on}}, for the
            topological sort.
    """
    needed: set[Path] = set()
    dep_edges: dict[Path, set[Path]] = defaultdict(set)

    # Every pom under seed_dir is a starting point and counts as visited.
    queue: deque[Path] = deque(
        path for path in pom_contents if path.is_relative_to(seed_dir)
    )
    visited: set[Path] = set(queue)

    def locate_parent(child: Path, parent: ParentRef) -> Optional[Path]:
        """Find the in-repo pom for *parent*, by path first, then by coordinate."""
        if parent.relative_path:
            # normpath collapses '..' without making the path absolute
            candidate = Path(os.path.normpath(child.parent / parent.relative_path))
            # <relativePath> may point to a directory; Maven appends pom.xml in that case
            if candidate.name != 'pom.xml':
                candidate = candidate / 'pom.xml'
            if candidate in pom_contents:
                return candidate
        # relativePath was empty or did not lead to a known pom
        return coord_to_pom.get(parent.coord)

    while queue:
        current = queue.popleft()
        root = ET.fromstring(pom_contents[current])

        # Dependencies of `current`, parent first, then BOM imports.
        targets: list[Path] = []
        parent = get_parent(root)
        if parent is not None:
            parent_pom = locate_parent(current, parent)
            if parent_pom is not None:
                targets.append(parent_pom)
        targets.extend(
            coord_to_pom[coord]
            for coord in get_bom_imports(root)
            if coord in coord_to_pom
        )

        for target in targets:
            # The edge is recorded even when the target was already seen.
            dep_edges[current].add(target)
            if target in visited:
                continue
            visited.add(target)
            needed.add(target)
            queue.append(target)

    return needed, dep_edges
219+
220+
221+
# ── topological sort ───────────────────────────────────────────────────────────
222+
223+
def topo_sort(
    nodes: set[Path],
    dep_edges: dict[Path, set[Path]],
) -> list[Path]:
    """Order nodes so that each one comes after everything it depends on.

    Depth-first post-order over *dep_edges*, starting from each member of
    *nodes* in sorted order and following each node's dependencies in sorted
    order.  Dependencies reachable through *dep_edges* are emitted too, even
    if they are not members of *nodes*.  A node is emitted at most once, so
    cycles terminate.
    """
    seen: set[Path] = set()
    ordered: list[Path] = []

    for start in sorted(nodes):
        if start in seen:
            continue
        seen.add(start)
        # Each frame pairs a node with an iterator over its remaining deps.
        stack = [(start, iter(sorted(dep_edges.get(start, []))))]
        while stack:
            current, remaining = stack[-1]
            for dep in remaining:
                if dep in seen:
                    continue
                seen.add(dep)
                stack.append((dep, iter(sorted(dep_edges.get(dep, [])))))
                break
            else:
                # All dependencies handled: the node itself can be emitted.
                stack.pop()
                ordered.append(current)
    return ordered
242+
243+
244+
# ── install command generation ─────────────────────────────────────────────────
245+
246+
def make_install_commands(sorted_poms: list[Path]) -> list[InstallCommand]:
    """Build one ``mvn install`` command per top-level project.

    Args:
        sorted_poms: repo-relative pom.xml paths in dependency order.

    Returns:
        One InstallCommand per top-level directory, ordered by the first
        appearance of that directory in *sorted_poms*.  A project for which
        only its own root pom is listed gets ``-N``; otherwise each listed
        sub-module is passed with ``-pl`` and ``-am`` is appended.  A pom at
        the repository root itself is treated as project ``'.'``.
    """
    # project -> ordered, de-duplicated module dirs (dict used as ordered set).
    # Dict insertion order also gives the order in which projects first appear.
    modules_by_project: dict[str, dict[str, None]] = {}

    for pom_path in sorted_poms:
        pom_dir = pom_path.parent
        if pom_dir.parts:
            project = pom_dir.parts[0]
            # str(Path('.')) == '.', so a project's own pom maps to '.'
            rel = str(pom_dir.relative_to(project))
        else:
            # Repo-root pom.xml: there is no enclosing project directory.
            # Using parts[0] here would name the file itself and make
            # relative_to() raise ValueError.
            project = '.'
            rel = '.'
        modules_by_project.setdefault(project, {})[rel] = None

    commands: list[InstallCommand] = []
    for project, modules in modules_by_project.items():
        sub_modules = [m for m in modules if m != '.']
        cmd = ['mvn', 'install', '-T', '1C', '-DskipTests', '-P', 'quick-build']
        if sub_modules:
            for module in sub_modules:
                cmd += ['-pl', module]
            cmd.append('-am')
        else:
            cmd.append('-N')  # root pom only — skip recursive submodule build
        commands.append(InstallCommand(cwd=project, cmd=cmd))
    return commands
275+
276+
277+
# ── main ───────────────────────────────────────────────────────────────────────
278+
279+
def main() -> None:
    """Command-line entry point.

    Usage: ``<script> <seed-module-dir> [--dry-run]``.  The first argument
    not starting with ``--`` is the seed directory; it is compared against
    the paths git reports for HEAD, so run from the directory those paths
    are relative to and pass a matching relative seed path.

    Steps: list and read every pom.xml from the git object store, index
    them by coordinate, walk the dependency graph from the seed, print the
    plan, and — unless ``--dry-run`` is given — add the needed top-level
    directories to the sparse checkout and run the install commands in
    order.

    Exits via ``sys.exit`` with a message on bad arguments, when no pom
    under the seed directory is found in HEAD, or when an install command
    fails.
    """
    raw_args = [a for a in sys.argv[1:] if not a.startswith('--')]
    dry_run = '--dry-run' in sys.argv

    if not raw_args:
        sys.exit(f'usage: {sys.argv[0]} <seed-module-dir> [--dry-run]')

    seed_dir = Path(raw_args[0])

    if not seed_dir.is_dir():
        sys.exit(f'error: seed directory not found: {seed_dir!r}')
    if not (seed_dir / 'pom.xml').is_file():
        sys.exit(f'error: no pom.xml in {seed_dir!r}')

    print(f'Seed module: {seed_dir}')

    print('Listing pom.xml files in repo...')
    pom_refs = ls_tree_poms()
    print(f'  {len(pom_refs)} pom files')

    print('Reading all pom files from git object store...')
    pom_contents = batch_cat(pom_refs)
    print(f'  {len(pom_contents)} pom files read')

    print('Building coordinate → pom path index...')
    coord_to_pom: dict[Coord, Path] = {}
    malformed: list[Path] = []
    for path, content in pom_contents.items():
        try:
            root = ET.fromstring(content)
        except ET.ParseError as exc:
            # One broken pom.xml anywhere in the repo must not abort the run.
            print(f'  skipping malformed pom: {path} ({exc})')
            malformed.append(path)
            continue
        coord = get_coordinates(root)
        if coord.group_id and coord.artifact_id:
            coord_to_pom[coord] = path
    # Drop unparsable poms so the graph walk never tries to parse them.
    for path in malformed:
        del pom_contents[path]
    print(f'  {len(coord_to_pom)} artifacts indexed')

    # Without a seed pom in HEAD the walk has no starting point and the
    # script would report success after doing nothing.
    if not any(path.is_relative_to(seed_dir) for path in pom_contents):
        sys.exit(
            f'error: no readable pom.xml under {seed_dir} found in HEAD; '
            'run from the repo root and pass a repo-relative seed directory'
        )

    print(f'Walking dependency graph from {seed_dir}...')
    needed_poms, dep_edges = find_needed_modules(seed_dir, pom_contents, coord_to_pom)
    sorted_poms = topo_sort(needed_poms, dep_edges)

    # root-level files are always present in cone mode — only add subdirectories
    top_dirs = sorted({p.parts[0] for p in needed_poms if len(p.parts) > 1})
    print('\nTop-level directories needed:')
    for d in top_dirs:
        print(f'  {d}')

    print(f'\nPom files to install ({len(sorted_poms)}):')
    for p in sorted_poms:
        print(f'  {p}')

    install_cmds = make_install_commands(sorted_poms)
    print('\nInstall commands (in order):')
    for ic in install_cmds:
        print(f'  (cd {ic.cwd} && {" ".join(ic.cmd)})')

    if dry_run:
        print('\n--dry-run: stopping before making changes')
        return

    if top_dirs:
        print('\nUpdating sparse checkout...')
        subprocess.run(['git', 'sparse-checkout', 'add'] + top_dirs, check=True)

    print('\nInstalling...')
    for ic in install_cmds:
        print(f'\n>>> cd {ic.cwd} && {" ".join(ic.cmd)}')
        result = subprocess.run(ic.cmd, cwd=ic.cwd)
        if result.returncode != 0:
            sys.exit(f'error: install failed in {ic.cwd}')

    print(f'\nBootstrap complete. You can now develop in {seed_dir}.')


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)