|
4 | 4 | import hashlib |
5 | 5 | import json |
6 | 6 | import os |
| 7 | +import re |
7 | 8 | from datetime import UTC, datetime |
8 | 9 | from urllib.error import HTTPError, URLError |
9 | 10 | from urllib.request import Request, urlopen |
|
13 | 14 |
|
# Base URL of the GitHub REST API; request paths are appended to it.
API = "https://api.github.com"
# Sent as the User-Agent header on every request.
USER_AGENT = "aws-deepdive/0.1 (+https://github.com/0-draft/aws-deepdive)"
# Timeout handed to urlopen for each request, in seconds.
FETCH_TIMEOUT = 30
# Safety cap on the number of bytes read from a single response page (10 MiB).
MAX_RESPONSE_BYTES = 10 * 2**20
# Follow Link rel="next" up to this many pages per repo.
MAX_PAGES = 5
16 | 20 |
|
17 | 21 |
|
18 | 22 | def _id(url: str) -> str: |
19 | 23 | return hashlib.sha256(url.encode("utf-8")).hexdigest()[:16] |
20 | 24 |
|
21 | 25 |
|
def _headers() -> dict[str, str]:
    """Build the HTTP headers sent with every GitHub API request.

    Returns:
        A fresh dict holding the Accept, X-GitHub-Api-Version and User-Agent
        headers, plus an ``Authorization: Bearer ...`` header when the
        ``GITHUB_TOKEN`` environment variable holds a non-blank value.
    """
    h = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": USER_AGENT,
    }
    # Tokens loaded from files or CI secrets often carry a trailing newline.
    # http.client rejects header values containing a bare newline with
    # ValueError, and a blank token would send a useless empty credential,
    # so strip first and only authenticate when something is left.
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token:
        h["Authorization"] = f"Bearer {token}"
    return h
| 36 | + |
| 37 | + |
| 38 | +_NEXT_LINK_RE = re.compile(r'<([^>]+)>;\s*rel="next"') |
| 39 | + |
| 40 | + |
| 41 | +def _next_url(link_header: str | None) -> str | None: |
| 42 | + if not link_header: |
| 43 | + return None |
| 44 | + m = _NEXT_LINK_RE.search(link_header) |
| 45 | + return m.group(1) if m else None |
| 46 | + |
| 47 | + |
def _get_page(url: str) -> tuple[list[dict], str | None]:
    """Fetch one page. Returns (items, next_url). Empty list + None on error.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``(items, next_url)`` tuple. ``items`` is the decoded JSON array,
        or an empty list when the body is not a JSON array. ``next_url`` is
        the ``rel="next"`` target of the ``Link`` header, or ``None``.
        Every failure (HTTP error status, transport error, malformed URL,
        oversized or undecodable body) is logged and yields ``([], None)``;
        nothing is raised to the caller.
    """
    # Imported here so this block does not rely on a new file-level import.
    from http.client import HTTPException

    try:
        # Request() itself raises ValueError on a malformed URL, so it
        # belongs inside the guarded region.
        req = Request(url, headers=_headers())
        with urlopen(req, timeout=FETCH_TIMEOUT) as r:
            # Read one byte past the cap so an oversized body is detected
            # explicitly instead of being truncated into invalid JSON.
            raw = r.read(MAX_RESPONSE_BYTES + 1)
            link = r.headers.get("Link")
    except HTTPError as e:
        print(f"[collect_github] {url}: HTTP {e.code}")
        return [], None
    except (OSError, HTTPException, ValueError) as e:
        # OSError covers URLError and TimeoutError as well as failures that
        # urlopen does not wrap (connection reset, TLS errors mid-read);
        # HTTPException covers protocol errors such as IncompleteRead.
        print(f"[collect_github] {url}: error {e}")
        return [], None

    if len(raw) > MAX_RESPONSE_BYTES:
        print(f"[collect_github] {url}: error response exceeds {MAX_RESPONSE_BYTES} bytes")
        return [], None

    try:
        res = json.loads(raw.decode("utf-8", errors="replace"))
    except json.JSONDecodeError as e:
        print(f"[collect_github] {url}: error {e}")
        return [], None

    # GitHub returns a JSON object (not a list) on error envelopes (rate-limit etc.);
    # guard so callers can iterate safely.
    items = res if isinstance(res, list) else []
    return items, _next_url(link)
| 66 | + |
| 67 | + |
def _get_all(path: str) -> list[dict]:
    """Follow Link.rel="next" up to MAX_PAGES pages.

    Args:
        path: API path including any query string; it is appended to ``API``
            to form the URL of the first page.

    Returns:
        The items of every fetched page, concatenated in fetch order.
        Pagination stops at the first page without a next link, at a next
        link that leaves the API origin, or after ``MAX_PAGES`` pages.
    """
    url = f"{API}{path}"
    out: list[dict] = []
    for _ in range(MAX_PAGES):
        items, nxt = _get_page(url)
        out.extend(items)
        if not nxt:
            break
        # The next URL is taken from a response header and each request may
        # carry the bearer token, so never follow it off the API origin.
        if not nxt.startswith(f"{API}/"):
            print(f"[collect_github] {url}: ignoring off-origin next link {nxt}")
            break
        url = nxt
    return out
44 | 79 |
|
45 | 80 |
|
46 | 81 | def release_to_item(rel: dict, repo: str, track: str, now: datetime) -> dict | None: |
@@ -72,8 +107,8 @@ def collect(track: str) -> None: |
72 | 107 | items: list[dict] = [] |
73 | 108 | for entry in repos: |
74 | 109 | repo = entry["repo"] |
75 | | - per_page = entry.get("per_page", 20) |
76 | | - releases = _get(f"/repos/{repo}/releases?per_page={per_page}") |
| 110 | + per_page = entry.get("per_page", 50) |
| 111 | + releases = _get_all(f"/repos/{repo}/releases?per_page={per_page}") |
77 | 112 | for rel in releases: |
78 | 113 | item = release_to_item(rel, repo, track, now) |
79 | 114 | if item is not None: |
|
0 commit comments