|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Fetch all issues (open and closed) from a GitHub repository via GraphQL and store as structured JSON.""" |
| 3 | + |
| 4 | +import argparse |
| 5 | +import json |
| 6 | +import subprocess |
| 7 | +import sys |
| 8 | +import time |
| 9 | +from datetime import datetime, timezone |
| 10 | +from pathlib import Path |
| 11 | + |
# GraphQL document used to page through a repository's issues, 100 per request,
# oldest first. The $cursor variable is the `endCursor` of the previous page and
# is omitted for the first page.
#
# Nested connections are NOT paginated: per issue the query returns at most
# 10 assignees, 20 labels, 100 comments and 50 timeline items. `comments.totalCount`
# is selected so consumers can detect truncated comment lists.
#
# Only timeline item types that have a matching inline fragment below are
# requested. Types without a fragment would come back as bare `__typename`
# nodes and merely use up slots in the 50-item timeline window.
GRAPHQL_QUERY = """
query($owner: String!, $repo: String!, $cursor: String, $states: [IssueState!]) {
  repository(owner: $owner, name: $repo) {
    issues(states: $states, first: 100, after: $cursor, orderBy: {field: CREATED_AT, direction: ASC}) {
      totalCount
      pageInfo {
        hasNextPage
        endCursor
      }
      nodes {
        number
        title
        body
        state
        createdAt
        updatedAt
        closedAt
        author { login }
        assignees(first: 10) { nodes { login } }
        labels(first: 20) { nodes { name } }
        milestone { title number dueOn }
        reactionGroups { content users { totalCount } }
        comments(first: 100) {
          totalCount
          nodes {
            author { login }
            body
            createdAt
            updatedAt
            reactionGroups { content users { totalCount } }
          }
        }
        timelineItems(first: 50, itemTypes: [CROSS_REFERENCED_EVENT, CLOSED_EVENT, REOPENED_EVENT, LABELED_EVENT, UNLABELED_EVENT]) {
          nodes {
            __typename
            ... on CrossReferencedEvent {
              createdAt
              source {
                __typename
                ... on PullRequest { number title state url }
                ... on Issue { number title state url }
              }
            }
            ... on LabeledEvent { label { name } createdAt }
            ... on UnlabeledEvent { label { name } createdAt }
            ... on ClosedEvent { createdAt }
            ... on ReopenedEvent { createdAt }
          }
        }
      }
    }
  }
  rateLimit { cost remaining resetAt }
}
"""
| 67 | + |
| 68 | + |
def gh_graphql(query: str, variables: dict) -> dict:
    """Execute a GraphQL query via the gh CLI, passing the full payload as JSON on stdin.

    Args:
        query: The GraphQL document to send.
        variables: Query variables; entries whose value is ``None`` are omitted
            from the request.

    Returns:
        The decoded JSON response object (including its ``"data"`` member).

    Raises:
        RuntimeError: If ``gh`` exits with a non-zero status, prints output that
            is not valid JSON, or returns a response without a ``"data"`` payload.
    """
    clean_vars = {k: v for k, v in variables.items() if v is not None}
    payload = json.dumps({"query": query, "variables": clean_vars})
    result = subprocess.run(
        ["gh", "api", "graphql", "--input", "-"],
        input=payload,
        capture_output=True,
        text=True,
        # Decode explicitly as UTF-8 instead of the locale encoding so
        # non-ASCII issue text survives on any platform.
        encoding="utf-8",
    )
    if result.returncode != 0:
        raise RuntimeError(f"gh api graphql failed: {result.stderr}")

    # Normalize malformed responses to RuntimeError so callers that retry on
    # RuntimeError also cover truncated or empty output.
    try:
        response = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        raise RuntimeError(f"gh api graphql returned invalid JSON: {exc}") from exc

    if not isinstance(response, dict) or response.get("data") is None:
        errors = response.get("errors") if isinstance(response, dict) else response
        raise RuntimeError(f"gh api graphql returned no data: {errors}")
    return response
| 80 | + |
| 81 | + |
| 82 | +def transform_reactions(reaction_groups: list) -> dict: |
| 83 | + """Convert reactionGroups to a flat dict, dropping zeros.""" |
| 84 | + reactions = {} |
| 85 | + for rg in reaction_groups: |
| 86 | + count = rg["users"]["totalCount"] |
| 87 | + if count > 0: |
| 88 | + reactions[rg["content"]] = count |
| 89 | + return reactions |
| 90 | + |
| 91 | + |
| 92 | +def transform_timeline_event(event: dict) -> dict | None: |
| 93 | + """Flatten a timeline event node.""" |
| 94 | + typename = event.get("__typename") |
| 95 | + if typename == "CrossReferencedEvent": |
| 96 | + source = event.get("source", {}) |
| 97 | + return { |
| 98 | + "type": "CrossReferencedEvent", |
| 99 | + "created_at": event.get("createdAt"), |
| 100 | + "source_type": source.get("__typename"), |
| 101 | + "source_number": source.get("number"), |
| 102 | + "source_title": source.get("title"), |
| 103 | + "source_state": source.get("state"), |
| 104 | + "source_url": source.get("url"), |
| 105 | + } |
| 106 | + elif typename in ("LabeledEvent", "UnlabeledEvent"): |
| 107 | + return { |
| 108 | + "type": typename, |
| 109 | + "label": event.get("label", {}).get("name"), |
| 110 | + "created_at": event.get("createdAt"), |
| 111 | + } |
| 112 | + elif typename in ("ClosedEvent", "ReopenedEvent"): |
| 113 | + return { |
| 114 | + "type": typename, |
| 115 | + "created_at": event.get("createdAt"), |
| 116 | + } |
| 117 | + return None |
| 118 | + |
| 119 | + |
def transform_issue(raw: dict) -> dict:
    """Transform a raw GraphQL issue node into our clean structure.

    Args:
        raw: One issue node as returned by ``GRAPHQL_QUERY``.

    Returns:
        A dict with snake_case keys. ``comments`` and ``timeline`` hold only the
        entries present in ``raw`` (the nested connections are not paginated
        here), while ``comment_count`` is the connection's ``totalCount``.
        ``milestone`` is passed through unchanged from the response.

    Null entries inside connection ``nodes`` lists and null ``reactionGroups``
    values are tolerated and skipped / treated as empty.
    """
    comments = [
        {
            # A missing author object is recorded as None.
            "author": c["author"]["login"] if c.get("author") else None,
            "body": c["body"],
            "created_at": c["createdAt"],
            "updated_at": c["updatedAt"],
            # `or []`: .get(key, []) would still return None for an explicit null.
            "reactions": transform_reactions(c.get("reactionGroups") or []),
        }
        for c in raw["comments"]["nodes"]
        if c is not None
    ]

    timeline = []
    for node in raw["timelineItems"]["nodes"]:
        if node is None:
            continue
        event = transform_timeline_event(node)
        # Event types we do not model come back as None and are dropped.
        if event:
            timeline.append(event)

    return {
        "number": raw["number"],
        "title": raw["title"],
        "body": raw["body"],
        "state": raw["state"],
        "author": raw["author"]["login"] if raw.get("author") else None,
        "created_at": raw["createdAt"],
        "updated_at": raw["updatedAt"],
        "closed_at": raw["closedAt"],
        "assignees": [a["login"] for a in raw["assignees"]["nodes"] if a is not None],
        "labels": [lbl["name"] for lbl in raw["labels"]["nodes"] if lbl is not None],
        "milestone": raw.get("milestone"),
        "reactions": transform_reactions(raw.get("reactionGroups") or []),
        "comment_count": raw["comments"]["totalCount"],
        "comments": comments,
        "timeline": timeline,
    }
| 155 | + |
| 156 | + |
def fetch_all_issues(owner: str, repo: str, states: list[str] | None = None) -> list[dict]:
    """Fetch issues with pagination and exponential backoff.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        states: Issue states to request, e.g. ``["OPEN"]`` or ``["CLOSED"]``.
            Defaults to ``["OPEN"]``.

    Returns:
        All matching issues, each converted with ``transform_issue``, in the
        order the pages were returned.

    Raises:
        RuntimeError: If a page still fails after ``max_retries`` attempts, or
            if the response contains no repository (not found / not accessible).

    Progress and rate-limit information is written to stderr.
    """
    if states is None:
        states = ["OPEN"]
    all_issues = []
    cursor = None
    page = 1
    max_retries = 5
    label = "/".join(s.lower() for s in states)

    while True:
        for attempt in range(max_retries):
            try:
                print(f"Fetching {label} issues page {page}...", file=sys.stderr)
                data = gh_graphql(GRAPHQL_QUERY, {
                    "owner": owner, "repo": repo, "cursor": cursor, "states": states,
                })
                break
            except RuntimeError as e:
                print(f"Error on attempt {attempt + 1}: {e}", file=sys.stderr)
                if attempt == max_retries - 1:
                    raise
                # Exponential backoff: 1s, 2s, 4s, ... capped at 60s.
                wait = min(2 ** attempt, 60)
                print(f"Retrying in {wait}s...", file=sys.stderr)
                time.sleep(wait)

        payload = data.get("data") or {}
        repository = payload.get("repository")
        if repository is None:
            # Fail with a clear message instead of a TypeError on None below.
            raise RuntimeError(f"Repository {owner}/{repo} not found or not accessible")

        rate = payload["rateLimit"]
        print(f" Rate limit: {rate['remaining']} remaining, cost: {rate['cost']}", file=sys.stderr)

        issues_data = repository["issues"]
        total = issues_data["totalCount"]
        all_issues.extend(transform_issue(raw) for raw in issues_data["nodes"])

        print(f" Fetched {len(all_issues)}/{total} issues", file=sys.stderr)

        page_info = issues_data["pageInfo"]
        if not page_info["hasNextPage"]:
            break

        # Only wait for the rate-limit window when another request will follow;
        # sleeping after the final page would delay the result for nothing.
        if rate["remaining"] < 100:
            reset_at = datetime.fromisoformat(rate["resetAt"].replace("Z", "+00:00"))
            # A few seconds of slack past the advertised reset time.
            wait_seconds = (reset_at - datetime.now(timezone.utc)).total_seconds() + 5
            if wait_seconds > 0:
                print(f" Rate limit low, waiting {wait_seconds:.0f}s until reset...", file=sys.stderr)
                time.sleep(wait_seconds)

        cursor = page_info["endCursor"]
        page += 1

    return all_issues
| 211 | + |
| 212 | + |
def main():
    """Parse command-line options, fetch the issues and write them to a JSON file.

    Open issues are always fetched; closed issues are fetched too unless
    ``--open-only`` is given. The output file defaults to
    ``<repo>_issues.json`` next to this script and is written as UTF-8.
    """
    parser = argparse.ArgumentParser(description="Fetch all GitHub issues into a JSON file.")
    parser.add_argument("--owner", default="bitsandbytes-foundation", help="Repository owner")
    parser.add_argument("--repo", default="bitsandbytes", help="Repository name")
    parser.add_argument("--open-only", action="store_true", help="Only fetch open issues")
    parser.add_argument("-o", "--output", default=None,
                        help="Output JSON file path (default: <repo>_issues.json in script dir)")
    args = parser.parse_args()

    output_path = args.output or str(Path(__file__).parent / f"{args.repo}_issues.json")

    open_issues = fetch_all_issues(args.owner, args.repo, ["OPEN"])
    print(file=sys.stderr)

    if args.open_only:
        closed_issues = []
    else:
        closed_issues = fetch_all_issues(args.owner, args.repo, ["CLOSED"])
        print(file=sys.stderr)

    result = {
        "repository": f"{args.owner}/{args.repo}",
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "open_issues": open_issues,
        "open_count": len(open_issues),
        "closed_issues": closed_issues,
        "closed_count": len(closed_issues),
    }

    # ensure_ascii=False emits raw Unicode, so the file encoding must be pinned
    # to UTF-8 rather than left to the platform's locale default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"Wrote {len(open_issues)} open + {len(closed_issues)} closed issues to {output_path}",
          file=sys.stderr)


if __name__ == "__main__":
    main()
0 commit comments