|
1 | | -import os |
2 | | -import requests |
3 | | -import sys |
4 | | -import json |
5 | | -import click |
6 | | - |
# Every request to the GitHub GraphQL API is authorized with a token that is
# read from the GRAPH_API_KEY environment variable when this module is loaded.
try:
    token = os.environ["GRAPH_API_KEY"]
except KeyError:
    # Report on stderr and exit with a non-zero status so that shells and
    # automation can tell that nothing was downloaded.
    print("You need to set GRAPH_API_KEY", file=sys.stderr)
    print("But you shouldn't use this yet.", file=sys.stderr)
    sys.exit(1)

# Endpoint and authorization header shared by every query sent by this module.
endpoint = r"https://api.github.com/graphql"
headers = {"Authorization": f"bearer {token}"}
16 | | - |
17 | | - |
def load_query_from_file(fname, repo_owner="numpy", repo_name="numpy"):
    """
    Load a templated query from file and set the target repository, where
    the target repository has the format:

    https://github.com/<repo_owner>/<repo_name>

    Parameters
    ----------
    fname : str
        Path to a text file containing a query template. The placeholders
        ``_REPO_OWNER_`` and ``_REPO_NAME_`` are substituted everywhere they
        occur in the file.
    repo_owner : str
        Owner of target repository on GitHub. Default is 'numpy'.
    repo_name : str
        Name of target repository on GitHub. Default is 'numpy'.

    Returns
    -------
    query : str
        Query loaded from file in text form suitable for ``send_query``.

    Raises
    ------
    OSError
        If `fname` cannot be opened.

    Notes
    -----
    This function only performs plain text substitution of the two
    placeholders; it does not validate the query. Queries are expected to
    have the specific form that ``send_query`` and ``parse_single_query``
    rely on.
    """
    # Read the template as UTF-8 instead of the locale default so the same
    # file yields the same query text on every platform.
    with open(fname, encoding="utf-8") as fh:
        template = fh.read()
    # Set target repo from template
    query = template.replace("_REPO_OWNER_", repo_owner)
    return query.replace("_REPO_NAME_", repo_name)
52 | | - |
53 | | - |
def send_query(query, query_type, cursor=None, timeout=60):
    """
    Send a GraphQL query via requests.post

    No validation is done on the query before sending. GitHub GraphQL
    pagination is supported with the `cursor` argument.

    Parameters
    ----------
    query : str
        The GraphQL query to be sent
    query_type : {"issues", "pullRequests"}
        The object being queried according to the GitHub GraphQL schema.
        Currently only issues and pullRequests are supported
    cursor : str, optional
        If given, then ``after:"<cursor>", `` is injected immediately after
        the first occurrence of ``<query_type>(`` in the query to support
        GitHub's GraphQL pagination.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up.
        Default is 60.

    Returns
    -------
    dict
        The result of the query (json) parsed by `json.loads`

    Raises
    ------
    ValueError
        If `query_type` is not supported, or if a `cursor` is given but the
        query does not contain ``<query_type>(`` to attach it to.

    Notes
    -----
    This is intended mostly for internal use within `get_all_responses`.

    All newline characters are removed from the query before it is sent.
    NOTE(review): a ``#`` comment inside the query would then presumably
    swallow everything after it - confirm before adding comments to query
    templates.
    """
    # TODO: Expand this, either by parsing the query type from the query
    # directly or manually adding more query_types to the set
    if query_type not in {"issues", "pullRequests"}:
        raise ValueError(
            "Only 'issues' and 'pullRequests' queries are currently supported"
        )
    # TODO: Generalize this
    # WARNING: The cursor injection depends on the specific structure of the
    # query, this is the main reason why query types are limited to issues/PRs
    if cursor is not None:
        cursor_insertion_key = query_type + "("
        key_start = query.find(cursor_insertion_key)
        if key_start == -1:
            # str.find signals "not found" with -1; continuing would splice
            # the cursor into an arbitrary position of the query.
            raise ValueError(
                f"Cannot inject cursor: '{cursor_insertion_key}' not found in query"
            )
        cursor_ind = key_start + len(cursor_insertion_key)
        query = query[:cursor_ind] + f'after:"{cursor}", ' + query[cursor_ind:]
    # Build request payload
    payload = {"query": "".join(query.split("\n"))}
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.post(
        endpoint, json=payload, headers=headers, timeout=timeout
    )
    return json.loads(response.content)
98 | | - |
99 | | - |
def get_all_responses(query, query_type):
    """
    Helper function to bypass GitHub GraphQL API node limit.

    The query is sent once without a cursor, then re-sent with the cursor of
    the last edge received until as many edges as the reported total count
    have been collected.

    Parameters
    ----------
    query : str
        Query text, see `send_query`.
    query_type : {"issues", "pullRequests"}
        Object being queried, see `send_query`.

    Returns
    -------
    list
        The edges of all responses, concatenated in the order received.
    """
    # Get data from a single response
    initial_data = send_query(query, query_type)
    data, last_cursor, total_count = parse_single_query(initial_data, query_type)
    print(f"Retrieving {len(data)} out of {total_count} values...")
    # Continue requesting data (with pagination) until all are acquired
    while len(data) < total_count:
        if last_cursor is None:
            # Without a cursor the next request would start over from the
            # first page, so there is nothing further to fetch.
            break
        rdata = send_query(query, query_type, cursor=last_cursor)
        pdata, last_cursor, _ = parse_single_query(rdata, query_type)
        if not pdata:
            # The server returned no further edges although fewer than the
            # reported total were received; stop instead of looping.
            break
        data.extend(pdata)
        print(f"Retrieving {len(data)} out of {total_count} values...")
    print("Done.")
    return data


def parse_single_query(data, query_type):
    """
    Parse the data returned by `send_query`

    .. warning::

       Like `send_query`, the logic here depends on the specific structure
       of the query (e.g. it must be an issue or PR query, and must have a
       total count).

    Parameters
    ----------
    data : dict
        Parsed response; ``data["data"]["repository"][query_type]`` must
        hold ``"totalCount"`` and ``"edges"``, and every edge a ``"cursor"``.
    query_type : str
        Key of the queried object inside ``"repository"``.

    Returns
    -------
    edges : list
        The edges contained in this response.
    last_cursor : str or None
        Cursor of the last edge, or None when the response has no edges.
    total_count : int
        Value of ``"totalCount"`` in the response.

    Raises
    ------
    KeyError, TypeError
        If the response does not have the expected structure. The full
        response is printed before the exception propagates.
    """
    try:
        connection = data["data"]["repository"][query_type]
        total_count = connection["totalCount"]
        # list() keeps a null "edges" value inside the diagnostic path below.
        edges = list(connection["edges"])
        # A response with no edges has no cursor to continue from.
        last_cursor = edges[-1]["cursor"] if edges else None
    except (KeyError, TypeError):
        # Show the whole response: error payloads carry the explanation.
        print(data)
        raise
    return edges, last_cursor, total_count
136 | | - |
137 | | - |
class GithubGrabber:
    """
    Download data through the GitHub API v.4 from a GraphQL query file.

    The query template is loaded when the object is created, `get` runs it
    and keeps the result in ``raw_data``, and `dump` writes that result to
    disk as JSON.
    """

    def __init__(self, query_fname, query_type, repo_owner="numpy", repo_name="numpy"):
        """
        Set up a grabber for one query against one repository.

        The repository queried is:
        https://github.com/<repo_owner>/<repo_name>

        Parameters
        ----------
        query_fname : str
            Path to the file holding the GraphQL query template.
        query_type : {"issues", "pullRequests"}
            Object the query targets in the GitHub GraphQL schema. Only
            "issues" and "pullRequests" are currently supported.
        repo_owner : str
            Repository owner. Default is "numpy"
        repo_name : str
            Repository name. Default is "numpy"
        """
        # Stays None until `get` has been called.
        self.raw_data = None
        self.repo_owner, self.repo_name = repo_owner, repo_name
        self.query_type = query_type  # TODO: Parse this directly from query
        self.query_fname = query_fname
        self.load_query()

    def load_query(self):
        """
        Read the query file and store the filled-in text as ``self.query``.
        """
        self.query = load_query_from_file(
            self.query_fname,
            repo_owner=self.repo_owner,
            repo_name=self.repo_name,
        )

    def get(self):
        """
        Run the query and store everything it returns in ``raw_data``.
        """
        fetched = get_all_responses(self.query, self.query_type)
        self.raw_data = fetched

    def dump(self, outfile):
        """
        Write ``raw_data`` to `outfile` as JSON.

        Raises
        ------
        ValueError
            If ``raw_data`` is None or empty.
        """
        if self.raw_data:
            with open(outfile, "w") as sink:
                json.dump(self.raw_data, sink)
        else:
            raise ValueError("raw_data is currently empty, nothing to dump")
191 | | - |
192 | | - |
@click.command()
@click.argument("repo_owner")
@click.argument("repo_name")
def main(repo_owner, repo_name):
    """Download and save issue and pr data for `repo_owner`/`repo_name`."""
    # One entry per download, processed in order:
    # (query template, queried object type, output file).
    downloads = (
        (
            "query_examples/issue_activity_since_date.gql",
            "issues",
            f"{repo_name}_issues.json",
        ),
        (
            "query_examples/pr_data_query.gql",
            "pullRequests",
            f"{repo_name}_prs.json",
        ),
    )
    for query_file, object_type, destination in downloads:
        grabber = GithubGrabber(
            query_file,
            object_type,
            repo_owner=repo_owner,
            repo_name=repo_name,
        )
        grabber.get()
        grabber.dump(destination)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
0 commit comments