|
2 | 2 | from os import environ as env |
3 | 3 | import re |
4 | 4 | from datetime import datetime, timezone |
| 5 | +import time |
5 | 6 |
|
6 | 7 | ghDataDir = env.get("GITHUB_DATA", "../github-data") |
7 | 8 | datfilepath = "%s/intRepos_ActivityCommits.json" % ghDataDir |
|
20 | 21 | dataCollector.data = {"data": {}} |
21 | 22 |
|
22 | 23 | # Initialize query manager |
23 | | -queryMan = qm.GitHubQueryManager(maxRetry=10, retryDelay=2) |
| 24 | +queryMan = qm.GitHubQueryManager(maxRetry=3, retryDelay=2) |
| 25 | + |
| 26 | +""" Unique handling for queries with especially slow response times. |
| 27 | + Prioritizes successful collection from as many repos as possible by moving |
| 28 | + on and coming back to repos that we're still waiting on, |
| 29 | + (rather than awaiting one at a time). |
| 30 | + Also allows for graceful termination of the script when exceeding a given |
| 31 | + time limit, preserving any successfully collected data.""" |
| 32 | +# Set maximum loop count (like maxRetry, but for full list, not per-query) |
| 33 | +maxLoops = 5 |
| 34 | +# Set execution time limit (can use `None` to remove limit) |
| 35 | +maxRuntime = 5.5 * 60 * 60 # 5.5 hrs as seconds (suited to GitHub job limits) |
| 36 | +# Counters |
| 37 | +endTime = None if maxRuntime is None else time.monotonic() + maxRuntime |
| 38 | +loopCount = 0 |
24 | 39 |
|
25 | 40 | # Iterate through internal repos |
26 | 41 | print("Gathering data across multiple queries...") |
27 | | -for repo in repolist: |
28 | | - print("\n'%s'" % (repo)) |
| 42 | +while (endTime is None or time.monotonic() < endTime) and (loopCount < maxLoops): |
| 43 | + loopCount += 1 |
| 44 | + print("\nPass %s (max %s)" % (loopCount, maxLoops)) |
| 45 | + |
| 46 | + for repo in repolist: |
| 47 | + |
| 48 | + # Stop iteration if time limit exceeded |
| 49 | + if endTime is not None and time.monotonic() >= endTime: |
| 50 | + print("\nWarning: Script time limit reached.") |
| 51 | + print( |
| 52 | + "Runtime exceeded %s seconds during Pass %s of %s" |
| 53 | + % (maxRuntime, loopCount, maxLoops) |
| 54 | + ) |
| 55 | + break |
| 56 | + |
| 57 | + print("\n'%s'" % (repo)) |
29 | 58 |
|
30 | | - r = repo.split("/") |
| 59 | + # Only check repos that weren't recorded in previous loops. |
| 60 | + if "data" in dataCollector.data.keys() and repo in dataCollector.data["data"]: |
| 61 | + print("Already recorded data for '%s'" % (repo)) |
| 62 | + continue |
31 | 63 |
|
32 | | - gitquery = re.sub("OWNNAME", r[0], query_in) |
33 | | - gitquery = re.sub("REPONAME", r[1], gitquery) |
| 64 | + r = repo.split("/") |
34 | 65 |
|
35 | | - try: |
36 | | - outObj = queryMan.queryGitHub(gitquery, rest=True) |
37 | | - except Exception as error: |
38 | | - print("Warning: Could not complete '%s'" % (repo)) |
39 | | - print(error) |
40 | | - continue |
| 66 | + gitquery = re.sub("OWNNAME", r[0], query_in) |
| 67 | + gitquery = re.sub("REPONAME", r[1], gitquery) |
41 | 68 |
|
42 | | - for item in outObj: |
43 | | - # Remove per-day data, keep only weekly totals |
44 | 69 | try: |
45 | | - del item["days"] |
46 | | - except KeyError: |
47 | | - pass |
48 | | - # Convert unix timestamps into standard dates (rounded to nearest week to improve aggregate data) |
49 | | - weekinfo = datetime.fromtimestamp(item["week"], tz=timezone.utc).isocalendar() |
50 | | - weekstring = str(weekinfo[0]) + "-W" + str(weekinfo[1]) + "-1" |
51 | | - item["week"] = datetime.strptime(weekstring, "%Y-W%W-%w").strftime("%Y-%m-%d") |
52 | | - |
53 | | - # Update collective data |
54 | | - dataCollector.data["data"][repo] = outObj |
55 | | - |
56 | | - print("'%s' Done!" % (repo)) |
| 70 | + outObj = queryMan.queryGitHub(gitquery, rest=True) |
| 71 | + except Exception as error: |
| 72 | + print("Warning: Could not complete '%s'" % (repo)) |
| 73 | + print(error) |
| 74 | + continue |
| 75 | + |
| 76 | + for item in outObj: |
| 77 | + # Remove per-day data, keep only weekly totals |
| 78 | + try: |
| 79 | + del item["days"] |
| 80 | + except KeyError: |
| 81 | + pass |
| 82 | + # Convert unix timestamps into standard dates (rounded to nearest week to improve aggregate data) |
| 83 | + weekinfo = datetime.fromtimestamp( |
| 84 | + item["week"], tz=timezone.utc |
| 85 | + ).isocalendar() |
| 86 | + weekstring = str(weekinfo[0]) + "-W" + str(weekinfo[1]) + "-1" |
| 87 | + item["week"] = datetime.strptime(weekstring, "%Y-W%W-%w").strftime( |
| 88 | + "%Y-%m-%d" |
| 89 | + ) |
| 90 | + |
| 91 | + # Update collective data |
| 92 | + dataCollector.data["data"][repo] = outObj |
| 93 | + |
| 94 | + print("'%s' Done!" % (repo)) |
57 | 95 |
|
58 | 96 | print("\nCollective data gathering complete!") |
59 | 97 |
|
|
0 commit comments