diff --git a/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 108326b50..11cedff32 100644 --- a/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -92,7 +92,7 @@ def contributor_breadth_model(self) -> None: logger.info(f"Processing cntrb {index} of {total}") index += 1 - repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events" + repo_cntrb_url = github_data_access.endpoint_url(f"users/{cntrb['gh_login']}/events") newest_event_in_db = datetime(1970, 1, 1) if cntrb["gh_login"] in cntrb_newest_events_map: diff --git a/collectoss/tasks/github/contributors.py b/collectoss/tasks/github/contributors.py index f3eaaa802..fc113386d 100644 --- a/collectoss/tasks/github/contributors.py +++ b/collectoss/tasks/github/contributors.py @@ -5,6 +5,7 @@ from collectoss.tasks.init.celery_app import celery_app as celery from collectoss.tasks.init.celery_app import CoreRepoCollectionTask from collectoss.tasks.github.util.github_paginator import hit_api +from collectoss.tasks.github.util.github_data_access import GithubDataAccess from collectoss.tasks.github.facade_github.tasks import * from collectoss.application.db.models import Contributor from collectoss.application.db.util import execute_session_query @@ -27,6 +28,8 @@ def process_contributors(): key_auth = GithubRandomKeyAuth(logger) + github_data_access = GithubDataAccess(key_auth, logger) + with get_session() as session: query = ( @@ -56,7 +59,7 @@ def process_contributors(): del contributor_dict["_sa_instance_state"] - url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}" + url = github_data_access.endpoint_url(f"users/{contributor_dict['cntrb_login']}") try: data = retrieve_dict_data(url, key_auth, logger) diff --git a/collectoss/tasks/github/detect_move/core.py b/collectoss/tasks/github/detect_move/core.py index 1c0d7dba8..fe2d8ab14 100644 --- a/collectoss/tasks/github/detect_move/core.py +++ b/collectoss/tasks/github/detect_move/core.py @@ -1,6 +1,7 @@ from collectoss.tasks.github.util.github_task_session import * from collectoss.application.db.models import Repo, CollectionStatus from collectoss.tasks.github.util.github_paginator import hit_api +from collectoss.tasks.github.util.github_data_access import GithubDataAccess from collectoss.tasks.github.util.util import get_owner_repo from collectoss.tasks.github.util.util import parse_json_response from datetime import datetime @@ -76,7 +77,10 @@ def extract_owner_and_repo_from_endpoint(key_auth, url, logger): def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='core'): owner, name = get_owner_repo(repo.repo_git) - url = f"https://api.github.com/repos/{owner}/{name}" + + github_data_access = GithubDataAccess(key_auth, logger) + + url = github_data_access.endpoint_url(f"repos/{owner}/{name}") attempts = 0 while attempts < 10: diff --git a/collectoss/tasks/github/events.py b/collectoss/tasks/github/events.py index 24b1e42ff..21dc06959 100644 --- a/collectoss/tasks/github/events.py +++ b/collectoss/tasks/github/events.py @@ -48,10 +48,11 @@ def collect_events(repo_git: str, full_collection: bool): def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events?per_page=100" - github_data_access = GithubDataAccess(key_auth, logger) + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/events", {"per_page": "100"}) + + page_count = github_data_access.get_resource_page_count(url) if page_count > 300: @@ -133,11 +134,11 @@ def collect(self, repo_git, key_auth, since): def _collect_events(self, repo_git: str, key_auth, since): owner, repo = get_owner_repo(repo_git) - - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" github_data_access = GithubDataAccess(key_auth, self._logger) + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/events") + for event in github_data_access.paginate_resource(url): yield event @@ -314,7 +315,7 @@ def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, sinc issue_number = issue["issue_number"] - event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/events" + event_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/{issue_number}/events") try: @@ -377,7 +378,7 @@ def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth, since): pr_number = pr["gh_pr_number"] - event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{pr_number}/events" + event_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/{pr_number}/events") try: diff --git a/collectoss/tasks/github/facade_github/core.py b/collectoss/tasks/github/facade_github/core.py index 64b42e0d2..98dfcc07d 100644 --- a/collectoss/tasks/github/facade_github/core.py +++ b/collectoss/tasks/github/facade_github/core.py @@ -26,12 +26,6 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too logger.error(f"Encountered bad url: {github_url}") raise e - # Set the base of the url and place to hold contributors to insert - contributors_url = ( - f"https://api.github.com/repos/{owner}/{name}/" + - "contributors?state=all" - ) - # Get contributors that we already have stored # Set our duplicate and update column map keys (something other than PK) to # check dupicates/needed column updates with @@ -42,6 +36,9 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too github_data_access = GithubDataAccess(key_auth, logger) + # Set the base of the url and place to hold contributors to insert + contributors_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/contributors", {"state": "all"}) + contributor_count = github_data_access.get_resource_count(contributors_url) logger.info("Count of contributors needing insertion: " + str(contributor_count) + "\n") @@ -54,7 +51,7 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too # Need to hit this single contributor endpoint to get extra data including... # `created at` # i think that's it - cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) + cntrb_url = github_data_access.endpoint_url(f"users/{repo_contributor['login']}") logger.info("Hitting endpoint: " + cntrb_url + " ...\n") diff --git a/collectoss/tasks/github/facade_github/tasks.py b/collectoss/tasks/github/facade_github/tasks.py index ab7a18eab..8bef19b8b 100644 --- a/collectoss/tasks/github/facade_github/tasks.py +++ b/collectoss/tasks/github/facade_github/tasks.py @@ -88,7 +88,7 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id # move on to the next contributor continue - url = ("https://api.github.com/users/" + login) + url = github_data_access.endpoint_url(f"users/{login}") try: user_data = github_data_access.get_resource(url) diff --git a/collectoss/tasks/github/issues.py b/collectoss/tasks/github/issues.py index 406718759..f25287be6 100644 --- a/collectoss/tasks/github/issues.py +++ b/collectoss/tasks/github/issues.py @@ -103,13 +103,13 @@ def retrieve_all_issue_data(repo_git: str, logger: logging.Logger, key_auth: Git logger.info(f"Collecting issues for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all" + github_data_access = GithubDataAccess(key_auth, logger) + + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/", {"state":"all"}) if since: url += f"&since={since.isoformat()}" - - github_data_access = GithubDataAccess(key_auth, logger) - + num_pages = github_data_access.get_resource_page_count(url) logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of issues") diff --git a/collectoss/tasks/github/messages.py b/collectoss/tasks/github/messages.py index 342eeb2ca..7f8fc0424 100644 --- a/collectoss/tasks/github/messages.py +++ b/collectoss/tasks/github/messages.py @@ -64,8 +64,10 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas owner, repo = get_owner_repo(repo_git) + github_data_access = GithubDataAccess(key_auth, logger) + # url to get issue and pull request comments - url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments" + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/comments") if since: url += f"?since={since.isoformat()}" @@ -73,8 +75,6 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas # define logger for task logger.info(f"Collecting github comments for {owner}/{repo}") - github_data_access = GithubDataAccess(key_auth, logger) - message_count = github_data_access.get_resource_count(url) logger.info(f"{task_name}: Collecting {message_count} github messages") diff --git a/collectoss/tasks/github/pull_requests/tasks.py b/collectoss/tasks/github/pull_requests/tasks.py index 3efaddf3b..55aa96ad2 100644 --- a/collectoss/tasks/github/pull_requests/tasks.py +++ b/collectoss/tasks/github/pull_requests/tasks.py @@ -227,11 +227,19 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) - """ owner, repo = get_owner_repo(repo_git) - review_msg_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/comments" logger = logging.getLogger(collect_pull_request_review_comments.__name__) logger.debug(f"Collecting pull request review comments for {owner}/{repo}") + tool_source = "Pr review comment task" + tool_version = "2.0" + data_source = "Github API" + + key_auth = GithubRandomKeyAuth(logger) + github_data_access = GithubDataAccess(key_auth, logger) + + review_msg_search_args = {} + repo_id = get_repo_by_repo_git(repo_git).repo_id if not full_collection: @@ -240,7 +248,7 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) - if last_collected_date: # Subtract 2 days to ensure all data is collected core_data_last_collected = (last_collected_date - timedelta(days=2)).replace(tzinfo=timezone.utc) - review_msg_url += f"?since={core_data_last_collected.isoformat()}" + review_msg_search_args['since'] = core_data_last_collected.isoformat() else: logger.warning(f"core_data_last_collected is NULL for recollection on repo: {repo_git}") @@ -253,13 +261,6 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) - logger.debug(f"{owner}/{repo} No PR reviews to collect review comments for") return - tool_source = "Pr review comment task" - tool_version = "2.0" - data_source = "Github API" - - key_auth = GithubRandomKeyAuth(logger) - github_data_access = GithubDataAccess(key_auth, logger) - pr_review_comment_batch_size = get_batch_size() # Batch processing: accumulate comments until batch size reached, then flush @@ -268,6 +269,8 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) - pr_review_msg_mapping_data = {} total_refs_inserted = 0 + review_msg_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls/comments", review_msg_search_args) + # Single-pass extraction: get both contributor and comment data together for comment in github_data_access.paginate_resource(review_msg_url): # Extract contributor @@ -512,7 +515,7 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: if index % 100 == 0: logger.debug(f"{owner}/{repo} Processing PR {index + 1} of {pr_count}") - pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" + pr_review_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls/{pr_number}/reviews") try: pr_reviews = list(github_data_access.paginate_resource(pr_review_url)) diff --git a/collectoss/tasks/github/repo_info/core.py b/collectoss/tasks/github/repo_info/core.py index 55b1def2a..2c0a4fb79 100644 --- a/collectoss/tasks/github/repo_info/core.py +++ b/collectoss/tasks/github/repo_info/core.py @@ -16,11 +16,13 @@ def query_committers_count(key_auth, logger, owner, repo): data = {} logger.info('Querying committers count\n') - url = f'https://api.github.com/repos/{owner}/{repo}/contributors?per_page=100' + + github_data_access = GithubDataAccess(key_auth, logger) + + url = github_data_access.endpoint_url(f"/repos/{owner}/{repo}/contributors", {"per_page": 100}) ## If the repository is empty there are zero committers, and the API returns nothing at all. Response ## header of 200 along with an empty JSON. try: - github_data_access = GithubDataAccess(key_auth, logger) try: data = github_data_access.get_resource_count(url) except Exception as e: @@ -57,9 +59,10 @@ def get_repo_data(logger, url, response): """ def get_repo_data(logger, owner, repo): + github_data_access = GithubDataAccess(None, logger) + try: - url = f'https://api.github.com/repos/{owner}/{repo}' - github_data_access = GithubDataAccess(None, logger) + url = github_data_access.endpoint_url(f"/repos/{owner}/{repo}") result = github_data_access.get_resource(url) return result except UrlNotFoundException as e: diff --git a/collectoss/tasks/github/util/github_data_access.py b/collectoss/tasks/github/util/github_data_access.py index 0fc017124..17dfd1339 100644 --- a/collectoss/tasks/github/util/github_data_access.py +++ b/collectoss/tasks/github/util/github_data_access.py @@ -35,6 +35,14 @@ def __init__(self, message="Resource returned HTTP 410 Gone. It is likely intent super().__init__(message) class GithubDataAccess: + """Utilities for accessing the GitHub REST API + + Public facing functions in this class should refrain from returning data in a structure + that is derived from githubs API responses to keep all platform-specific parsing here. + """ + + def _base_url(self): + return "https://api.github.com" def __init__(self, key_manager, logger: logging.Logger, feature="rest"): @@ -61,7 +69,7 @@ def endpoint_url(self, path: str, params: dict = None) -> str: if not path.startswith("/"): path = "/" + path - url = "https://api.github.com" + path + url = self._base_url() + path return self.__add_query_params(url, params or {})