Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def contributor_breadth_model(self) -> None:
logger.info(f"Processing cntrb {index} of {total}")
index += 1

repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events"
repo_cntrb_url = github_data_access.endpoint_url(f"users/{cntrb['gh_login']}/events")

newest_event_in_db = datetime(1970, 1, 1)
if cntrb["gh_login"] in cntrb_newest_events_map:
Expand Down
5 changes: 4 additions & 1 deletion collectoss/tasks/github/contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collectoss.tasks.init.celery_app import celery_app as celery
from collectoss.tasks.init.celery_app import CoreRepoCollectionTask
from collectoss.tasks.github.util.github_paginator import hit_api
from collectoss.tasks.github.util.github_data_access import GithubDataAccess
from collectoss.tasks.github.facade_github.tasks import *
from collectoss.application.db.models import Contributor
from collectoss.application.db.util import execute_session_query
Expand All @@ -27,6 +28,8 @@ def process_contributors():

key_auth = GithubRandomKeyAuth(logger)

github_data_access = GithubDataAccess(key_auth, logger)

with get_session() as session:

query = (
Expand Down Expand Up @@ -56,7 +59,7 @@ def process_contributors():

del contributor_dict["_sa_instance_state"]

url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}"
url = github_data_access.endpoint_url(f"users/{contributor_dict['cntrb_login']}")

try:
data = retrieve_dict_data(url, key_auth, logger)
Expand Down
6 changes: 5 additions & 1 deletion collectoss/tasks/github/detect_move/core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collectoss.tasks.github.util.github_task_session import *
from collectoss.application.db.models import Repo, CollectionStatus
from collectoss.tasks.github.util.github_paginator import hit_api
from collectoss.tasks.github.util.github_data_access import GithubDataAccess
from collectoss.tasks.github.util.util import get_owner_repo
from collectoss.tasks.github.util.util import parse_json_response
from datetime import datetime
Expand Down Expand Up @@ -76,7 +77,10 @@ def extract_owner_and_repo_from_endpoint(key_auth, url, logger):
def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='core'):

owner, name = get_owner_repo(repo.repo_git)
url = f"https://api.github.com/repos/{owner}/{name}"

github_data_access = GithubDataAccess(key_auth, logger)

url = github_data_access.endpoint_url(f"repos/{owner}/{name}")

attempts = 0
while attempts < 10:
Expand Down
13 changes: 7 additions & 6 deletions collectoss/tasks/github/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@ def collect_events(repo_git: str, full_collection: bool):

def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):

url = f"https://api.github.com/repos/{owner}/{repo}/issues/events?per_page=100"

github_data_access = GithubDataAccess(key_auth, logger)

url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/events", {"per_page": "100"})


page_count = github_data_access.get_resource_page_count(url)

if page_count > 300:
Expand Down Expand Up @@ -133,11 +134,11 @@ def collect(self, repo_git, key_auth, since):
def _collect_events(self, repo_git: str, key_auth, since):

owner, repo = get_owner_repo(repo_git)

url = f"https://api.github.com/repos/{owner}/{repo}/issues/events"

github_data_access = GithubDataAccess(key_auth, self._logger)

url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/events")

for event in github_data_access.paginate_resource(url):

yield event
Expand Down Expand Up @@ -314,7 +315,7 @@ def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, sinc

issue_number = issue["issue_number"]

event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/events"
event_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/{issue_number}/events")

try:

Expand Down Expand Up @@ -377,7 +378,7 @@ def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth, since):

pr_number = pr["gh_pr_number"]

event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{pr_number}/events"
event_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/{pr_number}/events")

try:

Expand Down
11 changes: 4 additions & 7 deletions collectoss/tasks/github/facade_github/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,6 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too
logger.error(f"Encountered bad url: {github_url}")
raise e

# Set the base of the url and place to hold contributors to insert
contributors_url = (
f"https://api.github.com/repos/{owner}/{name}/" +
"contributors?state=all"
)

# Get contributors that we already have stored
# Set our duplicate and update column map keys (something other than PK) to
# check duplicates/needed column updates with
Expand All @@ -42,6 +36,9 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too

github_data_access = GithubDataAccess(key_auth, logger)

# Set the base of the url and place to hold contributors to insert
contributors_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/contributors", {"state": "all"})
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pylint] reported by reviewdog 🐶
E0602: Undefined variable 'repo' (undefined-variable)


contributor_count = github_data_access.get_resource_count(contributors_url)

logger.info("Count of contributors needing insertion: " + str(contributor_count) + "\n")
Expand All @@ -54,7 +51,7 @@ def query_github_contributors(logger, key_auth, github_url, tool_source:str, too
# Need to hit this single contributor endpoint to get extra data including...
# `created at`
# I think that's it
cntrb_url = ("https://api.github.com/users/" + repo_contributor['login'])
cntrb_url = github_data_access.endpoint_url(f"users/{repo_contributor['login']}")


logger.info("Hitting endpoint: " + cntrb_url + " ...\n")
Expand Down
2 changes: 1 addition & 1 deletion collectoss/tasks/github/facade_github/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id
# move on to the next contributor
continue

url = ("https://api.github.com/users/" + login)
url = github_data_access.endpoint_url(f"users/{login}")

try:
user_data = github_data_access.get_resource(url)
Expand Down
8 changes: 4 additions & 4 deletions collectoss/tasks/github/issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,13 @@ def retrieve_all_issue_data(repo_git: str, logger: logging.Logger, key_auth: Git

logger.info(f"Collecting issues for {owner}/{repo}")

url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all"
github_data_access = GithubDataAccess(key_auth, logger)

url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/", {"state":"all"})

if since:
url += f"&since={since.isoformat()}"

github_data_access = GithubDataAccess(key_auth, logger)


num_pages = github_data_access.get_resource_page_count(url)
logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of issues")

Expand Down
6 changes: 3 additions & 3 deletions collectoss/tasks/github/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,17 @@ def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, tas

owner, repo = get_owner_repo(repo_git)

github_data_access = GithubDataAccess(key_auth, logger)

# url to get issue and pull request comments
url = f"https://api.github.com/repos/{owner}/{repo}/issues/comments"
url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/issues/comments")

if since:
url += f"?since={since.isoformat()}"

# log the start of comment collection for this repo
logger.info(f"Collecting github comments for {owner}/{repo}")

github_data_access = GithubDataAccess(key_auth, logger)

message_count = github_data_access.get_resource_count(url)

logger.info(f"{task_name}: Collecting {message_count} github messages")
Expand Down
23 changes: 13 additions & 10 deletions collectoss/tasks/github/pull_requests/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,19 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) -
"""
owner, repo = get_owner_repo(repo_git)

review_msg_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/comments"

logger = logging.getLogger(collect_pull_request_review_comments.__name__)
logger.debug(f"Collecting pull request review comments for {owner}/{repo}")

tool_source = "Pr review comment task"
tool_version = "2.0"
data_source = "Github API"

key_auth = GithubRandomKeyAuth(logger)
github_data_access = GithubDataAccess(key_auth, logger)

review_msg_search_args = {}

repo_id = get_repo_by_repo_git(repo_git).repo_id

if not full_collection:
Expand All @@ -240,7 +248,7 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) -
if last_collected_date:
# Subtract 2 days to ensure all data is collected
core_data_last_collected = (last_collected_date - timedelta(days=2)).replace(tzinfo=timezone.utc)
review_msg_url += f"?since={core_data_last_collected.isoformat()}"
review_msg_search_args['since'] = core_data_last_collected.isoformat()
else:
logger.warning(f"core_data_last_collected is NULL for recollection on repo: {repo_git}")

Expand All @@ -253,13 +261,6 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) -
logger.debug(f"{owner}/{repo} No PR reviews to collect review comments for")
return

tool_source = "Pr review comment task"
tool_version = "2.0"
data_source = "Github API"

key_auth = GithubRandomKeyAuth(logger)
github_data_access = GithubDataAccess(key_auth, logger)

pr_review_comment_batch_size = get_batch_size()

# Batch processing: accumulate comments until batch size reached, then flush
Expand All @@ -268,6 +269,8 @@ def collect_pull_request_review_comments(repo_git: str, full_collection: bool) -
pr_review_msg_mapping_data = {}
total_refs_inserted = 0

review_msg_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls/comments", review_msg_search_args)

# Single-pass extraction: get both contributor and comment data together
for comment in github_data_access.paginate_resource(review_msg_url):
# Extract contributor
Expand Down Expand Up @@ -512,7 +515,7 @@ def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None:
if index % 100 == 0:
logger.debug(f"{owner}/{repo} Processing PR {index + 1} of {pr_count}")

pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews"
pr_review_url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls/{pr_number}/reviews")

try:
pr_reviews = list(github_data_access.paginate_resource(pr_review_url))
Expand Down
11 changes: 7 additions & 4 deletions collectoss/tasks/github/repo_info/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ def query_committers_count(key_auth, logger, owner, repo):

data = {}
logger.info('Querying committers count\n')
url = f'https://api.github.com/repos/{owner}/{repo}/contributors?per_page=100'

github_data_access = GithubDataAccess(key_auth, logger)

url = github_data_access.endpoint_url(f"/repos/{owner}/{repo}/contributors", {"per_page": 100})
## If the repository is empty there are zero committers and the API returns no data at all:
## the response has a 200 status along with an empty JSON body.
try:
github_data_access = GithubDataAccess(key_auth, logger)
try:
data = github_data_access.get_resource_count(url)
except Exception as e:
Expand Down Expand Up @@ -57,9 +59,10 @@ def get_repo_data(logger, url, response):
"""
def get_repo_data(logger, owner, repo):

github_data_access = GithubDataAccess(None, logger)

try:
url = f'https://api.github.com/repos/{owner}/{repo}'
github_data_access = GithubDataAccess(None, logger)
url = github_data_access.endpoint_url(f"/repos/{owner}/{repo}")
result = github_data_access.get_resource(url)
return result
except UrlNotFoundException as e:
Expand Down
10 changes: 9 additions & 1 deletion collectoss/tasks/github/util/github_data_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ def __init__(self, message="Resource returned HTTP 410 Gone. It is likely intent
super().__init__(message)

class GithubDataAccess:
"""Utilities for accessing the GitHub REST API

Public-facing functions in this class should refrain from returning data in a structure
that is derived from GitHub's API responses, to keep all platform-specific parsing here.
"""

# Return the root URL of the GitHub REST API as a string with no trailing slash.
# Kept in a method rather than inlined so the host is defined in one place.
def _base_url(self):
return "https://api.github.com"

def __init__(self, key_manager, logger: logging.Logger, feature="rest"):

Expand All @@ -61,7 +69,7 @@ def endpoint_url(self, path: str, params: dict = None) -> str:
if not path.startswith("/"):
path = "/" + path

url = "https://api.github.com" + path
url = self._base_url() + path

return self.__add_query_params(url, params or {})

Expand Down
Loading