Skip to content

Commit 62b6ecd

Browse files
author
Gerit Wagner
committed
github: export more fields
1 parent dc30dec commit 62b6ecd

2 files changed

Lines changed: 152 additions & 0 deletions

File tree

colrev/constants.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,11 +193,31 @@ class Fields:
193193
EUROPE_PMC_ID = "colrev.europe_pmc.europe_pmc_id"
194194
PROSPERO_ID = "colrev.prospero.id"
195195

196+
# --- GitHub fields ---
196197
GITHUB_VERSION = "colrev.github.version"
197198
GITHUB_LICENSE = "colrev.github.license"
198199
GITHUB_LANGUAGE = "colrev.github.language"
199200
GITHUB_NR_CONTRIBUTORS = "colrev.github.nr_contributors"
200201
GITHUB_NR_COMMITS = "colrev.github.nr_commits"
202+
GITHUB_STAR_COUNT = "colrev.github.star_count"
203+
GITHUB_WATCHER_COUNT = "colrev.github.watcher_count"
204+
GITHUB_FORK_COUNT = "colrev.github.fork_count"
205+
GITHUB_ISSUE_COUNT_OPEN = "colrev.github.issue_count_open"
206+
GITHUB_ISSUE_COUNT_CLOSED = "colrev.github.issue_count_closed"
207+
GITHUB_ISSUE_COUNT_TOTAL = "colrev.github.issue_count_total"
208+
GITHUB_COMMIT_LAST_DATE = "colrev.github.commit_last_date" # YYYY-MM-DD
209+
GITHUB_LATEST_RELEASE_DATE = "colrev.github.latest_release_date" # YYYY-MM-DD
210+
GITHUB_REPO_IS_ARCHIVED = "colrev.github.repo_is_archived" # bool
211+
GITHUB_REPO_IS_ARCHIVED_DATE = (
212+
"colrev.github.repo_is_archived_date" # YYYY-MM-DD (proxy)
213+
)
214+
GITHUB_LANGUAGES_PCT = "colrev.github.languages_pct" # {lang: pct}
215+
GITHUB_BRANCH_COUNT_ACTIVE = "colrev.github.branch_count_active"
216+
GITHUB_BRANCH_COUNT_STALE = "colrev.github.branch_count_stale"
217+
GITHUB_BRANCH_COUNT_ALL = "colrev.github.branch_count_all"
218+
GITHUB_BASIC_INFO_OWNER = "colrev.github.basic_info_owner"
219+
GITHUB_BASIC_INFO_REPO = "colrev.github.basic_info_repo"
220+
GITHUB_BASIC_INFO_URL = "colrev.github.basic_info_url"
201221

202222

203223
class FieldsRegex:

colrev/packages/github/src/record_transformer.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,14 @@
33
from __future__ import annotations
44

55
import re
6+
from datetime import datetime
7+
from datetime import timedelta
8+
from datetime import timezone
9+
from typing import Dict
610

711
from github import Github
812
from github.GithubException import GithubException
13+
from github.GithubException import UnknownObjectException
914

1015
import colrev.record.record
1116
import colrev.record.record_prep
@@ -84,6 +89,122 @@ def _set_nr_commits(record_dict: dict, repo: Github.Repository.Repository) -> No
8489
pass
8590

8691

92+
def _set_issue_counts(record_dict: dict, repo: Github.Repository.Repository) -> None:
    """Store the open, closed and total issue counts of *repo* in *record_dict*.

    The open and closed counts are read from ``repo.get_issues(state=...)``;
    the total is their sum. If a ``GithubException`` is raised while querying,
    all three fields are set to 0.

    NOTE(review): GitHub's issue listing may also return pull requests, so
    these numbers may not be issue-only -- confirm against the API docs.
    """
    try:
        # Query order matters for API usage only: open first, then closed.
        counts = [
            repo.get_issues(state=state).totalCount for state in ("open", "closed")
        ]
    except GithubException:
        counts = [0, 0]

    n_open, n_closed = counts
    record_dict[Fields.GITHUB_ISSUE_COUNT_OPEN] = n_open
    record_dict[Fields.GITHUB_ISSUE_COUNT_CLOSED] = n_closed
    record_dict[Fields.GITHUB_ISSUE_COUNT_TOTAL] = n_open + n_closed
103+
104+
105+
def _set_stars_watch_forks(
    record_dict: dict, repo: Github.Repository.Repository
) -> None:
    """Store the star, watcher and fork counts of *repo* in *record_dict*.

    Each count is read independently from the corresponding *repo* attribute
    (``stargazers_count``, ``subscribers_count``, ``forks_count``). A missing
    or falsy attribute, or a ``GithubException`` raised while reading it,
    results in 0 for that field only.
    """
    attribute_by_field = (
        (Fields.GITHUB_STAR_COUNT, "stargazers_count"),
        (Fields.GITHUB_WATCHER_COUNT, "subscribers_count"),
        (Fields.GITHUB_FORK_COUNT, "forks_count"),
    )
    for field, attribute in attribute_by_field:
        try:
            count = getattr(repo, attribute, 0) or 0
        except GithubException:
            count = 0
        record_dict[field] = count
124+
125+
126+
def _set_archive_status(record_dict: dict, repo: Github.Repository.Repository) -> None:
    """Record whether *repo* is archived and, if so, a proxy archive date.

    ``GITHUB_REPO_IS_ARCHIVED`` is always written; it is ``False`` when the
    ``archived`` attribute is missing or reading it raises ``GithubException``.

    For archived repositories, ``GITHUB_REPO_IS_ARCHIVED_DATE`` is set to the
    repository's ``pushed_at`` date (YYYY-MM-DD), which serves as a proxy for
    the archive date. The date is omitted when ``pushed_at`` is unavailable.
    """
    try:
        is_archived = bool(getattr(repo, "archived", False))
    except GithubException:
        is_archived = False
    record_dict[Fields.GITHUB_REPO_IS_ARCHIVED] = is_archived
    if not is_archived:
        return

    # Read the proxy date in its own try block: a failure here must not
    # overwrite the archived flag that was already determined above.
    try:
        pushed_at = getattr(repo, "pushed_at", None)
    except GithubException:
        return
    if pushed_at:
        record_dict[Fields.GITHUB_REPO_IS_ARCHIVED_DATE] = pushed_at.strftime(
            "%Y-%m-%d"
        )
138+
139+
140+
def _set_last_commit_date(
    record_dict: dict, repo: Github.Repository.Repository
) -> None:
    """Store the author date of the first commit listed for *repo*.

    The date of ``repo.get_commits()[0]`` is written to
    ``GITHUB_COMMIT_LAST_DATE`` as YYYY-MM-DD. The field is left unset when a
    ``GithubException`` or ``IndexError`` is raised (best effort).
    """
    try:
        newest_commit = repo.get_commits()[0]
        authored_on = newest_commit.commit.author.date
        record_dict[Fields.GITHUB_COMMIT_LAST_DATE] = authored_on.strftime("%Y-%m-%d")
    except (GithubException, IndexError):
        return
149+
150+
151+
def _set_latest_release_date(
    record_dict: dict, repo: Github.Repository.Repository
) -> None:
    """Store the publication date of the latest release of *repo*.

    ``GITHUB_LATEST_RELEASE_DATE`` is written as YYYY-MM-DD when
    ``repo.get_latest_release()`` returns a release with a truthy
    ``published_at``. The field is left unset otherwise, including when
    ``UnknownObjectException`` or ``GithubException`` is raised.
    """
    try:
        release = repo.get_latest_release()
        published_at = getattr(release, "published_at", None) if release else None
        if published_at:
            record_dict[Fields.GITHUB_LATEST_RELEASE_DATE] = published_at.strftime(
                "%Y-%m-%d"
            )
    except (UnknownObjectException, GithubException):
        return
162+
163+
164+
def _calc_language_percentages(repo: Github.Repository.Repository) -> Dict[str, float]:
165+
try:
166+
lang_bytes = repo.get_languages()
167+
total = sum(lang_bytes.values()) or 0
168+
if total == 0:
169+
return {}
170+
return {k: round((v / total) * 100.0, 2) for k, v in lang_bytes.items()}
171+
except GithubException:
172+
return {}
173+
174+
175+
def _set_languages_pct(record_dict: dict, repo: Github.Repository.Repository) -> None:
    """Store the per-language percentages of *repo* in *record_dict*.

    ``GITHUB_LANGUAGES_PCT`` is only written when
    ``_calc_language_percentages`` returns a non-empty mapping.
    """
    shares = _calc_language_percentages(repo)
    if not shares:
        return
    record_dict[Fields.GITHUB_LANGUAGES_PCT] = shares
179+
180+
181+
def _set_branch_statuses(
    record_dict: dict,
    repo: Github.Repository.Repository,
    *,
    stale_after_days: int = 90,
) -> None:
    """Count active and stale branches of *repo* and store the counts.

    A branch is active when the author date of its head commit is not older
    than ``stale_after_days`` days (default: 90); otherwise it is stale.
    Branches whose head commit cannot be retrieved (``GithubException``) are
    counted as stale. The active, stale and overall counts are written to
    *record_dict*.

    Nothing is written when the branch list itself cannot be retrieved.
    """
    try:
        branches = list(repo.get_branches())
    except GithubException:
        return

    threshold = datetime.now(timezone.utc) - timedelta(days=stale_after_days)
    active = stale = 0

    for branch in branches:
        try:
            commit_date = repo.get_commit(branch.commit.sha).commit.author.date
        except GithubException:
            stale += 1
            continue

        # Older PyGithub versions return naive datetimes (UTC). Comparing a
        # naive value with the aware threshold would raise TypeError, so
        # normalise to an aware UTC datetime first.
        if commit_date.tzinfo is None:
            commit_date = commit_date.replace(tzinfo=timezone.utc)

        if commit_date >= threshold:
            active += 1
        else:
            stale += 1

    record_dict[Fields.GITHUB_BRANCH_COUNT_ACTIVE] = active
    record_dict[Fields.GITHUB_BRANCH_COUNT_STALE] = stale
    record_dict[Fields.GITHUB_BRANCH_COUNT_ALL] = active + stale
206+
207+
87208
def _update_record_based_on_citation_cff(
88209
record_dict: dict, repo: Github.Repository.Repository
89210
) -> dict:
@@ -114,11 +235,22 @@ def repo_to_record(
114235
Fields.URL: repo.html_url,
115236
Fields.ABSTRACT: repo.description,
116237
Fields.GITHUB_LANGUAGE: repo.language,
238+
# Basic info triplet
239+
Fields.GITHUB_BASIC_INFO_OWNER: repo.owner.login,
240+
Fields.GITHUB_BASIC_INFO_REPO: repo.name,
241+
Fields.GITHUB_BASIC_INFO_URL: repo.html_url,
117242
}
118243

119244
_set_license(record_dict, repo)
120245
_set_nr_contributors(record_dict, repo)
121246
_set_nr_commits(record_dict, repo)
247+
_set_issue_counts(record_dict, repo)
248+
_set_stars_watch_forks(record_dict, repo)
249+
_set_archive_status(record_dict, repo)
250+
_set_last_commit_date(record_dict, repo)
251+
_set_latest_release_date(record_dict, repo)
252+
_set_languages_pct(record_dict, repo)
253+
_set_branch_statuses(record_dict, repo)
122254

123255
# Use data from CITATION.cff file (if available)
124256
_update_record_based_on_citation_cff(record_dict, repo)

0 commit comments

Comments
 (0)