Skip to content

Commit 3e5bcfa

Browse files
committed
package: osf refactor author resolution
1 parent 47778c4 commit 3e5bcfa

1 file changed

Lines changed: 87 additions & 93 deletions

File tree

colrev/packages/osf/src/osf_api.py

Lines changed: 87 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -60,114 +60,108 @@ def _query_api(self, url: str) -> str:
6060
response = requests.get(url, headers=self.headers, timeout=60)
6161
return response.text
6262

63-
# pylint: disable=broad-exception-caught
6463
def _resolve_authors(self, record: dict) -> None:
65-
"""Resolve OSF authors for a single record (in place).
66-
67-
Replaces record[Fields.AUTHOR] (contributors URL) with a BibTeX-style string
68-
'Family, Given and Family, Given'. Guarantees the URL is not kept:
69-
- prefer unregistered_contributor
70-
- otherwise fetch user by id (relationships.users.data.id)
71-
- fallback: user id string
72-
- if nothing at all is found, drop the AUTHOR field
73-
"""
64+
"""Resolve OSF authors for a single record (in place)."""
7465
href = record.get(Fields.AUTHOR)
7566
if not href or not isinstance(href, str):
7667
return
7768

78-
def _name_from_attrs(uattr: dict) -> str:
79-
given = (uattr.get("given_name") or "").strip()
80-
family = (uattr.get("family_name") or "").strip()
81-
full = (uattr.get("full_name") or "").strip()
82-
if family and given:
83-
return f"{family}, {given}"
84-
return full or (given or family) or ""
85-
86-
def _fetch_user_attrs(uid: str) -> dict:
87-
if not uid:
88-
return {}
89-
if uid in self._user_cache:
90-
return self._user_cache[uid]
91-
url = f"https://api.osf.io/v2/users/{uid}/"
92-
try:
93-
# keep it simple; no fields filter to avoid oddities
94-
js = json.loads(self._query_api(url))
95-
data = js.get("data", {})
96-
if isinstance(data, list):
97-
data = data[0] if data else {}
98-
attrs = (data.get("attributes") or {}) if isinstance(data, dict) else {}
99-
if attrs:
100-
self._user_cache[uid] = attrs
101-
return attrs
102-
except Exception:
103-
return {}
69+
names, id_fallbacks = self._resolve_contributor_authors(href)
70+
if names:
71+
record[Fields.AUTHOR] = " and ".join(names)
72+
return
73+
if id_fallbacks:
74+
record[Fields.AUTHOR] = " and ".join(id_fallbacks)
75+
return
10476

105-
try:
106-
# Pull only bibliographic contributors
107-
sep = "&" if "?" in href else "?"
108-
url = f"{href}{sep}filter[bibliographic]=true"
77+
record.pop(Fields.AUTHOR, None)
10978

110-
names: list[str] = []
111-
id_fallbacks: list[str] = []
79+
def _resolve_contributor_authors(self, href: str) -> tuple[list[str], list[str]]:
80+
sep = "&" if "?" in href else "?"
81+
url = f"{href}{sep}filter[bibliographic]=true"
82+
names: list[str] = []
83+
id_fallbacks: list[str] = []
11284

85+
try:
11386
while url:
11487
js = json.loads(self._query_api(url))
115-
for item in js.get("data", []):
116-
attr = item.get("attributes") or {}
117-
if attr.get("bibliographic") is False:
118-
continue
119-
120-
# 1) unregistered contributor name (verbatim)
121-
name = (attr.get("unregistered_contributor") or "").strip()
122-
123-
# 2) registered user via id
124-
if not name:
125-
rel = (item.get("relationships") or {}).get("users") or {}
126-
rel_data = rel.get("data") or {}
127-
uid = rel_data.get("id") if isinstance(rel_data, dict) else None
128-
if uid:
129-
uattrs = _fetch_user_attrs(uid)
130-
name = _name_from_attrs(uattrs)
131-
if not name:
132-
# keep uid as a safe fallback
133-
id_fallbacks.append(uid)
134-
135-
if name:
136-
names.append(name)
137-
138-
# paginate
88+
page_names, page_ids = self._extract_page_author_data(js)
89+
names.extend(page_names)
90+
id_fallbacks.extend(page_ids)
13991
links = js.get("links") or {}
14092
url = links.get("next", "")
93+
except Exception:
94+
return [], self._resolve_fallback_ids(href)
95+
96+
return names, id_fallbacks
97+
98+
def _extract_page_author_data(self, js: dict) -> tuple[list[str], list[str]]:
99+
names: list[str] = []
100+
id_fallbacks: list[str] = []
101+
for item in js.get("data", []):
102+
attr = item.get("attributes") or {}
103+
if attr.get("bibliographic") is False:
104+
continue
105+
106+
name = (attr.get("unregistered_contributor") or "").strip()
107+
if not name:
108+
uid = self._get_contributor_user_id(item)
109+
if uid:
110+
name = self._name_from_attrs(self._fetch_user_attrs(uid))
111+
if not name:
112+
id_fallbacks.append(uid)
113+
114+
if name:
115+
names.append(name)
116+
117+
return names, id_fallbacks
118+
119+
def _get_contributor_user_id(self, item: dict) -> typing.Optional[str]:
120+
rel = (item.get("relationships") or {}).get("users") or {}
121+
rel_data = rel.get("data") or {}
122+
return rel_data.get("id") if isinstance(rel_data, dict) else None
123+
124+
def _name_from_attrs(self, uattr: dict) -> str:
125+
given = (uattr.get("given_name") or "").strip()
126+
family = (uattr.get("family_name") or "").strip()
127+
full = (uattr.get("full_name") or "").strip()
128+
if family and given:
129+
return f"{family}, {given}"
130+
return full or (given or family) or ""
141131

142-
# Always overwrite AUTHOR (never keep the URL):
143-
if names:
144-
record[Fields.AUTHOR] = " and ".join(names)
145-
elif id_fallbacks:
146-
record[Fields.AUTHOR] = " and ".join(id_fallbacks)
147-
else:
148-
# nothing found—drop the field to avoid leaving a URL around
149-
record.pop(Fields.AUTHOR, None)
132+
# pylint: disable=broad-exception-caught
133+
def _fetch_user_attrs(self, uid: str) -> dict:
134+
if not uid:
135+
return {}
136+
if uid in self._user_cache:
137+
return self._user_cache[uid]
138+
url = f"https://api.osf.io/v2/users/{uid}/"
139+
try:
140+
js = json.loads(self._query_api(url))
141+
data = js.get("data", {})
142+
if isinstance(data, list):
143+
data = data[0] if data else {}
144+
attrs = (data.get("attributes") or {}) if isinstance(data, dict) else {}
145+
if attrs:
146+
self._user_cache[uid] = attrs
147+
return attrs
148+
except Exception:
149+
return {}
150150

151+
# pylint: disable=broad-exception-caught
152+
def _resolve_fallback_ids(self, href: str) -> list[str]:
153+
try:
154+
js = json.loads(self._query_api(href))
155+
ids = []
156+
for item in js.get("data", []):
157+
if (item.get("attributes") or {}).get("bibliographic") is False:
158+
continue
159+
uid = self._get_contributor_user_id(item)
160+
if uid:
161+
ids.append(uid)
162+
return ids
151163
except Exception:
152-
# On hard failures, fall back to user ids if we can extract them without extra calls
153-
try:
154-
# last-chance: fetch once without filters and try to read ids
155-
js = json.loads(self._query_api(href))
156-
ids = []
157-
for item in js.get("data", []):
158-
if (item.get("attributes") or {}).get("bibliographic") is False:
159-
continue
160-
rel = (item.get("relationships") or {}).get("users") or {}
161-
rel_data = rel.get("data") or {}
162-
uid = rel_data.get("id") if isinstance(rel_data, dict) else None
163-
if uid:
164-
ids.append(uid)
165-
if ids:
166-
record[Fields.AUTHOR] = " and ".join(ids)
167-
else:
168-
record.pop(Fields.AUTHOR, None)
169-
except Exception:
170-
record.pop(Fields.AUTHOR, None)
164+
return []
171165

172166
def retrieve_records(self) -> typing.Generator:
173167
"""Call the API with the query parameters and return the results."""

0 commit comments

Comments
 (0)