@@ -60,114 +60,108 @@ def _query_api(self, url: str) -> str:
6060 response = requests .get (url , headers = self .headers , timeout = 60 )
6161 return response .text
6262
63- # pylint: disable=broad-exception-caught
6463 def _resolve_authors (self , record : dict ) -> None :
65- """Resolve OSF authors for a single record (in place).
66-
67- Replaces record[Fields.AUTHOR] (contributors URL) with a BibTeX-style string
68- 'Family, Given and Family, Given'. Guarantees the URL is not kept:
69- - prefer unregistered_contributor
70- - otherwise fetch user by id (relationships.users.data.id)
71- - fallback: user id string
72- - if nothing at all is found, drop the AUTHOR field
73- """
64+ """Resolve OSF authors for a single record (in place)."""
7465 href = record .get (Fields .AUTHOR )
7566 if not href or not isinstance (href , str ):
7667 return
7768
78- def _name_from_attrs (uattr : dict ) -> str :
79- given = (uattr .get ("given_name" ) or "" ).strip ()
80- family = (uattr .get ("family_name" ) or "" ).strip ()
81- full = (uattr .get ("full_name" ) or "" ).strip ()
82- if family and given :
83- return f"{ family } , { given } "
84- return full or (given or family ) or ""
85-
86- def _fetch_user_attrs (uid : str ) -> dict :
87- if not uid :
88- return {}
89- if uid in self ._user_cache :
90- return self ._user_cache [uid ]
91- url = f"https://api.osf.io/v2/users/{ uid } /"
92- try :
93- # keep it simple; no fields filter to avoid oddities
94- js = json .loads (self ._query_api (url ))
95- data = js .get ("data" , {})
96- if isinstance (data , list ):
97- data = data [0 ] if data else {}
98- attrs = (data .get ("attributes" ) or {}) if isinstance (data , dict ) else {}
99- if attrs :
100- self ._user_cache [uid ] = attrs
101- return attrs
102- except Exception :
103- return {}
69+ names , id_fallbacks = self ._resolve_contributor_authors (href )
70+ if names :
71+ record [Fields .AUTHOR ] = " and " .join (names )
72+ return
73+ if id_fallbacks :
74+ record [Fields .AUTHOR ] = " and " .join (id_fallbacks )
75+ return
10476
105- try :
106- # Pull only bibliographic contributors
107- sep = "&" if "?" in href else "?"
108- url = f"{ href } { sep } filter[bibliographic]=true"
77+ record .pop (Fields .AUTHOR , None )
10978
110- names : list [str ] = []
111- id_fallbacks : list [str ] = []
79+ def _resolve_contributor_authors (self , href : str ) -> tuple [list [str ], list [str ]]:
80+ sep = "&" if "?" in href else "?"
81+ url = f"{ href } { sep } filter[bibliographic]=true"
82+ names : list [str ] = []
83+ id_fallbacks : list [str ] = []
11284
85+ try :
11386 while url :
11487 js = json .loads (self ._query_api (url ))
115- for item in js .get ("data" , []):
116- attr = item .get ("attributes" ) or {}
117- if attr .get ("bibliographic" ) is False :
118- continue
119-
120- # 1) unregistered contributor name (verbatim)
121- name = (attr .get ("unregistered_contributor" ) or "" ).strip ()
122-
123- # 2) registered user via id
124- if not name :
125- rel = (item .get ("relationships" ) or {}).get ("users" ) or {}
126- rel_data = rel .get ("data" ) or {}
127- uid = rel_data .get ("id" ) if isinstance (rel_data , dict ) else None
128- if uid :
129- uattrs = _fetch_user_attrs (uid )
130- name = _name_from_attrs (uattrs )
131- if not name :
132- # keep uid as a safe fallback
133- id_fallbacks .append (uid )
134-
135- if name :
136- names .append (name )
137-
138- # paginate
88+ page_names , page_ids = self ._extract_page_author_data (js )
89+ names .extend (page_names )
90+ id_fallbacks .extend (page_ids )
13991 links = js .get ("links" ) or {}
14092 url = links .get ("next" , "" )
93+ except Exception :
94+ return [], self ._resolve_fallback_ids (href )
95+
96+ return names , id_fallbacks
97+
98+ def _extract_page_author_data (self , js : dict ) -> tuple [list [str ], list [str ]]:
99+ names : list [str ] = []
100+ id_fallbacks : list [str ] = []
101+ for item in js .get ("data" , []):
102+ attr = item .get ("attributes" ) or {}
103+ if attr .get ("bibliographic" ) is False :
104+ continue
105+
106+ name = (attr .get ("unregistered_contributor" ) or "" ).strip ()
107+ if not name :
108+ uid = self ._get_contributor_user_id (item )
109+ if uid :
110+ name = self ._name_from_attrs (self ._fetch_user_attrs (uid ))
111+ if not name :
112+ id_fallbacks .append (uid )
113+
114+ if name :
115+ names .append (name )
116+
117+ return names , id_fallbacks
118+
119+ def _get_contributor_user_id (self , item : dict ) -> typing .Optional [str ]:
120+ rel = (item .get ("relationships" ) or {}).get ("users" ) or {}
121+ rel_data = rel .get ("data" ) or {}
122+ return rel_data .get ("id" ) if isinstance (rel_data , dict ) else None
123+
124+ def _name_from_attrs (self , uattr : dict ) -> str :
125+ given = (uattr .get ("given_name" ) or "" ).strip ()
126+ family = (uattr .get ("family_name" ) or "" ).strip ()
127+ full = (uattr .get ("full_name" ) or "" ).strip ()
128+ if family and given :
129+ return f"{ family } , { given } "
130+ return full or (given or family ) or ""
141131
142- # Always overwrite AUTHOR (never keep the URL):
143- if names :
144- record [Fields .AUTHOR ] = " and " .join (names )
145- elif id_fallbacks :
146- record [Fields .AUTHOR ] = " and " .join (id_fallbacks )
147- else :
148- # nothing found—drop the field to avoid leaving a URL around
149- record .pop (Fields .AUTHOR , None )
132+ # pylint: disable=broad-exception-caught
133+ def _fetch_user_attrs (self , uid : str ) -> dict :
134+ if not uid :
135+ return {}
136+ if uid in self ._user_cache :
137+ return self ._user_cache [uid ]
138+ url = f"https://api.osf.io/v2/users/{ uid } /"
139+ try :
140+ js = json .loads (self ._query_api (url ))
141+ data = js .get ("data" , {})
142+ if isinstance (data , list ):
143+ data = data [0 ] if data else {}
144+ attrs = (data .get ("attributes" ) or {}) if isinstance (data , dict ) else {}
145+ if attrs :
146+ self ._user_cache [uid ] = attrs
147+ return attrs
148+ except Exception :
149+ return {}
150150
151+ # pylint: disable=broad-exception-caught
152+ def _resolve_fallback_ids (self , href : str ) -> list [str ]:
153+ try :
154+ js = json .loads (self ._query_api (href ))
155+ ids = []
156+ for item in js .get ("data" , []):
157+ if (item .get ("attributes" ) or {}).get ("bibliographic" ) is False :
158+ continue
159+ uid = self ._get_contributor_user_id (item )
160+ if uid :
161+ ids .append (uid )
162+ return ids
151163 except Exception :
152- # On hard failures, fall back to user ids if we can extract them without extra calls
153- try :
154- # last-chance: fetch once without filters and try to read ids
155- js = json .loads (self ._query_api (href ))
156- ids = []
157- for item in js .get ("data" , []):
158- if (item .get ("attributes" ) or {}).get ("bibliographic" ) is False :
159- continue
160- rel = (item .get ("relationships" ) or {}).get ("users" ) or {}
161- rel_data = rel .get ("data" ) or {}
162- uid = rel_data .get ("id" ) if isinstance (rel_data , dict ) else None
163- if uid :
164- ids .append (uid )
165- if ids :
166- record [Fields .AUTHOR ] = " and " .join (ids )
167- else :
168- record .pop (Fields .AUTHOR , None )
169- except Exception :
170- record .pop (Fields .AUTHOR , None )
164+ return []
171165
172166 def retrieve_records (self ) -> typing .Generator :
173167 """Call the API with the query parameters and return the results."""
0 commit comments