Skip to content

Commit 490a0b7

Browse files
committed
refactor: crossref
1 parent 1966461 commit 490a0b7

1 file changed

Lines changed: 109 additions & 57 deletions

File tree

colrev/packages/crossref/src/crossref_api.py

Lines changed: 109 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -525,78 +525,130 @@ def crossref_query(
525525
) -> list:
526526
"""Retrieve records from Crossref based on a query."""
527527
record = record_input.copy_prep_rec()
528-
record_list: list[colrev.record.record.Record] = []
529-
candidates: list[colrev.record.record.Record] = []
528+
endpoint = self._get_crossref_endpoint(
529+
record=record, jour_vol_iss_list=jour_vol_iss_list
530+
)
531+
retrieved_records = self._retrieve_crossref_records(
532+
endpoint=endpoint,
533+
record=record,
534+
jour_vol_iss_list=jour_vol_iss_list,
535+
top_n=top_n,
536+
)
530537

531-
url = self._create_query_url(record=record, jour_vol_iss_list=jour_vol_iss_list)
538+
if jour_vol_iss_list:
539+
self._drop_similarity_scores(records=retrieved_records)
540+
return retrieved_records
541+
542+
return self._prepare_top_n_results(candidates=retrieved_records, top_n=top_n)
532543

544+
def _get_crossref_endpoint(
545+
self, *, record: colrev.record.record.Record, jour_vol_iss_list: bool
546+
) -> Endpoint:
547+
url = self._create_query_url(record=record, jour_vol_iss_list=jour_vol_iss_list)
533548
endpoint = Endpoint(url, email=self.email, cache=self.cache)
534-
if jour_vol_iss_list:
535-
endpoint.request_params["rows"] = "50"
536-
else:
537-
endpoint.request_params["rows"] = "15"
549+
endpoint.request_params["rows"] = "50" if jour_vol_iss_list else "15"
550+
return endpoint
551+
552+
def _retrieve_crossref_records(
553+
self,
554+
*,
555+
endpoint: Endpoint,
556+
record: colrev.record.record.Record,
557+
jour_vol_iss_list: bool,
558+
top_n: int,
559+
) -> list[colrev.record.record.Record]:
560+
records: list[colrev.record.record.Record] = []
538561

539562
counter = 0
540563
while True:
541-
try:
542-
item = next(iter(endpoint), None)
543-
except requests.exceptions.RequestException as exc:
544-
raise colrev_exceptions.ServiceNotAvailableException(
545-
f"Crossref ({Colors.ORANGE}check https://status.crossref.org/{Colors.END})"
546-
) from exc
564+
item = self._get_next_endpoint_item(endpoint=endpoint)
547565
if item is None:
548566
break
549-
try:
550-
retrieved_record = record_transformer.json_to_record(item=item)
551-
similarity = self._get_similarity(
552-
record=record, retrieved_record_dict=retrieved_record.data
567+
retrieved_record = self._parse_record(item=item)
568+
if retrieved_record is not None:
569+
self._append_with_similarity(
570+
records=records,
571+
record=record,
572+
retrieved_record=retrieved_record,
553573
)
554-
retrieved_record.data["_similarity_score"] = similarity
555-
if jour_vol_iss_list:
556-
record_list.append(retrieved_record)
557-
else:
558-
candidates.append(retrieved_record)
559-
560-
except colrev_exceptions.RecordNotParsableException:
561-
pass
574+
562575
counter += 1
563-
if jour_vol_iss_list and counter > 200:
564-
break
565-
if not jour_vol_iss_list and counter > top_n + 10:
576+
if self._should_stop_iteration(
577+
jour_vol_iss_list=jour_vol_iss_list, counter=counter, top_n=top_n
578+
):
566579
break
567580

568-
def _norm_doi(doi: str) -> str:
569-
doi = doi.strip().lower()
570-
for p in ("https://doi.org/", "http://doi.org/", "doi:", "doi.org/"):
571-
if doi.startswith(p):
572-
return doi[len(p) :]
573-
return doi
574-
575-
if not jour_vol_iss_list:
576-
# keep only the best (highest similarity) per DOI
577-
best_by_doi: typing.Dict[str, colrev.record.record.Record] = {}
578-
for rec in candidates:
579-
key = _norm_doi(rec.data["doi"])
580-
if key not in best_by_doi or rec.data.get(
581-
"_similarity_score", 0.0
582-
) > best_by_doi[key].data.get("_similarity_score", 0.0):
583-
best_by_doi[key] = rec
584-
585-
deduped = list(best_by_doi.values())
586-
deduped.sort(
587-
key=lambda r: r.data.get("_similarity_score", 0.0), reverse=True
588-
)
581+
return records
589582

590-
result = []
591-
for rec in deduped[: max(1, top_n)]:
592-
rec.data.pop("_similarity_score", None)
593-
result.append(colrev.record.record_prep.PrepRecord(rec.get_data()))
594-
return result
583+
def _get_next_endpoint_item(self, *, endpoint: Endpoint) -> typing.Optional[dict]:
    """Fetch one item from the Crossref endpoint.

    Args:
        endpoint: Iterable Crossref endpoint to read from.

    Returns:
        The first item produced by a fresh iterator over ``endpoint``, or
        ``None`` when that iterator yields nothing.

    Raises:
        colrev_exceptions.ServiceNotAvailableException: If reading from the
            endpoint fails with a ``requests`` exception.
    """
    try:
        # NOTE(review): ``iter(endpoint)`` is evaluated on every call. If
        # ``Endpoint.__iter__`` restarts the query each time, repeated calls
        # return the same first item -- confirm against the Endpoint class.
        return next(iter(endpoint), None)
    except requests.exceptions.RequestException as exc:
        # Translate transport errors into the project-level exception while
        # keeping the original exception as the cause.
        raise colrev_exceptions.ServiceNotAvailableException(
            f"Crossref ({Colors.ORANGE}check https://status.crossref.org/{Colors.END})"
        ) from exc
595590

596-
# For jour_vol_iss_list=True, optionally strip similarity before returning
597-
for rec in record_list:
591+
def _parse_record(
    self, *, item: dict
) -> typing.Optional[colrev.record.record.Record]:
    """Convert one Crossref JSON item into a record.

    Args:
        item: A single item as returned by the Crossref endpoint.

    Returns:
        The record produced by ``record_transformer.json_to_record``, or
        ``None`` when the transformer raises ``RecordNotParsableException``.
    """
    try:
        return record_transformer.json_to_record(item=item)
    except colrev_exceptions.RecordNotParsableException:
        # Unparsable items are reported as ``None`` so the caller can skip
        # them without aborting the retrieval.
        return None
598+
599+
def _append_with_similarity(
    self,
    *,
    records: list[colrev.record.record.Record],
    record: colrev.record.record.Record,
    retrieved_record: colrev.record.record.Record,
) -> None:
    """Score a retrieved record against the query record and collect it.

    Args:
        records: List that ``retrieved_record`` is appended to (mutated in
            place).
        record: The record the query was built from.
        retrieved_record: The record retrieved from Crossref; its ``data``
            dict receives a ``"_similarity_score"`` entry.
    """
    similarity = self._get_similarity(
        record=record, retrieved_record_dict=retrieved_record.data
    )
    # The score is stored on the record itself under a temporary key.
    retrieved_record.data["_similarity_score"] = similarity
    records.append(retrieved_record)
611+
612+
def _should_stop_iteration(
    self, *, jour_vol_iss_list: bool, counter: int, top_n: int
) -> bool:
    """Decide whether the retrieval loop has processed enough items.

    Args:
        jour_vol_iss_list: Whether the query lists a journal volume/issue.
        counter: Number of items processed so far.
        top_n: Number of results requested by the caller.

    Returns:
        ``True`` once ``counter`` exceeds the limit for the query mode:
        200 for journal/volume/issue listings, ``top_n + 10`` otherwise.
    """
    # Listings use a fixed cap; top-n queries read a margin of ten extra
    # items beyond the requested count.
    limit = 200 if jour_vol_iss_list else top_n + 10
    return counter > limit
618+
619+
def _prepare_top_n_results(
    self, *, candidates: list[colrev.record.record.Record], top_n: int
) -> list[colrev.record.record_prep.PrepRecord]:
    """Deduplicate candidates by DOI and return the best-scoring ones.

    For every normalized DOI only the candidate with the highest
    ``"_similarity_score"`` is kept (the first one wins on ties). The
    survivors are ordered by descending score and cut to ``max(1, top_n)``
    entries, so one result is returned even when ``top_n`` is below 1.

    Args:
        candidates: Retrieved records carrying a ``"_similarity_score"``
            entry in their ``data`` dict.
        top_n: Maximum number of results requested.

    Returns:
        ``PrepRecord`` instances built from the selected candidates, with
        the temporary ``"_similarity_score"`` key removed.
    """

    def _score(rec: typing.Any) -> float:
        # A missing score is treated as 0.0.
        return rec.data.get("_similarity_score", 0.0)

    best_by_doi: typing.Dict[str, colrev.record.record.Record] = {}
    for rec in candidates:
        # NOTE(review): a candidate without a "doi" entry raises KeyError
        # here -- confirm that every parsed record carries a DOI.
        key = self._norm_doi(doi=rec.data["doi"])
        if key not in best_by_doi or _score(rec) > _score(best_by_doi[key]):
            best_by_doi[key] = rec

    # ``sorted`` is stable, so equal scores keep their insertion order.
    ranked = sorted(best_by_doi.values(), key=_score, reverse=True)

    result = []
    for rec in ranked[: max(1, top_n)]:
        rec.data.pop("_similarity_score", None)
        result.append(colrev.record.record_prep.PrepRecord(rec.get_data()))
    return result
638+
639+
def _drop_similarity_scores(
    self, *, records: list[colrev.record.record.Record]
) -> None:
    """Remove the temporary ``"_similarity_score"`` key from each record.

    Args:
        records: Records whose ``data`` dicts are modified in place; records
            without the key are left untouched.
    """
    for rec in records:
        rec.data.pop("_similarity_score", None)
599-
return record_list
644+
645+
@staticmethod
def _norm_doi(*, doi: str) -> str:
    """Normalize a DOI string so equivalent spellings compare equal.

    The value is stripped of surrounding whitespace and lower-cased, and a
    single leading resolver or scheme prefix is removed.

    Args:
        doi: DOI as a bare identifier, a ``doi:`` label, or a resolver URL.

    Returns:
        The lower-cased DOI without prefix; the lower-cased, stripped input
        when no known prefix matches.
    """
    doi = doi.strip().lower()
    # Longer prefixes come first so a URL is not cut at a shorter match.
    # ``dx.doi.org`` is the legacy resolver host and is treated like
    # ``doi.org``.
    for prefix in (
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "https://doi.org/",
        "http://doi.org/",
        "dx.doi.org/",
        "doi.org/",
        "doi:",
    ):
        if doi.startswith(prefix):
            # Strip again: inputs such as "doi: 10.1000/x" leave a space
            # behind the prefix.
            return doi[len(prefix) :].strip()
    return doi
600652

601653

602654
def query_doi(

0 commit comments

Comments
 (0)