@@ -525,78 +525,130 @@ def crossref_query(
525525 ) -> list :
526526 """Retrieve records from Crossref based on a query."""
527527 record = record_input .copy_prep_rec ()
528- record_list : list [colrev .record .record .Record ] = []
529- candidates : list [colrev .record .record .Record ] = []
528+ endpoint = self ._get_crossref_endpoint (
529+ record = record , jour_vol_iss_list = jour_vol_iss_list
530+ )
531+ retrieved_records = self ._retrieve_crossref_records (
532+ endpoint = endpoint ,
533+ record = record ,
534+ jour_vol_iss_list = jour_vol_iss_list ,
535+ top_n = top_n ,
536+ )
530537
531- url = self ._create_query_url (record = record , jour_vol_iss_list = jour_vol_iss_list )
538+ if jour_vol_iss_list :
539+ self ._drop_similarity_scores (records = retrieved_records )
540+ return retrieved_records
541+
542+ return self ._prepare_top_n_results (candidates = retrieved_records , top_n = top_n )
532543
544+ def _get_crossref_endpoint (
545+ self , * , record : colrev .record .record .Record , jour_vol_iss_list : bool
546+ ) -> Endpoint :
547+ url = self ._create_query_url (record = record , jour_vol_iss_list = jour_vol_iss_list )
533548 endpoint = Endpoint (url , email = self .email , cache = self .cache )
534- if jour_vol_iss_list :
535- endpoint .request_params ["rows" ] = "50"
536- else :
537- endpoint .request_params ["rows" ] = "15"
549+ endpoint .request_params ["rows" ] = "50" if jour_vol_iss_list else "15"
550+ return endpoint
551+
552+ def _retrieve_crossref_records (
553+ self ,
554+ * ,
555+ endpoint : Endpoint ,
556+ record : colrev .record .record .Record ,
557+ jour_vol_iss_list : bool ,
558+ top_n : int ,
559+ ) -> list [colrev .record .record .Record ]:
560+ records : list [colrev .record .record .Record ] = []
538561
539562 counter = 0
540563 while True :
541- try :
542- item = next (iter (endpoint ), None )
543- except requests .exceptions .RequestException as exc :
544- raise colrev_exceptions .ServiceNotAvailableException (
545- f"Crossref ({ Colors .ORANGE } check https://status.crossref.org/{ Colors .END } )"
546- ) from exc
564+ item = self ._get_next_endpoint_item (endpoint = endpoint )
547565 if item is None :
548566 break
549- try :
550- retrieved_record = record_transformer .json_to_record (item = item )
551- similarity = self ._get_similarity (
552- record = record , retrieved_record_dict = retrieved_record .data
567+ retrieved_record = self ._parse_record (item = item )
568+ if retrieved_record is not None :
569+ self ._append_with_similarity (
570+ records = records ,
571+ record = record ,
572+ retrieved_record = retrieved_record ,
553573 )
554- retrieved_record .data ["_similarity_score" ] = similarity
555- if jour_vol_iss_list :
556- record_list .append (retrieved_record )
557- else :
558- candidates .append (retrieved_record )
559-
560- except colrev_exceptions .RecordNotParsableException :
561- pass
574+
562575 counter += 1
563- if jour_vol_iss_list and counter > 200 :
564- break
565- if not jour_vol_iss_list and counter > top_n + 10 :
576+ if self . _should_stop_iteration (
577+ jour_vol_iss_list = jour_vol_iss_list , counter = counter , top_n = top_n
578+ ) :
566579 break
567580
568- def _norm_doi (doi : str ) -> str :
569- doi = doi .strip ().lower ()
570- for p in ("https://doi.org/" , "http://doi.org/" , "doi:" , "doi.org/" ):
571- if doi .startswith (p ):
572- return doi [len (p ) :]
573- return doi
574-
575- if not jour_vol_iss_list :
576- # keep only the best (highest similarity) per DOI
577- best_by_doi : typing .Dict [str , colrev .record .record .Record ] = {}
578- for rec in candidates :
579- key = _norm_doi (rec .data ["doi" ])
580- if key not in best_by_doi or rec .data .get (
581- "_similarity_score" , 0.0
582- ) > best_by_doi [key ].data .get ("_similarity_score" , 0.0 ):
583- best_by_doi [key ] = rec
584-
585- deduped = list (best_by_doi .values ())
586- deduped .sort (
587- key = lambda r : r .data .get ("_similarity_score" , 0.0 ), reverse = True
588- )
581+ return records
589582
590- result = []
591- for rec in deduped [: max (1 , top_n )]:
592- rec .data .pop ("_similarity_score" , None )
593- result .append (colrev .record .record_prep .PrepRecord (rec .get_data ()))
594- return result
583+ def _get_next_endpoint_item (self , * , endpoint : Endpoint ) -> typing .Optional [dict ]:
584+ try :
585+ return next (iter (endpoint ), None )
586+ except requests .exceptions .RequestException as exc :
587+ raise colrev_exceptions .ServiceNotAvailableException (
588+ f"Crossref ({ Colors .ORANGE } check https://status.crossref.org/{ Colors .END } )"
589+ ) from exc
595590
596- # For jour_vol_iss_list=True, optionally strip similarity before returning
597- for rec in record_list :
591+ def _parse_record (
592+ self , * , item : dict
593+ ) -> typing .Optional [colrev .record .record .Record ]:
594+ try :
595+ return record_transformer .json_to_record (item = item )
596+ except colrev_exceptions .RecordNotParsableException :
597+ return None
598+
599+ def _append_with_similarity (
600+ self ,
601+ * ,
602+ records : list [colrev .record .record .Record ],
603+ record : colrev .record .record .Record ,
604+ retrieved_record : colrev .record .record .Record ,
605+ ) -> None :
606+ similarity = self ._get_similarity (
607+ record = record , retrieved_record_dict = retrieved_record .data
608+ )
609+ retrieved_record .data ["_similarity_score" ] = similarity
610+ records .append (retrieved_record )
611+
612+ def _should_stop_iteration (
613+ self , * , jour_vol_iss_list : bool , counter : int , top_n : int
614+ ) -> bool :
615+ if jour_vol_iss_list :
616+ return counter > 200
617+ return counter > top_n + 10
618+
619+ def _prepare_top_n_results (
620+ self , * , candidates : list [colrev .record .record .Record ], top_n : int
621+ ) -> list [colrev .record .record_prep .PrepRecord ]:
622+ best_by_doi : typing .Dict [str , colrev .record .record .Record ] = {}
623+ for rec in candidates :
624+ key = self ._norm_doi (doi = rec .data ["doi" ])
625+ if key not in best_by_doi or rec .data .get (
626+ "_similarity_score" , 0.0
627+ ) > best_by_doi [key ].data .get ("_similarity_score" , 0.0 ):
628+ best_by_doi [key ] = rec
629+
630+ deduped = list (best_by_doi .values ())
631+ deduped .sort (key = lambda r : r .data .get ("_similarity_score" , 0.0 ), reverse = True )
632+
633+ result = []
634+ for rec in deduped [: max (1 , top_n )]:
635+ rec .data .pop ("_similarity_score" , None )
636+ result .append (colrev .record .record_prep .PrepRecord (rec .get_data ()))
637+ return result
638+
639+ def _drop_similarity_scores (
640+ self , * , records : list [colrev .record .record .Record ]
641+ ) -> None :
642+ for rec in records :
598643 rec .data .pop ("_similarity_score" , None )
599- return record_list
644+
645+ @staticmethod
646+ def _norm_doi (* , doi : str ) -> str :
647+ doi = doi .strip ().lower ()
648+ for prefix in ("https://doi.org/" , "http://doi.org/" , "doi:" , "doi.org/" ):
649+ if doi .startswith (prefix ):
650+ return doi [len (prefix ) :]
651+ return doi
600652
601653
602654def query_doi (
0 commit comments