@@ -278,14 +278,19 @@ def _get_grobid_metadata(self, *, file_path: Path) -> dict:
278278
279279 with pymupdf .open (file_path ) as doc :
280280 pages_in_file = doc .page_count
281- if pages_in_file < 6 :
282- record = colrev .record .record_pdf .PDFRecord (
283- record_dict , path = self .review_manager .path
284- )
285- record .set_text_from_pdf (first_pages = True )
286- record_dict = record .get_data ()
287- if Fields .TEXT_FROM_PDF in record_dict :
288- text : str = record_dict [Fields .TEXT_FROM_PDF ]
281+ record = colrev .record .record_pdf .PDFRecord (
282+ record_dict , path = self .review_manager .path
283+ )
284+ record .set_text_from_pdf (first_pages = True )
285+ record_dict = record .get_data ()
286+ if Fields .TEXT_FROM_PDF in record_dict :
287+ text : str = record_dict [Fields .TEXT_FROM_PDF ]
288+ if "conference" in text .replace (" " , "" ).lower ():
289+ record_dict [Fields .ENTRYTYPE ] = ENTRYTYPES .INPROCEEDINGS
290+ if "journal" in text .replace (" " , "" ).lower ():
291+ record_dict [Fields .ENTRYTYPE ] = ENTRYTYPES .ARTICLE
292+
293+ if pages_in_file < 6 :
289294 if "bookreview" in text .replace (" " , "" ).lower ():
290295 record_dict [Fields .ENTRYTYPE ] = ENTRYTYPES .MISC
291296 record_dict ["note" ] = "Book review"
@@ -301,10 +306,12 @@ def _get_grobid_metadata(self, *, file_path: Path) -> dict:
301306 if "withdrawal" in text .replace (" " , "" ).lower ():
302307 record_dict [Fields .ENTRYTYPE ] = ENTRYTYPES .MISC
303308 record_dict ["note" ] = "Withdrawal"
304- del record_dict [Fields .TEXT_FROM_PDF ]
305309 # else:
306310 # print(f'text extraction error in {record_dict[Fields.ID]}')
307311
312+ record_dict .pop (Fields .TEXT_FROM_PDF , None )
313+ record_dict .pop (Fields .NR_PAGES_IN_FILE , None )
314+
308315 record_dict = {k : v for k , v in record_dict .items () if v is not None }
309316 record_dict = {k : v for k , v in record_dict .items () if v != "NA" }
310317
@@ -885,4 +892,26 @@ def prepare(
885892 record .data [Fields .TITLE ] = FieldValues .UNKNOWN
886893 record .set_status (RecordState .md_needs_manual_preparation )
887894
895+ self ._prep_journal_specific (record )
896+
888897 return record
898+
def _prep_journal_specific(
    self, record: colrev.record.record_prep.PrepRecord
) -> None:
    """Apply journal-specific metadata enrichment to ``record`` in place.

    Only one journal is handled: records whose DOI starts with
    ``10.17705/1CAIS`` or whose journal field equals
    "Communications of the Association for Information Systems".
    For those records, the text extracted for page index 0 of the PDF is
    searched for the marker "Education Article"; when it is found,
    ``record.data["category"]`` is set to that value.

    The step is best-effort: an ``InvalidPDFException`` raised while
    loading or reading the PDF leaves the record unchanged.

    Args:
        record: The record to enrich; ``record.data`` may be modified.
    """
    is_cais = (
        record.data.get(Fields.DOI, "").startswith("10.17705/1CAIS")
        or record.data.get(Fields.JOURNAL, "")
        == "Communications of the Association for Information Systems"
    )
    # Evaluate the cheap metadata condition first so that the PDF is only
    # opened and parsed for records that can actually be affected.
    if not is_cais:
        return

    # Keep the try body limited to the calls that touch the PDF.
    try:
        pdf_record = colrev.record.record_pdf.PDFRecord(
            record.data, path=self.review_manager.path
        )
        first_page = pdf_record.extract_text_by_page(pages=[0])
    except colrev_exceptions.InvalidPDFException:
        # Deliberately ignored: the category is optional extra metadata.
        return

    if "Education Article" in first_page:
        record.data["category"] = "Education Article"
0 commit comments