Skip to content

Commit e6ad53a

Browse files
author
Gerit Wagner
committed
files_dir: set ENTRYTYPE
1 parent f508e4f commit e6ad53a

1 file changed

Lines changed: 38 additions & 9 deletions

File tree

colrev/packages/files_dir/src/files_dir.py

Lines changed: 38 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -278,14 +278,19 @@ def _get_grobid_metadata(self, *, file_path: Path) -> dict:
278278

279279
with pymupdf.open(file_path) as doc:
280280
pages_in_file = doc.page_count
281-
if pages_in_file < 6:
282-
record = colrev.record.record_pdf.PDFRecord(
283-
record_dict, path=self.review_manager.path
284-
)
285-
record.set_text_from_pdf(first_pages=True)
286-
record_dict = record.get_data()
287-
if Fields.TEXT_FROM_PDF in record_dict:
288-
text: str = record_dict[Fields.TEXT_FROM_PDF]
281+
record = colrev.record.record_pdf.PDFRecord(
282+
record_dict, path=self.review_manager.path
283+
)
284+
record.set_text_from_pdf(first_pages=True)
285+
record_dict = record.get_data()
286+
if Fields.TEXT_FROM_PDF in record_dict:
287+
text: str = record_dict[Fields.TEXT_FROM_PDF]
288+
if "conference" in text.replace(" ", "").lower():
289+
record_dict[Fields.ENTRYTYPE] = ENTRYTYPES.INPROCEEDINGS
290+
if "journal" in text.replace(" ", "").lower():
291+
record_dict[Fields.ENTRYTYPE] = ENTRYTYPES.ARTICLE
292+
293+
if pages_in_file < 6:
289294
if "bookreview" in text.replace(" ", "").lower():
290295
record_dict[Fields.ENTRYTYPE] = ENTRYTYPES.MISC
291296
record_dict["note"] = "Book review"
@@ -301,10 +306,12 @@ def _get_grobid_metadata(self, *, file_path: Path) -> dict:
301306
if "withdrawal" in text.replace(" ", "").lower():
302307
record_dict[Fields.ENTRYTYPE] = ENTRYTYPES.MISC
303308
record_dict["note"] = "Withdrawal"
304-
del record_dict[Fields.TEXT_FROM_PDF]
305309
# else:
306310
# print(f'text extraction error in {record_dict[Fields.ID]}')
307311

312+
record_dict.pop(Fields.TEXT_FROM_PDF, None)
313+
record_dict.pop(Fields.NR_PAGES_IN_FILE, None)
314+
308315
record_dict = {k: v for k, v in record_dict.items() if v is not None}
309316
record_dict = {k: v for k, v in record_dict.items() if v != "NA"}
310317

@@ -885,4 +892,26 @@ def prepare(
885892
record.data[Fields.TITLE] = FieldValues.UNKNOWN
886893
record.set_status(RecordState.md_needs_manual_preparation)
887894

895+
self._prep_journal_specific(record)
896+
888897
return record
898+
899+
def _prep_journal_specific(
    self, record: colrev.record.record_prep.PrepRecord
) -> None:
    """Apply journal-specific preparation rules to ``record`` in place.

    The only rule implemented here targets Communications of the
    Association for Information Systems (CAIS), identified either by a DOI
    starting with ``10.17705/1CAIS`` or by the exact journal name. For such
    records, the text extracted for page index 0 of the PDF is searched for
    "Education Article"; when found, ``record.data["category"]`` is set to
    that value.

    Records of other journals are left untouched. A PDF that raises
    ``InvalidPDFException`` is skipped silently (best effort).
    """
    is_cais = (
        record.data.get(Fields.DOI, "").startswith("10.17705/1CAIS")
        or record.data.get(Fields.JOURNAL, "")
        == "Communications of the Association for Information Systems"
    )
    # Check the cheap metadata first: the PDF only needs to be opened and
    # parsed for records the rule can actually apply to.
    if not is_cais:
        return

    # Keep the try body limited to the calls that touch the PDF.
    try:
        pdf_record = colrev.record.record_pdf.PDFRecord(
            record.data, path=self.review_manager.path
        )
        first_page = pdf_record.extract_text_by_page(pages=[0])
    except colrev_exceptions.InvalidPDFException:
        # Unreadable PDF: leave the record unchanged.
        return

    if "Education Article" in first_page:
        record.data["category"] = "Education Article"

0 commit comments

Comments
 (0)