Skip to content

Commit a1880b4

Browse files
committed
fix: fetching metadata from oaworks reimplemented for pdf processor only
1 parent 9318385 commit a1880b4

1 file changed

Lines changed: 22 additions & 6 deletions

File tree

src/comproscanner/article_processors/pdfs_processor.py

Lines changed: 22 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -268,10 +268,18 @@ def process_pdfs(self):
268268
self.doi = filename.replace(".pdf", "").replace("_", "/")
269269
self.identifier = self.doi
270270

271-
# Try to get metadata from local CSV
271+
# Try to get metadata (API first, then CSV)
272272
title, journal_name, publisher = "", "", ""
273-
if self.doi:
274-
title, journal_name, publisher = self._get_metadata_from_csv(self.doi)
273+
if self.doi.startswith("10."):
274+
title, journal_name, publisher = get_paper_metadata_from_oaworks(
275+
self.doi
276+
)
277+
278+
if not title or not journal_name or not publisher:
279+
csv_title, csv_journal, csv_publisher = self._get_metadata_from_csv(self.doi)
280+
title = title or csv_title
281+
journal_name = journal_name or csv_journal
282+
publisher = publisher or csv_publisher
275283

276284
row = self._create_empty_row(
277285
self.doi, title, journal_name, publisher
@@ -314,13 +322,21 @@ def process_pdfs(self):
314322
filename = os.path.basename(pdf_file)
315323
self.identifier = filename.replace(".pdf", "")
316324

317-
# Get metadata from local CSV using DOI
325+
# Get metadata from external API (with CSV fallback) using DOI
318326
title, journal_name, publisher = "", "", ""
319327
if self.doi:
320-
title, journal_name, publisher = self._get_metadata_from_csv(self.doi)
328+
title, journal_name, publisher = get_paper_metadata_from_oaworks(
329+
self.doi
330+
)
331+
332+
if not title or not journal_name or not publisher:
333+
csv_title, csv_journal, csv_publisher = self._get_metadata_from_csv(self.doi)
334+
title = title or csv_title
335+
journal_name = journal_name or csv_journal
336+
publisher = publisher or csv_publisher
321337

322338
if not title:
323-
logger.warning(f"Metadata not found in CSV for DOI: {self.doi}")
339+
logger.warning(f"Metadata not found for DOI: {self.doi}")
324340

325341
# Process sections
326342
all_sections = pdf_to_md.clean_text(md_text)

0 commit comments

Comments
 (0)