@@ -268,10 +268,18 @@ def process_pdfs(self):
268268 self .doi = filename .replace (".pdf" , "" ).replace ("_" , "/" )
269269 self .identifier = self .doi
270270
271- # Try to get metadata from local CSV
271+ # Try to get metadata (API first, then CSV)
272272 title , journal_name , publisher = "" , "" , ""
273- if self .doi :
274- title , journal_name , publisher = self ._get_metadata_from_csv (self .doi )
273+ if self .doi .startswith ("10." ):
274+ title , journal_name , publisher = get_paper_metadata_from_oaworks (
275+ self .doi
276+ )
277+
278+ if not title or not journal_name or not publisher :
279+ csv_title , csv_journal , csv_publisher = self ._get_metadata_from_csv (self .doi )
280+ title = title or csv_title
281+ journal_name = journal_name or csv_journal
282+ publisher = publisher or csv_publisher
275283
276284 row = self ._create_empty_row (
277285 self .doi , title , journal_name , publisher
@@ -314,13 +322,21 @@ def process_pdfs(self):
314322 filename = os .path .basename (pdf_file )
315323 self .identifier = filename .replace (".pdf" , "" )
316324
317- # Get metadata from local CSV using DOI
325+ # Get metadata from external API (with CSV fallback) using DOI
318326 title , journal_name , publisher = "" , "" , ""
319327 if self .doi :
320- title , journal_name , publisher = self ._get_metadata_from_csv (self .doi )
328+ title , journal_name , publisher = get_paper_metadata_from_oaworks (
329+ self .doi
330+ )
331+
332+ if not title or not journal_name or not publisher :
333+ csv_title , csv_journal , csv_publisher = self ._get_metadata_from_csv (self .doi )
334+ title = title or csv_title
335+ journal_name = journal_name or csv_journal
336+ publisher = publisher or csv_publisher
321337
322338 if not title :
323- logger .warning (f"Metadata not found in CSV for DOI: { self .doi } " )
339+ logger .warning (f"Metadata not found for DOI: { self .doi } " )
324340
325341 # Process sections
326342 all_sections = pdf_to_md .clean_text (md_text )
0 commit comments