Skip to content

Commit 9318385

Browse files
committed
fix: GLYPH error/empty pdf handled for pdf & wiley processors
1 parent be06452 commit 9318385

3 files changed

Lines changed: 235 additions & 21 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
- [CITATION.cff](https://github.com/slimeslab/ComProScanner/blob/main/CITATION.cff) added for standardized citation information based on the latest release and arXiv preprint.
4040

4141
### Fixed
42+
- Empty/corrupted PDFs handled in `pdfs_processor.py` and `wiley_processor.py` to avoid GLYPH errors during text extraction.
43+
4244
- Data extraction failures fixed if composition-property text data is empty.
4345

4446
- CSV progress tracking in `elsevier_processor.py`:

src/comproscanner/article_processors/pdfs_processor.py

Lines changed: 147 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,95 @@ def _extract_doi_from_text(self, text: str):
150150
logger.error(f"Error extracting DOI from text: {e}")
151151
return ""
152152

153+
def _create_empty_row(
154+
self, doi: str, title: str = "", journal_name: str = "", publisher: str = ""
155+
):
156+
"""Create a row with empty values for PDFs with no text detection.
157+
158+
Args:
159+
doi (str): The DOI of the article (may be empty string).
160+
title (str): The title of the article.
161+
journal_name (str): The name of the publication.
162+
publisher (str): The name of the publisher.
163+
164+
Returns:
165+
pd.DataFrame: DataFrame with metadata and empty section values and is_property_mentioned=0.
166+
"""
167+
return pd.DataFrame(
168+
[
169+
{
170+
"doi": doi,
171+
"article_title": title,
172+
"publication_name": journal_name,
173+
"publisher": publisher,
174+
"abstract": "",
175+
"introduction": "",
176+
"exp_methods": "",
177+
"comp_methods": "",
178+
"results_discussion": "",
179+
"conclusion": "",
180+
"is_property_mentioned": "0",
181+
}
182+
]
183+
)
184+
185+
def _is_corrupted_text(self, text: str) -> bool:
186+
"""Check if the text contains corrupted GLYPH patterns from failed OCR.
187+
188+
Args:
189+
text (str): The text to check.
190+
191+
Returns:
192+
bool: True if text is corrupted (high ratio of GLYPH patterns), False otherwise.
193+
"""
194+
if not text:
195+
return True
196+
197+
# Count GLYPH pattern occurrences (both raw and HTML-escaped)
198+
glyph_pattern = r"GLYPH(?:<|&lt;)\d+(?:>|&gt;)"
199+
glyph_matches = re.findall(glyph_pattern, text)
200+
glyph_count = len(glyph_matches)
201+
202+
# If there are many GLYPH patterns, the text is corrupted
203+
# Threshold: if GLYPH patterns make up more than 10% of words, consider it corrupted
204+
words = text.split()
205+
word_count = len(words)
206+
207+
if word_count == 0:
208+
return True
209+
210+
glyph_ratio = glyph_count / word_count
211+
return glyph_ratio > 0.1 # More than 10% GLYPH patterns indicates corruption
212+
213+
def _get_metadata_from_csv(self, doi: str):
214+
"""Try to get metadata from the local metadata CSV file.
215+
216+
Args:
217+
doi (str): The DOI to search for.
218+
219+
Returns:
220+
tuple: (title, journal_name, publisher) or ("", "", "") if not found.
221+
"""
222+
try:
223+
if not os.path.exists(self.metadata_csv_filename):
224+
return "", "", ""
225+
226+
# Load metadata CSV if not already loaded
227+
if self.df is None:
228+
self.df = pd.read_csv(self.metadata_csv_filename)
229+
230+
matching_rows = self.df[self.df["doi"] == doi]
231+
if not matching_rows.empty:
232+
row = matching_rows.iloc[0]
233+
title = row.get("article_title", "")
234+
journal_name = row.get("publication_name", "")
235+
publisher = row.get("metadata_publisher", "")
236+
return title, journal_name, publisher
237+
return "", "", ""
238+
except Exception as e:
239+
logger.warning(f"Error reading metadata from CSV: {e}")
240+
return "", "", ""
241+
153242
def process_pdfs(self):
154243
"""
155244
Main function to process the PDFs in the folder. It reads the PDFs, extracts the text, and writes the data to CSV file, to the SQL database (if set), and creates a vector database if the keyword is found in the text.
@@ -168,6 +257,49 @@ def process_pdfs(self):
168257
pdf_to_md = PDFToMarkdownText(source=pdf_file)
169258
md_text = pdf_to_md.convert_to_markdown()
170259

260+
# Handle empty or corrupted text detection result
261+
if md_text is None or not md_text.strip() or self._is_corrupted_text(md_text):
262+
logger.warning(
263+
f"Text detection result is empty or corrupted for {pdf_file}. "
264+
"Storing with is_property_mentioned=0 and skipping vector database creation."
265+
)
266+
# Try to extract DOI from filename if possible
267+
filename = os.path.basename(pdf_file)
268+
self.doi = filename.replace(".pdf", "").replace("_", "/")
269+
self.identifier = self.doi
270+
271+
# Try to get metadata from local CSV
272+
title, journal_name, publisher = "", "", ""
273+
if self.doi:
274+
title, journal_name, publisher = self._get_metadata_from_csv(self.doi)
275+
276+
row = self._create_empty_row(
277+
self.doi, title, journal_name, publisher
278+
)
279+
sql_dataframes.append(row)
280+
csv_dataframes.append(row)
281+
282+
if len(sql_dataframes) == self.sql_batch_size:
283+
final_sql_df = pd.concat(sql_dataframes, ignore_index=True)
284+
if self.is_sql_db:
285+
self.sql_db_manager.write_to_sql_db(
286+
self.paperdata_table_name, final_sql_df
287+
)
288+
sql_dataframes = []
289+
time.sleep(5)
290+
if len(csv_dataframes) == self.csv_batch_size:
291+
final_csv_df = pd.concat(csv_dataframes, ignore_index=True)
292+
self.csv_db_manager.write_to_csv(
293+
final_csv_df,
294+
self.csv_path,
295+
self.keyword,
296+
self.source,
297+
self.csv_batch_size,
298+
)
299+
csv_dataframes = []
300+
time.sleep(5)
301+
continue
302+
171303
# Extract DOI from the converted markdown text
172304
self.doi = self._extract_doi_from_text(md_text)
173305

@@ -182,14 +314,13 @@ def process_pdfs(self):
182314
filename = os.path.basename(pdf_file)
183315
self.identifier = filename.replace(".pdf", "")
184316

185-
# Get metadata from external API using DOI
317+
# Get metadata from local CSV using DOI
186318
title, journal_name, publisher = "", "", ""
187319
if self.doi:
188-
title, journal_name, publisher = get_paper_metadata_from_oaworks(
189-
self.doi
190-
)
320+
title, journal_name, publisher = self._get_metadata_from_csv(self.doi)
321+
191322
if not title:
192-
logger.warning(f"Metadata not found for DOI: {self.doi}")
323+
logger.warning(f"Metadata not found in CSV for DOI: {self.doi}")
193324

194325
# Process sections
195326
all_sections = pdf_to_md.clean_text(md_text)
@@ -208,20 +339,28 @@ def process_pdfs(self):
208339

209340
if row["is_property_mentioned"].iloc[0] == "1":
210341
self.valid_property_articles += 1
342+
211343
if len(sql_dataframes) == self.sql_batch_size:
212-
final_df = pd.concat(sql_dataframes, ignore_index=True)
344+
final_sql_df = pd.concat(sql_dataframes, ignore_index=True)
213345
if self.is_sql_db:
214346
self.sql_db_manager.write_to_sql_db(
215-
self.paperdata_table_name, final_df
347+
self.paperdata_table_name, final_sql_df
216348
)
217349
sql_dataframes = []
218350
time.sleep(5)
351+
219352
if len(csv_dataframes) == self.csv_batch_size:
353+
final_csv_df = pd.concat(csv_dataframes, ignore_index=True)
220354
self.csv_db_manager.write_to_csv(
221-
final_df, self.csv_path, self.keyword, self.source
355+
final_csv_df,
356+
self.csv_path,
357+
self.keyword,
358+
self.source,
359+
self.csv_batch_size,
222360
)
223361
csv_dataframes = []
224362
time.sleep(5)
363+
225364
time.sleep(0.2)
226365

227366
except KeyboardInterrupt as kie:

src/comproscanner/article_processors/wiley_processor.py

Lines changed: 86 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import sys
1313
import time
1414
import tempfile
15+
import re
1516

1617
# third-party library imports
1718
import requests
@@ -150,6 +151,34 @@ def __init__(
150151
self.vector_db_manager = VectorDatabaseManager(rag_config=self.rag_config)
151152
self.is_exceeded = False
152153

154+
def _is_corrupted_text(self, text: str) -> bool:
155+
"""Check if the text contains corrupted GLYPH patterns from failed OCR.
156+
157+
Args:
158+
text (str): The text to check.
159+
160+
Returns:
161+
bool: True if text is corrupted (high ratio of GLYPH patterns), False otherwise.
162+
"""
163+
if not text:
164+
return True
165+
166+
# Count GLYPH pattern occurrences (both raw and HTML-escaped)
167+
glyph_pattern = r"GLYPH(?:<|&lt;)\d+(?:>|&gt;)"
168+
glyph_matches = re.findall(glyph_pattern, text)
169+
glyph_count = len(glyph_matches)
170+
171+
# If there are many GLYPH patterns, the text is corrupted
172+
# Threshold: if GLYPH patterns make up more than 10% of words, consider it corrupted
173+
words = text.split()
174+
word_count = len(words)
175+
176+
if word_count == 0:
177+
return True
178+
179+
glyph_ratio = glyph_count / word_count
180+
return glyph_ratio > 0.1 # More than 10% GLYPH patterns indicates corruption
181+
153182
def _load_and_preprocess_data(self):
154183
"""
155184
Load and preprocess the metadata CSV file to get the DOIs of the articles to process.
@@ -419,12 +448,19 @@ def _process_articles(self):
419448

420449
# Download PDF
421450
file_path = self._send_request(row["doi"])
422-
if file_path is None:
423-
logger.warning(f"Failed to download PDF for DOI {row['doi']}")
424-
continue
425451

426-
# Handle "Not Found" articles
427-
if file_path == "Not Found":
452+
# Handle failed downloads and "Not Found" articles
453+
if file_path is None or file_path == "Not Found":
454+
if file_path is None:
455+
logger.warning(
456+
f"Failed to download PDF for DOI {row['doi']}. "
457+
"Storing with is_property_mentioned=0."
458+
)
459+
else:
460+
logger.warning(
461+
f"Article not found for DOI {row['doi']}. "
462+
"Storing with is_property_mentioned=0."
463+
)
428464
empty_data = {
429465
"doi": row["doi"],
430466
"article_title": row["article_title"],
@@ -462,19 +498,20 @@ def _process_articles(self):
462498
time.sleep(5)
463499
continue
464500

465-
# Get metadata
466-
title, journal_name, publisher = get_paper_metadata_from_oaworks(
467-
row["doi"]
468-
)
501+
# Get metadata from the metadata CSV row
502+
title = row.get("article_title", "")
503+
journal_name = row.get("publication_name", "")
504+
publisher = row.get("metadata_publisher", "")
469505

470506
# Convert PDF to Markdown
471507
pdf_to_md = PDFToMarkdownText(file_path)
472508
md_text = pdf_to_md.convert_to_markdown()
473509

474-
# Check if conversion was successful
475-
if md_text is None:
476-
logger.error(
477-
f"Failed to convert PDF to markdown for DOI {row['doi']}"
510+
# Check if conversion was successful or text detection is empty/corrupted
511+
if md_text is None or not md_text.strip() or self._is_corrupted_text(md_text):
512+
logger.warning(
513+
f"Text detection result is empty or corrupted for DOI {doi}. "
514+
"Storing with is_property_mentioned=0 and skipping vector database creation."
478515
)
479516
# Clean up temporary file if it exists
480517
if not self.is_save_pdf and os.path.exists(file_path):
@@ -484,6 +521,42 @@ def _process_articles(self):
484521
logger.warning(
485522
f"Failed to remove temp file {file_path}: {e}"
486523
)
524+
# Store empty row with DOI, metadata and is_property_mentioned=0
525+
empty_data = {
526+
"doi": doi,
527+
"article_title": title,
528+
"publication_name": journal_name,
529+
"publisher": publisher,
530+
"abstract": "",
531+
"introduction": "",
532+
"exp_methods": "",
533+
"comp_methods": "",
534+
"results_discussion": "",
535+
"conclusion": "",
536+
"is_property_mentioned": "0",
537+
}
538+
empty_row = pd.DataFrame([empty_data])
539+
sql_dataframes.append(empty_row)
540+
csv_dataframes.append(empty_row)
541+
if len(sql_dataframes) == self.sql_batch_size:
542+
final_sql_df = pd.concat(sql_dataframes, ignore_index=True)
543+
if self.is_sql_db:
544+
self.sql_db_manager.write_to_sql_db(
545+
self.paperdata_table_name, final_sql_df
546+
)
547+
sql_dataframes = []
548+
time.sleep(5)
549+
if len(csv_dataframes) == self.csv_batch_size:
550+
final_csv_df = pd.concat(csv_dataframes, ignore_index=True)
551+
self.csv_db_manager.write_to_csv(
552+
final_csv_df,
553+
self.csv_path,
554+
self.keyword,
555+
self.source,
556+
self.csv_batch_size,
557+
)
558+
csv_dataframes = []
559+
time.sleep(5)
487560
continue
488561

489562
# Process the markdown text

0 commit comments

Comments
 (0)