You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: CHANGELOG.md
+2Lines changed: 2 additions & 0 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -39,6 +39,8 @@
39
39
-[CITATION.cff](https://github.com/slimeslab/ComProScanner/blob/main/CITATION.cff) added for standardized citation information based on the latest release and arXiv preprint.
40
40
41
41
### Fixed
42
+
- Empty or corrupted PDFs are now handled in `pdf_processor.py` and `wiley_processor.py` to avoid GLYPH errors during text extraction.
43
+
42
44
- Data extraction failures fixed when the composition-property text data is empty.
43
45
44
46
- CSV progress tracking in `elsevier_processor.py`:
"""Create a row with empty values for PDFs with no text detection.
157
+
158
+
Args:
159
+
doi (str): The DOI of the article (may be empty string).
160
+
title (str): The title of the article.
161
+
journal_name (str): The name of the publication.
162
+
publisher (str): The name of the publisher.
163
+
164
+
Returns:
165
+
pd.DataFrame: DataFrame with metadata and empty section values and is_property_mentioned=0.
166
+
"""
167
+
returnpd.DataFrame(
168
+
[
169
+
{
170
+
"doi": doi,
171
+
"article_title": title,
172
+
"publication_name": journal_name,
173
+
"publisher": publisher,
174
+
"abstract": "",
175
+
"introduction": "",
176
+
"exp_methods": "",
177
+
"comp_methods": "",
178
+
"results_discussion": "",
179
+
"conclusion": "",
180
+
"is_property_mentioned": "0",
181
+
}
182
+
]
183
+
)
184
+
185
+
def _is_corrupted_text(self, text: str) -> bool:
    """Check if the text contains corrupted GLYPH patterns from failed OCR.

    A GLYPH marker looks like ``GLYPH<123>`` in raw text or
    ``GLYPH&lt;123&gt;`` when the extractor HTML-escaped the angle brackets.
    Both spellings are counted.

    Args:
        text (str): The text to check.

    Returns:
        bool: True if the text is empty, whitespace-only, or corrupted
            (GLYPH markers number more than 10% of the whitespace-separated
            word count), False otherwise.
    """
    # Empty/None text carries no usable content, so treat it as corrupted.
    if not text:
        return True

    words = text.split()
    # Whitespace-only text: nothing to analyse (and avoids dividing by zero).
    if not words:
        return True

    # Count GLYPH pattern occurrences (both raw and HTML-escaped brackets).
    glyph_pattern = r"GLYPH(?:<|&lt;)\d+(?:>|&gt;)"
    glyph_count = len(re.findall(glyph_pattern, text))

    # Threshold: strictly more than 10% GLYPH markers relative to the word
    # count indicates corruption.
    glyph_ratio = glyph_count / len(words)
    return glyph_ratio > 0.1
212
+
213
+
def _get_metadata_from_csv(self, doi: str):
    """Try to get metadata from the local metadata CSV file.

    The CSV at ``self.metadata_csv_filename`` is read once and cached on
    ``self.df``; later calls reuse the cached DataFrame.

    Args:
        doi (str): The DOI to search for.

    Returns:
        tuple: (title, journal_name, publisher) taken from the first row
            whose ``doi`` column equals ``doi``. Missing columns and blank
            cells are returned as "". Returns ("", "", "") if the file does
            not exist, no row matches, or reading fails.
    """
    not_found = ("", "", "")
    try:
        if not os.path.exists(self.metadata_csv_filename):
            return not_found

        # Load metadata CSV if not already loaded. getattr() also covers
        # the case where the attribute was never initialised.
        if getattr(self, "df", None) is None:
            self.df = pd.read_csv(self.metadata_csv_filename)

        matching_rows = self.df[self.df["doi"] == doi]
        if matching_rows.empty:
            return not_found

        row = matching_rows.iloc[0]

        def _cell(column):
            # Blank CSV cells are parsed as NaN; normalise them to "" so
            # callers always receive the documented empty-string fallback.
            value = row.get(column, "")
            return "" if pd.isna(value) else value

        return (
            _cell("article_title"),
            _cell("publication_name"),
            _cell("metadata_publisher"),
        )
    except Exception as e:
        # Best-effort lookup: metadata is optional, so log and fall back
        # rather than abort processing.
        logger.warning(f"Error reading metadata from CSV: {e}")
        return not_found
241
+
153
242
defprocess_pdfs(self):
154
243
"""
155
244
Main function to process the PDFs in the folder. It reads the PDFs, extracts the text, and writes the data to CSV file, to the SQL database (if set), and creates a vector database if the keyword is found in the text.
0 commit comments