Skip to content

Commit 6fac222

Browse files
authored
Merge pull request #24 from MEITREX/pdfprocessor_fix
PdfProcessor: Small bugfix
2 parents 8de26b3 + dda927e commit 6fac222

1 file changed

Lines changed: 8 additions & 1 deletion

File tree

fileextractlib/PdfProcessor.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ class PdfProcessor:
1919

2020
def __init__(self):
2121
tika.initVM()
22+
tika_config.getParsers()
2223

2324
def process_from_io(self, file: typing.BinaryIO) -> DocumentData:
2425
"""
@@ -47,7 +48,13 @@ def process_from_io(self, file: typing.BinaryIO) -> DocumentData:
4748
with io.BytesIO() as page_pdf_bytes:
4849
page_pdf_writer.write(page_pdf_bytes)
4950
page_pdf_bytes.seek(0)
50-
page_text = tika.parser.from_buffer(page_pdf_bytes)["content"].strip()
51+
page_text = tika.parser.from_buffer(page_pdf_bytes,
52+
headers={ "X-Tika-PDFextractInlineImages": "true" })["content"]
53+
54+
if page_text is None:
55+
continue
56+
57+
page_text = page_text.strip()
5158

5259
if page_text == "":
5360
continue

0 commit comments

Comments
 (0)