We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
2 parents 8de26b3 + dda927e; commit 6fac222 (Copy full SHA for 6fac222)
1 file changed
fileextractlib/PdfProcessor.py
@@ -19,6 +19,7 @@ class PdfProcessor:
19
20
def __init__(self):
21
tika.initVM()
22
+ tika_config.getParsers()
23
24
def process_from_io(self, file: typing.BinaryIO) -> DocumentData:
25
"""
@@ -47,7 +48,13 @@ def process_from_io(self, file: typing.BinaryIO) -> DocumentData:
47
48
with io.BytesIO() as page_pdf_bytes:
49
page_pdf_writer.write(page_pdf_bytes)
50
page_pdf_bytes.seek(0)
- page_text = tika.parser.from_buffer(page_pdf_bytes)["content"].strip()
51
+ page_text = tika.parser.from_buffer(page_pdf_bytes,
52
+ headers={ "X-Tika-PDFextractInlineImages": "true" })["content"]
53
+
54
+ if page_text is None:
55
+ continue
56
57
+ page_text = page_text.strip()
58
59
if page_text == "":
60
continue
0 commit comments