Skip to content

Commit f7468a0

Browse files
fix: update
1 parent 9da4604 commit f7468a0

1 file changed

Lines changed: 26 additions & 5 deletions

File tree

test_unstructured/partition/test_auto.py

Lines changed: 26 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import os
77
import pathlib
88
import tempfile
9+
import time
910
import warnings
1011
from importlib import import_module
1112
from typing import Iterator
@@ -1337,11 +1338,31 @@ def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
13371338
)
13381339
def test_auto_partition_detects_pdf_language_per_element(strategy):
13391340
filename = example_doc_path("language-docs/fr_olap.pdf")
1340-
elements = partition(
1341-
filename=filename,
1342-
strategy=strategy,
1343-
detect_language_per_element=True,
1344-
)
1341+
1342+
def _partition() -> list[Element]:
1343+
return partition(
1344+
filename=filename,
1345+
strategy=strategy,
1346+
detect_language_per_element=True,
1347+
)
1348+
1349+
# OCR_ONLY shells out to Tesseract with a temp PNG; under CI load the file can disappear
1350+
# before Tesseract reads it ("cannot read input file"). Retry a few times on that flake.
1351+
if strategy == PartitionStrategy.OCR_ONLY:
1352+
from unstructured_pytesseract import TesseractError
1353+
1354+
elements: list[Element] | None = None
1355+
for attempt in range(3):
1356+
try:
1357+
elements = _partition()
1358+
break
1359+
except TesseractError as e:
1360+
if attempt == 2 or "cannot read input file" not in str(e).lower():
1361+
raise
1362+
time.sleep(0.25 * (attempt + 1))
1363+
assert elements is not None
1364+
else:
1365+
elements = _partition()
13451366

13461367
assert len(elements) > 0
13471368
assert elements[0].metadata.languages == ["fra"]

0 commit comments

Comments
 (0)