File tree Expand file tree Collapse file tree
test_unstructured/partition Expand file tree Collapse file tree Original file line number Diff line number Diff line change 66import os
77import pathlib
88import tempfile
9+ import time
910import warnings
1011from importlib import import_module
1112from typing import Iterator
@@ -1337,11 +1338,31 @@ def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
13371338)
13381339def test_auto_partition_detects_pdf_language_per_element (strategy ):
13391340 filename = example_doc_path ("language-docs/fr_olap.pdf" )
1340- elements = partition (
1341- filename = filename ,
1342- strategy = strategy ,
1343- detect_language_per_element = True ,
1344- )
1341+
1342+ def _partition () -> list [Element ]:
1343+ return partition (
1344+ filename = filename ,
1345+ strategy = strategy ,
1346+ detect_language_per_element = True ,
1347+ )
1348+
1349+ # OCR_ONLY shells out to Tesseract with a temp PNG; under CI load the file can disappear
1350+ # before Tesseract reads it ("cannot read input file"). Retry a few times on that flake.
1351+ if strategy == PartitionStrategy .OCR_ONLY :
1352+ from unstructured_pytesseract import TesseractError
1353+
1354+ elements : list [Element ] | None = None
1355+ for attempt in range (3 ):
1356+ try :
1357+ elements = _partition ()
1358+ break
1359+ except TesseractError as e :
1360+ if attempt == 2 or "cannot read input file" not in str (e ).lower ():
1361+ raise
1362+ time .sleep (0.25 * (attempt + 1 ))
1363+ assert elements is not None
1364+ else :
1365+ elements = _partition ()
13451366
13461367 assert len (elements ) > 0
13471368 assert elements [0 ].metadata .languages == ["fra" ]
You can’t perform that action at this time.
0 commit comments