Fix sort_page_element: ensure that sorting is stable and not random. (#3978)

pprados · web-flow · commit d570f4624bb8 · 2025-04-07T15:57:20.000Z
sort_page_element() used the element id as the last sort key.
As a result, two executions of the same code on the same file produced
different results: the order of the elements was random.
This made it impossible to write stable unit tests, for example, or to
obtain reproducible results. This change removes the id from the sort key.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,9 @@
 ### Features
 
 ### Fixes
+- **Make sort_page_element() output reproducible.** The function used the element id as a sort key,
+  so two executions of the same code on the same file produced different results: the order of the elements was random.
+  This made it impossible to write stable unit tests, for example, or to obtain reproducible results.
 - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`)
 
 ## 0.17.5
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1603,3 +1603,24 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
 
     assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
     assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
+
+
+def test_reproductible_pdf_loader():
+    """Partitioning the same PDF several times must give the same elements in the same order."""
+    # No glob(): a missing file would yield no paths and let the test pass vacuously.
+    f = example_doc_path("pdf/layout-parser-paper.pdf")
+    elements_1 = pdf.partition_pdf(
+        filename=f,
+        strategy=PartitionStrategy.AUTO,
+        infer_table_structure=False,
+    )
+    for _ in range(4):
+        elements_2 = pdf.partition_pdf(
+            filename=f,
+            strategy=PartitionStrategy.AUTO,
+            infer_table_structure=False,
+        )
+        # zip() stops at the shorter list, so compare lengths before comparing texts.
+        assert len(elements_1) == len(elements_2), f"{f=}: element count differs between runs"
+        for e1, e2 in zip(elements_1, elements_2):
+            assert e1.text == e2.text, f"load two time {f=} return differents results"
diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py
@@ -179,7 +179,6 @@ def _coords_ok(strict_points: bool):
             key=lambda el: (
                 el.metadata.coordinates.points[0][1] if el.metadata.coordinates else float("inf"),
                 el.metadata.coordinates.points[0][0] if el.metadata.coordinates else float("inf"),
-                el.id,
             ),
         )
     else:

Original file line number	Diff line number	Diff line change
`@@ -179,7 +179,6 @@ def _coords_ok(strict_points: bool):`
`179`	`179`	`key=lambda el: (`
`180`	`180`	`el.metadata.coordinates.points[0][1] if el.metadata.coordinates else float("inf"),`
`181`	`181`	`el.metadata.coordinates.points[0][0] if el.metadata.coordinates else float("inf"),`
`182`		`- el.id,`
`183`	`182`	`),`
`184`	`183`	`)`
`185`	`184`	`else:`