Skip to content

Commit fa780b8

Browse files
committed
GPT 5.4 review
1 parent 3307135 commit fa780b8

File tree

5 files changed

+99
-12
lines changed

5 files changed

+99
-12
lines changed

.github/workflows/ci.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ jobs:
3232
python-version: ${{ matrix.python-version }}
3333
- name: Install dependencies
3434
env:
35+
UV_LOCKED: "1"
3536
UV_PYTHON: ${{ matrix.python-version }}
3637
run: make install
3738
- name: Run unit tests
@@ -51,6 +52,7 @@ jobs:
5152
python-version: "3.13"
5253
- name: Install dependencies
5354
env:
55+
UV_LOCKED: "1"
5456
UV_PYTHON: "3.13"
5557
run: make install
5658
- name: Lint
@@ -74,6 +76,7 @@ jobs:
7476
python-version: ${{ matrix.python-version }}
7577
- name: Install dependencies
7678
env:
79+
UV_LOCKED: "1"
7780
UV_PYTHON: ${{ matrix.python-version }}
7881
run: make install
7982
- name: Run integration tests
@@ -98,6 +101,7 @@ jobs:
98101
python-version: ${{ matrix.python-version }}
99102
- name: Install dependencies
100103
env:
104+
UV_LOCKED: "1"
101105
UV_PYTHON: ${{ matrix.python-version }}
102106
run: |
103107
make install

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ DOCKER_IMAGE ?= downloads.unstructured.io/unstructured-io/unstructured-api:lates
1111
.PHONY: install
1212
install:
1313
python scripts/prepare_readme.py
14-
uv sync
14+
uv sync --locked
1515

1616
## install-speakeasy-cli: download the speakeasy cli tool
1717
.PHONY: install-speakeasy-cli

_test_unstructured_client/integration/test_decorators.py

Lines changed: 63 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
22

3+
from collections import Counter, defaultdict
4+
import math
35
import tempfile
46
from pathlib import Path
57
from typing import Literal
@@ -26,25 +28,77 @@
2628
_HI_RES_STRATEGIES = ("hi_res", Strategy.HI_RES)
2729

2830

31+
def _allowed_delta(expected: int, *, absolute: int, ratio: float) -> int:
32+
return max(absolute, math.ceil(expected * ratio))
33+
34+
35+
def _text_size(elements) -> int:
36+
return sum(len((element.get("text") or "").strip()) for element in elements)
37+
38+
39+
def _elements_by_page(elements):
40+
pages = defaultdict(list)
41+
for element in elements:
42+
pages[element["metadata"]["page_number"]].append(element)
43+
return pages
44+
45+
46+
def _assert_hi_res_output_is_similar(resp_split, resp_single):
    """Assert that split and single-request hi_res responses are close.

    The two responses are not required to be identical. They must cover the
    same set of page numbers and the same set of element types, and stay
    within tolerance on element count and stripped text length, both for the
    whole document and for every page. Each tolerance comes from
    ``_allowed_delta`` applied to the single-request value.

    Args:
        resp_split: Response produced with PDF splitting. Must expose
            ``elements``, a sized collection of elements indexable by
            ``"type"`` and ``"metadata"``.
        resp_single: Response produced by a single request; used as the
            baseline for every tolerance.

    Raises:
        AssertionError: If any check fails. The message names the metric
            (and page or element type, where relevant), both observed values
            and the allowed delta.
    """
    split_elements = resp_split.elements
    single_elements = resp_single.elements
    split_pages = _elements_by_page(split_elements)
    single_pages = _elements_by_page(single_elements)

    # Page coverage must match exactly; the per-page loop below relies on it.
    assert set(split_pages) == set(single_pages), (
        "Page coverage differs between split and single responses; "
        f"pages present in only one of them: {set(split_pages) ^ set(single_pages)}"
    )

    count_tolerance = _allowed_delta(len(single_elements), absolute=4, ratio=0.1)
    assert abs(len(split_elements) - len(single_elements)) <= count_tolerance, (
        f"Element count diverged too far: {len(split_elements)} vs "
        f"{len(single_elements)} (allowed delta {count_tolerance})"
    )

    split_type_counts = Counter(element["type"] for element in split_elements)
    single_type_counts = Counter(element["type"] for element in single_elements)
    assert set(split_type_counts) == set(single_type_counts), (
        "Element types differ between split and single responses; "
        f"types present in only one of them: "
        f"{set(split_type_counts) ^ set(single_type_counts)}"
    )
    for element_type, expected_count in single_type_counts.items():
        type_tolerance = _allowed_delta(expected_count, absolute=2, ratio=0.2)
        actual_count = split_type_counts[element_type]
        assert abs(actual_count - expected_count) <= type_tolerance, (
            f"Count of {element_type!r} elements diverged too far: "
            f"{actual_count} vs {expected_count} (allowed delta {type_tolerance})"
        )

    # Compute each text size once; it is needed for both the comparison and
    # the tolerance.
    split_text_size = _text_size(split_elements)
    single_text_size = _text_size(single_elements)
    text_tolerance = _allowed_delta(single_text_size, absolute=250, ratio=0.2)
    assert abs(split_text_size - single_text_size) <= text_tolerance, (
        f"Total text size diverged too far: {split_text_size} vs "
        f"{single_text_size} (allowed delta {text_tolerance})"
    )

    for page_number, single_page_elements in single_pages.items():
        split_page_elements = split_pages[page_number]

        page_count_tolerance = _allowed_delta(
            len(single_page_elements),
            absolute=2,
            ratio=0.2,
        )
        assert (
            abs(len(split_page_elements) - len(single_page_elements))
            <= page_count_tolerance
        ), (
            f"Element count on page {page_number} diverged too far: "
            f"{len(split_page_elements)} vs {len(single_page_elements)} "
            f"(allowed delta {page_count_tolerance})"
        )

        split_page_text = _text_size(split_page_elements)
        single_page_text = _text_size(single_page_elements)
        page_text_tolerance = _allowed_delta(
            single_page_text,
            absolute=120,
            ratio=0.3,
        )
        assert abs(split_page_text - single_page_text) <= page_text_tolerance, (
            f"Text size on page {page_number} diverged too far: "
            f"{split_page_text} vs {single_page_text} "
            f"(allowed delta {page_text_tolerance})"
        )
87+
88+
2989
def _assert_split_unsplit_equivalent(resp_split, resp_single, strategy, extra_exclude_paths=None):
3090
"""Compare split-PDF and single-request responses.
3191
3292
For hi_res (OCR-based), splitting changes per-page context so text and
33-
element counts can vary slightly. We only check structural equivalence.
93+
OCR text can vary slightly. We still check page coverage, type distribution,
94+
and text volume so split requests cannot silently drift too far.
3495
For deterministic strategies (fast, etc.) we keep strict DeepDiff equality.
3596
"""
3697
assert resp_split.status_code == resp_single.status_code
3798
assert resp_split.content_type == resp_single.content_type
3899

39100
if strategy in _HI_RES_STRATEGIES:
40-
count_diff = abs(len(resp_split.elements) - len(resp_single.elements))
41-
assert count_diff <= 10, (
42-
f"Element count diverged too far: "
43-
f"{len(resp_split.elements)} vs {len(resp_single.elements)}"
44-
)
45-
split_pages = {e["metadata"]["page_number"] for e in resp_split.elements}
46-
single_pages = {e["metadata"]["page_number"] for e in resp_single.elements}
47-
assert split_pages == single_pages
101+
_assert_hi_res_output_is_similar(resp_split, resp_single)
48102
else:
49103
assert len(resp_split.elements) == len(resp_single.elements)
50104

_test_unstructured_client/integration/test_integration.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os
66
from pathlib import Path
77

8+
from deepdiff import DeepDiff
89
import pytest
910
from unstructured_client import UnstructuredClient
1011
from unstructured_client.models import shared, operations
@@ -134,7 +135,8 @@ async def test_partition_async_returns_elements(client, doc_path):
134135
async def test_partition_async_processes_concurrent_files(client, doc_path):
135136
"""
136137
Assert that partition_async can be used to send multiple files concurrently.
137-
Sends two page ranges via asyncio.gather and verifies both return valid results.
138+
Send two page ranges serially and then via asyncio.gather.
139+
Both execution modes should return the same payloads.
138140
"""
139141
filename = "layout-parser-paper.pdf"
140142

@@ -165,14 +167,28 @@ async def test_partition_async_processes_concurrent_files(client, doc_path):
165167
)
166168
]
167169

170+
serial_results = []
171+
for req in requests:
172+
res = await client.general.partition_async(request=req)
173+
assert res.status_code == 200
174+
serial_results.append(res.elements)
175+
168176
results = await asyncio.gather(
169177
client.general.partition_async(request=requests[0]),
170178
client.general.partition_async(request=requests[1])
171179
)
172180

181+
concurrent_results = []
173182
for res in results:
174183
assert res.status_code == 200
175-
assert len(res.elements) > 0
184+
concurrent_results.append(res.elements)
185+
186+
diff = DeepDiff(
187+
t1=serial_results,
188+
t2=concurrent_results,
189+
ignore_order=True,
190+
)
191+
assert len(diff) == 0
176192

177193

178194
def test_uvloop_partitions_without_errors(client, doc_path):

_test_unstructured_client/unit/test_regeneration_guards.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,19 @@ def test_publish_script_is_hardened():
4040
assert 'uv publish --token "${PYPI_TOKEN}" --check-url https://pypi.org/simple' in publish_script
4141

4242

43+
def test_makefile_installs_with_locked_uv_sync():
    """The Makefile must contain a locked ``uv sync`` invocation."""
    makefile_path = REPO_ROOT / "Makefile"
    contents = makefile_path.read_text()

    assert "uv sync --locked" in contents
47+
48+
49+
def test_ci_installs_with_locked_uv_sync():
    """The CI workflow must set ``UV_LOCKED`` and install via ``make install``."""
    ci_config_path = REPO_ROOT / ".github" / "workflows" / "ci.yaml"
    ci_config = ci_config_path.read_text()

    # Checked in the same order as before: the env flag first, then the command.
    for required_snippet in ('UV_LOCKED: "1"', "run: make install"):
        assert required_snippet in ci_config
54+
55+
4356
def test_body_create_job_input_files_are_serialized_as_multipart_files():
4457
request = shared.BodyCreateJob(
4558
request_data="{}",

0 commit comments

Comments
 (0)