|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
| 3 | +from collections import Counter, defaultdict |
| 4 | +import math |
3 | 5 | import tempfile |
4 | 6 | from pathlib import Path |
5 | 7 | from typing import Literal |
|
26 | 28 | _HI_RES_STRATEGIES = ("hi_res", Strategy.HI_RES) |
27 | 29 |
|
28 | 30 |
|
def _allowed_delta(expected: int, *, absolute: int, ratio: float) -> int:
    """Return the largest difference tolerated for a value expected to be ``expected``.

    The tolerance is the greater of a fixed floor (``absolute``) and a
    proportional share of the expected value (``expected * ratio``, rounded up).
    """
    proportional = math.ceil(expected * ratio)
    # The fixed floor wins ties, exactly like ``max(absolute, proportional)``.
    return proportional if proportional > absolute else absolute
| 33 | + |
| 34 | + |
def _text_size(elements) -> int:
    """Return the total length of the stripped ``"text"`` values in ``elements``.

    Each element is read with ``.get("text")``; a missing or falsy value
    contributes nothing to the total.
    """
    total = 0
    for item in elements:
        text = item.get("text") or ""
        total += len(text.strip())
    return total
| 37 | + |
| 38 | + |
def _elements_by_page(elements):
    """Group elements by their ``metadata.page_number`` value.

    Returns a ``defaultdict(list)`` mapping each page number to the elements
    found on that page, in their original order.
    """
    grouped = defaultdict(list)
    for item in elements:
        page_number = item["metadata"]["page_number"]
        grouped[page_number].append(item)
    return grouped
| 44 | + |
| 45 | + |
def _assert_hi_res_output_is_similar(resp_split, resp_single):
    """Assert that a split-request response stays close to the single-request one.

    The two responses are not required to be identical. Instead, the split
    response must stay within a tolerance of the single response for:

    * the set of page numbers (must match exactly),
    * the total element count,
    * the set of element types (must match exactly) and the count per type,
    * the total stripped text size,
    * the element count and stripped text size of every page.

    Each tolerance is the larger of an absolute floor and a ratio of the
    single-request value (see ``_allowed_delta``). Every assertion carries a
    message with the compared values so a failure can be diagnosed from the
    report alone.
    """
    # Read the element lists once; they are reused by every check below.
    split_elements = resp_split.elements
    single_elements = resp_single.elements

    split_pages = _elements_by_page(split_elements)
    single_pages = _elements_by_page(single_elements)
    assert set(split_pages) == set(single_pages), (
        f"Page coverage differs: {set(split_pages)} vs {set(single_pages)}"
    )

    allowed = _allowed_delta(len(single_elements), absolute=4, ratio=0.1)
    assert abs(len(split_elements) - len(single_elements)) <= allowed, (
        f"Element count diverged too far: "
        f"{len(split_elements)} vs {len(single_elements)} (allowed delta {allowed})"
    )

    split_type_counts = Counter(element["type"] for element in split_elements)
    single_type_counts = Counter(element["type"] for element in single_elements)
    assert set(split_type_counts) == set(single_type_counts), (
        f"Element types differ: {set(split_type_counts)} vs {set(single_type_counts)}"
    )
    for element_type, expected_count in single_type_counts.items():
        split_count = split_type_counts[element_type]
        allowed = _allowed_delta(expected_count, absolute=2, ratio=0.2)
        assert abs(split_count - expected_count) <= allowed, (
            f"Count of {element_type!r} elements diverged too far: "
            f"{split_count} vs {expected_count} (allowed delta {allowed})"
        )

    split_text_size = _text_size(split_elements)
    single_text_size = _text_size(single_elements)
    allowed = _allowed_delta(single_text_size, absolute=250, ratio=0.2)
    assert abs(split_text_size - single_text_size) <= allowed, (
        f"Total text size diverged too far: "
        f"{split_text_size} vs {single_text_size} (allowed delta {allowed})"
    )

    for page_number, single_page_elements in single_pages.items():
        # Safe lookup: the page sets were asserted equal above.
        split_page_elements = split_pages[page_number]

        allowed = _allowed_delta(len(single_page_elements), absolute=2, ratio=0.2)
        assert abs(len(split_page_elements) - len(single_page_elements)) <= allowed, (
            f"Element count on page {page_number} diverged too far: "
            f"{len(split_page_elements)} vs {len(single_page_elements)} "
            f"(allowed delta {allowed})"
        )

        split_page_text = _text_size(split_page_elements)
        single_page_text = _text_size(single_page_elements)
        allowed = _allowed_delta(single_page_text, absolute=120, ratio=0.3)
        assert abs(split_page_text - single_page_text) <= allowed, (
            f"Text size on page {page_number} diverged too far: "
            f"{split_page_text} vs {single_page_text} (allowed delta {allowed})"
        )
| 87 | + |
| 88 | + |
29 | 89 | def _assert_split_unsplit_equivalent(resp_split, resp_single, strategy, extra_exclude_paths=None): |
30 | 90 | """Compare split-PDF and single-request responses. |
31 | 91 |
|
32 | 92 | For hi_res (OCR-based), splitting changes per-page context so text and |
33 | | - element counts can vary slightly. We only check structural equivalence. |
| 93 | + OCR text can vary slightly. We still check page coverage, type distribution, |
| 94 | + and text volume so split requests cannot silently drift too far. |
34 | 95 | For deterministic strategies (fast, etc.) we keep strict DeepDiff equality. |
35 | 96 | """ |
36 | 97 | assert resp_split.status_code == resp_single.status_code |
37 | 98 | assert resp_split.content_type == resp_single.content_type |
38 | 99 |
|
39 | 100 | if strategy in _HI_RES_STRATEGIES: |
40 | | - count_diff = abs(len(resp_split.elements) - len(resp_single.elements)) |
41 | | - assert count_diff <= 10, ( |
42 | | - f"Element count diverged too far: " |
43 | | - f"{len(resp_split.elements)} vs {len(resp_single.elements)}" |
44 | | - ) |
45 | | - split_pages = {e["metadata"]["page_number"] for e in resp_split.elements} |
46 | | - single_pages = {e["metadata"]["page_number"] for e in resp_single.elements} |
47 | | - assert split_pages == single_pages |
| 101 | + _assert_hi_res_output_is_similar(resp_split, resp_single) |
48 | 102 | else: |
49 | 103 | assert len(resp_split.elements) == len(resp_single.elements) |
50 | 104 |
|
|
0 commit comments