Skip to content

Commit dc5683d

Browse files
committed
move regex patterns
1 parent 757b58e commit dc5683d

3 files changed

Lines changed: 21 additions & 15 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,13 @@
1-
## 0.21.8
1+
## 0.21.9
22

33
### Enhancements
44
- Add a heuristic to check for complex pdfs and bypass pdf_miner
55

6+
## 0.21.8
7+
8+
### Enhancements
9+
- **Optimize PDF render mode patching performance**: Optimized `_patch_current_chars_with_render_mode` in `CustomPDFPageInterpreter` to avoid O(N²) re-scanning by tracking the last-patched index, so each `do_TJ`/`do_Tj` call only processes newly-added characters.
10+
611
## 0.21.7
712

813
### Enhancements

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.21.8" # pragma: no cover
1+
__version__ = "0.21.9" # pragma: no cover

unstructured/partition/pdf.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,20 @@
104104

105105
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
106106

107+
# Regex patterns for counting graphics and text operators in PDF content streams.
108+
GRAPHICS_OPS_PATTERN = re.compile(
109+
rb"(?:^|(?<=\s))"
110+
rb"(?:m|l|c|v|y|h|re|S|s|f|F|f\*|B|B\*|b|b\*|n|W|W\*|cm|q|Q|Do|"
111+
rb"g|G|rg|RG|k|K|cs|CS|w|J|j|M|d|i|gs)"
112+
rb"(?=\s|$)",
113+
re.MULTILINE,
114+
)
115+
TEXT_OPS_PATTERN = re.compile(
116+
rb"(?:^|(?<=\s))" rb"(?:Tj|TJ|'|\"|Tf|Td|TD|Tm|T\*|BT|ET)" rb"(?=\s|$)",
117+
re.MULTILINE,
118+
)
119+
120+
107121
# increase the max pixels so high dpi values like 300 can still be under the PIL limit
108122
PILImage.MAX_IMAGE_PIXELS = 5e8
109123

@@ -631,19 +645,6 @@ def is_pdf_too_complex(
631645
the threshold.
632646
"""
633647

634-
# Regex patterns for counting graphics and text operators in PDF content streams.
635-
GRAPHICS_OPS_PATTERN = re.compile(
636-
rb"(?:^|(?<=\s))"
637-
rb"(?:m|l|c|v|y|h|re|S|s|f|F|f\*|B|B\*|b|b\*|n|W|W\*|cm|q|Q|Do|"
638-
rb"g|G|rg|RG|k|K|cs|CS|w|J|j|M|d|i|gs)"
639-
rb"(?=\s|$)",
640-
re.MULTILINE,
641-
)
642-
TEXT_OPS_PATTERN = re.compile(
643-
rb"(?:^|(?<=\s))" rb"(?:Tj|TJ|'|\"|Tf|Td|TD|Tm|T\*|BT|ET)" rb"(?=\s|$)",
644-
re.MULTILINE,
645-
)
646-
647648
original_pos: Optional[int] = None
648649

649650
try:

0 commit comments

Comments
 (0)