Skip to content

Commit 7c5855b

Browse files
authored
Replace lazyproperty with functools.cached_property (#4282)
## Summary

- Replace the custom `lazyproperty` descriptor with stdlib `functools.cached_property`.
- Fix a bug where 26 properties returning `None` were re-evaluated on every access instead of being cached: `lazyproperty.__get__` uses `if value is None` to detect a cache miss, so any property that legitimately returns `None` re-runs on every access.
- Slight performance improvement on cached reads: `cached_property` is a non-data descriptor, so after the first access the `__dict__` entry shadows the descriptor directly (a plain dict lookup instead of a `__get__` call).
- Nothing in the codebase ever assigns to a `lazyproperty`-decorated attribute, so dropping the write-protection provided by the data descriptor has no behavioral impact.
1 parent 94b3ffd commit 7c5855b

17 files changed

Lines changed: 163 additions & 266 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.22.7-dev0
1+
## 0.22.7
22

33
### Fixes
4+
- **Replace `lazyproperty` with `functools.cached_property`**: Fix a bug where 26 properties returning `None` were re-evaluated on every access instead of being cached, because `lazyproperty` treated a `None` value as a cache miss. Also improves performance on cached reads.
45
- **Preserve nested rows when reconstructing chunked tables**: `reconstruct_table_from_chunks()` now merges only top-level table rows (`tr`, `thead/tr`, `tbody/tr`, `tfoot/tr`) from each chunk so nested table structure is retained.
56

67
## 0.22.6

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.22.7-dev0" # pragma: no cover
1+
__version__ = "0.22.7" # pragma: no cover

unstructured/chunking/base.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import collections
66
import copy
77
import uuid
8+
from functools import cached_property
89
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
910

1011
import regex
@@ -23,7 +24,6 @@
2324
Title,
2425
)
2526
from unstructured.logger import logger
26-
from unstructured.utils import lazyproperty
2727

2828
# ================================================================================================
2929
# MODEL
@@ -59,7 +59,7 @@ class TokenCounter:
5959
def __init__(self, tokenizer: str):
6060
self._tokenizer_name = tokenizer
6161

62-
@lazyproperty
62+
@cached_property
6363
def _encoder(self):
6464
"""Lazily initialize the tiktoken encoder."""
6565
import tiktoken
@@ -144,15 +144,15 @@ def new(cls, **kwargs: Any) -> Self:
144144
self._validate()
145145
return self
146146

147-
@lazyproperty
147+
@cached_property
148148
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
149149
"""The semantic-boundary detectors to be applied to break pre-chunks.
150150
151151
Overridden by sub-types to provide semantic-boundary isolation behaviors.
152152
"""
153153
return ()
154154

155-
@lazyproperty
155+
@cached_property
156156
def combine_text_under_n_chars(self) -> int:
157157
"""Combine two consecutive text pre-chunks if first is smaller than this and both will fit.
158158
@@ -162,7 +162,7 @@ def combine_text_under_n_chars(self) -> int:
162162
arg_value = self._kwargs.get("combine_text_under_n_chars")
163163
return arg_value if arg_value is not None else 0
164164

165-
@lazyproperty
165+
@cached_property
166166
def hard_max(self) -> int:
167167
"""The maximum size for a chunk (in characters or tokens depending on mode).
168168
@@ -177,7 +177,7 @@ def hard_max(self) -> int:
177177
arg_value = self._kwargs.get("max_characters")
178178
return arg_value if arg_value is not None else CHUNK_MAX_CHARS_DEFAULT
179179

180-
@lazyproperty
180+
@cached_property
181181
def include_orig_elements(self) -> bool:
182182
"""When True, add original elements from pre-chunk to `.metadata.orig_elements` of chunk.
183183
@@ -186,7 +186,7 @@ def include_orig_elements(self) -> bool:
186186
arg_value = self._kwargs.get("include_orig_elements")
187187
return True if arg_value is None else bool(arg_value)
188188

189-
@lazyproperty
189+
@cached_property
190190
def inter_chunk_overlap(self) -> int:
191191
"""Characters of overlap to add between chunks.
192192
@@ -196,7 +196,7 @@ def inter_chunk_overlap(self) -> int:
196196
overlap_all_arg = self._kwargs.get("overlap_all")
197197
return self.overlap if overlap_all_arg else 0
198198

199-
@lazyproperty
199+
@cached_property
200200
def overlap(self) -> int:
201201
"""The number of characters to overlap text when splitting chunks mid-text.
202202
@@ -206,7 +206,7 @@ def overlap(self) -> int:
206206
overlap_arg = self._kwargs.get("overlap")
207207
return overlap_arg or 0
208208

209-
@lazyproperty
209+
@cached_property
210210
def soft_max(self) -> int:
211211
"""A pre-chunk of this size or greater is considered full.
212212
@@ -238,7 +238,7 @@ def soft_max(self) -> int:
238238
# -- otherwise, give them what they asked for --
239239
return new_after_n_chars_arg
240240

241-
@lazyproperty
241+
@cached_property
242242
def split(self) -> Callable[[str], tuple[str, str]]:
243243
"""A text-splitting function suitable for splitting the text of an oversized pre-chunk.
244244
@@ -247,7 +247,7 @@ def split(self) -> Callable[[str], tuple[str, str]]:
247247
"""
248248
return _TextSplitter(self)
249249

250-
@lazyproperty
250+
@cached_property
251251
def text_separator(self) -> str:
252252
"""The string to insert between elements when concatenating their text for a chunk.
253253
@@ -257,7 +257,7 @@ def text_separator(self) -> str:
257257
"""
258258
return "\n\n"
259259

260-
@lazyproperty
260+
@cached_property
261261
def text_splitting_separators(self) -> tuple[str, ...]:
262262
"""Sequence of text-splitting target strings to be used in order of preference."""
263263
text_splitting_separators_arg = self._kwargs.get("text_splitting_separators")
@@ -267,13 +267,13 @@ def text_splitting_separators(self) -> tuple[str, ...]:
267267
else tuple(text_splitting_separators_arg)
268268
)
269269

270-
@lazyproperty
270+
@cached_property
271271
def token_counter(self) -> TokenCounter | None:
272272
"""The token counter for token-based chunking, or None for character-based chunking."""
273273
tokenizer = self._kwargs.get("tokenizer")
274274
return TokenCounter(tokenizer) if tokenizer else None
275275

276-
@lazyproperty
276+
@cached_property
277277
def use_token_counting(self) -> bool:
278278
"""True when token-based chunking is configured, False for character-based."""
279279
return self._kwargs.get("max_tokens") is not None
@@ -401,7 +401,7 @@ def _iter_pre_chunks(self) -> Iterator[PreChunk]:
401401
# -- processed
402402
yield from pre_chunk_builder.flush()
403403

404-
@lazyproperty
404+
@cached_property
405405
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
406406
"""The semantic-boundary detectors to be applied to break pre-chunks."""
407407
return self._opts.boundary_predicates
@@ -600,7 +600,7 @@ def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]:
600600
else:
601601
yield from _Chunker.iter_chunks(self._elements, self._text, self._opts)
602602

603-
@lazyproperty
603+
@cached_property
604604
def overlap_tail(self) -> str:
605605
"""The portion of this chunk's text to be repeated as a prefix in the next chunk.
606606
@@ -629,7 +629,7 @@ def _iter_text_segments(self) -> Iterator[str]:
629629
if text:
630630
yield text
631631

632-
@lazyproperty
632+
@cached_property
633633
def _text(self) -> str:
634634
"""The concatenated text of all elements in this pre-chunk, including any overlap.
635635
@@ -686,7 +686,7 @@ def _iter_chunks(self) -> Iterator[CompositeElement]:
686686
s, remainder = split(remainder)
687687
yield CompositeElement(text=s, metadata=self._continuation_metadata)
688688

689-
@lazyproperty
689+
@cached_property
690690
def _all_metadata_values(self) -> dict[str, list[Any]]:
691691
"""Collection of all populated metadata values across elements.
692692
@@ -721,7 +721,7 @@ def iter_populated_fields(metadata: ElementMetadata) -> Iterator[tuple[str, Any]
721721

722722
return dict(field_values)
723723

724-
@lazyproperty
724+
@cached_property
725725
def _consolidated_metadata(self) -> ElementMetadata:
726726
"""Metadata applicable to this pre-chunk as a single chunk.
727727
@@ -737,7 +737,7 @@ def _consolidated_metadata(self) -> ElementMetadata:
737737
consolidated_metadata.orig_elements = self._orig_elements
738738
return consolidated_metadata
739739

740-
@lazyproperty
740+
@cached_property
741741
def _continuation_metadata(self) -> ElementMetadata:
742742
"""Metadata applicable to the second and later text-split chunks of the pre-chunk.
743743
@@ -751,7 +751,7 @@ def _continuation_metadata(self) -> ElementMetadata:
751751
continuation_metadata.is_continuation = True
752752
return continuation_metadata
753753

754-
@lazyproperty
754+
@cached_property
755755
def _meta_kwargs(self) -> dict[str, Any]:
756756
"""The consolidated metadata values as a dict suitable for constructing ElementMetadata.
757757
@@ -788,7 +788,7 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]:
788788

789789
return dict(iter_kwarg_pairs())
790790

791-
@lazyproperty
791+
@cached_property
792792
def _orig_elements(self) -> list[Element]:
793793
"""The `.metadata.orig_elements` value for chunks formed from this pre-chunk."""
794794

@@ -859,7 +859,7 @@ def _iter_chunks(self) -> Iterator[Table | TableChunk]:
859859
# -- otherwise, form splits with "synchronized" text and html --
860860
yield from self._iter_text_and_html_table_chunks()
861861

862-
@lazyproperty
862+
@cached_property
863863
def _html(self) -> str:
864864
"""The compactified HTML for this table when it has text-as-HTML.
865865
@@ -871,7 +871,7 @@ def _html(self) -> str:
871871

872872
return html_table.html
873873

874-
@lazyproperty
874+
@cached_property
875875
def _html_table(self) -> HtmlTable | None:
876876
"""The `lxml` HTML element object for this table.
877877
@@ -975,7 +975,7 @@ def _metadata(self) -> ElementMetadata:
975975
metadata.orig_elements = self._orig_elements
976976
return metadata
977977

978-
@lazyproperty
978+
@cached_property
979979
def _orig_elements(self) -> list[Element]:
980980
"""The `.metadata.orig_elements` value for chunks formed from this pre-chunk.
981981
@@ -990,14 +990,14 @@ def _orig_elements(self) -> list[Element]:
990990
orig_table.metadata.orig_elements = None
991991
return [orig_table]
992992

993-
@lazyproperty
993+
@cached_property
994994
def _table_text(self) -> str:
995995
"""The text in this table, not including any overlap-prefix or extra whitespace."""
996996
if not self._table.text:
997997
return ""
998998
return " ".join(self._table.text.split())
999999

1000-
@lazyproperty
1000+
@cached_property
10011001
def _text_with_overlap(self) -> str:
10021002
"""The text for this chunk, including the overlap-prefix when present."""
10031003
overlap_prefix = self._overlap_prefix
@@ -1271,7 +1271,7 @@ def _get_token_overlap_tail(self, text: str, target_tokens: int) -> str:
12711271

12721272
return text[pos:]
12731273

1274-
@lazyproperty
1274+
@cached_property
12751275
def _patterns(self) -> tuple[tuple[regex.Pattern[str], int], ...]:
12761276
"""Sequence of (pattern, len) pairs to match against.
12771277

unstructured/chunking/dispatch.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import dataclasses as dc
1111
import functools
1212
import inspect
13+
from functools import cached_property
1314
from typing import Any, Callable, Iterable, Optional, Protocol
1415

1516
from lxml.etree import tostring
@@ -19,7 +20,7 @@
1920
from unstructured.chunking.basic import chunk_elements
2021
from unstructured.chunking.title import chunk_by_title
2122
from unstructured.documents.elements import Element, Table, TableChunk
22-
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
23+
from unstructured.utils import get_call_args_applying_defaults
2324

2425
_P = ParamSpec("_P")
2526

@@ -116,7 +117,7 @@ class _ChunkerSpec:
116117
chunker: Chunker
117118
"""The `chunk_by_{x}()` function that implements this chunking strategy."""
118119

119-
@lazyproperty
120+
@cached_property
120121
def kw_arg_names(self) -> tuple[str, ...]:
121122
"""Keyword arguments supported by this chunker.
122123

unstructured/chunking/title.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from __future__ import annotations
77

8+
from functools import cached_property
89
from typing import Iterable, Iterator, Optional
910

1011
from unstructured.chunking.base import (
@@ -17,7 +18,6 @@
1718
is_title,
1819
)
1920
from unstructured.documents.elements import Element
20-
from unstructured.utils import lazyproperty
2121

2222

2323
def chunk_by_title(
@@ -124,7 +124,7 @@ class _ByTitleChunkingOptions(ChunkingOptions):
124124
appearing on two different pages can appear in the same chunk.
125125
"""
126126

127-
@lazyproperty
127+
@cached_property
128128
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
129129
"""The semantic-boundary detectors to be applied to break pre-chunks.
130130
@@ -140,7 +140,7 @@ def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
140140

141141
return tuple(iter_boundary_predicates())
142142

143-
@lazyproperty
143+
@cached_property
144144
def combine_text_under_n_chars(self) -> int:
145145
"""Combine consecutive text pre-chunks if former is smaller than this and both will fit.
146146
@@ -152,7 +152,7 @@ def combine_text_under_n_chars(self) -> int:
152152
arg_value = self._kwargs.get("combine_text_under_n_chars")
153153
return self.hard_max if arg_value is None else arg_value
154154

155-
@lazyproperty
155+
@cached_property
156156
def multipage_sections(self) -> bool:
157157
"""When False, break pre-chunks on page-boundaries."""
158158
arg_value = self._kwargs.get("multipage_sections")

0 commit comments

Comments
 (0)