55import collections
66import copy
77import uuid
8+ from functools import cached_property
89from typing import Any , Callable , DefaultDict , Iterable , Iterator , cast
910
1011import regex
2324 Title ,
2425)
2526from unstructured .logger import logger
26- from unstructured .utils import lazyproperty
2727
2828# ================================================================================================
2929# MODEL
@@ -59,7 +59,7 @@ class TokenCounter:
5959 def __init__ (self , tokenizer : str ):
6060 self ._tokenizer_name = tokenizer
6161
62- @lazyproperty
62+ @cached_property
6363 def _encoder (self ):
6464 """Lazily initialize the tiktoken encoder."""
6565 import tiktoken
@@ -144,15 +144,15 @@ def new(cls, **kwargs: Any) -> Self:
144144 self ._validate ()
145145 return self
146146
147- @lazyproperty
147+ @cached_property
148148 def boundary_predicates (self ) -> tuple [BoundaryPredicate , ...]:
149149 """The semantic-boundary detectors to be applied to break pre-chunks.
150150
151151 Overridden by sub-types to provide semantic-boundary isolation behaviors.
152152 """
153153 return ()
154154
155- @lazyproperty
155+ @cached_property
156156 def combine_text_under_n_chars (self ) -> int :
157157 """Combine two consecutive text pre-chunks if first is smaller than this and both will fit.
158158
@@ -162,7 +162,7 @@ def combine_text_under_n_chars(self) -> int:
162162 arg_value = self ._kwargs .get ("combine_text_under_n_chars" )
163163 return arg_value if arg_value is not None else 0
164164
165- @lazyproperty
165+ @cached_property
166166 def hard_max (self ) -> int :
167167 """The maximum size for a chunk (in characters or tokens depending on mode).
168168
@@ -177,7 +177,7 @@ def hard_max(self) -> int:
177177 arg_value = self ._kwargs .get ("max_characters" )
178178 return arg_value if arg_value is not None else CHUNK_MAX_CHARS_DEFAULT
179179
180- @lazyproperty
180+ @cached_property
181181 def include_orig_elements (self ) -> bool :
182182 """When True, add original elements from pre-chunk to `.metadata.orig_elements` of chunk.
183183
@@ -186,7 +186,7 @@ def include_orig_elements(self) -> bool:
186186 arg_value = self ._kwargs .get ("include_orig_elements" )
187187 return True if arg_value is None else bool (arg_value )
188188
189- @lazyproperty
189+ @cached_property
190190 def inter_chunk_overlap (self ) -> int :
191191 """Characters of overlap to add between chunks.
192192
@@ -196,7 +196,7 @@ def inter_chunk_overlap(self) -> int:
196196 overlap_all_arg = self ._kwargs .get ("overlap_all" )
197197 return self .overlap if overlap_all_arg else 0
198198
199- @lazyproperty
199+ @cached_property
200200 def overlap (self ) -> int :
201201 """The number of characters to overlap text when splitting chunks mid-text.
202202
@@ -206,7 +206,7 @@ def overlap(self) -> int:
206206 overlap_arg = self ._kwargs .get ("overlap" )
207207 return overlap_arg or 0
208208
209- @lazyproperty
209+ @cached_property
210210 def soft_max (self ) -> int :
211211 """A pre-chunk of this size or greater is considered full.
212212
@@ -238,7 +238,7 @@ def soft_max(self) -> int:
238238 # -- otherwise, give them what they asked for --
239239 return new_after_n_chars_arg
240240
241- @lazyproperty
241+ @cached_property
242242 def split (self ) -> Callable [[str ], tuple [str , str ]]:
243243 """A text-splitting function suitable for splitting the text of an oversized pre-chunk.
244244
@@ -247,7 +247,7 @@ def split(self) -> Callable[[str], tuple[str, str]]:
247247 """
248248 return _TextSplitter (self )
249249
250- @lazyproperty
250+ @cached_property
251251 def text_separator (self ) -> str :
252252 """The string to insert between elements when concatenating their text for a chunk.
253253
@@ -257,7 +257,7 @@ def text_separator(self) -> str:
257257 """
258258 return "\n \n "
259259
260- @lazyproperty
260+ @cached_property
261261 def text_splitting_separators (self ) -> tuple [str , ...]:
262262 """Sequence of text-splitting target strings to be used in order of preference."""
263263 text_splitting_separators_arg = self ._kwargs .get ("text_splitting_separators" )
@@ -267,13 +267,13 @@ def text_splitting_separators(self) -> tuple[str, ...]:
267267 else tuple (text_splitting_separators_arg )
268268 )
269269
270- @lazyproperty
270+ @cached_property
271271 def token_counter (self ) -> TokenCounter | None :
272272 """The token counter for token-based chunking, or None for character-based chunking."""
273273 tokenizer = self ._kwargs .get ("tokenizer" )
274274 return TokenCounter (tokenizer ) if tokenizer else None
275275
276- @lazyproperty
276+ @cached_property
277277 def use_token_counting (self ) -> bool :
278278 """True when token-based chunking is configured, False for character-based."""
279279 return self ._kwargs .get ("max_tokens" ) is not None
@@ -401,7 +401,7 @@ def _iter_pre_chunks(self) -> Iterator[PreChunk]:
401401 # -- processed
402402 yield from pre_chunk_builder .flush ()
403403
404- @lazyproperty
404+ @cached_property
405405 def _boundary_predicates (self ) -> tuple [BoundaryPredicate , ...]:
406406 """The semantic-boundary detectors to be applied to break pre-chunks."""
407407 return self ._opts .boundary_predicates
@@ -600,7 +600,7 @@ def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]:
600600 else :
601601 yield from _Chunker .iter_chunks (self ._elements , self ._text , self ._opts )
602602
603- @lazyproperty
603+ @cached_property
604604 def overlap_tail (self ) -> str :
605605 """The portion of this chunk's text to be repeated as a prefix in the next chunk.
606606
@@ -629,7 +629,7 @@ def _iter_text_segments(self) -> Iterator[str]:
629629 if text :
630630 yield text
631631
632- @lazyproperty
632+ @cached_property
633633 def _text (self ) -> str :
634634 """The concatenated text of all elements in this pre-chunk, including any overlap.
635635
@@ -686,7 +686,7 @@ def _iter_chunks(self) -> Iterator[CompositeElement]:
686686 s , remainder = split (remainder )
687687 yield CompositeElement (text = s , metadata = self ._continuation_metadata )
688688
689- @lazyproperty
689+ @cached_property
690690 def _all_metadata_values (self ) -> dict [str , list [Any ]]:
691691 """Collection of all populated metadata values across elements.
692692
@@ -721,7 +721,7 @@ def iter_populated_fields(metadata: ElementMetadata) -> Iterator[tuple[str, Any]
721721
722722 return dict (field_values )
723723
724- @lazyproperty
724+ @cached_property
725725 def _consolidated_metadata (self ) -> ElementMetadata :
726726 """Metadata applicable to this pre-chunk as a single chunk.
727727
@@ -737,7 +737,7 @@ def _consolidated_metadata(self) -> ElementMetadata:
737737 consolidated_metadata .orig_elements = self ._orig_elements
738738 return consolidated_metadata
739739
740- @lazyproperty
740+ @cached_property
741741 def _continuation_metadata (self ) -> ElementMetadata :
742742 """Metadata applicable to the second and later text-split chunks of the pre-chunk.
743743
@@ -751,7 +751,7 @@ def _continuation_metadata(self) -> ElementMetadata:
751751 continuation_metadata .is_continuation = True
752752 return continuation_metadata
753753
754- @lazyproperty
754+ @cached_property
755755 def _meta_kwargs (self ) -> dict [str , Any ]:
756756 """The consolidated metadata values as a dict suitable for constructing ElementMetadata.
757757
@@ -788,7 +788,7 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]:
788788
789789 return dict (iter_kwarg_pairs ())
790790
791- @lazyproperty
791+ @cached_property
792792 def _orig_elements (self ) -> list [Element ]:
793793 """The `.metadata.orig_elements` value for chunks formed from this pre-chunk."""
794794
@@ -859,7 +859,7 @@ def _iter_chunks(self) -> Iterator[Table | TableChunk]:
859859 # -- otherwise, form splits with "synchronized" text and html --
860860 yield from self ._iter_text_and_html_table_chunks ()
861861
862- @lazyproperty
862+ @cached_property
863863 def _html (self ) -> str :
864864 """The compactified HTML for this table when it has text-as-HTML.
865865
@@ -871,7 +871,7 @@ def _html(self) -> str:
871871
872872 return html_table .html
873873
874- @lazyproperty
874+ @cached_property
875875 def _html_table (self ) -> HtmlTable | None :
876876 """The `lxml` HTML element object for this table.
877877
@@ -975,7 +975,7 @@ def _metadata(self) -> ElementMetadata:
975975 metadata .orig_elements = self ._orig_elements
976976 return metadata
977977
978- @lazyproperty
978+ @cached_property
979979 def _orig_elements (self ) -> list [Element ]:
980980 """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.
981981
@@ -990,14 +990,14 @@ def _orig_elements(self) -> list[Element]:
990990 orig_table .metadata .orig_elements = None
991991 return [orig_table ]
992992
993- @lazyproperty
993+ @cached_property
994994 def _table_text (self ) -> str :
995995 """The text in this table, not including any overlap-prefix or extra whitespace."""
996996 if not self ._table .text :
997997 return ""
998998 return " " .join (self ._table .text .split ())
999999
1000- @lazyproperty
1000+ @cached_property
10011001 def _text_with_overlap (self ) -> str :
10021002 """The text for this chunk, including the overlap-prefix when present."""
10031003 overlap_prefix = self ._overlap_prefix
@@ -1271,7 +1271,7 @@ def _get_token_overlap_tail(self, text: str, target_tokens: int) -> str:
12711271
12721272 return text [pos :]
12731273
1274- @lazyproperty
1274+ @cached_property
12751275 def _patterns (self ) -> tuple [tuple [regex .Pattern [str ], int ], ...]:
12761276 """Sequence of (pattern, len) pairs to match against.
12771277
0 commit comments