Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.17.6-dev2
## 0.17.6

### Enhancements

Expand All @@ -10,6 +10,7 @@ Two executions of the same code, on the same file, produce different results. Th
This makes it impossible to write stable unit tests, for example, or to obtain reproducible results.
- **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`)
- Resolve open CVEs
- Properly handle the case when an element's `text` attribute is `None`


## 0.17.5
Expand Down Expand Up @@ -48,7 +49,7 @@ This makes it impossible to write stable unit tests, for example, or to obtain r
### Features

### Fixes
- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml
- **Fixes wrong detection of office files.** Certain office files (.docx, .xlsx and .pptx) that contain files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml, respectively, were wrongly identified as .ZIP. They are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml

## 0.17.2

Expand Down
11 changes: 11 additions & 0 deletions test_unstructured/chunking/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
CompositeElement,
Element,
ElementMetadata,
Image,
PageBreak,
Table,
TableChunk,
Expand Down Expand Up @@ -234,6 +235,10 @@ def it_accumulates_elements_added_to_it(self):
assert builder._text_length == 112
assert builder._remaining_space == 36

def it_will_fit_when_element_has_none_as_text(self):
builder = PreChunkBuilder(opts=ChunkingOptions())
assert builder.will_fit(Image(None))

def it_will_fit_an_oversized_element_when_empty(self):
builder = PreChunkBuilder(opts=ChunkingOptions())
assert builder.will_fit(Text("abcd " * 200))
Expand Down Expand Up @@ -405,6 +410,12 @@ def and_it_knows_it_is_NOT_equal_to_an_object_that_is_not_a_PreChunk(self):
pre_chunk = PreChunk([], overlap_prefix="", opts=ChunkingOptions())
assert pre_chunk != 42

def it_can_handle_element_with_none_as_text(self):
pre_chunk = PreChunk(
[Image(None), Text("hello")], overlap_prefix="", opts=ChunkingOptions()
)
assert pre_chunk._text == "hello"

@pytest.mark.parametrize(
("max_characters", "combine_text_under_n_chars", "expected_value"),
[
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.17.6-dev2" # pragma: no cover
__version__ = "0.17.6" # pragma: no cover
4 changes: 3 additions & 1 deletion unstructured/chunking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ def will_fit(self, element: Element) -> bool:
if self._text_length > self._opts.soft_max:
return False
# -- don't add an element if it would increase total size beyond the hard-max --
return not self._remaining_space < len(element.text)
return not self._remaining_space < len(element.text or "")

@property
def _remaining_space(self) -> int:
Expand Down Expand Up @@ -503,6 +503,8 @@ def _iter_text_segments(self) -> Iterator[str]:
if self._overlap_prefix:
yield self._overlap_prefix
for e in self._elements:
if e.text is None:
continue
text = " ".join(e.text.strip().split())
if not text:
continue
Expand Down
Loading