diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 94e2d08612..9fea993395 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: setup: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -31,7 +31,7 @@ jobs: check-deps: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -45,7 +45,7 @@ jobs: check-extras: strategy: matrix: - python-version: [ "3.9","3.10","3.11","3.12" ] + python-version: ["3.10","3.11","3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -78,7 +78,7 @@ jobs: lint: strategy: matrix: - python-version: ["3.9","3.10","3.11"] + python-version: ["3.10","3.11"] runs-on: ubuntu-latest needs: [setup, changelog] steps: @@ -117,7 +117,7 @@ jobs: test_unit: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest needs: [setup, lint] steps: @@ -133,7 +133,7 @@ jobs: - name: Test env: UNS_API_KEY: ${{ secrets.UNS_API_KEY }} - TESSERACT_VERSION : "5.4.1" + TESSERACT_VERSION : "5.5.1" run: | source .venv/bin/activate sudo apt-get update @@ -224,7 +224,7 @@ jobs: setup_ingest: strategy: matrix: - python-version: [ "3.9","3.10" ] + python-version: ["3.10"] runs-on: ubuntu-latest needs: [setup] steps: @@ -237,7 +237,7 @@ jobs: test_ingest_src: strategy: matrix: - python-version: ["3.9","3.10"] + python-version: ["3.10"] runs-on: ubuntu-latest-m needs: [setup_ingest, lint] steps: @@ -323,7 +323,7 @@ jobs: test_json_to_html: strategy: matrix: - python-version: ["3.9","3.10"] + python-version: ["3.10"] runs-on: ubuntu-latest-m needs: [setup, lint] steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 79143c8413..ba87df9e01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +## 0.17.11-dev2 + +### Enhancements + +### Features + +### Fixes +- Fix type error when `result_file_type` is expected to be a `FileType` but is `None` +- Fix chunking for elements with None text that has AttributeError 'NoneType' object has no attribute 'strip'. +- Invalid elements IDs are not visible in VLM output. Parent-child hierarchy is now retrieved based on unstructured element ID, instead of id injected into HTML code of element. + +## 0.17.10 +- Drop Python 3.9 support as it reaches EOL in October 2025 +- Update pip-compile script to use Python 3.10 and newer +- Update all packages using pip-compile + ## 0.17.9 - Patch various CVEs diff --git a/requirements/base.txt b/requirements/base.txt index 0daee868f6..64280918af 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./base.in @@ -24,11 +24,11 @@ charset-normalizer==3.4.2 # via # requests # unstructured-client -click==8.1.8 +click==8.2.1 # via # nltk # python-oxmsg -cryptography==45.0.3 +cryptography==45.0.4 # via unstructured-client dataclasses-json==0.6.7 # via @@ -76,7 +76,7 @@ nest-asyncio==1.6.0 # via unstructured-client nltk==3.9.1 # via -r ./base.in -numpy==2.0.2 +numpy==2.2.6 # via -r ./base.in olefile==0.47 # via python-oxmsg @@ -143,7 +143,7 @@ unstructured-client==0.25.9 # via # -c ././deps/constraints.txt # -r ./base.in -urllib3==1.26.20 +urllib3==2.4.0 # via # -c ././deps/constraints.txt # requests diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 9659e8bac1..88efdd5d2b 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -13,7 +13,7 @@ grpcio>=1.65.5 tokenizers>=0.21,<0.22 # TODO: Constaint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets # updated or we drop support for 3.9 -urllib3<1.27 +urllib3<3.0.0 # TODO: Constriant due to aiobotocore, remove when that gets updates: botocore<1.34.132 # TODO: Constriant due to both 8.5.0 and 8.4.0 being installed during pip-compile diff --git a/requirements/dev.txt b/requirements/dev.txt index 632e7e299f..ac2fa3cb41 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./dev.in @@ -8,7 +8,7 @@ build==1.2.2.post1 # via pip-tools cfgv==3.4.0 # via pre-commit -click==8.1.8 +click==8.2.1 # via # -c ./base.txt # -c ./test.txt @@ -19,10 +19,6 @@ filelock==3.18.0 # via virtualenv identify==2.6.12 # via pre-commit -importlib-metadata==8.7.0 - # via - # -c ././deps/constraints.txt - # build nodeenv==1.9.1 # via pre-commit packaging==25.0 @@ -53,8 +49,6 @@ virtualenv==20.31.2 # via pre-commit wheel==0.45.1 # via pip-tools -zipp==3.22.0 - # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 74e069b2de..d37c6a53dc 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -1,10 +1,10 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-csv.in # -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 831f636e57..b71b1cd6d7 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-docx.in diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 460408c418..db97d6a9be 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index bcdf3368f8..c0c3a476ad 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -1,14 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-markdown.in # -importlib-metadata==8.7.0 - # via - # -c ././deps/constraints.txt - # markdown markdown==3.8 # via -r ./extra-markdown.in -zipp==3.22.0 - # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 94bd199821..4c92aae6cc 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-odt.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 16502d6bbb..e6587d3d0c 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-paddleocr.in @@ -32,19 +32,17 @@ charset-normalizer==3.4.2 # via # -c ./base.txt # requests -cython==3.1.1 +cython==3.1.2 # via unstructured-paddleocr decorator==5.2.1 # via paddlepaddle -eval-type-backport==0.2.2 - # via albumentations exceptiongroup==1.3.0 # via # -c ./base.txt # anyio fire==0.7.0 # via unstructured-paddleocr -fonttools==4.58.1 +fonttools==4.58.2 # via unstructured-paddleocr h11==0.16.0 # via @@ -72,11 +70,11 @@ lxml==5.4.0 # via # -c ./base.txt # python-docx -networkx==3.2.1 +networkx==3.4.2 # via # paddlepaddle # scikit-image -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # albucore @@ -139,15 +137,15 @@ requests==2.32.4 # via # -c ./base.txt # unstructured-paddleocr -scikit-image==0.24.0 +scikit-image==0.25.2 # via unstructured-paddleocr -scipy==1.13.1 +scipy==1.15.3 # via # albumentations # scikit-image -shapely==2.0.7 +shapely==2.1.1 # via unstructured-paddleocr -simsimd==6.4.7 +simsimd==6.4.9 # via albucore sniffio==1.3.1 # via @@ -161,7 +159,7 @@ stringzilla==3.12.5 # via albucore termcolor==3.1.0 # via fire -tifffile==2024.8.30 +tifffile==2025.5.10 # via scikit-image tqdm==4.67.1 # via @@ -170,8 +168,6 @@ tqdm==4.67.1 typing-extensions==4.14.0 # via # -c ./base.txt - # albucore - # albumentations # anyio # beautifulsoup4 # exceptiongroup @@ -184,7 +180,7 @@ typing-inspection==0.4.1 # via pydantic unstructured-paddleocr==2.10.0 # via -r ./extra-paddleocr.in -urllib3==1.26.20 +urllib3==2.4.0 # via # -c ././deps/constraints.txt # -c ./base.txt diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index dd397c3845..95e2170080 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 22597ecfe6..3110073d99 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pdf-image.in @@ -25,9 +25,9 @@ charset-normalizer==3.4.2 # requests coloredlogs==15.0.1 # via onnxruntime -contourpy==1.3.0 +contourpy==1.3.2 # via matplotlib -cryptography==45.0.3 +cryptography==45.0.4 # via # -c ./base.txt # pdfminer-six @@ -44,7 +44,7 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.58.1 +fonttools==4.58.2 # via matplotlib fsspec==2025.5.1 # via @@ -56,22 +56,22 @@ google-auth==2.40.3 # via # google-api-core # google-cloud-vision -google-cloud-vision==3.10.1 +google-cloud-vision==3.10.2 # via -r ./extra-pdf-image.in googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status -grpcio==1.72.1 +grpcio==1.73.0 # via # -c ././deps/constraints.txt # google-api-core # grpcio-status -grpcio-status==1.72.1 +grpcio-status==1.73.0 # via google-api-core hf-xet==1.1.3 # via huggingface-hub -huggingface-hub==0.32.4 +huggingface-hub==0.33.0 # via # accelerate # timm @@ -84,11 +84,9 @@ idna==3.10 # via # -c ./base.txt # requests -importlib-resources==6.5.2 - # via matplotlib jinja2==3.1.6 # via torch -kiwisolver==1.4.7 +kiwisolver==1.4.8 # via matplotlib lxml==5.4.0 # via @@ -96,13 +94,13 @@ lxml==5.4.0 # pikepdf markupsafe==3.0.2 # via jinja2 -matplotlib==3.9.4 +matplotlib==3.10.3 # via unstructured-inference mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.4.2 # via torch -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # accelerate @@ -123,7 +121,7 @@ onnx==1.18.0 # via # -r ./extra-pdf-image.in # unstructured-inference -onnxruntime==1.19.2 +onnxruntime==1.22.0 # via # -r ./extra-pdf-image.in # unstructured-inference @@ -235,7 +233,7 @@ safetensors==0.5.3 # accelerate # timm # transformers -scipy==1.13.1 +scipy==1.15.3 # via unstructured-inference six==1.17.0 # via @@ -284,7 +282,7 @@ unstructured-inference==1.0.5 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.15 # via -r ./extra-pdf-image.in -urllib3==1.26.20 +urllib3==2.4.0 # via # -c ././deps/constraints.txt # -c ./base.txt @@ -293,5 +291,3 @@ wrapt==1.17.2 # via # -c ./base.txt # deprecated -zipp==3.22.0 - # via importlib-resources diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 1664e9f404..d739fe0367 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pptx.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 922e00bac0..e309ec0961 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -1,14 +1,14 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-xlsx.in # et-xmlfile==2.0.0 # via openpyxl -networkx==3.2.1 +networkx==3.4.2 # via -r ./extra-xlsx.in -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # pandas diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index a5d50c8cc5..5fdc31e0e9 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./huggingface.in @@ -12,7 +12,7 @@ charset-normalizer==3.4.2 # via # -c ./base.txt # requests -click==8.1.8 +click==8.2.1 # via # -c ./base.txt # sacremoses @@ -27,7 +27,7 @@ fsspec==2025.5.1 # torch hf-xet==1.1.3 # via huggingface-hub -huggingface-hub==0.32.4 +huggingface-hub==0.33.0 # via # tokenizers # transformers @@ -49,9 +49,9 @@ markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.4.2 # via torch -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # transformers @@ -105,7 +105,7 @@ typing-extensions==4.14.0 # -c ./base.txt # huggingface-hub # torch -urllib3==1.26.20 +urllib3==2.4.0 # via # -c ././deps/constraints.txt # -c ./base.txt diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt index 364f499029..e817913f10 100644 --- a/requirements/ingest/ingest.txt +++ b/requirements/ingest/ingest.txt @@ -1,5 +1,5 @@ unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]>=0.2.1 s3fs>=2024.9.0 -urllib3>=1.26.20 +urllib3>=2.4.0 backoff>=2.2.1 httpx>=0.27.2 diff --git a/requirements/test.txt b/requirements/test.txt index ce0fd2cc62..2e38a48d8f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./test.in @@ -10,11 +10,11 @@ autoflake==2.3.1 # via -r ./test.in black==25.1.0 # via -r ./test.in -click==8.1.8 +click==8.2.1 # via # -c ./base.txt # black -coverage[toml]==7.8.2 +coverage[toml]==7.9.0 # via # -r ./test.in # pytest-cov @@ -30,7 +30,7 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.2 # via -r ./test.in -grpcio==1.72.1 +grpcio==1.73.0 # via # -c ././deps/constraints.txt # -r ./test.in @@ -59,7 +59,9 @@ pathspec==0.12.1 platformdirs==4.3.8 # via black pluggy==1.6.0 - # via pytest + # via + # pytest + # pytest-cov pycodestyle==2.13.0 # via # flake8 @@ -78,7 +80,7 @@ pytest==8.4.0 # via # pytest-cov # pytest-mock -pytest-cov==6.1.1 +pytest-cov==6.2.1 # via -r ./test.in pytest-mock==3.14.1 # via -r ./test.in @@ -86,7 +88,7 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # freezegun -ruff==0.11.12 +ruff==0.11.13 # via -r ./test.in semantic-version==2.10.0 # via liccheck @@ -107,12 +109,10 @@ types-click==7.1.8 # via -r ./test.in types-markdown==3.8.0.20250415 # via -r ./test.in -types-requests==2.31.0.6 +types-requests==2.32.4.20250611 # via -r ./test.in types-tabulate==0.9.0.20241207 # via -r ./test.in -types-urllib3==1.26.25.14 - # via types-requests typing-extensions==4.14.0 # via # -c ./base.txt @@ -124,3 +124,8 @@ typing-extensions==4.14.0 # typing-inspection typing-inspection==0.4.1 # via pydantic +urllib3==2.4.0 + # via + # -c ././deps/constraints.txt + # -c ./base.txt + # types-requests diff --git a/scripts/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py index 5789a83d14..019810e196 100644 --- a/scripts/html/rendered_html_from_elements.py +++ b/scripts/html/rendered_html_from_elements.py @@ -10,16 +10,12 @@ """ import argparse +import html import logging import os import select import sys -from collections import defaultdict -from typing import List, Sequence -from bs4 import BeautifulSoup - -from unstructured.documents import elements from unstructured.partition.html.transformations import unstructured_elements_to_ontology from unstructured.staging.base import elements_from_json @@ -28,48 +24,6 @@ logger = logging.getLogger(__name__) -def extract_document_div(html_content: str) -> str: - pos = html_content.find(">") - if pos != -1: - return html_content[: pos + 1] - logger.error("No '>' found in the HTML content.") - raise ValueError("No '>' found in the HTML content.") - - -def extract_page_div(html_content: str) -> str: - soup = BeautifulSoup(html_content, "html.parser") - page_divs = soup.find_all("div", class_="Page") - if len(page_divs) != 1: - logger.error( - "Expected exactly one
element with class 'Page'. Found %d.", len(page_divs) - ) - raise ValueError("Expected exactly one
element with class 'Page'.") - return str(page_divs[0]) - - -def fold_document_div( - html_document_start: str, html_document_end: str, html_per_page: List[str] -) -> str: - html_document = html_document_start - for page_html in html_per_page: - html_document += page_html - html_document += html_document_end - return html_document - - -def group_elements_by_page( - unstructured_elements: Sequence[elements.Element], -) -> Sequence[Sequence[elements.Element]]: - pages_dict = defaultdict(list) - - for element in unstructured_elements: - page_number = element.metadata.page_number - pages_dict[page_number].append(element) - - pages_list = list(pages_dict.values()) - return pages_list - - def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str: """Renders HTML from a JSON file with unstructured elements. @@ -91,18 +45,10 @@ def rendered_html(*, filepath: str | None = None, text: str | None = None) -> st logger.info("Rendering HTML from text.") unstructured_elements = elements_from_json(filename=filepath, text=text) - unstructured_elements_per_page = group_elements_by_page(unstructured_elements) - # parsed_ontology = unstructured_elements_to_ontology(unstructured_elements) - parsed_ontology_per_page = [ - unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page - ] - html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page] - - html_document_start = extract_document_div(html_per_page[0]) - html_document_end = "
" - html_per_page = [extract_page_div(page) for page in html_per_page] - - return fold_document_div(html_document_start, html_document_end, html_per_page) + ontology_root = unstructured_elements_to_ontology(unstructured_elements) + html_document = ontology_root.to_html() + unescaped_html = html.unescape(html_document) + return unescaped_html def _main(): diff --git a/scripts/pip-compile.sh b/scripts/pip-compile.sh index ece191698b..460e99733f 100755 --- a/scripts/pip-compile.sh +++ b/scripts/pip-compile.sh @@ -2,7 +2,7 @@ # python version must match lowest supported (3.9) major=3 -minor=9 +minor=10 if ! python -c "import sys; assert sys.version_info.major == $major and sys.version_info.minor == $minor"; then echo "python version not equal to expected $major.$minor: $(python --version)" exit 1 diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index ffaa699cac..a80eb68d7f 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -416,6 +416,20 @@ def it_can_handle_element_with_none_as_text(self): ) assert pre_chunk._text == "hello" + def it_can_chunk_elements_with_none_text_without_error(self): + """Regression test for AttributeError when Image elements have None text.""" + pre_chunk = PreChunk( + [Image(None), Text("hello world"), Image(None)], + overlap_prefix="", + opts=ChunkingOptions(), + ) + + # Should not raise AttributeError when generating chunks + chunks = list(pre_chunk.iter_chunks()) + + assert len(chunks) == 1 + assert chunks[0].text == "hello world" + @pytest.mark.parametrize( ("max_characters", "combine_text_under_n_chars", "expected_value"), [ @@ -1026,6 +1040,15 @@ def it_computes_the_original_elements_list_to_help(self): # -- computation is only on first call, all chunks get exactly the same orig-elements -- assert table_chunker._orig_elements is orig_elements + def it_handles_table_with_none_text_without_error(self): + """Regression test for AttributeError when Table elements have None text.""" + table = Table(None) # Table with None text + + # Should not raise AttributeError and should produce no chunks + chunks = list(_TableChunker.iter_chunks(table, "", ChunkingOptions())) + + assert len(chunks) == 0 + # ================================================================================================ # HTML SPLITTERS diff --git a/test_unstructured/documents/html_files/example.html b/test_unstructured/documents/html_files/example.html index 14be089463..3abd541255 100644 --- a/test_unstructured/documents/html_files/example.html +++ b/test_unstructured/documents/html_files/example.html @@ -1,41 +1,41 @@ - -
-
-

+ +
+
+

Header

-
-
-