From 81ae229bf374e98b29c26fbed653070b14c0677a Mon Sep 17 00:00:00 2001 From: Yuming Long Date: Tue, 10 Jun 2025 20:24:26 -0700 Subject: [PATCH 1/8] fix file_type is None for JSON (chunker) --- unstructured/file_utils/filetype.py | 2 +- unstructured/partition/auto.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 81557562f4..41c6b8f0fd 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -188,7 +188,7 @@ def _file_type(self) -> FileType: # give up and report FileType.UNK result_file_type = FileType.UNK - if result_file_type == FileType.JSON: + if result_file_type == FileType.JSON and self._disambiguate_json_file_type: # edge case where JSON/NDJSON content without file extension # (magic lib can't distinguish them) result_file_type = self._disambiguate_json_file_type diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index fa4c050347..d8555e8244 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -226,7 +226,7 @@ def augment_metadata(elements: list[Element]) -> list[Element]: ) return augment_metadata(elements) - if file_type.partitioner_shortname == "image": + if file_type.partitioner_shortname and file_type.partitioner_shortname == "image": partition_image = partitioner_loader.get(file_type) elements = partition_image( filename=filename, From 7650126fb5b7fa621033f179b295d321fdab8487 Mon Sep 17 00:00:00 2001 From: Yuming Long Date: Wed, 11 Jun 2025 19:31:27 -0700 Subject: [PATCH 2/8] i hope newer version of tesseract works fine here --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 94e2d08612..532750bea9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -133,7 +133,7 @@ jobs: - name: Test env: UNS_API_KEY: ${{ secrets.UNS_API_KEY }} - 
TESSERACT_VERSION : "5.4.1" + TESSERACT_VERSION : "5.5.1" run: | source .venv/bin/activate sudo apt-get update From d76ebb4c94780b1abba0851e5fb072373f5f8502 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 23:32:11 -0700 Subject: [PATCH 3/8] Drop Python 3.9 support due to dependency conflicts (#4017) --- .github/workflows/ci.yml | 16 +++--- CHANGELOG.md | 5 ++ requirements/base.txt | 14 +++--- requirements/deps/constraints.txt | 2 +- requirements/dev.txt | 10 +--- requirements/extra-csv.txt | 4 +- requirements/extra-docx.txt | 2 +- requirements/extra-epub.txt | 2 +- requirements/extra-markdown.txt | 8 +-- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.txt | 30 +++++------ requirements/extra-pandoc.txt | 2 +- requirements/extra-pdf-image.txt | 84 ++++++++++++++++++++++--------- requirements/extra-pptx.txt | 2 +- requirements/extra-xlsx.txt | 6 +-- requirements/huggingface.txt | 58 ++++++++++++++++++--- requirements/ingest/ingest.txt | 2 +- requirements/test.txt | 19 ++++--- scripts/pip-compile.sh | 2 +- unstructured/__version__.py | 2 +- 20 files changed, 172 insertions(+), 100 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 532750bea9..9fea993395 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: setup: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -31,7 +31,7 @@ jobs: check-deps: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -45,7 +45,7 @@ jobs: check-extras: strategy: matrix: - python-version: [ "3.9","3.10","3.11","3.12" ] + python-version: ["3.10","3.11","3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -78,7 +78,7 @@ jobs: lint: strategy: matrix: - python-version: ["3.9","3.10","3.11"] + 
python-version: ["3.10","3.11"] runs-on: ubuntu-latest needs: [setup, changelog] steps: @@ -117,7 +117,7 @@ jobs: test_unit: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest needs: [setup, lint] steps: @@ -224,7 +224,7 @@ jobs: setup_ingest: strategy: matrix: - python-version: [ "3.9","3.10" ] + python-version: ["3.10"] runs-on: ubuntu-latest needs: [setup] steps: @@ -237,7 +237,7 @@ jobs: test_ingest_src: strategy: matrix: - python-version: ["3.9","3.10"] + python-version: ["3.10"] runs-on: ubuntu-latest-m needs: [setup_ingest, lint] steps: @@ -323,7 +323,7 @@ jobs: test_json_to_html: strategy: matrix: - python-version: ["3.9","3.10"] + python-version: ["3.10"] runs-on: ubuntu-latest-m needs: [setup, lint] steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 79143c8413..2140662067 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.17.10 +- Drop Python 3.9 support as it reaches EOL in October 2025 +- Update pip-compile script to use Python 3.10 and newer +- Update all packages using pip-compile + ## 0.17.9 - Patch various CVEs diff --git a/requirements/base.txt b/requirements/base.txt index 0daee868f6..09b9a0bdf9 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./base.in @@ -24,11 +24,11 @@ charset-normalizer==3.4.2 # via # requests # unstructured-client -click==8.1.8 +click==8.2.1 # via # nltk # python-oxmsg -cryptography==45.0.3 +cryptography==45.0.4 # via unstructured-client dataclasses-json==0.6.7 # via @@ -76,7 +76,7 @@ nest-asyncio==1.6.0 # via unstructured-client nltk==3.9.1 # via -r ./base.in -numpy==2.0.2 +numpy==2.2.6 # via -r ./base.in olefile==0.47 # via python-oxmsg @@ -141,11 +141,11 @@ typing-inspect==0.9.0 # unstructured-client unstructured-client==0.25.9 # 
via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # -r ./base.in -urllib3==1.26.20 +urllib3==2.4.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # requests # unstructured-client webencodings==0.5.1 diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 9659e8bac1..88efdd5d2b 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -13,7 +13,7 @@ grpcio>=1.65.5 tokenizers>=0.21,<0.22 # TODO: Constaint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets # updated or we drop support for 3.9 -urllib3<1.27 +urllib3<3.0.0 # TODO: Constriant due to aiobotocore, remove when that gets updates: botocore<1.34.132 # TODO: Constriant due to both 8.5.0 and 8.4.0 being installed during pip-compile diff --git a/requirements/dev.txt b/requirements/dev.txt index 632e7e299f..ac2fa3cb41 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./dev.in @@ -8,7 +8,7 @@ build==1.2.2.post1 # via pip-tools cfgv==3.4.0 # via pre-commit -click==8.1.8 +click==8.2.1 # via # -c ./base.txt # -c ./test.txt @@ -19,10 +19,6 @@ filelock==3.18.0 # via virtualenv identify==2.6.12 # via pre-commit -importlib-metadata==8.7.0 - # via - # -c ././deps/constraints.txt - # build nodeenv==1.9.1 # via pre-commit packaging==25.0 @@ -53,8 +49,6 @@ virtualenv==20.31.2 # via pre-commit wheel==0.45.1 # via pip-tools -zipp==3.22.0 - # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 74e069b2de..d37c6a53dc 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -1,10 +1,10 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# 
This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-csv.in # -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 831f636e57..b71b1cd6d7 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-docx.in diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 460408c418..db97d6a9be 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index bcdf3368f8..c0c3a476ad 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -1,14 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-markdown.in # -importlib-metadata==8.7.0 - # via - # -c ././deps/constraints.txt - # markdown markdown==3.8 # via -r ./extra-markdown.in -zipp==3.22.0 - # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 94bd199821..4c92aae6cc 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-odt.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 16502d6bbb..9dce312d7b 100644 --- 
a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-paddleocr.in @@ -32,19 +32,17 @@ charset-normalizer==3.4.2 # via # -c ./base.txt # requests -cython==3.1.1 +cython==3.1.2 # via unstructured-paddleocr decorator==5.2.1 # via paddlepaddle -eval-type-backport==0.2.2 - # via albumentations exceptiongroup==1.3.0 # via # -c ./base.txt # anyio fire==0.7.0 # via unstructured-paddleocr -fonttools==4.58.1 +fonttools==4.58.2 # via unstructured-paddleocr h11==0.16.0 # via @@ -72,11 +70,11 @@ lxml==5.4.0 # via # -c ./base.txt # python-docx -networkx==3.2.1 +networkx==3.4.2 # via # paddlepaddle # scikit-image -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # albucore @@ -117,7 +115,7 @@ pillow==11.2.1 # unstructured-paddleocr protobuf==6.31.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # paddlepaddle pyclipper==1.3.0.post6 # via unstructured-paddleocr @@ -139,15 +137,15 @@ requests==2.32.4 # via # -c ./base.txt # unstructured-paddleocr -scikit-image==0.24.0 +scikit-image==0.25.2 # via unstructured-paddleocr -scipy==1.13.1 +scipy==1.15.3 # via # albumentations # scikit-image -shapely==2.0.7 +shapely==2.1.1 # via unstructured-paddleocr -simsimd==6.4.7 +simsimd==6.4.9 # via albucore sniffio==1.3.1 # via @@ -161,7 +159,7 @@ stringzilla==3.12.5 # via albucore termcolor==3.1.0 # via fire -tifffile==2024.8.30 +tifffile==2025.5.10 # via scikit-image tqdm==4.67.1 # via @@ -170,8 +168,6 @@ tqdm==4.67.1 typing-extensions==4.14.0 # via # -c ./base.txt - # albucore - # albumentations # anyio # beautifulsoup4 # exceptiongroup @@ -184,8 +180,8 @@ typing-inspection==0.4.1 # via pydantic unstructured-paddleocr==2.10.0 # via -r ./extra-paddleocr.in -urllib3==1.26.20 +urllib3==2.4.0 # via - # -c ././deps/constraints.txt # -c ./base.txt + # -c 
./deps/constraints.txt # requests diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index dd397c3845..95e2170080 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 22597ecfe6..5c9121dab4 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pdf-image.in @@ -25,9 +25,9 @@ charset-normalizer==3.4.2 # requests coloredlogs==15.0.1 # via onnxruntime -contourpy==1.3.0 +contourpy==1.3.2 # via matplotlib -cryptography==45.0.3 +cryptography==45.0.4 # via # -c ./base.txt # pdfminer-six @@ -44,7 +44,7 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.58.1 +fonttools==4.58.2 # via matplotlib fsspec==2025.5.1 # via @@ -62,16 +62,16 @@ googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status -grpcio==1.72.1 +grpcio==1.73.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # google-api-core # grpcio-status -grpcio-status==1.72.1 +grpcio-status==1.73.0 # via google-api-core hf-xet==1.1.3 # via huggingface-hub -huggingface-hub==0.32.4 +huggingface-hub==0.32.5 # via # accelerate # timm @@ -84,11 +84,9 @@ idna==3.10 # via # -c ./base.txt # requests -importlib-resources==6.5.2 - # via matplotlib jinja2==3.1.6 # via torch -kiwisolver==1.4.7 +kiwisolver==1.4.8 # via matplotlib lxml==5.4.0 # via @@ -96,13 +94,13 @@ lxml==5.4.0 # pikepdf markupsafe==3.0.2 # via jinja2 -matplotlib==3.9.4 +matplotlib==3.10.3 # via unstructured-inference mpmath==1.3.0 # via sympy 
-networkx==3.2.1 +networkx==3.4.2 # via torch -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # accelerate @@ -117,13 +115,50 @@ numpy==2.0.2 # torchvision # transformers # unstructured-inference +nvidia-cublas-cu12==12.6.4.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.6.80 + # via torch +nvidia-cuda-nvrtc-cu12==12.6.77 + # via torch +nvidia-cuda-runtime-cu12==12.6.77 + # via torch +nvidia-cudnn-cu12==9.5.1.17 + # via torch +nvidia-cufft-cu12==11.3.0.4 + # via torch +nvidia-cufile-cu12==1.11.1.6 + # via torch +nvidia-curand-cu12==10.3.7.77 + # via torch +nvidia-cusolver-cu12==11.7.1.2 + # via torch +nvidia-cusparse-cu12==12.5.4.2 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.3 + # via torch +nvidia-nccl-cu12==2.26.2 + # via torch +nvidia-nvjitlink-cu12==12.6.85 + # via + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.6.77 + # via torch omegaconf==2.3.0 # via effdet onnx==1.18.0 # via # -r ./extra-pdf-image.in # unstructured-inference -onnxruntime==1.19.2 +onnxruntime==1.22.0 # via # -r ./extra-pdf-image.in # unstructured-inference @@ -145,7 +180,7 @@ pdf2image==1.17.0 # via -r ./extra-pdf-image.in pdfminer-six==20250327 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # -r ./extra-pdf-image.in # unstructured-inference pi-heif==0.22.0 @@ -166,7 +201,7 @@ proto-plus==1.26.1 # google-cloud-vision protobuf==6.31.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # google-api-core # google-cloud-vision # googleapis-common-protos @@ -235,7 +270,7 @@ safetensors==0.5.3 # accelerate # timm # transformers -scipy==1.13.1 +scipy==1.15.3 # via unstructured-inference six==1.17.0 # via @@ -251,7 +286,7 @@ timm==1.0.15 # unstructured-inference tokenizers==0.21.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.7.1 # via @@ -271,6 +306,8 @@ tqdm==4.67.1 # transformers 
transformers==4.52.4 # via unstructured-inference +triton==3.3.1 + # via torch typing-extensions==4.14.0 # via # -c ./base.txt @@ -284,14 +321,15 @@ unstructured-inference==1.0.5 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.15 # via -r ./extra-pdf-image.in -urllib3==1.26.20 +urllib3==2.4.0 # via - # -c ././deps/constraints.txt # -c ./base.txt + # -c ./deps/constraints.txt # requests wrapt==1.17.2 # via # -c ./base.txt # deprecated -zipp==3.22.0 - # via importlib-resources + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 1664e9f404..d739fe0367 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pptx.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 922e00bac0..e309ec0961 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -1,14 +1,14 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-xlsx.in # et-xmlfile==2.0.0 # via openpyxl -networkx==3.2.1 +networkx==3.4.2 # via -r ./extra-xlsx.in -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # pandas diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index a5d50c8cc5..c9645c27f9 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./huggingface.in @@ -12,7 +12,7 @@ charset-normalizer==3.4.2 # via # -c ./base.txt # requests -click==8.1.8 +click==8.2.1 # via # -c 
./base.txt # sacremoses @@ -27,7 +27,7 @@ fsspec==2025.5.1 # torch hf-xet==1.1.3 # via huggingface-hub -huggingface-hub==0.32.4 +huggingface-hub==0.32.5 # via # tokenizers # transformers @@ -49,12 +49,49 @@ markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.4.2 # via torch -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # transformers +nvidia-cublas-cu12==12.6.4.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.6.80 + # via torch +nvidia-cuda-nvrtc-cu12==12.6.77 + # via torch +nvidia-cuda-runtime-cu12==12.6.77 + # via torch +nvidia-cudnn-cu12==9.5.1.17 + # via torch +nvidia-cufft-cu12==11.3.0.4 + # via torch +nvidia-cufile-cu12==1.11.1.6 + # via torch +nvidia-curand-cu12==10.3.7.77 + # via torch +nvidia-cusolver-cu12==11.7.1.2 + # via torch +nvidia-cusparse-cu12==12.5.4.2 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.3 + # via torch +nvidia-nccl-cu12==2.26.2 + # via torch +nvidia-nvjitlink-cu12==12.6.85 + # via + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.6.77 + # via torch packaging==25.0 # via # -c ./base.txt @@ -88,7 +125,7 @@ sympy==1.14.0 # via torch tokenizers==0.21.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.7.1 # via -r ./huggingface.in @@ -100,13 +137,18 @@ tqdm==4.67.1 # transformers transformers==4.52.4 # via -r ./huggingface.in +triton==3.3.1 + # via torch typing-extensions==4.14.0 # via # -c ./base.txt # huggingface-hub # torch -urllib3==1.26.20 +urllib3==2.4.0 # via - # -c ././deps/constraints.txt # -c ./base.txt + # -c ./deps/constraints.txt # requests + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt index 364f499029..e817913f10 100644 --- a/requirements/ingest/ingest.txt +++ b/requirements/ingest/ingest.txt @@ -1,5 
+1,5 @@ unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]>=0.2.1 s3fs>=2024.9.0 -urllib3>=1.26.20 +urllib3>=2.4.0 backoff>=2.2.1 httpx>=0.27.2 diff --git a/requirements/test.txt b/requirements/test.txt index ce0fd2cc62..7aba185049 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./test.in @@ -10,7 +10,7 @@ autoflake==2.3.1 # via -r ./test.in black==25.1.0 # via -r ./test.in -click==8.1.8 +click==8.2.1 # via # -c ./base.txt # black @@ -30,9 +30,9 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.2 # via -r ./test.in -grpcio==1.72.1 +grpcio==1.73.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # -r ./test.in iniconfig==2.1.0 # via pytest @@ -86,7 +86,7 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # freezegun -ruff==0.11.12 +ruff==0.11.13 # via -r ./test.in semantic-version==2.10.0 # via liccheck @@ -107,12 +107,10 @@ types-click==7.1.8 # via -r ./test.in types-markdown==3.8.0.20250415 # via -r ./test.in -types-requests==2.31.0.6 +types-requests==2.32.4.20250611 # via -r ./test.in types-tabulate==0.9.0.20241207 # via -r ./test.in -types-urllib3==1.26.25.14 - # via types-requests typing-extensions==4.14.0 # via # -c ./base.txt @@ -124,3 +122,8 @@ typing-extensions==4.14.0 # typing-inspection typing-inspection==0.4.1 # via pydantic +urllib3==2.4.0 + # via + # -c ./base.txt + # 
-c ./deps/constraints.txt + # types-requests diff --git a/scripts/pip-compile.sh b/scripts/pip-compile.sh index ece191698b..460e99733f 100755 --- a/scripts/pip-compile.sh +++ b/scripts/pip-compile.sh @@ -2,7 +2,7 @@ # python version must match lowest supported (3.9) major=3 -minor=9 +minor=10 if ! python -c "import sys; assert sys.version_info.major == $major and sys.version_info.minor == $minor"; then echo "python version not equal to expected $major.$minor: $(python --version)" exit 1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6527bfeb22..1f320cd5fd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.9" # pragma: no cover +__version__ = "0.17.10" # pragma: no cover From b89076ad555b60e958029bdbd36d8709e8803168 Mon Sep 17 00:00:00 2001 From: Pluto Date: Wed, 11 Jun 2025 13:55:02 +0200 Subject: [PATCH 4/8] Remove IDs from HTML code (#4012) In this pull request parent-child relationship for elements generated with v2 parser is based on actual element IDs instead of IDs baked somewhere in the HTML script. 
With some extra bug fixing, this allowed the json -> HTML script to be significantly simplified --- CHANGELOG.md | 9 + scripts/html/rendered_html_from_elements.py | 64 +- .../documents/html_files/example.html | 46 +- .../html_files/example_full_doc.html | 1336 ++++++++--------- .../example_with_alternative_text.html | 10 +- .../example_with_inline_fields.html | 14 +- .../documents/html_files/three_tables.html | 8 +- .../unstructured_json_output/example.json | 20 +- .../example_full_doc.json | 186 +-- .../example_with_alternative_text.json | 8 +- .../example_with_inline_fields.json | 8 +- .../three_tables.json | 6 +- ...t_html_to_unstructured_and_back_parsing.py | 240 +-- ...structured_elements_to_ontology_parsing.py | 123 +- unstructured/__version__.py | 2 +- unstructured/documents/mappings.py | 16 +- unstructured/documents/ontology.py | 43 +- .../partition/html/transformations.py | 59 +- 18 files changed, 1098 insertions(+), 1100 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2140662067..15d307657c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.11-dev0 + +### Enhancements + +### Features + +### Fixes +- Invalid elements IDs are not visible in VLM output. Parent-child hierarchy is now retrieved based on unstructured element ID, instead of id injected into HTML code of element. 
+ ## 0.17.10 - Drop Python 3.9 support as it reaches EOL in October 2025 - Update pip-compile script to use Python 3.10 and newer diff --git a/scripts/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py index 5789a83d14..019810e196 100644 --- a/scripts/html/rendered_html_from_elements.py +++ b/scripts/html/rendered_html_from_elements.py @@ -10,16 +10,12 @@ """ import argparse +import html import logging import os import select import sys -from collections import defaultdict -from typing import List, Sequence -from bs4 import BeautifulSoup - -from unstructured.documents import elements from unstructured.partition.html.transformations import unstructured_elements_to_ontology from unstructured.staging.base import elements_from_json @@ -28,48 +24,6 @@ logger = logging.getLogger(__name__) -def extract_document_div(html_content: str) -> str: - pos = html_content.find(">") - if pos != -1: - return html_content[: pos + 1] - logger.error("No '>' found in the HTML content.") - raise ValueError("No '>' found in the HTML content.") - - -def extract_page_div(html_content: str) -> str: - soup = BeautifulSoup(html_content, "html.parser") - page_divs = soup.find_all("div", class_="Page") - if len(page_divs) != 1: - logger.error( - "Expected exactly one
element with class 'Page'. Found %d.", len(page_divs) - ) - raise ValueError("Expected exactly one
element with class 'Page'.") - return str(page_divs[0]) - - -def fold_document_div( - html_document_start: str, html_document_end: str, html_per_page: List[str] -) -> str: - html_document = html_document_start - for page_html in html_per_page: - html_document += page_html - html_document += html_document_end - return html_document - - -def group_elements_by_page( - unstructured_elements: Sequence[elements.Element], -) -> Sequence[Sequence[elements.Element]]: - pages_dict = defaultdict(list) - - for element in unstructured_elements: - page_number = element.metadata.page_number - pages_dict[page_number].append(element) - - pages_list = list(pages_dict.values()) - return pages_list - - def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str: """Renders HTML from a JSON file with unstructured elements. @@ -91,18 +45,10 @@ def rendered_html(*, filepath: str | None = None, text: str | None = None) -> st logger.info("Rendering HTML from text.") unstructured_elements = elements_from_json(filename=filepath, text=text) - unstructured_elements_per_page = group_elements_by_page(unstructured_elements) - # parsed_ontology = unstructured_elements_to_ontology(unstructured_elements) - parsed_ontology_per_page = [ - unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page - ] - html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page] - - html_document_start = extract_document_div(html_per_page[0]) - html_document_end = "
" - html_per_page = [extract_page_div(page) for page in html_per_page] - - return fold_document_div(html_document_start, html_document_end, html_per_page) + ontology_root = unstructured_elements_to_ontology(unstructured_elements) + html_document = ontology_root.to_html() + unescaped_html = html.unescape(html_document) + return unescaped_html def _main(): diff --git a/test_unstructured/documents/html_files/example.html b/test_unstructured/documents/html_files/example.html index 14be089463..3abd541255 100644 --- a/test_unstructured/documents/html_files/example.html +++ b/test_unstructured/documents/html_files/example.html @@ -1,41 +1,41 @@ - -
-
-

+ +
+
+

Header

-
-
-