diff --git a/.github/workflows/partition-benchmark.yaml b/.github/workflows/partition-benchmark.yaml new file mode 100644 index 0000000000..b0f4ba9732 --- /dev/null +++ b/.github/workflows/partition-benchmark.yaml @@ -0,0 +1,120 @@ +name: Partition Benchmark + +# Runs on every PR targeting main to detect performance regressions. +# Can also be triggered manually to establish or inspect a new baseline. +on: + pull_request: + branches: [main] + workflow_dispatch: + +permissions: + contents: read + +env: + NLTK_DATA: ${{ github.workspace }}/nltk_data + PYTHON_VERSION: "3.12" + # Number of times to run the full benchmark suite. + NUM_ITERATIONS: "3" + # Regression threshold: 20% for now; tune later. + REGRESSION_THRESHOLD: "0.20" + # Bump to invalidate the cache key when benchmark-affecting dependencies change, ensuring clean-slate runs. + CACHE_VERSION: "v2" + # S3 location for metrics – matches core-product convention. + S3_METRICS_BUCKET_KEY: utic-metrics/ci-metrics + S3_BENCHMARK_PATH: open-source/partition-benchmark/benchmark_best.json + +jobs: + setup: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/base-cache + with: + python-version: ${{ env.PYTHON_VERSION }} + + benchmark: + name: Measure and compare partition() runtime + runs-on: ubuntu-latest + needs: [setup] + + steps: + + - uses: actions/checkout@v4 + + - uses: ./.github/actions/base-cache + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get update + sudo apt-get install -y tesseract-ocr tesseract-ocr-kor + + + - name: Restore HuggingFace model cache + uses: actions/cache/restore@v4 + with: + path: ~/.cache/huggingface + key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }} + restore-keys: | + hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}- + hf-models-${{ runner.os }}- + + + - name: Run partition benchmark + env: + NUM_ITERATIONS: ${{ env.NUM_ITERATIONS }} + run: | + uv run --no-sync python scripts/performance/benchmark_partition.py \ + benchmark_results.json + + - name: Save HuggingFace model cache + uses: actions/cache/save@v4 + with: + path: ~/.cache/huggingface + key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }} + + + - name: Download previous best from S3 + continue-on-error: true + env: + AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }} + run: | + aws s3 cp \ + "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}" \ + benchmark_best.json + + + - name: Compare results against stored best + id: compare + run: | + uv run --no-sync python scripts/performance/compare_benchmark.py \ + benchmark_results.json \ + benchmark_best.json \ + ${{ env.REGRESSION_THRESHOLD }} + + + - name: Upload best result to S3 + continue-on-error: true + env: + AWS_ACCESS_KEY_ID: ${{ secrets.S3_EVAL_ACCESS_KEY }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_EVAL_SECRET_KEY }} + run: | + aws s3 cp \ + benchmark_best.json \ + "s3://${{ env.S3_METRICS_BUCKET_KEY }}/${{ env.S3_BENCHMARK_PATH }}" + + + - name: Upload benchmark artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-${{ github.sha }} + path: | + benchmark_results.json + benchmark_best.json + retention-days: 30
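The compare step above gates the PR on the regression threshold. For reference, here is a minimal sketch of the comparison logic that `scripts/performance/compare_benchmark.py` is assumed to implement, matching the CLI shape used in the workflow (new results file, stored best file, threshold). The results schema (a flat mapping of document name to mean runtime in seconds) and the update-the-baseline behavior are assumptions; the actual script may differ.

```python
"""Hedged sketch of what compare_benchmark.py is assumed to do.

Assumed schema: both JSON files map document name -> mean runtime in
seconds. The real script's schema, CLI, and behavior may differ.
"""
import json
import sys


def main(new_path: str, best_path: str, threshold: str) -> int:
    with open(new_path) as f:
        new = json.load(f)

    try:
        with open(best_path) as f:
            best = json.load(f)
    except FileNotFoundError:
        # The S3 download is continue-on-error, so a missing baseline means
        # this is a first run: seed the baseline from the new results.
        with open(best_path, "w") as f:
            json.dump(new, f, indent=2)
        return 0

    limit = float(threshold)
    regressions = []
    for doc, runtime in new.items():
        baseline = best.get(doc)
        if baseline is not None and runtime > baseline * (1 + limit):
            regressions.append((doc, baseline, runtime))
        elif baseline is None or runtime < baseline:
            best[doc] = runtime  # record a new best time for this document

    # The workflow uploads benchmark_best.json back to S3 after this step.
    with open(best_path, "w") as f:
        json.dump(best, f, indent=2)

    for doc, baseline, runtime in regressions:
        print(f"REGRESSION {doc}: {baseline:.2f}s -> {runtime:.2f}s "
              f"(more than {limit:.0%} over baseline)")
    return 1 if regressions else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1], sys.argv[2], sys.argv[3]))
```

A non-zero exit fails the `compare` step and therefore the PR check; the manual `workflow_dispatch` trigger can then be used to establish or inspect a new baseline.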
diff --git a/.gitignore b/.gitignore index 74730223c0..92ee2263bd 100644 --- a/.gitignore +++ b/.gitignore @@ -209,4 +209,7 @@ annotated/ pcaps python-output -.vs/ \ No newline at end of file +.vs/ +# Partition benchmark generated output +benchmark_results.json +scripts/performance/partition-speed-test/benchmark_results.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 09ca6d8910..9067f89800 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 0.21.4 +- Add a GitHub Actions workflow to detect runtime performance regressions + ## 0.21.3 ### Enhancements @@ -17,6 +20,9 @@ ### Fixes - **Replace NLTK with spaCy to remediate CVE-2025-14009**: NLTK's downloader uses `zipfile.extractall()` without path validation, enabling RCE via malicious packages (CVSS 10.0, no patch available). spaCy models install as pip packages, eliminating the vulnerable downloader entirely. +## 0.20.9 +- Add an action to test partition speed with the hi_res and fast strategies + ## 0.20.8 ### Fixes diff --git a/file_tree b/file_tree new file mode 100644 index 0000000000..4bc7df42f7 --- /dev/null +++ b/file_tree @@ -0,0 +1,1701 @@ +. +├── benchmark_results.json +├── CHANGELOG.md +├── CODE_OF_CONDUCT.md +├── CONTRIBUTING.md +├── discord-test +│   └── 1100149908494876775.txt +├── docker +│   └── packages +│   └── pandoc-3.1.8-r0.apk +├── Dockerfile +├── environment.yml +├── example-docs +│   ├── 2023-half-year-analyses-by-segment.xlsx +│   ├── book-war-and-peace-1225p.txt +│   ├── book-war-and-peace-1p.txt +│   ├── CantinaBand3.wav +│   ├── category-level.docx +│   ├── codeblock.md +│   ├── contains-pictures.docx +│   ├── csv-with-escaped-commas.csv +│   ├── csv-with-line-delimiter.csv +│   ├── csv-with-long-lines.csv +│   ├── docx-hdrftr.docx +│   ├── docx-shapes.docx +│   ├── docx-tables.docx +│   ├── duplicate-paragraphs.doc +│   ├── duplicate-paragraphs.docx +│   ├── eml +│   │   ├── email-equals-attachment-filename.eml +│   │   ├── email-inline-content-disposition.eml +│   │   ├── email-no-html-content-1.eml +│   │   ├── email-no-utf8-2008-07-16.062410.eml +│   │   ├── email-no-utf8-2014-03-17.111517.eml +│   │   ├── email-replace-mime-encodings-error-1.eml +│   │   ├── email-replace-mime-encodings-error-2.eml +│   │   ├── email-replace-mime-encodings-error-3.eml +│   │   ├── email-replace-mime-encodings-error-4.eml +│   │   ├── email-replace-mime-encodings-error-5.eml +│   │   ├── email-with-image.eml +│   │   ├── empty.eml +│   │   ├── fake-email-attachment.eml +│   │   ├── fake-email-b64.eml +│   │   ├── fake-email-header.eml +│   │   ├── fake-email-image-embedded.eml +│   │   ├── fake-email-malformed-encoding.eml +│   │   ├── fake-email-utf-16-be.eml +│   │   ├── fake-email-utf-16-le.eml +│   │   ├── fake-email-utf-16.eml +│   │   ├── fake-email.eml +│   │   ├── fake-email.txt +│   │   ├── fake-encrypted.eml +│   │   ├── family-day.eml +│   │   ├── mime-attach-mp3.eml +│   │   ├── mime-different-plain-html.eml +│   │   ├── mime-html-only.eml +│   │   ├── mime-multi-to-cc-bcc.eml +│   │   ├── mime-multipart-digest.eml +│   │   ├── mime-no-body.eml +│   │   ├── mime-no-subject.eml +│   │   ├── mime-no-to.eml +│   │   ├── mime-simple.eml +│   │   ├── 
mime-word-encoded-subject.eml +│   │   ├── rfc822-no-date.eml +│   │   ├── signed-doc.p7s +│   │   ├── simple-rfc-822.eml +│   │   ├── test-invalid-date.eml +│   │   ├── test-iso-8601-date.eml +│   │   └── test-rfc2822-date.eml +│   ├── emoji.xlsx +│   ├── empty.txt +│   ├── empty.xlsx +│   ├── example-10k-1p.html +│   ├── example-10k-230p.html +│   ├── example-10k-utf-16.html +│   ├── example-10k.html +│   ├── example-list-items-multiple.docx +│   ├── example-steelJIS-datasheet-utf-16.html +│   ├── example-steelJIS-datasheet.html +│   ├── example-with-scripts.html +│   ├── factbook-utf-16.xml +│   ├── factbook.xml +│   ├── fake_table.docx +│   ├── fake-doc-emphasized-text.doc +│   ├── fake-doc-emphasized-text.docx +│   ├── fake-doc.rtf +│   ├── fake-email-attachment.msg +│   ├── fake-email-multiple-attachments.msg +│   ├── fake-email-with-cc-and-bcc.msg +│   ├── fake-email.eml +│   ├── fake-email.msg +│   ├── fake-email.txt +│   ├── fake-encrypted.msg +│   ├── fake-html-cp1252.html +│   ├── fake-html-lang-de.html +│   ├── fake-html-pre.htm +│   ├── fake-html-with-base64-image.html +│   ├── fake-html-with-duplicate-elements.html +│   ├── fake-html-with-footer-and-header.html +│   ├── fake-html-with-image-from-url.html +│   ├── fake-html.html +│   ├── fake-incomplete-json.txt +│   ├── fake-power-point-malformed.pptx +│   ├── fake-power-point-many-pages.pptx +│   ├── fake-power-point-table.pptx +│   ├── fake-power-point.ppt +│   ├── fake-power-point.pptx +│   ├── fake-text-all-whitespace.txt +│   ├── fake-text-utf-16-be.txt +│   ├── fake-text-utf-16-le.txt +│   ├── fake-text-utf-16.txt +│   ├── fake-text-utf-32.txt +│   ├── fake-text.txt +│   ├── fake.doc +│   ├── fake.docx +│   ├── fake.go +│   ├── fake.odt +│   ├── file_we_dont_want_imported +│   ├── grid_offset_error.docx +│   ├── group-shapes-nested.pptx +│   ├── handbook-1p-no-rendered-page-breaks.docx +│   ├── handbook-1p.docx +│   ├── handbook-872p.docx +│   ├── hebrew-text-base64-iso88598i.txt +│   ├── hlink-meta.docx +│   ├── ideas-page.html +│   ├── img +│   │   ├── bmp_24.bmp +│   │   ├── chi_sim_image.jpeg +│   │   ├── DA-1p.heic +│   │   ├── DA-1p.jpg +│   │   ├── DA-1p.png +│   │   ├── double-column-A.jpg +│   │   ├── double-column-B.jpg +│   │   ├── embedded-images-tables.jpg +│   │   ├── english-and-korean.png +│   │   ├── example.jpg +│   │   ├── jpn-vert.jpeg +│   │   ├── layout-parser-paper-10p.jpg +│   │   ├── layout-parser-paper-combined.tiff +│   │   ├── layout-parser-paper-fast.jpg +│   │   ├── layout-parser-paper-fast.tiff +│   │   ├── layout-parser-paper-with-table.jpg +│   │   └── table-multi-row-column-cells.png +│   ├── language-docs +│   │   ├── eng_afr_spa.txt +│   │   ├── eng_spa_mult.doc +│   │   ├── eng_spa_mult.docx +│   │   ├── eng_spa_mult.eml +│   │   ├── eng_spa_mult.epub +│   │   ├── eng_spa_mult.html +│   │   ├── eng_spa_mult.md +│   │   ├── eng_spa_mult.odt +│   │   ├── eng_spa_mult.org +│   │   ├── eng_spa_mult.ppt +│   │   ├── eng_spa_mult.pptx +│   │   ├── eng_spa_mult.rst +│   │   ├── eng_spa_mult.rtf +│   │   ├── eng_spa_mult.txt +│   │   ├── eng_spa_mult.xml +│   │   ├── eng_spa.txt +│   │   ├── eng_spa.xlsx +│   │   ├── fr_olap.pdf +│   │   └── UDHR_first_article_all.txt +│   ├── logger.py +│   ├── more-than-1k-cells.xlsx +│   ├── norwich-city.txt +│   ├── not-unstructured-payload.json +│   ├── page-breaks.docx +│   ├── password_protected.xlsx +│   ├── pdf +│   │   ├── a1977-backus-p21.pdf +│   │   ├── all-number-table.pdf +│   │   ├── chevron-page.pdf +│   │   ├── copy-protected.pdf +│   │   
├── DA-1p.pdf +│   │   ├── DA-619p.pdf +│   │   ├── embedded-images-tables.pdf +│   │   ├── embedded-images.pdf +│   │   ├── embedded-link.pdf +│   │   ├── emphasis-text.pdf +│   │   ├── failure-after-repair.pdf +│   │   ├── fake-bold-sample.pdf +│   │   ├── fake-memo-with-duplicate-page.pdf +│   │   ├── fake-memo.pdf +│   │   ├── header-test-doc.pdf +│   │   ├── interface-config-guide-p93.pdf +│   │   ├── invalid-pdf-structure-pdfminer-entire-doc.pdf +│   │   ├── invalid-pdf-structure-pdfminer-one-page.pdf +│   │   ├── korean-text-with-tables.pdf +│   │   ├── layout-parser-paper-fast.pdf +│   │   ├── layout-parser-paper-with-empty-pages.pdf +│   │   ├── layout-parser-paper-with-table.pdf +│   │   ├── layout-parser-paper.pdf +│   │   ├── list-item-example.pdf +│   │   ├── loremipsum-flat.pdf +│   │   ├── multi-column-2p.pdf +│   │   ├── multi-column.pdf +│   │   ├── negative-coords.pdf +│   │   ├── password.pdf +│   │   ├── pdf-bad-color-space.pdf +│   │   ├── pdf-with-ocr-text.pdf +│   │   ├── pdf2image-memory-error-test-400p.pdf +│   │   ├── reliance.pdf +│   │   ├── single_table.pdf +│   │   └── table-multi-row-column-cells.pdf +│   ├── picture.pptx +│   ├── README-w-include.org +│   ├── README-w-include.rst +│   ├── README.md +│   ├── README.org +│   ├── README.rst +│   ├── sample-presentation.pptx +│   ├── science-exploration-1p.pptx +│   ├── science-exploration-369p.pptx +│   ├── semicolon-delimited.csv +│   ├── simple-table.md +│   ├── simple.doc +│   ├── simple.docx +│   ├── simple.epub +│   ├── simple.json +│   ├── simple.ndjson +│   ├── simple.odt +│   ├── simple.pptx +│   ├── simple.yaml +│   ├── simple.zip +│   ├── single-column.csv +│   ├── spring-weather.html.json +│   ├── spring-weather.html.ndjson +│   ├── stanley-cups-utf-16.csv +│   ├── stanley-cups-with-emoji.csv +│   ├── stanley-cups-with-emoji.tsv +│   ├── stanley-cups.csv +│   ├── stanley-cups.tsv +│   ├── stanley-cups.xlsx +│   ├── table-multi-row-column-cells-actual.csv +│   ├── table-semicolon-delimiter.csv +│   ├── tables-with-incomplete-rows.docx +│   ├── teams_chat.docx +│   ├── test_evaluate_files +│   │   ├── filter_list.txt +│   │   ├── gold_standard_cct +│   │   │   ├── Bank Good Credit Loan.pptx.txt +│   │   │   ├── currency.csv.txt +│   │   │   └── Performance-Audit-Discussion.pdf.txt +│   │   ├── gold_standard_element_type +│   │   │   └── IRS-form-1987.pdf.json +│   │   ├── gold_standard_table_structure +│   │   │   ├── 2022-financial-statements-p11.pdf.json +│   │   │   └── IRS-2023-Form-1095-A.pdf.json +│   │   ├── unstructured_output +│   │   │   ├── Bank Good Credit Loan.pptx.json +│   │   │   ├── currency.csv.json +│   │   │   ├── form.json +│   │   │   ├── IRS-form-1987.pdf.json +│   │   │   └── Performance-Audit-Discussion.pdf.json +│   │   ├── unstructured_output_cct +│   │   │   ├── Bank Good Credit Loan.pptx.txt +│   │   │   ├── currency.csv.txt +│   │   │   ├── IRS-form-1987.pdf.txt +│   │   │   └── Performance-Audit-Discussion.pdf.txt +│   │   └── unstructured_output_table_structure +│   │   ├── 2022-financial-statements-p11.pdf.json +│   │   └── IRS-2023-Form-1095-A.pdf.json +│   ├── test-image-jpg-mime.pptx +│   ├── tests-example.xls +│   ├── umlauts-non-utf8.md +│   ├── umlauts-utf8.md +│   ├── unsupported +│   │   └── factbook.xsl +│   ├── vodafone.xlsx +│   ├── winter-sports.epub +│   └── xlsx-subtable-cases.xlsx +├── file_tree +├── github +│   ├── actions +│   │   └── base-cache +│   │   └── action.yml +│   ├── ISSUE_TEMPLATE +│   │   ├── bug_report.md +│   │   ├── custom.md +│   │   
└── feature_request.md +│   └── workflows +│   ├── ci.yml +│   ├── claude.yml +│   ├── codeflash.yml +│   ├── codeql-analysis.yml +│   ├── docker-publish.yml +│   ├── ingest-test-fixtures-update-pr.yml +│   ├── release-version-alert.yml +│   └── release.yml +├── img +│   └── unstructured_logo.png +├── LICENSE.md +├── Makefile +├── pyproject.toml +├── README.md +├── renovate.json5 +├── scripts +│   ├── airtable-test-helpers +│   │   ├── component_ids.sh +│   │   ├── create_scale_test_components.py +│   │   ├── create_scale_test_components.sh +│   │   └── print_num_rows_df.py +│   ├── benchmark_partition.py +│   ├── check-licenses.sh +│   ├── check-new-release-version.sh +│   ├── chroma-test-helpers +│   │   └── create-and-check-chroma.sh +│   ├── collect_env.py +│   ├── compare_benchmark.py +│   ├── convert +│   │   ├── elements_json_to_format.py +│   │   └── rendered_html_from_elements.py +│   ├── docker-build-ubuntu.sh +│   ├── docker-build.sh +│   ├── docker-smoke-test.sh +│   ├── elasticsearch-test-helpers +│   │   ├── common +│   │   │   ├── docker-compose.yaml +│   │   │   └── es-dest-ingest-test-creds.env +│   │   ├── destination_connector +│   │   │   ├── create_index.py +│   │   │   ├── create-elasticsearch-instance.sh +│   │   │   ├── elasticsearch_elements_mappings.json +│   │   │   ├── es_cluster_config.py +│   │   │   └── test-ingest-elasticsearch-output.py +│   │   └── source_connector +│   │   ├── create_and_fill_es.py +│   │   ├── create-fill-and-check-es.sh +│   │   ├── es_cluster_config.py +│   │   └── wiki_movie_plots_small.csv +│   ├── image +│   │   ├── test-all-outbound-connectivity-scenarios.sh +│   │   └── test-outbound-connectivity.sh +│   ├── ingest-test-fixtures-update.sh +│   ├── initialize-libreoffice.sh +│   ├── kafka-test-helpers +│   │   ├── create-kafka-instance.sh +│   │   └── docker-compose.yml +│   ├── minio-test-helpers +│   │   ├── create-and-check-minio.sh +│   │   ├── docker-compose.yaml +│   │   └── wiki_movie_plots_small.csv +│   ├── opensearch-test-helpers +│   │   ├── common +│   │   │   └── docker-compose.yaml +│   │   ├── destination_connector +│   │   │   ├── create_index.py +│   │   │   ├── create-opensearch-instance.sh +│   │   │   ├── opensearch_cluster_config.py +│   │   │   ├── opensearch_elements_mappings.json +│   │   │   └── test-ingest-opensearch-output.py +│   │   ├── source_connector +│   │   │   ├── create_and_fill_opensearch.py +│   │   │   └── create-and-check-opensearch.sh +│   │   └── wiki_movie_plots_small.csv +│   ├── performance +│   │   ├── benchmark_partition.py +│   │   ├── benchmark-local.sh +│   │   ├── benchmark.sh +│   │   ├── compare_benchmark.py +│   │   ├── docs +│   │   │   ├── book-war-and-peace-1225p.txt -> ../../../example-docs/book-war-and-peace-1225p.txt +│   │   │   ├── book-war-and-peace-1p.txt -> ../../../example-docs/book-war-and-peace-1p.txt +│   │   │   ├── DA-1p.pdf -> ../../../example-docs/DA-1p.pdf +│   │   │   ├── DA-619p.pdf -> ../../../example-docs/DA-619p.pdf +│   │   │   ├── example-10k-1p.html -> ../../../example-docs/example-10k-1p.html +│   │   │   ├── example-10k-230p.html -> ../../../example-docs/example-10k-230p.html +│   │   │   ├── handbook-1p.docx -> ../../../example-docs/handbook-1p.docx +│   │   │   ├── handbook-872p.docx -> ../../../example-docs/handbook-872p.docx +│   │   │   ├── layout-parser-paper-10p.jpg -> ../../../example-docs/layout-parser-paper-10p.jpg +│   │   │   ├── layout-parser-paper-1p.jpg -> ../../../example-docs/layout-parser-paper-fast.jpg +│   │   │   ├── 
layout-parser-paper-fast-16p.pdf -> ../../../example-docs/layout-parser-paper.pdf +│   │   │   ├── layout-parser-paper-hi_res-16p.pdf -> ../../../example-docs/layout-parser-paper.pdf +│   │   │   ├── science-exploration-1p.pptx -> ../../../example-docs/science-exploration-1p.pptx +│   │   │   └── science-exploration-369p.pptx -> ../../../example-docs/science-exploration-369p.pptx +│   │   ├── get-stats-name.sh +│   │   ├── profile.sh +│   │   ├── quick_partition_bench.py +│   │   ├── README.md +│   │   ├── requirements.txt +│   │   ├── run_partition.py +│   │   ├── time_partition.py +│   │   └── warmup_docs +│   │   ├── warmup.docx -> ../../../example-docs/handbook-1p.docx +│   │   ├── warmup.html -> ../../../example-docs/example-10k-1p.html +│   │   ├── warmup.jpg -> ../../../example-docs/layout-parser-paper-fast.jpg +│   │   ├── warmup.pdf -> ../../../example-docs/layout-parser-paper-fast.pdf +│   │   ├── warmup.pptx -> ../../../example-docs/science-exploration-1p.pptx +│   │   └── warmup.txt -> ../../../example-docs/book-war-and-peace-1p.txt +│   ├── renovate-security-bump.sh +│   ├── setup_al2.sh +│   ├── setup_ubuntu.sh +│   ├── sftp-test-helpers +│   │   ├── create-and-check-sftp.sh +│   │   ├── docker-compose.yaml +│   │   └── folder1 +│   │   ├── folder2 +│   │   │   └── wiki_movie_plots_small2.csv +│   │   └── wiki_movie_plots_small.csv +│   ├── shellcheck.sh +│   ├── singlestore-test-helpers +│   │   ├── docker-compose.yml +│   │   ├── schema.sql +│   │   └── test_outputs.py +│   ├── sql-test-helpers +│   │   ├── create-pgvector-schema.sql +│   │   ├── create-sql-instance.sh +│   │   ├── create-sqlite-schema.py +│   │   ├── create-sqlite-schema.sql +│   │   └── docker-compose-pgvector.yaml +│   ├── sync_fork.sh +│   ├── user +│   │   ├── process-pdf-parallel-through-api.sh +│   │   ├── split-pdf.sh +│   │   ├── u-tables-inspect.sh +│   │   └── unstructured-get-json.sh +│   ├── version-sync.sh +│   └── weaviate-test-helpers +│   ├── create_schema.py +│   ├── create-weaviate-instance.sh +│   ├── docker-compose.yml +│   └── elements.json +├── test_unstructured +│   ├── __init__.py +│   ├── chunking +│   │   ├── __init__.py +│   │   ├── test_base.py +│   │   ├── test_basic.py +│   │   ├── test_dispatch.py +│   │   ├── test_html_output.py +│   │   └── test_title.py +│   ├── cleaners +│   │   ├── __init__.py +│   │   ├── test_core.py +│   │   ├── test_extract.py +│   │   └── test_translate.py +│   ├── common +│   │   ├── __init__.py +│   │   └── test_html_table.py +│   ├── documents +│   │   ├── __init__.py +│   │   ├── html_files +│   │   │   ├── example_full_doc.html +│   │   │   ├── example_with_alternative_text.html +│   │   │   ├── example_with_inline_fields.html +│   │   │   ├── example.html +│   │   │   └── three_tables.html +│   │   ├── test_coordinates.py +│   │   ├── test_elements.py +│   │   ├── test_mappings.py +│   │   ├── test_ontology_to_unstructured_parsing.py +│   │   └── unstructured_json_output +│   │   ├── example_full_doc.json +│   │   ├── example_with_alternative_text.json +│   │   ├── example_with_inline_fields.json +│   │   ├── example.json +│   │   └── three_tables.json +│   ├── embed +│   │   ├── __init__.py +│   │   ├── test_mixedbreadai.py +│   │   ├── test_octoai.py +│   │   ├── test_openai.py +│   │   ├── test_vertexai.py +│   │   └── test_voyageai.py +│   ├── file_utils +│   │   ├── __init__.py +│   │   ├── test_encoding.py +│   │   ├── test_file_conversion.py +│   │   ├── test_filetype.py +│   │   ├── test_model.py +│   │   └── test-file-contents.txt +│  
 ├── metrics +│   │   ├── __init__.py +│   │   ├── test_element_type.py +│   │   ├── test_evaluate.py +│   │   ├── test_table_alignment.py +│   │   ├── test_table_detection_metrics.py +│   │   ├── test_table_formats.py +│   │   ├── test_table_structure.py +│   │   ├── test_text_extraction.py +│   │   └── test_utils.py +│   ├── nlp +│   │   ├── __init__.py +│   │   ├── mock_nltk.py +│   │   ├── test_partition.py +│   │   └── test_tokenize.py +│   ├── partition +│   │   ├── __init__.py +│   │   ├── common +│   │   │   ├── __init__.py +│   │   │   ├── test_common.py +│   │   │   ├── test_lang.py +│   │   │   └── test_metadata.py +│   │   ├── conftest.py +│   │   ├── html +│   │   │   ├── __init__.py +│   │   │   ├── test_convert.py +│   │   │   ├── test_html_to_ontology_parsing.py +│   │   │   ├── test_html_to_unstructured_and_back_parsing.py +│   │   │   ├── test_html_utils.py +│   │   │   ├── test_parser.py +│   │   │   ├── test_partition_v2.py +│   │   │   ├── test_partition.py +│   │   │   └── test_unstructured_elements_to_ontology_parsing.py +│   │   ├── pdf_image +│   │   │   ├── __init__.py +│   │   │   ├── conftest.py +│   │   │   ├── test_analysis.py +│   │   │   ├── test_image.py +│   │   │   ├── test_inference_utils.py +│   │   │   ├── test_merge_elements.py +│   │   │   ├── test_ocr.py +│   │   │   ├── test_pdf_image_utils.py +│   │   │   ├── test_pdf.py +│   │   │   ├── test_pdfminer_processing.py +│   │   │   └── test_pdfminer_utils.py +│   │   ├── test_api.py +│   │   ├── test_auto.py +│   │   ├── test_constants.py +│   │   ├── test_csv.py +│   │   ├── test_doc.py +│   │   ├── test_docx.py +│   │   ├── test_email.py +│   │   ├── test_epub.py +│   │   ├── test_json.py +│   │   ├── test_md.py +│   │   ├── test_msg.py +│   │   ├── test_ndjson.py +│   │   ├── test_odt.py +│   │   ├── test_org.py +│   │   ├── test_ppt.py +│   │   ├── test_pptx.py +│   │   ├── test_rst.py +│   │   ├── test_rtf.py +│   │   ├── test_strategies.py +│   │   ├── test_text_type.py +│   │   ├── test_text.py +│   │   ├── test_tsv.py +│   │   ├── test_xlsx.py +│   │   ├── test_xml.py +│   │   └── utils +│   │   ├── __init__.py +│   │   ├── ocr_models +│   │   │   └── test_ocr_interface.py +│   │   ├── test_config.py +│   │   ├── test_sorting.py +│   │   └── test_xycut.py +│   ├── staging +│   │   ├── __init__.py +│   │   ├── test_base.py +│   │   ├── test_baseplate.py +│   │   ├── test_datasaur.py +│   │   ├── test_huggingface.py +│   │   ├── test_label_box.py +│   │   ├── test_label_studio.py +│   │   ├── test_prodigy.py +│   │   └── test_weaviate.py +│   ├── test_utils.py +│   ├── testfiles +│   │   ├── chunking +│   │   │   ├── full_table_long_text_250.json +│   │   │   ├── long_text_table_200.json +│   │   │   ├── table_2000.json +│   │   │   ├── table_text_200.json +│   │   │   └── title_table_200.json +│   │   ├── file_type +│   │   │   └── test_document_from_office365.docx +│   │   └── staging +│   │   ├── embedded-images.pdf.json +│   │   ├── embedded-images.pdf.md +│   │   ├── UDHR_first_article_all.txt.json +│   │   └── UDHR_first_article_all.txt.md +│   └── unit_utils.py +├── test_unstructured_ingest +│   ├── check-diff-evaluation-metrics.sh +│   ├── check-diff-expected-output-html.sh +│   ├── check-diff-expected-output-markdown.sh +│   ├── check-diff-expected-output.sh +│   ├── check-num-dirs-output.sh +│   ├── check-num-files-expected-output.sh +│   ├── check-num-files-output.sh +│   ├── check-num-rows-and-columns-output.sh +│   ├── clean-permissions-files.sh +│   ├── cleanup.sh +│   ├── 
evaluation-ingest-cp.sh +│   ├── evaluation-metrics.sh +│   ├── example-docs +│   │   ├── layout-parser-paper-with-table.jpg +│   │   └── layout-parser-paper.pdf +│   ├── expected-structured-output +│   │   ├── airtable-diff +│   │   │   ├── app5YQxSfp220fWtm +│   │   │   │   ├── tblBoUk54tWXGqYai.json +│   │   │   │   └── tblxdPc7L2meGIZLE.json +│   │   │   └── appJ43QmP8I17zu88 +│   │   │   ├── tblbj2vBlL2dN2xqq.json +│   │   │   └── tblfu7DzEcCWNKwP4.json +│   │   ├── astradb +│   │   │   ├── 25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json +│   │   │   ├── 60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.json +│   │   │   ├── 641d99e3-9941-4c18-9d99-e399414c183d.csv.json +│   │   │   ├── 762c0093-2277-4f3e-ac00-932277af3e0e.csv.json +│   │   │   └── ae40df94-0b3a-4f89-80df-940b3a6f8966.csv.json +│   │   ├── azure +│   │   │   ├── Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +│   │   │   ├── IRS-form-1987.pdf.json +│   │   │   ├── IRS-form-1987.png.json +│   │   │   ├── rfc854.txt.json +│   │   │   └── spring-weather.html.json +│   │   ├── biomed-api +│   │   │   ├── 65 +│   │   │   │   └── 11 +│   │   │   │   └── main.PMC6312790.pdf.json +│   │   │   └── 75 +│   │   │   └── 29 +│   │   │   └── main.PMC6312793.pdf.json +│   │   ├── biomed-path +│   │   │   └── 07 +│   │   │   └── 07 +│   │   │   └── sbaa031.073.PMC7234218.pdf.json +│   │   ├── box +│   │   │   ├── handbook-1p.docx.json +│   │   │   ├── nested-1 +│   │   │   │   ├── ideas-page.html.json +│   │   │   │   └── nested-2 +│   │   │   │   └── ideas-page.html.json +│   │   │   └── science-exploration-1p.pptx.json +│   │   ├── confluence-diff +│   │   │   ├── MFS +│   │   │   │   ├── 1540126.json +│   │   │   │   ├── 1605928.json +│   │   │   │   ├── 1605942.json +│   │   │   │   ├── 1605956.json +│   │   │   │   └── 229477.json +│   │   │   └── testteamsp +│   │   │   ├── 1605859.json +│   │   │   ├── 1605989.json +│   │   │   ├── 1802252.json +│   │   │   ├── 1867777.json +│   │   │   ├── 2589690.json +│   │   │   └── 2589704.json +│   │   ├── delta-table +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json +│   │   │   └── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json +│   │   ├── discord +│   │   │   ├── 1099442333440802930.json +│   │   │   └── 1099601456321003600.json +│   │   ├── dropbox +│   │   │   ├── handbook-1p.docx.json +│   │   │   ├── nested-1 +│   │   │   │   └── ideas-page.html.json +│   │   │   ├── nested-2 +│   │   │   │   └── ideas-page.html.json +│   │   │   └── science-exploration-1p.pptx.json +│   │   ├── elasticsearch +│   │   │   ├── movies-0-57554198.json +│   │   │   ├── movies-1-57554198.json +│   │   │   ├── movies-2-57554198.json +│   │   │   ├── movies-3-57554198.json +│   │   │   ├── movies-4-57554198.json +│   │   │   ├── movies-5-57554198.json +│   │   │   ├── movies-6-57554198.json +│   │   │   ├── movies-7-57554198.json +│   │   │   ├── movies-8-57554198.json +│   │   │   └── movies-9-57554198.json +│   │   ├── embed +│   │   │   └── book-war-and-peace-1p.txt.json +│   │   ├── 
embed-bedrock +│   │   │   └── book-war-and-peace-1p.txt.json +│   │   ├── embed-mixedbreadai +│   │   │   └── book-war-and-peace-1p.txt.json +│   │   ├── embed-vertexai +│   │   │   └── book-war-and-peace-1p.txt.json +│   │   ├── embed-voyageai +│   │   │   └── book-war-and-peace-1p.txt.json +│   │   ├── gcs +│   │   │   ├── ideas-page.html.json +│   │   │   ├── nested-1 +│   │   │   │   ├── fake-text.txt.json +│   │   │   │   └── nested +│   │   │   │   └── ideas-page.html.json +│   │   │   └── nested-2 +│   │   │   ├── fake-text.txt.json +│   │   │   ├── nested +│   │   │   │   └── ideas-page.html.json +│   │   │   └── stanley-cups.xlsx.json +│   │   ├── github +│   │   │   ├── LICENSE.txt.json +│   │   │   └── test.html.json +│   │   ├── google-drive +│   │   │   ├── fake.docx.json +│   │   │   ├── nested +│   │   │   │   └── fake.docx.json +│   │   │   ├── recalibrating-risk-report.pdf.json +│   │   │   └── test-drive-doc.docx.json +│   │   ├── hubspot +│   │   │   ├── calls +│   │   │   │   ├── 41286477879.json +│   │   │   │   └── 41287834137.json +│   │   │   ├── communications +│   │   │   │   ├── 41286064390.json +│   │   │   │   └── 41286467154.json +│   │   │   ├── emails +│   │   │   │   └── 41287832127.json +│   │   │   ├── notes +│   │   │   │   └── 41287833179.json +│   │   │   ├── products +│   │   │   │   ├── 2362691415.json +│   │   │   │   ├── 2362691416.json +│   │   │   │   └── 2362691417.json +│   │   │   └── tickets +│   │   │   ├── 1976928828.json +│   │   │   ├── 2002294392.json +│   │   │   └── 2002301007.json +│   │   ├── jira-diff +│   │   │   ├── 1 +│   │   │   │   ├── 10000.json +│   │   │   │   ├── 10001.json +│   │   │   │   ├── 10002.json +│   │   │   │   └── 10013.json +│   │   │   ├── JCTP2 +│   │   │   │   ├── 10006.json +│   │   │   │   ├── 10009.json +│   │   │   │   ├── 10010.json +│   │   │   │   ├── 10012.json +│   │   │   │   └── 10015.json +│   │   │   └── JCTP3 +│   │   │   └── 10014.json +│   │   ├── kafka +│   │   │   └── fake-topic.json +│   │   ├── local-single-file +│   │   │   └── UDHR_first_article_all.txt.json +│   │   ├── local-single-file-basic-chunking +│   │   │   └── handbook-1p.docx.json +│   │   ├── local-single-file-chunk-no-orig-elements +│   │   │   └── multi-column-2p.pdf.json +│   │   ├── local-single-file-with-encoding +│   │   │   └── fake-html-cp1252.html.json +│   │   ├── local-single-file-with-pdf-infer-table-structure +│   │   │   ├── layout-parser-paper-with-table.jpg.json +│   │   │   └── layout-parser-paper.pdf.json +│   │   ├── mongodb +│   │   │   ├── 659daefa21dd8c9054b084b6.json +│   │   │   ├── 659daefa21dd8c9054b084b7.json +│   │   │   ├── 659daefa21dd8c9054b084b8.json +│   │   │   └── 659daefa21dd8c9054b084b9.json +│   │   ├── notion +│   │   │   ├── 122b2c22-996b-435b-9de2-ee0e9d2b04bc.json +│   │   │   ├── 438dbc49-2e06-4f01-8031-bf283be58a60.json +│   │   │   ├── 4695ea53-f2b3-45b4-8638-2212fd054d73.json +│   │   │   ├── 5481f29c-799a-4d7b-93ce-b11bcaede531.json +│   │   │   ├── 60377009-e6b2-47f3-a8ff-f159fd8b69f5.json +│   │   │   ├── 898538f2-26e1-4de7-81e6-354045d4d007.json +│   │   │   ├── 8d8bee42-2167-441c-af6c-7b2cff268809.json +│   │   │   ├── 8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json +│   │   │   ├── 9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json +│   │   │   ├── b2a12157-721e-4207-b3b7-527762b782c2.json +│   │   │   ├── c47a4566-4c7a-488b-ac2a-1292ee507fcb.json +│   │   │   ├── dfcbe584-30b1-4551-b533-e6a5759af842.json +│   │   │   ├── feccfcd4-8ca0-4638-8212-1a5726461029.json +│   │   │   └── 
fee2149e-6240-4431-8e98-a04a2e460a66.json +│   │   ├── onedrive +│   │   │   └── utic-test-ingest-fixtures +│   │   │   ├── fake-text.txt.json +│   │   │   ├── nested +│   │   │   │   └── fake-text.txt.json +│   │   │   └── tests-example.xls.json +│   │   ├── opensearch +│   │   │   ├── movies-0-57554198.json +│   │   │   ├── movies-1-57554198.json +│   │   │   ├── movies-2-57554198.json +│   │   │   ├── movies-3-57554198.json +│   │   │   ├── movies-4-57554198.json +│   │   │   ├── movies-5-57554198.json +│   │   │   ├── movies-6-57554198.json +│   │   │   ├── movies-7-57554198.json +│   │   │   ├── movies-8-57554198.json +│   │   │   └── movies-9-57554198.json +│   │   ├── outlook +│   │   │   ├── 21be155fb0c95885.eml.json +│   │   │   ├── 497eba8c81c801c6.eml.json +│   │   │   └── 4a16a411f162ebbb.eml.json +│   │   ├── pdf-fast-reprocess +│   │   │   └── azure +│   │   │   ├── Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +│   │   │   └── IRS-form-1987.pdf.json +│   │   ├── s3 +│   │   │   ├── 2023-Jan-economic-outlook.pdf.json +│   │   │   ├── page-with-formula.pdf.json +│   │   │   ├── recalibrating-risk-report.pdf.json +│   │   │   └── Silent-Giant-(1).pdf.json +│   │   ├── s3-minio +│   │   │   └── wiki_movie_plots_small.csv.json +│   │   ├── salesforce +│   │   │   ├── Campaign +│   │   │   │   ├── 701Hu000001eX9EIAU.xml.json +│   │   │   │   ├── 701Hu000001eX9FIAU.xml.json +│   │   │   │   ├── 701Hu000001eX9GIAU.xml.json +│   │   │   │   └── 701Hu000001eX9HIAU.xml.json +│   │   │   └── EmailMessage +│   │   │   ├── 02sHu00001efErPIAU.eml.json +│   │   │   └── 02sHu00001efErQIAU.eml.json +│   │   ├── sftp +│   │   │   └── folder1 +│   │   │   ├── folder2 +│   │   │   │   └── wiki_movie_plots_small2.csv.json +│   │   │   └── wiki_movie_plots_small.csv.json +│   │   ├── Sharepoint +│   │   │   ├── Document.docx.json +│   │   │   ├── fake-text.txt.json +│   │   │   ├── ideas-page.html.json +│   │   │   ├── nested +│   │   │   │   ├── 2023-Jan-economic-outlook.pdf.json +│   │   │   │   └── page-with-formula.pdf.json +│   │   │   ├── permissions-fake-text.docx.json +│   │   │   ├── SitePages +│   │   │   │   ├── Home.aspx.json +│   │   │   │   └── This-is-a-title.aspx.json +│   │   │   └── stanley-cups.xlsx.json +│   │   ├── Sharepoint-with-permissions +│   │   │   ├── Document.docx.json +│   │   │   ├── fake-text.txt.json +│   │   │   ├── ideas-page.html.json +│   │   │   ├── nested +│   │   │   │   ├── 2023-Jan-economic-outlook.pdf.json +│   │   │   │   └── page-with-formula.pdf.json +│   │   │   ├── permissions-fake-text.docx.json +│   │   │   ├── SitePages +│   │   │   │   ├── Home.aspx.json +│   │   │   │   └── This-is-a-title.aspx.json +│   │   │   └── stanley-cups.xlsx.json +│   │   └── slack +│   │   └── C07ABKJ83C6.json +│   ├── expected-structured-output-html +│   │   ├── airtable-diff +│   │   │   ├── app5YQxSfp220fWtm +│   │   │   │   ├── tblBoUk54tWXGqYai.html +│   │   │   │   └── tblxdPc7L2meGIZLE.html +│   │   │   └── appJ43QmP8I17zu88 +│   │   │   ├── tblbj2vBlL2dN2xqq.html +│   │   │   └── tblfu7DzEcCWNKwP4.html +│   │   ├── astradb +│   │   │   ├── 25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.html +│   │   │   ├── 60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.html +│   │   │   ├── 641d99e3-9941-4c18-9d99-e399414c183d.csv.html +│   │   │   ├── 762c0093-2277-4f3e-ac00-932277af3e0e.csv.html +│   │   │   └── ae40df94-0b3a-4f89-80df-940b3a6f8966.csv.html +│   │   ├── azure +│   │   │   ├── Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html +│   │   │   ├── 
IRS-form-1987.pdf.html +│   │   │   ├── IRS-form-1987.png.html +│   │   │   ├── rfc854.txt.html +│   │   │   └── spring-weather.html.html +│   │   ├── biomed-api +│   │   │   ├── 65 +│   │   │   │   └── 11 +│   │   │   │   └── main.PMC6312790.pdf.html +│   │   │   └── 75 +│   │   │   └── 29 +│   │   │   └── main.PMC6312793.pdf.html +│   │   ├── biomed-path +│   │   │   └── 07 +│   │   │   └── 07 +│   │   │   └── sbaa031.073.PMC7234218.pdf.html +│   │   ├── box +│   │   │   ├── handbook-1p.docx.html +│   │   │   ├── nested-1 +│   │   │   │   ├── ideas-page.html.html +│   │   │   │   └── nested-2 +│   │   │   │   └── ideas-page.html.html +│   │   │   └── science-exploration-1p.pptx.html +│   │   ├── confluence-diff +│   │   │   ├── MFS +│   │   │   │   ├── 1540126.html +│   │   │   │   ├── 1605928.html +│   │   │   │   ├── 1605942.html +│   │   │   │   ├── 1605956.html +│   │   │   │   └── 229477.html +│   │   │   └── testteamsp +│   │   │   ├── 1605859.html +│   │   │   ├── 1605989.html +│   │   │   ├── 1802252.html +│   │   │   ├── 1867777.html +│   │   │   ├── 2589690.html +│   │   │   └── 2589704.html +│   │   ├── delta-table +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.html +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.html +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.html +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.html +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.html +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.html +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.html +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.html +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.html +│   │   │   └── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.html +│   │   ├── discord +│   │   │   ├── 1099442333440802930.html +│   │   │   └── 1099601456321003600.html +│   │   ├── dropbox +│   │   │   ├── handbook-1p.docx.html +│   │   │   ├── nested-1 +│   │   │   │   └── ideas-page.html.html +│   │   │   ├── nested-2 +│   │   │   │   └── ideas-page.html.html +│   │   │   └── science-exploration-1p.pptx.html +│   │   ├── elasticsearch +│   │   │   ├── movies-0-57554198.html +│   │   │   ├── movies-1-57554198.html +│   │   │   ├── movies-2-57554198.html +│   │   │   ├── movies-3-57554198.html +│   │   │   ├── movies-4-57554198.html +│   │   │   ├── movies-5-57554198.html +│   │   │   ├── movies-6-57554198.html +│   │   │   ├── movies-7-57554198.html +│   │   │   ├── movies-8-57554198.html +│   │   │   └── movies-9-57554198.html +│   │   ├── embed +│   │   │   └── book-war-and-peace-1p.txt.html +│   │   ├── embed-bedrock +│   │   │   └── book-war-and-peace-1p.txt.html +│   │   ├── embed-mixedbreadai +│   │   │   └── book-war-and-peace-1p.txt.html +│   │   ├── embed-vertexai +│   │   │   └── book-war-and-peace-1p.txt.html +│   │   ├── embed-voyageai +│   │   │   └── book-war-and-peace-1p.txt.html +│   │   ├── gcs +│   │   │   ├── ideas-page.html.html +│   │   │   ├── nested-1 +│   │   │   │   ├── fake-text.txt.html +│   │   │   │   └── nested +│   │   │   │   └── ideas-page.html.html +│   │   │   └── nested-2 +│   │   │   ├── fake-text.txt.html +│   │   │   ├── nested +│   │   │   │   └── ideas-page.html.html +│   │   │   └── stanley-cups.xlsx.html +│   │   ├── github +│   │   │   ├── LICENSE.txt.html +│   │   │   └── test.html.html +│   │   ├── google-drive +│   │   │   ├── fake.docx.html +│   │   │   ├── nested +│   │   │   │   └── fake.docx.html +│   │   │   ├── recalibrating-risk-report.pdf.html +│   │   │  
 └── test-drive-doc.docx.html +│   │   ├── hubspot +│   │   │   ├── calls +│   │   │   │   ├── 41286477879.html +│   │   │   │   └── 41287834137.html +│   │   │   ├── communications +│   │   │   │   ├── 41286064390.html +│   │   │   │   └── 41286467154.html +│   │   │   ├── emails +│   │   │   │   └── 41287832127.html +│   │   │   ├── notes +│   │   │   │   └── 41287833179.html +│   │   │   ├── products +│   │   │   │   ├── 2362691415.html +│   │   │   │   ├── 2362691416.html +│   │   │   │   └── 2362691417.html +│   │   │   └── tickets +│   │   │   ├── 1976928828.html +│   │   │   ├── 2002294392.html +│   │   │   └── 2002301007.html +│   │   ├── jira-diff +│   │   │   ├── 1 +│   │   │   │   ├── 10000.html +│   │   │   │   ├── 10001.html +│   │   │   │   ├── 10002.html +│   │   │   │   └── 10013.html +│   │   │   ├── JCTP2 +│   │   │   │   ├── 10006.html +│   │   │   │   ├── 10009.html +│   │   │   │   ├── 10010.html +│   │   │   │   ├── 10012.html +│   │   │   │   └── 10015.html +│   │   │   └── JCTP3 +│   │   │   └── 10014.html +│   │   ├── kafka +│   │   │   └── fake-topic.html +│   │   ├── local-single-file +│   │   │   └── UDHR_first_article_all.txt.html +│   │   ├── local-single-file-basic-chunking +│   │   │   └── handbook-1p.docx.html +│   │   ├── local-single-file-chunk-no-orig-elements +│   │   │   └── multi-column-2p.pdf.html +│   │   ├── local-single-file-with-encoding +│   │   │   └── fake-html-cp1252.html.html +│   │   ├── local-single-file-with-pdf-infer-table-structure +│   │   │   ├── layout-parser-paper-with-table.jpg.html +│   │   │   └── layout-parser-paper.pdf.html +│   │   ├── mongodb +│   │   │   ├── 659daefa21dd8c9054b084b6.html +│   │   │   ├── 659daefa21dd8c9054b084b7.html +│   │   │   ├── 659daefa21dd8c9054b084b8.html +│   │   │   └── 659daefa21dd8c9054b084b9.html +│   │   ├── notion +│   │   │   ├── 122b2c22-996b-435b-9de2-ee0e9d2b04bc.html +│   │   │   ├── 438dbc49-2e06-4f01-8031-bf283be58a60.html +│   │   │   ├── 4695ea53-f2b3-45b4-8638-2212fd054d73.html +│   │   │   ├── 5481f29c-799a-4d7b-93ce-b11bcaede531.html +│   │   │   ├── 60377009-e6b2-47f3-a8ff-f159fd8b69f5.html +│   │   │   ├── 898538f2-26e1-4de7-81e6-354045d4d007.html +│   │   │   ├── 8d8bee42-2167-441c-af6c-7b2cff268809.html +│   │   │   ├── 8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.html +│   │   │   ├── 9e20be3d-cbe0-4e28-ad46-2170d40a8d37.html +│   │   │   ├── b2a12157-721e-4207-b3b7-527762b782c2.html +│   │   │   ├── c47a4566-4c7a-488b-ac2a-1292ee507fcb.html +│   │   │   ├── dfcbe584-30b1-4551-b533-e6a5759af842.html +│   │   │   ├── feccfcd4-8ca0-4638-8212-1a5726461029.html +│   │   │   └── fee2149e-6240-4431-8e98-a04a2e460a66.html +│   │   ├── onedrive +│   │   │   └── utic-test-ingest-fixtures +│   │   │   ├── fake-text.txt.html +│   │   │   ├── nested +│   │   │   │   └── fake-text.txt.html +│   │   │   └── tests-example.xls.html +│   │   ├── opensearch +│   │   │   ├── movies-0-57554198.html +│   │   │   ├── movies-1-57554198.html +│   │   │   ├── movies-2-57554198.html +│   │   │   ├── movies-3-57554198.html +│   │   │   ├── movies-4-57554198.html +│   │   │   ├── movies-5-57554198.html +│   │   │   ├── movies-6-57554198.html +│   │   │   ├── movies-7-57554198.html +│   │   │   ├── movies-8-57554198.html +│   │   │   └── movies-9-57554198.html +│   │   ├── outlook +│   │   │   ├── 21be155fb0c95885.eml.html +│   │   │   ├── 497eba8c81c801c6.eml.html +│   │   │   └── 4a16a411f162ebbb.eml.html +│   │   ├── pdf-fast-reprocess +│   │   │   └── azure +│   │   │   ├── 
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html +│   │   │   └── IRS-form-1987.pdf.html +│   │   ├── s3 +│   │   │   ├── 2023-Jan-economic-outlook.pdf.html +│   │   │   ├── page-with-formula.pdf.html +│   │   │   ├── recalibrating-risk-report.pdf.html +│   │   │   └── Silent-Giant-(1).pdf.html +│   │   ├── s3-minio +│   │   │   └── wiki_movie_plots_small.csv.html +│   │   ├── salesforce +│   │   │   ├── Campaign +│   │   │   │   ├── 701Hu000001eX9EIAU.xml.html +│   │   │   │   ├── 701Hu000001eX9FIAU.xml.html +│   │   │   │   ├── 701Hu000001eX9GIAU.xml.html +│   │   │   │   └── 701Hu000001eX9HIAU.xml.html +│   │   │   └── EmailMessage +│   │   │   ├── 02sHu00001efErPIAU.eml.html +│   │   │   └── 02sHu00001efErQIAU.eml.html +│   │   ├── sftp +│   │   │   └── folder1 +│   │   │   ├── folder2 +│   │   │   │   └── wiki_movie_plots_small2.csv.html +│   │   │   └── wiki_movie_plots_small.csv.html +│   │   ├── Sharepoint +│   │   │   ├── Document.docx.html +│   │   │   ├── fake-text.txt.html +│   │   │   ├── ideas-page.html.html +│   │   │   ├── nested +│   │   │   │   ├── 2023-Jan-economic-outlook.pdf.html +│   │   │   │   └── page-with-formula.pdf.html +│   │   │   ├── permissions-fake-text.docx.html +│   │   │   ├── SitePages +│   │   │   │   ├── Home.aspx.html +│   │   │   │   └── This-is-a-title.aspx.html +│   │   │   └── stanley-cups.xlsx.html +│   │   ├── Sharepoint-with-permissions +│   │   │   ├── Document.docx.html +│   │   │   ├── fake-text.txt.html +│   │   │   ├── ideas-page.html.html +│   │   │   ├── nested +│   │   │   │   ├── 2023-Jan-economic-outlook.pdf.html +│   │   │   │   └── page-with-formula.pdf.html +│   │   │   ├── permissions-fake-text.docx.html +│   │   │   ├── SitePages +│   │   │   │   ├── Home.aspx.html +│   │   │   │   └── This-is-a-title.aspx.html +│   │   │   └── stanley-cups.xlsx.html +│   │   └── slack +│   │   └── C07ABKJ83C6.html +│   ├── expected-structured-output-markdown +│   │   ├── airtable-diff +│   │   │   ├── app5YQxSfp220fWtm +│   │   │   │   ├── tblBoUk54tWXGqYai.md +│   │   │   │   └── tblxdPc7L2meGIZLE.md +│   │   │   └── appJ43QmP8I17zu88 +│   │   │   ├── tblbj2vBlL2dN2xqq.md +│   │   │   └── tblfu7DzEcCWNKwP4.md +│   │   ├── astradb +│   │   │   ├── 25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.md +│   │   │   ├── 60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.md +│   │   │   ├── 641d99e3-9941-4c18-9d99-e399414c183d.csv.md +│   │   │   ├── 762c0093-2277-4f3e-ac00-932277af3e0e.csv.md +│   │   │   └── ae40df94-0b3a-4f89-80df-940b3a6f8966.csv.md +│   │   ├── azure +│   │   │   ├── Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.md +│   │   │   ├── IRS-form-1987.pdf.md +│   │   │   ├── IRS-form-1987.png.md +│   │   │   ├── rfc854.txt.md +│   │   │   └── spring-weather.html.md +│   │   ├── biomed-api +│   │   │   ├── 65 +│   │   │   │   └── 11 +│   │   │   │   └── main.PMC6312790.pdf.md +│   │   │   └── 75 +│   │   │   └── 29 +│   │   │   └── main.PMC6312793.pdf.md +│   │   ├── biomed-path +│   │   │   └── 07 +│   │   │   └── 07 +│   │   │   └── sbaa031.073.PMC7234218.pdf.md +│   │   ├── box +│   │   │   ├── handbook-1p.docx.md +│   │   │   ├── nested-1 +│   │   │   │   ├── ideas-page.html.md +│   │   │   │   └── nested-2 +│   │   │   │   └── ideas-page.html.md +│   │   │   └── science-exploration-1p.pptx.md +│   │   ├── confluence-diff +│   │   │   ├── MFS +│   │   │   │   ├── 1540126.md +│   │   │   │   ├── 1605928.md +│   │   │   │   ├── 1605942.md +│   │   │   │   ├── 1605956.md +│   │   │   │   └── 229477.md +│   │   │   └── testteamsp +│  
 │   │   ├── 1605859.md +│   │   │   ├── 1605989.md +│   │   │   ├── 1802252.md +│   │   │   ├── 1867777.md +│   │   │   ├── 2589690.md +│   │   │   └── 2589704.md +│   │   ├── delta-table +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.md +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.md +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.md +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.md +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.md +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.md +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.md +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.md +│   │   │   ├── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.md +│   │   │   └── 0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.md +│   │   ├── discord +│   │   │   ├── 1099442333440802930.md +│   │   │   └── 1099601456321003600.md +│   │   ├── dropbox +│   │   │   ├── handbook-1p.docx.md +│   │   │   ├── nested-1 +│   │   │   │   └── ideas-page.html.md +│   │   │   ├── nested-2 +│   │   │   │   └── ideas-page.html.md +│   │   │   └── science-exploration-1p.pptx.md +│   │   ├── elasticsearch +│   │   │   ├── movies-0-57554198.md +│   │   │   ├── movies-1-57554198.md +│   │   │   ├── movies-2-57554198.md +│   │   │   ├── movies-3-57554198.md +│   │   │   ├── movies-4-57554198.md +│   │   │   ├── movies-5-57554198.md +│   │   │   ├── movies-6-57554198.md +│   │   │   ├── movies-7-57554198.md +│   │   │   ├── movies-8-57554198.md +│   │   │   └── movies-9-57554198.md +│   │   ├── embed +│   │   │   └── book-war-and-peace-1p.txt.md +│   │   ├── embed-bedrock +│   │   │   └── book-war-and-peace-1p.txt.md +│   │   ├── embed-mixedbreadai +│   │   │   └── book-war-and-peace-1p.txt.md +│   │   ├── embed-vertexai +│   │   │   └── book-war-and-peace-1p.txt.md +│   │   ├── embed-voyageai +│   │   │   └── book-war-and-peace-1p.txt.md +│   │   ├── gcs +│   │   │   ├── ideas-page.html.md +│   │   │   ├── nested-1 +│   │   │   │   ├── fake-text.txt.md +│   │   │   │   └── nested +│   │   │   │   └── ideas-page.html.md +│   │   │   └── nested-2 +│   │   │   ├── fake-text.txt.md +│   │   │   ├── nested +│   │   │   │   └── ideas-page.html.md +│   │   │   └── stanley-cups.xlsx.md +│   │   ├── github +│   │   │   ├── LICENSE.txt.md +│   │   │   └── test.html.md +│   │   ├── google-drive +│   │   │   ├── fake.docx.md +│   │   │   ├── nested +│   │   │   │   └── fake.docx.md +│   │   │   ├── recalibrating-risk-report.pdf.md +│   │   │   └── test-drive-doc.docx.md +│   │   ├── hubspot +│   │   │   ├── calls +│   │   │   │   ├── 41286477879.md +│   │   │   │   └── 41287834137.md +│   │   │   ├── communications +│   │   │   │   ├── 41286064390.md +│   │   │   │   └── 41286467154.md +│   │   │   ├── emails +│   │   │   │   └── 41287832127.md +│   │   │   ├── notes +│   │   │   │   └── 41287833179.md +│   │   │   ├── products +│   │   │   │   ├── 2362691415.md +│   │   │   │   ├── 2362691416.md +│   │   │   │   └── 2362691417.md +│   │   │   └── tickets +│   │   │   ├── 1976928828.md +│   │   │   ├── 2002294392.md +│   │   │   └── 2002301007.md +│   │   ├── jira-diff +│   │   │   ├── 1 +│   │   │   │   ├── 10000.md +│   │   │   │   ├── 10001.md +│   │   │   │   ├── 10002.md +│   │   │   │   └── 10013.md +│   │   │   ├── JCTP2 +│   │   │   │   ├── 10006.md +│   │   │   │   ├── 10009.md +│   │   │   │   ├── 10010.md +│   │   │   │   ├── 10012.md +│   │   │   │   └── 10015.md +│   │   │   └── JCTP3 +│   │   │   └── 10014.md +│   │   ├── kafka +│   │   │   └── 
fake-topic.md +│   │   ├── local-single-file +│   │   │   └── UDHR_first_article_all.txt.md +│   │   ├── local-single-file-basic-chunking +│   │   │   └── handbook-1p.docx.md +│   │   ├── local-single-file-chunk-no-orig-elements +│   │   │   └── multi-column-2p.pdf.md +│   │   ├── local-single-file-with-encoding +│   │   │   └── fake-html-cp1252.html.md +│   │   ├── local-single-file-with-pdf-infer-table-structure +│   │   │   ├── layout-parser-paper-with-table.jpg.md +│   │   │   └── layout-parser-paper.pdf.md +│   │   ├── mongodb +│   │   │   ├── 659daefa21dd8c9054b084b6.md +│   │   │   ├── 659daefa21dd8c9054b084b7.md +│   │   │   ├── 659daefa21dd8c9054b084b8.md +│   │   │   └── 659daefa21dd8c9054b084b9.md +│   │   ├── notion +│   │   │   ├── 122b2c22-996b-435b-9de2-ee0e9d2b04bc.md +│   │   │   ├── 438dbc49-2e06-4f01-8031-bf283be58a60.md +│   │   │   ├── 4695ea53-f2b3-45b4-8638-2212fd054d73.md +│   │   │   ├── 5481f29c-799a-4d7b-93ce-b11bcaede531.md +│   │   │   ├── 60377009-e6b2-47f3-a8ff-f159fd8b69f5.md +│   │   │   ├── 898538f2-26e1-4de7-81e6-354045d4d007.md +│   │   │   ├── 8d8bee42-2167-441c-af6c-7b2cff268809.md +│   │   │   ├── 8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.md +│   │   │   ├── 9e20be3d-cbe0-4e28-ad46-2170d40a8d37.md +│   │   │   ├── b2a12157-721e-4207-b3b7-527762b782c2.md +│   │   │   ├── c47a4566-4c7a-488b-ac2a-1292ee507fcb.md +│   │   │   ├── dfcbe584-30b1-4551-b533-e6a5759af842.md +│   │   │   ├── feccfcd4-8ca0-4638-8212-1a5726461029.md +│   │   │   └── fee2149e-6240-4431-8e98-a04a2e460a66.md +│   │   ├── onedrive +│   │   │   └── utic-test-ingest-fixtures +│   │   │   ├── fake-text.txt.md +│   │   │   ├── nested +│   │   │   │   └── fake-text.txt.md +│   │   │   └── tests-example.xls.md +│   │   ├── opensearch +│   │   │   ├── movies-0-57554198.md +│   │   │   ├── movies-1-57554198.md +│   │   │   ├── movies-2-57554198.md +│   │   │   ├── movies-3-57554198.md +│   │   │   ├── movies-4-57554198.md +│   │   │   ├── movies-5-57554198.md +│   │   │   ├── movies-6-57554198.md +│   │   │   ├── movies-7-57554198.md +│   │   │   ├── movies-8-57554198.md +│   │   │   └── movies-9-57554198.md +│   │   ├── outlook +│   │   │   ├── 21be155fb0c95885.eml.md +│   │   │   ├── 497eba8c81c801c6.eml.md +│   │   │   └── 4a16a411f162ebbb.eml.md +│   │   ├── pdf-fast-reprocess +│   │   │   └── azure +│   │   │   ├── Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.md +│   │   │   └── IRS-form-1987.pdf.md +│   │   ├── s3 +│   │   │   ├── 2023-Jan-economic-outlook.pdf.md +│   │   │   ├── page-with-formula.pdf.md +│   │   │   ├── recalibrating-risk-report.pdf.md +│   │   │   └── Silent-Giant-(1).pdf.md +│   │   ├── s3-minio +│   │   │   └── wiki_movie_plots_small.csv.md +│   │   ├── salesforce +│   │   │   ├── Campaign +│   │   │   │   ├── 701Hu000001eX9EIAU.xml.md +│   │   │   │   ├── 701Hu000001eX9FIAU.xml.md +│   │   │   │   ├── 701Hu000001eX9GIAU.xml.md +│   │   │   │   └── 701Hu000001eX9HIAU.xml.md +│   │   │   └── EmailMessage +│   │   │   ├── 02sHu00001efErPIAU.eml.md +│   │   │   └── 02sHu00001efErQIAU.eml.md +│   │   ├── sftp +│   │   │   └── folder1 +│   │   │   ├── folder2 +│   │   │   │   └── wiki_movie_plots_small2.csv.md +│   │   │   └── wiki_movie_plots_small.csv.md +│   │   ├── Sharepoint +│   │   │   ├── Document.docx.md +│   │   │   ├── fake-text.txt.md +│   │   │   ├── ideas-page.html.md +│   │   │   ├── nested +│   │   │   │   ├── 2023-Jan-economic-outlook.pdf.md +│   │   │   │   └── page-with-formula.pdf.md +│   │   │   ├── permissions-fake-text.docx.md +│   │   │   
├── SitePages +│   │   │   │   ├── Home.aspx.md +│   │   │   │   └── This-is-a-title.aspx.md +│   │   │   └── stanley-cups.xlsx.md +│   │   ├── Sharepoint-with-permissions +│   │   │   ├── Document.docx.md +│   │   │   ├── fake-text.txt.md +│   │   │   ├── ideas-page.html.md +│   │   │   ├── nested +│   │   │   │   ├── 2023-Jan-economic-outlook.pdf.md +│   │   │   │   └── page-with-formula.pdf.md +│   │   │   ├── permissions-fake-text.docx.md +│   │   │   ├── SitePages +│   │   │   │   ├── Home.aspx.md +│   │   │   │   └── This-is-a-title.aspx.md +│   │   │   └── stanley-cups.xlsx.md +│   │   └── slack +│   │   └── C07ABKJ83C6.md +│   ├── failed-partition-docs +│   │   ├── sample.gif +│   │   └── small.txt +│   ├── files +│   │   └── azure_cognitive_index_schema.json +│   ├── json-to-clean-text-folder.sh +│   ├── json-to-text.sh +│   ├── metrics +│   │   ├── element-type +│   │   │   ├── aggregate-scores-element-type.tsv +│   │   │   └── all-docs-element-type-frequency.tsv +│   │   └── metrics-json-manifest.txt +│   ├── python +│   │   ├── test-azure-output.py +│   │   ├── test-databricks-volumes.py +│   │   ├── test-gcs-output.py +│   │   ├── test-ingest-astradb-output.py +│   │   ├── test-ingest-chroma-output.py +│   │   ├── test-ingest-delta-table-output.py +│   │   ├── test-ingest-mongodb.py +│   │   ├── test-ingest-sql-output.py +│   │   ├── test-ingest-weaviate-output.py +│   │   ├── test-kafka-output.py +│   │   └── test-produce-kafka-message.py +│   ├── src +│   │   ├── against-api.sh +│   │   ├── airtable-diff.sh +│   │   ├── airtable-large.sh +│   │   ├── astradb.sh +│   │   ├── azure.sh +│   │   ├── biomed-api.sh +│   │   ├── biomed-path.sh +│   │   ├── box.sh +│   │   ├── confluence-diff.sh +│   │   ├── confluence-large.sh +│   │   ├── delta-table.sh +│   │   ├── discord.sh +│   │   ├── dropbox.sh +│   │   ├── elasticsearch.sh +│   │   ├── gcs.sh +│   │   ├── github.sh +│   │   ├── gitlab.sh +│   │   ├── google-drive.sh +│   │   ├── hubspot.sh +│   │   ├── jira.sh +│   │   ├── kafka-local.sh +│   │   ├── local-embed-bedrock.sh +│   │   ├── local-embed-mixedbreadai.sh +│   │   ├── local-embed-octoai.sh +│   │   ├── local-embed-vertexai.sh +│   │   ├── local-embed-voyageai.sh +│   │   ├── local-embed.sh +│   │   ├── local-failed-partition.sh +│   │   ├── local-single-file-basic-chunking.sh +│   │   ├── local-single-file-chunk-no-orig-elements.sh +│   │   ├── local-single-file-with-encoding.sh +│   │   ├── local-single-file-with-pdf-infer-table-structure.sh +│   │   ├── local-single-file.sh +│   │   ├── local.sh +│   │   ├── mongodb.sh +│   │   ├── notion.sh +│   │   ├── onedrive.sh +│   │   ├── opensearch.sh +│   │   ├── outlook.sh +│   │   ├── pdf-fast-reprocess.sh +│   │   ├── s3-compression.sh +│   │   ├── s3-minio.sh +│   │   ├── s3.sh +│   │   ├── salesforce.sh +│   │   ├── sftp.sh +│   │   ├── sharepoint-with-permissions.sh +│   │   ├── sharepoint.sh +│   │   ├── slack.sh +│   │   └── wikipedia.sh +│   ├── structured-json-to-html.sh +│   ├── structured-json-to-markdown.sh +│   ├── test-ingest-dest.sh +│   └── test-ingest-src.sh +├── typings +│   ├── filetype +│   │   └── __init__.pyi +│   ├── lxml +│   │   ├── _types.pyi +│   │   ├── etree +│   │   │   ├── __init__.pyi +│   │   │   ├── _classlookup.pyi +│   │   │   ├── _cleanup.pyi +│   │   │   ├── _element.pyi +│   │   │   ├── _iterparse.pyi +│   │   │   ├── _module_func.pyi +│   │   │   ├── _module_misc.pyi +│   │   │   ├── _nsclasses.pyi +│   │   │   ├── _parser.pyi +│   │   │   └── _xpath.pyi +│   │   └── html +│   │   
├── __init__.pyi +│   │   ├── _element.pyi +│   │   ├── _parse.pyi +│   │   └── soupparser.pyi +│   ├── magic +│   │   └── __init__.pyi +│   ├── nltk +│   │   ├── __init__.pyi +│   │   ├── data.pyi +│   │   ├── downloader.pyi +│   │   ├── internals.pyi +│   │   ├── tag.pyi +│   │   └── tokenize.pyi +│   ├── pandas +│   │   ├── __init__.pyi +│   │   ├── _typing.pyi +│   │   ├── core +│   │   │   ├── api.pyi +│   │   │   └── frame.pyi +│   │   └── io +│   │   ├── api.pyi +│   │   ├── excel +│   │   │   ├── __init__.pyi +│   │   │   └── _base.pyi +│   │   └── parsers +│   │   ├── __init__.pyi +│   │   └── readers.pyi +│   ├── pptx +│   │   ├── __init__.pyi +│   │   ├── api.pyi +│   │   ├── oxml +│   │   │   ├── __init__.py +│   │   │   ├── text.pyi +│   │   │   └── xmlchemy.pyi +│   │   ├── presentation.pyi +│   │   ├── shapes +│   │   │   ├── __init__.py +│   │   │   ├── autoshape.pyi +│   │   │   ├── base.pyi +│   │   │   ├── graphfrm.pyi +│   │   │   ├── group.pyi +│   │   │   ├── picture.pyi +│   │   │   └── shapetree.pyi +│   │   ├── shared.pyi +│   │   ├── slide.pyi +│   │   ├── table.pyi +│   │   ├── text +│   │   │   └── text.pyi +│   │   └── util.pyi +│   └── pypandoc +│   └── __init__.pyi +├── unstructured +│   ├── __init__.py +│   ├── __pycache__ +│   │   ├── __init__.cpython-312.pyc +│   │   ├── __version__.cpython-312.pyc +│   │   ├── errors.cpython-312.pyc +│   │   ├── logger.cpython-312.pyc +│   │   └── utils.cpython-312.pyc +│   ├── __version__.py +│   ├── chunking +│   │   ├── __init__.py +│   │   ├── __pycache__ +│   │   │   ├── __init__.cpython-312.pyc +│   │   │   ├── base.cpython-312.pyc +│   │   │   ├── basic.cpython-312.pyc +│   │   │   ├── dispatch.cpython-312.pyc +│   │   │   └── title.cpython-312.pyc +│   │   ├── base.py +│   │   ├── basic.py +│   │   ├── dispatch.py +│   │   └── title.py +│   ├── cleaners +│   │   ├── __init__.py +│   │   ├── __pycache__ +│   │   │   ├── __init__.cpython-312.pyc +│   │   │   └── core.cpython-312.pyc +│   │   ├── core.py +│   │   ├── extract.py +│   │   └── translate.py +│   ├── common +│   │   ├── __init__.py +│   │   ├── __pycache__ +│   │   │   ├── __init__.cpython-312.pyc +│   │   │   └── html_table.cpython-312.pyc +│   │   └── html_table.py +│   ├── documents +│   │   ├── __init__.py +│   │   ├── __pycache__ +│   │   │   ├── __init__.cpython-312.pyc +│   │   │   ├── coordinates.cpython-312.pyc +│   │   │   ├── elements.cpython-312.pyc +│   │   │   ├── mappings.cpython-312.pyc +│   │   │   └── ontology.cpython-312.pyc +│   │   ├── coordinates.py +│   │   ├── elements.py +│   │   ├── mappings.py +│   │   └── ontology.py +│   ├── embed +│   │   ├── __init__.py +│   │   ├── bedrock.py +│   │   ├── huggingface.py +│   │   ├── interfaces.py +│   │   ├── mixedbreadai.py +│   │   ├── octoai.py +│   │   ├── openai.py +│   │   ├── README.md +│   │   ├── vertexai.py +│   │   └── voyageai.py +│   ├── errors.py +│   ├── file_utils +│   │   ├── __init__.py +│   │   ├── __pycache__ +│   │   │   ├── __init__.cpython-312.pyc +│   │   │   ├── encoding.cpython-312.pyc +│   │   │   ├── filetype.cpython-312.pyc +│   │   │   └── model.cpython-312.pyc +│   │   ├── encoding.py +│   │   ├── file_conversion.py +│   │   ├── filetype.py +│   │   ├── google_filetype.py +│   │   ├── model.py +│   │   └── ndjson.py +│   ├── logger.py +│   ├── metrics +│   │   ├── __init__.py +│   │   ├── element_type.py +│   │   ├── evaluate.py +│   │   ├── object_detection.py +│   │   ├── table +│   │   │   ├── __init__.py +│   │   │   ├── table_alignment.py +│   │   │   ├── 
table_eval.py +│   │   │   ├── table_extraction.py +│   │   │   └── table_formats.py +│   │   ├── table_structure.py +│   │   ├── text_extraction.py +│   │   └── utils.py +│   ├── models +│   │   └── __init__.py +│   ├── nlp +│   │   ├── __init__.py +│   │   ├── __pycache__ +│   │   │   ├── __init__.cpython-312.pyc +│   │   │   ├── english_words.cpython-312.pyc +│   │   │   ├── patterns.cpython-312.pyc +│   │   │   └── tokenize.cpython-312.pyc +│   │   ├── english_words.py +│   │   ├── english-words.txt +│   │   ├── partition.py +│   │   ├── patterns.py +│   │   └── tokenize.py +│   ├── partition +│   │   ├── __init__.py +│   │   ├── __pycache__ +│   │   │   ├── __init__.cpython-312.pyc +│   │   │   ├── auto.cpython-312.pyc +│   │   │   ├── docx.cpython-312.pyc +│   │   │   ├── pptx.cpython-312.pyc +│   │   │   └── text_type.cpython-312.pyc +│   │   ├── api.py +│   │   ├── auto.py +│   │   ├── common +│   │   │   ├── __init__.py +│   │   │   ├── __pycache__ +│   │   │   │   ├── __init__.cpython-312.pyc +│   │   │   │   ├── common.cpython-312.pyc +│   │   │   │   ├── lang.cpython-312.pyc +│   │   │   │   └── metadata.cpython-312.pyc +│   │   │   ├── common.py +│   │   │   ├── lang.py +│   │   │   └── metadata.py +│   │   ├── csv.py +│   │   ├── doc.py +│   │   ├── docx.py +│   │   ├── email.py +│   │   ├── epub.py +│   │   ├── html +│   │   │   ├── __init__.py +│   │   │   ├── __pycache__ +│   │   │   │   ├── __init__.cpython-312.pyc +│   │   │   │   ├── parser.cpython-312.pyc +│   │   │   │   ├── partition.cpython-312.pyc +│   │   │   │   └── transformations.cpython-312.pyc +│   │   │   ├── convert.py +│   │   │   ├── html_utils.py +│   │   │   ├── parser.py +│   │   │   ├── partition.py +│   │   │   └── transformations.py +│   │   ├── image.py +│   │   ├── json.py +│   │   ├── md.py +│   │   ├── model_init.py +│   │   ├── msg.py +│   │   ├── ndjson.py +│   │   ├── odt.py +│   │   ├── org.py +│   │   ├── pdf_image +│   │   │   ├── __init__.py +│   │   │   ├── analysis +│   │   │   │   ├── __init__.py +│   │   │   │   ├── bbox_visualisation.py +│   │   │   │   ├── layout_dump.py +│   │   │   │   ├── processor.py +│   │   │   │   └── tools.py +│   │   │   ├── form_extraction.py +│   │   │   ├── inference_utils.py +│   │   │   ├── ocr.py +│   │   │   ├── pdf_image_utils.py +│   │   │   ├── pdfminer_processing.py +│   │   │   ├── pdfminer_utils.py +│   │   │   └── pypdf_utils.py +│   │   ├── pdf.py +│   │   ├── ppt.py +│   │   ├── pptx.py +│   │   ├── rst.py +│   │   ├── rtf.py +│   │   ├── strategies.py +│   │   ├── text_type.py +│   │   ├── text.py +│   │   ├── tsv.py +│   │   ├── utils +│   │   │   ├── __init__.py +│   │   │   ├── __pycache__ +│   │   │   │   ├── __init__.cpython-312.pyc +│   │   │   │   ├── config.cpython-312.pyc +│   │   │   │   └── constants.cpython-312.pyc +│   │   │   ├── config.py +│   │   │   ├── constants.py +│   │   │   ├── ocr_models +│   │   │   │   ├── __init__.py +│   │   │   │   ├── google_vision_ocr.py +│   │   │   │   ├── ocr_interface.py +│   │   │   │   ├── paddle_ocr.py +│   │   │   │   └── tesseract_ocr.py +│   │   │   ├── sorting.py +│   │   │   └── xycut.py +│   │   ├── xlsx.py +│   │   └── xml.py +│   ├── patches +│   │   ├── __init__.py +│   │   └── pdfminer.py +│   ├── py.typed +│   ├── staging +│   │   ├── __init__.py +│   │   ├── argilla.py +│   │   ├── base.py +│   │   ├── baseplate.py +│   │   ├── datasaur.py +│   │   ├── huggingface.py +│   │   ├── label_box.py +│   │   ├── label_studio.py +│   │   ├── prodigy.py +│   │   └── weaviate.py +│   
├── test_workflow.sh +│   └── utils.py +└── uv.lock + +366 directories, 1333 files diff --git a/github/ISSUE_TEMPLATE/bug_report.md b/github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..81da8267f8 --- /dev/null +++ b/github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,27 @@ +--- +name: Bug report +about: Create a report to help us improve +title: bug/ +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Provide a code snippet that reproduces the issue. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Environment Info** +Please run `python scripts/collect_env.py` and paste the output here. +This will help us understand more about the environment in which the bug occurred. + +**Additional context** +Add any other context about the problem here. diff --git a/github/ISSUE_TEMPLATE/custom.md b/github/ISSUE_TEMPLATE/custom.md new file mode 100644 index 0000000000..48d5f81fa4 --- /dev/null +++ b/github/ISSUE_TEMPLATE/custom.md @@ -0,0 +1,10 @@ +--- +name: Custom issue template +about: Describe this issue template's purpose here. +title: '' +labels: '' +assignees: '' + +--- + + diff --git a/github/ISSUE_TEMPLATE/feature_request.md b/github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..a3d2ce0923 --- /dev/null +++ b/github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: feat/ +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. 
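The bug-report template above asks reporters to run scripts/collect_env.py and paste its output. That script is not part of this diff, so the sketch below is only an illustration of the kind of information such a collector typically gathers; the field list and package names are assumptions, not the script's actual contents.

#!/usr/bin/env python3
"""Hypothetical sketch of scripts/collect_env.py (illustrative only)."""
import platform
import sys
from importlib import metadata


def main() -> None:
    # Interpreter and OS details are the first things needed to reproduce a bug.
    print(f"python: {sys.version.split()[0]} ({platform.platform()})")
    # Report versions of packages that commonly affect partitioning behaviour
    # (this list is assumed for the sketch, not taken from the real script).
    for pkg in ("unstructured", "lxml", "pdfminer.six", "python-docx"):
        try:
            print(f"{pkg}: {metadata.version(pkg)}")
        except metadata.PackageNotFoundError:
            print(f"{pkg}: not installed")


if __name__ == "__main__":
    main()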
diff --git a/github/actions/base-cache/action.yml b/github/actions/base-cache/action.yml new file mode 100644 index 0000000000..1e62c95385 --- /dev/null +++ b/github/actions/base-cache/action.yml @@ -0,0 +1,23 @@ + +name: 'Base Cache Build' +description: 'Set up uv and install project dependencies' +inputs: + python-version: + description: 'python version associated with the environment' + required: true +runs: + using: "composite" + steps: + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} + - name: Install dependencies + shell: bash + run: | + uv sync --locked --all-extras --all-groups + make install-nltk-models diff --git a/github/workflows/ci.yml b/github/workflows/ci.yml new file mode 100644 index 0000000000..d618510058 --- /dev/null +++ b/github/workflows/ci.yml @@ -0,0 +1,325 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + merge_group: + branches: [ main ] + +permissions: + id-token: write + contents: read + +env: + NLTK_DATA: ${{ github.workspace }}/nltk_data + +jobs: + setup: + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13"] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/base-cache + with: + python-version: ${{ matrix.python-version }} + + check-licenses: + strategy: + matrix: + python-version: [ "3.12" ] + runs-on: ubuntu-latest + needs: [setup] + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/base-cache + with: + python-version: ${{ matrix.python-version }} + - name: Check licenses + run: make check-licenses + + lint: + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13"] + runs-on: ubuntu-latest + needs: [setup, changelog] + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/base-cache + with: + python-version: ${{ matrix.python-version }} + - name: Lint + run: make check + + shellcheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: ShellCheck + uses: ludeeus/action-shellcheck@master + + shfmt: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: setup shfmt + uses: mfinelli/setup-shfmt@v3 + - name: Run shfmt + run: shfmt -i 2 -d . 
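+ # Job-graph note: lint waits on setup and changelog; every test_* job waits
+ # on lint; test_unit_dependency_extras additionally waits on
+ # test_unit_no_extras, so a broken base install fails fast before the
+ # per-extra matrix spins up.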
+ + + test_unit: + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13"] + runs-on: ubuntu-latest + needs: [setup, lint] + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/base-cache + with: + python-version: ${{ matrix.python-version }} + - name: Test + env: + UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + TESSERACT_VERSION: "5.5.1" + run: | + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get update + sudo apt-get install -y tesseract-ocr tesseract-ocr-kor + tesseract --version + installed_tesseract_version=$(tesseract --version | grep -oP '(?<=tesseract )\d+\.\d+\.\d+') + if [ "$installed_tesseract_version" != "${{env.TESSERACT_VERSION}}" ]; then + echo "Tesseract version ${{env.TESSERACT_VERSION}} is required but found version $installed_tesseract_version" + exit 1 + fi + make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true + make check-coverage + + test_unit_no_extras: + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13"] + runs-on: ubuntu-latest + needs: [setup, lint] + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install base dependencies only (no extras) + env: + UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + run: | + uv sync --locked --group test + make install-nltk-models + make test-no-extras CI=true + + test_unit_dependency_extras: + strategy: + matrix: + python-version: ["3.11", "3.12", "3.13"] + extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"] + include: + - extra: csv + uv-extras: "--extra csv" + - extra: docx + uv-extras: "--extra docx" + - extra: odt + uv-extras: "--extra odt" + - extra: markdown + uv-extras: "--extra md" + - extra: pypandoc + uv-extras: "--extra epub --extra org --extra rtf --extra rst" + - extra: pdf-image + uv-extras: "--extra pdf --extra image --extra paddleocr" + - extra: pptx + uv-extras: "--extra pptx" + - extra: xlsx + uv-extras: "--extra xlsx" + runs-on: ubuntu-latest + needs: [setup, lint, test_unit_no_extras] + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install extra dependencies + run: | + uv sync --locked ${{ matrix.uv-extras }} --group test + make install-nltk-models + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get update + sudo apt-get install -y tesseract-ocr tesseract-ocr-kor + tesseract --version + - name: Test + env: + UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + make test-extra-${{ matrix.extra }} CI=true + + test_ingest_src: + strategy: + matrix: + python-version: ["3.12"] + runs-on: opensource-linux-8core + needs: [setup, lint] + steps: + # actions/checkout MUST come before auth + - uses: 'actions/checkout@v4' + - uses: ./.github/actions/base-cache + with: + python-version: ${{ matrix.python-version }} + - name: Set up Docker + uses: docker/setup-docker-action@v4 + - name: Set up Docker Compose + uses: 
docker/setup-compose-action@v1 + - name: Test (end-to-end) + env: + AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }} + BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} + CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }} + CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }} + DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} + DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} + DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} + DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} + GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} + GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }} + HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }} + JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }} + JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }} + MONGODB_URI: ${{ secrets.MONGODB_URI }} + MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }} + MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} + MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} + MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} + MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} + MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} + SALESFORCE_USERNAME: ${{secrets.SALESFORCE_USERNAME}} + SALESFORCE_CONSUMER_KEY: ${{secrets.SALESFORCE_CONSUMER_KEY}} + SALESFORCE_PRIVATE_KEY: ${{secrets.SALESFORCE_PRIVATE_KEY}} + SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} + SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} + SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} + SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}} + SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}} + SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} + SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} + UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} + AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }} + PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}} + ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} + ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} + MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}} + OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" + CI: "true" + run: | + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get update + sudo apt-get install -y tesseract-ocr + sudo apt-get install -y tesseract-ocr-kor + sudo apt-get install diffstat + tesseract --version + uv run --no-sync ./test_unstructured_ingest/test-ingest-src.sh + + test_json_to_html: + strategy: + matrix: + python-version: ["3.12"] + runs-on: ubuntu-latest + needs: [setup, lint] + steps: + - uses: 'actions/checkout@v4' + - uses: ./.github/actions/base-cache + with: + python-version: ${{ matrix.python-version }} + - name: Test HTML fixtures + env: + OVERWRITE_FIXTURES: "false" + run: | + sudo apt-get install diffstat + uv run --no-sync ./test_unstructured_ingest/check-diff-expected-output-html.sh + + test_json_to_markdown: + strategy: + matrix: + python-version: ["3.12"] + runs-on: ubuntu-latest + needs: [setup, lint] + steps: + - uses: 'actions/checkout@v4' + - uses: ./.github/actions/base-cache + with: + 
python-version: ${{ matrix.python-version }} + - name: Test markdown fixtures + env: + OVERWRITE_FIXTURES: "false" + run: | + sudo apt-get install diffstat + uv run --no-sync ./test_unstructured_ingest/check-diff-expected-output-markdown.sh + + changelog: + runs-on: ubuntu-latest + steps: + # need to checkout otherwise paths-filter will fail on merge-queue trigger + - uses: actions/checkout@v4 + - if: github.ref != 'refs/heads/main' + uses: dorny/paths-filter@v3 + id: changes + with: + filters: | + src: + - 'unstructured/**' + + - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' + uses: dangoslen/changelog-enforcer@v3 + + # TODO - figure out best practice for caching docker images + # (Using the virtualenv to get pytest) + test_dockerfile: + runs-on: opensource-linux-8core + needs: [ setup, lint ] + steps: + - uses: actions/checkout@v4 + - name: Test Dockerfile + run: | + echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file + make docker-build + make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true + - name: Scan image + uses: anchore/scan-action@v3 + with: + image: "unstructured:dev" + severity-cutoff: critical + only-fixed: true + output-format: table diff --git a/github/workflows/claude.yml b/github/workflows/claude.yml new file mode 100644 index 0000000000..0c4e751156 --- /dev/null +++ b/github/workflows/claude.yml @@ -0,0 +1,37 @@ +name: Claude Code + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + issues: + types: [opened, assigned] + pull_request_review: + types: [submitted] + +jobs: + claude: + if: | + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || + (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + issues: read + id-token: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@beta + with: + anthropic_api_key: ${{ secrets.GH_ANTHROPIC_API_KEY }} + allowed_tools: "Bash(git:*),View,GlobTool,GrepTool,BatchTool" diff --git a/github/workflows/codeflash.yml b/github/workflows/codeflash.yml new file mode 100644 index 0000000000..0268f0b2f7 --- /dev/null +++ b/github/workflows/codeflash.yml @@ -0,0 +1,44 @@ +name: Codeflash Optimization + +on: + pull_request: + paths: + - 'unstructured/**' + + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + optimize: + name: Optimize new Python code + if: ${{ github.actor != 'codeflash-ai[bot]' }} + runs-on: ubuntu-latest + env: + NLTK_DATA: ${{ github.workspace }}/nltk_data + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: 🐍 Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: 📦 Install Environment + uses: ./.github/actions/base-cache + with: + python-version: 3.12 + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get update + sudo apt-get install -y 
tesseract-ocr tesseract-ocr-kor + - name: Codeflash Optimization + env: + UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + CODEFLASH_API_KEY: ${{ secrets.CODEFLASH_API_KEY }} + run: uvx codeflash diff --git a/github/workflows/codeql-analysis.yml b/github/workflows/codeql-analysis.yml new file mode 100644 index 0000000000..152bafd0cc --- /dev/null +++ b/github/workflows/codeql-analysis.yml @@ -0,0 +1,76 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ main ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ main ] + # known issue that codeql doesn't support merge_queue event, recommendation is to skip it on merge_queue + # https://github.com/github/codeql-action/issues/1572 and https://github.com/github/codeql-action/issues/1537 + schedule: + - cron: '21 21 * * 5' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
+ + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/github/workflows/docker-publish.yml b/github/workflows/docker-publish.yml new file mode 100644 index 0000000000..d009281616 --- /dev/null +++ b/github/workflows/docker-publish.yml @@ -0,0 +1,107 @@ +name: Build And Push Docker Image + +on: + push: + branches: + - main + workflow_dispatch: + +env: + DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured + DOCKER_BUILD_REPOSITORY: quay.io/unstructured-io/build-unstructured + +jobs: + set-short-sha: + runs-on: ubuntu-latest + outputs: + short_sha: ${{ steps.set_short_sha.outputs.short_sha }} + steps: + - name: Set Short SHA + id: set_short_sha + run: echo "short_sha=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT + + build-images: + strategy: + fail-fast: false + matrix: + include: + - docker-platform: "linux/amd64" + runs-on: opensource-linux-8core + - docker-platform: "linux/arm64" + runs-on: ubuntu-24.04-arm + runs-on: ${{ matrix.runs-on }} + needs: set-short-sha + env: + SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }} + steps: + - name: Set up Docker + uses: docker/setup-buildx-action@v3 + with: + driver: docker + - name: Checkout code + uses: actions/checkout@v4 + - name: Login to Quay.io + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }} + password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }} + - name: Build images + run: | + ARCH=$(cut -d "/" -f2 <<< ${{ matrix.docker-platform }}) + DOCKER_BUILDKIT=1 docker buildx build --platform=${{ matrix.docker-platform }} --load \ + -f Dockerfile \ + --build-arg BUILDKIT_INLINE_CACHE=1 \ + --progress plain \ + --cache-from $DOCKER_BUILD_REPOSITORY:$ARCH \ + -t $DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA . 
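+ # Note on the step above: buildx builds a single-arch image, loads it into
+ # the local daemon (--load), and seeds layer caching from the per-arch
+ # $DOCKER_BUILD_REPOSITORY:$ARCH tag; the steps below test and push that
+ # exact $ARCH-$SHORT_SHA tag before the multiarch manifest is assembled.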
+ - name: Test images + run: | + echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file + echo "UNSTRUCTURED_HF_TOKEN=${{ secrets.HF_TOKEN }}" >> uns_test_env_file + ARCH=$(cut -d "/" -f2 <<< ${{ matrix.docker-platform }}) + DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA" \ + make docker-test CI=true TEST_FILE=test_unstructured/partition/test_text.py + DOCKER_IMAGE=$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA make docker-smoke-test + - name: Push images + run: | + # write to the build repository to cache for the publish-images job + ARCH=$(cut -d "/" -f2 <<< ${{ matrix.docker-platform }}) + docker push "$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA" + publish-images: + runs-on: ubuntu-latest-m + needs: [set-short-sha, build-images] + env: + SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }} + steps: + - uses: docker/setup-buildx-action@v3 + - name: Checkout code + uses: actions/checkout@v4 + - name: Login to Quay.io + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }} + password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }} + - name: Pull AMD image + run: | + docker pull $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA + - name: Pull ARM image + run: | + docker pull $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA + - name: Push latest build tags for AMD and ARM + run: | + # these are used to construct the final manifest but also cache-from in subsequent runs + docker tag $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 + docker push $DOCKER_BUILD_REPOSITORY:amd64 + docker tag $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm64 + docker push $DOCKER_BUILD_REPOSITORY:arm64 + - name: Push multiarch manifest + run: | + docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 + docker manifest push $DOCKER_REPOSITORY:latest + docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 + docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA + VERSION=$(grep -Po '(?<=__version__ = ")[^"]*' unstructured/__version__.py) + docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 + docker manifest push $DOCKER_REPOSITORY:$VERSION diff --git a/github/workflows/ingest-test-fixtures-update-pr.yml b/github/workflows/ingest-test-fixtures-update-pr.yml new file mode 100644 index 0000000000..f1afe0b4f0 --- /dev/null +++ b/github/workflows/ingest-test-fixtures-update-pr.yml @@ -0,0 +1,130 @@ +name: Ingest Test Fixtures Update PR + +on: + workflow_dispatch: + +env: + PYTHON_VERSION: "3.12" + NLTK_DATA: ${{ github.workspace }}/nltk_data + +permissions: + id-token: write + contents: read + +jobs: + setup: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/base-cache + with: + python-version: ${{ env.PYTHON_VERSION }} + + update-fixtures-and-pr: + runs-on: ubuntu-latest + needs: [setup] + steps: + # actions/checkout MUST come before auth + - uses: "actions/checkout@v4" + - uses: ./.github/actions/base-cache + with: + python-version: ${{ env.PYTHON_VERSION }} + - name: Set up Docker Compose + uses: docker/setup-compose-action@v1 + - name: Update test fixtures + env: + AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }} + BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} + CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }} + CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }} + 
DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} + DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} + DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} + DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} + GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} + GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }} + HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }} + JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }} + JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }} + MONGODB_URI: ${{ secrets.MONGODB_URI }} + MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }} + MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }} + MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }} + MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }} + MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }} + MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }} + SALESFORCE_USERNAME: ${{secrets.SALESFORCE_USERNAME}} + SALESFORCE_CONSUMER_KEY: ${{secrets.SALESFORCE_CONSUMER_KEY}} + SALESFORCE_PRIVATE_KEY: ${{secrets.SALESFORCE_PRIVATE_KEY}} + SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}} + SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}} + SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} + SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}} + SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}} + SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} + SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} + UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} + AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }} + ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} + ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} + PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}} + MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}} + OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" + OVERWRITE_FIXTURES: "true" + CI: "true" + run: | + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get update + sudo apt-get install -y tesseract-ocr + sudo apt-get install -y tesseract-ocr-kor + sudo apt-get install diffstat + tesseract --version + uv run --no-sync ./test_unstructured_ingest/test-ingest-src.sh + - name: Update HTML fixtures + run: make html-fixtures-update + - name: Update markdown fixtures + run: make markdown-fixtures-update + + - name: Save branch name to environment file + id: branch + run: | + original_branch=$(git rev-parse --abbrev-ref HEAD) + suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)" + branch_name="$original_branch$suffix" + echo "BRANCH_NAME=$branch_name" >> $GITHUB_ENV + + - name: Save PR name to environment file + id: pr + run: | + commit_sha=$(git rev-parse HEAD) + prs=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + "https://api.github.com/repos/${{ github.repository }}/commits/${commit_sha}/pulls") + pr_name=$(echo "$prs" | jq -r '.[0].title') + echo "PR_NAME=$pr_name" >> $GITHUB_ENV + + - name: Create Pull Request + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.GH_CREATE_PR_TOKEN }} + add-paths: | + 
test_unstructured_ingest/expected-structured-output + test_unstructured_ingest/expected-structured-output-html + test_unstructured_ingest/metrics + commit-message: "Update ingest test fixtures" + branch: ${{ env.BRANCH_NAME }} + title: "${{ env.PR_NAME }} <- Ingest test fixtures update" + assignees: ${{ github.actor }} + reviewers: ${{ github.actor }} + delete-branch: true + body: | + This pull request includes updated ingest test fixtures. + Please review and merge if appropriate. + base: ${{ github.head_ref }} diff --git a/github/workflows/partition-benchmark.yaml b/github/workflows/partition-benchmark.yaml new file mode 100644 index 0000000000..0458bb0074 --- /dev/null +++ b/github/workflows/partition-benchmark.yaml @@ -0,0 +1,142 @@ +name: Partition Benchmark + +# Run on PRs +on: + pull_request: + branches: [main] + workflow_dispatch: + +permissions: + contents: read + +env: + NLTK_DATA: ${{ github.workspace }}/nltk_data + PYTHON_VERSION: "3.12" + NUM_ITERATIONS: "1" + REGRESSION_THRESHOLD: "1.2" + CACHE_VERSION: "v2" + RESULTS_DIR: "scripts/performance/partition-speed-test" + +jobs: + setup: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/base-cache + with: + python-version: ${{ env.PYTHON_VERSION }} + + benchmark: + name: Measure and compare partition() runtime + runs-on: ubuntu-latest + needs: [setup] + + steps: + - uses: actions/checkout@v4 + + - uses: ./.github/actions/base-cache + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y libmagic-dev poppler-utils libreoffice + sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 + sudo apt-get update + sudo apt-get install -y tesseract-ocr tesseract-ocr-kor + + - name: Restore HuggingFace model cache + uses: actions/cache/restore@v4 + with: + path: ~/.cache/huggingface + key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }} + restore-keys: | + hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}- + hf-models-${{ runner.os }}- + + # Writes total seconds to $RESULTS_DIR/current-runtime.txt + - name: Run partition benchmark + id: partition + env: + NUM_ITERATIONS: ${{ env.NUM_ITERATIONS }} + run: | + uv run --no-sync python scripts/performance/benchmark_partition.py + # Read the duration written by the script and expose it as a step output + current_duration=$(cat "${{ env.RESULTS_DIR }}/current-runtime.txt") + echo "duration=${current_duration}" >> $GITHUB_OUTPUT + + - name: Save HuggingFace model cache + uses: actions/cache/save@v4 + with: + path: ~/.cache/huggingface + key: hf-models-${{ runner.os }}-${{ env.CACHE_VERSION }}-${{ github.sha }} + + - name: Try downloading previous best runtime + continue-on-error: true + uses: actions/cache/restore@v4 + with: + path: ${{ env.RESULTS_DIR }}/best-runtime.txt + key: partition-runtime-${{ env.CACHE_VERSION }}-${{ github.sha }} + restore-keys: | + partition-runtime-${{ env.CACHE_VERSION }}- + + - name: Compare with previous runtime (if exists) + id: compare + run: | + current_duration=${{ steps.partition.outputs.duration }} + previous_file="${{ env.RESULTS_DIR }}/best-runtime.txt" + + if [[ -f "$previous_file" ]]; then + previous_duration=$(cat "$previous_file") + echo "Previous best: ${previous_duration}s" + echo "Current: ${current_duration}s" + threshold=$(printf "%.0f" $(echo "$previous_duration * ${{ env.REGRESSION_THRESHOLD }}" | bc -l)) + echo "Threshold (${{ env.REGRESSION_THRESHOLD }}x): ${threshold}s" + + if [ 
"$current_duration" -gt "$threshold" ]; then + echo "RUNTIME_REGRESSED=true" >> $GITHUB_ENV + echo "SAVE_RUNTIME=false" >> $GITHUB_ENV + else + echo "RUNTIME_REGRESSED=false" >> $GITHUB_ENV + if [ "$current_duration" -lt "$previous_duration" ]; then + echo "New best time - updating stored runtime." + echo "SAVE_RUNTIME=true" >> $GITHUB_ENV + else + echo "Within threshold but not faster - keeping existing best." + echo "SAVE_RUNTIME=false" >> $GITHUB_ENV + fi + fi + else + echo "No previous runtime found. Saving current run as baseline." + echo "RUNTIME_REGRESSED=false" >> $GITHUB_ENV + echo "SAVE_RUNTIME=true" >> $GITHUB_ENV + fi + + # Only save when current is faster (or first run). + # Keeps the stored value as the true best, preventing upward drift. + - name: Save new best runtime + if: env.SAVE_RUNTIME == 'true' + run: | + cp ${{ env.RESULTS_DIR }}/current-runtime.txt ${{ env.RESULTS_DIR }}/best-runtime.txt + + - name: Upload new best runtime to cache + if: env.SAVE_RUNTIME == 'true' + uses: actions/cache/save@v4 + with: + path: ${{ env.RESULTS_DIR }}/best-runtime.txt + key: partition-runtime-${{ env.CACHE_VERSION }}-${{ github.sha }} + + - name: Upload benchmark artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-${{ github.sha }} + path: ${{ env.RESULTS_DIR }}/ + retention-days: 30 + + - name: Fail if runtime regressed + if: env.RUNTIME_REGRESSED == 'true' + run: | + echo "FAIL: partition runtime exceeded previous best by more than ${{ env.REGRESSION_THRESHOLD }}x" + exit 1 diff --git a/github/workflows/release-version-alert.yml b/github/workflows/release-version-alert.yml new file mode 100644 index 0000000000..7f90b4cf5a --- /dev/null +++ b/github/workflows/release-version-alert.yml @@ -0,0 +1,83 @@ +name: Release Version Alert + +on: + pull_request: + types: + - opened + - synchronize + branches: [ main ] + +jobs: + check-version: + runs-on: ubuntu-latest + steps: + - name: Checkout codes + uses: actions/checkout@v4 + - name: Get PR information + id: pr-info + run: | + PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH") + HAS_PR=false; [ "$PR_NUMBER" != "null" ] && HAS_PR=true + echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV + echo "HAS_PR=$HAS_PR" >> $GITHUB_ENV + echo "PR_NUMBER=$PR_NUMBER" + echo "HAS_PR=$HAS_PR" + - name: Check versions + id: check-versions + run: | + CHECK_NEW_VERSION_RESPONSE=$(bash scripts/check-new-release-version.sh) + if [[ "$CHECK_NEW_VERSION_RESPONSE" == "New release version"* ]]; then + if [ "$HAS_PR" = true ]; then + MESSAGE="$CHECK_NEW_VERSION_RESPONSE :rocket: Coming soon in PR: https://github.com/$GITHUB_REPOSITORY/pull/$PR_NUMBER " + else + BRANCH_NAME=$(echo "${GITHUB_REF#refs/heads/}") + BRANCH_LINK="https://github.com/${{ github.repository }}/tree/$BRANCH_NAME" + MESSAGE="$CHECK_NEW_VERSION_RESPONSE :rocket: Coming soon in branch: $BRANCH_LINK" + fi + echo "SLACK_MESSAGE=$MESSAGE" >> $GITHUB_ENV + else + echo "No new non-dev version found. Skipping Slack notification." 
+ echo "SKIP_STEPS=true" >> $GITHUB_ENV # Set an environment variable to indicate skipping steps + fi + echo "SLACK_MESSAGE=$MESSAGE" + - name: Generate Message Hash + if: env.SKIP_STEPS != 'true' + id: generate-hash + run: | + MESSAGE_HASH=$(echo "${{env.SLACK_MESSAGE}}" | sha256sum | cut -d ' ' -f1) + echo "MESSAGE_HASH=$MESSAGE_HASH" >> $GITHUB_ENV + - name: Restore Message from Cache + if: env.SKIP_STEPS != 'true' + id: restore-cache + uses: actions/cache/restore@v4 + with: + path: message_cache.txt + key: message-cache-${{ env.MESSAGE_HASH }} + - name: Check for Duplicates + if: env.SKIP_STEPS != 'true' + run: | + DUPLICATE_CHECK=$(grep -Fx "${{env.SLACK_MESSAGE}}" message_cache.txt || true) + echo "DUPLICATE_CHECK=$DUPLICATE_CHECK" + if [ -n "$DUPLICATE_CHECK" ]; then + echo "Message already posted. Skipping duplicate Slack notification." + echo "SKIP_STEPS=true" >> $GITHUB_ENV # Set an environment variable to indicate skipping steps + fi + - name: Write Message to Cache File + if: env.SKIP_STEPS != 'true' + run: | + echo "${{env.SLACK_MESSAGE}}" >> message_cache.txt + cat message_cache.txt + - name: Store Message in Cache + if: env.SKIP_STEPS != 'true' + uses: actions/cache/save@v4 + with: + path: message_cache.txt + key: message-cache-${{ env.MESSAGE_HASH }} + - name: Slack Notification + if: env.SKIP_STEPS != 'true' + uses: slackapi/slack-github-action@v1.24.0 + with: + channel-id: 'C05S1QMKL5D' + slack-message: ${{ env.SLACK_MESSAGE }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/github/workflows/release.yml b/github/workflows/release.yml new file mode 100644 index 0000000000..b8058706b4 --- /dev/null +++ b/github/workflows/release.yml @@ -0,0 +1,81 @@ +name: Pypi Release + +on: + release: + types: + - published + +permissions: + contents: read + id-token: write # Required for PyPI trusted publishing / attestations + +concurrency: + group: release + cancel-in-progress: false + +env: + PYTHON_VERSION: "3.12" + +jobs: + release: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + + - name: Install uv + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Set up Python + run: uv python install + + - name: Install dependencies + run: uv sync --locked --only-group release --no-install-project + + - name: Validate version matches release tag + env: + TAG: ${{ github.event.release.tag_name }} + run: | + PKG_VERSION=$(uv run --no-sync python -c "from unstructured.__version__ import __version__; print(__version__)") + if [[ "$TAG" != "$PKG_VERSION" && "$TAG" != "v$PKG_VERSION" ]]; then + echo "Tag '$TAG' does not match package version '$PKG_VERSION'" + exit 1 + fi + + - name: Build artifact + id: build + run: uv build + + - name: Publish package + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1 + + # Best-effort: attempt Azure upload even if PyPI fails, but only if build succeeded. 
+ - name: Create .pypirc for Azure Artifacts + if: always() && steps.build.outcome == 'success' + run: | + cat <<EOF > ~/.pypirc + [distutils] + index-servers = + azure + + [azure] + repository: https://pkgs.dev.azure.com/${{ secrets.AZURE_ARTIFACTS_FEED }}/_packaging/${{ secrets.AZURE_ARTIFACTS_FEED }}/pypi/upload/ + username: ${{ secrets.AZURE_ARTIFACTS_USERNAME }} + password: ${{ secrets.AZURE_ARTIFACTS_PAT }} + EOF + + - name: Publish package to Azure Artifacts + if: always() && steps.build.outcome == 'success' + run: | + EXIT_CODE=0 + uv run --no-sync twine upload -r azure dist/* || EXIT_CODE=$? + if [[ $EXIT_CODE -eq 0 ]]; then + echo "Successfully published to Azure Artifacts (or already existed)" + else + echo "Azure Artifacts upload failed (exit code: $EXIT_CODE)" + if [[ $EXIT_CODE -eq 1 ]]; then + echo "This may be due to version conflicts or connectivity issues" + fi + echo "Azure Artifacts upload is non-critical - skipping failure" + fi diff --git a/metric_test.sh b/metric_test.sh new file mode 100755 index 0000000000..da01ff9d2a --- /dev/null +++ b/metric_test.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +RUNTIME_FILE="scripts/performance/partition-speed-test/partition-runtime.txt" +REGRESSION_THRESHOLD="1.2" + +# First run: no previous file exists yet +if [[ -f "$RUNTIME_FILE" ]]; then + previous_duration=$(cat "$RUNTIME_FILE") + # Replace 45 with the actual total from step 4b above + current_duration=45 + threshold=$(printf "%.0f" $(echo "$previous_duration * $REGRESSION_THRESHOLD" | bc -l)) + echo "Previous: ${previous_duration}s Current: ${current_duration}s Threshold: ${threshold}s" + if [ "$current_duration" -gt "$threshold" ]; then + echo "REGRESSION DETECTED" + else + echo "PASS" + fi +else + echo "No previous runtime found - this would be saved as the baseline." +fi diff --git a/pyproject.toml b/pyproject.toml index bfb82dfe96..70fde4f335 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,6 @@ dependencies = [ "typing-extensions>=4.15.0, <5.0.0", "unstructured-client>=0.25.9, <1.0.0", "wrapt>=1.0.0, <2.0.0", - "filelock>=3.12.0,<4.0.0", ] [project.optional-dependencies] diff --git a/scripts/benchmark_partition.py b/scripts/benchmark_partition.py new file mode 100644 index 0000000000..350dbd9531 --- /dev/null +++ b/scripts/benchmark_partition.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Measure partition() runtime over a fixed set of representative example-docs files. + +Follows the same conventions as the existing scripts/performance tooling: + - PDFs are run with strategy="hi_res". + - Everything else is run with strategy="fast". + - Each file is timed over NUM_ITERATIONS runs (after a warmup) and the + average is recorded, matching time_partition.py behaviour. + +Writes the total elapsed seconds (integer) to $GITHUB_OUTPUT as:: + + duration=<total-seconds> + +so the calling workflow step can reference it as:: + + ${{ steps.<step-id>.outputs.duration }} + +Usage: + uv run --no-sync python scripts/benchmark_partition.py + +Environment variables: + NUM_ITERATIONS number of timed iterations per file (default: 1) +""" + +from __future__ import annotations + +import logging +import os +import time +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# File list (relative to repo root). +# Each entry is (path, strategy).
+# hi_res - PDFs and images (exercises the full OCR / layout-detection stack) +# fast - all other document types (exercises text-extraction paths) +# Mirrors the HI_RES_STRATEGY_FILES pattern in benchmark-local.sh. +# --------------------------------------------------------------------------- +BENCHMARK_FILES: list[tuple[str, str]] = [ + # PDFs - hi_res + ("example-docs/pdf/a1977-backus-p21.pdf", "hi_res"), + ("example-docs/pdf/copy-protected.pdf", "hi_res"), + ("example-docs/pdf/reliance.pdf", "hi_res"), + ("example-docs/pdf/pdf-with-ocr-text.pdf", "hi_res"), + ("example-docs/pdf/layout-parser-paper.pdf", "hi_res"), + ("example-docs/pdf/layout-parser-paper-with-table.pdf", "hi_res"), + ("example-docs/pdf/failure-after-repair.pdf", "hi_res"), + # Other document types - fast + ("example-docs/contains-pictures.docx", "fast"), + ("example-docs/example-10k-1p.html", "fast"), + ("example-docs/science-exploration-1p.pptx", "fast"), +] + +NUM_ITERATIONS: int = int(os.environ.get("NUM_ITERATIONS", "1")) + + +def _warmup(filepath: str) -> None: + """Run a single fast-strategy partition to warm the process up. + + Mirrors warm_up_process() in time_partition.py: uses a warmup-docs/ + variant if present, otherwise falls back to the file itself. + """ + from unstructured.partition.auto import partition + + warmup_dir = Path(__file__).parent / "warmup-docs" + warmup_file = warmup_dir / f"warmup{Path(filepath).suffix}" + target = str(warmup_file) if warmup_file.exists() else filepath + partition(target, strategy="fast") + + +def _measure(filepath: str, strategy: str, iterations: int) -> float: + """Return the average wall-clock seconds for partitioning *filepath*. + + Identical logic to time_partition.measure_execution_time(). + """ + from unstructured.partition.auto import partition + + total = 0.0 + for _ in range(iterations): + t0 = time.perf_counter() + partition(filepath, strategy=strategy) + total += time.perf_counter() - t0 + return total / iterations + + +def _set_github_output(key: str, value: str) -> None: + """Write key=value to $GITHUB_OUTPUT when running in Actions.""" + gho = os.environ.get("GITHUB_OUTPUT") + if gho: + with open(gho, "a") as fh: + fh.write(f"{key}={value}\n") + + +def main() -> None: + repo_root = ( + Path(__file__).resolve().parent.parent.parent + ) # scripts/performance/ -> repo root + + logger.info("=" * 60) + logger.info(f"Partition benchmark (NUM_ITERATIONS={NUM_ITERATIONS})") + logger.info("=" * 60) + + grand_start = time.perf_counter() + + for rel_path, strategy in BENCHMARK_FILES: + filepath = repo_root / rel_path + if not filepath.exists(): + logger.warning(f" WARNING: {rel_path} not found - skipping.") + continue + + logger.info(f" {rel_path} (strategy={strategy}, iterations={NUM_ITERATIONS})") + _warmup(str(filepath)) + avg = _measure(str(filepath), strategy, NUM_ITERATIONS) + logger.info(f" avg {avg:.2f}s") + + total_seconds = int(time.perf_counter() - grand_start) + logger.info(f"\nTotal wall-clock time: {total_seconds}s") + + _set_github_output("duration", str(total_seconds)) + + +if __name__ == "__main__": + main() diff --git a/scripts/compare_benchmark.py b/scripts/compare_benchmark.py new file mode 100644 index 0000000000..219d013d8a --- /dev/null +++ b/scripts/compare_benchmark.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +"""Compare current benchmark results against the stored best runtime. + +The script: + 1. Loads the current benchmark results and the stored best (if any). + 2. Prints a per-file and total summary table. + 3. 
Exits 1 (fail) if the current ``__total__`` exceeds the best ``__total__`` + by more than the given threshold fraction. + 4. Updates the best-results file in-place when the current run is faster. + 5. Writes ``new_best`` and ``regression`` to ``$GITHUB_OUTPUT`` when set. + +Values in both JSON files are average elapsed seconds per file, as written by +benchmark_partition.py (which follows the same averaging approach as +time_partition.py). + +Usage: + uv run --no-sync python scripts/compare_benchmark.py \\ + current.json best.json [threshold] + + current.json JSON produced by benchmark_partition.py for this run + best.json JSON produced by a previous run (the stored best); may not + exist yet on the very first run + threshold Regression allowance as a fraction, e.g. 0.20 for 20% + (default: 0.20) +""" + +from __future__ import annotations + +import json +import math +import os +import sys +from pathlib import Path + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _github_output(key: str, value: str) -> None: + """Write key=value to $GITHUB_OUTPUT when running inside GitHub Actions.""" + gho = os.environ.get("GITHUB_OUTPUT") + if gho: + with open(gho, "a") as fh: + fh.write(f"{key}={value}\n") + + +def _fmt(seconds: float) -> str: + if math.isnan(seconds): + return " n/a" + return f"{seconds:7.2f}s" + + +def _pct_diff(current: float, best: float) -> str: + if best == 0: + return " n/a" + diff = (current - best) / best * 100 + sign = "+" if diff >= 0 else "" + return f"{sign}{diff:.1f}%" + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + if len(sys.argv) < 3: + print(__doc__, file=sys.stderr) + sys.exit(2) + + current_path = Path(sys.argv[1]) + best_path = Path(sys.argv[2]) + threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.20 + + current: dict[str, float] = json.loads(current_path.read_text()) + current_total: float = current["__total__"] + + # ------------------------------------------------------------------ + # First-ever run – no stored best yet; save and exit cleanly.
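+    # (Nothing to compare against yet, so the regression limit below cannot
+    # apply. On later runs the limit is best_total * (1.0 + threshold); for
+    # example, a stored best __total__ of 8.06s with the default 0.20
+    # threshold gives a limit of 8.06 * 1.20 ≈ 9.67s.)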
+ # ------------------------------------------------------------------ + if not best_path.exists(): + print("No stored best found – saving current run as the baseline.") + print(f" Total (sum of averages): {current_total:.2f}s") + best_path.write_text(current_path.read_text()) + _github_output("new_best", "true") + _github_output("regression", "false") + sys.exit(0) + + best: dict[str, float] = json.loads(best_path.read_text()) + best_total: float = best["__total__"] + limit: float = best_total * (1.0 + threshold) + + # Collect all file keys, excluding the __total__ sentinel + all_files = sorted((set(current.keys()) | set(best.keys())) - {"__total__"}) + + col_w = max((len(f) for f in all_files), default=40) + 2 + header = f"{'File':<{col_w}} {'Current (avg)':>13} {'Best (avg)':>10} {'Δ':>8}" + sep = "=" * len(header) + print(sep) + print("Partition benchmark comparison") + print(sep) + print(header) + print("-" * len(header)) + + for fname in all_files: + c = current.get(fname, float("nan")) + b = best.get(fname, float("nan")) + print(f"{fname:<{col_w}} {_fmt(c)} {_fmt(b)} {_pct_diff(c, b):>8}") + + print("-" * len(header)) + print( + f"{'TOTAL':<{col_w}} {_fmt(current_total)} {_fmt(best_total)}" + f" {_pct_diff(current_total, best_total):>8}" + ) + print() + print(f"Threshold : {threshold * 100:.0f}% (fail if current > {limit:.2f}s)") + print() + + # ------------------------------------------------------------------ + # Regression check + # ------------------------------------------------------------------ + if current_total > limit: + excess_pct = (current_total - best_total) / best_total * 100 + print( + f"FAIL: current total {current_total:.2f}s exceeds best " + f"{best_total:.2f}s by {excess_pct:.1f}% " + f"(threshold {threshold * 100:.0f}%, limit {limit:.2f}s).", + file=sys.stderr, + ) + _github_output("new_best", "false") + _github_output("regression", "true") + sys.exit(1) + + # ------------------------------------------------------------------ + # Pass – update stored best if the current run is faster + # ------------------------------------------------------------------ + if current_total < best_total: + improvement_pct = (best_total - current_total) / best_total * 100 + print( + f"PASS (new best): {current_total:.2f}s is {improvement_pct:.1f}% faster than " + f"previous best {best_total:.2f}s – updating stored best." + ) + best_path.write_text(current_path.read_text()) + _github_output("new_best", "true") + else: + slack_pct = (current_total - best_total) / best_total * 100 + print( + f"PASS: {current_total:.2f}s is {slack_pct:.1f}% slower than best " + f"{best_total:.2f}s (within {threshold * 100:.0f}% threshold)." + ) + _github_output("new_best", "false") + + _github_output("regression", "false") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/scripts/performance/benchmark_partition.py b/scripts/performance/benchmark_partition.py new file mode 100644 index 0000000000..947890e631 --- /dev/null +++ b/scripts/performance/benchmark_partition.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +"""Measure partition() runtime over a set of files from example-docs. 
+ +Usage: + uv run --no-sync python scripts/performance/benchmark_partition.py [output.json] + +Environment variables: + NUM_ITERATIONS number of timed iterations per file (default: 1) +""" + +from __future__ import annotations + +import json +import logging +import os +import sys +import time +from pathlib import Path + +from unstructured.partition.auto import partition + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + + +BENCHMARK_FILES: list[tuple[str, str]] = [ + # PDFs – hi_res + ("example-docs/pdf/a1977-backus-p21.pdf", "hi_res"), + ("example-docs/pdf/copy-protected.pdf", "hi_res"), + ("example-docs/pdf/reliance.pdf", "hi_res"), + ("example-docs/pdf/pdf-with-ocr-text.pdf", "hi_res"), + ("example-docs/pdf/list-item-example.pdf", "hi_res"), + # Images – hi_res + ("example-docs/embedded-images-tables.jpg", "hi_res"), + ("example-docs/double-column-A.jpg", "hi_res"), + ("example-docs/double-column-B.jpg", "hi_res"), + ("example-docs/layout-parser-paper-fast.jpg", "hi_res"), + # Other document types – fast + ("example-docs/contains-pictures.docx", "fast"), + ("example-docs/example-10k-1p.html", "fast"), + ("example-docs/science-exploration-1p.pptx", "fast"), +] + +NUM_ITERATIONS: int = int(os.environ.get("NUM_ITERATIONS", "1")) + +DEFAULT_OUTPUT = ( + Path(__file__).parent / "partition-speed-test" / "benchmark_results.json" +) + + +def _warmup(filepath: str) -> None: + """Run a single fast-strategy partition to warm the process up. + + Mirrors warm_up_process() in time_partition.py: uses a warmup-docs/ + variant if present, otherwise falls back to the file itself. + """ + + warmup_dir = Path(__file__).parent / "warmup-docs" + warmup_file = warmup_dir / f"warmup{Path(filepath).suffix}" + target = str(warmup_file) if warmup_file.exists() else filepath + partition(target, strategy="fast") + + +def _measure(filepath: str, strategy: str, iterations: int) -> float: + """Return the average wall-clock seconds for partitioning *filepath*. + + Identical logic to time_partition.measure_execution_time(). 
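+
+    For example, with iterations=3 and timed runs of 2.0s, 2.2s, and 2.1s,
+    the returned average is (2.0 + 2.2 + 2.1) / 3 = 2.1s.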
+ """ + + total = 0.0 + for _ in range(iterations): + t0 = time.time() + partition(filepath, strategy=strategy) + total += time.time() - t0 + return total / iterations + + +def main() -> None: + output_path = Path(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_OUTPUT + repo_root = ( + Path(__file__).resolve().parent.parent.parent + ) # scripts/performance/ -> repo root + + logger.info("=" * 60) + logger.info(f"Partition benchmark (NUM_ITERATIONS={NUM_ITERATIONS})") + logger.info("=" * 60) + + results: dict[str, float] = {} + grand_start = time.time() + + for rel_path, strategy in BENCHMARK_FILES: + filepath = repo_root / rel_path + if not filepath.exists(): + logger.warning(f" WARNING: {rel_path} not found – skipping.") + continue + + logger.info(f" {rel_path} (strategy={strategy}, iterations={NUM_ITERATIONS})") + _warmup(str(filepath)) + avg = _measure(str(filepath), strategy, NUM_ITERATIONS) + results[rel_path] = round(avg, 4) + logger.info(f" avg {avg:.2f}s") + + total_seconds = round(time.time() - grand_start, 2) + results["__total__"] = total_seconds + + logger.info(f"\nTotal wall-clock time: {total_seconds}s") + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(results, indent=2) + "\n") + logger.info(f"Results written to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/performance/compare_benchmark.py b/scripts/performance/compare_benchmark.py new file mode 100644 index 0000000000..a12f32bf81 --- /dev/null +++ b/scripts/performance/compare_benchmark.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""Compare current benchmark results against the stored best runtime. + +Usage: + uv run --no-sync python scripts/performance/compare_benchmark.py \ + benchmark_results.json \ + benchmark_best.json \ + [threshold] + + current.json JSON produced by benchmark_partition.py for this run + best.json JSON produced by a previous run (the stored best); may not + exist yet on the very first run + threshold Float regression allowance, e.g. 
0.20 for 20% (default 0.20) +""" + +from __future__ import annotations + +import json +import logging +import math +import os +import sys +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) + + +def _github_output(key: str, value: str) -> None: + """Write a key=value pair to $GITHUB_OUTPUT when running in Actions.""" + gho = os.environ.get("GITHUB_OUTPUT") + if gho: + with open(gho, "a") as fh: + fh.write(f"{key}={value}\n") + + +def _fmt(seconds: float) -> str: + """Format a duration, handling NaN for files missing from one side.""" + if math.isnan(seconds): + return " n/a" + return f"{seconds:7.2f}s" + + +def _pct_diff(current: float, best: float) -> str: + if best == 0: + return " n/a" + diff = (current - best) / best * 100 + sign = "+" if diff >= 0 else "" + return f"{sign}{diff:.1f}%" + + +def main() -> None: + if len(sys.argv) < 3: + print(__doc__, file=sys.stderr) + sys.exit(2) + + current_path = Path(sys.argv[1]) + best_path = Path(sys.argv[2]) + threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.20 + + current: dict[str, float] = json.loads(current_path.read_text()) + current_total: float = current["__total__"] + + if not best_path.exists(): + logger.info("No stored best found – saving current run as the baseline.") + logger.info(f" Total: {current_total:.2f}s") + best_path.parent.mkdir(parents=True, exist_ok=True) + best_path.write_text(current_path.read_text()) + _github_output("new_best", "true") + _github_output("regression", "false") + sys.exit(0) + + best: dict[str, float] = json.loads(best_path.read_text()) + best_total: float = best["__total__"] + limit: float = best_total * (1.0 + threshold) + + # Collect all file keys (exclude the __total__ sentinel) + all_files = sorted((set(current.keys()) | set(best.keys())) - {"__total__"}) + + col_w = max((len(f) for f in all_files), default=40) + 2 + header = f"{'File':<{col_w}} {'Current':>9} {'Best':>9} {'Delta':>8}" + logger.info("=" * len(header)) + logger.info("Partition benchmark comparison") + logger.info("=" * len(header)) + logger.info(header) + logger.info("-" * len(header)) + + for fname in all_files: + c = current.get(fname, float("nan")) + b = best.get(fname, float("nan")) + logger.info(f"{fname:<{col_w}} {_fmt(c)} {_fmt(b)} {_pct_diff(c, b):>8}") + + logger.info("-" * len(header)) + logger.info( + f"{'TOTAL':<{col_w}} {_fmt(current_total)} {_fmt(best_total)}" + f" {_pct_diff(current_total, best_total):>8}" + ) + logger.info("") + logger.info(f"Threshold : {threshold * 100:.0f}% (fail if current > {limit:.2f}s)") + logger.info("") + + # fail on regression beyond threshold + if current_total > limit: + excess_pct = (current_total - best_total) / best_total * 100 + logger.error( + f"FAIL: current runtime {current_total:.2f}s exceeds best " + f"{best_total:.2f}s by {excess_pct:.1f}% " + f"(threshold {threshold * 100:.0f}%, limit {limit:.2f}s)." + ) + _github_output("new_best", "false") + _github_output("regression", "true") + sys.exit(1) + + # pass: current is within threshold of best; update best if current is faster + if current_total < best_total: + improvement_pct = (best_total - current_total) / best_total * 100 + logger.info( + f"PASS (new best): {current_total:.2f}s is {improvement_pct:.1f}% " + f"faster than the previous best {best_total:.2f}s – updating in S3." 
+ ) + best_path.write_text(current_path.read_text()) + _github_output("new_best", "true") + else: + slack_pct = (current_total - best_total) / best_total * 100 + logger.info( + f"PASS: {current_total:.2f}s is {slack_pct:.1f}% slower than best " + f"{best_total:.2f}s (within {threshold * 100:.0f}% threshold)." + ) + _github_output("new_best", "false") + + _github_output("regression", "false") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/scripts/performance/partition-speed-test/benchmark_best.json b/scripts/performance/partition-speed-test/benchmark_best.json new file mode 100644 index 0000000000..d9832a573c --- /dev/null +++ b/scripts/performance/partition-speed-test/benchmark_best.json @@ -0,0 +1,8 @@ +{ + "example-docs/pdf/a1977-backus-p21.pdf": 4.143, + "example-docs/pdf/copy-protected.pdf": 3.898, + "example-docs/contains-pictures.docx": 0.003, + "example-docs/example-10k-1p.html": 0.005, + "example-docs/science-exploration-1p.pptx": 0.012, + "__total__": 8.061 +} \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 95bb48a3fe..03a41ffa02 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.21.3" # pragma: no cover +__version__ = "0.21.4" # pragma: no cover diff --git a/unstructured/test_workflow.sh b/unstructured/test_workflow.sh new file mode 100644 index 0000000000..cc7cb0cd2e --- /dev/null +++ b/unstructured/test_workflow.sh @@ -0,0 +1,319 @@ +#!/usr/bin/env bash +# test_workflow.sh +# +# Smoke-tests for scripts/performance/benchmark_partition.py and +# scripts/performance/compare_benchmark.py. +# +# Running partition() for real is slow and requires the full ML stack, so +# benchmark_partition.py is tested at the import/CLI level only. +# compare_benchmark.py is tested end-to-end with synthetic JSON fixtures +# covering every branch: first run, pass (within threshold), new best, and +# regression (fail). +# +# Usage: +# ./test_workflow.sh +# +# Exit code: 0 if all tests pass, 1 on the first failure. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +BENCHMARK_SCRIPT="$REPO_ROOT/scripts/performance/benchmark_partition.py" +COMPARE_SCRIPT="$REPO_ROOT/scripts/performance/compare_benchmark.py" + +# Temporary directory; always cleaned up on exit +TMPDIR_WORK="$(mktemp -d)" +trap 'rm -rf "$TMPDIR_WORK"' EXIT + +PASS=0 +FAIL=0 + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +green() { printf '\033[0;32m%s\033[0m\n' "$*"; } +red() { printf '\033[0;31m%s\033[0m\n' "$*"; } + +pass() { + green " PASS: $1" + PASS=$((PASS + 1)) +} + +fail() { + red " FAIL: $1" + FAIL=$((FAIL + 1)) +} + +# Run a command and assert it exits with the expected code. +# assert_exit