diff --git a/.gitignore b/.gitignore
index e8e4471465..87f4fc72bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -208,4 +208,5 @@ outputhtmldiff.txt
 metricsdiff.txt
 
 # analysis
-annotated/
\ No newline at end of file
+annotated/
+.aider*
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 17cb66d3a6..ad3afdfc3e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## 0.17.6-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.17.5
 
 ### Enhancements
diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh
index 74ea031390..2ef0ac4eff 100755
--- a/scripts/user/unstructured-get-json.sh
+++ b/scripts/user/unstructured-get-json.sh
@@ -16,12 +16,20 @@ Options:
   --hi-res        hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
   --fast          fast strategy: No OCR, just extract embedded text
   --ocr-only      ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
+  --vlm           vlm strategy: Use Vision Language Model for processing
+  --vlm-provider  Specify the VLM model provider
+                  (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
+  --vlm-model     Specify the VLM model when using --vlm
+                  (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
   --tables        Enable table extraction: tables are represented as html in metadata
   --images        Include base64images in json
   --coordinates   Include coordinates in the output
   --trace         Enable trace logging for debugging, useful to cut and paste the executed curl call
   --verbose       Enable verbose logging including printing first 8 elements to stdout
   --s3            Write the resulting output to s3 (like a pastebin)
+  --write-html    Convert JSON output to HTML. Set the env var UNST_WRITE_HTML=true to skip providing this option.
+  --open-html     Automatically open HTML output in browser (macOS only) if --write-html.
+                  Set the env var UNST_AUTO_OPEN_HTML=true to skip providing this option.
   --help          Display this help and exit.
 
 
@@ -64,6 +72,7 @@ copy_to_clipboard() {
 HI_RES=false
 FAST=false
 OCR_ONLY=false
+VLM=false
 STRATEGY=""
 VERBOSE=false
 TRACE=false
@@ -72,6 +81,10 @@ FREEMIUM=false
 TABLES=true
 IMAGES=false
 S3=""
+WRITE_HTML=${UNST_WRITE_HTML:-false}
+OPEN_HTML=${UNST_AUTO_OPEN_HTML:-false}
+VLM_PROVIDER=""
+VLM_MODEL=""
 
 while [[ "$#" -gt 0 ]]; do
   case "$1" in
@@ -87,6 +100,28 @@ while [[ "$#" -gt 0 ]]; do
     OCR_ONLY=true
     shift
     ;;
+  --vlm)
+    VLM=true
+    shift
+    ;;
+  --vlm-provider)
+    if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
+      VLM_PROVIDER=$2
+      shift 2
+    else
+      echo "Error: Argument for $1 is missing" >&2
+      exit 1
+    fi
+    ;;
+  --vlm-model)
+    if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
+      VLM_MODEL=$2
+      shift 2
+    else
+      echo "Error: Argument for $1 is missing" >&2
+      exit 1
+    fi
+    ;;
   --trace)
     TRACE=true
     shift
@@ -99,6 +134,14 @@ while [[ "$#" -gt 0 ]]; do
     S3=true
     shift
     ;;
+  --write-html)
+    WRITE_HTML=true
+    shift
+    ;;
+  --open-html)
+    OPEN_HTML=true
+    shift
+    ;;
   --tables)
     TABLES=true
     shift
@@ -140,6 +183,24 @@ if [ -z "$INPUT" ]; then
   exit 1
 fi
 
+# Check for strategy conflicts after all arguments are processed
+STRATEGY_COUNT=0
+$HI_RES && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
+$FAST && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
+$OCR_ONLY && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
+$VLM && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
+
+if [ "$STRATEGY_COUNT" -gt 1 ]; then
+  echo "Error: Only one strategy option (--hi-res, --fast, --ocr-only, --vlm) can be specified at a time." >&2
+  exit 1
+fi
+
+# Check if vlm-provider or vlm-model are provided without --vlm
+if { [ -n "$VLM_PROVIDER" ] || [ -n "$VLM_MODEL" ]; } && ! $VLM; then
+  echo "Error: --vlm-provider or --vlm-model can only be used with --vlm strategy." >&2
+  exit 1
+fi
+
 if $TRACE; then
   set -x
 fi
@@ -175,6 +236,29 @@ elif $OCR_ONLY; then
   STRATEGY="-ocr-only"
   JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
   CURL_STRATEGY=(-F "strategy=ocr_only")
+elif $VLM; then
+  if $VERBOSE; then echo "Sending API request with vlm strategy"; fi
+  STRATEGY="-vlm"
+  # Add provider and model to filename if specified. Replace anything that
+  # is not filename-safe (e.g. "/" in a model id) so the output path stays
+  # directly inside ${TMP_OUTPUTS_DIR}; the API still gets the raw values.
+  SAFE_VLM_PROVIDER=${VLM_PROVIDER//[^A-Za-z0-9._-]/_}
+  SAFE_VLM_MODEL=${VLM_MODEL//[^A-Za-z0-9._-]/_}
+  if [ -n "$VLM_PROVIDER" ] && [ -n "$VLM_MODEL" ]; then
+    STRATEGY="-vlm-${SAFE_VLM_PROVIDER}-${SAFE_VLM_MODEL}"
+  elif [ -n "$VLM_PROVIDER" ]; then
+    STRATEGY="-vlm-${SAFE_VLM_PROVIDER}"
+  elif [ -n "$VLM_MODEL" ]; then
+    STRATEGY="-vlm-model-${SAFE_VLM_MODEL}"
+  fi
+  JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
+  CURL_STRATEGY=(-F "strategy=vlm")
+  if [ -n "$VLM_PROVIDER" ]; then
+    CURL_STRATEGY+=(-F "vlm_model_provider=$VLM_PROVIDER")
+  fi
+  if [ -n "$VLM_MODEL" ]; then
+    CURL_STRATEGY+=(-F "vlm_model=$VLM_MODEL")
+  fi
 else
   if $VERBOSE; then echo "Sending API request WITHOUT a strategy"; fi
   JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
@@ -213,6 +297,44 @@ else
 fi
 echo "JSON Output file: ${JSON_OUTPUT_FILEPATH}"
 
+# Convert JSON to HTML if requested
+if [ "$WRITE_HTML" = true ]; then
+  HTML_OUTPUT_FILEPATH=${JSON_OUTPUT_FILEPATH%.json}.html
+
+  if $VLM; then
+    # VLM output has all metadata.text_as_html fields defined, so
+    # create HTML directly from the metadata.text_as_html fields
+    {
+      echo '<!DOCTYPE html>'
+      echo '<html>'
+      echo '<head>'
+      echo '  <meta charset="UTF-8">'
+      echo '  <meta name="viewport" content="width=device-width, initial-scale=1.0">'
+      echo "  <title>${FILENAME}</title>"
+      echo '</head>'
+      echo '<body>'
+      jq -r 'map(.metadata.text_as_html) | join("\n")' "${JSON_OUTPUT_FILEPATH}"
+      echo '</body>'
+      echo '</html>'
+    } >"${HTML_OUTPUT_FILEPATH}"
+    echo "HTML written directly from metadata.text_as_html fields to: ${HTML_OUTPUT_FILEPATH}"
+  else
+    # most elements will not have metadata.text_as_html defined (by design only Table elements do),
+    # so use the unstructured library's python script for the conversion.
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+    if ! PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"; then
+      echo "Error: failed to convert ${JSON_OUTPUT_FILEPATH} to HTML" >&2
+      exit 1
+    fi
+    echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}"
+  fi
+
+  # Open HTML file in browser if requested and on macOS
+  if [ "$OPEN_HTML" = true ] && [ "$(uname)" = "Darwin" ]; then
+    open "${HTML_OUTPUT_FILEPATH}"
+  fi
+fi
+
 # write .json output to s3 location
 if [ -n "$S3" ]; then
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index b243ca7861..db302d22ce 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.17.5"  # pragma: no cover
+__version__ = "0.17.6-dev0"  # pragma: no cover