Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
632aab6
feat: Add VLM strategy support to unstructured-get-json.sh script
cragwolfe Mar 28, 2025
1093ffe
feat: Add --write-html option to unstructured-get-json with env var s…
cragwolfe Mar 28, 2025
f3c03fd
feat: Add HTML generation using elements_json_to_html.py in unstructu…
cragwolfe Mar 28, 2025
5424613
chore: Replace python with python3 in shell script
cragwolfe Mar 28, 2025
e8d24aa
refactor: Replace direct function call with script invocation in unst…
cragwolfe Mar 28, 2025
6bfcb2f
refactor: Add elements without page number to 'none' page instead of …
cragwolfe Mar 28, 2025
4170a15
feat: Add support for --open-html option to automatically open HTML o…
cragwolfe Mar 28, 2025
c0687e4
refactor: Remove echo statement when opening HTML file in browser
cragwolfe Mar 28, 2025
3fe657c
feat: Add support for --VLM-PROVIDER parameter in unstructured-get-js…
cragwolfe Mar 28, 2025
c455c92
feat: Add support for --vlm-model parameter in unstructured-get-json.sh
cragwolfe Mar 28, 2025
5269451
docs: Add documentation link for VLM strategy model types
cragwolfe Mar 28, 2025
d10d0a0
style: Break long help text lines for improved readability
cragwolfe Mar 28, 2025
23aa81c
style: Improve formatting of --open-html help text line
cragwolfe Mar 28, 2025
b810747
docs: Clarify $UNST_AUTO_OPEN_HTML should be set to true in help text
cragwolfe Mar 28, 2025
b39d4d1
style: Remove $ before env var in help text for clarity
cragwolfe Mar 28, 2025
bf6aac4
feat: Enhance VLM output filename with provider and model details
cragwolfe Mar 28, 2025
9e66fd5
fix: Handle elements with missing parent IDs by moving them to root l…
cragwolfe Mar 28, 2025
f963b01
ignore aider files
cragwolfe Mar 28, 2025
f237daf
feat: Add strategy conflict check for mutually exclusive options
cragwolfe Mar 28, 2025
ebd833b
refactor: Move strategy conflict check after argument processing
cragwolfe Mar 28, 2025
63a44f3
fix: Add debug echo statements in unstructured-get-json.sh script
cragwolfe Mar 28, 2025
2e8f286
refactor: Remove debug echo statements from strategy conflict check
cragwolfe Mar 28, 2025
7c02dee
fix: Remove debug echo statements from strategy conflict check function
cragwolfe Mar 28, 2025
bb88508
fix: Remove debug echo statements in strategy conflict check function
cragwolfe Mar 28, 2025
81d04f8
refactor: Remove strategy conflict function and replace with inline c…
cragwolfe Mar 28, 2025
62b1398
doc
cragwolfe Mar 28, 2025
a280b7d
docs: Update help text for VLM strategy options in unstructured-get-j…
cragwolfe Mar 28, 2025
0e23bf1
feat: Add validation for VLM provider and model without VLM strategy
cragwolfe Mar 28, 2025
ac72c86
refactor: Improve HTML generation by using jq to check and create HTM…
cragwolfe Mar 28, 2025
62c9d78
fix: Update script to use .metadata.text_as_html instead of .text_as_…
cragwolfe Mar 28, 2025
98b3df5
fix: Adjust jq expression for checking metadata.text_as_html field
cragwolfe Mar 28, 2025
80fc64e
refactor: Improve jq expression for checking metadata.text_as_html field
cragwolfe Mar 28, 2025
c7836e5
fix: Improve jq error handling for text_as_html metadata field
cragwolfe Mar 28, 2025
bd08722
revert convert.py change
cragwolfe Mar 28, 2025
87facd4
shfmt
cragwolfe Mar 29, 2025
21e40c9
bump version
cragwolfe Mar 29, 2025
012f113
replace the jq check with checking if in vlm mode, since they are one…
cragwolfe Mar 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -208,4 +208,5 @@ outputhtmldiff.txt
metricsdiff.txt

# analysis
annotated/
annotated/
.aider*
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.17.6-dev0

### Enhancements

### Features

### Fixes

## 0.17.5

### Enhancements
Expand Down
118 changes: 118 additions & 0 deletions scripts/user/unstructured-get-json.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,20 @@ Options:
--hi-res hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
--fast fast strategy: No OCR, just extract embedded text
--ocr-only ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
--vlm vlm strategy: Use Vision Language Model for processing
--vlm-provider Specify the VLM model provider
(see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
--vlm-model Specify the VLM model when using
(see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
--tables Enable table extraction: tables are represented as html in metadata
--images Include base64images in json
--coordinates Include coordinates in the output
--trace Enable trace logging for debugging, useful to cut and paste the executed curl call
--verbose Enable verbose logging including printing first 8 elements to stdout
--s3 Write the resulting output to s3 (like a pastebin)
--write-html Convert JSON output to HTML. Set the env var $UNST_WRITE_HTML to skip providing this option.
--open-html Automatically open HTML output in browser (macOS only) if --write-html.
Set the env var UNST_AUTO_OPEN_HTML=true to skip providing this option.
--help Display this help and exit.


Expand Down Expand Up @@ -64,6 +72,7 @@ copy_to_clipboard() {
HI_RES=false
FAST=false
OCR_ONLY=false
VLM=false
STRATEGY=""
VERBOSE=false
TRACE=false
Expand All @@ -72,6 +81,10 @@ FREEMIUM=false
TABLES=true
IMAGES=false
S3=""
WRITE_HTML=${UNST_WRITE_HTML:-false}
OPEN_HTML=${UNST_AUTO_OPEN_HTML:-false}
VLM_PROVIDER=""
VLM_MODEL=""

while [[ "$#" -gt 0 ]]; do
case "$1" in
Expand All @@ -87,6 +100,28 @@ while [[ "$#" -gt 0 ]]; do
OCR_ONLY=true
shift
;;
--vlm)
VLM=true
shift
;;
--vlm-provider)
if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
VLM_PROVIDER=$2
shift 2
else
echo "Error: Argument for $1 is missing" >&2
exit 1
fi
;;
--vlm-model)
if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
VLM_MODEL=$2
shift 2
else
echo "Error: Argument for $1 is missing" >&2
exit 1
fi
;;
--trace)
TRACE=true
shift
Expand All @@ -99,6 +134,14 @@ while [[ "$#" -gt 0 ]]; do
S3=true
shift
;;
--write-html)
WRITE_HTML=true
shift
;;
--open-html)
OPEN_HTML=true
shift
;;
--tables)
TABLES=true
shift
Expand Down Expand Up @@ -140,6 +183,24 @@ if [ -z "$INPUT" ]; then
exit 1
fi

# Check for strategy conflicts after all arguments are processed.
# Each flag is compared as a string instead of being executed as a command,
# so an unset, empty or unexpected value simply counts as "not selected".
STRATEGY_COUNT=0
for strategy_flag in "${HI_RES:-false}" "${FAST:-false}" "${OCR_ONLY:-false}" "${VLM:-false}"; do
  if [ "$strategy_flag" = true ]; then
    STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
  fi
done

# The strategies are mutually exclusive; report misuse on stderr like the
# argument parser's other errors do.
if [ "$STRATEGY_COUNT" -gt 1 ]; then
  echo "Error: Only one strategy option (--hi-res, --fast, --ocr-only, --vlm) can be specified at a time." >&2
  exit 1
fi

# Check if vlm-provider or vlm-model are provided without --vlm
if { [ -n "$VLM_PROVIDER" ] || [ -n "$VLM_MODEL" ]; } && [ "${VLM:-false}" != true ]; then
  echo "Error: --vlm-provider or --vlm-model can only be used with --vlm strategy." >&2
  exit 1
fi

# Turn on bash command tracing (xtrace) when the TRACE flag evaluates true.
if $TRACE; then set -x; fi
Expand Down Expand Up @@ -175,6 +236,25 @@ elif $OCR_ONLY; then
STRATEGY="-ocr-only"
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
CURL_STRATEGY=(-F "strategy=ocr_only")
elif $VLM; then
if $VERBOSE; then echo "Sending API request with vlm strategy"; fi
STRATEGY="-vlm"
# Add provider and model to filename if specified
if [ -n "$VLM_PROVIDER" ] && [ -n "$VLM_MODEL" ]; then
STRATEGY="-vlm-${VLM_PROVIDER}-${VLM_MODEL}"
elif [ -n "$VLM_PROVIDER" ]; then
STRATEGY="-vlm-${VLM_PROVIDER}"
elif [ -n "$VLM_MODEL" ]; then
STRATEGY="-vlm-model-${VLM_MODEL}"
fi
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
CURL_STRATEGY=(-F "strategy=vlm")
if [ -n "$VLM_PROVIDER" ]; then
CURL_STRATEGY+=(-F "vlm_model_provider=$VLM_PROVIDER")
fi
if [ -n "$VLM_MODEL" ]; then
CURL_STRATEGY+=(-F "vlm_model=$VLM_MODEL")
fi
else
if $VERBOSE; then echo "Sending API request WITHOUT a strategy"; fi
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
Expand Down Expand Up @@ -213,6 +293,44 @@ else
fi
echo "JSON Output file: ${JSON_OUTPUT_FILEPATH}"

# Convert JSON to HTML if requested (via --write-html or UNST_WRITE_HTML=true).
# The HTML file path is the JSON output path with .json replaced by .html.
if [ "$WRITE_HTML" = true ]; then
  HTML_OUTPUT_FILEPATH=${JSON_OUTPUT_FILEPATH%.json}.html

  if [ "$VLM" = true ]; then
    # VLM output has all metadata.text_as_html fields defined, so
    # create HTML directly from the metadata.text_as_html fields.
    # The steps are chained with && so a jq failure (e.g. the response is not
    # a JSON array of elements) is detected instead of silently leaving a
    # truncated document behind a "written" message.
    if ! {
      printf '%s\n' \
        '<!DOCTYPE html>' \
        '<html>' \
        '<head>' \
        ' <meta charset="UTF-8">' \
        ' <meta name="viewport" content="width=device-width, initial-scale=1.0">' \
        " <title>${FILENAME}</title>" \
        ' <style>' \
        ' body { font-family: Arial, sans-serif; line-height: 1.6; margin: 20px; }' \
        ' </style>' \
        '</head>' \
        '<body>' &&
        jq -r 'map(.metadata.text_as_html) | join("\n")' "${JSON_OUTPUT_FILEPATH}" &&
        printf '%s\n' '</body>' '</html>'
    } >"${HTML_OUTPUT_FILEPATH}"; then
      # Do not leave a partial HTML file around.
      rm -f -- "${HTML_OUTPUT_FILEPATH}"
      echo "Error: failed to generate HTML from ${JSON_OUTPUT_FILEPATH}" >&2
      exit 1
    fi
    echo "HTML written directly from metadata.text_as_html fields to: ${HTML_OUTPUT_FILEPATH}"
  else
    # Most elements will not have metadata.text_as_html defined (by design only Table elements do),
    # so use the unstructured library's python script for the conversion.
    # NOTE(review): this assumes the script writes <json basename>.html into
    # --outdir, i.e. to HTML_OUTPUT_FILEPATH; confirm against elements_json_to_html.py.
    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    if ! PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"; then
      echo "Error: failed to generate HTML from ${JSON_OUTPUT_FILEPATH}" >&2
      exit 1
    fi
    echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}"
  fi

  # Open HTML file in browser if requested and on macOS
  # ("open" is the macOS launcher; other platforms are skipped silently).
  if [ "$OPEN_HTML" = true ] && [ "$(uname)" = "Darwin" ]; then
    open "${HTML_OUTPUT_FILEPATH}"
  fi
fi

# write .json output to s3 location
if [ -n "$S3" ]; then

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.17.5" # pragma: no cover
__version__ = "0.17.6-dev0" # pragma: no cover
Loading