diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 0f656c7..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0bd001c --- /dev/null +++ b/.gitignore @@ -0,0 +1,204 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock +#poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +#pdm.lock +#pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +#pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml + diff --git a/__pycache__/model.cpython-39.pyc b/__pycache__/model.cpython-39.pyc deleted file mode 100644 index ce497a4..0000000 Binary files a/__pycache__/model.cpython-39.pyc and /dev/null differ diff --git a/tests/.DS_Store b/tests/.DS_Store deleted file mode 100644 index fd3fe2e..0000000 Binary files a/tests/.DS_Store and /dev/null differ diff --git a/tests/data/.DS_Store b/tests/data/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/tests/data/.DS_Store and /dev/null differ diff --git a/tests/out/.DS_Store b/tests/out/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/tests/out/.DS_Store and /dev/null differ diff --git a/twix.egg-info/PKG-INFO b/twix.egg-info/PKG-INFO deleted file mode 100644 index 23443c3..0000000 --- a/twix.egg-info/PKG-INFO +++ /dev/null @@ -1,225 +0,0 @@ -Metadata-Version: 2.1 -Name: twix -Version: 0.1.0 -Summary: A library that extracts structured data from templatized form-like documents automatically. -Home-page: https://github.com/yiminl18/document_reverse.git -Author: Yiming Lin -Author-email: yiminglin@berkeley.edu -License: UNKNOWN -Platform: UNKNOWN -Classifier: Programming Language :: Python :: 3 -Classifier: License :: OSI Approved :: MIT License -Classifier: Operating System :: OS Independent -Requires-Python: >=3.6 -Description-Content-Type: text/markdown - -# TWIX: Reconstructing Structured Data from Templatized Documents - -[![Paper](https://img.shields.io/badge/Paper-arXiv-red)](https://arxiv.org/abs/2501.06659) - -- [🚀 Getting Started](#-getting-started) -- [📦 Python Package](#-python-package) -- [🖥️ User Interface](#️-user-interface) -- [📚 TWIX API Reference](#-twix-api-reference) - - -TWIX is a tool for automatically extracting structured data from templatized documents that are programmatically generated by populating fields in a visual template. -TWIX infers the underlying template and then performs data extraction, offering a scalable solution with high accuracy and low cost. In particular, TWIX offers, - -1. A Python package for extracting structured data from documents step by step, designed for production pipeline deployment, with optional user feedback to monitor and refine the extraction process. -2. An interactive UI playground for data extraction that allows users to edit the inferred template, enabling more confident and accurate extraction. - -![TWIX Figure](docs/assets/image/blog_example.png) - - - -# 🚀 Getting Started - -- Python 3.10 or later -- OpenAI API key - -1. Clone the repository - -```bash -git clone https://github.com/yiminl18/TWIX.git -``` - -2. Install packages. - -```bash -pip install -e . -``` - -3. Set OpenAI API key as environment variable - -[You can refer to this document](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety) - -# 📦 Python Package - -If you want to use TWIX as a Python package, see detailed Python API references in [TWIX API Reference](#twix-api-reference), and examples below. - -1. To use TWIX to extract structured data step by step, check out [`tests/test_twix.ipynb`](tests/test_twix.ipynb). -2. To use TWIX to extract structured data with a single API call, check out [`tests/test_twix_transform.ipynb`](tests/test_twix_transform.ipynb). -3. To edit the inferred template with user input, check out [`tests/test_twix_users.ipynb`](tests/test_twix_users.ipynb). - -# 🖥️ User Interface - -If you want to use TWIX in our user interfaces: - -1. Start the frontend server. In the `/twix-ui/` directory, run: - - ```bash - npm start - ``` - -2. Start the backend server. In the `/twix-ui/backend/` directory, run: - - ```bash - python3 app.py - ``` -[Watch the TWIX Demo](docs/assets/video/Twix_Demo.mp4) -![TWIX Figure](docs/assets/image/UI.png) - -# 📚 TWIX API Reference - -This document provides an overview of the available APIs in the TWIX Python package. Each API is described with its functionality, parameters, and return values. - ---- - -## API List - -```python -__all__ = [ - "predict_field", - "predict_template", - "extract_data", - "extract_phrase", - "transform", - "remove_template_node", - "modify_template_node", - "add_fields", - "remove_fields" -] -``` - -Below is a detailed description of each API: - ---- - -### 1. extract_phrase - -Extracts phrases from PDFs by using OCR tools. - -- **Parameters:** - - `data_files` (list): Stores a list of paths to documents that are created using the same template. - - `result_folder` (str): The path to store results. - - `LLM_model_name` (str, optional): Specify the LLM model name. - - `page_to_infer_fields` (int, optional): TWIX extracts all phrases from each input document by default. For field prediction, it separately creates a small document sample by specifying `page_to_infer_fields` (by default is 5), which determines how many pages are used for inferring fields. -- **Returns:** - - `dict`: A dict of phrases per input document. The key in the dict stores the document name. The value corresponds to the raw extracted phrases and their bounding box, which are also written in the result folder. - - `cost` (float): The cost incurred during the function call. - ---- - -### 2. predict_field - -Predicts a list of fields from documents. Fields refer to phrases in table headers or keywords in key-value pairs. - -- **Parameters:** - - `data_files` (list): Stores a list of paths to documents that are created using the same template. - - `result_folder` (str): The path to store results. - - `LLM_model_name` (str, optional): Specify the LLM model name. -- **Returns:** - - `list`: A list of predicted phrases. TWIX also writes the results in the local result folder, naming the file as `twix_key.txt`. - - `cost` (float): The cost incurred during the function call. ---- - -### 3. predict_template - -Predicts the template from documents. A template is defined as a tree (refer to the paper for details) and stored as a JSON. Each tree node (JSON object) corresponds to the abstract of a data block, storing the type and fields of the node. - -- **Parameters:** - - `data_files` (list): Stores a list of paths to documents that are created using the same template. - - `result_folder` (str): The path to store results. - - - `LLM_model_name` (str, optional): Specify the LLM model name. -- **Returns:** - - `list`: The template as a list of nodes, stored locally in the result folder, naming the file as `template.json`. - - `cost` (float): The cost incurred during the function call. ---- - -### 4. extract_data - -Extracts data based on a template. - -- **Parameters:** - - `data_files` (list): Stores a list of paths to documents that are created using the same template. - - `template` (list, optional): The template output from `predict_template`. If not specified, TWIX will look in the local result folder to read the predicted template. - - `result_folder` (str): The path to store results. -- **Returns:** - - `dict`: A dictionary of extraction results, where the key is the file path, and the value is the extraction object of that file. Each extraction object is a list of data blocks containing either table blocks or key-value blocks. Results will be written locally in the result folder, naming the file as `extracted.json`. - - `cost` (float): The cost incurred during the function call. ---- - -### 5. transform - -Provides an end-to-end API to directly extract data from PDFs. - -- **Parameters:** - - `data_files` (list): Stores a list of paths to documents that are created using the same template. - - `result_folder` (str): The path to store results. -- **Returns:** - - `fields` (list): A list of strings representing the predicted fields. - - `template` (list): The template as a list of nodes, stored locally in the result folder, naming the file as `template.json`. - - `extraction_object` (dict): A dictionary of extraction results, where the key is the file path, and the value is the extraction object of that file. Each extraction object is a list of data blocks containing either table blocks or key-value blocks. Results will be written locally in the result folder, naming the file as `extracted.json`. - - `cost` (float): The cost incurred during the function call. ---- - -### 6. add_fields - -Allows users to add fields to the predicted fields. - -- **Parameters:** - - `added_fields` (list): A list of fields to add based on the predicted fields. - - `data_files` (list, optional): Stores a list of paths to documents that are created using the same template. At least one of `data_files` or `result_folder` must be specified. - - `result_folder` (str, optional): The path to store results. If not specified, TWIX will automatically create a folder under `root/tests/out/file_name/`. - ---- - -### 7. remove_fields - -Allows users to delete fields from the predicted fields. - -- **Parameters:** - - `removed_fields` (list): A list of fields to delete based on the predicted fields. - - `data_files` (list, optional): Stores a list of paths to documents that are created using the same template. At least one of `data_files` or `result_folder` must be specified. - - `result_folder` (str, optional): The path to store results. If not specified, TWIX will automatically create a folder under `root/tests/out/file_name/`. - ---- - -### 8. remove_template_node - -Allows users to remove nodes in the predicted template. - -- **Parameters:** - - `node_ids` (list): A list of node IDs. Each node ID is an integer. - - `data_files` (list, optional): Stores a list of paths to documents that are created using the same template. At least one of `data_files` or `result_folder` must be specified. - - `result_folder` (str, optional): The path to store results. If not specified, TWIX will automatically create a folder under `root/tests/out/file_name/`. - ---- - -### 9. modify_template_node - -Allows users to update nodes in the predicted template. - -- **Parameters:** - - `node_id` (int): The integer node ID to update. - - `type` (str): The type of the node to update, either `"kv"` or `"table"`. - - `fields` (list): A list of fields (strings) to update. - - `data_files` (list, optional): Stores a list of paths to documents that are created using the same template. At least one of `data_files` or `result_folder` must be specified. - - `result_folder` (str, optional): The path to store results. If not specified, TWIX will automatically create a folder under `root/tests/out/file_name/`. - - - - - - diff --git a/twix.egg-info/SOURCES.txt b/twix.egg-info/SOURCES.txt deleted file mode 100644 index f491a78..0000000 --- a/twix.egg-info/SOURCES.txt +++ /dev/null @@ -1,17 +0,0 @@ -README.md -setup.py -twix/__init__.py -twix/cost.py -twix/extract.py -twix/key.py -twix/model.py -twix/pattern.py -twix/transform.py -twix/user_apis.py -twix.egg-info/PKG-INFO -twix.egg-info/SOURCES.txt -twix.egg-info/dependency_links.txt -twix.egg-info/requires.txt -twix.egg-info/top_level.txt -twix/models/gpt_4o.py -twix/models/gpt_4o_mini.py \ No newline at end of file diff --git a/twix.egg-info/dependency_links.txt b/twix.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/twix.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/twix.egg-info/requires.txt b/twix.egg-info/requires.txt deleted file mode 100644 index 47daa96..0000000 --- a/twix.egg-info/requires.txt +++ /dev/null @@ -1,9 +0,0 @@ -pytesseract -pdfplumber -pandas -numpy -scipy -tiktoken -pdf2image -gurobipy -openai diff --git a/twix.egg-info/top_level.txt b/twix.egg-info/top_level.txt deleted file mode 100644 index 632b84c..0000000 --- a/twix.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -twix diff --git a/twix/__pycache__/__init__.cpython-39.pyc b/twix/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index c02d731..0000000 Binary files a/twix/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/baselines.cpython-39.pyc b/twix/__pycache__/baselines.cpython-39.pyc deleted file mode 100644 index cc02515..0000000 Binary files a/twix/__pycache__/baselines.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/cost.cpython-39.pyc b/twix/__pycache__/cost.cpython-39.pyc deleted file mode 100644 index 86159d6..0000000 Binary files a/twix/__pycache__/cost.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/eval.cpython-39.pyc b/twix/__pycache__/eval.cpython-39.pyc deleted file mode 100644 index 5e98f23..0000000 Binary files a/twix/__pycache__/eval.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/extract.cpython-39.pyc b/twix/__pycache__/extract.cpython-39.pyc deleted file mode 100644 index a8f3795..0000000 Binary files a/twix/__pycache__/extract.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/key.cpython-39.pyc b/twix/__pycache__/key.cpython-39.pyc deleted file mode 100644 index e8fbee4..0000000 Binary files a/twix/__pycache__/key.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/main.cpython-39.pyc b/twix/__pycache__/main.cpython-39.pyc deleted file mode 100644 index 559a844..0000000 Binary files a/twix/__pycache__/main.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/model.cpython-39.pyc b/twix/__pycache__/model.cpython-39.pyc deleted file mode 100644 index 53fe12c..0000000 Binary files a/twix/__pycache__/model.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/pattern.cpython-39.pyc b/twix/__pycache__/pattern.cpython-39.pyc deleted file mode 100644 index e17b453..0000000 Binary files a/twix/__pycache__/pattern.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/transform.cpython-39.pyc b/twix/__pycache__/transform.cpython-39.pyc deleted file mode 100644 index 68a47e6..0000000 Binary files a/twix/__pycache__/transform.cpython-39.pyc and /dev/null differ diff --git a/twix/__pycache__/user_apis.cpython-39.pyc b/twix/__pycache__/user_apis.cpython-39.pyc deleted file mode 100644 index fafbaa6..0000000 Binary files a/twix/__pycache__/user_apis.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__init__.py b/twix/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/twix/models/__pycache__/__init__.cpython-39.pyc b/twix/models/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 4fa8029..0000000 Binary files a/twix/models/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/ai21_model.cpython-39.pyc b/twix/models/__pycache__/ai21_model.cpython-39.pyc deleted file mode 100644 index d0b6c71..0000000 Binary files a/twix/models/__pycache__/ai21_model.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/bert_model.cpython-39.pyc b/twix/models/__pycache__/bert_model.cpython-39.pyc deleted file mode 100644 index 9c91aed..0000000 Binary files a/twix/models/__pycache__/bert_model.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/flan_t5_base_model.cpython-39.pyc b/twix/models/__pycache__/flan_t5_base_model.cpython-39.pyc deleted file mode 100644 index 81da737..0000000 Binary files a/twix/models/__pycache__/flan_t5_base_model.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/flan_t5_large_model.cpython-39.pyc b/twix/models/__pycache__/flan_t5_large_model.cpython-39.pyc deleted file mode 100644 index b20f260..0000000 Binary files a/twix/models/__pycache__/flan_t5_large_model.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/flan_t5_small_model.cpython-39.pyc b/twix/models/__pycache__/flan_t5_small_model.cpython-39.pyc deleted file mode 100644 index e854d7e..0000000 Binary files a/twix/models/__pycache__/flan_t5_small_model.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_35_azure.cpython-39.pyc b/twix/models/__pycache__/gpt_35_azure.cpython-39.pyc deleted file mode 100644 index d819822..0000000 Binary files a/twix/models/__pycache__/gpt_35_azure.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_35_long.cpython-39.pyc b/twix/models/__pycache__/gpt_35_long.cpython-39.pyc deleted file mode 100644 index d8a5f9c..0000000 Binary files a/twix/models/__pycache__/gpt_35_long.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_35_model.cpython-39.pyc b/twix/models/__pycache__/gpt_35_model.cpython-39.pyc deleted file mode 100644 index 17e4e9d..0000000 Binary files a/twix/models/__pycache__/gpt_35_model.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_4_azure.cpython-39.pyc b/twix/models/__pycache__/gpt_4_azure.cpython-39.pyc deleted file mode 100644 index 269847b..0000000 Binary files a/twix/models/__pycache__/gpt_4_azure.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_4_long.cpython-39.pyc b/twix/models/__pycache__/gpt_4_long.cpython-39.pyc deleted file mode 100644 index 4bec3c9..0000000 Binary files a/twix/models/__pycache__/gpt_4_long.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_4_model.cpython-39.pyc b/twix/models/__pycache__/gpt_4_model.cpython-39.pyc deleted file mode 100644 index d07d929..0000000 Binary files a/twix/models/__pycache__/gpt_4_model.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_4_vision.cpython-39.pyc b/twix/models/__pycache__/gpt_4_vision.cpython-39.pyc deleted file mode 100644 index a58c8ab..0000000 Binary files a/twix/models/__pycache__/gpt_4_vision.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_4o.cpython-39.pyc b/twix/models/__pycache__/gpt_4o.cpython-39.pyc deleted file mode 100644 index 58e972f..0000000 Binary files a/twix/models/__pycache__/gpt_4o.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_4o_mini.cpython-39.pyc b/twix/models/__pycache__/gpt_4o_mini.cpython-39.pyc deleted file mode 100644 index 67f9940..0000000 Binary files a/twix/models/__pycache__/gpt_4o_mini.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_4o_mini_vision.cpython-39.pyc b/twix/models/__pycache__/gpt_4o_mini_vision.cpython-39.pyc deleted file mode 100644 index 82246ac..0000000 Binary files a/twix/models/__pycache__/gpt_4o_mini_vision.cpython-39.pyc and /dev/null differ diff --git a/twix/models/__pycache__/gpt_4o_vision.cpython-39.pyc b/twix/models/__pycache__/gpt_4o_vision.cpython-39.pyc deleted file mode 100644 index eb0465d..0000000 Binary files a/twix/models/__pycache__/gpt_4o_vision.cpython-39.pyc and /dev/null differ