Match data in document to pre-defined schema #75

Vaishnav2804 · web-flow · commit a05bcf59e1bb · 2025-05-31T22:10:30.000-03:00
diff --git a/examples/example_notebook.ipynb b/examples/example_notebook.ipynb
@@ -274,6 +274,98 @@
     "display(Markdown(parsed_md))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### PDF Parsing - Using a Schema"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/dilith/Projects/oidlabs/pdf-parser/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "\u001b[32m2025-05-31 21:47:40.219\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mparse_with_schema\u001b[0m:\u001b[36m355\u001b[0m - \u001b[34m\u001b[1mProcessing page 1 with response: [\n",
+      "  {\n",
+      "    \"Disability Category\": \"Blind\",\n",
+      "    \"Participants\": 5,\n",
+      "    \"Ballots Completed\": 1,\n",
+      "    \"Ballots Incomplete/Terminated\": 4,\n",
+      "    \"Accuracy\": [\n",
+      "      \"34.5%, n=1\"\n",
+      "    ],\n",
+      "    \"Time to complete\": [\n",
+      "      \"1199 sec, n=1\"\n",
+      "    ]\n",
+      "  },\n",
+      "  {\n",
+      "    \"Disability Category\": \"Low Vision\",\n",
+      "    \"Participants\": 5,\n",
+      "    \"Ballots Completed\": 2,\n",
+      "    \"Ballots Incomplete/Terminated\": 3,\n",
+      "    \"Accuracy\": [\n",
+      "      \"98.3% n=2\",\n",
+      "      \"97.7%, n=3\"\n",
+      "    ],\n",
+      "    \"Time to complete\": [\n",
+      "      \"1716 sec, n=3\",\n",
+      "      \"1934 sec, n=2\"\n",
+      "    ]\n",
+      "  },\n",
+      "  {\n",
+      "    \"Disability Category\": \"Dexterity\",\n",
+      "    \"Participants\": 5,\n",
+      "    \"Ballots Completed\": 4,\n",
+      "    \"Ballots Incomplete/Terminated\": 1,\n",
+      "    \"Accuracy\": [\n",
+      "      \"98.3%, n=4\"\n",
+      "    ],\n",
+      "    \"Time to complete\": [\n",
+      "      \"1672.1 sec, n=4\"\n",
+      "    ]\n",
+      "  },\n",
+      "  {\n",
+      "    \"Disability Category\": \"Mobility\",\n",
+      "    \"Participants\": 3,\n",
+      "    \"Ballots Completed\": 3,\n",
+      "    \"Ballots Incomplete/Terminated\": 0,\n",
+      "    \"Accuracy\": [\n",
+      "      \"95.4%, n=3\"\n",
+      "    ],\n",
+      "    \"Time to complete\": [\n",
+      "      \"1416 sec, n=3\"\n",
+      "    ]\n",
+      "  }\n",
+      "]\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "from lexoid.api import parse_with_schema\n",
+    "\n",
+    "sample_schema = [\n",
+    "    {\n",
+    "        \"Disability Category\": \"string\",\n",
+    "        \"Participants\": \"int\",\n",
+    "        \"Ballots Completed\": \"int\",\n",
+    "        \"Ballots Incomplete/Terminated\": \"int\",\n",
+    "        \"Accuracy\": [\"string\"],\n",
+    "        \"Time to complete\": [\"string\"]\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "pdf_path = \"inputs/test_1.pdf\"\n",
+    "\n",
+    "parsed_result = parse_with_schema(path=pdf_path, schema=sample_schema, model=\"gpt-4o\") "
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/examples/example_notebook_colab.ipynb b/examples/example_notebook_colab.ipynb
@@ -784,6 +784,98 @@
         "display(Markdown(parsed_md.replace(\"None\", \"\")))"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### PDF Parsing - Using a Schema"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "/home/dilith/Projects/oidlabs/pdf-parser/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+            "  from .autonotebook import tqdm as notebook_tqdm\n",
+            "\u001b[32m2025-05-31 21:44:21.869\u001b[0m | \u001b[34m\u001b[1mDEBUG   \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mparse_with_schema\u001b[0m:\u001b[36m355\u001b[0m - \u001b[34m\u001b[1mProcessing page 1 with response: [\n",
+            "  {\n",
+            "    \"Disability Category\": \"Blind\",\n",
+            "    \"Participants\": 5,\n",
+            "    \"Ballots Completed\": 1,\n",
+            "    \"Ballots Incomplete/Terminated\": 4,\n",
+            "    \"Accuracy\": [\n",
+            "      \"34.5%, n=1\"\n",
+            "    ],\n",
+            "    \"Time to complete\": [\n",
+            "      \"1199 sec, n=1\"\n",
+            "    ]\n",
+            "  },\n",
+            "  {\n",
+            "    \"Disability Category\": \"Low Vision\",\n",
+            "    \"Participants\": 5,\n",
+            "    \"Ballots Completed\": 2,\n",
+            "    \"Ballots Incomplete/Terminated\": 3,\n",
+            "    \"Accuracy\": [\n",
+            "      \"98.3% n=2\",\n",
+            "      \"97.7%, n=3\"\n",
+            "    ],\n",
+            "    \"Time to complete\": [\n",
+            "      \"1716 sec, n=3\",\n",
+            "      \"1934 sec, n=2\"\n",
+            "    ]\n",
+            "  },\n",
+            "  {\n",
+            "    \"Disability Category\": \"Dexterity\",\n",
+            "    \"Participants\": 5,\n",
+            "    \"Ballots Completed\": 4,\n",
+            "    \"Ballots Incomplete/Terminated\": 1,\n",
+            "    \"Accuracy\": [\n",
+            "      \"98.3%, n=4\"\n",
+            "    ],\n",
+            "    \"Time to complete\": [\n",
+            "      \"1672.1 sec, n=4\"\n",
+            "    ]\n",
+            "  },\n",
+            "  {\n",
+            "    \"Disability Category\": \"Mobility\",\n",
+            "    \"Participants\": 3,\n",
+            "    \"Ballots Completed\": 3,\n",
+            "    \"Ballots Incomplete/Terminated\": 0,\n",
+            "    \"Accuracy\": [\n",
+            "      \"95.4%, n=3\"\n",
+            "    ],\n",
+            "    \"Time to complete\": [\n",
+            "      \"1416 sec, n=3\"\n",
+            "    ]\n",
+            "  }\n",
+            "]\u001b[0m\n"
+          ]
+        }
+      ],
+      "source": [
+        "from lexoid.api import parse_with_schema\n",
+        "\n",
+        "sample_schema = [\n",
+        "    {\n",
+        "        \"Disability Category\": \"string\",\n",
+        "        \"Participants\": \"int\",\n",
+        "        \"Ballots Completed\": \"int\",\n",
+        "        \"Ballots Incomplete/Terminated\": \"int\",\n",
+        "        \"Accuracy\": [\"string\"],\n",
+        "        \"Time to complete\": [\"string\"]\n",
+        "    }\n",
+        "]\n",
+        "\n",
+        "pdf_path = \"inputs/test_1.pdf\"\n",
+        "\n",
+        "parsed_result = parse_with_schema(path=pdf_path, schema=sample_schema, model=\"gpt-4o\") "
+      ]
+    },
     {
       "cell_type": "markdown",
       "metadata": {
@@ -1896,7 +1988,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.10.12"
+      "version": "3.12.3"
     }
   },
   "nbformat": 4,
diff --git a/lexoid/api.py b/lexoid/api.py
@@ -10,7 +10,11 @@
 
 from loguru import logger
 
-from lexoid.core.parse_type.llm_parser import parse_llm_doc
+from lexoid.core.parse_type.llm_parser import (
+    parse_llm_doc,
+    create_response,
+    convert_doc_to_base64_images,
+)
 from lexoid.core.parse_type.static_parser import parse_static_doc
 from lexoid.core.utils import (
     convert_to_pdf,
@@ -293,3 +297,63 @@ def parse(
         result["recursive_docs"] = recursive_docs
 
     return result
+
+
+def parse_with_schema(
+    path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs
+) -> List[List[Dict]]:
+    """
+    Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
+
+    Args:
+        path (str): Path to the PDF file.
+        schema (Dict): JSON schema to which the parsed output should conform.
+        api (str, optional): LLM API provider.
+        model (str, optional): LLM model name.
+        **kwargs: Additional arguments for the parser.
+
+    Returns:
+        List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
+    """
+    system_prompt = f"""
+        The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+        As an example, for the schema {{
+        "properties": {{
+            "foo": {{
+            "title": "Foo",
+            "description": "a list of strings",
+            "type": "array",
+            "items": {{"type": "string"}}
+            }}
+        }},
+        "required": ["foo"]
+        }}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not.
+
+        Here is the output schema:
+        {json.dumps(schema, indent=2)}
+
+        """
+
+    user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format."
+
+    responses = []
+    images = convert_doc_to_base64_images(path)
+    for i, (page_num, image) in enumerate(images):
+        resp_dict = create_response(
+            api=api,
+            model=model,
+            user_prompt=user_prompt,
+            system_prompt=system_prompt,
+            image_url=image,
+            temperature=kwargs.get("temperature", 0.0),
+            max_tokens=kwargs.get("max_tokens", 1024),
+        )
+
+        response = resp_dict.get("response", "")
+        response = response.split("```json")[-1].split("```")[0].strip()
+        logger.debug(f"Processing page {page_num + 1} with response: {response}")
+        new_dict = json.loads(response)
+        responses.append(new_dict)
+
+    return responses
diff --git a/lexoid/core/parse_type/llm_parser.py b/lexoid/core/parse_type/llm_parser.py
@@ -4,7 +4,7 @@
 import os
 import time
 from functools import wraps
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import pypdfium2 as pdfium
 import requests
@@ -372,3 +372,28 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             "total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
         },
     }
+
+
+def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
+    """
+    Converts a document (PDF or image) to a base64 encoded string.
+
+    Args:
+        path (str): Path to the PDF file.
+
+    Returns:
+        str: Base64 encoded string of the PDF content.
+    """
+    if path.endswith(".pdf"):
+        pdf_document = pdfium.PdfDocument(path)
+        return [
+            (
+                page_num,
+                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+            )
+            for page_num in range(len(pdf_document))
+        ]
+    elif mimetypes.guess_type(path)[0].startswith("image"):
+        with open(path, "rb") as img_file:
+            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+            return [(0, f"data:image/png;base64,{image_base64}")]