Skip to content

Commit a05bcf5

Browse files
authored
Match data in document to pre-defined schema #75
1 parent 59655cc commit a05bcf5

4 files changed

Lines changed: 276 additions & 3 deletions

File tree

examples/example_notebook.ipynb

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,98 @@
274274
"display(Markdown(parsed_md))"
275275
]
276276
},
277+
{
278+
"cell_type": "markdown",
279+
"metadata": {},
280+
"source": [
281+
"### PDF Parsing - Using a Schema"
282+
]
283+
},
284+
{
285+
"cell_type": "code",
286+
"execution_count": 1,
287+
"metadata": {},
288+
"outputs": [
289+
{
290+
"name": "stderr",
291+
"output_type": "stream",
292+
"text": [
293+
"/home/dilith/Projects/oidlabs/pdf-parser/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
294+
" from .autonotebook import tqdm as notebook_tqdm\n",
295+
"\u001b[32m2025-05-31 21:47:40.219\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mparse_with_schema\u001b[0m:\u001b[36m355\u001b[0m - \u001b[34m\u001b[1mProcessing page 1 with response: [\n",
296+
" {\n",
297+
" \"Disability Category\": \"Blind\",\n",
298+
" \"Participants\": 5,\n",
299+
" \"Ballots Completed\": 1,\n",
300+
" \"Ballots Incomplete/Terminated\": 4,\n",
301+
" \"Accuracy\": [\n",
302+
" \"34.5%, n=1\"\n",
303+
" ],\n",
304+
" \"Time to complete\": [\n",
305+
" \"1199 sec, n=1\"\n",
306+
" ]\n",
307+
" },\n",
308+
" {\n",
309+
" \"Disability Category\": \"Low Vision\",\n",
310+
" \"Participants\": 5,\n",
311+
" \"Ballots Completed\": 2,\n",
312+
" \"Ballots Incomplete/Terminated\": 3,\n",
313+
" \"Accuracy\": [\n",
314+
" \"98.3% n=2\",\n",
315+
" \"97.7%, n=3\"\n",
316+
" ],\n",
317+
" \"Time to complete\": [\n",
318+
" \"1716 sec, n=3\",\n",
319+
" \"1934 sec, n=2\"\n",
320+
" ]\n",
321+
" },\n",
322+
" {\n",
323+
" \"Disability Category\": \"Dexterity\",\n",
324+
" \"Participants\": 5,\n",
325+
" \"Ballots Completed\": 4,\n",
326+
" \"Ballots Incomplete/Terminated\": 1,\n",
327+
" \"Accuracy\": [\n",
328+
" \"98.3%, n=4\"\n",
329+
" ],\n",
330+
" \"Time to complete\": [\n",
331+
" \"1672.1 sec, n=4\"\n",
332+
" ]\n",
333+
" },\n",
334+
" {\n",
335+
" \"Disability Category\": \"Mobility\",\n",
336+
" \"Participants\": 3,\n",
337+
" \"Ballots Completed\": 3,\n",
338+
" \"Ballots Incomplete/Terminated\": 0,\n",
339+
" \"Accuracy\": [\n",
340+
" \"95.4%, n=3\"\n",
341+
" ],\n",
342+
" \"Time to complete\": [\n",
343+
" \"1416 sec, n=3\"\n",
344+
" ]\n",
345+
" }\n",
346+
"]\u001b[0m\n"
347+
]
348+
}
349+
],
350+
"source": [
351+
"from lexoid.api import parse_with_schema\n",
352+
"\n",
353+
"sample_schema = [\n",
354+
" {\n",
355+
" \"Disability Category\": \"string\",\n",
356+
" \"Participants\": \"int\",\n",
357+
" \"Ballots Completed\": \"int\",\n",
358+
" \"Ballots Incomplete/Terminated\": \"int\",\n",
359+
" \"Accuracy\": [\"string\"],\n",
360+
" \"Time to complete\": [\"string\"]\n",
361+
" }\n",
362+
"]\n",
363+
"\n",
364+
"pdf_path = \"inputs/test_1.pdf\"\n",
365+
"\n",
366+
"parsed_result = parse_with_schema(path=pdf_path, schema=sample_schema, model=\"gpt-4o\") "
367+
]
368+
},
277369
{
278370
"cell_type": "markdown",
279371
"metadata": {},

examples/example_notebook_colab.ipynb

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,98 @@
784784
"display(Markdown(parsed_md.replace(\"None\", \"\")))"
785785
]
786786
},
787+
{
788+
"cell_type": "markdown",
789+
"metadata": {},
790+
"source": [
791+
"### PDF Parsing - Using a Schema"
792+
]
793+
},
794+
{
795+
"cell_type": "code",
796+
"execution_count": 1,
797+
"metadata": {},
798+
"outputs": [
799+
{
800+
"name": "stderr",
801+
"output_type": "stream",
802+
"text": [
803+
"/home/dilith/Projects/oidlabs/pdf-parser/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
804+
" from .autonotebook import tqdm as notebook_tqdm\n",
805+
"\u001b[32m2025-05-31 21:44:21.869\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mparse_with_schema\u001b[0m:\u001b[36m355\u001b[0m - \u001b[34m\u001b[1mProcessing page 1 with response: [\n",
806+
" {\n",
807+
" \"Disability Category\": \"Blind\",\n",
808+
" \"Participants\": 5,\n",
809+
" \"Ballots Completed\": 1,\n",
810+
" \"Ballots Incomplete/Terminated\": 4,\n",
811+
" \"Accuracy\": [\n",
812+
" \"34.5%, n=1\"\n",
813+
" ],\n",
814+
" \"Time to complete\": [\n",
815+
" \"1199 sec, n=1\"\n",
816+
" ]\n",
817+
" },\n",
818+
" {\n",
819+
" \"Disability Category\": \"Low Vision\",\n",
820+
" \"Participants\": 5,\n",
821+
" \"Ballots Completed\": 2,\n",
822+
" \"Ballots Incomplete/Terminated\": 3,\n",
823+
" \"Accuracy\": [\n",
824+
" \"98.3% n=2\",\n",
825+
" \"97.7%, n=3\"\n",
826+
" ],\n",
827+
" \"Time to complete\": [\n",
828+
" \"1716 sec, n=3\",\n",
829+
" \"1934 sec, n=2\"\n",
830+
" ]\n",
831+
" },\n",
832+
" {\n",
833+
" \"Disability Category\": \"Dexterity\",\n",
834+
" \"Participants\": 5,\n",
835+
" \"Ballots Completed\": 4,\n",
836+
" \"Ballots Incomplete/Terminated\": 1,\n",
837+
" \"Accuracy\": [\n",
838+
" \"98.3%, n=4\"\n",
839+
" ],\n",
840+
" \"Time to complete\": [\n",
841+
" \"1672.1 sec, n=4\"\n",
842+
" ]\n",
843+
" },\n",
844+
" {\n",
845+
" \"Disability Category\": \"Mobility\",\n",
846+
" \"Participants\": 3,\n",
847+
" \"Ballots Completed\": 3,\n",
848+
" \"Ballots Incomplete/Terminated\": 0,\n",
849+
" \"Accuracy\": [\n",
850+
" \"95.4%, n=3\"\n",
851+
" ],\n",
852+
" \"Time to complete\": [\n",
853+
" \"1416 sec, n=3\"\n",
854+
" ]\n",
855+
" }\n",
856+
"]\u001b[0m\n"
857+
]
858+
}
859+
],
860+
"source": [
861+
"from lexoid.api import parse_with_schema\n",
862+
"\n",
863+
"sample_schema = [\n",
864+
" {\n",
865+
" \"Disability Category\": \"string\",\n",
866+
" \"Participants\": \"int\",\n",
867+
" \"Ballots Completed\": \"int\",\n",
868+
" \"Ballots Incomplete/Terminated\": \"int\",\n",
869+
" \"Accuracy\": [\"string\"],\n",
870+
" \"Time to complete\": [\"string\"]\n",
871+
" }\n",
872+
"]\n",
873+
"\n",
874+
"pdf_path = \"inputs/test_1.pdf\"\n",
875+
"\n",
876+
"parsed_result = parse_with_schema(path=pdf_path, schema=sample_schema, model=\"gpt-4o\") "
877+
]
878+
},
787879
{
788880
"cell_type": "markdown",
789881
"metadata": {
@@ -1896,7 +1988,7 @@
18961988
"name": "python",
18971989
"nbconvert_exporter": "python",
18981990
"pygments_lexer": "ipython3",
1899-
"version": "3.10.12"
1991+
"version": "3.12.3"
19001992
}
19011993
},
19021994
"nbformat": 4,

lexoid/api.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010

1111
from loguru import logger
1212

13-
from lexoid.core.parse_type.llm_parser import parse_llm_doc
13+
from lexoid.core.parse_type.llm_parser import (
14+
parse_llm_doc,
15+
create_response,
16+
convert_doc_to_base64_images,
17+
)
1418
from lexoid.core.parse_type.static_parser import parse_static_doc
1519
from lexoid.core.utils import (
1620
convert_to_pdf,
@@ -293,3 +297,63 @@ def parse(
293297
result["recursive_docs"] = recursive_docs
294298

295299
return result
300+
301+
302+
def _extract_json_payload(text: str) -> str:
    """Return the JSON text of an LLM reply with any Markdown code fence removed."""
    if "```json" in text:
        # Tagged fence: keep what follows the last ```json marker, up to the
        # next closing fence.
        return text.split("```json")[-1].split("```")[0].strip()
    if "```" in text:
        # Untagged fence: the payload sits between the first pair of fences.
        return text.split("```")[1].strip()
    return text.strip()


def parse_with_schema(
    path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs
) -> List[List[Dict]]:
    """
    Parses a document using an LLM to generate structured output conforming to a given JSON schema.

    The document is converted to one image per page, and each page image is
    sent to the model in its own request together with the schema.

    Args:
        path (str): Path to the document (PDF or image file).
        schema (Dict): JSON-serialisable schema to which the parsed output
            should conform. It is embedded in the prompt via ``json.dumps``,
            so a list-shaped schema works as well.
        api (str, optional): LLM API provider.
        model (str, optional): LLM model name.
        **kwargs: Additional arguments for the parser. ``temperature``
            (default 0.0) and ``max_tokens`` (default 1024) are forwarded to
            the LLM request.

    Returns:
        List[List[Dict]]: One parsed JSON value per page, in page order, each
            expected to conform to the provided schema.

    Raises:
        ValueError: If the document could not be converted to page images.
        json.JSONDecodeError: If the model's reply for a page is not valid JSON.
    """
    system_prompt = f"""
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {{
"properties": {{
"foo": {{
"title": "Foo",
"description": "a list of strings",
"type": "array",
"items": {{"type": "string"}}
}}
}},
"required": ["foo"]
}}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not.

Here is the output schema:
{json.dumps(schema, indent=2)}

"""

    user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format."

    images = convert_doc_to_base64_images(path)
    if images is None:
        # Fail with a clear message instead of an opaque TypeError in the loop.
        raise ValueError(f"Unsupported document type: {path}")

    # Request settings do not change between pages, so resolve them once.
    temperature = kwargs.get("temperature", 0.0)
    max_tokens = kwargs.get("max_tokens", 1024)

    responses = []
    for page_num, image in images:
        resp_dict = create_response(
            api=api,
            model=model,
            user_prompt=user_prompt,
            system_prompt=system_prompt,
            image_url=image,
            temperature=temperature,
            max_tokens=max_tokens,
        )

        # A missing or None "response" is treated as an empty reply.
        response = _extract_json_payload(resp_dict.get("response") or "")
        logger.debug(f"Processing page {page_num + 1} with response: {response}")
        try:
            responses.append(json.loads(response))
        except json.JSONDecodeError:
            # Record which page failed, then let the original error propagate.
            logger.error(f"Page {page_num + 1}: model reply is not valid JSON")
            raise

    return responses

lexoid/core/parse_type/llm_parser.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import os
55
import time
66
from functools import wraps
7-
from typing import Dict, List, Optional
7+
from typing import Dict, List, Optional, Tuple
88

99
import pypdfium2 as pdfium
1010
import requests
@@ -372,3 +372,28 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
372372
"total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
373373
},
374374
}
375+
376+
377+
def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
    """
    Converts a document (PDF or image) to base64-encoded image data URLs.

    Args:
        path (str): Path to a PDF or image file. The type is decided from the
            file extension.

    Returns:
        List[Tuple[int, str]]: ``(page_index, data_url)`` pairs with zero-based
            page indices. A PDF yields one entry per page; an image file
            yields a single entry with index 0.

    Raises:
        ValueError: If the path is neither a PDF nor a recognised image type.
    """
    # Case-insensitive so that "REPORT.PDF" is handled like "report.pdf".
    if path.lower().endswith(".pdf"):
        pdf_document = pdfium.PdfDocument(path)
        # NOTE(review): pages are labelled image/png, which assumes
        # convert_pdf_page_to_base64 renders PNG data - confirm.
        return [
            (
                page_num,
                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
            )
            for page_num in range(len(pdf_document))
        ]

    # guess_type returns (None, None) for unknown extensions, so check for
    # None before calling str methods on the result.
    mime_type = mimetypes.guess_type(path)[0]
    if mime_type is None or not mime_type.startswith("image"):
        raise ValueError(
            f"Unsupported document type for {path!r}: expected a PDF or an image"
        )

    with open(path, "rb") as img_file:
        image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
    # Label the data URL with the detected MIME type, not a fixed image/png.
    return [(0, f"data:{mime_type};base64,{image_base64}")]

0 commit comments

Comments
 (0)