|
784 | 784 | "display(Markdown(parsed_md.replace(\"None\", \"\")))" |
785 | 785 | ] |
786 | 786 | }, |
| 787 | + { |
| 788 | + "cell_type": "markdown", |
| 789 | + "metadata": {}, |
| 790 | + "source": [ |
| 791 | + "### PDF Parsing - Using a Schema" |
| 792 | + ] |
| 793 | + }, |
| 794 | + { |
| 795 | + "cell_type": "code", |
| 796 | + "execution_count": 1, |
| 797 | + "metadata": {}, |
| 798 | + "outputs": [ |
| 799 | + { |
| 800 | + "name": "stderr", |
| 801 | + "output_type": "stream", |
| 802 | + "text": [ |
| 803 | + "/home/dilith/Projects/oidlabs/pdf-parser/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", |
| 804 | + " from .autonotebook import tqdm as notebook_tqdm\n", |
| 805 | + "\u001b[32m2025-05-31 21:44:21.869\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mlexoid.api\u001b[0m:\u001b[36mparse_with_schema\u001b[0m:\u001b[36m355\u001b[0m - \u001b[34m\u001b[1mProcessing page 1 with response: [\n", |
| 806 | + " {\n", |
| 807 | + " \"Disability Category\": \"Blind\",\n", |
| 808 | + " \"Participants\": 5,\n", |
| 809 | + " \"Ballots Completed\": 1,\n", |
| 810 | + " \"Ballots Incomplete/Terminated\": 4,\n", |
| 811 | + " \"Accuracy\": [\n", |
| 812 | + " \"34.5%, n=1\"\n", |
| 813 | + " ],\n", |
| 814 | + " \"Time to complete\": [\n", |
| 815 | + " \"1199 sec, n=1\"\n", |
| 816 | + " ]\n", |
| 817 | + " },\n", |
| 818 | + " {\n", |
| 819 | + " \"Disability Category\": \"Low Vision\",\n", |
| 820 | + " \"Participants\": 5,\n", |
| 821 | + " \"Ballots Completed\": 2,\n", |
| 822 | + " \"Ballots Incomplete/Terminated\": 3,\n", |
| 823 | + " \"Accuracy\": [\n", |
| 824 | + " \"98.3% n=2\",\n", |
| 825 | + " \"97.7%, n=3\"\n", |
| 826 | + " ],\n", |
| 827 | + " \"Time to complete\": [\n", |
| 828 | + " \"1716 sec, n=3\",\n", |
| 829 | + " \"1934 sec, n=2\"\n", |
| 830 | + " ]\n", |
| 831 | + " },\n", |
| 832 | + " {\n", |
| 833 | + " \"Disability Category\": \"Dexterity\",\n", |
| 834 | + " \"Participants\": 5,\n", |
| 835 | + " \"Ballots Completed\": 4,\n", |
| 836 | + " \"Ballots Incomplete/Terminated\": 1,\n", |
| 837 | + " \"Accuracy\": [\n", |
| 838 | + " \"98.3%, n=4\"\n", |
| 839 | + " ],\n", |
| 840 | + " \"Time to complete\": [\n", |
| 841 | + " \"1672.1 sec, n=4\"\n", |
| 842 | + " ]\n", |
| 843 | + " },\n", |
| 844 | + " {\n", |
| 845 | + " \"Disability Category\": \"Mobility\",\n", |
| 846 | + " \"Participants\": 3,\n", |
| 847 | + " \"Ballots Completed\": 3,\n", |
| 848 | + " \"Ballots Incomplete/Terminated\": 0,\n", |
| 849 | + " \"Accuracy\": [\n", |
| 850 | + " \"95.4%, n=3\"\n", |
| 851 | + " ],\n", |
| 852 | + " \"Time to complete\": [\n", |
| 853 | + " \"1416 sec, n=3\"\n", |
| 854 | + " ]\n", |
| 855 | + " }\n", |
| 856 | + "]\u001b[0m\n" |
| 857 | + ] |
| 858 | + } |
| 859 | + ], |
| 860 | + "source": [ |
| 861 | + "from lexoid.api import parse_with_schema\n", |
| 862 | + "\n", |
| 863 | + "sample_schema = [\n", |
| 864 | + " {\n", |
| 865 | + " \"Disability Category\": \"string\",\n", |
| 866 | + " \"Participants\": \"int\",\n", |
| 867 | + " \"Ballots Completed\": \"int\",\n", |
| 868 | + " \"Ballots Incomplete/Terminated\": \"int\",\n", |
| 869 | + " \"Accuracy\": [\"string\"],\n", |
| 870 | + " \"Time to complete\": [\"string\"]\n", |
| 871 | + " }\n", |
| 872 | + "]\n", |
| 873 | + "\n", |
| 874 | + "pdf_path = \"inputs/test_1.pdf\"\n", |
| 875 | + "\n", |
| 876 | + "parsed_result = parse_with_schema(path=pdf_path, schema=sample_schema, model=\"gpt-4o\") " |
| 877 | + ] |
| 878 | + }, |
787 | 879 | { |
788 | 880 | "cell_type": "markdown", |
789 | 881 | "metadata": { |
|
1896 | 1988 | "name": "python", |
1897 | 1989 | "nbconvert_exporter": "python", |
1898 | 1990 | "pygments_lexer": "ipython3", |
1899 | | - "version": "3.10.12" |
| 1991 | + "version": "3.12.3" |
1900 | 1992 | } |
1901 | 1993 | }, |
1902 | 1994 | "nbformat": 4, |
|
0 commit comments