-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathpdf_to_table.py
More file actions
152 lines (116 loc) · 5.59 KB
/
Copy pathpdf_to_table.py
File metadata and controls
152 lines (116 loc) · 5.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# https://www.geeksforgeeks.org/python-reading-contents-of-pdf-using-ocr-optical-character-recognition/
# Import libraries
# from PIL import Image
# import pytesseract
# import sys
# from pdf2image import convert_from_path
# import os
import io
from PyPDF2 import PdfReader
import re
import pandas as pd
import streamlit as st
# Status labels wrapped in '#' so they end up in a column of their own.
# Order matters: the bare "Openbaar" is wrapped last, which temporarily turns
# e.g. "#Deels Openbaar#" into "#Deels #Openbaar##"; that is repaired by the
# first three entries of _LITERAL_FIXES_BEFORE_REGEX.
_STATUS_LABELS = ("Reeds Openbaar", "Deels Openbaar", "Niet Openbaar", "Openbaar")

# Literal (old, new) substitutions, applied in exactly this order.
# "# " -> "#" is intentionally listed twice: the second pass removes a second
# space that directly followed a '#'.
_LITERAL_FIXES_BEFORE_REGEX = (
    ("#Deels #Openbaar##", "#Deels Openbaar#"),
    ("#Reeds #Openbaar##", "#Reeds Openbaar#"),
    ("#Niet #Openbaar##", "#Niet Openbaar#"),
    ("Openbaa r", "Openbaar"),
    ("; 10.", "#10."),
    ("; 11.", "#11."),
    (";", "#"),
    (" 5.", "#5."),
    ("# 5", "#5"),
    ("# ", "#"),
    ("# ", "#"),
)

# Literal substitutions applied after the regex steps.
_LITERAL_FIXES_AFTER_REGEX = (
    ("\u00A0", " "),  # non-breaking space -> regular space
    ("##", "#"),
    ("#buiten#verzoe#k", "#buiten verzoek"),
)

# A newline followed by 6-7 digits; a '#' is appended right after the digits.
# Presumably this is the document number that starts a table row - confirm.
_LEADING_NUMBER = re.compile(r'(\n\d{6,7})')
# Any whitespace character except a newline, directly followed by "5" / "buiten".
_WHITESPACE_BEFORE_5 = re.compile(r'[^\S\n]5')
_WHITESPACE_BEFORE_BUITEN = re.compile(r'[^\S\n]buiten')

# Both spellings occur in the extracted text: text extraction sometimes splits
# the last letter off ("verzoe k").
_BUITEN_VERZOEK_SPELLINGS = ("buiten verzoek", "buiten verzoe k")


def _normalize_page_text(text):
    """Insert '#' column separators into the raw text of one PDF page.

    The substitutions are order-dependent and are applied in the sequence
    given by the module-level tables above.

    Args:
        text: Text of a single page as returned by ``page.extract_text()``.

    Returns:
        The page text with '#' marking the column boundaries.
    """
    text = _LEADING_NUMBER.sub(r'\1#', text)
    for label in _STATUS_LABELS:
        text = text.replace(label, f'#{label}#')
    for old, new in _LITERAL_FIXES_BEFORE_REGEX:
        text = text.replace(old, new)
    text = _WHITESPACE_BEFORE_5.sub('#5', text)
    text = _WHITESPACE_BEFORE_BUITEN.sub('#buiten', text)
    # NOTE(review): the previous version also replaced "; buiten verzoek",
    # "; buiten verzoe k", ";5." and "; 5." *after* every ';' had already been
    # turned into '#', so those could never match and were dropped. It also
    # contained two `replace(" ", " ")` calls, which are no-ops as written;
    # presumably they were meant to collapse double spaces - confirm against
    # the original file before re-adding anything.
    for old, new in _LITERAL_FIXES_AFTER_REGEX:
        text = text.replace(old, new)
    return text


def _row_contains(cells, *values):
    """Return a boolean Series: True where a row has a cell equal to any of *values*."""
    return cells.isin(list(values)).any(axis=1)


def _build_table(all_text):
    """Split the '#'-separated text into a DataFrame and add boolean flag columns.

    Every line of *all_text* becomes one row and every '#'-separated part one
    cell. Flag columns are then appended: ``101a``..``101i``, ``102a``..``102i``
    and ``512a``..``512i`` (interleaved per letter), followed by ``515``,
    ``BuitenVerzoek`` and ``111concept``. A flag is True when a cell in the
    inspected column range is exactly equal to the code.

    Args:
        all_text: The normalized text of all pages, joined with newlines.

    Returns:
        The DataFrame with the parsed cells and the flag columns.
    """
    rows = [line.split('#') for line in all_text.splitlines()]
    df = pd.DataFrame(rows)

    # Take the column slices once, before any flag column is added, so that
    # the flags never inspect other flags on narrow tables.
    cells_3_to_8 = df.iloc[:, 3:9]
    cells_2_to_4 = df.iloc[:, 2:5]
    cells_1_to_8 = df.iloc[:, 1:9]

    for letter in "abcdefghi":
        df[f"101{letter}"] = _row_contains(cells_3_to_8, f"10.1.{letter}")
        df[f"102{letter}"] = _row_contains(cells_3_to_8, f"10.2.{letter}")
        df[f"512{letter}"] = _row_contains(cells_2_to_4, f"5.1.2{letter}")
    df["515"] = _row_contains(cells_2_to_4, "5.1.5")
    # Accept both spellings; previously the result for "buiten verzoe k" was
    # computed and then immediately overwritten.
    df["BuitenVerzoek"] = _row_contains(cells_1_to_8, *_BUITEN_VERZOEK_SPELLINGS)
    df["111concept"] = _row_contains(cells_3_to_8, "11.1, concept")
    return df


def read_directly_from_pdf():
    """Let the user upload a PDF and turn its text into a downloadable table.

    Steps:
      1. Ask for a PDF via the Streamlit file uploader. Without an upload a
         warning is shown and the script run is stopped.
      2. Extract the text of every page, insert '#' column separators and
         show a progress bar while doing so.
      3. Split the text into a DataFrame and add the boolean flag columns.
      4. Offer the table as CSV and Excel download and display it.

    Returns:
        None. All output goes to the Streamlit page.
    """
    uploaded_pdf = st.file_uploader("Choose a file")
    if uploaded_pdf is None:
        st.warning("You need to upload a pdf file. Files are not stored anywhere after the processing of this script")
        st.stop()

    # Parse the upload exactly once, inside the guard, so that a broken file
    # produces a message on the page instead of a traceback.
    try:
        reader = PdfReader(uploaded_pdf)
    except Exception as e:
        st.error(f"Error loading / parsing the PDF file: {str(e)}")
        st.stop()

    number_of_pages = len(reader.pages)
    placeholder = st.empty()
    page_texts = []
    for i, page in enumerate(reader.pages):
        # `or ""` guards against an extractor that yields nothing for a page.
        page_texts.append(_normalize_page_text(page.extract_text() or ""))
        progress_txt = f"Reading page {i+1}/{number_of_pages}"
        # (i + 1) so that the bar reaches 100% on the last page.
        placeholder.progress((i + 1) / number_of_pages, f"Wait for it...{progress_txt}")
    placeholder.empty()

    # Every page is preceded by a newline, so a row never spans two pages.
    all_text = "".join("\n" + page_text for page_text in page_texts)
    df = _build_table(all_text)

    st.download_button("Download CSV", data=df.to_csv(index=False), file_name="output.csv", mime="text/csv")

    # to_excel needs a binary target; write to memory and rewind for download.
    excel_buffer = io.BytesIO()
    df.to_excel(excel_buffer, index=False, engine='openpyxl')
    excel_buffer.seek(0)
    st.download_button(
        label="Download Excel",
        data=excel_buffer,
        file_name="output.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
    st.write(df)
def main():
    """Render the introduction of the app and start the PDF workflow.

    Shows one info banner and four explanatory paragraphs on the Streamlit
    page, then hands over to ``read_directly_from_pdf``.
    """
    st.info("Read PDF files from Dutch governement")
    intro_paragraphs = (
        "This script reads a PDF file and extracts the text from it. It then processes the text to create a DataFrame.",
        "The DataFrame is then saved as a CSV and an Excel file, which can be downloaded.",
        "It is specifically written for a type of document (Documentoverzicht inzake WOO verzoeken Covid)",
        "Contact me for tailor made solutions (@rcsmit on all social media)",
    )
    for paragraph in intro_paragraphs:
        st.write(paragraph)
    read_directly_from_pdf()


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()