-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
41 lines (33 loc) · 1.12 KB
/
main.py
File metadata and controls
41 lines (33 loc) · 1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from unicodedata import name
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import io
import re
def pdfminer(file, output):
    """Extract the text of a PDF and write it to a text file.

    Every page of the PDF is run through a pdfminer ``TextConverter`` that
    accumulates text in an in-memory buffer; the buffer's contents are then
    written to ``output``.

    Args:
        file: Path of the PDF to read (opened in binary mode).
        output: Path of the text file to write. It is opened in ``'w'`` mode
            with the platform default encoding, so an existing file is
            overwritten.

    Returns:
        None. The result is the file written at ``output``.
    """
    rM = PDFResourceManager()
    data = io.StringIO()
    converter = TextConverter(rM, data, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rM, converter)
        # ``with`` guarantees the PDF handle is released even if a page
        # fails to parse part-way through the document.
        with open(file, 'rb') as in_file:
            for page in PDFPage.get_pages(in_file):
                interpreter.process_page(page)
        # Grab the accumulated text before the converter is closed.
        txt = data.getvalue()
    finally:
        converter.close()

    with open(output, 'w') as f:
        f.write(txt)
# Script section: convert the PDF to plain text, then pull e-mail addresses,
# candidate names and phone numbers out of that text with regular expressions.
file = "test.pdf"
output = "info.txt"
pdfminer(file, output)

# Read back the text file that pdfminer() just wrote. Reusing ``output``
# keeps the reader and writer pointed at the same path.
with open(output, 'r') as text_file:
    stringData = text_file.read()

# All patterns are raw strings so that ``\.``, ``\(`` and ``\d`` reach the
# regex engine untouched instead of being treated as (invalid) string escapes.
#
# E-mail: the local part may contain ``. _ % + -`` and the domain may contain
# dots and hyphens, so addresses such as ``first.last@mail.example.com`` are
# captured whole rather than truncated. The TLD class is letters only; a
# ``|`` inside ``[...]`` is a literal pipe, not alternation.
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
# Name: any two ASCII-letter words separated by a single space. This is a
# loose heuristic and will also match ordinary word pairs in the text.
name_pattern = r'[a-zA-Z]+ [a-zA-Z]+'
# Phone: 3-3-4 digit groups such as ``555-123-4567``, ``555.123.4567`` or
# ``(555)123-4567``; the optional whitespace also admits ``(555) 123-4567``.
phone_pattern = r"\(?\d{3}[-.)]\s?\d{3}[-.]\d{4}"

email_list = re.findall(email_pattern, stringData)
name_list = re.findall(name_pattern, stringData)
phone_list = re.findall(phone_pattern, stringData)

print(email_list)
print(name_list)
print(phone_list)