-
Notifications
You must be signed in to change notification settings - Fork 96
Expand file tree
/
Copy pathscript.py
More file actions
45 lines (36 loc) · 1.14 KB
/
script.py
File metadata and controls
45 lines (36 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import codecs
def convert(fname, pages=None):
    """Extract the text content of a PDF file with pdfminer.

    Args:
        fname: Path of the PDF file to read; it is opened in binary mode.
        pages: Optional iterable of page numbers to restrict extraction to.
            The values are handed unchanged (as a set) to
            ``PDFPage.get_pages``; when omitted or empty, an empty set is
            passed instead.

    Returns:
        Whatever ``TextConverter`` wrote into the in-memory buffer while the
        selected pages were processed.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # `with` guarantees the PDF handle is released even when a page
            # fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before it is closed in the outer `finally`.
        return output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, which never
        # actually closed the buffer.
        output.close()
# Extract the text of the sample PDF and store it exactly as convert()
# returned it.
text = convert('socialcops.pdf')
with open('socialcops.txt', 'w') as saveFile:
    saveFile.write(text)

# Read the dump back, decoding it as UTF-8, and write the decoded text out
# again through a UTF-8 encoder.
# NOTE(review): presumably this round trip exists to validate/normalise the
# encoding of the extracted (Hindi) text -- confirm.
with codecs.open('socialcops.txt', encoding='utf-8') as f:
    text = f.read()
with codecs.open('socialcops2.txt', 'w', encoding='utf-8') as f:
    f.write(text)

# At this point `text` holds the decoded contents of socialcops.txt;
# printing it shows the Hindi text.