-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess.py
More file actions
executable file
·63 lines (51 loc) · 2.06 KB
/
process.py
File metadata and controls
executable file
·63 lines (51 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/python3
"""Simple script to write title, url, and HTML body content of HTML files
to separate numbered files.
Custom string replacements can be applied to the extracted content.
Note on security: we assume that your input files are correctly HTML-encoded,
and contain no harmful code.
If your input data is different, then take precautions!
"""
import glob
import os
import re
# Input and output folders; make sure that these folders exist.
# '~' is expanded here because neither os.chdir() nor glob expands it.
static_dump_dir = os.path.expanduser('~/site-dump')
processed_dir = os.path.expanduser('~/site-dump/processed')

# Text removed from every page title (wherever it occurs in the title).
title_prefix_to_remove = 'Site: Main - '

# Literal substring replacements (old -> new) applied to the page body.
html_replacements = {
    '/site/': '/',
    '/HomePage/': '/HomePage',
}

# regular expression to parse out the title of the page
# ('.' does not match newlines here, so the title must sit on one line)
TITLE = re.compile(".*<title>(.*)</title>")

# regular expression to parse out the content HTML of the page;
# DOTALL lets the captured body span multiple lines
BODY = re.compile("<!--PageText-->(.*)<div id=\"footer\"", re.DOTALL)
def process(index, file):
    """Parse out title, filename and body of one page and write them out.

    Args:
        index: Running number used to name the output files
            (e.g. ``article00042.body.html`` for index 42).
        file: Path of the HTML input file; it is read as ISO-8859-1.

    The page is skipped, and nothing is written, unless both the ``TITLE``
    and the ``BODY`` pattern match its content.  Otherwise three UTF-8
    files are created in the current working directory:
    ``articleNNNNN.body.html``, ``articleNNNNN.title.txt`` and
    ``articleNNNNN.url.txt``.
    """
    # `file` is the input path; the handle gets its own name so the path
    # stays available for deriving the URL name below.
    with open(file, 'r', encoding='iso-8859-1') as source:
        content = source.read()

    title_match = TITLE.search(content)
    if not title_match:
        # Without a title there is nothing meaningful to export.
        return
    title = title_match.group(1).replace(title_prefix_to_remove, '')
    filename = os.path.basename(file).replace('.html', '')
    print(filename, ':', title)

    body_match = BODY.search(content)
    if not body_match:
        # The page has no recognisable content section; skip it.
        return
    body = body_match.group(1)
    for old, new in html_replacements.items():
        body = body.replace(old, new)
    print(body)

    # write body, title and URL to separate numbered files
    output_filename = "article{0:05}".format(index)
    with open(output_filename + '.body.html', 'w', encoding='utf-8') as body_out:
        body_out.write(body)
    with open(output_filename + '.title.txt', 'w', encoding='utf-8') as title_out:
        title_out.write(title)
    with open(output_filename + '.url.txt', 'w', encoding='utf-8') as url_out:
        url_out.write(filename)
if __name__ == '__main__':
    # Expand '~' explicitly: neither glob nor os.chdir() does it.
    dump_dir = os.path.expanduser(static_dump_dir)
    # process() writes its output files relative to the current directory.
    os.chdir(os.path.expanduser(processed_dir))
    # glob returns entries in arbitrary order; sort them so the article
    # numbering is reproducible between runs.  Sub-directories (such as
    # the output folder itself) are filtered out.
    pages = sorted(
        path
        for path in glob.glob(os.path.join(dump_dir, '*'))
        if os.path.isfile(path)
    )
    for i, page in enumerate(pages):
        process(i, page)