-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgethtml.py
More file actions
51 lines (44 loc) · 1.81 KB
/
gethtml.py
File metadata and controls
51 lines (44 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from bs4 import BeautifulSoup
import os
import requests
# Page-name stems for the 22 sections to scrape, in order. scrape_all() uses
# each entry both as the page-name prefix in the URL and as the output file
# name, and uses the entry's 1-based position as the numbered URL directory.
# NOTE(review): 'num' may have been meant as 'nun'; it is part of the remote
# URL, so confirm against the site before changing it.
letters = [
    'al',
    'bet',
    'gim',
    'dal',
    'he',
    'waw',
    'zayn',
    'het',
    'tet',
    'yod',
    'kap',
    'lam',
    'mem',
    'num',
    'sam',
    'ayi',
    'pe',
    'sad',
    'kop',
    'res',
    'sin',
    'taw',
]
def scrape(url, letter, timeout=30):
    """Download every numbered page of one section and merge it into one HTML string.

    Pages are requested as ``<url>01.html``, ``<url>02.html``, ... The loop
    keeps going for as long as the page just fetched contains an ``<a>``
    element whose text is exactly ``'Next ->'``. Every ``<p>`` element found
    on each page is kept, in page order.

    Args:
        url: URL prefix to which the zero-padded page number and ``.html``
            are appended.
        letter: Text placed in the ``<TITLE>`` of the generated document.
        timeout: Seconds passed to ``requests.get`` for each page. Without a
            timeout a stalled server would block the scraper indefinitely.

    Returns:
        A string made of a fixed HTML header, the collected ``<p>`` elements
        joined by newlines, and a fixed footer.

    Raises:
        requests.RequestException: If a page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
    """
    html_head = f"""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 //EN"><HTML><HEAD>
<TITLE>{letter}</TITLE><META NAME="description" CONTENT="">
<META NAME="keywords" CONTENT="">
<META NAME="author" CONTENT="Administrator">
<META NAME="generator" CONTENT="">
<LINK REL=STYLESHEET HREF="al.css">
<STYLE>
#pStyle01X0 {{text-indent: 0px; }}
</STYLE>
</HEAD>"""
    html_foot = """</BODY></HTML>"""

    all_lines = []
    page_num = 1
    while True:
        # Page numbers are zero-padded to at least two digits: 01, 02, ..., 10, 11, ...
        indiv_url = f'{url}{page_num:02d}.html'
        r = requests.get(indiv_url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of silently merging an error page's
        # paragraphs into the output and stopping as if the section had ended.
        r.raise_for_status()

        soup = BeautifulSoup(r.text, 'html.parser')
        all_lines.extend(str(p) for p in soup.find_all('p'))

        # A link labelled exactly 'Next ->' is the only signal that another
        # page follows; stop as soon as it is absent.
        if not any(a.text == 'Next ->' for a in soup.find_all('a')):
            break
        page_num += 1

    return html_head + '\n' + '\n'.join(all_lines) + '\n' + html_foot
def scrape_all(fpath):
    """Scrape every section named in ``letters`` and write one HTML file per section.

    For the entry at 1-based position ``n`` with name ``name``, the pages are
    fetched from ``http://www.ericlevy.com/revel/bdb/bdb/<n>/<name>NN.html``
    via ``scrape`` and the merged result is written to ``<fpath>/<name>.html``
    as UTF-8.

    Args:
        fpath: Directory that receives the generated files. It is created
            (including parents) if it does not exist yet.
    """
    # Create the target directory up front so a missing folder does not raise
    # FileNotFoundError only after the first section has been downloaded.
    os.makedirs(fpath, exist_ok=True)

    for index, letter in enumerate(letters, start=1):
        h = scrape(f'http://www.ericlevy.com/revel/bdb/bdb/{index}/{letter}', letter)
        with open(os.path.join(fpath, letter + '.html'), 'w', encoding='utf8') as f:
            f.write(h)
if __name__ == '__main__':
    # Script entry point: write the scraped files into a 'pages' directory
    # located under the current working directory.
    scrape_all(os.path.join(os.getcwd(), 'pages'))