forked from SublimeText/sublime-text-docset
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfix_html.py
More file actions
executable file
·76 lines (56 loc) · 1.87 KB
/
fix_html.py
File metadata and controls
executable file
·76 lines (56 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
"""
Strip portions of the HTML pages that we don't need
"""
import sys
from pathlib import Path
from re import sub
from bs4 import BeautifulSoup
# Directories that main() searches recursively for ``*.html`` files.
# The paths are relative, so they resolve against the current working directory.
DOC_ROOTS = [
    'sublime-text/www.sublimetext.com/docs',
    'sublime-merge/www.sublimemerge.com/docs',
]
def delete_skins(soup):
    """Remove page chrome from a parsed document, modifying it in place.

    Three kinds of markup are destroyed with ``decompose()``:

    * every ``<header>``, ``<nav>`` and ``<aside>`` element;
    * inside each ``<h1>``, the first ``<div class="versions">``;
    * the first ``<div class="twitter_follow">`` in the document.

    Args:
        soup: Parsed document exposing the BeautifulSoup ``find_all`` /
            ``find`` API.
    """
    # Site-wide chrome elements.
    for chrome in soup.find_all(['header', 'nav', 'aside']):
        if chrome:
            chrome.decompose()

    # Version marker nested inside top-level headings.
    for heading in soup.find_all('h1'):
        if not heading:
            continue
        version_box = heading.find('div', class_='versions')
        if version_box:
            version_box.decompose()

    # Twitter follow box; only the first match is looked up.
    follow_box = soup.find('div', class_='twitter_follow')
    if follow_box:
        follow_box.decompose()
def remove_link_icon(soup):
    """Delete every ``<a>`` element whose string is the 🔗 permalink icon.

    Args:
        soup: Parsed document exposing the BeautifulSoup ``find_all`` API;
            it is modified in place.
    """
    permalink_anchors = soup.find_all('a', string='🔗')
    for anchor in permalink_anchors:
        anchor.decompose()
def trim_title(soup):
    """Remove the product-name suffix from every ``<title>`` element, in place.

    A trailing separator character followed by ``Sublime Text`` or
    ``Sublime Merge`` (optionally followed by ``Documentation``) is stripped
    from the title text. Titles without such a suffix, and titles that have
    no single text child, are left untouched. Progress is printed to stdout.

    Args:
        soup: Parsed document exposing the BeautifulSoup ``find_all`` API.
    """
    # The separator must be one punctuation/symbol character (neither a word
    # character nor whitespace). The previous class [\W\S] matched *any*
    # character, so a title lacking a real separator lost part of its text:
    # "Foo Sublime Text" became "Fo" and "A Sublime Text" became "".
    # Loop-invariant, so it is defined once outside the loop.
    regex = r'\s*[^\w\s]\s*Sublime\s+(Merge|Text)(\s+Documentation)?$'

    for title in soup.find_all('title'):
        print(f'Checking {title}')
        text = title.string
        if text is None:
            # BeautifulSoup reports ``.string`` as None for an empty tag or a
            # tag with several children; there is no single string to trim,
            # and passing None to sub() would raise TypeError.
            continue
        trimmed = sub(regex, '', text)
        if trimmed != text:
            print(f' Changing to {trimmed}')
            text.replace_with(trimmed)
def main():
    """Clean every ``*.html`` file found under ``DOC_ROOTS``, rewriting it in place.

    Each file is read as UTF-8, parsed with the ``lxml`` parser, passed through
    ``delete_skins``, ``remove_link_icon`` and ``trim_title`` in that order, and
    then written back to the same path as UTF-8.
    """
    html_files = (
        page
        for root in DOC_ROOTS
        for page in Path(root).rglob('*.html')
    )
    for page in html_files:
        soup = BeautifulSoup(page.read_text(encoding='utf-8'), 'lxml')
        delete_skins(soup)
        remove_link_icon(soup)
        trim_title(soup)
        # Serialise with str() rather than prettify(): pretty-printing would
        # introduce whitespace around inline tags.
        page.write_text(str(soup), encoding='utf-8')
if __name__ == '__main__':
    # Run only when executed as a script; main()'s return value is handed to
    # sys.exit() as the process exit status.
    sys.exit(main())