Skip to content

Commit e63dbc8

Browse files
committed
use bespoke crawler for Boost.Json
1 parent 50156b3 commit e63dbc8

4 files changed

Lines changed: 88 additions & 21 deletions

File tree

config/config.yaml

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
boost:
2-
version: '1_88_0'
3-
root: '../boost_1_88_0'
4-
link: 'https://archives.boost.io/release/1.88.0/source/boost_1_88_0.tar.gz'
2+
version: '1_89_0'
3+
root: '../boost_1_89_0'
4+
link: 'https://archives.boost.io/beta/1.89.0.beta1/source/boost_1_89_0_b1.tar.gz'
55

66
website-v2-docs:
77
root: '../website-v2-docs/build'
@@ -115,6 +115,13 @@ crawlers:
115115
last-words: 34965
116116
last-lvls: 568
117117

118+
- name: BoostJson
119+
libraries:
120+
- key: json
121+
last-records: 4377
122+
last-words: 81076
123+
last-lvls: 18663
124+
118125
- name: BoostPreprocessor
119126
libraries:
120127
- key: preprocessor
@@ -312,13 +319,13 @@ crawlers:
312319
last-words: 8278
313320
last-lvls: 200
314321

322+
- name: Antora
323+
libraries:
315324
- key: redis
316325
last-records: 52
317326
last-words: 7140
318327
last-lvls: 99
319328

320-
- name: Antora
321-
libraries:
322329
- key: unordered
323330
last-records: 2680
324331
last-words: 205084
@@ -648,11 +655,6 @@ crawlers:
648655
last-words: 148179
649656
last-lvls: 2126
650657

651-
- key: json
652-
last-records: 4377
653-
last-words: 81076
654-
last-lvls: 18663
655-
656658
- key: lambda
657659
last-records: 63
658660
last-words: 12935

gecko/crawlers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from .boost_gil import BoostGIL
77
from .boost_graph import BoostGraph
88
from .boost_iostreams import BoostIostreams
9+
from .boost_json import BoostJson
910
from .boost_mpl import BoostMPL
1011
from .boost_outcome import BoostOutcome
1112
from .boost_polygon import BoostPolygon

gecko/crawlers/asciidoc.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,12 @@ def crawl(self, library_key: str) -> dict:
1111
sections = {}
1212
index_path = self._boost_root / 'libs' / library_key / 'index.html'
1313

14-
if library_key == 'array':
15-
index_path = str(index_path.parent / 'doc' / 'html' / 'array.html')
16-
elif library_key == 'process':
17-
index_path = str(index_path.parent / 'doc' / 'html' / 'index.html')
18-
else:
19-
# resolve redirect address
20-
with open(index_path, 'r', encoding='utf-8', errors='ignore') as file:
21-
soup = BeautifulSoup(file.read(), 'html.parser')
22-
assert soup.select_one('head > meta[http-equiv="refresh"]')
23-
redirect_to = soup.select_one('body a').get("href")
24-
index_path = urljoin(str(index_path), redirect_to)
14+
# resolve redirect address
15+
with open(index_path, 'r', encoding='utf-8', errors='ignore') as file:
16+
soup = BeautifulSoup(file.read(), 'html.parser')
17+
assert soup.select_one('head > meta[http-equiv="refresh"]')
18+
redirect_to = soup.select_one('body a').get("href")
19+
index_path = urljoin(str(index_path), redirect_to)
2520

2621
with open(index_path, 'r', encoding='utf-8', errors='ignore') as file:
2722
soup = BeautifulSoup(file.read(), 'html.parser')

gecko/crawlers/boost_json.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from pathlib import Path
2+
from urllib.parse import urljoin
3+
4+
from bs4 import BeautifulSoup, Tag
5+
6+
from .crawler import Crawler
7+
from .helpers import has_class
8+
9+
10+
class BoostJson(Crawler):
    """Bespoke crawler for the Boost.JSON HTML documentation.

    Walks the generated HTML files of the ``json`` library and produces a
    mapping ``{path: {'content': str, 'lvls': list}}`` where ``lvls`` is the
    breadcrumb-like chain of ``{'title', 'path'}`` dicts leading to the entry.
    """

    def crawl(self, library_key: str) -> dict:
        """Crawl the Boost.JSON documentation and return its sections.

        Args:
            library_key: Must be ``'json'``; this crawler handles no other
                library.

        Returns:
            A dict keyed by section path. Each value holds the section's text
            under ``'content'`` and its hierarchy levels under ``'lvls'``.
        """
        assert library_key == 'json'

        html_dir = self._boost_root / 'libs' / library_key / 'doc/html'
        # NOTE(review): the scan starts at the *parent* of ``doc/html`` (i.e.
        # ``doc``), exactly as before. This looks like a leftover from crawlers
        # whose path pointed at an ``index.html`` file -- confirm whether
        # scanning ``doc/html`` itself was intended.
        scan_root = Path(html_dir).parent

        sections = {}
        for file_path in scan_root.rglob('*.html'):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')

            # Only look at the path below the scan root, so that a directory
            # named 'ref' somewhere above the Boost checkout cannot make every
            # page look like a reference page.
            if 'ref' in file_path.relative_to(scan_root).parts:
                self._extract_reference(str(file_path), sections, soup.select_one('body div.boostlook'))
            else:
                for sect1 in soup.select('body div[id="content"] > .sect1'):
                    self._extract_section_n(str(file_path), sections, sect1)

        return sections

    def _extract_reference(self, index_path: str, sections: dict, boostlook: Tag):
        """Record one reference page (and its nested sections) in ``sections``.

        The hierarchy levels come from the page's breadcrumb navigation
        (skipping its first item); the entry is stored under the path of the
        last breadcrumb link.

        Args:
            index_path: Path of the HTML file being processed.
            sections: Output mapping, updated in place.
            boostlook: The ``div.boostlook`` element of the page.

        Raises:
            AttributeError: If ``boostlook`` is ``None`` or contains no heading.
            IndexError: If the page has no breadcrumb links.
        """
        lvls = [
            # Keep only the unqualified name of a '::'-qualified title.
            {'title': link.text.split("::")[-1], 'path': urljoin(index_path, link.get('href'))}
            for link in boostlook.select('nav[id="breadcrumbs"] ul li:not(:first-child) > a')
        ]

        header = boostlook.select_one('h1, h2, h3, h4, h5, h6')
        path = lvls[-1]['path']

        content = self._collect_content(index_path, sections, header, lvls)
        sections[path] = {'content': content, 'lvls': lvls}

    def _extract_section_n(self, index_path: str, sections: dict, sect: Tag, lvls: 'list | None' = None):
        """Record one ``sect*`` element (and its nested sections) in ``sections``.

        The entry is stored under ``index_path + '#' + <heading id>`` and its
        levels are the parent levels plus this section's own title/path.

        Args:
            index_path: Path of the HTML file being processed.
            sections: Output mapping, updated in place.
            sect: The section element; its first heading supplies title and id.
            lvls: Levels of the enclosing sections; ``None`` means top level.

        Raises:
            AttributeError: If ``sect`` contains no heading.
            TypeError: If the heading has no ``id`` attribute.
        """
        header = sect.select_one('h1, h2, h3, h4, h5, h6')
        title = header.text
        path = index_path + '#' + header.get('id')
        # Build a new list so the caller's levels are never mutated.
        lvls = (lvls or []) + [{'title': title, 'path': path}]

        content = self._collect_content(index_path, sections, header, lvls)
        sections[path] = {'content': content, 'lvls': lvls}

    def _collect_content(self, index_path: str, sections: dict, header: Tag, lvls: list) -> str:
        """Return the text following ``header``, recursing into nested sections.

        Nodes whose class list contains a name starting with ``'sect'`` are
        handed to ``_extract_section_n`` (with ``lvls`` as their parents) and
        excluded from the returned text. Every other node contributes its text
        followed by a single space.
        """
        body = header.find_next_sibling()
        # presumably has_class() tests for a CSS class on the tag -- helper is
        # defined in .helpers and not visible here.
        if body and has_class(body, 'sectionbody'):
            siblings = body.find_all(recursive=False)
        else:
            siblings = header.next_siblings

        parts = []
        for sibling in siblings:
            if isinstance(sibling, Tag) and any(
                name.startswith('sect') for name in (sibling.get('class') or [])
            ):
                self._extract_section_n(index_path, sections, sibling, lvls)
                continue
            parts.append(sibling.get_text())

        # Each piece is followed by one space (including the last one).
        return ''.join(part + ' ' for part in parts)

0 commit comments

Comments
 (0)