Use a bespoke crawler for Boost.JSON

ashtum · ashtum · commit e63dbc8f2be4 · 2025-07-30T13:46:09.000Z
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,7 +1,7 @@
 boost:
-  version: '1_88_0'
-  root: '../boost_1_88_0'
-  link: 'https://archives.boost.io/release/1.88.0/source/boost_1_88_0.tar.gz'
+  version: '1_89_0'
+  root: '../boost_1_89_0'
+  link: 'https://archives.boost.io/beta/1.89.0.beta1/source/boost_1_89_0_b1.tar.gz'
 
 website-v2-docs:
   root: '../website-v2-docs/build'
@@ -115,6 +115,13 @@ crawlers:
         last-words: 34965
         last-lvls: 568
 
+  - name: BoostJson
+    libraries:
+      - key: json
+        last-records: 4377
+        last-words: 81076
+        last-lvls: 18663
+
   - name: BoostPreprocessor
     libraries:
       - key: preprocessor
@@ -312,13 +319,13 @@ crawlers:
         last-words: 8278
         last-lvls: 200
 
+  - name: Antora
+    libraries:
       - key: redis
         last-records: 52
         last-words: 7140
         last-lvls: 99
 
-  - name: Antora
-    libraries:
       - key: unordered
         last-records: 2680
         last-words: 205084
@@ -648,11 +655,6 @@ crawlers:
         last-words: 148179
         last-lvls: 2126
 
-      - key: json
-        last-records: 4377
-        last-words: 81076
-        last-lvls: 18663
-
       - key: lambda
         last-records: 63
         last-words: 12935
diff --git a/gecko/crawlers/__init__.py b/gecko/crawlers/__init__.py
@@ -6,6 +6,7 @@
 from .boost_gil import BoostGIL
 from .boost_graph import BoostGraph
 from .boost_iostreams import BoostIostreams
+from .boost_json import BoostJson
 from .boost_mpl import BoostMPL
 from .boost_outcome import BoostOutcome
 from .boost_polygon import BoostPolygon
diff --git a/gecko/crawlers/asciidoc.py b/gecko/crawlers/asciidoc.py
@@ -11,17 +11,12 @@ def crawl(self, library_key: str) -> dict:
         sections = {}
         index_path = self._boost_root / 'libs' / library_key / 'index.html'
 
-        if library_key == 'array':
-            index_path = str(index_path.parent / 'doc' / 'html' / 'array.html')
-        elif library_key == 'process':
-            index_path = str(index_path.parent / 'doc' / 'html' / 'index.html')
-        else:
-            # resolve redirect address
-            with open(index_path, 'r', encoding='utf-8', errors='ignore') as file:
-                soup = BeautifulSoup(file.read(), 'html.parser')
-                assert soup.select_one('head > meta[http-equiv="refresh"]')
-                redirect_to = soup.select_one('body a').get("href")
-                index_path = urljoin(str(index_path), redirect_to)
+        # resolve redirect address
+        with open(index_path, 'r', encoding='utf-8', errors='ignore') as file:
+            soup = BeautifulSoup(file.read(), 'html.parser')
+            assert soup.select_one('head > meta[http-equiv="refresh"]')
+            redirect_to = soup.select_one('body a').get("href")
+            index_path = urljoin(str(index_path), redirect_to)
 
         with open(index_path, 'r', encoding='utf-8', errors='ignore') as file:
             soup = BeautifulSoup(file.read(), 'html.parser')
diff --git a/gecko/crawlers/boost_json.py b/gecko/crawlers/boost_json.py
@@ -0,0 +1,95 @@
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup, Tag
+
+from .crawler import Crawler
+from .helpers import has_class
+
+
+class BoostJson(Crawler):
+    """Crawler for the Boost.JSON HTML documentation.
+
+    Reference pages (any file with a ``ref`` path component) are indexed via
+    their breadcrumb trail; every other page is indexed section by section.
+    """
+
+    def crawl(self, library_key: str) -> dict:
+        """Index every HTML page found under ``libs/json/doc``.
+
+        Returns a dict mapping a page path (optionally with ``#anchor``) to
+        ``{'content': str, 'lvls': list}``, where ``lvls`` is the chain of
+        ``{'title', 'path'}`` entries leading to that record.
+        """
+        assert library_key == 'json'
+
+        index_path = self._boost_root / 'libs' / library_key / 'doc/html'
+
+        sections = {}
+        # NOTE(review): the walk starts at the parent of ``doc/html`` (i.e.
+        # ``doc``), not at ``doc/html`` itself -- confirm that is intended.
+        # Sorting makes the crawl order reproducible across filesystems.
+        for file_path in sorted(Path(index_path).parent.rglob('*.html')):
+            html = file_path.read_text(encoding='utf-8', errors='ignore')
+            soup = BeautifulSoup(html, 'html.parser')
+            if 'ref' in file_path.parts:
+                boostlook = soup.select_one('body div.boostlook')
+                # Pages without the expected container contribute nothing,
+                # like non-reference pages that have no ``.sect1`` blocks.
+                if boostlook is not None:
+                    self._extract_reference(str(file_path), sections, boostlook)
+            else:
+                for sect1 in soup.select('body div[id="content"] > .sect1'):
+                    self._extract_section_n(str(file_path), sections, sect1)
+
+        return sections
+
+    def _extract_reference(self, index_path: str, sections: dict, boostlook: Tag):
+        """Record one reference page, keyed by its last breadcrumb link."""
+        crumbs = boostlook.select('nav[id="breadcrumbs"] ul li:not(:first-child) > a')
+        lvls = [
+            {'title': link.text.split("::")[-1], 'path': urljoin(index_path, link.get('href'))}
+            for link in crumbs
+        ]
+        header = boostlook.select_one('h1, h2, h3, h4, h5, h6')
+        if not lvls or header is None:
+            # No breadcrumb trail or heading: nothing to key the record on.
+            return
+
+        content = self._collect_body(index_path, sections, header, lvls)
+        sections[lvls[-1]['path']] = {'content': content, 'lvls': lvls}
+
+    def _extract_section_n(self, index_path: str, sections: dict, sect: Tag, lvls: Optional[list] = None):
+        """Record ``sect`` (and, recursively, its sub-sections) under ``lvls``."""
+        header = sect.select_one('h1, h2, h3, h4, h5, h6')
+        if header is None or header.get('id') is None:
+            # Without a heading id there is no anchor to address the section.
+            return
+
+        path = index_path + '#' + header.get('id')
+        # Build a new list so the caller's ``lvls`` is never mutated.
+        lvls = (lvls or []) + [{'title': header.text, 'path': path}]
+
+        content = self._collect_body(index_path, sections, header, lvls)
+        sections[path] = {'content': content, 'lvls': lvls}
+
+    def _collect_body(self, index_path: str, sections: dict, header: Tag, lvls: list) -> str:
+        """Return the text following ``header``, recording nested sections.
+
+        Siblings carrying a ``sect*`` class are handed to
+        ``_extract_section_n`` instead of being added to the returned text.
+        """
+        body = header.find_next_sibling()
+        if body is not None and has_class(body, 'sectionbody'):
+            siblings = body.find_all(recursive=False)
+        else:
+            siblings = header.next_siblings
+
+        chunks = []
+        for sibling in siblings:
+            if isinstance(sibling, Tag) and any(c.startswith('sect') for c in sibling.get('class', [])):
+                self._extract_section_n(index_path, sections, sibling, lvls)
+                continue
+            chunks.append(sibling.get_text() + ' ')
+        return ''.join(chunks)