// Source: builder/search.mjs (twinbasic/documentation, branch main).
//
// Phase 6 AUXILIARIES -- search-data.json. Port of the just-the-docs
// theme's `assets/js/zzzz-search-data.json` Liquid template plus the
// empty `_includes/lunr/custom-data.json` (which renders as a blank
// indented line between the `url` and `relUrl` fields). The output is
// the lunr index input that client-side `initSearch()` in
// `just-the-docs.js` feeds into `lunr(...)`.
//
// One entry per heading-bounded section of each titled page. Pages with
// N visible headings produce up to N (+ 1 prefix entry, when the first
// heading text differs from the page title or non-empty prose precedes
// it). See builder/PLAN-6.md §5.3 + §7.D4 + §7.D5.

import path from "node:path";

import { stripHtml } from "./seo.mjs";
import { writeFileMkdirp } from "./write.mjs";

// Single-threaded path: derive every search entry from `pages`, render
// them into one JSON object literal, and write the result to
// `<destRoot>/assets/js/search-data.json`. Resolves to the number of
// entries and the exact text that was written.
export async function writeSearchData(pages, site, destRoot) {
  const entries = deriveSearchEntries(pages, site);
  const rendered = entries.map((entry) => renderEntryString(entry));
  // Entries are comma-joined with no separator newline; the only
  // newlines outside an entry are the ones framing the closing brace.
  const json = `{${rendered.join(",")}\n}\n`;
  const target = path.join(destRoot, "assets/js/search-data.json");
  await writeFileMkdirp(target, json);
  return { entries: entries.length, json };
}

// Phase 17 consolidation path: per-worker render handlers call
// deriveSearchEntries on their chunk and stash the result on
// state.searchChunks[i].  This function flattens those chunks (in chunk-
// index order, matching the serial page iteration), renumbers `i` so it
// is globally sequential, and writes the same byte-for-byte search-data.json
// the single-threaded writeSearchData would have produced.
export async function writeSearchDataFromChunks(searchChunks, destRoot) {
  // Concatenate the per-chunk entry arrays in chunk-index order.
  const merged = searchChunks.flat();
  // Overwrite each entry's `i` in place with its position in the merged
  // list so the keys are sequential across the whole output.
  merged.forEach((entry, position) => {
    entry.i = position;
  });
  const rendered = merged.map((entry) => renderEntryString(entry));
  const json = `{${rendered.join(",")}\n}\n`;
  const target = path.join(destRoot, "assets/js/search-data.json");
  await writeFileMkdirp(target, json);
  return { entries: merged.length, json };
}

// Pure-compute derivation: produces the search-data entry array
// (already sanitised, already URL-encoded) without writing anything.
// Each entry is `{ i, doc, title, content, url, relUrl, sourcePage }`.
// `sourcePage` is the originating tbdocs page so callers (`_triage.mjs`,
// `_diff.mjs`) can gate by `srcRel` against `accepted-divergences.mjs`.
export function deriveSearchEntries(pages, site) {
  const splitLevel = site.config.search?.heading_level ?? 2;
  const urlPrefix = String(site.config.baseurl ?? "");
  const collected = [];

  for (const page of pages) {
    // A page is indexed only when it has a truthy title, is not
    // explicitly excluded, and carries rendered HTML as a string.
    const rawTitle = page.frontmatter?.title;
    if (!rawTitle) continue;
    if (page.frontmatter?.search_exclude === true) continue;
    if (typeof page.renderedContent !== "string") continue;

    const docTitle = String(rawTitle);
    const { sections, titleFound, prefixContent } = extractSections(
      page,
      docTitle,
      splitLevel,
    );

    // One entry per heading-bounded section. `i` is the running index
    // across all pages, i.e. the number of entries collected so far.
    for (const { title, body, url } of sections) {
      collected.push({
        i: collected.length,
        doc: docTitle,
        title,
        content: sanitiseContent(body),
        url: encodeSpaces(urlPrefix + url),
        relUrl: url,
        sourcePage: page,
      });
    }

    // When no section stood in for the page title, add a page-level
    // entry built from the prose that precedes the first heading.
    if (titleFound) continue;
    collected.push({
      i: collected.length,
      doc: docTitle,
      title: docTitle,
      content: sanitiseContent(prefixContent),
      url: encodeSpaces(urlPrefix + page.permalink),
      relUrl: page.permalink,
      sourcePage: page,
    });
  }

  return collected;
}

// Returns the heading-split sections plus the prose-before-first-heading
// (`parts[0]`) and a `titleFound` flag indicating whether the title-
// prefix entry should be suppressed.
function extractSections(page, pageTitle, headingLevel) {
  const OPEN_TAG = "<h1";
  const CLOSE_TAG = "</h1>";

  // Rewrite every h2..h<headingLevel> open/close tag to h1 so a single
  // split on the h1 opener finds all section boundaries.
  let html = page.renderedContent;
  let level = 2;
  while (level <= headingLevel) {
    html = html.replaceAll(`<h${level}`, "<h1");
    html = html.replaceAll(`</h${level}`, "</h1");
    level += 1;
  }

  // The first piece is whatever precedes the first heading; each later
  // piece starts just after an `<h1` opener.
  const [lead, ...chunks] = html.split(OPEN_TAG);
  const prefixContent = lead || "";

  const sections = [];
  let titleFound = false;

  for (const chunk of chunks) {
    // Split the piece into "heading tag remainder + heading text" and
    // the body that follows the closing tag. With no closing tag the
    // whole piece counts as heading and the body is empty.
    const closeAt = chunk.indexOf(CLOSE_TAG);
    const hasClose = closeAt !== -1;
    const headingChunk = hasClose ? chunk.slice(0, closeAt) : chunk;
    const body = hasClose ? chunk.slice(closeAt + CLOSE_TAG.length) : "";

    // Heading text is everything after the first `>` (the end of the
    // opening tag's attributes), with inline markup stripped.
    const tagEnd = headingChunk.indexOf(">");
    const headingHtml = tagEnd === -1 ? headingChunk : headingChunk.slice(tagEnd + 1);
    const title = stripHtml(headingHtml);

    const standsForPage = title === pageTitle && prefixContent === "";
    let url = page.permalink;
    if (standsForPage) {
      // This heading repeats the page title with nothing before it, so
      // it keeps the bare permalink and suppresses the prefix entry.
      titleFound = true;
    } else {
      // Use the heading's anchor only when `id="` occurs exactly once.
      const idPieces = headingChunk.split('id="');
      if (idPieces.length === 2) {
        const [anchor] = idPieces[1].split('"');
        url = `${page.permalink}#${anchor}`;
      }
    }

    sections.push({ title, body, url });
  }

  return { sections, titleFound, prefixContent };
}

// Per-entry JSON shape matching the upstream Liquid template's output
// byte-for-byte: doc / title / content / url, then a blank-indented
// line where the empty lunr/custom-data.json include used to render,
// then relUrl. Closing brace has 2-space indent. No trailing newline
// on the returned string -- the outer join with "," handles separation.
//
// Consumes a derived entry from `deriveSearchEntries`: content is
// already sanitised, url is already URL-encoded.
export function renderEntryString(e) {
  // `url` / `relUrl` are serialised as JSON strings rather than pasted
  // between literal quotes. For ordinary values (no `"`, `\` or control
  // characters) the output is byte-identical to raw interpolation; for
  // anything else it stays valid JSON instead of corrupting the whole
  // search-data.json file. `String(...)` keeps the coercion a template
  // literal would apply, so a missing value still renders as a quoted
  // string.
  const url = JSON.stringify(String(e.url));
  const relUrl = JSON.stringify(String(e.relUrl));

  return [
    `"${e.i}": {`,
    `    "doc": ${JSON.stringify(e.doc)},`,
    `    "title": ${JSON.stringify(e.title)},`,
    `    "content": ${JSON.stringify(e.content)},`,
    `    "url": ${url},`,
    // Blank indented line: placeholder for the empty custom-data include.
    `    `,
    `    "relUrl": ${relUrl}`,
    `  }`,
  ].join("\n");
}

// Liquid `relative_url` for this site: paths are ASCII-safe except for
// the occasional space. The general-purpose encoders over-encode
// (encodeURIComponent escapes `/` and `#` in `/foo#bar`; encodeURI
// escapes an existing `%`); a targeted space replacement matches Jekyll
// byte-for-byte.
function encodeSpaces(s) {
  // Only the literal space character is escaped; everything else,
  // including `#` and `/`, passes through untouched. A string without
  // spaces comes back unchanged.
  return s.split(" ").join("%20");
}

// Content sanitiser. Port of the Liquid filter chain in the template's
// `content` line: 14 replaces inserting ` . ` / ` | ` separators
// between block boundaries, then strip_html, remove 'Table of contents',
// normalize_whitespace (collapse + strip), three collapse passes, and a
// trailing-space append. The order is load-bearing for byte parity.
function sanitiseContent(html) {
  // Tag prefixes that get a ` . ` separator inserted in front of them,
  // followed by those that get ` | `. The sequence mirrors the upstream
  // filter chain one replace at a time and must not be reordered.
  const sentenceBreaks = ["</h", "<hr", "</p", "<ul", "</ul", "<ol", "</ol", "</tr"];
  const cellBreaks = ["<li", "</li", "</td", "<td", "</th", "<th"];

  let text = String(html ?? "");
  for (const tag of sentenceBreaks) {
    text = text.replaceAll(tag, ` . ${tag}`);
  }
  for (const tag of cellBreaks) {
    text = text.replaceAll(tag, ` | ${tag}`);
  }

  text = stripHtml(text);
  text = text.replaceAll("Table of contents", "");

  // normalize_whitespace: collapse whitespace runs to one space, then
  // strip both ends. Ruby's `\s` and `String#strip` cover only the
  // ASCII set [\t\n\v\f\r ], whereas JS's `\s` and `trim()` also match
  // U+00A0 (NO-BREAK SPACE) and other Unicode whitespace. Using the
  // narrow set keeps `&nbsp;`-driven indentation (emitted by kramdown
  // inside blockquote / definition-list syntax, e.g. the
  // `tB/Core/Class` syntax block) byte-for-byte with Jekyll.
  text = text.replace(/[\t\n\v\f\r ]+/g, " ");
  text = stripAsciiWhitespace(text);

  // Collapse the separator pile-ups left by adjacent block boundaries,
  // widest pattern first.
  const collapses = [
    [". . .", "."],
    [". .", "."],
    ["| |", "|"],
  ];
  for (const [run, single] of collapses) {
    text = text.replaceAll(run, single);
  }

  // The upstream template appends a single trailing space.
  return `${text} `;
}

// Ruby's `String#strip` semantics: trim [\t\n\v\f\r ] from both ends,
// leaving every other character -- including U+00A0 (NO-BREAK SPACE) --
// intact. Ruby also trims \0, but kramdown never emits it, so it is not
// handled here.
function stripAsciiWhitespace(s) {
  // Advance past leading ASCII whitespace.
  let head = 0;
  for (; head < s.length; head++) {
    if (!isAsciiWs(s.charCodeAt(head))) break;
  }
  // Walk back over trailing ASCII whitespace, never crossing `head`, so
  // an all-whitespace string yields "".
  let tail = s.length;
  for (; tail > head; tail--) {
    if (!isAsciiWs(s.charCodeAt(tail - 1))) break;
  }
  return s.slice(head, tail);
}

// True when `code` is the char code of space (0x20) or lies in the
// 0x09..0x0d range (tab, line feed, vertical tab, form feed, carriage
// return). Every other value, U+00A0 included, is rejected.
function isAsciiWs(code) {
  if (code === 0x20) return true;
  return code >= 0x09 && code <= 0x0d;
}