From dd8ab777590fa26aa4beeeae763054273a6e1bfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralf=20D=2E=20M=C3=BCller?= Date: Wed, 10 Jun 2026 08:20:42 +0200 Subject: [PATCH] feat(seo): structured data + pre-render the article for AI/search discoverability Implements the structured-data half of #579 and fixes a missing pre-render route. - Standalone Organization entity (resolvable @id + sameAs) in index.html; WebSite.publisher now references it by @id (#579/1a). - scripts/generate-jsonld.js: build-time DefinedTermSet + a DefinedTerm per anchor (161 terms; name, canonical URL, termCode, and a definition where cleanly extractable from the .adoc) generated from anchors.json and injected into the home and /all-anchors pages after prerender (#579/1b). The human-readable definitions already ship via /all-anchors; crisp answer blocks remain #580. - Pre-render /training-data-vs-practice: the article was absent from the prerender ROUTES list, so it was invisible to search engines and LLM crawlers. Now pre-rendered like every other doc page. Verified with a full vite build + prerender + injection: the article page carries real content; home and /all-anchors carry the set, other routes do not; both index.html JSON-LD blocks and the generated set validate. Co-Authored-By: Claude Opus 4.8 --- docs/changelog.adoc | 7 ++ scripts/generate-jsonld.js | 168 ++++++++++++++++++++++++++++++++++++ scripts/prerender-routes.js | 7 ++ website/index.html | 26 ++++-- website/package.json | 2 +- 5 files changed, 204 insertions(+), 6 deletions(-) create mode 100644 scripts/generate-jsonld.js diff --git a/docs/changelog.adoc b/docs/changelog.adoc index 57c9219..38d01f1 100644 --- a/docs/changelog.adoc +++ b/docs/changelog.adoc @@ -2,6 +2,13 @@ A chronological record of all semantic anchors added to the catalog. Community contributors are credited with thanks. 
+== 2026-06-10 + +*Discoverability (SEO / AI):* + +* *Structured data* — added a standalone `Organization` entity and a `DefinedTermSet` with a `DefinedTerm` for every anchor (name, canonical URL, and a definition where available), generated at build time from `anchors.json`. Lets search engines and retrieval-grounded AI resolve "Semantic Anchors" as a distinct entity and each anchor as a defined term (#579). +* *Fixed:* the _An Anchor Delivers Only as Far as the Prior Reaches_ article was not pre-rendered and was therefore invisible to search engines and LLM crawlers. It is now pre-rendered like every other doc page. + == 2026-06-09 *New contracts:* diff --git a/scripts/generate-jsonld.js b/scripts/generate-jsonld.js new file mode 100644 index 0000000..4ce5e5d --- /dev/null +++ b/scripts/generate-jsonld.js @@ -0,0 +1,168 @@ +#!/usr/bin/env node +/** + * Generate schema.org DefinedTermSet / DefinedTerm JSON-LD from anchors.json + * and inject it into the pre-rendered catalog pages. + * + * Why: the catalog is 160+ well-defined terms, but only their prose is + * crawlable (via /all-anchors). Search engines and retrieval-grounded AI need + * a machine-readable entity graph to resolve "Semantic Anchors" as a distinct + * DefinedTermSet and each anchor as a DefinedTerm with a canonical URL. This is + * the canonical schema.org type for a glossary/controlled vocabulary and is the + * structured-data half of issue #579 (the human-readable definitions already + * ship in crawlable HTML). + * + * Per-term `description` is extracted from the first "Core Concepts" definition + * in each anchor's .adoc when it is cleanly available, and omitted otherwise — + * crisp 40-60 word answer blocks are issue #580's job, and will later supersede + * these as the DefinedTerm descriptions. + * + * Runs AFTER prerender-routes.js so it only touches the home page and + * /all-anchors (the canonical locations for the set), not every route shell. 
+ * When the dist build is absent it prints the JSON-LD to stdout for inspection. + * + * Usage: node scripts/generate-jsonld.js + */ + +const fs = require('fs') +const path = require('path') + +const ROOT = path.join(__dirname, '..') +const ANCHORS_JSON = path.join(ROOT, 'website/public/data/anchors.json') +const DIST = path.join(ROOT, 'website/dist') +const BASE = 'https://llm-coding.github.io/Semantic-Anchors' +const SET_ID = `${BASE}/#catalog` + +// Pages that should carry the DefinedTermSet: the catalog root and the full +// reference. Both represent the whole set; other routes do not. +const TARGETS = [path.join(DIST, 'index.html'), path.join(DIST, 'all-anchors', 'index.html')] + +/** + * Pull a short definition for an anchor from the first "Core Concepts" entry in + * its .adoc. Returns a cleaned, length-capped string, or null when nothing + * usable is found (safe to omit — DefinedTerm.description is optional). + */ +function extractDescription(filePath) { + const abs = path.join(ROOT, filePath) + if (!fs.existsSync(abs)) return null + const lines = fs.readFileSync(abs, 'utf-8').split('\n') + + const ccIndex = lines.findIndex((l) => /Core Concepts/i.test(l)) + if (ccIndex === -1) return null + + // First definition-list description after the Core Concepts heading: + // Term:: definition text + for (let i = ccIndex + 1; i < lines.length && i < ccIndex + 12; i++) { + const m = lines[i].match(/^.+?::\s+(.+)$/) + if (m) { + const cleaned = cleanAdoc(m[1]) + return cleaned.length >= 20 ? capLength(cleaned, 220) : null + } + } + return null +} + +/** Strip the AsciiDoc inline markup that would be noise in a description. 
*/ +function cleanAdoc(s) { + return s + .replace(/link:[^[]*\[([^\]]*)\]/g, '$1') // link:url[text] -> text + .replace(/<<[^,>]+,\s*([^>]+)>>/g, '$1') // <<id,text>> -> text + .replace(/<<([^>]+)>>/g, '$1') // <<id>> -> id + .replace(/[*_`]/g, '') // bold/italic/mono markers + .replace(/\s+/g, ' ') + .trim() +} + +/** Cap at a word boundary, appending an ellipsis when truncated. */ +function capLength(s, max) { + if (s.length <= max) return s + const cut = s.slice(0, max) + const lastSpace = cut.lastIndexOf(' ') + return `${cut.slice(0, lastSpace > 40 ? lastSpace : max).trim()}…` +} + +/** Build the DefinedTermSet object from anchors.json. */ +function buildDefinedTermSet() { + const anchors = JSON.parse(fs.readFileSync(ANCHORS_JSON, 'utf-8')) + const list = Array.isArray(anchors) ? anchors : anchors.anchors || [] + + const terms = list + .filter((a) => a && a.id && a.title) + .map((a) => { + const url = `${BASE}/anchor/${a.id}` + const term = { + '@type': 'DefinedTerm', + '@id': url, + name: a.title, + termCode: a.id, + url, + inDefinedTermSet: SET_ID, + } + const description = a.filePath ? extractDescription(a.filePath) : null + if (description) term.description = description + return term + }) + + return { + '@context': 'https://schema.org', + '@type': 'DefinedTermSet', + '@id': SET_ID, + name: 'Semantic Anchors', + url: `${BASE}/`, + description: + 'A curated catalog of semantic anchors — well-defined terms, methodologies, and frameworks used as shared vocabulary when communicating with Large Language Models.', + hasDefinedTerm: terms, + } +} + +/** + * Serialize as a <script type="application/ld+json"> tag, escaping "<" so a "</script>" + * inside any description can never break out of the element (standard JSON-LD + * hardening). + */ +function buildScriptTag() { + const json = JSON.stringify(buildDefinedTermSet(), null, 2).replace(/</g, '\\u003c') + return `<script type="application/ld+json">\n${json}\n</script>` +} + +/** Insert the script tag before </head>, unless the set is already present. 
*/ +function injectInto(file, scriptTag) { + if (!fs.existsSync(file)) return false + let html = fs.readFileSync(file, 'utf-8') + if (html.includes(SET_ID)) return false // idempotent + if (!html.includes('</head>')) return false + html = html.replace('</head>', ` ${scriptTag}\n </head>`) + fs.writeFileSync(file, html, 'utf-8') + return true +} + +function main() { + const set = buildDefinedTermSet() + const scriptTag = buildScriptTag() + + const anyDist = TARGETS.some((f) => fs.existsSync(f)) + if (!anyDist) { + // No build present — print for inspection so the output can be validated + // without a full vite build. + process.stdout.write(JSON.stringify(set, null, 2) + '\n') + console.warn( + `\n(no dist build found — printed ${set.hasDefinedTerm.length} DefinedTerms to stdout; run after 'vite build' to inject)` + ) + return + } + + let injected = 0 + for (const file of TARGETS) { + if (injectInto(file, scriptTag)) { + injected++ + console.log(` ✓ injected DefinedTermSet into ${path.relative(ROOT, file)}`) + } + } + const withDesc = set.hasDefinedTerm.filter((t) => t.description).length + console.log( + `\n✓ DefinedTermSet: ${set.hasDefinedTerm.length} terms (${withDesc} with description) injected into ${injected} page(s)` + ) +} + +if (require.main === module) main() + +module.exports = { buildDefinedTermSet, buildScriptTag } diff --git a/scripts/prerender-routes.js b/scripts/prerender-routes.js index d8f552d..3587f9a 100644 --- a/scripts/prerender-routes.js +++ b/scripts/prerender-routes.js @@ -86,6 +86,13 @@ const ROUTES = [ description: 'Installable Claude Code Skill that packages the brownfield documentation-recovery workflow. Two-phase Question Tree with [ANSWERED]/[OPEN] leaves, Q-ID traceability. 
Install on Claude Code, Codex, Cursor, GitHub Copilot, Gemini CLI, and Amazon Kiro.', }, + { + path: '/training-data-vs-practice', + fragment: 'docs/training-data-vs-practice.html', + title: 'An Anchor Delivers Only as Far as the Prior Reaches — Semantic Anchors', + description: + "A semantic anchor's power depends on how densely the concept sits in an LLM's training data. A reproducible clean-room experiment across Claude Haiku 4.5, Sonnet 4.6, Opus 4.8 and Fable 5 on the Cockburn use-cases anchor.", + }, { path: '/contracts', fragment: 'docs/contracts.html', diff --git a/website/index.html b/website/index.html index 9c1e36c..79bd4cc 100644 --- a/website/index.html +++ b/website/index.html @@ -53,11 +53,7 @@ "url": "https://llm-coding.github.io/Semantic-Anchors/", "description": "110+ semantic anchors and semantic contracts for precise communication with Large Language Models. Evaluated across 10 models.", "inLanguage": ["en", "de"], - "publisher": { - "@type": "Organization", - "name": "LLM Coding Community", - "url": "https://github.com/LLM-Coding" - }, + "publisher": { "@id": "https://llm-coding.github.io/Semantic-Anchors/#organization" }, "potentialAction": { "@type": "SearchAction", "target": "https://llm-coding.github.io/Semantic-Anchors/#/search?q={search_term_string}", @@ -66,6 +62,26 @@ } + + +