From dd8ab777590fa26aa4beeeae763054273a6e1bfb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ralf=20D=2E=20M=C3=BCller?= <ralf.d.mueller@gmail.com>
Date: Wed, 10 Jun 2026 08:20:42 +0200
Subject: [PATCH] feat(seo): structured data + pre-render the article for
 AI/search discoverability

Implements the structured-data half of #579 and fixes a missing pre-render route.

- Standalone Organization entity (resolvable @id + sameAs) in index.html;
  WebSite.publisher now references it by @id (#579/1a).
- scripts/generate-jsonld.js: build-time DefinedTermSet + a DefinedTerm per
  anchor (161 terms; name, canonical URL, termCode, and a definition where
  cleanly extractable from the .adoc) generated from anchors.json and injected
  into the home and /all-anchors pages after prerender (#579/1b). The
  human-readable definitions already ship via /all-anchors; crisp answer
  blocks remain #580.
- Pre-render /training-data-vs-practice: the article was absent from the
  prerender ROUTES list, so it was invisible to search engines and LLM
  crawlers. Now pre-rendered like every other doc page.

Verified with a full vite build + prerender + injection: the article page
carries real content; home and /all-anchors carry the set, other routes do
not; both index.html JSON-LD blocks and the generated set validate.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/changelog.adoc         |   7 ++
 scripts/generate-jsonld.js  | 168 ++++++++++++++++++++++++++++++++++++
 scripts/prerender-routes.js |   7 ++
 website/index.html          |  26 ++++--
 website/package.json        |   2 +-
 5 files changed, 204 insertions(+), 6 deletions(-)
 create mode 100644 scripts/generate-jsonld.js

diff --git a/docs/changelog.adoc b/docs/changelog.adoc
index 57c9219..38d01f1 100644
--- a/docs/changelog.adoc
+++ b/docs/changelog.adoc
@@ -2,6 +2,13 @@
 
 A chronological record of all semantic anchors added to the catalog. Community contributors are credited with thanks.
 
+== 2026-06-10
+
+*Discoverability (SEO / AI):*
+
+* *Structured data* — added a standalone `Organization` entity and a `DefinedTermSet` with a `DefinedTerm` for every anchor (name, canonical URL, and a definition where available), generated at build time from `anchors.json`. Lets search engines and retrieval-grounded AI resolve "Semantic Anchors" as a distinct entity and each anchor as a defined term (#579).
+* *Fixed:* the _An Anchor Delivers Only as Far as the Prior Reaches_ article was not pre-rendered and was therefore invisible to search engines and LLM crawlers. It is now pre-rendered like every other doc page.
+
 == 2026-06-09
 
 *New contracts:*
diff --git a/scripts/generate-jsonld.js b/scripts/generate-jsonld.js
new file mode 100644
index 0000000..4ce5e5d
--- /dev/null
+++ b/scripts/generate-jsonld.js
@@ -0,0 +1,168 @@
+#!/usr/bin/env node
+/**
+ * Generate schema.org DefinedTermSet / DefinedTerm JSON-LD from anchors.json
+ * and inject it into the pre-rendered catalog pages.
+ *
+ * Why: the catalog is 160+ well-defined terms, but only their prose is
+ * crawlable (via /all-anchors). Search engines and retrieval-grounded AI need
+ * a machine-readable entity graph to resolve "Semantic Anchors" as a distinct
+ * DefinedTermSet and each anchor as a DefinedTerm with a canonical URL. This is
+ * the canonical schema.org type for a glossary/controlled vocabulary and is the
+ * structured-data half of issue #579 (the human-readable definitions already
+ * ship in crawlable HTML).
+ *
+ * Per-term `description` is extracted from the first "Core Concepts" definition
+ * in each anchor's .adoc when it is cleanly available, and omitted otherwise —
+ * crisp 40-60 word answer blocks are issue #580's job, and will later supersede
+ * these as the DefinedTerm descriptions.
+ *
+ * Runs AFTER prerender-routes.js so it only touches the home page and
+ * /all-anchors (the canonical locations for the set), not every route shell.
+ * When the dist build is absent it prints the JSON-LD to stdout for inspection.
+ *
+ * Usage: node scripts/generate-jsonld.js
+ */
+
+const fs = require('fs')
+const path = require('path')
+
+const ROOT = path.join(__dirname, '..') // repo root: one level above this script's directory
+const ANCHORS_JSON = path.join(ROOT, 'website/public/data/anchors.json') // anchor metadata read by buildDefinedTermSet()
+const DIST = path.join(ROOT, 'website/dist') // build output directory that holds the injection targets
+const BASE = 'https://llm-coding.github.io/Semantic-Anchors' // no trailing slash; prefix of every emitted @id/url
+const SET_ID = `${BASE}/#catalog` // @id of the DefinedTermSet; injectInto() also uses it as its "already injected" marker
+
+// Pages that should carry the DefinedTermSet: the catalog root and the full
+// reference. Both represent the whole set; other routes do not.
+const TARGETS = [path.join(DIST, 'index.html'), path.join(DIST, 'all-anchors', 'index.html')]
+
+/**
+ * Pull a short definition for an anchor from the first "Core Concepts" entry in
+ * its .adoc (`filePath` is resolved against ROOT). Returns the cleaned text, length-capped
+ * via capLength(…, 220), or null when the file, the section, or a usable entry is missing.
+ */
+function extractDescription(filePath) {
+  const abs = path.join(ROOT, filePath)
+  if (!fs.existsSync(abs)) return null
+  // Split on LF *and* CRLF: `.` never matches "\r", so a CRLF checkout would
+  // otherwise fail the `(.+)$` match below and silently drop every description.
+  const lines = fs.readFileSync(abs, 'utf-8').split(/\r?\n/)
+  const ccIndex = lines.findIndex((l) => /Core Concepts/i.test(l))
+  if (ccIndex === -1) return null
+
+  // First `Term:: definition text` entry within the 11 lines after that line.
+  for (let i = ccIndex + 1; i < lines.length && i < ccIndex + 12; i++) {
+    const m = lines[i].match(/^.+?::\s+(.+)$/)
+    if (m) {
+      const cleaned = cleanAdoc(m[1])
+      return cleaned.length >= 20 ? capLength(cleaned, 220) : null // under 20 chars: treated as unusable
+    }
+  }
+  return null
+}
+
+/** Reduce an AsciiDoc inline fragment to plain text: unwrap links and xrefs, drop emphasis markers, squeeze whitespace. */
+function cleanAdoc(s) {
+  const unlinked = s.replace(/link:[^[]*\[([^\]]*)\]/g, '$1') // link:url[text] -> text
+  const labelled = unlinked.replace(/<<[^,>]+,\s*([^>]+)>>/g, '$1') // <<id,text>> -> text
+  const unreferenced = labelled.replace(/<<([^>]+)>>/g, '$1') // <<id>> -> id
+  const unmarked = unreferenced.replace(/[*_`]/g, '') // every *, _ and ` is removed, including inside words
+  const squeezed = unmarked.replace(/\s+/g, ' ') // any whitespace run -> single space
+  // Trim last so leading/trailing whitespace left by the steps above is removed too.
+  return squeezed.trim()
+}
+
+/** Return `s` unchanged when it fits in `max` chars; otherwise cut at the last space (if past index 40) and append "…". */
+function capLength(s, max) {
+  if (s.length <= max) return s
+  const head = s.slice(0, max)
+  let end = head.lastIndexOf(' ')
+  if (end <= 40) end = max // no space far enough into the text: hard cut at max
+  return `${head.slice(0, end).trim()}…`
+}
+
+/** Read anchors.json and build the schema.org DefinedTermSet: one DefinedTerm per entry that has both `id` and `title`. */
+function buildDefinedTermSet() {
+  const parsed = JSON.parse(fs.readFileSync(ANCHORS_JSON, 'utf-8'))
+  // The file may be a bare array or an object wrapping the array as `anchors`.
+  const entries = Array.isArray(parsed) ? parsed : parsed.anchors || []
+
+  const isUsable = (entry) => entry && entry.id && entry.title
+  const toTerm = ({ id, title, filePath }) => {
+    const url = `${BASE}/anchor/${id}`
+    const description = filePath ? extractDescription(filePath) : null
+    return {
+      '@type': 'DefinedTerm',
+      '@id': url,
+      name: title,
+      termCode: id,
+      url,
+      inDefinedTermSet: SET_ID,
+      ...(description ? { description } : {}), // key is left out when no definition was extracted
+    }
+  }
+
+  const summary =
+    'A curated catalog of semantic anchors — well-defined terms, methodologies, and frameworks used as shared vocabulary when communicating with Large Language Models.'
+  return {
+    '@context': 'https://schema.org',
+    '@type': 'DefinedTermSet',
+    '@id': SET_ID,
+    name: 'Semantic Anchors',
+    url: `${BASE}/`,
+    description: summary,
+    hasDefinedTerm: entries.filter(isUsable).map(toTerm),
+  }
+}
+
+/**
+ * Render the DefinedTermSet as a JSON-LD <script> element. Every `<` in the JSON is written
+ * as the escape \u003c, so "</script>" inside a description cannot close the element early.
+ */
+function buildScriptTag() {
+  const pretty = JSON.stringify(buildDefinedTermSet(), null, 2)
+  const safe = pretty.replace(/</g, '\\u003c')
+  return ['<script type="application/ld+json">', safe, '</script>'].join('\n')
+}
+
+/** Insert `scriptTag` before the first </head> of `file`; returns true only when the file was rewritten. */
+function injectInto(file, scriptTag) {
+  if (!fs.existsSync(file)) return false
+  const html = fs.readFileSync(file, 'utf-8')
+  if (html.includes(SET_ID) || !html.includes('</head>')) return false // already injected (idempotent), or no </head> to target
+  // Function replacer on purpose: a replacement *string* would expand "$&", "$'", "$$"… occurring in the JSON-LD text.
+  const updated = html.replace('</head>', () => `  ${scriptTag}\n  </head>`)
+  fs.writeFileSync(file, updated, 'utf-8')
+  return true
+}
+
+/** CLI entry point: inject the set into each target page, or print it to stdout when no target file exists. */
+function main() {
+  const set = buildDefinedTermSet()
+  const scriptTag = buildScriptTag()
+  const total = set.hasDefinedTerm.length
+
+  // No build present — print for inspection so the output can be validated without a full vite build.
+  if (!TARGETS.some((f) => fs.existsSync(f))) {
+    process.stdout.write(JSON.stringify(set, null, 2) + '\n')
+    console.warn(
+      `\n(no dist build found — printed ${total} DefinedTerms to stdout; run after 'vite build' to inject)`
+    )
+    return
+  }
+
+  let injected = 0
+  TARGETS.forEach((file) => {
+    if (!injectInto(file, scriptTag)) return // injectInto() reported that it did not rewrite this file
+    injected += 1
+    console.log(`  ✓ injected DefinedTermSet into ${path.relative(ROOT, file)}`)
+  })
+
+  const withDesc = set.hasDefinedTerm.reduce((n, t) => (t.description ? n + 1 : n), 0)
+  console.log(
+    `\n✓ DefinedTermSet: ${total} terms (${withDesc} with description) injected into ${injected} page(s)`
+  )
+}
+
+if (require.main === module) main() // run only when executed directly; requiring this module does not trigger injection
+
+module.exports = { buildDefinedTermSet, buildScriptTag } // the two builders are importable; main() is not exported
diff --git a/scripts/prerender-routes.js b/scripts/prerender-routes.js
index d8f552d..3587f9a 100644
--- a/scripts/prerender-routes.js
+++ b/scripts/prerender-routes.js
@@ -86,6 +86,13 @@ const ROUTES = [
     description:
       'Installable Claude Code Skill that packages the brownfield documentation-recovery workflow. Two-phase Question Tree with [ANSWERED]/[OPEN] leaves, Q-ID traceability. Install on Claude Code, Codex, Cursor, GitHub Copilot, Gemini CLI, and Amazon Kiro.',
   },
+  {
+    path: '/training-data-vs-practice',
+    fragment: 'docs/training-data-vs-practice.html',
+    title: 'An Anchor Delivers Only as Far as the Prior Reaches — Semantic Anchors',
+    description:
+      "A semantic anchor's power depends on how densely the concept sits in an LLM's training data. A reproducible clean-room experiment across Claude Haiku 4.5, Sonnet 4.6, Opus 4.8 and Fable 5 on the Cockburn use-cases anchor.",
+  },
   {
     path: '/contracts',
     fragment: 'docs/contracts.html',
diff --git a/website/index.html b/website/index.html
index 9c1e36c..79bd4cc 100644
--- a/website/index.html
+++ b/website/index.html
@@ -53,11 +53,7 @@
       "url": "https://llm-coding.github.io/Semantic-Anchors/",
       "description": "110+ semantic anchors and semantic contracts for precise communication with Large Language Models. Evaluated across 10 models.",
       "inLanguage": ["en", "de"],
-      "publisher": {
-        "@type": "Organization",
-        "name": "LLM Coding Community",
-        "url": "https://github.com/LLM-Coding"
-      },
+      "publisher": { "@id": "https://llm-coding.github.io/Semantic-Anchors/#organization" },
       "potentialAction": {
         "@type": "SearchAction",
         "target": "https://llm-coding.github.io/Semantic-Anchors/#/search?q={search_term_string}",
@@ -66,6 +62,26 @@
     }
     </script>
 
+    <!-- Standalone Organization entity (resolvable by @id, not only nested as
+         publisher) so search engines and AI can identify "Semantic Anchors" as
+         a distinct entity. See issue #579. -->
+    <script type="application/ld+json">
+    {
+      "@context": "https://schema.org",
+      "@type": "Organization",
+      "@id": "https://llm-coding.github.io/Semantic-Anchors/#organization",
+      "name": "Semantic Anchors",
+      "alternateName": "LLM Coding Community",
+      "url": "https://llm-coding.github.io/Semantic-Anchors/",
+      "logo": "https://llm-coding.github.io/Semantic-Anchors/logo.png",
+      "description": "A curated catalog of semantic anchors and semantic contracts — shared vocabulary for precise communication with Large Language Models.",
+      "sameAs": [
+        "https://github.com/LLM-Coding",
+        "https://github.com/LLM-Coding/Semantic-Anchors"
+      ]
+    }
+    </script>
+
     <!-- Privacy-friendly, cookieless analytics (GoatCounter). No cookies, no
          personal data, no IP storage — so no consent banner is required.
          count.js is self-hosted (first-party) to avoid a third-party script
diff --git a/website/package.json b/website/package.json
index c826d91..d8ed8a5 100644
--- a/website/package.json
+++ b/website/package.json
@@ -8,7 +8,7 @@
     "predev": "node ../scripts/sync-anchors.js",
     "dev": "vite",
     "prebuild": "node ../scripts/sync-anchors.js && node ../scripts/render-docs.js && node ../scripts/render-contracts.js",
-    "build": "vite build && node ../scripts/prerender-routes.js",
+    "build": "vite build && node ../scripts/prerender-routes.js && node ../scripts/generate-jsonld.js",
     "preview": "vite preview",
     "test": "vitest run",
     "test:watch": "vitest",