Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 72 additions & 40 deletions scripts/generate-sitemap.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,69 +3,101 @@
/**
* generate-sitemap.js
*
* Generates sitemap.xml for the Semantic Anchors website
* Generates sitemap.xml for the Semantic Anchors website.
*
* Produces clean (non-hash) URLs that match the History API router in
* website/src/utils/router.js. Hash-based URLs (#/about) are not crawlable
* by search engines — every hash URL looks like the homepage to a crawler,
* and claude.ai / LLM fetchers cannot reach them either.
*
* Keep the PAGES list in sync with router.js `ROUTE_TITLES` when adding
* new routes.
*/

const fs = require('fs')
const path = require('path')

// Paths
const ANCHORS_DATA = path.join(__dirname, '..', 'website', 'public', 'data', 'anchors.json')
const OUTPUT_FILE = path.join(__dirname, '..', 'website', 'public', 'sitemap.xml')
const BASE_URL = 'https://llm-coding.github.io/Semantic-Anchors'

// Read anchors data
const anchorsData = JSON.parse(fs.readFileSync(ANCHORS_DATA, 'utf-8'))
// Static pages served by the SPA router. Keep in sync with
// website/src/utils/router.js -> ROUTE_TITLES AND with the ROUTES list in
// scripts/prerender-routes.js.
//
// Only routes that can be pre-rendered to static HTML are listed here —
// otherwise the sitemap would advertise URLs that return an empty SPA
// shell to non-JS crawlers and claude.ai fetchers.
//
// Excluded on purpose:
// - /contracts — interactive JS page (localStorage, client-side data
// fetching); no static content worth serving
// - /anchor/:id — not a static page; one entry per anchor is emitted by
// the anchor loop further down
//
// Each entry: `path` is appended to BASE_URL, `priority` and `changefreq`
// are written verbatim into the <priority> / <changefreq> elements.
//
// priority tiers used in this list: 1.0 homepage, 0.8 top-level content,
// 0.7 contributing/meta, 0.5 rejected proposals. Anchor pages get 0.6,
// which is set in the anchor loop rather than here.
const PAGES = [
{ path: '/', priority: '1.0', changefreq: 'weekly' },
{ path: '/about', priority: '0.8', changefreq: 'monthly' },
{ path: '/workflow', priority: '0.8', changefreq: 'monthly' },
{ path: '/brownfield', priority: '0.8', changefreq: 'monthly' },
{ path: '/evaluations', priority: '0.8', changefreq: 'monthly' },
{ path: '/all-anchors', priority: '0.8', changefreq: 'weekly' },
{ path: '/agentskill', priority: '0.7', changefreq: 'monthly' },
{ path: '/changelog', priority: '0.7', changefreq: 'weekly' },
{ path: '/contributing', priority: '0.7', changefreq: 'monthly' },
{ path: '/rejected-proposals', priority: '0.5', changefreq: 'monthly' },
]

// Generate sitemap
const anchorsData = JSON.parse(fs.readFileSync(ANCHORS_DATA, 'utf-8'))
const today = new Date().toISOString().split('T')[0]

let sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<!-- Homepage -->
<url>
<loc>${BASE_URL}/</loc>
<lastmod>${today}</lastmod>
<changefreq>weekly</changefreq>
<priority>1.0</priority>
</url>

<!-- About Page -->
<url>
<loc>${BASE_URL}/#/about</loc>
<lastmod>${today}</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
/**
 * Entity-escape a value for use as XML element content.
 * The sitemap protocol requires &, <, >, " and ' to be escaped in all
 * data values (most importantly <loc>).
 * @param {*} value - Value to escape; coerced with String().
 * @returns {string} XML-safe text.
 */
function escapeXml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;')
}

/**
 * Render one <url> entry for sitemap.xml.
 *
 * Emits exactly one <url> block built only from its arguments. All element
 * values are entity-escaped so an id or path containing "&" cannot produce
 * malformed XML.
 *
 * @param {string} loc - Fully-qualified URL of the page.
 * @param {string} lastmod - ISO date string (YYYY-MM-DD).
 * @param {string} changefreq - Sitemap changefreq value (weekly, monthly, ...).
 * @param {string} priority - Sitemap priority value ("0.0"–"1.0").
 * @param {string} [comment] - Optional XML comment placed above the entry.
 * @returns {string} One <url>...</url> block with a trailing blank line.
 */
function urlEntry(loc, lastmod, changefreq, priority, comment) {
  // "--" is illegal inside an XML comment, so runs of dashes are collapsed.
  // "&" and "<" are legal there and are left untouched.
  const commentLine = comment ? `  <!-- ${String(comment).replace(/-{2,}/g, '-')} -->\n` : ''

  return `${commentLine}  <url>
    <loc>${escapeXml(loc)}</loc>
    <lastmod>${escapeXml(lastmod)}</lastmod>
    <changefreq>${escapeXml(changefreq)}</changefreq>
    <priority>${escapeXml(priority)}</priority>
  </url>

`
}

let sitemap = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
`

// Add all anchors
anchorsData.forEach((anchor) => {
sitemap += ` <!-- Anchor: ${anchor.title} -->
<url>
<loc>${BASE_URL}/#/anchor/${anchor.id}</loc>
<lastmod>${today}</lastmod>
<changefreq>monthly</changefreq>
<priority>0.6</priority>
</url>
// Static pages. BASE_URL carries no trailing slash and every PAGES path
// starts with "/", so plain concatenation already yields `${BASE_URL}/`
// for the homepage — no special case is required.
PAGES.forEach(({ path: pagePath, priority, changefreq }) => {
  sitemap += urlEntry(`${BASE_URL}${pagePath}`, today, changefreq, priority)
})

`
// Individual anchor pages: one <url> per record loaded from anchors.json,
// each preceded by an XML comment carrying the anchor's title.
for (const { id, title } of anchorsData) {
  const anchorUrl = `${BASE_URL}/anchor/${id}`
  sitemap += urlEntry(anchorUrl, today, 'monthly', '0.6', `Anchor: ${title}`)
}

sitemap += `</urlset>
`

// Write sitemap
fs.writeFileSync(OUTPUT_FILE, sitemap, 'utf-8')

console.log(`✓ Sitemap generated: ${OUTPUT_FILE}`)
console.log(`✓ Total URLs: ${anchorsData.length + 3} (3 pages + ${anchorsData.length} anchors)`)
console.log(
`✓ Total URLs: ${PAGES.length + anchorsData.length} (${PAGES.length} pages + ${anchorsData.length} anchors)`
)
216 changes: 216 additions & 0 deletions scripts/prerender-routes.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
#!/usr/bin/env node

/**
* prerender-routes.js
*
* Post-build step: generate per-route static HTML so crawlers and non-JS
* fetchers (claude.ai, curl, search engine bots that skip JS execution) can
* access doc-style pages directly at their clean URLs.
*
* How it works:
* 1. Reads the built Vite shell at website/dist/index.html
* 2. For each route that has a pre-rendered content fragment in
* website/dist/docs/<fragment>.html, generates
* website/dist/<route>/index.html that injects the fragment into
* the #app div's initial markup and updates the <title> + meta
* description.
* 3. When a user-agent with JS loads the page, the SPA boots, clears
* #app, and re-renders as usual — so users get the normal interactive
* experience. Crawlers and no-JS fetchers see real content immediately.
*
* GitHub Pages serves <route>/index.html automatically when the clean URL
* (e.g. /workflow) is requested.
*
* Keep ROUTES in sync with website/src/utils/router.js and scripts/render-docs.js.
*/

const fs = require('fs')
const path = require('path')

const DIST = path.join(__dirname, '..', 'website', 'dist')
const SHELL = path.join(DIST, 'index.html')

// Each entry maps a clean-URL route to the doc fragment rendered by
// scripts/render-docs.js, plus SEO metadata for the per-route <head>.
//
// Fields, as consumed by prerenderRoute():
// path — route path; appended to the canonical URL and used as the
// output directory under website/dist (<path>/index.html)
// fragment — fragment file path, resolved relative to website/dist
// title — HTML-escaped and substituted into <title>
// description — HTML-escaped and substituted into the meta description
//
// There is no '/' entry: the homepage is not pre-rendered by this script.
const ROUTES = [
{
path: '/about',
fragment: 'docs/about.html',
title: 'About — Semantic Anchors',
description:
'Learn what semantic anchors are, why they matter for LLM communication, and how the catalog is curated.',
},
{
path: '/workflow',
fragment: 'docs/spec-driven-workflow.html',
title: 'Development Workflow — Semantic Anchors',
description:
'The Semantic Anchors spec-driven development workflow — from requirements to specification to implementation, powered by semantic anchors.',
},
{
path: '/brownfield',
fragment: 'docs/brownfield-workflow.html',
title: 'Brownfield Workflow — Semantic Anchors',
description:
'Applying semantic anchors to brownfield codebases using a bounded-context approach.',
},
{
path: '/changelog',
fragment: 'docs/changelog.html',
title: 'Changelog — Semantic Anchors',
description: 'Chronological record of all semantic anchors added to the catalog.',
},
{
path: '/contributing',
fragment: 'CONTRIBUTING.html',
title: 'Contributing — Semantic Anchors',
description:
'How to propose new semantic anchors, quality criteria, and the contribution workflow.',
},
{
path: '/agentskill',
fragment: 'docs/agentskill.html',
title: 'AgentSkill — Semantic Anchors',
description:
'The semantic-anchor-translator AgentSkill — install semantic anchors into Claude Code, Codex, Cursor, and other coding agents.',
},
{
path: '/rejected-proposals',
fragment: 'docs/rejected-proposals.html',
title: 'Rejected Proposals — Semantic Anchors',
description:
'Anchor proposals that did not meet the quality criteria, with reasoning — useful for understanding the curation bar.',
},
{
path: '/all-anchors',
fragment: 'docs/all-anchors.html',
title: 'Full Reference — Semantic Anchors',
description:
'Full reference of all semantic anchors in one long document — readable offline, linkable, easy to Ctrl-F.',
},
{
path: '/evaluations',
fragment: 'docs/anchor-evaluations.html',
title: 'Evaluations — Semantic Anchors',
description: 'Multiple-choice evaluations of semantic anchor recognition across 10 LLMs.',
},
]
Comment thread
coderabbitai[bot] marked this conversation as resolved.

/**
 * Load the HTML shell that Vite wrote to website/dist/index.html.
 *
 * A missing shell means `vite build` has not been run yet; in that case an
 * error is printed to stderr and the process terminates with exit code 1.
 * @returns {string} The shell's HTML, decoded as UTF-8.
 */
function readShell() {
  const shellMissing = !fs.existsSync(SHELL)
  if (shellMissing) {
    console.error(`ERROR: ${SHELL} does not exist. Run 'vite build' first.`)
    process.exit(1)
  }

  const shellHtml = fs.readFileSync(SHELL, 'utf-8')
  return shellHtml
}

/**
 * Make a string safe to embed in HTML text or a quoted attribute value.
 *
 * Replaces the five characters &, <, >, " and ' with their entity forms.
 * The ampersand is handled first so the entities produced by the later
 * steps are not escaped a second time. Non-string input is coerced with
 * String() before escaping.
 * @param {string} str - Text to escape.
 * @returns {string} The escaped text.
 */
function escapeHtml(str) {
  const text = String(str)
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}

/**
 * Build the pre-populated markup that goes inside <div id="app">.
 *
 * Wraps the fragment in a <main>/<article>/#doc-content structure so
 * crawlers see real content in the initial HTML response. Only that
 * wrapper is emitted here — no header or footer markup.
 *
 * NOTE(review): the original comment says this mirrors the layout that
 * renderHeader() + renderDocPage() + renderFooter() in website/src/main.js
 * produce at runtime; that cannot be confirmed from this file — verify when
 * changing either side.
 *
 * @param {string} fragmentHtml - HTML fragment placed inside #doc-content.
 * It is interpolated verbatim (not escaped).
 * @returns {string} Markup string to place inside the #app element.
 */
function buildAppMarkup(fragmentHtml) {
return `
<main class="flex-1">
<article class="mx-auto max-w-4xl px-4 py-8 sm:px-6 lg:px-8">
<div id="doc-content" class="asciidoc-content">${fragmentHtml}</div>
</article>
</main>
`
}

/**
 * Pre-render a single route to website/dist/<route>/index.html.
 *
 * Reads the route's fragment from website/dist, injects it into a copy of
 * the Vite shell, and updates the <title>, meta description, and canonical
 * URL to match the route. The meta description and canonical link are only
 * updated when the shell already contains them; the #app placeholder is
 * mandatory because a page without injected content defeats the purpose
 * of pre-rendering.
 *
 * @param {string} shell - Raw HTML of the Vite build shell.
 * @param {{path: string, fragment: string, title: string, description: string}} route
 * Route descriptor from the ROUTES list.
 * @throws {Error} When the configured fragment file does not exist.
 * @throws {Error} When the shell has no empty <div id="app"></div> to fill.
 */
function prerenderRoute(shell, route) {
  const fragmentPath = path.join(DIST, route.fragment)
  if (!fs.existsSync(fragmentPath)) {
    throw new Error(
      `Missing fragment for ${route.path}: ${route.fragment} (expected at ${fragmentPath}). ` +
        `Make sure scripts/render-docs.js runs before prerender-routes.js and writes the fragment to website/public/docs/.`
    )
  }
  const fragment = fs.readFileSync(fragmentPath, 'utf-8')

  // Every replacement below passes a replacer FUNCTION rather than a string.
  // With a string, String.prototype.replace interprets "$&", "$1", "$`",
  // "$'" and "$$" inside it, which would corrupt fragment HTML containing
  // such sequences (e.g. shell snippets or regex examples in code blocks).
  let html = shell

  // Replace <title>
  html = html.replace(
    /<title>[\s\S]*?<\/title>/,
    () => `<title>${escapeHtml(route.title)}</title>`
  )

  // Replace meta description if present
  html = html.replace(
    /<meta\s+name="description"\s+content="[^"]*"\s*\/?>/,
    () => `<meta name="description" content="${escapeHtml(route.description)}" />`
  )

  // Update canonical URL so each pre-rendered page points to itself
  const canonicalUrl = `https://llm-coding.github.io/Semantic-Anchors${route.path}`
  html = html.replace(
    /<link\s+rel="canonical"\s+href="[^"]*"\s*\/?>/,
    () => `<link rel="canonical" href="${escapeHtml(canonicalUrl)}" />`
  )

  // Inject pre-rendered content into #app. Fail loudly if the placeholder is
  // not found: otherwise the build would report success while writing pages
  // that contain no content.
  const appPlaceholder = /<div\s+id="app"\s*>\s*<\/div>/
  if (!appPlaceholder.test(html)) {
    throw new Error(
      `Cannot pre-render ${route.path}: no empty <div id="app"></div> placeholder found in ${SHELL}.`
    )
  }
  html = html.replace(appPlaceholder, () => `<div id="app">${buildAppMarkup(fragment)}</div>`)

  const outDir = path.join(DIST, route.path)
  const outFile = path.join(outDir, 'index.html')
  fs.mkdirSync(outDir, { recursive: true })
  fs.writeFileSync(outFile, html, 'utf-8')
}

/**
 * Script entry point.
 *
 * Loads the build shell a single time and pre-renders each ROUTES entry in
 * list order, logging one line per route and a summary at the end. Any
 * error thrown by prerenderRoute propagates uncaught, so the process exits
 * non-zero rather than shipping a partial set of static pages.
 */
function main() {
  const shellHtml = readShell()

  ROUTES.forEach((entry) => {
    prerenderRoute(shellHtml, entry)
    console.log(` ✓ pre-rendered ${entry.path}`)
  })

  console.log(`\n✓ Pre-rendered ${ROUTES.length} routes to dist/<route>/index.html`)
}

main()
2 changes: 1 addition & 1 deletion website/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"predev": "node ../scripts/sync-anchors.js",
"dev": "vite",
"prebuild": "node ../scripts/sync-anchors.js && node ../scripts/render-docs.js",
"build": "vite build",
"build": "vite build && node ../scripts/prerender-routes.js",

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Der neue Build-Pfad liegt außerhalb eurer Lint-/Prettier-Gates.

Mit node ../scripts/prerender-routes.js hängt der Produktionsbuild jetzt von zusätzlichem scripts/**/*.js ab, aber lint, lint:fix, format und format:check prüfen weiter nur src/. Syntax- und Formatfehler in den neuen Build-Skripten werden damit nicht von den Tier-1-Gates abgefangen.

Vorschlag
-    "lint": "eslint src/",
-    "lint:fix": "eslint src/ --fix",
-    "format": "prettier --write src/",
-    "format:check": "prettier --check src/"
+    "lint": "eslint src/ ../scripts/",
+    "lint:fix": "eslint src/ ../scripts/ --fix",
+    "format": "prettier --write src/ ../scripts/",
+    "format:check": "prettier --check src/ ../scripts/"
As per coding guidelines: `**/*.js`: All projects must implement ESLint configuration and enforce formatting with Prettier to achieve Tier 1 automated gates as documented in Risk Radar assessment.
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@website/package.json` at line 11, The build script ("build": "vite build &&
node ../scripts/prerender-routes.js") pulls in ../scripts/prerender-routes.js
but your lint/format gates still only target src/; update the project's
lint/format config and npm scripts so the ESLint and Prettier checks include the
external scripts (e.g., add ../scripts/**/*.js to the globs used by "lint",
"lint:fix", "format" and "format:check"), or move/duplicate the prerender script
inside the checked source; ensure the referenced symbols are "build", "lint",
"lint:fix", "format", "format:check" and the external script path
../scripts/prerender-routes.js so the new files are covered by Tier‑1 gates.

"preview": "vite preview",
"test": "vitest run",
"test:watch": "vitest",
Expand Down
Loading
Loading