Skip to content

Commit 7e7a690

Browse files
authored
Merge pull request #429 from raifdmueller/fix/sitemap-clean-urls-root-cause
fix: make SPA routes crawlable at source (sitemap script + pre-render)
2 parents debe6e2 + 9253168 commit 7e7a690

4 files changed

Lines changed: 603 additions & 309 deletions

File tree

scripts/generate-sitemap.js

Lines changed: 72 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,69 +3,101 @@
33
/**
44
* generate-sitemap.js
55
*
6-
* Generates sitemap.xml for the Semantic Anchors website
6+
* Generates sitemap.xml for the Semantic Anchors website.
7+
*
8+
* Produces clean (non-hash) URLs that match the History API router in
9+
* website/src/utils/router.js. Hash-based URLs (#/about) are not crawlable
10+
* by search engines — every hash URL looks like the homepage to a crawler,
11+
* and claude.ai / LLM fetchers cannot reach them either.
12+
*
13+
* Keep the PAGES list in sync with router.js `ROUTE_TITLES` when adding
14+
* new routes.
715
*/
816

917
const fs = require('fs')
1018
const path = require('path')
1119

12-
// Paths
1320
const ANCHORS_DATA = path.join(__dirname, '..', 'website', 'public', 'data', 'anchors.json')
1421
const OUTPUT_FILE = path.join(__dirname, '..', 'website', 'public', 'sitemap.xml')
1522
const BASE_URL = 'https://llm-coding.github.io/Semantic-Anchors'
1623

17-
// Read anchors data
18-
const anchorsData = JSON.parse(fs.readFileSync(ANCHORS_DATA, 'utf-8'))
24+
// Static pages served by the SPA router. Keep in sync with
25+
// website/src/utils/router.js -> ROUTE_TITLES AND with the ROUTES list in
26+
// scripts/prerender-routes.js.
27+
//
28+
// Only routes that can be pre-rendered to static HTML are listed here —
29+
// otherwise the sitemap would advertise URLs that return an empty SPA
30+
// shell to non-JS crawlers and claude.ai fetchers.
31+
//
32+
// Excluded on purpose:
33+
// - /contracts — interactive JS page (localStorage, client-side data
34+
// fetching); no static content worth serving
35+
// - /anchor/:id — rendered per entry via the anchor loop below
36+
//
37+
// priority: 1.0 homepage, 0.8 top-level content, 0.7 contributing/meta, 0.6 anchors, 0.5 rejected proposals
38+
const PAGES = [
39+
{ path: '/', priority: '1.0', changefreq: 'weekly' },
40+
{ path: '/about', priority: '0.8', changefreq: 'monthly' },
41+
{ path: '/workflow', priority: '0.8', changefreq: 'monthly' },
42+
{ path: '/brownfield', priority: '0.8', changefreq: 'monthly' },
43+
{ path: '/evaluations', priority: '0.8', changefreq: 'monthly' },
44+
{ path: '/all-anchors', priority: '0.8', changefreq: 'weekly' },
45+
{ path: '/agentskill', priority: '0.7', changefreq: 'monthly' },
46+
{ path: '/changelog', priority: '0.7', changefreq: 'weekly' },
47+
{ path: '/contributing', priority: '0.7', changefreq: 'monthly' },
48+
{ path: '/rejected-proposals', priority: '0.5', changefreq: 'monthly' },
49+
]
1950

20-
// Generate sitemap
51+
const anchorsData = JSON.parse(fs.readFileSync(ANCHORS_DATA, 'utf-8'))
2152
const today = new Date().toISOString().split('T')[0]
2253

23-
let sitemap = `<?xml version="1.0" encoding="UTF-8"?>
24-
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
25-
<!-- Homepage -->
26-
<url>
27-
<loc>${BASE_URL}/</loc>
28-
<lastmod>${today}</lastmod>
29-
<changefreq>weekly</changefreq>
30-
<priority>1.0</priority>
31-
</url>
32-
33-
<!-- About Page -->
34-
<url>
35-
<loc>${BASE_URL}/#/about</loc>
36-
<lastmod>${today}</lastmod>
37-
<changefreq>monthly</changefreq>
38-
<priority>0.8</priority>
54+
/**
 * Escape a value for use as XML character data.
 * Replaces the five characters the sitemap protocol requires to be
 * entity-escaped: &, <, >, " and '.
 * @param {string} value - Raw text (e.g. a URL).
 * @returns {string} The text with XML entities substituted.
 */
function escapeXml(value) {
  // "&" must be replaced first so the entities added below are not re-escaped.
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;')
}

/**
 * Render one <url> entry for sitemap.xml.
 * @param {string} loc - Fully-qualified URL of the page (raw, not yet
 *   XML-escaped; escaping is applied here).
 * @param {string} lastmod - ISO date string (YYYY-MM-DD).
 * @param {string} changefreq - Sitemap changefreq value (weekly, monthly, ...).
 * @param {string} priority - Sitemap priority value ("0.0"–"1.0").
 * @param {string} [comment] - Optional XML comment placed above the entry.
 *   Runs of hyphens are collapsed to a single "-" because "--" is not
 *   allowed inside an XML comment.
 * @returns {string} One <url>...</url> block with a trailing blank line.
 */
function urlEntry(loc, lastmod, changefreq, priority, comment) {
  // A free-text title such as "A -- B" would otherwise make the whole
  // sitemap ill-formed, so hyphen runs are collapsed before interpolation.
  const commentLine = comment ? `<!-- ${String(comment).replace(/-{2,}/g, '-')} -->\n  ` : ''
  return `  ${commentLine}<url>
    <loc>${escapeXml(loc)}</loc>
    <lastmod>${lastmod}</lastmod>
    <changefreq>${changefreq}</changefreq>
    <priority>${priority}</priority>
  </url>

`
}
4873

74+
let sitemap = `<?xml version="1.0" encoding="UTF-8"?>
75+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
4976
`
5077

51-
// Add all anchors
52-
anchorsData.forEach((anchor) => {
53-
sitemap += ` <!-- Anchor: ${anchor.title} -->
54-
<url>
55-
<loc>${BASE_URL}/#/anchor/${anchor.id}</loc>
56-
<lastmod>${today}</lastmod>
57-
<changefreq>monthly</changefreq>
58-
<priority>0.6</priority>
59-
</url>
78+
// Static pages. Every PAGES path starts with '/', so the homepage ('/')
// needs no special case: BASE_URL + '/' is already its full URL.
for (const page of PAGES) {
  sitemap += urlEntry(`${BASE_URL}${page.path}`, today, page.changefreq, page.priority)
}
6083

61-
`
84+
// One sitemap entry per anchor, each preceded by an XML comment naming it.
for (const { id, title } of anchorsData) {
  const anchorUrl = `${BASE_URL}/anchor/${id}`
  const label = `Anchor: ${title}`
  sitemap += urlEntry(anchorUrl, today, 'monthly', '0.6', label)
}
6394

6495
sitemap += `</urlset>
6596
`
6697

67-
// Write sitemap
6898
fs.writeFileSync(OUTPUT_FILE, sitemap, 'utf-8')
6999

70100
console.log(`✓ Sitemap generated: ${OUTPUT_FILE}`)
71-
console.log(`✓ Total URLs: ${anchorsData.length + 3} (3 pages + ${anchorsData.length} anchors)`)
101+
console.log(
102+
`✓ Total URLs: ${PAGES.length + anchorsData.length} (${PAGES.length} pages + ${anchorsData.length} anchors)`
103+
)

scripts/prerender-routes.js

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
#!/usr/bin/env node
2+
3+
/**
4+
* prerender-routes.js
5+
*
6+
* Post-build step: generate per-route static HTML so crawlers and non-JS
7+
* fetchers (claude.ai, curl, search engine bots that skip JS execution) can
8+
* access doc-style pages directly at their clean URLs.
9+
*
10+
* How it works:
11+
* 1. Reads the built Vite shell at website/dist/index.html
12+
* 2. For each route that has a pre-rendered content fragment in
13+
* website/dist/docs/<fragment>.html, generates
14+
* website/dist/<route>/index.html that injects the fragment into
15+
* the #app div's initial markup and updates the <title> + meta
16+
* description.
17+
* 3. When a user-agent with JS loads the page, the SPA boots, clears
18+
* #app, and re-renders as usual — so users get the normal interactive
19+
* experience. Crawlers and no-JS fetchers see real content immediately.
20+
*
21+
* GitHub Pages serves <route>/index.html automatically when the clean URL
22+
* (e.g. /workflow) is requested.
23+
*
24+
* Keep ROUTES in sync with website/src/utils/router.js and scripts/render-docs.js.
25+
*/
26+
27+
const fs = require('fs')
28+
const path = require('path')
29+
30+
const DIST = path.join(__dirname, '..', 'website', 'dist')
31+
const SHELL = path.join(DIST, 'index.html')
32+
33+
// Each entry maps a clean-URL route to the doc fragment rendered by
34+
// scripts/render-docs.js, plus SEO metadata for the per-route <head>.
35+
const ROUTES = [
36+
{
37+
path: '/about',
38+
fragment: 'docs/about.html',
39+
title: 'About — Semantic Anchors',
40+
description:
41+
'Learn what semantic anchors are, why they matter for LLM communication, and how the catalog is curated.',
42+
},
43+
{
44+
path: '/workflow',
45+
fragment: 'docs/spec-driven-workflow.html',
46+
title: 'Development Workflow — Semantic Anchors',
47+
description:
48+
'The Semantic Anchors spec-driven development workflow — from requirements to specification to implementation, powered by semantic anchors.',
49+
},
50+
{
51+
path: '/brownfield',
52+
fragment: 'docs/brownfield-workflow.html',
53+
title: 'Brownfield Workflow — Semantic Anchors',
54+
description:
55+
'Applying semantic anchors to brownfield codebases using a bounded-context approach.',
56+
},
57+
{
58+
path: '/changelog',
59+
fragment: 'docs/changelog.html',
60+
title: 'Changelog — Semantic Anchors',
61+
description: 'Chronological record of all semantic anchors added to the catalog.',
62+
},
63+
{
64+
path: '/contributing',
65+
fragment: 'CONTRIBUTING.html',
66+
title: 'Contributing — Semantic Anchors',
67+
description:
68+
'How to propose new semantic anchors, quality criteria, and the contribution workflow.',
69+
},
70+
{
71+
path: '/agentskill',
72+
fragment: 'docs/agentskill.html',
73+
title: 'AgentSkill — Semantic Anchors',
74+
description:
75+
'The semantic-anchor-translator AgentSkill — install semantic anchors into Claude Code, Codex, Cursor, and other coding agents.',
76+
},
77+
{
78+
path: '/rejected-proposals',
79+
fragment: 'docs/rejected-proposals.html',
80+
title: 'Rejected Proposals — Semantic Anchors',
81+
description:
82+
'Anchor proposals that did not meet the quality criteria, with reasoning — useful for understanding the curation bar.',
83+
},
84+
{
85+
path: '/all-anchors',
86+
fragment: 'docs/all-anchors.html',
87+
title: 'Full Reference — Semantic Anchors',
88+
description:
89+
'Full reference of all semantic anchors in one long document — readable offline, linkable, easy to Ctrl-F.',
90+
},
91+
{
92+
path: '/evaluations',
93+
fragment: 'docs/anchor-evaluations.html',
94+
title: 'Evaluations — Semantic Anchors',
95+
description: 'Multiple-choice evaluations of semantic anchor recognition across 10 LLMs.',
96+
},
97+
]
98+
99+
/**
 * Load the HTML shell at SHELL (website/dist/index.html).
 * When the file is absent, an error is printed to stderr and the process
 * terminates with exit code 1 — the shell only exists after `vite build`.
 * @returns {string} The shell's HTML, decoded as UTF-8.
 */
function readShell() {
  const shellMissing = !fs.existsSync(SHELL)
  if (shellMissing) {
    console.error(`ERROR: ${SHELL} does not exist. Run 'vite build' first.`)
    process.exit(1)
  }
  const contents = fs.readFileSync(SHELL, 'utf-8')
  return contents
}
112+
113+
/**
 * Make a value safe to place in an HTML text node or quoted attribute.
 * The input is coerced to a string, then &, <, >, " and ' are replaced by
 * their entity forms. Applied to route titles and descriptions before they
 * are written into <title> and meta tags.
 * @param {string} str - Value to escape.
 * @returns {string} The escaped string.
 */
function escapeHtml(str) {
  const text = String(str)
  // "&" goes first so the ampersands introduced by later steps stay intact.
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}
133+
134+
/**
 * Wrap a rendered doc fragment in the static markup placed inside
 * <div id="app">: a <main> element containing an <article>, which in turn
 * holds the fragment in <div id="doc-content">. The fragment is inserted
 * verbatim (no escaping), so it must already be HTML.
 * @param {string} fragmentHtml - HTML of the doc fragment.
 * @returns {string} Markup block, starting and ending with a newline.
 */
function buildAppMarkup(fragmentHtml) {
  const lines = [
    '',
    '<main class="flex-1">',
    '<article class="mx-auto max-w-4xl px-4 py-8 sm:px-6 lg:px-8">',
    `<div id="doc-content" class="asciidoc-content">${fragmentHtml}</div>`,
    '</article>',
    '</main>',
    '',
  ]
  return lines.join('\n')
}
149+
150+
/**
 * Pre-render a single route to <DIST>/<route.path>/index.html.
 *
 * Reads the route's fragment from DIST, then produces a copy of the shell
 * in which:
 *   - <title> is replaced with the escaped route title,
 *   - the meta description is replaced, if the shell has one,
 *   - the canonical link is replaced, if the shell has one,
 *   - the empty <div id="app"></div> is filled with the fragment wrapped
 *     by buildAppMarkup().
 *
 * @param {string} shell - Raw HTML of the Vite build shell.
 * @param {{path: string, fragment: string, title: string, description: string}} route
 *   Route descriptor from the ROUTES list.
 * @throws {Error} When the configured fragment file does not exist, or when
 *   the shell has no empty <div id="app"></div> to inject into. Both would
 *   otherwise ship a page without pre-rendered content.
 */
function prerenderRoute(shell, route) {
  const fragmentPath = path.join(DIST, route.fragment)
  if (!fs.existsSync(fragmentPath)) {
    throw new Error(
      `Missing fragment for ${route.path}: ${route.fragment} (expected at ${fragmentPath}). ` +
        `Make sure scripts/render-docs.js runs before prerender-routes.js and writes the fragment to website/public/docs/.`
    )
  }
  const fragment = fs.readFileSync(fragmentPath, 'utf-8')

  const titleTag = `<title>${escapeHtml(route.title)}</title>`
  const descriptionTag = `<meta name="description" content="${escapeHtml(route.description)}" />`
  // Each pre-rendered page points to itself as canonical.
  const canonicalUrl = `https://llm-coding.github.io/Semantic-Anchors${route.path}`
  const canonicalTag = `<link rel="canonical" href="${canonicalUrl}" />`
  const appMarkup = `<div id="app">${buildAppMarkup(fragment)}</div>`

  // Replacements are supplied as functions, not strings: in a replacement
  // string, sequences such as "$&", "$'" or "$$" are expanded by
  // String.prototype.replace, which would corrupt fragments that contain
  // them (e.g. shell or JS code samples).
  let html = shell
  html = html.replace(/<title>[\s\S]*?<\/title>/, () => titleTag)
  html = html.replace(/<meta\s+name="description"\s+content="[^"]*"\s*\/?>/, () => descriptionTag)
  html = html.replace(/<link\s+rel="canonical"\s+href="[^"]*"\s*\/?>/, () => canonicalTag)

  // Title/description/canonical are best-effort, but the content injection
  // is the point of this script: fail instead of writing an empty page.
  const appPlaceholder = /<div\s+id="app"\s*>\s*<\/div>/
  if (!appPlaceholder.test(html)) {
    throw new Error(
      `Cannot pre-render ${route.path}: no empty <div id="app"></div> found in the build shell.`
    )
  }
  html = html.replace(appPlaceholder, () => appMarkup)

  const outDir = path.join(DIST, route.path)
  const outFile = path.join(outDir, 'index.html')
  fs.mkdirSync(outDir, { recursive: true })
  fs.writeFileSync(outFile, html, 'utf-8')
}
201+
202+
/**
 * Script entry point. Loads the shell a single time via readShell() and
 * runs prerenderRoute() for each entry in ROUTES, logging one line per
 * route and a summary at the end. An error thrown by prerenderRoute() is
 * not caught here, so it aborts the run.
 */
function main() {
  const shellHtml = readShell()
  ROUTES.forEach((entry) => {
    prerenderRoute(shellHtml, entry)
    console.log(` ✓ pre-rendered ${entry.path}`)
  })
  const total = ROUTES.length
  console.log(`\n✓ Pre-rendered ${total} routes to dist/<route>/index.html`)
}
215+
216+
main()

website/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"predev": "node ../scripts/sync-anchors.js",
99
"dev": "vite",
1010
"prebuild": "node ../scripts/sync-anchors.js && node ../scripts/render-docs.js",
11-
"build": "vite build",
11+
"build": "vite build && node ../scripts/prerender-routes.js",
1212
"preview": "vite preview",
1313
"test": "vitest run",
1414
"test:watch": "vitest",

0 commit comments

Comments
 (0)