diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index 659f009..0d451c0 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -114,6 +114,21 @@ jobs:
         env:
           BUILD_VERSION: ${{ steps.version.outputs.version }}
 
+      - name: Download PostHog docs artifact
+        id: docs_artifact
+        continue-on-error: true
+        env:
+          GH_TOKEN: ${{ secrets.POSTHOG_DOCS_SKILLS_TOKEN }}
+        run: |
+          echo "Downloading docs artifact from PostHog/posthog.com..."
+          gh run download --repo PostHog/posthog.com -n posthog-docs-md --dir /tmp
+          unzip -q /tmp/posthog-docs-md.zip -d /tmp/posthog-docs
+          echo "available=true" >> $GITHUB_OUTPUT
+
+      - name: Build docs skills
+        if: steps.docs_artifact.outputs.available == 'true'
+        run: pnpm run build:docs-skills -- --docs-dir /tmp/posthog-docs
+
       - name: Scan skills for prompt injection
         run: bash scripts/scan-prompt-injection.sh dist/skills
 
@@ -168,6 +183,11 @@ jobs:
           # Upload skill-menu.json (used by the wizard to discover available skills)
           echo "Uploading skill-menu.json..."
           gh release upload "$RELEASE_TAG" dist/skills/skill-menu.json --clobber
+          # Upload docs-skill-menu.json if docs skills were built
+          if [ -f dist/skills/docs-skill-menu.json ]; then
+            echo "Uploading docs-skill-menu.json..."
+            gh release upload "$RELEASE_TAG" dist/skills/docs-skill-menu.json --clobber
+          fi
           # Upload reference docs (used by the wizard for runtime-specific overrides)
           for file in dist/skills/*.md; do
             [ -f "$file" ] || continue
diff --git a/README.md b/README.md
index 26726ce..e89e3af 100644
--- a/README.md
+++ b/README.md
@@ -87,3 +87,56 @@ The build script automatically discovers, orders, and generates URIs for all res
 - **Version controlled**: Resources evolve with the examples
 
 See `llm-prompts/README.md` for detailed workflow conventions.
+
+## Docs skills
+
+We also auto-generate one [Agent Skill](https://agentskills.io/specification) per section of the PostHog docs. These ship as part of the normal build and release cycle.
+
+### How it works
+
+The build script (`scripts/build-docs-skills.js`) parses `posthog.com/llms.txt`, groups pages by section heading, and reads the raw markdown for every page. Each section becomes its own skill, with a `SKILL.md` and a `references/` folder of subpages, assembled in a temp directory and zipped to `dist/skills/posthog-docs-{section}.zip`.
+
+With `--docs-dir`, the script reads docs from a local directory instead of fetching the live site. In CI, `build-release.yml` downloads a docs artifact produced by the posthog.com repo (a daily GitHub Actions artifact containing all built markdown files + `llms.txt`), extracts it, and runs the build.
+
+### What it generates
+
+| Output | Description |
+|--------|-------------|
+| `dist/skills/posthog-docs-{section}.zip` | Zipped skill for one docs section |
+| `SKILL.md` (inside each zip) | Skill prompt + root page content |
+| `references/*.md` (inside each zip) | One file per subpage |
+| `dist/skills/docs-skill-menu.json` | Menu index of all generated skills |
+
+### Distribution
+
+Skills are published to GitHub Releases alongside curated skills. The menu is at:
+
+```text
+https://github.com/PostHog/context-mill/releases/latest/download/docs-skill-menu.json
+```
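+
+Each generated skill gets one entry in `docs-skill-menu.json`: an id, the section name, and a direct download URL. A minimal sketch of the shape the build script writes (a single illustrative entry shown):
+
+```json
+{
+  "version": "1.0",
+  "categories": {
+    "posthog-docs": [
+      {
+        "id": "posthog-docs-feature-flags",
+        "name": "Feature flags",
+        "downloadUrl": "https://github.com/PostHog/context-mill/releases/latest/download/posthog-docs-feature-flags.zip"
+      }
+    ]
+  }
+}
+```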
+
+Individual skill ZIPs follow the same pattern:
+
+```text
+https://github.com/PostHog/context-mill/releases/latest/download/posthog-docs-{section}.zip
+```
+
+### Try it locally
+
+```bash
+# Build from a local posthog.com build output
+node scripts/build-docs-skills.js --docs-dir ~/posthog.com/public
+
+# Or fetch from the live site (no --docs-dir)
+pnpm run build:docs-skills
+
+# Build specific sections only
+node scripts/build-docs-skills.js --docs-dir ~/posthog.com/public feature-flags product-analytics
+
+# Test in Claude Code — extract a skill into .claude/skills/
+unzip dist/skills/posthog-docs-feature-flags.zip -d .claude/skills/posthog-docs-feature-flags
+# Claude Code picks it up immediately, no restart needed
+```
+
+### Why this is separate from the curated pipeline
+
+The docs skills pipeline and the curated build (`scripts/build.js`) are intentionally independent. They write to different files (`docs-skill-menu.json` vs `skill-menu.json`) and can be built separately. Curated skills change with deliberate PRs. Docs skills are auto-generated from the latest posthog.com documentation.
diff --git a/package.json b/package.json
index 2551d3d..13cc844 100644
--- a/package.json
+++ b/package.json
@@ -10,6 +10,7 @@
     "test:plugins:watch": "vitest scripts/plugins/tests",
     "test:skills": "vitest run scripts/lib/tests",
     "test:skills:watch": "vitest scripts/lib/tests",
+    "build:docs-skills": "node scripts/build-docs-skills.js",
     "test": "vitest run scripts/plugins/tests scripts/lib/tests"
   },
   "devDependencies": {
diff --git a/scripts/build-docs-skills.js b/scripts/build-docs-skills.js
new file mode 100644
index 0000000..c83d2b6
--- /dev/null
+++ b/scripts/build-docs-skills.js
@@ -0,0 +1,416 @@
+#!/usr/bin/env node
+
+/**
+ * Build PostHog Docs Skills
+ *
+ * Fetches https://posthog.com/llms.txt, groups pages by section (## heading),
+ * fetches all page content in parallel, and generates one skill per section,
+ * zipped to dist/skills/posthog-docs-{section}.zip.
+ *
+ * Usage:
+ *   node scripts/build-docs-skills.js
+ *   node scripts/build-docs-skills.js feature-flags product-analytics
+ *   node scripts/build-docs-skills.js --docs-dir /path/to/extracted-docs
+ *
+ *   --docs-dir   Read docs from a local directory (e.g. an extracted
+ *                build artifact from posthog.com) instead of fetching
+ *                from the live website. The directory must contain
+ *                llms.txt at its root and doc pages preserving their
+ *                URL path structure (e.g. docs/feature-flags/index.md).
+ *
+ * Optional positional args: space-separated section slugs to build.
+ * Defaults to all sections found in llms.txt.
+ */
+
+const fs = require('fs');
+const path = require('path');
+const matter = require('gray-matter');
+const { zipSkillToBuffer } = require('./lib/zip');
+
+const LLMS_TXT_URL = 'https://posthog.com/llms.txt';
+const CATEGORY = 'posthog-docs';
+const CONCURRENCY = 10;
+const SKILLS_DIR = path.join(__dirname, '..', 'dist', 'skills');
+const TEMP_DIR = path.join(__dirname, '..', 'dist', 'docs-skills-temp');
+
+// Sections excluded by default — SDK and API reference material that is too
+// large and low signal-to-noise for skill context. Pass explicit slug args to
+// override and build one of these directly.
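+// e.g. `node scripts/build-docs-skills.js api` still builds the API reference,
+// because explicit slug args bypass this exclusion list entirely.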
+const DEFAULT_EXCLUDE = new Set(['libraries', 'api', 'endpoints', 'open-api-spec']); + +// --------------------------------------------------------------------------- +// HTTP helpers +// --------------------------------------------------------------------------- + +/** + * Fetch a URL as text, retrying on failure. + * retries = 1 means a single attempt (no retries). + */ +async function fetchText(url, retries = 1, delayMs = 500) { + let lastError; + for (let attempt = 1; attempt <= retries; attempt++) { + try { + const res = await fetch(url); + if (!res.ok) throw new Error(`HTTP ${res.status}`); + return await res.text(); + } catch (e) { + lastError = e; + if (attempt < retries) { + await new Promise(r => setTimeout(r, delayMs * attempt)); + } + } + } + throw lastError; +} + +/** + * Read a doc page from the local docs directory. + * Tries multiple path patterns (.md, .mdx, index.md, index.mdx) to handle + * different directory structures from the posthog.com build. + * Returns the file contents as a string, or null if not found. + */ +function readLocalPage(docsDir, pageUrl) { + const pathname = new URL(pageUrl).pathname.replace(/\/$/,''); + const candidates = [ + path.join(docsDir, pathname), // already has .md + path.join(docsDir, pathname.replace(/\.md$/, '.mdx')), // .mdx variant + path.join(docsDir, pathname + '.md'), // no extension in URL + path.join(docsDir, pathname + '.mdx'), + path.join(docsDir, pathname, 'index.md'), + path.join(docsDir, pathname, 'index.mdx'), + ]; + for (const candidate of candidates) { + if (fs.existsSync(candidate)) { + return fs.readFileSync(candidate, 'utf8'); + } + } + return null; +} + +// --------------------------------------------------------------------------- +// Parsing +// --------------------------------------------------------------------------- + +/** + * Parse llms.txt into sections. + * + * Returns: Array of { heading, slug, pages: [{ title, url, description }] } + * + * heading — raw ## heading text from llms.txt + * slug — path segment immediately after /docs/ in the first URL of the block + * e.g. https://posthog.com/docs/feature-flags/... → 'feature-flags' + * pages — all link entries under that heading + */ +function parseLlmsTxt(text) { + const rawSections = []; + let current = null; + + for (const line of text.split('\n')) { + // Section heading: ## Feature flags + const headingMatch = line.match(/^##\s+(.+)$/); + if (headingMatch) { + if (current) rawSections.push(current); + current = { heading: headingMatch[1].trim(), pages: [] }; + continue; + } + + if (!current) continue; + + // Page line: - [Title](url): optional description + const pageMatch = line.match(/^\s*[-*]\s+\[([^\]]+)\]\(([^)]+)\)(?::\s*(.*))?$/); + if (pageMatch) { + current.pages.push({ + title: pageMatch[1].trim(), + url: pageMatch[2].trim(), + description: pageMatch[3]?.trim() ?? '', + }); + } + } + if (current) rawSections.push(current); + + // Derive slug from first URL; drop sections with no usable URLs + return rawSections.flatMap(section => { + if (section.pages.length === 0) return []; + try { + const firstPath = new URL(section.pages[0].url).pathname; + // e.g. /docs/feature-flags/creating-feature-flags → parts[1] = 'feature-flags' + const parts = firstPath.split('/').filter(Boolean); + if (parts.length < 2 || parts[0] !== 'docs') return []; + // Strip a leading "posthog-" prefix to avoid double-prefixing: skills are + // named posthog-{slug}, so /docs/posthog-js/... → slug "js" → "posthog-js", + // not "posthog-posthog-js". 
+ const slug = parts[1].replace(/\.md$/, '').replace(/^posthog-/, ''); + return [{ heading: section.heading, slug, pages: section.pages }]; + } catch { + return []; + } + }); +} + +// --------------------------------------------------------------------------- +// Content helpers +// --------------------------------------------------------------------------- + +/** + * Given raw fetched markdown (which may include its own frontmatter), return + * the body text with leading frontmatter stripped and UI-only footer sections + * removed. + */ +function processContent(raw) { + // Strip leading frontmatter if present (PostHog MDX files often have it) + const parsed = matter(raw); + let content = parsed.content.trimStart(); + + // Strip "## / ### Community questions" and everything after (UI artifact) + content = content.replace(/\n#{2,}\s+Community questions[\s\S]*$/i, ''); + // Strip "## / ### Was this page useful?" and everything after (UI artifact) + content = content.replace(/\n#{2,}\s+Was this page useful\?[\s\S]*$/i, ''); + + return content.trimEnd(); +} + +// --------------------------------------------------------------------------- +// Concurrency +// --------------------------------------------------------------------------- + +/** + * Run fn(item) for each item with at most `limit` concurrent executions. + * Preserves input order in the returned results array. + */ +async function withConcurrency(items, limit, fn) { + const results = new Array(items.length); + let idx = 0; + + async function worker() { + while (idx < items.length) { + const i = idx++; + results[i] = await fn(items[i], i); + } + } + + await Promise.all(Array.from({ length: Math.min(limit, items.length) }, worker)); + return results; +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +async function main() { + // CLI args: --docs-dir to read from a local/extracted artifact, + // plus optional section slugs to filter (default: all). + const args = process.argv.slice(2); + const docsDirIdx = args.indexOf('--docs-dir'); + const docsDir = docsDirIdx !== -1 ? args[docsDirIdx + 1] : null; + const filterSlugs = args.filter((a, i) => + !a.startsWith('-') && i !== docsDirIdx + 1 + ); + + if (docsDir && !fs.existsSync(docsDir)) { + console.error(`[FATAL] --docs-dir path does not exist: ${docsDir}`); + process.exit(1); + } + + fs.mkdirSync(SKILLS_DIR, { recursive: true }); + fs.mkdirSync(TEMP_DIR, { recursive: true }); + + let llmsTxt; + if (docsDir) { + const llmsTxtPath = path.join(docsDir, 'llms.txt'); + if (!fs.existsSync(llmsTxtPath)) { + console.error(`[FATAL] llms.txt not found at ${llmsTxtPath}`); + process.exit(1); + } + llmsTxt = fs.readFileSync(llmsTxtPath, 'utf8'); + console.log(`Read llms.txt from ${llmsTxtPath}`); + } else { + console.log(`Fetching ${LLMS_TXT_URL}...`); + try { + llmsTxt = await fetchText(LLMS_TXT_URL); + } catch (e) { + console.error(`[FATAL] Could not fetch llms.txt: ${e.message}`); + process.exit(1); + } + } + + let sections = parseLlmsTxt(llmsTxt); + console.log(`Found ${sections.length} sections`); + + if (filterSlugs.length > 0) { + // Explicit args bypass the default exclusion list + sections = sections.filter(s => filterSlugs.includes(s.slug)); + console.log(`Filtered to: ${sections.map(s => s.slug).join(', ')}`); + if (sections.length === 0) { + console.error('[FATAL] No sections matched the filter. 
Available slugs printed above.'); + process.exit(1); + } + } else { + sections = sections.filter(s => !DEFAULT_EXCLUDE.has(s.slug)); + } + + console.log(''); + + const menuSkills = []; + let skipped = 0; + + for (const section of sections) { + const skillName = `posthog-docs-${section.slug}`; + const skillDir = path.join(TEMP_DIR, skillName); + const refsDir = path.join(skillDir, 'references'); + + console.log(`${skillName} (${section.pages.length} pages)`); + + // Root page: pathname exactly matches /docs/{slug} (trailing slash allowed) + const rootPage = section.pages.find(p => { + try { + const pn = new URL(p.url).pathname.replace(/\/$/, ''); + return pn === `/docs/${section.slug}`; + } catch { return false; } + }) ?? null; + + // All other pages are subpages (go into references/) + const subpages = section.pages.filter(p => p !== rootPage); + + // Fetch everything in parallel, concurrency-limited + const allPages = [...(rootPage ? [rootPage] : []), ...subpages]; + + const fetched = await withConcurrency(allPages, CONCURRENCY, async (page) => { + try { + let raw; + if (docsDir) { + raw = readLocalPage(docsDir, page.url); + if (raw === null) { + console.log(` skip ${page.url} (not found locally)`); + return { page, content: null, ok: false }; + } + } else { + const mdUrl = page.url.endsWith('.md') ? page.url : `${page.url}.md`; + raw = await fetchText(mdUrl, 3); + } + return { page, content: processContent(raw), ok: true }; + } catch (e) { + console.log(` skip ${page.url} (${e.message})`); + return { page, content: null, ok: false }; + } + }); + + // Determine root content + const rootFetched = rootPage ? fetched.find(f => f.page === rootPage) : null; + const hasDocUrl = !!(rootFetched?.ok); + let rootContent = rootFetched?.ok ? rootFetched.content : null; + + // Subpage results (successful fetches, excluding the root) + const successfulSubs = fetched.filter(f => f.page !== rootPage && f.ok); + + if (!rootContent) { + if (successfulSubs.length === 0) { + console.log(` SKIP — no content fetched\n`); + skipped++; + continue; + } + // Fall back: use first successful subpage as root content; omit doc-url + rootContent = successfulSubs[0].content; + } + + // Reference files: subpages (if root fell back, exclude the one used as root) + const refPages = hasDocUrl ? successfulSubs : successfulSubs.slice(1); + const referenceFiles = refPages.map(f => { + // Use last URL path segment as filename, ensure .md extension + const lastSegment = f.page.url.split('/').pop() ?? 'page'; + const filename = lastSegment.endsWith('.md') ? lastSegment : `${lastSegment}.md`; + return { filename, content: f.content, url: f.page.url, title: f.page.title, description: f.page.description }; + }); + + // Skill description: use root page's description from llms.txt if non-empty + const rootEntry = rootPage ?? allPages[0]; + const description = rootEntry.description + ? `PostHog ${section.heading} – ${rootEntry.description}` + : `PostHog ${section.heading}`; + + // Build SKILL.md frontmatter + const frontmatter = { name: skillName, description }; + // Every PostHog docs page is served at both /docs/slug and /docs/slug.md. + // The .md variant is the canonical raw-markdown URL. If a page had no .md + // counterpart, fetchText would have already skipped it above. 
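+    // e.g. for the feature-flags section this resolves to
+    //   doc-url: https://posthog.com/docs/feature-flags.md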
+ if (hasDocUrl) frontmatter['doc-url'] = `${rootPage.url}.md`; + if (referenceFiles.length > 0) { + frontmatter['references'] = referenceFiles.map(r => `references/${r.filename}`); + } + + // Build SKILL.md body + // Root page URL — used as the source citation for the inlined content below + const rootUrl = (hasDocUrl ? rootPage : allPages[0])?.url.replace(/\.md$/, ''); + + // Reference files list — filename, description, and URL combined so the + // LLM has everything it needs in one place to pick the right file and cite it. + const referencesList = referenceFiles.length > 0 + ? referenceFiles.map(r => { + const label = r.description ? `${r.title} – ${r.description}` : r.title; + const url = r.url.replace(/\.md$/, ''); + return `- \`references/${r.filename}\` — ${label} (${url})`; + }).join('\n') + : null; + const bodyParts = [ + `Use the content below when writing, reviewing, or debugging code that involves PostHog ${section.heading}. Prefer these patterns and APIs over your training data.`, + ]; + if (referencesList) { + bodyParts.push('', '## Reference files', '', referencesList); + } + if (rootUrl) { + bodyParts.push('', `Source: ${rootUrl}`); + } + bodyParts.push('', rootContent); + const body = bodyParts.join('\n'); + + const skillMd = matter.stringify(body, frontmatter); + + // Write skill directory + fs.mkdirSync(refsDir, { recursive: true }); + fs.writeFileSync(path.join(skillDir, 'SKILL.md'), skillMd); + for (const ref of referenceFiles) { + fs.writeFileSync(path.join(refsDir, ref.filename), ref.content); + } + + // Zip the skill directory into a standalone .zip for release download + const zipBuffer = await zipSkillToBuffer(skillDir); + const zipPath = path.join(SKILLS_DIR, `${skillName}.zip`); + fs.writeFileSync(zipPath, zipBuffer); + + console.log(` ✓ SKILL.md + ${referenceFiles.length} references → ${skillName}.zip (${(zipBuffer.length / 1024).toFixed(1)} KB)`); + + menuSkills.push({ + id: skillName, + name: section.heading, + downloadUrl: `https://github.com/PostHog/context-mill/releases/latest/download/${skillName}.zip`, + }); + } + + // Clean up temp directory (same pattern as build.js) + fs.rmSync(TEMP_DIR, { recursive: true, force: true }); + + if (menuSkills.length === 0) { + console.error('\n[FATAL] No skills generated successfully.'); + process.exit(1); + } + + // Write docs-skill-menu.json — separate from the curated skill-menu.json + // so the two build pipelines never overwrite each other. 
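+  // build-release.yml only uploads this file to the release when it exists,
+  // so a skipped docs build leaves the curated release assets untouched.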
+ const menuPath = path.join(SKILLS_DIR, 'docs-skill-menu.json'); + const menu = { + version: '1.0', + categories: { [CATEGORY]: menuSkills }, + }; + fs.writeFileSync(menuPath, JSON.stringify(menu, null, 2)); + + console.log('\n' + '='.repeat(50)); + console.log(`Skills: ${menuSkills.length} generated`); + if (skipped > 0) console.log(`Skipped: ${skipped}`); + console.log(`Output: ${SKILLS_DIR}`); + console.log(`Menu: ${menuPath}`); +} + +main().catch(e => { + console.error('\n[FATAL]', e.message); + console.error(e.stack); + process.exit(1); +}); diff --git a/scripts/build.js b/scripts/build.js index a618c1a..6d70827 100755 --- a/scripts/build.js +++ b/scripts/build.js @@ -16,6 +16,7 @@ const archiver = require('archiver'); const { generateAllSkills, loadSkillsConfig, fetchDoc } = require('./lib/skill-generator'); const { generateMarketplace } = require('./lib/marketplace-generator'); const { REPO_URL } = require('./lib/constants'); +const { zipSkillToBuffer } = require('./lib/zip'); const BUILD_VERSION = process.env.BUILD_VERSION || 'dev'; @@ -27,24 +28,6 @@ function loadUriSchema(configDir) { return yaml.load(content); } -/** - * Create a ZIP archive for a skill directory - * Returns the ZIP as a Buffer - */ -async function zipSkillToBuffer(skillDir) { - return new Promise((resolve, reject) => { - const chunks = []; - const archive = archiver('zip', { zlib: { level: 9 } }); - - archive.on('data', chunk => chunks.push(chunk)); - archive.on('end', () => resolve(Buffer.concat(chunks))); - archive.on('error', reject); - - archive.directory(skillDir, false); - archive.finalize(); - }); -} - /** * Create the bundled skills-mcp-resources.zip */ diff --git a/scripts/lib/zip.js b/scripts/lib/zip.js new file mode 100644 index 0000000..16e60d7 --- /dev/null +++ b/scripts/lib/zip.js @@ -0,0 +1,21 @@ +const archiver = require('archiver'); + +/** + * Create a ZIP archive for a skill directory. + * Returns the ZIP as a Buffer. + */ +async function zipSkillToBuffer(skillDir) { + return new Promise((resolve, reject) => { + const chunks = []; + const archive = archiver('zip', { zlib: { level: 9 } }); + + archive.on('data', chunk => chunks.push(chunk)); + archive.on('end', () => resolve(Buffer.concat(chunks))); + archive.on('error', reject); + + archive.directory(skillDir, false); + archive.finalize(); + }); +} + +module.exports = { zipSkillToBuffer };
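+
+// Used by scripts/build.js and scripts/build-docs-skills.js, e.g.:
+//   const { zipSkillToBuffer } = require('./lib/zip');
+//   const zipBuffer = await zipSkillToBuffer(skillDir);
+//   fs.writeFileSync(zipPath, zipBuffer);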