diff --git a/components/ui/Autocomplete.tsx b/components/ui/Autocomplete.tsx index 36ea233c4..bd443919e 100644 --- a/components/ui/Autocomplete.tsx +++ b/components/ui/Autocomplete.tsx @@ -29,7 +29,11 @@ import { MenuItem } from "@telegraph/menu"; import { Tag } from "@telegraph/tag"; import { Code, Text } from "@telegraph/typography"; -import { DocsSearchItem, EndpointSearchItem } from "@/types"; +import { + DocsSearchItem, + EndpointSearchItem, + EnhancedDocsSearchItem, +} from "@/types"; import { useInkeepModal } from "../AiChatButton"; import { useAskAi } from "../AskAiContext"; @@ -62,7 +66,9 @@ function createAskAiPrompt(query: string): string { return `Can you tell me about ${query}`; } -type ResultItem = (DocsSearchItem & BaseItem) | (EndpointSearchItem & BaseItem); +type ResultItem = + | (EnhancedDocsSearchItem & BaseItem) + | (EndpointSearchItem & BaseItem); const algoliaAppId = process.env.NEXT_PUBLIC_ALGOLIA_APP_ID || ""; const algoliaSearchApiKey = @@ -175,6 +181,9 @@ const DocsSearchResult = ({ const href = `/${item.path}`; const isApiRef = isApiReferencePath(item.path); + const enhancedItem = item as EnhancedDocsSearchItem; + const showPageTitle = !enhancedItem.isPageLevel && enhancedItem.pageTitle; + const content = ( @@ -193,6 +202,7 @@ const DocsSearchResult = ({ )} + {showPageTitle ? 
`${enhancedItem.pageTitle} · ` : ""} {item.section} @@ -400,7 +410,7 @@ const Autocomplete = () => { ], transformResponse({ hits: hitsArray }) { const hits = hitsArray as ( - | DocsSearchItem[] + | EnhancedDocsSearchItem[] | EndpointSearchItem[] )[]; // Add the "Ask AI" item at the top of the results @@ -823,7 +833,7 @@ const Autocomplete = () => { /> ) : ( autocomplete.setQuery("")} /> )} diff --git a/lib/content.server.ts b/lib/content.server.ts index ad0d45e03..0d1ae2ad6 100644 --- a/lib/content.server.ts +++ b/lib/content.server.ts @@ -1,16 +1,9 @@ import fs from "fs"; import path from "path"; -import algoliasearch from "algoliasearch"; -import type { FrontMatter, DocsSearchItem } from "../types"; export const CONTENT_DIR = "content/"; export const DOCS_FILE_EXTENSIONS = [".mdx", ".md"]; -/** - * This is to index our .md and .mdx file content. - * API/mAPI reference content is indexed at script/indexApisForSearch.ts. - */ - export const getAllFilesInDir = ( directory: string, files: string[] = [], @@ -31,48 +24,5 @@ export const getAllFilesInDir = ( }; export function makeIdFromPath(resourcePath) { - return resourcePath.replace(/\.mdx?$/, "").replace("/index", ""); -} - -export async function generateAlgoliaIndex(frontmatter: FrontMatter) { - const algoliaAppId = process.env.NEXT_PUBLIC_ALGOLIA_APP_ID ?? ""; - const algoliaAdminApiKey = process.env.ALGOLIA_ADMIN_API_KEY ?? ""; - const algoliaIndexName = process.env.NEXT_PUBLIC_ALGOLIA_INDEX_NAME ?? ""; - - if (algoliaAppId && algoliaAdminApiKey && algoliaIndexName) { - const client = algoliasearch(algoliaAppId, algoliaAdminApiKey); - const index = client.initIndex(algoliaIndexName); - - try { - // Notes: - // Algolia recommends saving objects in batches because of efficiency. - // Our markdown processor doesn't provide a callback to subscribe to that - // gets called after finishing with all elements. - // - // Given we only have ~40 items to be indexed right now, we are just saving - // entries one by one. 
- const object: DocsSearchItem = { - // The path to the page will be the identifier in Algolia. - objectID: frontmatter.id, - path: frontmatter.id, - title: frontmatter.title, - section: frontmatter.section, - // Once we add tags are added to pages, Algolia records - // will be updated with them, so we can enhance the search experience - tags: frontmatter.tags || [], - // Saving a content page, not an API endpoint - contentType: "document", - // Saving to the pages index - index: "pages", - }; - - await index.saveObject(object); - } catch (e) { - console.error(e); - } - } else { - console.info( - "Algolia configuration variables not present. Skipping indexing.", - ); - } + return resourcePath.replace(/\.mdx?$/, "").replace(/\/index$/, ""); } diff --git a/package.json b/package.json index 3da0302d4..a271ae75f 100644 --- a/package.json +++ b/package.json @@ -16,10 +16,11 @@ "generate-llms": "yarn run open-api-to-md && tsx scripts/generateApiMarkdown.ts && tsx scripts/generateLlmsTxt.ts", "generate-reference-md": "tsx scripts/generateApiMarkdown.ts", "index-apis": "tsx scripts/indexApisForSearch.ts", + "index-docs": "tsx scripts/indexDocsForSearch.ts", "open-api-to-md": "bash scripts/openApiToMd.sh", "split-specs": "tsx scripts/splitOpenApiSpec.ts", "predev": "yarn split-specs && yarn generate-llms", - "prebuild": "yarn split-specs && yarn generate-llms && yarn index-apis" + "prebuild": "yarn split-specs && yarn generate-llms && yarn index-docs && yarn index-apis" }, "dependencies": { "@algolia/autocomplete-js": "^1.6.3", diff --git a/pages/[...slug].tsx b/pages/[...slug].tsx index 00d485c78..c66cd4c52 100644 --- a/pages/[...slug].tsx +++ b/pages/[...slug].tsx @@ -14,7 +14,6 @@ import { CONTENT_DIR, DOCS_FILE_EXTENSIONS, makeIdFromPath, - generateAlgoliaIndex, } from "../lib/content.server"; import eventPayload from "../data/code/sources/eventPayload"; import datadogDashboardJson from "../content/integrations/extensions/datadog_dashboard.json"; @@ -96,9 +95,6 @@ 
export async function getStaticProps({ params: { slug } }) { // Extend frontmatter mdxSource.frontmatter.id = makeIdFromPath(slug.join(sep)); - // Index page in algolia - await generateAlgoliaIndex(mdxSource.frontmatter); - return { props: { source: mdxSource, sourcePath, typedocs } }; } diff --git a/scripts/indexApisForSearch.ts b/scripts/indexApisForSearch.ts index df0b191fe..c631df365 100644 --- a/scripts/indexApisForSearch.ts +++ b/scripts/indexApisForSearch.ts @@ -3,13 +3,19 @@ import { readOpenApiSpec, readStainlessSpec, } from "@/lib/openApiSpec"; -import { RESOURCE_ORDER as API_RESOURCE_ORDER } from "@/data/sidebars/apiOverviewSidebar"; -import { RESOURCE_ORDER as MAPI_RESOURCE_ORDER } from "@/data/sidebars/mapiOverviewSidebar"; +import { + API_REFERENCE_OVERVIEW_CONTENT, + RESOURCE_ORDER as API_RESOURCE_ORDER, +} from "@/data/sidebars/apiOverviewSidebar"; +import { + MAPI_REFERENCE_OVERVIEW_CONTENT, + RESOURCE_ORDER as MAPI_RESOURCE_ORDER, +} from "@/data/sidebars/mapiOverviewSidebar"; import algoliasearch from "algoliasearch"; import { resolveEndpointFromMethod } from "@/components/ui/ApiReference/helpers"; import JSONPointer from "jsonpointer"; import { loadEnvConfig } from "@next/env"; -import type { DocsSearchItem, EndpointSearchItem } from "@/types"; +import type { EndpointSearchItem, EnhancedDocsSearchItem } from "@/types"; import { readFile } from "fs/promises"; import path from "path"; @@ -29,8 +35,17 @@ const algoliaEndpointIndexName = let indexCount = 0; let endpointCount = 0; +const MAX_CONTENT_LENGTH = 2000; + +interface StaticContentHeading { + level: number; + title: string; + slug: string; + content: string; +} + async function validateSearchObject( - object: DocsSearchItem | EndpointSearchItem, + object: EnhancedDocsSearchItem | EndpointSearchItem, ) { if (object.path.startsWith("/")) { return Promise.reject( @@ -45,11 +60,16 @@ async function validateSearchObject( * So we're pretty good for now and can either expand that limit or * save 
objects in chunks. These batches work for now! */ -let pagesToSave: DocsSearchItem[] = []; -async function queuePage(object: DocsSearchItem) { +let pagesToSave: EnhancedDocsSearchItem[] = []; +async function queuePage(object: EnhancedDocsSearchItem) { await validateSearchObject(object); // Keep this in for logging purposes - console.log("Indexing page:", object.title, object.path); + console.log( + "Indexing page:", + object.title, + object.path, + `(${object.content.length} content chars)`, + ); indexCount++; pagesToSave.push(object); return; @@ -65,6 +85,115 @@ async function queueEndpoint(object: EndpointSearchItem) { return; } +function buildApiPageSearchItem({ + objectID, + path, + title, + pageTitle = title, + section, + content = "", + tags = [], + headingLevel = 0, + isPageLevel = true, +}: { + objectID: string; + path: string; + title: string; + pageTitle?: string; + section: string; + content?: string; + tags?: string[]; + headingLevel?: number; + isPageLevel?: boolean; +}): EnhancedDocsSearchItem { + return { + objectID, + path, + title, + pageTitle, + content: content.slice(0, MAX_CONTENT_LENGTH), + section, + tags, + headingLevel, + contentType: "api-reference", + index: "pages", + isPageLevel, + }; +} + +function extractTextContent(mdxContent: string): string { + let content = mdxContent; + + content = content.replace(/^---[\s\S]*?---\n*/g, ""); + content = content.replace(/```[^\n]*\n([\s\S]*?)```/g, "$1"); + content = content.replace(/`([^`]+)`/g, "$1"); + content = content.replace(/!\[([^\]]*)\]\([^)]+\)/g, ""); + content = content.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1"); + content = content.replace(/\{\/\*[\s\S]*?\*\/\}/g, ""); + content = content.replace(/<\/?[A-Za-z][^>]*>/g, " "); + content = content.replace(/[{}[\](),"]/g, " "); + content = content.replace(/\s+/g, " "); + + return content.trim(); +} + +function getOverviewSectionTitle(name: "api" | "mapi", path: string): string { + const overviewContent = + name === "api" + ? 
API_REFERENCE_OVERVIEW_CONTENT + : MAPI_REFERENCE_OVERVIEW_CONTENT; + const pathWithoutOverview = path.replace(/^\/overview/, "") || "/"; + const page = overviewContent[0]?.pages.find( + ({ slug }) => slug === pathWithoutOverview, + ); + + return page?.title ?? "Overview"; +} + +function slugify(text: string): string { + return text + .trim() + .toLowerCase() + .replace(/[^\w\s-]/g, "") + .replace(/\s+/g, "-") + .replace(/-+/g, "-"); +} + +function extractHeadings(content: string): StaticContentHeading[] { + const headingRegex = /^(#{2,3})\s+(.+)$/gm; + const matches: Array<{ index: number; level: number; title: string }> = []; + const headings: StaticContentHeading[] = []; + let match; + + while ((match = headingRegex.exec(content)) !== null) { + matches.push({ + index: match.index, + level: match[1].length, + title: match[2].trim(), + }); + } + + for (let index = 0; index < matches.length; index++) { + const current = matches[index]; + const next = matches[index + 1]; + const contentStart = + current.index + `${"#".repeat(current.level)} ${current.title}`.length; + const contentEnd = next ? next.index : content.length; + const cleanContent = extractTextContent( + content.slice(contentStart, contentEnd), + ); + + headings.push({ + level: current.level, + title: current.title, + slug: slugify(current.title), + content: cleanContent, + }); + } + + return headings; +} + async function indexResource({ apiName, openApiSpec, @@ -99,34 +228,37 @@ async function indexResource({ const sectionName = section + " > " + resourceName; - const resourceObject: DocsSearchItem = { - // The path to the page will be the identifier in Algolia. 
+ const resourceObject = buildApiPageSearchItem({ objectID: `page-${staticName}-${basePath}`, path: basePath, title: resourceName, section, - tags: [], - contentType: "api-reference", - index: "pages", - }; + content: resourceName, + }); await queuePage(resourceObject); // Methods like get, post, put, delete - Object.keys(methods).forEach(async (methodName) => { + for (const methodName of Object.keys(methods)) { const method = methods[methodName]; const [methodType, endpoint] = resolveEndpointFromMethod(method); const openApiOperation = openApiSpec.paths?.[endpoint]?.[methodType]; const title = openApiOperation?.summary; const methodUrl = `${basePath}/${methodName}`; - const docsSearchItem: DocsSearchItem = { + const docsSearchItem = buildApiPageSearchItem({ objectID: `page-${staticName}-${methodUrl}`, title, path: methodUrl, section: sectionName, - tags: [], - contentType: "api-reference", - index: "pages", - }; + tags: openApiOperation?.tags ?? [], + content: [ + openApiOperation?.summary, + openApiOperation?.description, + methodType?.toUpperCase(), + endpoint, + ] + .filter(Boolean) + .join(" "), + }); await queuePage(docsSearchItem); const formattedApiName = apiName === "api" ? "API" : "mAPI"; @@ -140,28 +272,28 @@ async function indexResource({ index: "endpoints", }; await queueEndpoint(endpointSearchItem); - }); + } // Handle the schemas - Object.keys(models).forEach(async (modelName) => { + for (const modelName of Object.keys(models)) { const modelRef = models[modelName]; const modelUrl = `${basePath}/schemas/${modelName}`; const schema = JSONPointer.get(openApiSpec, modelRef.replace("#", "")); const title = schema?.title ?? 
modelName; - const modelObject: DocsSearchItem = { + const modelObject = buildApiPageSearchItem({ objectID: `page-${staticName}-${modelUrl}`, title, path: modelUrl, section: sectionName + " > " + "Object definitions", - tags: [], - contentType: "api-reference", - index: "pages", - }; + content: [schema?.title, schema?.description, modelName] + .filter(Boolean) + .join(" "), + }); await queuePage(modelObject); - }); + } // Subresources like BulkOperations - Object.keys(resource.subresources ?? {}).forEach(async (subresourceName) => { + for (const subresourceName of Object.keys(resource.subresources ?? {})) { if (!resource.subresources) return; const subresource = resource.subresources[subresourceName]; // Recursively index the subresource @@ -173,7 +305,7 @@ async function indexResource({ pathPrefix: basePath, section: sectionName, }); - }); + } } async function indexApi(name: "api" | "mapi") { @@ -209,13 +341,19 @@ async function indexStaticContent(name: "api" | "mapi") { // Find all sections in the static content and index them function extractSectionInfo(content: string) { - const sections: Array<{ title?: string; path?: string }> = []; - const parser = new RegExp(//g); + const sections: Array<{ + title: string; + path: string; + content: string; + headings: StaticContentHeading[]; + }> = []; + const parser = new RegExp(/<Section([^>]*)>([\s\S]*?)<\/Section>/g); const attributeParser = /(\w+)="([^"]*)"/g; let match; while ((match = parser.exec(content)) !== null) { const attributes = match[1]; + const body = match[2]; const sectionInfo: { title?: string; path?: string } = {}; let attrMatch; @@ -225,25 +363,45 @@ async function indexStaticContent(name: "api" | "mapi") { if (name === "path") sectionInfo.path = value; } - sections.push(sectionInfo); + if (!sectionInfo.path) continue; + + sections.push({ + path: sectionInfo.path, + title: + sectionInfo.title ?? 
getOverviewSectionTitle(name, sectionInfo.path), + content: extractTextContent(body), + headings: extractHeadings(body), + }); } return sections; } const sections = extractSectionInfo(staticContent); for (const section of sections) { - const { title, path } = section; - if (!title || !path) continue; - const sectionObject: DocsSearchItem = { + const { title, path, content, headings } = section; + const sectionObject = buildApiPageSearchItem({ objectID: `page-section-${name}-reference${path}`, path: `${name}-reference${path}`, title, section: name === "api" ? "API Reference" : "mAPI Reference", - tags: [], - contentType: "api-reference", - index: "pages", - }; + content, + }); await queuePage(sectionObject); + + for (const heading of headings) { + const headingPath = `${name}-reference${path}#${heading.slug}`; + const headingObject = buildApiPageSearchItem({ + objectID: `heading-section-${headingPath}`, + path: headingPath, + title: heading.title, + pageTitle: title, + section: name === "api" ? "API Reference" : "mAPI Reference", + content: heading.content, + headingLevel: heading.level, + isPageLevel: false, + }); + await queuePage(headingObject); + } } return staticContent; } diff --git a/scripts/indexDocsForSearch.ts b/scripts/indexDocsForSearch.ts new file mode 100644 index 000000000..e87856c26 --- /dev/null +++ b/scripts/indexDocsForSearch.ts @@ -0,0 +1,407 @@ +import fs from "fs"; +import path from "path"; +import { unified } from "unified"; +import remarkParse from "remark-parse"; +import remarkFrontmatter from "remark-frontmatter"; +import yaml from "yaml"; +import algoliasearch from "algoliasearch"; +import { loadEnvConfig } from "@next/env"; +import type { EnhancedDocsSearchItem } from "@/types"; + +// Load Next.js environment variables +const projectDir = process.cwd(); +loadEnvConfig(projectDir); + +const algoliaAppId = process.env.NEXT_PUBLIC_ALGOLIA_APP_ID ?? ""; +const algoliaAdminApiKey = process.env.ALGOLIA_ADMIN_API_KEY ?? 
""; +const algoliaPagesIndexName = process.env.NEXT_PUBLIC_ALGOLIA_INDEX_NAME ?? ""; + +const CONTENT_DIR = path.join(projectDir, "content"); +const DOCS_FILE_EXTENSIONS = [".mdx", ".md"]; + +// Maximum content length per record (in characters) +// Algolia recommends keeping records small for better performance +const MAX_CONTENT_LENGTH = 2000; + +// Keep count of indexed items +let pageCount = 0; +let headingCount = 0; + +interface Heading { + level: number; + title: string; + slug: string; + content: string; +} + +interface Frontmatter { + title: string; + description?: string; + tags?: string[]; + section: string; +} + +/** + * Recursively get all files in a directory with specific extensions + */ +function getAllFilesInDir( + directory: string, + files: string[] = [], + extensions?: string[], +): string[] { + fs.readdirSync(directory).forEach((file) => { + const subpath = path.join(directory, file); + if (fs.lstatSync(subpath).isDirectory()) { + getAllFilesInDir(subpath, files, extensions); + } else { + if (!extensions || extensions.includes(path.extname(subpath))) { + files.push(subpath); + } + } + }); + + return files; +} + +/** + * Parse frontmatter from markdown content using remark + */ +async function parseFrontmatter( + markdownContent: string, +): Promise<Frontmatter | null> { + const file = await unified() + .use(remarkParse) + .use(remarkFrontmatter, ["yaml"]) + .parse(markdownContent); + + const yamlNode = file.children.find( + (node): node is { type: "yaml"; value: string } => node.type === "yaml", + ); + if (!yamlNode) return null; + return yaml.parse(yamlNode.value); +} + +/** + * Create a URL-friendly slug from a heading title + */ +function slugify(text: string): string { + return text + .toLowerCase() + .replace(/[^\w\s-]/g, "") // Remove non-word characters except spaces and hyphens + .replace(/\s+/g, "-") // Replace spaces with hyphens + .replace(/-+/g, "-") // Replace multiple hyphens with single + .trim(); +} + +/** + * Remove frontmatter from 
markdown content 
+ */ +function removeFrontmatter(content: string): string { + // Match YAML frontmatter at the start of the file + const frontmatterRegex = /^---[\s\S]*?---\n*/; + return content.replace(frontmatterRegex, ""); +} + +/** + * Extract plain text from markdown content + * Removes JSX components, imports, code blocks, and other non-text elements + */ +function extractTextContent(mdxContent: string): string { + let content = mdxContent; + + // Remove import statements + content = content.replace(/^import\s+.*$/gm, ""); + + // Remove export statements + content = content.replace(/^export\s+.*$/gm, ""); + + // Remove code blocks (fenced) + content = content.replace(/```[\s\S]*?```/g, ""); + + // Remove inline code + content = content.replace(/`[^`]+`/g, ""); + + // Remove JSX components (self-closing and with children) + content = content.replace(/<[A-Z][^>]*\/>/g, ""); // Self-closing like + content = content.replace(/<[A-Z][^>]*>[\s\S]*?<\/[A-Z][^>]*>/g, ""); // With children + + // Remove HTML-style components + content = content.replace(/<[a-z][^>]*>[\s\S]*?<\/[a-z][^>]*>/g, ""); + + // Remove remaining HTML/JSX tags + content = content.replace(/<[^>]+>/g, ""); + + // Remove markdown images (must come before links since images contain link syntax) + content = content.replace(/!\[([^\]]*)\]\([^)]+\)/g, ""); + + // Remove markdown links but keep the text + content = content.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1"); + + // Remove markdown emphasis markers + content = content.replace(/\*\*([^*]+)\*\*/g, "$1"); // Bold + content = content.replace(/\*([^*]+)\*/g, "$1"); // Italic + content = content.replace(/__([^_]+)__/g, "$1"); // Bold + content = content.replace(/_([^_]+)_/g, "$1"); // Italic + + // Remove heading markers + content = content.replace(/^#{1,6}\s+/gm, ""); + + // Remove horizontal rules + content = content.replace(/^[-*_]{3,}$/gm, ""); + + // Remove list markers + content = content.replace(/^\s*[-*+]\s+/gm, ""); + content = content.replace(/^\s*\d+\.\s+/gm, 
""); + + // Remove blockquote markers + content = content.replace(/^\s*>\s*/gm, ""); + + // Normalize whitespace + content = content.replace(/\n{3,}/g, "\n\n"); // Multiple newlines to double + content = content.replace(/[ \t]+/g, " "); // Multiple spaces to single + + return content.trim(); +} + +/** + * Extract headings with their content from markdown + */ +function extractHeadings(mdxContent: string): Heading[] { + // Remove frontmatter first + const contentWithoutFrontmatter = removeFrontmatter(mdxContent); + + const headingRegex = /^(#{2,3})\s+(.+)$/gm; + const headings: Heading[] = []; + let match; + + const matches: Array<{ index: number; level: number; title: string }> = []; + + while ((match = headingRegex.exec(contentWithoutFrontmatter)) !== null) { + matches.push({ + index: match.index, + level: match[1].length, + title: match[2].trim(), + }); + } + + // Extract content for each heading + for (let i = 0; i < matches.length; i++) { + const current = matches[i]; + const next = matches[i + 1]; + + const contentStart = + current.index + `${"#".repeat(current.level)} ${current.title}`.length; + const contentEnd = next ? 
next.index : contentWithoutFrontmatter.length; + const rawContent = contentWithoutFrontmatter.slice( + contentStart, + contentEnd, + ); + + const cleanContent = extractTextContent(rawContent); + + // Only include headings with meaningful content + if (cleanContent.length > 20) { + headings.push({ + level: current.level, + title: current.title, + slug: slugify(current.title), + content: cleanContent.slice(0, MAX_CONTENT_LENGTH), + }); + } + } + + return headings; +} + +/** + * Get the intro content (content before the first heading) + */ +function getIntroContent(mdxContent: string): string { + const contentWithoutFrontmatter = removeFrontmatter(mdxContent); + + // Find the first H2 or H3 heading + const firstHeadingMatch = contentWithoutFrontmatter.match(/^#{2,3}\s+/m); + + if (firstHeadingMatch && firstHeadingMatch.index !== undefined) { + const introRaw = contentWithoutFrontmatter.slice( + 0, + firstHeadingMatch.index, + ); + return extractTextContent(introRaw).slice(0, MAX_CONTENT_LENGTH); + } + + // No headings found, use all content + return extractTextContent(contentWithoutFrontmatter).slice( + 0, + MAX_CONTENT_LENGTH, + ); +} + +/** + * Convert file path to URL path + */ +function filePathToUrlPath(filePath: string): string { + return filePath + .replace(CONTENT_DIR, "") + .replace(/\.mdx?$/, "") + .replace(/\/index$/, "") // Only remove /index at end of path + .replace(/^\//, ""); // Remove leading slash for objectID +} + +/** + * Queue of items to save to Algolia + */ +const itemsToSave: EnhancedDocsSearchItem[] = []; + +async function queueItem(item: EnhancedDocsSearchItem) { + // Validate path doesn't start with / + if (item.path.startsWith("/")) { + console.error(`Path may not start with "/". Violating path: ${item.path}`); + return; + } + + console.log( + `Indexing ${item.isPageLevel ? 
"page" : "heading"}: ${item.title} -> ${ + item.path + }`, + ); + itemsToSave.push(item); +} + +/** + * Process a single MDX file and create search records + */ +async function processFile(filePath: string): Promise<void> { + // Skip special directories + if ( + filePath.includes("/__mapi-reference/") || + filePath.includes("/__api-reference/") || + filePath.includes("/__cli/") + ) { + return; + } + + const content = fs.readFileSync(filePath, "utf-8"); + const frontmatter = await parseFrontmatter(content); + + if (!frontmatter || !frontmatter.title || !frontmatter.section) { + console.warn(`Skipping ${filePath}: missing required frontmatter`); + return; + } + + const urlPath = filePathToUrlPath(filePath); + + // Create page-level record + const introContent = getIntroContent(content); + const pageRecord: EnhancedDocsSearchItem = { + objectID: `page-${urlPath}`, + path: urlPath, + title: frontmatter.title, + pageTitle: frontmatter.title, + description: frontmatter.description, + content: introContent, + section: frontmatter.section, + tags: frontmatter.tags || [], + headingLevel: 0, + contentType: "document", + index: "pages", + isPageLevel: true, + }; + await queueItem(pageRecord); + pageCount++; + + // Extract and create heading-level records + const headings = extractHeadings(content); + for (const heading of headings) { + const headingPath = `${urlPath}#${heading.slug}`; + const headingRecord: EnhancedDocsSearchItem = { + objectID: `heading-${headingPath}`, + path: headingPath, + title: heading.title, + pageTitle: frontmatter.title, + content: heading.content, + section: frontmatter.section, + tags: frontmatter.tags || [], + headingLevel: heading.level, + contentType: "document", + index: "pages", + isPageLevel: false, + }; + await queueItem(headingRecord); + headingCount++; + } +} + +/** + * Main entry point + */ +async function main() { + console.log("🔍 Starting docs search indexing...\n"); + + let skipIndexing = false; + + // Check for required environment 
variables + if (!algoliaAppId || !algoliaAdminApiKey || !algoliaPagesIndexName) { + const missing: string[] = []; + if (!algoliaAppId) missing.push("NEXT_PUBLIC_ALGOLIA_APP_ID"); + if (!algoliaAdminApiKey) missing.push("ALGOLIA_ADMIN_API_KEY"); + if (!algoliaPagesIndexName) missing.push("NEXT_PUBLIC_ALGOLIA_INDEX_NAME"); + + console.warn( + "Missing Algolia environment variables. Continuing with script but skipping actual indexing.\n\nMissing: " + + missing.join(", "), + ); + skipIndexing = true; + } + + // Get all MDX/MD files + const files = getAllFilesInDir(CONTENT_DIR, [], DOCS_FILE_EXTENSIONS); + console.log(`Found ${files.length} content files to process\n`); + + // Process each file + for (const file of files) { + try { + await processFile(file); + } catch (error) { + console.error(`Error processing ${file}:`, error); + } + } + + console.log("\n📊 Indexing summary:"); + console.log(` Pages indexed: ${pageCount}`); + console.log(` Headings indexed: ${headingCount}`); + console.log(` Total records: ${itemsToSave.length}`); + + // Save to Algolia + if (!skipIndexing && itemsToSave.length > 0) { + console.log("\n📤 Uploading to Algolia..."); + + const client = algoliasearch(algoliaAppId, algoliaAdminApiKey); + const index = client.initIndex(algoliaPagesIndexName); + + // Save objects in batches (Algolia recommends batches of 1000) + const BATCH_SIZE = 1000; + for (let i = 0; i < itemsToSave.length; i += BATCH_SIZE) { + const batch = itemsToSave.slice(i, i + BATCH_SIZE); + await index.saveObjects(batch); + console.log( + ` Saved batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil( + itemsToSave.length / BATCH_SIZE, + )}`, + ); + } + + console.log("\n✅ Successfully indexed docs for search!"); + } else if (skipIndexing) { + console.log( + "\n⚠️ Completed processing, but skipped Algolia upload due to missing environment variables.", + ); + } + + process.exit(0); +} + +main().catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); 
diff --git a/types.ts b/types.ts index f3cb89ee6..71bc5ed0f 100644 --- a/types.ts +++ b/types.ts @@ -47,6 +47,24 @@ export type DocsSearchItem = { index: "pages" | "endpoints"; }; +// Enhanced search item type for improved Algolia indexing +// This extends the basic DocsSearchItem with content and heading information +export type EnhancedDocsSearchItem = { + objectID: string; // Unique ID (page-<path> for pages, heading-<path>#<heading-slug> for headings) + path: string; // URL path (with optional anchor) + title: string; // Page title OR heading title + pageTitle: string; // Always the parent page title + description?: string; // From frontmatter (page-level only) + content: string; // Text content (truncated to 2000 characters by the indexing scripts) + section: string; // Top-level section (Concepts, Getting Started, etc.) + tags: string[]; // Tags from frontmatter + headingLevel: number; // 0 for page, 2 for H2, 3 for H3 + contentType: "document" | "api-reference"; + index: "pages" | "endpoints"; + // Ranking fields + isPageLevel: boolean; // True if this is a page-level record (not a heading) +}; + export type EndpointSearchItem = DocsSearchItem & { method: string; endpoint: string;