Skip to content

Commit eb73111

Browse files
authored
Merge pull request #51 from reactome/45-site-search-broken
45 site search broken
2 parents 0a6f650 + 0e8635c commit eb73111

1 file changed

Lines changed: 165 additions & 14 deletions

File tree

projects/website-angular/src/scripts/generate-index.ts

Lines changed: 165 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,156 @@ import { SiteSearchIndexItem } from '../types/site-search';
55
import parseFrontmatter from '../utils/parseFrontmatter';
66
import truncateHtml from '../utils/truncateHtml';
77

8+
/**
 * Strip markdown/MDX syntax to produce plain text for search indexing.
 *
 * Block-level markers (imports, blockquotes, headings, list bullets and
 * ordered-list numbers) are only removed at the start of a line, so ordinary
 * prose such as "C# is", "a > b", "x - y" or "in 2020. The" is left intact.
 * Fenced and inline code is dropped entirely, including its content.
 *
 * @param md - Raw markdown/MDX text, with or without a leading `---` frontmatter block.
 * @returns The text with markup removed, whitespace runs collapsed to single
 *   spaces, and leading/trailing whitespace trimmed.
 */
function stripMarkdown(md: string): string {
  return (
    md
      // Frontmatter: only when the text starts with `---` (no `m` flag on purpose).
      .replace(/^---[\s\S]*?---\n?/, '')
      // ESM imports, anchored to line start so prose containing
      // "import ... from '...'" survives.
      .replace(/^[ \t]*import\s+.*?from\s+['"].*?['"]\s*;?\n?/gm, '')
      // HTML comments and HTML/JSX tags. Requiring a tag name after `<` keeps
      // comparisons like "a < b and c > d" from being swallowed.
      .replace(/<!--[\s\S]*?-->|<\/?[A-Za-z][^>]*>/g, '')
      // Images are dropped, links are reduced to their text.
      .replace(/!\[.*?\]\(.*?\)/g, '')
      .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1')
      // Blockquote markers first (possibly nested), so that quoted headings and
      // quoted list items are exposed to the rules below.
      .replace(/^(?:[ \t]*>)+[ \t]?/gm, '')
      // Heading markers.
      .replace(/^[ \t]*#{1,6}[ \t]+/gm, '')
      // Bullet markers. Runs before emphasis so a leading "* " bullet is not
      // mistaken for the opening of an *emphasis* span.
      .replace(/^[ \t]*[-*+][ \t]+/gm, '')
      // Ordered-list markers.
      .replace(/^[ \t]*\d+\.[ \t]+/gm, '')
      // Bold/italic with asterisks.
      .replace(/(\*{1,3})(.*?)\1/g, '$2')
      // Bold/italic with underscores; intraword underscores (snake_case) are kept.
      .replace(/(?<![A-Za-z0-9])(_{1,3})(.*?)\1(?![A-Za-z0-9])/g, '$2')
      // Inline and fenced code, including its content.
      .replace(/`{1,3}[^`]*`{1,3}/g, '')
      // Collapse every whitespace run (including blank lines) to one space.
      .replace(/\s+/g, ' ')
      .trim()
  );
}
28+
29+
/**
 * Recursively find all .mdx/.md files in a directory.
 *
 * Entries are visited in name order (plain code-unit comparison) because the
 * order returned by `readdirSync` is not specified; sorting keeps the result,
 * and anything callers derive from its order, stable between runs.
 *
 * @param dir - Directory to scan.
 * @returns Full paths (each built by joining `dir` with the entry names) of
 *   every file whose name ends in `.md` or `.mdx`, directories expanded
 *   depth-first. Empty when `dir` does not exist.
 */
function findAllMdxFiles(dir: string): string[] {
  if (!fs.existsSync(dir)) return [];

  const entries = fs
    .readdirSync(dir, { withFileTypes: true })
    .sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

  const results: string[] = [];
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      results.push(...findAllMdxFiles(fullPath));
    } else if (/\.mdx?$/.test(entry.name)) {
      // Other files (e.g. generated index.json artifacts) are skipped by the
      // extension test alone.
      results.push(fullPath);
    }
  }
  return results;
}
50+
51+
/**
 * Map a content file path to its site URL.
 *
 * e.g. content/about/what-is-reactome.mdx  → /about/what-is-reactome
 *      content/about/news/article-1.mdx    → /about/news/article-1
 *      content/documentation/dev/index.mdx → /documentation/dev
 *      content/index.mdx                   → /
 *
 * @param filePath - Path of the `.md`/`.mdx` file.
 * @param contentRoot - Directory the URL is computed relative to.
 * @returns A URL path that starts with `/` and uses forward slashes only.
 */
function filePathToUrl(filePath: string, contentRoot: string): string {
  const relative = path
    .relative(contentRoot, filePath)
    // Normalise separators first so the `index` rule below also applies to
    // backslash-separated (Windows) paths.
    .replace(/\\/g, '/')
    // Remove extension
    .replace(/\.(mdx|md)$/, '')
    // Remove a trailing `index` segment, including a root-level index file.
    .replace(/(^|\/)index$/, '');
  return '/' + relative;
}
66+
67+
/**
 * Infer a human-readable category from a site URL.
 *
 * Two sub-trees have dedicated categories; every other URL is classified by
 * its first path segment.
 *
 * @param url - Site URL path beginning with `/`, e.g. `/about/team`.
 * @returns 'News', 'Research Spotlight', the label mapped to the top-level
 *   directory, or 'Other' when the directory is not recognised.
 */
function inferCategory(url: string): string {
  // Special sub-categories take precedence over the top-level directory.
  if (url.startsWith('/about/news/')) return 'News';
  if (url.startsWith('/content/reactome-research-spotlight/')) {
    return 'Research Spotlight';
  }

  // A Map (rather than a plain object) so segments such as "constructor" or
  // "toString" cannot resolve to inherited Object.prototype members.
  const categoryByTopDir = new Map<string, string>([
    ['about', 'About'],
    ['content', 'Content'],
    ['documentation', 'Documentation'],
    ['community', 'Community'],
    ['tools', 'Tools'],
  ]);

  const topDir = url.split('/')[1] ?? '';
  return categoryByTopDir.get(topDir) ?? 'Other';
}
85+
86+
/**
 * Generate a consolidated site search index covering all content.
 *
 * Scans `projects/website-angular/content` (resolved from the current working
 * directory) for markdown/MDX files, builds one index entry per distinct URL,
 * and writes the entries as a JSON array to
 * `projects/website-angular/public/site-search-index.json`.
 *
 * If the content directory does not exist, a warning is logged and nothing is
 * written.
 */
function generateSiteSearchIndex(): void {
  const projectRoot = path.resolve(
    process.cwd(),
    'projects',
    'website-angular'
  );
  const contentRoot = path.join(projectRoot, 'content');

  if (!fs.existsSync(contentRoot)) {
    console.warn('Content directory not found:', contentRoot);
    return;
  }

  const items: SiteSearchIndexItem[] = [];
  const seenUrls = new Set<string>();
  let nextId = 1;

  for (const filePath of findAllMdxFiles(contentRoot)) {
    const raw = fs.readFileSync(filePath, 'utf-8');
    const { frontmatter, body } = parseFrontmatter(raw);

    const url = filePathToUrl(filePath, contentRoot);

    // Skip duplicates (e.g. collaboration.mdx and collaboration/index.mdx):
    // the first file that maps to a URL wins.
    if (seenUrls.has(url)) continue;
    seenUrls.add(url);

    // Fallback title: the last URL segment, so `.../dev/index.mdx` is titled
    // "dev" rather than "index". The file's base name is used only when the
    // URL has no final segment.
    const slug =
      url.split('/').pop() ||
      path.basename(filePath).replace(/\.(mdx|md)$/, '');
    const title = (frontmatter['title'] as string) || slug.replace(/-/g, ' ');

    // NOTE(review): the category is always inferred from the URL. The earlier
    // conditional on frontmatter['category'] had two identical branches, so
    // the frontmatter value was never used — confirm whether it should
    // override the inferred category.
    const category = inferCategory(url);

    const plainBody = stripMarkdown(body);
    const excerpt =
      plainBody.slice(0, 200) + (plainBody.length > 200 ? '...' : '');

    items.push({
      id: nextId++,
      title,
      category,
      url,
      body: plainBody,
      excerpt,
      date: (frontmatter['date'] as string) || undefined,
    });
  }

  // Write to public assets so it can be fetched at runtime.
  // `recursive: true` makes mkdirSync a no-op when the directory already exists.
  const outputDir = path.join(projectRoot, 'public');
  fs.mkdirSync(outputDir, { recursive: true });

  const outputPath = path.join(outputDir, 'site-search-index.json');
  fs.writeFileSync(outputPath, JSON.stringify(items));
  console.log(
    `Site search index generated: ${items.length} entries → ${outputPath}`
  );
}
157+
8158
function loadNewsArticlesFromDir(dir: string): ArticleIndexItem[] {
9159
if (!fs.existsSync(dir)) return [];
10160

@@ -19,9 +169,7 @@ function loadNewsArticlesFromDir(dir: string): ArticleIndexItem[] {
19169
const { frontmatter, body } = parseFrontmatter(content);
20170

21171
return {
22-
title:
23-
frontmatter['title'] ||
24-
filename.replace(/\.(mdx|md)$/, ''),
172+
title: frontmatter['title'] || filename.replace(/\.(mdx|md)$/, ''),
25173
author: frontmatter['author'] || undefined,
26174
excerpt: truncateHtml(body || '', 50),
27175
date: frontmatter['date'] || new Date().toISOString(),
@@ -36,11 +184,7 @@ function loadNewsArticlesFromDir(dir: string): ArticleIndexItem[] {
36184
: frontmatter['tags'],
37185
} as ArticleIndexItem;
38186
})
39-
.sort(
40-
(a, b) =>
41-
new Date(b.date).getTime() -
42-
new Date(a.date).getTime()
43-
);
187+
.sort((a, b) => new Date(b.date).getTime() - new Date(a.date).getTime());
44188
}
45189

46190
function buildRecursiveIndex(dir: string): any {
@@ -73,10 +217,7 @@ function buildRecursiveIndex(dir: string): any {
73217
/**
74218
* Generate a JSON file with optional recursive indexing
75219
*/
76-
function generateIndex(
77-
directories: string[],
78-
recursive: boolean = true
79-
): void {
220+
function generateIndex(directories: string[], recursive: boolean = true): void {
80221
const outputDir = path.resolve(process.cwd(), ...directories);
81222

82223
if (!fs.existsSync(outputDir)) {
@@ -94,5 +235,15 @@ function generateIndex(
94235

95236
// Run on module load
96237
generateIndex(['projects', 'website-angular', 'content', 'about', 'news']);
97-
generateIndex(['projects', 'website-angular', 'content', 'content', 'reactome-research-spotlight']);
98-
generateIndex(['projects', 'website-angular', 'content', 'documentation', 'faq'], true);
238+
generateIndex([
239+
'projects',
240+
'website-angular',
241+
'content',
242+
'content',
243+
'reactome-research-spotlight',
244+
]);
245+
generateIndex(
246+
['projects', 'website-angular', 'content', 'documentation', 'faq'],
247+
true
248+
);
249+
generateSiteSearchIndex();

0 commit comments

Comments
 (0)