Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions src/core/sitemap.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,73 @@ describe('generateSitemap', () => {
expect(sitemap).toContain('<loc>https://example.com/products/</loc>');
});

it('filters /sitemap, /sitemap-N, and /sitemap-index from config.pages', () => {
  // readdirSync yields no entries, so nothing is contributed by directory scanning.
  mockFs.readdirSync.mockReturnValue([]);

  // Ordinary pages, listed first exactly as a framework plugin would report them.
  const regularPages = [
    { pathname: '/', title: 'Home', description: '', content: '' },
    { pathname: '/about', title: 'About', description: '', content: '' },
  ];

  // Sitemap artifacts in every spelling the filter is expected to recognise.
  const sitemapArtifactPages = [
    '/sitemap',
    '/sitemap-0',
    '/sitemap-1',
    '/sitemap-index',
    '/sitemap.xml',
    '/sitemap-0.xml',
  ].map((pathname) => ({ pathname, title: '', description: '', content: '' }));

  const configWithSitemapPages = {
    ...baseConfig,
    pages: [...regularPages, ...sitemapArtifactPages],
  };

  const sitemap = generateSitemap(configWithSitemapPages);

  // A regular page must survive the filter.
  expect(sitemap).toContain('<loc>https://example.com/about</loc>');

  // None of the artifact spellings may appear anywhere in the output.
  for (const forbidden of ['/sitemap-0', '/sitemap-1', '/sitemap-index', '/sitemap.xml']) {
    expect(sitemap).not.toContain(forbidden);
  }

  // The bare "/sitemap" needs an exact <loc> match, not a substring check.
  expect(sitemap).not.toMatch(/<loc>https:\/\/example\.com\/sitemap<\/loc>/);
});

it('preserves legitimate paths that start with sitemap- but are not sitemap files', () => {
  // readdirSync yields no entries, so only config.pages is exercised here.
  mockFs.readdirSync.mockReturnValue([]);

  // Three user pages whose names merely resemble sitemap output, followed by
  // one real sitemap artifact that must still be dropped.
  const pages = [
    { pathname: '/sitemap-guide', title: 'Sitemap Guide', description: '', content: '' },
    { pathname: '/sitemap-tutorial', title: 'Tutorial', description: '', content: '' },
    { pathname: '/sitemaps-explained', title: 'Explained', description: '', content: '' },
    { pathname: '/sitemap-0', title: '', description: '', content: '' },
  ];
  const configWithSitemapNamedPages = { ...baseConfig, pages };

  const sitemap = generateSitemap(configWithSitemapNamedPages);

  // Look-alike pages are kept...
  const keptLocs = [
    '<loc>https://example.com/sitemap-guide</loc>',
    '<loc>https://example.com/sitemap-tutorial</loc>',
    '<loc>https://example.com/sitemaps-explained</loc>',
  ];
  for (const loc of keptLocs) {
    expect(sitemap).toContain(loc);
  }

  // ...while the numbered sitemap file is filtered out.
  expect(sitemap).not.toContain('<loc>https://example.com/sitemap-0</loc>');
});

it('filters sitemap-named files discovered in contentDir', () => {
  // Directory listing mixes real content with two sitemap artifacts and one
  // look-alike markdown page.
  const directoryEntries = [
    'index.md',
    'about.md',
    'sitemap-0.html',
    'sitemap-index.html',
    'sitemap-guide.md',
  ];
  mockFs.readdirSync.mockReturnValue(directoryEntries);

  // Every entry is reported as a plain file, never a directory.
  const plainFileStat = {
    isDirectory: () => false,
    isFile: () => true,
  };
  mockFs.statSync.mockReturnValue(plainFileStat);

  const sitemap = generateSitemap(baseConfig);

  // Real content and the look-alike page are both listed.
  for (const loc of [
    '<loc>https://example.com/about</loc>',
    '<loc>https://example.com/sitemap-guide</loc>',
  ]) {
    expect(sitemap).toContain(loc);
  }

  // The sitemap artifacts are absent from the output.
  for (const forbidden of ['/sitemap-0', '/sitemap-index']) {
    expect(sitemap).not.toContain(forbidden);
  }
});

it('should handle subdirectories recursively', () => {
const files = ['index.md', 'blog', 'docs'];

Expand Down
25 changes: 24 additions & 1 deletion src/core/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,42 @@ function escapeXml(str: string): string {
.replace(/'/g, '&apos;');
}

/**
 * Reports whether `pathname` names generated sitemap output rather than a real page.
 *
 * Recognised (case-insensitively), as produced by plugins like @astrojs/sitemap:
 *   /sitemap, /sitemap-0, /sitemap-1, /sitemap-index, /sitemap.xml, /sitemap-0.xml
 * each optionally followed by a single trailing slash (e.g. /sitemap-0/), so the
 * filter also holds for sites whose pathnames are emitted with trailing slashes.
 *
 * Deliberately NOT matched: legitimate user pages such as /sitemap-guide or
 * /sitemaps-explained, and anything nested below another path segment.
 *
 * @param pathname - Root-relative pathname, starting with `/`.
 * @returns `true` when the pathname is a sitemap artifact that should be excluded.
 */
function isSitemapPathname(pathname: string): boolean {
  // Anchored at both ends: only the whole pathname may match, never a prefix.
  return /^\/sitemap(-\d+|-index)?(\.xml)?\/?$/i.test(pathname);
}
Comment thread
greptile-apps[bot] marked this conversation as resolved.

/**
 * Extracts the pathname of `url`, expressed relative to the site root given by `baseUrl`.
 *
 * The pathname is always extracted, so a sitemap-name filter still applies even
 * when the URL does not share the configured base (e.g. a different host).
 * When `url` has the same origin as `baseUrl` and lives under the base's path
 * prefix (a site deployed at e.g. https://example.com/docs), that prefix is removed
 * so the result is comparable to root-relative page pathnames such as `/sitemap-0`.
 *
 * @param url - Absolute URL, or a reference resolved against `baseUrl`.
 * @param baseUrl - The configured site URL.
 * @returns The site-relative pathname, or the raw `url` if URL parsing fails
 *          entirely (a value a pathname-anchored sitemap filter will not match).
 */
function pathnameFromUrl(url: string, baseUrl: string): string {
  try {
    const base = new URL(baseUrl);
    const target = new URL(url, baseUrl);

    // Base path without trailing slashes; '' means the site lives at the host root.
    const basePath = base.pathname.replace(/\/+$/, '');
    if (basePath === '' || target.origin !== base.origin) {
      return target.pathname;
    }

    // The base URL itself maps to the site root.
    if (target.pathname === basePath) {
      return '/';
    }

    // Strip the prefix only on a full segment boundary, so a base of /docs does
    // not swallow the start of /docsite.
    if (target.pathname.startsWith(`${basePath}/`)) {
      return target.pathname.slice(basePath.length);
    }

    return target.pathname;
  } catch {
    return url;
  }
}
Comment thread
greptile-apps[bot] marked this conversation as resolved.

export function generateSitemap(config: ResolvedAeoConfig): string {
const urls: string[] = [];

// Add discovered pages from framework plugin
if (config.pages && config.pages.length > 0) {
for (const page of config.pages) {
if (isSitemapPathname(page.pathname)) continue;
urls.push(`${config.url}${page.pathname === '/' ? '' : page.pathname}`);
}
}

// Add markdown/html files from content dir
if (config.contentDir && existsSync(config.contentDir)) {
urls.push(...collectUrls(config.contentDir, config));
const contentUrls = collectUrls(config.contentDir, config).filter(
(url) => !isSitemapPathname(pathnameFromUrl(url, config.url))
);
urls.push(...contentUrls);
}

const lines: string[] = [
Expand Down