From 60c4c6b3e70fcf23a191dfe7cb282ed513528c57 Mon Sep 17 00:00:00 2001 From: Ido Shamun <1993245+idoshamun@users.noreply.github.com> Date: Sun, 12 Apr 2026 12:36:36 +0300 Subject: [PATCH 1/3] feat: add archive sitemaps for best-of pages Add two new sitemap endpoints for archive best-of pages: - /sitemaps/archive-index.xml: index pages per tag/source (/tags/{tag}/best-of, /sources/{source}/best-of) - /sitemaps/archive-pages.xml: individual archive pages with year/month URLs Both are registered in the sitemap index. Source archives resolve the source handle via join; global archives are excluded. Months are zero-padded. --- __tests__/sitemaps.ts | 196 +++++++++++++++++++++++++++++++++++++++++ src/routes/sitemaps.ts | 145 ++++++++++++++++++++++++++++++ 2 files changed, 341 insertions(+) diff --git a/__tests__/sitemaps.ts b/__tests__/sitemaps.ts index c3bdc22479..a112a89fa2 100644 --- a/__tests__/sitemaps.ts +++ b/__tests__/sitemaps.ts @@ -7,6 +7,7 @@ import { DataSource, DeepPartial } from 'typeorm'; import createOrGetConnection from '../src/db'; import { AGENTS_DIGEST_SOURCE, + Archive, CollectionPost, Keyword, KeywordStatus, @@ -18,6 +19,12 @@ import { SourceType, User, } from '../src/entity'; +import { + ArchivePeriodType, + ArchiveRankingType, + ArchiveScopeType, + ArchiveSubjectType, +} from '../src/common/archive'; import { getSitemapRowLastmod } from '../src/routes/sitemaps'; import { updateFlagsStatement } from '../src/common/utils'; import { sourcesFixture } from './fixture/source'; @@ -1055,6 +1062,195 @@ describe('GET /sitemaps/evergreen.xml', () => { }); }); +describe('GET /sitemaps/archive-index.xml', () => { + const archiveBase = { + subjectType: ArchiveSubjectType.Post, + rankingType: ArchiveRankingType.Best, + }; + + it('should return index pages for tags and sources with archives', async () => { + const createdAt = new Date('2025-03-01T10:00:00.000Z'); + + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, 
+ scopeId: 'rust', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'rust', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-02-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'a', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Global, + scopeId: null, + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt, + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-index.xml') + .expect(200); + + expect(res.header['content-type']).toContain('application/xml'); + expect(res.header['cache-control']).toEqual( + 'public, max-age=7200, s-maxage=7200', + ); + expect(res.text).toContain( + '', + ); + // Source 'a' has handle 'a' + expect(res.text).toContain( + 'http://localhost:5002/sources/a/best-of', + ); + // Tag rust should appear once (deduplicated) + expect(res.text).toContain( + 'http://localhost:5002/tags/rust/best-of', + ); + // Global archives should not appear + expect(res.text).not.toContain('/best-of\n'); + // Only one entry for rust (two archives but one index) + const rustMatches = res.text.match(/\/tags\/rust\/best-of<\/loc>/g); + expect(rustMatches).toHaveLength(1); + }); + + it('should exclude source archives when the source has been deleted', async () => { + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'nonexistent-source', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt: new Date(), + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-index.xml') + .expect(200); + + 
expect(res.text).not.toContain('/sources/nonexistent-source/best-of'); + }); +}); + +describe('GET /sitemaps/archive-pages.xml', () => { + const archiveBase = { + subjectType: ArchiveSubjectType.Post, + rankingType: ArchiveRankingType.Best, + }; + + it('should return individual archive pages with correct URL format', async () => { + const createdAt = new Date('2025-04-01T10:00:00.000Z'); + + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'golang', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'golang', + periodType: ArchivePeriodType.Year, + periodStart: new Date('2024-01-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'b', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-09-01T00:00:00.000Z'), + createdAt, + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-pages.xml') + .expect(200); + + expect(res.header['content-type']).toContain('application/xml'); + expect(res.header['cache-control']).toEqual( + 'public, max-age=7200, s-maxage=7200', + ); + expect(res.text).toContain( + '', + ); + // Monthly tag archive with zero-padded month + expect(res.text).toContain( + 'http://localhost:5002/tags/golang/best-of/2025/01', + ); + // Yearly tag archive + expect(res.text).toContain( + 'http://localhost:5002/tags/golang/best-of/2024', + ); + // Source archive uses handle (source 'b' has handle 'b') + expect(res.text).toContain( + 'http://localhost:5002/sources/b/best-of/2025/09', + ); + // Lastmod should be present + expect(res.text).toContain(''); + }); + + it('should exclude global archives', async () => { + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Global, + scopeId: null, + periodType: ArchivePeriodType.Month, + periodStart: 
new Date('2025-01-01T00:00:00.000Z'), + createdAt: new Date(), + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-pages.xml') + .expect(200); + + // Should not contain any best-of URL for global scope + expect(res.text).not.toContain('/best-of/2025/01'); + }); +}); + +describe('GET /sitemaps/index.xml (archive entries)', () => { + it('should include archive sitemaps in the sitemap index', async () => { + const res = await request(app.server) + .get('/sitemaps/index.xml') + .expect(200); + + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-index.xml', + ); + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-pages.xml', + ); + }); +}); + describe('getSitemapRowLastmod', () => { it('should normalize pg timestamp format to ISO-8601', () => { const normalizedLastmod = getSitemapRowLastmod({ diff --git a/src/routes/sitemaps.ts b/src/routes/sitemaps.ts index 272fdd4c6b..d1c37a12e3 100644 --- a/src/routes/sitemaps.ts +++ b/src/routes/sitemaps.ts @@ -1,5 +1,6 @@ import { FastifyInstance } from 'fastify'; import { + Archive, Keyword, KeywordStatus, Post, @@ -10,6 +11,7 @@ import { User, } from '../entity'; import { AGENTS_DIGEST_SOURCE } from '../entity/Source'; +import { ArchivePeriodType, ArchiveScopeType } from '../common/archive'; import { getUserProfileUrl } from '../common/users'; import createOrGetConnection from '../db'; import { Readable } from 'stream'; @@ -399,6 +401,105 @@ const buildUsersSitemapQuery = ( .addOrderBy('u.username', 'ASC') .limit(DEFAULT_SITEMAP_LIMIT); +const zeroPadMonth = (month: number): string => + month.toString().padStart(2, '0'); + +const getArchiveBestOfUrl = ( + prefix: string, + scopeType: ArchiveScopeType, + scopeId: string, +): string => { + const segment = scopeType === ArchiveScopeType.Tag ? 
'tags' : 'sources'; + + return `${prefix}/${segment}/${encodeURIComponent(scopeId)}/best-of`; +}; + +const getArchivePageUrl = ( + prefix: string, + scopeType: ArchiveScopeType, + scopeId: string, + periodType: ArchivePeriodType, + periodStart: Date, +): string => { + const base = getArchiveBestOfUrl(prefix, scopeType, scopeId); + const year = periodStart.getUTCFullYear(); + + if (periodType === ArchivePeriodType.Year) { + return `${base}/${year}`; + } + + const month = zeroPadMonth(periodStart.getUTCMonth() + 1); + + return `${base}/${year}/${month}`; +}; + +const buildArchiveIndexSitemapQuery = ( + source: DataSource | EntityManager, +): SelectQueryBuilder => + source + .createQueryBuilder() + .select('DISTINCT a."scopeType"', 'scopeType') + .addSelect( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + 'scopeId', + ) + .addSelect('MAX(a."createdAt")', 'lastmod') + .from(Archive, 'a') + .leftJoin( + Source, + 's', + `a."scopeType" = '${ArchiveScopeType.Source}' AND s.id = a."scopeId"`, + ) + .where('a."scopeType" IN (:...scopeTypes)', { + scopeTypes: [ArchiveScopeType.Tag, ArchiveScopeType.Source], + }) + .andWhere( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle IS NOT NULL ELSE TRUE END`, + ) + .groupBy('a."scopeType"') + .addGroupBy( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + ) + .orderBy('a."scopeType"', 'ASC') + .addOrderBy( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + 'ASC', + ) + .limit(DEFAULT_SITEMAP_LIMIT); + +const buildArchivePagesSitemapQuery = ( + source: DataSource | EntityManager, +): SelectQueryBuilder => + source + .createQueryBuilder() + .select('a."scopeType"', 'scopeType') + .addSelect( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + 'scopeId', + ) + .addSelect('a."periodType"', 'periodType') + 
.addSelect('a."periodStart"', 'periodStart') + .addSelect('a."createdAt"', 'lastmod') + .from(Archive, 'a') + .leftJoin( + Source, + 's', + `a."scopeType" = '${ArchiveScopeType.Source}' AND s.id = a."scopeId"`, + ) + .where('a."scopeType" IN (:...scopeTypes)', { + scopeTypes: [ArchiveScopeType.Tag, ArchiveScopeType.Source], + }) + .andWhere( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle IS NOT NULL ELSE TRUE END`, + ) + .orderBy('a."scopeType"', 'ASC') + .addOrderBy( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + 'ASC', + ) + .addOrderBy('a."periodStart"', 'ASC') + .limit(DEFAULT_SITEMAP_LIMIT); + const getPostsSitemapPath = (page: number): string => page === 1 ? '/api/sitemaps/posts-1.xml' : `/api/sitemaps/posts-${page}.xml`; @@ -454,6 +555,12 @@ ${evergreenSitemaps} ${escapeXml(`${prefix}/api/sitemaps/users.xml`)} + + ${escapeXml(`${prefix}/api/sitemaps/archive-index.xml`)} + + + ${escapeXml(`${prefix}/api/sitemaps/archive-pages.xml`)} + `; }; @@ -669,6 +776,44 @@ export default async function (fastify: FastifyInstance): Promise { ); }); + fastify.get('/archive-index.xml', async (_, res) => { + const con = await createOrGetConnection(); + const prefix = getSitemapUrlPrefix(); + + return res + .type('application/xml') + .header('cache-control', SITEMAP_CACHE_CONTROL) + .send( + await buildSitemapXmlStream(con, buildArchiveIndexSitemapQuery, (row) => + getArchiveBestOfUrl( + prefix, + row.scopeType as ArchiveScopeType, + row.scopeId, + ), + ), + ); + }); + + fastify.get('/archive-pages.xml', async (_, res) => { + const con = await createOrGetConnection(); + const prefix = getSitemapUrlPrefix(); + + return res + .type('application/xml') + .header('cache-control', SITEMAP_CACHE_CONTROL) + .send( + await buildSitemapXmlStream(con, buildArchivePagesSitemapQuery, (row) => + getArchivePageUrl( + prefix, + row.scopeType as ArchiveScopeType, + row.scopeId, + row.periodType as ArchivePeriodType, + 
new Date(row.periodStart), + ), + ), + ); + }); + fastify.get('/index.xml', async (_, res) => { const con = await createOrGetConnection(); const postsSitemapCount = getSitemapPageCount( From 3c80701b502f155350ead8e3dd9f0b17b4ed151f Mon Sep 17 00:00:00 2001 From: Ido Shamun <1993245+idoshamun@users.noreply.github.com> Date: Sun, 12 Apr 2026 12:53:48 +0300 Subject: [PATCH 2/3] chore: use 50k page size for archive sitemaps The sitemap spec allows up to 50k URLs per file. No need to split into smaller pages. --- src/routes/sitemaps.ts | 175 +++++++++++++++++++++++++++++++---------- 1 file changed, 134 insertions(+), 41 deletions(-) diff --git a/src/routes/sitemaps.ts b/src/routes/sitemaps.ts index d1c37a12e3..0009d79733 100644 --- a/src/routes/sitemaps.ts +++ b/src/routes/sitemaps.ts @@ -25,6 +25,7 @@ import { const SITEMAP_CACHE_CONTROL = `public, max-age=${2 * ONE_HOUR_IN_SECONDS}, s-maxage=${2 * ONE_HOUR_IN_SECONDS}`; const DEFAULT_SITEMAP_LIMIT = 50_000; +const ARCHIVE_PAGES_LIMIT = 50_000; const QUALIFIED_SOURCE_MIN_PUBLIC_POSTS = 10; const ARENA_SITEMAP_GROUP_IDS = [ '385404b4-f0f4-4e81-a338-bdca851eca31', @@ -467,38 +468,79 @@ const buildArchiveIndexSitemapQuery = ( ) .limit(DEFAULT_SITEMAP_LIMIT); -const buildArchivePagesSitemapQuery = ( +const VALID_ARCHIVE_SCOPE_TYPES = new Set([ + ArchiveScopeType.Tag, + ArchiveScopeType.Source, +]); +const VALID_ARCHIVE_PERIOD_TYPES = new Set([ + ArchivePeriodType.Month, + ArchivePeriodType.Year, +]); + +const buildArchivePagesPaginatedQuery = ( source: DataSource | EntityManager, -): SelectQueryBuilder => - source + scopeType: ArchiveScopeType, + periodType: ArchivePeriodType, + page: number, +): SelectQueryBuilder => { + const qb = source .createQueryBuilder() .select('a."scopeType"', 'scopeType') .addSelect( - `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + scopeType === ArchiveScopeType.Source + ? 
's.handle' + : 'a."scopeId"', 'scopeId', ) .addSelect('a."periodType"', 'periodType') .addSelect('a."periodStart"', 'periodStart') .addSelect('a."createdAt"', 'lastmod') .from(Archive, 'a') - .leftJoin( - Source, - 's', - `a."scopeType" = '${ArchiveScopeType.Source}' AND s.id = a."scopeId"`, - ) - .where('a."scopeType" IN (:...scopeTypes)', { - scopeTypes: [ArchiveScopeType.Tag, ArchiveScopeType.Source], - }) - .andWhere( - `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle IS NOT NULL ELSE TRUE END`, - ) - .orderBy('a."scopeType"', 'ASC') - .addOrderBy( - `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, - 'ASC', - ) - .addOrderBy('a."periodStart"', 'ASC') - .limit(DEFAULT_SITEMAP_LIMIT); + .where('a."scopeType" = :scopeType', { scopeType }) + .andWhere('a."periodType" = :periodType', { periodType }); + + if (scopeType === ArchiveScopeType.Source) { + qb.innerJoin(Source, 's', 's.id = a."scopeId"'); + qb.orderBy('s.handle', 'ASC'); + } else { + qb.orderBy('a."scopeId"', 'ASC'); + } + + qb.addOrderBy('a."periodStart"', 'ASC') + .limit(ARCHIVE_PAGES_LIMIT) + .offset(page * ARCHIVE_PAGES_LIMIT); + + return qb; +}; + +const getArchivePagesCount = async ( + con: DataSource, +): Promise<{ scopeType: string; periodType: string; count: number }[]> => { + const queryRunner = con.createQueryRunner('slave'); + + try { + const rows = await queryRunner.manager + .createQueryBuilder() + .select('a."scopeType"', 'scopeType') + .addSelect('a."periodType"', 'periodType') + .addSelect('COUNT(*)', 'count') + .from(Archive, 'a') + .where('a."scopeType" IN (:...scopeTypes)', { + scopeTypes: [ArchiveScopeType.Tag, ArchiveScopeType.Source], + }) + .groupBy('a."scopeType"') + .addGroupBy('a."periodType"') + .getRawMany<{ scopeType: string; periodType: string; count: string }>(); + + return rows.map((row) => ({ + scopeType: row.scopeType, + periodType: row.periodType, + count: Number(row.count), + })); + } finally { + await 
queryRunner.release(); + } +}; const getPostsSitemapPath = (page: number): string => page === 1 ? '/api/sitemaps/posts-1.xml' : `/api/sitemaps/posts-${page}.xml`; @@ -514,9 +556,25 @@ const buildEvergreenSitemapStream = async ( ): Promise => buildPaginatedPostSitemapStream(con, page, buildEvergreenSitemapQuery); +const buildArchivePagesIndexEntries = ( + prefix: string, + archivePageCounts: { scopeType: string; periodType: string; count: number }[], +): string => + archivePageCounts + .flatMap(({ scopeType, periodType, count }) => { + const pages = Math.max(1, Math.ceil(count / ARCHIVE_PAGES_LIMIT)); + + return Array.from({ length: pages }, (_, i) => + ` + ${escapeXml(`${prefix}/api/sitemaps/archive-pages-${scopeType}-${periodType}-${i}.xml`)} + `); + }) + .join('\n'); + const getSitemapIndexXml = ( postsSitemapCount: number, evergreenSitemapCount: number, + archivePageCounts: { scopeType: string; periodType: string; count: number }[], ): string => { const prefix = getSitemapUrlPrefix(); const postsSitemaps = buildSitemapIndexEntries( @@ -529,6 +587,10 @@ const getSitemapIndexXml = ( evergreenSitemapCount, getEvergreenSitemapPath, ); + const archivePagesSitemaps = buildArchivePagesIndexEntries( + prefix, + archivePageCounts, + ); return ` @@ -558,9 +620,7 @@ ${evergreenSitemaps} ${escapeXml(`${prefix}/api/sitemaps/archive-index.xml`)} - - ${escapeXml(`${prefix}/api/sitemaps/archive-pages.xml`)} - +${archivePagesSitemaps} `; }; @@ -794,7 +854,21 @@ export default async function (fastify: FastifyInstance): Promise { ); }); - fastify.get('/archive-pages.xml', async (_, res) => { + fastify.get<{ + Params: { scopeType: string; periodType: string; page: string }; + }>('/archive-pages-:scopeType-:periodType-:page.xml', async (req, res) => { + const { scopeType, periodType } = req.params; + const page = Number.parseInt(req.params.page, 10); + + if ( + !VALID_ARCHIVE_SCOPE_TYPES.has(scopeType) || + !VALID_ARCHIVE_PERIOD_TYPES.has(periodType) || + !Number.isInteger(page) || 
+ page < 0 + ) { + return res.code(404).send(); + } + const con = await createOrGetConnection(); const prefix = getSitemapUrlPrefix(); @@ -802,30 +876,49 @@ export default async function (fastify: FastifyInstance): Promise { .type('application/xml') .header('cache-control', SITEMAP_CACHE_CONTROL) .send( - await buildSitemapXmlStream(con, buildArchivePagesSitemapQuery, (row) => - getArchivePageUrl( - prefix, - row.scopeType as ArchiveScopeType, - row.scopeId, - row.periodType as ArchivePeriodType, - new Date(row.periodStart), - ), + await buildSitemapXmlStream( + con, + (source) => + buildArchivePagesPaginatedQuery( + source, + scopeType as ArchiveScopeType, + periodType as ArchivePeriodType, + page, + ), + (row) => + getArchivePageUrl( + prefix, + row.scopeType as ArchiveScopeType, + row.scopeId, + row.periodType as ArchivePeriodType, + new Date(row.periodStart), + ), ), ); }); fastify.get('/index.xml', async (_, res) => { const con = await createOrGetConnection(); - const postsSitemapCount = getSitemapPageCount( - await getReplicaQueryCount(con, buildPostsSitemapBaseQuery), - ); - const evergreenSitemapCount = getSitemapPageCount( - await getReplicaQueryCount(con, buildEvergreenSitemapBaseQuery), - ); + const [postsSitemapCount, evergreenSitemapCount, archivePageCounts] = + await Promise.all([ + getReplicaQueryCount(con, buildPostsSitemapBaseQuery).then( + getSitemapPageCount, + ), + getReplicaQueryCount(con, buildEvergreenSitemapBaseQuery).then( + getSitemapPageCount, + ), + getArchivePagesCount(con), + ]); return res .type('application/xml') .header('cache-control', SITEMAP_CACHE_CONTROL) - .send(getSitemapIndexXml(postsSitemapCount, evergreenSitemapCount)); + .send( + getSitemapIndexXml( + postsSitemapCount, + evergreenSitemapCount, + archivePageCounts, + ), + ); }); } From 6b34d19901bde9a35c1c9b5014beb9184df26e56 Mon Sep 17 00:00:00 2001 From: Ido Shamun <1993245+idoshamun@users.noreply.github.com> Date: Sun, 12 Apr 2026 12:55:33 +0300 Subject: [PATCH 3/3] 
feat: paginate archive-pages sitemap by scopeType and periodType Replace the single archive-pages.xml with paginated sub-sitemaps split by scope type (tag/source) and period type (month/year), each containing at most 50,000 URLs. The sitemap index now dynamically lists all archive page sitemaps based on actual archive counts. Route: /archive-pages-:scopeType-:periodType-:page.xml (0-indexed) --- __tests__/sitemaps.ts | 153 +++++++++++++++++++++++++++++++++++------ src/routes/sitemaps.ts | 13 ++-- 2 files changed, 139 insertions(+), 27 deletions(-) diff --git a/__tests__/sitemaps.ts b/__tests__/sitemaps.ts index a112a89fa2..54bf57dbac 100644 --- a/__tests__/sitemaps.ts +++ b/__tests__/sitemaps.ts @@ -1152,13 +1152,13 @@ describe('GET /sitemaps/archive-index.xml', () => { }); }); -describe('GET /sitemaps/archive-pages.xml', () => { +describe('GET /sitemaps/archive-pages-:scopeType-:periodType-:page.xml', () => { const archiveBase = { subjectType: ArchiveSubjectType.Post, rankingType: ArchiveRankingType.Best, }; - it('should return individual archive pages with correct URL format', async () => { + it('should return tag monthly archive pages with correct URL format', async () => { const createdAt = new Date('2025-04-01T10:00:00.000Z'); await con.getRepository(Archive).save([ @@ -1178,18 +1178,10 @@ describe('GET /sitemaps/archive-pages.xml', () => { periodStart: new Date('2024-01-01T00:00:00.000Z'), createdAt, }, - { - ...archiveBase, - scopeType: ArchiveScopeType.Source, - scopeId: 'b', - periodType: ArchivePeriodType.Month, - periodStart: new Date('2025-09-01T00:00:00.000Z'), - createdAt, - }, ]); const res = await request(app.server) - .get('/sitemaps/archive-pages.xml') + .get('/sitemaps/archive-pages-tag-month-0.xml') .expect(200); expect(res.header['content-type']).toContain('application/xml'); @@ -1203,24 +1195,102 @@ describe('GET /sitemaps/archive-pages.xml', () => { expect(res.text).toContain( 'http://localhost:5002/tags/golang/best-of/2025/01', ); - // 
Yearly tag archive + // Should not include yearly archives + expect(res.text).not.toContain( + 'http://localhost:5002/tags/golang/best-of/2024', + ); + // Lastmod should be present + expect(res.text).toContain(''); + }); + + it('should return tag yearly archive pages', async () => { + const createdAt = new Date('2025-04-01T10:00:00.000Z'); + + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'golang', + periodType: ArchivePeriodType.Year, + periodStart: new Date('2024-01-01T00:00:00.000Z'), + createdAt, + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-pages-tag-year-0.xml') + .expect(200); + expect(res.text).toContain( 'http://localhost:5002/tags/golang/best-of/2024', ); + }); + + it('should return source monthly archive pages using handle', async () => { + const createdAt = new Date('2025-04-01T10:00:00.000Z'); + + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'b', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-09-01T00:00:00.000Z'), + createdAt, + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-pages-source-month-0.xml') + .expect(200); + // Source archive uses handle (source 'b' has handle 'b') expect(res.text).toContain( 'http://localhost:5002/sources/b/best-of/2025/09', ); - // Lastmod should be present - expect(res.text).toContain(''); }); - it('should exclude global archives', async () => { + it('should return 404 for invalid scopeType', async () => { + await request(app.server) + .get('/sitemaps/archive-pages-invalid-month-0.xml') + .expect(404); + }); + + it('should return 404 for invalid periodType', async () => { + await request(app.server) + .get('/sitemaps/archive-pages-tag-invalid-0.xml') + .expect(404); + }); + + it('should return 404 for negative page', async () => { + await request(app.server) + .get('/sitemaps/archive-pages-tag-month--1.xml') + 
.expect(404); + }); + + it('should return 404 for non-integer page', async () => { + await request(app.server) + .get('/sitemaps/archive-pages-tag-month-abc.xml') + .expect(404); + }); + + it('should return empty urlset for page beyond data', async () => { + const res = await request(app.server) + .get('/sitemaps/archive-pages-tag-month-999.xml') + .expect(200); + + expect(res.text).toContain( + '', + ); + expect(res.text).not.toContain(''); + }); + + it('should exclude source archives when the source has been deleted', async () => { await con.getRepository(Archive).save([ { ...archiveBase, - scopeType: ArchiveScopeType.Global, - scopeId: null, + scopeType: ArchiveScopeType.Source, + scopeId: 'nonexistent-source', periodType: ArchivePeriodType.Month, periodStart: new Date('2025-01-01T00:00:00.000Z'), createdAt: new Date(), @@ -1228,16 +1298,39 @@ describe('GET /sitemaps/archive-pages.xml', () => { ]); const res = await request(app.server) - .get('/sitemaps/archive-pages.xml') + .get('/sitemaps/archive-pages-source-month-0.xml') .expect(200); - // Should not contain any best-of URL for global scope - expect(res.text).not.toContain('/best-of/2025/01'); + expect(res.text).not.toContain('/sources/nonexistent-source/best-of'); }); }); describe('GET /sitemaps/index.xml (archive entries)', () => { - it('should include archive sitemaps in the sitemap index', async () => { + const archiveBase = { + subjectType: ArchiveSubjectType.Post, + rankingType: ArchiveRankingType.Best, + }; + + it('should include archive-index and paginated archive-pages sitemaps', async () => { + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'golang', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt: new Date(), + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'a', + periodType: ArchivePeriodType.Year, + periodStart: new Date('2024-01-01T00:00:00.000Z'), + 
createdAt: new Date(), + }, + ]); + const res = await request(app.server) .get('/sitemaps/index.xml') .expect(200); @@ -1246,9 +1339,27 @@ describe('GET /sitemaps/index.xml (archive entries)', () => { 'http://localhost:5002/api/sitemaps/archive-index.xml', ); expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-pages-tag-month-0.xml', + ); + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-pages-source-year-0.xml', + ); + // Should not contain old non-paginated archive-pages.xml + expect(res.text).not.toContain( 'http://localhost:5002/api/sitemaps/archive-pages.xml', ); }); + + it('should not include archive-pages entries when no archives exist', async () => { + const res = await request(app.server) + .get('/sitemaps/index.xml') + .expect(200); + + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-index.xml', + ); + expect(res.text).not.toContain('archive-pages-'); + }); }); describe('getSitemapRowLastmod', () => { diff --git a/src/routes/sitemaps.ts b/src/routes/sitemaps.ts index 0009d79733..17ce019e6b 100644 --- a/src/routes/sitemaps.ts +++ b/src/routes/sitemaps.ts @@ -487,9 +487,7 @@ const buildArchivePagesPaginatedQuery = ( .createQueryBuilder() .select('a."scopeType"', 'scopeType') .addSelect( - scopeType === ArchiveScopeType.Source - ? 's.handle' - : 'a."scopeId"', + scopeType === ArchiveScopeType.Source ? 's.handle' : 'a."scopeId"', 'scopeId', ) .addSelect('a."periodType"', 'periodType') @@ -564,10 +562,13 @@ const buildArchivePagesIndexEntries = ( .flatMap(({ scopeType, periodType, count }) => { const pages = Math.max(1, Math.ceil(count / ARCHIVE_PAGES_LIMIT)); - return Array.from({ length: pages }, (_, i) => - ` + return Array.from( + { length: pages }, + (_, i) => + ` ${escapeXml(`${prefix}/api/sitemaps/archive-pages-${scopeType}-${periodType}-${i}.xml`)} - `); + `, + ); }) .join('\n');