// Flattened git unified diff (original line breaks collapsed; "+" marks added lines, "@@ ... @@" starts a hunk).
// It touches two files:
//   * __tests__/sitemaps.ts: adds describe blocks for GET /sitemaps/archive-index.xml,
//     GET /sitemaps/archive-pages-:scopeType-:periodType-:page.xml and the archive entries of GET /sitemaps/index.xml.
//   * src/routes/sitemaps.ts: adds ARCHIVE_PAGES_LIMIT, zeroPadMonth, getArchiveBestOfUrl, getArchivePageUrl,
//     buildArchiveIndexSitemapQuery, VALID_ARCHIVE_SCOPE_TYPES / VALID_ARCHIVE_PERIOD_TYPES,
//     buildArchivePagesPaginatedQuery, getArchivePagesCount, buildArchivePagesIndexEntries, a third
//     `archivePageCounts` parameter on getSitemapIndexXml, and the /archive-index.xml route.
// URL shapes produced by the helpers: `${prefix}/tags|sources/<scopeId>/best-of`, then `/<year>` for yearly
// archives or `/<year>/<zero-padded month>` otherwise (year and month are read with the UTC getters).
// NOTE(review): several string literals appear as '' (e.g. toContain('')) and SelectQueryBuilder / Promise carry
//   no type arguments; presumably markup-like text was lost when this diff was extracted. Confirm against the commit.
// NOTE(review): getArchivePagesCount counts Archive rows without joining Source, while the source variant of
//   buildArchivePagesPaginatedQuery inner-joins Source; the index may therefore advertise a page that renders
//   empty when archives point at a missing source. Confirm whether that is acceptable.
// NOTE(review): buildArchiveIndexSitemapQuery selects with DISTINCT and also groups by the same expressions;
//   the DISTINCT looks redundant. Confirm before removing.
diff --git a/__tests__/sitemaps.ts b/__tests__/sitemaps.ts index c3bdc22479..54bf57dbac 100644 --- a/__tests__/sitemaps.ts +++ b/__tests__/sitemaps.ts @@ -7,6 +7,7 @@ import { DataSource, DeepPartial } from 'typeorm'; import createOrGetConnection from '../src/db'; import { AGENTS_DIGEST_SOURCE, + Archive, CollectionPost, Keyword, KeywordStatus, @@ -18,6 +19,12 @@ import { SourceType, User, } from '../src/entity'; +import { + ArchivePeriodType, + ArchiveRankingType, + ArchiveScopeType, + ArchiveSubjectType, +} from '../src/common/archive'; import { getSitemapRowLastmod } from '../src/routes/sitemaps'; import { updateFlagsStatement } from '../src/common/utils'; import { sourcesFixture } from './fixture/source'; @@ -1055,6 +1062,306 @@ describe('GET /sitemaps/evergreen.xml', () => { }); }); +describe('GET /sitemaps/archive-index.xml', () => { + const archiveBase = { + subjectType: ArchiveSubjectType.Post, + rankingType: ArchiveRankingType.Best, + }; + + it('should return index pages for tags and sources with archives', async () => { + const createdAt = new Date('2025-03-01T10:00:00.000Z'); + + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'rust', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'rust', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-02-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'a', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Global, + scopeId: null, + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt, + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-index.xml') + .expect(200); + + 
expect(res.header['content-type']).toContain('application/xml'); + expect(res.header['cache-control']).toEqual( + 'public, max-age=7200, s-maxage=7200', + ); + expect(res.text).toContain( + '', + ); + // Source 'a' has handle 'a' + expect(res.text).toContain( + 'http://localhost:5002/sources/a/best-of', + ); + // Tag rust should appear once (deduplicated) + expect(res.text).toContain( + 'http://localhost:5002/tags/rust/best-of', + ); + // Global archives should not appear + expect(res.text).not.toContain('/best-of\n'); + // Only one entry for rust (two archives but one index) + const rustMatches = res.text.match(/\/tags\/rust\/best-of<\/loc>/g); + expect(rustMatches).toHaveLength(1); + }); + + it('should exclude source archives when the source has been deleted', async () => { + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'nonexistent-source', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt: new Date(), + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-index.xml') + .expect(200); + + expect(res.text).not.toContain('/sources/nonexistent-source/best-of'); + }); +}); + +describe('GET /sitemaps/archive-pages-:scopeType-:periodType-:page.xml', () => { + const archiveBase = { + subjectType: ArchiveSubjectType.Post, + rankingType: ArchiveRankingType.Best, + }; + + it('should return tag monthly archive pages with correct URL format', async () => { + const createdAt = new Date('2025-04-01T10:00:00.000Z'); + + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'golang', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt, + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'golang', + periodType: ArchivePeriodType.Year, + periodStart: new Date('2024-01-01T00:00:00.000Z'), + createdAt, + }, + ]); + + 
const res = await request(app.server) + .get('/sitemaps/archive-pages-tag-month-0.xml') + .expect(200); + + expect(res.header['content-type']).toContain('application/xml'); + expect(res.header['cache-control']).toEqual( + 'public, max-age=7200, s-maxage=7200', + ); + expect(res.text).toContain( + '', + ); + // Monthly tag archive with zero-padded month + expect(res.text).toContain( + 'http://localhost:5002/tags/golang/best-of/2025/01', + ); + // Should not include yearly archives + expect(res.text).not.toContain( + 'http://localhost:5002/tags/golang/best-of/2024', + ); + // Lastmod should be present + expect(res.text).toContain(''); + }); + + it('should return tag yearly archive pages', async () => { + const createdAt = new Date('2025-04-01T10:00:00.000Z'); + + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'golang', + periodType: ArchivePeriodType.Year, + periodStart: new Date('2024-01-01T00:00:00.000Z'), + createdAt, + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-pages-tag-year-0.xml') + .expect(200); + + expect(res.text).toContain( + 'http://localhost:5002/tags/golang/best-of/2024', + ); + }); + + it('should return source monthly archive pages using handle', async () => { + const createdAt = new Date('2025-04-01T10:00:00.000Z'); + + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'b', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-09-01T00:00:00.000Z'), + createdAt, + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-pages-source-month-0.xml') + .expect(200); + + // Source archive uses handle (source 'b' has handle 'b') + expect(res.text).toContain( + 'http://localhost:5002/sources/b/best-of/2025/09', + ); + }); + + it('should return 404 for invalid scopeType', async () => { + await request(app.server) + .get('/sitemaps/archive-pages-invalid-month-0.xml') + 
.expect(404); + }); + + it('should return 404 for invalid periodType', async () => { + await request(app.server) + .get('/sitemaps/archive-pages-tag-invalid-0.xml') + .expect(404); + }); + + it('should return 404 for negative page', async () => { + await request(app.server) + .get('/sitemaps/archive-pages-tag-month--1.xml') + .expect(404); + }); + + it('should return 404 for non-integer page', async () => { + await request(app.server) + .get('/sitemaps/archive-pages-tag-month-abc.xml') + .expect(404); + }); + + it('should return empty urlset for page beyond data', async () => { + const res = await request(app.server) + .get('/sitemaps/archive-pages-tag-month-999.xml') + .expect(200); + + expect(res.text).toContain( + '', + ); + expect(res.text).not.toContain(''); + }); + + it('should exclude source archives when the source has been deleted', async () => { + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'nonexistent-source', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt: new Date(), + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/archive-pages-source-month-0.xml') + .expect(200); + + expect(res.text).not.toContain('/sources/nonexistent-source/best-of'); + }); +}); + +describe('GET /sitemaps/index.xml (archive entries)', () => { + const archiveBase = { + subjectType: ArchiveSubjectType.Post, + rankingType: ArchiveRankingType.Best, + }; + + it('should include archive-index and paginated archive-pages sitemaps', async () => { + await con.getRepository(Archive).save([ + { + ...archiveBase, + scopeType: ArchiveScopeType.Tag, + scopeId: 'golang', + periodType: ArchivePeriodType.Month, + periodStart: new Date('2025-01-01T00:00:00.000Z'), + createdAt: new Date(), + }, + { + ...archiveBase, + scopeType: ArchiveScopeType.Source, + scopeId: 'a', + periodType: ArchivePeriodType.Year, + periodStart: new 
Date('2024-01-01T00:00:00.000Z'), + createdAt: new Date(), + }, + ]); + + const res = await request(app.server) + .get('/sitemaps/index.xml') + .expect(200); + + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-index.xml', + ); + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-pages-tag-month-0.xml', + ); + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-pages-source-year-0.xml', + ); + // Should not contain old non-paginated archive-pages.xml + expect(res.text).not.toContain( + 'http://localhost:5002/api/sitemaps/archive-pages.xml', + ); + }); + + it('should not include archive-pages entries when no archives exist', async () => { + const res = await request(app.server) + .get('/sitemaps/index.xml') + .expect(200); + + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/archive-index.xml', + ); + expect(res.text).not.toContain('archive-pages-'); + }); +}); + describe('getSitemapRowLastmod', () => { it('should normalize pg timestamp format to ISO-8601', () => { const normalizedLastmod = getSitemapRowLastmod({ diff --git a/src/routes/sitemaps.ts b/src/routes/sitemaps.ts index 272fdd4c6b..17ce019e6b 100644 --- a/src/routes/sitemaps.ts +++ b/src/routes/sitemaps.ts @@ -1,5 +1,6 @@ import { FastifyInstance } from 'fastify'; import { + Archive, Keyword, KeywordStatus, Post, @@ -10,6 +11,7 @@ import { User, } from '../entity'; import { AGENTS_DIGEST_SOURCE } from '../entity/Source'; +import { ArchivePeriodType, ArchiveScopeType } from '../common/archive'; import { getUserProfileUrl } from '../common/users'; import createOrGetConnection from '../db'; import { Readable } from 'stream'; @@ -23,6 +25,7 @@ import { const SITEMAP_CACHE_CONTROL = `public, max-age=${2 * ONE_HOUR_IN_SECONDS}, s-maxage=${2 * ONE_HOUR_IN_SECONDS}`; const DEFAULT_SITEMAP_LIMIT = 50_000; +const ARCHIVE_PAGES_LIMIT = 50_000; const QUALIFIED_SOURCE_MIN_PUBLIC_POSTS = 10; const ARENA_SITEMAP_GROUP_IDS = [ 
'385404b4-f0f4-4e81-a338-bdca851eca31', @@ -399,6 +402,144 @@ const buildUsersSitemapQuery = ( .addOrderBy('u.username', 'ASC') .limit(DEFAULT_SITEMAP_LIMIT); +const zeroPadMonth = (month: number): string => + month.toString().padStart(2, '0'); + +const getArchiveBestOfUrl = ( + prefix: string, + scopeType: ArchiveScopeType, + scopeId: string, +): string => { + const segment = scopeType === ArchiveScopeType.Tag ? 'tags' : 'sources'; + + return `${prefix}/${segment}/${encodeURIComponent(scopeId)}/best-of`; +}; + +const getArchivePageUrl = ( + prefix: string, + scopeType: ArchiveScopeType, + scopeId: string, + periodType: ArchivePeriodType, + periodStart: Date, +): string => { + const base = getArchiveBestOfUrl(prefix, scopeType, scopeId); + const year = periodStart.getUTCFullYear(); + + if (periodType === ArchivePeriodType.Year) { + return `${base}/${year}`; + } + + const month = zeroPadMonth(periodStart.getUTCMonth() + 1); + + return `${base}/${year}/${month}`; +}; + +const buildArchiveIndexSitemapQuery = ( + source: DataSource | EntityManager, +): SelectQueryBuilder => + source + .createQueryBuilder() + .select('DISTINCT a."scopeType"', 'scopeType') + .addSelect( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + 'scopeId', + ) + .addSelect('MAX(a."createdAt")', 'lastmod') + .from(Archive, 'a') + .leftJoin( + Source, + 's', + `a."scopeType" = '${ArchiveScopeType.Source}' AND s.id = a."scopeId"`, + ) + .where('a."scopeType" IN (:...scopeTypes)', { + scopeTypes: [ArchiveScopeType.Tag, ArchiveScopeType.Source], + }) + .andWhere( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle IS NOT NULL ELSE TRUE END`, + ) + .groupBy('a."scopeType"') + .addGroupBy( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + ) + .orderBy('a."scopeType"', 'ASC') + .addOrderBy( + `CASE WHEN a."scopeType" = '${ArchiveScopeType.Source}' THEN s.handle ELSE a."scopeId" END`, + 'ASC', 
+ ) + .limit(DEFAULT_SITEMAP_LIMIT); + +const VALID_ARCHIVE_SCOPE_TYPES = new Set([ + ArchiveScopeType.Tag, + ArchiveScopeType.Source, +]); +const VALID_ARCHIVE_PERIOD_TYPES = new Set([ + ArchivePeriodType.Month, + ArchivePeriodType.Year, +]); + +const buildArchivePagesPaginatedQuery = ( + source: DataSource | EntityManager, + scopeType: ArchiveScopeType, + periodType: ArchivePeriodType, + page: number, +): SelectQueryBuilder => { + const qb = source + .createQueryBuilder() + .select('a."scopeType"', 'scopeType') + .addSelect( + scopeType === ArchiveScopeType.Source ? 's.handle' : 'a."scopeId"', + 'scopeId', + ) + .addSelect('a."periodType"', 'periodType') + .addSelect('a."periodStart"', 'periodStart') + .addSelect('a."createdAt"', 'lastmod') + .from(Archive, 'a') + .where('a."scopeType" = :scopeType', { scopeType }) + .andWhere('a."periodType" = :periodType', { periodType }); + + if (scopeType === ArchiveScopeType.Source) { + qb.innerJoin(Source, 's', 's.id = a."scopeId"'); + qb.orderBy('s.handle', 'ASC'); + } else { + qb.orderBy('a."scopeId"', 'ASC'); + } + + qb.addOrderBy('a."periodStart"', 'ASC') + .limit(ARCHIVE_PAGES_LIMIT) + .offset(page * ARCHIVE_PAGES_LIMIT); + + return qb; +}; + +const getArchivePagesCount = async ( + con: DataSource, +): Promise<{ scopeType: string; periodType: string; count: number }[]> => { + const queryRunner = con.createQueryRunner('slave'); + + try { + const rows = await queryRunner.manager + .createQueryBuilder() + .select('a."scopeType"', 'scopeType') + .addSelect('a."periodType"', 'periodType') + .addSelect('COUNT(*)', 'count') + .from(Archive, 'a') + .where('a."scopeType" IN (:...scopeTypes)', { + scopeTypes: [ArchiveScopeType.Tag, ArchiveScopeType.Source], + }) + .groupBy('a."scopeType"') + .addGroupBy('a."periodType"') + .getRawMany<{ scopeType: string; periodType: string; count: string }>(); + + return rows.map((row) => ({ + scopeType: row.scopeType, + periodType: row.periodType, + count: Number(row.count), + })); + } finally 
{ + await queryRunner.release(); + } +}; + const getPostsSitemapPath = (page: number): string => page === 1 ? '/api/sitemaps/posts-1.xml' : `/api/sitemaps/posts-${page}.xml`; @@ -413,9 +554,28 @@ const buildEvergreenSitemapStream = async ( ): Promise => buildPaginatedPostSitemapStream(con, page, buildEvergreenSitemapQuery); +const buildArchivePagesIndexEntries = ( + prefix: string, + archivePageCounts: { scopeType: string; periodType: string; count: number }[], +): string => + archivePageCounts + .flatMap(({ scopeType, periodType, count }) => { + const pages = Math.max(1, Math.ceil(count / ARCHIVE_PAGES_LIMIT)); + + return Array.from( + { length: pages }, + (_, i) => + ` + ${escapeXml(`${prefix}/api/sitemaps/archive-pages-${scopeType}-${periodType}-${i}.xml`)} + `, + ); + }) + .join('\n'); + const getSitemapIndexXml = ( postsSitemapCount: number, evergreenSitemapCount: number, + archivePageCounts: { scopeType: string; periodType: string; count: number }[], ): string => { const prefix = getSitemapUrlPrefix(); const postsSitemaps = buildSitemapIndexEntries( @@ -428,6 +588,10 @@ const getSitemapIndexXml = ( evergreenSitemapCount, getEvergreenSitemapPath, ); + const archivePagesSitemaps = buildArchivePagesIndexEntries( + prefix, + archivePageCounts, + ); return ` @@ -454,6 +618,10 @@ ${evergreenSitemaps} ${escapeXml(`${prefix}/api/sitemaps/users.xml`)} + + ${escapeXml(`${prefix}/api/sitemaps/archive-index.xml`)} + +${archivePagesSitemaps} `; }; @@ -669,18 +837,89 @@ export default async function (fastify: FastifyInstance): Promise { ); }); + fastify.get('/archive-index.xml', async (_, res) => { + const con = await createOrGetConnection(); + const prefix = getSitemapUrlPrefix(); + + return res + .type('application/xml') + .header('cache-control', SITEMAP_CACHE_CONTROL) + .send( + await buildSitemapXmlStream(con, buildArchiveIndexSitemapQuery, (row) => + getArchiveBestOfUrl( + prefix, + row.scopeType as ArchiveScopeType, + row.scopeId, + ), + ), + ); + }); + + 
// GET /archive-pages-<scopeType>-<periodType>-<page>.xml
// Serves one page of archive URLs for a single scope type / period type pair.
// `page` is zero-based: it is multiplied by ARCHIVE_PAGES_LIMIT to form the query
// offset. Unknown scope or period types and malformed page numbers answer 404.
fastify.get<{
  Params: { scopeType: string; periodType: string; page: string };
}>('/archive-pages-:scopeType-:periodType-:page.xml', async (req, res) => {
  const { scopeType, periodType, page: rawPage } = req.params;

  // Accept decimal digits only. Number.parseInt stops at the first non-digit, so
  // "1abc" or "1.5" used to be served as page 1 under a publicly cacheable URL.
  const page = /^\d+$/.test(rawPage) ? Number(rawPage) : Number.NaN;

  // One check covers both NaN (non-numeric input) and page numbers so large that
  // the resulting row offset can no longer be represented exactly.
  const hasUsableOffset = Number.isSafeInteger(page * ARCHIVE_PAGES_LIMIT);

  if (
    !VALID_ARCHIVE_SCOPE_TYPES.has(scopeType) ||
    !VALID_ARCHIVE_PERIOD_TYPES.has(periodType) ||
    !hasUsableOffset
  ) {
    return res.code(404).send();
  }

  const con = await createOrGetConnection();
  const prefix = getSitemapUrlPrefix();

  // The casts are safe here: both values were checked against the allow-lists above.
  const sitemapStream = await buildSitemapXmlStream(
    con,
    (source) =>
      buildArchivePagesPaginatedQuery(
        source,
        scopeType as ArchiveScopeType,
        periodType as ArchivePeriodType,
        page,
      ),
    (row) =>
      getArchivePageUrl(
        prefix,
        row.scopeType as ArchiveScopeType,
        row.scopeId,
        row.periodType as ArchivePeriodType,
        new Date(row.periodStart),
      ),
  );

  return res
    .type('application/xml')
    .header('cache-control', SITEMAP_CACHE_CONTROL)
    .send(sitemapStream);
});

// GET /index.xml
// Builds the sitemap index. The posts count, evergreen count and per scope/period
// archive counts do not depend on each other, so they are fetched concurrently.
fastify.get('/index.xml', async (_, res) => {
  const con = await createOrGetConnection();
  const [postsSitemapCount, evergreenSitemapCount, archivePageCounts] =
    await Promise.all([
      getReplicaQueryCount(con, buildPostsSitemapBaseQuery).then(
        getSitemapPageCount,
      ),
      getReplicaQueryCount(con, buildEvergreenSitemapBaseQuery).then(
        getSitemapPageCount,
      ),
      getArchivePagesCount(con),
    ]);

  return res
    .type('application/xml')
    .header('cache-control', SITEMAP_CACHE_CONTROL)
    .send(
      getSitemapIndexXml(
        postsSitemapCount,
        evergreenSitemapCount,
        archivePageCounts,
      ),
    );
});
// Closes the enclosing route-registration function, which is opened outside this span.
}