diff --git a/__tests__/sitemaps.ts b/__tests__/sitemaps.ts index f4757c4847..7229eec931 100644 --- a/__tests__/sitemaps.ts +++ b/__tests__/sitemaps.ts @@ -269,6 +269,38 @@ describe('GET /sitemaps/tags.xml', () => { describe('GET /sitemaps/index.xml', () => { it('should return sitemap index xml with all paginated post sitemaps', async () => { + const oldDate = new Date(now.getTime() - 91 * ONE_DAY_IN_SECONDS * 1000); + + await con.getRepository(Post).insert([ + { + id: 'evergreen-index-1', + shortId: 'ei1', + title: 'Evergreen Index 1', + sourceId: 'a', + createdAt: oldDate, + type: PostType.Article, + upvotes: 10, + }, + { + id: 'evergreen-index-2', + shortId: 'ei2', + title: 'Evergreen Index 2', + sourceId: 'a', + createdAt: new Date(oldDate.getTime() - 1000), + type: PostType.Article, + upvotes: 11, + }, + { + id: 'evergreen-index-3', + shortId: 'ei3', + title: 'Evergreen Index 3', + sourceId: 'a', + createdAt: new Date(oldDate.getTime() - 2000), + type: PostType.Article, + upvotes: 12, + }, + ]); + const res = await request(app.server) .get('/sitemaps/index.xml') .expect(200); @@ -285,7 +317,10 @@ describe('GET /sitemaps/index.xml', () => { 'http://localhost:5002/api/sitemaps/posts-2.xml', ); expect(res.text).toContain( - 'http://localhost:5002/api/sitemaps/tags.xml', + 'http://localhost:5002/api/sitemaps/evergreen.xml', + ); + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/evergreen-2.xml', ); expect(res.text).toContain( 'http://localhost:5002/api/sitemaps/agents.xml', @@ -296,6 +331,9 @@ describe('GET /sitemaps/index.xml', () => { expect(res.text).toContain( 'http://localhost:5002/api/sitemaps/squads.xml', ); + expect(res.text).toContain( + 'http://localhost:5002/api/sitemaps/tags.xml', + ); }); }); @@ -431,6 +469,77 @@ describe('GET /sitemaps/squads.xml', () => { }); describe('GET /sitemaps/evergreen.xml', () => { + it('should include posts with at least 10 upvotes and paginate older posts', async () => { + const oldDate = new Date(now.getTime() - 91 * ONE_DAY_IN_SECONDS * 1000); + + await con.getRepository(Post).insert([ + { + id: 'evergreen-min-threshold', + shortId: 'emt', + title: 'Evergreen Min Threshold', + sourceId: 'a', + createdAt: oldDate, + type: PostType.Article, + upvotes: 10, + }, + { + id: 'evergreen-next-page', + shortId: 'enp', + title: 'Evergreen Next Page', + sourceId: 'a', + createdAt: new Date(oldDate.getTime() - 1000), + type: PostType.Article, + upvotes: 11, + }, + { + id: 'evergreen-third-page', + shortId: 'etp', + title: 'Evergreen Third Page', + sourceId: 'a', + createdAt: new Date(oldDate.getTime() - 2000), + type: PostType.Article, + upvotes: 12, + }, + { + id: 'evergreen-below-threshold', + shortId: 'ebt', + title: 'Evergreen Below Threshold', + sourceId: 'a', + createdAt: new Date(oldDate.getTime() - 3000), + type: PostType.Article, + upvotes: 9, + }, + ]); + + const firstPage = await request(app.server) + .get('/sitemaps/evergreen.xml') + .expect(200); + const secondPage = await request(app.server) + .get('/sitemaps/evergreen-2.xml') + .expect(200); + + expect(firstPage.header['content-type']).toContain('application/xml'); + expect(firstPage.text).toContain( + '/posts/evergreen-third-page-evergreen-third-page', + ); + expect(firstPage.text).toContain( + '/posts/evergreen-next-page-evergreen-next-page', + ); + expect(firstPage.text).not.toContain( + '/posts/evergreen-min-threshold-evergreen-min-threshold', + ); + expect(firstPage.text).not.toContain( + '/posts/evergreen-below-threshold-ebt', + ); + + expect(secondPage.text).toContain( + '/posts/evergreen-min-threshold-evergreen-min-threshold', + ); + expect(secondPage.text).not.toContain( + '/posts/evergreen-below-threshold-ebt', + ); + }); + it('should exclude posts by low-reputation authors', async () => { await con.getRepository(User).save({ id: 'low-rep-sitemap', diff --git a/src/routes/sitemaps.ts b/src/routes/sitemaps.ts index 409bb0feaa..af9583909c 100644 --- a/src/routes/sitemaps.ts +++ b/src/routes/sitemaps.ts @@ -27,7 +27,7 @@ const ARENA_SITEMAP_GROUP_IDS = [ '970ab2c9-f845-4822-82f0-02169713b814', ]; -const getPostsSitemapLimit = (): number => { +const getPaginatedSitemapLimit = (): number => { const limit = Number.parseInt(process.env.SITEMAP_LIMIT || '', 10); return Number.isInteger(limit) && limit > 0 ? limit : DEFAULT_SITEMAP_LIMIT; @@ -139,32 +139,35 @@ const applyPostsSitemapOrder = ( ): SelectQueryBuilder => query.orderBy('p."createdAt"', 'ASC').addOrderBy('p.id', 'ASC'); -const buildPostsSitemapQuery = ( - source: DataSource | EntityManager, +const applyPaginatedSitemapWindow = ( + query: SelectQueryBuilder, page: number, ): SelectQueryBuilder => - applyPostsSitemapOrder( - buildPostsSitemapBaseQuery(source) - .select('p.slug', 'slug') - .addSelect('p."metadataChangedAt"', 'lastmod') - .limit(getPostsSitemapLimit()) - .offset((page - 1) * getPostsSitemapLimit()), - ); + query + .limit(getPaginatedSitemapLimit()) + .offset((page - 1) * getPaginatedSitemapLimit()); -const buildPostsSitemapTextQuery = ( +const buildPostsSitemapQuery = ( source: DataSource | EntityManager, + page: number, ): SelectQueryBuilder => - applyPostsSitemapOrder( - buildPostsSitemapBaseQuery(source).select('p.slug', 'slug'), + applyPaginatedSitemapWindow( + applyPostsSitemapOrder( + buildPostsSitemapBaseQuery(source) + .select('p.slug', 'slug') + .addSelect('p."metadataChangedAt"', 'lastmod'), + ), + page, ); -const buildPostSitemapStream = async ( +const buildPaginatedPostSitemapStream = async ( con: DataSource, page: number, + buildQuery: (source: EntityManager, page: number) => SelectQueryBuilder, ): Promise => { const prefix = getSitemapUrlPrefix(); const input = await streamReplicaQuery(con, (source) => - buildPostsSitemapQuery(source, page), + buildQuery(source, page), ); return toSitemapUrlSetStream( @@ -174,13 +177,53 @@ const buildPostSitemapStream = async ( ); }; -const buildEvergreenSitemapQuery = ( +const getSitemapPageCount = (totalPosts: number): number => + Math.max(1, Math.ceil(totalPosts / getPaginatedSitemapLimit())); + +const getReplicaQueryCount = async ( + con: DataSource, + buildQuery: (source: EntityManager) => SelectQueryBuilder, +): Promise => { + const queryRunner = con.createQueryRunner('slave'); + + try { + return await buildQuery(queryRunner.manager).getCount(); + } finally { + await queryRunner.release(); + } +}; + +const buildSitemapIndexEntries = ( + prefix: string, + sitemapCount: number, + getPath: (page: number) => string, +): string => + Array.from({ length: sitemapCount }, (_, index) => { + const page = index + 1; + + return ` + ${escapeXml(`${prefix}${getPath(page)}`)} + `; + }).join('\n'); + +const buildPostsSitemapTextQuery = ( + source: DataSource | EntityManager, +): SelectQueryBuilder => + applyPostsSitemapOrder( + buildPostsSitemapBaseQuery(source).select('p.slug', 'slug'), + ); + +const buildPostSitemapStream = async ( + con: DataSource, + page: number, +): Promise => + buildPaginatedPostSitemapStream(con, page, buildPostsSitemapQuery); + +const buildEvergreenSitemapBaseQuery = ( source: DataSource | EntityManager, ): SelectQueryBuilder => source .createQueryBuilder() - .select('p.slug', 'slug') - .addSelect('p."metadataChangedAt"', 'lastmod') .from(Post, 'p') .leftJoin(User, 'u', 'p."authorId" = u.id') .where('p.type NOT IN (:...types)', { types: [PostType.Welcome] }) @@ -188,10 +231,21 @@ const buildEvergreenSitemapQuery = ( .andWhere('NOT p.banned') .andWhere('NOT p.deleted') .andWhere('p."createdAt" <= current_timestamp - interval \'90 day\'') - .andWhere('p.upvotes >= :minUpvotes', { minUpvotes: 50 }) - .andWhere('(u.id is null or u.reputation > 10)') - .orderBy('p.upvotes', 'DESC') - .limit(DEFAULT_SITEMAP_LIMIT); + .andWhere('p.upvotes >= :minUpvotes', { minUpvotes: 10 }) + .andWhere('(u.id is null or u.reputation > 10)'); + +const buildEvergreenSitemapQuery = ( + source: DataSource | EntityManager, + page: number, +): SelectQueryBuilder => + applyPaginatedSitemapWindow( + buildEvergreenSitemapBaseQuery(source) + .select('p.slug', 'slug') + .addSelect('p."metadataChangedAt"', 'lastmod') + .orderBy('p."createdAt"', 'ASC') + .addOrderBy('p.id', 'ASC'), + page, + ); const buildTagsSitemapQuery = ( source: DataSource | EntityManager, @@ -250,38 +304,37 @@ const buildSquadsSitemapQuery = ( const getPostsSitemapPath = (page: number): string => page === 1 ? '/api/sitemaps/posts-1.xml' : `/api/sitemaps/posts-${page}.xml`; -const getPostsSitemapPageCount = (totalPosts: number): number => - Math.max(1, Math.ceil(totalPosts / getPostsSitemapLimit())); +const getEvergreenSitemapPath = (page: number): string => + page === 1 + ? '/api/sitemaps/evergreen.xml' + : `/api/sitemaps/evergreen-${page}.xml`; -const getPostsSitemapCount = async (con: DataSource): Promise => { - const queryRunner = con.createQueryRunner('slave'); - - try { - return await buildPostsSitemapBaseQuery(queryRunner.manager).getCount(); - } finally { - await queryRunner.release(); - } -}; +const buildEvergreenSitemapStream = async ( + con: DataSource, + page: number, +): Promise => + buildPaginatedPostSitemapStream(con, page, buildEvergreenSitemapQuery); -const getSitemapIndexXml = (postsSitemapCount: number): string => { +const getSitemapIndexXml = ( + postsSitemapCount: number, + evergreenSitemapCount: number, +): string => { const prefix = getSitemapUrlPrefix(); - const postsSitemaps = Array.from( - { length: postsSitemapCount }, - (_, index) => { - const page = index + 1; - - return ` - ${escapeXml(`${prefix}${getPostsSitemapPath(page)}`)} - `; - }, - ).join('\n'); + const postsSitemaps = buildSitemapIndexEntries( + prefix, + postsSitemapCount, + getPostsSitemapPath, + ); + const evergreenSitemaps = buildSitemapIndexEntries( + prefix, + evergreenSitemapCount, + getEvergreenSitemapPath, + ); return ` ${postsSitemaps} - - ${escapeXml(`${prefix}/api/sitemaps/evergreen.xml`)} - +${evergreenSitemaps} ${escapeXml(`${prefix}/api/sitemaps/tags.xml`)} @@ -374,19 +427,28 @@ export default async function (fastify: FastifyInstance): Promise { fastify.get('/evergreen.xml', async (_, res) => { const con = await createOrGetConnection(); - const prefix = getSitemapUrlPrefix(); - const input = await streamReplicaQuery(con, buildEvergreenSitemapQuery); return res .type('application/xml') .header('cache-control', SITEMAP_CACHE_CONTROL) - .send( - toSitemapUrlSetStream( - input, - (row) => getPostSitemapUrl(prefix, row.slug), - getSitemapRowLastmod, - ), - ); + .send(await buildEvergreenSitemapStream(con, 1)); + }); + + fastify.get<{ + Params: { page: string }; + }>('/evergreen-:page.xml', async (req, res) => { + const page = Number.parseInt(req.params.page, 10); + + if (!Number.isInteger(page) || page < 1) { + return res.code(404).send(); + } + + const con = await createOrGetConnection(); + + return res + .type('application/xml') + .header('cache-control', SITEMAP_CACHE_CONTROL) + .send(await buildEvergreenSitemapStream(con, page)); }); fastify.get('/tags.txt', async (_, res) => { @@ -473,13 +535,16 @@ export default async function (fastify: FastifyInstance): Promise { fastify.get('/index.xml', async (_, res) => { const con = await createOrGetConnection(); - const postsSitemapCount = getPostsSitemapPageCount( - await getPostsSitemapCount(con), + const postsSitemapCount = getSitemapPageCount( + await getReplicaQueryCount(con, buildPostsSitemapBaseQuery), + ); + const evergreenSitemapCount = getSitemapPageCount( + await getReplicaQueryCount(con, buildEvergreenSitemapBaseQuery), ); return res .type('application/xml') .header('cache-control', SITEMAP_CACHE_CONTROL) - .send(getSitemapIndexXml(postsSitemapCount)); + .send(getSitemapIndexXml(postsSitemapCount, evergreenSitemapCount)); }); }