/**
 * Builds `count` post fixtures that all belong to `sourceId`.
 *
 * Every fixture defaults to an article that is visible, not private, not
 * deleted and not banned, created at 2023-01-01T00:00:00.000Z.
 *
 * - `id` is `${prefix}-${index}` and `title` is `${prefix} ${index}`.
 * - `shortId` is the prefix with every non-alphanumeric character removed,
 *   cut to its first ten characters, followed by the index. Prefixes that
 *   share the same first ten alphanumerics therefore yield identical shortIds.
 *
 * @param sourceId  Value copied into each fixture's `sourceId`.
 * @param count     Number of fixtures to generate.
 * @param prefix    Seed for `id`, `shortId` and `title`.
 * @param overrides Optional per-index callback; the fields it returns are
 *                  spread last, so they replace the defaults above.
 * @returns The generated fixtures, in index order.
 */
const createSourcePostFixtures = (
  sourceId: string,
  count: number,
  prefix: string,
  overrides?: (index: number) => DeepPartial<Post>,
): DeepPartial<Post>[] => {
  // The stem depends only on the prefix, so derive it once for all fixtures.
  const shortIdStem = prefix.replace(/[^a-z0-9]/gi, '').slice(0, 10);

  const buildFixture = (position: number): DeepPartial<Post> => {
    const defaults: DeepPartial<Post> = {
      id: `${prefix}-${position}`,
      shortId: `${shortIdStem}${position}`,
      title: `${prefix} ${position}`,
      sourceId,
      // A fresh Date per fixture so no two fixtures share one instance.
      createdAt: new Date('2023-01-01T00:00:00.000Z'),
      type: PostType.Article,
      visible: true,
      private: false,
      deleted: false,
      banned: false,
    };
    const customizations = overrides ? overrides(position) : undefined;

    return { ...defaults, ...customizations };
  };

  return Array.from({ length: count }, (_unused, position) =>
    buildFixture(position),
  );
};
image: 'https://daily.dev/not-enough-posts-source.jpg', + handle: 'notenoughposts', + type: SourceType.Machine, + active: true, + private: false, + }, + { + id: 'stale-source', + name: 'Stale Source', + image: 'https://daily.dev/stale-source.jpg', + handle: 'stalesource', + type: SourceType.Machine, + active: true, + private: false, + }, + { + id: 'private-source', + name: 'Private Source', + image: 'https://daily.dev/private-source.jpg', + handle: 'privatesource', + type: SourceType.Machine, + active: true, + private: true, + }, + { + id: 'inactive-source', + name: 'Inactive Source', + image: 'https://daily.dev/inactive-source.jpg', + handle: 'inactivesource', + type: SourceType.Machine, + active: false, + private: false, + }, + { + id: 'squad-source', + name: 'Squad Source', + image: 'https://daily.dev/squad-source.jpg', + handle: 'squadsource', + type: SourceType.Squad, + active: true, + private: false, + }, + ]); + + await con.getRepository(Post).insert([ + ...createSourcePostFixtures( + 'qualified-source', + 9, + 'qualified-old', + () => ({}), + ), + ...createSourcePostFixtures( + 'qualified-source', + 1, + 'qualified-recent', + () => ({ + createdAt: recentActivityDate, + }), + ), + ...createSourcePostFixtures( + 'not-enough-posts-source', + 9, + 'notenough', + () => ({}), + ), + ...createSourcePostFixtures( + 'not-enough-posts-source', + 1, + 'notenough-private', + () => ({ private: true }), + ), + ...createSourcePostFixtures('stale-source', 10, 'stale', () => ({})), + ...createSourcePostFixtures('private-source', 10, 'private', () => ({ + createdAt: recentActivityDate, + })), + ...createSourcePostFixtures('inactive-source', 10, 'inactive', () => ({ + createdAt: recentActivityDate, + })), + ...createSourcePostFixtures('squad-source', 10, 'squad', () => ({ + createdAt: recentActivityDate, + })), + ]); + + const res = await request(app.server) + .get('/sitemaps/sources.xml') + .expect(200); + + expect(res.header['content-type']).toContain('application/xml'); + 
/**
 * Builds the query behind the sources sitemap.
 *
 * Each row exposes the source `handle` and the source's `createdAt` under the
 * alias `lastmod`. A source is returned only when all of the following hold:
 *
 * - it is a machine source that is active and not private;
 * - it has at least `QUALIFIED_SOURCE_MIN_PUBLIC_POSTS` joined posts, where
 *   only posts that are visible, not deleted, not private and not banned
 *   are joined;
 * - the newest of those posts was created within the last 12 months.
 *
 * Rows are ordered by source creation time (newest first), then by handle,
 * and capped at `DEFAULT_SITEMAP_LIMIT`.
 *
 * @param source Connection or entity manager used to create the builder.
 * @returns The configured, not yet executed, query builder.
 */
const buildSourcesSitemapQuery = (
  source: DataSource | EntityManager,
): SelectQueryBuilder<Source> => {
  // Only posts matching this join condition count towards qualification.
  const publicPostJoinCondition = `p."sourceId" = s.id
      AND p.deleted = false
      AND p.visible = true
      AND p.private = false
      AND p.banned = false`;
  const recentActivityCondition = `MAX(p."createdAt") >= current_timestamp - interval '12 months'`;

  const sourcesWithPublicPosts = source
    .createQueryBuilder()
    .select('s.handle', 'handle')
    .addSelect('s."createdAt"', 'lastmod')
    .from(Source, 's')
    .innerJoin(Post, 'p', publicPostJoinCondition);

  const publicMachineSources = sourcesWithPublicPosts
    .where('s.type = :type', { type: SourceType.Machine })
    .andWhere('s.active = true')
    .andWhere('s.private = false');

  // One group per source; the HAVING clauses then apply to its joined posts.
  const qualifiedSources = publicMachineSources
    .groupBy('s.id')
    .addGroupBy('s.handle')
    .addGroupBy('s."createdAt"')
    .having('COUNT(*) >= :minPublicPosts')
    .andHaving(recentActivityCondition);

  return qualifiedSources
    .orderBy('s."createdAt"', 'DESC')
    .addOrderBy('s.handle', 'ASC')
    .limit(DEFAULT_SITEMAP_LIMIT)
    .setParameter('minPublicPosts', QUALIFIED_SOURCE_MIN_PUBLIC_POSTS);
};