diff --git a/packages/playwright-crawler/src/internals/playwright-crawler.ts b/packages/playwright-crawler/src/internals/playwright-crawler.ts index 686f540c3d06..48fcaa118c6a 100644 --- a/packages/playwright-crawler/src/internals/playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/playwright-crawler.ts @@ -8,10 +8,10 @@ import type { RouterRoutes, } from '@crawlee/browser'; import { BrowserCrawler, Configuration, Router } from '@crawlee/browser'; -import type { BrowserPoolOptions, PlaywrightController, PlaywrightPlugin } from '@crawlee/browser-pool'; +import type { BrowserPoolOptions, CommonPage, PlaywrightController, PlaywrightPlugin } from '@crawlee/browser-pool'; import type { Dictionary } from '@crawlee/types'; import ow from 'ow'; -import type { LaunchOptions, Page, Response } from 'playwright'; +import type { Download, LaunchOptions, Page, Response } from 'playwright'; import type { PlaywrightLaunchContext } from './playwright-launcher'; import { PlaywrightLauncher } from './playwright-launcher'; @@ -238,6 +238,17 @@ export class PlaywrightCrawler extends BrowserCrawler< super({ ...browserCrawlerOptions, launchContext, browserPoolOptions }, config); } + protected override _enhanceCrawlingContextWithPageInfo( + crawlingContext: PlaywrightCrawlingContext, + page: CommonPage, + createNewSession?: boolean, + ): void { + super._enhanceCrawlingContextWithPageInfo(crawlingContext, page, createNewSession); + const downloads: Download[] = []; + (page as Page).on('download', (download) => downloads.push(download)); + crawlingContext.listDownloads = async () => downloads; + } + protected override async _runRequestHandler(context: PlaywrightCrawlingContext) { registerUtilsToContext(context, this.options); await super._runRequestHandler(context); diff --git a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts index 774785bd99ad..6bd4f6873cf2 100644 --- 
a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts +++ b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts @@ -34,7 +34,7 @@ import type { BatchAddRequestsResult } from '@crawlee/types'; import { type CheerioRoot, type Dictionary, expandShadowRoots, sleep } from '@crawlee/utils'; import * as cheerio from 'cheerio'; import ow from 'ow'; -import type { Page, Response, Route } from 'playwright'; +import type { Download, Page, Response, Route } from 'playwright'; import { LruCache } from '@apify/datastructures'; import log_ from '@apify/log'; @@ -1065,6 +1065,30 @@ export interface PlaywrightContextUtils { * @param [options] */ handleCloudflareChallenge(options?: HandleCloudflareChallengeOptions): Promise<void>; + + /** + * Returns the list of {@link https://playwright.dev/docs/api/class-download | Download} objects + * collected during the current page navigation and request handler. + * + * Useful for accessing files that the page downloads automatically. + * For most use cases, prefer re-enqueueing the URL to {@apilink FileDownload}. + * Use this only when direct access to the Playwright `Download` object is required. + * + * **Example usage** + * ```ts + * requestHandler: async ({ listDownloads }) => { + * for (const download of await listDownloads()) { + * try { + * const stream = await download.createReadStream(); + * // stream to storage... 
+ * } catch { + * // download failed or was cancelled + * } + * } + * }, + * ``` + */ + listDownloads(): Promise<Download[]>; } export function registerUtilsToContext( diff --git a/test/core/crawlers/playwright_crawler.test.ts b/test/core/crawlers/playwright_crawler.test.ts index f381062eebec..2acf7a91ced7 100644 --- a/test/core/crawlers/playwright_crawler.test.ts +++ b/test/core/crawlers/playwright_crawler.test.ts @@ -41,6 +41,16 @@ describe('PlaywrightCrawler', () => { res.send(`Example Domain`); res.status(200); }); + app.get('/page-with-download', (_req, res) => { + res.status(200).send( + `<a id="download-link" href="/download-file">download</a>`, + ); + }); + app.get('/download-file', (_req, res) => { + res.setHeader('Content-Type', 'text/plain'); + res.setHeader('Content-Disposition', 'attachment; filename="hello.txt"'); + res.send('hello'); + }); }); beforeAll(async () => { @@ -207,6 +217,34 @@ describe('PlaywrightCrawler', () => { expect(reducedMotion).toBe(launchOptions.reducedMotion); }); + test('exposes triggered downloads via listDownloads()', async () => { + let countBefore = -1; + let countAfter = -1; + let suggestedFilename: string | undefined; + + const playwrightCrawler = new PlaywrightCrawler({ + maxRequestRetries: 0, + maxConcurrency: 1, + requestHandler: async ({ page, listDownloads }) => { + countBefore = (await listDownloads()).length; + + const downloadPromise = page.waitForEvent('download'); + await page.click('a#download-link'); + await downloadPromise; + + const downloads = await listDownloads(); + countAfter = downloads.length; + suggestedFilename = downloads[0]?.suggestedFilename(); + }, + }); + + await playwrightCrawler.run([`http://${HOSTNAME}:${port}/page-with-download`]); + + expect(countBefore).toBe(0); + expect(countAfter).toBe(1); + expect(suggestedFilename).toBe('hello.txt'); + }); + test('should have correct types in crawling context', async () => { const requestHandler = async (crawlingContext: PlaywrightCrawlingContext) => { // Checking that types are correct