Skip to content

Commit 5dc1627

Browse files
feat: expose Download objects on PlaywrightCrawlingContext (apify#3596)
When page.goto throws "Download is starting", Playwright has already captured the file — but the Download object was inaccessible to user-land code. The only workaround was re-downloading via HTTP, which breaks on sites requiring a browser session (e.g. ECAS-gated resources on Eur-Lex). This PR adds a listDownloads() method to PlaywrightCrawlingContext that returns the Download[] collected for the current page. The array is populated via a page.on('download', ...) listener registered in _enhanceCrawlingContextWithPageInfo — before navigation, so downloads triggered by page.goto are always captured. Closes apify#3583
1 parent 1d2528b commit 5dc1627

3 files changed

Lines changed: 76 additions & 3 deletions

File tree

packages/playwright-crawler/src/internals/playwright-crawler.ts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ import type {
88
RouterRoutes,
99
} from '@crawlee/browser';
1010
import { BrowserCrawler, Configuration, Router } from '@crawlee/browser';
11-
import type { BrowserPoolOptions, PlaywrightController, PlaywrightPlugin } from '@crawlee/browser-pool';
11+
import type { BrowserPoolOptions, CommonPage, PlaywrightController, PlaywrightPlugin } from '@crawlee/browser-pool';
1212
import type { Dictionary } from '@crawlee/types';
1313
import ow from 'ow';
14-
import type { LaunchOptions, Page, Response } from 'playwright';
14+
import type { Download, LaunchOptions, Page, Response } from 'playwright';
1515

1616
import type { PlaywrightLaunchContext } from './playwright-launcher';
1717
import { PlaywrightLauncher } from './playwright-launcher';
@@ -238,6 +238,17 @@ export class PlaywrightCrawler extends BrowserCrawler<
238238
super({ ...browserCrawlerOptions, launchContext, browserPoolOptions }, config);
239239
}
240240

241+
protected override _enhanceCrawlingContextWithPageInfo(
242+
crawlingContext: PlaywrightCrawlingContext,
243+
page: CommonPage,
244+
createNewSession?: boolean,
245+
): void {
246+
super._enhanceCrawlingContextWithPageInfo(crawlingContext, page, createNewSession);
247+
const downloads: Download[] = [];
248+
(page as Page).on('download', (download) => downloads.push(download));
249+
crawlingContext.listDownloads = async () => downloads;
250+
}
251+
241252
protected override async _runRequestHandler(context: PlaywrightCrawlingContext) {
242253
registerUtilsToContext(context, this.options);
243254
await super._runRequestHandler(context);

packages/playwright-crawler/src/internals/utils/playwright-utils.ts

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ import type { BatchAddRequestsResult } from '@crawlee/types';
3434
import { type CheerioRoot, type Dictionary, expandShadowRoots, sleep } from '@crawlee/utils';
3535
import * as cheerio from 'cheerio';
3636
import ow from 'ow';
37-
import type { Page, Response, Route } from 'playwright';
37+
import type { Download, Page, Response, Route } from 'playwright';
3838

3939
import { LruCache } from '@apify/datastructures';
4040
import log_ from '@apify/log';
@@ -1065,6 +1065,30 @@ export interface PlaywrightContextUtils {
10651065
* @param [options]
10661066
*/
10671067
handleCloudflareChallenge(options?: HandleCloudflareChallengeOptions): Promise<void>;
1068+
1069+
/**
1070+
* Returns the list of {@link https://playwright.dev/docs/api/class-download | Download} objects
1071+
* collected during the current page navigation and request handler.
1072+
*
1073+
* Useful for accessing files that the page downloads automatically.
1074+
* For most use cases, prefer re-enqueueing the URL to {@apilink FileDownload}.
1075+
* Use this only when direct access to the Playwright `Download` object is required.
1076+
*
1077+
* **Example usage**
1078+
* ```ts
1079+
* requestHandler: async ({ listDownloads }) => {
1080+
* for (const download of await listDownloads()) {
1081+
* try {
1082+
* const stream = await download.createReadStream();
1083+
* // stream to storage...
1084+
* } catch {
1085+
* // download failed or was cancelled
1086+
* }
1087+
* }
1088+
* },
1089+
* ```
1090+
*/
1091+
listDownloads(): Promise<Download[]>;
10681092
}
10691093

10701094
export function registerUtilsToContext(

test/core/crawlers/playwright_crawler.test.ts

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,16 @@ describe('PlaywrightCrawler', () => {
4141
res.send(`<html><head><title>Example Domain</title></head></html>`);
4242
res.status(200);
4343
});
44+
app.get('/page-with-download', (_req, res) => {
45+
res.status(200).send(
46+
`<html><body><a id="download-link" href="/download-file" download="hello.txt">download</a></body></html>`,
47+
);
48+
});
49+
app.get('/download-file', (_req, res) => {
50+
res.setHeader('Content-Type', 'text/plain');
51+
res.setHeader('Content-Disposition', 'attachment; filename="hello.txt"');
52+
res.send('hello');
53+
});
4454
});
4555

4656
beforeAll(async () => {
@@ -207,6 +217,34 @@ describe('PlaywrightCrawler', () => {
207217
expect(reducedMotion).toBe(launchOptions.reducedMotion);
208218
});
209219

220+
test('exposes triggered downloads via listDownloads()', async () => {
221+
let countBefore = -1;
222+
let countAfter = -1;
223+
let suggestedFilename: string | undefined;
224+
225+
const playwrightCrawler = new PlaywrightCrawler({
226+
maxRequestRetries: 0,
227+
maxConcurrency: 1,
228+
requestHandler: async ({ page, listDownloads }) => {
229+
countBefore = (await listDownloads()).length;
230+
231+
const downloadPromise = page.waitForEvent('download');
232+
await page.click('a#download-link');
233+
await downloadPromise;
234+
235+
const downloads = await listDownloads();
236+
countAfter = downloads.length;
237+
suggestedFilename = downloads[0]?.suggestedFilename();
238+
},
239+
});
240+
241+
await playwrightCrawler.run([`http://${HOSTNAME}:${port}/page-with-download`]);
242+
243+
expect(countBefore).toBe(0);
244+
expect(countAfter).toBe(1);
245+
expect(suggestedFilename).toBe('hello.txt');
246+
});
247+
210248
test('should have correct types in crawling context', async () => {
211249
const requestHandler = async (crawlingContext: PlaywrightCrawlingContext) => {
212250
// Checking that types are correct

0 commit comments

Comments
 (0)