Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions packages/playwright-crawler/src/internals/playwright-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ import type {
RouterRoutes,
} from '@crawlee/browser';
import { BrowserCrawler, Configuration, Router } from '@crawlee/browser';
import type { BrowserPoolOptions, PlaywrightController, PlaywrightPlugin } from '@crawlee/browser-pool';
import type { BrowserPoolOptions, CommonPage, PlaywrightController, PlaywrightPlugin } from '@crawlee/browser-pool';
import type { Dictionary } from '@crawlee/types';
import ow from 'ow';
import type { LaunchOptions, Page, Response } from 'playwright';
import type { Download, LaunchOptions, Page, Response } from 'playwright';

import type { PlaywrightLaunchContext } from './playwright-launcher';
import { PlaywrightLauncher } from './playwright-launcher';
Expand Down Expand Up @@ -238,6 +238,17 @@ export class PlaywrightCrawler extends BrowserCrawler<
super({ ...browserCrawlerOptions, launchContext, browserPoolOptions }, config);
}

/**
 * Runs the base page-info enhancement and then adds download tracking to the context.
 *
 * A `download` listener is attached to the page; every emitted download is appended
 * to a list private to this call. That list is exposed through
 * `crawlingContext.listDownloads`, which resolves to the same array instance on
 * every invocation.
 *
 * @param crawlingContext Context object that receives the `listDownloads` helper.
 * @param page Page to observe; it is treated as a Playwright `Page` here.
 * @param createNewSession Forwarded unchanged to the base implementation.
 */
protected override _enhanceCrawlingContextWithPageInfo(
    crawlingContext: PlaywrightCrawlingContext,
    page: CommonPage,
    createNewSession?: boolean,
): void {
    super._enhanceCrawlingContextWithPageInfo(crawlingContext, page, createNewSession);

    const collected: Download[] = [];
    const playwrightPage = page as Page;

    playwrightPage.on('download', (item) => {
        collected.push(item);
    });

    crawlingContext.listDownloads = async () => {
        return collected;
    };
}

protected override async _runRequestHandler(context: PlaywrightCrawlingContext) {
registerUtilsToContext(context, this.options);
await super._runRequestHandler(context);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ import type { BatchAddRequestsResult } from '@crawlee/types';
import { type CheerioRoot, type Dictionary, expandShadowRoots, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import ow from 'ow';
import type { Page, Response, Route } from 'playwright';
import type { Download, Page, Response, Route } from 'playwright';

import { LruCache } from '@apify/datastructures';
import log_ from '@apify/log';
Expand Down Expand Up @@ -1065,6 +1065,30 @@ export interface PlaywrightContextUtils {
* @param [options]
*/
handleCloudflareChallenge(options?: HandleCloudflareChallengeOptions): Promise<void>;

/**
 * Returns the list of {@link https://playwright.dev/docs/api/class-download | Download} objects
 * collected during the current page navigation and request handler.
 *
 * Useful for accessing files that the page downloads automatically.
 * For most use cases, prefer re-enqueueing the URL to {@apilink FileDownload}.
 * Use this only when direct access to the Playwright `Download` object is required.
 *
 * **Example usage**
 * ```ts
 * requestHandler: async ({ listDownloads }) => {
 *     for (const download of await listDownloads()) {
 *         try {
 *             const stream = await download.createReadStream();
 *             // stream to storage...
 *         } catch {
 *             // download failed or was cancelled
 *         }
 *     }
 * },
 * ```
 *
 * @returns A promise resolving to the `Download` objects collected so far;
 * the array is empty when no download has been triggered yet.
 */
listDownloads(): Promise<Download[]>;
}

export function registerUtilsToContext(
Expand Down
38 changes: 38 additions & 0 deletions test/core/crawlers/playwright_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@ describe('PlaywrightCrawler', () => {
res.send(`<html><head><title>Example Domain</title></head></html>`);
res.status(200);
});
// Page containing a single anchor that starts a file download when clicked.
app.get('/page-with-download', (_req, res) => {
    const html =
        '<html><body><a id="download-link" href="/download-file" download="hello.txt">download</a></body></html>';
    res.status(200);
    res.send(html);
});
// Target of the anchor above: a small plain-text body sent as an attachment.
app.get('/download-file', (_req, res) => {
    const headers: [string, string][] = [
        ['Content-Type', 'text/plain'],
        ['Content-Disposition', 'attachment; filename="hello.txt"'],
    ];
    for (const [name, value] of headers) {
        res.setHeader(name, value);
    }
    res.send('hello');
});
});

beforeAll(async () => {
Expand Down Expand Up @@ -207,6 +217,34 @@ describe('PlaywrightCrawler', () => {
expect(reducedMotion).toBe(launchOptions.reducedMotion);
});

test('exposes triggered downloads via listDownloads()', async () => {
    // Sentinel values: they only change if the request handler actually runs.
    const seen: { before: number; after: number; filename: string | undefined } = {
        before: -1,
        after: -1,
        filename: undefined,
    };

    const crawler = new PlaywrightCrawler({
        maxRequestRetries: 0,
        maxConcurrency: 1,
        requestHandler: async ({ page, listDownloads }) => {
            // No download has been triggered yet at this point.
            const initial = await listDownloads();
            seen.before = initial.length;

            // Begin waiting before the click so the event cannot be missed.
            const pendingDownload = page.waitForEvent('download');
            await page.click('a#download-link');
            await pendingDownload;

            const afterClick = await listDownloads();
            seen.after = afterClick.length;
            seen.filename = afterClick[0]?.suggestedFilename();
        },
    });

    const startUrl = `http://${HOSTNAME}:${port}/page-with-download`;
    await crawler.run([startUrl]);

    expect(seen.before).toBe(0);
    expect(seen.after).toBe(1);
    expect(seen.filename).toBe('hello.txt');
});

test('should have correct types in crawling context', async () => {
const requestHandler = async (crawlingContext: PlaywrightCrawlingContext) => {
// Checking that types are correct
Expand Down
Loading