diff --git a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts index a3c921b5d6d3..774785bd99ad 100644 --- a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts +++ b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts @@ -613,15 +613,28 @@ export async function parseWithCheerio( ): Promise { ow(page, ow.object.validate(validators.browserPage)); + const html = ignoreShadowRoots + ? null + : ((await page.evaluate(`(${expandShadowRoots.toString()})(document)`)) as string); + const pageContent = html || (await page.content()); + const $ = cheerio.load(pageContent); + if (page.frames().length > 1 && !ignoreIframes) { const frames = await page.$$('iframe'); + const cheerioIframes = $('iframe').toArray(); + + if (frames.length !== cheerioIframes.length) { + log.warning( + `parseWithCheerio: iframe count mismatch between live DOM (${frames.length}) and page snapshot (${cheerioIframes.length}). Some iframes may not be expanded.`, + ); + } await Promise.all( - frames.map(async (frame) => { + frames.map(async (frame, index) => { try { const iframe = await frame.contentFrame(); - if (iframe) { + if (iframe && cheerioIframes[index]) { const getIframeHTML = async (): Promise => { try { return iframe.locator('body').first().innerHTML(); @@ -631,14 +644,9 @@ export async function parseWithCheerio( }; const contents = await getIframeHTML(); - - await frame.evaluate((f, c) => { - const replacementNode = document.createElement('div'); - replacementNode.innerHTML = c; - replacementNode.className = 'crawlee-iframe-replacement'; - - f.replaceWith(replacementNode); - }, contents); + $(cheerioIframes[index]).replaceWith( + `
${contents}
`, + ); } } catch (error) { log.warning(`Failed to extract iframe content: ${error}`); @@ -647,12 +655,7 @@ export async function parseWithCheerio( ); } - const html = ignoreShadowRoots - ? null - : ((await page.evaluate(`(${expandShadowRoots.toString()})(document)`)) as string); - const pageContent = html || (await page.content()); - - return cheerio.load(pageContent); + return $; } let idcacPlaywright: null | { getInjectableScript: () => string } = null; diff --git a/test/core/playwright_utils.test.ts b/test/core/playwright_utils.test.ts index a91f7ed6115c..d17eeebe9c40 100644 --- a/test/core/playwright_utils.test.ts +++ b/test/core/playwright_utils.test.ts @@ -185,6 +185,25 @@ describe('playwrightUtils', () => { } }); + test('parseWithCheerio() iframe expansion works with Trusted Types CSP', async () => { + const browser = await launchPlaywright(launchContext); + + try { + const page = await browser.newPage(); + await page.goto(new URL('/special/outside-iframe-csp', serverAddress).toString()); + + const $ = await playwrightUtils.parseWithCheerio(page); + + const headings = $('h1') + .map((_, el) => $(el).text()) + .get(); + + expect(headings).toEqual(['Outside iframe', 'In iframe']); + } finally { + await browser.close(); + } + }); + describe('blockRequests()', () => { let browser: Browser = null as any; beforeAll(async () => { diff --git a/test/shared/_helper.ts b/test/shared/_helper.ts index 8c275ea9d17e..b0ac71cf26c7 100644 --- a/test/shared/_helper.ts +++ b/test/shared/_helper.ts @@ -320,6 +320,11 @@ export async function runExampleComServer(): Promise<[Server, number]> { res.type('html').send(responseSamples.outsideIframe); }); + special.get('/outside-iframe-csp', (_req, res) => { + res.setHeader('Content-Security-Policy', "require-trusted-types-for 'script'"); + res.type('html').send(responseSamples.outsideIframe); + }); + special.get('/inside-iframe', (_req, res) => { res.type('html').send(responseSamples.insideIframe); });