Skip to content

Commit b3c2475

Browse files
committed
feat: flags to avoid ads and cookie banners
1 parent 09402ba commit b3c2475

6 files changed

Lines changed: 200 additions & 6 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Thumbs.db
3030
dist/
3131
coverage/
3232
build/
33+
test/
3334

3435
# Optional: lock files (if you want to avoid committing them)
3536
# package-lock.json

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,10 @@ Arguments:
376376
Options:
377377
-o, --output <dir> Custom output directory (default: domain name)
378378
--clean Remove tracking scripts and analytics
379+
--block-ads Block advertisements using adblocker plugin
380+
--block-cookies Automatically remove and block cookie consent banners
379381
--ai Enable AI-powered analysis (requires an OpenAI or Gemini API key)
382+
--ai-model <model> AI model to use (default: gemini-3-flash-preview)
380383
--openai-key <key> OpenAI API key for AI features (or set OPENAI_API_KEY env var)
381384
--debug Enable detailed debug logging
382385
--timeout <ms> Page load timeout in milliseconds (default: 120000)

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
"node-fetch": "^3.3.2",
5959
"openai": "^4.67.3",
6060
"puppeteer": "^24.16.2",
61+
"puppeteer-extra": "^3.3.6",
62+
"puppeteer-extra-plugin-adblocker": "^2.13.6",
6163
"sharp": "^0.33.5"
6264
},
6365
"repository": {

src/cli.js

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,16 @@ program
127127
'Remove tracking scripts, analytics, and third-party code',
128128
false,
129129
)
130+
.option(
131+
'--block-ads',
132+
'Block advertisements using adblocker plugin',
133+
false,
134+
)
135+
.option(
136+
'--block-cookies',
137+
'Automatically remove and block cookie consent banners',
138+
false,
139+
)
130140
.option(
131141
'--ai',
132142
'Enable AI-powered website analysis (reads OPENAI_API_KEY or GEMINI_API_KEY from env)',
@@ -162,6 +172,8 @@ program
162172
const config = {
163173
outputDir: options.output,
164174
clean: options.clean,
175+
blockAds: options.blockAds,
176+
blockCookies: options.blockCookies,
165177
ai: aiEnabled,
166178
aiModel: options.aiModel,
167179
debug: options.debug,

src/core/browser-engine.js

Lines changed: 179 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
import puppeteer from 'puppeteer';
1+
import puppeteer from 'puppeteer-extra';
2+
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker';
23
import chalk from 'chalk';
34

45
/**
@@ -10,10 +11,17 @@ export class BrowserEngine {
1011
headless: options.headless !== false,
1112
timeout: options.timeout || 120000,
1213
debug: options.debug || false,
14+
blockAds: !!options.blockAds,
15+
blockCookies: !!options.blockCookies,
1316
...options,
1417
};
1518
this.browser = null;
1619
this.page = null;
20+
21+
// Use adblocker plugin if enabled
22+
if (this.options.blockAds) {
23+
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
24+
}
1725
}
1826

1927
/**
@@ -39,7 +47,7 @@ export class BrowserEngine {
3947

4048
try {
4149
await this.page.setBypassCSP(true);
42-
} catch {}
50+
} catch { }
4351

4452
await this.page.setViewport({ width: 1366, height: 768 });
4553
await this.page.setUserAgent(
@@ -49,18 +57,19 @@ export class BrowserEngine {
4957
await this.page.setDefaultTimeout(this.options.timeout);
5058
await this.page.setDefaultNavigationTimeout(this.options.timeout);
5159

60+
// If adblocker is NOT enabled, we still use request interception for basic logic
61+
// If it IS enabled, puppeteer-extra-plugin-adblocker handles much of this.
5262
await this.page.setRequestInterception(true);
5363

5464
this.page.on('request', (request) => {
55-
request.continue().catch(() => {});
65+
request.continue().catch(() => { });
5666
});
5767

5868
this.page.on('requestfailed', (request) => {
5969
if (this.options.debug) {
6070
console.log(
6171
chalk.yellow(
62-
`⚠️ Failed request: ${request.url()} - ${
63-
request.failure()?.errorText || ''
72+
`⚠️ Failed request: ${request.url()} - ${request.failure()?.errorText || ''
6473
}`,
6574
),
6675
);
@@ -70,11 +79,175 @@ export class BrowserEngine {
7079
return this.page;
7180
}
7281

82+
/**
83+
* Automatically detect and remove/click-through cookie consent banners
84+
*/
85+
async handleCookieConsent(page) {
86+
if (!this.options.blockCookies) return;
87+
88+
if (this.options.debug) {
89+
console.log(chalk.gray(' 🍪 Attempting to remove cookie consent banners...'));
90+
}
91+
92+
try {
93+
await page.evaluate(() => {
94+
// 1. Common selectors for cookie banners and modals
95+
const cookieSelectors = [
96+
'[id*="cookie" i]', '[class*="cookie" i]',
97+
'[id*="consent" i]', '[class*="consent" i]',
98+
'[id*="privacy" i]', '[class*="privacy" i]',
99+
'[id*="banner" i]', '[class*="banner" i]',
100+
'[id*="notice" i]', '[class*="notice" i]',
101+
'[id*="modal" i]', '[class*="modal" i]',
102+
'.optanon-alert-box-wrapper', '#onetrust-consent-sdk',
103+
'.cc_banner', '.cc_container',
104+
'#didomi-host'
105+
];
106+
107+
// 2. Common button text for "Accept"
108+
const acceptButtonTexts = [
109+
// Core accept
110+
'Accept',
111+
'Accept All',
112+
'Accept all cookies',
113+
'Accept Cookies',
114+
'Accept all',
115+
'Accept & Continue',
116+
'Accept and Continue',
117+
'Accept and Close',
118+
'Accept All Cookies',
119+
'Accept all and continue',
120+
121+
// Agree
122+
'Agree',
123+
'I Agree',
124+
'I agree',
125+
'Agree & Continue',
126+
'Agree and Continue',
127+
128+
// Allow
129+
'Allow',
130+
'Allow All',
131+
'Allow all cookies',
132+
'Allow Cookies',
133+
'Allow all',
134+
'Allow & Continue',
135+
136+
// Consent
137+
'Consent',
138+
'Give Consent',
139+
'Provide Consent',
140+
'Yes I Consent',
141+
142+
// Confirm / continue
143+
'Confirm',
144+
'Confirm choices',
145+
'Confirm selection',
146+
'Confirm and continue',
147+
'Continue',
148+
'Continue to site',
149+
'Continue without changes',
150+
151+
// OK style
152+
'OK',
153+
'Okay',
154+
'Ok',
155+
'Got it',
156+
'Understood',
157+
'I understand',
158+
'Sounds good',
159+
160+
// Close style (many banners accept on close)
161+
'Close',
162+
'Close and accept',
163+
'Dismiss',
164+
'Done',
165+
166+
// Save / approve
167+
'Save',
168+
'Save and accept',
169+
'Save preferences',
170+
'Approve',
171+
'Approve all',
172+
173+
// Misc common wording
174+
'Yes',
175+
'Yes, I agree',
176+
'Yes, accept',
177+
'Yes, allow',
178+
'Enable all',
179+
'Enable cookies',
180+
'Accept recommended',
181+
'Accept suggested',
182+
183+
// GDPR CMP wording
184+
'Accept all purposes',
185+
'Accept all cookies and continue',
186+
'Agree to all',
187+
'Allow all and continue'
188+
];
189+
190+
// Remove identified banners
191+
cookieSelectors.forEach(selector => {
192+
try {
193+
document.querySelectorAll(selector).forEach(el => {
194+
// Only remove if it looks like a popup (likely fixed or absolute)
195+
const style = window.getComputedStyle(el);
196+
if (style.position === 'fixed' || style.position === 'absolute' || el.tagName === 'DIV') {
197+
el.style.display = 'none';
198+
el.setAttribute('aria-hidden', 'true');
199+
}
200+
});
201+
} catch (e) { }
202+
});
203+
204+
// Click "Accept" buttons if found (case-insensitive partial match)
205+
const buttons = Array.from(document.querySelectorAll('button, a, span, div[role="button"]'));
206+
for (const btn of buttons) {
207+
const text = (btn.innerText || btn.textContent || '').trim();
208+
if (!text || text.length > 50) continue; // Skip empty or too long text (unlikely to be a simple button)
209+
210+
const lowerText = text.toLowerCase();
211+
if (acceptButtonTexts.some(t => lowerText.includes(t.toLowerCase()))) {
212+
try {
213+
btn.click();
214+
} catch (e) { }
215+
}
216+
}
217+
218+
// 3. Remove overlay/backdrop if present
219+
const overlays = [
220+
'.modal-backdrop', '.fade', '.in',
221+
'[class*="backdrop" i]', '[id*="backdrop" i]',
222+
'[class*="overlay" i]', '[id*="overlay" i]'
223+
];
224+
overlays.forEach(selector => {
225+
try {
226+
document.querySelectorAll(selector).forEach(el => {
227+
const style = window.getComputedStyle(el);
228+
if (style.position === 'fixed' || style.zIndex > 100) {
229+
el.style.display = 'none';
230+
}
231+
});
232+
} catch (e) { }
233+
});
234+
235+
// 4. Restore scrolling if it was disabled by a modal
236+
document.body.style.overflow = 'auto';
237+
document.documentElement.style.overflow = 'auto';
238+
});
239+
} catch (error) {
240+
if (this.options.debug) {
241+
console.log(chalk.gray(` ⚠️ Cookie removal error: ${error.message}`));
242+
}
243+
}
244+
}
245+
73246
async close() {
74247
if (this.browser) {
75248
try {
76249
await this.browser.close();
77-
} catch {}
250+
} catch { }
78251
this.browser = null;
79252
this.page = null;
80253
}

src/core/mirror-cloner.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,9 @@ export class MirrorCloner {
569569
timeout: this.options.timeout,
570570
});
571571

572+
// Handle cookie consent banners before further processing
573+
await this.browserEngine.handleCookieConsent(page);
574+
572575
await this.waitForRootReady(page);
573576
await this.scrollToBottomAndLoad(page);
574577
await this.waitForImagesSettled(page, 8000);

0 commit comments

Comments
 (0)