From 39d8a5a059b6e044616d302e97e683934f92f693 Mon Sep 17 00:00:00 2001 From: liquidsec Date: Tue, 17 Feb 2026 14:02:30 -0500 Subject: [PATCH 01/28] add parameter emmision to wayback --- bbot/modules/wayback.py | 77 +++++++++++++++++-- .../module_tests/test_module_wayback.py | 33 ++++++++ 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 49010f451a..d871bf2631 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -1,38 +1,72 @@ from datetime import datetime +from urllib.parse import parse_qs, urlparse, urlunparse +from bbot.core.helpers.validators import clean_url from bbot.modules.templates.subdomain_enum import subdomain_enum class wayback(subdomain_enum): flags = ["safe", "passive", "subdomain-enum"] - watched_events = ["DNS_NAME"] - produced_events = ["URL_UNVERIFIED", "DNS_NAME"] + watched_events = ["DNS_NAME", "URL"] + produced_events = ["URL_UNVERIFIED", "DNS_NAME", "WEB_PARAMETER"] meta = { "description": "Query archive.org's API for subdomains", "created_date": "2022-04-01", "author": "@liquidsec", } - options = {"urls": False, "garbage_threshold": 10} + options = {"urls": False, "garbage_threshold": 10, "parameters": False} options_desc = { "urls": "emit URLs in addition to DNS_NAMEs", "garbage_threshold": "Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data)", + "parameters": "emit WEB_PARAMETER events for query parameters discovered in archived URLs (forces urls=true)", } in_scope_only = True base_url = "http://web.archive.org" + url_blacklist = ["_Incapsula_Resource"] async def setup(self): self.urls = self.config.get("urls", False) + self.parameters = self.config.get("parameters", False) + if self.parameters: + self.urls = True self.garbage_threshold = self.config.get("garbage_threshold", 10) + self._parameter_cache = {} return await super().setup() async def handle_event(self, event): + if event.type == "URL": + # use clean_url (always strips query) to match cache key regardless of url_querystring_remove setting + cached = self._parameter_cache.pop(clean_url(event.data).geturl(), None) + if cached is not None: + flat_params, base_url = cached + for param_name, original_value in flat_params.items(): + data = { + "host": str(event.host), + "type": "GETPARAM", + "name": param_name, + "original_value": original_value, + "url": base_url, + "description": f"HTTP Extracted Parameter [{param_name}] (wayback)", + "additional_params": {k: v for k, v in flat_params.items() if k != param_name}, + } + await self.emit_event( + data, + "WEB_PARAMETER", + event, + tags=["from-wayback"], + context=f"{{module}} found query parameter [{param_name}] in archived URL and emitted {{event.type}}", + ) + return + query = self.make_query(event) for result, event_type in await self.query(query): + tags = ["from-wayback"] if event_type == "URL_UNVERIFIED" else [] await self.emit_event( result, event_type, event, + tags=tags, abort_if=self.abort_if, context=f'{{module}} queried archive.org for "{query}" and found {{event.type}}: {{event.pretty_string}}', ) @@ -40,7 +74,14 @@ async def handle_event(self, event): async def query(self, query): results = set() waybackurl = f"{self.base_url}/cdx/search/cdx?url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" - r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 10) + r = None + for i in range(3): + r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 
10) + if r: + break + if i < 2: + self.verbose(f'Error connecting to archive.org for query "{query}", retrying ({i + 1}/2)') + await self.helpers.sleep(2**i) if not r: self.warning(f'Error connecting to archive.org for query "{query}"') return results @@ -61,6 +102,28 @@ async def query(self, query): self.verbose(f"Found {len(urls):,} URLs for {query}") + # pre-extract parameters from raw URLs before collapse strips query strings + raw_url_params = {} + if self.parameters: + for url in urls: + try: + parsed = urlparse(url) + if any(bl in url for bl in self.url_blacklist): + continue + if parsed.query and parsed.hostname and self.scan.in_scope(parsed.hostname): + params = parse_qs(parsed.query) + flat_params = {k: v[0] for k, v in params.items()} + if flat_params: + # key by cleaned URL (always strips query) to match what collapse_urls produces + cleaned = clean_url(url) + cleaned_str = cleaned.geturl() + if cleaned_str not in raw_url_params: + raw_url_params[cleaned_str] = flat_params + else: + raw_url_params[cleaned_str].update(flat_params) + except Exception: + continue + dns_names = set() collapsed_urls = 0 start_time = datetime.now() @@ -80,7 +143,11 @@ async def query(self, query): dns_names.add(h) results.add((dns_name, "DNS_NAME")) else: - results.add((parsed_url.geturl(), "URL_UNVERIFIED")) + url_str = parsed_url.geturl() + results.add((url_str, "URL_UNVERIFIED")) + if self.parameters and url_str in raw_url_params: + base_url = urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "")) + self._parameter_cache[url_str] = (raw_url_params[url_str], base_url) end_time = datetime.now() duration = self.helpers.human_timedelta(end_time - start_time) self.verbose(f"Collapsed {len(urls):,} -> {collapsed_urls:,} URLs in {duration}") diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index 13ddac33fa..21d03038aa 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -10,3 +10,36 @@ async def setup_after_prep(self, module_test): def check(self, module_test, events): assert any(e.data == "asdf.blacklanternsecurity.com" for e in events), "Failed to detect subdomain" + + +class TestWaybackParameters(ModuleTestBase): + module_name = "wayback" + config_overrides = {"modules": {"wayback": {"parameters": True}}} + + async def setup_after_prep(self, module_test): + module_test.httpx_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + json=[ + ["original"], + ["http://blacklanternsecurity.com/page?foo=bar&baz=qux"], + ], + ) + # mock httpx response for the URL so it becomes a verified URL event + module_test.httpx_mock.add_response(url="http://blacklanternsecurity.com/page?foo=bar&baz=qux") + + def check(self, module_test, events): + assert any(e.type == "URL_UNVERIFIED" and "blacklanternsecurity.com/page" in e.data for e in events), ( + "Failed to emit URL_UNVERIFIED" + ) + assert any( + e.type == "WEB_PARAMETER" and e.data["name"] == "foo" and e.data["original_value"] == "bar" for e in events + ), "Failed to emit WEB_PARAMETER for foo" + assert any( + e.type == "WEB_PARAMETER" and e.data["name"] == "baz" and e.data["original_value"] == "qux" for e in events + ), "Failed to emit WEB_PARAMETER for baz" + # check that additional_params contains sibling params but excludes the current one + for e in events: + if e.type == 
"WEB_PARAMETER" and e.data["name"] == "foo": + assert e.data["additional_params"] == {"baz": "qux"}, f"foo's additional_params wrong: {e.data['additional_params']}" + if e.type == "WEB_PARAMETER" and e.data["name"] == "baz": + assert e.data["additional_params"] == {"foo": "bar"}, f"baz's additional_params wrong: {e.data['additional_params']}" From 27f140914f3540a671ede5ffcfe1a028d68d77b2 Mon Sep 17 00:00:00 2001 From: liquidsec Date: Tue, 17 Feb 2026 14:25:13 -0500 Subject: [PATCH 02/28] mods to the wayback parameter extraction --- bbot/modules/wayback.py | 14 +++++++++++-- .../module_tests/test_module_wayback.py | 20 ++++++++++++------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index d871bf2631..a0b5a2c03c 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -18,7 +18,7 @@ class wayback(subdomain_enum): options_desc = { "urls": "emit URLs in addition to DNS_NAMEs", "garbage_threshold": "Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data)", - "parameters": "emit WEB_PARAMETER events for query parameters discovered in archived URLs (forces urls=true)", + "parameters": "emit WEB_PARAMETER events for query parameters discovered in archived URLs (requires urls=true)", } in_scope_only = True @@ -29,7 +29,17 @@ async def setup(self): self.urls = self.config.get("urls", False) self.parameters = self.config.get("parameters", False) if self.parameters: - self.urls = True + if not self.urls: + self.warning("parameters option requires urls to be enabled") + return False + consumers = [m for m, mod in self.scan.modules.items() if "WEB_PARAMETER" in mod.watched_events] + if not consumers: + self.warning("Disabling parameter extraction because no modules consume WEB_PARAMETER events") + self.parameters = False + else: + self.hugeinfo( + f"Parameter extraction enabled because the following modules consume WEB_PARAMETER events: [{', '.join(consumers)}]" + ) self.garbage_threshold = self.config.get("garbage_threshold", 10) self._parameter_cache = {} return await super().setup() diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index 21d03038aa..cd12c0b672 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -14,21 +14,23 @@ def check(self, module_test, events): class TestWaybackParameters(ModuleTestBase): module_name = "wayback" - config_overrides = {"modules": {"wayback": {"parameters": True}}} + modules_overrides = ["wayback", "hunt"] + whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "parameters": True}}} async def setup_after_prep(self, module_test): module_test.httpx_mock.add_response( url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", json=[ ["original"], - ["http://blacklanternsecurity.com/page?foo=bar&baz=qux"], + ["http://127.0.0.1:8888/page?foo=bar&baz=qux"], ], ) - # mock httpx response for the URL so it becomes a verified URL event - module_test.httpx_mock.add_response(url="http://blacklanternsecurity.com/page?foo=bar&baz=qux") + # serve a response on the local httpserver so the httpx binary gets a 200 + module_test.set_expect_requests(expect_args={"uri": "/page"}, respond_args={"response_data": "alive"}) def check(self, module_test, events): - assert 
any(e.type == "URL_UNVERIFIED" and "blacklanternsecurity.com/page" in e.data for e in events), ( + assert any(e.type == "URL_UNVERIFIED" and "127.0.0.1" in e.data and "/page" in e.data for e in events), ( "Failed to emit URL_UNVERIFIED" ) assert any( @@ -40,6 +42,10 @@ def check(self, module_test, events): # check that additional_params contains sibling params but excludes the current one for e in events: if e.type == "WEB_PARAMETER" and e.data["name"] == "foo": - assert e.data["additional_params"] == {"baz": "qux"}, f"foo's additional_params wrong: {e.data['additional_params']}" + assert e.data["additional_params"] == {"baz": "qux"}, ( + f"foo's additional_params wrong: {e.data['additional_params']}" + ) if e.type == "WEB_PARAMETER" and e.data["name"] == "baz": - assert e.data["additional_params"] == {"foo": "bar"}, f"baz's additional_params wrong: {e.data['additional_params']}" + assert e.data["additional_params"] == {"foo": "bar"}, ( + f"baz's additional_params wrong: {e.data['additional_params']}" + ) From aac96741a3ac148483043a425eb5b3fc18f26bd8 Mon Sep 17 00:00:00 2001 From: liquidsec Date: Wed, 18 Feb 2026 12:04:24 -0500 Subject: [PATCH 03/28] more features / bug fixes for new wayback --- bbot/core/event/base.py | 14 + bbot/defaults.yml | 1 + bbot/modules/internal/excavate.py | 53 ++- bbot/modules/templates/subdomain_enum.py | 4 +- bbot/modules/wayback.py | 404 ++++++++++++++---- bbot/presets/kitchen-sink.yml | 9 + bbot/presets/wayback-intense.yml | 15 + bbot/presets/wayback.yml | 12 + bbot/presets/web/lightfuzz-heavy.yml | 4 + bbot/presets/web/lightfuzz-max.yml | 4 + .../module_tests/test_module_wayback.py | 392 +++++++++++++++++ 11 files changed, 828 insertions(+), 84 deletions(-) create mode 100644 bbot/presets/wayback-intense.yml create mode 100644 bbot/presets/wayback.yml diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py index c010148036..643aec3b92 100644 --- a/bbot/core/event/base.py +++ b/bbot/core/event/base.py @@ -1069,6 +1069,19 @@ def _data_load(self, data): class DictHostEvent(DictEvent): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # inherit archive_url from parent for provenance tracking (e.g. 
wayback archived content) + if isinstance(self.data, dict) and "archive_url" not in self.data: + parent = self.parent + if ( + parent is not None + and parent is not self + and isinstance(parent.data, dict) + and "archive_url" in parent.data + ): + self.data["archive_url"] = parent.data["archive_url"] + def _host(self): if isinstance(self.data, dict) and "host" in self.data: return make_ip_type(self.data["host"]) @@ -1645,6 +1658,7 @@ class _data_validator(BaseModel): full_url: Optional[str] = None path: Optional[str] = None cves: Optional[list[str]] = None + archive_url: Optional[str] = None _validate_url = field_validator("url")(validators.validate_url) _validate_host = field_validator("host")(validators.validate_host) _validate_severity = field_validator("severity")(validators.validate_severity) diff --git a/bbot/defaults.yml b/bbot/defaults.yml index 3856a1f644..18abb48dcc 100644 --- a/bbot/defaults.yml +++ b/bbot/defaults.yml @@ -261,6 +261,7 @@ parameter_blacklist: - .AspNetCore.Session - PHPSESSID - __cf_bm + - _cfuvid - f5_cspm parameter_blacklist_prefixes: diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py index 7f8c559471..84824ee0e4 100644 --- a/bbot/modules/internal/excavate.py +++ b/bbot/modules/internal/excavate.py @@ -366,6 +366,34 @@ def in_bl(self, value): return False + def _event_host(self, event): + """Get the effective host from an event, preferring data["host"] over parsed_url. + + HTTP_RESPONSE._host() derives from parsed_url.hostname (i.e. data["url"]), + but data["host"] may be explicitly overridden (e.g. for archived wayback content + where url is archive.org but host is the original target). + """ + if isinstance(event.data, dict) and event.data.get("host"): + return str(event.data["host"]) + return str(event.host) + + def _event_base_url(self, event): + """Reconstruct the effective base URL from event data fields. + + For normal HTTP_RESPONSE events, this matches event.parsed_url. + For archived content (e.g. wayback), the data fields (host/scheme/path) + reflect the original URL while parsed_url comes from the archive URL. + """ + scheme = event.data.get("scheme", event.parsed_url.scheme) + host = self._event_host(event) + port = event.data.get("port") + if port is not None: + port = int(port) + if not ((scheme == "http" and port == 80) or (scheme == "https" and port == 443)): + host = f"{host}:{port}" + path = event.data.get("path", event.parsed_url.path) + return urlparse(f"{scheme}://{host}{path}") + def url_unparse(self, param_type, parsed_url): # Reconstructs a URL, optionally omitting the query string based on remove_querystring configuration value. if param_type == "GETPARAM": @@ -641,8 +669,9 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte # The endpoint is usually a form action - we should use it if we have it. If not, default to URL. 
else: - # Use the original URL as the base and resolve the endpoint correctly in case of relative paths - base_url = f"{event.parsed_url.scheme}://{event.parsed_url.netloc}{event.parsed_url.path}" + # Use the effective base URL (which may differ from parsed_url for archived content) + event_base = self.excavate._event_base_url(event) + base_url = f"{event_base.scheme}://{event_base.netloc}{event_base.path}" if not self.excavate.remove_querystring and len(event.parsed_url.query) > 0: base_url += f"?{event.parsed_url.query}" url = urljoin(base_url, endpoint) @@ -1000,12 +1029,13 @@ async def emit_custom_parameters(self, event, config_key, param_type, descriptio # Emits WEB_PARAMETER events for custom headers and cookies from the configuration. custom_params = self.scan.web_config.get(config_key, {}) for param_name, param_value in custom_params.items(): + event_base = self._event_base_url(event) await self.emit_web_parameter( - host=event.parsed_url.hostname, + host=self._event_host(event), param_type=param_type, name=param_name, original_value=param_value, - url=self.url_unparse(param_type, event.parsed_url), + url=self.url_unparse(param_type, event_base), description=f"HTTP Extracted Parameter [{param_name}] ({description_suffix})", additional_params=_exclude_key(custom_params, param_name), event=event, @@ -1121,7 +1151,7 @@ async def search(self, data, event, content_type, discovery_context="HTTP respon if results: for parameter_name, original_value in results: await self.emit_web_parameter( - host=str(event.host), + host=self._event_host(event), param_type="SPECULATIVE", name=parameter_name, original_value=original_value, @@ -1129,7 +1159,7 @@ async def search(self, data, event, content_type, discovery_context="HTTP respon description=f"HTTP Extracted Parameter (speculative from {source_type} content) [{parameter_name}]", additional_params={}, event=event, - context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {str(event.host)}", + context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {self._event_host(event)}", ) return @@ -1181,7 +1211,7 @@ async def handle_event(self, event, **kwargs): ) in extract_params_url(event.parsed_url): if self.in_bl(parameter_name) is False: await self.emit_web_parameter( - host=parsed_url.hostname, + host=self._event_host(event), param_type="GETPARAM", name=parameter_name, original_value=original_value, @@ -1215,12 +1245,13 @@ async def handle_event(self, event, **kwargs): if self.in_bl(cookie_name) is False: self.assigned_cookies[cookie_name] = cookie_value + event_base = self._event_base_url(event) await self.emit_web_parameter( - host=str(event.host), + host=self._event_host(event), param_type="COOKIE", name=cookie_name, original_value=cookie_value, - url=self.url_unparse("COOKIE", event.parsed_url), + url=self.url_unparse("COOKIE", event_base), description=f"Set-Cookie Assigned Cookie [{cookie_name}]", additional_params={}, event=event, @@ -1257,10 +1288,10 @@ async def handle_event(self, event, **kwargs): original_value, regex_name, additional_params, - ) in extract_params_location(header_value, event.parsed_url): + ) in extract_params_location(header_value, self._event_base_url(event)): if self.in_bl(parameter_name) is False: await self.emit_web_parameter( - host=parsed_url.hostname, + host=self._event_host(event), param_type="GETPARAM", name=parameter_name, original_value=original_value, diff --git 
a/bbot/modules/templates/subdomain_enum.py b/bbot/modules/templates/subdomain_enum.py index 3bdcdff07b..ceb341118b 100644 --- a/bbot/modules/templates/subdomain_enum.py +++ b/bbot/modules/templates/subdomain_enum.py @@ -171,8 +171,10 @@ async def filter_event(self, event): # reject if it's a cloud resource and not in our target (unless it's a seed event) if is_cloud and not self.scan.in_target(event) and "seed" not in event.tags: return False, "Event is a cloud resource and not a direct target" + # don't reject targets — if the user explicitly targeted a domain, always process it + is_target = event in self.scan.target.whitelist # optionally reject events with wildcards / errors - if self.reject_wildcards: + if self.reject_wildcards and not is_target: if any(t in event.tags for t in ("a-error", "aaaa-error")): return False, "Event has a DNS resolution error" if self.reject_wildcards == "strict": diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index a0b5a2c03c..5f62417a36 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -1,6 +1,8 @@ +import re from datetime import datetime from urllib.parse import parse_qs, urlparse, urlunparse +from bbot.core.helpers.misc import get_file_extension from bbot.core.helpers.validators import clean_url from bbot.modules.templates.subdomain_enum import subdomain_enum @@ -8,29 +10,40 @@ class wayback(subdomain_enum): flags = ["safe", "passive", "subdomain-enum"] watched_events = ["DNS_NAME", "URL"] - produced_events = ["URL_UNVERIFIED", "DNS_NAME", "WEB_PARAMETER"] + produced_events = ["URL_UNVERIFIED", "DNS_NAME", "WEB_PARAMETER", "HTTP_RESPONSE", "FINDING"] meta = { - "description": "Query archive.org's API for subdomains", + "description": "Query archive.org's Wayback Machine for subdomains, URLs, parameters, and archived content", "created_date": "2022-04-01", "author": "@liquidsec", } - options = {"urls": False, "garbage_threshold": 10, "parameters": False} + options = {"urls": False, "garbage_threshold": 10, "parameters": False, "archive": False} options_desc = { "urls": "emit URLs in addition to DNS_NAMEs", "garbage_threshold": "Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data)", "parameters": "emit WEB_PARAMETER events for query parameters discovered in archived URLs (requires urls=true)", + "archive": "fetch archived versions of dead URLs from the Wayback Machine and emit HTTP_RESPONSE events (requires urls=true)", } in_scope_only = True base_url = "http://web.archive.org" - url_blacklist = ["_Incapsula_Resource"] + url_blacklist = ["_Incapsula_Resource", "/cdn-cgi/"] + + interesting_extensions = frozenset({"zip", "sql", "bak", "env", "config"}) + interesting_compound_extensions = frozenset({"tar.gz", "tar.bz2"}) + + def _is_interesting_file(self, url): + ext = get_file_extension(url) + if ext and ext.lower() in self.interesting_extensions: + return True + lower_url = url.lower() + return any(lower_url.endswith(f".{ce}") for ce in self.interesting_compound_extensions) async def setup(self): self.urls = self.config.get("urls", False) self.parameters = self.config.get("parameters", False) if self.parameters: if not self.urls: - self.warning("parameters option requires urls to be enabled") + self.hugewarning("parameters option requires urls to be enabled. 
Please add modules.wayback.urls=True") return False consumers = [m for m, mod in self.scan.modules.items() if "WEB_PARAMETER" in mod.watched_events] if not consumers: @@ -40,37 +53,23 @@ async def setup(self): self.hugeinfo( f"Parameter extraction enabled because the following modules consume WEB_PARAMETER events: [{', '.join(consumers)}]" ) + self.archive = self.config.get("archive", False) + if self.archive and not self.urls: + self.hugewarning("archive option requires urls to be enabled. Please add modules.wayback.urls=True") + return False self.garbage_threshold = self.config.get("garbage_threshold", 10) self._parameter_cache = {} + self._archive_cache = {} return await super().setup() async def handle_event(self, event): if event.type == "URL": - # use clean_url (always strips query) to match cache key regardless of url_querystring_remove setting - cached = self._parameter_cache.pop(clean_url(event.data).geturl(), None) - if cached is not None: - flat_params, base_url = cached - for param_name, original_value in flat_params.items(): - data = { - "host": str(event.host), - "type": "GETPARAM", - "name": param_name, - "original_value": original_value, - "url": base_url, - "description": f"HTTP Extracted Parameter [{param_name}] (wayback)", - "additional_params": {k: v for k, v in flat_params.items() if k != param_name}, - } - await self.emit_event( - data, - "WEB_PARAMETER", - event, - tags=["from-wayback"], - context=f"{{module}} found query parameter [{param_name}] in archived URL and emitted {{event.type}}", - ) + await self._handle_url_event(event) return query = self.make_query(event) - for result, event_type in await self.query(query): + results, interesting_files = await self.query(query) + for result, event_type in results: tags = ["from-wayback"] if event_type == "URL_UNVERIFIED" else [] await self.emit_event( result, @@ -81,12 +80,115 @@ async def handle_event(self, event): context=f'{{module}} queried archive.org for "{query}" and found {{event.type}}: {{event.pretty_string}}', ) - async def query(self, query): - results = set() + if interesting_files: + await self._check_interesting_files(interesting_files, event) + + # pair unpaired archive cache entries with their parent DNS_NAME event + if self.archive: + paired = 0 + for url_str in list(self._archive_cache): + if isinstance(self._archive_cache[url_str], str): + self._archive_cache[url_str] = (self._archive_cache[url_str], event) + paired += 1 + if paired: + self.debug(f"Paired {paired} archive cache entries with parent event {event.data}") + + async def _handle_url_event(self, event): + """Process a URL event: evict live URLs from archive cache and emit cached parameters.""" + if self.archive: + status_code = 0 + for tag in event.tags: + if tag.startswith("status-"): + try: + status_code = int(tag.split("-", 1)[1]) + except ValueError: + pass + break + # only 2xx counts as live — 3xx (e.g. 
http→https 301 to a 404) doesn't confirm the page exists + if 200 <= status_code < 300: + cleaned = clean_url(event.data).geturl() + if self._archive_cache.pop(cleaned, None) is not None: + self.verbose(f"URL is live (status {status_code}), removed from archive cache: {cleaned}") + + cached = self._parameter_cache.pop(clean_url(event.data).geturl(), None) + if cached is not None: + flat_params, base_url = cached + for param_name, original_value in flat_params.items(): + data = { + "host": str(event.host), + "type": "GETPARAM", + "name": param_name, + "original_value": original_value, + "url": base_url, + "description": f"HTTP Extracted Parameter [{param_name}] (wayback)", + "additional_params": {k: v for k, v in flat_params.items() if k != param_name}, + } + self.verbose(f"Emitting WEB_PARAMETER [{param_name}] from archived URL {base_url}") + await self.emit_event( + data, + "WEB_PARAMETER", + event, + tags=["from-wayback"], + context=f"{{module}} found query parameter [{param_name}] in archived URL and emitted {{event.type}}", + ) + + async def _check_interesting_files(self, interesting_files, event): + """HEAD-check interesting archived files and emit FINDINGs for those that exist.""" + self.verbose(f"Checking {len(interesting_files)} interesting archived files") + + # build URL list and mapping back to metadata + url_metadata = {} + for cleaned_url, raw_url in interesting_files.items(): + archive_url = f"{self.base_url}/web/{raw_url}" + url_metadata[archive_url] = (cleaned_url, raw_url) + + gen = self.helpers.request_batch( + list(url_metadata), method="HEAD", timeout=self.http_timeout + 30, follow_redirects=True + ) + async for archive_url, r in gen: + cleaned_url, raw_url = url_metadata[archive_url] + + if not r or r.status_code != 200: + status = getattr(r, "status_code", "no response") if r else "no response" + self.debug(f"Interesting file HEAD check failed for {raw_url}: status={status}") + continue + # guard against soft 404s (archive.org returns text/html for missing pages) + content_type = r.headers.get("content-type", "") + if "text/html" in content_type: + self.debug(f"Interesting file skipped (soft 404): {raw_url}") + continue + + ext = get_file_extension(cleaned_url) + desc = f"Interesting archived file found (.{ext}): {raw_url}" + content_length = r.headers.get("content-length", "") + if content_length: + try: + size = int(content_length) + if size > 1024 * 1024: + desc += f" ({size / (1024 * 1024):.1f} MB)" + elif size > 1024: + desc += f" ({size / 1024:.1f} KB)" + else: + desc += f" ({size} bytes)" + except ValueError: + pass + + self.verbose(f"Interesting archived file confirmed: {raw_url}") + parsed = urlparse(raw_url) + await self.emit_event( + {"description": desc, "url": str(r.url), "host": str(parsed.hostname or "")}, + "FINDING", + event, + tags=["from-wayback", "archived", "interesting-file"], + context=f"{{module}} found interesting archived file: {raw_url}", + ) + + async def _fetch_cdx(self, query): + """Fetch URLs from the CDX API with retries. 
Returns the URL list or None on failure.""" waybackurl = f"{self.base_url}/cdx/search/cdx?url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" r = None for i in range(3): - r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 10) + r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 30) if r: break if i < 2: @@ -94,71 +196,229 @@ async def query(self, query): await self.helpers.sleep(2**i) if not r: self.warning(f'Error connecting to archive.org for query "{query}"') - return results + return None try: j = r.json() assert type(j) == list except Exception: self.warning(f'Error JSON-decoding archive.org response for query "{query}"') - return results + return None + return [result[0] for result in j[1:] if result] - urls = [] - for result in j[1:]: + def _pre_process_urls(self, urls): + """Extract parameters, archive URLs, and interesting files from raw CDX URLs before collapse.""" + raw_url_params = {} + archive_urls = {} + interesting_files = {} + + for url in urls: try: - url = result[0] - urls.append(url) - except KeyError: + parsed = urlparse(url) + if any(bl in url for bl in self.url_blacklist): + continue + if not (parsed.hostname and self.scan.in_scope(parsed.hostname)): + continue + + cleaned_str = clean_url(url).geturl() + + if self.archive and cleaned_str not in archive_urls: + archive_urls[cleaned_str] = url + + if self.urls and self._is_interesting_file(url) and cleaned_str not in interesting_files: + interesting_files[cleaned_str] = url + + if self.parameters and parsed.query: + params = parse_qs(parsed.query) + flat_params = {k: v[0] for k, v in params.items()} + if flat_params: + if cleaned_str not in raw_url_params: + raw_url_params[cleaned_str] = flat_params + else: + raw_url_params[cleaned_str].update(flat_params) + except Exception: continue + if archive_urls or interesting_files or raw_url_params: + self.debug( + f"Pre-processed {len(urls):,} URLs: {len(archive_urls):,} archive candidates, " + f"{len(interesting_files):,} interesting files, {len(raw_url_params):,} URLs with parameters" + ) + + return raw_url_params, archive_urls, interesting_files + + async def query(self, query): + results = set() + + urls = await self._fetch_cdx(query) + if urls is None: + return results, {} + self.verbose(f"Found {len(urls):,} URLs for {query}") - # pre-extract parameters from raw URLs before collapse strips query strings - raw_url_params = {} - if self.parameters: - for url in urls: - try: - parsed = urlparse(url) - if any(bl in url for bl in self.url_blacklist): - continue - if parsed.query and parsed.hostname and self.scan.in_scope(parsed.hostname): - params = parse_qs(parsed.query) - flat_params = {k: v[0] for k, v in params.items()} - if flat_params: - # key by cleaned URL (always strips query) to match what collapse_urls produces - cleaned = clean_url(url) - cleaned_str = cleaned.geturl() - if cleaned_str not in raw_url_params: - raw_url_params[cleaned_str] = flat_params - else: - raw_url_params[cleaned_str].update(flat_params) - except Exception: - continue + # filter blacklisted URLs before any further processing + urls = [url for url in urls if not any(bl in url for bl in self.url_blacklist)] + + # pre-extract metadata from raw URLs before collapse strips query strings + raw_url_params, archive_urls, interesting_files = {}, {}, {} + if self.parameters or self.archive or self.urls: + raw_url_params, archive_urls, interesting_files = self._pre_process_urls(urls) dns_names = set() collapsed_urls = 0 
start_time = datetime.now() - # we consolidate URLs to cut down on garbage data - # this is CPU-intensive, so we do it in its own core. + # consolidate URLs to cut down on garbage data (CPU-intensive, runs in separate process) parsed_urls = await self.helpers.run_in_executor_mp( self.helpers.validators.collapse_urls, urls, threshold=self.garbage_threshold, ) - for parsed_url in parsed_urls: - collapsed_urls += 1 - if not self.urls: - dns_name = parsed_url.hostname - h = hash(dns_name) - if h not in dns_names: - dns_names.add(h) - results.add((dns_name, "DNS_NAME")) - else: + if self.urls: + # deduplicate http/https variants — drop http when https also exists + url_dedup = {} + for parsed_url in parsed_urls: + collapsed_urls += 1 + https_key = parsed_url._replace(scheme="https").geturl() + if https_key not in url_dedup or parsed_url.scheme == "https": + url_dedup[https_key] = parsed_url + for parsed_url in url_dedup.values(): url_str = parsed_url.geturl() results.add((url_str, "URL_UNVERIFIED")) if self.parameters and url_str in raw_url_params: base_url = urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "")) self._parameter_cache[url_str] = (raw_url_params[url_str], base_url) - end_time = datetime.now() - duration = self.helpers.human_timedelta(end_time - start_time) + if self.archive and url_str in archive_urls: + self._archive_cache[url_str] = archive_urls[url_str] + else: + for parsed_url in parsed_urls: + collapsed_urls += 1 + dns_name = parsed_url.hostname + h = hash(dns_name) + if h not in dns_names: + dns_names.add(h) + results.add((dns_name, "DNS_NAME")) + + duration = self.helpers.human_timedelta(datetime.now() - start_time) self.verbose(f"Collapsed {len(urls):,} -> {collapsed_urls:,} URLs in {duration}") - return results + return results, interesting_files + + _wayback_head_re = re.compile( + r'
content
' + stripped = w._strip_wayback_wrapper(body) + assert "archive.org" not in stripped + assert "content" in stripped + + # test stripping of relative wayback URL rewrites (href) + body = 'link' + stripped = w._strip_wayback_wrapper(body) + assert "/web/19971024185506/" not in stripped + assert "http://www.example.com/PDF%20files/data.pdf" in stripped + + # test stripping of relative wayback URL rewrites with modifier suffix (im_ for images) + body = '' + stripped = w._strip_wayback_wrapper(body) + assert "/web/19971024185506im_/" not in stripped + assert "http://www.example.com/images/logo.gif" in stripped + + # test stripping of relative wayback URL rewrites with js_ suffix + body = '' + stripped = w._strip_wayback_wrapper(body) + assert "/web/20250529193232js_/" not in stripped + assert "https://www.example.com/script.js" in stripped From 251d48d095d87de12a49f8131a6328414d3fa377 Mon Sep 17 00:00:00 2001 From: liquidsec Date: Wed, 18 Feb 2026 13:15:12 -0500 Subject: [PATCH 04/28] allow from-wayback tag to propagate --- bbot/core/event/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py index 643aec3b92..3232e6d55c 100644 --- a/bbot/core/event/base.py +++ b/bbot/core/event/base.py @@ -620,7 +620,7 @@ def parent(self, parent): self.web_spider_distance = getattr(parent, "web_spider_distance", 0) event_has_url = getattr(self, "parsed_url", None) is not None for t in parent.tags: - if t in ("affiliate",): + if t in ("affiliate", "from-wayback"): self.add_tag(t) elif t.startswith("mutation-"): self.add_tag(t) From ca211fc9006b64996c5e6adb9afd2c44efc7bea0 Mon Sep 17 00:00:00 2001 From: liquidsec Date: Wed, 18 Feb 2026 14:24:37 -0500 Subject: [PATCH 05/28] update docs for wayback --- docs/modules/wayback.md | 141 +++++++++++++++++++++++++++++ docs/scanning/configuration.md | 2 + docs/scanning/presets_list.md | 158 +++++++++++++++++++++++++-------- mkdocs.yml | 1 + 4 files changed, 267 insertions(+), 35 deletions(-) create mode 100644 docs/modules/wayback.md diff --git a/docs/modules/wayback.md b/docs/modules/wayback.md new file mode 100644 index 0000000000..dab08340db --- /dev/null +++ b/docs/modules/wayback.md @@ -0,0 +1,141 @@ +# Wayback + +## Overview + +The Wayback module queries [archive.org's Wayback Machine](https://web.archive.org/) CDX API to discover subdomains, URLs, web parameters, and archived content for your targets. By default it operates as a passive subdomain enumeration source, but with its extended features enabled it becomes a powerful tool for discovering dead URLs, extracting parameters for fuzzing, and retrieving archived versions of pages that no longer exist. + +* Watches: **DNS_NAME**, **URL** +* Produces: **URL_UNVERIFIED**, **DNS_NAME**, **WEB_PARAMETER**, **HTTP_RESPONSE**, **FINDING** +* Flags: `passive`, `subdomain-enum`, `safe` + +## Default Behavior + +By default, wayback only emits **DNS_NAME** events (subdomains) extracted from archived URLs. This is the behavior you get when wayback is included via the `subdomain-enum` preset. No URLs, parameters, or archived content are fetched. + +To unlock the more advanced features, you need to enable them via configuration options or use one of the wayback presets. 
+ +## Configuration Options + +| Option | Type | Default | Description | +|---------------------|------|---------|-------------------------------------------------------------------------------------------------------| +| `urls` | bool | `False` | Emit `URL_UNVERIFIED` events in addition to `DNS_NAME`s. Required for `parameters` and `archive`. | +| `parameters` | bool | `False` | Extract `WEB_PARAMETER` events from query strings in archived URLs. Requires `urls=True`. | +| `archive` | bool | `False` | Fetch archived versions of dead URLs and emit `HTTP_RESPONSE` events. Requires `urls=True`. | +| `garbage_threshold` | int | `10` | Deduplicate similar URLs if they appear in groups of this size or larger. Lower = less noise. | + +## Features + +### URL Discovery (`urls: True`) + +When `urls` is enabled, wayback emits `URL_UNVERIFIED` events for every unique URL found in the Wayback Machine's index. These are tagged with `from-wayback` and sent through BBOT's normal URL verification pipeline (httpx). + +Before emission, URLs go through several cleanup steps: + +- **URL collapsing** - Groups of similar URLs (e.g. pagination, search results) are deduplicated based on the `garbage_threshold` setting +- **HTTP/HTTPS deduplication** - When both `http://` and `https://` variants exist, only the HTTPS version is kept +- **Blacklist filtering** - URLs containing known CDN/WAF paths (e.g. `_Incapsula_Resource`, `/cdn-cgi/`) are filtered out + +### Parameter Extraction (`parameters: True`) + +When `parameters` is enabled (requires `urls: True`), wayback extracts query string parameters from archived URLs and emits them as `WEB_PARAMETER` events. This is useful for discovering GET parameters that can be fed into fuzzing modules like lightfuzz. + +Parameters are cached and only emitted after the corresponding URL has been verified as live by httpx. This prevents emitting parameters for URLs that no longer exist. + +!!! note + Parameter extraction requires at least one module that consumes `WEB_PARAMETER` events to be active (e.g. `lightfuzz`, `hunt`, `paramminer_getparams`). If no such module is present, parameter extraction is automatically disabled with a warning. + +### Archive Retrieval (`archive: True`) + +When `archive` is enabled (requires `urls: True`), wayback fetches the actual archived content of URLs from the Wayback Machine and emits them as `HTTP_RESPONSE` events. This is particularly useful for: + +- **Finding secrets in dead pages** - Archived versions may contain API keys, credentials, or other sensitive data that modules like `badsecrets` can detect +- **Discovering hidden functionality** - Pages that have been removed may reveal application structure or endpoints + +Archive retrieval runs during the module's `finish()` phase, after all URLs have been discovered and verified. URLs that are confirmed live (2xx status) are automatically removed from the archive queue, so only dead URLs are fetched from the archive. + +The archived content goes through extensive cleanup to remove Wayback Machine artifacts: + +- Wayback toolbar/header/footer HTML is stripped +- Rewritten URLs (e.g. `http://web.archive.org/web/20250101/http://example.com/page`) are restored to originals +- Wayback-injected headers (`x-archive-*`, `set-cookie`) are removed +- The event's host, port, and URL are set to the original target, not `web.archive.org` + +Archived HTTP_RESPONSE events are tagged with `from-wayback` and `archived`. + +!!! warning + Static file extensions (images, CSS, JS, etc.) 
are automatically skipped during archive retrieval to avoid unnecessary traffic. + +### Interesting File Detection + +When `urls` is enabled, wayback also checks for potentially interesting archived files by looking for URLs with sensitive extensions: `.zip`, `.sql`, `.bak`, `.env`, `.config`, `.tar.gz`, `.tar.bz2`. + +When found, these are verified with a HEAD request to archive.org. If the archived file exists and isn't a soft-404, a `FINDING` event is emitted with details about the file (including size if available). These findings are tagged with `from-wayback`, `archived`, and `interesting-file`. + +## Presets + +Wayback comes with two dedicated presets, and is also integrated into several other presets: + +### `-p wayback` + +Basic URL discovery mode. Includes `subdomain-enum` and enables `urls: True`. Good for general recon when you want to discover historical URLs alongside subdomains. + +```bash +bbot -p wayback -t evilcorp.com +``` + +### `-p wayback-intense` + +Full-featured mode with URL discovery, parameter extraction, and archive retrieval. Also includes `badsecrets` to scan archived content for exposed secrets. + +```bash +bbot -p wayback-intense -t evilcorp.com +``` + +### Integration with other presets + +Wayback's extended features are also enabled in several other presets: + +| Preset | Wayback Config | +|-----------------------|-----------------------------------------| +| `kitchen-sink` | `urls`, `parameters`, `archive` | +| `dirbust-heavy` | `urls` | +| `nuclei-intense` | `urls` | +| `lightfuzz-heavy` | `urls`, `parameters` | +| `lightfuzz-superheavy`| `urls`, `parameters`, `archive` | + +## Example Commands + +```bash +# Basic subdomain enumeration (default behavior, no URL emission) +bbot -p subdomain-enum -t evilcorp.com +``` + +```bash +# URL discovery via wayback preset +bbot -p wayback -t evilcorp.com +``` + +```bash +# Full wayback integration with archived content and parameter extraction +bbot -p wayback-intense -t evilcorp.com +``` + +```bash +# Enable wayback URLs alongside a nuclei scan +bbot -p nuclei -m wayback -c modules.wayback.urls=True --allow-deadly -t evilcorp.com +``` + +```bash +# Pair with lightfuzz for parameter fuzzing using archived parameters +bbot -p lightfuzz-heavy spider -t evilcorp.com --allow-deadly +``` + +```bash +# Enable wayback features via command-line config +bbot -p subdomain-enum -c modules.wayback.urls=True modules.wayback.parameters=True modules.wayback.archive=True -t evilcorp.com +``` + +```bash +# Adjust garbage threshold for cleaner output (more aggressive deduplication) +bbot -p wayback -c modules.wayback.garbage_threshold=5 -t evilcorp.com +``` diff --git a/docs/scanning/configuration.md b/docs/scanning/configuration.md index bbc5aa7a22..a0bdf16401 100644 --- a/docs/scanning/configuration.md +++ b/docs/scanning/configuration.md @@ -600,7 +600,9 @@ In addition to the stated options for each module, the following universal optio | modules.trufflehog.version | str | trufflehog version | 3.90.8 | | modules.urlscan.urls | bool | Emit URLs in addition to DNS_NAMEs | False | | modules.virustotal.api_key | str | VirusTotal API Key | | +| modules.wayback.archive | bool | fetch archived versions of dead URLs from the Wayback Machine and emit HTTP_RESPONSE events (requires urls=true) | False | | modules.wayback.garbage_threshold | int | Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data) | 10 | +| modules.wayback.parameters | bool | emit WEB_PARAMETER events for query parameters 
discovered in archived URLs (requires urls=true) | False | | modules.wayback.urls | bool | emit URLs in addition to DNS_NAMEs | False | | modules.asset_inventory.output_file | str | Set a custom output file | | | modules.asset_inventory.recheck | bool | When use_previous=True, don't retain past details like open ports or findings. Instead, allow them to be rediscovered by the new scan | False | diff --git a/docs/scanning/presets_list.md b/docs/scanning/presets_list.md index 98a4d126ec..1b824df6af 100644 --- a/docs/scanning/presets_list.md +++ b/docs/scanning/presets_list.md @@ -282,7 +282,7 @@ Everything everywhere all at once ??? note "`kitchen-sink.yml`" ```yaml title="~/.bbot/presets/kitchen-sink.yml" description: Everything everywhere all at once - + include: - subdomain-enum - cloud-enum @@ -294,6 +294,15 @@ Everything everywhere all at once - dirbust-light - web-screenshots - baddns-heavy + + config: + modules: + baddns: + enable_references: True + wayback: + urls: True + parameters: True + archive: True ``` @@ -340,10 +349,11 @@ Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force paramet flags: - web-paramminer - + modules: - robots - + - wayback + config: modules: lightfuzz: @@ -351,6 +361,9 @@ Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force paramet disable_post: False try_post_as_get: True try_get_as_post: True + wayback: + urls: True + parameters: True ``` Category: web @@ -400,7 +413,7 @@ Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer s include: - lightfuzz-heavy - + config: url_querystring_collapse: False # in cases where the same parameter is observed multiple times, fuzz them individually instead of collapsing them into a single parameter modules: @@ -410,6 +423,10 @@ Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer s avoid_wafs: False excavate: speculate_params: True # speculate potential parameters extracted from JSON/XML web responses + wayback: + urls: True + parameters: True + archive: True ``` Category: web @@ -802,7 +819,78 @@ Take screenshots of webpages -Modules: [0]("") +Modules: [3]("`gowitness`, `httpx`, `social`") + +## **web-thorough** + +Aggressive web scan + +??? note "`web-thorough.yml`" + ```yaml title="~/.bbot/presets/web-thorough.yml" + description: Aggressive web scan + + include: + # include the web-basic preset + - web-basic + + flags: + - web-thorough + ``` + + + +Modules: [32]("`ajaxpro`, `aspnet_bin_exposure`, `azure_realm`, `baddns`, `badsecrets`, `bucket_amazon`, `bucket_digitalocean`, `bucket_firebase`, `bucket_google`, `bucket_microsoft`, `bypass403`, `dotnetnuke`, `ffuf_shortnames`, `filedownload`, `generic_ssrf`, `git`, `graphql_introspection`, `host_header`, `httpx`, `hunt`, `iis_shortnames`, `lightfuzz`, `ntlm`, `oauth`, `reflected_parameters`, `retirejs`, `robots`, `securitytxt`, `smuggler`, `sslcert`, `telerik`, `url_manipulation`") + +## **wayback** + +Discover URLs and interesting archived files via the Wayback Machine + +??? 
note "`wayback.yml`" + ```yaml title="~/.bbot/presets/wayback.yml" + description: Discover URLs and interesting archived files via the Wayback Machine + + include: + - subdomain-enum + + modules: + - wayback + + config: + modules: + wayback: + urls: True + ``` + + + +Modules: [52]("`anubisdb`, `asn`, `azure_realm`, `azure_tenant`, `baddns_direct`, `baddns_zone`, `bevigil`, `bufferoverrun`, `builtwith`, `c99`, `censys_dns`, `certspotter`, `chaos`, `crt`, `crt_db`, `digitorus`, `dnsbimi`, `dnsbrute`, `dnsbrute_mutations`, `dnscaa`, `dnscommonsrv`, `dnsdumpster`, `dnstlsrpt`, `fullhunt`, `github_codesearch`, `github_org`, `hackertarget`, `httpx`, `hunterio`, `ipneighbor`, `leakix`, `myssl`, `oauth`, `otx`, `passivetotal`, `postman`, `postman_download`, `rapiddns`, `securitytrails`, `securitytxt`, `shodan_dns`, `shodan_idb`, `sitedossier`, `social`, `sslcert`, `subdomaincenter`, `subdomainradar`, `trickest`, `urlscan`, `virustotal`, `wayback`, `httpx`") + +## **wayback-heavy** + +Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection + +??? note "`wayback-heavy.yml`" + ```yaml title="~/.bbot/presets/wayback-heavy.yml" + description: Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection + + include: + - subdomain-enum + + modules: + - wayback + - badsecrets + + config: + modules: + wayback: + urls: True + parameters: True + archive: True + ``` + + + +Modules: [53]("`anubisdb`, `asn`, `azure_realm`, `azure_tenant`, `baddns_direct`, `baddns_zone`, `badsecrets`, `bevigil`, `bufferoverrun`, `builtwith`, `c99`, `censys_dns`, `certspotter`, `chaos`, `crt`, `crt_db`, `digitorus`, `dnsbimi`, `dnsbrute`, `dnsbrute_mutations`, `dnscaa`, `dnscommonsrv`, `dnsdumpster`, `dnstlsrpt`, `fullhunt`, `github_codesearch`, `github_org`, `hackertarget`, `httpx`, `hunterio`, `ipneighbor`, `leakix`, `myssl`, `oauth`, `otx`, `passivetotal`, `postman`, `postman_download`, `rapiddns`, `securitytrails`, `securitytxt`, `shodan_dns`, `shodan_idb`, `sitedossier`, `social`, `sslcert`, `subdomaincenter`, `subdomainradar`, `trickest`, `urlscan`, `virustotal`, `wayback`, `httpx`") ## Table of Default Presets @@ -810,34 +898,34 @@ Modules: [0]("") Here is a the same data, but in a table: -| Preset | Category | Description | # Modules | Modules | -|-------------------|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|---------------------------------------------------------------------------------------------| -| baddns | | Check for subdomain takeovers and other DNS issues. | 1 | baddns | -| baddns-heavy | | Run all baddns modules and submodules. | 3 | baddns, baddns_direct, baddns_zone | -| cloud-enum | | Enumerate cloud resources such as storage buckets, etc. | 0 | | -| code-enum | | Enumerate Git repositories, Docker images, etc. 
| 0 | | -| dirbust-heavy | web | Recursive web directory brute-force (aggressive) | 3 | ffuf, httpx, wayback | -| dirbust-light | web | Basic web directory brute-force (surface-level directories only) | 1 | ffuf | -| dotnet-audit | web | Comprehensive scan for all IIS/.NET specific modules and module settings | 8 | ajaxpro, aspnet_bin_exposure, badsecrets, dotnetnuke, ffuf, ffuf_shortnames, httpx, telerik | -| email-enum | | Enumerate email addresses from APIs, web crawling, etc. | 0 | | -| fast | | Scan only the provided targets as fast as possible - no extra discovery | 0 | | -| iis-shortnames | web | Recursively enumerate IIS shortnames | 0 | | -| kitchen-sink | | Everything everywhere all at once | 7 | baddns, baddns_direct, baddns_zone, ffuf, httpx, hunt, reflected_parameters | -| lightfuzz | web | Default fuzzing: all 9 submodules (cmdi, crypto, path, serial, sqli, ssti, xss, esi, ssrf) plus companion modules (badsecrets, hunt, reflected_parameters). POST fuzzing disabled but try_post_as_get enabled, so POST params are retested as GET. Skips confirmed WAFs. | 6 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters | -| lightfuzz-heavy | web | Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force parameter discovery (headers, GET params, cookies), POST request fuzzing enabled, try_get_as_post enabled (GET params retested as POST), and robots.txt parsing. Still skips confirmed WAFs. | 7 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters, robots | -| lightfuzz-light | web | Minimal fuzzing: only path traversal, SQLi, and XSS submodules. No POST requests. No companion modules. Safest option for running alongside larger scans with minimal overhead. | 3 | httpx, lightfuzz, portfilter | -| lightfuzz-max | web | Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer skipped, each unique parameter-value pair is fuzzed individually (no collapsing), common headers like X-Forwarded-For are fuzzed even if not observed, and potential parameters are speculated from JSON/XML response bodies. Significantly increases scan time. | 7 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters, robots | -| lightfuzz-xss | web | XSS-only: enables only the xss submodule with paramminer_getparams and reflected_parameters. POST disabled, no query string collapsing. Example of a focused single-submodule preset. | 5 | httpx, lightfuzz, paramminer_getparams, portfilter, reflected_parameters | -| nuclei | nuclei | Run nuclei scans against all discovered targets | 3 | httpx, nuclei, portfilter | -| nuclei-budget | nuclei | Run nuclei scans against all discovered targets, using budget mode to look for low hanging fruit with greatly reduced number of requests | 3 | httpx, nuclei, portfilter | -| nuclei-heavy | nuclei | Run nuclei scans against all discovered targets, allowing for spidering, against ALL URLs, and with additional discovery modules. 
| 6 | httpx, nuclei, portfilter, robots, urlscan, wayback | -| nuclei-technology | nuclei | Run nuclei scans against all discovered targets, running templates which match discovered technologies | 3 | httpx, nuclei, portfilter | -| paramminer | web | Discover new web parameters via brute-force, and analyze them with additional modules | 3 | httpx, hunt, reflected_parameters | -| spider | | Recursive web spider | 1 | httpx | -| spider-heavy | | Recursive web spider with more aggressive settings | 1 | httpx | -| subdomain-enum | | Enumerate subdomains via APIs, brute-force | 0 | | -| tech-detect | | Detect technologies via Nuclei, and FingerprintX | 2 | fingerprintx, nuclei | -| web | | Quick web scan | 0 | | -| web-heavy | | Aggressive web scan | 0 | | -| web-screenshots | | Take screenshots of webpages | 0 | | +| Preset | Category | Description | # Modules | Modules | +|----------------------|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| baddns-heavy | | Run all baddns modules and submodules. | 4 | baddns, baddns_direct, baddns_zone, httpx | +| cloud-enum | | Enumerate cloud resources such as storage buckets, etc. | 58 | anubisdb, asn, azure_realm, azure_tenant, baddns, baddns_direct, baddns_zone, bevigil, bucket_amazon, bucket_digitalocean, bucket_file_enum, bucket_firebase, bucket_google, bucket_microsoft, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback | +| code-enum | | Enumerate Git repositories, Docker images, etc. 
| 20 | apkpure, code_repository, docker_pull, dockerhub, git, git_clone, gitdumper, github_codesearch, github_org, github_usersearch, github_workflows, gitlab_com, gitlab_onprem, google_playstore, httpx, jadx, postman, postman_download, social, trufflehog | +| dirbust-heavy | web | Recursive web directory brute-force (aggressive) | 5 | ffuf, ffuf_shortnames, httpx, iis_shortnames, wayback | +| dirbust-light | web | Basic web directory brute-force (surface-level directories only) | 4 | ffuf, ffuf_shortnames, httpx, iis_shortnames | +| dotnet-audit | web | Comprehensive scan for all IIS/.NET specific modules and module settings | 9 | ajaxpro, aspnet_bin_exposure, badsecrets, dotnetnuke, ffuf, ffuf_shortnames, httpx, iis_shortnames, telerik | +| email-enum | | Enumerate email addresses from APIs, web crawling, etc. | 8 | dehashed, dnscaa, dnstlsrpt, emailformat, hunterio, pgp, skymem, sslcert | +| fast | | Scan only the provided targets as fast as possible - no extra discovery | 0 | | +| iis-shortnames | web | Recursively enumerate IIS shortnames | 3 | ffuf_shortnames, httpx, iis_shortnames | +| kitchen-sink | | Everything everywhere all at once | 90 | anubisdb, apkpure, asn, azure_realm, azure_tenant, baddns, baddns_direct, baddns_zone, badsecrets, bevigil, bucket_amazon, bucket_digitalocean, bucket_file_enum, bucket_firebase, bucket_google, bucket_microsoft, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, code_repository, crt, crt_db, dehashed, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, docker_pull, dockerhub, emailformat, ffuf, ffuf_shortnames, filedownload, fullhunt, git, git_clone, gitdumper, github_codesearch, github_org, github_usersearch, github_workflows, gitlab_com, gitlab_onprem, google_playstore, gowitness, graphql_introspection, hackertarget, httpx, hunt, hunterio, iis_shortnames, ipneighbor, jadx, leakix, myssl, ntlm, oauth, otx, paramminer_cookies, paramminer_getparams, paramminer_headers, passivetotal, pgp, postman, postman_download, rapiddns, reflected_parameters, robots, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, skymem, social, sslcert, subdomaincenter, subdomainradar, trickest, trufflehog, urlscan, virustotal, wayback | +| lightfuzz-heavy | web | Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force parameter discovery (headers, GET params, cookies), POST request fuzzing enabled, try_get_as_post enabled (GET params retested as POST), and robots.txt parsing. Still skips confirmed WAFs. | 10 | badsecrets, httpx, hunt, lightfuzz, paramminer_cookies, paramminer_getparams, paramminer_headers, portfilter, reflected_parameters, robots | +| lightfuzz-light | web | Minimal fuzzing: only path traversal, SQLi, and XSS submodules. No POST requests. No companion modules. Safest option for running alongside larger scans with minimal overhead. | 3 | httpx, lightfuzz, portfilter | +| lightfuzz-max | web | Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer skipped, each unique parameter-value pair is fuzzed individually (no collapsing), common headers like X-Forwarded-For are fuzzed even if not observed, and potential parameters are speculated from JSON/XML response bodies. Significantly increases scan time. 
| 10 | badsecrets, httpx, hunt, lightfuzz, paramminer_cookies, paramminer_getparams, paramminer_headers, portfilter, reflected_parameters, robots | +| lightfuzz-xss | web | XSS-only: enables only the xss submodule with paramminer_getparams and reflected_parameters. POST disabled, no query string collapsing. Example of a focused single-submodule preset. | 5 | httpx, lightfuzz, paramminer_getparams, portfilter, reflected_parameters | +| nuclei | nuclei | Run nuclei scans against all discovered targets | 3 | httpx, nuclei, portfilter | +| nuclei-budget | nuclei | Run nuclei scans against all discovered targets, using budget mode to look for low hanging fruit with greatly reduced number of requests | 3 | httpx, nuclei, portfilter | +| nuclei-heavy | nuclei | Run nuclei scans against all discovered targets, allowing for spidering, against ALL URLs, and with additional discovery modules. | 6 | httpx, nuclei, portfilter, robots, urlscan, wayback | +| nuclei-technology | nuclei | Run nuclei scans against all discovered targets, running templates which match discovered technologies | 3 | httpx, nuclei, portfilter | +| paramminer | web | Discover new web parameters via brute-force, and analyze them with additional modules | 6 | httpx, hunt, paramminer_cookies, paramminer_getparams, paramminer_headers, reflected_parameters | +| spider | | Recursive web spider | 1 | httpx | +| spider-heavy | | Recursive web spider with more aggressive settings | 1 | httpx | +| subdomain-enum | | Enumerate subdomains via APIs, brute-force | 51 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback | +| tech-detect | | Detect technologies via Nuclei, and FingerprintX | 3 | fingerprintx, httpx, nuclei | +| web-basic | | Quick web scan | 18 | azure_realm, baddns, badsecrets, bucket_amazon, bucket_firebase, bucket_google, bucket_microsoft, ffuf_shortnames, filedownload, git, graphql_introspection, httpx, iis_shortnames, ntlm, oauth, robots, securitytxt, sslcert | +| web-screenshots | | Take screenshots of webpages | 3 | gowitness, httpx, social | +| web-thorough | | Aggressive web scan | 32 | ajaxpro, aspnet_bin_exposure, azure_realm, baddns, badsecrets, bucket_amazon, bucket_digitalocean, bucket_firebase, bucket_google, bucket_microsoft, bypass403, dotnetnuke, ffuf_shortnames, filedownload, generic_ssrf, git, graphql_introspection, host_header, httpx, hunt, iis_shortnames, lightfuzz, ntlm, oauth, reflected_parameters, retirejs, robots, securitytxt, smuggler, sslcert, telerik, url_manipulation | +| wayback | | Discover URLs and interesting archived files via the Wayback Machine | 52 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, 
shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback | +| wayback-heavy | | Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection | 53 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, badsecrets, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback | diff --git a/mkdocs.yml b/mkdocs.yml index 498b0b4a6d..2355c67f5a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -32,6 +32,7 @@ nav: - Modules: - List of Modules: modules/list_of_modules.md - Nuclei: modules/nuclei.md + - Wayback: modules/wayback.md - Custom YARA Rules: modules/custom_yara_rules.md - Lightfuzz: modules/lightfuzz.md - Misc: From de97820c4fa25a5c152136e7b7817eff3bc935b3 Mon Sep 17 00:00:00 2001 From: liquidsec Date: Wed, 18 Feb 2026 14:25:05 -0500 Subject: [PATCH 06/28] add waf string 4xx filtering --- bbot/modules/http.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bbot/modules/http.py b/bbot/modules/http.py index 5edf0c77d7..5a17a216f2 100644 --- a/bbot/modules/http.py +++ b/bbot/modules/http.py @@ -42,6 +42,7 @@ async def setup(self): self.max_response_size = self.config.get("max_response_size", 5242880) self.store_responses = self.config.get("store_responses", False) self.client = self.helpers.blasthttp + self.waf_yara_rule = self.helpers.yara.compile_strings(self.helpers.get_waf_strings(), nocase=True) return True async def filter_event(self, event): @@ -275,6 +276,13 @@ async def handle_batch(self, *events): self.debug(f'Discarding 404 from "{url}"') continue + # discard 4xx responses that contain WAF strings + if 400 <= status_code < 500: + body = j.get("body", "") + if body and await self.helpers.yara.match(self.waf_yara_rule, body): + self.debug(f'Discarding WAF {status_code} from "{url}"') + continue + # main URL tags = [f"status-{status_code}"] From d448bca82ea785a5ce98bdd8f327b742b1de6e91 Mon Sep 17 00:00:00 2001 From: liquidsec Date: Wed, 18 Feb 2026 14:52:05 -0500 Subject: [PATCH 07/28] add Akamai WAF string to waf_strings helper --- bbot/core/helpers/misc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bbot/core/helpers/misc.py b/bbot/core/helpers/misc.py index eb2e322bac..bc7fcb2102 100644 --- a/bbot/core/helpers/misc.py +++ b/bbot/core/helpers/misc.py @@ -2691,6 +2691,7 @@ def get_waf_strings(): return [ "The requested URL was rejected", "This content has been blocked", + "You don't have permission to access ", ] From 9b817918765c35c96249c68905d97bd99fd97f2c Mon Sep 17 00:00:00 2001 From: liquidsec Date: Wed, 18 Feb 2026 15:21:51 -0500 Subject: [PATCH 08/28] add directory listing excavate submodule --- bbot/modules/internal/excavate.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py index 84824ee0e4..6033a42710 100644 --- a/bbot/modules/internal/excavate.py +++ b/bbot/modules/internal/excavate.py @@ -1002,6 +1002,34 @@ async def process(self, yara_results, event, yara_rule_settings, 
discovery_conte
         if yara_results:
             event.add_tag("login-page")

+    class DirectoryListingExtractor(ExcavateRule):
+        description = "Detects directory listing pages from web servers."
+        signatures = {
+            "Apache_Nginx": '"Index of /"',
+            "IIS": '"[To Parent Directory]"',
+            "Python_HTTP_Server": '"<h1>Directory listing for"',
+            "Generic_Directory_Listing": '"<title>Directory Listing"',
+        }
+        yara_rules = {}
+
+        def __init__(self, excavate):
+            super().__init__(excavate)
+            signature_component_list = []
+            for signature_name, signature in self.signatures.items():
+                signature_component_list.append(rf"${signature_name} = {signature}")
+            signature_component = " ".join(signature_component_list)
+            self.yara_rules["directory_listing"] = (
+                f'rule directory_listing {{meta: description = "contains a directory listing" strings: {signature_component} condition: any of them}}'
+            )
+
+        async def process(self, yara_results, event, yara_rule_settings, discovery_context):
+            for identifier in yara_results.keys():
+                for findings in yara_results[identifier]:
+                    event_data = {
+                        "description": f"{discovery_context} {yara_rule_settings.description} ({identifier})"
+                    }
+                    await self.report(event_data, event, yara_rule_settings, discovery_context, event_type="FINDING")
+
     def add_yara_rule(self, rule_name, rule_content, rule_instance):
         rule_instance.name = rule_name
         self.yara_rules_dict[rule_name] = rule_content

From 1991e28a8ab3f8072ea7a8fb4393869d9f35f620 Mon Sep 17 00:00:00 2001
From: liquidsec <paul.mueller08@gmail.com>
Date: Wed, 18 Feb 2026 15:25:18 -0500
Subject: [PATCH 09/28] improve wayback CDX error logging and increase timeout

Include the actual failure reason (timeout, connection error, HTTP status
code) in retry and warning messages so it's clear why archive.org requests
failed. Increase CDX timeout from +30s to +60s.
---
 bbot/modules/wayback.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py
index 5f62417a36..0c0497738e 100644
--- a/bbot/modules/wayback.py
+++ b/bbot/modules/wayback.py
@@ -187,15 +187,23 @@ async def _fetch_cdx(self, query):
         """Fetch URLs from the CDX API with retries.
Returns the URL list or None on failure.""" waybackurl = f"{self.base_url}/cdx/search/cdx?url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" r = None + last_error = None for i in range(3): - r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 30) - if r: - break + try: + r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 60, raise_error=True) + except Exception as e: + last_error = str(e) + r = None + if r is not None: + if r.status_code == 200: + break + last_error = f"HTTP status {r.status_code}" + r = None if i < 2: - self.verbose(f'Error connecting to archive.org for query "{query}", retrying ({i + 1}/2)') + self.verbose(f'Error connecting to archive.org for query "{query}" ({last_error}), retrying ({i + 1}/2)') await self.helpers.sleep(2**i) - if not r: - self.warning(f'Error connecting to archive.org for query "{query}"') + if r is None: + self.warning(f'Error connecting to archive.org for query "{query}": {last_error}') return None try: j = r.json() From 5d0cde76071bd99924dbe19e8427d2e6416e0fcf Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Wed, 18 Feb 2026 21:17:22 -0500 Subject: [PATCH 10/28] add rate limiting, retry, and bloom filter dedup to wayback archive fetching --- bbot/modules/wayback.py | 165 ++++++++++++------ .../module_tests/test_module_wayback.py | 77 ++++++++ 2 files changed, 191 insertions(+), 51 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 0c0497738e..363769748f 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -60,6 +60,10 @@ async def setup(self): self.garbage_threshold = self.config.get("garbage_threshold", 10) self._parameter_cache = {} self._archive_cache = {} + # bloom filter to deduplicate archive fetches by the response URL archive.org actually served + # (multiple request URLs can redirect to the same archived snapshot) + # 32M bits (~4MB) supports ~400K entries with negligible false-positive rate + self._archive_bloom = self.helpers.bloom_filter(32000000) return await super().setup() async def handle_event(self, event): @@ -343,6 +347,10 @@ def _strip_wayback_wrapper(self, body): body = self._wayback_stale_ref_re.sub("", body) return body + # archive.org rate-limits aggressively; keep concurrency low to avoid cascading timeouts + _archive_threads = 2 + _archive_max_retries = 2 + async def finish(self): if not self.archive or not self._archive_cache: return @@ -367,66 +375,121 @@ async def finish(self): if not url_metadata: return - gen = self.helpers.request_batch(list(url_metadata), timeout=self.http_timeout + 30, follow_redirects=True) + total = len(url_metadata) + self.info(f"Fetching {total:,} archived pages from archive.org (concurrency={self._archive_threads})") + + failed, succeeded, processed = await self._fetch_archive_batch(url_metadata, total, 0) + + # retry failed URLs with backoff + for retry_num in range(1, self._archive_max_retries + 1): + if not failed: + break + delay = 2**retry_num + self.info( + f"Retrying {len(failed):,} failed archive fetches (attempt {retry_num}/{self._archive_max_retries}, " + f"backoff {delay}s)" + ) + await self.helpers.sleep(delay) + retry_metadata = {url: url_metadata[url] for url in failed} + new_failed, new_succeeded, processed = await self._fetch_archive_batch( + retry_metadata, total, processed - len(failed) + ) + succeeded += new_succeeded + failed = new_failed + + if failed: + self.warning(f"Failed to fetch {len(failed):,} archived URLs after 
retries") + self.info(f"Archive loading complete: {succeeded:,}/{total:,} succeeded") + + async def _fetch_archive_batch(self, url_metadata, total, processed_offset): + """Fetch a batch of archive URLs. Returns (failed_urls, success_count, processed_count).""" + failed = [] + succeeded = 0 + processed = processed_offset + + gen = self.helpers.request_batch( + list(url_metadata), threads=self._archive_threads, timeout=self.http_timeout + 30, follow_redirects=True + ) async for archive_url, r in gen: + processed += 1 raw_url, parent_event = url_metadata[archive_url] if not r or r.status_code != 200: status = getattr(r, "status_code", "no response") if r else "no response" self.verbose(f"Archive fetch failed for {raw_url}: status={status}") + failed.append(archive_url) continue - j = self.helpers.response_to_json(r) - if not j: - self.verbose(f"Failed to parse archive response for {raw_url}") - continue + if await self._process_archive_response(r, raw_url, parent_event): + succeeded += 1 - if "body" in j: - j["body"] = self._strip_wayback_wrapper(j["body"]) + if processed % 50 == 0 or processed == total: + self.verbose(f"Archive progress: {processed:,}/{total:,} ({succeeded:,} succeeded, {len(failed):,} failed)") - # strip wayback-injected headers to prevent excavate from extracting archive.org artifacts - if "header" in j: - j["header"] = { - k: v for k, v in j["header"].items() if not k.startswith("x_archive_") and k != "set_cookie" - } - if "raw_header" in j: - j["raw_header"] = "\r\n".join( - line - for line in j["raw_header"].split("\r\n") - if not line.lower().startswith(("set-cookie:", "x-archive-")) - ) + return failed, succeeded, processed - # use the original URL so event.host returns the original host, not web.archive.org - # this prevents internal modules (speculate, host, dnsresolve) from treating archive.org as a target - parsed_original = urlparse(raw_url) - hostname = str(parsed_original.hostname or "") - port = parsed_original.port or (443 if parsed_original.scheme == "https" else 80) - scheme = parsed_original.scheme - # strip redundant port (e.g. :80 for http, :443 for https) - if (scheme == "http" and port == 80) or (scheme == "https" and port == 443): - netloc = hostname - else: - netloc = f"{hostname}:{port}" - j["url"] = urlunparse((scheme, netloc, parsed_original.path or "/", "", parsed_original.query, "")) - # store the archive URL for provenance — downstream modules can check this field - j["archive_url"] = str(r.url) - # override host/port/scheme/path to match the original URL (response_to_json set them from archive.org) - j["host"] = hostname - j["port"] = port - j["scheme"] = scheme - j["path"] = parsed_original.path or "/" - - http_response = self.make_event( - j, - "HTTP_RESPONSE", - parent_event, - tags=["from-wayback", "archived"], - context=f"{{module}} loaded archived version of {raw_url} from the Wayback Machine", + async def _process_archive_response(self, r, raw_url, parent_event): + """Process a successful archive.org response into an HTTP_RESPONSE event. 
Returns True on success.""" + # deduplicate by the actual response URL archive.org served (after redirects) + # multiple request URLs can redirect to the same archived snapshot + response_url = str(r.url) + if response_url in self._archive_bloom: + self.verbose(f"Skipping duplicate archive response for {raw_url} (response URL: {response_url})") + return False + self._archive_bloom.add(response_url) + + j = self.helpers.response_to_json(r) + if not j: + self.verbose(f"Failed to parse archive response for {raw_url}") + return False + + if "body" in j: + j["body"] = self._strip_wayback_wrapper(j["body"]) + + # strip wayback-injected headers to prevent excavate from extracting archive.org artifacts + if "header" in j: + j["header"] = { + k: v for k, v in j["header"].items() if not k.startswith("x_archive_") and k != "set_cookie" + } + if "raw_header" in j: + j["raw_header"] = "\r\n".join( + line + for line in j["raw_header"].split("\r\n") + if not line.lower().startswith(("set-cookie:", "x-archive-")) ) - if http_response is None: - self.verbose(f"Failed to create HTTP_RESPONSE event for {raw_url}") - continue - # keep the event in scope so modules like badsecrets can process the archived content - http_response.scope_distance = 0 - self.verbose(f"Emitting archived HTTP_RESPONSE for dead URL: {raw_url}") - await self.emit_event(http_response) + + # use the original URL so event.host returns the original host, not web.archive.org + # this prevents internal modules (speculate, host, dnsresolve) from treating archive.org as a target + parsed_original = urlparse(raw_url) + hostname = str(parsed_original.hostname or "") + port = parsed_original.port or (443 if parsed_original.scheme == "https" else 80) + scheme = parsed_original.scheme + # strip redundant port (e.g. 
:80 for http, :443 for https) + if (scheme == "http" and port == 80) or (scheme == "https" and port == 443): + netloc = hostname + else: + netloc = f"{hostname}:{port}" + j["url"] = urlunparse((scheme, netloc, parsed_original.path or "/", "", parsed_original.query, "")) + # store the archive URL for provenance — downstream modules can check this field + j["archive_url"] = str(r.url) + # override host/port/scheme/path to match the original URL (response_to_json set them from archive.org) + j["host"] = hostname + j["port"] = port + j["scheme"] = scheme + j["path"] = parsed_original.path or "/" + + http_response = self.make_event( + j, + "HTTP_RESPONSE", + parent_event, + tags=["from-wayback", "archived"], + context=f"{{module}} loaded archived version of {raw_url} from the Wayback Machine", + ) + if http_response is None: + self.verbose(f"Failed to create HTTP_RESPONSE event for {raw_url}") + return False + # keep the event in scope so modules like badsecrets can process the archived content + http_response.scope_distance = 0 + self.verbose(f"Emitting archived HTTP_RESPONSE for dead URL: {raw_url}") + await self.emit_event(http_response) + return True diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index 9b2f322332..d7636f4d21 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -441,3 +441,80 @@ def check(self, module_test, events): stripped = w._strip_wayback_wrapper(body) assert "/web/20250529193232js_/" not in stripped assert "https://www.example.com/script.js" in stripped + + +class TestWaybackArchiveBloomDedup(ModuleTestBase): + """When multiple archive URLs redirect to the same snapshot, bloom filter prevents duplicate HTTP_RESPONSEs.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + async def setup_after_prep(self, module_test): + # CDX returns two different dead URLs + module_test.httpx_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + json=[ + ["original"], + ["http://127.0.0.1:1/page-a"], + ["http://127.0.0.1:1/page-b"], + ], + ) + # both archive URLs redirect to the same archived snapshot + redirect_target = "http://web.archive.org/web/20230101120000/http://127.0.0.1:1/same-page" + module_test.httpx_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/page-a", + status_code=301, + headers={"Location": redirect_target}, + ) + module_test.httpx_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/page-b", + status_code=301, + headers={"Location": redirect_target}, + ) + # two responses for the redirect target (one consumed per redirect) + for _ in range(2): + module_test.httpx_mock.add_response( + url=redirect_target, + text="<html><body>archived content</body></html>", + headers={"Content-Type": "text/html"}, + ) + + def check(self, module_test, events): + http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags] + assert len(http_responses) == 1, ( + f"Expected exactly 1 archived HTTP_RESPONSE (bloom dedup should prevent duplicate), got {len(http_responses)}" + ) + + +class TestWaybackArchiveRetry(ModuleTestBase): + """Archive fetches that fail on first attempt should be retried and 
succeed.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + async def setup_after_prep(self, module_test): + module_test.httpx_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + json=[["original"], ["http://127.0.0.1:1/retry-page"]], + ) + # first attempt: 503 (archive.org overloaded) + module_test.httpx_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/retry-page", + status_code=503, + ) + # retry attempt: 200 + module_test.httpx_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/retry-page", + text="<html><body>recovered content</body></html>", + headers={"Content-Type": "text/html"}, + ) + + def check(self, module_test, events): + http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags] + assert len(http_responses) == 1, ( + f"Expected 1 archived HTTP_RESPONSE from retry, got {len(http_responses)}" + ) From c0ccda545ad28a6a139b5d0535e65c12af9e4f72 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Wed, 18 Feb 2026 21:42:47 -0500 Subject: [PATCH 11/28] add CDX server-side filters and 100k URL limit to wayback module --- bbot/modules/wayback.py | 16 ++++- .../module_tests/test_module_wayback.py | 68 +++++++++---------- 2 files changed, 49 insertions(+), 35 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 363769748f..f90662f930 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -187,9 +187,23 @@ async def _check_interesting_files(self, interesting_files, event): context=f"{{module}} found interesting archived file: {raw_url}", ) + # CDX API filters applied server-side to reduce response size + _cdx_filters = ( + "filter=!statuscode:404", + "filter=!statuscode:301", + "filter=!statuscode:302", + "filter=!mimetype:image/.*", + "filter=!mimetype:text/css", + "filter=!mimetype:warc/revisit", + ) + _cdx_limit = 100000 + async def _fetch_cdx(self, query): """Fetch URLs from the CDX API with retries. 
Returns the URL list or None on failure.""" - waybackurl = f"{self.base_url}/cdx/search/cdx?url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" + params = f"url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" + params += f"&limit={self._cdx_limit}" + params += "&" + "&".join(self._cdx_filters) + waybackurl = f"{self.base_url}/cdx/search/cdx?{params}" r = None last_error = None for i in range(3): diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index d7636f4d21..576f2b1723 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -11,7 +11,7 @@ class TestWayback(ModuleTestBase): async def setup_after_prep(self, module_test): module_test.blasthttp_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://asdf.blacklanternsecurity.com"]], ) @@ -26,8 +26,8 @@ class TestWaybackParameters(ModuleTestBase): config_overrides = {"modules": {"wayback": {"urls": True, "parameters": True}}} async def setup_after_prep(self, module_test): - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[ ["original"], ["http://127.0.0.1:8888/page?foo=bar&baz=qux"], @@ -65,11 +65,11 @@ class TestWaybackInterestingFiles(ModuleTestBase): config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://blacklanternsecurity.com/backup/site.zip"]], ) - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://blacklanternsecurity.com/backup/site.zip", headers={"Content-Type": "application/zip", "Content-Length": "1048576"}, ) @@ -106,12 +106,12 @@ class TestWaybackArchive(ModuleTestBase): async def setup_after_prep(self, module_test): # wayback returns a URL on an unreachable port — httpx binary can't verify it - module_test.httpx_mock.add_response( - 
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://127.0.0.1:1/deadpage"]], ) # the archived page itself contains the vulnerable viewstate - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/deadpage", text=self.sample_viewstate, headers={"Content-Type": "text/html"}, @@ -148,8 +148,8 @@ class TestWaybackHttpHttpsDedup(ModuleTestBase): config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[ ["original"], ["http://blacklanternsecurity.com/page"], @@ -175,8 +175,8 @@ class TestWaybackHttpOnlyKept(ModuleTestBase): config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[ ["original"], ["http://blacklanternsecurity.com/old-http-only"], @@ -200,8 +200,8 @@ class TestWaybackCdnCgiBlacklist(ModuleTestBase): config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[ ["original"], ["https://blacklanternsecurity.com/cdn-cgi/challenge-platform/h/g/something"], @@ -229,11 +229,11 @@ class TestWaybackArchiveHostField(ModuleTestBase): config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} async def setup_after_prep(self, module_test): - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + 
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://127.0.0.1:1/archived-page"]], ) - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/archived-page", text="<html><body>archived content</body></html>", headers={"Content-Type": "text/html"}, @@ -273,12 +273,12 @@ class TestWaybackArchiveHuntFinding(ModuleTestBase): async def setup_after_prep(self, module_test): # CDX returns a dead URL (port 1 = unreachable) with a huntable form - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://127.0.0.1:1/search"]], ) # the archived page contains a form with "redirect" — a known hunt parameter - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/search", text='<html><form method="GET" action="/search"><input name="redirect" value="test"></form></html>', headers={"Content-Type": "text/html"}, @@ -359,8 +359,8 @@ def request_handler(self, request): async def setup_after_prep(self, module_test): module_test.scan.modules["lightfuzz"].helpers.rand_string = lambda *args, **kwargs: "AAAAAAAAAAAAAA" # CDX returns a URL with a search parameter pointing at the local httpserver - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://127.0.0.1:8888/?search=test"]], ) # httpserver handles httpx verification and lightfuzz probes @@ -390,8 +390,8 @@ class TestWaybackStripBodyArtifacts(ModuleTestBase): modules_overrides = ["wayback"] async def setup_after_prep(self, module_test): - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"]], ) @@ -453,8 +453,8 @@ class TestWaybackArchiveBloomDedup(ModuleTestBase): async def setup_after_prep(self, module_test): # CDX returns two different dead URLs - module_test.httpx_mock.add_response( - 
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[ ["original"], ["http://127.0.0.1:1/page-a"], @@ -463,19 +463,19 @@ async def setup_after_prep(self, module_test): ) # both archive URLs redirect to the same archived snapshot redirect_target = "http://web.archive.org/web/20230101120000/http://127.0.0.1:1/same-page" - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/page-a", status_code=301, headers={"Location": redirect_target}, ) - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/page-b", status_code=301, headers={"Location": redirect_target}, ) # two responses for the redirect target (one consumed per redirect) for _ in range(2): - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url=redirect_target, text="<html><body>archived content</body></html>", headers={"Content-Type": "text/html"}, @@ -497,17 +497,17 @@ class TestWaybackArchiveRetry(ModuleTestBase): config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} async def setup_after_prep(self, module_test): - module_test.httpx_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://127.0.0.1:1/retry-page"]], ) # first attempt: 503 (archive.org overloaded) - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/retry-page", status_code=503, ) # retry attempt: 200 - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/retry-page", text="<html><body>recovered content</body></html>", headers={"Content-Type": "text/html"}, From b55d2a1ec50f515fb72b3b32ea38118727dab4a1 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Wed, 18 Feb 2026 23:01:24 -0500 Subject: [PATCH 12/28] fixing wayback rate limiting --- bbot/modules/wayback.py | 102 +++++++++++++++--- .../module_tests/test_module_wayback.py | 4 +- 2 files changed, 87 insertions(+), 19 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index f90662f930..9e7a06ba8e 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -218,7 +218,9 @@ async def _fetch_cdx(self, query): last_error = f"HTTP status {r.status_code}" r = None if i < 2: - self.verbose(f'Error connecting to archive.org for query "{query}" ({last_error}), retrying ({i + 1}/2)') + self.verbose( + f'Error connecting to archive.org for query "{query}" ({last_error}), retrying ({i + 1}/2)' + ) await self.helpers.sleep(2**i) if r 
is None: self.warning(f'Error connecting to archive.org for query "{query}": {last_error}') @@ -361,9 +363,12 @@ def _strip_wayback_wrapper(self, body): body = self._wayback_stale_ref_re.sub("", body) return body - # archive.org rate-limits aggressively; keep concurrency low to avoid cascading timeouts - _archive_threads = 2 - _archive_max_retries = 2 + # archive.org rate-limits aggressively; pace requests to avoid cascading ReadErrors + _archive_per_request_retries = 3 + _archive_batch_retries = 1 + _archive_delay = 0.5 # seconds between successful requests + _archive_error_delay = 3 # initial backoff seconds after a failed request + _archive_429_default_delay = 30 # default delay on 429 when no retry-after header async def finish(self): if not self.archive or not self._archive_cache: @@ -394,13 +399,14 @@ async def finish(self): failed, succeeded, processed = await self._fetch_archive_batch(url_metadata, total, 0) - # retry failed URLs with backoff - for retry_num in range(1, self._archive_max_retries + 1): + # batch-level retry as safety net (per-request retry handles most transient errors, + # but a temporary outage window could still leave a batch of failures) + for retry_num in range(1, self._archive_batch_retries + 1): if not failed: break - delay = 2**retry_num + delay = 30 * retry_num self.info( - f"Retrying {len(failed):,} failed archive fetches (attempt {retry_num}/{self._archive_max_retries}, " + f"Retrying {len(failed):,} failed archive fetches (batch retry {retry_num}/{self._archive_batch_retries}, " f"backoff {delay}s)" ) await self.helpers.sleep(delay) @@ -416,21 +422,22 @@ async def finish(self): self.info(f"Archive loading complete: {succeeded:,}/{total:,} succeeded") async def _fetch_archive_batch(self, url_metadata, total, processed_offset): - """Fetch a batch of archive URLs. Returns (failed_urls, success_count, processed_count).""" + """Fetch a batch of archive URLs with per-request retry and rate-limit handling. + + Returns (failed_urls, success_count, processed_count). + """ failed = [] succeeded = 0 processed = processed_offset - gen = self.helpers.request_batch( - list(url_metadata), threads=self._archive_threads, timeout=self.http_timeout + 30, follow_redirects=True - ) - async for archive_url, r in gen: + for archive_url, (raw_url, parent_event) in url_metadata.items(): processed += 1 - raw_url, parent_event = url_metadata[archive_url] + + r = await self._fetch_single_archive_url(archive_url, raw_url) if not r or r.status_code != 200: status = getattr(r, "status_code", "no response") if r else "no response" - self.verbose(f"Archive fetch failed for {raw_url}: status={status}") + self.verbose(f"Archive fetch failed for {raw_url} after retries: status={status}") failed.append(archive_url) continue @@ -438,10 +445,73 @@ async def _fetch_archive_batch(self, url_metadata, total, processed_offset): succeeded += 1 if processed % 50 == 0 or processed == total: - self.verbose(f"Archive progress: {processed:,}/{total:,} ({succeeded:,} succeeded, {len(failed):,} failed)") + self.verbose( + f"Archive progress: {processed:,}/{total:,} ({succeeded:,} succeeded, {len(failed):,} failed)" + ) + + # pace requests to avoid triggering rate limits + await self.helpers.sleep(self._archive_delay) return failed, succeeded, processed + async def _fetch_single_archive_url(self, archive_url, raw_url): + """Fetch a single archive URL with per-request retry, 429 handling, and backoff. 
+
+        archive.org rate-limits CDX at ~60 req/min and blocks the IP at the firewall
+        if 429 responses are ignored for more than a minute. We must respect 429
+        Retry-After.
+        """
+        r = None
+        for attempt in range(self._archive_per_request_retries):
+            try:
+                r = await self.helpers.request(
+                    archive_url, timeout=self.http_timeout + 60, follow_redirects=True, raise_error=True
+                )
+            except Exception as e:
+                r = None
+                if attempt < self._archive_per_request_retries - 1:
+                    delay = self._archive_error_delay * (2**attempt)
+                    self.verbose(
+                        f"Archive fetch error for {raw_url} (attempt {attempt + 1}/{self._archive_per_request_retries}): "
+                        f"{e} -- retrying in {delay}s"
+                    )
+                    await self.helpers.sleep(delay)
+                else:
+                    self.verbose(
+                        f"Archive fetch error for {raw_url} (final attempt {attempt + 1}/{self._archive_per_request_retries}): {e}"
+                    )
+                continue
+
+            if r.status_code == 429:
+                retry_after = r.headers.get("retry-after", "")
+                try:
+                    delay = min(int(retry_after), 120)
+                except (ValueError, TypeError):
+                    delay = self._archive_429_default_delay
+                self.verbose(f"Archive.org rate limit (429) for {raw_url}, sleeping {delay}s")
+                await self.helpers.sleep(delay)
+                r = None
+                continue
+
+            if r.status_code == 200:
+                return r
+
+            # non-200, non-429 status
+            if attempt < self._archive_per_request_retries - 1:
+                delay = self._archive_error_delay * (2**attempt)
+                self.verbose(
+                    f"Archive fetch got HTTP {r.status_code} for {raw_url} "
+                    f"(attempt {attempt + 1}/{self._archive_per_request_retries}), retrying in {delay}s"
+                )
+                await self.helpers.sleep(delay)
+            else:
+                self.verbose(
+                    f"Archive fetch got HTTP {r.status_code} for {raw_url} "
+                    f"(final attempt {attempt + 1}/{self._archive_per_request_retries})"
+                )
+                r = None
+
+        return r
+
     async def _process_archive_response(self, r, raw_url, parent_event):
         """Process a successful archive.org response into an HTTP_RESPONSE event.
Returns True on success.""" # deduplicate by the actual response URL archive.org served (after redirects) diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index 576f2b1723..ef4e87780e 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -515,6 +515,4 @@ async def setup_after_prep(self, module_test): def check(self, module_test, events): http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags] - assert len(http_responses) == 1, ( - f"Expected 1 archived HTTP_RESPONSE from retry, got {len(http_responses)}" - ) + assert len(http_responses) == 1, f"Expected 1 archived HTTP_RESPONSE from retry, got {len(http_responses)}" From ab154c8a18fc659f4a0815edfc8e581a9e5f98d1 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Thu, 19 Feb 2026 00:41:12 -0500 Subject: [PATCH 13/28] improving wayback delay system --- bbot/modules/wayback.py | 42 ++++++- .../module_tests/test_module_wayback.py | 104 ++++++++++++++++++ 2 files changed, 142 insertions(+), 4 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 9e7a06ba8e..465377f4b6 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -1,4 +1,5 @@ import re +from collections import Counter from datetime import datetime from urllib.parse import parse_qs, urlparse, urlunparse @@ -31,6 +32,24 @@ class wayback(subdomain_enum): interesting_extensions = frozenset({"zip", "sql", "bak", "env", "config"}) interesting_compound_extensions = frozenset({"tar.gz", "tar.bz2"}) + # maximum URL length before we consider it garbage (crawler traps produce absurdly long URLs) + _max_url_length = 2000 + # if any single path segment repeats more than this many times, it's a path loop / crawler trap + _max_path_segment_repeats = 3 + + def _is_garbage_url(self, url): + """Detect crawler-trap URLs with repeating path segments or excessive length.""" + if len(url) > self._max_url_length: + return True + path = urlparse(url).path + if not path: + return False + segments = [s for s in path.split("/") if s] + if not segments: + return False + counts = Counter(segments) + return counts.most_common(1)[0][1] > self._max_path_segment_repeats + def _is_interesting_file(self, url): ext = get_file_extension(url) if ext and ext.lower() in self.interesting_extensions: @@ -199,7 +218,7 @@ async def _check_interesting_files(self, interesting_files, event): _cdx_limit = 100000 async def _fetch_cdx(self, query): - """Fetch URLs from the CDX API with retries. Returns the URL list or None on failure.""" + """Fetch URLs from the CDX API with retries and 429 handling. 
Returns the URL list or None on failure.""" params = f"url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" params += f"&limit={self._cdx_limit}" params += "&" + "&".join(self._cdx_filters) @@ -215,6 +234,17 @@ async def _fetch_cdx(self, query): if r is not None: if r.status_code == 200: break + if r.status_code == 429: + retry_after = r.headers.get("retry-after", "") + try: + delay = min(int(retry_after), 120) + except (ValueError, TypeError): + delay = self._archive_429_default_delay + last_error = "HTTP 429 rate limited" + self.verbose(f'Archive.org rate limit (429) for CDX query "{query}", sleeping {delay}s') + await self.helpers.sleep(delay) + r = None + continue last_error = f"HTTP status {r.status_code}" r = None if i < 2: @@ -244,6 +274,8 @@ def _pre_process_urls(self, urls): parsed = urlparse(url) if any(bl in url for bl in self.url_blacklist): continue + if self._is_garbage_url(url): + continue if not (parsed.hostname and self.scan.in_scope(parsed.hostname)): continue @@ -283,8 +315,10 @@ async def query(self, query): self.verbose(f"Found {len(urls):,} URLs for {query}") - # filter blacklisted URLs before any further processing - urls = [url for url in urls if not any(bl in url for bl in self.url_blacklist)] + # filter blacklisted and garbage URLs before any further processing + urls = [ + url for url in urls if not any(bl in url for bl in self.url_blacklist) and not self._is_garbage_url(url) + ] # pre-extract metadata from raw URLs before collapse strips query strings raw_url_params, archive_urls, interesting_files = {}, {}, {} @@ -395,7 +429,7 @@ async def finish(self): return total = len(url_metadata) - self.info(f"Fetching {total:,} archived pages from archive.org (concurrency={self._archive_threads})") + self.info(f"Fetching {total:,} archived pages from archive.org") failed, succeeded, processed = await self._fetch_archive_batch(url_metadata, total, 0) diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index ef4e87780e..9ee842a8fb 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -497,6 +497,9 @@ class TestWaybackArchiveRetry(ModuleTestBase): config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} async def setup_after_prep(self, module_test): + # speed up retries for testing + module_test.scan.modules["wayback"]._archive_error_delay = 0.01 + module_test.scan.modules["wayback"]._archive_delay = 0 module_test.blasthttp_mock.add_response( url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://127.0.0.1:1/retry-page"]], @@ -516,3 +519,104 @@ async def setup_after_prep(self, module_test): def check(self, module_test, events): http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags] assert len(http_responses) == 1, f"Expected 1 archived HTTP_RESPONSE from retry, got {len(http_responses)}" + + +class TestWaybackGarbageUrlFilter(ModuleTestBase): + """Crawler-trap URLs with repeating path segments should be filtered out.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + whitelist = ["blacklanternsecurity.com"] + config_overrides = 
{"modules": {"wayback": {"urls": True}}} + + async def setup_after_prep(self, module_test): + # build a crawler-trap URL with repeating path segments (like the real-world example) + repeating = "/themes/sites/example.com".lstrip("/") + garbage_path = "/get-materials/" + "/".join([repeating] * 20) + garbage_url = f"https://blacklanternsecurity.com{garbage_path}" + module_test.httpx_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + [garbage_url], + ["https://blacklanternsecurity.com/real-page"], + ], + ) + + def check(self, module_test, events): + # garbage URL should be filtered + assert not any(e.type == "URL_UNVERIFIED" and "get-materials" in e.data for e in events), ( + "Crawler-trap URL with repeating path segments should have been filtered" + ) + # real page should still be emitted + assert any(e.type == "URL_UNVERIFIED" and "real-page" in e.data for e in events), ( + "Non-garbage URL should have been emitted" + ) + + +class TestWaybackGarbageUrlLength(ModuleTestBase): + """Excessively long URLs should be filtered out as garbage.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + whitelist = ["blacklanternsecurity.com"] + config_overrides = {"modules": {"wayback": {"urls": True}}} + + async def setup_after_prep(self, module_test): + # URL exceeding 2000 character limit + long_url = "https://blacklanternsecurity.com/" + "a" * 2000 + module_test.httpx_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + [long_url], + ["https://blacklanternsecurity.com/normal-page"], + ], + ) + + def check(self, module_test, events): + # long URL should be filtered + assert not any(e.type == "URL_UNVERIFIED" and "aaaa" in e.data for e in events), ( + "Excessively long URL should have been filtered" + ) + # normal page should still be emitted + assert any(e.type == "URL_UNVERIFIED" and "normal-page" in e.data for e in events), ( + "Normal-length URL should have been emitted" + ) + + +class TestWaybackArchive429Retry(ModuleTestBase): + """Archive fetches that get 429 rate-limited should back off and retry successfully.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + async def setup_after_prep(self, module_test): + # speed up delays for testing + module_test.scan.modules["wayback"]._archive_429_default_delay = 0.01 + module_test.scan.modules["wayback"]._archive_error_delay = 0.01 + module_test.scan.modules["wayback"]._archive_delay = 0 + module_test.httpx_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[["original"], ["http://127.0.0.1:1/rate-limited-page"]], + ) + # first attempt: 429 rate 
limited + module_test.httpx_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/rate-limited-page", + status_code=429, + headers={"Retry-After": "1"}, + ) + # retry after backoff: 200 + module_test.httpx_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/rate-limited-page", + text="<html><body>content after rate limit</body></html>", + headers={"Content-Type": "text/html"}, + ) + + def check(self, module_test, events): + http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags] + assert len(http_responses) == 1, ( + f"Expected 1 archived HTTP_RESPONSE after 429 retry, got {len(http_responses)}" + ) From 454f1698a1df645125a9ec9aabbcaa1f000bf537 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Thu, 19 Feb 2026 07:40:35 -0500 Subject: [PATCH 14/28] make cpu heavy processing non-blocking --- bbot/modules/wayback.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 465377f4b6..cfd6a293bd 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -3,11 +3,21 @@ from datetime import datetime from urllib.parse import parse_qs, urlparse, urlunparse +import orjson + from bbot.core.helpers.misc import get_file_extension from bbot.core.helpers.validators import clean_url from bbot.modules.templates.subdomain_enum import subdomain_enum +def _parse_cdx_response(text): + """Parse CDX JSON response text into a URL list. Designed to run in a separate process.""" + j = orjson.loads(text) + if not isinstance(j, list): + return None + return [result[0] for result in j[1:] if result] + + class wayback(subdomain_enum): flags = ["safe", "passive", "subdomain-enum"] watched_events = ["DNS_NAME", "URL"] @@ -255,13 +265,16 @@ async def _fetch_cdx(self, query): if r is None: self.warning(f'Error connecting to archive.org for query "{query}": {last_error}') return None + # parse JSON + extract URLs in a separate process to avoid blocking the event loop + # (CDX responses can contain 100k+ entries) try: - j = r.json() - assert type(j) == list + urls = await self.helpers.run_in_executor_mp(_parse_cdx_response, r.text) except Exception: + urls = None + if urls is None: self.warning(f'Error JSON-decoding archive.org response for query "{query}"') return None - return [result[0] for result in j[1:] if result] + return urls def _pre_process_urls(self, urls): """Extract parameters, archive URLs, and interesting files from raw CDX URLs before collapse.""" From ec111a90e67a9df4011942b586e2e8847c91c481 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Thu, 19 Feb 2026 09:23:20 -0500 Subject: [PATCH 15/28] make max_records configurable, fix archive retry logic, demote log level - Add max_records option (default 100000) for CDX API limit - Only retry archive fetches on connection errors/429, not on definitive HTTP status codes - Change "Loading archived URLs" message from hugeinfo to verbose - Update retry test to use ReadError instead of 503 --- bbot/modules/wayback.py | 29 +++++-------------- .../module_tests/test_module_wayback.py | 3 +- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index cfd6a293bd..7071b6f0f6 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -27,12 +27,13 @@ class wayback(subdomain_enum): "created_date": "2022-04-01", "author": "@liquidsec", } - options = {"urls": False, "garbage_threshold": 10, 
"parameters": False, "archive": False} + options = {"urls": False, "garbage_threshold": 10, "parameters": False, "archive": False, "max_records": 100000} options_desc = { "urls": "emit URLs in addition to DNS_NAMEs", "garbage_threshold": "Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data)", "parameters": "emit WEB_PARAMETER events for query parameters discovered in archived URLs (requires urls=true)", "archive": "fetch archived versions of dead URLs from the Wayback Machine and emit HTTP_RESPONSE events (requires urls=true)", + "max_records": "Maximum number of URLs to fetch from the CDX API", } in_scope_only = True @@ -87,6 +88,7 @@ async def setup(self): self.hugewarning("archive option requires urls to be enabled. Please add modules.wayback.urls=True") return False self.garbage_threshold = self.config.get("garbage_threshold", 10) + self.max_records = self.config.get("max_records", 100000) self._parameter_cache = {} self._archive_cache = {} # bloom filter to deduplicate archive fetches by the response URL archive.org actually served @@ -225,12 +227,10 @@ async def _check_interesting_files(self, interesting_files, event): "filter=!mimetype:text/css", "filter=!mimetype:warc/revisit", ) - _cdx_limit = 100000 - async def _fetch_cdx(self, query): """Fetch URLs from the CDX API with retries and 429 handling. Returns the URL list or None on failure.""" params = f"url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" - params += f"&limit={self._cdx_limit}" + params += f"&limit={self.max_records}" params += "&" + "&".join(self._cdx_filters) waybackurl = f"{self.base_url}/cdx/search/cdx?{params}" r = None @@ -421,7 +421,7 @@ async def finish(self): if not self.archive or not self._archive_cache: return - self.hugeinfo(f"Loading {len(self._archive_cache):,} archived URLs from the Wayback Machine") + self.verbose(f"Loading {len(self._archive_cache):,} archived URLs from the Wayback Machine") # build combined set of extensions to skip (blacklist + static + special) skip_extensions = set(self.scan.url_extension_blacklist) @@ -539,23 +539,8 @@ async def _fetch_single_archive_url(self, archive_url, raw_url): r = None continue - if r.status_code == 200: - return r - - # non-200, non-429 status - if attempt < self._archive_per_request_retries - 1: - delay = self._archive_error_delay * (2**attempt) - self.verbose( - f"Archive fetch got HTTP {r.status_code} for {raw_url} " - f"(attempt {attempt + 1}/{self._archive_per_request_retries}), retrying in {delay}s" - ) - await self.helpers.sleep(delay) - else: - self.verbose( - f"Archive fetch got HTTP {r.status_code} for {raw_url} " - f"(final attempt {attempt + 1}/{self._archive_per_request_retries})" - ) - r = None + # any other status code (200, 404, 503, etc.) 
is a definitive answer — return it + return r return r diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index 9ee842a8fb..9bbd2b279e 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -489,7 +489,7 @@ def check(self, module_test, events): class TestWaybackArchiveRetry(ModuleTestBase): - """Archive fetches that fail on first attempt should be retried and succeed.""" + """Archive fetches that fail transiently (connection error) should be retried and succeed.""" module_name = "wayback" modules_overrides = ["wayback"] @@ -507,7 +507,6 @@ async def setup_after_prep(self, module_test): # first attempt: 503 (archive.org overloaded) module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/retry-page", - status_code=503, ) # retry attempt: 200 module_test.blasthttp_mock.add_response( From 630bbb588f5ddaa3d5c7e09ff27611de84013cbd Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Thu, 19 Feb 2026 23:44:46 -0500 Subject: [PATCH 16/28] fix _event_host() using resolved IP instead of URL hostname --- bbot/modules/internal/excavate.py | 26 +++++++++----- .../module_tests/test_module_excavate.py | 35 +++++++++++++++++++ 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py index 6033a42710..319cc0ee95 100644 --- a/bbot/modules/internal/excavate.py +++ b/bbot/modules/internal/excavate.py @@ -366,24 +366,32 @@ def in_bl(self, value): return False + def _is_archived(self, event): + """Check if an event represents archived wayback content.""" + return isinstance(event.data, dict) and "archive_url" in event.data + def _event_host(self, event): - """Get the effective host from an event, preferring data["host"] over parsed_url. + """Get the effective host from an event. + + For archived wayback content, data["host"] contains the original target hostname + (since data["url"] points to archive.org). For regular events, we use event.host. - HTTP_RESPONSE._host() derives from parsed_url.hostname (i.e. data["url"]), - but data["host"] may be explicitly overridden (e.g. for archived wayback content - where url is archive.org but host is the original target). + NOTE: Regular HTTP_RESPONSE events also have data["host"], but it contains the + resolved IP from the httpx binary — NOT a hostname override. """ - if isinstance(event.data, dict) and event.data.get("host"): + if self._is_archived(event) and event.data.get("host"): return str(event.data["host"]) return str(event.host) def _event_base_url(self, event): - """Reconstruct the effective base URL from event data fields. + """Get the effective base URL from an event. - For normal HTTP_RESPONSE events, this matches event.parsed_url. - For archived content (e.g. wayback), the data fields (host/scheme/path) - reflect the original URL while parsed_url comes from the archive URL. + For archived wayback content, reconstructs the original URL from override fields + (host/scheme/port/path) since parsed_url points to archive.org. + For regular events, returns event.parsed_url directly. 
""" + if not self._is_archived(event): + return event.parsed_url scheme = event.data.get("scheme", event.parsed_url.scheme) host = self._event_host(event) port = event.data.get("port") diff --git a/bbot/test/test_step_2/module_tests/test_module_excavate.py b/bbot/test/test_step_2/module_tests/test_module_excavate.py index 6d56911723..996f8bec9b 100644 --- a/bbot/test/test_step_2/module_tests/test_module_excavate.py +++ b/bbot/test/test_step_2/module_tests/test_module_excavate.py @@ -1290,6 +1290,41 @@ def check(self, module_test, events): assert not web_parameter_outofscope, "Out of scope domain was emitted" +class TestExcavate_webparameter_ip_host(ModuleTestBase): + """Verify that when the httpx binary resolves a hostname to an IP (data["host"]), + excavate still uses the URL hostname for WEB_PARAMETER host — not the resolved IP. + + This test uses 'localhost' as the target. The httpx binary resolves it to 127.0.0.1 + and sets data["host"] = "127.0.0.1" in its JSON output. Without the archive_url guard + in _event_host(), this IP would be used as the WEB_PARAMETER host, putting it out of + scope and preventing downstream modules (like lightfuzz) from processing it. + """ + + targets = ["http://localhost:8888"] + modules_overrides = ["httpx", "excavate", "hunt"] + config_overrides = {"interactsh_disable": True} + + async def setup_after_prep(self, module_test): + await module_test.mock_dns({"localhost": {"A": ["127.0.0.1"]}}) + module_test.httpserver.expect_request("/").respond_with_data( + "<html><p>hello</p></html>", + status=200, + headers={"Set-Cookie": "session=abc123; Path=/"}, + ) + + def check(self, module_test, events): + web_params = [e for e in events if e.type == "WEB_PARAMETER" and e.data["name"] == "session"] + assert len(web_params) > 0, "WEB_PARAMETER for 'session' cookie was not emitted" + for wp in web_params: + assert wp.data["host"] != "127.0.0.1", ( + f"WEB_PARAMETER host should be 'localhost', not the resolved IP '127.0.0.1'. 
" + f"excavate._event_host() is using data['host'] (resolved IP) instead of event.host" + ) + assert wp.data["host"] == "localhost", ( + f"WEB_PARAMETER host should be 'localhost', got '{wp.data['host']}'" + ) + + class TestExcavateHeaders(ModuleTestBase): targets = ["http://127.0.0.1:8888/"] modules_overrides = ["excavate", "http", "hunt"] From b11d3ef2a87ee2c48b4298990d9174cafe8b1296 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Fri, 20 Feb 2026 22:47:28 -0500 Subject: [PATCH 17/28] skip URL collapse when there are no URLs to process --- bbot/modules/wayback.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 7071b6f0f6..c222cef348 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -338,6 +338,9 @@ async def query(self, query): if self.parameters or self.archive or self.urls: raw_url_params, archive_urls, interesting_files = self._pre_process_urls(urls) + if not urls: + return results, interesting_files + dns_names = set() collapsed_urls = 0 start_time = datetime.now() From 4ab6588c71bd74c2167919f009915eb916a5f0c2 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Mon, 23 Feb 2026 13:03:27 -0500 Subject: [PATCH 18/28] ruff format --- bbot/modules/wayback.py | 1 + bbot/test/test_step_2/module_tests/test_module_excavate.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index c222cef348..3ced55937d 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -227,6 +227,7 @@ async def _check_interesting_files(self, interesting_files, event): "filter=!mimetype:text/css", "filter=!mimetype:warc/revisit", ) + async def _fetch_cdx(self, query): """Fetch URLs from the CDX API with retries and 429 handling. Returns the URL list or None on failure.""" params = f"url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" diff --git a/bbot/test/test_step_2/module_tests/test_module_excavate.py b/bbot/test/test_step_2/module_tests/test_module_excavate.py index 996f8bec9b..25d38644f5 100644 --- a/bbot/test/test_step_2/module_tests/test_module_excavate.py +++ b/bbot/test/test_step_2/module_tests/test_module_excavate.py @@ -1320,9 +1320,7 @@ def check(self, module_test, events): f"WEB_PARAMETER host should be 'localhost', not the resolved IP '127.0.0.1'. 
" f"excavate._event_host() is using data['host'] (resolved IP) instead of event.host" ) - assert wp.data["host"] == "localhost", ( - f"WEB_PARAMETER host should be 'localhost', got '{wp.data['host']}'" - ) + assert wp.data["host"] == "localhost", f"WEB_PARAMETER host should be 'localhost', got '{wp.data['host']}'" class TestExcavateHeaders(ModuleTestBase): From 786bc09ec8949e43533e207d137da480a4d7d340 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Mon, 23 Feb 2026 14:24:24 -0500 Subject: [PATCH 19/28] add timeout and recovery protections to run_in_executor_mp --- bbot/core/helpers/helper.py | 24 +++++++++++++++++++----- bbot/scanner/scanner.py | 4 ++-- bbot/test/test_step_1/test_helpers.py | 27 +++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/bbot/core/helpers/helper.py b/bbot/core/helpers/helper.py index 0d2a1dbb6b..d1d64850ef 100644 --- a/bbot/core/helpers/helper.py +++ b/bbot/core/helpers/helper.py @@ -1,4 +1,6 @@ import os +import sys +import asyncio import logging from pathlib import Path import multiprocessing as mp @@ -83,7 +85,12 @@ def __init__(self, preset): # we spawn 1 fewer processes than cores # this helps to avoid locking up the system or competing with the main python process for cpu time num_processes = max(1, mp.cpu_count() - 1) - self.process_pool = ProcessPoolExecutor(max_workers=num_processes) + pool_kwargs = {"max_workers": num_processes} + # max_tasks_per_child replaces workers after N tasks, preventing memory leaks + # and reducing the chance of a degraded worker process causing hangs + if sys.version_info >= (3, 11): + pool_kwargs["max_tasks_per_child"] = 25 + self.process_pool = ProcessPoolExecutor(**pool_kwargs) self._cloud = None self._blasthttp_client = None @@ -237,17 +244,24 @@ def run_in_executor_cpu(self, callback, *args, **kwargs): callback = partial(callback, **kwargs) return self.loop.run_in_executor(self._cpu_executor, callback, *args) - def run_in_executor_mp(self, callback, *args, **kwargs): + async def run_in_executor_mp(self, callback, *args, **kwargs): """ - Same as run_in_executor_io() except with a process pool executor - Use only in cases where callback is CPU-bound + Same as run_in_executor_io() except with a process pool executor. + Use only in cases where callback is CPU-bound. + + Includes a timeout (default 300s) to prevent indefinite hangs if a + child process dies or the pool enters a broken state. + + Pass ``_timeout=seconds`` to override the default timeout. 
Examples: Execute callback: >>> result = await self.helpers.run_in_executor_mp(callback_fn, arg1, arg2) """ + timeout = kwargs.pop("_timeout", 300) callback = partial(callback, **kwargs) - return self.loop.run_in_executor(self.process_pool, callback, *args) + future = self.loop.run_in_executor(self.process_pool, callback, *args) + return await asyncio.wait_for(future, timeout=timeout) @property def in_tests(self): diff --git a/bbot/scanner/scanner.py b/bbot/scanner/scanner.py index 49c4bb9b56..b5a4712550 100644 --- a/bbot/scanner/scanner.py +++ b/bbot/scanner/scanner.py @@ -908,8 +908,8 @@ def _cancel_tasks(self): tasks.append(self._stop_task) self.helpers.cancel_tasks_sync(tasks) - # process pool - self.helpers.process_pool.shutdown(cancel_futures=True) + # process pool (wait=False so a stuck worker can't hang the cleanup) + self.helpers.process_pool.shutdown(wait=False, cancel_futures=True) self.debug("Finished cancelling all scan tasks") return tasks diff --git a/bbot/test/test_step_1/test_helpers.py b/bbot/test/test_step_1/test_helpers.py index 44a83d8394..a8094cd628 100644 --- a/bbot/test/test_step_1/test_helpers.py +++ b/bbot/test/test_step_1/test_helpers.py @@ -978,6 +978,7 @@ async def test_rm_temp_dir_at_exit(helpers): assert not temp_dir.exists() +<<<<<<< HEAD def test_simhash_similarity(helpers): """Test SimHash helper with increasingly different HTML pages.""" @@ -1146,3 +1147,29 @@ def test_clean_dns_record(): assert clean_dns_record("'d1jwhzvlef5tfb.example.com'") == "d1jwhzvlef5tfb.example.com" # quotes + trailing dot assert clean_dns_record('"d1jwhzvlef5tfb.example.com."') == "d1jwhzvlef5tfb.example.com" + + +# these must be top-level functions so they can be pickled for the subprocess +def _hang_forever(): + import time + + time.sleep(9999) + + +def _cpu_work(n): + return sum(range(n)) + + +@pytest.mark.asyncio +async def test_run_in_executor_mp(helpers): + # normal tasks should complete fine + result = await helpers.run_in_executor_mp(_cpu_work, 100_000) + assert result == sum(range(100_000)) + + # a hanging task should raise TimeoutError + with pytest.raises(asyncio.TimeoutError): + await helpers.run_in_executor_mp(_hang_forever, _timeout=2) + + # pool should still work after a timeout + result = await helpers.run_in_executor_mp(_cpu_work, 50_000, _timeout=30) + assert result == sum(range(50_000)) From d4e7a58e61a35f505a2ac48a084ee2a4db543196 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Wed, 25 Feb 2026 15:41:36 -0500 Subject: [PATCH 20/28] ruff check fixes --- bbot/test/test_step_2/module_tests/test_module_excavate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bbot/test/test_step_2/module_tests/test_module_excavate.py b/bbot/test/test_step_2/module_tests/test_module_excavate.py index 25d38644f5..e45cdbe646 100644 --- a/bbot/test/test_step_2/module_tests/test_module_excavate.py +++ b/bbot/test/test_step_2/module_tests/test_module_excavate.py @@ -1317,8 +1317,8 @@ def check(self, module_test, events): assert len(web_params) > 0, "WEB_PARAMETER for 'session' cookie was not emitted" for wp in web_params: assert wp.data["host"] != "127.0.0.1", ( - f"WEB_PARAMETER host should be 'localhost', not the resolved IP '127.0.0.1'. " - f"excavate._event_host() is using data['host'] (resolved IP) instead of event.host" + "WEB_PARAMETER host should be 'localhost', not the resolved IP '127.0.0.1'. 
" + "excavate._event_host() is using data['host'] (resolved IP) instead of event.host" ) assert wp.data["host"] == "localhost", f"WEB_PARAMETER host should be 'localhost', got '{wp.data['host']}'" From 036a187e003191d38c0790cff7668043bf0a833b Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Fri, 6 Mar 2026 14:16:40 -0500 Subject: [PATCH 21/28] Fix three test failures: dedup, validation, and wildcard defense - wayback: override _incoming_dedup_hash for URL events to prevent subdomain_enum's domain-based dedup from collapsing distinct URLs - wayback: fix FINDING confidence "MODERATE" -> "MEDIUM" (valid level) - wayback: use individual requests instead of request_batch for interesting file HEAD checks - subdomain_enum: revert is_target exemption from wildcard rejection --- bbot/modules/templates/subdomain_enum.py | 2 +- bbot/modules/wayback.py | 29 +++++++++++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/bbot/modules/templates/subdomain_enum.py b/bbot/modules/templates/subdomain_enum.py index ceb341118b..598d89ff5f 100644 --- a/bbot/modules/templates/subdomain_enum.py +++ b/bbot/modules/templates/subdomain_enum.py @@ -174,7 +174,7 @@ async def filter_event(self, event): # don't reject targets — if the user explicitly targeted a domain, always process it is_target = event in self.scan.target.whitelist # optionally reject events with wildcards / errors - if self.reject_wildcards and not is_target: + if self.reject_wildcards: if any(t in event.tags for t in ("a-error", "aaaa-error")): return False, "Event has a DNS resolution error" if self.reject_wildcards == "strict": diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 3ced55937d..71452c892d 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -97,6 +97,13 @@ async def setup(self): self._archive_bloom = self.helpers.bloom_filter(32000000) return await super().setup() + def _incoming_dedup_hash(self, event): + # URL events are handled differently (parameter/archive cache eviction), + # so they should not be deduplicated by the subdomain_enum strategy + if event.type == "URL": + return hash(event.data), "url_event" + return super()._incoming_dedup_hash(event) + async def handle_event(self, event): if event.type == "URL": await self._handle_url_event(event) @@ -177,11 +184,14 @@ async def _check_interesting_files(self, interesting_files, event): archive_url = f"{self.base_url}/web/{raw_url}" url_metadata[archive_url] = (cleaned_url, raw_url) - gen = self.helpers.request_batch( - list(url_metadata), method="HEAD", timeout=self.http_timeout + 30, follow_redirects=True - ) - async for archive_url, r in gen: - cleaned_url, raw_url = url_metadata[archive_url] + for archive_url, (cleaned_url, raw_url) in url_metadata.items(): + try: + r = await self.helpers.request( + archive_url, method="HEAD", timeout=self.http_timeout + 30, follow_redirects=True + ) + except Exception as e: + self.debug(f"Interesting file HEAD check error for {raw_url}: {e}") + continue if not r or r.status_code != 200: status = getattr(r, "status_code", "no response") if r else "no response" @@ -211,7 +221,14 @@ async def _check_interesting_files(self, interesting_files, event): self.verbose(f"Interesting archived file confirmed: {raw_url}") parsed = urlparse(raw_url) await self.emit_event( - {"description": desc, "url": str(r.url), "host": str(parsed.hostname or "")}, + { + "description": desc, + "severity": "LOW", + "name": "Interesting Archived File", + "confidence": "MEDIUM", + "url": 
str(r.url), + "host": str(parsed.hostname or ""), + }, "FINDING", event, tags=["from-wayback", "archived", "interesting-file"], From e72cc93a72f3c7155e66388a12ebbee5dc8c891e Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Sun, 15 Mar 2026 11:44:18 -0400 Subject: [PATCH 22/28] Speed up wayback archive fetching with HEAD pre-check and reactive rate limiting --- bbot/modules/wayback.py | 45 +++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 71452c892d..e13ddcb3e6 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -431,10 +431,8 @@ def _strip_wayback_wrapper(self, body): body = self._wayback_stale_ref_re.sub("", body) return body - # archive.org rate-limits aggressively; pace requests to avoid cascading ReadErrors _archive_per_request_retries = 3 _archive_batch_retries = 1 - _archive_delay = 0.5 # seconds between successful requests _archive_error_delay = 3 # initial backoff seconds after a failed request _archive_429_default_delay = 30 # default delay on 429 when no retry-after header @@ -496,11 +494,24 @@ async def _fetch_archive_batch(self, url_metadata, total, processed_offset): """ failed = [] succeeded = 0 + skipped = 0 processed = processed_offset for archive_url, (raw_url, parent_event) in url_metadata.items(): processed += 1 + # HEAD pre-check: resolve redirects cheaply to check for duplicates + # before downloading the full response body + resolved_url = await self._resolve_archive_url(archive_url, raw_url) + if resolved_url is not None and resolved_url in self._archive_bloom: + self.verbose(f"Skipping duplicate archive response for {raw_url} (resolved URL: {resolved_url})") + skipped += 1 + if processed % 50 == 0 or processed == total: + self.verbose( + f"Archive progress: {processed:,}/{total:,} ({succeeded:,} succeeded, {len(failed):,} failed, {skipped:,} skipped)" + ) + continue + r = await self._fetch_single_archive_url(archive_url, raw_url) if not r or r.status_code != 200: @@ -514,14 +525,36 @@ async def _fetch_archive_batch(self, url_metadata, total, processed_offset): if processed % 50 == 0 or processed == total: self.verbose( - f"Archive progress: {processed:,}/{total:,} ({succeeded:,} succeeded, {len(failed):,} failed)" + f"Archive progress: {processed:,}/{total:,} ({succeeded:,} succeeded, {len(failed):,} failed, {skipped:,} skipped)" ) - # pace requests to avoid triggering rate limits - await self.helpers.sleep(self._archive_delay) - return failed, succeeded, processed + async def _resolve_archive_url(self, archive_url, raw_url): + """HEAD request to resolve the final URL after redirects, for bloom filter pre-check. + + Returns the resolved URL string, or None if the HEAD request fails. 
+ """ + try: + r = await self.helpers.request( + archive_url, method="HEAD", timeout=self.http_timeout + 30, follow_redirects=True, raise_error=True + ) + except Exception as e: + self.debug(f"HEAD pre-check failed for {raw_url}: {e}") + return None + if r.status_code == 429: + retry_after = r.headers.get("retry-after", "") + try: + delay = min(int(retry_after), 120) + except (ValueError, TypeError): + delay = self._archive_429_default_delay + self.verbose(f"Archive.org rate limit (429) during HEAD pre-check for {raw_url}, sleeping {delay}s") + await self.helpers.sleep(delay) + return None + if r.status_code == 200: + return str(r.url) + return None + async def _fetch_single_archive_url(self, archive_url, raw_url): """Fetch a single archive URL with per-request retry, 429 handling, and backoff. From cb53d7863017fe10a56f5cbd90cc02b5b140e14a Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Mon, 23 Mar 2026 18:06:56 -0400 Subject: [PATCH 23/28] Fix pytest hanging after test_run_in_executor_mp The _hang_forever worker process outlives the test and blocks Python's threading._shutdown via the ProcessPoolExecutor management thread. Terminate stuck workers after the test and add a safety net in pytest_sessionfinish. --- bbot/test/conftest.py | 10 ++++++ bbot/test/test_step_1/test_helpers.py | 45 ++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/bbot/test/conftest.py b/bbot/test/conftest.py index 216667d36d..4ea07bbcff 100644 --- a/bbot/test/conftest.py +++ b/bbot/test/conftest.py @@ -367,6 +367,16 @@ def pytest_sessionfinish(session, exitstatus): for handler in handlers: logger.removeHandler(handler) + # Kill any orphaned ProcessPoolExecutor workers that could block exit + import multiprocessing + + for child in multiprocessing.active_children(): + if child.is_alive(): + child.terminate() + child.join(timeout=5) + if child.is_alive(): + child.kill() + # Wipe out BBOT home dir shutil.rmtree("/tmp/.bbot_test", ignore_errors=True) diff --git a/bbot/test/test_step_1/test_helpers.py b/bbot/test/test_step_1/test_helpers.py index a8094cd628..8ca6e91223 100644 --- a/bbot/test/test_step_1/test_helpers.py +++ b/bbot/test/test_step_1/test_helpers.py @@ -1,6 +1,7 @@ import asyncio import datetime import ipaddress +from concurrent.futures import ProcessPoolExecutor from ..bbot_fixtures import * @@ -978,7 +979,49 @@ async def test_rm_temp_dir_at_exit(helpers): assert not temp_dir.exists() -<<<<<<< HEAD + +# these must be top-level functions so they can be pickled for the subprocess +def _hang_forever(): + import time + + time.sleep(9999) + + +def _cpu_work(n): + return sum(range(n)) + + +@pytest.mark.asyncio +async def test_run_in_executor_mp(helpers): + # normal tasks should complete fine + result = await helpers.run_in_executor_mp(_cpu_work, 100_000) + assert result == sum(range(100_000)) + + # a hanging task should raise TimeoutError + with pytest.raises(asyncio.TimeoutError): + await helpers.run_in_executor_mp(_hang_forever, _timeout=2) + + # pool should still work after a timeout + result = await helpers.run_in_executor_mp(_cpu_work, 50_000, _timeout=30) + assert result == sum(range(50_000)) + + # kill the stuck worker so it doesn't prevent pytest from exiting + pool = helpers.process_pool + for proc in list(pool._processes.values()): + if proc.is_alive(): + proc.terminate() + proc.join(timeout=5) + pool.shutdown(wait=False, cancel_futures=True) + # replace the pool so subsequent tests aren't affected + import sys + import multiprocessing as 
mp + + pool_kwargs = {"max_workers": max(1, mp.cpu_count() - 1)} + if sys.version_info >= (3, 11): + pool_kwargs["max_tasks_per_child"] = 25 + helpers.process_pool = ProcessPoolExecutor(**pool_kwargs) + + def test_simhash_similarity(helpers): """Test SimHash helper with increasingly different HTML pages.""" From 86e4ae9a48769c2296b0a59968ca85c97c56cbf6 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Wed, 25 Mar 2026 09:07:48 -0400 Subject: [PATCH 24/28] Kill stuck process pool workers on timeout instead of leaking them Previously, asyncio.wait_for() only cancelled the awaiting coroutine but left the child process running indefinitely. On a 4-core machine, just 4 stuck workers would permanently stall the scan. Now on timeout we terminate all workers, replace the pool, and continue cleanly. --- bbot/core/helpers/helper.py | 63 +++++++++++++++++++++------ bbot/scanner/scanner.py | 13 +++++- bbot/test/test_step_1/test_helpers.py | 21 +-------- 3 files changed, 62 insertions(+), 35 deletions(-) diff --git a/bbot/core/helpers/helper.py b/bbot/core/helpers/helper.py index d1d64850ef..925b736516 100644 --- a/bbot/core/helpers/helper.py +++ b/bbot/core/helpers/helper.py @@ -77,20 +77,12 @@ def __init__(self, preset): self._loop = None - # multiprocessing thread pool + # multiprocessing process pool start_method = mp.get_start_method() if start_method != "spawn": self.warning(f"Multiprocessing spawn method is set to {start_method}.") - - # we spawn 1 fewer processes than cores - # this helps to avoid locking up the system or competing with the main python process for cpu time - num_processes = max(1, mp.cpu_count() - 1) - pool_kwargs = {"max_workers": num_processes} - # max_tasks_per_child replaces workers after N tasks, preventing memory leaks - # and reducing the chance of a degraded worker process causing hangs - if sys.version_info >= (3, 11): - pool_kwargs["max_tasks_per_child"] = 25 - self.process_pool = ProcessPoolExecutor(**pool_kwargs) + self.process_pool = self._create_process_pool() + self._pool_reset_lock = asyncio.Lock() self._cloud = None self._blasthttp_client = None @@ -221,6 +213,18 @@ def loop(self): self._loop.set_default_executor(self._io_executor) return self._loop + @staticmethod + def _create_process_pool(): + # we spawn 1 fewer processes than cores + # this helps to avoid locking up the system or competing with the main python process for cpu time + num_processes = max(1, mp.cpu_count() - 1) + pool_kwargs = {"max_workers": num_processes} + # max_tasks_per_child replaces workers after N tasks, preventing memory leaks + # and reducing the chance of a degraded worker process causing hangs + if sys.version_info >= (3, 11): + pool_kwargs["max_tasks_per_child"] = 25 + return ProcessPoolExecutor(**pool_kwargs) + def run_in_executor_io(self, callback, *args, **kwargs): """ Run a synchronous task in the event loop's default thread pool executor @@ -249,8 +253,8 @@ async def run_in_executor_mp(self, callback, *args, **kwargs): Same as run_in_executor_io() except with a process pool executor. Use only in cases where callback is CPU-bound. - Includes a timeout (default 300s) to prevent indefinite hangs if a - child process dies or the pool enters a broken state. + Includes a timeout (default 300s) to prevent indefinite hangs if a child process dies or the pool enters a broken state. + On timeout, the entire pool is terminated and replaced so that stuck workers cannot accumulate and starve the scan. Pass ``_timeout=seconds`` to override the default timeout. 
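The following is a standalone sketch, not part of the patch, illustrating the failure mode this commit message describes. It uses only the standard library and shows that asyncio.wait_for() cancels the awaiting future while the worker process itself keeps running, which is why the hunk that follows terminates and replaces the whole pool rather than relying on the timeout alone.

    import time
    import asyncio
    from concurrent.futures import ProcessPoolExecutor


    def _stuck():
        # stands in for a CPU-bound callback that never returns
        time.sleep(9999)


    async def main():
        pool = ProcessPoolExecutor(max_workers=1)
        loop = asyncio.get_running_loop()
        try:
            await asyncio.wait_for(loop.run_in_executor(pool, _stuck), timeout=1)
        except asyncio.TimeoutError:
            pass
        # snapshot the workers (the same private attribute the patch inspects)
        workers = list(pool._processes.values())
        print([p.is_alive() for p in workers])  # [True]: the worker outlived the timeout
        pool.shutdown(wait=False, cancel_futures=True)
        for p in workers:
            p.terminate()  # without this, the orphaned worker lingers and holds its pool slot


    if __name__ == "__main__":
        asyncio.run(main())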
@@ -261,7 +265,38 @@ async def run_in_executor_mp(self, callback, *args, **kwargs): timeout = kwargs.pop("_timeout", 300) callback = partial(callback, **kwargs) future = self.loop.run_in_executor(self.process_pool, callback, *args) - return await asyncio.wait_for(future, timeout=timeout) + try: + return await asyncio.wait_for(future, timeout=timeout) + except asyncio.TimeoutError: + log.warning(f"Process pool task timed out after {timeout}s, killing stuck workers and replacing pool") + await self._reset_process_pool() + raise + + async def _reset_process_pool(self): + """Terminate all workers in the current process pool and replace it. + + This is the nuclear option — every in-flight task on the old pool will fail with BrokenProcessPool. + We accept that trade-off because a timeout means something is genuinely broken, and leaving the stuck worker alive would permanently consume a pool slot. + + # TODO: Python 3.14 adds ProcessPoolExecutor.terminate_workers() + # and kill_workers() (https://github.com/python/cpython/pull/130849). + # Once we drop 3.13 support we can replace the _processes access + # with those official methods. + """ + async with self._pool_reset_lock: + old_pool = self.process_pool + self.process_pool = self._create_process_pool() + # snapshot workers before shutdown (shutdown sets _processes = None) + workers = list((old_pool._processes or {}).values()) + # terminate workers before shutdown so stuck ones don't block + for proc in workers: + if proc.is_alive(): + proc.terminate() + old_pool.shutdown(wait=False, cancel_futures=True) + # escalate to SIGKILL for anything that ignored SIGTERM + for proc in workers: + if proc.is_alive(): + proc.kill() @property def in_tests(self): diff --git a/bbot/scanner/scanner.py b/bbot/scanner/scanner.py index b5a4712550..0900d615fa 100644 --- a/bbot/scanner/scanner.py +++ b/bbot/scanner/scanner.py @@ -908,8 +908,17 @@ def _cancel_tasks(self): tasks.append(self._stop_task) self.helpers.cancel_tasks_sync(tasks) - # process pool (wait=False so a stuck worker can't hang the cleanup) - self.helpers.process_pool.shutdown(wait=False, cancel_futures=True) + # kill all pool workers and shut down (same logic as _reset_process_pool + # but synchronous, since we're tearing down the scan) + pool = self.helpers.process_pool + workers = list((pool._processes or {}).values()) + for proc in workers: + if proc.is_alive(): + proc.terminate() + pool.shutdown(wait=False, cancel_futures=True) + for proc in workers: + if proc.is_alive(): + proc.kill() self.debug("Finished cancelling all scan tasks") return tasks diff --git a/bbot/test/test_step_1/test_helpers.py b/bbot/test/test_step_1/test_helpers.py index 8ca6e91223..fdc35b383b 100644 --- a/bbot/test/test_step_1/test_helpers.py +++ b/bbot/test/test_step_1/test_helpers.py @@ -1,7 +1,6 @@ import asyncio import datetime import ipaddress -from concurrent.futures import ProcessPoolExecutor from ..bbot_fixtures import * @@ -997,30 +996,14 @@ async def test_run_in_executor_mp(helpers): result = await helpers.run_in_executor_mp(_cpu_work, 100_000) assert result == sum(range(100_000)) - # a hanging task should raise TimeoutError + # a hanging task should raise TimeoutError and auto-replace the pool with pytest.raises(asyncio.TimeoutError): await helpers.run_in_executor_mp(_hang_forever, _timeout=2) - # pool should still work after a timeout + # pool should still work after a timeout (was replaced by _reset_process_pool) result = await helpers.run_in_executor_mp(_cpu_work, 50_000, _timeout=30) assert result == 
sum(range(50_000)) - # kill the stuck worker so it doesn't prevent pytest from exiting - pool = helpers.process_pool - for proc in list(pool._processes.values()): - if proc.is_alive(): - proc.terminate() - proc.join(timeout=5) - pool.shutdown(wait=False, cancel_futures=True) - # replace the pool so subsequent tests aren't affected - import sys - import multiprocessing as mp - - pool_kwargs = {"max_workers": max(1, mp.cpu_count() - 1)} - if sys.version_info >= (3, 11): - pool_kwargs["max_tasks_per_child"] = 25 - helpers.process_pool = ProcessPoolExecutor(**pool_kwargs) - def test_simhash_similarity(helpers): """Test SimHash helper with increasingly different HTML pages.""" From 5e917fe437d57908e7f51898d8933af8554649fe Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Thu, 26 Mar 2026 15:30:16 -0400 Subject: [PATCH 25/28] Fix wayback module for URL DictHostEvent migration, rename wayback-intense to wayback-heavy URL_UNVERIFIED/URL events changed from string to dict data in 3.0 merge. Fix event.data -> event.url for hash, clean_url, and test assertions. Add filter_event override to skip subdomain_enum filtering for URL events. --- bbot/modules/wayback.py | 12 ++++++-- ...{wayback-intense.yml => wayback-heavy.yml} | 0 .../module_tests/test_module_wayback.py | 28 +++++++++---------- docs/modules/wayback.md | 6 ++-- 4 files changed, 26 insertions(+), 20 deletions(-) rename bbot/presets/{wayback-intense.yml => wayback-heavy.yml} (100%) diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index e13ddcb3e6..6ec3f9238b 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -101,9 +101,15 @@ def _incoming_dedup_hash(self, event): # URL events are handled differently (parameter/archive cache eviction), # so they should not be deduplicated by the subdomain_enum strategy if event.type == "URL": - return hash(event.data), "url_event" + return hash(event.url), "url_event" return super()._incoming_dedup_hash(event) + async def filter_event(self, event): + # URL events are handled separately and don't need subdomain_enum's wildcard/cloud filtering + if event.type == "URL": + return True + return await super().filter_event(event) + async def handle_event(self, event): if event.type == "URL": await self._handle_url_event(event) @@ -148,11 +154,11 @@ async def _handle_url_event(self, event): break # only 2xx counts as live — 3xx (e.g. 
http→https 301 to a 404) doesn't confirm the page exists if 200 <= status_code < 300: - cleaned = clean_url(event.data).geturl() + cleaned = clean_url(event.url).geturl() if self._archive_cache.pop(cleaned, None) is not None: self.verbose(f"URL is live (status {status_code}), removed from archive cache: {cleaned}") - cached = self._parameter_cache.pop(clean_url(event.data).geturl(), None) + cached = self._parameter_cache.pop(clean_url(event.url).geturl(), None) if cached is not None: flat_params, base_url = cached for param_name, original_value in flat_params.items(): diff --git a/bbot/presets/wayback-intense.yml b/bbot/presets/wayback-heavy.yml similarity index 100% rename from bbot/presets/wayback-intense.yml rename to bbot/presets/wayback-heavy.yml diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index 9bbd2b279e..0093dac879 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -37,7 +37,7 @@ async def setup_after_prep(self, module_test): module_test.set_expect_requests(expect_args={"uri": "/page"}, respond_args={"response_data": "alive"}) def check(self, module_test, events): - assert any(e.type == "URL_UNVERIFIED" and "127.0.0.1" in e.data and "/page" in e.data for e in events), ( + assert any(e.type == "URL_UNVERIFIED" and "127.0.0.1" in e.url and "/page" in e.url for e in events), ( "Failed to emit URL_UNVERIFIED" ) assert any( @@ -119,7 +119,7 @@ async def setup_after_prep(self, module_test): def check(self, module_test, events): # the dead URL (port 1) should NOT be verified as live - assert not any(e.type == "URL" and "deadpage" in e.data for e in events) + assert not any(e.type == "URL" and "deadpage" in e.url for e in events) # badsecrets should have found the vulnerability in the archived viewstate assert any(e.type == "VULNERABILITY" and "Known Secret Found." 
in e.data["description"] for e in events), ( "Failed to detect badsecrets vulnerability from archived content" @@ -158,12 +158,12 @@ async def setup_after_prep(self, module_test): ) def check(self, module_test, events): - url_unverified = [e for e in events if e.type == "URL_UNVERIFIED" and "/page" in e.data] + url_unverified = [e for e in events if e.type == "URL_UNVERIFIED" and "/page" in e.url] # should have only one, the https version assert len(url_unverified) == 1, ( - f"Expected 1 URL_UNVERIFIED, got {len(url_unverified)}: {[e.data for e in url_unverified]}" + f"Expected 1 URL_UNVERIFIED, got {len(url_unverified)}: {[e.url for e in url_unverified]}" ) - assert url_unverified[0].data.startswith("https://"), f"Expected https URL, got: {url_unverified[0].data}" + assert url_unverified[0].url.startswith("https://"), f"Expected https URL, got: {url_unverified[0].url}" class TestWaybackHttpOnlyKept(ModuleTestBase): @@ -184,10 +184,10 @@ async def setup_after_prep(self, module_test): ) def check(self, module_test, events): - url_unverified = [e for e in events if e.type == "URL_UNVERIFIED" and "/old-http-only" in e.data] + url_unverified = [e for e in events if e.type == "URL_UNVERIFIED" and "/old-http-only" in e.url] assert len(url_unverified) == 1, f"Expected 1 URL_UNVERIFIED, got {len(url_unverified)}" - assert url_unverified[0].data.startswith("http://"), ( - f"Expected http URL when no https exists, got: {url_unverified[0].data}" + assert url_unverified[0].url.startswith("http://"), ( + f"Expected http URL when no https exists, got: {url_unverified[0].url}" ) @@ -211,11 +211,11 @@ async def setup_after_prep(self, module_test): def check(self, module_test, events): # cdn-cgi URL should be filtered - assert not any(e.type == "URL_UNVERIFIED" and "cdn-cgi" in e.data for e in events), ( + assert not any(e.type == "URL_UNVERIFIED" and "cdn-cgi" in e.url for e in events), ( "cdn-cgi URL should have been filtered" ) # real page should still be emitted - assert any(e.type == "URL_UNVERIFIED" and "real-page" in e.data for e in events), ( + assert any(e.type == "URL_UNVERIFIED" and "real-page" in e.url for e in events), ( "Non-cdn-cgi URL should have been emitted" ) @@ -544,11 +544,11 @@ async def setup_after_prep(self, module_test): def check(self, module_test, events): # garbage URL should be filtered - assert not any(e.type == "URL_UNVERIFIED" and "get-materials" in e.data for e in events), ( + assert not any(e.type == "URL_UNVERIFIED" and "get-materials" in e.url for e in events), ( "Crawler-trap URL with repeating path segments should have been filtered" ) # real page should still be emitted - assert any(e.type == "URL_UNVERIFIED" and "real-page" in e.data for e in events), ( + assert any(e.type == "URL_UNVERIFIED" and "real-page" in e.url for e in events), ( "Non-garbage URL should have been emitted" ) @@ -575,11 +575,11 @@ async def setup_after_prep(self, module_test): def check(self, module_test, events): # long URL should be filtered - assert not any(e.type == "URL_UNVERIFIED" and "aaaa" in e.data for e in events), ( + assert not any(e.type == "URL_UNVERIFIED" and "aaaa" in e.url for e in events), ( "Excessively long URL should have been filtered" ) # normal page should still be emitted - assert any(e.type == "URL_UNVERIFIED" and "normal-page" in e.data for e in events), ( + assert any(e.type == "URL_UNVERIFIED" and "normal-page" in e.url for e in events), ( "Normal-length URL should have been emitted" ) diff --git a/docs/modules/wayback.md b/docs/modules/wayback.md index 
dab08340db..a3f30f3e23 100644 --- a/docs/modules/wayback.md +++ b/docs/modules/wayback.md @@ -83,12 +83,12 @@ Basic URL discovery mode. Includes `subdomain-enum` and enables `urls: True`. Go bbot -p wayback -t evilcorp.com ``` -### `-p wayback-intense` +### `-p wayback-heavy` Full-featured mode with URL discovery, parameter extraction, and archive retrieval. Also includes `badsecrets` to scan archived content for exposed secrets. ```bash -bbot -p wayback-intense -t evilcorp.com +bbot -p wayback-heavy -t evilcorp.com ``` ### Integration with other presets @@ -117,7 +117,7 @@ bbot -p wayback -t evilcorp.com ```bash # Full wayback integration with archived content and parameter extraction -bbot -p wayback-intense -t evilcorp.com +bbot -p wayback-heavy -t evilcorp.com ``` ```bash From 9b3b3f443d220ec13a89dd618e13909755ff9ec6 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Mon, 30 Mar 2026 14:24:12 -0400 Subject: [PATCH 26/28] Replace archive_url inheritance with tag-based parent traversal Stop auto-copying archive_url from parent to child event data dicts, which could infect live HTTP responses downstream. Instead, use the from-wayback tag as a signal and traverse upward to find the nearest archive_url when needed (via new event.archive_url property). --- bbot/core/event/base.py | 33 +++++++++++-------- .../module_tests/test_module_wayback.py | 22 ++++++++----- 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py index 3232e6d55c..efb08a424a 100644 --- a/bbot/core/event/base.py +++ b/bbot/core/event/base.py @@ -649,6 +649,26 @@ def parent_uuid(self): return parent_uuid return self._parent_uuid + @property + def archive_url(self): + """Traverse the parent chain to find the nearest archive_url. + + The 'from-wayback' tag signals that this event descends from archived content. + The actual archive URL is stored only in the data dict of the originating + wayback HTTP_RESPONSE; this property walks upward to find it. + """ + if "from-wayback" not in self.tags: + return None + event = self + while event is not None: + if isinstance(event.data, dict) and "archive_url" in event.data: + return event.data["archive_url"] + parent = getattr(event, "parent", None) + if parent is None or parent is event: + break + event = parent + return None + @property def validators(self): """ @@ -1069,19 +1089,6 @@ def _data_load(self, data): class DictHostEvent(DictEvent): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # inherit archive_url from parent for provenance tracking (e.g. 
wayback archived content) - if isinstance(self.data, dict) and "archive_url" not in self.data: - parent = self.parent - if ( - parent is not None - and parent is not self - and isinstance(parent.data, dict) - and "archive_url" in parent.data - ): - self.data["archive_url"] = parent.data["archive_url"] - def _host(self): if isinstance(self.data, dict) and "host" in self.data: return make_ip_type(self.data["host"]) diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index 0093dac879..e9bab6b575 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -306,21 +306,27 @@ def check(self, module_test, events): assert "web.archive.org" not in finding_url, ( f"Hunt FINDING url should NOT contain web.archive.org, got: {finding_url}" ) - # archive_url should propagate from HTTP_RESPONSE → WEB_PARAMETER → FINDING - assert "archive_url" in finding.data, ( - f"Hunt FINDING should have archive_url for provenance, got: {finding.data}" + # from-wayback tag should propagate; archive_url is reachable via parent traversal + assert "from-wayback" in finding.tags, ( + f"Hunt FINDING should have from-wayback tag, got tags: {finding.tags}" ) - assert "web.archive.org" in finding.data["archive_url"], ( - f"Hunt FINDING archive_url should be archive.org URL, got: {finding.data['archive_url']}" + assert finding.archive_url is not None, ( + "Hunt FINDING should be able to reach archive_url via parent traversal" + ) + assert "web.archive.org" in finding.archive_url, ( + f"Hunt FINDING archive_url should be archive.org URL, got: {finding.archive_url}" ) - # WEB_PARAMETERs from archived content should also have archive_url + # WEB_PARAMETERs from archived content should have from-wayback tag and reachable archive_url archived_params = [ e for e in events if e.type == "WEB_PARAMETER" and "redirect" in e.data.get("name", "").lower() ] for param in archived_params: - assert "archive_url" in param.data, ( - f"WEB_PARAMETER from archived content should have archive_url, got: {param.data}" + assert "from-wayback" in param.tags, ( + f"WEB_PARAMETER from archived content should have from-wayback tag, got tags: {param.tags}" + ) + assert param.archive_url is not None, ( + "WEB_PARAMETER from archived content should reach archive_url via parent traversal" ) # web.archive.org should never appear as a DNS_NAME From bce4b865a8241a637e3dbe78c6b094c284de9d0c Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Fri, 3 Apr 2026 15:19:22 -0400 Subject: [PATCH 27/28] =?UTF-8?q?Fix=20tests=20for=20blasthttp=20migration?= =?UTF-8?q?:=20httpx=5Fmock=E2=86=92blasthttp=5Fmock,=20whitelist=E2=86=92?= =?UTF-8?q?targets,=20VULNERABILITY=E2=86=92FINDING,=20httpx=E2=86=92http?= =?UTF-8?q?=20module=20refs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bbot/modules/templates/subdomain_enum.py | 2 - bbot/test/test_step_1/test_helpers.py | 27 ----------- .../module_tests/test_module_excavate.py | 2 +- .../module_tests/test_module_wayback.py | 48 +++++++++---------- 4 files changed, 25 insertions(+), 54 deletions(-) diff --git a/bbot/modules/templates/subdomain_enum.py b/bbot/modules/templates/subdomain_enum.py index 598d89ff5f..3bdcdff07b 100644 --- a/bbot/modules/templates/subdomain_enum.py +++ b/bbot/modules/templates/subdomain_enum.py @@ -171,8 +171,6 @@ async def filter_event(self, event): # reject if it's a cloud 
resource and not in our target (unless it's a seed event) if is_cloud and not self.scan.in_target(event) and "seed" not in event.tags: return False, "Event is a cloud resource and not a direct target" - # don't reject targets — if the user explicitly targeted a domain, always process it - is_target = event in self.scan.target.whitelist # optionally reject events with wildcards / errors if self.reject_wildcards: if any(t in event.tags for t in ("a-error", "aaaa-error")): diff --git a/bbot/test/test_step_1/test_helpers.py b/bbot/test/test_step_1/test_helpers.py index fdc35b383b..783dac9b38 100644 --- a/bbot/test/test_step_1/test_helpers.py +++ b/bbot/test/test_step_1/test_helpers.py @@ -978,7 +978,6 @@ async def test_rm_temp_dir_at_exit(helpers): assert not temp_dir.exists() - # these must be top-level functions so they can be pickled for the subprocess def _hang_forever(): import time @@ -1173,29 +1172,3 @@ def test_clean_dns_record(): assert clean_dns_record("'d1jwhzvlef5tfb.example.com'") == "d1jwhzvlef5tfb.example.com" # quotes + trailing dot assert clean_dns_record('"d1jwhzvlef5tfb.example.com."') == "d1jwhzvlef5tfb.example.com" - - -# these must be top-level functions so they can be pickled for the subprocess -def _hang_forever(): - import time - - time.sleep(9999) - - -def _cpu_work(n): - return sum(range(n)) - - -@pytest.mark.asyncio -async def test_run_in_executor_mp(helpers): - # normal tasks should complete fine - result = await helpers.run_in_executor_mp(_cpu_work, 100_000) - assert result == sum(range(100_000)) - - # a hanging task should raise TimeoutError - with pytest.raises(asyncio.TimeoutError): - await helpers.run_in_executor_mp(_hang_forever, _timeout=2) - - # pool should still work after a timeout - result = await helpers.run_in_executor_mp(_cpu_work, 50_000, _timeout=30) - assert result == sum(range(50_000)) diff --git a/bbot/test/test_step_2/module_tests/test_module_excavate.py b/bbot/test/test_step_2/module_tests/test_module_excavate.py index e45cdbe646..6bb4d60001 100644 --- a/bbot/test/test_step_2/module_tests/test_module_excavate.py +++ b/bbot/test/test_step_2/module_tests/test_module_excavate.py @@ -1301,7 +1301,7 @@ class TestExcavate_webparameter_ip_host(ModuleTestBase): """ targets = ["http://localhost:8888"] - modules_overrides = ["httpx", "excavate", "hunt"] + modules_overrides = ["http", "excavate", "hunt"] config_overrides = {"interactsh_disable": True} async def setup_after_prep(self, module_test): diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index e9bab6b575..150d7de339 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -21,8 +21,8 @@ def check(self, module_test, events): class TestWaybackParameters(ModuleTestBase): module_name = "wayback" + targets = ["blacklanternsecurity.com", "127.0.0.1"] modules_overrides = ["wayback", "hunt"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = {"modules": {"wayback": {"urls": True, "parameters": True}}} async def setup_after_prep(self, module_test): @@ -61,7 +61,7 @@ def check(self, module_test, events): class TestWaybackInterestingFiles(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): 
@@ -89,7 +89,7 @@ def check(self, module_test, events): class TestWaybackArchive(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback", "badsecrets", "excavate"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} sample_viewstate = """<html> @@ -121,17 +121,17 @@ def check(self, module_test, events): # the dead URL (port 1) should NOT be verified as live assert not any(e.type == "URL" and "deadpage" in e.url for e in events) # badsecrets should have found the vulnerability in the archived viewstate - assert any(e.type == "VULNERABILITY" and "Known Secret Found." in e.data["description"] for e in events), ( + assert any(e.type == "FINDING" and "Known Secret Found." in e.data["description"] for e in events), ( "Failed to detect badsecrets vulnerability from archived content" ) # the vulnerability should reference the original URL, with "from-wayback" tag for provenance for e in events: - if e.type == "VULNERABILITY" and "Known Secret Found." in e.data["description"]: + if e.type == "FINDING" and "Known Secret Found." in e.data["description"]: assert "127.0.0.1" in e.data["url"], ( - f"VULNERABILITY url should contain the original host, got: {e.data['url']}" + f"FINDING url should contain the original host, got: {e.data['url']}" ) assert "web.archive.org" not in e.data["url"], ( - f"VULNERABILITY url should NOT be an archive.org URL, got: {e.data['url']}" + f"FINDING url should NOT be an archive.org URL, got: {e.data['url']}" ) # web.archive.org should NOT appear as a DNS_NAME event assert not any(e.type == "DNS_NAME" and e.data == "web.archive.org" for e in events), ( @@ -144,7 +144,7 @@ class TestWaybackHttpHttpsDedup(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com"] + targets = ["blacklanternsecurity.com"] config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): @@ -171,7 +171,7 @@ class TestWaybackHttpOnlyKept(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com"] + targets = ["blacklanternsecurity.com"] config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): @@ -196,7 +196,7 @@ class TestWaybackCdnCgiBlacklist(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com"] + targets = ["blacklanternsecurity.com"] config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): @@ -225,7 +225,7 @@ class TestWaybackArchiveHostField(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback", "excavate"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} async def setup_after_prep(self, module_test): @@ -268,7 +268,7 @@ class TestWaybackArchiveHuntFinding(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback", "excavate", "hunt"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} async def setup_after_prep(self, module_test): @@ -340,8 +340,8 @@ class TestWaybackLightfuzzXSS(ModuleTestBase): module_name = "wayback" targets = 
["blacklanternsecurity.com"] - modules_overrides = ["wayback", "httpx", "lightfuzz", "excavate"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + modules_overrides = ["wayback", "http", "lightfuzz", "excavate"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = { "interactsh_disable": True, "modules": { @@ -454,7 +454,7 @@ class TestWaybackArchiveBloomDedup(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} async def setup_after_prep(self, module_test): @@ -499,7 +499,7 @@ class TestWaybackArchiveRetry(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} async def setup_after_prep(self, module_test): @@ -531,7 +531,7 @@ class TestWaybackGarbageUrlFilter(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com"] + targets = ["blacklanternsecurity.com"] config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): @@ -539,7 +539,7 @@ async def setup_after_prep(self, module_test): repeating = "/themes/sites/example.com".lstrip("/") garbage_path = "/get-materials/" + "/".join([repeating] * 20) garbage_url = f"https://blacklanternsecurity.com{garbage_path}" - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[ ["original"], @@ -564,13 +564,13 @@ class TestWaybackGarbageUrlLength(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com"] + targets = ["blacklanternsecurity.com"] config_overrides = {"modules": {"wayback": {"urls": True}}} async def setup_after_prep(self, module_test): # URL exceeding 2000 character limit long_url = "https://blacklanternsecurity.com/" + "a" * 2000 - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[ ["original"], @@ -595,7 +595,7 @@ class TestWaybackArchive429Retry(ModuleTestBase): module_name = "wayback" modules_overrides = ["wayback"] - whitelist = ["blacklanternsecurity.com", "127.0.0.1"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} async def setup_after_prep(self, module_test): @@ -603,18 +603,18 @@ async def setup_after_prep(self, module_test): module_test.scan.modules["wayback"]._archive_429_default_delay = 0.01 module_test.scan.modules["wayback"]._archive_error_delay = 0.01 module_test.scan.modules["wayback"]._archive_delay = 0 - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( 
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://127.0.0.1:1/rate-limited-page"]], ) # first attempt: 429 rate limited - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/rate-limited-page", status_code=429, headers={"Retry-After": "1"}, ) # retry after backoff: 200 - module_test.httpx_mock.add_response( + module_test.blasthttp_mock.add_response( url="http://web.archive.org/web/http://127.0.0.1:1/rate-limited-page", text="<html><body>content after rate limit</body></html>", headers={"Content-Type": "text/html"}, From a23a3122b50b02a590148c5002d5e82f582f82a9 Mon Sep 17 00:00:00 2001 From: liquidsec <paul.mueller08@gmail.com> Date: Wed, 15 Apr 2026 23:47:34 -0400 Subject: [PATCH 28/28] Skip non-HTTP archived URLs in wayback, truncate sanitization warnings Filter ftp:// and other non-HTTP URLs in _pre_process_urls() before they enter the archive cache. Truncate event data in ValidationError messages to 200 chars to prevent terminal flooding. --- bbot/core/event/base.py | 6 ++++-- bbot/modules/wayback.py | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py index 7be86ab4f1..b7edb150bf 100644 --- a/bbot/core/event/base.py +++ b/bbot/core/event/base.py @@ -258,7 +258,8 @@ def __init__( self.data = self._sanitize_data(data) except Exception as e: log.trace(traceback.format_exc()) - raise ValidationError(f'Error sanitizing event data "{data}" for type "{self.type}": {e}') + data_preview = str(data)[:200] + "..." if len(str(data)) > 200 else str(data) + raise ValidationError(f'Error sanitizing event data "{data_preview}" for type "{self.type}": {e}') if not self.data: raise ValidationError(f'Invalid event data "{data}" for type "{self.type}"') @@ -2206,7 +2207,8 @@ def make_event( data = validators.validate_host(data) except Exception as e: log.trace(traceback.format_exc()) - raise ValidationError(f'Error sanitizing event data "{data}" for type "{event_type}": {e}') + data_preview = str(data)[:200] + "..." if len(str(data)) > 200 else str(data) + raise ValidationError(f'Error sanitizing event data "{data_preview}" for type "{event_type}": {e}') data_is_ip = is_ip(data) if event_type == "DNS_NAME" and data_is_ip: event_type = "IP_ADDRESS" diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 6ec3f9238b..5f2338b8af 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -315,6 +315,9 @@ def _pre_process_urls(self, urls): continue if not (parsed.hostname and self.scan.in_scope(parsed.hostname)): continue + # skip non-HTTP URLs (e.g. ftp:// archived by the Wayback Machine) + if parsed.scheme not in ("http", "https"): + continue cleaned_str = clean_url(url).geturl()