diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py index 4ffbe7dc40..b7edb150bf 100644 --- a/bbot/core/event/base.py +++ b/bbot/core/event/base.py @@ -258,7 +258,8 @@ def __init__( self.data = self._sanitize_data(data) except Exception as e: log.trace(traceback.format_exc()) - raise ValidationError(f'Error sanitizing event data "{data}" for type "{self.type}": {e}') + data_preview = str(data)[:200] + "..." if len(str(data)) > 200 else str(data) + raise ValidationError(f'Error sanitizing event data "{data_preview}" for type "{self.type}": {e}') if not self.data: raise ValidationError(f'Invalid event data "{data}" for type "{self.type}"') @@ -626,7 +627,7 @@ def parent(self, parent): self.web_spider_distance = getattr(parent, "web_spider_distance", 0) event_has_url = getattr(self, "parsed_url", None) is not None for t in parent.tags: - if t in ("affiliate",): + if t in ("affiliate", "from-wayback"): self.add_tag(t) elif t.startswith("mutation-"): self.add_tag(t) @@ -655,6 +656,26 @@ def parent_uuid(self): return parent_uuid return self._parent_uuid + @property + def archive_url(self): + """Traverse the parent chain to find the nearest archive_url. + + The 'from-wayback' tag signals that this event descends from archived content. + The actual archive URL is stored only in the data dict of the originating + wayback HTTP_RESPONSE; this property walks upward to find it. 
+ """ + if "from-wayback" not in self.tags: + return None + event = self + while event is not None: + if isinstance(event.data, dict) and "archive_url" in event.data: + return event.data["archive_url"] + parent = getattr(event, "parent", None) + if parent is None or parent is event: + break + event = parent + return None + @property def validators(self): """ @@ -1783,6 +1804,7 @@ class _data_validator(BaseModel): full_url: Optional[str] = None path: Optional[str] = None cves: Optional[list[str]] = None + archive_url: Optional[str] = None _validate_url = field_validator("url")(validators.validate_url) _validate_host = field_validator("host")(validators.validate_host) _validate_severity = field_validator("severity")(validators.validate_severity) @@ -2185,7 +2207,8 @@ def make_event( data = validators.validate_host(data) except Exception as e: log.trace(traceback.format_exc()) - raise ValidationError(f'Error sanitizing event data "{data}" for type "{event_type}": {e}') + data_preview = str(data)[:200] + "..." 
if len(str(data)) > 200 else str(data) + raise ValidationError(f'Error sanitizing event data "{data_preview}" for type "{event_type}": {e}') data_is_ip = is_ip(data) if event_type == "DNS_NAME" and data_is_ip: event_type = "IP_ADDRESS" diff --git a/bbot/core/helpers/helper.py b/bbot/core/helpers/helper.py index 0d2a1dbb6b..925b736516 100644 --- a/bbot/core/helpers/helper.py +++ b/bbot/core/helpers/helper.py @@ -1,4 +1,6 @@ import os +import sys +import asyncio import logging from pathlib import Path import multiprocessing as mp @@ -75,15 +77,12 @@ def __init__(self, preset): self._loop = None - # multiprocessing thread pool + # multiprocessing process pool start_method = mp.get_start_method() if start_method != "spawn": self.warning(f"Multiprocessing spawn method is set to {start_method}.") - - # we spawn 1 fewer processes than cores - # this helps to avoid locking up the system or competing with the main python process for cpu time - num_processes = max(1, mp.cpu_count() - 1) - self.process_pool = ProcessPoolExecutor(max_workers=num_processes) + self.process_pool = self._create_process_pool() + self._pool_reset_lock = asyncio.Lock() self._cloud = None self._blasthttp_client = None @@ -214,6 +213,18 @@ def loop(self): self._loop.set_default_executor(self._io_executor) return self._loop + @staticmethod + def _create_process_pool(): + # we spawn 1 fewer processes than cores + # this helps to avoid locking up the system or competing with the main python process for cpu time + num_processes = max(1, mp.cpu_count() - 1) + pool_kwargs = {"max_workers": num_processes} + # max_tasks_per_child replaces workers after N tasks, preventing memory leaks + # and reducing the chance of a degraded worker process causing hangs + if sys.version_info >= (3, 11): + pool_kwargs["max_tasks_per_child"] = 25 + return ProcessPoolExecutor(**pool_kwargs) + def run_in_executor_io(self, callback, *args, **kwargs): """ Run a synchronous task in the event loop's default thread pool executor @@ 
-237,17 +248,55 @@ def run_in_executor_cpu(self, callback, *args, **kwargs): callback = partial(callback, **kwargs) return self.loop.run_in_executor(self._cpu_executor, callback, *args) - def run_in_executor_mp(self, callback, *args, **kwargs): + async def run_in_executor_mp(self, callback, *args, **kwargs): """ - Same as run_in_executor_io() except with a process pool executor - Use only in cases where callback is CPU-bound + Same as run_in_executor_io() except with a process pool executor. + Use only in cases where callback is CPU-bound. + + Includes a timeout (default 300s) to prevent indefinite hangs if a child process dies or the pool enters a broken state. + On timeout, the entire pool is terminated and replaced so that stuck workers cannot accumulate and starve the scan. + + Pass ``_timeout=seconds`` to override the default timeout. Examples: Execute callback: >>> result = await self.helpers.run_in_executor_mp(callback_fn, arg1, arg2) """ + timeout = kwargs.pop("_timeout", 300) callback = partial(callback, **kwargs) - return self.loop.run_in_executor(self.process_pool, callback, *args) + future = self.loop.run_in_executor(self.process_pool, callback, *args) + try: + return await asyncio.wait_for(future, timeout=timeout) + except asyncio.TimeoutError: + log.warning(f"Process pool task timed out after {timeout}s, killing stuck workers and replacing pool") + await self._reset_process_pool() + raise + + async def _reset_process_pool(self): + """Terminate all workers in the current process pool and replace it. + + This is the nuclear option — every in-flight task on the old pool will fail with BrokenProcessPool. + We accept that trade-off because a timeout means something is genuinely broken, and leaving the stuck worker alive would permanently consume a pool slot. + + # TODO: Python 3.14 adds ProcessPoolExecutor.terminate_workers() + # and kill_workers() (https://github.com/python/cpython/pull/130849). 
+ # Once we drop 3.13 support we can replace the _processes access + # with those official methods. + """ + async with self._pool_reset_lock: + old_pool = self.process_pool + self.process_pool = self._create_process_pool() + # snapshot workers before shutdown (shutdown sets _processes = None) + workers = list((old_pool._processes or {}).values()) + # terminate workers before shutdown so stuck ones don't block + for proc in workers: + if proc.is_alive(): + proc.terminate() + old_pool.shutdown(wait=False, cancel_futures=True) + # escalate to SIGKILL for anything that ignored SIGTERM + for proc in workers: + if proc.is_alive(): + proc.kill() @property def in_tests(self): diff --git a/bbot/core/helpers/misc.py b/bbot/core/helpers/misc.py index 5b2a9ae7d6..8c213a0000 100644 --- a/bbot/core/helpers/misc.py +++ b/bbot/core/helpers/misc.py @@ -2723,6 +2723,7 @@ def get_waf_strings(): return [ "The requested URL was rejected", "This content has been blocked", + "You don't have permission to access ", ] diff --git a/bbot/defaults.yml b/bbot/defaults.yml index 311cacdb7a..f70f9099bb 100644 --- a/bbot/defaults.yml +++ b/bbot/defaults.yml @@ -265,6 +265,7 @@ parameter_blacklist: - .AspNetCore.Session - PHPSESSID - __cf_bm + - _cfuvid - f5_cspm parameter_blacklist_prefixes: diff --git a/bbot/modules/http.py b/bbot/modules/http.py index e7e45859b3..c9c5155a88 100644 --- a/bbot/modules/http.py +++ b/bbot/modules/http.py @@ -42,6 +42,7 @@ async def setup(self): self.max_response_size = self.config.get("max_response_size", 5242880) self.store_responses = self.config.get("store_responses", False) self.client = self.helpers.blasthttp + self.waf_yara_rule = self.helpers.yara.compile_strings(self.helpers.get_waf_strings(), nocase=True) return True async def filter_event(self, event): @@ -274,6 +275,13 @@ async def handle_batch(self, *events): self.debug(f'Discarding 404 from "{url}"') continue + # discard 4xx responses that contain WAF strings + if 400 <= status_code < 500: + body = 
j.get("body", "") + if body and await self.helpers.yara.match(self.waf_yara_rule, body): + self.debug(f'Discarding WAF {status_code} from "{url}"') + continue + # main URL tags = [f"status-{status_code}"] diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py index 893dcc4300..8d24829c38 100644 --- a/bbot/modules/internal/excavate.py +++ b/bbot/modules/internal/excavate.py @@ -366,6 +366,42 @@ def in_bl(self, value): return False + def _is_archived(self, event): + """Check if an event represents archived wayback content.""" + return isinstance(event.data, dict) and "archive_url" in event.data + + def _event_host(self, event): + """Get the effective host from an event. + + For archived wayback content, data["host"] contains the original target hostname + (since data["url"] points to archive.org). For regular events, we use event.host. + + NOTE: Regular HTTP_RESPONSE events also have data["host"], but it contains the + resolved IP from the httpx binary — NOT a hostname override. + """ + if self._is_archived(event) and event.data.get("host"): + return str(event.data["host"]) + return str(event.host) + + def _event_base_url(self, event): + """Get the effective base URL from an event. + + For archived wayback content, reconstructs the original URL from override fields + (host/scheme/port/path) since parsed_url points to archive.org. + For regular events, returns event.parsed_url directly. 
+ """ + if not self._is_archived(event): + return event.parsed_url + scheme = event.data.get("scheme", event.parsed_url.scheme) + host = self._event_host(event) + port = event.data.get("port") + if port is not None: + port = int(port) + if not ((scheme == "http" and port == 80) or (scheme == "https" and port == 443)): + host = f"{host}:{port}" + path = event.data.get("path", event.parsed_url.path) + return urlparse(f"{scheme}://{host}{path}") + def url_unparse(self, param_type, parsed_url): # Reconstructs a URL, optionally omitting the query string based on remove_querystring configuration value. if param_type == "GETPARAM": @@ -641,8 +677,9 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte # The endpoint is usually a form action - we should use it if we have it. If not, default to URL. else: - # Use the original URL as the base and resolve the endpoint correctly in case of relative paths - base_url = f"{event.parsed_url.scheme}://{event.parsed_url.netloc}{event.parsed_url.path}" + # Use the effective base URL (which may differ from parsed_url for archived content) + event_base = self.excavate._event_base_url(event) + base_url = f"{event_base.scheme}://{event_base.netloc}{event_base.path}" if not self.excavate.remove_querystring and len(event.parsed_url.query) > 0: base_url += f"?{event.parsed_url.query}" url = urljoin(base_url, endpoint) @@ -986,6 +1023,34 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte if yara_results: event.add_tag("login-page") + class DirectoryListingExtractor(ExcavateRule): + description = "Detects directory listing pages from web servers." 
+ signatures = { + "Apache_Nginx": '"Index of /"', + "IIS": '"[To Parent Directory]"', + "Python_HTTP_Server": '"<h1>Directory listing for"', + "Generic_Directory_Listing": '"<title>Directory Listing"', + } + yara_rules = {} + + def __init__(self, excavate): + super().__init__(excavate) + signature_component_list = [] + for signature_name, signature in self.signatures.items(): + signature_component_list.append(rf"${signature_name} = {signature}") + signature_component = " ".join(signature_component_list) + self.yara_rules["directory_listing"] = ( + f'rule directory_listing {{meta: description = "contains a directory listing" strings: {signature_component} condition: any of them}}' + ) + + async def process(self, yara_results, event, yara_rule_settings, discovery_context): + for identifier in yara_results.keys(): + for findings in yara_results[identifier]: + event_data = { + "description": f"{discovery_context} {yara_rule_settings.description} ({identifier})" + } + await self.report(event_data, event, yara_rule_settings, discovery_context, event_type="FINDING") + def add_yara_rule(self, rule_name, rule_content, rule_instance): rule_instance.name = rule_name self.yara_rules_dict[rule_name] = rule_content @@ -1013,12 +1078,13 @@ async def emit_custom_parameters(self, event, config_key, param_type, descriptio # Emits WEB_PARAMETER events for custom headers and cookies from the configuration. 
custom_params = self.scan.web_config.get(config_key, {}) for param_name, param_value in custom_params.items(): + event_base = self._event_base_url(event) await self.emit_web_parameter( - host=event.parsed_url.hostname, + host=self._event_host(event), param_type=param_type, name=param_name, original_value=param_value, - url=self.url_unparse(param_type, event.parsed_url), + url=self.url_unparse(param_type, event_base), description=f"HTTP Extracted Parameter [{param_name}] ({description_suffix})", additional_params=_exclude_key(custom_params, param_name), event=event, @@ -1134,7 +1200,7 @@ async def search(self, data, event, content_type, discovery_context="HTTP respon if results: for parameter_name, original_value in results: await self.emit_web_parameter( - host=str(event.host), + host=self._event_host(event), param_type="SPECULATIVE", name=parameter_name, original_value=original_value, @@ -1142,7 +1208,7 @@ async def search(self, data, event, content_type, discovery_context="HTTP respon description=f"HTTP Extracted Parameter (speculative from {source_type} content) [{parameter_name}]", additional_params={}, event=event, - context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {str(event.host)}", + context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {self._event_host(event)}", ) return @@ -1194,7 +1260,7 @@ async def handle_event(self, event, **kwargs): ) in extract_params_url(event.parsed_url): if self.in_bl(parameter_name) is False: await self.emit_web_parameter( - host=parsed_url.hostname, + host=self._event_host(event), param_type="GETPARAM", name=parameter_name, original_value=original_value, @@ -1228,12 +1294,13 @@ async def handle_event(self, event, **kwargs): if self.in_bl(cookie_name) is False: self.assigned_cookies[cookie_name] = cookie_value + event_base = self._event_base_url(event) await 
self.emit_web_parameter( - host=str(event.host), + host=self._event_host(event), param_type="COOKIE", name=cookie_name, original_value=cookie_value, - url=self.url_unparse("COOKIE", event.parsed_url), + url=self.url_unparse("COOKIE", event_base), description=f"Set-Cookie Assigned Cookie [{cookie_name}]", additional_params={}, event=event, @@ -1270,10 +1337,10 @@ async def handle_event(self, event, **kwargs): original_value, regex_name, additional_params, - ) in extract_params_location(header_value, event.parsed_url): + ) in extract_params_location(header_value, self._event_base_url(event)): if self.in_bl(parameter_name) is False: await self.emit_web_parameter( - host=parsed_url.hostname, + host=self._event_host(event), param_type="GETPARAM", name=parameter_name, original_value=original_value, diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 49010f451a..5f2338b8af 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -1,87 +1,674 @@ +import re +from collections import Counter from datetime import datetime +from urllib.parse import parse_qs, urlparse, urlunparse +import orjson + +from bbot.core.helpers.misc import get_file_extension +from bbot.core.helpers.validators import clean_url from bbot.modules.templates.subdomain_enum import subdomain_enum +def _parse_cdx_response(text): + """Parse CDX JSON response text into a URL list. 
Designed to run in a separate process.""" + j = orjson.loads(text) + if not isinstance(j, list): + return None + return [result[0] for result in j[1:] if result] + + class wayback(subdomain_enum): flags = ["safe", "passive", "subdomain-enum"] - watched_events = ["DNS_NAME"] - produced_events = ["URL_UNVERIFIED", "DNS_NAME"] + watched_events = ["DNS_NAME", "URL"] + produced_events = ["URL_UNVERIFIED", "DNS_NAME", "WEB_PARAMETER", "HTTP_RESPONSE", "FINDING"] meta = { - "description": "Query archive.org's API for subdomains", + "description": "Query archive.org's Wayback Machine for subdomains, URLs, parameters, and archived content", "created_date": "2022-04-01", "author": "@liquidsec", } - options = {"urls": False, "garbage_threshold": 10} + options = {"urls": False, "garbage_threshold": 10, "parameters": False, "archive": False, "max_records": 100000} options_desc = { "urls": "emit URLs in addition to DNS_NAMEs", "garbage_threshold": "Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data)", + "parameters": "emit WEB_PARAMETER events for query parameters discovered in archived URLs (requires urls=true)", + "archive": "fetch archived versions of dead URLs from the Wayback Machine and emit HTTP_RESPONSE events (requires urls=true)", + "max_records": "Maximum number of URLs to fetch from the CDX API", } in_scope_only = True base_url = "http://web.archive.org" + url_blacklist = ["_Incapsula_Resource", "/cdn-cgi/"] + + interesting_extensions = frozenset({"zip", "sql", "bak", "env", "config"}) + interesting_compound_extensions = frozenset({"tar.gz", "tar.bz2"}) + + # maximum URL length before we consider it garbage (crawler traps produce absurdly long URLs) + _max_url_length = 2000 + # if any single path segment repeats more than this many times, it's a path loop / crawler trap + _max_path_segment_repeats = 3 + + def _is_garbage_url(self, url): + """Detect crawler-trap URLs with repeating path segments or excessive length.""" 
+ if len(url) > self._max_url_length: + return True + path = urlparse(url).path + if not path: + return False + segments = [s for s in path.split("/") if s] + if not segments: + return False + counts = Counter(segments) + return counts.most_common(1)[0][1] > self._max_path_segment_repeats + + def _is_interesting_file(self, url): + ext = get_file_extension(url) + if ext and ext.lower() in self.interesting_extensions: + return True + lower_url = url.lower() + return any(lower_url.endswith(f".{ce}") for ce in self.interesting_compound_extensions) async def setup(self): self.urls = self.config.get("urls", False) + self.parameters = self.config.get("parameters", False) + if self.parameters: + if not self.urls: + self.hugewarning("parameters option requires urls to be enabled. Please add modules.wayback.urls=True") + return False + consumers = [m for m, mod in self.scan.modules.items() if "WEB_PARAMETER" in mod.watched_events] + if not consumers: + self.warning("Disabling parameter extraction because no modules consume WEB_PARAMETER events") + self.parameters = False + else: + self.hugeinfo( + f"Parameter extraction enabled because the following modules consume WEB_PARAMETER events: [{', '.join(consumers)}]" + ) + self.archive = self.config.get("archive", False) + if self.archive and not self.urls: + self.hugewarning("archive option requires urls to be enabled. 
Please add modules.wayback.urls=True") + return False self.garbage_threshold = self.config.get("garbage_threshold", 10) + self.max_records = self.config.get("max_records", 100000) + self._parameter_cache = {} + self._archive_cache = {} + # bloom filter to deduplicate archive fetches by the response URL archive.org actually served + # (multiple request URLs can redirect to the same archived snapshot) + # 32M bits (~4MB) supports ~400K entries with negligible false-positive rate + self._archive_bloom = self.helpers.bloom_filter(32000000) return await super().setup() + def _incoming_dedup_hash(self, event): + # URL events are handled differently (parameter/archive cache eviction), + # so they should not be deduplicated by the subdomain_enum strategy + if event.type == "URL": + return hash(event.url), "url_event" + return super()._incoming_dedup_hash(event) + + async def filter_event(self, event): + # URL events are handled separately and don't need subdomain_enum's wildcard/cloud filtering + if event.type == "URL": + return True + return await super().filter_event(event) + async def handle_event(self, event): + if event.type == "URL": + await self._handle_url_event(event) + return + query = self.make_query(event) - for result, event_type in await self.query(query): + results, interesting_files = await self.query(query) + for result, event_type in results: + tags = ["from-wayback"] if event_type == "URL_UNVERIFIED" else [] await self.emit_event( result, event_type, event, + tags=tags, abort_if=self.abort_if, context=f'{{module}} queried archive.org for "{query}" and found {{event.type}}: {{event.pretty_string}}', ) - async def query(self, query): - results = set() - waybackurl = f"{self.base_url}/cdx/search/cdx?url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" - r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 10) - if not r: - self.warning(f'Error connecting to archive.org for query "{query}"') - return 
results + if interesting_files: + await self._check_interesting_files(interesting_files, event) + + # pair unpaired archive cache entries with their parent DNS_NAME event + if self.archive: + paired = 0 + for url_str in list(self._archive_cache): + if isinstance(self._archive_cache[url_str], str): + self._archive_cache[url_str] = (self._archive_cache[url_str], event) + paired += 1 + if paired: + self.debug(f"Paired {paired} archive cache entries with parent event {event.data}") + + async def _handle_url_event(self, event): + """Process a URL event: evict live URLs from archive cache and emit cached parameters.""" + if self.archive: + status_code = 0 + for tag in event.tags: + if tag.startswith("status-"): + try: + status_code = int(tag.split("-", 1)[1]) + except ValueError: + pass + break + # only 2xx counts as live — 3xx (e.g. http→https 301 to a 404) doesn't confirm the page exists + if 200 <= status_code < 300: + cleaned = clean_url(event.url).geturl() + if self._archive_cache.pop(cleaned, None) is not None: + self.verbose(f"URL is live (status {status_code}), removed from archive cache: {cleaned}") + + cached = self._parameter_cache.pop(clean_url(event.url).geturl(), None) + if cached is not None: + flat_params, base_url = cached + for param_name, original_value in flat_params.items(): + data = { + "host": str(event.host), + "type": "GETPARAM", + "name": param_name, + "original_value": original_value, + "url": base_url, + "description": f"HTTP Extracted Parameter [{param_name}] (wayback)", + "additional_params": {k: v for k, v in flat_params.items() if k != param_name}, + } + self.verbose(f"Emitting WEB_PARAMETER [{param_name}] from archived URL {base_url}") + await self.emit_event( + data, + "WEB_PARAMETER", + event, + tags=["from-wayback"], + context=f"{{module}} found query parameter [{param_name}] in archived URL and emitted {{event.type}}", + ) + + async def _check_interesting_files(self, interesting_files, event): + """HEAD-check interesting archived 
files and emit FINDINGs for those that exist.""" + self.verbose(f"Checking {len(interesting_files)} interesting archived files") + + # build URL list and mapping back to metadata + url_metadata = {} + for cleaned_url, raw_url in interesting_files.items(): + archive_url = f"{self.base_url}/web/{raw_url}" + url_metadata[archive_url] = (cleaned_url, raw_url) + + for archive_url, (cleaned_url, raw_url) in url_metadata.items(): + try: + r = await self.helpers.request( + archive_url, method="HEAD", timeout=self.http_timeout + 30, follow_redirects=True + ) + except Exception as e: + self.debug(f"Interesting file HEAD check error for {raw_url}: {e}") + continue + + if not r or r.status_code != 200: + status = getattr(r, "status_code", "no response") if r else "no response" + self.debug(f"Interesting file HEAD check failed for {raw_url}: status={status}") + continue + # guard against soft 404s (archive.org returns text/html for missing pages) + content_type = r.headers.get("content-type", "") + if "text/html" in content_type: + self.debug(f"Interesting file skipped (soft 404): {raw_url}") + continue + + ext = get_file_extension(cleaned_url) + desc = f"Interesting archived file found (.{ext}): {raw_url}" + content_length = r.headers.get("content-length", "") + if content_length: + try: + size = int(content_length) + if size > 1024 * 1024: + desc += f" ({size / (1024 * 1024):.1f} MB)" + elif size > 1024: + desc += f" ({size / 1024:.1f} KB)" + else: + desc += f" ({size} bytes)" + except ValueError: + pass + + self.verbose(f"Interesting archived file confirmed: {raw_url}") + parsed = urlparse(raw_url) + await self.emit_event( + { + "description": desc, + "severity": "LOW", + "name": "Interesting Archived File", + "confidence": "MEDIUM", + "url": str(r.url), + "host": str(parsed.hostname or ""), + }, + "FINDING", + event, + tags=["from-wayback", "archived", "interesting-file"], + context=f"{{module}} found interesting archived file: {raw_url}", + ) + + # CDX API filters applied 
server-side to reduce response size + _cdx_filters = ( + "filter=!statuscode:404", + "filter=!statuscode:301", + "filter=!statuscode:302", + "filter=!mimetype:image/.*", + "filter=!mimetype:text/css", + "filter=!mimetype:warc/revisit", + ) + + async def _fetch_cdx(self, query): + """Fetch URLs from the CDX API with retries and 429 handling. Returns the URL list or None on failure.""" + params = f"url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original" + params += f"&limit={self.max_records}" + params += "&" + "&".join(self._cdx_filters) + waybackurl = f"{self.base_url}/cdx/search/cdx?{params}" + r = None + last_error = None + for i in range(3): + try: + r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 60, raise_error=True) + except Exception as e: + last_error = str(e) + r = None + if r is not None: + if r.status_code == 200: + break + if r.status_code == 429: + retry_after = r.headers.get("retry-after", "") + try: + delay = min(int(retry_after), 120) + except (ValueError, TypeError): + delay = self._archive_429_default_delay + last_error = "HTTP 429 rate limited" + self.verbose(f'Archive.org rate limit (429) for CDX query "{query}", sleeping {delay}s') + await self.helpers.sleep(delay) + r = None + continue + last_error = f"HTTP status {r.status_code}" + r = None + if i < 2: + self.verbose( + f'Error connecting to archive.org for query "{query}" ({last_error}), retrying ({i + 1}/2)' + ) + await self.helpers.sleep(2**i) + if r is None: + self.warning(f'Error connecting to archive.org for query "{query}": {last_error}') + return None + # parse JSON + extract URLs in a separate process to avoid blocking the event loop + # (CDX responses can contain 100k+ entries) try: - j = r.json() - assert type(j) == list + urls = await self.helpers.run_in_executor_mp(_parse_cdx_response, r.text) except Exception: + urls = None + if urls is None: self.warning(f'Error JSON-decoding archive.org response for query "{query}"') - 
return results + return None + return urls + + def _pre_process_urls(self, urls): + """Extract parameters, archive URLs, and interesting files from raw CDX URLs before collapse.""" + raw_url_params = {} + archive_urls = {} + interesting_files = {} - urls = [] - for result in j[1:]: + for url in urls: try: - url = result[0] - urls.append(url) - except KeyError: + parsed = urlparse(url) + if any(bl in url for bl in self.url_blacklist): + continue + if self._is_garbage_url(url): + continue + if not (parsed.hostname and self.scan.in_scope(parsed.hostname)): + continue + # skip non-HTTP URLs (e.g. ftp:// archived by the Wayback Machine) + if parsed.scheme not in ("http", "https"): + continue + + cleaned_str = clean_url(url).geturl() + + if self.archive and cleaned_str not in archive_urls: + archive_urls[cleaned_str] = url + + if self.urls and self._is_interesting_file(url) and cleaned_str not in interesting_files: + interesting_files[cleaned_str] = url + + if self.parameters and parsed.query: + params = parse_qs(parsed.query) + flat_params = {k: v[0] for k, v in params.items()} + if flat_params: + if cleaned_str not in raw_url_params: + raw_url_params[cleaned_str] = flat_params + else: + raw_url_params[cleaned_str].update(flat_params) + except Exception: continue + if archive_urls or interesting_files or raw_url_params: + self.debug( + f"Pre-processed {len(urls):,} URLs: {len(archive_urls):,} archive candidates, " + f"{len(interesting_files):,} interesting files, {len(raw_url_params):,} URLs with parameters" + ) + + return raw_url_params, archive_urls, interesting_files + + async def query(self, query): + results = set() + + urls = await self._fetch_cdx(query) + if urls is None: + return results, {} + self.verbose(f"Found {len(urls):,} URLs for {query}") + # filter blacklisted and garbage URLs before any further processing + urls = [ + url for url in urls if not any(bl in url for bl in self.url_blacklist) and not self._is_garbage_url(url) + ] + + # pre-extract metadata 
from raw URLs before collapse strips query strings + raw_url_params, archive_urls, interesting_files = {}, {}, {} + if self.parameters or self.archive or self.urls: + raw_url_params, archive_urls, interesting_files = self._pre_process_urls(urls) + + if not urls: + return results, interesting_files + dns_names = set() collapsed_urls = 0 start_time = datetime.now() - # we consolidate URLs to cut down on garbage data - # this is CPU-intensive, so we do it in its own core. + # consolidate URLs to cut down on garbage data (CPU-intensive, runs in separate process) parsed_urls = await self.helpers.run_in_executor_mp( self.helpers.validators.collapse_urls, urls, threshold=self.garbage_threshold, ) - for parsed_url in parsed_urls: - collapsed_urls += 1 - if not self.urls: + if self.urls: + # deduplicate http/https variants — drop http when https also exists + url_dedup = {} + for parsed_url in parsed_urls: + collapsed_urls += 1 + https_key = parsed_url._replace(scheme="https").geturl() + if https_key not in url_dedup or parsed_url.scheme == "https": + url_dedup[https_key] = parsed_url + for parsed_url in url_dedup.values(): + url_str = parsed_url.geturl() + results.add((url_str, "URL_UNVERIFIED")) + if self.parameters and url_str in raw_url_params: + base_url = urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "")) + self._parameter_cache[url_str] = (raw_url_params[url_str], base_url) + if self.archive and url_str in archive_urls: + self._archive_cache[url_str] = archive_urls[url_str] + else: + for parsed_url in parsed_urls: + collapsed_urls += 1 dns_name = parsed_url.hostname h = hash(dns_name) if h not in dns_names: dns_names.add(h) results.add((dns_name, "DNS_NAME")) - else: - results.add((parsed_url.geturl(), "URL_UNVERIFIED")) - end_time = datetime.now() - duration = self.helpers.human_timedelta(end_time - start_time) + + duration = self.helpers.human_timedelta(datetime.now() - start_time) self.verbose(f"Collapsed {len(urls):,} -> 
{collapsed_urls:,} URLs in {duration}") - return results + return results, interesting_files + + _wayback_head_re = re.compile( + r'<script src="//archive\.org/includes/athena\.js".*?<!-- End Wayback Rewrite JS Include -->\s*', + re.DOTALL, + ) + _wayback_toolbar_re = re.compile( + r"<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*?<!-- END WAYBACK TOOLBAR INSERT -->\s*", + re.DOTALL, + ) + _wayback_footer_re = re.compile( + r"<!--\s*FILE ARCHIVED ON.*", + re.DOTALL, + ) + # wayback rewrites all URLs in the HTML body to go through web.archive.org, e.g.: + # http://web.archive.org/web/20250524005847/http://example.com/page + # this regex strips the prefix to restore original URLs + _wayback_url_re = re.compile(r"https?://web\.archive\.org/web/\d+\w*/") + # relative variant of the above — wayback also rewrites hrefs/srcs as relative paths, e.g.: + # /web/19971024185506/http://www.example.com/page + # /web/19971024185506im_/http://www.example.com/image.gif + # the timestamp is always 14 digits, optionally followed by a modifier suffix (im_, js_, cs_, if_, etc.) 
+ _wayback_relative_url_re = re.compile(r"/web/\d{14}\w*/") + # catch any remaining archive.org URLs not handled by the toolbar/head/footer regexes + _wayback_stale_ref_re = re.compile(r"""(?:https?:)?//(?:web\.)?archive\.org/[^\s"'<>]*""") + + def _strip_wayback_wrapper(self, body): + """Remove Wayback Machine artifacts from archived HTML: toolbar, scripts, footer, and URL rewrites.""" + body = self._wayback_head_re.sub("", body) + body = self._wayback_toolbar_re.sub("", body) + body = self._wayback_footer_re.sub("", body) + body = self._wayback_url_re.sub("", body) + body = self._wayback_relative_url_re.sub("", body) + body = self._wayback_stale_ref_re.sub("", body) + return body + + _archive_per_request_retries = 3 + _archive_batch_retries = 1 + _archive_error_delay = 3 # initial backoff seconds after a failed request + _archive_429_default_delay = 30 # default delay on 429 when no retry-after header + + async def finish(self): + if not self.archive or not self._archive_cache: + return + + self.verbose(f"Loading {len(self._archive_cache):,} archived URLs from the Wayback Machine") + + # build combined set of extensions to skip (blacklist + static + special) + skip_extensions = set(self.scan.url_extension_blacklist) + skip_extensions.update(e.lower() for e in self.scan.config.get("url_extension_static", [])) + skip_extensions.update(e.lower() for e in self.scan.config.get("url_extension_special", [])) + + # build URL list and mapping back to metadata + url_metadata = {} + for cleaned_url, (raw_url, parent_event) in list(self._archive_cache.items()): + ext = get_file_extension(cleaned_url) + if ext and ext in skip_extensions: + self.debug(f"Skipping archive fetch for {raw_url} (extension: .{ext})") + continue + archive_url = f"{self.base_url}/web/{raw_url}" + url_metadata[archive_url] = (raw_url, parent_event) + + if not url_metadata: + return + + total = len(url_metadata) + self.info(f"Fetching {total:,} archived pages from archive.org") + + failed, succeeded, 
processed = await self._fetch_archive_batch(url_metadata, total, 0) + + # batch-level retry as safety net (per-request retry handles most transient errors, + # but a temporary outage window could still leave a batch of failures) + for retry_num in range(1, self._archive_batch_retries + 1): + if not failed: + break + delay = 30 * retry_num + self.info( + f"Retrying {len(failed):,} failed archive fetches (batch retry {retry_num}/{self._archive_batch_retries}, " + f"backoff {delay}s)" + ) + await self.helpers.sleep(delay) + retry_metadata = {url: url_metadata[url] for url in failed} + new_failed, new_succeeded, processed = await self._fetch_archive_batch( + retry_metadata, total, processed - len(failed) + ) + succeeded += new_succeeded + failed = new_failed + + if failed: + self.warning(f"Failed to fetch {len(failed):,} archived URLs after retries") + self.info(f"Archive loading complete: {succeeded:,}/{total:,} succeeded") + + async def _fetch_archive_batch(self, url_metadata, total, processed_offset): + """Fetch a batch of archive URLs with per-request retry and rate-limit handling. + + Returns (failed_urls, success_count, processed_count). 
+ """ + failed = [] + succeeded = 0 + skipped = 0 + processed = processed_offset + + for archive_url, (raw_url, parent_event) in url_metadata.items(): + processed += 1 + + # HEAD pre-check: resolve redirects cheaply to check for duplicates + # before downloading the full response body + resolved_url = await self._resolve_archive_url(archive_url, raw_url) + if resolved_url is not None and resolved_url in self._archive_bloom: + self.verbose(f"Skipping duplicate archive response for {raw_url} (resolved URL: {resolved_url})") + skipped += 1 + if processed % 50 == 0 or processed == total: + self.verbose( + f"Archive progress: {processed:,}/{total:,} ({succeeded:,} succeeded, {len(failed):,} failed, {skipped:,} skipped)" + ) + continue + + r = await self._fetch_single_archive_url(archive_url, raw_url) + + if not r or r.status_code != 200: + status = getattr(r, "status_code", "no response") if r else "no response" + self.verbose(f"Archive fetch failed for {raw_url} after retries: status={status}") + failed.append(archive_url) + continue + + if await self._process_archive_response(r, raw_url, parent_event): + succeeded += 1 + + if processed % 50 == 0 or processed == total: + self.verbose( + f"Archive progress: {processed:,}/{total:,} ({succeeded:,} succeeded, {len(failed):,} failed, {skipped:,} skipped)" + ) + + return failed, succeeded, processed + + async def _resolve_archive_url(self, archive_url, raw_url): + """HEAD request to resolve the final URL after redirects, for bloom filter pre-check. + + Returns the resolved URL string, or None if the HEAD request fails. 
+ """ + try: + r = await self.helpers.request( + archive_url, method="HEAD", timeout=self.http_timeout + 30, follow_redirects=True, raise_error=True + ) + except Exception as e: + self.debug(f"HEAD pre-check failed for {raw_url}: {e}") + return None + if r.status_code == 429: + retry_after = r.headers.get("retry-after", "") + try: + delay = min(int(retry_after), 120) + except (ValueError, TypeError): + delay = self._archive_429_default_delay + self.verbose(f"Archive.org rate limit (429) during HEAD pre-check for {raw_url}, sleeping {delay}s") + await self.helpers.sleep(delay) + return None + if r.status_code == 200: + return str(r.url) + return None + + async def _fetch_single_archive_url(self, archive_url, raw_url): + """Fetch a single archive URL with per-request retry, 429 handling, and backoff. + + archive.org rate-limits CDX at ~60 req/min and blocks the IP at the firewall + if 429 responses are ignored for more than a minute. We must respect 429 + Retry-After. + """ + r = None + for attempt in range(self._archive_per_request_retries): + try: + r = await self.helpers.request( + archive_url, timeout=self.http_timeout + 60, follow_redirects=True, raise_error=True + ) + except Exception as e: + r = None + if attempt < self._archive_per_request_retries - 1: + delay = self._archive_error_delay * (2**attempt) + self.verbose( + f"Archive fetch error for {raw_url} (attempt {attempt + 1}/{self._archive_per_request_retries}): " + f"{e} -- retrying in {delay}s" + ) + await self.helpers.sleep(delay) + else: + self.verbose( + f"Archive fetch error for {raw_url} (final attempt {attempt + 1}/{self._archive_per_request_retries}): {e}" + ) + continue + + if r.status_code == 429: + retry_after = r.headers.get("retry-after", "") + try: + delay = min(int(retry_after), 120) + except (ValueError, TypeError): + delay = self._archive_429_default_delay + self.verbose(f"Archive.org rate limit (429) for {raw_url}, sleeping {delay}s") + await self.helpers.sleep(delay) + r = None + 
continue + + # any other status code (200, 404, 503, etc.) is a definitive answer — return it + return r + + return r + + async def _process_archive_response(self, r, raw_url, parent_event): + """Process a successful archive.org response into an HTTP_RESPONSE event. Returns True on success.""" + # deduplicate by the actual response URL archive.org served (after redirects) + # multiple request URLs can redirect to the same archived snapshot + response_url = str(r.url) + if response_url in self._archive_bloom: + self.verbose(f"Skipping duplicate archive response for {raw_url} (response URL: {response_url})") + return False + self._archive_bloom.add(response_url) + + j = self.helpers.response_to_json(r) + if not j: + self.verbose(f"Failed to parse archive response for {raw_url}") + return False + + if "body" in j: + j["body"] = self._strip_wayback_wrapper(j["body"]) + + # strip wayback-injected headers to prevent excavate from extracting archive.org artifacts + if "header" in j: + j["header"] = { + k: v for k, v in j["header"].items() if not k.startswith("x_archive_") and k != "set_cookie" + } + if "raw_header" in j: + j["raw_header"] = "\r\n".join( + line + for line in j["raw_header"].split("\r\n") + if not line.lower().startswith(("set-cookie:", "x-archive-")) + ) + + # use the original URL so event.host returns the original host, not web.archive.org + # this prevents internal modules (speculate, host, dnsresolve) from treating archive.org as a target + parsed_original = urlparse(raw_url) + hostname = str(parsed_original.hostname or "") + port = parsed_original.port or (443 if parsed_original.scheme == "https" else 80) + scheme = parsed_original.scheme + # strip redundant port (e.g. 
:80 for http, :443 for https) + if (scheme == "http" and port == 80) or (scheme == "https" and port == 443): + netloc = hostname + else: + netloc = f"{hostname}:{port}" + j["url"] = urlunparse((scheme, netloc, parsed_original.path or "/", "", parsed_original.query, "")) + # store the archive URL for provenance — downstream modules can check this field + j["archive_url"] = str(r.url) + # override host/port/scheme/path to match the original URL (response_to_json set them from archive.org) + j["host"] = hostname + j["port"] = port + j["scheme"] = scheme + j["path"] = parsed_original.path or "/" + + http_response = self.make_event( + j, + "HTTP_RESPONSE", + parent_event, + tags=["from-wayback", "archived"], + context=f"{{module}} loaded archived version of {raw_url} from the Wayback Machine", + ) + if http_response is None: + self.verbose(f"Failed to create HTTP_RESPONSE event for {raw_url}") + return False + # keep the event in scope so modules like badsecrets can process the archived content + http_response.scope_distance = 0 + self.verbose(f"Emitting archived HTTP_RESPONSE for dead URL: {raw_url}") + await self.emit_event(http_response) + return True diff --git a/bbot/presets/kitchen-sink.yml b/bbot/presets/kitchen-sink.yml index 647889aab5..700007c709 100644 --- a/bbot/presets/kitchen-sink.yml +++ b/bbot/presets/kitchen-sink.yml @@ -11,3 +11,12 @@ include: - dirbust-light - web-screenshots - baddns-heavy + +config: + modules: + baddns: + enable_references: True + wayback: + urls: True + parameters: True + archive: True diff --git a/bbot/presets/wayback-heavy.yml b/bbot/presets/wayback-heavy.yml new file mode 100644 index 0000000000..c72a735ccf --- /dev/null +++ b/bbot/presets/wayback-heavy.yml @@ -0,0 +1,15 @@ +description: Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection + +include: + - subdomain-enum + +modules: + - wayback + - badsecrets + +config: + modules: + wayback: + urls: True + 
parameters: True + archive: True diff --git a/bbot/presets/wayback.yml b/bbot/presets/wayback.yml new file mode 100644 index 0000000000..3404c7da51 --- /dev/null +++ b/bbot/presets/wayback.yml @@ -0,0 +1,12 @@ +description: Discover URLs and interesting archived files via the Wayback Machine + +include: + - subdomain-enum + +modules: + - wayback + +config: + modules: + wayback: + urls: True diff --git a/bbot/presets/web/lightfuzz-heavy.yml b/bbot/presets/web/lightfuzz-heavy.yml index ecc8c82c04..c67474af1e 100644 --- a/bbot/presets/web/lightfuzz-heavy.yml +++ b/bbot/presets/web/lightfuzz-heavy.yml @@ -8,6 +8,7 @@ flags: modules: - robots + - wayback config: modules: @@ -16,3 +17,6 @@ config: disable_post: False try_post_as_get: True try_get_as_post: True + wayback: + urls: True + parameters: True diff --git a/bbot/presets/web/lightfuzz-max.yml b/bbot/presets/web/lightfuzz-max.yml index 5ad33b817c..fb8dbb070c 100644 --- a/bbot/presets/web/lightfuzz-max.yml +++ b/bbot/presets/web/lightfuzz-max.yml @@ -12,3 +12,7 @@ config: avoid_wafs: False excavate: speculate_params: True # speculate potential parameters extracted from JSON/XML web responses + wayback: + urls: True + parameters: True + archive: True diff --git a/bbot/scanner/scanner.py b/bbot/scanner/scanner.py index edfe5ca8db..1bb41dddb5 100644 --- a/bbot/scanner/scanner.py +++ b/bbot/scanner/scanner.py @@ -906,8 +906,17 @@ def _cancel_tasks(self): tasks.append(self._stop_task) self.helpers.cancel_tasks_sync(tasks) - # process pool - self.helpers.process_pool.shutdown(cancel_futures=True) + # kill all pool workers and shut down (same logic as _reset_process_pool + # but synchronous, since we're tearing down the scan) + pool = self.helpers.process_pool + workers = list((pool._processes or {}).values()) + for proc in workers: + if proc.is_alive(): + proc.terminate() + pool.shutdown(wait=False, cancel_futures=True) + for proc in workers: + if proc.is_alive(): + proc.kill() self.debug("Finished cancelling all scan 
tasks") return tasks diff --git a/bbot/test/conftest.py b/bbot/test/conftest.py index dec15b4496..32f0ebafdd 100644 --- a/bbot/test/conftest.py +++ b/bbot/test/conftest.py @@ -418,6 +418,16 @@ def pytest_sessionfinish(session, exitstatus): for handler in handlers: logger.removeHandler(handler) + # Kill any orphaned ProcessPoolExecutor workers that could block exit + import multiprocessing + + for child in multiprocessing.active_children(): + if child.is_alive(): + child.terminate() + child.join(timeout=5) + if child.is_alive(): + child.kill() + # Wipe out BBOT home dir shutil.rmtree("/tmp/.bbot_test", ignore_errors=True) diff --git a/bbot/test/test_step_1/test_helpers.py b/bbot/test/test_step_1/test_helpers.py index 44a83d8394..783dac9b38 100644 --- a/bbot/test/test_step_1/test_helpers.py +++ b/bbot/test/test_step_1/test_helpers.py @@ -978,6 +978,32 @@ async def test_rm_temp_dir_at_exit(helpers): assert not temp_dir.exists() +# these must be top-level functions so they can be pickled for the subprocess +def _hang_forever(): + import time + + time.sleep(9999) + + +def _cpu_work(n): + return sum(range(n)) + + +@pytest.mark.asyncio +async def test_run_in_executor_mp(helpers): + # normal tasks should complete fine + result = await helpers.run_in_executor_mp(_cpu_work, 100_000) + assert result == sum(range(100_000)) + + # a hanging task should raise TimeoutError and auto-replace the pool + with pytest.raises(asyncio.TimeoutError): + await helpers.run_in_executor_mp(_hang_forever, _timeout=2) + + # pool should still work after a timeout (was replaced by _reset_process_pool) + result = await helpers.run_in_executor_mp(_cpu_work, 50_000, _timeout=30) + assert result == sum(range(50_000)) + + def test_simhash_similarity(helpers): """Test SimHash helper with increasingly different HTML pages.""" diff --git a/bbot/test/test_step_2/module_tests/test_module_excavate.py b/bbot/test/test_step_2/module_tests/test_module_excavate.py index 0af83ac193..96c5a94b00 100644 --- 
a/bbot/test/test_step_2/module_tests/test_module_excavate.py +++ b/bbot/test/test_step_2/module_tests/test_module_excavate.py @@ -1305,6 +1305,39 @@ def check(self, module_test, events): assert not web_parameter_outofscope, "Out of scope domain was emitted" +class TestExcavate_webparameter_ip_host(ModuleTestBase): + """Verify that when the httpx binary resolves a hostname to an IP (data["host"]), + excavate still uses the URL hostname for WEB_PARAMETER host — not the resolved IP. + + This test uses 'localhost' as the target. The httpx binary resolves it to 127.0.0.1 + and sets data["host"] = "127.0.0.1" in its JSON output. Without the archive_url guard + in _event_host(), this IP would be used as the WEB_PARAMETER host, putting it out of + scope and preventing downstream modules (like lightfuzz) from processing it. + """ + + targets = ["http://localhost:8888"] + modules_overrides = ["http", "excavate", "hunt"] + config_overrides = {"interactsh_disable": True} + + async def setup_after_prep(self, module_test): + await module_test.mock_dns({"localhost": {"A": ["127.0.0.1"]}}) + module_test.httpserver.expect_request("/").respond_with_data( + "<html><p>hello</p></html>", + status=200, + headers={"Set-Cookie": "session=abc123; Path=/"}, + ) + + def check(self, module_test, events): + web_params = [e for e in events if e.type == "WEB_PARAMETER" and e.data["name"] == "session"] + assert len(web_params) > 0, "WEB_PARAMETER for 'session' cookie was not emitted" + for wp in web_params: + assert wp.data["host"] != "127.0.0.1", ( + "WEB_PARAMETER host should be 'localhost', not the resolved IP '127.0.0.1'. 
" + "excavate._event_host() is using data['host'] (resolved IP) instead of event.host" + ) + assert wp.data["host"] == "localhost", f"WEB_PARAMETER host should be 'localhost', got '{wp.data['host']}'" + + class TestExcavateHeaders(ModuleTestBase): targets = ["http://127.0.0.1:8888/"] modules_overrides = ["excavate", "http", "hunt"] diff --git a/bbot/test/test_step_2/module_tests/test_module_wayback.py b/bbot/test/test_step_2/module_tests/test_module_wayback.py index 13ddac33fa..150d7de339 100644 --- a/bbot/test/test_step_2/module_tests/test_module_wayback.py +++ b/bbot/test/test_step_2/module_tests/test_module_wayback.py @@ -1,12 +1,627 @@ +import re +from urllib.parse import unquote + +from werkzeug.wrappers import Response + +from bbot.modules.wayback import wayback + from .base import ModuleTestBase class TestWayback(ModuleTestBase): async def setup_after_prep(self, module_test): module_test.blasthttp_mock.add_response( - url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original", + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", json=[["original"], ["http://asdf.blacklanternsecurity.com"]], ) def check(self, module_test, events): assert any(e.data == "asdf.blacklanternsecurity.com" for e in events), "Failed to detect subdomain" + + +class TestWaybackParameters(ModuleTestBase): + module_name = "wayback" + targets = ["blacklanternsecurity.com", "127.0.0.1"] + modules_overrides = ["wayback", "hunt"] + config_overrides = {"modules": {"wayback": {"urls": True, "parameters": True}}} + + async def setup_after_prep(self, module_test): + module_test.blasthttp_mock.add_response( + 
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + ["http://127.0.0.1:8888/page?foo=bar&baz=qux"], + ], + ) + # serve a response on the local httpserver so the httpx binary gets a 200 + module_test.set_expect_requests(expect_args={"uri": "/page"}, respond_args={"response_data": "alive"}) + + def check(self, module_test, events): + assert any(e.type == "URL_UNVERIFIED" and "127.0.0.1" in e.url and "/page" in e.url for e in events), ( + "Failed to emit URL_UNVERIFIED" + ) + assert any( + e.type == "WEB_PARAMETER" and e.data["name"] == "foo" and e.data["original_value"] == "bar" for e in events + ), "Failed to emit WEB_PARAMETER for foo" + assert any( + e.type == "WEB_PARAMETER" and e.data["name"] == "baz" and e.data["original_value"] == "qux" for e in events + ), "Failed to emit WEB_PARAMETER for baz" + # check that additional_params contains sibling params but excludes the current one + for e in events: + if e.type == "WEB_PARAMETER" and e.data["name"] == "foo": + assert e.data["additional_params"] == {"baz": "qux"}, ( + f"foo's additional_params wrong: {e.data['additional_params']}" + ) + if e.type == "WEB_PARAMETER" and e.data["name"] == "baz": + assert e.data["additional_params"] == {"foo": "bar"}, ( + f"baz's additional_params wrong: {e.data['additional_params']}" + ) + + +class TestWaybackInterestingFiles(ModuleTestBase): + module_name = "wayback" + modules_overrides = ["wayback"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True}}} + + async def setup_after_prep(self, module_test): + module_test.blasthttp_mock.add_response( + 
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[["original"], ["http://blacklanternsecurity.com/backup/site.zip"]], + ) + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/web/http://blacklanternsecurity.com/backup/site.zip", + headers={"Content-Type": "application/zip", "Content-Length": "1048576"}, + ) + + def check(self, module_test, events): + assert any( + e.type == "FINDING" + and "Interesting archived file found" in e.data["description"] + and "site.zip" in e.data["description"] + for e in events + ), "Failed to emit FINDING for interesting archived file" + for e in events: + if e.type == "FINDING" and "site.zip" in e.data.get("description", ""): + assert "web.archive.org" in e.data["url"] + + +class TestWaybackArchive(ModuleTestBase): + module_name = "wayback" + modules_overrides = ["wayback", "badsecrets", "excavate"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + sample_viewstate = """<html> +<form method="post" action="./query.aspx" id="form1"> +<div class="aspNetHidden"> +<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="rJdyYspajyiWEjvZ/SMXsU/1Q6Dp1XZ/19fZCABpGqWu+s7F1F/JT1s9mP9ED44fMkninhDc8eIq7IzSllZeJ9JVUME41i8ozheGunVSaESf4nBu" /> +</div> +<div class="aspNetHidden"> +<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="EDD8C9AE" /> +<input type="hidden" name="__VIEWSTATEENCRYPTED" id="__VIEWSTATEENCRYPTED" value="" /> +</div> +</form> +</html>""" + + async def setup_after_prep(self, module_test): + # wayback returns a URL on an unreachable port — httpx binary can't verify it + module_test.blasthttp_mock.add_response( + 
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[["original"], ["http://127.0.0.1:1/deadpage"]], + ) + # the archived page itself contains the vulnerable viewstate + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/deadpage", + text=self.sample_viewstate, + headers={"Content-Type": "text/html"}, + ) + + def check(self, module_test, events): + # the dead URL (port 1) should NOT be verified as live + assert not any(e.type == "URL" and "deadpage" in e.url for e in events) + # badsecrets should have found the vulnerability in the archived viewstate + assert any(e.type == "FINDING" and "Known Secret Found." in e.data["description"] for e in events), ( + "Failed to detect badsecrets vulnerability from archived content" + ) + # the vulnerability should reference the original URL, with "from-wayback" tag for provenance + for e in events: + if e.type == "FINDING" and "Known Secret Found." 
in e.data["description"]: + assert "127.0.0.1" in e.data["url"], ( + f"FINDING url should contain the original host, got: {e.data['url']}" + ) + assert "web.archive.org" not in e.data["url"], ( + f"FINDING url should NOT be an archive.org URL, got: {e.data['url']}" + ) + # web.archive.org should NOT appear as a DNS_NAME event + assert not any(e.type == "DNS_NAME" and e.data == "web.archive.org" for e in events), ( + "web.archive.org should not leak as a DNS_NAME event" + ) + + +class TestWaybackHttpHttpsDedup(ModuleTestBase): + """When CDX returns both http:// and https:// for the same URL, only emit https://.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + targets = ["blacklanternsecurity.com"] + config_overrides = {"modules": {"wayback": {"urls": True}}} + + async def setup_after_prep(self, module_test): + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + ["http://blacklanternsecurity.com/page"], + ["https://blacklanternsecurity.com/page"], + ], + ) + + def check(self, module_test, events): + url_unverified = [e for e in events if e.type == "URL_UNVERIFIED" and "/page" in e.url] + # should have only one, the https version + assert len(url_unverified) == 1, ( + f"Expected 1 URL_UNVERIFIED, got {len(url_unverified)}: {[e.url for e in url_unverified]}" + ) + assert url_unverified[0].url.startswith("https://"), f"Expected https URL, got: {url_unverified[0].url}" + + +class TestWaybackHttpOnlyKept(ModuleTestBase): + """When CDX returns only http:// (no https:// counterpart), emit the http:// URL.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + targets = ["blacklanternsecurity.com"] + config_overrides = {"modules": 
{"wayback": {"urls": True}}} + + async def setup_after_prep(self, module_test): + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + ["http://blacklanternsecurity.com/old-http-only"], + ], + ) + + def check(self, module_test, events): + url_unverified = [e for e in events if e.type == "URL_UNVERIFIED" and "/old-http-only" in e.url] + assert len(url_unverified) == 1, f"Expected 1 URL_UNVERIFIED, got {len(url_unverified)}" + assert url_unverified[0].url.startswith("http://"), ( + f"Expected http URL when no https exists, got: {url_unverified[0].url}" + ) + + +class TestWaybackCdnCgiBlacklist(ModuleTestBase): + """cdn-cgi/ URLs (Cloudflare infrastructure) should be filtered out.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + targets = ["blacklanternsecurity.com"] + config_overrides = {"modules": {"wayback": {"urls": True}}} + + async def setup_after_prep(self, module_test): + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + ["https://blacklanternsecurity.com/cdn-cgi/challenge-platform/h/g/something"], + ["https://blacklanternsecurity.com/real-page"], + ], + ) + + def check(self, module_test, events): + # cdn-cgi URL should be filtered + assert not any(e.type == "URL_UNVERIFIED" and "cdn-cgi" in e.url for e in events), ( + "cdn-cgi URL should have been filtered" + ) + # real page should still be emitted + assert any(e.type == 
"URL_UNVERIFIED" and "real-page" in e.url for e in events), ( + "Non-cdn-cgi URL should have been emitted" + ) + + +class TestWaybackArchiveHostField(ModuleTestBase): + """Archived HTTP_RESPONSE events should use original URL (not archive.org) to prevent cascade.""" + + module_name = "wayback" + modules_overrides = ["wayback", "excavate"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + async def setup_after_prep(self, module_test): + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[["original"], ["http://127.0.0.1:1/archived-page"]], + ) + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/archived-page", + text="<html><body>archived content</body></html>", + headers={"Content-Type": "text/html"}, + ) + + def check(self, module_test, events): + http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags] + assert len(http_responses) >= 1, "Expected at least one archived HTTP_RESPONSE" + for e in http_responses: + # URL should be the ORIGINAL (not archive.org) so event.host returns the original host + assert "web.archive.org" not in e.data["url"], ( + f"HTTP_RESPONSE url should NOT be an archive.org URL, got: {e.data['url']}" + ) + assert "127.0.0.1" in e.data["url"], ( + f"HTTP_RESPONSE url should contain original host, got: {e.data['url']}" + ) + # archive_url should contain the archive.org provenance URL + assert "web.archive.org" in e.data.get("archive_url", ""), ( + f"HTTP_RESPONSE archive_url should be the archive.org URL, got: {e.data.get('archive_url')}" + ) + # event.host should be the original host + 
assert str(e.host) != "web.archive.org", f"event.host should be original host, got: {e.host}" + # web.archive.org should NOT appear as a DNS_NAME event + assert not any(e.type == "DNS_NAME" and e.data == "web.archive.org" for e in events), ( + "web.archive.org should not leak as a DNS_NAME event" + ) + + +class TestWaybackArchiveHuntFinding(ModuleTestBase): + """When hunt processes a WEB_PARAMETER extracted from archived content, + the resulting FINDING should have the original host and original URL — NOT web.archive.org.""" + + module_name = "wayback" + modules_overrides = ["wayback", "excavate", "hunt"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + async def setup_after_prep(self, module_test): + # CDX returns a dead URL (port 1 = unreachable) with a huntable form + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[["original"], ["http://127.0.0.1:1/search"]], + ) + # the archived page contains a form with "redirect" — a known hunt parameter + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/search", + text='<html><form method="GET" action="/search"><input name="redirect" value="test"></form></html>', + headers={"Content-Type": "text/html"}, + ) + + def check(self, module_test, events): + # hunt should have found the "redirect" parameter as interesting + hunt_findings = [ + e for e in events if e.type == "FINDING" and "redirect" in e.data.get("description", "").lower() + ] + assert len(hunt_findings) >= 1, ( + f"Expected at least one hunt FINDING for 'redirect' param, got: " + f"{[(e.type, e.data.get('description', '')) for e in events 
if e.type == 'FINDING']}" + ) + for finding in hunt_findings: + # host must be the original, NOT web.archive.org + assert finding.data.get("host") != "web.archive.org", ( + f"Hunt FINDING host should NOT be web.archive.org, got: {finding.data}" + ) + assert finding.data.get("host") == "127.0.0.1", ( + f"Hunt FINDING host should be 127.0.0.1 (original), got: {finding.data.get('host')}" + ) + # URL should NOT contain web.archive.org — it should be the original URL + finding_url = finding.data.get("url", "") + assert "web.archive.org" not in finding_url, ( + f"Hunt FINDING url should NOT contain web.archive.org, got: {finding_url}" + ) + # from-wayback tag should propagate; archive_url is reachable via parent traversal + assert "from-wayback" in finding.tags, ( + f"Hunt FINDING should have from-wayback tag, got tags: {finding.tags}" + ) + assert finding.archive_url is not None, ( + "Hunt FINDING should be able to reach archive_url via parent traversal" + ) + assert "web.archive.org" in finding.archive_url, ( + f"Hunt FINDING archive_url should be archive.org URL, got: {finding.archive_url}" + ) + + # WEB_PARAMETERs from archived content should have from-wayback tag and reachable archive_url + archived_params = [ + e for e in events if e.type == "WEB_PARAMETER" and "redirect" in e.data.get("name", "").lower() + ] + for param in archived_params: + assert "from-wayback" in param.tags, ( + f"WEB_PARAMETER from archived content should have from-wayback tag, got tags: {param.tags}" + ) + assert param.archive_url is not None, ( + "WEB_PARAMETER from archived content should reach archive_url via parent traversal" + ) + + # web.archive.org should never appear as a DNS_NAME + assert not any(e.type == "DNS_NAME" and e.data == "web.archive.org" for e in events), ( + "web.archive.org should not leak as a DNS_NAME event" + ) + + +class TestWaybackLightfuzzXSS(ModuleTestBase): + """End-to-end: wayback discovers URL with param → httpx verifies → wayback emits WEB_PARAMETER → 
lightfuzz finds XSS."""
+
+    module_name = "wayback"
+    targets = ["blacklanternsecurity.com"]
+    modules_overrides = ["wayback", "httpx", "lightfuzz", "excavate"]
+    targets = ["blacklanternsecurity.com", "127.0.0.1"]
+    config_overrides = {
+        "interactsh_disable": True,
+        "modules": {
+            "wayback": {"urls": True, "parameters": True},
+            "lightfuzz": {"enabled_submodules": ["xss"]},
+        },
+    }
+
+    def request_handler(self, request):
+        qs = str(request.query_string.decode())
+        if "search=" in qs:
+            value = qs.split("search=")[1]
+            if "&" in value:
+                value = value.split("&")[0]
+            return Response(
+                f"<html><h1>Results for '{unquote(value)}'</h1></html>",
+                status=200,
+            )
+        return Response("<html><p>default page</p></html>", status=200)
+
+    async def setup_after_prep(self, module_test):
+        module_test.scan.modules["lightfuzz"].helpers.rand_string = lambda *args, **kwargs: "AAAAAAAAAAAAAA"
+        # CDX returns a URL with a search parameter pointing at the local httpserver
+        module_test.blasthttp_mock.add_response(
+            url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit",
+            json=[["original"], ["http://127.0.0.1:8888/?search=test"]],
+        )
+        # httpserver handles httpx verification and lightfuzz probes
+        expect_args = re.compile("/")
+        module_test.set_expect_requests_handler(expect_args=expect_args, request_handler=self.request_handler)
+
+    def check(self, module_test, events):
+        # wayback should have emitted WEB_PARAMETER for "search"
+        assert any(
+            e.type == "WEB_PARAMETER" and e.data["name"] == "search" and "wayback" in e.data["description"].lower()
+            for e in events
+        ), "wayback failed to emit WEB_PARAMETER for search"
+        # lightfuzz should have detected XSS
+        assert any(
+            e.type == "FINDING" and "XSS" in e.data["description"] and "search" in 
e.data["description"] + for e in events + ), ( + f"lightfuzz failed to detect XSS. FINDINGs: " + f"{[(e.data.get('description', '')) for e in events if e.type == 'FINDING']}" + ) + + +class TestWaybackStripBodyArtifacts(ModuleTestBase): + """Test that _strip_wayback_wrapper removes all archive.org artifacts from HTML body.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + + async def setup_after_prep(self, module_test): + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[["original"]], + ) + + def check(self, module_test, events): + w = wayback.__new__(wayback) + + # test stripping of rewritten URLs + body = '<a href="http://web.archive.org/web/20250101120000/http://example.com/page">link</a>' + stripped = w._strip_wayback_wrapper(body) + assert "web.archive.org" not in stripped + assert "http://example.com/page" in stripped + + # test stripping of toolbar + body = ( + "<!-- BEGIN WAYBACK TOOLBAR INSERT --><div>toolbar</div><!-- END WAYBACK TOOLBAR INSERT --><p>content</p>" + ) + stripped = w._strip_wayback_wrapper(body) + assert "toolbar" not in stripped + assert "content" in stripped + + # test stripping of stale archive.org references (e.g. 
/web/submit form) + body = '<form action="http://web.archive.org/web/submit"><input name="date"></form><p>real</p>' + stripped = w._strip_wayback_wrapper(body) + assert "web.archive.org" not in stripped + assert "real" in stripped + + # test stripping of protocol-relative archive.org URLs + body = '<script src="//archive.org/includes/athena.js"></script><p>content</p>' + stripped = w._strip_wayback_wrapper(body) + assert "archive.org" not in stripped + assert "content" in stripped + + # test stripping of relative wayback URL rewrites (href) + body = '<a href="/web/19971024185506/http://www.example.com/PDF%20files/data.pdf">link</a>' + stripped = w._strip_wayback_wrapper(body) + assert "/web/19971024185506/" not in stripped + assert "http://www.example.com/PDF%20files/data.pdf" in stripped + + # test stripping of relative wayback URL rewrites with modifier suffix (im_ for images) + body = '<img src="/web/19971024185506im_/http://www.example.com/images/logo.gif">' + stripped = w._strip_wayback_wrapper(body) + assert "/web/19971024185506im_/" not in stripped + assert "http://www.example.com/images/logo.gif" in stripped + + # test stripping of relative wayback URL rewrites with js_ suffix + body = '<script src="/web/20250529193232js_/https://www.example.com/script.js"></script>' + stripped = w._strip_wayback_wrapper(body) + assert "/web/20250529193232js_/" not in stripped + assert "https://www.example.com/script.js" in stripped + + +class TestWaybackArchiveBloomDedup(ModuleTestBase): + """When multiple archive URLs redirect to the same snapshot, bloom filter prevents duplicate HTTP_RESPONSEs.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + async def setup_after_prep(self, module_test): + # CDX returns two different dead URLs + module_test.blasthttp_mock.add_response( + 
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + ["http://127.0.0.1:1/page-a"], + ["http://127.0.0.1:1/page-b"], + ], + ) + # both archive URLs redirect to the same archived snapshot + redirect_target = "http://web.archive.org/web/20230101120000/http://127.0.0.1:1/same-page" + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/page-a", + status_code=301, + headers={"Location": redirect_target}, + ) + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/page-b", + status_code=301, + headers={"Location": redirect_target}, + ) + # two responses for the redirect target (one consumed per redirect) + for _ in range(2): + module_test.blasthttp_mock.add_response( + url=redirect_target, + text="<html><body>archived content</body></html>", + headers={"Content-Type": "text/html"}, + ) + + def check(self, module_test, events): + http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags] + assert len(http_responses) == 1, ( + f"Expected exactly 1 archived HTTP_RESPONSE (bloom dedup should prevent duplicate), got {len(http_responses)}" + ) + + +class TestWaybackArchiveRetry(ModuleTestBase): + """Archive fetches that fail transiently (connection error) should be retried and succeed.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + async def setup_after_prep(self, module_test): + # speed up retries for testing + module_test.scan.modules["wayback"]._archive_error_delay = 0.01 + module_test.scan.modules["wayback"]._archive_delay = 0 + 
module_test.blasthttp_mock.add_response(
+            url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit",
+            json=[["original"], ["http://127.0.0.1:1/retry-page"]],
+        )
+        # first attempt: 503 (archive.org overloaded)
+        module_test.blasthttp_mock.add_response(
+            url="http://web.archive.org/web/http://127.0.0.1:1/retry-page", status_code=503,
+        )
+        # retry attempt: 200
+        module_test.blasthttp_mock.add_response(
+            url="http://web.archive.org/web/http://127.0.0.1:1/retry-page",
+            text="<html><body>recovered content</body></html>",
+            headers={"Content-Type": "text/html"},
+        )
+
+    def check(self, module_test, events):
+        http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags]
+        assert len(http_responses) == 1, f"Expected 1 archived HTTP_RESPONSE from retry, got {len(http_responses)}"
+
+
+class TestWaybackGarbageUrlFilter(ModuleTestBase):
+    """Crawler-trap URLs with repeating path segments should be filtered out."""
+
+    module_name = "wayback"
+    modules_overrides = ["wayback"]
+    targets = ["blacklanternsecurity.com"]
+    config_overrides = {"modules": {"wayback": {"urls": True}}}
+
+    async def setup_after_prep(self, module_test):
+        # build a crawler-trap URL with repeating path segments (like the real-world example)
+        repeating = "/themes/sites/example.com".lstrip("/")
+        garbage_path = "/get-materials/" + "/".join([repeating] * 20)
+        garbage_url = f"https://blacklanternsecurity.com{garbage_path}"
+        module_test.blasthttp_mock.add_response(
+            
url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + [garbage_url], + ["https://blacklanternsecurity.com/real-page"], + ], + ) + + def check(self, module_test, events): + # garbage URL should be filtered + assert not any(e.type == "URL_UNVERIFIED" and "get-materials" in e.url for e in events), ( + "Crawler-trap URL with repeating path segments should have been filtered" + ) + # real page should still be emitted + assert any(e.type == "URL_UNVERIFIED" and "real-page" in e.url for e in events), ( + "Non-garbage URL should have been emitted" + ) + + +class TestWaybackGarbageUrlLength(ModuleTestBase): + """Excessively long URLs should be filtered out as garbage.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + targets = ["blacklanternsecurity.com"] + config_overrides = {"modules": {"wayback": {"urls": True}}} + + async def setup_after_prep(self, module_test): + # URL exceeding 2000 character limit + long_url = "https://blacklanternsecurity.com/" + "a" * 2000 + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[ + ["original"], + [long_url], + ["https://blacklanternsecurity.com/normal-page"], + ], + ) + + def check(self, module_test, events): + # long URL should be filtered + assert not any(e.type == "URL_UNVERIFIED" and "aaaa" in e.url for e in events), ( + "Excessively long URL should have been filtered" + ) + # normal page should still be emitted + assert any(e.type == "URL_UNVERIFIED" and 
"normal-page" in e.url for e in events), ( + "Normal-length URL should have been emitted" + ) + + +class TestWaybackArchive429Retry(ModuleTestBase): + """Archive fetches that get 429 rate-limited should back off and retry successfully.""" + + module_name = "wayback" + modules_overrides = ["wayback"] + targets = ["blacklanternsecurity.com", "127.0.0.1"] + config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}} + + async def setup_after_prep(self, module_test): + # speed up delays for testing + module_test.scan.modules["wayback"]._archive_429_default_delay = 0.01 + module_test.scan.modules["wayback"]._archive_error_delay = 0.01 + module_test.scan.modules["wayback"]._archive_delay = 0 + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit", + json=[["original"], ["http://127.0.0.1:1/rate-limited-page"]], + ) + # first attempt: 429 rate limited + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/rate-limited-page", + status_code=429, + headers={"Retry-After": "1"}, + ) + # retry after backoff: 200 + module_test.blasthttp_mock.add_response( + url="http://web.archive.org/web/http://127.0.0.1:1/rate-limited-page", + text="<html><body>content after rate limit</body></html>", + headers={"Content-Type": "text/html"}, + ) + + def check(self, module_test, events): + http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags] + assert len(http_responses) == 1, ( + f"Expected 1 archived HTTP_RESPONSE after 429 retry, got {len(http_responses)}" + ) diff --git a/docs/modules/wayback.md b/docs/modules/wayback.md new file mode 100644 index 0000000000..a3f30f3e23 --- /dev/null +++ b/docs/modules/wayback.md 
@@ -0,0 +1,141 @@ +# Wayback + +## Overview + +The Wayback module queries [archive.org's Wayback Machine](https://web.archive.org/) CDX API to discover subdomains, URLs, web parameters, and archived content for your targets. By default it operates as a passive subdomain enumeration source, but with its extended features enabled it becomes a powerful tool for discovering dead URLs, extracting parameters for fuzzing, and retrieving archived versions of pages that no longer exist. + +* Watches: **DNS_NAME**, **URL** +* Produces: **URL_UNVERIFIED**, **DNS_NAME**, **WEB_PARAMETER**, **HTTP_RESPONSE**, **FINDING** +* Flags: `passive`, `subdomain-enum`, `safe` + +## Default Behavior + +By default, wayback only emits **DNS_NAME** events (subdomains) extracted from archived URLs. This is the behavior you get when wayback is included via the `subdomain-enum` preset. No URLs, parameters, or archived content are fetched. + +To unlock the more advanced features, you need to enable them via configuration options or use one of the wayback presets. + +## Configuration Options + +| Option | Type | Default | Description | +|---------------------|------|---------|-------------------------------------------------------------------------------------------------------| +| `urls` | bool | `False` | Emit `URL_UNVERIFIED` events in addition to `DNS_NAME`s. Required for `parameters` and `archive`. | +| `parameters` | bool | `False` | Extract `WEB_PARAMETER` events from query strings in archived URLs. Requires `urls=True`. | +| `archive` | bool | `False` | Fetch archived versions of dead URLs and emit `HTTP_RESPONSE` events. Requires `urls=True`. | +| `garbage_threshold` | int | `10` | Deduplicate similar URLs if they appear in groups of this size or larger. Lower = less noise. | + +## Features + +### URL Discovery (`urls: True`) + +When `urls` is enabled, wayback emits `URL_UNVERIFIED` events for every unique URL found in the Wayback Machine's index. 
These are tagged with `from-wayback` and sent through BBOT's normal URL verification pipeline (httpx). + +Before emission, URLs go through several cleanup steps: + +- **URL collapsing** - Groups of similar URLs (e.g. pagination, search results) are deduplicated based on the `garbage_threshold` setting +- **HTTP/HTTPS deduplication** - When both `http://` and `https://` variants exist, only the HTTPS version is kept +- **Blacklist filtering** - URLs containing known CDN/WAF paths (e.g. `_Incapsula_Resource`, `/cdn-cgi/`) are filtered out + +### Parameter Extraction (`parameters: True`) + +When `parameters` is enabled (requires `urls: True`), wayback extracts query string parameters from archived URLs and emits them as `WEB_PARAMETER` events. This is useful for discovering GET parameters that can be fed into fuzzing modules like lightfuzz. + +Parameters are cached and only emitted after the corresponding URL has been verified as live by httpx. This prevents emitting parameters for URLs that no longer exist. + +!!! note + Parameter extraction requires at least one module that consumes `WEB_PARAMETER` events to be active (e.g. `lightfuzz`, `hunt`, `paramminer_getparams`). If no such module is present, parameter extraction is automatically disabled with a warning. + +### Archive Retrieval (`archive: True`) + +When `archive` is enabled (requires `urls: True`), wayback fetches the actual archived content of URLs from the Wayback Machine and emits them as `HTTP_RESPONSE` events. This is particularly useful for: + +- **Finding secrets in dead pages** - Archived versions may contain API keys, credentials, or other sensitive data that modules like `badsecrets` can detect +- **Discovering hidden functionality** - Pages that have been removed may reveal application structure or endpoints + +Archive retrieval runs during the module's `finish()` phase, after all URLs have been discovered and verified. 
URLs that are confirmed live (2xx status) are automatically removed from the archive queue, so only dead URLs are fetched from the archive. + +The archived content goes through extensive cleanup to remove Wayback Machine artifacts: + +- Wayback toolbar/header/footer HTML is stripped +- Rewritten URLs (e.g. `http://web.archive.org/web/20250101/http://example.com/page`) are restored to originals +- Wayback-injected headers (`x-archive-*`, `set-cookie`) are removed +- The event's host, port, and URL are set to the original target, not `web.archive.org` + +Archived HTTP_RESPONSE events are tagged with `from-wayback` and `archived`. + +!!! warning + Static file extensions (images, CSS, JS, etc.) are automatically skipped during archive retrieval to avoid unnecessary traffic. + +### Interesting File Detection + +When `urls` is enabled, wayback also checks for potentially interesting archived files by looking for URLs with sensitive extensions: `.zip`, `.sql`, `.bak`, `.env`, `.config`, `.tar.gz`, `.tar.bz2`. + +When found, these are verified with a HEAD request to archive.org. If the archived file exists and isn't a soft-404, a `FINDING` event is emitted with details about the file (including size if available). These findings are tagged with `from-wayback`, `archived`, and `interesting-file`. + +## Presets + +Wayback comes with two dedicated presets, and is also integrated into several other presets: + +### `-p wayback` + +Basic URL discovery mode. Includes `subdomain-enum` and enables `urls: True`. Good for general recon when you want to discover historical URLs alongside subdomains. + +```bash +bbot -p wayback -t evilcorp.com +``` + +### `-p wayback-heavy` + +Full-featured mode with URL discovery, parameter extraction, and archive retrieval. Also includes `badsecrets` to scan archived content for exposed secrets. 
+ +```bash +bbot -p wayback-heavy -t evilcorp.com +``` + +### Integration with other presets + +Wayback's extended features are also enabled in several other presets: + +| Preset | Wayback Config | +|-----------------------|-----------------------------------------| +| `kitchen-sink` | `urls`, `parameters`, `archive` | +| `dirbust-heavy` | `urls` | +| `nuclei-intense` | `urls` | +| `lightfuzz-heavy` | `urls`, `parameters` | +| `lightfuzz-superheavy`| `urls`, `parameters`, `archive` | + +## Example Commands + +```bash +# Basic subdomain enumeration (default behavior, no URL emission) +bbot -p subdomain-enum -t evilcorp.com +``` + +```bash +# URL discovery via wayback preset +bbot -p wayback -t evilcorp.com +``` + +```bash +# Full wayback integration with archived content and parameter extraction +bbot -p wayback-heavy -t evilcorp.com +``` + +```bash +# Enable wayback URLs alongside a nuclei scan +bbot -p nuclei -m wayback -c modules.wayback.urls=True --allow-deadly -t evilcorp.com +``` + +```bash +# Pair with lightfuzz for parameter fuzzing using archived parameters +bbot -p lightfuzz-heavy spider -t evilcorp.com --allow-deadly +``` + +```bash +# Enable wayback features via command-line config +bbot -p subdomain-enum -c modules.wayback.urls=True modules.wayback.parameters=True modules.wayback.archive=True -t evilcorp.com +``` + +```bash +# Adjust garbage threshold for cleaner output (more aggressive deduplication) +bbot -p wayback -c modules.wayback.garbage_threshold=5 -t evilcorp.com +``` diff --git a/docs/scanning/configuration.md b/docs/scanning/configuration.md index bbc5aa7a22..a0bdf16401 100644 --- a/docs/scanning/configuration.md +++ b/docs/scanning/configuration.md @@ -600,7 +600,9 @@ In addition to the stated options for each module, the following universal optio | modules.trufflehog.version | str | trufflehog version | 3.90.8 | | modules.urlscan.urls | bool | Emit URLs in addition to DNS_NAMEs | False | | modules.virustotal.api_key | str | VirusTotal API 
Key | | +| modules.wayback.archive | bool | fetch archived versions of dead URLs from the Wayback Machine and emit HTTP_RESPONSE events (requires urls=true) | False | | modules.wayback.garbage_threshold | int | Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data) | 10 | +| modules.wayback.parameters | bool | emit WEB_PARAMETER events for query parameters discovered in archived URLs (requires urls=true) | False | | modules.wayback.urls | bool | emit URLs in addition to DNS_NAMEs | False | | modules.asset_inventory.output_file | str | Set a custom output file | | | modules.asset_inventory.recheck | bool | When use_previous=True, don't retain past details like open ports or findings. Instead, allow them to be rediscovered by the new scan | False | diff --git a/docs/scanning/presets_list.md b/docs/scanning/presets_list.md index 98a4d126ec..1b824df6af 100644 --- a/docs/scanning/presets_list.md +++ b/docs/scanning/presets_list.md @@ -282,7 +282,7 @@ Everything everywhere all at once ??? 
note "`kitchen-sink.yml`" ```yaml title="~/.bbot/presets/kitchen-sink.yml" description: Everything everywhere all at once - + include: - subdomain-enum - cloud-enum @@ -294,6 +294,15 @@ Everything everywhere all at once - dirbust-light - web-screenshots - baddns-heavy + + config: + modules: + baddns: + enable_references: True + wayback: + urls: True + parameters: True + archive: True ``` @@ -340,10 +349,11 @@ Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force paramet flags: - web-paramminer - + modules: - robots - + - wayback + config: modules: lightfuzz: @@ -351,6 +361,9 @@ Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force paramet disable_post: False try_post_as_get: True try_get_as_post: True + wayback: + urls: True + parameters: True ``` Category: web @@ -400,7 +413,7 @@ Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer s include: - lightfuzz-heavy - + config: url_querystring_collapse: False # in cases where the same parameter is observed multiple times, fuzz them individually instead of collapsing them into a single parameter modules: @@ -410,6 +423,10 @@ Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer s avoid_wafs: False excavate: speculate_params: True # speculate potential parameters extracted from JSON/XML web responses + wayback: + urls: True + parameters: True + archive: True ``` Category: web @@ -802,7 +819,78 @@ Take screenshots of webpages -Modules: [0]("") +Modules: [3]("`gowitness`, `httpx`, `social`") + +## **web-thorough** + +Aggressive web scan + +??? 
note "`web-thorough.yml`" + ```yaml title="~/.bbot/presets/web-thorough.yml" + description: Aggressive web scan + + include: + # include the web-basic preset + - web-basic + + flags: + - web-thorough + ``` + + + +Modules: [32]("`ajaxpro`, `aspnet_bin_exposure`, `azure_realm`, `baddns`, `badsecrets`, `bucket_amazon`, `bucket_digitalocean`, `bucket_firebase`, `bucket_google`, `bucket_microsoft`, `bypass403`, `dotnetnuke`, `ffuf_shortnames`, `filedownload`, `generic_ssrf`, `git`, `graphql_introspection`, `host_header`, `httpx`, `hunt`, `iis_shortnames`, `lightfuzz`, `ntlm`, `oauth`, `reflected_parameters`, `retirejs`, `robots`, `securitytxt`, `smuggler`, `sslcert`, `telerik`, `url_manipulation`") + +## **wayback** + +Discover URLs and interesting archived files via the Wayback Machine + +??? note "`wayback.yml`" + ```yaml title="~/.bbot/presets/wayback.yml" + description: Discover URLs and interesting archived files via the Wayback Machine + + include: + - subdomain-enum + + modules: + - wayback + + config: + modules: + wayback: + urls: True + ``` + + + +Modules: [52]("`anubisdb`, `asn`, `azure_realm`, `azure_tenant`, `baddns_direct`, `baddns_zone`, `bevigil`, `bufferoverrun`, `builtwith`, `c99`, `censys_dns`, `certspotter`, `chaos`, `crt`, `crt_db`, `digitorus`, `dnsbimi`, `dnsbrute`, `dnsbrute_mutations`, `dnscaa`, `dnscommonsrv`, `dnsdumpster`, `dnstlsrpt`, `fullhunt`, `github_codesearch`, `github_org`, `hackertarget`, `httpx`, `hunterio`, `ipneighbor`, `leakix`, `myssl`, `oauth`, `otx`, `passivetotal`, `postman`, `postman_download`, `rapiddns`, `securitytrails`, `securitytxt`, `shodan_dns`, `shodan_idb`, `sitedossier`, `social`, `sslcert`, `subdomaincenter`, `subdomainradar`, `trickest`, `urlscan`, `virustotal`, `wayback`, `httpx`") + +## **wayback-heavy** + +Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection + +??? 
note "`wayback-heavy.yml`" + ```yaml title="~/.bbot/presets/wayback-heavy.yml" + description: Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection + + include: + - subdomain-enum + + modules: + - wayback + - badsecrets + + config: + modules: + wayback: + urls: True + parameters: True + archive: True + ``` + + + +Modules: [53]("`anubisdb`, `asn`, `azure_realm`, `azure_tenant`, `baddns_direct`, `baddns_zone`, `badsecrets`, `bevigil`, `bufferoverrun`, `builtwith`, `c99`, `censys_dns`, `certspotter`, `chaos`, `crt`, `crt_db`, `digitorus`, `dnsbimi`, `dnsbrute`, `dnsbrute_mutations`, `dnscaa`, `dnscommonsrv`, `dnsdumpster`, `dnstlsrpt`, `fullhunt`, `github_codesearch`, `github_org`, `hackertarget`, `httpx`, `hunterio`, `ipneighbor`, `leakix`, `myssl`, `oauth`, `otx`, `passivetotal`, `postman`, `postman_download`, `rapiddns`, `securitytrails`, `securitytxt`, `shodan_dns`, `shodan_idb`, `sitedossier`, `social`, `sslcert`, `subdomaincenter`, `subdomainradar`, `trickest`, `urlscan`, `virustotal`, `wayback`, `httpx`") <!-- END BBOT PRESET YAML --> ## Table of Default Presets @@ -810,34 +898,34 @@ Modules: [0]("") Here is a the same data, but in a table: <!-- BBOT PRESETS --> -| Preset | Category | Description | # Modules | Modules | -|-------------------|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|---------------------------------------------------------------------------------------------| -| baddns | | Check for subdomain takeovers and other DNS issues. | 1 | baddns | -| baddns-heavy | | Run all baddns modules and submodules. 
| 3 | baddns, baddns_direct, baddns_zone | -| cloud-enum | | Enumerate cloud resources such as storage buckets, etc. | 0 | | -| code-enum | | Enumerate Git repositories, Docker images, etc. | 0 | | -| dirbust-heavy | web | Recursive web directory brute-force (aggressive) | 3 | ffuf, httpx, wayback | -| dirbust-light | web | Basic web directory brute-force (surface-level directories only) | 1 | ffuf | -| dotnet-audit | web | Comprehensive scan for all IIS/.NET specific modules and module settings | 8 | ajaxpro, aspnet_bin_exposure, badsecrets, dotnetnuke, ffuf, ffuf_shortnames, httpx, telerik | -| email-enum | | Enumerate email addresses from APIs, web crawling, etc. | 0 | | -| fast | | Scan only the provided targets as fast as possible - no extra discovery | 0 | | -| iis-shortnames | web | Recursively enumerate IIS shortnames | 0 | | -| kitchen-sink | | Everything everywhere all at once | 7 | baddns, baddns_direct, baddns_zone, ffuf, httpx, hunt, reflected_parameters | -| lightfuzz | web | Default fuzzing: all 9 submodules (cmdi, crypto, path, serial, sqli, ssti, xss, esi, ssrf) plus companion modules (badsecrets, hunt, reflected_parameters). POST fuzzing disabled but try_post_as_get enabled, so POST params are retested as GET. Skips confirmed WAFs. | 6 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters | -| lightfuzz-heavy | web | Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force parameter discovery (headers, GET params, cookies), POST request fuzzing enabled, try_get_as_post enabled (GET params retested as POST), and robots.txt parsing. Still skips confirmed WAFs. | 7 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters, robots | -| lightfuzz-light | web | Minimal fuzzing: only path traversal, SQLi, and XSS submodules. No POST requests. No companion modules. Safest option for running alongside larger scans with minimal overhead. 
| 3 | httpx, lightfuzz, portfilter | -| lightfuzz-max | web | Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer skipped, each unique parameter-value pair is fuzzed individually (no collapsing), common headers like X-Forwarded-For are fuzzed even if not observed, and potential parameters are speculated from JSON/XML response bodies. Significantly increases scan time. | 7 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters, robots | -| lightfuzz-xss | web | XSS-only: enables only the xss submodule with paramminer_getparams and reflected_parameters. POST disabled, no query string collapsing. Example of a focused single-submodule preset. | 5 | httpx, lightfuzz, paramminer_getparams, portfilter, reflected_parameters | -| nuclei | nuclei | Run nuclei scans against all discovered targets | 3 | httpx, nuclei, portfilter | -| nuclei-budget | nuclei | Run nuclei scans against all discovered targets, using budget mode to look for low hanging fruit with greatly reduced number of requests | 3 | httpx, nuclei, portfilter | -| nuclei-heavy | nuclei | Run nuclei scans against all discovered targets, allowing for spidering, against ALL URLs, and with additional discovery modules. 
| 6 | httpx, nuclei, portfilter, robots, urlscan, wayback | -| nuclei-technology | nuclei | Run nuclei scans against all discovered targets, running templates which match discovered technologies | 3 | httpx, nuclei, portfilter | -| paramminer | web | Discover new web parameters via brute-force, and analyze them with additional modules | 3 | httpx, hunt, reflected_parameters | -| spider | | Recursive web spider | 1 | httpx | -| spider-heavy | | Recursive web spider with more aggressive settings | 1 | httpx | -| subdomain-enum | | Enumerate subdomains via APIs, brute-force | 0 | | -| tech-detect | | Detect technologies via Nuclei, and FingerprintX | 2 | fingerprintx, nuclei | -| web | | Quick web scan | 0 | | -| web-heavy | | Aggressive web scan | 0 | | -| web-screenshots | | Take screenshots of webpages | 0 | | +| Preset | Category | Description | # Modules | Modules | +|----------------------|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| baddns-heavy | | Run all baddns modules and submodules. | 4 | baddns, baddns_direct, baddns_zone, httpx | +| cloud-enum | | Enumerate cloud resources such as storage buckets, etc. | 58 | anubisdb, asn, azure_realm, azure_tenant, baddns, baddns_direct, baddns_zone, bevigil, bucket_amazon, bucket_digitalocean, bucket_file_enum, bucket_firebase, bucket_google, bucket_microsoft, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback | +| code-enum | | Enumerate Git repositories, Docker images, etc. 
| 20 | apkpure, code_repository, docker_pull, dockerhub, git, git_clone, gitdumper, github_codesearch, github_org, github_usersearch, github_workflows, gitlab_com, gitlab_onprem, google_playstore, httpx, jadx, postman, postman_download, social, trufflehog | +| dirbust-heavy | web | Recursive web directory brute-force (aggressive) | 5 | ffuf, ffuf_shortnames, httpx, iis_shortnames, wayback | +| dirbust-light | web | Basic web directory brute-force (surface-level directories only) | 4 | ffuf, ffuf_shortnames, httpx, iis_shortnames | +| dotnet-audit | web | Comprehensive scan for all IIS/.NET specific modules and module settings | 9 | ajaxpro, aspnet_bin_exposure, badsecrets, dotnetnuke, ffuf, ffuf_shortnames, httpx, iis_shortnames, telerik | +| email-enum | | Enumerate email addresses from APIs, web crawling, etc. | 8 | dehashed, dnscaa, dnstlsrpt, emailformat, hunterio, pgp, skymem, sslcert | +| fast | | Scan only the provided targets as fast as possible - no extra discovery | 0 | | +| iis-shortnames | web | Recursively enumerate IIS shortnames | 3 | ffuf_shortnames, httpx, iis_shortnames | +| kitchen-sink | | Everything everywhere all at once | 90 | anubisdb, apkpure, asn, azure_realm, azure_tenant, baddns, baddns_direct, baddns_zone, badsecrets, bevigil, bucket_amazon, bucket_digitalocean, bucket_file_enum, bucket_firebase, bucket_google, bucket_microsoft, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, code_repository, crt, crt_db, dehashed, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, docker_pull, dockerhub, emailformat, ffuf, ffuf_shortnames, filedownload, fullhunt, git, git_clone, gitdumper, github_codesearch, github_org, github_usersearch, github_workflows, gitlab_com, gitlab_onprem, google_playstore, gowitness, graphql_introspection, hackertarget, httpx, hunt, hunterio, iis_shortnames, ipneighbor, jadx, leakix, myssl, ntlm, oauth, otx, paramminer_cookies, paramminer_getparams, 
paramminer_headers, passivetotal, pgp, postman, postman_download, rapiddns, reflected_parameters, robots, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, skymem, social, sslcert, subdomaincenter, subdomainradar, trickest, trufflehog, urlscan, virustotal, wayback | +| lightfuzz-heavy | web | Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force parameter discovery (headers, GET params, cookies), POST request fuzzing enabled, try_get_as_post enabled (GET params retested as POST), and robots.txt parsing. Still skips confirmed WAFs. | 10 | badsecrets, httpx, hunt, lightfuzz, paramminer_cookies, paramminer_getparams, paramminer_headers, portfilter, reflected_parameters, robots | +| lightfuzz-light | web | Minimal fuzzing: only path traversal, SQLi, and XSS submodules. No POST requests. No companion modules. Safest option for running alongside larger scans with minimal overhead. | 3 | httpx, lightfuzz, portfilter | +| lightfuzz-max | web | Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer skipped, each unique parameter-value pair is fuzzed individually (no collapsing), common headers like X-Forwarded-For are fuzzed even if not observed, and potential parameters are speculated from JSON/XML response bodies. Significantly increases scan time. | 10 | badsecrets, httpx, hunt, lightfuzz, paramminer_cookies, paramminer_getparams, paramminer_headers, portfilter, reflected_parameters, robots | +| lightfuzz-xss | web | XSS-only: enables only the xss submodule with paramminer_getparams and reflected_parameters. POST disabled, no query string collapsing. Example of a focused single-submodule preset. 
| 5 | httpx, lightfuzz, paramminer_getparams, portfilter, reflected_parameters | +| nuclei | nuclei | Run nuclei scans against all discovered targets | 3 | httpx, nuclei, portfilter | +| nuclei-budget | nuclei | Run nuclei scans against all discovered targets, using budget mode to look for low hanging fruit with greatly reduced number of requests | 3 | httpx, nuclei, portfilter | +| nuclei-heavy | nuclei | Run nuclei scans against all discovered targets, allowing for spidering, against ALL URLs, and with additional discovery modules. | 6 | httpx, nuclei, portfilter, robots, urlscan, wayback | +| nuclei-technology | nuclei | Run nuclei scans against all discovered targets, running templates which match discovered technologies | 3 | httpx, nuclei, portfilter | +| paramminer | web | Discover new web parameters via brute-force, and analyze them with additional modules | 6 | httpx, hunt, paramminer_cookies, paramminer_getparams, paramminer_headers, reflected_parameters | +| spider | | Recursive web spider | 1 | httpx | +| spider-heavy | | Recursive web spider with more aggressive settings | 1 | httpx | +| subdomain-enum | | Enumerate subdomains via APIs, brute-force | 51 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback | +| tech-detect | | Detect technologies via Nuclei, and FingerprintX | 3 | fingerprintx, httpx, nuclei | +| web-basic | | Quick web scan | 18 | azure_realm, baddns, badsecrets, bucket_amazon, bucket_firebase, bucket_google, bucket_microsoft, 
ffuf_shortnames, filedownload, git, graphql_introspection, httpx, iis_shortnames, ntlm, oauth, robots, securitytxt, sslcert | +| web-screenshots | | Take screenshots of webpages | 3 | gowitness, httpx, social | +| web-thorough | | Aggressive web scan | 32 | ajaxpro, aspnet_bin_exposure, azure_realm, baddns, badsecrets, bucket_amazon, bucket_digitalocean, bucket_firebase, bucket_google, bucket_microsoft, bypass403, dotnetnuke, ffuf_shortnames, filedownload, generic_ssrf, git, graphql_introspection, host_header, httpx, hunt, iis_shortnames, lightfuzz, ntlm, oauth, reflected_parameters, retirejs, robots, securitytxt, smuggler, sslcert, telerik, url_manipulation | +| wayback | | Discover URLs and interesting archived files via the Wayback Machine | 52 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback | +| wayback-heavy | | Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection | 53 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, badsecrets, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, 
subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback | <!-- END BBOT PRESETS --> diff --git a/mkdocs.yml b/mkdocs.yml index 498b0b4a6d..2355c67f5a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -32,6 +32,7 @@ nav: - Modules: - List of Modules: modules/list_of_modules.md - Nuclei: modules/nuclei.md + - Wayback: modules/wayback.md - Custom YARA Rules: modules/custom_yara_rules.md - Lightfuzz: modules/lightfuzz.md - Misc: