diff --git a/bbot/core/event/base.py b/bbot/core/event/base.py
index 4ffbe7dc40..b7edb150bf 100644
--- a/bbot/core/event/base.py
+++ b/bbot/core/event/base.py
@@ -258,7 +258,8 @@ def __init__(
self.data = self._sanitize_data(data)
except Exception as e:
log.trace(traceback.format_exc())
- raise ValidationError(f'Error sanitizing event data "{data}" for type "{self.type}": {e}')
+ data_preview = str(data)[:200] + "..." if len(str(data)) > 200 else str(data)
+ raise ValidationError(f'Error sanitizing event data "{data_preview}" for type "{self.type}": {e}')
if not self.data:
raise ValidationError(f'Invalid event data "{data}" for type "{self.type}"')
@@ -626,7 +627,7 @@ def parent(self, parent):
self.web_spider_distance = getattr(parent, "web_spider_distance", 0)
event_has_url = getattr(self, "parsed_url", None) is not None
for t in parent.tags:
- if t in ("affiliate",):
+ if t in ("affiliate", "from-wayback"):
self.add_tag(t)
elif t.startswith("mutation-"):
self.add_tag(t)
@@ -655,6 +656,26 @@ def parent_uuid(self):
return parent_uuid
return self._parent_uuid
+ @property
+ def archive_url(self):
+ """Traverse the parent chain to find the nearest archive_url.
+
+ The 'from-wayback' tag signals that this event descends from archived content.
+ The actual archive URL is stored only in the data dict of the originating
+ wayback HTTP_RESPONSE; this property walks upward to find it.
+ """
+ if "from-wayback" not in self.tags:
+ return None
+ event = self
+ while event is not None:
+ if isinstance(event.data, dict) and "archive_url" in event.data:
+ return event.data["archive_url"]
+ parent = getattr(event, "parent", None)
+ if parent is None or parent is event:
+ break
+ event = parent
+ return None
+
@property
def validators(self):
"""
@@ -1783,6 +1804,7 @@ class _data_validator(BaseModel):
full_url: Optional[str] = None
path: Optional[str] = None
cves: Optional[list[str]] = None
+ archive_url: Optional[str] = None
_validate_url = field_validator("url")(validators.validate_url)
_validate_host = field_validator("host")(validators.validate_host)
_validate_severity = field_validator("severity")(validators.validate_severity)
@@ -2185,7 +2207,8 @@ def make_event(
data = validators.validate_host(data)
except Exception as e:
log.trace(traceback.format_exc())
- raise ValidationError(f'Error sanitizing event data "{data}" for type "{event_type}": {e}')
+ data_preview = str(data)[:200] + "..." if len(str(data)) > 200 else str(data)
+ raise ValidationError(f'Error sanitizing event data "{data_preview}" for type "{event_type}": {e}')
data_is_ip = is_ip(data)
if event_type == "DNS_NAME" and data_is_ip:
event_type = "IP_ADDRESS"
diff --git a/bbot/core/helpers/helper.py b/bbot/core/helpers/helper.py
index 0d2a1dbb6b..925b736516 100644
--- a/bbot/core/helpers/helper.py
+++ b/bbot/core/helpers/helper.py
@@ -1,4 +1,6 @@
import os
+import sys
+import asyncio
import logging
from pathlib import Path
import multiprocessing as mp
@@ -75,15 +77,12 @@ def __init__(self, preset):
self._loop = None
- # multiprocessing thread pool
+ # multiprocessing process pool
start_method = mp.get_start_method()
if start_method != "spawn":
self.warning(f"Multiprocessing spawn method is set to {start_method}.")
-
- # we spawn 1 fewer processes than cores
- # this helps to avoid locking up the system or competing with the main python process for cpu time
- num_processes = max(1, mp.cpu_count() - 1)
- self.process_pool = ProcessPoolExecutor(max_workers=num_processes)
+ self.process_pool = self._create_process_pool()
+ self._pool_reset_lock = asyncio.Lock()
self._cloud = None
self._blasthttp_client = None
@@ -214,6 +213,18 @@ def loop(self):
self._loop.set_default_executor(self._io_executor)
return self._loop
+ @staticmethod
+ def _create_process_pool():
+ # we spawn 1 fewer processes than cores
+ # this helps to avoid locking up the system or competing with the main python process for cpu time
+ num_processes = max(1, mp.cpu_count() - 1)
+ pool_kwargs = {"max_workers": num_processes}
+ # max_tasks_per_child replaces workers after N tasks, preventing memory leaks
+ # and reducing the chance of a degraded worker process causing hangs
+ if sys.version_info >= (3, 11):
+ pool_kwargs["max_tasks_per_child"] = 25
+ return ProcessPoolExecutor(**pool_kwargs)
+
def run_in_executor_io(self, callback, *args, **kwargs):
"""
Run a synchronous task in the event loop's default thread pool executor
@@ -237,17 +248,55 @@ def run_in_executor_cpu(self, callback, *args, **kwargs):
callback = partial(callback, **kwargs)
return self.loop.run_in_executor(self._cpu_executor, callback, *args)
- def run_in_executor_mp(self, callback, *args, **kwargs):
+ async def run_in_executor_mp(self, callback, *args, **kwargs):
"""
- Same as run_in_executor_io() except with a process pool executor
- Use only in cases where callback is CPU-bound
+ Same as run_in_executor_io() except with a process pool executor.
+ Use only in cases where callback is CPU-bound.
+
+ Includes a timeout (default 300s) to prevent indefinite hangs if a child process dies or the pool enters a broken state.
+ On timeout, the entire pool is terminated and replaced so that stuck workers cannot accumulate and starve the scan.
+
+ Pass ``_timeout=seconds`` to override the default timeout.
Examples:
Execute callback:
>>> result = await self.helpers.run_in_executor_mp(callback_fn, arg1, arg2)
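+            Override the default timeout (seconds):
+            >>> result = await self.helpers.run_in_executor_mp(callback_fn, arg1, _timeout=60)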
"""
+ timeout = kwargs.pop("_timeout", 300)
callback = partial(callback, **kwargs)
- return self.loop.run_in_executor(self.process_pool, callback, *args)
+ future = self.loop.run_in_executor(self.process_pool, callback, *args)
+ try:
+ return await asyncio.wait_for(future, timeout=timeout)
+ except asyncio.TimeoutError:
+ log.warning(f"Process pool task timed out after {timeout}s, killing stuck workers and replacing pool")
+ await self._reset_process_pool()
+ raise
+
+ async def _reset_process_pool(self):
+ """Terminate all workers in the current process pool and replace it.
+
+ This is the nuclear option — every in-flight task on the old pool will fail with BrokenProcessPool.
+ We accept that trade-off because a timeout means something is genuinely broken, and leaving the stuck worker alive would permanently consume a pool slot.
+
+ # TODO: Python 3.14 adds ProcessPoolExecutor.terminate_workers()
+ # and kill_workers() (https://github.com/python/cpython/pull/130849).
+ # Once we drop 3.13 support we can replace the _processes access
+ # with those official methods.
+ """
+ async with self._pool_reset_lock:
+ old_pool = self.process_pool
+ self.process_pool = self._create_process_pool()
+ # snapshot workers before shutdown (shutdown sets _processes = None)
+ workers = list((old_pool._processes or {}).values())
+ # terminate workers before shutdown so stuck ones don't block
+ for proc in workers:
+ if proc.is_alive():
+ proc.terminate()
+ old_pool.shutdown(wait=False, cancel_futures=True)
+ # escalate to SIGKILL for anything that ignored SIGTERM
+ for proc in workers:
+ if proc.is_alive():
+ proc.kill()
@property
def in_tests(self):
diff --git a/bbot/core/helpers/misc.py b/bbot/core/helpers/misc.py
index 5b2a9ae7d6..8c213a0000 100644
--- a/bbot/core/helpers/misc.py
+++ b/bbot/core/helpers/misc.py
@@ -2723,6 +2723,7 @@ def get_waf_strings():
return [
"The requested URL was rejected",
"This content has been blocked",
+ "You don't have permission to access ",
]
diff --git a/bbot/defaults.yml b/bbot/defaults.yml
index 311cacdb7a..f70f9099bb 100644
--- a/bbot/defaults.yml
+++ b/bbot/defaults.yml
@@ -265,6 +265,7 @@ parameter_blacklist:
- .AspNetCore.Session
- PHPSESSID
- __cf_bm
+ - _cfuvid
- f5_cspm
parameter_blacklist_prefixes:
diff --git a/bbot/modules/http.py b/bbot/modules/http.py
index e7e45859b3..c9c5155a88 100644
--- a/bbot/modules/http.py
+++ b/bbot/modules/http.py
@@ -42,6 +42,7 @@ async def setup(self):
self.max_response_size = self.config.get("max_response_size", 5242880)
self.store_responses = self.config.get("store_responses", False)
self.client = self.helpers.blasthttp
+ self.waf_yara_rule = self.helpers.yara.compile_strings(self.helpers.get_waf_strings(), nocase=True)
return True
async def filter_event(self, event):
@@ -274,6 +275,13 @@ async def handle_batch(self, *events):
self.debug(f'Discarding 404 from "{url}"')
continue
+ # discard 4xx responses that contain WAF strings
+ if 400 <= status_code < 500:
+ body = j.get("body", "")
+ if body and await self.helpers.yara.match(self.waf_yara_rule, body):
+ self.debug(f'Discarding WAF {status_code} from "{url}"')
+ continue
+
# main URL
tags = [f"status-{status_code}"]
diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py
index 893dcc4300..8d24829c38 100644
--- a/bbot/modules/internal/excavate.py
+++ b/bbot/modules/internal/excavate.py
@@ -366,6 +366,42 @@ def in_bl(self, value):
return False
+ def _is_archived(self, event):
+ """Check if an event represents archived wayback content."""
+ return isinstance(event.data, dict) and "archive_url" in event.data
+
+ def _event_host(self, event):
+ """Get the effective host from an event.
+
+ For archived wayback content, data["host"] contains the original target hostname
+ (since data["url"] points to archive.org). For regular events, we use event.host.
+
+ NOTE: Regular HTTP_RESPONSE events also have data["host"], but it contains the
+ resolved IP from the httpx binary — NOT a hostname override.
+ """
+ if self._is_archived(event) and event.data.get("host"):
+ return str(event.data["host"])
+ return str(event.host)
+
+ def _event_base_url(self, event):
+ """Get the effective base URL from an event.
+
+ For archived wayback content, reconstructs the original URL from override fields
+ (host/scheme/port/path) since parsed_url points to archive.org.
+ For regular events, returns event.parsed_url directly.
+ """
+ if not self._is_archived(event):
+ return event.parsed_url
+ scheme = event.data.get("scheme", event.parsed_url.scheme)
+ host = self._event_host(event)
+ port = event.data.get("port")
+ if port is not None:
+ port = int(port)
+ if not ((scheme == "http" and port == 80) or (scheme == "https" and port == 443)):
+ host = f"{host}:{port}"
+ path = event.data.get("path", event.parsed_url.path)
+ return urlparse(f"{scheme}://{host}{path}")
+
def url_unparse(self, param_type, parsed_url):
# Reconstructs a URL, optionally omitting the query string based on remove_querystring configuration value.
if param_type == "GETPARAM":
@@ -641,8 +677,9 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
# The endpoint is usually a form action - we should use it if we have it. If not, default to URL.
else:
- # Use the original URL as the base and resolve the endpoint correctly in case of relative paths
- base_url = f"{event.parsed_url.scheme}://{event.parsed_url.netloc}{event.parsed_url.path}"
+ # Use the effective base URL (which may differ from parsed_url for archived content)
+ event_base = self.excavate._event_base_url(event)
+ base_url = f"{event_base.scheme}://{event_base.netloc}{event_base.path}"
if not self.excavate.remove_querystring and len(event.parsed_url.query) > 0:
base_url += f"?{event.parsed_url.query}"
url = urljoin(base_url, endpoint)
@@ -986,6 +1023,34 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
if yara_results:
event.add_tag("login-page")
+ class DirectoryListingExtractor(ExcavateRule):
+ description = "Detects directory listing pages from web servers."
+ signatures = {
+            "Apache_Nginx": '"Index of /"',
+ "IIS": '"[To Parent Directory]"',
+ "Python_HTTP_Server": '"Directory listing for"',
+ "Generic_Directory_Listing": '"Directory Listing"',
+ }
+ yara_rules = {}
+
+ def __init__(self, excavate):
+ super().__init__(excavate)
+ signature_component_list = []
+ for signature_name, signature in self.signatures.items():
+ signature_component_list.append(rf"${signature_name} = {signature}")
+ signature_component = " ".join(signature_component_list)
+ self.yara_rules["directory_listing"] = (
+ f'rule directory_listing {{meta: description = "contains a directory listing" strings: {signature_component} condition: any of them}}'
+ )
+
+ async def process(self, yara_results, event, yara_rule_settings, discovery_context):
+ for identifier in yara_results.keys():
+ for findings in yara_results[identifier]:
+ event_data = {
+ "description": f"{discovery_context} {yara_rule_settings.description} ({identifier})"
+ }
+ await self.report(event_data, event, yara_rule_settings, discovery_context, event_type="FINDING")
+
def add_yara_rule(self, rule_name, rule_content, rule_instance):
rule_instance.name = rule_name
self.yara_rules_dict[rule_name] = rule_content
@@ -1013,12 +1078,13 @@ async def emit_custom_parameters(self, event, config_key, param_type, descriptio
# Emits WEB_PARAMETER events for custom headers and cookies from the configuration.
custom_params = self.scan.web_config.get(config_key, {})
for param_name, param_value in custom_params.items():
+ event_base = self._event_base_url(event)
await self.emit_web_parameter(
- host=event.parsed_url.hostname,
+ host=self._event_host(event),
param_type=param_type,
name=param_name,
original_value=param_value,
- url=self.url_unparse(param_type, event.parsed_url),
+ url=self.url_unparse(param_type, event_base),
description=f"HTTP Extracted Parameter [{param_name}] ({description_suffix})",
additional_params=_exclude_key(custom_params, param_name),
event=event,
@@ -1134,7 +1200,7 @@ async def search(self, data, event, content_type, discovery_context="HTTP respon
if results:
for parameter_name, original_value in results:
await self.emit_web_parameter(
- host=str(event.host),
+ host=self._event_host(event),
param_type="SPECULATIVE",
name=parameter_name,
original_value=original_value,
@@ -1142,7 +1208,7 @@ async def search(self, data, event, content_type, discovery_context="HTTP respon
description=f"HTTP Extracted Parameter (speculative from {source_type} content) [{parameter_name}]",
additional_params={},
event=event,
- context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {str(event.host)}",
+ context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {self._event_host(event)}",
)
return
@@ -1194,7 +1260,7 @@ async def handle_event(self, event, **kwargs):
) in extract_params_url(event.parsed_url):
if self.in_bl(parameter_name) is False:
await self.emit_web_parameter(
- host=parsed_url.hostname,
+ host=self._event_host(event),
param_type="GETPARAM",
name=parameter_name,
original_value=original_value,
@@ -1228,12 +1294,13 @@ async def handle_event(self, event, **kwargs):
if self.in_bl(cookie_name) is False:
self.assigned_cookies[cookie_name] = cookie_value
+ event_base = self._event_base_url(event)
await self.emit_web_parameter(
- host=str(event.host),
+ host=self._event_host(event),
param_type="COOKIE",
name=cookie_name,
original_value=cookie_value,
- url=self.url_unparse("COOKIE", event.parsed_url),
+ url=self.url_unparse("COOKIE", event_base),
description=f"Set-Cookie Assigned Cookie [{cookie_name}]",
additional_params={},
event=event,
@@ -1270,10 +1337,10 @@ async def handle_event(self, event, **kwargs):
original_value,
regex_name,
additional_params,
- ) in extract_params_location(header_value, event.parsed_url):
+ ) in extract_params_location(header_value, self._event_base_url(event)):
if self.in_bl(parameter_name) is False:
await self.emit_web_parameter(
- host=parsed_url.hostname,
+ host=self._event_host(event),
param_type="GETPARAM",
name=parameter_name,
original_value=original_value,
diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py
index 49010f451a..5f2338b8af 100644
--- a/bbot/modules/wayback.py
+++ b/bbot/modules/wayback.py
@@ -1,87 +1,674 @@
+import re
+from collections import Counter
from datetime import datetime
+from urllib.parse import parse_qs, urlparse, urlunparse
+import orjson
+
+from bbot.core.helpers.misc import get_file_extension
+from bbot.core.helpers.validators import clean_url
from bbot.modules.templates.subdomain_enum import subdomain_enum
+def _parse_cdx_response(text):
+ """Parse CDX JSON response text into a URL list. Designed to run in a separate process."""
+ j = orjson.loads(text)
+ if not isinstance(j, list):
+ return None
+ return [result[0] for result in j[1:] if result]
+
+
class wayback(subdomain_enum):
flags = ["safe", "passive", "subdomain-enum"]
- watched_events = ["DNS_NAME"]
- produced_events = ["URL_UNVERIFIED", "DNS_NAME"]
+ watched_events = ["DNS_NAME", "URL"]
+ produced_events = ["URL_UNVERIFIED", "DNS_NAME", "WEB_PARAMETER", "HTTP_RESPONSE", "FINDING"]
meta = {
- "description": "Query archive.org's API for subdomains",
+ "description": "Query archive.org's Wayback Machine for subdomains, URLs, parameters, and archived content",
"created_date": "2022-04-01",
"author": "@liquidsec",
}
- options = {"urls": False, "garbage_threshold": 10}
+ options = {"urls": False, "garbage_threshold": 10, "parameters": False, "archive": False, "max_records": 100000}
options_desc = {
"urls": "emit URLs in addition to DNS_NAMEs",
"garbage_threshold": "Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data)",
+ "parameters": "emit WEB_PARAMETER events for query parameters discovered in archived URLs (requires urls=true)",
+ "archive": "fetch archived versions of dead URLs from the Wayback Machine and emit HTTP_RESPONSE events (requires urls=true)",
+ "max_records": "Maximum number of URLs to fetch from the CDX API",
}
in_scope_only = True
base_url = "http://web.archive.org"
+ url_blacklist = ["_Incapsula_Resource", "/cdn-cgi/"]
+
+ interesting_extensions = frozenset({"zip", "sql", "bak", "env", "config"})
+ interesting_compound_extensions = frozenset({"tar.gz", "tar.bz2"})
+
+ # maximum URL length before we consider it garbage (crawler traps produce absurdly long URLs)
+ _max_url_length = 2000
+ # if any single path segment repeats more than this many times, it's a path loop / crawler trap
+ _max_path_segment_repeats = 3
+
+ def _is_garbage_url(self, url):
+ """Detect crawler-trap URLs with repeating path segments or excessive length."""
+ if len(url) > self._max_url_length:
+ return True
+ path = urlparse(url).path
+ if not path:
+ return False
+ segments = [s for s in path.split("/") if s]
+ if not segments:
+ return False
+ counts = Counter(segments)
+ return counts.most_common(1)[0][1] > self._max_path_segment_repeats
+
+ def _is_interesting_file(self, url):
+ ext = get_file_extension(url)
+ if ext and ext.lower() in self.interesting_extensions:
+ return True
+ lower_url = url.lower()
+ return any(lower_url.endswith(f".{ce}") for ce in self.interesting_compound_extensions)
async def setup(self):
self.urls = self.config.get("urls", False)
+ self.parameters = self.config.get("parameters", False)
+ if self.parameters:
+ if not self.urls:
+ self.hugewarning("parameters option requires urls to be enabled. Please add modules.wayback.urls=True")
+ return False
+ consumers = [m for m, mod in self.scan.modules.items() if "WEB_PARAMETER" in mod.watched_events]
+ if not consumers:
+ self.warning("Disabling parameter extraction because no modules consume WEB_PARAMETER events")
+ self.parameters = False
+ else:
+ self.hugeinfo(
+ f"Parameter extraction enabled because the following modules consume WEB_PARAMETER events: [{', '.join(consumers)}]"
+ )
+ self.archive = self.config.get("archive", False)
+ if self.archive and not self.urls:
+ self.hugewarning("archive option requires urls to be enabled. Please add modules.wayback.urls=True")
+ return False
self.garbage_threshold = self.config.get("garbage_threshold", 10)
+ self.max_records = self.config.get("max_records", 100000)
+ self._parameter_cache = {}
+ self._archive_cache = {}
+ # bloom filter to deduplicate archive fetches by the response URL archive.org actually served
+ # (multiple request URLs can redirect to the same archived snapshot)
+ # 32M bits (~4MB) supports ~400K entries with negligible false-positive rate
+ self._archive_bloom = self.helpers.bloom_filter(32000000)
return await super().setup()
+ def _incoming_dedup_hash(self, event):
+ # URL events are handled differently (parameter/archive cache eviction),
+ # so they should not be deduplicated by the subdomain_enum strategy
+ if event.type == "URL":
+ return hash(event.url), "url_event"
+ return super()._incoming_dedup_hash(event)
+
+ async def filter_event(self, event):
+ # URL events are handled separately and don't need subdomain_enum's wildcard/cloud filtering
+ if event.type == "URL":
+ return True
+ return await super().filter_event(event)
+
async def handle_event(self, event):
+ if event.type == "URL":
+ await self._handle_url_event(event)
+ return
+
query = self.make_query(event)
- for result, event_type in await self.query(query):
+ results, interesting_files = await self.query(query)
+ for result, event_type in results:
+ tags = ["from-wayback"] if event_type == "URL_UNVERIFIED" else []
await self.emit_event(
result,
event_type,
event,
+ tags=tags,
abort_if=self.abort_if,
context=f'{{module}} queried archive.org for "{query}" and found {{event.type}}: {{event.pretty_string}}',
)
- async def query(self, query):
- results = set()
- waybackurl = f"{self.base_url}/cdx/search/cdx?url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original"
- r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 10)
- if not r:
- self.warning(f'Error connecting to archive.org for query "{query}"')
- return results
+ if interesting_files:
+ await self._check_interesting_files(interesting_files, event)
+
+ # pair unpaired archive cache entries with their parent DNS_NAME event
+ if self.archive:
+ paired = 0
+ for url_str in list(self._archive_cache):
+ if isinstance(self._archive_cache[url_str], str):
+ self._archive_cache[url_str] = (self._archive_cache[url_str], event)
+ paired += 1
+ if paired:
+ self.debug(f"Paired {paired} archive cache entries with parent event {event.data}")
+
+ async def _handle_url_event(self, event):
+ """Process a URL event: evict live URLs from archive cache and emit cached parameters."""
+ if self.archive:
+ status_code = 0
+ for tag in event.tags:
+ if tag.startswith("status-"):
+ try:
+ status_code = int(tag.split("-", 1)[1])
+ except ValueError:
+ pass
+ break
+ # only 2xx counts as live — 3xx (e.g. http→https 301 to a 404) doesn't confirm the page exists
+ if 200 <= status_code < 300:
+ cleaned = clean_url(event.url).geturl()
+ if self._archive_cache.pop(cleaned, None) is not None:
+ self.verbose(f"URL is live (status {status_code}), removed from archive cache: {cleaned}")
+
+ cached = self._parameter_cache.pop(clean_url(event.url).geturl(), None)
+ if cached is not None:
+ flat_params, base_url = cached
+ for param_name, original_value in flat_params.items():
+ data = {
+ "host": str(event.host),
+ "type": "GETPARAM",
+ "name": param_name,
+ "original_value": original_value,
+ "url": base_url,
+ "description": f"HTTP Extracted Parameter [{param_name}] (wayback)",
+ "additional_params": {k: v for k, v in flat_params.items() if k != param_name},
+ }
+ self.verbose(f"Emitting WEB_PARAMETER [{param_name}] from archived URL {base_url}")
+ await self.emit_event(
+ data,
+ "WEB_PARAMETER",
+ event,
+ tags=["from-wayback"],
+ context=f"{{module}} found query parameter [{param_name}] in archived URL and emitted {{event.type}}",
+ )
+
+ async def _check_interesting_files(self, interesting_files, event):
+ """HEAD-check interesting archived files and emit FINDINGs for those that exist."""
+ self.verbose(f"Checking {len(interesting_files)} interesting archived files")
+
+ # build URL list and mapping back to metadata
+ url_metadata = {}
+ for cleaned_url, raw_url in interesting_files.items():
+ archive_url = f"{self.base_url}/web/{raw_url}"
+ url_metadata[archive_url] = (cleaned_url, raw_url)
+
+ for archive_url, (cleaned_url, raw_url) in url_metadata.items():
+ try:
+ r = await self.helpers.request(
+ archive_url, method="HEAD", timeout=self.http_timeout + 30, follow_redirects=True
+ )
+ except Exception as e:
+ self.debug(f"Interesting file HEAD check error for {raw_url}: {e}")
+ continue
+
+ if not r or r.status_code != 200:
+ status = getattr(r, "status_code", "no response") if r else "no response"
+ self.debug(f"Interesting file HEAD check failed for {raw_url}: status={status}")
+ continue
+ # guard against soft 404s (archive.org returns text/html for missing pages)
+ content_type = r.headers.get("content-type", "")
+ if "text/html" in content_type:
+ self.debug(f"Interesting file skipped (soft 404): {raw_url}")
+ continue
+
+ ext = get_file_extension(cleaned_url)
+ desc = f"Interesting archived file found (.{ext}): {raw_url}"
+ content_length = r.headers.get("content-length", "")
+ if content_length:
+ try:
+ size = int(content_length)
+ if size > 1024 * 1024:
+ desc += f" ({size / (1024 * 1024):.1f} MB)"
+ elif size > 1024:
+ desc += f" ({size / 1024:.1f} KB)"
+ else:
+ desc += f" ({size} bytes)"
+ except ValueError:
+ pass
+
+ self.verbose(f"Interesting archived file confirmed: {raw_url}")
+ parsed = urlparse(raw_url)
+ await self.emit_event(
+ {
+ "description": desc,
+ "severity": "LOW",
+ "name": "Interesting Archived File",
+ "confidence": "MEDIUM",
+ "url": str(r.url),
+ "host": str(parsed.hostname or ""),
+ },
+ "FINDING",
+ event,
+ tags=["from-wayback", "archived", "interesting-file"],
+ context=f"{{module}} found interesting archived file: {raw_url}",
+ )
+
+ # CDX API filters applied server-side to reduce response size
+ _cdx_filters = (
+ "filter=!statuscode:404",
+ "filter=!statuscode:301",
+ "filter=!statuscode:302",
+ "filter=!mimetype:image/.*",
+ "filter=!mimetype:text/css",
+ "filter=!mimetype:warc/revisit",
+ )
+
+ async def _fetch_cdx(self, query):
+ """Fetch URLs from the CDX API with retries and 429 handling. Returns the URL list or None on failure."""
+ params = f"url={self.helpers.quote(query)}&matchType=domain&output=json&fl=original&collapse=original"
+ params += f"&limit={self.max_records}"
+ params += "&" + "&".join(self._cdx_filters)
+ waybackurl = f"{self.base_url}/cdx/search/cdx?{params}"
+ r = None
+ last_error = None
+ for i in range(3):
+ try:
+ r = await self.helpers.request(waybackurl, timeout=self.http_timeout + 60, raise_error=True)
+ except Exception as e:
+ last_error = str(e)
+ r = None
+ if r is not None:
+ if r.status_code == 200:
+ break
+ if r.status_code == 429:
+ retry_after = r.headers.get("retry-after", "")
+ try:
+ delay = min(int(retry_after), 120)
+ except (ValueError, TypeError):
+ delay = self._archive_429_default_delay
+ last_error = "HTTP 429 rate limited"
+ self.verbose(f'Archive.org rate limit (429) for CDX query "{query}", sleeping {delay}s')
+ await self.helpers.sleep(delay)
+ r = None
+ continue
+ last_error = f"HTTP status {r.status_code}"
+ r = None
+ if i < 2:
+ self.verbose(
+ f'Error connecting to archive.org for query "{query}" ({last_error}), retrying ({i + 1}/2)'
+ )
+ await self.helpers.sleep(2**i)
+ if r is None:
+ self.warning(f'Error connecting to archive.org for query "{query}": {last_error}')
+ return None
+ # parse JSON + extract URLs in a separate process to avoid blocking the event loop
+ # (CDX responses can contain 100k+ entries)
try:
- j = r.json()
- assert type(j) == list
+ urls = await self.helpers.run_in_executor_mp(_parse_cdx_response, r.text)
except Exception:
+ urls = None
+ if urls is None:
self.warning(f'Error JSON-decoding archive.org response for query "{query}"')
- return results
+ return None
+ return urls
+
+ def _pre_process_urls(self, urls):
+ """Extract parameters, archive URLs, and interesting files from raw CDX URLs before collapse."""
+ raw_url_params = {}
+ archive_urls = {}
+ interesting_files = {}
- urls = []
- for result in j[1:]:
+ for url in urls:
try:
- url = result[0]
- urls.append(url)
- except KeyError:
+ parsed = urlparse(url)
+ if any(bl in url for bl in self.url_blacklist):
+ continue
+ if self._is_garbage_url(url):
+ continue
+ if not (parsed.hostname and self.scan.in_scope(parsed.hostname)):
+ continue
+ # skip non-HTTP URLs (e.g. ftp:// archived by the Wayback Machine)
+ if parsed.scheme not in ("http", "https"):
+ continue
+
+ cleaned_str = clean_url(url).geturl()
+
+ if self.archive and cleaned_str not in archive_urls:
+ archive_urls[cleaned_str] = url
+
+ if self.urls and self._is_interesting_file(url) and cleaned_str not in interesting_files:
+ interesting_files[cleaned_str] = url
+
+ if self.parameters and parsed.query:
+ params = parse_qs(parsed.query)
+ flat_params = {k: v[0] for k, v in params.items()}
+ if flat_params:
+ if cleaned_str not in raw_url_params:
+ raw_url_params[cleaned_str] = flat_params
+ else:
+ raw_url_params[cleaned_str].update(flat_params)
+ except Exception:
continue
+ if archive_urls or interesting_files or raw_url_params:
+ self.debug(
+ f"Pre-processed {len(urls):,} URLs: {len(archive_urls):,} archive candidates, "
+ f"{len(interesting_files):,} interesting files, {len(raw_url_params):,} URLs with parameters"
+ )
+
+ return raw_url_params, archive_urls, interesting_files
+
+ async def query(self, query):
+ results = set()
+
+ urls = await self._fetch_cdx(query)
+ if urls is None:
+ return results, {}
+
self.verbose(f"Found {len(urls):,} URLs for {query}")
+ # filter blacklisted and garbage URLs before any further processing
+ urls = [
+ url for url in urls if not any(bl in url for bl in self.url_blacklist) and not self._is_garbage_url(url)
+ ]
+
+ # pre-extract metadata from raw URLs before collapse strips query strings
+ raw_url_params, archive_urls, interesting_files = {}, {}, {}
+ if self.parameters or self.archive or self.urls:
+ raw_url_params, archive_urls, interesting_files = self._pre_process_urls(urls)
+
+ if not urls:
+ return results, interesting_files
+
dns_names = set()
collapsed_urls = 0
start_time = datetime.now()
- # we consolidate URLs to cut down on garbage data
- # this is CPU-intensive, so we do it in its own core.
+ # consolidate URLs to cut down on garbage data (CPU-intensive, runs in separate process)
parsed_urls = await self.helpers.run_in_executor_mp(
self.helpers.validators.collapse_urls,
urls,
threshold=self.garbage_threshold,
)
- for parsed_url in parsed_urls:
- collapsed_urls += 1
- if not self.urls:
+ if self.urls:
+ # deduplicate http/https variants — drop http when https also exists
+ url_dedup = {}
+ for parsed_url in parsed_urls:
+ collapsed_urls += 1
+ https_key = parsed_url._replace(scheme="https").geturl()
+ if https_key not in url_dedup or parsed_url.scheme == "https":
+ url_dedup[https_key] = parsed_url
+ for parsed_url in url_dedup.values():
+ url_str = parsed_url.geturl()
+ results.add((url_str, "URL_UNVERIFIED"))
+ if self.parameters and url_str in raw_url_params:
+ base_url = urlunparse((parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", ""))
+ self._parameter_cache[url_str] = (raw_url_params[url_str], base_url)
+ if self.archive and url_str in archive_urls:
+ self._archive_cache[url_str] = archive_urls[url_str]
+ else:
+ for parsed_url in parsed_urls:
+ collapsed_urls += 1
dns_name = parsed_url.hostname
h = hash(dns_name)
if h not in dns_names:
dns_names.add(h)
results.add((dns_name, "DNS_NAME"))
- else:
- results.add((parsed_url.geturl(), "URL_UNVERIFIED"))
- end_time = datetime.now()
- duration = self.helpers.human_timedelta(end_time - start_time)
+
+ duration = self.helpers.human_timedelta(datetime.now() - start_time)
self.verbose(f"Collapsed {len(urls):,} -> {collapsed_urls:,} URLs in {duration}")
- return results
+ return results, interesting_files
+
+ _wayback_head_re = re.compile(
+        r'content'
+ stripped = w._strip_wayback_wrapper(body)
+ assert "archive.org" not in stripped
+ assert "content" in stripped
+
+ # test stripping of relative wayback URL rewrites (href)
+    body = '<a href="/web/19971024185506/http://www.example.com/PDF%20files/data.pdf">link</a>'
+ stripped = w._strip_wayback_wrapper(body)
+ assert "/web/19971024185506/" not in stripped
+ assert "http://www.example.com/PDF%20files/data.pdf" in stripped
+
+ # test stripping of relative wayback URL rewrites with modifier suffix (im_ for images)
+    body = '<img src="/web/19971024185506im_/http://www.example.com/images/logo.gif">'
+ stripped = w._strip_wayback_wrapper(body)
+ assert "/web/19971024185506im_/" not in stripped
+ assert "http://www.example.com/images/logo.gif" in stripped
+
+ # test stripping of relative wayback URL rewrites with js_ suffix
+    body = '<script src="/web/20250529193232js_/https://www.example.com/script.js"></script>'
+ stripped = w._strip_wayback_wrapper(body)
+ assert "/web/20250529193232js_/" not in stripped
+ assert "https://www.example.com/script.js" in stripped
+
+
+class TestWaybackArchiveBloomDedup(ModuleTestBase):
+ """When multiple archive URLs redirect to the same snapshot, bloom filter prevents duplicate HTTP_RESPONSEs."""
+
+ module_name = "wayback"
+ modules_overrides = ["wayback"]
+ targets = ["blacklanternsecurity.com", "127.0.0.1"]
+ config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}}
+
+ async def setup_after_prep(self, module_test):
+ # CDX returns two different dead URLs
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit",
+ json=[
+ ["original"],
+ ["http://127.0.0.1:1/page-a"],
+ ["http://127.0.0.1:1/page-b"],
+ ],
+ )
+ # both archive URLs redirect to the same archived snapshot
+ redirect_target = "http://web.archive.org/web/20230101120000/http://127.0.0.1:1/same-page"
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/web/http://127.0.0.1:1/page-a",
+ status_code=301,
+ headers={"Location": redirect_target},
+ )
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/web/http://127.0.0.1:1/page-b",
+ status_code=301,
+ headers={"Location": redirect_target},
+ )
+ # two responses for the redirect target (one consumed per redirect)
+ for _ in range(2):
+ module_test.blasthttp_mock.add_response(
+ url=redirect_target,
+ text="archived content",
+ headers={"Content-Type": "text/html"},
+ )
+
+ def check(self, module_test, events):
+ http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags]
+ assert len(http_responses) == 1, (
+ f"Expected exactly 1 archived HTTP_RESPONSE (bloom dedup should prevent duplicate), got {len(http_responses)}"
+ )
+
+
+    """Archive fetches that fail transiently (e.g. HTTP 503) should be retried and succeed."""
+ """Archive fetches that fail transiently (connection error) should be retried and succeed."""
+
+ module_name = "wayback"
+ modules_overrides = ["wayback"]
+ targets = ["blacklanternsecurity.com", "127.0.0.1"]
+ config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}}
+
+ async def setup_after_prep(self, module_test):
+ # speed up retries for testing
+ module_test.scan.modules["wayback"]._archive_error_delay = 0.01
+ module_test.scan.modules["wayback"]._archive_delay = 0
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit",
+ json=[["original"], ["http://127.0.0.1:1/retry-page"]],
+ )
+ # first attempt: 503 (archive.org overloaded)
+        module_test.blasthttp_mock.add_response(
+            url="http://web.archive.org/web/http://127.0.0.1:1/retry-page",
+            status_code=503,
+        )
+ # retry attempt: 200
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/web/http://127.0.0.1:1/retry-page",
+ text="recovered content",
+ headers={"Content-Type": "text/html"},
+ )
+
+ def check(self, module_test, events):
+ http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags]
+ assert len(http_responses) == 1, f"Expected 1 archived HTTP_RESPONSE from retry, got {len(http_responses)}"
+
+
+class TestWaybackGarbageUrlFilter(ModuleTestBase):
+ """Crawler-trap URLs with repeating path segments should be filtered out."""
+
+ module_name = "wayback"
+ modules_overrides = ["wayback"]
+ targets = ["blacklanternsecurity.com"]
+ config_overrides = {"modules": {"wayback": {"urls": True}}}
+
+ async def setup_after_prep(self, module_test):
+ # build a crawler-trap URL with repeating path segments (like the real-world example)
+ repeating = "/themes/sites/example.com".lstrip("/")
+ garbage_path = "/get-materials/" + "/".join([repeating] * 20)
+ garbage_url = f"https://blacklanternsecurity.com{garbage_path}"
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit",
+ json=[
+ ["original"],
+ [garbage_url],
+ ["https://blacklanternsecurity.com/real-page"],
+ ],
+ )
+
+ def check(self, module_test, events):
+ # garbage URL should be filtered
+ assert not any(e.type == "URL_UNVERIFIED" and "get-materials" in e.url for e in events), (
+ "Crawler-trap URL with repeating path segments should have been filtered"
+ )
+ # real page should still be emitted
+ assert any(e.type == "URL_UNVERIFIED" and "real-page" in e.url for e in events), (
+ "Non-garbage URL should have been emitted"
+ )
+
+
+class TestWaybackGarbageUrlLength(ModuleTestBase):
+ """Excessively long URLs should be filtered out as garbage."""
+
+ module_name = "wayback"
+ modules_overrides = ["wayback"]
+ targets = ["blacklanternsecurity.com"]
+ config_overrides = {"modules": {"wayback": {"urls": True}}}
+
+ async def setup_after_prep(self, module_test):
+ # URL exceeding 2000 character limit
+ long_url = "https://blacklanternsecurity.com/" + "a" * 2000
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit",
+ json=[
+ ["original"],
+ [long_url],
+ ["https://blacklanternsecurity.com/normal-page"],
+ ],
+ )
+
+ def check(self, module_test, events):
+ # long URL should be filtered
+ assert not any(e.type == "URL_UNVERIFIED" and "aaaa" in e.url for e in events), (
+ "Excessively long URL should have been filtered"
+ )
+ # normal page should still be emitted
+ assert any(e.type == "URL_UNVERIFIED" and "normal-page" in e.url for e in events), (
+ "Normal-length URL should have been emitted"
+ )
+
+
+class TestWaybackArchive429Retry(ModuleTestBase):
+ """Archive fetches that get 429 rate-limited should back off and retry successfully."""
+
+ module_name = "wayback"
+ modules_overrides = ["wayback"]
+ targets = ["blacklanternsecurity.com", "127.0.0.1"]
+ config_overrides = {"modules": {"wayback": {"urls": True, "archive": True}}}
+
+ async def setup_after_prep(self, module_test):
+ # speed up delays for testing
+ module_test.scan.modules["wayback"]._archive_429_default_delay = 0.01
+ module_test.scan.modules["wayback"]._archive_error_delay = 0.01
+ module_test.scan.modules["wayback"]._archive_delay = 0
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/cdx/search/cdx?url=blacklanternsecurity.com&matchType=domain&output=json&fl=original&collapse=original&limit=100000&filter=!statuscode:404&filter=!statuscode:301&filter=!statuscode:302&filter=!mimetype:image/.*&filter=!mimetype:text/css&filter=!mimetype:warc/revisit",
+ json=[["original"], ["http://127.0.0.1:1/rate-limited-page"]],
+ )
+ # first attempt: 429 rate limited
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/web/http://127.0.0.1:1/rate-limited-page",
+ status_code=429,
+ headers={"Retry-After": "1"},
+ )
+ # retry after backoff: 200
+ module_test.blasthttp_mock.add_response(
+ url="http://web.archive.org/web/http://127.0.0.1:1/rate-limited-page",
+ text="content after rate limit",
+ headers={"Content-Type": "text/html"},
+ )
+
+ def check(self, module_test, events):
+ http_responses = [e for e in events if e.type == "HTTP_RESPONSE" and "from-wayback" in e.tags]
+ assert len(http_responses) == 1, (
+ f"Expected 1 archived HTTP_RESPONSE after 429 retry, got {len(http_responses)}"
+ )
diff --git a/docs/modules/wayback.md b/docs/modules/wayback.md
new file mode 100644
index 0000000000..a3f30f3e23
--- /dev/null
+++ b/docs/modules/wayback.md
@@ -0,0 +1,141 @@
+# Wayback
+
+## Overview
+
+The Wayback module queries [archive.org's Wayback Machine](https://web.archive.org/) CDX API to discover subdomains, URLs, web parameters, and archived content for your targets. By default it operates as a passive subdomain enumeration source, but with its extended features enabled it becomes a powerful tool for discovering dead URLs, extracting parameters for fuzzing, and retrieving archived versions of pages that no longer exist.
+
+* Watches: **DNS_NAME**, **URL**
+* Produces: **URL_UNVERIFIED**, **DNS_NAME**, **WEB_PARAMETER**, **HTTP_RESPONSE**, **FINDING**
+* Flags: `passive`, `subdomain-enum`, `safe`
+
+## Default Behavior
+
+By default, wayback only emits **DNS_NAME** events (subdomains) extracted from archived URLs. This is the behavior you get when wayback is included via the `subdomain-enum` preset. No URLs, parameters, or archived content are fetched.
+
+To unlock the more advanced features, you need to enable them via configuration options or use one of the wayback presets.
+
+## Configuration Options
+
+| Option | Type | Default | Description |
+|---------------------|------|---------|-------------------------------------------------------------------------------------------------------|
+| `urls` | bool | `False` | Emit `URL_UNVERIFIED` events in addition to `DNS_NAME`s. Required for `parameters` and `archive`. |
+| `parameters` | bool | `False` | Extract `WEB_PARAMETER` events from query strings in archived URLs. Requires `urls=True`. |
+| `archive` | bool | `False` | Fetch archived versions of dead URLs and emit `HTTP_RESPONSE` events. Requires `urls=True`. |
+| `garbage_threshold` | int  | `10`    | Deduplicate similar URLs if they appear in groups of this size or larger. Lower = less noise. |
+| `max_records`       | int  | `100000`| Maximum number of URLs to fetch from the CDX API.                                             |
+
+## Features
+
+### URL Discovery (`urls: True`)
+
+When `urls` is enabled, wayback emits `URL_UNVERIFIED` events for every unique URL found in the Wayback Machine's index. These are tagged with `from-wayback` and sent through BBOT's normal URL verification pipeline (httpx).
+
+Before emission, URLs go through several cleanup steps:
+
+- **URL collapsing** - Groups of similar URLs (e.g. pagination, search results) are deduplicated based on the `garbage_threshold` setting
+- **HTTP/HTTPS deduplication** - When both `http://` and `https://` variants exist, only the HTTPS version is kept
+- **Blacklist filtering** - URLs containing known CDN/WAF paths (e.g. `_Incapsula_Resource`, `/cdn-cgi/`) are filtered out
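+
+The HTTP/HTTPS deduplication keys each URL on its `https://` form, so the plain-HTTP variant only survives when no HTTPS counterpart exists. A standalone sketch of the same logic used in the module's `query()` method:
+
+```python
+from urllib.parse import urlparse
+
+def prefer_https(urls):
+    """Collapse http/https variants of the same URL, keeping https when both exist."""
+    deduped = {}
+    for url in urls:
+        parsed = urlparse(url)
+        key = parsed._replace(scheme="https").geturl()  # key on the https form
+        if key not in deduped or parsed.scheme == "https":
+            deduped[key] = url
+    return list(deduped.values())
+```
+
+For example, `prefer_https(["http://x.com/a", "https://x.com/a"])` keeps only the HTTPS variant.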
+
+### Parameter Extraction (`parameters: True`)
+
+When `parameters` is enabled (requires `urls: True`), wayback extracts query string parameters from archived URLs and emits them as `WEB_PARAMETER` events. This is useful for discovering GET parameters that can be fed into fuzzing modules like lightfuzz.
+
+Parameters are cached and only emitted after the corresponding URL has been verified as live by httpx. This prevents emitting parameters for URLs that no longer exist.
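+
+Once httpx confirms a URL is live, each cached parameter is re-emitted with a data dict shaped roughly like the sketch below (the field layout mirrors the module's `_handle_url_event`; the host, parameter names, and values here are hypothetical):
+
+```python
+web_parameter_data = {
+    "host": "www.evilcorp.com",               # host of the now-verified URL (hypothetical)
+    "type": "GETPARAM",
+    "name": "id",                             # parameter name seen in the archived URL (hypothetical)
+    "original_value": "42",                   # its archived value (hypothetical)
+    "url": "https://www.evilcorp.com/item",   # base URL with the query string removed (hypothetical)
+    "description": "HTTP Extracted Parameter [id] (wayback)",
+    "additional_params": {"page": "1"},       # other parameters from the same archived URL
+}
+```
+
+The event is emitted as `WEB_PARAMETER` and tagged with `from-wayback`.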
+
+!!! note
+ Parameter extraction requires at least one module that consumes `WEB_PARAMETER` events to be active (e.g. `lightfuzz`, `hunt`, `paramminer_getparams`). If no such module is present, parameter extraction is automatically disabled with a warning.
+
+### Archive Retrieval (`archive: True`)
+
+When `archive` is enabled (requires `urls: True`), wayback fetches the actual archived content of URLs from the Wayback Machine and emits them as `HTTP_RESPONSE` events. This is particularly useful for:
+
+- **Finding secrets in dead pages** - Archived versions may contain API keys, credentials, or other sensitive data that modules like `badsecrets` can detect
+- **Discovering hidden functionality** - Pages that have been removed may reveal application structure or endpoints
+
+Archive retrieval runs during the module's `finish()` phase, after all URLs have been discovered and verified. URLs that are confirmed live (2xx status) are automatically removed from the archive queue, so only dead URLs are fetched from the archive.
+
+The archived content goes through extensive cleanup to remove Wayback Machine artifacts:
+
+- Wayback toolbar/header/footer HTML is stripped
+- Rewritten URLs (e.g. `http://web.archive.org/web/20250101/http://example.com/page`) are restored to originals
+- Wayback-injected headers (`x-archive-*`, `set-cookie`) are removed
+- The event's host, port, and URL are set to the original target, not `web.archive.org`
+
+Archived HTTP_RESPONSE events are tagged with `from-wayback` and `archived`.
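+
+Because every event descending from archived content carries the `from-wayback` tag, downstream code can recover the original snapshot URL through the `archive_url` property this PR adds to the event base class. A minimal sketch (the function name is hypothetical; `event` is any BBOT event):
+
+```python
+def original_snapshot_url(event):
+    """Return the archive.org snapshot URL an event descends from, if any."""
+    # archive_url walks the parent chain up to the originating wayback
+    # HTTP_RESPONSE and returns its data["archive_url"]
+    if "from-wayback" in event.tags:
+        return event.archive_url
+    return None
+```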
+
+!!! warning
+ Static file extensions (images, CSS, JS, etc.) are automatically skipped during archive retrieval to avoid unnecessary traffic.
+
+### Interesting File Detection
+
+When `urls` is enabled, wayback also checks for potentially interesting archived files by looking for URLs with sensitive extensions: `.zip`, `.sql`, `.bak`, `.env`, `.config`, `.tar.gz`, `.tar.bz2`.
+
+When found, these are verified with a HEAD request to archive.org. If the archived file exists and isn't a soft-404, a `FINDING` event is emitted with details about the file (including size if available). These findings are tagged with `from-wayback`, `archived`, and `interesting-file`.
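+
+A confirmed file produces a `FINDING` whose data is shaped roughly like the sketch below (the field layout mirrors the module's `_check_interesting_files`; the URLs, host, and size are hypothetical):
+
+```python
+finding_data = {
+    "description": "Interesting archived file found (.sql): https://www.evilcorp.com/backup.sql (2.4 MB)",
+    "severity": "LOW",
+    "name": "Interesting Archived File",
+    "confidence": "MEDIUM",
+    "url": "http://web.archive.org/web/20230101120000/https://www.evilcorp.com/backup.sql",  # snapshot URL (hypothetical)
+    "host": "www.evilcorp.com",
+}
+```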
+
+## Presets
+
+Wayback comes with two dedicated presets, and is also integrated into several other presets:
+
+### `-p wayback`
+
+Basic URL discovery mode. Includes `subdomain-enum` and enables `urls: True`. Good for general recon when you want to discover historical URLs alongside subdomains.
+
+```bash
+bbot -p wayback -t evilcorp.com
+```
+
+### `-p wayback-heavy`
+
+Full-featured mode with URL discovery, parameter extraction, and archive retrieval. Also includes `badsecrets` to scan archived content for exposed secrets.
+
+```bash
+bbot -p wayback-heavy -t evilcorp.com
+```
+
+### Integration with other presets
+
+Wayback's extended features are also enabled in several other presets:
+
+| Preset | Wayback Config |
+|-----------------------|-----------------------------------------|
+| `kitchen-sink` | `urls`, `parameters`, `archive` |
+| `dirbust-heavy` | `urls` |
+| `nuclei-heavy`        | `urls`                                   |
+| `lightfuzz-heavy` | `urls`, `parameters` |
+| `lightfuzz-max`       | `urls`, `parameters`, `archive`          |
+
+## Example Commands
+
+```bash
+# Basic subdomain enumeration (default behavior, no URL emission)
+bbot -p subdomain-enum -t evilcorp.com
+```
+
+```bash
+# URL discovery via wayback preset
+bbot -p wayback -t evilcorp.com
+```
+
+```bash
+# Full wayback integration with archived content and parameter extraction
+bbot -p wayback-heavy -t evilcorp.com
+```
+
+```bash
+# Enable wayback URLs alongside a nuclei scan
+bbot -p nuclei -m wayback -c modules.wayback.urls=True --allow-deadly -t evilcorp.com
+```
+
+```bash
+# Pair with lightfuzz for parameter fuzzing using archived parameters
+bbot -p lightfuzz-heavy spider -t evilcorp.com --allow-deadly
+```
+
+```bash
+# Enable wayback features via command-line config
+bbot -p subdomain-enum -c modules.wayback.urls=True modules.wayback.parameters=True modules.wayback.archive=True -t evilcorp.com
+```
+
+```bash
+# Adjust garbage threshold for cleaner output (more aggressive deduplication)
+bbot -p wayback -c modules.wayback.garbage_threshold=5 -t evilcorp.com
+```
diff --git a/docs/scanning/configuration.md b/docs/scanning/configuration.md
index bbc5aa7a22..a0bdf16401 100644
--- a/docs/scanning/configuration.md
+++ b/docs/scanning/configuration.md
@@ -600,7 +600,9 @@ In addition to the stated options for each module, the following universal optio
| modules.trufflehog.version | str | trufflehog version | 3.90.8 |
| modules.urlscan.urls | bool | Emit URLs in addition to DNS_NAMEs | False |
| modules.virustotal.api_key | str | VirusTotal API Key | |
+| modules.wayback.archive | bool | fetch archived versions of dead URLs from the Wayback Machine and emit HTTP_RESPONSE events (requires urls=true) | False |
| modules.wayback.garbage_threshold | int | Dedupe similar urls if they are in a group of this size or higher (lower values == less garbage data) | 10 |
+| modules.wayback.max_records | int | Maximum number of URLs to fetch from the CDX API | 100000 |
+| modules.wayback.parameters | bool | emit WEB_PARAMETER events for query parameters discovered in archived URLs (requires urls=true) | False |
| modules.wayback.urls | bool | emit URLs in addition to DNS_NAMEs | False |
| modules.asset_inventory.output_file | str | Set a custom output file | |
| modules.asset_inventory.recheck | bool | When use_previous=True, don't retain past details like open ports or findings. Instead, allow them to be rediscovered by the new scan | False |
diff --git a/docs/scanning/presets_list.md b/docs/scanning/presets_list.md
index 98a4d126ec..1b824df6af 100644
--- a/docs/scanning/presets_list.md
+++ b/docs/scanning/presets_list.md
@@ -282,7 +282,7 @@ Everything everywhere all at once
??? note "`kitchen-sink.yml`"
```yaml title="~/.bbot/presets/kitchen-sink.yml"
description: Everything everywhere all at once
-
+
include:
- subdomain-enum
- cloud-enum
@@ -294,6 +294,15 @@ Everything everywhere all at once
- dirbust-light
- web-screenshots
- baddns-heavy
+
+ config:
+ modules:
+ baddns:
+ enable_references: True
+ wayback:
+ urls: True
+ parameters: True
+ archive: True
```
@@ -340,10 +349,11 @@ Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force paramet
flags:
- web-paramminer
-
+
modules:
- robots
-
+ - wayback
+
config:
modules:
lightfuzz:
@@ -351,6 +361,9 @@ Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force paramet
disable_post: False
try_post_as_get: True
try_get_as_post: True
+ wayback:
+ urls: True
+ parameters: True
```
Category: web
@@ -400,7 +413,7 @@ Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer s
include:
- lightfuzz-heavy
-
+
config:
url_querystring_collapse: False # in cases where the same parameter is observed multiple times, fuzz them individually instead of collapsing them into a single parameter
modules:
@@ -410,6 +423,10 @@ Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer s
avoid_wafs: False
excavate:
speculate_params: True # speculate potential parameters extracted from JSON/XML web responses
+ wayback:
+ urls: True
+ parameters: True
+ archive: True
```
Category: web
@@ -802,7 +819,78 @@ Take screenshots of webpages
-Modules: [0]("")
+Modules: [3]("`gowitness`, `httpx`, `social`")
+
+## **web-thorough**
+
+Aggressive web scan
+
+??? note "`web-thorough.yml`"
+ ```yaml title="~/.bbot/presets/web-thorough.yml"
+ description: Aggressive web scan
+
+ include:
+ # include the web-basic preset
+ - web-basic
+
+ flags:
+ - web-thorough
+ ```
+
+
+
+Modules: [32]("`ajaxpro`, `aspnet_bin_exposure`, `azure_realm`, `baddns`, `badsecrets`, `bucket_amazon`, `bucket_digitalocean`, `bucket_firebase`, `bucket_google`, `bucket_microsoft`, `bypass403`, `dotnetnuke`, `ffuf_shortnames`, `filedownload`, `generic_ssrf`, `git`, `graphql_introspection`, `host_header`, `httpx`, `hunt`, `iis_shortnames`, `lightfuzz`, `ntlm`, `oauth`, `reflected_parameters`, `retirejs`, `robots`, `securitytxt`, `smuggler`, `sslcert`, `telerik`, `url_manipulation`")
+
+## **wayback**
+
+Discover URLs and interesting archived files via the Wayback Machine
+
+??? note "`wayback.yml`"
+ ```yaml title="~/.bbot/presets/wayback.yml"
+ description: Discover URLs and interesting archived files via the Wayback Machine
+
+ include:
+ - subdomain-enum
+
+ modules:
+ - wayback
+
+ config:
+ modules:
+ wayback:
+ urls: True
+ ```
+
+
+
+Modules: [52]("`anubisdb`, `asn`, `azure_realm`, `azure_tenant`, `baddns_direct`, `baddns_zone`, `bevigil`, `bufferoverrun`, `builtwith`, `c99`, `censys_dns`, `certspotter`, `chaos`, `crt`, `crt_db`, `digitorus`, `dnsbimi`, `dnsbrute`, `dnsbrute_mutations`, `dnscaa`, `dnscommonsrv`, `dnsdumpster`, `dnstlsrpt`, `fullhunt`, `github_codesearch`, `github_org`, `hackertarget`, `httpx`, `hunterio`, `ipneighbor`, `leakix`, `myssl`, `oauth`, `otx`, `passivetotal`, `postman`, `postman_download`, `rapiddns`, `securitytrails`, `securitytxt`, `shodan_dns`, `shodan_idb`, `sitedossier`, `social`, `sslcert`, `subdomaincenter`, `subdomainradar`, `trickest`, `urlscan`, `virustotal`, `wayback`, `httpx`")
+
+## **wayback-heavy**
+
+Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection
+
+??? note "`wayback-heavy.yml`"
+ ```yaml title="~/.bbot/presets/wayback-heavy.yml"
+ description: Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection
+
+ include:
+ - subdomain-enum
+
+ modules:
+ - wayback
+ - badsecrets
+
+ config:
+ modules:
+ wayback:
+ urls: True
+ parameters: True
+ archive: True
+ ```
+
+
+
+Modules: [53]("`anubisdb`, `asn`, `azure_realm`, `azure_tenant`, `baddns_direct`, `baddns_zone`, `badsecrets`, `bevigil`, `bufferoverrun`, `builtwith`, `c99`, `censys_dns`, `certspotter`, `chaos`, `crt`, `crt_db`, `digitorus`, `dnsbimi`, `dnsbrute`, `dnsbrute_mutations`, `dnscaa`, `dnscommonsrv`, `dnsdumpster`, `dnstlsrpt`, `fullhunt`, `github_codesearch`, `github_org`, `hackertarget`, `httpx`, `hunterio`, `ipneighbor`, `leakix`, `myssl`, `oauth`, `otx`, `passivetotal`, `postman`, `postman_download`, `rapiddns`, `securitytrails`, `securitytxt`, `shodan_dns`, `shodan_idb`, `sitedossier`, `social`, `sslcert`, `subdomaincenter`, `subdomainradar`, `trickest`, `urlscan`, `virustotal`, `wayback`, `httpx`")
## Table of Default Presets
@@ -810,34 +898,34 @@ Modules: [0]("")
Here is a the same data, but in a table:
-| Preset | Category | Description | # Modules | Modules |
-|-------------------|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|---------------------------------------------------------------------------------------------|
-| baddns | | Check for subdomain takeovers and other DNS issues. | 1 | baddns |
-| baddns-heavy | | Run all baddns modules and submodules. | 3 | baddns, baddns_direct, baddns_zone |
-| cloud-enum | | Enumerate cloud resources such as storage buckets, etc. | 0 | |
-| code-enum | | Enumerate Git repositories, Docker images, etc. | 0 | |
-| dirbust-heavy | web | Recursive web directory brute-force (aggressive) | 3 | ffuf, httpx, wayback |
-| dirbust-light | web | Basic web directory brute-force (surface-level directories only) | 1 | ffuf |
-| dotnet-audit | web | Comprehensive scan for all IIS/.NET specific modules and module settings | 8 | ajaxpro, aspnet_bin_exposure, badsecrets, dotnetnuke, ffuf, ffuf_shortnames, httpx, telerik |
-| email-enum | | Enumerate email addresses from APIs, web crawling, etc. | 0 | |
-| fast | | Scan only the provided targets as fast as possible - no extra discovery | 0 | |
-| iis-shortnames | web | Recursively enumerate IIS shortnames | 0 | |
-| kitchen-sink | | Everything everywhere all at once | 7 | baddns, baddns_direct, baddns_zone, ffuf, httpx, hunt, reflected_parameters |
-| lightfuzz | web | Default fuzzing: all 9 submodules (cmdi, crypto, path, serial, sqli, ssti, xss, esi, ssrf) plus companion modules (badsecrets, hunt, reflected_parameters). POST fuzzing disabled but try_post_as_get enabled, so POST params are retested as GET. Skips confirmed WAFs. | 6 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters |
-| lightfuzz-heavy | web | Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force parameter discovery (headers, GET params, cookies), POST request fuzzing enabled, try_get_as_post enabled (GET params retested as POST), and robots.txt parsing. Still skips confirmed WAFs. | 7 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters, robots |
-| lightfuzz-light | web | Minimal fuzzing: only path traversal, SQLi, and XSS submodules. No POST requests. No companion modules. Safest option for running alongside larger scans with minimal overhead. | 3 | httpx, lightfuzz, portfilter |
-| lightfuzz-max | web | Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer skipped, each unique parameter-value pair is fuzzed individually (no collapsing), common headers like X-Forwarded-For are fuzzed even if not observed, and potential parameters are speculated from JSON/XML response bodies. Significantly increases scan time. | 7 | badsecrets, httpx, hunt, lightfuzz, portfilter, reflected_parameters, robots |
-| lightfuzz-xss | web | XSS-only: enables only the xss submodule with paramminer_getparams and reflected_parameters. POST disabled, no query string collapsing. Example of a focused single-submodule preset. | 5 | httpx, lightfuzz, paramminer_getparams, portfilter, reflected_parameters |
-| nuclei | nuclei | Run nuclei scans against all discovered targets | 3 | httpx, nuclei, portfilter |
-| nuclei-budget | nuclei | Run nuclei scans against all discovered targets, using budget mode to look for low hanging fruit with greatly reduced number of requests | 3 | httpx, nuclei, portfilter |
-| nuclei-heavy | nuclei | Run nuclei scans against all discovered targets, allowing for spidering, against ALL URLs, and with additional discovery modules. | 6 | httpx, nuclei, portfilter, robots, urlscan, wayback |
-| nuclei-technology | nuclei | Run nuclei scans against all discovered targets, running templates which match discovered technologies | 3 | httpx, nuclei, portfilter |
-| paramminer | web | Discover new web parameters via brute-force, and analyze them with additional modules | 3 | httpx, hunt, reflected_parameters |
-| spider | | Recursive web spider | 1 | httpx |
-| spider-heavy | | Recursive web spider with more aggressive settings | 1 | httpx |
-| subdomain-enum | | Enumerate subdomains via APIs, brute-force | 0 | |
-| tech-detect | | Detect technologies via Nuclei, and FingerprintX | 2 | fingerprintx, nuclei |
-| web | | Quick web scan | 0 | |
-| web-heavy | | Aggressive web scan | 0 | |
-| web-screenshots | | Take screenshots of webpages | 0 | |
+| Preset | Category | Description | # Modules | Modules |
+|----------------------|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| baddns-heavy | | Run all baddns modules and submodules. | 4 | baddns, baddns_direct, baddns_zone, httpx |
+| cloud-enum | | Enumerate cloud resources such as storage buckets, etc. | 58 | anubisdb, asn, azure_realm, azure_tenant, baddns, baddns_direct, baddns_zone, bevigil, bucket_amazon, bucket_digitalocean, bucket_file_enum, bucket_firebase, bucket_google, bucket_microsoft, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback |
+| code-enum | | Enumerate Git repositories, Docker images, etc. | 20 | apkpure, code_repository, docker_pull, dockerhub, git, git_clone, gitdumper, github_codesearch, github_org, github_usersearch, github_workflows, gitlab_com, gitlab_onprem, google_playstore, httpx, jadx, postman, postman_download, social, trufflehog |
+| dirbust-heavy | web | Recursive web directory brute-force (aggressive) | 5 | ffuf, ffuf_shortnames, httpx, iis_shortnames, wayback |
+| dirbust-light | web | Basic web directory brute-force (surface-level directories only) | 4 | ffuf, ffuf_shortnames, httpx, iis_shortnames |
+| dotnet-audit | web | Comprehensive scan for all IIS/.NET specific modules and module settings | 9 | ajaxpro, aspnet_bin_exposure, badsecrets, dotnetnuke, ffuf, ffuf_shortnames, httpx, iis_shortnames, telerik |
+| email-enum | | Enumerate email addresses from APIs, web crawling, etc. | 8 | dehashed, dnscaa, dnstlsrpt, emailformat, hunterio, pgp, skymem, sslcert |
+| fast | | Scan only the provided targets as fast as possible - no extra discovery | 0 | |
+| iis-shortnames | web | Recursively enumerate IIS shortnames | 3 | ffuf_shortnames, httpx, iis_shortnames |
+| kitchen-sink | | Everything everywhere all at once | 90 | anubisdb, apkpure, asn, azure_realm, azure_tenant, baddns, baddns_direct, baddns_zone, badsecrets, bevigil, bucket_amazon, bucket_digitalocean, bucket_file_enum, bucket_firebase, bucket_google, bucket_microsoft, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, code_repository, crt, crt_db, dehashed, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, docker_pull, dockerhub, emailformat, ffuf, ffuf_shortnames, filedownload, fullhunt, git, git_clone, gitdumper, github_codesearch, github_org, github_usersearch, github_workflows, gitlab_com, gitlab_onprem, google_playstore, gowitness, graphql_introspection, hackertarget, httpx, hunt, hunterio, iis_shortnames, ipneighbor, jadx, leakix, myssl, ntlm, oauth, otx, paramminer_cookies, paramminer_getparams, paramminer_headers, passivetotal, pgp, postman, postman_download, rapiddns, reflected_parameters, robots, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, skymem, social, sslcert, subdomaincenter, subdomainradar, trickest, trufflehog, urlscan, virustotal, wayback |
+| lightfuzz-heavy | web | Aggressive fuzzing: everything in lightfuzz, plus paramminer brute-force parameter discovery (headers, GET params, cookies), POST request fuzzing enabled, try_get_as_post enabled (GET params retested as POST), and robots.txt parsing. Still skips confirmed WAFs. | 10 | badsecrets, httpx, hunt, lightfuzz, paramminer_cookies, paramminer_getparams, paramminer_headers, portfilter, reflected_parameters, robots |
+| lightfuzz-light | web | Minimal fuzzing: only path traversal, SQLi, and XSS submodules. No POST requests. No companion modules. Safest option for running alongside larger scans with minimal overhead. | 3 | httpx, lightfuzz, portfilter |
+| lightfuzz-max | web | Maximum fuzzing: everything in lightfuzz-heavy, plus WAF targets are no longer skipped, each unique parameter-value pair is fuzzed individually (no collapsing), common headers like X-Forwarded-For are fuzzed even if not observed, and potential parameters are speculated from JSON/XML response bodies. Significantly increases scan time. | 10 | badsecrets, httpx, hunt, lightfuzz, paramminer_cookies, paramminer_getparams, paramminer_headers, portfilter, reflected_parameters, robots |
+| lightfuzz-xss | web | XSS-only: enables only the xss submodule with paramminer_getparams and reflected_parameters. POST disabled, no query string collapsing. Example of a focused single-submodule preset. | 5 | httpx, lightfuzz, paramminer_getparams, portfilter, reflected_parameters |
+| nuclei | nuclei | Run nuclei scans against all discovered targets | 3 | httpx, nuclei, portfilter |
+| nuclei-budget | nuclei | Run nuclei scans against all discovered targets, using budget mode to look for low hanging fruit with greatly reduced number of requests | 3 | httpx, nuclei, portfilter |
+| nuclei-heavy | nuclei | Run nuclei scans against all discovered targets, allowing for spidering, against ALL URLs, and with additional discovery modules. | 6 | httpx, nuclei, portfilter, robots, urlscan, wayback |
+| nuclei-technology | nuclei | Run nuclei scans against all discovered targets, running templates which match discovered technologies | 3 | httpx, nuclei, portfilter |
+| paramminer | web | Discover new web parameters via brute-force, and analyze them with additional modules | 6 | httpx, hunt, paramminer_cookies, paramminer_getparams, paramminer_headers, reflected_parameters |
+| spider | | Recursive web spider | 1 | httpx |
+| spider-heavy | | Recursive web spider with more aggressive settings | 1 | httpx |
+| subdomain-enum | | Enumerate subdomains via APIs, brute-force | 51 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback |
+| tech-detect | | Detect technologies via Nuclei, and FingerprintX | 3 | fingerprintx, httpx, nuclei |
+| web-basic | | Quick web scan | 18 | azure_realm, baddns, badsecrets, bucket_amazon, bucket_firebase, bucket_google, bucket_microsoft, ffuf_shortnames, filedownload, git, graphql_introspection, httpx, iis_shortnames, ntlm, oauth, robots, securitytxt, sslcert |
+| web-screenshots | | Take screenshots of webpages | 3 | gowitness, httpx, social |
+| web-thorough | | Aggressive web scan | 32 | ajaxpro, aspnet_bin_exposure, azure_realm, baddns, badsecrets, bucket_amazon, bucket_digitalocean, bucket_firebase, bucket_google, bucket_microsoft, bypass403, dotnetnuke, ffuf_shortnames, filedownload, generic_ssrf, git, graphql_introspection, host_header, httpx, hunt, iis_shortnames, lightfuzz, ntlm, oauth, reflected_parameters, retirejs, robots, securitytxt, smuggler, sslcert, telerik, url_manipulation |
+| wayback | | Discover URLs and interesting archived files via the Wayback Machine | 52 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback |
+| wayback-heavy | | Full Wayback Machine integration - URL discovery, parameter extraction, archived page retrieval, and interesting file detection | 53 | anubisdb, asn, azure_realm, azure_tenant, baddns_direct, baddns_zone, badsecrets, bevigil, bufferoverrun, builtwith, c99, censys_dns, certspotter, chaos, crt, crt_db, digitorus, dnsbimi, dnsbrute, dnsbrute_mutations, dnscaa, dnscommonsrv, dnsdumpster, dnstlsrpt, fullhunt, github_codesearch, github_org, hackertarget, httpx, hunterio, ipneighbor, leakix, myssl, oauth, otx, passivetotal, postman, postman_download, rapiddns, securitytrails, securitytxt, shodan_dns, shodan_idb, sitedossier, social, sslcert, subdomaincenter, subdomainradar, trickest, urlscan, virustotal, wayback |
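+
+As a quick illustration (a minimal sketch, assuming BBOT's standard `bbot.scanner.Scanner` Python API with its `presets` keyword argument; `evilcorp.com` is a placeholder target), any preset from the table above can also be enabled programmatically:
+
+```python
+from bbot.scanner import Scanner
+
+# enable the wayback-heavy preset from the table above;
+# multiple presets can be combined in the same list
+scan = Scanner("evilcorp.com", presets=["wayback-heavy"])
+
+# start the scan and consume events as they are produced
+for event in scan.start():
+    print(event)
+```
+
+The same preset can be selected on the command line (assuming the usual `-t`/`-p` flags) with `bbot -t evilcorp.com -p wayback-heavy`.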
diff --git a/mkdocs.yml b/mkdocs.yml
index 498b0b4a6d..2355c67f5a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -32,6 +32,7 @@ nav:
- Modules:
- List of Modules: modules/list_of_modules.md
- Nuclei: modules/nuclei.md
+ - Wayback: modules/wayback.md
- Custom YARA Rules: modules/custom_yara_rules.md
- Lightfuzz: modules/lightfuzz.md
- Misc: