Merge branch '3.0' into wayback-upgrade

liquidsec · web-flow · commit 93fee255743b · 2026-04-01T14:21:06.000-04:00
diff --git a/bbot/core/helpers/dns/engine.py b/bbot/core/helpers/dns/engine.py
@@ -72,7 +72,7 @@ def __init__(self, socket_path, config={}, debug=False):
             self.wildcard_ignore = []
         self.wildcard_ignore = tuple([str(d).strip().lower() for d in self.wildcard_ignore])
         self.wildcard_tests = self.dns_config.get("wildcard_tests", 5)
-        self._wildcard_cache = {}
+        self._wildcard_cache = LRUCache(maxsize=10000)
         # since wildcard detection takes some time, This is to prevent multiple
         # modules from kicking off wildcard detection for the same domain at the same time
         self._wildcard_lock = NamedLock()
@@ -81,10 +81,10 @@ def __init__(self, socket_path, config={}, debug=False):
         self._last_dns_success = None
         self._last_connectivity_warning = time.time()
         # keeps track of warnings issued for wildcard detection to prevent duplicate warnings
-        self._dns_warnings = set()
-        self._errors = {}
+        self._dns_warnings = LRUCache(maxsize=10000)
+        self._errors = LRUCache(maxsize=10000)
         self._debug = self.dns_config.get("debug", False)
-        self._dns_cache = LRUCache(maxsize=10000)
+        self._dns_cache = LRUCache(maxsize=100000)
 
     async def resolve(self, query, **kwargs):
         """Resolve DNS names and IP addresses to their corresponding results.
@@ -221,7 +221,7 @@ async def _resolve_hostname(self, query, **kwargs):
                                 self.log.verbose(
                                     f'Aborting future {rdtype} queries to "{parent}" because error count ({error_count:,}) exceeded abort threshold ({self.abort_threshold:,})'
                                 )
-                            self._dns_warnings.add(parent_hash)
+                            self._dns_warnings[parent_hash] = True
                             return results, errors
                     results = await self._catch(self.resolver.resolve, query, **kwargs)
                     if use_cache:
diff --git a/bbot/modules/gowitness.py b/bbot/modules/gowitness.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import asyncio
 import aiosqlite
 import multiprocessing
@@ -243,7 +244,7 @@ async def handle_batch(self, *events):
                     context=f"{{module}} visited {{event.type}}: {url}",
                 )
                 if url_event and ip:
-                    url_event._resolved_hosts.add(ip)
+                    url_event._resolved_hosts.add(sys.intern(ip))
                 await self.emit_event(url_event)
 
         # emit technologies
diff --git a/bbot/modules/httpx.py b/bbot/modules/httpx.py
@@ -1,4 +1,5 @@
 import re
+import sys
 import orjson
 import tempfile
 import subprocess
@@ -207,7 +208,7 @@ async def handle_batch(self, *events):
             if url_event:
                 httpx_ip = j.get("host", "")
                 if httpx_ip:
-                    url_event._resolved_hosts.add(httpx_ip)
+                    url_event._resolved_hosts.add(sys.intern(httpx_ip))
                 url_event.data["status_code"] = status_code
                 title = j.get("title", "")
                 if title:
diff --git a/bbot/modules/internal/dnsresolve.py b/bbot/modules/internal/dnsresolve.py
@@ -1,3 +1,4 @@
+import sys
 import ipaddress
 from contextlib import suppress
 
@@ -231,7 +232,7 @@ def check_scope(self, event):
         for rdtype in ("A", "AAAA", "CNAME"):
             hosts = dns_children.get(rdtype, [])
             # update resolved hosts
-            event.resolved_hosts.update(hosts)
+            event.resolved_hosts.update(sys.intern(h) for h in hosts)
             for host in hosts:
                 # having a CNAME to an in-scope host doesn't make you in-scope
                 if rdtype != "CNAME":
@@ -258,6 +259,7 @@ async def resolve_event(self, event, types):
         queries = [(event_host, rdtype) for rdtype in types]
         dns_errors = {}
         async for (query, rdtype), (answers, errors) in self.helpers.dns.resolve_raw_batch(queries):
+            rdtype = sys.intern(rdtype)
             # errors
             try:
                 dns_errors[rdtype].update(errors)
@@ -272,6 +274,8 @@ async def resolve_event(self, event, types):
                     event.raw_dns_records[rdtype] = {answer}
                 # hosts
                 for _rdtype, host in extract_targets(answer):
+                    _rdtype = sys.intern(_rdtype)
+                    host = sys.intern(host)
                     try:
                         event.dns_children[_rdtype].add(host)
                     except KeyError:
diff --git a/bbot/scripts/benchmark_report.py b/bbot/scripts/benchmark_report.py
@@ -180,6 +180,7 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
 |--------------|---------|------------|-----------|-----------|"""
 
     significant_changes = []
+    new_tests = []
     performance_summary = []
 
     for current_bench in current_benchmarks:
@@ -245,10 +246,16 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
             else:
                 base_ops = 1 / base_mean  # Default: single operation
 
-            # Use per-event memory if available, otherwise use time
+            # Compare memory if both runs report it (total MB only without per-event bytes), else time
+            current_mb = current_extra.get("total_memory_mb")
+            base_mb = base_extra.get("total_memory_mb")
             current_peb = current_extra.get("per_event_bytes")
             base_peb = base_extra.get("per_event_bytes")
-            if current_peb is not None and base_peb is not None:
+            if current_mb is not None and base_mb is not None and current_peb is None:
+                change_percent, emoji = calculate_change_percentage(base_mb, current_mb)
+                base_label = f"{base_mb:.1f} MB"
+                current_label = f"{current_mb:.1f} MB"
+            elif current_peb is not None and base_peb is not None:
                 change_percent, emoji = calculate_change_percentage(base_peb, current_peb)
                 base_label = f"{base_peb:.0f} B/event"
                 current_label = f"{current_peb:.0f} B/event"
@@ -269,7 +276,10 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
 
             # Track significant changes
             if abs(change_percent) > 10:
-                is_memory = current_extra.get("per_event_bytes") is not None
+                is_memory = (
+                    current_extra.get("per_event_bytes") is not None
+                    or current_extra.get("total_memory_mb") is not None
+                )
                 if is_memory:
                     direction = "🐌 more memory" if change_percent > 0 else "🚀 less memory"
                 else:
@@ -295,9 +305,7 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
 
         else:
             table += f"\n| **{test_name}** | `-` | `{format_time(current_mean)}` | **New** 🆕 | 🆕 |"
-            significant_changes.append(
-                f"- **{test_name}**: New test 🆕 ({format_time(current_mean)}, {format_ops(current_ops)})"
-            )
+            new_tests.append(f"- **{test_name}**: {format_time(current_mean)}, {format_ops(current_ops)}")
 
     table += "\n\n</details>\n\n"
 
@@ -323,6 +331,13 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
             table += f"{change}\n"
         table += "\n"
 
+    # Add new tests section
+    if new_tests:
+        table += "### 🆕 New Tests\n\n"
+        for new_test in new_tests:
+            table += f"{new_test}\n"
+        table += "\n"
+
     return table
 
 
diff --git a/bbot/test/benchmarks/_scan_memory_subdomain_enum.py b/bbot/test/benchmarks/_scan_memory_subdomain_enum.py
@@ -0,0 +1,62 @@
+"""
+Subprocess script for subdomain enumeration memory benchmark.
+
+Injects SUBDOMAIN_ENUM_COUNT (argv[1]) synthetic DNS_NAME events into a scan
+and prints peak tracemalloc memory to stdout as a PEAK_MB:<value> line.
+
+Invoked by test_scan_memory.py — not meant to be run directly.
+"""
+
+import gc
+import sys
+import asyncio
+import tracemalloc
+
+from bbot.scanner import Scanner
+
+SUBDOMAIN_ENUM_COUNT = int(sys.argv[1])
+
+scan = Scanner(
+    "blacklanternsecurity.com",
+    modules=[],
+    output_modules=["python"],
+    config={
+        "dns": {"disable": True},
+        "scope": {"search_distance": 0},
+        "web": {"spider_distance": 0, "spider_depth": 0},
+        "speculate": False,
+        "excavate": True,
+        "aggregate": False,
+        "cloudcheck": False,
+    },
+    force_start=True,
+)
+
+
+async def run():
+    await scan._prep()
+    gc.collect()
+    if tracemalloc.is_tracing():
+        tracemalloc.stop()
+    tracemalloc.start()
+    events = []
+    injected = False
+    async for event in scan.async_start():
+        events.append(event)
+        if event.type == "SCAN" and not injected:
+            injected = True
+            root_event = scan.root_event
+            for i in range(SUBDOMAIN_ENUM_COUNT):
+                dns_event = scan.make_event(
+                    f"sub{i}.blacklanternsecurity.com",
+                    "DNS_NAME",
+                    parent=root_event,
+                    context=f"benchmark DNS_NAME {i}",
+                )
+                await scan.ingress_module.queue_event(dns_event, {})
+
+
+asyncio.run(run())
+_, peak = tracemalloc.get_traced_memory()
+tracemalloc.stop()
+print(f"PEAK_MB:{round(peak / 1024 / 1024, 2)}")
diff --git a/bbot/test/benchmarks/_scan_memory_web_crawl.py b/bbot/test/benchmarks/_scan_memory_web_crawl.py
@@ -0,0 +1,86 @@
+"""
+Subprocess script for web crawl memory benchmark.
+
+Launches a local HTTP server with NUM_PAGES (argv[1]) pages, each padded with BODY_SIZE (argv[2]) filler bytes,
+runs a BBOT scan against it, and prints peak tracemalloc memory to stdout as a PEAK_MB:<value> line.
+
+Invoked by test_scan_memory.py — not meant to be run directly.
+"""
+
+import gc
+import sys
+import asyncio
+import threading
+import tracemalloc
+import importlib.util
+from http.server import HTTPServer, BaseHTTPRequestHandler
+
+from bbot.scanner import Scanner
+
+NUM_PAGES = int(sys.argv[1])
+BODY_SIZE = int(sys.argv[2])
+
+HTTP_MODULE = "httpx" if importlib.util.find_spec("bbot.modules.httpx") else "http"
+
+
+class H(BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path == "/":
+            links = "".join(f'<a href="/page{i}">page{i}</a>' for i in range(NUM_PAGES))
+            body = "<html><body>" + links + "</body></html>"
+        elif self.path.startswith("/page"):
+            i = self.path.replace("/page", "")
+            links = f'<a href="/data{i}/info">info</a><a href="/data{i}/details">details</a>'
+            body = "<html><body><h1>Page " + i + "</h1>" + links + "A" * BODY_SIZE + "</body></html>"
+        elif self.path.startswith("/data"):
+            body = "<html><body>data endpoint</body></html>"
+        else:
+            self.send_response(404)
+            self.end_headers()
+            return
+        self.send_response(200)
+        self.send_header("Content-Type", "text/html")
+        self.end_headers()
+        self.wfile.write(body.encode())
+
+    def log_message(self, *a):
+        pass
+
+
+server = HTTPServer(("127.0.0.1", 0), H)
+port = server.server_address[1]
+threading.Thread(target=server.serve_forever, daemon=True).start()
+
+scan = Scanner(
+    f"http://127.0.0.1:{port}/",
+    modules=[HTTP_MODULE],
+    output_modules=["python"],
+    config={
+        "dns": {"disable": True},
+        "scope": {"search_distance": 0},
+        "web": {"spider_distance": 10, "spider_depth": 10, "spider_links_per_page": NUM_PAGES},
+        "speculate": True,
+        "excavate": True,
+        "aggregate": False,
+        "cloudcheck": False,
+    },
+    force_start=True,
+)
+
+
+async def run():
+    await scan._prep()
+    gc.collect()
+    if tracemalloc.is_tracing():
+        tracemalloc.stop()
+    tracemalloc.start()
+    events = []
+    async for event in scan.async_start():
+        events.append(event)
+
+
+asyncio.run(run())
+_, peak = tracemalloc.get_traced_memory()
+tracemalloc.stop()
+server.shutdown()
+print(f"PEAK_MB:{round(peak / 1024 / 1024, 2)}")
diff --git a/bbot/test/benchmarks/test_excavate_benchmarks.py b/bbot/test/benchmarks/test_excavate_benchmarks.py
@@ -1,7 +1,11 @@
+import importlib.util
+
 import pytest
 import asyncio
 from bbot.scanner import Scanner
 
+HTTP_MODULE = "httpx" if importlib.util.find_spec("bbot.modules.httpx") else "http"
+
 
 class TestExcavateDirectBenchmarks:
     """
@@ -99,7 +103,7 @@ def _generate_realistic_content(self, index):
     async def _run_excavate_single_thread(self, text_segments):
         """Run excavate processing in single thread"""
         # Create scanner and initialize excavate
-        scan = Scanner("example.com", modules=["httpx"], config={"excavate": True})
+        scan = Scanner("example.com", modules=[HTTP_MODULE], config={"excavate": True})
         await scan._prep()
         excavate_module = scan.modules.get("excavate")
 
@@ -140,7 +144,7 @@ async def track_emit_event(event_data, *args, **kwargs):
     async def _run_excavate_parallel_tasks(self, text_segments):
         """Run excavate processing with parallel asyncio tasks"""
         # Create scanner and initialize excavate
-        scan = Scanner("example.com", modules=["httpx"], config={"excavate": True})
+        scan = Scanner("example.com", modules=[HTTP_MODULE], config={"excavate": True})
         await scan._prep()
         excavate_module = scan.modules.get("excavate")
 
diff --git a/bbot/test/benchmarks/test_scan_memory.py b/bbot/test/benchmarks/test_scan_memory.py
diff --git a/bbot/test/benchmarks/test_scan_throughput_benchmarks.py b/bbot/test/benchmarks/test_scan_throughput_benchmarks.py