Skip to content

Commit 93fee25

Browse files
authored
Merge branch '3.0' into wayback-upgrade
2 parents 853a2d8 + 6b359ac commit 93fee25

10 files changed

Lines changed: 346 additions & 16 deletions

File tree

bbot/core/helpers/dns/engine.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ def __init__(self, socket_path, config={}, debug=False):
7272
self.wildcard_ignore = []
7373
self.wildcard_ignore = tuple([str(d).strip().lower() for d in self.wildcard_ignore])
7474
self.wildcard_tests = self.dns_config.get("wildcard_tests", 5)
75-
self._wildcard_cache = {}
75+
self._wildcard_cache = LRUCache(maxsize=10000)
7676
# since wildcard detection takes some time, this is to prevent multiple
7777
# modules from kicking off wildcard detection for the same domain at the same time
7878
self._wildcard_lock = NamedLock()
@@ -81,10 +81,10 @@ def __init__(self, socket_path, config={}, debug=False):
8181
self._last_dns_success = None
8282
self._last_connectivity_warning = time.time()
8383
# keeps track of warnings issued for wildcard detection to prevent duplicate warnings
84-
self._dns_warnings = set()
85-
self._errors = {}
84+
self._dns_warnings = LRUCache(maxsize=10000)
85+
self._errors = LRUCache(maxsize=10000)
8686
self._debug = self.dns_config.get("debug", False)
87-
self._dns_cache = LRUCache(maxsize=10000)
87+
self._dns_cache = LRUCache(maxsize=100000)
8888

8989
async def resolve(self, query, **kwargs):
9090
"""Resolve DNS names and IP addresses to their corresponding results.
@@ -221,7 +221,7 @@ async def _resolve_hostname(self, query, **kwargs):
221221
self.log.verbose(
222222
f'Aborting future {rdtype} queries to "{parent}" because error count ({error_count:,}) exceeded abort threshold ({self.abort_threshold:,})'
223223
)
224-
self._dns_warnings.add(parent_hash)
224+
self._dns_warnings[parent_hash] = True
225225
return results, errors
226226
results = await self._catch(self.resolver.resolve, query, **kwargs)
227227
if use_cache:

bbot/modules/gowitness.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import sys
23
import asyncio
34
import aiosqlite
45
import multiprocessing
@@ -243,7 +244,7 @@ async def handle_batch(self, *events):
243244
context=f"{{module}} visited {{event.type}}: {url}",
244245
)
245246
if url_event and ip:
246-
url_event._resolved_hosts.add(ip)
247+
url_event._resolved_hosts.add(sys.intern(ip))
247248
await self.emit_event(url_event)
248249

249250
# emit technologies

bbot/modules/httpx.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
import sys
23
import orjson
34
import tempfile
45
import subprocess
@@ -207,7 +208,7 @@ async def handle_batch(self, *events):
207208
if url_event:
208209
httpx_ip = j.get("host", "")
209210
if httpx_ip:
210-
url_event._resolved_hosts.add(httpx_ip)
211+
url_event._resolved_hosts.add(sys.intern(httpx_ip))
211212
url_event.data["status_code"] = status_code
212213
title = j.get("title", "")
213214
if title:

bbot/modules/internal/dnsresolve.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import sys
12
import ipaddress
23
from contextlib import suppress
34

@@ -231,7 +232,7 @@ def check_scope(self, event):
231232
for rdtype in ("A", "AAAA", "CNAME"):
232233
hosts = dns_children.get(rdtype, [])
233234
# update resolved hosts
234-
event.resolved_hosts.update(hosts)
235+
event.resolved_hosts.update(sys.intern(h) for h in hosts)
235236
for host in hosts:
236237
# having a CNAME to an in-scope host doesn't make you in-scope
237238
if rdtype != "CNAME":
@@ -258,6 +259,7 @@ async def resolve_event(self, event, types):
258259
queries = [(event_host, rdtype) for rdtype in types]
259260
dns_errors = {}
260261
async for (query, rdtype), (answers, errors) in self.helpers.dns.resolve_raw_batch(queries):
262+
rdtype = sys.intern(rdtype)
261263
# errors
262264
try:
263265
dns_errors[rdtype].update(errors)
@@ -272,6 +274,8 @@ async def resolve_event(self, event, types):
272274
event.raw_dns_records[rdtype] = {answer}
273275
# hosts
274276
for _rdtype, host in extract_targets(answer):
277+
_rdtype = sys.intern(_rdtype)
278+
host = sys.intern(host)
275279
try:
276280
event.dns_children[_rdtype].add(host)
277281
except KeyError:

bbot/scripts/benchmark_report.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
180180
|--------------|---------|------------|-----------|-----------|"""
181181

182182
significant_changes = []
183+
new_tests = []
183184
performance_summary = []
184185

185186
for current_bench in current_benchmarks:
@@ -245,10 +246,16 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
245246
else:
246247
base_ops = 1 / base_mean # Default: single operation
247248

248-
# Use per-event memory if available, otherwise use time
249+
# Use memory metrics if available, otherwise use time
250+
current_mb = current_extra.get("total_memory_mb")
251+
base_mb = base_extra.get("total_memory_mb")
249252
current_peb = current_extra.get("per_event_bytes")
250253
base_peb = base_extra.get("per_event_bytes")
251-
if current_peb is not None and base_peb is not None:
254+
if current_mb is not None and base_mb is not None and current_peb is None:
255+
change_percent, emoji = calculate_change_percentage(base_mb, current_mb)
256+
base_label = f"{base_mb:.1f} MB"
257+
current_label = f"{current_mb:.1f} MB"
258+
elif current_peb is not None and base_peb is not None:
252259
change_percent, emoji = calculate_change_percentage(base_peb, current_peb)
253260
base_label = f"{base_peb:.0f} B/event"
254261
current_label = f"{current_peb:.0f} B/event"
@@ -269,7 +276,10 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
269276

270277
# Track significant changes
271278
if abs(change_percent) > 10:
272-
is_memory = current_extra.get("per_event_bytes") is not None
279+
is_memory = (
280+
current_extra.get("per_event_bytes") is not None
281+
or current_extra.get("total_memory_mb") is not None
282+
)
273283
if is_memory:
274284
direction = "🐌 more memory" if change_percent > 0 else "🚀 less memory"
275285
else:
@@ -295,9 +305,7 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
295305

296306
else:
297307
table += f"\n| **{test_name}** | `-` | `{format_time(current_mean)}` | **New** 🆕 | 🆕 |"
298-
significant_changes.append(
299-
f"- **{test_name}**: New test 🆕 ({format_time(current_mean)}, {format_ops(current_ops)})"
300-
)
308+
new_tests.append(f"- **{test_name}**: {format_time(current_mean)}, {format_ops(current_ops)}")
301309

302310
table += "\n\n</details>\n\n"
303311

@@ -323,6 +331,13 @@ def generate_comparison_table(current_data: Dict, base_data: Dict, current_branc
323331
table += f"{change}\n"
324332
table += "\n"
325333

334+
# Add new tests section
335+
if new_tests:
336+
table += "### 🆕 New Tests\n\n"
337+
for new_test in new_tests:
338+
table += f"{new_test}\n"
339+
table += "\n"
340+
326341
return table
327342

328343

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""
2+
Subprocess script for subdomain enumeration memory benchmark.
3+
4+
Injects SUBDOMAIN_ENUM_COUNT synthetic DNS_NAME events into a scan
5+
and prints peak tracemalloc memory to stdout.
6+
7+
Invoked by test_scan_memory.py — not meant to be run directly.
8+
"""
9+
10+
import gc
11+
import sys
12+
import asyncio
13+
import tracemalloc
14+
15+
from bbot.scanner import Scanner
16+
17+
SUBDOMAIN_ENUM_COUNT = int(sys.argv[1])
18+
19+
scan = Scanner(
20+
"blacklanternsecurity.com",
21+
modules=[],
22+
output_modules=["python"],
23+
config={
24+
"dns": {"disable": True},
25+
"scope": {"search_distance": 0},
26+
"web": {"spider_distance": 0, "spider_depth": 0},
27+
"speculate": False,
28+
"excavate": True,
29+
"aggregate": False,
30+
"cloudcheck": False,
31+
},
32+
force_start=True,
33+
)
34+
35+
36+
async def run():
37+
await scan._prep()
38+
gc.collect()
39+
if tracemalloc.is_tracing():
40+
tracemalloc.stop()
41+
tracemalloc.start()
42+
events = []
43+
injected = False
44+
async for event in scan.async_start():
45+
events.append(event)
46+
if event.type == "SCAN" and not injected:
47+
injected = True
48+
root_event = scan.root_event
49+
for i in range(SUBDOMAIN_ENUM_COUNT):
50+
dns_event = scan.make_event(
51+
f"sub{i}.blacklanternsecurity.com",
52+
"DNS_NAME",
53+
parent=root_event,
54+
context=f"benchmark DNS_NAME {i}",
55+
)
56+
await scan.ingress_module.queue_event(dns_event, {})
57+
58+
59+
asyncio.run(run())
60+
_, peak = tracemalloc.get_traced_memory()
61+
tracemalloc.stop()
62+
print(f"PEAK_MB:{round(peak / 1024 / 1024, 2)}")
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""
2+
Subprocess script for web crawl memory benchmark.
3+
4+
Launches a local HTTP server with NUM_PAGES pages (each BODY_SIZE bytes),
5+
runs a BBOT scan against it, and prints peak tracemalloc memory to stdout.
6+
7+
Invoked by test_scan_memory.py — not meant to be run directly.
8+
"""
9+
10+
import gc
11+
import sys
12+
import asyncio
13+
import threading
14+
import tracemalloc
15+
import importlib.util
16+
from http.server import HTTPServer, BaseHTTPRequestHandler
17+
18+
from bbot.scanner import Scanner
19+
20+
NUM_PAGES = int(sys.argv[1])
21+
BODY_SIZE = int(sys.argv[2])
22+
23+
HTTP_MODULE = "httpx" if importlib.util.find_spec("bbot.modules.httpx") else "http"
24+
25+
26+
class H(BaseHTTPRequestHandler):
27+
def do_GET(self):
28+
if self.path == "/":
29+
links = "".join(f'<a href="/page{i}">page{i}</a>' for i in range(NUM_PAGES))
30+
body = "<html><body>" + links + "</body></html>"
31+
elif self.path.startswith("/page"):
32+
i = self.path.replace("/page", "")
33+
links = f'<a href="/data{i}/info">info</a><a href="/data{i}/details">details</a>'
34+
body = "<html><body><h1>Page " + i + "</h1>" + links + "A" * BODY_SIZE + "</body></html>"
35+
elif self.path.startswith("/data"):
36+
body = "<html><body>data endpoint</body></html>"
37+
else:
38+
self.send_response(404)
39+
self.end_headers()
40+
return
41+
self.send_response(200)
42+
self.send_header("Content-Type", "text/html")
43+
self.end_headers()
44+
self.wfile.write(body.encode())
45+
46+
def log_message(self, *a):
47+
pass
48+
49+
50+
server = HTTPServer(("127.0.0.1", 0), H)
51+
port = server.server_address[1]
52+
threading.Thread(target=server.serve_forever, daemon=True).start()
53+
54+
scan = Scanner(
55+
f"http://127.0.0.1:{port}/",
56+
modules=[HTTP_MODULE],
57+
output_modules=["python"],
58+
config={
59+
"dns": {"disable": True},
60+
"scope": {"search_distance": 0},
61+
"web": {"spider_distance": 10, "spider_depth": 10, "spider_links_per_page": NUM_PAGES},
62+
"speculate": True,
63+
"excavate": True,
64+
"aggregate": False,
65+
"cloudcheck": False,
66+
},
67+
force_start=True,
68+
)
69+
70+
71+
async def run():
72+
await scan._prep()
73+
gc.collect()
74+
if tracemalloc.is_tracing():
75+
tracemalloc.stop()
76+
tracemalloc.start()
77+
events = []
78+
async for event in scan.async_start():
79+
events.append(event)
80+
81+
82+
asyncio.run(run())
83+
_, peak = tracemalloc.get_traced_memory()
84+
tracemalloc.stop()
85+
server.shutdown()
86+
print(f"PEAK_MB:{round(peak / 1024 / 1024, 2)}")

bbot/test/benchmarks/test_excavate_benchmarks.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1+
import importlib.util
2+
13
import pytest
24
import asyncio
35
from bbot.scanner import Scanner
46

7+
HTTP_MODULE = "httpx" if importlib.util.find_spec("bbot.modules.httpx") else "http"
8+
59

610
class TestExcavateDirectBenchmarks:
711
"""
@@ -99,7 +103,7 @@ def _generate_realistic_content(self, index):
99103
async def _run_excavate_single_thread(self, text_segments):
100104
"""Run excavate processing in single thread"""
101105
# Create scanner and initialize excavate
102-
scan = Scanner("example.com", modules=["httpx"], config={"excavate": True})
106+
scan = Scanner("example.com", modules=[HTTP_MODULE], config={"excavate": True})
103107
await scan._prep()
104108
excavate_module = scan.modules.get("excavate")
105109

@@ -140,7 +144,7 @@ async def track_emit_event(event_data, *args, **kwargs):
140144
async def _run_excavate_parallel_tasks(self, text_segments):
141145
"""Run excavate processing with parallel asyncio tasks"""
142146
# Create scanner and initialize excavate
143-
scan = Scanner("example.com", modules=["httpx"], config={"excavate": True})
147+
scan = Scanner("example.com", modules=[HTTP_MODULE], config={"excavate": True})
144148
await scan._prep()
145149
excavate_module = scan.modules.get("excavate")
146150

0 commit comments

Comments
 (0)