Skip to content

Commit 87f05c6

Browse files
committed
chg: [crawler] import zip archive of captures from lookyloo
1 parent 612fb01 commit 87f05c6

6 files changed

Lines changed: 120 additions & 19 deletions

File tree

bin/lib/crawlers.py

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1388,7 +1388,6 @@ def extract_time_from_capture(self, capture):
13881388
if not self.date:
13891389
self.date = get_current_date(separator=True)
13901390
self.epoch = int(time.time())
1391-
print(self.date, self.epoch)
13921391

13931392
def extract_title(self, item, html_content):
13941393
title_content = extract_title(html_content)
@@ -1439,7 +1438,8 @@ def process_favicons(self, item, favicons):
14391438
fav.add(item.get_date(), item)
14401439

14411440
def process(self, capture, capture_parent='capture_importer'):
1442-
capture = self._decode_capture(capture)
1441+
if capture_parent != 'lookyloo':
1442+
capture = self._decode_capture(capture)
14431443
self.extract_domain_from_capture(capture)
14441444
# Filter unsafe onions
14451445
if self.domain.id.endswith('.onion'):
@@ -1481,7 +1481,7 @@ def process(self, capture, capture_parent='capture_importer'):
14811481
# SSHKeys.save_passive_ssh_host(self.domain.id)
14821482
return objs
14831483

1484-
def process_capture(self, parent_id, capture):
1484+
def process_capture(self, parent_id, capture, force=False):
14851485
objs = []
14861486
filter_page = False
14871487
if not parent_id:
@@ -1520,8 +1520,17 @@ def process_capture(self, parent_id, capture):
15201520
else:
15211521
last_url = f'http://{self.domain.id}'
15221522

1523+
# Filter duplicate
1524+
if not force and self.root_item_id is None:
1525+
if self.domain.exists_epoch_history(self.epoch):
1526+
self.logger.warning(f'Capture Already Imported, {self.domain.id} -> {self.epoch}')
1527+
return False
1528+
15231529
if capture.get('html') and not filter_page:
1524-
item_id = create_item_id(self.items_dir, self.domain.id)
1530+
if capture.get('uuid') and parent_id is None:
1531+
item_id = create_item_id(self.items_dir, self.domain.id, c_uuid=capture['uuid'])
1532+
else:
1533+
item_id = create_item_id(self.items_dir, self.domain.id)
15251534
item = Item(item_id)
15261535
print(item.id)
15271536

@@ -1574,13 +1583,15 @@ def process_capture(self, parent_id, capture):
15741583

15751584
def process_lookyloo_archive(self, archive):
15761585
temp_dir = os.path.join(os.environ['AIL_HOME'], 'temp/import')
1577-
archive = os.path.join(temp_dir, archive) # TODO sanitise
1586+
archive = os.path.join(temp_dir, archive)
15781587
if not os.path.commonpath([archive, temp_dir]) == temp_dir:
15791588
self.logger.critical(f'Path Transversal {archive}')
15801589
return []
15811590

15821591
files_to_skip = ['cnames.json', 'ipasn.json', 'ips.json', 'mx.json',
1583-
'nameservers.json', 'soa.json', 'hashlookup.json']
1592+
'nameservers.json', 'soa.json', 'hashlookup.json',
1593+
'cookies.json', 'storage.json', 'meta', 'parent', 'categories', 'data.filename', # TEMP
1594+
'data', 'trusted_timestamps.json', 'capture_settings.json', 'frames.json'] # TEMP
15841595
capture = {}
15851596
unrecoverable_error= False
15861597

@@ -1599,22 +1610,21 @@ def process_lookyloo_archive(self, archive):
15991610
elif filename.endswith('0.last_redirect.txt'):
16001611
capture['last_redirected_url'] = lookyloo_capture.read(filename).decode()
16011612
elif filename.endswith('0.png'):
1602-
capture['png'] = base64.b64encode(lookyloo_capture.read(filename))
1603-
# elif filename.endswith('0.cookies.json'): # TODO # # # #
1613+
capture['png'] = lookyloo_capture.read(filename)
1614+
# elif filename.endswith('0.cookies.json'):
16041615
# # Not required
16051616
# capture{'cookies'} = orjson.loads(lookyloo_capture.read(filename))
16061617
# elif filename.endswith('0.storage.json'):
16071618
# # Not required
16081619
# storage = orjson.loads(lookyloo_capture.read(filename))
16091620
elif filename.endswith('potential_favicons.ico'):
16101621
if 'potential_favicons' not in capture:
1611-
capture['potential_favicons'] = set()
1622+
capture['potential_favicons'] = []
16121623
# We may have more than one favicon
1613-
print(lookyloo_capture.read(filename))
1614-
capture['potential_favicons'].add(lookyloo_capture.read(filename))
1615-
# elif filename.endswith('uuid'): # TODO Avoid duplicate and multiple Imports
1616-
# uuid = lookyloo_capture.read(filename).decode()
1617-
# if self.uuid_exists(uuid): # TODO Avoid duplicate and multiple Imports
1624+
capture['potential_favicons'].append(lookyloo_capture.read(filename))
1625+
elif filename.endswith('uuid'): # TODO Avoid duplicate and multiple Imports
1626+
capture['uuid'] = lookyloo_capture.read(filename).decode()
1627+
# if self.uuid_exists(uuid):
16181628
# messages['warnings'].append(f'UUID {uuid} already exists, set a new one.')
16191629
# uuid = str(uuid4())
16201630
# elif filename.endswith('meta'):
@@ -2681,14 +2691,16 @@ def is_redirection(domain, last_url):
26812691
last_domain = '{}.{}'.format(last_domain[-2], last_domain[-1])
26822692
return domain != last_domain
26832693

2684-
def create_item_id(item_dir, domain):
2694+
def create_item_id(item_dir, domain, c_uuid=None):
2695+
if not c_uuid:
2696+
c_uuid = str(uuid.uuid4())
26852697
# remove /
26862698
domain = domain.replace('/', '_')
26872699
if len(domain) > 215:
2688-
n_uuid = domain[-215:]+str(uuid.uuid4())
2700+
item_id = domain[-215:]+c_uuid
26892701
else:
2690-
n_uuid = domain+str(uuid.uuid4())
2691-
return os.path.join(item_dir, n_uuid)
2702+
item_id = domain+c_uuid
2703+
return os.path.join(item_dir, item_id)
26922704

26932705
# # # # # # # # # # # #
26942706
# #

bin/lib/objects/Domains.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,10 @@ def _add_history_root_item(self, root_item, epoch):
529529
# Create/Update crawler history
530530
r_crawler.zadd(f'domain:history:{self.id}', {root_item: epoch})
531531

532+
def exists_epoch_history(self, epoch):
533+
nb = r_crawler.zcount(f'domain:history:{self.id}', epoch, epoch)
534+
return nb > 0
535+
532536
# if domain down -> root_item = epoch
533537
def add_history(self, epoch, root_item=None, date=None):
534538
if not date:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ redis>4.4.4
1212
python-magic>0.4.15
1313
yara-python>4.0.2
1414
pyfaup-rs
15+
orjson
1516

1617
# AIL Sync
1718
websockets>9.0

var/www/blueprints/crawler_splash.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111
import sys
1212
import time
1313
from datetime import datetime
14+
from pathlib import Path
1415

15-
from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, send_file, abort
16+
from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, send_file, abort, current_app
1617
from flask_login import login_required, current_user
1718

1819
sys.path.append('modules')
@@ -25,6 +26,7 @@
2526
##################################
2627
# Import Project packages
2728
##################################
29+
from lib.ail_core import generate_uuid
2830
from lib import crawlers
2931
from lib import Language
3032
from lib.objects import Domains
@@ -53,6 +55,34 @@ def api_validator(message, code):
5355
return Response(json.dumps(message, indent=2, sort_keys=True), mimetype='application/json'), code
5456

5557

58+
def _import_lookyloo_archive_from_upload(uploaded_file, logger):
59+
if not uploaded_file or not uploaded_file.filename:
60+
return False, 'No archive file was provided.'
61+
62+
filename = uploaded_file.filename.strip()
63+
if not filename.lower().endswith('.zip'):
64+
return False, 'Invalid file type: only .zip archives are supported.'
65+
66+
archive_name = f"{generate_uuid()}.zip"
67+
imports_root = Path(os.environ['AIL_HOME']) / 'temp' / 'import'
68+
archive_path = imports_root / archive_name
69+
70+
try:
71+
imports_root.mkdir(parents=True, exist_ok=True)
72+
uploaded_file.save(str(archive_path))
73+
74+
crawler_processor = crawlers.CrawlerCapturesProcessor(logger)
75+
imported = crawler_processor.process_lookyloo_archive(archive_name)
76+
if not imported:
77+
return False, 'Invalid or unsupported Lacus capture archive format.'
78+
return True, f'Lacus capture archive imported successfully ({len(imported)} object(s)).'
79+
except Exception as e:
80+
logger.exception(f'Error while importing lookyloo archive: {e}')
81+
return False, f'Archive processing failed: {e}'
82+
finally:
83+
archive_path.unlink(missing_ok=True)
84+
85+
5686
def create_json_response(data, status_code):
5787
if status_code == 403:
5888
abort(403)
@@ -113,9 +143,19 @@ def manual():
113143
crawlers_types=crawlers_types,
114144
proxies=proxies,
115145
l_cookiejar=l_cookiejar,
146+
import_status=request.args.get('import_status'),
147+
import_message=request.args.get('import_message'),
116148
tags_selector_data=Tag.get_tags_selector_data())
117149

118150

151+
@crawler_splash.route("/crawlers/import_lookyloo_archive", methods=['POST'])
152+
@login_required
153+
@login_user_no_api
154+
def import_lookyloo_archive():
155+
success, message = _import_lookyloo_archive_from_upload(request.files.get('lookyloo_archive'), logger=current_app.logger)
156+
return redirect(url_for('crawler_splash.manual', import_status='success' if success else 'error', import_message=message))
157+
158+
119159
@crawler_splash.route("/crawlers/send_to_spider", methods=['POST'])
120160
@login_required
121161
@login_user_no_api

var/www/static/image/lookyloo.png

26.4 KB
Loading

var/www/templates/crawler/crawler_splash/crawler_manual.html

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434

3535
{% include 'crawler/crawler_disabled.html' %}
3636

37+
3738
<div class="card text-white bg-dark mb-3 mt-1">
3839
<div class="card-header">
3940
<h5 class="card-title">Crawl a Domain</h5>
@@ -223,6 +224,44 @@ <h5><i class="fas fa-clock"></i>&nbsp;Adjust crawling interval as needed</h5>
223224
</div>
224225
</div>
225226

227+
{% if import_message %}
228+
<div class="alert {% if import_status == 'success' %}alert-success{% else %}alert-danger{% endif %} mt-3" role="alert">
229+
{{ import_message }}
230+
</div>
231+
{% endif %}
232+
233+
<div class="card text-white bg-dark mb-3 mt-1">
234+
<div class="card-header">
235+
<h5 class="card-title">Import Lookyloo Capture Archive</h5>
236+
</div>
237+
238+
<div class="card-body">
239+
<div class="d-flex justify-content-between align-items-center">
240+
<div class="flex-grow-1 pr-3">
241+
<p class="card-text">Upload a zipped capture archive exported from Lookyloo.</p>
242+
243+
<form action="{{ url_for('crawler_splash.import_lookyloo_archive') }}" method="post" enctype="multipart/form-data">
244+
<div class="input-group mb-2">
245+
<div class="custom-file">
246+
<input class="custom-file-input" type="file" id="lookyloo_archive" name="lookyloo_archive" accept=".zip,application/zip" required>
247+
<label class="custom-file-label" for="lookyloo_archive">Choose Lookyloo archive (.zip)</label>
248+
</div>
249+
</div>
250+
251+
<button class="btn btn-primary mt-1" type="submit">
252+
<i class="fas fa-file-archive"></i> Import Archive
253+
</button>
254+
</form>
255+
</div>
256+
257+
<img src="{{ url_for('static', filename='image/lookyloo.png') }}"
258+
alt="Lookyloo"
259+
class="ml-3"
260+
style="max-width: 140px; height: auto;">
261+
</div>
262+
</div>
263+
</div>
264+
226265

227266
</div>
228267

@@ -249,6 +288,11 @@ <h5><i class="fas fa-clock"></i>&nbsp;Adjust crawling interval as needed</h5>
249288
$('#queue_type_selector').on("change", function () {
250289
queue_type_selector_input_controler();
251290
});
291+
292+
$('#lookyloo_archive').on('change', function(){
293+
const fileName = $(this).val().split('\\').pop();
294+
$(this).next('.custom-file-label').html(fileName || 'Choose Lookyloo archive (.zip)');
295+
});
252296
});
253297

254298
function toggle_sidebar(){

0 commit comments

Comments (0)