Skip to content

Commit 87f05c6

Browse files
committed
chg: [crawler] import zip archive of captures from lookyloo
1 parent 612fb01 commit 87f05c6

6 files changed

Lines changed: 120 additions & 19 deletions

File tree

bin/lib/crawlers.py

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1388,7 +1388,6 @@ def extract_time_from_capture(self, capture):
13881388
if not self.date:
13891389
self.date = get_current_date(separator=True)
13901390
self.epoch = int(time.time())
1391-
print(self.date, self.epoch)
13921391

13931392
def extract_title(self, item, html_content):
13941393
title_content = extract_title(html_content)
@@ -1439,7 +1438,8 @@ def process_favicons(self, item, favicons):
14391438
fav.add(item.get_date(), item)
14401439

14411440
def process(self, capture, capture_parent='capture_importer'):
1442-
capture = self._decode_capture(capture)
1441+
if capture_parent != 'lookyloo':
1442+
capture = self._decode_capture(capture)
14431443
self.extract_domain_from_capture(capture)
14441444
# Filter unsafe onions
14451445
if self.domain.id.endswith('.onion'):
@@ -1481,7 +1481,7 @@ def process(self, capture, capture_parent='capture_importer'):
14811481
# SSHKeys.save_passive_ssh_host(self.domain.id)
14821482
return objs
14831483

1484-
def process_capture(self, parent_id, capture):
1484+
def process_capture(self, parent_id, capture, force=False):
14851485
objs = []
14861486
filter_page = False
14871487
if not parent_id:
@@ -1520,8 +1520,17 @@ def process_capture(self, parent_id, capture):
15201520
else:
15211521
last_url = f'http://{self.domain.id}'
15221522

1523+
# Filter duplicate
1524+
if not force and self.root_item_id is None:
1525+
if self.domain.exists_epoch_history(self.epoch):
1526+
self.logger.warning(f'Capture Already Imported, {self.domain.id} -> {self.epoch}')
1527+
return False
1528+
15231529
if capture.get('html') and not filter_page:
1524-
item_id = create_item_id(self.items_dir, self.domain.id)
1530+
if capture.get('uuid') and parent_id is None:
1531+
item_id = create_item_id(self.items_dir, self.domain.id, c_uuid=capture['uuid'])
1532+
else:
1533+
item_id = create_item_id(self.items_dir, self.domain.id)
15251534
item = Item(item_id)
15261535
print(item.id)
15271536

@@ -1574,13 +1583,15 @@ def process_capture(self, parent_id, capture):
15741583

15751584
def process_lookyloo_archive(self, archive):
15761585
temp_dir = os.path.join(os.environ['AIL_HOME'], 'temp/import')
1577-
archive = os.path.join(temp_dir, archive) # TODO sanitise
1586+
archive = os.path.join(temp_dir, archive)
15781587
if not os.path.commonpath([archive, temp_dir]) == temp_dir:
15791588
self.logger.critical(f'Path Transversal {archive}')
15801589
return []
15811590

15821591
files_to_skip = ['cnames.json', 'ipasn.json', 'ips.json', 'mx.json',
1583-
'nameservers.json', 'soa.json', 'hashlookup.json']
1592+
'nameservers.json', 'soa.json', 'hashlookup.json',
1593+
'cookies.json', 'storage.json', 'meta', 'parent', 'categories', 'data.filename', # TEMP
1594+
'data', 'trusted_timestamps.json', 'capture_settings.json', 'frames.json'] # TEMP
15841595
capture = {}
15851596
unrecoverable_error= False
15861597

@@ -1599,22 +1610,21 @@ def process_lookyloo_archive(self, archive):
15991610
elif filename.endswith('0.last_redirect.txt'):
16001611
capture['last_redirected_url'] = lookyloo_capture.read(filename).decode()
16011612
elif filename.endswith('0.png'):
1602-
capture['png'] = base64.b64encode(lookyloo_capture.read(filename))
1603-
# elif filename.endswith('0.cookies.json'): # TODO # # # #
1613+
capture['png'] = lookyloo_capture.read(filename)
1614+
# elif filename.endswith('0.cookies.json'):
16041615
# # Not required
16051616
# capture{'cookies'} = orjson.loads(lookyloo_capture.read(filename))
16061617
# elif filename.endswith('0.storage.json'):
16071618
# # Not required
16081619
# storage = orjson.loads(lookyloo_capture.read(filename))
16091620
elif filename.endswith('potential_favicons.ico'):
16101621
if 'potential_favicons' not in capture:
1611-
capture['potential_favicons'] = set()
1622+
capture['potential_favicons'] = []
16121623
# We may have more than one favicon
1613-
print(lookyloo_capture.read(filename))
1614-
capture['potential_favicons'].add(lookyloo_capture.read(filename))
1615-
# elif filename.endswith('uuid'): # TODO Avoid duplicate and multiple Imports
1616-
# uuid = lookyloo_capture.read(filename).decode()
1617-
# if self.uuid_exists(uuid): # TODO Avoid duplicate and multiple Imports
1624+
capture['potential_favicons'].append(lookyloo_capture.read(filename))
1625+
elif filename.endswith('uuid'): # TODO Avoid duplicate and multiple Imports
1626+
capture['uuid'] = lookyloo_capture.read(filename).decode()
1627+
# if self.uuid_exists(uuid):
16181628
# messages['warnings'].append(f'UUID {uuid} already exists, set a new one.')
16191629
# uuid = str(uuid4())
16201630
# elif filename.endswith('meta'):
@@ -2681,14 +2691,16 @@ def is_redirection(domain, last_url):
26812691
last_domain = '{}.{}'.format(last_domain[-2], last_domain[-1])
26822692
return domain != last_domain
26832693

2684-
def create_item_id(item_dir, domain):
2694+
def create_item_id(item_dir, domain, c_uuid=None):
2695+
if not c_uuid:
2696+
c_uuid = str(uuid.uuid4())
26852697
# remove /
26862698
domain = domain.replace('/', '_')
26872699
if len(domain) > 215:
2688-
n_uuid = domain[-215:]+str(uuid.uuid4())
2700+
item_id = domain[-215:]+c_uuid
26892701
else:
2690-
n_uuid = domain+str(uuid.uuid4())
2691-
return os.path.join(item_dir, n_uuid)
2702+
item_id = domain+c_uuid
2703+
return os.path.join(item_dir, item_id)
26922704

26932705
# # # # # # # # # # # #
26942706
# #

bin/lib/objects/Domains.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,10 @@ def _add_history_root_item(self, root_item, epoch):
529529
# Create/Update crawler history
530530
r_crawler.zadd(f'domain:history:{self.id}', {root_item: epoch})
531531

532+
def exists_epoch_history(self, epoch):
533+
nb = r_crawler.zcount(f'domain:history:{self.id}', epoch, epoch)
534+
return nb > 0
535+
532536
# if domain down -> root_item = epoch
533537
def add_history(self, epoch, root_item=None, date=None):
534538
if not date:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ redis>4.4.4
1212
python-magic>0.4.15
1313
yara-python>4.0.2
1414
pyfaup-rs
15+
orjson
1516

1617
# AIL Sync
1718
websockets>9.0

var/www/blueprints/crawler_splash.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111
import sys
1212
import time
1313
from datetime import datetime
14+
from pathlib import Path
1415

15-
from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, send_file, abort
16+
from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, send_file, abort, current_app
1617
from flask_login import login_required, current_user
1718

1819
sys.path.append('modules')
@@ -25,6 +26,7 @@
2526
##################################
2627
# Import Project packages
2728
##################################
29+
from lib.ail_core import generate_uuid
2830
from lib import crawlers
2931
from lib import Language
3032
from lib.objects import Domains
@@ -53,6 +55,34 @@ def api_validator(message, code):
5355
return Response(json.dumps(message, indent=2, sort_keys=True), mimetype='application/json'), code
5456

5557

58+
def _import_lookyloo_archive_from_upload(uploaded_file, logger):
59+
if not uploaded_file or not uploaded_file.filename:
60+
return False, 'No archive file was provided.'
61+
62+
filename = uploaded_file.filename.strip()
63+
if not filename.lower().endswith('.zip'):
64+
return False, 'Invalid file type: only .zip archives are supported.'
65+
66+
archive_name = f"{generate_uuid()}.zip"
67+
imports_root = Path(os.environ['AIL_HOME']) / 'temp' / 'import'
68+
archive_path = imports_root / archive_name
69+
70+
try:
71+
imports_root.mkdir(parents=True, exist_ok=True)
72+
uploaded_file.save(str(archive_path))
73+
74+
crawler_processor = crawlers.CrawlerCapturesProcessor(logger)
75+
imported = crawler_processor.process_lookyloo_archive(archive_name)
76+
if not imported:
77+
return False, 'Invalid or unsupported Lacus capture archive format.'
78+
return True, f'Lacus capture archive imported successfully ({len(imported)} object(s)).'
79+
except Exception as e:
80+
logger.exception(f'Error while importing lookyloo archive: {e}')
81+
return False, f'Archive processing failed: {e}'
82+
finally:
83+
archive_path.unlink(missing_ok=True)
84+
85+
5686
def create_json_response(data, status_code):
5787
if status_code == 403:
5888
abort(403)
@@ -113,9 +143,19 @@ def manual():
113143
crawlers_types=crawlers_types,
114144
proxies=proxies,
115145
l_cookiejar=l_cookiejar,
146+
import_status=request.args.get('import_status'),
147+
import_message=request.args.get('import_message'),
116148
tags_selector_data=Tag.get_tags_selector_data())
117149

118150

151+
@crawler_splash.route("/crawlers/import_lookyloo_archive", methods=['POST'])
152+
@login_required
153+
@login_user_no_api
154+
def import_lookyloo_archive():
155+
success, message = _import_lookyloo_archive_from_upload(request.files.get('lookyloo_archive'), logger=current_app.logger)
156+
return redirect(url_for('crawler_splash.manual', import_status='success' if success else 'error', import_message=message))
157+
158+
119159
@crawler_splash.route("/crawlers/send_to_spider", methods=['POST'])
120160
@login_required
121161
@login_user_no_api

var/www/static/image/lookyloo.png

26.4 KB
Loading

var/www/templates/crawler/crawler_splash/crawler_manual.html

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434

3535
{% include 'crawler/crawler_disabled.html' %}
3636

37+
3738
<div class="card text-white bg-dark mb-3 mt-1">
3839
<div class="card-header">
3940
<h5 class="card-title">Crawl a Domain</h5>
@@ -223,6 +224,44 @@ <h5><i class="fas fa-clock"></i>&nbsp;Adjust crawling interval as needed</h5>
223224
</div>
224225
</div>
225226

227+
{% if import_message %}
228+
<div class="alert {% if import_status == 'success' %}alert-success{% else %}alert-danger{% endif %} mt-3" role="alert">
229+
{{ import_message }}
230+
</div>
231+
{% endif %}
232+
233+
<div class="card text-white bg-dark mb-3 mt-1">
234+
<div class="card-header">
235+
<h5 class="card-title">Import Lookyloo Capture Archive</h5>
236+
</div>
237+
238+
<div class="card-body">
239+
<div class="d-flex justify-content-between align-items-center">
240+
<div class="flex-grow-1 pr-3">
241+
<p class="card-text">Upload a zipped capture archive exported from Lookyloo.</p>
242+
243+
<form action="{{ url_for('crawler_splash.import_lookyloo_archive') }}" method="post" enctype="multipart/form-data">
244+
<div class="input-group mb-2">
245+
<div class="custom-file">
246+
<input class="custom-file-input" type="file" id="lookyloo_archive" name="lookyloo_archive" accept=".zip,application/zip" required>
247+
<label class="custom-file-label" for="lookyloo_archive">Choose Lookyloo archive (.zip)</label>
248+
</div>
249+
</div>
250+
251+
<button class="btn btn-primary mt-1" type="submit">
252+
<i class="fas fa-file-archive"></i> Import Archive
253+
</button>
254+
</form>
255+
</div>
256+
257+
<img src="{{ url_for('static', filename='image/lookyloo.png') }}"
258+
alt="Lookyloo"
259+
class="ml-3"
260+
style="max-width: 140px; height: auto;">
261+
</div>
262+
</div>
263+
</div>
264+
226265

227266
</div>
228267

@@ -249,6 +288,11 @@ <h5><i class="fas fa-clock"></i>&nbsp;Adjust crawling interval as needed</h5>
249288
$('#queue_type_selector').on("change", function () {
250289
queue_type_selector_input_controler();
251290
});
291+
292+
$('#lookyloo_archive').on('change', function(){
293+
const fileName = $(this).val().split('\\').pop();
294+
$(this).next('.custom-file-label').html(fileName || 'Choose Lookyloo archive (.zip)');
295+
});
252296
});
253297

254298
function toggle_sidebar(){

0 commit comments

Comments (0)