Skip to content

Commit dd39bae

Browse files
committed
chg: [crawler test] add web crawler availability test
1 parent b9c8c52 commit dd39bae

3 files changed

Lines changed: 113 additions & 49 deletions

File tree

bin/lib/crawlers.py

Lines changed: 65 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2757,61 +2757,85 @@ def api_set_crawler_max_captures(data):
27572757
## TEST ##
27582758

27592759
def is_test_ail_crawlers_successful():
    """Tell whether the last crawler test succeeded for at least one network.

    Reads the ``web_success`` and ``onion_success`` fields of the
    ``crawler:tor:test`` hash and returns True when either of them holds the
    string ``'True'``.
    """
    outcomes = (
        r_db.hget('crawler:tor:test', 'web_success'),
        r_db.hget('crawler:tor:test', 'onion_success'),
    )
    return 'True' in outcomes
27612763

27622764
def get_test_ail_crawlers_message():
    """Return a two-line summary of the last crawler test messages.

    The first line carries the web message and the second line the onion
    message, both taken from get_test_ail_crawlers_metadata().
    """
    meta = get_test_ail_crawlers_metadata()
    return "Web: {}\nOnion: {}".format(meta['web_message'], meta['onion_message'])
2767+
2768+
def get_test_ail_crawlers_metadata():
    """Load the stored crawler test results, filling in defaults.

    Returns:
        dict with the keys ``web_success``, ``web_message``,
        ``onion_success``, ``onion_message`` and ``date_test``, each read
        from the ``crawler:tor:test`` hash. A success flag whose stored value
        is None becomes ``'False'``; a falsy message or date is replaced by a
        placeholder text.
    """
    fields = ('web_success', 'web_message', 'onion_success', 'onion_message', 'date_test')
    metadata = {field: r_db.hget('crawler:tor:test', field) for field in fields}

    # Success flags are only defaulted when the stored value is None.
    for flag in ('web_success', 'onion_success'):
        if metadata[flag] is None:
            metadata[flag] = 'False'

    # Messages and the date are also defaulted when they are empty.
    placeholders = {
        'web_message': 'Web crawler test has not been run yet.',
        'onion_message': 'Onion crawler test has not been run yet.',
        'date_test': 'Unknown',
    }
    for field, placeholder in placeholders.items():
        if not metadata[field]:
            metadata[field] = placeholder

    return metadata
2787+
2788+
def save_test_ail_crawlers_result(web_success, web_message, onion_success, onion_message, date_test):
    """Persist the outcome of a crawler test in the ``crawler:tor:test`` hash.

    The success flags are stored through str(); the messages and the test
    date are stored as given.
    """
    record = (
        ('web_success', str(web_success)),
        ('web_message', web_message),
        ('onion_success', str(onion_success)),
        ('onion_message', onion_message),
        ('date_test', date_test),
    )
    for field, value in record:
        r_db.hset('crawler:tor:test', field, value)
2794+
2795+
def _run_lacus_network_test(lacus, user_agent, url, expected_text, proxy=None, timeout=90):
    """Capture one URL through Lacus and check the page for an expected text.

    Args:
        lacus: Lacus client; must provide enqueue(), get_capture_status()
            and get_capture().
        user_agent: User-Agent string sent with the capture request.
        url: URL to capture (depth 0, forced capture).
        expected_text: substring that must appear in the captured HTML.
        proxy: optional proxy setting forwarded to enqueue(); left out of the
            request when falsy.
        timeout: seconds allowed for the capture. Used both as the capture's
            ``general_timeout_in_sec`` and as the polling deadline.

    Returns:
        A ``(success, message)`` tuple. ``success`` is True only when the
        captured HTML contains ``expected_text``; ``message`` describes the
        outcome or carries the error reported by Lacus.
    """
    enqueue_kwargs = {
        'url': url,
        'depth': 0,
        'user_agent': user_agent,
        'force': True,
        'general_timeout_in_sec': timeout,
    }
    if proxy:
        enqueue_kwargs['proxy'] = proxy
    capture_uuid = lacus.enqueue(**enqueue_kwargs)

    # Monotonic clock: the deadline is not affected by system clock changes.
    status = lacus.get_capture_status(capture_uuid)
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline and status != CaptureStatus.DONE:
        time.sleep(1)
        status = lacus.get_capture_status(capture_uuid)

    entries = lacus.get_capture(capture_uuid)
    if 'error' in entries:
        return False, entries['error']

    html = entries.get('html')
    if html:
        if expected_text in html:
            return True, f'Expected content "{expected_text}" found.'
        return False, f'Expected content "{expected_text}" not found.'

    # No result and the capture never finished: it is still waiting in the
    # queue or still running, so the deadline above was hit.
    if status in (CaptureStatus.QUEUED, CaptureStatus.ONGOING):
        return False, 'Timeout Error'
    return False, 'Error'
27682815

27692816
def test_ail_crawlers():
    """Run the web and onion crawler availability tests and store the results.

    Returns:
        False when Lacus cannot be pinged (the connection error is then
        stored as the message for both networks); otherwise the result of
        ``web_success or onion_success``.
    """
    tested_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Without a reachable Lacus neither network can be tested.
    if not ping_lacus():
        lacus_url = get_lacus_url()
        error_message = f'Error: Can\'t connect to AIL Lacus, {lacus_url}'
        print(error_message)
        save_test_ail_crawlers_result(False, error_message, False, error_message, tested_at)
        return False

    client = get_lacus()
    commit_id = git_status.get_last_commit_id_from_local()
    agent = f'{commit_id}-AIL LACUS CRAWLER'

    # Regular web test.
    web_ok, web_msg = _run_lacus_network_test(client, agent, 'https://ail-project.org/', 'AIL Project')

    # Onion test, with the capture forced through Tor.
    onion_url = 'http://eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion'
    onion_ok, onion_msg = _run_lacus_network_test(client, agent, onion_url, 'It works!', proxy='force_tor')

    save_test_ail_crawlers_result(web_ok, web_msg, onion_ok, onion_msg, tested_at)
    return web_ok or onion_ok
28152839

28162840
#### ---- ####
28172841

var/www/blueprints/crawler_splash.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1059,7 +1059,7 @@ def crawler_settings():
10591059

10601060
is_manager_connected = crawlers.get_lacus_connection_metadata(force_ping=True)
10611061
is_crawler_working = crawlers.is_test_ail_crawlers_successful()
1062-
crawler_error_mess = crawlers.get_test_ail_crawlers_message()
1062+
crawler_test_metadata = crawlers.get_test_ail_crawlers_metadata()
10631063

10641064
is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)
10651065
is_onion_filter_unknown = crawlers.is_onion_filter_unknown(cache=False)
@@ -1076,7 +1076,7 @@ def crawler_settings():
10761076
nb_captures=nb_captures,
10771077
# all_proxies=all_proxies,
10781078
is_crawler_working=is_crawler_working,
1079-
crawler_error_mess=crawler_error_mess,
1079+
crawler_test_metadata=crawler_test_metadata,
10801080
is_onion_filter_enabled=is_onion_filter_enabled,
10811081
is_onion_filter_unknown=is_onion_filter_unknown,
10821082
crawler_logs=crawler_logs

var/www/templates/crawler/crawler_splash/settings_crawler.html

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -137,19 +137,59 @@ <h4 class="mb-0">Crawler Health & Runtime</h4>
137137

138138
<div class="d-flex flex-wrap align-items-center justify-content-between mb-3 action-buttons">
139139
<div>
140-
<h5 class="mb-1">TOR crawler diagnostic test</h5>
141-
<p class="text-muted mb-0">Run this check to verify crawler dependencies and connectivity.</p>
140+
<h5 class="mb-1">Crawler network availability test (Web + Onion)</h5>
141+
<p class="text-muted mb-0">Run this check to verify crawler dependencies and network connectivity for both normal web and Tor onion services.</p>
142142
</div>
143143
<a href="{{ url_for('crawler_splash.crawler_settings_crawler_test') }}" class="btn btn-primary mt-2 mt-md-0">
144144
Run Test Again <i class="fas fa-rocket ml-1"></i>
145145
</a>
146146
</div>
147147

148-
<pre class="bg-dark text-white p-3 test-output">----------------------------
149-
- TOR CRAWLER TEST OUTPUT: -
150-
----------------------------
148+
{% set web_available = crawler_test_metadata['web_success'] == 'True' %}
149+
{% set onion_available = crawler_test_metadata['onion_success'] == 'True' %}
150+
<div class="mb-3">
151+
<h5 class="mb-2">Network status</h5>
152+
<div class="mb-2">
153+
Web:
154+
<span class="badge badge-{% if web_available %}success{% else %}danger{% endif %} px-3 py-2">
155+
{% if web_available %}Available{% else %}Unavailable{% endif %}
156+
</span>
157+
</div>
158+
<div>
159+
Onion (Tor):
160+
<span class="badge badge-{% if onion_available %}success{% else %}danger{% endif %} px-3 py-2">
161+
{% if onion_available %}Available{% else %}Unavailable{% endif %}
162+
</span>
163+
</div>
164+
</div>
151165

152-
{{crawler_error_mess}}</pre>
166+
{% if web_available and onion_available %}
167+
<pre class="bg-dark text-white p-3 test-output">-----------------------------
168+
- CRAWLER TEST OUTPUT -
169+
-----------------------------
170+
Web: available
171+
Onion (Tor): available</pre>
172+
{% elif not web_available or not onion_available %}
173+
<pre class="bg-dark text-white p-3 test-output">-----------------------------
174+
- CRAWLER ISSUE DETAILS -
175+
-----------------------------
176+
{% if not web_available %}Web issue: {{ crawler_test_metadata['web_message'] }}{% endif %}
177+
{% if not web_available and not onion_available %}
178+
{% endif %}
179+
{% if not onion_available %}Onion issue: {{ crawler_test_metadata['onion_message'] }}{% endif %}</pre>
180+
{% endif %}
181+
182+
<p class="text-muted mb-3"><strong>Last tested:</strong> {{ crawler_test_metadata['date_test'] }}</p>
183+
184+
<div class="alert alert-info">
185+
<strong>Availability scope:</strong> Crawler network availability was last tested on <code>{{ crawler_test_metadata['date_test'] }}</code>.<br>
186+
Web availability indicates whether the Lacus crawler could reach and crawl a normal website.<br>
187+
Onion availability indicates whether the Lacus crawler could reach and crawl Tor hidden services.<br>
188+
These results only reflect network availability at the time of the test. Relaunch the test to check current availability.<br>
189+
It is possible for only one network to be available (for example: web available while onion is unavailable).<br>
190+
Web availability is required for crawling normal websites.<br>
191+
Onion availability is required for crawling Tor hidden services.
192+
</div>
153193

154194
<div class="card border-secondary my-4">
155195
<div class="card-body d-flex flex-wrap justify-content-between align-items-center">

0 commit comments

Comments
 (0)