Skip to content

Commit e2a9783

Browse files
committed
fix: [crawler] catch Lacus "Too many open files" error and resend the domain to the queue
1 parent 5a06f3d commit e2a9783

2 files changed

Lines changed: 26 additions & 5 deletions

File tree

bin/crawlers/Crawler.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def compute(self, capture): # TODO ADD FUNCTION TO MANUALLY IMPORT ???
344344
self.root_item = None
345345

346346
# Save Capture
347-
saved = self.save_capture_response(parent_id, entries)
347+
saved = self.save_capture_response(capture, task, parent_id, entries)
348348
if saved:
349349
if self.parent != 'lookup':
350350
# Update domain first/last seen
@@ -395,16 +395,22 @@ def compute(self, capture): # TODO ADD FUNCTION TO MANUALLY IMPORT ???
395395
task.remove()
396396
self.root_item = None
397397

398-
def save_capture_response(self, parent_id, entries):
398+
def save_capture_response(self, capture, task, parent_id, entries):
399399
filter_page = False
400400
print(entries.keys())
401401
if 'error' in entries:
402402
# TODO IMPROVE ERROR MESSAGE
403+
error_message = str(entries['error'])
403404
self.logger.warning(str(entries['error']))
404-
print(entries.get('error'))
405+
if error_message.startswith('Something went poorly'):
406+
# Timeout, require restart of lacus
407+
if 'Too many open files' in error_message:
408+
task.reset()
409+
capture.delete()
410+
self.logger.warning(f'Lacus Too many open files Error, {task.uuid} Send back in queue')
411+
time.sleep(60)
405412
if entries.get('html'):
406413
print('retrieved content')
407-
# print(entries.get('html'))
408414

409415
if 'last_redirected_url' in entries and entries.get('last_redirected_url'): # TODO ADD RELATIONSHIP REDIRECT
410416
last_url = entries['last_redirected_url']
@@ -551,7 +557,7 @@ def save_capture_response(self, parent_id, entries):
551557
entries_children = entries.get('children')
552558
if entries_children:
553559
for children in entries_children:
554-
self.save_capture_response(parent_id, children)
560+
self.save_capture_response(capture, task, parent_id, children)
555561
return True
556562

557563

bin/lib/crawlers.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2367,6 +2367,21 @@ def recrawl_onion_domains(date_month=None, all_onions_up=False): # TODO RENAME
23672367
for onion in to_crawl:
23682368
recrawl_domain(onion)
23692369

2370+
def recrawl_onion_domains_down_this_month(date_month=None):
    """Resend all onion domains marked as down this month to the crawler queue.

    Defaults to the current month when *date_month* is not given.
    Returns the number of unique domains re-queued.
    """
    month = date_month if date_month else Date.get_date_str_month()

    # Gather every domain flagged down on any day of the month, deduplicated
    # so each domain is re-queued at most once.
    domains_down = set()
    for day in Date.get_month_dates(date=month):
        domains_down |= set(r_crawler.smembers(f'onion_down:{day}'))

    # Push each unique domain back into the crawler queue.
    for domain in domains_down:
        recrawl_domain(domain)

    return len(domains_down)
2383+
2384+
23702385
## -- CRAWLER TASK -- ##
23712386

23722387
#### CRAWLER TASK API ####

0 commit comments

Comments
 (0)