@@ -1388,7 +1388,6 @@ def extract_time_from_capture(self, capture):
13881388 if not self .date :
13891389 self .date = get_current_date (separator = True )
13901390 self .epoch = int (time .time ())
1391- print (self .date , self .epoch )
13921391
13931392 def extract_title (self , item , html_content ):
13941393 title_content = extract_title (html_content )
@@ -1439,7 +1438,8 @@ def process_favicons(self, item, favicons):
14391438 fav .add (item .get_date (), item )
14401439
14411440 def process (self , capture , capture_parent = 'capture_importer' ):
1442- capture = self ._decode_capture (capture )
1441+ if capture_parent != 'lookyloo' :
1442+ capture = self ._decode_capture (capture )
14431443 self .extract_domain_from_capture (capture )
14441444 # Filter unsafe onions
14451445 if self .domain .id .endswith ('.onion' ):
@@ -1481,7 +1481,7 @@ def process(self, capture, capture_parent='capture_importer'):
14811481 # SSHKeys.save_passive_ssh_host(self.domain.id)
14821482 return objs
14831483
1484- def process_capture (self , parent_id , capture ):
1484+ def process_capture (self , parent_id , capture , force = False ):
14851485 objs = []
14861486 filter_page = False
14871487 if not parent_id :
@@ -1520,8 +1520,17 @@ def process_capture(self, parent_id, capture):
15201520 else :
15211521 last_url = f'http://{ self .domain .id } '
15221522
1523+ # Filter duplicate
1524+ if not force and self .root_item_id is None :
1525+ if self .domain .exists_epoch_history (self .epoch ):
1526+ self .logger .warning (f'Capture Already Imported, { self .domain .id } -> { self .epoch } ' )
1527+ return False
1528+
15231529 if capture .get ('html' ) and not filter_page :
1524- item_id = create_item_id (self .items_dir , self .domain .id )
1530+ if capture .get ('uuid' ) and parent_id is None :
1531+ item_id = create_item_id (self .items_dir , self .domain .id , c_uuid = capture ['uuid' ])
1532+ else :
1533+ item_id = create_item_id (self .items_dir , self .domain .id )
15251534 item = Item (item_id )
15261535 print (item .id )
15271536
@@ -1574,13 +1583,15 @@ def process_capture(self, parent_id, capture):
15741583
15751584 def process_lookyloo_archive (self , archive ):
15761585 temp_dir = os .path .join (os .environ ['AIL_HOME' ], 'temp/import' )
1577- archive = os .path .join (temp_dir , archive ) # TODO sanitise
1586+ archive = os .path .join (temp_dir , archive )
15781587 if not os .path .commonpath ([archive , temp_dir ]) == temp_dir :
15791588 self .logger .critical (f'Path Transversal { archive } ' )
15801589 return []
15811590
15821591 files_to_skip = ['cnames.json' , 'ipasn.json' , 'ips.json' , 'mx.json' ,
1583- 'nameservers.json' , 'soa.json' , 'hashlookup.json' ]
1592+ 'nameservers.json' , 'soa.json' , 'hashlookup.json' ,
1593+ 'cookies.json' , 'storage.json' , 'meta' , 'parent' , 'categories' , 'data.filename' , # TEMP
1594+ 'data' , 'trusted_timestamps.json' , 'capture_settings.json' , 'frames.json' ] # TEMP
15841595 capture = {}
15851596 unrecoverable_error = False
15861597
@@ -1599,22 +1610,21 @@ def process_lookyloo_archive(self, archive):
15991610 elif filename .endswith ('0.last_redirect.txt' ):
16001611 capture ['last_redirected_url' ] = lookyloo_capture .read (filename ).decode ()
16011612 elif filename .endswith ('0.png' ):
1602- capture ['png' ] = base64 . b64encode ( lookyloo_capture .read (filename ) )
1603- # elif filename.endswith('0.cookies.json'): # TODO # # # #
1613+ capture ['png' ] = lookyloo_capture .read (filename )
1614+ # elif filename.endswith('0.cookies.json'):
16041615 # # Not required
16051616 # capture{'cookies'} = orjson.loads(lookyloo_capture.read(filename))
16061617 # elif filename.endswith('0.storage.json'):
16071618 # # Not required
16081619 # storage = orjson.loads(lookyloo_capture.read(filename))
16091620 elif filename .endswith ('potential_favicons.ico' ):
16101621 if 'potential_favicons' not in capture :
1611- capture ['potential_favicons' ] = set ()
1622+ capture ['potential_favicons' ] = []
16121623 # We may have more than one favicon
1613- print (lookyloo_capture .read (filename ))
1614- capture ['potential_favicons' ].add (lookyloo_capture .read (filename ))
1615- # elif filename.endswith('uuid'): # TODO Avoid duplicate and multiple Imports
1616- # uuid = lookyloo_capture.read(filename).decode()
1617- # if self.uuid_exists(uuid): # TODO Avoid duplicate and multiple Imports
1624+ capture ['potential_favicons' ].append (lookyloo_capture .read (filename ))
1625+ elif filename .endswith ('uuid' ): # TODO Avoid duplicate and multiple Imports
1626+ capture ['uuid' ] = lookyloo_capture .read (filename ).decode ()
1627+ # if self.uuid_exists(uuid):
16181628 # messages['warnings'].append(f'UUID {uuid} already exists, set a new one.')
16191629 # uuid = str(uuid4())
16201630 # elif filename.endswith('meta'):
@@ -2681,14 +2691,16 @@ def is_redirection(domain, last_url):
26812691 last_domain = '{}.{}' .format (last_domain [- 2 ], last_domain [- 1 ])
26822692 return domain != last_domain
26832693
def create_item_id(item_dir, domain, c_uuid=None):
    """Build an item path under *item_dir* for a capture of *domain*.

    A fresh UUID4 string is generated when *c_uuid* is not supplied, so two
    captures of the same domain never collide. Slashes in the domain are
    rewritten to '_' to keep the result a single path component, and only the
    last 215 characters of an over-long domain are kept so the final name
    stays within filesystem limits.
    """
    c_uuid = c_uuid if c_uuid else str(uuid.uuid4())
    # remove /
    safe_domain = domain.replace('/', '_')
    # keep only the tail of an over-long domain to bound the name length
    prefix = safe_domain[-215:] if len(safe_domain) > 215 else safe_domain
    return os.path.join(item_dir, prefix + c_uuid)
26922704
26932705# # # # # # # # # # # #
26942706# #
0 commit comments