@@ -37,8 +37,9 @@ def __init__(self, queue=True):
3737 self .r_cache = config_loader .get_redis_conn ("Redis_Cache" )
3838
3939 self .csam_words = self .load_world_file ('csam_words' )
40- self .child_worlds = self .load_world_file ('child_words' )
41- self .porn_worlds = self .load_world_file ('porn_words' )
40+ self .child_words = self .load_world_file ('child_words' )
41+ self .porn_words = self .load_world_file ('porn_words' )
42+ self .vanity_words = self .load_world_file ('vanity_words_forbidden' )
4243
4344 self .ce_tag = 'dark-web:topic="pornography-child-exploitation"'
4445 self .tokenizer = RegexpTokenizer ('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\ \\ //\=\' \" \%\$\?\@\+\#\_\^\<\>\!\*\n \r \t \s]+' ,
@@ -65,27 +66,32 @@ def compute(self, message):
6566 content = self .obj .get_content ().lower ()
6667 domain_id = message
6768
68- is_csam = False
69- is_child_word = False
70- is_porn_world = False
71- words = TextBlob (content , tokenizer = self .tokenizer ).tokens
72- words = set (words )
73-
74- for word in words :
75- if word in self .csam_words :
76- is_csam = True
77- if word in self .child_worlds :
78- is_child_word = True
79- if word in self .porn_worlds :
80- is_porn_world = True
81- # PERF ???
82- # if is_child_word and is_porn_world:
83- # break
84-
85- if is_csam :
86- to_tag = True
87- if is_child_word and is_porn_world :
88- to_tag = True
69+ for word in self .vanity_words :
70+ if domain_id .startswith (word ):
71+ to_tag = True
72+ break
73+
74+ if not to_tag :
75+ is_csam = False
76+ is_child_word = False
77+ is_porn_world = False
78+ words = TextBlob (content , tokenizer = self .tokenizer ).tokens
79+ words = set (words )
80+
81+ for word in words :
82+ if word in self .csam_words :
83+ is_csam = True
84+ if word in self .child_words :
85+ is_child_word = True
86+ if word in self .porn_words :
87+ is_porn_world = True
88+ # PERF ???
89+ # if is_child_word and is_porn_world:
90+ # break
91+ if is_csam :
92+ to_tag = True
93+ if is_child_word and is_porn_world :
94+ to_tag = True
8995
9096 if to_tag :
9197 print (f'CSAM DETECTED { content } ' )
@@ -100,7 +106,6 @@ def compute(self, message):
100106 for dom in self .obj .get_correlation ('domain' ).get ('domain' , []):
101107 domain = Domain (dom [1 :])
102108 self .add_message_to_queue (obj = domain , message = self .ce_tag , queue = 'Tags' )
103-
104109 return to_tag
105110
106111def test_detection ():
0 commit comments