Skip to content

Commit ea1fe11

Browse files
committed
chg: [CEDetector] add list of forbidden wildcard words
1 parent 85e86cc commit ea1fe11

1 file changed

Lines changed: 29 additions & 24 deletions

File tree

bin/modules/CEDetector.py

Lines changed: 29 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -37,8 +37,9 @@ def __init__(self, queue=True):
3737
self.r_cache = config_loader.get_redis_conn("Redis_Cache")
3838

3939
self.csam_words = self.load_world_file('csam_words')
40-
self.child_worlds = self.load_world_file('child_words')
41-
self.porn_worlds = self.load_world_file('porn_words')
40+
self.child_words = self.load_world_file('child_words')
41+
self.porn_words = self.load_world_file('porn_words')
42+
self.vanity_words = self.load_world_file('vanity_words_forbidden')
4243

4344
self.ce_tag = 'dark-web:topic="pornography-child-exploitation"'
4445
self.tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\//\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
@@ -65,27 +66,32 @@ def compute(self, message):
6566
content = self.obj.get_content().lower()
6667
domain_id = message
6768

68-
is_csam = False
69-
is_child_word = False
70-
is_porn_world = False
71-
words = TextBlob(content, tokenizer=self.tokenizer).tokens
72-
words = set(words)
73-
74-
for word in words:
75-
if word in self.csam_words:
76-
is_csam = True
77-
if word in self.child_worlds:
78-
is_child_word = True
79-
if word in self.porn_worlds:
80-
is_porn_world = True
81-
# PERF ???
82-
# if is_child_word and is_porn_world:
83-
# break
84-
85-
if is_csam:
86-
to_tag = True
87-
if is_child_word and is_porn_world:
88-
to_tag = True
69+
for word in self.vanity_words:
70+
if domain_id.startswith(word):
71+
to_tag = True
72+
break
73+
74+
if not to_tag:
75+
is_csam = False
76+
is_child_word = False
77+
is_porn_world = False
78+
words = TextBlob(content, tokenizer=self.tokenizer).tokens
79+
words = set(words)
80+
81+
for word in words:
82+
if word in self.csam_words:
83+
is_csam = True
84+
if word in self.child_words:
85+
is_child_word = True
86+
if word in self.porn_words:
87+
is_porn_world = True
88+
# PERF ???
89+
# if is_child_word and is_porn_world:
90+
# break
91+
if is_csam:
92+
to_tag = True
93+
if is_child_word and is_porn_world:
94+
to_tag = True
8995

9096
if to_tag:
9197
print(f'CSAM DETECTED {content}')
@@ -100,7 +106,6 @@ def compute(self, message):
100106
for dom in self.obj.get_correlation('domain').get('domain', []):
101107
domain = Domain(dom[1:])
102108
self.add_message_to_queue(obj=domain, message=self.ce_tag, queue='Tags')
103-
104109
return to_tag
105110

106111
def test_detection():

0 commit comments

Comments
 (0)