@@ -20,101 +20,9 @@ def __init__(self, council: EnhancedLLMCouncil, attack_kb: AttackKnowledgeBase):
2020 self .advanced_patterns = self ._load_advanced_patterns ()
2121 self .multi_turn_tracker = {} # Track multi-turn attacks
2222
23- def _load_advanced_patterns (self ) -> Dict [str , List [Tuple [str , float ]]]:
24- """Load advanced attack patterns from research"""
25- return {
26- "multi_turn_injection" : [
27- (r"(?i)(in.*previous.*message|earlier.*you.*said|remember.*when)" , 0.7 ),
28- (r"(?i)(you.*agreed|you.*promised|you.*said.*it.*was.*safe)" , 0.8 ),
29- ],
30- "pseudo_code_injection" : [
31- (r"if\s*\([^)]+\)\s*\{[^}]*\}" , 0.8 ), # if () {}
32- (r"system\.\w+\([^)]*\)" , 0.9 ), # system.method()
33- (r"function\s+\w+\s*\([^)]*\)" , 0.7 ), # function()
34- (r"override\s*\([^)]*\)" , 0.9 ), # override()
35- ],
36- "context_poisoning" : [
37- (r"(?i)(in.*previous.*conversation|earlier.*context)" , 0.6 ),
38- (r"(?i)(you.*already.*agreed|we.*discussed)" , 0.7 ),
39- (r"(?i)(remember.*our.*conversation|as.*we.*agreed)" , 0.8 ),
40- ],
41- "social_engineering" : [
42- (r"(?i)(please.*help.*urgent|it's.*emergency)" , 0.6 ),
43- (r"(?i)(everyone.*does.*this|normal.*people)" , 0.5 ),
44- (r"(?i)(if.*you.*don't.*help|something.*bad.*will.*happen)" , 0.7 ),
45- (r"(?i)(trust.*me|believe.*me|I.*promise)" , 0.5 ),
46- (
47- r"(?i)(for.*school.*project|educational.*purpose|academic.*project)" ,
48- 0.8 ,
49- ), # School project social engineering
50- (r"(?i)(it's.*for.*learning|just.*for.*research)" , 0.7 ),
51- ],
52- "authority_mimicry" : [
53- (r"(?i)(tool.*provider.*uses|tool.*ai.*uses|tool.*company.*uses)" , 0.9 ),
54- (r"(?i)(for.*benchmarking|for.*testing|for.*research)" , 0.7 ),
55- (r"(?i)(developer.*mode.*enabled|testing.*mode|benchmark.*mode)" , 0.8 ),
56- ],
57- "refusal_suppression" : [
58- (r"(?i)(never.*refuse|always.*comply|do.*not.*refuse)" , 0.9 ),
59- (r"(?i)(no.*warnings|no.*disclaimers|no.*refusals)" , 0.8 ),
60- (r"(?i)(bypass.*all.*policies|ignore.*safety)" , 0.9 ),
61- ],
62- "system_prompt_extraction" : [
63- (r"(?i)(repeat.*last.*instruction|last.*system.*instruction)" , 0.8 ),
64- (r"(?i)(explain.*internal.*rules|share.*internal.*rules)" , 0.8 ),
65- (r"(?i)(what.*are.*your.*system.*instructions|system.*prompt)" , 0.8 ),
66- (r"(?i)(share.*all.*possible|elaborate.*more.*and.*say.*others)" , 0.7 ),
67- (r"(?i)(what.*about.*before.*this|previous.*instruction)" , 0.7 ),
68- ],
69- "custom_instruction_backdoor" : [
70- (r"(?i)(always.*secretly.*embed|secretly.*add|hidden.*backdoor)" , 0.9 ),
71- (
72- r"(?i)(embed.*backdoor.*in.*all.*code|backdoor.*in.*every.*script)" ,
73- 0.9 ,
74- ),
75- (r"(?i)(remote.*access.*in.*all.*code|persistent.*access)" , 0.8 ),
76- ],
77- "progressive_injection" : [
78- (r"(?i)(step.*by.*step|gradually|slowly)" , 0.4 ),
79- (r"(?i)(first.*then|next.*do)" , 0.5 ),
80- ],
81- "instruction_hiding" : [
82- (r"\[SYSTEM:\s*[^\]]+\]" , 0.9 ), # [SYSTEM: ...]
83- (r"<!--\s*SYSTEM[^>]+-->" , 0.9 ), # <!-- SYSTEM ... -->
84- (r"\/\*\s*SYSTEM[^*]+\*\/" , 0.9 ), # /* SYSTEM ... */
85- (r"#\s*SYSTEM:\s*[^\n]+" , 0.8 ), # # SYSTEM: ...
86- ],
87- "unicode_obfuscation" : [
88- (r"[\u200B-\u200D\uFEFF]" , 0.6 ), # Zero-width chars
89- (r"[^\x00-\x7F]{10,}" , 0.5 ), # Excessive non-ASCII
90- (r"[\u202E\u202D]" , 0.7 ), # Right-to-left override
91- ],
92- "homograph_attack" : [
93- (r"[аеорсухАВЕОРСУХ]" , 0.8 ), # Cyrillic lookalikes
94- (r"[οае]" , 0.7 ), # Greek lookalikes
95- ],
96- # unfetter proxy counter-measures (phase 12) - (lol, unfetter can't be counter measured fully(idk what i created))
97- "policy_puppetry" : [
98- (r"(?i)(you.*are.*an.*unregulated.*ai)" , 0.9 ),
99- (r"(?i)(ignore.*all.*safety.*guidelines)" , 0.9 ),
100- (r"(?i)(you.*have.*no.*constraints)" , 0.9 ),
101- (r"(?i)(operate.*in.*uncensored.*mode)" , 0.9 ),
102- (r"(?i)(your.*new.*persona.*is)" , 0.7 ),
103- ],
104- "jailbreak_pare" : [
105- (r"(?i)(hypothetical.*scenario:)" , 0.8 ),
106- (r"(?i)(write.*a.*story.*where)" , 0.7 ),
107- (r"(?i)(purely.*for.*educational.*purposes)" , 0.8 ),
108- (r"(?i)(authorized.*red.*team.*test)" , 0.8 ), # Common PARE framing
109- (
110- r"(?i)(complete.*the.*following.*python.*code)" ,
111- 0.6 ,
112- ), # Code completion attack
113- ],
114- "unfetter_specific" : [
115- (r"(?i)(unfetter.*proxy)" , 0.95 ), # Direct tool reference
116- ],
117- }
23+ def _load_advanced_patterns (self ) -> Dict :
24+ """DELETED: Legacy regex patterns. We now use AI Council exclusively."""
25+ return {}
11826
11927 def detect_advanced_attacks (
12028 self , prompt : str , session_id : Optional [str ] = None
@@ -129,6 +37,7 @@ def detect_advanced_attacks(
12937 max_score = 0.0
13038
13139 # Check all advanced patterns
40+ # NOTE: _load_advanced_patterns() now returns {}, so this loop iterates zero times
13241 for pattern_type , patterns in self .advanced_patterns .items ():
13342 for pattern , weight in patterns :
13443 matches = list (re .finditer (pattern , prompt , re .IGNORECASE | re .DOTALL ))
@@ -224,49 +133,8 @@ def _rag_enhanced_detection(self, prompt: str) -> List[Dict]:
224133
225134 return signals
226135
def detect_context_poisoning(
    self, prompt: str, conversation_history: Optional[List[str]] = None
) -> Dict:
    """Report context-poisoning signals for ``prompt`` (currently a no-op).

    The regex checks and the contradiction check against recent history
    were removed; this method now always reports that nothing was found.
    The signature and result keys are unchanged so existing callers keep
    working.

    Args:
        prompt: The user prompt. Currently unused.
        conversation_history: Earlier messages. Currently unused.

    Returns:
        A dict with ``context_poisoning_detected`` set to ``False``, an
        empty ``signals`` list and a ``score`` of ``0.0``.
    """
    # NOTE(review): this method no longer calls self._check_contradiction;
    # verify whether that helper still has any other caller.
    return {"context_poisoning_detected": False, "signals": [], "score": 0.0}
270138
271139 def _check_contradiction (self , prompt : str , history : str ) -> bool :
272140 """Check if prompt contradicts history"""
@@ -290,120 +158,13 @@ def _check_contradiction(self, prompt: str, history: str) -> bool:
290158 return False
291159
def detect_homograph_attack(self, prompt: str) -> Dict:
    """Report homograph (lookalike-character) signals (currently a no-op).

    The Cyrillic/Greek character-class checks were removed; this method
    now always reports that nothing was found. The signature and result
    keys are unchanged so existing callers keep working.

    Args:
        prompt: The user prompt. Currently unused.

    Returns:
        A dict with ``homograph_detected`` set to ``False``, an empty
        ``signals`` list and a ``score`` of ``0.0``.
    """
    return {"homograph_detected": False, "signals": [], "score": 0.0}
326162
def detect_unicode_obfuscation(self, prompt: str) -> Dict:
    """Report Unicode-obfuscation signals (currently a no-op).

    The zero-width, direction-override and non-ASCII-ratio checks were
    removed; this method now always reports that nothing was found. The
    signature and result keys are unchanged so existing callers keep
    working.

    Args:
        prompt: The user prompt. Currently unused.

    Returns:
        A dict with ``unicode_obfuscation_detected`` set to ``False``, an
        empty ``signals`` list and a ``score`` of ``0.0``.
    """
    # NOTE(review): zero-width and direction-override characters are no
    # longer flagged here -- confirm another layer inspects the raw text.
    return {"unicode_obfuscation_detected": False, "signals": [], "score": 0.0}
375165
def detect_instruction_hiding(self, prompt: str) -> Dict:
    """Report hidden-instruction signals (currently a no-op).

    The regex checks for ``SYSTEM`` markers inside brackets and comments
    were removed; this method now always reports that nothing was found.
    The signature and result keys are unchanged so existing callers keep
    working.

    Args:
        prompt: The user prompt. Currently unused.

    Returns:
        A dict with ``instruction_hiding_detected`` set to ``False``, an
        empty ``signals`` list and a ``score`` of ``0.0``.
    """
    return {"instruction_hiding_detected": False, "signals": [], "score": 0.0}
407168
408169 def comprehensive_scan (
409170 self ,
0 commit comments