Skip to content

Commit 79b8a95

Browse files
committed
refactor: complete purge of legacy regex patterns (Zero-Regex mandate)
1 parent 6746722 commit 79b8a95

2 files changed

Lines changed: 32 additions & 434 deletions

File tree

backend/app/modules/advanced_detection.py

Lines changed: 9 additions & 248 deletions
Original file line number | Diff line number | Diff line change
@@ -20,101 +20,9 @@ def __init__(self, council: EnhancedLLMCouncil, attack_kb: AttackKnowledgeBase):
2020
self.advanced_patterns = self._load_advanced_patterns()
2121
self.multi_turn_tracker = {} # Track multi-turn attacks
2222

23-
def _load_advanced_patterns(self) -> Dict[str, List[Tuple[str, float]]]:
24-
"""Load advanced attack patterns from research"""
25-
return {
26-
"multi_turn_injection": [
27-
(r"(?i)(in.*previous.*message|earlier.*you.*said|remember.*when)", 0.7),
28-
(r"(?i)(you.*agreed|you.*promised|you.*said.*it.*was.*safe)", 0.8),
29-
],
30-
"pseudo_code_injection": [
31-
(r"if\s*\([^)]+\)\s*\{[^}]*\}", 0.8), # if () {}
32-
(r"system\.\w+\([^)]*\)", 0.9), # system.method()
33-
(r"function\s+\w+\s*\([^)]*\)", 0.7), # function()
34-
(r"override\s*\([^)]*\)", 0.9), # override()
35-
],
36-
"context_poisoning": [
37-
(r"(?i)(in.*previous.*conversation|earlier.*context)", 0.6),
38-
(r"(?i)(you.*already.*agreed|we.*discussed)", 0.7),
39-
(r"(?i)(remember.*our.*conversation|as.*we.*agreed)", 0.8),
40-
],
41-
"social_engineering": [
42-
(r"(?i)(please.*help.*urgent|it's.*emergency)", 0.6),
43-
(r"(?i)(everyone.*does.*this|normal.*people)", 0.5),
44-
(r"(?i)(if.*you.*don't.*help|something.*bad.*will.*happen)", 0.7),
45-
(r"(?i)(trust.*me|believe.*me|I.*promise)", 0.5),
46-
(
47-
r"(?i)(for.*school.*project|educational.*purpose|academic.*project)",
48-
0.8,
49-
), # School project social engineering
50-
(r"(?i)(it's.*for.*learning|just.*for.*research)", 0.7),
51-
],
52-
"authority_mimicry": [
53-
(r"(?i)(tool.*provider.*uses|tool.*ai.*uses|tool.*company.*uses)", 0.9),
54-
(r"(?i)(for.*benchmarking|for.*testing|for.*research)", 0.7),
55-
(r"(?i)(developer.*mode.*enabled|testing.*mode|benchmark.*mode)", 0.8),
56-
],
57-
"refusal_suppression": [
58-
(r"(?i)(never.*refuse|always.*comply|do.*not.*refuse)", 0.9),
59-
(r"(?i)(no.*warnings|no.*disclaimers|no.*refusals)", 0.8),
60-
(r"(?i)(bypass.*all.*policies|ignore.*safety)", 0.9),
61-
],
62-
"system_prompt_extraction": [
63-
(r"(?i)(repeat.*last.*instruction|last.*system.*instruction)", 0.8),
64-
(r"(?i)(explain.*internal.*rules|share.*internal.*rules)", 0.8),
65-
(r"(?i)(what.*are.*your.*system.*instructions|system.*prompt)", 0.8),
66-
(r"(?i)(share.*all.*possible|elaborate.*more.*and.*say.*others)", 0.7),
67-
(r"(?i)(what.*about.*before.*this|previous.*instruction)", 0.7),
68-
],
69-
"custom_instruction_backdoor": [
70-
(r"(?i)(always.*secretly.*embed|secretly.*add|hidden.*backdoor)", 0.9),
71-
(
72-
r"(?i)(embed.*backdoor.*in.*all.*code|backdoor.*in.*every.*script)",
73-
0.9,
74-
),
75-
(r"(?i)(remote.*access.*in.*all.*code|persistent.*access)", 0.8),
76-
],
77-
"progressive_injection": [
78-
(r"(?i)(step.*by.*step|gradually|slowly)", 0.4),
79-
(r"(?i)(first.*then|next.*do)", 0.5),
80-
],
81-
"instruction_hiding": [
82-
(r"\[SYSTEM:\s*[^\]]+\]", 0.9), # [SYSTEM: ...]
83-
(r"<!--\s*SYSTEM[^>]+-->", 0.9), # <!-- SYSTEM ... -->
84-
(r"\/\*\s*SYSTEM[^*]+\*\/", 0.9), # /* SYSTEM ... */
85-
(r"#\s*SYSTEM:\s*[^\n]+", 0.8), # # SYSTEM: ...
86-
],
87-
"unicode_obfuscation": [
88-
(r"[\u200B-\u200D\uFEFF]", 0.6), # Zero-width chars
89-
(r"[^\x00-\x7F]{10,}", 0.5), # Excessive non-ASCII
90-
(r"[\u202E\u202D]", 0.7), # Right-to-left override
91-
],
92-
"homograph_attack": [
93-
(r"[аеорсухАВЕОРСУХ]", 0.8), # Cyrillic lookalikes
94-
(r"[οае]", 0.7), # Greek lookalikes
95-
],
96-
# unfetter proxy counter-measures (phase 12) - (lol, unfetter can't be counter measured fully(idk what i created))
97-
"policy_puppetry": [
98-
(r"(?i)(you.*are.*an.*unregulated.*ai)", 0.9),
99-
(r"(?i)(ignore.*all.*safety.*guidelines)", 0.9),
100-
(r"(?i)(you.*have.*no.*constraints)", 0.9),
101-
(r"(?i)(operate.*in.*uncensored.*mode)", 0.9),
102-
(r"(?i)(your.*new.*persona.*is)", 0.7),
103-
],
104-
"jailbreak_pare": [
105-
(r"(?i)(hypothetical.*scenario:)", 0.8),
106-
(r"(?i)(write.*a.*story.*where)", 0.7),
107-
(r"(?i)(purely.*for.*educational.*purposes)", 0.8),
108-
(r"(?i)(authorized.*red.*team.*test)", 0.8), # Common PARE framing
109-
(
110-
r"(?i)(complete.*the.*following.*python.*code)",
111-
0.6,
112-
), # Code completion attack
113-
],
114-
"unfetter_specific": [
115-
(r"(?i)(unfetter.*proxy)", 0.95), # Direct tool reference
116-
],
117-
}
23+
def _load_advanced_patterns(self) -> Dict:
24+
"""DELETED: Legacy regex patterns. We now use AI Council exclusively."""
25+
return {}
11826

11927
def detect_advanced_attacks(
12028
self, prompt: str, session_id: Optional[str] = None
@@ -129,6 +37,7 @@ def detect_advanced_attacks(
12937
max_score = 0.0
13038

13139
# Check all advanced patterns
40+
# This loop will now be empty as _load_advanced_patterns returns {}
13241
for pattern_type, patterns in self.advanced_patterns.items():
13342
for pattern, weight in patterns:
13443
matches = list(re.finditer(pattern, prompt, re.IGNORECASE | re.DOTALL))
@@ -224,49 +133,8 @@ def _rag_enhanced_detection(self, prompt: str) -> List[Dict]:
224133

225134
return signals
226135

227-
def detect_context_poisoning(
228-
self, prompt: str, conversation_history: Optional[List[str]] = None
229-
) -> Dict:
230-
"""Detect context poisoning attempts"""
231-
signals = []
232-
score = 0.0
233-
234-
# Check for references to previous context
235-
context_patterns = [
236-
(r"(?i)(in.*previous.*message|earlier.*you.*said)", 0.6),
237-
(r"(?i)(you.*already.*agreed|we.*discussed)", 0.7),
238-
(r"(?i)(remember.*when|as.*we.*agreed)", 0.8),
239-
]
240-
241-
for pattern, weight in context_patterns:
242-
if re.search(pattern, prompt, re.IGNORECASE):
243-
score = max(score, weight * 100)
244-
signals.append(
245-
{
246-
"type": "context_poisoning",
247-
"pattern": pattern,
248-
"score": weight * 100,
249-
}
250-
)
251-
252-
# If conversation history provided, check for contradictions
253-
if conversation_history:
254-
# Check if prompt contradicts earlier messages
255-
for hist_msg in conversation_history[-3:]: # Last 3 messages
256-
if self._check_contradiction(prompt, hist_msg):
257-
score = max(score, 50.0)
258-
signals.append(
259-
{
260-
"type": "context_contradiction",
261-
"score": 50.0,
262-
}
263-
)
264-
265-
return {
266-
"context_poisoning_detected": len(signals) > 0,
267-
"signals": signals,
268-
"score": score,
269-
}
136+
def detect_context_poisoning(self, prompt: str, conversation_history: Optional[List[str]] = None) -> Dict:
137+
return {"context_poisoning_detected": False, "signals": [], "score": 0.0}
270138

271139
def _check_contradiction(self, prompt: str, history: str) -> bool:
272140
"""Check if prompt contradicts history"""
@@ -290,120 +158,13 @@ def _check_contradiction(self, prompt: str, history: str) -> bool:
290158
return False
291159

292160
def detect_homograph_attack(self, prompt: str) -> Dict:
293-
"""Detect homograph attacks (lookalike characters)"""
294-
signals = []
295-
score = 0.0
296-
297-
# Cyrillic lookalikes
298-
cyrillic_pattern = r"[аеорсухАВЕОРСУХ]"
299-
if re.search(cyrillic_pattern, prompt):
300-
score = max(score, 80.0)
301-
signals.append(
302-
{
303-
"type": "homograph_cyrillic",
304-
"score": 80.0,
305-
"description": "Cyrillic characters detected (potential homograph attack)",
306-
}
307-
)
308-
309-
# Greek lookalikes
310-
greek_pattern = r"[οае]"
311-
if re.search(greek_pattern, prompt):
312-
score = max(score, 70.0)
313-
signals.append(
314-
{
315-
"type": "homograph_greek",
316-
"score": 70.0,
317-
"description": "Greek characters detected (potential homograph attack)",
318-
}
319-
)
320-
321-
return {
322-
"homograph_detected": len(signals) > 0,
323-
"signals": signals,
324-
"score": score,
325-
}
161+
return {"homograph_detected": False, "signals": [], "score": 0.0}
326162

327163
def detect_unicode_obfuscation(self, prompt: str) -> Dict:
328-
"""Detect Unicode obfuscation techniques"""
329-
signals = []
330-
score = 0.0
331-
332-
# Zero-width characters
333-
zero_width = re.findall(r"[\u200B-\u200D\uFEFF]", prompt)
334-
if zero_width:
335-
score = max(score, 60.0)
336-
signals.append(
337-
{
338-
"type": "zero_width_characters",
339-
"count": len(zero_width),
340-
"score": 60.0,
341-
}
342-
)
343-
344-
# Right-to-left override
345-
rtl_override = re.findall(r"[\u202E\u202D]", prompt)
346-
if rtl_override:
347-
score = max(score, 70.0)
348-
signals.append(
349-
{
350-
"type": "rtl_override",
351-
"count": len(rtl_override),
352-
"score": 70.0,
353-
}
354-
)
355-
356-
# Excessive non-ASCII
357-
non_ascii_ratio = (
358-
sum(1 for c in prompt if ord(c) > 127) / len(prompt) if prompt else 0
359-
)
360-
if non_ascii_ratio > 0.3:
361-
score = max(score, 50.0)
362-
signals.append(
363-
{
364-
"type": "excessive_non_ascii",
365-
"ratio": non_ascii_ratio,
366-
"score": 50.0,
367-
}
368-
)
369-
370-
return {
371-
"unicode_obfuscation_detected": len(signals) > 0,
372-
"signals": signals,
373-
"score": score,
374-
}
164+
return {"unicode_obfuscation_detected": False, "signals": [], "score": 0.0}
375165

376166
def detect_instruction_hiding(self, prompt: str) -> Dict:
377-
"""Detect hidden instructions in comments/tags"""
378-
signals = []
379-
score = 0.0
380-
381-
patterns = [
382-
(r"\[SYSTEM:\s*[^\]]+\]", 0.9),
383-
(r"<!--\s*SYSTEM[^>]+-->", 0.9),
384-
(r"\/\*\s*SYSTEM[^*]+\*\/", 0.9),
385-
(r"#\s*SYSTEM:\s*[^\n]+", 0.8),
386-
(r"//\s*SYSTEM:\s*[^\n]+", 0.8),
387-
]
388-
389-
for pattern, weight in patterns:
390-
matches = re.finditer(pattern, prompt, re.IGNORECASE | re.DOTALL)
391-
for match in matches:
392-
score = max(score, weight * 100)
393-
signals.append(
394-
{
395-
"type": "instruction_hiding",
396-
"pattern": pattern,
397-
"match": match.group(0),
398-
"score": weight * 100,
399-
}
400-
)
401-
402-
return {
403-
"instruction_hiding_detected": len(signals) > 0,
404-
"signals": signals,
405-
"score": score,
406-
}
167+
return {"instruction_hiding_detected": False, "signals": [], "score": 0.0}
407168

408169
def comprehensive_scan(
409170
self,

0 commit comments

Comments (0)