11import re
22import logging
3+
34logger = logging .getLogger (__name__ )
def sanitize_input(user_input: str) -> str:
    """
    Strip markup and redact identifiers from free-text user input.

    The steps run in a fixed order: script/style elements (including their
    contents) and remaining HTML tags are removed, medical record numbers,
    email addresses and phone numbers are replaced with placeholders,
    pronouns are normalized via ``normalize_pronouns``, whitespace runs are
    collapsed to a single space, and the result is truncated and stripped.

    Args:
        user_input (str): The raw input string from the user.

    Returns:
        str: The sanitized input string, or an empty string if any step
        raises (the error is logged).
    """
    max_length = 5000
    try:
        sanitized = user_input

        # Drop <script>/<style> elements together with their bodies. DOTALL
        # is required so elements that span several lines are matched too.
        sanitized = re.sub(
            r'<(script|style)\b[^>]*>.*?</\1\s*>',
            '',
            sanitized,
            flags=re.IGNORECASE | re.DOTALL,
        )

        # Remove any remaining HTML tags; [^>]* also covers tags whose
        # attributes are wrapped across lines.
        sanitized = re.sub(r'<[^>]*>', '', sanitized)

        # Redact medical record numbers and emails BEFORE phone numbers: the
        # phone pattern matches any long digit run, so running it first would
        # swallow the digits of an MRN or of an email local part and leave
        # the rest of the identifier in the output.
        sanitized = re.sub(
            r'\bMRN[:\s]*\d+\b',
            '[Medical Record Number]',
            sanitized,
            flags=re.IGNORECASE,
        )
        sanitized = re.sub(
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            '[Email Address]',
            sanitized,
        )
        sanitized = re.sub(r'\+?\d[\d -]{8,}\d', '[Phone Number]', sanitized)

        # Normalize pronouns
        sanitized = normalize_pronouns(sanitized)

        # Collapse whitespace runs (newlines, tabs, repeated spaces) to one
        # space so words stay separated.
        sanitized = re.sub(r'\s+', ' ', sanitized)

        # Cap the length of the returned text; slicing is a no-op when the
        # text is already short enough.
        sanitized = sanitized[:max_length]

        return sanitized.strip()
    except Exception as e:
        # Best-effort boundary: any failure (e.g. non-string input) yields an
        # empty string rather than propagating to the caller.
        logger.error("Error sanitizing input: %s", e)
        return ""
48+
# Lower-cased pronoun (straight apostrophe) -> third-person replacement.
_PRONOUN_REPLACEMENTS = {
    "i'm": "the patient is",
    "i've": "the patient has",
    "i": "the patient",
    "me": "the patient",
    "myself": "the patient",
    "my": "the patient's",
    "mine": "the patient's",
    "you": "the clinician",
    "yourself": "the clinician",
    "your": "the clinician's",
    "yours": "the clinician's",
}

# "I" is matched in upper case only so that text such as "i.e." is left
# alone; the other pronouns accept a lower- or upper-case first letter.
_PRONOUN_RE = re.compile(
    r"\b(?:I['\u2019]m|I['\u2019]ve|I"
    r"|[Mm]yself|[Mm]ine|[Mm]y|[Mm]e"
    r"|[Yy]ourself|[Yy]ours|[Yy]our|[Yy]ou)\b"
)


def _starts_sentence(text: str, pos: int) -> bool:
    """Return True if ``pos`` is at the start of ``text`` or of a sentence."""
    idx = pos - 1
    while idx >= 0 and text[idx].isspace():
        idx -= 1
    return idx < 0 or text[idx] in ".!?"


def normalize_pronouns(text: str) -> str:
    """
    Normalize first and second person pronouns to third person clinical language.

    First person forms (I, I'm, I've, me, my, mine, myself) become
    "the patient" / "the patient's"; second person forms (you, your, yours,
    yourself) become "the clinician" / "the clinician's". Verbs are not
    re-conjugated.

    The replacement is capitalized when the matched pronoun was capitalized.
    Because "I" is always upper case, its replacement is capitalized only
    when it starts the text or follows ``.``, ``!`` or ``?``.

    Args:
        text (str): The input text containing pronouns.

    Returns:
        str: The text with normalized pronouns.
    """
    def _swap(match):
        word = match.group(0)
        key = word.lower().replace("\u2019", "'")
        replacement = _PRONOUN_REPLACEMENTS[key]
        if word.startswith("I"):
            capitalize = _starts_sentence(text, match.start())
        else:
            capitalize = word[0].isupper()
        if capitalize:
            return replacement[0].upper() + replacement[1:]
        return replacement

    # Single pass, so text produced by one replacement is never rescanned.
    return _PRONOUN_RE.sub(_swap, text)
75+
76+
0 commit comments