Skip to content

Commit 01ccf9a

Browse files
authored
Enhance input sanitization and normalize pronouns
Updated the sanitizer function to improve input sanitization by removing style tags, redacting phone numbers, email addresses, and medical record numbers, normalizing pronouns, and increasing the maximum length limit.
1 parent ee06e75 commit 01ccf9a

File tree

1 file changed

+56
-6
lines changed

1 file changed

+56
-6
lines changed
Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,76 @@
11
import re
22
import logging
3+
34
# Module-level logger named after this module; sanitize_input logs its failures through it.
logger = logging.getLogger(__name__)
45
def sanitize_input(user_input: str, max_length: int = 5000) -> str:
    """
    Sanitize free-text user input before further processing.

    Steps, in order: remove <script>/<style> elements with their contents,
    strip remaining markup tags, redact email addresses, medical record
    numbers and phone numbers, normalize pronouns, collapse whitespace, and
    cap the length.

    Args:
        user_input (str): The raw input string from the user.
        max_length (int): Maximum number of characters kept in the result.
            Defaults to 5000.

    Returns:
        str: The sanitized input string, or an empty string if sanitizing
        failed for any reason (the error is logged).
    """
    try:
        sanitized = user_input

        # Remove script/style elements together with their contents; stripping
        # only the tags would leave the script or CSS body behind as text.
        # DOTALL lets a block that spans several lines match.
        sanitized = re.sub(
            r'<(?:script|style)\b[^>]*>.*?</(?:script|style)\s*>',
            '',
            sanitized,
            flags=re.IGNORECASE | re.DOTALL,
        )

        # Remove any remaining HTML tags. [^>]* (unlike .*?) also matches a
        # newline, so tags broken across lines are stripped as well.
        sanitized = re.sub(r'<[^>]*>', '', sanitized)

        # Redact email addresses and medical record numbers BEFORE phone
        # numbers: the phone pattern matches any long digit run, which would
        # otherwise eat the digits of an MRN or of an email local part and
        # stop those patterns from matching.
        sanitized = re.sub(
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
            '[Email Address]',
            sanitized,
        )

        # Medical record numbers (simple pattern: "MRN" followed by digits).
        sanitized = re.sub(
            r'\bMRN[:\s]*\d+\b',
            '[Medical Record Number]',
            sanitized,
            flags=re.IGNORECASE,
        )

        # Phone numbers: optional "+", then at least 10 digits/spaces/dashes.
        sanitized = re.sub(r'\+?\d[\d -]{8,}\d', '[Phone Number]', sanitized)

        # Normalize pronouns
        sanitized = normalize_pronouns(sanitized)

        # Collapse every whitespace run into a single space. Deleting the
        # whitespace outright would glue all words together.
        sanitized = re.sub(r'\s+', ' ', sanitized).strip()

        # Cap the length so downstream consumers get bounded input; the cut
        # may land right after a space, hence the final rstrip.
        return sanitized[:max_length].rstrip()
    except Exception as e:
        # Best effort by design: never propagate, return an empty string.
        logger.error("Error sanitizing input: %s", e)
        return ""
48+
49+
# One pass over the text: either a contracted subject pronoun ("I'm",
# "you've") or a plain first/second person pronoun. Longer alternatives come
# first so "myself" is not matched as "my". Lowercase "i" is deliberately not
# matched (e.g. "i.e."), and all-caps forms such as "ME" are left alone.
_PRONOUN_PATTERN = re.compile(
    r"\b(?:"
    r"(?P<subject>I|[Yy]ou)['\u2019](?P<verb>m|re|ve|ll)"
    r"|(?P<word>[Mm]yself|[Mm]ine|[Mm]y|[Mm]e|I"
    r"|[Yy]ourself|[Yy]ours|[Yy]our|[Yy]ou)"
    r")\b"
)

# Third person replacement for each pronoun (keys are lowercase).
_PRONOUN_REPLACEMENTS = {
    "i": "the patient",
    "me": "the patient",
    "myself": "the patient",
    "my": "the patient's",
    "mine": "the patient's",
    "you": "the clinician",
    "yourself": "the clinician",
    "your": "the clinician's",
    "yours": "the clinician's",
}

# Third person verb for each contraction suffix ("I'm" -> "the patient is").
_CONTRACTION_VERBS = {"m": "is", "re": "is", "ve": "has", "ll": "will"}


def _starts_sentence(text: str, pos: int) -> bool:
    """Return True if position *pos* is at the start of *text* or of a sentence."""
    idx = pos - 1
    # Skip whitespace and opening quotes/brackets before the pronoun.
    while idx >= 0 and (text[idx].isspace() or text[idx] in "\"'\u201c\u2018(["):
        idx -= 1
    return idx < 0 or text[idx] in ".!?"


def _replace_pronoun(match: "re.Match") -> str:
    """Build the third person replacement for one pronoun match."""
    pronoun = match.group("subject") or match.group("word")
    replacement = _PRONOUN_REPLACEMENTS[pronoun.lower()]

    verb = match.group("verb")
    if verb:
        replacement = f"{replacement} {_CONTRACTION_VERBS[verb]}"

    # "I" is always uppercase, so its case says nothing about position; for
    # the other pronouns a leading capital is carried over to the replacement.
    written_capitalized = pronoun != "I" and pronoun[0].isupper()
    if written_capitalized or _starts_sentence(match.string, match.start()):
        replacement = replacement[0].upper() + replacement[1:]
    return replacement


def normalize_pronouns(text: str) -> str:
    """
    Normalize first and second person pronouns to third person clinical language.

    First person pronouns (I, me, myself, my, mine) become "the patient" /
    "the patient's"; second person pronouns (you, yourself, your, yours)
    become "the clinician" / "the clinician's". The contractions 'm, 're,
    've and 'll on "I"/"you" are expanded to "is", "is", "has" and "will".
    The replacement is capitalized when the pronoun was capitalized or
    stands at the start of a sentence.

    Limitations: verbs are not re-conjugated ("I have" -> "The patient
    have"), and every standalone uppercase "I" is treated as the pronoun,
    including a Roman numeral such as in "Type I".

    Args:
        text (str): The input text containing pronouns.
    Returns:
        str: The text with normalized pronouns.
    """
    return _PRONOUN_PATTERN.sub(_replace_pronoun, text)
75+
76+

0 commit comments

Comments
 (0)