Skip to content

Commit 65aabef

Browse files
trickster026, MinuraPunchihewa, martyna-mindsdb, ZoranPandovski, ea-rus
authored
Fix for undetectable encodings (#10884) (#10905)
Co-authored-by: Minura Punchihewa <49385643+MinuraPunchihewa@users.noreply.github.com> Co-authored-by: martyna-mindsdb <109554435+martyna-mindsdb@users.noreply.github.com> Co-authored-by: Zoran Pandovski <zoran.pandovski@gmail.com> Co-authored-by: Andrey <elkin.andr@gmail.com>
1 parent 8a08a21 commit 65aabef

2 files changed

Lines changed: 88 additions & 47 deletions

File tree

mindsdb/integrations/handlers/email_handler/email_ingestor.py

Lines changed: 25 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -21,45 +21,48 @@ def __init__(self, email_client: EmailClient, search_options: EmailSearchOptions
2121
self.search_options = search_options
2222

2323
def _is_tag_visible(self, element):
24-
if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
24+
if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
2525
return False
2626
if isinstance(element, bs4.element.Comment):
2727
return False
2828
return True
2929

3030
def _preprocess_raw_html(self, html: str) -> str:
31-
soup = BeautifulSoup(html, 'html.parser')
31+
soup = BeautifulSoup(html, "html.parser")
3232
texts = soup.find_all(text=True)
3333
visible_texts = filter(self._is_tag_visible, texts)
34-
return '\n'.join(t.strip() for t in visible_texts)
34+
return "\n".join(t.strip() for t in visible_texts)
3535

3636
def _ingest_email_row(self, row: pd.Series) -> dict:
37-
if row['body_content_type'] == 'html':
37+
if row["body_content_type"] == "html":
3838
# Extract meaningful text from raw HTML.
39-
row['body'] = self._preprocess_raw_html(row['body'])
40-
body_str = row['body']
39+
row["body"] = self._preprocess_raw_html(row["body"])
40+
body_str = row["body"]
4141
encoding = None
4242
if isinstance(body_str, bytes):
43-
encoding = chardet.detect(body_str)['encoding']
44-
if 'windows' in encoding.lower():
43+
encoding = chardet.detect(body_str)["encoding"]
44+
if encoding is None:
45+
# If chardet can't detect the encoding, we default to utf-8.
46+
encoding = "utf-8"
47+
elif "windows" in encoding.lower():
4548
# Easier to treat this as utf-8 since str constructor doesn't support all encodings here:
4649
# https://chardet.readthedocs.io/en/latest/supported-encodings.html.
47-
encoding = 'utf-8'
50+
encoding = "utf-8"
4851
try:
4952
body_str = str(body_str, encoding=encoding)
5053
except UnicodeDecodeError:
5154
# If illegal characters are found, we ignore them.
5255
# I encountered this issue with some emails that had a mix of encodings.
53-
body_str = row['body'].decode(encoding, errors='ignore')
56+
body_str = row["body"].decode(encoding, errors="ignore")
5457
# We split by paragraph so make sure there aren't too many newlines in a row.
55-
body_str = re.sub(r'[\r\n]\s*[\r\n]', '\n\n', body_str)
58+
body_str = re.sub(r"[\r\n]\s*[\r\n]", "\n\n", body_str)
5659
email_data = {
57-
'id': row['id'],
58-
'body': body_str,
59-
'subject': row['subject'],
60-
'to_field': row['to_field'],
61-
'from_field': row['from_field'],
62-
'datetime': row['date']
60+
"id": row["id"],
61+
"body": body_str,
62+
"subject": row["subject"],
63+
"to_field": row["to_field"],
64+
"from_field": row["from_field"],
65+
"datetime": row["date"],
6366
}
6467
# Replacing None values {None: ""}
6568
for key in email_data:
@@ -77,10 +80,12 @@ def ingest(self) -> pd.DataFrame:
7780
df = pd.DataFrame(all_email_data)
7881

7982
# Replace "(UTC)" with empty string over a pandas DataFrame column
80-
if 'datetime' in df.columns:
81-
df['datetime'] = df['datetime'].str.replace(' (UTC)', '')
83+
if "datetime" in df.columns:
84+
df["datetime"] = df["datetime"].str.replace(" (UTC)", "")
8285

8386
# Convert datetime string to datetime object, and normalize timezone to UTC.
84-
df['datetime'] = pd.to_datetime(df['datetime'], utc=True, format="%a, %d %b %Y %H:%M:%S %z", errors='coerce')
87+
df["datetime"] = pd.to_datetime(
88+
df["datetime"], utc=True, format="%a, %d %b %Y %H:%M:%S %z", errors="coerce"
89+
)
8590

8691
return df

tests/unused/unit/handler_tests/test_email_handler.py

Lines changed: 63 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,6 @@ def test_connect_already_connected(self):
2828
assert connection is self.email_handler.connection, "The connection must be the same as the one in the handler."
2929

3030
def test_check_connection(self):
31-
3231
response = self.email_handler.check_connection()
3332
assert response.success is True, "The response success must be True."
3433

@@ -37,30 +36,38 @@ def test_select(self):
3736
Test the select method of EmailsTable Class
3837
"""
3938

40-
mock_df = pd.DataFrame({
41-
'date': ["Wed, 02 Feb 2022 15:30:00 +0000",
42-
"Thu, 10 Mar 2022 10:45:15 +0530",
43-
"Fri, 16 Dec 2022 20:15:30 -0400"
44-
],
45-
'body_content_type': ['html', 'html', 'text'],
46-
"body": ["<html><body><p>Hello, World!</p></body></html>", "<html><body><p>Hello, World!</p></body></html>", "Hello, World!"],
47-
"from_field": ["", "", ""],
48-
"id": ["", "", ""],
49-
"to_field": ["", "", ""],
50-
"subject": ["", "", ""],
51-
})
39+
mock_df = pd.DataFrame(
40+
{
41+
"date": [
42+
"Wed, 02 Feb 2022 15:30:00 +0000",
43+
"Thu, 10 Mar 2022 10:45:15 +0530",
44+
"Fri, 16 Dec 2022 20:15:30 -0400",
45+
],
46+
"body_content_type": ["html", "html", "text"],
47+
"body": [
48+
"<html><body><p>Hello, World!</p></body></html>",
49+
"<html><body><p>Hello, World!</p></body></html>",
50+
"Hello, World!",
51+
],
52+
"from_field": ["", "", ""],
53+
"id": ["", "", ""],
54+
"to_field": ["", "", ""],
55+
"subject": ["", "", ""],
56+
}
57+
)
5258

5359
self.emails_table_instance.handler.connection.search_email = MagicMock(return_value=mock_df)
5460

55-
query = parse_sql('SELECT * FROM emails limit 1')
61+
query = parse_sql("SELECT * FROM emails limit 1")
5662

5763
self.emails_table_instance.select(query)
5864

59-
assert self.emails_table_instance.handler.connection.search_email.called, ("The search_email "
60-
"method must be called.")
65+
assert self.emails_table_instance.handler.connection.search_email.called, (
66+
"The search_email method must be called."
67+
)
6168

6269
# select using invalid column should raise Exception
63-
query = parse_sql('SELECT invalid_column FROM emails limit 1')
70+
query = parse_sql("SELECT invalid_column FROM emails limit 1")
6471

6572
with pytest.raises(Exception):
6673
self.emails_table_instance.select(query)
@@ -73,16 +80,18 @@ def test_insert(self):
7380
self.emails_table_instance.handler.connection.send_email = MagicMock()
7481

7582
query = parse_sql(
76-
'INSERT INTO email_datasource.emails(to_field, subject, body) '
77-
'VALUES ("toemail@email.com", "MindsDB", "Hello from MindsDB!")')
83+
"INSERT INTO email_datasource.emails(to_field, subject, body) "
84+
'VALUES ("toemail@email.com", "MindsDB", "Hello from MindsDB!")'
85+
)
7886

7987
self.emails_table_instance.insert(query)
8088
assert self.emails_table_instance.handler.connection.send_email.called, "The send_email method must be called."
8189

8290
# insert using invalid column should raise Exception
8391
query = parse_sql(
84-
'INSERT INTO email_datasource.emails(to_field, subject, body, invalid_column) '
85-
'VALUES ("toemail@email.com", "MindsDB", "blaha" , "invalid")')
92+
"INSERT INTO email_datasource.emails(to_field, subject, body, invalid_column) "
93+
'VALUES ("toemail@email.com", "MindsDB", "blaha" , "invalid")'
94+
)
8695

8796
with pytest.raises(Exception):
8897
self.emails_table_instance.insert(query)
@@ -94,9 +103,36 @@ def test_get_columns(self):
94103

95104
columns = self.emails_table_instance.get_columns()
96105
assert isinstance(columns, list), "The returned value must be a list."
97-
assert 'id' in columns, "Column 'id' must be in the columns list."
98-
assert 'body' in columns, "Column 'body' must be in the columns list."
99-
assert 'subject' in columns, "Column 'subject' must be in the columns list."
100-
assert 'to_field' in columns, "Column 'to_field' must be in the columns list."
101-
assert 'from_field' in columns, "Column 'from_field' must be in the columns list."
102-
assert 'datetime' in columns, "Column 'datetime' must be in the columns list."
106+
assert "id" in columns, "Column 'id' must be in the columns list."
107+
assert "body" in columns, "Column 'body' must be in the columns list."
108+
assert "subject" in columns, "Column 'subject' must be in the columns list."
109+
assert "to_field" in columns, "Column 'to_field' must be in the columns list."
110+
assert "from_field" in columns, "Column 'from_field' must be in the columns list."
111+
assert "datetime" in columns, "Column 'datetime' must be in the columns list."
112+
113+
def test_undetectable_encoding_handling(self):
114+
"""
115+
Test that the email handler can process emails with undetectable encodings
116+
without raising exceptions.
117+
"""
118+
119+
undetectable_content = b"\x80\x81\x82\x83\x84\x85\x86\x87"
120+
mock_df = pd.DataFrame(
121+
{
122+
"date": ["Wed, 02 Feb 2022 15:30:00 +0000"],
123+
"body_content_type": ["text"],
124+
"body": [undetectable_content],
125+
"from_field": ["test@example.com"],
126+
"id": ["test1"],
127+
"to_field": ["recipient@example.com"],
128+
"subject": ["Test email with undetectable encoding"],
129+
}
130+
)
131+
132+
self.emails_table_instance.handler.connection.search_email = MagicMock(return_value=mock_df)
133+
query = parse_sql("SELECT * FROM emails limit 1")
134+
result = self.emails_table_instance.select(query)
135+
136+
assert result is not None, "The result must not be None."
137+
assert "body" in result.columns, "The body should be in the result columns."
138+
assert len(result) > 0, "The result should not be empty."

0 commit comments

Comments
 (0)