Skip to content

Commit fdcc236

Browse files
author
Security Fix
committed
security: fix pickle RCE, XSS, and zip slip vulnerabilities
- serialize_report.py: add trusted_source parameter with RuntimeWarning for pickle.loads
- dataframe.py: add trusted_source parameter with RuntimeWarning for pd.read_pickle
- templates.py: enable Jinja2 autoescape to prevent XSS in HTML reports
- alerts.py: escape HTML in tooltip data-bs-title attribute
- common.py: validate zip extraction paths to prevent zip slip
1 parent 0f95acb commit fdcc236

5 files changed

Lines changed: 46 additions & 6 deletions

File tree

src/data_profiling/model/alerts.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import numpy as np
88
import pandas as pd
99

10+
import html
11+
1012
from data_profiling.config import Settings
1113
from data_profiling.model.correlations import perform_check_correlation
1214
from data_profiling.utils.styles import get_alert_styles
@@ -134,7 +136,8 @@ def fmt(self) -> str:
134136
num = len(self.values["fields"])
135137
title = ", ".join(self.values["fields"])
136138
corr = self.values["corr"]
137-
hint = f'data-bs-toggle="tooltip" data-bs-placement="right" data-bs-title="This variable has a high {corr} correlation with {num} fields: {title}"'
139+
safe_title = html.escape(title)
140+
hint = f'data-bs-toggle="tooltip" data-bs-placement="right" data-bs-title="This variable has a high {corr} correlation with {num} fields: {safe_title}"'
138141

139142
return (
140143
f'<span class="badge text-bg-{style}" {hint}>{self.alert_type_name}</span>'

src/data_profiling/report/presentation/flavours/html/templates.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@
1212
"data_profiling", "report/presentation/flavours/html/templates"
1313
)
1414
jinja2_env = jinja2.Environment(
15-
lstrip_blocks=True, trim_blocks=True, loader=package_loader
15+
lstrip_blocks=True,
16+
trim_blocks=True,
17+
loader=package_loader,
18+
autoescape=jinja2.select_autoescape(["html", "xml"]),
1619
)
1720
jinja2_env.filters["is_list"] = lambda x: isinstance(x, list)
1821
jinja2_env.filters["fmt_badge"] = fmt_badge

src/data_profiling/serialize_report.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,15 @@ def dumps(self) -> bytes:
4343
]
4444
)
4545

46-
def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
46+
def loads(
47+
self, data: bytes, trusted_source: bool = False
48+
) -> Union["ProfileReport", "SerializeReport"]:
4749
"""
4850
Deserialize the serialized report
4951
5052
Args:
5153
data: The bytes of a serialize ProfileReport object.
54+
trusted_source: Whether the data comes from a trusted source.
5255
5356
Raises:
5457
ValueError: if ignore_config is set to False and the configs do not match.
@@ -58,6 +61,14 @@ def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
5861
"""
5962
import pickle
6063

64+
if not trusted_source:
65+
warnings.warn(
66+
"Deserializing untrusted data with pickle can lead to remote code execution. "
67+
"Only load data from trusted sources or set trusted_source=True if you accept the risk.",
68+
RuntimeWarning,
69+
stacklevel=2,
70+
)
71+
6172
try:
6273
(
6374
df_hash,
@@ -120,6 +131,10 @@ def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]:
120131
def dump(self, output_file: Union[Path, str]) -> None:
121132
"""
122133
Dump ProfileReport to file
134+
135+
Args:
136+
output_file: The path to write the serialized report to.
137+
trusted_source: Whether the data will be treated as from a trusted source on load.
123138
"""
124139
if not isinstance(output_file, Path):
125140
output_file = Path(str(output_file))
@@ -128,16 +143,20 @@ def dump(self, output_file: Union[Path, str]) -> None:
128143
output_file.write_bytes(self.dumps())
129144

130145
def load(
131-
self, load_file: Union[Path, str]
146+
self, load_file: Union[Path, str], trusted_source: bool = False
132147
) -> Union["ProfileReport", "SerializeReport"]:
133148
"""
134149
Load ProfileReport from file
135150
151+
Args:
152+
load_file: The path to read the serialized report from.
153+
trusted_source: Whether the data comes from a trusted source.
154+
136155
Raises:
137156
ValueError: if the DataFrame or Config do not match with the current ProfileReport
138157
"""
139158
if not isinstance(load_file, Path):
140159
load_file = Path(str(load_file))
141160

142-
self.loads(load_file.read_bytes())
161+
self.loads(load_file.read_bytes(), trusted_source=trusted_source)
143162
return self

src/data_profiling/utils/common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,15 @@ def _copy(self, target):
5656

5757

5858
def extract_zip(outfile, effective_path):
59+
effective_path = Path(effective_path).resolve()
5960
try:
6061
with zipfile.ZipFile(outfile) as z:
62+
for member in z.namelist():
63+
member_path = (effective_path / member).resolve()
64+
if not str(member_path).startswith(str(effective_path) + os.sep):
65+
raise ValueError(
66+
f"Zip file contains unsafe path: {member}"
67+
)
6168
z.extractall(effective_path)
6269
except zipfile.BadZipFile as e:
6370
raise ValueError("Bad zip file") from e

src/data_profiling/utils/dataframe.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,13 @@ def uncompressed_extension(file_name: Path) -> str:
7474
)
7575

7676

77-
def read_pandas(file_name: Path) -> pd.DataFrame:
77+
def read_pandas(file_name: Path, trusted_source: bool = False) -> pd.DataFrame:
7878
"""Read DataFrame based on the file extension. This function is used when the file is in a standard format.
7979
Various file types are supported (.csv, .json, .jsonl, .data, .tsv, .xls, .xlsx, .xpt, .sas7bdat, .parquet)
8080
8181
Args:
8282
file_name: the file to read
83+
trusted_source: Whether the file comes from a trusted source.
8384
8485
Returns:
8586
DataFrame
@@ -111,6 +112,13 @@ def read_pandas(file_name: Path) -> pd.DataFrame:
111112
elif extension == ".parquet":
112113
df = pd.read_parquet(str(file_name))
113114
elif extension in [".pkl", ".pickle"]:
115+
if not trusted_source:
116+
warnings.warn(
117+
"Loading pickle files from untrusted sources can lead to remote code execution. "
118+
"Only load pickle files from trusted sources or set trusted_source=True if you accept the risk.",
119+
RuntimeWarning,
120+
stacklevel=2,
121+
)
114122
df = pd.read_pickle(str(file_name))
115123
elif extension == ".tar":
116124
raise ValueError(

0 commit comments

Comments
 (0)