From 0f95acb11ca564f372d837ff89876b6042bf6315 Mon Sep 17 00:00:00 2001 From: dfgvaetyj3456356-hash Date: Thu, 28 May 2026 06:17:30 -0500 Subject: [PATCH 1/2] security: add timeouts to HTTP requests to prevent DoS via slow servers Adds a 30-second timeout to requests.get() calls in cache.py and common.py to prevent indefinite hanging when remote servers are unresponsive. This mitigates potential denial-of-service via slow or malicious endpoints. Note that the requests timeout bounds the connect wait and each read wait, not the total transfer time, so a server that keeps trickling bytes is not cut off. --- src/data_profiling/utils/cache.py | 5 ++++- src/data_profiling/utils/common.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/data_profiling/utils/cache.py b/src/data_profiling/utils/cache.py index 6945d5916..4ad8ccf30 100644 --- a/src/data_profiling/utils/cache.py +++ b/src/data_profiling/utils/cache.py @@ -2,7 +2,10 @@ import zipfile from pathlib import Path -from requests import get as get_file +from functools import partial +from requests import get as _get_file + +get_file = partial(_get_file, timeout=30) from data_profiling.utils.paths import get_data_path diff --git a/src/data_profiling/utils/common.py b/src/data_profiling/utils/common.py index 158ae5d20..76268d57b 100644 --- a/src/data_profiling/utils/common.py +++ b/src/data_profiling/utils/common.py @@ -102,7 +102,7 @@ def analytics_features( f"&dbx={dbx}" ) - requests.get(request_message) + requests.get(request_message, timeout=30) def is_running_in_databricks(): From fdcc23616f6efb1b7e48c6acbdca63aac75cebbf Mon Sep 17 00:00:00 2001 From: Security Fix Date: Fri, 29 May 2026 04:28:02 -0500 Subject: [PATCH 2/2] security: warn on untrusted pickle loads, fix XSS and zip slip vulnerabilities - serialize_report.py: add trusted_source parameter; a RuntimeWarning is emitted before pickle.loads when it is False (the data is still loaded) - dataframe.py: add trusted_source parameter; a RuntimeWarning is emitted before pd.read_pickle when it is False (the file is still loaded) - templates.py: enable Jinja2 autoescape to prevent XSS in HTML reports - alerts.py: escape HTML in tooltip data-bs-title attribute - common.py: validate zip extraction paths to prevent zip slip --- 
src/data_profiling/model/alerts.py | 5 +++- .../presentation/flavours/html/templates.py | 5 +++- src/data_profiling/serialize_report.py | 25 ++++++++++++++++--- src/data_profiling/utils/common.py | 7 ++++++ src/data_profiling/utils/dataframe.py | 10 +++++++- 5 files changed, 46 insertions(+), 6 deletions(-) diff --git a/src/data_profiling/model/alerts.py b/src/data_profiling/model/alerts.py index 7a0d7b776..67732ae40 100644 --- a/src/data_profiling/model/alerts.py +++ b/src/data_profiling/model/alerts.py @@ -7,6 +7,8 @@ import numpy as np import pandas as pd +import html + from data_profiling.config import Settings from data_profiling.model.correlations import perform_check_correlation from data_profiling.utils.styles import get_alert_styles @@ -134,7 +136,8 @@ def fmt(self) -> str: num = len(self.values["fields"]) title = ", ".join(self.values["fields"]) corr = self.values["corr"] - hint = f'data-bs-toggle="tooltip" data-bs-placement="right" data-bs-title="This variable has a high {corr} correlation with {num} fields: {title}"' + safe_title = html.escape(title) + hint = f'data-bs-toggle="tooltip" data-bs-placement="right" data-bs-title="This variable has a high {corr} correlation with {num} fields: {safe_title}"' return ( f'{self.alert_type_name}' diff --git a/src/data_profiling/report/presentation/flavours/html/templates.py b/src/data_profiling/report/presentation/flavours/html/templates.py index b1ba21cf8..8ce54cb61 100644 --- a/src/data_profiling/report/presentation/flavours/html/templates.py +++ b/src/data_profiling/report/presentation/flavours/html/templates.py @@ -12,7 +12,10 @@ "data_profiling", "report/presentation/flavours/html/templates" ) jinja2_env = jinja2.Environment( - lstrip_blocks=True, trim_blocks=True, loader=package_loader + lstrip_blocks=True, + trim_blocks=True, + loader=package_loader, + autoescape=jinja2.select_autoescape(["html", "xml"]), ) jinja2_env.filters["is_list"] = lambda x: isinstance(x, list) jinja2_env.filters["fmt_badge"] = 
fmt_badge diff --git a/src/data_profiling/serialize_report.py b/src/data_profiling/serialize_report.py index bab5ad5e9..02dd1524c 100644 --- a/src/data_profiling/serialize_report.py +++ b/src/data_profiling/serialize_report.py @@ -43,12 +43,15 @@ def dumps(self) -> bytes: ] ) - def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]: + def loads( + self, data: bytes, trusted_source: bool = False + ) -> Union["ProfileReport", "SerializeReport"]: """ Deserialize the serialized report Args: data: The bytes of a serialize ProfileReport object. + trusted_source: Whether the data comes from a trusted source. If False, a RuntimeWarning is emitted before the data is unpickled; the data is still loaded. Raises: ValueError: if ignore_config is set to False and the configs do not match. @@ -58,6 +61,14 @@ def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]: """ import pickle + if not trusted_source: + warnings.warn( + "Deserializing untrusted data with pickle can lead to remote code execution. " + "Only load data from trusted sources or set trusted_source=True if you accept the risk.", + RuntimeWarning, + stacklevel=2, + ) + try: ( df_hash, @@ -120,6 +131,10 @@ def loads(self, data: bytes) -> Union["ProfileReport", "SerializeReport"]: def dump(self, output_file: Union[Path, str]) -> None: """ Dump ProfileReport to file + + Args: + output_file: The path to write the serialized report to. + Read the file back with load() only if it is trusted, because load() unpickles it. """ if not isinstance(output_file, Path): output_file = Path(str(output_file)) @@ -128,16 +143,20 @@ def dump(self, output_file: Union[Path, str]) -> None: output_file.write_bytes(self.dumps()) def load( - self, load_file: Union[Path, str] + self, load_file: Union[Path, str], trusted_source: bool = False ) -> Union["ProfileReport", "SerializeReport"]: """ Load ProfileReport from file + Args: + load_file: The path to read the serialized report from. + trusted_source: Whether the data comes from a trusted source. If False, a RuntimeWarning is emitted before the file is unpickled; the file is still loaded. 
+ Raises: ValueError: if the DataFrame or Config do not match with the current ProfileReport """ if not isinstance(load_file, Path): load_file = Path(str(load_file)) - self.loads(load_file.read_bytes()) + self.loads(load_file.read_bytes(), trusted_source=trusted_source) return self diff --git a/src/data_profiling/utils/common.py b/src/data_profiling/utils/common.py index 76268d57b..22ae58574 100644 --- a/src/data_profiling/utils/common.py +++ b/src/data_profiling/utils/common.py @@ -56,8 +56,15 @@ def _copy(self, target): def extract_zip(outfile, effective_path): + effective_path = Path(effective_path).resolve() try: with zipfile.ZipFile(outfile) as z: + for member in z.namelist(): + member_path = (effective_path / member).resolve() + if not str(member_path).startswith(str(effective_path) + os.sep): + raise ValueError( + f"Zip file contains unsafe path: {member}" + ) z.extractall(effective_path) except zipfile.BadZipFile as e: raise ValueError("Bad zip file") from e diff --git a/src/data_profiling/utils/dataframe.py b/src/data_profiling/utils/dataframe.py index 19fe608cc..91d385d58 100644 --- a/src/data_profiling/utils/dataframe.py +++ b/src/data_profiling/utils/dataframe.py @@ -74,12 +74,13 @@ def uncompressed_extension(file_name: Path) -> str: ) -def read_pandas(file_name: Path) -> pd.DataFrame: +def read_pandas(file_name: Path, trusted_source: bool = False) -> pd.DataFrame: """Read DataFrame based on the file extension. This function is used when the file is in a standard format. Various file types are supported (.csv, .json, .jsonl, .data, .tsv, .xls, .xlsx, .xpt, .sas7bdat, .parquet) Args: file_name: the file to read + trusted_source: Whether the file comes from a trusted source. 
Returns: DataFrame @@ -111,6 +112,13 @@ def read_pandas(file_name: Path) -> pd.DataFrame: elif extension == ".parquet": df = pd.read_parquet(str(file_name)) elif extension in [".pkl", ".pickle"]: + if not trusted_source: + warnings.warn( + "Loading pickle files from untrusted sources can lead to remote code execution. " + "Only load pickle files from trusted sources or set trusted_source=True if you accept the risk.", + RuntimeWarning, + stacklevel=2, + ) df = pd.read_pickle(str(file_name)) elif extension == ".tar": raise ValueError(