diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py
index e5e4624db..09dbecdde 100644
--- a/src/ydata_profiling/config.py
+++ b/src/ydata_profiling/config.py
@@ -68,6 +68,8 @@ class CatVars(BaseModel):
redact: bool = False
histogram_largest: int = 50
stop_words: List[str] = []
+ dirty_categories: bool = False
+ dirty_categories_threshold: float = 0.85
class BoolVars(BaseModel):
diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py
index d6b627a8f..1b16d27a0 100644
--- a/src/ydata_profiling/model/alerts.py
+++ b/src/ydata_profiling/model/alerts.py
@@ -52,6 +52,9 @@ class AlertType(Enum):
DUPLICATES = auto()
"""This variable contains duplicates."""
+ NEAR_DUPLICATES = auto()
+ """This dataset contains near-duplicate rows."""
+
SKEWED = auto()
"""This variable is highly skewed."""
@@ -70,6 +73,9 @@ class AlertType(Enum):
UNIQUE = auto()
"""This variable has unique values."""
+ DIRTY_CATEGORY = auto()
+ """This variable is categorical with potentially fuzzy values and may therefore have consistency issues."""
+
CONSTANT_LENGTH = auto()
"""This variable has a constant length."""
@@ -205,7 +211,29 @@ def _get_description(self) -> str:
if self.values is not None:
return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows"
else:
- return "Dataset has duplicated values"
+ return "Dataset has no duplicated rows"
+
+
+class NearDuplicatesAlert(Alert):
+    """Alert describing near-duplicate rows found in the dataset."""
+
+    def __init__(
+        self,
+        values: Optional[Dict] = None,
+        column_name: Optional[str] = None,
+        is_empty: bool = False,
+    ):
+        """Create the alert with type ``AlertType.NEAR_DUPLICATES``.
+
+        Args:
+            values: Optional mapping of alert statistics; the description
+                reads its ``n_near_dups`` and ``p_near_dups`` entries.
+            column_name: Optional column name, forwarded unchanged to ``Alert``.
+            is_empty: Forwarded unchanged to ``Alert``.
+        """
+        super().__init__(
+            alert_type=AlertType.NEAR_DUPLICATES,
+            values=values,
+            column_name=column_name,
+            fields={"n_near_dups"},
+            is_empty=is_empty,
+        )
+
+    def _get_description(self) -> str:
+        """Return the one-line, human-readable summary of this alert."""
+        # Guard clause: without statistics only the generic message is possible.
+        if self.values is None:
+            return "Dataset has no near duplicated rows"
+
+        count = self.values["n_near_dups"]
+        share = fmt_percent(self.values["p_near_dups"])
+        return f"Dataset has {count} ({share}) near duplicate rows"
class EmptyAlert(Alert):
@@ -249,6 +277,28 @@ def _get_description(self) -> str:
return f"[{self.column_name}] has a high cardinality"
+class DirtyCategoryAlert(Alert):
+    """Alert for a categorical column with potentially fuzzy values."""
+
+    def __init__(
+        self,
+        values: Optional[Dict] = None,
+        column_name: Optional[str] = None,
+        is_empty: bool = False,
+    ):
+        """Create the alert with type ``AlertType.DIRTY_CATEGORY``.
+
+        Args:
+            values: Optional mapping of alert statistics; the description
+                reads its ``n_fuzzy_vals`` and ``p_fuzzy_vals`` entries.
+            column_name: Name of the affected column, shown in the description.
+            is_empty: Forwarded unchanged to ``Alert``.
+        """
+        super().__init__(
+            alert_type=AlertType.DIRTY_CATEGORY,
+            values=values,
+            column_name=column_name,
+            fields={"n_fuzzy_vals"},
+            is_empty=is_empty,
+        )
+
+    def _get_description(self) -> str:
+        """Return the one-line, human-readable summary of this alert."""
+        if self.values is None:
+            # The previous fallback text lacked a verb ("[col] no dirty
+            # categories values."); phrase it like the sibling alerts.
+            return f"[{self.column_name}] has no dirty category values"
+
+        return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category"
+
+
class HighCorrelationAlert(Alert):
def __init__(
self,
diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
index 31ae57417..d7ffce62e 100644
--- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
@@ -16,6 +16,7 @@
series_handle_nulls,
series_hashable,
)
+from ydata_profiling.utils.information import DisplayInfo
def get_character_counts_vc(vc: pd.Series) -> pd.Series:
@@ -210,6 +211,9 @@ def length_summary_vc(vc: pd.Series) -> dict:
return summary
+# Set to True once the ydata-sdk "dirty categories" info banner has been
+# displayed, so pandas_describe_categorical_1d shows it only on the first
+# call that has config.vars.cat.dirty_categories enabled.
+_displayed_catvar_banner = False
+
+
@describe_categorical_1d.register
@series_hashable
@series_handle_nulls
@@ -226,6 +230,8 @@ def pandas_describe_categorical_1d(
Returns:
A dict containing calculated series description values.
"""
+ # Global info banner
+ global _displayed_catvar_banner
# Make sure we deal with strings (Issue #100)
series = series.astype(str)
@@ -262,4 +268,13 @@ def pandas_describe_categorical_1d(
if config.vars.cat.words:
summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words))
+    if config.vars.cat.dirty_categories:  # noqa: SIM102
+        # Show the ydata-sdk banner only once: the module-level flag is set
+        # after the first display so later calls skip it.
+        if not _displayed_catvar_banner:
+            display_info = DisplayInfo(
+                title="Identify dirty categories with ydata-sdk",
+                info_text="This feature is only available for ydata-sdk users. Register to give it a try.",
+            )
+            display_info.display_message()
+            _displayed_catvar_banner = True
+
return config, series, summary
diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html
new file mode 100644
index 000000000..f1683ca4e
--- /dev/null
+++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html
@@ -0,0 +1 @@
+{{ alert.column_name }} has dirty categories: {{ alert.values['n_fuzzy_vals'] }} distinct values
diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html
new file mode 100644
index 000000000..10ad13c82
--- /dev/null
+++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html
@@ -0,0 +1 @@
+Dataset has {{ alert.values['n_near_dups'] }} ({{ alert.values['p_near_dups'] | fmt_percent }}) near duplicate rows
diff --git a/src/ydata_profiling/utils/information.py b/src/ydata_profiling/utils/information.py
index 5f606a0e6..2ac876d9b 100644
--- a/src/ydata_profiling/utils/information.py
+++ b/src/ydata_profiling/utils/information.py
@@ -21,22 +21,48 @@ def in_jupyter_notebook() -> bool:
return isiPython
+class DisplayInfo:
+    def __init__(
+        self,
+        title: str,
+        info_text: str,
+        link: str = "https://ydata.ai/register",
+    ):
+        """Store the pieces of an informational message.
+
+        Args:
+            title: Title of the message.
+            info_text: Main text of the message.
+            link: URL attached to the message. The default previously read
+                "ttps://..." (missing the leading "h"), which is not a valid
+                URL scheme.
+        """
+        self.title = title
+        self.link = link
+        self.info_text = info_text
+
+ def display_message(self) -> None:
+ """
+ Display an HTML message in case the user is in a Jupyter Notebook
+ """
+ if in_jupyter_notebook():
+ from IPython.display import HTML, display
+
+ info = f"""
+
+ {self.info_text} +
+- {info_text} -
-