Skip to content
2 changes: 2 additions & 0 deletions src/ydata_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class CatVars(BaseModel):
redact: bool = False
histogram_largest: int = 50
stop_words: List[str] = []
dirty_categories: bool = False
dirty_categories_threshold: float = 0.85


class BoolVars(BaseModel):
Expand Down
52 changes: 51 additions & 1 deletion src/ydata_profiling/model/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class AlertType(Enum):
DUPLICATES = auto()
"""This variable contains duplicates."""

NEAR_DUPLICATES = auto()
"""This variable contains near duplicates."""

SKEWED = auto()
"""This variable is highly skewed."""

Expand All @@ -70,6 +73,9 @@ class AlertType(Enum):
UNIQUE = auto()
"""This variable has unique values."""

DIRTY_CATEGORY = auto()
"""This variable is a category with potentially fuzzy values, which may cause consistency issues."""

CONSTANT_LENGTH = auto()
"""This variable has a constant length."""

Expand Down Expand Up @@ -205,7 +211,29 @@ def _get_description(self) -> str:
if self.values is not None:
return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows"
else:
return "Dataset has duplicated values"
return "Dataset has no duplicated rows"


class NearDuplicatesAlert(Alert):
    """Alert describing near duplicate rows found in the dataset."""

    def __init__(
        self,
        values: Optional[Dict] = None,
        column_name: Optional[str] = None,
        is_empty: bool = False,
    ):
        """Forward the arguments to ``Alert`` as a ``NEAR_DUPLICATES`` alert.

        Args:
            values: Statistics backing the alert. The description reads the
                ``n_near_dups`` and ``p_near_dups`` keys when this is given.
            column_name: Passed through unchanged to the base class.
            is_empty: Passed through unchanged to the base class.
        """
        super().__init__(
            alert_type=AlertType.NEAR_DUPLICATES,
            values=values,
            column_name=column_name,
            fields={"n_near_dups"},
            is_empty=is_empty,
        )

    def _get_description(self) -> str:
        """Return the human-readable summary of this alert."""
        stats = self.values
        if stats is None:
            return "Dataset has no near duplicated rows"

        count = stats["n_near_dups"]
        share = fmt_percent(stats["p_near_dups"])
        return f"Dataset has {count} ({share}) near duplicate rows"


class EmptyAlert(Alert):
Expand Down Expand Up @@ -249,6 +277,28 @@ def _get_description(self) -> str:
return f"[{self.column_name}] has a high cardinality"


class DirtyCategoryAlert(Alert):
    """Alert describing fuzzy ("dirty") values found in a categorical column."""

    def __init__(
        self,
        values: Optional[Dict] = None,
        column_name: Optional[str] = None,
        is_empty: bool = False,
    ):
        """Forward the arguments to ``Alert`` as a ``DIRTY_CATEGORY`` alert.

        Args:
            values: Statistics backing the alert. The description reads the
                ``n_fuzzy_vals`` and ``p_fuzzy_vals`` keys when this is given.
            column_name: Name of the column; it prefixes the description.
            is_empty: Passed through unchanged to the base class.
        """
        super().__init__(
            alert_type=AlertType.DIRTY_CATEGORY,
            values=values,
            column_name=column_name,
            fields={"n_fuzzy_vals"},
            is_empty=is_empty,
        )

    def _get_description(self) -> str:
        """Return the human-readable summary of this alert."""
        stats = self.values
        if stats is None:
            return f"[{self.column_name}] no dirty categories values."

        fuzzy_count = stats["n_fuzzy_vals"]
        fuzzy_share = fmt_percent(stats["p_fuzzy_vals"])
        return f"[{self.column_name}] has {fuzzy_count} fuzzy values: {fuzzy_share} per category"


class HighCorrelationAlert(Alert):
def __init__(
self,
Expand Down
15 changes: 15 additions & 0 deletions src/ydata_profiling/model/pandas/describe_categorical_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
series_handle_nulls,
series_hashable,
)
from ydata_profiling.utils.information import DisplayInfo


def get_character_counts_vc(vc: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -210,6 +211,9 @@ def length_summary_vc(vc: pd.Series) -> dict:
return summary


# Set to True once the dirty-categories info banner has been displayed, so
# that the banner is only shown the first time it is requested.
_displayed_catvar_banner = False


@describe_categorical_1d.register
@series_hashable
@series_handle_nulls
Expand All @@ -226,6 +230,8 @@ def pandas_describe_categorical_1d(
Returns:
A dict containing calculated series description values.
"""
# Global info banner
global _displayed_catvar_banner

# Make sure we deal with strings (Issue #100)
series = series.astype(str)
Expand Down Expand Up @@ -262,4 +268,13 @@ def pandas_describe_categorical_1d(
if config.vars.cat.words:
summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words))

if config.vars.cat.dirty_categories: # noqa: SIM102
if not _displayed_catvar_banner:
display_info = DisplayInfo(
title="Identify dirty categories with ydata-sdk",
info_text="This feature is only available for ydata-sdk users. Register to give try it.",
)
display_info.display_message()
_displayed_catvar_banner = True

return config, series, summary
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> has dirty categories: {{ alert.values['n_fuzzy_vals'] }} distinct values
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Dataset has {{ alert.values['n_near_dups'] }} ({{ alert.values['p_near_dups'] | fmt_percent }}) <a href="#near_duplicate">near duplicate rows</a>
58 changes: 42 additions & 16 deletions src/ydata_profiling/utils/information.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,48 @@ def in_jupyter_notebook() -> bool:
return isiPython


class DisplayInfo:
    """Informational message made of a title, a body text and a link.

    The message is rendered as HTML when ``in_jupyter_notebook()`` is true and
    printed to stdout as plain text (with a bold blue title) otherwise.
    """

    def __init__(
        self,
        title: str,
        info_text: str,
        link: str = "https://ydata.ai/register",
    ):
        """Store the parts of the message.

        Args:
            title: Headline of the message; used as the link text in HTML.
            info_text: Body text shown below the title.
            link: URL the title links to in HTML, and that is printed after
                the body text in the plain-text rendering.
        """
        self.title = title
        self.link = link
        self.info_text = info_text

    def display_message(self) -> None:
        """
        Display an HTML message in case the user is in a Jupyter Notebook,
        otherwise print the same information to stdout.
        """
        if in_jupyter_notebook():
            # Imported lazily: IPython is only needed inside a notebook.
            from IPython.display import HTML, display

            info = f"""
            <div>
              <ins><a href="{self.link}">{self.title}</a></ins>
              <p>
                {self.info_text}
              </p>
            </div>
            """
            display(HTML(info))
        else:
            # \033[1;34m ... \033[0m renders the title in bold blue on ANSI terminals.
            info = (
                f"\033[1;34m{self.title}\033[0m"
                + "\n"
                + f"{self.info_text}"
                + "\n"
                + f"Register at {self.link}"
            )
            print(info)  # noqa: T201


def display_banner() -> None:
    """Show the promotional banner the first time this function is called.

    The banner is built from the module-level ``title``, ``info_text`` and
    ``link`` and rendered through ``DisplayInfo``. Later calls do nothing,
    which is tracked with the module-level ``_displayed_banner`` flag.
    """
    global _displayed_banner
    if _displayed_banner:
        return

    # Rendering goes through DisplayInfo only; an additional inline HTML/print
    # path here would show the banner twice. The module-level link is passed
    # explicitly rather than relying on DisplayInfo's default.
    banner_info = DisplayInfo(title=title, info_text=info_text, link=link)
    banner_info.display_message()
    _displayed_banner = True