diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index e5e4624db..09dbecdde 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -68,6 +68,8 @@ class CatVars(BaseModel): redact: bool = False histogram_largest: int = 50 stop_words: List[str] = [] + dirty_categories: bool = False + dirty_categories_threshold: float = 0.85 class BoolVars(BaseModel): diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index d6b627a8f..1b16d27a0 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -52,6 +52,9 @@ class AlertType(Enum): DUPLICATES = auto() """This variable contains duplicates.""" + NEAR_DUPLICATES = auto() + """This variable contains near duplicates.""" + SKEWED = auto() """This variable is highly skewed.""" @@ -70,6 +73,9 @@ class AlertType(Enum): UNIQUE = auto() """This variable has unique values.""" + DIRTY_CATEGORY = auto() + """This variable is a category with potential fuzzy values and may therefore have consistency issues.""" + CONSTANT_LENGTH = auto() """This variable has a constant length.""" @@ -205,7 +211,29 @@ def _get_description(self) -> str: if self.values is not None: return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows" else: - return "Dataset has duplicated values" + return "Dataset has no duplicated rows" + + +class NearDuplicatesAlert(Alert): + def __init__( + self, + values: Optional[Dict] = None, + column_name: Optional[str] = None, + is_empty: bool = False, + ): + super().__init__( + alert_type=AlertType.NEAR_DUPLICATES, + values=values, + column_name=column_name, + fields={"n_near_dups"}, + is_empty=is_empty, + ) + + def _get_description(self) -> str: + if self.values is not None: + return f"Dataset has {self.values['n_near_dups']} ({fmt_percent(self.values['p_near_dups'])}) near duplicate rows" + else: + return "Dataset has no near duplicated rows" 
class EmptyAlert(Alert): @@ -249,6 +277,28 @@ def _get_description(self) -> str: return f"[{self.column_name}] has a high cardinality" +class DirtyCategoryAlert(Alert): + def __init__( + self, + values: Optional[Dict] = None, + column_name: Optional[str] = None, + is_empty: bool = False, + ): + super().__init__( + alert_type=AlertType.DIRTY_CATEGORY, + values=values, + column_name=column_name, + fields={"n_fuzzy_vals"}, + is_empty=is_empty, + ) + + def _get_description(self) -> str: + if self.values is not None: + return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category" + else: + return f"[{self.column_name}] has no dirty category values." + + class HighCorrelationAlert(Alert): def __init__( self, diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index 31ae57417..d7ffce62e 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -16,6 +16,7 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.utils.information import DisplayInfo def get_character_counts_vc(vc: pd.Series) -> pd.Series: @@ -210,6 +211,9 @@ def length_summary_vc(vc: pd.Series) -> dict: return summary +_displayed_catvar_banner = False + + @describe_categorical_1d.register @series_hashable @series_handle_nulls @@ -226,6 +230,8 @@ def pandas_describe_categorical_1d( Returns: A dict containing calculated series description values. 
""" + # Global info banner + global _displayed_catvar_banner # Make sure we deal with strings (Issue #100) series = series.astype(str) @@ -262,4 +268,13 @@ def pandas_describe_categorical_1d( if config.vars.cat.words: summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words)) + if config.vars.cat.dirty_categories: # noqa: SIM102 + if not _displayed_catvar_banner: + display_info = DisplayInfo( + title="Identify dirty categories with ydata-sdk", + info_text="This feature is only available for ydata-sdk users. Register to give it a try.", + ) + display_info.display_message() + _displayed_catvar_banner = True + return config, series, summary diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html new file mode 100644 index 000000000..f1683ca4e --- /dev/null +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_dirty_category.html @@ -0,0 +1 @@ +{{ alert.column_name }} has dirty categories: {{ alert.values['n_fuzzy_vals'] }} distinct values diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html new file mode 100644 index 000000000..10ad13c82 --- /dev/null +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_near_duplicates.html @@ -0,0 +1 @@ +Dataset has {{ alert.values['n_near_dups'] }} ({{ alert.values['p_near_dups'] | fmt_percent }}) near duplicate rows diff --git a/src/ydata_profiling/utils/information.py b/src/ydata_profiling/utils/information.py index 5f606a0e6..2ac876d9b 100644 --- a/src/ydata_profiling/utils/information.py +++ b/src/ydata_profiling/utils/information.py @@ -21,22 +21,48 @@ def in_jupyter_notebook() -> bool: return isiPython +class DisplayInfo: + def __init__( + self, + 
title: str, + info_text: str, + link: str = "https://ydata.ai/register", + ): + self.title = title + self.link = link + self.info_text = info_text + + def display_message(self) -> None: + """ + Display an HTML message in case the user is in a Jupyter Notebook + """ + if in_jupyter_notebook(): + from IPython.display import HTML, display + + info = f""" +
+ {self.title} +

+ {self.info_text} +

+
+ """ + display(HTML(info)) + else: + info = ( + f"\033[1;34m{self.title}\033[0m" + + "\n" + + f"{self.info_text}" + + "\n" + + f"Register at {self.link}" + ) + print(info) # noqa: T201 + + def display_banner() -> None: global _displayed_banner - if in_jupyter_notebook() and not _displayed_banner: - from IPython.display import HTML, display - - banner_html = f""" -
- {title} -

- {info_text} -

-
- """ - display(HTML(banner_html)) - else: - print(f"\033[1;34m{title}\033[0m") # noqa: T201 - print(info_text) # noqa: T201 - print(f"Register at {link}") # noqa: T201 + banner_info = DisplayInfo(title=title, info_text=info_text) + + if not _displayed_banner: + banner_info.display_message() _displayed_banner = True