From c39e0b3407a4cf6dccc7c28314733b3191388ab7 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 15:59:52 +0800 Subject: [PATCH 1/8] feat: initial release --- Dockerfile | 21 +++ src/ydata_profiling/config.py | 142 ++++++++---------- src/ydata_profiling/model/handler.py | 20 +-- src/ydata_profiling/model/summarizer.py | 31 ++-- src/ydata_profiling/profile_report.py | 6 +- .../report/structure/__init__.py | 22 +++ .../report/structure/report.py | 2 +- 7 files changed, 130 insertions(+), 114 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..7bb15bf5d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \ + pip install --no-cache-dir . && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \ + pip install --no-cache-dir jupyter + +EXPOSE 8888 + +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"] + + diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 09dbecdde..2bb934ed1 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -6,24 +6,7 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr - -def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: - """ - Recursive merge dictionaries. - - :param dict1: Base dictionary to merge. - :param dict2: Dictionary to merge on top of base dictionary. - :return: Merged dictionary - """ - for key, val in dict1.items(): - if isinstance(val, dict): - dict2_node = dict2.setdefault(key, {}) - _merge_dictionaries(val, dict2_node) - else: - if key not in dict2: - dict2[key] = val - - return dict2 +from ydata_profiling.utils.common import update class Dataset(BaseModel): @@ -355,60 +338,7 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() - def update(self, updates: dict) -> "Settings": - update = _merge_dictionaries(self.dict(), updates) - return self.parse_obj(self.copy(update=update)) - - @staticmethod - def from_file(config_file: Union[Path, str]) -> "Settings": - """Create a Settings object from a yaml file. - - Args: - config_file: yaml file path - Returns: - Settings - """ - with open(config_file) as f: - data = yaml.safe_load(f) - - return Settings.parse_obj(data) - - -class SparkSettings(Settings): - """ - Setting class with the standard report configuration for Spark DataFrames - All the supported analysis are set to true - """ - - vars: Univariate = Univariate() - - vars.num.low_categorical_threshold = 0 - - infer_dtypes: bool = False - - correlations: Dict[str, Correlation] = { - "spearman": Correlation(key="spearman", calculate=True), - "pearson": Correlation(key="pearson", calculate=True), - } - - correlation_table: bool = True - - interactions: Interactions = Interactions() - interactions.continuous = False - - missing_diagrams: Dict[str, bool] = { - "bar": False, - "matrix": False, - "dendrogram": False, - "heatmap": False, - } - samples: Samples = Samples() - samples.tail = 0 - samples.random = 0 - - -class Config: - arg_groups: Dict[str, Any] = { + _arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -475,8 +405,8 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: - kwargs = Config.arg_groups[key] - shorthand_args, _ = Config.shorthands(kwargs, split=False) + kwargs = Settings._arg_groups[key] + shorthand_args, _ = Settings.shorthands(kwargs, split=False) return shorthand_args @staticmethod @@ -485,8 +415,8 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in Config._shorthands: - shorthand_args[key] = Config._shorthands[key] + if value is None and key in Settings._shorthands: + shorthand_args[key] = Settings._shorthands[key] if split: del kwargs[key] @@ -494,3 +424,63 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} + + def update(self, updates: dict) -> "Settings": + merged = update(self.dict().copy(), updates) + return self.parse_obj(self.copy(update=merged)) + + @staticmethod + def from_file(config_file: Union[Path, str]) -> "Settings": + """Create a Settings object from a yaml file. + + Args: + config_file: yaml file path + Returns: + Settings + """ + with open(config_file) as f: + data = yaml.safe_load(f) + + return Settings.parse_obj(data) + + +class SparkSettings(Settings): + """ + Setting class with the standard report configuration for Spark DataFrames + All the supported analysis are set to true + """ + + vars: Univariate = Univariate() + + vars.num.low_categorical_threshold = 0 + + infer_dtypes: bool = False + + correlations: Dict[str, Correlation] = { + "spearman": Correlation(key="spearman", calculate=True), + "pearson": Correlation(key="pearson", calculate=True), + } + + correlation_table: bool = True + + interactions: Interactions = Interactions() + interactions.continuous = False + + missing_diagrams: Dict[str, bool] = { + "bar": False, + "matrix": False, + "dendrogram": False, + "heatmap": False, + } + samples: Samples = Samples() + samples.tail = 0 + samples.random = 0 + + +class Config(Settings): + """ + Deprecated: Use Settings instead. + Backward compatibility alias for Settings class. + """ + + pass diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..e983ce2a1 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,22 +60,6 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms +from ydata_profiling.report.structure import get_render_map - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map +__all__ = ["compose", "Handler", "get_render_map"] diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..a57ed1c97 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() @@ -65,7 +64,15 @@ def summary_map(self) -> Dict[str, List[Callable]]: return self._summary_map def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map for Pandas summarization.""" + """Creates the summary map based on the backend.""" + common_map = { + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], + } + if self.use_spark: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, @@ -78,7 +85,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: describe_text_1d_spark, ) - summary_map = { + base_map = { "Unsupported": [ describe_counts_spark, describe_generic_spark, @@ -89,14 +96,9 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [describe_text_1d_spark], "Categorical": [describe_categorical_1d_spark], "Boolean": [describe_boolean_1d_spark], - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], } else: - summary_map = { + base_map = { "Unsupported": [ pandas_describe_counts, pandas_describe_generic, @@ -107,13 +109,10 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [pandas_describe_text_1d], "Categorical": [pandas_describe_categorical_1d], "Boolean": [pandas_describe_boolean_1d], - "URL": [pandas_describe_url_1d], - "Path": [pandas_describe_path_1d], - "File": [pandas_describe_file_1d], - "Image": [pandas_describe_image_1d], - "TimeSeries": [pandas_describe_timeseries_1d], } - return summary_map + + base_map.update(common_map) + return base_map def format_summary(summary: Union[BaseDescription, dict]) -> dict: diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index a7d6d9134..916b4681e 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -25,7 +25,7 @@ from typeguard import typechecked from visions import VisionsTypeset -from ydata_profiling.config import Config, Settings, SparkSettings +from ydata_profiling.config import Settings, SparkSettings from ydata_profiling.expectations_report import ExpectationsReport from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType @@ -132,11 +132,11 @@ def __init__( cfg = Settings() for condition, key in groups: if condition: - cfg = cfg.update(Config.get_arg_groups(key)) + cfg = cfg.update(Settings.get_arg_groups(key)) report_config = report_config.update(cfg.dict(exclude_defaults=True)) if len(kwargs) > 0: - shorthands, kwargs = Config.shorthands(kwargs) + shorthands, kwargs = Settings.shorthands(kwargs) report_config = report_config.update( Settings().update(shorthands).dict(exclude_defaults=True) ) diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index 8324d248d..a2efd029a 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -1 +1,23 @@ """Data structure for the report""" +from typing import Callable, Dict + + +def get_render_map() -> Dict[str, Callable]: + import ydata_profiling.report.structure.variables as render_algorithms + + render_map = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 482b410b2..b64a41aae 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.model.handler import get_render_map +from ydata_profiling.report.structure import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, From 27a314be64b586f58de6a2956d456e2a3d03da1f Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 17:14:23 +0800 Subject: [PATCH 2/8] feat: initial release --- src/ydata_profiling/config.py | 142 +++++++++++++----------- src/ydata_profiling/model/handler.py | 2 - src/ydata_profiling/model/summarizer.py | 31 +++--- src/ydata_profiling/profile_report.py | 6 +- 4 files changed, 95 insertions(+), 86 deletions(-) diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 2bb934ed1..09dbecdde 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -6,7 +6,24 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr -from ydata_profiling.utils.common import update + +def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: + """ + Recursive merge dictionaries. + + :param dict1: Base dictionary to merge. + :param dict2: Dictionary to merge on top of base dictionary. + :return: Merged dictionary + """ + for key, val in dict1.items(): + if isinstance(val, dict): + dict2_node = dict2.setdefault(key, {}) + _merge_dictionaries(val, dict2_node) + else: + if key not in dict2: + dict2[key] = val + + return dict2 class Dataset(BaseModel): @@ -338,7 +355,60 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() - _arg_groups: Dict[str, Any] = { + def update(self, updates: dict) -> "Settings": + update = _merge_dictionaries(self.dict(), updates) + return self.parse_obj(self.copy(update=update)) + + @staticmethod + def from_file(config_file: Union[Path, str]) -> "Settings": + """Create a Settings object from a yaml file. + + Args: + config_file: yaml file path + Returns: + Settings + """ + with open(config_file) as f: + data = yaml.safe_load(f) + + return Settings.parse_obj(data) + + +class SparkSettings(Settings): + """ + Setting class with the standard report configuration for Spark DataFrames + All the supported analysis are set to true + """ + + vars: Univariate = Univariate() + + vars.num.low_categorical_threshold = 0 + + infer_dtypes: bool = False + + correlations: Dict[str, Correlation] = { + "spearman": Correlation(key="spearman", calculate=True), + "pearson": Correlation(key="pearson", calculate=True), + } + + correlation_table: bool = True + + interactions: Interactions = Interactions() + interactions.continuous = False + + missing_diagrams: Dict[str, bool] = { + "bar": False, + "matrix": False, + "dendrogram": False, + "heatmap": False, + } + samples: Samples = Samples() + samples.tail = 0 + samples.random = 0 + + +class Config: + arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -405,8 +475,8 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: - kwargs = Settings._arg_groups[key] - shorthand_args, _ = Settings.shorthands(kwargs, split=False) + kwargs = Config.arg_groups[key] + shorthand_args, _ = Config.shorthands(kwargs, split=False) return shorthand_args @staticmethod @@ -415,8 +485,8 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in Settings._shorthands: - shorthand_args[key] = Settings._shorthands[key] + if value is None and key in Config._shorthands: + shorthand_args[key] = Config._shorthands[key] if split: del kwargs[key] @@ -424,63 +494,3 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} - - def update(self, updates: dict) -> "Settings": - merged = update(self.dict().copy(), updates) - return self.parse_obj(self.copy(update=merged)) - - @staticmethod - def from_file(config_file: Union[Path, str]) -> "Settings": - """Create a Settings object from a yaml file. - - Args: - config_file: yaml file path - Returns: - Settings - """ - with open(config_file) as f: - data = yaml.safe_load(f) - - return Settings.parse_obj(data) - - -class SparkSettings(Settings): - """ - Setting class with the standard report configuration for Spark DataFrames - All the supported analysis are set to true - """ - - vars: Univariate = Univariate() - - vars.num.low_categorical_threshold = 0 - - infer_dtypes: bool = False - - correlations: Dict[str, Correlation] = { - "spearman": Correlation(key="spearman", calculate=True), - "pearson": Correlation(key="pearson", calculate=True), - } - - correlation_table: bool = True - - interactions: Interactions = Interactions() - interactions.continuous = False - - missing_diagrams: Dict[str, bool] = { - "bar": False, - "matrix": False, - "dendrogram": False, - "heatmap": False, - } - samples: Samples = Samples() - samples.tail = 0 - samples.random = 0 - - -class Config(Settings): - """ - Deprecated: Use Settings instead. - Backward compatibility alias for Settings class. - """ - - pass diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index e983ce2a1..bcca12a1c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,6 +60,4 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary -from ydata_profiling.report.structure import get_render_map -__all__ = ["compose", "Handler", "get_render_map"] diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index a57ed1c97..d733a7d36 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,8 +50,9 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) +# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer supporting both Pandas and Spark DataFrames.""" + """A summarizer for Pandas DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() @@ -64,15 +65,7 @@ def summary_map(self) -> Dict[str, List[Callable]]: return self._summary_map def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map based on the backend.""" - common_map = { - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], - } - + """Creates the summary map for Pandas summarization.""" if self.use_spark: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, @@ -85,7 +78,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: describe_text_1d_spark, ) - base_map = { + summary_map = { "Unsupported": [ describe_counts_spark, describe_generic_spark, @@ -96,9 +89,14 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [describe_text_1d_spark], "Categorical": [describe_categorical_1d_spark], "Boolean": [describe_boolean_1d_spark], + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], } else: - base_map = { + summary_map = { "Unsupported": [ pandas_describe_counts, pandas_describe_generic, @@ -109,10 +107,13 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [pandas_describe_text_1d], "Categorical": [pandas_describe_categorical_1d], "Boolean": [pandas_describe_boolean_1d], + "URL": [pandas_describe_url_1d], + "Path": [pandas_describe_path_1d], + "File": [pandas_describe_file_1d], + "Image": [pandas_describe_image_1d], + "TimeSeries": [pandas_describe_timeseries_1d], } - - base_map.update(common_map) - return base_map + return summary_map def format_summary(summary: Union[BaseDescription, dict]) -> dict: diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index 916b4681e..a7d6d9134 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -25,7 +25,7 @@ from typeguard import typechecked from visions import VisionsTypeset -from ydata_profiling.config import Settings, SparkSettings +from ydata_profiling.config import Config, Settings, SparkSettings from ydata_profiling.expectations_report import ExpectationsReport from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType @@ -132,11 +132,11 @@ def __init__( cfg = Settings() for condition, key in groups: if condition: - cfg = cfg.update(Settings.get_arg_groups(key)) + cfg = cfg.update(Config.get_arg_groups(key)) report_config = report_config.update(cfg.dict(exclude_defaults=True)) if len(kwargs) > 0: - shorthands, kwargs = Settings.shorthands(kwargs) + shorthands, kwargs = Config.shorthands(kwargs) report_config = report_config.update( Settings().update(shorthands).dict(exclude_defaults=True) ) From 8d8f6b71b5f46178749d0b100ba9bc8cefbfb261 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 17:39:22 +0800 Subject: [PATCH 3/8] feat: initial release --- src/ydata_profiling/model/handler.py | 123 +++++++++--------- .../report/structure/__init__.py | 8 ++ src/ydata_profiling/utils/backend.py | 2 +- 3 files changed, 69 insertions(+), 64 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index bcca12a1c..aa36a811c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,63 +1,60 @@ -""" - Auxiliary handler methods for data summary extraction -""" -from typing import Any, Callable, Dict, List, Sequence - -import networkx as nx -from visions import VisionsTypeset - - -def compose(functions: Sequence[Callable]) -> Callable: - """ - Compose a sequence of functions. - - :param functions: sequence of functions - :return: combined function applying all functions in order. - """ - - def composed_function(*args) -> List[Any]: - result = args # Start with the input arguments - for func in functions: - result = func(*result) if isinstance(result, tuple) else func(result) - return result # type: ignore - - return composed_function # type: ignore - - -class Handler: - """A generic handler - - Allows any custom mapping between data types and functions - """ - - def __init__( - self, - mapping: Dict[str, List[Callable]], - typeset: VisionsTypeset, - *args, - **kwargs - ): - self.mapping = mapping - self.typeset = typeset - self._complete_dag() - - def _complete_dag(self) -> None: - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[str(to_type)] = ( - self.mapping[str(from_type)] + self.mapping[str(to_type)] - ) - - def handle(self, dtype: str, *args, **kwargs) -> dict: - """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted - """ - funcs = self.mapping.get(dtype, []) - op = compose(funcs) - summary = op(*args)[-1] - return summary - - - +""" + Auxiliary handler methods for data summary extraction +""" +from typing import Any, Callable, Dict, List, Sequence + +import networkx as nx +from visions import VisionsTypeset + + +def compose(functions: Sequence[Callable]) -> Callable: + """ + Compose a sequence of functions. + + :param functions: sequence of functions + :return: combined function applying all functions in order. + """ + + def composed_function(*args) -> List[Any]: + result = args # Start with the input arguments + for func in functions: + result = func(*result) if isinstance(result, tuple) else func(result) + return result # type: ignore + + return composed_function # type: ignore + + +class Handler: + """A generic handler + + Allows any custom mapping between data types and functions + """ + + def __init__( + self, + mapping: Dict[str, List[Callable]], + typeset: VisionsTypeset, + *args, + **kwargs + ): + self.mapping = mapping + self.typeset = typeset + self._complete_dag() + + def _complete_dag(self) -> None: + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + self.mapping[str(to_type)] = ( + self.mapping[str(from_type)] + self.mapping[str(to_type)] + ) + + def handle(self, dtype: str, *args, **kwargs) -> dict: + """ + Returns: + object: a tuple containing the config, the dataset series and the summary extracted + """ + funcs = self.mapping.get(dtype, []) + op = compose(funcs) + summary = op(*args)[-1] + return summary diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index a2efd029a..7ba9c10c9 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -3,6 +3,14 @@ def get_render_map() -> Dict[str, Callable]: + """Get the mapping of variable types to their render functions. + + This function was moved from model.handler to report.structure to eliminate + the reverse dependency from model layer to report layer. + + Returns: + Dictionary mapping type names to render functions. + """ import ydata_profiling.report.structure.variables as render_algorithms render_map = { diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index e99d91c11..dd12f9fd3 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,5 +1,5 @@ """ - File with a function to check the backend being used +Backend detection utilities for pandas and spark. """ import importlib From 307cba98bfab9196a8de5355022f3919539e4520 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 18:09:37 +0800 Subject: [PATCH 4/8] feat: initial release --- src/ydata_profiling/model/handler.py | 123 +++++++++--------- src/ydata_profiling/model/summarizer.py | 5 +- .../report/structure/__init__.py | 30 ----- .../report/structure/report.py | 2 +- .../report/structure/variables/__init__.py | 23 ++++ src/ydata_profiling/utils/backend.py | 2 +- 6 files changed, 90 insertions(+), 95 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index aa36a811c..bcca12a1c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,60 +1,63 @@ -""" - Auxiliary handler methods for data summary extraction -""" -from typing import Any, Callable, Dict, List, Sequence - -import networkx as nx -from visions import VisionsTypeset - - -def compose(functions: Sequence[Callable]) -> Callable: - """ - Compose a sequence of functions. - - :param functions: sequence of functions - :return: combined function applying all functions in order. - """ - - def composed_function(*args) -> List[Any]: - result = args # Start with the input arguments - for func in functions: - result = func(*result) if isinstance(result, tuple) else func(result) - return result # type: ignore - - return composed_function # type: ignore - - -class Handler: - """A generic handler - - Allows any custom mapping between data types and functions - """ - - def __init__( - self, - mapping: Dict[str, List[Callable]], - typeset: VisionsTypeset, - *args, - **kwargs - ): - self.mapping = mapping - self.typeset = typeset - self._complete_dag() - - def _complete_dag(self) -> None: - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[str(to_type)] = ( - self.mapping[str(from_type)] + self.mapping[str(to_type)] - ) - - def handle(self, dtype: str, *args, **kwargs) -> dict: - """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted - """ - funcs = self.mapping.get(dtype, []) - op = compose(funcs) - summary = op(*args)[-1] - return summary +""" + Auxiliary handler methods for data summary extraction +""" +from typing import Any, Callable, Dict, List, Sequence + +import networkx as nx +from visions import VisionsTypeset + + +def compose(functions: Sequence[Callable]) -> Callable: + """ + Compose a sequence of functions. + + :param functions: sequence of functions + :return: combined function applying all functions in order. + """ + + def composed_function(*args) -> List[Any]: + result = args # Start with the input arguments + for func in functions: + result = func(*result) if isinstance(result, tuple) else func(result) + return result # type: ignore + + return composed_function # type: ignore + + +class Handler: + """A generic handler + + Allows any custom mapping between data types and functions + """ + + def __init__( + self, + mapping: Dict[str, List[Callable]], + typeset: VisionsTypeset, + *args, + **kwargs + ): + self.mapping = mapping + self.typeset = typeset + self._complete_dag() + + def _complete_dag(self) -> None: + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + self.mapping[str(to_type)] = ( + self.mapping[str(from_type)] + self.mapping[str(to_type)] + ) + + def handle(self, dtype: str, *args, **kwargs) -> dict: + """ + Returns: + object: a tuple containing the config, the dataset series and the summary extracted + """ + funcs = self.mapping.get(dtype, []) + op = compose(funcs) + summary = op(*args)[-1] + return summary + + + diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..54d839915 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -27,7 +27,7 @@ from ydata_profiling.model.pandas.describe_supported_pandas import ( pandas_describe_supported, ) -from ydata_profiling.model.summary_algorithms import ( # Check what is this method used for +from ydata_profiling.model.summary_algorithms import ( describe_file_1d, describe_image_1d, describe_path_1d, @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index 7ba9c10c9..8324d248d 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -1,31 +1 @@ """Data structure for the report""" -from typing import Callable, Dict - - -def get_render_map() -> Dict[str, Callable]: - """Get the mapping of variable types to their render functions. - - This function was moved from model.handler to report.structure to eliminate - the reverse dependency from model layer to report layer. - - Returns: - Dictionary mapping type names to render functions. - """ - import ydata_profiling.report.structure.variables as render_algorithms - - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index b64a41aae..0f027f23f 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.report.structure import get_render_map +from ydata_profiling.report.structure.variables import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py index 64f1d6d54..a8aa301b5 100644 --- a/src/ydata_profiling/report/structure/variables/__init__.py +++ b/src/ydata_profiling/report/structure/variables/__init__.py @@ -1,3 +1,5 @@ +from typing import Callable, Dict + from ydata_profiling.report.structure.variables.render_boolean import render_boolean from ydata_profiling.report.structure.variables.render_categorical import ( render_categorical, @@ -17,6 +19,26 @@ ) from ydata_profiling.report.structure.variables.render_url import render_url + +def get_render_map() -> Dict[str, Callable]: + render_map = { + "Boolean": render_boolean, + "Numeric": render_real, + "Complex": render_complex, + "Text": render_text, + "DateTime": render_date, + "Categorical": render_categorical, + "URL": render_url, + "Path": render_path, + "File": render_file, + "Image": render_image, + "Unsupported": render_generic, + "TimeSeries": render_timeseries, + } + + return render_map + + __all__ = [ "render_boolean", "render_categorical", @@ -32,4 +54,5 @@ "render_text", "render_timeseries", "render_url", + "get_render_map", ] diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index dd12f9fd3..e99d91c11 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,5 +1,5 @@ """ -Backend detection utilities for pandas and spark. + File with a function to check the backend being used """ import importlib From 1e2fa10eaf7a951acea663fa270784244ac18404 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 19:27:37 +0800 Subject: [PATCH 5/8] feat: initial release --- src/ydata_profiling/model/handler.py | 18 +++++++++++++++ src/ydata_profiling/model/summarizer.py | 5 ++-- .../report/structure/report.py | 2 +- .../report/structure/variables/__init__.py | 23 ------------------- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index bcca12a1c..992c1840c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,4 +60,22 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary +def get_render_map() -> Dict[str, Callable]: + import ydata_profiling.report.structure.variables as render_algorithms + render_map = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index 54d839915..d733a7d36 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -27,7 +27,7 @@ from ydata_profiling.model.pandas.describe_supported_pandas import ( pandas_describe_supported, ) -from ydata_profiling.model.summary_algorithms import ( +from ydata_profiling.model.summary_algorithms import ( # Check what is this method used for describe_file_1d, describe_image_1d, describe_path_1d, @@ -50,8 +50,9 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) +# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer supporting both Pandas and Spark DataFrames.""" + """A summarizer for Pandas DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 0f027f23f..482b410b2 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.report.structure.variables import get_render_map +from ydata_profiling.model.handler import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py index a8aa301b5..64f1d6d54 100644 --- a/src/ydata_profiling/report/structure/variables/__init__.py +++ b/src/ydata_profiling/report/structure/variables/__init__.py @@ -1,5 +1,3 @@ -from typing import Callable, Dict - from ydata_profiling.report.structure.variables.render_boolean import render_boolean from ydata_profiling.report.structure.variables.render_categorical import ( render_categorical, @@ -19,26 +17,6 @@ ) from ydata_profiling.report.structure.variables.render_url import render_url - -def get_render_map() -> Dict[str, Callable]: - render_map = { - "Boolean": render_boolean, - "Numeric": render_real, - "Complex": render_complex, - "Text": render_text, - "DateTime": render_date, - "Categorical": render_categorical, - "URL": render_url, - "Path": render_path, - "File": render_file, - "Image": render_image, - "Unsupported": render_generic, - "TimeSeries": render_timeseries, - } - - return render_map - - __all__ = [ "render_boolean", "render_categorical", @@ -54,5 +32,4 @@ def get_render_map() -> Dict[str, Callable]: "render_text", "render_timeseries", "render_url", - "get_render_map", ] From 3f158155243784086e17dd0d59e60dc086dc0844 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 20:03:28 +0800 Subject: [PATCH 6/8] feat: initial release --- src/ydata_profiling/model/alerts.py | 18 +++++++++--------- .../model/summary_algorithms.py | 15 --------------- .../structure/variables/render_common.py | 1 - 3 files changed, 9 insertions(+), 25 deletions(-) diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 1b16d27a0..611b5de85 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -12,8 +12,8 @@ from ydata_profiling.utils.styles import get_alert_styles -def fmt_percent(value: float, edge_cases: bool = True) -> str: - """Format a ratio as a percentage. +def _fmt_percent(value: float, edge_cases: bool = True) -> str: + """Format a ratio as a percentage (internal copy to avoid circular imports). Args: edge_cases: Check for edge cases? @@ -209,7 +209,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows" + return f"Dataset has {self.values['n_duplicates']} ({_fmt_percent(self.values['p_duplicates'])}) duplicate rows" else: return "Dataset has no duplicated rows" @@ -231,7 +231,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"Dataset has {self.values['n_near_dups']} ({fmt_percent(self.values['p_near_dups'])}) near duplicate rows" + return f"Dataset has {self.values['n_near_dups']} ({_fmt_percent(self.values['p_near_dups'])}) near duplicate rows" else: return "Dataset has no near duplicated rows" @@ -272,7 +272,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_distinct']:} ({fmt_percent(self.values['p_distinct'])}) distinct values" + return f"[{self.column_name}] has {self.values['n_distinct']:} ({_fmt_percent(self.values['p_distinct'])}) distinct values" else: return f"[{self.column_name}] has a high cardinality" @@ -294,7 +294,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category" + return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {_fmt_percent(self.values['p_fuzzy_vals'])} per category" else: return f"[{self.column_name}] no dirty categories values." @@ -365,7 +365,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_infinite']} ({fmt_percent(self.values['p_infinite'])}) infinite values" + return f"[{self.column_name}] has {self.values['n_infinite']} ({_fmt_percent(self.values['p_infinite'])}) infinite values" else: return f"[{self.column_name}] has infinite values" @@ -387,7 +387,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] {self.values['n_missing']} ({fmt_percent(self.values['p_missing'])}) missing values" + return f"[{self.column_name}] {self.values['n_missing']} ({_fmt_percent(self.values['p_missing'])}) missing values" else: return f"[{self.column_name}] has missing values" @@ -541,7 +541,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_zeros']} ({fmt_percent(self.values['p_zeros'])}) zeros" + return f"[{self.column_name}] has {self.values['n_zeros']} ({_fmt_percent(self.values['p_zeros'])}) zeros" else: return f"[{self.column_name}] has predominantly zeros" diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index 9c3e5ef38..49569605b 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -11,21 +11,6 @@ T = TypeVar("T") -def func_nullable_series_contains(fn: Callable) -> Callable: - @functools.wraps(fn) - def inner( - config: Settings, series: pd.Series, state: dict, *args, **kwargs - ) -> bool: - if series.hasnans: - series = series.dropna() - if series.empty: - return False - - return fn(config, series, state, *args, **kwargs) - - return inner - - def safe_histogram( values: np.ndarray, bins: Union[int, str, np.ndarray] = "auto", diff --git a/src/ydata_profiling/report/structure/variables/render_common.py b/src/ydata_profiling/report/structure/variables/render_common.py index aef8de357..e90935640 100644 --- a/src/ydata_profiling/report/structure/variables/render_common.py +++ b/src/ydata_profiling/report/structure/variables/render_common.py @@ -10,7 +10,6 @@ def render_common(config: Settings, summary: dict) -> dict: n_freq_table_max = config.n_freq_table_max template_variables = { - # TODO: with nan "freq_table_rows": freq_table( freqtable=summary["value_counts_without_nan"], n=summary["n"], From d83e1a17d23c4d1bdee001b326fdaac1a4707548 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 20:42:38 +0800 Subject: [PATCH 7/8] feat: initial release --- src/ydata_profiling/model/alerts.py | 18 ++++----- src/ydata_profiling/model/correlations.py | 28 +++---------- src/ydata_profiling/model/missing.py | 25 +++--------- .../model/pandas/table_pandas.py | 28 ++----------- .../model/spark/table_spark.py | 39 ++----------------- .../model/spark/timeseries_index_spark.py | 2 +- src/ydata_profiling/model/summarizer.py | 3 +- .../model/summary_algorithms.py | 15 +++++++ src/ydata_profiling/model/table.py | 37 ++++++++++++++++++ .../structure/variables/render_common.py | 1 + src/ydata_profiling/utils/backend.py | 34 +++++++++++++++- 11 files changed, 115 insertions(+), 115 deletions(-) diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 611b5de85..1b16d27a0 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -12,8 +12,8 @@ from ydata_profiling.utils.styles import get_alert_styles -def _fmt_percent(value: float, edge_cases: bool = True) -> str: - """Format a ratio as a percentage (internal copy to avoid circular imports). +def fmt_percent(value: float, edge_cases: bool = True) -> str: + """Format a ratio as a percentage. Args: edge_cases: Check for edge cases? @@ -209,7 +209,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"Dataset has {self.values['n_duplicates']} ({_fmt_percent(self.values['p_duplicates'])}) duplicate rows" + return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows" else: return "Dataset has no duplicated rows" @@ -231,7 +231,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"Dataset has {self.values['n_near_dups']} ({_fmt_percent(self.values['p_near_dups'])}) near duplicate rows" + return f"Dataset has {self.values['n_near_dups']} ({fmt_percent(self.values['p_near_dups'])}) near duplicate rows" else: return "Dataset has no near duplicated rows" @@ -272,7 +272,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_distinct']:} ({_fmt_percent(self.values['p_distinct'])}) distinct values" + return f"[{self.column_name}] has {self.values['n_distinct']:} ({fmt_percent(self.values['p_distinct'])}) distinct values" else: return f"[{self.column_name}] has a high cardinality" @@ -294,7 +294,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {_fmt_percent(self.values['p_fuzzy_vals'])} per category" + return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category" else: return f"[{self.column_name}] no dirty categories values." @@ -365,7 +365,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_infinite']} ({_fmt_percent(self.values['p_infinite'])}) infinite values" + return f"[{self.column_name}] has {self.values['n_infinite']} ({fmt_percent(self.values['p_infinite'])}) infinite values" else: return f"[{self.column_name}] has infinite values" @@ -387,7 +387,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] {self.values['n_missing']} ({_fmt_percent(self.values['p_missing'])}) missing values" + return f"[{self.column_name}] {self.values['n_missing']} ({fmt_percent(self.values['p_missing'])}) missing values" else: return f"[{self.column_name}] has missing values" @@ -541,7 +541,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_zeros']} ({_fmt_percent(self.values['p_zeros'])}) zeros" + return f"[{self.column_name}] has {self.values['n_zeros']} ({fmt_percent(self.values['p_zeros'])}) zeros" else: return f"[{self.column_name}] has predominantly zeros" diff --git a/src/ydata_profiling/model/correlations.py b/src/ydata_profiling/model/correlations.py index 2bbaa1112..25e2e13c4 100644 --- a/src/ydata_profiling/model/correlations.py +++ b/src/ydata_profiling/model/correlations.py @@ -3,12 +3,13 @@ """Correlations between variables.""" import warnings -from typing import Dict, List, Optional, Sized, no_type_check +from typing import Dict, List, Optional, Sized import numpy as np import pandas as pd from ydata_profiling.config import Settings +from ydata_profiling.utils.backend import BaseBackend try: from pandas.core.base import DataError @@ -16,30 +17,11 @@ from pandas.errors import DataError -class CorrelationBackend: +class CorrelationBackend(BaseBackend): """Helper class to select and cache the appropriate correlation backend (Pandas or Spark).""" - @no_type_check - def __init__(self, df: Sized): - """Determine backend once and store it for all correlation computations.""" - if isinstance(df, pd.DataFrame): - from ydata_profiling.model.pandas import ( - correlations_pandas as correlation_backend, # type: ignore - ) - else: - from ydata_profiling.model.spark import ( - correlations_spark as correlation_backend, # type: ignore - ) - - self.backend = correlation_backend - - def get_method(self, method_name: str): # noqa: ANN201 - """Retrieve the appropriate correlation method class from the backend.""" - if hasattr(self.backend, method_name): - return getattr(self.backend, method_name) - raise AttributeError( - f"Correlation method '{method_name}' is not available in the backend." - ) + _pandas_module = "ydata_profiling.model.pandas.correlations_pandas" + _spark_module = "ydata_profiling.model.spark.correlations_spark" class Correlation: diff --git a/src/ydata_profiling/model/missing.py b/src/ydata_profiling/model/missing.py index 46ec2dee3..aa14cc425 100644 --- a/src/ydata_profiling/model/missing.py +++ b/src/ydata_profiling/model/missing.py @@ -1,32 +1,17 @@ -import importlib import warnings -from typing import Any, Callable, Dict, Optional, Sized +from typing import Any, Dict, Optional, Sized import pandas as pd from ydata_profiling.config import Settings +from ydata_profiling.utils.backend import BaseBackend -class MissingDataBackend: +class MissingDataBackend(BaseBackend): """Helper class to select and cache the appropriate missing-data backend (Pandas or Spark).""" - def __init__(self, df: Sized): - """Determine backend once and store it for all missing-data computations.""" - if isinstance(df, pd.DataFrame): - self.backend_module = "ydata_profiling.model.pandas.missing_pandas" - else: - self.backend_module = "ydata_profiling.model.spark.missing_spark" - - self.module = importlib.import_module(self.backend_module) - - def get_method(self, method_name: str) -> Callable: - """Retrieve the appropriate missing-data function from the backend module.""" - try: - return getattr(self.module, method_name) - except AttributeError as ex: - raise AttributeError( - f"Missing-data function '{method_name}' is not available in {self.backend_module}." - ) from ex + _pandas_module = "ydata_profiling.model.pandas.missing_pandas" + _spark_module = "ydata_profiling.model.spark.missing_spark" class MissingData: diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py index a919ee33b..28c79f849 100644 --- a/src/ydata_profiling/model/pandas/table_pandas.py +++ b/src/ydata_profiling/model/pandas/table_pandas.py @@ -1,9 +1,7 @@ -from collections import Counter - import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.model.table import get_table_stats +from ydata_profiling.model.table import compute_common_table_stats, get_table_stats @get_table_stats.register @@ -21,36 +19,18 @@ def pandas_get_table_stats( A dictionary that contains the table statistics. """ n = len(df) if not df.empty else 0 + n_var = len(df.columns) memory_size = df.memory_usage(deep=config.memory_deep).sum() record_size = float(memory_size) / n if n > 0 else 0 table_stats = { "n": n, - "n_var": len(df.columns), + "n_var": n_var, "memory_size": memory_size, "record_size": record_size, - "n_cells_missing": 0, - "n_vars_with_missing": 0, - "n_vars_all_missing": 0, } - for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: - table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: - table_stats["n_vars_all_missing"] += 1 - - table_stats["p_cells_missing"] = ( - table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"]) - if table_stats["n"] > 0 and table_stats["n_var"] > 0 - else 0 - ) - - # Variable type counts - table_stats.update( - {"types": dict(Counter([v["type"] for v in variable_stats.values()]))} - ) + table_stats.update(compute_common_table_stats(n, n_var, variable_stats)) return table_stats diff --git a/src/ydata_profiling/model/spark/table_spark.py b/src/ydata_profiling/model/spark/table_spark.py index 33e862e61..2a2985059 100644 --- a/src/ydata_profiling/model/spark/table_spark.py +++ b/src/ydata_profiling/model/spark/table_spark.py @@ -1,9 +1,7 @@ -from collections import Counter - from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.table import get_table_stats +from ydata_profiling.model.table import compute_common_table_stats, get_table_stats @get_table_stats.register @@ -21,38 +19,9 @@ def get_table_stats_spark( A dictionary that contains the table statistics. """ n = df.count() + n_var = len(df.columns) - result = {"n": n, "n_var": len(df.columns)} - - table_stats = { - "n_cells_missing": 0, - "n_vars_with_missing": 0, - "n_vars_all_missing": 0, - } - - for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: - table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: - table_stats["n_vars_all_missing"] += 1 - - # without this check we'll get a div by zero error - if result["n"] * result["n_var"] > 0: - table_stats["p_cells_missing"] = ( - table_stats["n_cells_missing"] / (result["n"] * result["n_var"]) - if result["n"] > 0 - else 0 - ) - else: - table_stats["p_cells_missing"] = 0 - - result["p_cells_missing"] = table_stats["p_cells_missing"] - result["n_cells_missing"] = table_stats["n_cells_missing"] - result["n_vars_all_missing"] = table_stats["n_vars_all_missing"] - result["n_vars_with_missing"] = table_stats["n_vars_with_missing"] - - # Variable type counts - result["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) + result = {"n": n, "n_var": n_var} + result.update(compute_common_table_stats(n, n_var, variable_stats)) return result diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py index e8145d76c..a31f25ccf 100644 --- a/src/ydata_profiling/model/spark/timeseries_index_spark.py +++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py @@ -4,7 +4,7 @@ from ydata_profiling.config import Settings -def spark_get_time_index_description_spark( +def get_time_index_description_spark( config: Settings, df: DataFrame, table_stats: dict, diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..41b8d6f88 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index 49569605b..9c3e5ef38 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -11,6 +11,21 @@ T = TypeVar("T") +def func_nullable_series_contains(fn: Callable) -> Callable: + @functools.wraps(fn) + def inner( + config: Settings, series: pd.Series, state: dict, *args, **kwargs + ) -> bool: + if series.hasnans: + series = series.dropna() + if series.empty: + return False + + return fn(config, series, state, *args, **kwargs) + + return inner + + def safe_histogram( values: np.ndarray, bins: Union[int, str, np.ndarray] = "auto", diff --git a/src/ydata_profiling/model/table.py b/src/ydata_profiling/model/table.py index e5eb6fdc2..6f5c7305d 100644 --- a/src/ydata_profiling/model/table.py +++ b/src/ydata_profiling/model/table.py @@ -1,3 +1,4 @@ +from collections import Counter from typing import Any from multimethod import multimethod @@ -5,6 +6,42 @@ from ydata_profiling.config import Settings +def compute_common_table_stats( + n: int, n_var: int, variable_stats: dict +) -> dict: + """Compute common table statistics shared by Pandas and Spark backends. + + Args: + n: Number of rows in the DataFrame + n_var: Number of columns (variables) + variable_stats: Previously calculated statistic on the DataFrame series + + Returns: + A dictionary with common table statistics: missing values counts, percentages, and type counts + """ + table_stats = { + "n_cells_missing": 0, + "n_vars_with_missing": 0, + "n_vars_all_missing": 0, + } + + for series_summary in variable_stats.values(): + if "n_missing" in series_summary and series_summary["n_missing"] > 0: + table_stats["n_vars_with_missing"] += 1 + table_stats["n_cells_missing"] += series_summary["n_missing"] + if series_summary["n_missing"] == n: + table_stats["n_vars_all_missing"] += 1 + + total_cells = n * n_var + table_stats["p_cells_missing"] = ( + table_stats["n_cells_missing"] / total_cells if total_cells > 0 else 0 + ) + + table_stats["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) + + return table_stats + + @multimethod def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict: raise NotImplementedError() diff --git a/src/ydata_profiling/report/structure/variables/render_common.py b/src/ydata_profiling/report/structure/variables/render_common.py index e90935640..aef8de357 100644 --- a/src/ydata_profiling/report/structure/variables/render_common.py +++ b/src/ydata_profiling/report/structure/variables/render_common.py @@ -10,6 +10,7 @@ def render_common(config: Settings, summary: dict) -> dict: n_freq_table_max = config.n_freq_table_max template_variables = { + # TODO: with nan "freq_table_rows": freq_table( freqtable=summary["value_counts_without_nan"], n=summary["n"], diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index e99d91c11..1cee2aea8 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,9 +1,41 @@ """ - File with a function to check the backend being used + File with backend utilities and helper functions to check the backend being used """ import importlib +from typing import Callable, Optional, Sized, Union + +import pandas as pd def is_pyspark_installed() -> bool: """Check if PySpark is installed without importing it.""" return importlib.util.find_spec("pyspark") is not None + + +class BaseBackend: + """Base helper class to select and cache the appropriate backend (Pandas or Spark).""" + + _pandas_module: Optional[str] = None + _spark_module: Optional[str] = None + + def __init__(self, df: Union[pd.DataFrame, Sized]): + """Determine backend once and store it for all computations.""" + if isinstance(df, pd.DataFrame): + module_path = self._pandas_module + else: + module_path = self._spark_module + + if module_path is None: + raise ValueError("Backend module path not configured") + + self.module = importlib.import_module(module_path) + self.module_path = module_path + + def get_method(self, method_name: str) -> Callable: + """Retrieve the appropriate function from the backend module.""" + try: + return getattr(self.module, method_name) + except AttributeError as ex: + raise AttributeError( + f"Function '{method_name}' is not available in {self.module_path}." + ) from ex From a1892a275016c5f6bc9eba06e1fdd88bbbbf5379 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 21:10:03 +0800 Subject: [PATCH 8/8] feat: initial release --- src/ydata_profiling/model/correlations.py | 28 +++++++++++--- src/ydata_profiling/model/missing.py | 25 ++++++++++--- .../model/pandas/table_pandas.py | 27 ++++++++++++-- .../model/spark/describe_boolean_spark.py | 2 + .../model/spark/describe_date_spark.py | 2 + .../model/spark/describe_generic_spark.py | 2 + .../model/spark/describe_numeric_spark.py | 12 +++--- .../model/spark/describe_text_spark.py | 2 + .../model/spark/table_spark.py | 37 +++++++++++++++++-- .../model/spark/timeseries_index_spark.py | 2 + src/ydata_profiling/model/summarizer.py | 3 +- src/ydata_profiling/model/table.py | 37 ------------------- src/ydata_profiling/utils/backend.py | 34 +---------------- 13 files changed, 117 insertions(+), 96 deletions(-) diff --git a/src/ydata_profiling/model/correlations.py b/src/ydata_profiling/model/correlations.py index 25e2e13c4..2bbaa1112 100644 --- a/src/ydata_profiling/model/correlations.py +++ b/src/ydata_profiling/model/correlations.py @@ -3,13 +3,12 @@ """Correlations between variables.""" import warnings -from typing import Dict, List, Optional, Sized +from typing import Dict, List, Optional, Sized, no_type_check import numpy as np import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.utils.backend import BaseBackend try: from pandas.core.base import DataError @@ -17,11 +16,30 @@ from pandas.errors import DataError -class CorrelationBackend(BaseBackend): +class CorrelationBackend: """Helper class to select and cache the appropriate correlation backend (Pandas or Spark).""" - _pandas_module = "ydata_profiling.model.pandas.correlations_pandas" - _spark_module = "ydata_profiling.model.spark.correlations_spark" + @no_type_check + def __init__(self, df: Sized): + """Determine backend once and store it for all correlation computations.""" + if isinstance(df, pd.DataFrame): + from ydata_profiling.model.pandas import ( + correlations_pandas as correlation_backend, # type: ignore + ) + else: + from ydata_profiling.model.spark import ( + correlations_spark as correlation_backend, # type: ignore + ) + + self.backend = correlation_backend + + def get_method(self, method_name: str): # noqa: ANN201 + """Retrieve the appropriate correlation method class from the backend.""" + if hasattr(self.backend, method_name): + return getattr(self.backend, method_name) + raise AttributeError( + f"Correlation method '{method_name}' is not available in the backend." + ) class Correlation: diff --git a/src/ydata_profiling/model/missing.py b/src/ydata_profiling/model/missing.py index aa14cc425..46ec2dee3 100644 --- a/src/ydata_profiling/model/missing.py +++ b/src/ydata_profiling/model/missing.py @@ -1,17 +1,32 @@ +import importlib import warnings -from typing import Any, Dict, Optional, Sized +from typing import Any, Callable, Dict, Optional, Sized import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.utils.backend import BaseBackend -class MissingDataBackend(BaseBackend): +class MissingDataBackend: """Helper class to select and cache the appropriate missing-data backend (Pandas or Spark).""" - _pandas_module = "ydata_profiling.model.pandas.missing_pandas" - _spark_module = "ydata_profiling.model.spark.missing_spark" + def __init__(self, df: Sized): + """Determine backend once and store it for all missing-data computations.""" + if isinstance(df, pd.DataFrame): + self.backend_module = "ydata_profiling.model.pandas.missing_pandas" + else: + self.backend_module = "ydata_profiling.model.spark.missing_spark" + + self.module = importlib.import_module(self.backend_module) + + def get_method(self, method_name: str) -> Callable: + """Retrieve the appropriate missing-data function from the backend module.""" + try: + return getattr(self.module, method_name) + except AttributeError as ex: + raise AttributeError( + f"Missing-data function '{method_name}' is not available in {self.backend_module}." + ) from ex class MissingData: diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py index 28c79f849..546b369ef 100644 --- a/src/ydata_profiling/model/pandas/table_pandas.py +++ b/src/ydata_profiling/model/pandas/table_pandas.py @@ -1,7 +1,9 @@ +from collections import Counter + import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.model.table import compute_common_table_stats, get_table_stats +from ydata_profiling.model.table import get_table_stats @get_table_stats.register @@ -19,18 +21,35 @@ def pandas_get_table_stats( A dictionary that contains the table statistics. """ n = len(df) if not df.empty else 0 - n_var = len(df.columns) memory_size = df.memory_usage(deep=config.memory_deep).sum() record_size = float(memory_size) / n if n > 0 else 0 table_stats = { "n": n, - "n_var": n_var, + "n_var": len(df.columns), "memory_size": memory_size, "record_size": record_size, + "n_cells_missing": 0, + "n_vars_with_missing": 0, + "n_vars_all_missing": 0, } - table_stats.update(compute_common_table_stats(n, n_var, variable_stats)) + for series_summary in variable_stats.values(): + if "n_missing" in series_summary and series_summary["n_missing"] > 0: + table_stats["n_vars_with_missing"] += 1 + table_stats["n_cells_missing"] += series_summary["n_missing"] + if series_summary["n_missing"] == n: + table_stats["n_vars_all_missing"] += 1 + + table_stats["p_cells_missing"] = ( + table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"]) + if table_stats["n"] > 0 and table_stats["n_var"] > 0 + else 0 + ) + + table_stats.update( + {"types": dict(Counter([v["type"] for v in variable_stats.values()]))} + ) return table_stats diff --git a/src/ydata_profiling/model/spark/describe_boolean_spark.py b/src/ydata_profiling/model/spark/describe_boolean_spark.py index 148dbce6c..ab5cf20fb 100644 --- a/src/ydata_profiling/model/spark/describe_boolean_spark.py +++ b/src/ydata_profiling/model/spark/describe_boolean_spark.py @@ -3,8 +3,10 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_boolean_1d +@describe_boolean_1d.register def describe_boolean_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_date_spark.py b/src/ydata_profiling/model/spark/describe_date_spark.py index c44d36650..a5e11a0f1 100644 --- a/src/ydata_profiling/model/spark/describe_date_spark.py +++ b/src/ydata_profiling/model/spark/describe_date_spark.py @@ -5,6 +5,7 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_date_1d def date_stats_spark(df: DataFrame, summary: dict) -> dict: @@ -18,6 +19,7 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict: return df.agg(*expr).first().asDict() +@describe_date_1d.register def describe_date_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_generic_spark.py b/src/ydata_profiling/model/spark/describe_generic_spark.py index 1171881cd..ee2356c0a 100644 --- a/src/ydata_profiling/model/spark/describe_generic_spark.py +++ b/src/ydata_profiling/model/spark/describe_generic_spark.py @@ -3,8 +3,10 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_generic +@describe_generic.register def describe_generic_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index 8c299577e..395b1461b 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -5,13 +5,15 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import histogram_compute +from ydata_profiling.model.summary_algorithms import ( + describe_numeric_1d, + histogram_compute, +) def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: column = df.columns[0] - # Removing null types from numeric summary stats to match Pandas defaults which skip na's (skipna=False) finite_filter = ( F.col(column).isNotNull() & ~F.isnan(F.col(column)) @@ -32,6 +34,7 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: return non_null_df.agg(*expr).first().asDict() +@describe_numeric_1d.register def describe_numeric_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: @@ -90,7 +93,6 @@ def describe_numeric_1d_spark( quantile_threshold = 0.05 if summary.get("n") == summary.get("n_missing"): - # This means the entire column is null/nan, so summary values need to be hard-coded: summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles}) summary["mad"] = np.nan @@ -135,10 +137,6 @@ def describe_numeric_1d_spark( # ... https://stackoverflow.com/questions/60221841/how-to-detect-monotonic-decrease-in-pyspark summary["monotonic"] = 0 - # this function only displays the top N (see config) values for a histogram. - # This might be confusing if there are a lot of values of equal magnitude, but we cannot bring all the values to - # display in pandas display - # the alternative is to do this in spark natively, but it is not trivial infinity_values = [np.inf, -np.inf] infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values) diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py index 6d7804cf5..b5e27f615 100644 --- a/src/ydata_profiling/model/spark/describe_text_spark.py +++ b/src/ydata_profiling/model/spark/describe_text_spark.py @@ -3,8 +3,10 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_text_1d +@describe_text_1d.register def describe_text_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/table_spark.py b/src/ydata_profiling/model/spark/table_spark.py index 2a2985059..17ac03323 100644 --- a/src/ydata_profiling/model/spark/table_spark.py +++ b/src/ydata_profiling/model/spark/table_spark.py @@ -1,7 +1,9 @@ +from collections import Counter + from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.table import compute_common_table_stats, get_table_stats +from ydata_profiling.model.table import get_table_stats @get_table_stats.register @@ -19,9 +21,36 @@ def get_table_stats_spark( A dictionary that contains the table statistics. """ n = df.count() - n_var = len(df.columns) - result = {"n": n, "n_var": n_var} - result.update(compute_common_table_stats(n, n_var, variable_stats)) + result = {"n": n, "n_var": len(df.columns)} + + table_stats = { + "n_cells_missing": 0, + "n_vars_with_missing": 0, + "n_vars_all_missing": 0, + } + + for series_summary in variable_stats.values(): + if "n_missing" in series_summary and series_summary["n_missing"] > 0: + table_stats["n_vars_with_missing"] += 1 + table_stats["n_cells_missing"] += series_summary["n_missing"] + if series_summary["n_missing"] == n: + table_stats["n_vars_all_missing"] += 1 + + if result["n"] * result["n_var"] > 0: + table_stats["p_cells_missing"] = ( + table_stats["n_cells_missing"] / (result["n"] * result["n_var"]) + if result["n"] > 0 + else 0 + ) + else: + table_stats["p_cells_missing"] = 0 + + result["p_cells_missing"] = table_stats["p_cells_missing"] + result["n_cells_missing"] = table_stats["n_cells_missing"] + result["n_vars_all_missing"] = table_stats["n_vars_all_missing"] + result["n_vars_with_missing"] = table_stats["n_vars_with_missing"] + + result["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) return result diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py index a31f25ccf..c16204ac3 100644 --- a/src/ydata_profiling/model/spark/timeseries_index_spark.py +++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py @@ -2,8 +2,10 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.timeseries_index import get_time_index_description +@get_time_index_description.register def get_time_index_description_spark( config: Settings, df: DataFrame, diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index 41b8d6f88..d733a7d36 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,8 +50,9 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) +# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer supporting both Pandas and Spark DataFrames.""" + """A summarizer for Pandas DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/model/table.py b/src/ydata_profiling/model/table.py index 6f5c7305d..e5eb6fdc2 100644 --- a/src/ydata_profiling/model/table.py +++ b/src/ydata_profiling/model/table.py @@ -1,4 +1,3 @@ -from collections import Counter from typing import Any from multimethod import multimethod @@ -6,42 +5,6 @@ from ydata_profiling.config import Settings -def compute_common_table_stats( - n: int, n_var: int, variable_stats: dict -) -> dict: - """Compute common table statistics shared by Pandas and Spark backends. - - Args: - n: Number of rows in the DataFrame - n_var: Number of columns (variables) - variable_stats: Previously calculated statistic on the DataFrame series - - Returns: - A dictionary with common table statistics: missing values counts, percentages, and type counts - """ - table_stats = { - "n_cells_missing": 0, - "n_vars_with_missing": 0, - "n_vars_all_missing": 0, - } - - for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: - table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: - table_stats["n_vars_all_missing"] += 1 - - total_cells = n * n_var - table_stats["p_cells_missing"] = ( - table_stats["n_cells_missing"] / total_cells if total_cells > 0 else 0 - ) - - table_stats["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) - - return table_stats - - @multimethod def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict: raise NotImplementedError() diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index 1cee2aea8..e99d91c11 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,41 +1,9 @@ """ - File with backend utilities and helper functions to check the backend being used + File with a function to check the backend being used """ import importlib -from typing import Callable, Optional, Sized, Union - -import pandas as pd def is_pyspark_installed() -> bool: """Check if PySpark is installed without importing it.""" return importlib.util.find_spec("pyspark") is not None - - -class BaseBackend: - """Base helper class to select and cache the appropriate backend (Pandas or Spark).""" - - _pandas_module: Optional[str] = None - _spark_module: Optional[str] = None - - def __init__(self, df: Union[pd.DataFrame, Sized]): - """Determine backend once and store it for all computations.""" - if isinstance(df, pd.DataFrame): - module_path = self._pandas_module - else: - module_path = self._spark_module - - if module_path is None: - raise ValueError("Backend module path not configured") - - self.module = importlib.import_module(module_path) - self.module_path = module_path - - def get_method(self, method_name: str) -> Callable: - """Retrieve the appropriate function from the backend module.""" - try: - return getattr(self.module, method_name) - except AttributeError as ex: - raise AttributeError( - f"Function '{method_name}' is not available in {self.module_path}." - ) from ex