diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 09dbecdde..ed45110d0 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -407,8 +407,15 @@ class SparkSettings(Settings): samples.random = 0 -class Config: -    arg_groups: Dict[str, Any] = { +class _Config: +    """Container for configuration presets and shorthand mappings. + +    This class provides predefined configuration groups (sensitive, explorative, themes) +    and shorthand mappings for common configuration options. It should be used only +    through its static methods. +    """ + +    arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -475,18 +482,36 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: -        kwargs = Config.arg_groups[key] -        shorthand_args, _ = Config.shorthands(kwargs, split=False) +        """Get expanded configuration for a preset group. + +        Args: +            key: Name of preset group (e.g., "sensitive", "explorative") + +        Returns: +            Expanded configuration dictionary with shorthands resolved +        """ +        kwargs = _Config.arg_groups[key] +        shorthand_args, _ = _Config.shorthands(kwargs, split=False) return shorthand_args @staticmethod def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: +        """Expand shorthand configuration keys. + +        Args: +            kwargs: Configuration dictionary potentially containing shorthands +            split: If True, remove shorthands from kwargs and return separately. +                If False, expand shorthands in-place within kwargs.
+ + +    Returns: +        Tuple of (shorthand_args, remaining_kwargs) +    """ shorthand_args = {} if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): -        if value is None and key in Config._shorthands: -            shorthand_args[key] = Config._shorthands[key] +        if value is None and key in _Config._shorthands: +            shorthand_args[key] = _Config._shorthands[key] if split: del kwargs[key] @@ -494,3 +519,6 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} + + +Config = _Config diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py index 74bdf924a..1aac3747f 100644 --- a/src/ydata_profiling/model/describe.py +++ b/src/ydata_profiling/model/describe.py @@ -27,6 +27,37 @@ from ydata_profiling.version import __version__ +def _validate_inputs( + config: Settings, df: Union[pd.DataFrame, "pyspark.sql.DataFrame"] # type: ignore[name-defined] # noqa: F821 +) -> None: +    """Validate input types for profiling. + +    Args: +        config: Report configuration settings +        df: DataFrame to profile + +    Raises: +        TypeError: If inputs are of incorrect type +    """ +    if not isinstance(config, Settings): +        raise TypeError(f"`config` must be of type `Settings`, got {type(config)}") + +    if isinstance(df, pd.DataFrame): +        return + +    try: +        from pyspark.sql import DataFrame as SparkDataFrame +        if isinstance(df, SparkDataFrame): +            return +    except ImportError: +        pass + +    raise TypeError( +        f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}. " +        "If using Spark, make sure PySpark is installed." +    ) + + def describe( config: Settings, df: Union[pd.DataFrame, "pyspark.sql.DataFrame"], # type: ignore[name-defined] # noqa: F821 @@ -52,26 +83,7 @@ def describe( - alerts: direct special attention to these patterns in your data. - package: package details. 
""" - # ** Validate Input types ** - if not isinstance(config, Settings): - raise TypeError(f"`config` must be of type `Settings`, got {type(config)}") - - # Validate df input type - - if not isinstance(df, pd.DataFrame): - try: - from pyspark.sql import DataFrame as SparkDataFrame # type: ignore - - if not isinstance(df, SparkDataFrame): # noqa: TC301 - raise TypeError( # noqa: TC301 - f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}." - ) - except ImportError as ex: - raise TypeError( - f"`df must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}." - f"If using Spark, make sure PySpark is installed." - ) from ex - + _validate_inputs(config, df) df = preprocess(config, df) number_of_tasks = 5 diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..91d135973 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -25,9 +25,10 @@ def composed_function(*args) -> List[Any]: class Handler: - """A generic handler + """Generic handler for data type specific processing pipelines. - Allows any custom mapping between data types and functions + Builds a processing pipeline for each data type by composing functions + along the type hierarchy. Allows custom summarization strategies. """ def __init__( @@ -42,6 +43,11 @@ def __init__( self._complete_dag() def _complete_dag(self) -> None: + """Propagate functions along the type hierarchy DAG. + + Functions defined for parent types are inherited by subtypes, + creating a complete processing pipeline for each type. + """ for from_type, to_type in nx.topological_sort( nx.line_graph(self.typeset.base_graph) ): @@ -50,32 +56,17 @@ def _complete_dag(self) -> None: ) def handle(self, dtype: str, *args, **kwargs) -> dict: - """ + """Execute the processing pipeline for a given data type. 
+ + Args: + dtype: Name of the data type to process + *args: Arguments passed to the processing pipeline + **kwargs: Additional keyword arguments + Returns: - object: a tuple containing the config, the dataset series and the summary extracted + Extracted summary dictionary """ funcs = self.mapping.get(dtype, []) op = compose(funcs) summary = op(*args)[-1] return summary - - -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms - - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..a3665f42b 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -10,53 +10,134 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.handler import Handler -from ydata_profiling.model.pandas import ( - pandas_describe_boolean_1d, - pandas_describe_categorical_1d, - pandas_describe_counts, - pandas_describe_date_1d, - pandas_describe_file_1d, - pandas_describe_generic, - pandas_describe_image_1d, - pandas_describe_numeric_1d, - pandas_describe_path_1d, - pandas_describe_text_1d, - pandas_describe_timeseries_1d, - pandas_describe_url_1d, -) -from ydata_profiling.model.pandas.describe_supported_pandas import ( - pandas_describe_supported, -) -from 
ydata_profiling.model.summary_algorithms import ( # Check what is this method used for - describe_file_1d, - describe_image_1d, - describe_path_1d, - describe_timeseries_1d, - describe_url_1d, -) from ydata_profiling.utils.backend import is_pyspark_installed class BaseSummarizer(Handler): - """A base summarizer + """Base class for data summarization. - Can be used to define custom summarizations + Provides a flexible framework to define custom summarization strategies + for different data types and dataframe backends. """ def summarize( self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType] ) -> dict: - """Generates the summary for a given series""" + """Generates the summary statistics for a given series. + + Args: + config: Report configuration settings + series: Data series to summarize + dtype: Detected data type from visions typeset + + Returns: + Dictionary containing summary statistics + """ return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. 
+def _create_pandas_summary_map() -> Dict[str, List[Callable]]: + """Create summary function mapping for Pandas backend.""" + from ydata_profiling.model.pandas import ( + pandas_describe_boolean_1d, + pandas_describe_categorical_1d, + pandas_describe_counts, + pandas_describe_date_1d, + pandas_describe_file_1d, + pandas_describe_generic, + pandas_describe_image_1d, + pandas_describe_numeric_1d, + pandas_describe_path_1d, + pandas_describe_text_1d, + pandas_describe_timeseries_1d, + pandas_describe_url_1d, + ) + from ydata_profiling.model.pandas.describe_supported_pandas import ( + pandas_describe_supported, + ) + + return { + "Unsupported": [ + pandas_describe_counts, + pandas_describe_generic, + pandas_describe_supported, + ], + "Numeric": [pandas_describe_numeric_1d], + "DateTime": [pandas_describe_date_1d], + "Text": [pandas_describe_text_1d], + "Categorical": [pandas_describe_categorical_1d], + "Boolean": [pandas_describe_boolean_1d], + "URL": [pandas_describe_url_1d], + "Path": [pandas_describe_path_1d], + "File": [pandas_describe_file_1d], + "Image": [pandas_describe_image_1d], + "TimeSeries": [pandas_describe_timeseries_1d], + } + + +def _create_spark_summary_map() -> Dict[str, List[Callable]]: + """Create summary function mapping for Spark backend.""" + from ydata_profiling.model.spark import ( + describe_boolean_1d_spark, + describe_categorical_1d_spark, + describe_counts_spark, + describe_date_1d_spark, + describe_generic_spark, + describe_numeric_1d_spark, + describe_supported_spark, + describe_text_1d_spark, + ) + from ydata_profiling.model.summary_algorithms import ( + describe_file_1d, + describe_image_1d, + describe_path_1d, + describe_timeseries_1d, + describe_url_1d, + ) + + return { + "Unsupported": [ + describe_counts_spark, + describe_generic_spark, + describe_supported_spark, + ], + "Numeric": [describe_numeric_1d_spark], + "DateTime": [describe_date_1d_spark], + "Text": [describe_text_1d_spark], + "Categorical": 
[describe_categorical_1d_spark], + "Boolean": [describe_boolean_1d_spark], + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], + } + + +def _create_summary_map_factory(use_spark: bool) -> Dict[str, List[Callable]]: + """Factory function to create appropriate summary map based on backend. + + Args: + use_spark: If True, create Spark-compatible summary map + + Returns: + Mapping from data types to summary functions + """ + if use_spark: + return _create_spark_summary_map() + return _create_pandas_summary_map() + + class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """Standard summarizer for data profiling. + + Supports both Pandas and Spark backends, providing comprehensive + statistical summaries for all standard data types. + """ def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() - self._summary_map = self._create_summary_map() + self._summary_map = _create_summary_map_factory(self.use_spark) super().__init__(self._summary_map, typeset) @property @@ -64,57 +145,6 @@ def summary_map(self) -> Dict[str, List[Callable]]: """Allows users to modify the summary map after initialization.""" return self._summary_map - def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map for Pandas summarization.""" - if self.use_spark: - from ydata_profiling.model.spark import ( - describe_boolean_1d_spark, - describe_categorical_1d_spark, - describe_counts_spark, - describe_date_1d_spark, - describe_generic_spark, - describe_numeric_1d_spark, - describe_supported_spark, - describe_text_1d_spark, - ) - - summary_map = { - "Unsupported": [ - describe_counts_spark, - describe_generic_spark, - describe_supported_spark, - ], - "Numeric": [describe_numeric_1d_spark], - "DateTime": [describe_date_1d_spark], - "Text": [describe_text_1d_spark], - 
"Categorical": [describe_categorical_1d_spark], - "Boolean": [describe_boolean_1d_spark], - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], - } - else: - summary_map = { - "Unsupported": [ - pandas_describe_counts, - pandas_describe_generic, - pandas_describe_supported, - ], - "Numeric": [pandas_describe_numeric_1d], - "DateTime": [pandas_describe_date_1d], - "Text": [pandas_describe_text_1d], - "Categorical": [pandas_describe_categorical_1d], - "Boolean": [pandas_describe_boolean_1d], - "URL": [pandas_describe_url_1d], - "Path": [pandas_describe_path_1d], - "File": [pandas_describe_file_1d], - "Image": [pandas_describe_image_1d], - "TimeSeries": [pandas_describe_timeseries_1d], - } - return summary_map - def format_summary(summary: Union[BaseDescription, dict]) -> dict: """Prepare summary for export to json file. diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 482b410b2..64bec9fd8 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -1,5 +1,5 @@ import os -from typing import List, Sequence +from typing import Callable, Dict, List, Sequence import pandas as pd from tqdm.auto import tqdm @@ -7,7 +7,6 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.model.handler import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, @@ -24,6 +23,30 @@ from ydata_profiling.utils.dataframe import slugify +def get_render_map() -> Dict[str, Callable]: + """Create mapping from data types to rendering functions. 
+ + Returns: + Dictionary mapping data type names to their respective render functions + """ + import ydata_profiling.report.structure.variables as render_algorithms + + return { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + def get_missing_items(config: Settings, summary: BaseDescription) -> list: """Return the missing diagrams diff --git a/venv/Dockerfile b/venv/Dockerfile new file mode 100644 index 000000000..9e0a68801 --- /dev/null +++ b/venv/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \ + pip install --no-cache-dir . && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \ + pip install --no-cache-dir jupyter + +EXPOSE 8888 + +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]