Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions src/ydata_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,15 @@ class SparkSettings(Settings):
samples.random = 0


class Config:
arg_groups: Dict[str, Any] = {
class _Config:
"""Container for configuration presets and shorthand mappings.

This class provides predefined configuration groups (sensitive, explorative, themes)
and shorthand mappings for common configuration options. It should be used only
through its static methods.
"""

arg_groups = {
"sensitive": {
"samples": None,
"duplicates": None,
Expand Down Expand Up @@ -475,22 +482,43 @@ class Config:

@staticmethod
def get_arg_groups(key: str) -> dict:
kwargs = Config.arg_groups[key]
shorthand_args, _ = Config.shorthands(kwargs, split=False)
"""Get expanded configuration for a preset group.

Args:
key: Name of preset group (e.g., "sensitive", "explorative")

Returns:
Expanded configuration dictionary with shorthands resolved
"""
kwargs = _Config.arg_groups[key]
shorthand_args, _ = _Config.shorthands(kwargs, split=False)
return shorthand_args

@staticmethod
def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]:
"""Expand shorthand configuration keys.

Args:
kwargs: Configuration dictionary potentially containing shorthands
split: If True, remove shorthands from kwargs and return separately.
If False, expand shorthands in-place within kwargs.

Returns:
Tuple of (shorthand_args, remaining_kwargs)
"""
shorthand_args = {}
if not split:
shorthand_args = kwargs
for key, value in list(kwargs.items()):
if value is None and key in Config._shorthands:
shorthand_args[key] = Config._shorthands[key]
if value is None and key in _Config._shorthands:
shorthand_args[key] = _Config._shorthands[key]
if split:
del kwargs[key]

if split:
return shorthand_args, kwargs
else:
return shorthand_args, {}


Config = _Config  # Backward-compatible public alias for the renamed `_Config` class
52 changes: 32 additions & 20 deletions src/ydata_profiling/model/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,37 @@
from ydata_profiling.version import __version__


def _validate_inputs(
    config: Settings, df: Union[pd.DataFrame, "pyspark.sql.DataFrame"]  # type: ignore[name-defined] # noqa: F821
) -> None:
    """Validate input types for profiling.

    Args:
        config: Report configuration settings
        df: DataFrame to profile

    Raises:
        TypeError: If inputs are of incorrect type
    """
    if not isinstance(config, Settings):
        raise TypeError(f"`config` must be of type `Settings`, got {type(config)}")

    if isinstance(df, pd.DataFrame):
        return

    # PySpark is an optional dependency: treat an unavailable import the same
    # as "not a Spark DataFrame" and fall through to the error below.
    try:
        from pyspark.sql import DataFrame as SparkDataFrame

        is_spark_df = isinstance(df, SparkDataFrame)
    except ImportError:
        is_spark_df = False

    if is_spark_df:
        return

    raise TypeError(
        # Fixed: the original concatenation lacked a separating space, producing
        # "…>.If using Spark…"; the second literal also had a spurious f-prefix.
        f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}. "
        "If using Spark, make sure PySpark is installed."
    )


def describe(
config: Settings,
df: Union[pd.DataFrame, "pyspark.sql.DataFrame"], # type: ignore[name-defined] # noqa: F821
Expand All @@ -52,26 +83,7 @@ def describe(
- alerts: direct special attention to these patterns in your data.
- package: package details.
"""
# ** Validate Input types **
if not isinstance(config, Settings):
raise TypeError(f"`config` must be of type `Settings`, got {type(config)}")

# Validate df input type

if not isinstance(df, pd.DataFrame):
try:
from pyspark.sql import DataFrame as SparkDataFrame # type: ignore

if not isinstance(df, SparkDataFrame): # noqa: TC301
raise TypeError( # noqa: TC301
f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}."
)
except ImportError as ex:
raise TypeError(
f"`df must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}."
f"If using Spark, make sure PySpark is installed."
) from ex

_validate_inputs(config, df)
df = preprocess(config, df)

number_of_tasks = 5
Expand Down
41 changes: 16 additions & 25 deletions src/ydata_profiling/model/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ def composed_function(*args) -> List[Any]:


class Handler:
"""A generic handler
"""Generic handler for data type specific processing pipelines.

Allows any custom mapping between data types and functions
Builds a processing pipeline for each data type by composing functions
along the type hierarchy. Allows custom summarization strategies.
"""

def __init__(
Expand All @@ -42,6 +43,11 @@ def __init__(
self._complete_dag()

def _complete_dag(self) -> None:
"""Propagate functions along the type hierarchy DAG.

Functions defined for parent types are inherited by subtypes,
creating a complete processing pipeline for each type.
"""
for from_type, to_type in nx.topological_sort(
nx.line_graph(self.typeset.base_graph)
):
Expand All @@ -50,32 +56,17 @@ def _complete_dag(self) -> None:
)

def handle(self, dtype: str, *args, **kwargs) -> dict:
"""
"""Execute the processing pipeline for a given data type.

Args:
dtype: Name of the data type to process
*args: Arguments passed to the processing pipeline
**kwargs: Additional keyword arguments

Returns:
object: a tuple containing the config, the dataset series and the summary extracted
Extracted summary dictionary
"""
funcs = self.mapping.get(dtype, [])
op = compose(funcs)
summary = op(*args)[-1]
return summary


def get_render_map() -> Dict[str, Callable]:
    """Return the mapping from variable type names to render functions.

    The import is deferred to call time to avoid a circular dependency with
    the report-structure package — TODO confirm this is the reason.
    """
    import ydata_profiling.report.structure.variables as render_algorithms

    return {
        "Boolean": render_algorithms.render_boolean,
        "Numeric": render_algorithms.render_real,
        "Complex": render_algorithms.render_complex,
        "Text": render_algorithms.render_text,
        "DateTime": render_algorithms.render_date,
        "Categorical": render_algorithms.render_categorical,
        "URL": render_algorithms.render_url,
        "Path": render_algorithms.render_path,
        "File": render_algorithms.render_file,
        "Image": render_algorithms.render_image,
        "Unsupported": render_algorithms.render_generic,
        "TimeSeries": render_algorithms.render_timeseries,
    }
Loading