diff --git a/.gitignore b/.gitignore index 2d2cfe2b9..71fb75bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ var/ *.egg-info/ .installed.cfg *.egg +uv.lock # PyInstaller # Usually these files are written by a python script from a template @@ -85,3 +86,6 @@ docsrc/source/pages/api/_autosummary/ # User created VERSION version.py + +# local java/spark sdk environment files +.sdkmanrc diff --git a/pyproject.toml b/pyproject.toml index eae6b3dd4..e46cb237c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,7 @@ dev = [ "sphinx-autodoc-typehints>=1.10.3", "sphinx-multiversion>=0.2.3", "autodoc_pydantic", + "standard-imghdr" ] docs = [ diff --git a/src/ydata_profiling/model/spark/correlations_spark.py b/src/ydata_profiling/model/spark/correlations_spark.py index f9f1d1ecb..441dfbf0f 100644 --- a/src/ydata_profiling/model/spark/correlations_spark.py +++ b/src/ydata_profiling/model/spark/correlations_spark.py @@ -45,10 +45,23 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra exact same workflow. The syntax is Correlation.corr(dataframe, method="pearson" OR "spearman"), and Correlation is from pyspark.ml.stat """ - variables = {column: description["type"] for column, description in summary.items()} + variables = { + column: { + "type": description["type"], + "all_missing": description.get("n", 0) - description.get("n_missing", 0) + == 0, + } + for column, description in summary.items() + } interval_columns = [ - column for column, type_name in variables.items() if type_name == "Numeric" + column + for column, mapping in variables.items() + if mapping["type"] == "Numeric" and not mapping["all_missing"] ] + + if not interval_columns: + return [], interval_columns + df = df.select(*interval_columns) # convert to vector column first @@ -63,6 +76,9 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra assembler = VectorAssembler(**assembler_args) df_vector = assembler.transform(df).select(vector_col) + if df_vector.count() <= 1: + return [], interval_columns + # get correlation matrix matrix = ( Correlation.corr(df_vector, vector_col, method=corr_type).head()[0].toArray() diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/describe_counts_spark.py index d7a091e7f..f02d1043c 100644 --- a/src/ydata_profiling/model/spark/describe_counts_spark.py +++ b/src/ydata_profiling/model/spark/describe_counts_spark.py @@ -6,6 +6,7 @@ import pandas as pd from pyspark.sql import DataFrame from pyspark.sql import functions as F +from pyspark.sql import types as T from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_counts @@ -25,6 +26,11 @@ def describe_counts_spark( Returns: Updated settings, input series, and summary dictionary. 
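# --- Editorial aside (not part of the patch): how the new all_missing guard in
# --- correlations_spark.py classifies columns before Correlation.corr() is called.
# --- The dict shape mirrors the per-column descriptions used above; the column
# --- names and counts here are invented purely for illustration.
summary_example = {
    "a": {"type": "Numeric", "n": 100, "n_missing": 0},
    "b": {"type": "Numeric", "n": 100, "n_missing": 100},  # entirely missing
    "c": {"type": "Categorical", "n": 100, "n_missing": 5},
}
interval_columns = [
    column
    for column, description in summary_example.items()
    if description["type"] == "Numeric"
    and description.get("n", 0) - description.get("n_missing", 0) != 0
]
assert interval_columns == ["a"]  # "b" is dropped, so the correlation step never sees an all-null vector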
""" + # Cast Decimal Type s + if isinstance(series.schema.fields[0].dataType, T.DecimalType): + series = series.select( + F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0]) + ) # Count occurrences of each value value_counts = series.groupBy(series.columns[0]).count() @@ -36,9 +42,23 @@ def describe_counts_spark( value_counts_index_sorted = value_counts.orderBy(F.asc(series.columns[0])) # Count missing values - n_missing = ( - value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first() - ) + if series.dtypes[0][1] in ("int", "float", "bigint", "double"): + n_missing = ( + # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not + value_counts.filter( + F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0])) + ) + .select("count") + .first() + ) + else: + n_missing = ( + # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not + value_counts.filter(F.col(series.columns[0]).isNull()) + .select("count") + .first() + ) + n_missing = n_missing["count"] if n_missing else 0 # Convert top 200 values to Pandas for frequency table display @@ -60,17 +80,15 @@ def describe_counts_spark( value_counts.filter(F.col(column).isNotNull()) # Exclude NaNs .filter(~F.isnan(F.col(column))) # Remove implicit NaNs (if numeric column) .groupBy(column) # Group by unique values - .count() # Count occurrences + .agg(F.sum("count").alias("count")) # Sum of count .orderBy(F.desc("count")) # Sort in descending order - .limit(200) # Limit for performance ) else: value_counts_no_nan = ( value_counts.filter(F.col(column).isNotNull()) # Exclude NULLs .groupBy(column) # Group by unique timestamp values - .count() # Count occurrences + .agg(F.sum("count").alias("count")) # Sum of count .orderBy(F.desc("count")) # Sort by most frequent timestamps - .limit(200) # Limit for performance ) # Convert to Pandas Series, forcing proper structure diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index 5e30c7539..8c299577e 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -11,6 +11,14 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: column = df.columns[0] + # Removing null types from numeric summary stats to match Pandas defaults which skip na's (skipna=False) + finite_filter = ( + F.col(column).isNotNull() + & ~F.isnan(F.col(column)) + & ~F.col(column).isin([np.inf, -np.inf]) + ) + non_null_df = df.filter(finite_filter) + expr = [ F.mean(F.col(column)).alias("mean"), F.stddev(F.col(column)).alias("std"), @@ -21,7 +29,7 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: F.skewness(F.col(column)).alias("skewness"), F.sum(F.col(column)).alias("sum"), ] - return df.agg(*expr).first().asDict() + return non_null_df.agg(*expr).first().asDict() def describe_numeric_1d_spark( @@ -81,30 +89,42 @@ def describe_numeric_1d_spark( quantiles = config.vars.num.quantiles quantile_threshold = 0.05 - summary.update( - { - f"{percentile:.0%}": value - for percentile, value in zip( - quantiles, - df.stat.approxQuantile( - f"{df.columns[0]}", + if summary.get("n") == summary.get("n_missing"): + # This means the entire column is null/nan, so summary values need to be hard-coded: + summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles}) + + summary["mad"] = np.nan + summary["iqr"] = np.nan + + else: + 
summary.update( + { + f"{percentile:.0%}": value + for percentile, value in zip( quantiles, - quantile_threshold, - ), - ) - } - ) + df.stat.approxQuantile( + f"{df.columns[0]}", + quantiles, + quantile_threshold, + ), + ) + } + ) - median = summary["50%"] + median = summary.get("50%") - summary["mad"] = df.select( - (F.abs(F.col(f"{df.columns[0]}").cast("int") - median)).alias("abs_dev") - ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0] + summary["mad"] = df.select( + (F.abs(F.col(f"{df.columns[0]}").cast("int") - median)).alias("abs_dev") + ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0] + + summary["iqr"] = summary["75%"] - summary["25%"] # FIXME: move to fmt summary["p_negative"] = summary["n_negative"] / summary["n"] - summary["range"] = summary["max"] - summary["min"] - summary["iqr"] = summary["75%"] - summary["25%"] + if summary["min"] is None or summary["max"] is None: + summary["range"] = np.nan + else: + summary["range"] = summary["max"] - summary["min"] summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.nan summary["p_zeros"] = summary["n_zeros"] / summary["n"] summary["p_infinite"] = summary["n_infinite"] / summary["n"] diff --git a/src/ydata_profiling/model/spark/describe_supported_spark.py b/src/ydata_profiling/model/spark/describe_supported_spark.py index 1758f668d..7dcaff62c 100644 --- a/src/ydata_profiling/model/spark/describe_supported_spark.py +++ b/src/ydata_profiling/model/spark/describe_supported_spark.py @@ -19,15 +19,15 @@ def describe_supported_spark( """ # number of non-NaN observations in the Series - count = summary["count"] - n_distinct = summary["value_counts"].count() + n = summary["n"] + n_distinct = summary["value_counts"].count() # Null/nan's count in value_counts summary["n_distinct"] = n_distinct - summary["p_distinct"] = n_distinct / count if count > 0 else 0 + summary["p_distinct"] = n_distinct / n if n > 0 else 0 n_unique = summary["value_counts"].where("count == 1").count() - summary["is_unique"] = n_unique == count + summary["is_unique"] = n_unique == n summary["n_unique"] = n_unique - summary["p_unique"] = n_unique / count if count > 0 else 0 + summary["p_unique"] = n_unique / n if n > 0 else 0 return config, series, summary diff --git a/src/ydata_profiling/model/spark/summary_spark.py b/src/ydata_profiling/model/spark/summary_spark.py index 5a033b1d5..75e01e240 100644 --- a/src/ydata_profiling/model/spark/summary_spark.py +++ b/src/ydata_profiling/model/spark/summary_spark.py @@ -43,6 +43,8 @@ def spark_describe_1d( if str(series.schema[0].dataType).startswith("ArrayType"): dtype = "ArrayType" + elif str(series.schema[0].dataType).startswith("Decimal"): + dtype = "decimal" else: dtype = series.schema[0].dataType.simpleString() @@ -56,6 +58,7 @@ def spark_describe_1d( "boolean": "Boolean", "date": "DateTime", "timestamp": "DateTime", + "decimal": "Numeric", }[dtype] return summarizer.summarize(config, series, dtype=vtype) diff --git a/src/ydata_profiling/report/formatters.py b/src/ydata_profiling/report/formatters.py index d2c4657e0..199ea854d 100644 --- a/src/ydata_profiling/report/formatters.py +++ b/src/ydata_profiling/report/formatters.py @@ -245,13 +245,16 @@ def fmt_numeric(value: float, precision: int = 10) -> str: Returns: The numeric value with the given precision. 
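# --- Editorial aside (not part of the patch): why summary_spark.py needs the extra
# --- startswith("Decimal") branch above. Only pyspark type objects are constructed,
# --- so no SparkSession is required; the precision/scale values are illustrative.
from pyspark.sql import types as T

decimal_type = T.DecimalType(10, 2)
assert str(decimal_type).startswith("Decimal")          # matched by the new elif branch
assert decimal_type.simpleString() == "decimal(10,2)"   # includes precision/scale, so it would miss a plain "decimal" key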
""" - fmtted = f"{{:.{precision}g}}".format(value) - for v in ["e+", "e-"]: - if v in fmtted: - sign = "-" if v in "e-" else "" - fmtted = fmtted.replace(v, " × 10") + "" - fmtted = fmtted.replace("0", "") - fmtted = fmtted.replace("", f"{sign}") + if value is None: + fmtted = "N/A" + else: + fmtted = f"{{:.{precision}g}}".format(value) + for v in ["e+", "e-"]: + if v in fmtted: + sign = "-" if v in "e-" else "" + fmtted = fmtted.replace(v, " × 10") + "" + fmtted = fmtted.replace("0", "") + fmtted = fmtted.replace("", f"{sign}") return fmtted diff --git a/src/ydata_profiling/report/structure/variables/render_categorical.py b/src/ydata_profiling/report/structure/variables/render_categorical.py index 86f5a262a..8fa6b6ae0 100644 --- a/src/ydata_profiling/report/structure/variables/render_categorical.py +++ b/src/ydata_profiling/report/structure/variables/render_categorical.py @@ -23,6 +23,7 @@ from ydata_profiling.report.presentation.core.renderable import Renderable from ydata_profiling.report.presentation.frequency_table_utils import freq_table from ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import cat_frequency_plot, histogram @@ -95,7 +96,7 @@ def render_categorical_length( else: hist_data = histogram(config, *summary["histogram_length"]) - length_histo = Image( + length_histo = image_or_empty( hist_data, image_format=config.plot.image_format, alt="length histogram", diff --git a/src/ydata_profiling/report/structure/variables/render_count.py b/src/ydata_profiling/report/structure/variables/render_count.py index e11e9913e..776bb6763 100644 --- a/src/ydata_profiling/report/structure/variables/render_count.py +++ b/src/ydata_profiling/report/structure/variables/render_count.py @@ -8,11 +8,11 @@ from ydata_profiling.report.presentation.core import ( Container, FrequencyTable, - Image, Table, VariableInfo, ) from ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import histogram, mini_histogram @@ -94,26 +94,41 @@ def render_count(config: Settings, summary: dict) -> dict: style=config.html.style, ) - mini_histo = Image( - mini_histogram(config, *summary["histogram"]), - image_format=image_format, + summary_histogram = summary.get("histogram", []) + + mini_hist_data = None + + if summary_histogram: + mini_hist_data = mini_histogram(config, *summary["histogram"]) + + mini_histo = image_or_empty( + mini_hist_data, alt="Mini histogram", + image_format=image_format, + name="Mini Histogram", ) template_variables["top"] = Container( [info, table1, table2, mini_histo], sequence_type="grid" ) - seqs = [ - Image( - histogram(config, *summary["histogram"]), - image_format=image_format, - alt="Histogram", - caption=f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})", - name="Histogram", - anchor_id="histogram", - ) - ] + hist_data = None + hist_caption = None + + if summary_histogram: + hist_data = histogram(config, *summary["histogram"]) + hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" + + hist = image_or_empty( + hist_data, + alt="Histogram", + image_format=image_format, + caption=hist_caption, + name="Histogram", + anchor_id="histogram", + ) + + seqs = [hist] fq = FrequencyTable( template_variables["freq_table_rows"], diff --git 
a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py index 1f142daae..8d002d885 100644 --- a/src/ydata_profiling/report/structure/variables/render_date.py +++ b/src/ydata_profiling/report/structure/variables/render_date.py @@ -2,12 +2,8 @@ from ydata_profiling.config import Settings from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent -from ydata_profiling.report.presentation.core import ( - Container, - Image, - Table, - VariableInfo, -) +from ydata_profiling.report.presentation.core import Container, Table, VariableInfo +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import histogram, mini_histogram @@ -76,55 +72,68 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: style=config.html.style, ) - if isinstance(summary["histogram"], list): - mini_histo = Image( - mini_histogram( + summary_histogram = summary.get("histogram", []) + + mini_hist_data = None + + if summary_histogram: + if isinstance(summary_histogram, list): + mini_hist_data = mini_histogram( config, [x[0] for x in summary["histogram"]], [x[1] for x in summary["histogram"]], date=True, - ), - image_format=image_format, - alt="Mini histogram", - ) - else: - mini_histo = Image( - mini_histogram( + ) + else: + mini_hist_data = mini_histogram( config, summary["histogram"][0], summary["histogram"][1], date=True - ), - image_format=image_format, - alt="Mini histogram", - ) + ) + + mini_histo = image_or_empty( + mini_hist_data, + alt="Mini histogram", + image_format=image_format, + name="Mini Histogram", + anchor_id=f"{varid}minihistogram", + ) template_variables["top"] = Container( [info, table1, table2, mini_histo], sequence_type="grid" ) - if isinstance(summary["histogram"], list): - hist_data = histogram( - config, - [x[0] for x in summary["histogram"]], - [x[1] for x in summary["histogram"]], - date=True, - ) - else: - hist_data = histogram( - config, summary["histogram"][0], summary["histogram"][1], date=True + hist_data = None + hist_caption = None + + if summary_histogram: + if isinstance(summary_histogram, list): + hist_data = histogram( + config, + [x[0] for x in summary["histogram"]], + [x[1] for x in summary["histogram"]], + date=True, + ) + else: + hist_data = histogram( + config, summary["histogram"][0], summary["histogram"][1], date=True + ) + + n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0 + hist_caption = ( + f"Histogram with fixed size bins (bins={n_bins})" ) + hist = image_or_empty( + hist_data, + image_format=image_format, + alt="Histogram", + caption=hist_caption, + name="Histogram", + anchor_id=f"{varid}histogram", + ) + # Bottom - n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0 bottom = Container( - [ - Image( - hist_data, - image_format=image_format, - alt="Histogram", - caption=f"Histogram with fixed size bins (bins={n_bins})", - name="Histogram", - anchor_id=f"{varid}histogram", - ) - ], + [hist], sequence_type="tabs", anchor_id=summary["varid"], ) diff --git a/src/ydata_profiling/report/structure/variables/render_file.py b/src/ydata_profiling/report/structure/variables/render_file.py index 81379a41f..e014434ab 100644 --- a/src/ydata_profiling/report/structure/variables/render_file.py +++ b/src/ydata_profiling/report/structure/variables/render_file.py @@ -1,10 +1,11 @@ from typing import List from ydata_profiling.config import Settings -from ydata_profiling.report.presentation.core 
import Container, FrequencyTable, Image +from ydata_profiling.report.presentation.core import Container, FrequencyTable from ydata_profiling.report.presentation.core.renderable import Renderable from ydata_profiling.report.presentation.frequency_table_utils import freq_table from ydata_profiling.report.structure.variables.render_path import render_path +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import histogram @@ -20,17 +21,21 @@ def render_file(config: Settings, summary: dict) -> dict: image_format = config.plot.image_format file_tabs: List[Renderable] = [] + + hist_data = None + if summary["histogram_file_size"]: + hist_data = histogram(config, *summary["histogram_file_size"]) + if "file_size" in summary: - file_tabs.append( - Image( - histogram(config, *summary["histogram_file_size"]), - image_format=image_format, - alt="Size", - caption=f"Histogram with fixed size bins of file sizes (in bytes) (bins={len(summary['histogram_file_size'][1]) - 1})", - name="File size", - anchor_id=f"{varid}file_size_histogram", - ) + hist = image_or_empty( + hist_data, + image_format=image_format, + alt="Size", + caption=f"Histogram with fixed size bins of file sizes (in bytes) (bins={len(summary['histogram_file_size'][1]) - 1})", + name="File size", + anchor_id=f"{varid}file_size_histogram", ) + file_tabs.append(hist) file_dates = { "file_created_time": "Created", diff --git a/src/ydata_profiling/report/structure/variables/render_real.py b/src/ydata_profiling/report/structure/variables/render_real.py index 227200c27..92c73fcf3 100644 --- a/src/ydata_profiling/report/structure/variables/render_real.py +++ b/src/ydata_profiling/report/structure/variables/render_real.py @@ -9,11 +9,11 @@ from ydata_profiling.report.presentation.core import ( Container, FrequencyTable, - Image, Table, VariableInfo, ) from ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import histogram, mini_histogram @@ -118,22 +118,27 @@ def render_real(config: Settings, summary: dict) -> dict: style=config.html.style, ) - if isinstance(summary.get("histogram", []), list): - mini_histo = Image( - mini_histogram( + summary_histogram = summary.get("histogram", []) + + mini_hist_data = None + + if summary_histogram: + if isinstance(summary_histogram, list): + mini_hist_data = mini_histogram( config, - [x[0] for x in summary.get("histogram", [])], - [x[1] for x in summary.get("histogram", [])], - ), - image_format=image_format, - alt="Mini histogram", - ) - else: - mini_histo = Image( - mini_histogram(config, *summary["histogram"]), - image_format=image_format, - alt="Mini histogram", - ) + [x[0] for x in summary_histogram], + [x[1] for x in summary_histogram], + ) + else: + mini_hist_data = mini_histogram(config, *summary_histogram) + + mini_histo = image_or_empty( + mini_hist_data, + alt="Mini histogram", + image_format=image_format, + name="Mini Histogram", + anchor_id=f"{varid}minihistogram", + ) template_variables["top"] = Container( [info, table1, table2, mini_histo], sequence_type="grid" @@ -243,22 +248,31 @@ def render_real(config: Settings, summary: dict) -> dict: sequence_type="grid", ) - if isinstance(summary.get("histogram", []), list): - hist_data = histogram( - config, - [x[0] for x in summary.get("histogram", [])], - [x[1] for x in summary.get("histogram", [])], - ) - bins = len(summary["histogram"][0][1]) - 1 if "histogram" in summary else 0 - 
hist_caption = f"Histogram with fixed size bins (bins={bins})" - else: - hist_data = histogram(config, *summary["histogram"]) - hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" + hist_data = None + hist_caption = None + + if summary_histogram: + if isinstance(summary_histogram, list): + hist_data = histogram( + config, + [x[0] for x in summary_histogram], + [x[1] for x in summary_histogram], + ) + bins = len(summary_histogram[0][1]) + hist_caption = ( + f"Histogram with fixed size bins (bins={bins - 1})" + ) + else: + hist_data = histogram(config, *summary_histogram) + hist_caption = ( + f"Histogram with fixed size bins " + f"(bins={len(summary_histogram[1]) - 1})" + ) - hist = Image( + hist = image_or_empty( hist_data, - image_format=image_format, alt="Histogram", + image_format=image_format, caption=hist_caption, name="Histogram", anchor_id=f"{varid}histogram", diff --git a/src/ydata_profiling/report/structure/variables/render_timeseries.py b/src/ydata_profiling/report/structure/variables/render_timeseries.py index 6f3bc27cd..17f396ac0 100644 --- a/src/ydata_profiling/report/structure/variables/render_timeseries.py +++ b/src/ydata_profiling/report/structure/variables/render_timeseries.py @@ -15,6 +15,7 @@ VariableInfo, ) from ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import ( histogram, mini_ts_plot, @@ -289,18 +290,24 @@ def render_timeseries(config: Settings, summary: dict) -> dict: sequence_type="grid", ) - if isinstance(summary["histogram"], list): - hist_data = histogram( - config, - [x[0] for x in summary["histogram"]], - [x[1] for x in summary["histogram"]], - ) - hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][0][1]) - 1})" - else: - hist_data = histogram(config, *summary["histogram"]) - hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" + summary_histogram = summary.get("histogram", []) - hist = Image( + hist_data = None + hist_caption = None + + if summary_histogram: + if isinstance(summary_histogram, list): + hist_data = histogram( + config, + [x[0] for x in summary["histogram"]], + [x[1] for x in summary["histogram"]], + ) + hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][0][1]) - 1})" + else: + hist_data = histogram(config, *summary["histogram"]) + hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" + + hist = image_or_empty( hist_data, image_format=image_format, alt="Histogram", diff --git a/src/ydata_profiling/report/utils.py b/src/ydata_profiling/report/utils.py new file mode 100644 index 000000000..9d8a09b04 --- /dev/null +++ b/src/ydata_profiling/report/utils.py @@ -0,0 +1,34 @@ +from typing import Optional + +from ydata_profiling.config import ImageType +from ydata_profiling.report.presentation.core import Container, Image + + +def image_or_empty( + image: Optional[str], + *, + alt: str, + image_format: ImageType, + caption: Optional[str] = None, + name: Optional[str] = None, + anchor_id: Optional[str] = None, +) -> Container | Image: + """ + Add helper to render an image or an empty container when necessary. 
+ """ + if image is None: + return Container( + items=[], + name=name or f"{alt}_empty", + anchor_id=anchor_id, + sequence_type="grid", + ) + + return Image( + image=image, + image_format=image_format, + alt=alt, + caption=caption, + name=name, + anchor_id=anchor_id, + ) diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py index a2df1fd5d..b3dc5338e 100644 --- a/src/ydata_profiling/visualisation/plot.py +++ b/src/ydata_profiling/visualisation/plot.py @@ -134,13 +134,49 @@ def plot_word_cloud(config: Settings, word_counts: pd.Series) -> str: return plot_360_n0sc0pe(config) +def _is_valid_hist_data(series: np.ndarray, bins: Union[int, np.ndarray]) -> bool: + """ + Returns True if the series and bins contain enough usable numeric data + to produce a histogram without matplotlib errors. + """ + if series is None or bins is None: + return False + + if len(series) == 0: + return False + + try: + series_arr = np.asarray(series, dtype=float) + except Exception: + return False + + if not np.isfinite(series_arr).any(): + return False + + # Handle bins type + if isinstance(bins, int): + if bins < 1: + return False + else: + try: + bins_arr = np.asarray(bins, dtype=float) + except Exception: + return False + if len(bins_arr) < 2: + return False + if not np.isfinite(bins_arr).all(): + return False + + return True + + @manage_matplotlib_context() def histogram( config: Settings, series: np.ndarray, bins: Union[int, np.ndarray], date: bool = False, -) -> str: +) -> str | None: """Plot an histogram of the data. Args: @@ -153,6 +189,9 @@ def histogram( The resulting histogram encoded as a string. """ + + if not _is_valid_hist_data(series, bins): + return None plot = _plot_histogram(config, series, bins, date=date, figsize=(7, 3)) plot.xaxis.set_tick_params(rotation=90 if date else 45) plot.figure.tight_layout() @@ -165,7 +204,7 @@ def mini_histogram( series: np.ndarray, bins: Union[int, np.ndarray], date: bool = False, -) -> str: +) -> str | None: """Plot a small (mini) histogram of the data. Args: @@ -176,6 +215,9 @@ def mini_histogram( Returns: The resulting mini histogram encoded as a string. 
""" + if not _is_valid_hist_data(series, bins): + return None + plot = _plot_histogram( config, series, bins, figsize=(3, 2.25), date=date, hide_yaxis=True ) @@ -253,8 +295,16 @@ def correlation_matrix(config: Settings, data: pd.DataFrame, vmin: int = -1) -> cmap.set_bad(config.plot.correlation.bad) labels = data.columns + + try: + matrix = np.asarray(data, dtype=float) + except Exception: + # If conversion fails, create an all-NaN matrix of the appropriate shape + n = len(data) + matrix = np.full((n, n), np.nan, dtype=float) + matrix_image = axes_cor.imshow( - data, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap + matrix, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap ) plt.colorbar(matrix_image) diff --git a/tests/backends/spark_backend/test_correlations_spark.py b/tests/backends/spark_backend/test_correlations_spark.py index ef16224a1..02ab6886c 100644 --- a/tests/backends/spark_backend/test_correlations_spark.py +++ b/tests/backends/spark_backend/test_correlations_spark.py @@ -43,7 +43,10 @@ def correlation_data_cat(spark_session): @pytest.fixture def correlation_var_types(): - return {"test_num_1": {"type": "Numeric"}, "test_num_2": {"type": "Numeric"}} + return { + "test_num_1": {"type": "Numeric", "n": 7, "n_missing": 0}, + "test_num_2": {"type": "Numeric", "n": 7, "n_missing": 0}, + } def test_spearman_spark(correlation_data_num, correlation_var_types): diff --git a/tests/backends/spark_backend/test_descriptions_spark.py b/tests/backends/spark_backend/test_descriptions_spark.py index 24de9afbb..63aabbe4e 100644 --- a/tests/backends/spark_backend/test_descriptions_spark.py +++ b/tests/backends/spark_backend/test_descriptions_spark.py @@ -110,8 +110,8 @@ def expected_results(): }, "x": { "n": 9, - "count": 9, - "p_missing": 0.0, + "count": 8, + "p_missing": 0.1111111111111111, "n_distinct": 7, "n_unique": 5, "p_distinct": 0.7777777777777778, @@ -130,21 +130,21 @@ def expected_results(): "95%": 50.0, "mad": 5.0, "min": -10.0, - "max": check_is_NaN, - "mean": check_is_NaN, - "std": check_is_NaN, - "variance": check_is_NaN, - "kurtosis": check_is_NaN, - "skewness": check_is_NaN, - "sum": check_is_NaN, - "range": check_is_NaN, + "max": 50.0, + "mean": 13.375, + "std": 23.68807716974934, + "variance": 561.125, + "kurtosis": -0.9061564710904939, + "skewness": 0.8700654233008702, + "sum": 107.0, + "range": 60.0, "iqr": 18.0, - "cv": check_is_NaN, + "cv": 1.771071190261633, }, "y": { "n": 9, - "count": 9, - "p_missing": 0.0, + "count": 8, + "p_missing": 0.1111111111111111, "n_distinct": 9, "n_unique": 9, "p_distinct": 1.0, @@ -163,16 +163,16 @@ def expected_results(): "95%": 3122.0, "mad": 15.9, "min": -3.1415926535, - "max": check_is_NaN, - "mean": check_is_NaN, - "std": check_is_NaN, - "variance": check_is_NaN, - "kurtosis": check_is_NaN, - "skewness": check_is_NaN, - "sum": check_is_NaN, - "range": check_is_NaN, + "max": 3122.0, + "mean": 491.1743650433125, + "std": 1086.1335236468508, + "variance": 1179686.0311895243, + "kurtosis": 2.6543509612939804, + "skewness": 2.097192909339154, + "sum": 3929.3949203465, + "range": 3125.1415926535, "iqr": 110.999999, - "cv": check_is_NaN, + "cv": 2.211299287883385, }, "cat": { "n": 9, @@ -180,9 +180,9 @@ def expected_results(): "p_missing": 0.1111111111111111, "n_distinct": 7, "n_unique": 6, - "p_distinct": 0.875, + "p_distinct": 0.7777777777777778, "is_unique": False, - "p_unique": 0.75, + "p_unique": 0.6666666666666666, }, "s1": { "n": 9, @@ -247,9 +247,9 @@ def expected_results(): "p_missing": 0.1111111111111111, "n_distinct": 6, 
"n_unique": 4, - "p_distinct": 0.75, + "p_distinct": 0.6666666666666666, "is_unique": False, - "p_unique": 0.5, + "p_unique": 0.4444444444444444, }, "bool_tf": { "count": 9, @@ -287,8 +287,8 @@ def expected_results(): }, "bool_01_with_nan": { "n": 9, - "count": 9, - "p_missing": 0.0, + "count": 8, + "p_missing": 0.1111111111111111, "n_distinct": 3, "n_unique": 1, "p_distinct": 0.3333333333333333, @@ -307,16 +307,16 @@ def expected_results(): "95%": 1.0, "mad": 0.0, "min": 0.0, - "max": check_is_NaN, - "mean": check_is_NaN, - "std": check_is_NaN, - "variance": check_is_NaN, - "kurtosis": check_is_NaN, - "skewness": check_is_NaN, - "sum": check_is_NaN, - "range": check_is_NaN, + "max": 1.0, + "mean": 0.5, + "std": 0.5345224838248488, + "variance": 0.2857142857142857, + "kurtosis": -2.0, + "skewness": 1.1102230246251565e-16, + "sum": 4.0, + "range": 1.0, "iqr": 1.0, - "cv": check_is_NaN, + "cv": 1.0690449676496976, }, "list": { "n": 9, diff --git a/tests/backends/spark_backend/test_issue1429.py b/tests/backends/spark_backend/test_issue1429.py new file mode 100644 index 000000000..3d17b8cef --- /dev/null +++ b/tests/backends/spark_backend/test_issue1429.py @@ -0,0 +1,138 @@ +""" +Test for issue 1429: +https://github.com/ydataai/ydata-profiling/issues/1429 +""" +from typing import List, Optional, Tuple + +import numpy as np +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import types as T + +from ydata_profiling import ProfileReport +from ydata_profiling.config import SparkSettings +from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark +from ydata_profiling.model.spark.describe_generic_spark import describe_generic_spark +from ydata_profiling.model.spark.describe_numeric_spark import ( + describe_numeric_1d_spark, + numeric_stats_spark, +) +from ydata_profiling.model.spark.describe_supported_spark import ( + describe_supported_spark, +) + +RowType = Tuple[ + Optional[str], + Optional[float], + Optional[int], + Optional[bool], + Optional[float], + Optional[str], +] + +config = SparkSettings() + + +def create_test_df(spark: SparkSession) -> DataFrame: + schema = T.StructType( + [ + T.StructField("category", T.StringType(), True), + T.StructField("double", T.DoubleType(), True), + T.StructField("int", T.IntegerType(), True), + T.StructField("boolean", T.BooleanType(), True), + T.StructField("null_double", T.DoubleType(), True), + T.StructField("null_string", T.StringType(), True), + ] + ) + + data: List[RowType] = [ + (f"test_{num + 1}", float(num), int(num), True, None, None) + for num in range(205) + ] + + # Adding dupes + data.extend([("test_1", float(1), int(1), False, None, None) for _ in range(205)]) + + # Adding nulls + data.extend([(None, None, None, None, None, None) for _ in range(100)]) + + return spark.createDataFrame(data, schema=schema) + + +def test_describe_numeric_spark(spark_session): + test_df = create_test_df(spark_session) + + numeric_stats = numeric_stats_spark(df=test_df.select("double"), summary={}) + + for _, value in numeric_stats.items(): + assert value is not None + + +def test_describe_numeric_1d_spark_for_null_column_edge_case( + spark_session, test_output_dir +): + spark = spark_session + test_df = create_test_df(spark) + + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("null_double"), summary={} + ) + + _, _, summary = describe_generic_spark( + config=config, df=test_df.select("null_double"), summary=summary + ) + + _, _, summary = describe_supported_spark( + config=config, 
series=test_df.select("null_double"), summary=summary + ) + + _, _, summary = describe_numeric_1d_spark( + config=config, df=test_df.select("null_double"), summary=summary + ) + + assert summary["iqr"] is np.nan + assert summary["mad"] is np.nan + assert summary["cv"] is np.nan + assert summary["mean"] is None + assert summary["histogram"] == [] + + profile = ProfileReport(test_df, title="1429_edge_case.html", config=config) + + output_file = test_output_dir / "1429_edge_case.html" + + profile.to_file(output_file) + + assert output_file.exists() + + +def test_describe_counts_spark(spark_session): + test_df = create_test_df(spark_session) + + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("category"), summary={} + ) + + assert summary["value_counts_without_nan"].loc["test_1"] == 206 + + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("double"), summary={} + ) + + assert summary["value_counts_without_nan"].loc[float(1)] == 206 + + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("int"), summary={} + ) + + assert summary["value_counts_without_nan"].loc[int(1)] == 206 + + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("boolean"), summary={} + ) + + assert summary["value_counts_without_nan"].loc[True] == 205 + + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("null_double"), summary={} + ) + + assert summary["value_counts_without_nan"].size == 0 diff --git a/tests/backends/spark_backend/test_issue1602.py b/tests/backends/spark_backend/test_issue1602.py new file mode 100644 index 000000000..b25ae1873 --- /dev/null +++ b/tests/backends/spark_backend/test_issue1602.py @@ -0,0 +1,33 @@ +""" +Test for issue 1602: +https://github.com/ydataai/ydata-profiling/issues/1602 +""" + +from pyspark.sql import types as T + +from ydata_profiling import ProfileReport + + +def test_spark_handles_decimal_type(test_output_dir, spark_session): + from decimal import Decimal + + spark = spark_session + + schema = T.StructType( + [ + T.StructField("number", T.StringType(), True), + T.StructField("decimal", T.DecimalType(10, 2), True), + ] + ) + + data = [(f"test_{num + 1}", Decimal(num + 1)) for num in range(205)] + + data.extend([("test_1", Decimal("1.05")) for _ in range(205)]) + + test_df = spark.createDataFrame(data, schema=schema) + + profile = ProfileReport(test_df, title="decimal_handling", explorative=True) + output_file = test_output_dir / "decimal_handling.html" + profile.to_file(output_file) + + assert output_file.exists() diff --git a/tests/backends/spark_backend/test_issue1722.py b/tests/backends/spark_backend/test_issue1722.py new file mode 100644 index 000000000..6cb2d5596 --- /dev/null +++ b/tests/backends/spark_backend/test_issue1722.py @@ -0,0 +1,90 @@ +""" +Test for issue 1722: +https://github.com/ydataai/ydata-profiling/issues/1722 +""" + +from datetime import date, datetime + +from pyspark.sql import SparkSession +from pyspark.sql import types as T + +from ydata_profiling import ProfileReport + + +def make_non_numeric_df(spark: SparkSession): + # Intentionally not including any numeric types + schema = T.StructType( + [ + T.StructField("id", T.StringType(), False), + T.StructField("d", T.DateType(), False), + T.StructField("ts", T.TimestampType(), False), + T.StructField("arr", T.ArrayType(T.IntegerType()), False), + T.StructField("mp", T.MapType(T.StringType(), T.IntegerType()), False), + T.StructField( + "struct", + T.StructType( + [ + 
T.StructField("a", T.IntegerType(), False), + T.StructField("b", T.StringType(), False), + ] + ), + False, + ), + ] + ) + + data = [ + ( + "r1", + date(2020, 1, 1), + datetime(2020, 1, 1, 12, 0), + [1, 2], + {"x": 1, "y": 2}, + (10, "aa"), + ), + ( + "r2", + date(2021, 6, 15), + datetime(2021, 6, 15, 8, 30), + [3], + {"z": 3}, + (20, "bb"), + ), + ("r3", date(2022, 12, 31), datetime(2022, 12, 31, 23, 59), [], {}, (30, "cc")), + ] + + return spark.createDataFrame(data, schema=schema) + + +def test_issue1722(test_output_dir, spark_session): + from pyspark.sql import functions as F + + spark = spark_session + + non_numeric_df = make_non_numeric_df(spark) + + # type casting 1 + df_casted = non_numeric_df.select( + [ + ( + F.col(field.name).cast("string").alias(field.name) + if isinstance(field.dataType, (T.DateType, T.TimestampType)) + else F.col(field.name) + ) + for field in non_numeric_df.schema + ] + ) + # type casting 2 + complex_columns = [ + field.name + for field in non_numeric_df.schema.fields + if isinstance(field.dataType, (T.ArrayType, T.MapType, T.StructType)) + ] + for col_name in complex_columns: + df_casted = df_casted.withColumn(col_name, F.to_json(F.col(col_name))) + + profile = ProfileReport(df_casted, title="non_numeric_1722", explorative=True) + output_file = test_output_dir / "non_numeric_1722.html" + profile.to_file(output_file) + + assert output_file.exists() diff --git a/tests/conftest.py b/tests/conftest.py index 28ce21215..7db073a3c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -88,6 +88,10 @@ def spark_context(): .setAppName("pytest-pyspark-tests") .setMaster("local[*]") .set("spark.sql.ansi.enabled", "false") + .set("spark.driver.host", "127.0.0.1") + .set("spark.driver.bindAddress", "127.0.0.1") + .set("spark.driver.port", "4040") + .set("spark.blockManager.port", "4041") ) # Check if SparkContext exists before creating a new one @@ -112,6 +116,8 @@ def spark_session(spark_context): pytest.skip("Skipping Spark tests because PySpark is not installed.") spark = ( SparkSession.builder.master("local[*]") + .config("spark.driver.host", "127.0.0.1") + .config("spark.driver.bindAddress", "127.0.0.1") .appName("pytest") .config("spark.sql.ansi.enabled", "false") # <-- restore permissive casts .getOrCreate() diff --git a/tests/unit/test_formatters.py b/tests/unit/test_formatters.py index 156cc4385..f49e693d8 100644 --- a/tests/unit/test_formatters.py +++ b/tests/unit/test_formatters.py @@ -83,6 +83,7 @@ def test_fmt_array(array, threshold, expected): (1e20, 10, "1 × 1020"), (1e-20, 10, "1 × 10-20"), (1e8, 3, "1 × 108"), + (None, 3, "N/A"), ], ) def test_fmt_numeric(value, precision, expected):