From b2d6d1a06531da4752a7341b59156354c251828f Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Mon, 8 Dec 2025 09:17:24 -0600 Subject: [PATCH 1/9] fix: update spark numeric stats calculations In the Pandas implementation, the numeric stats like min/max/stddev/etc. by default ignore null values. This commit updates the Spark implementation to more closely match that. --- .../model/spark/describe_numeric_spark.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index 5e30c7539..ca28c5e10 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -11,6 +11,14 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: column = df.columns[0] + # Removing null and non-finite values from numeric summary stats to match Pandas defaults, which skip NAs (skipna=True) + finite_filter = ( + F.col(column).isNotNull() + & ~F.isnan(F.col(column)) + & ~F.col(column).isin([np.inf, -np.inf]) + ) + non_null_df = df.filter(finite_filter) + expr = [ F.mean(F.col(column)).alias("mean"), F.stddev(F.col(column)).alias("std"), @@ -21,7 +29,7 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: F.skewness(F.col(column)).alias("skewness"), F.sum(F.col(column)).alias("sum"), ] - return df.agg(*expr).first().asDict() + return non_null_df.agg(*expr).first().asDict() def describe_numeric_1d_spark( From 8a0204e4675a7a238b6bba810456041a4c55c29a Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Mon, 8 Dec 2025 11:29:23 -0600 Subject: [PATCH 2/9] fix: update spark null checks for describe_counts_spark method Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not. --- src/ydata_profiling/model/spark/describe_counts_spark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/describe_counts_spark.py index d7a091e7f..597adfdd4 100644 --- a/src/ydata_profiling/model/spark/describe_counts_spark.py +++ b/src/ydata_profiling/model/spark/describe_counts_spark.py @@ -37,7 +37,8 @@ def describe_counts_spark( # Count missing values n_missing = ( - value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first() + # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not + value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first() ) n_missing = n_missing["count"] if n_missing else 0 From d4d8a49e03d77272a4c7744a7c58e7f4fe3a82ff Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Mon, 8 Dec 2025 13:22:47 -0600 Subject: [PATCH 3/9] fix: update Spark frequency counts The previous calculation of counts was actually counting an already summarized DataFrame, so it wasn't capturing the correct counts for each instance of a value. This is updated by summing the count value instead of performing a row count.
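For context, the counting bug described above can be reproduced with a minimal sketch like the following (the SparkSession, column name, and values are illustrative, not taken from the patch):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame([("a",), ("a",), ("a",), ("b",), (None,)], ["col"])

    # First aggregation: one row per distinct value, with its frequency in "count".
    value_counts = df.groupBy("col").count()  # (a, 3), (b, 1), (null, 1)

    # Counting rows of the already-aggregated frame returns 1 for every value,
    # because each distinct value now occupies exactly one row.
    wrong = value_counts.groupBy("col").count()

    # Summing the existing "count" column preserves the true frequencies,
    # which is what the frequency-table fix switches to.
    right = value_counts.groupBy("col").agg(F.sum("count").alias("count"))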
--- src/ydata_profiling/model/spark/describe_counts_spark.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/describe_counts_spark.py index 597adfdd4..9fdeb2737 100644 --- a/src/ydata_profiling/model/spark/describe_counts_spark.py +++ b/src/ydata_profiling/model/spark/describe_counts_spark.py @@ -61,17 +61,15 @@ def describe_counts_spark( value_counts.filter(F.col(column).isNotNull()) # Exclude NaNs .filter(~F.isnan(F.col(column))) # Remove implicit NaNs (if numeric column) .groupBy(column) # Group by unique values - .count() # Count occurrences + .agg(F.sum("count").alias("count")) # Sum of count .orderBy(F.desc("count")) # Sort in descending order - .limit(200) # Limit for performance ) else: value_counts_no_nan = ( value_counts.filter(F.col(column).isNotNull()) # Exclude NULLs .groupBy(column) # Group by unique timestamp values - .count() # Count occurrences + .agg(F.sum("count").alias("count")) # Sum of count .orderBy(F.desc("count")) # Sort by most frequent timestamps - .limit(200) # Limit for performance ) # Convert to Pandas Series, forcing proper structure From 64b75640e8e2603424bf33f4ed66be369c50bbc9 Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Mon, 8 Dec 2025 14:25:35 -0600 Subject: [PATCH 4/9] fix: adding tests for issue 1429 --- .../model/spark/describe_counts_spark.py | 15 ++-- .../backends/spark_backend/test_issue1429.py | 69 +++++++++++++++++++ 2 files changed, 80 insertions(+), 4 deletions(-) create mode 100644 tests/backends/spark_backend/test_issue1429.py diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/describe_counts_spark.py index 9fdeb2737..4b37fdc76 100644 --- a/src/ydata_profiling/model/spark/describe_counts_spark.py +++ b/src/ydata_profiling/model/spark/describe_counts_spark.py @@ -36,10 +36,17 @@ def describe_counts_spark( value_counts_index_sorted = value_counts.orderBy(F.asc(series.columns[0])) # Count missing values - n_missing = ( - # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not - value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first() - ) + if series.dtypes[0][1] in ("int", "float", "bigint", "double"): + n_missing = ( + # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not + value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first() + ) + else: + n_missing = ( + # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not + value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first() + ) + n_missing = n_missing["count"] if n_missing else 0 # Convert top 200 values to Pandas for frequency table display diff --git a/tests/backends/spark_backend/test_issue1429.py b/tests/backends/spark_backend/test_issue1429.py new file mode 100644 index 000000000..991e50e71 --- /dev/null +++ b/tests/backends/spark_backend/test_issue1429.py @@ -0,0 +1,69 @@ +""" +Test for issue 1429: +https://github.com/ydataai/ydata-profiling/issues/1429 +""" + +from ydata_profiling.config import SparkSettings +from ydata_profiling.model.spark.describe_numeric_spark import numeric_stats_spark +from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark +from pyspark.sql import types as T, SparkSession, DataFrame + + +def 
create_test_df(spark: SparkSession) -> DataFrame: + schema = T.StructType( + [ + T.StructField("category", T.StringType(), True), + T.StructField("double", T.DoubleType(), True), + T.StructField("int", T.IntegerType(), True), + T.StructField("boolean", T.BooleanType(), True), + ] + ) + + data = [ + (f"test_{num + 1}", float(num), int(num), True) for num in range(205) + ] + + # Adding dupes + data.extend( + [ + ("test_1", float(1), int(1), False) for _ in range(205) + ] + ) + + # Adding nulls + data.extend( + [ + (None, None, None, None) for _ in range(100) + ] + ) + + return spark.createDataFrame(data, schema=schema) + + +def test_describe_numeric_spark(spark_session): + test_df = create_test_df(spark_session) + + numeric_stats = numeric_stats_spark(df=test_df.select("double"), summary={}) + + for _, value in numeric_stats.items(): + assert value is not None + + +def test_describe_counts_spark(spark_session): + test_df = create_test_df(spark_session) + + _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("category"), summary={}) + + assert summary["value_counts_without_nan"].loc["test_1"] == 206 + + _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("double"), summary={}) + + assert summary["value_counts_without_nan"].loc[float(1)] == 206 + + _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("int"), summary={}) + + assert summary["value_counts_without_nan"].loc[int(1)] == 206 + + _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("boolean"), summary={}) + + assert summary["value_counts_without_nan"].loc[True] == 205 From 3487c8434b6fee935089063c76a7c944cabb416a Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Wed, 10 Dec 2025 17:28:44 -0600 Subject: [PATCH 5/9] fix: edge case - completely null numeric field in Spark Discovered this edge case with real data, and still need to fix the rendering of an empty histogram. 
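For reference, the Pandas behaviour this edge case is being aligned with: under the default skipna=True, reductions over an all-NaN column return NaN instead of raising, as in this small illustrative check:

    import numpy as np
    import pandas as pd

    s = pd.Series([np.nan, np.nan, np.nan], dtype="float64")

    # Each of these returns NaN rather than raising on an all-NaN column.
    print(s.mean(), s.std(), s.min(), s.max())
    print(s.quantile([0.25, 0.5, 0.75]).tolist())  # [nan, nan, nan]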
--- .../model/spark/describe_numeric_spark.py | 53 ++++++++++++------- .../backends/spark_backend/test_issue1429.py | 44 ++++++++++++--- 2 files changed, 71 insertions(+), 26 deletions(-) diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index ca28c5e10..0b616f91c 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -89,30 +89,47 @@ def describe_numeric_1d_spark( quantiles = config.vars.num.quantiles quantile_threshold = 0.05 - summary.update( - { - f"{percentile:.0%}": value - for percentile, value in zip( - quantiles, - df.stat.approxQuantile( - f"{df.columns[0]}", + if summary.get("n") == summary.get("n_missing"): + # This means the entire column is null/nan, so summary values need to be hard-coded: + summary.update( + { + f"{percentile:.0%}": np.nan + for percentile in quantiles + } + ) + + summary["mad"] = np.nan + summary["iqr"] = np.nan + + else: + summary.update( + { + f"{percentile:.0%}": value + for percentile, value in zip( quantiles, - quantile_threshold, - ), - ) - } - ) + df.stat.approxQuantile( + f"{df.columns[0]}", + quantiles, + quantile_threshold, + ), + ) + } + ) - median = summary["50%"] + median = summary.get("50%") - summary["mad"] = df.select( - (F.abs(F.col(f"{df.columns[0]}").cast("int") - median)).alias("abs_dev") - ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0] + summary["mad"] = df.select( + (F.abs(F.col(f"{df.columns[0]}").cast("int") - median)).alias("abs_dev") + ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0] + + summary["iqr"] = summary["75%"] - summary["25%"] # FIXME: move to fmt summary["p_negative"] = summary["n_negative"] / summary["n"] - summary["range"] = summary["max"] - summary["min"] - summary["iqr"] = summary["75%"] - summary["25%"] + if summary["min"] is None or summary["max"] is None: + summary["range"] = np.nan + else: + summary["range"] = summary["max"] - summary["min"] summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.nan summary["p_zeros"] = summary["n_zeros"] / summary["n"] summary["p_infinite"] = summary["n_infinite"] / summary["n"] diff --git a/tests/backends/spark_backend/test_issue1429.py b/tests/backends/spark_backend/test_issue1429.py index 991e50e71..6599931d5 100644 --- a/tests/backends/spark_backend/test_issue1429.py +++ b/tests/backends/spark_backend/test_issue1429.py @@ -2,12 +2,16 @@ Test for issue 1429: https://github.com/ydataai/ydata-profiling/issues/1429 """ +import numpy as np from ydata_profiling.config import SparkSettings -from ydata_profiling.model.spark.describe_numeric_spark import numeric_stats_spark +from ydata_profiling.model.spark.describe_numeric_spark import numeric_stats_spark, describe_numeric_1d_spark from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark +from ydata_profiling.model.spark.describe_generic_spark import describe_generic_spark +from ydata_profiling.model.spark.describe_supported_spark import describe_supported_spark from pyspark.sql import types as T, SparkSession, DataFrame +config = SparkSettings() def create_test_df(spark: SparkSession) -> DataFrame: schema = T.StructType( @@ -16,24 +20,25 @@ def create_test_df(spark: SparkSession) -> DataFrame: T.StructField("double", T.DoubleType(), True), T.StructField("int", T.IntegerType(), True), T.StructField("boolean", T.BooleanType(), True), + T.StructField("null_double", T.DoubleType(), True), ] ) data 
= [ - (f"test_{num + 1}", float(num), int(num), True) for num in range(205) + (f"test_{num + 1}", float(num), int(num), True, None) for num in range(205) ] # Adding dupes data.extend( [ - ("test_1", float(1), int(1), False) for _ in range(205) + ("test_1", float(1), int(1), False, None) for _ in range(205) ] ) # Adding nulls data.extend( [ - (None, None, None, None) for _ in range(100) + (None, None, None, None, None) for _ in range(100) ] ) return spark.createDataFrame(data, schema=schema) @@ -49,21 +54,44 @@ def test_describe_numeric_spark(spark_session): assert value is not None +def test_describe_numeric_1d_spark_for_null_column_edge_case(spark_session, test_output_dir): + spark = spark_session + test_df = create_test_df(spark) + + _, _, summary = describe_counts_spark(config=config, series=test_df.select("null_double"), summary={}) + + _, _, summary = describe_generic_spark(config=config, df=test_df.select("null_double"), summary=summary) + + _, _, summary = describe_supported_spark(config=config, series=test_df.select("null_double"), summary=summary) + + _, _, summary = describe_numeric_1d_spark(config=config, df=test_df.select("null_double"), summary=summary) + + assert summary["iqr"] is np.nan + assert summary["mad"] is np.nan + assert summary["cv"] is np.nan + assert summary["mean"] is None + assert summary["histogram"] == [] + + def test_describe_counts_spark(spark_session): test_df = create_test_df(spark_session) - _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("category"), summary={}) + _, _, summary = describe_counts_spark(config=config, series=test_df.select("category"), summary={}) assert summary["value_counts_without_nan"].loc["test_1"] == 206 - _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("double"), summary={}) + _, _, summary = describe_counts_spark(config=config, series=test_df.select("double"), summary={}) assert summary["value_counts_without_nan"].loc[float(1)] == 206 - _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("int"), summary={}) + _, _, summary = describe_counts_spark(config=config, series=test_df.select("int"), summary={}) assert summary["value_counts_without_nan"].loc[int(1)] == 206 - _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("boolean"), summary={}) + _, _, summary = describe_counts_spark(config=config, series=test_df.select("boolean"), summary={}) assert summary["value_counts_without_nan"].loc[True] == 205 + + _, _, summary = describe_counts_spark(config=config, series=test_df.select("null_double"), summary={}) + + assert summary["value_counts_without_nan"].size == 0 From abde1f2f396abdcee51108e7a0189065bd8625ab Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Sun, 7 Dec 2025 15:34:08 -0600 Subject: [PATCH 6/9] fix: add handling for spark DecimalType This change addresses issue #1602. Computations in the summarize process produce floats when run against Decimal columns. To address this, we simply cast those columns to DoubleType when performing those numeric operations.
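The cast itself is a one-liner in PySpark; a minimal sketch of the idea (the SparkSession, column name, and values are illustrative):

    from decimal import Decimal
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F
    from pyspark.sql import types as T

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    schema = T.StructType([T.StructField("amount", T.DecimalType(10, 2), True)])
    df = spark.createDataFrame([(Decimal("1.05"),), (Decimal("2.10"),)], schema=schema)

    # Decimal columns keep exact precision, but the summarize step mixes their
    # values with Python floats, so the column is cast to DoubleType up front.
    df = df.select(F.col("amount").cast(T.DoubleType()).alias("amount"))
    df.printSchema()  # amount: double (nullable = true)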
--- .../model/spark/describe_counts_spark.py | 5 ++- .../model/spark/summary_spark.py | 3 ++ .../backends/spark_backend/test_issue1602.py | 36 +++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 tests/backends/spark_backend/test_issue1602.py diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/describe_counts_spark.py index 4b37fdc76..d22f75170 100644 --- a/src/ydata_profiling/model/spark/describe_counts_spark.py +++ b/src/ydata_profiling/model/spark/describe_counts_spark.py @@ -5,7 +5,7 @@ import pandas as pd from pyspark.sql import DataFrame -from pyspark.sql import functions as F +from pyspark.sql import functions as F, types as T from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_counts @@ -25,6 +25,9 @@ def describe_counts_spark( Returns: Updated settings, input series, and summary dictionary. """ + # Cast Decimal Type s + if isinstance(series.schema.fields[0].dataType, T.DecimalType): + series = series.select(F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0])) # Count occurrences of each value value_counts = series.groupBy(series.columns[0]).count() diff --git a/src/ydata_profiling/model/spark/summary_spark.py b/src/ydata_profiling/model/spark/summary_spark.py index 5a033b1d5..75e01e240 100644 --- a/src/ydata_profiling/model/spark/summary_spark.py +++ b/src/ydata_profiling/model/spark/summary_spark.py @@ -43,6 +43,8 @@ def spark_describe_1d( if str(series.schema[0].dataType).startswith("ArrayType"): dtype = "ArrayType" + elif str(series.schema[0].dataType).startswith("Decimal"): + dtype = "decimal" else: dtype = series.schema[0].dataType.simpleString() @@ -56,6 +58,7 @@ def spark_describe_1d( "boolean": "Boolean", "date": "DateTime", "timestamp": "DateTime", + "decimal": "Numeric", }[dtype] return summarizer.summarize(config, series, dtype=vtype) diff --git a/tests/backends/spark_backend/test_issue1602.py b/tests/backends/spark_backend/test_issue1602.py new file mode 100644 index 000000000..83e36e027 --- /dev/null +++ b/tests/backends/spark_backend/test_issue1602.py @@ -0,0 +1,36 @@ +""" +Test for issue 1602: +https://github.com/ydataai/ydata-profiling/issues/1602 +""" + +from ydata_profiling import ProfileReport +from pyspark.sql import types as T + +def test_spark_handles_decimal_type(test_output_dir, spark_session): + from decimal import Decimal + spark = spark_session + + schema = T.StructType( + [ + T.StructField("number", T.StringType(), True), + T.StructField("decimal", T.DecimalType(10, 2), True) + ] + ) + + data = [ + (f"test_{num + 1}", Decimal(num + 1)) for num in range(205) + ] + + data.extend( + [ + ("test_1", Decimal("1.05")) for _ in range(205) + ] + ) + + test_df = spark.createDataFrame(data, schema=schema) + + profile = ProfileReport(test_df, title="decimal_handling", explorative=True) + output_file = test_output_dir / "decimal_handling.html" + profile.to_file(output_file) + + assert output_file.exists() From 588be6b1f3077df9a40890e1b9a74ed2aa31d1a3 Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Sat, 6 Dec 2025 21:16:18 -0600 Subject: [PATCH 7/9] fix: add handling for spark correlations with no numeric fields Assembling a vector column in Spark with no numeric columns results in features with a NULL size, NULL indices, and an empty list of values. This causes an exception to be raised when computing correlations. 
The solution here is to avoid computing the correlation matrix when there are no interval columns (numeric). This change addresses issue #1722. --- .../model/spark/correlations_spark.py | 4 ++ .../backends/spark_backend/test_issue1722.py | 72 +++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 tests/backends/spark_backend/test_issue1722.py diff --git a/src/ydata_profiling/model/spark/correlations_spark.py b/src/ydata_profiling/model/spark/correlations_spark.py index f9f1d1ecb..c27393c98 100644 --- a/src/ydata_profiling/model/spark/correlations_spark.py +++ b/src/ydata_profiling/model/spark/correlations_spark.py @@ -49,6 +49,10 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra interval_columns = [ column for column, type_name in variables.items() if type_name == "Numeric" ] + + if not interval_columns: + return [], interval_columns + df = df.select(*interval_columns) # convert to vector column first diff --git a/tests/backends/spark_backend/test_issue1722.py b/tests/backends/spark_backend/test_issue1722.py new file mode 100644 index 000000000..e6bca1d76 --- /dev/null +++ b/tests/backends/spark_backend/test_issue1722.py @@ -0,0 +1,72 @@ +""" +Test for issue 1722: +https://github.com/ydataai/ydata-profiling/issues/1722 +""" + +from ydata_profiling import ProfileReport +from datetime import date, datetime +from pyspark.sql import types as T, SparkSession + +def make_non_numeric_df(spark: SparkSession): + # Intentionally not including any numeric types + schema = T.StructType( + [ + T.StructField("id", T.StringType(), False), + T.StructField("d", T.DateType(), False), + T.StructField("ts", T.TimestampType(), False), + T.StructField("arr", T.ArrayType(T.IntegerType()), False), + T.StructField("mp", T.MapType(T.StringType(), T.IntegerType()), False), + T.StructField( + "struct", + T.StructType( + [ + T.StructField("a", T.IntegerType(), False), + T.StructField("b", T.StringType(), False), + ] + ), + False, + ), + ] + ) + + data = [ + ("r1", date(2020, 1, 1), datetime(2020, 1, 1, 12, 0), [1, 2], {"x": 1, "y": 2}, (10, "aa")), + ("r2", date(2021, 6, 15), datetime(2021, 6, 15, 8, 30), [3], {"z": 3}, (20, "bb")), + ("r3", date(2022, 12, 31), datetime(2022, 12, 31, 23, 59), [], {}, (30, "cc")), + ] + + return spark.createDataFrame(data, schema=schema) + + +def test_issue1722(test_output_dir, spark_session): + from pyspark.sql import functions as F + + spark = spark_session + + non_numeric_df = make_non_numeric_df(spark) + + # type casting 1 + df_casted = non_numeric_df.select( + [ + ( + F.col(field.name).cast("string").alias(field.name) + if isinstance(field.dataType, (T.DateType, T.TimestampType)) + else F.col(field.name) + ) + for field in non_numeric_df.schema + ] + ) + # type casting 2 + complex_columns = [ + field.name + for field in non_numeric_df.schema.fields + if isinstance(field.dataType, (T.ArrayType, T.MapType, T.StructType)) + ] + for col_name in complex_columns: + df_casted = df_casted.withColumn(col_name, F.to_json(F.col(col_name))) + + profile = ProfileReport(df_casted, title="non_numeric_1722", explorative=True) + output_file = test_output_dir / "non_numeric_1722.html" + profile.to_file(output_file) + + assert output_file.exists() From a9c7cf940bd427469fca2a3ac8825e393ae87760 Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Sat, 6 Dec 2025 16:21:16 -0600 Subject: [PATCH 8/9] fix: allow NoneType values to be string formatted This change addresses issue #1723. 
It implements a "N/A" string as the default when formatting NoneType values. --- .gitignore | 4 ++++ pyproject.toml | 1 + src/ydata_profiling/report/formatters.py | 17 ++++++++++------- tests/unit/test_formatters.py | 1 + 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 2d2cfe2b9..71fb75bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ var/ *.egg-info/ .installed.cfg *.egg +uv.lock # PyInstaller # Usually these files are written by a python script from a template @@ -85,3 +86,6 @@ docsrc/source/pages/api/_autosummary/ # User created VERSION version.py + +# local java/spark sdk environment files +.sdkmanrc diff --git a/pyproject.toml b/pyproject.toml index eae6b3dd4..e46cb237c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,7 @@ dev = [ "sphinx-autodoc-typehints>=1.10.3", "sphinx-multiversion>=0.2.3", "autodoc_pydantic", + "standard-imghdr" ] docs = [ diff --git a/src/ydata_profiling/report/formatters.py b/src/ydata_profiling/report/formatters.py index d2c4657e0..199ea854d 100644 --- a/src/ydata_profiling/report/formatters.py +++ b/src/ydata_profiling/report/formatters.py @@ -245,13 +245,16 @@ def fmt_numeric(value: float, precision: int = 10) -> str: Returns: The numeric value with the given precision. """ - fmtted = f"{{:.{precision}g}}".format(value) - for v in ["e+", "e-"]: - if v in fmtted: - sign = "-" if v in "e-" else "" - fmtted = fmtted.replace(v, " × 10") + "" - fmtted = fmtted.replace("0", "") - fmtted = fmtted.replace("", f"{sign}") + if value is None: + fmtted = "N/A" + else: + fmtted = f"{{:.{precision}g}}".format(value) + for v in ["e+", "e-"]: + if v in fmtted: + sign = "-" if v in "e-" else "" + fmtted = fmtted.replace(v, " × 10") + "" + fmtted = fmtted.replace("0", "") + fmtted = fmtted.replace("", f"{sign}") return fmtted diff --git a/tests/unit/test_formatters.py b/tests/unit/test_formatters.py index 156cc4385..f49e693d8 100644 --- a/tests/unit/test_formatters.py +++ b/tests/unit/test_formatters.py @@ -83,6 +83,7 @@ def test_fmt_array(array, threshold, expected): (1e20, 10, "1 × 1020"), (1e-20, 10, "1 × 10-20"), (1e8, 3, "1 × 108"), + (None, 3, "N/A"), ], ) def test_fmt_numeric(value, precision, expected): From aed14e3a8cfd64384a8291a4f8195d9bd49c85b2 Mon Sep 17 00:00:00 2001 From: Michael Chapman Date: Thu, 11 Dec 2025 15:06:37 -0600 Subject: [PATCH 9/9] fix: multiple Spark fixes to approach closer parity to Pandas profiles Addresses handling of completely null numeric columns, and gracefully handling empty correlation sets and plots. 
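The rendering changes below all follow the same shape: a plot helper returns None when there is nothing drawable, and the report layer swaps in an empty placeholder instead of an Image (the image_or_empty helper added further down). A minimal, self-contained sketch of that pattern with illustrative names:

    from typing import List, Optional

    def make_histogram(values: List[Optional[float]]) -> Optional[str]:
        # Return None instead of raising when there is no finite data to plot.
        finite = [v for v in values if v is not None and v == v]  # drops None and NaN
        if not finite:
            return None
        return f"<svg><!-- histogram of {len(finite)} values --></svg>"

    def render(image: Optional[str]) -> str:
        # The report layer degrades to an empty placeholder rather than failing.
        return image if image is not None else "<div class='empty'></div>"

    print(render(make_histogram([])))          # empty placeholder
    print(render(make_histogram([1.0, 2.0])))  # histogram markup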
--- .../model/spark/correlations_spark.py | 16 +++- .../model/spark/describe_counts_spark.py | 17 +++- .../model/spark/describe_numeric_spark.py | 13 +-- .../model/spark/describe_supported_spark.py | 10 +- .../structure/variables/render_categorical.py | 3 +- .../structure/variables/render_count.py | 43 ++++++--- .../report/structure/variables/render_date.py | 91 ++++++++++--------- .../report/structure/variables/render_file.py | 25 +++-- .../report/structure/variables/render_real.py | 72 +++++++++------ .../structure/variables/render_timeseries.py | 29 +++--- src/ydata_profiling/report/utils.py | 34 +++++++ src/ydata_profiling/visualisation/plot.py | 56 +++++++++++- .../spark_backend/test_correlations_spark.py | 5 +- .../spark_backend/test_descriptions_spark.py | 74 +++++++-------- .../backends/spark_backend/test_issue1429.py | 91 ++++++++++++++----- .../backends/spark_backend/test_issue1602.py | 17 ++-- .../backends/spark_backend/test_issue1722.py | 26 +++++- tests/conftest.py | 6 ++ 18 files changed, 422 insertions(+), 206 deletions(-) create mode 100644 src/ydata_profiling/report/utils.py diff --git a/src/ydata_profiling/model/spark/correlations_spark.py b/src/ydata_profiling/model/spark/correlations_spark.py index c27393c98..441dfbf0f 100644 --- a/src/ydata_profiling/model/spark/correlations_spark.py +++ b/src/ydata_profiling/model/spark/correlations_spark.py @@ -45,9 +45,18 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra exact same workflow. The syntax is Correlation.corr(dataframe, method="pearson" OR "spearman"), and Correlation is from pyspark.ml.stat """ - variables = {column: description["type"] for column, description in summary.items()} + variables = { + column: { + "type": description["type"], + "all_missing": description.get("n", 0) - description.get("n_missing", 0) + == 0, + } + for column, description in summary.items() + } interval_columns = [ - column for column, type_name in variables.items() if type_name == "Numeric" + column + for column, mapping in variables.items() + if mapping["type"] == "Numeric" and not mapping["all_missing"] ] if not interval_columns: @@ -67,6 +76,9 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra assembler = VectorAssembler(**assembler_args) df_vector = assembler.transform(df).select(vector_col) + if df_vector.count() <= 1: + return [], interval_columns + # get correlation matrix matrix = ( Correlation.corr(df_vector, vector_col, method=corr_type).head()[0].toArray() diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/describe_counts_spark.py index d22f75170..f02d1043c 100644 --- a/src/ydata_profiling/model/spark/describe_counts_spark.py +++ b/src/ydata_profiling/model/spark/describe_counts_spark.py @@ -5,7 +5,8 @@ import pandas as pd from pyspark.sql import DataFrame -from pyspark.sql import functions as F, types as T +from pyspark.sql import functions as F +from pyspark.sql import types as T from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_counts @@ -27,7 +28,9 @@ def describe_counts_spark( """ # Cast Decimal Type s if isinstance(series.schema.fields[0].dataType, T.DecimalType): - series = series.select(F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0])) + series = series.select( + F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0]) + ) # Count occurrences of each value value_counts = series.groupBy(series.columns[0]).count() @@ 
-42,12 +45,18 @@ def describe_counts_spark( if series.dtypes[0][1] in ("int", "float", "bigint", "double"): n_missing = ( # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not - value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first() + value_counts.filter( + F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0])) + ) + .select("count") + .first() ) else: n_missing = ( # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not - value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first() + value_counts.filter(F.col(series.columns[0]).isNull()) + .select("count") + .first() ) n_missing = n_missing["count"] if n_missing else 0 diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index 0b616f91c..8c299577e 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -13,9 +13,9 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: # Removing null types from numeric summary stats to match Pandas defaults which skip na's (skipna=False) finite_filter = ( - F.col(column).isNotNull() - & ~F.isnan(F.col(column)) - & ~F.col(column).isin([np.inf, -np.inf]) + F.col(column).isNotNull() + & ~F.isnan(F.col(column)) + & ~F.col(column).isin([np.inf, -np.inf]) ) non_null_df = df.filter(finite_filter) @@ -91,12 +91,7 @@ def describe_numeric_1d_spark( if summary.get("n") == summary.get("n_missing"): # This means the entire column is null/nan, so summary values need to be hard-coded: - summary.update( - { - f"{percentile:.0%}": np.nan - for percentile in quantiles - } - ) + summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles}) summary["mad"] = np.nan summary["iqr"] = np.nan diff --git a/src/ydata_profiling/model/spark/describe_supported_spark.py b/src/ydata_profiling/model/spark/describe_supported_spark.py index 1758f668d..7dcaff62c 100644 --- a/src/ydata_profiling/model/spark/describe_supported_spark.py +++ b/src/ydata_profiling/model/spark/describe_supported_spark.py @@ -19,15 +19,15 @@ def describe_supported_spark( """ # number of non-NaN observations in the Series - count = summary["count"] - n_distinct = summary["value_counts"].count() + n = summary["n"] + n_distinct = summary["value_counts"].count() # Null/nan's count in value_counts summary["n_distinct"] = n_distinct - summary["p_distinct"] = n_distinct / count if count > 0 else 0 + summary["p_distinct"] = n_distinct / n if n > 0 else 0 n_unique = summary["value_counts"].where("count == 1").count() - summary["is_unique"] = n_unique == count + summary["is_unique"] = n_unique == n summary["n_unique"] = n_unique - summary["p_unique"] = n_unique / count if count > 0 else 0 + summary["p_unique"] = n_unique / n if n > 0 else 0 return config, series, summary diff --git a/src/ydata_profiling/report/structure/variables/render_categorical.py b/src/ydata_profiling/report/structure/variables/render_categorical.py index 86f5a262a..8fa6b6ae0 100644 --- a/src/ydata_profiling/report/structure/variables/render_categorical.py +++ b/src/ydata_profiling/report/structure/variables/render_categorical.py @@ -23,6 +23,7 @@ from ydata_profiling.report.presentation.core.renderable import Renderable from ydata_profiling.report.presentation.frequency_table_utils import freq_table from 
ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import cat_frequency_plot, histogram @@ -95,7 +96,7 @@ def render_categorical_length( else: hist_data = histogram(config, *summary["histogram_length"]) - length_histo = Image( + length_histo = image_or_empty( hist_data, image_format=config.plot.image_format, alt="length histogram", diff --git a/src/ydata_profiling/report/structure/variables/render_count.py b/src/ydata_profiling/report/structure/variables/render_count.py index e11e9913e..776bb6763 100644 --- a/src/ydata_profiling/report/structure/variables/render_count.py +++ b/src/ydata_profiling/report/structure/variables/render_count.py @@ -8,11 +8,11 @@ from ydata_profiling.report.presentation.core import ( Container, FrequencyTable, - Image, Table, VariableInfo, ) from ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import histogram, mini_histogram @@ -94,26 +94,41 @@ def render_count(config: Settings, summary: dict) -> dict: style=config.html.style, ) - mini_histo = Image( - mini_histogram(config, *summary["histogram"]), - image_format=image_format, + summary_histogram = summary.get("histogram", []) + + mini_hist_data = None + + if summary_histogram: + mini_hist_data = mini_histogram(config, *summary["histogram"]) + + mini_histo = image_or_empty( + mini_hist_data, alt="Mini histogram", + image_format=image_format, + name="Mini Histogram", ) template_variables["top"] = Container( [info, table1, table2, mini_histo], sequence_type="grid" ) - seqs = [ - Image( - histogram(config, *summary["histogram"]), - image_format=image_format, - alt="Histogram", - caption=f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})", - name="Histogram", - anchor_id="histogram", - ) - ] + hist_data = None + hist_caption = None + + if summary_histogram: + hist_data = histogram(config, *summary["histogram"]) + hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" + + hist = image_or_empty( + hist_data, + alt="Histogram", + image_format=image_format, + caption=hist_caption, + name="Histogram", + anchor_id="histogram", + ) + + seqs = [hist] fq = FrequencyTable( template_variables["freq_table_rows"], diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py index 1f142daae..8d002d885 100644 --- a/src/ydata_profiling/report/structure/variables/render_date.py +++ b/src/ydata_profiling/report/structure/variables/render_date.py @@ -2,12 +2,8 @@ from ydata_profiling.config import Settings from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent -from ydata_profiling.report.presentation.core import ( - Container, - Image, - Table, - VariableInfo, -) +from ydata_profiling.report.presentation.core import Container, Table, VariableInfo +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import histogram, mini_histogram @@ -76,55 +72,68 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: style=config.html.style, ) - if isinstance(summary["histogram"], list): - mini_histo = Image( - mini_histogram( + summary_histogram = summary.get("histogram", []) + + mini_hist_data = None + + if summary_histogram: + if isinstance(summary_histogram, list): + 
mini_hist_data = mini_histogram( config, [x[0] for x in summary["histogram"]], [x[1] for x in summary["histogram"]], date=True, - ), - image_format=image_format, - alt="Mini histogram", - ) - else: - mini_histo = Image( - mini_histogram( + ) + else: + mini_hist_data = mini_histogram( config, summary["histogram"][0], summary["histogram"][1], date=True - ), - image_format=image_format, - alt="Mini histogram", - ) + ) + + mini_histo = image_or_empty( + mini_hist_data, + alt="Mini histogram", + image_format=image_format, + name="Mini Histogram", + anchor_id=f"{varid}minihistogram", + ) template_variables["top"] = Container( [info, table1, table2, mini_histo], sequence_type="grid" ) - if isinstance(summary["histogram"], list): - hist_data = histogram( - config, - [x[0] for x in summary["histogram"]], - [x[1] for x in summary["histogram"]], - date=True, - ) - else: - hist_data = histogram( - config, summary["histogram"][0], summary["histogram"][1], date=True + hist_data = None + hist_caption = None + + if summary_histogram: + if isinstance(summary_histogram, list): + hist_data = histogram( + config, + [x[0] for x in summary["histogram"]], + [x[1] for x in summary["histogram"]], + date=True, + ) + else: + hist_data = histogram( + config, summary["histogram"][0], summary["histogram"][1], date=True + ) + + n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0 + hist_caption = ( + f"Histogram with fixed size bins (bins={n_bins})" ) + hist = image_or_empty( + hist_data, + image_format=image_format, + alt="Histogram", + caption=hist_caption, + name="Histogram", + anchor_id=f"{varid}histogram", + ) + # Bottom - n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0 bottom = Container( - [ - Image( - hist_data, - image_format=image_format, - alt="Histogram", - caption=f"Histogram with fixed size bins (bins={n_bins})", - name="Histogram", - anchor_id=f"{varid}histogram", - ) - ], + [hist], sequence_type="tabs", anchor_id=summary["varid"], ) diff --git a/src/ydata_profiling/report/structure/variables/render_file.py b/src/ydata_profiling/report/structure/variables/render_file.py index 81379a41f..e014434ab 100644 --- a/src/ydata_profiling/report/structure/variables/render_file.py +++ b/src/ydata_profiling/report/structure/variables/render_file.py @@ -1,10 +1,11 @@ from typing import List from ydata_profiling.config import Settings -from ydata_profiling.report.presentation.core import Container, FrequencyTable, Image +from ydata_profiling.report.presentation.core import Container, FrequencyTable from ydata_profiling.report.presentation.core.renderable import Renderable from ydata_profiling.report.presentation.frequency_table_utils import freq_table from ydata_profiling.report.structure.variables.render_path import render_path +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import histogram @@ -20,17 +21,21 @@ def render_file(config: Settings, summary: dict) -> dict: image_format = config.plot.image_format file_tabs: List[Renderable] = [] + + hist_data = None + if summary["histogram_file_size"]: + hist_data = histogram(config, *summary["histogram_file_size"]) + if "file_size" in summary: - file_tabs.append( - Image( - histogram(config, *summary["histogram_file_size"]), - image_format=image_format, - alt="Size", - caption=f"Histogram with fixed size bins of file sizes (in bytes) (bins={len(summary['histogram_file_size'][1]) - 1})", - name="File size", - anchor_id=f"{varid}file_size_histogram", - ) + hist = image_or_empty( + 
hist_data, + image_format=image_format, + alt="Size", + caption=f"Histogram with fixed size bins of file sizes (in bytes) (bins={len(summary['histogram_file_size'][1]) - 1})", + name="File size", + anchor_id=f"{varid}file_size_histogram", ) + file_tabs.append(hist) file_dates = { "file_created_time": "Created", diff --git a/src/ydata_profiling/report/structure/variables/render_real.py b/src/ydata_profiling/report/structure/variables/render_real.py index 227200c27..92c73fcf3 100644 --- a/src/ydata_profiling/report/structure/variables/render_real.py +++ b/src/ydata_profiling/report/structure/variables/render_real.py @@ -9,11 +9,11 @@ from ydata_profiling.report.presentation.core import ( Container, FrequencyTable, - Image, Table, VariableInfo, ) from ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import histogram, mini_histogram @@ -118,22 +118,27 @@ def render_real(config: Settings, summary: dict) -> dict: style=config.html.style, ) - if isinstance(summary.get("histogram", []), list): - mini_histo = Image( - mini_histogram( + summary_histogram = summary.get("histogram", []) + + mini_hist_data = None + + if summary_histogram: + if isinstance(summary_histogram, list): + mini_hist_data = mini_histogram( config, - [x[0] for x in summary.get("histogram", [])], - [x[1] for x in summary.get("histogram", [])], - ), - image_format=image_format, - alt="Mini histogram", - ) - else: - mini_histo = Image( - mini_histogram(config, *summary["histogram"]), - image_format=image_format, - alt="Mini histogram", - ) + [x[0] for x in summary_histogram], + [x[1] for x in summary_histogram], + ) + else: + mini_hist_data = mini_histogram(config, *summary_histogram) + + mini_histo = image_or_empty( + mini_hist_data, + alt="Mini histogram", + image_format=image_format, + name="Mini Histogram", + anchor_id=f"{varid}minihistogram", + ) template_variables["top"] = Container( [info, table1, table2, mini_histo], sequence_type="grid" @@ -243,22 +248,31 @@ def render_real(config: Settings, summary: dict) -> dict: sequence_type="grid", ) - if isinstance(summary.get("histogram", []), list): - hist_data = histogram( - config, - [x[0] for x in summary.get("histogram", [])], - [x[1] for x in summary.get("histogram", [])], - ) - bins = len(summary["histogram"][0][1]) - 1 if "histogram" in summary else 0 - hist_caption = f"Histogram with fixed size bins (bins={bins})" - else: - hist_data = histogram(config, *summary["histogram"]) - hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" + hist_data = None + hist_caption = None + + if summary_histogram: + if isinstance(summary_histogram, list): + hist_data = histogram( + config, + [x[0] for x in summary_histogram], + [x[1] for x in summary_histogram], + ) + bins = len(summary_histogram[0][1]) + hist_caption = ( + f"Histogram with fixed size bins (bins={bins - 1})" + ) + else: + hist_data = histogram(config, *summary_histogram) + hist_caption = ( + f"Histogram with fixed size bins " + f"(bins={len(summary_histogram[1]) - 1})" + ) - hist = Image( + hist = image_or_empty( hist_data, - image_format=image_format, alt="Histogram", + image_format=image_format, caption=hist_caption, name="Histogram", anchor_id=f"{varid}histogram", diff --git a/src/ydata_profiling/report/structure/variables/render_timeseries.py b/src/ydata_profiling/report/structure/variables/render_timeseries.py index 6f3bc27cd..17f396ac0 100644 --- 
a/src/ydata_profiling/report/structure/variables/render_timeseries.py +++ b/src/ydata_profiling/report/structure/variables/render_timeseries.py @@ -15,6 +15,7 @@ VariableInfo, ) from ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.report.utils import image_or_empty from ydata_profiling.visualisation.plot import ( histogram, mini_ts_plot, @@ -289,18 +290,24 @@ def render_timeseries(config: Settings, summary: dict) -> dict: sequence_type="grid", ) - if isinstance(summary["histogram"], list): - hist_data = histogram( - config, - [x[0] for x in summary["histogram"]], - [x[1] for x in summary["histogram"]], - ) - hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][0][1]) - 1})" - else: - hist_data = histogram(config, *summary["histogram"]) - hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" + summary_histogram = summary.get("histogram", []) - hist = Image( + hist_data = None + hist_caption = None + + if summary_histogram: + if isinstance(summary_histogram, list): + hist_data = histogram( + config, + [x[0] for x in summary["histogram"]], + [x[1] for x in summary["histogram"]], + ) + hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][0][1]) - 1})" + else: + hist_data = histogram(config, *summary["histogram"]) + hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" + + hist = image_or_empty( hist_data, image_format=image_format, alt="Histogram", diff --git a/src/ydata_profiling/report/utils.py b/src/ydata_profiling/report/utils.py new file mode 100644 index 000000000..9d8a09b04 --- /dev/null +++ b/src/ydata_profiling/report/utils.py @@ -0,0 +1,34 @@ +from typing import Optional + +from ydata_profiling.config import ImageType +from ydata_profiling.report.presentation.core import Container, Image + + +def image_or_empty( + image: Optional[str], + *, + alt: str, + image_format: ImageType, + caption: Optional[str] = None, + name: Optional[str] = None, + anchor_id: Optional[str] = None, +) -> Container | Image: + """ + Add helper to render an image or an empty container when necessary. + """ + if image is None: + return Container( + items=[], + name=name or f"{alt}_empty", + anchor_id=anchor_id, + sequence_type="grid", + ) + + return Image( + image=image, + image_format=image_format, + alt=alt, + caption=caption, + name=name, + anchor_id=anchor_id, + ) diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py index a2df1fd5d..b3dc5338e 100644 --- a/src/ydata_profiling/visualisation/plot.py +++ b/src/ydata_profiling/visualisation/plot.py @@ -134,13 +134,49 @@ def plot_word_cloud(config: Settings, word_counts: pd.Series) -> str: return plot_360_n0sc0pe(config) +def _is_valid_hist_data(series: np.ndarray, bins: Union[int, np.ndarray]) -> bool: + """ + Returns True if the series and bins contain enough usable numeric data + to produce a histogram without matplotlib errors. 
+ """ + if series is None or bins is None: + return False + + if len(series) == 0: + return False + + try: + series_arr = np.asarray(series, dtype=float) + except Exception: + return False + + if not np.isfinite(series_arr).any(): + return False + + # Handle bins type + if isinstance(bins, int): + if bins < 1: + return False + else: + try: + bins_arr = np.asarray(bins, dtype=float) + except Exception: + return False + if len(bins_arr) < 2: + return False + if not np.isfinite(bins_arr).all(): + return False + + return True + + @manage_matplotlib_context() def histogram( config: Settings, series: np.ndarray, bins: Union[int, np.ndarray], date: bool = False, -) -> str: +) -> str | None: """Plot an histogram of the data. Args: @@ -153,6 +189,9 @@ def histogram( The resulting histogram encoded as a string. """ + + if not _is_valid_hist_data(series, bins): + return None plot = _plot_histogram(config, series, bins, date=date, figsize=(7, 3)) plot.xaxis.set_tick_params(rotation=90 if date else 45) plot.figure.tight_layout() @@ -165,7 +204,7 @@ def mini_histogram( series: np.ndarray, bins: Union[int, np.ndarray], date: bool = False, -) -> str: +) -> str | None: """Plot a small (mini) histogram of the data. Args: @@ -176,6 +215,9 @@ def mini_histogram( Returns: The resulting mini histogram encoded as a string. """ + if not _is_valid_hist_data(series, bins): + return None + plot = _plot_histogram( config, series, bins, figsize=(3, 2.25), date=date, hide_yaxis=True ) @@ -253,8 +295,16 @@ def correlation_matrix(config: Settings, data: pd.DataFrame, vmin: int = -1) -> cmap.set_bad(config.plot.correlation.bad) labels = data.columns + + try: + matrix = np.asarray(data, dtype=float) + except Exception: + # If conversion fails, create an all-NaN matrix of the appropriate shape + n = len(data) + matrix = np.full((n, n), np.nan, dtype=float) + matrix_image = axes_cor.imshow( - data, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap + matrix, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap ) plt.colorbar(matrix_image) diff --git a/tests/backends/spark_backend/test_correlations_spark.py b/tests/backends/spark_backend/test_correlations_spark.py index ef16224a1..02ab6886c 100644 --- a/tests/backends/spark_backend/test_correlations_spark.py +++ b/tests/backends/spark_backend/test_correlations_spark.py @@ -43,7 +43,10 @@ def correlation_data_cat(spark_session): @pytest.fixture def correlation_var_types(): - return {"test_num_1": {"type": "Numeric"}, "test_num_2": {"type": "Numeric"}} + return { + "test_num_1": {"type": "Numeric", "n": 7, "n_missing": 0}, + "test_num_2": {"type": "Numeric", "n": 7, "n_missing": 0}, + } def test_spearman_spark(correlation_data_num, correlation_var_types): diff --git a/tests/backends/spark_backend/test_descriptions_spark.py b/tests/backends/spark_backend/test_descriptions_spark.py index 24de9afbb..63aabbe4e 100644 --- a/tests/backends/spark_backend/test_descriptions_spark.py +++ b/tests/backends/spark_backend/test_descriptions_spark.py @@ -110,8 +110,8 @@ def expected_results(): }, "x": { "n": 9, - "count": 9, - "p_missing": 0.0, + "count": 8, + "p_missing": 0.1111111111111111, "n_distinct": 7, "n_unique": 5, "p_distinct": 0.7777777777777778, @@ -130,21 +130,21 @@ def expected_results(): "95%": 50.0, "mad": 5.0, "min": -10.0, - "max": check_is_NaN, - "mean": check_is_NaN, - "std": check_is_NaN, - "variance": check_is_NaN, - "kurtosis": check_is_NaN, - "skewness": check_is_NaN, - "sum": check_is_NaN, - "range": check_is_NaN, + "max": 50.0, + "mean": 13.375, + "std": 
23.68807716974934, + "variance": 561.125, + "kurtosis": -0.9061564710904939, + "skewness": 0.8700654233008702, + "sum": 107.0, + "range": 60.0, "iqr": 18.0, - "cv": check_is_NaN, + "cv": 1.771071190261633, }, "y": { "n": 9, - "count": 9, - "p_missing": 0.0, + "count": 8, + "p_missing": 0.1111111111111111, "n_distinct": 9, "n_unique": 9, "p_distinct": 1.0, @@ -163,16 +163,16 @@ def expected_results(): "95%": 3122.0, "mad": 15.9, "min": -3.1415926535, - "max": check_is_NaN, - "mean": check_is_NaN, - "std": check_is_NaN, - "variance": check_is_NaN, - "kurtosis": check_is_NaN, - "skewness": check_is_NaN, - "sum": check_is_NaN, - "range": check_is_NaN, + "max": 3122.0, + "mean": 491.1743650433125, + "std": 1086.1335236468508, + "variance": 1179686.0311895243, + "kurtosis": 2.6543509612939804, + "skewness": 2.097192909339154, + "sum": 3929.3949203465, + "range": 3125.1415926535, "iqr": 110.999999, - "cv": check_is_NaN, + "cv": 2.211299287883385, }, "cat": { "n": 9, @@ -180,9 +180,9 @@ def expected_results(): "p_missing": 0.1111111111111111, "n_distinct": 7, "n_unique": 6, - "p_distinct": 0.875, + "p_distinct": 0.7777777777777778, "is_unique": False, - "p_unique": 0.75, + "p_unique": 0.6666666666666666, }, "s1": { "n": 9, @@ -247,9 +247,9 @@ def expected_results(): "p_missing": 0.1111111111111111, "n_distinct": 6, "n_unique": 4, - "p_distinct": 0.75, + "p_distinct": 0.6666666666666666, "is_unique": False, - "p_unique": 0.5, + "p_unique": 0.4444444444444444, }, "bool_tf": { "count": 9, @@ -287,8 +287,8 @@ def expected_results(): }, "bool_01_with_nan": { "n": 9, - "count": 9, - "p_missing": 0.0, + "count": 8, + "p_missing": 0.1111111111111111, "n_distinct": 3, "n_unique": 1, "p_distinct": 0.3333333333333333, @@ -307,16 +307,16 @@ def expected_results(): "95%": 1.0, "mad": 0.0, "min": 0.0, - "max": check_is_NaN, - "mean": check_is_NaN, - "std": check_is_NaN, - "variance": check_is_NaN, - "kurtosis": check_is_NaN, - "skewness": check_is_NaN, - "sum": check_is_NaN, - "range": check_is_NaN, + "max": 1.0, + "mean": 0.5, + "std": 0.5345224838248488, + "variance": 0.2857142857142857, + "kurtosis": -2.0, + "skewness": 1.1102230246251565e-16, + "sum": 4.0, + "range": 1.0, "iqr": 1.0, - "cv": check_is_NaN, + "cv": 1.0690449676496976, }, "list": { "n": 9, diff --git a/tests/backends/spark_backend/test_issue1429.py b/tests/backends/spark_backend/test_issue1429.py index 6599931d5..3d17b8cef 100644 --- a/tests/backends/spark_backend/test_issue1429.py +++ b/tests/backends/spark_backend/test_issue1429.py @@ -2,17 +2,36 @@ Test for issue 1429: https://github.com/ydataai/ydata-profiling/issues/1429 """ +from typing import List, Optional, Tuple + import numpy as np +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import types as T +from ydata_profiling import ProfileReport from ydata_profiling.config import SparkSettings -from ydata_profiling.model.spark.describe_numeric_spark import numeric_stats_spark, describe_numeric_1d_spark from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark from ydata_profiling.model.spark.describe_generic_spark import describe_generic_spark -from ydata_profiling.model.spark.describe_supported_spark import describe_supported_spark -from pyspark.sql import types as T, SparkSession, DataFrame +from ydata_profiling.model.spark.describe_numeric_spark import ( + describe_numeric_1d_spark, + numeric_stats_spark, +) +from ydata_profiling.model.spark.describe_supported_spark import ( + describe_supported_spark, +) + +RowType = Tuple[ + Optional[str], + 
Optional[float], + Optional[int], + Optional[bool], + Optional[float], + Optional[str], +] config = SparkSettings() + def create_test_df(spark: SparkSession) -> DataFrame: schema = T.StructType( [ @@ -21,26 +40,20 @@ def create_test_df(spark: SparkSession) -> DataFrame: T.StructField("int", T.IntegerType(), True), T.StructField("boolean", T.BooleanType(), True), T.StructField("null_double", T.DoubleType(), True), + T.StructField("null_string", T.StringType(), True), ] ) - data = [ - (f"test_{num + 1}", float(num), int(num), True, None) for num in range(205) + data: List[RowType] = [ + (f"test_{num + 1}", float(num), int(num), True, None, None) + for num in range(205) ] # Adding dupes - data.extend( - [ - ("test_1", float(1), int(1), False, None) for _ in range(205) - ] - ) + data.extend([("test_1", float(1), int(1), False, None, None) for _ in range(205)]) # Adding nulls - data.extend( - [ - (None, None, None, None, None) for _ in range(100) - ] - ) + data.extend([(None, None, None, None, None, None) for _ in range(100)]) return spark.createDataFrame(data, schema=schema) @@ -54,17 +67,27 @@ def test_describe_numeric_spark(spark_session): assert value is not None -def test_describe_numeric_1d_spark_for_null_column_edge_case(spark_session, test_output_dir): +def test_describe_numeric_1d_spark_for_null_column_edge_case( + spark_session, test_output_dir +): spark = spark_session test_df = create_test_df(spark) - _, _, summary = describe_counts_spark(config=config, series=test_df.select("null_double"), summary={}) + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("null_double"), summary={} + ) - _, _, summary = describe_generic_spark(config=config, df=test_df.select("null_double"), summary=summary) + _, _, summary = describe_generic_spark( + config=config, df=test_df.select("null_double"), summary=summary + ) - _, _, summary = describe_supported_spark(config=config, series=test_df.select("null_double"), summary=summary) + _, _, summary = describe_supported_spark( + config=config, series=test_df.select("null_double"), summary=summary + ) - _, _, summary = describe_numeric_1d_spark(config=config, df=test_df.select("null_double"), summary=summary) + _, _, summary = describe_numeric_1d_spark( + config=config, df=test_df.select("null_double"), summary=summary + ) assert summary["iqr"] is np.nan assert summary["mad"] is np.nan @@ -72,26 +95,44 @@ def test_describe_numeric_1d_spark_for_null_column_edge_case(spark_session, test assert summary["mean"] is None assert summary["histogram"] == [] + profile = ProfileReport(test_df, title="1429_edge_case.html", config=config) + + output_file = test_output_dir / "1429_edge_case.html" + + profile.to_file(output_file) + + assert output_file.exists() + def test_describe_counts_spark(spark_session): test_df = create_test_df(spark_session) - _, _, summary = describe_counts_spark(config=config, series=test_df.select("category"), summary={}) + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("category"), summary={} + ) assert summary["value_counts_without_nan"].loc["test_1"] == 206 - _, _, summary = describe_counts_spark(config=config, series=test_df.select("double"), summary={}) + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("double"), summary={} + ) assert summary["value_counts_without_nan"].loc[float(1)] == 206 - _, _, summary = describe_counts_spark(config=config, series=test_df.select("int"), summary={}) + _, _, summary = describe_counts_spark( + config=config, 
series=test_df.select("int"), summary={} + ) assert summary["value_counts_without_nan"].loc[int(1)] == 206 - _, _, summary = describe_counts_spark(config=config, series=test_df.select("boolean"), summary={}) + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("boolean"), summary={} + ) assert summary["value_counts_without_nan"].loc[True] == 205 - _, _, summary = describe_counts_spark(config=config, series=test_df.select("null_double"), summary={}) + _, _, summary = describe_counts_spark( + config=config, series=test_df.select("null_double"), summary={} + ) assert summary["value_counts_without_nan"].size == 0 diff --git a/tests/backends/spark_backend/test_issue1602.py b/tests/backends/spark_backend/test_issue1602.py index 83e36e027..b25ae1873 100644 --- a/tests/backends/spark_backend/test_issue1602.py +++ b/tests/backends/spark_backend/test_issue1602.py @@ -3,29 +3,26 @@ https://github.com/ydataai/ydata-profiling/issues/1602 """ -from ydata_profiling import ProfileReport from pyspark.sql import types as T +from ydata_profiling import ProfileReport + + def test_spark_handles_decimal_type(test_output_dir, spark_session): from decimal import Decimal + spark = spark_session schema = T.StructType( [ T.StructField("number", T.StringType(), True), - T.StructField("decimal", T.DecimalType(10, 2), True) + T.StructField("decimal", T.DecimalType(10, 2), True), ] ) - data = [ - (f"test_{num + 1}", Decimal(num + 1)) for num in range(205) - ] + data = [(f"test_{num + 1}", Decimal(num + 1)) for num in range(205)] - data.extend( - [ - ("test_1", Decimal("1.05")) for _ in range(205) - ] - ) + data.extend([("test_1", Decimal("1.05")) for _ in range(205)]) test_df = spark.createDataFrame(data, schema=schema) diff --git a/tests/backends/spark_backend/test_issue1722.py b/tests/backends/spark_backend/test_issue1722.py index e6bca1d76..6cb2d5596 100644 --- a/tests/backends/spark_backend/test_issue1722.py +++ b/tests/backends/spark_backend/test_issue1722.py @@ -3,9 +3,13 @@ https://github.com/ydataai/ydata-profiling/issues/1722 """ -from ydata_profiling import ProfileReport from datetime import date, datetime -from pyspark.sql import types as T, SparkSession + +from pyspark.sql import SparkSession +from pyspark.sql import types as T + +from ydata_profiling import ProfileReport + def make_non_numeric_df(spark: SparkSession): # Intentionally not including any numeric types @@ -30,8 +34,22 @@ def make_non_numeric_df(spark: SparkSession): ) data = [ - ("r1", date(2020, 1, 1), datetime(2020, 1, 1, 12, 0), [1, 2], {"x": 1, "y": 2}, (10, "aa")), - ("r2", date(2021, 6, 15), datetime(2021, 6, 15, 8, 30), [3], {"z": 3}, (20, "bb")), + ( + "r1", + date(2020, 1, 1), + datetime(2020, 1, 1, 12, 0), + [1, 2], + {"x": 1, "y": 2}, + (10, "aa"), + ), + ( + "r2", + date(2021, 6, 15), + datetime(2021, 6, 15, 8, 30), + [3], + {"z": 3}, + (20, "bb"), + ), ("r3", date(2022, 12, 31), datetime(2022, 12, 31, 23, 59), [], {}, (30, "cc")), ] diff --git a/tests/conftest.py b/tests/conftest.py index 28ce21215..7db073a3c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -88,6 +88,10 @@ def spark_context(): .setAppName("pytest-pyspark-tests") .setMaster("local[*]") .set("spark.sql.ansi.enabled", "false") + .set("spark.driver.host", "127.0.0.1") + .set("spark.driver.bindAddress", "127.0.0.1") + .set("spark.driver.port", "4040") + .set("spark.blockManager.port", "4041") ) # Check if SparkContext exists before creating a new one @@ -112,6 +116,8 @@ def spark_session(spark_context): pytest.skip("Skipping Spark 
tests because PySpark is not installed.") spark = ( SparkSession.builder.master("local[*]") + .config("spark.driver.host", "127.0.0.1") + .config("spark.driver.bindAddress", "127.0.0.1") .appName("pytest") .config("spark.sql.ansi.enabled", "false") # <-- restore permissive casts .getOrCreate()