Commit 3487c84

fix: edge case - completely null numeric field in Spark
Discovered this edge case with real data; the rendering of the resulting empty histogram still needs to be fixed.
1 parent 64b7564
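
For context, the failure mode is easy to reproduce in a few lines of PySpark (a minimal sketch assuming a local Spark session; the column name null_double mirrors the test added below). approxQuantile ignores null/NaN values and returns an empty list when nothing remains, and min/max aggregates over an all-null column come back as None:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame([(None,), (None,), (None,)], "null_double: double")

    # An all-null column leaves approxQuantile nothing to work with, so it
    # returns []; the old code's trailing [0] index then raised IndexError.
    print(df.stat.approxQuantile("null_double", [0.25, 0.5, 0.75], 0.05))  # []

    # min/max over an all-null column are None, so the old
    # summary["max"] - summary["min"] raised a TypeError.
    print(df.agg({"null_double": "max"}).first()[0])  # None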

2 files changed: 71 additions & 26 deletions

src/ydata_profiling/model/spark/describe_numeric_spark.py
Lines changed: 35 additions & 18 deletions
@@ -89,30 +89,47 @@ def describe_numeric_1d_spark(
     quantiles = config.vars.num.quantiles
     quantile_threshold = 0.05
 
-    summary.update(
-        {
-            f"{percentile:.0%}": value
-            for percentile, value in zip(
-                quantiles,
-                df.stat.approxQuantile(
-                    f"{df.columns[0]}",
+    if summary.get("n") == summary.get("n_missing"):
+        # This means the entire column is null/nan, so summary values need to be hard-coded:
+        summary.update(
+            {
+                f"{percentile:.0%}": np.nan
+                for percentile in quantiles
+            }
+        )
+
+        summary["mad"] = np.nan
+        summary["iqr"] = np.nan
+
+    else:
+        summary.update(
+            {
+                f"{percentile:.0%}": value
+                for percentile, value in zip(
                     quantiles,
-                    quantile_threshold,
-                ),
-            )
-        }
-    )
+                    df.stat.approxQuantile(
+                        f"{df.columns[0]}",
+                        quantiles,
+                        quantile_threshold,
+                    ),
+                )
+            }
+        )
 
-    median = summary["50%"]
+        median = summary.get("50%")
 
-    summary["mad"] = df.select(
-        (F.abs(F.col(f"{df.columns[0]}").cast("int") - median)).alias("abs_dev")
-    ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0]
+        summary["mad"] = df.select(
+            (F.abs(F.col(f"{df.columns[0]}").cast("int") - median)).alias("abs_dev")
+        ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0]
+
+        summary["iqr"] = summary["75%"] - summary["25%"]
 
     # FIXME: move to fmt
     summary["p_negative"] = summary["n_negative"] / summary["n"]
-    summary["range"] = summary["max"] - summary["min"]
-    summary["iqr"] = summary["75%"] - summary["25%"]
+    if summary["min"] is None or summary["max"] is None:
+        summary["range"] = np.nan
+    else:
+        summary["range"] = summary["max"] - summary["min"]
     summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.nan
     summary["p_zeros"] = summary["n_zeros"] / summary["n"]
     summary["p_infinite"] = summary["n_infinite"] / summary["n"]

tests/backends/spark_backend/test_issue1429.py
Lines changed: 36 additions & 8 deletions
@@ -2,12 +2,16 @@
 Test for issue 1429:
 https://github.com/ydataai/ydata-profiling/issues/1429
 """
+import numpy as np
 
 from ydata_profiling.config import SparkSettings
-from ydata_profiling.model.spark.describe_numeric_spark import numeric_stats_spark
+from ydata_profiling.model.spark.describe_numeric_spark import numeric_stats_spark, describe_numeric_1d_spark
 from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark
+from ydata_profiling.model.spark.describe_generic_spark import describe_generic_spark
+from ydata_profiling.model.spark.describe_supported_spark import describe_supported_spark
 from pyspark.sql import types as T, SparkSession, DataFrame
 
+config = SparkSettings()
 
 def create_test_df(spark: SparkSession) -> DataFrame:
     schema = T.StructType(
@@ -16,24 +20,25 @@ def create_test_df(spark: SparkSession) -> DataFrame:
             T.StructField("double", T.DoubleType(), True),
             T.StructField("int", T.IntegerType(), True),
             T.StructField("boolean", T.BooleanType(), True),
+            T.StructField("null_double", T.DoubleType(), True),
         ]
     )
 
     data = [
-        (f"test_{num + 1}", float(num), int(num), True) for num in range(205)
+        (f"test_{num + 1}", float(num), int(num), True, None) for num in range(205)
     ]
 
     # Adding dupes
     data.extend(
         [
-            ("test_1", float(1), int(1), False) for _ in range(205)
+            ("test_1", float(1), int(1), False, None) for _ in range(205)
         ]
    )
 
     # Adding nulls
     data.extend(
         [
-            (None, None, None, None) for _ in range(100)
+            (None, None, None, None, None) for _ in range(100)
         ]
     )
 
@@ -49,21 +54,44 @@ def test_describe_numeric_spark(spark_session):
         assert value is not None
 
 
+def test_describe_numeric_1d_spark_for_null_column_edge_case(spark_session, test_output_dir):
+    spark = spark_session
+    test_df = create_test_df(spark)
+
+    _, _, summary = describe_counts_spark(config=config, series=test_df.select("null_double"), summary={})
+
+    _, _, summary = describe_generic_spark(config=config, df=test_df.select("null_double"), summary=summary)
+
+    _, _, summary = describe_supported_spark(config=config, series=test_df.select("null_double"), summary=summary)
+
+    _, _, summary = describe_numeric_1d_spark(config=config, df=test_df.select("null_double"), summary=summary)
+
+    assert summary["iqr"] is np.nan
+    assert summary["mad"] is np.nan
+    assert summary["cv"] is np.nan
+    assert summary["mean"] is None
+    assert summary["histogram"] == []
+
+
 def test_describe_counts_spark(spark_session):
     test_df = create_test_df(spark_session)
 
-    _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("category"), summary={})
+    _, _, summary = describe_counts_spark(config=config, series=test_df.select("category"), summary={})
 
     assert summary["value_counts_without_nan"].loc["test_1"] == 206
 
-    _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("double"), summary={})
+    _, _, summary = describe_counts_spark(config=config, series=test_df.select("double"), summary={})
 
     assert summary["value_counts_without_nan"].loc[float(1)] == 206
 
-    _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("int"), summary={})
+    _, _, summary = describe_counts_spark(config=config, series=test_df.select("int"), summary={})
 
     assert summary["value_counts_without_nan"].loc[int(1)] == 206
 
-    _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("boolean"), summary={})
+    _, _, summary = describe_counts_spark(config=config, series=test_df.select("boolean"), summary={})
 
     assert summary["value_counts_without_nan"].loc[True] == 205
+
+    _, _, summary = describe_counts_spark(config=config, series=test_df.select("null_double"), summary={})
+
+    assert summary["value_counts_without_nan"].size == 0
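
A note on the new assertions: assert summary["iqr"] is np.nan is an identity check, and it passes only because the fix assigns the np.nan object itself. A NaN produced by arithmetic is a different object, so np.isnan is the more robust idiom in general:

    import numpy as np

    x = np.nan            # what the fix assigns: the np.nan object itself
    y = float("nan")      # a NaN produced some other way

    print(x is np.nan)    # True: same object, so the identity assert passes
    print(y is np.nan)    # False: a different NaN object
    print(np.isnan(y))    # True: value-based check matches any NaN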
