Commit 4daf389

Adding tests for issue 1429

1 parent b50deed · commit 4daf389

2 files changed: 80 additions & 4 deletions

File tree

src/ydata_profiling/model/spark/describe_counts_spark.py
tests/issues/test_issue1429.py

src/ydata_profiling/model/spark/describe_counts_spark.py
Lines changed: 11 additions & 4 deletions
@@ -36,10 +36,17 @@ def describe_counts_spark(
     value_counts_index_sorted = value_counts.orderBy(F.asc(series.columns[0]))
 
     # Count missing values
-    n_missing = (
-        # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not
-        value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first()
-    )
+    if series.dtypes[0][1] in ("int", "float", "bigint", "double"):
+        n_missing = (
+            # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not
+            value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first()
+        )
+    else:
+        n_missing = (
+            # isnan() is only defined for numeric input, so non-numeric dtypes are checked with isNull() alone
+            value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first()
+        )
+
     n_missing = n_missing["count"] if n_missing else 0
 
     # Convert top 200 values to Pandas for frequency table display
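The dtype guard above matters because Spark's isnan() only resolves against float/double input, so applying it to a non-numeric column (such as the boolean column exercised by the new test below) fails during analysis. A minimal sketch of that failure mode, assuming a local SparkSession; the session setup and the "flag" column name are illustrative, not part of the commit:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(True,), (None,)], "flag boolean")

try:
    # isnan() cannot be resolved against a boolean column
    df.filter(F.col("flag").isNull() | F.isnan(F.col("flag")))
except AnalysisException as exc:
    print(f"isnan() on a boolean column fails analysis: {exc}")

# The isNull()-only branch still counts the missing value correctly
print(df.filter(F.col("flag").isNull()).count())  # 1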

tests/issues/test_issue1429.py
Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
+"""
+Test for issue 1429:
+https://github.com/ydataai/ydata-profiling/issues/1429
+"""
+
+from ydata_profiling.config import SparkSettings
+from ydata_profiling.model.spark.describe_numeric_spark import numeric_stats_spark
+from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark
+from pyspark.sql import types as T, SparkSession, DataFrame
+
+
+def create_test_df(spark: SparkSession) -> DataFrame:
+    schema = T.StructType(
+        [
+            T.StructField("category", T.StringType(), True),
+            T.StructField("double", T.DoubleType(), True),
+            T.StructField("int", T.IntegerType(), True),
+            T.StructField("boolean", T.BooleanType(), True),
+        ]
+    )
+
+    data = [
+        (f"test_{num + 1}", float(num), int(num), True) for num in range(205)
+    ]
+
+    # Adding dupes so that "test_1", 1.0, and 1 each occur 206 times in total
+    data.extend(
+        [
+            ("test_1", float(1), int(1), False) for _ in range(205)
+        ]
+    )
+
+    # Adding nulls, which must not appear in value_counts_without_nan
+    data.extend(
+        [
+            (None, None, None, None) for _ in range(100)
+        ]
+    )
+
+    return spark.createDataFrame(data, schema=schema)
+
+
+def test_describe_numeric_spark(spark_session):
+    test_df = create_test_df(spark_session)
+
+    numeric_stats = numeric_stats_spark(df=test_df.select("double"), summary={})
+
+    for _, value in numeric_stats.items():
+        assert value is not None
+
+
+def test_describe_counts_spark(spark_session):
+    test_df = create_test_df(spark_session)
+
+    _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("category"), summary={})
+
+    assert summary["value_counts_without_nan"].loc["test_1"] == 206
+
+    _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("double"), summary={})
+
+    assert summary["value_counts_without_nan"].loc[float(1)] == 206
+
+    _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("int"), summary={})
+
+    assert summary["value_counts_without_nan"].loc[int(1)] == 206
+
+    _, _, summary = describe_counts_spark(config=SparkSettings(), series=test_df.select("boolean"), summary={})
+
+    assert summary["value_counts_without_nan"].loc[True] == 205
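Note the expected counts: "test_1", float(1), and int(1) each appear once in the base data and 205 more times via the duplicate rows (hence 206), while True appears only in the 205 base rows because the duplicates use False. Both tests also rely on a spark_session pytest fixture that this commit does not define; the repository presumably supplies one via a conftest.py or a plugin. A minimal sketch of such a fixture, assuming a local-mode session (the app name and fixture scope here are illustrative):

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    spark = (
        SparkSession.builder.master("local[*]")
        .appName("test_issue1429")  # illustrative name, not from the commit
        .getOrCreate()
    )
    yield spark
    spark.stop()

With such a fixture in place, the new tests run with: pytest tests/issues/test_issue1429.py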
