"""Test for issue 1429:
https://github.com/ydataai/ydata-profiling/issues/1429
"""
import numpy as np

from ydata_profiling.config import SparkSettings
from ydata_profiling.model.spark.describe_numeric_spark import (
    numeric_stats_spark,
    describe_numeric_1d_spark,
)
from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark
from ydata_profiling.model.spark.describe_generic_spark import describe_generic_spark
from ydata_profiling.model.spark.describe_supported_spark import describe_supported_spark
from pyspark.sql import types as T, SparkSession, DataFrame

# Single shared settings instance reused by every test in this module.
config = SparkSettings()
1216def create_test_df (spark : SparkSession ) -> DataFrame :
1317 schema = T .StructType (
@@ -16,24 +20,25 @@ def create_test_df(spark: SparkSession) -> DataFrame:
1620 T .StructField ("double" , T .DoubleType (), True ),
1721 T .StructField ("int" , T .IntegerType (), True ),
1822 T .StructField ("boolean" , T .BooleanType (), True ),
23+ T .StructField ("null_double" , T .DoubleType (), True ),
1924 ]
2025 )
2126
2227 data = [
23- (f"test_{ num + 1 } " , float (num ), int (num ), True ) for num in range (205 )
28+ (f"test_{ num + 1 } " , float (num ), int (num ), True , None ) for num in range (205 )
2429 ]
2530
2631 # Adding dupes
2732 data .extend (
2833 [
29- ("test_1" , float (1 ), int (1 ), False ) for _ in range (205 )
34+ ("test_1" , float (1 ), int (1 ), False , None ) for _ in range (205 )
3035 ]
3136 )
3237
3338 # Adding nulls
3439 data .extend (
3540 [
36- (None , None , None , None ) for _ in range (100 )
41+ (None , None , None , None , None ) for _ in range (100 )
3742 ]
3843 )
3944
@@ -49,21 +54,44 @@ def test_describe_numeric_spark(spark_session):
4954 assert value is not None
5055
5156
def test_describe_numeric_1d_spark_for_null_column_edge_case(spark_session, test_output_dir):
    """Regression test for issue 1429.

    Describing a numeric column that contains only nulls must complete the
    full describe pipeline and report NaN spread statistics, a None mean and
    an empty histogram instead of raising.
    """
    spark = spark_session
    test_df = create_test_df(spark)
    # DataFrames are immutable; selecting once and reusing is equivalent to
    # repeating the select before each stage.
    null_series = test_df.select("null_double")

    # Thread `summary` through the pipeline stages in their canonical order.
    _, _, summary = describe_counts_spark(config=config, series=null_series, summary={})

    _, _, summary = describe_generic_spark(config=config, df=null_series, summary=summary)

    _, _, summary = describe_supported_spark(config=config, series=null_series, summary=summary)

    _, _, summary = describe_numeric_1d_spark(config=config, df=null_series, summary=summary)

    # `x is np.nan` only holds for the exact np.nan singleton object and
    # fails for any other NaN value (e.g. float("nan") or a NaN computed by
    # Spark/pandas); np.isnan() is the correct, robust check.
    assert np.isnan(summary["iqr"])
    assert np.isnan(summary["mad"])
    assert np.isnan(summary["cv"])
    assert summary["mean"] is None
    assert summary["histogram"] == []
74+
75+
def test_describe_counts_spark(spark_session):
    """Check NaN-excluded value counts for each column type.

    The duplicated ``test_1`` row appears 206 times in the string/double/int
    columns (1 original + 205 dupes), ``True`` appears 205 times in the
    boolean column, and the all-null ``null_double`` column must yield an
    empty value-counts series.
    """
    test_df = create_test_df(spark_session)

    # (column to select, index key to look up, expected count)
    expectations = [
        ("category", "test_1", 206),
        ("double", float(1), 206),
        ("int", int(1), 206),
        ("boolean", True, 205),
    ]
    for column, key, expected in expectations:
        _, _, summary = describe_counts_spark(
            config=config, series=test_df.select(column), summary={}
        )
        assert summary["value_counts_without_nan"].loc[key] == expected

    # A column with no non-null values produces no counts at all.
    _, _, summary = describe_counts_spark(
        config=config, series=test_df.select("null_double"), summary={}
    )
    assert summary["value_counts_without_nan"].size == 0