22 PySpark counts
33"""
44from typing import Tuple
5+
56import pandas as pd
7+ from pyspark .sql import DataFrame
8+ from pyspark .sql import functions as F
69
710from ydata_profiling .config import Settings
8-
9- from pyspark .sql import DataFrame , functions as F
1011from ydata_profiling .model .summary_algorithms import describe_counts
1112
13+
1214@describe_counts .register
1315def describe_counts_spark (
1416 config : Settings , series : DataFrame , summary : dict
@@ -34,7 +36,9 @@ def describe_counts_spark(
3436 value_counts_index_sorted = value_counts .orderBy (F .asc (series .columns [0 ]))
3537
3638 # Count missing values
37- n_missing = value_counts .filter (F .col (series .columns [0 ]).isNull ()).select ("count" ).first ()
39+ n_missing = (
40+ value_counts .filter (F .col (series .columns [0 ]).isNull ()).select ("count" ).first ()
41+ )
3842 n_missing = n_missing ["count" ] if n_missing else 0
3943
4044 # Convert top 200 values to Pandas for frequency table display
@@ -51,11 +55,9 @@ def describe_counts_spark(
5155
5256 column = series .columns [0 ]
5357
54-
55- if series .dtypes [0 ][1 ] in ('int' , 'float' , 'bigint' , 'double' ):
58+ if series .dtypes [0 ][1 ] in ("int" , "float" , "bigint" , "double" ):
5659 value_counts_no_nan = (
57- value_counts
58- .filter (F .col (column ).isNotNull ()) # Exclude NaNs
60+ value_counts .filter (F .col (column ).isNotNull ()) # Exclude NaNs
5961 .filter (~ F .isnan (F .col (column ))) # Remove implicit NaNs (if numeric column)
6062 .groupBy (column ) # Group by unique values
6163 .count () # Count occurrences
@@ -64,8 +66,7 @@ def describe_counts_spark(
6466 )
6567 else :
6668 value_counts_no_nan = (
67- value_counts
68- .filter (F .col (column ).isNotNull ()) # Exclude NULLs
69+ value_counts .filter (F .col (column ).isNotNull ()) # Exclude NULLs
6970 .groupBy (column ) # Group by unique timestamp values
7071 .count () # Count occurrences
7172 .orderBy (F .desc ("count" )) # Sort by most frequent timestamps
@@ -75,8 +76,12 @@ def describe_counts_spark(
7576 # Convert to Pandas Series, forcing proper structure
7677 if value_counts_no_nan .count () > 0 :
7778 pdf = value_counts_no_nan .toPandas ().set_index (column )["count" ]
78- summary ["value_counts_without_nan" ] = pd .Series (pdf ) # Ensures it's always a Series
79+ summary ["value_counts_without_nan" ] = pd .Series (
80+ pdf
81+ ) # Ensures it's always a Series
7982 else :
80- summary ["value_counts_without_nan" ] = pd .Series (dtype = int ) # Ensures an empty Series
83+ summary ["value_counts_without_nan" ] = pd .Series (
84+ dtype = int
85+ ) # Ensures an empty Series
8186
82- return config , series , summary
87+ return config , series , summary
0 commit comments