Skip to content

Commit aed14e3

Browse files
committed
fix: multiple Spark fixes to bring Spark profiles closer to parity with Pandas profiles
Addresses the handling of completely null numeric columns, and gracefully handles empty correlation sets and plots.
1 parent a9c7cf9 commit aed14e3

18 files changed

Lines changed: 422 additions & 206 deletions

src/ydata_profiling/model/spark/correlations_spark.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,18 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
4545
exact same workflow. The syntax is Correlation.corr(dataframe, method="pearson" OR "spearman"),
4646
and Correlation is from pyspark.ml.stat
4747
"""
48-
variables = {column: description["type"] for column, description in summary.items()}
48+
variables = {
49+
column: {
50+
"type": description["type"],
51+
"all_missing": description.get("n", 0) - description.get("n_missing", 0)
52+
== 0,
53+
}
54+
for column, description in summary.items()
55+
}
4956
interval_columns = [
50-
column for column, type_name in variables.items() if type_name == "Numeric"
57+
column
58+
for column, mapping in variables.items()
59+
if mapping["type"] == "Numeric" and not mapping["all_missing"]
5160
]
5261

5362
if not interval_columns:
@@ -67,6 +76,9 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
6776
assembler = VectorAssembler(**assembler_args)
6877
df_vector = assembler.transform(df).select(vector_col)
6978

79+
if df_vector.count() <= 1:
80+
return [], interval_columns
81+
7082
# get correlation matrix
7183
matrix = (
7284
Correlation.corr(df_vector, vector_col, method=corr_type).head()[0].toArray()

src/ydata_profiling/model/spark/describe_counts_spark.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66
import pandas as pd
77
from pyspark.sql import DataFrame
8-
from pyspark.sql import functions as F, types as T
8+
from pyspark.sql import functions as F
9+
from pyspark.sql import types as T
910

1011
from ydata_profiling.config import Settings
1112
from ydata_profiling.model.summary_algorithms import describe_counts
@@ -27,7 +28,9 @@ def describe_counts_spark(
2728
"""
2829
# Cast DecimalType columns to DoubleType
2930
if isinstance(series.schema.fields[0].dataType, T.DecimalType):
30-
series = series.select(F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0]))
31+
series = series.select(
32+
F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0])
33+
)
3134

3235
# Count occurrences of each value
3336
value_counts = series.groupBy(series.columns[0]).count()
@@ -42,12 +45,18 @@ def describe_counts_spark(
4245
if series.dtypes[0][1] in ("int", "float", "bigint", "double"):
4346
n_missing = (
4447
# Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not
45-
value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first()
48+
value_counts.filter(
49+
F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))
50+
)
51+
.select("count")
52+
.first()
4653
)
4754
else:
4855
n_missing = (
4956
# Non-numeric columns are only checked with isNull() here; the isnan() check is applied only in the numeric branch above
50-
value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first()
57+
value_counts.filter(F.col(series.columns[0]).isNull())
58+
.select("count")
59+
.first()
5160
)
5261

5362
n_missing = n_missing["count"] if n_missing else 0

src/ydata_profiling/model/spark/describe_numeric_spark.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
1313

1414
# Remove null, NaN and infinite values before computing numeric summary stats; Pandas skips NA values by default (skipna=True)
1515
finite_filter = (
16-
F.col(column).isNotNull()
17-
& ~F.isnan(F.col(column))
18-
& ~F.col(column).isin([np.inf, -np.inf])
16+
F.col(column).isNotNull()
17+
& ~F.isnan(F.col(column))
18+
& ~F.col(column).isin([np.inf, -np.inf])
1919
)
2020
non_null_df = df.filter(finite_filter)
2121

@@ -91,12 +91,7 @@ def describe_numeric_1d_spark(
9191

9292
if summary.get("n") == summary.get("n_missing"):
9393
# This means the entire column is null/nan, so summary values need to be hard-coded:
94-
summary.update(
95-
{
96-
f"{percentile:.0%}": np.nan
97-
for percentile in quantiles
98-
}
99-
)
94+
summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles})
10095

10196
summary["mad"] = np.nan
10297
summary["iqr"] = np.nan

src/ydata_profiling/model/spark/describe_supported_spark.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@ def describe_supported_spark(
1919
"""
2020

2121
# number of non-NaN observations in the Series
22-
count = summary["count"]
23-
n_distinct = summary["value_counts"].count()
22+
n = summary["n"]
23+
n_distinct = summary["value_counts"].count() # Null/nan's count in value_counts
2424

2525
summary["n_distinct"] = n_distinct
26-
summary["p_distinct"] = n_distinct / count if count > 0 else 0
26+
summary["p_distinct"] = n_distinct / n if n > 0 else 0
2727

2828
n_unique = summary["value_counts"].where("count == 1").count()
29-
summary["is_unique"] = n_unique == count
29+
summary["is_unique"] = n_unique == n
3030
summary["n_unique"] = n_unique
31-
summary["p_unique"] = n_unique / count if count > 0 else 0
31+
summary["p_unique"] = n_unique / n if n > 0 else 0
3232

3333
return config, series, summary

src/ydata_profiling/report/structure/variables/render_categorical.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from ydata_profiling.report.presentation.core.renderable import Renderable
2424
from ydata_profiling.report.presentation.frequency_table_utils import freq_table
2525
from ydata_profiling.report.structure.variables.render_common import render_common
26+
from ydata_profiling.report.utils import image_or_empty
2627
from ydata_profiling.visualisation.plot import cat_frequency_plot, histogram
2728

2829

@@ -95,7 +96,7 @@ def render_categorical_length(
9596
else:
9697
hist_data = histogram(config, *summary["histogram_length"])
9798

98-
length_histo = Image(
99+
length_histo = image_or_empty(
99100
hist_data,
100101
image_format=config.plot.image_format,
101102
alt="length histogram",

src/ydata_profiling/report/structure/variables/render_count.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88
from ydata_profiling.report.presentation.core import (
99
Container,
1010
FrequencyTable,
11-
Image,
1211
Table,
1312
VariableInfo,
1413
)
1514
from ydata_profiling.report.structure.variables.render_common import render_common
15+
from ydata_profiling.report.utils import image_or_empty
1616
from ydata_profiling.visualisation.plot import histogram, mini_histogram
1717

1818

@@ -94,26 +94,41 @@ def render_count(config: Settings, summary: dict) -> dict:
9494
style=config.html.style,
9595
)
9696

97-
mini_histo = Image(
98-
mini_histogram(config, *summary["histogram"]),
99-
image_format=image_format,
97+
summary_histogram = summary.get("histogram", [])
98+
99+
mini_hist_data = None
100+
101+
if summary_histogram:
102+
mini_hist_data = mini_histogram(config, *summary["histogram"])
103+
104+
mini_histo = image_or_empty(
105+
mini_hist_data,
100106
alt="Mini histogram",
107+
image_format=image_format,
108+
name="Mini Histogram",
101109
)
102110

103111
template_variables["top"] = Container(
104112
[info, table1, table2, mini_histo], sequence_type="grid"
105113
)
106114

107-
seqs = [
108-
Image(
109-
histogram(config, *summary["histogram"]),
110-
image_format=image_format,
111-
alt="Histogram",
112-
caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
113-
name="Histogram",
114-
anchor_id="histogram",
115-
)
116-
]
115+
hist_data = None
116+
hist_caption = None
117+
118+
if summary_histogram:
119+
hist_data = histogram(config, *summary["histogram"])
120+
hist_caption = f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})"
121+
122+
hist = image_or_empty(
123+
hist_data,
124+
alt="Histogram",
125+
image_format=image_format,
126+
caption=hist_caption,
127+
name="Histogram",
128+
anchor_id="histogram",
129+
)
130+
131+
seqs = [hist]
117132

118133
fq = FrequencyTable(
119134
template_variables["freq_table_rows"],

src/ydata_profiling/report/structure/variables/render_date.py

Lines changed: 50 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,8 @@
22

33
from ydata_profiling.config import Settings
44
from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
5-
from ydata_profiling.report.presentation.core import (
6-
Container,
7-
Image,
8-
Table,
9-
VariableInfo,
10-
)
5+
from ydata_profiling.report.presentation.core import Container, Table, VariableInfo
6+
from ydata_profiling.report.utils import image_or_empty
117
from ydata_profiling.visualisation.plot import histogram, mini_histogram
128

139

@@ -76,55 +72,68 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
7672
style=config.html.style,
7773
)
7874

79-
if isinstance(summary["histogram"], list):
80-
mini_histo = Image(
81-
mini_histogram(
75+
summary_histogram = summary.get("histogram", [])
76+
77+
mini_hist_data = None
78+
79+
if summary_histogram:
80+
if isinstance(summary_histogram, list):
81+
mini_hist_data = mini_histogram(
8282
config,
8383
[x[0] for x in summary["histogram"]],
8484
[x[1] for x in summary["histogram"]],
8585
date=True,
86-
),
87-
image_format=image_format,
88-
alt="Mini histogram",
89-
)
90-
else:
91-
mini_histo = Image(
92-
mini_histogram(
86+
)
87+
else:
88+
mini_hist_data = mini_histogram(
9389
config, summary["histogram"][0], summary["histogram"][1], date=True
94-
),
95-
image_format=image_format,
96-
alt="Mini histogram",
97-
)
90+
)
91+
92+
mini_histo = image_or_empty(
93+
mini_hist_data,
94+
alt="Mini histogram",
95+
image_format=image_format,
96+
name="Mini Histogram",
97+
anchor_id=f"{varid}minihistogram",
98+
)
9899

99100
template_variables["top"] = Container(
100101
[info, table1, table2, mini_histo], sequence_type="grid"
101102
)
102103

103-
if isinstance(summary["histogram"], list):
104-
hist_data = histogram(
105-
config,
106-
[x[0] for x in summary["histogram"]],
107-
[x[1] for x in summary["histogram"]],
108-
date=True,
109-
)
110-
else:
111-
hist_data = histogram(
112-
config, summary["histogram"][0], summary["histogram"][1], date=True
104+
hist_data = None
105+
hist_caption = None
106+
107+
if summary_histogram:
108+
if isinstance(summary_histogram, list):
109+
hist_data = histogram(
110+
config,
111+
[x[0] for x in summary["histogram"]],
112+
[x[1] for x in summary["histogram"]],
113+
date=True,
114+
)
115+
else:
116+
hist_data = histogram(
117+
config, summary["histogram"][0], summary["histogram"][1], date=True
118+
)
119+
120+
n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
121+
hist_caption = (
122+
f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})"
113123
)
114124

125+
hist = image_or_empty(
126+
hist_data,
127+
image_format=image_format,
128+
alt="Histogram",
129+
caption=hist_caption,
130+
name="Histogram",
131+
anchor_id=f"{varid}histogram",
132+
)
133+
115134
# Bottom
116-
n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
117135
bottom = Container(
118-
[
119-
Image(
120-
hist_data,
121-
image_format=image_format,
122-
alt="Histogram",
123-
caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
124-
name="Histogram",
125-
anchor_id=f"{varid}histogram",
126-
)
127-
],
136+
[hist],
128137
sequence_type="tabs",
129138
anchor_id=summary["varid"],
130139
)

src/ydata_profiling/report/structure/variables/render_file.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
from typing import List
22

33
from ydata_profiling.config import Settings
4-
from ydata_profiling.report.presentation.core import Container, FrequencyTable, Image
4+
from ydata_profiling.report.presentation.core import Container, FrequencyTable
55
from ydata_profiling.report.presentation.core.renderable import Renderable
66
from ydata_profiling.report.presentation.frequency_table_utils import freq_table
77
from ydata_profiling.report.structure.variables.render_path import render_path
8+
from ydata_profiling.report.utils import image_or_empty
89
from ydata_profiling.visualisation.plot import histogram
910

1011

@@ -20,17 +21,21 @@ def render_file(config: Settings, summary: dict) -> dict:
2021
image_format = config.plot.image_format
2122

2223
file_tabs: List[Renderable] = []
24+
25+
hist_data = None
26+
if summary["histogram_file_size"]:
27+
hist_data = histogram(config, *summary["histogram_file_size"])
28+
2329
if "file_size" in summary:
24-
file_tabs.append(
25-
Image(
26-
histogram(config, *summary["histogram_file_size"]),
27-
image_format=image_format,
28-
alt="Size",
29-
caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={len(summary['histogram_file_size'][1]) - 1})",
30-
name="File size",
31-
anchor_id=f"{varid}file_size_histogram",
32-
)
30+
hist = image_or_empty(
31+
hist_data,
32+
image_format=image_format,
33+
alt="Size",
34+
caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={len(summary['histogram_file_size'][1]) - 1})",
35+
name="File size",
36+
anchor_id=f"{varid}file_size_histogram",
3337
)
38+
file_tabs.append(hist)
3439

3540
file_dates = {
3641
"file_created_time": "Created",

0 commit comments

Comments
 (0)