Skip to content

Commit 8caefa8

Browse files
committed
fix: multiple Spark fixes to approach closer parity to Pandas profiles
Addresses handling of completely null numeric columns, and gracefully handles empty correlation sets and plots.
1 parent a9c7cf9 commit 8caefa8

18 files changed

+420
-198
lines changed

src/ydata_profiling/model/spark/correlations_spark.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,18 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
4545
exact same workflow. The syntax is Correlation.corr(dataframe, method="pearson" OR "spearman"),
4646
and Correlation is from pyspark.ml.stat
4747
"""
48-
variables = {column: description["type"] for column, description in summary.items()}
48+
variables = {
49+
column: {
50+
"type": description["type"],
51+
"all_missing": description.get("n", 0) - description.get("n_missing", 0)
52+
== 0,
53+
}
54+
for column, description in summary.items()
55+
}
4956
interval_columns = [
50-
column for column, type_name in variables.items() if type_name == "Numeric"
57+
column
58+
for column, mapping in variables.items()
59+
if mapping["type"] == "Numeric" and not mapping["all_missing"]
5160
]
5261

5362
if not interval_columns:
@@ -67,6 +76,9 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
6776
assembler = VectorAssembler(**assembler_args)
6877
df_vector = assembler.transform(df).select(vector_col)
6978

79+
if df_vector.count() <= 1:
80+
return [], interval_columns
81+
7082
# get correlation matrix
7183
matrix = (
7284
Correlation.corr(df_vector, vector_col, method=corr_type).head()[0].toArray()

src/ydata_profiling/model/spark/describe_counts_spark.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66
import pandas as pd
77
from pyspark.sql import DataFrame
8-
from pyspark.sql import functions as F, types as T
8+
from pyspark.sql import functions as F
9+
from pyspark.sql import types as T
910

1011
from ydata_profiling.config import Settings
1112
from ydata_profiling.model.summary_algorithms import describe_counts
@@ -27,7 +28,9 @@ def describe_counts_spark(
2728
"""
2829
# Cast DecimalType columns to DoubleType
2930
if isinstance(series.schema.fields[0].dataType, T.DecimalType):
30-
series = series.select(F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0]))
31+
series = series.select(
32+
F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0])
33+
)
3134

3235
# Count occurrences of each value
3336
value_counts = series.groupBy(series.columns[0]).count()
@@ -42,12 +45,18 @@ def describe_counts_spark(
4245
if series.dtypes[0][1] in ("int", "float", "bigint", "double"):
4346
n_missing = (
4447
# Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not
45-
value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first()
48+
value_counts.filter(
49+
F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))
50+
)
51+
.select("count")
52+
.first()
4653
)
4754
else:
4855
n_missing = (
4956
# Only the null check is applied here; the extra isnan() check is used for the numeric dtypes handled in the branch above
50-
value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first()
57+
value_counts.filter(F.col(series.columns[0]).isNull())
58+
.select("count")
59+
.first()
5160
)
5261

5362
n_missing = n_missing["count"] if n_missing else 0

src/ydata_profiling/model/spark/describe_numeric_spark.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
1313

1414
# Remove null, NaN and infinite values from numeric summary stats to match the Pandas default, which skips NA values (skipna=True)
1515
finite_filter = (
16-
F.col(column).isNotNull()
17-
& ~F.isnan(F.col(column))
18-
& ~F.col(column).isin([np.inf, -np.inf])
16+
F.col(column).isNotNull()
17+
& ~F.isnan(F.col(column))
18+
& ~F.col(column).isin([np.inf, -np.inf])
1919
)
2020
non_null_df = df.filter(finite_filter)
2121

@@ -91,12 +91,7 @@ def describe_numeric_1d_spark(
9191

9292
if summary.get("n") == summary.get("n_missing"):
9393
# This means the entire column is null/nan, so summary values need to be hard-coded:
94-
summary.update(
95-
{
96-
f"{percentile:.0%}": np.nan
97-
for percentile in quantiles
98-
}
99-
)
94+
summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles})
10095

10196
summary["mad"] = np.nan
10297
summary["iqr"] = np.nan

src/ydata_profiling/model/spark/describe_supported_spark.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@ def describe_supported_spark(
1919
"""
2020

2121
# number of non-NaN observations in the Series
22-
count = summary["count"]
23-
n_distinct = summary["value_counts"].count()
22+
n = summary["n"]
23+
n_distinct = summary["value_counts"].count() # Null/nan's count in value_counts
2424

2525
summary["n_distinct"] = n_distinct
26-
summary["p_distinct"] = n_distinct / count if count > 0 else 0
26+
summary["p_distinct"] = n_distinct / n if n > 0 else 0
2727

2828
n_unique = summary["value_counts"].where("count == 1").count()
29-
summary["is_unique"] = n_unique == count
29+
summary["is_unique"] = n_unique == n
3030
summary["n_unique"] = n_unique
31-
summary["p_unique"] = n_unique / count if count > 0 else 0
31+
summary["p_unique"] = n_unique / n if n > 0 else 0
3232

3333
return config, series, summary

src/ydata_profiling/report/structure/variables/render_categorical.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from ydata_profiling.report.presentation.core.renderable import Renderable
2424
from ydata_profiling.report.presentation.frequency_table_utils import freq_table
2525
from ydata_profiling.report.structure.variables.render_common import render_common
26+
from ydata_profiling.report.utils import image_or_empty
2627
from ydata_profiling.visualisation.plot import cat_frequency_plot, histogram
2728

2829

@@ -95,7 +96,7 @@ def render_categorical_length(
9596
else:
9697
hist_data = histogram(config, *summary["histogram_length"])
9798

98-
length_histo = Image(
99+
length_histo = image_or_empty(
99100
hist_data,
100101
image_format=config.plot.image_format,
101102
alt="length histogram",

src/ydata_profiling/report/structure/variables/render_count.py

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
VariableInfo,
1414
)
1515
from ydata_profiling.report.structure.variables.render_common import render_common
16+
from ydata_profiling.report.utils import image_or_empty
1617
from ydata_profiling.visualisation.plot import histogram, mini_histogram
1718

1819

@@ -94,26 +95,41 @@ def render_count(config: Settings, summary: dict) -> dict:
9495
style=config.html.style,
9596
)
9697

97-
mini_histo = Image(
98-
mini_histogram(config, *summary["histogram"]),
99-
image_format=image_format,
98+
summary_histogram = summary.get("histogram", [])
99+
100+
mini_hist_data = None
101+
102+
if summary_histogram:
103+
mini_hist_data = mini_histogram(config, *summary["histogram"])
104+
105+
mini_histo = image_or_empty(
106+
mini_hist_data,
100107
alt="Mini histogram",
108+
image_format=image_format,
109+
name="Mini Histogram",
101110
)
102111

103112
template_variables["top"] = Container(
104113
[info, table1, table2, mini_histo], sequence_type="grid"
105114
)
106115

107-
seqs = [
108-
Image(
109-
histogram(config, *summary["histogram"]),
110-
image_format=image_format,
111-
alt="Histogram",
112-
caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
113-
name="Histogram",
114-
anchor_id="histogram",
115-
)
116-
]
116+
hist_data = None
117+
hist_caption = None
118+
119+
if summary_histogram:
120+
hist_data = histogram(config, *summary["histogram"])
121+
hist_caption = f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})"
122+
123+
hist = image_or_empty(
124+
hist_data,
125+
alt="Histogram",
126+
image_format=image_format,
127+
caption=hist_caption,
128+
name="Histogram",
129+
anchor_id="histogram",
130+
)
131+
132+
seqs = [hist]
117133

118134
fq = FrequencyTable(
119135
template_variables["freq_table_rows"],

src/ydata_profiling/report/structure/variables/render_date.py

Lines changed: 49 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
Table,
99
VariableInfo,
1010
)
11+
from ydata_profiling.report.utils import image_or_empty
1112
from ydata_profiling.visualisation.plot import histogram, mini_histogram
1213

1314

@@ -76,55 +77,68 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
7677
style=config.html.style,
7778
)
7879

79-
if isinstance(summary["histogram"], list):
80-
mini_histo = Image(
81-
mini_histogram(
80+
summary_histogram = summary.get("histogram", [])
81+
82+
mini_hist_data = None
83+
84+
if summary_histogram:
85+
if isinstance(summary_histogram, list):
86+
mini_hist_data = mini_histogram(
8287
config,
8388
[x[0] for x in summary["histogram"]],
8489
[x[1] for x in summary["histogram"]],
8590
date=True,
86-
),
87-
image_format=image_format,
88-
alt="Mini histogram",
89-
)
90-
else:
91-
mini_histo = Image(
92-
mini_histogram(
91+
)
92+
else:
93+
mini_hist_data = mini_histogram(
9394
config, summary["histogram"][0], summary["histogram"][1], date=True
94-
),
95-
image_format=image_format,
96-
alt="Mini histogram",
97-
)
95+
)
96+
97+
mini_histo = image_or_empty(
98+
mini_hist_data,
99+
alt="Mini histogram",
100+
image_format=image_format,
101+
name="Mini Histogram",
102+
anchor_id=f"{varid}minihistogram",
103+
)
98104

99105
template_variables["top"] = Container(
100106
[info, table1, table2, mini_histo], sequence_type="grid"
101107
)
102108

103-
if isinstance(summary["histogram"], list):
104-
hist_data = histogram(
105-
config,
106-
[x[0] for x in summary["histogram"]],
107-
[x[1] for x in summary["histogram"]],
108-
date=True,
109-
)
110-
else:
111-
hist_data = histogram(
112-
config, summary["histogram"][0], summary["histogram"][1], date=True
109+
hist_data = None
110+
hist_caption = None
111+
112+
if summary_histogram:
113+
if isinstance(summary_histogram, list):
114+
hist_data = histogram(
115+
config,
116+
[x[0] for x in summary["histogram"]],
117+
[x[1] for x in summary["histogram"]],
118+
date=True,
119+
)
120+
else:
121+
hist_data = histogram(
122+
config, summary["histogram"][0], summary["histogram"][1], date=True
123+
)
124+
125+
n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
126+
hist_caption = (
127+
f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})"
113128
)
114129

130+
hist = image_or_empty(
131+
hist_data,
132+
image_format=image_format,
133+
alt="Histogram",
134+
caption=hist_caption,
135+
name="Histogram",
136+
anchor_id=f"{varid}histogram",
137+
)
138+
115139
# Bottom
116-
n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
117140
bottom = Container(
118-
[
119-
Image(
120-
hist_data,
121-
image_format=image_format,
122-
alt="Histogram",
123-
caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
124-
name="Histogram",
125-
anchor_id=f"{varid}histogram",
126-
)
127-
],
141+
[hist],
128142
sequence_type="tabs",
129143
anchor_id=summary["varid"],
130144
)

src/ydata_profiling/report/structure/variables/render_file.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from ydata_profiling.report.presentation.core.renderable import Renderable
66
from ydata_profiling.report.presentation.frequency_table_utils import freq_table
77
from ydata_profiling.report.structure.variables.render_path import render_path
8+
from ydata_profiling.report.utils import image_or_empty
89
from ydata_profiling.visualisation.plot import histogram
910

1011

@@ -20,17 +21,21 @@ def render_file(config: Settings, summary: dict) -> dict:
2021
image_format = config.plot.image_format
2122

2223
file_tabs: List[Renderable] = []
24+
25+
hist_data = None
26+
if summary["histogram_file_size"]:
27+
hist_data = histogram(config, *summary["histogram_file_size"])
28+
2329
if "file_size" in summary:
24-
file_tabs.append(
25-
Image(
26-
histogram(config, *summary["histogram_file_size"]),
27-
image_format=image_format,
28-
alt="Size",
29-
caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={len(summary['histogram_file_size'][1]) - 1})",
30-
name="File size",
31-
anchor_id=f"{varid}file_size_histogram",
32-
)
30+
hist = image_or_empty(
31+
hist_data,
32+
image_format=image_format,
33+
alt="Size",
34+
caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={len(summary['histogram_file_size'][1]) - 1})",
35+
name="File size",
36+
anchor_id=f"{varid}file_size_histogram",
3337
)
38+
file_tabs.append(hist)
3439

3540
file_dates = {
3641
"file_created_time": "Created",

0 commit comments

Comments
 (0)