diff --git a/.gitignore b/.gitignore
index 2d2cfe2b9..71fb75bd1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ var/
*.egg-info/
.installed.cfg
*.egg
+uv.lock
# PyInstaller
# Usually these files are written by a python script from a template
@@ -85,3 +86,6 @@ docsrc/source/pages/api/_autosummary/
# User created
VERSION
version.py
+
+# local java/spark sdk environment files
+.sdkmanrc
diff --git a/pyproject.toml b/pyproject.toml
index eae6b3dd4..e46cb237c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,6 +87,7 @@ dev = [
"sphinx-autodoc-typehints>=1.10.3",
"sphinx-multiversion>=0.2.3",
"autodoc_pydantic",
+ "standard-imghdr"
]
docs = [
diff --git a/src/ydata_profiling/model/spark/correlations_spark.py b/src/ydata_profiling/model/spark/correlations_spark.py
index f9f1d1ecb..441dfbf0f 100644
--- a/src/ydata_profiling/model/spark/correlations_spark.py
+++ b/src/ydata_profiling/model/spark/correlations_spark.py
@@ -45,10 +45,23 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
exact same workflow. The syntax is Correlation.corr(dataframe, method="pearson" OR "spearman"),
and Correlation is from pyspark.ml.stat
"""
- variables = {column: description["type"] for column, description in summary.items()}
+ variables = {
+ column: {
+ "type": description["type"],
+ "all_missing": description.get("n", 0) - description.get("n_missing", 0)
+ == 0,
+ }
+ for column, description in summary.items()
+ }
interval_columns = [
- column for column, type_name in variables.items() if type_name == "Numeric"
+ column
+ for column, mapping in variables.items()
+ if mapping["type"] == "Numeric" and not mapping["all_missing"]
]
+
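+    # No usable numeric columns (all are non-numeric or entirely missing): nothing to correlate.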
+ if not interval_columns:
+ return [], interval_columns
+
df = df.select(*interval_columns)
# convert to vector column first
@@ -63,6 +76,9 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
assembler = VectorAssembler(**assembler_args)
df_vector = assembler.transform(df).select(vector_col)
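+    # A correlation matrix needs at least two rows of data; with fewer, return an empty result.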
+ if df_vector.count() <= 1:
+ return [], interval_columns
+
# get correlation matrix
matrix = (
Correlation.corr(df_vector, vector_col, method=corr_type).head()[0].toArray()
diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/describe_counts_spark.py
index d7a091e7f..f02d1043c 100644
--- a/src/ydata_profiling/model/spark/describe_counts_spark.py
+++ b/src/ydata_profiling/model/spark/describe_counts_spark.py
@@ -6,6 +6,7 @@
import pandas as pd
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
+from pyspark.sql import types as T
from ydata_profiling.config import Settings
from ydata_profiling.model.summary_algorithms import describe_counts
@@ -25,6 +26,11 @@ def describe_counts_spark(
Returns:
Updated settings, input series, and summary dictionary.
"""
+    # Cast DecimalType columns to DoubleType so the numeric handling below (isnan, stats) works
+ if isinstance(series.schema.fields[0].dataType, T.DecimalType):
+ series = series.select(
+ F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0])
+ )
# Count occurrences of each value
value_counts = series.groupBy(series.columns[0]).count()
@@ -36,9 +42,23 @@ def describe_counts_spark(
value_counts_index_sorted = value_counts.orderBy(F.asc(series.columns[0]))
# Count missing values
- n_missing = (
- value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first()
- )
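+    # isnan() is only defined for numeric columns, so branch on the column's dtype first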
+ if series.dtypes[0][1] in ("int", "float", "bigint", "double"):
+ n_missing = (
+ # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not
+ value_counts.filter(
+ F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))
+ )
+ .select("count")
+ .first()
+ )
+ else:
+ n_missing = (
+            # Non-numeric columns cannot hold NaN, so only the null check is needed here
+ value_counts.filter(F.col(series.columns[0]).isNull())
+ .select("count")
+ .first()
+ )
+
n_missing = n_missing["count"] if n_missing else 0
# Convert top 200 values to Pandas for frequency table display
@@ -60,17 +80,15 @@ def describe_counts_spark(
value_counts.filter(F.col(column).isNotNull()) # Exclude NaNs
.filter(~F.isnan(F.col(column))) # Remove implicit NaNs (if numeric column)
.groupBy(column) # Group by unique values
- .count() # Count occurrences
+            .agg(F.sum("count").alias("count"))  # Sum the pre-computed counts instead of re-counting rows
.orderBy(F.desc("count")) # Sort in descending order
- .limit(200) # Limit for performance
)
else:
value_counts_no_nan = (
value_counts.filter(F.col(column).isNotNull()) # Exclude NULLs
.groupBy(column) # Group by unique timestamp values
- .count() # Count occurrences
+            .agg(F.sum("count").alias("count"))  # Sum the pre-computed counts instead of re-counting rows
.orderBy(F.desc("count")) # Sort by most frequent timestamps
- .limit(200) # Limit for performance
)
# Convert to Pandas Series, forcing proper structure
diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py
index 5e30c7539..8c299577e 100644
--- a/src/ydata_profiling/model/spark/describe_numeric_spark.py
+++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py
@@ -11,6 +11,14 @@
def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
column = df.columns[0]
+    # Drop nulls, NaNs and infinities so the summary stats match the Pandas default of skipping NAs (skipna=True)
+ finite_filter = (
+ F.col(column).isNotNull()
+ & ~F.isnan(F.col(column))
+ & ~F.col(column).isin([np.inf, -np.inf])
+ )
+ non_null_df = df.filter(finite_filter)
+
expr = [
F.mean(F.col(column)).alias("mean"),
F.stddev(F.col(column)).alias("std"),
@@ -21,7 +29,7 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
F.skewness(F.col(column)).alias("skewness"),
F.sum(F.col(column)).alias("sum"),
]
- return df.agg(*expr).first().asDict()
+ return non_null_df.agg(*expr).first().asDict()
def describe_numeric_1d_spark(
@@ -81,30 +89,42 @@ def describe_numeric_1d_spark(
quantiles = config.vars.num.quantiles
quantile_threshold = 0.05
- summary.update(
- {
- f"{percentile:.0%}": value
- for percentile, value in zip(
- quantiles,
- df.stat.approxQuantile(
- f"{df.columns[0]}",
+ if summary.get("n") == summary.get("n_missing"):
+ # This means the entire column is null/nan, so summary values need to be hard-coded:
+ summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles})
+
+ summary["mad"] = np.nan
+ summary["iqr"] = np.nan
+
+ else:
+ summary.update(
+ {
+ f"{percentile:.0%}": value
+ for percentile, value in zip(
quantiles,
- quantile_threshold,
- ),
- )
- }
- )
+ df.stat.approxQuantile(
+ f"{df.columns[0]}",
+ quantiles,
+ quantile_threshold,
+ ),
+ )
+ }
+ )
- median = summary["50%"]
+ median = summary.get("50%")
- summary["mad"] = df.select(
- (F.abs(F.col(f"{df.columns[0]}").cast("int") - median)).alias("abs_dev")
- ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0]
+ summary["mad"] = df.select(
+ (F.abs(F.col(f"{df.columns[0]}").cast("int") - median)).alias("abs_dev")
+ ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0]
+
+ summary["iqr"] = summary["75%"] - summary["25%"]
# FIXME: move to fmt
summary["p_negative"] = summary["n_negative"] / summary["n"]
- summary["range"] = summary["max"] - summary["min"]
- summary["iqr"] = summary["75%"] - summary["25%"]
+ if summary["min"] is None or summary["max"] is None:
+ summary["range"] = np.nan
+ else:
+ summary["range"] = summary["max"] - summary["min"]
summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.nan
summary["p_zeros"] = summary["n_zeros"] / summary["n"]
summary["p_infinite"] = summary["n_infinite"] / summary["n"]
diff --git a/src/ydata_profiling/model/spark/describe_supported_spark.py b/src/ydata_profiling/model/spark/describe_supported_spark.py
index 1758f668d..7dcaff62c 100644
--- a/src/ydata_profiling/model/spark/describe_supported_spark.py
+++ b/src/ydata_profiling/model/spark/describe_supported_spark.py
@@ -19,15 +19,15 @@ def describe_supported_spark(
"""
# number of non-NaN observations in the Series
- count = summary["count"]
- n_distinct = summary["value_counts"].count()
+ n = summary["n"]
+    n_distinct = summary["value_counts"].count()  # value_counts keeps the null/NaN bucket, so it counts as a distinct value
summary["n_distinct"] = n_distinct
- summary["p_distinct"] = n_distinct / count if count > 0 else 0
+ summary["p_distinct"] = n_distinct / n if n > 0 else 0
n_unique = summary["value_counts"].where("count == 1").count()
- summary["is_unique"] = n_unique == count
+ summary["is_unique"] = n_unique == n
summary["n_unique"] = n_unique
- summary["p_unique"] = n_unique / count if count > 0 else 0
+ summary["p_unique"] = n_unique / n if n > 0 else 0
return config, series, summary
diff --git a/src/ydata_profiling/model/spark/summary_spark.py b/src/ydata_profiling/model/spark/summary_spark.py
index 5a033b1d5..75e01e240 100644
--- a/src/ydata_profiling/model/spark/summary_spark.py
+++ b/src/ydata_profiling/model/spark/summary_spark.py
@@ -43,6 +43,8 @@ def spark_describe_1d(
if str(series.schema[0].dataType).startswith("ArrayType"):
dtype = "ArrayType"
+ elif str(series.schema[0].dataType).startswith("Decimal"):
+ dtype = "decimal"
else:
dtype = series.schema[0].dataType.simpleString()
@@ -56,6 +58,7 @@ def spark_describe_1d(
"boolean": "Boolean",
"date": "DateTime",
"timestamp": "DateTime",
+ "decimal": "Numeric",
}[dtype]
return summarizer.summarize(config, series, dtype=vtype)
diff --git a/src/ydata_profiling/report/formatters.py b/src/ydata_profiling/report/formatters.py
index d2c4657e0..199ea854d 100644
--- a/src/ydata_profiling/report/formatters.py
+++ b/src/ydata_profiling/report/formatters.py
@@ -245,13 +245,16 @@ def fmt_numeric(value: float, precision: int = 10) -> str:
Returns:
The numeric value with the given precision.
"""
- fmtted = f"{{:.{precision}g}}".format(value)
- for v in ["e+", "e-"]:
- if v in fmtted:
- sign = "-" if v in "e-" else ""
- fmtted = fmtted.replace(v, " × 10") + ""
- fmtted = fmtted.replace("0", "")
- fmtted = fmtted.replace("", f"{sign}")
+ if value is None:
+ fmtted = "N/A"
+ else:
+ fmtted = f"{{:.{precision}g}}".format(value)
+ for v in ["e+", "e-"]:
+ if v in fmtted:
+ sign = "-" if v in "e-" else ""
+ fmtted = fmtted.replace(v, " × 10") + ""
+ fmtted = fmtted.replace("0", "")
+ fmtted = fmtted.replace("", f"{sign}")
return fmtted
diff --git a/src/ydata_profiling/report/structure/variables/render_categorical.py b/src/ydata_profiling/report/structure/variables/render_categorical.py
index 86f5a262a..8fa6b6ae0 100644
--- a/src/ydata_profiling/report/structure/variables/render_categorical.py
+++ b/src/ydata_profiling/report/structure/variables/render_categorical.py
@@ -23,6 +23,7 @@
from ydata_profiling.report.presentation.core.renderable import Renderable
from ydata_profiling.report.presentation.frequency_table_utils import freq_table
from ydata_profiling.report.structure.variables.render_common import render_common
+from ydata_profiling.report.utils import image_or_empty
from ydata_profiling.visualisation.plot import cat_frequency_plot, histogram
@@ -95,7 +96,7 @@ def render_categorical_length(
else:
hist_data = histogram(config, *summary["histogram_length"])
- length_histo = Image(
+ length_histo = image_or_empty(
hist_data,
image_format=config.plot.image_format,
alt="length histogram",
diff --git a/src/ydata_profiling/report/structure/variables/render_count.py b/src/ydata_profiling/report/structure/variables/render_count.py
index e11e9913e..776bb6763 100644
--- a/src/ydata_profiling/report/structure/variables/render_count.py
+++ b/src/ydata_profiling/report/structure/variables/render_count.py
@@ -8,11 +8,11 @@
from ydata_profiling.report.presentation.core import (
Container,
FrequencyTable,
- Image,
Table,
VariableInfo,
)
from ydata_profiling.report.structure.variables.render_common import render_common
+from ydata_profiling.report.utils import image_or_empty
from ydata_profiling.visualisation.plot import histogram, mini_histogram
@@ -94,26 +94,41 @@ def render_count(config: Settings, summary: dict) -> dict:
style=config.html.style,
)
- mini_histo = Image(
- mini_histogram(config, *summary["histogram"]),
- image_format=image_format,
+ summary_histogram = summary.get("histogram", [])
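+    # The histogram can be missing or empty (e.g. for all-null columns); fall back to an empty placeholder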
+
+ mini_hist_data = None
+
+ if summary_histogram:
+ mini_hist_data = mini_histogram(config, *summary["histogram"])
+
+ mini_histo = image_or_empty(
+ mini_hist_data,
alt="Mini histogram",
+ image_format=image_format,
+ name="Mini Histogram",
)
template_variables["top"] = Container(
[info, table1, table2, mini_histo], sequence_type="grid"
)
- seqs = [
- Image(
- histogram(config, *summary["histogram"]),
- image_format=image_format,
- alt="Histogram",
- caption=f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})",
- name="Histogram",
- anchor_id="histogram",
- )
- ]
+ hist_data = None
+ hist_caption = None
+
+ if summary_histogram:
+ hist_data = histogram(config, *summary["histogram"])
+ hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})"
+
+ hist = image_or_empty(
+ hist_data,
+ alt="Histogram",
+ image_format=image_format,
+ caption=hist_caption,
+ name="Histogram",
+ anchor_id="histogram",
+ )
+
+ seqs = [hist]
fq = FrequencyTable(
template_variables["freq_table_rows"],
diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py
index 1f142daae..8d002d885 100644
--- a/src/ydata_profiling/report/structure/variables/render_date.py
+++ b/src/ydata_profiling/report/structure/variables/render_date.py
@@ -2,12 +2,8 @@
from ydata_profiling.config import Settings
from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
-from ydata_profiling.report.presentation.core import (
- Container,
- Image,
- Table,
- VariableInfo,
-)
+from ydata_profiling.report.presentation.core import Container, Table, VariableInfo
+from ydata_profiling.report.utils import image_or_empty
from ydata_profiling.visualisation.plot import histogram, mini_histogram
@@ -76,55 +72,68 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
style=config.html.style,
)
- if isinstance(summary["histogram"], list):
- mini_histo = Image(
- mini_histogram(
+ summary_histogram = summary.get("histogram", [])
+
+ mini_hist_data = None
+
+ if summary_histogram:
+ if isinstance(summary_histogram, list):
+ mini_hist_data = mini_histogram(
config,
[x[0] for x in summary["histogram"]],
[x[1] for x in summary["histogram"]],
date=True,
- ),
- image_format=image_format,
- alt="Mini histogram",
- )
- else:
- mini_histo = Image(
- mini_histogram(
+ )
+ else:
+ mini_hist_data = mini_histogram(
config, summary["histogram"][0], summary["histogram"][1], date=True
- ),
- image_format=image_format,
- alt="Mini histogram",
- )
+ )
+
+ mini_histo = image_or_empty(
+ mini_hist_data,
+ alt="Mini histogram",
+ image_format=image_format,
+ name="Mini Histogram",
+ anchor_id=f"{varid}minihistogram",
+ )
template_variables["top"] = Container(
[info, table1, table2, mini_histo], sequence_type="grid"
)
- if isinstance(summary["histogram"], list):
- hist_data = histogram(
- config,
- [x[0] for x in summary["histogram"]],
- [x[1] for x in summary["histogram"]],
- date=True,
- )
- else:
- hist_data = histogram(
- config, summary["histogram"][0], summary["histogram"][1], date=True
+ hist_data = None
+ hist_caption = None
+
+ if summary_histogram:
+ if isinstance(summary_histogram, list):
+ hist_data = histogram(
+ config,
+ [x[0] for x in summary["histogram"]],
+ [x[1] for x in summary["histogram"]],
+ date=True,
+ )
+ else:
+ hist_data = histogram(
+ config, summary["histogram"][0], summary["histogram"][1], date=True
+ )
+
+        n_bins = len(summary["histogram"][1]) - 1
+        hist_caption = f"Histogram with fixed size bins (bins={n_bins})"
+ hist = image_or_empty(
+ hist_data,
+ image_format=image_format,
+ alt="Histogram",
+ caption=hist_caption,
+ name="Histogram",
+ anchor_id=f"{varid}histogram",
+ )
+
# Bottom
- n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
bottom = Container(
- [
- Image(
- hist_data,
- image_format=image_format,
- alt="Histogram",
- caption=f"Histogram with fixed size bins (bins={n_bins})",
- name="Histogram",
- anchor_id=f"{varid}histogram",
- )
- ],
+ [hist],
sequence_type="tabs",
anchor_id=summary["varid"],
)
diff --git a/src/ydata_profiling/report/structure/variables/render_file.py b/src/ydata_profiling/report/structure/variables/render_file.py
index 81379a41f..e014434ab 100644
--- a/src/ydata_profiling/report/structure/variables/render_file.py
+++ b/src/ydata_profiling/report/structure/variables/render_file.py
@@ -1,10 +1,11 @@
from typing import List
from ydata_profiling.config import Settings
-from ydata_profiling.report.presentation.core import Container, FrequencyTable, Image
+from ydata_profiling.report.presentation.core import Container, FrequencyTable
from ydata_profiling.report.presentation.core.renderable import Renderable
from ydata_profiling.report.presentation.frequency_table_utils import freq_table
from ydata_profiling.report.structure.variables.render_path import render_path
+from ydata_profiling.report.utils import image_or_empty
from ydata_profiling.visualisation.plot import histogram
@@ -20,17 +21,21 @@ def render_file(config: Settings, summary: dict) -> dict:
image_format = config.plot.image_format
file_tabs: List[Renderable] = []
+
+ hist_data = None
+ if summary["histogram_file_size"]:
+ hist_data = histogram(config, *summary["histogram_file_size"])
+
if "file_size" in summary:
- file_tabs.append(
- Image(
- histogram(config, *summary["histogram_file_size"]),
- image_format=image_format,
- alt="Size",
- caption=f"Histogram with fixed size bins of file sizes (in bytes) (bins={len(summary['histogram_file_size'][1]) - 1})",
- name="File size",
- anchor_id=f"{varid}file_size_histogram",
- )
+ hist = image_or_empty(
+ hist_data,
+ image_format=image_format,
+ alt="Size",
+ caption=f"Histogram with fixed size bins of file sizes (in bytes) (bins={len(summary['histogram_file_size'][1]) - 1})",
+ name="File size",
+ anchor_id=f"{varid}file_size_histogram",
)
+ file_tabs.append(hist)
file_dates = {
"file_created_time": "Created",
diff --git a/src/ydata_profiling/report/structure/variables/render_real.py b/src/ydata_profiling/report/structure/variables/render_real.py
index 227200c27..92c73fcf3 100644
--- a/src/ydata_profiling/report/structure/variables/render_real.py
+++ b/src/ydata_profiling/report/structure/variables/render_real.py
@@ -9,11 +9,11 @@
from ydata_profiling.report.presentation.core import (
Container,
FrequencyTable,
- Image,
Table,
VariableInfo,
)
from ydata_profiling.report.structure.variables.render_common import render_common
+from ydata_profiling.report.utils import image_or_empty
from ydata_profiling.visualisation.plot import histogram, mini_histogram
@@ -118,22 +118,27 @@ def render_real(config: Settings, summary: dict) -> dict:
style=config.html.style,
)
- if isinstance(summary.get("histogram", []), list):
- mini_histo = Image(
- mini_histogram(
+ summary_histogram = summary.get("histogram", [])
+
+ mini_hist_data = None
+
+ if summary_histogram:
+ if isinstance(summary_histogram, list):
+ mini_hist_data = mini_histogram(
config,
- [x[0] for x in summary.get("histogram", [])],
- [x[1] for x in summary.get("histogram", [])],
- ),
- image_format=image_format,
- alt="Mini histogram",
- )
- else:
- mini_histo = Image(
- mini_histogram(config, *summary["histogram"]),
- image_format=image_format,
- alt="Mini histogram",
- )
+ [x[0] for x in summary_histogram],
+ [x[1] for x in summary_histogram],
+ )
+ else:
+ mini_hist_data = mini_histogram(config, *summary_histogram)
+
+ mini_histo = image_or_empty(
+ mini_hist_data,
+ alt="Mini histogram",
+ image_format=image_format,
+ name="Mini Histogram",
+ anchor_id=f"{varid}minihistogram",
+ )
template_variables["top"] = Container(
[info, table1, table2, mini_histo], sequence_type="grid"
@@ -243,22 +248,31 @@ def render_real(config: Settings, summary: dict) -> dict:
sequence_type="grid",
)
- if isinstance(summary.get("histogram", []), list):
- hist_data = histogram(
- config,
- [x[0] for x in summary.get("histogram", [])],
- [x[1] for x in summary.get("histogram", [])],
- )
- bins = len(summary["histogram"][0][1]) - 1 if "histogram" in summary else 0
- hist_caption = f"Histogram with fixed size bins (bins={bins})"
- else:
- hist_data = histogram(config, *summary["histogram"])
- hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})"
+ hist_data = None
+ hist_caption = None
+
+ if summary_histogram:
+ if isinstance(summary_histogram, list):
+ hist_data = histogram(
+ config,
+ [x[0] for x in summary_histogram],
+ [x[1] for x in summary_histogram],
+ )
+            bins = len(summary_histogram[0][1]) - 1
+            hist_caption = f"Histogram with fixed size bins (bins={bins})"
+ else:
+ hist_data = histogram(config, *summary_histogram)
+ hist_caption = (
+ f"Histogram with fixed size bins "
+ f"(bins={len(summary_histogram[1]) - 1})"
+ )
- hist = Image(
+ hist = image_or_empty(
hist_data,
- image_format=image_format,
alt="Histogram",
+ image_format=image_format,
caption=hist_caption,
name="Histogram",
anchor_id=f"{varid}histogram",
diff --git a/src/ydata_profiling/report/structure/variables/render_timeseries.py b/src/ydata_profiling/report/structure/variables/render_timeseries.py
index 6f3bc27cd..17f396ac0 100644
--- a/src/ydata_profiling/report/structure/variables/render_timeseries.py
+++ b/src/ydata_profiling/report/structure/variables/render_timeseries.py
@@ -15,6 +15,7 @@
VariableInfo,
)
from ydata_profiling.report.structure.variables.render_common import render_common
+from ydata_profiling.report.utils import image_or_empty
from ydata_profiling.visualisation.plot import (
histogram,
mini_ts_plot,
@@ -289,18 +290,24 @@ def render_timeseries(config: Settings, summary: dict) -> dict:
sequence_type="grid",
)
- if isinstance(summary["histogram"], list):
- hist_data = histogram(
- config,
- [x[0] for x in summary["histogram"]],
- [x[1] for x in summary["histogram"]],
- )
- hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][0][1]) - 1})"
- else:
- hist_data = histogram(config, *summary["histogram"])
- hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})"
+ summary_histogram = summary.get("histogram", [])
- hist = Image(
+ hist_data = None
+ hist_caption = None
+
+ if summary_histogram:
+ if isinstance(summary_histogram, list):
+ hist_data = histogram(
+ config,
+ [x[0] for x in summary["histogram"]],
+ [x[1] for x in summary["histogram"]],
+ )
+ hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][0][1]) - 1})"
+ else:
+ hist_data = histogram(config, *summary["histogram"])
+ hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})"
+
+ hist = image_or_empty(
hist_data,
image_format=image_format,
alt="Histogram",
diff --git a/src/ydata_profiling/report/utils.py b/src/ydata_profiling/report/utils.py
new file mode 100644
index 000000000..9d8a09b04
--- /dev/null
+++ b/src/ydata_profiling/report/utils.py
@@ -0,0 +1,34 @@
+from typing import Optional, Union
+
+from ydata_profiling.config import ImageType
+from ydata_profiling.report.presentation.core import Container, Image
+
+
+def image_or_empty(
+ image: Optional[str],
+ *,
+ alt: str,
+ image_format: ImageType,
+ caption: Optional[str] = None,
+ name: Optional[str] = None,
+ anchor_id: Optional[str] = None,
+) -> Union[Container, Image]:
+ """
+    Render the given image, or an empty placeholder container when no image data is available.
+ """
+ if image is None:
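+        # No plot data was produced (e.g. an all-null column); render an empty grid container instead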
+ return Container(
+ items=[],
+ name=name or f"{alt}_empty",
+ anchor_id=anchor_id,
+ sequence_type="grid",
+ )
+
+ return Image(
+ image=image,
+ image_format=image_format,
+ alt=alt,
+ caption=caption,
+ name=name,
+ anchor_id=anchor_id,
+ )
diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py
index a2df1fd5d..b3dc5338e 100644
--- a/src/ydata_profiling/visualisation/plot.py
+++ b/src/ydata_profiling/visualisation/plot.py
@@ -134,13 +134,49 @@ def plot_word_cloud(config: Settings, word_counts: pd.Series) -> str:
return plot_360_n0sc0pe(config)
+def _is_valid_hist_data(series: np.ndarray, bins: Union[int, np.ndarray]) -> bool:
+ """
+ Returns True if the series and bins contain enough usable numeric data
+ to produce a histogram without matplotlib errors.
+ """
+ if series is None or bins is None:
+ return False
+
+ if len(series) == 0:
+ return False
+
+ try:
+ series_arr = np.asarray(series, dtype=float)
+ except Exception:
+ return False
+
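+    # matplotlib needs at least one finite value to compute histogram bins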
+ if not np.isfinite(series_arr).any():
+ return False
+
+ # Handle bins type
+ if isinstance(bins, int):
+ if bins < 1:
+ return False
+ else:
+ try:
+ bins_arr = np.asarray(bins, dtype=float)
+ except Exception:
+ return False
+ if len(bins_arr) < 2:
+ return False
+ if not np.isfinite(bins_arr).all():
+ return False
+
+ return True
+
+
@manage_matplotlib_context()
def histogram(
config: Settings,
series: np.ndarray,
bins: Union[int, np.ndarray],
date: bool = False,
-) -> str:
+) -> Union[str, None]:
"""Plot an histogram of the data.
Args:
@@ -153,6 +189,9 @@ def histogram(
The resulting histogram encoded as a string.
"""
+
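+    # Skip plotting when the data cannot yield a valid histogram; callers render a placeholder instead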
+ if not _is_valid_hist_data(series, bins):
+ return None
plot = _plot_histogram(config, series, bins, date=date, figsize=(7, 3))
plot.xaxis.set_tick_params(rotation=90 if date else 45)
plot.figure.tight_layout()
@@ -165,7 +204,7 @@ def mini_histogram(
series: np.ndarray,
bins: Union[int, np.ndarray],
date: bool = False,
-) -> str:
+) -> Union[str, None]:
"""Plot a small (mini) histogram of the data.
Args:
@@ -176,6 +215,9 @@ def mini_histogram(
Returns:
The resulting mini histogram encoded as a string.
"""
+ if not _is_valid_hist_data(series, bins):
+ return None
+
plot = _plot_histogram(
config, series, bins, figsize=(3, 2.25), date=date, hide_yaxis=True
)
@@ -253,8 +295,16 @@ def correlation_matrix(config: Settings, data: pd.DataFrame, vmin: int = -1) ->
cmap.set_bad(config.plot.correlation.bad)
labels = data.columns
+
+ try:
+ matrix = np.asarray(data, dtype=float)
+ except Exception:
+ # If conversion fails, create an all-NaN matrix of the appropriate shape
+ n = len(data)
+ matrix = np.full((n, n), np.nan, dtype=float)
+
matrix_image = axes_cor.imshow(
- data, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap
+ matrix, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap
)
plt.colorbar(matrix_image)
diff --git a/tests/backends/spark_backend/test_correlations_spark.py b/tests/backends/spark_backend/test_correlations_spark.py
index ef16224a1..02ab6886c 100644
--- a/tests/backends/spark_backend/test_correlations_spark.py
+++ b/tests/backends/spark_backend/test_correlations_spark.py
@@ -43,7 +43,10 @@ def correlation_data_cat(spark_session):
@pytest.fixture
def correlation_var_types():
- return {"test_num_1": {"type": "Numeric"}, "test_num_2": {"type": "Numeric"}}
+ return {
+ "test_num_1": {"type": "Numeric", "n": 7, "n_missing": 0},
+ "test_num_2": {"type": "Numeric", "n": 7, "n_missing": 0},
+ }
def test_spearman_spark(correlation_data_num, correlation_var_types):
diff --git a/tests/backends/spark_backend/test_descriptions_spark.py b/tests/backends/spark_backend/test_descriptions_spark.py
index 24de9afbb..63aabbe4e 100644
--- a/tests/backends/spark_backend/test_descriptions_spark.py
+++ b/tests/backends/spark_backend/test_descriptions_spark.py
@@ -110,8 +110,8 @@ def expected_results():
},
"x": {
"n": 9,
- "count": 9,
- "p_missing": 0.0,
+ "count": 8,
+ "p_missing": 0.1111111111111111,
"n_distinct": 7,
"n_unique": 5,
"p_distinct": 0.7777777777777778,
@@ -130,21 +130,21 @@ def expected_results():
"95%": 50.0,
"mad": 5.0,
"min": -10.0,
- "max": check_is_NaN,
- "mean": check_is_NaN,
- "std": check_is_NaN,
- "variance": check_is_NaN,
- "kurtosis": check_is_NaN,
- "skewness": check_is_NaN,
- "sum": check_is_NaN,
- "range": check_is_NaN,
+ "max": 50.0,
+ "mean": 13.375,
+ "std": 23.68807716974934,
+ "variance": 561.125,
+ "kurtosis": -0.9061564710904939,
+ "skewness": 0.8700654233008702,
+ "sum": 107.0,
+ "range": 60.0,
"iqr": 18.0,
- "cv": check_is_NaN,
+ "cv": 1.771071190261633,
},
"y": {
"n": 9,
- "count": 9,
- "p_missing": 0.0,
+ "count": 8,
+ "p_missing": 0.1111111111111111,
"n_distinct": 9,
"n_unique": 9,
"p_distinct": 1.0,
@@ -163,16 +163,16 @@ def expected_results():
"95%": 3122.0,
"mad": 15.9,
"min": -3.1415926535,
- "max": check_is_NaN,
- "mean": check_is_NaN,
- "std": check_is_NaN,
- "variance": check_is_NaN,
- "kurtosis": check_is_NaN,
- "skewness": check_is_NaN,
- "sum": check_is_NaN,
- "range": check_is_NaN,
+ "max": 3122.0,
+ "mean": 491.1743650433125,
+ "std": 1086.1335236468508,
+ "variance": 1179686.0311895243,
+ "kurtosis": 2.6543509612939804,
+ "skewness": 2.097192909339154,
+ "sum": 3929.3949203465,
+ "range": 3125.1415926535,
"iqr": 110.999999,
- "cv": check_is_NaN,
+ "cv": 2.211299287883385,
},
"cat": {
"n": 9,
@@ -180,9 +180,9 @@ def expected_results():
"p_missing": 0.1111111111111111,
"n_distinct": 7,
"n_unique": 6,
- "p_distinct": 0.875,
+ "p_distinct": 0.7777777777777778,
"is_unique": False,
- "p_unique": 0.75,
+ "p_unique": 0.6666666666666666,
},
"s1": {
"n": 9,
@@ -247,9 +247,9 @@ def expected_results():
"p_missing": 0.1111111111111111,
"n_distinct": 6,
"n_unique": 4,
- "p_distinct": 0.75,
+ "p_distinct": 0.6666666666666666,
"is_unique": False,
- "p_unique": 0.5,
+ "p_unique": 0.4444444444444444,
},
"bool_tf": {
"count": 9,
@@ -287,8 +287,8 @@ def expected_results():
},
"bool_01_with_nan": {
"n": 9,
- "count": 9,
- "p_missing": 0.0,
+ "count": 8,
+ "p_missing": 0.1111111111111111,
"n_distinct": 3,
"n_unique": 1,
"p_distinct": 0.3333333333333333,
@@ -307,16 +307,16 @@ def expected_results():
"95%": 1.0,
"mad": 0.0,
"min": 0.0,
- "max": check_is_NaN,
- "mean": check_is_NaN,
- "std": check_is_NaN,
- "variance": check_is_NaN,
- "kurtosis": check_is_NaN,
- "skewness": check_is_NaN,
- "sum": check_is_NaN,
- "range": check_is_NaN,
+ "max": 1.0,
+ "mean": 0.5,
+ "std": 0.5345224838248488,
+ "variance": 0.2857142857142857,
+ "kurtosis": -2.0,
+ "skewness": 1.1102230246251565e-16,
+ "sum": 4.0,
+ "range": 1.0,
"iqr": 1.0,
- "cv": check_is_NaN,
+ "cv": 1.0690449676496976,
},
"list": {
"n": 9,
diff --git a/tests/backends/spark_backend/test_issue1429.py b/tests/backends/spark_backend/test_issue1429.py
new file mode 100644
index 000000000..3d17b8cef
--- /dev/null
+++ b/tests/backends/spark_backend/test_issue1429.py
@@ -0,0 +1,138 @@
+"""
+Test for issue 1429:
+https://github.com/ydataai/ydata-profiling/issues/1429
+"""
+from typing import List, Optional, Tuple
+
+import numpy as np
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql import types as T
+
+from ydata_profiling import ProfileReport
+from ydata_profiling.config import SparkSettings
+from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark
+from ydata_profiling.model.spark.describe_generic_spark import describe_generic_spark
+from ydata_profiling.model.spark.describe_numeric_spark import (
+ describe_numeric_1d_spark,
+ numeric_stats_spark,
+)
+from ydata_profiling.model.spark.describe_supported_spark import (
+ describe_supported_spark,
+)
+
+RowType = Tuple[
+ Optional[str],
+ Optional[float],
+ Optional[int],
+ Optional[bool],
+ Optional[float],
+ Optional[str],
+]
+
+config = SparkSettings()
+
+
+def create_test_df(spark: SparkSession) -> DataFrame:
+ schema = T.StructType(
+ [
+ T.StructField("category", T.StringType(), True),
+ T.StructField("double", T.DoubleType(), True),
+ T.StructField("int", T.IntegerType(), True),
+ T.StructField("boolean", T.BooleanType(), True),
+ T.StructField("null_double", T.DoubleType(), True),
+ T.StructField("null_string", T.StringType(), True),
+ ]
+ )
+
+ data: List[RowType] = [
+ (f"test_{num + 1}", float(num), int(num), True, None, None)
+ for num in range(205)
+ ]
+
+ # Adding dupes
+ data.extend([("test_1", float(1), int(1), False, None, None) for _ in range(205)])
+
+ # Adding nulls
+ data.extend([(None, None, None, None, None, None) for _ in range(100)])
+
+ return spark.createDataFrame(data, schema=schema)
+
+
+def test_describe_numeric_spark(spark_session):
+ test_df = create_test_df(spark_session)
+
+ numeric_stats = numeric_stats_spark(df=test_df.select("double"), summary={})
+
+ for _, value in numeric_stats.items():
+ assert value is not None
+
+
+def test_describe_numeric_1d_spark_for_null_column_edge_case(
+ spark_session, test_output_dir
+):
+ spark = spark_session
+ test_df = create_test_df(spark)
+
+ _, _, summary = describe_counts_spark(
+ config=config, series=test_df.select("null_double"), summary={}
+ )
+
+ _, _, summary = describe_generic_spark(
+ config=config, df=test_df.select("null_double"), summary=summary
+ )
+
+ _, _, summary = describe_supported_spark(
+ config=config, series=test_df.select("null_double"), summary=summary
+ )
+
+ _, _, summary = describe_numeric_1d_spark(
+ config=config, df=test_df.select("null_double"), summary=summary
+ )
+
+ assert summary["iqr"] is np.nan
+ assert summary["mad"] is np.nan
+ assert summary["cv"] is np.nan
+ assert summary["mean"] is None
+ assert summary["histogram"] == []
+
+ profile = ProfileReport(test_df, title="1429_edge_case.html", config=config)
+
+ output_file = test_output_dir / "1429_edge_case.html"
+
+ profile.to_file(output_file)
+
+ assert output_file.exists()
+
+
+def test_describe_counts_spark(spark_session):
+ test_df = create_test_df(spark_session)
+
+ _, _, summary = describe_counts_spark(
+ config=config, series=test_df.select("category"), summary={}
+ )
+
+ assert summary["value_counts_without_nan"].loc["test_1"] == 206
+
+ _, _, summary = describe_counts_spark(
+ config=config, series=test_df.select("double"), summary={}
+ )
+
+ assert summary["value_counts_without_nan"].loc[float(1)] == 206
+
+ _, _, summary = describe_counts_spark(
+ config=config, series=test_df.select("int"), summary={}
+ )
+
+ assert summary["value_counts_without_nan"].loc[int(1)] == 206
+
+ _, _, summary = describe_counts_spark(
+ config=config, series=test_df.select("boolean"), summary={}
+ )
+
+ assert summary["value_counts_without_nan"].loc[True] == 205
+
+ _, _, summary = describe_counts_spark(
+ config=config, series=test_df.select("null_double"), summary={}
+ )
+
+ assert summary["value_counts_without_nan"].size == 0
diff --git a/tests/backends/spark_backend/test_issue1602.py b/tests/backends/spark_backend/test_issue1602.py
new file mode 100644
index 000000000..b25ae1873
--- /dev/null
+++ b/tests/backends/spark_backend/test_issue1602.py
@@ -0,0 +1,33 @@
+"""
+Test for issue 1602:
+https://github.com/ydataai/ydata-profiling/issues/1602
+"""
+
+from pyspark.sql import types as T
+
+from ydata_profiling import ProfileReport
+
+
+def test_spark_handles_decimal_type(test_output_dir, spark_session):
+ from decimal import Decimal
+
+ spark = spark_session
+
+ schema = T.StructType(
+ [
+ T.StructField("number", T.StringType(), True),
+ T.StructField("decimal", T.DecimalType(10, 2), True),
+ ]
+ )
+
+ data = [(f"test_{num + 1}", Decimal(num + 1)) for num in range(205)]
+
+ data.extend([("test_1", Decimal("1.05")) for _ in range(205)])
+
+ test_df = spark.createDataFrame(data, schema=schema)
+
+ profile = ProfileReport(test_df, title="decimal_handling", explorative=True)
+ output_file = test_output_dir / "decimal_handling.html"
+ profile.to_file(output_file)
+
+ assert output_file.exists()
diff --git a/tests/backends/spark_backend/test_issue1722.py b/tests/backends/spark_backend/test_issue1722.py
new file mode 100644
index 000000000..6cb2d5596
--- /dev/null
+++ b/tests/backends/spark_backend/test_issue1722.py
@@ -0,0 +1,90 @@
+"""
+Test for issue 1722:
+https://github.com/ydataai/ydata-profiling/issues/1722
+"""
+
+from datetime import date, datetime
+
+from pyspark.sql import SparkSession
+from pyspark.sql import types as T
+
+from ydata_profiling import ProfileReport
+
+
+def make_non_numeric_df(spark: SparkSession):
+ # Intentionally not including any numeric types
+ schema = T.StructType(
+ [
+ T.StructField("id", T.StringType(), False),
+ T.StructField("d", T.DateType(), False),
+ T.StructField("ts", T.TimestampType(), False),
+ T.StructField("arr", T.ArrayType(T.IntegerType()), False),
+ T.StructField("mp", T.MapType(T.StringType(), T.IntegerType()), False),
+ T.StructField(
+ "struct",
+ T.StructType(
+ [
+ T.StructField("a", T.IntegerType(), False),
+ T.StructField("b", T.StringType(), False),
+ ]
+ ),
+ False,
+ ),
+ ]
+ )
+
+ data = [
+ (
+ "r1",
+ date(2020, 1, 1),
+ datetime(2020, 1, 1, 12, 0),
+ [1, 2],
+ {"x": 1, "y": 2},
+ (10, "aa"),
+ ),
+ (
+ "r2",
+ date(2021, 6, 15),
+ datetime(2021, 6, 15, 8, 30),
+ [3],
+ {"z": 3},
+ (20, "bb"),
+ ),
+ ("r3", date(2022, 12, 31), datetime(2022, 12, 31, 23, 59), [], {}, (30, "cc")),
+ ]
+
+ return spark.createDataFrame(data, schema=schema)
+
+
+def test_issue1722(test_output_dir, spark_session):
+ from pyspark.sql import functions as F
+
+ spark = spark_session
+
+ non_numeric_df = make_non_numeric_df(spark)
+
+ # type casting 1
+ df_casted = non_numeric_df.select(
+ [
+ (
+ F.col(field.name).cast("string").alias(field.name)
+ if isinstance(field.dataType, (T.DateType, T.TimestampType))
+ else F.col(field.name)
+ )
+ for field in non_numeric_df.schema
+ ]
+ )
+ # type casting 2
+ complex_columns = [
+ field.name
+ for field in non_numeric_df.schema.fields
+ if isinstance(field.dataType, (T.ArrayType, T.MapType, T.StructType))
+ ]
+ for col_name in complex_columns:
+ df_casted = df_casted.withColumn(col_name, F.to_json(F.col(col_name)))
+
+ profile = ProfileReport(df_casted, title="non_numeric_1722", explorative=True)
+ output_file = test_output_dir / "non_numeric_1722.html"
+ profile.to_file(output_file)
+
+ assert output_file.exists()
diff --git a/tests/conftest.py b/tests/conftest.py
index 28ce21215..7db073a3c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -88,6 +88,10 @@ def spark_context():
.setAppName("pytest-pyspark-tests")
.setMaster("local[*]")
.set("spark.sql.ansi.enabled", "false")
+ .set("spark.driver.host", "127.0.0.1")
+ .set("spark.driver.bindAddress", "127.0.0.1")
+ .set("spark.driver.port", "4040")
+ .set("spark.blockManager.port", "4041")
)
# Check if SparkContext exists before creating a new one
@@ -112,6 +116,8 @@ def spark_session(spark_context):
pytest.skip("Skipping Spark tests because PySpark is not installed.")
spark = (
SparkSession.builder.master("local[*]")
+ .config("spark.driver.host", "127.0.0.1")
+ .config("spark.driver.bindAddress", "127.0.0.1")
.appName("pytest")
.config("spark.sql.ansi.enabled", "false") # <-- restore permissive casts
.getOrCreate()
diff --git a/tests/unit/test_formatters.py b/tests/unit/test_formatters.py
index 156cc4385..f49e693d8 100644
--- a/tests/unit/test_formatters.py
+++ b/tests/unit/test_formatters.py
@@ -83,6 +83,7 @@ def test_fmt_array(array, threshold, expected):
        (1e20, 10, "1 × 10<sup>20</sup>"),
        (1e-20, 10, "1 × 10<sup>-20</sup>"),
        (1e8, 3, "1 × 10<sup>8</sup>"),
+ (None, 3, "N/A"),
],
)
def test_fmt_numeric(value, precision, expected):