Data-Centric-AI-Community
diff --git a/‎src/ydata_profiling/model/spark/correlations_spark.py‎
Lines changed: 14 additions & 2 deletions b/‎src/ydata_profiling/model/spark/correlations_spark.py‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎src/ydata_profiling/model/spark/describe_counts_spark.py‎
Lines changed: 13 additions & 4 deletions b/‎src/ydata_profiling/model/spark/describe_counts_spark.py‎
Lines changed: 13 additions & 4 deletions
diff --git a/‎src/ydata_profiling/model/spark/describe_numeric_spark.py‎
Lines changed: 4 additions & 9 deletions b/‎src/ydata_profiling/model/spark/describe_numeric_spark.py‎
Lines changed: 4 additions & 9 deletions
diff --git a/‎src/ydata_profiling/model/spark/describe_supported_spark.py‎
Lines changed: 5 additions & 5 deletions b/‎src/ydata_profiling/model/spark/describe_supported_spark.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎src/ydata_profiling/report/structure/variables/render_categorical.py‎
Lines changed: 2 additions & 1 deletion b/‎src/ydata_profiling/report/structure/variables/render_categorical.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/ydata_profiling/report/structure/variables/render_count.py‎
Lines changed: 29 additions & 13 deletions b/‎src/ydata_profiling/report/structure/variables/render_count.py‎
Lines changed: 29 additions & 13 deletions
diff --git a/‎src/ydata_profiling/report/structure/variables/render_date.py‎
Lines changed: 49 additions & 35 deletions b/‎src/ydata_profiling/report/structure/variables/render_date.py‎
Lines changed: 49 additions & 35 deletions
diff --git a/‎src/ydata_profiling/report/structure/variables/render_file.py‎
Lines changed: 14 additions & 9 deletions b/‎src/ydata_profiling/report/structure/variables/render_file.py‎
Lines changed: 14 additions & 9 deletions
@@ -45,9 +45,18 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
     exact same workflow. The syntax is Correlation.corr(dataframe, method="pearson" OR "spearman"),
     and Correlation is from pyspark.ml.stat
     """
-    variables = {column: description["type"] for column, description in summary.items()}
+    variables = {
+        column: {
+            "type": description["type"],
+            "all_missing": description.get("n", 0) - description.get("n_missing", 0)
+            == 0,
+        }
+        for column, description in summary.items()
+    }
     interval_columns = [
-        column for column, type_name in variables.items() if type_name == "Numeric"
+        column
+        for column, mapping in variables.items()
+        if mapping["type"] == "Numeric" and not mapping["all_missing"]
     ]
 
     if not interval_columns:
@@ -67,6 +76,9 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
     assembler = VectorAssembler(**assembler_args)
     df_vector = assembler.transform(df).select(vector_col)
 
+    if df_vector.count() <= 1:
+        return [], interval_columns
+
     # get correlation matrix
     matrix = (
         Correlation.corr(df_vector, vector_col, method=corr_type).head()[0].toArray()
 
@@ -5,7 +5,8 @@
 
 import pandas as pd
 from pyspark.sql import DataFrame
-from pyspark.sql import functions as F, types as T
+from pyspark.sql import functions as F
+from pyspark.sql import types as T
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_counts
@@ -27,7 +28,9 @@ def describe_counts_spark(
     """
     # Cast DecimalType columns to DoubleType
     if isinstance(series.schema.fields[0].dataType, T.DecimalType):
-        series = series.select(F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0]))
+        series = series.select(
+            F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0])
+        )
 
     # Count occurrences of each value
     value_counts = series.groupBy(series.columns[0]).count()
@@ -42,12 +45,18 @@ def describe_counts_spark(
     if series.dtypes[0][1] in ("int", "float", "bigint", "double"):
         n_missing = (
             # Need to add the isnan() check because Pandas isnull check will count NaN as null, but Spark does not
-            value_counts.filter(F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))).select("count").first()
+            value_counts.filter(
+                F.col(series.columns[0]).isNull() | F.isnan(F.col(series.columns[0]))
+            )
+            .select("count")
+            .first()
         )
     else:
         n_missing = (
             # Only the isNull() check is applied here; the isnan() check is reserved for the numeric dtypes handled above
-            value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first()
+            value_counts.filter(F.col(series.columns[0]).isNull())
+            .select("count")
+            .first()
         )
 
     n_missing = n_missing["count"] if n_missing else 0
 
@@ -13,9 +13,9 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
 
     # Removing null, NaN and infinite values from numeric summary stats; Pandas skips NA values by default (skipna=True)
     finite_filter = (
-            F.col(column).isNotNull()
-            & ~F.isnan(F.col(column))
-            & ~F.col(column).isin([np.inf, -np.inf])
+        F.col(column).isNotNull()
+        & ~F.isnan(F.col(column))
+        & ~F.col(column).isin([np.inf, -np.inf])
     )
     non_null_df = df.filter(finite_filter)
 
@@ -91,12 +91,7 @@ def describe_numeric_1d_spark(
 
     if summary.get("n") == summary.get("n_missing"):
         # This means the entire column is null/nan, so summary values need to be hard-coded:
-        summary.update(
-            {
-                f"{percentile:.0%}": np.nan
-                for percentile in quantiles
-            }
-        )
+        summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles})
 
         summary["mad"] = np.nan
         summary["iqr"] = np.nan
 
@@ -19,15 +19,15 @@ def describe_supported_spark(
     """
 
     # number of non-NaN observations in the Series
-    count = summary["count"]
-    n_distinct = summary["value_counts"].count()
+    n = summary["n"]
+    n_distinct = summary["value_counts"].count()  # Null/nan's count in value_counts
 
     summary["n_distinct"] = n_distinct
-    summary["p_distinct"] = n_distinct / count if count > 0 else 0
+    summary["p_distinct"] = n_distinct / n if n > 0 else 0
 
     n_unique = summary["value_counts"].where("count == 1").count()
-    summary["is_unique"] = n_unique == count
+    summary["is_unique"] = n_unique == n
     summary["n_unique"] = n_unique
-    summary["p_unique"] = n_unique / count if count > 0 else 0
+    summary["p_unique"] = n_unique / n if n > 0 else 0
 
     return config, series, summary
@@ -23,6 +23,7 @@
 from ydata_profiling.report.presentation.core.renderable import Renderable
 from ydata_profiling.report.presentation.frequency_table_utils import freq_table
 from ydata_profiling.report.structure.variables.render_common import render_common
+from ydata_profiling.report.utils import image_or_empty
 from ydata_profiling.visualisation.plot import cat_frequency_plot, histogram
 
 
@@ -95,7 +96,7 @@ def render_categorical_length(
     else:
         hist_data = histogram(config, *summary["histogram_length"])
 
-    length_histo = Image(
+    length_histo = image_or_empty(
         hist_data,
         image_format=config.plot.image_format,
         alt="length histogram",
 
@@ -13,6 +13,7 @@
     VariableInfo,
 )
 from ydata_profiling.report.structure.variables.render_common import render_common
+from ydata_profiling.report.utils import image_or_empty
 from ydata_profiling.visualisation.plot import histogram, mini_histogram
 
 
@@ -94,26 +95,41 @@ def render_count(config: Settings, summary: dict) -> dict:
         style=config.html.style,
     )
 
-    mini_histo = Image(
-        mini_histogram(config, *summary["histogram"]),
-        image_format=image_format,
+    summary_histogram = summary.get("histogram", [])
+
+    mini_hist_data = None
+
+    if summary_histogram:
+        mini_hist_data = mini_histogram(config, *summary["histogram"])
+
+    mini_histo = image_or_empty(
+        mini_hist_data,
         alt="Mini histogram",
+        image_format=image_format,
+        name="Mini Histogram",
     )
 
     template_variables["top"] = Container(
         [info, table1, table2, mini_histo], sequence_type="grid"
     )
 
-    seqs = [
-        Image(
-            histogram(config, *summary["histogram"]),
-            image_format=image_format,
-            alt="Histogram",
-            caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
-            name="Histogram",
-            anchor_id="histogram",
-        )
-    ]
+    hist_data = None
+    hist_caption = None
+
+    if summary_histogram:
+        hist_data = histogram(config, *summary["histogram"])
+        hist_caption = f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})"
+
+    hist = image_or_empty(
+        hist_data,
+        alt="Histogram",
+        image_format=image_format,
+        caption=hist_caption,
+        name="Histogram",
+        anchor_id="histogram",
+    )
+
+    seqs = [hist]
 
     fq = FrequencyTable(
         template_variables["freq_table_rows"],
 
@@ -8,6 +8,7 @@
     Table,
     VariableInfo,
 )
+from ydata_profiling.report.utils import image_or_empty
 from ydata_profiling.visualisation.plot import histogram, mini_histogram
 
 
@@ -76,55 +77,68 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
         style=config.html.style,
     )
 
-    if isinstance(summary["histogram"], list):
-        mini_histo = Image(
-            mini_histogram(
+    summary_histogram = summary.get("histogram", [])
+
+    mini_hist_data = None
+
+    if summary_histogram:
+        if isinstance(summary_histogram, list):
+            mini_hist_data = mini_histogram(
                 config,
                 [x[0] for x in summary["histogram"]],
                 [x[1] for x in summary["histogram"]],
                 date=True,
-            ),
-            image_format=image_format,
-            alt="Mini histogram",
-        )
-    else:
-        mini_histo = Image(
-            mini_histogram(
+            )
+        else:
+            mini_hist_data = mini_histogram(
                 config, summary["histogram"][0], summary["histogram"][1], date=True
-            ),
-            image_format=image_format,
-            alt="Mini histogram",
-        )
+            )
+
+    mini_histo = image_or_empty(
+        mini_hist_data,
+        alt="Mini histogram",
+        image_format=image_format,
+        name="Mini Histogram",
+        anchor_id=f"{varid}minihistogram",
+    )
 
     template_variables["top"] = Container(
         [info, table1, table2, mini_histo], sequence_type="grid"
     )
 
-    if isinstance(summary["histogram"], list):
-        hist_data = histogram(
-            config,
-            [x[0] for x in summary["histogram"]],
-            [x[1] for x in summary["histogram"]],
-            date=True,
-        )
-    else:
-        hist_data = histogram(
-            config, summary["histogram"][0], summary["histogram"][1], date=True
+    hist_data = None
+    hist_caption = None
+
+    if summary_histogram:
+        if isinstance(summary_histogram, list):
+            hist_data = histogram(
+                config,
+                [x[0] for x in summary["histogram"]],
+                [x[1] for x in summary["histogram"]],
+                date=True,
+            )
+        else:
+            hist_data = histogram(
+                config, summary["histogram"][0], summary["histogram"][1], date=True
+            )
+
+        n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
+        hist_caption = (
+            f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})"
         )
 
+    hist = image_or_empty(
+        hist_data,
+        image_format=image_format,
+        alt="Histogram",
+        caption=hist_caption,
+        name="Histogram",
+        anchor_id=f"{varid}histogram",
+    )
+
     # Bottom
-    n_bins = len(summary["histogram"][1]) - 1 if summary["histogram"] else 0
     bottom = Container(
-        [
-            Image(
-                hist_data,
-                image_format=image_format,
-                alt="Histogram",
-                caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
-                name="Histogram",
-                anchor_id=f"{varid}histogram",
-            )
-        ],
+        [hist],
         sequence_type="tabs",
         anchor_id=summary["varid"],
     )
 
@@ -5,6 +5,7 @@
 from ydata_profiling.report.presentation.core.renderable import Renderable
 from ydata_profiling.report.presentation.frequency_table_utils import freq_table
 from ydata_profiling.report.structure.variables.render_path import render_path
+from ydata_profiling.report.utils import image_or_empty
 from ydata_profiling.visualisation.plot import histogram
 
 
@@ -20,17 +21,21 @@ def render_file(config: Settings, summary: dict) -> dict:
     image_format = config.plot.image_format
 
     file_tabs: List[Renderable] = []
+
+    hist_data = None
+    if summary["histogram_file_size"]:
+        hist_data = histogram(config, *summary["histogram_file_size"])
+
     if "file_size" in summary:
-        file_tabs.append(
-            Image(
-                histogram(config, *summary["histogram_file_size"]),
-                image_format=image_format,
-                alt="Size",
-                caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={len(summary['histogram_file_size'][1]) - 1})",
-                name="File size",
-                anchor_id=f"{varid}file_size_histogram",
-            )
+        hist = image_or_empty(
+            hist_data,
+            image_format=image_format,
+            alt="Size",
+            caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={len(summary['histogram_file_size'][1]) - 1})",
+            name="File size",
+            anchor_id=f"{varid}file_size_histogram",
         )
+        file_tabs.append(hist)
 
     file_dates = {
         "file_created_time": "Created",