
Commit 4a52804

Add handling for spark correlations with no numeric fields
This change addresses issue #1722. Assembling a vector column in Spark from a DataFrame with no numeric columns produces features with a NULL size, NULL indices, and an empty list of values, which raises an exception when the correlation matrix is computed. The fix is to skip computing the correlation matrix when there are no interval (numeric) columns.
1 parent 0450215 commit 4a52804
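
For context, a minimal sketch of the failure mode described in the message, assuming the pyspark.ml VectorAssembler / Correlation path that the wording "assembling a vector column" points at; the session setup and DataFrame below are illustrative, not taken from the profiler's code:

    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.stat import Correlation
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # A DataFrame with no numeric columns (illustrative data)
    df = spark.createDataFrame([("a",), ("b",)], ["name"])

    # Assembling zero input columns yields degenerate "features" vectors
    assembled = VectorAssembler(inputCols=[], outputCol="features").transform(df)

    # Computing a correlation matrix over such vectors is what raised the exception
    Correlation.corr(assembled, "features", "pearson").collect()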

2 files changed

Lines changed: 79 additions & 0 deletions


src/ydata_profiling/model/spark/correlations_spark.py

Lines changed: 4 additions & 0 deletions
@@ -49,6 +49,10 @@ def _compute_corr_natively(df: DataFrame, summary: dict, corr_type: str) -> Arra
     interval_columns = [
         column for column, type_name in variables.items() if type_name == "Numeric"
     ]
+
+    if not interval_columns:
+        return [], interval_columns
+
     df = df.select(*interval_columns)
 
     # convert to vector column first
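
The guard returns before any vector assembly happens. A standalone sketch of the new early exit, using a hypothetical variables mapping (in the real function it is presumably derived from the summary argument):

    # Hypothetical mapping of column name to inferred type; only "Numeric" matters here
    variables = {"id": "Categorical", "d": "DateTime", "arr": "Text"}

    interval_columns = [
        column for column, type_name in variables.items() if type_name == "Numeric"
    ]

    if not interval_columns:
        # Early exit: an empty correlation matrix plus the (empty) column list,
        # so no vector column is ever assembled
        result = ([], interval_columns)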

tests/issues/test_issue1722.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+"""
+Test for issue 1722:
+https://github.com/ydataai/ydata-profiling/issues/1722
+"""
+from datetime import date, datetime
+
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as F
+from pyspark.sql import types as T
+
+from ydata_profiling import ProfileReport
+
+
+def make_non_numeric_df(spark: SparkSession):
+    # Intentionally not including any numeric types
+    schema = T.StructType(
+        [
+            T.StructField("id", T.StringType(), False),
+            T.StructField("d", T.DateType(), False),
+            T.StructField("ts", T.TimestampType(), False),
+            T.StructField("arr", T.ArrayType(T.IntegerType()), False),
+            T.StructField("mp", T.MapType(T.StringType(), T.IntegerType()), False),
+            T.StructField(
+                "struct",
+                T.StructType(
+                    [
+                        T.StructField("a", T.IntegerType(), False),
+                        T.StructField("b", T.StringType(), False),
+                    ]
+                ),
+                False,
+            ),
+        ]
+    )
+
+    data = [
+        ("r1", date(2020, 1, 1), datetime(2020, 1, 1, 12, 0), [1, 2], {"x": 1, "y": 2}, (10, "aa")),
+        ("r2", date(2021, 6, 15), datetime(2021, 6, 15, 8, 30), [3], {"z": 3}, (20, "bb")),
+        ("r3", date(2022, 12, 31), datetime(2022, 12, 31, 23, 59), [], {}, (30, "cc")),
+    ]
+
+    return spark.createDataFrame(data, schema=schema)
+
+
+def test_issue1722(test_output_dir, spark_session):
+    spark = spark_session
+
+    non_numeric_df = make_non_numeric_df(spark)
+
+    # Cast date/timestamp columns to plain strings
+    df_casted = non_numeric_df.select(
+        [
+            (
+                F.col(field.name).cast("string").alias(field.name)
+                if isinstance(field.dataType, (T.DateType, T.TimestampType))
+                else F.col(field.name)
+            )
+            for field in non_numeric_df.schema
+        ]
+    )
+
+    # Serialize complex (array/map/struct) columns to JSON strings
+    complex_columns = [
+        field.name
+        for field in non_numeric_df.schema.fields
+        if isinstance(field.dataType, (T.ArrayType, T.MapType, T.StructType))
+    ]
+    for col_name in complex_columns:
+        df_casted = df_casted.withColumn(col_name, F.to_json(F.col(col_name)))
+
+    profile = ProfileReport(df_casted, title="non_numeric_1722", explorative=True)
+    output_file = test_output_dir / "non_numeric_1722.html"
+    profile.to_file(output_file)
+
+    assert output_file.exists()
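
With the guard in place, profiling a DataFrame like the one above no longer attempts a correlation matrix, so the report renders and the file assertion passes. To verify locally, the test can be run with pytest tests/issues/test_issue1722.py, assuming the repository's conftest provides the spark_session and test_output_dir fixtures it relies on.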
