21 changes: 21 additions & 0 deletions Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY . .
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
+    pip install --no-cache-dir . && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
+    pip install --no-cache-dir jupyter
+
+EXPOSE 8888
+
+CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]
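Reviewer note: the image is easy to smoke-test locally. The second `setuptools` pin looks defensive, presumably re-asserting the version range in case `pip install .` resolved a different one before `jupyter` goes in. A typical build-and-run sequence would be the following; the `ydata-profiling-notebook` tag is illustrative, not something this PR defines:

```bash
# Build from the repository root (where the new Dockerfile lives)
docker build -t ydata-profiling-notebook .

# Map the exposed port so the notebook is reachable at http://localhost:8888
docker run --rm -p 8888:8888 ydata-profiling-notebook
```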
1 change: 0 additions & 1 deletion src/ydata_profiling/model/pandas/table_pandas.py
@@ -48,7 +48,6 @@ def pandas_get_table_stats(
         else 0
     )
 
-    # Variable type counts
     table_stats.update(
         {"types": dict(Counter([v["type"] for v in variable_stats.values()]))}
    )
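Only the redundant comment is dropped here; the surviving `Counter` expression is the per-type tally. A quick sketch of what it produces, with variable names and types invented for illustration:

```python
from collections import Counter

# Hypothetical per-variable summaries, as produced by the describe step
variable_stats = {
    "age": {"type": "Numeric"},
    "name": {"type": "Text"},
    "income": {"type": "Numeric"},
}

# Same expression as in pandas_get_table_stats
types = dict(Counter([v["type"] for v in variable_stats.values()]))
print(types)  # {'Numeric': 2, 'Text': 1}
```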
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/describe_boolean_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_boolean_1d
 
 
+@describe_boolean_1d.register
 def describe_boolean_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
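The `.register` decorators added here and in the generic, date, text, and numeric hunks below are the actual fix: `describe_boolean_1d` and friends are multimethods (the summary algorithms use the `multimethod` package), and a backend implementation only participates in dispatch once it is registered. A minimal, self-contained sketch of the pattern, with toy types standing in for `Settings` and the Spark `DataFrame`:

```python
from multimethod import multimethod


@multimethod
def describe_1d(config, series, summary):
    """Fallback when no registered overload matches."""
    raise NotImplementedError("no backend registered for these types")


# What the PR adds: without .register, an overload is just an
# ordinary function and dispatch can never reach it.
@describe_1d.register
def describe_1d_list(config: dict, series: list, summary: dict):
    summary["n"] = len(series)
    return config, series, summary


config, series, summary = describe_1d({}, [1, 2, 3], {})
print(summary)  # {'n': 3}
```

Before this change the Spark functions were defined but unreachable through the generic entry points, which is exactly the bug being fixed.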
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/describe_date_spark.py
@@ -5,6 +5,7 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_date_1d
 
 
 def date_stats_spark(df: DataFrame, summary: dict) -> dict:
@@ -18,6 +19,7 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict:
     return df.agg(*expr).first().asDict()
 
 
+@describe_date_1d.register
 def describe_date_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
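`date_stats_spark` builds a list of aggregation expressions and collects them in a single pass. A sketch of the same shape, assuming a one-column DataFrame of dates; the `expr` contents here are an assumption (min/max date stats), since the list construction is collapsed in this diff:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("2021-01-01",), ("2021-06-15",), ("2021-12-31",)], ["date"]
).select(F.col("date").cast("date"))

column = df.columns[0]
# One aggregation pass, collected as a plain dict -- the same
# df.agg(*expr).first().asDict() shape as in date_stats_spark
expr = [F.min(F.col(column)).alias("min"), F.max(F.col(column)).alias("max")]
print(df.agg(*expr).first().asDict())
# {'min': datetime.date(2021, 1, 1), 'max': datetime.date(2021, 12, 31)}
```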
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/describe_generic_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_generic
 
 
+@describe_generic.register
 def describe_generic_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
12 changes: 5 additions & 7 deletions src/ydata_profiling/model/spark/describe_numeric_spark.py
@@ -5,13 +5,15 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
-from ydata_profiling.model.summary_algorithms import histogram_compute
+from ydata_profiling.model.summary_algorithms import (
+    describe_numeric_1d,
+    histogram_compute,
+)
 
 
 def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
     column = df.columns[0]
 
-    # Removing null types from numeric summary stats to match Pandas defaults which skip na's (skipna=False)
     finite_filter = (
         F.col(column).isNotNull()
         & ~F.isnan(F.col(column))
@@ -32,6 +34,7 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
     return non_null_df.agg(*expr).first().asDict()
 
 
+@describe_numeric_1d.register
 def describe_numeric_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
@@ -90,7 +93,6 @@ def describe_numeric_1d_spark(
     quantile_threshold = 0.05
 
     if summary.get("n") == summary.get("n_missing"):
-        # This means the entire column is null/nan, so summary values need to be hard-coded:
         summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles})
 
         summary["mad"] = np.nan
@@ -135,10 +137,6 @@ def describe_numeric_1d_spark(
     # ... https://stackoverflow.com/questions/60221841/how-to-detect-monotonic-decrease-in-pyspark
     summary["monotonic"] = 0
 
-    # this function only displays the top N (see config) values for a histogram.
-    # This might be confusing if there are a lot of values of equal magnitude, but we cannot bring all the values to
-    # display in pandas display
-    # the alternative is to do this in spark natively, but it is not trivial
     infinity_values = [np.inf, -np.inf]
 
     infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values)
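The `finite_filter` context lines above show the approach: null and NaN rows are dropped before aggregating so that Spark's statistics line up with pandas' skipna behaviour. A standalone sketch of that filtering; note the real `finite_filter` continues past the lines visible in this diff, so only the null/NaN portion is shown:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (float("nan"),), (None,), (4.0,)], ["x"])

column = df.columns[0]
# Drop null and NaN rows before aggregating, mirroring pandas' skipna
# behaviour for summary statistics
finite_filter = F.col(column).isNotNull() & ~F.isnan(F.col(column))
non_null_df = df.where(finite_filter)

print(non_null_df.agg(F.mean(column).alias("mean")).first().asDict())  # {'mean': 2.5}
```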
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/describe_text_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_text_1d
 
 
+@describe_text_1d.register
 def describe_text_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
2 changes: 0 additions & 2 deletions src/ydata_profiling/model/spark/table_spark.py
@@ -37,7 +37,6 @@ def get_table_stats_spark(
         if series_summary["n_missing"] == n:
             table_stats["n_vars_all_missing"] += 1
 
-    # without this check we'll get a div by zero error
     if result["n"] * result["n_var"] > 0:
         table_stats["p_cells_missing"] = (
             table_stats["n_cells_missing"] / (result["n"] * result["n_var"])
@@ -52,7 +51,6 @@ def get_table_stats_spark(
     result["n_vars_all_missing"] = table_stats["n_vars_all_missing"]
     result["n_vars_with_missing"] = table_stats["n_vars_with_missing"]
 
-    # Variable type counts
     result["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
 
     return result
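Worked example of the guarded ratio: `p_cells_missing = n_cells_missing / (n * n_var)`, so a 100-row, 4-column table with 10 missing cells gives 10 / (100 × 4) = 0.025. For an empty frame the product is 0, and the `> 0` check skips the division rather than raising a ZeroDivisionError.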
4 changes: 3 additions & 1 deletion src/ydata_profiling/model/spark/timeseries_index_spark.py
@@ -2,9 +2,11 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.timeseries_index import get_time_index_description
 
 
-def spark_get_time_index_description_spark(
+@get_time_index_description.register
+def get_time_index_description_spark(
     config: Settings,
     df: DataFrame,
     table_stats: dict,
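This hunk states the PR's intent most clearly: the Spark implementation existed under a free-standing name (`spark_get_time_index_description_spark`), so the `get_time_index_description` multimethod never dispatched to it. After the rename and `@get_time_index_description.register`, a call with a Spark DataFrame resolves to this function. A sketch of the call site under that assumption; the DataFrame contents are illustrative:

```python
from pyspark.sql import SparkSession

from ydata_profiling.config import Settings
from ydata_profiling.model.timeseries_index import get_time_index_description

# Assumption: importing the spark module applies the decorator and
# registers the overload (profiling normally does this itself).
import ydata_profiling.model.spark.timeseries_index_spark  # noqa: F401

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 10.0), (2, 11.5)], ["t", "value"])

# With the .register fix applied, this dispatches to
# get_time_index_description_spark instead of falling through.
description = get_time_index_description(Settings(), df, {})
```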