diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..7bb15bf5d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY . .
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
+    pip install --no-cache-dir . && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
+    pip install --no-cache-dir jupyter
+
+EXPOSE 8888
+
+CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]
+
+
diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py
index a919ee33b..546b369ef 100644
--- a/src/ydata_profiling/model/pandas/table_pandas.py
+++ b/src/ydata_profiling/model/pandas/table_pandas.py
@@ -48,7 +48,6 @@ def pandas_get_table_stats(
         else 0
     )
 
-    # Variable type counts
     table_stats.update(
         {"types": dict(Counter([v["type"] for v in variable_stats.values()]))}
     )
diff --git a/src/ydata_profiling/model/spark/describe_boolean_spark.py b/src/ydata_profiling/model/spark/describe_boolean_spark.py
index 148dbce6c..ab5cf20fb 100644
--- a/src/ydata_profiling/model/spark/describe_boolean_spark.py
+++ b/src/ydata_profiling/model/spark/describe_boolean_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_boolean_1d
 
 
+@describe_boolean_1d.register
 def describe_boolean_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
diff --git a/src/ydata_profiling/model/spark/describe_date_spark.py b/src/ydata_profiling/model/spark/describe_date_spark.py
index c44d36650..a5e11a0f1 100644
--- a/src/ydata_profiling/model/spark/describe_date_spark.py
+++ b/src/ydata_profiling/model/spark/describe_date_spark.py
@@ -5,6 +5,7 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_date_1d
 
 
 def date_stats_spark(df: DataFrame, summary: dict) -> dict:
@@ -18,6 +19,7 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict:
     return df.agg(*expr).first().asDict()
 
 
+@describe_date_1d.register
 def describe_date_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
diff --git a/src/ydata_profiling/model/spark/describe_generic_spark.py b/src/ydata_profiling/model/spark/describe_generic_spark.py
index 1171881cd..ee2356c0a 100644
--- a/src/ydata_profiling/model/spark/describe_generic_spark.py
+++ b/src/ydata_profiling/model/spark/describe_generic_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_generic
 
 
+@describe_generic.register
 def describe_generic_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py
index 8c299577e..395b1461b 100644
--- a/src/ydata_profiling/model/spark/describe_numeric_spark.py
+++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py
@@ -5,13 +5,15 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
-from ydata_profiling.model.summary_algorithms import histogram_compute
+from ydata_profiling.model.summary_algorithms import (
+    describe_numeric_1d,
+    histogram_compute,
+)
 
 
 def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
     column = df.columns[0]
 
-    # Removing null types from numeric summary stats to match Pandas defaults which skip na's (skipna=False)
     finite_filter = (
         F.col(column).isNotNull()
         & ~F.isnan(F.col(column))
@@ -32,6 +34,7 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
     return non_null_df.agg(*expr).first().asDict()
 
 
+@describe_numeric_1d.register
 def describe_numeric_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
@@ -90,7 +93,6 @@ def describe_numeric_1d_spark(
     quantile_threshold = 0.05
 
     if summary.get("n") == summary.get("n_missing"):
-        # This means the entire column is null/nan, so summary values need to be hard-coded:
         summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles})
         summary["mad"] = np.nan
 
@@ -135,10 +137,6 @@ def describe_numeric_1d_spark(
         # ... https://stackoverflow.com/questions/60221841/how-to-detect-monotonic-decrease-in-pyspark
         summary["monotonic"] = 0
 
-    # this function only displays the top N (see config) values for a histogram.
-    # This might be confusing if there are a lot of values of equal magnitude, but we cannot bring all the values to
-    # display in pandas display
-    # the alternative is to do this in spark natively, but it is not trivial
     infinity_values = [np.inf, -np.inf]
     infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values)
 
diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py
index 6d7804cf5..b5e27f615 100644
--- a/src/ydata_profiling/model/spark/describe_text_spark.py
+++ b/src/ydata_profiling/model/spark/describe_text_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_text_1d
 
 
+@describe_text_1d.register
 def describe_text_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
diff --git a/src/ydata_profiling/model/spark/table_spark.py b/src/ydata_profiling/model/spark/table_spark.py
index 33e862e61..17ac03323 100644
--- a/src/ydata_profiling/model/spark/table_spark.py
+++ b/src/ydata_profiling/model/spark/table_spark.py
@@ -37,7 +37,6 @@ def get_table_stats_spark(
         if series_summary["n_missing"] == n:
             table_stats["n_vars_all_missing"] += 1
-    # without this check we'll get a div by zero error
     if result["n"] * result["n_var"] > 0:
         table_stats["p_cells_missing"] = (
             table_stats["n_cells_missing"] / (result["n"] * result["n_var"])
         )
@@ -52,7 +51,6 @@ def get_table_stats_spark(
     result["n_vars_all_missing"] = table_stats["n_vars_all_missing"]
     result["n_vars_with_missing"] = table_stats["n_vars_with_missing"]
 
-    # Variable type counts
     result["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
 
     return result
diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py
index e8145d76c..c16204ac3 100644
--- a/src/ydata_profiling/model/spark/timeseries_index_spark.py
+++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py
@@ -2,9 +2,11 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.timeseries_index import get_time_index_description
 
 
-def spark_get_time_index_description_spark(
+@get_time_index_description.register
+def get_time_index_description_spark(
     config: Settings,
     df: DataFrame,
     table_stats: dict,
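
Note on the new Dockerfile: it installs the package from the local checkout and pins setuptools to ">=72.0.0,<80.0.0" both before and after "pip install .", presumably so that the package installation cannot silently move setuptools outside the supported range. To try the image (the tag name here is illustrative, not part of the change), build with `docker build -t ydata-profiling .` and run with `docker run -p 8888:8888 ydata-profiling`, then open the Jupyter URL printed in the container log.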
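
Note on the "@<fn>.register" decorators: the describe_*_1d, describe_generic, and get_time_index_description functions in ydata_profiling are multimethod-style dispatchers, so ".register" attaches each Spark function as a type-based overload alongside the existing pandas implementations; without the decorator the Spark functions exist but are never reached by dispatch. A minimal, self-contained sketch of that pattern, assuming the "multimethod" package and using hypothetical Config/SparkFrame stand-ins for Settings and pyspark.sql.DataFrame:

    from multimethod import multimethod


    class Config:  # stand-in for ydata_profiling.config.Settings
        pass


    class SparkFrame:  # stand-in for pyspark.sql.DataFrame
        pass


    @multimethod
    def describe_boolean_1d(config, series, summary: dict) -> tuple:
        # Fallback when no backend has registered a handler for type(series).
        raise NotImplementedError(f"no handler for {type(series).__name__}")


    @describe_boolean_1d.register
    def describe_boolean_1d_spark(config: Config, df: SparkFrame, summary: dict) -> tuple:
        # Selected purely from the parameter annotations, exactly how the
        # Spark overloads in this diff are wired into dispatch.
        summary["backend"] = "spark"
        return config, df, summary


    cfg, frame, out = describe_boolean_1d(Config(), SparkFrame(), {})
    assert out["backend"] == "spark"

Because dispatch keys on the annotated dataframe type, callers keep invoking the one public function and the Spark path is chosen automatically whenever a Spark DataFrame is passed.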