21 changes: 21 additions & 0 deletions Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY . .
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
+    pip install --no-cache-dir . && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
+    pip install --no-cache-dir jupyter
+
+EXPOSE 8888
+
+CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]
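Reviewer note: the image is easy to smoke-test locally. The second `setuptools` pin looks defensive, presumably re-asserting the version range in case `pip install .` resolved a different one before `jupyter` goes in. A typical build-and-run sequence would be the following; the `ydata-profiling-notebook` tag is illustrative, not something this PR defines:

```bash
# Build from the repository root (where the new Dockerfile lives)
docker build -t ydata-profiling-notebook .

# Map the exposed port so the notebook is reachable at http://localhost:8888
docker run --rm -p 8888:8888 ydata-profiling-notebook
```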
1 change: 0 additions & 1 deletion src/ydata_profiling/model/pandas/table_pandas.py
@@ -48,7 +48,6 @@ def pandas_get_table_stats(
         else 0
     )
 
-    # Variable type counts
     table_stats.update(
         {"types": dict(Counter([v["type"] for v in variable_stats.values()]))}
    )
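Only the redundant comment is dropped here; the surviving `Counter` expression is the per-type tally. A quick sketch of what it produces, with variable names and types invented for illustration:

```python
from collections import Counter

# Hypothetical per-variable summaries, as produced by the describe step
variable_stats = {
    "age": {"type": "Numeric"},
    "name": {"type": "Text"},
    "income": {"type": "Numeric"},
}

# Same expression as in pandas_get_table_stats
types = dict(Counter([v["type"] for v in variable_stats.values()]))
print(types)  # {'Numeric': 2, 'Text': 1}
```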
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/describe_boolean_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_boolean_1d
 
 
+@describe_boolean_1d.register
 def describe_boolean_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
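The `.register` decorators added here and in the generic, date, text, and numeric hunks below are the actual fix: `describe_boolean_1d` and friends are multimethods (the summary algorithms use the `multimethod` package), and a backend implementation only participates in dispatch once it is registered. A minimal, self-contained sketch of the pattern, with toy types standing in for `Settings` and the Spark `DataFrame`:

```python
from multimethod import multimethod


@multimethod
def describe_1d(config, series, summary):
    """Fallback when no registered overload matches."""
    raise NotImplementedError("no backend registered for these types")


# What the PR adds: without .register, an overload is just an
# ordinary function and dispatch can never reach it.
@describe_1d.register
def describe_1d_list(config: dict, series: list, summary: dict):
    summary["n"] = len(series)
    return config, series, summary


config, series, summary = describe_1d({}, [1, 2, 3], {})
print(summary)  # {'n': 3}
```

Before this change the Spark functions were defined but unreachable through the generic entry points, which is exactly the bug being fixed.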
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/describe_date_spark.py
@@ -5,6 +5,7 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_date_1d
 
 
 def date_stats_spark(df: DataFrame, summary: dict) -> dict:
@@ -18,6 +19,7 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict:
     return df.agg(*expr).first().asDict()
 
 
+@describe_date_1d.register
 def describe_date_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
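`date_stats_spark` builds a list of aggregation expressions and collects them in a single pass. A sketch of the same shape, assuming a one-column DataFrame of dates; the `expr` contents here are an assumption (min/max date stats), since the list construction is collapsed in this diff:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("2021-01-01",), ("2021-06-15",), ("2021-12-31",)], ["date"]
).select(F.col("date").cast("date"))

column = df.columns[0]
# One aggregation pass, collected as a plain dict -- the same
# df.agg(*expr).first().asDict() shape as in date_stats_spark
expr = [F.min(F.col(column)).alias("min"), F.max(F.col(column)).alias("max")]
print(df.agg(*expr).first().asDict())
# {'min': datetime.date(2021, 1, 1), 'max': datetime.date(2021, 12, 31)}
```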
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/describe_generic_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_generic
 
 
+@describe_generic.register
 def describe_generic_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
12 changes: 5 additions & 7 deletions src/ydata_profiling/model/spark/describe_numeric_spark.py
@@ -5,13 +5,15 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
-from ydata_profiling.model.summary_algorithms import histogram_compute
+from ydata_profiling.model.summary_algorithms import (
+    describe_numeric_1d,
+    histogram_compute,
+)
 
 
 def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
     column = df.columns[0]
 
-    # Removing null types from numeric summary stats to match Pandas defaults which skip na's (skipna=False)
     finite_filter = (
         F.col(column).isNotNull()
         & ~F.isnan(F.col(column))
@@ -32,6 +34,7 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
     return non_null_df.agg(*expr).first().asDict()
 
 
+@describe_numeric_1d.register
 def describe_numeric_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
@@ -90,7 +93,6 @@ def describe_numeric_1d_spark(
     quantile_threshold = 0.05
 
     if summary.get("n") == summary.get("n_missing"):
-        # This means the entire column is null/nan, so summary values need to be hard-coded:
         summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles})
 
         summary["mad"] = np.nan
@@ -135,10 +137,6 @@ def describe_numeric_1d_spark(
     # ... https://stackoverflow.com/questions/60221841/how-to-detect-monotonic-decrease-in-pyspark
     summary["monotonic"] = 0
 
-    # this function only displays the top N (see config) values for a histogram.
-    # This might be confusing if there are a lot of values of equal magnitude, but we cannot bring all the values to
-    # display in pandas display
-    # the alternative is to do this in spark natively, but it is not trivial
     infinity_values = [np.inf, -np.inf]
 
     infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values)
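The `finite_filter` context lines above show the approach: null and NaN rows are dropped before aggregating so that Spark's statistics line up with pandas' skipna behaviour. A standalone sketch of that filtering; note the real `finite_filter` continues past the lines visible in this diff, so only the null/NaN portion is shown:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (float("nan"),), (None,), (4.0,)], ["x"])

column = df.columns[0]
# Drop null and NaN rows before aggregating, mirroring pandas' skipna
# behaviour for summary statistics
finite_filter = F.col(column).isNotNull() & ~F.isnan(F.col(column))
non_null_df = df.where(finite_filter)

print(non_null_df.agg(F.mean(column).alias("mean")).first().asDict())  # {'mean': 2.5}
```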
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/describe_text_spark.py
@@ -3,8 +3,10 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_text_1d
 
 
+@describe_text_1d.register
 def describe_text_1d_spark(
     config: Settings, df: DataFrame, summary: dict
 ) -> Tuple[Settings, DataFrame, dict]:
2 changes: 0 additions & 2 deletions src/ydata_profiling/model/spark/table_spark.py
@@ -37,7 +37,6 @@ def get_table_stats_spark(
         if series_summary["n_missing"] == n:
             table_stats["n_vars_all_missing"] += 1
 
-    # without this check we'll get a div by zero error
     if result["n"] * result["n_var"] > 0:
         table_stats["p_cells_missing"] = (
             table_stats["n_cells_missing"] / (result["n"] * result["n_var"])
@@ -52,7 +51,6 @@ def get_table_stats_spark(
     result["n_vars_all_missing"] = table_stats["n_vars_all_missing"]
     result["n_vars_with_missing"] = table_stats["n_vars_with_missing"]
 
-    # Variable type counts
     result["types"] = dict(Counter([v["type"] for v in variable_stats.values()]))
 
     return result
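Worked example of the guarded ratio: `p_cells_missing = n_cells_missing / (n * n_var)`, so a 100-row, 4-column table with 10 missing cells gives 10 / (100 × 4) = 0.025. For an empty frame the product is 0, and the `> 0` check skips the division rather than raising a ZeroDivisionError.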
4 changes: 3 additions & 1 deletion src/ydata_profiling/model/spark/timeseries_index_spark.py
@@ -2,9 +2,11 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.timeseries_index import get_time_index_description
 
 
-def spark_get_time_index_description_spark(
+@get_time_index_description.register
+def get_time_index_description_spark(
     config: Settings,
     df: DataFrame,
     table_stats: dict,
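This hunk states the PR's intent most clearly: the Spark implementation existed under a free-standing name (`spark_get_time_index_description_spark`), so the `get_time_index_description` multimethod never dispatched to it. After the rename and `@get_time_index_description.register`, a call with a Spark DataFrame resolves to this function. A sketch of the call site under that assumption; the DataFrame contents are illustrative:

```python
from pyspark.sql import SparkSession

from ydata_profiling.config import Settings
from ydata_profiling.model.timeseries_index import get_time_index_description

# Assumption: importing the spark module applies the decorator and
# registers the overload (profiling normally does this itself).
import ydata_profiling.model.spark.timeseries_index_spark  # noqa: F401

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 10.0), (2, 11.5)], ["t", "value"])

# With the .register fix applied, this dispatches to
# get_time_index_description_spark instead of falling through.
description = get_time_index_description(Settings(), df, {})
```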