5 changes: 4 additions & 1 deletion src/ydata_profiling/model/spark/describe_counts_spark.py
@@ -5,7 +5,7 @@
 
 import pandas as pd
 from pyspark.sql import DataFrame
-from pyspark.sql import functions as F
+from pyspark.sql import functions as F, types as T
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_counts
@@ -25,6 +25,9 @@ def describe_counts_spark(
     Returns:
         Updated settings, input series, and summary dictionary.
     """
+    # Cast DecimalType columns to DoubleType before counting values
+    if isinstance(series.schema.fields[0].dataType, T.DecimalType):
+        series = series.select(F.col(series.columns[0]).cast(T.DoubleType()).alias(series.columns[0]))
 
     # Count occurrences of each value
     value_counts = series.groupBy(series.columns[0]).count()
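A minimal standalone sketch of the pattern this hunk adds, outside the profiling code path (the `amount` column name and the local Spark session are illustrative, not part of the PR): detect a DecimalType column, cast it to DoubleType under the same name, and only then compute the value counts. Left uncast, the grouped Decimal values surface as Python `decimal.Decimal` objects once collected into pandas, which appears to be what tripped the downstream numeric summaries in issue 1602.

```python
from decimal import Decimal

from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T

spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.createDataFrame(
    [(Decimal("1.05"),), (Decimal("1.05"),), (Decimal("2.50"),)],
    schema=T.StructType([T.StructField("amount", T.DecimalType(10, 2), True)]),
)

col_name = df.columns[0]
if isinstance(df.schema.fields[0].dataType, T.DecimalType):
    # Same pattern as the patch: replace the Decimal column with a double
    # column of the same name before any counting is done.
    df = df.select(F.col(col_name).cast(T.DoubleType()).alias(col_name))

value_counts = df.groupBy(col_name).count()  # rows like (1.05, 2), (2.5, 1)
```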
3 changes: 3 additions & 0 deletions src/ydata_profiling/model/spark/summary_spark.py
@@ -43,6 +43,8 @@ def spark_describe_1d(
 
     if str(series.schema[0].dataType).startswith("ArrayType"):
         dtype = "ArrayType"
+    elif str(series.schema[0].dataType).startswith("Decimal"):
+        dtype = "decimal"
     else:
         dtype = series.schema[0].dataType.simpleString()
 
@@ -56,6 +58,7 @@
         "boolean": "Boolean",
         "date": "DateTime",
         "timestamp": "DateTime",
+        "decimal": "Numeric",
     }[dtype]
 
     return summarizer.summarize(config, series, dtype=vtype)
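A small illustration (not part of the PR) of why the extra branch is needed: `DecimalType.simpleString()` embeds precision and scale, so it can never match the fixed keys of the `vtype` mapping above, whereas `str()` of the type starts with "Decimal", which is what the new `elif` keys on.

```python
from pyspark.sql import types as T

dt = T.DecimalType(10, 2)

print(dt.simpleString())  # "decimal(10,2)" -- not a key in the vtype mapping,
                          # so the lookup would raise KeyError without the patch
print(str(dt))            # "DecimalType(10,2)" -- matched by startswith("Decimal"),
                          # so dtype becomes "decimal" and maps to "Numeric"
```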
36 changes: 36 additions & 0 deletions tests/issues/test_issue1602.py
@@ -0,0 +1,36 @@
"""
Test for issue 1602:
https://github.com/ydataai/ydata-profiling/issues/1602
"""

from ydata_profiling import ProfileReport
from pyspark.sql import types as T

def test_spark_handles_decimal_type(test_output_dir, spark_session):
from decimal import Decimal
spark = spark_session

schema = T.StructType(
[
T.StructField("number", T.StringType(), True),
T.StructField("decimal", T.DecimalType(10, 2), True)
]
)

data = [
(f"test_{num + 1}", Decimal(num + 1)) for num in range(205)
]

data.extend(
[
("test_1", Decimal("1.05")) for _ in range(205)
]
)

test_df = spark.createDataFrame(data, schema=schema)

profile = ProfileReport(test_df, title="decimal_handling", explorative=True)
output_file = test_output_dir / "decimal_handling.html"
profile.to_file(output_file)

assert output_file.exists()
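The test leans on the suite's `spark_session` and `test_output_dir` pytest fixtures, which are defined elsewhere in the repository. A rough standalone equivalent, assuming only a local Spark installation and substituting a temporary directory for the output fixture, would be:

```python
import tempfile
from decimal import Decimal
from pathlib import Path

from pyspark.sql import SparkSession, types as T

from ydata_profiling import ProfileReport

spark = SparkSession.builder.master("local[1]").getOrCreate()

schema = T.StructType(
    [
        T.StructField("number", T.StringType(), True),
        T.StructField("decimal", T.DecimalType(10, 2), True),
    ]
)
data = [(f"test_{n + 1}", Decimal(n + 1)) for n in range(205)]
data += [("test_1", Decimal("1.05")) for _ in range(205)]

df = spark.createDataFrame(data, schema=schema)
profile = ProfileReport(df, title="decimal_handling", explorative=True)

output_file = Path(tempfile.mkdtemp()) / "decimal_handling.html"
profile.to_file(output_file)
assert output_file.exists()
```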