feat: support for decimal type in pandas and spark

sbrugman · sbrugman · commit b860ef2f5873 · 2022-09-06T15:24:51.000+02:00
diff --git a/histogrammar/dfinterface/filling_utils.py b/histogrammar/dfinterface/filling_utils.py
@@ -38,7 +38,7 @@ def check_column(col, sep=":"):
     return col
 
 
-def check_dtype(dtype):
+def normalize_dtype(dtype):
     """Convert datatype to consistent numpy datatype
 
     :param dtype: input datatype
diff --git a/histogrammar/dfinterface/histogram_filler_base.py b/histogrammar/dfinterface/histogram_filler_base.py
@@ -27,7 +27,7 @@
 from ..primitives.stack import Stack
 from ..primitives.sum import Sum
 
-from .filling_utils import check_column, check_dtype
+from .filling_utils import check_column, normalize_dtype
 
 
 class HistogramFillerBase(object):
@@ -111,7 +111,7 @@ def __init__(
         self.bin_specs = bin_specs or {}
         self.time_axis = time_axis
         var_dtype = var_dtype or {}
-        self.var_dtype = {k: check_dtype(v) for k, v in var_dtype.items()}
+        self.var_dtype = {k: normalize_dtype(v) for k, v in var_dtype.items()}
         self.read_key = read_key
         self.store_key = store_key
 
@@ -404,32 +404,31 @@ def categorize_features(self, df):
 
         for col_list in features:
             for col in col_list:
+                # data type with metadata
+                dt_col = self.get_data_type(df, col)
 
-                dt = self.var_dtype.get(col, check_dtype(self.get_data_type(df, col)))
+                # normalized data type
+                dt = self.var_dtype.get(col, normalize_dtype(dt_col))
 
                 if col not in self.var_dtype:
                     self.var_dtype[col] = dt
 
+                # metadata indicates decimal
+                if hasattr(dt_col, 'metadata') and dt_col.metadata is not None and dt_col.metadata["decimal"]:
+                    cols_by_type["decimal"].add(col)
+
                 if np.issubdtype(dt, np.integer):
-                    colset = cols_by_type["int"]
-                    if col not in colset:
-                        colset.add(col)
+                    cols_by_type["int"].add(col)
+
                 if np.issubdtype(dt, np.number):
                     colset = cols_by_type["num"]
-                    if col not in colset:
-                        colset.add(col)
                 elif np.issubdtype(dt, np.datetime64):
                     colset = cols_by_type["dt"]
-                    if col not in colset:
-                        colset.add(col)
                 elif np.issubdtype(dt, np.bool_):
                     colset = cols_by_type["bool"]
-                    if col not in colset:
-                        colset.add(col)
                 else:
                     colset = cols_by_type["str"]
-                    if col not in colset:
-                        colset.add(col)
+                colset.add(col)
 
                 self.logger.debug(
                     'Data type of column "{col}" is "{type}".'.format(
diff --git a/histogrammar/dfinterface/make_histograms.py b/histogrammar/dfinterface/make_histograms.py
@@ -42,7 +42,7 @@
 
 from .pandas_histogrammar import PandasHistogrammar
 from .spark_histogrammar import SparkHistogrammar
-from .filling_utils import check_dtype
+from .filling_utils import normalize_dtype
 from ..util import _get_sub_hist
 
 logger = logging.getLogger()
@@ -232,7 +232,7 @@ def get_time_axes(df):
     return [
         c
         for c in df.columns
-        if np.issubdtype(check_dtype(get_data_type(df, c)), np.datetime64)
+        if np.issubdtype(normalize_dtype(get_data_type(df, c)), np.datetime64)
     ]
 
 
diff --git a/histogrammar/dfinterface/pandas_histogrammar.py b/histogrammar/dfinterface/pandas_histogrammar.py
@@ -136,7 +136,11 @@ def get_data_type(self, df, col):
         elif inferred == 'boolean':
             data_type = 'bool'
         elif inferred in {'decimal', 'floating', 'mixed-integer-float'}:
-            data_type = 'float'
+            # decimal needs preprocessing (cast), signal this in metadata
+            if inferred == "decimal":
+                data_type = np.dtype('float', metadata={"decimal": True})
+            else:
+                data_type = "float"
         elif inferred in {'date', 'datetime', 'datetime64'}:
             data_type = 'datetime64'
         else:  # categorical, mixed, etc -> object uses to_string()
@@ -187,6 +191,12 @@ def process_features(self, df, cols_by_type):
                 )
             )
             idf[col] = df[col].apply(to_ns)
+
+        # treat decimal as float, as decimal is not supported by .quantile
+        # (https://github.com/pandas-dev/pandas/issues/13157)
+        for col in cols_by_type["decimal"]:
+            idf[col] = df[col].apply(float)
+
         return idf
 
     def fill_histograms(self, idf):
diff --git a/histogrammar/dfinterface/spark_histogrammar.py b/histogrammar/dfinterface/spark_histogrammar.py
@@ -169,6 +169,8 @@ def get_data_type(self, df, col):
             dt = bool
         elif dt == "bigint":
             dt = np.int64
+        elif dt.startswith("decimal("):
+            return np.dtype(float, metadata={"decimal": True})
 
         return np.dtype(dt)
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -42,7 +42,7 @@` |
| `42` | `42` | |
| `43` | `43` | `from .pandas_histogrammar import PandasHistogrammar` |
| `44` | `44` | `from .spark_histogrammar import SparkHistogrammar` |
| `45` | | `-from .filling_utils import check_dtype` |
| | `45` | `+from .filling_utils import normalize_dtype` |
| `46` | `46` | `from ..util import _get_sub_hist` |
| `47` | `47` | |
| `48` | `48` | `logger = logging.getLogger()` |
| | | `@@ -232,7 +232,7 @@ def get_time_axes(df):` |
| `232` | `232` | `return [` |
| `233` | `233` | `c` |
| `234` | `234` | `for c in df.columns` |
| `235` | | `- if np.issubdtype(check_dtype(get_data_type(df, c)), np.datetime64)` |
| | `235` | `+ if np.issubdtype(normalize_dtype(get_data_type(df, c)), np.datetime64)` |
| `236` | `236` | `]` |
| `237` | `237` | |
| `238` | `238` | |