Modify CategoricalDataProcessor to avoid regrouping of dummy variables

MatthiasRoelsPython · MatthiasRoelsPython · commit eeddacea6d22 · 2020-04-21T14:06:29.000+02:00
diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py
@@ -191,6 +191,13 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
 
         unique_categories = list(X.unique())
 
+        # do not merge categories in case of dummies, i.e. 0 and 1
+        # (and possibly "Missing")
+        if (len(unique_categories) == 2
+            or (len(unique_categories) == 3
+                and "Missing" in unique_categories)):
+            return set(unique_categories)
+
         # get small categories and add them to the merged category list
         small_categories = (CategoricalDataProcessor
                             ._get_small_categories(
@@ -420,7 +427,8 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
 
     @staticmethod
     def _replace_categories(data: pd.Series, categories: set) -> pd.Series:
-        """replace categories in set with "Other"
+        """replace categories not in set with "Other" and transform the kept
+        categories to strings to avoid type errors later on in the pipeline
 
         Parameters
         ----------
@@ -434,4 +442,4 @@ def _replace_categories(data: pd.Series, categories: set) -> pd.Series:
         pd.Series
             Description
         """
-        return data.apply(lambda x: x if x in categories else "Other")
+        return data.apply(lambda x: str(x) if x in categories else "Other")