Merge pull request #17 from PythonPredictions/hotfix/target_encoder_missing_value_imputation

MatthiasRoelsPython · web-flow · commit 9e905a7d3eba · 2020-06-18T16:16:52.000+02:00
Hotfix/target encoder missing value imputation
diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py
@@ -11,6 +11,7 @@
 from sklearn.metrics import roc_curve
 from sklearn.metrics import confusion_matrix
 from sklearn.metrics import roc_auc_score
+from sklearn.metrics import matthews_corrcoef
 from sklearn.exceptions import NotFittedError
 
 
@@ -109,6 +110,7 @@ def compute_scalar_metrics(y_true: np.ndarray,
             "precision": precision_score(y_true, y_pred_b),
             "recall": recall_score(y_true, y_pred_b),
             "F1": f1_score(y_true, y_pred_b, average=None)[1],
+            "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b),
             "lift at  {}".format(lift_at): np.round(Evaluator
                                                     ._compute_lift(
                                                         y_true=y_true,
diff --git a/cobra/evaluation/plotting_utils.py b/cobra/evaluation/plotting_utils.py
@@ -7,7 +7,8 @@
 
 
 def plot_univariate_predictor_quality(df_auc: pd.DataFrame,
-                                      dim: tuple=(12, 8)):
+                                      dim: tuple=(12, 8),
+                                      path: str=None):
     """Plot univariate quality of the predictors
 
     Parameters
@@ -18,6 +19,8 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame,
         criteria
     dim : tuple, optional
         tuple with width and lentgh of the plot
+    path : str, optional
+        path to store the figure
     """
 
     df = (df_auc[df_auc["preselection"]]
@@ -41,6 +44,9 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame,
         # Remove white lines from the second axis
         ax.grid(False)
 
+        if path is not None:
+            plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
+
         plt.show()
 
 
@@ -70,6 +76,7 @@ def plot_correlation_matrix(df_corr: pd.DataFrame,
 
 def plot_performance_curves(model_performance: pd.DataFrame,
                             dim: tuple=(12, 8),
+                            path: str=None,
                             colors: dict={"train": "#0099bf",
                                           "selection": "#ff9500",
                                           "validation": "#8064a2"}):
@@ -83,6 +90,8 @@ def plot_performance_curves(model_performance: pd.DataFrame,
         in the forward feature selection
     dim : tuple, optional
         tuple with width and lentgh of the plot
+    path : str, optional
+        path to store the figure
     """
     highest_auc = np.round(max(max(model_performance['train_performance']),
                                max(model_performance['selection_performance']),
@@ -113,6 +122,10 @@ def plot_performance_curves(model_performance: pd.DataFrame,
         fig.suptitle('Performance curves - forward feature selection',
                      fontsize=20)
         plt.ylabel('Model performance')
+
+        if path is not None:
+            plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
+
         plt.show()
 
 
diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py
@@ -80,6 +80,7 @@ def from_params(cls,
                     scale_contingency_table: bool=True,
                     forced_categories: dict={},
                     weight: float=0.0,
+                    imputation_strategy: str="mean",
                     serialization_path: Optional[str]=None):
         """Constructor to instantiate PreProcessor from all the parameters
         that can be set in all its required (attribute) classes.
@@ -130,6 +131,12 @@ def from_params(cls,
             parameter, the bigger the contribution of the overall mean.
             When set to zero, there is no smoothing
             (e.g. the pure target incidence is used).
+        imputation_strategy : str, optional
+            in case a column contains categories that were not seen during
+            fitting, their encoding is NULL and has to be imputed. Valid
+            strategies are "mean" (global mean of the train set) and "min"
+            (resp. "max"), the min (resp. max) incidence of the categories
+            of that particular variable.
         serialization_path : str, optional
             path to save the pipeline to
 
diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py
@@ -34,22 +34,37 @@ class TargetEncoder(BaseEstimator):
 
     Attributes
     ----------
-    columns : list
-        A list of columns to encode, if None, all string columns will be
-        encoded.
+    imputation_strategy : str
+        in case a column contains categories that were not seen during
+        fitting, their encoding is NULL and has to be imputed. Valid
+        strategies are "mean" (global mean of the train set) and "min"
+        (resp. "max"), the min (resp. max) incidence of the categories
+        of that particular variable.
     weight : float
         Smoothing parameters (non-negative). The higher the value of the
         parameter, the bigger the contribution of the overall mean. When set to
         zero, there is no smoothing (e.g. the pure target incidence is used).
     """
 
-    def __init__(self, weight: float=0.0):
+    valid_strategies = ("mean", "min", "max")
+
+    def __init__(self, weight: float=0.0,
+                 imputation_strategy: str="mean"):
 
         if weight < 0:
             raise ValueError("The value of weight cannot be smaller than zero")
+        elif imputation_strategy not in self.valid_strategies:
+            raise ValueError("Valid options for 'imputation_strategy' are {}."
+                             " Got imputation_strategy={!r} instead"
+                             .format(self.valid_strategies,
+                                     imputation_strategy))
 
         self.weight = weight
+        self.imputation_strategy = imputation_strategy
+
         self._mapping = {}  # placeholder for fitted output
+        # placeholder for the global incidence of the data used for fitting
+        self._global_mean = None
 
         # not implemented yet!
         # randomized: bool=False, sigma=0.05
@@ -72,6 +87,8 @@ def attributes_to_dict(self) -> dict:
             for key, value in self._mapping.items()
         }
 
+        params["_global_mean"] = self._global_mean
+
         return params
 
     def set_attributes_from_dict(self, params: dict):
@@ -88,6 +105,14 @@ def set_attributes_from_dict(self, params: dict):
         if "weight" in params and type(params["weight"]) == float:
             self.weight = params["weight"]
 
+        if ("imputation_strategy" in params and
+                params["imputation_strategy"] in self.valid_strategies):
+
+            self.imputation_strategy = params["imputation_strategy"]
+
+        if "_global_mean" in params and type(params["_global_mean"]) == float:
+            self._global_mean = params["_global_mean"]
+
         _mapping = {}
         if "_mapping" in params and type(params["_mapping"]) == dict:
             _mapping = params["_mapping"]
@@ -121,19 +146,17 @@ def fit(self, data: pd.DataFrame, column_names: list,
 
         # compute global mean (target incidence in case of binary target)
         y = data[target_column]
-        global_mean = y.sum() / y.count()
+        self._global_mean = y.sum() / y.count()
 
         for column in column_names:
             if column not in data.columns:
                 log.warning("DataFrame has no column '{}', so it will be "
                             "skipped in fitting" .format(column))
                 continue
 
-            self._mapping[column] = self._fit_column(data[column], y,
-                                                     global_mean)
+            self._mapping[column] = self._fit_column(data[column], y)
 
-    def _fit_column(self, X: pd.Series, y: pd.Series,
-                    global_mean: float) -> pd.Series:
+    def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series:
         """Summary
 
         Parameters
@@ -143,8 +166,6 @@ def _fit_column(self, X: pd.Series, y: pd.Series,
             categorical variable.
         y : pd.Series
             series containing the targets for each observation
-        global_mean : float
-            Global mean of the target
 
         Returns
         -------
@@ -158,7 +179,9 @@ def _fit_column(self, X: pd.Series, y: pd.Series,
         # Q: do we need to do this here or during the transform phase???
 
         # Note if self.weight = 0, we have the ordinary incidence replacement
-        numerator = stats["count"]*stats["mean"] + self.weight*global_mean
+        numerator = (stats["count"]*stats["mean"]
+                     + self.weight * self._global_mean)
+
         denominator = stats["count"] + self.weight
 
         return numerator/denominator
@@ -187,13 +210,12 @@ def transform(self, data: pd.DataFrame,
             method
 
         """
-        if len(self._mapping) == 0:
+        if (len(self._mapping) == 0) or (self._global_mean is None):
             msg = ("This {} instance is not fitted yet. Call 'fit' with "
                    "appropriate arguments before using this method.")
 
             raise NotFittedError(msg.format(self.__class__.__name__))
 
-        new_columns = []
         for column in column_names:
 
             if column not in data.columns:
@@ -205,15 +227,47 @@ def transform(self, data: pd.DataFrame,
                             "and will be skipped".format(column))
                 continue
 
-            new_column = TargetEncoder._clean_column_name(column)
+            data = self._transform_column(data, column)
+
+        return data
+
+    def _transform_column(self, data: pd.DataFrame,
+                          column_name: str) -> pd.DataFrame:
+        """Replace (i.e. encode) the categories of a single column with the
+        incidence that was computed for them when fit was called.
 
-            # Convert dtype to float because when the original dtype
-            # is of type "category", the resulting dtype is also of type
-            # "category"
-            data[new_column] = (data[column].map(self._mapping[column])
-                                .astype("float"))
+        Parameters
+        ----------
+        data : pd.DataFrame
+            data to encode
+        column_name : str
+            Name of the column in data to be encoded
 
-            new_columns.append(new_column)
+        Returns
+        -------
+        pd.DataFrame
+            the input data (modified in place) with the encoded column added
+        """
+        new_column = TargetEncoder._clean_column_name(column_name)
+        # fitted incidence per category (a pd.Series when set by fit)
+        mapping = self._mapping[column_name]
+
+        # Convert dtype to float because when the original dtype
+        # is of type "category", the resulting dtype is also of type
+        # "category"
+        encoded = data[column_name].map(mapping).astype("float")
+
+        # Categories that were not present in the train set have no entry in
+        # the mapping and become missing values. Impute them with statistics
+        # of the fit only, so the result is independent of the scored data.
+        if encoded.isnull().any():
+            if self.imputation_strategy == "mean":
+                encoded = encoded.fillna(self._global_mean)
+            elif self.imputation_strategy == "min":
+                encoded = encoded.fillna(mapping.min())
+            elif self.imputation_strategy == "max":
+                encoded = encoded.fillna(mapping.max())
+        data[new_column] = encoded
 
         return data
 
diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py
@@ -21,9 +21,13 @@ def test_target_encoder_attributes_to_dict(self):
 
         encoder._mapping["variable"] = mapping_data
 
+        encoder._global_mean = 0.5
+
         actual = encoder.attributes_to_dict()
 
         expected = {"weight": 0.0,
+                    "imputation_strategy": "mean",
+                    "_global_mean": 0.5,
                     "_mapping": {"variable": {
                         "negative": 0.333333,
                         "neutral": 0.50000,
@@ -58,6 +62,7 @@ def test_target_encoder_set_attributes_from_dict(self):
         encoder = TargetEncoder()
 
         data = {"weight": 0.0,
+                "_global_mean": 0.5,
                 "_mapping": {"variable": {
                     "negative": 0.333333,
                     "neutral": 0.50000,
@@ -85,8 +90,8 @@ def test_target_encoder_fit_column(self):
                            'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
 
         encoder = TargetEncoder()
-        actual = encoder._fit_column(X=df.variable, y=df.target,
-                                     global_mean=0.0)
+        encoder._global_mean = 0.5
+        actual = encoder._fit_column(X=df.variable, y=df.target)
 
         expected = pd.Series(data=[0.333333, 0.50000, 0.666667],
                              index=["negative", "neutral", "positive"])
@@ -103,11 +108,10 @@ def test_target_encoder_fit_column_global_mean(self):
                                         'neutral'],
                            'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
 
-        global_mean = df.target.sum() / df.target.count()  # is 0.5
-
         encoder = TargetEncoder(weight=1)
-        actual = encoder._fit_column(X=df.variable, y=df.target,
-                                     global_mean=global_mean)
+        encoder._global_mean = df.target.sum() / df.target.count()  # is 0.5
+
+        actual = encoder._fit_column(X=df.variable, y=df.target)
 
         expected = pd.Series(data=[0.375, 0.500, 0.625],
                              index=["negative", "neutral", "positive"])
@@ -160,6 +164,33 @@ def test_target_encoder_transform(self):
         pd.testing.assert_frame_equal(actual, expected,
                                       check_less_precise=5)
 
+    def test_target_encoder_transform_new_category(self):
+        """A category unseen during fit is imputed ("min" strategy)."""
+        df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+                                        'neutral', 'negative', 'positive',
+                                        'negative', 'neutral', 'neutral',
+                                        'neutral'],
+                           'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
+
+        new_row = pd.DataFrame({"variable": ["new"], "target": [1]})
+        df_appended = pd.concat([df, new_row], ignore_index=True)
+
+        # inputs of TargetEncoder will be of dtype category
+        df["variable"] = df["variable"].astype("category")
+        df_appended["variable"] = df_appended["variable"].astype("category")
+
+        expected = df_appended.copy()
+        expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000,
+                                    0.333333, 0.666667, 0.333333, 0.50000,
+                                    0.50000, 0.50000, 0.333333]
+
+        encoder = TargetEncoder(imputation_strategy="min")
+        encoder.fit(data=df, column_names=["variable"], target_column="target")
+        actual = encoder.transform(data=df_appended, column_names=["variable"])
+
+        pd.testing.assert_frame_equal(actual, expected,
+                                      check_less_precise=5)
+
     # Tests for _clean_column_name
     def test_target_encoder_clean_column_name(self):