add model_type functionality in PreProcessor class & tests

sborms · sborms · commit 07338fb213dd · 2021-08-06T17:07:13.000+02:00
diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py
@@ -46,7 +46,7 @@ class CategoricalDataProcessor(BaseEstimator):
     keep_missing : bool
         Whether or not to keep missing as a separate category.
     model_type : str
-        Model type ("classification" or "regression").
+        Model type (``classification`` or ``regression``).
     p_value_threshold : float
         Significance threshold for regrouping.
     regroup : bool
@@ -442,7 +442,7 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
         category : str
             Category for which we carry out the test.
         model_type : str
-            Model type ("classification" or "regression").
+            Model type (``classification`` or ``regression``).
         scale_contingency_table : bool
             Whether we scale contingency table with incidence rate.
             Only used when model_type = "classification".
diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py
@@ -44,23 +44,27 @@ class PreProcessor(BaseEstimator):
     ----------
     categorical_data_processor : CategoricalDataProcessor
         Instance of CategoricalDataProcessor to do the preprocessing of
-        categorical variables
+        categorical variables. The model_type variable is specified
+        here (``classification`` or ``regression``).
     discretizer : KBinsDiscretizer
         Instance of KBinsDiscretizer to do the prepocessing of continuous
-        variables by means of discretization
+        variables by means of discretization.
     serialization_path : str
-        path to save the pipeline to
+        Path to save the pipeline to.
     stratify_split : bool
-        Whether or not to stratify the train-test split
+        Whether or not to stratify the train-test split.
     target_encoder : TargetEncoder
-        Instance of TargetEncoder to do the incidence replacement
+        Instance of TargetEncoder to do the incidence replacement.
     """
 
-    def __init__(self, categorical_data_processor: CategoricalDataProcessor,
+    def __init__(self,
+                 categorical_data_processor: CategoricalDataProcessor,
                  discretizer: KBinsDiscretizer,
                  target_encoder: TargetEncoder,
                  is_fitted: bool = False):
 
+        self.model_type = categorical_data_processor.model_type
+
         self._categorical_data_processor = categorical_data_processor
         self._discretizer = discretizer
         self._target_encoder = target_encoder
@@ -69,6 +73,7 @@ def __init__(self, categorical_data_processor: CategoricalDataProcessor,
 
     @classmethod
     def from_params(cls,
+                    model_type: str = "classification",
                     n_bins: int = 10,
                     strategy: str = "quantile",
                     closed: str = "right",
@@ -91,16 +96,18 @@ def from_params(cls,
 
         Parameters
         ----------
+        model_type : str
+            Model type (``classification`` or ``regression``).
         n_bins : int, optional
             Number of bins to produce. Raises ValueError if ``n_bins < 2``.
         strategy : str, optional
             Binning strategy. Currently only ``uniform`` and ``quantile``
-            e.g. equifrequency is supported
+            (i.e. equifrequency) are supported.
         closed : str, optional
-            Whether to close the bins (intervals) from the left or right
+            Whether to close the bins (intervals) from the left or right.
         auto_adapt_bins : bool, optional
-            reduces the number of bins (starting from n_bins) as a function of
-            the number of missings
+            Reduces the number of bins (starting from n_bins) as a function of
+            the number of missings.
         starting_precision : int, optional
             Initial precision for the bin edges to start from,
             can also be negative. Given a list of bin edges, the class will
@@ -110,33 +117,32 @@ def from_params(cls,
             will be made to round up the numbers of the bin edges
             e.g. ``5.55 -> 10``, ``146 -> 100``, ...
         label_format : str, optional
-            format string to display the bin labels
+            Format string to display the bin labels
             e.g. ``min - max``, ``(min, max]``, ...
         change_endpoint_format : bool, optional
             Whether or not to change the format of the lower and upper bins
             into ``< x`` and ``> y`` resp.
         regroup : bool
-            Whether or not to regroup categories
+            Whether or not to regroup categories.
         regroup_name : str
-            New name of the non-significant regrouped variables
+            New name of the non-significant regrouped variables.
         keep_missing : bool
-            Whether or not to keep missing as a separate category
+            Whether or not to keep missing as a separate category.
         category_size_threshold : int
-            minimal size of a category to keep it as a separate category
+            Minimal size of a category to keep it as a separate category.
         p_value_threshold : float
             Significance threshold for regrouping.
         forced_categories : dict
             Map to prevent certain categories from being group into ``Other``
             for each column - dict of the form ``{col:[forced vars]}``.
         scale_contingency_table : bool
-            Whether contingency table should be scaled before chi^2.'
+            Whether contingency table should be scaled before chi^2.
         weight : float, optional
             Smoothing parameters (non-negative). The higher the value of the
             parameter, the bigger the contribution of the overall mean.
-            When set to zero, there is no smoothing
-            (e.g. the pure target incidence is used).
+            When set to zero, there is no smoothing (i.e. the pure target incidence is used).
         imputation_strategy : str, optional
-            in case there is a particular column which contains new categories,
+            In case there is a particular column which contains new categories,
             the encoding will lead to NULL values which should be imputed.
             Valid strategies are to replace with the global mean of the train
             set or the min (resp. max) incidence of the categories of that
@@ -145,25 +151,29 @@ def from_params(cls,
         Returns
         -------
         PreProcessor
-            Description
+            Class encapsulating CategoricalDataProcessor,
+            KBinsDiscretizer, and TargetEncoder instances.
         """
         categorical_data_processor = CategoricalDataProcessor(
+            model_type,
             regroup,
             regroup_name,
             keep_missing,
             category_size_threshold,
             p_value_threshold,
             scale_contingency_table,
             forced_categories)
+
         discretizer = KBinsDiscretizer(n_bins, strategy, closed,
                                        auto_adapt_bins,
                                        starting_precision,
                                        label_format,
                                        change_endpoint_format)
 
-        target_encoder = TargetEncoder(weight)
+        target_encoder = TargetEncoder(weight, imputation_strategy)
 
-        return cls(categorical_data_processor, discretizer, target_encoder)
+        # model_type is read from categorical_data_processor in __init__
+        return cls(categorical_data_processor, discretizer, target_encoder)
 
     @classmethod
     def from_pipeline(cls, pipeline: dict):
@@ -187,20 +197,22 @@ def from_pipeline(cls, pipeline: dict):
         """
 
         if not PreProcessor._is_valid_pipeline(pipeline):
-            raise ValueError("Invalid pipeline")  # To do: specify error
+            raise ValueError("Invalid pipeline")  # TODO: specify error
 
         categorical_data_processor = CategoricalDataProcessor()
         categorical_data_processor.set_attributes_from_dict(
             pipeline["categorical_data_processor"]
         )
+        model_type = categorical_data_processor.model_type
 
         discretizer = KBinsDiscretizer()
         discretizer.set_attributes_from_dict(pipeline["discretizer"])
 
         target_encoder = TargetEncoder()
         target_encoder.set_attributes_from_dict(pipeline["target_encoder"])
 
-        return cls(categorical_data_processor, discretizer, target_encoder,
+        # model_type is read from categorical_data_processor in __init__
+        return cls(categorical_data_processor, discretizer, target_encoder,
                    is_fitted=pipeline["_is_fitted"])
 
     def fit(self, train_data: pd.DataFrame, continuous_vars: list,
diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py
@@ -97,6 +97,7 @@ def test_is_valid_pipeline(self, injection_location: str,
         # is_valid_pipeline only checks for relevant keys atm
         pipeline_dict = {
             "categorical_data_processor": {
+                "model_type": None,
                 "regroup": None,
                 "regroup_name": None,
                 "keep_missing": None,