Integrate Sander's review comments & fine-tuning

sborms · sborms · commit 3f067509a6f3 · 2021-10-01T14:57:31.000+02:00
diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py
@@ -40,7 +40,7 @@ class ClassificationEvaluator():
     cumulative_gains : tuple
         Data for plotting cumulative gains curve.
     evaluation_metrics : dict
-        Map containing various scalar evaluation metrics (precision, recall, accuracy, AUC, F1, etc.)
+        Map containing various scalar evaluation metrics (precision, recall, accuracy, AUC, F1, etc.).
     lift_at : float
         Parameter to determine at which top level percentage the lift of the
         model should be computed.
@@ -191,7 +191,7 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)):
             ax.set_xlabel("False Positive Rate", fontsize=15)
             ax.set_ylabel("True Positive Rate", fontsize=15)
             ax.legend(loc="lower right")
-            ax.set_title("ROC Curve", fontsize=20)
+            ax.set_title("ROC curve", fontsize=20)
 
             if path:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -274,7 +274,7 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)):
             ax.grid(False)
 
             # Description
-            ax.set_title("Cumulative response", fontsize=20)
+            ax.set_title("Cumulative Response curve", fontsize=20)
 
             if path is not None:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -323,7 +323,7 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)):
             ax.grid(False)
 
             # Description
-            ax.set_title("Cumulative Lift", fontsize=20)
+            ax.set_title("Cumulative Lift curve", fontsize=20)
 
             if path is not None:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -350,7 +350,7 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)):
             ax.plot(ax.get_xlim(), ax.get_ylim(), linewidth=3,
                     ls="--", color="darkorange", label="random selection")
 
-            ax.set_title("Cumulative Gains", fontsize=20)
+            ax.set_title("Cumulative Gains curve", fontsize=20)
 
             # Format axes
             ax.set_xlim([0, 100])
@@ -681,7 +681,7 @@ def plot_predictions(self, path: str=None, dim: tuple=(12, 8)):
             ax.set_xlabel("Index", fontsize=15)
             ax.set_ylabel("Value", fontsize=15)
             ax.legend(loc="best")
-            ax.set_title("Prediction Plot", fontsize=20)
+            ax.set_title("Predictions vs. Actuals", fontsize=20)
 
             if path:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -722,7 +722,7 @@ def plot_qq(self, path: str=None, dim: tuple=(12, 8)):
             ax.set_yticks(range(int(np.floor(min(y))), int(np.ceil(max(y[x < float("inf")])))+1, 1))
 
             ax.legend(loc="best")
-            ax.set_title("Q-Q Plot", fontsize=20)
+            ax.set_title("Q-Q plot", fontsize=20)
 
             if path:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py
@@ -110,7 +110,7 @@ def compute_model_performances(self, data: pd.DataFrame,
                 "last_added_predictor": list(last_added_predictor)[0]
             }
 
-            # Evaluate model on each data set split,
+            # Evaluate model on each dataset split,
             # e.g. train-selection-validation
             tmp.update({
                 f"{split}_performance": model.evaluate(
@@ -138,9 +138,11 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
         Parameters
         ----------
         train_data : pd.DataFrame
-            Data on which to fit the model. The "train" split is used to
-            train a model, the "selection" split is used to evaluate
-            the actual forward feature selection.
+            Data on which to fit the model. Should include a "train"
+            and "selection" split for correct model selection! The
+            "train" split is used to train a model, the "selection"
+            split is used to evaluate which model to include in the
+            actual forward feature selection.
         target_column_name : str
             Name of the target column.
         predictors : list
@@ -156,6 +158,9 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
             In case the number of forced predictors is larger than the maximum
             number of allowed predictors in the model.
         """
+        assert all(s in ["train", "selection"] for s in train_data["split"].unique()), \
+            "The train_data input df does not include a 'train' and 'selection' split."
+
         # remove excluded predictors from predictor lists
         filtered_predictors = [var for var in predictors
                                if (var not in excluded_predictors and
@@ -164,13 +169,13 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
         # checks on predictor lists and self.max_predictors attr
         if len(forced_predictors) > self.max_predictors:
             raise ValueError("Size of forced_predictors cannot be bigger than "
-                             "max_predictors")
+                             "max_predictors.")
         elif len(forced_predictors) == self.max_predictors:
             log.info("Size of forced_predictors equals max_predictors "
                      "only one model will be trained...")
             # train model with all forced_predictors (only)
             (self._fitted_models
-             .append(self._train_model(train_data,
+             .append(self._train_model(train_data[train_data["split"] == "train"],
                                        target_column_name,
                                        forced_predictors)))
         else:
diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py
@@ -37,7 +37,7 @@ class PreProcessor(BaseEstimator):
         Instance of CategoricalDataProcessor to do the preprocessing of
         categorical variables.
     discretizer : KBinsDiscretizer
-        Instance of KBinsDiscretizer to do the prepocessing of continuous
+        Instance of KBinsDiscretizer to do the preprocessing of continuous
         variables by means of discretization.
     target_encoder : TargetEncoder
         Instance of TargetEncoder to do the incidence replacement.
diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py
@@ -66,7 +66,7 @@ def __init__(self, weight: float=0.0,
                  imputation_strategy: str="mean"):
 
         if weight < 0:
-            raise ValueError("The value of weight cannot be smaller than zero")
+            raise ValueError("The value of weight cannot be smaller than zero.")
         elif imputation_strategy not in self.valid_imputation_strategies:
             raise ValueError("Valid options for 'imputation_strategy' are {}."
                              " Got imputation_strategy={!r} instead."
diff --git a/tutorials/tutorial_Cobra_linear_regression.ipynb b/tutorials/tutorial_Cobra_linear_regression.ipynb
diff --git a/tutorials/tutorial_Cobra_logistic_regression.ipynb b/tutorials/tutorial_Cobra_logistic_regression.ipynb