Add unit tests for the ForwardFeatureSelection (FFS) train_data split assertions

sborms · sborms · commit fca06dafd699 · 2021-10-01T15:41:08.000+02:00
diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py
@@ -158,7 +158,10 @@ def fit(self, train_data: pd.DataFrame, target_column_name: str,
             In case the number of forced predictors is larger than the maximum
             number of allowed predictors in the model.
         """
-        assert all(s in ["train", "selection"] for s in train_data["split"].unique()), \
+
+        assert "split" in train_data.columns, "The train_data input df does not include a split column."
+        print(train_data["split"].unique())
+        assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \
             "The train_data input df does not include a 'train' and 'selection' split."
 
         # remove excluded predictors from predictor lists
diff --git a/tests/evaluation/test_evaluation.py b/tests/evaluation/test_evaluation.py
@@ -1,10 +1,11 @@
+
 import pytest
 import pandas as pd
 import numpy as np
+
 from cobra.evaluation import plot_incidence
 from cobra.evaluation import ClassificationEvaluator, RegressionEvaluator
 
-
 def mock_data():
     d = {'variable': ['education', 'education', 'education', 'education'],
          'label': ['1st-4th', '5th-6th', '7th-8th', '9th'],
diff --git a/tests/model_building/test_forward_selection.py b/tests/model_building/test_forward_selection.py
@@ -1,12 +1,11 @@
+
 from contextlib import contextmanager
 import pytest
-
 import pandas as pd
 
 from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel
 from cobra.model_building.forward_selection import ForwardFeatureSelection
 
-
 @contextmanager
 def does_not_raise():
     yield
@@ -96,6 +95,18 @@ def mock_evaluate(self, X, y, split):  # on AUC scale, but gives the same for RM
 
         pd.testing.assert_frame_equal(actual, expected)
 
+    @pytest.mark.parametrize("model_type", ["classification", "regression"])
+    def test_ffs_train_data_assertions(self, model_type):
+
+        fw_selection = ForwardFeatureSelection(model_type=model_type)
+
+        with pytest.raises(AssertionError):  # no split column
+            fw_selection.fit(pd.DataFrame(), "target", predictors=[""])
+
+        df = mock_data(add_split_col=True, model_type=model_type)
+        with pytest.raises(AssertionError):  # not at least train & selection sets
+            fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""])
+
     @pytest.mark.parametrize("model_type, max_predictors, expectation",
                              [("classification", 2, pytest.raises(ValueError)),
                               ("classification", 3, does_not_raise()),
@@ -137,8 +148,9 @@ def mock_forward_selection(self, train_data, target_column_name,
         mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection",
                      mock_forward_selection)
 
+        df = mock_data(add_split_col=True, model_type=model_type)
         with expectation:
-            fw_selection.fit(pd.DataFrame(), "target",
+            fw_selection.fit(df, "target",  # data is ignored
                              predictors=predictors_list,
                              forced_predictors=forced_predictors_list,
                              excluded_predictors=[])
diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py
@@ -1,9 +1,9 @@
+
 import numpy as np
 import pandas as pd
 
 from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel
 
-
 def mock_data():
     return pd.DataFrame({"var1_enc": [0.42] * 10,
                          "var2_enc": [0.94] * 10,
diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py
@@ -1,10 +1,8 @@
-import pytest
 
 import pandas as pd
 
 from cobra.model_building import univariate_selection
 
-
 def mock_data():
     return pd.DataFrame({"var1_enc": [0.42] * 10,
                          "var2_enc": [0.94] * 10,
diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py
@@ -1,11 +1,10 @@
-import pytest
 
+import pytest
 import numpy as np
 import pandas as pd
 
 from cobra.preprocessing import CategoricalDataProcessor
 
-
 class TestCategoricalDataProcessor:
 
     def test_attributes_to_dict(self):
diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py
@@ -1,13 +1,11 @@
+
 from contextlib import contextmanager
 import pytest
-
 import numpy as np
 import pandas as pd
-import math
 
 from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer
 
-
 @contextmanager
 def does_not_raise():
     yield
diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py
@@ -1,14 +1,12 @@
-from contextlib import contextmanager
-import pytest
 
+from contextlib import contextmanager
 from typing import Any
-
+import pytest
 import numpy as np
 import pandas as pd
 
 from cobra.preprocessing.preprocessor import PreProcessor
 
-
 @contextmanager
 def does_not_raise():
     yield
diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py
@@ -1,10 +1,10 @@
+
 import pytest
 import pandas as pd
 from sklearn.exceptions import NotFittedError
 
 from cobra.preprocessing.target_encoder import TargetEncoder
 
-
 class TestTargetEncoder:
 
     def test_target_encoder_constructor_weight_value_error(self):
diff --git a/tutorials/tutorial_Cobra_linear_regression.ipynb b/tutorials/tutorial_Cobra_linear_regression.ipynb