Added dropping of columns that contain only NaN values

Patrick Leonardy · Patrick Leonardy · commit 2ac2a3d50796 · 2022-12-05T13:07:09.000+01:00
diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py
@@ -61,7 +61,7 @@ def __init__(self,
         self._is_fitted = is_fitted
 
         self.model_type = categorical_data_processor.model_type
-
+    
     @classmethod
     def from_params(cls,
                     model_type: str="classification",
@@ -234,6 +234,10 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list,
         # Ensure to operate on separate copy of data
         train_data = train_data.copy()
 
+
+        # Drop columns that contain only NaN values
+        train_data = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(train_data)
+
         # Fit discretizer, categorical preprocessor & target encoder
         # Note that in order to fit target_encoder, we first have to transform
         # the data using the fitted discretizer & categorical_data_processor
@@ -486,3 +490,38 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list:
             raise ValueError("Variable var_list is None or empty list.")
 
         return var_list
+
+    @staticmethod
+    def _check_nan_columns_and_drop_columns_containing_only_nan(data: pd.DataFrame) -> pd.DataFrame:
+        """Drop the columns of ``data`` that contain only missing values.
+
+        A warning listing the percentage of missing values is logged for every
+        column that has at least one missing value, and a second warning names
+        the columns that were dropped.
+
+        Parameters
+        ----------
+        data : pd.DataFrame
+            Data to check for columns that contain only missing values.
+
+        Returns
+        -------
+        pd.DataFrame
+            ``data`` itself when nothing has to be dropped, otherwise a new
+            DataFrame without the columns containing only missing values.
+        """
+        # Share of missing values per column, expressed as a percentage.
+        perc_na = data.isna().mean() * 100
+        perc_missing = perc_na[perc_na > 0]
+        if not perc_missing.empty:
+            logging.warning("\nPercentage of missing values per variable:\n"
+                            + perc_missing.round(2).to_string(float_format=lambda x: str(x) + "%"))
+
+        # A column is dropped only when every one of its values is missing.
+        to_drop = perc_na[perc_na == 100].index.tolist()
+        if to_drop:
+            data = data.drop(columns=to_drop)
+            logging.warning("Following variables contain only missing values "
+                            "and were dropped: %s", to_drop)
+
+        return data
diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py
@@ -178,3 +178,64 @@ def test_mutable_train_data_fit_transform(self, mocker: MockerFixture):
             )
         assert "new_column" not in train_data.columns
         assert "new_column" in result.columns
+    
+
+
+    @pytest.mark.parametrize(
+        "data, expected",
+        [
+            # Two all-NaN columns ("c" and "f") are both dropped.
+            (pd.DataFrame({
+                "a": [1, 8, np.nan],
+                "b": [np.nan, 8, np.nan],
+                "c": [np.nan, np.nan, np.nan],
+                "d": [np.nan, np.nan, 5],
+                "e": [1, 960, np.nan],
+                "f": [np.nan, np.nan, np.nan],
+            }),
+             pd.DataFrame({
+                 "a": [1.0, 8.0, np.nan],
+                 "b": [np.nan, 8.0, np.nan],
+                 "d": [np.nan, np.nan, 5.0],
+                 "e": [1.0, 960.0, np.nan],
+             })),
+            # A single all-NaN column ("c") is dropped.
+            (pd.DataFrame({
+                "a": [1, 8, np.nan],
+                "b": [np.nan, 8, np.nan],
+                "c": [np.nan, np.nan, np.nan],
+                "d": [np.nan, np.nan, 5],
+                "e": [1, 960, np.nan],
+            }),
+             pd.DataFrame({
+                 "a": [1.0, 8.0, np.nan],
+                 "b": [np.nan, 8.0, np.nan],
+                 "d": [np.nan, np.nan, 5.0],
+                 "e": [1.0, 960.0, np.nan],
+             })),
+            # No all-NaN column: the data comes back unchanged.
+            (pd.DataFrame({
+                "a": [1, 8, np.nan],
+                "b": [np.nan, 8, np.nan],
+                "d": [np.nan, np.nan, 5],
+                "e": [1, 960, np.nan],
+            }),
+             pd.DataFrame({
+                 "a": [1.0, 8.0, np.nan],
+                 "b": [np.nan, 8.0, np.nan],
+                 "d": [np.nan, np.nan, 5.0],
+                 "e": [1.0, 960.0, np.nan],
+             })),
+        ],
+    )
+    def test_drops_columns_containing_only_nan(self, data, expected):
+        """All-NaN columns are dropped; every other column is kept as is."""
+        output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(data)
+        # assert_frame_equal reports which column/value differs on failure.
+        pd.testing.assert_frame_equal(output, expected)
+
+
+        
+
+
+