Merge pull request #145 from PythonPredictions/137-automatically-search-for-categorical-variables

patrickleonardy · web-flow · commit 116386141bc8 · 2023-01-16T11:16:01.000+01:00
137 automatically search for categorical variables
diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py
@@ -223,28 +223,87 @@ def from_pipeline(cls, pipeline: dict):
             target_encoder,
             is_fitted=pipeline["_is_fitted"],
         )
+    
+    def get_continous_and_discreate_columns(
+        self,
+        df: pd.DataFrame,
+        id_col_name: str,
+        target_column_name: str
+        ) -> tuple:
+        """Split the columns of a DataFrame into continuous and discrete variables.
+
+        String-like (object, string, category) columns are always discrete; any other
+        column is discrete when it has between 2 and 10 distinct values (inclusive).
+        Every remaining column, except the id and target columns, is continuous.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            DataFrame to divide into discrete and continuous variables.
+        id_col_name : str
+            Column name of the id column, can be None.
+        target_column_name : str
+            Column name of the target column.
+
+        Returns
+        -------
+        tuple
+            (continuous_vars, discrete_vars), two lists of column names; continuous_vars
+            follows the column order of df so the result is reproducible across runs.
+        """
+        if id_col_name is None:
+            log.warning("id_col_name is equal to None. If there is no id column ignore this warning")
+
+        excluded = {id_col_name, target_column_name}
+        candidates = [col for col in df.columns if col not in excluded]
+        # string-like columns come first, then low-cardinality columns of any other dtype
+        discrete_vars = [col for col in candidates
+                         if pd.api.types.is_string_dtype(df[col].dtype)
+                         or isinstance(df[col].dtype, pd.CategoricalDtype)]
+        discrete_vars += [col for col in candidates
+                          if col not in discrete_vars and 1 < df[col].nunique() <= 10]
+        # a list comprehension (not a set difference) keeps the output order deterministic
+        continuous_vars = [col for col in candidates if col not in discrete_vars]
+        log.warning(
+            f"""Cobra automaticaly assumes that following variables are 
+            discrete: {discrete_vars}
+            continuous: {continuous_vars}
+            If you want to change this behaviour you can specify the discrete/continuous variables yourself with the continuous_vars and discrete_vars keywords. \nIt assumes that numerical comumns with less than or equal to 10 different values are categorical"""
+            )
+        return continuous_vars, discrete_vars
 
     def fit(
         self,
         train_data: pd.DataFrame,
         continuous_vars: list,
         discrete_vars: list,
         target_column_name: str,
+        id_col_name: str = None
     ):
         """Fit the data to the preprocessing pipeline.
+        If continuous_vars or discrete_vars is `None` (or empty), Cobra guesses which variables are continuous and which are discrete, leaving out id_col_name and the target.
 
         Parameters
         ----------
         train_data : pd.DataFrame
             Data to be preprocessed.
-        continuous_vars : list
-            List of continuous variables.
-        discrete_vars : list
-            List of discrete variables.
+        continuous_vars : list | None
+            List of continuous variables, can be None.
+        discrete_vars : list | None
+            List of discrete variables, can be None.
         target_column_name : str
             Column name of the target.
+        id_col_name : str, optional
+            Column name of the id column, which is left out of the automatic variable detection, by default None
         """
+        if not (continuous_vars and discrete_vars):
+            continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
+                df=train_data,
+                id_col_name=id_col_name,
+                target_column_name=target_column_name
 
+            )
+        
         # get list of all variables
         preprocessed_variable_names = PreProcessor._get_variable_list(
             continuous_vars, discrete_vars
@@ -359,27 +418,38 @@ def fit_transform(
         continuous_vars: list,
         discrete_vars: list,
         target_column_name: str,
+        id_col_name: str = None
     ) -> pd.DataFrame:
+
         """Fit preprocessing pipeline and transform the data.
+        If continuous_vars or discrete_vars is `None` (or empty) and id_col_name is given, Cobra guesses which variables are continuous and which are discrete.
 
         Parameters
         ----------
         train_data : pd.DataFrame
             Data to be preprocessed
         continuous_vars : list
-            List of continuous variables.
+            List of continuous variables, can be None.
         discrete_vars : list
-            List of discrete variables.
+            List of discrete variables, can be None.
         target_column_name : str
             Column name of the target.
+        id_col_name : str, optional
+            Column name of the id column, which is left out of the automatic variable detection, by default None
 
         Returns
         -------
         pd.DataFrame
             Transformed (preprocessed) data.
         """
+        if not (continuous_vars and discrete_vars) and id_col_name:
+            continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
+                df=train_data,
+                id_col_name=id_col_name,
+                target_column_name=target_column_name
 
-        self.fit(train_data, continuous_vars, discrete_vars, target_column_name)
+            )
+        self.fit(train_data, continuous_vars, discrete_vars, target_column_name, id_col_name)
 
         return self.transform(train_data, continuous_vars, discrete_vars)
 
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 numpy>=1.19.4
 pandas>=1.1.5
 scipy>=1.5.4
-scikit-learn>=0.24.1
+scikit-learn>=1.2.0
 matplotlib>=3.4.3
 seaborn>=0.11.0
 tqdm>=4.62.2
diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py
@@ -225,7 +225,6 @@ def test_serialize(self):
                 "copy_X": True,
                 "fit_intercept": True,
                 "n_jobs": None,
-                "normalize": "deprecated",
                 "positive": False
             }
         }
@@ -244,7 +243,6 @@ def test_deserialize(self):
                 "copy_X": True,
                 "fit_intercept": True,
                 "n_jobs": None,
-                "normalize": "deprecated",
                 "positive": False
             },
             "coef_": [[0.5, 0.75]],
diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py
@@ -160,6 +160,95 @@ def test_get_variable_list(
 
             assert actual == expected
 
+    @pytest.mark.parametrize(
+    ("input, expected"),
+    [
+        # example 1
+        (
+            pd.DataFrame({
+                "ID": list(range(20)),
+                "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
+                "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
+                "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
+                "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
+                }
+            ),
+            pd.DataFrame({
+                'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
+                'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
+                'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
+                'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
+                'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
+                'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
+                'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
+                'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
+                'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
+                'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
+                'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
+                }
+            ),
+        )
+    ]
+    )
+    def test_fit_transform_without_id_col_name(self, input, expected):
+        """Check fit_transform when the variable lists are computed beforehand and no id_col_name is passed."""
+        preprocessor = PreProcessor.from_params(model_type="classification")
+        # "ID" and "Target" are handed to the helper so it leaves them out of both lists
+        continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target")
+        # id_col_name is not passed to fit_transform here
+        calculated = preprocessor.fit_transform(
+            input,
+            continuous_vars=continuous_vars,
+            discrete_vars=discrete_vars,
+            target_column_name="Target"
+            )
+        pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)
+
+    @pytest.mark.parametrize(
+    ("input, expected"),
+    [
+        # example 1
+        (
+            pd.DataFrame({
+                "ID": list(range(20)),
+                "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
+                "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
+                "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
+                "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
+                }
+            ),
+            pd.DataFrame({
+                'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
+                'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
+                'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
+                'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
+                'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
+                'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
+                'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
+                'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
+                'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
+                'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
+                'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
+                }
+            ),
+        )
+    ]
+    )
+    def test_fit_transform_with_id_col_name(self, input, expected):
+        """Check fit_transform when both variable lists are None and id_col_name is given."""
+        preprocessor = PreProcessor.from_params(model_type="classification")
+        # No variable lists are computed here: with None for both lists and an id_col_name,
+        # fit_transform is expected to detect the continuous/discrete variables itself.
+
+        calculated = preprocessor.fit_transform(
+            input,
+            continuous_vars=None,
+            discrete_vars=None,
+            target_column_name="Target",
+            id_col_name="ID"
+            )
+        pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)
+
     @staticmethod
     def mock_transform(df: pd.DataFrame, args):
         """Mock the transform method."""