Skip to content

Commit 1163861

Browse files
Merge pull request #145 from PythonPredictions/137-automatically-search-for-categorical-variables
137 automatically search for categorical variables
2 parents 8f770c7 + 2d610d7 commit 1163861

4 files changed

Lines changed: 167 additions & 10 deletions

File tree

cobra/preprocessing/preprocessor.py

Lines changed: 77 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -223,28 +223,87 @@ def from_pipeline(cls, pipeline: dict):
223223
target_encoder,
224224
is_fitted=pipeline["_is_fitted"],
225225
)
226+
227+
def get_continous_and_discreate_columns(
    self,
    df: pd.DataFrame,
    id_col_name: str,
    target_column_name: str
) -> tuple:
    """Split the columns of a DataFrame into continuous and discrete variables.

    The id column and the target column are never reported. Of the
    remaining columns:

    * text-like columns (``object``, pandas ``string`` or ``category``
      dtype) are always discrete;
    * any other column with more than 1 and at most 10 distinct non-null
      values is discrete;
    * everything else (including columns with a single distinct value)
      is continuous.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose columns must be divided into discrete and
        continuous variables.
    id_col_name : str
        Column name of the id column, can be None (a warning is logged
        in that case).
    target_column_name : str
        Column name of the target column.

    Returns
    -------
    tuple
        Tuple containing 2 lists of column names:
        (continuous_vars, discrete_vars). ``continuous_vars`` follows the
        column order of ``df``; ``discrete_vars`` lists the text-like
        columns first, then the low-cardinality columns, each group in
        the column order of ``df``.
    """
    if id_col_name is None:
        log.warning("id_col_name is equal to None. If there is no id column ignore this warning")

    # Columns that must never be reported as a variable.
    excluded = {id_col_name, target_column_name}
    # Non-text columns with at most this many distinct values are discrete.
    max_discrete_values = 10

    def _is_text_like(dtype) -> bool:
        # A dtype (not the data) is passed in, so this is a pure dtype
        # check: object and pandas string dtypes, plus categoricals.
        return (
            pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    candidates = [col for col in df.columns if col not in excluded]

    # Text-like columns are discrete regardless of their cardinality.
    col_dtypes = df.dtypes
    discrete_vars = [col for col in candidates if _is_text_like(col_dtypes[col])]
    text_like = set(discrete_vars)

    # Remaining columns are discrete only when they hold a small number
    # (2..max_discrete_values) of distinct values.
    for col in candidates:
        if col in text_like:
            continue
        val_counts = df[col].nunique()
        if 1 < val_counts <= max_discrete_values:
            discrete_vars.append(col)

    # Keep the DataFrame's column order so the result is reproducible
    # (a set difference would yield an arbitrary order).
    discrete_lookup = set(discrete_vars)
    continuous_vars = [col for col in candidates if col not in discrete_lookup]

    log.warning(
        "Cobra automaticaly assumes that following variables are\n"
        f"discrete: {discrete_vars}\n"
        f"continuous: {continuous_vars}\n"
        "If you want to change this behaviour you can specify the discrete/continuous variables yourself with the continuous_vars and discrete_vars keywords. \n"
        "It assumes that numerical comumns with less than or equal to 10 different values are categorical"
    )
    return continuous_vars, discrete_vars
226274

227275
def fit(
228276
self,
229277
train_data: pd.DataFrame,
230278
continuous_vars: list,
231279
discrete_vars: list,
232280
target_column_name: str,
281+
id_col_name: str = None
233282
):
234283
"""Fit the data to the preprocessing pipeline.
284+
If you put continuous_vars and discrete_vars equal to `None` and give the id_col_name, Cobra will guess which variables are continuous and which are discrete.
235285
236286
Parameters
237287
----------
238288
train_data : pd.DataFrame
239289
Data to be preprocessed.
240-
continuous_vars : list
241-
List of continuous variables.
242-
discrete_vars : list
243-
List of discrete variables.
290+
continuous_vars : list | None
291+
List of continuous variables, can be None.
292+
discrete_vars : list | None
293+
List of discrete variables, can be None.
244294
target_column_name : str
245295
Column name of the target.
296+
id_col_name : str, optional
297+
Column name of the id column; it is excluded when Cobra guesses the continuous and discrete variables. By default None.
246298
"""
299+
if not (continuous_vars and discrete_vars):
300+
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
301+
df=train_data,
302+
id_col_name=id_col_name,
303+
target_column_name=target_column_name
247304

305+
)
306+
248307
# get list of all variables
249308
preprocessed_variable_names = PreProcessor._get_variable_list(
250309
continuous_vars, discrete_vars
@@ -359,27 +418,38 @@ def fit_transform(
359418
continuous_vars: list,
360419
discrete_vars: list,
361420
target_column_name: str,
421+
id_col_name: str = None
362422
) -> pd.DataFrame:
423+
363424
"""Fit preprocessing pipeline and transform the data.
425+
If you put continuous_vars and discrete_vars equal to `None` and give the id_col_name, Cobra will guess which variables are continuous and which are discrete.
364426
365427
Parameters
366428
----------
367429
train_data : pd.DataFrame
368430
Data to be preprocessed
369431
continuous_vars : list
370-
List of continuous variables.
432+
List of continuous variables, can be None.
371433
discrete_vars : list
372-
List of discrete variables.
434+
List of discrete variables, can be None.
373435
target_column_name : str
374436
Column name of the target.
437+
id_col_name : str, optional
438+
Column name of the id column; it is excluded when Cobra guesses the continuous and discrete variables. By default None.
375439
376440
Returns
377441
-------
378442
pd.DataFrame
379443
Transformed (preprocessed) data.
380444
"""
445+
if not (continuous_vars and discrete_vars) and id_col_name:
446+
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
447+
df=train_data,
448+
id_col_name=id_col_name,
449+
target_column_name=target_column_name
381450

382-
self.fit(train_data, continuous_vars, discrete_vars, target_column_name)
451+
)
452+
self.fit(train_data, continuous_vars, discrete_vars, target_column_name, id_col_name)
383453

384454
return self.transform(train_data, continuous_vars, discrete_vars)
385455

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
numpy>=1.19.4
22
pandas>=1.1.5
33
scipy>=1.5.4
4-
scikit-learn>=0.24.1
4+
scikit-learn>=1.2.0
55
matplotlib>=3.4.3
66
seaborn>=0.11.0
77
tqdm>=4.62.2

tests/model_building/test_models.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,6 @@ def test_serialize(self):
225225
"copy_X": True,
226226
"fit_intercept": True,
227227
"n_jobs": None,
228-
"normalize": "deprecated",
229228
"positive": False
230229
}
231230
}
@@ -244,7 +243,6 @@ def test_deserialize(self):
244243
"copy_X": True,
245244
"fit_intercept": True,
246245
"n_jobs": None,
247-
"normalize": "deprecated",
248246
"positive": False
249247
},
250248
"coef_": [[0.5, 0.75]],

tests/preprocessing/test_preprocessor.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,95 @@ def test_get_variable_list(
160160

161161
assert actual == expected
162162

163+
@pytest.mark.parametrize(
    "input, expected",
    [
        # example 1
        (
            pd.DataFrame(
                {
                    "ID": list(range(20)),
                    "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                    "B": ["Cat"] * 5 + ["Dog"] * 10 + ["Fish"] * 5,
                    "C": [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
                    "Target": [1] * 2 + [0] * 5 + [1] * 3 + [0] * 5 + [1] * 5,
                }
            ),
            pd.DataFrame(
                {
                    "ID": list(range(20)),
                    "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                    "B": ["Cat"] * 5 + ["Dog"] * 10 + ["Fish"] * 5,
                    "C": [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
                    "Target": [1] * 2 + [0] * 5 + [1] * 3 + [0] * 5 + [1] * 5,
                    "C_bin": [
                        "1.0 - 3.0", "1.0 - 3.0", "1.0 - 3.0", "3.0 - 5.0",
                        "7.0 - 9.0", "9.0 - 10.0", "10.0 - 12.0", "10.0 - 12.0",
                        "12.0 - 13.0", "3.0 - 5.0", "5.0 - 7.0", "5.0 - 7.0",
                        "7.0 - 9.0", "13.0 - 15.0", "17.0 - 19.0", "17.0 - 19.0",
                        "13.0 - 15.0", "15.0 - 17.0", "12.0 - 13.0", "15.0 - 17.0",
                    ],
                    "B_processed": ["Cat"] * 5 + ["Dog"] * 10 + ["Fish"] * 5,
                    "A_processed": [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                    "B_enc": [0.4] * 5 + [0.3] * 10 + [1.0] * 5,
                    "A_enc": [
                        1.0, 1.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.5, 0.6, 0.6,
                        0.5, 0.6, 0.5, 0.6, 0.5, 0.5, 0.5, 0.5, 0.6, 0.5,
                    ],
                    "C_enc": [
                        0.6666666666666666, 0.6666666666666666, 0.6666666666666666,
                        0.5, 0.0, 0.0, 0.5, 0.5, 1.0, 0.5,
                        0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 1.0, 1.0, 1.0,
                    ],
                }
            ),
        )
    ],
)
def test_fit_transform_without_id_col_name(self, input, expected):
    """Detect the variable types up front, then fit_transform without an id column."""
    pre = PreProcessor.from_params(model_type="classification")

    # Let the preprocessor split the columns, then hand the lists over
    # explicitly so fit_transform does not need id_col_name.
    detected_continuous, detected_discrete = pre.get_continous_and_discreate_columns(
        df=input,
        id_col_name="ID",
        target_column_name="Target",
    )

    actual = pre.fit_transform(
        input,
        continuous_vars=detected_continuous,
        discrete_vars=detected_discrete,
        target_column_name="Target",
    )

    pd.testing.assert_frame_equal(
        actual,
        expected,
        check_dtype=False,
        check_categorical=False,
    )
206+
207+
@pytest.mark.parametrize(
    "input, expected",
    [
        # example 1
        (
            pd.DataFrame(
                {
                    "ID": list(range(20)),
                    "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                    "B": ["Cat"] * 5 + ["Dog"] * 10 + ["Fish"] * 5,
                    "C": [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
                    "Target": [1] * 2 + [0] * 5 + [1] * 3 + [0] * 5 + [1] * 5,
                }
            ),
            pd.DataFrame(
                {
                    "ID": list(range(20)),
                    "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                    "B": ["Cat"] * 5 + ["Dog"] * 10 + ["Fish"] * 5,
                    "C": [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
                    "Target": [1] * 2 + [0] * 5 + [1] * 3 + [0] * 5 + [1] * 5,
                    "C_bin": [
                        "1.0 - 3.0", "1.0 - 3.0", "1.0 - 3.0", "3.0 - 5.0",
                        "7.0 - 9.0", "9.0 - 10.0", "10.0 - 12.0", "10.0 - 12.0",
                        "12.0 - 13.0", "3.0 - 5.0", "5.0 - 7.0", "5.0 - 7.0",
                        "7.0 - 9.0", "13.0 - 15.0", "17.0 - 19.0", "17.0 - 19.0",
                        "13.0 - 15.0", "15.0 - 17.0", "12.0 - 13.0", "15.0 - 17.0",
                    ],
                    "B_processed": ["Cat"] * 5 + ["Dog"] * 10 + ["Fish"] * 5,
                    "A_processed": [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
                    "B_enc": [0.4] * 5 + [0.3] * 10 + [1.0] * 5,
                    "A_enc": [
                        1.0, 1.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.5, 0.6, 0.6,
                        0.5, 0.6, 0.5, 0.6, 0.5, 0.5, 0.5, 0.5, 0.6, 0.5,
                    ],
                    "C_enc": [
                        0.6666666666666666, 0.6666666666666666, 0.6666666666666666,
                        0.5, 0.0, 0.0, 0.5, 0.5, 1.0, 0.5,
                        0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 1.0, 1.0, 1.0,
                    ],
                }
            ),
        )
    ],
)
def test_fit_transform_with_id_col_name(self, input, expected):
    """fit_transform with both variable lists set to None and an id column given."""
    pre = PreProcessor.from_params(model_type="classification")

    # Both variable lists are None on purpose: fit_transform has to work
    # out the continuous/discrete split itself, using "ID" as id column.
    actual = pre.fit_transform(
        input,
        continuous_vars=None,
        discrete_vars=None,
        target_column_name="Target",
        id_col_name="ID",
    )

    pd.testing.assert_frame_equal(
        actual,
        expected,
        check_dtype=False,
        check_categorical=False,
    )
251+
163252
@staticmethod
164253
def mock_transform(df: pd.DataFrame, args):
165254
"""Mock the transform method."""

0 commit comments

Comments
 (0)