Skip to content

Commit 2447c86

Browse files
Merge pull request #147 from PythonPredictions/137-automatically-search-for-categorical-variables
137 automatically search for categorical variables - typo fix
2 parents 1163861 + c1e2725 commit 2447c86

2 files changed

Lines changed: 45 additions & 35 deletions

File tree

cobra/preprocessing/preprocessor.py

Lines changed: 44 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -223,20 +223,17 @@ def from_pipeline(cls, pipeline: dict):
223223
target_encoder,
224224
is_fitted=pipeline["_is_fitted"],
225225
)
226-
227-
def get_continous_and_discreate_columns(
228-
self,
229-
df : pd.DataFrame,
230-
id_col_name : str,
231-
target_column_name :str
232-
) -> tuple:
233-
"""Filters out the continious and discreate varaibles out of a dataframe and returns a tuple containing lists of column names
234-
It assumes that numerical comumns with less than or equal to 10 different values are categorical
226+
227+
def get_continuous_and_discrete_columns(
228+
self, df: pd.DataFrame, id_col_name: str, target_column_name: str
229+
) -> tuple:
230+
"""Filters out the continuous and discrete variables out of a dataframe and returns a tuple containing lists of column names
231+
It assumes that numerical columns with less than or equal to 10 different values are categorical
235232
236233
Parameters
237234
----------
238235
df : pd.DataFrame
239-
DataFrame that you want to divide in discreate and continous variables
236+
DataFrame that you want to divide into discrete and continuous variables
240237
id_col_name : str
241238
column name of the id column, can be None
242239
target_column_name : str
@@ -246,30 +243,43 @@ def get_continous_and_discreate_columns(
246243
-------
247244
tuple
248245
tuple containing 2 lists of column names. (continuous_vars, discrete_vars)
249-
"""
246+
"""
250247
if id_col_name == None:
251-
log.warning("id_col_name is equal to None. If there is no id column ignore this warning")
252-
248+
log.warning(
249+
"id_col_name is equal to None. If there is no id column ignore this warning"
250+
)
251+
253252
# find continuous_vars and discrete_vars in the dataframe
254253
col_dtypes = df.dtypes
255-
discrete_vars = [col for col in col_dtypes[col_dtypes==object].index.tolist() if col not in [id_col_name, target_column_name]]
256-
254+
discrete_vars = [
255+
col
256+
for col in col_dtypes[col_dtypes == object].index.tolist()
257+
if col not in [id_col_name, target_column_name]
258+
]
257259

258260
for col in df.columns:
259-
if col not in discrete_vars and col not in [id_col_name, target_column_name]: # omit discrete because a string, and target
261+
if col not in discrete_vars and col not in [
262+
id_col_name,
263+
target_column_name,
264+
]: # omit discrete because a string, and target
260265
val_counts = df[col].nunique()
261-
if val_counts > 1 and val_counts <= 10: # the column contains less than 10 different values
266+
if (
267+
val_counts > 1 and val_counts <= 10
268+
): # the column contains between 2 and 10 different values
262269
discrete_vars.append(col)
263270

264-
continuous_vars = list(set(df.columns)
265-
- set(discrete_vars)
266-
- set([id_col_name, target_column_name]))
271+
continuous_vars = list(
272+
set(df.columns)
273+
- set(discrete_vars)
274+
- set([id_col_name, target_column_name])
275+
)
267276
log.warning(
268277
f"""Cobra automaticaly assumes that following variables are
269278
discrete: {discrete_vars}
270279
continuous: {continuous_vars}
271-
If you want to change this behaviour you can specify the discrete/continuous variables yourself with the continuous_vars and discrete_vars keywords. \nIt assumes that numerical comumns with less than or equal to 10 different values are categorical"""
272-
)
280+
If you want to change this behaviour you can specify the discrete/continuous variables yourself with the continuous_vars and discrete_vars keywords.
281+
It assumes that numerical columns with less than or equal to 10 different values are categorical"""
282+
)
273283
return continuous_vars, discrete_vars
274284

275285
def fit(
@@ -278,10 +288,10 @@ def fit(
278288
continuous_vars: list,
279289
discrete_vars: list,
280290
target_column_name: str,
281-
id_col_name: str = None
291+
id_col_name: str = None,
282292
):
283293
"""Fit the data to the preprocessing pipeline.
284-
If you put continious_vars and target_vars equal to `None` and give the id_col_name Cobra will guess which varaibles are continious and which are not
294+
If you put continuous_vars and discrete_vars equal to `None` and give the id_col_name Cobra will guess which variables are continuous and which are not.
285295
286296
Parameters
287297
----------
@@ -297,13 +307,12 @@ def fit(
297307
_description_, by default None
298308
"""
299309
if not (continuous_vars and discrete_vars):
300-
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
310+
continuous_vars, discrete_vars = self.get_continuous_and_discrete_columns(
301311
df=train_data,
302312
id_col_name=id_col_name,
303-
target_column_name=target_column_name
304-
313+
target_column_name=target_column_name,
305314
)
306-
315+
307316
# get list of all variables
308317
preprocessed_variable_names = PreProcessor._get_variable_list(
309318
continuous_vars, discrete_vars
@@ -418,11 +427,11 @@ def fit_transform(
418427
continuous_vars: list,
419428
discrete_vars: list,
420429
target_column_name: str,
421-
id_col_name: str = None
430+
id_col_name: str = None,
422431
) -> pd.DataFrame:
423432

424433
"""Fit preprocessing pipeline and transform the data.
425-
If you put continious_vars and target_vars equal to `None` and give the id_col_name Cobra will guess which varaibles are continious and which are not
434+
If you put continuous_vars and discrete_vars equal to `None` and give the id_col_name Cobra will guess which variables are continuous and which are not.
426435
427436
Parameters
428437
----------
@@ -443,13 +452,14 @@ def fit_transform(
443452
Transformed (preprocessed) data.
444453
"""
445454
if not (continuous_vars and discrete_vars) and id_col_name:
446-
continuous_vars, discrete_vars = self.get_continous_and_discreate_columns(
455+
continuous_vars, discrete_vars = self.get_continuous_and_discrete_columns(
447456
df=train_data,
448457
id_col_name=id_col_name,
449-
target_column_name=target_column_name
450-
458+
target_column_name=target_column_name,
451459
)
452-
self.fit(train_data, continuous_vars, discrete_vars, target_column_name, id_col_name)
460+
self.fit(
461+
train_data, continuous_vars, discrete_vars, target_column_name, id_col_name
462+
)
453463

454464
return self.transform(train_data, continuous_vars, discrete_vars)
455465

tests/preprocessing/test_preprocessor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def test_fit_transform_without_id_col_name(self, input, expected):
194194

195195
preprocessor = PreProcessor.from_params(model_type="classification")
196196

197-
continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target")
197+
continuous_vars, discrete_vars = preprocessor.get_continuous_and_discrete_columns(input, "ID","Target")
198198

199199
calculated = preprocessor.fit_transform(
200200
input,

0 commit comments

Comments
 (0)