Skip to content

Commit 2ac2a3d

Browse files
author
Patrick Leonardy
committed
Added drop of columns containing only NANs
1 parent f37867f commit 2ac2a3d

2 files changed

Lines changed: 101 additions & 1 deletion

File tree

cobra/preprocessing/preprocessor.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def __init__(self,
6161
self._is_fitted = is_fitted
6262

6363
self.model_type = categorical_data_processor.model_type
64-
64+
6565
@classmethod
6666
def from_params(cls,
6767
model_type: str="classification",
@@ -234,6 +234,10 @@ def fit(self, train_data: pd.DataFrame, continuous_vars: list,
234234
# Ensure to operate on separate copy of data
235235
train_data = train_data.copy()
236236

237+
238+
# drop NAN columns if they exist
239+
train_data = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(train_data)
240+
237241
# Fit discretizer, categorical preprocessor & target encoder
238242
# Note that in order to fit target_encoder, we first have to transform
239243
# the data using the fitted discretizer & categorical_data_processor
@@ -486,3 +490,38 @@ def _get_variable_list(continuous_vars: list, discrete_vars: list) -> list:
486490
raise ValueError("Variable var_list is None or empty list.")
487491

488492
return var_list
493+
494+
def _check_nan_columns_and_drop_columns_containing_only_nan(data: pd.DataFrame) -> pd.DataFrame:
    """Report missing values per column and drop columns that are entirely missing.

    A warning is logged that lists the percentage of missing values for
    every column containing at least one missing value. Columns in which
    every value is missing are then removed, and a second warning names
    the removed columns.

    Parameters
    ----------
    data : pd.DataFrame
        Data that should be checked for columns that contain only missing
        values.

    Returns
    -------
    pd.DataFrame
        Data without the columns containing only missing values. When no
        column has to be dropped, the input object itself is returned.
    """
    # Percentage of missing values per column. For a frame without rows the
    # mean is NaN, so such columns are neither reported nor dropped.
    perc_na = data.isna().mean() * 100

    # Only report columns that actually contain missing values.
    with_missing = perc_na[perc_na > 0]
    if not with_missing.empty:
        logging.warning("\nPercentage of missing values per variable:\n"
                        + with_missing.round(2).to_string(float_format=lambda x: str(x)+"%"))

    # Positional mask of the columns that consist solely of missing values.
    only_missing = (perc_na == 100).to_numpy()
    to_drop = perc_na.index[only_missing].tolist()

    if to_drop:
        # Select by position rather than dropping by label, so that a column
        # sharing its label with an all-NaN column is not removed with it.
        data = data.loc[:, ~only_missing]
        logging.warning(f"Following variables contain only missing values and were droped: {to_drop}")

    return data

tests/preprocessing/test_preprocessor.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,64 @@ def test_mutable_train_data_fit_transform(self, mocker: MockerFixture):
178178
)
179179
assert "new_column" not in train_data.columns
180180
assert "new_column" in result.columns
181+
182+
183+
184+
@pytest.mark.parametrize(
    ("data", "expected"),
    [
        # Two columns made up entirely of NaN are both removed.
        (
            pd.DataFrame({
                "a": [1, 8, np.nan],
                "b": [np.nan, 8, np.nan],
                "c": [np.nan, np.nan, np.nan],
                "d": [np.nan, np.nan, 5],
                "e": [1, 960, np.nan],
                "f": [np.nan, np.nan, np.nan],
            }),
            pd.DataFrame({
                "a": [1.0, 8.0, np.nan],
                "b": [np.nan, 8.0, np.nan],
                "d": [np.nan, np.nan, 5.0],
                "e": [1.0, 960.0, np.nan],
            }),
        ),
        # A single all-NaN column is removed.
        (
            pd.DataFrame({
                "a": [1, 8, np.nan],
                "b": [np.nan, 8, np.nan],
                "c": [np.nan, np.nan, np.nan],
                "d": [np.nan, np.nan, 5],
                "e": [1, 960, np.nan],
            }),
            pd.DataFrame({
                "a": [1.0, 8.0, np.nan],
                "b": [np.nan, 8.0, np.nan],
                "d": [np.nan, np.nan, 5.0],
                "e": [1.0, 960.0, np.nan],
            }),
        ),
        # No all-NaN column: the data comes back unchanged.
        (
            pd.DataFrame({
                "a": [1, 8, np.nan],
                "b": [np.nan, 8, np.nan],
                "d": [np.nan, np.nan, 5],
                "e": [1, 960, np.nan],
            }),
            pd.DataFrame({
                "a": [1.0, 8.0, np.nan],
                "b": [np.nan, 8.0, np.nan],
                "d": [np.nan, np.nan, 5.0],
                "e": [1.0, 960.0, np.nan],
            }),
        ),
    ],
    ids=["two_nan_columns", "one_nan_column", "no_nan_column"],
)
def test_drops_columns_containing_only_nan(self, data, expected):
    """Columns holding only NaN are dropped; partially filled columns stay."""
    output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(data)

    # assert_frame_equal names the mismatching column/value on failure,
    # unlike a bare ``assert output.equals(expected)``.
    pd.testing.assert_frame_equal(output, expected)
236+
237+
238+
239+
240+
241+

0 commit comments

Comments
 (0)