Skip to content

Commit 9e905a7

Browse files
Merge pull request #17 from PythonPredictions/hotfix/target_encoder_missing_value_imputation
Hotfix/target encoder missing value imputation
2 parents 581f7b4 + 3865b36 commit 9e905a7

5 files changed

Lines changed: 135 additions & 28 deletions

File tree

cobra/evaluation/evaluator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from sklearn.metrics import roc_curve
1212
from sklearn.metrics import confusion_matrix
1313
from sklearn.metrics import roc_auc_score
14+
from sklearn.metrics import matthews_corrcoef
1415
from sklearn.exceptions import NotFittedError
1516

1617

@@ -109,6 +110,7 @@ def compute_scalar_metrics(y_true: np.ndarray,
109110
"precision": precision_score(y_true, y_pred_b),
110111
"recall": recall_score(y_true, y_pred_b),
111112
"F1": f1_score(y_true, y_pred_b, average=None)[1],
113+
"matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b),
112114
"lift at {}".format(lift_at): np.round(Evaluator
113115
._compute_lift(
114116
y_true=y_true,

cobra/evaluation/plotting_utils.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77

88

99
def plot_univariate_predictor_quality(df_auc: pd.DataFrame,
10-
dim: tuple=(12, 8)):
10+
dim: tuple=(12, 8),
11+
path: str=None):
1112
"""Plot univariate quality of the predictors
1213
1314
Parameters
@@ -18,6 +19,8 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame,
1819
criteria
1920
dim : tuple, optional
2021
tuple with width and length of the plot
22+
path : str, optional
23+
path to store the figure
2124
"""
2225

2326
df = (df_auc[df_auc["preselection"]]
@@ -41,6 +44,9 @@ def plot_univariate_predictor_quality(df_auc: pd.DataFrame,
4144
# Remove white lines from the second axis
4245
ax.grid(False)
4346

47+
if path is not None:
48+
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
49+
4450
plt.show()
4551

4652

@@ -70,6 +76,7 @@ def plot_correlation_matrix(df_corr: pd.DataFrame,
7076

7177
def plot_performance_curves(model_performance: pd.DataFrame,
7278
dim: tuple=(12, 8),
79+
path: str=None,
7380
colors: dict={"train": "#0099bf",
7481
"selection": "#ff9500",
7582
"validation": "#8064a2"}):
@@ -83,6 +90,8 @@ def plot_performance_curves(model_performance: pd.DataFrame,
8390
in the forward feature selection
8491
dim : tuple, optional
8592
tuple with width and length of the plot
93+
path : str, optional
94+
path to store the figure
8695
"""
8796
highest_auc = np.round(max(max(model_performance['train_performance']),
8897
max(model_performance['selection_performance']),
@@ -113,6 +122,10 @@ def plot_performance_curves(model_performance: pd.DataFrame,
113122
fig.suptitle('Performance curves - forward feature selection',
114123
fontsize=20)
115124
plt.ylabel('Model performance')
125+
126+
if path is not None:
127+
plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
128+
116129
plt.show()
117130

118131

cobra/preprocessing/preprocessor.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def from_params(cls,
8080
scale_contingency_table: bool=True,
8181
forced_categories: dict={},
8282
weight: float=0.0,
83+
imputation_strategy: str="mean",
8384
serialization_path: Optional[str]=None):
8485
"""Constructor to instantiate PreProcessor from all the parameters
8586
that can be set in all its required (attribute) classes.
@@ -130,6 +131,12 @@ def from_params(cls,
130131
parameter, the bigger the contribution of the overall mean.
131132
When set to zero, there is no smoothing
132133
(i.e. the pure target incidence is used).
134+
imputation_strategy : str, optional
135+
in case there is a particular column which contains new categories,
136+
the encoding will lead to NULL values which should be imputed.
137+
Valid strategies are to replace with the global mean of the train
138+
set or the min (resp. max) incidence of the categories of that
139+
particular variable.
133140
serialization_path : str, optional
134141
path to save the pipeline to
135142

cobra/preprocessing/target_encoder.py

Lines changed: 75 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,22 +34,37 @@ class TargetEncoder(BaseEstimator):
3434
3535
Attributes
3636
----------
37-
columns : list
38-
A list of columns to encode, if None, all string columns will be
39-
encoded.
37+
imputation_strategy : str
38+
in case there is a particular column which contains new categories,
39+
the encoding will lead to NULL values which should be imputed.
40+
Valid strategies are to replace with the global mean of the train
41+
set or the min (resp. max) incidence of the categories of that
42+
particular variable.
4043
weight : float
4144
Smoothing parameters (non-negative). The higher the value of the
4245
parameter, the bigger the contribution of the overall mean. When set to
4346
zero, there is no smoothing (i.e. the pure target incidence is used).
4447
"""
4548

46-
def __init__(self, weight: float=0.0):
49+
valid_strategies = ("mean", "min", "max")
50+
51+
def __init__(self, weight: float=0.0,
52+
imputation_strategy: str="mean"):
4753

4854
if weight < 0:
4955
raise ValueError("The value of weight cannot be smaller than zero")
56+
elif imputation_strategy not in self.valid_strategies:
57+
raise ValueError("Valid options for 'imputation_strategy' are {}."
58+
" Got imputation_strategy={!r} instead"
59+
.format(self.valid_strategies,
60+
imputation_strategy))
5061

5162
self.weight = weight
63+
self.imputation_strategy = imputation_strategy
64+
5265
self._mapping = {} # placeholder for fitted output
66+
# placeholder for the global incidence of the data used for fitting
67+
self._global_mean = None
5368

5469
# not implemented yet!
5570
# randomized: bool=False, sigma=0.05
@@ -72,6 +87,8 @@ def attributes_to_dict(self) -> dict:
7287
for key, value in self._mapping.items()
7388
}
7489

90+
params["_global_mean"] = self._global_mean
91+
7592
return params
7693

7794
def set_attributes_from_dict(self, params: dict):
@@ -88,6 +105,14 @@ def set_attributes_from_dict(self, params: dict):
88105
if "weight" in params and type(params["weight"]) == float:
89106
self.weight = params["weight"]
90107

108+
if ("imputation_strategy" in params and
109+
params["imputation_strategy"] in self.valid_strategies):
110+
111+
self.imputation_strategy = params["imputation_strategy"]
112+
113+
if "_global_mean" in params and type(params["_global_mean"]) == float:
114+
self._global_mean = params["_global_mean"]
115+
91116
_mapping = {}
92117
if "_mapping" in params and type(params["_mapping"]) == dict:
93118
_mapping = params["_mapping"]
@@ -121,19 +146,17 @@ def fit(self, data: pd.DataFrame, column_names: list,
121146

122147
# compute global mean (target incidence in case of binary target)
123148
y = data[target_column]
124-
global_mean = y.sum() / y.count()
149+
self._global_mean = y.sum() / y.count()
125150

126151
for column in column_names:
127152
if column not in data.columns:
128153
log.warning("DataFrame has no column '{}', so it will be "
129154
"skipped in fitting" .format(column))
130155
continue
131156

132-
self._mapping[column] = self._fit_column(data[column], y,
133-
global_mean)
157+
self._mapping[column] = self._fit_column(data[column], y)
134158

135-
def _fit_column(self, X: pd.Series, y: pd.Series,
136-
global_mean: float) -> pd.Series:
159+
def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series:
137160
"""Compute the (smoothed) mean target value for each category of X
138161
139162
Parameters
@@ -143,8 +166,6 @@ def _fit_column(self, X: pd.Series, y: pd.Series,
143166
categorical variable.
144167
y : pd.Series
145168
series containing the targets for each observation
146-
global_mean : float
147-
Global mean of the target
148169
149170
Returns
150171
-------
@@ -158,7 +179,9 @@ def _fit_column(self, X: pd.Series, y: pd.Series,
158179
# Q: do we need to do this here or during the transform phase???
159180

160181
# Note if self.weight = 0, we have the ordinary incidence replacement
161-
numerator = stats["count"]*stats["mean"] + self.weight*global_mean
182+
numerator = (stats["count"]*stats["mean"]
183+
+ self.weight * self._global_mean)
184+
162185
denominator = stats["count"] + self.weight
163186

164187
return numerator/denominator
@@ -187,13 +210,12 @@ def transform(self, data: pd.DataFrame,
187210
method
188211
189212
"""
190-
if len(self._mapping) == 0:
213+
if (len(self._mapping) == 0) or (self._global_mean is None):
191214
msg = ("This {} instance is not fitted yet. Call 'fit' with "
192215
"appropriate arguments before using this method.")
193216

194217
raise NotFittedError(msg.format(self.__class__.__name__))
195218

196-
new_columns = []
197219
for column in column_names:
198220

199221
if column not in data.columns:
@@ -205,15 +227,47 @@ def transform(self, data: pd.DataFrame,
205227
"and will be skipped".format(column))
206228
continue
207229

208-
new_column = TargetEncoder._clean_column_name(column)
230+
data = self._transform_column(data, column)
231+
232+
return data
233+
234+
def _transform_column(self, data: pd.DataFrame,
235+
column_name: str) -> pd.DataFrame:
236+
"""Replace (i.e. encode) categories of each column with its average
237+
incidence which was computed when the fit method was called
209238
210-
# Convert dtype to float because when the original dtype
211-
# is of type "category", the resulting dtype is also of type
212-
# "category"
213-
data[new_column] = (data[column].map(self._mapping[column])
214-
.astype("float"))
239+
Parameters
240+
----------
241+
data : pd.DataFrame
242+
data to encode
243+
column_name : str
244+
Name of the column in data to be encoded
215245
216-
new_columns.append(new_column)
246+
Returns
247+
-------
248+
pd.DataFrame
249+
transformed data
250+
"""
251+
new_column = TargetEncoder._clean_column_name(column_name)
252+
253+
# Convert dtype to float because when the original dtype
254+
# is of type "category", the resulting dtype is also of type
255+
# "category"
256+
data[new_column] = (data[column_name].map(self._mapping[column_name])
257+
.astype("float"))
258+
259+
# In case of categorical data, it could be that new categories will
260+
# emerge which were not present in the train set, so this will result
261+
# in missing values (which should be replaced)
262+
if data[new_column].isnull().sum() > 0:
263+
if self.imputation_strategy == "mean":
264+
data[new_column].fillna(self._global_mean, inplace=True)
265+
elif self.imputation_strategy == "min":
266+
data[new_column].fillna(data[new_column].min(),
267+
inplace=True)
268+
elif self.imputation_strategy == "max":
269+
data[new_column].fillna(data[new_column].max(),
270+
inplace=True)
217271

218272
return data
219273

tests/preprocessing/test_target_encoder.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,13 @@ def test_target_encoder_attributes_to_dict(self):
2121

2222
encoder._mapping["variable"] = mapping_data
2323

24+
encoder._global_mean = 0.5
25+
2426
actual = encoder.attributes_to_dict()
2527

2628
expected = {"weight": 0.0,
29+
"imputation_strategy": "mean",
30+
"_global_mean": 0.5,
2731
"_mapping": {"variable": {
2832
"negative": 0.333333,
2933
"neutral": 0.50000,
@@ -58,6 +62,7 @@ def test_target_encoder_set_attributes_from_dict(self):
5862
encoder = TargetEncoder()
5963

6064
data = {"weight": 0.0,
65+
"_global_mean": 0.5,
6166
"_mapping": {"variable": {
6267
"negative": 0.333333,
6368
"neutral": 0.50000,
@@ -85,8 +90,8 @@ def test_target_encoder_fit_column(self):
8590
'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
8691

8792
encoder = TargetEncoder()
88-
actual = encoder._fit_column(X=df.variable, y=df.target,
89-
global_mean=0.0)
93+
encoder._global_mean = 0.5
94+
actual = encoder._fit_column(X=df.variable, y=df.target)
9095

9196
expected = pd.Series(data=[0.333333, 0.50000, 0.666667],
9297
index=["negative", "neutral", "positive"])
@@ -103,11 +108,10 @@ def test_target_encoder_fit_column_global_mean(self):
103108
'neutral'],
104109
'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
105110

106-
global_mean = df.target.sum() / df.target.count() # is 0.5
107-
108111
encoder = TargetEncoder(weight=1)
109-
actual = encoder._fit_column(X=df.variable, y=df.target,
110-
global_mean=global_mean)
112+
encoder._global_mean = df.target.sum() / df.target.count() # is 0.5
113+
114+
actual = encoder._fit_column(X=df.variable, y=df.target)
111115

112116
expected = pd.Series(data=[0.375, 0.500, 0.625],
113117
index=["negative", "neutral", "positive"])
@@ -160,6 +164,33 @@ def test_target_encoder_transform(self):
160164
pd.testing.assert_frame_equal(actual, expected,
161165
check_less_precise=5)
162166

167+
def test_target_encoder_transform_new_category(self):
168+
169+
df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
170+
'neutral', 'negative', 'positive',
171+
'negative', 'neutral', 'neutral',
172+
'neutral'],
173+
'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
174+
175+
df_appended = df.append({"variable": "new", "target": 1},
176+
ignore_index=True)
177+
178+
# inputs of TargetEncoder will be of dtype category
179+
df["variable"] = df["variable"].astype("category")
180+
df_appended["variable"] = df_appended["variable"].astype("category")
181+
182+
expected = df_appended.copy()
183+
expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000,
184+
0.333333, 0.666667, 0.333333, 0.50000,
185+
0.50000, 0.50000, 0.333333]
186+
187+
encoder = TargetEncoder(imputation_strategy="min")
188+
encoder.fit(data=df, column_names=["variable"], target_column="target")
189+
actual = encoder.transform(data=df_appended, column_names=["variable"])
190+
191+
pd.testing.assert_frame_equal(actual, expected,
192+
check_less_precise=5)
193+
163194
# Tests for _clean_column_name
164195
def test_target_encoder_clean_column_name(self):
165196

0 commit comments

Comments
 (0)