Skip to content

Commit a44a693

Browse files
authored
Merge pull request #87 from PythonPredictions/develop
Develop
2 parents 7459b71 + 953e81b commit a44a693

2 files changed

Lines changed: 241 additions & 74 deletions

File tree

cobra/preprocessing/target_encoder.py

Lines changed: 89 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
"""
22
Incidence Replacement Module. The implementation is inspired by
3-
https://contrib.scikit-learn.org/categorical-encoding/index.html
3+
https://github.com/scikit-learn-contrib/category_encoders.
44
55
Authors:
66
@@ -9,7 +9,6 @@
99
"""
1010
import logging
1111

12-
#import numpy as np
1312
import pandas as pd
1413
from tqdm.auto import tqdm
1514
from sklearn.base import BaseEstimator
@@ -20,47 +19,76 @@
2019

2120
class TargetEncoder(BaseEstimator):
2221

23-
"""Target encoding for categorical features.
22+
"""Target encoding for categorical features, inspired by
23+
http://contrib.scikit-learn.org/category_encoders/targetencoder.html.
2424
2525
Replace each value of the categorical feature with the average of the
2626
target values (in case of a binary target, this is the incidence of the
2727
group). This encoding scheme is also called Mean encoding.
2828
29+
Note that, when applying this target encoding, values of the categorical
30+
feature that have not been seen during fit will be imputed according to the
31+
configured imputation strategy: replacement with the global mean of the
32+
target, or the minimum or maximum encoded value of that categorical variable.
33+
2934
The main problem with Target encoding is overfitting; the fact that we are
3035
encoding the feature based on target classes may lead to data leakage,
31-
rendering the feature biased. This can be solved using some type of
32-
regularization. A popular way to handle this is to use cross-validation
33-
and compute the means in each out-of-fold. However, the approach
34-
implemented here makes use of additive smoothing
35-
(https://en.wikipedia.org/wiki/Additive_smoothing)
36+
rendering the feature biased.
37+
This can be solved using some type of regularization. A popular way to
38+
handle this is to use cross-validation and compute the means out-of-fold
39+
(i.e. from the other folds). However, the approach implemented here uses
40+
additive smoothing (https://en.wikipedia.org/wiki/Additive_smoothing).
41+
42+
In summary:
43+
44+
- with a binary classification target, a value of a categorical variable is
45+
replaced with:
46+
47+
[count(variable=value) * P(target=1|variable=value) + weight * P(target=1)]
48+
/ [count(variable=value) + weight]
49+
50+
- with a regression target, a value of a categorical variable is replaced
51+
with:
52+
53+
[count(variable=value) * E(target|variable=value) + weight * E(target)]
54+
/ [count(variable=value) + weight]
3655
3756
Attributes
3857
----------
3958
imputation_strategy : str
4059
in case there is a particular column which contains new categories,
4160
the encoding will lead to NULL values which should be imputed.
42-
Valid strategies are to replace with the global mean of the train
43-
set or the min (resp. max) incidence of the categories of that
44-
particular variable.
61+
Valid strategies then are to replace the NULL values with the global
62+
mean of the train set or the min (resp. max) incidence of the
63+
categories of that particular variable.
4564
weight : float
46-
Smoothing parameters (non-negative). The higher the value of the
47-
parameter, the bigger the contribution of the overall mean. When set to
48-
zero, there is no smoothing (e.g. the pure target incidence is used).
65+
Smoothing parameter (non-negative). The higher the value of the
66+
parameter, the bigger the contribution of the overall mean of targets
67+
learnt from all training data (prior) and the smaller the contribution
68+
of the mean target learnt from data with the current categorical value
69+
(posterior), so the bigger the smoothing (regularization) effect.
70+
When set to zero, there is no smoothing (i.e. the mean target of the
71+
current categorical value is used).
4972
"""
5073

51-
valid_strategies = ("mean", "min", "max")
74+
valid_imputation_strategies = ("mean", "min", "max")
5275

5376
def __init__(self, weight: float=0.0,
5477
imputation_strategy: str="mean"):
5578

5679
if weight < 0:
5780
raise ValueError("The value of weight cannot be smaller than zero")
58-
elif imputation_strategy not in self.valid_strategies:
81+
elif imputation_strategy not in self.valid_imputation_strategies:
5982
raise ValueError("Valid options for 'imputation_strategy' are {}."
60-
" Got imputation_strategy={!r} instead"
61-
.format(self.valid_strategies,
83+
" Got imputation_strategy={!r} instead."
84+
.format(self.valid_imputation_strategies,
6285
imputation_strategy))
6386

87+
if weight == 0:
88+
log.warning("The target encoder's additive smoothing weight is "
89+
"set to 0. This disables smoothing and may make the "
90+
"encoding prone to overfitting.")
91+
6492
self.weight = weight
6593
self.imputation_strategy = imputation_strategy
6694

@@ -69,7 +97,7 @@ def __init__(self, weight: float=0.0,
6997
self._global_mean = None
7098

7199
def attributes_to_dict(self) -> dict:
72-
"""Return the attributes of TargetEncoder in a dictionary
100+
"""Return the attributes of TargetEncoder in a dictionary.
73101
74102
Returns
75103
-------
@@ -98,13 +126,11 @@ def set_attributes_from_dict(self, params: dict):
98126
Contains the attributes of TargetEncoder with their
99127
names as key.
100128
"""
101-
102129
if "weight" in params and type(params["weight"]) == float:
103130
self.weight = params["weight"]
104131

105132
if ("imputation_strategy" in params and
106-
params["imputation_strategy"] in self.valid_strategies):
107-
133+
params["imputation_strategy"] in self.valid_imputation_strategies):
108134
self.imputation_strategy = params["imputation_strategy"]
109135

110136
if "_global_mean" in params and type(params["_global_mean"]) == float:
@@ -128,7 +154,7 @@ def dict_to_series(key, value):
128154

129155
def fit(self, data: pd.DataFrame, column_names: list,
130156
target_column: str):
131-
"""Fit the TargetEncoder to the data
157+
"""Fit the TargetEncoder to the data.
132158
133159
Parameters
134160
----------
@@ -140,7 +166,6 @@ def fit(self, data: pd.DataFrame, column_names: list,
140166
target_column : str
141167
Column name of the target
142168
"""
143-
144169
# compute global mean (target incidence in case of binary target)
145170
y = data[target_column]
146171
self._global_mean = y.sum() / y.count()
@@ -154,108 +179,113 @@ def fit(self, data: pd.DataFrame, column_names: list,
154179
self._mapping[column] = self._fit_column(data[column], y)
155180

156181
def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series:
157-
"""Summary
182+
"""Compute the mapping from each distinct value of a categorical
183+
column to its new value, following the formulas mentioned in the
184+
docstring of this class.
158185
159186
Parameters
160187
----------
161188
X : pd.Series
162189
data used to compute the encoding mapping for an individual
163190
categorical variable.
164191
y : pd.Series
165-
series containing the targets for each observation
192+
series containing the targets for each observation (value) of
193+
this categorical variable.
166194
167195
Returns
168196
-------
169197
pd.Series
170-
Mapping containing the value to replace each group of the
171-
categorical with.
198+
Mapping containing the new value to replace each distinct value
199+
of the categorical variable with.
172200
"""
173201
stats = y.groupby(X).agg(["mean", "count"])
174202

175-
# Note if self.weight = 0, we have the ordinary incidence replacement
176-
numerator = (stats["count"]*stats["mean"]
203+
# Note: if self.weight = 0, we have the ordinary incidence replacement
204+
numerator = (stats["count"] * stats["mean"]
177205
+ self.weight * self._global_mean)
178206

179207
denominator = stats["count"] + self.weight
180208

181-
return numerator/denominator
209+
return numerator / denominator
182210

183211
def transform(self, data: pd.DataFrame,
184212
column_names: list) -> pd.DataFrame:
185-
"""Replace (e.g. encode) categories of each column with its average
186-
incidence which was computed when the fit method was called
213+
"""Replace (i.e. encode) values of each categorical column with a
214+
new value (reflecting the corresponding average target value,
215+
optionally smoothed by a regularization weight),
216+
which was computed when the fit method was called.
187217
188218
Parameters
189219
----------
190-
X : pd.DataFrame
191-
data to encode
220+
data : pd.DataFrame
221+
the data to encode.
192222
column_names : list
193-
Columns of data to be encoded
223+
the name of the categorical columns in the data to be encoded.
194224
195225
Returns
196226
-------
197227
pd.DataFrame
198-
transformed data
228+
the resulting transformed data.
199229
200230
Raises
201231
------
202232
NotFittedError
203233
Exception when TargetEncoder was not fitted before calling this
204-
method
205-
234+
method.
206235
"""
207236
if (len(self._mapping) == 0) or (self._global_mean is None):
208237
msg = ("This {} instance is not fitted yet. Call 'fit' with "
209238
"appropriate arguments before using this method.")
210-
211239
raise NotFittedError(msg.format(self.__class__.__name__))
212240

213241
for column in tqdm(column_names, desc="Applying target encoding..."):
214-
215242
if column not in data.columns:
216-
log.warning("Unknown column '{}' will be skipped"
243+
log.warning("Unknown column '{}' will be skipped."
217244
.format(column))
218245
continue
219246
elif column not in self._mapping:
220247
log.warning("Column '{}' is not in fitted output "
221-
"and will be skipped".format(column))
248+
"and will be skipped.".format(column))
222249
continue
223-
224250
data = self._transform_column(data, column)
225251

226252
return data
227253

228254
def _transform_column(self, data: pd.DataFrame,
229255
column_name: str) -> pd.DataFrame:
230-
"""Replace (e.g. encode) categories of each column with its average
231-
incidence which was computed when the fit method was called
256+
"""Replace (i.e. encode) values of a categorical column with a
257+
new value (reflecting the corresponding average target value,
258+
optionally smoothed by a regularization weight),
259+
which was computed when the fit method was called.
232260
233261
Parameters
234262
----------
235-
X : pd.DataFrame
236-
data to encode
263+
data : pd.DataFrame
264+
the data to encode.
237265
column_name : str
238-
Name of the column in data to be encoded
266+
the name of the column in the data to be encoded.
239267
240268
Returns
241269
-------
242270
pd.DataFrame
243-
transformed data
271+
the resulting transformed data.
244272
"""
245273
new_column = TargetEncoder._clean_column_name(column_name)
246274

247-
# Convert dtype to float because when the original dtype
248-
# is of type "category", the resulting dtype is also of type
249-
# "category"
275+
# Convert dtype to float, because when the original dtype
276+
# is of type "category", the resulting dtype would otherwise also be of
277+
# type "category":
250278
data[new_column] = (data[column_name].map(self._mapping[column_name])
251279
.astype("float"))
252280

253281
# In case of categorical data, it could be that new categories will
254282
# emerge which were not present in the train set, so this will result
255-
# in missing values (which should be replaced)
283+
# in missing values, which should be replaced according to the
284+
# configured imputation strategy:
256285
if data[new_column].isnull().sum() > 0:
257286
if self.imputation_strategy == "mean":
258-
data[new_column].fillna(self._global_mean, inplace=True)
287+
data[new_column].fillna(self._global_mean,
288+
inplace=True)
259289
elif self.imputation_strategy == "min":
260290
data[new_column].fillna(data[new_column].min(),
261291
inplace=True)
@@ -282,14 +312,16 @@ def fit_transform(self, data: pd.DataFrame,
282312
Returns
283313
-------
284314
pd.DataFrame
285-
data with additional discretized variables
315+
data with additional columns, holding the target-encoded variables.
286316
"""
287317
self.fit(data, column_names, target_column)
288318
return self.transform(data, column_names)
289319

290320
@staticmethod
291321
def _clean_column_name(column_name: str) -> str:
292-
"""Clean column name string by removing "_bin" and adding "_enc"
322+
"""Generate a name for the new column that this target encoder
323+
generates in the given data, by removing "_bin", "_processed" or
324+
"_cleaned" from the original categorical column, and adding "_enc".
293325
294326
Parameters
295327
----------

0 commit comments

Comments
 (0)