11"""
22Incidence Replacement Module. The implementation is inspired by
3- https://contrib.scikit-learn.org/categorical-encoding/index.html
3+ https://github.com/scikit-learn-contrib/category_encoders.
44
55Authors:
66
99"""
1010import logging
1111
12- #import numpy as np
1312import pandas as pd
1413from tqdm .auto import tqdm
1514from sklearn .base import BaseEstimator
2019
2120class TargetEncoder (BaseEstimator ):
2221
23- """Target encoding for categorical features.
22+ """Target encoding for categorical features, inspired by
23+ http://contrib.scikit-learn.org/category_encoders/targetencoder.html.
2424
2525 Replace each value of the categorical feature with the average of the
2626 target values (in case of a binary target, this is the incidence of the
2727 group). This encoding scheme is also called Mean encoding.
2828
29+ Note that, when applying this target encoding, values of the categorical
30+ feature that have not been seen during fit will be imputed according to the
31+ configured imputation strategy: replacement with the global target mean,
32+ or the minimum or maximum encoded value of the categorical variable.
33+
2934 The main problem with Target encoding is overfitting; the fact that we are
3035 encoding the feature based on target classes may lead to data leakage,
31- rendering the feature biased. This can be solved using some type of
32- regularization. A popular way to handle this is to use cross-validation
33- and compute the means in each out-of-fold. However, the approach
34- implemented here makes use of additive smoothing
35- (https://en.wikipedia.org/wiki/Additive_smoothing)
36+ rendering the feature biased.
37+ This can be solved using some type of regularization. A popular way to
38+ handle this is to use cross-validation and compute the means in each
39+ out-of-fold. However, the approach implemented here makes use of
40+ additive smoothing (https://en.wikipedia.org/wiki/Additive_smoothing).
41+
42+ In summary:
43+
44+ - with a binary classification target, a value of a categorical variable is
45+ replaced with:
46+
47+ [count(variable=value) * P(target=1|variable=value) + weight * P(target=1)]
48+ / [count(variable=value) + weight]
49+
50+ - with a regression target, a value of a categorical variable is replaced
51+ with:
52+
53+ [count(variable=value) * E(target|variable=value) + weight * E(target)]
54+ / [count(variable=value) + weight]
3655
3756 Attributes
3857 ----------
3958 imputation_strategy : str
4059 in case there is a particular column which contains new categories,
4160 the encoding will lead to NULL values which should be imputed.
42- Valid strategies are to replace with the global mean of the train
43- set or the min (resp. max) incidence of the categories of that
44- particular variable.
61+ Valid strategies then are to replace the NULL values with the global
62+ mean of the train set or the min (resp. max) incidence of the
63+ categories of that particular variable.
4564 weight : float
46- Smoothing parameters (non-negative). The higher the value of the
47- parameter, the bigger the contribution of the overall mean. When set to
48- zero, there is no smoothing (e.g. the pure target incidence is used).
65+ Smoothing parameter (non-negative). The higher the value of the
66+ parameter, the bigger the contribution of the overall mean of targets
67+ learnt from all training data (prior) and the smaller the contribution
68+ of the mean target learnt from data with the current categorical value
69+ (posterior), so the bigger the smoothing (regularization) effect.
70+ When set to zero, there is no smoothing (i.e. the mean target of the
71+ current categorical value is used).
4972 """
5073
51- valid_strategies = ("mean" , "min" , "max" )
74+ valid_imputation_strategies = ("mean" , "min" , "max" )
5275
5376 def __init__ (self , weight : float = 0.0 ,
5477 imputation_strategy : str = "mean" ):
5578
5679 if weight < 0 :
5780 raise ValueError ("The value of weight cannot be smaller than zero" )
58- elif imputation_strategy not in self .valid_strategies :
81+ elif imputation_strategy not in self .valid_imputation_strategies :
5982 raise ValueError ("Valid options for 'imputation_strategy' are {}."
60- " Got imputation_strategy={!r} instead"
61- .format (self .valid_strategies ,
83+ " Got imputation_strategy={!r} instead. "
84+ .format (self .valid_imputation_strategies ,
6285 imputation_strategy ))
6386
87+ if weight == 0 :
88+ log .warning ("The target encoder's additive smoothing weight is "
89+ "set to 0. This disables smoothing and may make the "
90+ "encoding prone to overfitting." )
91+
6492 self .weight = weight
6593 self .imputation_strategy = imputation_strategy
6694
@@ -69,7 +97,7 @@ def __init__(self, weight: float=0.0,
6997 self ._global_mean = None
7098
7199 def attributes_to_dict (self ) -> dict :
72- """Return the attributes of TargetEncoder in a dictionary
100+ """Return the attributes of TargetEncoder in a dictionary.
73101
74102 Returns
75103 -------
@@ -98,13 +126,11 @@ def set_attributes_from_dict(self, params: dict):
98126 Contains the attributes of TargetEncoder with their
99127 names as key.
100128 """
101-
102129 if "weight" in params and type (params ["weight" ]) == float :
103130 self .weight = params ["weight" ]
104131
105132 if ("imputation_strategy" in params and
106- params ["imputation_strategy" ] in self .valid_strategies ):
107-
133+ params ["imputation_strategy" ] in self .valid_imputation_strategies ):
108134 self .imputation_strategy = params ["imputation_strategy" ]
109135
110136 if "_global_mean" in params and type (params ["_global_mean" ]) == float :
@@ -128,7 +154,7 @@ def dict_to_series(key, value):
128154
129155 def fit (self , data : pd .DataFrame , column_names : list ,
130156 target_column : str ):
131- """Fit the TargetEncoder to the data
157+ """Fit the TargetEncoder to the data.
132158
133159 Parameters
134160 ----------
@@ -140,7 +166,6 @@ def fit(self, data: pd.DataFrame, column_names: list,
140166 target_column : str
141167 Column name of the target
142168 """
143-
144169 # compute global mean (target incidence in case of binary target)
145170 y = data [target_column ]
146171 self ._global_mean = y .sum () / y .count ()
@@ -154,108 +179,113 @@ def fit(self, data: pd.DataFrame, column_names: list,
154179 self ._mapping [column ] = self ._fit_column (data [column ], y )
155180
156181 def _fit_column (self , X : pd .Series , y : pd .Series ) -> pd .Series :
157- """Summary
182+ """Compute, for each distinct value of a categorical column,
183+ the new value to replace it with, following the formulas mentioned in
184+ the docstring of this class.
158185
159186 Parameters
160187 ----------
161188 X : pd.Series
162189 data used to compute the encoding mapping for an individual
163190 categorical variable.
164191 y : pd.Series
165- series containing the targets for each observation
192+ series containing the targets for each observation (value) of
193+ this categorical variable.
166194
167195 Returns
168196 -------
169197 pd.Series
170- Mapping containing the value to replace each group of the
171- categorical with.
198+ Mapping containing the new value to replace each distinct value
199+ of the categorical variable with.
172200 """
173201 stats = y .groupby (X ).agg (["mean" , "count" ])
174202
175- # Note if self.weight = 0, we have the ordinary incidence replacement
176- numerator = (stats ["count" ]* stats ["mean" ]
203+ # Note: if self.weight = 0, we have the ordinary incidence replacement
204+ numerator = (stats ["count" ] * stats ["mean" ]
177205 + self .weight * self ._global_mean )
178206
179207 denominator = stats ["count" ] + self .weight
180208
181- return numerator / denominator
209+ return numerator / denominator
182210
183211 def transform (self , data : pd .DataFrame ,
184212 column_names : list ) -> pd .DataFrame :
185- """Replace (e.g. encode) categories of each column with its average
186- incidence which was computed when the fit method was called
213+ """Replace (i.e. encode) values of each categorical column with a
214+ new value (reflecting the corresponding average target value,
215+ optionally smoothed by a regularization weight),
216+ which was computed when the fit method was called.
187217
188218 Parameters
189219 ----------
190- X : pd.DataFrame
191- data to encode
220+ data : pd.DataFrame
221+ the data to encode.
192222 column_names : list
193- Columns of data to be encoded
223+ the names of the categorical columns in the data to be encoded.
194224
195225 Returns
196226 -------
197227 pd.DataFrame
198- transformed data
228+ the resulting transformed data.
199229
200230 Raises
201231 ------
202232 NotFittedError
203233 Exception when TargetEncoder was not fitted before calling this
204- method
205-
234+ method.
206235 """
207236 if (len (self ._mapping ) == 0 ) or (self ._global_mean is None ):
208237 msg = ("This {} instance is not fitted yet. Call 'fit' with "
209238 "appropriate arguments before using this method." )
210-
211239 raise NotFittedError (msg .format (self .__class__ .__name__ ))
212240
213241 for column in tqdm (column_names , desc = "Applying target encoding..." ):
214-
215242 if column not in data .columns :
216- log .warning ("Unknown column '{}' will be skipped"
243+ log .warning ("Unknown column '{}' will be skipped. "
217244 .format (column ))
218245 continue
219246 elif column not in self ._mapping :
220247 log .warning ("Column '{}' is not in fitted output "
221- "and will be skipped" .format (column ))
248+ "and will be skipped. " .format (column ))
222249 continue
223-
224250 data = self ._transform_column (data , column )
225251
226252 return data
227253
228254 def _transform_column (self , data : pd .DataFrame ,
229255 column_name : str ) -> pd .DataFrame :
230- """Replace (e.g. encode) categories of each column with its average
231- incidence which was computed when the fit method was called
256+ """Replace (i.e. encode) values of a categorical column with a
257+ new value (reflecting the corresponding average target value,
258+ optionally smoothed by a regularization weight),
259+ which was computed when the fit method was called.
232260
233261 Parameters
234262 ----------
235- X : pd.DataFrame
236- data to encode
263+ data : pd.DataFrame
264+ the data to encode.
237265 column_name : str
238- Name of the column in data to be encoded
266+ the name of the column in the data to be encoded.
239267
240268 Returns
241269 -------
242270 pd.DataFrame
243- transformed data
271+ the resulting transformed data.
244272 """
245273 new_column = TargetEncoder ._clean_column_name (column_name )
246274
247- # Convert dtype to float because when the original dtype
248- # is of type "category", the resulting dtype is also of type
249- # "category"
275+ # Convert dtype to float, because when the original dtype
276+ # is of type "category", the resulting dtype would otherwise also be of
277+ # type "category":
250278 data [new_column ] = (data [column_name ].map (self ._mapping [column_name ])
251279 .astype ("float" ))
252280
253281 # In case of categorical data, it could be that new categories will
254282 # emerge which were not present in the train set, so this will result
255- # in missing values (which should be replaced)
283+ # in missing values, which should be replaced according to the
284+ # configured imputation strategy:
256285 if data [new_column ].isnull ().sum () > 0 :
257286 if self .imputation_strategy == "mean" :
258- data [new_column ].fillna (self ._global_mean , inplace = True )
287+ data [new_column ].fillna (self ._global_mean ,
288+ inplace = True )
259289 elif self .imputation_strategy == "min" :
260290 data [new_column ].fillna (data [new_column ].min (),
261291 inplace = True )
@@ -282,14 +312,16 @@ def fit_transform(self, data: pd.DataFrame,
282312 Returns
283313 -------
284314 pd.DataFrame
285- data with additional discretized variables
315+ data with additional columns, holding the target-encoded variables.
286316 """
287317 self .fit (data , column_names , target_column )
288318 return self .transform (data , column_names )
289319
290320 @staticmethod
291321 def _clean_column_name (column_name : str ) -> str :
292- """Clean column name string by removing "_bin" and adding "_enc"
322+ """Generate a name for the new column that this target encoder
323+ generates in the given data, by removing "_bin", "_processed" or
324+ "_cleaned" from the original categorical column, and adding "_enc".
293325
294326 Parameters
295327 ----------
0 commit comments