1414- Jan Benisek (implementation)
1515- Matthias Roels (implementation)
1616"""
17+
1718# standard lib imports
1819import re
1920from typing import Optional
@@ -38,33 +39,42 @@ class CategoricalDataProcessor(BaseEstimator):
3839 Attributes
3940 ----------
4041 category_size_threshold : int
41- minimal size of a category to keep it as a separate category
42+ Minimal size of a category to keep it as a separate category.
4243 forced_categories : dict
4344 Map to prevent certain categories from being grouped into ``Other``
44- for each colum - dict of the form ``{col:[forced vars]}``.
45+ for each column - dict of the form ``{col:[forced vars]}``.
4546 keep_missing : bool
46- Whether or not to keep missing as a separate category
47+ Whether or not to keep missing as a separate category.
48+ model_type : str
49+ Model type ("classification" or "regression").
4750 p_value_threshold : float
4851 Significance threshold for regrouping.
4952 regroup : bool
50- Whether or not to regroup categories
53+ Whether or not to regroup categories.
5154 regroup_name : str
5255 New name of the category into which non-significant categories are regrouped.
5356 scale_contingency_table : bool
54- Whether contingency table should be scaled before chi^2.'
57+ Whether contingency table should be scaled before chi^2.
5558 """
5659
57- valid_keys = ["regroup" , "regroup_name" , "keep_missing" ,
60+ valid_keys = ["model_type" , " regroup" , "regroup_name" , "keep_missing" ,
5861 "category_size_threshold" , "p_value_threshold" ,
5962 "scale_contingency_table" , "forced_categories" ]
6063
61- def __init__ (self , regroup : bool = True , regroup_name : str = "Other" ,
64+ def __init__ (self ,
65+ model_type : str = "classification" ,
66+ regroup : bool = True ,
67+ regroup_name : str = "Other" ,
6268 keep_missing : bool = True ,
6369 category_size_threshold : int = 5 ,
6470 p_value_threshold : float = 0.001 ,
6571 scale_contingency_table : bool = True ,
6672 forced_categories : dict = {}):
6773
74+ if model_type not in ["classification" , "regression" ]:
75+ raise ValueError ("An unexpected model_type was provided. Valid model_types are either 'classification' or 'regression'." )
76+
77+ self .model_type = model_type
6878 self .regroup = regroup
6979 self .regroup_name = regroup_name
7080 self .keep_missing = keep_missing
@@ -136,12 +146,12 @@ def fit(self, data: pd.DataFrame, column_names: list,
136146 Parameters
137147 ----------
138148 data : pd.DataFrame
139- data used to compute the mapping to encode the categorical
149+ Data used to compute the mapping to encode the categorical
140150 variables with.
141151 column_names : list
142- Columns of data to be processed
152+ Columns of data to be processed.
143153 target_column : str
144- Column name of the target
154+ Column name of the target.
145155 """
146156
147157 if not self .regroup :
@@ -168,8 +178,8 @@ def fit(self, data: pd.DataFrame, column_names: list,
168178
169179 def _fit_column (self , data : pd .DataFrame , column_name : str ,
170180 target_column ) -> set :
171- """Compute which categories to regroup into "Other" for a particular
172- column
181+ """Compute which categories to regroup into "Other"
182+ for a particular column
173183
174184 Parameters
175185 ----------
@@ -183,13 +193,18 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
183193 set
184194 Set of categories to combine into a category "Other".
185195 """
196+ model_type = self .model_type
197+
186198 if len (data [column_name ].unique ()) == 1 :
187199 log .warning (f"Predictor { column_name } is constant"
188200 " and will be ignored in computation." )
189201 return set (data [column_name ].unique ())
190202
191203 y = data [target_column ]
192- incidence = y .mean ()
204+ if model_type == "classification" :
205+ incidence = y .mean ()
206+ else :
207+ incidence = None
193208
194209 combined_categories = set ()
195210
@@ -201,13 +216,14 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
201216 unique_categories = list (X .unique ())
202217
203218 # do not merge categories in case of dummies, i.e. 0 and 1
204- # (and possibly "Missings ")
219+ # (and possibly "Missing")
205220 if (len (unique_categories ) == 2
206221 or (len (unique_categories ) == 3
207222 and "Missing" in unique_categories )):
208223 return set (unique_categories )
209224
210225 # get small categories and add them to the merged category list
226+ # the incidence factor is not applied when model_type = "regression"
211227 small_categories = (CategoricalDataProcessor
212228 ._get_small_categories (
213229 X ,
@@ -221,6 +237,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
221237
222238 pval = (CategoricalDataProcessor
223239 ._compute_p_value (X , y , category ,
240+ model_type ,
224241 self .scale_contingency_table ))
225242
226243 # if not significant, add it to the list
@@ -348,24 +365,27 @@ def _get_small_categories(predictor_series: pd.Series,
348365 incidence : float ,
349366 category_size_threshold : int ) -> set :
350367 """Fetch categories with a size below a certain threshold.
351- Note that we use an additional weighting with the overall incidence
368+ Note that we use an additional weighting with the overall incidence.
352369
353370 Parameters
354371 ----------
355372 predictor_series : pd.Series
356- Description
373+ Variables data.
357374 incidence : float
358- global train incidence
375+ Global train incidence.
359376 category_size_threshold : int
360- minimal size of a category to keep it as a separate category
377+ Minimal size of a category to keep it as a separate category.
361378
362379 Returns
363380 -------
364381 set
365- List a categories with a count below a certain threshold
382+ Set of categories with a count below a certain threshold.
366383 """
367384 category_counts = predictor_series .groupby (predictor_series ).size ()
368- factor = max (incidence , 1 - incidence )
385+ if incidence is not None :
386+ factor = max (incidence , 1 - incidence )
387+ else :
388+ factor = 1
369389
370390 # Get all categories with a count below a threshold
371391 bool_mask = (category_counts * factor ) <= category_size_threshold
@@ -404,10 +424,14 @@ def _replace_missings(data: pd.DataFrame,
404424
405425 @staticmethod
406426 def _compute_p_value (X : pd .Series , y : pd .Series , category : str ,
427+ model_type : str ,
407428 scale_contingency_table : bool ) -> float :
408- """Calculates p-value in contingency table (chi-square test) in
409- order to evaluate whether category of interest is significantly
410- different from the rest of the categories, given the target variable.
429+ """Calculates p-value in order to evaluate whether category of
430+ interest is significantly different from the rest of the
431+ categories, given the target variable.
432+
433+ If model_type is "classification", a chi-squared test on a contingency table is used.
434+ If model_type is "regression", a Kruskal-Wallis test is used.
411435
412436 Parameters
413437 ----------
@@ -416,31 +440,42 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
416440 y : pd.Series
417441 Target data.
418442 category : str
419- Category for which we carry out the test
443+ Category for which we carry out the test.
444+ model_type : str
445+ Model type ("classification" or "regression").
420446 scale_contingency_table : bool
421- Whether we scale contingency table with incidence rate
447+ Whether we scale contingency table with incidence rate.
448+ Only used when model_type = "classification".
422449
423450 Returns
424451 -------
425452 float
426- p-value of chi-square test
453+ p-value of the applied statistical test.
427454 """
428455 df = pd .concat ([X , y ], axis = 1 )
456+ df .columns = ["X" , "y" ]
429457 df ["other_categories" ] = np .where (X == category , 0 , 1 )
430458
431- contigency_table = pd .crosstab (index = df ['other_categories' ], columns = y ,
432- margins = False )
459+ if model_type == "classification" :
460+ contingency_table = pd .crosstab (index = df ["other_categories" ], columns = df ["y" ],
461+ margins = False )
462+
463+ # if true, we scale the "other" categories
464+ if scale_contingency_table :
465+ size_other_cats = contingency_table .iloc [1 ].sum ()
466+ incidence_mean = y .mean ()
467+
468+ contingency_table .iloc [1 , 0 ] = (1 - incidence_mean ) * size_other_cats
469+ contingency_table .iloc [1 , 1 ] = incidence_mean * size_other_cats
470+ contingency_table = contingency_table .values .astype (np .int64 )
433471
434- # if true, we scale the "other" categories
435- if scale_contingency_table :
436- size_other_cats = contigency_table .iloc [1 ].sum ()
437- incidence_mean = y .mean ()
472+ pval = stats .chi2_contingency (contingency_table , correction = False )[1 ]
438473
439- contigency_table . iloc [ 1 , 0 ] = ( 1 - incidence_mean ) * size_other_cats
440- contigency_table . iloc [ 1 , 1 ] = incidence_mean * size_other_cats
441- contigency_table = contigency_table . values . astype ( np . int64 )
474+ elif model_type == "regression" :
475+ pval = stats . kruskal ( df . y [ df . other_categories == 0 ],
476+ df . y [ df . other_categories == 1 ])[ 1 ]
442477
443- return stats . chi2_contingency ( contigency_table , correction = False )[ 1 ]
478+ return pval
444479
445480 @staticmethod
446481 def _replace_categories (data : pd .Series , categories : set ,
0 commit comments