Skip to content

Commit ecb7d90

Browse files
author
Sam Borms
authored
Merge pull request #82 from PythonPredictions/issue-#65-categorical_data_preprocessor
Issue #65 categorical data preprocessor
2 parents abfdc66 + d4e0d71 commit ecb7d90

3 files changed

Lines changed: 93 additions & 42 deletions

File tree

.gitignore

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
#Ignored directories in root folder
2-
1+
# Ignored directories in root folder
32

43
# Byte-compiled / optimized / DLL files
54
__pycache__/
@@ -107,7 +106,7 @@ ENV/
107106
# vscode settings
108107
.vscode/
109108

110-
# Other ignore files
109+
# Other ignored files
111110
*.pptx
112111
*.ppt
113112
.idea/

cobra/preprocessing/categorical_data_processor.py

Lines changed: 71 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
- Jan Benisek (implementation)
1515
- Matthias Roels (implementation)
1616
"""
17+
1718
# standard lib imports
1819
import re
1920
from typing import Optional
@@ -38,33 +39,42 @@ class CategoricalDataProcessor(BaseEstimator):
3839
Attributes
3940
----------
4041
category_size_threshold : int
41-
minimal size of a category to keep it as a separate category
42+
Minimal size of a category to keep it as a separate category.
4243
forced_categories : dict
4344
Map to prevent certain categories from being grouped into ``Other``
44-
for each colum - dict of the form ``{col:[forced vars]}``.
45+
for each column - dict of the form ``{col:[forced vars]}``.
4546
keep_missing : bool
46-
Whether or not to keep missing as a separate category
47+
Whether or not to keep missing as a separate category.
48+
model_type : str
49+
Model type ("classification" or "regression").
4750
p_value_threshold : float
4851
Significance threshold for regrouping.
4952
regroup : bool
50-
Whether or not to regroup categories
53+
Whether or not to regroup categories.
5154
regroup_name : str
5255
New name of the non-significant regrouped variables
5356
scale_contingency_table : bool
54-
Whether contingency table should be scaled before chi^2.'
57+
Whether contingency table should be scaled before chi^2.
5558
"""
5659

57-
valid_keys = ["regroup", "regroup_name", "keep_missing",
60+
valid_keys = ["model_type", "regroup", "regroup_name", "keep_missing",
5861
"category_size_threshold", "p_value_threshold",
5962
"scale_contingency_table", "forced_categories"]
6063

61-
def __init__(self, regroup: bool = True, regroup_name: str = "Other",
64+
def __init__(self,
65+
model_type: str = "classification",
66+
regroup: bool = True,
67+
regroup_name: str = "Other",
6268
keep_missing: bool = True,
6369
category_size_threshold: int = 5,
6470
p_value_threshold: float = 0.001,
6571
scale_contingency_table: bool = True,
6672
forced_categories: dict = {}):
6773

74+
if model_type not in ["classification", "regression"]:
75+
raise ValueError("An unexpected model_type was provided. Valid model_types are either 'classification' or 'regression'.")
76+
77+
self.model_type = model_type
6878
self.regroup = regroup
6979
self.regroup_name = regroup_name
7080
self.keep_missing = keep_missing
@@ -136,12 +146,12 @@ def fit(self, data: pd.DataFrame, column_names: list,
136146
Parameters
137147
----------
138148
data : pd.DataFrame
139-
data used to compute the mapping to encode the categorical
149+
Data used to compute the mapping to encode the categorical
140150
variables with.
141151
column_names : list
142-
Columns of data to be processed
152+
Columns of data to be processed.
143153
target_column : str
144-
Column name of the target
154+
Column name of the target.
145155
"""
146156

147157
if not self.regroup:
@@ -168,8 +178,8 @@ def fit(self, data: pd.DataFrame, column_names: list,
168178

169179
def _fit_column(self, data: pd.DataFrame, column_name: str,
170180
target_column) -> set:
171-
"""Compute which categories to regroup into "Other" for a particular
172-
column
181+
"""Compute which categories to regroup into "Other"
182+
for a particular column
173183
174184
Parameters
175185
----------
@@ -183,13 +193,18 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
183193
set
184194
Set of categories to combine into a category "Other".
185195
"""
196+
model_type = self.model_type
197+
186198
if len(data[column_name].unique()) == 1:
187199
log.warning(f"Predictor {column_name} is constant"
188200
" and will be ignored in computation.")
189201
return set(data[column_name].unique())
190202

191203
y = data[target_column]
192-
incidence = y.mean()
204+
if model_type == "classification":
205+
incidence = y.mean()
206+
else:
207+
incidence = None
193208

194209
combined_categories = set()
195210

@@ -201,13 +216,14 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
201216
unique_categories = list(X.unique())
202217

203218
# do not merge categories in case of dummies, i.e. 0 and 1
204-
# (and possibly "Missings")
219+
# (and possibly "Missing")
205220
if (len(unique_categories) == 2
206221
or (len(unique_categories) == 3
207222
and "Missing" in unique_categories)):
208223
return set(unique_categories)
209224

210225
# get small categories and add them to the merged category list
226+
# does not apply incidence factor when model_type = "regression"
211227
small_categories = (CategoricalDataProcessor
212228
._get_small_categories(
213229
X,
@@ -221,6 +237,7 @@ def _fit_column(self, data: pd.DataFrame, column_name: str,
221237

222238
pval = (CategoricalDataProcessor
223239
._compute_p_value(X, y, category,
240+
model_type,
224241
self.scale_contingency_table))
225242

226243
# if not significant, add it to the list
@@ -348,24 +365,27 @@ def _get_small_categories(predictor_series: pd.Series,
348365
incidence: float,
349366
category_size_threshold: int) -> set:
350367
"""Fetch categories with a size below a certain threshold.
351-
Note that we use an additional weighting with the overall incidence
368+
Note that we use an additional weighting with the overall incidence.
352369
353370
Parameters
354371
----------
355372
predictor_series : pd.Series
356-
Description
373+
Variables data.
357374
incidence : float
358-
global train incidence
375+
Global train incidence.
359376
category_size_threshold : int
360-
minimal size of a category to keep it as a separate category
377+
Minimal size of a category to keep it as a separate category.
361378
362379
Returns
363380
-------
364381
set
365-
List a categories with a count below a certain threshold
382+
Set of categories with a count below a certain threshold.
366383
"""
367384
category_counts = predictor_series.groupby(predictor_series).size()
368-
factor = max(incidence, 1 - incidence)
385+
if incidence is not None:
386+
factor = max(incidence, 1 - incidence)
387+
else:
388+
factor = 1
369389

370390
# Get all categories with a count below a threshold
371391
bool_mask = (category_counts*factor) <= category_size_threshold
@@ -404,10 +424,14 @@ def _replace_missings(data: pd.DataFrame,
404424

405425
@staticmethod
406426
def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
427+
model_type: str,
407428
scale_contingency_table: bool) -> float:
408-
"""Calculates p-value in contingency table (chi-square test) in
409-
order to evaluate whether category of interest is significantly
410-
different from the rest of the categories, given the target variable.
429+
"""Calculates p-value in order to evaluate whether category of
430+
interest is significantly different from the rest of the
431+
categories, given the target variable.
432+
433+
If model_type is "classification", a chi-squared test based on a contingency table is used.
434+
If model_type is "regression", a Kruskal-Wallis test is used.
411435
412436
Parameters
413437
----------
@@ -416,31 +440,42 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
416440
y : pd.Series
417441
Target data.
418442
category : str
419-
Category for which we carry out the test
443+
Category for which we carry out the test.
444+
model_type : str
445+
Model type ("classification" or "regression").
420446
scale_contingency_table : bool
421-
Whether we scale contingency table with incidence rate
447+
Whether we scale contingency table with incidence rate.
448+
Only used when model_type = "classification".
422449
423450
Returns
424451
-------
425452
float
426-
p-value of chi-square test
453+
p-value of applied statistical test
427454
"""
428455
df = pd.concat([X, y], axis=1)
456+
df.columns = ["X", "y"]
429457
df["other_categories"] = np.where(X == category, 0, 1)
430458

431-
contigency_table = pd.crosstab(index=df['other_categories'], columns=y,
432-
margins=False)
459+
if model_type == "classification":
460+
contingency_table = pd.crosstab(index=df["other_categories"], columns=df["y"],
461+
margins=False)
462+
463+
# if true, we scale the "other" categories
464+
if scale_contingency_table:
465+
size_other_cats = contingency_table.iloc[1].sum()
466+
incidence_mean = y.mean()
467+
468+
contingency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats
469+
contingency_table.iloc[1, 1] = incidence_mean * size_other_cats
470+
contingency_table = contingency_table.values.astype(np.int64)
433471

434-
# if true, we scale the "other" categories
435-
if scale_contingency_table:
436-
size_other_cats = contigency_table.iloc[1].sum()
437-
incidence_mean = y.mean()
472+
pval = stats.chi2_contingency(contingency_table, correction=False)[1]
438473

439-
contigency_table.iloc[1, 0] = (1-incidence_mean) * size_other_cats
440-
contigency_table.iloc[1, 1] = incidence_mean * size_other_cats
441-
contigency_table = contigency_table.values.astype(np.int64)
474+
elif model_type == "regression":
475+
pval = stats.kruskal(df.y[df.other_categories == 0],
476+
df.y[df.other_categories == 1])[1]
442477

443-
return stats.chi2_contingency(contigency_table, correction=False)[1]
478+
return pval
444479

445480
@staticmethod
446481
def _replace_categories(data: pd.Series, categories: set,

tests/preprocessing/test_categorical_data_processor.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def test_attributes_to_dict(self):
2020
actual = processor.attributes_to_dict()
2121

2222
expected = {
23+
"model_type": "classification",
2324
"regroup": True,
2425
"regroup_name": "Other",
2526
"keep_missing": True,
@@ -72,22 +73,38 @@ def test_set_attributes_from_dict(self, attribute):
7273
@pytest.mark.parametrize("scale_contingency_table, expected",
7374
[(False, 0.01329),
7475
(True, 0.43437)])
75-
def test_compute_p_value(self, scale_contingency_table, expected):
76+
def test_compute_p_value_classification(self, scale_contingency_table, expected):
7677

7778
X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10))
7879
y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2))
7980
category = "c1"
8081

8182
actual = (CategoricalDataProcessor
82-
._compute_p_value(X, y, category, scale_contingency_table))
83+
._compute_p_value(X, y, category, "classification", scale_contingency_table))
84+
85+
assert pytest.approx(actual, abs=1e-5) == expected
86+
87+
@pytest.mark.parametrize("seed, expected",
88+
[(505, 0.02222),
89+
(603, 0.89230)])
90+
def test_compute_p_value_regression(self, seed, expected):
91+
92+
np.random.seed(seed)
93+
94+
X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10))
95+
y = pd.Series(data=np.random.uniform(0, 1, 100)*5)
96+
category = "c1"
97+
98+
actual = (CategoricalDataProcessor
99+
._compute_p_value(X, y, category, "regression", None))
83100

84101
assert pytest.approx(actual, abs=1e-5) == expected
85102

86103
def test_get_small_categories(self):
87104

88105
data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5))
89106
incidence = 0.35
90-
threshold = 10 # to make it easy to manualy compute
107+
threshold = 10 # to make it easy to manually compute
91108
expected = {"c3", "c4"}
92109

93110
actual = (CategoricalDataProcessor

0 commit comments

Comments
 (0)