Skip to content

Commit 2ce8394

Browse files
committed
Make TargetEncoder from sklearn.preprocessing match the actual source code
1 parent 65063a3 commit 2ce8394

3 files changed

Lines changed: 213 additions & 40 deletions

File tree

stubs/sklearn/preprocessing/__init__.pyi

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ from ._data import (
1919
scale as scale,
2020
)
2121
from ._discretization import KBinsDiscretizer as KBinsDiscretizer
22-
from ._encoders import OneHotEncoder as OneHotEncoder, OrdinalEncoder as OrdinalEncoder, TargetEncoder as TargetEncoder
22+
from ._encoders import OneHotEncoder as OneHotEncoder, OrdinalEncoder as OrdinalEncoder
2323
from ._function_transformer import FunctionTransformer as FunctionTransformer
2424
from ._label import (
2525
LabelBinarizer as LabelBinarizer,
@@ -28,6 +28,7 @@ from ._label import (
2828
label_binarize as label_binarize,
2929
)
3030
from ._polynomial import PolynomialFeatures as PolynomialFeatures, SplineTransformer as SplineTransformer
31+
from ._target_encoder import TargetEncoder as TargetEncoder
3132

3233
__all__ = [
3334
"Binarizer",
@@ -36,27 +37,27 @@ __all__ = [
3637
"KernelCenterer",
3738
"LabelBinarizer",
3839
"LabelEncoder",
39-
"MultiLabelBinarizer",
40-
"MinMaxScaler",
4140
"MaxAbsScaler",
42-
"QuantileTransformer",
41+
"MinMaxScaler",
42+
"MultiLabelBinarizer",
4343
"Normalizer",
4444
"OneHotEncoder",
4545
"OrdinalEncoder",
46+
"PolynomialFeatures",
4647
"PowerTransformer",
48+
"QuantileTransformer",
4749
"RobustScaler",
4850
"SplineTransformer",
4951
"StandardScaler",
5052
"TargetEncoder",
5153
"add_dummy_feature",
52-
"PolynomialFeatures",
5354
"binarize",
54-
"normalize",
55-
"scale",
56-
"robust_scale",
55+
"label_binarize",
5756
"maxabs_scale",
5857
"minmax_scale",
59-
"label_binarize",
60-
"quantile_transform",
58+
"normalize",
6159
"power_transform",
60+
"quantile_transform",
61+
"robust_scale",
62+
"scale",
6263
]

stubs/sklearn/preprocessing/_encoders.pyi

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin
1313
# Joris Van den Bossche <jorisvandenbossche@gmail.com>
1414
# License: BSD 3 clause
1515

16-
__all__ = ["OneHotEncoder", "OrdinalEncoder", "TargetEncoder"]
16+
__all__ = ["OneHotEncoder", "OrdinalEncoder"]
1717

1818
class _BaseEncoder(TransformerMixin, BaseEstimator): ...
1919

@@ -67,32 +67,3 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
6767
def fit(self, X: MatrixLike, y: Series | None = None) -> Self: ...
6868
def transform(self, X: MatrixLike) -> ndarray: ...
6969
def inverse_transform(self, X: MatrixLike) -> ndarray: ...
70-
71-
class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
72-
feature_names_in_: ndarray = ...
73-
n_features_in_: int = ...
74-
categories_: list[ndarray] = ...
75-
encodings_: list[ndarray] = ...
76-
target_type_: str = ...
77-
target_mean_: ndarray = ...
78-
classes_: ndarray | None = ...
79-
80-
@property
81-
def infrequent_categories_(self) -> list[ndarray]: ...
82-
83-
_parameter_constraints: ClassVar[dict] = ...
84-
85-
def __init__(
86-
self,
87-
*,
88-
categories: Sequence[ArrayLike] | Literal["auto"] = "auto",
89-
target_type: Literal["auto", "continuous", "binary", "multiclass"] = "auto",
90-
smooth: Literal["auto"] | float = "auto",
91-
cv: int = 5,
92-
shuffle: bool = True,
93-
random_state: Int | None = None,
94-
) -> None: ...
95-
def fit(self, X: MatrixLike, y: ArrayLike) -> Self: ...
96-
def transform(self, X: MatrixLike) -> ndarray: ...
97-
def fit_transform(self, X: MatrixLike, y: ArrayLike) -> ndarray: ...
98-
def get_feature_names_out(self, input_features: None | ArrayLike = None) -> ndarray: ...
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
from typing import ClassVar, Literal

from typing_extensions import Self

from numpy import ndarray
from numpy.random import RandomState

from .._typing import ArrayLike, Int, MatrixLike
from ..base import OneToOneFeatureMixin
from ._encoders import _BaseEncoder

class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """Target Encoder for regression and classification targets.

    Each category is encoded based on a shrunk estimate of the average target
    values for observations belonging to the category. The encoding scheme mixes
    the global target mean with the target mean conditioned on the value of the
    category (see [MIC]_).

    When the target type is "multiclass", encodings are based
    on the conditional probability estimate for each class. The target is first
    binarized using the "one-vs-all" scheme via
    :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target
    value for each class and each category is used for encoding, resulting in
    `n_features` * `n_classes` encoded output features.

    :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
    as another category and encodes them like any other category. Categories
    that are not seen during :meth:`fit` are encoded with the target mean, i.e.
    `target_mean_`.

    For a demo on the importance of the `TargetEncoder` internal cross-fitting,
    see
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
    For a comparison of different encoders, refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
    more in the :ref:`User Guide <target_encoder>`.

    .. note::
        `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
        :term:`cross fitting` scheme is used in `fit_transform` for encoding.
        See the :ref:`User Guide <target_encoder>` for details.

    .. versionadded:: 1.3

    Parameters
    ----------
    categories : "auto" or list of shape (n_features,) of array-like, default="auto"
        Categories (unique values) per feature:

        - `"auto"` : Determine categories automatically from the training data.
        - list : `categories[i]` holds the categories expected in the i-th column. The
          passed categories should not mix strings and numeric values within a single
          feature, and should be sorted in case of numeric values.

        The used categories are stored in the `categories_` fitted attribute.

    target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto"
        Type of target.

        - `"auto"` : Type of target is inferred with
          :func:`~sklearn.utils.multiclass.type_of_target`.
        - `"continuous"` : Continuous target
        - `"binary"` : Binary target
        - `"multiclass"` : Multiclass target

        .. note::
            The type of target inferred with `"auto"` may not be the desired target
            type used for modeling. For example, if the target consisted of integers
            between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target`
            will infer the target as `"multiclass"`. In this case, setting
            `target_type="continuous"` will specify the target as a regression
            problem. The `target_type_` attribute gives the target type used by the
            encoder.

        .. versionchanged:: 1.4
           Added the option 'multiclass'.

    smooth : "auto" or float, default="auto"
        The amount of mixing of the target mean conditioned on the value of the
        category with the global target mean. A larger `smooth` value will put
        more weight on the global target mean.
        If `"auto"`, then `smooth` is set to an empirical Bayes estimate.

    cv : int, default=5
        Determines the number of folds in the :term:`cross fitting` strategy used in
        :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
        and for continuous targets, `KFold` is used.

    shuffle : bool, default=True
        Whether to shuffle the data in :meth:`fit_transform` before splitting into
        folds. Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold. Otherwise, this
        parameter has no effect.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    encodings_ : list of shape (n_features,) or (n_features * n_classes) of \
            ndarray
        Encodings learnt on all of `X`.
        For feature `i`, `encodings_[i]` are the encodings matching the
        categories listed in `categories_[i]`. When `target_type_` is
        "multiclass", the encoding for feature `i` and class `j` is stored in
        `encodings_[j + (i * len(classes_))]`. E.g., for 2 features (f) and
        3 classes (c), encodings are ordered:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2,

    categories_ : list of shape (n_features,) of ndarray
        The categories of each input feature determined during fitting or
        specified in `categories`
        (in order of the features in `X` and corresponding with the output
        of :meth:`transform`).

    target_type_ : str
        Type of target.

    target_mean_ : float
        The overall mean of the target. This value is only used in :meth:`transform`
        to encode categories.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    classes_ : ndarray or None
        If `target_type_` is 'binary' or 'multiclass', holds the label for each class,
        otherwise `None`.

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features.
        Contrary to TargetEncoder, this encoding is not supervised. Treating the
        resulting encoding as a numerical feature therefore leads to arbitrarily
        ordered values and typically to lower predictive performance when used as
        preprocessing for a classifier or regressor.
    OneHotEncoder : Performs a one-hot encoding of categorical features. This
        unsupervised encoding is better suited for low cardinality categorical
        variables as it generates one new feature per unique category.

    References
    ----------
    .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
       categorical attributes in classification and prediction problems"
       SIGKDD Explor. Newsl. 3, 1 (July 2001), 27-32. <10.1145/507533.507538>`

    Examples
    --------
    With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate:

    >>> import numpy as np
    >>> from sklearn.preprocessing import TargetEncoder
    >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
    >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
    >>> enc_auto = TargetEncoder(smooth="auto")
    >>> X_trans = enc_auto.fit_transform(X, y)

    >>> # A high `smooth` parameter puts more weight on global mean on the categorical
    >>> # encodings:
    >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y)
    >>> enc_high_smooth.target_mean_
    np.float64(44.3)
    >>> enc_high_smooth.encodings_
    [array([44.1, 44.4, 44.3])]

    >>> # On the other hand, a low `smooth` parameter puts more weight on target
    >>> # conditioned on the value of the categorical:
    >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y)
    >>> enc_low_smooth.encodings_
    [array([21, 80.8, 43.2])]
    """

    # Fitted attributes (set by fit / fit_transform); see the Attributes
    # section of the class docstring for their meaning.
    encodings_: list[ndarray]
    categories_: list[ndarray]
    target_type_: str
    target_mean_: float
    n_features_in_: int
    feature_names_in_: ndarray
    classes_: ndarray | None

    _parameter_constraints: ClassVar[dict] = ...

    def __init__(
        self,
        categories: list[ArrayLike] | Literal["auto"] = "auto",
        target_type: Literal["auto", "continuous", "binary", "multiclass"] = "auto",
        smooth: Literal["auto"] | float = "auto",
        cv: int = 5,
        shuffle: bool = True,
        # Widened from `Int | None`: the docstring (and scikit-learn's
        # check_random_state contract) explicitly accepts a RandomState
        # instance as well as an int or None.
        random_state: Int | RandomState | None = None,
    ) -> None: ...
    def fit(self, X: MatrixLike, y: ArrayLike) -> Self: ...
    def fit_transform(self, X: MatrixLike, y: ArrayLike) -> ndarray: ...
    def transform(self, X: MatrixLike) -> ndarray: ...
    def get_feature_names_out(self, input_features: ArrayLike | None = None) -> ndarray: ...
    def __sklearn_tags__(self) -> dict: ...

0 commit comments

Comments
 (0)