-
Notifications
You must be signed in to change notification settings - Fork 303
Expand file tree
/
Copy pathbase_validator.py
More file actions
136 lines (119 loc) · 5.55 KB
/
Copy pathbase_validator.py
File metadata and controls
136 lines (119 loc) · 5.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""Base class for the input validator given a task
* A wrapper class of the sklearn.base.BaseEstimator
* The input validator for each task inherits this class
* Check if the provided data is compatible with the Auto-PyTorch implementation
* Manage both the target_validator and the feature_validator in this class
TODO:
* typing.<type> --> <type>
* logging.Logger --> Logger
* Inherit feature_validator and target_validator from a child class
via super().__init__()
"""
# -*- encoding: utf-8 -*-
import logging.handlers
import typing
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES
from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES
class BaseInputValidator(BaseEstimator):
    """
    Makes sure the input data complies with Auto-PyTorch requirements.

    Categorical inputs are encoded via an Encoder, if the input
    is a dataframe. This allows us to nicely predict string targets.

    This class also performs checks for data integrity and flags the user
    via informative errors.

    This base class cannot be instantiated directly (``__init__`` raises
    ``NotImplementedError``). ``fit`` and ``transform`` rely on the
    ``feature_validator`` and ``target_validator`` attributes, which are not
    created here and therefore have to be provided by the concrete subclass.

    Attributes:
        is_classification (bool):
            For classification task, this flag indicates that the target data
            should be encoded
        feature_validator (FeatureValidator):
            A FeatureValidator instance used to validate and encode feature columns to match
            sklearn expectations on the data
        target_validator (TargetValidator):
            A TargetValidator instance used to validate and encode (in case of classification)
            the target values
    """

    def __init__(
        self,
        is_classification: bool = False,
        logger_port: typing.Optional[int] = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
    ) -> None:
        """
        Abstract constructor that only fixes the expected signature.

        Arguments:
            is_classification (bool):
                Whether the task is a classification task
            logger_port (typing.Optional[int]):
                Port of the logging server; defaults to
                ``logging.handlers.DEFAULT_TCP_LOGGING_PORT``

        Raises:
            NotImplementedError:
                Always. Subclasses must override this constructor.
        """
        raise NotImplementedError()

    def fit(
        self,
        X_train: SUPPORTED_FEAT_TYPES,
        y_train: SUPPORTED_TARGET_TYPES,
        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> BaseEstimator:
        """
        Validates and fits a categorical encoder (if needed) to the features, and
        an encoder for targets in the case of classification. Specifically:

        For features:
            + Valid data types are enforced (List, np.ndarray, pd.DataFrame, pd.Series, scipy
              sparse) as well as dimensionality checks
            + If the provided data is a pandas DataFrame with categorical/boolean/int columns,
              such columns will be encoded using an Ordinal Encoder
        For targets:
            + Checks for dimensionality as well as missing values are performed.
            + If performing a classification task, the data is going to be encoded

        Arguments:
            X_train (SUPPORTED_FEAT_TYPES):
                A set of features that are going to be validated (type and dimensionality
                checks). If this data contains categorical columns, an encoder is going to
                be instantiated and trained with this data.
            y_train (SUPPORTED_TARGET_TYPES):
                A set of targets that are going to be encoded if the task is for classification
            X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
                A hold out set of features used for checking
            y_test (typing.Optional[SUPPORTED_TARGET_TYPES]):
                A hold out set of targets used for checking. Additionally, if the current task
                is a classification task, this y_test categories are also going to be used to
                fit a pre-processing encoding (to prevent errors on unseen classes).

        Returns:
            self

        Raises:
            ValueError:
                If the number of datapoints in the features and the targets differ
                (for the train or the test split), or if X_test is provided
                without a matching y_test.
        """
        # Check that the data is valid
        n_train_features = np.shape(X_train)[0]
        n_train_targets = np.shape(y_train)[0]
        if n_train_features != n_train_targets:
            raise ValueError(
                "Inconsistent number of train datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    n_train_features,
                    n_train_targets,
                )
            )

        if X_test is not None:
            n_test_features = np.shape(X_test)[0]
            if y_test is None:
                # np.shape(None) is an empty tuple, so indexing it would fail with
                # an uninformative IndexError; report the actual problem instead.
                raise ValueError(
                    "Inconsistent test data: X_test with {} datapoints was provided"
                    " but y_test is None".format(n_test_features)
                )
            n_test_targets = np.shape(y_test)[0]
            if n_test_features != n_test_targets:
                raise ValueError(
                    "Inconsistent number of test datapoints for features and targets,"
                    " {} for features and {} for targets".format(
                        n_test_features,
                        n_test_targets,
                    )
                )

        self.feature_validator.fit(X_train, X_test)
        self.target_validator.fit(y_train, y_test)
        self._is_fitted = True

        return self

    def transform(
        self,
        X: SUPPORTED_FEAT_TYPES,
        y: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> typing.Tuple[np.ndarray, typing.Optional[np.ndarray]]:
        """
        Transform the given target or features to a numpy array

        Arguments:
            X (SUPPORTED_FEAT_TYPES):
                A set of features to transform
            y (typing.Optional[SUPPORTED_TARGET_TYPES]):
                A set of targets to transform

        Returns:
            np.ndarray:
                The transformed features array
            typing.Optional[np.ndarray]:
                The transformed targets array, or None if no targets were given

        Raises:
            NotFittedError:
                If ``fit`` has not been called on this validator yet.
        """
        # Use getattr so that an instance whose constructor never initialised the
        # flag still yields NotFittedError rather than an AttributeError.
        if not getattr(self, "_is_fitted", False):
            raise NotFittedError("Cannot call transform on a validator that is not fitted")

        X_transformed = self.feature_validator.transform(X)
        if y is None:
            return X_transformed, None
        return X_transformed, self.target_validator.transform(y)