-
Notifications
You must be signed in to change notification settings - Fork 303
Expand file tree
/
Copy pathtabular_validator.py
More file actions
130 lines (117 loc) · 4.92 KB
/
Copy pathtabular_validator.py
File metadata and controls
130 lines (117 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- encoding: utf-8 -*-
import logging
from typing import List, Optional, Tuple, Union
import numpy as np
from scipy.sparse import issparse
from autoPyTorch.data.base_validator import BaseInputValidator
from autoPyTorch.data.tabular_feature_validator import SupportedFeatTypes, TabularFeatureValidator
from autoPyTorch.data.tabular_target_validator import SupportedTargetTypes, TabularTargetValidator
from autoPyTorch.data.utils import (
DatasetCompressionInputType,
DatasetCompressionSpec,
DatasetDTypeContainerType,
reduce_dataset_size_if_too_large
)
from autoPyTorch.utils.common import ispandas
from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger
class TabularInputValidator(BaseInputValidator):
    """
    Makes sure the input data complies with Auto-PyTorch requirements.
    Categorical inputs are encoded via an Encoder, if the input
    is a dataframe. This allows us to nicely predict string targets.

    This class also performs checks for data integrity and flags the user
    via informative errors.

    Attributes:
        is_classification (bool):
            For classification task, this flag indicates that the target data
            should be encoded
        logger_port (Optional[int]):
            When given, a named client logger bound to this port is used;
            otherwise the standard library logger named 'Validation' is used.
        feature_validator (TabularFeatureValidator):
            A FeatureValidator instance used to validate and encode feature columns to match
            sklearn expectations on the data
        target_validator (TabularTargetValidator):
            A TargetValidator instance used to validate and encode (in case of classification)
            the target values
        dataset_compression (Optional[DatasetCompressionSpec]):
            specifications for dataset compression. For more info check
            documentation for `BaseTask.get_dataset`.
        feat_types (List[str]):
            Description about the feature types of the columns.
            Accepts `numerical` for integers, float data and `categorical`
            for categories, strings and bool
        seed (int):
            Forwarded as `random_state` to `reduce_dataset_size_if_too_large`
            when the dataset is compressed.
    """

    def __init__(
        self,
        is_classification: bool = False,
        logger_port: Optional[int] = None,
        dataset_compression: Optional[DatasetCompressionSpec] = None,
        feat_types: Optional[List[str]] = None,
        seed: int = 42,
    ):
        self.dataset_compression = dataset_compression
        # dtype (or per-column dtype mapping for dataframes) produced by the
        # first compression; later calls cast their input to it.
        self._reduced_dtype: Optional[DatasetDTypeContainerType] = None
        self.is_classification = is_classification
        self.logger_port = logger_port
        self.feat_types = feat_types
        self.seed = seed

        if self.logger_port is not None:
            self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger(
                name='Validation',
                port=self.logger_port,
            )
        else:
            self.logger = logging.getLogger('Validation')

        # Both validators share this instance's logger.
        self.feature_validator = TabularFeatureValidator(
            logger=self.logger,
            feat_types=self.feat_types)
        self.target_validator = TabularTargetValidator(
            is_classification=self.is_classification,
            logger=self.logger
        )
        # NOTE(review): presumably flipped by the base class once fitted -- confirm.
        self._is_fitted = False

    def _compress_dataset(
        self,
        X: DatasetCompressionInputType,
        y: SupportedTargetTypes,
    ) -> Tuple[DatasetCompressionInputType, SupportedTargetTypes]:
        """
        Compress the dataset. This function ensures that
        the testing data is converted to the same dtype as
        the training data: the dtype obtained on the first compression
        is remembered and every later call only casts `X` to it.

        See `autoPyTorch.data.utils.reduce_dataset_size_if_too_large`
        for more information.

        Args:
            X (DatasetCompressionInputType):
                features of dataset
            y (SupportedTargetTypes):
                targets of dataset

        Returns:
            Tuple[DatasetCompressionInputType, SupportedTargetTypes]:
                The (possibly compressed) features and targets. Both are
                returned untouched when `X` is not a numpy array, sparse
                matrix or dataframe, or when no compression spec was given.
        """
        is_dataframe = ispandas(X)
        is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe

        # Nothing to do: unsupported container or compression disabled.
        if not is_reducible_type or self.dataset_compression is None:
            return X, y

        # A previous call already fixed the target dtype: only cast X to it.
        if self._reduced_dtype is not None:
            X = X.astype(self._reduced_dtype)
            return X, y

        # First compression: reduce the data and remember the resulting dtype.
        X, y = reduce_dataset_size_if_too_large(
            X,
            y=y,
            is_classification=self.is_classification,
            random_state=self.seed,
            categorical_columns=self.feature_validator.categorical_columns,
            n_categories_per_cat_column=self.feature_validator.num_categories_per_col,
            **self.dataset_compression  # type: ignore [arg-type]
        )
        # Dataframes carry one dtype per column, hence the mapping.
        self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
        return X, y

    def transform(
        self,
        X: SupportedFeatTypes,
        y: Optional[SupportedTargetTypes] = None,
    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        """
        Transform the data with the parent class' `transform` and then
        pass the result through `_compress_dataset`.

        Args:
            X (SupportedFeatTypes):
                features to transform
            y (Optional[SupportedTargetTypes]):
                targets to transform, if any

        Returns:
            Tuple[np.ndarray, Optional[np.ndarray]]:
                The transformed (and possibly compressed) features and targets.
        """
        X, y = super().transform(X, y)
        X_reduced, y_reduced = self._compress_dataset(X, y)
        return X_reduced, y_reduced