Auto-PyTorch/autoPyTorch/data/base_validator.py at aeedc2f8eed1f1c37754e5bfc5f059b84f78d346 · automl/Auto-PyTorch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""Base class for the input validator given a task
* A wrapper class of the sklearn.base.BaseEstimator
* The input validator for each task inherits this class
* Checks that the provided data is compatible with the Auto-PyTorch implementation
* Manage both target_ and feature_validator in this class

TODO:
    * typing.<type> --> <type>
    * logging.Logger --> Logger
    * Inherit feature_validator and target_validator from a child class
      via super().__init__()
"""

# -*- encoding: utf-8 -*-
import logging.handlers
import typing

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES
from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES


class BaseInputValidator(BaseEstimator):
    """
    Makes sure the input data complies with Auto-PyTorch requirements.
    Categorical inputs are encoded via an Encoder, if the input
    is a dataframe. This allows us to nicely predict string targets.

    This class also performs checks for data integrity and flags the user
    via informative errors.

    This base class does not create the validators itself: ``__init__`` raises
    ``NotImplementedError``, while ``fit`` and ``transform`` read
    ``self.feature_validator`` and ``self.target_validator``. A subclass must
    therefore override ``__init__`` and assign both attributes.

    Attributes:
        is_classification (bool):
            For classification task, this flag indicates that the target data
            should be encoded
        feature_validator (FeatureValidator):
            A FeatureValidator instance used to validate and encode feature columns to match
            sklearn expectations on the data
        target_validator (TargetValidator):
            A TargetValidator instance used to validate and encode (in case of classification)
            the target values
    """
    def __init__(
        self,
        is_classification: bool = False,
        logger_port: typing.Optional[int] = logging.handlers.DEFAULT_TCP_LOGGING_PORT,
    ) -> None:
        # Intentionally not implemented here: subclasses provide the constructor.
        raise NotImplementedError()

    @staticmethod
    def _check_consistent_length(
        X: SUPPORTED_FEAT_TYPES,
        y: SUPPORTED_TARGET_TYPES,
        split: str,
    ) -> None:
        """
        Raise a ValueError if X and y differ in their first dimension.

        Arguments:
            X (SUPPORTED_FEAT_TYPES):
                The features of the given split
            y (SUPPORTED_TARGET_TYPES):
                The targets of the given split
            split (str):
                Name of the split (e.g. "train" or "test"); only used in the error message
        """
        n_feature_rows = np.shape(X)[0]
        n_target_rows = np.shape(y)[0]
        if n_feature_rows != n_target_rows:
            raise ValueError("Inconsistent number of {} datapoints for features and targets,"
                             " {} for features and {} for targets".format(
                                 split,
                                 n_feature_rows,
                                 n_target_rows,
                             ))

    def fit(
        self,
        X_train: SUPPORTED_FEAT_TYPES,
        y_train: SUPPORTED_TARGET_TYPES,
        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> BaseEstimator:
        """
        Validates and fit a categorical encoder (if needed) to the features, and
        a encoder for targets in the case of classification. Specifically:

        For features:
            + Valid data types are enforced (List, np.ndarray, pd.DataFrame, pd.Series, scipy
              sparse) as well as dimensionality checks
            + If the provided data is a pandas DataFrame with categorical/boolean/int columns,
              such columns will be encoded using an Ordinal Encoder
        For targets:
            + Checks for dimensionality as well as missing values are performed.
            + If performing a classification task, the data is going to be encoded

        Arguments:
            X_train (SUPPORTED_FEAT_TYPES):
                A set of features that are going to be validated (type and dimensionality
                checks). If this data contains categorical columns, an encoder is going to
                be instantiated and trained with this data.
            y_train (SUPPORTED_TARGET_TYPES):
                A set of targets that are going to be encoded if the task is for classification
            X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
                A hold out set of features used for checking
            y_test (typing.Optional[SUPPORTED_TARGET_TYPES]):
                A hold out set of targets used for checking. Additionally, if the current task
                is a classification task, this y_test categories are also going to be used to
                fit a pre-processing encoding (to prevent errors on unseen classes).
                Must be provided whenever X_test is provided.

        Returns:
            self

        Raises:
            ValueError:
                If the number of datapoints of features and targets differ for the train
                or the test split, or if X_test is given without y_test.
        """
        # Check that the data is valid
        self._check_consistent_length(X_train, y_train, "train")
        if X_test is not None:
            if y_test is None:
                # np.shape(None) is (), so indexing it would fail with an
                # uninformative IndexError; report the actual problem instead.
                raise ValueError("Test features were provided without test targets: "
                                 "y_test must be given whenever X_test is given")
            self._check_consistent_length(X_test, y_test, "test")

        self.feature_validator.fit(X_train, X_test)
        self.target_validator.fit(y_train, y_test)
        self._is_fitted = True

        return self

    def transform(
        self,
        X: SUPPORTED_FEAT_TYPES,
        y: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> typing.Tuple[np.ndarray, typing.Optional[np.ndarray]]:
        """
        Transform the given target or features to a numpy array

        Arguments:
            X (SUPPORTED_FEAT_TYPES):
                A set of features to transform
            y (typing.Optional[SUPPORTED_TARGET_TYPES]):
                A set of targets to transform

        Returns:
            np.ndarray:
                The transformed features array
            typing.Optional[np.ndarray]:
                The transformed targets array, or None if no targets were given

        Raises:
            NotFittedError:
                If fit has not been called on this validator
        """
        # _is_fitted is only guaranteed to exist after fit(); treat a missing
        # attribute as "not fitted" rather than failing with AttributeError.
        if not getattr(self, "_is_fitted", False):
            raise NotFittedError("Cannot call transform on a validator that is not fitted")

        X_transformed = self.feature_validator.transform(X)
        if y is None:
            return X_transformed, None
        return X_transformed, self.target_validator.transform(y)