Skip to content

Commit 9810e1e

Browse files
authored
Merge branch 'master' into dependabot/pip/notebook-6.1.5
2 parents 3fb106c + ed12ba8 commit 9810e1e

14 files changed

Lines changed: 448 additions & 19 deletions

keras_batchflow/base/batch_generators/batch_generator.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,25 @@ class BatchGenerator:
3535
- **shuffle** - (optional) *bool*, if true, the input dataframe is shuffled before each new epoch.
3636
**Default: False**
3737
- **train_mode** - (optional) *bool*. If true, both X and Y are returned, otherwise only X is returned
38+
- **encoder_adaptor** - (optional) *str* or a single instance of a class derived from
39+
keras_batchflow.base.batch_shapers.IEncoderAdaptor class. String values supported: 'numpy' and 'pandas'. If
40+
not provided, 'numpy' is used. This parameter sets the data format that the encoders expect. Sklearn encoders are
41+
created for numpy arrays hence the default value is numpy. If your encoders require pandas format, use
42+
'pandas'. Alternatively, if your encoders need some special format, create your instance derived from
43+
IEncoderAdaptor class
3844
"""
3945

4046
def __init__(self, data: pd.DataFrame, x_structure, y_structure=None,
41-
batch_transforms=None, batch_size=32, shuffle=True, train_mode=True):
47+
batch_transforms=None, batch_size=32, shuffle=True, train_mode=True, encoder_adaptor=None):
4248
self.data = data
4349
self.batch_size = batch_size
4450
self.shuffle = shuffle
4551
self.train_mode = train_mode
4652
self.__check_batch_transformers(batch_transforms)
4753
self.batch_transforms = batch_transforms
4854
self.batch_shaper = BatchShaper(x_structure, y_structure,
49-
data_sample=self._apply_batch_transforms(data.iloc[:min(data.shape[0], 10)]))
55+
data_sample=self._apply_batch_transforms(data.iloc[:min(data.shape[0], 10)]),
56+
encoder_adaptor=encoder_adaptor)
5057
self.indices = np.arange(self.data.shape[0])
5158
self.on_epoch_end()
5259

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
11
from .batch_shaper import BatchShaper
2+
from .encoder_adaptor import IEncoderAdaptor
3+
from .numpy_encoder_adaptor import NumpyEncoderAdaptor
4+
from .pandas_encoder_adaptor import PandasEncoderAdaptor

keras_batchflow/base/batch_shapers/batch_shaper.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,12 @@ class BatchShaper:
1616
with train/test split datasets
1717
"""
1818

19-
def __init__(self, x_structure, y_structure=None, data_sample=None, multiindex_xy_keys=('x', 'y')):
19+
def __init__(self, x_structure, y_structure=None, data_sample=None, multiindex_xy_keys=None,
20+
encoder_adaptor=None):
21+
multiindex_xy_keys = ('x', 'y') if multiindex_xy_keys is None else multiindex_xy_keys
2022
self._validate_multiindex_xy_keys(x_structure, y_structure, multiindex_xy_keys)
2123
self.multiindex_xy_keys = multiindex_xy_keys
24+
self._encoder_adaptor = encoder_adaptor
2225
data_sample_x, data_sample_y = self._get_data_xy(data_sample)
2326
self.x_structure = self._create_shapers(structure=x_structure, data_sample=data_sample_x)
2427
self.y_structure = self._create_shapers(structure=y_structure, data_sample=data_sample_y)
@@ -63,7 +66,7 @@ def _create_shapers(self, structure, data_sample):
6366

6467
def _create_shaper_func(self, data, leaf, **kwargs):
6568
self._check_structure_leaf(leaf)
66-
return VarShaper(var_name=leaf[0], encoder=leaf[1], data_sample=data)
69+
return VarShaper(var_name=leaf[0], encoder=leaf[1], data_sample=data, encoder_adaptor=self._encoder_adaptor)
6770

6871
def _walk(self, data: pd.DataFrame, func, **kwargs):
6972
data_x, data_y = self._get_data_xy(data)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from abc import abstractmethod
2+
import pandas as pd
3+
4+
5+
class IEncoderAdaptor:
6+
7+
"""
8+
This class is used for converting data between source data in pandas dataframe and encoders, which might accept
9+
different format types, e.g. numpy array or pandas series
10+
11+
This is an interface class for defining 2 standard adapters for numpy-based and pandas-based encoders. It can
12+
also be used for defining your own class if needed.
13+
"""
14+
15+
@abstractmethod
16+
def transform(self, x: pd.Series):
17+
"""
18+
This method converts data before sending to an encoder. Define format conversion here
19+
:param x:
20+
:return:
21+
"""
22+
pass
23+
24+
@abstractmethod
25+
def inverse_transform(self, x, dtype=None) -> pd.Series:
26+
"""
27+
This method converts data received from encoder back into pandas Series
28+
:param x: data in the format of encoder
29+
:param dtype: optional, target dtype for data in pandas Series object created. Normally this is an original dtype
30+
from original data. If not provided, the data type will be inferred by pandas
31+
:return: a pandas series object
32+
"""
33+
pass
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import pandas as pd
2+
import numpy as np
3+
4+
from .encoder_adaptor import IEncoderAdaptor
5+
6+
7+
class NumpyEncoderAdaptor(IEncoderAdaptor):
8+
9+
"""
10+
This is an adapter that converts data for Numpy based encoders.
11+
"""
12+
13+
def transform(self, x: pd.Series):
14+
"""
15+
This method converts pandas Series object to a numpy array. It uses to_numpy method rather than values
16+
property because some Pandas data types do not exist in numpy (e.g. IntegerArray) and by using to_numpy,
17+
we invoke pandas' internal conversion
18+
:param x:
19+
:return:
20+
"""
21+
return x.to_numpy()
22+
23+
def inverse_transform(self, x, dtype=None) -> pd.Series:
24+
"""
25+
Input is a numpy array returned by the encoder, output is a pandas Series. This method checks that the encoder
26+
indeed returned a numpy array, squeezes it to 1D and wraps it into a pandas Series of the requested dtype
27+
:param x:
28+
:param dtype:
29+
:return:
30+
"""
31+
if not isinstance(x, np.ndarray):
32+
raise TypeError(f"Error: the encoder is supposed to return numpy array, got {type(x)}")
33+
if x.ndim > 1:
34+
x = np.squeeze(x)
35+
if x.ndim > 1:
36+
raise ValueError(f"Error: the encoder is supposed to return 1D data. Got {x.ndim}D even after "
37+
f"squeezing")
38+
x = pd.Series(x, dtype=dtype)
39+
return x
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import pandas as pd
2+
3+
from .encoder_adaptor import IEncoderAdaptor
4+
5+
6+
class PandasEncoderAdaptor(IEncoderAdaptor):
7+
8+
"""
9+
This is an adapter that converts data for Pandas based encoders.
10+
"""
11+
12+
def transform(self, x: pd.Series):
13+
"""
14+
Because both input and output data is a pandas Series there will be no conversion
15+
:param x:
16+
:return:
17+
"""
18+
return x
19+
20+
def inverse_transform(self, x, dtype=None) -> pd.Series:
21+
"""
22+
Both input and output are pandas Series here. This method will only check that encoder indeed returns
23+
pandas Series and will make sure the data type is correct
24+
:param x:
25+
:param dtype:
26+
:return:
27+
"""
28+
if not isinstance(x, pd.Series):
29+
raise TypeError(f"Error: the encoder is supposed to return Pandas Series object, got {type(x)}")
30+
if dtype is not None:
31+
if x.dtype != dtype:
32+
x = x.astype(dtype)
33+
return x

keras_batchflow/base/batch_shapers/var_shaper.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
import pandas as pd
33
from numbers import Number
44

5+
from .encoder_adaptor import IEncoderAdaptor
6+
from .numpy_encoder_adaptor import NumpyEncoderAdaptor
7+
from .pandas_encoder_adaptor import PandasEncoderAdaptor
8+
59

610
class VarShaper:
711

@@ -20,10 +24,11 @@ class VarShaper:
2024

2125
_dummy_constant_counter = 0
2226

23-
def __init__(self, var_name, encoder, data_sample=None):
27+
def __init__(self, var_name, encoder, data_sample=None, encoder_adaptor=None):
2428
self._var_name = var_name
2529
# _name will be included in metadata for using in ML models, e.g. for naming input layers in Keras
2630
self._name = var_name
31+
self._encoder_adaptor = self._build_encoder_adaptor(encoder_adaptor)
2732
self._encoder = encoder
2833
self._class = self._self_classify(var_name, encoder)
2934
self._decoded_dtype, self._dtype = self._get_dtypes(data_sample)
@@ -34,6 +39,24 @@ def __init__(self, var_name, encoder, data_sample=None):
3439
self._shape = self._get_shape(var_name, encoder, data_sample)
3540
self._n_classes = self._get_n_classes(encoder)
3641

42+
def _build_encoder_adaptor(self, encoder_adaptor):
43+
"""
44+
This method makes encoder adaptor class that utilises polymorphism to accommodate encoders that require
45+
different data types. The encoder adaptor instance takes care of converting data to and from the format the encoder expects
46+
:param encoder_adaptor: str ('numpy' or 'pandas'), None (defaults to 'numpy') or an instance of a custom class derived from IEncoderAdaptor
47+
:return: instance of IEncoderAdaptor
48+
"""
49+
if (encoder_adaptor == 'numpy') or (encoder_adaptor is None):
50+
adaptor = NumpyEncoderAdaptor()
51+
elif encoder_adaptor == 'pandas':
52+
adaptor = PandasEncoderAdaptor()
53+
elif isinstance(encoder_adaptor, IEncoderAdaptor):
54+
adaptor = encoder_adaptor
55+
else:
56+
raise TypeError(f"Error: The encoder adaptor must be a string ('numpy' or 'pandas') or an instance of a "
57+
f"custom class derived from IEncoderAdaptor")
58+
return adaptor
59+
3760
@staticmethod
3861
def _self_classify(var_name, encoder):
3962
"""
@@ -58,6 +81,8 @@ def _self_classify(var_name, encoder):
5881
return "direct"
5982
if not hasattr(encoder, "transform"):
6083
raise ValueError(f"Error: encoder provided for column '{var_name}' has no 'transform' method")
84+
if not hasattr(encoder, "inverse_transform"):
85+
raise ValueError(f"Error: encoder provided for column '{var_name}' has no 'inverse_transform' method")
6186
return "encoder"
6287
else:
6388
raise ValueError(f"Error: variable name must be a str or None. Got {type(var_name)}")
@@ -155,20 +180,21 @@ def transform(self, data):
155180
# if not hasattr(self._encoder, 'transform'):
156181
# raise ValueError(f"Error: encoders of class {type(self._encoder).__name__} provided in structure "
157182
# f"definition has no 'transform' method")
183+
encoder_input = self._encoder_adaptor.transform(data[self._var_name])
158184
try:
159-
x = getattr(self._encoder, 'transform')(data[self._var_name].values)
185+
x = self._encoder.transform(encoder_input)
160186
except ValueError as e:
161187
raise ValueError(f'Error: ValueError exception occured while calling '
162188
f'{type(self._encoder).__name__}.transform method. Most likely you used'
163189
f' 2D encoders. At the moment, only 1D transformers are supported. Please use 1D '
164190
f'variant or use wrapper. The error was: {e}')
165-
except Exception as e:
166-
raise RuntimeError(f'Error: unknown error while calling transform method of '
167-
f'{type(self._encoder).__name__} class provided in structure. The error was: {e}')
191+
# except Exception as e:
192+
# raise RuntimeError(f'Error: unknown error while calling transform method of '
193+
# f'{type(self._encoder).__name__} class provided in structure. The error was: {e}')
168194
elif self._class == "constant":
169195
x = np.repeat(self._encoder, data.shape[0])
170196
elif self._class == "direct":
171-
x = data[self._var_name].values
197+
x = data[self._var_name].to_numpy()
172198
else:
173199
raise RuntimeError('Error: this should not have happened. Maybe it needs to be reported')
174200
# if self._dtype is None:
@@ -192,13 +218,13 @@ def inverse_transform(self, df, encoded_data):
192218
# changes. These cases have structure entry like this ('col_name', None)
193219
df[self._var_name] = pd.Series(np.squeeze(encoded_data), dtype=self._decoded_dtype)
194220
elif self._class == "encoder":
195-
if not hasattr(self._encoder, "inverse_transform"):
196-
raise ValueError(f"Error: encoder provided for column '{self._var_name}' has no 'inverse_transform' method")
197-
if not hasattr(self._encoder, 'inverse_transform'):
198-
raise ValueError('Error: the encoders {} used for column {} has no inverse_transform method'
199-
.format(type(self._encoder).__name__, self._var_name))
200-
it = self._encoder.inverse_transform(encoded_data)
201-
df[self._var_name] = pd.Series(it, dtype=self._decoded_dtype)
221+
# it has already been checked at init stage. It is redundant here
222+
# if not hasattr(self._encoder, "inverse_transform"):
223+
# raise ValueError(f"Error: encoder provided for column '{self._var_name}' has no "
224+
# "'inverse_transform' method")
225+
it = self._encoder_adaptor.inverse_transform(self._encoder.inverse_transform(encoded_data),
226+
dtype=self._decoded_dtype)
227+
df[self._var_name] = it
202228

203229
def _reshape(self, x: np.ndarray):
204230
if x.ndim == 1:
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
1-
from keras_batchflow.base.batch_shapers import BatchShaper
1+
from keras_batchflow.base.batch_shapers import BatchShaper
2+
from keras_batchflow.base.batch_shapers import IEncoderAdaptor
3+
from keras_batchflow.base.batch_shapers import NumpyEncoderAdaptor
4+
from keras_batchflow.base.batch_shapers import PandasEncoderAdaptor
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
1-
from keras_batchflow.base.batch_shapers import BatchShaper
1+
from keras_batchflow.base.batch_shapers import BatchShaper
2+
from keras_batchflow.base.batch_shapers import IEncoderAdaptor
3+
from keras_batchflow.base.batch_shapers import NumpyEncoderAdaptor
4+
from keras_batchflow.base.batch_shapers import PandasEncoderAdaptor

test/test_batch_generator.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
from keras_batchflow.base.batch_transformers import BatchTransformer
66
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder
77

8+
from keras_batchflow.base.batch_shapers.numpy_encoder_adaptor import NumpyEncoderAdaptor
9+
from keras_batchflow.base.batch_shapers.pandas_encoder_adaptor import PandasEncoderAdaptor
10+
811

912
class TestBatchGenerator:
1013

@@ -221,5 +224,27 @@ def test_shapes(self):
221224
assert sh[0][1] == (1,)
222225
assert sh[1] == (1,)
223226

227+
def test_encoder_adaptor(self):
228+
"""
229+
This test only makes sure the adaptor parameter is passed correctly
230+
:return:
231+
"""
232+
bg = BatchGenerator(
233+
self.df,
234+
x_structure=('var1', self.lb),
235+
y_structure=('label', self.le),
236+
shuffle=False,
237+
encoder_adaptor='numpy'
238+
)
239+
assert isinstance(bg.batch_shaper.x_structure._encoder_adaptor, NumpyEncoderAdaptor)
240+
bg = BatchGenerator(
241+
self.df,
242+
x_structure=('var1', self.lb),
243+
y_structure=('label', self.le),
244+
shuffle=False,
245+
encoder_adaptor='pandas'
246+
)
247+
assert isinstance(bg.batch_shaper.x_structure._encoder_adaptor, PandasEncoderAdaptor)
248+
224249
if __name__ == '__main__':
225250
pytest.main([__file__])

0 commit comments

Comments
 (0)