Skip to content

Commit ef72984

Browse files
committed
Added all 100 datasets from the KEEL imbalanced collection. Added a preprocessing option for one-hot encoding of nominal features.
1 parent 658158c commit ef72984

4 files changed

Lines changed: 33 additions & 4 deletions

File tree

905 Bytes
Binary file not shown.

ml_datasets/processing/data_processor.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ def __call__(self, dataset_metadata, return_folds=True):
4141

4242
class ARFFReader():
4343

44-
def __init__(self):
45-
pass
44+
def __init__(self, pre_process = True):
45+
self.pre_process = pre_process
4646

4747
def _removeUnneccessaryTags(self,filename):
4848
"""
@@ -64,13 +64,40 @@ def load_ARFF(self, filename):
6464
data, meta = arff.loadarff(filename)
6565

6666
df = DataFrame(data=data, columns=meta.names())
67-
uniq = np.unique(df[meta.names()[-1]].values)
6867

69-
y = df[meta.names()[-1]].map({uniq[0]: 0, uniq[1]: 1})
68+
if self.pre_process:
69+
self._pre_process(df, meta)
70+
71+
print(df.columns)
72+
y = df[meta.names()[-1]]
7073
X = df.drop([meta.names()[-1]], axis=1)
7174
return (X.values, y.values)
7275

7376

77+
def _pre_process(self, df, meta):
78+
"""
79+
Converts discrete features into pseudo-continuous versions.
80+
One-hot encodes nominal features that take more than two discrete values.
81+
:param df:
82+
:param meta:
83+
:return:
84+
"""
85+
for name, type in zip(meta.names(), meta.types()):
86+
if type == 'nominal':
87+
_, values = meta[name]
88+
89+
if len(values) == 2:
90+
uniq = np.unique(values)
91+
df[name] = df[name].map({uniq[0].encode(): 0, uniq[1].encode(): 1})
92+
else:
93+
# one-hot-encodes discrete feature values into multiple features
94+
for value in values:
95+
df[name+"_"+value] = [1 if x == value.encode() else 0 for x in df[name]]
96+
df.drop(name, axis=1, inplace=True)
97+
98+
99+
100+
74101
def read(self, resource_path):
75102
file = pkg_resources.resource_filename('datasets', resource_path)
76103
return self.load_ARFF(file)
19 Bytes
Binary file not shown.

ml_datasets/tests/test_dataloader.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ def test_dataset_retrieval(self):
3232
X_train, y_train = train
3333
X_test, y_test = test
3434

35+
print(X_train)
36+
3537
pass
3638

3739

0 commit comments

Comments
 (0)