-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathlopster.py
More file actions
105 lines (81 loc) · 5.09 KB
/
lopster.py
File metadata and controls
105 lines (81 loc) · 5.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import csv
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import argparse
from datasets import get_tf_database, load_regression, load_regression_dirty, load_features_and_data, reverse_categorical_columns, reverse_to_input_domain, get_date_columns
from latent_operators import LatentOperator
from utils import create_and_train_LOP
from error_detection import predict_on_enhanced
from copy import deepcopy
# Command-line interface of the script. Every option now carries a help
# string, and ArgumentDefaultsHelpFormatter appends "(default: ...)" to each
# one, so `--help` documents the run configuration. Flags, types and defaults
# are unchanged.
parser = argparse.ArgumentParser(
    description="Generalizable Data Cleaning of Tabular Data in Latent Space",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("--epochs", type=int, default=100,
                    help="number of epochs passed to create_and_train_LOP")
# NOTE(review): args.learning_rate is parsed but never read in this file.
parser.add_argument("--learning_rate", type=float, default=0.001,
                    help="learning rate (accepted, but not forwarded to "
                         "training by this script)")
parser.add_argument("--latent", type=int, default=120,
                    help="latent size forwarded to create_and_train_LOP")
parser.add_argument("--batch_size", type=int, default=256,
                    help="batch size of the training and validation datasets")
parser.add_argument("--K", type=int, default=12,
                    help="K value forwarded to create_and_train_LOP")
parser.add_argument("--dataset", default='adult',
                    help="dataset name; also the sub-directory of --path "
                         "that receives lopster.csv")
parser.add_argument("--path", default="./",
                    help="base directory given to the dataset loaders and "
                         "used as the root of the output path")
args = parser.parse_args()
print("Is Eager execution?",tf.executing_eagerly())
# Module-level slot for the trained latent operator. It is rebound by the
# training step further down and read by _translate_all_columns_by_1, so that
# function must not be called before training has run.
LOP = None
@tf.function
def _translate_all_columns_by_1(inputs):
    """Replace each leading-axis slice of ``inputs`` by its translation by 1.

    For every index ``i`` along axis 0, the slice ``inputs[i]`` is passed to
    ``LOP.translate_operator(slice, 1)`` and the result is written back at
    position ``i``.

    The function reads the module-level ``LOP``; it must only be called once
    that global has been bound to a trained operator.

    Args:
        inputs: tensor whose first dimension is statically known, because
            ``inputs.shape[0]`` drives a plain Python ``range``.

    Returns:
        A tensor in which every leading-axis slice has been replaced by the
        value returned by the translate operator.
    """
    translated = inputs
    n_slices = translated.shape[0]
    for idx in range(n_slices):
        # A rank-1 index `[idx]` addresses the whole slice at position idx.
        translated = tf.tensor_scatter_nd_update(
            translated,
            [idx],
            LOP.translate_operator(translated[idx], 1),
        )
    return translated
def generate_cleaned_data(x_, Zs, Ks, decoder):
    """Combine original values with decoded values according to ``Ks``.

    Where ``Ks == 0`` the original entry of ``x_`` is kept; everywhere else
    the entry comes from decoding ``Zs``.

    Args:
        x_: the original (possibly dirty) values.
        Zs: latent representation; it is unstacked along axis 1 before being
            handed to ``decoder``.
        Ks: array/tensor compared element-wise against 0 to build the
            selection mask.
        decoder: callable applied to the list produced by
            ``tf.unstack(Zs, axis=1)``; its output must be rank 3, since it
            is transposed with the permutation ``[1, 0, 2]``.

    Returns:
        The result of ``tf.where(Ks == 0, x_, decoded)``.
    """
    latent_slices = tf.unstack(Zs, axis=1)
    decoded = decoder(latent_slices)
    # Swap the first two axes of the decoder output.
    decoded = tf.transpose(decoded, [1, 0, 2])
    # NOTE(review): squeeze without `axis` drops *every* size-1 dimension,
    # not only the trailing one - confirm this is intended for inputs with a
    # single row or a single column.
    decoded = tf.squeeze(decoded)
    # Keep the original value wherever Ks is 0; use the decoded one otherwise.
    untouched = Ks == 0
    return tf.where(untouched, x_, decoded)
#=========================== CONFIGS ==========================================
# Dataset name selected on the command line (--dataset).
ds = args.dataset
# Passed as the 2nd and 3rd positional arguments to both dataset loaders.
# NOTE(review): -1 presumably means "no sub-sampling" - confirm in datasets.
SAMPLE_SIZE = -1
# Passed as the 4th positional argument to load_features_and_data.
# NOTE(review): presumably the value standing in for missing cells - confirm.
MISSING_REPLACE = '3.0'
# Passed as the 7th positional argument to create_and_train_LOP.
# NOTE(review): presumably names the kind of error being modelled - confirm.
T = 'missing_values'
# Load the clean data. x_clean_train feeds the training dataset and x_clean the
# validation dataset; MAX, MIN, SCALER and CAT_ENCODER are handed on to
# load_features_and_data further down. y_clean_train and y_clean are not used
# again in this script.
x_clean_train, y_clean_train, x_clean, y_clean, MAX, MIN, SCALER, CAT_ENCODER = load_regression(ds,
SAMPLE_SIZE,
SAMPLE_SIZE,
True,
normalize_sklearn = True,
path_to_dataset = args.path)
# Size of axis 1 of the training matrix; forwarded to create_and_train_LOP.
COLS = x_clean_train.shape[1]
#======================= TRAIN MODELS =========================================
# Both datasets are built with the same array in the first two positions.
# NOTE(review): presumably (input, target) for a reconstruction objective -
# confirm against get_tf_database.
train_dataset = get_tf_database(x_clean_train,x_clean_train,args.batch_size)
val_dataset = get_tf_database(x_clean,x_clean,args.batch_size)
# Build and train the models. This rebinds the module-level LOP that
# _translate_all_columns_by_1 reads. t_acc and v_acc are not used again in
# this script.
encoder, decoder, LOP, t_acc, v_acc = create_and_train_LOP(train_dataset,
val_dataset,
COLS,
args.latent,
args.K,
1,
T,
epochs=args.epochs,
model_name=args.dataset)
#======================= Data Cleaning =========================================
# Load the data to be cleaned, reusing the scaler/encoder and MAX/MIN obtained
# from load_regression. CAT_ENCODER is rebound to the encoder returned here;
# two return values are discarded via `_`, and target_name / clean_data are
# not used again in this script.
headers, target_name, dirty_data, _, data_with_y, _, clean_data, FULL_SCALER, CAT_ENCODER = load_features_and_data(ds,
SAMPLE_SIZE,
SAMPLE_SIZE,
MISSING_REPLACE,
SCALER, CAT_ENCODER,
True, MAX, MIN,
normalize_sklearn = True,
path_to_dataset = args.path)
# Column names that are fed to the model and later overwritten in dirty_data.
filtered_header = headers["filtered_header"]
# Independent NumPy copy of those columns taken from data_with_y.
X_dirty = deepcopy(data_with_y[filtered_header]).to_numpy()
# Run the detector; it receives the trained models and the translation helper
# and returns the latent codes (Zs_csv) and the per-cell values (Ks_csv) that
# generate_cleaned_data compares against 0.
Zs_csv, Ks_csv = predict_on_enhanced(X_dirty, LOP, encoder, decoder, _translate_all_columns_by_1)
# Keep the original cell where Ks_csv == 0, use the decoded value elsewhere.
clean_csv = generate_cleaned_data(X_dirty, Zs_csv, Ks_csv, decoder)
#=========== Replace the dirty data with the clean one =========================
# Overwrite the model-input columns of dirty_data in place with the cleaned
# values, then map the frame back to the input domain using the scaler and
# encoder returned by load_features_and_data.
dirty_data[filtered_header] = clean_csv
lop_data = reverse_to_input_domain(args.dataset, dirty_data, FULL_SCALER, CAT_ENCODER)
# Recover the date columns; the helper receives the dataset name, the
# reversed frame and dirty_data.
lop_data = get_date_columns(ds, lop_data, dirty_data)
# Save the cleaned CSV to <path>/<dataset>/lopster.csv without the index
# column. The target directory is not created here, so it must already exist.
lop_data.to_csv(f'{args.path}/{args.dataset}/lopster.csv', index = False)