# Predictive_Model.py
#
# Stacked regression script: reads ../input/train.csv and ../input/test.csv,
# fits SVR, ElasticNet, XGBoost and random-forest base models on K-fold splits,
# combines their out-of-fold predictions with an ElasticNet stacker, and
# writes the test-set predictions to submission.csv.
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.decomposition import PCA, FastICA
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline, Pipeline, _name_estimators
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
# Load the raw training and test tables.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Keep everything needed later before the two frames are merged and re-split:
# the row count used to split them apart again, the test IDs for the
# submission file, and the target values with their mean.
num_train = train.shape[0]
id_test = test['ID']
y_train = train['y'].values
y_mean = y_train.mean()

# Stack train on top of test so both receive the same dummy columns, and
# remove the identifier and target so only features remain.
df_all = pd.concat([train, test])
df_all = df_all.drop(['ID', 'y'], axis=1)

# One-hot encode the categorical/string columns (first level of each dropped).
df_all = pd.get_dummies(df_all, drop_first=True)

# Split back by position: the first num_train rows are the training set.
train = df_all.iloc[:num_train]
test = df_all.iloc[num_train:]
class AddColumns(BaseEstimator, TransformerMixin):
    """Append the output of a wrapped transformer to the input features.

    The wrapped transformer is fitted on ``X`` and, at transform time, its
    output columns are concatenated to the right of the original ``X`` so
    downstream estimators see both the raw and the derived features.

    Parameters
    ----------
    transform_ : transformer object
        Object exposing ``fit(X, y)`` and ``transform(X)``. The name is kept
        for backward compatibility with existing callers.
    """

    def __init__(self, transform_=None):
        self.transform_ = transform_

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` (and ``y``) and return self."""
        self.transform_.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the wrapped transformer's output appended.

        ``y`` is accepted only so existing callers that pass it keep working;
        it is not forwarded. The transformers used with this class take just
        the data in ``transform``: forwarding ``y`` positionally raises
        ``TypeError`` for ``PCA.transform(X)`` on current scikit-learn and
        would be interpreted as the ``copy`` flag by ``FastICA.transform``.
        """
        xform_data = self.transform_.transform(X)
        # Column-wise concatenation: both arrays must have the same row count.
        return np.concatenate((X, xform_data), axis=1)
class LogExpPipeline(Pipeline):
    """Pipeline that is trained on ``log1p(y)`` and predicts on the original scale.

    ``fit`` transforms the target with ``np.log1p`` before delegating to
    ``Pipeline.fit``; ``predict`` maps the pipeline's predictions back with
    ``np.expm1``.
    """

    def fit(self, X, y, **fit_params):
        """Fit the pipeline on ``X`` against ``log1p(y)``.

        Extra keyword arguments are passed through to ``Pipeline.fit``
        unchanged. Returns ``self`` so that calls can be chained, as the
        estimator API expects.

        NOTE: ``np.log1p`` yields ``-inf``/``nan`` for targets <= -1.
        """
        super(LogExpPipeline, self).fit(X, np.log1p(y), **fit_params)
        return self

    def predict(self, X):
        """Predict with the pipeline and undo the log1p target transform."""
        return np.expm1(super(LogExpPipeline, self).predict(X))
#
# Model/pipeline with scaling,pca,svm
#
# RobustScaler -> PCA (default components) -> RBF-kernel SVR, wrapped in
# LogExpPipeline.
svm_pipe = LogExpPipeline(
    _name_estimators([
        RobustScaler(),
        PCA(),
        SVR(C=1.0, epsilon=0.05, kernel='rbf'),
    ])
)
# Disabled cross-validation check for the SVM pipeline:
# results = cross_val_score(svm_pipe, train, y_train, cv=5, scoring='r2')
# print("SVM score: %.4f (%.4f)" % (results.mean(), results.std()))
# exit()

#
# Model/pipeline with scaling,pca,ElasticNet
#
# Same preprocessing as above but PCA keeps 125 components and the final
# estimator is an ElasticNet.
en_pipe = LogExpPipeline(
    _name_estimators([
        RobustScaler(),
        PCA(n_components=125),
        ElasticNet(l1_ratio=0.1, alpha=0.001),
    ])
)

#
# XGBoost model
#
# base_score is set to the mean of the training target.
xgb_model = xgb.sklearn.XGBRegressor(
    n_estimators=1300,
    learning_rate=0.005,
    max_depth=4,
    subsample=0.921,
    objective='reg:linear',
    base_score=y_mean,
)
# The booster sees the original columns plus 10 PCA and 10 FastICA components,
# each appended by an AddColumns step.
xgb_pipe = Pipeline(
    _name_estimators([
        AddColumns(transform_=PCA(n_components=10)),
        AddColumns(transform_=FastICA(n_components=10, max_iter=500)),
        xgb_model,
    ])
)
# Disabled cross-validation check for the bare XGBoost model:
# results = cross_val_score(xgb_model, train, y_train, cv=5, scoring='r2')
# print("XGB score: %.4f (%.4f)" % (results.mean(), results.std()))

#
# Random Forest
#
rf_model = RandomForestRegressor(
    n_estimators=250,
    max_depth=3,
    min_samples_split=25,
    min_samples_leaf=25,
    n_jobs=4,
)
# Disabled cross-validation check for the random forest:
# results = cross_val_score(rf_model, train, y_train, cv=5, scoring='r2')
# print("RF score: %.4f (%.4f)" % (results.mean(), results.std()))
#
# Now the training and stacking part. In a previous version I simply trained each model and
# searched for the best combination, which led to a horrible score (overfitting?). The code below
# does out-of-fold training/predictions and then combines the final results.
#
# Read here for more explanation (This code was borrowed/adapted) :
#
class Ensemble(object):
    """Two-level stacking ensemble built from out-of-fold predictions.

    Each base model is fitted once per K-fold split. Its predictions on the
    held-out fold become one column of the stacker's training matrix, and its
    predictions on the test set, averaged over the folds, become the matching
    column of the stacker's test matrix.

    Parameters
    ----------
    n_splits : int
        Number of K-fold splits.
    stacker : estimator
        Second-level model with ``fit(X, y)`` and ``predict(X)``.
    base_models : sequence of estimators
        First-level models with ``fit(X, y)`` and ``predict(X)``. They are
        refitted in place on every fold.
    random_state : int, optional
        Seed for the shuffled K-fold split. Defaults to 2016, the value that
        was previously hard-coded.
    """

    def __init__(self, n_splits, stacker, base_models, random_state=2016):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state

    def fit_predict(self, X, y, T):
        """Fit base models and stacker on ``(X, y)``; return predictions for ``T``.

        Prints the R^2 score of every base model on every held-out fold.

        Parameters
        ----------
        X : array-like
            Training features (converted with ``np.array``).
        y : array-like
            Training target.
        T : array-like
            Test features.

        Returns
        -------
        The stacker's predictions for ``T``.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        # Materialise the splits once so every base model sees the same folds.
        splitter = KFold(n_splits=self.n_splits, shuffle=True,
                         random_state=self.random_state)
        folds = list(splitter.split(X, y))

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))
        S_test = np.zeros((T.shape[0], n_models))

        for i, clf in enumerate(self.base_models):
            # One column of test predictions per fold, averaged afterwards.
            S_test_i = np.zeros((T.shape[0], self.n_splits))
            for j, (train_idx, test_idx) in enumerate(folds):
                clf.fit(X[train_idx], y[train_idx])
                y_pred = clf.predict(X[test_idx])
                print("Model %d fold %d score %f" % (i, j, r2_score(y[test_idx], y_pred)))
                # Out-of-fold predictions: each training row is predicted by a
                # model that did not see it during fitting.
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)
            S_test[:, i] = S_test_i.mean(axis=1)

        # The stacker can be cross-validated on (S_train, y) at this point if
        # an estimate of the stacked score is wanted.
        self.stacker.fit(S_train, y)
        return self.stacker.predict(S_test)
# Second-level learner: ElasticNet(alpha=1.4, l1_ratio=0.1) over the four
# base models, using 5 folds.
# Alternative stacker kept for reference:
#stacker=ElasticNetCV(l1_ratio=[x/10.0 for x in range(1,10)]),
stack = Ensemble(
    base_models=(svm_pipe, en_pipe, xgb_pipe, rf_model),
    stacker=ElasticNet(alpha=1.4, l1_ratio=0.1),
    n_splits=5,
)

# Train the whole stack and predict the test set in one call.
y_test = stack.fit_predict(X=train, y=y_train, T=test)

# Write the submission: one row per test ID with its predicted y.
df_sub = pd.DataFrame({'ID': id_test, 'y': y_test})
df_sub.to_csv('submission.csv', index=False)