-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpipeline.py
More file actions
79 lines (54 loc) · 2.17 KB
/
pipeline.py
File metadata and controls
79 lines (54 loc) · 2.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
def encode_features(df, exclude_columns):
    """
    Label-encode the object-dtype columns of a dataframe.

    df: dataframe whose ``object`` dtype columns are candidates for encoding
    exclude_columns: column names to leave out of the encoded result
    return: a new dataframe holding only the encoded columns
    """
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    object_columns = df.select_dtypes(["object"])

    # Keep every object column the caller did not ask to exclude, and work
    # on a copy so the assignments below never touch the selection itself.
    keep_mask = ~object_columns.columns.isin(exclude_columns)
    encoded = object_columns.loc[:, keep_mask].copy()

    # One encoder instance is reused; fit_transform is called per column.
    for name in encoded.columns:
        encoded[name] = encoder.fit_transform(encoded[name])

    return encoded
def concat_encoded_numerical(df_encoded_features, df_numerical_features):
    """
    Join the encoded and numerical dataframes side by side.

    df_encoded_features: encoded features dataframe
    df_numerical_features: numerical features dataframe
    return: concatenated dataframe, encoded columns first, then numerical
    """
    # Column-wise concatenation; rows are matched on the index by pandas.
    frames = [df_encoded_features, df_numerical_features]
    combined = pd.concat(frames, axis="columns")
    return combined
def split_data(X, y, test_size=0.2, *, random_state=42, stratify=None):
    """
    Split features and target into train and test subsets.

    X: feature data
    y: target data
    test_size: forwarded to ``train_test_split``; defaults to 0.2
    random_state: seed forwarded to ``train_test_split``; the default of 42
        keeps the split reproducible and matches the previous hard-coded value
    stratify: forwarded to ``train_test_split``; the default ``None`` means
        no stratification (the previous behaviour). Pass ``y`` to preserve
        class proportions in both subsets.
    return: the value returned by ``train_test_split`` for the two inputs
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
def train_model(model, X_train, y_train):
    """
    Fit the given model on the training data.

    model: object exposing a ``fit(X, y)`` method
    X_train: training features
    y_train: training target
    return: the same ``model`` object that was passed in, after ``fit`` ran
    """
    # The return value of fit() is deliberately ignored; the caller gets
    # back the object it handed in.
    model.fit(X_train, y_train)
    return model
def metric_model(y_test, y_pred):
    """
    Compute classification metrics for a set of predictions.

    y_test: true labels
    y_pred: predicted labels
    return: list ``[accuracy, precision, recall, f1]``; the last three are
        computed with ``average="binary"``
    """
    scores = [accuracy_score(y_test, y_pred)]
    # Evaluated in this order so the result list keeps its positions.
    for scorer in (precision_score, recall_score, f1_score):
        scores.append(scorer(y_test, y_pred, average="binary"))
    return scores
def display_pr_curve(model, X_test, y_test, title, label):
    """
    Plot and show the precision-recall curve of a model on test data.

    model: object exposing ``predict_proba``
    X_test: test features passed to ``predict_proba``
    y_test: true labels for the test features
    title: title of the figure
    label: legend label of the curve
    return: None; the figure is displayed with ``plt.show()``
    """
    # Column 1 of predict_proba is used as the score
    # (presumably the positive class of a binary problem - confirm).
    scores = model.predict_proba(X_test)[:, 1]
    precisions, recalls, _thresholds = precision_recall_curve(y_test, scores)

    _figure, axes = plt.subplots(figsize=(8, 6))
    axes.plot(recalls, precisions, marker=".", color="b", label=label)
    axes.set_title(title)
    axes.set_xlabel("Recall")
    axes.set_ylabel("Precision")
    axes.grid(True)
    axes.legend()
    plt.show()