# Source: examples/integrations-and-supported-tools/catboost/scripts/Neptune_CatBoost.py (neptune-ai/examples, main branch)

# Neptune + CatBoost

## Import dependencies
import neptune
from catboost import CatBoostClassifier
from catboost.datasets import titanic
from neptune.types import File
from neptune.utils import stringify_unsupported
from sklearn.model_selection import train_test_split

## (Neptune) Start a run
# Gather the run settings first, then open the run with them.
run_settings = {
    "api_token": neptune.ANONYMOUS_API_TOKEN,  # Replace with your own
    "project": "common/catboost-support",  # Replace with your own
    "tags": ["catboost", "classifier", "script"],  # (optional) use your own
}
run = neptune.init_run(**run_settings)

## Load data
titanic_train, titanic_test = titanic()

### (Neptune) Upload raw data
# Each frame is converted with File.as_html and uploaded to its own field
# before any preprocessing touches it.
raw_uploads = {
    "data/raw/train": titanic_train,
    "data/raw/test": titanic_test,
}
for raw_field, raw_frame in raw_uploads.items():
    run[raw_field].upload(File.as_html(raw_frame))

### Preprocess data
# Fill missing values by assigning each filled column back to its frame
# (the frames are assumed to be pandas DataFrames, as the calls suggest).
# Calling `frame[col].fillna(..., inplace=True)` instead would be chained
# assignment on a temporary Series: pandas 2.2 warns about it, and under
# Copy-on-Write (the default from pandas 3.0) the frame is not modified.

# Training set: median age, empty string for a missing cabin, and the most
# frequent port of embarkation.
titanic_train["Age"] = titanic_train["Age"].fillna(titanic_train["Age"].median())
titanic_train["Cabin"] = titanic_train["Cabin"].fillna("")
titanic_train["Embarked"] = titanic_train["Embarked"].fillna(
    titanic_train["Embarked"].mode()[0]
)

# Test set: medians are computed from the test set itself.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic_test["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test["Cabin"] = titanic_test["Cabin"].fillna("")

label = ["Survived"]
cat_features = ["Sex", "Embarked"]
text_features = ["Name", "Ticket", "Cabin"]

# Model inputs are every column except the label and the passenger ID;
# the target is the label column on its own.
feature_frame = titanic_train.drop(columns=[*label, "PassengerId"])
target_frame = titanic_train[label]

# Shuffle the rows and set a quarter of them aside for evaluation.
X_train, X_eval, y_train, y_eval = train_test_split(
    feature_frame,
    target_frame,
    test_size=0.25,
    shuffle=True,
)

### (Neptune) Upload processed data
# Resolve each destination field, then upload the frame converted by
# File.as_html to it.
processed_train_field = run["data/processed/train"]
processed_train_field.upload(File.as_html(titanic_train))

processed_test_field = run["data/processed/test"]
processed_test_field.upload(File.as_html(titanic_test))

## Train a CatBoost model

model = CatBoostClassifier()

plot_file = "training_plot.html"

# Collect the fit arguments up front so the call itself stays short.
fit_options = {
    "X": X_train,
    "y": y_train,
    "eval_set": (X_eval, y_eval),  # the held-out split from train_test_split
    "cat_features": cat_features,
    "text_features": text_features,
    "plot_file": plot_file,
    "use_best_model": True,
}
model.fit(**fit_options)

### (Neptune) Upload training results
#### Upload training plot
training_plot_field = run["training/plot"]
training_plot_field.upload(plot_file)

#### Upload training metrics
# Each value is read from the fitted model and passed through
# stringify_unsupported before being assigned to the run.
best_score = model.get_best_score()
run["training/best_score"] = stringify_unsupported(best_score)

best_iteration = model.get_best_iteration()
run["training/best_iteration"] = stringify_unsupported(best_iteration)

## Make predictions
# The ID column is dropped before scoring; the predicted classes are stored
# in a new "prediction" column on the test frame.
test_inputs = titanic_test.drop(columns=["PassengerId"])
titanic_test["prediction"] = model.predict(
    data=test_inputs,
    prediction_type="Class",
)

### (Neptune) Upload predictions
# Write the test frame (now including predictions) to disk, then upload it.
results_csv = "results.csv"
titanic_test.to_csv(results_csv, index=False)

run["data/results"].upload(results_csv)

## (Neptune) Upload model metadata to Neptune
### Upload model binary
# Save the fitted model to a local file, then upload that file.
model_file = "model.cbm"
model.save_model(model_file)

run["model/binary"].upload(model_file)

### Upload model attributes
run["model/attributes/tree_count"] = model.tree_count_

# Pair each feature name with its importance value.
feature_names = model.feature_names_
importances = model.get_feature_importance()
run["model/attributes/feature_importances"] = dict(zip(feature_names, importances))

probability_threshold = model.get_probability_threshold()
run["model/attributes/probability_threshold"] = probability_threshold

### Upload model parameters
all_params = model.get_all_params()
run["model/parameters"] = stringify_unsupported(all_params)

## Stop logging
# Stop the run; nothing else is logged by this script after this point.
run.stop()

## Analyze run in the Neptune app
# Follow the run link in the console output and explore the logged metadata.
# You can also explore this example run:
# https://app.neptune.ai/o/common/org/catboost-support/runs/details?viewId=standard-view&detailsTab=dashboard&dashboardId=Overview-99f571df-0fec-4447-9ffe-5a4c668577cd&shortId=CAT-2