Agency-Data/gradient_boost_machine.R at master · neuggs/Agency-Data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
library(xlsx)
library(caret)
library(dplyr)
library(gbm)
library(xgboost)

# Read the original data set
agency_data_orig <- read.xlsx('./data/AgencyData_clean.xlsx', sheetIndex=1, stringsAsFactors=T)

# Trim the phat - i.e., data that's irrelevant
agency_data_used <- agency_data_orig[c(-1, -4)]
str(agency_data_used)

# Reusable function for one-hot encodigin
ordinal <- function(the_var, the_var_str){
  dmy <- dummyVars(paste(" ~ ", the_var_str), data = agency_data_used)
  dummy_df <- data.frame(predict(dmy, newdata = agency_data_used))
  agency_data_used <- cbind(agency_data_used, dummy_df)
  return(agency_data_used)
}

# Reusable function for non-ordingal
non_ordinal <- function(the_var){
  the_var <- as.integer(the_var)
  return (the_var)

}

agency_data_used$effective_date <- as.numeric(agency_data_used$effective_date)

# Convert all the categorical features to numeric. Simple translation is required for
# non-ordinal categorical variables. Ordinal categorical variables are encoded using
# one hot encoding.
agency_data_used$account_type <- non_ordinal(agency_data_used$account_type)
agency_data_used$assigned_agent <- non_ordinal(agency_data_used$assigned_agent)
agency_data_used$lob <- non_ordinal(agency_data_used$lob)
agency_data_used$master_company <- non_ordinal(agency_data_used$master_company)
agency_data_used$policy_type <- non_ordinal(agency_data_used$policy_type)
agency_data_used <- ordinal(agency_data_used$policy_term, "policy_term")
agency_data_used$policy_term <- NULL # can't get this into the function for some reason
agency_data_used$rating_state <- non_ordinal(agency_data_used$rating_state)
agency_data_used$status <- non_ordinal(agency_data_used$status)
agency_data_used$transaction_type <- non_ordinal(agency_data_used$transaction_type)


# Setup train and test data, 70% / 30%
train_ad <- sample_frac(agency_data_used, 0.7)
sid <- as.numeric(rownames(train_ad)) # because rownames() returns character
test_ad <- agency_data_used[-sid,]

fit <- glm(transaction_type ~ ., data=agency_data_used, family=poisson())
summary(fit)

# for reproducibility
set.seed(123)

# train GBM model
gbm.fit <- gbm(
  formula = transaction_type ~ .,
  distribution = "gaussian",
  data = agency_data_used,
  n.trees = 10000,
  interaction.depth = 1,
  shrinkage = 0.001,
  cv.folds = 5,
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
)

# print results
print(gbm.fit)

# get MSE and compute RMSE
sqrt(min(gbm.fit$cv.error))

# plot loss function as a result of n trees added to the ensemble
gbm.perf(gbm.fit, method = "cv")

set.seed(123)
# train GBM model
gbm.fit2 <- gbm(
  formula = transaction_type ~ .,
  distribution = "gaussian",
  data = agency_data_used,
  n.trees = 5000,
  interaction.depth = 3,
  shrinkage = 0.1,
  cv.folds = 5,
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
)
min_MSE <- which.min(gbm.fit2$cv.error)
sqrt(gbm.fit2$cv.error[min_MSE])
gbm.perf(gbm.fit2, method = "cv")