-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoml4spark_function_confusion_matrix_in_spark.r
More file actions
156 lines (140 loc) · 8.42 KB
/
oml4spark_function_confusion_matrix_in_spark.r
File metadata and controls
156 lines (140 loc) · 8.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
###############################################################
# oml4spark_function_confusion_matrix_in_spark.r
#
# Function to Compute a Confusion Matrix based on Spark SQL
# given a Dataset that contains Actuals and Predicted
#
# Usage: createBalancedInput( input_bal ,
# formula_bal ,
# feedback = FALSE ,
# rangeForNoProcess = c(0.45,0.55)
# )
#
# Copyright (c) 2020 Oracle Corporation
# The Universal Permissive License (UPL), Version 1.0
#
# https://oss.oracle.com/licenses/upl/
#
#
###############################################################
########################################################################################
### COMPUTING A CONFUSION MATRIX FOR BINARY CLASSIFIERS IN-MEMORY IN SPARK VIA OML4Spark ###
########################################################################################
confusionMatrixInSpark <- function(list_of_model_pred) {
model <- list_of_model_pred[[1]]
predictions <- list_of_model_pred[[2]]
model_name <- list_of_model_pred[[3]]
model_timings <- list_of_model_pred[[4]]
# Initialize output
allStats <- list()
# Check original Target column from Model
targetFromFormula <- strsplit(model$formula, " ")[[1]][1]
# Remove as.factor() if it is present
if (startsWith(targetFromFormula,"as.factor(")) {
targetFromFormula <- regmatches(targetFromFormula,
gregexpr("(?<=\\().+?(?=\\))",
targetFromFormula,perl = T))[[1]]
}
# Persist the Scoring Data for added performance
orch.df.persist(predictions,storageLevel = "MEMORY_ONLY",verbose = FALSE)
# From all columns of the Predictions
predAllColumns <- predictions$columns()
# Capture Predictions and Probabilities
if (!is.null(model$call)) {
# Capture the Probability column
predLastColumn <- ""
if (predAllColumns[length(predAllColumns)]=='prediction2') {
probLastColumn <- "prediction2"
} else { probLastColumn <- "prediction" }
} else {
# Gather the Last column which is usually the Predicted Label for some models...
predLastColumn <- predAllColumns[length(predAllColumns)]
# ...and the last columns with Probability in the name which is usually the P(target)
# with the exception of LASSO, which is
if (attr(model,"class")=="orch.ml.lasso") { probLastColumn <- predLastColumn
} else {
probColumns <- grep("probability", predAllColumns)
probLastColumn <- predAllColumns[probColumns[length(probColumns)]]
}
}
# Create Temporary View of original Dataset
op <- options(digits.secs = 6)
time <- as.character(Sys.time())
options(op)
tempViewName <- paste0("tmp_view_",
paste(regmatches(time,gregexpr('\\(?[0-9]+', time))[[1]],collapse = ''),
collapse = " ")
orch.df.createView(predictions , tempViewName)
# If the Model includes Probabilities, get general statistics from the Prediction Data
# in particular the proportion of Target=1 if model returns Probability
targetInfo <- orch.df.collect(orch.df.sql(paste0("select ",targetFromFormula," as target, count(*) as num_rows from ",tempViewName," group by ",targetFromFormula, " order by ",targetFromFormula)))
proportionTarget <- targetInfo[2,2]/sum(targetInfo$num_rows)
# confMtxStructure is the structure of a Confusion Matrix for a traditional Binary Target Model
confMtxStructure <- cbind(predicted=c(1,1,0,0),rbind(targetInfo[1],targetInfo[1]))
colnames(confMtxStructure)[2] <- 'actual'
# confusionComputed will compute the actual data from the Predicted column
if (length(probLastColumn)>0) {
confusionComputed <- orch.df.collect(orch.df.sql(paste0("select predicted, ",targetFromFormula,
" as actual, count(*) as count from (select ",
targetFromFormula," , case when `",probLastColumn,"` > ",proportionTarget,
" then 1 else 0 end as predicted from ",tempViewName,") group by ",
targetFromFormula," , predicted")))
} else {
# Check if model is ELM then response is the label and not 0/1
predInfo <- orch.df.collect(orch.df.sql(paste0("select ",predLastColumn," as target from ",tempViewName," group by ",predLastColumn, " order by ",predLastColumn)))
if (predInfo[2,]=='1') {
confusionComputed <- orch.df.collect(orch.df.sql(paste0("select predicted, ",targetFromFormula,
" as actual, count(*) as count from (select ",
targetFromFormula," , case when ",
predLastColumn," == 1 then 1 else 0 end as predicted from ",
tempViewName,") group by ",
targetFromFormula," , predicted")))
} else {
confusionComputed <- orch.df.collect(orch.df.sql(paste0("select predicted, ",targetFromFormula,
" as actual, count(*) as count from (select ",
targetFromFormula," , case when ",
predLastColumn," == '",predInfo[2,],"' then 1 else 0 end as predicted from ",
tempViewName,") group by ",
targetFromFormula," , predicted")))
}
}
# confusionMatrixFinal is the final structured Matrix with '0's replacing any NA
confusionMatrixFinal <- merge(confMtxStructure,confusionComputed, by=c('predicted','actual'), all = TRUE)
confusionMatrixFinal[is.na(confusionMatrixFinal)] <- 0
# Adding Percentage
confusionMatrixFinal$percentage <- confusionMatrixFinal$count/sum(confusionMatrixFinal$count)
# Preparing for several statistics based on the Confusion Matrix
# See https://en.wikipedia.org/wiki/Sensitivity_and_specificity
# Also https://en.wikipedia.org/wiki/Matthews_correlation_coefficient
# Compute True Positive, True Negative, False Positives and False Negatives
TP <- confusionMatrixFinal$percentage[4]
FP <- confusionMatrixFinal$percentage[3]
TN <- confusionMatrixFinal$percentage[1]
FN <- confusionMatrixFinal$percentage[2]
TPR <- TP/(TP+FN) # True Positive Rate - Sensitivity - Recall
FPR <- FP/(FP+TN) # False Positive Rate
FNR <- FN/(FN+TP) # False Negative Rate
TNR <- TN/(TN+FP) # True Negative Rate - Specificity
Precision = TP/(TP+FP) # Precision
Accuracy = (TP+TN)/(TP+TN+FP+FN) # Accuracy
# Approximate AUC using Triangles
AUC <- (1/2)*FPR*TPR + (1/2)*(1-FPR)*(1-TPR) + (1-FPR)*TPR
# Prevalence <- (TP+FN)/(TP+TN+FP+FN) # Prevalence is a constant for all Models
PPV <- Precision # Positive Predictive Value
NPV <- TN/(FN+TN) # Negative Predictive Value
DetectionRate <- TP/(TP+TN+FP+FN) # Detection Rate
DetectionPrevalence <- Accuracy # Detection Prevalence
BalancedAccuracy <- (TPR+TNR)/2 # Balanced Accuracy
F1Score <- 2*Precision*TPR/(Precision+TPR) # F1 Score
Informedness <- TPR+TNR-1 # Informedness
MathewsCorrCoef <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)) # Mathews Correlation Coefficient
allStats <- list(model_name,
confusionMatrixFinal,
cbind(BuildTime=model_timings[1],ScoreTime=model_timings[3],
TP,FP,TN,FN,TPR,FPR,FNR,TNR,
Precision,Accuracy,AUC,PPV,NPV,DetectionRate,
DetectionPrevalence,BalancedAccuracy,F1Score,
Informedness,MathewsCorrCoef))
return(allStats)
}
######### END OF CONFUSION MATRIX ANALYSIS