-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoml4spark_function_variable_selection_via_pca.r
More file actions
172 lines (146 loc) · 7.22 KB
/
oml4spark_function_variable_selection_via_pca.r
File metadata and controls
172 lines (146 loc) · 7.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
###############################################################
# oml4spark_function_variable_selection_via_pca.r
#
# Function to create Variable Selection using only PCA based
# analysis. Variables are selected based on the Total
# Variability of any Normally-scaled variable
# given a Dataset
#
# Usage: selectVariablesViaPCA ( formulaForPCA ,
# inputForPCA ,
# feedback = FALSE ,
# varianceExplainedCutoff=0.90,
# minSignificanceEigenVectors = 0.20
# )
#
# Copyright (c) 2020 Oracle Corporation
# The Universal Permissive License (UPL), Version 1.0
#
# https://oss.oracle.com/licenses/upl/
#
###############################################################
############################################################
### VARIABLE SELECTION WITH PRINCIPAL COMPONENT ANALYSIS ###
### WITH SUPPORT FOR NUMERICALS AND FACTORS ###
### ONLY BASED ON OVERALL VARIABILITY, NOT THE TARGET ###
############################################################
## INPUT IS ORIGINAL DATASET AND FORMULA
## OPTIONAL INPUTS ARE: CUMULATIVE PERCENT VARIANCE EXPLANATION, CORRELATION
selectVariablesViaPCA <- function(formulaForPCA,
                                  inputForPCA,
                                  feedback = FALSE,
                                  varianceExplainedCutoff = 0.90,
                                  minSignificanceEigenVectors = 0.20)
{
  # Variable selection via PCA on the (unitized) input data.
  #
  # Args:
  #   formulaForPCA: formula whose RHS lists the candidate variables;
  #                  any LHS (target) is stripped, since PCA is unsupervised.
  #   inputForPCA: the input dataset (a Spark data frame handle as used by
  #                orch.model.matrix / orch.df.scale / orch.dspca).
  #   feedback: FALSE for silent, TRUE for progress printing, "FULL" for
  #             progress printing plus verbose engine output.
  #   varianceExplainedCutoff: keep components until this fraction of the
  #                            total variance is explained (default 0.90).
  #   minSignificanceEigenVectors: minimum absolute loading for a level to
  #                                count as significant (default 0.20).
  #
  # Returns: character vector of the selected original column names.

  # Keep only the RHS of the formula; rebuild it as a one-sided formula.
  formulaForPCA <- paste0('~ ', gsub(".*~", "", Reduce(paste, deparse(formulaForPCA))))
  # Ensure the no-intercept term is present exactly once.
  # NOTE: the original call had grepl()'s pattern/x arguments reversed,
  # so "-1" could be appended even when the formula already contained it.
  if (!grepl("-1", formulaForPCA, fixed = TRUE)) {
    formulaForPCA <- paste0(formulaForPCA, " -1")
  }

  # Normalize the feedback flag once instead of re-testing it with the
  # (argument-reversed) grepl() membership hack at every print site.
  feedbackStr <- toupper(as.character(feedback)[1])
  verbose_user <- identical(feedbackStr, "FULL")              # engine verbosity
  showFeedback <- verbose_user || isTRUE(feedback) ||
                  identical(feedbackStr, "TRUE")              # user printing

  # Build the design-matrix metadata to learn the full set of level names.
  dmm <- orch.model.matrix(formulaForPCA, data = inputForPCA,
                           type = 'dmm', factorMode = "none",
                           verbose = verbose_user)
  eigenVectorNames <- dmm$getCoefName()
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(eigenVectorNames), ' total Eigenvector Levels\n'))
    print(eigenVectorNames)
  }

  # PCA needs comparable scales: rescale every variable to [0, 1].
  scaledData <- orch.df.scale(data = inputForPCA, method = 'unitization_zero_minimum')
  # Review the Scaled Data
  scaledData$show(5L)

  # Run the PCA requesting up to 100 components (fewer if fewer levels).
  model_dspca <- orch.dspca(formula = formulaForPCA, data = scaledData,
                            k = min(length(eigenVectorNames), 100),
                            formU = FALSE,
                            verbose = verbose_user)
  coef_pca <- coef(model_dspca)
  # Partial eigenvalues actually computed by the engine.
  eigenValues <- coef_pca$s
  numComputedComponents <- length(eigenValues)

  # If fewer components were computed than there are levels, extrapolate
  # the remaining eigenvalues with a -log() decay towards zero so that the
  # variance-explained proportions cover the full variable count.
  if (numComputedComponents < length(eigenVectorNames)) {
    numTotalVariables <- length(eigenVectorNames)
    numMissing <- numTotalVariables - numComputedComponents
    logRange <- abs(-log(numMissing))
    # Guard: log(1) == 0 when exactly one eigenvalue is missing, which
    # would make the decay formula divide by zero (NaN eigenvalue).
    if (logRange == 0) {
      logRange <- 1
    }
    eigenValues[(numComputedComponents + 1):numTotalVariables] <-
      -log(seq_len(numMissing)) *
        eigenValues[numComputedComponents] / logRange +
        eigenValues[numComputedComponents]
    if (showFeedback) {
      cat('\n')
      cat(paste0('\n ', numComputedComponents,
                 ' Total Eigenvalues + Computed Eigenvalues \n'))
      print(eigenValues)
    }
  } else {
    if (showFeedback) {
      cat('\n')
      cat(paste0('\n ', numComputedComponents, ' Total Eigenvalues \n'))
      print(eigenValues)
    }
  }

  # Proportion of total variance explained by each eigenvalue.
  propVarianceExplainedEigenValues <- eigenValues^2 / sum(eigenValues^2)
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(propVarianceExplainedEigenValues),
               ' Proportions of Variance Explained by Eigenvalues and Scree Plot \n'))
    print(propVarianceExplainedEigenValues)
    plot(propVarianceExplainedEigenValues)
  }

  # Keep components up to the cutoff, plus one to reach/exceed it —
  # but never index past the number of eigenvalues available.
  evaluationVariance <- cumsum(propVarianceExplainedEigenValues)
  EigenvaluesIdx <- which(evaluationVariance <= varianceExplainedCutoff)
  reducedEigenvaluesIdx <- seq_len(min(length(EigenvaluesIdx) + 1L,
                                       length(eigenValues)))
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(reducedEigenvaluesIdx),
               ' Reduced Number of Eigenvalues with a minimum of ',
               varianceExplainedCutoff * 100, ' % of Total Cumulative Variance \n'))
    print(formatC(propVarianceExplainedEigenValues[reducedEigenvaluesIdx], format = 'fg', digits = 4))
    print(formatC(evaluationVariance[reducedEigenvaluesIdx], format = 'fg', digits = 4))
  }

  # Eigenvectors (loadings): rows = levels, columns = computed components.
  eigenVectors <- as.matrix(coef_pca$V)
  row.names(eigenVectors) <- eigenVectorNames
  # V only holds the *computed* components, which may be fewer than the
  # retained index set (extrapolated eigenvalues have no vectors) — clamp,
  # and keep matrix shape even when a single column survives.
  nKeep <- min(length(reducedEigenvaluesIdx), ncol(eigenVectors))
  significantVectors <- eigenVectors[, seq_len(nKeep), drop = FALSE]

  # A level is significant when any retained loading reaches the minimum
  # absolute magnitude (vectorized; the original grew a vector via rbind).
  keepLevel <- apply(abs(significantVectors) >= minSignificanceEigenVectors, 1, any)
  finalListSignificantLevels <- as.character(eigenVectorNames[keepLevel])
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(finalListSignificantLevels),
               ' Reduced Number of Levels \n'))
    print(finalListSignificantLevels)
  }

  # Map significant dummy-levels back to original columns by name match.
  originalColumnNames <- scaledData$columns()
  if (length(finalListSignificantLevels) == 0) {
    # No level passed the significance threshold — nothing to select.
    finalListSignificantVars <- character(0)
  } else {
    # Force a levels-by-columns logical matrix: sapply alone collapses to a
    # plain vector when there is only one significant level, which would
    # break the colSums() reduction below.
    matchingCols <- sapply(originalColumnNames, function(colName) {
      grepl(colName, finalListSignificantLevels, ignore.case = TRUE)
    })
    matchingCols <- matrix(matchingCols,
                           ncol = length(originalColumnNames),
                           dimnames = list(NULL, originalColumnNames))
    # Index colnames directly; matchingCols[, mask] would drop to a vector
    # (losing the names) when exactly one column matches.
    finalListSignificantVars <- colnames(matchingCols)[colSums(matchingCols) > 0]
  }
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(finalListSignificantVars),
               ' Reduced Set of Variables \n'))
    print(as.character(finalListSignificantVars))
  }
  return(as.character(finalListSignificantVars))
}