-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoml4spark_function_variable_selection_via_pca.r
More file actions
172 lines (146 loc) · 7.22 KB
/
oml4spark_function_variable_selection_via_pca.r
File metadata and controls
172 lines (146 loc) · 7.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
###############################################################
# oml4spark_function_variable_selection_via_pca.r
#
# Function to create Variable Selection using only PCA based
# analysis. Variables are selected based on the Total
# Variability of any Normally-scaled variable
# given a Dataset
#
# Usage: selectVariablesViaPCA ( formulaForPCA ,
# inputForPCA ,
# feedback = FALSE ,
# varianceExplainedCutoff=0.90,
# minSignificanceEigenVectors = 0.20
# )
#
# Copyright (c) 2020 Oracle Corporation
# The Universal Permissive License (UPL), Version 1.0
#
# https://oss.oracle.com/licenses/upl/
#
###############################################################
############################################################
### VARIABLE SELECTION WITH PRINCIPAL COMPONENT ANALYSIS ###
### WITH SUPPORT FOR NUMERICALS AND FACTORS ###
### ONLY BASED ON OVERALL VARIABILITY, NOT THE TARGET ###
############################################################
## INPUT IS ORIGINAL DATASET AND FORMULA
## OPTIONAL INPUTS ARE: CUMULATIVE PERCENT VARIANCE EXPLANATION, CORRELATION
selectVariablesViaPCA <- function(formulaForPCA,
                                  inputForPCA,
                                  feedback = FALSE,
                                  varianceExplainedCutoff = 0.90,
                                  minSignificanceEigenVectors = 0.20)
{
  # Variable selection via PCA on the (unitized) input data.
  #
  # Args:
  #   formulaForPCA: formula whose RHS lists the candidate variables;
  #                  any LHS (target) is stripped, since PCA is unsupervised.
  #   inputForPCA: the input dataset (a Spark data frame handle as used by
  #                orch.model.matrix / orch.df.scale / orch.dspca).
  #   feedback: FALSE for silent, TRUE for progress printing, "FULL" for
  #             progress printing plus verbose engine output.
  #   varianceExplainedCutoff: keep components until this fraction of the
  #                            total variance is explained (default 0.90).
  #   minSignificanceEigenVectors: minimum absolute loading for a level to
  #                                count as significant (default 0.20).
  #
  # Returns: character vector of the selected original column names.

  # Keep only the RHS of the formula; rebuild it as a one-sided formula.
  formulaForPCA <- paste0('~ ', gsub(".*~", "", Reduce(paste, deparse(formulaForPCA))))
  # Ensure the no-intercept term is present exactly once.
  # NOTE: the original call had grepl()'s pattern/x arguments reversed,
  # so "-1" could be appended even when the formula already contained it.
  if (!grepl("-1", formulaForPCA, fixed = TRUE)) {
    formulaForPCA <- paste0(formulaForPCA, " -1")
  }

  # Normalize the feedback flag once instead of re-testing it with the
  # (argument-reversed) grepl() membership hack at every print site.
  feedbackStr <- toupper(as.character(feedback)[1])
  verbose_user <- identical(feedbackStr, "FULL")              # engine verbosity
  showFeedback <- verbose_user || isTRUE(feedback) ||
                  identical(feedbackStr, "TRUE")              # user printing

  # Build the design-matrix metadata to learn the full set of level names.
  dmm <- orch.model.matrix(formulaForPCA, data = inputForPCA,
                           type = 'dmm', factorMode = "none",
                           verbose = verbose_user)
  eigenVectorNames <- dmm$getCoefName()
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(eigenVectorNames), ' total Eigenvector Levels\n'))
    print(eigenVectorNames)
  }

  # PCA needs comparable scales: rescale every variable to [0, 1].
  scaledData <- orch.df.scale(data = inputForPCA, method = 'unitization_zero_minimum')
  # Review the Scaled Data
  scaledData$show(5L)

  # Run the PCA requesting up to 100 components (fewer if fewer levels).
  model_dspca <- orch.dspca(formula = formulaForPCA, data = scaledData,
                            k = min(length(eigenVectorNames), 100),
                            formU = FALSE,
                            verbose = verbose_user)
  coef_pca <- coef(model_dspca)
  # Partial eigenvalues actually computed by the engine.
  eigenValues <- coef_pca$s
  numComputedComponents <- length(eigenValues)

  # If fewer components were computed than there are levels, extrapolate
  # the remaining eigenvalues with a -log() decay towards zero so that the
  # variance-explained proportions cover the full variable count.
  if (numComputedComponents < length(eigenVectorNames)) {
    numTotalVariables <- length(eigenVectorNames)
    numMissing <- numTotalVariables - numComputedComponents
    logRange <- abs(-log(numMissing))
    # Guard: log(1) == 0 when exactly one eigenvalue is missing, which
    # would make the decay formula divide by zero (NaN eigenvalue).
    if (logRange == 0) {
      logRange <- 1
    }
    eigenValues[(numComputedComponents + 1):numTotalVariables] <-
      -log(seq_len(numMissing)) *
        eigenValues[numComputedComponents] / logRange +
        eigenValues[numComputedComponents]
    if (showFeedback) {
      cat('\n')
      cat(paste0('\n ', numComputedComponents,
                 ' Total Eigenvalues + Computed Eigenvalues \n'))
      print(eigenValues)
    }
  } else {
    if (showFeedback) {
      cat('\n')
      cat(paste0('\n ', numComputedComponents, ' Total Eigenvalues \n'))
      print(eigenValues)
    }
  }

  # Proportion of total variance explained by each eigenvalue.
  propVarianceExplainedEigenValues <- eigenValues^2 / sum(eigenValues^2)
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(propVarianceExplainedEigenValues),
               ' Proportions of Variance Explained by Eigenvalues and Scree Plot \n'))
    print(propVarianceExplainedEigenValues)
    plot(propVarianceExplainedEigenValues)
  }

  # Keep components up to the cutoff, plus one to reach/exceed it —
  # but never index past the number of eigenvalues available.
  evaluationVariance <- cumsum(propVarianceExplainedEigenValues)
  EigenvaluesIdx <- which(evaluationVariance <= varianceExplainedCutoff)
  reducedEigenvaluesIdx <- seq_len(min(length(EigenvaluesIdx) + 1L,
                                       length(eigenValues)))
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(reducedEigenvaluesIdx),
               ' Reduced Number of Eigenvalues with a minimum of ',
               varianceExplainedCutoff * 100, ' % of Total Cumulative Variance \n'))
    print(formatC(propVarianceExplainedEigenValues[reducedEigenvaluesIdx], format = 'fg', digits = 4))
    print(formatC(evaluationVariance[reducedEigenvaluesIdx], format = 'fg', digits = 4))
  }

  # Eigenvectors (loadings): rows = levels, columns = computed components.
  eigenVectors <- as.matrix(coef_pca$V)
  row.names(eigenVectors) <- eigenVectorNames
  # V only holds the *computed* components, which may be fewer than the
  # retained index set (extrapolated eigenvalues have no vectors) — clamp,
  # and keep matrix shape even when a single column survives.
  nKeep <- min(length(reducedEigenvaluesIdx), ncol(eigenVectors))
  significantVectors <- eigenVectors[, seq_len(nKeep), drop = FALSE]

  # A level is significant when any retained loading reaches the minimum
  # absolute magnitude (vectorized; the original grew a vector via rbind).
  keepLevel <- apply(abs(significantVectors) >= minSignificanceEigenVectors, 1, any)
  finalListSignificantLevels <- as.character(eigenVectorNames[keepLevel])
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(finalListSignificantLevels),
               ' Reduced Number of Levels \n'))
    print(finalListSignificantLevels)
  }

  # Map significant dummy-levels back to original columns by name match.
  originalColumnNames <- scaledData$columns()
  if (length(finalListSignificantLevels) == 0) {
    # No level passed the significance threshold — nothing to select.
    finalListSignificantVars <- character(0)
  } else {
    # Force a levels-by-columns logical matrix: sapply alone collapses to a
    # plain vector when there is only one significant level, which would
    # break the colSums() reduction below.
    matchingCols <- sapply(originalColumnNames, function(colName) {
      grepl(colName, finalListSignificantLevels, ignore.case = TRUE)
    })
    matchingCols <- matrix(matchingCols,
                           ncol = length(originalColumnNames),
                           dimnames = list(NULL, originalColumnNames))
    # Index colnames directly; matchingCols[, mask] would drop to a vector
    # (losing the names) when exactly one column matches.
    finalListSignificantVars <- colnames(matchingCols)[colSums(matchingCols) > 0]
  }
  if (showFeedback) {
    cat('\n')
    cat(paste0('\n ', length(finalListSignificantVars),
               ' Reduced Set of Variables \n'))
    print(as.character(finalListSignificantVars))
  }
  return(as.character(finalListSignificantVars))
}