-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathhistogramDS1.R
More file actions
152 lines (119 loc) · 6.6 KB
/
histogramDS1.R
File metadata and controls
152 lines (119 loc) · 6.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#'
#' @title returns the minimum and the maximum of the input numeric vector
#' @description this function returns the minimum and maximum of the input numeric vector which
#' depends on the argument \code{method.indicator}. If the method.indicator is set to 1 (i.e. the
#' 'smallCellsRule' is used) the computed minimum and maximum values are multiplied by a very small
#' random number. If the method.indicator is set to 2 (i.e. the 'deterministic' method is used) the
#' function returns the minimum and maximum values of the vector with the scaled centroids. If the
#' method.indicator is set to 3 (i.e. the 'probabilistic' method is used) the function returns the
#' minimum and maximum values of the generated 'noisy' vector.
#' @param xvect the numeric vector for which the histogram is desired.
#' @param method.indicator a number equal to either 1, 2 or 3 indicating the method of disclosure
#' control that is used for the generation of the histogram. If the value is equal to 1 then the
#' 'smallCellsRule' is used. If the value is equal to 2 then the 'deterministic' method is used.
#' If the value is set to 3 then the 'probabilistic' method is used.
#' @param k the number of the nearest neighbours for which their centroid is calculated if the
#' \code{method.indicator} is equal to 2 (i.e. deterministic method).
#' @param noise the percentage of the initial variance that is used as the variance of the embedded
#' noise if the \code{method.indicator} is equal to 3 (i.e. probabilistic method).
#' @return a numeric vector which contains the minimum and the maximum values of the vector
#' @author Amadou Gaye, Demetris Avraam for DataSHIELD Development Team
#' @export
#'
histogramDS1 <- function(xvect, method.indicator, k, noise){
##################################################################
# MODULE 1: CAPTURE THE nfilter SETTINGS #
thr <- dsBase::listDisclosureSettingsDS() #
nfilter.tab <- as.numeric(thr$nfilter.tab) #
#nfilter.glm <- as.numeric(thr$nfilter.glm) #
#nfilter.subset <- as.numeric(thr$nfilter.subset) #
#nfilter.string <- as.numeric(thr$nfilter.string) #
#nfilter.stringShort <- as.numeric(thr$nfilter.stringShort) #
nfilter.kNN <- as.numeric(thr$nfilter.kNN) #
nfilter.noise <- as.numeric(thr$nfilter.noise) #
nfilter.levels.density <- as.numeric(thr$nfilter.levels.density) #
nfilter.levels.max <- as.numeric(thr$nfilter.levels.max) #
##################################################################
# back-up current .Random.seed and revert on.exit
if (exists(x = ".Random.seed", envir = globalenv())) {
assign(x = "old_seed", value = .Random.seed, envir = parent.frame());
on.exit({ assign(x = ".Random.seed", value = old_seed, envir = globalenv()); remove("old_seed", envir = parent.frame()) }, add = TRUE)
} else
on.exit(if (exists(x = ".Random.seed", envir = globalenv())) remove(".Random.seed", envir = globalenv()), add = TRUE)
# print an error message if the input vector is not a numeric
if(!(is.numeric(xvect))){
output <- "The input vector is not a numeric!"
}else{
if (method.indicator==1){
# the study-specific seed for random number generation
seed <- getOption("datashield.seed")
if (is.null(seed))
stop("histogramDS1 requires 'datashield.seed' R option to operate", call.=FALSE)
set.seed(seed)
rr <- c(min(xvect, na.rm=TRUE), max(xvect, na.rm=TRUE))
if(rr[1] < 0){ min <- rr[1] * stats::runif(1, 1.01, 1.05) }else{ min <- rr[1] * stats::runif(1, 0.95, 0.99) }
if(rr[2] < 0){ max <- rr[2] * stats::runif(1, 0.95, 0.99) }else{ max <- rr[2] * stats::runif(1, 1.01, 1.05) }
output <- c(min, max)
}
if(method.indicator==2){
# Remove any missing values
x <- stats::na.omit(xvect)
# Standardise the variable
x.standardised <- (x-mean(x))/stats::sd(x)
# Calculate the length of the variable after ommitting any NAs
N.data <- length(x)
# Check if k is integer and has a value greater than or equal to the pre-specified threshold
# and less than or equal to the length of rows of data.complete minus the pre-specified threshold
if(k < nfilter.kNN | k > (N.data - nfilter.kNN)){
stop(paste0("k must be greater than or equal to ", nfilter.kNN, " and less than or equal to ", (N.data-nfilter.kNN), "."), call.=FALSE)
}else{
neighbours = k
}
# Find the k-1 nearest neighbours of each data point
nearest <- RANN::nn2(x.standardised, k = neighbours)
# Calculate the centroid of each n nearest data points
x.centroid <- matrix()
for (i in 1:N.data){
x.centroid[i] <- mean(x.standardised[nearest$nn.idx[i,1:neighbours]])
}
# Calculate the scaling factor
x.scalingFactor <- stats::sd(x.standardised)/stats::sd(x.centroid)
# Apply the scaling factor to the centroids
x.masked <- x.centroid * x.scalingFactor
# Shift the centroids back to the actual position and scale of the original data
x.new <- (x.masked * stats::sd(x)) + mean(x)
# find the minimum and the maximum of the distribution
min <- min(x.new)
max <- max(x.new)
output <- c(min, max)
}
if(method.indicator==3){
# Remove any missing values
x <- stats::na.omit(xvect)
# Calculate the length of the variable after ommitting any NAs
N.data <- length(x)
# Check if the percentage of the variance that is specified in the argument 'noise'
# and is used as the variance of the embedded noise is a greater
# than the minimum threshold specified in the filter 'nfilter.noise'
if(noise < nfilter.noise){
stop(paste0("'noise' must be greater than or equal to ", nfilter.noise), call.=FALSE)
}else{
percentage <- noise
}
# the study-specific seed for random number generation
seed <- getOption("datashield.seed")
if (is.null(seed))
stop("histogramDS requires 'datashield.seed' R option to operate", call.=FALSE)
set.seed(seed)
# generate the noise-augmented vector
x.new <- x + stats::rnorm(N.data, mean=0, sd=sqrt(percentage*stats::var(x)))
# find the minimum and the maximum of the distribution
min <- min(x.new)
max <- max(x.new)
output <- c(min, max)
}
}
return (output)
}
# AGGREGATE FUNCTION
# histogramDS1