-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathhistogramDS2.R
More file actions
176 lines (145 loc) · 7.6 KB
/
histogramDS2.R
File metadata and controls
176 lines (145 loc) · 7.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#'
#' @title Computes a histogram of the input variable without plotting.
#' @description This function produces the information required to plot
#' a histogram. This is done without allowing for bins (cells) with number
#' of counts less than the pre-specified disclosure control set for the minimum cell
#' size of a table. If a bin has less counts than this threshold then their counts
#' and its density are replaced by a 0 value.
#' @details Please find more details in the documentation of the clientside ds.histogram function.
#' @param xvect the numeric vector for which the histogram is desired.
#' @param num.breaks the number of breaks that the range of the variable is divided.
#' @param min a numeric, the lower limit of the distribution.
#' @param max a numeric, the upper limit of the distribution.
#' @param method.indicator a number equal to either 1, 2 or 3 indicating the method of disclosure
#' control that is used for the generation of the histogram. If the value is equal to 1 then the
#' 'smallCellsRule' is used. If the value is equal to 2 then the 'deterministic' method is used.
#' If the value is set to 3 then the 'probabilistic' method is used.
#' @param k the number of the nearest neighbours for which their centroid is calculated if the
#' \code{method.indicator} is equal to 2 (i.e. deterministic method).
#' @param noise the percentage of the initial variance that is used as the variance of the embedded
#' noise if the \code{method.indicator} is equal to 3 (i.e. probabilistic method).
#' @return a list with an object of class \code{histogram} and the number of invalid cells
#' @author Amadou Gaye, Demetris Avraam for DataSHIELD Development Team
#' @export
#'
histogramDS2 <- function (xvect, num.breaks, min, max, method.indicator, k, noise){
##################################################################
# MODULE 1: CAPTURE THE nfilter SETTINGS #
thr <- dsBase::listDisclosureSettingsDS() #
nfilter.tab <- as.numeric(thr$nfilter.tab) #
#nfilter.glm <- as.numeric(thr$nfilter.glm) #
#nfilter.subset <- as.numeric(thr$nfilter.subset) #
#nfilter.string <- as.numeric(thr$nfilter.string) #
#nfilter.stringShort <- as.numeric(thr$nfilter.stringShort) #
nfilter.kNN <- as.numeric(thr$nfilter.kNN) #
nfilter.noise <- as.numeric(thr$nfilter.noise) #
nfilter.levels.density <- as.numeric(thr$nfilter.levels.density) #
# nfilter.levels.max <- as.numeric(thr$nfilter.levels.max) #
##################################################################
# back-up current .Random.seed and revert on.exit
if (exists(x = ".Random.seed", envir = globalenv())) {
assign(x = "old_seed", value = .Random.seed, envir = parent.frame());
on.exit({ assign(x = ".Random.seed", value = old_seed, envir = globalenv()); remove("old_seed", envir = parent.frame()) }, add = TRUE)
} else
on.exit(if (exists(x = ".Random.seed", envir = globalenv())) remove(".Random.seed", envir = globalenv()), add = TRUE)
if (method.indicator==1){
# Check if the number of breaks meets the DataSHIELD privacy criteria (disclosure control for
# saturation)
if (num.breaks > (nfilter.levels.density * length(xvect))){
studysideMessage <- "FAILED: Number of breaks is too big. It may be disclosive - please shorten"
stop(studysideMessage, call. = FALSE)
}else{
# breaks
brks <- seq(from=min, to=max, by=(max-min)/num.breaks)
}
# Get the histogram object
histout <- graphics::hist(xvect, breaks=brks, plot=FALSE)
# Check if the counts in each 'bin' are more than the disclosure setting for the minimum size
# of table's cells
indx <- which(histout$counts > 0 & histout$counts < nfilter.tab)
# get the number of invalid cells
invalidcells <- length(indx)
if(invalidcells > 0){
# Replace the corresponding counts and densities by zeros
histout$counts[indx] <- 0
histout$density[indx] <- 0
}
}
if(method.indicator==2){
# Load the RANN package to use the 'nn2' function that searches for the Nearest Neighbours
# library(RANN)
# Remove any missing values
x <- stats::na.omit(xvect)
# Standardise the variable
x.standardised <- (x-mean(x))/stats::sd(x)
# Calculate the length of the variable after ommitting any NAs
N.data <- length(x)
# Check if k is integer and has a value greater than or equal to the pre-specified threshold
# and less than or equal to the length of rows of data.complete minus the pre-specified threshold
if(k < nfilter.kNN | k > (N.data - nfilter.kNN)){
stop(paste0("k must be greater than or equal to ", nfilter.kNN, " and less than or equal to ", (N.data-nfilter.kNN), "."), call.=FALSE)
}else{
neighbours = k
}
# Find the k-1 nearest neighbours of each data point
nearest <- RANN::nn2(x.standardised, k = neighbours)
# Calculate the centroid of each n nearest data points
x.centroid <- matrix()
for (i in 1:N.data){
x.centroid[i] <- mean(x.standardised[nearest$nn.idx[i,1:neighbours]])
}
# Calculate the scaling factor
x.scalingFactor <- stats::sd(x.standardised)/stats::sd(x.centroid)
# Apply the scaling factor to the centroids
x.masked <- x.centroid * x.scalingFactor
# Shift the centroids back to the actual position and scale of the original data
x.new <- (x.masked * stats::sd(x)) + mean(x)
# Check if the number of breaks meets the DataSHIELD privacy criteria (disclosure control for
# saturation)
if (num.breaks > (nfilter.levels.density * length(xvect))){
studysideMessage <- "FAILED: Number of breaks is too big. It may be disclosive - please shorten"
stop(studysideMessage, call. = FALSE)
}else{
# breaks
brks <- seq(from=min, to=max, by=(max-min)/num.breaks)
}
# Get the histogram object
histout <- graphics::hist(x.new, breaks=brks, plot=FALSE)
invalidcells <- 0
}
if(method.indicator==3){
# Remove any missing values
x <- stats::na.omit(xvect)
# Calculate the length of the variable after ommitting any NAs
N.data <- length(x)
# Check if the percentage of the variance that is specified in the argument 'noise'
# and is used as the variance of the embedded noise is a greater
# than the minimum threshold specified in the filter 'nfilter.noise'
if(noise < nfilter.noise){
stop(paste0("'noise' must be greater than or equal to ", nfilter.noise), call.=FALSE)
}else{
percentage <- noise
}
# the study-specific seed for random number generation
seed <- getOption("datashield.seed")
set.seed(seed)
# generate the noise-augmented vector
x.new <- x + stats::rnorm(N.data, mean=0, sd=sqrt(percentage*stats::var(x)))
# Check if the number of breaks meets the DataSHIELD privacy criteria (disclosure control for
# saturation)
if (num.breaks > (nfilter.levels.density * length(xvect))){
studysideMessage <- "FAILED: Number of breaks is too big. It may be disclosive - please shorten"
stop(studysideMessage, call. = FALSE)
}else{
# breaks
brks <- seq(from=min, to=max, by=(max-min)/num.breaks)
}
# Get the histogram object
histout <- graphics::hist(x.new, breaks=brks, plot=FALSE)
invalidcells <- 0
}
# Return a list with the histogram object and the number of invalid cells
return(list("histobject"=histout, "invalidcells"=invalidcells))
}
# AGGREGATE FUNCTION
# histogramDS2