-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathblackBoxRanksDS.R
More file actions
275 lines (196 loc) · 10.9 KB
/
blackBoxRanksDS.R
File metadata and controls
275 lines (196 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#' @title Secure ranking of "V2BR" (vector to be ranked) across all sources
#' @description The second key serverside function that prepares the global
#' ranks of the the real data only generated in the first stage of the
#' ranking procedure and encrypts them in preparation for generating global
#' ranks that correspond 1 to 1 with only the real data in V2BR.
#' @details Severside assign function called by ds.ranksSecure. It takes the
#' global ranks currently held in sR5.df which reflect the global ranks based on
#' the "combined real+pseudo data vector" as encrypted by blackBoxDS
#' but with all pseudo-data stripped out. It then uses these global ranks (of
#' the real data) as if they were a new variable to be ranked. This is then
#' equivalent to blackBoxDS with the primary difference that no
#' pseudo-data are needed. This is because the global ranks are fundamentally
#' non-disclosive and so can be transferred to the clientside with no risk of
#' disclosure. However, in order to ensure that the client cannot compare
#' the list of global.ranks in sR4.df (after initial global ranking based on
#' ranking of real and pseudo-data combined) with the global.ranks to be
#' generated by blackBoxRanksDS (based solely on the real data they are
#' processed through seven more rounds of encryption as before in blackBoxDS.
#' In consequence the client remains unable to determine which of the original
#' global ranks corresponded to real data and which to pseudo-data. In
#' addition, blackBoxRanksDS does not need to determine the number of decimal
#' places in the data because it is only applied to ranks which are assumed to
#' be integers. For more details about the cluster of functions that
#' collectively enable secure global ranking and estimation of global quantiles
#' see the associated document entitled "secure.global.ranking.docx". Also
#' see the header file for ds.ranksSecure and the header file for blackBoxDS
#' @param input.var.name a character string specifying the name of the
#' vector holding the global ranks. This argument is set automatically by
#' the clientside function ds.ranksSecure
#' @param shared.seedval a pseudorandom number seed that ensures that the
#' processes generating the order and parameterisation of the encryption
#' algorithms are the same in each study. This argument is set by the argument
#' <shared.seed.value> in the clientside function ds.ranksSecure. The
#' seed value shared by all studies in setting up the encryption procedures
#' in blackBoxRanksDS is arbitrarily changed from that used to set up the
#' encryption procedures in blackBoxDS, so the the set of 7 encryption
#' algorithms is deliberately different. For more
#' details, including future plans to share this starting seed in a more secure
#' way, please see the associated document entitled "secure.global.ranking.docx"
#' and the header file for ds.ranksSecure.
#' @return writes a data frame object entitled blackbox.ranks.df to the
#' serverside. In each study this contains the encrypted global ranks
#' and a range of other key components from the second stage (ranking of global
#' ranks for real observations only) of the ranking procedure. For more details
#' see the associated document entitled "secure.global.ranking.docx"
#' @author Paul Burton 9th November, 2021
#' @export
#'
blackBoxRanksDS <- function(input.var.name=NULL, shared.seedval){ #START FUNC
#######################################################
#MODULE 1: CAPTURE THE nfilter SETTINGS
#thr<-dsBase::listDisclosureSettingsDS()
#nfilter.tab <- as.numeric(thr$nfilter.tab)
#nfilter.glm <- as.numeric(thr$nfilter.glm)
#nfilter.subset <- as.numeric(thr$nfilter.subset)
#nfilter.string <- as.numeric(thr$nfilter.string)
#nfilter.stringShort <- as.numeric(thr$nfilter.stringShort)
#nfilter.kNN <- as.numeric(thr$nfilter.kNN)
#nfilter.noise <- as.numeric(thr$nfilter.noise)
#nfilter.levels <- as.numeric(thr$nfilter.levels)
########################################################
# back-up current .Random.seed and revert on.exit
if (exists(x = ".Random.seed", envir = globalenv())) {
assign(x = "old_seed", value = .Random.seed, envir = parent.frame());
on.exit({ assign(x = ".Random.seed", value = old_seed, envir = globalenv()); remove("old_seed", envir = parent.frame()) }, add = TRUE)
} else
on.exit(if (exists(x = ".Random.seed", envir = globalenv())) remove(".Random.seed", envir = globalenv()), add = TRUE)
input.var <- eval(parse(text=input.var.name), envir = parent.frame())
input.global.ranks<-input.var
#ESTIMATED OVERALL MEAN AND SD FROM meanQuantileDS
#SAVED IN input.mean.sd.df BY ds.dmtC2S
max.sd.input.ranks<-input.ranks.sd.df$max.sd.input.ranks
mean.input.ranks<-input.ranks.sd.df$mean.input.ranks
numsubs.real<-length(input.global.ranks)
numsubs<-numsubs.real
#Create indicators for original original sequence order and real/synthetic
ID.seq.real.orig<-1:numsubs
#Allow individual implementations of blackBoxDS for groups of projects to have a
#unique but shared starting seed (given specified starting seed) but only
#the person setting up the function can know how that seed is perturbed relative
#to the specified seed
restart.seed.transformation.control.n<-78615
restart.seed.other.seed.actions<-43
#Now set up for repeated transformations
shared.seedval<-(shared.seedval+19)*7
set.seed(shared.seedval)
#Create long fixed sequence of calls to seed to keep resetting seed at start
null.vector<-stats::runif(restart.seed.transformation.control.n,0,1)
#Create transformation control vectors/values first so all studies
#have the same transformation controls given input random seed
#even if the lengths of later vectors (eg synthetic concealing data) are
#different between studies so would lead to inconsistent control
#vectors/values
control.vector<-sample(c(1,1,2,2,3,3),replace=FALSE)
#control.vector<-c(1,control.vector)
control.vector
control.value<-stats::runif(6,0.0001,1)
control.value
#Reset seed for other purposes
set.seed(shared.seedval+restart.seed.other.seed.actions)
#Initialise input.var for analysis. Start by converting to values 0 to 1
#in same order as original input.var. Going to use probit function (stats::pnorm) for this.
#This can take any values -inf to +inf but to avoid extreme value rounding errors
#scale input.var to normal 0 1
#Unlike blackBoxDS no need to check that no attempt has been made to enter a
#fake value for max.sd.input.var. That is because in blackBoxRanksDS no
#synthetic data are generated so the only impact of changing the value of
#max.sd.input.var is to change the first probit transformation. This will either
#cause the whole analysis to fail if the value of max.sd.input.var is too small
#or large or will have no effect at all. So there is no enhanced disclosure risk
#and no need for the equivalent disclosure trap.
input.var.probit.temp<-((input.global.ranks-mean.input.ranks)/max.sd.input.ranks)
input.var.probit<-stats::pnorm(input.var.probit.temp)
if(min(input.var.probit)<=0 | max(input.var.probit)>=1){
error.message<-
paste0("FAILED: initialised values should strictly be >0 and <1 this rule has been violated
there is possiblyly an NA, inf or other error in the input.global.ranks")
stop(error.message, call. = FALSE)
}
if(min(rank(input.global.ranks)-rank(input.var.probit))<0 | max(rank(input.global.ranks)-rank(input.var.probit))>0) {
error.message<-
paste0("FAILED: probit initialised values are not in an identical order to the original input variable please check")
stop(error.message, call. = FALSE)
}
intermediate.value.matrix<-matrix(NA,ncol=8,nrow=numsubs)
intermediate.value.matrix[,1]<-input.global.ranks
intermediate.value.matrix[,2]<-input.var.probit
for(cv in 3:(length(control.vector)+2)){
if(control.vector[cv-2]==1){
intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]^(control.value[cv-2])
}
if(control.vector[cv-2]==2){
intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]+(control.value[cv-2])
}
if(control.vector[cv-2]==3){
intermediate.value.matrix[,cv]<-intermediate.value.matrix[,(cv-1)]*(control.value[cv-2])
}
}
intermediate.value.matrix<-cbind(intermediate.value.matrix,ID.seq.real.orig)
colnames(intermediate.value.matrix)<-c("input.global.ranks.orig","input.var.probit",
1:6,"ID.seq.real.orig")
dim(intermediate.value.matrix)
utils::head(intermediate.value.matrix)
utils::tail(intermediate.value.matrix)
rank.intermediate.value.matrix<-matrix(NA,ncol=8,nrow=numsubs)
for(k in 1:8)
{
rank.intermediate.value.matrix[,k]<-rank(intermediate.value.matrix[,k])
}
rank.intermediate.value.matrix<-cbind(rank.intermediate.value.matrix,ID.seq.real.orig)
colnames(rank.intermediate.value.matrix)<-c("input.global.ranks.orig","input.var.probit",
1:6,"ID.seq.real.orig")
message("\nRANKS IN ALL COLUMNS ABOVE SHOULD BE THE SAME\n")
control.vector
control.value
#CREATE blackBox OUTPUT DF
intermediate.value.df<-data.frame(intermediate.value.matrix)
names(intermediate.value.df)
encrypted.global.ranks<-intermediate.value.df$"X6"
ranks.input.global.ranks.orig<-rank(intermediate.value.df$input.global.ranks.orig)
ranks.encrypted.global.ranks<-rank(intermediate.value.df$"X6")
output.df<-data.frame(cbind(input.global.ranks,encrypted.global.ranks,ranks.input.global.ranks.orig,ranks.encrypted.global.ranks,ID.seq.real.orig))
utils::head(output.df)
utils::tail(output.df)
#Sort df by magnitude of key variable
ord.by.val<-order(input.global.ranks)
output.temp.sort.by.val<-output.df[ord.by.val,]
utils::head(output.temp.sort.by.val)
ID.by.val<-1:nrow(output.temp.sort.by.val)
output.df.sort.by.val<-data.frame(cbind(output.temp.sort.by.val,ID.by.val))
utils::head(output.df.sort.by.val)
utils::tail(output.df.sort.by.val)
dim(output.df.sort.by.val)
blackbox.ranks.df <- cbind(sR5.df[,c(5,1,2,3,9,6,10)],output.df.sort.by.val[,1:2])
colnames(blackbox.ranks.df)[3]<-"encrypted.input.var"
colnames(blackbox.ranks.df)[5]<-"global.ranks.input.from.sR5"
colnames(blackbox.ranks.df)[6]<-"is.real"
colnames(blackbox.ranks.df)[8]<-"global.ranks.after.blackbox"
colnames(blackbox.ranks.df)[9]<-"encrypted.global.ranks"
utils::head(blackbox.ranks.df)
if(sum(round(rank(blackbox.ranks.df[,5])-rank(blackbox.ranks.df[,8]),2)==0)!=numsubs)
{
error.message<-
paste0("FAILED: inconsistent ranking across different transformations in black box,
try a different seed. Altenatively this could reflect modification of the
clientside code which is not recommended. Finally, it can also occur
if the R session on one or more of the opal data servers runs out
of memory")
stop(error.message, call. = FALSE)
}else{
message("\nPROCESSING SUCCESSFUL, ALL RANKS AGREE FOR ALL TRANSFORMATIONS\n\n")
}
return(blackbox.ranks.df)
} #END FUNCTION
#ASSIGN
# blackBoxRanksDS