Skip to content

Commit 1083fe0

Browse files
committed
Simplified/Updated/Fixed code to compute ldiversity measures (measure_risk_cpp()) which fixes issue #363 dealing with NAs in keyVars;
Updated worker function (fixed Ordering, ...) and added Unit-Tests for `ldiversity()`
1 parent 4553d45 commit 1083fe0

3 files changed

Lines changed: 471 additions & 803 deletions

File tree

R/measure_risk.R

Lines changed: 83 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,18 @@ measure_riskWORK <- function(data, keyVars, w=NULL, missing=-999, hid=NULL, max_
306306
#' @param missing a integer value to be used as missing value in the C++ routine
307307
#' @param ldiv_index indices (or names) of the variables used for l-diversity
308308
#' @param l_recurs_c l-Diversity Constant
309-
ldiversity <- function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999, ...) {
310-
ldiversityX(obj=obj, ldiv_index=ldiv_index, l_recurs_c=l_recurs_c, missing=missing, ...)
309+
ldiversity <- function(obj,
310+
ldiv_index = NULL,
311+
l_recurs_c = 2,
312+
missing = -999,
313+
...) {
314+
ldiversityX(
315+
obj = obj,
316+
ldiv_index = ldiv_index,
317+
l_recurs_c = l_recurs_c,
318+
missing = missing,
319+
...
320+
)
311321
}
312322

313323
setGeneric("ldiversityX", function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999, ...) {
@@ -321,23 +331,27 @@ definition=function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999) {
321331
n <- obj@manipNumVars
322332
s <- obj@manipStrataVar
323333
ldiv_index <- ldiv_index
324-
if ( is.null(ldiv_index) ) {
334+
if (is.null(ldiv_index)) {
325335
sensVar <- get.sdcMicroObj(obj, "sensibleVar")
326-
if ( is.null(sensVar) ) {
336+
if (is.null(sensVar)) {
327337
err <- paste0("You need to specify argument 'sensibleVar' in 'createSdcObj()'")
328-
err <- paste0(err, " or specify it directly (argument 'ldiv_index') so that the")
338+
err <- paste0(err,
339+
" or specify it directly (argument 'ldiv_index') so that the")
329340
err <- paste0(err, " ldiversity risk-measure can be calculated!\n")
330341
stop(err)
331342
} else{
332343
ldiv_index <- sensVar
333344
}
334345
}
335-
if (!is.null(k))
346+
if (!is.null(k)) {
336347
o[, colnames(k)] <- k
337-
if (!is.null(n))
348+
}
349+
if (!is.null(n)) {
338350
o[, colnames(n)] <- n
339-
if (!is.null(s))
351+
}
352+
if (!is.null(s)) {
340353
o$sdcGUI_strataVar <- s
354+
}
341355
kV <- colnames(obj@origData)[get.sdcMicroObj(obj, "keyVars")]
342356
obj@risk$ldiversity <- ldiversityWORK(
343357
data = o,
@@ -371,48 +385,83 @@ ldiversityWORK <- function(data, keyVars, ldiv_index, missing=-999, l_recurs_c=2
371385
stop("Please define valid key variables", call. = FALSE)
372386
}
373387
}
388+
389+
# Index of sensitive variable(s)
374390
if (!is.null(ldiv_index)) {
375391
if (is.numeric(ldiv_index)) {
376392
ldiv_var <- colnames(data)[ldiv_index]
377-
ldiv_index <- length(variables) + 1:length(ldiv_index)
378393
} else if (is.character(ldiv_index)) {
379394
ldiv_var <- ldiv_index
380-
ldiv_index <- length(variables) + 1:length(ldiv_index)
381395
}
382-
if (any(ldiv_var %in% variables))
396+
397+
# Calculate the 1-based index for the C++ matrix (KeyVars + SensVars)
398+
ldiv_index_cpp <- length(variables) + 1:length(ldiv_index)
399+
400+
if (any(ldiv_var %in% variables)) {
383401
stop("Sensitivity variable should not be a keyVariable")
384-
} else ldiv_var <- character(0)
402+
}
403+
} else {
404+
ldiv_var <- character(0)
405+
ldiv_index_cpp <- -99
406+
}
385407

408+
# Prep data (factors/strings -> numeric)
386409
n_key_vars <- length(variables)
387410
dataX <- data[, c(variables, ldiv_var), drop=FALSE]
388411
for (i in 1:ncol(dataX)) {
389-
if (!is.numeric(dataX[, i]))
390-
dataX[, i] <- as.numeric(unlist(dataX[, i]))
412+
if (!is.numeric(dataX[, i])) {
413+
dataX[, i] <- as.numeric(as.factor(dataX[, i]))
414+
}
391415
}
392416
dataX <- as.matrix(dataX)
393-
ind <- do.call(order, data.frame(dataX))
394-
dataX <- dataX[ind, , drop=FALSE]
395-
ind <- order(c(1:nrow(dataX))[ind])
396-
if (is.null(ldiv_index))
397-
ldiv_index=-99
398-
if (length(ldiv_index) > 5)
417+
418+
# Order data for C++ Function
419+
# Matrix is ordered in a way so that NAs are grouped together for the C++ group-matching
420+
# na.last = TRUE ensures that NAs appear at the end of their respective groups
421+
ind <- do.call(order, c(as.data.frame(dataX), list(na.last = TRUE)))
422+
dataX_sorted <- dataX[ind, , drop = FALSE]
423+
424+
# We need an index to be able to restore original order after
425+
# calling the c++ function
426+
back_ind <- order(ind)
427+
428+
# Call C++ function
429+
if (length(ldiv_index_cpp) > 5) {
399430
stop("Maximal number of sensitivity variables is 5")
400-
res <- measure_risk_cpp(dataX, 0, n_key_vars, l_recurs_c, ldiv_index, missing)
401-
res$Fk <- res$Res[, 3]
402-
res$Res <- res$Res[ind, ]
403-
if (all(ldiv_index != -99)) {
404-
res$Mat_Risk <- res$Mat_Risk[ind, ]
405-
names(res)[names(res) == "Mat_Risk"] <- "ldiversity"
406-
colnames(res$ldiversity) <- c(paste(rep(ldiv_var, each=3), rep(c("Distinct_Ldiversity",
407-
"Entropy_Ldiversity", "Recursive_Ldiversity"), length(ldiv_index)), sep="_"),
408-
"MultiEntropy_Ldiversity", "MultiRecursive_Ldiversity")
431+
}
432+
433+
res <- measure_risk_cpp(
434+
data = dataX_sorted,
435+
weighted_R = 0,
436+
n_key_vars_R = n_key_vars,
437+
l_recurs_c_R = l_recurs_c,
438+
ldiv_index_R = ldiv_index_cpp,
439+
missing_value_R = missing
440+
)
441+
442+
# Re-order results back to original order
443+
res$Fk <- res$Res[back_ind, 3]
444+
445+
if (all(ldiv_index_cpp != -99)) {
446+
# Reorder the risk matrix to match original data input
447+
ldiv_mat <- res$Mat_Risk[back_ind, , drop = FALSE]
448+
449+
# Specifiy column names
450+
col_names <- c(paste(rep(ldiv_var, each = 3), rep(
451+
c(
452+
"Distinct_Ldiversity",
453+
"Entropy_Ldiversity",
454+
"Recursive_Ldiversity"
455+
), length(ldiv_var)), sep = "_"),
456+
"MultiEntropy_Ldiversity",
457+
"MultiRecursive_Ldiversity")
458+
colnames(ldiv_mat) <- col_names
459+
res_final <- ldiv_mat
409460
} else {
410-
res <- res[names(res) != "Mat_Risk"]
461+
res_final <- res$Res[back_ind, ]
411462
}
412-
ind <- order(res$Res[, 1], decreasing=TRUE)
413-
res <- res$ldiversity
414-
class(res) <- "ldiversity"
415-
invisible(res)
463+
class(res_final) <- "ldiversity"
464+
invisible(res_final)
416465
}
417466

418467
#' Print method for objects of class measure_risk

0 commit comments

Comments
 (0)