Skip to content

Commit bd52b42

Browse files
authored
Merge pull request #364 from sdcTools/fix-ldiv-nas
Fix ldiv nas
2 parents ee3a303 + 963fc0e commit bd52b42

6 files changed

Lines changed: 597 additions & 1221 deletions

File tree

NEWS

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# 5.8.2
2+
- Bugfix: `ldiversity()` now computes distinct l-diversity measure correctly in case of `NAs` in Keyvars
3+
* improved/simplified underlying C++ code
4+
* added Unit-Tests for `ldiversity()`
5+
- Fixing some Header-Definitions for CRAN-Compliance
6+
- Updated Unit-Tests for `pram()`
7+
18
# 5.8.1
29
- New AI-assisted anonymization features:
310
+ `AI_createSdcObj()`: LLM-assisted variable classification into SDC roles

R/measure_risk.R

Lines changed: 83 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,18 @@ measure_riskWORK <- function(data, keyVars, w=NULL, missing=-999, hid=NULL, max_
306306
#' @param missing an integer value to be used as missing value in the C++ routine
307307
#' @param ldiv_index indices (or names) of the variables used for l-diversity
308308
#' @param l_recurs_c l-Diversity Constant
309-
ldiversity <- function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999, ...) {
310-
ldiversityX(obj=obj, ldiv_index=ldiv_index, l_recurs_c=l_recurs_c, missing=missing, ...)
309+
ldiversity <- function(obj,
310+
ldiv_index = NULL,
311+
l_recurs_c = 2,
312+
missing = -999,
313+
...) {
314+
ldiversityX(
315+
obj = obj,
316+
ldiv_index = ldiv_index,
317+
l_recurs_c = l_recurs_c,
318+
missing = missing,
319+
...
320+
)
311321
}
312322

313323
setGeneric("ldiversityX", function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999, ...) {
@@ -321,23 +331,27 @@ definition=function(obj, ldiv_index=NULL, l_recurs_c=2, missing=-999) {
321331
n <- obj@manipNumVars
322332
s <- obj@manipStrataVar
323333
ldiv_index <- ldiv_index
324-
if ( is.null(ldiv_index) ) {
334+
if (is.null(ldiv_index)) {
325335
sensVar <- get.sdcMicroObj(obj, "sensibleVar")
326-
if ( is.null(sensVar) ) {
336+
if (is.null(sensVar)) {
327337
err <- paste0("You need to specify argument 'sensibleVar' in 'createSdcObj()'")
328-
err <- paste0(err, " or specify it directly (argument 'ldiv_index') so that the")
338+
err <- paste0(err,
339+
" or specify it directly (argument 'ldiv_index') so that the")
329340
err <- paste0(err, " ldiversity risk-measure can be calculated!\n")
330341
stop(err)
331342
} else{
332343
ldiv_index <- sensVar
333344
}
334345
}
335-
if (!is.null(k))
346+
if (!is.null(k)) {
336347
o[, colnames(k)] <- k
337-
if (!is.null(n))
348+
}
349+
if (!is.null(n)) {
338350
o[, colnames(n)] <- n
339-
if (!is.null(s))
351+
}
352+
if (!is.null(s)) {
340353
o$sdcGUI_strataVar <- s
354+
}
341355
kV <- colnames(obj@origData)[get.sdcMicroObj(obj, "keyVars")]
342356
obj@risk$ldiversity <- ldiversityWORK(
343357
data = o,
@@ -371,48 +385,83 @@ ldiversityWORK <- function(data, keyVars, ldiv_index, missing=-999, l_recurs_c=2
371385
stop("Please define valid key variables", call. = FALSE)
372386
}
373387
}
388+
389+
# Index of sensitive variable(s)
374390
if (!is.null(ldiv_index)) {
375391
if (is.numeric(ldiv_index)) {
376392
ldiv_var <- colnames(data)[ldiv_index]
377-
ldiv_index <- length(variables) + 1:length(ldiv_index)
378393
} else if (is.character(ldiv_index)) {
379394
ldiv_var <- ldiv_index
380-
ldiv_index <- length(variables) + 1:length(ldiv_index)
381395
}
382-
if (any(ldiv_var %in% variables))
396+
397+
# Calculate the 1-based index for the C++ matrix (KeyVars + SensVars)
398+
ldiv_index_cpp <- length(variables) + 1:length(ldiv_index)
399+
400+
if (any(ldiv_var %in% variables)) {
383401
stop("Sensitivity variable should not be a keyVariable")
384-
} else ldiv_var <- character(0)
402+
}
403+
} else {
404+
ldiv_var <- character(0)
405+
ldiv_index_cpp <- -99
406+
}
385407

408+
# Prep data (factors/strings -> numeric)
386409
n_key_vars <- length(variables)
387410
dataX <- data[, c(variables, ldiv_var), drop=FALSE]
388411
for (i in 1:ncol(dataX)) {
389-
if (!is.numeric(dataX[, i]))
390-
dataX[, i] <- as.numeric(unlist(dataX[, i]))
412+
if (!is.numeric(dataX[, i])) {
413+
dataX[, i] <- as.numeric(as.factor(dataX[, i]))
414+
}
391415
}
392416
dataX <- as.matrix(dataX)
393-
ind <- do.call(order, data.frame(dataX))
394-
dataX <- dataX[ind, , drop=FALSE]
395-
ind <- order(c(1:nrow(dataX))[ind])
396-
if (is.null(ldiv_index))
397-
ldiv_index=-99
398-
if (length(ldiv_index) > 5)
417+
418+
# Order data for C++ Function
419+
# Matrix is ordered in a way so that NAs are grouped together for the C++ group-matching
420+
# na.last = TRUE ensures that NAs appear at the end of their respective groups
421+
ind <- do.call(order, c(as.data.frame(dataX), list(na.last = TRUE)))
422+
dataX_sorted <- dataX[ind, , drop = FALSE]
423+
424+
# We need an index to be able to restore original order after
425+
# calling the c++ function
426+
back_ind <- order(ind)
427+
428+
# Call C++ function
429+
if (length(ldiv_index_cpp) > 5) {
399430
stop("Maximal number of sensitivity variables is 5")
400-
res <- measure_risk_cpp(dataX, 0, n_key_vars, l_recurs_c, ldiv_index, missing)
401-
res$Fk <- res$Res[, 3]
402-
res$Res <- res$Res[ind, ]
403-
if (all(ldiv_index != -99)) {
404-
res$Mat_Risk <- res$Mat_Risk[ind, ]
405-
names(res)[names(res) == "Mat_Risk"] <- "ldiversity"
406-
colnames(res$ldiversity) <- c(paste(rep(ldiv_var, each=3), rep(c("Distinct_Ldiversity",
407-
"Entropy_Ldiversity", "Recursive_Ldiversity"), length(ldiv_index)), sep="_"),
408-
"MultiEntropy_Ldiversity", "MultiRecursive_Ldiversity")
431+
}
432+
433+
res <- measure_risk_cpp(
434+
data = dataX_sorted,
435+
weighted_R = 0,
436+
n_key_vars_R = n_key_vars,
437+
l_recurs_c_R = l_recurs_c,
438+
ldiv_index_R = ldiv_index_cpp,
439+
missing_value_R = missing
440+
)
441+
442+
# Re-order results back to original order
443+
res$Fk <- res$Res[back_ind, 3]
444+
445+
if (all(ldiv_index_cpp != -99)) {
446+
# Reorder the risk matrix to match original data input
447+
ldiv_mat <- res$Mat_Risk[back_ind, , drop = FALSE]
448+
449+
# Specify column names
450+
col_names <- c(paste(rep(ldiv_var, each = 3), rep(
451+
c(
452+
"Distinct_Ldiversity",
453+
"Entropy_Ldiversity",
454+
"Recursive_Ldiversity"
455+
), length(ldiv_var)), sep = "_"),
456+
"MultiEntropy_Ldiversity",
457+
"MultiRecursive_Ldiversity")
458+
colnames(ldiv_mat) <- col_names
459+
res_final <- ldiv_mat
409460
} else {
410-
res <- res[names(res) != "Mat_Risk"]
461+
res_final <- res$Res[back_ind, ]
411462
}
412-
ind <- order(res$Res[, 1], decreasing=TRUE)
413-
res <- res$ldiversity
414-
class(res) <- "ldiversity"
415-
invisible(res)
463+
class(res_final) <- "ldiversity"
464+
invisible(res_final)
416465
}
417466

418467
#' Print method for objects of class measure_risk

src/Framework.h

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ typedef int BOOL;
4848

4949
// ============================= Display Messages =================================
5050
inline extern char g_TxtBuffer[1024]; // character TxtBufferfer to display messages
51-
static int OS_Printf(const char *Str, ...);
51+
inline int OS_Printf(const char *Str, ...);
5252

5353
// ============================= Assert =================================
5454
#ifdef _DEBUG
@@ -252,16 +252,16 @@ inline extern int g_NbNew;
252252
#endif
253253
#endif // _MSC_VER
254254

255-
static char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn = TRUE);
256-
static char *ReplaceChar(char *Str, char OldChar, char NewChar);
257-
static char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter = FALSE, BOOL ReturnNULL = TRUE);
255+
inline static char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn = TRUE);
256+
inline static char *ReplaceChar(char *Str, char OldChar, char NewChar);
257+
inline static char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter = FALSE, BOOL ReturnNULL = TRUE);
258258

259-
//=== Parsing
260-
static char *RemoveComment(char *Ptr, int Size = -1); // remove text between /* & */
261-
static char *GoToNextLine(char *Ptr); // renvoie Ptr avancé jusqu'après le '\n' suivant
262-
static char *GoTo1stChar(char *Ptr);
263-
static char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar = TRUE);
264-
static char *ParseLine(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar = TRUE);
259+
//=== Parsing
260+
inline static char *RemoveComment(char *Ptr, int Size = -1); // remove text between /* & */
261+
inline static char *GoToNextLine(char *Ptr); // returns the ptr moved forward to the character right after the next newline
262+
inline static char *GoTo1stChar(char *Ptr);
263+
inline static char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar = TRUE);
264+
inline static char *ParseLine(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar = TRUE);
265265

266266
//============================================= Time function
267267

@@ -284,7 +284,7 @@ int gettimeofday(struct timeval *tv, struct timezone *tz);
284284

285285
#endif // _MSC_VER
286286

287-
static uint TimeGetMilliSecond(void);
287+
inline static uint TimeGetMilliSecond(void);
288288

289289
// ============================= CTooFile =============================
290290
class CTooFile
@@ -618,7 +618,7 @@ inline int SubMain(int argc, char *argv[])
618618
#include <iostream>
619619
#endif
620620

621-
char g_TxtBuffer[1024]; // character TxtBufferfer to display messages
621+
inline char g_TxtBuffer[1024]; // character TxtBufferfer to display messages
622622

623623
int OS_Printf(const char *Str, ...)
624624
{
@@ -703,10 +703,7 @@ int stricmp(char *str1, char *str2)
703703
}
704704
#endif // _MSC_VER
705705

706-
#pragma GCC diagnostic push
707-
#pragma GCC diagnostic ignored "-Wunused-function"
708-
709-
char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn)
706+
inline char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn)
710707
{
711708
if (Max > 0)
712709
{
@@ -716,7 +713,7 @@ char *Strncpy(char *Dst, const char *Src, int Max, BOOL Warn)
716713
return Dst;
717714
}
718715

719-
char *ReplaceChar(char *Str, char OldChar, char NewChar)
716+
inline char *ReplaceChar(char *Str, char OldChar, char NewChar)
720717
{
721718
char *Ret = Str;
722719

@@ -731,7 +728,7 @@ char *ReplaceChar(char *Str, char OldChar, char NewChar)
731728
return Ret;
732729
}
733730

734-
char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter, BOOL ReturnNULL)
731+
inline char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter, BOOL ReturnNULL)
735732
{
736733
int l = (int) strlen(SubString);
737734

@@ -753,7 +750,7 @@ char *Stristr(char *Ptr, char *SubString, BOOL LeaveAfter, BOOL ReturnNULL)
753750
}
754751

755752
///============================================= Parsing
756-
char *RemoveComment(char *Ptr, int Size)
753+
inline char *RemoveComment(char *Ptr, int Size)
757754
{
758755
if (Size < 0)
759756
Size = (int) strlen(Ptr) + 1;
@@ -804,7 +801,7 @@ char *RemoveComment(char *Ptr, int Size)
804801
}
805802

806803

807-
char *GoToNextLine(char *Ptr)
804+
inline char *GoToNextLine(char *Ptr)
808805
{
809806
ASSERT(Ptr != NULL);
810807

@@ -829,7 +826,7 @@ char *GoToNextLine(char *Ptr)
829826
}
830827

831828

832-
char *GoTo1stChar(char *Ptr)
829+
inline char *GoTo1stChar(char *Ptr)
833830
{
834831
while ((*Ptr == ' ' || *Ptr == '\t') && *Ptr != 0 && *Ptr != '\r' && *Ptr != '\n')
835832
++Ptr;
@@ -838,7 +835,7 @@ char *GoTo1stChar(char *Ptr)
838835
}
839836

840837

841-
char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar)
838+
inline char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar)
842839
{
843840
//BOOL Warn = FALSE;
844841
int i = 0;
@@ -892,7 +889,7 @@ char *ParseString(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar)
892889
}
893890

894891

895-
char *ParseLine(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar)
892+
inline char *ParseLine(char *Ptr, char *Str, int Size, BOOL AdvanceTo1stChar)
896893
{
897894
//BOOL Warn = FALSE;
898895
int i = 0;
@@ -987,7 +984,7 @@ int gettimeofday(struct timeval *tv, struct timezone *tz)
987984

988985
#endif // _MSC_VER
989986

990-
uint TimeGetMilliSecond(void)
987+
inline uint TimeGetMilliSecond(void)
991988
{
992989
struct timeval tv;
993990

@@ -996,8 +993,6 @@ uint TimeGetMilliSecond(void)
996993
return (tv.tv_sec & 0x000FFFFFF) * 1000 + tv.tv_usec / 1000;
997994
}
998995

999-
#pragma GCC diagnostic pop
1000-
1001996
// ===============================================================================
1002997
//
1003998
// CTooFile

0 commit comments

Comments
 (0)