-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPrivatized_dataset.R
More file actions
128 lines (101 loc) · 4 KB
/
Privatized_dataset.R
File metadata and controls
128 lines (101 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
rm(list = ls()) # Clean the environment
library(VGAM)
library(readr)
##### Load the dataset #####
# Reads the raw data, relabels its two columns, draws an overview histogram
# and min-max scales the monthly frequencies into `fitness$norm`.
fitness <- read_csv("fitness.csv") # Read the dataset
n <- nrow(fitness) # Number of observations
colnames(fitness) <- c("index", "Frequences_per_mounth") # Rename the columns
fitness$index <- seq_len(n) # Row index 1..n; seq_len() is also safe when n is 0

##### Plotting & Summary statistics #####
# Histogram of the raw monthly frequencies (density scale)
par(mfrow = c(1, 1))
hist(fitness$Frequences_per_mounth, col = "darkgoldenrod1",
     main = "Work out", freq = FALSE, xlab = "times work out",
     border = "white")
box()
summary(fitness$Frequences_per_mounth)

# Normalize the data: (x - min) / (max - min), rounded to two decimals.
# `maximum` and `minimun` stay in the global environment because
# privatized_engine() reads them to map results back to the original scale.
maximum <- max(fitness$Frequences_per_mounth)
minimun <- min(fitness$Frequences_per_mounth)
if (isTRUE(maximum == minimun)) {
  # A zero range would make the scaling divide by zero and fill `norm` with NaN.
  stop("Cannot normalize: all 'Frequences_per_mounth' values are identical.",
       call. = FALSE)
}
fitness$norm <- round(
  (fitness$Frequences_per_mounth - minimun) / (maximum - minimun),
  2
)
##### Build the privatized function #####
# Build a privatized copy of `data` from a noise-perturbed histogram.
#
# The data are binned on [0, 1], Laplace noise with scale 2 / eps is added to
# every bin count, negative counts are clipped to zero, the counts are
# renormalized to proportions, and a synthetic sample is drawn uniformly
# inside each bin in proportion to those noisy shares.
#
# Arguments:
#   data  Numeric vector, expected to lie in [0, 1] (values outside the bins
#         are not counted by cut()/table()).
#   m     Number of bins; only used to derive the default bin width.
#   h     Bin width; the bin edges are seq(0, 1, h).
#   eps   Privacy level; the Laplace noise scale is 2 / eps.
#   lower, upper
#         Range of the original measurements, used to undo the min-max
#         scaling. They default to the script-level `minimun` and `maximum`.
#
# Returns a data.frame with the columns `privatized_times_norm` (sample on the
# [0, 1] scale), `index` (1..number of rows) and `privatized_times` (sample
# mapped back to the original scale). Because the per-bin counts are rounded,
# the number of rows may differ slightly from length(data), and it is 0 when
# every noisy count is clipped to zero.
privatized_engine <- function(
    data = fitness$norm, m = 20, h = 1 / m, eps = 0.1,
    lower = minimun, upper = maximum) {
  stopifnot(eps > 0, h > 0, h <= 1)

  n <- length(data) # Number of observations
  bins <- seq(0, 1, h) # Bin edges
  # Derive the bin count from the edges so a user-supplied `h` stays
  # consistent with the length of the noise vector.
  n_bins <- length(bins) - 1

  counts <- as.vector(table(cut(data, bins, include.lowest = TRUE)))
  nu <- rlaplace(n_bins, 0, 2 / eps) # Laplace perturbation, one draw per bin
  noisy <- pmax(counts + nu, 0) # Negative noisy counts are clipped to zero

  total <- sum(noisy)
  shares <- if (total != 0) noisy / total else rep(0, n_bins)
  per_bin <- round(shares * n, 0) # Synthetic observations to draw per bin

  # Draw all points in one vectorized call: each point gets the edges of the
  # bin it belongs to, bins being visited from left to right.
  left <- rep(bins[-length(bins)], times = per_bin)
  right <- rep(bins[-1], times = per_bin)
  private_dat <- data.frame(
    privatized_times_norm = runif(length(left), left, right)
  )

  # Index by row count; length() of a data.frame is its number of columns.
  private_dat$index <- seq_len(nrow(private_dat))
  # Invert the min-max scaling (x - lower) / (upper - lower).
  private_dat$privatized_times <-
    private_dat$privatized_times_norm * (upper - lower) + lower
  private_dat
}
##### Parameter grids for the experiments #####
m_vec <- seq(from = 5, to = 20, by = 5) # Candidate numbers of bins
eps_vec <- c(0.001, 0.01, 0.1, 1) # Candidate privacy levels
k_vec <- seq(from = 25, to = 100, by = 25) # Numbers of leading observations to use
##### Vary m while keeping the other parameters at their defaults #####
# One row of summary statistics per candidate number of bins.
summary_stats_m <- matrix(
  NA,
  nrow = length(m_vec), ncol = 6,
  dimnames = list(
    as.character(m_vec),
    c("Min", "1st Qu", "Median", "Mean", "3rd Qu", "Max")
  )
)
hist_m <- c()
par(mfrow = c(2, 2)) # 2 x 2 grid of panels, one per setting
for (i in seq_along(m_vec)) {
  x <- m_vec[i]
  dataset <- privatized_engine(data = fitness$norm, m = x)
  summary_stats_m[i, ] <- summary(dataset$privatized_times)
  hist(
    dataset$privatized_times,
    main = paste(" privatized dataset \n with m set to: ", x),
    xlab = "times per month",
    freq = FALSE,
    col = "cyan4"
  )
  box()
}
# Print the original summary next to the privatized ones for comparison
summary(fitness$Frequences_per_mounth)
summary_stats_m
#####
#### Vary eps while keeping the other parameters at their defaults ####
# One row of summary statistics per candidate privacy level.
summary_stats_eps <- matrix(
  NA,
  nrow = length(eps_vec), ncol = 6,
  dimnames = list(
    as.character(eps_vec),
    c("Min", "1st Qu", "Median", "Mean", "3rd Qu", "Max")
  )
)
hist_eps <- c()
par(mfrow = c(2, 2)) # 2 x 2 grid of panels, one per setting
for (i in seq_along(eps_vec)) {
  x <- eps_vec[i]
  dataset <- privatized_engine(data = fitness$norm, eps = x)
  summary_stats_eps[i, ] <- summary(dataset$privatized_times)
  hist(
    dataset$privatized_times,
    main = paste(" privatized dataset \n with eps set to: ", x),
    xlab = "times per month",
    freq = FALSE,
    col = "cyan4"
  )
  box()
}
# Print the original summary next to the privatized ones for comparison
summary(fitness$Frequences_per_mounth)
summary_stats_eps
#####
##### Vary k (number of observations used), other parameters at defaults #####
# One row of summary statistics per sample size in k_vec.
summary_stats_k <- matrix(NA, nrow = length(k_vec), ncol = 6)
# Label the k table itself; labelling summary_stats_m here would overwrite the
# row names of the m experiment and leave this table without names.
colnames(summary_stats_k) <- c("Min", "1st Qu", "Median", "Mean", "3rd Qu", "Max")
rownames(summary_stats_k) <- k_vec
par(mfrow = c(2, 2)) # 2 x 2 grid of panels, one per setting
for (i in seq_along(k_vec)) {
  x <- k_vec[i]
  # head() takes the first x observations and, unlike [1:x], does not pad
  # with NA when x exceeds the number of available observations.
  dataset <- privatized_engine(data = head(fitness$norm, x))
  summary_stats_k[i, ] <- summary(dataset$privatized_times)
  hist(
    dataset$privatized_times,
    main = paste(" privatized dataset \n with k set to: ", x),
    xlab = "times per month",
    freq = FALSE,
    col = "cyan4"
  )
  box()
}
# Print the original summary next to the privatized ones for comparison
summary(fitness$Frequences_per_mounth)
summary_stats_k
#####