-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPropensity score Matching.Rmd
More file actions
472 lines (332 loc) · 13.1 KB
/
Propensity score Matching.Rmd
File metadata and controls
472 lines (332 loc) · 13.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
---
title: "R Notebook"
output: pdf_document
---
## Set up
```{r}
# Install (if missing) and attach the packages used by this notebook.
libs <- c("MatchIt", "ggplot2")
# The original used .packages(all = TRUE), which only works via partial
# matching of the `all.available` argument; installed.packages() is explicit.
needed <- setdiff(libs, rownames(installed.packages()))
if (length(needed) > 0) {
  install.packages(needed)
}
for (lib in libs) {
  library(lib, character.only = TRUE)
}
```
## Data Generation
```{r}
# Simulated data: two confounders (a, b) drive both treatment assignment and
# the potential outcomes, so the naive difference in means is biased.
set.seed(123)
N = 5000
a = rnorm(N)
b = rnorm(N)
# True treatment effect = 2 (i.e. 102 - 100; the other terms just add noise of
# differing variance).
# Note: y1 has twice the slope of y0 on both variables, but we are only interested
# in the AVERAGE binary treatment effect over all levels of a and b, rather than
# prediction accuracy at particular levels (at which the treatment effect may vary
# greatly from its average, as it does here). The ATE equals the difference in
# y-intercepts because both a and b are centered at 0 (drawn from N(0,1)).
y1 = 102 + 6*a + 4*b + rnorm(N)
y0 = 100 + 3*a + 2*b + rnorm(N)
u = (a+b)/2 # confounder index: the mean of a and b (divide by 2 to average them)
# P(D = 1 | a, b) via the logistic CDF; u is centered at 0, so roughly half treated.
p_d_given_a_b = plogis(u)
d = rbinom(rep(1,N), 1, p_d_given_a_b) # rep(1,N) just supplies n = N draws
# Observed outcome: y1 for treated units, y0 for controls.
y = d * y1 + (1-d) * y0
data = data.frame(d, y, a, b, u)
sum(d)/length(d)
# 50.14% in treatment group
```
## Naive estimate of the ATE, no confounders controlled for
```{r}
# The coefficient on d is the naive treatment effect: it compares raw group
# means without adjusting for the confounders a and b, so it is biased.
model = lm(y ~ d, data)
summary(model)
```
## regression with conditioning estimate of ATE
```{r}
# Conditioning on the confounders a and b recovers the ATE (up to sampling noise).
model <- lm(y ~ d+a+b, data)
summary(model)
# d coef = 1.96058
# this is close to accurate!
# Plot the data by treatment status with the fitted regression lines overlaid.
ggplot(data) +
geom_point(aes(x=u, y=y, col= factor(d)), alpha = .5) +
geom_line(aes(x = u, y=predict(model), group=d, col= factor(d))) #+
#geom_line(aes(x = u, y=predict(model2), group=d), col= "orange")
```
## Manual Propensity score
### Create propensity scores
```{r}
# Propensity model: logistic regression of treatment on the confounders.
fit <- glm(d~a+b, family = binomial(link = "logit"), data)
# Fitted probabilities P(D = 1 | a, b) for every unit.
propensity <- predict(fit, type = "response") # P(D | Zs)
data$ps <- propensity
```
## Check Shared Support on propensity scores between treatment and control
```{r}
# Overlapping histograms of propensity scores by treatment status: a visual
# check that treated and control units cover a common range of scores.
ggplot() +
  geom_histogram(data = data, mapping = aes(x = ps, fill = as.factor(d)), alpha = .7, position = "identity")
# Identical plot, built from two separately filtered layers.
ggplot() +
  geom_histogram(data = data[data$d == 1, ], mapping = aes(x = ps), alpha = .5, fill = "red") +
  geom_histogram(data = data[data$d == 0, ], mapping = aes(x = ps), alpha = .5, fill = "blue")
# Region of shared support: sorting the four range endpoints and taking the
# middle two gives the intersection of the two groups' score ranges.
# (Typo fixed: was `shared_suport`.)
shared_support <- sort(c(range(data[data$d == 1, "ps"]), range(data[data$d == 0, "ps"])))[2:3]
unsupported_rows <- (data$ps < shared_support[1]) | (data$ps > shared_support[2])
# Percent of observations falling outside the region of shared support.
(sum(unsupported_rows) / nrow(data)) * 100
# 0.82% ... under 1 percent of the data set is unsupported
```
### Match controls to treatment
```{r}
# Q: why can't we just match without propensity scores using knn in the Z vars?
###
match <- function(data, D_col, PS_col) {
  # Nearest-neighbor matching (with replacement) on a precomputed propensity
  # score column. Returns the treated and control subsets, each augmented with
  # the index (within the opposite group) and outcome of its closest match.
  treated <- data[data[D_col] == 1, ]
  control <- data[data[D_col] == 0, ]

  # Index within the chosen pool of the unit whose propensity score is closest
  # to `score`; ties resolve to the first minimum (which.min semantics).
  nearest <- function(score, match_with = c("treated", "control")) {
    pool <- match.arg(match_with)
    candidates <- if (pool == "treated") treated else control
    which.min(abs(score - candidates[[PS_col]]))
  }

  # Match every treated unit to its nearest control ...
  treated$matched_controls_idx <- apply(treated[PS_col], 1, nearest, match_with = "control")
  treated$matched_controls_y <- control$y[treated$matched_controls_idx]
  # ... and every control unit to its nearest treated unit.
  control$matched_treateds_idx <- apply(control[PS_col], 1, nearest, match_with = "treated")
  control$matched_treateds_y <- treated$y[control$matched_treateds_idx]

  list(treated = treated, control = control)
}
#####
get_estimates <- function(matched_out) {
  # Compute ATT, ATC, and ATE from the output of match().
  #
  # matched_out: list with elements `treated` and `control`; the first six
  #   columns of each are the original data columns (including `d` and `y`),
  #   plus the matched_*_idx / matched_*_y columns added by match().
  #   NOTE(review): the positional [, 1:6] selection below assumes that
  #   layout -- confirm if the input data gains or loses columns.
  # Returns: list(ATT =, ATC =, ATE =).
  stopifnot(inherits(matched_out, "list"))
  stopifnot(all(c("treated", "control") %in% names(matched_out)))
  # ATT: observed treated outcomes vs. their matched-control counterfactuals.
  y1_d1 <- mean(matched_out$treated$y)
  y0_d1_hat <- mean(matched_out$treated$matched_controls_y)
  ATT <- y1_d1 - y0_d1_hat
  # ATC: matched-treated counterfactuals vs. observed control outcomes.
  y1_d0_hat <- mean(matched_out$control$matched_treateds_y)
  y0_d0 <- mean(matched_out$control$y)
  ATC <- y1_d0_hat - y0_d0
  # ATE: fill in each unit's missing potential outcome with its match, then
  # average the unit-level differences over the whole sample (this is the
  # group-size-weighted average of ATT and ATC).
  all <- rbind(matched_out$treated[, 1:6], matched_out$control[, 1:6])
  all$y1 <- all$y
  all$y0 <- all$y
  all[all$d == 1, "y0"] <- matched_out$treated$matched_controls_y
  all[all$d == 0, "y1"] <- matched_out$control$matched_treateds_y
  ATE <- mean(all$y1 - all$y0)
  list(ATT = ATT, ATC = ATC, ATE = ATE)
}
##### Run the manual matcher on the simulated data
test <- match(data, D_col = "d", PS_col = "ps")
## Check balance: after matching, covariate means of the treated units and
## their matched controls should be close.
# for the "a" variable
mean(test$treated$a) - mean(test$control$a[test$treated$matched_controls_idx])
# -0.0154599 ... fairly close to zero
## Get ATT, ATC, and ATE estimates (same quantities as get_estimates(), spelled out)
# ATT: observed treated mean minus matched-control (counterfactual) mean
y1_d1 <- mean(test$treated$y)
y0_d1_hat <- mean(test$treated$matched_controls_y)
ATT <- y1_d1 - y0_d1_hat
# 3.166293 ... the effect of treatment on the treated
# ATC: matched-treated (counterfactual) mean minus observed control mean
y1_d0_hat <- mean(test$control$matched_treateds_y)
y0_d0 <- mean(test$control$y)
ATC <- y1_d0_hat - y0_d0
# 0.7619367
# Simple (unweighted) average of ATT and ATC -- NOT weighted by group size
ATE <- (ATT + ATC)/2
# 1.964115 ... quite close to 2
# Unit-level version: fill in each unit's missing potential outcome with its match
all <- rbind(test$treated[,1:6], test$control[,1:6])
all$y1 <- all$y
all$y0 <- all$y
all[all$d ==1, "y0"] <- test$treated$matched_controls_y
all[all$d ==0, "y1"] <- test$control$matched_treateds_y
mean(all$y1 - all$y0)
# 1.967481... even slightly closer!
# The difference from the ATE computed from ATT and ATC above arises because this
# is in effect a weighted average of ATT and ATC, weighted by the slight imbalance
# in the sizes of the treatment and control groups.
```
## Calculate ATT, ATC (, ATE) with Matchit package
### Match control units to treatment units to approximate the hypothetical Y_0 of treated units, and vice versa
```{r}
# ATT model: match each treated unit to its nearest control (with replacement).
# NOTE(review): distance = "logit" is the legacy MatchIt spelling; newer versions
# use distance = "glm", link = "logit" -- confirm against the installed version.
result <- matchit(d ~ a + b, data, method = "nearest", distance = "logit", replace=TRUE)
matched_data_att = match.data(result)
# Weighted difference in means via regression gives the ATT as the d coefficient.
att_model = lm(y ~ d, data=matched_data_att, weights=matched_data_att$weights)
# ATC model: flip the labels so matchit() matches treated units to controls.
data$d <- (data$d + 1) %% 2 # flip the assignment to match treatment to control
result <- matchit(d ~ a + b, data, method = "nearest", distance = "logit", replace=TRUE)
matched_data_atc = match.data(result)
data$d <- (data$d + 1) %% 2 # flip it back for further use
matched_data_atc$d <- (matched_data_atc$d + 1) %% 2 # revert to get correct treatment effect estimates
atc_model = lm(y ~ d, data=matched_data_atc, weights=matched_data_atc$weights)
# Leftovers from a simulation loop; kept commented out for reference.
#atc_estimates[[i]]<-atc_model$coefficients[[2]]
#att_estimates[[i]]<-att_model$coefficients[[2]]
#att_errs[[i]]<-coef(summary(att_model))[,2][[2]]
#atc_errs[[i]]<-coef(summary(atc_model))[,2][[2]]
```
### check balance of Zs between groups
```{r}
# Covariate balance diagnostics for the most recent matchit() result above.
summary(result)
```
### avg both groups and take difference to get ATT
```{r}
# ATC estimate from the MatchIt weighted regression (the d coefficient).
atc_model$coefficients[[2]]
# 0.7619367... literally identical to my manual values!!
# ATT estimate.
att_model$coefficients[[2]]
# 3.166293
# identical
# Reproduce the ATT by hand: weight each outcome, average by group, difference.
matched_data_att$weighted_y = matched_data_att$weights * matched_data_att$y
disp <- aggregate(matched_data_att[,c("d","weighted_y")], list(matched_data_att$d), mean)
# y1_d1 - y0_d1
disp[2,3] - disp[1,3]
# ATT
# 3.166293
# identical
# Same difference via an unweighted regression on the pre-weighted outcome.
model <- lm(weighted_y ~ d, data = matched_data_att)
summary(model)
```
### Is PSM more robust to Treatment/Control imbalance due to Selection Bias??: YES!
```{r}
### Data Generation: same DGP as before, but selection into treatment is shifted
### so that only a small minority is treated (heavy treatment/control imbalance).
set.seed(123)
N = 5000
a = rnorm(N)
b = rnorm(N)
# True treatment effect = 2 (i.e. 102-100, the other parts just add noise of differing variance)
# note: y1 has twice the slope of y0 on both variables. But we are only interested in
# the AVERAGE binary treatment effect, rather than prediction accuracy at all levels
y1 = 102 + 6*a + 4*b + rnorm(N)
y0 = 100 + 3*a + 2*b + rnorm(N)
u = (a+b)/2 # confounder index: the mean of a and b
p_d_given_a_b = plogis((2*u)-3) # multiply by 2 to steepen the curve, subtract 3 to shift probabilities toward 0
d = rbinom(rep(1,N), 1, p_d_given_a_b)
y = d * y1 + (1-d) * y0
data = data.frame(d, y, a, b, u)
sum(d)/length(d)
# 8.82% in treatment group
### Regression w/ conditioning estimate of ATE
model1 <- lm(y ~ d+a+b, data)
summary(model1)
# beta_d == 5.55926 ... way bigger than 2
# however if we use interactions....
# note: I only know that d:a and d:b interactions are necessary and useful but not
# a:b (which might cause overfitting... maybe), because I know the data generation
# process and I have seen the graph. This is unrealistic.
model2 <- lm(y ~ d+a+b+d:a+d:b, data)
summary(model2)
# beta_d == 2.08545 ... very close to the true value, because the model adapts to
# the data better... will this cause real-world overfitting problems...?
# look at the data by treatment status
ggplot(data) +
geom_point(aes(x=u, y=y, col= factor(d)), alpha = .5) +
geom_line(aes(x = u, y=predict(model1), group=d), col="purple") +
geom_line(aes(x = u, y=predict(model2), group=d), col= "orange")
### PSM estimate of ATE
# make sure to read in the match() function from above first
# make propensity scores
fit <- glm(d~a+b, family = binomial(link = "logit"), data)
propensity <- predict(fit, type = "response") # P(D | Zs)
data$ps <- propensity
# check shared support of treatment and control on the propensity scores
# overlapping histograms plot
ggplot() +
geom_histogram(data = data, mapping = aes(x= ps, fill = as.factor(d)), alpha = .7, position= "identity")
# percent of data unsupported (outside the overlap of the two groups' score ranges)
shared_suport <- sort(c(range(data[data$d == 1, "ps"]), range(data[data$d == 0, "ps"])))[2:3]
unsupported_rows <- (data$ps < shared_suport[1]) | (data$ps > shared_suport[2])
(sum(unsupported_rows)/nrow(data)) * 100
# 2.08% of data set is unsupported... pretty good!
# match treatments and controls
matched <- match(data, D_col = "d", PS_col = "ps")
# ATE, ATT, ATC
estimates <- get_estimates(matched)
estimates$ATE
# 2.080637 ... very close to true value of 2!
estimates$ATT
# 5.888378 ... ok we believe this... close to the reg w/ cond estimate under these conditions
estimates$ATC
# 1.712308 ... close to 2. This shows that the ATT only reflects the (selected)
# treated group, while most of the data sits in the control group.
## conclusion: PSM performs much better than regression with conditioning on a
# heavily imbalanced dataset, because selection bias misleads the regression.
# Further, the ATT only gives info on the treatment group that is selected with its selection bias.
```
### Doing conditional regression on a data matrix that is biased
```{r}
# Regress on the ATT-matched sample with covariates. This specification mixes
# matched weighting with covariate adjustment and is NOT a valid ATE estimate.
test <- lm(weighted_y ~ d + a + b, data = matched_data_att) # correct matched data?
summary(test)
# d estimate -2.6863
# Why is this so far from the true value of 2? Because it is fit on a data matrix
# made up of only the y0_d1_hat values approximated by nearest neighbors on
# propensity scores matched to y1_d1 treatment units... there is no pairing of
# controls with approximated y1_d0_hat values, so the specification doesn't make
# theoretical sense as an estimate of the ATE.
# Predicted outcome difference (d = 1 vs. d = 0) at the covariate means.
# (Fix: the original stored BOTH predictions in a variable named `diff` --
# shadowing base::diff -- without ever differencing them.)
pred <- predict(test, data.frame(d = c(1, 0), a = mean(matched_data_att$a), b = mean(matched_data_att$b)))
pred_diff <- pred[[1]] - pred[[2]]
# -2.686343
```
### Is conditioning regression or PSM more accurate?: balanced treatment and control
```{r}
# Monte Carlo comparison (500 replications): conditioning regression vs. manual
# PSM under balanced treatment assignment. Also records each sample's true ATE.
cond_reg_ATEs <- list()
PSM_ATEs <- list()
sample_ATE <- list()
for (n in 1:500) {
## Data generation (same balanced DGP as the first section; no set.seed here,
## so every replication draws fresh data)
N = 5000
a = rnorm(N)
b = rnorm(N)
# True treatment effect = 2 (i.e. 102-100, the other parts just add noise of
# differing variance)
y1 = 102 + 6*a + 4*b + rnorm(N)
y0 = 100 + 3*a + 2*b + rnorm(N)
u = (a+b)/2
p_d_given_a_b = plogis(u)
d = rbinom(rep(1,N), 1, p_d_given_a_b)
y = d * y1 + (1-d) * y0
data = data.frame(d, y, a, b, u)
# record the sample ATE (knowable only because both potential outcomes are simulated)
sample_ATE[n] <- mean(y1-y0)
## get the conditioning-regression ATE (the d coefficient)
cond_model <- lm(y ~ d+a+b, data)
cond_reg_ATEs[n] <- cond_model$coefficients[2]
## Get the PSM ATE
# make sure to read in the match() function from above first
# make propensity scores
fit <- glm(d~a+b, family = binomial(link = "logit"), data)
propensity <- predict(fit, type = "response") # P(D | Zs)
data$ps <- propensity
# match treatments and controls
test <- match(data, D_col = "d", PS_col = "ps")
# fill in each unit's missing potential outcome with its matched counterpart
all <- rbind(test$treated[,1:6], test$control[,1:6])
all$y1 <- all$y
all$y0 <- all$y
all[all$d ==1, "y0"] <- test$treated$matched_controls_y
all[all$d ==0, "y1"] <- test$control$matched_treateds_y
# get the PSM ATE
PSM_ATEs[n] <- mean(all$y1 - all$y0)
}
# Average true sample ATE across replications
mean(unlist(sample_ATE))
#2nd 2.004365
# PSM: mean and spread of the ATE estimates
mean(unlist(PSM_ATEs))
# 2.007392 #2nd 2.010691
sd(unlist(PSM_ATEs))
# 0.06748241 #2nd 0.06430236
# Conditioning regression: mean and spread of the ATE estimates
mean(unlist(cond_reg_ATEs))
# 2.001612 #2nd 2.004621
sd(unlist(cond_reg_ATEs))
# 0.06747916 #2nd 0.06216361
# So it seems that over this sample there is slightly less bias in the conditioning
# regression approach and pretty much identical variance...
# Bias and spread of each estimator relative to the per-sample true ATE
mean(unlist(sample_ATE) - unlist(PSM_ATEs))
#2nd -0.006326656
sd(unlist(sample_ATE) - unlist(PSM_ATEs))
#2nd 0.03475738
# MSE = variance + bias^2 of (true sample ATE - estimate)
var(unlist(sample_ATE) - unlist(PSM_ATEs)) + mean(unlist(sample_ATE) - unlist(PSM_ATEs))^2
#2nd MSE 0.001248102
mean(unlist(sample_ATE) - unlist(cond_reg_ATEs))
#2nd -0.0002565596
sd(unlist(sample_ATE) - unlist(cond_reg_ATEs))
# 0.04312974
var(unlist(sample_ATE) - unlist(cond_reg_ATEs)) + mean(unlist(sample_ATE) - unlist(cond_reg_ATEs))^2
#2nd MSE 0.00186024 ... but by this per-sample MSE measure PSM comes out lower,
# essentially because the regression's deviations from the truth vary more.
```