|
| 1 | +using DataFrames |
| 2 | +using EventStudyInteracts |
| 3 | +using FixedEffectModels |
| 4 | +using LinearAlgebra |
| 5 | +using Random |
| 6 | + |
| 7 | +const SINGLE_COHORT_REL_VARLIST = [:g_3, :g_2, :g0, :g1, :g2, :g3] |
| 8 | +const SINGLE_COHORT_POST_VARLIST = [:g0, :g1, :g2, :g3] |
| 9 | + |
| 10 | +function make_single_cohort_panel(; seed::Int = 20260316, n_ids::Int = 96, n_periods::Int = 10, treat_time::Int = 5) |
| 11 | + Random.seed!(seed) |
| 12 | + |
| 13 | + ids = repeat(1:n_ids, inner = n_periods) |
| 14 | + periods = repeat(1:n_periods, outer = n_ids) |
| 15 | + treated_ids = collect(1:(n_ids ÷ 2)) |
| 16 | + treated = in.(ids, Ref(treated_ids)) |
| 17 | + first_treat = ifelse.(treated, treat_time, missing) |
| 18 | + never_treat = .!treated |
| 19 | + rel_time = periods .- first_treat |
| 20 | + |
| 21 | + industry_by_id = mod1.(shuffle(1:n_ids), 6) |
| 22 | + industry = repeat(industry_by_id, inner = n_periods) |
| 23 | + |
| 24 | + unit_fe = randn(n_ids) |
| 25 | + time_fe = randn(n_periods) |
| 26 | + unit_slope = randn(n_ids) |
| 27 | + treated_time_shock = randn(n_periods) |
| 28 | + industry_time_shock = randn(maximum(industry_by_id), n_periods) |
| 29 | + id_noise = randn(n_ids, n_periods) |
| 30 | + iid_noise = randn(length(ids)) |
| 31 | + |
| 32 | + dynamic_effect = Dict(-3 => 0.0, -2 => 0.0, 0 => 1.0, 1 => 1.4, 2 => 1.8, 3 => 2.1) |
| 33 | + effect = [get(dynamic_effect, Int(coalesce(rt, -999)), 0.0) for rt in rel_time] |
| 34 | + |
| 35 | + centered_t = periods .- ((n_periods + 1) / 2) |
| 36 | + residual = similar(iid_noise) |
| 37 | + for i in eachindex(ids) |
| 38 | + residual[i] = 0.35 * unit_slope[ids[i]] * centered_t[i] / n_periods + |
| 39 | + 0.55 * treated[i] * treated_time_shock[periods[i]] + |
| 40 | + 0.40 * industry_time_shock[industry[i], periods[i]] + |
| 41 | + 0.25 * id_noise[ids[i], periods[i]] + |
| 42 | + 0.20 * iid_noise[i] |
| 43 | + end |
| 44 | + |
| 45 | + y = similar(iid_noise) |
| 46 | + for i in eachindex(ids) |
| 47 | + y[i] = unit_fe[ids[i]] + time_fe[periods[i]] + effect[i] + residual[i] |
| 48 | + end |
| 49 | + |
| 50 | + df = DataFrame( |
| 51 | + id = ids, |
| 52 | + t = periods, |
| 53 | + industry = industry, |
| 54 | + first_treat = first_treat, |
| 55 | + never_treat = never_treat, |
| 56 | + rel_time = rel_time, |
| 57 | + y = y, |
| 58 | + ) |
| 59 | + |
| 60 | + for k in 3:-1:2 |
| 61 | + df[!, Symbol("g_$k")] = Int.(coalesce.(df.rel_time .== -k, false)) |
| 62 | + end |
| 63 | + for k in 0:3 |
| 64 | + df[!, Symbol("g$k")] = Int.(coalesce.(df.rel_time .== k, false)) |
| 65 | + end |
| 66 | + |
| 67 | + return df |
| 68 | +end |
| 69 | + |
| 70 | +single_cohort_vcov_estimators() = [ |
| 71 | + ("simple", Vcov.simple()), |
| 72 | + ("robust", Vcov.robust()), |
| 73 | + ("cluster(id)", Vcov.cluster(:id)), |
| 74 | + ("cluster(id, t)", Vcov.cluster(:id, :t)), |
| 75 | +] |
| 76 | + |
| 77 | +function fit_direct_single_cohort(df::DataFrame, vcov) |
| 78 | + formula = term(:y) ~ sum(term.(SINGLE_COHORT_REL_VARLIST)) + sum(fe.(term.([:id, :t]))) |
| 79 | + return reg(df, formula, vcov; progress_bar = false) |
| 80 | +end |
| 81 | + |
| 82 | +function fit_iw_single_cohort(df::DataFrame, vcov) |
| 83 | + formula = term(:y) ~ sum(fe.(term.([:id, :t]))) |
| 84 | + return eventreg(df, formula, SINGLE_COHORT_REL_VARLIST, :never_treat, :first_treat, vcov; progress_bar = false) |
| 85 | +end |
| 86 | + |
| 87 | +function _coef_lookup(model) |
| 88 | + return Dict(String(name) => value for (name, value) in zip(coefnames(model), coef(model))) |
| 89 | +end |
| 90 | + |
| 91 | +function _stderror_lookup(model) |
| 92 | + ses = sqrt.(diag(Matrix(vcov(model)))) |
| 93 | + return Dict(String(name) => value for (name, value) in zip(coefnames(model), ses)) |
| 94 | +end |
| 95 | + |
| 96 | +function average_effect(model, vars::Vector{Symbol} = collect(SINGLE_COHORT_POST_VARLIST)) |
| 97 | + cn = String.(coefnames(model)) |
| 98 | + idx = [findfirst(==(string(var)), cn) for var in vars] |
| 99 | + any(isnothing, idx) && error("Missing coefficients in average effect calculation.") |
| 100 | + weights = zeros(length(cn)) |
| 101 | + weights[collect(idx)] .= 1 / length(vars) |
| 102 | + estimate = dot(weights, coef(model)) |
| 103 | + variance = dot(weights, Matrix(vcov(model)) * weights) |
| 104 | + return (estimate = estimate, se = sqrt(variance)) |
| 105 | +end |
| 106 | + |
| 107 | +function compare_single_cohort_models(direct, iw) |
| 108 | + direct_coef = _coef_lookup(direct) |
| 109 | + iw_coef = _coef_lookup(iw) |
| 110 | + direct_se = _stderror_lookup(direct) |
| 111 | + iw_se = _stderror_lookup(iw) |
| 112 | + |
| 113 | + period_rows = NamedTuple[] |
| 114 | + max_coef_diff = 0.0 |
| 115 | + max_se_diff = 0.0 |
| 116 | + for name in string.(SINGLE_COHORT_REL_VARLIST) |
| 117 | + coef_diff = abs(direct_coef[name] - iw_coef[name]) |
| 118 | + se_diff = abs(direct_se[name] - iw_se[name]) |
| 119 | + max_coef_diff = max(max_coef_diff, coef_diff) |
| 120 | + max_se_diff = max(max_se_diff, se_diff) |
| 121 | + push!(period_rows, ( |
| 122 | + name = name, |
| 123 | + direct_estimate = direct_coef[name], |
| 124 | + iw_estimate = iw_coef[name], |
| 125 | + direct_se = direct_se[name], |
| 126 | + iw_se = iw_se[name], |
| 127 | + estimate_diff = coef_diff, |
| 128 | + se_diff = se_diff, |
| 129 | + )) |
| 130 | + end |
| 131 | + |
| 132 | + ate_direct = average_effect(direct) |
| 133 | + ate_iw = average_effect(iw) |
| 134 | + vcov_diff = maximum(abs.(Matrix(vcov(direct)) .- Matrix(vcov(iw)))) |
| 135 | + |
| 136 | + return ( |
| 137 | + period_rows = period_rows, |
| 138 | + max_abs_coef_diff = max_coef_diff, |
| 139 | + max_abs_se_diff = max_se_diff, |
| 140 | + max_abs_vcov_diff = vcov_diff, |
| 141 | + ate_direct = ate_direct.estimate, |
| 142 | + ate_iw = ate_iw.estimate, |
| 143 | + ate_estimate_diff = abs(ate_direct.estimate - ate_iw.estimate), |
| 144 | + ate_direct_se = ate_direct.se, |
| 145 | + ate_iw_se = ate_iw.se, |
| 146 | + ate_se_diff = abs(ate_direct.se - ate_iw.se), |
| 147 | + ) |
| 148 | +end |
| 149 | + |
| 150 | +function run_single_cohort_equivalence() |
| 151 | + df = make_single_cohort_panel() |
| 152 | + results = NamedTuple[] |
| 153 | + for (label, estimator) in single_cohort_vcov_estimators() |
| 154 | + direct = fit_direct_single_cohort(df, estimator) |
| 155 | + iw = fit_iw_single_cohort(df, estimator) |
| 156 | + comparison = compare_single_cohort_models(direct, iw) |
| 157 | + push!(results, (label = label, direct = direct, iw = iw, comparison = comparison)) |
| 158 | + end |
| 159 | + return (data = df, results = results) |
| 160 | +end |
| 161 | + |
| 162 | +function _fmt(x) |
| 163 | + return string(round(x; sigdigits = 8)) |
| 164 | +end |
| 165 | + |
| 166 | +function render_single_cohort_markdown(report = run_single_cohort_equivalence()) |
| 167 | + io = IOBuffer() |
| 168 | + println(io, "## Single-Cohort Equivalence Check") |
| 169 | + println(io) |
| 170 | + println(io, "When all treated units start treatment in the same period, the IW estimator collapses to the ordinary event-study regression because there is only one treated cohort and all cohort-share weights are equal to one.") |
| 171 | + println(io) |
| 172 | + println(io, "The script [`scripts/single_cohort_equivalence.jl`](scripts/single_cohort_equivalence.jl) generates a deterministic panel and compares `eventreg(...)` against a direct `FixedEffectModels.reg(...)` event-study regression under four covariance estimators.") |
| 173 | + println(io) |
| 174 | + println(io, "| Vcov | max abs coef diff | max abs se diff | max abs vcov diff | abs ATE diff | abs ATE se diff |") |
| 175 | + println(io, "| --- | ---: | ---: | ---: | ---: | ---: |") |
| 176 | + for row in report.results |
| 177 | + cmp = row.comparison |
| 178 | + println(io, "| $(row.label) | $(_fmt(cmp.max_abs_coef_diff)) | $(_fmt(cmp.max_abs_se_diff)) | $(_fmt(cmp.max_abs_vcov_diff)) | $(_fmt(cmp.ate_estimate_diff)) | $(_fmt(cmp.ate_se_diff)) |") |
| 179 | + end |
| 180 | + println(io) |
| 181 | + detail = only(filter(x -> x.label == "cluster(id, t)", report.results)) |
| 182 | + println(io, "For the two-way clustered case (`cluster(id, t)`), the period-by-period estimates and the average post-treatment effect over `g0:g3` are:") |
| 183 | + println(io) |
| 184 | + println(io, "| Term | Direct FEM estimate | eventreg estimate | Direct FEM std. error | eventreg std. error |") |
| 185 | + println(io, "| --- | ---: | ---: | ---: | ---: |") |
| 186 | + for period in detail.comparison.period_rows |
| 187 | + println(io, "| $(period.name) | $(_fmt(period.direct_estimate)) | $(_fmt(period.iw_estimate)) | $(_fmt(period.direct_se)) | $(_fmt(period.iw_se)) |") |
| 188 | + end |
| 189 | + println(io, "| ATE(g0:g3) | $(_fmt(detail.comparison.ate_direct)) | $(_fmt(detail.comparison.ate_iw)) | $(_fmt(detail.comparison.ate_direct_se)) | $(_fmt(detail.comparison.ate_iw_se)) |") |
| 190 | + return String(take!(io)) |
| 191 | +end |
| 192 | + |
| 193 | +function main(args = ARGS) |
| 194 | + output = nothing |
| 195 | + for arg in args |
| 196 | + startswith(arg, "--output=") || continue |
| 197 | + output = split(arg, "="; limit = 2)[2] |
| 198 | + end |
| 199 | + markdown = render_single_cohort_markdown() |
| 200 | + println(markdown) |
| 201 | + if output !== nothing |
| 202 | + write(output, markdown) |
| 203 | + end |
| 204 | +end |
| 205 | + |
| 206 | +if abspath(PROGRAM_FILE) == @__FILE__ |
| 207 | + main() |
| 208 | +end |
| 209 | + |
0 commit comments