Skip to content

Commit 68f8009

Browse files
committed
fix preprocessing
1 parent 39b2acd commit 68f8009

1 file changed

Lines changed: 17 additions & 19 deletions

File tree

docs/source/tutorials/oregon.rst

Lines changed: 17 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -21,8 +21,6 @@ The dataset includes multiple files containing information about participants in
2121

2222
This data supports research on how health insurance affects healthcare utilization and is maintained by researchers Amy Finkelstein and Katherine Baicker. Please ensure you comply with the data use agreements when downloading and using this dataset.
2323

24-
**Important**: When using this dataset for research or publications, appropriate citation is required as specified in the NBER data use agreement.
25-
2624
.. code-block:: python
2725
2826
import numpy as np
@@ -56,18 +54,18 @@ This data supports research on how health insurance affects healthcare utilizati
5654
# Prepare the data for dte_adj analysis
5755
# Create treatment assignment (instrumental variable): 0=Not selected, 1=Selected
5856
treatment_assignment_mapping = {'Not selected': 0, 'Selected': 1}
59-
df['Z'] = df['treatment'].map(treatment_assignment_mapping).astype(float).fillna(-1).astype(int)
57+
df['Z'] = df['treatment'].map(treatment_assignment_mapping)
6058
6159
# Create actual treatment indicator: 0=Not enrolled, 1=Enrolled (unmapped values stay NaN and are removed by the later dropna())
6260
treatment_mapping = {'NOT enrolled': 0, 'Enrolled': 1}
63-
df['D'] = df['ohp_all_ever_inperson'].map(treatment_mapping).astype(float).fillna(-1).astype(int)
64-
65-
# Use emergency department costs and visits as outcome variables
66-
df['Y_ED_CHARG_TOT_ED'] = df['ed_charg_tot_ed'].fillna(0)
67-
df['Y_NUM_VISIT_CENS_ED'] = df['num_visit_cens_ed'].fillna(0)
61+
df['D'] = df['ohp_all_ever_inperson'].map(treatment_mapping)
6862
6963
# Create strata based on household size
70-
df['strata'] = df['numhh_list']
64+
df.rename(columns={'numhh_list': 'strata'}, inplace=True)
65+
df['strata'] = df['strata'].replace({
66+
'signed self up + 1 additional person': 'signed self up + others',
67+
'signed self up + 2 additional people': 'signed self up + others'
68+
})
7169
7270
# Create feature mappings for categorical variables
7371
gender_mapping = {'Male': 0, 'Female': 1, 'Transgender F to M': 2, 'Transgender M to F': 3}
@@ -80,21 +78,21 @@ This data supports research on how health insurance affects healthcare utilizati
8078
df['edu_inp'] = df['edu_inp'].map(edu_mapping).astype(float).fillna(-1).astype(int)
8179
8280
# Select control variables: pre-randomization ED utilization variables plus gender_inp, age, health_last12_inp and edu_inp
83-
ctrl_cols = [col for col in df_ed.columns if 'pre' in col and 'num' in col]
84-
ctrl_cols.append('charg_tot_pre_ed')
85-
selected_cols = ['person_id', 'strata', 'Y_NUM_VISIT_CENS_ED', 'Y_ED_CHARG_TOT_ED', 'Z', 'D'] + ctrl_cols + ['gender_inp', 'age', 'health_last12_inp', 'edu_inp']
81+
ctrl_cols = [col for col in df_ed.columns if 'pre' in col and 'num' in col] + ['gender_inp', 'age', 'health_last12_inp', 'edu_inp', 'charg_tot_pre_ed']
82+
selected_cols = ['person_id', 'strata', 'ed_charg_tot_ed', 'num_visit_cens_ed', 'Z', 'D'] + ctrl_cols
8683
df = df[selected_cols]
87-
df = df[df.isna().any(axis=1) == False]
84+
df = df.dropna().reset_index(drop=True)
8885
8986
# Create feature matrix (excluding treatment variables)
90-
features = pd.DataFrame(df[ctrl_cols + ['gender_inp', 'age', 'health_last12_inp', 'edu_inp']])
91-
X = features.values
87+
X = df[ctrl_cols].values
9288
93-
Z = df['Z'].values # Treatment assignment (instrumental variable)
94-
D = df['D'].values # Actual treatment (endogenous variable)
89+
Z = df['Z'].astype(int).values # Treatment assignment (instrumental variable)
90+
D = df['D'].astype(int).values # Actual treatment (endogenous variable)
9591
strata = df['strata'].values # Stratification variable
96-
Y_ED_CHARG_TOT_ED = df['Y_ED_CHARG_TOT_ED'].values
97-
Y_NUM_VISIT_CENS_ED = df['Y_NUM_VISIT_CENS_ED'].values
92+
93+
# Use num_visit_cens_ed and ed_charg_tot_ed as outcome variables
94+
Y_ED_CHARG_TOT_ED = df['ed_charg_tot_ed'].values
95+
Y_NUM_VISIT_CENS_ED = df['num_visit_cens_ed'].values
9896
9997
print(f"\nDataset size: {len(D):,} people")
10098
print(f"Treatment assignment (Z) - Not selected: {(Z==0).sum():,} ({(Z==0).mean():.1%})")

0 commit comments

Comments
 (0)