Merge pull request #451 from PolicyEngine/migrate-random-to-data-upstream

MaxGhenis · web-flow · commit 09641de94f37 · 2026-02-06T19:24:04.000-05:00
Move all randomness to data package for deterministic country package
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,11 @@
+- bump: minor
+  changes:
+    added:
+      - Name-based seeding (seeded_rng) for order-independent reproducibility
+      - State-specific Medicaid takeup rates (53%-99% range, 51 jurisdictions)
+      - SSI resource test pass rate parameter (0.4)
+      - WIC takeup and nutritional risk draw variables (float)
+      - meets_ssi_resource_test boolean generation
+    changed:
+      - Replaced shared RNG (seed=100) with per-variable name-based seeding
+      - Medicaid takeup now uses state-specific rates instead of uniform 93%
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -14,6 +14,8 @@
 )
 from microimpute.models.qrf import QRF
 import logging
+from policyengine_us_data.parameters import load_take_up_rate
+from policyengine_us_data.utils.randomness import seeded_rng
 
 
 class CPS(Dataset):
@@ -191,28 +193,101 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
 def add_takeup(self):
     data = self.load_dataset()
 
-    from policyengine_us import system, Microsimulation
+    from policyengine_us import Microsimulation
 
     baseline = Microsimulation(dataset=self)
-    parameters = baseline.tax_benefit_system.parameters(self.time_period)
 
-    generator = np.random.default_rng(seed=100)
+    n_persons = len(data["person_id"])
+    n_tax_units = len(data["tax_unit_id"])
+    n_spm_units = len(data["spm_unit_id"])
+
+    # Load take-up rates
+    eitc_rates_by_children = load_take_up_rate("eitc", self.time_period)
+    dc_ptc_rate = load_take_up_rate("dc_ptc", self.time_period)
+    snap_rate = load_take_up_rate("snap", self.time_period)
+    aca_rate = load_take_up_rate("aca", self.time_period)
+    medicaid_rates_by_state = load_take_up_rate("medicaid", self.time_period)
+    head_start_rate = load_take_up_rate("head_start", self.time_period)
+    early_head_start_rate = load_take_up_rate(
+        "early_head_start", self.time_period
+    )
+    ssi_pass_rate = load_take_up_rate("ssi_pass_rate", self.time_period)
 
-    eitc_takeup_rates = parameters.gov.irs.credits.eitc.takeup
+    # EITC: varies by number of children
     eitc_child_count = baseline.calculate("eitc_child_count").values
-    eitc_takeup_rate = eitc_takeup_rates.calc(eitc_child_count)
-    data["takes_up_eitc"] = (
-        generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate
+    eitc_takeup_rate = np.array(
+        [
+            eitc_rates_by_children.get(min(int(c), 3), 0.85)
+            for c in eitc_child_count
+        ]
+    )
+    rng = seeded_rng("takes_up_eitc")
+    data["takes_up_eitc"] = rng.random(n_tax_units) < eitc_takeup_rate
+
+    # DC Property Tax Credit
+    rng = seeded_rng("takes_up_dc_ptc")
+    data["takes_up_dc_ptc"] = rng.random(n_tax_units) < dc_ptc_rate
+
+    # SNAP
+    rng = seeded_rng("takes_up_snap_if_eligible")
+    data["takes_up_snap_if_eligible"] = rng.random(n_spm_units) < snap_rate
+
+    # ACA
+    rng = seeded_rng("takes_up_aca_if_eligible")
+    data["takes_up_aca_if_eligible"] = rng.random(n_tax_units) < aca_rate
+
+    # Medicaid: state-specific rates
+    state_codes = baseline.calculate("state_code_str").values
+    hh_ids = data["household_id"]
+    person_hh_ids = data["person_household_id"]
+    hh_to_state = dict(zip(hh_ids, state_codes))
+    person_states = np.array(
+        [hh_to_state.get(hh_id, "CA") for hh_id in person_hh_ids]
     )
-    dc_ptc_takeup_rate = parameters.gov.states.dc.tax.income.credits.ptc.takeup
-    data["takes_up_dc_ptc"] = (
-        generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate
+    medicaid_rate_by_person = np.array(
+        [medicaid_rates_by_state.get(s, 0.93) for s in person_states]
+    )
+    rng = seeded_rng("takes_up_medicaid_if_eligible")
+    data["takes_up_medicaid_if_eligible"] = (
+        rng.random(n_persons) < medicaid_rate_by_person
+    )
+
+    # Head Start
+    rng = seeded_rng("takes_up_head_start_if_eligible")
+    data["takes_up_head_start_if_eligible"] = (
+        rng.random(n_persons) < head_start_rate
+    )
+
+    # Early Head Start
+    rng = seeded_rng("takes_up_early_head_start_if_eligible")
+    data["takes_up_early_head_start_if_eligible"] = (
+        rng.random(n_persons) < early_head_start_rate
     )
-    generator = np.random.default_rng(seed=100)
 
-    data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"]))
-    data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"]))
-    data["medicaid_take_up_seed"] = generator.random(len(data["person_id"]))
+    # SSI resource test
+    rng = seeded_rng("meets_ssi_resource_test")
+    data["meets_ssi_resource_test"] = rng.random(n_persons) < ssi_pass_rate
+
+    # WIC: resolve draws to bools using category-specific rates
+    wic_categories = baseline.calculate("wic_category_str").values
+    wic_takeup_rates = load_take_up_rate("wic_takeup", self.time_period)
+    wic_takeup_rate_by_person = np.array(
+        [wic_takeup_rates.get(c, 0) for c in wic_categories]
+    )
+    rng = seeded_rng("would_claim_wic")
+    data["would_claim_wic"] = rng.random(n_persons) < wic_takeup_rate_by_person
+
+    # WIC nutritional risk — fully resolved
+    wic_risk_rates = load_take_up_rate(
+        "wic_nutritional_risk", self.time_period
+    )
+    wic_risk_rate_by_person = np.array(
+        [wic_risk_rates.get(c, 0) for c in wic_categories]
+    )
+    receives_wic = baseline.calculate("receives_wic").values
+    rng = seeded_rng("is_wic_at_nutritional_risk")
+    imputed_risk = rng.random(n_persons) < wic_risk_rate_by_person
+    data["is_wic_at_nutritional_risk"] = receives_wic | imputed_risk
 
     self.save_dataset(data)
 
diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py
@@ -0,0 +1,72 @@
+"""
+Take-up rate parameters for stochastic simulation.
+
+These parameters are stored in the data package to keep the country package
+as a purely deterministic rules engine.
+"""
+
+import yaml
+from pathlib import Path
+
+PARAMETERS_DIR = Path(__file__).parent
+
+
+def load_take_up_rate(variable_name: str, year: int = 2018):
+    """Load take-up rate from YAML parameter files.
+
+    Args:
+        variable_name: Name of the take-up parameter file (without .yaml)
+        year: Year for which to get the rate
+
+    Returns:
+        float, dict (EITC rates_by_children), or dict (Medicaid
+        rates_by_state)
+    """
+    yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml"
+
+    with open(yaml_path) as f:
+        data = yaml.safe_load(f)
+
+    # EITC: rates by number of children
+    if "rates_by_children" in data:
+        return data["rates_by_children"]
+
+    # Medicaid: state-specific rates
+    if "rates_by_state" in data:
+        return data["rates_by_state"]
+
+    # WIC-style: rates by category (each category has a time series)
+    if "rates_by_category" in data:
+        result = {}
+        for category, time_series in data["rates_by_category"].items():
+            applicable_value = None
+            for y, value in sorted(time_series.items()):
+                if int(y) <= year:
+                    applicable_value = value
+                else:
+                    break
+            if applicable_value is not None:
+                result[category] = applicable_value
+        return result
+
+    # Standard time-series values
+    values = data["values"]
+    applicable_value = None
+
+    for date_key, value in sorted(values.items()):
+        if hasattr(date_key, "year"):
+            date_year = date_key.year
+        else:
+            date_year = int(date_key.split("-")[0])
+
+        if date_year <= year:
+            applicable_value = value
+        else:
+            break
+
+    if applicable_value is None:
+        raise ValueError(
+            f"No take-up rate found for {variable_name} in {year}"
+        )
+
+    return applicable_value
diff --git a/policyengine_us_data/parameters/take_up/aca.yaml b/policyengine_us_data/parameters/take_up/aca.yaml
@@ -0,0 +1,10 @@
+description: Percentage of eligible people who do enroll in Affordable Care Act coverage, if eligible.
+metadata:
+  label: ACA takeup rate
+  unit: /1
+  period: year
+  reference:
+    - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP"
+      href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the
+values:
+  2018-01-01: 0.672
diff --git a/policyengine_us_data/parameters/take_up/dc_ptc.yaml b/policyengine_us_data/parameters/take_up/dc_ptc.yaml
@@ -0,0 +1,11 @@
+description: The share of eligible individuals who claim the DC property tax credit.
+metadata:
+  unit: /1
+  label: DC property tax credit takeup rate
+  period: year
+  reference:
+    - title: District of Columbia Tax Expenditure Report, 2024
+      href: https://ora-cfo.dc.gov/sites/default/files/dc/sites/ora-cfo/publication/attachments/2024%20Tax%20Expenditure%20Report.pdf#page=234
+values:
+  # 37,133 (from 2024 Tax Expenditure Report) / 131,791,388 (PolicyEngine DC PTC value estimate)
+  2021-01-01: 0.32
diff --git a/policyengine_us_data/parameters/take_up/early_head_start.yaml b/policyengine_us_data/parameters/take_up/early_head_start.yaml
@@ -0,0 +1,9 @@
+description: Percentage of eligible infants and toddlers who enroll in Early Head Start.
+metadata:
+  label: Early Head Start take-up rate
+  unit: /1
+  reference:
+    - title: NIEER State(s) of Head Start and Early Head Start Report
+      href: https://nieer.org/research-library/states-head-start-early-head-start
+values:
+  2020-09-01: 0.09
diff --git a/policyengine_us_data/parameters/take_up/eitc.yaml b/policyengine_us_data/parameters/take_up/eitc.yaml
@@ -0,0 +1,12 @@
+description: The share of eligible individuals who claim the EITC (by number of children).
+metadata:
+  label: EITC take-up rate by number of children
+  reference:
+    - title: National Taxpayer Advocate Special Report to Congress 2020 | IRS
+      href: https://www.taxpayeradvocate.irs.gov/wp-content/uploads/2020/08/JRC20_Volume3.pdf#page=62
+# Maps number of children to take-up rate
+rates_by_children:
+  0: 0.65
+  1: 0.86
+  2: 0.85
+  3: 0.85  # Assume same as 2
diff --git a/policyengine_us_data/parameters/take_up/head_start.yaml b/policyengine_us_data/parameters/take_up/head_start.yaml
@@ -0,0 +1,10 @@
+description: Percentage of eligible children who enroll in Head Start.
+metadata:
+  label: Head Start take-up rate
+  unit: /1
+  reference:
+    - title: NIEER State(s) of Head Start and Early Head Start Report
+      href: https://nieer.org/research-library/states-head-start-early-head-start
+values:
+  2020-09-01: 0.40
+  2021-09-01: 0.30
diff --git a/policyengine_us_data/parameters/take_up/medicaid.yaml b/policyengine_us_data/parameters/take_up/medicaid.yaml
@@ -0,0 +1,64 @@
+description: Percentage of people who do enroll in Medicaid, if eligible.
+metadata:
+  label: Medicaid takeup rate
+  unit: /1
+  period: year
+  breakdown:
+    - state_code
+  reference:
+    - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP"
+      href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/
+    - title: State-specific rates derived from MACPAC enrollment targets vs modeled eligibility
+      href: https://www.medicaid.gov/medicaid/program-information/medicaid-and-chip-enrollment-data/report-highlights/index.html
+rates_by_state:
+  AK: 0.88
+  AL: 0.92
+  AR: 0.79
+  AZ: 0.95
+  CA: 0.78
+  CO: 0.99
+  CT: 0.89
+  DC: 0.99
+  DE: 0.86
+  FL: 0.98
+  GA: 0.73
+  HI: 0.88
+  IA: 0.84
+  ID: 0.78
+  IL: 0.85
+  IN: 0.99
+  KS: 0.92
+  KY: 0.87
+  LA: 0.79
+  MA: 0.94
+  MD: 0.95
+  ME: 0.92
+  MI: 0.91
+  MN: 0.89
+  MO: 0.89
+  MS: 0.75
+  MT: 0.83
+  NC: 0.94
+  ND: 0.91
+  NE: 0.79
+  NH: 0.84
+  NJ: 0.74
+  NM: 0.84
+  NV: 0.93
+  NY: 0.86
+  OH: 0.82
+  OK: 0.77
+  OR: 0.92
+  PA: 0.64
+  RI: 0.94
+  SC: 0.93
+  SD: 0.88
+  TN: 0.92
+  TX: 0.76
+  UT: 0.53
+  VA: 0.82
+  VT: 0.93
+  WA: 0.98
+  WI: 0.91
+  WV: 0.83
+  WY: 0.70
diff --git a/policyengine_us_data/parameters/take_up/snap.yaml b/policyengine_us_data/parameters/take_up/snap.yaml
@@ -0,0 +1,9 @@
+description: Percentage of eligible SNAP recipients who claim SNAP.
+metadata:
+  label: SNAP takeup rate
+  unit: /1
+  reference:
+    - title: USDA
+      href: https://www.fns.usda.gov/usamap
+values:
+  2018-01-01: 0.82
diff --git a/policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml b/policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml
@@ -0,0 +1,10 @@
+description: Proportion of SSI-aged-blind-disabled recipients who meet the asset test.
+metadata:
+  label: SSI resource test pass rate
+  unit: /1
+  period: year
+  reference:
+    - title: SSI resource test pass rate from policyengine-us
+      href: https://github.com/PolicyEngine/policyengine-us
+values:
+  2018-01-01: 0.4
diff --git a/policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml b/policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml
@@ -0,0 +1,13 @@
+rates_by_category:
+  PREGNANT:
+    1980: 0.913
+  POSTPARTUM:
+    1980: 0.933
+  BREASTFEEDING:
+    1980: 0.889
+  INFANT:
+    1980: 0.95
+  CHILD:
+    1980: 0.752
+  NONE:
+    1980: 0
diff --git a/policyengine_us_data/parameters/take_up/wic_takeup.yaml b/policyengine_us_data/parameters/take_up/wic_takeup.yaml
diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/test_stochastic_variables.py
diff --git a/policyengine_us_data/utils/randomness.py b/policyengine_us_data/utils/randomness.py