Avoid cross-country imports in simulation worker

MaxGhenis · web-flow · commit 630ee2a70322 · 2026-05-12T07:43:20.000-04:00
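Make the simulation worker avoid top-level country imports by loading only the requested country's module on demand, and sync the Hugging Face token into a Modal secret attached to the worker functions so they can read private country data manifests.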
diff --git a/.github/scripts/modal-sync-secrets.sh b/.github/scripts/modal-sync-secrets.sh
@@ -62,6 +62,11 @@ if [ -n "${GCP_CREDENTIALS_JSON:-}" ]; then
     --force || true
 fi
 
+uv run modal secret create policyengine-data-credentials \
+  "HUGGING_FACE_TOKEN=${HUGGING_FACE_TOKEN:-}" \
+  --env="$MODAL_ENV" \
+  --force || true
+
 # Sync gateway auth config. The gateway runtime only needs issuer/audience and
 # the explicit requirement flag; client credentials stay on the GitHub side and
 # are only used to mint integration-test tokens.
diff --git a/.github/workflows/modal-deploy.reusable.yml b/.github/workflows/modal-deploy.reusable.yml
@@ -55,6 +55,7 @@ jobs:
         MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
         LOGFIRE_TOKEN: ${{ secrets.LOGFIRE_TOKEN }}
         GCP_CREDENTIALS_JSON: ${{ secrets.GCP_CREDENTIALS_JSON }}
+        HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
         GATEWAY_AUTH_ISSUER: ${{ secrets.GATEWAY_AUTH_ISSUER }}
         GATEWAY_AUTH_AUDIENCE: ${{ secrets.GATEWAY_AUTH_AUDIENCE }}
         GATEWAY_AUTH_CLIENT_ID: ${{ secrets.GATEWAY_AUTH_CLIENT_ID }}
diff --git a/projects/policyengine-api-simulation/src/modal/app.py b/projects/policyengine-api-simulation/src/modal/app.py
@@ -39,6 +39,7 @@ def get_app_name(us_version: str, uk_version: str) -> str:
 # Secrets
 # GCP credentials are shared across environments (always from main)
 gcp_secret = modal.Secret.from_name("gcp-credentials", environment_name="main")
+data_secret = modal.Secret.from_name("policyengine-data-credentials")
 # Logfire secret is environment-specific
 logfire_secret = modal.Secret.from_name("policyengine-logfire")
 
@@ -80,7 +81,7 @@ def configure_logfire(service_name: str = "policyengine-simulation"):
     timeout=3600,
     retries=0,
     max_containers=100,
-    secrets=[gcp_secret, logfire_secret],
+    secrets=[gcp_secret, data_secret, logfire_secret],
 )
 def run_simulation(params: dict) -> dict:
     """
@@ -118,7 +119,7 @@ def run_simulation(params: dict) -> dict:
     timeout=3600,
     retries=0,
     max_containers=100,
-    secrets=[gcp_secret, logfire_secret],
+    secrets=[gcp_secret, data_secret, logfire_secret],
 )
 def run_budget_window_batch(params: dict) -> dict:
     """Execute a multi-year budget-window batch orchestration."""
diff --git a/projects/policyengine-api-simulation/src/modal/simulation.py b/projects/policyengine-api-simulation/src/modal/simulation.py
@@ -10,8 +10,14 @@
 import logging
 import os
 import tempfile
+import importlib
 from typing import Any, Iterator
 
+# policyengine.core is imported for every simulation. Without this guard,
+# importing the package pulls both country modules into the process; a US run
+# can then fail before it starts if UK private-data credentials are absent.
+os.environ.setdefault("POLICYENGINE_SKIP_COUNTRY_IMPORTS", "1")
+
 try:
     from src.modal.telemetry import split_internal_payload
 except ModuleNotFoundError:
@@ -236,13 +242,11 @@ def group_subset(entity: str):
 
 
 def _country_module(country: str):
-    import policyengine as pe
-
     country = country.lower()
     if country == "us":
-        return pe.us
+        return importlib.import_module("policyengine.tax_benefit_models.us")
     if country == "uk":
-        return pe.uk
+        return importlib.import_module("policyengine.tax_benefit_models.uk")
     raise ValueError(f"Unsupported country: {country}")
 
 
@@ -327,18 +331,10 @@ def _budget_result(country: str, baseline, reform) -> dict[str, float]:
 
 
 def _poverty_result(country: str, baseline, reform) -> dict[str, list[dict[str, Any]]]:
-    import policyengine as pe
-
-    if country == "us":
-        baseline_poverty = pe.us.economic_impact_analysis(
-            baseline, reform
-        ).baseline_poverty
-        reform_poverty = pe.us.economic_impact_analysis(baseline, reform).reform_poverty
-    else:
-        baseline_poverty = pe.uk.economic_impact_analysis(
-            baseline, reform
-        ).baseline_poverty
-        reform_poverty = pe.uk.economic_impact_analysis(baseline, reform).reform_poverty
+    country_module = _country_module(country)
+    impact = country_module.economic_impact_analysis(baseline, reform)
+    baseline_poverty = impact.baseline_poverty
+    reform_poverty = impact.reform_poverty
 
     return {
         "baseline": baseline_poverty.dataframe.to_dict("records"),
@@ -347,12 +343,8 @@ def _poverty_result(country: str, baseline, reform) -> dict[str, list[dict[str,
 
 
 def _analysis_result(country: str, baseline, reform) -> dict[str, Any]:
-    import policyengine as pe
-
-    if country == "us":
-        analysis = pe.us.economic_impact_analysis(baseline, reform)
-    else:
-        analysis = pe.uk.economic_impact_analysis(baseline, reform)
+    country_module = _country_module(country)
+    analysis = country_module.economic_impact_analysis(baseline, reform)
 
     return {
         "decile_impacts": analysis.decile_impacts.dataframe.to_dict("records"),