|
77 | 77 | ), |
78 | 78 | } |
79 | 79 |
|
# Group 2 build scripts, named once so the CPS-then-PUF phase can refer to
# them by constant. Each path is also used as a key into SCRIPT_OUTPUTS.
CPS_BUILD_SCRIPT = "policyengine_us_data/datasets/cps/cps.py"
PUF_BUILD_SCRIPT = "policyengine_us_data/datasets/puf/puf.py"
| 82 | + |
80 | 83 | # Test modules to run individually for checkpoint tracking |
81 | 84 | TEST_MODULES = [ |
82 | 85 | "tests/unit/", |
@@ -314,6 +317,25 @@ def run_script_with_checkpoint( |
314 | 317 | return script_path |
315 | 318 |
|
316 | 319 |
|
def run_cps_then_puf_phase(
    branch: str,
    volume: modal.Volume,
    *,
    env: dict,
    log_file: IO = None,
) -> None:
    """Run the CPS build script, then the PUF build script, one at a time.

    The order is deliberate: PUF pension imputation loads CPS_2024, so the
    PUF script must not start until the CPS script has finished.

    Args:
        branch: Forwarded unchanged to ``run_script_with_checkpoint``.
        volume: Forwarded unchanged to ``run_script_with_checkpoint``.
        env: Forwarded unchanged as the ``env`` keyword argument.
        log_file: Forwarded unchanged as the ``log_file`` keyword argument.

    Raises:
        Whatever ``run_script_with_checkpoint`` raises. If the CPS step
        raises, the exception propagates and the PUF step is never started.
    """
    # Order matters here: CPS must come first (see docstring).
    ordered_scripts = (CPS_BUILD_SCRIPT, PUF_BUILD_SCRIPT)
    for build_script in ordered_scripts:
        expected_output = SCRIPT_OUTPUTS[build_script]
        run_script_with_checkpoint(
            build_script,
            expected_output,
            branch,
            volume,
            env=env,
            log_file=log_file,
        )
| 337 | + |
| 338 | + |
317 | 339 | def run_tests_with_checkpoints( |
318 | 340 | branch: str, |
319 | 341 | volume: modal.Volume, |
@@ -508,34 +530,16 @@ def build_datasets( |
508 | 530 | for future in as_completed(futures): |
509 | 531 | future.result() # Raises if script failed |
510 | 532 |
|
511 | | - # GROUP 2: Depends on Group 1 - run in parallel |
512 | | - # cps.py needs acs, puf.py needs irs_puf + uprating |
513 | | - print("=== Phase 2: Building CPS and PUF (parallel) ===") |
514 | | - group2 = [ |
515 | | - ( |
516 | | - "policyengine_us_data/datasets/cps/cps.py", |
517 | | - SCRIPT_OUTPUTS["policyengine_us_data/datasets/cps/cps.py"], |
518 | | - ), |
519 | | - ( |
520 | | - "policyengine_us_data/datasets/puf/puf.py", |
521 | | - SCRIPT_OUTPUTS["policyengine_us_data/datasets/puf/puf.py"], |
522 | | - ), |
523 | | - ] |
524 | | - with ThreadPoolExecutor(max_workers=2) as executor: |
525 | | - futures = { |
526 | | - executor.submit( |
527 | | - run_script_with_checkpoint, |
528 | | - script, |
529 | | - output, |
530 | | - branch, |
531 | | - checkpoint_volume, |
532 | | - env=env, |
533 | | - log_file=log_file, |
534 | | - ): script |
535 | | - for script, output in group2 |
536 | | - } |
537 | | - for future in as_completed(futures): |
538 | | - future.result() |
| 533 | + # GROUP 2: Depends on Group 1 - run sequentially. |
| 534 | + # puf.py pension imputation can instantiate CPS_2024, so it must |
| 535 | + # not run while cps.py is writing cps_2024.h5. |
| 536 | + print("=== Phase 2: Building CPS then PUF (sequential) ===") |
| 537 | + run_cps_then_puf_phase( |
| 538 | + branch, |
| 539 | + checkpoint_volume, |
| 540 | + env=env, |
| 541 | + log_file=log_file, |
| 542 | + ) |
539 | 543 |
|
540 | 544 | # SEQUENTIAL: Extended CPS (needs both cps and puf) |
541 | 545 | print("=== Phase 3: Building extended CPS ===") |
|
0 commit comments