@@ -468,49 +468,27 @@ def build_datasets(
468468 for future in as_completed (futures ):
469469 future .result () # Raises if script failed
470470
471- # GROUP 2: Depends on Group 1 - run in parallel
472- # cps.py needs acs, puf.py needs irs_puf + uprating
473- print ("=== Phase 2: Building CPS and PUF (parallel) ===" )
474- group2 = [
475- (
476- "policyengine_us_data/datasets/cps/cps.py" ,
477- SCRIPT_OUTPUTS ["policyengine_us_data/datasets/cps/cps.py" ],
478- ),
479- (
480- "policyengine_us_data/datasets/puf/puf.py" ,
481- SCRIPT_OUTPUTS ["policyengine_us_data/datasets/puf/puf.py" ],
482- ),
483- ]
484- with ThreadPoolExecutor (max_workers = 2 ) as executor :
485- futures = {
486- executor .submit (
487- run_script_with_checkpoint ,
488- script ,
489- output ,
490- branch ,
491- checkpoint_volume ,
492- env = env ,
493- log_file = log_file ,
494- ): script
495- for script , output in group2
496- }
497- for future in as_completed (futures ):
498- future .result ()
499-
500- # SEQUENTIAL: Extended CPS (needs both cps and puf)
501- print ("=== Phase 3: Building extended CPS ===" )
502- run_script_with_checkpoint (
471+ # GROUP 2: Sequential chain — each step depends on the previous.
472+ # cps.py needs acs; puf.py needs irs_puf + uprating + cps
473+ # (pension imputation); extended_cps.py needs both cps and puf.
474+ print ("=== Phase 2: Building CPS → PUF → extended CPS ===" )
475+ for script in (
476+ "policyengine_us_data/datasets/cps/cps.py" ,
477+ "policyengine_us_data/datasets/puf/puf.py" ,
503478 "policyengine_us_data/datasets/cps/extended_cps.py" ,
504- SCRIPT_OUTPUTS ["policyengine_us_data/datasets/cps/extended_cps.py" ],
505- branch ,
506- checkpoint_volume ,
507- env = env ,
508- log_file = log_file ,
509- )
479+ ):
480+ run_script_with_checkpoint (
481+ script ,
482+ SCRIPT_OUTPUTS [script ],
483+ branch ,
484+ checkpoint_volume ,
485+ env = env ,
486+ log_file = log_file ,
487+ )
510488
511489 # GROUP 3: After extended_cps - run in parallel
512490 # enhanced_cps and stratified_cps both depend on extended_cps
513- print ("=== Phase 4 : Building enhanced and stratified CPS (parallel) ===" )
491+ print ("=== Phase 3 : Building enhanced and stratified CPS (parallel) ===" )
514492 phase4_futures = []
515493 with ThreadPoolExecutor (max_workers = 2 ) as executor :
516494 if not skip_enhanced_cps :
@@ -545,11 +523,11 @@ def build_datasets(
545523 for future in as_completed (phase4_futures ):
546524 future .result ()
547525
548- # GROUP 4: After Phase 4 - run in parallel
526+ # GROUP 4: After Phase 3 - run in parallel
549527 # create_source_imputed_cps needs stratified_cps
550528 # small_enhanced_cps needs enhanced_cps
551529 print (
552- "=== Phase 5 : Building source imputed CPS "
530+ "=== Phase 4 : Building source imputed CPS "
553531 "and small enhanced CPS (parallel) ==="
554532 )
555533 phase5_futures = []
0 commit comments