1515import subprocess
1616import sys
1717import json
18+ import traceback
1819import modal
1920from pathlib import Path
2021from typing import List , Dict
@@ -244,6 +245,10 @@ def run_phase(
244245 for i , handle in enumerate (handles ):
245246 try :
246247 result = handle .get ()
248+ if result is None :
249+ all_errors .append ({"worker" : i , "error" : "Worker returned None" })
250+ print (f" Worker { i } : returned None (no results)" )
251+ continue
247252 all_results .append (result )
248253 print (
249254 f" Worker { i } : { len (result ['completed' ])} completed, "
@@ -257,7 +262,7 @@ def run_phase(
257262 all_validation_rows .extend (v_rows )
258263 print (f" Worker { i } : { len (v_rows )} validation rows" )
259264 except Exception as e :
260- all_errors .append ({"worker" : i , "error" : str (e )})
265+ all_errors .append ({"worker" : i , "error" : str (e ), "traceback" : traceback . format_exc () })
261266 print (f" Worker { i } : CRASHED - { e } " )
262267
263268 total_completed = sum (len (r ["completed" ]) for r in all_results )
@@ -277,7 +282,7 @@ def run_phase(
277282 if all_errors :
278283 print (f"\n Errors ({ len (all_errors )} ):" )
279284 for err in all_errors [:5 ]:
280- err_msg = err .get ("error" , "Unknown" )[:100 ]
285+ err_msg = str ( err .get ("error" ) or "Unknown" )[:200 ]
281286 print (f" - { err .get ('item' , err .get ('worker' ))} : { err_msg } " )
282287 if len (all_errors ) > 5 :
283288 print (f" ... and { len (all_errors ) - 5 } more" )
@@ -355,15 +360,17 @@ def build_areas_worker(
355360 result = subprocess .run (
356361 worker_cmd ,
357362 stdout = subprocess .PIPE ,
363+ stderr = subprocess .PIPE ,
358364 text = True ,
359365 env = os .environ .copy (),
360366 )
361367
362368 if result .returncode != 0 :
369+ print (f"Worker stderr:\n { result .stderr } " , file = __import__ ('sys' ).stderr )
363370 return {
364371 "completed" : [],
365372 "failed" : [f"{ item ['type' ]} :{ item ['id' ]} " for item in work_items ],
366- "errors" : [{"error" : result .stderr }],
373+ "errors" : [{"error" : ( result .stderr or "No stderr" )[: 2000 ] }],
367374 }
368375
369376 try :
@@ -621,6 +628,7 @@ def coordinate_publish(
621628 n_clones : int = 430 ,
622629 validate : bool = True ,
623630 run_id : str = "" ,
631+ expected_fingerprint : str = "" ,
624632) -> Dict :
625633 """Coordinate the full publishing workflow."""
626634 setup_gcp_credentials ()
@@ -676,43 +684,77 @@ def coordinate_publish(
676684 }
677685 validate_artifacts (config_json_path , artifacts )
678686
687+ if validate :
688+ try :
689+ from sqlalchemy import create_engine as _create_engine
690+ from policyengine_us_data .calibration .validate_staging import (
691+ _query_all_active_targets ,
692+ )
693+ _test_engine = _create_engine (f"sqlite:///{ db_path } " )
694+ _df = _query_all_active_targets (_test_engine , 2024 )
695+ print (f"Validation pre-flight OK: { len (_df )} targets queryable" )
696+ _test_engine .dispose ()
697+ except Exception as e :
698+ print (f"WARNING: Validation pre-flight failed: { e } " )
699+ print ("Disabling validation to protect H5 builds" )
700+ validate = False
701+
679702 # Fingerprint-based cache invalidation
680- fp_result = subprocess .run (
681- [
682- "uv" ,
683- "run" ,
684- "python" ,
685- "-c" ,
686- f"""
703+ if expected_fingerprint :
704+ fingerprint = expected_fingerprint
705+ print (f"Using pinned fingerprint from pipeline: { fingerprint } " )
706+ else :
707+ fp_result = subprocess .run (
708+ [
709+ "uv" ,
710+ "run" ,
711+ "python" ,
712+ "-c" ,
713+ f"""
687714from policyengine_us_data.calibration.publish_local_area import (
688715 compute_input_fingerprint,
689716)
690717print(compute_input_fingerprint("{ weights_path } ", "{ dataset_path } ", { n_clones } , seed=42))
691718""" ,
692- ],
693- capture_output = True ,
694- text = True ,
695- env = os .environ .copy (),
696- )
697- if fp_result .returncode != 0 :
698- raise RuntimeError (f"Failed to compute fingerprint: { fp_result .stderr } " )
699- fingerprint = fp_result .stdout .strip ()
719+ ],
720+ capture_output = True ,
721+ text = True ,
722+ env = os .environ .copy (),
723+ )
724+ if fp_result .returncode != 0 :
725+ raise RuntimeError (f"Failed to compute fingerprint: { fp_result .stderr } " )
726+ fingerprint = fp_result .stdout .strip ()
700727 fingerprint_file = version_dir / "fingerprint.json"
701728 if version_dir .exists ():
729+ h5_count = len (list (version_dir .rglob ("*.h5" )))
702730 if fingerprint_file .exists ():
703731 stored = json .loads (fingerprint_file .read_text ())
704732 if stored .get ("fingerprint" ) == fingerprint :
705733 print (f"Inputs unchanged ({ fingerprint } ), resuming..." )
706734 else :
735+ if h5_count > 0 :
736+ print (
737+ f"WARNING: Inputs changed "
738+ f"({ stored .get ('fingerprint' )} -> { fingerprint } ) "
739+ f"but { h5_count } H5 files exist. "
740+ f"Updating fingerprint and resuming."
741+ )
742+ else :
743+ print (
744+ f"Inputs changed "
745+ f"({ stored .get ('fingerprint' )} -> { fingerprint } ), "
746+ f"clearing empty directory..."
747+ )
748+ shutil .rmtree (version_dir )
749+ else :
750+ if h5_count > 0 :
707751 print (
708- f"Inputs changed "
709- f"({ stored .get ('fingerprint' )} -> { fingerprint } ), "
710- f"rebuilding..."
752+ f"WARNING: No fingerprint found but { h5_count } H5 files exist. "
753+ f"Writing fingerprint and resuming."
711754 )
755+ else :
756+ print ("No fingerprint found, clearing empty stale directory..." )
712757 shutil .rmtree (version_dir )
713- else :
714- print ("No fingerprint found, clearing stale directory..." )
715- shutil .rmtree (version_dir )
716758 version_dir .mkdir (parents = True , exist_ok = True )
717759 fingerprint_file .write_text (json .dumps ({"fingerprint" : fingerprint }))
718760 staging_volume .commit ()
@@ -834,6 +876,7 @@ def coordinate_publish(
834876 return {
835877 "message" : (f"Build complete for version { version } . Upload skipped." ),
836878 "validation_rows" : accumulated_validation_rows ,
879+ "fingerprint" : fingerprint ,
837880 }
838881
839882 print ("\n Validating staging..." )
@@ -869,6 +912,7 @@ def coordinate_publish(
869912 "message" : result ,
870913 "run_id" : run_id ,
871914 "validation_rows" : accumulated_validation_rows ,
915+ "fingerprint" : fingerprint ,
872916 }
873917
874918
0 commit comments