Skip to content

Commit 0633a53

Browse files
baogorekclaude
andcommitted
Disable preemption on all Modal functions, log fc IDs at spawn points
Preemptible spot instances caused silent worker terminations that left the pipeline hanging with no clear diagnostic trail. Every function except pipeline_status (read-only, 60s) is now nonpreemptible. Spawn points now print function-call IDs for coordinate_publish workers, fit_weights, and H5 build orchestrators. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0e192eb commit 0633a53

3 files changed

Lines changed: 24 additions & 0 deletions

File tree

modal_app/local_area.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ def run_phase(
276276
calibration_inputs=calibration_inputs,
277277
validate=validate,
278278
)
279+
print(f" → fc: {handle.object_id}")
279280
handles.append(handle)
280281

281282
print(f"Waiting for {phase_name} workers to complete...")
@@ -337,6 +338,7 @@ def run_phase(
337338
memory=16384,
338339
cpu=4.0,
339340
timeout=14400,
341+
nonpreemptible=True,
340342
)
341343
def build_areas_worker(
342344
branch: str,
@@ -428,6 +430,7 @@ def build_areas_worker(
428430
volumes={VOLUME_MOUNT: staging_volume},
429431
memory=4096,
430432
timeout=1800,
433+
nonpreemptible=True,
431434
)
432435
def validate_staging(branch: str, version: str) -> Dict:
433436
"""Validate all expected files and generate manifest."""
@@ -480,6 +483,7 @@ def validate_staging(branch: str, version: str) -> Dict:
480483
volumes={VOLUME_MOUNT: staging_volume},
481484
memory=8192,
482485
timeout=14400,
486+
nonpreemptible=True,
483487
)
484488
def upload_to_staging(branch: str, version: str, manifest: Dict) -> str:
485489
"""
@@ -551,6 +555,7 @@ def upload_to_staging(branch: str, version: str, manifest: Dict) -> str:
551555
volumes={VOLUME_MOUNT: staging_volume},
552556
memory=4096,
553557
timeout=3600,
558+
nonpreemptible=True,
554559
)
555560
def promote_publish(branch: str = "main", version: str = "") -> str:
556561
"""
@@ -1084,6 +1089,7 @@ def main_national(branch: str = "main", n_clones: int = 430):
10841089
volumes={VOLUME_MOUNT: staging_volume},
10851090
memory=4096,
10861091
timeout=3600,
1092+
nonpreemptible=True,
10871093
)
10881094
def promote_national_publish(
10891095
branch: str = "main",

modal_app/pipeline.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,7 @@ def run_pipeline(
828828
lambda_l2=1e-8,
829829
log_freq=500,
830830
)
831+
print(f" → regional fit fc: {regional_handle.object_id}")
831832

832833
# Spawn national fit (if enabled)
833834
national_handle = None
@@ -848,6 +849,7 @@ def run_pipeline(
848849
lambda_l2=1e-12,
849850
log_freq=500,
850851
)
852+
print(f" → national fit fc: {national_handle.object_id}")
851853

852854
# Collect regional results
853855
print(" Waiting for regional fit...")
@@ -929,6 +931,7 @@ def run_pipeline(
929931
n_clones=n_clones,
930932
validate=True,
931933
)
934+
print(f" → coordinate_publish fc: {regional_h5_handle.object_id}")
932935

933936
national_h5_handle = None
934937
if not skip_national:
@@ -938,6 +941,9 @@ def run_pipeline(
938941
n_clones=n_clones,
939942
validate=True,
940943
)
944+
print(
945+
f" → coordinate_national_publish fc: {national_h5_handle.object_id}"
946+
)
941947

942948
# While H5 builds run, stage base datasets
943949
# and upload diagnostics in this container

modal_app/remote_calibration_runner.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,7 @@ def _build_package_impl(
441441
cpu=8.0,
442442
timeout=50400,
443443
volumes={PIPELINE_MOUNT: pipeline_vol},
444+
nonpreemptible=True,
444445
)
445446
def build_package_remote(
446447
branch: str = "main",
@@ -462,6 +463,7 @@ def build_package_remote(
462463
image=image,
463464
timeout=30,
464465
volumes={PIPELINE_MOUNT: pipeline_vol},
466+
nonpreemptible=True,
465467
)
466468
def check_volume_package() -> dict:
467469
"""Check if a calibration package exists on the volume.
@@ -515,6 +517,7 @@ def check_volume_package() -> dict:
515517
gpu="T4",
516518
timeout=14400,
517519
volumes={PIPELINE_MOUNT: pipeline_vol},
520+
nonpreemptible=True,
518521
)
519522
def fit_weights_t4(
520523
branch: str = "main",
@@ -550,6 +553,7 @@ def fit_weights_t4(
550553
gpu="A10",
551554
timeout=14400,
552555
volumes={PIPELINE_MOUNT: pipeline_vol},
556+
nonpreemptible=True,
553557
)
554558
def fit_weights_a10(
555559
branch: str = "main",
@@ -585,6 +589,7 @@ def fit_weights_a10(
585589
gpu="A100-40GB",
586590
timeout=14400,
587591
volumes={PIPELINE_MOUNT: pipeline_vol},
592+
nonpreemptible=True,
588593
)
589594
def fit_weights_a100_40(
590595
branch: str = "main",
@@ -620,6 +625,7 @@ def fit_weights_a100_40(
620625
gpu="A100-80GB",
621626
timeout=14400,
622627
volumes={PIPELINE_MOUNT: pipeline_vol},
628+
nonpreemptible=True,
623629
)
624630
def fit_weights_a100_80(
625631
branch: str = "main",
@@ -655,6 +661,7 @@ def fit_weights_a100_80(
655661
gpu="H100",
656662
timeout=14400,
657663
volumes={PIPELINE_MOUNT: pipeline_vol},
664+
nonpreemptible=True,
658665
)
659666
def fit_weights_h100(
660667
branch: str = "main",
@@ -701,6 +708,7 @@ def fit_weights_h100(
701708
gpu="T4",
702709
timeout=14400,
703710
volumes={PIPELINE_MOUNT: pipeline_vol},
711+
nonpreemptible=True,
704712
)
705713
def fit_from_package_t4(
706714
branch: str = "main",
@@ -733,6 +741,7 @@ def fit_from_package_t4(
733741
gpu="A10",
734742
timeout=14400,
735743
volumes={PIPELINE_MOUNT: pipeline_vol},
744+
nonpreemptible=True,
736745
)
737746
def fit_from_package_a10(
738747
branch: str = "main",
@@ -765,6 +774,7 @@ def fit_from_package_a10(
765774
gpu="A100-40GB",
766775
timeout=14400,
767776
volumes={PIPELINE_MOUNT: pipeline_vol},
777+
nonpreemptible=True,
768778
)
769779
def fit_from_package_a100_40(
770780
branch: str = "main",
@@ -797,6 +807,7 @@ def fit_from_package_a100_40(
797807
gpu="A100-80GB",
798808
timeout=14400,
799809
volumes={PIPELINE_MOUNT: pipeline_vol},
810+
nonpreemptible=True,
800811
)
801812
def fit_from_package_a100_80(
802813
branch: str = "main",
@@ -829,6 +840,7 @@ def fit_from_package_a100_80(
829840
gpu="H100",
830841
timeout=14400,
831842
volumes={PIPELINE_MOUNT: pipeline_vol},
843+
nonpreemptible=True,
832844
)
833845
def fit_from_package_h100(
834846
branch: str = "main",

0 commit comments

Comments
 (0)