Skip to content

Commit d2c10ec

Browse files
authored
Make long-run Modal retries preserve outputs (#1018)
* Make long-run Modal retries preserve outputs * Add long-run preemption changelog
1 parent ef0f2d4 commit d2c10ec

3 files changed

Lines changed: 105 additions & 5 deletions

File tree

changelog.d/1018.fixed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Preserve resumable long-run Modal projection artifacts across preemption retries.

modal_app/long_term_projection.py

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,45 @@ def _output_files(output_dir: Path) -> list[str]:
251251
)
252252

253253

254+
def _has_resume_artifacts(output_dir: Path) -> bool:
255+
"""Return whether deleting output_dir would lose resumable long-run work."""
256+
if not output_dir.exists():
257+
return False
258+
progress_patterns = (
259+
"*.h5",
260+
"*.h5.metadata.json",
261+
"calibration_manifest.json",
262+
"long_run_production_manifest.json",
263+
".parallel_tmp/*/*.h5",
264+
".parallel_tmp/*/*.h5.metadata.json",
265+
".parallel_tmp/*/calibration_manifest.json",
266+
".parallel_logs/*.log",
267+
)
268+
return any(
269+
next(output_dir.glob(pattern), None) is not None
270+
for pattern in progress_patterns
271+
)
272+
273+
274+
def _prepare_output_dir(
275+
output_dir: Path,
276+
*,
277+
clear_output: bool,
278+
) -> None:
279+
if clear_output and output_dir.exists():
280+
if _has_resume_artifacts(output_dir):
281+
print(
282+
"clear_output requested, but existing long-run artifacts were "
283+
f"found in {output_dir}; preserving them for resumable execution. "
284+
"Use a separate explicit Modal volume removal command for an "
285+
"intentional destructive restart.",
286+
flush=True,
287+
)
288+
else:
289+
shutil.rmtree(output_dir)
290+
output_dir.mkdir(parents=True, exist_ok=True)
291+
292+
254293
def _commit_output_volume(*, suppress_errors: bool = False) -> None:
255294
try:
256295
output_volume.commit()
@@ -286,7 +325,7 @@ def build_long_term_projection(
286325
upload_to_hf_staging: bool = False,
287326
allow_validation_failures: bool = False,
288327
keep_temp: bool = False,
289-
clear_output: bool = True,
328+
clear_output: bool = False,
290329
support_augmentation_profile: str = "",
291330
support_augmentation_target_year: int | None = None,
292331
support_augmentation_align_to_run_year: bool = False,
@@ -306,9 +345,10 @@ def build_long_term_projection(
306345

307346
run_id = sanitize_run_id(run_id)
308347
output_dir = _OUTPUT_MOUNT / run_id
309-
if clear_output and output_dir.exists():
310-
shutil.rmtree(output_dir)
311-
output_dir.mkdir(parents=True, exist_ok=True)
348+
_prepare_output_dir(
349+
output_dir,
350+
clear_output=clear_output,
351+
)
312352

313353
command = _build_command(
314354
years=years,
@@ -384,7 +424,7 @@ def main(
384424
upload_to_hf_staging: bool = False,
385425
allow_validation_failures: bool = False,
386426
keep_temp: bool = False,
387-
clear_output: bool = True,
427+
clear_output: bool = False,
388428
support_augmentation_profile: str = "",
389429
support_augmentation_target_year: int | None = None,
390430
support_augmentation_align_to_run_year: bool = False,

tests/unit/test_long_term_modal_projection.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ def fake_spawn(**kwargs):
176176
assert captured_kwargs["run_id"] == "crfb-sentinel"
177177
assert captured_kwargs["source_sha"] == "abc123"
178178
assert captured_kwargs["upload_to_hf_staging"] is False
179+
assert captured_kwargs["clear_output"] is False
179180

180181

181182
def test_main_refuses_dirty_source_by_default(monkeypatch):
@@ -245,3 +246,61 @@ def test_remote_result_reports_hf_staging_prefix(monkeypatch, tmp_path):
245246
assert result["run_id"] == "run-123"
246247
assert result["hf_staging_prefix"] == "staging/abc123-run-123/long_term"
247248
assert Path(result["output_dir"]) == tmp_path / "run-123"
249+
250+
251+
def test_remote_preserves_resume_artifacts_when_clear_output_requested(
252+
monkeypatch,
253+
tmp_path,
254+
):
255+
long_term = _load_long_term_projection_module(monkeypatch)
256+
monkeypatch.setattr(long_term, "_OUTPUT_MOUNT", tmp_path)
257+
258+
output_dir = tmp_path / "run-123"
259+
resume_dir = output_dir / ".parallel_tmp" / "2026"
260+
resume_dir.mkdir(parents=True)
261+
completed_h5 = resume_dir / "2026.h5"
262+
completed_h5.write_text("already-complete", encoding="utf-8")
263+
(resume_dir / "2026.h5.metadata.json").write_text("{}", encoding="utf-8")
264+
(resume_dir / "calibration_manifest.json").write_text("{}", encoding="utf-8")
265+
266+
def fake_stream_command(command, env):
267+
assert completed_h5.exists()
268+
269+
monkeypatch.setattr(long_term, "_stream_command", fake_stream_command)
270+
271+
result = long_term.build_long_term_projection(
272+
years="2026-2100",
273+
run_id="Run 123",
274+
source_sha="abc123",
275+
clear_output=True,
276+
)
277+
278+
assert completed_h5.read_text(encoding="utf-8") == "already-complete"
279+
assert ".parallel_tmp/2026/2026.h5" in result["files"]
280+
281+
282+
def test_remote_clear_output_can_remove_non_resumable_scratch(
283+
monkeypatch,
284+
tmp_path,
285+
):
286+
long_term = _load_long_term_projection_module(monkeypatch)
287+
monkeypatch.setattr(long_term, "_OUTPUT_MOUNT", tmp_path)
288+
289+
output_dir = tmp_path / "run-123"
290+
output_dir.mkdir(parents=True)
291+
scratch_file = output_dir / "scratch.txt"
292+
scratch_file.write_text("scratch", encoding="utf-8")
293+
294+
def fake_stream_command(command, env):
295+
assert not scratch_file.exists()
296+
297+
monkeypatch.setattr(long_term, "_stream_command", fake_stream_command)
298+
299+
result = long_term.build_long_term_projection(
300+
years="2026-2100",
301+
run_id="Run 123",
302+
source_sha="abc123",
303+
clear_output=True,
304+
)
305+
306+
assert "scratch.txt" not in result["files"]

0 commit comments

Comments
 (0)