Skip to content

Commit 50e9b7f

Browse files
anth-volkclaude
andcommitted
Fix checkpoint system for cross-container --script mode
Three bugs fixed: 1. Restore to STORAGE_FOLDER, not source tree: Dataset classes resolve file_path via the installed package's STORAGE_FOLDER (in .venv/site-packages/), but restore_from_checkpoint wrote to the source-tree relative path. Now restores to both locations so both subprocess scripts and Dataset class lookups find files. 2. Add volume.reload() in run_single_script and run_integration_test: Without reload, containers see stale volume state and miss files written by prior --script calls. 3. Preserve full path in checkpoint keys: get_checkpoint_path now uses the full relative path (e.g., calibration/policy_data.db) instead of just the filename, preventing potential collisions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 32be41e commit 50e9b7f

1 file changed

Lines changed: 78 additions & 7 deletions

File tree

modal_app/data_build.py

Lines changed: 78 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -167,10 +167,32 @@ def get_current_commit() -> str:
167167
return "unknown"
168168

169169

170+
def _get_storage_folder() -> Path:
171+
"""Resolve the installed package's STORAGE_FOLDER path.
172+
173+
This is where Dataset classes (CPS_2024, etc.) look for H5 files.
174+
In an editable install it matches the source tree; in a regular
175+
install it's inside .venv/lib/.../site-packages/.
176+
"""
177+
try:
178+
from policyengine_us_data.storage import STORAGE_FOLDER
179+
180+
return Path(STORAGE_FOLDER)
181+
except ImportError:
182+
# Fallback if package not importable (shouldn't happen in
183+
# the Modal image, but safe for local dev)
184+
return Path("policyengine_us_data/storage")
185+
186+
170187
def get_checkpoint_path(branch: str, output_file: str) -> Path:
171-
"""Get the checkpoint path for an output file, scoped by branch and commit."""
188+
"""Get the checkpoint path for an output file, scoped by branch and commit.
189+
190+
Preserves the relative path structure to avoid filename collisions
191+
(e.g., calibration/policy_data.db stays distinct from policy_data.db).
192+
"""
172193
commit = get_current_commit()
173-
return Path(VOLUME_MOUNT) / branch / commit / Path(output_file).name
194+
# Use the relative path as-is (not just filename) to avoid collisions
195+
return Path(VOLUME_MOUNT) / branch / commit / output_file
174196

175197

176198
def is_checkpointed(branch: str, output_file: str) -> bool:
@@ -183,13 +205,44 @@ def is_checkpointed(branch: str, output_file: str) -> bool:
183205
return False
184206

185207

208+
def _resolve_local_path(output_file: str) -> Path:
209+
"""Resolve where a checkpointed file should be restored to.
210+
211+
Maps the relative source-tree path to the installed package's
212+
STORAGE_FOLDER so that Dataset classes can find the files.
213+
"""
214+
output_path = Path(output_file)
215+
storage_folder = _get_storage_folder()
216+
217+
# Files under policyengine_us_data/storage/ get mapped to
218+
# the installed package's STORAGE_FOLDER
219+
storage_prefix = Path("policyengine_us_data/storage")
220+
try:
221+
relative = output_path.relative_to(storage_prefix)
222+
return storage_folder / relative
223+
except ValueError:
224+
# Not under storage/ — use the path as-is (relative to cwd)
225+
return output_path
226+
227+
186228
def restore_from_checkpoint(branch: str, output_file: str) -> bool:
187-
"""Restore output file from checkpoint volume if it exists."""
229+
"""Restore output file from checkpoint volume to STORAGE_FOLDER.
230+
231+
Writes to the installed package's storage directory so that
232+
Dataset classes (which use STORAGE_FOLDER) can find the files.
233+
"""
188234
checkpoint_path = get_checkpoint_path(branch, output_file)
189235
if checkpoint_path.exists() and checkpoint_path.stat().st_size > 0:
190-
local_path = Path(output_file)
236+
local_path = _resolve_local_path(output_file)
191237
local_path.parent.mkdir(parents=True, exist_ok=True)
192238
shutil.copy2(checkpoint_path, local_path)
239+
# Also restore to the source-tree relative path so that
240+
# scripts run via subprocess (which use cwd-relative paths)
241+
# can find the file.
242+
source_path = Path(output_file)
243+
if source_path != local_path:
244+
source_path.parent.mkdir(parents=True, exist_ok=True)
245+
shutil.copy2(checkpoint_path, source_path)
193246
print(f"Restored from checkpoint: {output_file}")
194247
return True
195248
return False
@@ -200,12 +253,24 @@ def save_checkpoint(
200253
output_file: str,
201254
volume: modal.Volume,
202255
) -> None:
203-
"""Save output file to checkpoint volume."""
204-
local_path = Path(output_file)
256+
"""Save output file to checkpoint volume.
257+
258+
Checks both the installed package path and the source-tree
259+
relative path to find the file.
260+
"""
261+
local_path = _resolve_local_path(output_file)
262+
source_path = Path(output_file)
263+
# Try installed path first, fall back to source-tree path
264+
actual_path = None
205265
if local_path.exists() and local_path.stat().st_size > 0:
266+
actual_path = local_path
267+
elif source_path.exists() and source_path.stat().st_size > 0:
268+
actual_path = source_path
269+
270+
if actual_path:
206271
checkpoint_path = get_checkpoint_path(branch, output_file)
207272
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
208-
shutil.copy2(local_path, checkpoint_path)
273+
shutil.copy2(actual_path, checkpoint_path)
209274
with _volume_lock:
210275
volume.commit()
211276
print(f"Checkpointed: {output_file}")
@@ -734,6 +799,9 @@ def run_single_script(
734799
setup_gcp_credentials()
735800
os.chdir("/root/policyengine-us-data")
736801

802+
# Reload volume to see writes from prior --script containers
803+
checkpoint_volume.reload()
804+
737805
# Resolve short name to full path
738806
script_path = SCRIPT_SHORT_NAMES.get(script_name, script_name)
739807

@@ -822,6 +890,9 @@ def run_integration_test(
822890
setup_gcp_credentials()
823891
os.chdir("/root/policyengine-us-data")
824892

893+
# Reload volume to see writes from prior containers
894+
checkpoint_volume.reload()
895+
825896
# Restore all prerequisites and dataset outputs
826897
for prereq in PREREQUISITE_FILES:
827898
restore_from_checkpoint(branch, prereq)

0 commit comments

Comments
 (0)