Skip to content

Commit 9fdd92b

Browse files
hua7450claude
andcommitted
Address P1 review feedback on Modal snapshot rollout
Two fixes flagged by review on #1528: 1. Gateway backward compatibility during the release transition. The default Modal release flow promotes the existing frontier worker to current without redeploying it. With this PR shipped, the new worker exposes the `HouseholdWorker` class but the promoted current worker still only exposes the pre-PR top-level `handle_household_request` function. `call_worker_function` now tries `modal.Cls.from_name(..., "HouseholdWorker")` first and falls back to `modal.Function.from_name(..., "handle_household_request")` on `modal.exception.NotFoundError`, so both shapes route correctly during the one release cycle where both coexist. Added two gateway unit tests covering the class path and the function-fallback path. 2. Reset live network state captured by the memory snapshot. Memory snapshots preserve Python object state but not live TCP sockets; the SQLAlchemy pool and the Cloud SQL Connector created at snapshot time hold sockets that closed when Modal froze the container. Add `@modal.enter(snap=False) reset_post_snapshot_state` on `HouseholdWorker` to run on every snapshot-restored container start: re-establish the credentials file in case `/tmp` was not preserved, then call `analytics_setup.cleanup()` (drops the Cloud SQL Connector singleton) and `db.engine.dispose()` (drops the SQLAlchemy connection pool). Subsequent queries open fresh connections. Added a unit test asserting the hook is declared on the class. References: - https://modal.com/docs/guide/memory-snapshot - https://modal.com/blog/mem-snapshots Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ac64198 commit 9fdd92b

4 files changed

Lines changed: 156 additions & 12 deletions

File tree

policyengine_household_api/modal_release/gateway.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -163,13 +163,22 @@ def resolve_app_for_request(
163163
def call_worker_function(app_name: str, payload: dict[str, Any]) -> Response:
164164
import modal
165165

166-
worker_cls = modal.Cls.from_name(
167-
app_name,
168-
"HouseholdWorker",
169-
)
170-
return _response_from_dispatch_result(
171-
worker_cls().handle_household_request.remote(payload)
172-
)
166+
# Prefer the class-based worker (post #1528). During the release
167+
# transition the existing frontier is promoted to current without a
168+
# redeploy, so for one release cycle the current worker may still expose
169+
# the pre-#1528 top-level `handle_household_request` function. Fall back
170+
# to that shape if the class is not present.
171+
try:
172+
worker_cls = modal.Cls.from_name(app_name, "HouseholdWorker")
173+
return _response_from_dispatch_result(
174+
worker_cls().handle_household_request.remote(payload)
175+
)
176+
except modal.exception.NotFoundError:
177+
worker_function = modal.Function.from_name(
178+
app_name,
179+
"handle_household_request",
180+
)
181+
return _response_from_dispatch_result(worker_function.remote(payload))
173182

174183

175184
def _extract_requested_version(body: bytes) -> tuple[bytes, str]:

policyengine_household_api/modal_release/worker_app.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,41 @@ def load_flask_app(self) -> None:
7878

7979
self.flask_app = flask_app
8080

81+
@modal.enter(snap=False)
82+
def reset_post_snapshot_state(self) -> None:
83+
# Runs on every container start AFTER snapshot restore. Memory
84+
# snapshots preserve Python object state but not live network
85+
# connections; the SQLAlchemy pool and the Cloud SQL Connector
86+
# captured in the snapshot hold sockets that closed at snapshot
87+
# time. Reset them so the first request opens fresh connections.
88+
#
89+
# Also re-runs `configure_google_credentials()` in case Modal did
90+
# not preserve `/tmp` across snapshots and the credentials file is
91+
# missing on the restored filesystem.
92+
# See: https://modal.com/docs/guide/memory-snapshot
93+
configure_google_credentials()
94+
95+
from policyengine_household_api.data import analytics_setup
96+
97+
if not analytics_setup.is_analytics_enabled():
98+
return
99+
100+
analytics_setup.cleanup()
101+
102+
try:
103+
with self.flask_app.app_context():
104+
analytics_setup.db.engine.dispose()
105+
except Exception as exc:
106+
import logging
107+
108+
logging.getLogger(__name__).warning(
109+
"Failed to dispose analytics DB engine after snapshot "
110+
"restore; subsequent queries may reconnect lazily: %s",
111+
exc,
112+
)
113+
81114
@modal.method()
82115
def handle_household_request(
83116
self, payload: dict[str, Any]
84117
) -> dict[str, Any]:
85-
# Idempotent: if env var is already set (snapshot-restored or fresh
86-
# container that ran the snap hook), this is a no-op. Kept here so
87-
# filesystem-only credential state is re-established on restored
88-
# containers if Modal does not preserve /tmp across snapshots.
89-
configure_google_credentials()
90118
return dispatch_to_flask_app(self.flask_app, payload)

tests/unit/modal_release/test_gateway.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,3 +231,101 @@ def test_country_versions_rejects_unsupported_country():
231231
assert response.status_code == 404
232232
assert not worker_requests
233233
assert response.get_json()["message"] == "Unsupported country `zz`"
234+
235+
236+
def test_call_worker_function_uses_class_when_available(monkeypatch):
237+
"""When the worker exposes the new ``HouseholdWorker`` class, the
238+
gateway should dispatch through ``modal.Cls.from_name``."""
239+
import modal
240+
241+
from policyengine_household_api.modal_release import gateway
242+
243+
captured = {}
244+
245+
class _StubMethod:
246+
def remote(self, payload):
247+
captured["dispatched_via"] = "class"
248+
captured["payload"] = payload
249+
return {"status_code": 200, "body": b'{"status":"ok"}'}
250+
251+
class _StubInstance:
252+
handle_household_request = _StubMethod()
253+
254+
class _StubCls:
255+
def __call__(self):
256+
return _StubInstance()
257+
258+
def fake_cls_from_name(app_name, class_name):
259+
captured["cls_app_name"] = app_name
260+
captured["class_name"] = class_name
261+
return _StubCls()
262+
263+
def fake_function_from_name(app_name, function_name):
264+
captured["fallback_invoked"] = True
265+
raise AssertionError("Function fallback must not be invoked")
266+
267+
monkeypatch.setattr(
268+
modal.Cls, "from_name", staticmethod(fake_cls_from_name)
269+
)
270+
monkeypatch.setattr(
271+
modal.Function, "from_name", staticmethod(fake_function_from_name)
272+
)
273+
274+
response = gateway.call_worker_function(
275+
"frontier-app", {"household": {"foo": "bar"}}
276+
)
277+
278+
assert response.status_code == 200
279+
assert captured["dispatched_via"] == "class"
280+
assert captured["cls_app_name"] == "frontier-app"
281+
assert captured["class_name"] == "HouseholdWorker"
282+
assert captured["payload"] == {"household": {"foo": "bar"}}
283+
assert "fallback_invoked" not in captured
284+
285+
286+
def test_call_worker_function_falls_back_to_function_for_legacy_workers(
287+
monkeypatch,
288+
):
289+
"""During a release transition, the existing frontier worker gets
290+
promoted to current without a redeploy, so the current worker may
291+
still expose the pre-class ``handle_household_request`` function.
292+
The gateway must fall back to ``modal.Function.from_name`` when the
293+
class cannot be found."""
294+
import modal
295+
296+
from policyengine_household_api.modal_release import gateway
297+
298+
captured = {}
299+
300+
def fake_cls_from_name(app_name, class_name):
301+
raise modal.exception.NotFoundError(
302+
f"No class named `{class_name}` in app `{app_name}`"
303+
)
304+
305+
class _StubFunction:
306+
def remote(self, payload):
307+
captured["dispatched_via"] = "function"
308+
captured["payload"] = payload
309+
return {"status_code": 200, "body": b'{"status":"ok"}'}
310+
311+
def fake_function_from_name(app_name, function_name):
312+
captured["fn_app_name"] = app_name
313+
captured["function_name"] = function_name
314+
return _StubFunction()
315+
316+
monkeypatch.setattr(
317+
modal.Cls, "from_name", staticmethod(fake_cls_from_name)
318+
)
319+
monkeypatch.setattr(
320+
modal.Function, "from_name", staticmethod(fake_function_from_name)
321+
)
322+
323+
response = gateway.call_worker_function(
324+
"current-app", {"household": {"foo": "bar"}}
325+
)
326+
327+
assert response.status_code == 200
328+
assert captured["dispatched_via"] == "function"
329+
assert captured["fn_app_name"] == "current-app"
330+
assert captured["function_name"] == "handle_household_request"
331+
assert captured["payload"] == {"household": {"foo": "bar"}}

tests/unit/modal_release/test_worker_app.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,12 @@ def test_household_worker_exposes_snapshot_entrypoint(worker_app):
6767
worker_cls = worker_app.HouseholdWorker
6868
assert hasattr(worker_cls, "load_flask_app")
6969
assert hasattr(worker_cls, "handle_household_request")
70+
71+
72+
def test_household_worker_exposes_post_snapshot_reset_hook(worker_app):
73+
"""The class must declare a post-restore hook so network state
74+
captured in the memory snapshot (SQLAlchemy pool, Cloud SQL
75+
Connector) gets reset on every container start. Modal preserves
76+
Python object state but not live TCP sockets across snapshots."""
77+
worker_cls = worker_app.HouseholdWorker
78+
assert hasattr(worker_cls, "reset_post_snapshot_state")

0 commit comments

Comments
 (0)