Skip to content

Commit 8dce13e

Browse files
LEANDERANTONYclaude
andcommitted
fix: run cache refresh in the background, return 202
The 4-hourly /admin/refresh-cache fans out across ~130 ATS boards and runs for minutes — past the upstream gateway's ~100s timeout — so the gateway logged a misleading 524 even though the refresh completed server-side every time. The endpoint now hands refresh_cached_jobs to a FastAPI BackgroundTasks job and returns 202 Accepted immediately (the decouple-the-response shape /analyze-jobs already uses). The Sentry cached-jobs-refresh cron check-in moved into the background job so its in_progress -> ok/error window still spans the real refresh duration; the background task never raises (failures log at ERROR + error the check-in). pg_cron now records a fast 202 instead of a 524. See DEVLOG Day 78. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8bbbfe2 commit 8dce13e

3 files changed

Lines changed: 98 additions & 33 deletions

File tree

backend/routers/jobs.py

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
import logging
12
import secrets
23

3-
from fastapi import APIRouter, Depends, Header, HTTPException, Request
4+
from fastapi import APIRouter, BackgroundTasks, Depends, Header, HTTPException, Request
45

56
from backend import quota
67
from backend.models import (
@@ -26,8 +27,11 @@
2627
from backend.tiers import resolve_user_tier
2728
from src.config import REFRESH_CACHE_SECRET
2829
from src.errors import AppError
30+
from src.logging_utils import get_logger, log_event
2931

3032

33+
LOGGER = get_logger(__name__)
34+
3135
router = APIRouter(prefix="/jobs", tags=["jobs"])
3236

3337

@@ -152,32 +156,69 @@ def _verify_refresh_secret(authorization: str | None = Header(default=None)) ->
152156
raise HTTPException(status_code=401, detail="Invalid refresh-cache token.")
153157

154158

155-
@admin_router.post("/refresh-cache")
156-
def refresh_cache(
157-
request: Request,
158-
_: None = Depends(_verify_refresh_secret),
159-
):
160-
"""Refresh cached_jobs from all configured providers.
159+
def _run_cache_refresh_job() -> None:
160+
"""Background-task body for ``POST /admin/refresh-cache``.
161161
162-
This is what Supabase pg_cron hits every 4 hours (six times a
163-
day on the `0 */4 * * *` schedule). Returns the
164-
structured refresh report (see `refresh_cached_jobs`) so cron
165-
output can be inspected when something goes wrong.
162+
The refresh fans out across ~130 ATS boards and runs for minutes —
163+
far longer than the upstream gateway will wait — so it runs here,
164+
after the endpoint has already returned 202. Running it inline made
165+
the gateway time the *response* out at ~100s and record a
166+
misleading ``524`` in the cron logs even though the refresh
167+
completed server-side.
166168
167-
Wrapped in a Sentry cron check-in (`cached-jobs-refresh`): a
168-
missed or errored check-in is how monitoring learns the 4-hourly
169-
refresh stopped firing — pg_cron failing silently is otherwise
170-
invisible.
169+
The Sentry ``cached-jobs-refresh`` check-in wraps THIS function,
170+
not the request, so its in_progress -> ok/error window reflects the
171+
real refresh duration. Never raises: a background task has no
172+
caller to surface an exception to, so any failure is recorded as an
173+
errored cron check-in plus an ERROR-level log (a Sentry issue via
174+
the LoggingIntegration) and swallowed. ``refresh_cached_jobs`` is
175+
itself crash-safe and idempotent — a failed run is recovered by the
176+
next 4-hourly tick.
171177
"""
172178
try:
173179
with sentry_cron_monitor(
174180
CACHED_JOBS_REFRESH_MONITOR_SLUG,
175181
CACHED_JOBS_REFRESH_MONITOR_CONFIG,
176182
):
177-
report = refresh_cached_jobs()
178-
except RuntimeError as exc:
179-
raise HTTPException(status_code=503, detail=str(exc)) from exc
180-
return report
183+
refresh_cached_jobs()
184+
except Exception as exc: # noqa: BLE001 — background-task boundary
185+
log_event(
186+
LOGGER,
187+
logging.ERROR,
188+
"cached_jobs_refresh_background_failed",
189+
f"Background cached_jobs refresh failed: "
190+
f"{type(exc).__name__}: {exc}",
191+
error=f"{type(exc).__name__}: {exc}",
192+
)
193+
194+
195+
@admin_router.post("/refresh-cache", status_code=202)
196+
def refresh_cache(
197+
request: Request,
198+
background_tasks: BackgroundTasks,
199+
_: None = Depends(_verify_refresh_secret),
200+
):
201+
"""Trigger a cached_jobs refresh from all configured providers.
202+
203+
Supabase pg_cron hits this every 4 hours (`0 */4 * * *`). The
204+
refresh fans out across ~130 ATS boards and runs for minutes —
205+
longer than the upstream gateway will wait — so the work is handed
206+
to a background task and the endpoint returns **202 Accepted**
207+
immediately. Running it synchronously made the gateway time the
208+
response out at ~100s and record a misleading `524` in the cron
209+
logs, even though the refresh completed server-side.
210+
211+
The refresh outcome, its structured report, and the Sentry
212+
`cached-jobs-refresh` cron check-in are all produced by the
213+
background job — see `_run_cache_refresh_job`. A failed run
214+
surfaces as an errored Sentry check-in plus an ERROR-level issue;
215+
the daily `/admin/refresh-healthcheck` is the backstop.
216+
"""
217+
background_tasks.add_task(_run_cache_refresh_job)
218+
return {
219+
"status": "accepted",
220+
"detail": "cached_jobs refresh started in the background.",
221+
}
181222

182223

183224
@admin_router.post("/refresh-healthcheck")

docs/DEVLOG.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3957,3 +3957,29 @@ on both raise paths, `job_searched`, `resume_uploaded`,
39573957
`analysis_started`, `artifact_exported` — added to the existing quota,
39583958
app, and workspace suites. Also removed a pre-existing unused `Form`
39593959
import flagged by ruff in `routers/workspace.py`.
3960+
3961+
## Day 78: Async cache refresh — return 202, run in the background
3962+
3963+
Verifying the Day 77 cron work surfaced an incidental finding in the
3964+
pg_net response log: the 4-hourly `/admin/refresh-cache` call returns
3965+
`524`. The refresh fans out across ~130 ATS boards and runs for
3966+
minutes — past the upstream gateway's ~100s timeout — so the gateway
3967+
gave up on the *response* and logged a 524, even though the refresh
3968+
completed server-side every time (the healthcheck confirmed the cache
3969+
was fresh, only ~2% stale). A misleading failure signal, not a real
3970+
failure.
3971+
3972+
The fix: `/admin/refresh-cache` now hands `refresh_cached_jobs` to a
3973+
FastAPI `BackgroundTasks` job and returns **202 Accepted**
3974+
immediately — the same decouple-the-response shape the `/analyze-jobs`
3975+
route already uses. The Sentry `cached-jobs-refresh` cron check-in
3976+
moved with the work: it now wraps the background
3977+
`_run_cache_refresh_job`, not the request, so its
3978+
in_progress -> ok/error window still spans the real refresh duration.
3979+
The background task never raises — a failure becomes an errored
3980+
check-in plus an ERROR-level log (a Sentry issue via the
3981+
LoggingIntegration), and the daily healthcheck is the backstop.
3982+
pg_cron now records a fast 202 instead of a 524; nothing else changes.
3983+
3984+
Verification: test suite green; the `/admin/refresh-cache` endpoint
3985+
test was updated to assert 202 + a scheduled background worker.

tests/test_backend_app.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -278,21 +278,18 @@ def test_admin_refresh_cache_returns_503_when_secret_unconfigured(monkeypatch):
278278
assert response.status_code == 503
279279

280280

281-
def test_admin_refresh_cache_accepts_correct_bearer_then_runs_worker(monkeypatch):
282-
"""Right secret + worker stub → endpoint returns the worker's
283-
report. Pins that the auth → worker handoff actually works."""
281+
def test_admin_refresh_cache_accepts_correct_bearer_then_schedules_worker(monkeypatch):
282+
"""Right secret → 202 immediately, with the refresh handed to a
283+
background task. The endpoint no longer blocks on the multi-minute
284+
refresh (which used to time the gateway out into a 524)."""
284285
monkeypatch.setattr(
285286
"backend.routers.jobs.REFRESH_CACHE_SECRET", "test-secret"
286287
)
288+
ran = []
287289

288290
def _fake_refresh():
289-
return {
290-
"started_at": "2026-05-07T18:00:00Z",
291-
"finished_at": "2026-05-07T18:00:05Z",
292-
"duration_seconds": 5.0,
293-
"providers": {"greenhouse": {"status": "ok"}},
294-
"total_active_after": 42,
295-
}
291+
ran.append(True)
292+
return {"total_active_after": 42}
296293

297294
monkeypatch.setattr(
298295
"backend.routers.jobs.refresh_cached_jobs", _fake_refresh
@@ -302,10 +299,11 @@ def _fake_refresh():
302299
"/api/admin/refresh-cache",
303300
headers={"Authorization": "Bearer test-secret"},
304301
)
305-
assert response.status_code == 200
306-
payload = response.json()
307-
assert payload["total_active_after"] == 42
308-
assert payload["providers"]["greenhouse"]["status"] == "ok"
302+
assert response.status_code == 202
303+
assert response.json()["status"] == "accepted"
304+
# Starlette's TestClient runs background tasks before returning, so
305+
# the worker has executed by the time the response is in hand.
306+
assert ran == [True]
309307

310308

311309
def test_admin_refresh_healthcheck_runs_service_with_correct_bearer(monkeypatch):

0 commit comments

Comments
 (0)