Skip to content

Commit 444e0f2

Browse files
authored
fix: Prevent infinite loop in legacy edx course export op (#2212)
The export_edx_courses op had two bugs causing the courses/ folder to go missing in S3 data packages (mitx-etl-mitxonline-production, since ~2026-01-21): 1. Infinite loop in status-polling HTTPStatusError (e.g. HTTP 404 after django-user-tasks record cleanup) was swallowed by a bare `continue`, so failed courses were never added to failed_exports and the while-loop never terminated. Fixed by only treating 404 as a soft failure (mark as failed_exports); all other status codes now re-raise. Added a 60-minute hard timeout that raises TimeoutError instead of silently breaking. 2. Abort on partial-failure from the export trigger API The ol_openedx_course_export plugin returns HTTP 400 when some (but not all) courses fail to queue. The unconditional raise_for_status() aborted the entire op before successfully-queued courses could be processed. Fixed by allowing 400 through only when the response body contains the expected partial-failure schema (upload_task_ids key); other 400s still raise. Fixes: mitodl/hq#11104 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 6a3020a commit 444e0f2

2 files changed

Lines changed: 50 additions & 4 deletions

File tree

  • dg_projects/legacy_openedx/legacy_openedx/ops
  • packages/ol-orchestrate-lib/src/ol_orchestrate/resources

dg_projects/legacy_openedx/legacy_openedx/ops/open_edx.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
from pypika import MySQLQuery as Query
3232
from pypika import Table, Tables
3333

34+
COURSE_EXPORT_TIMEOUT = timedelta(minutes=60)
35+
3436

3537
class ListCoursesConfig(Config):
3638
edx_course_api_page_size: int = Field(
@@ -568,7 +570,7 @@ def export_edx_forum_database(
568570
),
569571
},
570572
)
571-
def export_edx_courses(
573+
def export_edx_courses( # noqa: C901
572574
context: OpExecutionContext,
573575
edx_course_ids: list[str],
574576
daily_extracts_dir: str,
@@ -577,23 +579,48 @@ def export_edx_courses(
577579
exported_courses = context.resources.openedx.export_courses(
578580
course_ids=edx_course_ids,
579581
)
582+
failed_initial = exported_courses.get("failed_uploads", {})
583+
if failed_initial:
584+
context.log.warning(
585+
"The following courses failed to queue for export and will be skipped: %s",
586+
list(failed_initial.keys()),
587+
)
580588
successful_exports: set[str] = set()
581589
failed_exports: set[str] = set()
582590
tasks = exported_courses["upload_task_ids"]
583591
context.log.info("Exporting %s tasks from Open edX", len(tasks))
584592
# Possible status values found here:
585593
# https://github.com/openedx/django-user-tasks/blob/master/user_tasks/models.py
594+
start_time = datetime.now(tz=UTC)
586595
while len(successful_exports.union(failed_exports)) < len(tasks):
596+
if datetime.now(tz=UTC) - start_time > COURSE_EXPORT_TIMEOUT:
597+
timed_out = set(tasks.keys()) - successful_exports - failed_exports
598+
err_msg = (
599+
f"Timed out waiting for course exports after {COURSE_EXPORT_TIMEOUT}. "
600+
f"Unresolved courses: {timed_out}"
601+
)
602+
raise TimeoutError(err_msg)
587603
time.sleep(timedelta(seconds=60).seconds)
588604
for course_id, task_id in tasks.items():
605+
if course_id in successful_exports or course_id in failed_exports:
606+
continue
589607
try:
590608
task_status = context.resources.openedx.check_course_export_status(
591609
course_id,
592610
task_id,
593611
)
594-
except httpx.HTTPStatusError:
595-
# Don't fail the whole job if one task status yields an error
596-
continue
612+
except httpx.HTTPStatusError as e:
613+
if e.response.status_code == 404: # noqa: PLR2004
614+
context.log.warning(
615+
"Export task not found (HTTP 404) for course %s "
616+
"(task %s), marking as failed",
617+
course_id,
618+
task_id,
619+
)
620+
failed_exports.add(course_id)
621+
continue
622+
else:
623+
raise
597624
if task_status["state"] == "Succeeded":
598625
successful_exports.add(course_id)
599626
if task_status["state"] in {"Failed", "Canceled", "Retrying"}:

packages/ol-orchestrate-lib/src/ol_orchestrate/resources/openedx.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,15 @@ def export_courses(self, course_ids: list[str]) -> dict[str, dict[str, str]]:
6464
6565
:returns: A dictionary of course IDs and the S3 URL where it will be exported
6666
to.
67+
68+
.. note::
69+
The ol_openedx_course_export plugin returns HTTP 400 when some (but
70+
not all) courses fail to queue. We allow 400 through only when the
71+
response body contains the expected partial-failure schema
72+
(``upload_task_ids`` key present), so callers can still process
73+
successfully-queued tasks. A 400 with an unrecognised body (e.g. a
74+
bad-request error from a malformed payload) still raises so the
75+
caller gets a clear error instead of a later ``KeyError``.
6776
"""
6877
request_url = f"{self.studio_url}/api/courses/v0/export/"
6978
response = self.http_client.post(
@@ -72,6 +81,16 @@ def export_courses(self, course_ids: list[str]) -> dict[str, dict[str, str]]:
7281
headers={"Authorization": f"JWT {self._fetch_access_token()}"},
7382
timeout=60,
7483
)
84+
if response.status_code == 400: # noqa: PLR2004
85+
# Partial-failure: some courses failed to queue but others succeeded.
86+
# Only suppress the error when the body matches the expected schema.
87+
try:
88+
body = response.json()
89+
except ValueError:
90+
response.raise_for_status()
91+
if "upload_task_ids" not in body:
92+
response.raise_for_status()
93+
return body
7594
response.raise_for_status()
7695
return response.json()
7796

0 commit comments

Comments
 (0)