Skip to content

Commit 146dc8d

Browse files
author
zhangyue
committed
fix(ci): treat exit 137 as success when pytest junit XML reports no failures
Docker 18.09 on Ascend CI hosts races on `--rm` cleanup: the inner process exits cleanly with rc=0 but the daemon SIGKILLs the container during teardown, surfacing exit code 137 to `run.py` even though the pytest stage succeeded. Parse the per-run junit XML when returncode==137 and downgrade to a warning if no failures/errors are reported.
1 parent a3cd770 commit 146dc8d

1 file changed

Lines changed: 54 additions & 2 deletions

File tree

.ci/run.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import subprocess
99
import sys
1010
import uuid
11+
import xml.etree.ElementTree as ET
1112
from datetime import datetime
1213
from pathlib import Path
1314

@@ -24,6 +25,42 @@
2425
# Pytest command-line options that each take a value argument.
# NOTE(review): presumably consulted while scanning a pytest command so that an
# option's value is not mistaken for a positional test path — confirm at the use site.
_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"}
2526

2627

28+
def _junit_xml_indicates_pass(results_dir):
    """Return True if `pytest` junit XML under `results_dir` reports no failures/errors.

    Used to distinguish a real CI failure from the docker 18.09
    container-teardown `SIGKILL` (exit code 137) that occurs on this host
    after a child process exits successfully — bash returns 0 from inside
    the container, but the docker daemon reports 137 due to a race in its
    `--rm` cleanup path. The junit XML is written by pytest before that
    teardown and reliably captures the real outcome of the test stage.

    Every `test-results.xml` found recursively under `results_dir` is
    inspected, so the verdict does not depend on directory traversal order
    and a failing report can never be masked by a passing one.

    Args:
        results_dir: Directory (str or path-like) searched recursively for
            files named `test-results.xml`.

    Returns:
        True only if at least one report was parsed successfully and no
        parsed report contains a suite with `failures` or `errors` above
        zero. False if any suite reports a failure or error, if a count
        attribute is not an integer, or if no usable report exists.
    """
    found_report = False

    for junit in Path(results_dir).rglob("test-results.xml"):
        try:
            root = ET.parse(junit).getroot()
        except (ET.ParseError, OSError):
            # A truncated or unreadable report is no evidence either way;
            # skip it and let the remaining reports decide.
            continue

        # pytest may emit either a bare <testsuite> root or a <testsuites>
        # wrapper around one or more suites.
        suites = root.findall("testsuite") if root.tag == "testsuites" else [root]

        if not suites:
            continue

        for suite in suites:
            try:
                failures = int(suite.get("failures", 0))
                errors = int(suite.get("errors", 0))
            except ValueError:
                # Unparseable counts cannot prove success; stay conservative.
                return False

            if failures > 0 or errors > 0:
                return False

        found_report = True

    # With no usable report there is no evidence of success.
    return found_report
62+
63+
2764
def apply_test_override(run_cmd, test_path):
2865
"""Replace positional test path(s) in a pytest stage command.
2966
@@ -437,8 +474,23 @@ def main():
437474
pool.release(allocated_ids)
438475

439476
if returncode != 0:
440-
print(f"job {job_name} failed (exit code {returncode})", file=sys.stderr)
441-
failed += 1
477+
# Docker 18.09 on this host occasionally SIGKILLs containers
478+
# during `--rm` cleanup after the inner process already exited
479+
# cleanly, producing exit code 137. Fall back to the pytest
480+
# junit XML to recover the real outcome in that case.
481+
if returncode == 137 and _junit_xml_indicates_pass(results_dir):
482+
print(
483+
f"[warn] job {job_name}: container exited with 137 "
484+
f"(likely docker teardown SIGKILL after clean pytest); "
485+
f"junit XML reports no failures — treating as success",
486+
file=sys.stderr,
487+
)
488+
else:
489+
print(
490+
f"job {job_name} failed (exit code {returncode})",
491+
file=sys.stderr,
492+
)
493+
failed += 1
442494

443495
sys.exit(1 if failed else 0)
444496

0 commit comments

Comments
 (0)