|
11 | 11 | import os |
12 | 12 | import re |
13 | 13 | import shutil |
| 14 | +import signal |
14 | 15 | import subprocess |
15 | 16 | import tempfile |
16 | 17 | import uuid |
|
46 | 47 | _VALID_JAVA_CLASS_NAME = re.compile(r"^[a-zA-Z_$][a-zA-Z0-9_$.]*$") |
47 | 48 |
|
48 | 49 |
|
| 50 | +def _run_cmd_kill_pg_on_timeout( |
| 51 | + cmd: list[str], |
| 52 | + *, |
| 53 | + cwd: Path | None = None, |
| 54 | + env: dict[str, str] | None = None, |
| 55 | + timeout: int | None = None, |
| 56 | + text: bool = True, |
| 57 | +) -> subprocess.CompletedProcess: |
| 58 | + """Run a command, killing its entire process group on timeout. |
| 59 | +
|
| 60 | + Unlike subprocess.run(), this function uses start_new_session=True so the |
| 61 | + child process gets its own process group. When the timeout fires we send |
| 62 | + SIGTERM (then SIGKILL) to the whole process group, not just the process |
| 63 | + itself. This is critical for Maven, which forks child JVM processes |
| 64 | + (Maven Surefire forks) that would otherwise become orphaned when the Maven |
| 65 | + parent is killed by a plain subprocess.run() timeout. Orphaned JVMs keep |
| 66 | + SQLite file-handles open, causing "database is locked" errors. |
| 67 | +
|
| 68 | + Args: |
| 69 | + cmd: Command and arguments. |
| 70 | + cwd: Working directory. |
| 71 | + env: Environment variables. |
| 72 | + timeout: Seconds to wait before killing the process group. |
| 73 | + text: If True, decode stdout/stderr as text. |
| 74 | +
|
| 75 | + Returns: |
| 76 | + CompletedProcess. On timeout, returncode is -2 and stderr contains a |
| 77 | + human-readable explanation. |
| 78 | +
|
| 79 | + """ |
| 80 | + proc = subprocess.Popen( |
| 81 | + cmd, |
| 82 | + cwd=cwd, |
| 83 | + env=env, |
| 84 | + stdout=subprocess.PIPE, |
| 85 | + stderr=subprocess.PIPE, |
| 86 | + text=text, |
| 87 | + start_new_session=True, # puts proc in its own process group |
| 88 | + ) |
| 89 | + try: |
| 90 | + stdout, stderr = proc.communicate(timeout=timeout) |
| 91 | + return subprocess.CompletedProcess(args=cmd, returncode=proc.returncode, stdout=stdout, stderr=stderr) |
| 92 | + except subprocess.TimeoutExpired: |
| 93 | + # Kill the entire process group so Maven's forked Surefire JVMs don't |
| 94 | + # become orphans that keep the SQLite database locked. |
| 95 | + pgid = None |
| 96 | + try: |
| 97 | + pgid = os.getpgid(proc.pid) |
| 98 | + os.killpg(pgid, signal.SIGTERM) |
| 99 | + except (ProcessLookupError, OSError): |
| 100 | + proc.kill() |
| 101 | + # Give processes a few seconds to shut down gracefully before SIGKILL. |
| 102 | + try: |
| 103 | + proc.wait(timeout=5) |
| 104 | + except subprocess.TimeoutExpired: |
| 105 | + if pgid is not None: |
| 106 | + try: |
| 107 | + os.killpg(pgid, signal.SIGKILL) |
| 108 | + except (ProcessLookupError, OSError): |
| 109 | + pass |
| 110 | + else: |
| 111 | + proc.kill() |
| 112 | + proc.wait() |
| 113 | + # Drain pipes so we don't leave zombie pipe buffers. |
| 114 | + try: |
| 115 | + stdout_data = proc.stdout.read() if proc.stdout else "" |
| 116 | + stderr_data = proc.stderr.read() if proc.stderr else "" |
| 117 | + except Exception: |
| 118 | + stdout_data, stderr_data = "", "" |
| 119 | + return subprocess.CompletedProcess( |
| 120 | + args=cmd, |
| 121 | + returncode=-2, |
| 122 | + stdout=stdout_data, |
| 123 | + stderr=f"Process group killed after timeout ({timeout}s): {stderr_data}", |
| 124 | + ) |
| 125 | + |
| 126 | + |
49 | 127 | def _validate_java_class_name(class_name: str) -> bool: |
50 | 128 | """Validate that a string is a valid Java class name. |
51 | 129 |
|
@@ -505,14 +583,7 @@ def _compile_tests( |
505 | 583 | logger.debug("Compiling tests: %s in %s", " ".join(cmd), project_root) |
506 | 584 |
|
507 | 585 | try: |
508 | | - return subprocess.run( |
509 | | - cmd, check=False, cwd=project_root, env=env, capture_output=True, text=True, timeout=timeout |
510 | | - ) |
511 | | - except subprocess.TimeoutExpired: |
512 | | - logger.exception("Maven compilation timed out after %d seconds", timeout) |
513 | | - return subprocess.CompletedProcess( |
514 | | - args=cmd, returncode=-2, stdout="", stderr=f"Compilation timed out after {timeout} seconds" |
515 | | - ) |
| 586 | + return _run_cmd_kill_pg_on_timeout(cmd, cwd=project_root, env=env, timeout=timeout) |
516 | 587 | except Exception as e: |
517 | 588 | logger.exception("Maven compilation failed: %s", e) |
518 | 589 | return subprocess.CompletedProcess(args=cmd, returncode=-1, stdout="", stderr=str(e)) |
@@ -548,9 +619,7 @@ def _get_test_classpath( |
548 | 619 | logger.debug("Getting classpath: %s", " ".join(cmd)) |
549 | 620 |
|
550 | 621 | try: |
551 | | - result = subprocess.run( |
552 | | - cmd, check=False, cwd=project_root, env=env, capture_output=True, text=True, timeout=timeout |
553 | | - ) |
| 622 | + result = _run_cmd_kill_pg_on_timeout(cmd, cwd=project_root, env=env, timeout=timeout) |
554 | 623 |
|
555 | 624 | if result.returncode != 0: |
556 | 625 | logger.error("Failed to get classpath: %s", result.stderr) |
@@ -600,9 +669,6 @@ def _get_test_classpath( |
600 | 669 |
|
601 | 670 | return os.pathsep.join(cp_parts) |
602 | 671 |
|
603 | | - except subprocess.TimeoutExpired: |
604 | | - logger.exception("Getting classpath timed out") |
605 | | - return None |
606 | 672 | except Exception as e: |
607 | 673 | logger.exception("Failed to get classpath: %s", e) |
608 | 674 | return None |
@@ -804,14 +870,7 @@ def _run_tests_direct( |
804 | 870 | logger.debug("Running tests directly: java -cp ... ConsoleLauncher --select-class %s", test_classes) |
805 | 871 |
|
806 | 872 | try: |
807 | | - return subprocess.run( |
808 | | - cmd, check=False, cwd=working_dir, env=env, capture_output=True, text=True, timeout=timeout |
809 | | - ) |
810 | | - except subprocess.TimeoutExpired: |
811 | | - logger.exception("Direct test execution timed out after %d seconds", timeout) |
812 | | - return subprocess.CompletedProcess( |
813 | | - args=cmd, returncode=-2, stdout="", stderr=f"Test execution timed out after {timeout} seconds" |
814 | | - ) |
| 873 | + return _run_cmd_kill_pg_on_timeout(cmd, cwd=working_dir, env=env, timeout=timeout) |
815 | 874 | except Exception as e: |
816 | 875 | logger.exception("Direct test execution failed: %s", e) |
817 | 876 | return subprocess.CompletedProcess(args=cmd, returncode=-1, stdout="", stderr=str(e)) |
@@ -1511,9 +1570,13 @@ def _run_maven_tests( |
1511 | 1570 | logger.debug("Running Maven command: %s in %s", " ".join(cmd), project_root) |
1512 | 1571 |
|
1513 | 1572 | try: |
1514 | | - result = subprocess.run( |
1515 | | - cmd, check=False, cwd=project_root, env=env, capture_output=True, text=True, timeout=timeout |
1516 | | - ) |
| 1573 | + # Use _run_cmd_kill_pg_on_timeout instead of subprocess.run so that on |
| 1574 | + # timeout we kill the entire Maven process GROUP (including forked Surefire |
| 1575 | + # JVMs). With plain subprocess.run(), only the Maven parent is killed and |
| 1576 | + # the child JVMs become orphaned, holding the SQLite result file open and |
| 1577 | + # causing "database is locked" errors when Python reads the file immediately |
| 1578 | + # after Maven is killed. |
| 1579 | + result = _run_cmd_kill_pg_on_timeout(cmd, cwd=project_root, env=env, timeout=timeout) |
1517 | 1580 |
|
1518 | 1581 | # Check if Maven failed due to compilation errors (not just test failures) |
1519 | 1582 | if result.returncode != 0: |
@@ -1546,11 +1609,6 @@ def _run_maven_tests( |
1546 | 1609 |
|
1547 | 1610 | return result |
1548 | 1611 |
|
1549 | | - except subprocess.TimeoutExpired: |
1550 | | - logger.exception("Maven test execution timed out after %d seconds", timeout) |
1551 | | - return subprocess.CompletedProcess( |
1552 | | - args=cmd, returncode=-2, stdout="", stderr=f"Test execution timed out after {timeout} seconds" |
1553 | | - ) |
1554 | 1612 | except Exception as e: |
1555 | 1613 | logger.exception("Maven test execution failed: %s", e) |
1556 | 1614 | return subprocess.CompletedProcess(args=cmd, returncode=-1, stdout="", stderr=str(e)) |
|
0 commit comments