Merge remote-tracking branch 'origin/claude/fix-blackwell-monai-tests-Apjwl' into 8587-test-erros-on-pytorch-release-2508-on-series-50

garciadias · garciadias · commit 0a90770bdf0e · 2026-04-02T09:50:47.000+01:00
diff --git a/runtests.sh b/runtests.sh
@@ -53,6 +53,7 @@ doMypyFormat=false
 doCleanup=false
 doDistTests=false
 doPrecommit=false
+testTimeout=0
 
 NUM_PARALLEL=1
 
@@ -109,6 +110,8 @@ function print_usage {
     echo "    -v, --version     : show MONAI and system version information and exit"
     echo "    -p, --path        : specify the path used for formatting, default is the current dir if unspecified"
     echo "    --formatfix       : format code using \"isort\" and \"black\" for user specified directories"
+    echo "    --timeout [secs]  : per-test timeout in seconds; tests exceeding this are marked as errors and skipped"
+    echo "                        (default: 180s when flag is given without a value; 0 = disabled)"
     echo ""
     echo "${separator}For bug reports and feature requests, please file an issue at:"
     echo "    https://github.com/Project-MONAI/MONAI/issues/new/choose"
@@ -344,6 +347,15 @@ do
             testdir=$2
             shift
         ;;
+        --timeout)
+            # Optional value: consume $2 only when it is all digits; otherwise leave it alone and default to 180s.
+            if [[ -n "$2" ]] && [[ "$2" =~ ^[0-9]+$ ]]; then
+                testTimeout=$2
+                shift
+            else
+                testTimeout=180
+            fi
+        ;;
         *)
             print_error_msg "Incorrect commandline provided, invalid key: $key"
             print_usage
@@ -695,7 +707,11 @@ if [ $doUnitTests = true ]
 then
     echo "${separator}${blue}unittests${noColor}"
     torch_validate
-    ${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration|test_perceptual_loss|test_auto3dseg_ensemble).*(?<!_dist)$"  # excluding integration/dist/perceptual_loss tests
+    timeoutArg=""
+    if [ "$testTimeout" -gt 0 ] 2>/dev/null; then
+        timeoutArg="--timeout $testTimeout"
+    fi
+    ${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration|test_perceptual_loss|test_auto3dseg_ensemble).*(?<!_dist)$" $timeoutArg  # excluding integration/dist/perceptual_loss tests
 fi
 
 # distributed test only
diff --git a/tests/runner.py b/tests/runner.py
@@ -15,6 +15,7 @@
 import inspect
 import os
 import re
+import signal
 import sys
 import time
 import unittest
@@ -24,10 +25,23 @@
 
 results: dict = {}
 
+_SIGALRM_AVAILABLE = hasattr(signal, "SIGALRM")
+
+
+class _TestTimeoutError(Exception):
+    """Raised by the SIGALRM handler when a single test exceeds the per-test timeout."""
+
+
+def _alarm_handler(signum, frame):  # handler installed for SIGALRM in startTest; signum/frame are unused
+    raise _TestTimeoutError("Test timed out")
+
 
 class TimeLoggingTestResult(unittest.TextTestResult):
     """Overload the default results so that we can store the results."""
 
+    # Per-test timeout in seconds, assigned on the class before the run; values <= 0 disable it.
+    timeout: int = 0
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.timed_tests = {}
@@ -37,10 +51,15 @@ def startTest(self, test):  # noqa: N802
         self.start_time = time.time()
         name = self.getDescription(test)
         self.stream.write(f"Starting test: {name}...\n")
+        if _SIGALRM_AVAILABLE and self.timeout > 0:
+            signal.signal(signal.SIGALRM, _alarm_handler)
+            signal.alarm(self.timeout)
         super().startTest(test)
 
     def stopTest(self, test):  # noqa: N802
         """On test end, get time, print, store and do normal behaviour."""
+        if _SIGALRM_AVAILABLE and self.timeout > 0:
+            signal.alarm(0)  # cancel any pending alarm
         elapsed = time.time() - self.start_time
         name = self.getDescription(test)
         self.stream.write(f"Finished test: {name} ({elapsed:.03}s)\n")
@@ -99,6 +118,13 @@ def parse_args():
     parser.add_argument(
         "-f", "--failfast", action="store_true", dest="failfast", default=False, help="Stop testing on first failure"
     )
+    parser.add_argument(
+        "--timeout",
+        dest="timeout",
+        default=0,
+        type=int,
+        help="Per-test timeout in seconds; 0 disables (default: %(default)d). Requires SIGALRM (Linux/macOS only).",
+    )
     args = parser.parse_args()
     print(f"Running tests in folder: '{args.path}'")
     if args.pattern:
@@ -145,6 +171,13 @@ def get_default_pattern(loader):
     discovery_time = pc.total_time
     print(f"time to discover tests: {discovery_time}s, total cases: {tests.countTestCases()}.")
 
+    if args.timeout > 0:
+        if _SIGALRM_AVAILABLE:
+            TimeLoggingTestResult.timeout = args.timeout
+            print(f"Per-test timeout enabled: {args.timeout}s")
+        else:
+            print("Warning: --timeout ignored; SIGALRM is not available on this platform.")
+
     test_runner = unittest.runner.TextTestRunner(
         resultclass=TimeLoggingTestResult, verbosity=args.verbosity, failfast=args.failfast
     )