Merge pull request #20 from InfiniTensor/feat/stability_check

Chamberlain0w0 · web-flow · commit 7f7d5feb0762 · 2026-02-02T10:02:54.000+08:00
Add stability check
diff --git a/infinimetrics/common/constants.py b/infinimetrics/common/constants.py
@@ -190,3 +190,30 @@ class InfiniCoreResult:
 
 # Metric prefixes
 METRIC_PREFIX_MEM_SWEEP = "hardware.mem_sweep"
+
+# ============================================================
+# Error Code Constants
+# ============================================================
+
+class ErrorCode:
+    """Error code values for different types of failures, organized by severity layer"""
+    # Success
+    SUCCESS = 0              # Test succeeded
+
+    # Layer 1: Input/Configuration issues (not stability issues)
+    CONFIG = 1               # Invalid configuration or input (user error)
+
+    # Layer 2: Framework internal errors (tested framework's fault)
+    INTERNAL = 2             # InfiniLM/InfiniCore internal error or non-zero return
+
+    # Layer 3: Incompatibility issues
+    INCOMPAT = 3             # Compilation errors, version incompatibility
+
+    # Layer 4: System resource issues
+    SYSTEM = 4               # OS/Hardware issues (OOM, disk full, GPU driver)
+
+    # Layer 5: Test framework issues (our fault)
+    GENERIC = 5              # Test framework logic error
+
+    # Layer 6: Timeout issues
+    TIMEOUT = 6              # Test started but hung/timeout
diff --git a/infinimetrics/communication/nccl_adapter.py b/infinimetrics/communication/nccl_adapter.py
@@ -98,7 +98,7 @@ def process(self, test_input: Dict[str, Any]) -> Dict[str, Any]:
         self.run_id = input_dict.get("run_id") or self._gen_run_id(testcase)
         self.test_spec = self._parse_test_spec(testcase)
         if not self.test_spec:
-            return self._err(input_dict, f"Unknown operation in testcase: {testcase}")
+            raise ValueError(f"Unknown operation in testcase: {testcase}")
 
         try:
             cmd = self._build_command(config)
@@ -114,7 +114,7 @@ def process(self, test_input: Dict[str, Any]) -> Dict[str, Any]:
                 msg = f"No performance data parsed. returncode={rc}"
                 if stderr:
                     msg += "\nStderr(last 20 lines):\n" + "\n".join(stderr.splitlines()[-20:])
-                return self._err(input_dict, msg)
+                raise RuntimeError(msg)
 
             raw_files = self._save_raw_csv(results)
             metrics = self._build_metrics(wall_ms, raw_files)
@@ -137,8 +137,17 @@ def process(self, test_input: Dict[str, Any]) -> Dict[str, Any]:
             }
 
         except Exception as e:
-            logger.error(f"Test failed: {e}", exc_info=True)
-            return self._err(input_dict, str(e))
+            # Log error with context, then re-raise for Executor to handle
+            operation = self.test_spec.get("op", "unknown") if self.test_spec else "unknown"
+            logger.error(
+                f"NCCLAdapter: Test failed for {testcase}\n"
+                f"  Operation: {operation}\n"
+                f"  Nodes: {self.resolved.nodes}\n"
+                f"  GPUs per node: {self.resolved.gpus_per_node}\n"
+                f"  Error: {str(e)}",
+                exc_info=True
+            )
+            raise
 
     # -----------------------------
     # Config helpers
diff --git a/infinimetrics/executor.py b/infinimetrics/executor.py
@@ -15,6 +15,8 @@
 from infinimetrics.adapter import BaseAdapter
 from infinimetrics.input import TestInput
 from infinimetrics.utils.path_utils import sanitize_filename
+from infinimetrics.common.constants import ErrorCode
+
 
 logger = logging.getLogger(__name__)
 
@@ -24,6 +26,8 @@
     "--format=csv,noheader",
 ]
 
+AMD_SMI_CANDIDATES = ["amd-smi", "rocm-smi"]
+
 
 def _which(cmd: str) -> Optional[str]:
     try:
@@ -50,6 +54,7 @@ class TestResult:
     result_code: int  # 0 = success, non-zero = error code
     result_file: Optional[str] = None
     skipped: bool = False
+    config: Optional[Dict[str, Any]] = None
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert to lightweight dictionary format for Dispatcher aggregation."""
@@ -59,6 +64,7 @@ def to_dict(self) -> Dict[str, Any]:
             "result_code": self.result_code,
             "result_file": self.result_file,
             "skipped": self.skipped,
+            "config": self.config,
         }
 
 
@@ -107,8 +113,8 @@ def setup(self) -> None:
         config["_run_id"] = self.payload.get("run_id", "")
         config["_time"] = self.payload.get("time", None)
 
-        # Also inject the full payload for adapters that need the complete structure
-        config["_full_payload"] = self.payload
+        # Initialize test_input from payload
+        self.test_input = self.payload
 
         self.adapter.setup(config)
 
@@ -158,13 +164,17 @@ def execute(self) -> TestResult:
         logger.info(f"Executor: Running {self.testcase}")
 
         # Initialize TestResult directly (default: result_code=0)
+        config = self.payload.get("config", {})
         test_result = TestResult(
             run_id=self.run_id,
             testcase=self.testcase,
             result_code=0,  # Default to success
             result_file=None,
+            config=config,
         )
 
+        response = {}
+
         try:
             # Phase 1: Setup
             self.setup()
@@ -173,15 +183,6 @@ def execute(self) -> TestResult:
             logger.debug(f"Executor: Calling adapter.process()")
             response = self.adapter.process(self.test_input)
 
-            # Process response (0 = success, non-zero = error code)
-            test_result.result_code = (
-                int(response.get("result_code", 1)) if isinstance(response, dict) else 1
-            )
-            if test_result.result_code != 0:
-                logger.warning(
-                    f"Executor: Adapter failed with error code {test_result.result_code}"
-                )
-
             # Enrich environment ONLY if missing
             if isinstance(response, dict) and "environment" not in response:
                 env = self._build_environment(response)
@@ -220,14 +221,150 @@ def execute(self) -> TestResult:
 
             return test_result
 
+        except subprocess.TimeoutExpired as e:
+            # Timeout errors (possible hardware hang)
+            logger.error(
+                f"Executor: STABILITY CHECK FAILED for {self.testcase}\n"
+                f"  Issue Type: timeout\n"
+                f"  Severity: CRITICAL\n"
+                f"  Analysis: Test timed out. Hardware may be hung or overloaded.\n"
+                f"  Error: {str(e)[:300]}"
+            )
+            test_result.result_code = ErrorCode.TIMEOUT
+            # Build error response for saving
+            response = self._build_error_response(str(e), ErrorCode.TIMEOUT)
+
+        except ValueError as e:
+            # Configuration or input validation errors
+            logger.warning(
+                f"Executor: Test failed for {self.testcase}\n"
+                f"  Issue Type: configuration_error\n"
+                f"  Error: {str(e)[:300]}"
+            )
+            test_result.result_code = ErrorCode.CONFIG
+            # Build error response for saving
+            response = self._build_error_response(str(e), ErrorCode.CONFIG)
+
+        except RuntimeError as e:
+            # RuntimeError: analyze error message for specific patterns
+            error_msg = str(e).lower()
+
+            # Check for insufficient-memory errors
+            memory_keywords = [
+                "out of memory", "oom", "memory", "memory leak",
+                "allocate", "allocation failed", "insufficient memory"
+            ]
+            if any(kw in error_msg for kw in memory_keywords):
+                logger.error(
+                    f"Executor: STABILITY CHECK FAILED for {self.testcase}\n"
+                    f"  Issue Type: memory\n"
+                    f"  Severity: CRITICAL\n"
+                    f"  Analysis: Memory allocation failed. Possible causes: insufficient memory, memory leak, or test data too large.\n"
+                    f"  Error: {str(e)[:300]}"
+                )
+                test_result.result_code = ErrorCode.SYSTEM
+                # Build error response for saving
+                response = self._build_error_response(str(e), ErrorCode.SYSTEM)
+            else:
+                # Other RuntimeError
+                logger.warning(
+                    f"Executor: Test failed for {self.testcase}\n"
+                    f"  Issue Type: runtime_error\n"
+                    f"  Error: {str(e)[:300]}"
+                )
+                test_result.result_code = ErrorCode.GENERIC
+                # Build error response for saving
+                response = self._build_error_response(str(e), ErrorCode.GENERIC)
+
         except Exception as e:
-            logger.error(f"Executor: {self.testcase} failed: {e}", exc_info=True)
+            # Unexpected exceptions
+            logger.error(
+                f"Executor: {self.testcase} failed with unexpected exception: {e}",
+                exc_info=True
+            )
+            test_result.result_code = ErrorCode.GENERIC
+            # Build error response for saving
+            response = self._build_error_response(str(e), ErrorCode.GENERIC)
 
-            # Still run teardown on failure
-            self._save_result(None)
-            test_result.result_code = 1  # Failure
+        finally:
+            # Always save result (even on failure)
+            try:
+                if not test_result.result_file:
+                    result_file = self._save_result(response)
+                    test_result.result_file = result_file
+            except Exception as teardown_error:
+                logger.error(f"Executor: Failed to save result: {teardown_error}")
 
-            return test_result
+        return test_result
+
+    def _build_error_response(self, error_msg: str, result_code: int) -> Dict[str, Any]:
+        """
+        Build a response dict containing error information for saving to disk.
+
+        Args:
+            error_msg: Error message string
+            result_code: Error result code
+
+        Returns:
+            Dictionary with basic test info and error details
+        """
+        config = self.payload.get("config", {})
+
+        # Create a cleaned config without injected metadata
+        cleaned_config = {
+            k: v for k, v in config.items()
+            if not k.startswith("_")  # Skip _testcase, _run_id, _time
+        }
+
+        # Extract device information
+        resolved = self._extract_device_info(config)
+
+        return {
+            "run_id": self.run_id,
+            "testcase": self.testcase,
+            "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "result_code": result_code,
+            "error_msg": error_msg,
+            "success": 1,  # 1 = failure
+            "config": cleaned_config,
+            "resolved": resolved,
+        }
+
+    def _extract_device_info(self, config: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract device information from config."""
+        device_used = 0
+        gpus_per_node = 0
+        nodes = 1
+
+        # Try device_involved
+        if "device_involved" in config:
+            try:
+                device_used = int(config.get("device_involved", 0) or 0)
+            except (ValueError, TypeError):
+                device_used = 0
+
+        # Try single_node config
+        if isinstance(config.get("single_node"), dict):
+            single_node = config["single_node"]
+            device_ids = single_node.get("device_ids", [])
+            if device_ids:
+                device_used = len(device_ids)
+            gpus_per_node = device_used
+        else:
+            gpus_per_node = device_used
+
+        # Try multi_node config
+        if "multi_node" in config:
+            try:
+                nodes = int(config.get("multi_node", {}).get("num_nodes", 1) or 1)
+            except (ValueError, TypeError):
+                nodes = 1
+
+        return {
+            "nodes": nodes,
+            "gpus_per_node": gpus_per_node,
+            "device_used": device_used,
+        }
 
     def _build_environment(self, response: Dict[str, Any]) -> Dict[str, Any]:
         """
diff --git a/infinimetrics/hardware/hardware_adapter.py b/infinimetrics/hardware/hardware_adapter.py
@@ -64,9 +64,7 @@ def process(self, test_input: Any) -> Dict[str, Any]:
         # Normalize test input to dict format
         test_input = self._normalize_test_input(test_input)
         if not test_input:
-            return self._create_error_response(
-                f"Invalid test_input type: {type(test_input)}"
-            )
+            raise ValueError(f"Invalid test_input type: {type(test_input)}")
 
         testcase = test_input.get(InfiniMetricsJson.TESTCASE, "unknown")
         config = test_input.get(InfiniMetricsJson.CONFIG, {})
@@ -107,9 +105,17 @@ def process(self, test_input: Any) -> Dict[str, Any]:
                 InfiniMetricsJson.CONFIG: result_config,
                 InfiniMetricsJson.METRICS: metrics,
             }
+
         except Exception as e:
-            logger.error(f"Hardware test failed: {e}", exc_info=True)
-            return self._create_error_response(str(e), test_input)
+            # Log error with context, then re-raise for Executor to handle
+            logger.error(
+                f"HardwareTestAdapter: Test failed for {testcase}\n"
+                f"  Device: {device}\n"
+                f"  Test Type: {test_type}\n"
+                f"  Error: {str(e)}",
+                exc_info=True
+            )
+            raise
 
     def _build_cuda_project(self) -> None:
         """Build CUDA project if needed."""
diff --git a/infinimetrics/inference/inference_adapter.py b/infinimetrics/inference/inference_adapter.py
@@ -23,14 +23,17 @@ def __init__(self):
         self.config = None
         self.mode = "direct"
         self.framework = "infinilm"
-        self._full_payload = None
+        self._testcase = None
 
     def setup(self, config: Dict[str, Any]) -> None:
         """Initialize inference resources"""
         # Get testcase and run_id from injected fields
         testcase = config.get("_testcase", "")
         run_id = config.get("_run_id", "")
 
+        # Store testcase for error reporting
+        self._testcase = testcase
+
         # Parse mode and framework
         if "service" in testcase.lower():
             self.mode = "service"
@@ -42,8 +45,6 @@ def setup(self, config: Dict[str, Any]) -> None:
         elif "infinilm" in testcase.lower():
             self.framework = "infinilm"
 
-        self._full_payload = config.get("_full_payload", None)
-
         # Create configuration object
         self.config = self._create_inference_config(config)
 
@@ -107,18 +108,15 @@ def process(self, test_input) -> Dict[str, Any]:
             }
 
         except Exception as e:
-            logger.error(f"Inference test failed: {e}", exc_info=True)
-
-            err = self._create_error_response(
-                error_msg=str(e),
-                test_input=(
-                    self._full_payload if isinstance(self._full_payload, dict) else None
-                ),
-                result_code=1,
+            # Log error with context, then re-raise for Executor to handle
+            logger.error(
+                f"InferenceAdapter: Test failed for {self._testcase}\n"
+                f"  Mode: {self.mode}\n"
+                f"  Framework: {self.framework}\n"
+                f"  Error: {str(e)}",
+                exc_info=True
             )
-            err["success"] = 1
-            err["resolved"] = {"nodes": 1, "gpus_per_node": 0, "device_used": 0}
-            return err
+            raise
 
     def teardown(self) -> None:
         """Cleanup resources"""
diff --git a/infinimetrics/operators/infinicore_adapter.py b/infinimetrics/operators/infinicore_adapter.py