Skip to content

Commit 7f7d5fe

Browse files
Merge pull request #20 from InfiniTensor/feat/stability_check
Add stability check
2 parents d3b8a33 + 8f92c07 commit 7f7d5fe

6 files changed

Lines changed: 229 additions & 44 deletions

File tree

infinimetrics/common/constants.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,3 +190,30 @@ class InfiniCoreResult:
190190

191191
# Metric prefixes
192192
METRIC_PREFIX_MEM_SWEEP = "hardware.mem_sweep"
193+
194+
# ============================================================
195+
# Error Code Constants
196+
# ============================================================
197+
198+
class ErrorCode:
    """Integer result codes classifying how a test ended, grouped by failure layer.

    ``SUCCESS`` is 0; every non-zero value names one category of failure.
    """

    # No failure: the test succeeded.
    SUCCESS = 0

    # Layer 1 - input/configuration problems (user error, not a stability issue).
    CONFIG = 1

    # Layer 2 - internal error in the framework under test
    # (InfiniLM/InfiniCore internal error or non-zero return).
    INTERNAL = 2

    # Layer 3 - incompatibility: compilation errors, version incompatibility.
    INCOMPAT = 3

    # Layer 4 - system resources: OS/hardware issues (OOM, disk full, GPU driver).
    SYSTEM = 4

    # Layer 5 - the test framework itself: a logic error on our side.
    GENERIC = 5

    # Layer 6 - timeouts: the test started but hung or timed out.
    TIMEOUT = 6

infinimetrics/communication/nccl_adapter.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def process(self, test_input: Dict[str, Any]) -> Dict[str, Any]:
9898
self.run_id = input_dict.get("run_id") or self._gen_run_id(testcase)
9999
self.test_spec = self._parse_test_spec(testcase)
100100
if not self.test_spec:
101-
return self._err(input_dict, f"Unknown operation in testcase: {testcase}")
101+
raise ValueError(f"Unknown operation in testcase: {testcase}")
102102

103103
try:
104104
cmd = self._build_command(config)
@@ -114,7 +114,7 @@ def process(self, test_input: Dict[str, Any]) -> Dict[str, Any]:
114114
msg = f"No performance data parsed. returncode={rc}"
115115
if stderr:
116116
msg += "\nStderr(last 20 lines):\n" + "\n".join(stderr.splitlines()[-20:])
117-
return self._err(input_dict, msg)
117+
raise RuntimeError(msg)
118118

119119
raw_files = self._save_raw_csv(results)
120120
metrics = self._build_metrics(wall_ms, raw_files)
@@ -137,8 +137,17 @@ def process(self, test_input: Dict[str, Any]) -> Dict[str, Any]:
137137
}
138138

139139
except Exception as e:
140-
logger.error(f"Test failed: {e}", exc_info=True)
141-
return self._err(input_dict, str(e))
140+
# Log error with context, then re-raise for Executor to handle
141+
operation = self.test_spec.get("op", "unknown") if self.test_spec else "unknown"
142+
logger.error(
143+
f"NCCLAdapter: Test failed for {testcase}\n"
144+
f" Operation: {operation}\n"
145+
f" Nodes: {self.resolved.nodes}\n"
146+
f" GPUs per node: {self.resolved.gpus_per_node}\n"
147+
f" Error: {str(e)}",
148+
exc_info=True
149+
)
150+
raise
142151

143152
# -----------------------------
144153
# Config helpers

infinimetrics/executor.py

Lines changed: 153 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
from infinimetrics.adapter import BaseAdapter
1616
from infinimetrics.input import TestInput
1717
from infinimetrics.utils.path_utils import sanitize_filename
18+
from infinimetrics.common.constants import ErrorCode
19+
1820

1921
logger = logging.getLogger(__name__)
2022

@@ -24,6 +26,8 @@
2426
"--format=csv,noheader",
2527
]
2628

29+
AMD_SMI_CANDIDATES = ["amd-smi", "rocm-smi"]
30+
2731

2832
def _which(cmd: str) -> Optional[str]:
2933
try:
@@ -50,6 +54,7 @@ class TestResult:
5054
result_code: int # 0 = success, non-zero = error code
5155
result_file: Optional[str] = None
5256
skipped: bool = False
57+
config: Optional[Dict[str, Any]] = None
5358

5459
def to_dict(self) -> Dict[str, Any]:
5560
"""Convert to lightweight dictionary format for Dispatcher aggregation."""
@@ -59,6 +64,7 @@ def to_dict(self) -> Dict[str, Any]:
5964
"result_code": self.result_code,
6065
"result_file": self.result_file,
6166
"skipped": self.skipped,
67+
"config": self.config,
6268
}
6369

6470

@@ -107,8 +113,8 @@ def setup(self) -> None:
107113
config["_run_id"] = self.payload.get("run_id", "")
108114
config["_time"] = self.payload.get("time", None)
109115

110-
# Also inject the full payload for adapters that need the complete structure
111-
config["_full_payload"] = self.payload
116+
# Initialize test_input from payload
117+
self.test_input = self.payload
112118

113119
self.adapter.setup(config)
114120

@@ -158,13 +164,17 @@ def execute(self) -> TestResult:
158164
logger.info(f"Executor: Running {self.testcase}")
159165

160166
# Initialize TestResult directly (default: result_code=0)
167+
config = self.payload.get("config", {})
161168
test_result = TestResult(
162169
run_id=self.run_id,
163170
testcase=self.testcase,
164171
result_code=0, # Default to success
165172
result_file=None,
173+
config=config,
166174
)
167175

176+
response = {}
177+
168178
try:
169179
# Phase 1: Setup
170180
self.setup()
@@ -173,15 +183,6 @@ def execute(self) -> TestResult:
173183
logger.debug(f"Executor: Calling adapter.process()")
174184
response = self.adapter.process(self.test_input)
175185

176-
# Process response (0 = success, non-zero = error code)
177-
test_result.result_code = (
178-
int(response.get("result_code", 1)) if isinstance(response, dict) else 1
179-
)
180-
if test_result.result_code != 0:
181-
logger.warning(
182-
f"Executor: Adapter failed with error code {test_result.result_code}"
183-
)
184-
185186
# Enrich environment ONLY if missing
186187
if isinstance(response, dict) and "environment" not in response:
187188
env = self._build_environment(response)
@@ -220,14 +221,150 @@ def execute(self) -> TestResult:
220221

221222
return test_result
222223

224+
except subprocess.TimeoutExpired as e:
225+
# Timeout errors (possible hardware hang)
226+
logger.error(
227+
f"Executor: STABILITY CHECK FAILED for {self.testcase}\n"
228+
f" Issue Type: timeout\n"
229+
f" Severity: CRITICAL\n"
230+
f" Analysis: Test timed out. Hardware may be hung or overloaded.\n"
231+
f" Error: {str(e)[:300]}"
232+
)
233+
test_result.result_code = ErrorCode.TIMEOUT
234+
# Build error response for saving
235+
response = self._build_error_response(str(e), ErrorCode.TIMEOUT)
236+
237+
except ValueError as e:
238+
# Configuration or input validation errors
239+
logger.warning(
240+
f"Executor: Test failed for {self.testcase}\n"
241+
f" Issue Type: configuration_error\n"
242+
f" Error: {str(e)[:300]}"
243+
)
244+
test_result.result_code = ErrorCode.CONFIG
245+
# Build error response for saving
246+
response = self._build_error_response(str(e), ErrorCode.CONFIG)
247+
248+
except RuntimeError as e:
249+
# RuntimeError: analyze error message for specific patterns
250+
error_msg = str(e).lower()
251+
252+
# Check for memory insufficient errors
253+
memory_keywords = [
254+
"out of memory", "oom", "memory", "memory leak",
255+
"allocate", "allocation failed", "insufficient memory"
256+
]
257+
if any(kw in error_msg for kw in memory_keywords):
258+
logger.error(
259+
f"Executor: STABILITY CHECK FAILED for {self.testcase}\n"
260+
f" Issue Type: memory\n"
261+
f" Severity: CRITICAL\n"
262+
f" Analysis: Memory allocation failed. Possible causes: insufficient memory, memory leak, or test data too large.\n"
263+
f" Error: {str(e)[:300]}"
264+
)
265+
test_result.result_code = ErrorCode.SYSTEM
266+
# Build error response for saving
267+
response = self._build_error_response(str(e), ErrorCode.SYSTEM)
268+
else:
269+
# Other RuntimeError
270+
logger.warning(
271+
f"Executor: Test failed for {self.testcase}\n"
272+
f" Issue Type: runtime_error\n"
273+
f" Error: {str(e)[:300]}"
274+
)
275+
test_result.result_code = ErrorCode.GENERIC
276+
# Build error response for saving
277+
response = self._build_error_response(str(e), ErrorCode.GENERIC)
278+
223279
except Exception as e:
224-
logger.error(f"Executor: {self.testcase} failed: {e}", exc_info=True)
280+
# Unexpected exceptions
281+
logger.error(
282+
f"Executor: {self.testcase} failed with unexpected exception: {e}",
283+
exc_info=True
284+
)
285+
test_result.result_code = ErrorCode.GENERIC
286+
# Build error response for saving
287+
response = self._build_error_response(str(e), ErrorCode.GENERIC)
225288

226-
# Still run teardown on failure
227-
self._save_result(None)
228-
test_result.result_code = 1 # Failure
289+
finally:
290+
# Always save result (even on failure)
291+
try:
292+
if not test_result.result_file:
293+
result_file = self._save_result(response)
294+
test_result.result_file = result_file
295+
except Exception as teardown_error:
296+
logger.error(f"Executor: Failed to save result: {teardown_error}")
229297

230-
return test_result
298+
return test_result
299+
300+
def _build_error_response(self, error_msg: str, result_code: int) -> Dict[str, Any]:
    """
    Build a response dict containing error information for saving to disk.

    Args:
        error_msg: Error message string
        result_code: Error result code

    Returns:
        Dictionary with ``run_id``, ``testcase``, a ``time`` stamp,
        ``result_code``, ``error_msg``, the ``success`` flag, the payload
        config without underscore-prefixed keys, and the ``resolved``
        device info.
    """
    config = self.payload.get("config")

    # This helper is called from inside exception handlers, so it must not
    # raise on a malformed payload: a missing, null, or non-dict "config"
    # is treated as an empty config instead of failing on `.items()`.
    if not isinstance(config, dict):
        config = {}

    # Create a cleaned config without injected metadata (underscore-prefixed
    # keys such as _run_id and _time). Non-string keys cannot carry that
    # prefix, so they are kept as-is.
    cleaned_config = {
        k: v for k, v in config.items()
        if not (isinstance(k, str) and k.startswith("_"))
    }

    # Extract device information
    resolved = self._extract_device_info(config)

    return {
        "run_id": self.run_id,
        "testcase": self.testcase,
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "result_code": result_code,
        "error_msg": error_msg,
        "success": 1,  # 1 = failure
        "config": cleaned_config,
        "resolved": resolved,
    }
332+
333+
def _extract_device_info(self, config: Dict[str, Any]) -> Dict[str, Any]:
    """Extract device information from config.

    Never raises on malformed input: any field that is missing or has an
    unexpected type falls back to its default.

    Args:
        config: Test configuration. The keys read are ``device_involved``,
            ``single_node`` (a dict with ``device_ids``) and ``multi_node``
            (a dict with ``num_nodes``).

    Returns:
        Dictionary with ``nodes`` (default 1), ``gpus_per_node`` (default 0)
        and ``device_used`` (default 0).
    """
    if not isinstance(config, dict):
        config = {}

    device_used = 0
    gpus_per_node = 0
    nodes = 1

    # Try device_involved
    if "device_involved" in config:
        try:
            device_used = int(config.get("device_involved", 0) or 0)
        except (ValueError, TypeError):
            device_used = 0

    # Try single_node config
    single_node = config.get("single_node")
    if isinstance(single_node, dict):
        device_ids = single_node.get("device_ids", [])
        if device_ids:
            try:
                # An explicit device list overrides device_involved.
                device_used = len(device_ids)
            except TypeError:
                # device_ids has no length (e.g. a bare int): keep the
                # count derived from device_involved.
                pass
        gpus_per_node = device_used

    # Try multi_node config; only a dict can carry num_nodes, so a null or
    # scalar value leaves the default of one node.
    multi_node = config.get("multi_node")
    if isinstance(multi_node, dict):
        try:
            nodes = int(multi_node.get("num_nodes", 1) or 1)
        except (ValueError, TypeError):
            nodes = 1

    return {
        "nodes": nodes,
        "gpus_per_node": gpus_per_node,
        "device_used": device_used,
    }
231368

232369
def _build_environment(self, response: Dict[str, Any]) -> Dict[str, Any]:
233370
"""

infinimetrics/hardware/hardware_adapter.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,7 @@ def process(self, test_input: Any) -> Dict[str, Any]:
6464
# Normalize test input to dict format
6565
test_input = self._normalize_test_input(test_input)
6666
if not test_input:
67-
return self._create_error_response(
68-
f"Invalid test_input type: {type(test_input)}"
69-
)
67+
raise ValueError(f"Invalid test_input type: {type(test_input)}")
7068

7169
testcase = test_input.get(InfiniMetricsJson.TESTCASE, "unknown")
7270
config = test_input.get(InfiniMetricsJson.CONFIG, {})
@@ -107,9 +105,17 @@ def process(self, test_input: Any) -> Dict[str, Any]:
107105
InfiniMetricsJson.CONFIG: result_config,
108106
InfiniMetricsJson.METRICS: metrics,
109107
}
108+
110109
except Exception as e:
111-
logger.error(f"Hardware test failed: {e}", exc_info=True)
112-
return self._create_error_response(str(e), test_input)
110+
# Log error with context, then re-raise for Executor to handle
111+
logger.error(
112+
f"HardwareTestAdapter: Test failed for {testcase}\n"
113+
f" Device: {device}\n"
114+
f" Test Type: {test_type}\n"
115+
f" Error: {str(e)}",
116+
exc_info=True
117+
)
118+
raise
113119

114120
def _build_cuda_project(self) -> None:
115121
"""Build CUDA project if needed."""

infinimetrics/inference/inference_adapter.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,17 @@ def __init__(self):
2323
self.config = None
2424
self.mode = "direct"
2525
self.framework = "infinilm"
26-
self._full_payload = None
26+
self._testcase = None
2727

2828
def setup(self, config: Dict[str, Any]) -> None:
2929
"""Initialize inference resources"""
3030
# Get testcase and run_id from injected fields
3131
testcase = config.get("_testcase", "")
3232
run_id = config.get("_run_id", "")
3333

34+
# Store testcase for error reporting
35+
self._testcase = testcase
36+
3437
# Parse mode and framework
3538
if "service" in testcase.lower():
3639
self.mode = "service"
@@ -42,8 +45,6 @@ def setup(self, config: Dict[str, Any]) -> None:
4245
elif "infinilm" in testcase.lower():
4346
self.framework = "infinilm"
4447

45-
self._full_payload = config.get("_full_payload", None)
46-
4748
# Create configuration object
4849
self.config = self._create_inference_config(config)
4950

@@ -107,18 +108,15 @@ def process(self, test_input) -> Dict[str, Any]:
107108
}
108109

109110
except Exception as e:
110-
logger.error(f"Inference test failed: {e}", exc_info=True)
111-
112-
err = self._create_error_response(
113-
error_msg=str(e),
114-
test_input=(
115-
self._full_payload if isinstance(self._full_payload, dict) else None
116-
),
117-
result_code=1,
111+
# Log error with context, then re-raise for Executor to handle
112+
logger.error(
113+
f"InferenceAdapter: Test failed for {self._testcase}\n"
114+
f" Mode: {self.mode}\n"
115+
f" Framework: {self.framework}\n"
116+
f" Error: {str(e)}",
117+
exc_info=True
118118
)
119-
err["success"] = 1
120-
err["resolved"] = {"nodes": 1, "gpus_per_node": 0, "device_used": 0}
121-
return err
119+
raise
122120

123121
def teardown(self) -> None:
124122
"""Cleanup resources"""

0 commit comments

Comments
 (0)