1515from infinimetrics .adapter import BaseAdapter
1616from infinimetrics .input import TestInput
1717from infinimetrics .utils .path_utils import sanitize_filename
18+ from infinimetrics .common .constants import ErrorCode
19+
1820
1921logger = logging .getLogger (__name__ )
2022
2426 "--format=csv,noheader" ,
2527]
2628
29+ AMD_SMI_CANDIDATES = ["amd-smi" , "rocm-smi" ]
30+
2731
2832def _which (cmd : str ) -> Optional [str ]:
2933 try :
@@ -50,6 +54,7 @@ class TestResult:
5054 result_code : int # 0 = success, non-zero = error code
5155 result_file : Optional [str ] = None
5256 skipped : bool = False
57+ config : Optional [Dict [str , Any ]] = None
5358
5459 def to_dict (self ) -> Dict [str , Any ]:
5560 """Convert to lightweight dictionary format for Dispatcher aggregation."""
@@ -59,6 +64,7 @@ def to_dict(self) -> Dict[str, Any]:
5964 "result_code" : self .result_code ,
6065 "result_file" : self .result_file ,
6166 "skipped" : self .skipped ,
67+ "config" : self .config ,
6268 }
6369
6470
@@ -107,8 +113,8 @@ def setup(self) -> None:
107113 config ["_run_id" ] = self .payload .get ("run_id" , "" )
108114 config ["_time" ] = self .payload .get ("time" , None )
109115
110- # Also inject the full payload for adapters that need the complete structure
111- config [ "_full_payload" ] = self .payload
116+ # Initialize test_input from payload
117+ self . test_input = self .payload
112118
113119 self .adapter .setup (config )
114120
@@ -158,13 +164,17 @@ def execute(self) -> TestResult:
158164 logger .info (f"Executor: Running { self .testcase } " )
159165
160166 # Initialize TestResult directly (default: result_code=0)
167+ config = self .payload .get ("config" , {})
161168 test_result = TestResult (
162169 run_id = self .run_id ,
163170 testcase = self .testcase ,
164171 result_code = 0 , # Default to success
165172 result_file = None ,
173+ config = config ,
166174 )
167175
176+ response = {}
177+
168178 try :
169179 # Phase 1: Setup
170180 self .setup ()
@@ -173,15 +183,6 @@ def execute(self) -> TestResult:
173183 logger .debug (f"Executor: Calling adapter.process()" )
174184 response = self .adapter .process (self .test_input )
175185
176- # Process response (0 = success, non-zero = error code)
177- test_result .result_code = (
178- int (response .get ("result_code" , 1 )) if isinstance (response , dict ) else 1
179- )
180- if test_result .result_code != 0 :
181- logger .warning (
182- f"Executor: Adapter failed with error code { test_result .result_code } "
183- )
184-
185186 # Enrich environment ONLY if missing
186187 if isinstance (response , dict ) and "environment" not in response :
187188 env = self ._build_environment (response )
@@ -220,14 +221,150 @@ def execute(self) -> TestResult:
220221
221222 return test_result
222223
224+ except subprocess .TimeoutExpired as e :
225+ # Timeout errors (possible hardware hang)
226+ logger .error (
227+ f"Executor: STABILITY CHECK FAILED for { self .testcase } \n "
228+ f" Issue Type: timeout\n "
229+ f" Severity: CRITICAL\n "
230+ f" Analysis: Test timed out. Hardware may be hung or overloaded.\n "
231+ f" Error: { str (e )[:300 ]} "
232+ )
233+ test_result .result_code = ErrorCode .TIMEOUT
234+ # Build error response for saving
235+ response = self ._build_error_response (str (e ), ErrorCode .TIMEOUT )
236+
237+ except ValueError as e :
238+ # Configuration or input validation errors
239+ logger .warning (
240+ f"Executor: Test failed for { self .testcase } \n "
241+ f" Issue Type: configuration_error\n "
242+ f" Error: { str (e )[:300 ]} "
243+ )
244+ test_result .result_code = ErrorCode .CONFIG
245+ # Build error response for saving
246+ response = self ._build_error_response (str (e ), ErrorCode .CONFIG )
247+
248+ except RuntimeError as e :
249+ # RuntimeError: analyze error message for specific patterns
250+ error_msg = str (e ).lower ()
251+
252+ # Check for memory insufficient errors
253+ memory_keywords = [
254+ "out of memory" , "oom" , "memory" , "memory leak" ,
255+ "allocate" , "allocation failed" , "insufficient memory"
256+ ]
257+ if any (kw in error_msg for kw in memory_keywords ):
258+ logger .error (
259+ f"Executor: STABILITY CHECK FAILED for { self .testcase } \n "
260+ f" Issue Type: memory\n "
261+ f" Severity: CRITICAL\n "
262+ f" Analysis: Memory allocation failed. Possible causes: insufficient memory, memory leak, or test data too large.\n "
263+ f" Error: { str (e )[:300 ]} "
264+ )
265+ test_result .result_code = ErrorCode .SYSTEM
266+ # Build error response for saving
267+ response = self ._build_error_response (str (e ), ErrorCode .SYSTEM )
268+ else :
269+ # Other RuntimeError
270+ logger .warning (
271+ f"Executor: Test failed for { self .testcase } \n "
272+ f" Issue Type: runtime_error\n "
273+ f" Error: { str (e )[:300 ]} "
274+ )
275+ test_result .result_code = ErrorCode .GENERIC
276+ # Build error response for saving
277+ response = self ._build_error_response (str (e ), ErrorCode .GENERIC )
278+
223279 except Exception as e :
224- logger .error (f"Executor: { self .testcase } failed: { e } " , exc_info = True )
280+ # Unexpected exceptions
281+ logger .error (
282+ f"Executor: { self .testcase } failed with unexpected exception: { e } " ,
283+ exc_info = True
284+ )
285+ test_result .result_code = ErrorCode .GENERIC
286+ # Build error response for saving
287+ response = self ._build_error_response (str (e ), ErrorCode .GENERIC )
225288
226- # Still run teardown on failure
227- self ._save_result (None )
228- test_result .result_code = 1 # Failure
289+ finally :
290+ # Always save result (even on failure)
291+ try :
292+ if not test_result .result_file :
293+ result_file = self ._save_result (response )
294+ test_result .result_file = result_file
295+ except Exception as teardown_error :
296+ logger .error (f"Executor: Failed to save result: { teardown_error } " )
229297
230- return test_result
298+ return test_result
299+
300+ def _build_error_response (self , error_msg : str , result_code : int ) -> Dict [str , Any ]:
301+ """
302+ Build a response dict containing error information for saving to disk.
303+
304+ Args:
305+ error_msg: Error message string
306+ result_code: Error result code
307+
308+ Returns:
309+ Dictionary with basic test info and error details
310+ """
311+ config = self .payload .get ("config" , {})
312+
313+ # Create a cleaned config without injected metadata
314+ cleaned_config = {
315+ k : v for k , v in config .items ()
316+ if not k .startswith ("_" ) # Skip _testcase, _run_id, _time
317+ }
318+
319+ # Extract device information
320+ resolved = self ._extract_device_info (config )
321+
322+ return {
323+ "run_id" : self .run_id ,
324+ "testcase" : self .testcase ,
325+ "time" : datetime .now ().strftime ("%Y-%m-%d %H:%M:%S" ),
326+ "result_code" : result_code ,
327+ "error_msg" : error_msg ,
328+ "success" : 1 , # 1 = failure
329+ "config" : cleaned_config ,
330+ "resolved" : resolved ,
331+ }
332+
333+ def _extract_device_info (self , config : Dict [str , Any ]) -> Dict [str , Any ]:
334+ """Extract device information from config."""
335+ device_used = 0
336+ gpus_per_node = 0
337+ nodes = 1
338+
339+ # Try device_involved
340+ if "device_involved" in config :
341+ try :
342+ device_used = int (config .get ("device_involved" , 0 ) or 0 )
343+ except (ValueError , TypeError ):
344+ device_used = 0
345+
346+ # Try single_node config
347+ if isinstance (config .get ("single_node" ), dict ):
348+ single_node = config ["single_node" ]
349+ device_ids = single_node .get ("device_ids" , [])
350+ if device_ids :
351+ device_used = len (device_ids )
352+ gpus_per_node = device_used
353+ else :
354+ gpus_per_node = device_used
355+
356+ # Try multi_node config
357+ if "multi_node" in config :
358+ try :
359+ nodes = int (config .get ("multi_node" , {}).get ("num_nodes" , 1 ) or 1 )
360+ except (ValueError , TypeError ):
361+ nodes = 1
362+
363+ return {
364+ "nodes" : nodes ,
365+ "gpus_per_node" : gpus_per_node ,
366+ "device_used" : device_used ,
367+ }
231368
232369 def _build_environment (self , response : Dict [str , Any ]) -> Dict [str , Any ]:
233370 """
0 commit comments