77import logging
88from typing import Any
99
10+ from sqlalchemy import String , cast , or_ , select
1011from sqlalchemy .ext .asyncio import AsyncSession , async_sessionmaker
1112
1213from agent_debugger_sdk .core .events import Checkpoint , EventType , Session , SessionStatus , TraceEvent
1718from collector .intelligence .facade import TraceIntelligence
1819from redaction .pipeline import RedactionPipeline
1920from storage import TraceRepository
21+ from storage .converters import orm_to_event , orm_to_session
22+ from storage .models import EventModel , SessionModel
2023
2124logger = logging .getLogger (__name__ )
2225SESSION_ANALYSIS_CAP = 100
26+ FAILURE_SIMILARITY_THRESHOLD = 0.5
2327
2428
2529def normalize_session (
@@ -400,69 +404,97 @@ async def find_similar_failures(
400404 # Get the failure event
401405 failure_event = await repo .get_event (failure_event_id )
402406 if not failure_event :
403- return []
407+ raise NotFoundError (f"Failure event { failure_event_id } not found" )
408+ if failure_event .session_id != session_id :
409+ raise NotFoundError (
410+ f"Failure event { failure_event_id } was not found in session { session_id } "
411+ )
404412
405413 # Determine failure characteristics
406- error_text = failure_event .error or failure_event .error_message or failure_event .name or ""
407- error_type = failure_event .error_type or ""
408-
409- # Get all sessions with failures
410- all_sessions = await repo .list_sessions (limit = 500 , offset = 0 , sort_by = "started_at" )
411-
412- similar_failures : list [dict [str , Any ]] = []
413-
414- for session in all_sessions :
415- # Skip the current session
416- if session .id == session_id :
417- continue
418-
419- # Skip sessions without errors
420- if session .errors == 0 :
421- continue
422-
423- # Get events from this session to find matching failures
424- try :
425- session_events = await repo .list_events (session .id , limit = 1000 )
426- except Exception :
414+ error_text = _event_error_text (failure_event )
415+ error_type = _event_error_type (failure_event )
416+ candidate_failures = await _load_candidate_failure_events (repo , failure_event , session_id )
417+
418+ best_match_by_session : dict [str , dict [str , Any ]] = {}
419+
420+ for event , session in candidate_failures :
421+ similarity = _calculate_failure_similarity (
422+ failure_event ,
423+ event ,
424+ error_text ,
425+ error_type ,
426+ )
427+ if similarity < FAILURE_SIMILARITY_THRESHOLD :
427428 continue
428429
429- # Find failure events in this session
430- for event in session_events :
431- if not _is_failure_event (event ):
432- continue
433-
434- # Calculate similarity score
435- similarity = _calculate_failure_similarity (
436- failure_event ,
437- event ,
438- error_text ,
439- error_type ,
440- )
441-
442- # Only include reasonably similar failures
443- if similarity >= 0.3 :
444- # Derive failure mode and root cause
445- failure_mode = _derive_failure_mode (event )
446- root_cause = _derive_root_cause (event )
447-
448- similar_failures .append ({
449- "session_id" : session .id ,
450- "agent_name" : session .agent_name ,
451- "framework" : session .framework ,
452- "started_at" : session .started_at ,
453- "failure_type" : str (event .event_type ),
454- "failure_mode" : failure_mode ,
455- "root_cause" : root_cause ,
456- "similarity" : similarity ,
457- "fix_note" : session .fix_note ,
458- })
459- break # Only add one failure per session
430+ failure_summary = {
431+ "session_id" : session .id ,
432+ "agent_name" : session .agent_name ,
433+ "framework" : session .framework ,
434+ "started_at" : session .started_at ,
435+ "failure_type" : str (event .event_type ),
436+ "failure_mode" : _derive_failure_mode (event ),
437+ "root_cause" : _derive_root_cause (event ),
438+ "similarity" : similarity ,
439+ "fix_note" : session .fix_note ,
440+ }
441+ existing = best_match_by_session .get (session .id )
442+ if existing is None or failure_summary ["similarity" ] > existing ["similarity" ]:
443+ best_match_by_session [session .id ] = failure_summary
460444
461445 # Sort by similarity and limit
446+ similar_failures = list (best_match_by_session .values ())
462447 similar_failures .sort (key = lambda x : x ["similarity" ], reverse = True )
463448 return similar_failures [:limit ]
464449
465450
451+ async def _load_candidate_failure_events (
452+ repo : TraceRepository ,
453+ failure_event : TraceEvent ,
454+ session_id : str ,
455+ ) -> list [tuple [TraceEvent , Session ]]:
456+ """Load tenant-scoped failure candidates without per-session N+1 queries."""
457+ failure_event_types = [
458+ str (EventType .ERROR ),
459+ str (EventType .REFUSAL ),
460+ str (EventType .POLICY_VIOLATION ),
461+ str (EventType .BEHAVIOR_ALERT ),
462+ str (EventType .TOOL_RESULT ),
463+ str (EventType .SAFETY_CHECK ),
464+ ]
465+
466+ source_clues = [EventModel .event_type == str (failure_event .event_type )]
467+ source_error_type = _event_error_type (failure_event )
468+ if source_error_type :
469+ source_clues .append (cast (EventModel .data , String ).ilike (f"%{ source_error_type } %" ))
470+ source_tool_name = getattr (failure_event , "tool_name" , None )
471+ if source_tool_name :
472+ source_clues .append (cast (EventModel .data , String ).ilike (f"%{ source_tool_name } %" ))
473+
474+ stmt = (
475+ select (EventModel , SessionModel )
476+ .join (SessionModel , EventModel .session_id == SessionModel .id )
477+ .where (
478+ SessionModel .tenant_id == repo .tenant_id ,
479+ EventModel .tenant_id == repo .tenant_id ,
480+ SessionModel .id != session_id ,
481+ SessionModel .errors > 0 ,
482+ EventModel .event_type .in_ (failure_event_types ),
483+ or_ (* source_clues ),
484+ )
485+ .order_by (SessionModel .started_at .desc (), EventModel .timestamp .desc ())
486+ )
487+ result = await repo .session .execute (stmt )
488+
489+ candidates : list [tuple [TraceEvent , Session ]] = []
490+ for db_event , db_session in result .all ():
491+ event = orm_to_event (db_event )
492+ if not _is_failure_event (event ):
493+ continue
494+ candidates .append ((event , orm_to_session (db_session )))
495+ return candidates
496+
497+
466498def _is_failure_event (event : TraceEvent ) -> bool :
467499 """Check if an event represents a failure."""
468500 return (
@@ -475,6 +507,28 @@ def _is_failure_event(event: TraceEvent) -> bool:
475507 )
476508
477509
510+ def _event_error_text (event : TraceEvent ) -> str :
511+ """Return the most useful error-like text available on an event."""
512+ return (
513+ getattr (event , "error" , None )
514+ or getattr (event , "error_message" , None )
515+ or getattr (event , "reason" , None )
516+ or event .name
517+ or ""
518+ )
519+
520+
521+ def _event_error_type (event : TraceEvent ) -> str :
522+ """Return the most useful error-like type available on an event."""
523+ return (
524+ getattr (event , "error_type" , None )
525+ or getattr (event , "violation_type" , None )
526+ or getattr (event , "alert_type" , None )
527+ or ""
528+ )
529+
530+
531+
478532def _calculate_failure_similarity (
479533 source_event : TraceEvent ,
480534 candidate_event : TraceEvent ,
@@ -492,13 +546,13 @@ def _calculate_failure_similarity(
492546 score += 0.4
493547
494548 # Error type match
495- candidate_error_type = candidate_event . error_type or ""
549+ candidate_error_type = _event_error_type ( candidate_event )
496550 if source_error_type and candidate_error_type :
497551 if source_error_type .lower () == candidate_error_type .lower ():
498552 score += 0.3
499553
500554 # Error text similarity (simple keyword overlap)
501- candidate_error_text = candidate_event . error or candidate_event . error_message or candidate_event . name or ""
555+ candidate_error_text = _event_error_text ( candidate_event )
502556 if source_error_text and candidate_error_text :
503557 source_words = set (source_error_text .lower ().split ())
504558 candidate_words = set (candidate_error_text .lower ().split ())
0 commit comments