fix: Wire CircuitBreaker and FailoverManager into execution paths (#1539)

praisonai-triage-agent[bot] · MervinPraison · web-flow · commit 7bb5911f4dc8 · 2026-04-24T11:11:38.000+01:00
* fix: Wire CircuitBreaker and FailoverManager into execution paths (fixes #1535)

  - Add FailoverManager integration to LLM._call_with_retry() and _call_with_retry_async()
    * New failover_manager parameter in LLM constructor
    * _switch_to_profile() method for switching auth profiles
    * Mark success/failure and automatic profile switching on retries
  - Add CircuitBreaker integration to tool execution in AgentToolExecution
    * _execute_tool_with_circuit_breaker() wrapper method
    * 5-failure threshold, 60s recovery timeout, graceful degradation
    * Per-tool circuit breakers prevent cascading failures

  Both components were fully implemented but never called from execution paths.
  Now users who configure them get actual resilience protection.

  Co-authored-by: MervinPraison <MervinPraison@users.noreply.github.com>

* fix: Address critical bugs in CircuitBreaker and FailoverManager implementation

  - Fix FailoverManager initialization: properly initialize _current_profile
  - Fix CircuitBreaker failure detection: convert error dicts to exceptions
  - Fix profile switching: update in-flight request parameters in retry loop
  - Add proper typing with FailoverManagerProtocol instead of Any
  - Fix import handling to prevent NameError on CircuitBreakerException
  - Add agent/session context to circuit breaker error returns
  - Allow failover on non-retryable provider failures (bad credentials, etc)

  Addresses critical P1 issues identified by Greptile and CodeRabbit reviews.
  Makes FailoverManager and CircuitBreaker actually functional end-to-end.

  Co-authored-by: Mervin Praison <MervinPraison@users.noreply.github.com>

---------

Co-authored-by: praisonai-triage-agent[bot] <272766704+praisonai-triage-agent[bot]@users.noreply.github.com>
Co-authored-by: MervinPraison <MervinPraison@users.noreply.github.com>
diff --git a/src/praisonai-agents/praisonaiagents/agent/tool_execution.py b/src/praisonai-agents/praisonaiagents/agent/tool_execution.py
@@ -202,7 +202,7 @@ def _execute_tool_with_context(self, function_name, arguments, state, tool_call_
                 
                 def execute_with_context():
                     with with_injection_context(state):
-                        return self._execute_tool_impl(function_name, arguments)
+                        return self._execute_tool_with_circuit_breaker(function_name, arguments)
                 
                 # Use reusable executor to prevent resource leaks
                 if not hasattr(self, '_tool_executor'):
@@ -219,7 +219,7 @@ def execute_with_context():
                     result = {"error": f"Tool timed out after {tool_timeout}s", "timeout": True}
             else:
                 with with_injection_context(state):
-                    result = self._execute_tool_impl(function_name, arguments)
+                    result = self._execute_tool_with_circuit_breaker(function_name, arguments)
             
             # Apply tool output truncation to prevent context overflow
             # Uses context manager budget if enabled, otherwise applies default limit
@@ -594,6 +594,73 @@ async def _check_tool_approval_async(self, function_name, arguments):
             logging.info(f"Using modified arguments: {arguments}")
         return None, arguments
 
+    def _execute_tool_with_circuit_breaker(self, function_name, arguments):
+        """Execute a tool through a per-tool circuit breaker.
+
+        Error dicts returned by the tool are counted as breaker failures, except
+        approval/permission denials, which say nothing about the tool's health.
+
+        Args:
+            function_name: Name of the tool to execute
+            arguments: Arguments for the tool
+
+        Returns:
+            Tool execution result, the tool's own error dict, or a dict with
+            ``circuit_open=True`` when CircuitBreakerException is raised
+        """
+        # Import circuit breaker components first (lazy import for performance)
+        try:
+            from ..tools.circuit_breaker import get_circuit_breaker, CircuitBreakerConfig, CircuitBreakerException
+        except ImportError:
+            # Circuit breaker not available - fallback to direct execution
+            logging.debug("Circuit breaker not available, falling back to direct tool execution")
+            return self._execute_tool_impl(function_name, arguments)
+
+        # Single source of truth for the config below and the remediation hint
+        recovery_timeout = 60.0
+
+        class _ToolFailure(Exception):
+            """Sentinel that carries a tool's error dict through the breaker."""
+            def __init__(self, error_dict):
+                self.error_dict = error_dict
+                super().__init__(error_dict.get("error", "Tool execution failed"))
+
+        def _tool_wrapper():
+            result = self._execute_tool_impl(function_name, arguments)
+            # Convert error dicts to exceptions so circuit breaker can detect failures
+            # Don't treat approval/permission denials as circuit breaker failures
+            if isinstance(result, dict) and result.get("error") and \
+               not result.get("approval_denied") and \
+               not result.get("permission_denied") and \
+               not result.get("approval_error"):
+                raise _ToolFailure(result)
+            return result
+
+        try:
+            # Get or create circuit breaker for this tool
+            config = CircuitBreakerConfig(
+                failure_threshold=5,                # Open after 5 failures
+                recovery_timeout=recovery_timeout,  # Wait 60s before trying half-open
+                timeout=30.0,                       # Tool call timeout
+                graceful_degradation=True           # Return error instead of raising exception
+            )
+            breaker = get_circuit_breaker(f"tool_{function_name}", config)
+            return breaker.call(_tool_wrapper)
+        except _ToolFailure as failure:
+            # Matched by type, so unrelated exceptions that merely expose an
+            # ``error_dict`` attribute still propagate to the caller
+            return failure.error_dict
+        except CircuitBreakerException as e:
+            # Circuit breaker is open - return error dict instead of raising
+            logging.warning(f"Tool '{function_name}' circuit breaker open: {e}")
+            return {
+                "error": f"Tool '{function_name}' circuit breaker open - too many recent failures",
+                "circuit_open": True,
+                "agent_name": getattr(self, "name", None),
+                "session_id": getattr(self, "_session_id", None),
+                "remediation": f"Wait for recovery_timeout ({recovery_timeout:g}s) or investigate recent tool failures.",
+            }
+
     def _execute_tool_impl(self, function_name, arguments):
         """Internal tool execution implementation."""
 
diff --git a/src/praisonai-agents/praisonaiagents/llm/llm.py b/src/praisonai-agents/praisonaiagents/llm/llm.py
@@ -6,11 +6,17 @@
 import inspect
 import asyncio
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union, Literal, Callable, TYPE_CHECKING
+from typing import Any, Dict, List, Optional, Union, Literal, Callable, TYPE_CHECKING, Protocol
 
 if TYPE_CHECKING:
     from rich.console import Console
     from rich.live import Live
+    
+class FailoverManagerProtocol(Protocol):
+    """Structural type for the ``failover_manager`` argument accepted by ``LLM.__init__``."""
+    def get_next_profile(self) -> Optional["AuthProfile"]: ...  # queried at init and after each failure
+    def mark_failure(self, profile: "AuthProfile", error: str, is_rate_limit: bool = False) -> None: ...
+    def mark_success(self, profile: "AuthProfile") -> None: ...  # reported after a call returns
 from pydantic import BaseModel
 import time
 import json
@@ -354,6 +360,7 @@ def __init__(
         web_fetch: Optional[Union[bool, Dict[str, Any]]] = None,
         prompt_caching: Optional[bool] = None,
         claude_memory: Optional[Union[bool, Any]] = None,
+        failover_manager: Optional[FailoverManagerProtocol] = None,
         **extra_settings
     ):
         # Configure logging only once at the class level
@@ -429,6 +436,14 @@ def __init__(
         self._rate_limiter = extra_settings.get('rate_limiter', None)
         self._max_retries = extra_settings.get('max_retries', 3)
         self._retry_delay = extra_settings.get('retry_delay', 60)  # Default 60 seconds
+        
+        # Failover management
+        self._failover_manager = failover_manager
+        self._current_profile = None  # Track current auth profile for failover
+        if self._failover_manager:
+            self._current_profile = self._failover_manager.get_next_profile()
+            if self._current_profile:
+                self._switch_to_profile(self._current_profile)
 
         # Cache for formatted tools and messages
         self._formatted_tools_cache = {}
@@ -685,8 +700,23 @@ def _classify_error_and_should_retry(self, error: Exception, attempt: int = 1) -
             delay = self._parse_retry_delay(str(error)) if is_rate_limit else 0.0
             return "rate_limit" if is_rate_limit else "unknown", is_rate_limit, delay
 
+    def _switch_to_profile(self, profile: "AuthProfile") -> None:
+        """Apply an auth profile; fields it leaves unset revert to the pre-failover values.
+
+        Args:
+            profile: AuthProfile to switch to
+        """
+        # Snapshot once, so an unset field never inherits the previous profile's value
+        base = self.__dict__.setdefault("_pre_failover_settings", {
+            k: getattr(self, k, None) for k in ("api_key", "base_url", "model")})
+        self.api_key = profile.api_key or base["api_key"]
+        self.base_url = profile.base_url or base["base_url"]
+        if (profile.model or base["model"]) != self.model:
+            logging.info(f"Failover: switching from {self.model} to {profile.model or base['model']}")
+        self.model = profile.model or base["model"]
+
     def _call_with_retry(self, func, *args, **kwargs):
-        """Call a function with automatic retry on rate limit errors.
+        """Call a function with automatic retry on rate limit errors and failover support.
 
         Args:
             func: The function to call (e.g., litellm.completion)
@@ -707,16 +737,45 @@ def _call_with_retry(self, func, *args, **kwargs):
                 if self._rate_limiter is not None:
                     self._rate_limiter.acquire()
 
-                return func(*args, **kwargs)
+                result = func(*args, **kwargs)
+                
+                # Mark success if failover is configured
+                if self._failover_manager and self._current_profile:
+                    self._failover_manager.mark_success(self._current_profile)
+                    
+                return result
 
             except Exception as e:
                 category, can_retry, retry_delay = self._classify_error_and_should_retry(e, attempt + 1)
-                if not can_retry:
-                    raise
-
+                
                 last_error = e
                 error_str = str(e)
 
+                # Failover: mark failure and try next profile (do this before early exit)
+                if self._failover_manager and self._current_profile:
+                    is_rate_limit = (category == "rate_limit")
+                    self._failover_manager.mark_failure(
+                        self._current_profile, error_str, is_rate_limit=is_rate_limit
+                    )
+                    next_profile = self._failover_manager.get_next_profile()
+                    if next_profile and next_profile != self._current_profile:
+                        self._switch_to_profile(next_profile)
+                        self._current_profile = next_profile
+                        # Update the kwargs with new profile values for the next retry
+                        if "api_key" in kwargs:
+                            kwargs["api_key"] = self.api_key
+                        if "base_url" in kwargs:
+                            kwargs["base_url"] = self.base_url
+                        if "model" in kwargs:
+                            kwargs["model"] = self.model
+                        # Enable retry for profile switch even if originally non-retryable
+                        can_retry = True
+                        retry_delay = 0.0
+                        logging.info(f"Failover: switched to profile '{next_profile.name}'")
+                
+                if not can_retry:
+                    raise
+
                 if attempt < self._max_retries:
                     logging.warning(
                         f"{category} error hit (attempt {attempt + 1}/{self._max_retries + 1}), "
@@ -746,7 +805,7 @@ def _call_with_retry(self, func, *args, **kwargs):
         raise last_error
 
     async def _call_with_retry_async(self, func, *args, **kwargs):
-        """Async version of _call_with_retry.
+        """Async version of _call_with_retry with failover support.
 
         Args:
             func: The async function to call
@@ -767,16 +826,45 @@ async def _call_with_retry_async(self, func, *args, **kwargs):
                 if self._rate_limiter is not None:
                     await self._rate_limiter.acquire_async()
 
-                return await func(*args, **kwargs)
+                result = await func(*args, **kwargs)
+                
+                # Mark success if failover is configured
+                if self._failover_manager and self._current_profile:
+                    self._failover_manager.mark_success(self._current_profile)
+                    
+                return result
 
             except Exception as e:
                 category, can_retry, retry_delay = self._classify_error_and_should_retry(e, attempt + 1)
-                if not can_retry:
-                    raise
-
+                
                 last_error = e
                 error_str = str(e)
 
+                # Failover: mark failure and try next profile (do this before early exit)
+                if self._failover_manager and self._current_profile:
+                    is_rate_limit = (category == "rate_limit")
+                    self._failover_manager.mark_failure(
+                        self._current_profile, error_str, is_rate_limit=is_rate_limit
+                    )
+                    next_profile = self._failover_manager.get_next_profile()
+                    if next_profile and next_profile != self._current_profile:
+                        self._switch_to_profile(next_profile)
+                        self._current_profile = next_profile
+                        # Update the kwargs with new profile values for the next retry
+                        if "api_key" in kwargs:
+                            kwargs["api_key"] = self.api_key
+                        if "base_url" in kwargs:
+                            kwargs["base_url"] = self.base_url
+                        if "model" in kwargs:
+                            kwargs["model"] = self.model
+                        # Enable retry for profile switch even if originally non-retryable
+                        can_retry = True
+                        retry_delay = 0.0
+                        logging.info(f"Failover: switched to profile '{next_profile.name}'")
+                
+                if not can_retry:
+                    raise
+
                 if attempt < self._max_retries:
                     logging.warning(
                         f"{category} error hit (attempt {attempt + 1}/{self._max_retries + 1}), "