fix: resolve critical architecture issues identified in code reviews

praisonai-triage-agent[bot] · MervinPraison · praisonai-triage-agent[bot] · commit 1277a7a08b5b · 2026-04-14T16:06:48.000Z
- Fix reversed checkpoint pruning so the newest entries are kept (list is newest-last)
- Add CHECKPOINTS_PRUNED event type to replace ERROR for normal operations
- Fix ThreadPoolExecutor timeout bypass with explicit executor lifecycle
- Unify AsyncSafeState to use a single thread lock across sync/async contexts
- Fix agent cleanup to target actual live clients (llm_instance, openai_client)
- Sync memory fallback logic across async/structured STM entry points
- Move contextvars import to module level for better performance

Addresses critical concurrency, security, and data integrity issues.

Co-authored-by: Mervin Praison <MervinPraison@users.noreply.github.com>
diff --git a/src/praisonai-agents/praisonaiagents/agent/agent.py b/src/praisonai-agents/praisonaiagents/agent/agent.py
@@ -4501,9 +4501,33 @@ def close(self) -> None:
         except Exception as e:
             logger.warning(f"Memory cleanup failed: {e}")
 
-        # LLM client cleanup
+        # LLM client cleanup - target actual live clients, not model strings
         try:
-            if hasattr(self, 'llm') and self.llm:
+            # Primary cleanup targets - actual live clients
+            if hasattr(self, 'llm_instance') and self.llm_instance:
+                if hasattr(self.llm_instance, 'aclose'):
+                    # Try async close first
+                    try:
+                        import asyncio
+                        if asyncio.iscoroutinefunction(self.llm_instance.aclose):
+                            # We're in sync context, so use asyncio.run() for the cleanup
+                            asyncio.run(self.llm_instance.aclose())
+                        else:
+                            self.llm_instance.aclose()
+                    except Exception:
+                        # Fall back to sync close if async fails
+                        if hasattr(self.llm_instance, 'close'):
+                            self.llm_instance.close()
+                elif hasattr(self.llm_instance, 'close'):
+                    self.llm_instance.close()
+            
+            # Check for OpenAI client (common pattern in agents)
+            if hasattr(self, '_Agent__openai_client') and self._Agent__openai_client:
+                if hasattr(self._Agent__openai_client, 'close'):
+                    self._Agent__openai_client.close()
+            
+            # Legacy fallback - check self.llm._client (but less likely to work)
+            if hasattr(self, 'llm') and self.llm and not isinstance(self.llm, str):
                 llm_client = getattr(self.llm, '_client', None)
                 if llm_client and hasattr(llm_client, 'close'):
                     llm_client.close()
diff --git a/src/praisonai-agents/praisonaiagents/agent/async_safety.py b/src/praisonai-agents/praisonaiagents/agent/async_safety.py
@@ -35,30 +35,8 @@ class DualLock:
     """
     
     def __init__(self):
-        self._thread_lock = threading.Lock()
-        self._async_lock: Optional[asyncio.Lock] = None
-        self._loop_id: Optional[int] = None
-        
-    def _get_async_lock(self) -> asyncio.Lock:
-        """Get or create asyncio.Lock for current event loop."""
-        try:
-            current_loop = asyncio.get_running_loop()
-            current_loop_id = id(current_loop)
-            
-            # Atomic check and create: use thread lock to protect async lock creation
-            with self._thread_lock:
-                # Create new lock if loop changed or first time
-                if self._loop_id != current_loop_id:
-                    self._async_lock = asyncio.Lock()
-                    self._loop_id = current_loop_id
-                    
-                return self._async_lock
-        except RuntimeError:
-            # No event loop running, fall back to thread lock in a new loop
-            with self._thread_lock:
-                if self._async_lock is None:
-                    self._async_lock = asyncio.Lock()
-                return self._async_lock
+        """Initialize with unified thread-safe locking."""
+        self._thread_lock = threading.Lock()  # Single canonical lock for all contexts
     
     @contextmanager
     def sync(self):
@@ -68,10 +46,13 @@ def sync(self):
             
     @asynccontextmanager
     async def async_lock(self):
-        """Acquire lock in asynchronous context using asyncio.Lock."""
-        async_lock = self._get_async_lock()
-        async with async_lock:
+        """Acquire lock in asynchronous context using threading.Lock via asyncio.to_thread()."""
+        # Use asyncio.to_thread to acquire the thread lock without blocking the event loop
+        await asyncio.to_thread(self._thread_lock.acquire)
+        try:
             yield
+        finally:
+            self._thread_lock.release()
             
     def is_async_context(self) -> bool:
         """Check if we're currently in an async context."""
@@ -133,14 +114,12 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         
     async def __aenter__(self):
         """Support for asynchronous context manager protocol."""
-        async_lock = self._lock._get_async_lock()
-        await async_lock.acquire()
+        await asyncio.to_thread(self._lock._thread_lock.acquire)
         return self.value
         
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         """Support for asynchronous context manager protocol."""
-        async_lock = self._lock._get_async_lock()
-        async_lock.release()
+        self._lock._thread_lock.release()
         return None
             
     def get(self) -> Any:
diff --git a/src/praisonai-agents/praisonaiagents/agent/tool_execution.py b/src/praisonai-agents/praisonaiagents/agent/tool_execution.py
@@ -12,6 +12,7 @@
 import logging
 import asyncio
 import inspect
+import contextvars
 import concurrent.futures
 from typing import List, Optional, Any, Dict, Union, TYPE_CHECKING
 
@@ -194,20 +195,31 @@ def _execute_tool_with_context(self, function_name, arguments, state, tool_call_
             tool_timeout = getattr(self, '_tool_timeout', None)
             if tool_timeout and tool_timeout > 0:
                 # Use copy_context to preserve injection context in executor thread
-                import contextvars
                 ctx = contextvars.copy_context()
                 
                 def execute_with_context():
                     with with_injection_context(state):
                         return self._execute_tool_impl(function_name, arguments)
                 
-                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                # Use explicit executor lifecycle to actually bound execution time
+                executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+                try:
                     future = executor.submit(ctx.run, execute_with_context)
                     try:
                         result = future.result(timeout=tool_timeout)
                     except concurrent.futures.TimeoutError:
+                        # Cancel and shutdown immediately to avoid blocking
+                        future.cancel()
+                        executor.shutdown(wait=False, cancel_futures=True)
                         logging.warning(f"Tool {function_name} timed out after {tool_timeout}s")
                         result = {"error": f"Tool timed out after {tool_timeout}s", "timeout": True}
+                    else:
+                        # Normal completion - shutdown gracefully
+                        executor.shutdown(wait=False)
+                finally:
+                    # Ensure executor is always cleaned up
+                    if not executor._shutdown:
+                        executor.shutdown(wait=False)
             else:
                 with with_injection_context(state):
                     result = self._execute_tool_impl(function_name, arguments)
diff --git a/src/praisonai-agents/praisonaiagents/checkpoints/service.py b/src/praisonai-agents/praisonaiagents/checkpoints/service.py
@@ -487,15 +487,15 @@ async def _prune_checkpoints(self):
         
         # Calculate how many to remove
         num_to_remove = len(self._checkpoints) - self.config.max_checkpoints
-        checkpoints_to_remove = self._checkpoints[-num_to_remove:]  # Remove oldest ones
         
-        # Keep only the most recent checkpoints in memory
-        self._checkpoints = self._checkpoints[:self.config.max_checkpoints]
+        # Keep only the most recent checkpoints in memory (newest-last semantics)
+        # Since save() appends (newest last), keep the last N entries
+        self._checkpoints = self._checkpoints[-self.config.max_checkpoints:]
         
         logger.info(f"Pruned {num_to_remove} old checkpoints to stay under limit of {self.config.max_checkpoints}")
         
         # Emit pruning event for any cleanup hooks
-        self._emit(CheckpointEvent.ERROR, {"action": "pruned", "removed_count": num_to_remove})
+        self._emit(CheckpointEvent.CHECKPOINTS_PRUNED, {"action": "pruned", "removed_count": num_to_remove})
     
     async def get_checkpoint(self, checkpoint_id: str) -> Optional[Checkpoint]:
         """Get a specific checkpoint by ID."""
diff --git a/src/praisonai-agents/praisonaiagents/checkpoints/types.py b/src/praisonai-agents/praisonaiagents/checkpoints/types.py
@@ -26,6 +26,7 @@ class CheckpointEvent(str, Enum):
     INITIALIZED = "initialized"
     CHECKPOINT_CREATED = "checkpoint_created"
     CHECKPOINT_RESTORED = "checkpoint_restored"
+    CHECKPOINTS_PRUNED = "checkpoints_pruned"
     ERROR = "error"
 
 
diff --git a/src/praisonai-agents/praisonaiagents/memory/core.py b/src/praisonai-agents/praisonaiagents/memory/core.py
@@ -120,38 +120,39 @@ def store_short_term_structured(self, content: str, metadata: Optional[Dict] = N
         clean_metadata = self._sanitize_metadata(metadata)
         
         # Protocol-driven storage: Try primary adapter first
+        memory_id = ""
         primary_error = None
-        memory_id = None
-        
         try:
             if hasattr(self, 'memory_adapter') and self.memory_adapter:
                 memory_id = self.memory_adapter.store_short_term(content, metadata=clean_metadata, **kwargs)
                 self._log_verbose(f"Stored in {self.provider} STM via adapter: {content[:100]}...")
-                
-                # Auto-promote to long-term memory if quality is high
-                if auto_promote and quality_score >= 7.5:
-                    try:
-                        self.store_long_term(content, clean_metadata, quality_score, user_id, **kwargs)
-                        self._log_verbose(f"Auto-promoted STM content to LTM (score: {quality_score:.2f})")
-                    except Exception as e:
-                        # Auto-promotion failure doesn't affect the primary storage result
-                        logging.warning(f"Failed to auto-promote to LTM: {e}")
-                
-                # Emit memory event for successful storage
-                self._emit_memory_event("store", "short_term", content, clean_metadata)
-                
-                return MemoryResult.success_result(
-                    memory_id=memory_id, 
-                    adapter_used=self.provider,
-                    context={
-                        "quality_score": quality_score,
-                        "auto_promoted": auto_promote and quality_score >= 7.5
-                    }
-                )
         except Exception as e:
             primary_error = str(e)
             self._log_verbose(f"Failed to store in {self.provider} STM: {e}", logging.WARNING)
         
+        # Only proceed with success if we got a valid memory_id 
+        if memory_id:
+            # Auto-promote to long-term memory if quality is high
+            if auto_promote and quality_score >= 7.5:
+                try:
+                    self.store_long_term(content, clean_metadata, quality_score, user_id, **kwargs)
+                    self._log_verbose(f"Auto-promoted STM content to LTM (score: {quality_score:.2f})")
+                except Exception as e:
+                    # Auto-promotion failure doesn't affect the primary storage result
+                    logging.warning(f"Failed to auto-promote to LTM: {e}")
+            
+            # Emit memory event for successful storage
+            self._emit_memory_event("store", "short_term", content, clean_metadata)
+            
+            return MemoryResult.success_result(
+                memory_id=memory_id, 
+                adapter_used=self.provider,
+                context={
+                    "quality_score": quality_score,
+                    "auto_promoted": auto_promote and quality_score >= 7.5
+                }
+            )
+        
         # Fallback to SQLite if available and different from primary adapter
         fallback_error = None
         if hasattr(self, '_sqlite_adapter') and self._sqlite_adapter != getattr(self, 'memory_adapter', None):
@@ -448,13 +449,25 @@ async def store_short_term_async(self, content: str, metadata: Optional[Dict] =
             raw_metadata["user_id"] = user_id
         clean_metadata = self._sanitize_metadata(raw_metadata)
 
-        # Store in SQLite STM
+        # Try primary adapter first (async version)
         memory_id = ""
         try:
-            memory_id = await asyncio.to_thread(self._store_sqlite_stm, content, clean_metadata, quality_score)
+            if hasattr(self, 'memory_adapter') and self.memory_adapter:
+                memory_id = await asyncio.to_thread(
+                    self.memory_adapter.store_short_term, content, metadata=clean_metadata, **kwargs
+                )
+                self._log_verbose(f"Stored in {self.provider} async STM via adapter: {content[:100]}...")
         except Exception as e:
-            logging.error(f"Failed to store in SQLite STM: {e}")
-            return ""
+            self._log_verbose(f"Failed to store in {self.provider} async STM: {e}", logging.WARNING)
+        
+        # Only use SQLite fallback if primary storage failed completely
+        if not memory_id and hasattr(self, '_sqlite_adapter') and self._sqlite_adapter != getattr(self, 'memory_adapter', None):
+            try:
+                memory_id = await asyncio.to_thread(self._store_sqlite_stm, content, clean_metadata, quality_score)
+                self._log_verbose(f"Stored in SQLite async STM as fallback: {content[:100]}...")
+            except Exception as e:
+                logging.error(f"Failed to store in SQLite async STM fallback: {e}")
+                return ""
         
         # Auto-promote to long-term memory if quality is high (async)
         if auto_promote and quality_score >= 7.5:  # High quality threshold