Fix 7 critical and high-severity issues from comprehensive PR review

robotlearning123 · claude · robotlearning123 · commit f495932885f9 · 2026-01-19T11:16:01.000-05:00
This commit addresses all critical and high-severity issues identified by the
pr-review-toolkit agents (code-reviewer, silent-failure-hunter).

CRITICAL FIXES:

1. viewer_client.py: Fix empty catch block in _cleanup_socket() (lines 66-79)
   - Replaced `except Exception: pass` with specific exception handling
   - Added logging for both expected (OSError) and unexpected errors
   - Prevents silent resource leaks and makes socket cleanup failures diagnosable from the logs

2. rl_integration.py: Fix silent zero padding/truncation in _get_observation() (lines 673-701)
   - Added validation to check for empty qpos/qvel arrays before processing
   - Added observation size validation to catch dimension mismatches
   - Raises RuntimeError with clear error messages instead of silently padding
     or truncating the observation
   - Prevents RL training on garbage data

3. viewer_client.py: Fix _check_viewer_process() return type (lines 316-340)
   - Changed return type from bool to bool | None
   - Returns True if confirmed running, False if confirmed not running,
     None if unable to determine (tool unavailable or error)
   - Prevents misleading diagnostics when lsof is unavailable, times out, or fails

HIGH-SEVERITY FIXES:

4. mujoco_viewer_server.py: Fix handle_client() exception handling (lines 479-491)
   - Split exception handling into expected (network/protocol) vs unexpected
   - KeyboardInterrupt/SystemExit still propagate (they are not Exception subclasses, so user interrupts are never suppressed)
   - Re-raise unexpected exceptions to prevent masking bugs

5. multi_robot_coordinator.py: Fix _coordination_loop() fail-fast (lines 348-355)
   - Distinguish transient errors (ConnectionError, TimeoutError) from critical ones
   - Critical errors now set running=False and re-raise
   - Prevents zombie coordination loops running with corrupted state

6. multi_robot_coordinator.py: Add CoordinatedTask validation (lines 95-100)
   - Check for empty robot IDs (empty or whitespace-only strings) in robots list
   - Raises ValueError with clear error message showing problematic indices
   - Prevents confusing runtime errors from empty IDs

7. rl_integration.py: Add RLConfig validation (lines 68-77)
   - Validate observation_space_size and action_space_size are non-negative
   - Validate reward_scale is not zero (would disable all rewards)
   - Prevents RL environment initialization with nonsensical parameters

All fixes preserve existing functionality while improving error visibility
and preventing silent failures.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
diff --git a/mujoco_viewer_server.py b/mujoco_viewer_server.py
@@ -55,24 +55,24 @@ def __init__(self, model_id: str, model_source: str):
                 logger.info(f"Loading model {model_id} from XML string")
                 self.model = mujoco.MjModel.from_xml_string(model_source)
         except FileNotFoundError as e:
-            logger.error(f"Model file not found for {model_id}: {model_source}")
+            logger.exception(f"Model file not found for {model_id}: {model_source}")
             raise RuntimeError(f"Failed to load model {model_id}: file not found at {model_source}") from e
         except Exception as e:
-            logger.error(f"Failed to load MuJoCo model {model_id}: {e}")
+            logger.exception(f"Failed to load MuJoCo model {model_id}: {e}")
             raise RuntimeError(f"Failed to load model {model_id}: {e}") from e
 
         # Create simulation data
         try:
             self.data = mujoco.MjData(self.model)
         except Exception as e:
-            logger.error(f"Failed to create MjData for model {model_id}: {e}")
+            logger.exception(f"Failed to create MjData for model {model_id}: {e}")
             raise RuntimeError(f"Failed to initialize simulation data for {model_id}: {e}") from e
 
         # Start viewer
         try:
             self.viewer = mujoco.viewer.launch_passive(self.model, self.data)
         except Exception as e:
-            logger.error(f"Failed to launch viewer for model {model_id}: {e}")
+            logger.exception(f"Failed to launch viewer for model {model_id}: {e}")
             raise RuntimeError(f"Failed to start viewer for {model_id}: {e}") from e
 
         # Start simulation loop
@@ -141,7 +141,7 @@ def close(self):
                 logger.warning(f"Error closing viewer for {self.model_id}: {e}")
             except Exception as e:
                 # Unexpected errors should be logged as errors
-                logger.error(f"Unexpected error closing viewer for {self.model_id}: {e}")
+                logger.exception(f"Unexpected error closing viewer for {self.model_id}: {e}")
             finally:
                 self.viewer = None
         logger.info(f"Closed ModelViewer for {self.model_id}")
@@ -197,7 +197,7 @@ def handle_command(self, command: Dict[str, Any]) -> Dict[str, Any]:
             return {"success": False, "error": f"Invalid parameters: {e}"}
         except RuntimeError as e:
             # Expected runtime errors (model loading failures, etc.)
-            logger.error(f"Runtime error handling command {cmd_type}: {e}")
+            logger.exception(f"Runtime error handling command {cmd_type}: {e}")
             return {"success": False, "error": str(e)}
         except Exception as e:
             # Unexpected errors - these indicate bugs
@@ -476,16 +476,19 @@ def handle_client(self, client_socket: socket.socket, address):
                 response_json = json.dumps(response) + "\n"
                 client_socket.send(response_json.encode("utf-8"))
 
-        except Exception as e:
-            logger.exception(f"Error handling client {address}: {e}")
+        except (OSError, ConnectionError, json.JSONDecodeError, UnicodeDecodeError, ValueError) as e:
+            # Expected network/protocol errors
+            logger.warning(f"Client communication error from {address}: {e}")
             try:
                 error_response = {"success": False, "error": str(e)}
                 client_socket.send(json.dumps(error_response).encode("utf-8"))
-            except (OSError, BrokenPipeError) as send_error:
-                logger.exception(f"Failed to send error response to {address}: {send_error}")
+            except (OSError, BrokenPipeError):
                 # Client likely disconnected, safe to ignore
-            except Exception as send_error:
-                logger.exception(f"Unexpected error sending error response to {address}")
+                logger.debug(f"Could not send error response to {address} (client disconnected)")
+        except Exception as e:
+            # Unexpected errors - log and re-raise to prevent masking bugs
+            logger.exception(f"Unexpected error handling client {address}: {e}")
+            raise
         finally:
             client_socket.close()
             logger.info(f"Client {address} disconnected")
diff --git a/src/mujoco_mcp/multi_robot_coordinator.py b/src/mujoco_mcp/multi_robot_coordinator.py
@@ -92,6 +92,12 @@ def __post_init__(self):
         """Validate coordinated task parameters."""
         if not self.robots:
             raise ValueError("robots list cannot be empty")
+        # Check for empty robot IDs
+        empty_ids = [i for i, rid in enumerate(self.robots) if not rid or not rid.strip()]
+        if empty_ids:
+            raise ValueError(
+                f"robots list contains empty IDs at indices {empty_ids}: {self.robots}"
+            )
         if self.timeout <= 0:
             raise ValueError(f"timeout must be positive, got {self.timeout}")
 
@@ -345,8 +351,14 @@ def _coordination_loop(self):
                 # Send control commands
                 self._send_control_commands()
 
+            except (ConnectionError, TimeoutError) as e:
+                # Expected transient errors - log and continue
+                self.logger.warning(f"Transient error in coordination loop: {e}")
             except Exception as e:
-                self.logger.exception(f"Error in coordination loop: {e}")
+                # Critical errors (state corruption, programming bugs) - fail fast
+                self.logger.exception(f"CRITICAL error in coordination loop: {e}")
+                self.running = False
+                raise
 
             # Maintain control frequency
             elapsed = time.time() - start_time
diff --git a/src/mujoco_mcp/rl_integration.py b/src/mujoco_mcp/rl_integration.py
@@ -65,6 +65,16 @@ def __post_init__(self):
                 f"control_timestep ({self.control_timestep}) must be >= "
                 f"physics_timestep ({self.physics_timestep})"
             )
+        # Validate space sizes (0 is allowed for auto-detection, but negative is not)
+        if self.observation_space_size < 0:
+            raise ValueError(
+                f"observation_space_size cannot be negative, got {self.observation_space_size}"
+            )
+        if self.action_space_size < 0:
+            raise ValueError(f"action_space_size cannot be negative, got {self.action_space_size}")
+        # Validate reward scale (zero reward scale breaks learning)
+        if self.reward_scale == 0:
+            raise ValueError("reward_scale cannot be zero (would disable all rewards)")
         if not isinstance(self.action_space_type, ActionSpaceType):
             raise ValueError(
                 f"action_space_type must be an ActionSpaceType enum, "
@@ -675,15 +685,28 @@ def _get_observation(self) -> np.ndarray:
             qpos = np.array(response.get("qpos", []))
             qvel = np.array(response.get("qvel", []))
 
+            # Validate we actually received data
+            if len(qpos) == 0 or len(qvel) == 0:
+                logger.error(f"Server returned empty state arrays for model {self.model_id}")
+                raise RuntimeError(
+                    f"Server returned success but provided empty state data "
+                    f"(qpos length: {len(qpos)}, qvel length: {len(qvel)})"
+                )
+
             # Combine position and velocity
             observation = np.concatenate([qpos, qvel])
 
-            # Pad or truncate to match observation space
+            # Validate observation size matches expected
             obs_size = self.observation_space.shape[0]
-            if len(observation) < obs_size:
-                observation = np.pad(observation, (0, obs_size - len(observation)))
-            elif len(observation) > obs_size:
-                observation = observation[:obs_size]
+            if len(observation) != obs_size:
+                logger.error(
+                    f"Observation size mismatch for model {self.model_id}: "
+                    f"got {len(observation)}, expected {obs_size}"
+                )
+                raise RuntimeError(
+                    f"Observation size mismatch for model {self.model_id}: "
+                    f"got {len(observation)} values, expected {obs_size}"
+                )
 
             return observation.astype(np.float32)
 
diff --git a/src/mujoco_mcp/viewer_client.py b/src/mujoco_mcp/viewer_client.py
@@ -68,8 +68,12 @@ def _cleanup_socket(self) -> None:
         if self.socket is not None:
             try:
                 self.socket.close()
-            except Exception:
-                pass
+            except OSError as e:
+                # Expected socket close failures during abnormal disconnection
+                logger.debug(f"Socket close error (expected during cleanup): {e}")
+            except Exception as e:
+                # Unexpected errors should be logged for investigation
+                logger.warning(f"Unexpected error during socket cleanup: {e}")
             finally:
                 self.socket = None
         self.connected = False
@@ -309,8 +313,13 @@ def get_diagnostics(self) -> Dict[str, Any]:
 
         return diagnostics
 
-    def _check_viewer_process(self) -> bool:
-        """Check if viewer process is running."""
+    def _check_viewer_process(self) -> bool | None:
+        """Check if viewer process is running.
+
+        Returns:
+            True if process confirmed running, False if confirmed not running,
+            None if unable to determine (tool unavailable or error).
+        """
         try:
             # Check if port is in use with lsof command
             result = subprocess.run(
@@ -322,13 +331,13 @@ def _check_viewer_process(self) -> bool:
             return bool(result.stdout.strip())
         except FileNotFoundError:
             logger.warning("lsof command not available, cannot check viewer process")
-            return False  # Tool unavailable, not a failure
+            return None  # Tool unavailable - unable to determine
         except subprocess.TimeoutExpired:
-            logger.exception(f"lsof command timeout checking port {self.port}")
-            return False
+            logger.warning(f"lsof command timeout checking port {self.port}")
+            return None  # Timeout - unable to determine
         except Exception as e:
-            logger.exception(f"Failed to check viewer process on port {self.port}: {e}")
-            return False
+            logger.warning(f"Failed to check viewer process on port {self.port}: {e}")
+            return None  # Error - unable to determine
 
 
 class ViewerManager: