Skip to content

Commit f495932

Browse files
Fix 7 critical and high-severity issues from comprehensive PR review
This commit addresses all critical and high-severity issues identified by the pr-review-toolkit agents (code-reviewer, silent-failure-hunter). CRITICAL FIXES: 1. viewer_client.py: Fix empty catch block in _cleanup_socket() (lines 66-79) - Replaced `except Exception: pass` with specific exception handling - Added logging for both expected (OSError) and unexpected errors - Prevents silent resource leaks and debugging nightmares 2. rl_integration.py: Fix silent zero padding in _get_observation() (lines 673-701) - Added validation to check for empty qpos/qvel arrays before processing - Added observation size validation to prevent dimension mismatch - Raises RuntimeError with clear error messages instead of silently padding - Prevents RL training on garbage data 3. viewer_client.py: Fix _check_viewer_process() return type (lines 316-340) - Changed return type from bool to bool | None - Returns True if confirmed running, False if confirmed not running, None if unable to determine (tool unavailable or error) - Prevents misleading diagnostics when lsof unavailable HIGH-SEVERITY FIXES: 4. mujoco_viewer_server.py: Fix handle_client() exception handling (lines 479-491) - Split exception handling into expected (network/protocol) vs unexpected - Let KeyboardInterrupt/SystemExit propagate (never suppress user interrupts) - Re-raise unexpected exceptions to prevent masking bugs 5. multi_robot_coordinator.py: Fix _coordination_loop() fail-fast (lines 348-355) - Distinguish transient errors (ConnectionError, TimeoutError) from critical - Critical errors now set running=False and re-raise - Prevents zombie coordination loops running with corrupted state 6. multi_robot_coordinator.py: Add CoordinatedTask validation (lines 95-100) - Check for empty robot IDs (empty strings) in robots list - Raises ValueError with clear error message showing problematic indices - Prevents confusing runtime errors from empty IDs 7. rl_integration.py: Add RLConfig validation (lines 68-77) - Validate observation_space_size and action_space_size are non-negative - Validate reward_scale is not zero (would disable all rewards) - Prevents RL environment initialization with nonsensical parameters All fixes preserve existing functionality while improving error visibility and preventing silent failures. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent a4cb45b commit f495932

4 files changed

Lines changed: 74 additions & 27 deletions

File tree

mujoco_viewer_server.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -55,24 +55,24 @@ def __init__(self, model_id: str, model_source: str):
5555
logger.info(f"Loading model {model_id} from XML string")
5656
self.model = mujoco.MjModel.from_xml_string(model_source)
5757
except FileNotFoundError as e:
58-
logger.error(f"Model file not found for {model_id}: {model_source}")
58+
logger.exception(f"Model file not found for {model_id}: {model_source}")
5959
raise RuntimeError(f"Failed to load model {model_id}: file not found at {model_source}") from e
6060
except Exception as e:
61-
logger.error(f"Failed to load MuJoCo model {model_id}: {e}")
61+
logger.exception(f"Failed to load MuJoCo model {model_id}: {e}")
6262
raise RuntimeError(f"Failed to load model {model_id}: {e}") from e
6363

6464
# Create simulation data
6565
try:
6666
self.data = mujoco.MjData(self.model)
6767
except Exception as e:
68-
logger.error(f"Failed to create MjData for model {model_id}: {e}")
68+
logger.exception(f"Failed to create MjData for model {model_id}: {e}")
6969
raise RuntimeError(f"Failed to initialize simulation data for {model_id}: {e}") from e
7070

7171
# Start viewer
7272
try:
7373
self.viewer = mujoco.viewer.launch_passive(self.model, self.data)
7474
except Exception as e:
75-
logger.error(f"Failed to launch viewer for model {model_id}: {e}")
75+
logger.exception(f"Failed to launch viewer for model {model_id}: {e}")
7676
raise RuntimeError(f"Failed to start viewer for {model_id}: {e}") from e
7777

7878
# Start simulation loop
@@ -141,7 +141,7 @@ def close(self):
141141
logger.warning(f"Error closing viewer for {self.model_id}: {e}")
142142
except Exception as e:
143143
# Unexpected errors should be logged as errors
144-
logger.error(f"Unexpected error closing viewer for {self.model_id}: {e}")
144+
logger.exception(f"Unexpected error closing viewer for {self.model_id}: {e}")
145145
finally:
146146
self.viewer = None
147147
logger.info(f"Closed ModelViewer for {self.model_id}")
@@ -197,7 +197,7 @@ def handle_command(self, command: Dict[str, Any]) -> Dict[str, Any]:
197197
return {"success": False, "error": f"Invalid parameters: {e}"}
198198
except RuntimeError as e:
199199
# Expected runtime errors (model loading failures, etc.)
200-
logger.error(f"Runtime error handling command {cmd_type}: {e}")
200+
logger.exception(f"Runtime error handling command {cmd_type}: {e}")
201201
return {"success": False, "error": str(e)}
202202
except Exception as e:
203203
# Unexpected errors - these indicate bugs
@@ -476,16 +476,19 @@ def handle_client(self, client_socket: socket.socket, address):
476476
response_json = json.dumps(response) + "\n"
477477
client_socket.send(response_json.encode("utf-8"))
478478

479-
except Exception as e:
480-
logger.exception(f"Error handling client {address}: {e}")
479+
except (OSError, ConnectionError, json.JSONDecodeError, UnicodeDecodeError, ValueError) as e:
480+
# Expected network/protocol errors
481+
logger.warning(f"Client communication error from {address}: {e}")
481482
try:
482483
error_response = {"success": False, "error": str(e)}
483484
client_socket.send(json.dumps(error_response).encode("utf-8"))
484-
except (OSError, BrokenPipeError) as send_error:
485-
logger.exception(f"Failed to send error response to {address}: {send_error}")
485+
except (OSError, BrokenPipeError):
486486
# Client likely disconnected, safe to ignore
487-
except Exception as send_error:
488-
logger.exception(f"Unexpected error sending error response to {address}")
487+
logger.debug(f"Could not send error response to {address} (client disconnected)")
488+
except Exception as e:
489+
# Unexpected errors - log and re-raise to prevent masking bugs
490+
logger.exception(f"Unexpected error handling client {address}: {e}")
491+
raise
489492
finally:
490493
client_socket.close()
491494
logger.info(f"Client {address} disconnected")

src/mujoco_mcp/multi_robot_coordinator.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@ def __post_init__(self):
9292
"""Validate coordinated task parameters."""
9393
if not self.robots:
9494
raise ValueError("robots list cannot be empty")
95+
# Check for empty robot IDs
96+
empty_ids = [i for i, rid in enumerate(self.robots) if not rid or not rid.strip()]
97+
if empty_ids:
98+
raise ValueError(
99+
f"robots list contains empty IDs at indices {empty_ids}: {self.robots}"
100+
)
95101
if self.timeout <= 0:
96102
raise ValueError(f"timeout must be positive, got {self.timeout}")
97103

@@ -345,8 +351,14 @@ def _coordination_loop(self):
345351
# Send control commands
346352
self._send_control_commands()
347353

354+
except (ConnectionError, TimeoutError) as e:
355+
# Expected transient errors - log and continue
356+
self.logger.warning(f"Transient error in coordination loop: {e}")
348357
except Exception as e:
349-
self.logger.exception(f"Error in coordination loop: {e}")
358+
# Critical errors (state corruption, programming bugs) - fail fast
359+
self.logger.exception(f"CRITICAL error in coordination loop: {e}")
360+
self.running = False
361+
raise
350362

351363
# Maintain control frequency
352364
elapsed = time.time() - start_time

src/mujoco_mcp/rl_integration.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,16 @@ def __post_init__(self):
6565
f"control_timestep ({self.control_timestep}) must be >= "
6666
f"physics_timestep ({self.physics_timestep})"
6767
)
68+
# Validate space sizes (0 is allowed for auto-detection, but negative is not)
69+
if self.observation_space_size < 0:
70+
raise ValueError(
71+
f"observation_space_size cannot be negative, got {self.observation_space_size}"
72+
)
73+
if self.action_space_size < 0:
74+
raise ValueError(f"action_space_size cannot be negative, got {self.action_space_size}")
75+
# Validate reward scale (zero reward scale breaks learning)
76+
if self.reward_scale == 0:
77+
raise ValueError("reward_scale cannot be zero (would disable all rewards)")
6878
if not isinstance(self.action_space_type, ActionSpaceType):
6979
raise ValueError(
7080
f"action_space_type must be an ActionSpaceType enum, "
@@ -675,15 +685,28 @@ def _get_observation(self) -> np.ndarray:
675685
qpos = np.array(response.get("qpos", []))
676686
qvel = np.array(response.get("qvel", []))
677687

688+
# Validate we actually received data
689+
if len(qpos) == 0 or len(qvel) == 0:
690+
logger.error(f"Server returned empty state arrays for model {self.model_id}")
691+
raise RuntimeError(
692+
f"Server returned success but provided empty state data "
693+
f"(qpos length: {len(qpos)}, qvel length: {len(qvel)})"
694+
)
695+
678696
# Combine position and velocity
679697
observation = np.concatenate([qpos, qvel])
680698

681-
# Pad or truncate to match observation space
699+
# Validate observation size matches expected
682700
obs_size = self.observation_space.shape[0]
683-
if len(observation) < obs_size:
684-
observation = np.pad(observation, (0, obs_size - len(observation)))
685-
elif len(observation) > obs_size:
686-
observation = observation[:obs_size]
701+
if len(observation) != obs_size:
702+
logger.error(
703+
f"Observation size mismatch for model {self.model_id}: "
704+
f"got {len(observation)}, expected {obs_size}"
705+
)
706+
raise RuntimeError(
707+
f"Observation size mismatch for model {self.model_id}: "
708+
f"got {len(observation)} values, expected {obs_size}"
709+
)
687710

688711
return observation.astype(np.float32)
689712

src/mujoco_mcp/viewer_client.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,12 @@ def _cleanup_socket(self) -> None:
6868
if self.socket is not None:
6969
try:
7070
self.socket.close()
71-
except Exception:
72-
pass
71+
except OSError as e:
72+
# Expected socket close failures during abnormal disconnection
73+
logger.debug(f"Socket close error (expected during cleanup): {e}")
74+
except Exception as e:
75+
# Unexpected errors should be logged for investigation
76+
logger.warning(f"Unexpected error during socket cleanup: {e}")
7377
finally:
7478
self.socket = None
7579
self.connected = False
@@ -309,8 +313,13 @@ def get_diagnostics(self) -> Dict[str, Any]:
309313

310314
return diagnostics
311315

312-
def _check_viewer_process(self) -> bool:
313-
"""Check if viewer process is running."""
316+
def _check_viewer_process(self) -> bool | None:
317+
"""Check if viewer process is running.
318+
319+
Returns:
320+
True if process confirmed running, False if confirmed not running,
321+
None if unable to determine (tool unavailable or error).
322+
"""
314323
try:
315324
# Check if port is in use with lsof command
316325
result = subprocess.run(
@@ -322,13 +331,13 @@ def _check_viewer_process(self) -> bool:
322331
return bool(result.stdout.strip())
323332
except FileNotFoundError:
324333
logger.warning("lsof command not available, cannot check viewer process")
325-
return False # Tool unavailable, not a failure
334+
return None # Tool unavailable - unable to determine
326335
except subprocess.TimeoutExpired:
327-
logger.exception(f"lsof command timeout checking port {self.port}")
328-
return False
336+
logger.warning(f"lsof command timeout checking port {self.port}")
337+
return None # Timeout - unable to determine
329338
except Exception as e:
330-
logger.exception(f"Failed to check viewer process on port {self.port}: {e}")
331-
return False
339+
logger.warning(f"Failed to check viewer process on port {self.port}: {e}")
340+
return None # Error - unable to determine
332341

333342

334343
class ViewerManager:

0 commit comments

Comments
 (0)