-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Fixes NaN after env reset by syncing body_q before collision detection #5176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9b5288a
3ebe17e
36b7659
87ed2b1
f4f988a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -93,6 +93,7 @@ class NewtonManager(PhysicsManager): | |
| _collision_pipeline = None | ||
| _newton_contact_sensors: dict = {} # Maps sensor_key to NewtonContactSensor | ||
| _report_contacts: bool = False | ||
| _fk_dirty: bool = False | ||
|
|
||
| # CUDA graphing | ||
| _graph = None | ||
|
|
@@ -158,6 +159,7 @@ def reset(cls, soft: bool = False) -> None: | |
| def forward(cls) -> None: | ||
| """Update articulation kinematics without stepping physics.""" | ||
| eval_fk(cls._model, cls._state_0.joint_q, cls._state_0.joint_qd, cls._state_0, None) | ||
| cls._fk_dirty = False | ||
|
|
||
| @classmethod | ||
| def pre_render(cls) -> None: | ||
|
|
@@ -298,6 +300,13 @@ def step(cls) -> None: | |
| else: | ||
| logger.warning("Newton deferred CUDA graph capture failed; using eager execution") | ||
|
|
||
| # Ensure body_q is up-to-date before collision detection. | ||
| # After env resets, joint_q is written but body_q (used by | ||
| # broadphase/narrowphase) is stale until FK runs. | ||
| if cls._fk_dirty and cls._needs_collision_pipeline: | ||
| eval_fk(cls._model, cls._state_0.joint_q, cls._state_0.joint_qd, cls._state_0, None) | ||
| cls._fk_dirty = False | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The unconditional Good. |
||
| # Step simulation (graphed or not; _graph is None when capture is disabled or failed) | ||
| if cfg is not None and cfg.use_cuda_graph and cls._graph is not None and "cuda" in device: # type: ignore[union-attr] | ||
| wp.capture_launch(cls._graph) | ||
|
|
@@ -358,6 +367,7 @@ def clear(cls): | |
| cls._collision_pipeline = None | ||
| cls._newton_contact_sensors = {} | ||
| cls._report_contacts = False | ||
| cls._fk_dirty = False | ||
| cls._graph = None | ||
| cls._graph_capture_pending = False | ||
| cls._newton_stage_path = None | ||
|
|
@@ -377,6 +387,16 @@ def add_model_change(cls, change: SolverNotifyFlags) -> None: | |
| """Register a model change to notify the solver.""" | ||
| cls._model_changes.add(change) | ||
|
|
||
| @classmethod | ||
| def invalidate_fk(cls) -> None: | ||
| """Mark forward kinematics as needing recomputation. | ||
|
|
||
| Called by articulation write methods that modify ``joint_q`` or root | ||
| transforms. The flag is checked in :meth:`step` before collision | ||
| detection to ensure ``body_q`` is up-to-date. | ||
| """ | ||
| cls._fk_dirty = True | ||
|
|
||
| @classmethod | ||
| def start_simulation(cls) -> None: | ||
| """Start simulation by finalizing model and initializing state. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2406,5 +2406,86 @@ def test_write_joint_frictions_to_sim(sim, num_articulations, device, add_ground | |
| assert torch.allclose(joint_friction_coeff_sim_2, friction_2) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("num_articulations", [2]) | ||
|
AntoineRichard marked this conversation as resolved.
|
||
| @pytest.mark.parametrize("device", ["cuda:0"]) | ||
| @pytest.mark.parametrize("articulation_type", ["anymal"]) | ||
| def test_body_q_consistent_after_root_write(num_articulations, device, articulation_type): | ||
| """Test that body_q is fresh when collide() runs after a root pose write. | ||
|
|
||
| Regression test for a NaN bug where collide() used stale body_q after env | ||
| reset because eval_fk was not called between write_root_pose and collide. | ||
|
|
||
| Uses ``use_mujoco_contacts=False`` so the Newton collision pipeline is | ||
| active, then patches ``_simulate_physics_only`` to capture body_q at | ||
| the moment collide() is called and asserts it matches joint_q. | ||
| """ | ||
| from unittest.mock import patch | ||
|
|
||
| sim_cfg = SimulationCfg( | ||
| dt=1 / 200, | ||
| physics=NewtonCfg( | ||
| solver_cfg=MJWarpSolverCfg( | ||
| njmax=70, | ||
| nconmax=70, | ||
| integrator="implicitfast", | ||
| use_mujoco_contacts=False, | ||
| ), | ||
| num_substeps=1, | ||
| use_cuda_graph=False, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The test hardcodes |
||
| ), | ||
| ) | ||
| with build_simulation_context(sim_cfg=sim_cfg, device=device) as sim: | ||
| sim._app_control_on_stop_handle = None | ||
| articulation_cfg = generate_articulation_cfg(articulation_type=articulation_type) | ||
| articulation, env_pos = generate_articulation(articulation_cfg, num_articulations, device) | ||
|
|
||
| sim.reset() | ||
|
|
||
| model = SimulationManager.get_model() | ||
| jc_starts = model.joint_coord_world_start.numpy() | ||
| body_starts = model.body_world_start.numpy() | ||
|
|
||
| for _ in range(5): | ||
| sim.step() | ||
| articulation.update(sim.cfg.dt) | ||
|
|
||
| # Teleport env 0 by 10m (simulating a reset) | ||
| new_pose = wp.to_torch(articulation.data.default_root_pose).clone() | ||
| new_pose[0, 0] += 10.0 | ||
| new_pose[0, 1] += 5.0 | ||
| articulation.write_root_pose_to_sim_index( | ||
| root_pose=new_pose[0:1], | ||
| env_ids=torch.tensor([0], device=device, dtype=torch.int32), | ||
| ) | ||
|
|
||
| # Patch _simulate_physics_only to capture body_q before collide runs | ||
| captured = {} | ||
| original_simulate = SimulationManager._simulate_physics_only.__func__ | ||
|
|
||
| @classmethod # type: ignore[misc] | ||
| def _patched_simulate(cls): | ||
| if cls._needs_collision_pipeline: | ||
| bq = wp.to_torch(cls._state_0.body_q) | ||
| jq = wp.to_torch(cls._state_0.joint_q) | ||
| b0 = int(body_starts[0]) | ||
| jc0 = int(jc_starts[0]) | ||
| captured["bq_root"] = bq[b0, :3].clone() | ||
| captured["jq_root"] = jq[jc0 : jc0 + 3].clone() | ||
| original_simulate(cls) | ||
|
|
||
| with patch.object(SimulationManager, "_simulate_physics_only", _patched_simulate): | ||
| sim.step() | ||
| articulation.update(sim.cfg.dt) | ||
|
|
||
| assert captured, "collision pipeline did not run — _needs_collision_pipeline is False" | ||
|
|
||
| bq_root = captured["bq_root"] | ||
| jq_root = captured["jq_root"] | ||
| diff = (jq_root - bq_root).abs().max().item() | ||
| assert diff < 0.01, ( | ||
| f"body_q was stale when collide() ran: diff={diff:.4f}m, jq={jq_root.tolist()}, bq={bq_root.tolist()}" | ||
| ) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| pytest.main([__file__, "-v", "--maxfail=1"]) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This
eval_fkruns eagerly outside the CUDA graph. When graphs are enabled, the execution order is:eval_fk→ updatesbody_qin_state_0wp.capture_launch(graph)→ replays graphedcollide()which reads the now-currentbody_qThis is correct because CUDA graphs replay on the same memory pointers, so the eager write is visible to the graphed kernel. The FK call is a single Warp kernel launch, so the overhead when
_fk_dirtyis True is minimal.