Skip to content

Commit 7497d24

Browse files
Avoid replicator wait during export close
LEAPP export does not use Replicator workflows, so avoid waiting for Replicator during SimulationApp shutdown. This keeps teardown from blocking on unrelated queued Replicator work, and keeps traceback diagnostics around the close call to help debug future CI timeouts.
1 parent ccb6d49 commit 7497d24

1 file changed

Lines changed: 30 additions & 12 deletions

File tree

  • scripts/reinforcement_learning/leapp/rsl_rl

scripts/reinforcement_learning/leapp/rsl_rl/export.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import argparse
1313
import contextlib
14+
import faulthandler
1415
import importlib.metadata as metadata
1516
import os
1617
import sys
@@ -40,17 +41,22 @@
4041

4142
_TIMING_PREFIX = "[LEAPP_EXPORT_TIMING]"
4243
_PROCESS_START = time.perf_counter()
44+
_CLOSE_HANG_TRACEBACK_TIMEOUT = 120
45+
46+
47+
def _emit_timing(message: str) -> None:
    """Write one diagnostic line to the original stderr stream and flush it.

    ``sys.__stderr__`` is preferred so the line is still emitted when
    ``sys.stderr`` has been rebound; ``sys.stderr`` is used only when
    ``sys.__stderr__`` is falsy (e.g. ``None``).

    Args:
        message: The text to print, without a trailing newline.
    """
    stream = sys.__stderr__ or sys.stderr
    print(message, file=stream, flush=True)
4350

4451

4552
@contextlib.contextmanager
4653
def _timed_phase(task_name: str | None, phase: str):
4754
"""Print timing information for a single export phase."""
4855
start = time.perf_counter()
4956
task_label = task_name or "<unknown>"
50-
print(
57+
_emit_timing(
5158
f"{_TIMING_PREFIX} task={task_label} phase={phase} status=start "
52-
f"pid={os.getpid()} total_elapsed={start - _PROCESS_START:.2f}s",
53-
flush=True,
59+
f"pid={os.getpid()} total_elapsed={start - _PROCESS_START:.2f}s"
5460
)
5561
status = "done"
5662
try:
@@ -60,13 +66,26 @@ def _timed_phase(task_name: str | None, phase: str):
6066
raise
6167
finally:
6268
end = time.perf_counter()
63-
print(
69+
_emit_timing(
6470
f"{_TIMING_PREFIX} task={task_label} phase={phase} status={status} "
65-
f"phase_elapsed={end - start:.2f}s total_elapsed={end - _PROCESS_START:.2f}s",
66-
flush=True,
71+
f"phase_elapsed={end - start:.2f}s total_elapsed={end - _PROCESS_START:.2f}s"
6772
)
6873

6974

75+
def _close_simulation_app() -> None:
    """Close Isaac Sim without waiting for Replicator workflows.

    While the close call is running, a ``faulthandler`` watchdog is armed so
    that a traceback is dumped to the original stderr stream every
    ``_CLOSE_HANG_TRACEBACK_TIMEOUT`` seconds if shutdown has not returned.
    The watchdog is cancelled once ``close`` returns or raises.

    Arming the watchdog is best-effort: it is purely diagnostic, so a stream
    that cannot be used by ``faulthandler`` must never prevent the
    application from being closed.
    """
    traceback_stream = sys.__stderr__ or sys.stderr
    try:
        faulthandler.dump_traceback_later(
            _CLOSE_HANG_TRACEBACK_TIMEOUT,
            repeat=True,
            file=traceback_stream,
        )
    except (AttributeError, OSError, RuntimeError, ValueError):
        # faulthandler needs a real file descriptor. A replaced, closed or
        # missing stderr makes it raise; skip the diagnostics in that case
        # rather than skipping the shutdown itself.
        pass
    try:
        simulation_app.close(wait_for_replicator=False)
    finally:
        # Safe to call even when the watchdog was never armed.
        faulthandler.cancel_dump_traceback_later()
87+
88+
7089
parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.")
7190
parser.add_argument(
7291
"--disable_fabric", action="store_true", default=False, help="Disable fabric and use USD I/O operations."
@@ -125,7 +144,7 @@ def _timed_phase(task_name: str | None, phase: str):
125144

126145
installed_version = metadata.version("rsl-rl-lib")
127146

128-
print(f"[INFO] LEAPP version: {metadata.version('leapp')}", flush=True)
147+
_emit_timing(f"[INFO] LEAPP version: {metadata.version('leapp')}")
129148
with _timed_phase(args_cli.task, "app_launcher"):
130149
app_launcher = AppLauncher(args_cli)
131150
simulation_app = app_launcher.app
@@ -202,7 +221,7 @@ def main(env_cfg: ManagerBasedRLEnvCfg, agent_cfg: RslRlBaseRunnerCfg):
202221
task_name = args_cli.task.split(":")[-1]
203222
train_task_name = task_name.replace("-Play", "")
204223
task_start = time.perf_counter()
205-
print(f"{_TIMING_PREFIX} task={task_name} phase=main status=start", flush=True)
224+
_emit_timing(f"{_TIMING_PREFIX} task={task_name} phase=main status=start")
206225

207226
with _timed_phase(task_name, "agent_config"):
208227
agent_cfg: RslRlBaseRunnerCfg = cli_args.update_rsl_rl_cfg(agent_cfg, args_cli)
@@ -317,10 +336,9 @@ def main(env_cfg: ManagerBasedRLEnvCfg, agent_cfg: RslRlBaseRunnerCfg):
317336
with _timed_phase(task_name, "env_close"):
318337
env.close()
319338

320-
print(
339+
_emit_timing(
321340
f"{_TIMING_PREFIX} task={task_name} phase=main status=done "
322-
f"phase_elapsed={time.perf_counter() - task_start:.2f}s",
323-
flush=True,
341+
f"phase_elapsed={time.perf_counter() - task_start:.2f}s"
324342
)
325343

326344

@@ -329,4 +347,4 @@ def main(env_cfg: ManagerBasedRLEnvCfg, agent_cfg: RslRlBaseRunnerCfg):
329347
main()
330348
finally:
331349
with _timed_phase(args_cli.task, "simulation_app_close"):
332-
simulation_app.close()
350+
_close_simulation_app()

0 commit comments

Comments
 (0)