From 2eb3ef6ae156e8d95e14fd28af93cf55f31295c7 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Mon, 12 May 2025 16:11:06 -0700 Subject: [PATCH 1/2] Change default log wait timeout to 10s Signed-off-by: Hemil Desai --- nemo_run/run/torchx_backend/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_run/run/torchx_backend/launcher.py b/nemo_run/run/torchx_backend/launcher.py index 93f510b9..b44d83c7 100644 --- a/nemo_run/run/torchx_backend/launcher.py +++ b/nemo_run/run/torchx_backend/launcher.py @@ -122,7 +122,7 @@ def wait_and_exit( log: bool, runner: Runner | None = None, timeout: int = 10, - log_join_timeout: int = 600, + log_join_timeout: int = 10, ) -> specs.AppStatus: if runner is None: runner = get_runner() From 611c61fa66ef337c8682a13966de08c66b70d64f Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Mon, 12 May 2025 16:16:48 -0700 Subject: [PATCH 2/2] Log the log-thread join wait at warning level with app id and timeout Signed-off-by: Hemil Desai --- nemo_run/run/torchx_backend/launcher.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_run/run/torchx_backend/launcher.py b/nemo_run/run/torchx_backend/launcher.py index b44d83c7..ff51c23c 100644 --- a/nemo_run/run/torchx_backend/launcher.py +++ b/nemo_run/run/torchx_backend/launcher.py @@ -161,7 +161,9 @@ def wait_and_exit( logger.info(f"Job {app_id} finished: {status.state}") if log_thread and log_thread.is_alive(): - logger.debug("Waiting for log thread to complete...") + logger.warning( + f"Waiting for {app_id}'s log thread to complete for {log_join_timeout} seconds..." + ) log_thread.join(timeout=log_join_timeout) if log_thread.is_alive(): logger.warning("Log thread did not complete within timeout, some logs may be missing")