diff --git a/nemo_run/run/torchx_backend/launcher.py b/nemo_run/run/torchx_backend/launcher.py index 93f510b9..ff51c23c 100644 --- a/nemo_run/run/torchx_backend/launcher.py +++ b/nemo_run/run/torchx_backend/launcher.py @@ -122,7 +122,7 @@ def wait_and_exit( log: bool, runner: Runner | None = None, timeout: int = 10, - log_join_timeout: int = 600, + log_join_timeout: int = 10, ) -> specs.AppStatus: if runner is None: runner = get_runner() @@ -161,7 +161,9 @@ def wait_and_exit( logger.info(f"Job {app_id} finished: {status.state}") if log_thread and log_thread.is_alive(): - logger.debug("Waiting for log thread to complete...") + logger.warning( + f"Waiting for {app_id}'s log thread to complete for {log_join_timeout} seconds..." + ) log_thread.join(timeout=log_join_timeout) if log_thread.is_alive(): logger.warning("Log thread did not complete within timeout, some logs may be missing")