Skip to content

Commit cf4f333

Browse files
authored
Merge pull request #472 from FederatedAI/feature-1.11.2-deepspeed_log
fix
2 parents beb22fd + b39d2f4 commit cf4f333

2 files changed

Lines changed: 10 additions & 7 deletions

File tree

python/fate_flow/controller/task_controller.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -207,12 +207,15 @@ def kill_task(cls, task: Task):
207207
kill_status = False
208208
try:
209209
# kill task executor
210-
backend_engine = build_engine(
211-
task.f_engine_conf.get("computing_engine"),
212-
task.f_is_deepspeed
213-
)
214-
if backend_engine:
215-
backend_engine.kill(task)
210+
try:
211+
backend_engine = build_engine(
212+
task.f_engine_conf.get("computing_engine"),
213+
task.f_is_deepspeed
214+
)
215+
if backend_engine:
216+
backend_engine.kill(task)
217+
except Exception as e:
218+
schedule_logger(task.f_job_id).exception(e)
216219
WorkerManager.kill_task_all_workers(task)
217220
except Exception as e:
218221
schedule_logger(task.f_job_id).exception(e)

python/fate_flow/detection/detector.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,7 @@ def detect_running_task(cls):
6969
process_exist = build_engine(task.f_engine_conf.get("computing_engine")).is_alive(task)
7070
if not process_exist:
7171
msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
72-
detect_logger(job_id=task.f_job_id).info(f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist")
72+
detect_logger(job_id=task.f_job_id).error(f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist")
7373
time.sleep(3)
7474
_tasks = JobSaver.query_task(task_id=task.f_task_id, task_version=task.f_task_version, role=task.f_role, party_id=task.f_party_id)
7575
if _tasks:

0 commit comments

Comments
 (0)