Add a return value to the stop_engine endpoint and fail fast when the swarm server cannot be reached

binary-husky · binary-husky · commit 1604076de443 · 2026-02-22T23:50:46.000+08:00
diff --git a/ajet/tuner_lib/experimental/as_swarm_client.py b/ajet/tuner_lib/experimental/as_swarm_client.py
@@ -544,7 +544,10 @@ def auto_sync_train_config_and_start_engine(self, agent_jet_job: AgentJetJob, fo
             self.logger_info("Engine is already ROLLING. No action needed.")
         elif current_status == "ENGINE.ROLLING_POST":
             self.logger_info("Engine is already ROLLING. No action needed.")
-        elif current_status in ["ENGINE.BOOTING", "ENGINE.CANNOT_CONNECT", "ENGINE.WEIGHT_SYNCING"]:
+        elif current_status in ["ENGINE.CANNOT_CONNECT"]:
+            logger.error("Unable to connect to swarm server.")
+            raise RuntimeError(f"Unable to connect to swarm server.")
+        elif current_status in ["ENGINE.BOOTING", "ENGINE.WEIGHT_SYNCING"]:
             self.logger_info(f"Engine is {current_status}. Waiting until it becomes ROLLING...")
             self._wait_until_status_change_to(desired_status="ENGINE.ROLLING")
             logger.success("Training engine is now ROLLING and ready.")
@@ -568,7 +571,7 @@ def stop_engine(self):
         )
         raise_for_status_with_detail(resp)
         result = resp.json()
-        if result.get("success"):
+        if result and result.get("success"):
             self.logger_info("Successfully stopped training engine on Swarm server")
         else:
             logger.error("Failed to stop training engine")
diff --git a/ajet/tuner_lib/experimental/as_swarm_server.py b/ajet/tuner_lib/experimental/as_swarm_server.py
@@ -778,5 +778,6 @@ async def stop_engine():
         - Clean up shared memory state
         """
         kill_process_tree(shared_mem_dict_lock, shared_mem_dict)
+        return BoolResponse(success=True)
 
     return app, register_episode_ready_listener()
diff --git a/tutorial/example_math_swarm/math.py b/tutorial/example_math_swarm/math.py
@@ -22,7 +22,8 @@
 
 REMOTE_BATCH_SIZE = 32
 REMOTE_ALLOCATE_GPU_PER_NODE = 8
-REMOTE_TRAIN_MODEL = '/root/agentjet/modelscope_cache/Qwen/Qwen2.5-7B-Instruct'
+# REMOTE_TRAIN_MODEL = '/root/agentjet/modelscope_cache/Qwen/Qwen2.5-7B-Instruct'
+REMOTE_TRAIN_MODEL = '/mnt/data_cpfs/model_cache/modelscope/hub/Qwen/Qwen/Qwen2.5-3B-Instruct'
 
 def main():
 
@@ -48,7 +49,8 @@ def main():
             model=REMOTE_TRAIN_MODEL,
             batch_size=REMOTE_BATCH_SIZE,
             num_repeat=GRPO_N,
-        )
+        ),
+        force_restart=True,
     )
 
     def rollout(task):