Skip to content

Commit df222cd

Browse files
committed
add swarm overwatch
1 parent a127b84 commit df222cd

File tree

5 files changed

+376
-47
lines changed

5 files changed

+376
-47
lines changed

ajet/launcher.py

Lines changed: 51 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -63,16 +63,10 @@ def parse_args():
6363
help="Path to configuration file",
6464
)
6565
parser.add_argument(
66-
"--with-ray",
67-
action="store_true",
68-
default=False,
69-
help="Launch ray"
66+
"--with-ray", action="store_true", default=False, help="Launch ray"
7067
)
7168
parser.add_argument(
72-
"--with-ray-cluster",
73-
action="store_true",
74-
default=False,
75-
help="Launch ray"
69+
"--with-ray-cluster", action="store_true", default=False, help="Launch ray"
7670
)
7771
parser.add_argument(
7872
"--with-appworld",
@@ -93,10 +87,7 @@ def parse_args():
9387
help="Launch webshop",
9488
)
9589
parser.add_argument(
96-
"--with-bfcl",
97-
action="store_true",
98-
default=False,
99-
help="Launch bfcl"
90+
"--with-bfcl", action="store_true", default=False, help="Launch bfcl"
10091
)
10192
parser.add_argument(
10293
"--with-logview",
@@ -114,7 +105,7 @@ def parse_args():
114105
"--skip-check-avail-gpu",
115106
action="store_true",
116107
default=False,
117-
help="Skip GPU availability check"
108+
help="Skip GPU availability check",
118109
)
119110
parser.add_argument(
120111
"--kill",
@@ -134,7 +125,14 @@ def parse_args():
134125
type=str,
135126
default="",
136127
required=False,
137-
help="Prefix for deepfinance service names"
128+
help="Prefix for deepfinance service names",
129+
)
130+
parser.add_argument(
131+
"--swarm-overwatch",
132+
type=str,
133+
default="",
134+
required=False,
135+
help="Swarm server URL for overwatch monitoring (e.g., http://localhost:10086)",
138136
)
139137
return parser.parse_args()
140138

@@ -143,22 +141,37 @@ def check_model_file_exists(exp_config):
143141
model_path = exp_config["ajet"]["model"]["path"]
144142
# if model_path has more than 2 '/', we consider it as a dir path
145143
if model_path.count("/") > 2:
146-
assert os.path.exists(model_path), f"Model path {model_path} does not exist. Please check your configuration."
144+
assert os.path.exists(model_path), (
145+
f"Model path {model_path} does not exist. Please check your configuration."
146+
)
147147

148148

149149
def start_swarm_server(env, config):
150150
config = dict_to_namespace(config)
151-
assert config.ajet.enable_swarm_mode, \
151+
assert config.ajet.enable_swarm_mode, (
152152
"Please enable_swarm_mode in config to start swarm server."
153-
assert config.ajet.enable_experimental_interchange_server, \
153+
)
154+
assert config.ajet.enable_experimental_interchange_server, (
154155
"Please enable_experimental_interchange_server in config to start swarm server."
155-
from ajet.tuner_lib.weight_tuner.experimental.as_oai_model_server import start_interchange_server
156+
)
157+
from ajet.tuner_lib.weight_tuner.experimental.as_oai_model_server import (
158+
start_interchange_server,
159+
)
160+
156161
start_interchange_server(config, blocking=True, env=env)
157162

158163

159164
def main():
160165
args = parse_args()
161166

167+
# Handle swarm overwatch mode
168+
if args.swarm_overwatch:
169+
from ajet.utils.swarm_overwatch import start_overwatch
170+
171+
logger.info(f"Starting Swarm Overwatch for server: {args.swarm_overwatch}")
172+
start_overwatch(args.swarm_overwatch, refresh_interval=1.0)
173+
return
174+
162175
# Enforce GPU availability and free memory threshold before proceeding
163176
if not args.skip_check_avail_gpu:
164177
if (args.backbone != "debug") and (not args.kill) and (not args.autokill):
@@ -174,7 +187,9 @@ def main():
174187
logger.info(f"Killing processes matching keyword: {keyword}")
175188
killed_pids = fast_kill_by_keyword_bash(keyword)
176189
if killed_pids:
177-
logger.success(f"Successfully killed processes with PIDs: {killed_pids}")
190+
logger.success(
191+
f"Successfully killed processes with PIDs: {killed_pids}"
192+
)
178193
else:
179194
logger.warning(f"No processes found matching keyword: {keyword}")
180195
if not args.conf:
@@ -192,16 +207,24 @@ def main():
192207
exp_config = None
193208
exp_dir = args.exp_dir or "saved_experiments"
194209
if args.swarm_server and (not args.conf):
195-
args.conf = os.path.abspath(os.path.join(os.path.dirname(__file__), "default_config/ajet_ts_default.yaml"))
196-
assert os.path.exists(args.conf), "Please provide a valid config file for swarm server mode."
210+
args.conf = os.path.abspath(
211+
os.path.join(
212+
os.path.dirname(__file__), "default_config/ajet_ts_default.yaml"
213+
)
214+
)
215+
assert os.path.exists(args.conf), (
216+
"Please provide a valid config file for swarm server mode."
217+
)
197218
if args.conf:
198219
yaml_path = args.conf
199220
(
200221
main_yaml_fp,
201222
exe_exp_base,
202223
exp_name,
203224
exp_config,
204-
) = prepare_experiment_config(yaml_path, exp_dir, args.backbone, storage=(not args.swarm_server))
225+
) = prepare_experiment_config(
226+
yaml_path, exp_dir, args.backbone, storage=(not args.swarm_server)
227+
)
205228

206229
# setup environment variables
207230
env, exp_config = setup_environment_vars(args, exp_config, main_yaml_fp)
@@ -211,9 +234,9 @@ def main():
211234
return
212235

213236
if args.with_ray:
214-
assert (
215-
not args.with_ray_cluster
216-
), "Cannot use both --with-ray and --with-ray-cluster simultaneously."
237+
assert not args.with_ray_cluster, (
238+
"Cannot use both --with-ray and --with-ray-cluster simultaneously."
239+
)
217240
start_ray_service(args, env)
218241

219242
if args.with_appworld:
@@ -235,9 +258,9 @@ def main():
235258
launch_logview(exp_name)
236259

237260
if args.with_ray_cluster:
238-
assert (
239-
not args.with_ray
240-
), "Cannot use both --with-ray and --with-ray-cluster simultaneously."
261+
assert not args.with_ray, (
262+
"Cannot use both --with-ray and --with-ray-cluster simultaneously."
263+
)
241264
start_ray_service(args, env, cluster=True)
242265

243266
if args.conf and main_yaml_fp and exe_exp_base and exp_config:

ajet/tuner_lib/weight_tuner/experimental/as_swarm_client.py

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,29 @@ def __init__(self, server_url: str):
5454
self.record_episode_expire_time = {}
5555
self.auto_batching_tasks = []
5656

57+
# better logging management
58+
self._last_second_print_buffer: dict[str, float] = {}
59+
60+
def logger_info(self, message):
    """Log *message* at INFO level, suppressing duplicates within 1 second.

    Prevents log flooding when the same message is emitted repeatedly in a
    tight polling loop (e.g. while waiting for the engine to reach
    ENGINE.ROLLING). A message is logged only if it was not already logged
    during the last second; its emission timestamp is recorded in
    ``self._last_second_print_buffer`` (message -> unix timestamp).

    Args:
        message: The log line to emit (used verbatim as the dedup key).

    Returns:
        None.
    """
    # Single timestamp per call so the suppress test and the prune below
    # use a consistent notion of "now".
    now = time.time()

    last_emitted = self._last_second_print_buffer.get(message)
    if last_emitted is not None and now - last_emitted < 1:
        return  # same message emitted <1s ago -> suppress

    self._last_second_print_buffer[message] = now
    logger.info(message)

    # Prune entries older than 1s on EVERY emission (the original only
    # pruned on the re-emit path, so unique messages leaked forever).
    self._last_second_print_buffer = {
        msg: ts
        for msg, ts in self._last_second_print_buffer.items()
        if now - ts <= 1
    }
79+
5780

5881
def _clean_up_expired_records(self):
5982
# remove records that have expired and expired at least CLEAN_RECORD_TIMEOUT seconds ago
@@ -82,7 +105,7 @@ def begin_episode(self, discard_episode_timeout=60, max_episode_time=120, episod
82105
"""
83106
status, status_json = self.get_engine_status() # warm up connection and log the status
84107
if status not in ["ENGINE.ROLLING"]:
85-
logger.info(f"Engine status is {status}. Waiting until ENGINE.ROLLING...")
108+
self.logger_info(f"Engine status is {status}. Waiting until ENGINE.ROLLING...")
86109
self._wait_until_status_change_to(desired_status="ENGINE.ROLLING", verbose=False)
87110

88111
while True:
@@ -107,7 +130,7 @@ def begin_episode(self, discard_episode_timeout=60, max_episode_time=120, episod
107130
episode_uuid = data.episode_uuid
108131
openai_base_url = data.openai_base_url
109132
openai_api_key = data.openai_api_key
110-
logger.info(f"Claimed episode {episode_uuid}, current global step: {status_json.get('global_step', 'unknown')}")
133+
self.logger_info(f"Claimed episode {episode_uuid}, current global step: {status_json.get('global_step', 'unknown')}")
111134
return episode_uuid, OpenaiBaseUrlAndApiKey(
112135
base_url=openai_base_url,
113136
api_key=openai_api_key,
@@ -121,7 +144,7 @@ def begin_episode(self, discard_episode_timeout=60, max_episode_time=120, episod
121144
]
122145
if any(scenario in data.fail_cause for scenario in need_wait_scenarios):
123146
if time.time() - self.previous_warning_time > 60:
124-
logger.info(f"{data.fail_cause}. Retrying in 15s...")
147+
self.logger_info(f"{data.fail_cause}. Retrying in 15s...")
125148
self.previous_warning_time = time.time()
126149
time.sleep(15)
127150
else:
@@ -169,7 +192,7 @@ def end_episode(self, task:Task, episode_uuid: str, workflow_output: WorkflowOut
169192
data = EndEpisodeResponse.model_validate(resp.json())
170193

171194
if data.success:
172-
logger.info(f"Ended episode {episode_uuid}")
195+
self.logger_info(f"Ended episode {episode_uuid}")
173196
else:
174197
logger.error(f"Failed to end episode {episode_uuid}")
175198
raise RuntimeError(f"Failed to end episode {episode_uuid}")
@@ -198,7 +221,7 @@ def abort_episode(self, episode_uuid: str):
198221
data = EndEpisodeResponse.model_validate(resp.json())
199222

200223
if data.success:
201-
logger.info(f"Aborted episode {episode_uuid}")
224+
self.logger_info(f"Aborted episode {episode_uuid}")
202225
else:
203226
logger.error(f"Failed to end episode {episode_uuid}")
204227

@@ -227,7 +250,7 @@ def sync_train_config(self, agent_jet_job: AgentJetJob):
227250
timeout=GENERAL_TIMEOUT
228251
)
229252
raise_for_status_with_detail(resp)
230-
logger.info("Synced train config to Swarm server")
253+
self.logger_info("Synced train config to Swarm server")
231254
except Exception as e:
232255
logger.error(f"Error syncing train config: {e}")
233256
raise
@@ -252,7 +275,7 @@ def start_engine(self):
252275
raise_for_status_with_detail(resp)
253276
result = resp.json()
254277
if result.get("success"):
255-
logger.info("Successfully started training engine on Swarm server (current model global step)")
278+
self.logger_info("Successfully started training engine on Swarm server (current model global step)")
256279
else:
257280
logger.error("Failed to start training engine")
258281
raise RuntimeError("Failed to start training engine")
@@ -267,7 +290,7 @@ def _wait_until_status_change_to(self, desired_status="ENGINE.ROLLING", verbose=
267290
Reports status every 5 seconds while waiting.
268291
"""
269292
if verbose:
270-
logger.info(f"Polling engine status until {desired_status}...")
293+
self.logger_info(f"Polling engine status until {desired_status}...")
271294
last_report_time = time.time()
272295
init_poll_time = last_report_time
273296

@@ -279,13 +302,13 @@ def _wait_until_status_change_to(self, desired_status="ENGINE.ROLLING", verbose=
279302
# Report status every 5 seconds
280303
if current_time - last_report_time >= 30:
281304
if verbose:
282-
logger.info(f"Current engine status (already waited {int(current_time - init_poll_time)}s): {current_status}")
305+
self.logger_info(f"Current engine status (already waited {int(current_time - init_poll_time)}s): {current_status}")
283306
last_report_time = current_time
284307

285308
# Check if engine has reached the desired status
286309
if current_status == desired_status:
287310
if verbose:
288-
logger.info(f"Engine status is {desired_status}.")
311+
self.logger_info(f"Engine status is {desired_status}.")
289312
break
290313

291314
# Wait a bit before next poll
@@ -363,15 +386,15 @@ def auto_sync_train_config_and_start_engine(self, agent_jet_job: AgentJetJob, fo
363386
time.sleep(8)
364387
current_status, _ = self.get_engine_status()
365388
if current_status == "ENGINE.OFFLINE":
366-
logger.info("Engine is OFFLINE. Syncing train config and starting engine...")
389+
self.logger_info("Engine is OFFLINE. Syncing train config and starting engine...")
367390
self.sync_train_config(agent_jet_job)
368391
self.start_engine()
369392
elif current_status == "ENGINE.ROLLING":
370-
logger.info("Engine is already ROLLING. No action needed.")
393+
self.logger_info("Engine is already ROLLING. No action needed.")
371394
elif current_status == "ENGINE.ROLLING_POST":
372-
logger.info("Engine is already ROLLING. No action needed.")
395+
self.logger_info("Engine is already ROLLING. No action needed.")
373396
elif current_status == "ENGINE.BOOTING":
374-
logger.info("Engine is BOOTING. Waiting until it becomes ROLLING...")
397+
self.logger_info("Engine is BOOTING. Waiting until it becomes ROLLING...")
375398
self._wait_until_status_change_to(desired_status="ENGINE.ROLLING")
376399
logger.success("Training engine is now ROLLING and ready.")
377400
elif current_status == "ENGINE.CANNOT_CONNECT":
@@ -388,7 +411,7 @@ def stop_engine(self):
388411
"""
389412
current_status, _ = self.get_engine_status()
390413
if current_status == "ENGINE.OFFLINE":
391-
logger.info("Engine is already OFFLINE. No action needed.")
414+
self.logger_info("Engine is already OFFLINE. No action needed.")
392415
return
393416

394417
resp = httpx.post(
@@ -399,7 +422,7 @@ def stop_engine(self):
399422
raise_for_status_with_detail(resp)
400423
result = resp.json()
401424
if result.get("success"):
402-
logger.info("Successfully stopped training engine on Swarm server")
425+
self.logger_info("Successfully stopped training engine on Swarm server")
403426
else:
404427
logger.error("Failed to stop training engine")
405428
raise RuntimeError("Failed to stop training engine")
@@ -502,5 +525,5 @@ def rollout(task) -> float | None:
502525
if len(episodes) == (remote_batch_size * local_grpo_n):
503526
episode_results = run_episodes_until_all_complete(episodes, func=rollout, auto_retry=True)
504527
for episode, reward in zip(episodes, episode_results):
505-
logger.info(f"Episode for task {episode.task_id} completed with reward: {reward}")
528+
self.logger_info(f"Episode for task {episode.task_id} completed with reward: {reward}")
506529
episodes.clear()

ajet/tuner_lib/weight_tuner/experimental/as_swarm_server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ async def end_episode(req: EndEpisodeRequest):
554554

555555
if episode_status != "claimed":
556556
logger.error(f"[server] Episode {episode_uuid} is not in claimed status.")
557-
raise HTTPException(status_code=400, detail=f"Episode {episode_uuid} is not in claimed status, maybe you take **too long** to submit the workflow output, increase `discard_episode_timeout` when `begin_episode`.")
557+
raise HTTPException(status_code=400, detail=f"Episode {episode_uuid} is not in claimed status, maybe you take **too long** to submit the workflow output, try increase `discard_episode_timeout` when `begin_episode`.")
558558

559559
if client_uuid_recorded != client_uuid:
560560
logger.error(f"[server] Episode {episode_uuid} is claimed by different client: {client_uuid_recorded}, but got {client_uuid}.")

0 commit comments

Comments
 (0)