Skip to content

Commit 544bff1

Browse files
committed
stage werewolves example
1 parent c621922 commit 544bff1

File tree

19 files changed

+698
-1006
lines changed

19 files changed

+698
-1006
lines changed

ajet/copilot/job.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,13 +132,9 @@ def __init__(
132132

133133

134134
def build_job_from_yaml(self, yaml_path: str | None) -> dict:
135-
assert self.experiment_dir is not None, "experiment_dir must be provided either in constructor or in yaml config."
136135
self.config_as_dict = read_ajet_hierarchical_config(
137136
yaml_path,
138-
exp_name=self.experiment_name,
139-
backbone=self.backbone,
140137
write_to=None,
141-
exp_dir=self.experiment_dir,
142138
)
143139
self.config_as_dict = expand_ajet_hierarchical_config(self.config_as_dict, write_to=None)
144140
logger.info(f"Built AgentJet job config: {yaml_path}")

ajet/tuner_lib/experimental/as_swarm_client.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def raise_for_status_with_detail(resp):
5252
raise RuntimeError(f"SwarmClient error {resp.status_code} with non-JSON response: {response_text}") from e
5353

5454

55+
class SwarmServerOfflineError(Exception): ...
56+
5557

5658
class SwarmClient(object):
5759

@@ -437,7 +439,7 @@ def start_engine(self):
437439
self._wait_until_status_change_to(desired_status="ENGINE.ROLLING")
438440
logger.success("Training engine is now ROLLING and ready.")
439441

440-
def _wait_until_status_change_to(self, desired_status="ENGINE.ROLLING", verbose=True):
442+
def _wait_until_status_change_to(self, desired_status="ENGINE.ROLLING", verbose=True, timeout=1800):
441443
"""
442444
Poll engine status until it reaches desired_status.
443445
Polls every 5 seconds and reports status every 30 seconds while waiting.
@@ -446,12 +448,20 @@ def _wait_until_status_change_to(self, desired_status="ENGINE.ROLLING", verbose=
446448
self.logger_info(f"Polling engine status until {desired_status}...")
447449
last_report_time = time.time()
448450
init_poll_time = last_report_time
451+
initial_status, _ = self.get_engine_status()
449452

450453
while True:
451454
try:
452455
current_status, _ = self.get_engine_status()
453456
current_time = time.time()
454457

458+
# Check if timeout has been reached
459+
if current_time - init_poll_time >= timeout:
460+
raise TimeoutError(f"Timeout reached while waiting for engine status to change to {desired_status}")
461+
462+
if (initial_status == "ENGINE.OFFLINE") and (current_status == "ENGINE.OFFLINE"):
463+
raise SwarmServerOfflineError(f"Engine status changed from {initial_status} to OFFLINE while waiting for {desired_status}. This may indicate an error in the engine. Please check the swarm server logs for details.")
464+
455465
# Report status every 30 seconds
456466
if current_time - last_report_time >= 30:
457467
if verbose:
@@ -467,6 +477,9 @@ def _wait_until_status_change_to(self, desired_status="ENGINE.ROLLING", verbose=
467477
# Wait a bit before next poll
468478
time.sleep(5)
469479

480+
except SwarmServerOfflineError as e:
481+
raise e
482+
470483
except Exception as e:
471484
logger.error(f"Error polling engine status: {e}")
472485
time.sleep(5)

ajet/tuner_lib/experimental/as_swarm_server.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ def override_param_callback(config):
393393
main_yaml_fp,
394394
env,
395395
exp_config,
396+
True, # is_swarm_server
396397
),
397398
)
398399
p.daemon = True

ajet/tuner_lib/experimental/interchange_utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,12 @@ class UpdateEngineStatusRequest(BaseModel):
111111

112112
def get_interchange_server_url(config):
113113
port = os.getenv("AJET_DAT_INTERCHANGE_PORT")
114-
if config.ajet.interchange_server.interchange_server_port != 'auto':
115-
port = str(int(config.ajet.interchange_server.interchange_server_port))
114+
if isinstance(config, dict):
115+
interchange_server_port = config.get("ajet", {}).get("interchange_server", {}).get("interchange_server_port", "auto")
116+
else:
117+
interchange_server_port = config.ajet.interchange_server.interchange_server_port
118+
if interchange_server_port != 'auto':
119+
port = str(int(interchange_server_port))
116120
assert port is not None, "AJET_DAT_INTERCHANGE_PORT env var must be set"
117121
master_node_ip = os.getenv("MASTER_NODE_IP", "localhost")
118122
base_url = f"http://{master_node_ip}:{port}"

ajet/utils/config_utils.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def config_safe_guard(config: dict, backbone: str) -> dict:
171171

172172

173173
def read_ajet_hierarchical_config(
174-
yaml_fp, exp_name, backbone, write_to=None, exp_dir=DEFAULT_DIR, override_param_callback=None
174+
yaml_fp, exp_name=None, backbone=None, write_to=None, exp_dir=None, override_param_callback=None
175175
):
176176
if yaml_fp is None:
177177
config = {
@@ -193,9 +193,12 @@ def read_ajet_hierarchical_config(
193193
else:
194194
with open(yaml_fp, "r", encoding="utf-8") as file:
195195
config = yaml.safe_load(file)
196-
config["ajet"]["experiment_name"] = exp_name
197-
config["ajet"]["experiment_dir"] = os.path.join(exp_dir, exp_name)
198-
config["ajet"]["backbone"] = backbone
196+
if exp_name is not None:
197+
config["ajet"]["experiment_name"] = exp_name
198+
if (exp_dir is not None) and (exp_name is not None):
199+
config["ajet"]["experiment_dir"] = os.path.join(exp_dir, exp_name)
200+
if backbone is not None:
201+
config["ajet"]["backbone"] = backbone
199202

200203
# remove extra config of verl for trinity
201204
if backbone == "debug":
@@ -324,7 +327,7 @@ def prepare_experiment_config(yaml_path, exp_dir, backbone, override_param_callb
324327

325328
## 4. edit new yaml
326329
config = read_ajet_hierarchical_config(
327-
yaml_backup_dst, exp_name, backbone, write_to=yaml_backup_dst, exp_dir=exp_dir, override_param_callback=override_param_callback
330+
yaml_backup_dst, exp_name=exp_name, backbone=backbone, write_to=yaml_backup_dst, exp_dir=exp_dir, override_param_callback=override_param_callback
328331
)
329332
config_final = expand_ajet_hierarchical_config(config, write_to=yaml_backup_dst)
330333

ajet/utils/launch_utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ def execute_training_process(
319319
exe_yaml_path,
320320
env,
321321
exp_config,
322+
is_swarm_server=False,
322323
):
323324
"""
324325
Execute the training process based on the specified backbone and configuration.
@@ -403,7 +404,13 @@ def execute_training_process(
403404
subprocess.run(cmd, check=True, cwd=os.path.abspath("./"), env=env)
404405
except subprocess.CalledProcessError as e:
405406
logger.error(f"Error running subprocess: {e}")
407+
if is_swarm_server:
408+
from ajet.tuner_lib.experimental.interchange_utils import http_change_engine_status
409+
http_change_engine_status(exp_config, "ENGINE.OFFLINE", global_step=0)
406410
sys.exit(1)
407411
except Exception as e:
408412
logger.error(f"Unexpected error: {e}")
413+
if is_swarm_server:
414+
from ajet.tuner_lib.experimental.interchange_utils import http_change_engine_status
415+
http_change_engine_status(exp_config, "ENGINE.OFFLINE", global_step=0)
409416
sys.exit(1)

docs/en/ajet-swarm-docker.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ docker run --rm -it \
2424
-v ./swarmlog:/workspace/log \
2525
-v ./swarmexp:/workspace/saved_experiments \
2626
-p 10086:10086 \
27+
-e SWANLAB_API_KEY=$SWANLAB_API_KEY \
2728
--gpus=all \
2829
--shm-size=32GB \
2930
ghcr.io/modelscope/agentjet:main \
@@ -89,6 +90,7 @@ docker run --rm -it \
8990
-v ./swarmlog:/workspace/log \
9091
-v ./swarmexp:/workspace/saved_experiments \
9192
-p 10086:10086 \
93+
-e SWANLAB_API_KEY=$SWANLAB_API_KEY \
9294
--gpus=all \
9395
--shm-size=32GB \
9496
ghcr.io/modelscope/agentjet:main \

0 commit comments

Comments
 (0)