Improve map-verl-config skill with schema requirements

binary-husky · claude · binary-husky · commit 99642c574aa4 · 2026-04-16T12:55:43.000+08:00
Add detailed instructions for updating the config schema, explaining that
every nested level needs a dataclass to avoid raw dict issues at runtime.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/ajet/copilot/job.py b/ajet/copilot/job.py
@@ -110,15 +110,6 @@ def __init__(
         if not (all(p is None for p in length_params) or all(p is not None for p in length_params)):
             raise ValueError("(`max_prompt_length`, `max_response_length`, `max_model_len`, `max_response_length_in_one_turn`) must all be None or all be non-None")
 
-        # Validate: when lora_rank > 0, load_format must be safetensors
-        if lora_rank is not None and lora_rank > 0:
-            if lora_load_format != "safetensors":
-                raise ValueError(f"When lora_rank > 0, lora_load_format must be 'safetensors', got '{lora_load_format}'")
-            if lr is None:
-                raise ValueError("lr should be provided for lora training")
-            if lr <= 1e-5:
-                raise ValueError(f"lr should usually be greater than 1e-5 for lora training, got {lr}")
-
         self.config_as_dict: dict = self.build_job_from_yaml(base_yaml_config)
         self.config = Config.update_from_dict_recursive(Config(), self.config_as_dict)
 
@@ -198,6 +189,16 @@ def __init__(
 
         assert self.max_prompt_length + self.max_response_length <= self.max_model_len, "illegal token length"
         assert self.max_response_length_in_one_turn <= self.max_response_length
+
+        # Validate LoRA settings: when lora_rank > 0, lora_load_format must be 'safetensors' and lr must be set and greater than 1e-5
+        if self.lora_rank > 0:
+            if self.lora_load_format != "safetensors":
+                raise ValueError(f"When lora_rank > 0, lora_load_format must be 'safetensors', got '{self.lora_load_format}'")
+            if self.lr is None:
+                raise ValueError("lr should be provided for lora training")
+            if self.lr <= 1e-5:
+                raise ValueError(f"lr should usually be greater than 1e-5 for lora training, got {self.lr}")
+
         if self.backbone == "trinity":
             raise NotImplementedError("Trinity backbone is not yet supported in AgentJetJob.")
 
diff --git a/ajet/copilot/map-verl-config/SKILL.md b/ajet/copilot/map-verl-config/SKILL.md
@@ -5,7 +5,7 @@ license: Complete terms in LICENSE.txt
 ---
 
 
-1. find user requested verl config in in codebase/agentjet/ajet/default_config/verl/verl_default.yaml
+1. find user requested verl config in codebase/agentjet/ajet/default_config/verl/verl_default.yaml
 
 2. check `codebase/agentjet/ajet/default_config/verl/config_auto_convertion_verl.jsonc`, whether a mapping to this config already exists.
 
@@ -15,5 +15,14 @@ license: Complete terms in LICENSE.txt
 
 5. ask user whether to add to AgentJetJob (ajet/copilot/job.py), if the user confirms:
   - learn how other config is added in ajet/copilot/job.py
-  - add to __init__, update docstring
-  - add to ajet/default_config/ajet_config_schema.py
+  - add to __init__ signature (with type hint and default None)
+  - update docstring with parameter description
+  - add instance attribute assignment with cast()
+  - add mapping to `overrides` dict
+
+6. **CRITICAL**: update `ajet/default_config/ajet_config_schema.py`
+  - the schema must have a dataclass for EVERY nested level in the config path
+  - e.g., for `ajet.trainer_common.optim.lr`, need:
+    - `AjetOptim` dataclass with `lr: float = 1e-6`
+    - `AjetTrainerCommon` must have `optim: AjetOptim = field(default_factory=AjetOptim)`
+  - if parent dataclass is missing the nested field, config loading will store it as a raw dict instead of a typed dataclass, causing `getattr()` to fail at runtime
diff --git a/tutorial/example_train_multi_model/README.md b/tutorial/example_train_multi_model/README.md
@@ -199,3 +199,35 @@ REMOTE_14B_BATCH_SIZE = 8             # Batch size for 14B model
 REMOTE_7B_ALLOCATE_GPU_PER_NODE = 8   # GPUs for 7B model
 REMOTE_14B_ALLOCATE_GPU_PER_NODE = 8  # GPUs for 14B model
 ```
+
+
+## cheat sheet
+
+```bash
+PROJECT_DIR="/mnt/data_cpfs/qingxu.fu/agentjet/hello-agentjet"
+
+# --- Swarm Server 1 ---
+tmux new-session -d -s "SWARM_SERVER_M1"    # warning: do not add command here, otherwise it will be executed immediately and the session will exit
+tmux send-keys -t "SWARM_SERVER_M1" "cd ${PROJECT_DIR}" Enter
+tmux send-keys -t "SWARM_SERVER_M1" "source .venv/bin/activate" Enter
+tmux send-keys -t "SWARM_SERVER_M1" "export SETUPTOOLS_USE_DISTUTILS=local" Enter
+tmux send-keys -t "SWARM_SERVER_M1" "ajet-swarm start --swarm-port=10086" Enter
+echo "Started SWARM_SERVER_M1 on port 10086"
+
+# --- Swarm Server 2 ---
+tmux new-session -d -s "SWARM_SERVER_M2"
+tmux send-keys -t "SWARM_SERVER_M2" "cd ${PROJECT_DIR}" Enter
+tmux send-keys -t "SWARM_SERVER_M2" "source .venv/bin/activate" Enter
+tmux send-keys -t "SWARM_SERVER_M2" "export SETUPTOOLS_USE_DISTUTILS=local" Enter
+tmux send-keys -t "SWARM_SERVER_M2" "ajet-swarm start --swarm-port=10087" Enter
+echo "Started SWARM_SERVER_M2 on port 10087"
+
+# --- Swarm Client ---
+tmux new-session -d -s "SWARM_CLIENT_EXP1"
+tmux send-keys -t "SWARM_CLIENT_EXP1" "cd ${PROJECT_DIR}" Enter
+tmux send-keys -t "SWARM_CLIENT_EXP1" "source .venv/bin/activate" Enter
+tmux send-keys -t "SWARM_CLIENT_EXP1" "export SETUPTOOLS_USE_DISTUTILS=local" Enter
+tmux send-keys -t "SWARM_CLIENT_EXP1" "sleep 30s" Enter
+tmux send-keys -t "SWARM_CLIENT_EXP1" "python -m tutorial.example_train_multi_model.trans_roll_lora" Enter
+echo "Started SWARM_CLIENT_EXP1"
+```