modelscope
diff --git a/ajet/backbone/verl/dp_actor.py b/ajet/backbone/verl/dp_actor.py
Lines changed: 8 additions & 0 deletions
diff --git a/ajet/copilot/job.py b/ajet/copilot/job.py
Lines changed: 6 additions & 0 deletions
diff --git a/ajet/default_config/ajet_config_schema.py b/ajet/default_config/ajet_config_schema.py
Lines changed: 6 additions & 14 deletions
diff --git a/ajet/default_config/ajet_default.yaml b/ajet/default_config/ajet_default.yaml
Lines changed: 9 additions & 0 deletions
diff --git a/ajet/default_config/verl/config_auto_convertion_verl.jsonc b/ajet/default_config/verl/config_auto_convertion_verl.jsonc
Lines changed: 0 additions & 1 deletion
diff --git a/ajet/utils/config_utils.py b/ajet/utils/config_utils.py
Lines changed: 14 additions & 0 deletions
diff --git a/ajet/utils/core_env_vars.py b/ajet/utils/core_env_vars.py
Lines changed: 6 additions & 0 deletions
@@ -138,6 +138,14 @@ def update_policy(self, data: DataProto):
         # make sure we are in training mode
         self.actor_module.train()
 
+        # [AJET] Optional: probe the largest ppo_max_token_len_per_gpu that fits in GPU memory.
+        # Triggered by env AGENTJET_FIND_MAX_PPO_TOKEN_LEN. Intercepts the *first* real PPO update
+        # (model + grads + optimizer already resident, so the measurement is realistic). See
+        # ajet.utils.find_max_ppo_token_len. The call never returns: it raises with the result.
+        if os.environ.get("AGENTJET_FIND_MAX_PPO_TOKEN_LEN"):
+            from ajet.utils.find_max_ppo_token_len import find_max_ppo_token_len_per_gpu
+            find_max_ppo_token_len_per_gpu(self, data)
+
         temperature = data.meta_info["temperature"]  # temperature must be in the data.meta_info to avoid silent error
         pad_token_id = data.meta_info.get("pad_token_id", 0)
 
 
@@ -127,6 +127,8 @@ def __init__(
         max_model_len: int | None = None,
         tensor_model_parallel_size: int | None = None,
         max_num_seqs: int | None = None,
+        ppo_max_token_len_per_gpu: int | None = None,
+        ulysses_sequence_parallel_size: int | None = None,
         mini_batch_num: int | None = None,
         lora_rank: int | None = None,
         lora_alpha: int | None = None,
@@ -195,6 +197,8 @@ def __init__(
         self.max_model_len: int = cast(int, max_model_len)
         self.tensor_model_parallel_size: int = cast(int, tensor_model_parallel_size)
         self.max_num_seqs: int = cast(int, max_num_seqs)
+        self.ppo_max_token_len_per_gpu: int | None = ppo_max_token_len_per_gpu
+        self.ulysses_sequence_parallel_size: int | None = ulysses_sequence_parallel_size
         self.mini_batch_num: int = cast(int, mini_batch_num)
         self.lora_rank: int = cast(int, lora_rank)
         self.lora_alpha: int = cast(int, lora_alpha)
@@ -236,6 +240,8 @@ def __init__(
             "ajet.rollout.max_model_len":                   "max_model_len",
             "ajet.rollout.tensor_model_parallel_size":      "tensor_model_parallel_size",
             "ajet.rollout.max_num_seqs":                    "max_num_seqs",
+            "ajet.rollout.ppo_max_token_len_per_gpu":       "ppo_max_token_len_per_gpu",
+            "ajet.trainer_common.ulysses_sequence_parallel_size": "ulysses_sequence_parallel_size",
             "ajet.trainer_common.mini_batch_num":           "mini_batch_num",
             "ajet.lora.lora_rank":                          "lora_rank",
             "ajet.lora.lora_alpha":                         "lora_alpha",
 
@@ -3,7 +3,7 @@
 
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 
 @dataclass
@@ -26,6 +26,7 @@ class AjetTrainerCommon:
     use_kl_in_reward: bool = False
     kl_penalty_type: str = "kl"
     ppo_epochs: int = 1
+    ulysses_sequence_parallel_size: int = 1
     val_print_to_markdown_file_path: str | None = None
     train_print_to_markdown_file_path: str | None = None
     total_training_steps: int | None = None
@@ -34,20 +35,9 @@ class AjetTrainerCommon:
     total_epochs: int = 50
     val_pass_n: int = 4
     val_before_train: bool = False
-    # When enabled, every sample produced by the same episode (same
-    # non_tensor_batch["episode_uuids"]) gets its loss weight multiplied by
-    # 1/N (N = number of samples in that episode) so each episode contributes
-    # equally to the policy-gradient update regardless of how many samples it
-    # generated. Disabled by default (current behaviour: every sample weighted
-    # equally).
+    # When enabled, every sample produced by the same episode (same non_tensor_batch["episode_uuids"]) gets its loss weight multiplied by 1/N (N = number of samples in that episode)
     loss_weight_normalization_episode_level: bool = False
-    # When enabled, GRPO group statistics (baseline mean / std) are computed at
-    # episode scope instead of sample scope: each episode (same
-    # non_tensor_batch["episode_uuids"]) is first reduced to its mean reward,
-    # then the per-task (same non_tensor_batch["uid"]) baseline is the mean over
-    # those episode means. This makes every episode contribute equally to the
-    # advantage baseline regardless of how many samples it generated. Disabled
-    # by default (current behaviour: baseline is the mean over all samples).
+    # When enabled, GRPO group statistics (baseline mean / std) are computed at episode scope instead of sample scope
     advantage_estimation_episode_level: bool = False
 
 @dataclass
@@ -71,6 +61,8 @@ class AjetRollout:
     max_num_seqs: int = 64
     num_repeat: int = 8
     gpu_memory_utilization: float = 0.85
+    # Per-GPU token budget for the PPO actor update (dynamic batching).
+    ppo_max_token_len_per_gpu: Optional[int] = None  # None => track ajet.rollout.max_model_len (legacy behaviour).
     compute_madness_checklist: List[str] = field(default_factory=list)
 
 
 
@@ -49,6 +49,15 @@ ajet:
     # max token length allowed for the model during rollout
     max_model_len: 18000
 
+    # Per-GPU token budget for the PPO actor update under dynamic batching.
+    # null  => follow max_model_len (every verl *_token_len_per_gpu key then tracks max_model_len).
+    # <int> => override ONLY actor_rollout_ref.actor.ppo_max_token_len_per_gpu, so the PPO update
+    #          can pack larger micro-batches (squeeze more out of GPU memory) while keeping the
+    #          rollout/log_prob lengths tied to max_model_len.
+    # Tip: set env AGENTJET_FIND_MAX_PPO_TOKEN_LEN=1 to auto-probe the hardware limit (the run will
+    #      raise with the discovered value, which you then paste here).
+    ppo_max_token_len_per_gpu: null
+
     multi_turn:
       # how many samples should be collected for each task run
       max_sample_per_task: 30
 
@@ -49,7 +49,6 @@
     "ajet.rollout.max_model_len": [
         "actor_rollout_ref.rollout.max_model_len",
         "actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu",
-        "actor_rollout_ref.actor.ppo_max_token_len_per_gpu",
         "actor_rollout_ref.ref.log_prob_max_token_len_per_gpu"
     ],
 
 
@@ -166,6 +166,20 @@ def recursive_copy(src_dict, dst_dict, parent_key=""):
                 f"[Note]: Aligned parameter from [{from_key}] to [{to_key}] with value: [{value}]"
             )
 
+    # [AJET] ajet.rollout.ppo_max_token_len_per_gpu is the SOLE source of the actor's per-gpu PPO
+    # token budget (it is deliberately NOT pulled in via the ajet.rollout.max_model_len mapping
+    # above). None means "track max_model_len" (legacy behaviour); an explicit int decouples the
+    # PPO update budget from max_model_len. The verl key only carries the resolved value so it can
+    # reach the Ray actor (which receives the actor_rollout_ref subtree, not the ajet namespace).
+    ppo_token_len = _dive_to_fetch_value(from_config, "ajet.rollout.ppo_max_token_len_per_gpu")
+    if ppo_token_len is None:
+        ppo_token_len = _dive_to_fetch_value(from_config, "ajet.rollout.max_model_len")
+    _dive_to_set_value(to_config, "actor_rollout_ref.actor.ppo_max_token_len_per_gpu", ppo_token_len)
+    logger.success(
+        f"[Note]: Resolved [actor_rollout_ref.actor.ppo_max_token_len_per_gpu] = [{ppo_token_len}] "
+        "from [ajet.rollout.ppo_max_token_len_per_gpu] (None => ajet.rollout.max_model_len)."
+    )
+
     # backbone specific safe guard
     to_config = align_parameter_safe_guard(to_config, backbone)
 
 
@@ -64,6 +64,12 @@ def get_runtime_env(config, is_trinity: bool = False) -> dict:
         "OPENAI_BASE_URL",
         "API_KEY",
         "BASE_URL",
+        "AGENTJET_FIND_MAX_PPO_TOKEN_LEN",
+        "AGENTJET_FIND_MAX_START",
+        "AGENTJET_FIND_MAX_CAP",
+        "AGENTJET_FIND_MAX_TOL",
+        "AGENTJET_FIND_MAX_BUDGET_S",
+        "AGENTJET_FIND_MAX_SEQ",
     ]
 
     for var in optional_env_vars: