Skip to content

Commit 484d1bc

Browse files
committed
feat: implement PeriodicDrainThreadPoolExecutor for improved task management and auto-retry functionality
1 parent 29b4169 commit 484d1bc

File tree

2 files changed

+56
-25
lines changed

2 files changed

+56
-25
lines changed

ajet/utils/thread_executors.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from concurrent.futures import ThreadPoolExecutor
22
from ajet.utils.sington import singleton
3+
from loguru import logger
34
import threading
45

56

@@ -41,3 +42,40 @@ def wrapped_fn(*args, **kwargs):
4142
def shutdown(self, wait=True):
4243
self.executor.shutdown(wait=wait)
4344

45+
class PeriodicDrainThreadPoolExecutor:
    """A ThreadPoolExecutor wrapper that can auto-retry failed tasks and
    periodically drain all in-flight work.

    Back-pressure is applied by ``submit_with_periodic_drain``: after every
    ``workers`` submissions the current executor is shut down (waiting for
    every pending task to finish) and replaced with a fresh pool, so at most
    one "generation" of tasks can accumulate.
    """

    def __init__(self, workers=100, auto_retry=True):
        # `workers` doubles as the pool size AND the drain interval.
        self._max_workers = workers
        self._executor = ThreadPoolExecutor(max_workers=workers)
        # Counts submit_with_periodic_drain() calls only (plain submit()
        # does not participate in the drain cycle).
        self._submitted_count = 0
        # When True, a failed task is re-run until it succeeds.
        self._auto_retry = auto_retry

    def submit(self, fn, *args, **kwargs):
        """Submit a task; when auto_retry is enabled, re-run it on failure.

        Returns the Future of the (possibly retry-wrapped) task.
        """

        def retry_wrapper(func, *wargs, **wkwargs):
            # BUG FIX: original signature was (func, arg) — it broke for
            # zero args, multiple positional args, and ANY keyword args
            # (e.g. submit(fn, task=task) raised TypeError inside the pool).
            while True:
                try:
                    return func(*wargs, **wkwargs)
                except Exception as e:
                    # Log with traceback and retry forever; the task is
                    # assumed to be idempotent / safe to re-run.
                    logger.exception(f"[PeriodicDrainThreadPoolExecutor] Error executing task: {e}. Retrying...")

        if self._auto_retry:
            return self._executor.submit(retry_wrapper, fn, *args, **kwargs)
        else:
            return self._executor.submit(fn, *args, **kwargs)

    def submit_with_periodic_drain(self, fn, *args, **kwargs):
        """Submit a task, draining all in-flight work every ``workers`` submissions.

        A drain shuts down the current executor (blocking until every
        pending task completes) and starts a fresh pool of the same size.

        NOTE(review): the count/drain sequence is not synchronized — callers
        should not invoke this concurrently from multiple threads. Futures
        returned before a drain belong to the already shut-down pool.
        """
        drain_every_n_job = self._max_workers
        if self._submitted_count > 0 and self._submitted_count % drain_every_n_job == 0:
            # Wait for the current generation to finish, then start fresh.
            self._executor.shutdown(wait=True)
            self._executor = ThreadPoolExecutor(max_workers=self._max_workers)

        self._submitted_count += 1
        return self.submit(fn, *args, **kwargs)

    def shutdown(self, wait=True):
        """Shut down the underlying executor."""
        self._executor.shutdown(wait=wait)

tutorial/example_math_swarm/math.py

Lines changed: 18 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,27 @@
1+
import os
12
import re
23
import requests
34
from textwrap import dedent
45
from ajet.schema.task import Task, WorkflowOutput
56
from ajet.copilot.job import AgentJetJob
67
from ajet.task_reader import RouterTaskReader
78
from ajet.utils.retry import retry_with_backoff
8-
from ajet.utils.thread_executors import BoundedThreadPoolExecutor
9+
from ajet.utils.thread_executors import PeriodicDrainThreadPoolExecutor
910
from ajet.tuner_lib.as_oai_baseurl_apikey import OpenaiBaseUrlAndApiKey
1011
from ajet.tuner_lib.experimental.interchange_utils import SwarmThrottlePolicy
1112
from ajet.default_config.ajet_default import AjetTaskReader, HuggingfaceDatRepo
1213
from ajet.tuner_lib.experimental.as_swarm_client import SwarmClient, SwarmThrottlePolicy
1314

14-
# --------- configurations that take effect locally -------------
15-
LOCAL_GRPO_N = 4 # grpo group size
16-
LOCAL_NUM_EPOCH = 10000
17-
LOCAL_NUM_EPOCH = 1
18-
LOCAL_MAX_PARALLEL = 64
19-
LOCAL_DATASET_PATH = "/mnt/data_cpfs/qingxu.fu/dataset/openai/gsm8k/main"
20-
REMOTE_SWARM_URL = "http://localhost:10086" # Change to your swarm remote url
21-
22-
# --------- configurations that take effect remotely -------------
23-
REMOTE_BATCH_SIZE = 32
24-
REMOTE_ALLOCATE_GPU_PER_NODE = 4
25-
REMOTE_TRAIN_MODEL_01 = '/mnt/data_cpfs/model_cache/modelscope/hub/Qwen/Qwen/Qwen2.5-3B-Instruct'
26-
2715
# python -m tutorial.example_math_swarm.math
2816

29-
class WeightUpdatedHalfway(Exception):
30-
"""Raised when the remote side starts updating model weights halfway through an episode."""
17+
GRPO_N = 4 # grpo group size
18+
NUM_EPOCH = 10000
19+
DATASET_PATH = "/mnt/data_cpfs/qingxu.fu/dataset/openai/gsm8k/main"
20+
AJET_SWARM_URL = os.getenv("AJET_SWARM_URL", "http://localhost:10086")
3121

22+
REMOTE_BATCH_SIZE = 32
23+
REMOTE_ALLOCATE_GPU_PER_NODE = 4
24+
REMOTE_TRAIN_MODEL = '/mnt/data_cpfs/model_cache/modelscope/hub/Qwen/Qwen/Qwen2.5-3B-Instruct'
3225

3326
def main():
3427

@@ -37,21 +30,21 @@ def main():
3730
reader_type = "huggingface_dat_repo",
3831
reader_config = AjetTaskReader(
3932
huggingface_dat_repo = HuggingfaceDatRepo(
40-
dataset_path = LOCAL_DATASET_PATH
33+
dataset_path = DATASET_PATH
4134
)
4235
)
4336
)
4437

4538
# # Hand shake with remote swarm server
46-
swarm_worker = SwarmClient(REMOTE_SWARM_URL)
39+
swarm_worker = SwarmClient(AJET_SWARM_URL)
4740
swarm_worker.auto_sync_train_config_and_start_engine(
4841
AgentJetJob(
4942
experiment_name="math_gsm8k_grpo",
5043
algorithm="grpo",
5144
n_gpu=REMOTE_ALLOCATE_GPU_PER_NODE,
52-
model=REMOTE_TRAIN_MODEL_01,
45+
model=REMOTE_TRAIN_MODEL,
5346
batch_size=REMOTE_BATCH_SIZE,
54-
num_repeat=LOCAL_GRPO_N,
47+
num_repeat=GRPO_N,
5548
)
5649
)
5750

@@ -62,7 +55,7 @@ def rollout(task):
6255
throttle_policy=SwarmThrottlePolicy(
6356
ratio=0.5,
6457
expected_batch_size=REMOTE_BATCH_SIZE,
65-
expected_num_repeat=LOCAL_GRPO_N,
58+
expected_num_repeat=GRPO_N,
6659
current_task_id=task.task_id
6760
)
6861
)
@@ -76,11 +69,11 @@ def rollout(task):
7669
except:
7770
pass
7871

79-
executor = BoundedThreadPoolExecutor(max_workers=LOCAL_MAX_PARALLEL)
80-
for epoch in range(LOCAL_NUM_EPOCH):
72+
executor = PeriodicDrainThreadPoolExecutor(workers=GRPO_N * REMOTE_BATCH_SIZE, auto_retry=True)
73+
for _ in range(NUM_EPOCH):
8174
for _, task in enumerate(dataset.generate_training_tasks()):
82-
for _ in range(LOCAL_GRPO_N):
83-
executor.submit(rollout, task)
75+
for _ in range(GRPO_N):
76+
executor.submit_with_periodic_drain(fn=rollout, task=task)
8477

8578
return None
8679

0 commit comments

Comments
 (0)