Skip to content

Commit 094673e

Browse files
committed
feat: enhance HuggingfaceDatRepo and HuggingFaceTaskReader for improved dataset handling and proxy configuration
fix: update PeriodicDrainThreadPoolExecutor to manage task results and auto-retry functionality
chore: modify example_math_swarm to use updated dataset path and configuration
1 parent 6fe101b commit 094673e

File tree

5 files changed

+67
-29
lines changed

5 files changed

+67
-29
lines changed

ajet/default_config/ajet_default.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@ class AjetRollout:
3636
@dataclass
3737
class HuggingfaceDatRepo:
3838
dataset_path: str = "gsm8k"
39+
dataset_name: str | None = None
3940
training_split: str = "train"
4041
validation_split: str = "validation"
42+
http_proxy_address: str = ""
4143

4244

4345
@dataclass

ajet/task_reader/hf_dataset_reader.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11

22
import datasets
3+
import httpx
4+
import huggingface_hub
35

46
from ajet.schema.task import Task
57
from ajet.task_reader.task_reader_base import BaseTaskReader
@@ -17,7 +19,30 @@ def __init__(self, reader_config):
1719
super().__init__(reader_config)
1820
self.reader_config = reader_config
1921
self.as_generator = False
20-
self.dataset_name = self.reader_config.huggingface_dat_repo.dataset_path
22+
self.dataset_path = self.reader_config.huggingface_dat_repo.dataset_path
23+
24+
25+
try:
26+
self.dataset_name = self.reader_config.huggingface_dat_repo.dataset_name
27+
except Exception:
28+
self.dataset_name = None
29+
30+
try:
31+
self.http_proxy_address = getattr(
32+
self.reader_config.huggingface_dat_repo, "http_proxy_address", ""
33+
) or getattr(self.reader_config.huggingface_dat_repo, "http_proxy", "")
34+
except Exception:
35+
self.http_proxy_address = ""
36+
37+
# Configure httpx proxy via set_client_factory (replaces deprecated proxies= arg)
38+
if self.http_proxy_address:
39+
proxy_url = self.http_proxy_address
40+
huggingface_hub.set_client_factory(
41+
lambda **kwargs: httpx.Client(
42+
proxy=proxy_url,
43+
**{k: v for k, v in kwargs.items() if k != "proxies"},
44+
)
45+
)
2146

2247
def _load_dataset_split(self, split: str):
2348
"""
@@ -30,22 +55,26 @@ def _load_dataset_split(self, split: str):
3055
Generator: List of Task objects created from the dataset.
3156
"""
3257
try:
33-
if self.dataset_name.endswith(".parquet"):
58+
59+
if self.dataset_path.endswith(".parquet"):
3460
# Load from local parquet file
35-
dataset = datasets.load_dataset("parquet", data_files=self.dataset_name, split=split)
61+
dataset = datasets.load_dataset(
62+
"parquet", data_files=self.dataset_path, split=split
63+
)
3664
else:
37-
# Load from Hugging Face hub
38-
dataset = datasets.load_dataset(self.dataset_name, split=split)
65+
dataset = datasets.load_dataset(
66+
self.dataset_path, split=split, name=self.dataset_name
67+
)
3968
# shuffle dataset
4069
dataset = dataset.map(lambda example, idx: {"original_idx": idx}, with_indices=True)
4170
dataset = dataset.shuffle()
4271
except Exception as e:
4372
raise ValueError(
44-
f"Failed to load dataset '{self.dataset_name}' with split '{split}': {str(e)}"
73+
f"Failed to load dataset '{self.dataset_path}' with split '{split}': {str(e)}"
4574
)
4675

4776
if len(dataset) == 0:
48-
raise ValueError(f"No examples found in dataset '{self.dataset_name}' with split '{split}'")
77+
raise ValueError(f"No examples found in dataset '{self.dataset_path}' with split '{split}'")
4978

5079
self.as_generator = True
5180

ajet/tuner_lib/experimental/as_swarm_client.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import time
44
import httpx
55
import json
6+
import re
67
import yaml
78
from beast_logger import print_dict
89
from typing import List, Tuple
@@ -60,7 +61,7 @@ def __init__(self, server_url: str):
6061

6162
# better logging management
6263
self._last_second_print_buffer: dict[str, float] = {}
63-
self.begin_episode_lock = threading.Lock()
64+
self._begin_episode_lock = threading.Lock()
6465
# record last registered AgentJetJob
6566
self._agent_jet_job = None
6667
# throttle
@@ -202,7 +203,9 @@ def begin_episode(self, discard_episode_timeout=600, episode_type="train", throt
202203
Return:
203204
(episode_uuid, openai_base_url, openai_api_key)
204205
"""
206+
return self._begin_episode_auto_repeat(discard_episode_timeout, episode_type, throttle_policy)
205207

208+
def _begin_episode_auto_repeat(self, discard_episode_timeout=600, episode_type="train", throttle_policy: SwarmThrottlePolicy|None = None) -> Tuple[str, OpenaiBaseUrlAndApiKey]:
206209
# max_episode_time: when an episode has **lasted** for more than X seconds, it will be terminated **locally** by client (call `end_episode` will be re-route to `abort_episode`)
207210
max_episode_time = 2*discard_episode_timeout
208211

@@ -225,7 +228,7 @@ def begin_episode(self, discard_episode_timeout=600, episode_type="train", throt
225228

226229
# when throttle_policy is set, acquire lock to prevent multiple threads from claiming episode at the same time and causing throttle policy to fail
227230
if throttle_policy is not None:
228-
self.begin_episode_lock.acquire()
231+
self._begin_episode_lock.acquire()
229232

230233
try:
231234
# Check throttle policy before claiming episode (only for train episodes)
@@ -259,6 +262,10 @@ def begin_episode(self, discard_episode_timeout=600, episode_type="train", throt
259262
episode_uuid = data.episode_uuid
260263
openai_base_url = data.openai_base_url
261264
openai_api_key = data.openai_api_key
265+
266+
# force replace openai_base_url host with self.server_url
267+
openai_base_url = re.sub(r'^https?://[^/]+', self.server_url, openai_base_url)
268+
262269
self.logger_info(f"Claimed episode {episode_uuid}, current global step: {status_json.get('global_step', 'unknown')}")
263270
return episode_uuid, OpenaiBaseUrlAndApiKey(
264271
base_url=openai_base_url,
@@ -290,8 +297,8 @@ def begin_episode(self, discard_episode_timeout=600, episode_type="train", throt
290297

291298
finally:
292299
if throttle_policy is not None:
293-
if self.begin_episode_lock.locked():
294-
self.begin_episode_lock.release()
300+
if self._begin_episode_lock.locked():
301+
self._begin_episode_lock.release()
295302

296303
def end_episode(self, task:Task, episode_uuid: str, workflow_output: WorkflowOutput):
297304

ajet/utils/thread_executors.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,15 @@ def __init__(self, workers=100, auto_retry=True):
5050
self._executor = ThreadPoolExecutor(max_workers=workers)
5151
self._submitted_count = 0
5252
self._auto_retry = auto_retry
53+
self.current_futures = []
5354

5455
def submit(self, fn, *args, **kwargs):
5556
"""Submit a task, blocking if the pending queue is full."""
5657

57-
def retry_wrapper(func, arg):
58+
def retry_wrapper(fn, *args, **kwargs):
5859
while True:
5960
try:
60-
return func(arg)
61+
return fn(*args, **kwargs)
6162
except Exception as e:
6263
logger.exception(f"[run_episodes_until_all_complete] Error executing episode: {e}. Retrying...")
6364

@@ -69,12 +70,19 @@ def retry_wrapper(func, arg):
6970
def submit_with_periodic_drain(self, fn, *args, **kwargs):
7071
"""Submit a task, draining all in-flight work every `drain_every_n_job` submissions."""
7172
drain_every_n_job = self._max_workers
73+
results = []
7274
if self._submitted_count > 0 and self._submitted_count % drain_every_n_job == 0:
73-
self._executor.shutdown(wait=True)
74-
self._executor = ThreadPoolExecutor(max_workers=self._max_workers)
75+
for future in self.current_futures:
76+
try:
77+
results += [future.result()] # Wait for the task to complete and raise exceptions if any
78+
except Exception as e:
79+
logger.exception(f"Error in task execution: {e}")
80+
self.current_futures = []
7581

7682
self._submitted_count += 1
77-
return self.submit(fn, *args, **kwargs)
83+
future = self.submit(fn, *args, **kwargs)
84+
self.current_futures.append(future)
85+
return future, results
7886

7987
def shutdown(self, wait=True):
8088
"""Shut down the underlying executor."""

tutorial/example_math_swarm/math.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,10 @@
1818

1919
GRPO_N = 4 # grpo group size
2020
NUM_EPOCH = 10000
21-
DATASET_PATH = "/mnt/data_cpfs/qingxu.fu/dataset/openai/gsm8k/main"
2221
AJET_SWARM_URL = os.getenv("AJET_SWARM_URL", "http://localhost:10086")
2322

2423
REMOTE_BATCH_SIZE = 32
25-
REMOTE_ALLOCATE_GPU_PER_NODE = 4
24+
REMOTE_ALLOCATE_GPU_PER_NODE = 8
2625
REMOTE_TRAIN_MODEL = '/root/agentjet/modelscope_cache/Qwen/Qwen2.5-7B-Instruct'
2726

2827
def main():
@@ -32,7 +31,9 @@ def main():
3231
reader_type = "huggingface_dat_repo",
3332
reader_config = AjetTaskReader(
3433
huggingface_dat_repo = HuggingfaceDatRepo(
35-
dataset_path = DATASET_PATH
34+
dataset_path = "C:/Users/fuqingxu-hub/Downloads/dataset/gsm8k/socratic",
35+
# dataset_path = "openai/gsm8k",
36+
# dataset_name = "main",
3637
)
3738
)
3839
)
@@ -53,20 +54,11 @@ def main():
5354
def rollout(task):
5455
try:
5556
# begin episode
56-
episode_uuid, api_baseurl_key = swarm_worker.begin_episode(
57-
throttle_policy=SwarmThrottlePolicy(
58-
ratio=0.5,
59-
expected_batch_size=REMOTE_BATCH_SIZE,
60-
expected_num_repeat=GRPO_N,
61-
current_task_id=task.task_id
62-
)
63-
)
57+
episode_uuid, api_baseurl_key = swarm_worker.begin_episode(discard_episode_timeout=60)
6458
# execute agent ( base_url = api_baseurl_key.base_url, api_key = api_baseurl_key.api_key )
6559
workflow_output = execute_agent(task, api_baseurl_key) # reward is in `workflow_output`
6660
# report output back to swarm remote
6761
swarm_worker.end_episode(task, episode_uuid, workflow_output)
68-
# print global rollout status across the swarm
69-
swarm_worker.print_rollout_stat()
7062
return
7163
except:
7264
pass

0 commit comments

Comments (0)